diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon new file mode 100644 index 0000000000000..2744f21b5a6b3 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -0,0 +1,346 @@ +What: /sys/kernel/mm/damon/ +Date: Mar 2022 +Contact: SeongJae Park +Description: Interface for Data Access MONitoring (DAMON). Contains files + for controlling DAMON. For more details on DAMON itself, + please refer to Documentation/admin-guide/mm/damon/index.rst. + +What: /sys/kernel/mm/damon/admin/ +Date: Mar 2022 +Contact: SeongJae Park +Description: Interface for privileged users of DAMON. Contains files for + controlling DAMON that are intended to be used by privileged + users. + +What: /sys/kernel/mm/damon/admin/kdamonds/nr_kdamonds +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for controlling each DAMON worker thread (kdamond) + named '0' to 'N-1' under the kdamonds/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//state +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing 'on' or 'off' to this file makes the kdamond start or + stop, respectively. Reading the file returns the keyword + based on the current status. Writing 'commit' to this file + makes the kdamond read the user inputs in the sysfs files + except 'state' again. Writing 'update_schemes_stats' to the + file updates contents of schemes stats files of the kdamond. + Writing 'update_schemes_tried_regions' to the file updates + contents of 'tried_regions' directory of every scheme directory + of this kdamond. Writing 'clear_schemes_tried_regions' to the + file removes contents of the 'tried_regions' directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//pid +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the pid of the kdamond if it is + running. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts/nr_contexts +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for controlling each DAMON context named '0' to + 'N-1' under the contexts/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//avail_operations +Date: Apr 2022 +Contact: SeongJae Park +Description: Reading this file returns the available monitoring operations + sets on the currently running kernel. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//operations +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a keyword for a monitoring operations set ('vaddr' for + virtual address spaces monitoring, 'fvaddr' for fixed virtual + address ranges monitoring, and 'paddr' for the physical address + space monitoring) to this file makes the context use the + operations set. Reading the file returns the keyword for the + operations set the context is set to use. + + Note that only the operations sets that are listed in the + 'avail_operations' file are valid inputs. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/sample_us +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the sampling interval of the + DAMON context in microseconds as the value. Reading this file + returns the value.
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/aggr_us +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the aggregation interval of + the DAMON context in microseconds as the value. Reading this + file returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/update_us +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the update interval of the + DAMON context in microseconds as the value. Reading this file + returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/nr_regions/min +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the minimum number of + monitoring regions of the DAMON context as the value. Reading + this file returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/nr_regions/max +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the maximum number of + monitoring regions of the DAMON context as the value. Reading + this file returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets/nr_targets +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for controlling each DAMON target of the context + named '0' to 'N-1' under the targets/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//pid_target +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the pid of + the target process if the context is for virtual address spaces + monitoring, respectively. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//regions/nr_regions +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for setting each DAMON target memory region of the + context named '0' to 'N-1' under the regions/ directory. In + case of the virtual address space monitoring, DAMON + automatically sets the target memory region based on the target + processes' mappings. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//regions//start +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the start + address of the monitoring region. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//regions//end +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the end + address of the monitoring region. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes/nr_schemes +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for controlling each DAMON-based operation scheme + of the context named '0' to 'N-1' under the schemes/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//action +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the action + of the scheme. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/sz/min +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the minimum + size of the scheme's target regions in bytes.
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/sz/max +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the maximum + size of the scheme's target regions in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/nr_accesses/min +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the manimum + 'nr_accesses' of the scheme's target regions. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/nr_accesses/max +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the maximum + 'nr_accesses' of the scheme's target regions. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/age/min +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the minimum + 'age' of the scheme's target regions. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/age/max +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the maximum + 'age' of the scheme's target regions. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/ms +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the time + quota of the scheme in milliseconds. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/bytes +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the size + quota of the scheme in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/reset_interval_ms +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the quotas + charge reset interval of the scheme in milliseconds. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/sz_permil +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + under-quota limit regions prioritization weight for 'size' in + permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/nr_accesses_permil +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + under-quota limit regions prioritization weight for + 'nr_accesses' in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/age_permil +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + under-quota limit regions prioritization weight for 'age' in + permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/metric +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the metric + of the watermarks for the scheme. The writable/readable + keywords for this file are 'none' for disabling the watermarks + feature, or 'free_mem_rate' for the system's global free memory + rate in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/interval_us +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the metric + check interval of the watermarks for the scheme in + microseconds. 
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/high +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the high + watermark of the scheme in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/mid +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the mid + watermark of the scheme in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/low +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the low + watermark of the scheme in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters/nr_filters +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for setting filters of the scheme named '0' to + 'N-1' under the filters/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//type +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the type of + the memory of the interest. 'anon' for anonymous pages, or + 'memcg' for specific memory cgroup can be written and read. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//memcg_path +Date: Dec 2022 +Contact: SeongJae Park +Description: If 'memcg' is written to the 'type' file, writing to and + reading from this file sets and gets the path to the memory + cgroup of the interest. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing 'Y' or 'N' to this file sets whether to filter out + pages that do or do not match to the 'type' and 'memcg_path', + respectively. Filter out means the action of the scheme will + not be applied to. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_tried +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the number of regions that the action + of the scheme has tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/sz_tried +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the total size of regions that the + action of the scheme has tried to be applied in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_applied +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the number of regions that the action + of the scheme has successfully applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/sz_applied +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the total size of regions that the + action of the scheme has successfully applied in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/qt_exceeds +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the number of the exceed events of + the scheme's quotas. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//start +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the start address of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. 
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//end +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the end address of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//nr_accesses +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the 'nr_accesses' of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//age +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the 'age' of a memory region that + corresponding DAMON-based Operation Scheme's action has tried + to be applied. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 84742be223ff8..7c3cf24cf6728 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -15,6 +15,7 @@ are configurable at compile, boot or run time. tsx_async_abort multihit.rst special-register-buffer-data-sampling.rst + l1d_flush.rst processor_mmio_stale_data.rst gather_data_sampling.rst srso diff --git a/Documentation/admin-guide/hw-vuln/l1d_flush.rst b/Documentation/admin-guide/hw-vuln/l1d_flush.rst new file mode 100644 index 0000000000000..210020bc3f568 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/l1d_flush.rst @@ -0,0 +1,69 @@ +L1D Flushing +============ + +With an increasing number of vulnerabilities being reported around data +leaks from the Level 1 Data cache (L1D) the kernel provides an opt-in +mechanism to flush the L1D cache on context switch. + +This mechanism can be used to address e.g. CVE-2020-0550. For applications +the mechanism keeps them safe from vulnerabilities, related to leaks +(snooping of) from the L1D cache. + + +Related CVEs +------------ +The following CVEs can be addressed by this +mechanism + + ============= ======================== ================== + CVE-2020-0550 Improper Data Forwarding OS related aspects + ============= ======================== ================== + +Usage Guidelines +---------------- + +Please see document: :ref:`Documentation/userspace-api/spec_ctrl.rst +` for details. + +**NOTE**: The feature is disabled by default, applications need to +specifically opt into the feature to enable it. + +Mitigation +---------- + +When PR_SET_L1D_FLUSH is enabled for a task a flush of the L1D cache is +performed when the task is scheduled out and the incoming task belongs to a +different process and therefore to a different address space. + +If the underlying CPU supports L1D flushing in hardware, the hardware +mechanism is used, software fallback for the mitigation, is not supported. + +Mitigation control on the kernel command line +--------------------------------------------- + +The kernel command line allows to control the L1D flush mitigations at boot +time with the option "l1d_flush=". The valid arguments for this option are: + + ============ ============================================================= + on Enables the prctl interface, applications trying to use + the prctl() will fail with an error if l1d_flush is not + enabled + ============ ============================================================= + +By default the mechanism is disabled. 
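A quick, hedged way to confirm that the opt-in interface was enabled at boot is to check the kernel command line. This only verifies that the ``l1d_flush=on`` option described above was passed; availability of the prctl opt-in still depends on hardware support for L1D flushing::

    $ grep -o 'l1d_flush=on' /proc/cmdline
    l1d_flush=on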
+ +Limitations +----------- + +The mechanism does not mitigate L1D data leaks between tasks belonging to +different processes which are concurrently executing on sibling threads of +a physical CPU core when SMT is enabled on the system. + +This can be addressed by controlled placement of processes on physical CPU +cores or by disabling SMT. See the relevant chapter in the L1TF mitigation +document: :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst `. + +**NOTE** : The opt-in of a task for L1D flushing works only when the task's +affinity is limited to cores running in non-SMT mode. If a task which +requested L1D flushing is scheduled on a SMT-enabled core the kernel sends +a SIGBUS to the task. diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 0fba3758d0da8..52b5ef785420e 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -422,14 +422,6 @@ The possible values in this file are: 'RSB filling' Protection of RSB on context switch enabled ============= =========================================== - - EIBRS Post-barrier Return Stack Buffer (PBRSB) protection status: - - =========================== ======================================================= - 'PBRSB-eIBRS: SW sequence' CPU is affected and protection of RSB on VMEXIT enabled - 'PBRSB-eIBRS: Vulnerable' CPU is vulnerable - 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB - =========================== ======================================================= - Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will report vulnerability. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1f7c068cf65b..3bc110326a647 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2381,6 +2381,23 @@ feature (tagged TLBs) on capable Intel chips. Default is 1 (enabled) + l1d_flush= [X86,INTEL] + Control mitigation for L1D based snooping vulnerability. + + Certain CPUs are vulnerable to an exploit against CPU + internal buffers which can forward information to a + disclosure gadget under certain conditions. + + In vulnerable processors, the speculatively + forwarded data can be used in a cache side channel + attack, to access data to which the attacker does + not have direct access. + + This parameter controls the mitigation. The + options are: + + on - enable the interface for the mitigation + l1tf= [X86] Control mitigation of the L1TF vulnerability on affected CPUs @@ -2832,6 +2849,23 @@ seconds. Use this parameter to check at some other rate. 0 disables periodic checking. + memory_hotplug.memmap_on_memory + [KNL,X86,ARM] Boolean flag to enable this feature. + Format: {on | off (default)} + When enabled, runtime hotplugged memory will + allocate its internal metadata (struct pages) + from the hotadded memory which will allow to + hotadd a lot of memory without requiring + additional memory to do so. + This feature is disabled by default because it + has some implication on large (e.g. GB) + allocations in some configurations (e.g. small + memory blocks). + The state of the flag can be read in + /sys/module/memory_hotplug/parameters/memmap_on_memory. + Note that even when enabled, there are a few cases where + the feature is not effective. 
+ memtest= [KNL,X86,ARM,PPC] Enable memtest Format: default : 0 diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst new file mode 100644 index 0000000000000..b4d029f418a91 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +DAMON: Data Access MONitor +========================== + +:doc:`DAMON ` allows light-weight data access monitoring. +Using DAMON, users can analyze the memory access patterns of their systems and +optimize those. + +.. toctree:: + :maxdepth: 2 + + start + usage + reclaim + lru_sort diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst new file mode 100644 index 0000000000000..7b0775d281b48 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -0,0 +1,294 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================= +DAMON-based LRU-lists Sorting +============================= + +DAMON-based LRU-lists Sorting (DAMON_LRU_SORT) is a static kernel module that +aimed to be used for proactive and lightweight data access pattern based +(de)prioritization of pages on their LRU-lists for making LRU-lists a more +trusworthy data access pattern source. + +Where Proactive LRU-lists Sorting is Required? +============================================== + +As page-granularity access checking overhead could be significant on huge +systems, LRU lists are normally not proactively sorted but partially and +reactively sorted for special events including specific user requests, system +calls and memory pressure. As a result, LRU lists are sometimes not so +perfectly prepared to be used as a trustworthy access pattern source for some +situations including reclamation target pages selection under sudden memory +pressure. + +Because DAMON can identify access patterns of best-effort accuracy while +inducing only user-specified range of overhead, proactively running +DAMON_LRU_SORT could be helpful for making LRU lists more trustworthy access +pattern source with low and controlled overhead. + +How It Works? +============= + +DAMON_LRU_SORT finds hot pages (pages of memory regions that showing access +rates that higher than a user-specified threshold) and cold pages (pages of +memory regions that showing no access for a time that longer than a +user-specified threshold) using DAMON, and prioritizes hot pages while +deprioritizing cold pages on their LRU-lists. To avoid it consuming too much +CPU for the prioritizations, a CPU time usage limit can be configured. Under +the limit, it prioritizes and deprioritizes more hot and cold pages first, +respectively. System administrators can also configure under what situation +this scheme should automatically activated and deactivated with three memory +pressure watermarks. + +Its default parameters for hotness/coldness thresholds and CPU quota limit are +conservatively chosen. That is, the module under its default parameters could +be widely used without harm for common situations while providing a level of +benefits for systems having clear hot/cold access patterns under memory +pressure while consuming only a limited small portion of CPU time. + +Interface: Module Parameters +============================ + +To use this feature, you should first ensure your system is running on a kernel +that is built with ``CONFIG_DAMON_LRU_SORT=y``. 
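For example, on distributions that install the kernel build configuration under ``/boot``, the option can be checked as below (a minimal sketch; the config file location is an assumption and differs between distributions)::

    $ grep CONFIG_DAMON_LRU_SORT /boot/config-$(uname -r)
    CONFIG_DAMON_LRU_SORT=y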
+ +To let sysadmins enable or disable it and tune for the given system, +DAMON_LRU_SORT utilizes module parameters. That is, you can put +``damon_lru_sort.=`` on the kernel boot command line or write +proper values to ``/sys/module/damon_lru_sort/parameters/`` files. + +Below are the description of each parameter. + +enabled +------- + +Enable or disable DAMON_LRU_SORT. + +You can enable DAMON_LRU_SORT by setting the value of this parameter as ``Y``. +Setting it as ``N`` disables DAMON_LRU_SORT. Note that DAMON_LRU_SORT could do +no real monitoring and LRU-lists sorting due to the watermarks-based activation +condition. Refer to below descriptions for the watermarks parameter for this. + +commit_inputs +------------- + +Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``. + +Input parameters that updated while DAMON_LRU_SORT is running are not applied +by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT reads values +of parametrs except ``enabled`` again. Once the re-reading is done, this +parameter is set as ``N``. If invalid parameters are found while the +re-reading, DAMON_LRU_SORT will be disabled. + +hot_thres_access_freq +--------------------- + +Access frequency threshold for hot memory regions identification in permil. + +If a memory region is accessed in frequency of this or higher, DAMON_LRU_SORT +identifies the region as hot, and mark it as accessed on the LRU list, so that +it could not be reclaimed under memory pressure. 50% by default. + +cold_min_age +------------ + +Time threshold for cold memory regions identification in microseconds. + +If a memory region is not accessed for this or longer time, DAMON_LRU_SORT +identifies the region as cold, and mark it as unaccessed on the LRU list, so +that it could be reclaimed first under memory pressure. 120 seconds by +default. + +quota_ms +-------- + +Limit of time for trying the LRU lists sorting in milliseconds. + +DAMON_LRU_SORT tries to use only up to this time within a time window +(quota_reset_interval_ms) for trying LRU lists sorting. This can be used +for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the +limit is disabled. + +10 ms by default. + +quota_reset_interval_ms +----------------------- + +The time quota charge reset interval in milliseconds. + +The charge reset interval for the quota of time (quota_ms). That is, +DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms +milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. + +1 second by default. + +wmarks_interval +--------------- + +The watermarks check time interval in microseconds. + +Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is +enabled but inactive due to its watermarks rule. 5 seconds by default. + +wmarks_high +----------- + +Free memory rate (per thousand) for the high watermark. + +If free memory of the system in bytes per thousand bytes is higher than this, +DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks the +watermarks. 200 (20%) by default. + +wmarks_mid +---------- + +Free memory rate (per thousand) for the middle watermark. + +If free memory of the system in bytes per thousand bytes is between this and +the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring and +the LRU-lists sorting. 150 (15%) by default. + +wmarks_low +---------- + +Free memory rate (per thousand) for the low watermark. 
+ +If free memory of the system in bytes per thousand bytes is lower than this, +DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks the +watermarks. 50 (5%) by default. + +sample_interval +--------------- + +Sampling interval for the monitoring in microseconds. + +The sampling interval of DAMON for the cold memory monitoring. Please refer to +the DAMON documentation (:doc:`usage`) for more detail. 5ms by default. + +aggr_interval +------------- + +Aggregation interval for the monitoring in microseconds. + +The aggregation interval of DAMON for the cold memory monitoring. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. 100ms by +default. + +min_nr_regions +-------------- + +Minimum number of monitoring regions. + +The minimal number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set lower-bound of the monitoring quality. +But, setting this too high could result in increased monitoring overhead. +Please refer to the DAMON documentation (:doc:`usage`) for more detail. 10 by +default. + +max_nr_regions +-------------- + +Maximum number of monitoring regions. + +The maximum number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set upper-bound of the monitoring overhead. +However, setting this too low could result in bad monitoring quality. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. 1000 by +defaults. + +monitor_region_start +-------------------- + +Start of target memory region in physical address. + +The start physical address of memory region that DAMON_LRU_SORT will do work +against. By default, biggest System RAM is used as the region. + +monitor_region_end +------------------ + +End of target memory region in physical address. + +The end physical address of memory region that DAMON_LRU_SORT will do work +against. By default, biggest System RAM is used as the region. + +kdamond_pid +----------- + +PID of the DAMON thread. + +If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. Else, +-1. + +nr_lru_sort_tried_hot_regions +----------------------------- + +Number of hot memory regions that tried to be LRU-sorted. + +bytes_lru_sort_tried_hot_regions +-------------------------------- + +Total bytes of hot memory regions that tried to be LRU-sorted. + +nr_lru_sorted_hot_regions +------------------------- + +Number of hot memory regions that successfully be LRU-sorted. + +bytes_lru_sorted_hot_regions +---------------------------- + +Total bytes of hot memory regions that successfully be LRU-sorted. + +nr_hot_quota_exceeds +-------------------- + +Number of times that the time quota limit for hot regions have exceeded. + +nr_lru_sort_tried_cold_regions +------------------------------ + +Number of cold memory regions that tried to be LRU-sorted. + +bytes_lru_sort_tried_cold_regions +--------------------------------- + +Total bytes of cold memory regions that tried to be LRU-sorted. + +nr_lru_sorted_cold_regions +-------------------------- + +Number of cold memory regions that successfully be LRU-sorted. + +bytes_lru_sorted_cold_regions +----------------------------- + +Total bytes of cold memory regions that successfully be LRU-sorted. + +nr_cold_quota_exceeds +--------------------- + +Number of times that the time quota limit for cold regions have exceeded. 
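The statistics parameters above are read-only. As a convenience, all of them can be dumped at once from the module's parameters directory (a sketch, assuming DAMON_LRU_SORT is built in or loaded so that the directory exists)::

    # grep . /sys/module/damon_lru_sort/parameters/nr_* \
          /sys/module/damon_lru_sort/parameters/bytes_*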
+ +Example +======= + +Below runtime example commands make DAMON_LRU_SORT to find memory regions +having >=50% access frequency and LRU-prioritize while LRU-deprioritizing +memory regions that not accessed for 120 seconds. The prioritization and +deprioritization is limited to be done using only up to 1% CPU time to avoid +DAMON_LRU_SORT consuming too much CPU time for the (de)prioritization. It also +asks DAMON_LRU_SORT to do nothing if the system's free memory rate is more than +50%, but start the real works if it becomes lower than 40%. If DAMON_RECLAIM +doesn't make progress and therefore the free memory rate becomes lower than +20%, it asks DAMON_LRU_SORT to do nothing again, so that we can fall back to +the LRU-list based page granularity reclamation. :: + + # cd /sys/module/damon_lru_sort/parameters + # echo 500 > hot_thres_access_freq + # echo 120000000 > cold_min_age + # echo 10 > quota_ms + # echo 1000 > quota_reset_interval_ms + # echo 500 > wmarks_high + # echo 400 > wmarks_mid + # echo 200 > wmarks_low + # echo Y > enabled diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst new file mode 100644 index 0000000000000..3394191db9851 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -0,0 +1,274 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +DAMON-based Reclamation +======================= + +DAMON-based Reclamation (DAMON_RECLAIM) is a static kernel module that aimed to +be used for proactive and lightweight reclamation under light memory pressure. +It doesn't aim to replace the LRU-list based page_granularity reclamation, but +to be selectively used for different level of memory pressure and requirements. + +Where Proactive Reclamation is Required? +======================================== + +On general memory over-committed systems, proactively reclaiming cold pages +helps saving memory and reducing latency spikes that incurred by the direct +reclaim of the process or CPU consumption of kswapd, while incurring only +minimal performance degradation [1]_ [2]_ . + +Free Pages Reporting [3]_ based memory over-commit virtualization systems are +good example of the cases. In such systems, the guest VMs reports their free +memory to host, and the host reallocates the reported memory to other guests. +As a result, the memory of the systems are fully utilized. However, the +guests could be not so memory-frugal, mainly because some kernel subsystems and +user-space applications are designed to use as much memory as available. Then, +guests could report only small amount of memory as free to host, results in +memory utilization drop of the systems. Running the proactive reclamation in +guests could mitigate this problem. + +How It Works? +============= + +DAMON_RECLAIM finds memory regions that didn't accessed for specific time +duration and page out. To avoid it consuming too much CPU for the paging out +operation, a speed limit can be configured. Under the speed limit, it pages +out memory regions that didn't accessed longer time first. System +administrators can also configure under what situation this scheme should +automatically activated and deactivated with three memory pressure watermarks. + +Interface: Module Parameters +============================ + +To use this feature, you should first ensure your system is running on a kernel +that is built with ``CONFIG_DAMON_RECLAIM=y``. 
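A quick way to confirm both the build option and the presence of the module's parameters directory is shown below (a sketch; the ``/boot/config-*`` path is an assumption that depends on the distribution)::

    $ grep CONFIG_DAMON_RECLAIM /boot/config-$(uname -r)
    CONFIG_DAMON_RECLAIM=y
    $ ls /sys/module/damon_reclaim/parameters/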
+ +To let sysadmins enable or disable it and tune for the given system, +DAMON_RECLAIM utilizes module parameters. That is, you can put +``damon_reclaim.=`` on the kernel boot command line or write +proper values to ``/sys/module/damon_reclaim/parameters/`` files. + +Below are the descriptions of each parameter. + +enabled +------- + +Enable or disable DAMON_RECLAIM. + +You can enable DAMON_RECLAIM by setting the value of this parameter as ``Y``. +Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could do +no real monitoring and reclamation due to the watermarks-based activation +condition. Refer to the descriptions of the watermarks parameters below for this. + +commit_inputs +------------- + +Make DAMON_RECLAIM read the input parameters again, except ``enabled``. + +Input parameters that are updated while DAMON_RECLAIM is running are not applied +by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values +of parameters except ``enabled`` again. Once the re-reading is done, this +parameter is set as ``N``. If invalid parameters are found during the +re-reading, DAMON_RECLAIM will be disabled. + +min_age +------- + +Time threshold for cold memory regions identification in microseconds. + +If a memory region is not accessed for this time or longer, DAMON_RECLAIM +identifies the region as cold, and reclaims it. + +120 seconds by default. + +quota_ms +-------- + +Limit of time for the reclamation in milliseconds. + +DAMON_RECLAIM tries to use only up to this time within a time window +(quota_reset_interval_ms) for trying reclamation of cold pages. This can be +used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, the +limit is disabled. + +10 ms by default. + +quota_sz +-------- + +Limit of size of memory for the reclamation in bytes. + +DAMON_RECLAIM charges the amount of memory which it has tried to reclaim within +a time window (quota_reset_interval_ms) and tries to reclaim no more than this +limit. This can be used for limiting consumption of CPU and IO. If this value is +zero, the limit is disabled. + +128 MiB by default. + +quota_reset_interval_ms +----------------------- + +The time/size quota charge reset interval in milliseconds. + +The charge reset interval for the quota of time (quota_ms) and size +(quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than +quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms +milliseconds. + +1 second by default. + +wmarks_interval +--------------- + +Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is +enabled but inactive due to its watermarks rule. + +wmarks_high +----------- + +Free memory rate (per thousand) for the high watermark. + +If free memory of the system in bytes per thousand bytes is higher than this, +DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks +the watermarks. + +wmarks_mid +---------- + +Free memory rate (per thousand) for the middle watermark. + +If free memory of the system in bytes per thousand bytes is between this and +the low watermark, DAMON_RECLAIM becomes active, so it starts the monitoring and +the reclaiming. + +wmarks_low +---------- + +Free memory rate (per thousand) for the low watermark. + +If free memory of the system in bytes per thousand bytes is lower than this, +DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks the +watermarks. In that case, the system falls back to the LRU-list based page +granularity reclamation logic.
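The three watermarks are compared against the system's free memory rate in per-thousand units. A rough way to eyeball the current rate, and therefore to pick sensible ``wmarks_*`` values, is to derive it from ``/proc/meminfo`` (a sketch only; DAMON computes the rate internally, and ``MemFree`` is merely an approximation of the value it uses)::

    $ awk '/^MemTotal/ {t=$2} /^MemFree/ {f=$2} END {print int(f * 1000 / t)}' /proc/meminfo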
+ +sample_interval +--------------- + +Sampling interval for the monitoring in microseconds. + +The sampling interval of DAMON for the cold memory monitoring. Please refer to +the DAMON documentation (:doc:`usage`) for more detail. + +aggr_interval +------------- + +Aggregation interval for the monitoring in microseconds. + +The aggregation interval of DAMON for the cold memory monitoring. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. + +min_nr_regions +-------------- + +Minimum number of monitoring regions. + +The minimal number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set lower-bound of the monitoring quality. +But, setting this too high could result in increased monitoring overhead. +Please refer to the DAMON documentation (:doc:`usage`) for more detail. + +max_nr_regions +-------------- + +Maximum number of monitoring regions. + +The maximum number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set upper-bound of the monitoring overhead. +However, setting this too low could result in bad monitoring quality. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. + +monitor_region_start +-------------------- + +Start of target memory region in physical address. + +The start physical address of memory region that DAMON_RECLAIM will do work +against. That is, DAMON_RECLAIM will find cold memory regions in this region +and reclaims. By default, biggest System RAM is used as the region. + +monitor_region_end +------------------ + +End of target memory region in physical address. + +The end physical address of memory region that DAMON_RECLAIM will do work +against. That is, DAMON_RECLAIM will find cold memory regions in this region +and reclaims. By default, biggest System RAM is used as the region. + +skip_anon +--------- + +Skip anonymous pages reclamation. + +If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous +pages. By default, ``N``. + + +kdamond_pid +----------- + +PID of the DAMON thread. + +If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else, +-1. + +nr_reclaim_tried_regions +------------------------ + +Number of memory regions that tried to be reclaimed by DAMON_RECLAIM. + +bytes_reclaim_tried_regions +--------------------------- + +Total bytes of memory regions that tried to be reclaimed by DAMON_RECLAIM. + +nr_reclaimed_regions +-------------------- + +Number of memory regions that successfully be reclaimed by DAMON_RECLAIM. + +bytes_reclaimed_regions +----------------------- + +Total bytes of memory regions that successfully be reclaimed by DAMON_RECLAIM. + +nr_quota_exceeds +---------------- + +Number of times that the time/space quota limits have exceeded. + +Example +======= + +Below runtime example commands make DAMON_RECLAIM to find memory regions that +not accessed for 30 seconds or more and pages out. The reclamation is limited +to be done only up to 1 GiB per second to avoid DAMON_RECLAIM consuming too +much CPU time for the paging out operation. It also asks DAMON_RECLAIM to do +nothing if the system's free memory rate is more than 50%, but start the real +works if it becomes lower than 40%. If DAMON_RECLAIM doesn't make progress and +therefore the free memory rate becomes lower than 20%, it asks DAMON_RECLAIM to +do nothing again, so that we can fall back to the LRU-list based page +granularity reclamation. 
:: + + # cd /sys/module/damon_reclaim/parameters + # echo 30000000 > min_age + # echo $((1 * 1024 * 1024 * 1024)) > quota_sz + # echo 1000 > quota_reset_interval_ms + # echo 500 > wmarks_high + # echo 400 > wmarks_mid + # echo 200 > wmarks_low + # echo Y > enabled + +.. [1] https://research.google/pubs/pub48551/ +.. [2] https://lwn.net/Articles/787611/ +.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst new file mode 100644 index 0000000000000..9f88afc734da4 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -0,0 +1,127 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Getting Started +=============== + +This document briefly describes how you can use DAMON by demonstrating its +default user space tool. Please note that this document describes only a part +of its features for brevity. Please refer to the usage `doc +`_ of the tool for more +details. + + +Prerequisites +============= + +Kernel +------ + +You should first ensure your system is running on a kernel built with +``CONFIG_DAMON_*=y``. + + +User Space Tool +--------------- + +For the demonstration, we will use the default user space tool for DAMON, +called DAMON Operator (DAMO). It is available at +https://github.com/awslabs/damo. The examples below assume that ``damo`` is on +your ``$PATH``. It's not mandatory, though. + +Because DAMO is using the sysfs interface (refer to :doc:`usage` for the +detail) of DAMON, you should ensure :doc:`sysfs ` is +mounted. + + +Recording Data Access Patterns +============================== + +The commands below record the memory access patterns of a program and save the +monitoring results to a file. :: + + $ git clone https://github.com/sjp38/masim + $ cd masim; make; ./masim ./configs/zigzag.cfg & + $ sudo damo record -o damon.data $(pidof masim) + +The first two lines of the commands download an artificial memory access +generator program and run it in the background. The generator will repeatedly +access two 100 MiB sized memory regions one by one. You can substitute this +with your real workload. The last line asks ``damo`` to record the access +pattern in the ``damon.data`` file. 
+ + +Visualizing Recorded Patterns +============================= + +You can visualize the pattern in a heatmap, showing which memory region +(x-axis) got accessed when (y-axis) and how frequently (number).:: + + $ sudo damo report heats --heatmap stdout + 22222222222222222222222222222222222222211111111111111111111111111111111111111100 + 44444444444444444444444444444444444444434444444444444444444444444444444444443200 + 44444444444444444444444444444444444444433444444444444444444444444444444444444200 + 33333333333333333333333333333333333333344555555555555555555555555555555555555200 + 33333333333333333333333333333333333344444444444444444444444444444444444444444200 + 22222222222222222222222222222222222223355555555555555555555555555555555555555200 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + 33333333333333333333333333333333333333355555555555555555555555555555555555555200 + 88888888888888888888888888888888888888600000000000000000000000000000000000000000 + 88888888888888888888888888888888888888600000000000000000000000000000000000000000 + 33333333333333333333333333333333333333444444444444444444444444444444444444443200 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + [...] + # access_frequency: 0 1 2 3 4 5 6 7 8 9 + # x-axis: space (139728247021568-139728453431248: 196.848 MiB) + # y-axis: time (15256597248362-15326899978162: 1 m 10.303 s) + # resolution: 80x40 (2.461 MiB and 1.758 s for each character) + +You can also visualize the distribution of the working set size, sorted by the +size.:: + + $ sudo damo report wss --range 0 101 10 + # + # target_id 18446632103789443072 + # avr: 107.708 MiB + 0 0 B | | + 10 95.328 MiB |**************************** | + 20 95.332 MiB |**************************** | + 30 95.340 MiB |**************************** | + 40 95.387 MiB |**************************** | + 50 95.387 MiB |**************************** | + 60 95.398 MiB |**************************** | + 70 95.398 MiB |**************************** | + 80 95.504 MiB |**************************** | + 90 190.703 MiB |********************************************************* | + 100 196.875 MiB |***********************************************************| + +Using ``--sortby`` option with the above command, you can show how the working +set size has chronologically changed.:: + + $ sudo damo report wss --range 0 101 10 --sortby time + # + # target_id 18446632103789443072 + # avr: 107.708 MiB + 0 3.051 MiB | | + 10 190.703 MiB |***********************************************************| + 20 95.336 MiB |***************************** | + 30 95.328 MiB |***************************** | + 40 95.387 MiB |***************************** | + 50 95.332 MiB |***************************** | + 60 95.320 MiB |***************************** | + 70 95.398 MiB |***************************** | + 80 95.398 MiB |***************************** | + 90 95.340 MiB |***************************** | + 100 95.398 MiB |***************************** | + + +Data Access Pattern Aware Memory Management +=========================================== + +Below three commands make every memory region of size >=4K that doesn't +accessed for >=60 seconds in your workload to be swapped out. 
:: + + $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme + $ echo "4K max 0 0 60s max pageout" >> test_scheme + $ damo schemes -c test_scheme diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst new file mode 100644 index 0000000000000..1772770eedbe4 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -0,0 +1,810 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Detailed Usages +=============== + +DAMON provides below interfaces for different users. + +- *DAMON user space tool.* + `This `_ is for privileged people such as + system administrators who want a just-working human-friendly interface. + Using this, users can use the DAMON’s major features in a human-friendly way. + It may not be highly tuned for special cases, though. It supports both + virtual and physical address spaces monitoring. For more detail, please + refer to its `usage document + `_. +- *sysfs interface.* + :ref:`This ` is for privileged user space programmers who + want more optimized use of DAMON. Using this, users can use DAMON’s major + features by reading from and writing to special sysfs files. Therefore, + you can write and use your personalized DAMON sysfs wrapper programs that + reads/writes the sysfs files instead of you. The `DAMON user space tool + `_ is one example of such programs. It + supports both virtual and physical address spaces monitoring. Note that this + interface provides only simple :ref:`statistics ` for the + monitoring results. For detailed monitoring results, DAMON provides a + :ref:`tracepoint `. +- *debugfs interface. (DEPRECATED!)* + :ref:`This ` is almost identical to :ref:`sysfs interface + `. This is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. +- *Kernel Space Programming Interface.* + :doc:`This ` is for kernel space programmers. Using this, + users can utilize every feature of DAMON most flexibly and efficiently by + writing kernel space DAMON application programs for you. You can even extend + DAMON for various address spaces. For detail, please refer to the interface + :doc:`document `. + +.. _sysfs_interface: + +sysfs Interface +=============== + +DAMON sysfs interface is built when ``CONFIG_DAMON_SYSFS`` is defined. It +creates multiple directories and files under its sysfs directory, +``/kernel/mm/damon/``. You can control DAMON by writing to and reading +from the files under the directory. + +For a short example, users can monitor the virtual address space of a given +workload as below. :: + + # cd /sys/kernel/mm/damon/admin/ + # echo 1 > kdamonds/nr_kdamonds && echo 1 > kdamonds/0/contexts/nr_contexts + # echo vaddr > kdamonds/0/contexts/0/operations + # echo 1 > kdamonds/0/contexts/0/targets/nr_targets + # echo $(pidof ) > kdamonds/0/contexts/0/targets/0/pid_target + # echo on > kdamonds/0/state + +Files Hierarchy +--------------- + +The files hierarchy of DAMON sysfs interface is shown below. In the below +figure, parents-children relations are represented with indentations, each +directory is having ``/`` suffix, and files in each directory are separated by +comma (","). 
:: + + /sys/kernel/mm/damon/admin + │ kdamonds/nr_kdamonds + │ │ 0/state,pid + │ │ │ contexts/nr_contexts + │ │ │ │ 0/avail_operations,operations + │ │ │ │ │ monitoring_attrs/ + │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us + │ │ │ │ │ │ nr_regions/min,max + │ │ │ │ │ targets/nr_targets + │ │ │ │ │ │ 0/pid_target + │ │ │ │ │ │ │ regions/nr_regions + │ │ │ │ │ │ │ │ 0/start,end + │ │ │ │ │ │ │ │ ... + │ │ │ │ │ │ ... + │ │ │ │ │ schemes/nr_schemes + │ │ │ │ │ │ 0/action + │ │ │ │ │ │ │ access_pattern/ + │ │ │ │ │ │ │ │ sz/min,max + │ │ │ │ │ │ │ │ nr_accesses/min,max + │ │ │ │ │ │ │ │ age/min,max + │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms + │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil + │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low + │ │ │ │ │ │ │ filters/nr_filters + │ │ │ │ │ │ │ │ 0/type,matching,memcg_id + │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds + │ │ │ │ │ │ │ tried_regions/ + │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age + │ │ │ │ │ │ │ │ ... + │ │ │ │ │ │ ... + │ │ │ │ ... + │ │ ... + +Root +---- + +The root of the DAMON sysfs interface is ``/kernel/mm/damon/``, and it +has one directory named ``admin``. The directory contains the files for +privileged user space programs' control of DAMON. User space tools or deamons +having the root permission could use this directory. + +kdamonds/ +--------- + +The monitoring-related information including request specifications and results +are called DAMON context. DAMON executes each context with a kernel thread +called kdamond, and multiple kdamonds could run in parallel. + +Under the ``admin`` directory, one directory, ``kdamonds``, which has files for +controlling the kdamonds exist. In the beginning, this directory has only one +file, ``nr_kdamonds``. Writing a number (``N``) to the file creates the number +of child directories named ``0`` to ``N-1``. Each directory represents each +kdamond. + +kdamonds// +------------- + +In each kdamond directory, two files (``state`` and ``pid``) and one directory +(``contexts``) exist. + +Reading ``state`` returns ``on`` if the kdamond is currently running, or +``off`` if it is not running. Writing ``on`` or ``off`` makes the kdamond be +in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the +user inputs in the sysfs files except ``state`` file again. Writing +``update_schemes_stats`` to ``state`` file updates the contents of stats files +for each DAMON-based operation scheme of the kdamond. For details of the +stats, please refer to :ref:`stats section `. Writing +``update_schemes_tried_regions`` to ``state`` file updates the DAMON-based +operation scheme action tried regions directory for each DAMON-based operation +scheme of the kdamond. Writing ``clear_schemes_tried_regions`` to ``state`` +file clears the DAMON-based operating scheme action tried regions directory for +each DAMON-based operation scheme of the kdamond. For details of the +DAMON-based operation scheme action tried regions directory, please refer to +:ref:tried_regions section `. + +If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread. + +``contexts`` directory contains files for controlling the monitoring contexts +that this kdamond will execute. + +kdamonds//contexts/ +---------------------- + +In the beginning, this directory has only one file, ``nr_contexts``. Writing a +number (``N``) to the file creates the number of child directories named as +``0`` to ``N-1``. Each directory represents each monitoring context. 
At the +moment, only one context per kdamond is supported, so only ``0`` or ``1`` can +be written to the file. + +.. _sysfs_contexts: + +contexts// +------------- + +In each context directory, two files (``avail_operations`` and ``operations``) +and three directories (``monitoring_attrs``, ``targets``, and ``schemes``) +exist. + +DAMON supports multiple types of monitoring operations, including those for +virtual address space and the physical address space. You can get the list of +available monitoring operations set on the currently running kernel by reading +``avail_operations`` file. Based on the kernel configuration, the file will +list some or all of below keywords. + + - vaddr: Monitor virtual address spaces of specific processes + - fvaddr: Monitor fixed virtual address ranges + - paddr: Monitor the physical address space of the system + +Please refer to :ref:`regions sysfs directory ` for detailed +differences between the operations sets in terms of the monitoring target +regions. + +You can set and get what type of monitoring operations DAMON will use for the +context by writing one of the keywords listed in ``avail_operations`` file and +reading from the ``operations`` file. + +.. _sysfs_monitoring_attrs: + +contexts//monitoring_attrs/ +------------------------------ + +Files for specifying attributes of the monitoring including required quality +and efficiency of the monitoring are in ``monitoring_attrs`` directory. +Specifically, two directories, ``intervals`` and ``nr_regions`` exist in this +directory. + +Under ``intervals`` directory, three files for DAMON's sampling interval +(``sample_us``), aggregation interval (``aggr_us``), and update interval +(``update_us``) exist. You can set and get the values in micro-seconds by +writing to and reading from the files. + +Under ``nr_regions`` directory, two files for the lower-bound and upper-bound +of DAMON's monitoring regions (``min`` and ``max``, respectively), which +controls the monitoring overhead, exist. You can set and get the values by +writing to and rading from the files. + +For more details about the intervals and monitoring regions range, please refer +to the Design document (:doc:`/vm/damon/design`). + +contexts//targets/ +--------------------- + +In the beginning, this directory has only one file, ``nr_targets``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each monitoring target. + +targets// +------------ + +In each target directory, one file (``pid_target``) and one directory +(``regions``) exist. + +If you wrote ``vaddr`` to the ``contexts//operations``, each target should +be a process. You can specify the process to DAMON by writing the pid of the +process to the ``pid_target`` file. + +.. _sysfs_regions: + +targets//regions +------------------- + +When ``vaddr`` monitoring operations set is being used (``vaddr`` is written to +the ``contexts//operations`` file), DAMON automatically sets and updates the +monitoring target regions so that entire memory mappings of target processes +can be covered. However, users could want to set the initial monitoring region +to specific address ranges. + +In contrast, DAMON do not automatically sets and updates the monitoring target +regions when ``fvaddr`` or ``paddr`` monitoring operations sets are being used +(``fvaddr`` or ``paddr`` have written to the ``contexts//operations``). +Therefore, users should set the monitoring target regions by themselves in the +cases. 
+
+.. _sysfs_regions:
+
+targets//regions
+-------------------
+
+When the ``vaddr`` monitoring operations set is being used (``vaddr`` is
+written to the ``contexts//operations`` file), DAMON automatically sets and
+updates the monitoring target regions so that the entire memory mappings of
+the target processes can be covered.  However, users may want to set the
+initial monitoring regions to specific address ranges.
+
+In contrast, DAMON does not automatically set and update the monitoring target
+regions when the ``fvaddr`` or ``paddr`` monitoring operations sets are being
+used (``fvaddr`` or ``paddr`` has been written to the
+``contexts//operations`` file).  Therefore, users should set the monitoring
+target regions by themselves in these cases.
+
+For such cases, users can explicitly set the initial monitoring target regions
+as they want, by writing proper values to the files under this directory.
+
+In the beginning, this directory has only one file, ``nr_regions``.  Writing a
+number (``N``) to the file creates the number of child directories named ``0``
+to ``N-1``.  Each directory represents each initial monitoring target region.
+
+regions//
+------------
+
+In each region directory, you will find two files (``start`` and ``end``).
+You can set and get the start and end addresses of the initial monitoring
+target region by writing to and reading from the files, respectively.
+
+Each region should not overlap with others.  ``end`` of directory ``N`` should
+be equal to or smaller than ``start`` of directory ``N+1``.
+
+contexts//schemes/
+---------------------
+
+For usual DAMON-based data access aware memory management optimizations, users
+would normally want the system to apply a memory management action to a memory
+region of a specific access pattern.  DAMON receives such formalized operation
+schemes from the user and applies those to the target memory regions.  Users
+can get and set the schemes by reading from and writing to files under this
+directory.
+
+In the beginning, this directory has only one file, ``nr_schemes``.  Writing a
+number (``N``) to the file creates the number of child directories named ``0``
+to ``N-1``.  Each directory represents each DAMON-based operation scheme.
+
+schemes//
+------------
+
+In each scheme directory, six directories (``access_pattern``, ``quotas``,
+``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and one file
+(``action``) exist.
+
+The ``action`` file is for setting and getting what action you want to apply
+to memory regions having the specific access pattern of interest.  The
+keywords that can be written to and read from the file and their meanings are
+as below.
+
+Note that support of each action depends on the running DAMON operations set
+`implementation `.
+
+ - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``.
+   Supported by the ``vaddr`` and ``fvaddr`` operations sets.
+ - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``.
+   Supported by the ``vaddr`` and ``fvaddr`` operations sets.
+ - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
+   Supported by the ``vaddr``, ``fvaddr`` and ``paddr`` operations sets.
+ - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``.
+   Supported by the ``vaddr`` and ``fvaddr`` operations sets.
+ - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``.
+   Supported by the ``vaddr`` and ``fvaddr`` operations sets.
+ - ``lru_prio``: Prioritize the region on its LRU lists.
+   Supported by the ``paddr`` operations set.
+ - ``lru_deprio``: Deprioritize the region on its LRU lists.
+   Supported by the ``paddr`` operations set.
+ - ``stat``: Do nothing but count the statistics.
+   Supported by all operations sets.
+
+schemes//access_pattern/
+---------------------------
+
+The target access pattern of each DAMON-based operation scheme is constructed
+with three ranges: the size of the region in bytes, the number of monitored
+accesses per aggregation interval, and the number of aggregation intervals for
+the age of the region.
+
+Under the ``access_pattern`` directory, three directories (``sz``,
+``nr_accesses``, and ``age``), each having two files (``min`` and ``max``),
+exist.  You can set and get the access pattern for the given scheme by writing
+to and reading from the ``min`` and ``max`` files under the ``sz``,
+``nr_accesses``, and ``age`` directories, respectively.
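+
+For example, the below sketch sets the action and the size range of the first
+scheme of the first context.  The values and the directory indices are only
+illustrative::
+
+    # cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0
+    # echo stat > action
+    # echo 4096 > access_pattern/sz/min
+    # echo $((16*1024)) > access_pattern/sz/max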
+
+schemes//quotas/
+-------------------
+
+The optimal ``target access pattern`` for each ``action`` is workload
+dependent, so it is not easy to find.  Worse yet, setting a scheme's action
+too aggressively can cause severe overhead.  To avoid such overhead, users can
+limit the time and size quotas for each scheme.  In detail, users can ask
+DAMON to try to use only up to a specific amount of time (``time quota``) for
+applying the action, and to apply the action to only up to a specific amount
+(``size quota``) of memory regions having the target access pattern within a
+given time interval (``reset interval``).
+
+When the quota limit is expected to be exceeded, DAMON prioritizes the found
+memory regions of the ``target access pattern`` based on their size, access
+frequency, and age.  For personalized prioritization, users can set the
+weights for the three properties.
+
+Under the ``quotas`` directory, three files (``ms``, ``bytes``,
+``reset_interval_ms``) and one directory (``weights``) having three files
+(``sz_permil``, ``nr_accesses_permil``, and ``age_permil``) in it exist.
+
+You can set the ``time quota`` in milliseconds, the ``size quota`` in bytes,
+and the ``reset interval`` in milliseconds by writing the values to the three
+files, respectively.  You can also set the prioritization weights for size,
+access frequency, and age in per-thousand units by writing the values to the
+three files under the ``weights`` directory.
+
+schemes//watermarks/
+-----------------------
+
+To allow easy activation and deactivation of each scheme based on system
+status, DAMON provides a feature called watermarks.  The feature receives five
+values called ``metric``, ``interval``, ``high``, ``mid``, and ``low``.  The
+``metric`` is a system metric, such as the free memory ratio, that can be
+measured.  If the metric value of the system is higher than the value in
+``high`` or lower than ``low`` at the moment, the scheme is deactivated.  If
+the value is lower than ``mid``, the scheme is activated.
+
+Under the ``watermarks`` directory, five files (``metric``, ``interval_us``,
+``high``, ``mid``, and ``low``) for setting each value exist.  You can set and
+get the five values by writing to and reading from the files, respectively.
+
+Keywords and meanings of those that can be written to the ``metric`` file are
+as below.
+
+ - none: Ignore the watermarks
+ - free_mem_rate: System's free memory rate (per thousand)
+
+The ``interval`` should be written in microseconds.
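+
+Complementing the full example shown later in this document, the below sketch
+sets only the prioritization weights described above, so that the age of
+regions dominates the prioritization under quota pressure.  The permil values
+and the directory indices are only illustrative::
+
+    # cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0
+    # echo 100 > quotas/weights/sz_permil
+    # echo 100 > quotas/weights/nr_accesses_permil
+    # echo 800 > quotas/weights/age_permil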
+
+schemes//filters/
+--------------------
+
+Users could know more than the kernel about specific types of memory.  In that
+case, users may do their own management for the memory and hence would not
+want DAMOS to interfere with it.  Users could limit DAMOS by setting the
+access pattern of the scheme and/or the monitoring regions for this purpose,
+but that can be inefficient in some cases.  In such cases, users could set
+non-access pattern driven filters using the files in this directory.
+
+In the beginning, this directory has only one file, ``nr_filters``.  Writing a
+number (``N``) to the file creates the number of child directories named ``0``
+to ``N-1``.  Each directory represents each filter.  The filters are evaluated
+in numeric order.
+
+Each filter directory contains three files, namely ``type``, ``matching``, and
+``memcg_path``.  You can write one of two special keywords to the ``type``
+file: ``anon`` for anonymous pages, or ``memcg`` for specific memory cgroup
+filtering.  In case of the memory cgroup filtering, you can specify the memory
+cgroup of interest by writing the path of the memory cgroup from the cgroups
+mount point to the ``memcg_path`` file.  You can write ``Y`` or ``N`` to the
+``matching`` file to filter out pages that do or do not match the type,
+respectively.  Then, the scheme's action will not be applied to the pages that
+are specified to be filtered out.
+
+For example, the below commands restrict a DAMOS action to be applied to only
+non-anonymous pages of all memory cgroups except ``/having_care_already``::
+
+    # echo 2 > nr_filters
+    # # filter out anonymous pages
+    # echo anon > 0/type
+    # echo Y > 0/matching
+    # # further filter out all cgroups except one at '/having_care_already'
+    # echo memcg > 1/type
+    # echo /having_care_already > 1/memcg_path
+    # echo N > 1/matching
+
+Note that filters are currently supported only when the ``paddr``
+`implementation ` is being used.
+
+.. _sysfs_schemes_stats:
+
+schemes//stats/
+------------------
+
+DAMON counts the total number and bytes of the regions that each scheme has
+tried to be applied to, the two numbers for the regions that each scheme has
+successfully been applied to, and the total number of quota limit exceeds.
+These statistics can be used for online analysis or tuning of the schemes.
+
+The statistics can be retrieved by reading the files under the ``stats``
+directory (``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and
+``qt_exceeds``), respectively.  The files are not updated in real time, so you
+should ask the DAMON sysfs interface to update the content of the files for
+the stats by writing a special keyword, ``update_schemes_stats``, to the
+relevant ``kdamonds//state`` file.
+
+.. _sysfs_schemes_tried_regions:
+
+schemes//tried_regions/
+--------------------------
+
+When a special keyword, ``update_schemes_tried_regions``, is written to the
+relevant ``kdamonds//state`` file, DAMON creates directories named as
+integers starting from ``0`` under this directory.  Each directory contains
+files exposing detailed information about each memory region that the
+corresponding scheme's ``action`` has tried to be applied to during the next
+:ref:`aggregation interval `.  The information includes the address range,
+``nr_accesses``, and ``age`` of the region.
+
+The directories will be removed when another special keyword,
+``clear_schemes_tried_regions``, is written to the relevant
+``kdamonds//state`` file.
+
+tried_regions//
+------------------
+
+In each region directory, you will find four files (``start``, ``end``,
+``nr_accesses``, and ``age``).  Reading the files will show the start and end
+addresses, ``nr_accesses``, and ``age`` of the region that the corresponding
+DAMON-based operation scheme ``action`` has tried to be applied to.
+
+Example
+~~~~~~~
+
+The below commands apply a scheme saying "If a memory region of size in [4KiB,
+8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
+interval in [10, 20], page out the region.  For the paging out, use only up to
+10ms per second, and also don't page out more than 1GiB per second.  Under the
+limitation, page out memory regions having longer age first.  Also, check the
+free memory rate of the system every 5 seconds, start the monitoring and
+paging out when the free memory rate becomes lower than 50%, but stop it if
+the free memory rate becomes larger than 60%, or lower than 30%".
:: + + # cd /kernel/mm/damon/admin + # # populate directories + # echo 1 > kdamonds/nr_kdamonds; echo 1 > kdamonds/0/contexts/nr_contexts; + # echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes + # cd kdamonds/0/contexts/0/schemes/0 + # # set the basic access pattern and the action + # echo 4096 > access_pattern/sz/min + # echo 8192 > access_pattern/sz/max + # echo 0 > access_pattern/nr_accesses/min + # echo 5 > access_pattern/nr_accesses/max + # echo 10 > access_pattern/age/min + # echo 20 > access_pattern/age/max + # echo pageout > action + # # set quotas + # echo 10 > quotas/ms + # echo $((1024*1024*1024)) > quotas/bytes + # echo 1000 > quotas/reset_interval_ms + # # set watermark + # echo free_mem_rate > watermarks/metric + # echo 5000000 > watermarks/interval_us + # echo 600 > watermarks/high + # echo 500 > watermarks/mid + # echo 300 > watermarks/low + +Please note that it's highly recommended to use user space tools like `damo +`_ rather than manually reading and writing +the files as above. Above is only for an example. + +.. _debugfs_interface: + +debugfs Interface (DEPRECATED!) +=============================== + +.. note:: + + THIS IS DEPRECATED! + + DAMON debugfs interface is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. + +DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, +``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and +``rm_contexts`` under its debugfs directory, ``/damon/``. + + +Attributes +---------- + +Users can get and set the ``sampling interval``, ``aggregation interval``, +``update interval``, and min/max number of monitoring target regions by +reading from and writing to the ``attrs`` file. To know about the monitoring +attributes in detail, please refer to the :doc:`/vm/damon/design`. For +example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and +1000, and then check it again:: + + # cd /damon + # echo 5000 100000 1000000 10 1000 > attrs + # cat attrs + 5000 100000 1000000 10 1000 + + +Target IDs +---------- + +Some types of address spaces supports multiple monitoring target. For example, +the virtual memory address spaces monitoring can have multiple processes as the +monitoring targets. Users can set the targets by writing relevant id values of +the targets to, and get the ids of the current targets by reading from the +``target_ids`` file. In case of the virtual address spaces monitoring, the +values should be pids of the monitoring target processes. For example, below +commands set processes having pids 42 and 4242 as the monitoring targets and +check it again:: + + # cd /damon + # echo 42 4242 > target_ids + # cat target_ids + 42 4242 + +Users can also monitor the physical memory address space of the system by +writing a special keyword, "``paddr\n``" to the file. Because physical address +space monitoring doesn't support multiple targets, reading the file will show a +fake value, ``42``, as below:: + + # cd /damon + # echo paddr > target_ids + # cat target_ids + 42 + +Note that setting the target ids doesn't start the monitoring. + + +Initial Monitoring Target Regions +--------------------------------- + +In case of the virtual address space monitoring, DAMON automatically sets and +updates the monitoring target regions so that entire memory mappings of target +processes can be covered. 
However, users may want to limit the monitoring
+regions to specific address ranges, such as the heap, the stack, or specific
+file-mapped areas.  Or, some users could know the initial access pattern of
+their workloads and therefore want to set optimal initial regions for the
+'adaptive regions adjustment'.
+
+In contrast, DAMON does not automatically set and update the monitoring target
+regions in case of physical memory monitoring.  Therefore, users should set
+the monitoring target regions by themselves.
+
+In such cases, users can explicitly set the initial monitoring target regions
+as they want, by writing proper values to the ``init_regions`` file.  The
+input should be a sequence of three integers separated by white space that
+represent one region in below form.::
+
+
+
+The ``target idx`` should be the index of the target in the ``target_ids``
+file, starting from ``0``, and the regions should be passed in address order.
+For example, the below commands will set a couple of address ranges, ``1-100``
+and ``100-200``, as the initial monitoring target regions of pid 42, which is
+the first one (index ``0``) in ``target_ids``, and another couple of address
+ranges, ``20-40`` and ``50-100``, as those of pid 4242, which is the second
+one (index ``1``) in ``target_ids``.::
+
+    # cd /damon
+    # cat target_ids
+    42 4242
+    # echo "0 1 100 \
+            0 100 200 \
+            1 20 40 \
+            1 50 100" > init_regions
+
+Note that this sets the initial monitoring target regions only.  In case of
+virtual memory monitoring, DAMON will automatically update the boundary of the
+regions after one ``update interval``.  Therefore, users should set the
+``update interval`` large enough in this case, if they don't want the
+update.
+
+
+Schemes
+-------
+
+For usual DAMON-based data access aware memory management optimizations, users
+would simply want the system to apply a memory management action to a memory
+region of a specific access pattern.  DAMON receives such formalized operation
+schemes from the user and applies those to the target processes.
+
+Users can get and set the schemes by reading from and writing to the
+``schemes`` debugfs file.  Reading the file also shows the statistics of each
+scheme.  To the file, each of the schemes should be represented in each line
+in below form::
+
+
+
+You can disable schemes by simply writing an empty string to the file.
+
+Target Access Pattern
+~~~~~~~~~~~~~~~~~~~~~
+
+The ``target access pattern`` is constructed with three ranges in below
+form::
+
+    min-size max-size min-acc max-acc min-age max-age
+
+Specifically, bytes for the size of regions (``min-size`` and ``max-size``),
+number of monitored accesses per aggregate interval for access frequency
+(``min-acc`` and ``max-acc``), and number of aggregate intervals for the age
+of regions (``min-age`` and ``max-age``) are specified.  Note that the ranges
+are closed intervals.
+
+Action
+~~~~~~
+
+The ``action`` is a predefined integer for memory management actions, which
+DAMON will apply to the regions having the target access pattern.  The
+supported numbers and their meanings are as below.
+
+ - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``.  Ignored if
+   ``target`` is ``paddr``.
+ - 1: Call ``madvise()`` for the region with ``MADV_COLD``.  Ignored if
+   ``target`` is ``paddr``.
+ - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
+ - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``.  Ignored if
+   ``target`` is ``paddr``.
+ - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``.  Ignored if
+   ``target`` is ``paddr``.
+ - 5: Do nothing but count the statistics + +Quota +~~~~~ + +Optimal ``target access pattern`` for each ``action`` is workload dependent, so +not easy to find. Worse yet, setting a scheme of some action too aggressive +can cause severe overhead. To avoid such overhead, users can limit time and +size quota for the scheme via the ```` in below form:: + + + +This makes DAMON to try to use only up to ```` milliseconds for applying +the action to memory regions of the ``target access pattern`` within the +```` milliseconds, and to apply the action to only up to +```` bytes of memory regions within the ````. Setting both +```` and ```` zero disables the quota limits. + +When the quota limit is expected to be exceeded, DAMON prioritizes found memory +regions of the ``target access pattern`` based on their size, access frequency, +and age. For personalized prioritization, users can set the weights for the +three properties in ```` in below form:: + + + +Watermarks +~~~~~~~~~~ + +Some schemes would need to run based on current value of the system's specific +metrics like free memory ratio. For such cases, users can specify watermarks +for the condition.:: + + + +```` is a predefined integer for the metric to be checked. The +supported numbers and their meanings are as below. + + - 0: Ignore the watermarks + - 1: System's free memory rate (per thousand) + +The value of the metric is checked every ```` microseconds. + +If the value is higher than ```` or lower than ````, the +scheme is deactivated. If the value is lower than ````, the scheme +is activated. + +.. _damos_stats: + +Statistics +~~~~~~~~~~ + +It also counts the total number and bytes of regions that each scheme is tried +to be applied, the two numbers for the regions that each scheme is successfully +applied, and the total number of the quota limit exceeds. This statistics can +be used for online analysis or tuning of the schemes. + +The statistics can be shown by reading the ``schemes`` file. Reading the file +will show each scheme you entered in each line, and the five numbers for the +statistics will be added at the end of each line. + +Example +~~~~~~~ + +Below commands applies a scheme saying "If a memory region of size in [4KiB, +8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate +interval in [10, 20], page out the region. For the paging out, use only up to +10ms per second, and also don't page out more than 1GiB per second. Under the +limitation, page out memory regions having longer age first. Also, check the +free memory rate of the system every 5 seconds, start the monitoring and paging +out when the free memory rate becomes lower than 50%, but stop it if the free +memory rate becomes larger than 60%, or lower than 30%".:: + + # cd /damon + # scheme="4096 8192 0 5 10 20 2" # target access pattern and action + # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas + # scheme+=" 0 0 100" # prioritization weights + # scheme+=" 1 5000000 600 500 300" # watermarks + # echo "$scheme" > schemes + + +Turning On/Off +-------------- + +Setting the files as described above doesn't incur effect unless you explicitly +start the monitoring. You can start, stop, and check the current status of the +monitoring by writing to and reading from the ``monitor_on`` file. Writing +``on`` to the file starts the monitoring of the targets with the attributes. +Writing ``off`` to the file stops those. DAMON also stops if every target +process is terminated. 
Below example commands turn on, off, and check the +status of DAMON:: + + # cd /damon + # echo on > monitor_on + # echo off > monitor_on + # cat monitor_on + off + +Please note that you cannot write to the above-mentioned debugfs files while +the monitoring is turned on. If you write to the files while DAMON is running, +an error code such as ``-EBUSY`` will be returned. + + +Monitoring Thread PID +--------------------- + +DAMON does requested monitoring with a kernel thread called ``kdamond``. You +can get the pid of the thread by reading the ``kdamond_pid`` file. When the +monitoring is turned off, reading the file returns ``none``. :: + + # cd /damon + # cat monitor_on + off + # cat kdamond_pid + none + # echo on > monitor_on + # cat kdamond_pid + 18594 + + +Using Multiple Monitoring Threads +--------------------------------- + +One ``kdamond`` thread is created for each monitoring context. You can create +and remove monitoring contexts for multiple ``kdamond`` required use case using +the ``mk_contexts`` and ``rm_contexts`` files. + +Writing the name of the new context to the ``mk_contexts`` file creates a +directory of the name on the DAMON debugfs directory. The directory will have +DAMON debugfs files for the context. :: + + # cd /damon + # ls foo + # ls: cannot access 'foo': No such file or directory + # echo foo > mk_contexts + # ls foo + # attrs init_regions kdamond_pid schemes target_ids + +If the context is not needed anymore, you can remove it and the corresponding +directory by putting the name of the context to the ``rm_contexts`` file. :: + + # echo foo > rm_contexts + # ls foo + # ls: cannot access 'foo': No such file or directory + +Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the +root directory only. + + +.. _tracepoint: + +Tracepoint for Monitoring Results +================================= + +DAMON provides the monitoring results via a tracepoint, +``damon:damon_aggregated``. While the monitoring is turned on, you could +record the tracepoint events and show results using tracepoint supporting tools +like ``perf``. For example:: + + # echo on > monitor_on + # perf record -e damon:damon_aggregated & + # sleep 5 + # kill 9 $(pidof perf) + # echo off > monitor_on + # perf script diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index cd727cfc1b040..32c27fbf1913c 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -27,6 +27,7 @@ the Linux memory management. concepts cma_debugfs + damon/index hugetlbpage idle_page_tracking ksm diff --git a/Documentation/misc-devices/sysgenid.rst b/Documentation/misc-devices/sysgenid.rst new file mode 100644 index 0000000000000..0b8199b8d5163 --- /dev/null +++ b/Documentation/misc-devices/sysgenid.rst @@ -0,0 +1,229 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======== +SYSGENID +======== + +The System Generation ID feature is required in virtualized or +containerized environments by applications that work with local copies +or caches of world-unique data such as random values, UUIDs, +monotonically increasing counters, etc. +Such applications can be negatively affected by VM or container +snapshotting when the VM or container is either cloned or returned to +an earlier point in time. + +The System Generation ID is meant to help in these scenarios by +providing a monotonically increasing counter that changes each time the +VM or container is restored from a snapshot. The driver for it lives at +``drivers/misc/sysgenid.c``. 
+ +The ``sysgenid`` driver exposes a monotonic incremental System +Generation u32 counter via a char-dev filesystem interface accessible +through ``/dev/sysgenid`` that provides sync and async SysGen counter +update notifications. It also provides SysGen counter retrieval and +confirmation mechanisms. + +The counter starts from zero when the driver is initialized and +monotonically increments every time the system generation changes. + +The ``sysgenid`` driver exports the ``void sysgenid_bump_generation()`` +symbol which can be used by backend drivers to drive system generation +changes based on hardware events. +System generation changes can also be driven by userspace software +through a dedicated driver ioctl. + +Userspace applications or libraries can (a)synchronously consume the +system generation counter through the provided filesystem interface, to +make any necessary internal adjustments following a system generation +update. + +**Please note**, SysGenID alone does not guarantee complete snapshot +safety to applications using it. A certain workflow needs to be +followed at the system level, in order to make the system +snapshot-resilient. Please see the "Snapshot Safety Prerequisites" +section below. + +Driver filesystem interface +=========================== + +``open()``: + When the device is opened, a copy of the current SysGenID (counter) + is associated with the open file descriptor. Every open file + descriptor will have readable data available (EPOLLIN) while its + current copy of the SysGenID is outdated. Reading from the fd will + provide the latest SysGenID, while writing to the fd will update the + fd-local copy of the SysGenID and is used as a confirmation + mechanism. + +``read()``: + Read is meant to provide the *new* system generation counter when a + generation change takes place. The read operation blocks until the + associated counter is no longer up to date, at which point the new + counter is provided/returned. Nonblocking ``read()`` returns + ``EAGAIN`` to signal that there is no *new* counter value available. + The generation counter is considered *new* for each open file + descriptor that hasn't confirmed the new value following a generation + change. Therefore, once a generation change takes place, all + ``read()`` calls will immediately return the new generation counter + and will continue to do so until the new value is confirmed back to + the driver through ``write()``. + Partial reads are not allowed - read buffer needs to be at least + 32 bits in size. + +``write()``: + Write is used to confirm the up-to-date SysGenID counter back to the + driver. + Following a VM generation change, all existing watchers are marked + as *outdated*. Each file descriptor will maintain the *outdated* + status until a ``write()`` containing the new up-to-date generation + counter is used as an update confirmation mechanism. + Partial writes are not allowed - write buffer should be exactly + 32 bits in size. + +``poll()``: + Poll is implemented to allow polling for generation counter updates. + Such updates result in ``EPOLLIN`` polling status until the new + up-to-date counter is confirmed back to the driver through a + ``write()``. + +``ioctl()``: + The driver also adds support for waiting on open file descriptors + that haven't acknowledged a generation counter update, as well as a + mechanism for userspace to *trigger* a generation update: + + - SYSGENID_SET_WATCHER_TRACKING: takes a bool argument to set tracking + status for current file descriptor. 
When watcher tracking is + enabled, the driver tracks this file descriptor as an independent + *watcher*. The driver keeps accounting of how many watchers have + confirmed the latest Sys-Gen-Id counter and how many of them are + *outdated*; an outdated watcher is a *tracked* open file descriptor + that has lived through a Sys-Gen-Id change but has not yet confirmed + the new generation counter. + Software that wants to be waited on by the system while it adjusts + to generation changes, should turn tracking on. The sysgenid driver + then keeps track of it and can block system-level adjustment process + until the software has finished adjusting and confirmed it through a + ``write()``. + Tracking is disabled by default and file descriptors need to + explicitly opt-in using this IOCTL. + - SYSGENID_WAIT_WATCHERS: blocks until there are no more *outdated* + tracked watchers or, if a ``timeout`` argument is provided, until + the timeout expires. + If the current caller is *outdated* or a generation change happens + while waiting (thus making current caller *outdated*), the ioctl + returns ``-EINTR`` to signal the user to handle event and retry. + - SYSGENID_TRIGGER_GEN_UPDATE: triggers a generation counter increment. + It takes a ``minimum-generation`` argument which represents the + minimum value the generation counter will be set to. For example if + current generation is ``5`` and ``SYSGENID_TRIGGER_GEN_UPDATE(8)`` + is called, the generation counter will increment to ``8``. + This IOCTL can only be used by processes with CAP_CHECKPOINT_RESTORE + or CAP_SYS_ADMIN capabilities. + +``mmap()``: + The driver supports ``PROT_READ, MAP_SHARED`` mmaps of a single page + in size. The first 4 bytes of the mapped page will contain an + up-to-date u32 copy of the system generation counter. + The mapped memory can be used as a low-latency generation counter + probe mechanism in critical sections. + The mmap() interface is targeted at libraries or code that needs to + check for generation changes in-line, where an event loop is not + available or read()/write() syscalls are too expensive. + In such cases, logic can be added in-line with the sensitive code to + check and trigger on-demand/just-in-time readjustments when changes + are detected on the memory mapped generation counter. + Users of this interface that plan to lazily adjust should not enable + watcher tracking, since waiting on them doesn't make sense. + +``close()``: + Removes the file descriptor as a system generation counter *watcher*. + +Snapshot Safety Prerequisites +============================= + +If VM, container or other system-level snapshots happen asynchronously, +at arbitrary times during an active workload there is no practical way +to ensure that in-flight local copies or caches of world-unique data +such as random values, secrets, UUIDs, etc are properly scrubbed and +regenerated. +The challenge stems from the fact that the categorization of data as +snapshot-sensitive is only known to the software working with it, and +this software has no logical control over the moment in time when an +external system snapshot occurs. + +Let's take an OpenSSL session token for example. 
Even if the library
+code is made 100% snapshot-safe, meaning the library guarantees that
+the session token is unique (any snapshot that happened during the
+library call did not duplicate or leak the token), the token is still
+vulnerable to snapshot events while it transits the various layers of
+the library caller, then the various layers of the OS, before leaving
+the system.
+
+To catch a secret while it's in-flight, we'd have to validate the system
+generation at every layer, every step of the way.  Even if that were
+deemed the right solution, it would be a long road and a whole
+universe to patch before we get there.
+
+The bottom line is that we don't have a way to track all of these
+in-flight secrets and dynamically scrub them from existence with
+snapshot events happening arbitrarily.
+
+Simplifying assumption - safety prerequisite
+--------------------------------------------
+
+**Control the snapshot flow**, disallow snapshots coming at arbitrary
+moments in the workload lifetime.
+
+Use a system-level overseer entity that quiesces the system before a
+snapshot, and, post-snapshot-resume, oversees that software components
+have readjusted to the new environment, to the new generation.  Only
+after that will the overseer un-quiesce the system and allow active
+workloads.
+
+Software components can choose whether they want to be tracked and
+waited on by the overseer by using the ``SYSGENID_SET_WATCHER_TRACKING``
+IOCTL.
+
+The sysgenid framework standardizes the API for system software to
+find out about needing to readjust, and at the same time provides a
+mechanism for the overseer entity to wait for everyone to be done and
+the system to have readjusted, so it can un-quiesce.
+
+Example snapshot-safe workflow
+------------------------------
+
+1) Before taking a snapshot, quiesce the VM/container/system.  Exactly
+   how this is achieved is very workload-specific, but the general
+   description is to get all software to an expected state where their
+   event loops dry up and they are effectively quiesced.
+2) Take snapshot.
+3) Resume the VM/container/system from said snapshot.
+4) The SysGenID counter will either automatically increment if there is
+   a vmgenid backend (hw-driven), or the overseer will trigger a
+   generation bump using the ``SYSGENID_TRIGGER_GEN_UPDATE`` IOCTL
+   (sw-driven).
+5) Software components which have ``/dev/sysgenid`` in their event
+   loops (either using ``poll()`` or ``read()``) are notified of the
+   generation change.
+   They do their specific internal adjustments.  Some may have requested
+   to be tracked and waited on by the overseer, others might choose to
+   do their adjustments out of band and not block the overseer.
+   Tracked ones *must* signal when they are done/ready with a ``write()``
+   while the rest *should* also do so for cleanliness, but it's not
+   mandatory.
+6) The overseer will block and wait for all tracked watchers by using the
+   ``SYSGENID_WAIT_WATCHERS`` IOCTL.  Once all tracked watchers are done
+   in step 5, the overseer will return from this blocking ioctl knowing
+   that the system has readjusted and is ready for the active workload.
+7) The overseer un-quiesces the system.
+8) There is a class of software, usually libraries, most notably PRNGs
+   or SSLs, that don't fit the event-loop model and also have strict
+   latency requirements.  These can take advantage of the ``mmap()``
+   interface and lazily adjust on-demand whenever they are called after
+   un-quiesce.
+   For a well-designed service stack, these libraries should not be
+   called while the system is quiesced.
When workload is resumed by the + overseer, on the first call into these libs, they will safely JIT + readjust. + Users of this lazy on-demand readjustment model should not enable + watcher tracking since doing so would introduce a logical deadlock: + lazy adjustments happen only after un-quiesce, but un-quiesce is + blocked until all tracked watchers are up-to-date. diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 252212998378e..8b3031e3ca174 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -949,6 +949,64 @@ tcp_rx_skb_cache - BOOLEAN Default: 0 (disabled) +tcp_plb_enabled - BOOLEAN + If set, TCP PLB (Protective Load Balancing) is enabled. PLB is + described in the following paper: + https://doi.org/10.1145/3544216.3544226. Based on PLB parameters, + upon sensing sustained congestion, TCP triggers a change in + flow label field for outgoing IPv6 packets. A change in flow label + field potentially changes the path of outgoing packets for switches + that use ECMP/WCMP for routing. + + Default: 0 + +tcp_plb_cong_thresh - INTEGER + Fraction of packets marked with congestion over a round (RTT) to + tag that round as congested. This is referred to as K in the PLB paper: + https://doi.org/10.1145/3544216.3544226. + + The 0-1 fraction range is mapped to 0-256 range to avoid floating + point operations. For example, 128 means that if at least 50% of + the packets in a round were marked as congested then the round + will be tagged as congested. + + Possible Values: 0 - 256 + + Default: 128 + +tcp_plb_idle_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a rehash can be performed, given there are no packets in flight. + This is referred to as M in PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 3 + +tcp_plb_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a forced rehash can be performed. Be careful when setting this + parameter, as a small value increases the risk of retransmissions. + This is referred to as N in PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 12 + +tcp_plb_suspend_rto_sec - INTEGER + Time, in seconds, to suspend PLB in event of an RTO. In order to avoid + having PLB repath onto a connectivity "black hole", after an RTO a TCP + connection suspends PLB repathing for a random duration between 1x and + 2x of this parameter. Randomness is added to avoid concurrent rehashing + of multiple TCP connections. This should be set corresponding to the + amount of time it takes to repair a failed link. + + Possible Values: 0 - 255 + + Default: 60 + UDP variables ============= diff --git a/Documentation/scsi/mpi3mr.rst b/Documentation/scsi/mpi3mr.rst new file mode 100644 index 0000000000000..6b39a91093198 --- /dev/null +++ b/Documentation/scsi/mpi3mr.rst @@ -0,0 +1,38 @@ +This file lists the module parameters supported by the mpi3mr driver and their use. + +poll_queues: Number of queues for io_uring poll mode (allowed values: 0 to 126, default=0). +The mpi3mr driver supports io_uring in the kernel versions >=5.13 and this module parameter allows the user to specify 1 or more queues to be designated as poll_queues. +The poll_queues are disabled by default and this value is set to 0 by default. 
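+
+For example, a possible way to designate a few poll queues is to set the
+parameter at module load time or on the kernel command line.  The value ``4``
+below is only an illustration::
+
+    # modprobe mpi3mr poll_queues=4
+
+or, equivalently, on the kernel command line::
+
+    mpi3mr.poll_queues=4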
+
+enable_segqueue: Enable segmented operational request & reply queues in the supported controllers (allowed values: 0 and 1, default=1)
+Certain controllers managed by the mpi3mr driver can support the creation of operational request and reply queues with non-contiguous (segmented) memory for the queues.
+This option, when set to 1, allows the driver to create the queues with segmented memory in the supported controllers.
+This option, when set to 0, allows the driver to create the queues with contiguous memory.
+In the controllers that do not support segmented queue creation, irrespective of this module parameter value, the driver always uses contiguous memory for queue creation.
+
+drv_dbg_level: Driver diagnostic buffer level (allowed values: 0, 1 and 2, default=1).
+The mpi3mr driver supports saving some of the driver/kernel log messages from the dmesg log into the controller's persistent memory when certain fault conditions occur in the controller.
+This feature helps to save information that could otherwise get lost in cases like the OS drive being present behind a controller which gets into fault.
+This option, when set to
+0 (disabled): disables the saving of messages into the controller's persistent memory.
+1 (minidump): captures the prints related to the specific controller instance that is faulting, up to the available persistent memory size.
+2 (fulldump): captures the minidump and in addition captures the complete dmesg logs, up to the available persistent memory size.
+
+logging_level: Enable additional debug prints in the driver (allowed values: 0 to 0x7fffffff, default=0)
+The mpi3mr driver logs only mandatorily required information by default to avoid cluttering the kernel log.
+The additional debug logging prints can be dynamically enabled by providing the logging level through this module parameter or by dynamically changing the logging level value through sysfs on a per controller basis.
+The logging level set through the module parameter will be applicable to all the controllers managed by the driver.
+To turn off the additional logging, the logging level has to be set to 0.
+The logging level is a bitmap and the individual values can be found in the "mpi3mr_debug.h" file.  Setting this value to 0xFFFF will turn on pertinent logs required to support debugging many generic issues.
+
+enable_dif: Enable Data Integrity Format (DIF) for the supported drives (allowed values: 0 and 1, default=1)
+The controllers managed by the mpi3mr driver are capable of generating, checking, and removing Protection Information (PI) for the drives which support DIF.
+The driver by default enables the feature in the controller and lets the kernel know that the driver and controller are capable of doing the PI generation and checking.
+When this parameter is set to 0, the driver will inform the kernel that the driver and controllers are not capable of supporting DIF.
+
+enable_dix: Enable Data Integrity Extension (DIX) for the supported drives (allowed values: 0 and 1, default=0)
+The controllers managed by the mpi3mr driver and the driver are capable of passing the Protection Information (PI) from the upper layers in the operating system for the DIF supported drives.
+By default, this capability is disabled in the controller and not exposed to the OS by the driver.
+When this parameter is set to 1, the driver will inform the kernel about the DIX capabilities supported by the driver and controller and will handle the I/O requests sent with PI.
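+
+As a sketch of runtime inspection, module parameters are normally visible
+under ``/sys/module/mpi3mr/parameters/``; whether a given parameter (for
+example ``logging_level``) can also be changed there depends on the permission
+bits the driver assigns to it.  The value below is only an illustration::
+
+    # cat /sys/module/mpi3mr/parameters/logging_level
+    # echo 0xFFFF > /sys/module/mpi3mr/parameters/logging_level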
+ + diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index a7373d4e3984c..16efa0199c8df 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -354,6 +354,7 @@ Code Seq# Include File Comments 0xDB 00-0F drivers/char/mwave/mwavepub.h 0xDD 00-3F ZFCP device driver see drivers/s390/scsi/ +0xE4 01-03 uapi/linux/sysgenid.h SysGenID misc driver 0xE5 00-3F linux/fuse.h 0xEC 00-01 drivers/platform/chrome/cros_ec_dev.h ChromeOS EC driver 0xF3 00-3F drivers/usb/misc/sisusbvga/sisusb.h sisfb (in development) diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index 7ddd8f667459b..5e8ed9eef9aa8 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -106,3 +106,11 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); + +- PR_SPEC_L1D_FLUSH: Flush L1D Cache on context switch out of the task + (works only when tasks run on non SMT cores) + + Invocations: + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, 0, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_ENABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_DISABLE, 0, 0); diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 08295f488d057..02f821ca63c66 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6405,6 +6405,15 @@ guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf (0x40000001). Otherwise, a guest may use the paravirtual features regardless of what has actually been exposed through the CPUID leaf. +8.29 KVM_CAP_PTP_KVM +-------------------- + +:Architectures: arm64 + +This capability indicates that the KVM virtual PTP service is +supported in the host. A VMM can check whether the service is +available to the guest on migration. + 9. Known KVM API problems ========================= diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 3e2b2aba90fcc..78a9b670aafee 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -10,3 +10,4 @@ ARM hyp-abi psci pvtime + ptp_kvm diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst new file mode 100644 index 0000000000000..68cffb50d8bf0 --- /dev/null +++ b/Documentation/virt/kvm/arm/ptp_kvm.rst @@ -0,0 +1,25 @@ +.. SPDX-License-Identifier: GPL-2.0 + +PTP_KVM support for arm/arm64 +============================= + +PTP_KVM is used for high precision time sync between host and guests. +It relies on transferring the wall clock and counter value from the +host to the guest using a KVM-specific hypercall. 
+ +* ARM_SMCCC_HYP_KVM_PTP_FUNC_ID: 0x86000001 + +This hypercall uses the SMC32/HVC32 calling convention: + +ARM_SMCCC_HYP_KVM_PTP_FUNC_ID + ============= ========== ========== + Function ID: (uint32) 0x86000001 + Arguments: (uint32) KVM_PTP_VIRT_COUNTER(0) + KVM_PTP_PHYS_COUNTER(1) + Return Values: (int32) NOT_SUPPORTED(-1) on error, or + (uint32) Upper 32 bits of wall clock time (r0) + (uint32) Lower 32 bits of wall clock time (r1) + (uint32) Upper 32 bits of counter (r2) + (uint32) Lower 32 bits of counter (r3) + Endianness: No Restrictions. + ============= ========== ========== diff --git a/Documentation/virt/ne_overview.rst b/Documentation/virt/ne_overview.rst index 39b0c8fe2654a..74c2f5919c886 100644 --- a/Documentation/virt/ne_overview.rst +++ b/Documentation/virt/ne_overview.rst @@ -14,12 +14,15 @@ instances [1]. For example, an application that processes sensitive data and runs in a VM, can be separated from other applications running in the same VM. This application then runs in a separate VM than the primary VM, namely an enclave. +It runs alongside the VM that spawned it. This setup matches low latency +applications needs. -An enclave runs alongside the VM that spawned it. This setup matches low latency -applications needs. The resources that are allocated for the enclave, such as -memory and CPUs, are carved out of the primary VM. Each enclave is mapped to a -process running in the primary VM, that communicates with the NE driver via an -ioctl interface. +The current supported architectures for the NE kernel driver, available in the +upstream Linux kernel, are x86 and ARM64. + +The resources that are allocated for the enclave, such as memory and CPUs, are +carved out of the primary VM. Each enclave is mapped to a process running in the +primary VM, that communicates with the NE kernel driver via an ioctl interface. In this sense, there are two components: @@ -43,8 +46,8 @@ for the enclave VM. An enclave does not have persistent storage attached. The memory regions carved out of the primary VM and given to an enclave need to be aligned 2 MiB / 1 GiB physically contiguous memory regions (or multiple of this size e.g. 8 MiB). The memory can be allocated e.g. by using hugetlbfs from -user space [2][3]. The memory size for an enclave needs to be at least 64 MiB. -The enclave memory and CPUs need to be from the same NUMA node. +user space [2][3][7]. The memory size for an enclave needs to be at least +64 MiB. The enclave memory and CPUs need to be from the same NUMA node. An enclave runs on dedicated cores. CPU 0 and its CPU siblings need to remain available for the primary VM. A CPU pool has to be set for NE purposes by an @@ -61,7 +64,7 @@ device is placed in memory below the typical 4 GiB. The application that runs in the enclave needs to be packaged in an enclave image together with the OS ( e.g. kernel, ramdisk, init ) that will run in the enclave VM. The enclave VM has its own kernel and follows the standard Linux -boot protocol [6]. +boot protocol [6][8]. The kernel bzImage, the kernel command line, the ramdisk(s) are part of the Enclave Image Format (EIF); plus an EIF header including metadata such as magic @@ -93,3 +96,5 @@ enclave process can exit. 
[4] https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html [5] https://man7.org/linux/man-pages/man7/vsock.7.html [6] https://www.kernel.org/doc/html/latest/x86/boot.html +[7] https://www.kernel.org/doc/html/latest/arm64/hugetlbpage.html +[8] https://www.kernel.org/doc/html/latest/arm64/booting.html diff --git a/Documentation/virt/vmgenid.rst b/Documentation/virt/vmgenid.rst new file mode 100644 index 0000000000000..a429c2a347ef3 --- /dev/null +++ b/Documentation/virt/vmgenid.rst @@ -0,0 +1,36 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======= +VMGENID +======= + +The VM Generation ID is a feature defined by Microsoft (paper: +http://go.microsoft.com/fwlink/?LinkId=260709) and supported by +multiple hypervisor vendors. + +The feature is required in virtualized environments by applications +that work with local copies/caches of world-unique data such as random +values, UUIDs, monotonically increasing counters, etc. +Such applications can be negatively affected by VM snapshotting when +the VM is either cloned or returned to an earlier point in time. + +The VM Generation ID is a simple concept through which a hypevisor +notifies its guest that a snapshot has taken place. The vmgenid device +provides a unique ID that changes each time the VM is restored from a +snapshot. The hardware provided UUID value can be used to differentiate +between VMs or different generations of the same VM. + +The VM Generation ID is exposed through an ACPI device by multiple +hypervisor vendors. The driver for it lives at +``drivers/virt/vmgenid.c`` + +The ``vmgenid`` driver acts as a backend for the ``sysgenid`` kernel module +(``drivers/misc/sysgenid.c``, ``Documentation/misc-devices/sysgenid.rst``) +to drive changes to the "System Generation Id" which is further exposed +to userspace as a monotonically increasing counter. + +The driver uses ACPI events to be notified by hardware of changes to the +128-bit Vm Gen Id UUID. Since the actual UUID value is not directly exposed +to userspace, but only used to drive the System Generation Counter, the +driver also adds it as device randomness to improve kernel entropy +following VM snapshot events. diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst new file mode 100644 index 0000000000000..08f34df45523a --- /dev/null +++ b/Documentation/vm/damon/api.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +API Reference +============= + +Kernel space programs can use every feature of DAMON using below APIs. All you +need to do is including ``damon.h``, which is located in ``include/linux/`` of +the source tree. + +Structures +========== + +.. kernel-doc:: include/linux/damon.h + + +Functions +========= + +.. kernel-doc:: mm/damon/core.c diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst new file mode 100644 index 0000000000000..0cff6fac6b7e8 --- /dev/null +++ b/Documentation/vm/damon/design.rst @@ -0,0 +1,176 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +Design +====== + +Configurable Layers +=================== + +DAMON provides data access monitoring functionality while making the accuracy +and the overhead controllable. The fundamental access monitorings require +primitives that dependent on and optimized for the target address space. On +the other hand, the accuracy and overhead tradeoff mechanism, which is the core +of DAMON, is in the pure logic space. 
DAMON separates the two parts in +different layers and defines its interface to allow various low level +primitives implementations configurable with the core logic. We call the low +level primitives implementations monitoring operations. + +Due to this separated design and the configurable interface, users can extend +DAMON for any address space by configuring the core logics with appropriate +monitoring operations. If appropriate one is not provided, users can implement +the operations on their own. + +For example, physical memory, virtual memory, swap space, those for specific +processes, NUMA nodes, files, and backing memory devices would be supportable. +Also, if some architectures or devices support special optimized access check +primitives, those will be easily configurable. + + +Reference Implementations of Address Space Specific Monitoring Operations +========================================================================= + +The monitoring operations are defined in two parts: + +1. Identification of the monitoring target address range for the address space. +2. Access check of specific address range in the target space. + +DAMON currently provides the implementations of the operations for the physical +and virtual address spaces. Below two subsections describe how those work. + + +VMA-based Target Address Range Construction +------------------------------------------- + +This is only for the virtual address space monitoring operations +implementation. That for the physical address space simply asks users to +manually set the monitoring target address ranges. + +Only small parts in the super-huge virtual address space of the processes are +mapped to the physical memory and accessed. Thus, tracking the unmapped +address regions is just wasteful. However, because DAMON can deal with some +level of noise using the adaptive regions adjustment mechanism, tracking every +mapping is not strictly required but could even incur a high overhead in some +cases. That said, too huge unmapped areas inside the monitoring target should +be removed to not take the time for the adaptive mechanism. + +For the reason, this implementation converts the complex mappings to three +distinct regions that cover every mapped area of the address space. The two +gaps between the three regions are the two biggest unmapped areas in the given +address space. The two biggest unmapped areas would be the gap between the +heap and the uppermost mmap()-ed region, and the gap between the lowermost +mmap()-ed region and the stack in most of the cases. Because these gaps are +exceptionally huge in usual address spaces, excluding these will be sufficient +to make a reasonable trade-off. Below shows this in detail:: + + + + + (small mmap()-ed regions and munmap()-ed regions) + + + + + +PTE Accessed-bit Based Access Check +----------------------------------- + +Both of the implementations for physical and virtual address spaces use PTE +Accessed-bit for basic access checks. Only one difference is the way of +finding the relevant PTE Accessed bit(s) from the address. While the +implementation for the virtual address walks the page table for the target task +of the address, the implementation for the physical address walks every page +table having a mapping to the address. In this way, the implementations find +and clear the bit(s) for next sampling target address and checks whether the +bit(s) set again after one sampling period. 
This could disturb other kernel +subsystems using the Accessed bits, namely Idle page tracking and the reclaim +logic. DAMON does nothing to avoid disturbing Idle page tracking, so handling +the interference is the responsibility of sysadmins. However, it solves the +conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, +as Idle page tracking does. + + +Address Space Independent Core Mechanisms +========================================= + +Below four sections describe each of the DAMON core mechanisms and the five +monitoring attributes, ``sampling interval``, ``aggregation interval``, +``update interval``, ``minimum number of regions``, and ``maximum number of +regions``. + + +Access Frequency Monitoring +--------------------------- + +The output of DAMON says what pages are how frequently accessed for a given +duration. The resolution of the access frequency is controlled by setting +``sampling interval`` and ``aggregation interval``. In detail, DAMON checks +access to each page per ``sampling interval`` and aggregates the results. In +other words, counts the number of the accesses to each page. After each +``aggregation interval`` passes, DAMON calls callback functions that previously +registered by users so that users can read the aggregated results and then +clears the results. This can be described in below simple pseudo-code:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling interval) + +The monitoring overhead of this mechanism will arbitrarily increase as the +size of the target workload grows. + + +Region Based Sampling +--------------------- + +To avoid the unbounded increase of the overhead, DAMON groups adjacent pages +that assumed to have the same access frequencies into a region. As long as the +assumption (pages in a region have the same access frequencies) is kept, only +one page in the region is required to be checked. Thus, for each ``sampling +interval``, DAMON randomly picks one page in each region, waits for one +``sampling interval``, checks whether the page is accessed meanwhile, and +increases the access frequency of the region if so. Therefore, the monitoring +overhead is controllable by setting the number of regions. DAMON allows users +to set the minimum and the maximum number of regions for the trade-off. + +This scheme, however, cannot preserve the quality of the output if the +assumption is not guaranteed. + + +Adaptive Regions Adjustment +--------------------------- + +Even somehow the initial monitoring target regions are well constructed to +fulfill the assumption (pages in same region have similar access frequencies), +the data access pattern can be dynamically changed. This will result in low +monitoring quality. To keep the assumption as much as possible, DAMON +adaptively merges and splits each region based on their access frequency. + +For each ``aggregation interval``, it compares the access frequencies of +adjacent regions and merges those if the frequency difference is small. Then, +after it reports and clears the aggregated access frequency of each region, it +splits each region into two or three regions if the total number of regions +will not exceed the user-specified maximum number of regions after the split. 
+
+In this way, DAMON provides its best-effort quality and minimal overhead while
+keeping the bounds users set for their trade-off.
+
+
+Dynamic Target Space Updates Handling
+-------------------------------------
+
+The monitoring target address range could be dynamically changed. For example,
+virtual memory could be dynamically mapped and unmapped. Physical memory could
+be hot-plugged.
+
+As the changes could be quite frequent in some cases, DAMON allows the
+monitoring operations to check for dynamic changes, including memory mapping
+changes, and applies them to the monitoring operations-related data structures,
+such as the abstracted monitoring target memory area, only once per
+user-specified time interval (``update interval``).
diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst
new file mode 100644
index 0000000000000..dde7e2414ee60
--- /dev/null
+++ b/Documentation/vm/damon/faq.rst
@@ -0,0 +1,50 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Frequently Asked Questions
+==========================
+
+Why a new subsystem, instead of extending perf or other user space tools?
+=========================================================================
+
+First, because it needs to be as lightweight as possible so that it can be used
+online, any unnecessary overhead such as the kernel - user space context
+switching cost should be avoided. Second, DAMON aims to be used by other
+programs including the kernel. Therefore, having a dependency on specific
+tools like perf is not desirable. These are the two biggest reasons why DAMON
+is implemented in the kernel space.
+
+
+Can 'idle pages tracking' or 'perf mem' substitute DAMON?
+=========================================================
+
+Idle page tracking is a low level primitive for access checks of the physical
+address space. 'perf mem' is similar, though it can use sampling to minimize
+the overhead. On the other hand, DAMON is a higher-level framework for the
+monitoring of various address spaces. It is focused on memory management
+optimization and provides sophisticated accuracy/overhead handling mechanisms.
+Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of
+DAMON's output, but cannot substitute DAMON.
+
+
+Does DAMON support virtual memory only?
+=======================================
+
+No. The core of DAMON is address space independent. The address space
+specific monitoring operations, including monitoring target region construction
+and actual access checks, can be implemented and configured on the DAMON core
+by the users. In this way, DAMON users can monitor any address space with any
+access check technique.
+
+Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
+implementations of the address space dependent functions for the virtual memory
+and the physical memory by default, for reference and convenient use.
+
+
+Can I simply monitor page granularity?
+======================================
+
+Yes. You can do so by setting the ``min_nr_regions`` attribute higher than the
+working set size divided by the page size. Because the monitoring target
+regions size is forced to be ``>=page size``, the region split will have no
+effect.
diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst
new file mode 100644
index 0000000000000..5e0a505835005
--- /dev/null
+++ b/Documentation/vm/damon/index.rst
@@ -0,0 +1,35 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+DAMON: Data Access MONitor
+==========================
+
+DAMON is a Linux kernel subsystem that provides a framework for data access
+monitoring and for system operations based on the monitoring results. The core
+monitoring mechanisms of DAMON (refer to :doc:`design` for the details) make it
+
+ - *accurate* (the monitoring output is useful enough for DRAM level memory
+   management; it might not be appropriate for CPU cache levels, though),
+ - *light-weight* (the monitoring overhead is low enough to be applied online),
+   and
+ - *scalable* (the upper-bound of the overhead is in a constant range
+   regardless of the size of target workloads).
+
+Using this framework, therefore, the kernel can operate the system in an
+access-aware fashion. Because the features are also exposed to the user space,
+users who have special information about their workloads can write personalized
+applications for better understanding and optimization of their workloads and
+systems.
+
+For easier development of such systems, DAMON provides a feature called DAMOS
+(DAMon-based Operation Schemes) in addition to the monitoring. Using the
+feature, DAMON users in both kernel and user spaces can do access-aware system
+operations with no code but only simple configurations.
+
+.. toctree::
+   :maxdepth: 2
+
+   faq
+   design
+   api
+   maintainer-profile
diff --git a/Documentation/vm/damon/maintainer-profile.rst b/Documentation/vm/damon/maintainer-profile.rst
new file mode 100644
index 0000000000000..24a202f03de82
--- /dev/null
+++ b/Documentation/vm/damon/maintainer-profile.rst
@@ -0,0 +1,62 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+DAMON Maintainer Entry Profile
+==============================
+
+The DAMON subsystem covers the files that are listed in the 'DATA ACCESS
+MONITOR' section of the 'MAINTAINERS' file.
+
+The mailing lists for the subsystem are damon@lists.linux.dev and
+linux-mm@kvack.org. Patches should be made against the mm-unstable tree [1]_
+whenever possible and posted to the mailing lists.
+
+SCM Trees
+---------
+
+There are multiple Linux trees for DAMON development. Patches under
+development or testing are queued in damon/next [2]_ by the DAMON maintainer.
+Sufficiently reviewed patches will be queued in mm-unstable [1]_ by the memory
+management subsystem maintainer. After sufficient testing, the patches will
+be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the
+memory management subsystem maintainer.
+
+Note again that patches for review should be made against the mm-unstable
+tree [1]_ whenever possible. damon/next is only for a preview of others' work
+in progress.
+
+Submit checklist addendum
+-------------------------
+
+When making DAMON changes, you should do the following.
+
+- Build the changed outputs, including the kernel and documents.
+- Ensure the builds introduce no new errors or warnings.
+- Run the DAMON selftests [4]_ and kunit tests [5]_ and ensure there are no new
+  failures.
+
+Further doing the following and sharing the results will be helpful.
+
+- Run damon-tests/corr [6]_ for normal changes.
+- Run damon-tests/perf [7]_ for performance changes.
+
+Key cycle dates
+---------------
+
+Patches can be sent anytime. Key cycle dates of the mm-unstable [1]_ and
+mm-stable [3]_ trees depend on the memory management subsystem maintainer.
+
+Review cadence
+--------------
+
+The DAMON maintainer does the work during usual work hours (09:00 to 17:00,
+Mon-Fri) in PST. The response to patches will occasionally be slow.
Do not +hesitate to send a ping if you have not heard back within a week of sending a +patch. + + +.. [1] https://git.kernel.org/akpm/mm/h/mm-unstable +.. [2] https://git.kernel.org/sj/h/damon/next +.. [3] https://git.kernel.org/akpm/mm/h/mm-stable +.. [4] https://github.com/awslabs/damon-tests/blob/master/corr/run.sh#L49 +.. [5] https://github.com/awslabs/damon-tests/blob/master/corr/tests/kunit.sh +.. [6] https://github.com/awslabs/damon-tests/tree/master/corr +.. [7] https://github.com/awslabs/damon-tests/tree/master/perf diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index eff5fbd492d08..b51f0d8992f8f 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -32,6 +32,7 @@ descriptions of data structures and algorithms. arch_pgtable_helpers balance cleancache + damon/index free_page_reporting frontswap highmem diff --git a/MAINTAINERS b/MAINTAINERS index cdb5f1f22f4c4..5d3220850dfc6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4872,6 +4872,18 @@ F: net/ax25/ax25_out.c F: net/ax25/ax25_timer.c F: net/ax25/sysctl_net_ax25.c +DATA ACCESS MONITOR +M: SeongJae Park +L: linux-mm@kvack.org +S: Maintained +F: Documentation/ABI/testing/sysfs-kernel-mm-damon +F: Documentation/admin-guide/mm/damon/ +F: Documentation/vm/damon/ +F: include/linux/damon.h +F: include/trace/events/damon.h +F: mm/damon/ +F: tools/testing/selftests/damon/ + DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER L: netdev@vger.kernel.org S: Orphan @@ -13184,6 +13196,13 @@ F: include/net/page_pool.h F: include/trace/events/page_pool.h F: net/core/page_pool.c +PAGE TOUCHING DMA +M: James Gowans +L: ec2-memo@amazon.com +S: Supported +F: include/linux/dma-page-touching.h +F: kernel/dma/page_touching.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Harald Welte L: platform-driver-x86@vger.kernel.org @@ -16935,6 +16954,14 @@ L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/sdhci-pci-dwc-mshc.c +SYSGENID +M: Adrian Catangiu +L: linux-kernel@vger.kernel.org +S: Supported +F: Documentation/misc-devices/sysgenid.rst +F: drivers/misc/sysgenid.c +F: include/uapi/linux/sysgenid.h + SYSTEM CONFIGURATION (SYSCON) M: Lee Jones M: Arnd Bergmann @@ -18707,6 +18734,13 @@ F: drivers/staging/vme/ F: drivers/vme/ F: include/linux/vme* +VMGENID +M: Adrian Catangiu +L: linux-kernel@vger.kernel.org +S: Supported +F: Documentation/virt/vmgenid.rst +F: drivers/virt/vmgenid.c + VMWARE BALLOON DRIVER M: Nadav Amit M: "VMware, Inc." diff --git a/Makefile b/Makefile index bd2f457703634..112c3473c71b3 100644 --- a/Makefile +++ b/Makefile @@ -1001,9 +1001,6 @@ KBUILD_CFLAGS += $(KCFLAGS) KBUILD_LDFLAGS_MODULE += --build-id=sha1 LDFLAGS_vmlinux += --build-id=sha1 -KBUILD_LDFLAGS += -z noexecstack -KBUILD_LDFLAGS += $(call ld-option,--no-warn-rwx-segments) - ifeq ($(CONFIG_STRIP_ASM_SYMS),y) LDFLAGS_vmlinux += $(call ld-option, -X,) endif diff --git a/arch/Kconfig b/arch/Kconfig index 240277d5626c8..e5549cc65fbc0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1065,6 +1065,9 @@ config ARCH_SPLIT_ARG64 If a 32-bit architecture requires 64-bit arguments to be split into pairs of 32-bit arguments, select this option. 
+config ARCH_HAS_PARANOID_L1D_FLUSH + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h index df8524365637a..bd61502b97153 100644 --- a/arch/arm/include/asm/hypervisor.h +++ b/arch/arm/include/asm/hypervisor.h @@ -4,4 +4,7 @@ #include +void kvm_init_hyp_services(void); +bool kvm_arm_hyp_service_available(u32 func_id); + #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 34bd4cba81e66..b627678ee5160 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -159,6 +159,9 @@ config ARM64 select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS \ if $(cc-option,-fpatchable-function-entry=2) + select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ + if DYNAMIC_FTRACE_WITH_REGS + select HAVE_RELIABLE_STACKTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD @@ -168,6 +171,7 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_LIVEPATCH select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS @@ -179,6 +183,7 @@ config ARM64 select MMU_GATHER_RCU_TABLE_FREE select HAVE_RSEQ select HAVE_STACKPROTECTOR + select HAVE_STACK_VALIDATION select HAVE_SYSCALL_TRACEPOINTS select HAVE_KPROBES select HAVE_KRETPROBES @@ -302,6 +307,27 @@ config ARCH_ENABLE_MEMORY_HOTPLUG config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y +config ARCH_MEMORY_PROBE + bool "Enable /sys/devices/system/memory/probe interface" + depends on MEMORY_HOTPLUG + help + This option enables a sysfs /sys/devices/system/memory/probe + interface for testing. See Documentation/memory-hotplug.txt + for more information. If you are unsure how to answer this + question, answer N. + +config ARCH_MEMORY_REMOVE + bool "Enable /sys/devices/system/memory/remove interface" + depends on MEMORY_HOTREMOVE + help + This option enables a sysfs /sys/devices/system/memory/remove + interface for testing. See Documentation/memory-hotplug.txt + for more information. If you are unsure how to answer this + question, answer N. 
+ +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + def_bool y + config SMP def_bool y @@ -1989,3 +2015,5 @@ source "arch/arm64/kvm/Kconfig" if CRYPTO source "arch/arm64/crypto/Kconfig" endif + +source "kernel/livepatch/Kconfig" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 485b7dbd4f9e3..d1c4a4ad15f47 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -131,6 +131,10 @@ ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y) CC_FLAGS_FTRACE := -fpatchable-function-entry=2 endif +ifeq ($(CONFIG_STACK_VALIDATION),y) +KBUILD_CFLAGS += -fno-jump-tables +endif + # Default value head-y := arch/arm64/kernel/head.o diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index 63a52ad9a75c0..d5bca3b7d0ea5 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -367,15 +367,15 @@ .align 6 -M0: .octa 0x0004080c0105090d02060a0e03070b0f +SYM_DATA_LOCAL(M0, .octa 0x0004080c0105090d02060a0e03070b0f) -M0SR: .octa 0x0004080c05090d010a0e02060f03070b -SR: .octa 0x0f0e0d0c0a09080b0504070600030201 -SRM0: .octa 0x01060b0c0207080d0304090e00050a0f +SYM_DATA_LOCAL(M0SR, .octa 0x0004080c05090d010a0e02060f03070b) +SYM_DATA_LOCAL(SR, .octa 0x0f0e0d0c0a09080b0504070600030201) +SYM_DATA_LOCAL(SRM0, .octa 0x01060b0c0207080d0304090e00050a0f) -M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 -ISR: .octa 0x0f0e0d0c080b0a090504070602010003 -ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f +SYM_DATA_LOCAL(M0ISR, .octa 0x0004080c0d0105090a0e0206070b0f03) +SYM_DATA_LOCAL(ISR, .octa 0x0f0e0d0c080b0a090504070602010003) +SYM_DATA_LOCAL(ISRM0, .octa 0x0306090c00070a0d01040b0e0205080f) /* * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index 111d9c9abddd1..ec6f97180c747 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -207,6 +207,7 @@ SYM_FUNC_END(__pmull_p8_core) pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B .endif + ANNOTATE_INTRA_FUNCTION_CALL bl .L__pmull_p8_core\i eor \rq\().16b, \rq\().16b, t4.16b diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped index 7c7ce2e3bad6b..571323f6cca33 100644 --- a/arch/arm64/crypto/sha256-core.S_shipped +++ b/arch/arm64/crypto/sha256-core.S_shipped @@ -59,6 +59,8 @@ // deliver much less improvement, likely *negative* on Cortex-A5x. // Which is why NEON support is limited to SHA256.] 
+#include + #ifndef __KERNEL__ # include "arm_arch.h" #endif @@ -72,11 +74,11 @@ sha256_block_data_order: #ifndef __KERNEL__ # ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P + ldrsw x16,OPENSSL_armcap_P_rel # else - ldr x16,.LOPENSSL_armcap_P + ldr x16,OPENSSL_armcap_P_rel # endif - adr x17,.LOPENSSL_armcap_P + adr x17,OPENSSL_armcap_P_rel add x16,x16,x17 ldr w16,[x16] tst w16,#ARMV8_SHA256 @@ -99,7 +101,7 @@ sha256_block_data_order: ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] - adr x30,.LK256 + adr x30,K256 stp x0,x2,[x29,#96] .Loop: @@ -1047,8 +1049,7 @@ sha256_block_data_order: .size sha256_block_data_order,.-sha256_block_data_order .align 6 -.type .LK256,%object -.LK256: +SYM_DATA_START_LOCAL(K256) .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 @@ -1066,17 +1067,20 @@ sha256_block_data_order: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator -.size .LK256,.-.LK256 +SYM_DATA_END(K256) #ifndef __KERNEL__ .align 3 -.LOPENSSL_armcap_P: +SYM_DATA_START_LOCAL(OPENSSL_armcap_P_rel) # ifdef __ILP32__ .long OPENSSL_armcap_P-. # else .quad OPENSSL_armcap_P-. # endif +SYM_DATA_END(OPENSSL_armcap_P_rel) #endif +SYM_DATA_START_LOCAL(OPENSSL_str) .asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by " +SYM_DATA_END(OPENSSL_str) .align 2 #ifndef __KERNEL__ .type sha256_block_armv8,%function @@ -1087,7 +1091,7 @@ sha256_block_armv8: add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] - adr x3,.LK256 + adr x3,K256 .Loop_hw: ld1 {v4.16b-v7.16b},[x1],#64 @@ -1225,11 +1229,9 @@ sha256_block_armv8: .align 4 sha256_block_neon: .Lneon_entry: - stp x29, x30, [sp, #-16]! - mov x29, sp sub sp,sp,#16*4 - adr x16,.LK256 + adr x16,K256 add x2,x1,x2,lsl#6 // len to point at the end of inp ld1 {v0.16b},[x1], #16 @@ -2060,8 +2062,7 @@ sha256_block_neon: mov x17,sp b.ne .L_00_48 - ldr x29,[x29] - add sp,sp,#16*4+16 + add sp,sp,#16*4 ret .size sha256_block_neon,.-sha256_block_neon #ifndef __KERNEL__ diff --git a/arch/arm64/crypto/sha512-core.S_shipped b/arch/arm64/crypto/sha512-core.S_shipped index e063a61067201..8477c90cf4baa 100644 --- a/arch/arm64/crypto/sha512-core.S_shipped +++ b/arch/arm64/crypto/sha512-core.S_shipped @@ -59,6 +59,8 @@ // deliver much less improvement, likely *negative* on Cortex-A5x. // Which is why NEON support is limited to SHA256.] +#include + #ifndef __KERNEL__ # include "arm_arch.h" #endif @@ -85,7 +87,7 @@ sha512_block_data_order: ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] - adr x30,.LK512 + adr x30,K512 stp x0,x2,[x29,#96] .Loop: @@ -1033,8 +1035,7 @@ sha512_block_data_order: .size sha512_block_data_order,.-sha512_block_data_order .align 6 -.type .LK512,%object -.LK512: +SYM_DATA_START_LOCAL(K512) .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 @@ -1076,17 +1077,21 @@ sha512_block_data_order: .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator -.size .LK512,.-.LK512 +SYM_DATA_END(K512) + #ifndef __KERNEL__ .align 3 -.LOPENSSL_armcap_P: +SYM_DATA_START_LOCAL(OPENSSL_armcap_P_rel) # ifdef __ILP32__ .long OPENSSL_armcap_P-. # else .quad OPENSSL_armcap_P-. 
# endif +SYM_DATA_END(OPENSSL_armcap_P_rel) #endif +SYM_DATA_START_LOCAL(OPENSSL_str) .asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by " +SYM_DATA_END(OPENSSL_str) .align 2 #ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h new file mode 100644 index 0000000000000..c01edf4d988db --- /dev/null +++ b/arch/arm64/include/asm/alternative-macros.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ALTERNATIVE_MACROS_H +#define __ASM_ALTERNATIVE_MACROS_H + +#include + +#define ARM64_CB_PATCH ARM64_NCAPS + +#ifndef __ASSEMBLY__ + +#include + +#define ALTINSTR_ENTRY(feature) \ + " .word 661b - .\n" /* label */ \ + " .word 663f - .\n" /* new instruction */ \ + " .hword " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* source len */ \ + " .byte 664f-663f\n" /* replacement len */ + +#define ALTINSTR_ENTRY_CB(feature, cb) \ + " .word 661b - .\n" /* label */ \ + " .word " __stringify(cb) "- .\n" /* callback */ \ + " .hword " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* source len */ \ + " .byte 664f-663f\n" /* replacement len */ + +/* + * alternative assembly primitive: + * + * If any of these .org directive fail, it means that insn1 and insn2 + * don't have the same length. This used to be written as + * + * .if ((664b-663b) != (662b-661b)) + * .error "Alternatives instruction length mismatch" + * .endif + * + * but most assemblers die if insn1 or insn2 have a .inst. This should + * be fixed in a binutils release posterior to 2.25.51.0.2 (anything + * containing commit 4e4d08cf7399b606 or c1baaddf8861). + * + * Alternatives with callbacks do not generate replacement instructions. + */ +#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ + ".if "__stringify(cfg_enabled)" == 1\n" \ + "661:\n\t" \ + oldinstr "\n" \ + "662:\n" \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature) \ + ".popsection\n" \ + ".subsection 1\n" \ + "663:\n\t" \ + newinstr "\n" \ + "664:\n\t" \ + ".org . - (664b-663b) + (662b-661b)\n\t" \ + ".org . - (662b-661b) + (664b-663b)\n\t" \ + ".previous\n" \ + ".endif\n" + +#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \ + ".if "__stringify(cfg_enabled)" == 1\n" \ + "661:\n\t" \ + oldinstr "\n" \ + "662:\n" \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY_CB(feature, cb) \ + ".popsection\n" \ + "663:\n\t" \ + "664:\n\t" \ + ".endif\n" + +#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ + __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) + +#define ALTERNATIVE_CB(oldinstr, cb) \ + __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb) +#else + +#include + +.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len + .word \orig_offset - . + .word \alt_offset - . + .hword \feature + .byte \orig_len + .byte \alt_len +.endm + +.macro alternative_insn insn1, insn2, cap, enable = 1 + .if \enable +661: \insn1 +662: .pushsection .altinstructions, "a" + altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f + .popsection + .subsection 1 +663: \insn2 +664: .previous + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) + .endif +.endm + +/* + * Alternative sequences + * + * The code for the case where the capability is not present will be + * assembled and linked as normal. There are no restrictions on this + * code. 
+ * + * The code for the case where the capability is present will be + * assembled into a special section to be used for dynamic patching. + * Code for that case must: + * + * 1. Be exactly the same length (in bytes) as the default code + * sequence. + * + * 2. Not contain a branch target that is used outside of the + * alternative sequence it is defined in (branches into an + * alternative sequence are not fixed up). + */ + +/* + * Begin an alternative code sequence. + */ +.macro alternative_if_not cap + .set .Lasm_alt_mode, 0 + .pushsection .altinstructions, "a" + altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f + .popsection +661: +.endm + +.macro alternative_if cap + .set .Lasm_alt_mode, 1 + .pushsection .altinstructions, "a" + altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f + .popsection + .subsection 1 + .align 2 /* So GAS knows label 661 is suitably aligned */ +661: +.endm + +.macro alternative_cb cb + .set .Lasm_alt_mode, 0 + .pushsection .altinstructions, "a" + altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0 + .popsection +661: +.endm + +/* + * Provide the other half of the alternative code sequence. + */ +.macro alternative_else +662: + .if .Lasm_alt_mode==0 + .subsection 1 + .else + .previous + .endif +663: +.endm + +/* + * Complete an alternative code sequence. + */ +.macro alternative_endif +664: + .if .Lasm_alt_mode==0 + .previous + .endif + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) +.endm + +/* + * Callback-based alternative epilogue + */ +.macro alternative_cb_end +662: +.endm + +/* + * Provides a trivial alternative or default sequence consisting solely + * of NOPs. The number of NOPs is chosen automatically to match the + * previous case. + */ +.macro alternative_else_nop_endif +alternative_else + nops (662b-661b) / AARCH64_INSN_SIZE +alternative_endif +.endm + +#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ + alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) + +.macro user_alt, label, oldinstr, newinstr, cond +9999: alternative_insn "\oldinstr", "\newinstr", \cond + _asm_extable 9999b, \label +.endm + +#endif /* __ASSEMBLY__ */ + +/* + * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); + * + * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); + * N.B. If CONFIG_FOO is specified, but not selected, the whole block + * will be omitted, including oldinstr. + */ +#define ALTERNATIVE(oldinstr, newinstr, ...) 
\ + _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) + +#endif /* __ASM_ALTERNATIVE_MACROS_H */ diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 3cb3c4ab3ea56..a38b92e11811e 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -2,17 +2,13 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H -#include -#include - -#define ARM64_CB_PATCH ARM64_NCAPS +#include #ifndef __ASSEMBLY__ #include #include #include -#include struct alt_instr { s32 orig_offset; /* offset to original instruction */ @@ -35,264 +31,5 @@ void apply_alternatives_module(void *start, size_t length); static inline void apply_alternatives_module(void *start, size_t length) { } #endif -#define ALTINSTR_ENTRY(feature) \ - " .word 661b - .\n" /* label */ \ - " .word 663f - .\n" /* new instruction */ \ - " .hword " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* source len */ \ - " .byte 664f-663f\n" /* replacement len */ - -#define ALTINSTR_ENTRY_CB(feature, cb) \ - " .word 661b - .\n" /* label */ \ - " .word " __stringify(cb) "- .\n" /* callback */ \ - " .hword " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* source len */ \ - " .byte 664f-663f\n" /* replacement len */ - -/* - * alternative assembly primitive: - * - * If any of these .org directive fail, it means that insn1 and insn2 - * don't have the same length. This used to be written as - * - * .if ((664b-663b) != (662b-661b)) - * .error "Alternatives instruction length mismatch" - * .endif - * - * but most assemblers die if insn1 or insn2 have a .inst. This should - * be fixed in a binutils release posterior to 2.25.51.0.2 (anything - * containing commit 4e4d08cf7399b606 or c1baaddf8861). - * - * Alternatives with callbacks do not generate replacement instructions. - */ -#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ - ".if "__stringify(cfg_enabled)" == 1\n" \ - "661:\n\t" \ - oldinstr "\n" \ - "662:\n" \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(feature) \ - ".popsection\n" \ - ".subsection 1\n" \ - "663:\n\t" \ - newinstr "\n" \ - "664:\n\t" \ - ".org . - (664b-663b) + (662b-661b)\n\t" \ - ".org . - (662b-661b) + (664b-663b)\n\t" \ - ".previous\n" \ - ".endif\n" - -#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \ - ".if "__stringify(cfg_enabled)" == 1\n" \ - "661:\n\t" \ - oldinstr "\n" \ - "662:\n" \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY_CB(feature, cb) \ - ".popsection\n" \ - "663:\n\t" \ - "664:\n\t" \ - ".endif\n" - -#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ - __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) - -#define ALTERNATIVE_CB(oldinstr, cb) \ - __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb) -#else - -#include - -.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len - .word \orig_offset - . - .word \alt_offset - . - .hword \feature - .byte \orig_len - .byte \alt_len -.endm - -.macro alternative_insn insn1, insn2, cap, enable = 1 - .if \enable -661: \insn1 -662: .pushsection .altinstructions, "a" - altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f - .popsection - .subsection 1 -663: \insn2 -664: .org . - (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) - .previous - .endif -.endm - -/* - * Alternative sequences - * - * The code for the case where the capability is not present will be - * assembled and linked as normal. 
There are no restrictions on this - * code. - * - * The code for the case where the capability is present will be - * assembled into a special section to be used for dynamic patching. - * Code for that case must: - * - * 1. Be exactly the same length (in bytes) as the default code - * sequence. - * - * 2. Not contain a branch target that is used outside of the - * alternative sequence it is defined in (branches into an - * alternative sequence are not fixed up). - */ - -/* - * Begin an alternative code sequence. - */ -.macro alternative_if_not cap - .set .Lasm_alt_mode, 0 - .pushsection .altinstructions, "a" - altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f - .popsection -661: -.endm - -.macro alternative_if cap - .set .Lasm_alt_mode, 1 - .pushsection .altinstructions, "a" - altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f - .popsection - .subsection 1 - .align 2 /* So GAS knows label 661 is suitably aligned */ -661: -.endm - -.macro alternative_cb cb - .set .Lasm_alt_mode, 0 - .pushsection .altinstructions, "a" - altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0 - .popsection -661: -.endm - -/* - * Provide the other half of the alternative code sequence. - */ -.macro alternative_else -662: - .if .Lasm_alt_mode==0 - .subsection 1 - .else - .previous - .endif -663: -.endm - -/* - * Complete an alternative code sequence. - */ -.macro alternative_endif -664: - .org . - (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) - .if .Lasm_alt_mode==0 - .previous - .endif -.endm - -/* - * Callback-based alternative epilogue - */ -.macro alternative_cb_end -662: -.endm - -/* - * Provides a trivial alternative or default sequence consisting solely - * of NOPs. The number of NOPs is chosen automatically to match the - * previous case. - */ -.macro alternative_else_nop_endif -alternative_else - nops (662b-661b) / AARCH64_INSN_SIZE -alternative_endif -.endm - -#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ - alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) - -.macro user_alt, label, oldinstr, newinstr, cond -9999: alternative_insn "\oldinstr", "\newinstr", \cond - _asm_extable 9999b, \label -.endm - -/* - * Generate the assembly for UAO alternatives with exception table entries. - * This is complicated as there is no post-increment or pair versions of the - * unprivileged instructions, and USER() only works for single instructions. 
- */ -#ifdef CONFIG_ARM64_UAO - .macro uao_ldp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: ldp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - ldtr \reg1, [\addr]; - ldtr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - _asm_extable 8889b,\l; - .endm - - .macro uao_stp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: stp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - sttr \reg1, [\addr]; - sttr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - _asm_extable 8889b,\l; - .endm - - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: \inst \reg, [\addr], \post_inc; - nop; - alternative_else - \alt_inst \reg, [\addr]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - .endm -#else - .macro uao_ldp l, reg1, reg2, addr, post_inc - USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_stp l, reg1, reg2, addr, post_inc - USER(\l, stp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - USER(\l, \inst \reg, [\addr], \post_inc) - .endm -#endif - -#endif /* __ASSEMBLY__ */ - -/* - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); - * - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); - * N.B. If CONFIG_FOO is specified, but not selected, the whole block - * will be omitted, including oldinstr. - */ -#define ALTERNATIVE(oldinstr, newinstr, ...) \ - _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) - +#endif /* __ASSEMBLY__ */ #endif /* __ASM_ALTERNATIVE_H */ diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 5ef624fef44a2..54611cebfca77 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -2,7 +2,7 @@ #ifndef __ASM_ASM_UACCESS_H #define __ASM_ASM_UACCESS_H -#include +#include #include #include #include @@ -58,4 +58,63 @@ alternative_else_nop_endif .endm #endif +/* + * Generate the assembly for UAO alternatives with exception table entries. + * This is complicated as there is no post-increment or pair versions of the + * unprivileged instructions, and USER() only works for single instructions. 
+ */ +#ifdef CONFIG_ARM64_UAO + .macro uao_ldp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: ldp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + ldtr \reg1, [\addr]; + ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; + .endm + + .macro uao_stp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: stp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + sttr \reg1, [\addr]; + sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; + .endm + + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: \inst \reg, [\addr], \post_inc; + nop; + alternative_else + \alt_inst \reg, [\addr]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + .endm +#else + .macro uao_ldp l, reg1, reg2, addr, post_inc + USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_stp l, reg1, reg2, addr, post_inc + USER(\l, stp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + USER(\l, \inst \reg, [\addr], \post_inc) + .endm +#endif + #endif diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 011e681a23366..f507feb045e35 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -22,6 +22,7 @@ #include #include #include +#include .macro save_and_disable_daif, flags mrs \flags, daif @@ -150,6 +151,7 @@ lr .req x30 // link register */ .macro ventry label .align 7 + UNWIND_HINT_EMPTY b \label .endm diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 28be048db3f63..9917429971d48 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -19,7 +19,11 @@ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(flags) \ +do { \ + __BUG_FLAGS(BUGFLAG_WARNING|(flags)); \ + annotate_reachable(); \ +} while (0) #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index f9cc1d0217915..0ae427f352c8c 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -4,4 +4,7 @@ #include +void kvm_init_hyp_services(void); +bool kvm_arm_hyp_service_available(u32 func_id); + #endif diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index d45b42295254d..c9e95848042b9 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -11,7 +11,7 @@ #include /* A64 instructions are always 32 bits. 
*/ -#define AARCH64_INSN_SIZE 4 +#define AARCH64_INSN_SIZE 4 #ifndef __ASSEMBLY__ /* @@ -31,6 +31,7 @@ */ enum aarch64_insn_encoding_class { AARCH64_INSN_CLS_UNKNOWN, /* UNALLOCATED */ + AARCH64_INSN_CLS_SVE, /* SVE instructions */ AARCH64_INSN_CLS_DP_IMM, /* Data processing - immediate */ AARCH64_INSN_CLS_DP_REG, /* Data processing - register */ AARCH64_INSN_CLS_DP_FPSIMD, /* Data processing - SIMD and FP */ @@ -296,6 +297,12 @@ __AARCH64_INSN_FUNCS(adr, 0x9F000000, 0x10000000) __AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000) __AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) __AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) +__AARCH64_INSN_FUNCS(store_imm, 0x3FC00000, 0x39000000) +__AARCH64_INSN_FUNCS(load_imm, 0x3FC00000, 0x39400000) +__AARCH64_INSN_FUNCS(store_pre, 0x3FE00C00, 0x38000C00) +__AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) +__AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) +__AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) __AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) @@ -304,6 +311,8 @@ __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) __AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) __AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) +__AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) __AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) __AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000) __AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000) @@ -336,6 +345,7 @@ __AARCH64_INSN_FUNCS(rev64, 0x7FFFFC00, 0x5AC00C00) __AARCH64_INSN_FUNCS(and, 0x7F200000, 0x0A000000) __AARCH64_INSN_FUNCS(bic, 0x7F200000, 0x0A200000) __AARCH64_INSN_FUNCS(orr, 0x7F200000, 0x2A000000) +__AARCH64_INSN_FUNCS(mov_reg, 0x7FE0FFE0, 0x2A0003E0) __AARCH64_INSN_FUNCS(orn, 0x7F200000, 0x2A200000) __AARCH64_INSN_FUNCS(eor, 0x7F200000, 0x4A000000) __AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000) @@ -370,6 +380,14 @@ __AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF) __AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000) __AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F) __AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) +__AARCH64_INSN_FUNCS(dmb, 0xFFFFF0FF, 0xD50330BF) +__AARCH64_INSN_FUNCS(dsb_base, 0xFFFFF0FF, 0xD503309F) +__AARCH64_INSN_FUNCS(dsb_nxs, 0xFFFFF3FF, 0xD503323F) +__AARCH64_INSN_FUNCS(isb, 0xFFFFF0FF, 0xD50330DF) +__AARCH64_INSN_FUNCS(sb, 0xFFFFFFFF, 0xD50330FF) +__AARCH64_INSN_FUNCS(clrex, 0xFFFFF0FF, 0xD503305F) +__AARCH64_INSN_FUNCS(ssbb, 0xFFFFFFFF, 0xD503309F) +__AARCH64_INSN_FUNCS(pssbb, 0xFFFFFFFF, 0xD503349F) #undef __AARCH64_INSN_FUNCS @@ -381,8 +399,48 @@ static inline bool aarch64_insn_is_adr_adrp(u32 insn) return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); } -int aarch64_insn_read(void *addr, u32 *insnp); -int aarch64_insn_write(void *addr, u32 insn); +static inline bool aarch64_insn_is_dsb(u32 insn) +{ + return (aarch64_insn_is_dsb_base(insn) && (insn & 0xb00)) || + aarch64_insn_is_dsb_nxs(insn); +} + +static inline bool aarch64_insn_is_barrier(u32 insn) +{ + return aarch64_insn_is_dmb(insn) || aarch64_insn_is_dsb(insn) || + aarch64_insn_is_isb(insn) || aarch64_insn_is_sb(insn) || + aarch64_insn_is_clrex(insn) || aarch64_insn_is_ssbb(insn) || + aarch64_insn_is_pssbb(insn); +} + +static inline bool aarch64_insn_is_store_single(u32 insn) +{ + return aarch64_insn_is_store_imm(insn) || + 
aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_store_post(insn); +} + +static inline bool aarch64_insn_is_store_pair(u32 insn) +{ + return aarch64_insn_is_stp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_stp_post(insn); +} + +static inline bool aarch64_insn_is_load_single(u32 insn) +{ + return aarch64_insn_is_load_imm(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_load_post(insn); +} + +static inline bool aarch64_insn_is_load_pair(u32 insn) +{ + return aarch64_insn_is_ldp(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_ldp_post(insn); +} + enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); bool aarch64_insn_uses_literal(u32 insn); bool aarch64_insn_is_branch(u32 insn); @@ -489,9 +547,6 @@ u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, s32 aarch64_get_branch_offset(u32 insn); u32 aarch64_set_branch_offset(u32 insn, s32 offset); -int aarch64_insn_patch_text_nosync(void *addr, u32 insn); -int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); - s32 aarch64_insn_adrp_get_offset(u32 insn); u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset); diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h index 21fc85e9d2bed..a8cb91d8d59b3 100644 --- a/arch/arm64/include/asm/kgdb.h +++ b/arch/arm64/include/asm/kgdb.h @@ -19,6 +19,7 @@ static inline void arch_kgdb_breakpoint(void) { asm ("brk %0" : : "I" (KGDB_COMPILED_DBG_BRK_IMM)); + annotate_reachable(); } extern void kgdb_handle_bus_error(void); diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index ba89a9af820ab..3b5f1fd332b01 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -60,4 +60,16 @@ SYM_FUNC_END(x); \ SYM_FUNC_END_ALIAS(__pi_##x) +/* + * Record the address range of each SYM_CODE function in a struct code_range + * in a special section. + */ +#define SYM_CODE_END(name) \ + SYM_END(name, SYM_T_NONE) ;\ + 99: ;\ + .pushsection "sym_code_functions", "aw" ;\ + .quad name ;\ + .quad 99b ;\ + .popsection + #endif diff --git a/arch/arm64/include/asm/livepatch.h b/arch/arm64/include/asm/livepatch.h new file mode 100644 index 0000000000000..9bbd18774680b --- /dev/null +++ b/arch/arm64/include/asm/livepatch.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * livepatch.h - arm64-specific Kernel Live Patching Core + */ +#ifndef _ASM_ARM64_LIVEPATCH_H +#define _ASM_ARM64_LIVEPATCH_H + +#include + +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->pc = ip; +} + +/* + * klp_get_ftrace_location is expected to return the address of the BL to the + * relevant ftrace handler in the callsite. The location of this can vary based + * on several compilation options. + * CONFIG_DYNAMIC_FTRACE_WITH_REGS + * - Inserts 2 nops on function entry the second of which is the BL + * referenced above. (See ftrace_init_nop() for the callsite sequence) + * (this is required by livepatch and must be selected) + * CONFIG_ARM64_BTI_KERNEL: + * - Inserts a hint(BTI C) on function entry if the function is called + * indirectly (to satisfy BTI requirements), which is inserted before + * the two nops from above. 
+ */ +#define klp_get_ftrace_location klp_get_ftrace_location +static inline unsigned long klp_get_ftrace_location(unsigned long faddr) +{ + unsigned long addr = faddr + AARCH64_INSN_SIZE; + +#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) + addr = ftrace_location_range(addr, addr + AARCH64_INSN_SIZE); +#endif + + return addr; +} + +#endif /* _ASM_ARM64_LIVEPATCH_H */ diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h new file mode 100644 index 0000000000000..6bf5adc562950 --- /dev/null +++ b/arch/arm64/include/asm/patching.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_PATCHING_H +#define __ASM_PATCHING_H + +#include + +int aarch64_insn_read(void *addr, u32 *insnp); +int aarch64_insn_write(void *addr, u32 insn); + +int aarch64_insn_patch_text_nosync(void *addr, u32 insn); +int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); + +#endif /* __ASM_PATCHING_H */ diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h new file mode 100644 index 0000000000000..1bce62fa908a3 --- /dev/null +++ b/arch/arm64/include/asm/rwonce.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + */ +#ifndef __ASM_RWONCE_H +#define __ASM_RWONCE_H + +#ifdef CONFIG_LTO + +#include +#include + +#ifndef BUILD_VDSO + +#ifdef CONFIG_AS_HAS_LDAPR +#define __LOAD_RCPC(sfx, regs...) \ + ALTERNATIVE( \ + "ldar" #sfx "\t" #regs, \ + ".arch_extension rcpc\n" \ + "ldapr" #sfx "\t" #regs, \ + ARM64_HAS_LDAPR) +#else +#define __LOAD_RCPC(sfx, regs...) "ldar" #sfx "\t" #regs +#endif /* CONFIG_AS_HAS_LDAPR */ + +/* + * When building with LTO, there is an increased risk of the compiler + * converting an address dependency headed by a READ_ONCE() invocation + * into a control dependency and consequently allowing for harmful + * reordering by the CPU. + * + * Ensure that such transformations are harmless by overriding the generic + * READ_ONCE() definition with one that provides RCpc acquire semantics + * when building with LTO. + */ +#define __READ_ONCE(x) \ +({ \ + typeof(&(x)) __x = &(x); \ + int atomic = 1; \ + union { __unqual_scalar_typeof(*__x) __val; char __c[1]; } __u; \ + switch (sizeof(x)) { \ + case 1: \ + asm volatile(__LOAD_RCPC(b, %w0, %1) \ + : "=r" (*(__u8 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 2: \ + asm volatile(__LOAD_RCPC(h, %w0, %1) \ + : "=r" (*(__u16 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 4: \ + asm volatile(__LOAD_RCPC(, %w0, %1) \ + : "=r" (*(__u32 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 8: \ + asm volatile(__LOAD_RCPC(, %0, %1) \ + : "=r" (*(__u64 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + default: \ + atomic = 0; \ + } \ + atomic ? 
(typeof(*__x))__u.__val : (*(volatile typeof(__x))__x);\ +}) + +#endif /* !BUILD_VDSO */ +#endif /* CONFIG_LTO */ + +#include + +#endif /* __ASM_RWONCE_H */ diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 6a45c26da46e3..f17dbece80bb2 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -18,6 +18,7 @@ extern char __exittext_begin[], __exittext_end[]; extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; +extern char __sym_code_functions_start[], __sym_code_functions_end[]; static inline size_t entry_tramp_text_size(void) { diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index eb29b1fe8255e..4c822ef7f5885 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -49,6 +49,13 @@ struct stack_info { * * @graph: When FUNCTION_GRAPH_TRACER is selected, holds the index of a * replacement lr value in the ftrace graph stack. + * + * @reliable: Is this stack frame reliable? There are several checks that + * need to be performed in unwind_frame() before a stack frame + * is truly reliable. Until all the checks are present, this flag + * is just a place holder. Once all the checks are implemented, + * this comment will be updated and the flag can be used by the + * caller of unwind_frame(). */ struct stackframe { unsigned long fp; @@ -59,6 +66,7 @@ struct stackframe { #ifdef CONFIG_FUNCTION_GRAPH_TRACER int graph; #endif + bool reliable; }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); @@ -169,6 +177,7 @@ static inline void start_backtrace(struct stackframe *frame, bitmap_zero(frame->stacks_done, __NR_STACK_TYPES); frame->prev_fp = 0; frame->prev_type = STACK_TYPE_UNKNOWN; + frame->reliable = true; } #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index cdcf307764aad..fda0458a98681 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -74,6 +74,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ #define TIF_SECCOMP 11 /* syscall secure computing */ #define TIF_SYSCALL_EMU 12 /* syscall emulation active */ +#define TIF_PATCH_PENDING 13 /* pending live patching update */ #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FREEZE 19 #define TIF_RESTORE_SIGMASK 20 @@ -100,11 +101,12 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT | \ - _TIF_NOTIFY_SIGNAL) + _TIF_NOTIFY_SIGNAL | _TIF_PATCH_PENDING) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/unwind_hints.h b/arch/arm64/include/asm/unwind_hints.h new file mode 100644 index 0000000000000..60f866e4e12c8 --- /dev/null +++ b/arch/arm64/include/asm/unwind_hints.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_UNWIND_HINTS_H +#define __ASM_UNWIND_HINTS_H 
+ +#include + +#define UNWIND_HINT_REG_UNDEFINED 0xff +#define UNWIND_HINT_REG_SP 31 + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=UNWIND_HINT_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_FUNC sp_offset=0 + UNWIND_HINT sp_reg=UNWIND_HINT_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_REGS base=UNWIND_HINT_REG_SP offset=0 + UNWIND_HINT sp_reg=\base sp_offset=\offset type=UNWIND_HINT_TYPE_REGS +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_UNWIND_HINTS_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index bbaf0bc4ad609..4a39d9525d788 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -13,11 +13,11 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ entry-common.o entry-fpsimd.o process.o ptrace.o \ setup.o signal.o sys.o stacktrace.o time.o traps.o \ - io.o vdso.o hyp-stub.o psci.o cpu_ops.o insn.o \ + io.o vdso.o hyp-stub.o psci.o cpu_ops.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o + syscall.o proton-pack.o patching.o targets += efi-entry.o @@ -28,7 +28,13 @@ $(obj)/%.stub.o: $(obj)/%.o FORCE obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o obj-$(CONFIG_COMPAT) += sigreturn32.o +ifeq ($(CONFIG_COMPAT), y) +OBJECT_FILES_NON_STANDARD_sigreturn32.o := y +endif obj-$(CONFIG_KUSER_HELPERS) += kuser32.o + +OBJECT_FILES_NON_STANDARD_kuser32.o := y + obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index cada0b816c8a3..765070aff31d2 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -117,6 +117,7 @@ bool acpi_psci_use_hvc(void) { return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC; } +EXPORT_SYMBOL_GPL(acpi_psci_use_hvc); /* * acpi_fadt_sanity_check() - Check FADT presence and carry out sanity diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7d32fc959b1a0..c10c4d8313aa7 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -73,7 +73,7 @@ int main(void) DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); - DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); + DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_COMPAT DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 37721eb6f9a14..fcbf7bde6b2c9 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -10,6 +10,7 @@ #include #include #include +#include #include .text @@ -30,6 +31,7 @@ * flat identity mapping. */ SYM_CODE_START(__cpu_soft_restart) + UNWIND_HINT_EMPTY /* Clear sctlr_el1 flags. 
*/ mrs x12, sctlr_el1 mov_q x13, SCTLR_ELx_FLAGS diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 0073b24b5d25e..e8f930a43820f 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -9,10 +9,12 @@ #include #include +#include __INIT SYM_CODE_START(efi_enter_kernel) + UNWIND_HINT_EMPTY /* * efi_pe_entry() will have copied the kernel image if necessary and we * end up here with device tree address in x1 and the kernel entry diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index a71844fb923ee..5fb744d8acf24 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -7,30 +7,49 @@ #include #include + .macro efi_signature_nop +#ifdef CONFIG_EFI +.L_head: + /* + * This ccmp instruction has no meaningful effect except that + * its opcode forms the magic "MZ" signature required by UEFI. + */ + ccmp x18, #0, #0xd, pl +#else + /* + * Bootloaders may inspect the opcode at the start of the kernel + * image to decide if the kernel is capable of booting via UEFI. + * So put an ordinary NOP here, not the "MZ.." pseudo-nop above. + */ + nop +#endif + .endm + .macro __EFI_PE_HEADER +#ifdef CONFIG_EFI + .set .Lpe_header_offset, . - .L_head +SYM_DATA_START_LOCAL(arm64_efi_header) .long PE_MAGIC -coff_header: .short IMAGE_FILE_MACHINE_ARM64 // Machine - .short section_count // NumberOfSections + .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp .long 0 // PointerToSymbolTable .long 0 // NumberOfSymbols - .short section_table - optional_header // SizeOfOptionalHeader + .short .Lsection_table - .Loptional_header // SizeOfOptionalHeader .short IMAGE_FILE_DEBUG_STRIPPED | \ IMAGE_FILE_EXECUTABLE_IMAGE | \ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics -optional_header: +.Loptional_header: .short PE_OPT_MAGIC_PE32PLUS // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long __initdata_begin - efi_header_end // SizeOfCode + .long __initdata_begin - .Lefi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_efi_pe_entry - _head // AddressOfEntryPoint - .long efi_header_end - _head // BaseOfCode + .long __efistub_efi_pe_entry - .L_head // AddressOfEntryPoint + .long .Lefi_header_end - .L_head // BaseOfCode -extra_header_fields: .quad 0 // ImageBase .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment @@ -42,10 +61,10 @@ extra_header_fields: .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - _head // SizeOfImage + .long _end - .L_head // SizeOfImage // Everything before the kernel image is considered part of the header - .long efi_header_end - _head // SizeOfHeaders + .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem .short 0 // DllCharacteristics @@ -54,7 +73,7 @@ extra_header_fields: .quad 0 // SizeOfHeapReserve .quad 0 // SizeOfHeapCommit .long 0 // LoaderFlags - .long (section_table - .) / 8 // NumberOfRvaAndSizes + .long (.Lsection_table - .) 
/ 8 // NumberOfRvaAndSizes .quad 0 // ExportTable .quad 0 // ImportTable @@ -64,17 +83,17 @@ extra_header_fields: .quad 0 // BaseRelocationTable #ifdef CONFIG_DEBUG_EFI - .long efi_debug_table - _head // DebugTable - .long efi_debug_table_size + .long .Lefi_debug_table - .L_head // DebugTable + .long .Lefi_debug_table_size #endif // Section table -section_table: +.Lsection_table: .ascii ".text\0\0\0" - .long __initdata_begin - efi_header_end // VirtualSize - .long efi_header_end - _head // VirtualAddress - .long __initdata_begin - efi_header_end // SizeOfRawData - .long efi_header_end - _head // PointerToRawData + .long __initdata_begin - .Lefi_header_end // VirtualSize + .long .Lefi_header_end - .L_head // VirtualAddress + .long __initdata_begin - .Lefi_header_end // SizeOfRawData + .long .Lefi_header_end - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -86,9 +105,9 @@ section_table: .ascii ".data\0\0\0" .long __pecoff_data_size // VirtualSize - .long __initdata_begin - _head // VirtualAddress + .long __initdata_begin - .L_head // VirtualAddress .long __pecoff_data_rawsize // SizeOfRawData - .long __initdata_begin - _head // PointerToRawData + .long __initdata_begin - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -98,7 +117,7 @@ section_table: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_WRITE // Characteristics - .set section_count, (. - section_table) / 40 + .set .Lsection_count, (. - .Lsection_table) / 40 #ifdef CONFIG_DEBUG_EFI /* @@ -114,21 +133,21 @@ section_table: __INITRODATA .align 2 -efi_debug_table: +.Lefi_debug_table: // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY .long 0 // Characteristics .long 0 // TimeDateStamp .short 0 // MajorVersion .short 0 // MinorVersion .long IMAGE_DEBUG_TYPE_CODEVIEW // Type - .long efi_debug_entry_size // SizeOfData + .long .Lefi_debug_entry_size // SizeOfData .long 0 // RVA - .long efi_debug_entry - _head // FileOffset + .long .Lefi_debug_entry - .L_head // FileOffset - .set efi_debug_table_size, . - efi_debug_table + .set .Lefi_debug_table_size, . - .Lefi_debug_table .previous -efi_debug_entry: +.Lefi_debug_entry: // EFI_IMAGE_DEBUG_CODEVIEW_NB10_ENTRY .ascii "NB10" // Signature .long 0 // Unknown @@ -137,7 +156,7 @@ efi_debug_entry: .asciz VMLINUX_PATH - .set efi_debug_entry_size, . - efi_debug_entry + .set .Lefi_debug_entry_size, . - .Lefi_debug_entry #endif /* @@ -148,5 +167,9 @@ efi_debug_entry: * placed at a 4k boundary in the Image to begin with. 
*/ .balign SEGMENT_ALIGN -efi_header_end: +.Lefi_header_end: +SYM_DATA_END_LABEL(arm64_efi_header, SYM_L_LOCAL, efi_header_end) +#else + .set .Lpe_header_offset, 0x0 +#endif .endm diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 67f68c9ef94c4..8cf970d219f5d 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -35,7 +35,7 @@ */ .macro ftrace_regs_entry, allregs=0 /* Make room for pt_regs, plus a callee frame */ - sub sp, sp, #(S_FRAME_SIZE + 16) + sub sp, sp, #(PT_REGS_SIZE + 16) /* Save function arguments (and x9 for simplicity) */ stp x0, x1, [sp, #S_X0] @@ -61,15 +61,15 @@ .endif /* Save the callsite's SP and LR */ - add x10, sp, #(S_FRAME_SIZE + 16) + add x10, sp, #(PT_REGS_SIZE + 16) stp x9, x10, [sp, #S_LR] /* Save the PC after the ftrace callsite */ str x30, [sp, #S_PC] /* Create a frame record for the callsite above pt_regs */ - stp x29, x9, [sp, #S_FRAME_SIZE] - add x29, sp, #S_FRAME_SIZE + stp x29, x9, [sp, #PT_REGS_SIZE] + add x29, sp, #PT_REGS_SIZE /* Create our frame record within pt_regs. */ stp x29, x30, [sp, #S_STACKFRAME] @@ -126,7 +126,7 @@ ftrace_common_return: ldr x9, [sp, #S_PC] /* Restore the callsite's SP */ - add sp, sp, #S_FRAME_SIZE + 16 + add sp, sp, #PT_REGS_SIZE + 16 ret x9 SYM_CODE_END(ftrace_common) @@ -136,7 +136,7 @@ SYM_CODE_START(ftrace_graph_caller) ldr x0, [sp, #S_PC] sub x0, x0, #AARCH64_INSN_SIZE // ip (callsite's BL insn) add x1, sp, #S_LR // parent_ip (callsite's LR) - ldr x2, [sp, #S_FRAME_SIZE] // parent fp (callsite's FP) + ldr x2, [sp, #PT_REGS_SIZE] // parent fp (callsite's FP) bl prepare_ftrace_return b ftrace_common_return SYM_CODE_END(ftrace_graph_caller) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 55e477f73158d..9b6a50ff7337f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -28,6 +28,7 @@ #include #include #include +#include /* * Context tracking and irqflag tracing need to instrument transitions between @@ -62,6 +63,7 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 + UNWIND_HINT_EMPTY .Lventry_start\@: .if \el == 0 /* @@ -78,7 +80,7 @@ .Lskip_tramp_vectors_cleanup\@: .endif - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE #ifdef CONFIG_VMAP_STACK /* * Test whether the SP has overflowed, without corrupting a GPR. @@ -90,6 +92,7 @@ tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + UNWIND_HINT_FUNC sp_offset=PT_REGS_SIZE b el\()\el\()_\label 0: @@ -99,7 +102,7 @@ * userspace, and can clobber EL0 registers to free up GPRs. */ - /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + /* Stash the original SP (minus PT_REGS_SIZE) in tpidr_el0. */ msr tpidr_el0, x0 /* Recover the original x0 value and stash it in tpidrro_el0 */ @@ -122,6 +125,7 @@ sub sp, sp, x0 mrs x0, tpidrro_el0 #endif + UNWIND_HINT_FUNC sp_offset=PT_REGS_SIZE b el\()\el\()_\label .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? 
.endm @@ -223,7 +227,7 @@ alternative_else_nop_endif scs_load_current .else - add x21, sp, #S_FRAME_SIZE + add x21, sp, #PT_REGS_SIZE get_current_task tsk /* Save the task's original addr_limit and set USER_DS */ ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] @@ -237,9 +241,9 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * In order to be able to dump the contents of struct pt_regs at the - * time the exception was taken (in case we attempt to walk the call - * stack later), chain it together with the stack frames. + * For exceptions from EL0, create a final frame record. + * For exceptions from EL1, create a synthetic frame record so the + * interrupted code shows up in the backtrace. */ .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] @@ -362,7 +366,7 @@ alternative_else_nop_endif .if \el == 0 alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp + add sp, sp, #PT_REGS_SIZE // restore sp eret alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 @@ -376,7 +380,7 @@ alternative_else_nop_endif #endif .else ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp + add sp, sp, #PT_REGS_SIZE // restore sp /* Ensure any device/NC reads complete */ alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 @@ -456,6 +460,13 @@ SYM_CODE_END(__swpan_exit_el0) #endif 9998: + /* + * The irq stack might either have no content or already contain a + * pt_regs frame. Objtool currently does not support instructions that + * can have different stack states, so lets pretend we always have + * a clean stack. + */ + UNWIND_HINT_FUNC .endm /* @@ -466,6 +477,8 @@ SYM_CODE_END(__swpan_exit_el0) .macro irq_stack_exit mov sp, x19 scs_load_current + /* Switch back to the stack that had the PT regs */ + UNWIND_HINT_REGS .endm /* GPRs used by entry code */ @@ -591,12 +604,12 @@ __bad_stack: /* * Store the original GPRs to the new stack. The orginal SP (minus - * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. + * PT_REGS_SIZE) was stashed in tpidr_el0 by kernel_ventry. */ - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE kernel_entry 1 mrs x0, tpidr_el0 - add x0, x0, #S_FRAME_SIZE + add x0, x0, #PT_REGS_SIZE str x0, [sp, #S_SP] /* Stash the regs for handle_bad_stack */ @@ -619,6 +632,11 @@ __bad_stack: ASM_BUG() .endm +SYM_CODE_START_LOCAL(el0_fiq_invalid) + inv_entry 0, BAD_FIQ +SYM_CODE_END(el0_fiq_invalid) + +#ifndef CONFIG_COMPAT SYM_CODE_START_LOCAL(el0_sync_invalid) inv_entry 0, BAD_SYNC SYM_CODE_END(el0_sync_invalid) @@ -627,19 +645,16 @@ SYM_CODE_START_LOCAL(el0_irq_invalid) inv_entry 0, BAD_IRQ SYM_CODE_END(el0_irq_invalid) -SYM_CODE_START_LOCAL(el0_fiq_invalid) - inv_entry 0, BAD_FIQ -SYM_CODE_END(el0_fiq_invalid) - SYM_CODE_START_LOCAL(el0_error_invalid) inv_entry 0, BAD_ERROR SYM_CODE_END(el0_error_invalid) -#ifdef CONFIG_COMPAT +#else + SYM_CODE_START_LOCAL(el0_fiq_invalid_compat) inv_entry 0, BAD_FIQ, 32 SYM_CODE_END(el0_fiq_invalid_compat) -#endif +#endif /* CONFIG_COMPAT */ SYM_CODE_START_LOCAL(el1_sync_invalid) inv_entry 1, BAD_SYNC @@ -741,13 +756,15 @@ SYM_CODE_END(el0_error) * "slow" syscall return path. 
*/ SYM_CODE_START_LOCAL(ret_to_user) + UNWIND_HINT_REGS disable_daif gic_prio_kentry_setup tmp=x3 #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off #endif ldr x19, [tsk, #TSK_TI_FLAGS] - and x2, x19, #_TIF_WORK_MASK + movz x2, #_TIF_WORK_MASK + and x2, x19, x2 cbnz x2, work_pending finish_ret_to_user: user_enter_irqoff @@ -827,6 +844,7 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 + UNWIND_HINT_EMPTY 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry @@ -851,6 +869,7 @@ alternative_else_nop_endif * entry onto the return stack and using a RET instruction to * enter the full-fat kernel vectors. */ + ANNOTATE_INTRA_FUNCTION_CALL bl 2f b . 2: @@ -892,7 +911,7 @@ alternative_else_nop_endif .if \regsize == 64 mrs x29, far_el1 .endif - add sp, sp, #S_FRAME_SIZE // restore sp + add sp, sp, #PT_REGS_SIZE // restore sp eret sb .endm @@ -926,11 +945,13 @@ SYM_CODE_START_NOALIGN(tramp_vectors) generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) -SYM_CODE_START(tramp_exit_native) +SYM_CODE_START_LOCAL(tramp_exit_native) + UNWIND_HINT_EMPTY tramp_exit SYM_CODE_END(tramp_exit_native) -SYM_CODE_START(tramp_exit_compat) +SYM_CODE_START_LOCAL(tramp_exit_compat) + UNWIND_HINT_EMPTY tramp_exit 32 SYM_CODE_END(tramp_exit_compat) @@ -1030,6 +1051,7 @@ NOKPROBE(cpu_switch_to) * This is how we return from a fork. */ SYM_CODE_START(ret_from_fork) + UNWIND_HINT_REGS bl schedule_tail cbz x19, 1f // not a kernel thread mov x0, x20 diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 402a24f845b9e..cf49256e007af 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_DYNAMIC_FTRACE /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 351ee64c7deb4..e5c82863bee4a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include "efi-header.S" @@ -58,36 +60,24 @@ * in the entry routines. */ __HEAD -_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ -#ifdef CONFIG_EFI - /* - * This add instruction has no meaningful effect except that - * its opcode forms the magic "MZ" signature required by UEFI. - */ - add x13, x18, #0x16 - b primary_entry -#else +SYM_DATA_LOCAL(efi_nop, efi_signature_nop) // special NOP to identity as PE/COFF executable + UNWIND_HINT_EMPTY b primary_entry // branch to kernel start, magic - .long 0 // reserved -#endif - .quad 0 // Image load offset from start of RAM, little-endian +SYM_DATA_LOCAL(_zero_reserved, .quad 0) // Image load offset from start of RAM, little-endian +SYM_DATA_START_LOCAL(_arm64_common_header) le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved .quad 0 // reserved .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number -#ifdef CONFIG_EFI - .long pe_header - _head // Offset to the PE header. + .long .Lpe_header_offset // Offset to the PE header. 
+SYM_DATA_END(_arm64_common_header) -pe_header: __EFI_PE_HEADER -#else - .long 0 // reserved -#endif __INIT @@ -104,7 +94,7 @@ pe_header: */ SYM_CODE_START(primary_entry) bl preserve_boot_args - bl el2_setup // Drop to EL1, w0=cpu_boot_mode + bl init_kernel_el // w0=cpu_boot_mode adrp x23, __PHYS_OFFSET and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag @@ -123,6 +113,7 @@ SYM_CODE_END(primary_entry) * Preserve the arguments passed by the bootloader in x0 .. x3 */ SYM_CODE_START_LOCAL(preserve_boot_args) + UNWIND_HINT_EMPTY mov x21, x0 // x21=FDT adr_l x0, boot_args // record the contents of @@ -273,7 +264,8 @@ SYM_CODE_END(preserve_boot_args) * - first few MB of the kernel linear mapping to jump to once the MMU has * been enabled */ -SYM_FUNC_START_LOCAL(__create_page_tables) +SYM_CODE_START_LOCAL(__create_page_tables) + UNWIND_HINT_EMPTY mov x28, lr /* @@ -407,14 +399,26 @@ SYM_FUNC_START_LOCAL(__create_page_tables) bl __inval_dcache_area ret x28 -SYM_FUNC_END(__create_page_tables) +SYM_CODE_END(__create_page_tables) + + /* + * Create a final frame record at task_pt_regs(current)->stackframe, so + * that the unwinder can identify the final frame record of any task by + * its location in the task stack. We reserve the entire pt_regs space + * for consistency with user tasks and kthreads. + */ + .macro setup_final_frame + sub sp, sp, #PT_REGS_SIZE + stp xzr, xzr, [sp, #S_STACKFRAME] + add x29, sp, #S_STACKFRAME + .endm /* * The following fragment of code is executed with the MMU enabled. * * x0 = __PHYS_OFFSET */ -SYM_FUNC_START_LOCAL(__primary_switched) +SYM_CODE_START_LOCAL(__primary_switched) adrp x4, init_thread_union add sp, x4, #THREAD_SIZE adr_l x5, init_task @@ -464,10 +468,10 @@ SYM_FUNC_START_LOCAL(__primary_switched) 0: #endif add sp, sp, #16 - mov x29, #0 - mov x30, #0 - b start_kernel -SYM_FUNC_END(__primary_switched) + setup_final_frame + bl start_kernel + ASM_BUG() +SYM_CODE_END(__primary_switched) .pushsection ".rodata", "a" SYM_DATA_START(kimage_vaddr) @@ -483,13 +487,15 @@ EXPORT_SYMBOL(kimage_vaddr) .section ".idmap.text","awx" /* - * If we're fortunate enough to boot at EL2, ensure that the world is - * sane before dropping to EL1. + * Starting from EL2 or EL1, configure the CPU to execute at the highest + * reachable EL supported by the kernel in a chosen default state. If dropping + * from EL2 to EL1, configure EL2 before configuring EL1. * * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if * booted in EL1 or EL2 respectively. */ -SYM_FUNC_START(el2_setup) +SYM_CODE_START(init_kernel_el) + UNWIND_HINT_EMPTY msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 @@ -650,13 +656,14 @@ SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) msr elr_el2, lr mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 eret -SYM_FUNC_END(el2_setup) +SYM_CODE_END(init_kernel_el) /* * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed * in w0. See arch/arm64/include/asm/virt.h for more info. */ -SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) +SYM_CODE_START_LOCAL(set_cpu_boot_mode_flag) + UNWIND_HINT_EMPTY adr_l x1, __boot_cpu_mode cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f @@ -665,7 +672,7 @@ SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) dmb sy dc ivac, x1 // Invalidate potentially stale cache line ret -SYM_FUNC_END(set_cpu_boot_mode_flag) +SYM_CODE_END(set_cpu_boot_mode_flag) /* * These values are written with the MMU off, but read with the MMU on. 
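The setup_final_frame helper introduced above reserves a full pt_regs frame and stores an all-zero frame record at task_pt_regs(current)->stackframe, so every task's stack ends in a record at a known location. A minimal C sketch of the termination test this enables is shown here; the stacktrace.c hunk later in this series performs essentially the same comparison, and the helper name unwind_is_final_frame is purely illustrative, not an existing kernel symbol.

/*
 * Sketch only: stop unwinding once the frame pointer reaches the final
 * frame record that setup_final_frame planted in the task's pt_regs.
 * unwind_is_final_frame is an illustrative name, not an existing symbol.
 */
static bool unwind_is_final_frame(struct task_struct *tsk, unsigned long fp)
{
	return fp == (unsigned long)task_pt_regs(tsk)->stackframe;
}

The later unwind_frame() change returns -ENOENT on exactly this condition, which is how arch_stack_walk_reliable() distinguishes a cleanly terminated walk from an unreliable one.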
@@ -699,8 +706,9 @@ SYM_DATA_END(__early_cpu_boot_status) * This provides a "holding pen" for platforms to hold all secondary * cores are held until we're ready for them to initialise. */ -SYM_FUNC_START(secondary_holding_pen) - bl el2_setup // Drop to EL1, w0=cpu_boot_mode +SYM_CODE_START(secondary_holding_pen) + UNWIND_HINT_EMPTY + bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag mrs x0, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK @@ -711,19 +719,20 @@ pen: ldr x4, [x3] b.eq secondary_startup wfe b pen -SYM_FUNC_END(secondary_holding_pen) +SYM_CODE_END(secondary_holding_pen) /* * Secondary entry point that jumps straight into the kernel. Only to * be used where CPUs are brought online dynamically by the kernel. */ -SYM_FUNC_START(secondary_entry) - bl el2_setup // Drop to EL1 +SYM_CODE_START(secondary_entry) + UNWIND_HINT_EMPTY + bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag b secondary_startup -SYM_FUNC_END(secondary_entry) +SYM_CODE_END(secondary_entry) -SYM_FUNC_START_LOCAL(secondary_startup) +SYM_CODE_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ @@ -733,9 +742,10 @@ SYM_FUNC_START_LOCAL(secondary_startup) bl __enable_mmu ldr x8, =__secondary_switched br x8 -SYM_FUNC_END(secondary_startup) +SYM_CODE_END(secondary_startup) -SYM_FUNC_START_LOCAL(__secondary_switched) +SYM_CODE_START_LOCAL(__secondary_switched) + UNWIND_HINT_EMPTY adr_l x5, vectors msr vbar_el1, x5 isb @@ -748,21 +758,22 @@ SYM_FUNC_START_LOCAL(__secondary_switched) cbz x2, __secondary_too_slow msr sp_el0, x2 scs_load_current - mov x29, #0 - mov x30, #0 + setup_final_frame #ifdef CONFIG_ARM64_PTR_AUTH ptrauth_keys_init_cpu x2, x3, x4, x5 #endif - b secondary_start_kernel -SYM_FUNC_END(__secondary_switched) + bl secondary_start_kernel + ASM_BUG() +SYM_CODE_END(__secondary_switched) -SYM_FUNC_START_LOCAL(__secondary_too_slow) +SYM_CODE_START_LOCAL(__secondary_too_slow) + UNWIND_HINT_EMPTY wfe wfi b __secondary_too_slow -SYM_FUNC_END(__secondary_too_slow) +SYM_CODE_END(__secondary_too_slow) /* * The booting CPU updates the failed status @__early_cpu_boot_status, @@ -794,7 +805,8 @@ SYM_FUNC_END(__secondary_too_slow) * Checks if the selected granule size is supported by the CPU. * If it isn't, park the CPU */ -SYM_FUNC_START(__enable_mmu) +SYM_CODE_START(__enable_mmu) + UNWIND_HINT_EMPTY mrs x2, ID_AA64MMFR0_EL1 ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN @@ -820,9 +832,10 @@ SYM_FUNC_START(__enable_mmu) dsb nsh isb ret -SYM_FUNC_END(__enable_mmu) +SYM_CODE_END(__enable_mmu) -SYM_FUNC_START(__cpu_secondary_check52bitva) +SYM_CODE_START_LOCAL(__cpu_secondary_check52bitva) + UNWIND_HINT_EMPTY #ifdef CONFIG_ARM64_VA_BITS_52 ldr_l x0, vabits_actual cmp x0, #52 @@ -840,9 +853,9 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) #endif 2: ret -SYM_FUNC_END(__cpu_secondary_check52bitva) +SYM_CODE_END(__cpu_secondary_check52bitva) -SYM_FUNC_START_LOCAL(__no_granule_support) +SYM_CODE_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_NO_GRAN, x1, x2 @@ -850,10 +863,11 @@ SYM_FUNC_START_LOCAL(__no_granule_support) wfe wfi b 1b -SYM_FUNC_END(__no_granule_support) +SYM_CODE_END(__no_granule_support) #ifdef CONFIG_RELOCATABLE -SYM_FUNC_START_LOCAL(__relocate_kernel) +SYM_CODE_START_LOCAL(__relocate_kernel) + UNWIND_HINT_EMPTY /* * Iterate over each entry in the relocation table, and apply the * relocations in place. 
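__relocate_kernel above walks the image's RELA table and applies each R_AARCH64_RELATIVE entry in place, adding the KASLR offset to both the location being patched and the value stored there. A minimal C sketch of that loop is given below, assuming the standard Elf64_Rela layout; rela_start, rela_end and kaslr_offset are illustrative stand-ins for the table boundaries and offset the assembly keeps in registers.

/*
 * Sketch only: the in-place RELA walk that __relocate_kernel implements
 * in assembly. Parameter names are illustrative stand-ins.
 */
static void apply_relative_relocs(Elf64_Rela *rela_start, Elf64_Rela *rela_end,
				  u64 kaslr_offset)
{
	Elf64_Rela *rela;

	for (rela = rela_start; rela < rela_end; rela++) {
		if (ELF64_R_TYPE(rela->r_info) != R_AARCH64_RELATIVE)
			continue;
		/* place = r_offset + offset; value = r_addend + offset */
		*(u64 *)(rela->r_offset + kaslr_offset) =
				rela->r_addend + kaslr_offset;
	}
}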
@@ -955,10 +969,10 @@ SYM_FUNC_START_LOCAL(__relocate_kernel) #endif ret -SYM_FUNC_END(__relocate_kernel) +SYM_CODE_END(__relocate_kernel) #endif -SYM_FUNC_START_LOCAL(__primary_switch) +SYM_CODE_START_LOCAL(__primary_switch) #ifdef CONFIG_RANDOMIZE_BASE mov x19, x0 // preserve new SCTLR_EL1 value mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value @@ -1002,4 +1016,4 @@ SYM_FUNC_START_LOCAL(__primary_switch) ldr x8, =__primary_switched adrp x0, __PHYS_OFFSET br x8 -SYM_FUNC_END(__primary_switch) +SYM_CODE_END(__primary_switch) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 8ccca660034e4..8ce1bfe86546b 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -66,6 +67,7 @@ */ .pushsection ".hibernate_exit.text", "ax" SYM_CODE_START(swsusp_arch_suspend_exit) + UNWIND_HINT_EMPTY /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page @@ -120,12 +122,14 @@ SYM_CODE_END(swsusp_arch_suspend_exit) * x24: The physical address of __hyp_stub_vectors */ SYM_CODE_START_LOCAL(el1_sync) + UNWIND_HINT_EMPTY msr vbar_el2, x24 eret SYM_CODE_END(el1_sync) .macro invalid_vector label SYM_CODE_START_LOCAL(\label) + UNWIND_HINT_EMPTY b \label SYM_CODE_END(\label) .endm diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index 9a8a0ae1e75f8..fc98037e12205 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -8,6 +8,7 @@ #include #include #include +#include void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index e4e95821b1f6c..1a25c912572df 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -18,6 +18,7 @@ #include #include #include +#include struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { { "x0", 8, offsetof(struct pt_regs, regs[0])}, diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 2a1ad95d9b2cc..23b9914ef5de5 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include void *module_alloc(unsigned long size) { @@ -151,7 +152,8 @@ enum aarch64_insn_movw_imm_type { }; static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, enum aarch64_insn_movw_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type, + bool early) { u64 imm; s64 sval; @@ -183,7 +185,10 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction with the new encoding. */ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); if (imm > U16_MAX) return -ERANGE; @@ -192,7 +197,8 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, int len, enum aarch64_insn_imm_type imm_type) + int lsb, int len, enum aarch64_insn_imm_type imm_type, + bool early) { u64 imm, imm_mask; s64 sval; @@ -208,7 +214,10 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction's immediate field. 
*/ insn = aarch64_insn_encode_immediate(imm_type, insn, imm); - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); /* * Extract the upper value bits (including the sign bit) and @@ -227,17 +236,17 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, - __le32 *place, u64 val) + __le32 *place, u64 val, bool early) { u32 insn; if (!is_forbidden_offset_for_adrp(place)) return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, early); /* patch ADRP to ADR if it is in range */ if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21, - AARCH64_INSN_IMM_ADR)) { + AARCH64_INSN_IMM_ADR, early)) { insn = le32_to_cpu(*place); insn &= ~BIT(31); } else { @@ -249,7 +258,10 @@ static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, AARCH64_INSN_BRANCH_NOLINK); } - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); return 0; } @@ -266,6 +278,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, void *loc; u64 val; Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + bool early = me->state == MODULE_STATE_UNFORMED; for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* loc corresponds to P in the AArch64 ELF document. */ @@ -318,88 +331,88 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. 
*/ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_SABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_SABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; /* Immediate instruction relocations. 
*/ case R_AARCH64_LD_PREL_LO19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, early); break; case R_AARCH64_ADR_PREL_LO21: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, early); break; case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: - ovf = reloc_insn_adrp(me, sechdrs, loc, val); + ovf = reloc_insn_adrp(me, sechdrs, loc, val, early); if (ovf && ovf != -ERANGE) return ovf; break; @@ -407,40 +420,40 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST16_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST32_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST64_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST128_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_TSTBR14: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14, - AARCH64_INSN_IMM_14); + AARCH64_INSN_IMM_14, early); break; case R_AARCH64_CONDBR19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, early); break; case R_AARCH64_JUMP26: case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, - AARCH64_INSN_IMM_26); + AARCH64_INSN_IMM_26, early); if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && ovf == -ERANGE) { @@ -448,7 +461,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (!val) return -ENOEXEC; ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, - 26, AARCH64_INSN_IMM_26); + 26, AARCH64_INSN_IMM_26, early); } break; diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c new file mode 100644 index 0000000000000..65942fa5dc48c --- /dev/null +++ b/arch/arm64/kernel/patching.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static DEFINE_RAW_SPINLOCK(patch_lock); + +static bool is_exit_text(unsigned long addr) +{ + /* discarded with init text/data */ + return system_state < SYSTEM_RUNNING && + addr >= (unsigned long)__exittext_begin && + addr < (unsigned long)__exittext_end; +} + +static bool is_image_text(unsigned long addr) +{ + return core_kernel_text(addr) || is_exit_text(addr); +} + +static void __kprobes *patch_map(void *addr, int fixmap) +{ + unsigned long uintaddr = (uintptr_t) addr; + bool image = is_image_text(uintaddr); + struct page *page; + + if (image) + page = phys_to_page(__pa_symbol(addr)); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + page = vmalloc_to_page(addr); + else + return addr; + + BUG_ON(!page); + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + + (uintaddr & ~PAGE_MASK)); +} + +static void __kprobes patch_unmap(int fixmap) +{ + clear_fixmap(fixmap); +} +/* + * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always + * little-endian. 
+ */ +int __kprobes aarch64_insn_read(void *addr, u32 *insnp) +{ + int ret; + __le32 val; + + ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); + if (!ret) + *insnp = le32_to_cpu(val); + + return ret; +} + +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +{ + void *waddr = addr; + unsigned long flags = 0; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + + patch_unmap(FIX_TEXT_POKE0); + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} + +int __kprobes aarch64_insn_write(void *addr, u32 insn) +{ + return __aarch64_insn_write(addr, cpu_to_le32(insn)); +} + +int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) +{ + u32 *tp = addr; + int ret; + + /* A64 instructions must be word aligned */ + if ((uintptr_t)tp & 0x3) + return -EINVAL; + + ret = aarch64_insn_write(tp, insn); + if (ret == 0) + __flush_icache_range((uintptr_t)tp, + (uintptr_t)tp + AARCH64_INSN_SIZE); + + return ret; +} + +struct aarch64_insn_patch { + void **text_addrs; + u32 *new_insns; + int insn_cnt; + atomic_t cpu_count; +}; + +static int __kprobes aarch64_insn_patch_text_cb(void *arg) +{ + int i, ret = 0; + struct aarch64_insn_patch *pp = arg; + + /* The last CPU becomes master */ + if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { + for (i = 0; ret == 0 && i < pp->insn_cnt; i++) + ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], + pp->new_insns[i]); + /* Notify other processors with an additional increment. */ + atomic_inc(&pp->cpu_count); + } else { + while (atomic_read(&pp->cpu_count) <= num_online_cpus()) + cpu_relax(); + isb(); + } + + return ret; +} + +int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) +{ + struct aarch64_insn_patch patch = { + .text_addrs = addrs, + .new_insns = insns, + .insn_cnt = cnt, + .cpu_count = ATOMIC_INIT(0), + }; + + if (cnt <= 0) + return -EINVAL; + + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); +} diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 798c3e78b84bb..155daf11e56df 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "decode-insn.h" diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 890ca72c5a514..288a84e253ccb 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -25,7 +25,7 @@ stp x24, x25, [sp, #S_X24] stp x26, x27, [sp, #S_X26] stp x28, x29, [sp, #S_X28] - add x0, sp, #S_FRAME_SIZE + add x0, sp, #PT_REGS_SIZE stp lr, x0, [sp, #S_LR] /* * Construct a useful saved PSTATE @@ -62,7 +62,7 @@ .endm SYM_CODE_START(kretprobe_trampoline) - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE save_all_base_regs @@ -76,7 +76,7 @@ SYM_CODE_START(kretprobe_trampoline) restore_all_base_regs - add sp, sp, #S_FRAME_SIZE + add sp, sp, #PT_REGS_SIZE ret SYM_CODE_END(kretprobe_trampoline) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 3696dbcbfa80c..f80bc9dc43df8 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -438,6 +438,11 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; + /* + 
* For the benefit of the unwinder, set up childregs->stackframe + * as the final frame for the new task. + */ + p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; ptrace_hw_copy_thread(p); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index faa8a6bf2376e..93fc7f55ae839 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -300,6 +301,7 @@ static void qcom_link_stack_sanitisation(void) "mov x30, %0 \n" : "=&r" (tmp)); } +STACK_FRAME_NON_STANDARD(qcom_link_stack_sanitisation); static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void) { @@ -1081,7 +1083,13 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { switch (spectre_bhb_loop_affected(SCOPE_SYSTEM)) { case 8: - kvm_setup_bhb_slot(__spectre_bhb_loop_k8); + /* + * A57/A72-r0 will already have selected the + * spectre-indirect vector, which is sufficient + * for BHB too. + */ + if (!__this_cpu_read(bp_hardening_data.fn)) + kvm_setup_bhb_slot(__spectre_bhb_loop_k8); break; case 24: kvm_setup_bhb_slot(__spectre_bhb_loop_k24); diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 84eec95ec06cc..fa4ddf37a24cf 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -13,6 +13,7 @@ #include #include #include +#include /* * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it. @@ -27,7 +28,7 @@ * during the copy operation. */ SYM_CODE_START(arm64_relocate_new_kernel) - + UNWIND_HINT_EMPTY /* Setup the list loop variables. */ mov x18, x2 /* x18 = dtb address */ mov x17, x1 /* x17 = kimage_start */ @@ -106,10 +107,13 @@ SYM_CODE_END(arm64_relocate_new_kernel) .Lcopy_end: .org KEXEC_CONTROL_PAGE_SIZE +.pushsection ".rodata", "a" /* * arm64_relocate_new_kernel_size - Number of bytes to copy to the * control_code_page. */ .globl arm64_relocate_new_kernel_size +.align 8 arm64_relocate_new_kernel_size: .quad .Lcopy_end - arm64_relocate_new_kernel +.popsection diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b6fbbd527dd79..7820d496d16b2 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -938,6 +939,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, (void __user *)NULL, current); } + if (thread_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index ba40d57757d63..f0df495b68cdd 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -4,6 +4,7 @@ #include #include #include +#include .text /* @@ -91,6 +92,7 @@ SYM_FUNC_START(__cpu_suspend_enter) str x0, [x1] add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS stp x29, lr, [sp, #-16]! 
+ mov x29, sp bl cpu_do_suspend ldp x29, lr, [sp], #16 mov x0, #1 @@ -99,7 +101,8 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) - bl el2_setup // if in EL2 drop to EL1 cleanly + UNWIND_HINT_EMPTY + bl init_kernel_el bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir @@ -110,7 +113,8 @@ SYM_CODE_END(cpu_resume) .ltorg .popsection -SYM_FUNC_START(_cpu_resume) +SYM_CODE_START(_cpu_resume) + UNWIND_HINT_EMPTY mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -146,4 +150,4 @@ SYM_FUNC_START(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -SYM_FUNC_END(_cpu_resume) +SYM_CODE_END(_cpu_resume) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index c445828ecc3aa..4fd04613ebc94 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -12,12 +12,116 @@ #include #include #include +#include #include #include #include #include +struct code_range { + unsigned long start; + unsigned long end; +}; + +static struct code_range *sym_code_functions; +static int num_sym_code_functions; + +int __init init_sym_code_functions(void) +{ + size_t size; + + size = (unsigned long)__sym_code_functions_end - + (unsigned long)__sym_code_functions_start; + + sym_code_functions = kmalloc(size, GFP_KERNEL); + if (!sym_code_functions) + return -ENOMEM; + + memcpy(sym_code_functions, __sym_code_functions_start, size); + /* Update num_sym_code_functions after copying sym_code_functions. */ + smp_mb(); + num_sym_code_functions = size / sizeof(struct code_range); + + return 0; +} +early_initcall(init_sym_code_functions); + +/* + * Check the return PC against sym_code_functions[]. If there is a match, then + * the consider the stack frame unreliable. These functions contain low-level + * code where the frame pointer and/or the return address register cannot be + * relied upon. This addresses the following situations: + * + * - Exception handlers and entry assembly + * - Trampoline assembly (e.g., ftrace, kprobes) + * - Hypervisor-related assembly + * - Hibernation-related assembly + * - CPU start-stop, suspend-resume assembly + * - Kernel relocation assembly + * + * Some special cases covered by sym_code_functions[] deserve a mention here: + * + * - All EL1 interrupt and exception stack traces will be considered + * unreliable. This is the correct behavior as interrupts and exceptions + * can happen on any instruction including ones in the frame pointer + * prolog and epilog. Unless stack metadata is available so the unwinder + * can unwind through these special cases, such stack traces will be + * considered unreliable. + * + * - A task can get preempted at the end of an interrupt. Stack traces + * of preempted tasks will show the interrupt frame in the stack trace + * and will be considered unreliable. + * + * - Breakpoints are exceptions. So, all stack traces in the break point + * handler (including probes) will be considered unreliable. + * + * - All of the ftrace entry trampolines are considered unreliable. So, + * all stack traces taken from tracer functions will be considered + * unreliable. + * + * - The Function Graph Tracer return trampoline (return_to_handler) + * and the Kretprobe return trampoline (kretprobe_trampoline) are + * also considered unreliable. + * + * Some of the special cases above can be unwound through using special logic + * in unwind_frame(). 
+ * + * - return_to_handler() is handled by the unwinder by attempting to + * retrieve the original return address from the per-task return + * address stack. + * + * - kretprobe_trampoline() can be handled in a similar fashion by + * attempting to retrieve the original return address from the per-task + * kretprobe instance list. + * + * - I reckon optprobes can be handled in a similar fashion in the future? + * + * - Stack traces taken from the FTrace tracer functions can be handled + * as well. ftrace_call is an inner label defined in the Ftrace entry + * trampoline. This is the location where the call to a tracer function + * is patched. So, if the return PC equals ftrace_call+4, it is + * reliable. At that point, proper stack frames have already been set + * up for the traced function and its caller. + */ +static bool unwinder_is_unreliable(unsigned long pc) +{ + const struct code_range *range; + int i; + + /* + * If sym_code_functions[] were sorted, a binary search could be + * done to make this more performant. + */ + for (i = 0; i < num_sym_code_functions; i++) { + range = &sym_code_functions[i]; + if (pc >= range->start && pc < range->end) + return true; + } + + return false; +} + /* * AArch64 PCS assigns the frame pointer to x29. * @@ -44,17 +148,25 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long fp = frame->fp; struct stack_info info; - if (fp & 0xf) - return -EINVAL; + frame->reliable = true; if (!tsk) tsk = current; - if (!on_accessible_stack(tsk, fp, &info)) + if (fp & 0xf) { + frame->reliable = false; return -EINVAL; + } - if (test_bit(info.type, frame->stacks_done)) + if (!on_accessible_stack(tsk, fp, &info)) { + frame->reliable = false; return -EINVAL; + } + + if (test_bit(info.type, frame->stacks_done)) { + frame->reliable = false; + return -EINVAL; + } /* * As stacks grow downward, any valid record on the same stack must be @@ -70,8 +182,10 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) * stack. */ if (info.type == frame->prev_type) { - if (fp <= frame->prev_fp) + if (fp <= frame->prev_fp) { + frame->reliable = false; return -EINVAL; + } } else { set_bit(frame->prev_type, frame->stacks_done); } @@ -96,8 +210,10 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) * So replace it to an original value. */ ret_stack = ftrace_graph_get_ret_stack(tsk, frame->graph++); - if (WARN_ON_ONCE(!ret_stack)) + if (WARN_ON_ONCE(!ret_stack)) { + frame->reliable = false; return -EINVAL; + } frame->pc = ret_stack->ret; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -105,13 +221,37 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->pc = ptrauth_strip_insn_pac(frame->pc); /* - * Frames created upon entry from EL0 have NULL FP and PC values, so - * don't bother reporting these. Frames created by __noreturn functions - * might have a valid FP even if PC is bogus, so only terminate where - * both are NULL. + * Check the return PC for conditions that make unwinding unreliable. + * In each case, mark the stack trace as such. */ - if (!frame->fp && !frame->pc) - return -EINVAL; + + /* + * Make sure that the return address is a proper kernel text address. + * A NULL or invalid return address could mean: + * + * - generated code such as eBPF and optprobe trampolines + * - Foreign code (e.g. 
EFI runtime services) + * - Procedure Linkage Table (PLT) entries and veneer functions + */ + if (!__kernel_text_address(frame->pc)) { + frame->reliable = false; + return 0; + } + + /* + * If the final frame has been reached, there is no more unwinding + * to do. There is no need to check if the return PC is considered + * unreliable by the unwinder. + */ + if (!frame->fp) + return 0; + + /* Final frame; nothing to unwind */ + if (frame->fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + if (unwinder_is_unreliable(frame->pc)) + frame->reliable = false; return 0; } @@ -218,4 +358,39 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, walk_stackframe(task, &frame, consume_entry, cookie); } +/* + * Walk the stack like arch_stack_walk() but stop the walk as soon as + * some unreliability is detected in the stack. + */ +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) +{ + struct stackframe frame; + int ret = 0; + + if (task == current) { + start_backtrace(&frame, + (unsigned long)__builtin_frame_address(0), + (unsigned long)arch_stack_walk_reliable); + } else { + /* + * The task must not be running anywhere for the duration of + * arch_stack_walk_reliable(). The caller must guarantee + * this. + */ + start_backtrace(&frame, thread_saved_fp(task), + thread_saved_pc(task)); + } + + while (!ret) { + if (!frame.reliable) + return -EINVAL; + if (!consume_entry(cookie, frame.pc)) + return -EINVAL; + ret = unwind_frame(task, &frame); + } + + return ret == -ENOENT ? 0 : -EINVAL; +} + #endif diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 2cdd53425509d..1e010cf19e307 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index d65f52264abae..a8f8e409e2bfb 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -28,7 +28,7 @@ ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ $(btildflags-y) -T ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 -ccflags-y += -DDISABLE_BRANCH_PROFILING +ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) KASAN_SANITIZE := n diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index abad38c576e1d..57b28f1e5d97c 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -37,7 +37,7 @@ cc32-as-instr = $(call try-run,\ # As a result we set our own flags here. # KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile -VDSO_CPPFLAGS := -D__KERNEL__ -nostdinc +VDSO_CPPFLAGS := -DBUILD_VDSO -D__KERNEL__ -nostdinc VDSO_CPPFLAGS += -isystem $(shell $(CC_COMPAT) -print-file-name=include 2>/dev/null) VDSO_CPPFLAGS += $(LINUXINCLUDE) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 71f4b5f24d15f..df578971e373a 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -85,6 +85,12 @@ jiffies = jiffies_64; #define TRAMP_TEXT #endif +#define SYM_CODE_FUNCTIONS \ + . 
= ALIGN(16); \ + __sym_code_functions_start = .; \ + KEEP(*(sym_code_functions)) \ + __sym_code_functions_end = .; + /* * The size of the PE/COFF section that covers the kernel image, which * runs from _stext to _edata, must be a round multiple of the PE/COFF @@ -199,7 +205,8 @@ SECTIONS INIT_CALLS CON_INITCALL INIT_RAM_FS - *(.init.rodata.* .init.bss) /* from the EFI stub */ + *(.init.altinstructions .init.bss) /* from the EFI stub */ + SYM_CODE_FUNCTIONS } .exit.data : { EXIT_DATA diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4d63fcd7574b2..67b8d2271d61f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -198,6 +198,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: case KVM_CAP_ARM_NISV_TO_USER: case KVM_CAP_ARM_INJECT_EXT_DABT: + case KVM_CAP_PTP_KVM: r = 1; break; case KVM_CAP_ARM_SET_DEVICE_ADDR: diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 0c66a1d408fd7..34c35069a01f9 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -14,13 +14,15 @@ #include #include #include +#include .text /* * u64 __guest_enter(struct kvm_vcpu *vcpu); */ -SYM_FUNC_START(__guest_enter) +SYM_CODE_START(__guest_enter) + UNWIND_HINT_FUNC // x0: vcpu // x1-x17: clobbered by macros // x29: guest context @@ -104,6 +106,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x1: vcpu // x2-x29,lr: vcpu regs // vcpu x0-x1 on the stack + UNWIND_HINT_FUNC sp_offset=16 add x1, x1, #VCPU_CONTEXT @@ -203,4 +206,4 @@ abort_guest_exit_end: msr spsr_el2, x4 orr x0, x0, x5 1: ret -SYM_FUNC_END(__guest_enter) +SYM_CODE_END(__guest_enter) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index bc06243cf4225..497f69819395b 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -149,6 +149,7 @@ SYM_CODE_END(\label) .macro valid_vect target .align 7 + UNWIND_HINT_EMPTY 661: esb stp x0, x1, [sp, #-16]! @@ -160,6 +161,7 @@ check_preamble_length 661b, 662b .macro invalid_vect target .align 7 + UNWIND_HINT_EMPTY 661: nop stp x0, x1, [sp, #-16]! diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 452f4cacd6743..b47aac452fe18 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -164,7 +164,7 @@ static u32 __vgic_v3_read_ap0rn(int n) val = read_gicreg(ICH_AP0R3_EL2); break; default: - unreachable(); + BUG(); } return val; @@ -188,7 +188,7 @@ static u32 __vgic_v3_read_ap1rn(int n) val = read_gicreg(ICH_AP1R3_EL2); break; default: - unreachable(); + BUG(); } return val; diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index bc111a1aff032..39e34d88acf60 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -9,16 +9,65 @@ #include #include +static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) +{ + struct system_time_snapshot systime_snapshot; + u64 cycles = ~0UL; + u32 feature; + + /* + * system time and counter value must captured at the same + * time to keep consistency and precision. + */ + ktime_get_snapshot(&systime_snapshot); + + /* + * This is only valid if the current clocksource is the + * architected counter, as this is the only one the guest + * can see. + */ + if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER) + return; + + /* + * The guest selects one of the two reference counters + * (virtual or physical) with the first argument of the SMCCC + * call. In case the identifier is not supported, error out. 
+ */ + feature = smccc_get_arg1(vcpu); + switch (feature) { + case KVM_PTP_VIRT_COUNTER: + cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2); + break; + case KVM_PTP_PHYS_COUNTER: + cycles = systime_snapshot.cycles; + break; + default: + return; + } + + /* + * This relies on the top bit of val[0] never being set for + * valid values of system time, because that is *really* far + * in the future (about 292 years from 1970, and at that stage + * nobody will give a damn about it). + */ + val[0] = upper_32_bits(systime_snapshot.real); + val[1] = lower_32_bits(systime_snapshot.real); + val[2] = upper_32_bits(cycles); + val[3] = lower_32_bits(cycles); +} + int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) { u32 func_id = smccc_get_function(vcpu); - long val = SMCCC_RET_NOT_SUPPORTED; + u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; u32 feature; gpa_t gpa; switch (func_id) { case ARM_SMCCC_VERSION_FUNC_ID: - val = ARM_SMCCC_VERSION_1_1; + val[0] = ARM_SMCCC_VERSION_1_1; break; case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: feature = smccc_get_arg1(vcpu); @@ -28,10 +77,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) case SPECTRE_VULNERABLE: break; case SPECTRE_MITIGATED: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; case SPECTRE_UNAFFECTED: - val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; break; } break; @@ -54,7 +103,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; fallthrough; case SPECTRE_UNAFFECTED: - val = SMCCC_RET_NOT_REQUIRED; + val[0] = SMCCC_RET_NOT_REQUIRED; break; } break; @@ -63,30 +112,43 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) case SPECTRE_VULNERABLE: break; case SPECTRE_MITIGATED: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; case SPECTRE_UNAFFECTED: - val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; break; } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = kvm_hypercall_pv_features(vcpu); + val[0] = kvm_hypercall_pv_features(vcpu); break; case ARM_SMCCC_HV_PV_TIME_ST: gpa = kvm_init_stolen_time(vcpu); if (gpa != GPA_INVALID) - val = gpa; + val[0] = gpa; + break; + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0; + val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1; + val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2; + val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: + kvm_ptp_get_time(vcpu, val); break; default: return kvm_psci_call(vcpu); } - smccc_set_retval(vcpu, val, 0, 0, 0); + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); return 1; } diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index d31e1169d9b8e..9cd83908717da 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_in_user.o copy_page.o \ - clear_page.o csum.o memchr.o memcpy.o memmove.o \ - memset.o memcmp.o strcmp.o strncmp.o strlen.o \ - strnlen.o strchr.o strrchr.o tishift.o + clear_page.o csum.o insn.o memchr.o memcpy.o \ + memmove.o memset.o memcmp.o strcmp.o strncmp.o \ + strlen.o strnlen.o strchr.o strrchr.o tishift.o ifeq ($(CONFIG_KERNEL_MODE_NEON), y) 
obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/lib/insn.c similarity index 92% rename from arch/arm64/kernel/insn.c rename to arch/arm64/lib/insn.c index 7d4fdf9745428..c2374373a70e1 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/lib/insn.c @@ -7,21 +7,14 @@ */ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include -#include -#include +#include /* __ignore_sync_check__ */ +#include #include -#include -#include +#include /* __ignore_sync_check__ */ #define AARCH64_INSN_SF_BIT BIT(31) #define AARCH64_INSN_N_BIT BIT(22) @@ -30,7 +23,7 @@ static const int aarch64_insn_encoding_class[] = { AARCH64_INSN_CLS_UNKNOWN, AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_SVE, AARCH64_INSN_CLS_UNKNOWN, AARCH64_INSN_CLS_LDST, AARCH64_INSN_CLS_DP_REG, @@ -83,81 +76,6 @@ bool aarch64_insn_is_branch_imm(u32 insn) aarch64_insn_is_bcond(insn)); } -static DEFINE_RAW_SPINLOCK(patch_lock); - -static bool is_exit_text(unsigned long addr) -{ - /* discarded with init text/data */ - return system_state < SYSTEM_RUNNING && - addr >= (unsigned long)__exittext_begin && - addr < (unsigned long)__exittext_end; -} - -static bool is_image_text(unsigned long addr) -{ - return core_kernel_text(addr) || is_exit_text(addr); -} - -static void __kprobes *patch_map(void *addr, int fixmap) -{ - unsigned long uintaddr = (uintptr_t) addr; - bool image = is_image_text(uintaddr); - struct page *page; - - if (image) - page = phys_to_page(__pa_symbol(addr)); - else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - page = vmalloc_to_page(addr); - else - return addr; - - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); -} - -static void __kprobes patch_unmap(int fixmap) -{ - clear_fixmap(fixmap); -} -/* - * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always - * little-endian. 
- */ -int __kprobes aarch64_insn_read(void *addr, u32 *insnp) -{ - int ret; - __le32 val; - - ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); - if (!ret) - *insnp = le32_to_cpu(val); - - return ret; -} - -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) -{ - void *waddr = addr; - unsigned long flags = 0; - int ret; - - raw_spin_lock_irqsave(&patch_lock, flags); - waddr = patch_map(addr, FIX_TEXT_POKE0); - - ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); - - patch_unmap(FIX_TEXT_POKE0); - raw_spin_unlock_irqrestore(&patch_lock, flags); - - return ret; -} - -int __kprobes aarch64_insn_write(void *addr, u32 insn) -{ - return __aarch64_insn_write(addr, cpu_to_le32(insn)); -} - bool __kprobes aarch64_insn_uses_literal(u32 insn) { /* ldr/ldrsw (literal), prfm */ @@ -187,67 +105,6 @@ bool __kprobes aarch64_insn_is_branch(u32 insn) aarch64_insn_is_bcond(insn); } -int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) -{ - u32 *tp = addr; - int ret; - - /* A64 instructions must be word aligned */ - if ((uintptr_t)tp & 0x3) - return -EINVAL; - - ret = aarch64_insn_write(tp, insn); - if (ret == 0) - __flush_icache_range((uintptr_t)tp, - (uintptr_t)tp + AARCH64_INSN_SIZE); - - return ret; -} - -struct aarch64_insn_patch { - void **text_addrs; - u32 *new_insns; - int insn_cnt; - atomic_t cpu_count; -}; - -static int __kprobes aarch64_insn_patch_text_cb(void *arg) -{ - int i, ret = 0; - struct aarch64_insn_patch *pp = arg; - - /* The last CPU becomes master */ - if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { - for (i = 0; ret == 0 && i < pp->insn_cnt; i++) - ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], - pp->new_insns[i]); - /* Notify other processors with an additional increment. */ - atomic_inc(&pp->cpu_count); - } else { - while (atomic_read(&pp->cpu_count) <= num_online_cpus()) - cpu_relax(); - isb(); - } - - return ret; -} - -int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) -{ - struct aarch64_insn_patch patch = { - .text_addrs = addrs, - .new_insns = insns, - .insn_cnt = cnt, - .cpu_count = ATOMIC_INIT(0), - }; - - if (cnt <= 0) - return -EINVAL; - - return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); -} - static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, u32 *maskp, int *shiftp) { diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 03ca6d8b86706..cceed41bba153 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index aacc7eab9b2ff..72d2181ae38dd 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -222,8 +222,8 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1) * * Called exactly once from stop_machine context by each CPU found during boot. 
*/ -__idmap_kpti_flag: - .long 1 +SYM_DATA_LOCAL(__idmap_kpti_flag, .long 1) + SYM_FUNC_START(idmap_kpti_install_ng_mappings) cpu .req w0 num_cpus .req w1 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6dc670e363939..76424a04383d7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -104,6 +104,7 @@ config X86 select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 + select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE @@ -2525,6 +2526,9 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y depends on MEMORY_HOTPLUG +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + def_bool y + config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b92fffbe761fd..8c27630a774d0 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier: GPL-2.0 /* * kaslr.c @@ -32,6 +33,9 @@ #include #include +/* xen_cpuid_base/hypervisor_cpuid_base inlines */ +#include + /* Macros used by the included decompressor code below. */ #define STATIC #include @@ -839,6 +843,10 @@ void choose_random_location(unsigned long input, warn("KASLR disabled: 'nokaslr' on cmdline."); return; } + if (xen_cpuid_base() != 0) { + warn("KASLR disabled: Xen hypervisor detected."); + return; + } boot_params->hdr.loadflags |= KASLR_FLAG; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5a54c3685a066..bebeea9e771e3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -298,7 +298,6 @@ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ #define X86_FEATURE_MSR_TSX_CTRL (11*32+18) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ #define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ @@ -446,7 +445,7 @@ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ #define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_SMT_RSB X86_BUG(28) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ #define X86_BUG_GDS X86_BUG(29) /* CPU is affected by Gather Data Sampling */ /* BUG word 2 */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3e9f1c820edbf..64cd6fb22b325 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -668,7 +668,7 @@ struct kvm_vcpu_arch { } st; u64 l1_tsc_offset; - u64 tsc_offset; + u64 tsc_offset; /* current tsc offset */ u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; @@ -682,7 +682,8 @@ struct kvm_vcpu_arch { u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; u64 msr_ia32_power_ctl; - u64 tsc_scaling_ratio; + u64 l1_tsc_scaling_ratio; + u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -1195,8 +1196,10 @@ 
struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - /* Returns actual tsc_offset set in active VMCS */ - u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); + u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); /* * Retrieve somewhat arbitrary exit information. Intended to be used @@ -1682,8 +1685,10 @@ void kvm_define_user_return_msr(unsigned index, u32 msr); int kvm_probe_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/include/asm/l1d_flush.h b/arch/x86/include/asm/l1d_flush.h new file mode 100644 index 0000000000000..fdb798e634614 --- /dev/null +++ b/arch/x86/include/asm/l1d_flush.h @@ -0,0 +1,7 @@ +#ifndef _L1D_FLUSH_H +#define _L1D_FLUSH_H + +void l1d_flush_init(void); +void l1d_flush_sw(void); + +#endif /* _L1D_FLUSH_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 202a52e42a368..cdeaa099ad0e7 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -153,10 +153,6 @@ * are restricted to targets in * kernel. */ -#define ARCH_CAP_PBRSB_NO BIT(24) /* - * Not susceptible to Post-Barrier - * Return Stack Buffer Predictions. - */ #define ARCH_CAP_GDS_CTRL BIT(25) /* * CPU is vulnerable to Gather * Data Sampling (GDS) and diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 99fbce2c1c7c1..6a1011989e471 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -147,10 +147,9 @@ * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP * monstrosity above, manually. */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS) - ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \ - __stringify(__FILL_ONE_RETURN), \ftr2 +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr .Lskip_rsb_\@: .endm @@ -352,6 +351,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); +DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); #include diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index fdbffec4cfdea..5a2baf28a1dcd 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -40,6 +40,8 @@ #define ORC_REG_MAX 15 #ifndef __ASSEMBLY__ +#include + /* * This struct is more or less a vastly simplified version of the DWARF Call * Frame Information standard. 
It contains only the necessary parts of DWARF @@ -51,10 +53,18 @@ struct orc_entry { s16 sp_offset; s16 bp_offset; +#if defined(__LITTLE_ENDIAN_BITFIELD) unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; unsigned end:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned bp_reg:4; + unsigned sp_reg:4; + unsigned unused:5; + unsigned end:1; + unsigned type:2; +#endif } __packed; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2dd9b661a5fd5..5ecf1d6a20dca 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -136,6 +136,8 @@ struct cpuinfo_x86 { u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; + /* Is SMT active on this core? */ + bool smt_active; u32 microcode; /* Address space bits used by the cache internally */ u8 x86_cache_bits; diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index ea1d8eb644cb7..d17b39893b797 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -55,9 +55,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "pushf; pop %0; " __ASM_CLAC "\n\t" - "1:" + ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); return flags; @@ -66,9 +65,8 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "push %0; popf\n\t" - "1:" + ALTERNATIVE("", "push %0; popf\n\t", + X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 012c8ee93b67f..753053cd50380 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -84,7 +84,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ +#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ @@ -97,6 +97,7 @@ struct thread_info { #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ @@ -114,7 +115,7 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) +#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) @@ -126,6 +127,7 @@ struct thread_info { #define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << 
TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) +#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8c87a2e0b660c..a927d40664df7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -83,7 +83,7 @@ struct tlb_state { /* Last user mm for optimizing IBPB */ union { struct mm_struct *last_user_mm; - unsigned long last_user_mm_ibpb; + unsigned long last_user_mm_spec; }; u16 loaded_mm_asid; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c06f3a961d647..1419853536995 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -169,3 +169,5 @@ ifeq ($(CONFIG_X86_64),y) endif obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o + +obj-y += l1d_flush.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index f1bb57b0e41ea..cf340d85946a8 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index c8daa92f38dcd..b57333f567bcf 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ .text #include +#include #include #include #include @@ -126,6 +127,7 @@ SYM_FUNC_START(do_suspend_lowlevel) FRAME_END jmp restore_processor_state SYM_FUNC_END(do_suspend_lowlevel) +STACK_FRAME_NON_STANDARD do_suspend_lowlevel .data saved_rbp: .quad 0 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d31639e3ce282..c09192504eafe 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "cpu.h" @@ -45,6 +46,7 @@ static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init l1d_flush_select_mitigation(void); static void __init gds_select_mitigation(void); static void __init srso_select_mitigation(void); @@ -114,6 +116,13 @@ EXPORT_SYMBOL_GPL(mds_user_clear); DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); +/* + * Controls whether l1d flush based mitigations are enabled, + * based on hw features and admin setting via boot parameter + * defaults to false + */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + /* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); EXPORT_SYMBOL_GPL(mmio_stale_data_clear); @@ -155,6 +164,7 @@ void __init cpu_select_mitigations(void) l1tf_select_mitigation(); md_clear_select_mitigation(); srbds_select_mitigation(); + l1d_flush_select_mitigation(); gds_select_mitigation(); srso_select_mitigation(); } @@ -614,6 +624,37 @@ static int __init srbds_parse_cmdline(char *str) } early_param("srbds", srbds_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "L1D Flush : " fmt + +enum l1d_flush_mitigations { + L1D_FLUSH_OFF = 0, + L1D_FLUSH_ON, +}; + +static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF; + +static 
void __init l1d_flush_select_mitigation(void) +{ + if (!l1d_flush_mitigation) + return; + + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + l1d_flush_init(); + + static_branch_enable(&switch_mm_cond_l1d_flush); + pr_info("Conditional flush on switch_mm() enabled\n"); +} + +static int __init l1d_flush_parse_cmdline(char *str) +{ + if (!strcmp(str, "on")) + l1d_flush_mitigation = L1D_FLUSH_ON; + + return 0; +} +early_param("l1d_flush", l1d_flush_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "GDS: " fmt @@ -1422,53 +1463,6 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) } } -static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) -{ - /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: - * - * 1) RSB underflow - * - * 2) Poisoned RSB entry - * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. - * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. - * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. - */ - switch (mode) { - case SPECTRE_V2_NONE: - return; - - case SPECTRE_V2_EIBRS_LFENCE: - case SPECTRE_V2_EIBRS: - if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); - pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); - } - return; - - case SPECTRE_V2_EIBRS_RETPOLINE: - case SPECTRE_V2_RETPOLINE: - case SPECTRE_V2_LFENCE: - case SPECTRE_V2_IBRS: - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); - pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); - return; - } - - pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); - dump_stack(); -} - static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1619,7 +1613,28 @@ static void __init spectre_v2_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + /* + * Similar to context switches, there are two types of RSB attacks + * after vmexit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS, on the other hand, has RSB-poisoning protections, so it + * doesn't need RSB clearing after vmexit. + */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) || + boot_cpu_has(X86_FEATURE_KERNEL_IBRS)) + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); /* * Retpoline protects the kernel, but doesn't protect firmware. 
IBRS @@ -1922,6 +1937,24 @@ static void task_update_spec_tif(struct task_struct *tsk) speculation_ctrl_update_current(); } +static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return -EPERM; + + switch (ctrl) { + case PR_SPEC_ENABLE: + set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + case PR_SPEC_DISABLE: + clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + default: + return -ERANGE; + } +} + static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) { if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && @@ -2033,6 +2066,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, return ssb_prctl_set(task, ctrl); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_set(task, ctrl); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_set(task, ctrl); default: return -ENODEV; } @@ -2049,6 +2084,17 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) } #endif +static int l1d_flush_prctl_get(struct task_struct *task) +{ + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return PR_SPEC_FORCE_DISABLE; + + if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + else + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; +} + static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { @@ -2099,6 +2145,8 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return ssb_prctl_get(task); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_get(task); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_get(task); default: return -ENODEV; } @@ -2536,19 +2584,6 @@ static char *ibpb_state(void) return ""; } -static char *pbrsb_eibrs_state(void) -{ - if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || - boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) - return ", PBRSB-eIBRS: SW sequence"; - else - return ", PBRSB-eIBRS: Vulnerable"; - } else { - return ", PBRSB-eIBRS: Not affected"; - } -} - static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2561,13 +2596,12 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sprintf(buf, "%s%s%s%s%s%s%s\n", + return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
", RSB filling" : "", - pbrsb_eibrs_state(), spectre_v2_module_string()); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1f2360309120..ad85ac6c83297 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1036,7 +1036,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) #define NO_MMIO BIT(9) -#define NO_EIBRS_PBRSB BIT(10) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1082,7 +1081,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1092,9 +1091,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), /* AMD Family 0xf - 0x12 */ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), @@ -1172,8 +1169,8 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SRSO), - VULNBL_HYGON(0x18, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), VULNBL_AMD(0x19, SRSO), {} }; @@ -1287,10 +1284,8 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); } - if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && - !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) - setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) + setup_force_cpu_bug(X86_BUG_SMT_RSB); /* * Check if CPU is vulnerable to GDS. If running in a virtual machine on @@ -1448,6 +1443,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_core_cap_bits(c); + fpu__init_system(); + #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says @@ -2144,6 +2141,8 @@ void cpu_init(void) doublefault_init_cpu_tss(); + fpu__init_cpu(); + if (is_uv_system()) uv_cpu_init(); @@ -2159,7 +2158,6 @@ void cpu_init_secondary(void) */ cpu_init_exception_handling(); cpu_init(); - fpu__init_cpu(); } #endif @@ -2254,13 +2252,6 @@ void __init arch_cpu_finalize_init(void) '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); } - /* - * Must be before alternatives because it might set or clear - * feature bits. 
- */ - fpu__init_system(); - fpu__init_cpu(); - alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 5b2dabedcf664..b3b7ecbed0a32 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -322,8 +322,7 @@ SYM_FUNC_START(ftrace_graph_caller) RET SYM_FUNC_END(ftrace_graph_caller) -SYM_CODE_START(return_to_handler) - UNWIND_HINT_EMPTY +SYM_FUNC_START(return_to_handler) subq $16, %rsp /* Save the return values */ @@ -350,5 +349,5 @@ SYM_CODE_START(return_to_handler) mov %rdi, (%rsp) UNWIND_HINT_FUNC RET -SYM_CODE_END(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c new file mode 100644 index 0000000000000..f40efc5095d72 --- /dev/null +++ b/arch/x86/kernel/l1d_flush.c @@ -0,0 +1,61 @@ +/* + * This software-based L1D flush implementation is taken from the following + * functions from KVM: + * + * o arch/x86/kvm/vmx/vmx.c::vmx_setup_l1d_flush() + * o arch/x86/kvm/vmx/vmx.c::vmx_l1d_flush() + * + * As we did in the AL2 5.4 version this version doesn't populate the + * TLB and only performs the cache filling part. + */ +#include +#include + +#define L1D_CACHE_ORDER 4 + +static void *l1d_flush_pages; + +void __init l1d_flush_init(void) +{ + struct page *page; + int i; + + /* + * This allocation for l1d_flush_pages is not tied to a task's + * lifetime and so should not be charged to a memcg. + */ + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + BUG_ON(!page); + + l1d_flush_pages = page_address(page); + + /* + * The original implementation in vmx_l1d_flush() does this + * initialization to protect against KSM for nested Virt. + * Let's keep it just in case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } +} + +void l1d_flush_sw(void) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + BUG_ON(!l1d_flush_pages); + + asm volatile( + /* Fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (l1d_flush_pages), + [size] "r" (size) + : "eax", "ecx"); +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 065152d9265e4..784cb75bb998d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -47,6 +47,7 @@ #include #include #include +#include #include /* @@ -839,6 +840,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif +#ifdef CONFIG_RANDOMIZE_BASE + printk(KERN_INFO "KASLR %s\n", kaslr_enabled() ? "enabled" : "disabled"); +#endif /* * If we have OLPC OFW, we might end up relocating the fixmap due to diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d2403da17842b..9aece7f7cd081 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -616,6 +616,9 @@ void set_cpu_sibling_map(int cpu) if (threads > __max_smt_threads) __max_smt_threads = threads; + for_each_cpu(i, topology_sibling_cpumask(cpu)) + cpu_data(i).smt_active = threads > 1; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. 
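To make the inline assembly fill loop in l1d_flush_sw() above easier to follow, here is a rough, illustrative-only C rendition. The names l1d_fill_cache_c and CACHELINE are mine; the real code stays in asm so the compiler cannot reorder or drop the loads, and it ends with an lfence.

#include <stddef.h>

#define L1D_CACHE_ORDER	4
#define PAGE_SIZE	4096UL
#define CACHELINE	64	/* one load per cache line, matching the addl $64 step */

/* Touch one byte per cache line across the 16 flush pages so the loads
 * displace whatever currently sits in the L1D.  'flush_pages' stands in
 * for the l1d_flush_pages buffer allocated in l1d_flush_init().
 */
static void l1d_fill_cache_c(const volatile unsigned char *flush_pages)
{
	size_t size = PAGE_SIZE << L1D_CACHE_ORDER;
	size_t offset;
	unsigned char sink = 0;

	for (offset = 0; offset < size; offset += CACHELINE)
		sink ^= flush_pages[offset];	/* one load per line, as the asm does */

	(void)sink;	/* keep the loads "used"; the real sequence ends with lfence */
}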
@@ -1560,8 +1563,13 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, topology_die_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); - for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) { cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); + if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1) + cpu_data(sibling).smt_active = false; + } + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 13d1a0ac8916a..fdf97fe5b08db 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -1574,3 +1575,31 @@ unsigned long calibrate_delay_is_known(void) return 0; } #endif + +static int tsc_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_HIBERNATION_PREPARE: + clear_sched_clock_stable(); + break; + case PM_POST_HIBERNATION: + /* Set back to the default */ + if (!check_tsc_unstable()) + set_sched_clock_stable(); + break; + } + + return 0; +}; + +static struct notifier_block tsc_pm_notifier_block = { + .notifier_call = tsc_pm_notifier, +}; + +static int tsc_setup_pm_notifier(void) +{ + return register_pm_notifier(&tsc_pm_notifier_block); +} + +subsys_initcall(tsc_setup_pm_notifier); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8544bca6b3356..352d8300263c9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1061,26 +1061,33 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u64 g_tsc_offset = 0; - if (is_guest_mode(vcpu)) { - /* Write L1's TSC offset. 
*/ - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = offset; - } + return svm->nested.ctl.tsc_offset; +} - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset - g_tsc_offset, - offset); +static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + return kvm_default_tsc_scaling_ratio; +} - svm->vmcb->control.tsc_offset = offset + g_tsc_offset; +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (is_guest_mode(&svm->vcpu)) { + svm->nested.hsave->control.tsc_offset = + vcpu->arch.l1_tsc_offset; + } + svm->vmcb->control.tsc_offset = offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - return svm->vmcb->control.tsc_offset; +} + +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); } static void svm_check_invpcid(struct vcpu_svm *svm) @@ -4294,7 +4301,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .write_l1_tsc_offset = svm_write_l1_tsc_offset, + .get_l2_tsc_offset = svm_get_l2_tsc_offset, + .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, + .write_tsc_offset = svm_write_tsc_offset, + .write_tsc_multiplier = svm_write_tsc_multiplier, .load_mmu_pgd = svm_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c165ddbb672fe..f545e62acc04f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2308,8 +2308,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_DESC); - + SECONDARY_EXEC_DESC | + SECONDARY_EXEC_TSC_SCALING); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & @@ -2569,10 +2569,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + vmx_get_l2_tsc_offset(vcpu), + vmx_get_l2_tsc_multiplier(vcpu)); + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + vcpu->arch.l1_tsc_scaling_ratio, + vmx_get_l2_tsc_multiplier(vcpu)); + + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -3410,8 +3418,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, } enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) - vcpu->arch.tsc_offset += vmcs12->tsc_offset; if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; @@ -4517,8 +4523,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) - vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { + vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + vcpu->arch.tsc_scaling_ratio = 
vcpu->arch.l1_tsc_scaling_ratio; + } if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); @@ -4577,12 +4586,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_has_tsc_control) + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); + if (vmx->nested.l1_tpr_threshold != -1) vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); - if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); @@ -6411,6 +6420,40 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void) } } +/* + * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo + * that madness to get the encoding for comparison. + */ +#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) + +static u64 nested_vmx_calc_vmcs_enum_msr(void) +{ + /* + * Note these are the so called "index" of the VMCS field encoding, not + * the index into vmcs12. + */ + unsigned int max_idx, idx; + int i; + + /* + * For better or worse, KVM allows VMREAD/VMWRITE to all fields in + * vmcs12, regardless of whether or not the associated feature is + * exposed to L1. Simply find the field with the highest index. + */ + max_idx = 0; + for (i = 0; i < nr_vmcs12_fields; i++) { + /* The vmcs12 table is very, very sparsely populated. */ + if (!vmcs_field_to_offset_table[i]) + continue; + + idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); + if (idx > max_idx) + max_idx = idx; + } + + return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; +} + /* * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be * returned for the various VMX controls MSRs when nested VMX is enabled. 
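A quick way to sanity-check the rotation undone by VMCS12_IDX_TO_ENC() and the index extraction used by nested_vmx_calc_vmcs_enum_msr() above (vmcs_field_index() itself is added to vmcs.h a bit further down in this patch) is a stand-alone sketch. The helper macros below are local re-derivations rather than kernel code; TSC_OFFSET's architectural encoding 0x2010 serves as the test value.

#include <stdint.h>
#include <stdio.h>

/* The vmcs12 offset table is indexed by the field encoding rotated left
 * by 6 bits; VMCS12_IDX_TO_ENC() rotates right by 6 to undo that, and
 * the field "index" lives in bits 9:1 of the encoding (GENMASK(9, 1)).
 */
#define ROL16(v, n)	((uint16_t)(((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n)))))
#define IDX_TO_ENC(i)	((uint16_t)(((uint16_t)(i) >> 6) | ((uint16_t)(i) << 10)))
#define FIELD_INDEX(e)	(((e) & 0x3feu) >> 1)

int main(void)
{
	uint16_t enc = 0x2010;			/* TSC_OFFSET */
	uint16_t idx = ROL16(enc, 6);		/* how vmcs12.c builds its table index */

	printf("table idx 0x%03x -> enc 0x%04x -> field index %u\n",
	       idx, IDX_TO_ENC(idx), FIELD_INDEX(IDX_TO_ENC(idx)));
	/* Prints: table idx 0x408 -> enc 0x2010 -> field index 8 */
	return 0;
}

For comparison, running VMX_PREEMPTION_TIMER_VALUE (encoding 0x482e) through FIELD_INDEX() gives 0x17, which matches the VMCS12_MAX_FIELD_INDEX constant that the calculated MSR value replaces.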
@@ -6547,7 +6590,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_TSC_SCALING; /* * We can emulate "VMCS shadowing," even if the hardware @@ -6651,8 +6695,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); - /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } void nested_vmx_hardware_unsetup(void) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 69c147df957fd..4b0601a82f7fa 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -174,4 +174,12 @@ static inline int vmcs_field_readonly(unsigned long field) return (((field >> 10) & 0x3) == 1); } +#define VMCS_FIELD_INDEX_SHIFT (1) +#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1) + +static inline unsigned int vmcs_field_index(unsigned long field) +{ + return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT; +} + #endif /* __KVM_X86_VMX_VMCS_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index c8e51c004f782..989e867e4056f 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -37,6 +37,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), FIELD64(PML_ADDRESS, pml_address), FIELD64(TSC_OFFSET, tsc_offset), + FIELD64(TSC_MULTIPLIER, tsc_multiplier), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 80232daf00ff1..d87a6f828a2b1 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -69,7 +69,8 @@ struct __packed vmcs12 { u64 vm_function_control; u64 eptp_list_address; u64 pml_address; - u64 padding64[3]; /* room for future expansion */ + u64 tsc_multiplier; + u64 padding64[2]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -203,12 +204,6 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE -/* - * VMCS12_MAX_FIELD_INDEX is the highest index value used in any - * supported VMCS12 field encoding. - */ -#define VMCS12_MAX_FIELD_INDEX 0x17 - /* * For save/restore compatibility, the vmcs12 field offsets must not change. */ @@ -256,6 +251,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(vm_function_control, 296); CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(tsc_multiplier, 320); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 982138bebb70f..857fa0fc49faf 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -197,13 +197,11 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) * entries and (in some cases) RSB underflow. * * eIBRS has its own protection against poisoned RSB, so it doesn't - * need the RSB filling sequence. But it does need to be enabled, and a - * single call to retire, before the first unbalanced RET. 
+ * need the RSB filling sequence. But it does need to be enabled + * before the first unbalanced RET. */ - FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ - X86_FEATURE_RSB_VMEXIT_LITE - + FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT pop %_ASM_ARG2 /* @flags */ pop %_ASM_ARG1 /* @vmx */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2445c61038954..16271591264e2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1462,11 +1462,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, vmx->loaded_vmcs->cpu = cpu; } - - /* Setup TSC multiplier */ - if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) - decache_tsc_multiplier(vmx); } /* @@ -1826,26 +1821,35 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx_update_msr_bitmap(&vmx->vcpu); } -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u64 g_tsc_offset = 0; - /* - * We're here if L1 chose not to trap WRMSR to TSC. According - * to the spec, this should set L1's TSC; The offset that L1 - * set for L2 remains unchanged, and still needs to be added - * to the newly set TSC to get L2's TSC. - */ - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) - g_tsc_offset = vmcs12->tsc_offset; + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) + return vmcs12->tsc_offset; + + return 0; +} + +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + return vmcs12->tsc_multiplier; - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vcpu->arch.tsc_offset - g_tsc_offset, - offset); - vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); - return offset + g_tsc_offset; + return kvm_default_tsc_scaling_ratio; +} + +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + vmcs_write64(TSC_OFFSET, offset); +} + +static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + vmcs_write64(TSC_MULTIPLIER, multiplier); } /* @@ -6748,9 +6752,12 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, * For legacy IBRS, the IBRS bit always needs to be written after * transitioning from a less privileged predictor mode, regardless of * whether the guest/host values differ. + * + * For eIBRS affected by Post Barrier RSB Predictions a serialising + * instruction (wrmsr) must be executed. 
*/ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || - vmx->spec_ctrl != hostval) + vmx->spec_ctrl != hostval || (hostval & SPEC_CTRL_IBRS)) native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); @@ -7622,10 +7629,10 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, &delta_tsc)) + vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* @@ -7877,7 +7884,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .write_l1_tsc_offset = vmx_write_l1_tsc_offset, + .get_l2_tsc_offset = vmx_get_l2_tsc_offset, + .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, + .write_tsc_offset = vmx_write_tsc_offset, + .write_tsc_multiplier = vmx_write_tsc_multiplier, .load_mmu_pgd = vmx_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ed4b6da83aa87..6dbb13725a11e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -289,8 +289,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - u64 current_tsc_ratio; - unsigned long host_debugctlmsr; /* @@ -373,6 +371,9 @@ bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); + static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; @@ -491,12 +492,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) GFP_KERNEL_ACCOUNT); } -static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); -} - static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { return secondary_exec_controls_get(vmx) & diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf47392005663..f6149ac506af8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -163,6 +163,10 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); +/* Enable/disable SMT_RSB bug mitigation */ +bool __read_mostly mitigate_smt_rsb; +module_param(mitigate_smt_rsb, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU @@ -1391,7 +1395,7 @@ static unsigned int num_msr_based_features; ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ - ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO) + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_GDS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -2114,13 +2118,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier); + static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { u64 ratio; /* Guest TSC same frequency as host TSC? 
*/ if (!scale) { - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return 0; } @@ -2146,7 +2152,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return -1; } - vcpu->arch.tsc_scaling_ratio = ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, ratio); return 0; } @@ -2158,7 +2164,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return -1; } @@ -2240,10 +2246,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc) return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio) { u64 _tsc = tsc; - u64 ratio = vcpu->arch.tsc_scaling_ratio; if (ratio != kvm_default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); @@ -2252,25 +2257,86 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) } EXPORT_SYMBOL_GPL(kvm_scale_tsc); -static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc()); + tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + + kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); -static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) +{ + u64 nested_offset; + + if (l2_multiplier == kvm_default_tsc_scaling_ratio) + nested_offset = l1_offset; + else + nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, + kvm_tsc_scaling_ratio_frac_bits); + + nested_offset += l2_offset; + return nested_offset; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); + +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) +{ + if (l2_multiplier != kvm_default_tsc_scaling_ratio) + return mul_u64_u64_shr(l1_multiplier, l2_multiplier, + kvm_tsc_scaling_ratio_frac_bits); + + return l1_multiplier; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); + +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) +{ + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vcpu->arch.l1_tsc_offset, + l1_offset); + + vcpu->arch.l1_tsc_offset = l1_offset; + + /* + * If we are here because L1 chose not to trap WRMSR to TSC then + * according to the spec this should set L1's TSC (as opposed to + * setting L1's offset for L2). 
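The two kvm_calc_nested_tsc_*() helpers above fold L1's and L2's TSC parameters into the single offset/multiplier pair that actually gets programmed. A stand-alone sketch of that composition follows; FRAC_BITS, the helper names and the sample values are mine (the real fraction width comes from kvm_tsc_scaling_ratio_frac_bits and is vendor specific, and real offsets are signed).

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS	48	/* demo value only */

/* Mirrors mul_u64_u64_shr(); relies on the GCC/Clang __int128 extension. */
static uint64_t mul_shr(uint64_t a, uint64_t b, unsigned int shift)
{
	return (uint64_t)(((unsigned __int128)a * b) >> shift);
}

/* One level of TSC scaling: guest_tsc = (tsc * ratio >> FRAC_BITS) + offset. */
static uint64_t scale(uint64_t tsc, uint64_t ratio, uint64_t offset)
{
	return mul_shr(tsc, ratio, FRAC_BITS) + offset;
}

int main(void)
{
	uint64_t host_tsc = 1000000000ull;
	uint64_t ratio01  = 3ull << (FRAC_BITS - 1);	/* L1 runs at 1.5x host */
	uint64_t off01    = 12345;
	uint64_t ratio12  = 1ull << (FRAC_BITS - 1);	/* L2 runs at 0.5x of L1 */
	uint64_t off12    = 678;

	/* Apply L0->L1 scaling, then L1->L2 scaling on top of it... */
	uint64_t step_by_step = scale(scale(host_tsc, ratio01, off01), ratio12, off12);

	/* ...versus the single merged pair, as in kvm_calc_nested_tsc_*(). */
	uint64_t ratio02 = mul_shr(ratio01, ratio12, FRAC_BITS);
	uint64_t off02   = mul_shr(off01, ratio12, FRAC_BITS) + off12;

	/* Both print 750006850 with these inputs; in general they agree up to
	 * fixed-point rounding of the intermediate result.
	 */
	printf("%llu vs %llu\n", (unsigned long long)step_by_step,
	       (unsigned long long)scale(host_tsc, ratio02, off02));
	return 0;
}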
+ */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + l1_offset, + kvm_x86_ops.get_l2_tsc_offset(vcpu), + kvm_x86_ops.get_l2_tsc_multiplier(vcpu)); + else + vcpu->arch.tsc_offset = l1_offset; + + kvm_x86_ops.write_tsc_offset(vcpu, vcpu->arch.tsc_offset); +} + +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) { - vcpu->arch.l1_tsc_offset = offset; - vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); + vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier; + + /* Userspace is changing the multiplier while L2 is active */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + l1_multiplier, + kvm_x86_ops.get_l2_tsc_multiplier(vcpu)); + else + vcpu->arch.tsc_scaling_ratio = l1_multiplier; + + if (kvm_has_tsc_control) + kvm_x86_ops.write_tsc_multiplier( + vcpu, vcpu->arch.tsc_scaling_ratio); } static inline bool kvm_check_tsc_unstable(void) @@ -2296,7 +2362,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -2335,7 +2401,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); @@ -2394,9 +2460,10 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + adjustment = kvm_scale_tsc(vcpu, (u64) adjustment, + vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } @@ -2773,7 +2840,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz, + v->arch.l1_tsc_scaling_ratio); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, @@ -3240,7 +3308,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, data); } else { - u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; } @@ -3538,10 +3606,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * return L1's TSC value to ensure backwards-compatible * behavior for migration. */ - u64 tsc_offset = msr_info->host_initiated ? 
vcpu->arch.l1_tsc_offset : - vcpu->arch.tsc_offset; + u64 offset, ratio; + + if (msr_info->host_initiated) { + offset = vcpu->arch.l1_tsc_offset; + ratio = vcpu->arch.l1_tsc_scaling_ratio; + } else { + offset = vcpu->arch.tsc_offset; + ratio = vcpu->arch.tsc_scaling_ratio; + } - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; + msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset; break; } case MSR_MTRRcap: @@ -3879,10 +3954,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | - KVM_X86_DISABLE_EXITS_CSTATE; - if(kvm_can_mwait_in_guest()) - r |= KVM_X86_DISABLE_EXITS_MWAIT; + r = KVM_X86_DISABLE_EXITS_PAUSE; + + if (!mitigate_smt_rsb) { + r |= KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_CSTATE; + + if (kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; + } break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, @@ -4078,7 +4158,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mark_tsc_unstable("KVM discovered backwards TSC"); if (kvm_check_tsc_unstable()) { - u64 offset = kvm_compute_tsc_offset(vcpu, + u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; @@ -5409,15 +5489,26 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) break; - if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && - kvm_can_mwait_in_guest()) - kvm->arch.mwait_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; + +#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ + "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." 
+ + if (!mitigate_smt_rsb) { + if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && + (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) + pr_warn_once(SMT_RSB_MSG); + + if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && + kvm_can_mwait_in_guest()) + kvm->arch.mwait_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) + kvm->arch.hlt_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; + } + r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: @@ -10200,8 +10291,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - kvm_set_tsc_khz(vcpu, max_tsc_khz); - r = kvm_mmu_create(vcpu); if (r < 0) return r; @@ -10271,6 +10360,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); + kvm_set_tsc_khz(vcpu, max_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu, false); vcpu_put(vcpu); @@ -11637,6 +11727,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); static int __init kvm_x86_init(void) { kvm_mmu_x86_module_init(); + mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); return 0; } module_init(kvm_x86_init); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 569ac1d57f55a..318f35ac99d84 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -8,12 +8,15 @@ #include #include #include +#include #include #include #include #include +#include #include +#include #include "mm_internal.h" @@ -42,10 +45,15 @@ */ /* - * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is - * stored in cpu_tlb_state.last_user_mm_ibpb. + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_spec. */ #define LAST_USER_MM_IBPB 0x1UL +#define LAST_USER_MM_L1D_FLUSH 0x2UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH) + +/* Bits to set when tlbstate and flush is (re)initialized */ +#define LAST_USER_MM_INIT LAST_USER_MM_IBPB /* * The x86 feature is called PCID (Process Context IDentifier). It is similar @@ -316,20 +324,80 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +/* + * Invoked from return to user/guest by a task that opted-in to L1D + * flushing but ended up running on an SMT enabled core due to wrong + * affinity settings or CPU hotplug. This is part of the paranoid L1D flush + * contract which this task requested. + */ +static void l1d_flush_force_sigbus(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void l1d_do_flush(void) +{ + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + l1d_flush_sw(); +} + +static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, + struct task_struct *next) +{ + /* Flush L1D if the outgoing task requests it */ + if (prev_mm & LAST_USER_MM_L1D_FLUSH) + l1d_do_flush(); + + /* Check whether the incoming task opted in for L1D flush */ + if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) + return; + + /* + * Validate that it is not running on an SMT sibling as this would + * make the excercise pointless because the siblings share L1D. 
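To illustrate the user-visible effect of the mitigate_smt_rsb handling above: a VMM queries and enables KVM_CAP_X86_DISABLE_EXITS roughly as in the hedged sketch below, using only the capability constants from <linux/kvm.h>. On an affected, SMT-capable host with the kvm module loaded with mitigate_smt_rsb=1, the advertised mask (and what the kernel will honor) shrinks to KVM_X86_DISABLE_EXITS_PAUSE.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = kvm >= 0 ? ioctl(kvm, KVM_CREATE_VM, 0) : -1;
	int allowed;

	if (kvm < 0 || vm < 0) {
		perror("kvm setup");
		return 1;
	}

	/* Which exits may be disabled on this host/configuration? */
	allowed = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS);
	printf("allowed disable-exits mask: 0x%x\n", allowed);

	/* Ask for everything the kernel advertised; with mitigate_smt_rsb set
	 * this is just PAUSE on Cross-Thread RSB affected parts.
	 */
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args = { (__u64)allowed },
	};
	if (ioctl(vm, KVM_ENABLE_CAP, &cap))
		perror("KVM_ENABLE_CAP");
	return 0;
}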
If + * it runs on a SMT sibling, notify it with SIGBUS on return to + * user/guest + */ + if (this_cpu_read(cpu_info.smt_active)) { + clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH); + next->l1d_flush_kill.func = l1d_flush_force_sigbus; + task_work_add(next, &next->l1d_flush_kill, TWA_RESUME); + } +} + +static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; - unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; - return (unsigned long)next->mm | ibpb; + /* + * Ensure that the bit shift above works as expected and the two flags + * end up in bit 0 and 1. + */ + BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1); + + return (unsigned long)next->mm | spec_bits; } -static void cond_ibpb(struct task_struct *next) +static void cond_mitigation(struct task_struct *next) { + unsigned long prev_mm, next_mm; + if (!next || !next->mm) return; + next_mm = mm_mangle_tif_spec_bits(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); + /* + * Avoid user/user BTB poisoning by flushing the branch predictor + * when switching between processes. This stops one process from + * doing Spectre-v2 attacks on another. + * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the * same process. Using the mm pointer instead of mm->context.ctx_id @@ -339,8 +407,6 @@ static void cond_ibpb(struct task_struct *next) * exposed data is not really interesting. */ if (static_branch_likely(&switch_mm_cond_ibpb)) { - unsigned long prev_mm, next_mm; - /* * This is a bit more complex than the always mode because * it has to handle two cases: @@ -370,20 +436,14 @@ static void cond_ibpb(struct task_struct *next) * Optimize this with reasonably small overhead for the * above cases. Mangle the TIF_SPEC_IB bit into the mm * pointer of the incoming task which is stored in - * cpu_tlbstate.last_user_mm_ibpb for comparison. - */ - next_mm = mm_mangle_tif_spec_ib(next); - prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); - - /* + * cpu_tlbstate.last_user_mm_spec for comparison. + * * Issue IBPB only if the mm's are different and one or * both have the IBPB bit set. */ if (next_mm != prev_mm && (next_mm | prev_mm) & LAST_USER_MM_IBPB) indirect_branch_prediction_barrier(); - - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); } if (static_branch_unlikely(&switch_mm_always_ibpb)) { @@ -392,11 +452,22 @@ static void cond_ibpb(struct task_struct *next) * different context than the user space task which ran * last on this CPU. */ - if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != + (unsigned long)next->mm) indirect_branch_prediction_barrier(); - this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); - } } + + if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) { + /* + * Flush L1D when the outgoing task requested it and/or + * check whether the incoming task requested L1D flushing + * and ended up on an SMT sibling. 
+ */ + if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH)) + l1d_flush_evaluate(prev_mm, next_mm, next); + } + + this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); } #ifdef CONFIG_PERF_EVENTS @@ -524,11 +595,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, need_flush = true; } else { /* - * Avoid user/user BTB poisoning by flushing the branch - * predictor when switching between processes. This stops - * one process from doing Spectre-v2 attacks on another. + * Apply process to process speculation vulnerability + * mitigations if applicable. */ - cond_ibpb(tsk); + cond_mitigation(tsk); /* * Stop remote flushes for the previous mm. @@ -636,7 +706,7 @@ void initialize_tlbstate_and_flush(void) write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); + this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 6907b523e856b..3ff80156f21a6 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 3ae7a3d7d61e5..186d884e29836 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -22,6 +22,52 @@ #include #include + /* code below belongs to the image kernel */ + .align PAGE_SIZE +SYM_FUNC_START(restore_registers) + /* go back to the original page tables */ + movq %r9, %cr3 + + /* Flush TLB, including "global" things (vmalloc) */ + movq mmu_cr4_features(%rip), %rax + movq %rax, %rdx + andq $~(X86_CR4_PGE), %rdx + movq %rdx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3 + movq %rax, %cr4; # turn PGE back on + + /* We don't restore %rax, it must be 0 anyway */ + movq $saved_context, %rax + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx + mov pt_regs_r8(%rax), %r8 + movq pt_regs_r9(%rax), %r9 + movq pt_regs_r10(%rax), %r10 + movq pt_regs_r11(%rax), %r11 + movq pt_regs_r12(%rax), %r12 + movq pt_regs_r13(%rax), %r13 + movq pt_regs_r14(%rax), %r14 + movq pt_regs_r15(%rax), %r15 + pushq pt_regs_flags(%rax) + popfq + + /* Saved in save_processor_state. 
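A task opts in to this paranoid L1D flush behaviour through the speculation-control prctl added in bugs.c earlier in this patch. A minimal user-space sketch follows, assuming the kernel was booted with l1d_flush=on; the constants are guarded in case the installed uapi headers predate this series, and their fallback values are my reading of the uapi update.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_SPECULATION_CTRL
#define PR_SET_SPECULATION_CTRL	53
#define PR_GET_SPECULATION_CTRL	52
#endif
#ifndef PR_SPEC_L1D_FLUSH
#define PR_SPEC_L1D_FLUSH	2
#endif
#ifndef PR_SPEC_ENABLE
#define PR_SPEC_ENABLE		(1UL << 1)
#endif

int main(void)
{
	/* Request the conditional L1D flush on mm switch for this task. */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH,
		  PR_SPEC_ENABLE, 0, 0))
		perror("PR_SET_SPECULATION_CTRL");	/* EPERM unless l1d_flush=on */

	/* Read the state back: PR_SPEC_PRCTL | PR_SPEC_ENABLE on success. */
	printf("l1d_flush prctl state: 0x%x\n",
	       prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, 0, 0, 0));
	return 0;
}

Note that, per l1d_flush_evaluate() above, an opted-in task that lands on an SMT-enabled core has its flag cleared and receives SIGBUS on return to user space, so a real user of this prctl is expected to pin itself to non-SMT CPUs first.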
*/ + lgdt saved_context_gdt_desc(%rax) + + xorl %eax, %eax + + /* tell the hibernation core that we've just restored the memory */ + movq %rax, in_suspend(%rip) + + RET +SYM_FUNC_END(restore_registers) + SYM_FUNC_START(swsusp_arch_suspend) movq $saved_context, %rax movq %rsp, pt_regs_sp(%rax) @@ -52,7 +98,7 @@ SYM_FUNC_START(swsusp_arch_suspend) RET SYM_FUNC_END(swsusp_arch_suspend) -SYM_CODE_START(restore_image) +SYM_FUNC_START(restore_image) /* prepare to jump to the image kernel */ movq restore_jump_address(%rip), %r8 movq restore_cr3(%rip), %r9 @@ -67,10 +113,10 @@ SYM_CODE_START(restore_image) /* jump to relocated restore code */ movq relocated_restore_code(%rip), %rcx jmpq *%rcx -SYM_CODE_END(restore_image) +SYM_FUNC_END(restore_image) /* code below has been relocated to a safe page */ -SYM_CODE_START(core_restore_code) +SYM_FUNC_START(core_restore_code) /* switch to temporary page tables */ movq %rax, %cr3 /* flush TLB */ @@ -98,50 +144,4 @@ SYM_CODE_START(core_restore_code) .Ldone: /* jump to the restore_registers address from the image header */ jmpq *%r8 -SYM_CODE_END(core_restore_code) - - /* code below belongs to the image kernel */ - .align PAGE_SIZE -SYM_FUNC_START(restore_registers) - /* go back to the original page tables */ - movq %r9, %cr3 - - /* Flush TLB, including "global" things (vmalloc) */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(X86_CR4_PGE), %rdx - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3 - movq %rax, %cr4; # turn PGE back on - - /* We don't restore %rax, it must be 0 anyway */ - movq $saved_context, %rax - movq pt_regs_sp(%rax), %rsp - movq pt_regs_bp(%rax), %rbp - movq pt_regs_si(%rax), %rsi - movq pt_regs_di(%rax), %rdi - movq pt_regs_bx(%rax), %rbx - movq pt_regs_cx(%rax), %rcx - movq pt_regs_dx(%rax), %rdx - movq pt_regs_r8(%rax), %r8 - movq pt_regs_r9(%rax), %r9 - movq pt_regs_r10(%rax), %r10 - movq pt_regs_r11(%rax), %r11 - movq pt_regs_r12(%rax), %r12 - movq pt_regs_r13(%rax), %r13 - movq pt_regs_r14(%rax), %r14 - movq pt_regs_r15(%rax), %r15 - pushq pt_regs_flags(%rax) - popfq - - /* Saved in save_processor_state. 
*/ - lgdt saved_context_gdt_desc(%rax) - - xorl %eax, %eax - - /* tell the hibernation core that we've just restored the memory */ - movq %rax, in_suspend(%rip) - - RET -SYM_FUNC_END(restore_registers) +SYM_FUNC_END(core_restore_code) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index ec50b7423a4c8..3970821ec2928 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -29,6 +29,13 @@ static unsigned long shared_info_pfn; +void xen_hvm_map_shared_info(void) +{ + xen_hvm_init_shared_info(); + if(shared_info_pfn) + HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn)); +} + void xen_hvm_init_shared_info(void) { struct xen_add_to_physmap xatp; @@ -212,6 +219,7 @@ static void __init xen_hvm_guest_init(void) if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; + xen_setup_syscore_ops(); xen_hvm_smp_init(); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); xen_unplug_emulated_devices(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 755e939db3ed3..64873937cd1d7 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -64,7 +63,6 @@ static void cpu_bringup(void) cr4_init(); cpu_init(); - fpu__init_cpu(); touch_softlockup_watchdog(); preempt_disable(); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d83152c761bc..8be6ffa6bfbea 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -2,17 +2,22 @@ #include #include #include +#include +#include #include #include +#include #include #include +#include #include #include #include #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -82,3 +87,65 @@ void xen_arch_suspend(void) on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); } + +static int xen_syscore_suspend(void) +{ + struct xen_remove_from_physmap xrfp; + int cpu, ret; + + /* Xen suspend does similar stuffs in its own logic */ + if (xen_suspend_mode_is_xen_suspend()) + return 0; + + for_each_present_cpu(cpu) { + /* + * Nonboot CPUs are already offline, but the last copy of + * runstate info is still accessible. + */ + xen_save_steal_clock(cpu); + } + + xen_shutdown_pirqs(); + + xrfp.domid = DOMID_SELF; + xrfp.gpfn = __pa(HYPERVISOR_shared_info) >> PAGE_SHIFT; + + ret = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrfp); + if (!ret) + HYPERVISOR_shared_info = &xen_dummy_shared_info; + + return ret; +} + +static void xen_syscore_resume(void) +{ + /* Xen suspend does similar stuffs in its own logic */ + if (xen_suspend_mode_is_xen_suspend()) + return; + + /* No need to setup vcpu_info as it's already moved off */ + xen_hvm_map_shared_info(); + + pvclock_resume(); + + /* Nonboot CPUs will be resumed when they're brought up */ + xen_restore_steal_clock(smp_processor_id()); + + gnttab_resume(); + +} + +/* + * These callbacks will be called with interrupts disabled and when having only + * one CPU online. 
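+ * They are registered from xen_setup_syscore_ops() only for HVM
+ * domains, and both handlers bail out early when the suspend was
+ * initiated by Xen itself (xen_suspend_mode_is_xen_suspend()), since
+ * that path already does the equivalent work.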
+ */ +static struct syscore_ops xen_hvm_syscore_ops = { + .suspend = xen_syscore_suspend, + .resume = xen_syscore_resume +}; + +void __init xen_setup_syscore_ops(void) +{ + if (xen_hvm_domain()) + register_syscore_ops(&xen_hvm_syscore_ops); +} diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 8183d17e1cf17..4cb4491ba4e26 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -546,6 +546,9 @@ static void xen_hvm_setup_cpu_clockevents(void) { int cpu = smp_processor_id(); xen_setup_runstate_info(cpu); + if (cpu) + xen_restore_steal_clock(cpu); + /* * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence * doing it xen_hvm_cpu_notify (which gets called by smp_init during diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8695809b88f08..75cca4fc20473 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -55,6 +55,8 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); +void xen_hvm_map_shared_info(void); void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index e8a4165a18742..b1d7b5a6e61c1 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -214,7 +214,7 @@ static void __exit jent_mod_exit(void) crypto_unregister_rng(&jent_alg); } -module_init(jent_mod_init); +subsys_initcall(jent_mod_init); module_exit(jent_mod_exit); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index a64a639eddfa4..eef9142bfec93 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5003,28 +5003,24 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "hmac(sha3-224)", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(hmac_sha3_224_tv_template) } }, { .alg = "hmac(sha3-256)", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(hmac_sha3_256_tv_template) } }, { .alg = "hmac(sha3-384)", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(hmac_sha3_384_tv_template) } }, { .alg = "hmac(sha3-512)", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(hmac_sha3_512_tv_template) } @@ -5346,28 +5342,24 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "sha3-224", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(sha3_224_tv_template) } }, { .alg = "sha3-256", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(sha3_256_tv_template) } }, { .alg = "sha3-384", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(sha3_384_tv_template) } }, { .alg = "sha3-512", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(sha3_512_tv_template) } @@ -5536,6 +5528,10 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(xxhash64_tv_template) } + }, { + .alg = "zlib", + .test = alg_test_null, + .fips_allowed = 1, }, { .alg = "zlib-deflate", .test = alg_test_comp, diff --git a/drivers/Kconfig b/drivers/Kconfig index dcecc9f6e33f7..fff51c18a5896 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -235,4 +235,6 @@ source "drivers/interconnect/Kconfig" source "drivers/counter/Kconfig" source "drivers/most/Kconfig" + +source "drivers/amazon/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 5762280377186..4b9dfb802c301 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -189,3 +189,4 @@ obj-$(CONFIG_GNSS) += gnss/ 
obj-$(CONFIG_INTERCONNECT) += interconnect/ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += amazon/ diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index b02fd51e55896..8cc195c4c8619 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -171,6 +171,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) acpi_handle handle = mem_device->device->handle; int result, num_enabled = 0; struct acpi_memory_info *info; + mhp_t mhp_flags = MHP_NONE; int node; node = acpi_get_node(handle); @@ -194,8 +195,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); + if (mhp_supports_memmap_on_memory(info->length)) + mhp_flags |= MHP_MEMMAP_ON_MEMORY; result = __add_memory(node, info->start_addr, info->length, - MHP_NONE); + mhp_flags); /* * If the memory block has been used by the kernel, add_memory() diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 67a5ee2fedfd3..7dfb66ad1f907 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1512,6 +1513,11 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, arch_setup_dma_ops(dev, dma_addr, size, iommu, attr == DEV_DMA_COHERENT); +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif + return 0; } EXPORT_SYMBOL_GPL(acpi_dma_configure_id); diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig new file mode 100644 index 0000000000000..098784081e0bf --- /dev/null +++ b/drivers/amazon/Kconfig @@ -0,0 +1,60 @@ +# +# Amazon driver updates configuration +# + +config AMAZON_DRIVER_UPDATES + bool "Amazon Driver Updates" + default y + depends on PCI || EXPERIMENTAL + help + Amazon driver updates includes out-of-tree drivers and/or modifeid + versions of the drivers present in the stable kernel tree. + +if AMAZON_DRIVER_UPDATES + +config AMAZON_ENA_ETHERNET + tristate "Elastic Network Adapter (ENA) support" + depends on PCI_MSI && !ENA_ETHERNET + help + This driver supports Elastic Network Adapter (ENA) + + To compile this driver as a module, choose M here. + The module will be called ena. + +config AMAZON_EFA_INFINIBAND + tristate "Elastic Fabric Adapter (EFA) support" + depends on INFINIBAND_USER_ACCESS && AMAZON_ENA_ETHERNET + help + This driver support Elastic Fabric Adapter (EFA) + + To compile this driver as a module, choose M here. + The module will be called efa + +config AMAZON_IGB_UIO + tristate "DPDK igb_uio driver" + help + This is the direct PCI access driver for igb and + other PCI network devices, for DPDK. + + To compile this driver as a module, choose M here. + The module will be called igb_uio. + +config AMAZON_SCSI_SMARTPQI + tristate "Microsemi PQI Driver" + depends on PCI && SCSI && !S390 + select SCSI_SAS_ATTRS + select RAID_ATTRS + help + This driver supports Microsemi PQI controllers. + + + + To compile this driver as a module, choose M here: the + module will be called smartpqi. + + Note: the aacraid driver will not manage a smartpqi + controller. You need to enable smartpqi for smartpqi + controllers. 
For more information, please see + Documentation/scsi/smartpqi.rst + +endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/Makefile b/drivers/amazon/Makefile new file mode 100644 index 0000000000000..b10122feac02a --- /dev/null +++ b/drivers/amazon/Makefile @@ -0,0 +1,5 @@ +# +# Amazon Driver Updates +# +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += net/ +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += scsi/ diff --git a/drivers/amazon/net/Makefile b/drivers/amazon/net/Makefile new file mode 100644 index 0000000000000..7eb6f214798ee --- /dev/null +++ b/drivers/amazon/net/Makefile @@ -0,0 +1,6 @@ +# +# Amazon Driver Updates +# +obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena/ +obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa/ +obj-$(CONFIG_AMAZON_IGB_UIO) += igb_uio/ diff --git a/drivers/amazon/net/efa/Makefile b/drivers/amazon/net/efa/Makefile new file mode 100644 index 0000000000000..4399f594a93bf --- /dev/null +++ b/drivers/amazon/net/efa/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the Elastic Fabric Adapter (EFA) device drivers. +# EFA Source is: https://github.com/amzn/amzn-drivers. + +obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa.o + +efa-y := efa_com.o efa_com_cmd.o efa_gdr.o efa_main.o efa_neuron.o efa_p2p.o +efa-y += efa_verbs.o + +efa-$(CONFIG_SYSFS) += efa_sysfs.o + +ccflags-y += -include $(srctree)/drivers/amazon/net/efa/config.h diff --git a/drivers/amazon/net/efa/config.h b/drivers/amazon/net/efa/config.h new file mode 100644 index 0000000000000..b86d2e69cd96c --- /dev/null +++ b/drivers/amazon/net/efa/config.h @@ -0,0 +1,48 @@ +#define HAVE_UMEM_SCATTERLIST_IF 1 +#define HAVE_CREATE_CQ_ATTR 1 +#define HAVE_CREATE_AH_RDMA_ATTR 1 +#define HAVE_DEV_PARENT 1 +#define HAVE_POST_CONST_WR 1 +#define HAVE_MAX_SEND_RCV_SGE 1 +#define HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS 1 +#define HAVE_IB_DEV_OPS 1 +#define HAVE_SG_DMA_PAGE_ITER 1 +#define HAVE_PD_CORE_ALLOCATION 1 +#define HAVE_UCONTEXT_CORE_ALLOCATION 1 +#define HAVE_NO_KVERBS_DRIVERS 1 +#define HAVE_UDATA_TO_DRV_CONTEXT 1 +#define HAVE_SAFE_IB_ALLOC_DEVICE 1 +#define HAVE_AH_CORE_ALLOCATION 1 +#define HAVE_ALLOC_PD_NO_UCONTEXT 1 +#define HAVE_DEREG_MR_UDATA 1 +#define HAVE_DESTROY_CQ_UDATA 1 +#define HAVE_DESTROY_QP_UDATA 1 +#define HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 +#define HAVE_UPSTREAM_EFA 1 +#define HAVE_IB_DEVICE_OPS_COMMON 1 +#define HAVE_CQ_CORE_ALLOCATION 1 +#define HAVE_IB_PORT_PHYS_STATE_LINK_UP 1 +#define HAVE_KVZALLOC 1 +#define HAVE_IBDEV_PRINT_RATELIMITED 1 +#define HAVE_IBDEV_PRINT 1 +#define HAVE_IB_QPT_DRIVER 1 +#define HAVE_IB_IS_UDATA_CLEARED 1 +#define HAVE_IB_MR_LENGTH 1 +#define HAVE_PCI_VENDOR_ID_AMAZON 1 +#define HAVE_IB_UMEM_GET_NO_DMASYNC 1 +#define HAVE_CORE_MMAP_XA 1 +#define HAVE_RDMA_NODE_UNSPECIFIED 1 +#define HAVE_BITFIELD_H 1 +#define HAVE_IB_UMEM_GET_DEVICE_PARAM 1 +#define HAVE_IB_ACCESS_OPTIONAL 1 +#define HAVE_CREATE_AH_INIT_ATTR 1 +#define HAVE_ATOMIC64_FETCH_INC 1 +#define HAVE_DEALLOC_PD_UDATA_RC 1 +#define HAVE_AH_CORE_ALLOCATION_DESTROY_RC 1 +#define HAVE_IB_INT_DESTROY_CQ 1 +#define HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK 1 +#define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 +#define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 +#define HAVE_SYSFS_EMIT 1 +#define HAVE_XARRAY 1 +#define HAVE_EFA_P2P 1 \ No newline at end of file diff --git a/drivers/amazon/net/efa/efa-abi.h b/drivers/amazon/net/efa/efa-abi.h new file mode 100644 index 0000000000000..163ac79556d68 --- /dev/null +++ b/drivers/amazon/net/efa/efa-abi.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Copyright 
2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef EFA_ABI_USER_H +#define EFA_ABI_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define EFA_UVERBS_ABI_VERSION 1 + +/* + * Keep structs aligned to 8 bytes. + * Keep reserved fields as arrays of __u8 named reserved_XXX where XXX is the + * hex bit offset of the field. + */ + +enum { + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_cmd { + __u32 comp_mask; + __u8 reserved_20[4]; +}; + +enum efa_ibv_user_cmds_supp_udata { + EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, + EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_resp { + __u32 comp_mask; + __u32 cmds_supp_udata_mask; + __u16 sub_cqs_per_cq; + __u16 inline_buf_size; + __u32 max_llq_size; /* bytes */ + __u16 max_tx_batch; /* units of 64 bytes */ + __u16 min_sq_wr; + __u8 reserved_a0[4]; +}; + +struct efa_ibv_alloc_pd_resp { + __u32 comp_mask; + __u16 pdn; + __u8 reserved_30[2]; +}; + +enum { + EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL = 1 << 0, + EFA_CREATE_CQ_WITH_SGID = 1 << 1, +}; + +struct efa_ibv_create_cq { + __u32 comp_mask; + __u32 cq_entry_size; + __u16 num_sub_cqs; + __u8 flags; + __u8 reserved_58[5]; +}; + +enum { + EFA_CREATE_CQ_RESP_DB_OFF = 1 << 0, +}; + +struct efa_ibv_create_cq_resp { + __u32 comp_mask; + __u8 reserved_20[4]; + __aligned_u64 q_mmap_key; + __aligned_u64 q_mmap_size; + __u16 cq_idx; + __u8 reserved_d0[2]; + __u32 db_off; + __aligned_u64 db_mmap_key; +}; + +enum { + EFA_QP_DRIVER_TYPE_SRD = 0, +}; + +struct efa_ibv_create_qp { + __u32 comp_mask; + __u32 rq_ring_size; /* bytes */ + __u32 sq_ring_size; /* bytes */ + __u32 driver_qp_type; +}; + +struct efa_ibv_create_qp_resp { + __u32 comp_mask; + /* the offset inside the page of the rq db */ + __u32 rq_db_offset; + /* the offset inside the page of the sq db */ + __u32 sq_db_offset; + /* the offset inside the page of descriptors buffer */ + __u32 llq_desc_offset; + __aligned_u64 rq_mmap_key; + __aligned_u64 rq_mmap_size; + __aligned_u64 rq_db_mmap_key; + __aligned_u64 sq_db_mmap_key; + __aligned_u64 llq_desc_mmap_key; + __u16 send_sub_cq_idx; + __u16 recv_sub_cq_idx; + __u8 reserved_1e0[4]; +}; + +struct efa_ibv_create_ah_resp { + __u32 comp_mask; + __u16 efa_address_handle; + __u8 reserved_30[2]; +}; + +enum { + EFA_QUERY_DEVICE_CAPS_RDMA_READ = 1 << 0, + EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, + EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2, + EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID = 1 << 3, +}; + +struct efa_ibv_ex_query_device_resp { + __u32 comp_mask; + __u32 max_sq_wr; + __u32 max_rq_wr; + __u16 max_sq_sge; + __u16 max_rq_sge; + __u32 max_rdma_size; + __u32 device_caps; +}; + +#endif /* EFA_ABI_USER_H */ diff --git a/drivers/amazon/net/efa/efa.h b/drivers/amazon/net/efa/efa.h new file mode 100644 index 0000000000000..34ccbac76b451 --- /dev/null +++ b/drivers/amazon/net/efa/efa.h @@ -0,0 +1,315 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_H_ +#define _EFA_H_ + +#include "kcompat.h" +#include +#include +#include +#include + +#include + +#include "efa-abi.h" +#include "efa_com_cmd.h" + +#define DRV_MODULE_NAME "efa" +#define DEVICE_NAME "Elastic Fabric Adapter (EFA)" + +#define EFA_IRQNAME_SIZE 40 + +#define EFA_MGMNT_MSIX_VEC_IDX 0 +#define EFA_COMP_EQS_VEC_BASE 1 + +struct efa_irq { + irq_handler_t handler; + void *data; + u32 irqn; + u32 vector; + cpumask_t affinity_hint_mask; + char name[EFA_IRQNAME_SIZE]; +}; + +/* Don't use anything other than atomic64 */ +struct efa_stats { + atomic64_t alloc_pd_err; + atomic64_t create_qp_err; + atomic64_t create_cq_err; + atomic64_t reg_mr_err; + atomic64_t alloc_ucontext_err; + atomic64_t create_ah_err; + atomic64_t mmap_err; + atomic64_t keep_alive_rcvd; +}; + +struct efa_dev { + struct ib_device ibdev; + struct efa_com_dev edev; + struct pci_dev *pdev; + struct efa_com_get_device_attr_result dev_attr; + + u64 reg_bar_addr; + u64 reg_bar_len; + u64 mem_bar_addr; + u64 mem_bar_len; + u64 db_bar_addr; + u64 db_bar_len; + + int admin_msix_vector_idx; + struct efa_irq admin_irq; + + struct efa_stats stats; + + /* Array of completion EQs */ + struct efa_eq *eqs; + unsigned int neqs; + +#ifdef HAVE_XARRAY + /* Only stores CQs with interrupts enabled */ + struct xarray cqs_xa; +#else + /* If xarray isn't available keep an array of all possible CQs */ + struct efa_cq *cqs_arr[BIT(sizeof_field(struct efa_admin_create_cq_resp, + cq_idx) * 8)]; +#endif +}; + +struct efa_ucontext { + struct ib_ucontext ibucontext; + u16 uarn; +#ifndef HAVE_CORE_MMAP_XA + /* Protects ucontext state */ + struct mutex lock; + struct list_head pending_mmaps; + u32 mmap_page; +#endif /* !defined(HAVE_CORE_MMAP_XA) */ +}; + +struct efa_pd { + struct ib_pd ibpd; + u16 pdn; +}; + +struct efa_mr { + struct ib_mr ibmr; + struct ib_umem *umem; +#ifdef HAVE_EFA_P2P + struct efa_p2pmem *p2pmem; + u64 p2p_ticket; +#endif +}; + +struct efa_cq { + struct ib_cq ibcq; + struct efa_ucontext *ucontext; + dma_addr_t dma_addr; + void *cpu_addr; + struct rdma_user_mmap_entry *mmap_entry; + struct rdma_user_mmap_entry *db_mmap_entry; + size_t size; + u16 cq_idx; + /* NULL when no interrupts requested */ + struct efa_eq *eq; +}; + +struct efa_qp { + struct ib_qp ibqp; + dma_addr_t rq_dma_addr; + void *rq_cpu_addr; + size_t rq_size; + enum ib_qp_state state; + + /* Used for saving mmap_xa entries */ + struct rdma_user_mmap_entry *sq_db_mmap_entry; + struct rdma_user_mmap_entry *llq_desc_mmap_entry; + struct rdma_user_mmap_entry *rq_db_mmap_entry; + struct rdma_user_mmap_entry *rq_mmap_entry; + + u32 qp_handle; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; +}; + +struct efa_ah { + struct ib_ah ibah; + u16 ah; + /* dest_addr */ + u8 id[EFA_GID_SIZE]; +}; + +struct efa_eq { + struct efa_com_eq eeq; + struct efa_irq irq; +}; + +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata); +int efa_query_port(struct ib_device *ibdev, port_t port, + struct ib_port_attr *props); +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int efa_query_gid(struct ib_device *ibdev, port_t port, int index, + union ib_gid *gid); +int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, + u16 *pkey); +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT +int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#else +int efa_alloc_pd(struct ib_pd *ibpd, + struct 
ib_ucontext *ibucontext, + struct ib_udata *udata); +#endif +#ifdef HAVE_DEALLOC_PD_UDATA_RC +int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#elif defined(HAVE_DEALLOC_PD_UDATA) +void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#elif defined(HAVE_PD_CORE_ALLOCATION) +void efa_dealloc_pd(struct ib_pd *ibpd); +#else +int efa_dealloc_pd(struct ib_pd *ibpd); +struct ib_pd *efa_kzalloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); +#endif +#ifdef HAVE_DESTROY_QP_UDATA +int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +#else +int efa_destroy_qp(struct ib_qp *ibqp); +#endif +#ifdef HAVE_QP_CORE_ALLOCATION +int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +#else +struct ib_qp *efa_kzalloc_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +#endif +#ifdef HAVE_IB_INT_DESTROY_CQ +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#elif defined(HAVE_IB_VOID_DESTROY_CQ) +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#elif defined(HAVE_DESTROY_CQ_UDATA) +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#else +int efa_destroy_cq(struct ib_cq *ibcq); +#endif +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +#ifndef HAVE_CQ_CORE_ALLOCATION +#ifdef HAVE_CREATE_CQ_NO_UCONTEXT +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +#elif defined(HAVE_CREATE_CQ_ATTR) +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); +#endif +#endif +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +#ifdef HAVE_MR_DMABUF +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata); +#endif +#ifdef HAVE_DEREG_MR_UDATA +int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); +#else +int efa_dereg_mr(struct ib_mr *ibmr); +#endif +int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num, + struct ib_port_immutable *immutable); +int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata); +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION +void efa_dealloc_ucontext(struct ib_ucontext *ibucontext); +#else +int efa_dealloc_ucontext(struct ib_ucontext *ibucontext); +struct ib_ucontext *efa_kzalloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +#endif +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma); +#ifdef HAVE_CORE_MMAP_XA +void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +#endif +int efa_create_ah(struct ib_ah *ibah, +#ifdef HAVE_CREATE_AH_INIT_ATTR + struct rdma_ah_init_attr *init_attr, +#else + struct rdma_ah_attr *ah_attr, + u32 flags, +#endif + struct ib_udata *udata); +#ifndef HAVE_AH_CORE_ALLOCATION +#ifdef HAVE_CREATE_DESTROY_AH_FLAGS +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + u32 flags, + struct ib_udata *udata); +#elif defined(HAVE_CREATE_AH_RDMA_ATTR) +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +#endif +#endif +#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC +int efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#elif 
defined(HAVE_AH_CORE_ALLOCATION) +void efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#elif defined(HAVE_CREATE_DESTROY_AH_FLAGS) +int efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#else +int efa_destroy_ah(struct ib_ah *ibah); +#endif +#ifndef HAVE_NO_KVERBS_DRIVERS +#ifdef HAVE_POST_CONST_WR +int efa_post_send(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +#else +int efa_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +#endif +#ifdef HAVE_POST_CONST_WR +int efa_post_recv(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +#else +int efa_post_recv(struct ib_qp *ibqp, + struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +#endif +int efa_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); +int efa_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags flags); +struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc); +#endif +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata); +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, + port_t port_num); +#ifdef HAVE_SPLIT_STATS_ALLOC +struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, port_t port_num); +struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev); +#else +struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num); +#endif +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + port_t port_num, int index); + +#endif /* _EFA_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_cmds_defs.h b/drivers/amazon/net/efa/efa_admin_cmds_defs.h new file mode 100644 index 0000000000000..d4b9226088bd0 --- /dev/null +++ b/drivers/amazon/net/efa/efa_admin_cmds_defs.h @@ -0,0 +1,1013 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_ADMIN_CMDS_H_ +#define _EFA_ADMIN_CMDS_H_ + +#define EFA_ADMIN_API_VERSION_MAJOR 0 +#define EFA_ADMIN_API_VERSION_MINOR 1 + +/* EFA admin queue opcodes */ +enum efa_admin_aq_opcode { + EFA_ADMIN_CREATE_QP = 1, + EFA_ADMIN_MODIFY_QP = 2, + EFA_ADMIN_QUERY_QP = 3, + EFA_ADMIN_DESTROY_QP = 4, + EFA_ADMIN_CREATE_AH = 5, + EFA_ADMIN_DESTROY_AH = 6, + EFA_ADMIN_REG_MR = 7, + EFA_ADMIN_DEREG_MR = 8, + EFA_ADMIN_CREATE_CQ = 9, + EFA_ADMIN_DESTROY_CQ = 10, + EFA_ADMIN_GET_FEATURE = 11, + EFA_ADMIN_SET_FEATURE = 12, + EFA_ADMIN_GET_STATS = 13, + EFA_ADMIN_ALLOC_PD = 14, + EFA_ADMIN_DEALLOC_PD = 15, + EFA_ADMIN_ALLOC_UAR = 16, + EFA_ADMIN_DEALLOC_UAR = 17, + EFA_ADMIN_CREATE_EQ = 18, + EFA_ADMIN_DESTROY_EQ = 19, + EFA_ADMIN_MAX_OPCODE = 19, +}; + +enum efa_admin_aq_feature_id { + EFA_ADMIN_DEVICE_ATTR = 1, + EFA_ADMIN_AENQ_CONFIG = 2, + EFA_ADMIN_NETWORK_ATTR = 3, + EFA_ADMIN_QUEUE_ATTR = 4, + EFA_ADMIN_HW_HINTS = 5, + EFA_ADMIN_HOST_INFO = 6, + EFA_ADMIN_EVENT_QUEUE_ATTR = 7, +}; + +/* QP transport type */ +enum efa_admin_qp_type { + /* Unreliable Datagram */ + EFA_ADMIN_QP_TYPE_UD = 1, + /* Scalable Reliable Datagram */ + EFA_ADMIN_QP_TYPE_SRD = 2, +}; + +/* QP state */ +enum efa_admin_qp_state { + EFA_ADMIN_QP_STATE_RESET = 0, + EFA_ADMIN_QP_STATE_INIT = 1, + EFA_ADMIN_QP_STATE_RTR = 2, + EFA_ADMIN_QP_STATE_RTS = 3, + EFA_ADMIN_QP_STATE_SQD = 4, + EFA_ADMIN_QP_STATE_SQE = 5, + EFA_ADMIN_QP_STATE_ERR = 6, +}; + +enum efa_admin_get_stats_type { + EFA_ADMIN_GET_STATS_TYPE_BASIC = 0, + EFA_ADMIN_GET_STATS_TYPE_MESSAGES = 1, + EFA_ADMIN_GET_STATS_TYPE_RDMA_READ = 2, +}; + +enum efa_admin_get_stats_scope { + EFA_ADMIN_GET_STATS_SCOPE_ALL = 0, + EFA_ADMIN_GET_STATS_SCOPE_QUEUE = 1, +}; + +/* + * QP allocation sizes, converted by fabric QueuePair (QP) create command + * from QP capabilities. + */ +struct efa_admin_qp_alloc_size { + /* Send descriptor ring size in bytes */ + u32 send_queue_ring_size; + + /* Max number of WQEs that can be outstanding on send queue. */ + u32 send_queue_depth; + + /* + * Recv descriptor ring size in bytes, sufficient for user-provided + * number of WQEs + */ + u32 recv_queue_ring_size; + + /* Max number of WQEs that can be outstanding on recv queue */ + u32 recv_queue_depth; +}; + +struct efa_admin_create_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain associated with this QP */ + u16 pd; + + /* QP type */ + u8 qp_type; + + /* + * 0 : sq_virt - If set, SQ ring base address is + * virtual (IOVA returned by MR registration) + * 1 : rq_virt - If set, RQ ring base address is + * virtual (IOVA returned by MR registration) + * 7:2 : reserved - MBZ + */ + u8 flags; + + /* + * Send queue (SQ) ring base physical address. This field is not + * used if this is a Low Latency Queue(LLQ). + */ + u64 sq_base_addr; + + /* Receive queue (RQ) ring base address. 
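Virtual (IOVA returned by MR registration) when the rq_virt flag is set, physical otherwise.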
*/ + u64 rq_base_addr; + + /* Index of CQ to be associated with Send Queue completions */ + u32 send_cq_idx; + + /* Index of CQ to be associated with Recv Queue completions */ + u32 recv_cq_idx; + + /* + * Memory registration key for the SQ ring, used only when not in + * LLQ mode and base address is virtual + */ + u32 sq_l_key; + + /* + * Memory registration key for the RQ ring, used only when base + * address is virtual + */ + u32 rq_l_key; + + /* Requested QP allocation sizes */ + struct efa_admin_qp_alloc_size qp_alloc_size; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; + + /* MBZ */ + u32 reserved2; +}; + +struct efa_admin_create_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * Opaque handle to be used for consequent admin operations on the + * QP + */ + u32 qp_handle; + + /* + * QP number in the given EFA virtual device. Least-significant bits (as + * needed according to max_qp) carry unique QP ID + */ + u16 qp_num; + + /* MBZ */ + u16 reserved; + + /* Index of sub-CQ for Send Queue completions */ + u16 send_sub_cq_idx; + + /* Index of sub-CQ for Receive Queue completions */ + u16 recv_sub_cq_idx; + + /* SQ doorbell address, as offset to PCIe DB BAR */ + u32 sq_db_offset; + + /* RQ doorbell address, as offset to PCIe DB BAR */ + u32 rq_db_offset; + + /* + * low latency send queue ring base address as an offset to PCIe + * MMIO LLQ_MEM BAR + */ + u32 llq_descriptors_offset; +}; + +struct efa_admin_modify_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* + * Mask indicating which fields should be updated + * 0 : qp_state + * 1 : cur_qp_state + * 2 : qkey + * 3 : sq_psn + * 4 : sq_drained_async_notify + * 5 : rnr_retry + * 31:6 : reserved + */ + u32 modify_mask; + + /* QP handle returned by create_qp command */ + u32 qp_handle; + + /* QP state */ + u32 qp_state; + + /* Override current QP state (before applying the transition) */ + u32 cur_qp_state; + + /* QKey */ + u32 qkey; + + /* SQ PSN */ + u32 sq_psn; + + /* Enable async notification when SQ is drained */ + u8 sq_drained_async_notify; + + /* Number of RNR retries (valid only for SRD QPs) */ + u8 rnr_retry; + + /* MBZ */ + u16 reserved2; +}; + +struct efa_admin_modify_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_query_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* QP handle returned by create_qp command */ + u32 qp_handle; +}; + +struct efa_admin_query_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* QP state */ + u32 qp_state; + + /* QKey */ + u32 qkey; + + /* SQ PSN */ + u32 sq_psn; + + /* Indicates that draining is in progress */ + u8 sq_draining; + + /* Number of RNR retries (valid only for SRD QPs) */ + u8 rnr_retry; + + /* MBZ */ + u16 reserved2; +}; + +struct efa_admin_destroy_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* QP handle returned by create_qp command */ + u32 qp_handle; +}; + +struct efa_admin_destroy_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * Create Address Handle command parameters. 
Must not be called more than + * once for the same destination + */ +struct efa_admin_create_ah_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Destination address in network byte order */ + u8 dest_addr[16]; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_create_ah_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* Target interface address handle (opaque) */ + u16 ah; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_ah_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Target interface address handle (opaque) */ + u16 ah; + + /* PD number */ + u16 pd; +}; + +struct efa_admin_destroy_ah_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * Registration of MemoryRegion, required for QP working with Virtual + * Addresses. In standard verbs semantics, region length is limited to 2GB + * space, but EFA offers larger MR support for large memory space, to ease + * on users working with very large datasets (i.e. full GPU memory mapping). + */ +struct efa_admin_reg_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain */ + u16 pd; + + /* MBZ */ + u16 reserved16_w1; + + /* Physical Buffer List, each element is page-aligned. */ + union { + /* + * Inline array of guest-physical page addresses of user + * memory pages (optimization for short region + * registrations) + */ + u64 inline_pbl_array[4]; + + /* points to PBL (direct or indirect, chained if needed) */ + struct efa_admin_ctrl_buff_info pbl; + } pbl; + + /* Memory region length, in bytes. */ + u64 mr_length; + + /* + * flags and page size + * 4:0 : phys_page_size_shift - page size is (1 << + * phys_page_size_shift). Page size is used for + * building the Virtual to Physical address mapping + * 6:5 : reserved - MBZ + * 7 : mem_addr_phy_mode_en - Enable bit for physical + * memory registration (no translation), can be used + * only by privileged clients. If set, PBL must + * contain a single entry. + */ + u8 flags; + + /* + * permissions + * 0 : local_write_enable - Local write permissions: + * must be set for RQ buffers and buffers posted for + * RDMA Read requests + * 1 : reserved1 - MBZ + * 2 : remote_read_enable - Remote read permissions: + * must be set to enable RDMA read from the region + * 7:3 : reserved2 - MBZ + */ + u8 permissions; + + /* MBZ */ + u16 reserved16_w5; + + /* number of pages in PBL (redundant, could be calculated) */ + u32 page_num; + + /* + * IO Virtual Address associated with this MR. If + * mem_addr_phy_mode_en is set, contains the physical address of + * the region. 
+ */ + u64 iova; +}; + +struct efa_admin_reg_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * L_Key, to be used in conjunction with local buffer references in + * SQ and RQ WQE, or with virtual RQ/CQ rings + */ + u32 l_key; + + /* + * R_Key, to be used in RDMA messages to refer to remotely accessed + * memory region + */ + u32 r_key; +}; + +struct efa_admin_dereg_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* L_Key, memory region's l_key */ + u32 l_key; +}; + +struct efa_admin_dereg_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_create_cq_cmd { + struct efa_admin_aq_common_desc aq_common_desc; + + /* + * 4:0 : reserved5 - MBZ + * 5 : interrupt_mode_enabled - if set, cq operates + * in interrupt mode (i.e. CQ events and EQ elements + * are generated), otherwise - polling + * 6 : virt - If set, ring base address is virtual + * (IOVA returned by MR registration) + * 7 : reserved6 - MBZ + */ + u8 cq_caps_1; + + /* + * 4:0 : cq_entry_size_words - size of CQ entry in + * 32-bit words, valid values: 4, 8. + * 5 : set_src_addr - If set, source address will be + * filled on RX completions from unknown senders. + * Requires 8 words CQ entry size. + * 7:6 : reserved7 - MBZ + */ + u8 cq_caps_2; + + /* completion queue depth in # of entries. must be power of 2 */ + u16 cq_depth; + + /* EQ number assigned to this cq */ + u16 eqn; + + /* MBZ */ + u16 reserved; + + /* + * CQ ring base address, virtual or physical depending on 'virt' + * flag + */ + struct efa_common_mem_addr cq_ba; + + /* + * Memory registration key for the ring, used only when base + * address is virtual + */ + u32 l_key; + + /* + * number of sub cqs - must be equal to sub_cqs_per_cq of queue + * attributes. + */ + u16 num_sub_cqs; + + /* UAR number */ + u16 uar; +}; + +struct efa_admin_create_cq_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + u16 cq_idx; + + /* actual cq depth in number of entries */ + u16 cq_actual_depth; + + /* CQ doorbell address, as offset to PCIe DB BAR */ + u32 db_offset; + + /* + * 0 : db_valid - If set, doorbell offset is valid. + * Always set when interrupts are requested. + */ + u32 flags; +}; + +struct efa_admin_destroy_cq_cmd { + struct efa_admin_aq_common_desc aq_common_desc; + + u16 cq_idx; + + /* MBZ */ + u16 reserved1; +}; + +struct efa_admin_destroy_cq_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * EFA AQ Get Statistics command. 
Extended statistics are placed in control + * buffer pointed by AQ entry + */ +struct efa_admin_aq_get_stats_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + union { + /* command specific inline data */ + u32 inline_data_w1[3]; + + struct efa_admin_ctrl_buff_info control_buffer; + } u; + + /* stats type as defined in enum efa_admin_get_stats_type */ + u8 type; + + /* stats scope defined in enum efa_admin_get_stats_scope */ + u8 scope; + + u16 scope_modifier; +}; + +struct efa_admin_basic_stats { + u64 tx_bytes; + + u64 tx_pkts; + + u64 rx_bytes; + + u64 rx_pkts; + + u64 rx_drops; +}; + +struct efa_admin_messages_stats { + u64 send_bytes; + + u64 send_wrs; + + u64 recv_bytes; + + u64 recv_wrs; +}; + +struct efa_admin_rdma_read_stats { + u64 read_wrs; + + u64 read_bytes; + + u64 read_wr_err; + + u64 read_resp_bytes; +}; + +struct efa_admin_acq_get_stats_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + struct efa_admin_basic_stats basic_stats; + + struct efa_admin_messages_stats messages_stats; + + struct efa_admin_rdma_read_stats rdma_read_stats; + } u; +}; + +struct efa_admin_get_set_feature_common_desc { + /* MBZ */ + u8 reserved0; + + /* as appears in efa_admin_aq_feature_id */ + u8 feature_id; + + /* MBZ */ + u16 reserved16; +}; + +struct efa_admin_feature_device_attr_desc { + /* Bitmap of efa_admin_aq_feature_id */ + u64 supported_features; + + /* Bitmap of supported page sizes in MR registrations */ + u64 page_size_cap; + + u32 fw_version; + + u32 admin_api_version; + + u32 device_version; + + /* Bar used for SQ and RQ doorbells */ + u16 db_bar; + + /* Indicates how many bits are used on physical address access */ + u8 phys_addr_width; + + /* Indicates how many bits are used on virtual address access */ + u8 virt_addr_width; + + /* + * 0 : rdma_read - If set, RDMA Read is supported on + * TX queues + * 1 : rnr_retry - If set, RNR retry is supported on + * modify QP command + * 31:2 : reserved - MBZ + */ + u32 device_caps; + + /* Max RDMA transfer size in bytes */ + u32 max_rdma_size; +}; + +struct efa_admin_feature_queue_attr_desc { + /* The maximum number of queue pairs supported */ + u32 max_qp; + + /* Maximum number of WQEs per Send Queue */ + u32 max_sq_depth; + + /* Maximum size of data that can be sent inline in a Send WQE */ + u32 inline_buf_size; + + /* Maximum number of buffer descriptors per Recv Queue */ + u32 max_rq_depth; + + /* The maximum number of completion queues supported per VF */ + u32 max_cq; + + /* Maximum number of CQEs per Completion Queue */ + u32 max_cq_depth; + + /* Number of sub-CQs to be created for each CQ */ + u16 sub_cqs_per_cq; + + /* Minimum number of WQEs per SQ */ + u16 min_sq_depth; + + /* Maximum number of SGEs (buffers) allowed for a single send WQE */ + u16 max_wr_send_sges; + + /* Maximum number of SGEs allowed for a single recv WQE */ + u16 max_wr_recv_sges; + + /* The maximum number of memory regions supported */ + u32 max_mr; + + /* The maximum number of pages can be registered */ + u32 max_mr_pages; + + /* The maximum number of protection domains supported */ + u32 max_pd; + + /* The maximum number of address handles supported */ + u32 max_ah; + + /* The maximum size of LLQ in bytes */ + u32 max_llq_size; + + /* Maximum number of SGEs for a single RDMA read WQE */ + u16 max_wr_rdma_sges; + + /* + * Maximum number of bytes that can be written to SQ between two + * consecutive doorbells (in units of 64B). Driver must ensure that only + * complete WQEs are written to queue before issuing a doorbell. 
+ * Examples: max_tx_batch=16 and WQE size = 64B, means up to 16 WQEs can + * be written to SQ between two consecutive doorbells. max_tx_batch=11 + * and WQE size = 128B, means up to 5 WQEs can be written to SQ between + * two consecutive doorbells. Zero means unlimited. + */ + u16 max_tx_batch; +}; + +struct efa_admin_event_queue_attr_desc { + /* The maximum number of event queues supported */ + u32 max_eq; + + /* Maximum number of EQEs per Event Queue */ + u32 max_eq_depth; + + /* Supported events bitmask */ + u32 event_bitmask; +}; + +struct efa_admin_feature_aenq_desc { + /* bitmask for AENQ groups the device can report */ + u32 supported_groups; + + /* bitmask for AENQ groups to report */ + u32 enabled_groups; +}; + +struct efa_admin_feature_network_attr_desc { + /* Raw address data in network byte order */ + u8 addr[16]; + + /* max packet payload size in bytes */ + u32 mtu; +}; + +/* + * When hint value is 0, hints capabilities are not supported or driver + * should use its own predefined value + */ +struct efa_admin_hw_hints { + /* value in ms */ + u16 mmio_read_timeout; + + /* value in ms */ + u16 driver_watchdog_timeout; + + /* value in ms */ + u16 admin_completion_timeout; + + /* poll interval in ms */ + u16 poll_interval; +}; + +struct efa_admin_get_feature_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + struct efa_admin_ctrl_buff_info control_buffer; + + struct efa_admin_get_set_feature_common_desc feature_common; + + u32 raw[11]; +}; + +struct efa_admin_get_feature_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + + struct efa_admin_feature_device_attr_desc device_attr; + + struct efa_admin_feature_aenq_desc aenq; + + struct efa_admin_feature_network_attr_desc network_attr; + + struct efa_admin_feature_queue_attr_desc queue_attr; + + struct efa_admin_event_queue_attr_desc event_queue_attr; + + struct efa_admin_hw_hints hw_hints; + } u; +}; + +struct efa_admin_set_feature_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + struct efa_admin_ctrl_buff_info control_buffer; + + struct efa_admin_get_set_feature_common_desc feature_common; + + union { + u32 raw[11]; + + /* AENQ configuration */ + struct efa_admin_feature_aenq_desc aenq; + } u; +}; + +struct efa_admin_set_feature_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + } u; +}; + +struct efa_admin_alloc_pd_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; +}; + +struct efa_admin_alloc_pd_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_pd_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_pd_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_alloc_uar_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; +}; + +struct efa_admin_alloc_uar_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_uar_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_uar_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_create_eq_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* Size of the EQ in entries, must be power of 2 */ + u16 
depth; + + /* MSI-X table entry index */ + u8 msix_vec; + + /* + * 4:0 : entry_size_words - size of EQ entry in + * 32-bit words + * 7:5 : reserved - MBZ + */ + u8 caps; + + /* EQ ring base address */ + struct efa_common_mem_addr ba; + + /* + * Enabled events on this EQ + * 0 : completion_events - Enable completion events + * 31:1 : reserved - MBZ + */ + u32 event_bitmask; + + /* MBZ */ + u32 reserved; +}; + +struct efa_admin_create_eq_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* EQ number */ + u16 eqn; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_eq_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* EQ number */ + u16 eqn; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_eq_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* asynchronous event notification groups */ +enum efa_admin_aenq_group { + EFA_ADMIN_FATAL_ERROR = 1, + EFA_ADMIN_WARNING = 2, + EFA_ADMIN_NOTIFICATION = 3, + EFA_ADMIN_KEEP_ALIVE = 4, + EFA_ADMIN_AENQ_GROUPS_NUM = 5, +}; + +struct efa_admin_mmio_req_read_less_resp { + u16 req_id; + + u16 reg_off; + + /* value is valid when poll is cleared */ + u32 reg_val; +}; + +enum efa_admin_os_type { + EFA_ADMIN_OS_LINUX = 0, +}; + +struct efa_admin_host_info { + /* OS distribution string format */ + u8 os_dist_str[128]; + + /* Defined in enum efa_admin_os_type */ + u32 os_type; + + /* Kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* + * 7:0 : driver_module_type + * 15:8 : driver_sub_minor + * 23:16 : driver_minor + * 31:24 : driver_major + */ + u32 driver_ver; + + /* + * Device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* + * Spec version + * 7:0 : spec_minor + * 15:8 : spec_major + */ + u16 spec_ver; + + /* + * 0 : intree - Intree driver + * 1 : gdr - GPUDirect RDMA supported + * 31:2 : reserved2 + */ + u32 flags; +}; + +/* create_qp_cmd */ +#define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK BIT(0) +#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK BIT(1) + +/* modify_qp_cmd */ +#define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK BIT(0) +#define EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE_MASK BIT(1) +#define EFA_ADMIN_MODIFY_QP_CMD_QKEY_MASK BIT(2) +#define EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN_MASK BIT(3) +#define EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY_MASK BIT(4) +#define EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY_MASK BIT(5) + +/* reg_mr_cmd */ +#define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK GENMASK(4, 0) +#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK BIT(7) +#define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK BIT(0) +#define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK BIT(2) + +/* create_cq_cmd */ +#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) +#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK BIT(6) +#define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) +#define EFA_ADMIN_CREATE_CQ_CMD_SET_SRC_ADDR_MASK BIT(5) + +/* create_cq_resp */ +#define EFA_ADMIN_CREATE_CQ_RESP_DB_VALID_MASK BIT(0) + +/* feature_device_attr_desc */ +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK BIT(0) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) + +/* create_eq_cmd */ +#define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) +#define EFA_ADMIN_CREATE_EQ_CMD_VIRT_MASK BIT(6) +#define EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS_MASK BIT(0) + +/* host_info */ +#define EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE_MASK GENMASK(7, 0) +#define 
EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_DRIVER_MINOR_MASK GENMASK(23, 16) +#define EFA_ADMIN_HOST_INFO_DRIVER_MAJOR_MASK GENMASK(31, 24) +#define EFA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define EFA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define EFA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_SPEC_MINOR_MASK GENMASK(7, 0) +#define EFA_ADMIN_HOST_INFO_SPEC_MAJOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_INTREE_MASK BIT(0) +#define EFA_ADMIN_HOST_INFO_GDR_MASK BIT(1) + +#endif /* _EFA_ADMIN_CMDS_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_defs.h b/drivers/amazon/net/efa/efa_admin_defs.h new file mode 100644 index 0000000000000..83f20c38a8400 --- /dev/null +++ b/drivers/amazon/net/efa/efa_admin_defs.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_ADMIN_H_ +#define _EFA_ADMIN_H_ + +enum efa_admin_aq_completion_status { + EFA_ADMIN_SUCCESS = 0, + EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, + EFA_ADMIN_BAD_OPCODE = 2, + EFA_ADMIN_UNSUPPORTED_OPCODE = 3, + EFA_ADMIN_MALFORMED_REQUEST = 4, + /* Additional status is provided in ACQ entry extended_status */ + EFA_ADMIN_ILLEGAL_PARAMETER = 5, + EFA_ADMIN_UNKNOWN_ERROR = 6, + EFA_ADMIN_RESOURCE_BUSY = 7, +}; + +struct efa_admin_aq_common_desc { + /* + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command_id; + + /* as appears in efa_admin_aq_opcode */ + u8 opcode; + + /* + * 0 : phase + * 1 : ctrl_data - control buffer address valid + * 2 : ctrl_data_indirect - control buffer address + * points to list of pages with addresses of control + * buffers + * 7:3 : reserved3 + */ + u8 flags; +}; + +/* + * used in efa_admin_aq_entry. Can point directly to control data, or to a + * page list chunk. Used also at the end of indirect mode page list chunks, + * for chaining. 
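+ * i.e. the descriptor at the end of a chunk can point at the next
+ * chunk instead of at control data.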
+ */ +struct efa_admin_ctrl_buff_info { + u32 length; + + struct efa_common_mem_addr address; +}; + +struct efa_admin_aq_entry { + struct efa_admin_aq_common_desc aq_common_descriptor; + + union { + u32 inline_data_w1[3]; + + struct efa_admin_ctrl_buff_info control_buffer; + } u; + + u32 inline_data_w4[12]; +}; + +struct efa_admin_acq_common_desc { + /* + * command identifier to associate it with the aq descriptor + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command; + + u8 status; + + /* + * 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 extended_status; + + /* + * indicates to the driver which AQ entry has been consumed by the + * device and could be reused + */ + u16 sq_head_indx; +}; + +struct efa_admin_acq_entry { + struct efa_admin_acq_common_desc acq_common_descriptor; + + u32 response_specific_data[14]; +}; + +struct efa_admin_aenq_common_desc { + u16 group; + + u16 syndrom; + + /* + * 0 : phase + * 7:1 : reserved - MBZ + */ + u8 flags; + + u8 reserved1[3]; + + u32 timestamp_low; + + u32 timestamp_high; +}; + +struct efa_admin_aenq_entry { + struct efa_admin_aenq_common_desc aenq_common_desc; + + /* command specific inline data */ + u32 inline_data_w4[12]; +}; + +enum efa_admin_eqe_event_type { + EFA_ADMIN_EQE_EVENT_TYPE_COMPLETION = 0, +}; + +/* Completion event */ +struct efa_admin_comp_event { + /* CQ number */ + u16 cqn; + + /* MBZ */ + u16 reserved; + + /* MBZ */ + u32 reserved2; +}; + +/* Event Queue Element */ +struct efa_admin_eqe { + /* + * 0 : phase + * 8:1 : event_type - Event type + * 31:9 : reserved - MBZ + */ + u32 common; + + /* MBZ */ + u32 reserved; + + union { + /* Event data */ + u32 event_data[2]; + + /* Completion Event */ + struct efa_admin_comp_event comp_event; + } u; +}; + +/* aq_common_desc */ +#define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) + +/* acq_common_desc */ +#define EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aenq_common_desc */ +#define EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* eqe */ +#define EFA_ADMIN_EQE_PHASE_MASK BIT(0) +#define EFA_ADMIN_EQE_EVENT_TYPE_MASK GENMASK(8, 1) + +#endif /* _EFA_ADMIN_H_ */ diff --git a/drivers/amazon/net/efa/efa_com.c b/drivers/amazon/net/efa/efa_com.c new file mode 100644 index 0000000000000..d0b13097a0967 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com.c @@ -0,0 +1,1251 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "efa_com.h" +#include "efa_regs_defs.h" + +#define ADMIN_CMD_TIMEOUT_US 30000000 /* usecs */ + +#define EFA_REG_READ_TIMEOUT_US 50000 /* usecs */ +#define EFA_MMIO_READ_INVALID 0xffffffff + +#define EFA_POLL_INTERVAL_MS 100 /* msecs */ + +#define EFA_ASYNC_QUEUE_DEPTH 16 +#define EFA_ADMIN_QUEUE_DEPTH 32 + +#define EFA_CTRL_MAJOR 0 +#define EFA_CTRL_MINOR 0 +#define EFA_CTRL_SUB_MINOR 1 + +enum efa_cmd_status { + EFA_CMD_SUBMITTED, + EFA_CMD_COMPLETED, +}; + +struct efa_comp_ctx { + struct completion wait_event; + struct efa_admin_acq_entry *user_cqe; + u32 comp_size; + enum efa_cmd_status status; + u8 cmd_opcode; + u8 occupied; +}; + +static const char *efa_com_cmd_str(u8 cmd) +{ +#define EFA_CMD_STR_CASE(_cmd) case EFA_ADMIN_##_cmd: return #_cmd + + switch (cmd) { + EFA_CMD_STR_CASE(CREATE_QP); + EFA_CMD_STR_CASE(MODIFY_QP); + EFA_CMD_STR_CASE(QUERY_QP); + EFA_CMD_STR_CASE(DESTROY_QP); + EFA_CMD_STR_CASE(CREATE_AH); + EFA_CMD_STR_CASE(DESTROY_AH); + EFA_CMD_STR_CASE(REG_MR); + EFA_CMD_STR_CASE(DEREG_MR); + EFA_CMD_STR_CASE(CREATE_CQ); + EFA_CMD_STR_CASE(DESTROY_CQ); + EFA_CMD_STR_CASE(GET_FEATURE); + EFA_CMD_STR_CASE(SET_FEATURE); + EFA_CMD_STR_CASE(GET_STATS); + EFA_CMD_STR_CASE(ALLOC_PD); + EFA_CMD_STR_CASE(DEALLOC_PD); + EFA_CMD_STR_CASE(ALLOC_UAR); + EFA_CMD_STR_CASE(DEALLOC_UAR); + EFA_CMD_STR_CASE(CREATE_EQ); + EFA_CMD_STR_CASE(DESTROY_EQ); + default: return "unknown command opcode"; + } +#undef EFA_CMD_STR_CASE +} + +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low) +{ + *addr_low = lower_32_bits(addr); + *addr_high = upper_32_bits(addr); +} + +static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + struct efa_admin_mmio_req_read_less_resp *read_resp; + unsigned long exp_time; + u32 mmio_read_reg = 0; + u32 err; + + read_resp = mmio_read->read_resp; + + spin_lock(&mmio_read->lock); + mmio_read->seq_num++; + + /* trash DMA req_id to identify when hardware is done */ + read_resp->req_id = mmio_read->seq_num + 0x9aL; + EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REG_OFF, offset); + EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REQ_ID, + mmio_read->seq_num); + + writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF); + + exp_time = jiffies + usecs_to_jiffies(mmio_read->mmio_read_timeout); + do { + if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num) + break; + udelay(1); + } while (time_is_after_jiffies(exp_time)); + + if (read_resp->req_id != mmio_read->seq_num) { + ibdev_err_ratelimited( + edev->efa_dev, + "Reading register timed out. 
expected: req id[%u] offset[%#x] actual: req id[%u] offset[%#x]\n", + mmio_read->seq_num, offset, read_resp->req_id, + read_resp->reg_off); + err = EFA_MMIO_READ_INVALID; + goto out; + } + + if (read_resp->reg_off != offset) { + ibdev_err_ratelimited( + edev->efa_dev, + "Reading register failed: wrong offset provided\n"); + err = EFA_MMIO_READ_INVALID; + goto out; + } + + err = read_resp->reg_val; +out: + spin_unlock(&mmio_read->lock); + return err; +} + +static int efa_com_admin_init_sq(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_admin_sq *sq = &aq->sq; + u16 size = aq->depth * sizeof(*sq->entries); + u32 aq_caps = 0; + u32 addr_high; + u32 addr_low; + + sq->entries = + dma_alloc_coherent(aq->dmadev, size, &sq->dma_addr, GFP_KERNEL); + if (!sq->entries) + return -ENOMEM; + + spin_lock_init(&sq->lock); + + sq->cc = 0; + sq->pc = 0; + sq->phase = 1; + + sq->db_addr = (u32 __iomem *)(edev->reg_bar + EFA_REGS_AQ_PROD_DB_OFF); + + addr_high = upper_32_bits(sq->dma_addr); + addr_low = lower_32_bits(sq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF); + + EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_DEPTH, aq->depth); + EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE, + sizeof(struct efa_admin_aq_entry)); + + writel(aq_caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF); + + return 0; +} + +static int efa_com_admin_init_cq(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_admin_cq *cq = &aq->cq; + u16 size = aq->depth * sizeof(*cq->entries); + u32 acq_caps = 0; + u32 addr_high; + u32 addr_low; + + cq->entries = + dma_alloc_coherent(aq->dmadev, size, &cq->dma_addr, GFP_KERNEL); + if (!cq->entries) + return -ENOMEM; + + spin_lock_init(&cq->lock); + + cq->cc = 0; + cq->phase = 1; + + addr_high = upper_32_bits(cq->dma_addr); + addr_low = lower_32_bits(cq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF); + + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_DEPTH, aq->depth); + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE, + sizeof(struct efa_admin_acq_entry)); + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR, + aq->msix_vector_idx); + + writel(acq_caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF); + + return 0; +} + +static int efa_com_admin_init_aenq(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers) +{ + struct efa_com_aenq *aenq = &edev->aenq; + u32 addr_low, addr_high; + u32 aenq_caps = 0; + u16 size; + + if (!aenq_handlers) { + ibdev_err(edev->efa_dev, "aenq handlers pointer is NULL\n"); + return -EINVAL; + } + + size = EFA_ASYNC_QUEUE_DEPTH * sizeof(*aenq->entries); + aenq->entries = dma_alloc_coherent(edev->dmadev, size, &aenq->dma_addr, + GFP_KERNEL); + if (!aenq->entries) + return -ENOMEM; + + aenq->aenq_handlers = aenq_handlers; + aenq->depth = EFA_ASYNC_QUEUE_DEPTH; + aenq->cc = 0; + aenq->phase = 1; + + addr_low = lower_32_bits(aenq->dma_addr); + addr_high = upper_32_bits(aenq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF); + + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_DEPTH, aenq->depth); + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE, + sizeof(struct efa_admin_aenq_entry)); + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR, + aenq->msix_vector_idx); + writel(aenq_caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF); + 
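+ /*
+ * A note on the helpers used above: EFA_SET()/EFA_GET() come from
+ * efa_common_defs.h; judging by their use throughout this file they
+ * pack and extract named sub-fields of a 32-bit value, in the spirit
+ * of the generic helpers in <linux/bitfield.h>, e.g. (DEPTH_MASK is a
+ * placeholder, not a real register define):
+ *
+ *     caps |= FIELD_PREP(DEPTH_MASK, depth);   (roughly what EFA_SET() does)
+ *     depth = FIELD_GET(DEPTH_MASK, caps);     (roughly what EFA_GET() does)
+ *
+ * so the single writel() above programs queue depth, entry size and the
+ * MSI-X vector as packed fields of one AENQ_CAPS register.
+ */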
+ /* + * Init cons_db to mark that all entries in the queue + * are initially available + */ + writel(edev->aenq.cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); + + return 0; +} + +/* ID to be used with efa_com_get_comp_ctx */ +static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq) +{ + u16 ctx_id; + + spin_lock(&aq->comp_ctx_lock); + ctx_id = aq->comp_ctx_pool[aq->comp_ctx_pool_next]; + aq->comp_ctx_pool_next++; + spin_unlock(&aq->comp_ctx_lock); + + return ctx_id; +} + +static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq, + u16 ctx_id) +{ + spin_lock(&aq->comp_ctx_lock); + aq->comp_ctx_pool_next--; + aq->comp_ctx_pool[aq->comp_ctx_pool_next] = ctx_id; + spin_unlock(&aq->comp_ctx_lock); +} + +static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, + struct efa_comp_ctx *comp_ctx) +{ + u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command, + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); + u16 ctx_id = cmd_id & (aq->depth - 1); + + ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id); + comp_ctx->occupied = 0; + efa_com_dealloc_ctx_id(aq, ctx_id); +} + +static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, + u16 cmd_id, bool capture) +{ + u16 ctx_id = cmd_id & (aq->depth - 1); + + if (aq->comp_ctx[ctx_id].occupied && capture) { + ibdev_err_ratelimited( + aq->efa_dev, + "Completion context for command_id %#x is occupied\n", + cmd_id); + return NULL; + } + + if (capture) { + aq->comp_ctx[ctx_id].occupied = 1; + ibdev_dbg(aq->efa_dev, + "Take completion ctxt for command_id %#x\n", cmd_id); + } + + return &aq->comp_ctx[ctx_id]; +} + +static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct efa_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct efa_admin_aq_entry *aqe; + struct efa_comp_ctx *comp_ctx; + u16 queue_size_mask; + u16 cmd_id; + u16 ctx_id; + u16 pi; + + queue_size_mask = aq->depth - 1; + pi = aq->sq.pc & queue_size_mask; + + ctx_id = efa_com_alloc_ctx_id(aq); + + /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ + cmd_id = ctx_id & queue_size_mask; + cmd_id |= aq->sq.pc & ~queue_size_mask; + cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + cmd->aq_common_descriptor.command_id = cmd_id; + EFA_SET(&cmd->aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase); + + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true); + if (!comp_ctx) { + efa_com_dealloc_ctx_id(aq, ctx_id); + return ERR_PTR(-EINVAL); + } + + comp_ctx->status = EFA_CMD_SUBMITTED; + comp_ctx->comp_size = comp_size_in_bytes; + comp_ctx->user_cqe = comp; + comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + + reinit_completion(&comp_ctx->wait_event); + + aqe = &aq->sq.entries[pi]; + memset(aqe, 0, sizeof(*aqe)); + memcpy(aqe, cmd, cmd_size_in_bytes); + + aq->sq.pc++; + atomic64_inc(&aq->stats.submitted_cmd); + + if ((aq->sq.pc & queue_size_mask) == 0) + aq->sq.phase = !aq->sq.phase; + + /* barrier not needed in case of writel */ + writel(aq->sq.pc, aq->sq.db_addr); + + return comp_ctx; +} + +static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq) +{ + size_t pool_size = aq->depth * sizeof(*aq->comp_ctx_pool); + size_t size = aq->depth * sizeof(struct efa_comp_ctx); + struct efa_comp_ctx *comp_ctx; + u16 i; + + aq->comp_ctx = devm_kzalloc(aq->dmadev, size, GFP_KERNEL); + aq->comp_ctx_pool = devm_kzalloc(aq->dmadev, pool_size, GFP_KERNEL); + if (!aq->comp_ctx || 
!aq->comp_ctx_pool) { + devm_kfree(aq->dmadev, aq->comp_ctx_pool); + devm_kfree(aq->dmadev, aq->comp_ctx); + return -ENOMEM; + } + + for (i = 0; i < aq->depth; i++) { + comp_ctx = efa_com_get_comp_ctx(aq, i, false); + if (comp_ctx) + init_completion(&comp_ctx->wait_event); + + aq->comp_ctx_pool[i] = i; + } + + spin_lock_init(&aq->comp_ctx_lock); + + aq->comp_ctx_pool_next = 0; + + return 0; +} + +static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct efa_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct efa_comp_ctx *comp_ctx; + + spin_lock(&aq->sq.lock); + if (!test_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state)) { + ibdev_err_ratelimited(aq->efa_dev, "Admin queue is closed\n"); + spin_unlock(&aq->sq.lock); + return ERR_PTR(-ENODEV); + } + + comp_ctx = __efa_com_submit_admin_cmd(aq, cmd, cmd_size_in_bytes, comp, + comp_size_in_bytes); + spin_unlock(&aq->sq.lock); + if (IS_ERR(comp_ctx)) + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + return comp_ctx; +} + +static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, + struct efa_admin_acq_entry *cqe) +{ + struct efa_comp_ctx *comp_ctx; + u16 cmd_id; + + cmd_id = EFA_GET(&cqe->acq_common_descriptor.command, + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); + + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); + if (!comp_ctx) { + ibdev_err( + aq->efa_dev, + "comp_ctx is NULL. Changing the admin queue running state\n"); + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + return; + } + + comp_ctx->status = EFA_CMD_COMPLETED; + memcpy(comp_ctx->user_cqe, cqe, comp_ctx->comp_size); + + if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) + complete(&comp_ctx->wait_event); +} + +static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) +{ + struct efa_admin_acq_entry *cqe; + u16 queue_size_mask; + u16 comp_num = 0; + u8 phase; + u16 ci; + + queue_size_mask = aq->depth - 1; + + ci = aq->cq.cc & queue_size_mask; + phase = aq->cq.phase; + + cqe = &aq->cq.entries[ci]; + + /* Go over all the completions */ + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & + EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + efa_com_handle_single_admin_completion(aq, cqe); + + ci++; + comp_num++; + if (ci == aq->depth) { + ci = 0; + phase = !phase; + } + + cqe = &aq->cq.entries[ci]; + } + + aq->cq.cc += comp_num; + aq->cq.phase = phase; + aq->sq.cc += comp_num; + atomic64_add(comp_num, &aq->stats.completed_cmd); +} + +static int efa_com_comp_status_to_errno(u8 comp_status) +{ + switch (comp_status) { + case EFA_ADMIN_SUCCESS: + return 0; + case EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE: + return -ENOMEM; + case EFA_ADMIN_UNSUPPORTED_OPCODE: + return -EOPNOTSUPP; + case EFA_ADMIN_BAD_OPCODE: + case EFA_ADMIN_MALFORMED_REQUEST: + case EFA_ADMIN_ILLEGAL_PARAMETER: + case EFA_ADMIN_UNKNOWN_ERROR: + return -EINVAL; + default: + return -EINVAL; + } +} + +static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + unsigned long timeout; + unsigned long flags; + int err; + + timeout = jiffies + usecs_to_jiffies(aq->completion_timeout); + + while (1) { + spin_lock_irqsave(&aq->cq.lock, flags); + efa_com_handle_admin_completion(aq); + spin_unlock_irqrestore(&aq->cq.lock, flags); + + if (comp_ctx->status != EFA_CMD_SUBMITTED) + break; + + if 
(time_is_before_jiffies(timeout)) { + ibdev_err_ratelimited( + aq->efa_dev, + "Wait for completion (polling) timeout\n"); + /* EFA didn't have any completion */ + atomic64_inc(&aq->stats.no_completion); + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + err = -ETIME; + goto out; + } + + msleep(aq->poll_interval); + } + + err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); +out: + efa_com_put_comp_ctx(aq, comp_ctx); + return err; +} + +static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + unsigned long flags; + int err; + + wait_for_completion_timeout(&comp_ctx->wait_event, + usecs_to_jiffies(aq->completion_timeout)); + + /* + * In case the command wasn't completed, find out the root cause. + * There might be two kinds of errors: + * 1) No completion (timeout reached) + * 2) There is a completion but the driver didn't receive the MSI-X interrupt. + */ + if (comp_ctx->status == EFA_CMD_SUBMITTED) { + spin_lock_irqsave(&aq->cq.lock, flags); + efa_com_handle_admin_completion(aq); + spin_unlock_irqrestore(&aq->cq.lock, flags); + + atomic64_inc(&aq->stats.no_completion); + + if (comp_ctx->status == EFA_CMD_COMPLETED) + ibdev_err_ratelimited( + aq->efa_dev, + "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + efa_com_cmd_str(comp_ctx->cmd_opcode), + comp_ctx->cmd_opcode, comp_ctx->status, + comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + else + ibdev_err_ratelimited( + aq->efa_dev, + "The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + efa_com_cmd_str(comp_ctx->cmd_opcode), + comp_ctx->cmd_opcode, comp_ctx->status, + comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + err = -ETIME; + goto out; + } + + err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); +out: + efa_com_put_comp_ctx(aq, comp_ctx); + return err; +} + +/* + * There are two ways to wait for a completion. + * Polling mode - wait until the completion is available. + * Async mode - wait on wait queue until the completion is ready + * (or the timeout expires). + * It is expected that the IRQ handler calls efa_com_handle_admin_completion + * to mark the completions. + */ +static int efa_com_wait_and_process_admin_cq(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + if (test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) + return efa_com_wait_and_process_admin_cq_polling(comp_ctx, aq); + + return efa_com_wait_and_process_admin_cq_interrupts(comp_ctx, aq); +} + +/** + * efa_com_cmd_exec - Execute admin command + * @aq: admin queue. + * @cmd: the admin command to execute. + * @cmd_size: the command size. + * @comp: command completion return entry. + * @comp_size: command completion size. + * Submit an admin command and then wait until the device returns a + * completion. + * The completion will be copied into comp. + * + * @return - 0 on success, negative value on failure.
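+ *
+ * Typical usage, mirroring the wrappers in efa_com_cmd.c (sketch only;
+ * qp_handle stands for whatever handle the caller holds):
+ *
+ *     struct efa_admin_destroy_qp_cmd cmd = {};
+ *     struct efa_admin_destroy_qp_resp resp;
+ *     int err;
+ *
+ *     cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_QP;
+ *     cmd.qp_handle = qp_handle;
+ *     err = efa_com_cmd_exec(&edev->aq,
+ *                            (struct efa_admin_aq_entry *)&cmd, sizeof(cmd),
+ *                            (struct efa_admin_acq_entry *)&resp, sizeof(resp));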
+ */ +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size, + struct efa_admin_acq_entry *comp, + size_t comp_size) +{ + struct efa_comp_ctx *comp_ctx; + int err; + + might_sleep(); + + /* In case of queue FULL */ + down(&aq->avail_cmds); + + ibdev_dbg(aq->efa_dev, "%s (opcode %d)\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode); + comp_ctx = efa_com_submit_admin_cmd(aq, cmd, cmd_size, comp, comp_size); + if (IS_ERR(comp_ctx)) { + ibdev_err_ratelimited( + aq->efa_dev, + "Failed to submit command %s (opcode %u) err %ld\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx)); + + up(&aq->avail_cmds); + atomic64_inc(&aq->stats.cmd_err); + return PTR_ERR(comp_ctx); + } + + err = efa_com_wait_and_process_admin_cq(comp_ctx, aq); + if (err) { + ibdev_err_ratelimited( + aq->efa_dev, + "Failed to process command %s (opcode %u) comp_status %d err %d\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode, + comp_ctx->user_cqe->acq_common_descriptor.status, err); + atomic64_inc(&aq->stats.cmd_err); + } + + up(&aq->avail_cmds); + + return err; +} + +/** + * efa_com_admin_destroy - Destroy the admin and the async events queues. + * @edev: EFA communication layer struct + */ +void efa_com_admin_destroy(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_aenq *aenq = &edev->aenq; + struct efa_com_admin_cq *cq = &aq->cq; + struct efa_com_admin_sq *sq = &aq->sq; + u16 size; + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + devm_kfree(edev->dmadev, aq->comp_ctx_pool); + devm_kfree(edev->dmadev, aq->comp_ctx); + + size = aq->depth * sizeof(*sq->entries); + dma_free_coherent(edev->dmadev, size, sq->entries, sq->dma_addr); + + size = aq->depth * sizeof(*cq->entries); + dma_free_coherent(edev->dmadev, size, cq->entries, cq->dma_addr); + + size = aenq->depth * sizeof(*aenq->entries); + dma_free_coherent(edev->dmadev, size, aenq->entries, aenq->dma_addr); +} + +/** + * efa_com_set_admin_polling_mode - Set the admin completion queue polling mode + * @edev: EFA communication layer struct + * @polling: Enable/Disable polling mode + * + * Set the admin completion mode. + */ +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling) +{ + u32 mask_value = 0; + + if (polling) + EFA_SET(&mask_value, EFA_REGS_INTR_MASK_EN, 1); + + writel(mask_value, edev->reg_bar + EFA_REGS_INTR_MASK_OFF); + if (polling) + set_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); + else + clear_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); +} + +static void efa_com_stats_init(struct efa_com_dev *edev) +{ + atomic64_t *s = (atomic64_t *)&edev->aq.stats; + int i; + + for (i = 0; i < sizeof(edev->aq.stats) / sizeof(*s); i++, s++) + atomic64_set(s, 0); +} + +/** + * efa_com_admin_init - Init the admin and the async queues + * @edev: EFA communication layer struct + * @aenq_handlers: Those handlers to be called upon event. + * + * Initialize the admin submission and completion queues. + * Initialize the asynchronous events notification queues. + * + * @return - 0 on success, negative value on failure. 
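+ *
+ * Expected bring-up order, roughly (a sketch; the actual probe flow
+ * lives outside this file, and efa_com_mmio_reg_read_init() must come
+ * first because every register read below relies on it):
+ *
+ *     err = efa_com_mmio_reg_read_init(edev);
+ *     err = efa_com_validate_version(edev);
+ *     err = efa_com_get_dma_width(edev);
+ *     err = efa_com_admin_init(edev, &aenq_handlers);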
+ */ +int efa_com_admin_init(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers) +{ + struct efa_com_admin_queue *aq = &edev->aq; + u32 timeout; + u32 dev_sts; + u32 cap; + int err; + + dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + if (!EFA_GET(&dev_sts, EFA_REGS_DEV_STS_READY)) { + ibdev_err(edev->efa_dev, + "Device isn't ready, abort com init %#x\n", dev_sts); + return -ENODEV; + } + + aq->depth = EFA_ADMIN_QUEUE_DEPTH; + + aq->dmadev = edev->dmadev; + aq->efa_dev = edev->efa_dev; + set_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state); + + sema_init(&aq->avail_cmds, aq->depth); + + efa_com_stats_init(edev); + + err = efa_com_init_comp_ctxt(aq); + if (err) + return err; + + err = efa_com_admin_init_sq(edev); + if (err) + goto err_destroy_comp_ctxt; + + err = efa_com_admin_init_cq(edev); + if (err) + goto err_destroy_sq; + + efa_com_set_admin_polling_mode(edev, false); + + err = efa_com_admin_init_aenq(edev, aenq_handlers); + if (err) + goto err_destroy_cq; + + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO); + if (timeout) + /* the resolution of timeout reg is 100ms */ + aq->completion_timeout = timeout * 100000; + else + aq->completion_timeout = ADMIN_CMD_TIMEOUT_US; + + aq->poll_interval = EFA_POLL_INTERVAL_MS; + + set_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + return 0; + +err_destroy_cq: + dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->cq.entries), + aq->cq.entries, aq->cq.dma_addr); +err_destroy_sq: + dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->sq.entries), + aq->sq.entries, aq->sq.dma_addr); +err_destroy_comp_ctxt: + devm_kfree(edev->dmadev, aq->comp_ctx); + + return err; +} + +/** + * efa_com_admin_q_comp_intr_handler - admin queue interrupt handler + * @edev: EFA communication layer struct + * + * This method goes over the admin completion queue and wakes up + * all the pending threads that wait on the commands wait event. + * + * Note: Should be called after MSI-X interrupt. + */ +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev) +{ + unsigned long flags; + + spin_lock_irqsave(&edev->aq.cq.lock, flags); + efa_com_handle_admin_completion(&edev->aq); + spin_unlock_irqrestore(&edev->aq.cq.lock, flags); +} + +/* + * efa_handle_specific_aenq_event: + * return the handler that is relevant to the specific event group + */ +static efa_aenq_handler efa_com_get_specific_aenq_cb(struct efa_com_dev *edev, + u16 group) +{ + struct efa_aenq_handlers *aenq_handlers = edev->aenq.aenq_handlers; + + if (group < EFA_MAX_HANDLERS && aenq_handlers->handlers[group]) + return aenq_handlers->handlers[group]; + + return aenq_handlers->unimplemented_handler; +} + +/** + * efa_com_aenq_intr_handler - AENQ interrupt handler + * @edev: EFA communication layer struct + * @data: Data of interrupt handler. + * + * Go over the async event notification queue and call the proper aenq handler. 
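+ *
+ * Handlers are looked up by AENQ group in the efa_aenq_handlers table
+ * handed to efa_com_admin_init(); a minimal sketch (handler names and
+ * the MY_GROUP index are illustrative only):
+ *
+ *     static void my_default_handler(void *data,
+ *                                    struct efa_admin_aenq_entry *e)
+ *     {
+ *     }
+ *
+ *     static struct efa_aenq_handlers handlers = {
+ *             .handlers[MY_GROUP] = my_group_handler,
+ *             .unimplemented_handler = my_default_handler,
+ *     };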
+ */ +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data) +{ + struct efa_admin_aenq_common_desc *aenq_common; + struct efa_com_aenq *aenq = &edev->aenq; + struct efa_admin_aenq_entry *aenq_e; + efa_aenq_handler handler_cb; + u32 processed = 0; + u8 phase; + u32 ci; + + ci = aenq->cc & (aenq->depth - 1); + phase = aenq->phase; + aenq_e = &aenq->entries[ci]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((READ_ONCE(aenq_common->flags) & + EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + + /* Handle specific event*/ + handler_cb = efa_com_get_specific_aenq_cb(edev, + aenq_common->group); + handler_cb(data, aenq_e); /* call the actual event handler*/ + + /* Get next event entry */ + ci++; + processed++; + + if (ci == aenq->depth) { + ci = 0; + phase = !phase; + } + aenq_e = &aenq->entries[ci]; + aenq_common = &aenq_e->aenq_common_desc; + } + + aenq->cc += processed; + aenq->phase = phase; + + /* Don't update aenq doorbell if there weren't any processed events */ + if (!processed) + return; + + /* barrier not needed in case of writel */ + writel(aenq->cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); +} + +static void efa_com_mmio_reg_read_resp_addr_init(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + u32 addr_high; + u32 addr_low; + + /* dma_addr_bits is unknown at this point */ + addr_high = (mmio_read->read_resp_dma_addr >> 32) & GENMASK(31, 0); + addr_low = mmio_read->read_resp_dma_addr & GENMASK(31, 0); + + writel(addr_high, edev->reg_bar + EFA_REGS_MMIO_RESP_HI_OFF); + writel(addr_low, edev->reg_bar + EFA_REGS_MMIO_RESP_LO_OFF); +} + +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + + spin_lock_init(&mmio_read->lock); + mmio_read->read_resp = + dma_alloc_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), + &mmio_read->read_resp_dma_addr, GFP_KERNEL); + if (!mmio_read->read_resp) + return -ENOMEM; + + efa_com_mmio_reg_read_resp_addr_init(edev); + + mmio_read->read_resp->req_id = 0; + mmio_read->seq_num = 0; + mmio_read->mmio_read_timeout = EFA_REG_READ_TIMEOUT_US; + + return 0; +} + +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + + dma_free_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), + mmio_read->read_resp, mmio_read->read_resp_dma_addr); +} + +int efa_com_validate_version(struct efa_com_dev *edev) +{ + u32 min_ctrl_ver = 0; + u32 ctrl_ver_masked; + u32 min_ver = 0; + u32 ctrl_ver; + u32 ver; + + /* + * Make sure the EFA version and the controller version are at least + * as the driver expects + */ + ver = efa_com_reg_read32(edev, EFA_REGS_VERSION_OFF); + ctrl_ver = efa_com_reg_read32(edev, + EFA_REGS_CONTROLLER_VERSION_OFF); + + ibdev_dbg(edev->efa_dev, "efa device version: %d.%d\n", + EFA_GET(&ver, EFA_REGS_VERSION_MAJOR_VERSION), + EFA_GET(&ver, EFA_REGS_VERSION_MINOR_VERSION)); + + EFA_SET(&min_ver, EFA_REGS_VERSION_MAJOR_VERSION, + EFA_ADMIN_API_VERSION_MAJOR); + EFA_SET(&min_ver, EFA_REGS_VERSION_MINOR_VERSION, + EFA_ADMIN_API_VERSION_MINOR); + if (ver < min_ver) { + ibdev_err( + edev->efa_dev, + "EFA version is lower than the minimal version the driver supports\n"); + return -EOPNOTSUPP; + } + + ibdev_dbg( + edev->efa_dev, + "efa controller version: %d.%d.%d implementation version %d\n", + 
EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION), + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION), + EFA_GET(&ctrl_ver, + EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION), + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_IMPL_ID)); + + ctrl_ver_masked = + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION) | + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION) | + EFA_GET(&ctrl_ver, + EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION); + + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION, + EFA_CTRL_MAJOR); + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION, + EFA_CTRL_MINOR); + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION, + EFA_CTRL_SUB_MINOR); + /* Validate the ctrl version without the implementation ID */ + if (ctrl_ver_masked < min_ctrl_ver) { + ibdev_err( + edev->efa_dev, + "EFA ctrl version is lower than the minimal ctrl version the driver supports\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * efa_com_get_dma_width - Retrieve physical dma address width the device + * supports. + * @edev: EFA communication layer struct + * + * Retrieve the maximum physical address bits the device can handle. + * + * @return: > 0 on Success and negative value otherwise. + */ +int efa_com_get_dma_width(struct efa_com_dev *edev) +{ + u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + int width; + + width = EFA_GET(&caps, EFA_REGS_CAPS_DMA_ADDR_WIDTH); + + ibdev_dbg(edev->efa_dev, "DMA width: %d\n", width); + + if (width < 32 || width > 64) { + ibdev_err(edev->efa_dev, "DMA width illegal value: %d\n", + width); + return -EINVAL; + } + + edev->dma_addr_bits = width; + + return width; +} + +static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout, int on) +{ + u32 val, i; + + for (i = 0; i < timeout; i++) { + val = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + + if (EFA_GET(&val, EFA_REGS_DEV_STS_RESET_IN_PROGRESS) == on) + return 0; + + ibdev_dbg(edev->efa_dev, "Reset indication val %d\n", val); + msleep(EFA_POLL_INTERVAL_MS); + } + + return -ETIME; +} + +/** + * efa_com_dev_reset - Perform device FLR to the device. + * @edev: EFA communication layer struct + * @reset_reason: Specify what is the trigger for the reset in case of an error. + * + * @return - 0 on success, negative value on failure. 
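+ *
+ * Flow implemented below: write DEV_CTL with the reset bit and the
+ * reset reason, restore the MMIO read-less response address (a reset
+ * clears it), wait for DEV_STS.RESET_IN_PROGRESS to assert, clear
+ * DEV_CTL, wait for the bit to deassert, and finally re-read the admin
+ * command timeout from the CAPS register.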
+ */ +int efa_com_dev_reset(struct efa_com_dev *edev, + enum efa_regs_reset_reason_types reset_reason) +{ + u32 stat, timeout, cap; + u32 reset_val = 0; + int err; + + stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + + if (!EFA_GET(&stat, EFA_REGS_DEV_STS_READY)) { + ibdev_err(edev->efa_dev, + "Device isn't ready, can't reset device\n"); + return -EINVAL; + } + + timeout = EFA_GET(&cap, EFA_REGS_CAPS_RESET_TIMEOUT); + if (!timeout) { + ibdev_err(edev->efa_dev, "Invalid timeout value\n"); + return -EINVAL; + } + + /* start reset */ + EFA_SET(&reset_val, EFA_REGS_DEV_CTL_DEV_RESET, 1); + EFA_SET(&reset_val, EFA_REGS_DEV_CTL_RESET_REASON, reset_reason); + writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); + + /* reset clears the mmio readless address, restore it */ + efa_com_mmio_reg_read_resp_addr_init(edev); + + err = wait_for_reset_state(edev, timeout, 1); + if (err) { + ibdev_err(edev->efa_dev, "Reset indication didn't turn on\n"); + return err; + } + + /* reset done */ + writel(0, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); + err = wait_for_reset_state(edev, timeout, 0); + if (err) { + ibdev_err(edev->efa_dev, "Reset indication didn't turn off\n"); + return err; + } + + timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO); + if (timeout) + /* the resolution of timeout reg is 100ms */ + edev->aq.completion_timeout = timeout * 100000; + else + edev->aq.completion_timeout = ADMIN_CMD_TIMEOUT_US; + + return 0; +} + +static int efa_com_create_eq(struct efa_com_dev *edev, + struct efa_com_create_eq_params *params, + struct efa_com_create_eq_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_create_eq_resp resp = {}; + struct efa_admin_create_eq_cmd cmd = {}; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_CREATE_EQ; + EFA_SET(&cmd.caps, EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS, + params->entry_size_in_bytes / 4); + cmd.depth = params->depth; + cmd.event_bitmask = params->event_bitmask; + cmd.msix_vec = params->msix_vec; + + efa_com_set_dma_addr(params->dma_addr, &cmd.ba.mem_addr_high, + &cmd.ba.mem_addr_low); + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create eq[%d]\n", err); + return err; + } + + result->eqn = resp.eqn; + + return 0; +} + +static void efa_com_destroy_eq(struct efa_com_dev *edev, + struct efa_com_destroy_eq_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_destroy_eq_resp resp = {}; + struct efa_admin_destroy_eq_cmd cmd = {}; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DESTROY_EQ; + cmd.eqn = params->eqn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy EQ-%u [%d]\n", cmd.eqn, + err); +} + +static void efa_com_arm_eq(struct efa_com_dev *edev, struct efa_com_eq *eeq) +{ + u32 val = 0; + + EFA_SET(&val, EFA_REGS_EQ_DB_EQN, eeq->eqn); + EFA_SET(&val, EFA_REGS_EQ_DB_ARM, 1); + + writel(val, edev->reg_bar + EFA_REGS_EQ_DB_OFF); +} + +void efa_com_eq_comp_intr_handler(struct efa_com_dev *edev, + struct efa_com_eq *eeq) +{ + struct efa_admin_eqe *eqe; + u32 processed = 0; + u8 phase; + u32 ci; + + ci = eeq->cc & (eeq->depth - 1); + phase = eeq->phase; + eqe = &eeq->eqes[ci]; + + /* Go over all the events */ + while 
((READ_ONCE(eqe->common) & EFA_ADMIN_EQE_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + + eeq->cb(eeq, eqe); + + /* Get next event entry */ + ci++; + processed++; + + if (ci == eeq->depth) { + ci = 0; + phase = !phase; + } + + eqe = &eeq->eqes[ci]; + } + + eeq->cc += processed; + eeq->phase = phase; + efa_com_arm_eq(eeq->edev, eeq); +} + +void efa_com_eq_destroy(struct efa_com_dev *edev, struct efa_com_eq *eeq) +{ + struct efa_com_destroy_eq_params params = { + .eqn = eeq->eqn, + }; + + efa_com_destroy_eq(edev, ¶ms); + dma_free_coherent(edev->dmadev, eeq->depth * sizeof(*eeq->eqes), + eeq->eqes, eeq->dma_addr); +} + +int efa_com_eq_init(struct efa_com_dev *edev, struct efa_com_eq *eeq, + efa_eqe_handler cb, u16 depth, u8 msix_vec) +{ + struct efa_com_create_eq_params params = {}; + struct efa_com_create_eq_result result = {}; + int err; + + params.depth = depth; + params.entry_size_in_bytes = sizeof(*eeq->eqes); + EFA_SET(¶ms.event_bitmask, + EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS, 1); + params.msix_vec = msix_vec; + + eeq->eqes = dma_alloc_coherent(edev->dmadev, + params.depth * sizeof(*eeq->eqes), + ¶ms.dma_addr, GFP_KERNEL); + if (!eeq->eqes) + return -ENOMEM; + + err = efa_com_create_eq(edev, ¶ms, &result); + if (err) + goto err_free_coherent; + + eeq->eqn = result.eqn; + eeq->edev = edev; + eeq->dma_addr = params.dma_addr; + eeq->phase = 1; + eeq->depth = params.depth; + eeq->cb = cb; + efa_com_arm_eq(edev, eeq); + + return 0; + +err_free_coherent: + dma_free_coherent(edev->dmadev, params.depth * sizeof(*eeq->eqes), + eeq->eqes, params.dma_addr); + return err; +} diff --git a/drivers/amazon/net/efa/efa_com.h b/drivers/amazon/net/efa/efa_com.h new file mode 100644 index 0000000000000..bced7c3981792 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_COM_H_ +#define _EFA_COM_H_ + +#include +#include +#include +#include +#include + +#include +#include "kcompat.h" + +#include "efa_common_defs.h" +#include "efa_admin_defs.h" +#include "efa_admin_cmds_defs.h" +#include "efa_regs_defs.h" + +#define EFA_MAX_HANDLERS 256 + +struct efa_com_admin_cq { + struct efa_admin_acq_entry *entries; + dma_addr_t dma_addr; + spinlock_t lock; /* Protects ACQ */ + + u16 cc; /* consumer counter */ + u8 phase; +}; + +struct efa_com_admin_sq { + struct efa_admin_aq_entry *entries; + dma_addr_t dma_addr; + spinlock_t lock; /* Protects ASQ */ + + u32 __iomem *db_addr; + + u16 cc; /* consumer counter */ + u16 pc; /* producer counter */ + u8 phase; + +}; + +/* Don't use anything other than atomic64 */ +struct efa_com_stats_admin { + atomic64_t submitted_cmd; + atomic64_t completed_cmd; + atomic64_t cmd_err; + atomic64_t no_completion; +}; + +enum { + EFA_AQ_STATE_RUNNING_BIT = 0, + EFA_AQ_STATE_POLLING_BIT = 1, +}; + +struct efa_com_admin_queue { + void *dmadev; + void *efa_dev; + struct efa_comp_ctx *comp_ctx; + u32 completion_timeout; /* usecs */ + u16 poll_interval; /* msecs */ + u16 depth; + struct efa_com_admin_cq cq; + struct efa_com_admin_sq sq; + u16 msix_vector_idx; + + unsigned long state; + + /* Count the number of available admin commands */ + struct semaphore avail_cmds; + + struct efa_com_stats_admin stats; + + spinlock_t comp_ctx_lock; /* Protects completion context pool */ + u32 *comp_ctx_pool; + u16 comp_ctx_pool_next; +}; + +struct efa_aenq_handlers; +struct efa_com_eq; +typedef void (*efa_eqe_handler)(struct efa_com_eq *eeq, + struct efa_admin_eqe *eqe); + +struct efa_com_aenq { + struct efa_admin_aenq_entry *entries; + struct efa_aenq_handlers *aenq_handlers; + dma_addr_t dma_addr; + u32 cc; /* consumer counter */ + u16 msix_vector_idx; + u16 depth; + u8 phase; +}; + +struct efa_com_mmio_read { + struct efa_admin_mmio_req_read_less_resp *read_resp; + dma_addr_t read_resp_dma_addr; + u16 seq_num; + u16 mmio_read_timeout; /* usecs */ + /* serializes mmio reads */ + spinlock_t lock; +}; + +struct efa_com_dev { + struct efa_com_admin_queue aq; + struct efa_com_aenq aenq; + u8 __iomem *reg_bar; + void *dmadev; + void *efa_dev; + u32 supported_features; + u32 dma_addr_bits; + + struct efa_com_mmio_read mmio_read; +}; + +struct efa_com_eq { + struct efa_com_dev *edev; + struct efa_admin_eqe *eqes; + dma_addr_t dma_addr; + u32 cc; /* Consumer counter */ + u16 eqn; + u16 depth; + u8 phase; + efa_eqe_handler cb; +}; + +struct efa_com_create_eq_params { + dma_addr_t dma_addr; + u32 event_bitmask; + u16 depth; + u8 entry_size_in_bytes; + u8 msix_vec; +}; + +struct efa_com_create_eq_result { + u16 eqn; +}; + +struct efa_com_destroy_eq_params { + u16 eqn; +}; + +typedef void (*efa_aenq_handler)(void *data, + struct efa_admin_aenq_entry *aenq_e); + +/* Holds aenq handlers. 
Indexed by AENQ event group */ +struct efa_aenq_handlers { + efa_aenq_handler handlers[EFA_MAX_HANDLERS]; + efa_aenq_handler unimplemented_handler; +}; + +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low); +int efa_com_admin_init(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers); +void efa_com_admin_destroy(struct efa_com_dev *edev); +int efa_com_eq_init(struct efa_com_dev *edev, struct efa_com_eq *eeq, + efa_eqe_handler cb, u16 depth, u8 msix_vec); +void efa_com_eq_destroy(struct efa_com_dev *edev, struct efa_com_eq *eeq); +int efa_com_dev_reset(struct efa_com_dev *edev, + enum efa_regs_reset_reason_types reset_reason); +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling); +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev); +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev); +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev); + +int efa_com_validate_version(struct efa_com_dev *edev); +int efa_com_get_dma_width(struct efa_com_dev *edev); + +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size, + struct efa_admin_acq_entry *comp, + size_t comp_size); +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data); +void efa_com_eq_comp_intr_handler(struct efa_com_dev *edev, + struct efa_com_eq *eeq); + +#endif /* _EFA_COM_H_ */ diff --git a/drivers/amazon/net/efa/efa_com_cmd.c b/drivers/amazon/net/efa/efa_com_cmd.c new file mode 100644 index 0000000000000..e107c354bc349 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com_cmd.c @@ -0,0 +1,801 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_com.h" +#include "efa_com_cmd.h" + +int efa_com_create_qp(struct efa_com_dev *edev, + struct efa_com_create_qp_params *params, + struct efa_com_create_qp_result *res) +{ + struct efa_admin_create_qp_cmd create_qp_cmd = {}; + struct efa_admin_create_qp_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + create_qp_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_QP; + + create_qp_cmd.pd = params->pd; + create_qp_cmd.qp_type = params->qp_type; + create_qp_cmd.rq_base_addr = params->rq_base_addr; + create_qp_cmd.send_cq_idx = params->send_cq_idx; + create_qp_cmd.recv_cq_idx = params->recv_cq_idx; + create_qp_cmd.qp_alloc_size.send_queue_ring_size = + params->sq_ring_size_in_bytes; + create_qp_cmd.qp_alloc_size.send_queue_depth = + params->sq_depth; + create_qp_cmd.qp_alloc_size.recv_queue_ring_size = + params->rq_ring_size_in_bytes; + create_qp_cmd.qp_alloc_size.recv_queue_depth = + params->rq_depth; + create_qp_cmd.uar = params->uarn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&create_qp_cmd, + sizeof(create_qp_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create qp [%d]\n", err); + return err; + } + + res->qp_handle = cmd_completion.qp_handle; + res->qp_num = cmd_completion.qp_num; + res->sq_db_offset = cmd_completion.sq_db_offset; + res->rq_db_offset = cmd_completion.rq_db_offset; + res->llq_descriptors_offset = cmd_completion.llq_descriptors_offset; + res->send_sub_cq_idx = cmd_completion.send_sub_cq_idx; + res->recv_sub_cq_idx = cmd_completion.recv_sub_cq_idx; + + return 0; +} + +int efa_com_modify_qp(struct efa_com_dev *edev, + struct efa_com_modify_qp_params *params) +{ + struct 
efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_modify_qp_cmd cmd = {}; + struct efa_admin_modify_qp_resp resp; + int err; + + cmd.aq_common_desc.opcode = EFA_ADMIN_MODIFY_QP; + cmd.modify_mask = params->modify_mask; + cmd.qp_handle = params->qp_handle; + cmd.qp_state = params->qp_state; + cmd.cur_qp_state = params->cur_qp_state; + cmd.qkey = params->qkey; + cmd.sq_psn = params->sq_psn; + cmd.sq_drained_async_notify = params->sq_drained_async_notify; + cmd.rnr_retry = params->rnr_retry; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to modify qp-%u modify_mask[%#x] [%d]\n", + cmd.qp_handle, cmd.modify_mask, err); + return err; + } + + return 0; +} + +int efa_com_query_qp(struct efa_com_dev *edev, + struct efa_com_query_qp_params *params, + struct efa_com_query_qp_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_query_qp_cmd cmd = {}; + struct efa_admin_query_qp_resp resp; + int err; + + cmd.aq_common_desc.opcode = EFA_ADMIN_QUERY_QP; + cmd.qp_handle = params->qp_handle; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to query qp-%u [%d]\n", + cmd.qp_handle, err); + return err; + } + + result->qp_state = resp.qp_state; + result->qkey = resp.qkey; + result->sq_draining = resp.sq_draining; + result->sq_psn = resp.sq_psn; + result->rnr_retry = resp.rnr_retry; + + return 0; +} + +int efa_com_destroy_qp(struct efa_com_dev *edev, + struct efa_com_destroy_qp_params *params) +{ + struct efa_admin_destroy_qp_resp cmd_completion; + struct efa_admin_destroy_qp_cmd qp_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + qp_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_QP; + qp_cmd.qp_handle = params->qp_handle; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&qp_cmd, + sizeof(qp_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy qp-%u [%d]\n", + qp_cmd.qp_handle, err); + return err; + } + + return 0; +} + +int efa_com_create_cq(struct efa_com_dev *edev, + struct efa_com_create_cq_params *params, + struct efa_com_create_cq_result *result) +{ + struct efa_admin_create_cq_resp cmd_completion = {}; + struct efa_admin_create_cq_cmd create_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + create_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_CQ; + EFA_SET(&create_cmd.cq_caps_2, + EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS, + params->entry_size_in_bytes / 4); + create_cmd.cq_depth = params->cq_depth; + create_cmd.num_sub_cqs = params->num_sub_cqs; + create_cmd.uar = params->uarn; + if (params->interrupt_mode_enabled) { + EFA_SET(&create_cmd.cq_caps_1, + EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED, 1); + create_cmd.eqn = params->eqn; + } + if (params->set_src_addr) { + EFA_SET(&create_cmd.cq_caps_2, + EFA_ADMIN_CREATE_CQ_CMD_SET_SRC_ADDR, 1); + } + efa_com_set_dma_addr(params->dma_addr, + &create_cmd.cq_ba.mem_addr_high, + &create_cmd.cq_ba.mem_addr_low); + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create cq[%d]\n", 
err); + return err; + } + + result->cq_idx = cmd_completion.cq_idx; + result->actual_depth = params->cq_depth; + result->db_off = cmd_completion.db_offset; + result->db_valid = EFA_GET(&cmd_completion.flags, + EFA_ADMIN_CREATE_CQ_RESP_DB_VALID); + + return 0; +} + +int efa_com_destroy_cq(struct efa_com_dev *edev, + struct efa_com_destroy_cq_params *params) +{ + struct efa_admin_destroy_cq_cmd destroy_cmd = {}; + struct efa_admin_destroy_cq_resp destroy_resp; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + destroy_cmd.cq_idx = params->cq_idx; + destroy_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_CQ; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct efa_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy CQ-%u [%d]\n", + params->cq_idx, err); + return err; + } + + return 0; +} + +int efa_com_register_mr(struct efa_com_dev *edev, + struct efa_com_reg_mr_params *params, + struct efa_com_reg_mr_result *result) +{ + struct efa_admin_reg_mr_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_reg_mr_cmd mr_cmd = {}; + int err; + + mr_cmd.aq_common_desc.opcode = EFA_ADMIN_REG_MR; + mr_cmd.pd = params->pd; + mr_cmd.mr_length = params->mr_length_in_bytes; + EFA_SET(&mr_cmd.flags, EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT, + params->page_shift); + mr_cmd.iova = params->iova; + mr_cmd.permissions = params->permissions; + + if (params->inline_pbl) { + memcpy(mr_cmd.pbl.inline_pbl_array, + params->pbl.inline_pbl_array, + sizeof(mr_cmd.pbl.inline_pbl_array)); + } else { + mr_cmd.pbl.pbl.length = params->pbl.pbl.length; + mr_cmd.pbl.pbl.address.mem_addr_low = + params->pbl.pbl.address.mem_addr_low; + mr_cmd.pbl.pbl.address.mem_addr_high = + params->pbl.pbl.address.mem_addr_high; + EFA_SET(&mr_cmd.aq_common_desc.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + if (params->indirect) + EFA_SET(&mr_cmd.aq_common_desc.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1); + } + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&mr_cmd, + sizeof(mr_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to register mr [%d]\n", err); + return err; + } + + result->l_key = cmd_completion.l_key; + result->r_key = cmd_completion.r_key; + + return 0; +} + +int efa_com_dereg_mr(struct efa_com_dev *edev, + struct efa_com_dereg_mr_params *params) +{ + struct efa_admin_dereg_mr_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dereg_mr_cmd mr_cmd = {}; + int err; + + mr_cmd.aq_common_desc.opcode = EFA_ADMIN_DEREG_MR; + mr_cmd.l_key = params->l_key; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&mr_cmd, + sizeof(mr_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to de-register mr(lkey-%u) [%d]\n", + mr_cmd.l_key, err); + return err; + } + + return 0; +} + +int efa_com_create_ah(struct efa_com_dev *edev, + struct efa_com_create_ah_params *params, + struct efa_com_create_ah_result *result) +{ + struct efa_admin_create_ah_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_create_ah_cmd ah_cmd = {}; + int err; + + ah_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_AH; + + memcpy(ah_cmd.dest_addr, params->dest_addr, sizeof(ah_cmd.dest_addr)); + ah_cmd.pd = 
params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&ah_cmd, + sizeof(ah_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create ah for %pI6 [%d]\n", + ah_cmd.dest_addr, err); + return err; + } + + result->ah = cmd_completion.ah; + + return 0; +} + +int efa_com_destroy_ah(struct efa_com_dev *edev, + struct efa_com_destroy_ah_params *params) +{ + struct efa_admin_destroy_ah_resp cmd_completion; + struct efa_admin_destroy_ah_cmd ah_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + ah_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_AH; + ah_cmd.ah = params->ah; + ah_cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&ah_cmd, + sizeof(ah_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy ah-%d pd-%d [%d]\n", + ah_cmd.ah, ah_cmd.pd, err); + return err; + } + + return 0; +} + +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id) +{ + u32 feature_mask = 1 << feature_id; + + /* Device attributes is always supported */ + if (feature_id != EFA_ADMIN_DEVICE_ATTR && + !(edev->supported_features & feature_mask)) + return false; + + return true; +} + +static int efa_com_get_feature_ex(struct efa_com_dev *edev, + struct efa_admin_get_feature_resp *get_resp, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct efa_admin_get_feature_cmd get_cmd = {}; + struct efa_com_admin_queue *aq; + int err; + + if (!efa_com_check_supported_feature_id(edev, feature_id)) { + ibdev_err_ratelimited(edev->efa_dev, + "Feature %d isn't supported\n", + feature_id); + return -EOPNOTSUPP; + } + + aq = &edev->aq; + + get_cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_FEATURE; + + if (control_buff_size) + EFA_SET(&get_cmd.aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + + efa_com_set_dma_addr(control_buf_dma_addr, + &get_cmd.control_buffer.address.mem_addr_high, + &get_cmd.control_buffer.address.mem_addr_low); + + get_cmd.control_buffer.length = control_buff_size; + get_cmd.feature_common.feature_id = feature_id; + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *) + &get_cmd, + sizeof(get_cmd), + (struct efa_admin_acq_entry *) + get_resp, + sizeof(*get_resp)); + + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to submit get_feature command %d [%d]\n", + feature_id, err); + return err; + } + + return 0; +} + +static int efa_com_get_feature(struct efa_com_dev *edev, + struct efa_admin_get_feature_resp *get_resp, + enum efa_admin_aq_feature_id feature_id) +{ + return efa_com_get_feature_ex(edev, get_resp, feature_id, 0, 0); +} + +int efa_com_get_device_attr(struct efa_com_dev *edev, + struct efa_com_get_device_attr_result *result) +{ + struct efa_admin_get_feature_resp resp; + int err; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_DEVICE_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get device attributes %d\n", + err); + return err; + } + + result->page_size_cap = resp.u.device_attr.page_size_cap; + result->fw_version = resp.u.device_attr.fw_version; + result->admin_api_version = resp.u.device_attr.admin_api_version; + result->device_version = resp.u.device_attr.device_version; + result->supported_features = resp.u.device_attr.supported_features; + 
result->phys_addr_width = resp.u.device_attr.phys_addr_width; + result->virt_addr_width = resp.u.device_attr.virt_addr_width; + result->db_bar = resp.u.device_attr.db_bar; + result->max_rdma_size = resp.u.device_attr.max_rdma_size; + result->device_caps = resp.u.device_attr.device_caps; + + if (result->admin_api_version < 1) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get device attr api version [%u < 1]\n", + result->admin_api_version); + return -EINVAL; + } + + edev->supported_features = resp.u.device_attr.supported_features; + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_QUEUE_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get queue attributes %d\n", + err); + return err; + } + + result->max_qp = resp.u.queue_attr.max_qp; + result->max_sq_depth = resp.u.queue_attr.max_sq_depth; + result->max_rq_depth = resp.u.queue_attr.max_rq_depth; + result->max_cq = resp.u.queue_attr.max_cq; + result->max_cq_depth = resp.u.queue_attr.max_cq_depth; + result->inline_buf_size = resp.u.queue_attr.inline_buf_size; + result->max_sq_sge = resp.u.queue_attr.max_wr_send_sges; + result->max_rq_sge = resp.u.queue_attr.max_wr_recv_sges; + result->max_mr = resp.u.queue_attr.max_mr; + result->max_mr_pages = resp.u.queue_attr.max_mr_pages; + result->max_pd = resp.u.queue_attr.max_pd; + result->max_ah = resp.u.queue_attr.max_ah; + result->max_llq_size = resp.u.queue_attr.max_llq_size; + result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq; + result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges; + result->max_tx_batch = resp.u.queue_attr.max_tx_batch; + result->min_sq_depth = resp.u.queue_attr.min_sq_depth; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get network attributes %d\n", + err); + return err; + } + + memcpy(result->addr, resp.u.network_attr.addr, + sizeof(resp.u.network_attr.addr)); + result->mtu = resp.u.network_attr.mtu; + + if (efa_com_check_supported_feature_id(edev, + EFA_ADMIN_EVENT_QUEUE_ATTR)) { + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_EVENT_QUEUE_ATTR); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get event queue attributes %d\n", + err); + return err; + } + + result->max_eq = resp.u.event_queue_attr.max_eq; + result->max_eq_depth = resp.u.event_queue_attr.max_eq_depth; + result->event_bitmask = resp.u.event_queue_attr.event_bitmask; + } + + return 0; +} + +int efa_com_get_hw_hints(struct efa_com_dev *edev, + struct efa_com_get_hw_hints_result *result) +{ + struct efa_admin_get_feature_resp resp; + int err; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_HW_HINTS); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get hw hints %d\n", err); + return err; + } + + result->admin_completion_timeout = resp.u.hw_hints.admin_completion_timeout; + result->driver_watchdog_timeout = resp.u.hw_hints.driver_watchdog_timeout; + result->mmio_read_timeout = resp.u.hw_hints.mmio_read_timeout; + result->poll_interval = resp.u.hw_hints.poll_interval; + + return 0; +} + +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct efa_com_admin_queue *aq; + int err; + + if (!efa_com_check_supported_feature_id(edev, feature_id)) { + ibdev_err_ratelimited(edev->efa_dev, + "Feature %d isn't supported\n", + feature_id); + return 
-EOPNOTSUPP; + } + + aq = &edev->aq; + + set_cmd->aq_common_descriptor.opcode = EFA_ADMIN_SET_FEATURE; + if (control_buff_size) { + set_cmd->aq_common_descriptor.flags = 0; + EFA_SET(&set_cmd->aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + efa_com_set_dma_addr(control_buf_dma_addr, + &set_cmd->control_buffer.address.mem_addr_high, + &set_cmd->control_buffer.address.mem_addr_low); + } + + set_cmd->control_buffer.length = control_buff_size; + set_cmd->feature_common.feature_id = feature_id; + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)set_cmd, + sizeof(*set_cmd), + (struct efa_admin_acq_entry *)set_resp, + sizeof(*set_resp)); + + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to submit set_feature command %d error: %d\n", + feature_id, err); + return err; + } + + return 0; +} + +static int efa_com_set_feature(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id) +{ + return efa_com_set_feature_ex(edev, set_resp, set_cmd, feature_id, + 0, 0); +} + +int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups) +{ + struct efa_admin_get_feature_resp get_resp; + struct efa_admin_set_feature_resp set_resp; + struct efa_admin_set_feature_cmd cmd = {}; + int err; + + ibdev_dbg(edev->efa_dev, "Configuring aenq with groups[%#x]\n", groups); + + err = efa_com_get_feature(edev, &get_resp, EFA_ADMIN_AENQ_CONFIG); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get aenq attributes: %d\n", + err); + return err; + } + + ibdev_dbg(edev->efa_dev, + "Get aenq groups: supported[%#x] enabled[%#x]\n", + get_resp.u.aenq.supported_groups, + get_resp.u.aenq.enabled_groups); + + if ((get_resp.u.aenq.supported_groups & groups) != groups) { + ibdev_err_ratelimited( + edev->efa_dev, + "Trying to set unsupported aenq groups[%#x] supported[%#x]\n", + groups, get_resp.u.aenq.supported_groups); + return -EOPNOTSUPP; + } + + cmd.u.aenq.enabled_groups = groups; + err = efa_com_set_feature(edev, &set_resp, &cmd, + EFA_ADMIN_AENQ_CONFIG); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to set aenq attributes: %d\n", + err); + return err; + } + + return 0; +} + +int efa_com_alloc_pd(struct efa_com_dev *edev, + struct efa_com_alloc_pd_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_alloc_pd_cmd cmd = {}; + struct efa_admin_alloc_pd_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_PD; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to allocate pd[%d]\n", err); + return err; + } + + result->pdn = resp.pd; + + return 0; +} + +int efa_com_dealloc_pd(struct efa_com_dev *edev, + struct efa_com_dealloc_pd_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dealloc_pd_cmd cmd = {}; + struct efa_admin_dealloc_pd_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_PD; + cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to deallocate pd-%u [%d]\n", + cmd.pd, err); + return err; + } + + return 0; +} + +int efa_com_alloc_uar(struct efa_com_dev *edev, + struct efa_com_alloc_uar_result *result) +{ + struct 
efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_alloc_uar_cmd cmd = {}; + struct efa_admin_alloc_uar_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_UAR; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to allocate uar[%d]\n", err); + return err; + } + + result->uarn = resp.uar; + + return 0; +} + +int efa_com_dealloc_uar(struct efa_com_dev *edev, + struct efa_com_dealloc_uar_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dealloc_uar_cmd cmd = {}; + struct efa_admin_dealloc_uar_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_UAR; + cmd.uar = params->uarn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to deallocate uar-%u [%d]\n", + cmd.uar, err); + return err; + } + + return 0; +} + +int efa_com_get_stats(struct efa_com_dev *edev, + struct efa_com_get_stats_params *params, + union efa_com_get_stats_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_aq_get_stats_cmd cmd = {}; + struct efa_admin_acq_get_stats_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_STATS; + cmd.type = params->type; + cmd.scope = params->scope; + cmd.scope_modifier = params->scope_modifier; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get stats type-%u scope-%u.%u [%d]\n", + cmd.type, cmd.scope, cmd.scope_modifier, err); + return err; + } + + switch (cmd.type) { + case EFA_ADMIN_GET_STATS_TYPE_BASIC: + result->basic_stats.tx_bytes = resp.u.basic_stats.tx_bytes; + result->basic_stats.tx_pkts = resp.u.basic_stats.tx_pkts; + result->basic_stats.rx_bytes = resp.u.basic_stats.rx_bytes; + result->basic_stats.rx_pkts = resp.u.basic_stats.rx_pkts; + result->basic_stats.rx_drops = resp.u.basic_stats.rx_drops; + break; + case EFA_ADMIN_GET_STATS_TYPE_MESSAGES: + result->messages_stats.send_bytes = resp.u.messages_stats.send_bytes; + result->messages_stats.send_wrs = resp.u.messages_stats.send_wrs; + result->messages_stats.recv_bytes = resp.u.messages_stats.recv_bytes; + result->messages_stats.recv_wrs = resp.u.messages_stats.recv_wrs; + break; + case EFA_ADMIN_GET_STATS_TYPE_RDMA_READ: + result->rdma_read_stats.read_wrs = resp.u.rdma_read_stats.read_wrs; + result->rdma_read_stats.read_bytes = resp.u.rdma_read_stats.read_bytes; + result->rdma_read_stats.read_wr_err = resp.u.rdma_read_stats.read_wr_err; + result->rdma_read_stats.read_resp_bytes = resp.u.rdma_read_stats.read_resp_bytes; + break; + } + + return 0; +} diff --git a/drivers/amazon/net/efa/efa_com_cmd.h b/drivers/amazon/net/efa/efa_com_cmd.h new file mode 100644 index 0000000000000..0898ad5bc3405 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com_cmd.h @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_COM_CMD_H_ +#define _EFA_COM_CMD_H_ + +#include "efa_com.h" + +#define EFA_GID_SIZE 16 + +struct efa_com_create_qp_params { + u64 rq_base_addr; + u32 send_cq_idx; + u32 recv_cq_idx; + /* + * Send descriptor ring size in bytes, + * sufficient for user-provided number of WQEs and SGL size + */ + u32 sq_ring_size_in_bytes; + /* Max number of WQEs that will be posted on send queue */ + u32 sq_depth; + /* Recv descriptor ring size in bytes */ + u32 rq_ring_size_in_bytes; + u32 rq_depth; + u16 pd; + u16 uarn; + u8 qp_type; +}; + +struct efa_com_create_qp_result { + u32 qp_handle; + u32 qp_num; + u32 sq_db_offset; + u32 rq_db_offset; + u32 llq_descriptors_offset; + u16 send_sub_cq_idx; + u16 recv_sub_cq_idx; +}; + +struct efa_com_modify_qp_params { + u32 modify_mask; + u32 qp_handle; + u32 qp_state; + u32 cur_qp_state; + u32 qkey; + u32 sq_psn; + u8 sq_drained_async_notify; + u8 rnr_retry; +}; + +struct efa_com_query_qp_params { + u32 qp_handle; +}; + +struct efa_com_query_qp_result { + u32 qp_state; + u32 qkey; + u32 sq_draining; + u32 sq_psn; + u8 rnr_retry; +}; + +struct efa_com_destroy_qp_params { + u32 qp_handle; +}; + +struct efa_com_create_cq_params { + /* cq physical base address in OS memory */ + dma_addr_t dma_addr; + /* completion queue depth in # of entries */ + u16 cq_depth; + u16 num_sub_cqs; + u16 uarn; + u16 eqn; + u8 entry_size_in_bytes; + u8 interrupt_mode_enabled : 1; + u8 set_src_addr : 1; +}; + +struct efa_com_create_cq_result { + /* cq identifier */ + u16 cq_idx; + /* actual cq depth in # of entries */ + u16 actual_depth; + u32 db_off; + bool db_valid; +}; + +struct efa_com_destroy_cq_params { + u16 cq_idx; +}; + +struct efa_com_create_ah_params { + u16 pdn; + /* Destination address in network byte order */ + u8 dest_addr[EFA_GID_SIZE]; +}; + +struct efa_com_create_ah_result { + u16 ah; +}; + +struct efa_com_destroy_ah_params { + u16 ah; + u16 pdn; +}; + +struct efa_com_get_device_attr_result { + u8 addr[EFA_GID_SIZE]; + u64 page_size_cap; + u64 max_mr_pages; + u32 mtu; + u32 fw_version; + u32 admin_api_version; + u32 device_version; + u32 supported_features; + u32 phys_addr_width; + u32 virt_addr_width; + u32 max_qp; + u32 max_sq_depth; /* wqes */ + u32 max_rq_depth; /* wqes */ + u32 max_cq; + u32 max_cq_depth; /* cqes */ + u32 inline_buf_size; + u32 max_mr; + u32 max_pd; + u32 max_ah; + u32 max_llq_size; + u32 max_rdma_size; + u32 device_caps; + u32 max_eq; + u32 max_eq_depth; + u32 event_bitmask; /* EQ events bitmask */ + u16 sub_cqs_per_cq; + u16 max_sq_sge; + u16 max_rq_sge; + u16 max_wr_rdma_sge; + u16 max_tx_batch; + u16 min_sq_depth; + u8 db_bar; +}; + +struct efa_com_get_hw_hints_result { + u16 mmio_read_timeout; + u16 driver_watchdog_timeout; + u16 admin_completion_timeout; + u16 poll_interval; + u32 reserved[4]; +}; + +struct efa_com_mem_addr { + u32 mem_addr_low; + u32 mem_addr_high; +}; + +/* Used at indirect mode page list chunks for chaining */ +struct efa_com_ctrl_buff_info { + /* indicates length of the buffer pointed by control_buffer_address. */ + u32 length; + /* points to control buffer (direct or indirect) */ + struct efa_com_mem_addr address; +}; + +struct efa_com_reg_mr_params { + /* Memory region length, in bytes. */ + u64 mr_length_in_bytes; + /* IO Virtual Address associated with this MR. */ + u64 iova; + /* words 8:15: Physical Buffer List, each element is page-aligned. 
*/ + union { + /* + * Inline array of physical addresses of app pages + * (optimization for short region reservations) + */ + u64 inline_pbl_array[4]; + /* + * Describes the next physically contiguous chunk of indirect + * page list. A page list contains physical addresses of command + * data pages. Data pages are 4KB; page list chunks are + * variable-sized. + */ + struct efa_com_ctrl_buff_info pbl; + } pbl; + /* number of pages in PBL (redundant, could be calculated) */ + u32 page_num; + /* Protection Domain */ + u16 pd; + /* + * phys_page_size_shift - page size is (1 << phys_page_size_shift) + * Page size is used for building the Virtual to Physical + * address mapping + */ + u8 page_shift; + /* see permissions field of struct efa_admin_reg_mr_cmd */ + u8 permissions; + u8 inline_pbl; + u8 indirect; +}; + +struct efa_com_reg_mr_result { + /* + * To be used in conjunction with local buffers references in SQ and + * RQ WQE + */ + u32 l_key; + /* + * To be used in incoming RDMA semantics messages to refer to remotely + * accessed memory region + */ + u32 r_key; +}; + +struct efa_com_dereg_mr_params { + u32 l_key; +}; + +struct efa_com_alloc_pd_result { + u16 pdn; +}; + +struct efa_com_dealloc_pd_params { + u16 pdn; +}; + +struct efa_com_alloc_uar_result { + u16 uarn; +}; + +struct efa_com_dealloc_uar_params { + u16 uarn; +}; + +struct efa_com_get_stats_params { + /* see enum efa_admin_get_stats_type */ + u8 type; + /* see enum efa_admin_get_stats_scope */ + u8 scope; + u16 scope_modifier; +}; + +struct efa_com_basic_stats { + u64 tx_bytes; + u64 tx_pkts; + u64 rx_bytes; + u64 rx_pkts; + u64 rx_drops; +}; + +struct efa_com_messages_stats { + u64 send_bytes; + u64 send_wrs; + u64 recv_bytes; + u64 recv_wrs; +}; + +struct efa_com_rdma_read_stats { + u64 read_wrs; + u64 read_bytes; + u64 read_wr_err; + u64 read_resp_bytes; +}; + +union efa_com_get_stats_result { + struct efa_com_basic_stats basic_stats; + struct efa_com_messages_stats messages_stats; + struct efa_com_rdma_read_stats rdma_read_stats; +}; + +int efa_com_create_qp(struct efa_com_dev *edev, + struct efa_com_create_qp_params *params, + struct efa_com_create_qp_result *res); +int efa_com_modify_qp(struct efa_com_dev *edev, + struct efa_com_modify_qp_params *params); +int efa_com_query_qp(struct efa_com_dev *edev, + struct efa_com_query_qp_params *params, + struct efa_com_query_qp_result *result); +int efa_com_destroy_qp(struct efa_com_dev *edev, + struct efa_com_destroy_qp_params *params); +int efa_com_create_cq(struct efa_com_dev *edev, + struct efa_com_create_cq_params *params, + struct efa_com_create_cq_result *result); +int efa_com_destroy_cq(struct efa_com_dev *edev, + struct efa_com_destroy_cq_params *params); +int efa_com_register_mr(struct efa_com_dev *edev, + struct efa_com_reg_mr_params *params, + struct efa_com_reg_mr_result *result); +int efa_com_dereg_mr(struct efa_com_dev *edev, + struct efa_com_dereg_mr_params *params); +int efa_com_create_ah(struct efa_com_dev *edev, + struct efa_com_create_ah_params *params, + struct efa_com_create_ah_result *result); +int efa_com_destroy_ah(struct efa_com_dev *edev, + struct efa_com_destroy_ah_params *params); +int efa_com_get_device_attr(struct efa_com_dev *edev, + struct efa_com_get_device_attr_result *result); +int efa_com_get_hw_hints(struct efa_com_dev *edev, + struct efa_com_get_hw_hints_result *result); +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id); +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct 
efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size); +int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups); +int efa_com_alloc_pd(struct efa_com_dev *edev, + struct efa_com_alloc_pd_result *result); +int efa_com_dealloc_pd(struct efa_com_dev *edev, + struct efa_com_dealloc_pd_params *params); +int efa_com_alloc_uar(struct efa_com_dev *edev, + struct efa_com_alloc_uar_result *result); +int efa_com_dealloc_uar(struct efa_com_dev *edev, + struct efa_com_dealloc_uar_params *params); +int efa_com_get_stats(struct efa_com_dev *edev, + struct efa_com_get_stats_params *params, + union efa_com_get_stats_result *result); + +#endif /* _EFA_COM_CMD_H_ */ diff --git a/drivers/amazon/net/efa/efa_common_defs.h b/drivers/amazon/net/efa/efa_common_defs.h new file mode 100644 index 0000000000000..bbcf48f0eaca4 --- /dev/null +++ b/drivers/amazon/net/efa/efa_common_defs.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_COMMON_H_ +#define _EFA_COMMON_H_ + +#ifdef HAVE_BITFIELD_H +#include +#endif + +#define EFA_COMMON_SPEC_VERSION_MAJOR 2 +#define EFA_COMMON_SPEC_VERSION_MINOR 0 + +#define EFA_GET(ptr, mask) FIELD_GET(mask##_MASK, *(ptr)) + +#define EFA_SET(ptr, mask, value) \ + ({ \ + typeof(ptr) _ptr = ptr; \ + *_ptr = (*_ptr & ~(mask##_MASK)) | \ + FIELD_PREP(mask##_MASK, value); \ + }) + +struct efa_common_mem_addr { + u32 mem_addr_low; + + u32 mem_addr_high; +}; + +#endif /* _EFA_COMMON_H_ */ diff --git a/drivers/amazon/net/efa/efa_gdr.c b/drivers/amazon/net/efa/efa_gdr.c new file mode 100644 index 0000000000000..24f8a082d10d5 --- /dev/null +++ b/drivers/amazon/net/efa/efa_gdr.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include + +#include "efa_p2p.h" +#include "nv-p2p.h" + +#define GPU_PAGE_SHIFT 16 +#define GPU_PAGE_SIZE BIT_ULL(GPU_PAGE_SHIFT) + +struct efa_nvmem_ops { + int (*get_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, + u64 length, struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), void *data); + int (*dma_map_pages)(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + int (*put_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + int (*dma_unmap_pages)(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); +}; + +struct efa_nvmem { + struct efa_p2pmem p2pmem; + struct efa_nvmem_ops ops; + struct nvidia_p2p_page_table *pgtbl; + struct nvidia_p2p_dma_mapping *dma_mapping; + u64 virt_start; +}; + +static unsigned int nvmem_pgsz(struct efa_dev *dev, struct efa_p2pmem *p2pmem) +{ + struct efa_nvmem *nvmem; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + + switch (nvmem->pgtbl->page_size) { + case NVIDIA_P2P_PAGE_SIZE_4KB: + return SZ_4K; + case NVIDIA_P2P_PAGE_SIZE_64KB: + return SZ_64K; + case NVIDIA_P2P_PAGE_SIZE_128KB: + return SZ_128K; + default: + return 0; + } +} + +static int nvmem_get_fp(struct efa_nvmem *nvmem) +{ + nvmem->ops.get_pages = symbol_get(nvidia_p2p_get_pages); + if (!nvmem->ops.get_pages) + goto err_out; + + nvmem->ops.put_pages = symbol_get(nvidia_p2p_put_pages); + if (!nvmem->ops.put_pages) + goto err_put_get_pages; + + nvmem->ops.dma_map_pages = symbol_get(nvidia_p2p_dma_map_pages); + if (!nvmem->ops.dma_map_pages) + goto err_put_put_pages; + + nvmem->ops.dma_unmap_pages = symbol_get(nvidia_p2p_dma_unmap_pages); + if (!nvmem->ops.dma_unmap_pages) + goto err_put_dma_map_pages; + + return 0; + +err_put_dma_map_pages: + symbol_put(nvidia_p2p_dma_map_pages); +err_put_put_pages: + symbol_put(nvidia_p2p_put_pages); +err_put_get_pages: + symbol_put(nvidia_p2p_get_pages); +err_out: + return -EINVAL; +} + +static void nvmem_put_fp(void) +{ + symbol_put(nvidia_p2p_dma_unmap_pages); + symbol_put(nvidia_p2p_dma_map_pages); + symbol_put(nvidia_p2p_put_pages); + symbol_put(nvidia_p2p_get_pages); +} + +static void nvmem_free_cb(void *data) +{ + pr_debug("Free callback ticket %llu\n", (u64)data); + efa_p2p_put((u64)data, true); +} + +static int nvmem_get_pages(struct efa_dev *dev, struct efa_nvmem *nvmem, + u64 addr, u64 size, u64 ticket) +{ + int err; + + err = nvmem->ops.get_pages(0, 0, addr, size, &nvmem->pgtbl, + nvmem_free_cb, (void *)ticket); + if (err) { + ibdev_dbg(&dev->ibdev, "nvidia_p2p_get_pages failed %d\n", err); + return err; + } + + if (!NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(nvmem->pgtbl)) { + ibdev_dbg(&dev->ibdev, "Incompatible page table version %#08x\n", + nvmem->pgtbl->version); + nvmem->ops.put_pages(0, 0, addr, nvmem->pgtbl); + nvmem->pgtbl = NULL; + return -EINVAL; + } + + return 0; +} + +static int nvmem_dma_map(struct efa_dev *dev, struct efa_nvmem *nvmem) +{ + int err; + + err = nvmem->ops.dma_map_pages(dev->pdev, nvmem->pgtbl, + &nvmem->dma_mapping); + if (err) { + ibdev_dbg(&dev->ibdev, "nvidia_p2p_dma_map_pages failed %d\n", + err); + return err; + } + + if (!NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(nvmem->dma_mapping)) { + ibdev_dbg(&dev->ibdev, "Incompatible DMA mapping version %#08x\n", + nvmem->dma_mapping->version); + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + nvmem->dma_mapping = NULL; + 
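+		/* Incompatible mapping was already unmapped above; the caller's error path (nvmem_get) releases the pinned page table. */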
return -EINVAL; + } + + return 0; +} + +static struct efa_p2pmem *nvmem_get(struct efa_dev *dev, u64 ticket, u64 start, + u64 length) +{ + struct efa_nvmem *nvmem; + u64 virt_start; + u64 virt_end; + u64 pinsz; + int err; + + nvmem = kzalloc(sizeof(*nvmem), GFP_KERNEL); + if (!nvmem) + return NULL; + + virt_start = ALIGN_DOWN(start, GPU_PAGE_SIZE); + virt_end = ALIGN(start + length, GPU_PAGE_SIZE); + pinsz = virt_end - virt_start; + nvmem->virt_start = virt_start; + + err = nvmem_get_fp(nvmem); + if (err) + /* Nvidia module is not loaded */ + goto err_free; + + err = nvmem_get_pages(dev, nvmem, virt_start, pinsz, ticket); + if (err) + /* Most likely not our pages */ + goto err_put_fp; + + err = nvmem_dma_map(dev, nvmem); + if (err) + goto err_put; + + return &nvmem->p2pmem; + +err_put: + nvmem->ops.put_pages(0, 0, virt_start, nvmem->pgtbl); +err_put_fp: + nvmem_put_fp(); +err_free: + kfree(nvmem); + return NULL; +} + +static int nvmem_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + struct nvidia_p2p_dma_mapping *dma_mapping; + struct efa_nvmem *nvmem; + int i; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + dma_mapping = nvmem->dma_mapping; + + for (i = 0; i < dma_mapping->entries; i++) + page_list[i] = dma_mapping->dma_addresses[i]; + + return 0; +} + +static void nvmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb) +{ + struct efa_nvmem *nvmem; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + + if (!in_cb) { + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + nvmem->ops.put_pages(0, 0, nvmem->virt_start, nvmem->pgtbl); + } + + nvmem_put_fp(); + kfree(nvmem); +} + +bool nvmem_is_supported(void) +{ + struct efa_nvmem dummynv = {}; + + if (nvmem_get_fp(&dummynv)) + return false; + nvmem_put_fp(); + + return true; +} + +struct nvmem_provider { + struct efa_p2p_provider p2p; +}; + +static const struct nvmem_provider prov = { + .p2p = { + .ops = { + .try_get = nvmem_get, + .to_page_list = nvmem_to_page_list, + .release = nvmem_release, + .get_page_size = nvmem_pgsz, + }, + .type = EFA_P2P_PROVIDER_NVMEM, + }, +}; + +const struct efa_p2p_provider *nvmem_get_provider(void) +{ + return &prov.p2p; +} diff --git a/drivers/amazon/net/efa/efa_io_defs.h b/drivers/amazon/net/efa/efa_io_defs.h new file mode 100644 index 0000000000000..17ba8984b11e9 --- /dev/null +++ b/drivers/amazon/net/efa/efa_io_defs.h @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_IO_H_ +#define _EFA_IO_H_ + +#define EFA_IO_TX_DESC_NUM_BUFS 2 +#define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 +#define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 +#define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 + +enum efa_io_queue_type { + /* send queue (of a QP) */ + EFA_IO_SEND_QUEUE = 1, + /* recv queue (of a QP) */ + EFA_IO_RECV_QUEUE = 2, +}; + +enum efa_io_send_op_type { + /* send message */ + EFA_IO_SEND = 0, + /* RDMA read */ + EFA_IO_RDMA_READ = 1, +}; + +enum efa_io_comp_status { + /* Successful completion */ + EFA_IO_COMP_STATUS_OK = 0, + /* Flushed during QP destroy */ + EFA_IO_COMP_STATUS_FLUSHED = 1, + /* Internal QP error */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, + /* Bad operation type */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, + /* Bad AH */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, + /* LKEY not registered or does not match IOVA */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, + /* Message too long */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, + /* Destination ENI is down or does not run EFA */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, + /* Connection was reset by remote side */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, + /* Bad dest QP number (QP does not exist or is in error state) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN = 9, + /* Destination resource not ready (no WQEs posted on RQ) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR = 10, + /* Receiver SGL too short */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, + /* Unexpected status returned by responder */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, + /* Unresponsive remote - detected locally */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE = 13, +}; + +struct efa_io_tx_meta_desc { + /* Verbs-generated Request ID */ + u16 req_id; + + /* + * control flags + * 3:0 : op_type - operation type: send/rdma/fast mem + * ops/etc + * 4 : has_imm - immediate_data field carries valid + * data. + * 5 : inline_msg - inline mode - inline message data + * follows this descriptor (no buffer descriptors). + * Note that it is different from immediate data + * 6 : meta_extension - Extended metadata. MBZ + * 7 : meta_desc - Indicates metadata descriptor. + * Must be set. + */ + u8 ctrl1; + + /* + * control flags + * 0 : phase + * 1 : reserved25 - MBZ + * 2 : first - Indicates first descriptor in + * transaction. Must be set. + * 3 : last - Indicates last descriptor in + * transaction. Must be set. + * 4 : comp_req - Indicates whether completion should + * be posted, after packet is transmitted. Valid only + * for the first descriptor + * 7:5 : reserved29 - MBZ + */ + u8 ctrl2; + + u16 dest_qp_num; + + /* + * If inline_msg bit is set, length of inline message in bytes, + * otherwise length of SGL (number of buffers). + */ + u16 length; + + /* + * immediate data: if has_imm is set, then this field is included + * within Tx message and reported in remote Rx completion. + */ + u32 immediate_data; + + u16 ah; + + u16 reserved; + + /* Queue key */ + u32 qkey; + + u8 reserved2[12]; +}; + +/* + * Tx queue buffer descriptor, for any transport type. Preceded by metadata + * descriptor. 
+ */ +struct efa_io_tx_buf_desc { + /* length in bytes */ + u32 length; + + /* + * 23:0 : lkey - local memory translation key + * 31:24 : reserved - MBZ + */ + u32 lkey; + + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer address bits[63:32] */ + u32 buf_addr_hi; +}; + +struct efa_io_remote_mem_addr { + /* length in bytes */ + u32 length; + + /* remote memory translation key */ + u32 rkey; + + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer address bits[63:32] */ + u32 buf_addr_hi; +}; + +struct efa_io_rdma_req { + /* Remote memory address */ + struct efa_io_remote_mem_addr remote_mem; + + /* Local memory address */ + struct efa_io_tx_buf_desc local_mem[1]; +}; + +/* + * Tx WQE, composed of tx meta descriptors followed by either tx buffer + * descriptors or inline data + */ +struct efa_io_tx_wqe { + /* TX meta */ + struct efa_io_tx_meta_desc meta; + + union { + /* Send buffer descriptors */ + struct efa_io_tx_buf_desc sgl[2]; + + u8 inline_data[32]; + + /* RDMA local and remote memory addresses */ + struct efa_io_rdma_req rdma_req; + } data; +}; + +/* + * Rx buffer descriptor; RX WQE is composed of one or more RX buffer + * descriptors. + */ +struct efa_io_rx_desc { + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer Pointer[63:32] */ + u32 buf_addr_hi; + + /* Verbs-generated request id. */ + u16 req_id; + + /* Length in bytes. */ + u16 length; + + /* + * LKey and control flags + * 23:0 : lkey + * 29:24 : reserved - MBZ + * 30 : first - Indicates first descriptor in WQE + * 31 : last - Indicates last descriptor in WQE + */ + u32 lkey_ctrl; +}; + +/* Common IO completion descriptor */ +struct efa_io_cdesc_common { + /* + * verbs-generated request ID, as provided in the completed tx or rx + * descriptor. + */ + u16 req_id; + + u8 status; + + /* + * flags + * 0 : phase - Phase bit + * 2:1 : q_type - enum efa_io_queue_type: send/recv + * 3 : has_imm - indicates that immediate data is + * present - for RX completions only + * 7:4 : reserved28 - MBZ + */ + u8 flags; + + /* local QP number */ + u16 qp_num; + + /* Transferred length */ + u16 length; +}; + +/* Tx completion descriptor */ +struct efa_io_tx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; +}; + +/* Rx Completion Descriptor */ +struct efa_io_rx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; + + /* Remote Address Handle FW index, 0xFFFF indicates invalid ah */ + u16 ah; + + u16 src_qp_num; + + /* Immediate data */ + u32 imm; +}; + +/* Extended Rx Completion Descriptor */ +struct efa_io_rx_cdesc_ex { + /* Base RX completion info */ + struct efa_io_rx_cdesc rx_cdesc_base; + + /* + * Valid only in case of unknown AH (0xFFFF) and CQ set_src_addr is + * enabled. 
+ */ + u8 src_addr[16]; +}; + +/* tx_meta_desc */ +#define EFA_IO_TX_META_DESC_OP_TYPE_MASK GENMASK(3, 0) +#define EFA_IO_TX_META_DESC_HAS_IMM_MASK BIT(4) +#define EFA_IO_TX_META_DESC_INLINE_MSG_MASK BIT(5) +#define EFA_IO_TX_META_DESC_META_EXTENSION_MASK BIT(6) +#define EFA_IO_TX_META_DESC_META_DESC_MASK BIT(7) +#define EFA_IO_TX_META_DESC_PHASE_MASK BIT(0) +#define EFA_IO_TX_META_DESC_FIRST_MASK BIT(2) +#define EFA_IO_TX_META_DESC_LAST_MASK BIT(3) +#define EFA_IO_TX_META_DESC_COMP_REQ_MASK BIT(4) + +/* tx_buf_desc */ +#define EFA_IO_TX_BUF_DESC_LKEY_MASK GENMASK(23, 0) + +/* rx_desc */ +#define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0) +#define EFA_IO_RX_DESC_FIRST_MASK BIT(30) +#define EFA_IO_RX_DESC_LAST_MASK BIT(31) + +/* cdesc_common */ +#define EFA_IO_CDESC_COMMON_PHASE_MASK BIT(0) +#define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) +#define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) + +#endif /* _EFA_IO_H_ */ diff --git a/drivers/amazon/net/efa/efa_main.c b/drivers/amazon/net/efa/efa_main.c new file mode 100644 index 0000000000000..34a8e13273556 --- /dev/null +++ b/drivers/amazon/net/efa/efa_main.c @@ -0,0 +1,889 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include + +#include + +#include "efa.h" +#include "efa_sysfs.h" + +#ifdef HAVE_EFA_P2P +#include "efa_p2p.h" +#endif + +#ifndef HAVE_PCI_VENDOR_ID_AMAZON +#define PCI_VENDOR_ID_AMAZON 0x1d0f +#endif +#define PCI_DEV_ID_EFA0_VF 0xefa0 +#define PCI_DEV_ID_EFA1_VF 0xefa1 +#define PCI_DEV_ID_EFA2_VF 0xefa2 + +static const struct pci_device_id efa_pci_tbl[] = { + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA2_VF) }, + { } +}; + +#define DRV_MODULE_VER_MAJOR 2 +#define DRV_MODULE_VER_MINOR 1 +#define DRV_MODULE_VER_SUBMINOR 0 + +#ifndef DRV_MODULE_VERSION +#define DRV_MODULE_VERSION \ + __stringify(DRV_MODULE_VER_MAJOR) "." \ + __stringify(DRV_MODULE_VER_MINOR) "." \ + __stringify(DRV_MODULE_VER_SUBMINOR) "g" +#endif + +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_SOFTDEP("pre: ib_uverbs"); + +static char version[] = DEVICE_NAME " v" DRV_MODULE_VERSION; + +MODULE_AUTHOR("Amazon.com, Inc. 
or its affiliates"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION(DEVICE_NAME); +MODULE_DEVICE_TABLE(pci, efa_pci_tbl); + +#define EFA_REG_BAR 0 +#define EFA_MEM_BAR 2 +#define EFA_BASE_BAR_MASK (BIT(EFA_REG_BAR) | BIT(EFA_MEM_BAR)) + +#define EFA_AENQ_ENABLED_GROUPS \ + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) + +/* This handler will called for unknown event group or unimplemented handlers */ +static void unimplemented_aenq_handler(void *data, + struct efa_admin_aenq_entry *aenq_e) +{ + struct efa_dev *dev = (struct efa_dev *)data; + + ibdev_err(&dev->ibdev, + "Unknown event was received or event with unimplemented handler\n"); +} + +static void efa_keep_alive(void *data, struct efa_admin_aenq_entry *aenq_e) +{ + struct efa_dev *dev = (struct efa_dev *)data; + + atomic64_inc(&dev->stats.keep_alive_rcvd); +} + +static struct efa_aenq_handlers aenq_handlers = { + .handlers = { + [EFA_ADMIN_KEEP_ALIVE] = efa_keep_alive, + }, + .unimplemented_handler = unimplemented_aenq_handler +}; + +static void efa_release_bars(struct efa_dev *dev, int bars_mask) +{ + struct pci_dev *pdev = dev->pdev; + int release_bars; + + release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & bars_mask; + pci_release_selected_regions(pdev, release_bars); +} + +static void efa_process_comp_eqe(struct efa_dev *dev, struct efa_admin_eqe *eqe) +{ + u16 cqn = eqe->u.comp_event.cqn; + struct efa_cq *cq; + +#ifdef HAVE_XARRAY + /* Safe to load as we're in irq and removal calls synchronize_irq() */ + cq = xa_load(&dev->cqs_xa, cqn); +#else + cq = dev->cqs_arr[cqn]; +#endif + if (unlikely(!cq)) { + ibdev_err_ratelimited(&dev->ibdev, + "Completion event on non-existent CQ[%u]", + cqn); + return; + } + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +static void efa_process_eqe(struct efa_com_eq *eeq, struct efa_admin_eqe *eqe) +{ + struct efa_dev *dev = container_of(eeq->edev, struct efa_dev, edev); + + if (likely(EFA_GET(&eqe->common, EFA_ADMIN_EQE_EVENT_TYPE) == + EFA_ADMIN_EQE_EVENT_TYPE_COMPLETION)) + efa_process_comp_eqe(dev, eqe); + else + ibdev_err_ratelimited(&dev->ibdev, + "Unknown event type received %lu", + EFA_GET(&eqe->common, + EFA_ADMIN_EQE_EVENT_TYPE)); +} + +static irqreturn_t efa_intr_msix_comp(int irq, void *data) +{ + struct efa_eq *eq = data; + struct efa_com_dev *edev = eq->eeq.edev; + + efa_com_eq_comp_intr_handler(edev, &eq->eeq); + + return IRQ_HANDLED; +} + +static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data) +{ + struct efa_dev *dev = data; + + efa_com_admin_q_comp_intr_handler(&dev->edev); + efa_com_aenq_intr_handler(&dev->edev, data); + + return IRQ_HANDLED; +} + +static int efa_request_irq(struct efa_dev *dev, struct efa_irq *irq) +{ + int err; + + err = request_irq(irq->irqn, irq->handler, 0, irq->name, irq->data); + if (err) { + dev_err(&dev->pdev->dev, "Failed to request irq %s (%d)\n", + irq->name, err); + return err; + } + + irq_set_affinity_hint(irq->irqn, &irq->affinity_hint_mask); + + return 0; +} + +static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, + int vector) +{ + u32 cpu; + + cpu = vector - EFA_COMP_EQS_VEC_BASE; + snprintf(eq->irq.name, EFA_IRQNAME_SIZE, "efa-comp%d@pci:%s", cpu, + pci_name(dev->pdev)); + eq->irq.handler = efa_intr_msix_comp; + eq->irq.data = eq; + eq->irq.vector = vector; + eq->irq.irqn = pci_irq_vector(dev->pdev, vector); + cpumask_set_cpu(cpu, &eq->irq.affinity_hint_mask); +} + +static void efa_free_irq(struct efa_dev *dev, struct efa_irq 
*irq) +{ + irq_set_affinity_hint(irq->irqn, NULL); + free_irq(irq->irqn, irq->data); +} + +static void efa_setup_mgmnt_irq(struct efa_dev *dev) +{ + u32 cpu; + + snprintf(dev->admin_irq.name, EFA_IRQNAME_SIZE, + "efa-mgmnt@pci:%s", pci_name(dev->pdev)); + dev->admin_irq.handler = efa_intr_msix_mgmnt; + dev->admin_irq.data = dev; + dev->admin_irq.vector = dev->admin_msix_vector_idx; + dev->admin_irq.irqn = pci_irq_vector(dev->pdev, + dev->admin_msix_vector_idx); + cpu = cpumask_first(cpu_online_mask); + cpumask_set_cpu(cpu, + &dev->admin_irq.affinity_hint_mask); + dev_info(&dev->pdev->dev, "Setup irq:%d name:%s\n", + dev->admin_irq.irqn, + dev->admin_irq.name); +} + +static int efa_set_mgmnt_irq(struct efa_dev *dev) +{ + efa_setup_mgmnt_irq(dev); + + return efa_request_irq(dev, &dev->admin_irq); +} + +static int efa_request_doorbell_bar(struct efa_dev *dev) +{ + u8 db_bar_idx = dev->dev_attr.db_bar; + struct pci_dev *pdev = dev->pdev; + int bars; + int err; + + if (!(BIT(db_bar_idx) & EFA_BASE_BAR_MASK)) { + bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar_idx); + + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (err) { + dev_err(&dev->pdev->dev, + "pci_request_selected_regions for bar %d failed %d\n", + db_bar_idx, err); + return err; + } + } + + dev->db_bar_addr = pci_resource_start(dev->pdev, db_bar_idx); + dev->db_bar_len = pci_resource_len(dev->pdev, db_bar_idx); + + return 0; +} + +static void efa_release_doorbell_bar(struct efa_dev *dev) +{ + if (!(BIT(dev->dev_attr.db_bar) & EFA_BASE_BAR_MASK)) + efa_release_bars(dev, BIT(dev->dev_attr.db_bar)); +} + +static void efa_update_hw_hints(struct efa_dev *dev, + struct efa_com_get_hw_hints_result *hw_hints) +{ + struct efa_com_dev *edev = &dev->edev; + + if (hw_hints->mmio_read_timeout) + edev->mmio_read.mmio_read_timeout = + hw_hints->mmio_read_timeout * 1000; + + if (hw_hints->poll_interval) + edev->aq.poll_interval = hw_hints->poll_interval; + + if (hw_hints->admin_completion_timeout) + edev->aq.completion_timeout = + hw_hints->admin_completion_timeout; +} + +static void efa_stats_init(struct efa_dev *dev) +{ + atomic64_t *s = (atomic64_t *)&dev->stats; + int i; + + for (i = 0; i < sizeof(dev->stats) / sizeof(*s); i++, s++) + atomic64_set(s, 0); +} + +static void efa_set_host_info(struct efa_dev *dev) +{ + struct efa_admin_set_feature_resp resp = {}; + struct efa_admin_set_feature_cmd cmd = {}; + struct efa_admin_host_info *hinf; + u32 bufsz = sizeof(*hinf); + dma_addr_t hinf_dma; + + if (!efa_com_check_supported_feature_id(&dev->edev, + EFA_ADMIN_HOST_INFO)) + return; + + /* Failures in host info set shall not disturb probe */ + hinf = dma_alloc_coherent(&dev->pdev->dev, bufsz, &hinf_dma, + GFP_KERNEL); + if (!hinf) + return; + + strscpy(hinf->os_dist_str, utsname()->release, + sizeof(hinf->os_dist_str)); + hinf->os_type = EFA_ADMIN_OS_LINUX; + strscpy(hinf->kernel_ver_str, utsname()->version, + sizeof(hinf->kernel_ver_str)); + hinf->kernel_ver = LINUX_VERSION_CODE; + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MAJOR, + DRV_MODULE_VER_MAJOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MINOR, + DRV_MODULE_VER_MINOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR, + DRV_MODULE_VER_SUBMINOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE, + "g"[0]); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_BUS, dev->pdev->bus->number); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_DEVICE, + PCI_SLOT(dev->pdev->devfn)); + EFA_SET(&hinf->bdf, 
EFA_ADMIN_HOST_INFO_FUNCTION, + PCI_FUNC(dev->pdev->devfn)); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MAJOR, + EFA_COMMON_SPEC_VERSION_MAJOR); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MINOR, + EFA_COMMON_SPEC_VERSION_MINOR); +#ifdef HAVE_EFA_P2P + EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_GDR, 1); +#endif + + efa_com_set_feature_ex(&dev->edev, &resp, &cmd, EFA_ADMIN_HOST_INFO, + hinf_dma, bufsz); + + dma_free_coherent(&dev->pdev->dev, bufsz, hinf, hinf_dma); +} + +static void efa_destroy_eq(struct efa_dev *dev, struct efa_eq *eq) +{ + efa_com_eq_destroy(&dev->edev, &eq->eeq); + efa_free_irq(dev, &eq->irq); +} + +static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u8 msix_vec) +{ + int err; + + efa_setup_comp_irq(dev, eq, msix_vec); + err = efa_request_irq(dev, &eq->irq); + if (err) + return err; + + err = efa_com_eq_init(&dev->edev, &eq->eeq, efa_process_eqe, + dev->dev_attr.max_eq_depth, msix_vec); + if (err) + goto err_free_comp_irq; + + return 0; + +err_free_comp_irq: + efa_free_irq(dev, &eq->irq); + return err; +} + +static int efa_create_eqs(struct efa_dev *dev) +{ + unsigned int neqs = dev->dev_attr.max_eq; + int err; + int i; + + neqs = min_t(unsigned int, neqs, num_online_cpus()); + dev->neqs = neqs; + dev->eqs = kcalloc(neqs, sizeof(*dev->eqs), GFP_KERNEL); + if (!dev->eqs) + return -ENOMEM; + + for (i = 0; i < neqs; i++) { + err = efa_create_eq(dev, &dev->eqs[i], + i + EFA_COMP_EQS_VEC_BASE); + if (err) + goto err_destroy_eqs; + } + + return 0; + +err_destroy_eqs: + for (i--; i >= 0; i--) + efa_destroy_eq(dev, &dev->eqs[i]); + kfree(dev->eqs); + + return err; +} + +static void efa_destroy_eqs(struct efa_dev *dev) +{ + int i; + + for (i = 0; i < dev->neqs; i++) + efa_destroy_eq(dev, &dev->eqs[i]); + + kfree(dev->eqs); +} + +#ifdef HAVE_IB_DEV_OPS +static const struct ib_device_ops efa_dev_ops = { +#ifdef HAVE_IB_DEVICE_OPS_COMMON + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_EFA, + .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, +#endif + +#ifdef HAVE_SPLIT_STATS_ALLOC + .alloc_hw_port_stats = efa_alloc_hw_port_stats, + .alloc_hw_device_stats = efa_alloc_hw_device_stats, +#else + .alloc_hw_stats = efa_alloc_hw_stats, +#endif +#ifdef HAVE_PD_CORE_ALLOCATION + .alloc_pd = efa_alloc_pd, +#else + .alloc_pd = efa_kzalloc_pd, +#endif +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION + .alloc_ucontext = efa_alloc_ucontext, +#else + .alloc_ucontext = efa_kzalloc_ucontext, +#endif +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED +#ifdef HAVE_AH_CORE_ALLOCATION + .create_ah = efa_create_ah, +#else + .create_ah = efa_kzalloc_ah, +#endif +#endif +#ifdef HAVE_CQ_CORE_ALLOCATION + .create_cq = efa_create_cq, +#else + .create_cq = efa_kzalloc_cq, +#endif +#ifdef HAVE_QP_CORE_ALLOCATION + .create_qp = efa_create_qp, +#else + .create_qp = efa_kzalloc_qp, +#endif +#ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + .create_user_ah = efa_create_ah, +#endif + .dealloc_pd = efa_dealloc_pd, + .dealloc_ucontext = efa_dealloc_ucontext, + .dereg_mr = efa_dereg_mr, + .destroy_ah = efa_destroy_ah, + .destroy_cq = efa_destroy_cq, + .destroy_qp = efa_destroy_qp, +#ifndef HAVE_NO_KVERBS_DRIVERS + .get_dma_mr = efa_get_dma_mr, +#endif + .get_hw_stats = efa_get_hw_stats, + .get_link_layer = efa_port_link_layer, + .get_port_immutable = efa_get_port_immutable, + .mmap = efa_mmap, +#ifdef HAVE_CORE_MMAP_XA + .mmap_free = efa_mmap_free, +#endif + .modify_qp = efa_modify_qp, +#ifndef HAVE_NO_KVERBS_DRIVERS + .poll_cq = efa_poll_cq, + .post_recv = efa_post_recv, + .post_send = efa_post_send, +#endif + .query_device 
= efa_query_device, + .query_gid = efa_query_gid, + .query_pkey = efa_query_pkey, + .query_port = efa_query_port, + .query_qp = efa_query_qp, + .reg_user_mr = efa_reg_mr, +#ifdef HAVE_MR_DMABUF + .reg_user_mr_dmabuf = efa_reg_user_mr_dmabuf, +#endif +#ifndef HAVE_NO_KVERBS_DRIVERS + .req_notify_cq = efa_req_notify_cq, +#endif + +#ifdef HAVE_AH_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah), +#endif +#ifdef HAVE_CQ_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq), +#endif +#ifdef HAVE_PD_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd), +#endif +#ifdef HAVE_QP_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_qp, efa_qp, ibqp), +#endif +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext), +#endif +}; +#endif + +static int efa_ib_device_add(struct efa_dev *dev) +{ + struct efa_com_get_hw_hints_result hw_hints; + struct pci_dev *pdev = dev->pdev; + int err; + + efa_stats_init(dev); + + err = efa_com_get_device_attr(&dev->edev, &dev->dev_attr); + if (err) + return err; + + dev_dbg(&dev->pdev->dev, "Doorbells bar (%d)\n", dev->dev_attr.db_bar); + err = efa_request_doorbell_bar(dev); + if (err) + return err; + + err = efa_com_get_hw_hints(&dev->edev, &hw_hints); + if (err) + goto err_release_doorbell_bar; + + efa_update_hw_hints(dev, &hw_hints); + + /* Try to enable all the available aenq groups */ + err = efa_com_set_aenq_config(&dev->edev, EFA_AENQ_ENABLED_GROUPS); + if (err) + goto err_release_doorbell_bar; + + err = efa_create_eqs(dev); + if (err) + goto err_release_doorbell_bar; + + efa_set_host_info(dev); + + dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = dev->neqs ?: 1; +#ifdef HAVE_DEV_PARENT + dev->ibdev.dev.parent = &pdev->dev; +#else + dev->ibdev.dma_device = &pdev->dev; +#endif + +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + dev->ibdev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH); +#endif + +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + dev->ibdev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); +#endif + +#ifndef HAVE_IB_DEVICE_OPS_COMMON +#ifdef HAVE_DRIVER_ID + dev->ibdev.driver_id = RDMA_DRIVER_EFA; +#endif + dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION; + dev->ibdev.owner = THIS_MODULE; +#endif +#ifdef HAVE_IB_DEV_OPS + ib_set_device_ops(&dev->ibdev, &efa_dev_ops); +#else + dev->ibdev.alloc_hw_stats = efa_alloc_hw_stats; + dev->ibdev.alloc_pd = efa_kzalloc_pd; + dev->ibdev.alloc_ucontext = efa_kzalloc_ucontext; + dev->ibdev.create_ah = efa_kzalloc_ah; + dev->ibdev.create_cq = efa_kzalloc_cq; + dev->ibdev.create_qp = efa_kzalloc_qp; + dev->ibdev.dealloc_pd = efa_dealloc_pd; + dev->ibdev.dealloc_ucontext = efa_dealloc_ucontext; + dev->ibdev.dereg_mr = efa_dereg_mr; + dev->ibdev.destroy_ah = efa_destroy_ah; + dev->ibdev.destroy_cq = efa_destroy_cq; + dev->ibdev.destroy_qp = efa_destroy_qp; + 
dev->ibdev.get_dma_mr = efa_get_dma_mr; + dev->ibdev.get_hw_stats = efa_get_hw_stats; + dev->ibdev.get_link_layer = efa_port_link_layer; + dev->ibdev.get_port_immutable = efa_get_port_immutable; + dev->ibdev.mmap = efa_mmap; + dev->ibdev.modify_qp = efa_modify_qp; + dev->ibdev.poll_cq = efa_poll_cq; + dev->ibdev.post_recv = efa_post_recv; + dev->ibdev.post_send = efa_post_send; + dev->ibdev.query_device = efa_query_device; + dev->ibdev.query_gid = efa_query_gid; + dev->ibdev.query_pkey = efa_query_pkey; + dev->ibdev.query_port = efa_query_port; + dev->ibdev.query_qp = efa_query_qp; + dev->ibdev.reg_user_mr = efa_reg_mr; + dev->ibdev.req_notify_cq = efa_req_notify_cq; +#endif + +#ifdef HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM + err = ib_register_device(&dev->ibdev, "efa_%d", &pdev->dev); +#elif defined(HAVE_IB_REGISTER_DEVICE_TWO_PARAMS) + err = ib_register_device(&dev->ibdev, "efa_%d"); +#elif defined(HAVE_IB_REGISTER_DEVICE_NAME_PARAM) + err = ib_register_device(&dev->ibdev, "efa_%d", NULL); +#else + strscpy(dev->ibdev.name, "efa_%d", + sizeof(dev->ibdev.name)); + + err = ib_register_device(&dev->ibdev, NULL); +#endif + if (err) + goto err_destroy_eqs; + + ibdev_info(&dev->ibdev, "IB device registered\n"); + + return 0; + +err_destroy_eqs: + efa_destroy_eqs(dev); +err_release_doorbell_bar: + efa_release_doorbell_bar(dev); + return err; +} + +static void efa_ib_device_remove(struct efa_dev *dev) +{ + ibdev_info(&dev->ibdev, "Unregister ib device\n"); + ib_unregister_device(&dev->ibdev); + efa_destroy_eqs(dev); + efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); + efa_release_doorbell_bar(dev); +} + +static void efa_disable_msix(struct efa_dev *dev) +{ + pci_free_irq_vectors(dev->pdev); +} + +static int efa_enable_msix(struct efa_dev *dev) +{ + int msix_vecs, irq_num; + + /* + * Reserve the max msix vectors we might need, one vector is reserved + * for admin. + */ + msix_vecs = min_t(int, pci_msix_vec_count(dev->pdev), + num_online_cpus() + 1); + dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n", + msix_vecs); + + dev->admin_msix_vector_idx = EFA_MGMNT_MSIX_VEC_IDX; + irq_num = pci_alloc_irq_vectors(dev->pdev, msix_vecs, + msix_vecs, PCI_IRQ_MSIX); + + if (irq_num < 0) { + dev_err(&dev->pdev->dev, "Failed to enable MSI-X. 
irq_num %d\n", + irq_num); + return -ENOSPC; + } + + if (irq_num != msix_vecs) { + efa_disable_msix(dev); + dev_err(&dev->pdev->dev, + "Allocated %d MSI-X (out of %d requested)\n", + irq_num, msix_vecs); + return -ENOSPC; + } + + return 0; +} + +static int efa_device_init(struct efa_com_dev *edev, struct pci_dev *pdev) +{ + int dma_width; + int err; + + err = efa_com_dev_reset(edev, EFA_REGS_RESET_NORMAL); + if (err) + return err; + + err = efa_com_validate_version(edev); + if (err) + return err; + + dma_width = efa_com_get_dma_width(edev); + if (dma_width < 0) { + err = dma_width; + return err; + } + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(dma_width)); + if (err) { + dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", err); + return err; + } + + dma_set_max_seg_size(&pdev->dev, UINT_MAX); + return 0; +} + +static struct efa_dev *efa_probe_device(struct pci_dev *pdev) +{ + struct efa_com_dev *edev; + struct efa_dev *dev; + int bars; + int err; + + err = pci_enable_device_mem(pdev); + if (err) { + dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n"); + return ERR_PTR(err); + } + + pci_set_master(pdev); + +#ifdef HAVE_SAFE_IB_ALLOC_DEVICE + dev = ib_alloc_device(efa_dev, ibdev); +#else + dev = (struct efa_dev *)ib_alloc_device(sizeof(*dev)); +#endif + if (!dev) { + dev_err(&pdev->dev, "Device alloc failed\n"); + err = -ENOMEM; + goto err_disable_device; + } + + pci_set_drvdata(pdev, dev); + edev = &dev->edev; + edev->efa_dev = dev; + edev->dmadev = &pdev->dev; + dev->pdev = pdev; +#ifdef HAVE_XARRAY + xa_init(&dev->cqs_xa); +#else + memset(dev->cqs_arr, 0, sizeof(dev->cqs_arr)); +#endif + + bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (err) { + dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", + err); + goto err_ibdev_destroy; + } + + dev->reg_bar_addr = pci_resource_start(pdev, EFA_REG_BAR); + dev->reg_bar_len = pci_resource_len(pdev, EFA_REG_BAR); + dev->mem_bar_addr = pci_resource_start(pdev, EFA_MEM_BAR); + dev->mem_bar_len = pci_resource_len(pdev, EFA_MEM_BAR); + + edev->reg_bar = devm_ioremap(&pdev->dev, + dev->reg_bar_addr, + dev->reg_bar_len); + if (!edev->reg_bar) { + dev_err(&pdev->dev, "Failed to remap register bar\n"); + err = -EFAULT; + goto err_release_bars; + } + + err = efa_com_mmio_reg_read_init(edev); + if (err) { + dev_err(&pdev->dev, "Failed to init readless MMIO\n"); + goto err_iounmap; + } + + err = efa_device_init(edev, pdev); + if (err) { + dev_err(&pdev->dev, "EFA device init failed\n"); + if (err == -ETIME) + err = -EPROBE_DEFER; + goto err_reg_read_destroy; + } + + err = efa_enable_msix(dev); + if (err) + goto err_reg_read_destroy; + + edev->aq.msix_vector_idx = dev->admin_msix_vector_idx; + edev->aenq.msix_vector_idx = dev->admin_msix_vector_idx; + + err = efa_set_mgmnt_irq(dev); + if (err) + goto err_disable_msix; + + err = efa_com_admin_init(edev, &aenq_handlers); + if (err) + goto err_free_mgmnt_irq; + + err = efa_sysfs_init(dev); + if (err) + goto err_admin_destroy; + + return dev; + +err_admin_destroy: + efa_com_admin_destroy(edev); +err_free_mgmnt_irq: + efa_free_irq(dev, &dev->admin_irq); +err_disable_msix: + efa_disable_msix(dev); +err_reg_read_destroy: + efa_com_mmio_reg_read_destroy(edev); +err_iounmap: + devm_iounmap(&pdev->dev, edev->reg_bar); +err_release_bars: + efa_release_bars(dev, EFA_BASE_BAR_MASK); +err_ibdev_destroy: + ib_dealloc_device(&dev->ibdev); +err_disable_device: + pci_disable_device(pdev); + return 
ERR_PTR(err); +} + +static void efa_remove_device(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + struct efa_com_dev *edev; + + edev = &dev->edev; + efa_sysfs_destroy(dev); + efa_com_admin_destroy(edev); + efa_free_irq(dev, &dev->admin_irq); + efa_disable_msix(dev); + efa_com_mmio_reg_read_destroy(edev); + devm_iounmap(&pdev->dev, edev->reg_bar); + efa_release_bars(dev, EFA_BASE_BAR_MASK); +#ifdef HAVE_XARRAY + xa_destroy(&dev->cqs_xa); +#endif + ib_dealloc_device(&dev->ibdev); + pci_disable_device(pdev); +} + +static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct efa_dev *dev; + int err; + + dev = efa_probe_device(pdev); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + err = efa_ib_device_add(dev); + if (err) + goto err_remove_device; + + return 0; + +err_remove_device: + efa_remove_device(pdev); + return err; +} + +static void efa_remove(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + + efa_ib_device_remove(dev); + efa_remove_device(pdev); +} + +static struct pci_driver efa_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = efa_pci_tbl, + .probe = efa_probe, + .remove = efa_remove, +}; + +static int __init efa_init(void) +{ + int err; + + pr_info("%s\n", version); + + err = pci_register_driver(&efa_pci_driver); + if (err) { + pr_err("Couldn't register efa driver\n"); + return err; + } + +#ifdef HAVE_EFA_P2P + efa_p2p_init(); +#endif + + return 0; +} + +static void __exit efa_exit(void) +{ + pci_unregister_driver(&efa_pci_driver); +} + +module_init(efa_init); +module_exit(efa_exit); diff --git a/drivers/amazon/net/efa/efa_neuron.c b/drivers/amazon/net/efa/efa_neuron.c new file mode 100644 index 0000000000000..ec2644e3079c4 --- /dev/null +++ b/drivers/amazon/net/efa/efa_neuron.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include + +#include "efa_p2p.h" +#include "neuron_p2p.h" + +#define NEURON_PAGE_SHIFT 12 +#define NEURON_PAGE_SIZE BIT_ULL(NEURON_PAGE_SHIFT) + +struct efa_neuronmem_ops { + int (*register_va)(u64 virtual_address, u64 length, + struct neuron_p2p_va_info **vainfo, + void (*free_callback)(void *data), + void *data); + int (*unregister_va)(struct neuron_p2p_va_info *vainfo); +}; + +struct efa_neuronmem { + struct efa_p2pmem p2pmem; + struct efa_neuronmem_ops ops; + struct neuron_p2p_va_info *va_info; + u64 virt_start; +}; + +static unsigned int neuronmem_pgsz(struct efa_dev *dev, + struct efa_p2pmem *p2pmem) +{ + struct efa_neuronmem *neuronmem; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + return BIT(neuronmem->va_info->shift_page_size); +} + +static int neuronmem_get_fp(struct efa_neuronmem *neuronmem) +{ + neuronmem->ops.register_va = symbol_get(neuron_p2p_register_va); + if (!neuronmem->ops.register_va) + goto err_out; + + neuronmem->ops.unregister_va = symbol_get(neuron_p2p_unregister_va); + if (!neuronmem->ops.unregister_va) + goto err_put_register_va; + + return 0; + +err_put_register_va: + symbol_put(neuron_p2p_register_va); +err_out: + return -EINVAL; +} + +static void neuronmem_put_fp(void) +{ + symbol_put(neuron_p2p_unregister_va); + symbol_put(neuron_p2p_register_va); +} + +static void neuronmem_free_cb(void *data) +{ + pr_debug("Free callback ticket %llu\n", (u64)data); + efa_p2p_put((u64)data, true); +} + +static int neuronmem_register_va(struct efa_dev *dev, struct efa_neuronmem *neuronmem, + u64 addr, u64 size, u64 ticket) +{ + int err; + + err = neuronmem->ops.register_va(addr, size, &neuronmem->va_info, + neuronmem_free_cb, (void *)ticket); + if (err) { + ibdev_dbg(&dev->ibdev, "neuron_p2p_register_va failed %d\n", err); + return err; + } + + return 0; +} + +static struct efa_p2pmem *neuronmem_get(struct efa_dev *dev, u64 ticket, u64 start, + u64 length) +{ + struct efa_neuronmem *neuronmem; + u64 virt_start; + u64 virt_end; + u64 pinsz; + int err; + + neuronmem = kzalloc(sizeof(*neuronmem), GFP_KERNEL); + if (!neuronmem) + return NULL; + + virt_start = ALIGN_DOWN(start, NEURON_PAGE_SIZE); + virt_end = ALIGN(start + length, NEURON_PAGE_SIZE); + pinsz = virt_end - virt_start; + neuronmem->virt_start = virt_start; + + err = neuronmem_get_fp(neuronmem); + if (err) + /* Neuron module is not loaded */ + goto err_free; + + err = neuronmem_register_va(dev, neuronmem, virt_start, pinsz, ticket); + if (err) + /* Most likely not our pages */ + goto err_put_fp; + + return &neuronmem->p2pmem; + +err_put_fp: + neuronmem_put_fp(); +err_free: + kfree(neuronmem); + return NULL; +} + +static int neuronmem_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + struct neuron_p2p_page_info *pg_info; + struct neuron_p2p_va_info *va_info; + struct efa_neuronmem *neuronmem; + int ent_idx, pa_idx; + int pg_idx = 0; + u64 pa; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + va_info = neuronmem->va_info; + + for (ent_idx = 0; ent_idx < va_info->entries; ent_idx++) { + pg_info = va_info->page_info + ent_idx; + pa = pg_info->physical_address; + for (pa_idx = 0; pa_idx < pg_info->page_count; pa_idx++) { + page_list[pg_idx++] = pa; + pa += BIT(va_info->shift_page_size); + } + } + + return 0; +} + +static void neuronmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb) +{ + struct efa_neuronmem *neuronmem; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + + 
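+	/* Unlike the nvmem provider, the Neuron VA registration is released unconditionally, even when invoked from the peer's free callback (in_cb). */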
neuronmem->ops.unregister_va(neuronmem->va_info); + neuronmem_put_fp(); + kfree(neuronmem); +} + +struct neuronmem_provider { + struct efa_p2p_provider p2p; +}; + +static const struct neuronmem_provider prov = { + .p2p = { + .ops = { + .try_get = neuronmem_get, + .to_page_list = neuronmem_to_page_list, + .release = neuronmem_release, + .get_page_size = neuronmem_pgsz, + }, + .type = EFA_P2P_PROVIDER_NEURON, + }, +}; + +const struct efa_p2p_provider *neuronmem_get_provider(void) +{ + return &prov.p2p; +} diff --git a/drivers/amazon/net/efa/efa_p2p.c b/drivers/amazon/net/efa/efa_p2p.c new file mode 100644 index 0000000000000..9daf101288f43 --- /dev/null +++ b/drivers/amazon/net/efa/efa_p2p.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_p2p.h" + +static struct mutex p2p_list_lock; +static struct list_head p2p_list; +static atomic64_t next_p2p_ticket; + +static const struct efa_p2p_provider *prov_arr[EFA_P2P_PROVIDER_MAX]; + +/* Register all providers here */ +static void p2p_providers_init(void) +{ + prov_arr[EFA_P2P_PROVIDER_NVMEM] = nvmem_get_provider(); + prov_arr[EFA_P2P_PROVIDER_NEURON] = neuronmem_get_provider(); +} + +void efa_p2p_init(void) +{ + mutex_init(&p2p_list_lock); + INIT_LIST_HEAD(&p2p_list); + /* + * Ideally, first ticket would be zero, but that would make callback + * data NULL which is invalid. + */ + atomic64_set(&next_p2p_ticket, 1); + + p2p_providers_init(); +} + +static struct efa_p2pmem *ticket_to_p2p(u64 ticket) +{ + struct efa_p2pmem *p2pmem; + + lockdep_assert_held(&p2p_list_lock); + list_for_each_entry(p2pmem, &p2p_list, list) { + if (p2pmem->ticket == ticket) + return p2pmem; + } + + return NULL; +} + +int efa_p2p_put(u64 ticket, bool in_cb) +{ + struct efa_com_dereg_mr_params params = {}; + struct efa_p2pmem *p2pmem; + struct efa_dev *dev; + int err; + + mutex_lock(&p2p_list_lock); + p2pmem = ticket_to_p2p(ticket); + if (!p2pmem) { + pr_debug("Ticket %llu not found in the p2pmem list\n", ticket); + mutex_unlock(&p2p_list_lock); + return 0; + } + + dev = p2pmem->dev; + if (p2pmem->needs_dereg) { + params.l_key = p2pmem->lkey; + err = efa_com_dereg_mr(&dev->edev, ¶ms); + if (err) { + mutex_unlock(&p2p_list_lock); + return err; + } + p2pmem->needs_dereg = false; + } + + list_del(&p2pmem->list); + mutex_unlock(&p2p_list_lock); + p2pmem->prov->ops.release(dev, p2pmem, in_cb); + + return 0; +} + +struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length) +{ + const struct efa_p2p_provider *prov; + struct efa_p2pmem *p2pmem; + u64 ticket; + int i; + + ticket = atomic64_fetch_inc(&next_p2p_ticket); + for (i = 0; i < EFA_P2P_PROVIDER_MAX; i++) { + prov = prov_arr[i]; + p2pmem = prov->ops.try_get(dev, ticket, start, length); + if (p2pmem) + break; + } + if (!p2pmem) + /* No provider was found, most likely cpu pages */ + return NULL; + + p2pmem->dev = dev; + p2pmem->ticket = ticket; + p2pmem->prov = prov; + mr->p2p_ticket = p2pmem->ticket; + + mutex_lock(&p2p_list_lock); + list_add(&p2pmem->list, &p2p_list); + mutex_unlock(&p2p_list_lock); + + return p2pmem; +} + +int efa_p2p_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + return p2pmem->prov->ops.to_page_list(dev, p2pmem, page_list); +} + +unsigned int efa_p2p_get_page_size(struct efa_dev *dev, + struct efa_p2pmem *p2pmem) +{ + return p2pmem->prov->ops.get_page_size(dev, p2pmem); +} diff --git 
a/drivers/amazon/net/efa/efa_p2p.h b/drivers/amazon/net/efa/efa_p2p.h new file mode 100644 index 0000000000000..89ee7a9935c11 --- /dev/null +++ b/drivers/amazon/net/efa/efa_p2p.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_P2P_H_ +#define _EFA_P2P_H_ + +#include "efa.h" + +struct efa_p2p_ops { + struct efa_p2pmem *(*try_get)(struct efa_dev *dev, u64 ticket, u64 start, + u64 length); + int (*to_page_list)(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list); + void (*release)(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb); + unsigned int (*get_page_size)(struct efa_dev *dev, + struct efa_p2pmem *p2pmem); +}; + +enum efa_p2p_prov { + EFA_P2P_PROVIDER_NVMEM, + EFA_P2P_PROVIDER_NEURON, + EFA_P2P_PROVIDER_MAX, +}; + +struct efa_p2p_provider { + const struct efa_p2p_ops ops; + enum efa_p2p_prov type; +}; + +struct efa_p2pmem { + struct efa_dev *dev; + const struct efa_p2p_provider *prov; + u64 ticket; + u32 lkey; + bool needs_dereg; + struct list_head list; /* member of efa_p2p_list */ +}; + +void efa_p2p_init(void); +struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length); +unsigned int efa_p2p_get_page_size(struct efa_dev *dev, + struct efa_p2pmem *p2pmem); +int efa_p2p_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list); +int efa_p2p_put(u64 ticket, bool in_cb); + +/* Provider specific stuff go here */ +const struct efa_p2p_provider *nvmem_get_provider(void); +bool nvmem_is_supported(void); + +const struct efa_p2p_provider *neuronmem_get_provider(void); + +#endif /* _EFA_P2P_H_ */ diff --git a/drivers/amazon/net/efa/efa_regs_defs.h b/drivers/amazon/net/efa/efa_regs_defs.h new file mode 100644 index 0000000000000..714ae62588004 --- /dev/null +++ b/drivers/amazon/net/efa/efa_regs_defs.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_REGS_H_ +#define _EFA_REGS_H_ + +enum efa_regs_reset_reason_types { + EFA_REGS_RESET_NORMAL = 0, + /* Keep alive timeout */ + EFA_REGS_RESET_KEEP_ALIVE_TO = 1, + EFA_REGS_RESET_ADMIN_TO = 2, + EFA_REGS_RESET_INIT_ERR = 3, + EFA_REGS_RESET_DRIVER_INVALID_STATE = 4, + EFA_REGS_RESET_OS_TRIGGER = 5, + EFA_REGS_RESET_SHUTDOWN = 6, + EFA_REGS_RESET_USER_TRIGGER = 7, + EFA_REGS_RESET_GENERIC = 8, +}; + +/* efa_registers offsets */ + +/* 0 base */ +#define EFA_REGS_VERSION_OFF 0x0 +#define EFA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define EFA_REGS_CAPS_OFF 0x8 +#define EFA_REGS_AQ_BASE_LO_OFF 0x10 +#define EFA_REGS_AQ_BASE_HI_OFF 0x14 +#define EFA_REGS_AQ_CAPS_OFF 0x18 +#define EFA_REGS_ACQ_BASE_LO_OFF 0x20 +#define EFA_REGS_ACQ_BASE_HI_OFF 0x24 +#define EFA_REGS_ACQ_CAPS_OFF 0x28 +#define EFA_REGS_AQ_PROD_DB_OFF 0x2c +#define EFA_REGS_AENQ_CAPS_OFF 0x34 +#define EFA_REGS_AENQ_BASE_LO_OFF 0x38 +#define EFA_REGS_AENQ_BASE_HI_OFF 0x3c +#define EFA_REGS_AENQ_CONS_DB_OFF 0x40 +#define EFA_REGS_INTR_MASK_OFF 0x4c +#define EFA_REGS_DEV_CTL_OFF 0x54 +#define EFA_REGS_DEV_STS_OFF 0x58 +#define EFA_REGS_MMIO_REG_READ_OFF 0x5c +#define EFA_REGS_MMIO_RESP_LO_OFF 0x60 +#define EFA_REGS_MMIO_RESP_HI_OFF 0x64 +#define EFA_REGS_EQ_DB_OFF 0x68 + +/* version register */ +#define EFA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define EFA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 + +/* controller_version register */ +#define EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 + +/* caps register */ +#define EFA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define EFA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define EFA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 + +/* aq_caps register */ +#define EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 + +/* acq_caps register */ +#define EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xff0000 +#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK 0xff000000 + +/* aenq_caps register */ +#define EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xff0000 +#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK 0xff000000 + +/* intr_mask register */ +#define EFA_REGS_INTR_MASK_EN_MASK 0x1 + +/* dev_ctl register */ +#define EFA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define EFA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define EFA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* dev_sts register */ +#define EFA_REGS_DEV_STS_READY_MASK 0x1 +#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define EFA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define EFA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 + +/* mmio_reg_read register */ +#define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 + +/* eq_db register */ +#define EFA_REGS_EQ_DB_EQN_MASK 0xffff +#define EFA_REGS_EQ_DB_ARM_MASK 0x80000000 + +#endif /* _EFA_REGS_H_ */ diff --git a/drivers/amazon/net/efa/efa_sysfs.c b/drivers/amazon/net/efa/efa_sysfs.c new file mode 100644 index 0000000000000..8e8b2bd210db1 --- /dev/null +++ b/drivers/amazon/net/efa/efa_sysfs.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 
2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_sysfs.h" +#include "kcompat.h" + +#include +#include + +#ifndef HAVE_SYSFS_EMIT +#include + +static int sysfs_emit(char *buf, const char *fmt, ...) +{ + va_list args; + int len; + + if (!buf) + return 0; + + va_start(args, fmt); + len = vscnprintf(buf, PAGE_SIZE, fmt, args); + va_end(args); + + return len; +} +#endif + +#ifdef HAVE_EFA_P2P +#include "efa_p2p.h" + +static ssize_t gdr_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + if (nvmem_is_supported()) + return sysfs_emit(buf, "1\n"); + + return sysfs_emit(buf, "0\n"); +} + +static DEVICE_ATTR_RO(gdr); +#endif + +int efa_sysfs_init(struct efa_dev *dev) +{ +#ifdef HAVE_EFA_P2P + struct device *device = &dev->pdev->dev; + + if (device_create_file(device, &dev_attr_gdr)) + dev_err(device, "Failed to create GDR sysfs file\n"); +#endif + return 0; +} + +void efa_sysfs_destroy(struct efa_dev *dev) +{ +#ifdef HAVE_EFA_P2P + device_remove_file(&dev->pdev->dev, &dev_attr_gdr); +#endif +} diff --git a/drivers/amazon/net/efa/efa_sysfs.h b/drivers/amazon/net/efa/efa_sysfs.h new file mode 100644 index 0000000000000..c390aa547e5a6 --- /dev/null +++ b/drivers/amazon/net/efa/efa_sysfs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_SYSFS_H_ +#define _EFA_SYSFS_H_ + +#include "efa.h" + +int efa_sysfs_init(struct efa_dev *dev); + +void efa_sysfs_destroy(struct efa_dev *dev); + +#endif /* _EFA_SYSFS_H_ */ diff --git a/drivers/amazon/net/efa/efa_verbs.c b/drivers/amazon/net/efa/efa_verbs.c new file mode 100644 index 0000000000000..c9535ee90108b --- /dev/null +++ b/drivers/amazon/net/efa/efa_verbs.c @@ -0,0 +1,3022 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "kcompat.h" +#ifdef HAVE_MR_DMABUF +#include +#include +#endif +#include +#include + +#include +#include +#include +#include +#ifdef HAVE_UDATA_TO_DRV_CONTEXT +#include +#endif + +#include "efa.h" +#include "efa_io_defs.h" + +#ifdef HAVE_EFA_P2P +#include "efa_p2p.h" +#endif + +enum { + EFA_MMAP_DMA_PAGE = 0, + EFA_MMAP_IO_WC, + EFA_MMAP_IO_NC, +}; + +#define EFA_AENQ_ENABLED_GROUPS \ + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) + +struct efa_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; +#ifndef HAVE_CORE_MMAP_XA + struct list_head list; +#endif + u64 address; + u8 mmap_flag; +}; + +#define EFA_DEFINE_DEVICE_STATS(op) \ + op(EFA_SUBMITTED_CMDS, "submitted_cmds") \ + op(EFA_COMPLETED_CMDS, "completed_cmds") \ + op(EFA_CMDS_ERR, "cmds_err") \ + op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \ + op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \ + op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \ + op(EFA_CREATE_QP_ERR, "create_qp_err") \ + op(EFA_CREATE_CQ_ERR, "create_cq_err") \ + op(EFA_REG_MR_ERR, "reg_mr_err") \ + op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \ + op(EFA_CREATE_AH_ERR, "create_ah_err") \ + op(EFA_MMAP_ERR, "mmap_err") + +#define EFA_DEFINE_PORT_STATS(op) \ + op(EFA_TX_BYTES, "tx_bytes") \ + op(EFA_TX_PKTS, "tx_pkts") \ + op(EFA_RX_BYTES, "rx_bytes") \ + op(EFA_RX_PKTS, "rx_pkts") \ + op(EFA_RX_DROPS, "rx_drops") \ + op(EFA_SEND_BYTES, "send_bytes") \ + op(EFA_SEND_WRS, "send_wrs") \ + op(EFA_RECV_BYTES, "recv_bytes") \ + op(EFA_RECV_WRS, "recv_wrs") \ + op(EFA_RDMA_READ_WRS, "rdma_read_wrs") \ + op(EFA_RDMA_READ_BYTES, "rdma_read_bytes") \ + op(EFA_RDMA_READ_WR_ERR, "rdma_read_wr_err") \ + op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \ + +#define EFA_STATS_ENUM(ename, name) ename, +#ifdef HAVE_STAT_DESC_STRUCT +#define EFA_STATS_STR(ename, nam) \ + [ename].name = nam, +#else +#define EFA_STATS_STR(ename, nam) \ + [ename] = nam, +#endif + +enum efa_hw_device_stats { + EFA_DEFINE_DEVICE_STATS(EFA_STATS_ENUM) +}; + +#ifdef HAVE_STAT_DESC_STRUCT +static const struct rdma_stat_desc efa_device_stats_descs[] = { +#else +static const char *const efa_device_stats_descs[] = { +#endif + EFA_DEFINE_DEVICE_STATS(EFA_STATS_STR) +}; + +enum efa_hw_port_stats { + EFA_DEFINE_PORT_STATS(EFA_STATS_ENUM) +}; + +#ifdef HAVE_STAT_DESC_STRUCT +static const struct rdma_stat_desc efa_port_stats_descs[] = { +#else +static const char *const efa_port_stats_descs[] = { +#endif + EFA_DEFINE_PORT_STATS(EFA_STATS_STR) +}; + +#define EFA_CHUNK_PAYLOAD_SHIFT 12 +#define EFA_CHUNK_PAYLOAD_SIZE BIT(EFA_CHUNK_PAYLOAD_SHIFT) +#define EFA_CHUNK_PAYLOAD_PTR_SIZE 8 + +#define EFA_CHUNK_SHIFT 12 +#define EFA_CHUNK_SIZE BIT(EFA_CHUNK_SHIFT) +#define EFA_CHUNK_PTR_SIZE sizeof(struct efa_com_ctrl_buff_info) + +#define EFA_PTRS_PER_CHUNK \ + ((EFA_CHUNK_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_CHUNK_PAYLOAD_PTR_SIZE) + +#define EFA_CHUNK_USED_SIZE \ + ((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE) + +struct pbl_chunk { + dma_addr_t dma_addr; + u64 *buf; + u32 length; +}; + +struct pbl_chunk_list { + struct pbl_chunk *chunks; + unsigned int size; +}; + +struct pbl_context { + union { + struct { + dma_addr_t dma_addr; + } continuous; + struct { + u32 pbl_buf_size_in_pages; + struct scatterlist *sgl; + int sg_dma_cnt; + struct pbl_chunk_list chunk_list; + } indirect; + } phys; + u64 *pbl_buf; + u32 pbl_buf_size_in_bytes; + u8 physically_continuous; +}; + +static inline struct efa_dev *to_edev(struct 
ib_device *ibdev) +{ + return container_of(ibdev, struct efa_dev, ibdev); +} + +static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct efa_ucontext, ibucontext); +} + +static inline struct efa_pd *to_epd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct efa_pd, ibpd); +} + +static inline struct efa_mr *to_emr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct efa_mr, ibmr); +} + +static inline struct efa_qp *to_eqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct efa_qp, ibqp); +} + +static inline struct efa_cq *to_ecq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct efa_cq, ibcq); +} + +static inline struct efa_ah *to_eah(struct ib_ah *ibah) +{ + return container_of(ibah, struct efa_ah, ibah); +} + +static inline struct efa_user_mmap_entry * +to_emmap(struct rdma_user_mmap_entry *rdma_entry) +{ + return container_of(rdma_entry, struct efa_user_mmap_entry, rdma_entry); +} + +#define EFA_DEV_CAP(dev, cap) \ + ((dev)->dev_attr.device_caps & \ + EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_##cap##_MASK) + +#define is_reserved_cleared(reserved) \ + !memchr_inv(reserved, 0, sizeof(reserved)) + +static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr, + size_t size, enum dma_data_direction dir) +{ + void *addr; + + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); + if (!addr) + return NULL; + + *dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir); + if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) { + ibdev_err(&dev->ibdev, "Failed to map DMA address\n"); + free_pages_exact(addr, size); + return NULL; + } + + return addr; +} + +static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr, + dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) +{ + dma_unmap_single(&dev->pdev->dev, dma_addr, size, dir); + free_pages_exact(cpu_addr, size); +} + +#ifndef HAVE_CORE_MMAP_XA +/* + * This is only called when the ucontext is destroyed and there can be no + * concurrent query via mmap or allocate on the database, thus we can be sure no + * other thread is using the entry pointer. We also know that all the BAR + * pages have either been zap'd or munmaped at this point. Normal pages are + * refcounted and will be freed at the proper time. 
+ */ +static void mmap_entries_remove_free(struct efa_dev *dev, + struct efa_ucontext *ucontext) +{ + struct efa_user_mmap_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + list_del(&entry->list); + ibdev_dbg( + &dev->ibdev, + "mmap: key[%#llx] addr[%#llx] len[%#zx] removed\n", + rdma_user_mmap_get_offset(&entry->rdma_entry), + entry->address, entry->rdma_entry.npages * PAGE_SIZE); + kfree(entry); + } +} + +static int mmap_entry_validate(struct efa_ucontext *ucontext, + struct vm_area_struct *vma) +{ + size_t length = vma->vm_end - vma->vm_start; + + if (length % PAGE_SIZE != 0 || !(vma->vm_flags & VM_SHARED)) { + ibdev_dbg(ucontext->ibucontext.device, + "length[%#zx] is not page size aligned[%#lx] or VM_SHARED is not set [%#lx]\n", + length, PAGE_SIZE, vma->vm_flags); + return -EINVAL; + } + + return 0; +} + +struct rdma_user_mmap_entry * +rdma_user_mmap_entry_get(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + size_t length = vma->vm_end - vma->vm_start; + struct efa_user_mmap_entry *entry, *tmp; + u64 key = vma->vm_pgoff << PAGE_SHIFT; + int err; + + err = mmap_entry_validate(ucontext, vma); + if (err) + return NULL; + + mutex_lock(&ucontext->lock); + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + if (rdma_user_mmap_get_offset(&entry->rdma_entry) == key && + entry->rdma_entry.npages * PAGE_SIZE == length) { + ibdev_dbg(ibucontext->device, + "mmap: key[%#llx] addr[%#llx] len[%#zx] removed\n", + key, entry->address, + entry->rdma_entry.npages * PAGE_SIZE); + mutex_unlock(&ucontext->lock); + return &entry->rdma_entry; + } + } + mutex_unlock(&ucontext->lock); + + return NULL; +} +#endif /* !defined (HAVE_CORE_MMAP_XA) */ + +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata) +{ + struct efa_com_get_device_attr_result *dev_attr; + struct efa_ibv_ex_query_device_resp resp = {}; + struct efa_dev *dev = to_edev(ibdev); + int err; + + if (udata && udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(ibdev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + dev_attr = &dev->dev_attr; + + memset(props, 0, sizeof(*props)); + props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE; + props->page_size_cap = dev_attr->page_size_cap; + props->vendor_id = dev->pdev->vendor; + props->vendor_part_id = dev->pdev->device; + props->hw_ver = dev->pdev->subsystem_device; + props->max_qp = dev_attr->max_qp; + props->max_cq = dev_attr->max_cq; + props->max_pd = dev_attr->max_pd; + props->max_mr = dev_attr->max_mr; + props->max_ah = dev_attr->max_ah; + props->max_cqe = dev_attr->max_cq_depth; + props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth, + dev_attr->max_rq_depth); +#ifdef HAVE_MAX_SEND_RCV_SGE + props->max_send_sge = dev_attr->max_sq_sge; + props->max_recv_sge = dev_attr->max_rq_sge; +#else + props->max_sge = min_t(u16, dev_attr->max_sq_sge, + dev_attr->max_rq_sge); +#endif + props->max_sge_rd = dev_attr->max_wr_rdma_sge; + props->max_pkeys = 1; + + if (udata && udata->outlen) { + resp.max_sq_sge = dev_attr->max_sq_sge; + resp.max_rq_sge = dev_attr->max_rq_sge; + resp.max_sq_wr = dev_attr->max_sq_depth; + resp.max_rq_wr = dev_attr->max_rq_depth; + resp.max_rdma_size = dev_attr->max_rdma_size; + + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID; + if (EFA_DEV_CAP(dev, RDMA_READ)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ; + + if 
(EFA_DEV_CAP(dev, RNR_RETRY)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY; + + if (dev->neqs) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS; + + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy udata for query_device\n"); + return err; + } + } + + return 0; +} + +int efa_query_port(struct ib_device *ibdev, port_t port, + struct ib_port_attr *props) +{ + struct efa_dev *dev = to_edev(ibdev); + + props->lmc = 1; + + props->state = IB_PORT_ACTIVE; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_speed = IB_SPEED_EDR; + props->active_width = IB_WIDTH_4X; + props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); + props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); + props->max_msg_sz = dev->dev_attr.mtu; + props->max_vl_num = 1; + + return 0; +} + +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_query_qp_params params = {}; + struct efa_com_query_qp_result result; + struct efa_qp *qp = to_eqp(ibqp); + int err; + +#define EFA_QUERY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \ + IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP | IB_QP_RNR_RETRY) + + if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_QUERY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + params.qp_handle = qp->qp_handle; + err = efa_com_query_qp(&dev->edev, ¶ms, &result); + if (err) + return err; + + qp_attr->qp_state = result.qp_state; + qp_attr->qkey = result.qkey; + qp_attr->sq_psn = result.sq_psn; + qp_attr->sq_draining = result.sq_draining; + qp_attr->port_num = 1; + qp_attr->rnr_retry = result.rnr_retry; + + qp_attr->cap.max_send_wr = qp->max_send_wr; + qp_attr->cap.max_recv_wr = qp->max_recv_wr; + qp_attr->cap.max_send_sge = qp->max_send_sge; + qp_attr->cap.max_recv_sge = qp->max_recv_sge; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->qp_context = ibqp->qp_context; + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int efa_query_gid(struct ib_device *ibdev, port_t port, int index, + union ib_gid *gid) +{ + struct efa_dev *dev = to_edev(ibdev); + + memcpy(gid->raw, dev->dev_attr.addr, sizeof(dev->dev_attr.addr)); + + return 0; +} + +int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, + u16 *pkey) +{ + if (index > 0) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn) +{ + struct efa_com_dealloc_pd_params params = { + .pdn = pdn, + }; + + return efa_com_dealloc_pd(&dev->edev, ¶ms); +} + +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT +int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#else +int efa_alloc_pd(struct ib_pd *ibpd, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +#endif +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_ibv_alloc_pd_resp resp = {}; + struct efa_com_alloc_pd_result result; + struct efa_pd *pd = to_epd(ibpd); + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; 
+ } +#endif + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + err = -EINVAL; + goto err_out; + } + + err = efa_com_alloc_pd(&dev->edev, &result); + if (err) + goto err_out; + + pd->pdn = result.pdn; + resp.pdn = result.pdn; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for alloc_pd\n"); + goto err_dealloc_pd; + } + } + + ibdev_dbg(&dev->ibdev, "Allocated pd[%d]\n", pd->pdn); + + return 0; + +err_dealloc_pd: + efa_pd_dealloc(dev, result.pdn); +err_out: + atomic64_inc(&dev->stats.alloc_pd_err); + return err; +} + +#ifndef HAVE_PD_CORE_ALLOCATION +struct ib_pd *efa_kzalloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibdev); + struct efa_pd *pd; + int err; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + atomic64_inc(&dev->stats.alloc_pd_err); + return ERR_PTR(-ENOMEM); + } + + pd->ibpd.device = ibdev; + +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT + err = efa_alloc_pd(&pd->ibpd, udata); +#else + err = efa_alloc_pd(&pd->ibpd, ibucontext, udata); +#endif + if (err) + goto err_free; + + return &pd->ibpd; + +err_free: + kfree(pd); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_DEALLOC_PD_UDATA_RC +int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#elif defined(HAVE_DEALLOC_PD_UDATA) +void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#elif defined(HAVE_PD_CORE_ALLOCATION) +void efa_dealloc_pd(struct ib_pd *ibpd) +#else +int efa_dealloc_pd(struct ib_pd *ibpd) +#endif +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_pd *pd = to_epd(ibpd); + + ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn); + efa_pd_dealloc(dev, pd->pdn); +#ifndef HAVE_PD_CORE_ALLOCATION + kfree(pd); + + return 0; +#elif defined(HAVE_DEALLOC_PD_UDATA_RC) + return 0; +#endif +} + +static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle) +{ + struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle }; + + return efa_com_destroy_qp(&dev->edev, ¶ms); +} + +static void efa_qp_user_mmap_entries_remove(struct efa_qp *qp) +{ + rdma_user_mmap_entry_remove(qp->rq_mmap_entry); + rdma_user_mmap_entry_remove(qp->rq_db_mmap_entry); + rdma_user_mmap_entry_remove(qp->llq_desc_mmap_entry); + rdma_user_mmap_entry_remove(qp->sq_db_mmap_entry); +} + +#ifdef HAVE_DESTROY_QP_UDATA +int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +#else +int efa_destroy_qp(struct ib_qp *ibqp) +#endif +{ + struct efa_dev *dev = to_edev(ibqp->pd->device); + struct efa_qp *qp = to_eqp(ibqp); + int err; + + ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num); + + efa_qp_user_mmap_entries_remove(qp); + + err = efa_destroy_qp_handle(dev, qp->qp_handle); + if (err) + return err; + + if (qp->rq_cpu_addr) { + ibdev_dbg(&dev->ibdev, + "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, + &qp->rq_dma_addr); + efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + } + +#ifndef HAVE_QP_CORE_ALLOCATION + kfree(qp); +#endif + return 0; +} + +#ifdef HAVE_CORE_MMAP_XA +static struct rdma_user_mmap_entry* +efa_user_mmap_entry_insert(struct ib_ucontext *ucontext, + u64 address, size_t length, + u8 mmap_flag, u64 *offset) +{ + struct efa_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL); + int err; + + if (!entry) + return NULL; + + 
entry->address = address; + entry->mmap_flag = mmap_flag; + + err = rdma_user_mmap_entry_insert(ucontext, &entry->rdma_entry, + length); + if (err) { + kfree(entry); + return NULL; + } + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} +#else +static struct rdma_user_mmap_entry * +efa_user_mmap_entry_insert(struct ib_ucontext *ibucontext, u64 address, + size_t length, u8 mmap_flag, u64 *offset) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_user_mmap_entry *entry; + u64 next_mmap_page; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->address = address; + entry->rdma_entry.npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE); + entry->mmap_flag = mmap_flag; + + mutex_lock(&ucontext->lock); + next_mmap_page = ucontext->mmap_page + (length >> PAGE_SHIFT); + if (next_mmap_page >= U32_MAX) { + ibdev_dbg(ucontext->ibucontext.device, "Too many mmap pages\n"); + mutex_unlock(&ucontext->lock); + kfree(entry); + return NULL; + } + + entry->rdma_entry.start_pgoff = ucontext->mmap_page; + ucontext->mmap_page = next_mmap_page; + list_add_tail(&entry->list, &ucontext->pending_mmaps); + mutex_unlock(&ucontext->lock); + + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + ibdev_dbg( + ucontext->ibucontext.device, + "mmap: addr[%#llx], len[%#zx], key[%#llx] inserted\n", + entry->address, entry->rdma_entry.npages * PAGE_SIZE, + rdma_user_mmap_get_offset(&entry->rdma_entry)); + + return &entry->rdma_entry; +} +#endif + +static int qp_mmap_entries_setup(struct efa_qp *qp, + struct efa_dev *dev, + struct efa_ucontext *ucontext, + struct efa_com_create_qp_params *params, + struct efa_ibv_create_qp_resp *resp) +{ + size_t length; + u64 address; + + address = dev->db_bar_addr + resp->sq_db_offset; + qp->sq_db_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, + PAGE_SIZE, EFA_MMAP_IO_NC, + &resp->sq_db_mmap_key); + if (!qp->sq_db_mmap_entry) + return -ENOMEM; + + resp->sq_db_offset &= ~PAGE_MASK; + + address = dev->mem_bar_addr + resp->llq_desc_offset; + length = PAGE_ALIGN(params->sq_ring_size_in_bytes + + (resp->llq_desc_offset & ~PAGE_MASK)); + + qp->llq_desc_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, length, + EFA_MMAP_IO_WC, + &resp->llq_desc_mmap_key); + if (!qp->llq_desc_mmap_entry) + goto err_remove_mmap; + + resp->llq_desc_offset &= ~PAGE_MASK; + + if (qp->rq_size) { + address = dev->db_bar_addr + resp->rq_db_offset; + + qp->rq_db_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, PAGE_SIZE, + EFA_MMAP_IO_NC, + &resp->rq_db_mmap_key); + if (!qp->rq_db_mmap_entry) + goto err_remove_mmap; + + resp->rq_db_offset &= ~PAGE_MASK; + + address = virt_to_phys(qp->rq_cpu_addr); + qp->rq_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, qp->rq_size, + EFA_MMAP_DMA_PAGE, + &resp->rq_mmap_key); + if (!qp->rq_mmap_entry) + goto err_remove_mmap; + + resp->rq_mmap_size = qp->rq_size; + } + + return 0; + +err_remove_mmap: + efa_qp_user_mmap_entries_remove(qp); + + return -ENOMEM; +} + +static int efa_qp_validate_cap(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { + ibdev_dbg(&dev->ibdev, + "qp: requested send wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_wr, + dev->dev_attr.max_sq_depth); + return -EINVAL; + } + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { + ibdev_dbg(&dev->ibdev, + "qp: requested 
receive wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_wr, + dev->dev_attr.max_rq_depth); + return -EINVAL; + } + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { + ibdev_dbg(&dev->ibdev, + "qp: requested sge send[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); + return -EINVAL; + } + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { + ibdev_dbg(&dev->ibdev, + "qp: requested sge recv[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); + return -EINVAL; + } + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { + ibdev_dbg(&dev->ibdev, + "qp: requested inline data[%u] exceeds the max[%u]\n", + init_attr->cap.max_inline_data, + dev->dev_attr.inline_buf_size); + return -EINVAL; + } + + return 0; +} + +static int efa_qp_validate_attr(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->qp_type != IB_QPT_DRIVER && + init_attr->qp_type != IB_QPT_UD) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp type %d\n", init_attr->qp_type); + return -EOPNOTSUPP; + } + + if (init_attr->srq) { + ibdev_dbg(&dev->ibdev, "SRQ is not supported\n"); + return -EOPNOTSUPP; + } + + if (init_attr->create_flags) { + ibdev_dbg(&dev->ibdev, "Unsupported create flags\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct efa_com_create_qp_params create_qp_params = {}; + struct efa_com_create_qp_result create_qp_resp; + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_ibv_create_qp_resp resp = {}; + struct efa_ibv_create_qp cmd = {}; + struct efa_qp *qp = to_eqp(ibqp); + struct efa_ucontext *ucontext; + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + +#ifdef HAVE_UDATA_TO_DRV_CONTEXT + ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, + ibucontext); +#else + ucontext = ibqp->pd->uobject ? 
to_eucontext(ibqp->pd->uobject->context) : + NULL; +#endif + + err = efa_qp_validate_cap(dev, init_attr); + if (err) + goto err_out; + + err = efa_qp_validate_attr(dev, init_attr); + if (err) + goto err_out; + + if (offsetofend(typeof(cmd), driver_qp_type) > udata->inlen) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for create_qp\n"); + goto err_out; + } + + if (cmd.comp_mask) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + create_qp_params.uarn = ucontext->uarn; + create_qp_params.pd = to_epd(ibqp->pd)->pdn; + + if (init_attr->qp_type == IB_QPT_UD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; + } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD; + } else { + ibdev_dbg(&dev->ibdev, + "Unsupported qp type %d driver qp type %d\n", + init_attr->qp_type, cmd.driver_qp_type); + err = -EOPNOTSUPP; + goto err_out; + } + + ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n", + init_attr->qp_type, cmd.driver_qp_type); + create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx; + create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx; + create_qp_params.sq_depth = init_attr->cap.max_send_wr; + create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size; + + create_qp_params.rq_depth = init_attr->cap.max_recv_wr; + create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size; + qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes); + if (qp->rq_size) { + qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + if (!qp->rq_cpu_addr) { + err = -ENOMEM; + goto err_out; + } + + ibdev_dbg(&dev->ibdev, + "qp->cpu_addr[0x%p] allocated: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr); + create_qp_params.rq_base_addr = qp->rq_dma_addr; + } + + err = efa_com_create_qp(&dev->edev, &create_qp_params, + &create_qp_resp); + if (err) + goto err_free_mapped; + + resp.sq_db_offset = create_qp_resp.sq_db_offset; + resp.rq_db_offset = create_qp_resp.rq_db_offset; + resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset; + resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx; + resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx; + + err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params, + &resp); + if (err) + goto err_destroy_qp; + + qp->qp_handle = create_qp_resp.qp_handle; + qp->ibqp.qp_num = create_qp_resp.qp_num; + qp->max_send_wr = init_attr->cap.max_send_wr; + qp->max_recv_wr = init_attr->cap.max_recv_wr; + qp->max_send_sge = init_attr->cap.max_send_sge; + qp->max_recv_sge = init_attr->cap.max_recv_sge; + qp->max_inline_data = init_attr->cap.max_inline_data; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for qp[%u]\n", + create_qp_resp.qp_num); + goto err_remove_mmap_entries; + } + } + + ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num); + + return 0; + +err_remove_mmap_entries: + 
efa_qp_user_mmap_entries_remove(qp); +err_destroy_qp: + efa_destroy_qp_handle(dev, create_qp_resp.qp_handle); +err_free_mapped: + if (qp->rq_size) + efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); +err_out: + atomic64_inc(&dev->stats.create_qp_err); + return err; +} + +#ifndef HAVE_QP_CORE_ALLOCATION +struct ib_qp *efa_kzalloc_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_qp *qp; + int err; + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + atomic64_inc(&dev->stats.create_qp_err); + err = -ENOMEM; + goto err_out; + } + + qp->ibqp.device = ibpd->device; + qp->ibqp.pd = ibpd; + qp->ibqp.qp_type = init_attr->qp_type; + err = efa_create_qp(&qp->ibqp, init_attr, udata); + if (err) + goto err_free_qp; + + return &qp->ibqp; + +err_free_qp: + kfree(qp); +err_out: + return ERR_PTR(err); +} +#endif + +static const struct { + int valid; + enum ib_qp_attr_mask req_param; + enum ib_qp_attr_mask opt_param; +} srd_qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY, + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY, + }, + [IB_QPS_RTR] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_QKEY, + }, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = IB_QP_SQ_PSN, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY | + IB_QP_RNR_RETRY, + + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = IB_QP_EN_SQD_ASYNC_NOTIFY, + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_QKEY, + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + } +}; + +static bool efa_modify_srd_qp_is_ok(enum ib_qp_state cur_state, + enum ib_qp_state next_state, + enum ib_qp_attr_mask mask) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return false; + + if (!srd_qp_state_table[cur_state][next_state].valid) + return false; + + req_param = srd_qp_state_table[cur_state][next_state].req_param; + opt_param = srd_qp_state_table[cur_state][next_state].opt_param; + + if ((mask & req_param) != req_param) + return false; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return false; + + return true; +} + +static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + int err; + +#define EFA_MODIFY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_CUR_STATE | 
IB_QP_EN_SQD_ASYNC_NOTIFY | \ + IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN | \ + IB_QP_RNR_RETRY) + + if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + if (qp->ibqp.qp_type == IB_QPT_DRIVER) + err = !efa_modify_srd_qp_is_ok(cur_state, new_state, + qp_attr_mask); + else +#ifdef HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS + err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, + qp_attr_mask); +#else + err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, + qp_attr_mask, + IB_LINK_LAYER_UNSPECIFIED); +#endif + + if (err) { + ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n"); + return -EINVAL; + } + + if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) { + ibdev_dbg(&dev->ibdev, "Can't change port num\n"); + return -EOPNOTSUPP; + } + + if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) { + ibdev_dbg(&dev->ibdev, "Can't change pkey index\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_modify_qp_params params = {}; + struct efa_qp *qp = to_eqp(ibqp); + enum ib_qp_state cur_state; + enum ib_qp_state new_state; + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + return -EOPNOTSUPP; + } +#endif + +#ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + return -EOPNOTSUPP; +#endif + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state : + qp->state; + new_state = qp_attr_mask & IB_QP_STATE ? 
qp_attr->qp_state : cur_state; + + err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state, + new_state); + if (err) + return err; + + params.qp_handle = qp->qp_handle; + + if (qp_attr_mask & IB_QP_STATE) { + EFA_SET(¶ms.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QP_STATE, + 1); + EFA_SET(¶ms.modify_mask, + EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE, 1); + params.cur_qp_state = cur_state; + params.qp_state = new_state; + } + + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + EFA_SET(¶ms.modify_mask, + EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY, 1); + params.sq_drained_async_notify = qp_attr->en_sqd_async_notify; + } + + if (qp_attr_mask & IB_QP_QKEY) { + EFA_SET(¶ms.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QKEY, 1); + params.qkey = qp_attr->qkey; + } + + if (qp_attr_mask & IB_QP_SQ_PSN) { + EFA_SET(¶ms.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN, 1); + params.sq_psn = qp_attr->sq_psn; + } + + if (qp_attr_mask & IB_QP_RNR_RETRY) { + EFA_SET(¶ms.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY, + 1); + params.rnr_retry = qp_attr->rnr_retry; + } + + err = efa_com_modify_qp(&dev->edev, ¶ms); + if (err) + return err; + + qp->state = new_state; + + return 0; +} + +static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx) +{ + struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx }; + + return efa_com_destroy_cq(&dev->edev, ¶ms); +} + +static void efa_cq_user_mmap_entries_remove(struct efa_cq *cq) +{ + rdma_user_mmap_entry_remove(cq->db_mmap_entry); + rdma_user_mmap_entry_remove(cq->mmap_entry); +} + +#if defined(HAVE_IB_VOID_DESTROY_CQ) || defined(HAVE_IB_INT_DESTROY_CQ) +#ifdef HAVE_IB_INT_DESTROY_CQ +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +#else +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +#endif +{ + struct efa_dev *dev = to_edev(ibcq->device); + struct efa_cq *cq = to_ecq(ibcq); + + ibdev_dbg(&dev->ibdev, + "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", + cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); + + efa_cq_user_mmap_entries_remove(cq); + efa_destroy_cq_idx(dev, cq->cq_idx); + if (cq->eq) { +#ifdef HAVE_XARRAY + xa_erase(&dev->cqs_xa, cq->cq_idx); +#else + dev->cqs_arr[cq->cq_idx] = NULL; +#endif + synchronize_irq(cq->eq->irq.irqn); + } + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); +#ifndef HAVE_CQ_CORE_ALLOCATION + kfree(cq); +#endif +#ifdef HAVE_IB_INT_DESTROY_CQ + return 0; +#endif +} +#else +#ifdef HAVE_DESTROY_CQ_UDATA +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +#else +int efa_destroy_cq(struct ib_cq *ibcq) +#endif +{ + struct efa_dev *dev = to_edev(ibcq->device); + struct efa_cq *cq = to_ecq(ibcq); + int err; + + ibdev_dbg(&dev->ibdev, + "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", + cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); + + efa_cq_user_mmap_entries_remove(cq); + err = efa_destroy_cq_idx(dev, cq->cq_idx); + if (err) + return err; + + if (cq->eq) { +#ifdef HAVE_XARRAY + xa_erase(&dev->cqs_xa, cq->cq_idx); +#else + dev->cqs_arr[cq->cq_idx] = NULL; +#endif + synchronize_irq(cq->eq->irq.irqn); + } + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + + kfree(cq); + return 0; +} +#endif + +static struct efa_eq *efa_vec2eq(struct efa_dev *dev, int vec) +{ + return &dev->eqs[vec]; +} + +static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, + struct efa_ibv_create_cq_resp *resp, + bool db_valid) +{ + resp->q_mmap_size = cq->size; + cq->mmap_entry = 
efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, + virt_to_phys(cq->cpu_addr), + cq->size, EFA_MMAP_DMA_PAGE, + &resp->q_mmap_key); + if (!cq->mmap_entry) + return -ENOMEM; + + if (db_valid) { + cq->db_mmap_entry = + efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, + dev->db_bar_addr + resp->db_off, + PAGE_SIZE, EFA_MMAP_IO_NC, + &resp->db_mmap_key); + if (!cq->db_mmap_entry) { + rdma_user_mmap_entry_remove(cq->mmap_entry); + return -ENOMEM; + } + + resp->db_off &= ~PAGE_MASK; + resp->comp_mask |= EFA_CREATE_CQ_RESP_DB_OFF; + } + + return 0; +} + +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ +#ifdef HAVE_UDATA_TO_DRV_CONTEXT + struct efa_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct efa_ucontext, ibucontext); +#else + struct efa_ucontext *ucontext = to_ecq(ibcq)->ucontext; +#endif + struct efa_com_create_cq_params params = {}; + struct efa_ibv_create_cq_resp resp = {}; + struct efa_com_create_cq_result result; + struct ib_device *ibdev = ibcq->device; + struct efa_dev *dev = to_edev(ibdev); + struct efa_ibv_create_cq cmd = {}; + struct efa_cq *cq = to_ecq(ibcq); + int entries = attr->cqe; + bool set_src_addr; + int err; + + ibdev_dbg(ibdev, "create_cq entries %d\n", entries); + + if (attr->flags) + return -EOPNOTSUPP; + + if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { + ibdev_dbg(ibdev, + "cq: requested entries[%u] non-positive or greater than max[%u]\n", + entries, dev->dev_attr.max_cq_depth); + err = -EINVAL; + goto err_out; + } + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + + if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) { + ibdev_dbg(ibdev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { + ibdev_dbg(ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, "Cannot copy udata for create_cq\n"); + goto err_out; + } + + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_58)) { + ibdev_dbg(ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + set_src_addr = !!(cmd.flags & EFA_CREATE_CQ_WITH_SGID); + if ((cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc_ex)) && + (set_src_addr || cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc))) { + ibdev_dbg(ibdev, + "Invalid entry size [%u]\n", cmd.cq_entry_size); + err = -EINVAL; + goto err_out; + } + + if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) { + ibdev_dbg(ibdev, + "Invalid number of sub cqs[%u] expected[%u]\n", + cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq); + err = -EINVAL; + goto err_out; + } + + cq->ucontext = ucontext; + cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); + cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + if (!cq->cpu_addr) { + err = -ENOMEM; + goto err_out; + } + + params.uarn = cq->ucontext->uarn; + params.cq_depth = entries; + params.dma_addr = cq->dma_addr; + params.entry_size_in_bytes = cmd.cq_entry_size; + params.num_sub_cqs = cmd.num_sub_cqs; + params.set_src_addr = set_src_addr; + if (cmd.flags & EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL) { + cq->eq = efa_vec2eq(dev, attr->comp_vector); + params.eqn = 
cq->eq->eeq.eqn; + params.interrupt_mode_enabled = true; + } + + err = efa_com_create_cq(&dev->edev, ¶ms, &result); + if (err) + goto err_free_mapped; + + resp.db_off = result.db_off; + resp.cq_idx = result.cq_idx; + cq->cq_idx = result.cq_idx; + cq->ibcq.cqe = result.actual_depth; + WARN_ON_ONCE(entries != result.actual_depth); + + err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid); + if (err) { + ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n", + cq->cq_idx); + goto err_destroy_cq; + } + + if (cq->eq) { +#ifdef HAVE_XARRAY + err = xa_err(xa_store(&dev->cqs_xa, cq->cq_idx, cq, GFP_KERNEL)); +#else + dev->cqs_arr[cq->cq_idx] = cq; +#endif + if (err) { + ibdev_dbg(ibdev, "Failed to store cq[%u] in xarray\n", + cq->cq_idx); + goto err_remove_mmap; + } + } + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy udata for create_cq\n"); + goto err_xa_erase; + } + } + + ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n", + cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr); + + return 0; + +err_xa_erase: + if (cq->eq) +#ifdef HAVE_XARRAY + xa_erase(&dev->cqs_xa, cq->cq_idx); +#else + dev->cqs_arr[cq->cq_idx] = NULL; +#endif +err_remove_mmap: + efa_cq_user_mmap_entries_remove(cq); +err_destroy_cq: + efa_destroy_cq_idx(dev, cq->cq_idx); +err_free_mapped: + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + +err_out: + atomic64_inc(&dev->stats.create_cq_err); + return err; +} + +#ifndef HAVE_CQ_CORE_ALLOCATION +#ifdef HAVE_CREATE_CQ_NO_UCONTEXT +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +#elif defined(HAVE_CREATE_CQ_ATTR) +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +#endif +{ + struct efa_dev *dev = to_edev(ibdev); + struct efa_cq *cq; + int err; + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + atomic64_inc(&dev->stats.create_cq_err); + return ERR_PTR(-ENOMEM); + } + +#ifdef HAVE_UDATA_TO_DRV_CONTEXT + cq->ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, + ibucontext); +#else + cq->ucontext = to_eucontext(ibucontext); +#endif + + cq->ibcq.device = ibdev; + err = efa_create_cq(&cq->ibcq, attr, udata); + if (err) + goto err_free_cq; + + return &cq->ibcq; + +err_free_cq: + kfree(cq); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + struct ib_block_iter biter; + unsigned int hp_idx = 0; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + rdma_umem_for_each_dma_block(umem, &biter, BIT(hp_shift)) + page_list[hp_idx++] = rdma_block_iter_dma_address(&biter); + + return 0; +} +#elif defined(HAVE_SG_DMA_PAGE_ITER) +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + struct sg_dma_page_iter sg_iter; + unsigned int page_idx = 0; + unsigned int hp_idx = 0; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + if (page_idx % pages_in_hp == 0) { + page_list[hp_idx] = 
sg_page_iter_dma_address(&sg_iter); + hp_idx++; + } + + page_idx++; + } + + return 0; +} +#elif defined(HAVE_UMEM_SCATTERLIST_IF) +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + unsigned int page_idx = 0; + unsigned int pages_in_sg; + unsigned int hp_idx = 0; + struct scatterlist *sg; + unsigned int entry; + unsigned int i; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + if (sg_dma_len(sg) & ~PAGE_MASK) { + ibdev_dbg(&dev->ibdev, + "sg_dma_len[%u] does not divide by PAGE_SIZE[%lu]\n", + sg_dma_len(sg), PAGE_SIZE); + return -EINVAL; + } + + pages_in_sg = sg_dma_len(sg) >> PAGE_SHIFT; + for (i = 0; i < pages_in_sg; i++) { + if (page_idx % pages_in_hp == 0) { + page_list[hp_idx] = sg_dma_address(sg) + + i * PAGE_SIZE; + hp_idx++; + } + + page_idx++; + } + } + + return 0; +} +#endif + +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) +{ + struct scatterlist *sglist; + struct page *pg; + int i; + + sglist = kmalloc_array(page_cnt, sizeof(*sglist), GFP_KERNEL); + if (!sglist) + return NULL; + sg_init_table(sglist, page_cnt); + for (i = 0; i < page_cnt; i++) { + pg = vmalloc_to_page(buf); + if (!pg) + goto err; + sg_set_page(&sglist[i], pg, PAGE_SIZE, 0); + buf += PAGE_SIZE / sizeof(*buf); + } + return sglist; + +err: + kfree(sglist); + return NULL; +} + +/* + * create a chunk list of physical pages dma addresses from the supplied + * scatter gather list + */ +static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages; + struct scatterlist *pages_sgl = pbl->phys.indirect.sgl; + unsigned int chunk_list_size, chunk_idx, payload_idx; + int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt; + struct efa_com_ctrl_buff_info *ctrl_buf; + u64 *cur_chunk_buf, *prev_chunk_buf; +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + struct ib_block_iter biter; +#else + struct scatterlist *sg; + unsigned int entry, payloads_in_sg; +#endif + dma_addr_t dma_addr; + int i; + + /* allocate a chunk list that consists of 4KB chunks */ + chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PTRS_PER_CHUNK); + + chunk_list->size = chunk_list_size; + chunk_list->chunks = kcalloc(chunk_list_size, + sizeof(*chunk_list->chunks), + GFP_KERNEL); + if (!chunk_list->chunks) + return -ENOMEM; + + ibdev_dbg(&dev->ibdev, + "chunk_list_size[%u] - pages[%u]\n", chunk_list_size, + page_cnt); + + /* allocate chunk buffers: */ + for (i = 0; i < chunk_list_size; i++) { + chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_SIZE, GFP_KERNEL); + if (!chunk_list->chunks[i].buf) + goto chunk_list_dealloc; + + chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE; + } + chunk_list->chunks[chunk_list_size - 1].length = + ((page_cnt % EFA_PTRS_PER_CHUNK) * EFA_CHUNK_PAYLOAD_PTR_SIZE) + + EFA_CHUNK_PTR_SIZE; + + /* fill the dma addresses of sg list pages to chunks: */ + chunk_idx = 0; + payload_idx = 0; + cur_chunk_buf = chunk_list->chunks[0].buf; +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt, + EFA_CHUNK_PAYLOAD_SIZE) { + cur_chunk_buf[payload_idx++] = + rdma_block_iter_dma_address(&biter); + + if (payload_idx == EFA_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + payload_idx = 0; + } + } +#else + 
for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) { + payloads_in_sg = sg_dma_len(sg) >> EFA_CHUNK_PAYLOAD_SHIFT; + for (i = 0; i < payloads_in_sg; i++) { + cur_chunk_buf[payload_idx++] = + (sg_dma_address(sg) & ~(EFA_CHUNK_PAYLOAD_SIZE - 1)) + + (EFA_CHUNK_PAYLOAD_SIZE * i); + + if (payload_idx == EFA_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + payload_idx = 0; + } + } + } +#endif + + /* map chunks to dma and fill chunks next ptrs */ + for (i = chunk_list_size - 1; i >= 0; i--) { + dma_addr = dma_map_single(&dev->pdev->dev, + chunk_list->chunks[i].buf, + chunk_list->chunks[i].length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + ibdev_err(&dev->ibdev, + "chunk[%u] dma_map_failed\n", i); + goto chunk_list_unmap; + } + + chunk_list->chunks[i].dma_addr = dma_addr; + ibdev_dbg(&dev->ibdev, + "chunk[%u] mapped at [%pad]\n", i, &dma_addr); + + if (!i) + break; + + prev_chunk_buf = chunk_list->chunks[i - 1].buf; + + ctrl_buf = (struct efa_com_ctrl_buff_info *) + &prev_chunk_buf[EFA_PTRS_PER_CHUNK]; + ctrl_buf->length = chunk_list->chunks[i].length; + + efa_com_set_dma_addr(dma_addr, + &ctrl_buf->address.mem_addr_high, + &ctrl_buf->address.mem_addr_low); + } + + return 0; + +chunk_list_unmap: + for (; i < chunk_list_size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + } +chunk_list_dealloc: + for (i = 0; i < chunk_list_size; i++) + kfree(chunk_list->chunks[i].buf); + + kfree(chunk_list->chunks); + return -ENOMEM; +} + +static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int i; + + for (i = 0; i < chunk_list->size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + kfree(chunk_list->chunks[i].buf); + } + + kfree(chunk_list->chunks); +} + +/* initialize pbl continuous mode: map pbl buffer to a dma address. */ +static int pbl_continuous_initialize(struct efa_dev *dev, + struct pbl_context *pbl) +{ + dma_addr_t dma_addr; + + dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + ibdev_err(&dev->ibdev, "Unable to map pbl to DMA address\n"); + return -ENOMEM; + } + + pbl->phys.continuous.dma_addr = dma_addr; + ibdev_dbg(&dev->ibdev, + "pbl continuous - dma_addr = %pad, size[%u]\n", + &dma_addr, pbl->pbl_buf_size_in_bytes); + + return 0; +} + +/* + * initialize pbl indirect mode: + * create a chunk list out of the dma addresses of the physical pages of + * pbl buffer. 
+ */ +static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) +{ + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE); + struct scatterlist *sgl; + int sg_dma_cnt, err; + + BUILD_BUG_ON(EFA_CHUNK_PAYLOAD_SIZE > PAGE_SIZE); + sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages); + if (!sgl) + return -ENOMEM; + + sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); + if (!sg_dma_cnt) { + err = -EINVAL; + goto err_map; + } + + pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; + pbl->phys.indirect.sgl = sgl; + pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; + err = pbl_chunk_list_create(dev, pbl); + if (err) { + ibdev_dbg(&dev->ibdev, + "chunk_list creation failed[%d]\n", err); + goto err_chunk; + } + + ibdev_dbg(&dev->ibdev, + "pbl indirect - size[%u], chunks[%u]\n", + pbl->pbl_buf_size_in_bytes, + pbl->phys.indirect.chunk_list.size); + + return 0; + +err_chunk: + dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); +err_map: + kfree(sgl); + return err; +} + +static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl) +{ + pbl_chunk_list_destroy(dev, pbl); + dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl, + pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE); + kfree(pbl->phys.indirect.sgl); +} + +/* create a page buffer list from a mapped user memory region */ +static int pbl_create(struct efa_dev *dev, + struct pbl_context *pbl, +#ifdef HAVE_EFA_P2P + struct efa_mr *mr, +#else + struct ib_umem *umem, +#endif + int hp_cnt, + u8 hp_shift) +{ + int err; + + pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE; + pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL); + if (!pbl->pbl_buf) + return -ENOMEM; + + if (is_vmalloc_addr(pbl->pbl_buf)) { + pbl->physically_continuous = 0; +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); + else + err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#else + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#endif + if (err) + goto err_free; + + err = pbl_indirect_initialize(dev, pbl); + if (err) + goto err_free; + } else { + pbl->physically_continuous = 1; +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); + else + err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#else + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#endif + if (err) + goto err_free; + + err = pbl_continuous_initialize(dev, pbl); + if (err) + goto err_free; + } + + ibdev_dbg(&dev->ibdev, + "user_pbl_created: user_pages[%u], continuous[%u]\n", + hp_cnt, pbl->physically_continuous); + + return 0; + +err_free: + kvfree(pbl->pbl_buf); + return err; +} + +static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + if (pbl->physically_continuous) + dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + else + pbl_indirect_terminate(dev, pbl); + + kvfree(pbl->pbl_buf); +} + +static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + + params->inline_pbl = 1; +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, + params->pbl.inline_pbl_array); + else + err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, + params->page_num, params->page_shift); +#else + err = 
umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, + params->page_num, params->page_shift); +#endif + if (err) + return err; + + ibdev_dbg(&dev->ibdev, + "inline_pbl_array - pages[%u]\n", params->page_num); + + return 0; +} + +static int efa_create_pbl(struct efa_dev *dev, + struct pbl_context *pbl, + struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + +#ifdef HAVE_EFA_P2P + err = pbl_create(dev, pbl, mr, params->page_num, + params->page_shift); +#else + err = pbl_create(dev, pbl, mr->umem, params->page_num, + params->page_shift); +#endif + if (err) { + ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err); + return err; + } + + params->inline_pbl = 0; + params->indirect = !pbl->physically_continuous; + if (pbl->physically_continuous) { + params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes; + + efa_com_set_dma_addr(pbl->phys.continuous.dma_addr, + ¶ms->pbl.pbl.address.mem_addr_high, + ¶ms->pbl.pbl.address.mem_addr_low); + } else { + params->pbl.pbl.length = + pbl->phys.indirect.chunk_list.chunks[0].length; + + efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr, + ¶ms->pbl.pbl.address.mem_addr_high, + ¶ms->pbl.pbl.address.mem_addr_low); + } + + return 0; +} + +#ifndef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE +static unsigned long efa_cont_pages(struct ib_umem *umem, + unsigned long page_size_cap, + u64 addr) +{ + unsigned long max_page_shift = fls64(page_size_cap); + struct scatterlist *sg; + u64 base = ~0, p = 0; + unsigned long tmp; + unsigned long m; + u64 len, pfn; + int i = 0; + int entry; + + addr = addr >> PAGE_SHIFT; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, BITS_PER_LONG); + m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE); + pfn = sg_dma_address(sg) >> PAGE_SHIFT; + if (base + p != pfn) { + /* + * If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; + } + + p += len; + i += len; + } + + if (i) + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + else + m = 0; + + return BIT(PAGE_SHIFT + m); +} +#endif + +static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + int supp_access_flags; + struct efa_mr *mr; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + return ERR_PTR(-EINVAL); + } +#endif + + if (udata && udata->inlen && + !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + return ERR_PTR(-EINVAL); + } + + supp_access_flags = + IB_ACCESS_LOCAL_WRITE | + (EFA_DEV_CAP(dev, RDMA_READ) ? 
IB_ACCESS_REMOTE_READ : 0); + +#ifdef HAVE_IB_ACCESS_OPTIONAL + access_flags &= ~IB_ACCESS_OPTIONAL; +#endif + if (access_flags & ~supp_access_flags) { + ibdev_dbg(&dev->ibdev, + "Unsupported access flags[%#x], supported[%#x]\n", + access_flags, supp_access_flags); + return ERR_PTR(-EOPNOTSUPP); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + return mr; +} + +static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, + u64 length, u64 virt_addr, int access_flags) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_com_reg_mr_params params = {}; + struct efa_com_reg_mr_result result = {}; + struct pbl_context pbl; + unsigned int pg_sz; + int inline_size; + int err; + + params.pd = to_epd(ibpd)->pdn; + params.iova = virt_addr; + params.mr_length_in_bytes = length; + params.permissions = access_flags; + +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) { + pg_sz = efa_p2p_get_page_size(dev, mr->p2pmem); + goto skip_umem_pg_sz; + } +#endif + +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->dev_attr.page_size_cap, + virt_addr); + if (!pg_sz) { + ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n", + dev->dev_attr.page_size_cap); + return -EOPNOTSUPP; + } +#else + pg_sz = efa_cont_pages(mr->umem, dev->dev_attr.page_size_cap, + virt_addr); +#endif /* defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE) */ + +#ifdef HAVE_EFA_P2P +skip_umem_pg_sz: +#endif + params.page_shift = order_base_2(pg_sz); +#ifdef HAVE_IB_UMEM_NUM_DMA_BLOCKS +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + params.page_num = DIV_ROUND_UP(length + + (virt_addr & (pg_sz - 1)), + pg_sz); + else + params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); +#else + params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); +#endif +#else + params.page_num = DIV_ROUND_UP(length + (virt_addr & (pg_sz - 1)), + pg_sz); +#endif + + ibdev_dbg(&dev->ibdev, + "start %#llx length %#llx params.page_shift %u params.page_num %u\n", + start, length, params.page_shift, params.page_num); + + inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array); + if (params.page_num <= inline_size) { + err = efa_create_inline_pbl(dev, mr, ¶ms); + if (err) + return err; + + err = efa_com_register_mr(&dev->edev, ¶ms, &result); + if (err) + return err; + } else { + err = efa_create_pbl(dev, &pbl, mr, ¶ms); + if (err) + return err; + + err = efa_com_register_mr(&dev->edev, ¶ms, &result); + pbl_destroy(dev, &pbl); + + if (err) + return err; + } + + mr->ibmr.lkey = result.l_key; + mr->ibmr.rkey = result.r_key; +#ifdef HAVE_IB_MR_LENGTH + mr->ibmr.length = length; +#endif +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) { + mr->p2pmem->lkey = result.l_key; + mr->p2pmem->needs_dereg = true; + } +#endif + ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey); + + return 0; +} + +#ifdef HAVE_MR_DMABUF +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct ib_umem_dmabuf *umem_dmabuf; + struct efa_mr *mr; + int err; + + mr = efa_alloc_mr(ibpd, access_flags, udata); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto err_out; + } + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ibpd->device, start, length, fd, + access_flags); + if (IS_ERR(umem_dmabuf)) { + err = PTR_ERR(umem_dmabuf); + ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%d]\n", err); + goto err_free; + } + + mr->umem = &umem_dmabuf->umem; + err = 
efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags); + if (err) + goto err_release; + + return &mr->ibmr; + +err_release: +#ifndef HAVE_IB_UMEM_DMABUF_PINNED + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + dma_buf_unpin(umem_dmabuf->attach); + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); +#endif + ib_umem_release(mr->umem); +err_free: + kfree(mr); +err_out: + atomic64_inc(&dev->stats.reg_mr_err); + return ERR_PTR(err); +} +#endif + +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_mr *mr; + int err; + + mr = efa_alloc_mr(ibpd, access_flags, udata); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto err_out; + } + +#ifdef HAVE_IB_UMEM_GET_DEVICE_PARAM + mr->umem = ib_umem_get(ibpd->device, start, length, access_flags); +#elif defined(HAVE_IB_UMEM_GET_NO_DMASYNC) + mr->umem = ib_umem_get(udata, start, length, access_flags); +#elif defined(HAVE_IB_UMEM_GET_UDATA) + mr->umem = ib_umem_get(udata, start, length, access_flags, 0); +#else + mr->umem = ib_umem_get(ibpd->uobject->context, start, length, + access_flags, 0); +#endif + if (IS_ERR(mr->umem)) { +#ifdef HAVE_EFA_P2P + mr->p2pmem = efa_p2p_get(dev, mr, start, length); + if (mr->p2pmem) { + /* Avoid referencing an error-pointer later on */ + mr->umem = NULL; + goto reg_mr; + } +#endif + err = PTR_ERR(mr->umem); + ibdev_dbg(&dev->ibdev, + "Failed to pin and map user space memory[%d]\n", err); + goto err_free; + } + +#ifdef HAVE_EFA_P2P +reg_mr: +#endif + err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags); + if (err) + goto err_release; + + return &mr->ibmr; + +err_release: +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + efa_p2p_put(mr->p2pmem->ticket, false); + else + ib_umem_release(mr->umem); +#else + ib_umem_release(mr->umem); +#endif +err_free: + kfree(mr); +err_out: + atomic64_inc(&dev->stats.reg_mr_err); + return ERR_PTR(err); +} + +#ifdef HAVE_DEREG_MR_UDATA +int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +#else +int efa_dereg_mr(struct ib_mr *ibmr) +#endif +{ + struct efa_dev *dev = to_edev(ibmr->device); + struct efa_com_dereg_mr_params params; + struct efa_mr *mr = to_emr(ibmr); + int err; + + ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey); + +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) { + err = efa_p2p_put(mr->p2p_ticket, false); + if (err) + return err; + + kfree(mr); + return 0; + } +#endif + params.l_key = mr->ibmr.lkey; + err = efa_com_dereg_mr(&dev->edev, ¶ms); + if (err) + return err; + +#if defined(HAVE_MR_DMABUF) && !defined(HAVE_IB_UMEM_DMABUF_PINNED) + if (mr->umem->is_dmabuf) { + struct ib_umem_dmabuf *umem_dmabuf; + + umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + dma_buf_unpin(umem_dmabuf->attach); + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + } +#endif + + ib_umem_release(mr->umem); + kfree(mr); + + return 0; +} + +int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) { + ibdev_dbg(ibdev, "Couldn't query port err[%d]\n", err); + return err; + } + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) +{ + struct efa_com_dealloc_uar_params params = { + .uarn = uarn, + }; + + return 
efa_com_dealloc_uar(&dev->edev, ¶ms); +} + +#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \ + (_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \ + NULL : #_attr) + +static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext, + const struct efa_ibv_alloc_ucontext_cmd *cmd) +{ + struct efa_dev *dev = to_edev(ibucontext->device); + char *attr_str; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch, + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str)) + goto err; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR, + attr_str)) + goto err; + + return 0; + +err: + ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n", + attr_str); + return -EOPNOTSUPP; +} + +int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + struct efa_ibv_alloc_ucontext_resp resp = {}; + struct efa_ibv_alloc_ucontext_cmd cmd = {}; + struct efa_com_alloc_uar_result result; + int err; + + /* + * it's fine if the driver does not know all request fields, + * we will ack input fields in our response. + */ + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for alloc_ucontext\n"); + goto err_out; + } + + err = efa_user_comp_handshake(ibucontext, &cmd); + if (err) + goto err_out; + + err = efa_com_alloc_uar(&dev->edev, &result); + if (err) + goto err_out; + + ucontext->uarn = result.uarn; +#ifndef HAVE_CORE_MMAP_XA + mutex_init(&ucontext->lock); + INIT_LIST_HEAD(&ucontext->pending_mmaps); +#endif /* !defined(HAVE_CORE_MMAP_XA) */ + + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH; + resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; + resp.inline_buf_size = dev->dev_attr.inline_buf_size; + resp.max_llq_size = dev->dev_attr.max_llq_size; + resp.max_tx_batch = dev->dev_attr.max_tx_batch; + resp.min_sq_wr = dev->dev_attr.min_sq_depth; + + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) + goto err_dealloc_uar; + + return 0; + +err_dealloc_uar: + efa_dealloc_uar(dev, result.uarn); +err_out: + atomic64_inc(&dev->stats.alloc_ucontext_err); + return err; +} + +#ifndef HAVE_UCONTEXT_CORE_ALLOCATION +struct ib_ucontext *efa_kzalloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibdev); + struct efa_ucontext *ucontext; + int err; + + /* + * it's fine if the driver does not know all request fields, + * we will ack input fields in our response. 
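+ *
+ * Conversely, efa_user_comp_handshake() rejects userspace that has not
+ * acknowledged (via comp_mask) device attributes such as max_tx_batch or
+ * min_sq_depth, returning -EOPNOTSUPP instead of silently ignoring them.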
+ */ + + ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); + if (!ucontext) { + atomic64_inc(&dev->stats.alloc_ucontext_err); + return ERR_PTR(-ENOMEM); + } + + ucontext->ibucontext.device = ibdev; + err = efa_alloc_ucontext(&ucontext->ibucontext, udata); + if (err) + goto err_free_ucontext; + + return &ucontext->ibucontext; + +err_free_ucontext: + kfree(ucontext); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION +void efa_dealloc_ucontext(struct ib_ucontext *ibucontext) +#else +int efa_dealloc_ucontext(struct ib_ucontext *ibucontext) +#endif +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + +#ifndef HAVE_CORE_MMAP_XA + mmap_entries_remove_free(dev, ucontext); +#endif + efa_dealloc_uar(dev, ucontext->uarn); +#ifndef HAVE_UCONTEXT_CORE_ALLOCATION + kfree(ucontext); + + return 0; +#endif +} + +#ifdef HAVE_CORE_MMAP_XA +void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct efa_user_mmap_entry *entry = to_emmap(rdma_entry); + + kfree(entry); +} +#endif + +static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, + struct vm_area_struct *vma) +{ + struct rdma_user_mmap_entry *rdma_entry; + struct efa_user_mmap_entry *entry; + unsigned long va; + int err = 0; + u64 pfn; + + rdma_entry = rdma_user_mmap_entry_get(&ucontext->ibucontext, vma); + if (!rdma_entry) { + ibdev_dbg(&dev->ibdev, + "pgoff[%#lx] does not have valid entry\n", + vma->vm_pgoff); + atomic64_inc(&dev->stats.mmap_err); + return -EINVAL; + } + entry = to_emmap(rdma_entry); + + ibdev_dbg(&dev->ibdev, + "Mapping address[%#llx], length[%#zx], mmap_flag[%d]\n", + entry->address, rdma_entry->npages * PAGE_SIZE, + entry->mmap_flag); + + pfn = entry->address >> PAGE_SHIFT; + switch (entry->mmap_flag) { + case EFA_MMAP_IO_NC: +#ifdef HAVE_CORE_MMAP_XA + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot), + rdma_entry); +#elif defined(HAVE_RDMA_USER_MMAP_IO) + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); +#else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + vma->vm_page_prot); +#endif + break; + case EFA_MMAP_IO_WC: +#ifdef HAVE_CORE_MMAP_XA + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot), + rdma_entry); +#elif defined(HAVE_RDMA_USER_MMAP_IO) + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot)); +#else + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + vma->vm_page_prot); +#endif + break; + case EFA_MMAP_DMA_PAGE: + for (va = vma->vm_start; va < vma->vm_end; + va += PAGE_SIZE, pfn++) { + err = vm_insert_page(vma, va, pfn_to_page(pfn)); + if (err) + break; + } + break; + default: + err = -EINVAL; + } + + if (err) { + ibdev_dbg( + &dev->ibdev, + "Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n", + entry->address, rdma_entry->npages * PAGE_SIZE, + entry->mmap_flag, err); + atomic64_inc(&dev->stats.mmap_err); + } + + rdma_user_mmap_entry_put(rdma_entry); + return err; +} + +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct 
*vma) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + size_t length = vma->vm_end - vma->vm_start; + + ibdev_dbg(&dev->ibdev, + "start %#lx, end %#lx, length = %#zx, pgoff = %#lx\n", + vma->vm_start, vma->vm_end, length, vma->vm_pgoff); + + return __efa_mmap(dev, ucontext, vma); +} + +static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah) +{ + struct efa_com_destroy_ah_params params = { + .ah = ah->ah, + .pdn = to_epd(ah->ibah.pd)->pdn, + }; + + return efa_com_destroy_ah(&dev->edev, ¶ms); +} + +int efa_create_ah(struct ib_ah *ibah, +#ifdef HAVE_CREATE_AH_INIT_ATTR + struct rdma_ah_init_attr *init_attr, +#else + struct rdma_ah_attr *ah_attr, + u32 flags, +#endif + struct ib_udata *udata) +{ +#ifdef HAVE_CREATE_AH_INIT_ATTR + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; +#endif + struct efa_dev *dev = to_edev(ibah->device); + struct efa_com_create_ah_params params = {}; + struct efa_ibv_create_ah_resp resp = {}; + struct efa_com_create_ah_result result; + struct efa_ah *ah = to_eah(ibah); + int err; + +#if defined(HAVE_CREATE_DESTROY_AH_FLAGS) || defined(HAVE_CREATE_AH_INIT_ATTR) +#ifdef HAVE_CREATE_AH_INIT_ATTR + if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) { +#else + if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) { +#endif + ibdev_dbg(&dev->ibdev, + "Create address handle is not supported in atomic context\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n"); + err = -EINVAL; + goto err_out; + } + + memcpy(params.dest_addr, ah_attr->grh.dgid.raw, + sizeof(params.dest_addr)); + params.pdn = to_epd(ibah->pd)->pdn; + err = efa_com_create_ah(&dev->edev, ¶ms, &result); + if (err) + goto err_out; + + memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id)); + ah->ah = result.ah; + + resp.efa_address_handle = result.ah; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for create_ah response\n"); + goto err_destroy_ah; + } + } + ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah); + + return 0; + +err_destroy_ah: + efa_ah_destroy(dev, ah); +err_out: + atomic64_inc(&dev->stats.create_ah_err); + return err; +} + +#ifndef HAVE_AH_CORE_ALLOCATION +#ifdef HAVE_CREATE_DESTROY_AH_FLAGS +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + u32 flags, + struct ib_udata *udata) +#elif defined(HAVE_CREATE_AH_RDMA_ATTR) +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +#endif +{ + struct efa_ah *ah; + int err; +#ifndef HAVE_CREATE_DESTROY_AH_FLAGS + u32 flags = 0; +#endif + + ah = kzalloc(sizeof(*ah), GFP_KERNEL); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->ibah.device = ibpd->device; + ah->ibah.pd = ibpd; + err = efa_create_ah(&ah->ibah, ah_attr, flags, udata); + if (err) + goto err_free; + + return &ah->ibah; + +err_free: + kfree(ah); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC +int efa_destroy_ah(struct ib_ah *ibah, u32 flags) +#elif defined(HAVE_AH_CORE_ALLOCATION) +void efa_destroy_ah(struct ib_ah *ibah, u32 flags) +#elif defined(HAVE_CREATE_DESTROY_AH_FLAGS) +int efa_destroy_ah(struct ib_ah *ibah, u32 flags) +#else 
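+/* Oldest supported kernels: no destroy flags and no core AH allocation, so
+ * efa_destroy_ah() below must also free the AH that efa_kzalloc_ah() allocated.
+ */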
+int efa_destroy_ah(struct ib_ah *ibah) +#endif +{ + struct efa_dev *dev = to_edev(ibah->pd->device); + struct efa_ah *ah = to_eah(ibah); +#if !defined(HAVE_AH_CORE_ALLOCATION) && !defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) + int err; +#endif + + ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah); + +#if defined(HAVE_CREATE_DESTROY_AH_FLAGS) + if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) { + ibdev_dbg(&dev->ibdev, + "Destroy address handle is not supported in atomic context\n"); +#if defined(HAVE_AH_CORE_ALLOCATION) && !defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) + return; +#else + return -EOPNOTSUPP; +#endif + } +#endif + +#if defined(HAVE_AH_CORE_ALLOCATION) || defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) + efa_ah_destroy(dev, ah); +#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC + return 0; +#endif +#else + err = efa_ah_destroy(dev, ah); + if (err) + return err; + kfree(ah); + return 0; +#endif +} + +#ifdef HAVE_SPLIT_STATS_ALLOC +struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, + port_t port_num) +{ + return rdma_alloc_hw_stats_struct(efa_port_stats_descs, + ARRAY_SIZE(efa_port_stats_descs), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev) +{ + return rdma_alloc_hw_stats_struct(efa_device_stats_descs, + ARRAY_SIZE(efa_device_stats_descs), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} +#else +struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num) +{ + if (port_num) + return rdma_alloc_hw_stats_struct(efa_port_stats_descs, + ARRAY_SIZE(efa_port_stats_descs), + RDMA_HW_STATS_DEFAULT_LIFESPAN); + else + return rdma_alloc_hw_stats_struct(efa_device_stats_descs, + ARRAY_SIZE(efa_device_stats_descs), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} +#endif + +static int efa_fill_device_stats(struct efa_dev *dev, + struct rdma_hw_stats *stats) +{ + struct efa_com_stats_admin *as = &dev->edev.aq.stats; + struct efa_stats *s = &dev->stats; + + stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd); + stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd); + stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err); + stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion); + + stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd); + stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err); + stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err); + stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err); + stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err); + stats->value[EFA_ALLOC_UCONTEXT_ERR] = + atomic64_read(&s->alloc_ucontext_err); + stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err); + stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err); + + return ARRAY_SIZE(efa_device_stats_descs); +} + +static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats, + port_t port_num) +{ + struct efa_com_get_stats_params params = {}; + union efa_com_get_stats_result result; + struct efa_com_rdma_read_stats *rrs; + struct efa_com_messages_stats *ms; + struct efa_com_basic_stats *bs; + int err; + + params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL; + params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC; + + err = efa_com_get_stats(&dev->edev, ¶ms, &result); + if (err) + return err; + + bs = &result.basic_stats; + stats->value[EFA_TX_BYTES] = bs->tx_bytes; + stats->value[EFA_TX_PKTS] = bs->tx_pkts; + stats->value[EFA_RX_BYTES] = bs->rx_bytes; + stats->value[EFA_RX_PKTS] = bs->rx_pkts; + 
stats->value[EFA_RX_DROPS] = bs->rx_drops; + + params.type = EFA_ADMIN_GET_STATS_TYPE_MESSAGES; + err = efa_com_get_stats(&dev->edev, ¶ms, &result); + if (err) + return err; + + ms = &result.messages_stats; + stats->value[EFA_SEND_BYTES] = ms->send_bytes; + stats->value[EFA_SEND_WRS] = ms->send_wrs; + stats->value[EFA_RECV_BYTES] = ms->recv_bytes; + stats->value[EFA_RECV_WRS] = ms->recv_wrs; + + params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_READ; + err = efa_com_get_stats(&dev->edev, ¶ms, &result); + if (err) + return err; + + rrs = &result.rdma_read_stats; + stats->value[EFA_RDMA_READ_WRS] = rrs->read_wrs; + stats->value[EFA_RDMA_READ_BYTES] = rrs->read_bytes; + stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err; + stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes; + + return ARRAY_SIZE(efa_port_stats_descs); +} + +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + port_t port_num, int index) +{ + if (port_num) + return efa_fill_port_stats(to_edev(ibdev), stats, port_num); + else + return efa_fill_device_stats(to_edev(ibdev), stats); +} + +#ifndef HAVE_NO_KVERBS_DRIVERS +#ifdef HAVE_POST_CONST_WR +int efa_post_send(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +#else +int efa_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +#endif +{ + struct efa_dev *dev = to_edev(ibqp->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return -EOPNOTSUPP; +} + +#ifdef HAVE_POST_CONST_WR +int efa_post_recv(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +#else +int efa_post_recv(struct ib_qp *ibqp, + struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +#endif +{ + struct efa_dev *dev = to_edev(ibqp->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return -EOPNOTSUPP; +} + +int efa_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + struct efa_dev *dev = to_edev(ibcq->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return -EOPNOTSUPP; +} + +int efa_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags flags) +{ + struct efa_dev *dev = to_edev(ibcq->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return -EOPNOTSUPP; +} + +struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + struct efa_dev *dev = to_edev(ibpd->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return ERR_PTR(-EOPNOTSUPP); +} +#endif + +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, + port_t port_num) +{ + return IB_LINK_LAYER_UNSPECIFIED; +} + diff --git a/drivers/amazon/net/efa/kcompat.h b/drivers/amazon/net/efa/kcompat.h new file mode 100644 index 0000000000000..713dcc00b394c --- /dev/null +++ b/drivers/amazon/net/efa/kcompat.h @@ -0,0 +1,243 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#include + +#include "config.h" + +#ifndef HAVE_IB_IS_UDATA_CLEARED +#include +#include +#include + +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + const void __user *p = udata->inbuf + offset; + bool ret = false; + u8 *buf; + + if (len > USHRT_MAX) + return false; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + + if (copy_from_user(buf, p, len)) + goto free; + + ret = !memchr_inv(buf, 0, len); + +free: + kfree(buf); + return ret; +} +#endif + +#ifndef HAVE_IB_QPT_DRIVER +#define IB_QPT_DRIVER 0xFF +#endif + +#if defined(HAVE_DRIVER_ID) && !defined(HAVE_UPSTREAM_EFA) +#define RDMA_DRIVER_EFA 17 +#endif + +#ifndef HAVE_IBDEV_PRINT +#define ibdev_err(_ibdev, format, arg...) \ + dev_err(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_dbg(_ibdev, format, arg...) \ + dev_dbg(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_warn(_ibdev, format, arg...) \ + dev_warn(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_info(_ibdev, format, arg...) \ + dev_info(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#endif + +#ifndef HAVE_IBDEV_PRINT_RATELIMITED +#define ibdev_err_ratelimited(_ibdev, format, arg...) \ + dev_err_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_dbg_ratelimited(_ibdev, format, arg...) \ + dev_dbg_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_warn_ratelimited(_ibdev, format, arg...) \ + dev_warn_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_info_ratelimited(_ibdev, format, arg...) \ + dev_info_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#endif + +#ifndef HAVE_KVZALLOC +#include +#include + +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + void *addr; + + addr = kzalloc(size, flags | __GFP_NOWARN); + if (addr) + return addr; + + return vzalloc(size); +} +#endif + +#ifndef HAVE_IB_PORT_PHYS_STATE_LINK_UP +#define IB_PORT_PHYS_STATE_LINK_UP 5 +#endif + +#ifndef HAVE_CORE_MMAP_XA +#include +#include + +struct rdma_user_mmap_entry { + struct ib_ucontext *ucontext; + unsigned long start_pgoff; + size_t npages; +}; + +/* Return the offset (in bytes) the user should pass to libc's mmap() */ +static inline u64 +rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry) +{ + return (u64)entry->start_pgoff << PAGE_SHIFT; +} + +/* + * Backported kernels don't keep refcnt on entries, hence they should not + * be removed. 
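+ * The remove/put helpers below are therefore empty stubs; the driver keeps
+ * its own list of entries and frees them when the ucontext is deallocated
+ * (see mmap_entries_remove_free()).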
+ */ +static inline void +rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry) +{ +} + +static inline void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry) +{ +} +#endif + +#ifndef sizeof_field +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#endif + +#ifndef HAVE_BITFIELD_H +#define __bf_shf(x) (__builtin_ffsll(x) - 1) + +#define FIELD_PREP(_mask, _val) \ + ({ \ + ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ + }) + +#define FIELD_GET(_mask, _reg) \ + ({ \ + (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ + }) +#endif + +#ifndef HAVE_RDMA_NODE_UNSPECIFIED +enum { + RDMA_NODE_UNSPECIFIED = 7, +}; +#endif + +#ifndef HAVE_ATOMIC64_FETCH_INC +static __always_inline s64 +atomic64_fetch_inc(atomic64_t *v) +{ + return atomic64_inc_return(v) - 1; +} +#endif + +#if !defined(HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK) && defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE) +#include + +static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, + struct ib_umem *umem, + unsigned long pgsz) +{ + __rdma_block_iter_start(biter, umem->sg_head.sgl, umem->nmap, pgsz); +} + +/** + * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem + * @umem: umem to iterate over + * @pgsz: Page size to split the list into + * + * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The + * returned DMA blocks will be aligned to pgsz and span the range: + * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz) + * + * Performs exactly ib_umem_num_dma_blocks() iterations. + */ +#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \ + for (__rdma_umem_block_iter_start(biter, umem, pgsz); \ + __rdma_block_iter_next(biter);) +#endif + +#ifdef HAVE_U32_PORT +typedef u32 port_t; +#else +typedef u8 port_t; +#endif + +#if defined(HAVE_MR_DMABUF) && !defined(HAVE_IB_UMEM_DMABUF_PINNED) +#include +#include +#include + +static inline void +ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) +{ + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; + + ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev, + "Invalidate callback should not be called when memory is pinned\n"); +} + +static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { + .allow_peer2peer = true, + .move_notify = ib_umem_dmabuf_unsupported_move_notify, +}; + +static inline +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + struct ib_umem_dmabuf *umem_dmabuf; + int err; + + umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); + if (IS_ERR(umem_dmabuf)) + return umem_dmabuf; + + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + err = dma_buf_pin(umem_dmabuf->attach); + if (err) + goto err_release; + + err = ib_umem_dmabuf_map_pages(umem_dmabuf); + if (err) + goto err_unpin; + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + + return umem_dmabuf; + +err_unpin: + dma_buf_unpin(umem_dmabuf->attach); +err_release: + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + ib_umem_release(&umem_dmabuf->umem); + return ERR_PTR(err); +} +#endif /* !HAVE_IB_UMEM_DMABUF_PINNED */ + +#endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/efa/neuron_p2p.h b/drivers/amazon/net/efa/neuron_p2p.h new file mode 100644 index 0000000000000..a1ce44003463f --- /dev/null +++ b/drivers/amazon/net/efa/neuron_p2p.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 
Copyright 2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __NEURON_P2P_H__ +#define __NEURON_P2P_H__ + +struct neuron_p2p_page_info { + u64 physical_address; // PA's that map to the VA (page aligned as defined in va_info) + u32 page_count; // page count each page is shift_page_size size +}; + +struct neuron_p2p_va_info { + void *virtual_address; // Virtual address for which the PA's need to be obtained + u64 size; // The actual size of the memory pointed by the virtual_address + u32 shift_page_size; // log2 of the page size + u32 device_index; // Neuron Device index. + u32 entries; // Number of page_info entries + struct neuron_p2p_page_info page_info[]; +}; + +/** Given the virtual address and length returns the physical address + * + * @param[in] virtual_address - Virtual address of device memory + * @param[in] length - Length of the memory + * @param[out] va_info - Set of physical addresses + * @param[in] free_callback - Callback function to be called. This will be called with a lock held. + * @param[in] data - Data to be used for the callback + * + * @return 0 - Success. + */ +int neuron_p2p_register_va(u64 virtual_address, u64 length, struct neuron_p2p_va_info **vainfo, void (*free_callback) (void *data), void *data); + +/** Give the pa, release the pa from being used by third-party device + * + * @param[in] va_info - Set of physical addresses + * + * @return 0 - Success. + */ +int neuron_p2p_unregister_va(struct neuron_p2p_va_info *vainfo); + +#endif diff --git a/drivers/amazon/net/efa/nv-p2p.h b/drivers/amazon/net/efa/nv-p2p.h new file mode 100644 index 0000000000000..d74e024963d5a --- /dev/null +++ b/drivers/amazon/net/efa/nv-p2p.h @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _NV_P2P_H_ +#define _NV_P2P_H_ + +/* + * NVIDIA P2P Structure Versioning + * + * For the nvidia_p2p_*_t structures allocated by the NVIDIA driver, it will + * set the version field of the structure according to the definition used by + * the NVIDIA driver. The "major" field of the version is defined as the upper + * 16 bits, and the "minor" field of the version is defined as the lower 16 + * bits. The version field will always be the first 4 bytes of the structure, + * and third-party drivers should check the value of this field in structures + * allocated by the NVIDIA driver to ensure runtime compatibility. 
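+ *
+ * For example (illustrative), a version value of 0x00010002 encodes major
+ * version 1 (upper 16 bits) and minor version 2 (lower 16 bits).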
+ * + * In general, version numbers will be incremented as follows: + * - When a backwards-compatible change is made to the structure layout, the + * minor version for that structure will be incremented. Third-party drivers + * built against an older minor version will continue to work with the newer + * minor version used by the NVIDIA driver, without recompilation. + * - When a breaking change is made to the structure layout, the major version + * will be incremented. Third-party drivers built against an older major + * version require at least recompilation and potentially additional updates + * to use the new API. + */ +#define NVIDIA_P2P_MAJOR_VERSION_MASK 0xffff0000 +#define NVIDIA_P2P_MINOR_VERSION_MASK 0x0000ffff + +#define NVIDIA_P2P_MAJOR_VERSION(v) \ + (((v) & NVIDIA_P2P_MAJOR_VERSION_MASK) >> 16) + +#define NVIDIA_P2P_MINOR_VERSION(v) \ + (((v) & NVIDIA_P2P_MINOR_VERSION_MASK)) + +#define NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) \ + (NVIDIA_P2P_MAJOR_VERSION((p)->version) == NVIDIA_P2P_MAJOR_VERSION(v)) + +#define NVIDIA_P2P_VERSION_COMPATIBLE(p, v) \ + (NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) && \ + (NVIDIA_P2P_MINOR_VERSION((p)->version) >= (NVIDIA_P2P_MINOR_VERSION(v)))) + +enum { + NVIDIA_P2P_ARCHITECTURE_TESLA = 0, + NVIDIA_P2P_ARCHITECTURE_FERMI, + NVIDIA_P2P_ARCHITECTURE_CURRENT = NVIDIA_P2P_ARCHITECTURE_FERMI +}; + +#define NVIDIA_P2P_PARAMS_VERSION 0x00010001 + +enum { + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_GPU = 0, + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE, + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX = \ + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE +}; + +typedef +struct nvidia_p2p_params { + u32 version; + u32 architecture; + union nvidia_p2p_mailbox_addresses { + struct { + u64 wmb_addr; + u64 wmb_data; + u64 rreq_addr; + u64 rcomp_addr; + u64 reserved[2]; + } fermi; + } addresses[NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX+1]; +} nvidia_p2p_params_t; + +/* + * @brief + * Initializes a third-party P2P mapping between an NVIDIA + * GPU and a third-party device. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in,out] params + * A pointer to a structure with P2P mapping parameters. + * @param[in] destroy_callback + * A pointer to the function to be invoked when the P2P mapping + * is destroyed implictly. + * @param[in] data + * An opaque pointer to private data to be passed to the + * callback function. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested configuration is not supported. + * -ENOMEM if the driver failed to allocate memory. + * -EBUSY if the mapping has already been initialized. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_init_mapping(u64 p2p_token, + struct nvidia_p2p_params *params, + void (*destroy_callback)(void *data), + void *data); + +/* + * @brief + * Tear down a previously initialized third-party P2P mapping. + * + * @param[in] p2p_token + * A token that uniquely identifies the mapping. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested configuration is not supported. + * -ENOMEM if the driver failed to allocate memory. 
+ */ +int nvidia_p2p_destroy_mapping(u64 p2p_token); + +enum nvidia_p2p_page_size_type { + NVIDIA_P2P_PAGE_SIZE_4KB = 0, + NVIDIA_P2P_PAGE_SIZE_64KB, + NVIDIA_P2P_PAGE_SIZE_128KB, + NVIDIA_P2P_PAGE_SIZE_COUNT +}; + +typedef +struct nvidia_p2p_page { + u64 physical_address; + union nvidia_p2p_request_registers { + struct { + u32 wreqmb_h; + u32 rreqmb_h; + u32 rreqmb_0; + u32 reserved[3]; + } fermi; + } registers; +} nvidia_p2p_page_t; + +#define NVIDIA_P2P_PAGE_TABLE_VERSION 0x00010002 + +#define NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_PAGE_TABLE_VERSION) + +typedef +struct nvidia_p2p_page_table { + u32 version; + u32 page_size; /* enum nvidia_p2p_page_size_type */ + struct nvidia_p2p_page **pages; + u32 entries; + u8 *gpu_uuid; +} nvidia_p2p_page_table_t; + +/* + * @brief + * Make the pages underlying a range of GPU virtual memory + * accessible to a third-party device. + * + * This API only supports pinned, GPU-resident memory, such as that provided + * by cudaMalloc(). + * + * This API may sleep. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in] va_space + * A GPU virtual address space qualifier. + * @param[in] virtual_address + * The start address in the specified virtual address space. + * Address must be aligned to the 64KB boundary. + * @param[in] length + * The length of the requested P2P mapping. + * Length must be a multiple of 64KB. + * @param[out] page_table + * A pointer to an array of structures with P2P PTEs. + * @param[in] free_callback + * A non-NULL pointer to the function to be invoked when the pages + * underlying the virtual address range are freed + * implicitly. Must be non NULL. + * @param[in] data + * A non-NULL opaque pointer to private data to be passed to the + * callback function. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -ENOMEM if the driver failed to allocate memory or if + * insufficient resources were available to complete the operation. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_get_pages(u64 p2p_token, u32 va_space, + u64 virtual_address, + u64 length, + struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), + void *data); + +#define NVIDIA_P2P_DMA_MAPPING_VERSION 0x00020003 + +#define NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_DMA_MAPPING_VERSION) + +struct pci_dev; + +typedef +struct nvidia_p2p_dma_mapping { + u32 version; + enum nvidia_p2p_page_size_type page_size_type; + u32 entries; + u64 *dma_addresses; + void *private; + struct pci_dev *pci_dev; +} nvidia_p2p_dma_mapping_t; + +/* + * @brief + * Make the physical pages retrieved using nvidia_p2p_get_pages accessible to + * a third-party device. + * + * @param[in] peer + * The struct pci_dev * of the peer device that needs to DMA to/from the + * mapping. + * @param[in] page_table + * The page table outlining the physical pages underlying the mapping, as + * retrieved with nvidia_p2p_get_pages(). + * @param[out] dma_mapping + * The DMA mapping containing the DMA addresses to use on the third-party + * device. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -EIO if an unknown error occurred. 
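+ *
+ * A typical consumer flow (illustrative) is: nvidia_p2p_get_pages() ->
+ * nvidia_p2p_dma_map_pages() -> DMA using the returned dma_addresses[] ->
+ * nvidia_p2p_dma_unmap_pages() -> nvidia_p2p_put_pages().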
+ */ +int nvidia_p2p_dma_map_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + +/* + * @brief + * Unmap the physical pages previously mapped to the third-party device by + * nvidia_p2p_dma_map_pages(). + * + * @param[in] peer + * The struct pci_dev * of the peer device that the DMA mapping belongs to. + * @param[in] page_table + * The page table backing the DMA mapping to be unmapped. + * @param[in] dma_mapping + * The DMA mapping containing the DMA addresses used by the third-party + * device, as retrieved with nvidia_p2p_dma_map_pages(). After this call + * returns, neither this struct nor the addresses contained within will be + * valid for use by the third-party device. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_dma_unmap_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); + +/* + * @brief + * Release a set of pages previously made accessible to + * a third-party device. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in] va_space + * A GPU virtual address space qualifier. + * @param[in] virtual_address + * The start address in the specified virtual address space. + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_put_pages(u64 p2p_token, u32 va_space, + u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + +/* + * @brief + * Free a third-party P2P page table. (This function is a no-op.) + * + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + */ +int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table); + +/* + * @brief + * Free a third-party P2P DMA mapping. (This function is a no-op.) + * + * @param[in] dma_mapping + * A pointer to the DMA mapping structure. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + */ +int nvidia_p2p_free_dma_mapping(struct nvidia_p2p_dma_mapping *dma_mapping); + +#define NVIDIA_P2P_RSYNC_DRIVER_VERSION 0x00010001 + +#define NVIDIA_P2P_RSYNC_DRIVER_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_DRIVER_VERSION) + +typedef +struct nvidia_p2p_rsync_driver { + u32 version; + int (*get_relaxed_ordering_mode)(int *mode, void *data); + void (*put_relaxed_ordering_mode)(int mode, void *data); + void (*wait_for_rsync)(struct pci_dev *gpu, void *data); +} nvidia_p2p_rsync_driver_t; + +/* + * @brief + * Registers the rsync driver. + * + * @param[in] driver + * A pointer to the rsync driver structure. The NVIDIA driver would use, + * + * get_relaxed_ordering_mode to obtain a reference to the current relaxed + * ordering mode (treated as a boolean) from the rsync driver. + * + * put_relaxed_ordering_mode to release a reference to the current relaxed + * ordering mode back to the rsync driver. The NVIDIA driver will call this + * function once for each successful call to get_relaxed_ordering_mode, and + * the relaxed ordering mode must not change until the last reference is + * released. 
+ * + * wait_for_rsync to call into the rsync module to issue RSYNC. This callback + * can't sleep or re-schedule as it may arrive under spinlocks. + * @param[in] data + * A pointer to the rsync driver's private data. + * + * @Returns + * 0 upon successful completion. + * -EINVAL parameters are incorrect. + * -EBUSY if a module is already registered or GPU devices are in use. + */ +int nvidia_p2p_register_rsync_driver(nvidia_p2p_rsync_driver_t *driver, + void *data); + +/* + * @brief + * Unregisters the rsync driver. + * + * @param[in] driver + * A pointer to the rsync driver structure. + * @param[in] data + * A pointer to the rsync driver's private data. + */ +void nvidia_p2p_unregister_rsync_driver(nvidia_p2p_rsync_driver_t *driver, + void *data); + +#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION 0x00020001 + +#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_REG_INFO_VERSION) + +typedef struct nvidia_p2p_rsync_reg { + void *ptr; + size_t size; + struct pci_dev *ibmnpu; + struct pci_dev *gpu; + u32 cluster_id; + u32 socket_id; +} nvidia_p2p_rsync_reg_t; + +typedef struct nvidia_p2p_rsync_reg_info { + u32 version; + nvidia_p2p_rsync_reg_t *regs; + size_t entries; +} nvidia_p2p_rsync_reg_info_t; + +/* + * @brief + * Gets rsync (GEN-ID) register information associated with the supported + * NPUs. + * + * The caller would use the returned information {GPU device, NPU device, + * socket-id, cluster-id} to pick the optimal generation registers to issue + * RSYNC (NVLink HW flush). + * + * The interface allocates structures to return the information, hence + * nvidia_p2p_put_rsync_registers() must be called to free the structures. + * + * Note, cluster-id is hardcoded to zero as early system configurations would + * only support cluster mode i.e. all devices would share the same cluster-id + * (0). In the future, appropriate kernel support would be needed to query + * cluster-ids. + * + * @param[out] reg_info + * A pointer to the rsync reg info structure. + * + * @Returns + * 0 Upon successful completion. Otherwise, returns negative value. + */ +int nvidia_p2p_get_rsync_registers(nvidia_p2p_rsync_reg_info_t **reg_info); + +/* + * @brief + * Frees the structures allocated by nvidia_p2p_get_rsync_registers(). + * + * @param[in] reg_info + * A pointer to the rsync reg info structure. + */ +void nvidia_p2p_put_rsync_registers(nvidia_p2p_rsync_reg_info_t *reg_info); + +#endif /* _NV_P2P_H_ */ diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile new file mode 100644 index 0000000000000..b61366782d8d6 --- /dev/null +++ b/drivers/amazon/net/ena/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for the Elastic Network Adapter (ENA) device drivers. +# ENA Source is: https://github.com/amzn/amzn-drivers. +# Current ENA source is based on ena_linux_2.4.0 tag. +# + +obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o + +ena-y := ena_netdev.o ena_ethtool.o ena_lpc.o ena_phc.o ena_xdp.o dim.o \ + ena_devlink.o net_dim.o ena_com.o ena_eth_com.o + +ena-$(CONFIG_SYSFS) += ena_sysfs.o + +ifdef TEST_AF_XDP + ccflags-y += -DENA_TEST_AF_XDP +endif + +ifdef ENA_PHC_INCLUDE + ccflags-y += -DENA_PHC_INCLUDE +endif diff --git a/drivers/amazon/net/ena/dim.c b/drivers/amazon/net/ena/dim.c new file mode 100644 index 0000000000000..1b200be4b3709 --- /dev/null +++ b/drivers/amazon/net/ena/dim.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include "dim.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +bool dim_on_top(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + return true; + case DIM_GOING_RIGHT: + return (dim->steps_left > 1) && (dim->steps_right == 1); + default: /* DIM_GOING_LEFT */ + return (dim->steps_right > 1) && (dim->steps_left == 1); + } +} + +void dim_turn(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + dim->tune_state = DIM_GOING_LEFT; + dim->steps_left = 0; + break; + case DIM_GOING_LEFT: + dim->tune_state = DIM_GOING_RIGHT; + dim->steps_right = 0; + break; + } +} + +void dim_park_on_top(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tired = 0; + dim->tune_state = DIM_PARKING_ON_TOP; +} + +void dim_park_tired(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tune_state = DIM_PARKING_TIRED; +} + +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats) +{ + /* u32 holds up to 71 minutes, should be enough */ + u32 delta_us = ktime_us_delta(end->time, start->time); + u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr); + u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr, + start->byte_ctr); + u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr, + start->comp_ctr); + + if (!delta_us) + return; + + curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us); + curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us); + curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC, + delta_us); + curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us); + if (curr_stats->epms != 0) + curr_stats->cpe_ratio = DIV_ROUND_DOWN_ULL( + curr_stats->cpms * 100, curr_stats->epms); + else + curr_stats->cpe_ratio = 0; + +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ diff --git a/drivers/amazon/net/ena/dim.h b/drivers/amazon/net/ena/dim.h new file mode 100644 index 0000000000000..633c2473e73ad --- /dev/null +++ b/drivers/amazon/net/ena/dim.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef DIM_H +#define DIM_H + +#include +#include "kcompat.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +/** + * Number of events between DIM iterations. + * Causes a moderation of the algorithm run. + */ +#define DIM_NEVENTS 64 + +/** + * Is a difference between values justifies taking an action. + * We consider 10% difference as significant. + */ +#define IS_SIGNIFICANT_DIFF(val, ref) \ + (((100UL * abs((val) - (ref))) / (ref)) > 10) + +/** + * Calculate the gap between two values. + * Take wrap-around and variable size into consideration. + */ +#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \ + & (BIT_ULL(bits) - 1)) + +/** + * Structure for CQ moderation values. + * Used for communications between DIM and its consumer. + * + * @usec: CQ timer suggestion (by DIM) + * @pkts: CQ packet counter suggestion (by DIM) + * @cq_period_mode: CQ priod count mode (from CQE/EQE) + */ +struct dim_cq_moder { + u16 usec; + u16 pkts; + u16 comps; + u8 cq_period_mode; +}; + +/** + * Structure for DIM sample data. + * Used for communications between DIM and its consumer. 
+ * + * @time: Sample timestamp + * @pkt_ctr: Number of packets + * @byte_ctr: Number of bytes + * @event_ctr: Number of events + */ +struct dim_sample { + ktime_t time; + u32 pkt_ctr; + u32 byte_ctr; + u16 event_ctr; + u32 comp_ctr; +}; + +/** + * Structure for DIM stats. + * Used for holding current measured rates. + * + * @ppms: Packets per msec + * @bpms: Bytes per msec + * @epms: Events per msec + */ +struct dim_stats { + int ppms; /* packets per msec */ + int bpms; /* bytes per msec */ + int epms; /* events per msec */ + int cpms; /* completions per msec */ + int cpe_ratio; /* ratio of completions to events */ +}; + +/** + * Main structure for dynamic interrupt moderation (DIM). + * Used for holding all information about a specific DIM instance. + * + * @state: Algorithm state (see below) + * @prev_stats: Measured rates from previous iteration (for comparison) + * @start_sample: Sampled data at start of current iteration + * @work: Work to perform on action required + * @priv: A pointer to the struct that points to dim + * @profile_ix: Current moderation profile + * @mode: CQ period count mode + * @tune_state: Algorithm tuning state (see below) + * @steps_right: Number of steps taken towards higher moderation + * @steps_left: Number of steps taken towards lower moderation + * @tired: Parking depth counter + */ +struct dim { + u8 state; + struct dim_stats prev_stats; + struct dim_sample start_sample; + struct dim_sample measuring_sample; + struct work_struct work; + void *priv; + u8 profile_ix; + u8 mode; + u8 tune_state; + u8 steps_right; + u8 steps_left; + u8 tired; +}; + +/** + * enum dim_cq_period_mode + * + * These are the modes for CQ period count. + * + * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE + * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset) + * @DIM_CQ_PERIOD_NUM_MODES: Number of modes + */ +enum { + DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, + DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, + DIM_CQ_PERIOD_NUM_MODES +}; + +/** + * enum dim_state + * + * These are the DIM algorithm states. + * These will determine if the algorithm is in a valid state to start an iteration. + * + * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile) + * @DIM_MEASURE_IN_PROGRESS: Algorithm is already in progress - check if + * need to perform an action + * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure + */ +enum { + DIM_START_MEASURE, + DIM_MEASURE_IN_PROGRESS, + DIM_APPLY_NEW_PROFILE, +}; + +/** + * enum dim_tune_state + * + * These are the DIM algorithm tune states. + * These will determine which action the algorithm should perform. + * + * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference + * @DIM_PARKING_TIRED: Algorithm found a deep top point - don't exit if tired > 0 + * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels + * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels + */ +enum { + DIM_PARKING_ON_TOP, + DIM_PARKING_TIRED, + DIM_GOING_RIGHT, + DIM_GOING_LEFT, +}; + +/** + * enum dim_stats_state + * + * These are the DIM algorithm statistics states. + * These will determine the verdict of current iteration. 
+ *
+ * @DIM_STATS_WORSE: Current iteration shows worse performance than before
+ * @DIM_STATS_SAME: Current iteration shows the same performance as before
+ * @DIM_STATS_BETTER: Current iteration shows better performance than before
+ */
+enum {
+	DIM_STATS_WORSE,
+	DIM_STATS_SAME,
+	DIM_STATS_BETTER,
+};
+
+/**
+ * enum dim_step_result
+ *
+ * These are the DIM algorithm step results.
+ * These describe the result of a step.
+ *
+ * @DIM_STEPPED: Performed a regular step
+ * @DIM_TOO_TIRED: Same kind of step was done multiple times - should go to
+ * tired parking
+ * @DIM_ON_EDGE: Stepped to the most left/right profile
+ */
+enum {
+	DIM_STEPPED,
+	DIM_TOO_TIRED,
+	DIM_ON_EDGE,
+};
+
+/**
+ * dim_on_top - check if current state is a good place to stop (top location)
+ * @dim: DIM context
+ *
+ * Check if the current profile is a good place to park at.
+ * This will result in reducing the DIM check frequency, as we assume we
+ * probably shouldn't change profiles unless the traffic pattern changes.
+ */
+bool dim_on_top(struct dim *dim);
+
+/**
+ * dim_turn - change profile altering direction
+ * @dim: DIM context
+ *
+ * Go left if we were going right and vice-versa.
+ * Do nothing if currently parking.
+ */
+void dim_turn(struct dim *dim);
+
+/**
+ * dim_park_on_top - enter a parking state on a top location
+ * @dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history.
+ */
+void dim_park_on_top(struct dim *dim);
+
+/**
+ * dim_park_tired - enter a tired parking state
+ * @dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history and reduce the DIM check frequency.
+ */
+void dim_park_tired(struct dim *dim);
+
+/**
+ * dim_calc_stats - calculate the difference between two samples
+ * @start: start sample
+ * @end: end sample
+ * @curr_stats: delta between samples
+ *
+ * Calculate the delta between two samples (in data rates).
+ * Takes into consideration counter wrap-around.
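+ *
+ * For example (illustrative values): over a window of delta_us = 2000 with
+ * 5000 packets and DIM_NEVENTS = 64 events, the result is ppms = 2500 and
+ * epms = 32.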
+ */ +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats); + +/** + * dim_update_sample - set a sample's fields with give values + * @event_ctr: number of events to set + * @packets: number of packets to set + * @bytes: number of bytes to set + * @s: DIM sample + */ +static inline void +dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s) +{ + s->time = ktime_get(); + s->pkt_ctr = packets; + s->byte_ctr = bytes; + s->event_ctr = event_ctr; +} + +/** + * dim_update_sample_with_comps - set a sample's fields with given + * values including the completion parameter + * @event_ctr: number of events to set + * @packets: number of packets to set + * @bytes: number of bytes to set + * @comps: number of completions to set + * @s: DIM sample + */ +static inline void +dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps, + struct dim_sample *s) +{ + dim_update_sample(event_ctr, packets, bytes, s); + s->comp_ctr = comps; +} + +/* Net DIM */ + +/** + * net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile + * @cq_period_mode: CQ period mode + * @ix: Profile index + */ +struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix); + +/** + * net_dim_get_def_rx_moderation - provide the default RX moderation + * @cq_period_mode: CQ period mode + */ +struct dim_cq_moder net_dim_get_def_rx_moderation(u8 cq_period_mode); + +/** + * net_dim_get_tx_moderation - provide a CQ moderation object for the given TX profile + * @cq_period_mode: CQ period mode + * @ix: Profile index + */ +struct dim_cq_moder net_dim_get_tx_moderation(u8 cq_period_mode, int ix); + +/** + * net_dim_get_def_tx_moderation - provide the default TX moderation + * @cq_period_mode: CQ period mode + */ +struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); + +/** + * net_dim - main DIM algorithm entry point + * @dim: DIM instance information + * @end_sample: Current data measurement + * + * Called by the consumer. + * This is the main logic of the algorithm, where data is processed in order to decide on next + * required action. + */ +void net_dim(struct dim *dim, struct dim_sample end_sample); + +/* RDMA DIM */ + +/* + * RDMA DIM profile: + * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES. + */ +#define RDMA_DIM_PARAMS_NUM_PROFILES 9 +#define RDMA_DIM_START_PROFILE 0 + +/** + * rdma_dim - Runs the adaptive moderation. + * @dim: The moderation struct. + * @completions: The number of completions collected in this round. + * + * Each call to rdma_dim takes the latest amount of completions that + * have been collected and counts them as a new event. + * Once enough events have been collected the algorithm decides a new + * moderation level. + */ +void rdma_dim(struct dim *dim, u64 completions); + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ + +#endif /* DIM_H */ diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h new file mode 100644 index 0000000000000..f34b44a6fa230 --- /dev/null +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -0,0 +1,1363 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ +#ifndef _ENA_ADMIN_H_ +#define _ENA_ADMIN_H_ + +#define ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN 32 +#define ENA_ADMIN_EXTRA_PROPERTIES_COUNT 32 + +#define ENA_ADMIN_RSS_KEY_PARTS 10 + +#define ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK 0x3F +#define ENA_ADMIN_CUSTOMER_METRICS_MIN_SUPPORT_MASK 0x1F + + /* customer metrics - in correlation with + * ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK + */ +enum ena_admin_customer_metrics_id { + ENA_ADMIN_BW_IN_ALLOWANCE_EXCEEDED = 0, + ENA_ADMIN_BW_OUT_ALLOWANCE_EXCEEDED = 1, + ENA_ADMIN_PPS_ALLOWANCE_EXCEEDED = 2, + ENA_ADMIN_CONNTRACK_ALLOWANCE_EXCEEDED = 3, + ENA_ADMIN_LINKLOCAL_ALLOWANCE_EXCEEDED = 4, + ENA_ADMIN_CONNTRACK_ALLOWANCE_AVAILABLE = 5, +}; + +enum ena_admin_aq_opcode { + ENA_ADMIN_CREATE_SQ = 1, + ENA_ADMIN_DESTROY_SQ = 2, + ENA_ADMIN_CREATE_CQ = 3, + ENA_ADMIN_DESTROY_CQ = 4, + ENA_ADMIN_GET_FEATURE = 8, + ENA_ADMIN_SET_FEATURE = 9, + ENA_ADMIN_GET_STATS = 11, +}; + +enum ena_admin_aq_completion_status { + ENA_ADMIN_SUCCESS = 0, + ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, + ENA_ADMIN_BAD_OPCODE = 2, + ENA_ADMIN_UNSUPPORTED_OPCODE = 3, + ENA_ADMIN_MALFORMED_REQUEST = 4, + /* Additional status is provided in ACQ entry extended_status */ + ENA_ADMIN_ILLEGAL_PARAMETER = 5, + ENA_ADMIN_UNKNOWN_ERROR = 6, + ENA_ADMIN_RESOURCE_BUSY = 7, +}; + +/* subcommands for the set/get feature admin commands */ +enum ena_admin_aq_feature_id { + ENA_ADMIN_DEVICE_ATTRIBUTES = 1, + ENA_ADMIN_MAX_QUEUES_NUM = 2, + ENA_ADMIN_HW_HINTS = 3, + ENA_ADMIN_LLQ = 4, + ENA_ADMIN_EXTRA_PROPERTIES_STRINGS = 5, + ENA_ADMIN_EXTRA_PROPERTIES_FLAGS = 6, + ENA_ADMIN_MAX_QUEUES_EXT = 7, + ENA_ADMIN_RSS_HASH_FUNCTION = 10, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG = 11, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG = 12, + ENA_ADMIN_MTU = 14, + ENA_ADMIN_RSS_HASH_INPUT = 18, + ENA_ADMIN_INTERRUPT_MODERATION = 20, + ENA_ADMIN_AENQ_CONFIG = 26, + ENA_ADMIN_LINK_CONFIG = 27, + ENA_ADMIN_HOST_ATTR_CONFIG = 28, + ENA_ADMIN_PHC_CONFIG = 29, + ENA_ADMIN_FEATURES_OPCODE_NUM = 32, +}; + +/* device capabilities */ +enum ena_admin_aq_caps_id { + ENA_ADMIN_ENI_STATS = 0, + /* ENA SRD customer metrics */ + ENA_ADMIN_ENA_SRD_INFO = 1, + ENA_ADMIN_CUSTOMER_METRICS = 2, +}; + +enum ena_admin_placement_policy_type { + /* descriptors and headers are in host memory */ + ENA_ADMIN_PLACEMENT_POLICY_HOST = 1, + /* descriptors and headers are in device memory (a.k.a Low Latency + * Queue) + */ + ENA_ADMIN_PLACEMENT_POLICY_DEV = 3, +}; + +enum ena_admin_link_types { + ENA_ADMIN_LINK_SPEED_1G = 0x1, + ENA_ADMIN_LINK_SPEED_2_HALF_G = 0x2, + ENA_ADMIN_LINK_SPEED_5G = 0x4, + ENA_ADMIN_LINK_SPEED_10G = 0x8, + ENA_ADMIN_LINK_SPEED_25G = 0x10, + ENA_ADMIN_LINK_SPEED_40G = 0x20, + ENA_ADMIN_LINK_SPEED_50G = 0x40, + ENA_ADMIN_LINK_SPEED_100G = 0x80, + ENA_ADMIN_LINK_SPEED_200G = 0x100, + ENA_ADMIN_LINK_SPEED_400G = 0x200, +}; + +enum ena_admin_completion_policy_type { + /* completion queue entry for each sq descriptor */ + ENA_ADMIN_COMPLETION_POLICY_DESC = 0, + /* completion queue entry upon request in sq descriptor */ + ENA_ADMIN_COMPLETION_POLICY_DESC_ON_DEMAND = 1, + /* current queue head pointer is updated in OS memory upon sq + * descriptor request + */ + ENA_ADMIN_COMPLETION_POLICY_HEAD_ON_DEMAND = 2, + /* current queue head pointer is updated in OS memory for each sq + * descriptor + */ + ENA_ADMIN_COMPLETION_POLICY_HEAD = 3, +}; + +/* basic stats return ena_admin_basic_stats while extanded stats return a + * buffer (string format) with additional statistics per queue and per + * device id + */ +enum 
ena_admin_get_stats_type { + ENA_ADMIN_GET_STATS_TYPE_BASIC = 0, + ENA_ADMIN_GET_STATS_TYPE_EXTENDED = 1, + /* extra HW stats for specific network interface */ + ENA_ADMIN_GET_STATS_TYPE_ENI = 2, + /* extra HW stats for ENA SRD */ + ENA_ADMIN_GET_STATS_TYPE_ENA_SRD = 3, + ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS = 4, + +}; + +enum ena_admin_get_stats_scope { + ENA_ADMIN_SPECIFIC_QUEUE = 0, + ENA_ADMIN_ETH_TRAFFIC = 1, +}; + +enum ena_admin_get_phc_type { + ENA_ADMIN_PHC_TYPE_READLESS = 0, +}; + +/* ENA SRD configuration for ENI */ +enum ena_admin_ena_srd_flags { + /* Feature enabled */ + ENA_ADMIN_ENA_SRD_ENABLED = BIT(0), + /* UDP support enabled */ + ENA_ADMIN_ENA_SRD_UDP_ENABLED = BIT(1), + /* Bypass Rx UDP ordering */ + ENA_ADMIN_ENA_SRD_UDP_ORDERING_BYPASS_ENABLED = BIT(2), +}; + +struct ena_admin_aq_common_desc { + /* 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command_id; + + /* as appears in ena_admin_aq_opcode */ + u8 opcode; + + /* 0 : phase + * 1 : ctrl_data - control buffer address valid + * 2 : ctrl_data_indirect - control buffer address + * points to list of pages with addresses of control + * buffers + * 7:3 : reserved3 + */ + u8 flags; +}; + +/* used in ena_admin_aq_entry. Can point directly to control data, or to a + * page list chunk. Used also at the end of indirect mode page list chunks, + * for chaining. + */ +struct ena_admin_ctrl_buff_info { + u32 length; + + struct ena_common_mem_addr address; +}; + +struct ena_admin_sq { + u16 sq_idx; + + /* 4:0 : reserved + * 7:5 : sq_direction - 0x1 - Tx; 0x2 - Rx + */ + u8 sq_identity; + + u8 reserved1; +}; + +struct ena_admin_aq_entry { + struct ena_admin_aq_common_desc aq_common_descriptor; + + union { + u32 inline_data_w1[3]; + + struct ena_admin_ctrl_buff_info control_buffer; + } u; + + u32 inline_data_w4[12]; +}; + +struct ena_admin_acq_common_desc { + /* command identifier to associate it with the aq descriptor + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command; + + u8 status; + + /* 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 extended_status; + + /* indicates to the driver which AQ entry has been consumed by the + * device and could be reused + */ + u16 sq_head_indx; +}; + +struct ena_admin_acq_entry { + struct ena_admin_acq_common_desc acq_common_descriptor; + + u32 response_specific_data[14]; +}; + +struct ena_admin_aq_create_sq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + /* 4:0 : reserved0_w1 + * 7:5 : sq_direction - 0x1 - Tx, 0x2 - Rx + */ + u8 sq_identity; + + u8 reserved8_w1; + + /* 3:0 : placement_policy - Describing where the SQ + * descriptor ring and the SQ packet headers reside: + * 0x1 - descriptors and headers are in OS memory, + * 0x3 - descriptors and headers in device memory + * (a.k.a Low Latency Queue) + * 6:4 : completion_policy - Describing what policy + * to use for generation completion entry (cqe) in + * the CQ associated with this SQ: 0x0 - cqe for each + * sq descriptor, 0x1 - cqe upon request in sq + * descriptor, 0x2 - current queue head pointer is + * updated in OS memory upon sq descriptor request + * 0x3 - current queue head pointer is updated in OS + * memory for each sq descriptor + * 7 : reserved15_w1 + */ + u8 sq_caps_2; + + /* 0 : is_physically_contiguous - Described if the + * queue ring memory is allocated in physical + * contiguous pages or split. + * 7:1 : reserved17_w1 + */ + u8 sq_caps_3; + + /* associated completion queue id. 
This CQ must be created prior to SQ + * creation + */ + u16 cq_idx; + + /* submission queue depth in entries */ + u16 sq_depth; + + /* SQ physical base address in OS memory. This field should not be + * used for Low Latency queues. Has to be page aligned. + */ + struct ena_common_mem_addr sq_ba; + + /* specifies queue head writeback location in OS memory. Valid if + * completion_policy is set to completion_policy_head_on_demand or + * completion_policy_head. Has to be cache aligned + */ + struct ena_common_mem_addr sq_head_writeback; + + u32 reserved0_w7; + + u32 reserved0_w8; +}; + +enum ena_admin_sq_direction { + ENA_ADMIN_SQ_DIRECTION_TX = 1, + ENA_ADMIN_SQ_DIRECTION_RX = 2, +}; + +struct ena_admin_acq_create_sq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; + + u16 sq_idx; + + u16 reserved; + + /* queue doorbell address as an offset to PCIe MMIO REG BAR */ + u32 sq_doorbell_offset; + + /* low latency queue ring base address as an offset to PCIe MMIO + * LLQ_MEM BAR + */ + u32 llq_descriptors_offset; + + /* low latency queue headers' memory as an offset to PCIe MMIO + * LLQ_MEM BAR + */ + u32 llq_headers_offset; +}; + +struct ena_admin_aq_destroy_sq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_sq sq; +}; + +struct ena_admin_acq_destroy_sq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; +}; + +struct ena_admin_aq_create_cq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + /* 4:0 : reserved5 + * 5 : interrupt_mode_enabled - if set, cq operates + * in interrupt mode, otherwise - polling + * 7:6 : reserved6 + */ + u8 cq_caps_1; + + /* 4:0 : cq_entry_size_words - size of CQ entry in + * 32-bit words, valid values: 4, 8. + * 7:5 : reserved7 + */ + u8 cq_caps_2; + + /* completion queue depth in # of entries. must be power of 2 */ + u16 cq_depth; + + /* msix vector assigned to this cq */ + u32 msix_vector; + + /* cq physical base address in OS memory. CQ must be physically + * contiguous + */ + struct ena_common_mem_addr cq_ba; +}; + +struct ena_admin_acq_create_cq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; + + u16 cq_idx; + + /* actual cq depth in number of entries */ + u16 cq_actual_depth; + + u32 numa_node_register_offset; + + u32 cq_head_db_register_offset; + + u32 cq_interrupt_unmask_register_offset; +}; + +struct ena_admin_aq_destroy_cq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + u16 cq_idx; + + u16 reserved1; +}; + +struct ena_admin_acq_destroy_cq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; +}; + +/* ENA AQ Get Statistics command. Extended statistics are placed in control + * buffer pointed by AQ entry + */ +struct ena_admin_aq_get_stats_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + union { + /* command specific inline data */ + u32 inline_data_w1[3]; + + struct ena_admin_ctrl_buff_info control_buffer; + } u; + + /* stats type as defined in enum ena_admin_get_stats_type */ + u8 type; + + /* stats scope defined in enum ena_admin_get_stats_scope */ + u8 scope; + + u16 reserved3; + + /* queue id. used when scope is specific_queue */ + u16 queue_idx; + + /* device id, value 0xFFFF means mine. only privileged device can get + * stats of other device + */ + u16 device_id; + + /* a bitmap representing the requested metric values */ + u64 requested_metrics; +}; + +/* Basic Statistics Command. 
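The get-stats descriptor above packs several selectors (type, scope, device_id) plus a 64-bit metrics bitmap, so a hedged sketch of the encoding may help. The helper name below is hypothetical; actual submission goes through the admin-queue helpers in ena_com.c.

/* Illustrative encoding of a customer-metrics query; not part of the patch. */
static void fill_customer_metrics_cmd(struct ena_admin_aq_get_stats_cmd *cmd)
{
	memset(cmd, 0, sizeof(*cmd));

	cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS;
	cmd->type = ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS;
	cmd->scope = ENA_ADMIN_ETH_TRAFFIC;
	cmd->device_id = 0xFFFF;	/* 0xFFFF means "my own device" */

	/* Bitmap of enum ena_admin_customer_metrics_id values to retrieve. */
	cmd->requested_metrics = BIT(ENA_ADMIN_BW_IN_ALLOWANCE_EXCEEDED) |
				 BIT(ENA_ADMIN_PPS_ALLOWANCE_EXCEEDED);
}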
*/ +struct ena_admin_basic_stats { + u32 tx_bytes_low; + + u32 tx_bytes_high; + + u32 tx_pkts_low; + + u32 tx_pkts_high; + + u32 rx_bytes_low; + + u32 rx_bytes_high; + + u32 rx_pkts_low; + + u32 rx_pkts_high; + + u32 rx_drops_low; + + u32 rx_drops_high; + + u32 tx_drops_low; + + u32 tx_drops_high; + + u32 rx_overruns_low; + + u32 rx_overruns_high; +}; + +/* ENI Statistics Command. */ +struct ena_admin_eni_stats { + /* The number of packets shaped due to inbound aggregate BW + * allowance being exceeded + */ + u64 bw_in_allowance_exceeded; + + /* The number of packets shaped due to outbound aggregate BW + * allowance being exceeded + */ + u64 bw_out_allowance_exceeded; + + /* The number of packets shaped due to PPS allowance being exceeded */ + u64 pps_allowance_exceeded; + + /* The number of packets shaped due to connection tracking + * allowance being exceeded and leading to failure in establishment + * of new connections + */ + u64 conntrack_allowance_exceeded; + + /* The number of packets shaped due to linklocal packet rate + * allowance being exceeded + */ + u64 linklocal_allowance_exceeded; +}; + +struct ena_admin_ena_srd_stats { + /* Number of packets transmitted over ENA SRD */ + u64 ena_srd_tx_pkts; + + /* Number of packets transmitted or could have been + * transmitted over ENA SRD + */ + u64 ena_srd_eligible_tx_pkts; + + /* Number of packets received over ENA SRD */ + u64 ena_srd_rx_pkts; + + /* Percentage of the ENA SRD resources that is in use */ + u64 ena_srd_resource_utilization; +}; + +/* ENA SRD Statistics Command */ +struct ena_admin_ena_srd_info { + /* ENA SRD configuration bitmap. See ena_admin_ena_srd_flags for + * details + */ + u64 flags; + + struct ena_admin_ena_srd_stats ena_srd_stats; +}; + +/* Customer Metrics Command. */ +struct ena_admin_customer_metrics { + /* A bitmap representing the reported customer metrics according to + * the order they are reported + */ + u64 reported_metrics; +}; + +struct ena_admin_acq_get_stats_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u64 raw[7]; + + struct ena_admin_basic_stats basic_stats; + + struct ena_admin_eni_stats eni_stats; + + struct ena_admin_ena_srd_info ena_srd_info; + + struct ena_admin_customer_metrics customer_metrics; + } u; +}; + +struct ena_admin_get_set_feature_common_desc { + /* 1:0 : select - 0x1 - current value; 0x3 - default + * value + * 7:3 : reserved3 + */ + u8 flags; + + /* as appears in ena_admin_aq_feature_id */ + u8 feature_id; + + /* The driver specifies the max feature version it supports and the + * device responds with the currently supported feature version. The + * field is zero based + */ + u8 feature_version; + + u8 reserved8; +}; + +struct ena_admin_device_attr_feature_desc { + u32 impl_id; + + u32 device_version; + + /* bitmap of ena_admin_aq_feature_id, which represents supported + * subcommands for the set/get feature admin commands. + */ + u32 supported_features; + + /* bitmap of ena_admin_aq_caps_id, which represents device + * capabilities. + */ + u32 capabilities; + + /* Indicates how many bits are used physical address access. */ + u32 phys_addr_width; + + /* Indicates how many bits are used virtual address access. 
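The basic statistics above are carried as {low, high} 32-bit halves. A small sketch of how a reader of these structures recombines them; the helper names are illustrative and not part of the patch.

/* Recombine a split counter from ena_admin_basic_stats into a 64-bit value. */
static u64 ena_stat_to_u64(u32 low, u32 high)
{
	return ((u64)high << 32) | low;
}

/* Example: total transmitted bytes from a basic-stats response. */
static u64 basic_stats_tx_bytes(const struct ena_admin_basic_stats *stats)
{
	return ena_stat_to_u64(stats->tx_bytes_low, stats->tx_bytes_high);
}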
*/ + u32 virt_addr_width; + + /* unicast MAC address (in Network byte order) */ + u8 mac_addr[6]; + + u8 reserved7[2]; + + u32 max_mtu; +}; + +enum ena_admin_llq_header_location { + /* header is in descriptor list */ + ENA_ADMIN_INLINE_HEADER = 1, + /* header in a separate ring, implies 16B descriptor list entry */ + ENA_ADMIN_HEADER_RING = 2, +}; + +enum ena_admin_llq_ring_entry_size { + ENA_ADMIN_LIST_ENTRY_SIZE_128B = 1, + ENA_ADMIN_LIST_ENTRY_SIZE_192B = 2, + ENA_ADMIN_LIST_ENTRY_SIZE_256B = 4, +}; + +enum ena_admin_llq_num_descs_before_header { + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_0 = 0, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1 = 1, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2 = 2, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4 = 4, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8 = 8, +}; + +/* packet descriptor list entry always starts with one or more descriptors, + * followed by a header. The rest of the descriptors are located in the + * beginning of the subsequent entry. Stride refers to how the rest of the + * descriptors are placed. This field is relevant only for inline header + * mode + */ +enum ena_admin_llq_stride_ctrl { + ENA_ADMIN_SINGLE_DESC_PER_ENTRY = 1, + ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY = 2, +}; + +enum ena_admin_accel_mode_feat { + ENA_ADMIN_DISABLE_META_CACHING = 0, + ENA_ADMIN_LIMIT_TX_BURST = 1, +}; + +struct ena_admin_accel_mode_get { + /* bit field of enum ena_admin_accel_mode_feat */ + u16 supported_flags; + + /* maximum burst size between two doorbells. The size is in bytes */ + u16 max_tx_burst_size; +}; + +struct ena_admin_accel_mode_set { + /* bit field of enum ena_admin_accel_mode_feat */ + u16 enabled_flags; + + u16 reserved; +}; + +struct ena_admin_accel_mode_req { + union { + u32 raw[2]; + + struct ena_admin_accel_mode_get get; + + struct ena_admin_accel_mode_set set; + } u; +}; + +struct ena_admin_feature_llq_desc { + u32 max_llq_num; + + u32 max_llq_depth; + + /* specify the header locations the device supports. bitfield of enum + * ena_admin_llq_header_location. + */ + u16 header_location_ctrl_supported; + + /* the header location the driver selected to use. */ + u16 header_location_ctrl_enabled; + + /* if inline header is specified - this is the size of descriptor list + * entry. If header in a separate ring is specified - this is the size + * of header ring entry. bitfield of enum ena_admin_llq_ring_entry_size. + * specify the entry sizes the device supports + */ + u16 entry_size_ctrl_supported; + + /* the entry size the driver selected to use. */ + u16 entry_size_ctrl_enabled; + + /* valid only if inline header is specified. First entry associated with + * the packet includes descriptors and header. Rest of the entries + * occupied by descriptors. This parameter defines the max number of + * descriptors precedding the header in the first entry. The field is + * bitfield of enum ena_admin_llq_num_descs_before_header and specify + * the values the device supports + */ + u16 desc_num_before_header_supported; + + /* the desire field the driver selected to use */ + u16 desc_num_before_header_enabled; + + /* valid only if inline was chosen. bitfield of enum + * ena_admin_llq_stride_ctrl + */ + u16 descriptors_stride_ctrl_supported; + + /* the stride control the driver selected to use */ + u16 descriptors_stride_ctrl_enabled; + + /* reserved */ + u32 reserved1; + + /* accelerated low latency queues requirement. 
driver needs to + * support those requirements in order to use accelerated llq + */ + struct ena_admin_accel_mode_req accel_mode; +}; + +struct ena_admin_queue_ext_feature_fields { + u32 max_tx_sq_num; + + u32 max_tx_cq_num; + + u32 max_rx_sq_num; + + u32 max_rx_cq_num; + + u32 max_tx_sq_depth; + + u32 max_tx_cq_depth; + + u32 max_rx_sq_depth; + + u32 max_rx_cq_depth; + + u32 max_tx_header_size; + + /* Maximum Descriptors number, including meta descriptor, allowed for a + * single Tx packet + */ + u16 max_per_packet_tx_descs; + + /* Maximum Descriptors number allowed for a single Rx packet */ + u16 max_per_packet_rx_descs; +}; + +struct ena_admin_queue_feature_desc { + u32 max_sq_num; + + u32 max_sq_depth; + + u32 max_cq_num; + + u32 max_cq_depth; + + u32 max_legacy_llq_num; + + u32 max_legacy_llq_depth; + + u32 max_header_size; + + /* Maximum Descriptors number, including meta descriptor, allowed for a + * single Tx packet + */ + u16 max_packet_tx_descs; + + /* Maximum Descriptors number allowed for a single Rx packet */ + u16 max_packet_rx_descs; +}; + +struct ena_admin_set_feature_mtu_desc { + /* exclude L2 */ + u32 mtu; +}; + +struct ena_admin_get_extra_properties_strings_desc { + u32 count; +}; + +struct ena_admin_get_extra_properties_flags_desc { + u32 flags; +}; + +struct ena_admin_set_feature_host_attr_desc { + /* host OS info base address in OS memory. host info is 4KB of + * physically contiguous + */ + struct ena_common_mem_addr os_info_ba; + + /* host debug area base address in OS memory. debug area must be + * physically contiguous + */ + struct ena_common_mem_addr debug_ba; + + /* debug area size */ + u32 debug_area_size; +}; + +struct ena_admin_feature_intr_moder_desc { + /* interrupt delay granularity in usec */ + u16 intr_delay_resolution; + + u16 reserved; +}; + +struct ena_admin_get_feature_link_desc { + /* Link speed in Mb */ + u32 speed; + + /* bit field of enum ena_admin_link types */ + u32 supported; + + /* 0 : autoneg + * 1 : duplex - Full Duplex + * 31:2 : reserved2 + */ + u32 flags; +}; + +struct ena_admin_feature_aenq_desc { + /* bitmask for AENQ groups the device can report */ + u32 supported_groups; + + /* bitmask for AENQ groups to report */ + u32 enabled_groups; +}; + +struct ena_admin_feature_offload_desc { + /* 0 : TX_L3_csum_ipv4 + * 1 : TX_L4_ipv4_csum_part - The checksum field + * should be initialized with pseudo header checksum + * 2 : TX_L4_ipv4_csum_full + * 3 : TX_L4_ipv6_csum_part - The checksum field + * should be initialized with pseudo header checksum + * 4 : TX_L4_ipv6_csum_full + * 5 : tso_ipv4 + * 6 : tso_ipv6 + * 7 : tso_ecn + */ + u32 tx; + + /* Receive side supported stateless offload + * 0 : RX_L3_csum_ipv4 - IPv4 checksum + * 1 : RX_L4_ipv4_csum - TCP/UDP/IPv4 checksum + * 2 : RX_L4_ipv6_csum - TCP/UDP/IPv6 checksum + * 3 : RX_hash - Hash calculation + */ + u32 rx_supported; + + u32 rx_enabled; +}; + +enum ena_admin_hash_functions { + ENA_ADMIN_TOEPLITZ = 1, + ENA_ADMIN_CRC32 = 2, +}; + +struct ena_admin_feature_rss_flow_hash_control { + u32 key_parts; + + u32 reserved; + + u32 key[ENA_ADMIN_RSS_KEY_PARTS]; +}; + +struct ena_admin_feature_rss_flow_hash_function { + /* 7:0 : funcs - bitmask of ena_admin_hash_functions */ + u32 supported_func; + + /* 7:0 : selected_func - bitmask of + * ena_admin_hash_functions + */ + u32 selected_func; + + /* initial value */ + u32 init_val; +}; + +/* RSS flow hash protocols */ +enum ena_admin_flow_hash_proto { + ENA_ADMIN_RSS_TCP4 = 0, + ENA_ADMIN_RSS_UDP4 = 1, + ENA_ADMIN_RSS_TCP6 = 2, + 
ENA_ADMIN_RSS_UDP6 = 3, + ENA_ADMIN_RSS_IP4 = 4, + ENA_ADMIN_RSS_IP6 = 5, + ENA_ADMIN_RSS_IP4_FRAG = 6, + ENA_ADMIN_RSS_NOT_IP = 7, + /* TCPv6 with extension header */ + ENA_ADMIN_RSS_TCP6_EX = 8, + /* IPv6 with extension header */ + ENA_ADMIN_RSS_IP6_EX = 9, + ENA_ADMIN_RSS_PROTO_NUM = 16, +}; + +/* RSS flow hash fields */ +enum ena_admin_flow_hash_fields { + /* Ethernet Dest Addr */ + ENA_ADMIN_RSS_L2_DA = BIT(0), + /* Ethernet Src Addr */ + ENA_ADMIN_RSS_L2_SA = BIT(1), + /* ipv4/6 Dest Addr */ + ENA_ADMIN_RSS_L3_DA = BIT(2), + /* ipv4/6 Src Addr */ + ENA_ADMIN_RSS_L3_SA = BIT(3), + /* tcp/udp Dest Port */ + ENA_ADMIN_RSS_L4_DP = BIT(4), + /* tcp/udp Src Port */ + ENA_ADMIN_RSS_L4_SP = BIT(5), +}; + +struct ena_admin_proto_input { + /* flow hash fields (bitwise according to ena_admin_flow_hash_fields) */ + u16 fields; + + u16 reserved2; +}; + +struct ena_admin_feature_rss_hash_control { + struct ena_admin_proto_input supported_fields[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input selected_fields[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input reserved2[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input reserved3[ENA_ADMIN_RSS_PROTO_NUM]; +}; + +struct ena_admin_feature_rss_flow_hash_input { + /* supported hash input sorting + * 1 : L3_sort - support swap L3 addresses if DA is + * smaller than SA + * 2 : L4_sort - support swap L4 ports if DP smaller + * SP + */ + u16 supported_input_sort; + + /* enabled hash input sorting + * 1 : enable_L3_sort - enable swap L3 addresses if + * DA smaller than SA + * 2 : enable_L4_sort - enable swap L4 ports if DP + * smaller than SP + */ + u16 enabled_input_sort; +}; + +enum ena_admin_os_type { + ENA_ADMIN_OS_LINUX = 1, + ENA_ADMIN_OS_WIN = 2, + ENA_ADMIN_OS_DPDK = 3, + ENA_ADMIN_OS_FREEBSD = 4, + ENA_ADMIN_OS_IPXE = 5, + ENA_ADMIN_OS_ESXI = 6, + ENA_ADMIN_OS_MACOS = 7, + ENA_ADMIN_OS_GROUPS_NUM = 7, +}; + +struct ena_admin_host_info { + /* defined in enum ena_admin_os_type */ + u32 os_type; + + /* os distribution string format */ + u8 os_dist_str[128]; + + /* OS distribution numeric format */ + u32 os_dist; + + /* kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* 7:0 : major + * 15:8 : minor + * 23:16 : sub_minor + * 31:24 : module_type + */ + u32 driver_version; + + /* features bitmap */ + u32 supported_network_features[2]; + + /* ENA spec version of driver */ + u16 ena_spec_version; + + /* ENA device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* Number of CPUs */ + u16 num_cpus; + + u16 reserved; + + /* 0 : reserved + * 1 : rx_offset + * 2 : interrupt_moderation + * 3 : rx_buf_mirroring + * 4 : rss_configurable_function_key + * 5 : reserved + * 6 : rx_page_reuse + * 31:7 : reserved + */ + u32 driver_supported_features; +}; + +struct ena_admin_rss_ind_table_entry { + u16 cq_idx; + + u16 reserved; +}; + +struct ena_admin_feature_rss_ind_table { + /* min supported table size (2^min_size) */ + u16 min_size; + + /* max supported table size (2^max_size) */ + u16 max_size; + + /* table size (2^size) */ + u16 size; + + /* 0 : one_entry_update - The ENA device supports + * setting a single RSS table entry + */ + u8 flags; + + u8 reserved; + + /* index of the inline entry. 0xFFFFFFFF means invalid */ + u32 inline_index; + + /* used for updating single entry, ignored when setting the entire + * table through the control buffer. 
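The indirection-table sizes above are exponents (the table holds 2^size entries). A hedged sketch of the lookup this implies, once the table has been read back into a local array; 'tbl' and the helper name are hypothetical and not part of the patch.

/* Map a receive hash to a CQ via an RSS indirection table of 2^size entries. */
static u16 rss_hash_to_cq(const struct ena_admin_rss_ind_table_entry *tbl,
			  const struct ena_admin_feature_rss_ind_table *ind,
			  u32 rx_hash)
{
	u32 nr_entries = 1U << ind->size;	/* table size is a power of two */

	return tbl[rx_hash & (nr_entries - 1)].cq_idx;
}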
+ */ + struct ena_admin_rss_ind_table_entry inline_entry; +}; + +/* When hint value is 0, driver should use it's own predefined value */ +struct ena_admin_ena_hw_hints { + /* value in ms */ + u16 mmio_read_timeout; + + /* value in ms */ + u16 driver_watchdog_timeout; + + /* Per packet tx completion timeout. value in ms */ + u16 missing_tx_completion_timeout; + + u16 missed_tx_completion_count_threshold_to_reset; + + /* value in ms */ + u16 admin_completion_tx_timeout; + + u16 netdev_wd_timeout; + + u16 max_tx_sgl_size; + + u16 max_rx_sgl_size; + + u16 reserved[8]; +}; + +struct ena_admin_get_feat_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_ctrl_buff_info control_buffer; + + struct ena_admin_get_set_feature_common_desc feat_common; + + u32 raw[11]; +}; + +struct ena_admin_queue_ext_feature_desc { + /* version */ + u8 version; + + u8 reserved1[3]; + + union { + struct ena_admin_queue_ext_feature_fields max_queue_ext; + + u32 raw[10]; + }; +}; + +struct ena_admin_feature_phc_desc { + /* PHC type as defined in enum ena_admin_get_phc_type, + * used only for GET command. + */ + u8 type; + + /* Reserved - MBZ */ + u8 reserved1[3]; + + /* PHC doorbell address as an offset to PCIe MMIO REG BAR, + * used only for GET command. + */ + u32 doorbell_offset; + + /* Max time for valid PHC retrieval, passing this threshold will + * fail the get-time request and block PHC requests for + * block_timeout_usec, used only for GET command. + */ + u32 expire_timeout_usec; + + /* PHC requests block period, blocking starts if PHC request expired + * in order to prevent floods on busy device, + * used only for GET command. + */ + u32 block_timeout_usec; + + /* Shared PHC physical address (ena_admin_phc_resp), + * used only for SET command. + */ + struct ena_common_mem_addr output_address; + + /* Shared PHC Size (ena_admin_phc_resp), + * used only for SET command. 
+ */ + u32 output_length; +}; + +struct ena_admin_get_feat_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + + struct ena_admin_device_attr_feature_desc dev_attr; + + struct ena_admin_feature_llq_desc llq; + + struct ena_admin_queue_feature_desc max_queue; + + struct ena_admin_queue_ext_feature_desc max_queue_ext; + + struct ena_admin_feature_aenq_desc aenq; + + struct ena_admin_get_feature_link_desc link; + + struct ena_admin_feature_offload_desc offload; + + struct ena_admin_feature_rss_flow_hash_function flow_hash_func; + + struct ena_admin_feature_rss_flow_hash_input flow_hash_input; + + struct ena_admin_feature_rss_ind_table ind_table; + + struct ena_admin_feature_intr_moder_desc intr_moderation; + + struct ena_admin_ena_hw_hints hw_hints; + + struct ena_admin_feature_phc_desc phc; + + struct ena_admin_get_extra_properties_strings_desc extra_properties_strings; + + struct ena_admin_get_extra_properties_flags_desc extra_properties_flags; + } u; +}; + +struct ena_admin_set_feat_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_ctrl_buff_info control_buffer; + + struct ena_admin_get_set_feature_common_desc feat_common; + + union { + u32 raw[11]; + + /* mtu size */ + struct ena_admin_set_feature_mtu_desc mtu; + + /* host attributes */ + struct ena_admin_set_feature_host_attr_desc host_attr; + + /* AENQ configuration */ + struct ena_admin_feature_aenq_desc aenq; + + /* rss flow hash function */ + struct ena_admin_feature_rss_flow_hash_function flow_hash_func; + + /* rss flow hash input */ + struct ena_admin_feature_rss_flow_hash_input flow_hash_input; + + /* rss indirection table */ + struct ena_admin_feature_rss_ind_table ind_table; + + /* LLQ configuration */ + struct ena_admin_feature_llq_desc llq; + + /* PHC configuration */ + struct ena_admin_feature_phc_desc phc; + } u; +}; + +struct ena_admin_set_feat_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + } u; +}; + +struct ena_admin_aenq_common_desc { + u16 group; + + u16 syndrome; + + /* 0 : phase + * 7:1 : reserved - MBZ + */ + u8 flags; + + u8 reserved1[3]; + + u32 timestamp_low; + + u32 timestamp_high; +}; + +/* asynchronous event notification groups */ +enum ena_admin_aenq_group { + ENA_ADMIN_LINK_CHANGE = 0, + ENA_ADMIN_FATAL_ERROR = 1, + ENA_ADMIN_WARNING = 2, + ENA_ADMIN_NOTIFICATION = 3, + ENA_ADMIN_KEEP_ALIVE = 4, + ENA_ADMIN_REFRESH_CAPABILITIES = 5, + ENA_ADMIN_AENQ_GROUPS_NUM = 6, +}; + +enum ena_admin_aenq_notification_syndrome { + ENA_ADMIN_UPDATE_HINTS = 2, +}; + +struct ena_admin_aenq_entry { + struct ena_admin_aenq_common_desc aenq_common_desc; + + /* command specific inline data */ + u32 inline_data_w4[12]; +}; + +struct ena_admin_aenq_link_change_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + /* 0 : link_status */ + u32 flags; +}; + +struct ena_admin_aenq_keep_alive_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + u32 rx_drops_low; + + u32 rx_drops_high; + + u32 tx_drops_low; + + u32 tx_drops_high; + + u32 rx_overruns_low; + + u32 rx_overruns_high; +}; + +struct ena_admin_ena_mmio_req_read_less_resp { + u16 req_id; + + u16 reg_off; + + /* value is valid when poll is cleared */ + u32 reg_val; +}; + +struct ena_admin_phc_resp { + u16 req_id; + + u8 reserved1[6]; + + u64 timestamp; + + u8 reserved2[48]; +}; + +/* aq_common_desc */ +#define ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define 
ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT 1 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT 2 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) + +/* sq */ +#define ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_SQ_SQ_DIRECTION_MASK GENMASK(7, 5) + +/* acq_common_desc */ +#define ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aq_create_sq_cmd */ +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK GENMASK(7, 5) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK GENMASK(3, 0) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT 4 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK GENMASK(6, 4) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK BIT(0) + +/* aq_create_cq_cmd */ +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) + +/* get_set_feature_common_desc */ +#define ENA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK GENMASK(1, 0) + +/* get_feature_link_desc */ +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK BIT(0) +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_SHIFT 1 +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK BIT(1) + +/* feature_offload_desc */ +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L3_CSUM_IPV4_MASK BIT(0) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_SHIFT 1 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK BIT(1) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_SHIFT 2 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_MASK BIT(2) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK BIT(3) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_SHIFT 4 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_MASK BIT(4) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_SHIFT 5 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK BIT(5) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_SHIFT 6 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK BIT(6) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_SHIFT 7 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK BIT(7) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L3_CSUM_IPV4_MASK BIT(0) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_SHIFT 1 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK BIT(1) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_SHIFT 2 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK BIT(2) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_MASK BIT(3) + +/* feature_rss_flow_hash_function */ +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_FUNCS_MASK GENMASK(7, 0) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_SELECTED_FUNC_MASK GENMASK(7, 0) + +/* feature_rss_flow_hash_input */ +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_SHIFT 1 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK BIT(1) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_SHIFT 2 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK BIT(2) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_SHIFT 1 +#define 
ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_MASK BIT(1) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_SHIFT 2 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_MASK BIT(2) + +/* host_info */ +#define ENA_ADMIN_HOST_INFO_MAJOR_MASK GENMASK(7, 0) +#define ENA_ADMIN_HOST_INFO_MINOR_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_MINOR_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT 16 +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_MASK GENMASK(23, 16) +#define ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT 24 +#define ENA_ADMIN_HOST_INFO_MODULE_TYPE_MASK GENMASK(31, 24) +#define ENA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define ENA_ADMIN_HOST_INFO_DEVICE_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define ENA_ADMIN_HOST_INFO_BUS_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_SHIFT 1 +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK BIT(1) +#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2 +#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2) +#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK BIT(3) +#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_SHIFT 4 +#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4) +#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_SHIFT 6 +#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK BIT(6) + +/* feature_rss_ind_table */ +#define ENA_ADMIN_FEATURE_RSS_IND_TABLE_ONE_ENTRY_UPDATE_MASK BIT(0) + +/* aenq_common_desc */ +#define ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aenq_link_change_desc */ +#define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0) + +#endif /* _ENA_ADMIN_H_ */ diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c new file mode 100644 index 0000000000000..889d3412a72df --- /dev/null +++ b/drivers/amazon/net/ena/ena_com.c @@ -0,0 +1,3345 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
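The SHIFT/MASK pairs above are meant to be used together when composing the packed host_info words. A sketch for driver_version and bdf; the function names are illustrative only and not part of the patch (module_type bits of driver_version are left clear here).

/* Compose ena_admin_host_info::driver_version from its sub-fields. */
static u32 pack_driver_version(u8 major, u8 minor, u8 sub_minor)
{
	return (major & ENA_ADMIN_HOST_INFO_MAJOR_MASK) |
	       ((minor << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) &
		ENA_ADMIN_HOST_INFO_MINOR_MASK) |
	       ((sub_minor << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT) &
		ENA_ADMIN_HOST_INFO_SUB_MINOR_MASK);
}

/* Compose ena_admin_host_info::bdf from bus/device/function. */
static u16 pack_bdf(u8 bus, u8 device, u8 function)
{
	return (function & ENA_ADMIN_HOST_INFO_FUNCTION_MASK) |
	       ((device << ENA_ADMIN_HOST_INFO_DEVICE_SHIFT) &
		ENA_ADMIN_HOST_INFO_DEVICE_MASK) |
	       ((bus << ENA_ADMIN_HOST_INFO_BUS_SHIFT) &
		ENA_ADMIN_HOST_INFO_BUS_MASK);
}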
+ */ + +#include "ena_com.h" + +/*****************************************************************************/ +/*****************************************************************************/ + +/* Timeout in micro-sec */ +#define ADMIN_CMD_TIMEOUT_US (3000000) + +#define ENA_ASYNC_QUEUE_DEPTH 16 +#define ENA_ADMIN_QUEUE_DEPTH 32 + + +#define ENA_CTRL_MAJOR 0 +#define ENA_CTRL_MINOR 0 +#define ENA_CTRL_SUB_MINOR 1 + +#define MIN_ENA_CTRL_VER \ + (((ENA_CTRL_MAJOR) << \ + (ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \ + ((ENA_CTRL_MINOR) << \ + (ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \ + (ENA_CTRL_SUB_MINOR)) + +#define ENA_DMA_ADDR_TO_UINT32_LOW(x) ((u32)((u64)(x))) +#define ENA_DMA_ADDR_TO_UINT32_HIGH(x) ((u32)(((u64)(x)) >> 32)) + +#define ENA_MMIO_READ_TIMEOUT 0xFFFFFFFF + +#define ENA_COM_BOUNCE_BUFFER_CNTRL_CNT 4 + +#define ENA_REGS_ADMIN_INTR_MASK 1 + +#define ENA_MIN_ADMIN_POLL_US 100 + +#define ENA_MAX_ADMIN_POLL_US 5000 + +/* PHC definitions */ +#define ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC 20 +#define ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC 1000 +#define ENA_PHC_TIMESTAMP_ERROR 0xFFFFFFFFFFFFFFFF +#define ENA_PHC_REQ_ID_OFFSET 0xDEAD + +/*****************************************************************************/ +/*****************************************************************************/ +/*****************************************************************************/ + +enum ena_cmd_status { + ENA_CMD_SUBMITTED, + ENA_CMD_COMPLETED, + /* Abort - canceled by the driver */ + ENA_CMD_ABORTED, +}; + +struct ena_comp_ctx { + struct completion wait_event; + struct ena_admin_acq_entry *user_cqe; + u32 comp_size; + enum ena_cmd_status status; + /* status from the device */ + u8 comp_status; + u8 cmd_opcode; + bool occupied; +}; + +struct ena_com_stats_ctx { + struct ena_admin_aq_get_stats_cmd get_cmd; + struct ena_admin_acq_get_stats_resp get_resp; +}; + +static int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, + struct ena_common_mem_addr *ena_addr, + dma_addr_t addr) +{ + if ((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != addr) { + netdev_err(ena_dev->net_device, + "DMA address has more bits that the device supports\n"); + return -EINVAL; + } + + ena_addr->mem_addr_low = lower_32_bits(addr); + ena_addr->mem_addr_high = (u16)upper_32_bits(addr); + + return 0; +} + +static int ena_com_admin_init_sq(struct ena_com_admin_queue *admin_queue) +{ + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + struct ena_com_admin_sq *sq = &admin_queue->sq; + u16 size = ADMIN_SQ_SIZE(admin_queue->q_depth); + + sq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, + &sq->dma_addr, GFP_KERNEL); + + if (!sq->entries) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + sq->head = 0; + sq->tail = 0; + sq->phase = 1; + + sq->db_addr = NULL; + + return 0; +} + +static int ena_com_admin_init_cq(struct ena_com_admin_queue *admin_queue) +{ + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + struct ena_com_admin_cq *cq = &admin_queue->cq; + u16 size = ADMIN_CQ_SIZE(admin_queue->q_depth); + + cq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, + &cq->dma_addr, GFP_KERNEL); + + if (!cq->entries) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + cq->head = 0; + cq->phase = 1; + + return 0; +} + +static int ena_com_admin_init_aenq(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers) +{ + struct ena_com_aenq *aenq = &ena_dev->aenq; + u32 addr_low, addr_high, 
aenq_caps; + u16 size; + + ena_dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; + size = ADMIN_AENQ_SIZE(ENA_ASYNC_QUEUE_DEPTH); + aenq->entries = dma_zalloc_coherent(ena_dev->dmadev, size, + &aenq->dma_addr, GFP_KERNEL); + + if (!aenq->entries) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + aenq->head = aenq->q_depth; + aenq->phase = 1; + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_AENQ_BASE_HI_OFF); + + aenq_caps = 0; + aenq_caps |= ena_dev->aenq.q_depth & ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK; + aenq_caps |= (sizeof(struct ena_admin_aenq_entry) + << ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK; + writel(aenq_caps, ena_dev->reg_bar + ENA_REGS_AENQ_CAPS_OFF); + + if (unlikely(!aenq_handlers)) { + netdev_err(ena_dev->net_device, + "AENQ handlers pointer is NULL\n"); + return -EINVAL; + } + + aenq->aenq_handlers = aenq_handlers; + + return 0; +} + +static void comp_ctxt_release(struct ena_com_admin_queue *queue, + struct ena_comp_ctx *comp_ctx) +{ + comp_ctx->occupied = false; + atomic_dec(&queue->outstanding_cmds); +} + +static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *admin_queue, + u16 command_id, bool capture) +{ + if (unlikely(command_id >= admin_queue->q_depth)) { + netdev_err(admin_queue->ena_dev->net_device, + "Command id is larger than the queue size. cmd_id: %u queue size %d\n", + command_id, admin_queue->q_depth); + return NULL; + } + + if (unlikely(!admin_queue->comp_ctx)) { + netdev_err(admin_queue->ena_dev->net_device, + "Completion context is NULL\n"); + return NULL; + } + + if (unlikely(admin_queue->comp_ctx[command_id].occupied && capture)) { + netdev_err(admin_queue->ena_dev->net_device, + "Completion context is occupied\n"); + return NULL; + } + + if (capture) { + atomic_inc(&admin_queue->outstanding_cmds); + admin_queue->comp_ctx[command_id].occupied = true; + } + + return &admin_queue->comp_ctx[command_id]; +} + +static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct ena_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct ena_comp_ctx *comp_ctx; + u16 tail_masked, cmd_id; + u16 queue_size_mask; + u16 cnt; + + queue_size_mask = admin_queue->q_depth - 1; + + tail_masked = admin_queue->sq.tail & queue_size_mask; + + /* In case of queue FULL */ + cnt = (u16)atomic_read(&admin_queue->outstanding_cmds); + if (cnt >= admin_queue->q_depth) { + netdev_dbg(admin_queue->ena_dev->net_device, + "Admin queue is full.\n"); + admin_queue->stats.out_of_space++; + return ERR_PTR(-ENOSPC); + } + + cmd_id = admin_queue->curr_cmd_id; + + cmd->aq_common_descriptor.flags |= admin_queue->sq.phase & + ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK; + + cmd->aq_common_descriptor.command_id |= cmd_id & + ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + comp_ctx = get_comp_ctxt(admin_queue, cmd_id, true); + if (unlikely(!comp_ctx)) + return ERR_PTR(-EINVAL); + + comp_ctx->status = ENA_CMD_SUBMITTED; + comp_ctx->comp_size = (u32)comp_size_in_bytes; + comp_ctx->user_cqe = comp; + comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + + reinit_completion(&comp_ctx->wait_event); + + memcpy(&admin_queue->sq.entries[tail_masked], cmd, cmd_size_in_bytes); + + admin_queue->curr_cmd_id = (admin_queue->curr_cmd_id 
+ 1) & + queue_size_mask; + + admin_queue->sq.tail++; + admin_queue->stats.submitted_cmd++; + + if (unlikely((admin_queue->sq.tail & queue_size_mask) == 0)) + admin_queue->sq.phase = !admin_queue->sq.phase; + + writel(admin_queue->sq.tail, admin_queue->sq.db_addr); + + return comp_ctx; +} + +static int ena_com_init_comp_ctxt(struct ena_com_admin_queue *admin_queue) +{ + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + size_t size = admin_queue->q_depth * sizeof(struct ena_comp_ctx); + struct ena_comp_ctx *comp_ctx; + u16 i; + + admin_queue->comp_ctx = + devm_kzalloc(admin_queue->q_dmadev, size, GFP_KERNEL); + if (unlikely(!admin_queue->comp_ctx)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + for (i = 0; i < admin_queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(admin_queue, i, false); + if (comp_ctx) + init_completion(&comp_ctx->wait_event); + } + + return 0; +} + +static struct ena_comp_ctx *ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct ena_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + unsigned long flags = 0; + struct ena_comp_ctx *comp_ctx; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + if (unlikely(!admin_queue->running_state)) { + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + return ERR_PTR(-ENODEV); + } + comp_ctx = __ena_com_submit_admin_cmd(admin_queue, cmd, + cmd_size_in_bytes, + comp, + comp_size_in_bytes); + if (IS_ERR(comp_ctx)) + admin_queue->running_state = false; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + return comp_ctx; +} + +static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx, + struct ena_com_io_sq *io_sq) +{ + size_t size; + int dev_node = 0; + + memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); + + io_sq->dma_addr_bits = (u8)ena_dev->dma_addr_bits; + io_sq->desc_entry_size = + (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? 
+ sizeof(struct ena_eth_io_tx_desc) : + sizeof(struct ena_eth_io_rx_desc); + + size = io_sq->desc_entry_size * io_sq->q_depth; + io_sq->bus = ena_dev->bus; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + dev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_sq->desc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_sq->desc_addr.phys_addr, + GFP_KERNEL); + set_dev_node(ena_dev->dmadev, dev_node); + if (!io_sq->desc_addr.virt_addr) { + io_sq->desc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_sq->desc_addr.phys_addr, + GFP_KERNEL); + } + + if (!io_sq->desc_addr.virt_addr) { + netdev_err(ena_dev->net_device, + "Memory allocation failed\n"); + return -ENOMEM; + } + } + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Allocate bounce buffers */ + io_sq->bounce_buf_ctrl.buffer_size = + ena_dev->llq_info.desc_list_entry_size; + io_sq->bounce_buf_ctrl.buffers_num = + ENA_COM_BOUNCE_BUFFER_CNTRL_CNT; + io_sq->bounce_buf_ctrl.next_to_use = 0; + + size = (size_t)io_sq->bounce_buf_ctrl.buffer_size * + io_sq->bounce_buf_ctrl.buffers_num; + + dev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_sq->bounce_buf_ctrl.base_buffer = + devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + set_dev_node(ena_dev->dmadev, dev_node); + if (!io_sq->bounce_buf_ctrl.base_buffer) + io_sq->bounce_buf_ctrl.base_buffer = + devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + + if (!io_sq->bounce_buf_ctrl.base_buffer) { + netdev_err(ena_dev->net_device, + "Bounce buffer memory allocation failed\n"); + return -ENOMEM; + } + + memcpy(&io_sq->llq_info, &ena_dev->llq_info, + sizeof(io_sq->llq_info)); + + /* Initiate the first bounce buffer */ + io_sq->llq_buf_ctrl.curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, io_sq->llq_info.desc_list_entry_size); + io_sq->llq_buf_ctrl.descs_left_in_line = + io_sq->llq_info.descs_num_before_header; + io_sq->disable_meta_caching = + io_sq->llq_info.disable_meta_caching; + + if (io_sq->llq_info.max_entries_in_tx_burst > 0) + io_sq->entries_in_tx_burst_left = + io_sq->llq_info.max_entries_in_tx_burst; + } + + io_sq->tail = 0; + io_sq->next_to_comp = 0; + io_sq->phase = 1; + + return 0; +} + +static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx, + struct ena_com_io_cq *io_cq) +{ + size_t size; + int prev_node = 0; + + memset(&io_cq->cdesc_addr, 0x0, sizeof(io_cq->cdesc_addr)); + + /* Use the basic completion descriptor for Rx */ + io_cq->cdesc_entry_size_in_bytes = + (io_cq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? 
+ sizeof(struct ena_eth_io_tx_cdesc) : + sizeof(struct ena_eth_io_rx_cdesc_base); + + size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; + io_cq->bus = ena_dev->bus; + + prev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_cq->cdesc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); + set_dev_node(ena_dev->dmadev, prev_node); + if (!io_cq->cdesc_addr.virt_addr) { + io_cq->cdesc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_cq->cdesc_addr.phys_addr, + GFP_KERNEL); + } + + if (!io_cq->cdesc_addr.virt_addr) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); + return -ENOMEM; + } + + io_cq->phase = 1; + io_cq->head = 0; + + return 0; +} + +static void ena_com_handle_single_admin_completion(struct ena_com_admin_queue *admin_queue, + struct ena_admin_acq_entry *cqe) +{ + struct ena_comp_ctx *comp_ctx; + u16 cmd_id; + + cmd_id = cqe->acq_common_descriptor.command & + ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK; + + comp_ctx = get_comp_ctxt(admin_queue, cmd_id, false); + if (unlikely(!comp_ctx)) { + netdev_err(admin_queue->ena_dev->net_device, + "comp_ctx is NULL. Changing the admin queue running state\n"); + admin_queue->running_state = false; + return; + } + + comp_ctx->status = ENA_CMD_COMPLETED; + comp_ctx->comp_status = cqe->acq_common_descriptor.status; + + if (comp_ctx->user_cqe) + memcpy(comp_ctx->user_cqe, (void *)cqe, comp_ctx->comp_size); + + if (!admin_queue->polling) + complete(&comp_ctx->wait_event); +} + +static void ena_com_handle_admin_completion(struct ena_com_admin_queue *admin_queue) +{ + struct ena_admin_acq_entry *cqe = NULL; + u16 comp_num = 0; + u16 head_masked; + u8 phase; + + head_masked = admin_queue->cq.head & (admin_queue->q_depth - 1); + phase = admin_queue->cq.phase; + + cqe = &admin_queue->cq.entries[head_masked]; + + /* Go over all the completions */ + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & + ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + /* Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + ena_com_handle_single_admin_completion(admin_queue, cqe); + + head_masked++; + comp_num++; + if (unlikely(head_masked == admin_queue->q_depth)) { + head_masked = 0; + phase = !phase; + } + + cqe = &admin_queue->cq.entries[head_masked]; + } + + admin_queue->cq.head += comp_num; + admin_queue->cq.phase = phase; + admin_queue->sq.head += comp_num; + admin_queue->stats.completed_cmd += comp_num; +} + +static int ena_com_comp_status_to_errno(struct ena_com_admin_queue *admin_queue, + u8 comp_status) +{ + if (unlikely(comp_status != 0)) + netdev_err(admin_queue->ena_dev->net_device, + "Admin command failed[%u]\n", comp_status); + + switch (comp_status) { + case ENA_ADMIN_SUCCESS: + return 0; + case ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE: + return -ENOMEM; + case ENA_ADMIN_UNSUPPORTED_OPCODE: + return -EOPNOTSUPP; + case ENA_ADMIN_BAD_OPCODE: + case ENA_ADMIN_MALFORMED_REQUEST: + case ENA_ADMIN_ILLEGAL_PARAMETER: + case ENA_ADMIN_UNKNOWN_ERROR: + return -EINVAL; + case ENA_ADMIN_RESOURCE_BUSY: + return -EAGAIN; + } + + return -EINVAL; +} + +static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) +{ + delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us); + delay_us = min_t(u32, delay_us * (1U << exp), ENA_MAX_ADMIN_POLL_US); + usleep_range(delay_us, 2 * delay_us); +} + +static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx, + struct 
ena_com_admin_queue *admin_queue) +{ + unsigned long flags = 0; + unsigned long timeout; + int ret; + u32 exp = 0; + + timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout); + + while (1) { + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_com_handle_admin_completion(admin_queue); + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + if (comp_ctx->status != ENA_CMD_SUBMITTED) + break; + + if (time_is_before_jiffies(timeout)) { + netdev_err(admin_queue->ena_dev->net_device, + "Wait for completion (polling) timeout\n"); + /* ENA didn't have any completion */ + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.no_completion++; + admin_queue->running_state = false; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + ret = -ETIME; + goto err; + } + + ena_delay_exponential_backoff_us(exp++, + admin_queue->ena_dev->ena_min_poll_delay_us); + } + + if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { + netdev_err(admin_queue->ena_dev->net_device, + "Command was aborted\n"); + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.aborted_cmd++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ret = -ENODEV; + goto err; + } + + WARN(comp_ctx->status != ENA_CMD_COMPLETED, "Invalid comp status %d\n", + comp_ctx->status); + + ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); +err: + comp_ctxt_release(admin_queue, comp_ctx); + return ret; +} + +/* + * Set the LLQ configurations of the firmware + * + * The driver provides only the enabled feature values to the device, + * which in turn, checks if they are supported. + */ +static int ena_com_set_llq(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + int ret; + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.feat_common.feature_id = ENA_ADMIN_LLQ; + + cmd.u.llq.header_location_ctrl_enabled = llq_info->header_location_ctrl; + cmd.u.llq.entry_size_ctrl_enabled = llq_info->desc_list_entry_size_ctrl; + cmd.u.llq.desc_num_before_header_enabled = llq_info->descs_num_before_header; + cmd.u.llq.descriptors_stride_ctrl_enabled = llq_info->desc_stride_ctrl; + + cmd.u.llq.accel_mode.u.set.enabled_flags = + BIT(ENA_ADMIN_DISABLE_META_CACHING) | + BIT(ENA_ADMIN_LIMIT_TX_BURST); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set LLQ configurations: %d\n", ret); + + return ret; +} + +static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_cfg) +{ + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + struct ena_admin_accel_mode_get llq_accel_mode_get; + u16 supported_feat; + int rc; + + memset(llq_info, 0, sizeof(*llq_info)); + + supported_feat = llq_features->header_location_ctrl_supported; + + if (likely(supported_feat & llq_default_cfg->llq_header_location)) { + llq_info->header_location_ctrl = + llq_default_cfg->llq_header_location; + } else { + netdev_err(ena_dev->net_device, + "Invalid header location control, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + if (likely(llq_info->header_location_ctrl == 
ENA_ADMIN_INLINE_HEADER)) { + supported_feat = llq_features->descriptors_stride_ctrl_supported; + if (likely(supported_feat & llq_default_cfg->llq_stride_ctrl)) { + llq_info->desc_stride_ctrl = llq_default_cfg->llq_stride_ctrl; + } else { + if (supported_feat & ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY) { + llq_info->desc_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; + } else if (supported_feat & ENA_ADMIN_SINGLE_DESC_PER_ENTRY) { + llq_info->desc_stride_ctrl = ENA_ADMIN_SINGLE_DESC_PER_ENTRY; + } else { + netdev_err(ena_dev->net_device, + "Invalid desc_stride_ctrl, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + netdev_err(ena_dev->net_device, + "Default llq stride ctrl is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_stride_ctrl, + supported_feat, llq_info->desc_stride_ctrl); + } + } else { + llq_info->desc_stride_ctrl = 0; + } + + supported_feat = llq_features->entry_size_ctrl_supported; + if (likely(supported_feat & llq_default_cfg->llq_ring_entry_size)) { + llq_info->desc_list_entry_size_ctrl = llq_default_cfg->llq_ring_entry_size; + llq_info->desc_list_entry_size = llq_default_cfg->llq_ring_entry_size_value; + } else { + if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_128B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_128B; + llq_info->desc_list_entry_size = 128; + } else if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_192B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_192B; + llq_info->desc_list_entry_size = 192; + } else if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_256B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_256B; + llq_info->desc_list_entry_size = 256; + } else { + netdev_err(ena_dev->net_device, + "Invalid entry_size_ctrl, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + netdev_err(ena_dev->net_device, + "Default llq ring entry size is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_ring_entry_size, supported_feat, + llq_info->desc_list_entry_size); + } + if (unlikely(llq_info->desc_list_entry_size & 0x7)) { + /* The desc list entry size should be whole multiply of 8 + * This requirement comes from __iowrite64_copy() + */ + netdev_err(ena_dev->net_device, "Illegal entry size %d\n", + llq_info->desc_list_entry_size); + return -EINVAL; + } + + if (llq_info->desc_stride_ctrl == ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY) + llq_info->descs_per_entry = llq_info->desc_list_entry_size / + sizeof(struct ena_eth_io_tx_desc); + else + llq_info->descs_per_entry = 1; + + supported_feat = llq_features->desc_num_before_header_supported; + if (likely(supported_feat & llq_default_cfg->llq_num_decs_before_header)) { + llq_info->descs_num_before_header = llq_default_cfg->llq_num_decs_before_header; + } else { + if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8; + } else { + netdev_err(ena_dev->net_device, + "Invalid descs_num_before_header, 
supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + netdev_err(ena_dev->net_device, + "Default llq num descs before header is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_num_decs_before_header, + supported_feat, llq_info->descs_num_before_header); + } + /* Check for accelerated queue supported */ + llq_accel_mode_get = llq_features->accel_mode.u.get; + + llq_info->disable_meta_caching = + !!(llq_accel_mode_get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING)); + + if (llq_accel_mode_get.supported_flags & BIT(ENA_ADMIN_LIMIT_TX_BURST)) + llq_info->max_entries_in_tx_burst = + llq_accel_mode_get.max_tx_burst_size / + llq_default_cfg->llq_ring_entry_size_value; + + rc = ena_com_set_llq(ena_dev); + if (rc) + netdev_err(ena_dev->net_device, + "Cannot set LLQ configuration: %d\n", rc); + + return rc; +} + +static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + unsigned long flags = 0; + int ret; + + wait_for_completion_timeout(&comp_ctx->wait_event, + usecs_to_jiffies( + admin_queue->completion_timeout)); + + /* In case the command wasn't completed find out the root cause. + * There might be 2 kinds of errors + * 1) No completion (timeout reached) + * 2) There is completion but the device didn't get any msi-x interrupt. + */ + if (unlikely(comp_ctx->status == ENA_CMD_SUBMITTED)) { + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_com_handle_admin_completion(admin_queue); + admin_queue->stats.no_completion++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + if (comp_ctx->status == ENA_CMD_COMPLETED) { + netdev_err(admin_queue->ena_dev->net_device, + "The ena device sent a completion but the driver didn't receive a MSI-X interrupt (cmd %d), autopolling mode is %s\n", + comp_ctx->cmd_opcode, + admin_queue->auto_polling ? "ON" : "OFF"); + /* Check if fallback to polling is enabled */ + if (admin_queue->auto_polling) + admin_queue->polling = true; + } else { + netdev_err(admin_queue->ena_dev->net_device, + "The ena device didn't send a completion for the admin cmd %d status %d\n", + comp_ctx->cmd_opcode, comp_ctx->status); + } + /* Check if shifted to polling mode. + * This will happen if there is a completion without an interrupt + * and autopolling mode is enabled. 
Continuing normal execution in such case + */ + if (!admin_queue->polling) { + admin_queue->running_state = false; + ret = -ETIME; + goto err; + } + } + + ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); +err: + comp_ctxt_release(admin_queue, comp_ctx); + return ret; +} + +/* This method read the hardware device register through posting writes + * and waiting for response + * On timeout the function will return ENA_MMIO_READ_TIMEOUT + */ +static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + volatile struct ena_admin_ena_mmio_req_read_less_resp *read_resp = + mmio_read->read_resp; + u32 mmio_read_reg, ret, i; + unsigned long flags = 0; + u32 timeout = mmio_read->reg_read_to; + + might_sleep(); + + if (timeout == 0) + timeout = ENA_REG_READ_TIMEOUT; + + /* If readless is disabled, perform regular read */ + if (!mmio_read->readless_supported) + return readl(ena_dev->reg_bar + offset); + + spin_lock_irqsave(&mmio_read->lock, flags); + mmio_read->seq_num++; + + read_resp->req_id = mmio_read->seq_num + 0xDEAD; + mmio_read_reg = (offset << ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) & + ENA_REGS_MMIO_REG_READ_REG_OFF_MASK; + mmio_read_reg |= mmio_read->seq_num & + ENA_REGS_MMIO_REG_READ_REQ_ID_MASK; + + writel(mmio_read_reg, ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF); + + for (i = 0; i < timeout; i++) { + if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num) + break; + + udelay(1); + } + + if (unlikely(i == timeout)) { + netdev_err(ena_dev->net_device, + "Reading reg failed for timeout. expected: req id[%u] offset[%u] actual: req id[%u] offset[%u]\n", + mmio_read->seq_num, offset, read_resp->req_id, + read_resp->reg_off); + ret = ENA_MMIO_READ_TIMEOUT; + goto err; + } + + if (read_resp->reg_off != offset) { + netdev_err(ena_dev->net_device, + "Read failure: wrong offset provided\n"); + ret = ENA_MMIO_READ_TIMEOUT; + } else { + ret = read_resp->reg_val; + } +err: + spin_unlock_irqrestore(&mmio_read->lock, flags); + + return ret; +} + +/* There are two types to wait for completion. + * Polling mode - wait until the completion is available. + * Async mode - wait on wait queue until the completion is ready + * (or the timeout expired). + * It is expected that the IRQ called ena_com_handle_admin_completion + * to mark the completions. 
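One non-obvious detail in ena_com_reg_bar_read32() above is worth a note (editorial commentary, not part of the patch): the response buffer's req_id is pre-loaded with seq_num + 0xDEAD, presumably so the polling loop cannot mistake a stale buffer for a fresh answer; reg_val is only trusted once the device has overwritten req_id with the plain sequence number.

/*
 * Readless read handshake, as implemented above:
 *   1. read_resp->req_id = seq_num + 0xDEAD;          (poison value)
 *   2. writel(offset/seq_num encoding, MMIO_REG_READ register);
 *   3. poll until READ_ONCE(read_resp->req_id) == seq_num;
 *   4. verify read_resp->reg_off == offset, then use read_resp->reg_val.
 * e.g. for seq_num == 7 the driver waits for req_id to change from
 * 0xDEB4 (7 + 0xDEAD) to 7.
 */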
+ */ +static int ena_com_wait_and_process_admin_cq(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + if (admin_queue->polling) + return ena_com_wait_and_process_admin_cq_polling(comp_ctx, + admin_queue); + + return ena_com_wait_and_process_admin_cq_interrupts(comp_ctx, + admin_queue); +} + +static int ena_com_destroy_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_destroy_sq_cmd destroy_cmd; + struct ena_admin_acq_destroy_sq_resp_desc destroy_resp; + u8 direction; + int ret; + + memset(&destroy_cmd, 0x0, sizeof(destroy_cmd)); + + if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + direction = ENA_ADMIN_SQ_DIRECTION_TX; + else + direction = ENA_ADMIN_SQ_DIRECTION_RX; + + destroy_cmd.sq.sq_identity |= (direction << + ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT) & + ENA_ADMIN_SQ_SQ_DIRECTION_MASK; + + destroy_cmd.sq.sq_idx = io_sq->idx; + destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_SQ; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct ena_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (unlikely(ret && (ret != -ENODEV))) + netdev_err(ena_dev->net_device, + "Failed to destroy io sq error: %d\n", ret); + + return ret; +} + +static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq, + struct ena_com_io_cq *io_cq) +{ + size_t size; + + if (io_cq->cdesc_addr.virt_addr) { + size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; + + dma_free_coherent(ena_dev->dmadev, size, + io_cq->cdesc_addr.virt_addr, + io_cq->cdesc_addr.phys_addr); + + io_cq->cdesc_addr.virt_addr = NULL; + } + + if (io_sq->desc_addr.virt_addr) { + size = io_sq->desc_entry_size * io_sq->q_depth; + + dma_free_coherent(ena_dev->dmadev, size, + io_sq->desc_addr.virt_addr, + io_sq->desc_addr.phys_addr); + + io_sq->desc_addr.virt_addr = NULL; + } + + if (io_sq->bounce_buf_ctrl.base_buffer) { + devm_kfree(ena_dev->dmadev, io_sq->bounce_buf_ctrl.base_buffer); + io_sq->bounce_buf_ctrl.base_buffer = NULL; + } +} + +static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, + u16 exp_state) +{ + u32 val, exp = 0; + unsigned long timeout_stamp; + + /* Convert timeout from resolution of 100ms to us resolution. 
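+ * The caller passes the timeout in units of 100 ms, hence the
+ * 100 * 1000 factor before usecs_to_jiffies().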
*/ + timeout_stamp = jiffies + usecs_to_jiffies(100 * 1000 * timeout); + + while (1) { + val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + + if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, + "Reg read timeout occurred\n"); + return -ETIME; + } + + if ((val & ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) == + exp_state) + return 0; + + if (time_is_before_jiffies(timeout_stamp)) + return -ETIME; + + ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us); + } +} + +static bool ena_com_check_supported_feature_id(struct ena_com_dev *ena_dev, + enum ena_admin_aq_feature_id feature_id) +{ + u32 feature_mask = 1 << feature_id; + + /* Device attributes is always supported */ + if ((feature_id != ENA_ADMIN_DEVICE_ATTRIBUTES) && + !(ena_dev->supported_features & feature_mask)) + return false; + + return true; +} + +static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *get_resp, + enum ena_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size, + u8 feature_ver) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_get_feat_cmd get_cmd; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, feature_id)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + feature_id); + return -EOPNOTSUPP; + } + + memset(&get_cmd, 0x0, sizeof(get_cmd)); + admin_queue = &ena_dev->admin_queue; + + get_cmd.aq_common_descriptor.opcode = ENA_ADMIN_GET_FEATURE; + + if (control_buff_size) + get_cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + else + get_cmd.aq_common_descriptor.flags = 0; + + ret = ena_com_mem_addr_set(ena_dev, + &get_cmd.control_buffer.address, + control_buf_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + get_cmd.control_buffer.length = control_buff_size; + get_cmd.feat_common.feature_version = feature_ver; + get_cmd.feat_common.feature_id = feature_id; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *) + &get_cmd, + sizeof(get_cmd), + (struct ena_admin_acq_entry *) + get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to submit get_feature command %d error: %d\n", + feature_id, ret); + + return ret; +} + +static int ena_com_get_feature(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *get_resp, + enum ena_admin_aq_feature_id feature_id, + u8 feature_ver) +{ + return ena_com_get_feature_ex(ena_dev, + get_resp, + feature_id, + 0, + 0, + feature_ver); +} + +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev) +{ + return ena_dev->rss.hash_func; +} + +static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + (ena_dev->rss).hash_key; + + netdev_rss_key_fill(&hash_key->key, sizeof(hash_key->key)); + /* The key buffer is stored in the device in an array of + * uint32 elements. 
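+ * key_parts counts those u32 words; ena_com_fill_hash_function() derives
+ * the same value as key_len / sizeof(hash_key->key[0]).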
+ */ + hash_key->key_parts = ENA_ADMIN_RSS_KEY_PARTS; +} + +static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_FUNCTION)) + return -EOPNOTSUPP; + + rss->hash_key = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + &rss->hash_key_dma_addr, GFP_KERNEL); + + if (unlikely(!rss->hash_key)) + return -ENOMEM; + + return 0; +} + +static void ena_com_hash_key_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (rss->hash_key) + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + rss->hash_key, rss->hash_key_dma_addr); + rss->hash_key = NULL; +} + +static int ena_com_hash_ctrl_init(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + rss->hash_ctrl = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + &rss->hash_ctrl_dma_addr, GFP_KERNEL); + + if (unlikely(!rss->hash_ctrl)) + return -ENOMEM; + + return 0; +} + +static void ena_com_hash_ctrl_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (rss->hash_ctrl) + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + rss->hash_ctrl, rss->hash_ctrl_dma_addr); + rss->hash_ctrl = NULL; +} + +static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, + u16 log_size) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + size_t tbl_size; + int ret; + + ret = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG, 0); + if (unlikely(ret)) + return ret; + + if ((get_resp.u.ind_table.min_size > log_size) || + (get_resp.u.ind_table.max_size < log_size)) { + netdev_err(ena_dev->net_device, + "Indirect table size doesn't fit. 
requested size: %d while min is:%d and max %d\n", + 1 << log_size, 1 << get_resp.u.ind_table.min_size, + 1 << get_resp.u.ind_table.max_size); + return -EINVAL; + } + + tbl_size = (1ULL << log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + rss->rss_ind_tbl = + dma_zalloc_coherent(ena_dev->dmadev, tbl_size, + &rss->rss_ind_tbl_dma_addr, GFP_KERNEL); + if (unlikely(!rss->rss_ind_tbl)) + goto mem_err1; + + tbl_size = (1ULL << log_size) * sizeof(u16); + rss->host_rss_ind_tbl = + devm_kzalloc(ena_dev->dmadev, tbl_size, GFP_KERNEL); + if (unlikely(!rss->host_rss_ind_tbl)) + goto mem_err2; + + rss->tbl_log_size = log_size; + + return 0; + +mem_err2: + tbl_size = (1ULL << log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, + rss->rss_ind_tbl_dma_addr); + rss->rss_ind_tbl = NULL; +mem_err1: + rss->tbl_log_size = 0; + return -ENOMEM; +} + +static void ena_com_indirect_table_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + size_t tbl_size = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + if (rss->rss_ind_tbl) + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, + rss->rss_ind_tbl_dma_addr); + rss->rss_ind_tbl = NULL; + + if (rss->host_rss_ind_tbl) + devm_kfree(ena_dev->dmadev, rss->host_rss_ind_tbl); + rss->host_rss_ind_tbl = NULL; +} + +static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq, u16 cq_idx) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_create_sq_cmd create_cmd; + struct ena_admin_acq_create_sq_resp_desc cmd_completion; + u8 direction; + int ret; + + memset(&create_cmd, 0x0, sizeof(create_cmd)); + + create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_SQ; + + if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + direction = ENA_ADMIN_SQ_DIRECTION_TX; + else + direction = ENA_ADMIN_SQ_DIRECTION_RX; + + create_cmd.sq_identity |= (direction << + ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT) & + ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK; + + create_cmd.sq_caps_2 |= io_sq->mem_queue_type & + ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK; + + create_cmd.sq_caps_2 |= (ENA_ADMIN_COMPLETION_POLICY_DESC << + ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT) & + ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK; + + create_cmd.sq_caps_3 |= + ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK; + + create_cmd.cq_idx = cq_idx; + create_cmd.sq_depth = io_sq->q_depth; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + ret = ena_com_mem_addr_set(ena_dev, + &create_cmd.sq_ba, + io_sq->desc_addr.phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Memory address set failed\n"); + return ret; + } + } + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct ena_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to create IO SQ. 
error: %d\n", ret); + return ret; + } + + io_sq->idx = cmd_completion.sq_idx; + + io_sq->db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + (uintptr_t)cmd_completion.sq_doorbell_offset); + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + io_sq->desc_addr.pbuf_dev_addr = + (u8 __iomem *)((uintptr_t)ena_dev->mem_bar + + cmd_completion.llq_descriptors_offset); + } + + netdev_dbg(ena_dev->net_device, "Created sq[%u], depth[%u]\n", + io_sq->idx, io_sq->q_depth); + + return ret; +} + +static int ena_com_ind_tbl_convert_to_device(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_com_io_sq *io_sq; + u16 qid; + int i; + + for (i = 0; i < 1 << rss->tbl_log_size; i++) { + qid = rss->host_rss_ind_tbl[i]; + if (qid >= ENA_TOTAL_NUM_QUEUES) + return -EINVAL; + + io_sq = &ena_dev->io_sq_queues[qid]; + + if (io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX) + return -EINVAL; + + rss->rss_ind_tbl[i].cq_idx = io_sq->idx; + } + + return 0; +} + +static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev, + u16 intr_delay_resolution) +{ + u16 prev_intr_delay_resolution = ena_dev->intr_delay_resolution; + + if (unlikely(!intr_delay_resolution)) { + netdev_err(ena_dev->net_device, + "Illegal intr_delay_resolution provided. Going to use default 1 usec resolution\n"); + intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; + } + + /* update Rx */ + ena_dev->intr_moder_rx_interval = + ena_dev->intr_moder_rx_interval * + prev_intr_delay_resolution / + intr_delay_resolution; + + /* update Tx */ + ena_dev->intr_moder_tx_interval = + ena_dev->intr_moder_tx_interval * + prev_intr_delay_resolution / + intr_delay_resolution; + + ena_dev->intr_delay_resolution = intr_delay_resolution; +} + +/*****************************************************************************/ +/******************************* API ******************************/ +/*****************************************************************************/ + +int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size, + struct ena_admin_acq_entry *comp, + size_t comp_size) +{ + struct ena_comp_ctx *comp_ctx; + int ret; + + comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size, + comp, comp_size); + if (IS_ERR(comp_ctx)) { + ret = PTR_ERR(comp_ctx); + if (ret == -ENODEV) + netdev_dbg(admin_queue->ena_dev->net_device, + "Failed to submit command [%d]\n", ret); + else + netdev_err(admin_queue->ena_dev->net_device, + "Failed to submit command [%d]\n", ret); + + return ret; + } + + ret = ena_com_wait_and_process_admin_cq(comp_ctx, admin_queue); + if (unlikely(ret)) { + if (admin_queue->running_state) + netdev_err(admin_queue->ena_dev->net_device, + "Failed to process command. ret = %d\n", ret); + else + netdev_dbg(admin_queue->ena_dev->net_device, + "Failed to process command. 
ret = %d\n", ret); + } + return ret; +} + +int ena_com_create_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_create_cq_cmd create_cmd; + struct ena_admin_acq_create_cq_resp_desc cmd_completion; + int ret; + + memset(&create_cmd, 0x0, sizeof(create_cmd)); + + create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_CQ; + + create_cmd.cq_caps_2 |= (io_cq->cdesc_entry_size_in_bytes / 4) & + ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK; + create_cmd.cq_caps_1 |= + ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK; + + create_cmd.msix_vector = io_cq->msix_vector; + create_cmd.cq_depth = io_cq->q_depth; + + ret = ena_com_mem_addr_set(ena_dev, + &create_cmd.cq_ba, + io_cq->cdesc_addr.phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct ena_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to create IO CQ. error: %d\n", ret); + return ret; + } + + io_cq->idx = cmd_completion.cq_idx; + + io_cq->unmask_reg = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.cq_interrupt_unmask_register_offset); + + if (cmd_completion.numa_node_register_offset) + io_cq->numa_node_cfg_reg = + (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.numa_node_register_offset); + + netdev_dbg(ena_dev->net_device, "Created cq[%u], depth[%u]\n", + io_cq->idx, io_cq->q_depth); + + return ret; +} + +int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, + struct ena_com_io_sq **io_sq, + struct ena_com_io_cq **io_cq) +{ + if (qid >= ENA_TOTAL_NUM_QUEUES) { + netdev_err(ena_dev->net_device, + "Invalid queue number %d but the max is %d\n", qid, + ENA_TOTAL_NUM_QUEUES); + return -EINVAL; + } + + *io_sq = &ena_dev->io_sq_queues[qid]; + *io_cq = &ena_dev->io_cq_queues[qid]; + + return 0; +} + +void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_comp_ctx *comp_ctx; + u16 i; + + if (!admin_queue->comp_ctx) + return; + + for (i = 0; i < admin_queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(admin_queue, i, false); + if (unlikely(!comp_ctx)) + break; + + comp_ctx->status = ENA_CMD_ABORTED; + + complete(&comp_ctx->wait_event); + } +} + +void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + unsigned long flags = 0; + u32 exp = 0; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + while (atomic_read(&admin_queue->outstanding_cmds) != 0) { + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ena_delay_exponential_backoff_us(exp++, + ena_dev->ena_min_poll_delay_us); + spin_lock_irqsave(&admin_queue->q_lock, flags); + } + spin_unlock_irqrestore(&admin_queue->q_lock, flags); +} + +int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_destroy_cq_cmd destroy_cmd; + struct ena_admin_acq_destroy_cq_resp_desc destroy_resp; + int ret; + + memset(&destroy_cmd, 0x0, sizeof(destroy_cmd)); + + destroy_cmd.cq_idx = io_cq->idx; + destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_CQ; + + ret = 
ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct ena_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (unlikely(ret && (ret != -ENODEV))) + netdev_err(ena_dev->net_device, + "Failed to destroy IO CQ. error: %d\n", ret); + + return ret; +} + +bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.running_state; +} + +void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + unsigned long flags = 0; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_dev->admin_queue.running_state = state; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); +} + +void ena_com_admin_aenq_enable(struct ena_com_dev *ena_dev) +{ + u16 depth = ena_dev->aenq.q_depth; + + WARN(ena_dev->aenq.head != depth, "Invalid AENQ state\n"); + + /* Init head_db to mark that all entries in the queue + * are initially available + */ + writel(depth, ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +} + +int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_admin_get_feat_resp get_resp; + int ret; + + ret = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_AENQ_CONFIG, 0); + if (ret) { + dev_info(ena_dev->dmadev, "Can't get aenq configuration\n"); + return ret; + } + + if ((get_resp.u.aenq.supported_groups & groups_flag) != groups_flag) { + netdev_warn(ena_dev->net_device, + "Trying to set unsupported aenq events. supported flag: 0x%x asked flag: 0x%x\n", + get_resp.u.aenq.supported_groups, groups_flag); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = 0; + cmd.feat_common.feature_id = ENA_ADMIN_AENQ_CONFIG; + cmd.u.aenq.enabled_groups = groups_flag; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to config AENQ ret: %d\n", ret); + + return ret; +} + +int ena_com_get_dma_width(struct ena_com_dev *ena_dev) +{ + u32 caps = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); + u32 width; + + if (unlikely(caps == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + width = (caps & ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >> + ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT; + + netdev_dbg(ena_dev->net_device, "ENA dma width: %d\n", width); + + if ((width < 32) || width > ENA_MAX_PHYS_ADDR_SIZE_BITS) { + netdev_err(ena_dev->net_device, "DMA width illegal value: %d\n", + width); + return -EINVAL; + } + + ena_dev->dma_addr_bits = width; + + return width; +} + +int ena_com_validate_version(struct ena_com_dev *ena_dev) +{ + u32 ver; + u32 ctrl_ver; + u32 ctrl_ver_masked; + + /* Make sure the ENA version and the controller version are at least + * as the driver expects + */ + ver = ena_com_reg_bar_read32(ena_dev, ENA_REGS_VERSION_OFF); + ctrl_ver = ena_com_reg_bar_read32(ena_dev, + ENA_REGS_CONTROLLER_VERSION_OFF); + + if (unlikely((ver == ENA_MMIO_READ_TIMEOUT) || + (ctrl_ver == ENA_MMIO_READ_TIMEOUT))) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + 
dev_info(ena_dev->dmadev, "ENA device version: %d.%d\n", + (ver & ENA_REGS_VERSION_MAJOR_VERSION_MASK) >> + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, + ver & ENA_REGS_VERSION_MINOR_VERSION_MASK); + + dev_info(ena_dev->dmadev, + "ENA controller version: %d.%d.%d implementation version %d\n", + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK), + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >> + ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT); + + ctrl_ver_masked = + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) | + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) | + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK); + + /* Validate the ctrl version without the implementation ID */ + if (ctrl_ver_masked < MIN_ENA_CTRL_VER) { + netdev_err(ena_dev->net_device, + "ENA ctrl version is lower than the minimal ctrl version the driver supports\n"); + return -1; + } + + return 0; +} + +static void +ena_com_free_ena_admin_queue_comp_ctx(struct ena_com_dev *ena_dev, + struct ena_com_admin_queue *admin_queue) + +{ + if (!admin_queue->comp_ctx) + return; + + devm_kfree(ena_dev->dmadev, admin_queue->comp_ctx); + + admin_queue->comp_ctx = NULL; +} + +void ena_com_admin_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_com_admin_cq *cq = &admin_queue->cq; + struct ena_com_admin_sq *sq = &admin_queue->sq; + struct ena_com_aenq *aenq = &ena_dev->aenq; + u16 size; + + ena_com_free_ena_admin_queue_comp_ctx(ena_dev, admin_queue); + + size = ADMIN_SQ_SIZE(admin_queue->q_depth); + if (sq->entries) + dma_free_coherent(ena_dev->dmadev, size, sq->entries, + sq->dma_addr); + sq->entries = NULL; + + size = ADMIN_CQ_SIZE(admin_queue->q_depth); + if (cq->entries) + dma_free_coherent(ena_dev->dmadev, size, cq->entries, + cq->dma_addr); + cq->entries = NULL; + + size = ADMIN_AENQ_SIZE(aenq->q_depth); + if (ena_dev->aenq.entries) + dma_free_coherent(ena_dev->dmadev, size, aenq->entries, + aenq->dma_addr); + aenq->entries = NULL; +} + +void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling) +{ + u32 mask_value = 0; + + if (polling) + mask_value = ENA_REGS_ADMIN_INTR_MASK; + + writel(mask_value, ena_dev->reg_bar + ENA_REGS_INTR_MASK_OFF); + ena_dev->admin_queue.polling = polling; +} + +bool ena_com_get_admin_polling_mode(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.polling; +} + +void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, + bool polling) +{ + ena_dev->admin_queue.auto_polling = polling; +} + +bool ena_com_phc_supported(struct ena_com_dev *ena_dev) +{ + return ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_PHC_CONFIG); +} + +int ena_com_phc_init(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + + memset(phc, 0x0, sizeof(*phc)); + + /* Allocate shared mem used PHC timestamp retrieved from device */ + phc->virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), + &phc->phys_addr, GFP_KERNEL); + if (unlikely(!phc->virt_addr)) + return -ENOMEM; + + spin_lock_init(&phc->lock); + + phc->virt_addr->req_id = 0; + phc->virt_addr->timestamp = 0; + + return 0; +} + +int ena_com_phc_config(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; 
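+ /* Configuration is a two-step exchange: read the device defaults with
+  * GET_FEATURE(ENA_ADMIN_PHC_CONFIG), then hand the shared-memory output
+  * address back via SET_FEATURE so the device can publish timestamps
+  * readlessly.
+  */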
+ struct ena_admin_get_feat_resp get_feat_resp; + struct ena_admin_set_feat_resp set_feat_resp; + struct ena_admin_set_feat_cmd set_feat_cmd; + int ret = 0; + + /* Get device PHC default configuration */ + ret = ena_com_get_feature(ena_dev, &get_feat_resp, ENA_ADMIN_PHC_CONFIG, 0); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to get PHC feature configuration, error: %d\n", + ret); + return ret; + } + + /* Suporting only readless PHC retrieval */ + if (get_feat_resp.u.phc.type != ENA_ADMIN_PHC_TYPE_READLESS) { + netdev_err(ena_dev->net_device, + "Unsupprted PHC type, error: %d\n", -EOPNOTSUPP); + return -EOPNOTSUPP; + } + + /* Update PHC doorbell offset according to device value, used to write req_id to PHC bar */ + phc->doorbell_offset = get_feat_resp.u.phc.doorbell_offset; + + /* Update PHC expire timeout according to device or default driver value */ + phc->expire_timeout_usec = (get_feat_resp.u.phc.expire_timeout_usec) ? + get_feat_resp.u.phc.expire_timeout_usec : + ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC; + + /* Update PHC block timeout according to device or default driver value */ + phc->block_timeout_usec = (get_feat_resp.u.phc.block_timeout_usec) ? + get_feat_resp.u.phc.block_timeout_usec : + ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC; + + /* Sanity check - expire timeout must not be above skip timeout */ + if (phc->expire_timeout_usec > phc->block_timeout_usec) + phc->expire_timeout_usec = phc->block_timeout_usec; + + /* Prepare PHC feature command with PHC output address */ + memset(&set_feat_cmd, 0x0, sizeof(set_feat_cmd)); + set_feat_cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + set_feat_cmd.feat_common.feature_id = ENA_ADMIN_PHC_CONFIG; + set_feat_cmd.u.phc.output_length = sizeof(*phc->virt_addr); + ret = ena_com_mem_addr_set(ena_dev, &set_feat_cmd.u.phc.output_address, phc->phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed setting PHC output address, error: %d\n", + ret); + return ret; + } + + /* Send PHC feature command to the device */ + ret = ena_com_execute_admin_command(&ena_dev->admin_queue, + (struct ena_admin_aq_entry *)&set_feat_cmd, + sizeof(set_feat_cmd), + (struct ena_admin_acq_entry *)&set_feat_resp, + sizeof(set_feat_resp)); + + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to enable PHC, error: %d\n", ret); + return ret; + } + + phc->active = true; + netdev_dbg(ena_dev->net_device, "PHC is active in the device\n"); + + return ret; +} + +void ena_com_phc_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + + phc->active = false; + + /* In case PHC is not supported by the device, silently exiting */ + if (!phc->virt_addr) + return; + + dma_free_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), + phc->virt_addr, phc->phys_addr); + phc->virt_addr = NULL; +} + +int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) +{ + volatile struct ena_admin_phc_resp *read_resp = ena_dev->phc.virt_addr; + struct ena_com_phc_info *phc = &ena_dev->phc; + ktime_t initial_time = ktime_set(0, 0); + static ktime_t start_time; + unsigned long flags = 0; + ktime_t expire_time; + ktime_t block_time; + int ret = 0; + + if (!phc->active) { + netdev_err(ena_dev->net_device, + "PHC feature is not active in the device\n"); + return -EOPNOTSUPP; + } + + spin_lock_irqsave(&phc->lock, flags); + + /* Check if PHC is in blocked state */ + if (unlikely(ktime_compare(start_time, initial_time))) { + /* Check if blocking time expired */ + block_time = ktime_add_us(start_time, 
phc->block_timeout_usec); + if (!ktime_after(ktime_get(), block_time)) { + /* PHC is still in blocked state, skip PHC request */ + phc->stats.phc_skp++; + ret = -EBUSY; + goto skip; + } + + /* PHC is in active state, update statistics according to req_id and timestamp */ + if ((READ_ONCE(read_resp->req_id) != phc->req_id) || + (read_resp->timestamp == ENA_PHC_TIMESTAMP_ERROR)) { + /* Device didn't update req_id during blocking time or timestamp is invalid, + * this indicates on a device error + */ + phc->stats.phc_err++; + } else { + /* Device updated req_id during blocking time with valid timestamp */ + phc->stats.phc_exp++; + } + } + + /* Setting relative timeouts */ + start_time = ktime_get(); + block_time = ktime_add_us(start_time, phc->block_timeout_usec); + expire_time = ktime_add_us(start_time, phc->expire_timeout_usec); + + /* We expect the device to return this req_id once the new PHC timestamp is updated */ + phc->req_id++; + + /* Initialize PHC shared memory with different req_id value to be able to identify once the + * device changes it to req_id + */ + read_resp->req_id = phc->req_id + ENA_PHC_REQ_ID_OFFSET; + + /* Writing req_id to PHC bar */ + writel(phc->req_id, ena_dev->reg_bar + phc->doorbell_offset); + + /* Stalling until the device updates req_id */ + while (1) { + if (unlikely(ktime_after(ktime_get(), expire_time))) { + /* Gave up waiting for updated req_id, PHC enters into + * blocked state until passing blocking time + */ + ret = -EBUSY; + break; + } + + /* Check if req_id was updated by the device */ + if (READ_ONCE(read_resp->req_id) != phc->req_id) { + /* req_id was not updated by the device, check again on next loop */ + continue; + } + + /* req_id was updated which indicates that PHC timestamp was updated too */ + *timestamp = read_resp->timestamp; + + /* PHC timestamp validty check */ + if (unlikely(*timestamp == ENA_PHC_TIMESTAMP_ERROR)) { + /* Retrieved invalid PHC timestamp, PHC enters into + * blocked state until passing blocking time + */ + ret = -EBUSY; + break; + } + + /* Retrieved valid PHC timestamp */ + phc->stats.phc_cnt++; + + /* This indicates PHC state is active */ + start_time = initial_time; + break; + } + +skip: + spin_unlock_irqrestore(&phc->lock, flags); + + return ret; +} + +int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + spin_lock_init(&mmio_read->lock); + mmio_read->read_resp = + dma_zalloc_coherent(ena_dev->dmadev, + sizeof(*mmio_read->read_resp), + &mmio_read->read_resp_dma_addr, GFP_KERNEL); + if (unlikely(!mmio_read->read_resp)) + goto err; + + ena_com_mmio_reg_read_request_write_dev_addr(ena_dev); + + mmio_read->read_resp->req_id = 0x0; + mmio_read->seq_num = 0x0; + mmio_read->readless_supported = true; + + return 0; + +err: + + return -ENOMEM; +} + +void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, bool readless_supported) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + mmio_read->readless_supported = readless_supported; +} + +void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF); + writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF); + + dma_free_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), + mmio_read->read_resp, mmio_read->read_resp_dma_addr); + + mmio_read->read_resp = NULL; +} + +void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev 
*ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + u32 addr_low, addr_high; + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(mmio_read->read_resp_dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(mmio_read->read_resp_dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF); +} + +int ena_com_admin_init(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + u32 aq_caps, acq_caps, dev_sts, addr_low, addr_high; + int ret; + + dev_sts = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + + if (unlikely(dev_sts == ENA_MMIO_READ_TIMEOUT)) { + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); + return -ETIME; + } + + if (!(dev_sts & ENA_REGS_DEV_STS_READY_MASK)) { + netdev_err(ena_dev->net_device, + "Device isn't ready, abort com init\n"); + return -ENODEV; + } + + admin_queue->q_depth = ENA_ADMIN_QUEUE_DEPTH; + + admin_queue->bus = ena_dev->bus; + admin_queue->q_dmadev = ena_dev->dmadev; + admin_queue->polling = false; + admin_queue->curr_cmd_id = 0; + + atomic_set(&admin_queue->outstanding_cmds, 0); + + spin_lock_init(&admin_queue->q_lock); + + ret = ena_com_init_comp_ctxt(admin_queue); + if (ret) + goto error; + + ret = ena_com_admin_init_sq(admin_queue); + if (ret) + goto error; + + ret = ena_com_admin_init_cq(admin_queue); + if (ret) + goto error; + + admin_queue->sq.db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + ENA_REGS_AQ_DB_OFF); + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(admin_queue->sq.dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(admin_queue->sq.dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_AQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_AQ_BASE_HI_OFF); + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(admin_queue->cq.dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(admin_queue->cq.dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_ACQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_ACQ_BASE_HI_OFF); + + aq_caps = 0; + aq_caps |= admin_queue->q_depth & ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK; + aq_caps |= (sizeof(struct ena_admin_aq_entry) << + ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK; + + acq_caps = 0; + acq_caps |= admin_queue->q_depth & ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK; + acq_caps |= (sizeof(struct ena_admin_acq_entry) << + ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK; + + writel(aq_caps, ena_dev->reg_bar + ENA_REGS_AQ_CAPS_OFF); + writel(acq_caps, ena_dev->reg_bar + ENA_REGS_ACQ_CAPS_OFF); + ret = ena_com_admin_init_aenq(ena_dev, aenq_handlers); + if (ret) + goto error; + + admin_queue->ena_dev = ena_dev; + admin_queue->running_state = true; + + return 0; +error: + ena_com_admin_destroy(ena_dev); + + return ret; +} + +int ena_com_create_io_queue(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx) +{ + struct ena_com_io_sq *io_sq; + struct ena_com_io_cq *io_cq; + int ret; + + if (ctx->qid >= ENA_TOTAL_NUM_QUEUES) { + netdev_err(ena_dev->net_device, + "Qid (%d) is bigger than max num of queues (%d)\n", + ctx->qid, ENA_TOTAL_NUM_QUEUES); + return -EINVAL; + } + + io_sq = &ena_dev->io_sq_queues[ctx->qid]; + io_cq = &ena_dev->io_cq_queues[ctx->qid]; + + memset(io_sq, 0x0, sizeof(*io_sq)); + memset(io_cq, 0x0, sizeof(*io_cq)); + + /* Init CQ */ + io_cq->q_depth = ctx->queue_size; + io_cq->direction = ctx->direction; + io_cq->qid 
= ctx->qid; + + io_cq->msix_vector = ctx->msix_vector; + + io_sq->q_depth = ctx->queue_size; + io_sq->direction = ctx->direction; + io_sq->qid = ctx->qid; + + io_sq->mem_queue_type = ctx->mem_queue_type; + + if (ctx->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + /* header length is limited to 8 bits */ + io_sq->tx_max_header_size = + min_t(u32, ena_dev->tx_max_header_size, SZ_256); + + ret = ena_com_init_io_sq(ena_dev, ctx, io_sq); + if (ret) + goto error; + ret = ena_com_init_io_cq(ena_dev, ctx, io_cq); + if (ret) + goto error; + + ret = ena_com_create_io_cq(ena_dev, io_cq); + if (ret) + goto error; + + ret = ena_com_create_io_sq(ena_dev, io_sq, io_cq->idx); + if (ret) + goto destroy_io_cq; + + return 0; + +destroy_io_cq: + ena_com_destroy_io_cq(ena_dev, io_cq); +error: + ena_com_io_queue_free(ena_dev, io_sq, io_cq); + return ret; +} + +void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid) +{ + struct ena_com_io_sq *io_sq; + struct ena_com_io_cq *io_cq; + + if (qid >= ENA_TOTAL_NUM_QUEUES) { + netdev_err(ena_dev->net_device, + "Qid (%d) is bigger than max num of queues (%d)\n", + qid, ENA_TOTAL_NUM_QUEUES); + return; + } + + io_sq = &ena_dev->io_sq_queues[qid]; + io_cq = &ena_dev->io_cq_queues[qid]; + + ena_com_destroy_io_sq(ena_dev, io_sq); + ena_com_destroy_io_cq(ena_dev, io_cq); + + ena_com_io_queue_free(ena_dev, io_sq, io_cq); +} + +int ena_com_get_link_params(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *resp) +{ + return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG, 0); +} + +static int ena_get_dev_stats(struct ena_com_dev *ena_dev, + struct ena_com_stats_ctx *ctx, + enum ena_admin_get_stats_type type) +{ + struct ena_admin_acq_get_stats_resp *get_resp = &ctx->get_resp; + struct ena_admin_aq_get_stats_cmd *get_cmd = &ctx->get_cmd; + struct ena_com_admin_queue *admin_queue; + int ret; + + admin_queue = &ena_dev->admin_queue; + + get_cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS; + get_cmd->aq_common_descriptor.flags = 0; + get_cmd->type = type; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)get_cmd, + sizeof(*get_cmd), + (struct ena_admin_acq_entry *)get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to get stats. error: %d\n", ret); + + return ret; +} + +static void ena_com_set_supported_customer_metrics(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics; + struct ena_com_stats_ctx ctx; + int ret; + + customer_metrics = &ena_dev->customer_metrics; + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_CUSTOMER_METRICS)) { + customer_metrics->supported_metrics = ENA_ADMIN_CUSTOMER_METRICS_MIN_SUPPORT_MASK; + return; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ctx.get_cmd.requested_metrics = ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK; + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS); + if (likely(ret == 0)) + customer_metrics->supported_metrics = + ctx.get_resp.u.customer_metrics.reported_metrics; + else + netdev_err(ena_dev->net_device, + "Failed to query customer metrics support. 
error: %d\n", + ret); +} + +int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct ena_admin_get_feat_resp get_resp; + int rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_DEVICE_ATTRIBUTES, 0); + if (rc) + return rc; + + memcpy(&get_feat_ctx->dev_attr, &get_resp.u.dev_attr, + sizeof(get_resp.u.dev_attr)); + + ena_dev->supported_features = get_resp.u.dev_attr.supported_features; + ena_dev->capabilities = get_resp.u.dev_attr.capabilities; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_MAX_QUEUES_EXT, + ENA_FEATURE_MAX_QUEUE_EXT_VER); + if (rc) + return rc; + + if (get_resp.u.max_queue_ext.version != + ENA_FEATURE_MAX_QUEUE_EXT_VER) + return -EINVAL; + + memcpy(&get_feat_ctx->max_queue_ext, &get_resp.u.max_queue_ext, + sizeof(get_resp.u.max_queue_ext)); + ena_dev->tx_max_header_size = + get_resp.u.max_queue_ext.max_queue_ext.max_tx_header_size; + } else { + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_MAX_QUEUES_NUM, 0); + memcpy(&get_feat_ctx->max_queues, &get_resp.u.max_queue, + sizeof(get_resp.u.max_queue)); + ena_dev->tx_max_header_size = + get_resp.u.max_queue.max_header_size; + + if (rc) + return rc; + } + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_AENQ_CONFIG, 0); + if (rc) + return rc; + + memcpy(&get_feat_ctx->aenq, &get_resp.u.aenq, + sizeof(get_resp.u.aenq)); + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); + if (rc) + return rc; + + memcpy(&get_feat_ctx->offload, &get_resp.u.offload, + sizeof(get_resp.u.offload)); + + /* Driver hints isn't mandatory admin command. So in case the + * command isn't supported set driver hints to 0 + */ + rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS, 0); + + if (!rc) + memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints, + sizeof(get_resp.u.hw_hints)); + else if (rc == -EOPNOTSUPP) + memset(&get_feat_ctx->hw_hints, 0x0, + sizeof(get_feat_ctx->hw_hints)); + else + return rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_LLQ, 0); + if (!rc) + memcpy(&get_feat_ctx->llq, &get_resp.u.llq, + sizeof(get_resp.u.llq)); + else if (rc == -EOPNOTSUPP) + memset(&get_feat_ctx->llq, 0x0, sizeof(get_feat_ctx->llq)); + else + return rc; + + ena_com_set_supported_customer_metrics(ena_dev); + + return 0; +} + +void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev) +{ + ena_com_handle_admin_completion(&ena_dev->admin_queue); +} + +/* ena_handle_specific_aenq_event: + * return the handler that is relevant to the specific event group + */ +static ena_aenq_handler ena_com_get_specific_aenq_cb(struct ena_com_dev *ena_dev, + u16 group) +{ + struct ena_aenq_handlers *aenq_handlers = ena_dev->aenq.aenq_handlers; + + if ((group < ENA_MAX_HANDLERS) && aenq_handlers->handlers[group]) + return aenq_handlers->handlers[group]; + + return aenq_handlers->unimplemented_handler; +} + +/* ena_aenq_intr_handler: + * handles the aenq incoming events. 
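+ * Each entry is owned by the driver only while its phase bit matches
+ * aenq->phase (the phase flips on every wrap-around);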
+ * pop events from the queue and apply the specific handler + */ +void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data) +{ + struct ena_admin_aenq_entry *aenq_e; + struct ena_admin_aenq_common_desc *aenq_common; + struct ena_com_aenq *aenq = &ena_dev->aenq; + u64 timestamp; + ena_aenq_handler handler_cb; + u16 masked_head, processed = 0; + u8 phase; + + masked_head = aenq->head & (aenq->q_depth - 1); + phase = aenq->phase; + aenq_e = &aenq->entries[masked_head]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((READ_ONCE(aenq_common->flags) & + ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* Make sure the phase bit (ownership) is as expected before + * reading the rest of the descriptor. + */ + dma_rmb(); + + timestamp = (u64)aenq_common->timestamp_low | + ((u64)aenq_common->timestamp_high << 32); + + netdev_dbg(ena_dev->net_device, + "AENQ! Group[%x] Syndrome[%x] timestamp: [%llus]\n", + aenq_common->group, aenq_common->syndrome, timestamp); + + /* Handle specific event*/ + handler_cb = ena_com_get_specific_aenq_cb(ena_dev, + aenq_common->group); + handler_cb(data, aenq_e); /* call the actual event handler*/ + + /* Get next event entry */ + masked_head++; + processed++; + + if (unlikely(masked_head == aenq->q_depth)) { + masked_head = 0; + phase = !phase; + } + aenq_e = &aenq->entries[masked_head]; + aenq_common = &aenq_e->aenq_common_desc; + } + + aenq->head += processed; + aenq->phase = phase; + + /* Don't update aenq doorbell if there weren't any processed events */ + if (!processed) + return; + + /* write the aenq doorbell after all AENQ descriptors were read */ + mb(); + writel_relaxed((u32)aenq->head, + ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +#ifndef MMIOWB_NOT_DEFINED + mmiowb(); +#endif +} + +int ena_com_dev_reset(struct ena_com_dev *ena_dev, + enum ena_regs_reset_reason_types reset_reason) +{ + u32 stat, timeout, cap, reset_val; + int rc; + + stat = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + cap = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); + + if (unlikely((stat == ENA_MMIO_READ_TIMEOUT) || + (cap == ENA_MMIO_READ_TIMEOUT))) { + netdev_err(ena_dev->net_device, "Reg read32 timeout occurred\n"); + return -ETIME; + } + + if ((stat & ENA_REGS_DEV_STS_READY_MASK) == 0) { + netdev_err(ena_dev->net_device, + "Device isn't ready, can't reset device\n"); + return -EINVAL; + } + + timeout = (cap & ENA_REGS_CAPS_RESET_TIMEOUT_MASK) >> + ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT; + if (timeout == 0) { + netdev_err(ena_dev->net_device, "Invalid timeout value\n"); + return -EINVAL; + } + + /* start reset */ + reset_val = ENA_REGS_DEV_CTL_DEV_RESET_MASK; + reset_val |= (reset_reason << ENA_REGS_DEV_CTL_RESET_REASON_SHIFT) & + ENA_REGS_DEV_CTL_RESET_REASON_MASK; + writel(reset_val, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); + + /* Write again the MMIO read request address */ + ena_com_mmio_reg_read_request_write_dev_addr(ena_dev); + + rc = wait_for_reset_state(ena_dev, timeout, + ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK); + if (rc != 0) { + netdev_err(ena_dev->net_device, + "Reset indication didn't turn on\n"); + return rc; + } + + /* reset done */ + writel(0, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); + rc = wait_for_reset_state(ena_dev, timeout, 0); + if (rc != 0) { + netdev_err(ena_dev->net_device, + "Reset indication didn't turn off\n"); + return rc; + } + + timeout = (cap & ENA_REGS_CAPS_ADMIN_CMD_TO_MASK) >> + ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT; + if (timeout) + /* the resolution of 
timeout reg is 100ms */ + ena_dev->admin_queue.completion_timeout = timeout * 100000; + else + ena_dev->admin_queue.completion_timeout = ADMIN_CMD_TIMEOUT_US; + + return 0; +} + +int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, + struct ena_admin_eni_stats *stats) +{ + struct ena_com_stats_ctx ctx; + int ret; + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENI_STATS)) { + netdev_err(ena_dev->net_device, + "Capability %d isn't supported\n", + ENA_ADMIN_ENI_STATS); + return -EOPNOTSUPP; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENI); + if (likely(ret == 0)) + memcpy(stats, &ctx.get_resp.u.eni_stats, + sizeof(ctx.get_resp.u.eni_stats)); + + return ret; +} + +int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, + struct ena_admin_ena_srd_info *info) +{ + struct ena_com_stats_ctx ctx; + int ret; + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + netdev_err(ena_dev->net_device, + "Capability %d isn't supported\n", + ENA_ADMIN_ENA_SRD_INFO); + return -EOPNOTSUPP; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENA_SRD); + if (likely(ret == 0)) + memcpy(info, &ctx.get_resp.u.ena_srd_info, + sizeof(ctx.get_resp.u.ena_srd_info)); + + return ret; +} + +int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, + struct ena_admin_basic_stats *stats) +{ + struct ena_com_stats_ctx ctx; + int ret; + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_BASIC); + if (likely(ret == 0)) + memcpy(stats, &ctx.get_resp.u.basic_stats, + sizeof(ctx.get_resp.u.basic_stats)); + + return ret; +} + +int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 len) +{ + struct ena_admin_aq_get_stats_cmd *get_cmd; + struct ena_com_stats_ctx ctx; + int ret; + + if (unlikely(len > ena_dev->customer_metrics.buffer_len)) { + netdev_err(ena_dev->net_device, + "Invalid buffer size %u. The given buffer is too big.\n", + len); + return -EINVAL; + } + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_CUSTOMER_METRICS)) { + netdev_err(ena_dev->net_device, "Capability %d not supported.\n", + ENA_ADMIN_CUSTOMER_METRICS); + return -EOPNOTSUPP; + } + + if (!ena_dev->customer_metrics.supported_metrics) { + netdev_err(ena_dev->net_device, + "No supported customer metrics.\n"); + return -EOPNOTSUPP; + } + + get_cmd = &ctx.get_cmd; + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_com_mem_addr_set(ena_dev, + &get_cmd->u.control_buffer.address, + ena_dev->customer_metrics.buffer_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed.\n"); + return ret; + } + + get_cmd->u.control_buffer.length = ena_dev->customer_metrics.buffer_len; + get_cmd->requested_metrics = ena_dev->customer_metrics.supported_metrics; + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS); + if (likely(ret == 0)) + memcpy(buffer, ena_dev->customer_metrics.buffer_virt_addr, len); + else + netdev_err(ena_dev->net_device, + "Failed to get customer metrics. 
error: %d\n", ret); + + return ret; +} + +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_MTU)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_MTU); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = 0; + cmd.feat_common.feature_id = ENA_ADMIN_MTU; + cmd.u.mtu.mtu = mtu; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set mtu %d. error: %d\n", mtu, ret); + + return ret; +} + +int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, + struct ena_admin_feature_offload_desc *offload) +{ + int ret; + struct ena_admin_get_feat_resp resp; + + ret = ena_com_get_feature(ena_dev, &resp, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to get offload capabilities %d\n", ret); + return ret; + } + + memcpy(offload, &resp.u.offload, sizeof(resp.u.offload)); + + return 0; +} + +int ena_com_set_hash_function(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_admin_get_feat_resp get_resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_FUNCTION)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_FUNCTION); + return -EOPNOTSUPP; + } + + /* Validate hash function is supported */ + ret = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, 0); + if (unlikely(ret)) + return ret; + + if (!(get_resp.u.flow_hash_func.supported_func & BIT(rss->hash_func))) { + netdev_err(ena_dev->net_device, + "Func hash %d isn't supported by device, abort\n", + rss->hash_func); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_HASH_FUNCTION; + cmd.u.flow_hash_func.init_val = rss->hash_init_val; + cmd.u.flow_hash_func.selected_func = 1 << rss->hash_func; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->hash_key_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.control_buffer.length = sizeof(*rss->hash_key); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to set hash function %d. 
error: %d\n", + rss->hash_func, ret); + return -EINVAL; + } + + return 0; +} + +int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions func, + const u8 *key, u16 key_len, u32 init_val) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key; + struct ena_admin_get_feat_resp get_resp; + enum ena_admin_hash_functions old_func; + struct ena_rss *rss = &ena_dev->rss; + int rc; + + hash_key = rss->hash_key; + + /* Make sure size is a mult of DWs */ + if (unlikely(key_len & 0x3)) + return -EINVAL; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key), 0); + if (unlikely(rc)) + return rc; + + if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) { + netdev_err(ena_dev->net_device, + "Flow hash function %d isn't supported\n", func); + return -EOPNOTSUPP; + } + + if ((func == ENA_ADMIN_TOEPLITZ) && key) { + if (key_len != sizeof(hash_key->key)) { + netdev_err(ena_dev->net_device, + "key len (%u) doesn't equal the supported size (%zu)\n", + key_len, sizeof(hash_key->key)); + return -EINVAL; + } + memcpy(hash_key->key, key, key_len); + hash_key->key_parts = key_len / sizeof(hash_key->key[0]); + } + + rss->hash_init_val = init_val; + old_func = rss->hash_func; + rss->hash_func = func; + rc = ena_com_set_hash_function(ena_dev); + + /* Restore the old function */ + if (unlikely(rc)) + rss->hash_func = old_func; + + return rc; +} + +int ena_com_get_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions *func) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + int rc; + + if (unlikely(!func)) + return -EINVAL; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key), 0); + if (unlikely(rc)) + return rc; + + /* ffs() returns 1 in case the lsb is set */ + rss->hash_func = ffs(get_resp.u.flow_hash_func.selected_func); + if (rss->hash_func) + rss->hash_func--; + + *func = rss->hash_func; + + return 0; +} + +int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + ena_dev->rss.hash_key; + + if (key) + memcpy(key, hash_key->key, + (size_t)(hash_key->key_parts) * sizeof(hash_key->key[0])); + + return 0; +} + +int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 *fields) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + int rc; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_INPUT, + rss->hash_ctrl_dma_addr, + sizeof(*rss->hash_ctrl), 0); + if (unlikely(rc)) + return rc; + + if (fields) + *fields = rss->hash_ctrl->selected_fields[proto].fields; + + return 0; +} + +int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = rss->hash_ctrl; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_INPUT)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_INPUT); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + 
cmd.feat_common.feature_id = ENA_ADMIN_RSS_HASH_INPUT; + cmd.u.flow_hash_input.enabled_input_sort = + ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK | + ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->hash_ctrl_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + cmd.control_buffer.length = sizeof(*hash_ctrl); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set hash input. error: %d\n", ret); + + return ret; +} + +int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = + rss->hash_ctrl; + u16 available_fields = 0; + int rc, i; + + /* Get the supported hash input */ + rc = ena_com_get_hash_ctrl(ena_dev, 0, NULL); + if (unlikely(rc)) + return rc; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_TCP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_UDP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_TCP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_UDP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP4_FRAG].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_NOT_IP].fields = + ENA_ADMIN_RSS_L2_DA | ENA_ADMIN_RSS_L2_SA; + + for (i = 0; i < ENA_ADMIN_RSS_PROTO_NUM; i++) { + available_fields = hash_ctrl->selected_fields[i].fields & + hash_ctrl->supported_fields[i].fields; + if (available_fields != hash_ctrl->selected_fields[i].fields) { + netdev_err(ena_dev->net_device, + "Hash control doesn't support all the desire configuration. proto %x supported %x selected %x\n", + i, hash_ctrl->supported_fields[i].fields, + hash_ctrl->selected_fields[i].fields); + return -EOPNOTSUPP; + } + } + + rc = ena_com_set_hash_ctrl(ena_dev); + + /* In case of failure, restore the old hash ctrl */ + if (unlikely(rc)) + ena_com_get_hash_ctrl(ena_dev, 0, NULL); + + return rc; +} + +int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 hash_fields) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = rss->hash_ctrl; + u16 supported_fields; + int rc; + + if (proto >= ENA_ADMIN_RSS_PROTO_NUM) { + netdev_err(ena_dev->net_device, "Invalid proto num (%u)\n", + proto); + return -EINVAL; + } + + /* Get the ctrl table */ + rc = ena_com_get_hash_ctrl(ena_dev, proto, NULL); + if (unlikely(rc)) + return rc; + + /* Make sure all the fields are supported */ + supported_fields = hash_ctrl->supported_fields[proto].fields; + if ((hash_fields & supported_fields) != hash_fields) { + netdev_err(ena_dev->net_device, + "Proto %d doesn't support the required fields %x. 
supports only: %x\n", + proto, hash_fields, supported_fields); + } + + hash_ctrl->selected_fields[proto].fields = hash_fields; + + rc = ena_com_set_hash_ctrl(ena_dev); + + /* In case of failure, restore the old hash ctrl */ + if (unlikely(rc)) + ena_com_get_hash_ctrl(ena_dev, 0, NULL); + + return 0; +} + +int ena_com_indirect_table_fill_entry(struct ena_com_dev *ena_dev, + u16 entry_idx, u16 entry_value) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (unlikely(entry_idx >= (1 << rss->tbl_log_size))) + return -EINVAL; + + if (unlikely((entry_value > ENA_TOTAL_NUM_QUEUES))) + return -EINVAL; + + rss->host_rss_ind_tbl[entry_idx] = entry_value; + + return 0; +} + +int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id( + ena_dev, ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG); + return -EOPNOTSUPP; + } + + ret = ena_com_ind_tbl_convert_to_device(ena_dev); + if (ret) { + netdev_err(ena_dev->net_device, + "Failed to convert host indirection table to device table\n"); + return ret; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG; + cmd.u.ind_table.size = rss->tbl_log_size; + cmd.u.ind_table.inline_index = 0xFFFFFFFF; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->rss_ind_tbl_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.control_buffer.length = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set indirect table. error: %d\n", ret); + + return ret; +} + +int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + u32 tbl_size; + int i, rc; + + tbl_size = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG, + rss->rss_ind_tbl_dma_addr, + tbl_size, 0); + if (unlikely(rc)) + return rc; + + if (!ind_tbl) + return 0; + + for (i = 0; i < (1 << rss->tbl_log_size); i++) + ind_tbl[i] = rss->host_rss_ind_tbl[i]; + + return 0; +} + +int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 indr_tbl_log_size) +{ + int rc; + + memset(&ena_dev->rss, 0x0, sizeof(ena_dev->rss)); + + rc = ena_com_indirect_table_allocate(ena_dev, indr_tbl_log_size); + if (unlikely(rc)) + goto err_indr_tbl; + + /* The following function might return unsupported in case the + * device doesn't support setting the key / hash function. We can safely + * ignore this error and have indirection table support only. 
+ */ + rc = ena_com_hash_key_allocate(ena_dev); + if (likely(!rc)) + ena_com_hash_key_fill_default_key(ena_dev); + else if (rc != -EOPNOTSUPP) + goto err_hash_key; + + rc = ena_com_hash_ctrl_init(ena_dev); + if (unlikely(rc)) + goto err_hash_ctrl; + + return 0; + +err_hash_ctrl: + ena_com_hash_key_destroy(ena_dev); +err_hash_key: + ena_com_indirect_table_destroy(ena_dev); +err_indr_tbl: + + return rc; +} + +void ena_com_rss_destroy(struct ena_com_dev *ena_dev) +{ + ena_com_indirect_table_destroy(ena_dev); + ena_com_hash_key_destroy(ena_dev); + ena_com_hash_ctrl_destroy(ena_dev); + + memset(&ena_dev->rss, 0x0, sizeof(ena_dev->rss)); +} + +int ena_com_allocate_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + host_attr->host_info = + dma_zalloc_coherent(ena_dev->dmadev, SZ_4K, + &host_attr->host_info_dma_addr, GFP_KERNEL); + if (unlikely(!host_attr->host_info)) + return -ENOMEM; + + host_attr->host_info->ena_spec_version = ((ENA_COMMON_SPEC_VERSION_MAJOR << + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT) | + (ENA_COMMON_SPEC_VERSION_MINOR)); + + return 0; +} + +int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, + u32 debug_area_size) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + host_attr->debug_area_virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, debug_area_size, + &host_attr->debug_area_dma_addr, GFP_KERNEL); + if (unlikely(!host_attr->debug_area_virt_addr)) { + host_attr->debug_area_size = 0; + return -ENOMEM; + } + + host_attr->debug_area_size = debug_area_size; + + return 0; +} + +int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics = &ena_dev->customer_metrics; + + customer_metrics->buffer_len = ENA_CUSTOMER_METRICS_BUFFER_SIZE; + customer_metrics->buffer_virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, + customer_metrics->buffer_len, + &customer_metrics->buffer_dma_addr, + GFP_KERNEL); + if (!customer_metrics->buffer_virt_addr) + return -ENOMEM; + + return 0; +} + +void ena_com_delete_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + if (host_attr->host_info) { + dma_free_coherent(ena_dev->dmadev, SZ_4K, host_attr->host_info, + host_attr->host_info_dma_addr); + host_attr->host_info = NULL; + } +} + +void ena_com_delete_debug_area(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + if (host_attr->debug_area_virt_addr) { + dma_free_coherent(ena_dev->dmadev, host_attr->debug_area_size, + host_attr->debug_area_virt_addr, + host_attr->debug_area_dma_addr); + host_attr->debug_area_virt_addr = NULL; + } +} + +void ena_com_delete_customer_metrics_buffer(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics = &ena_dev->customer_metrics; + + if (customer_metrics->buffer_virt_addr) { + dma_free_coherent(ena_dev->dmadev, customer_metrics->buffer_len, + customer_metrics->buffer_virt_addr, + customer_metrics->buffer_dma_addr); + customer_metrics->buffer_virt_addr = NULL; + } +} + +int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + + int ret; + + /* Host attribute config is called before ena_com_get_dev_attr_feat + * so ena_com can't check if the feature is supported. 
+ */ + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.feat_common.feature_id = ENA_ADMIN_HOST_ATTR_CONFIG; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.u.host_attr.debug_ba, + host_attr->debug_area_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.u.host_attr.os_info_ba, + host_attr->host_info_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed\n"); + return ret; + } + + cmd.u.host_attr.debug_area_size = host_attr->debug_area_size; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to set host attributes: %d\n", ret); + + return ret; +} + +/* Interrupt moderation */ +bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev) +{ + return ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_INTERRUPT_MODERATION); +} + +static int ena_com_update_nonadaptive_moderation_interval(struct ena_com_dev *ena_dev, + u32 coalesce_usecs, + u32 intr_delay_resolution, + u32 *intr_moder_interval) +{ + if (!intr_delay_resolution) { + netdev_err(ena_dev->net_device, + "Illegal interrupt delay granularity value\n"); + return -EFAULT; + } + + *intr_moder_interval = coalesce_usecs / intr_delay_resolution; + + return 0; +} + +int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, + u32 tx_coalesce_usecs) +{ + return ena_com_update_nonadaptive_moderation_interval(ena_dev, + tx_coalesce_usecs, + ena_dev->intr_delay_resolution, + &ena_dev->intr_moder_tx_interval); +} + +int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, + u32 rx_coalesce_usecs) +{ + return ena_com_update_nonadaptive_moderation_interval(ena_dev, + rx_coalesce_usecs, + ena_dev->intr_delay_resolution, + &ena_dev->intr_moder_rx_interval); +} + +int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) +{ + struct ena_admin_get_feat_resp get_resp; + u16 delay_resolution; + int rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_INTERRUPT_MODERATION, 0); + + if (rc) { + if (rc == -EOPNOTSUPP) { + netdev_dbg(ena_dev->net_device, + "Feature %d isn't supported\n", + ENA_ADMIN_INTERRUPT_MODERATION); + rc = 0; + } else { + netdev_err(ena_dev->net_device, + "Failed to get interrupt moderation admin cmd. 
rc: %d\n", + rc); + } + + /* no moderation supported, disable adaptive support */ + ena_com_disable_adaptive_moderation(ena_dev); + return rc; + } + + /* if moderation is supported by device we set adaptive moderation */ + delay_resolution = get_resp.u.intr_moderation.intr_delay_resolution; + ena_com_update_intr_delay_resolution(ena_dev, delay_resolution); + + /* Disable adaptive moderation by default - can be enabled later */ + ena_com_disable_adaptive_moderation(ena_dev); + + return 0; +} + +unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev) +{ + return ena_dev->intr_moder_tx_interval; +} + +unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev) +{ + return ena_dev->intr_moder_rx_interval; +} + +int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_cfg) +{ + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + int rc; + + if (!llq_features->max_llq_num) { + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + rc = ena_com_config_llq_info(ena_dev, llq_features, llq_default_cfg); + if (rc) + return rc; + + ena_dev->tx_max_header_size = llq_info->desc_list_entry_size - + (llq_info->descs_num_before_header * sizeof(struct ena_eth_io_tx_desc)); + + if (unlikely(ena_dev->tx_max_header_size == 0)) { + netdev_err(ena_dev->net_device, + "The size of the LLQ entry is smaller than needed\n"); + return -EINVAL; + } + + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_DEV; + + return 0; +} diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h new file mode 100644 index 0000000000000..f44e59176e459 --- /dev/null +++ b/drivers/amazon/net/ena/ena_com.h @@ -0,0 +1,1193 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef ENA_COM +#define ENA_COM + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "ena_common_defs.h" +#include "ena_admin_defs.h" +#include "ena_eth_io_defs.h" +#include "ena_regs_defs.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define ENA_MAX_NUM_IO_QUEUES 128U +/* We need to queues for each IO (on for Tx and one for Rx) */ +#define ENA_TOTAL_NUM_QUEUES (2 * (ENA_MAX_NUM_IO_QUEUES)) + +#define ENA_MAX_HANDLERS 256 + +#define ENA_MAX_PHYS_ADDR_SIZE_BITS 48 + +/* Unit in usec */ +#define ENA_REG_READ_TIMEOUT 200000 + +#define ADMIN_SQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aq_entry)) +#define ADMIN_CQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_acq_entry)) +#define ADMIN_AENQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aenq_entry)) + +#define ENA_CUSTOMER_METRICS_BUFFER_SIZE 512 + +/*****************************************************************************/ +/*****************************************************************************/ +/* ENA adaptive interrupt moderation settings */ + +#define ENA_INTR_INITIAL_TX_INTERVAL_USECS 64 +#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 20 +#define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1 + +#define ENA_HASH_KEY_SIZE 40 + +#define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF + +#define ENA_FEATURE_MAX_QUEUE_EXT_VER 1 + +struct ena_llq_configurations { + enum ena_admin_llq_header_location llq_header_location; + enum ena_admin_llq_ring_entry_size llq_ring_entry_size; + enum ena_admin_llq_stride_ctrl llq_stride_ctrl; + enum ena_admin_llq_num_descs_before_header llq_num_decs_before_header; + u16 llq_ring_entry_size_value; +}; + +enum queue_direction { + ENA_COM_IO_QUEUE_DIRECTION_TX, + ENA_COM_IO_QUEUE_DIRECTION_RX +}; + +struct ena_com_buf { + dma_addr_t paddr; /**< Buffer physical address */ + u16 len; /**< Buffer length in bytes */ +}; + +struct ena_com_rx_buf_info { + u16 len; + u16 req_id; +}; + +struct ena_com_io_desc_addr { + u8 __iomem *pbuf_dev_addr; /* LLQ address */ + u8 *virt_addr; + dma_addr_t phys_addr; +}; + +struct ena_com_tx_meta { + u16 mss; + u16 l3_hdr_len; + u16 l3_hdr_offset; + u16 l4_hdr_len; /* In words */ +}; + +struct ena_com_llq_info { + u16 header_location_ctrl; + u16 desc_stride_ctrl; + u16 desc_list_entry_size_ctrl; + u16 desc_list_entry_size; + u16 descs_num_before_header; + u16 descs_per_entry; + u16 max_entries_in_tx_burst; + bool disable_meta_caching; +}; + +struct ena_com_io_cq { + struct ena_com_io_desc_addr cdesc_addr; + void *bus; + + /* Interrupt unmask register */ + u32 __iomem *unmask_reg; + + + /* numa configuration register (for TPH) */ + u32 __iomem *numa_node_cfg_reg; + + /* The value to write to the above register to unmask + * the interrupt of this queue + */ + u32 msix_vector ____cacheline_aligned; + + enum queue_direction direction; + + /* holds the number of cdesc of the current packet */ + u16 cur_rx_pkt_cdesc_count; + /* save the first cdesc idx of the current packet */ + u16 cur_rx_pkt_cdesc_start_idx; + + u16 q_depth; + /* Caller qid */ + u16 qid; + + /* Device queue index */ + u16 idx; + u16 head; + u8 phase; + u8 cdesc_entry_size_in_bytes; + +} ____cacheline_aligned; + +struct ena_com_io_bounce_buffer_control { + u8 *base_buffer; + u16 next_to_use; + u16 buffer_size; + u16 buffers_num; /* Must be a power of 2 */ +}; + +/* This struct is to keep tracking the current location of the next llq entry */ +struct ena_com_llq_pkt_ctrl { + u8 *curr_bounce_buf; + u16 idx; + u16 
descs_left_in_line; +}; + +struct ena_com_io_sq { + struct ena_com_io_desc_addr desc_addr; + void *bus; + + u32 __iomem *db_addr; + + enum queue_direction direction; + enum ena_admin_placement_policy_type mem_queue_type; + + bool disable_meta_caching; + + u32 msix_vector; + struct ena_com_tx_meta cached_tx_meta; + struct ena_com_llq_info llq_info; + struct ena_com_llq_pkt_ctrl llq_buf_ctrl; + struct ena_com_io_bounce_buffer_control bounce_buf_ctrl; + + u16 q_depth; + u16 qid; + + u16 idx; + u16 tail; + u16 next_to_comp; + u16 llq_last_copy_tail; + u32 tx_max_header_size; + u8 phase; + u8 desc_entry_size; + u8 dma_addr_bits; + u16 entries_in_tx_burst_left; +} ____cacheline_aligned; + +struct ena_com_admin_cq { + struct ena_admin_acq_entry *entries; + dma_addr_t dma_addr; + + u16 head; + u8 phase; +}; + +struct ena_com_admin_sq { + struct ena_admin_aq_entry *entries; + dma_addr_t dma_addr; + + u32 __iomem *db_addr; + + u16 head; + u16 tail; + u8 phase; + +}; + +struct ena_com_stats_admin { + u64 aborted_cmd; + u64 submitted_cmd; + u64 completed_cmd; + u64 out_of_space; + u64 no_completion; +}; + +struct ena_com_stats_phc { + u64 phc_cnt; + u64 phc_exp; + u64 phc_skp; + u64 phc_err; +}; + +struct ena_com_admin_queue { + void *q_dmadev; + void *bus; + struct ena_com_dev *ena_dev; + spinlock_t q_lock; /* spinlock for the admin queue */ + + struct ena_comp_ctx *comp_ctx; + u32 completion_timeout; + u16 q_depth; + struct ena_com_admin_cq cq; + struct ena_com_admin_sq sq; + + /* Indicate if the admin queue should poll for completion */ + bool polling; + + /* Define if fallback to polling mode should occur */ + bool auto_polling; + + u16 curr_cmd_id; + + /* Indicate that the ena was initialized and can + * process new admin commands + */ + bool running_state; + + /* Count the number of outstanding admin commands */ + atomic_t outstanding_cmds; + + struct ena_com_stats_admin stats; +}; + +struct ena_aenq_handlers; + +struct ena_com_aenq { + u16 head; + u8 phase; + struct ena_admin_aenq_entry *entries; + dma_addr_t dma_addr; + u16 q_depth; + struct ena_aenq_handlers *aenq_handlers; +}; + +struct ena_com_mmio_read { + struct ena_admin_ena_mmio_req_read_less_resp *read_resp; + dma_addr_t read_resp_dma_addr; + u32 reg_read_to; /* in us */ + u16 seq_num; + bool readless_supported; + /* spin lock to ensure a single outstanding read */ + spinlock_t lock; +}; + +/* PTP hardware clock (PHC) MMIO read data info */ +struct ena_com_phc_info { + /* Internal PHC statistics */ + struct ena_com_stats_phc stats; + + /* PHC shared memory - virtual address */ + struct ena_admin_phc_resp *virt_addr; + + /* Spin lock to ensure a single outstanding PHC read */ + spinlock_t lock; + + /* PHC doorbell address as an offset to PCIe MMIO REG BAR */ + u32 doorbell_offset; + + /* Shared memory read expire timeout (usec) + * Max time for valid PHC retrieval, passing this threshold will fail the get time request + * and block new PHC requests for block_timeout_usec in order to prevent floods on busy + * device + */ + u32 expire_timeout_usec; + + /* Shared memory read abort timeout (usec) + * PHC requests block period, blocking starts once PHC request expired in order to prevent + * floods on busy device, any PHC requests during block period will be skipped + */ + u32 block_timeout_usec; + + /* Request id sent to the device */ + u16 req_id; + + /* True if PHC is active in the device */ + bool active; + + /* PHC shared memory - memory handle */ + + /* PHC shared memory - physical address */ + dma_addr_t phys_addr; +}; + +struct 
ena_rss { + /* Indirect table */ + u16 *host_rss_ind_tbl; + struct ena_admin_rss_ind_table_entry *rss_ind_tbl; + dma_addr_t rss_ind_tbl_dma_addr; + u16 tbl_log_size; + + /* Hash key */ + enum ena_admin_hash_functions hash_func; + struct ena_admin_feature_rss_flow_hash_control *hash_key; + dma_addr_t hash_key_dma_addr; + u32 hash_init_val; + + /* Flow Control */ + struct ena_admin_feature_rss_hash_control *hash_ctrl; + dma_addr_t hash_ctrl_dma_addr; + +}; + +struct ena_customer_metrics { + /* in correlation with ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK + * and ena_admin_customer_metrics_id + */ + u64 supported_metrics; + dma_addr_t buffer_dma_addr; + void *buffer_virt_addr; + u32 buffer_len; +}; + +struct ena_host_attribute { + /* Debug area */ + u8 *debug_area_virt_addr; + dma_addr_t debug_area_dma_addr; + u32 debug_area_size; + + /* Host information */ + struct ena_admin_host_info *host_info; + dma_addr_t host_info_dma_addr; +}; + +/* Each ena_dev is a PCI function. */ +struct ena_com_dev { + struct ena_com_admin_queue admin_queue; + struct ena_com_aenq aenq; + struct ena_com_io_cq io_cq_queues[ENA_TOTAL_NUM_QUEUES]; + struct ena_com_io_sq io_sq_queues[ENA_TOTAL_NUM_QUEUES]; + u8 __iomem *reg_bar; + void __iomem *mem_bar; + void *dmadev; + void *bus; + struct net_device *net_device; + + enum ena_admin_placement_policy_type tx_mem_queue_type; + u32 tx_max_header_size; + u16 stats_func; /* Selected function for extended statistic dump */ + u16 stats_queue; /* Selected queue for extended statistic dump */ + + u32 ena_min_poll_delay_us; + + struct ena_com_mmio_read mmio_read; + struct ena_com_phc_info phc; + + struct ena_rss rss; + u32 supported_features; + u32 capabilities; + u32 dma_addr_bits; + + struct ena_host_attribute host_attr; + bool adaptive_coalescing; + u16 intr_delay_resolution; + + /* interrupt moderation intervals are in usec divided by + * intr_delay_resolution, which is supplied by the device. + */ + u32 intr_moder_tx_interval; + u32 intr_moder_rx_interval; + + struct ena_intr_moder_entry *intr_moder_tbl; + + struct ena_com_llq_info llq_info; + + struct ena_customer_metrics customer_metrics; +}; + +struct ena_com_dev_get_features_ctx { + struct ena_admin_queue_feature_desc max_queues; + struct ena_admin_queue_ext_feature_desc max_queue_ext; + struct ena_admin_device_attr_feature_desc dev_attr; + struct ena_admin_feature_aenq_desc aenq; + struct ena_admin_feature_offload_desc offload; + struct ena_admin_ena_hw_hints hw_hints; + struct ena_admin_feature_llq_desc llq; +}; + +struct ena_com_create_io_ctx { + enum ena_admin_placement_policy_type mem_queue_type; + enum queue_direction direction; + int numa_node; + u32 msix_vector; + u16 queue_size; + u16 qid; +}; + +typedef void (*ena_aenq_handler)(void *data, + struct ena_admin_aenq_entry *aenq_e); + +/* Holds aenq handlers. Indexed by AENQ event group */ +struct ena_aenq_handlers { + ena_aenq_handler handlers[ENA_MAX_HANDLERS]; + ena_aenq_handler unimplemented_handler; +}; + +/*****************************************************************************/ +/*****************************************************************************/ + +/* ena_com_mmio_reg_read_request_init - Init the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + * + * Initialize the register read mechanism. + * + * @note: This method must be the first stage in the initialization sequence. + * + * @return - 0 on success, negative value on failure. 
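+ *
+ * A minimal ordering sketch (illustrative only; 'reset_reason' and
+ * 'aenq_handlers' are assumed to be supplied by the calling driver and
+ * error unwinding is omitted):
+ *
+ *	rc = ena_com_mmio_reg_read_request_init(ena_dev);
+ *	if (unlikely(rc))
+ *		return rc;
+ *	rc = ena_com_dev_reset(ena_dev, reset_reason);
+ *	if (unlikely(rc))
+ *		return rc;
+ *	rc = ena_com_admin_init(ena_dev, &aenq_handlers);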
+ */ +int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev); + +/* ena_com_phc_init - Allocate and initialize PHC feature + * @ena_dev: ENA communication layer struct + * @note: This method assumes PHC is supported by the device + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_init(struct ena_com_dev *ena_dev); + +/* ena_com_phc_supported - Return if PHC feature is supported by the device + * @ena_dev: ENA communication layer struct + * @note: This method must be called after getting supported features + * @return - supported or not + */ +bool ena_com_phc_supported(struct ena_com_dev *ena_dev); + +/* ena_com_phc_config - Configure PHC feature + * @ena_dev: ENA communication layer struct + * Configure PHC feature in driver and device + * @note: This method assumes PHC is supported by the device + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_config(struct ena_com_dev *ena_dev); + +/* ena_com_phc_destroy - Destroy PHC feature + * @ena_dev: ENA communication layer struct + */ +void ena_com_phc_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_phc_get - Retrieve PHC timestamp + * @ena_dev: ENA communication layer struct + * @timestamp: Retrieve PHC timestamp + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp); + +/* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism + * @ena_dev: ENA communication layer struct + * @readless_supported: readless mode (enable/disable) + */ +void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, + bool readless_supported); + +/* ena_com_mmio_reg_read_request_write_dev_addr - Write the mmio reg read return + * value physical address. + * @ena_dev: ENA communication layer struct + */ +void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev *ena_dev); + +/* ena_com_mmio_reg_read_request_destroy - Destroy the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + */ +void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_admin_init - Init the admin and the async queues + * @ena_dev: ENA communication layer struct + * @aenq_handlers: Those handlers to be called upon event. + * + * Initialize the admin submission and completion queues. + * Initialize the asynchronous events notification queues. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_admin_init(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers); + +/* ena_com_admin_destroy - Destroy the admin and the async events queues. + * @ena_dev: ENA communication layer struct + * + * @note: Before calling this method, the caller must validate that the device + * won't send any additional admin completions/aenq. + * To achieve that, a FLR is recommended. + */ +void ena_com_admin_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_dev_reset - Perform device FLR to the device. + * @ena_dev: ENA communication layer struct + * @reset_reason: Specify what is the trigger for the reset in case of an error. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_dev_reset(struct ena_com_dev *ena_dev, + enum ena_regs_reset_reason_types reset_reason); + +/* ena_com_create_io_queue - Create io queue. + * @ena_dev: ENA communication layer struct + * @ctx - create context structure + * + * Create the submission and the completion queues. + * + * @return - 0 on success, negative value on failure. 
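+ *
+ * A minimal usage sketch (illustrative only; 'qid', 'queue_size', 'node'
+ * and 'msix_vector' are assumed to come from the caller):
+ *
+ *	struct ena_com_create_io_ctx ctx = {
+ *		.mem_queue_type = ena_dev->tx_mem_queue_type,
+ *		.direction = ENA_COM_IO_QUEUE_DIRECTION_TX,
+ *		.numa_node = node,
+ *		.msix_vector = msix_vector,
+ *		.queue_size = queue_size,
+ *		.qid = qid,
+ *	};
+ *
+ *	rc = ena_com_create_io_queue(ena_dev, &ctx);
+ *	if (!rc)
+ *		rc = ena_com_get_io_handlers(ena_dev, qid, &io_sq, &io_cq);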
+ */ +int ena_com_create_io_queue(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx); + +/* ena_com_destroy_io_queue - Destroy IO queue with the queue id - qid. + * @ena_dev: ENA communication layer struct + * @qid - the caller virtual queue id. + */ +void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid); + +/* ena_com_get_io_handlers - Return the io queue handlers + * @ena_dev: ENA communication layer struct + * @qid - the caller virtual queue id. + * @io_sq - IO submission queue handler + * @io_cq - IO completion queue handler. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, + struct ena_com_io_sq **io_sq, + struct ena_com_io_cq **io_cq); + +/* ena_com_admin_aenq_enable - ENAble asynchronous event notifications + * @ena_dev: ENA communication layer struct + * + * After this method, aenq event can be received via AENQ. + */ +void ena_com_admin_aenq_enable(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_running_state - Set the state of the admin queue + * @ena_dev: ENA communication layer struct + * + * Change the state of the admin queue (enable/disable) + */ +void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state); + +/* ena_com_get_admin_running_state - Get the admin queue state + * @ena_dev: ENA communication layer struct + * + * Retrieve the state of the admin queue (enable/disable) + * + * @return - current polling mode (enable/disable) + */ +bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_polling_mode - Set the admin completion queue polling mode + * @ena_dev: ENA communication layer struct + * @polling: ENAble/Disable polling mode + * + * Set the admin completion mode. + */ +void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling); + +/* ena_com_get_admin_polling_mode - Get the admin completion queue polling mode + * @ena_dev: ENA communication layer struct + * + * Get the admin completion mode. + * If polling mode is on, ena_com_execute_admin_command will perform a + * polling on the admin completion queue for the commands completion, + * otherwise it will wait on wait event. + * + * @return state + */ +bool ena_com_get_admin_polling_mode(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_auto_polling_mode - Enable autoswitch to polling mode + * @ena_dev: ENA communication layer struct + * @polling: Enable/Disable polling mode + * + * Set the autopolling mode. + * If autopolling is on: + * In case of missing interrupt when data is available switch to polling. + */ +void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, + bool polling); + +/* ena_com_admin_q_comp_intr_handler - admin queue interrupt handler + * @ena_dev: ENA communication layer struct + * + * This method goes over the admin completion queue and wakes up all the pending + * threads that wait on the commands wait event. + * + * @note: Should be called after MSI-X interrupt. + */ +void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev); + +/* ena_com_aenq_intr_handler - AENQ interrupt handler + * @ena_dev: ENA communication layer struct + * + * This method goes over the async event notification queue and calls the proper + * aenq handler. + */ +void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data); + +/* ena_com_abort_admin_commands - Abort all the outstanding admin commands. 
+ * @ena_dev: ENA communication layer struct + * + * This method aborts all the outstanding admin commands. + * The caller should then call ena_com_wait_for_abort_completion to make sure + * all the commands were completed. + */ +void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev); + +/* ena_com_wait_for_abort_completion - Wait for admin commands abort. + * @ena_dev: ENA communication layer struct + * + * This method waits until all the outstanding admin commands are completed. + */ +void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev); + +/* ena_com_validate_version - Validate the device parameters + * @ena_dev: ENA communication layer struct + * + * This method verifies the device parameters are the same as the saved + * parameters in ena_dev. + * This method is useful after device reset, to validate the device mac address + * and the device offloads are the same as before the reset. + * + * @return - 0 on success negative value otherwise. + */ +int ena_com_validate_version(struct ena_com_dev *ena_dev); + +/* ena_com_get_link_params - Retrieve physical link parameters. + * @ena_dev: ENA communication layer struct + * @resp: Link parameters + * + * Retrieve the physical link parameters, + * like speed, auto-negotiation and full duplex support. + * + * @return - 0 on Success negative value otherwise. + */ +int ena_com_get_link_params(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *resp); + +/* ena_com_get_dma_width - Retrieve physical dma address width the device + * supports. + * @ena_dev: ENA communication layer struct + * + * Retrieve the maximum physical address bits the device can handle. + * + * @return: > 0 on Success and negative value otherwise. + */ +int ena_com_get_dma_width(struct ena_com_dev *ena_dev); + +/* ena_com_set_aenq_config - Set aenq groups configurations + * @ena_dev: ENA communication layer struct + * @groups flag: bit fields flags of enum ena_admin_aenq_group. + * + * Configure which aenq event group the driver would like to receive. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag); + +/* ena_com_get_dev_attr_feat - Get device features + * @ena_dev: ENA communication layer struct + * @get_feat_ctx: returned context that contain the get features. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx); + +/* ena_com_get_dev_basic_stats - Get device basic statistics + * @ena_dev: ENA communication layer struct + * @stats: stats return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, + struct ena_admin_basic_stats *stats); + +/* ena_com_get_eni_stats - Get extended network interface statistics + * @ena_dev: ENA communication layer struct + * @stats: stats return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, + struct ena_admin_eni_stats *stats); + +/* ena_com_get_ena_srd_info - Get ENA SRD network interface statistics + * @ena_dev: ENA communication layer struct + * @info: ena srd stats and flags + * + * @return: 0 on Success and negative value otherwise. 
+ */ +int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, + struct ena_admin_ena_srd_info *info); + +/* ena_com_get_customer_metrics - Get customer metrics for network interface + * @ena_dev: ENA communication layer struct + * @buffer: buffer for returned customer metrics + * @len: size of the buffer + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 len); + +/* ena_com_set_dev_mtu - Configure the device mtu. + * @ena_dev: ENA communication layer struct + * @mtu: mtu value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu); + +/* ena_com_get_offload_settings - Retrieve the device offloads capabilities + * @ena_dev: ENA communication layer struct + * @offlad: offload return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, + struct ena_admin_feature_offload_desc *offload); + +/* ena_com_rss_init - Init RSS + * @ena_dev: ENA communication layer struct + * @log_size: indirection log size + * + * Allocate RSS/RFS resources. + * The caller then can configure rss using ena_com_set_hash_function, + * ena_com_set_hash_ctrl and ena_com_indirect_table_set. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 log_size); + +/* ena_com_rss_destroy - Destroy rss + * @ena_dev: ENA communication layer struct + * + * Free all the RSS/RFS resources. + */ +void ena_com_rss_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_get_current_hash_function - Get RSS hash function + * @ena_dev: ENA communication layer struct + * + * Return the current hash function. + * @return: 0 or one of the ena_admin_hash_functions values. + */ +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev); + +/* ena_com_fill_hash_function - Fill RSS hash function + * @ena_dev: ENA communication layer struct + * @func: The hash function (Toeplitz or crc) + * @key: Hash key (for toeplitz hash) + * @key_len: key length (max length 10 DW) + * @init_val: initial value for the hash function + * + * Fill the ena_dev resources with the desire hash function, hash key, key_len + * and key initial value (if needed by the hash function). + * To flush the key into the device the caller should call + * ena_com_set_hash_function. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions func, + const u8 *key, u16 key_len, u32 init_val); + +/* ena_com_set_hash_function - Flush the hash function and it dependencies to + * the device. + * @ena_dev: ENA communication layer struct + * + * Flush the hash function and it dependencies (key, key length and + * initial value) if needed. + * + * @note: Prior to this method the caller should call ena_com_fill_hash_function + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_hash_function(struct ena_com_dev *ena_dev); + +/* ena_com_get_hash_function - Retrieve the hash function from the device. + * @ena_dev: ENA communication layer struct + * @func: hash function + * + * Retrieve the hash function from the device. + * + * @note: If the caller called ena_com_fill_hash_function but didn't flush + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. 
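+ *
+ * A minimal configure-then-read-back sketch (illustrative only; 'key' is
+ * assumed to be an ENA_HASH_KEY_SIZE byte buffer and the initial value is
+ * an arbitrary example):
+ *
+ *	rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ,
+ *					key, ENA_HASH_KEY_SIZE, 0xFFFFFFFF);
+ *	if (!rc)
+ *		rc = ena_com_set_hash_function(ena_dev);
+ *	if (!rc)
+ *		rc = ena_com_get_hash_function(ena_dev, &func);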
+ */ +int ena_com_get_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions *func); + +/* ena_com_get_hash_key - Retrieve the hash key + * @ena_dev: ENA communication layer struct + * @key: hash key + * + * Retrieve the hash key. + * + * @note: If the caller called ena_com_fill_hash_key but didn't flush + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key); +/* ena_com_fill_hash_ctrl - Fill RSS hash control + * @ena_dev: ENA communication layer struct. + * @proto: The protocol to configure. + * @hash_fields: bit mask of ena_admin_flow_hash_fields + * + * Fill the ena_dev resources with the desire hash control (the ethernet + * fields that take part of the hash) for a specific protocol. + * To flush the hash control to the device, the caller should call + * ena_com_set_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 hash_fields); + +/* ena_com_set_hash_ctrl - Flush the hash control resources to the device. + * @ena_dev: ENA communication layer struct + * + * Flush the hash control (the ethernet fields that take part of the hash) + * + * @note: Prior to this method the caller should call ena_com_fill_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev); + +/* ena_com_get_hash_ctrl - Retrieve the hash control from the device. + * @ena_dev: ENA communication layer struct + * @proto: The protocol to retrieve. + * @fields: bit mask of ena_admin_flow_hash_fields. + * + * Retrieve the hash control from the device. + * + * @note: If the caller called ena_com_fill_hash_ctrl but didn't flush + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 *fields); + +/* ena_com_set_default_hash_ctrl - Set the hash control to a default + * configuration. + * @ena_dev: ENA communication layer struct + * + * Fill the ena_dev resources with the default hash control configuration. + * To flush the hash control to the device, the caller should call + * ena_com_set_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev); + +/* ena_com_indirect_table_fill_entry - Fill a single entry in the RSS + * indirection table + * @ena_dev: ENA communication layer struct. + * @entry_idx - indirection table entry. + * @entry_value - redirection value + * + * Fill a single entry of the RSS indirection table in the ena_dev resources. + * To flush the indirection table to the device, the called should call + * ena_com_indirect_table_set. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_indirect_table_fill_entry(struct ena_com_dev *ena_dev, + u16 entry_idx, u16 entry_value); + +/* ena_com_indirect_table_set - Flush the indirection table to the device. + * @ena_dev: ENA communication layer struct + * + * Flush the indirection hash control to the device. + * Prior to this method the caller should call ena_com_indirect_table_fill_entry + * + * @return: 0 on Success and negative value otherwise. 
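+ *
+ * A minimal usage sketch (illustrative only; 'nr_entries' is assumed to be
+ * 1 << the log size passed to ena_com_rss_init() and 'nr_queues' is the
+ * caller's Rx queue count):
+ *
+ *	for (i = 0; i < nr_entries; i++) {
+ *		rc = ena_com_indirect_table_fill_entry(ena_dev, i,
+ *						       i % nr_queues);
+ *		if (unlikely(rc))
+ *			return rc;
+ *	}
+ *	rc = ena_com_indirect_table_set(ena_dev);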
+ */ +int ena_com_indirect_table_set(struct ena_com_dev *ena_dev); + +/* ena_com_indirect_table_get - Retrieve the indirection table from the device. + * @ena_dev: ENA communication layer struct + * @ind_tbl: indirection table + * + * Retrieve the RSS indirection table from the device. + * + * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flush + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl); + +/* ena_com_allocate_host_info - Allocate host info resources. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_host_info(struct ena_com_dev *ena_dev); + +/* ena_com_allocate_debug_area - Allocate debug area. + * @ena_dev: ENA communication layer struct + * @debug_area_size - debug area size. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, + u32 debug_area_size); + +/* ena_com_allocate_customer_metrics_buffer - Allocate customer metrics resources. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev); + +/* ena_com_delete_debug_area - Free the debug area resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocated debug area. + */ +void ena_com_delete_debug_area(struct ena_com_dev *ena_dev); + +/* ena_com_delete_host_info - Free the host info resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocated host info. + */ +void ena_com_delete_host_info(struct ena_com_dev *ena_dev); + +/* ena_com_delete_customer_metrics_buffer - Free the customer metrics resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocated customer metrics area. + */ +void ena_com_delete_customer_metrics_buffer(struct ena_com_dev *ena_dev); + +/* ena_com_set_host_attributes - Update the device with the host + * attributes (debug area and host info) base address. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_host_attributes(struct ena_com_dev *ena_dev); + +/* ena_com_create_io_cq - Create io completion queue. + * @ena_dev: ENA communication layer struct + * @io_cq - io completion queue handler + + * Create IO completion queue. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_create_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq); + +/* ena_com_destroy_io_cq - Destroy io completion queue. + * @ena_dev: ENA communication layer struct + * @io_cq - io completion queue handler + + * Destroy IO completion queue. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq); + +/* ena_com_execute_admin_command - Execute admin command + * @admin_queue: admin queue. + * @cmd: the admin command to execute. + * @cmd_size: the command size. + * @cmd_completion: command completion return value. + * @cmd_comp_size: command completion size. + + * Submit an admin command and then wait until the device returns a + * completion. + * The completion will be copied into cmd_comp. + * + * @return - 0 on success, negative value on failure. 
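+ *
+ * Typical call pattern, mirroring the users in ena_com.c ('cmd' and 'resp'
+ * are set-feature descriptors prepared by the caller):
+ *
+ *	ret = ena_com_execute_admin_command(admin_queue,
+ *					    (struct ena_admin_aq_entry *)&cmd,
+ *					    sizeof(cmd),
+ *					    (struct ena_admin_acq_entry *)&resp,
+ *					    sizeof(resp));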
+ */ +int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size, + struct ena_admin_acq_entry *cmd_comp, + size_t cmd_comp_size); + +/* ena_com_init_interrupt_moderation - Init interrupt moderation + * @ena_dev: ENA communication layer struct + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev); + +/* ena_com_interrupt_moderation_supported - Return if interrupt moderation + * capability is supported by the device. + * + * @return - supported or not. + */ +bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev); + +/* ena_com_update_nonadaptive_moderation_interval_tx - Update the + * non-adaptive interval in Tx direction. + * @ena_dev: ENA communication layer struct + * @tx_coalesce_usecs: Interval in usec. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, + u32 tx_coalesce_usecs); + +/* ena_com_update_nonadaptive_moderation_interval_rx - Update the + * non-adaptive interval in Rx direction. + * @ena_dev: ENA communication layer struct + * @rx_coalesce_usecs: Interval in usec. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, + u32 rx_coalesce_usecs); + +/* ena_com_get_nonadaptive_moderation_interval_tx - Retrieve the + * non-adaptive interval in Tx direction. + * @ena_dev: ENA communication layer struct + * + * @return - interval in usec + */ +unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev); + +/* ena_com_get_nonadaptive_moderation_interval_rx - Retrieve the + * non-adaptive interval in Rx direction. + * @ena_dev: ENA communication layer struct + * + * @return - interval in usec + */ +unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev); + +/* ena_com_config_dev_mode - Configure the placement policy of the device. + * @ena_dev: ENA communication layer struct + * @llq_features: LLQ feature descriptor, retrieve via + * ena_com_get_dev_attr_feat. + * @ena_llq_config: The default driver LLQ parameters configurations + */ +int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_config); + +/* ena_com_io_sq_to_ena_dev - Extract ena_com_dev using contained field io_sq. + * @io_sq: IO submit queue struct + * + * @return - ena_com_dev struct extracted from io_sq + */ +static inline struct ena_com_dev *ena_com_io_sq_to_ena_dev(struct ena_com_io_sq *io_sq) +{ + return container_of(io_sq, struct ena_com_dev, io_sq_queues[io_sq->qid]); +} + +/* ena_com_io_cq_to_ena_dev - Extract ena_com_dev using contained field io_cq. 
+ * @io_sq: IO submit queue struct + * + * @return - ena_com_dev struct extracted from io_sq + */ +static inline struct ena_com_dev *ena_com_io_cq_to_ena_dev(struct ena_com_io_cq *io_cq) +{ + return container_of(io_cq, struct ena_com_dev, io_cq_queues[io_cq->qid]); +} + +static inline bool ena_com_get_adaptive_moderation_enabled(struct ena_com_dev *ena_dev) +{ + return ena_dev->adaptive_coalescing; +} + +static inline void ena_com_enable_adaptive_moderation(struct ena_com_dev *ena_dev) +{ + ena_dev->adaptive_coalescing = true; +} + +static inline void ena_com_disable_adaptive_moderation(struct ena_com_dev *ena_dev) +{ + ena_dev->adaptive_coalescing = false; +} + +/* ena_com_get_cap - query whether device supports a capability. + * @ena_dev: ENA communication layer struct + * @cap_id: enum value representing the capability + * + * @return - true if capability is supported or false otherwise + */ +static inline bool ena_com_get_cap(struct ena_com_dev *ena_dev, + enum ena_admin_aq_caps_id cap_id) +{ + return !!(ena_dev->capabilities & BIT(cap_id)); +} + +/* ena_com_get_customer_metric_support - query whether device supports a given customer metric. + * @ena_dev: ENA communication layer struct + * @metric_id: enum value representing the customer metric + * + * @return - true if customer metric is supported or false otherwise + */ +static inline bool ena_com_get_customer_metric_support(struct ena_com_dev *ena_dev, + enum ena_admin_customer_metrics_id metric_id) +{ + return !!(ena_dev->customer_metrics.supported_metrics & BIT(metric_id)); +} + +/* ena_com_get_customer_metric_count - return the number of supported customer metrics. + * @ena_dev: ENA communication layer struct + * + * @return - the number of supported customer metrics + */ +static inline int ena_com_get_customer_metric_count(struct ena_com_dev *ena_dev) +{ + return hweight64(ena_dev->customer_metrics.supported_metrics); +} + +/* ena_com_update_intr_reg - Prepare interrupt register + * @intr_reg: interrupt register to update. + * @rx_delay_interval: Rx interval in usecs + * @tx_delay_interval: Tx interval in usecs + * @unmask: unmask enable/disable + * @no_moderation_update: 0 - Indicates that any of the TX/RX intervals was + * updated, 1 - otherwise + * + * Prepare interrupt update register with the supplied parameters. 
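+ *
+ * A minimal usage sketch (illustrative only; the intervals are assumed to
+ * already be divided by the device's intr_delay_resolution, and writing the
+ * prepared value to the queue's unmask register is left to the caller):
+ *
+ *	struct ena_eth_io_intr_reg intr_reg;
+ *
+ *	ena_com_update_intr_reg(&intr_reg, rx_interval, tx_interval,
+ *				true, false);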
+ */ +static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, + u32 rx_delay_interval, + u32 tx_delay_interval, + bool unmask, + bool no_moderation_update) +{ + intr_reg->intr_control = 0; + intr_reg->intr_control |= rx_delay_interval & + ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK; + + intr_reg->intr_control |= + (tx_delay_interval << ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT) + & ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK; + + if (unmask) + intr_reg->intr_control |= ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK; + + intr_reg->intr_control |= + (((u32)no_moderation_update) << ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_SHIFT) & + ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_MASK; +} + +static inline u8 *ena_com_get_next_bounce_buffer(struct ena_com_io_bounce_buffer_control *bounce_buf_ctrl) +{ + u16 size, buffers_num; + u8 *buf; + + size = bounce_buf_ctrl->buffer_size; + buffers_num = bounce_buf_ctrl->buffers_num; + + buf = bounce_buf_ctrl->base_buffer + + (bounce_buf_ctrl->next_to_use++ & (buffers_num - 1)) * size; + + prefetchw(bounce_buf_ctrl->base_buffer + + (bounce_buf_ctrl->next_to_use & (buffers_num - 1)) * size); + + return buf; +} + +#endif /* !(ENA_COM) */ diff --git a/drivers/amazon/net/ena/ena_common_defs.h b/drivers/amazon/net/ena/ena_common_defs.h new file mode 100755 index 0000000000000..e210c8a81fc0e --- /dev/null +++ b/drivers/amazon/net/ena/ena_common_defs.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ +#ifndef _ENA_COMMON_H_ +#define _ENA_COMMON_H_ + +#define ENA_COMMON_SPEC_VERSION_MAJOR 2 +#define ENA_COMMON_SPEC_VERSION_MINOR 0 + +/* ENA operates with 48-bit memory addresses. ena_mem_addr_t */ +struct ena_common_mem_addr { + u32 mem_addr_low; + + u16 mem_addr_high; + + /* MBZ */ + u16 reserved16; +}; + +#endif /* _ENA_COMMON_H_ */ diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c new file mode 100644 index 0000000000000..43ce1ae2cebaa --- /dev/null +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2023 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "linux/pci.h" + +#include "ena_devlink.h" +#ifdef ENA_DEVLINK_SUPPORT +#ifdef ENA_PHC_SUPPORT +#include "ena_phc.h" + +static int ena_devlink_phc_enable_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack); +#endif /* ENA_PHC_SUPPORT */ + +static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack); + +enum ena_devlink_param_id { + ENA_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, +#ifdef ENA_PHC_SUPPORT + ENA_DEVLINK_PARAM_ID_PHC_ENABLE, +#endif /* ENA_PHC_SUPPORT */ +}; + +static const struct devlink_param ena_devlink_params[] = { + DEVLINK_PARAM_DRIVER(ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + "large_llq_header", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, ena_devlink_llq_header_validate), +#ifdef ENA_PHC_SUPPORT + DEVLINK_PARAM_DRIVER(ENA_DEVLINK_PARAM_ID_PHC_ENABLE, + "phc_enable", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, ena_devlink_phc_enable_validate), + #endif /* ENA_PHC_SUPPORT */ +}; + +static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + bool value = val.vbool; + + if (!value) + return 0; + + if (adapter->ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + NL_SET_ERR_MSG_MOD(extack, "Instance doesn't support LLQ"); + return -EOPNOTSUPP; + } + + if (!adapter->large_llq_header_supported) { + NL_SET_ERR_MSG_MOD(extack, "Instance doesn't support large LLQ"); + return -EOPNOTSUPP; + } + + return 0; +} + +#ifdef ENA_PHC_SUPPORT +static int ena_devlink_phc_enable_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + + if (!val.vbool) + return 0; + + if (!ena_com_phc_supported(adapter->ena_dev)) { + NL_SET_ERR_MSG_MOD(extack, "Device doesn't support PHC"); + return -EOPNOTSUPP; + } + + return 0; +} + +#endif /* ENA_PHC_SUPPORT */ +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER +/* Determines if ena_devlink_register has been called. + * Prefer to check if the driver enabled reloading capabilities, but fallback + * to check if driver configured 'dev' devlink attribute for older kernels. 
+ */ +bool ena_is_devlink_params_registered(struct devlink *devlink) +{ +#if defined(ENA_DEVLINK_RELOAD_ENABLING_REQUIRED) + return devlink->reload_enabled; +#elif !defined(ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC) + return devlink->dev; +#endif +} + +#endif +void ena_devlink_params_get(struct devlink *devlink) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + union devlink_param_value val; + int err; + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + /* If devlink params aren't registered, don't access them */ + if (!ena_is_devlink_params_registered(devlink)) + return; +#endif + err = devl_param_driverinit_value_get(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + &val); + if (err) { + netdev_err(adapter->netdev, "Failed to query LLQ header size param\n"); + return; + } + + adapter->large_llq_header_enabled = val.vbool; +#ifdef ENA_PHC_SUPPORT + + err = devl_param_driverinit_value_get(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, &val); + if (err) { + netdev_err(adapter->netdev, "Failed to query PHC param\n"); + return; + } + + ena_phc_enable(adapter, val.vbool); +#endif /* ENA_PHC_SUPPORT */ +} + +void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) +{ + union devlink_param_value value; + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + /* If devlink params aren't registered, don't access them */ + if (!ena_is_devlink_params_registered(devlink)) + return; + +#endif + value.vbool = false; + devl_param_driverinit_value_set(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + value); +} + +#ifdef ENA_PHC_SUPPORT +void ena_devlink_disable_phc_param(struct devlink *devlink) +{ + union devlink_param_value value; + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + /* If devlink params aren't registered, don't access them */ + if (!ena_is_devlink_params_registered(devlink)) + return; + +#endif + value.vbool = false; + devl_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); +} + +#endif /* ENA_PHC_SUPPORT */ +static int ena_devlink_reload_down(struct devlink *devlink, +#ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT + bool netns_change, +#endif +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + enum devlink_reload_action action, + enum devlink_reload_limit limit, +#endif + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + +#ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT + if (netns_change) { + NL_SET_ERR_MSG_MOD(extack, "Namespace change is not supported"); + return -EOPNOTSUPP; + } + +#endif +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { + NL_SET_ERR_MSG_MOD(extack, "Action is not supported"); + return -EOPNOTSUPP; + } + + if (limit != DEVLINK_RELOAD_LIMIT_UNSPEC) { + NL_SET_ERR_MSG_MOD(extack, "Driver reload doesn't support limitations"); + return -EOPNOTSUPP; + } + +#endif + rtnl_lock(); + ena_destroy_device(adapter, false); + rtnl_unlock(); + + return 0; +} + +static int ena_devlink_reload_up(struct devlink *devlink, +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, +#endif + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + int err = 0; + +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { + NL_SET_ERR_MSG_MOD(extack, "Action is not supported"); + return -EOPNOTSUPP; + } + + if (limit != DEVLINK_RELOAD_LIMIT_UNSPEC) { + NL_SET_ERR_MSG_MOD(extack, 
"Driver reload doesn't support limitations"); + return -EOPNOTSUPP; + } + +#endif + rtnl_lock(); + /* Check that no other routine initialized the device (e.g. + * ena_fw_reset_device()). Also we're under devlink_mutex here, + * so devlink isn't freed under our feet. + */ + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + err = ena_restore_device(adapter); + + rtnl_unlock(); + +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + if (!err) + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); + +#endif + return err; +} +#ifndef ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED + +static int ena_devlink_reload(struct devlink *devlink, struct netlink_ext_ack *extack) +{ + /* This function always succeeds when called from this function */ + ena_devlink_reload_down(devlink, extack); + + return ena_devlink_reload_up(devlink, extack); +} + +#endif + +static const struct devlink_ops ena_devlink_ops = { +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), +#endif +#ifdef ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED + .reload_down = ena_devlink_reload_down, + .reload_up = ena_devlink_reload_up, +#else + .reload = ena_devlink_reload, +#endif +}; + +static int ena_devlink_configure_params(struct devlink *devlink) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + union devlink_param_value value; + int rc; + + rc = devlink_params_register(devlink, ena_devlink_params, + ARRAY_SIZE(ena_devlink_params)); + if (rc) { + netdev_err(adapter->netdev, "Failed to register devlink params\n"); + return rc; + } + + value.vbool = adapter->large_llq_header_enabled; + devl_param_driverinit_value_set(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + value); + +#ifdef ENA_PHC_SUPPORT + value.vbool = ena_phc_is_enabled(adapter); + devl_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); + +#endif /* ENA_PHC_SUPPORT */ +#ifdef ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED + devlink_set_features(devlink, DEVLINK_F_RELOAD); + +#endif +#ifdef ENA_DEVLINK_PUBLISH_REQUIRED + devlink_params_publish(devlink); + +#endif +#ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED + devlink_reload_enable(devlink); + +#endif + return 0; +} + +struct devlink *ena_devlink_alloc(struct ena_adapter *adapter) +{ +#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC + struct device *dev = &adapter->pdev->dev; +#endif + struct devlink *devlink; + +#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC + devlink = devlink_alloc(&ena_devlink_ops, sizeof(struct ena_adapter *), dev); +#else + devlink = devlink_alloc(&ena_devlink_ops, sizeof(struct ena_adapter *)); +#endif + if (!devlink) { + netdev_err(adapter->netdev, "Failed to allocate devlink struct\n"); + return NULL; + } + + ENA_DEVLINK_PRIV(devlink) = adapter; + adapter->devlink = devlink; + +#ifndef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + if (ena_devlink_configure_params(devlink)) + goto free_devlink; + + return devlink; +free_devlink: + devlink_free(devlink); + + return NULL; +#else + return devlink; +#endif +} + +static void ena_devlink_configure_params_clean(struct devlink *devlink) +{ +#ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED + devlink_reload_disable(devlink); + +#endif +#ifdef ENA_DEVLINK_PUBLISH_REQUIRED + devlink_params_unpublish(devlink); + +#endif + devlink_params_unregister(devlink, ena_devlink_params, + ARRAY_SIZE(ena_devlink_params)); +} + +void ena_devlink_free(struct devlink *devlink) +{ +#ifndef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + ena_devlink_configure_params_clean(devlink); + +#endif + 
devlink_free(devlink); +} + +void ena_devlink_register(struct devlink *devlink, struct device *dev) +{ +#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC + devlink_register(devlink); +#else + devlink_register(devlink, dev); +#endif +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + ena_devlink_configure_params(devlink); +#endif +} + +void ena_devlink_unregister(struct devlink *devlink) +{ +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + ena_devlink_configure_params_clean(devlink); +#endif + devlink_unregister(devlink); +} +#endif /* ENA_DEVLINK_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_devlink.h b/drivers/amazon/net/ena/ena_devlink.h new file mode 100644 index 0000000000000..85c05cba00bd1 --- /dev/null +++ b/drivers/amazon/net/ena/ena_devlink.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef DEVLINK_H +#define DEVLINK_H + +#include "ena_netdev.h" +#ifndef ENA_NO_DEVLINK_HEADERS +#include +#endif + +#ifdef ENA_DEVLINK_SUPPORT +#define ENA_DEVLINK_PRIV(devlink) \ + (*(struct ena_adapter **)devlink_priv(devlink)) + +struct devlink *ena_devlink_alloc(struct ena_adapter *adapter); +void ena_devlink_free(struct devlink *devlink); +void ena_devlink_register(struct devlink *devlink, struct device *dev); +void ena_devlink_unregister(struct devlink *devlink); +void ena_devlink_params_get(struct devlink *devlink); +void ena_devlink_disable_large_llq_header_param(struct devlink *devlink); +void ena_devlink_disable_phc_param(struct devlink *devlink); + +#else /* ENA_DEVLINK_SUPPORT */ +#ifdef ENA_NO_DEVLINK_HEADERS +struct devlink {}; +#endif + +/* Return a value of 1 so the caller wouldn't think the function failed (returned NULL) */ +static inline struct devlink *ena_devlink_alloc(struct ena_adapter *adapter) +{ + return (struct devlink *)1; +} +static inline void ena_devlink_free(struct devlink *devlink) { } +static inline void ena_devlink_register(struct devlink *devlink, struct device *dev) { }; +static inline void ena_devlink_unregister(struct devlink *devlink) { } +static inline void ena_devlink_params_get(struct devlink *devlink) { } +static inline void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) { } +static inline void ena_devlink_disable_phc_param(struct devlink *devlink) { } + +#endif /* ENA_DEVLINK_SUPPORT */ +#endif /* DEVLINK_H */ diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c new file mode 100644 index 0000000000000..50afe66efb57a --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "ena_eth_com.h" + +static struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( + struct ena_com_io_cq *io_cq) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + u16 expected_phase, head_masked; + u16 desc_phase; + + head_masked = io_cq->head & (io_cq->q_depth - 1); + expected_phase = io_cq->phase; + + cdesc = (struct ena_eth_io_rx_cdesc_base *)(io_cq->cdesc_addr.virt_addr + + (head_masked * io_cq->cdesc_entry_size_in_bytes)); + + desc_phase = (READ_ONCE(cdesc->status) & + ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT; + + if (desc_phase != expected_phase) + return NULL; + + /* Make sure we read the rest of the descriptor after the phase bit + * has been read + */ + dma_rmb(); + + return cdesc; +} + +static void *get_sq_desc_regular_queue(struct ena_com_io_sq *io_sq) +{ + u16 tail_masked; + u32 offset; + + tail_masked = io_sq->tail & (io_sq->q_depth - 1); + + offset = tail_masked * io_sq->desc_entry_size; + + return (void *)((uintptr_t)io_sq->desc_addr.virt_addr + offset); +} + +static int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, + u8 *bounce_buffer) +{ + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + + u16 dst_tail_mask; + u32 dst_offset; + + dst_tail_mask = io_sq->tail & (io_sq->q_depth - 1); + dst_offset = dst_tail_mask * llq_info->desc_list_entry_size; + + if (is_llq_max_tx_burst_exists(io_sq)) { + if (unlikely(!io_sq->entries_in_tx_burst_left)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Error: trying to send more packets than tx burst allows\n"); + return -ENOSPC; + } + + io_sq->entries_in_tx_burst_left--; + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Decreasing entries_in_tx_burst_left of queue %d to %d\n", + io_sq->qid, io_sq->entries_in_tx_burst_left); + } + + /* Make sure everything was written into the bounce buffer before + * writing the bounce buffer to the device + */ + wmb(); + + /* The line is completed. 
Copy it to dev */ + __iowrite64_copy(io_sq->desc_addr.pbuf_dev_addr + dst_offset, + bounce_buffer, (llq_info->desc_list_entry_size) / 8); + + io_sq->tail++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) + io_sq->phase ^= 1; + + return 0; +} + +static int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, + u8 *header_src, + u16 header_len) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + u8 *bounce_buffer = pkt_ctrl->curr_bounce_buf; + u16 header_offset; + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)) + return 0; + + header_offset = + llq_info->descs_num_before_header * io_sq->desc_entry_size; + + if (unlikely((header_offset + header_len) > + llq_info->desc_list_entry_size)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Trying to write header larger than llq entry can accommodate\n"); + return -EFAULT; + } + + if (unlikely(!bounce_buffer)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Bounce buffer is NULL\n"); + return -EFAULT; + } + + memcpy(bounce_buffer + header_offset, header_src, header_len); + + return 0; +} + +static void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + u8 *bounce_buffer; + void *sq_desc; + + bounce_buffer = pkt_ctrl->curr_bounce_buf; + + if (unlikely(!bounce_buffer)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Bounce buffer is NULL\n"); + return NULL; + } + + sq_desc = bounce_buffer + pkt_ctrl->idx * io_sq->desc_entry_size; + pkt_ctrl->idx++; + pkt_ctrl->descs_left_in_line--; + + return sq_desc; +} + +static int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + int rc; + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)) + return 0; + + /* bounce buffer was used, so write it and get a new one */ + if (likely(pkt_ctrl->idx)) { + rc = ena_com_write_bounce_buffer_to_dev(io_sq, + pkt_ctrl->curr_bounce_buf); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write bounce buffer to device\n"); + return rc; + } + + pkt_ctrl->curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); + } + + pkt_ctrl->idx = 0; + pkt_ctrl->descs_left_in_line = llq_info->descs_num_before_header; + return 0; +} + +static void *get_sq_desc(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return get_sq_desc_llq(io_sq); + + return get_sq_desc_regular_queue(io_sq); +} + +static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + int rc; + + if (!pkt_ctrl->descs_left_in_line) { + rc = ena_com_write_bounce_buffer_to_dev(io_sq, + pkt_ctrl->curr_bounce_buf); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write bounce buffer to device\n"); + return rc; + } + + pkt_ctrl->curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); + + pkt_ctrl->idx = 0; + if (unlikely(llq_info->desc_stride_ctrl 
== ENA_ADMIN_SINGLE_DESC_PER_ENTRY)) + pkt_ctrl->descs_left_in_line = 1; + else + pkt_ctrl->descs_left_in_line = + llq_info->desc_list_entry_size / io_sq->desc_entry_size; + } + + return 0; +} + +static int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return ena_com_sq_update_llq_tail(io_sq); + + io_sq->tail++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) + io_sq->phase ^= 1; + + return 0; +} + +static struct ena_eth_io_rx_cdesc_base * + ena_com_rx_cdesc_idx_to_ptr(struct ena_com_io_cq *io_cq, u16 idx) +{ + idx &= (io_cq->q_depth - 1); + return (struct ena_eth_io_rx_cdesc_base *) + ((uintptr_t)io_cq->cdesc_addr.virt_addr + + idx * io_cq->cdesc_entry_size_in_bytes); +} + +static int ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, + u16 *first_cdesc_idx, + u16 *num_descs) +{ + u16 count = io_cq->cur_rx_pkt_cdesc_count, head_masked; + struct ena_eth_io_rx_cdesc_base *cdesc; + u32 last = 0; + + do { + u32 status; + + cdesc = ena_com_get_next_rx_cdesc(io_cq); + if (!cdesc) + break; + status = READ_ONCE(cdesc->status); + + ena_com_cq_inc_head(io_cq); + if (unlikely((status & ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT && count != 0)) { + struct ena_com_dev *dev = ena_com_io_cq_to_ena_dev(io_cq); + + netdev_err(dev->net_device, + "First bit is on in descriptor #%d on q_id: %d, req_id: %u\n", + count, io_cq->qid, cdesc->req_id); + return -EFAULT; + } + count++; + last = (status & ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; + } while (!last); + + if (last) { + *first_cdesc_idx = io_cq->cur_rx_pkt_cdesc_start_idx; + + head_masked = io_cq->head & (io_cq->q_depth - 1); + + *num_descs = count; + io_cq->cur_rx_pkt_cdesc_count = 0; + io_cq->cur_rx_pkt_cdesc_start_idx = head_masked; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "ENA q_id: %d packets were completed. 
first desc idx %u descs# %d\n", + io_cq->qid, *first_cdesc_idx, count); + } else { + io_cq->cur_rx_pkt_cdesc_count = count; + *num_descs = 0; + } + + return 0; +} + +static int ena_com_create_meta(struct ena_com_io_sq *io_sq, + struct ena_com_tx_meta *ena_meta) +{ + struct ena_eth_io_tx_meta_desc *meta_desc = NULL; + + meta_desc = get_sq_desc(io_sq); + if (unlikely(!meta_desc)) + return -EFAULT; + + memset(meta_desc, 0x0, sizeof(struct ena_eth_io_tx_meta_desc)); + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_DESC_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK; + + /* bits 0-9 of the mss */ + meta_desc->word2 |= ((u32)ena_meta->mss << + ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT) & + ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK; + /* bits 10-13 of the mss */ + meta_desc->len_ctrl |= ((ena_meta->mss >> 10) << + ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT) & + ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK; + + /* Extended meta desc */ + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK; + meta_desc->len_ctrl |= ((u32)io_sq->phase << + ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_META_DESC_PHASE_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_FIRST_MASK; + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_STORE_MASK; + + meta_desc->word2 |= ena_meta->l3_hdr_len & + ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK; + meta_desc->word2 |= (ena_meta->l3_hdr_offset << + ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT) & + ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK; + + meta_desc->word2 |= ((u32)ena_meta->l4_hdr_len << + ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT) & + ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK; + + return ena_com_sq_update_tail(io_sq); +} + +static int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + bool *have_meta) +{ + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; + + /* When disable meta caching is set, don't bother to save the meta and + * compare it to the stored version, just create the meta + */ + if (io_sq->disable_meta_caching) { + *have_meta = true; + return ena_com_create_meta(io_sq, ena_meta); + } + + if (ena_com_meta_desc_changed(io_sq, ena_tx_ctx)) { + *have_meta = true; + /* Cache the meta desc */ + memcpy(&io_sq->cached_tx_meta, ena_meta, + sizeof(struct ena_com_tx_meta)); + return ena_com_create_meta(io_sq, ena_meta); + } + + *have_meta = false; + return 0; +} + +static void ena_com_rx_set_flags(struct ena_com_io_cq *io_cq, + struct ena_com_rx_ctx *ena_rx_ctx, + struct ena_eth_io_rx_cdesc_base *cdesc) +{ + ena_rx_ctx->l3_proto = cdesc->status & + ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK; + ena_rx_ctx->l4_proto = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT; + ena_rx_ctx->l3_csum_err = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT); + ena_rx_ctx->l4_csum_err = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT); + ena_rx_ctx->l4_csum_checked = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT); + ena_rx_ctx->hash = cdesc->hash; + ena_rx_ctx->frag = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "l3_proto %d l4_proto %d l3_csum_err %d l4_csum_err %d hash %d frag %d cdesc_status 
%x\n", + ena_rx_ctx->l3_proto, ena_rx_ctx->l4_proto, + ena_rx_ctx->l3_csum_err, ena_rx_ctx->l4_csum_err, + ena_rx_ctx->hash, ena_rx_ctx->frag, cdesc->status); +} + +/*****************************************************************************/ +/***************************** API **********************************/ +/*****************************************************************************/ + +int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + int *nb_hw_desc) +{ + struct ena_eth_io_tx_desc *desc = NULL; + struct ena_com_buf *ena_bufs = ena_tx_ctx->ena_bufs; + void *buffer_to_push = ena_tx_ctx->push_header; + u16 header_len = ena_tx_ctx->header_len; + u16 num_bufs = ena_tx_ctx->num_bufs; + u16 start_tail = io_sq->tail; + int i, rc; + bool have_meta; + u64 addr_hi; + + WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_TX, "wrong Q type"); + + /* num_bufs +1 for potential meta desc */ + if (unlikely(!ena_com_sq_have_enough_space(io_sq, num_bufs + 1))) { + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Not enough space in the tx queue\n"); + return -ENOMEM; + } + + if (unlikely(header_len > io_sq->tx_max_header_size)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Header size is too large %d max header: %d\n", + header_len, io_sq->tx_max_header_size); + return -EINVAL; + } + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && + !buffer_to_push)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Push header wasn't provided in LLQ mode\n"); + return -EINVAL; + } + + rc = ena_com_write_header_to_bounce(io_sq, buffer_to_push, header_len); + if (unlikely(rc)) + return rc; + + rc = ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx, &have_meta); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to create and store tx meta desc\n"); + return rc; + } + + /* If the caller doesn't want to send packets */ + if (unlikely(!num_bufs && !header_len)) { + rc = ena_com_close_bounce_buffer(io_sq); + if (rc) + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write buffers to LLQ\n"); + *nb_hw_desc = io_sq->tail - start_tail; + return rc; + } + + desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); + + /* Set first desc when we don't have meta descriptor */ + if (!have_meta) + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_FIRST_MASK; + + desc->buff_addr_hi_hdr_sz |= ((u32)header_len << + ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT) & + ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK; + desc->len_ctrl |= ((u32)io_sq->phase << ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_DESC_PHASE_MASK; + + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_COMP_REQ_MASK; + + /* Bits 0-9 */ + desc->meta_ctrl |= ((u32)ena_tx_ctx->req_id << + ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT) & + ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK; + + desc->meta_ctrl |= (ena_tx_ctx->df << + ENA_ETH_IO_TX_DESC_DF_SHIFT) & + ENA_ETH_IO_TX_DESC_DF_MASK; + + /* Bits 10-15 */ + desc->len_ctrl |= ((ena_tx_ctx->req_id >> 10) << + ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT) & + ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK; + + if (ena_tx_ctx->meta_valid) { + desc->meta_ctrl |= (ena_tx_ctx->tso_enable << + ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_TSO_EN_MASK; + desc->meta_ctrl |= ena_tx_ctx->l3_proto & + ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_proto << + ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK; 
+ desc->meta_ctrl |= (ena_tx_ctx->l3_csum_enable << + ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_csum_enable << + ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_csum_partial << + ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK; + } + + for (i = 0; i < num_bufs; i++) { + /* The first desc share the same desc as the header */ + if (likely(i != 0)) { + rc = ena_com_sq_update_tail(io_sq); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to update sq tail\n"); + return rc; + } + + desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); + + desc->len_ctrl |= ((u32)io_sq->phase << + ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_DESC_PHASE_MASK; + } + + desc->len_ctrl |= ena_bufs->len & + ENA_ETH_IO_TX_DESC_LENGTH_MASK; + + addr_hi = ((ena_bufs->paddr & + GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); + + desc->buff_addr_lo = (u32)ena_bufs->paddr; + desc->buff_addr_hi_hdr_sz |= addr_hi & + ENA_ETH_IO_TX_DESC_ADDR_HI_MASK; + ena_bufs++; + } + + /* set the last desc indicator */ + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_LAST_MASK; + + rc = ena_com_sq_update_tail(io_sq); + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to update sq tail of the last descriptor\n"); + return rc; + } + + rc = ena_com_close_bounce_buffer(io_sq); + + *nb_hw_desc = io_sq->tail - start_tail; + return rc; +} + +int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, + struct ena_com_io_sq *io_sq, + struct ena_com_rx_ctx *ena_rx_ctx) +{ + struct ena_com_rx_buf_info *ena_buf = &ena_rx_ctx->ena_bufs[0]; + struct ena_eth_io_rx_cdesc_base *cdesc = NULL; + u16 q_depth = io_cq->q_depth; + u16 cdesc_idx = 0; + u16 nb_hw_desc; + u16 i = 0; + int rc; + + WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); + + rc = ena_com_cdesc_rx_pkt_get(io_cq, &cdesc_idx, &nb_hw_desc); + if (unlikely(rc != 0)) + return -EFAULT; + + if (nb_hw_desc == 0) { + ena_rx_ctx->descs = nb_hw_desc; + return 0; + } + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Fetch rx packet: queue %d completed desc: %d\n", io_cq->qid, + nb_hw_desc); + + if (unlikely(nb_hw_desc > ena_rx_ctx->max_bufs)) { + netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Too many RX cdescs (%d) > MAX(%d)\n", nb_hw_desc, + ena_rx_ctx->max_bufs); + return -ENOSPC; + } + + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx); + ena_rx_ctx->pkt_offset = cdesc->offset; + + do { + ena_buf[i].len = cdesc->length; + ena_buf[i].req_id = cdesc->req_id; + if (unlikely(ena_buf[i].req_id >= q_depth)) + return -EIO; + + if (++i >= nb_hw_desc) + break; + + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i); + + } while (1); + + /* Update SQ head ptr */ + io_sq->next_to_comp += nb_hw_desc; + + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "[%s][QID#%d] Updating SQ head to: %d\n", __func__, + io_sq->qid, io_sq->next_to_comp); + + /* Get rx flags from the last pkt */ + ena_com_rx_set_flags(io_cq, ena_rx_ctx, cdesc); + + ena_rx_ctx->descs = nb_hw_desc; + + return 0; +} + +int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, + struct ena_com_buf *ena_buf, + u16 req_id) +{ + struct ena_eth_io_rx_desc *desc; + + WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); + + if 
(unlikely(!ena_com_sq_have_enough_space(io_sq, 1))) + return -ENOSPC; + + desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + + memset(desc, 0x0, sizeof(struct ena_eth_io_rx_desc)); + + desc->length = ena_buf->len; + + desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK | + ENA_ETH_IO_RX_DESC_LAST_MASK | + ENA_ETH_IO_RX_DESC_COMP_REQ_MASK | + (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK); + + desc->req_id = req_id; + + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "[%s] Adding single RX desc, Queue: %u, req_id: %u\n", + __func__, io_sq->qid, req_id); + + desc->buff_addr_lo = (u32)ena_buf->paddr; + desc->buff_addr_hi = + ((ena_buf->paddr & GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); + + return ena_com_sq_update_tail(io_sq); +} + +bool ena_com_cq_empty(struct ena_com_io_cq *io_cq) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + + cdesc = ena_com_get_next_rx_cdesc(io_cq); + if (cdesc) + return false; + else + return true; +} diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h new file mode 100644 index 0000000000000..028270a069d86 --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_ETH_COM_H_ +#define ENA_ETH_COM_H_ + +#include "ena_com.h" + +struct ena_com_tx_ctx { + struct ena_com_tx_meta ena_meta; + struct ena_com_buf *ena_bufs; + /* For LLQ, header buffer - pushed to the device mem space */ + void *push_header; + + enum ena_eth_io_l3_proto_index l3_proto; + enum ena_eth_io_l4_proto_index l4_proto; + u16 num_bufs; + u16 req_id; + /* For regular queue, indicate the size of the header + * For LLQ, indicate the size of the pushed buffer + */ + u16 header_len; + + u8 meta_valid; + u8 tso_enable; + u8 l3_csum_enable; + u8 l4_csum_enable; + u8 l4_csum_partial; + u8 df; /* Don't fragment */ +}; + +struct ena_com_rx_ctx { + struct ena_com_rx_buf_info *ena_bufs; + enum ena_eth_io_l3_proto_index l3_proto; + enum ena_eth_io_l4_proto_index l4_proto; + bool l3_csum_err; + bool l4_csum_err; + u8 l4_csum_checked; + /* fragmented packet */ + bool frag; + u32 hash; + u16 descs; + u16 max_bufs; + u8 pkt_offset; +}; + +int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + int *nb_hw_desc); + +int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, + struct ena_com_io_sq *io_sq, + struct ena_com_rx_ctx *ena_rx_ctx); + +int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, + struct ena_com_buf *ena_buf, + u16 req_id); + +bool ena_com_cq_empty(struct ena_com_io_cq *io_cq); + +static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq, + struct ena_eth_io_intr_reg *intr_reg) +{ + writel(intr_reg->intr_control, io_cq->unmask_reg); +} + +static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq) +{ + u16 tail, next_to_comp, cnt; + + next_to_comp = io_sq->next_to_comp; + tail = io_sq->tail; + cnt = tail - next_to_comp; + + return io_sq->q_depth - 1 - cnt; +} + +/* Check if the submission queue has enough space to hold required_buffers */ +static inline bool ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq, + u16 required_buffers) +{ + int temp; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return ena_com_free_q_entries(io_sq) >= required_buffers; + + /* This calculation doesn't need to be 100% accurate. 
So to reduce + * the calculation overhead just Subtract 2 lines from the free descs + * (one for the header line and one to compensate the devision + * down calculation. + */ + temp = required_buffers / io_sq->llq_info.descs_per_entry + 2; + + return ena_com_free_q_entries(io_sq) > temp; +} + +static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + if (!ena_tx_ctx->meta_valid) + return false; + + return !!memcmp(&io_sq->cached_tx_meta, + &ena_tx_ctx->ena_meta, + sizeof(struct ena_com_tx_meta)); +} + +static inline bool is_llq_max_tx_burst_exists(struct ena_com_io_sq *io_sq) +{ + return (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) && + io_sq->llq_info.max_entries_in_tx_burst > 0; +} + +static inline bool ena_com_is_doorbell_needed(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + struct ena_com_llq_info *llq_info; + int descs_after_first_entry; + int num_entries_needed = 1; + u16 num_descs; + + if (!is_llq_max_tx_burst_exists(io_sq)) + return false; + + llq_info = &io_sq->llq_info; + num_descs = ena_tx_ctx->num_bufs; + + if (llq_info->disable_meta_caching || + unlikely(ena_com_meta_desc_changed(io_sq, ena_tx_ctx))) + ++num_descs; + + if (num_descs > llq_info->descs_num_before_header) { + descs_after_first_entry = num_descs - llq_info->descs_num_before_header; + num_entries_needed += DIV_ROUND_UP(descs_after_first_entry, + llq_info->descs_per_entry); + } + + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Queue: %d num_descs: %d num_entries_needed: %d\n", + io_sq->qid, num_descs, num_entries_needed); + + return num_entries_needed > io_sq->entries_in_tx_burst_left; +} + +static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) +{ + u16 max_entries_in_tx_burst = io_sq->llq_info.max_entries_in_tx_burst; + u16 tail = io_sq->tail; + + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Write submission queue doorbell for queue: %d tail: %d\n", + io_sq->qid, tail); + + writel(tail, io_sq->db_addr); + + if (is_llq_max_tx_burst_exists(io_sq)) { + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Reset available entries in tx burst for queue %d to %d\n", + io_sq->qid, max_entries_in_tx_burst); + io_sq->entries_in_tx_burst_left = max_entries_in_tx_burst; + } + + return 0; +} + +static inline void ena_com_update_numa_node(struct ena_com_io_cq *io_cq, + u8 numa_node) +{ + struct ena_eth_io_numa_node_cfg_reg numa_cfg; + + if (!io_cq->numa_node_cfg_reg) + return; + + numa_cfg.numa_cfg = (numa_node & ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK) + | ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK; + + writel(numa_cfg.numa_cfg, io_cq->numa_node_cfg_reg); +} + +static inline void ena_com_comp_ack(struct ena_com_io_sq *io_sq, u16 elem) +{ + io_sq->next_to_comp += elem; +} + +static inline void ena_com_cq_inc_head(struct ena_com_io_cq *io_cq) +{ + io_cq->head++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_cq->head & (io_cq->q_depth - 1)) == 0)) + io_cq->phase ^= 1; +} + +static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, + u16 *req_id) +{ + u8 expected_phase, cdesc_phase; + struct ena_eth_io_tx_cdesc *cdesc; + u16 masked_head; + + masked_head = io_cq->head & (io_cq->q_depth - 1); + expected_phase = io_cq->phase; + + cdesc = (struct ena_eth_io_tx_cdesc *) + ((uintptr_t)io_cq->cdesc_addr.virt_addr + + (masked_head * io_cq->cdesc_entry_size_in_bytes)); + + /* When the current completion descriptor phase isn't the same as the + * expected, it 
mean that the device still didn't update + * this completion. + */ + cdesc_phase = READ_ONCE(cdesc->flags) & ENA_ETH_IO_TX_CDESC_PHASE_MASK; + if (cdesc_phase != expected_phase) + return -EAGAIN; + + dma_rmb(); + + *req_id = READ_ONCE(cdesc->req_id); + if (unlikely(*req_id >= io_cq->q_depth)) { + netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Invalid req id %d\n", cdesc->req_id); + return -EINVAL; + } + + ena_com_cq_inc_head(io_cq); + + return 0; +} + +#endif /* ENA_ETH_COM_H_ */ diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h new file mode 100755 index 0000000000000..a4d6d0ee0193c --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ +#ifndef _ENA_ETH_IO_H_ +#define _ENA_ETH_IO_H_ + +enum ena_eth_io_l3_proto_index { + ENA_ETH_IO_L3_PROTO_UNKNOWN = 0, + ENA_ETH_IO_L3_PROTO_IPV4 = 8, + ENA_ETH_IO_L3_PROTO_IPV6 = 11, + ENA_ETH_IO_L3_PROTO_FCOE = 21, + ENA_ETH_IO_L3_PROTO_ROCE = 22, +}; + +enum ena_eth_io_l4_proto_index { + ENA_ETH_IO_L4_PROTO_UNKNOWN = 0, + ENA_ETH_IO_L4_PROTO_TCP = 12, + ENA_ETH_IO_L4_PROTO_UDP = 13, + ENA_ETH_IO_L4_PROTO_ROUTEABLE_ROCE = 23, +}; + +struct ena_eth_io_tx_desc { + /* 15:0 : length - Buffer length in bytes, must + * include any packet trailers that the ENA supposed + * to update like End-to-End CRC, Authentication GMAC + * etc. This length must not include the + * 'Push_Buffer' length. This length must not include + * the 4-byte added in the end for 802.3 Ethernet FCS + * 21:16 : req_id_hi - Request ID[15:10] + * 22 : reserved22 - MBZ + * 23 : meta_desc - MBZ + * 24 : phase + * 25 : reserved1 - MBZ + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 28 : comp_req - Indicates whether completion + * should be posted, after packet is transmitted. + * Valid only for first descriptor + * 30:29 : reserved29 - MBZ + * 31 : reserved31 - MBZ + */ + u32 len_ctrl; + + /* 3:0 : l3_proto_idx - L3 protocol. This field + * required when l3_csum_en,l3_csum or tso_en are set. + * 4 : DF - IPv4 DF, must be 0 if packet is IPv4 and + * DF flags of the IPv4 header is 0. Otherwise must + * be set to 1 + * 6:5 : reserved5 + * 7 : tso_en - Enable TSO, For TCP only. + * 12:8 : l4_proto_idx - L4 protocol. This field need + * to be set when l4_csum_en or tso_en are set. + * 13 : l3_csum_en - enable IPv4 header checksum. + * 14 : l4_csum_en - enable TCP/UDP checksum. + * 15 : ethernet_fcs_dis - when set, the controller + * will not append the 802.3 Ethernet Frame Check + * Sequence to the packet + * 16 : reserved16 + * 17 : l4_csum_partial - L4 partial checksum. when + * set to 0, the ENA calculates the L4 checksum, + * where the Destination Address required for the + * TCP/UDP pseudo-header is taken from the actual + * packet L3 header. when set to 1, the ENA doesn't + * calculate the sum of the pseudo-header, instead, + * the checksum field of the L4 is used instead. When + * TSO enabled, the checksum of the pseudo-header + * must not include the tcp length field. L4 partial + * checksum should be used for IPv6 packet that + * contains Routing Headers. 
+ * 20:18 : reserved18 - MBZ + * 21 : reserved21 - MBZ + * 31:22 : req_id_lo - Request ID[9:0] + */ + u32 meta_ctrl; + + u32 buff_addr_lo; + + /* address high and header size + * 15:0 : addr_hi - Buffer Pointer[47:32] + * 23:16 : reserved16_w2 + * 31:24 : header_length - Header length. For Low + * Latency Queues, this fields indicates the number + * of bytes written to the headers' memory. For + * normal queues, if packet is TCP or UDP, and longer + * than max_header_size, then this field should be + * set to the sum of L4 header offset and L4 header + * size(without options), otherwise, this field + * should be set to 0. For both modes, this field + * must not exceed the max_header_size. + * max_header_size value is reported by the Max + * Queues Feature descriptor + */ + u32 buff_addr_hi_hdr_sz; +}; + +struct ena_eth_io_tx_meta_desc { + /* 9:0 : req_id_lo - Request ID[9:0] + * 11:10 : reserved10 - MBZ + * 12 : reserved12 - MBZ + * 13 : reserved13 - MBZ + * 14 : ext_valid - if set, offset fields in Word2 + * are valid Also MSS High in Word 0 and bits [31:24] + * in Word 3 + * 15 : reserved15 + * 19:16 : mss_hi + * 20 : eth_meta_type - 0: Tx Metadata Descriptor, 1: + * Extended Metadata Descriptor + * 21 : meta_store - Store extended metadata in queue + * cache + * 22 : reserved22 - MBZ + * 23 : meta_desc - MBO + * 24 : phase + * 25 : reserved25 - MBZ + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 28 : comp_req - Indicates whether completion + * should be posted, after packet is transmitted. + * Valid only for first descriptor + * 30:29 : reserved29 - MBZ + * 31 : reserved31 - MBZ + */ + u32 len_ctrl; + + /* 5:0 : req_id_hi + * 31:6 : reserved6 - MBZ + */ + u32 word1; + + /* 7:0 : l3_hdr_len + * 15:8 : l3_hdr_off + * 21:16 : l4_hdr_len_in_words - counts the L4 header + * length in words. there is an explicit assumption + * that L4 header appears right after L3 header and + * L4 offset is based on l3_hdr_off+l3_hdr_len + * 31:22 : mss_lo + */ + u32 word2; + + u32 reserved; +}; + +struct ena_eth_io_tx_cdesc { + /* Request ID[15:0] */ + u16 req_id; + + u8 status; + + /* flags + * 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 sub_qid; + + u16 sq_head_idx; +}; + +struct ena_eth_io_rx_desc { + /* In bytes. 0 means 64KB */ + u16 length; + + /* MBZ */ + u8 reserved2; + + /* 0 : phase + * 1 : reserved1 - MBZ + * 2 : first - Indicates first descriptor in + * transaction + * 3 : last - Indicates last descriptor in transaction + * 4 : comp_req + * 5 : reserved5 - MBO + * 7:6 : reserved6 - MBZ + */ + u8 ctrl; + + u16 req_id; + + /* MBZ */ + u16 reserved6; + + u32 buff_addr_lo; + + u16 buff_addr_hi; + + /* MBZ */ + u16 reserved16_w3; +}; + +/* 4-word format Note: all ethernet parsing information are valid only when + * last=1 + */ +struct ena_eth_io_rx_cdesc_base { + /* 4:0 : l3_proto_idx + * 6:5 : src_vlan_cnt + * 7 : reserved7 - MBZ + * 12:8 : l4_proto_idx + * 13 : l3_csum_err - when set, either the L3 + * checksum error detected, or, the controller didn't + * validate the checksum. This bit is valid only when + * l3_proto_idx indicates IPv4 packet + * 14 : l4_csum_err - when set, either the L4 + * checksum error detected, or, the controller didn't + * validate the checksum. This bit is valid only when + * l4_proto_idx indicates TCP/UDP packet, and, + * ipv4_frag is not set. This bit is valid only when + * l4_csum_checked below is set. 
+ * 15 : ipv4_frag - Indicates IPv4 fragmented packet + * 16 : l4_csum_checked - L4 checksum was verified + * (could be OK or error), when cleared the status of + * checksum is unknown + * 23:17 : reserved17 - MBZ + * 24 : phase + * 25 : l3_csum2 - second checksum engine result + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 29:28 : reserved28 + * 30 : buffer - 0: Metadata descriptor. 1: Buffer + * Descriptor was used + * 31 : reserved31 + */ + u32 status; + + u16 length; + + u16 req_id; + + /* 32-bit hash result */ + u32 hash; + + u16 sub_qid; + + u8 offset; + + u8 reserved; +}; + +/* 8-word format */ +struct ena_eth_io_rx_cdesc_ext { + struct ena_eth_io_rx_cdesc_base base; + + u32 buff_addr_lo; + + u16 buff_addr_hi; + + u16 reserved16; + + u32 reserved_w6; + + u32 reserved_w7; +}; + +struct ena_eth_io_intr_reg { + /* 14:0 : rx_intr_delay + * 29:15 : tx_intr_delay + * 30 : intr_unmask + * 31 : no_moderation_update - 0 - moderation + * updated, 1 - moderation not updated + */ + u32 intr_control; +}; + +struct ena_eth_io_numa_node_cfg_reg { + /* 7:0 : numa + * 30:8 : reserved + * 31 : enabled + */ + u32 numa_cfg; +}; + +/* tx_desc */ +#define ENA_ETH_IO_TX_DESC_LENGTH_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT 16 +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK GENMASK(3, 0) +#define ENA_ETH_IO_TX_DESC_DF_SHIFT 4 +#define ENA_ETH_IO_TX_DESC_DF_MASK BIT(4) +#define ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT 7 +#define ENA_ETH_IO_TX_DESC_TSO_EN_MASK BIT(7) +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT 13 +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK BIT(13) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT 14 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK BIT(14) +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_SHIFT 15 +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_MASK BIT(15) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT 17 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK BIT(17) +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT 22 +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK GENMASK(31, 22) +#define ENA_ETH_IO_TX_DESC_ADDR_HI_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK GENMASK(31, 24) + +/* tx_meta_desc */ +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_LO_MASK GENMASK(9, 0) +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_SHIFT 14 +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK BIT(14) +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK GENMASK(19, 16) +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_SHIFT 20 +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK BIT(20) +#define ENA_ETH_IO_TX_META_DESC_META_STORE_SHIFT 21 +#define ENA_ETH_IO_TX_META_DESC_META_STORE_MASK BIT(21) +#define ENA_ETH_IO_TX_META_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_META_DESC_META_DESC_MASK BIT(23) +#define 
ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_META_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_META_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_META_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_META_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_META_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_HI_MASK GENMASK(5, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK GENMASK(7, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT 8 +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK GENMASK(15, 8) +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT 22 +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK GENMASK(31, 22) + +/* tx_cdesc */ +#define ENA_ETH_IO_TX_CDESC_PHASE_MASK BIT(0) + +/* rx_desc */ +#define ENA_ETH_IO_RX_DESC_PHASE_MASK BIT(0) +#define ENA_ETH_IO_RX_DESC_FIRST_SHIFT 2 +#define ENA_ETH_IO_RX_DESC_FIRST_MASK BIT(2) +#define ENA_ETH_IO_RX_DESC_LAST_SHIFT 3 +#define ENA_ETH_IO_RX_DESC_LAST_MASK BIT(3) +#define ENA_ETH_IO_RX_DESC_COMP_REQ_SHIFT 4 +#define ENA_ETH_IO_RX_DESC_COMP_REQ_MASK BIT(4) + +/* rx_cdesc_base */ +#define ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK GENMASK(4, 0) +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_SHIFT 5 +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_MASK GENMASK(6, 5) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT 13 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK BIT(13) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT 14 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK BIT(14) +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT 15 +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK BIT(15) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT 16 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK BIT(16) +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT 24 +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK BIT(24) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_SHIFT 25 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_MASK BIT(25) +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT 26 +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK BIT(26) +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT 27 +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK BIT(27) +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_SHIFT 30 +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_MASK BIT(30) + +/* intr_reg */ +#define ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK GENMASK(14, 0) +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT 15 +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK GENMASK(29, 15) +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_SHIFT 30 +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK BIT(30) +#define ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_SHIFT 31 +#define ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_MASK BIT(31) + +/* numa_node_cfg_reg */ +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK GENMASK(7, 0) +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31 +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31) + +#endif /* _ENA_ETH_IO_H_ */ diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c new file mode 100644 index 0000000000000..f09801591d840 --- /dev/null +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -0,0 +1,1411 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 
Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include +#include +#include + +#include "ena_netdev.h" +#include "ena_xdp.h" +#include "ena_phc.h" + +struct ena_stats { + char name[ETH_GSTRING_LEN]; + int stat_offset; +}; + +struct ena_hw_metrics { + char name[ETH_GSTRING_LEN]; +}; + +#define ENA_STAT_ENA_COM_ADMIN_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_com_stats_admin, stat) / sizeof(u64) \ +} + +#define ENA_STAT_ENA_COM_PHC_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_com_stats_phc, stat) / sizeof(u64) \ +} + +#define ENA_STAT_ENTRY(stat, stat_type) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_stats_##stat_type, stat) / sizeof(u64) \ +} + +#define ENA_STAT_HW_ENTRY(stat, stat_type) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_admin_##stat_type, stat) / sizeof(u64) \ +} + +#define ENA_STAT_RX_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, rx) + +#define ENA_STAT_TX_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, tx) + +#define ENA_STAT_GLOBAL_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, dev) + +#define ENA_STAT_ENI_ENTRY(stat) \ + ENA_STAT_HW_ENTRY(stat, eni_stats) + +#define ENA_STAT_ENA_SRD_ENTRY(stat) \ + ENA_STAT_HW_ENTRY(stat, ena_srd_stats) + +#define ENA_STAT_ENA_SRD_MODE_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_admin_ena_srd_info, flags) / sizeof(u64) \ +} + +#define ENA_METRIC_ENI_ENTRY(stat) { \ + .name = #stat \ +} + +static const struct ena_stats ena_stats_global_strings[] = { + ENA_STAT_GLOBAL_ENTRY(tx_timeout), + ENA_STAT_GLOBAL_ENTRY(suspend), + ENA_STAT_GLOBAL_ENTRY(resume), + ENA_STAT_GLOBAL_ENTRY(wd_expired), + ENA_STAT_GLOBAL_ENTRY(interface_up), + ENA_STAT_GLOBAL_ENTRY(interface_down), + ENA_STAT_GLOBAL_ENTRY(admin_q_pause), + ENA_STAT_GLOBAL_ENTRY(reset_fail), +}; + +/* A partial list of hw stats. 
Used when admin command + * with type ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS is not supported + */ +static const struct ena_stats ena_stats_eni_strings[] = { + ENA_STAT_ENI_ENTRY(bw_in_allowance_exceeded), + ENA_STAT_ENI_ENTRY(bw_out_allowance_exceeded), + ENA_STAT_ENI_ENTRY(pps_allowance_exceeded), + ENA_STAT_ENI_ENTRY(conntrack_allowance_exceeded), + ENA_STAT_ENI_ENTRY(linklocal_allowance_exceeded), +}; + +static const struct ena_hw_metrics ena_hw_stats_strings[] = { + ENA_METRIC_ENI_ENTRY(bw_in_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(bw_out_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(pps_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(conntrack_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(linklocal_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(conntrack_allowance_available), +}; + +static const struct ena_stats ena_srd_info_strings[] = { + ENA_STAT_ENA_SRD_MODE_ENTRY(ena_srd_mode), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_tx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_eligible_tx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_rx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_resource_utilization) +}; + +static const struct ena_stats ena_stats_tx_strings[] = { + ENA_STAT_TX_ENTRY(cnt), + ENA_STAT_TX_ENTRY(bytes), + ENA_STAT_TX_ENTRY(queue_stop), + ENA_STAT_TX_ENTRY(queue_wakeup), + ENA_STAT_TX_ENTRY(dma_mapping_err), + ENA_STAT_TX_ENTRY(linearize), + ENA_STAT_TX_ENTRY(linearize_failed), + ENA_STAT_TX_ENTRY(napi_comp), + ENA_STAT_TX_ENTRY(tx_poll), + ENA_STAT_TX_ENTRY(doorbells), + ENA_STAT_TX_ENTRY(prepare_ctx_err), + ENA_STAT_TX_ENTRY(bad_req_id), + ENA_STAT_TX_ENTRY(llq_buffer_copy), + ENA_STAT_TX_ENTRY(missed_tx), + ENA_STAT_TX_ENTRY(unmask_interrupt), +#ifdef ENA_AF_XDP_SUPPORT + ENA_STAT_TX_ENTRY(xsk_need_wakeup_set), + ENA_STAT_TX_ENTRY(xsk_wakeup_request), +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +static const struct ena_stats ena_stats_rx_strings[] = { + ENA_STAT_RX_ENTRY(cnt), + ENA_STAT_RX_ENTRY(bytes), + ENA_STAT_RX_ENTRY(rx_copybreak_pkt), + ENA_STAT_RX_ENTRY(csum_good), + ENA_STAT_RX_ENTRY(refil_partial), + ENA_STAT_RX_ENTRY(csum_bad), + ENA_STAT_RX_ENTRY(page_alloc_fail), + ENA_STAT_RX_ENTRY(skb_alloc_fail), + ENA_STAT_RX_ENTRY(dma_mapping_err), + ENA_STAT_RX_ENTRY(bad_desc_num), +#ifdef ENA_BUSY_POLL_SUPPORT + ENA_STAT_RX_ENTRY(bp_yield), + ENA_STAT_RX_ENTRY(bp_missed), + ENA_STAT_RX_ENTRY(bp_cleaned), +#endif + ENA_STAT_RX_ENTRY(bad_req_id), + ENA_STAT_RX_ENTRY(empty_rx_ring), + ENA_STAT_RX_ENTRY(csum_unchecked), +#ifdef ENA_XDP_SUPPORT + ENA_STAT_RX_ENTRY(xdp_aborted), + ENA_STAT_RX_ENTRY(xdp_drop), + ENA_STAT_RX_ENTRY(xdp_pass), + ENA_STAT_RX_ENTRY(xdp_tx), + ENA_STAT_RX_ENTRY(xdp_invalid), + ENA_STAT_RX_ENTRY(xdp_redirect), +#endif + ENA_STAT_RX_ENTRY(lpc_warm_up), + ENA_STAT_RX_ENTRY(lpc_full), + ENA_STAT_RX_ENTRY(lpc_wrong_numa), +#ifdef ENA_AF_XDP_SUPPORT + ENA_STAT_RX_ENTRY(xsk_need_wakeup_set), + ENA_STAT_RX_ENTRY(zc_queue_pkt_copy), +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +static const struct ena_stats ena_stats_ena_com_admin_strings[] = { + ENA_STAT_ENA_COM_ADMIN_ENTRY(aborted_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(submitted_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(completed_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(out_of_space), + ENA_STAT_ENA_COM_ADMIN_ENTRY(no_completion), +}; + +static const struct ena_stats ena_stats_ena_com_phc_strings[] = { + ENA_STAT_ENA_COM_PHC_ENTRY(phc_cnt), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_exp), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_skp), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_err), +}; + +#define ENA_STATS_ARRAY_GLOBAL ARRAY_SIZE(ena_stats_global_strings) +#define 
ENA_STATS_ARRAY_TX ARRAY_SIZE(ena_stats_tx_strings) +#define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) +#define ENA_STATS_ARRAY_ENA_COM_ADMIN ARRAY_SIZE(ena_stats_ena_com_admin_strings) +#define ENA_STATS_ARRAY_ENA_COM_PHC ARRAY_SIZE(ena_stats_ena_com_phc_strings) +#define ENA_STATS_ARRAY_ENI ARRAY_SIZE(ena_stats_eni_strings) +#define ENA_STATS_ARRAY_ENA_SRD ARRAY_SIZE(ena_srd_info_strings) +#define ENA_METRICS_ARRAY_ENI ARRAY_SIZE(ena_hw_stats_strings) + +static const char ena_priv_flags_strings[][ETH_GSTRING_LEN] = { +#define ENA_PRIV_FLAGS_LPC BIT(0) + "local_page_cache", +}; + +#define ENA_PRIV_FLAGS_NR ARRAY_SIZE(ena_priv_flags_strings) + +static void ena_safe_update_stat(u64 *src, u64 *dst, + struct u64_stats_sync *syncp) +{ + unsigned int start; + + do { + start = ena_u64_stats_fetch_begin(syncp); + *(dst) = *src; + } while (ena_u64_stats_fetch_retry(syncp, start)); +} + + +static void ena_metrics_stats(struct ena_adapter *adapter, u64 **data) +{ + struct ena_com_dev *dev = adapter->ena_dev; + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) { + u32 supported_metrics_count; + int len; + + supported_metrics_count = ena_com_get_customer_metric_count(dev); + len = supported_metrics_count * sizeof(u64); + + /* Fill the data buffer, and advance its pointer */ + ena_com_get_customer_metrics(adapter->ena_dev, (char *)(*data), len); + (*data) += supported_metrics_count; + + } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); + /* Updating regardless of rc - once we told ethtool how many stats we have + * it will print that much stats. We can't leave holes in the stats + */ + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + + ptr = (u64 *)&adapter->eni_stats + + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + ena_com_get_ena_srd_info(adapter->ena_dev, &adapter->ena_srd_info); + /* Get ENA SRD mode */ + ptr = (u64 *)&adapter->ena_srd_info; + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + for (i = 1; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + /* Wrapped within an outer struct - need to accommodate an + * additional offset of the ENA SRD mode that was already processed + */ + ptr = (u64 *)&adapter->ena_srd_info + + ena_stats->stat_offset + 1; + + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + } + } +} + +static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + struct ena_ring *ring; + + u64 *ptr; + int i, j; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + /* Tx stats */ + ring = &adapter->tx_ring[i]; + + for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { + ena_stats = &ena_stats_tx_strings[j]; + + ptr = (u64 *)&ring->tx_stats + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } + /* XDP TX queues don't have a RX queue counterpart */ + if (!ENA_IS_XDP_INDEX(adapter, i)) { + /* Rx stats */ + ring = &adapter->rx_ring[i]; + + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + ptr = (u64 *)&ring->rx_stats + + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } + } + } +} + +static void ena_com_admin_queue_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct 
ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_ADMIN; i++) { + ena_stats = &ena_stats_ena_com_admin_strings[i]; + + ptr = (u64 *)&adapter->ena_dev->admin_queue.stats + + ena_stats->stat_offset; + + *(*data)++ = *ptr; + } +} + +static void ena_com_phc_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) { + ena_stats = &ena_stats_ena_com_phc_strings[i]; + ptr = (u64 *)&adapter->ena_dev->phc.stats + ena_stats->stat_offset; + *(*data)++ = *ptr; + } +} + +static void ena_get_stats(struct ena_adapter *adapter, + u64 *data, + bool hw_stats_needed) +{ + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { + ena_stats = &ena_stats_global_strings[i]; + + ptr = (u64 *)&adapter->dev_stats + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, data++, &adapter->syncp); + } + + if (hw_stats_needed) + ena_metrics_stats(adapter, &data); + + ena_queue_stats(adapter, &data); + ena_com_admin_queue_stats(adapter, &data); + + if (ena_phc_is_active(adapter)) + ena_com_phc_stats(adapter, &data); +} + +static void ena_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, + u64 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + ena_get_stats(adapter, data, true); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +static int ena_get_ts_info(struct net_device *netdev, struct ethtool_ts_info *info) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + info->phc_index = ena_phc_get_index(adapter); + + return 0; +} + +#endif +static int ena_get_sw_stats_count(struct ena_adapter *adapter) +{ + int count = adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + + adapter->xdp_num_queues * ENA_STATS_ARRAY_TX + + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM_ADMIN; + + if (ena_phc_is_active(adapter)) + count += ENA_STATS_ARRAY_ENA_COM_PHC; + + return count; +} + +static int ena_get_hw_stats_count(struct ena_adapter *adapter) +{ + struct ena_com_dev *dev = adapter->ena_dev; + int count = ENA_STATS_ARRAY_ENA_SRD * + ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO); + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) + count += ena_com_get_customer_metric_count(dev); + else if (ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)) + count += ENA_STATS_ARRAY_ENI; + + return count; +} + +int ena_get_sset_count(struct net_device *netdev, int sset) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + switch (sset) { + case ETH_SS_STATS: + return ena_get_sw_stats_count(adapter) + + ena_get_hw_stats_count(adapter); + case ETH_SS_PRIV_FLAGS: + return ENA_PRIV_FLAGS_NR; + } + + return -EOPNOTSUPP; +} + +static void ena_metrics_stats_strings(struct ena_adapter *adapter, u8 **data) +{ + struct ena_com_dev *dev = adapter->ena_dev; + const struct ena_hw_metrics *ena_metrics; + const struct ena_stats *ena_stats; + int i; + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) { + for (i = 0; i < ENA_METRICS_ARRAY_ENI; i++) { + if (ena_com_get_customer_metric_support(dev, i)) { + ena_metrics = &ena_hw_stats_strings[i]; + ethtool_sprintf(data, ena_metrics->name); + } + } + } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + 
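+			/* ethtool_sprintf() writes the stat name into the strings
+			 * buffer and advances *data to the next ETH_GSTRING_LEN slot
+			 */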
ethtool_sprintf(data, ena_stats->name); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + ethtool_sprintf(data, ena_stats->name); + } + } +} + +static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) +{ + const struct ena_stats *ena_stats; + bool is_xdp; + int i, j; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + is_xdp = ENA_IS_XDP_INDEX(adapter, i); + /* Tx stats */ + for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { + ena_stats = &ena_stats_tx_strings[j]; + + ethtool_sprintf(data, + "queue_%u_%s_%s", i, + is_xdp ? "xdp_tx" : "tx", + ena_stats->name); + } + + /* In XDP there isn't an RX queue counterpart */ + if (is_xdp) + continue; + + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + ethtool_sprintf(data, + "queue_%u_rx_%s", i, + ena_stats->name); + } + } +} + +static void ena_com_admin_strings(u8 **data) +{ + const struct ena_stats *ena_stats; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_ADMIN; i++) { + ena_stats = &ena_stats_ena_com_admin_strings[i]; + + ethtool_sprintf(data, + "ena_admin_q_%s", ena_stats->name); + } +} + +static void ena_com_phc_strings(u8 **data) +{ + const struct ena_stats *ena_stats; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) { + ena_stats = &ena_stats_ena_com_phc_strings[i]; + ethtool_sprintf(data, "%s", ena_stats->name); + } +} + +static void ena_get_strings(struct ena_adapter *adapter, + u8 *data, + bool hw_stats_needed) +{ + const struct ena_stats *ena_stats; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { + ena_stats = &ena_stats_global_strings[i]; + ethtool_sprintf(&data, ena_stats->name); + } + + if (hw_stats_needed) + ena_metrics_stats_strings(adapter, &data); + + ena_queue_strings(adapter, &data); + ena_com_admin_strings(&data); + + if (ena_phc_is_active(adapter)) + ena_com_phc_strings(&data); +} + +static void ena_get_ethtool_strings(struct net_device *netdev, + u32 sset, + u8 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + switch (sset) { + case ETH_SS_STATS: + ena_get_strings(adapter, data, true); + break; + case ETH_SS_PRIV_FLAGS: + memcpy(data, ena_priv_flags_strings, sizeof(ena_priv_flags_strings)); + break; + } +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) +static int ena_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct ena_admin_get_feature_link_desc *link; + struct ena_admin_get_feat_resp feat_resp; + int rc; + + rc = ena_com_get_link_params(ena_dev, &feat_resp); + if (rc) + return rc; + + link = &feat_resp.u.link; + link_ksettings->base.speed = link->speed; + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Autoneg); + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Autoneg); + } + + link_ksettings->base.autoneg = + (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) ? 
+ AUTONEG_ENABLE : AUTONEG_DISABLE; + + link_ksettings->base.duplex = DUPLEX_FULL; + + return 0; +} + +#else +static int ena_get_settings(struct net_device *netdev, + struct ethtool_cmd *ecmd) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct ena_admin_get_feature_link_desc *link; + struct ena_admin_get_feat_resp feat_resp; + int rc; + + rc = ena_com_get_link_params(ena_dev, &feat_resp); + if (rc) + return rc; + + link = &feat_resp.u.link; + + ethtool_cmd_speed_set(ecmd, link->speed); + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK) + ecmd->duplex = DUPLEX_FULL; + else + ecmd->duplex = DUPLEX_HALF; + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) + ecmd->autoneg = AUTONEG_ENABLE; + else + ecmd->autoneg = AUTONEG_DISABLE; + + return 0; +} + +#endif +static int ena_get_coalesce(struct net_device *net_dev, +#ifdef ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED + struct ethtool_coalesce *coalesce, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +#else + struct ethtool_coalesce *coalesce) +#endif +{ + struct ena_adapter *adapter = netdev_priv(net_dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + + if (!ena_com_interrupt_moderation_supported(ena_dev)) + return -EOPNOTSUPP; + + coalesce->tx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) * + ena_dev->intr_delay_resolution; + + coalesce->rx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev) + * ena_dev->intr_delay_resolution; + + coalesce->use_adaptive_rx_coalesce = + ena_com_get_adaptive_moderation_enabled(ena_dev); + + return 0; +} + +static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) +{ + unsigned int val; + int i; + + val = ena_com_get_nonadaptive_moderation_interval_tx(adapter->ena_dev); + + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->tx_ring[i].interrupt_interval_changed = + adapter->tx_ring[i].interrupt_interval != val; + adapter->tx_ring[i].interrupt_interval = val; + } +} + +static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) +{ + unsigned int val; + int i; + + val = ena_com_get_nonadaptive_moderation_interval_rx(adapter->ena_dev); + + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->rx_ring[i].interrupt_interval_changed = + adapter->rx_ring[i].interrupt_interval != val; + adapter->rx_ring[i].interrupt_interval = val; + } +} + +static int ena_set_coalesce(struct net_device *net_dev, +#ifdef ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED + struct ethtool_coalesce *coalesce, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +#else + struct ethtool_coalesce *coalesce) +#endif +{ + struct ena_adapter *adapter = netdev_priv(net_dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc; + + if (!ena_com_interrupt_moderation_supported(ena_dev)) + return -EOPNOTSUPP; + + rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev, + coalesce->tx_coalesce_usecs); + if (rc) + return rc; + + ena_update_tx_rings_nonadaptive_intr_moderation(adapter); + + rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, + coalesce->rx_coalesce_usecs); + if (rc) + return rc; + + ena_update_rx_rings_nonadaptive_intr_moderation(adapter); + + if (coalesce->use_adaptive_rx_coalesce && + !ena_com_get_adaptive_moderation_enabled(ena_dev)) + ena_com_enable_adaptive_moderation(ena_dev); + + if (!coalesce->use_adaptive_rx_coalesce && + 
ena_com_get_adaptive_moderation_enabled(ena_dev)) + ena_com_disable_adaptive_moderation(ena_dev); + + return 0; +} + +static u32 ena_get_msglevel(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + return adapter->msg_enable; +} + +static void ena_set_msglevel(struct net_device *netdev, u32 value) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + adapter->msg_enable = value; +} + +static void ena_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct ena_adapter *adapter = netdev_priv(dev); + ssize_t ret = 0; + + ret = strscpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); + if (ret < 0) + netif_info(adapter, drv, dev, + "module name will be truncated, status = %zd\n", ret); + + ret = strscpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); + if (ret < 0) + netif_info(adapter, drv, dev, + "module version will be truncated, status = %zd\n", ret); + + ret = strscpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); + if (ret < 0) + netif_info(adapter, drv, dev, + "bus info will be truncated, status = %zd\n", ret); + + info->n_priv_flags = ENA_PRIV_FLAGS_NR; +} + +static void ena_get_ringparam(struct net_device *netdev, +#ifdef ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +#else + struct ethtool_ringparam *ring) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + ring->tx_max_pending = adapter->max_tx_ring_size; + ring->rx_max_pending = adapter->max_rx_ring_size; + ring->tx_pending = adapter->tx_ring[0].ring_size; + ring->rx_pending = adapter->rx_ring[0].ring_size; +} + +static int ena_set_ringparam(struct net_device *netdev, +#ifdef ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +#else + struct ethtool_ringparam *ring) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 new_tx_size, new_rx_size; + + if (ring->rx_mini_pending || ring->rx_jumbo_pending) + return -EINVAL; + + new_tx_size = clamp_val(ring->tx_pending, ENA_MIN_RING_SIZE, + adapter->max_tx_ring_size); + new_tx_size = rounddown_pow_of_two(new_tx_size); + + new_rx_size = clamp_val(ring->rx_pending, ENA_MIN_RING_SIZE, + adapter->max_rx_ring_size); + new_rx_size = rounddown_pow_of_two(new_rx_size); + + if (new_tx_size == adapter->requested_tx_ring_size && + new_rx_size == adapter->requested_rx_ring_size) + return 0; + + return ena_update_queue_sizes(adapter, new_tx_size, new_rx_size); +} + +#ifdef ETHTOOL_GRXRINGS +static u32 ena_flow_hash_to_flow_type(u16 hash_fields) +{ + u32 data = 0; + + if (hash_fields & ENA_ADMIN_RSS_L2_DA) + data |= RXH_L2DA; + + if (hash_fields & ENA_ADMIN_RSS_L3_DA) + data |= RXH_IP_DST; + + if (hash_fields & ENA_ADMIN_RSS_L3_SA) + data |= RXH_IP_SRC; + + if (hash_fields & ENA_ADMIN_RSS_L4_DP) + data |= RXH_L4_B_2_3; + + if (hash_fields & ENA_ADMIN_RSS_L4_SP) + data |= RXH_L4_B_0_1; + + return data; +} + +static u16 ena_flow_data_to_flow_hash(u32 hash_fields) +{ + u16 data = 0; + + if (hash_fields & RXH_L2DA) + data |= ENA_ADMIN_RSS_L2_DA; + + if (hash_fields & RXH_IP_DST) + data |= ENA_ADMIN_RSS_L3_DA; + + if (hash_fields & RXH_IP_SRC) + data |= ENA_ADMIN_RSS_L3_SA; + + if (hash_fields & RXH_L4_B_2_3) + data |= ENA_ADMIN_RSS_L4_DP; + + if (hash_fields & RXH_L4_B_0_1) + data |= ENA_ADMIN_RSS_L4_SP; + + return data; +} + +static int ena_get_rss_hash(struct ena_com_dev 
*ena_dev, + struct ethtool_rxnfc *cmd) +{ + enum ena_admin_flow_hash_proto proto; + u16 hash_fields; + int rc; + + cmd->data = 0; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + proto = ENA_ADMIN_RSS_TCP4; + break; + case UDP_V4_FLOW: + proto = ENA_ADMIN_RSS_UDP4; + break; + case TCP_V6_FLOW: + proto = ENA_ADMIN_RSS_TCP6; + break; + case UDP_V6_FLOW: + proto = ENA_ADMIN_RSS_UDP6; + break; + case IPV4_FLOW: + proto = ENA_ADMIN_RSS_IP4; + break; + case IPV6_FLOW: + proto = ENA_ADMIN_RSS_IP6; + break; + case ETHER_FLOW: + proto = ENA_ADMIN_RSS_NOT_IP; + break; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + rc = ena_com_get_hash_ctrl(ena_dev, proto, &hash_fields); + if (rc) + return rc; + + cmd->data = ena_flow_hash_to_flow_type(hash_fields); + + return 0; +} + +static int ena_set_rss_hash(struct ena_com_dev *ena_dev, + struct ethtool_rxnfc *cmd) +{ + enum ena_admin_flow_hash_proto proto; + u16 hash_fields; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + proto = ENA_ADMIN_RSS_TCP4; + break; + case UDP_V4_FLOW: + proto = ENA_ADMIN_RSS_UDP4; + break; + case TCP_V6_FLOW: + proto = ENA_ADMIN_RSS_TCP6; + break; + case UDP_V6_FLOW: + proto = ENA_ADMIN_RSS_UDP6; + break; + case IPV4_FLOW: + proto = ENA_ADMIN_RSS_IP4; + break; + case IPV6_FLOW: + proto = ENA_ADMIN_RSS_IP6; + break; + case ETHER_FLOW: + proto = ENA_ADMIN_RSS_NOT_IP; + break; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + hash_fields = ena_flow_data_to_flow_hash(cmd->data); + + return ena_com_fill_hash_ctrl(ena_dev, proto, hash_fields); +} + +static int ena_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_SRXFH: + rc = ena_set_rss_hash(adapter->ena_dev, info); + break; + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + default: + netif_err(adapter, drv, netdev, + "Command parameter %d is not supported\n", info->cmd); + rc = -EOPNOTSUPP; + } + + return rc; +} + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(3, 2, 0) +static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, + void *rules) +#else +static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, + u32 *rules) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_GRXRINGS: + info->data = adapter->num_io_queues; + rc = 0; + break; + case ETHTOOL_GRXFH: + rc = ena_get_rss_hash(adapter->ena_dev, info); + break; + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + default: + netif_err(adapter, drv, netdev, + "Command parameter %d is not supported\n", info->cmd); + rc = -EOPNOTSUPP; + } + + return rc; +} +#endif /* ETHTOOL_GRXRINGS */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static u32 ena_get_rxfh_indir_size(struct net_device *netdev) +{ + return ENA_RX_RSS_TABLE_SIZE; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static u32 ena_get_rxfh_key_size(struct net_device *netdev) +{ + return ENA_HASH_KEY_SIZE; +} +#endif + +static int ena_indirection_table_set(struct ena_adapter *adapter, + const u32 *indir) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int i, rc; + + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + rc = 
ena_com_indirect_table_fill_entry(ena_dev, + i, + ENA_IO_RXQ_IDX(indir[i])); + if (unlikely(rc)) { + netif_err(adapter, drv, adapter->netdev, + "Cannot fill indirect table (index is too large)\n"); + return rc; + } + } + + rc = ena_com_indirect_table_set(ena_dev); + if (rc) { + netif_err(adapter, drv, adapter->netdev, + "Cannot set indirect table\n"); + return rc == -EPERM ? -EOPNOTSUPP : rc; + } + return rc; +} + +static int ena_indirection_table_get(struct ena_adapter *adapter, u32 *indir) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int i, rc; + + if (!indir) + return 0; + + rc = ena_com_indirect_table_get(ena_dev, indir); + if (rc) + return rc; + + /* Our internal representation of the indices is: even indices + * for Tx and uneven indices for Rx. We need to convert the Rx + * indices to be consecutive + */ + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) + indir[i] = ENA_IO_RXQ_IDX_TO_COMBINED_IDX(indir[i]); + + return rc; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + enum ena_admin_hash_functions ena_func; + u8 func; + int rc; + + rc = ena_indirection_table_get(adapter, indir); + if (rc) + return rc; + + /* We call this function in order to check if the device + * supports getting/setting the hash function. + */ + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); + if (rc) { + if (rc == -EOPNOTSUPP) + rc = 0; + + return rc; + } + + rc = ena_com_get_hash_key(adapter->ena_dev, key); + if (rc) + return rc; + + switch (ena_func) { + case ENA_ADMIN_TOEPLITZ: + func = ETH_RSS_HASH_TOP; + break; + case ENA_ADMIN_CRC32: + func = ETH_RSS_HASH_CRC32; + break; + default: + netif_err(adapter, drv, netdev, + "Command parameter is not supported\n"); + return -EOPNOTSUPP; + } + + if (hfunc) + *hfunc = func; + + return 0; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + enum ena_admin_hash_functions ena_func; + int rc; + + rc = ena_indirection_table_get(adapter, indir); + if (rc) + return rc; + + /* We call this function in order to check if the device + * supports getting/setting the hash function. 
+ */ + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); + if (rc) { + if (rc == -EOPNOTSUPP) + rc = 0; + + return rc; + } + + rc = ena_com_get_hash_key(adapter->ena_dev, key); + if (rc) + return rc; + + return rc; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)/* >= 3.16.0 */ +static int ena_get_rxfh(struct net_device *netdev, u32 *indir) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + return ena_indirection_table_get(adapter, indir); +} +#endif /* >= 3.8.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key, const u8 hfunc) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key) +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + enum ena_admin_hash_functions func = 0; + int rc; + + if (indir) { + rc = ena_indirection_table_set(adapter, indir); + if (rc) + return rc; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) + switch (hfunc) { + case ETH_RSS_HASH_NO_CHANGE: + func = ena_com_get_current_hash_function(ena_dev); + break; + case ETH_RSS_HASH_TOP: + func = ENA_ADMIN_TOEPLITZ; + break; + case ETH_RSS_HASH_CRC32: + func = ENA_ADMIN_CRC32; + break; + default: + netif_err(adapter, drv, netdev, "Unsupported hfunc %d\n", + hfunc); + return -EOPNOTSUPP; + } +#else /* Kernel 3.19 */ + func = ENA_ADMIN_TOEPLITZ; +#endif + + if (key || func) { + rc = ena_com_fill_hash_function(ena_dev, func, key, + ENA_HASH_KEY_SIZE, + 0xFFFFFFFF); + if (unlikely(rc)) { + netif_err(adapter, drv, netdev, "Cannot fill key\n"); + return rc == -EPERM ? 
-EOPNOTSUPP : rc; + } + } + + return 0; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) /* Kernel > 3.16 */ +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + if (indir) + rc = ena_indirection_table_set(adapter, indir); + + return rc; +} +#endif /* Kernel >= 3.8 */ +#endif /* ETHTOOL_GRXFH */ +#ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT + +#ifdef ETHTOOL_SCHANNELS +static void ena_get_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + channels->max_combined = adapter->max_num_io_queues; + channels->combined_count = adapter->num_io_queues; +} + +static int ena_set_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 count = channels->combined_count; + /* The check for max value is already done in ethtool */ +#ifdef ENA_XDP_SUPPORT + if (count < ENA_MIN_NUM_IO_QUEUES || + (ena_xdp_present(adapter) && + !ena_xdp_legal_queue_count(adapter, count))) +#else + if (count < ENA_MIN_NUM_IO_QUEUES) +#endif /* ENA_XDP_SUPPORT */ + return -EINVAL; + + if (count > adapter->max_num_io_queues) + return -EINVAL; + if (count != adapter->num_io_queues && ena_is_zc_q_exist(adapter)) { + netdev_err(adapter->netdev, + "Changing channel count not supported with xsk pool loaded\n"); + return -EOPNOTSUPP; + } + + return ena_update_queue_count(adapter, count); +} +#endif /* ETHTOOL_SCHANNELS */ + +#endif /* HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +static int ena_get_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, void *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int ret = 0; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + *(u32 *)data = adapter->rx_copybreak; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int ena_set_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int ret = 0; + u32 len; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + len = *(u32 *)data; + ret = ena_set_rx_copybreak(adapter, len); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} +#endif /* 3.18.0 */ + +static u32 ena_get_priv_flags(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 priv_flags = 0; + + if (adapter->rx_ring->page_cache) + priv_flags |= ENA_PRIV_FLAGS_LPC; + + return priv_flags; +} + +static int ena_set_priv_flags(struct net_device *netdev, u32 priv_flags) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + /* LPC is the only supported private flag for now */ + return ena_set_lpc_state(adapter, !!(priv_flags & ENA_PRIV_FLAGS_LPC)); +} + +static const struct ethtool_ops ena_ethtool_ops = { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USE_ADAPTIVE_RX, +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) + .get_link_ksettings = ena_get_link_ksettings, +#else + .get_settings = ena_get_settings, +#endif + .get_drvinfo = ena_get_drvinfo, + .get_msglevel = ena_get_msglevel, + .set_msglevel = ena_set_msglevel, + .get_link = ethtool_op_get_link, + .get_coalesce = ena_get_coalesce, + .set_coalesce = ena_set_coalesce, + .get_ringparam = ena_get_ringparam, + .set_ringparam = ena_set_ringparam, + 
.get_sset_count = ena_get_sset_count, + .get_strings = ena_get_ethtool_strings, + .get_ethtool_stats = ena_get_ethtool_stats, +#ifdef ETHTOOL_GRXRINGS + .get_rxnfc = ena_get_rxnfc, + .set_rxnfc = ena_set_rxnfc, +#endif /* ETHTOOL_GRXRINGS */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + .get_rxfh_indir_size = ena_get_rxfh_indir_size, +#endif /* >= 3.8.0 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) + .get_rxfh_key_size = ena_get_rxfh_key_size, + .get_rxfh = ena_get_rxfh, + .set_rxfh = ena_set_rxfh, +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + .get_rxfh_indir = ena_get_rxfh, + .set_rxfh_indir = ena_set_rxfh, +#endif /* >= 3.8.0 */ +#ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT +#ifdef ETHTOOL_SCHANNELS + .get_channels = ena_get_channels, + .set_channels = ena_set_channels, +#endif /* ETHTOOL_SCHANNELS */ +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + .get_tunable = ena_get_tunable, + .set_tunable = ena_set_tunable, +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) + .get_ts_info = ena_get_ts_info, +#endif + .get_priv_flags = ena_get_priv_flags, + .set_priv_flags = ena_set_priv_flags, +}; + +void ena_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &ena_ethtool_ops; +} + +static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) +{ + struct net_device *netdev = adapter->netdev; + u8 *strings_buf; + u64 *data_buf; + int strings_num; + int i, rc; + + strings_num = ena_get_sw_stats_count(adapter); + if (strings_num <= 0) { + netif_err(adapter, drv, netdev, "Can't get stats num\n"); + return; + } + + strings_buf = devm_kcalloc(&adapter->pdev->dev, + ETH_GSTRING_LEN, strings_num, + GFP_ATOMIC); + if (!strings_buf) { + netif_err(adapter, drv, netdev, + "Failed to allocate strings_buf\n"); + return; + } + + data_buf = devm_kcalloc(&adapter->pdev->dev, + strings_num, sizeof(u64), + GFP_ATOMIC); + if (!data_buf) { + netif_err(adapter, drv, netdev, + "Failed to allocate data buf\n"); + devm_kfree(&adapter->pdev->dev, strings_buf); + return; + } + + ena_get_strings(adapter, strings_buf, false); + ena_get_stats(adapter, data_buf, false); + + /* If there is a buffer, dump stats, otherwise print them to dmesg */ + if (buf) + for (i = 0; i < strings_num; i++) { + rc = snprintf(buf, ETH_GSTRING_LEN + sizeof(u64), + "%s %llu\n", + strings_buf + i * ETH_GSTRING_LEN, + data_buf[i]); + buf += rc; + } + else + for (i = 0; i < strings_num; i++) + netif_err(adapter, drv, netdev, "%s: %llu\n", + strings_buf + i * ETH_GSTRING_LEN, + data_buf[i]); + + devm_kfree(&adapter->pdev->dev, strings_buf); + devm_kfree(&adapter->pdev->dev, data_buf); +} + +void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf) +{ + if (!buf) + return; + + ena_dump_stats_ex(adapter, buf); +} + +void ena_dump_stats_to_dmesg(struct ena_adapter *adapter) +{ + ena_dump_stats_ex(adapter, NULL); +} diff --git a/drivers/amazon/net/ena/ena_lpc.c b/drivers/amazon/net/ena/ena_lpc.c new file mode 100644 index 0000000000000..64c3d2d24f398 --- /dev/null +++ b/drivers/amazon/net/ena/ena_lpc.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+#include "ena_lpc.h"
+#include "ena_xdp.h"
+
+static void ena_free_ring_page_cache(struct ena_ring *rx_ring);
+
+static void ena_put_unmap_cache_page(struct ena_ring *rx_ring, struct ena_page *ena_page)
+{
+	dma_unmap_page(rx_ring->dev, ena_page->dma_addr, ENA_PAGE_SIZE,
+		       DMA_BIDIRECTIONAL);
+
+	put_page(ena_page->page);
+}
+
+/* Removes a page from the page cache and allocates a new one instead. If an
+ * allocation of a new page fails, the cache entry isn't changed
+ */
+static void ena_replace_cache_page(struct ena_ring *rx_ring,
+				   struct ena_page *ena_page)
+{
+	struct page *new_page;
+	dma_addr_t dma;
+
+	new_page = ena_alloc_map_page(rx_ring, &dma);
+
+	if (unlikely(IS_ERR(new_page)))
+		return;
+
+	ena_put_unmap_cache_page(rx_ring, ena_page);
+
+	ena_page->page = new_page;
+	ena_page->dma_addr = dma;
+}
+
+/* Mark the cache page as used and return it. If the page belongs to a different
+ * NUMA node than the current one, free the cache page and allocate another one
+ * instead.
+ */
+static struct page *ena_return_cache_page(struct ena_ring *rx_ring,
+					  struct ena_page *ena_page,
+					  dma_addr_t *dma)
+{
+	/* Remove pages belonging to a different node than the one the CPU runs on */
+	if (unlikely(page_to_nid(ena_page->page) != numa_mem_id())) {
+		ena_increase_stat(&rx_ring->rx_stats.lpc_wrong_numa, 1, &rx_ring->syncp);
+		ena_replace_cache_page(rx_ring, ena_page);
+	}
+
+	/* Make sure no writes are pending for this page */
+	dma_sync_single_for_device(rx_ring->dev, ena_page->dma_addr,
+				   ENA_PAGE_SIZE,
+				   DMA_BIDIRECTIONAL);
+
+	/* Increase refcount to 2 so that the page is returned to the
+	 * cache after being freed
+	 */
+	page_ref_inc(ena_page->page);
+
+	*dma = ena_page->dma_addr;
+
+	return ena_page->page;
+}
+
+struct page *ena_lpc_get_page(struct ena_ring *rx_ring, dma_addr_t *dma,
+			      bool *is_lpc_page)
+{
+	struct ena_page_cache *page_cache = rx_ring->page_cache;
+	u32 head, cache_current_size;
+	struct ena_page *ena_page;
+
+	/* Cache size of zero indicates disabled cache */
+	if (!page_cache) {
+		*is_lpc_page = false;
+		return ena_alloc_map_page(rx_ring, dma);
+	}
+
+	*is_lpc_page = true;
+
+	cache_current_size = page_cache->current_size;
+	head = page_cache->head;
+
+	ena_page = &page_cache->cache[head];
+	/* Warm up phase. We fill the pages for the first time. The
+	 * phase is done in the napi context to improve the chances we
+	 * allocate on the correct NUMA node
+	 */
+	if (unlikely(cache_current_size < page_cache->max_size)) {
+		/* Check if oldest allocated page is free */
+		if (ena_page->page && page_ref_count(ena_page->page) == 1) {
+			page_cache->head = (head + 1) % cache_current_size;
+			return ena_return_cache_page(rx_ring, ena_page, dma);
+		}
+
+		ena_page = &page_cache->cache[cache_current_size];
+
+		/* Add a new page to the cache */
+		ena_page->page = ena_alloc_map_page(rx_ring, dma);
+		if (unlikely(IS_ERR(ena_page->page)))
+			return ena_page->page;
+
+		ena_page->dma_addr = *dma;
+
+		/* Increase refcount to 2 so that the page is returned to the
+		 * cache after being freed
+		 */
+		page_ref_inc(ena_page->page);
+
+		page_cache->current_size++;
+
+		ena_increase_stat(&rx_ring->rx_stats.lpc_warm_up, 1, &rx_ring->syncp);
+
+		return ena_page->page;
+	}
+
+	/* Next page is still in use, so we allocate outside the cache */
+	if (unlikely(page_ref_count(ena_page->page) != 1)) {
+		ena_increase_stat(&rx_ring->rx_stats.lpc_full, 1, &rx_ring->syncp);
+		*is_lpc_page = false;
+		return ena_alloc_map_page(rx_ring, dma);
+	}
+
+	page_cache->head = (head + 1) & (page_cache->max_size - 1);
+
+	return ena_return_cache_page(rx_ring, ena_page, dma);
+}
+
+bool ena_is_lpc_supported(struct ena_adapter *adapter,
+			  struct ena_ring *rx_ring,
+			  bool error_print)
+{
+#ifdef ENA_NETDEV_LOGS_WITHOUT_RV
+	void (*print_log)(const struct net_device *dev, const char *format, ...);
+#else
+	int (*print_log)(const struct net_device *dev, const char *format, ...);
+#endif
+	int channels_nr = adapter->num_io_queues + adapter->xdp_num_queues;
+
+	print_log = (error_print) ? netdev_err : netdev_info;
+
+	/* LPC is disabled below min number of channels */
+	if (channels_nr < ENA_LPC_MIN_NUM_OF_CHANNELS) {
+		print_log(adapter->netdev,
+			  "Local page cache is disabled for less than %d channels\n",
+			  ENA_LPC_MIN_NUM_OF_CHANNELS);
+
+		/* Disable LPC for such a case. It can be enabled again through
+		 * the ethtool private flag.
+		 */
+		adapter->used_lpc_size = 0;
+
+		return false;
+	}
+#ifdef ENA_XDP_SUPPORT
+
+	/* The driver doesn't support page caches under XDP */
+	if (ena_xdp_present_ring(rx_ring)) {
+		print_log(adapter->netdev,
+			  "Local page cache is disabled when using XDP\n");
+		return false;
+	}
+#endif /* ENA_XDP_SUPPORT */
+
+	return true;
+}
+
+/* Calculate the size of the Local Page Cache. If LPC should be disabled, return
+ * a size of 0.
+ */
+static u32 ena_calculate_cache_size(struct ena_adapter *adapter,
+				    struct ena_ring *rx_ring)
+{
+	u32 page_cache_size = adapter->used_lpc_size;
+
+	/* LPC cache size of 0 means disabled cache */
+	if (page_cache_size == 0)
+		return 0;
+
+	if (!ena_is_lpc_supported(adapter, rx_ring, false))
+		return 0;
+
+	/* Clamp the LPC size to its maximum value */
+	if (page_cache_size > ENA_LPC_MAX_MULTIPLIER) {
+		netdev_info(adapter->netdev,
+			    "Configured LPC size %d is too large, reducing to %d (max)\n",
+			    adapter->configured_lpc_size, ENA_LPC_MAX_MULTIPLIER);
+
+		/* Override LPC size to avoid printing this message
+		 * every up/down operation
+		 */
+		adapter->configured_lpc_size = ENA_LPC_MAX_MULTIPLIER;
+		adapter->used_lpc_size = page_cache_size = ENA_LPC_MAX_MULTIPLIER;
+	}
+
+	page_cache_size = page_cache_size * ENA_LPC_MULTIPLIER_UNIT;
+	page_cache_size = roundup_pow_of_two(page_cache_size);
+
+	return page_cache_size;
+}
+
+int ena_create_page_caches(struct ena_adapter *adapter)
+{
+	struct ena_page_cache *cache;
+	u32 page_cache_size;
+	int i;
+
+	for (i = 0; i < adapter->num_io_queues; i++) {
+		struct ena_ring *rx_ring = &adapter->rx_ring[i];
+
+		page_cache_size = ena_calculate_cache_size(adapter, rx_ring);
+
+		if (!page_cache_size)
+			return 0;
+
+		cache = vzalloc(sizeof(struct ena_page_cache) +
+				sizeof(struct ena_page) * page_cache_size);
+		if (!cache)
+			goto err_cache_alloc;
+
+		cache->max_size = page_cache_size;
+		rx_ring->page_cache = cache;
+	}
+
+	return 0;
+err_cache_alloc:
+	netif_err(adapter, ifup, adapter->netdev,
+		  "Failed to initialize local page caches (LPCs)\n");
+	while (--i >= 0) {
+		struct ena_ring *rx_ring = &adapter->rx_ring[i];
+
+		ena_free_ring_page_cache(rx_ring);
+	}
+
+	return -ENOMEM;
+}
+
+/* Release all pages from the page cache */
+static void ena_free_ring_cache_pages(struct ena_adapter *adapter, int qid)
+{
+	struct ena_ring *rx_ring = &adapter->rx_ring[qid];
+	struct ena_page_cache *page_cache;
+	int i;
+
+	/* Page cache is disabled */
+	if (!rx_ring->page_cache)
+		return;
+
+	page_cache = rx_ring->page_cache;
+
+	/* We check size value to make sure we don't
+	 * free pages that weren't allocated.
+	 */
+	for (i = 0; i < page_cache->current_size; i++) {
+		struct ena_page *ena_page = &page_cache->cache[i];
+
+		WARN_ON(!ena_page->page);
+
+		dma_unmap_page(rx_ring->dev, ena_page->dma_addr,
+			       ENA_PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
+
+		/* If the page is also in the rx buffer, then this operation
+		 * would only decrease its reference count
+		 */
+		__free_page(ena_page->page);
+	}
+
+	page_cache->head = page_cache->current_size = 0;
+}
+
+void ena_free_all_cache_pages(struct ena_adapter *adapter)
+{
+	int i;
+
+	for (i = 0; i < adapter->num_io_queues; i++)
+		ena_free_ring_cache_pages(adapter, i);
+}
+
+static void ena_free_ring_page_cache(struct ena_ring *rx_ring)
+{
+	if (!rx_ring->page_cache)
+		return;
+
+	vfree(rx_ring->page_cache);
+	rx_ring->page_cache = NULL;
+}
+
+void ena_free_page_caches(struct ena_adapter *adapter)
+{
+	int i;
+
+	for (i = 0; i < adapter->num_io_queues; i++) {
+		struct ena_ring *rx_ring = &adapter->rx_ring[i];
+
+		ena_free_ring_page_cache(rx_ring);
+	}
+}
diff --git a/drivers/amazon/net/ena/ena_lpc.h b/drivers/amazon/net/ena/ena_lpc.h
new file mode 100644
index 0000000000000..2953eb24ac4dd
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_lpc.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */ + +#include "ena_netdev.h" + +/* LPC definitions */ +#define ENA_LPC_DEFAULT_MULTIPLIER 2 +#define ENA_LPC_MAX_MULTIPLIER 32 +#define ENA_LPC_MULTIPLIER_UNIT 1024 +#define ENA_LPC_MIN_NUM_OF_CHANNELS 16 + +/* Store DMA address along with the page */ +struct ena_page { + struct page *page; + dma_addr_t dma_addr; +}; + +struct ena_page_cache { + /* How many pages are produced */ + u32 head; + /* How many of the entries were initialized */ + u32 current_size; + /* Maximum number of pages the cache can hold */ + u32 max_size; + + struct ena_page cache[0]; +} ____cacheline_aligned; + +int ena_create_page_caches(struct ena_adapter *adapter); +void ena_free_page_caches(struct ena_adapter *adapter); +void ena_free_all_cache_pages(struct ena_adapter *adapter); +struct page *ena_lpc_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, + bool *is_lpc_page); +bool ena_is_lpc_supported(struct ena_adapter *adapter, + struct ena_ring *rx_ring, + bool error_print); diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c new file mode 100644 index 0000000000000..759926e8f8716 --- /dev/null +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -0,0 +1,5071 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#ifdef CONFIG_RFS_ACCEL +#include +#endif /* CONFIG_RFS_ACCEL */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_NET_RX_BUSY_POLL) && (LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0)) +#include +#endif +#include + +#include "ena_netdev.h" +#include "ena_pci_id_tbl.h" +#include "ena_sysfs.h" +#include "ena_xdp.h" + +#include "ena_lpc.h" + +#include "ena_phc.h" +#include "ena_devlink.h" + +static char version[] = DEVICE_NAME " v" DRV_MODULE_GENERATION "\n"; + +MODULE_AUTHOR("Amazon.com, Inc. or its affiliates"); +MODULE_DESCRIPTION(DEVICE_NAME); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_GENERATION); + +/* Time in jiffies before concluding the transmitter is hung. */ +#define TX_TIMEOUT (5 * HZ) + +#define ENA_MAX_RINGS min_t(unsigned int, ENA_MAX_NUM_IO_QUEUES, num_possible_cpus()) + +#define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ + NETIF_MSG_IFDOWN | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) + +#define ENA_HIGH_LOW_TO_U64(high, low) ((((u64)(high)) << 32) | (low)) +#ifndef ENA_LINEAR_FRAG_SUPPORTED + +#define ENA_SKB_PULL_MIN_LEN 64 +#endif + +static int debug = -1; +module_param(debug, int, 0444); +MODULE_PARM_DESC(debug, "Debug level (-1=default,0=none,...,16=all)"); + +static int rx_queue_size = ENA_DEFAULT_RING_SIZE; +module_param(rx_queue_size, int, 0444); +MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Depending on instance type, max value can be up to 16K\n"); + +static int force_large_llq_header = 0; +module_param(force_large_llq_header, int, 0444); +MODULE_PARM_DESC(force_large_llq_header, "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum TX queue size by half.\n"); + +static int num_io_queues = ENA_MAX_NUM_IO_QUEUES; +module_param(num_io_queues, int, 0444); +MODULE_PARM_DESC(num_io_queues, "Sets number of RX/TX queues to allocate to device. 
The maximum value depends on the device and number of online CPUs.\n"); + +static int enable_bql = 0; +module_param(enable_bql, int, 0444); +MODULE_PARM_DESC(enable_bql, "Enable BQL.\n"); + +static int lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; +module_param(lpc_size, uint, 0444); +MODULE_PARM_DESC(lpc_size, "Each local page cache (lpc) holds N * 1024 pages. This parameter sets N which is rounded up to a multiplier of 2. If zero, the page cache is disabled. Max: 32\n"); + +#ifdef ENA_PHC_SUPPORT +static int phc_enable = 0; +module_param(phc_enable, uint, 0444); +MODULE_PARM_DESC(phc_enable, "Enable PHC.\n"); + +#endif /* ENA_PHC_SUPPORT */ +static struct ena_aenq_handlers aenq_handlers; + +static struct workqueue_struct *ena_wq; + +MODULE_DEVICE_TABLE(pci, ena_pci_tbl); + +static int ena_rss_init_default(struct ena_adapter *adapter); +static void check_for_admin_com_state(struct ena_adapter *adapter); +static int ena_calc_io_queue_size(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx); +static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, + struct net_device *netdev); + +static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) +{ + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_OS_NETDEV_WD; + struct ena_adapter *adapter = netdev_priv(dev); + unsigned int time_since_last_napi, threshold; + struct ena_ring *tx_ring; + int napi_scheduled; + + if (txqueue >= adapter->num_io_queues) { + netdev_err(dev, "TX timeout on invalid queue %u\n", txqueue); + goto schedule_reset; + } + + threshold = jiffies_to_usecs(dev->watchdog_timeo); + tx_ring = &adapter->tx_ring[txqueue]; + + time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); + napi_scheduled = !!(tx_ring->napi->state & NAPIF_STATE_SCHED); + + netdev_err(dev, + "TX q %d is paused for too long (threshold %u). Time since last napi %u usec. napi scheduled: %d\n", + txqueue, + threshold, + time_since_last_napi, + napi_scheduled); + + if (threshold < time_since_last_napi && napi_scheduled) { + netdev_err(dev, + "napi handler hasn't been called for a long time but is scheduled\n"); + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + } +schedule_reset: + /* Change the state of the device to trigger reset + * Check that we are not in the middle or a trigger already + */ + if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + ena_reset_device(adapter, reset_reason); + ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); +} + +#ifndef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER +/* This function is called by the kernel's watchdog and indicates that the queue + * has been closed longer than dev->watchdog_timeo value allows. + * In older kernels the called function doesn't contain the id of the queue + * that's been closed for too long. 
This helper function retrieves this + * information + */ +static void ena_find_and_timeout_queue(struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + unsigned long trans_start; + struct netdev_queue *txq; + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + trans_start = txq->trans_start; + if (netif_xmit_stopped(txq) && + time_after(jiffies, (trans_start + dev->watchdog_timeo))) { + ena_tx_timeout(dev, i); + return; + } + } + + netdev_warn(dev, "timeout was called, but no offending queue was found\n"); + + /* Change the state of the device to trigger reset + * Check that we are not in the middle or a trigger already + */ + if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + ena_reset_device(adapter, ENA_REGS_RESET_OS_NETDEV_WD); + ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); +} + +#endif +static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + adapter->rx_ring[i].mtu = mtu; +} + +static int ena_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ena_adapter *adapter = netdev_priv(dev); + int ret; + +#ifndef HAVE_MTU_MIN_MAX_IN_NET_DEVICE + if ((new_mtu > adapter->max_mtu) || (new_mtu < ENA_MIN_MTU)) { + netif_err(adapter, drv, dev, + "Invalid MTU setting. new_mtu: %d max mtu: %d min mtu: %d\n", + new_mtu, adapter->max_mtu, ENA_MIN_MTU); + return -EINVAL; + } +#endif + ret = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu); + if (!ret) { + netif_dbg(adapter, drv, dev, "Set MTU to %d\n", new_mtu); + update_rx_ring_mtu(adapter, new_mtu); + dev->mtu = new_mtu; + } else { + netif_err(adapter, drv, dev, "Failed to set MTU to %d\n", + new_mtu); + } + + return ret; +} + +int ena_xmit_common(struct ena_adapter *adapter, + struct ena_ring *ring, + struct ena_tx_buffer *tx_info, + struct ena_com_tx_ctx *ena_tx_ctx, + u16 next_to_use, + u32 bytes) +{ + int rc, nb_hw_desc; + + if (unlikely(ena_com_is_doorbell_needed(ring->ena_com_io_sq, + ena_tx_ctx))) { + netif_dbg(adapter, tx_queued, adapter->netdev, + "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n", + ring->qid); + ena_ring_tx_doorbell(ring); + } + + /* prepare the packet's descriptors to dma engine */ + rc = ena_com_prepare_tx(ring->ena_com_io_sq, ena_tx_ctx, + &nb_hw_desc); + + /* In case there isn't enough space in the queue for the packet, + * we simply drop it. All other failure reasons of + * ena_com_prepare_tx() are fatal and therefore require a device reset. 
+ */ + if (unlikely(rc)) { + netif_err(adapter, tx_queued, adapter->netdev, + "Failed to prepare tx bufs\n"); + ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, + &ring->syncp); + if (rc != -ENOMEM) + ena_reset_device(adapter, + ENA_REGS_RESET_DRIVER_INVALID_STATE); + return rc; + } + + u64_stats_update_begin(&ring->syncp); + ring->tx_stats.cnt++; + ring->tx_stats.bytes += bytes; + u64_stats_update_end(&ring->syncp); + + tx_info->tx_descs = nb_hw_desc; + tx_info->total_tx_size = bytes; + tx_info->last_jiffies = jiffies; + tx_info->print_once = 0; + + ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use, + ring->ring_size); + return 0; +} + +static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter) +{ +#ifdef CONFIG_RFS_ACCEL + u32 i; + int rc; + + adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_io_queues); + if (!adapter->netdev->rx_cpu_rmap) + return -ENOMEM; + for (i = 0; i < adapter->num_io_queues; i++) { + int irq_idx = ENA_IO_IRQ_IDX(i); + + rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap, +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[irq_idx].vector); +#else + pci_irq_vector(adapter->pdev, irq_idx)); +#endif + if (rc) { + free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); + adapter->netdev->rx_cpu_rmap = NULL; + return rc; + } + } +#endif /* CONFIG_RFS_ACCEL */ + return 0; +} + +static void ena_init_io_rings_common(struct ena_adapter *adapter, + struct ena_ring *ring, u16 qid) +{ + ring->qid = qid; + ring->pdev = adapter->pdev; + ring->dev = &adapter->pdev->dev; + ring->netdev = adapter->netdev; + ring->napi = &adapter->ena_napi[qid].napi; + ring->adapter = adapter; + ring->ena_dev = adapter->ena_dev; + ring->per_napi_packets = 0; + ring->cpu = 0; + ring->numa_node = 0; + ring->no_interrupt_event_cnt = 0; + u64_stats_init(&ring->syncp); +} + +void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count) +{ + struct ena_com_dev *ena_dev; + struct ena_ring *txr, *rxr; + int i; + + ena_dev = adapter->ena_dev; + + for (i = first_index; i < first_index + count; i++) { + txr = &adapter->tx_ring[i]; + rxr = &adapter->rx_ring[i]; + + /* TX common ring state */ + ena_init_io_rings_common(adapter, txr, i); + + /* TX specific ring state */ + txr->ring_size = adapter->requested_tx_ring_size; + txr->tx_max_header_size = ena_dev->tx_max_header_size; + txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + txr->sgl_size = adapter->max_tx_sgl_size; + txr->enable_bql = enable_bql; + txr->interrupt_interval = + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); + /* Initial value, mark as true */ + txr->interrupt_interval_changed = true; + txr->disable_meta_caching = adapter->disable_meta_caching; +#ifdef ENA_XDP_SUPPORT + spin_lock_init(&txr->xdp_tx_lock); +#endif + + /* Don't init RX queues for xdp queues */ + if (!ENA_IS_XDP_INDEX(adapter, i)) { + /* RX common ring state */ + ena_init_io_rings_common(adapter, rxr, i); + + /* RX specific ring state */ + rxr->ring_size = adapter->requested_rx_ring_size; + rxr->rx_copybreak = adapter->rx_copybreak; + rxr->sgl_size = adapter->max_rx_sgl_size; + rxr->interrupt_interval = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); + /* Initial value, mark as true */ + rxr->interrupt_interval_changed = true; + rxr->empty_rx_queue = 0; + rxr->rx_headroom = NET_SKB_PAD; + adapter->ena_napi[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; +#ifdef ENA_XDP_SUPPORT + rxr->xdp_ring = &adapter->tx_ring[i + adapter->num_io_queues]; +#endif + } + } +} + +/* ena_setup_tx_resources - 
allocate I/O Tx resources (Descriptors) + * @adapter: network interface device structure + * @qid: queue index + * + * Return 0 on success, negative on failure + */ +static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *tx_ring = &adapter->tx_ring[qid]; + struct ena_irq *ena_irq = &adapter->irq_tbl[ENA_IO_IRQ_IDX(qid)]; + int size, i, node; + + if (tx_ring->tx_buffer_info) { + netif_err(adapter, ifup, + adapter->netdev, "tx_buffer_info info is not NULL"); + return -EEXIST; + } + + size = sizeof(struct ena_tx_buffer) * tx_ring->ring_size; + node = cpu_to_node(ena_irq->cpu); + + tx_ring->tx_buffer_info = vzalloc_node(size, node); + if (!tx_ring->tx_buffer_info) { + tx_ring->tx_buffer_info = vzalloc(size); + if (!tx_ring->tx_buffer_info) + goto err_tx_buffer_info; + } + + size = sizeof(u16) * tx_ring->ring_size; + tx_ring->free_ids = vzalloc_node(size, node); + if (!tx_ring->free_ids) { + tx_ring->free_ids = vzalloc(size); + if (!tx_ring->free_ids) + goto err_tx_free_ids; + } + + size = tx_ring->tx_max_header_size; + tx_ring->push_buf_intermediate_buf = vzalloc_node(size, node); + if (!tx_ring->push_buf_intermediate_buf) { + tx_ring->push_buf_intermediate_buf = vzalloc(size); + if (!tx_ring->push_buf_intermediate_buf) + goto err_push_buf_intermediate_buf; + } + + /* Req id ring for TX out of order completions */ + for (i = 0; i < tx_ring->ring_size; i++) + tx_ring->free_ids[i] = i; + + /* Reset tx statistics */ + memset(&tx_ring->tx_stats, 0x0, sizeof(tx_ring->tx_stats)); + + tx_ring->next_to_use = 0; + tx_ring->next_to_clean = 0; + tx_ring->cpu = ena_irq->cpu; + tx_ring->numa_node = node; + return 0; + +err_push_buf_intermediate_buf: + vfree(tx_ring->free_ids); + tx_ring->free_ids = NULL; +err_tx_free_ids: + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; +err_tx_buffer_info: + return -ENOMEM; +} + +/* ena_free_tx_resources - Free I/O Tx Resources per Queue + * @adapter: network interface device structure + * @qid: queue index + * + * Free all transmit software resources + */ +static void ena_free_tx_resources(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *tx_ring = &adapter->tx_ring[qid]; + + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; + + vfree(tx_ring->free_ids); + tx_ring->free_ids = NULL; + + vfree(tx_ring->push_buf_intermediate_buf); + tx_ring->push_buf_intermediate_buf = NULL; +} + +int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int i, rc = 0; + + for (i = first_index; i < first_index + count; i++) { + rc = ena_setup_tx_resources(adapter, i); + if (rc) + goto err_setup_tx; + } + + return 0; + +err_setup_tx: + + netif_err(adapter, ifup, adapter->netdev, + "Tx queue %d: allocation failed\n", i); + + /* rewind the index freeing the rings as we go */ + while (first_index < i--) + ena_free_tx_resources(adapter, i); + return rc; +} + +void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + ena_free_tx_resources(adapter, i); +} + +/* ena_free_all_io_tx_resources - Free I/O Tx Resources for All Queues + * @adapter: board private structure + * + * Free all transmit software resources + */ +void ena_free_all_io_tx_resources(struct ena_adapter *adapter) +{ + ena_free_all_io_tx_resources_in_range(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); +} + +/* ena_setup_rx_resources - allocate I/O Rx resources (Descriptors) + 
* @adapter: network interface device structure + * @qid: queue index + * + * Returns 0 on success, negative on failure + */ +static int ena_setup_rx_resources(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + struct ena_irq *ena_irq = &adapter->irq_tbl[ENA_IO_IRQ_IDX(qid)]; + int size, node, i; + + if (rx_ring->rx_buffer_info) { + netif_err(adapter, ifup, adapter->netdev, + "rx_buffer_info is not NULL"); + return -EEXIST; + } + + /* alloc extra element so in rx path + * we can always prefetch rx_info + 1 + */ + size = sizeof(struct ena_rx_buffer) * (rx_ring->ring_size + 1); + node = cpu_to_node(ena_irq->cpu); + + rx_ring->rx_buffer_info = vzalloc_node(size, node); + if (!rx_ring->rx_buffer_info) { + rx_ring->rx_buffer_info = vzalloc(size); + if (!rx_ring->rx_buffer_info) + return -ENOMEM; + } + + size = sizeof(u16) * rx_ring->ring_size; + rx_ring->free_ids = vzalloc_node(size, node); + if (!rx_ring->free_ids) { + rx_ring->free_ids = vzalloc(size); + if (!rx_ring->free_ids) { + vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; + return -ENOMEM; + } + } + + /* Req id ring for receiving RX pkts out of order */ + for (i = 0; i < rx_ring->ring_size; i++) + rx_ring->free_ids[i] = i; + + /* Reset rx statistics */ + memset(&rx_ring->rx_stats, 0x0, sizeof(rx_ring->rx_stats)); + +#ifdef ENA_BUSY_POLL_SUPPORT + ena_bp_init_lock(rx_ring); +#endif + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; + rx_ring->cpu = ena_irq->cpu; + rx_ring->numa_node = node; + + return 0; +} + +/* ena_free_rx_resources - Free I/O Rx Resources + * @adapter: network interface device structure + * @qid: queue index + * + * Free all receive software resources + */ +static void ena_free_rx_resources(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + + vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; + + vfree(rx_ring->free_ids); + rx_ring->free_ids = NULL; +} + +/* ena_setup_all_rx_resources - allocate I/O Rx queues resources for all queues + * @adapter: board private structure + * + * Return 0 on success, negative on failure + */ +static int ena_setup_all_rx_resources(struct ena_adapter *adapter) +{ + int i, rc = 0; + + for (i = 0; i < adapter->num_io_queues; i++) { + rc = ena_setup_rx_resources(adapter, i); + if (rc) + goto err_setup_rx; + } + + return 0; + +err_setup_rx: + + netif_err(adapter, ifup, adapter->netdev, + "Rx queue %d: allocation failed\n", i); + + /* rewind the index freeing the rings as we go */ + while (i--) + ena_free_rx_resources(adapter, i); + return rc; +} + +/* ena_free_all_io_rx_resources - Free I/O Rx Resources for All Queues + * @adapter: board private structure + * + * Free all receive software resources + */ +static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + ena_free_rx_resources(adapter, i); +} + +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, + dma_addr_t *dma) +{ + struct page *page; + + /* This would allocate the page on the same NUMA node the executing code + * is running on. 
+ */ + page = dev_alloc_page(); + if (!page) { + ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, + &rx_ring->syncp); + return ERR_PTR(-ENOSPC); + } + + /* To enable NIC-side port-mirroring, AKA SPAN port, + * we make the buffer readable from the nic as well + */ + *dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (unlikely(dma_mapping_error(rx_ring->dev, *dma))) { + ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1, + &rx_ring->syncp); + __free_page(page); + return ERR_PTR(-EIO); + } + + return page; +} + +static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + int headroom = rx_ring->rx_headroom; + struct ena_com_buf *ena_buf; + struct page *page; + dma_addr_t dma; + int tailroom; + + /* restore page offset value in case it has been changed by device */ + rx_info->buf_offset = headroom; + + /* if previous allocated page is not used */ + if (unlikely(rx_info->page)) + return 0; + + tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + ena_buf = &rx_info->ena_buf; + +#ifdef ENA_AF_XDP_SUPPORT + if (unlikely(ENA_IS_XSK_RING(rx_ring))) { + struct xdp_buff *xdp; + + xdp = xsk_buff_alloc(rx_ring->xsk_pool); + if (!xdp) + return -ENOMEM; + + ena_buf->paddr = xsk_buff_xdp_get_dma(xdp); + ena_buf->len = xsk_pool_get_rx_frame_size(rx_ring->xsk_pool); + + rx_info->xdp = xdp; + + return 0; + } +#endif /* ENA_AF_XDP_SUPPORT */ + + /* We handle DMA here */ + page = ena_lpc_get_page(rx_ring, &dma, &rx_info->is_lpc_page); + if (unlikely(IS_ERR(page))) + return PTR_ERR(page); + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "Allocate page %p, rx_info %p\n", page, rx_info); + + rx_info->page = page; + rx_info->dma_addr = dma; + rx_info->page_offset = 0; + ena_buf->paddr = dma + headroom; + ena_buf->len = ENA_PAGE_SIZE - headroom - tailroom; + + return 0; +} + +static void ena_unmap_rx_buff_attrs(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info, + unsigned long attrs) +{ + /* LPC pages are unmapped at cache destruction */ + if (rx_info->is_lpc_page) + return; + + ena_dma_unmap_page_attrs(rx_ring->dev, rx_info->dma_addr, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); +} + +static void ena_unmap_rx_buff(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + ena_unmap_rx_buff_attrs(rx_ring, rx_info, 0); +} + +static void ena_free_rx_page(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + struct page *page = rx_info->page; + + if (unlikely(!page)) { + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Trying to free unallocated buffer\n"); + return; + } + + ena_unmap_rx_buff(rx_ring, rx_info); + + __free_page(page); + rx_info->page = NULL; +} + +int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) +{ + u16 next_to_use, req_id; + u32 i; + int rc; + + next_to_use = rx_ring->next_to_use; + + for (i = 0; i < num; i++) { + struct ena_rx_buffer *rx_info; + + req_id = rx_ring->free_ids[next_to_use]; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + rc = ena_alloc_rx_buffer(rx_ring, rx_info); + if (unlikely(rc < 0)) { + if (!ENA_IS_XSK_RING(rx_ring)) + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate buffer for rx queue %d\n", + rx_ring->qid); + break; + } + rc = ena_com_add_single_rx_desc(rx_ring->ena_com_io_sq, + &rx_info->ena_buf, + req_id); + if (unlikely(rc)) { + netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, + "Failed to add buffer for rx queue %d\n", + rx_ring->qid); + break; + } + next_to_use = ENA_RX_RING_IDX_NEXT(next_to_use, 
+ rx_ring->ring_size); + } + + if (unlikely(i < num)) { + ena_increase_stat(&rx_ring->rx_stats.refil_partial, 1, + &rx_ring->syncp); + if (!ENA_IS_XSK_RING(rx_ring)) + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Refilled rx qid %d with only %d buffers (from %d)\n", + rx_ring->qid, i, num); + } + + /* ena_com_write_sq_doorbell issues a wmb() */ + if (likely(i)) + ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq); + + rx_ring->next_to_use = next_to_use; + + return i; +} + +static void ena_free_rx_bufs(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + u32 i; + + if (ENA_IS_XSK_RING(rx_ring)) { + ena_xdp_free_rx_bufs_zc(adapter, qid); + return; + } + + for (i = 0; i < rx_ring->ring_size; i++) { + struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; + + if (rx_info->page) + ena_free_rx_page(rx_ring, rx_info); + } +} + +/* ena_refill_all_rx_bufs - allocate all queues Rx buffers + * @adapter: board private structure + */ +static void ena_refill_all_rx_bufs(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, rc, bufs_num; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + bufs_num = rx_ring->ring_size - 1; + rc = ena_refill_rx_bufs(rx_ring, bufs_num); + + if (unlikely(rc != bufs_num)) + netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, + "Refilling Queue %d failed. allocated %d buffers from: %d\n", + i, rc, bufs_num); + } +} + +static void ena_free_all_rx_bufs(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + ena_free_rx_bufs(adapter, i); +} + +void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info) +{ + struct ena_com_buf *ena_buf; + u32 cnt; + int i; + + ena_buf = tx_info->bufs; + cnt = tx_info->num_of_bufs; + + if (unlikely(!cnt)) + return; + + if (tx_info->map_linear_data) { + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), + DMA_TO_DEVICE); + ena_buf++; + cnt--; + } + + /* unmap remaining mapped pages */ + for (i = 0; i < cnt; i++) { + dma_unmap_page(tx_ring->dev, dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), DMA_TO_DEVICE); + ena_buf++; + } +} + +/* ena_free_tx_bufs - Free Tx Buffers per Queue + * @tx_ring: TX ring for which buffers be freed + */ +static void ena_free_tx_bufs(struct ena_ring *tx_ring) +{ + bool print_once = true; + u32 i; + + for (i = 0; i < tx_ring->ring_size; i++) { + struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + + if (!tx_info->skb) + continue; + + if (print_once) { + netif_notice(tx_ring->adapter, ifdown, tx_ring->netdev, + "Free uncompleted tx skb qid %d idx 0x%x\n", + tx_ring->qid, i); + print_once = false; + } else { + netif_dbg(tx_ring->adapter, ifdown, tx_ring->netdev, + "Free uncompleted tx skb qid %d idx 0x%x\n", + tx_ring->qid, i); + } + + ena_unmap_tx_buff(tx_ring, tx_info); + + dev_kfree_skb_any(tx_info->skb); + } + netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->qid)); +} + +static void ena_free_all_tx_bufs(struct ena_adapter *adapter) +{ + struct ena_ring *tx_ring; + int i; + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + tx_ring = &adapter->tx_ring[i]; + if (ENA_IS_XSK_RING(tx_ring)) { + ena_xdp_free_tx_bufs_zc(tx_ring); + continue; + } + ena_free_tx_bufs(tx_ring); + } +} + +static void ena_destroy_all_tx_queues(struct ena_adapter *adapter) +{ + u16 ena_qid; + int i; + + for (i = 0; i < adapter->num_io_queues + 
adapter->xdp_num_queues; i++) { + ena_qid = ENA_IO_TXQ_IDX(i); + ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); + } +} + +static void ena_destroy_all_rx_queues(struct ena_adapter *adapter) +{ + u16 ena_qid; + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + ena_qid = ENA_IO_RXQ_IDX(i); + cancel_work_sync(&adapter->ena_napi[i].dim.work); + ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]); + ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); + } +} + +static void ena_destroy_all_io_queues(struct ena_adapter *adapter) +{ + ena_destroy_all_tx_queues(adapter); + ena_destroy_all_rx_queues(adapter); +} + +int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, + struct ena_tx_buffer *tx_info, bool is_xdp) +{ + if (tx_info) + netif_err(ring->adapter, + tx_done, + ring->netdev, + "tx_info doesn't have valid %s. qid %u req_id %u", + is_xdp ? "xdp frame" : "skb", ring->qid, req_id); + else + netif_err(ring->adapter, + tx_done, + ring->netdev, + "Invalid req_id %u in qid %u\n", + req_id, ring->qid); + + ena_increase_stat(&ring->tx_stats.bad_req_id, 1, &ring->syncp); + ena_reset_device(ring->adapter, ENA_REGS_RESET_INV_TX_REQ_ID); + + return -EFAULT; +} + +static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + if (likely(tx_info->skb)) + return 0; + + return handle_invalid_req_id(tx_ring, req_id, tx_info, false); +} + +static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) +{ + struct netdev_queue *txq; + bool above_thresh; + u32 tx_bytes = 0; + u32 total_done = 0; + u16 next_to_clean; + u16 req_id; + int tx_pkts = 0; + int rc; + + next_to_clean = tx_ring->next_to_clean; + txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->qid); + + while (tx_pkts < budget) { + struct ena_tx_buffer *tx_info; + struct sk_buff *skb; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, + &req_id); + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(tx_ring, req_id, NULL, + false); + break; + } + + /* validate that the request id points to a valid skb */ + rc = validate_tx_req_id(tx_ring, req_id); + if (rc) + break; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + skb = tx_info->skb; + + /* prefetch skb_end_pointer() to speedup skb_shinfo(skb) */ + prefetch(&skb->end); + + tx_info->skb = NULL; + tx_info->last_jiffies = 0; + + ena_unmap_tx_buff(tx_ring, tx_info); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d skb %p completed\n", tx_ring->qid, + skb); + + tx_bytes += tx_info->total_tx_size; + dev_kfree_skb(skb); + tx_pkts++; + total_done += tx_info->tx_descs; + + tx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + tx_ring->ring_size); + } + + tx_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); + + if (tx_ring->enable_bql) + netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d done. total pkts: %d\n", + tx_ring->qid, tx_pkts); + + /* need to make the rings circular update visible to + * ena_start_xmit() before checking for netif_queue_stopped(). 
+ */ + smp_mb(); + + above_thresh = ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH); + if (unlikely(netif_tx_queue_stopped(txq) && above_thresh)) { + __netif_tx_lock(txq, smp_processor_id()); + above_thresh = + ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH); + if (netif_tx_queue_stopped(txq) && above_thresh && + test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags)) { + netif_tx_wake_queue(txq); + ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); + } + __netif_tx_unlock(txq); + } + + return tx_pkts; +} + +static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, u16 len) +{ + struct sk_buff *skb; + +#ifdef ENA_LINEAR_FRAG_SUPPORTED + if (!first_frag) + skb = napi_alloc_skb(rx_ring->napi, len); + else + skb = build_skb(first_frag, len); +#else + if (!first_frag) + skb = napi_alloc_skb(rx_ring->napi, len); + else + skb = napi_alloc_skb(rx_ring->napi, + ENA_SKB_PULL_MIN_LEN); +#endif /* ENA_LINEAR_FRAG_SUPPORTED */ + + if (unlikely(!skb)) { + ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); + + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate skb. first_frag %s\n", + first_frag ? "provided" : "not provided"); + } + + return skb; +} + +static bool ena_try_rx_buf_page_reuse(struct ena_rx_buffer *rx_info, u16 buf_len, + u16 len, int pkt_offset) +{ + struct ena_com_buf *ena_buf = &rx_info->ena_buf; + + /* More than ENA_MIN_RX_BUF_SIZE left in the reused buffer + * for data + headroom + tailroom. + */ + if (SKB_DATA_ALIGN(len + pkt_offset) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) { + page_ref_inc(rx_info->page); + rx_info->page_offset += buf_len; + ena_buf->paddr += buf_len; + ena_buf->len -= buf_len; + return true; + } + + return false; +} + +static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, + struct ena_com_rx_buf_info *ena_bufs, + u32 descs, + u16 *next_to_clean) +{ + int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + bool is_xdp_loaded = ena_xdp_present_ring(rx_ring); + struct ena_rx_buffer *rx_info; + struct ena_adapter *adapter; + int page_offset, pkt_offset; + dma_addr_t pre_reuse_paddr; + u16 len, req_id, buf = 0; + bool reuse_rx_buf_page; + struct sk_buff *skb; + void *buf_addr; + int buf_offset; + u16 buf_len; +#ifndef ENA_LINEAR_FRAG_SUPPORTED + void *data_addr; + u16 hlen; +#endif + + len = ena_bufs[buf].len; + req_id = ena_bufs[buf].req_id; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + if (unlikely(!rx_info->page)) { + adapter = rx_ring->adapter; + netif_err(adapter, rx_err, rx_ring->netdev, + "Page is NULL. 
qid %u req_id %u\n", rx_ring->qid, req_id); + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + return NULL; + } + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_info %p page %p\n", + rx_info, rx_info->page); + + buf_offset = rx_info->buf_offset; + pkt_offset = buf_offset - rx_ring->rx_headroom; + page_offset = rx_info->page_offset; + buf_addr = page_address(rx_info->page) + page_offset; + + if (len <= rx_ring->rx_copybreak) { + skb = ena_alloc_skb(rx_ring, NULL, len); + if (unlikely(!skb)) + return NULL; + + /* sync this buffer for CPU use */ + dma_sync_single_for_cpu(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, + len, + DMA_FROM_DEVICE); + skb_copy_to_linear_data(skb, buf_addr + buf_offset, len); + dma_sync_single_for_device(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, + len, + DMA_FROM_DEVICE); + + skb_put(skb, len); + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "RX allocated small packet. len %d.\n", skb->len); +#ifdef ENA_BUSY_POLL_SUPPORT + skb_mark_napi_id(skb, rx_ring->napi); +#endif + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + rx_ring->free_ids[*next_to_clean] = req_id; + *next_to_clean = ENA_RX_RING_IDX_ADD(*next_to_clean, descs, + rx_ring->ring_size); + return skb; + } + + buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + + pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr); + + /* If XDP isn't loaded try to reuse part of the RX buffer */ + reuse_rx_buf_page = !is_xdp_loaded && + ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); + + if (!reuse_rx_buf_page) + ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + + + skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); + if (unlikely(!skb)) + return NULL; + +#ifdef ENA_LINEAR_FRAG_SUPPORTED + /* Populate skb's linear part */ + skb_reserve(skb, buf_offset); + skb_put(skb, len); +#else + data_addr = buf_addr + buf_offset; + + /* GRO expects us to have the ethernet header in the linear part. + * Copy the first ENA_SKB_PULL_MIN_LEN bytes because it is more + * efficient. + */ + hlen = min_t(u16, len, ENA_SKB_PULL_MIN_LEN); + memcpy(__skb_put(skb, hlen), data_addr, hlen); + if (hlen < len) + skb_add_rx_frag(skb, 0, rx_info->page, + page_offset + buf_offset + hlen, + len - hlen, buf_len); +#endif + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + + do { + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "RX skb updated. len %d. 
data_len %d\n", + skb->len, skb->data_len); + + if (!reuse_rx_buf_page) + rx_info->page = NULL; + + rx_ring->free_ids[*next_to_clean] = req_id; + *next_to_clean = + ENA_RX_RING_IDX_NEXT(*next_to_clean, + rx_ring->ring_size); + if (likely(--descs == 0)) + break; + + buf++; + len = ena_bufs[buf].len; + req_id = ena_bufs[buf].req_id; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + /* rx_info->buf_offset includes rx_ring->rx_headroom */ + buf_offset = rx_info->buf_offset; + pkt_offset = buf_offset - rx_ring->rx_headroom; + buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + page_offset = rx_info->page_offset; + + pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr); + + reuse_rx_buf_page = !is_xdp_loaded && + ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); + + if (!reuse_rx_buf_page) + ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, + page_offset + buf_offset, len, buf_len); + + } while (1); + +#ifdef ENA_BUSY_POLL_SUPPORT + skb_mark_napi_id(skb, rx_ring->napi); + +#endif + return skb; +} + +/* ena_rx_checksum - indicate in skb if hw indicated a good cksum + * @adapter: structure containing adapter specific data + * @ena_rx_ctx: received packet context/metadata + * @skb: skb currently being received and modified + */ +void ena_rx_checksum(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) +{ + /* Rx csum disabled */ + if (unlikely(!(rx_ring->netdev->features & NETIF_F_RXCSUM))) { + skb->ip_summed = CHECKSUM_NONE; + return; + } + + /* For fragmented packets the checksum isn't valid */ + if (ena_rx_ctx->frag) { + skb->ip_summed = CHECKSUM_NONE; + return; + } + + /* if IP and error */ + if (unlikely((ena_rx_ctx->l3_proto == ENA_ETH_IO_L3_PROTO_IPV4) && + (ena_rx_ctx->l3_csum_err))) { + /* ipv4 checksum error */ + skb->ip_summed = CHECKSUM_NONE; + ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1, + &rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "RX IPv4 header checksum error\n"); + return; + } + + /* if TCP/UDP */ + if (likely((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) || + (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) { + if (unlikely(ena_rx_ctx->l4_csum_err)) { + /* TCP/UDP checksum error */ + ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1, + &rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "RX L4 checksum error\n"); + skb->ip_summed = CHECKSUM_NONE; + return; + } + + if (likely(ena_rx_ctx->l4_csum_checked)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + ena_increase_stat(&rx_ring->rx_stats.csum_good, 1, + &rx_ring->syncp); + } else { + ena_increase_stat(&rx_ring->rx_stats.csum_unchecked, 1, + &rx_ring->syncp); + skb->ip_summed = CHECKSUM_NONE; + } + } else { + skb->ip_summed = CHECKSUM_NONE; + return; + } + +} + +void ena_set_rx_hash(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) +{ +#ifdef NETIF_F_RXHASH + enum pkt_hash_types hash_type; + + if (likely(rx_ring->netdev->features & NETIF_F_RXHASH)) { + if (likely((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) || + (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) + + hash_type = PKT_HASH_TYPE_L4; + else + hash_type = PKT_HASH_TYPE_NONE; + + /* Override hash type if the packet is fragmented */ + if (ena_rx_ctx->frag) + hash_type = PKT_HASH_TYPE_NONE; + + skb_set_hash(skb, 
ena_rx_ctx->hash, hash_type); + } +#endif /* NETIF_F_RXHASH */ +} + +#ifdef ENA_XDP_SUPPORT +static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp, u16 num_descs) +{ + struct ena_rx_buffer *rx_info; + int ret; + + /* XDP multi-buffer packets not supported */ + if (unlikely(num_descs > 1)) { + netdev_err_once(rx_ring->adapter->netdev, + "xdp: dropped multi-buffer packets. RX packets must be < %lu\n", + ENA_XDP_MAX_MTU); + ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); + return ENA_XDP_DROP; + } + + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + xdp_prepare_buff(xdp, page_address(rx_info->page), + rx_info->buf_offset, + rx_ring->ena_bufs[0].len, false); + + ret = ena_xdp_execute(rx_ring, xdp); + + /* The xdp program might expand the headers */ + if (ret == ENA_XDP_PASS) { + rx_info->buf_offset = xdp->data - xdp->data_hard_start; + rx_ring->ena_bufs[0].len = xdp->data_end - xdp->data; + } + + return ret; +} + +#endif /* ENA_XDP_SUPPORT */ +/* ena_clean_rx_irq - Cleanup RX irq + * @rx_ring: RX ring to clean + * @napi: napi handler + * @budget: how many packets driver is allowed to clean + * + * Returns the number of cleaned buffers. + */ +static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, + u32 budget) +{ + u16 next_to_clean = rx_ring->next_to_clean; + struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; + struct ena_adapter *adapter; + u32 res_budget, work_done; + int rx_copybreak_pkt = 0; + int refill_threshold; + struct sk_buff *skb; + int refill_required; +#ifdef ENA_XDP_SUPPORT + struct xdp_buff xdp; + int xdp_flags = 0; +#endif /* ENA_XDP_SUPPORT */ + int total_len = 0; +#ifdef ENA_XDP_SUPPORT + int xdp_verdict; +#endif /* ENA_XDP_SUPPORT */ + int rc = 0; + int i; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "%s qid %d\n", __func__, rx_ring->qid); + res_budget = budget; +#ifdef ENA_XDP_SUPPORT + xdp_init_buff(&xdp, ENA_PAGE_SIZE, &rx_ring->xdp_rxq); +#endif /* ENA_XDP_SUPPORT */ + + do { +#ifdef ENA_XDP_SUPPORT + xdp_verdict = ENA_XDP_PASS; + skb = NULL; +#endif /* ENA_XDP_SUPPORT */ + ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; + ena_rx_ctx.max_bufs = rx_ring->sgl_size; + ena_rx_ctx.descs = 0; + ena_rx_ctx.pkt_offset = 0; + rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, + rx_ring->ena_com_io_sq, + &ena_rx_ctx); + if (unlikely(rc)) + goto error; + + if (unlikely(ena_rx_ctx.descs == 0)) + break; + + /* First descriptor might have an offset set by the device */ + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + rx_info->buf_offset += ena_rx_ctx.pkt_offset; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_poll: q %d got packet from ena. 
descs #: %d l3 proto %d l4 proto %d hash: %x\n", + rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, + ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present_ring(rx_ring)) + xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp, ena_rx_ctx.descs); + + /* allocate skb and fill it */ + if (xdp_verdict == ENA_XDP_PASS) + skb = ena_rx_skb(rx_ring, + rx_ring->ena_bufs, + ena_rx_ctx.descs, + &next_to_clean); +#else + skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, + &next_to_clean); +#endif /* ENA_XDP_SUPPORT */ + + if (unlikely(!skb)) { + for (i = 0; i < ena_rx_ctx.descs; i++) { + int req_id = rx_ring->ena_bufs[i].req_id; + + rx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = + ENA_RX_RING_IDX_NEXT(next_to_clean, + rx_ring->ring_size); + +#ifdef ENA_XDP_SUPPORT + /* Packets was passed for transmission, unmap it + * from RX side. + */ + if (xdp_verdict & ENA_XDP_FORWARDED) { + ena_unmap_rx_buff(rx_ring, + &rx_ring->rx_buffer_info[req_id]); + rx_ring->rx_buffer_info[req_id].page = NULL; + } +#endif /* ENA_XDP_SUPPORT */ + } +#ifdef ENA_XDP_SUPPORT + if (xdp_verdict != ENA_XDP_PASS) { + xdp_flags |= xdp_verdict; + total_len += ena_rx_ctx.ena_bufs[0].len; + res_budget--; + continue; + } +#endif /* ENA_XDP_SUPPORT */ + break; + } + + ena_rx_checksum(rx_ring, &ena_rx_ctx, skb); + + ena_set_rx_hash(rx_ring, &ena_rx_ctx, skb); + + skb_record_rx_queue(skb, rx_ring->qid); + + if (rx_ring->ena_bufs[0].len <= rx_ring->rx_copybreak) + rx_copybreak_pkt++; + + total_len += skb->len; + +#ifdef ENA_BUSY_POLL_SUPPORT + if (ena_bp_busy_polling(rx_ring)) + netif_receive_skb(skb); + else + napi_gro_receive(napi, skb); +#else + napi_gro_receive(napi, skb); +#endif /* ENA_BUSY_POLL_SUPPORT */ + + res_budget--; + } while (likely(res_budget)); + + work_done = budget - res_budget; + rx_ring->per_napi_packets += work_done; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bytes += total_len; + rx_ring->rx_stats.cnt += work_done; + rx_ring->rx_stats.rx_copybreak_pkt += rx_copybreak_pkt; + u64_stats_update_end(&rx_ring->syncp); + + rx_ring->next_to_clean = next_to_clean; + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + refill_threshold = + min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, + ENA_RX_REFILL_THRESH_PACKET); + + /* Optimization, try to batch new rx buffers */ + if (refill_required > refill_threshold) + ena_refill_rx_bufs(rx_ring, refill_required); + +#ifdef ENA_XDP_SUPPORT + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush_map(); +#endif + + return work_done; + +error: + adapter = netdev_priv(rx_ring->netdev); + + if (rc == -ENOSPC) { + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, + &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else if (rc == -EFAULT) { + ena_reset_device(adapter, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); + } else { + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, + &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + } + return 0; +} + +static void ena_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + struct dim_cq_moder cur_moder = + net_dim_get_rx_moderation(dim->mode, dim->profile_ix); + struct ena_napi *ena_napi = container_of(dim, struct ena_napi, dim); + + ena_napi->rx_ring->interrupt_interval = cur_moder.usec; + /* DIM will schedule the work in case there was a change in the profile. 
*/ + ena_napi->rx_ring->interrupt_interval_changed = true; + + dim->state = DIM_START_MEASURE; +} + +static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) +{ + struct dim_sample dim_sample; + struct ena_ring *rx_ring = ena_napi->rx_ring; + + if (!rx_ring->per_napi_packets) + return; + + rx_ring->non_empty_napi_events++; + + dim_update_sample(rx_ring->non_empty_napi_events, + rx_ring->rx_stats.cnt, + rx_ring->rx_stats.bytes, + &dim_sample); + + net_dim(&ena_napi->dim, dim_sample); + + rx_ring->per_napi_packets = 0; +} + +void ena_unmask_interrupt(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) +{ + u32 rx_interval = tx_ring->interrupt_interval; + struct ena_eth_io_intr_reg intr_reg; + bool no_moderation_update = true; + + /* Rx ring can be NULL when for XDP tx queues which don't have an + * accompanying rx_ring pair. + */ + if (rx_ring) { + rx_interval = ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev) ? + rx_ring->interrupt_interval : + ena_com_get_nonadaptive_moderation_interval_rx(rx_ring->ena_dev); + + no_moderation_update &= !rx_ring->interrupt_interval_changed; + rx_ring->interrupt_interval_changed = false; + } + + no_moderation_update &= !tx_ring->interrupt_interval_changed; + tx_ring->interrupt_interval_changed = false; + + /* Update intr register: rx intr delay, + * tx intr delay and interrupt unmask + */ + ena_com_update_intr_reg(&intr_reg, + rx_interval, + tx_ring->interrupt_interval, + true, + no_moderation_update); + + ena_increase_stat(&tx_ring->tx_stats.unmask_interrupt, 1, + &tx_ring->syncp); + + /* It is a shared MSI-X. + * Tx and Rx CQ have pointer to it. + * So we use one of them to reach the intr reg + * The Tx ring is used because the rx_ring is NULL for XDP queues + */ + ena_com_unmask_intr(tx_ring->ena_com_io_cq, &intr_reg); +} + +void ena_update_ring_numa_node(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) +{ + int cpu = get_cpu(); + int numa_node; + + /* Check only one ring since the 2 rings are running on the same cpu */ + if (likely(tx_ring->cpu == cpu)) + goto out; + + tx_ring->cpu = cpu; + if (rx_ring) + rx_ring->cpu = cpu; + + numa_node = cpu_to_node(cpu); + + if (likely(tx_ring->numa_node == numa_node)) + goto out; + + put_cpu(); + + if (numa_node != NUMA_NO_NODE) { + ena_com_update_numa_node(tx_ring->ena_com_io_cq, numa_node); + tx_ring->numa_node = numa_node; + if (rx_ring) { + rx_ring->numa_node = numa_node; + ena_com_update_numa_node(rx_ring->ena_com_io_cq, + numa_node); + } + } + + return; +out: + put_cpu(); +} + + +static int ena_io_poll(struct napi_struct *napi, int budget) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *tx_ring, *rx_ring; + int tx_work_done; + int rx_work_done = 0; + int tx_budget; + int napi_comp_call = 0; + int ret; + + tx_ring = ena_napi->tx_ring; + rx_ring = ena_napi->rx_ring; + + tx_budget = tx_ring->ring_size / ENA_TX_POLL_BUDGET_DIVIDER; + + if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) { + napi_complete_done(napi, 0); + return 0; + } +#ifdef ENA_BUSY_POLL_SUPPORT + if (!ena_bp_lock_napi(rx_ring)) + return budget; +#endif + + tx_work_done = ena_clean_tx_irq(tx_ring, tx_budget); + /* On netpoll the budget is zero and the handler should only clean the + * tx completions. 
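
ena_unmask_interrupt() above programs the interrupt register with a TX delay and an RX delay: XDP TX queues have no paired RX ring, so they reuse the TX interval, while regular queues take either the DIM-computed interval (adaptive moderation enabled) or the static non-adaptive one. A small standalone sketch of that selection follows; the struct and field names are illustrative stand-ins, not driver definitions:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the RX-interval selection in ena_unmask_interrupt():
 * no RX ring (XDP TX queue) -> fall back to the TX interval,
 * otherwise adaptive (DIM) interval or the static non-adaptive one.
 */
struct moder_cfg {
	bool has_rx_ring;
	bool adaptive_enabled;
	unsigned int tx_interval_us;
	unsigned int rx_dim_interval_us;
	unsigned int rx_static_interval_us;
};

static unsigned int pick_rx_interval(const struct moder_cfg *c)
{
	if (!c->has_rx_ring)
		return c->tx_interval_us;
	return c->adaptive_enabled ? c->rx_dim_interval_us
				   : c->rx_static_interval_us;
}

int main(void)
{
	struct moder_cfg xdp_tx = { false, true, 64, 0, 0 };
	struct moder_cfg rx_dim = { true, true, 64, 128, 32 };
	struct moder_cfg rx_static = { true, false, 64, 128, 32 };

	printf("xdp tx queue:     %u us\n", pick_rx_interval(&xdp_tx));
	printf("adaptive rx:      %u us\n", pick_rx_interval(&rx_dim));
	printf("non-adaptive rx:  %u us\n", pick_rx_interval(&rx_static));
	return 0;
}
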
+ */ + if (likely(budget)) + rx_work_done = ena_clean_rx_irq(rx_ring, napi, budget); + + /* If the device is about to reset or down, avoid unmask + * the interrupt and return 0 so NAPI won't reschedule + */ + if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags))) { + napi_complete_done(napi, 0); + ret = 0; + + } else if ((budget > rx_work_done) && (tx_budget > tx_work_done)) { + napi_comp_call = 1; + + /* Update numa and unmask the interrupt only when schedule + * from the interrupt context (vs from sk_busy_loop) + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + if (napi_complete_done(napi, rx_work_done) && + READ_ONCE(ena_napi->interrupts_masked)) { +#else + napi_complete_done(napi, rx_work_done); + if (READ_ONCE(ena_napi->interrupts_masked)) { +#endif + smp_rmb(); /* make sure interrupts_masked is read */ + WRITE_ONCE(ena_napi->interrupts_masked, false); + /* We apply adaptive moderation on Rx path only. + * Tx uses static interrupt moderation. + */ + if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) + ena_adjust_adaptive_rx_intr_moderation(ena_napi); + + ena_update_ring_numa_node(tx_ring, rx_ring); + ena_unmask_interrupt(tx_ring, rx_ring); + } + + ret = rx_work_done; + } else { + ret = budget; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.napi_comp += napi_comp_call; + tx_ring->tx_stats.tx_poll++; + u64_stats_update_end(&tx_ring->syncp); + +#ifdef ENA_BUSY_POLL_SUPPORT + ena_bp_unlock_napi(rx_ring); +#endif + tx_ring->tx_stats.last_napi_jiffies = jiffies; + + return ret; +} + +static irqreturn_t ena_intr_msix_mgmnt(int irq, void *data) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + ena_com_admin_q_comp_intr_handler(adapter->ena_dev); + + /* Don't call the aenq handler before probe is done */ + if (likely(test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags))) + ena_com_aenq_intr_handler(adapter->ena_dev, data); + + return IRQ_HANDLED; +} + +/* ena_intr_msix_io - MSI-X Interrupt Handler for Tx/Rx + * @irq: interrupt number + * @data: pointer to a network interface private napi device structure + */ +static irqreturn_t ena_intr_msix_io(int irq, void *data) +{ + struct ena_napi *ena_napi = data; + + /* Used to check HW health */ + WRITE_ONCE(ena_napi->first_interrupt, true); + + WRITE_ONCE(ena_napi->interrupts_masked, true); + smp_wmb(); /* write interrupts_masked before calling napi */ + + napi_schedule_irqoff(&ena_napi->napi); + + return IRQ_HANDLED; +} + +/* Reserve a single MSI-X vector for management (admin + aenq). + * plus reserve one vector for each potential io queue. + * the number of potential io queues is the minimum of what the device + * supports and the number of vCPUs. 
+ */ +static int ena_enable_msix(struct ena_adapter *adapter) +{ + int msix_vecs, irq_cnt; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + int i; +#endif + + if (test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) { + netif_err(adapter, probe, adapter->netdev, + "Error, MSI-X is already enabled\n"); + return -EPERM; + } + + /* Reserved the max msix vectors we might need */ + msix_vecs = ENA_MAX_MSIX_VEC(adapter->max_num_io_queues); + netif_dbg(adapter, probe, adapter->netdev, + "Trying to enable MSI-X, vectors %d\n", msix_vecs); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries = vzalloc(msix_vecs * sizeof(struct msix_entry)); + + if (!adapter->msix_entries) + return -ENOMEM; + + for (i = 0; i < msix_vecs; i++) + adapter->msix_entries[i].entry = i; + + irq_cnt = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, + ENA_MIN_MSIX_VEC, msix_vecs); +#else + irq_cnt = pci_alloc_irq_vectors(adapter->pdev, ENA_MIN_MSIX_VEC, + msix_vecs, PCI_IRQ_MSIX); +#endif + + if (irq_cnt < 0) { + netif_err(adapter, probe, adapter->netdev, + "Failed to enable MSI-X. irq_cnt %d\n", irq_cnt); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + vfree(adapter->msix_entries); + adapter->msix_entries = NULL; +#endif + return -ENOSPC; + } + + if (irq_cnt != msix_vecs) { + netif_notice(adapter, probe, adapter->netdev, + "Enable only %d MSI-X (out of %d), reduce the number of queues\n", + irq_cnt, msix_vecs); + adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; + } + + if (ena_init_rx_cpu_rmap(adapter)) + netif_warn(adapter, probe, adapter->netdev, + "Failed to map IRQs to CPUs\n"); + + adapter->msix_vecs = irq_cnt; + set_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags); + + return 0; +} + +static void ena_setup_mgmnt_intr(struct ena_adapter *adapter) +{ + u32 cpu; + + snprintf(adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].name, + ENA_IRQNAME_SIZE, "ena-mgmnt@pci:%s", + pci_name(adapter->pdev)); + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].handler = + ena_intr_msix_mgmnt; + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].data = adapter; + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].vector = +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[ENA_MGMNT_IRQ_IDX].vector; +#else + pci_irq_vector(adapter->pdev, ENA_MGMNT_IRQ_IDX); +#endif + cpu = cpumask_first(cpu_online_mask); + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].cpu = cpu; + cpumask_set_cpu(cpu, + &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].affinity_hint_mask); +} + +static void ena_setup_io_intr(struct ena_adapter *adapter) +{ + struct net_device *netdev; + int irq_idx, i, cpu; + int io_queue_count; + + netdev = adapter->netdev; + io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + + for (i = 0; i < io_queue_count; i++) { + irq_idx = ENA_IO_IRQ_IDX(i); + cpu = i % num_online_cpus(); + + snprintf(adapter->irq_tbl[irq_idx].name, ENA_IRQNAME_SIZE, + "%s-Tx-Rx-%d", netdev->name, i); + adapter->irq_tbl[irq_idx].handler = ena_intr_msix_io; + adapter->irq_tbl[irq_idx].data = &adapter->ena_napi[i]; + adapter->irq_tbl[irq_idx].vector = +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[irq_idx].vector; +#else + pci_irq_vector(adapter->pdev, irq_idx); +#endif + adapter->irq_tbl[irq_idx].cpu = cpu; + + cpumask_set_cpu(cpu, + &adapter->irq_tbl[irq_idx].affinity_hint_mask); + } +} + +static int ena_request_mgmnt_irq(struct ena_adapter *adapter) +{ + unsigned long flags = 0; + struct ena_irq *irq; + int rc; + + irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + irq->data); 
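
ena_enable_msix() above asks for one vector per potential I/O queue plus a single management vector, and falls back to fewer queues when the PCI core grants only part of the request (num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC). A minimal userspace sketch of that accounting, assuming ENA_MAX_MSIX_VEC(n) counts n I/O vectors plus the one management vector and that at least one I/O vector is required (both assumptions for this demo):

#include <stdio.h>

/* Illustrative only: one admin/AENQ vector plus one vector per I/O queue;
 * a partial grant shrinks the usable queue count instead of failing.
 */
static int budget_msix(int max_io_queues, int granted_vectors, int *num_io_queues)
{
	int requested = max_io_queues + 1;	/* management + I/O vectors */

	if (granted_vectors < 2)		/* need mgmnt + at least one I/O */
		return -1;

	if (granted_vectors < requested)	/* partial grant */
		*num_io_queues = granted_vectors - 1;
	else
		*num_io_queues = max_io_queues;

	return 0;
}

int main(void)
{
	int queues;

	budget_msix(8, 9, &queues);
	printf("full grant:    %d I/O queues\n", queues);	/* 8 */
	budget_msix(8, 5, &queues);
	printf("partial grant: %d I/O queues\n", queues);	/* 4 */
	return 0;
}
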
+ if (rc) { + netif_err(adapter, probe, adapter->netdev, + "Failed to request admin irq\n"); + return rc; + } + + netif_dbg(adapter, probe, adapter->netdev, + "Set affinity hint of mgmnt irq.to 0x%lx (irq vector: %d)\n", + irq->affinity_hint_mask.bits[0], irq->vector); + + return rc; +} + +static int ena_request_io_irq(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + unsigned long flags = 0; + struct ena_irq *irq; + int rc = 0, i, k; + + if (!test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to request I/O IRQ: MSI-X is not enabled\n"); + return -EINVAL; + } + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { + irq = &adapter->irq_tbl[i]; + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + irq->data); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to request I/O IRQ. index %d rc %d\n", + i, rc); + goto err; + } + + netif_dbg(adapter, ifup, adapter->netdev, + "Set affinity hint of irq. index %d to 0x%lx (irq vector: %d)\n", + i, irq->affinity_hint_mask.bits[0], irq->vector); + } + + return rc; + +err: + for (k = ENA_IO_IRQ_FIRST_IDX; k < i; k++) { + irq = &adapter->irq_tbl[k]; + free_irq(irq->vector, irq->data); + } + + return rc; +} + +static void ena_free_mgmnt_irq(struct ena_adapter *adapter) +{ + struct ena_irq *irq; + + irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; + synchronize_irq(irq->vector); + irq_set_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); +} + +static void ena_free_io_irq(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + struct ena_irq *irq; + int i; + +#ifdef CONFIG_RFS_ACCEL + if (adapter->msix_vecs >= 1) { + free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); + adapter->netdev->rx_cpu_rmap = NULL; + } +#endif /* CONFIG_RFS_ACCEL */ + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { + irq = &adapter->irq_tbl[i]; + irq_set_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); + } +} + +static void ena_disable_msix(struct ena_adapter *adapter) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + if (test_and_clear_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) + pci_disable_msix(adapter->pdev); + + if (adapter->msix_entries) + vfree(adapter->msix_entries); + adapter->msix_entries = NULL; +#else + if (test_and_clear_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) + pci_free_irq_vectors(adapter->pdev); +#endif +} + +static void ena_disable_io_intr_sync(struct ena_adapter *adapter) +{ + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + int i; + + if (!netif_running(adapter->netdev)) + return; + + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) + synchronize_irq(adapter->irq_tbl[i].vector); +} + +static void ena_del_napi_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) { +#ifdef ENA_BUSY_POLL_SUPPORT + napi_hash_del(&adapter->ena_napi[i].napi); +#endif /* ENA_BUSY_POLL_SUPPORT */ + netif_napi_del(&adapter->ena_napi[i].napi); + +#ifdef ENA_XDP_SUPPORT + WARN_ON(ENA_IS_XDP_INDEX(adapter, i) && + adapter->ena_napi[i].rx_ring); +#endif /* ENA_XDP_SUPPORT */ + } +#ifdef ENA_BUSY_POLL_SUPPORT + + /* Wait until all uses of napi struct complete */ + synchronize_net(); +#endif /* ENA_BUSY_POLL_SUPPORT */ +} + +static void ena_init_napi_in_range(struct ena_adapter 
*adapter, + int first_index, int count) +{ + int i; + int (*napi_handler)(struct napi_struct *napi, int budget); + + for (i = first_index; i < first_index + count; i++) { + struct ena_napi *napi = &adapter->ena_napi[i]; + struct ena_ring *rx_ring, *tx_ring; + + memset(napi, 0, sizeof(*napi)); + + rx_ring = &adapter->rx_ring[i]; + tx_ring = &adapter->tx_ring[i]; + + napi_handler = ena_io_poll; +#ifdef ENA_XDP_SUPPORT + if (ENA_IS_XDP_INDEX(adapter, i) || ENA_IS_XSK_RING(rx_ring)) + napi_handler = ena_xdp_io_poll; +#endif /* ENA_XDP_SUPPORT */ + + ena_netif_napi_add(adapter->netdev, &napi->napi, napi_handler); + +#ifdef ENA_BUSY_POLL_SUPPORT + napi_hash_add(&adapter->ena_napi[i].napi); + +#endif /* ENA_BUSY_POLL_SUPPORT */ + if (!ENA_IS_XDP_INDEX(adapter, i)) + napi->rx_ring = rx_ring; + + napi->tx_ring = tx_ring; + napi->qid = i; + } +} + +#ifdef ENA_BUSY_POLL_SUPPORT +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + struct ena_ring *rx_ring; + int i, timeout; + + for (i = first_index; i < first_index + count; i++) { + napi_disable(&adapter->ena_napi[i].napi); + + rx_ring = &adapter->rx_ring[i]; + timeout = 1000; + while (!ena_bp_disable(rx_ring)) { + netif_info(adapter, ifdown, adapter->netdev, + "Rx queue %d locked\n", i); + usleep_range(1000, 2000); + timeout--; + + if (!timeout) { + WARN(!ena_bp_disable(rx_ring), + "Unable to disable busy poll at ring %d\n", i); + break; + } + } + } +} +#else +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + napi_disable(&adapter->ena_napi[i].napi); +} +#endif + +static void ena_napi_enable_in_range(struct ena_adapter *adapter, + int first_index, + int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + napi_enable(&adapter->ena_napi[i].napi); +} + +/* Configure the Rx forwarding */ +static int ena_rss_configure(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc; + + /* In case the RSS table wasn't initialized by probe */ + if (!ena_dev->rss.tbl_log_size) { + rc = ena_rss_init_default(adapter); + if (rc && (rc != -EOPNOTSUPP)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to init RSS rc: %d\n", rc); + return rc; + } + } + + /* Set indirect table */ + rc = ena_com_indirect_table_set(ena_dev); + if (unlikely(rc && rc != -EOPNOTSUPP)) + return rc; + + /* Configure hash function (if supported) */ + rc = ena_com_set_hash_function(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) + return rc; + + /* Configure hash inputs (if supported) */ + rc = ena_com_set_hash_ctrl(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) + return rc; + + return 0; +} + +static int ena_up_complete(struct ena_adapter *adapter) +{ + int rc; + + rc = ena_rss_configure(adapter); + if (rc) + return rc; + + ena_change_mtu(adapter->netdev, adapter->netdev->mtu); + + ena_refill_all_rx_bufs(adapter); + + /* enable transmits */ + netif_tx_start_all_queues(adapter->netdev); + + ena_napi_enable_in_range(adapter, + 0, + adapter->xdp_num_queues + adapter->num_io_queues); + + return 0; +} + +static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) +{ + struct ena_com_create_io_ctx ctx; + struct ena_com_dev *ena_dev; + struct ena_ring *tx_ring; + u32 msix_vector; + u16 ena_qid; + int rc; + + ena_dev = adapter->ena_dev; + + tx_ring = &adapter->tx_ring[qid]; + msix_vector = ENA_IO_IRQ_IDX(qid); + ena_qid = ENA_IO_TXQ_IDX(qid); + + 
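
ena_init_napi_in_range() above chooses the NAPI poll routine per queue: XDP TX queues and AF_XDP (XSK) rings get ena_xdp_io_poll, everything else gets ena_io_poll, and an XDP index never gets an RX ring attached to its napi context. A compact sketch of that selection; the handlers and struct below are stand-ins, not driver symbols:

#include <stdio.h>

typedef int (*poll_fn)(int budget);

static int io_poll(int budget)  { (void)budget; return 0; }
static int xdp_poll(int budget) { (void)budget; return 0; }

struct napi_cfg {
	poll_fn handler;
	int has_rx_ring;
};

/* Mirrors the handler/rx_ring decisions made per queue index above. */
static struct napi_cfg init_napi(int is_xdp_index, int is_xsk_ring)
{
	struct napi_cfg cfg;

	cfg.handler = (is_xdp_index || is_xsk_ring) ? xdp_poll : io_poll;
	cfg.has_rx_ring = !is_xdp_index;	/* XDP TX queues have no RX pair */
	return cfg;
}

int main(void)
{
	struct napi_cfg regular = init_napi(0, 0);
	struct napi_cfg xdp = init_napi(1, 0);

	printf("regular queue uses io_poll: %d\n", regular.handler == io_poll);
	printf("xdp queue has rx ring:      %d\n", xdp.has_rx_ring);
	return 0;
}
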
memset(&ctx, 0x0, sizeof(ctx)); + + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_TX; + ctx.qid = ena_qid; + ctx.mem_queue_type = ena_dev->tx_mem_queue_type; + ctx.msix_vector = msix_vector; + ctx.queue_size = tx_ring->ring_size; + ctx.numa_node = tx_ring->numa_node; + + rc = ena_com_create_io_queue(ena_dev, &ctx); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to create I/O TX queue num %d rc: %d\n", + qid, rc); + return rc; + } + + rc = ena_com_get_io_handlers(ena_dev, ena_qid, + &tx_ring->ena_com_io_sq, + &tx_ring->ena_com_io_cq); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to get TX queue handlers. TX queue num %d rc: %d\n", + qid, rc); + ena_com_destroy_io_queue(ena_dev, ena_qid); + return rc; + } + + ena_com_update_numa_node(tx_ring->ena_com_io_cq, ctx.numa_node); + return rc; +} + +int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + for (i = first_index; i < first_index + count; i++) { + rc = ena_create_io_tx_queue(adapter, i); + if (rc) + goto create_err; + } + + return 0; + +create_err: + while (i-- > first_index) + ena_com_destroy_io_queue(ena_dev, ENA_IO_TXQ_IDX(i)); + + return rc; +} + +static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) +{ + struct ena_com_dev *ena_dev; + struct ena_com_create_io_ctx ctx; + struct ena_ring *rx_ring; + u32 msix_vector; + u16 ena_qid; + int rc; + + ena_dev = adapter->ena_dev; + + rx_ring = &adapter->rx_ring[qid]; + msix_vector = ENA_IO_IRQ_IDX(qid); + ena_qid = ENA_IO_RXQ_IDX(qid); + + memset(&ctx, 0x0, sizeof(ctx)); + + ctx.qid = ena_qid; + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_RX; + ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + ctx.msix_vector = msix_vector; + ctx.queue_size = rx_ring->ring_size; + ctx.numa_node = rx_ring->numa_node; + + rc = ena_com_create_io_queue(ena_dev, &ctx); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to create I/O RX queue num %d rc: %d\n", + qid, rc); + return rc; + } + + rc = ena_com_get_io_handlers(ena_dev, ena_qid, + &rx_ring->ena_com_io_sq, + &rx_ring->ena_com_io_cq); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to get RX queue handlers. RX queue num %d rc: %d\n", + qid, rc); + goto err; + } + + ena_com_update_numa_node(rx_ring->ena_com_io_cq, ctx.numa_node); + + return rc; +err: + ena_com_destroy_io_queue(ena_dev, ena_qid); + return rc; +} + +static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + for (i = 0; i < adapter->num_io_queues; i++) { + rc = ena_create_io_rx_queue(adapter, i); + if (rc) + goto create_err; + INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work); + + ena_xdp_register_rxq_info(&adapter->rx_ring[i]); + } + + return 0; + +create_err: + while (i--) { + ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]); + cancel_work_sync(&adapter->ena_napi[i].dim.work); + ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i)); + } + + return rc; +} + +static void set_io_rings_size(struct ena_adapter *adapter, + int new_tx_size, + int new_rx_size) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->tx_ring[i].ring_size = new_tx_size; + adapter->rx_ring[i].ring_size = new_rx_size; + } +} + +/* This function allows queue allocation to backoff when the system is + * low on memory. If there is not enough memory to allocate io queues + * the driver will try to allocate smaller queues. 
+ * + * The backoff algorithm is as follows: + * 1. Try to allocate TX and RX and if successful. + * 1.1. return success + * + * 2. Divide by 2 the size of the larger of RX and TX queues (or both if their size is the same). + * + * 3. If TX or RX is smaller than 256 + * 3.1. return failure. + * 4. else + * 4.1. go back to 1. + */ +static int create_queues_with_size_backoff(struct ena_adapter *adapter) +{ + int rc, cur_rx_ring_size, cur_tx_ring_size; + int new_rx_ring_size, new_tx_ring_size; + + /* current queue sizes might be set to smaller than the requested + * ones due to past queue allocation failures. + */ + set_io_rings_size(adapter, adapter->requested_tx_ring_size, + adapter->requested_rx_ring_size); + + while (1) { +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present(adapter)) { + rc = ena_setup_and_create_all_xdp_queues(adapter); + + if (rc) + goto err_setup_tx; + } +#endif /* ENA_XDP_SUPPORT */ + rc = ena_setup_tx_resources_in_range(adapter, + 0, + adapter->num_io_queues); + if (rc) + goto err_setup_tx; + + rc = ena_create_io_tx_queues_in_range(adapter, + 0, + adapter->num_io_queues); + if (rc) + goto err_create_tx_queues; + + rc = ena_setup_all_rx_resources(adapter); + if (rc) + goto err_setup_rx; + + rc = ena_create_all_io_rx_queues(adapter); + if (rc) + goto err_create_rx_queues; + + rc = ena_create_page_caches(adapter); + if (rc) /* Cache memory is freed in case of failure */ + goto err_create_rx_queues; + + return 0; + +err_create_rx_queues: + ena_free_all_io_rx_resources(adapter); +err_setup_rx: + ena_destroy_all_tx_queues(adapter); +err_create_tx_queues: + ena_free_all_io_tx_resources(adapter); +err_setup_tx: + if (rc != -ENOMEM) { + netif_err(adapter, ifup, adapter->netdev, + "Queue creation failed with error code %d\n", + rc); + return rc; + } + + cur_tx_ring_size = adapter->tx_ring[0].ring_size; + cur_rx_ring_size = adapter->rx_ring[0].ring_size; + + netif_err(adapter, ifup, adapter->netdev, + "Not enough memory to create queues with sizes TX=%d, RX=%d\n", + cur_tx_ring_size, cur_rx_ring_size); + + new_tx_ring_size = cur_tx_ring_size; + new_rx_ring_size = cur_rx_ring_size; + + /* Decrease the size of the larger queue, or + * decrease both if they are the same size. + */ + if (cur_rx_ring_size <= cur_tx_ring_size) + new_tx_ring_size = cur_tx_ring_size / 2; + if (cur_rx_ring_size >= cur_tx_ring_size) + new_rx_ring_size = cur_rx_ring_size / 2; + + if (new_tx_ring_size < ENA_MIN_RING_SIZE || + new_rx_ring_size < ENA_MIN_RING_SIZE) { + netif_err(adapter, ifup, adapter->netdev, + "Queue creation failed with the smallest possible queue size of %d for both queues. 
Not retrying with smaller queues\n", + ENA_MIN_RING_SIZE); + return rc; + } + + netif_err(adapter, ifup, adapter->netdev, + "Retrying queue creation with sizes TX=%d, RX=%d\n", + new_tx_ring_size, + new_rx_ring_size); + + set_io_rings_size(adapter, new_tx_ring_size, + new_rx_ring_size); + } +} + +int ena_up(struct ena_adapter *adapter) +{ + int io_queue_count, rc, i; + + netif_dbg(adapter, ifup, adapter->netdev, "%s\n", __func__); + + io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + ena_setup_io_intr(adapter); + + /* napi poll functions should be initialized before running + * request_irq(), to handle a rare condition where there is a pending + * interrupt, causing the ISR to fire immediately while the poll + * function wasn't set yet, causing a null dereference + */ + ena_init_napi_in_range(adapter, 0, io_queue_count); + + /* Enabling DIM needs to happen before enabling IRQs since DIM + * is run from napi routine + */ + if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) + ena_com_enable_adaptive_moderation(adapter->ena_dev); + + rc = ena_request_io_irq(adapter); + if (rc) + goto err_req_irq; + + rc = create_queues_with_size_backoff(adapter); + if (rc) + goto err_create_queues_with_backoff; + + rc = ena_up_complete(adapter); + if (rc) + goto err_up; + + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + + ena_increase_stat(&adapter->dev_stats.interface_up, 1, + &adapter->syncp); + + set_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + /* Enable completion queues interrupt */ + for (i = 0; i < adapter->num_io_queues; i++) + ena_unmask_interrupt(&adapter->tx_ring[i], + &adapter->rx_ring[i]); + + /* schedule napi in case we had pending packets + * from the last time we disable napi + */ + for (i = 0; i < io_queue_count; i++) + napi_schedule(&adapter->ena_napi[i].napi); + + return rc; + +err_up: + ena_free_page_caches(adapter); + ena_destroy_all_tx_queues(adapter); + ena_free_all_io_tx_resources(adapter); + ena_destroy_all_rx_queues(adapter); + ena_free_all_io_rx_resources(adapter); +err_create_queues_with_backoff: + ena_free_io_irq(adapter); +err_req_irq: + ena_del_napi_in_range(adapter, 0, io_queue_count); + + return rc; +} + +void ena_down(struct ena_adapter *adapter) +{ + int io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + + netif_dbg(adapter, ifdown, adapter->netdev, "%s\n", __func__); + + clear_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + ena_increase_stat(&adapter->dev_stats.interface_down, 1, + &adapter->syncp); + + netif_carrier_off(adapter->netdev); + netif_tx_disable(adapter->netdev); + + /* After this point the napi handler won't enable the tx queue */ + ena_napi_disable_in_range(adapter, 0, io_queue_count); + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) { + int rc; + + rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + if (rc) + netif_err(adapter, ifdown, adapter->netdev, + "Device reset failed\n"); + /* stop submitting admin commands on a device that was reset */ + ena_com_set_admin_running_state(adapter->ena_dev, false); + } + + ena_destroy_all_io_queues(adapter); + + ena_disable_io_intr_sync(adapter); + ena_free_io_irq(adapter); + ena_del_napi_in_range(adapter, 0, io_queue_count); + + ena_free_all_tx_bufs(adapter); + ena_free_all_rx_bufs(adapter); + ena_free_all_cache_pages(adapter); + ena_free_page_caches(adapter); + ena_free_all_io_tx_resources(adapter); + ena_free_all_io_rx_resources(adapter); +} + +/* ena_open - Called when a network interface is made active + * 
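
The backoff loop in create_queues_with_size_backoff() above can be exercised in isolation: after each failed attempt, halve the larger of the two ring sizes (both when they are equal) and give up once either ring would drop below the minimum. The allocator and the 256-entry minimum below are assumptions made for this demo (the driver uses ENA_MIN_RING_SIZE and real queue allocation):

#include <stdio.h>

#define MIN_RING_SIZE 256	/* stand-in for ENA_MIN_RING_SIZE */

/* Hypothetical allocator for the demo: allocation "succeeds" only once
 * the combined descriptor count fits a fixed budget.
 */
static int try_alloc(int tx, int rx)
{
	return (tx + rx <= 1536) ? 0 : -1;
}

static int create_with_backoff(int tx, int rx)
{
	int new_tx, new_rx;

	while (1) {
		if (!try_alloc(tx, rx)) {
			printf("created queues TX=%d RX=%d\n", tx, rx);
			return 0;
		}

		new_tx = tx;
		new_rx = rx;
		if (rx <= tx)
			new_tx = tx / 2;	/* TX is the larger (or equal) ring */
		if (rx >= tx)
			new_rx = rx / 2;	/* RX is the larger (or equal) ring */

		if (new_tx < MIN_RING_SIZE || new_rx < MIN_RING_SIZE) {
			printf("giving up: would go below %d entries\n",
			       MIN_RING_SIZE);
			return -1;
		}

		printf("retrying with TX=%d RX=%d\n", new_tx, new_rx);
		tx = new_tx;
		rx = new_rx;
	}
}

int main(void)
{
	/* Asymmetric start: only the larger ring is halved each round. */
	return create_with_backoff(4096, 1024) ? 1 : 0;
}
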
@netdev: network interface device structure + * + * Returns 0 on success, negative value on failure + * + * The open entry point is called when a network interface is made + * active by the system (IFF_UP). At this point all resources needed + * for transmit and receive operations are allocated, the interrupt + * handler is registered with the OS, the watchdog timer is started, + * and the stack is notified that the interface is ready. + */ +static int ena_open(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc; + + /* Notify the stack of the actual queue counts. */ + rc = netif_set_real_num_tx_queues(netdev, adapter->num_io_queues); + if (rc) { + netif_err(adapter, ifup, netdev, "Can't set num tx queues\n"); + return rc; + } + + rc = netif_set_real_num_rx_queues(netdev, adapter->num_io_queues); + if (rc) { + netif_err(adapter, ifup, netdev, "Can't set num rx queues\n"); + return rc; + } + + rc = ena_up(adapter); + if (rc) + return rc; + + return rc; +} + +/* ena_close - Disables a network interface + * @netdev: network interface device structure + * + * Returns 0, this is not allowed to fail + * + * The close entry point is called when an interface is de-activated + * by the OS. The hardware is still under the drivers control, but + * needs to be disabled. A global MAC reset is issued to stop the + * hardware, and all transmit and receive resources are freed. + */ +static int ena_close(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + netif_dbg(adapter, ifdown, netdev, "%s\n", __func__); + + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + return 0; + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + ena_down(adapter); + + /* Check for device status and issue reset if needed*/ + check_for_admin_com_state(adapter); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + netif_err(adapter, ifdown, adapter->netdev, + "Destroy failure, restarting device\n"); + ena_dump_stats_to_dmesg(adapter); + /* rtnl lock already obtained in dev_ioctl() layer */ + ena_destroy_device(adapter, false); + ena_restore_device(adapter); + } + + return 0; +} + +int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled) +{ + /* In XDP, lpc_size might be positive even with LPC disabled, use cache + * pointer instead. + */ + struct ena_page_cache *page_cache = adapter->rx_ring->page_cache; + + /* Exit early if LPC state doesn't change */ + if (enabled == !!page_cache) + return 0; + + if (enabled && !ena_is_lpc_supported(adapter, adapter->rx_ring, true)) + return -EOPNOTSUPP; + + adapter->used_lpc_size = enabled ? adapter->configured_lpc_size : 0; + + /* rtnl lock is already obtained in dev_ioctl() layer, so it's safe to + * re-initialize IO resources. + */ + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + ena_close(adapter->netdev); + ena_up(adapter); + } + + return 0; +} + +int ena_update_queue_sizes(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size) +{ + bool dev_was_up; + + dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + ena_close(adapter->netdev); + adapter->requested_tx_ring_size = new_tx_size; + adapter->requested_rx_ring_size = new_rx_size; + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + return dev_was_up ? 
ena_up(adapter) : 0; +} + +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak) +{ + struct ena_ring *rx_ring; + int i; + + if (rx_copybreak > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) + return -EINVAL; + + adapter->rx_copybreak = rx_copybreak; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + rx_ring->rx_copybreak = rx_copybreak; + } + + return 0; +} + +int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; +#ifdef ENA_XDP_SUPPORT + int prev_channel_count; +#endif /* ENA_XDP_SUPPORT */ + bool dev_was_up; + + dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + ena_close(adapter->netdev); +#ifdef ENA_XDP_SUPPORT + prev_channel_count = adapter->num_io_queues; +#endif /* ENA_XDP_SUPPORT */ + adapter->num_io_queues = new_channel_count; +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present(adapter) && + ena_xdp_allowed(adapter) == ENA_XDP_ALLOWED) { + adapter->xdp_first_ring = new_channel_count; + adapter->xdp_num_queues = new_channel_count; + if (prev_channel_count > new_channel_count) + ena_xdp_exchange_program_rx_in_range(adapter, + NULL, + new_channel_count, + prev_channel_count); + else + ena_xdp_exchange_program_rx_in_range(adapter, + adapter->xdp_bpf_prog, + prev_channel_count, + new_channel_count); + } +#endif /* ENA_XDP_SUPPORT */ + + /* We need to destroy the rss table so that the indirection + * table will be reinitialized by ena_up() + */ + ena_com_rss_destroy(ena_dev); + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + return dev_was_up ? ena_open(adapter->netdev) : 0; +} + +static void ena_tx_csum(struct ena_com_tx_ctx *ena_tx_ctx, + struct sk_buff *skb, + bool disable_meta_caching) +{ + u32 mss = skb_shinfo(skb)->gso_size; + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; + u8 l4_protocol = 0; + + if ((skb->ip_summed == CHECKSUM_PARTIAL) || mss) { + ena_tx_ctx->l4_csum_enable = 1; + if (mss) { + ena_tx_ctx->tso_enable = 1; + ena_meta->l4_hdr_len = tcp_hdr(skb)->doff; + ena_tx_ctx->l4_csum_partial = 0; + } else { + ena_tx_ctx->tso_enable = 0; + ena_meta->l4_hdr_len = 0; + ena_tx_ctx->l4_csum_partial = 1; + } + + switch (ip_hdr(skb)->version) { + case IPVERSION: + ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV4; + if (ip_hdr(skb)->frag_off & htons(IP_DF)) + ena_tx_ctx->df = 1; + if (mss) + ena_tx_ctx->l3_csum_enable = 1; + l4_protocol = ip_hdr(skb)->protocol; + break; + case 6: + ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV6; + l4_protocol = ipv6_hdr(skb)->nexthdr; + break; + default: + break; + } + + if (l4_protocol == IPPROTO_TCP) + ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_TCP; + else + ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_UDP; + + ena_meta->mss = mss; + ena_meta->l3_hdr_len = skb_network_header_len(skb); + ena_meta->l3_hdr_offset = skb_network_offset(skb); + ena_tx_ctx->meta_valid = 1; + } else if (disable_meta_caching) { + memset(ena_meta, 0, sizeof(*ena_meta)); + ena_tx_ctx->meta_valid = 1; + } else { + ena_tx_ctx->meta_valid = 0; + } +} + +static int ena_check_and_linearize_skb(struct ena_ring *tx_ring, + struct sk_buff *skb) +{ + int num_frags, header_len, rc; + + num_frags = skb_shinfo(skb)->nr_frags; + header_len = skb_headlen(skb); + + if (num_frags < tx_ring->sgl_size) + return 0; + + if ((num_frags == tx_ring->sgl_size) && + (header_len < tx_ring->tx_max_header_size)) + return 0; + + ena_increase_stat(&tx_ring->tx_stats.linearize, 1, &tx_ring->syncp); + + rc = 
skb_linearize(skb); + if (unlikely(rc)) { + ena_increase_stat(&tx_ring->tx_stats.linearize_failed, 1, + &tx_ring->syncp); + } + + return rc; +} + +static int ena_tx_map_skb(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info, + struct sk_buff *skb, + void **push_hdr, + u16 *header_len) +{ + struct ena_adapter *adapter = tx_ring->adapter; + struct ena_com_buf *ena_buf; + dma_addr_t dma; + u32 skb_head_len, frag_len, last_frag; + u16 push_len = 0; + u16 delta = 0; + int i = 0; + + skb_head_len = skb_headlen(skb); + tx_info->skb = skb; + ena_buf = tx_info->bufs; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* When the device is LLQ mode, the driver will copy + * the header into the device memory space. + * the ena_com layer assume the header is in a linear + * memory space. + * This assumption might be wrong since part of the header + * can be in the fragmented buffers. + * Use skb_header_pointer to make sure the header is in a + * linear memory space. + */ + + push_len = min_t(u32, skb->len, tx_ring->tx_max_header_size); + *push_hdr = skb_header_pointer(skb, 0, push_len, + tx_ring->push_buf_intermediate_buf); + *header_len = push_len; + if (unlikely(skb->data != *push_hdr)) { + ena_increase_stat(&tx_ring->tx_stats.llq_buffer_copy, 1, + &tx_ring->syncp); + + delta = push_len - skb_head_len; + } + } else { + *push_hdr = NULL; + *header_len = min_t(u32, skb_head_len, + tx_ring->tx_max_header_size); + } + + netif_dbg(adapter, tx_queued, adapter->netdev, + "skb: %p header_buf->vaddr: %p push_len: %d\n", skb, + *push_hdr, push_len); + + if (skb_head_len > push_len) { + dma = dma_map_single(tx_ring->dev, skb->data + push_len, + skb_head_len - push_len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + ena_buf->paddr = dma; + ena_buf->len = skb_head_len - push_len; + + ena_buf++; + tx_info->num_of_bufs++; + tx_info->map_linear_data = 1; + } else { + tx_info->map_linear_data = 0; + } + + last_frag = skb_shinfo(skb)->nr_frags; + + for (i = 0; i < last_frag; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + frag_len = skb_frag_size(frag); + + if (unlikely(delta >= frag_len)) { + delta -= frag_len; + continue; + } + + dma = skb_frag_dma_map(tx_ring->dev, frag, delta, + frag_len - delta, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + ena_buf->paddr = dma; + ena_buf->len = frag_len - delta; + ena_buf++; + tx_info->num_of_bufs++; + delta = 0; + } + + return 0; + +error_report_dma_error: + ena_increase_stat(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map skb\n"); + + tx_info->skb = NULL; + + tx_info->num_of_bufs += i; + ena_unmap_tx_buff(tx_ring, tx_info); + + return -EINVAL; +} + +/* Called with netif_tx_lock. 
*/ +static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + struct ena_tx_buffer *tx_info; + struct ena_com_tx_ctx ena_tx_ctx; + struct ena_ring *tx_ring; + struct netdev_queue *txq; + void *push_hdr; + u16 next_to_use, req_id, header_len; + int qid, rc; + + netif_dbg(adapter, tx_queued, dev, "%s skb %p\n", __func__, skb); + /* Determine which tx ring we will be placed on */ + qid = skb_get_queue_mapping(skb); + tx_ring = &adapter->tx_ring[qid]; + txq = netdev_get_tx_queue(dev, qid); + + rc = ena_check_and_linearize_skb(tx_ring, skb); + if (unlikely(rc)) + goto error_drop_packet; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + + WARN(tx_info->skb, "SKB isn't NULL req_id %d\n", req_id); + + rc = ena_tx_map_skb(tx_ring, tx_info, skb, &push_hdr, &header_len); + if (unlikely(rc)) + goto error_drop_packet; + + memset(&ena_tx_ctx, 0x0, sizeof(struct ena_com_tx_ctx)); + ena_tx_ctx.ena_bufs = tx_info->bufs; + ena_tx_ctx.push_header = push_hdr; + ena_tx_ctx.num_bufs = tx_info->num_of_bufs; + ena_tx_ctx.req_id = req_id; + ena_tx_ctx.header_len = header_len; + + /* set flags and meta data */ + ena_tx_csum(&ena_tx_ctx, skb, tx_ring->disable_meta_caching); + + rc = ena_xmit_common(adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + skb->len); + if (rc) + goto error_unmap_dma; + + if (tx_ring->enable_bql) + netdev_tx_sent_queue(txq, skb->len); + + /* stop the queue when no more space available, the packet can have up + * to sgl_size + 2. one for the meta descriptor and one for header + * (if the header is larger than tx_max_header_size). + */ + if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + tx_ring->sgl_size + 2))) { + netif_dbg(adapter, tx_queued, dev, "%s stop queue %d\n", + __func__, qid); + + netif_tx_stop_queue(txq); + ena_increase_stat(&tx_ring->tx_stats.queue_stop, 1, + &tx_ring->syncp); + + /* There is a rare condition where this function decide to + * stop the queue but meanwhile clean_tx_irq updates + * next_to_completion and terminates. + * The queue will remain stopped forever. + * To solve this issue add a mb() to make sure that + * netif_tx_stop_queue() write is vissible before checking if + * there is additional space in the queue. + */ + smp_mb(); + + if (ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH)) { + netif_tx_wake_queue(txq); + ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); + } + } + + skb_tx_timestamp(skb); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +#ifdef HAVE_NETDEV_XMIT_MORE + if (netif_xmit_stopped(txq) || !netdev_xmit_more()) +#else + if (netif_xmit_stopped(txq) || !skb->xmit_more) +#endif /* HAVE_NETDEV_XMIT_MORE */ +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) */ + /* trigger the dma engine. ena_ring_tx_doorbell() + * calls a memory barrier inside it. 
+ */ + ena_ring_tx_doorbell(tx_ring); + + return NETDEV_TX_OK; + +error_unmap_dma: + ena_unmap_tx_buff(tx_ring, tx_info); + tx_info->skb = NULL; + +error_drop_packet: + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +#if defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3 +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) +#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, + select_queue_fallback_t fallback) +#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL +/* Return subqueue id on this core (one per core). */ +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) +#else +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb) +#endif +{ + u16 qid; + /* we suspect that this is good for in--kernel network services that + * want to loop incoming skb rx to tx in normal user generated traffic, + * most probably we will not get to this + */ + if (skb_rx_queue_recorded(skb)) + qid = skb_get_rx_queue(skb); + else +#if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3) + qid = netdev_pick_tx(dev, skb, NULL); +#elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2) + qid = fallback(dev, skb, NULL); +#elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1) + qid = fallback(dev, skb); +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) + qid = __netdev_pick_tx(dev, skb); +#else + qid = skb_tx_hash(dev, skb); +#endif /* HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 */ + + return qid; +} +#ifdef HAVE_SET_RX_MODE + +/* Unicast, Multicast and Promiscuous mode set + * @netdev: network interface device structure + * + * The set_rx_mode entry point is called whenever the unicast or multicast + * address lists or the network interface flags are updated. This routine is + * responsible for configuring the hardware for proper unicast, multicast, + * promiscuous mode, and all-multi behavior. 
+ */ +static void ena_set_rx_mode(struct net_device *netdev) +{ +/* struct ena_adapter *adapter = netdev_priv(netdev); */ + /* TODO set Rx mode */ + + if (netdev->flags & IFF_PROMISC) { + } else if (netdev->flags & IFF_ALLMULTI) { + } else if (netdev_mc_empty(netdev)) { + } else { + } +} +#endif /* HAVE_SET_RX_MODE */ + +static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct ena_admin_host_info *host_info; + ssize_t ret; + int rc; + + /* Allocate only the host info */ + rc = ena_com_allocate_host_info(ena_dev); + if (rc) { + dev_err(dev, "Cannot allocate host info\n"); + return; + } + + host_info = ena_dev->host_attr.host_info; + + host_info->bdf = (pdev->bus->number << 8) | pdev->devfn; + host_info->os_type = ENA_ADMIN_OS_LINUX; + host_info->kernel_ver = LINUX_VERSION_CODE; + ret = strscpy(host_info->kernel_ver_str, utsname()->version, + sizeof(host_info->kernel_ver_str) - 1); + if (ret < 0) + dev_info(dev, + "kernel version string will be truncated, status = %zd\n", ret); + host_info->os_dist = 0; + strncpy(host_info->os_dist_str, utsname()->release, + sizeof(host_info->os_dist_str) - 1); + host_info->driver_version = + (DRV_MODULE_GEN_MAJOR) | + (DRV_MODULE_GEN_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) | + (DRV_MODULE_GEN_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT) | + ("g"[0] << ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT); + host_info->num_cpus = num_online_cpus(); + + host_info->driver_supported_features = + ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | + ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | + ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK | + ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK | + ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK; + + rc = ena_com_set_host_attributes(ena_dev); + if (rc) { + if (rc == -EOPNOTSUPP) + dev_warn(dev, "Cannot set host attributes\n"); + else + dev_err(dev, "Cannot set host attributes\n"); + + goto err; + } + + return; + +err: + ena_com_delete_host_info(ena_dev); +} + +static void ena_config_debug_area(struct ena_adapter *adapter) +{ + u32 debug_area_size; + int rc, ss_count; + + ss_count = ena_get_sset_count(adapter->netdev, ETH_SS_STATS); + if (ss_count <= 0) { + netif_err(adapter, drv, adapter->netdev, + "SS count is negative\n"); + return; + } + + /* allocate 32 bytes for each string and 64bit for the value */ + debug_area_size = ss_count * ETH_GSTRING_LEN + sizeof(u64) * ss_count; + + rc = ena_com_allocate_debug_area(adapter->ena_dev, debug_area_size); + if (rc) { + netif_err(adapter, drv, adapter->netdev, + "Cannot allocate debug area\n"); + return; + } + + rc = ena_com_set_host_attributes(adapter->ena_dev); + if (rc) { + if (rc == -EOPNOTSUPP) + netif_warn(adapter, drv, adapter->netdev, + "Cannot set host attributes\n"); + else + netif_err(adapter, drv, adapter->netdev, + "Cannot set host attributes\n"); + goto err; + } + + return; +err: + ena_com_delete_debug_area(adapter->ena_dev); +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) +#ifdef NDO_GET_STATS_64_V2 +static void ena_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +#else +static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *rx_ring, *tx_ring; + u64 xdp_rx_drops = 0; + unsigned int start; + u64 rx_overruns; + u64 rx_drops; + u64 tx_drops; + int i; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) +#ifdef NDO_GET_STATS_64_V2 + return; 
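
The stats callbacks in this area (ena_get_stats64()/ena_get_stats()) snapshot per-ring counters inside an ena_u64_stats_fetch_begin()/ena_u64_stats_fetch_retry() loop so that a writer updating the counters concurrently is never observed half-way through. A single-threaded sketch of that retry control flow; the sequence handling is deliberately simplified for illustration and is not the kernel's u64_stats implementation:

#include <stdio.h>

struct demo_stats {
	unsigned int seq;		/* even = stable, odd = write in progress */
	unsigned long long packets;
	unsigned long long bytes;
};

static unsigned int fetch_begin(const struct demo_stats *s)
{
	return s->seq;
}

static int fetch_retry(const struct demo_stats *s, unsigned int start)
{
	/* Retry if a write was in progress or completed during the read. */
	return (start & 1) || s->seq != start;
}

int main(void)
{
	struct demo_stats s = { .seq = 2, .packets = 1000, .bytes = 64000 };
	unsigned long long packets, bytes;
	unsigned int start;

	do {
		start = fetch_begin(&s);
		packets = s.packets;
		bytes = s.bytes;
	} while (fetch_retry(&s, start));

	printf("packets=%llu bytes=%llu\n", packets, bytes);
	return 0;
}
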
+#else + return NULL; +#endif + + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + u64 bytes, packets; + + tx_ring = &adapter->tx_ring[i]; + + do { + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); + packets = tx_ring->tx_stats.cnt; + bytes = tx_ring->tx_stats.bytes; + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; + + /* In XDP there isn't an RX queue counterpart */ + if (ENA_IS_XDP_INDEX(adapter, i)) + continue; + + rx_ring = &adapter->rx_ring[i]; + + do { + start = ena_u64_stats_fetch_begin(&rx_ring->syncp); + packets = rx_ring->rx_stats.cnt; + bytes = rx_ring->rx_stats.bytes; + xdp_rx_drops += ena_ring_xdp_drops_cnt(rx_ring); + } while (ena_u64_stats_fetch_retry(&rx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + } + + do { + start = ena_u64_stats_fetch_begin(&adapter->syncp); + rx_drops = adapter->dev_stats.rx_drops; + tx_drops = adapter->dev_stats.tx_drops; + rx_overruns = adapter->dev_stats.rx_overruns; + } while (ena_u64_stats_fetch_retry(&adapter->syncp, start)); + + stats->rx_dropped = rx_drops + xdp_rx_drops; + stats->tx_dropped = tx_drops; + + stats->multicast = 0; + stats->collisions = 0; + + stats->rx_length_errors = 0; + stats->rx_crc_errors = 0; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = 0; + stats->rx_missed_errors = 0; + stats->tx_window_errors = 0; + stats->rx_over_errors = rx_overruns; + + stats->rx_errors = stats->rx_over_errors; + stats->tx_errors = 0; +#ifndef NDO_GET_STATS_64_V2 + return stats; +#endif +} +#else /* kernel > 2.6.36 */ +static struct net_device_stats *ena_get_stats(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *rx_ring, *tx_ring; + unsigned long rx_drops; + struct net_device_stats *stats = &netdev->stats; + unsigned int start; + int i; + + memset(stats, 0, sizeof(*stats)); + for (i = 0; i < adapter->num_io_queues; i++) { + unsigned long bytes, packets; + + tx_ring = &adapter->tx_ring[i]; + do { + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); + packets = (unsigned long)tx_ring->tx_stats.cnt; + bytes = (unsigned long)tx_ring->tx_stats.bytes; + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; + + rx_ring = &adapter->rx_ring[i]; + + do { + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); + packets = (unsigned long)rx_ring->rx_stats.cnt; + bytes = (unsigned long)rx_ring->rx_stats.bytes; + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + } + + do { + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); + rx_drops = (unsigned long)adapter->dev_stats.rx_drops; + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); + + stats->rx_dropped = rx_drops; + + stats->multicast = 0; + stats->collisions = 0; + + stats->rx_length_errors = 0; + stats->rx_crc_errors = 0; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = 0; + stats->rx_missed_errors = 0; + stats->tx_window_errors = 0; + + stats->rx_errors = 0; + stats->tx_errors = 0; + + return stats; +} +#endif +#ifdef ENA_BUSY_POLL_SUPPORT + +#define ENA_BP_NAPI_BUDGET 8 +static int ena_busy_poll(struct napi_struct *napi) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *rx_ring = ena_napi->rx_ring; + struct ena_adapter *adapter= rx_ring->adapter; + int done; + + if (!test_bit(ENA_FLAG_DEV_UP, 
&adapter->flags)) + return LL_FLUSH_FAILED; + + if (!ena_bp_lock_poll(rx_ring)) + return LL_FLUSH_BUSY; + + done = ena_clean_rx_irq(rx_ring, napi, ENA_BP_NAPI_BUDGET); + if (likely(done)) + rx_ring->rx_stats.bp_cleaned += done; + else + rx_ring->rx_stats.bp_missed++; + + ena_bp_unlock_poll(rx_ring); + + return done; +} +#endif + +static const struct net_device_ops ena_netdev_ops = { + .ndo_open = ena_open, + .ndo_stop = ena_close, + .ndo_start_xmit = ena_start_xmit, + .ndo_select_queue = ena_select_queue, +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) + .ndo_get_stats64 = ena_get_stats64, +#else + .ndo_get_stats = ena_get_stats, +#endif +#ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER + .ndo_tx_timeout = ena_tx_timeout, +#else + .ndo_tx_timeout = ena_find_and_timeout_queue, +#endif + .ndo_change_mtu = ena_change_mtu, + .ndo_set_mac_address = NULL, +#ifdef HAVE_SET_RX_MODE + .ndo_set_rx_mode = ena_set_rx_mode, +#endif + .ndo_validate_addr = eth_validate_addr, +#ifdef ENA_BUSY_POLL_SUPPORT + .ndo_busy_poll = ena_busy_poll, +#endif +#ifdef ENA_XDP_SUPPORT + .ndo_bpf = ena_xdp, + .ndo_xdp_xmit = ena_xdp_xmit, +#if defined(ENA_TEST_AF_XDP) && defined(ENA_AF_XDP_SUPPORT) + .ndo_xsk_wakeup = ena_xdp_xsk_wakeup, +#endif /* defined(ENA_TEST_AF_XDP) && defined(ENA_AF_XDP_SUPPORT) */ +#endif /* ENA_XDP_SUPPORT */ +}; + +static int ena_device_validate_params(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct net_device *netdev = adapter->netdev; + int rc; + + rc = ether_addr_equal(get_feat_ctx->dev_attr.mac_addr, + adapter->mac_addr); + if (!rc) { + netif_err(adapter, drv, netdev, + "Error, mac address are different\n"); + return -EINVAL; + } + + if (get_feat_ctx->dev_attr.max_mtu < netdev->mtu) { + netif_err(adapter, drv, netdev, + "Error, device max mtu is smaller than netdev MTU\n"); + return -EINVAL; + } + + return 0; +} + +static void set_default_llq_configurations(struct ena_adapter *adapter, + struct ena_llq_configurations *llq_config, + struct ena_admin_feature_llq_desc *llq) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + + llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; + llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; + llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + + adapter->large_llq_header_supported = + !!(ena_dev->supported_features & (1 << ENA_ADMIN_LLQ)); + adapter->large_llq_header_supported &= + !!(llq->entry_size_ctrl_supported & + ENA_ADMIN_LIST_ENTRY_SIZE_256B); + + if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && + adapter->large_llq_header_enabled) { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; + llq_config->llq_ring_entry_size_value = 256; + } else { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_128B; + llq_config->llq_ring_entry_size_value = 128; + } +} + +static int ena_set_queues_placement_policy(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq, + struct ena_llq_configurations *llq_default_configurations) +{ + int rc; + u32 llq_feature_mask; + + llq_feature_mask = 1 << ENA_ADMIN_LLQ; + if (!(ena_dev->supported_features & llq_feature_mask)) { + dev_warn(&pdev->dev, + "LLQ is not supported Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + if (!ena_dev->mem_bar) { + netdev_err(ena_dev->net_device, + "LLQ is advertised as supported but device doesn't expose mem bar\n"); + 
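+		/* No LLQ memory BAR was mapped, so fall back to host-memory
+		 * (DMA) TX descriptor placement instead of failing the probe.
+		 */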
ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); + if (unlikely(rc)) { + dev_err(&pdev->dev, + "Failed to configure the device mode. Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + } + + return 0; +} + +static int ena_map_llq_mem_bar(struct pci_dev *pdev, struct ena_com_dev *ena_dev, + int bars) +{ + bool has_mem_bar = !!(bars & BIT(ENA_MEM_BAR)); + + if (!has_mem_bar) + return 0; + + ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, + pci_resource_start(pdev, ENA_MEM_BAR), + pci_resource_len(pdev, ENA_MEM_BAR)); + + if (!ena_dev->mem_bar) + return -EFAULT; + + return 0; +} + +static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, + struct ena_com_dev_get_features_ctx *get_feat_ctx, + bool *wd_state) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; + struct ena_llq_configurations llq_config; + netdev_features_t prev_netdev_features; + struct device *dev = &pdev->dev; + bool readless_supported; + u32 aenq_groups; + int dma_width; + int rc; + + rc = ena_com_mmio_reg_read_request_init(ena_dev); + if (rc) { + dev_err(dev, "Failed to init mmio read less\n"); + return rc; + } + + /* The PCIe configuration space revision id indicate if mmio reg + * read is disabled + */ + readless_supported = !(pdev->revision & ENA_MMIO_DISABLE_REG_READ); + ena_com_set_mmio_read_mode(ena_dev, readless_supported); + + rc = ena_com_dev_reset(ena_dev, ENA_REGS_RESET_NORMAL); + if (rc) { + dev_err(dev, "Can not reset device\n"); + goto err_mmio_read_less; + } + + rc = ena_com_validate_version(ena_dev); + if (rc) { + dev_err(dev, "Device version is too low\n"); + goto err_mmio_read_less; + } + + dma_width = ena_com_get_dma_width(ena_dev); + if (dma_width < 0) { + dev_err(dev, "Invalid dma width value %d", dma_width); + rc = dma_width; + goto err_mmio_read_less; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) + rc = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "dma_set_mask_and_coherent failed %d\n", rc); + goto err_mmio_read_less; + } +#else + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "pci_set_dma_mask failed %d\n", rc); + goto err_mmio_read_less; + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "err_pci_set_consistent_dma_mask failed %d\n", + rc); + goto err_mmio_read_less; + } +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + + ena_devlink_params_get(adapter->devlink); + + /* ENA admin level init */ + rc = ena_com_admin_init(ena_dev, &aenq_handlers); + if (rc) { + dev_err(dev, + "Can not initialize ena admin queue with device\n"); + goto err_mmio_read_less; + } + + /* To enable the msix interrupts the driver needs to know the number + * of queues. 
So the driver uses polling mode to retrieve this + * information + */ + ena_com_set_admin_polling_mode(ena_dev, true); + + ena_config_host_info(ena_dev, pdev); + + /* Get Device Attributes*/ + rc = ena_com_get_dev_attr_feat(ena_dev, get_feat_ctx); + if (rc) { + dev_err(dev, "Cannot get attribute for ena device rc=%d\n", rc); + goto err_admin_init; + } + + /* Try to turn all the available aenq groups */ + aenq_groups = BIT(ENA_ADMIN_LINK_CHANGE) | + BIT(ENA_ADMIN_FATAL_ERROR) | + BIT(ENA_ADMIN_WARNING) | + BIT(ENA_ADMIN_NOTIFICATION) | + BIT(ENA_ADMIN_KEEP_ALIVE); + + aenq_groups &= get_feat_ctx->aenq.supported_groups; + + rc = ena_com_set_aenq_config(ena_dev, aenq_groups); + if (rc) { + dev_err(dev, "Cannot configure aenq groups rc= %d\n", rc); + goto err_admin_init; + } + + *wd_state = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE)); + + set_default_llq_configurations(adapter, &llq_config, &get_feat_ctx->llq); + + rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq, + &llq_config); + if (rc) { + netdev_err(netdev, "Cannot set queues placement policy rc= %d\n", rc); + goto err_admin_init; + } + + rc = ena_calc_io_queue_size(adapter, get_feat_ctx); + if (unlikely(rc)) + goto err_admin_init; + + /* Turned on features shouldn't change due to reset. */ + prev_netdev_features = adapter->netdev->features; + ena_set_dev_offloads(get_feat_ctx, adapter->netdev); + adapter->netdev->features = prev_netdev_features; + + rc = ena_phc_init(adapter); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + netdev_err(netdev, "Failed initiating PHC, error: %d\n", rc); + goto err_admin_init; + } + + return 0; + +err_admin_init: + ena_com_abort_admin_commands(ena_dev); + ena_com_wait_for_abort_completion(ena_dev); + ena_com_delete_host_info(ena_dev); + ena_com_admin_destroy(ena_dev); +err_mmio_read_less: + ena_com_mmio_reg_read_request_destroy(ena_dev); + + return rc; +} + +static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct device *dev = &adapter->pdev->dev; + int rc; + + rc = ena_enable_msix(adapter); + if (rc) { + dev_err(dev, "Can not reserve msix vectors\n"); + return rc; + } + + ena_setup_mgmnt_intr(adapter); + + rc = ena_request_mgmnt_irq(adapter); + if (rc) { + dev_err(dev, "Can not setup management interrupts\n"); + goto err_disable_msix; + } + + ena_com_set_admin_polling_mode(ena_dev, false); + + ena_com_admin_aenq_enable(ena_dev); + + return 0; + +err_disable_msix: + ena_disable_msix(adapter); + + return rc; +} + +int ena_destroy_device(struct ena_adapter *adapter, bool graceful) +{ + struct net_device *netdev = adapter->netdev; + struct ena_com_dev *ena_dev = adapter->ena_dev; + bool dev_up; + int rc = 0; + + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + return 0; + + netif_carrier_off(netdev); + + del_timer_sync(&adapter->timer_service); + + dev_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + adapter->dev_up_before_reset = dev_up; + ena_sysfs_terminate(&adapter->pdev->dev); + if (!graceful) + ena_com_set_admin_running_state(ena_dev, false); + + if (dev_up) + ena_down(adapter); + + /* Stop the device from sending AENQ events (in case reset flag is set + * and device is up, ena_down() already reset the device. 
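+	 * If that is not the case, issue an explicit device reset below so the
+	 * device stops generating events before the admin queue and MSI-X
+	 * resources are torn down.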
+ */ + if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up)) + rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + + ena_free_mgmnt_irq(adapter); + + ena_disable_msix(adapter); + + ena_com_abort_admin_commands(ena_dev); + + ena_com_wait_for_abort_completion(ena_dev); + + ena_com_admin_destroy(ena_dev); + + ena_phc_destroy(adapter); + + ena_com_mmio_reg_read_request_destroy(ena_dev); + + /* return reset reason to default value */ + adapter->reset_reason = ENA_REGS_RESET_NORMAL; + + clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + return rc; +} + +int ena_restore_device(struct ena_adapter *adapter) +{ + struct ena_com_dev_get_features_ctx get_feat_ctx; + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct pci_dev *pdev = adapter->pdev; + struct ena_ring *txr; + int rc, count, i; + bool wd_state; + + set_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + rc = ena_device_init(adapter, adapter->pdev, &get_feat_ctx, &wd_state); + if (rc) { + dev_err(&pdev->dev, "Can not initialize device\n"); + goto err; + } + adapter->wd_state = wd_state; + + count = adapter->xdp_num_queues + adapter->num_io_queues; + for (i = 0 ; i < count; i++) { + txr = &adapter->tx_ring[i]; + txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + txr->tx_max_header_size = ena_dev->tx_max_header_size; + } + + rc = ena_device_validate_params(adapter, &get_feat_ctx); + if (rc) { + dev_err(&pdev->dev, "Validation of device parameters failed\n"); + goto err_device_destroy; + } + + rc = ena_enable_msix_and_set_admin_interrupts(adapter); + if (rc) { + dev_err(&pdev->dev, "Enable MSI-X failed\n"); + goto err_device_destroy; + } + rc = ena_sysfs_init(&pdev->dev); + if (rc) { + dev_err(&pdev->dev, "Cannot initialize sysfs\n"); + goto err_disable_msix; + } + /* If the interface was up before the reset bring it up */ + if (adapter->dev_up_before_reset) { + rc = ena_up(adapter); + if (rc) { + dev_err(&pdev->dev, "Failed to create I/O queues\n"); + goto err_sysfs_terminate; + } + } + + set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + adapter->last_keep_alive_jiffies = jiffies; + + return rc; +err_sysfs_terminate: + ena_sysfs_terminate(&pdev->dev); +err_disable_msix: + ena_free_mgmnt_irq(adapter); + ena_disable_msix(adapter); +err_device_destroy: + ena_com_abort_admin_commands(ena_dev); + ena_com_wait_for_abort_completion(ena_dev); + ena_com_admin_destroy(ena_dev); + ena_com_dev_reset(ena_dev, ENA_REGS_RESET_DRIVER_INVALID_STATE); + ena_phc_destroy(adapter); + ena_com_mmio_reg_read_request_destroy(ena_dev); +err: + clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + dev_err(&pdev->dev, + "Reset attempt failed. 
Can not reset the device\n"); + + return rc; +} + +static void ena_fw_reset_device(struct work_struct *work) +{ + int rc = 0; + + struct ena_adapter *adapter = + container_of(work, struct ena_adapter, reset_task); + + rtnl_lock(); + + if (likely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + rc |= ena_destroy_device(adapter, false); + rc |= ena_restore_device(adapter); + adapter->dev_stats.reset_fail += !!rc; + + dev_err(&adapter->pdev->dev, + "Device reset completed successfully, Driver info: %s\n", + version); + } + + rtnl_unlock(); +} + +static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, + struct ena_ring *rx_ring) +{ + struct ena_napi *ena_napi = container_of(rx_ring->napi, struct ena_napi, napi); + + if (likely(READ_ONCE(ena_napi->first_interrupt))) + return 0; + + if (ena_com_cq_empty(rx_ring->ena_com_io_cq)) + return 0; + + rx_ring->no_interrupt_event_cnt++; + + if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) { + netif_err(adapter, rx_err, adapter->netdev, + "Potential MSIX issue on Rx side Queue = %d. Reset the device\n", + rx_ring->qid); + + ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); + return -EIO; + } + + return 0; +} + +static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, + struct ena_ring *tx_ring) +{ + struct ena_napi *ena_napi = container_of(tx_ring->napi, struct ena_napi, napi); + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL; + unsigned int time_since_last_napi; + unsigned int missing_tx_comp_to; + bool is_tx_comp_time_expired; + struct ena_tx_buffer *tx_buf; + unsigned long last_jiffies; + int napi_scheduled; + u32 missed_tx = 0; + int i, rc = 0; + + missing_tx_comp_to = jiffies_to_msecs(adapter->missing_tx_completion_to); + + for (i = 0; i < tx_ring->ring_size; i++) { + tx_buf = &tx_ring->tx_buffer_info[i]; + last_jiffies = tx_buf->last_jiffies; + + if (last_jiffies == 0) + /* no pending Tx at this location */ + continue; + + is_tx_comp_time_expired = time_is_before_jiffies(last_jiffies + + 2 * adapter->missing_tx_completion_to); + + if (unlikely(!READ_ONCE(ena_napi->first_interrupt) && is_tx_comp_time_expired)) { + /* If after graceful period interrupt is still not + * received, we schedule a reset + */ + netif_err(adapter, tx_err, adapter->netdev, + "Potential MSIX issue on Tx side Queue = %d. Reset the device\n", + tx_ring->qid); + ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); + return -EIO; + } + + is_tx_comp_time_expired = time_is_before_jiffies(last_jiffies + + adapter->missing_tx_completion_to); + + if (unlikely(is_tx_comp_time_expired)) { + + time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); + napi_scheduled = !!(ena_napi->napi.state & NAPIF_STATE_SCHED); + + if (missing_tx_comp_to < time_since_last_napi && napi_scheduled) { + /* We suspect napi isn't called because the + * bottom half is not run. Require a bigger + * timeout for these cases + */ + if (!time_is_before_jiffies(last_jiffies + + 2 * adapter->missing_tx_completion_to)) + continue; + + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + } + + if (tx_buf->print_once) + continue; + + netif_notice(adapter, tx_err, adapter->netdev, + "TX hasn't completed, qid %d, index %d. 
%u usecs from last napi execution, napi scheduled: %d\n", + tx_ring->qid, i, time_since_last_napi, napi_scheduled); + + missed_tx++; + tx_buf->print_once = 1; + } + } + + if (unlikely(missed_tx > adapter->missing_tx_completion_threshold)) { + netif_err(adapter, tx_err, adapter->netdev, + "Lost TX completions are above the threshold (%d > %d). Completion transmission timeout: %u.\n", + missed_tx, + adapter->missing_tx_completion_threshold, + missing_tx_comp_to); + netif_err(adapter, tx_err, adapter->netdev, + "Resetting the device\n"); + + ena_reset_device(adapter, reset_reason); + rc = -EIO; + } + + ena_increase_stat(&tx_ring->tx_stats.missed_tx, missed_tx, + &tx_ring->syncp); + + return rc; +} + +static void check_for_missing_completions(struct ena_adapter *adapter) +{ + struct ena_ring *tx_ring; + struct ena_ring *rx_ring; + int i, budget, rc; + int io_queue_count; + + io_queue_count = adapter->xdp_num_queues + adapter->num_io_queues; + /* Make sure the driver doesn't turn the device in other process */ + smp_rmb(); + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return; + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT) + return; + + budget = ENA_MONITORED_TX_QUEUES; + + for (i = adapter->last_monitored_tx_qid; i < io_queue_count; i++) { + tx_ring = &adapter->tx_ring[i]; + rx_ring = &adapter->rx_ring[i]; + + rc = check_missing_comp_in_tx_queue(adapter, tx_ring); + if (unlikely(rc)) + return; + + rc = !ENA_IS_XDP_INDEX(adapter, i) ? + check_for_rx_interrupt_queue(adapter, rx_ring) : 0; + if (unlikely(rc)) + return; + + budget--; + if (!budget) + break; + } + + adapter->last_monitored_tx_qid = i % io_queue_count; +} + +/* trigger napi schedule after 2 consecutive detections */ +#define EMPTY_RX_REFILL 2 +/* For the rare case where the device runs out of Rx descriptors and the + * napi handler failed to refill new Rx descriptors (due to a lack of memory + * for example). + * This case will lead to a deadlock: + * The device won't send interrupts since all the new Rx packets will be dropped + * The napi handler won't allocate new Rx descriptors so the device will be + * able to send new packets. + * + * This scenario can happen when the kernel's vm.min_free_kbytes is too small. + * It is recommended to have at least 512MB, with a minimum of 128MB for + * constrained environment). 
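+ * (The current value can be inspected or raised at runtime, e.g. via
+ * "sysctl vm.min_free_kbytes" or /proc/sys/vm/min_free_kbytes.)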
+ * + * When such a situation is detected - Reschedule napi + */ +static void check_for_empty_rx_ring(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, refill_required; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return; + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + + /* If using UMEM, app might not provide RX buffers and the ring + * can be empty + */ + if (ENA_IS_XSK_RING(rx_ring)) + continue; + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + if (unlikely(refill_required == (rx_ring->ring_size - 1))) { + rx_ring->empty_rx_queue++; + + if (rx_ring->empty_rx_queue >= EMPTY_RX_REFILL) { + ena_increase_stat(&rx_ring->rx_stats.empty_rx_ring, 1, + &rx_ring->syncp); + + netif_err(adapter, drv, adapter->netdev, + "Trigger refill for ring %d\n", i); + + napi_schedule(rx_ring->napi); + rx_ring->empty_rx_queue = 0; + } + } else { + rx_ring->empty_rx_queue = 0; + } + } +} + +/* Check for keep alive expiration */ +static void check_for_missing_keep_alive(struct ena_adapter *adapter) +{ + unsigned long keep_alive_expired; + + if (!adapter->wd_state) + return; + + if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT) + return; + + keep_alive_expired = adapter->last_keep_alive_jiffies + + adapter->keep_alive_timeout; + if (unlikely(time_is_before_jiffies(keep_alive_expired))) { + netif_err(adapter, drv, adapter->netdev, + "Keep alive watchdog timeout.\n"); + ena_increase_stat(&adapter->dev_stats.wd_expired, 1, + &adapter->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_KEEP_ALIVE_TO); + } +} + +static void check_for_admin_com_state(struct ena_adapter *adapter) +{ + if (unlikely(!ena_com_get_admin_running_state(adapter->ena_dev))) { + netif_err(adapter, drv, adapter->netdev, + "ENA admin queue is not in running state!\n"); + ena_increase_stat(&adapter->dev_stats.admin_q_pause, 1, + &adapter->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_ADMIN_TO); + } +} + +static void ena_update_hints(struct ena_adapter *adapter, + struct ena_admin_ena_hw_hints *hints) +{ + struct net_device *netdev = adapter->netdev; + + if (hints->admin_completion_tx_timeout) + adapter->ena_dev->admin_queue.completion_timeout = + hints->admin_completion_tx_timeout * 1000; + + if (hints->mmio_read_timeout) + /* convert to usec */ + adapter->ena_dev->mmio_read.reg_read_to = + hints->mmio_read_timeout * 1000; + + if (hints->missed_tx_completion_count_threshold_to_reset) + adapter->missing_tx_completion_threshold = + hints->missed_tx_completion_count_threshold_to_reset; + + if (hints->missing_tx_completion_timeout) { + if (hints->missing_tx_completion_timeout == ENA_HW_HINTS_NO_TIMEOUT) + adapter->missing_tx_completion_to = ENA_HW_HINTS_NO_TIMEOUT; + else + adapter->missing_tx_completion_to = + msecs_to_jiffies(hints->missing_tx_completion_timeout); + } + + if (hints->netdev_wd_timeout) + netdev->watchdog_timeo = msecs_to_jiffies(hints->netdev_wd_timeout); + + if (hints->driver_watchdog_timeout) { + if (hints->driver_watchdog_timeout == ENA_HW_HINTS_NO_TIMEOUT) + adapter->keep_alive_timeout = ENA_HW_HINTS_NO_TIMEOUT; + else + adapter->keep_alive_timeout = + msecs_to_jiffies(hints->driver_watchdog_timeout); + } +} + +static void ena_update_host_info(struct ena_admin_host_info *host_info, + struct net_device *netdev) +{ + host_info->supported_network_features[0] = + netdev->features & GENMASK_ULL(31, 0); + host_info->supported_network_features[1] = + (netdev->features 
& GENMASK_ULL(63, 32)) >> 32; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +static void ena_timer_service(struct timer_list *t) +{ + struct ena_adapter *adapter = from_timer(adapter, t, timer_service); +#else +static void ena_timer_service(unsigned long data) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; +#endif + u8 *debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr; + struct ena_admin_host_info *host_info = + adapter->ena_dev->host_attr.host_info; + + check_for_missing_keep_alive(adapter); + + check_for_admin_com_state(adapter); + + check_for_missing_completions(adapter); + + check_for_empty_rx_ring(adapter); + + if (debug_area) + ena_dump_stats_to_buf(adapter, debug_area); + + if (host_info) + ena_update_host_info(host_info, adapter->netdev); + + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + /* We don't destroy driver resources if we're not able to + * communicate with the device. Failure in validating the + * version implies unresponsive device. + */ + if (ena_com_validate_version(adapter->ena_dev) == -ETIME) { + netif_err(adapter, drv, adapter->netdev, + "FW isn't responsive, skipping reset routine\n"); + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + return; + } + + netif_err(adapter, drv, adapter->netdev, + "Trigger reset is on\n"); + + if (adapter->reset_reason != ENA_REGS_RESET_NORMAL) + ena_dump_stats_to_dmesg(adapter); + + queue_work(ena_wq, &adapter->reset_task); + return; + } + + /* Reset the timer */ + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); +} + +static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + u32 io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &get_feat_ctx->max_queue_ext.max_queue_ext; + io_rx_num = min_t(u32, max_queue_ext->max_rx_sq_num, + max_queue_ext->max_rx_cq_num); + + io_tx_sq_num = max_queue_ext->max_tx_sq_num; + io_tx_cq_num = max_queue_ext->max_tx_cq_num; + } else { + struct ena_admin_queue_feature_desc *max_queues = + &get_feat_ctx->max_queues; + io_tx_sq_num = max_queues->max_sq_num; + io_tx_cq_num = max_queues->max_cq_num; + io_rx_num = min_t(u32, io_tx_sq_num, io_tx_cq_num); + } + + /* In case of LLQ use the llq fields for the tx SQ/CQ */ + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + io_tx_sq_num = get_feat_ctx->llq.max_llq_num; + + max_num_io_queues = min_t(u32, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES); + max_num_io_queues = min_t(u32, max_num_io_queues, io_rx_num); + max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_sq_num); + max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_cq_num); + /* 1 IRQ for mgmnt and 1 IRQs for each IO direction */ + max_num_io_queues = min_t(u32, max_num_io_queues, pci_msix_vec_count(pdev) - 1); + + return max_num_io_queues; +} + +static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, + struct net_device *netdev) +{ + netdev_features_t dev_features = 0; + + /* Set offload features */ + if (feat->offload.tx & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK) + dev_features |= NETIF_F_IP_CSUM; + + if (feat->offload.tx & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK) + dev_features |= NETIF_F_IPV6_CSUM; + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK) + dev_features |= NETIF_F_TSO; + 
+ if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK) + dev_features |= NETIF_F_TSO6; + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK) + dev_features |= NETIF_F_TSO_ECN; + + if (feat->offload.rx_supported & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK) + dev_features |= NETIF_F_RXCSUM; + + if (feat->offload.rx_supported & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK) + dev_features |= NETIF_F_RXCSUM; + + netdev->features = + dev_features | + NETIF_F_SG | +#ifdef NETIF_F_RXHASH + NETIF_F_RXHASH | +#endif /* NETIF_F_RXHASH */ + NETIF_F_HIGHDMA; + +#ifdef HAVE_RHEL6_NET_DEVICE_OPS_EXT + do { + u32 hw_features = get_netdev_hw_features(netdev); + hw_features |= netdev->features; + set_netdev_hw_features(netdev, hw_features); + } while (0); +#else + netdev->hw_features |= netdev->features; +#endif + netdev->vlan_features |= netdev->features; +} + +static void ena_set_conf_feat_params(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *feat) +{ + struct net_device *netdev = adapter->netdev; + + /* Copy mac address */ + if (!is_valid_ether_addr(feat->dev_attr.mac_addr)) { + eth_hw_addr_random(netdev); + ether_addr_copy(adapter->mac_addr, netdev->dev_addr); + } else { + ether_addr_copy(adapter->mac_addr, feat->dev_attr.mac_addr); + eth_hw_addr_set(netdev, adapter->mac_addr); + } + + /* Set offload features */ + ena_set_dev_offloads(feat, netdev); + + adapter->max_mtu = feat->dev_attr.max_mtu; +#ifdef HAVE_MTU_MIN_MAX_IN_NET_DEVICE + netdev->max_mtu = adapter->max_mtu; + netdev->min_mtu = ENA_MIN_MTU; +#endif +} + +static int ena_rss_init_default(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct device *dev = &adapter->pdev->dev; + int rc, i; + u32 val; + + rc = ena_com_rss_init(ena_dev, ENA_RX_RSS_TABLE_LOG_SIZE); + if (unlikely(rc)) { + dev_err(dev, "Cannot init indirect table\n"); + goto err_rss_init; + } + + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + val = ethtool_rxfh_indir_default(i, adapter->num_io_queues); + rc = ena_com_indirect_table_fill_entry(ena_dev, i, + ENA_IO_RXQ_IDX(val)); + if (unlikely(rc)) { + dev_err(dev, "Cannot fill indirect table\n"); + goto err_fill_indir; + } + } + + rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ, NULL, + ENA_HASH_KEY_SIZE, 0xFFFFFFFF); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(dev, "Cannot fill hash function\n"); + goto err_fill_indir; + } + + rc = ena_com_set_default_hash_ctrl(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(dev, "Cannot fill hash control\n"); + goto err_fill_indir; + } + + return 0; + +err_fill_indir: + ena_com_rss_destroy(ena_dev); +err_rss_init: + + return rc; +} + +static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev) +{ + int release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; + + pci_release_selected_regions(pdev, release_bars); +} + + +static int ena_calc_io_queue_size(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct ena_admin_feature_llq_desc *llq = &get_feat_ctx->llq; + struct ena_com_dev *ena_dev = adapter->ena_dev; + u32 tx_queue_size = ENA_DEFAULT_RING_SIZE; + bool tx_configured, rx_configured; + u32 max_tx_queue_size; + u32 max_rx_queue_size; + + /* If this function is called after driver load, the ring sizes have + * already been configured. Take it into account when recalculating ring + * size. 
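+	 * (This happens, for instance, when ena_restore_device() re-runs
+	 * ena_device_init() after a reset: the previously requested ring sizes
+	 * are kept and only re-clamped against the device limits below.)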
+ */ + tx_configured = !!adapter->tx_ring[0].ring_size; + rx_configured = !!adapter->rx_ring[0].ring_size; + tx_queue_size = tx_configured ? adapter->tx_ring[0].ring_size : tx_queue_size; + rx_queue_size = rx_configured ? adapter->rx_ring[0].ring_size : rx_queue_size; + + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &get_feat_ctx->max_queue_ext.max_queue_ext; + max_rx_queue_size = min_t(u32, max_queue_ext->max_rx_cq_depth, + max_queue_ext->max_rx_sq_depth); + max_tx_queue_size = max_queue_ext->max_tx_cq_depth; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); + else + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queue_ext->max_tx_sq_depth); + + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_tx_descs); + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_rx_descs); + } else { + struct ena_admin_queue_feature_desc *max_queues = + &get_feat_ctx->max_queues; + max_rx_queue_size = min_t(u32, max_queues->max_cq_depth, + max_queues->max_sq_depth); + max_tx_queue_size = max_queues->max_cq_depth; + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); + else + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queues->max_sq_depth); + + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_tx_descs); + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_rx_descs); + } + + max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); + max_rx_queue_size = rounddown_pow_of_two(max_rx_queue_size); + + if (max_tx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max TX queue size: %d < minimum: %d\n", + max_tx_queue_size, ENA_MIN_RING_SIZE); + return -EFAULT; + } + + if (max_rx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max RX queue size: %d < minimum: %d\n", + max_rx_queue_size, ENA_MIN_RING_SIZE); + return -EFAULT; + } + + /* When forcing large headers, we multiply the entry size by 2, + * and therefore divide the queue size by 2, leaving the amount + * of memory used by the queues unchanged. 
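+	 * The halving below is applied only when the device supports 256B LLQ
+	 * entries and LLQ (device-memory) placement is in use; otherwise the
+	 * large-headers request is rejected and the devlink parameter is
+	 * cleared again.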
+ */ + if (adapter->large_llq_header_enabled) { + if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && + (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)) { + max_tx_queue_size /= 2; + dev_info(&adapter->pdev->dev, + "Forcing large headers and decreasing maximum TX queue size to %d\n", + max_tx_queue_size); + } else { + dev_err(&adapter->pdev->dev, + "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + + adapter->large_llq_header_enabled = false; + ena_devlink_disable_large_llq_header_param(adapter->devlink); + } + } + + tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE, + max_tx_queue_size); + rx_queue_size = clamp_val(rx_queue_size, ENA_MIN_RING_SIZE, + max_rx_queue_size); + + tx_queue_size = rounddown_pow_of_two(tx_queue_size); + rx_queue_size = rounddown_pow_of_two(rx_queue_size); + + adapter->max_tx_ring_size = max_tx_queue_size; + adapter->max_rx_ring_size = max_rx_queue_size; + adapter->requested_tx_ring_size = tx_queue_size; + adapter->requested_rx_ring_size = rx_queue_size; + + return 0; +} + +/* ena_probe - Device Initialization Routine + * @pdev: PCI device information struct + * @ent: entry in ena_pci_tbl + * + * Returns 0 on success, negative on failure + * + * ena_probe initializes an adapter identified by a pci_dev structure. + * The OS initialization, configuring of the adapter private structure, + * and a hardware reset occur. + */ +static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct ena_com_dev_get_features_ctx get_feat_ctx; + struct ena_com_dev *ena_dev = NULL; + struct ena_adapter *adapter; + struct net_device *netdev; + static int adapters_found; + struct devlink *devlink; + u32 max_num_io_queues; + bool wd_state; + int bars, rc; + + dev_dbg(&pdev->dev, "%s\n", __func__); + + dev_info_once(&pdev->dev, "%s", version); + + rc = pci_enable_device_mem(pdev); + if (rc) { + dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n"); + return rc; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", rc); + goto err_disable_device; + } +#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "pci_set_dma_mask failed %d\n", rc); + goto err_disable_device; + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "err_pci_set_consistent_dma_mask failed %d\n", + rc); + goto err_disable_device; + } +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + + pci_set_master(pdev); + + ena_dev = vzalloc(sizeof(*ena_dev)); + if (!ena_dev) { + rc = -ENOMEM; + goto err_disable_device; + } + + bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; + rc = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (rc) { + dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", + rc); + goto err_free_ena_dev; + } + + ena_dev->reg_bar = devm_ioremap(&pdev->dev, + pci_resource_start(pdev, ENA_REG_BAR), + pci_resource_len(pdev, ENA_REG_BAR)); + if (!ena_dev->reg_bar) { + dev_err(&pdev->dev, "Failed to remap regs bar\n"); + rc = -EFAULT; + goto err_free_region; + } + + ena_dev->ena_min_poll_delay_us = ENA_ADMIN_POLL_DELAY_US; + + ena_dev->dmadev = &pdev->dev; + + netdev = alloc_etherdev_mq(sizeof(struct 
ena_adapter), ENA_MAX_RINGS); + if (!netdev) { + dev_err(&pdev->dev, "alloc_etherdev_mq failed\n"); + rc = -ENOMEM; + goto err_free_region; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + adapter = netdev_priv(netdev); + adapter->ena_dev = ena_dev; + adapter->netdev = netdev; + adapter->pdev = pdev; + adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); + + ena_dev->net_device = netdev; + + pci_set_drvdata(pdev, adapter); + + rc = ena_phc_alloc(adapter); + if (rc) { + netdev_err(netdev, "ena_phc_alloc failed\n"); + goto err_netdev_destroy; + } + + adapter->large_llq_header_enabled = !!force_large_llq_header; + +#ifdef ENA_PHC_SUPPORT + ena_phc_enable(adapter, !!phc_enable); + +#endif /* ENA_PHC_SUPPORT */ + rc = ena_com_allocate_customer_metrics_buffer(ena_dev); + if (rc) { + netdev_err(netdev, "ena_com_allocate_customer_metrics_buffer failed\n"); + goto err_free_phc; + } + + devlink = ena_devlink_alloc(adapter); + if (!devlink) { + netdev_err(netdev, "ena_devlink_alloc failed\n"); + goto err_metrics_destroy; + } + + rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); + if (rc) { + dev_err(&pdev->dev, "ENA LLQ bar mapping failed\n"); + goto err_devlink_destroy; + } + + rc = ena_device_init(adapter, pdev, &get_feat_ctx, &wd_state); + if (rc) { + dev_err(&pdev->dev, "ENA device init failed\n"); + if (rc == -ETIME) + rc = -EPROBE_DEFER; + goto err_devlink_destroy; + } + + /* Initial TX and RX interrupt delay. Assumes 1 usec granularity. + * Updated during device initialization with the real granularity + */ + ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS; + ena_dev->intr_moder_rx_interval = ENA_INTR_INITIAL_RX_INTERVAL_USECS; + ena_dev->intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; + max_num_io_queues = ena_calc_max_io_queue_num(pdev, ena_dev, &get_feat_ctx); + if (unlikely(!max_num_io_queues)) { + rc = -EFAULT; + goto err_device_destroy; + } + + ena_set_conf_feat_params(adapter, &get_feat_ctx); + + adapter->reset_reason = ENA_REGS_RESET_NORMAL; + + adapter->num_io_queues = clamp_val(num_io_queues, ENA_MIN_NUM_IO_QUEUES, + max_num_io_queues); + adapter->used_lpc_size = lpc_size; + /* When LPC is enabled after driver load, the configured_lpc_size is + * used. Leaving it as 0, wouldn't change LPC state so we set it to + * different value + */ + adapter->configured_lpc_size = lpc_size ? 
: ENA_LPC_DEFAULT_MULTIPLIER; + adapter->max_num_io_queues = max_num_io_queues; + adapter->last_monitored_tx_qid = 0; + + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + + adapter->rx_copybreak = ENA_DEFAULT_RX_COPYBREAK; + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + adapter->disable_meta_caching = + !!(get_feat_ctx.llq.accel_mode.u.get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING)); + + adapter->wd_state = wd_state; + + snprintf(adapter->name, ENA_NAME_MAX_LEN, "ena_%d", adapters_found); + + rc = ena_com_init_interrupt_moderation(adapter->ena_dev); + if (rc) { + dev_err(&pdev->dev, + "Failed to query interrupt moderation feature\n"); + goto err_device_destroy; + } + + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + + netdev->netdev_ops = &ena_netdev_ops; + netdev->watchdog_timeo = TX_TIMEOUT; + ena_set_ethtool_ops(netdev); + +#if defined(NETIF_F_MQ_TX_LOCK_OPT) + netdev->features &= ~NETIF_F_MQ_TX_LOCK_OPT; +#endif /* defined(NETIF_F_MQ_TX_LOCK_OPT) */ +#ifdef IFF_UNICAST_FLT + netdev->priv_flags |= IFF_UNICAST_FLT; +#endif /* IFF_UNICAST_FLT */ + + u64_stats_init(&adapter->syncp); + + rc = ena_enable_msix_and_set_admin_interrupts(adapter); + if (rc) { + dev_err(&pdev->dev, + "Failed to enable and set the admin interrupts\n"); + goto err_worker_destroy; + } + rc = ena_sysfs_init(&adapter->pdev->dev); + if (rc) { + dev_err(&pdev->dev, "Cannot init sysfs\n"); + goto err_free_msix; + } + rc = ena_rss_init_default(adapter); + if (rc && (rc != -EOPNOTSUPP)) { + dev_err(&pdev->dev, "Cannot init RSS rc: %d\n", rc); + goto err_terminate_sysfs; + } + + ena_config_debug_area(adapter); + + memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); + + netif_carrier_off(netdev); + + rc = register_netdev(netdev); + if (rc) { + dev_err(&pdev->dev, "Cannot register net device\n"); + goto err_rss; + } + + INIT_WORK(&adapter->reset_task, ena_fw_reset_device); + + adapter->last_keep_alive_jiffies = jiffies; + adapter->keep_alive_timeout = ENA_DEVICE_KALIVE_TIMEOUT; + adapter->missing_tx_completion_to = TX_TIMEOUT; + adapter->missing_tx_completion_threshold = MAX_NUM_OF_TIMEOUTED_PACKETS; + + ena_update_hints(adapter, &get_feat_ctx.hw_hints); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + timer_setup(&adapter->timer_service, ena_timer_service, 0); +#else + setup_timer(&adapter->timer_service, ena_timer_service, + (unsigned long)adapter); +#endif + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + + dev_info(&pdev->dev, + "%s found at mem %lx, mac addr %pM\n", + DEVICE_NAME, (long)pci_resource_start(pdev, 0), + netdev->dev_addr); + + set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + adapters_found++; + + ena_devlink_register(devlink, &pdev->dev); + + return 0; + +err_rss: + ena_com_delete_debug_area(ena_dev); + ena_com_rss_destroy(ena_dev); +err_terminate_sysfs: + ena_sysfs_terminate(&pdev->dev); +err_free_msix: + ena_com_dev_reset(ena_dev, ENA_REGS_RESET_INIT_ERR); + /* stop submitting admin commands on a device that was reset */ + ena_com_set_admin_running_state(ena_dev, false); + ena_free_mgmnt_irq(adapter); + ena_disable_msix(adapter); +err_worker_destroy: + del_timer(&adapter->timer_service); +err_device_destroy: + ena_com_delete_host_info(ena_dev); + ena_com_admin_destroy(ena_dev); +err_devlink_destroy: + ena_devlink_free(devlink); +err_metrics_destroy: + ena_com_delete_customer_metrics_buffer(ena_dev); +err_free_phc: + ena_phc_free(adapter); +err_netdev_destroy: + 
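+	/* The adapter is embedded in the netdev private area, so freeing the
+	 * netdev releases it as well.
+	 */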
free_netdev(netdev); +err_free_region: + ena_release_bars(ena_dev, pdev); +err_free_ena_dev: + vfree(ena_dev); +err_disable_device: + pci_disable_device(pdev); + return rc; +} + +/*****************************************************************************/ + +/* __ena_shutoff - Helper used in both PCI remove/shutdown routines + * @pdev: PCI device information struct + * @shutdown: Is it a shutdown operation? If false, means it is a removal + * + * __ena_shutoff is a helper routine that does the real work on shutdown and + * removal paths; the difference between those paths is with regards to whether + * dettach or unregister the netdevice. + */ +static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) +{ + struct ena_adapter *adapter = pci_get_drvdata(pdev); + struct ena_com_dev *ena_dev; + struct net_device *netdev; + struct devlink *devlink; + + ena_dev = adapter->ena_dev; + netdev = adapter->netdev; + + devlink = adapter->devlink; + ena_devlink_unregister(devlink); + ena_devlink_free(devlink); + +#ifdef CONFIG_RFS_ACCEL + if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) { + free_irq_cpu_rmap(netdev->rx_cpu_rmap); + netdev->rx_cpu_rmap = NULL; + } + +#endif /* CONFIG_RFS_ACCEL */ + /* Make sure timer and reset routine won't be called after + * freeing device resources. + */ + del_timer_sync(&adapter->timer_service); + cancel_work_sync(&adapter->reset_task); + + rtnl_lock(); /* lock released inside the below if-else block */ + adapter->reset_reason = ENA_REGS_RESET_SHUTDOWN; + ena_destroy_device(adapter, true); + + if (shutdown) { + netif_device_detach(netdev); + dev_close(netdev); + rtnl_unlock(); + } else { + rtnl_unlock(); + unregister_netdev(netdev); + free_netdev(netdev); + } + + ena_com_rss_destroy(ena_dev); + + ena_com_delete_debug_area(ena_dev); + + ena_com_delete_host_info(ena_dev); + + ena_com_delete_customer_metrics_buffer(ena_dev); + + ena_phc_free(adapter); + + ena_release_bars(ena_dev, pdev); + + pci_disable_device(pdev); + + vfree(ena_dev); +} + +/* ena_remove - Device Removal Routine + * @pdev: PCI device information struct + * + * ena_remove is called by the PCI subsystem to alert the driver + * that it should release a PCI device. + */ + +static void ena_remove(struct pci_dev *pdev) +{ + __ena_shutoff(pdev, false); +} + +/* ena_shutdown - Device Shutdown Routine + * @pdev: PCI device information struct + * + * ena_shutdown is called by the PCI subsystem to alert the driver that + * a shutdown/reboot (or kexec) is happening and device must be disabled. 
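+ * Unlike ena_remove(), the netdev is only detached and closed here rather
+ * than unregistered (see __ena_shutoff()).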
+ */ + +static void ena_shutdown(struct pci_dev *pdev) +{ + __ena_shutoff(pdev, true); +} + +#ifdef CONFIG_PM +#ifdef ENA_GENERIC_PM_OPS +/* ena_suspend - PM suspend callback + * @dev_d: Device information struct + */ +static int __maybe_unused ena_suspend(struct device *dev_d) +{ + struct pci_dev *pdev = to_pci_dev(dev_d); +#else /* ENA_GENERIC_PM_OPS */ +/* ena_suspend - PM suspend callback + * @pdev: PCI device information struct + * @state:power state + */ +static int ena_suspend(struct pci_dev *pdev, pm_message_t state) +{ +#endif /* ENA_GENERIC_PM_OPS */ + struct ena_adapter *adapter = pci_get_drvdata(pdev); + + ena_increase_stat(&adapter->dev_stats.suspend, 1, &adapter->syncp); + + rtnl_lock(); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + dev_err(&pdev->dev, + "Ignoring device reset request as the device is being suspended\n"); + clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + } + ena_destroy_device(adapter, true); + rtnl_unlock(); + return 0; +} + +#ifdef ENA_GENERIC_PM_OPS +/* ena_resume - PM resume callback + * @dev_d: Device information struct + */ +static int __maybe_unused ena_resume(struct device *dev_d) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev_d); +#else /* ENA_GENERIC_PM_OPS */ +/* ena_resume - PM resume callback + * @pdev: PCI device information struct + * + */ +static int ena_resume(struct pci_dev *pdev) +{ + struct ena_adapter *adapter = pci_get_drvdata(pdev); +#endif /* ENA_GENERIC_PM_OPS */ + int rc; + + ena_increase_stat(&adapter->dev_stats.resume, 1, &adapter->syncp); + + rtnl_lock(); +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,5,0) + pci_set_power_state(pdev, PCI_D0); +#endif + rc = ena_restore_device(adapter); + rtnl_unlock(); + return rc; +} +#endif /* CONFIG_PM */ +#ifdef ENA_GENERIC_PM_OPS + +static SIMPLE_DEV_PM_OPS(ena_pm_ops, ena_suspend, ena_resume); +#endif /* ENA_GENERIC_PM_OPS */ + +static struct pci_driver ena_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = ena_pci_tbl, + .probe = ena_probe, + .remove = ena_remove, + .shutdown = ena_shutdown, +#ifdef ENA_GENERIC_PM_OPS + .driver.pm = &ena_pm_ops, +#else /* ENA_GENERIC_PM_OPS */ +#ifdef CONFIG_PM + .suspend = ena_suspend, + .resume = ena_resume, +#endif /* CONFIG_PM */ +#endif /* ENA_GENERIC_PM_OPS */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + .sriov_configure = pci_sriov_configure_simple, +#endif +}; + +static int __init ena_init(void) +{ + int ret; + + ena_wq = create_singlethread_workqueue(DRV_MODULE_NAME); + if (!ena_wq) { + pr_err("Failed to create workqueue\n"); + return -ENOMEM; + } + + ret = pci_register_driver(&ena_pci_driver); + if (ret) + destroy_workqueue(ena_wq); + + return ret; +} + +static void __exit ena_cleanup(void) +{ + pci_unregister_driver(&ena_pci_driver); + + if (ena_wq) { + destroy_workqueue(ena_wq); + ena_wq = NULL; + } +} + +/****************************************************************************** + ******************************** AENQ Handlers ******************************* + *****************************************************************************/ +/* ena_update_on_link_change: + * Notify the network interface about the change in link status + */ +static void ena_update_on_link_change(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_link_change_desc *aenq_desc = + (struct ena_admin_aenq_link_change_desc *)aenq_e; + int status = aenq_desc->flags & + ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK; + + 
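+	/* A non-zero status bit in the AENQ descriptor means the link is up. */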
if (status) { + netif_dbg(adapter, ifup, adapter->netdev, "%s\n", __func__); + set_bit(ENA_FLAG_LINK_UP, &adapter->flags); + if (!test_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags)) + netif_carrier_on(adapter->netdev); + } else { + clear_bit(ENA_FLAG_LINK_UP, &adapter->flags); + netif_carrier_off(adapter->netdev); + } +} + +static void ena_keep_alive_wd(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_keep_alive_desc *desc; + u64 rx_overruns; + u64 rx_drops; + u64 tx_drops; + + desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e; + adapter->last_keep_alive_jiffies = jiffies; + + rx_drops = ENA_HIGH_LOW_TO_U64(desc->rx_drops_high, desc->rx_drops_low); + tx_drops = ENA_HIGH_LOW_TO_U64(desc->tx_drops_high, desc->tx_drops_low); + rx_overruns = ENA_HIGH_LOW_TO_U64(desc->rx_overruns_high, desc->rx_overruns_low); + + u64_stats_update_begin(&adapter->syncp); + /* These stats are accumulated by the device, so the counters indicate + * all drops since last reset. + */ + adapter->dev_stats.rx_drops = rx_drops; + adapter->dev_stats.tx_drops = tx_drops; + adapter->dev_stats.rx_overruns = rx_overruns; + u64_stats_update_end(&adapter->syncp); +} + +static void ena_notification(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_ena_hw_hints *hints; + + WARN(aenq_e->aenq_common_desc.group != ENA_ADMIN_NOTIFICATION, + "Invalid group(%x) expected %x\n", + aenq_e->aenq_common_desc.group, + ENA_ADMIN_NOTIFICATION); + + switch (aenq_e->aenq_common_desc.syndrome) { + case ENA_ADMIN_UPDATE_HINTS: + hints = (struct ena_admin_ena_hw_hints *) + (&aenq_e->inline_data_w4); + ena_update_hints(adapter, hints); + break; + default: + netif_err(adapter, drv, adapter->netdev, + "Invalid aenq notification link state %d\n", + aenq_e->aenq_common_desc.syndrome); + } +} + +static void ena_refresh_fw_capabilites(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + + netdev_info(adapter->netdev, "Received requet to refresh capabilities\n"); + + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); +} + +/* This handler will called for unknown event group or unimplemented handlers*/ +static void unimplemented_aenq_handler(void *data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + netif_err(adapter, drv, adapter->netdev, + "Unknown event was received or event with unimplemented handler\n"); +} + +static struct ena_aenq_handlers aenq_handlers = { + .handlers = { + [ENA_ADMIN_LINK_CHANGE] = ena_update_on_link_change, + [ENA_ADMIN_NOTIFICATION] = ena_notification, + [ENA_ADMIN_KEEP_ALIVE] = ena_keep_alive_wd, + [ENA_ADMIN_REFRESH_CAPABILITIES] = ena_refresh_fw_capabilites, + }, + .unimplemented_handler = unimplemented_aenq_handler +}; + +module_init(ena_init); +module_exit(ena_cleanup); diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h new file mode 100644 index 0000000000000..97bdd08853400 --- /dev/null +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -0,0 +1,630 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef ENA_H +#define ENA_H + +#include "kcompat.h" +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) +#include "dim.h" +#else +#include +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ +#include +#include +#include +#include +#include +#include +#ifdef HAS_BPF_HEADER +#include +#endif +#include + +#include "ena_com.h" +#include "ena_eth_com.h" + +#define DRV_MODULE_GEN_MAJOR 2 +#define DRV_MODULE_GEN_MINOR 8 +#define DRV_MODULE_GEN_SUBMINOR 9 + +#define DRV_MODULE_NAME "ena" +#ifndef DRV_MODULE_GENERATION +#define DRV_MODULE_GENERATION \ + __stringify(DRV_MODULE_GEN_MAJOR) "." \ + __stringify(DRV_MODULE_GEN_MINOR) "." \ + __stringify(DRV_MODULE_GEN_SUBMINOR) "g" +#endif + +#define DEVICE_NAME "Elastic Network Adapter (ENA)" + +/* 1 for AENQ + ADMIN */ +#define ENA_ADMIN_MSIX_VEC 1 +#define ENA_MAX_MSIX_VEC(io_queues) (ENA_ADMIN_MSIX_VEC + (io_queues)) + +/* The ENA buffer length fields is 16 bit long. So when PAGE_SIZE == 64kB the + * driver passes 0. + * Since the max packet size the ENA handles is ~9kB limit the buffer length to + * 16kB. + */ +#if PAGE_SIZE > SZ_16K +#define ENA_PAGE_SIZE (_AC(SZ_16K, UL)) +#else +#define ENA_PAGE_SIZE PAGE_SIZE +#endif + +#define ENA_MIN_MSIX_VEC 2 + +#define ENA_REG_BAR 0 +#define ENA_MEM_BAR 2 +#define ENA_BAR_MASK (BIT(ENA_REG_BAR) | BIT(ENA_MEM_BAR)) + +#define ENA_DEFAULT_RING_SIZE (1024) +#define ENA_MIN_RING_SIZE (256) + +#define ENA_MIN_RX_BUF_SIZE (2048) + +#define ENA_MIN_NUM_IO_QUEUES (1) + +#define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2) +#define ENA_DEFAULT_RX_COPYBREAK (256 - NET_IP_ALIGN) + +#define ENA_MIN_MTU 128 + +#define ENA_NAME_MAX_LEN 20 +#define ENA_IRQNAME_SIZE 40 + +#define ENA_PKT_MAX_BUFS 19 + +#define ENA_RX_RSS_TABLE_LOG_SIZE 7 +#define ENA_RX_RSS_TABLE_SIZE (1 << ENA_RX_RSS_TABLE_LOG_SIZE) + +/* The number of tx packet completions that will be handled each NAPI poll + * cycle is ring_size / ENA_TX_POLL_BUDGET_DIVIDER. + */ +#define ENA_TX_POLL_BUDGET_DIVIDER 4 + +/* Refill Rx queue when number of required descriptors is above + * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER or ENA_RX_REFILL_THRESH_PACKET + */ +#define ENA_RX_REFILL_THRESH_DIVIDER 8 +#define ENA_RX_REFILL_THRESH_PACKET 256 + +/* Number of queues to check for missing queues per timer service */ +#define ENA_MONITORED_TX_QUEUES 4 +/* Max timeout packets before device reset */ +#define MAX_NUM_OF_TIMEOUTED_PACKETS 128 + +#define ENA_TX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1)) + +#define ENA_RX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1)) +#define ENA_RX_RING_IDX_ADD(idx, n, ring_size) \ + (((idx) + (n)) & ((ring_size) - 1)) + +#define ENA_IO_TXQ_IDX(q) (2 * (q)) +#define ENA_IO_RXQ_IDX(q) (2 * (q) + 1) +#define ENA_IO_TXQ_IDX_TO_COMBINED_IDX(q) ((q) / 2) +#define ENA_IO_RXQ_IDX_TO_COMBINED_IDX(q) (((q) - 1) / 2) + +#define ENA_MGMNT_IRQ_IDX 0 +#define ENA_IO_IRQ_FIRST_IDX 1 +#define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q)) + +#define ENA_ADMIN_POLL_DELAY_US 5000 + +/* ENA device should send keep alive msg every 1 sec. + * We wait for 6 sec just to be on the safe side. 
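+ * If no keep-alive event arrives within this window,
+ * check_for_missing_keep_alive() triggers a device reset with reason
+ * ENA_REGS_RESET_KEEP_ALIVE_TO.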
+ */ +#define ENA_DEVICE_KALIVE_TIMEOUT (6 * HZ) +#define ENA_MAX_NO_INTERRUPT_ITERATIONS 3 + +#define ENA_MMIO_DISABLE_REG_READ BIT(0) + +struct ena_page_cache; + +#ifdef ENA_PHC_SUPPORT +struct ena_phc_info; + +#endif +struct ena_irq { + irq_handler_t handler; + void *data; + int cpu; + u32 vector; + cpumask_t affinity_hint_mask; + char name[ENA_IRQNAME_SIZE]; +}; + +struct ena_napi { + u8 first_interrupt ____cacheline_aligned; + u8 interrupts_masked; + struct napi_struct napi; + struct ena_ring *tx_ring; + struct ena_ring *rx_ring; + u32 qid; + struct dim dim; +}; + +struct ena_tx_buffer { + union { + struct sk_buff *skb; +#ifdef ENA_XDP_SUPPORT + /* XDP buffer structure which is used for sending packets in + * the xdp queues + */ + struct xdp_frame *xdpf; +#endif /* ENA_XDP_SUPPORT */ + }; + /* num of ena desc for this specific skb + * (includes data desc and metadata desc) + */ + u32 tx_descs; + /* num of buffers used by this skb */ + u32 num_of_bufs; + + /* Total size of all buffers */ + u32 total_tx_size; + + /* Indicate if bufs[0] map the linear data of the skb. */ + u8 map_linear_data; + + /* Used for detect missing tx packets to limit the number of prints */ + u8 print_once; + /* Save the last jiffies to detect missing tx packets + * + * sets to non zero value on ena_start_xmit and set to zero on + * napi and timer_Service_routine. + * + * while this value is not protected by lock, + * a given packet is not expected to be handled by ena_start_xmit + * and by napi/timer_service at the same time. + */ + unsigned long last_jiffies; + struct ena_com_buf bufs[ENA_PKT_MAX_BUFS]; +} ____cacheline_aligned; + +struct ena_rx_buffer { + struct sk_buff *skb; + union { + struct { + struct page *page; + dma_addr_t dma_addr; + }; +#ifdef ENA_XDP_SUPPORT + /* XSK pool buffer */ + struct xdp_buff *xdp; +#endif + }; + u32 page_offset; + u32 buf_offset; + struct ena_com_buf ena_buf; + bool is_lpc_page; +} ____cacheline_aligned; + +struct ena_stats_tx { + u64 cnt; + u64 bytes; + u64 queue_stop; + u64 prepare_ctx_err; + u64 queue_wakeup; + u64 dma_mapping_err; + u64 linearize; + u64 linearize_failed; + u64 napi_comp; + u64 tx_poll; + u64 doorbells; + u64 bad_req_id; + u64 llq_buffer_copy; + u64 missed_tx; + u64 unmask_interrupt; + u64 last_napi_jiffies; +#ifdef ENA_AF_XDP_SUPPORT + u64 xsk_need_wakeup_set; + u64 xsk_wakeup_request; +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +struct ena_stats_rx { + u64 cnt; + u64 bytes; + u64 rx_copybreak_pkt; + u64 csum_good; + u64 refil_partial; + u64 csum_bad; + u64 page_alloc_fail; + u64 skb_alloc_fail; + u64 dma_mapping_err; + u64 bad_desc_num; +#ifdef ENA_BUSY_POLL_SUPPORT + u64 bp_yield; + u64 bp_missed; + u64 bp_cleaned; +#endif + u64 bad_req_id; + u64 empty_rx_ring; + u64 csum_unchecked; +#ifdef ENA_XDP_SUPPORT + u64 xdp_aborted; + u64 xdp_drop; + u64 xdp_pass; + u64 xdp_tx; + u64 xdp_invalid; + u64 xdp_redirect; +#endif + u64 lpc_warm_up; + u64 lpc_full; + u64 lpc_wrong_numa; +#ifdef ENA_AF_XDP_SUPPORT + u64 xsk_need_wakeup_set; + u64 zc_queue_pkt_copy; +#endif /* ENA_AF_XDP_SUPPORT */ +}; + +struct ena_ring { + /* Holds the empty requests for TX/RX + * out of order completions + */ + u16 *free_ids; + + union { + struct ena_tx_buffer *tx_buffer_info; + struct ena_rx_buffer *rx_buffer_info; + }; + + /* cache ptr to avoid using the adapter */ + struct device *dev; + struct pci_dev *pdev; + struct napi_struct *napi; + struct net_device *netdev; + struct ena_page_cache *page_cache; + struct ena_com_dev *ena_dev; + struct ena_adapter *adapter; + struct ena_com_io_cq 
*ena_com_io_cq; + struct ena_com_io_sq *ena_com_io_sq; +#ifdef ENA_XDP_SUPPORT + struct bpf_prog *xdp_bpf_prog; + struct xdp_rxq_info xdp_rxq; + spinlock_t xdp_tx_lock; /* synchronize XDP TX/Redirect traffic */ + /* Used for rx queues only to point to the xdp tx ring, to + * which traffic should be redirected from this rx ring. + */ + struct ena_ring *xdp_ring; +#ifdef ENA_AF_XDP_SUPPORT + struct xsk_buff_pool *xsk_pool; +#endif /* ENA_AF_XDP_SUPPORT */ +#endif /* ENA_XDP_SUPPORT */ + + u16 next_to_use; + u16 next_to_clean; + u16 rx_copybreak; + u16 rx_headroom; + u16 qid; + u16 mtu; + u16 sgl_size; + u8 enable_bql; + + /* The maximum header length the device can handle */ + u8 tx_max_header_size; + + bool disable_meta_caching; + u16 no_interrupt_event_cnt; + + /* cpu and NUMA for TPH */ + int cpu; + int numa_node; + + /* number of tx/rx_buffer_info's entries */ + int ring_size; + + enum ena_admin_placement_policy_type tx_mem_queue_type; + + struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; + u32 interrupt_interval; + /* Indicates whether interrupt interval has changed since previous set. + * This flag will be kept up, until cleared by the routine which updates + * the device with the modified interrupt interval value. + */ + bool interrupt_interval_changed; + u32 per_napi_packets; + u16 non_empty_napi_events; + struct u64_stats_sync syncp; + union { + struct ena_stats_tx tx_stats; + struct ena_stats_rx rx_stats; + }; + + u8 *push_buf_intermediate_buf; + int empty_rx_queue; +#ifdef ENA_BUSY_POLL_SUPPORT + atomic_t bp_state; +#endif +} ____cacheline_aligned; + +#ifdef ENA_BUSY_POLL_SUPPORT +enum ena_busy_poll_state_t { + ENA_BP_STATE_IDLE = 0, + ENA_BP_STATE_NAPI, + ENA_BP_STATE_POLL, + ENA_BP_STATE_DISABLE +}; +#endif +struct ena_stats_dev { + u64 tx_timeout; + u64 suspend; + u64 resume; + u64 wd_expired; + u64 interface_up; + u64 interface_down; + u64 admin_q_pause; + u64 rx_drops; + u64 tx_drops; + u64 rx_overruns; + u64 reset_fail; +}; + +enum ena_flags_t { + ENA_FLAG_DEVICE_RUNNING, + ENA_FLAG_DEV_UP, + ENA_FLAG_LINK_UP, + ENA_FLAG_MSIX_ENABLED, + ENA_FLAG_TRIGGER_RESET, + ENA_FLAG_ONGOING_RESET +}; + +/* adapter specific private data structure */ +struct ena_adapter { + struct ena_com_dev *ena_dev; + /* OS defined structs */ + struct net_device *netdev; + struct pci_dev *pdev; + + struct devlink *devlink; + + /* rx packets that are shorter than this len will be copied to the skb + * header + */ + u32 rx_copybreak; + u32 max_mtu; + + u32 num_io_queues; + u32 max_num_io_queues; + + /* Local page cache size when it's enabled */ + u32 configured_lpc_size; + /* Current Local page cache size */ + u32 used_lpc_size; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + struct msix_entry *msix_entries; +#endif + int msix_vecs; + + u32 missing_tx_completion_threshold; + + u32 requested_tx_ring_size; + u32 requested_rx_ring_size; + + u32 max_tx_ring_size; + u32 max_rx_ring_size; + + u32 msg_enable; + + /* The flag is used for two purposes: + * 1. Indicates that large LLQ has been requested. + * 2. Indicates whether large LLQ is set or not after device + * initialization / configuration. 
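+ * (Illustrative, assumed values: with large LLQ the LLQ entry typically grows
+ * from 128 B to 256 B, raising the inline header budget from 96 B to about
+ * 224 B.)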
+ */ + bool large_llq_header_enabled; + bool large_llq_header_supported; + + u16 max_tx_sgl_size; + u16 max_rx_sgl_size; + + u8 mac_addr[ETH_ALEN]; + + unsigned long keep_alive_timeout; + unsigned long missing_tx_completion_to; + + char name[ENA_NAME_MAX_LEN]; +#ifdef ENA_PHC_SUPPORT + + struct ena_phc_info *phc_info; +#endif + + unsigned long flags; + /* TX */ + struct ena_ring tx_ring[ENA_MAX_NUM_IO_QUEUES] + ____cacheline_aligned_in_smp; + + /* RX */ + struct ena_ring rx_ring[ENA_MAX_NUM_IO_QUEUES] + ____cacheline_aligned_in_smp; + + struct ena_napi ena_napi[ENA_MAX_NUM_IO_QUEUES]; + + struct ena_irq irq_tbl[ENA_MAX_MSIX_VEC(ENA_MAX_NUM_IO_QUEUES)]; + + /* timer service */ + struct work_struct reset_task; + struct timer_list timer_service; + + bool wd_state; + bool dev_up_before_reset; + bool disable_meta_caching; + unsigned long last_keep_alive_jiffies; + + struct u64_stats_sync syncp; + struct ena_stats_dev dev_stats; + struct ena_admin_eni_stats eni_stats; + struct ena_admin_ena_srd_info ena_srd_info; + + /* last queue index that was checked for uncompleted tx packets */ + u32 last_monitored_tx_qid; + + enum ena_regs_reset_reason_types reset_reason; + +#ifdef ENA_XDP_SUPPORT + struct bpf_prog *xdp_bpf_prog; +#endif + u32 xdp_first_ring; + u32 xdp_num_queues; +}; + +void ena_set_ethtool_ops(struct net_device *netdev); + +void ena_dump_stats_to_dmesg(struct ena_adapter *adapter); + +void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); + + +int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled); + +int ena_update_queue_sizes(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size); + +int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count); + +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak); + +int ena_get_sset_count(struct net_device *netdev, int sset); +#ifdef ENA_BUSY_POLL_SUPPORT +static inline void ena_bp_init_lock(struct ena_ring *rx_ring) +{ + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* called from the napi routine to get ownership of the ring */ +static inline bool ena_bp_lock_napi(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_NAPI); + if (rc != ENA_BP_STATE_IDLE) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bp_yield++; + u64_stats_update_end(&rx_ring->syncp); + } + + return rc == ENA_BP_STATE_IDLE; +} + +static inline void ena_bp_unlock_napi(struct ena_ring *rx_ring) +{ + WARN_ON(atomic_read(&rx_ring->bp_state) != ENA_BP_STATE_NAPI); + + /* flush any outstanding Rx frames */ + if (rx_ring->napi->gro_list) + napi_gro_flush(rx_ring->napi, false); + + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* called from ena_ll_busy_poll() */ +static inline bool ena_bp_lock_poll(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_POLL); + if (rc != ENA_BP_STATE_IDLE) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bp_yield++; + u64_stats_update_end(&rx_ring->syncp); + } + + return rc == ENA_BP_STATE_IDLE; +} + +static inline void ena_bp_unlock_poll(struct ena_ring *rx_ring) +{ + WARN_ON(atomic_read(&rx_ring->bp_state) != ENA_BP_STATE_POLL); + + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* true if a socket is polling, even if it did not get the lock */ +static inline bool ena_bp_busy_polling(struct ena_ring *rx_ring) +{ + return 
atomic_read(&rx_ring->bp_state) == ENA_BP_STATE_POLL; +} + +static inline bool ena_bp_disable(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_DISABLE); + + return rc == ENA_BP_STATE_IDLE; +} +#endif /* ENA_BUSY_POLL_SUPPORT */ + +static inline void ena_reset_device(struct ena_adapter *adapter, + enum ena_regs_reset_reason_types reset_reason) +{ + adapter->reset_reason = reset_reason; + /* Make sure reset reason is set before triggering the reset */ + smp_mb__before_atomic(); + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); +} + +/* Allocate a page and DMA map it + * @rx_ring: The IO queue pair which requests the allocation + * + * @return: address of the mapped page in DMA and allocated page address is + * succeeded, or NULL + */ +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma); + +int ena_destroy_device(struct ena_adapter *adapter, bool graceful); +int ena_restore_device(struct ena_adapter *adapter); +int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, + struct ena_tx_buffer *tx_info, bool is_xdp); + +/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ +static inline void ena_increase_stat(u64 *statp, u64 cnt, + struct u64_stats_sync *syncp) +{ + u64_stats_update_begin(syncp); + (*statp) += cnt; + u64_stats_update_end(syncp); +} + +static inline void ena_ring_tx_doorbell(struct ena_ring *tx_ring) +{ + ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); + ena_increase_stat(&tx_ring->tx_stats.doorbells, 1, &tx_ring->syncp); +} + +int ena_xmit_common(struct ena_adapter *adapter, + struct ena_ring *ring, + struct ena_tx_buffer *tx_info, + struct ena_com_tx_ctx *ena_tx_ctx, + u16 next_to_use, + u32 bytes); +void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info); +void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count); +int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count); +int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count); +void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count); +void ena_free_all_io_tx_resources(struct ena_adapter *adapter); +void ena_down(struct ena_adapter *adapter); +int ena_up(struct ena_adapter *adapter); +void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring); +void ena_update_ring_numa_node(struct ena_ring *tx_ring, + struct ena_ring *rx_ring); +void ena_rx_checksum(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb); +void ena_set_rx_hash(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb); +int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num); +#endif /* !(ENA_H) */ diff --git a/drivers/amazon/net/ena/ena_pci_id_tbl.h b/drivers/amazon/net/ena/ena_pci_id_tbl.h new file mode 100755 index 0000000000000..3ecdf29160ca7 --- /dev/null +++ b/drivers/amazon/net/ena/ena_pci_id_tbl.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef ENA_PCI_ID_TBL_H_ +#define ENA_PCI_ID_TBL_H_ + +#ifndef PCI_VENDOR_ID_AMAZON +#define PCI_VENDOR_ID_AMAZON 0x1d0f +#endif + +#ifndef PCI_DEV_ID_ENA_PF +#define PCI_DEV_ID_ENA_PF 0x0ec2 +#endif + +#ifndef PCI_DEV_ID_ENA_LLQ_PF +#define PCI_DEV_ID_ENA_LLQ_PF 0x1ec2 +#endif + +#ifndef PCI_DEV_ID_ENA_VF +#define PCI_DEV_ID_ENA_VF 0xec20 +#endif + +#ifndef PCI_DEV_ID_ENA_LLQ_VF +#define PCI_DEV_ID_ENA_LLQ_VF 0xec21 +#endif + +#ifndef PCI_DEV_ID_ENA_RESRV0 +#define PCI_DEV_ID_ENA_RESRV0 0x0051 +#endif + +#define ENA_PCI_ID_TABLE_ENTRY(devid) \ + {PCI_DEVICE(PCI_VENDOR_ID_AMAZON, devid)}, + +static const struct pci_device_id ena_pci_tbl[] = { + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_RESRV0) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_PF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_PF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_VF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_VF) + { } +}; + +#endif /* ENA_PCI_ID_TBL_H_ */ diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c new file mode 100644 index 0000000000000..5b637ef79bc04 --- /dev/null +++ b/drivers/amazon/net/ena/ena_phc.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "ena_devlink.h" +#include "ena_phc.h" + +#ifdef ENA_PHC_SUPPORT +#ifdef ENA_PHC_SUPPORT_ADJFREQ +static int ena_phc_adjfreq(struct ptp_clock_info *clock_info, s32 ppb) +{ + return -EOPNOTSUPP; +} +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ + +static int ena_phc_adjtime(struct ptp_clock_info *clock_info, s64 delta) +{ + return -EOPNOTSUPP; +} + +static int ena_phc_feature_enable(struct ptp_clock_info *clock_info, struct ptp_clock_request *rq, + int on) +{ + return -EOPNOTSUPP; +} + +#ifdef ENA_PHC_SUPPORT_GETTIME64 +#ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED +static int ena_phc_gettimex64(struct ptp_clock_info *clock_info, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + ptp_read_system_prets(sts); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + ptp_read_system_postts(sts); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + *ts = ns_to_timespec64(timestamp_nsec); + + return rc; +} + +#else /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +static int ena_phc_gettime64(struct ptp_clock_info *clock_info, struct timespec64 *ts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + *ts = ns_to_timespec64(timestamp_nsec); + + return rc; +} + +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +static int ena_phc_settime64(struct ptp_clock_info *clock_info, + const struct timespec64 *ts) +{ + return -EOPNOTSUPP; +} + +#else /* ENA_PHC_SUPPORT_GETTIME64 */ +static int ena_phc_gettime(struct ptp_clock_info *clock_info, struct timespec *ts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + u32 remainder; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + 
spin_unlock_irqrestore(&phc_info->lock, flags); + + ts->tv_sec = div_u64_rem(timestamp_nsec, NSEC_PER_SEC, &remainder); + ts->tv_nsec = remainder; + + return rc; +} + +static int ena_phc_settime(struct ptp_clock_info *clock_info, const struct timespec *ts) +{ + return -EOPNOTSUPP; +} + +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + +static struct ptp_clock_info ena_ptp_clock_info = { + .owner = THIS_MODULE, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .pps = 0, +#ifdef ENA_PHC_SUPPORT_ADJFREQ + .adjfreq = ena_phc_adjfreq, +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ + .adjtime = ena_phc_adjtime, +#ifdef ENA_PHC_SUPPORT_GETTIME64 +#ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED + .gettimex64 = ena_phc_gettimex64, +#else /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + .gettime64 = ena_phc_gettime64, +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + .settime64 = ena_phc_settime64, +#else /* ENA_PHC_SUPPORT_GETTIME64 */ + .gettime = ena_phc_gettime, + .settime = ena_phc_settime, +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + .enable = ena_phc_feature_enable, +}; + +/* Enable/Disable PHC by the kernel, affects on the next init flow */ +void ena_phc_enable(struct ena_adapter *adapter, bool enable) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + if (!phc_info) { + netdev_err(adapter->netdev, "phc_info is not allocated\n"); + return; + } + + phc_info->enabled = enable; +} + +/* Check if PHC is enabled by the kernel */ +bool ena_phc_is_enabled(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->enabled); +} + +/* PHC is activated if ptp clock is registered in the kernel */ +bool ena_phc_is_active(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->clock); +} + +static int ena_phc_register(struct ena_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + struct ptp_clock_info *clock_info; + struct ena_phc_info *phc_info; + int rc = 0; + + phc_info = adapter->phc_info; + clock_info = &phc_info->clock_info; + + phc_info->adapter = adapter; + + spin_lock_init(&phc_info->lock); + + /* Fill the ptp_clock_info struct and register PTP clock */ + *clock_info = ena_ptp_clock_info; + snprintf(clock_info->name, + sizeof(clock_info->name), + "ena-ptp-%02x", + PCI_SLOT(pdev->devfn)); + + phc_info->clock = ptp_clock_register(clock_info, &pdev->dev); + if (IS_ERR(phc_info->clock)) { + rc = PTR_ERR(phc_info->clock); + netdev_err(adapter->netdev, "Failed registering ptp clock, error: %d\n", rc); + phc_info->clock = NULL; + } + + return rc; +} + +static void ena_phc_unregister(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + if (ena_phc_is_active(adapter)) { + ptp_clock_unregister(phc_info->clock); + phc_info->clock = NULL; + } +} + +int ena_phc_alloc(struct ena_adapter *adapter) +{ + /* Allocate driver specific PHC info */ + adapter->phc_info = vzalloc(sizeof(*adapter->phc_info)); + if (unlikely(!adapter->phc_info)) { + netdev_err(adapter->netdev, "Failed to alloc phc_info\n"); + return -ENOMEM; + } + + return 0; +} + +void ena_phc_free(struct ena_adapter *adapter) +{ + if (adapter->phc_info) { + vfree(adapter->phc_info); + adapter->phc_info = NULL; + } +} + +int ena_phc_init(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; + int rc = -EOPNOTSUPP; + + /* Validate PHC feature is supported in the device */ + if (!ena_com_phc_supported(ena_dev)) { + netdev_dbg(netdev, 
"PHC feature is not supported by the device\n"); + goto err_ena_com_phc_init; + } + + /* Validate PHC feature is enabled by the kernel */ + if (!ena_phc_is_enabled(adapter)) { + netdev_dbg(netdev, "PHC feature is not enabled by the kernel\n"); + goto err_ena_com_phc_init; + } + + /* Initialize device specific PHC info */ + rc = ena_com_phc_init(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to init phc, error: %d\n", rc); + goto err_ena_com_phc_init; + } + + /* Configure PHC feature in driver and device */ + rc = ena_com_phc_config(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to config phc, error: %d\n", rc); + goto err_ena_com_phc_config; + } + + /* Register to PTP class driver */ + rc = ena_phc_register(adapter); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to register phc, error: %d\n", rc); + goto err_ena_com_phc_config; + } + + return 0; + +err_ena_com_phc_config: + ena_com_phc_destroy(ena_dev); +err_ena_com_phc_init: + ena_phc_enable(adapter, false); + ena_devlink_disable_phc_param(adapter->devlink); + return rc; +} + +void ena_phc_destroy(struct ena_adapter *adapter) +{ + ena_phc_unregister(adapter); + ena_com_phc_destroy(adapter->ena_dev); +} + +int ena_phc_get_index(struct ena_adapter *adapter) +{ + if (ena_phc_is_active(adapter)) + return ptp_clock_index(adapter->phc_info->clock); + + return -1; +} + +#endif /* ENA_PHC_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_phc.h b/drivers/amazon/net/ena/ena_phc.h new file mode 100644 index 0000000000000..bb644d5f928fa --- /dev/null +++ b/drivers/amazon/net/ena/ena_phc.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_PHC_H +#define ENA_PHC_H + +#ifdef ENA_PHC_SUPPORT + +#include + +struct ena_phc_info { + /* PTP hardware capabilities */ + struct ptp_clock_info clock_info; + + /* Registered PTP clock device */ + struct ptp_clock *clock; + + /* Adapter specific private data structure */ + struct ena_adapter *adapter; + + /* PHC lock */ + spinlock_t lock; + + /* Enabled by kernel */ + bool enabled; +}; + +void ena_phc_enable(struct ena_adapter *adapter, bool enable); +bool ena_phc_is_enabled(struct ena_adapter *adapter); +bool ena_phc_is_active(struct ena_adapter *adapter); +int ena_phc_get_index(struct ena_adapter *adapter); +int ena_phc_init(struct ena_adapter *adapter); +void ena_phc_destroy(struct ena_adapter *adapter); +int ena_phc_alloc(struct ena_adapter *adapter); +void ena_phc_free(struct ena_adapter *adapter); +#else /* ENA_PHC_SUPPORT */ + +static inline void ena_phc_enable(struct ena_adapter *adapter, bool enable) { } +static inline bool ena_phc_is_enabled(struct ena_adapter *adapter) { return false; } +static inline bool ena_phc_is_active(struct ena_adapter *adapter) { return false; } +static inline int ena_phc_get_index(struct ena_adapter *adapter) { return -1; } +static inline int ena_phc_init(struct ena_adapter *adapter) { return 0; } +static inline void ena_phc_destroy(struct ena_adapter *adapter) { } +static inline int ena_phc_alloc(struct ena_adapter *adapter) { return 0; } +static inline void ena_phc_free(struct ena_adapter *adapter) { } +#endif /* ENA_PHC_SUPPORT */ + +#endif /* ENA_PHC_H */ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h new file mode 100755 index 0000000000000..bdbbc8b18df63 --- /dev/null +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 OR 
Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ +#ifndef _ENA_REGS_H_ +#define _ENA_REGS_H_ + +enum ena_regs_reset_reason_types { + ENA_REGS_RESET_NORMAL = 0, + ENA_REGS_RESET_KEEP_ALIVE_TO = 1, + ENA_REGS_RESET_ADMIN_TO = 2, + ENA_REGS_RESET_MISS_TX_CMPL = 3, + ENA_REGS_RESET_INV_RX_REQ_ID = 4, + ENA_REGS_RESET_INV_TX_REQ_ID = 5, + ENA_REGS_RESET_TOO_MANY_RX_DESCS = 6, + ENA_REGS_RESET_INIT_ERR = 7, + ENA_REGS_RESET_DRIVER_INVALID_STATE = 8, + ENA_REGS_RESET_OS_TRIGGER = 9, + ENA_REGS_RESET_OS_NETDEV_WD = 10, + ENA_REGS_RESET_SHUTDOWN = 11, + ENA_REGS_RESET_USER_TRIGGER = 12, + ENA_REGS_RESET_GENERIC = 13, + ENA_REGS_RESET_MISS_INTERRUPT = 14, + ENA_REGS_RESET_SUSPECTED_POLL_STARVATION = 15, + ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED = 16, + ENA_REGS_RESET_LAST, +}; + +/* ena_registers offsets */ + +/* 0 base */ +#define ENA_REGS_VERSION_OFF 0x0 +#define ENA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define ENA_REGS_CAPS_OFF 0x8 +#define ENA_REGS_CAPS_EXT_OFF 0xc +#define ENA_REGS_AQ_BASE_LO_OFF 0x10 +#define ENA_REGS_AQ_BASE_HI_OFF 0x14 +#define ENA_REGS_AQ_CAPS_OFF 0x18 +#define ENA_REGS_ACQ_BASE_LO_OFF 0x20 +#define ENA_REGS_ACQ_BASE_HI_OFF 0x24 +#define ENA_REGS_ACQ_CAPS_OFF 0x28 +#define ENA_REGS_AQ_DB_OFF 0x2c +#define ENA_REGS_ACQ_TAIL_OFF 0x30 +#define ENA_REGS_AENQ_CAPS_OFF 0x34 +#define ENA_REGS_AENQ_BASE_LO_OFF 0x38 +#define ENA_REGS_AENQ_BASE_HI_OFF 0x3c +#define ENA_REGS_AENQ_HEAD_DB_OFF 0x40 +#define ENA_REGS_AENQ_TAIL_OFF 0x44 +#define ENA_REGS_INTR_MASK_OFF 0x4c +#define ENA_REGS_DEV_CTL_OFF 0x54 +#define ENA_REGS_DEV_STS_OFF 0x58 +#define ENA_REGS_MMIO_REG_READ_OFF 0x5c +#define ENA_REGS_MMIO_RESP_LO_OFF 0x60 +#define ENA_REGS_MMIO_RESP_HI_OFF 0x64 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_OFF 0x68 + +/* phc_registers offsets */ + +/* 100 base */ +#define ENA_REGS_PHC_DB_OFF 0x100 + +/* version register */ +#define ENA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define ENA_REGS_VERSION_MAJOR_VERSION_SHIFT 8 +#define ENA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 + +/* controller_version register */ +#define ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT 8 +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT 16 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT 24 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 + +/* caps register */ +#define ENA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT 1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT 8 +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT 16 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 + +/* aq_caps register */ +#define ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 + +/* acq_caps register */ +#define ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xffff0000 + +/* aenq_caps register */ +#define ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xffff0000 + +/* dev_ctl register */ +#define ENA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define ENA_REGS_DEV_CTL_AQ_RESTART_SHIFT 1 
+#define ENA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define ENA_REGS_DEV_CTL_QUIESCENT_SHIFT 2 +#define ENA_REGS_DEV_CTL_QUIESCENT_MASK 0x4 +#define ENA_REGS_DEV_CTL_IO_RESUME_SHIFT 3 +#define ENA_REGS_DEV_CTL_IO_RESUME_MASK 0x8 +#define ENA_REGS_DEV_CTL_RESET_REASON_SHIFT 28 +#define ENA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* dev_sts register */ +#define ENA_REGS_DEV_STS_READY_MASK 0x1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT 1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT 2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT 3 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define ENA_REGS_DEV_STS_RESET_FINISHED_SHIFT 4 +#define ENA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define ENA_REGS_DEV_STS_FATAL_ERROR_SHIFT 5 +#define ENA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_SHIFT 6 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_MASK 0x40 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_SHIFT 7 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_MASK 0x80 + +/* mmio_reg_read register */ +#define ENA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT 16 +#define ENA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 + +/* rss_ind_entry_update register */ +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_INDEX_MASK 0xffff +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 + +/* phc_db_req_id register */ +#define ENA_REGS_PHC_DB_REQ_ID_MASK 0xffff + +#endif /* _ENA_REGS_H_ */ diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c new file mode 100755 index 0000000000000..98e1f7ecd0f09 --- /dev/null +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include +#include +#include +#include + +#include "ena_com.h" +#include "ena_netdev.h" +#include "ena_sysfs.h" + + +static ssize_t ena_store_rx_copybreak(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + unsigned long rx_copybreak; + int rc; + + rc = kstrtoul(buf, 10, &rx_copybreak); + if (rc < 0) + goto exit; + + rtnl_lock(); + rc = ena_set_rx_copybreak(adapter, rx_copybreak); + if (rc) + goto unlock; + rtnl_unlock(); + + return len; +unlock: + rtnl_unlock(); +exit: + return rc; +} + +#define ENA_RX_COPYBREAK_STR_MAX_LEN 7 + +static ssize_t ena_show_rx_copybreak(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + + return snprintf(buf, ENA_RX_COPYBREAK_STR_MAX_LEN, "%d\n", + adapter->rx_copybreak); +} + +static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, + ena_store_rx_copybreak); + +/****************************************************************************** + *****************************************************************************/ +int ena_sysfs_init(struct device *dev) +{ + + if (device_create_file(dev, &dev_attr_rx_copybreak)) + dev_err(dev, "Failed to create rx_copybreak sysfs entry"); + return 0; +} + +/****************************************************************************** + *****************************************************************************/ +void ena_sysfs_terminate(struct device *dev) +{ + device_remove_file(dev, &dev_attr_rx_copybreak); +} diff --git a/drivers/amazon/net/ena/ena_sysfs.h b/drivers/amazon/net/ena/ena_sysfs.h new file mode 100755 index 0000000000000..8c572eee268f3 --- /dev/null +++ b/drivers/amazon/net/ena/ena_sysfs.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __ENA_SYSFS_H__ +#define __ENA_SYSFS_H__ + +#ifdef CONFIG_SYSFS + +int ena_sysfs_init(struct device *dev); + +void ena_sysfs_terminate(struct device *dev); + +#else /* CONFIG_SYSFS */ + +static inline int ena_sysfs_init(struct device *dev) +{ + return 0; +} + +static inline void ena_sysfs_terminate(struct device *dev) +{ +} + +#endif /* CONFIG_SYSFS */ + +#endif /* __ENA_SYSFS_H__ */ diff --git a/drivers/amazon/net/ena/ena_xdp.c b/drivers/amazon/net/ena/ena_xdp.c new file mode 100644 index 0000000000000..4d8c1709598de --- /dev/null +++ b/drivers/amazon/net/ena/ena_xdp.c @@ -0,0 +1,977 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "ena_xdp.h" +#ifdef ENA_XDP_SUPPORT + +static int validate_xdp_req_id(struct ena_ring *tx_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + if (likely(tx_info->total_tx_size)) + return 0; + + return handle_invalid_req_id(tx_ring, req_id, tx_info, true); +} + +static int ena_xdp_tx_map_frame(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info, + struct xdp_frame *xdpf, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + struct ena_adapter *adapter = tx_ring->adapter; + struct ena_com_buf *ena_buf; + int push_len = 0; + dma_addr_t dma; + void *data; + u32 size; + + tx_info->xdpf = xdpf; + data = tx_info->xdpf->data; + size = tx_info->xdpf->len; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, tx_ring->tx_max_header_size); + + ena_tx_ctx->push_header = data; + + size -= push_len; + data += push_len; + } + + ena_tx_ctx->header_len = push_len; + + if (size > 0) { + dma = dma_map_single(tx_ring->dev, + data, + size, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + tx_info->map_linear_data = 0; + + ena_buf = tx_info->bufs; + ena_buf->paddr = dma; + ena_buf->len = size; + + ena_tx_ctx->ena_bufs = ena_buf; + ena_tx_ctx->num_bufs = tx_info->num_of_bufs = 1; + } + + return 0; + +error_report_dma_error: + ena_increase_stat(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map xdp buff\n"); + + return -EINVAL; +} + +int ena_xdp_xmit_frame(struct ena_ring *tx_ring, + struct ena_adapter *adapter, + struct xdp_frame *xdpf, + int flags) +{ + struct ena_com_tx_ctx ena_tx_ctx = {}; + struct ena_tx_buffer *tx_info; + u16 next_to_use, req_id; + int rc; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + + rc = ena_xdp_tx_map_frame(tx_ring, tx_info, xdpf, &ena_tx_ctx); + if (unlikely(rc)) + return rc; + + ena_tx_ctx.req_id = req_id; + + rc = ena_xmit_common(adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + xdpf->len); + if (rc) + goto error_unmap_dma; + + /* trigger the dma engine. ena_ring_tx_doorbell() + * calls a memory barrier inside it. 
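+ * The barrier orders the descriptor stores above before the doorbell write,
+ * so the device never fetches a partially written descriptor.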
+ */ + if (flags & XDP_XMIT_FLUSH) + ena_ring_tx_doorbell(tx_ring); + + return rc; + +error_unmap_dma: + ena_unmap_tx_buff(tx_ring, tx_info); + tx_info->xdpf = NULL; + return rc; +} + +int ena_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) +{ + struct ena_adapter *adapter = netdev_priv(dev); + struct ena_ring *tx_ring; + int qid, i, nxmit = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return -ENETDOWN; + + /* We assume that all rings have the same XDP program */ + if (!READ_ONCE(adapter->rx_ring->xdp_bpf_prog)) + return -ENXIO; + + qid = smp_processor_id() % adapter->xdp_num_queues; + qid += adapter->xdp_first_ring; + tx_ring = &adapter->tx_ring[qid]; + + /* Other CPU ids might try to send thorugh this queue */ + spin_lock(&tx_ring->xdp_tx_lock); + + for (i = 0; i < n; i++) { + if (ena_xdp_xmit_frame(tx_ring, adapter, frames[i], 0)) + break; + nxmit++; + } + + /* Ring doorbell to make device aware of the packets */ + if (flags & XDP_XMIT_FLUSH) + ena_ring_tx_doorbell(tx_ring); + + spin_unlock(&tx_ring->xdp_tx_lock); + +#ifndef ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY + for (i = nxmit; unlikely(i < n); i++) + xdp_return_frame(frames[i]); + +#endif + /* Return number of packets sent */ + return nxmit; +} + +static void ena_init_all_xdp_queues(struct ena_adapter *adapter) +{ + adapter->xdp_first_ring = adapter->num_io_queues; + adapter->xdp_num_queues = adapter->num_io_queues; + + ena_init_io_rings(adapter, + adapter->xdp_first_ring, + adapter->xdp_num_queues); +} + +int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter) +{ + int rc = 0; + + rc = ena_setup_tx_resources_in_range(adapter, adapter->xdp_first_ring, + adapter->xdp_num_queues); + if (rc) + goto setup_err; + + rc = ena_create_io_tx_queues_in_range(adapter, + adapter->xdp_first_ring, + adapter->xdp_num_queues); + if (rc) + goto create_err; + + return 0; + +create_err: + ena_free_all_io_tx_resources_in_range(adapter, adapter->xdp_first_ring, + adapter->xdp_num_queues); +setup_err: + return rc; +} + +/* Provides a way for both kernel and bpf-prog to know + * more about the RX-queue a given XDP frame arrived on. + */ +int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) +{ + int rc; + +#ifdef AF_XDP_BUSY_POLL_SUPPORTED + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, + rx_ring->napi->napi_id < 0); +#else + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); +#endif + + netif_dbg(rx_ring->adapter, ifup, rx_ring->netdev, "Registering RX info for queue %d", + rx_ring->qid); + if (rc) { + netif_err(rx_ring->adapter, ifup, rx_ring->netdev, + "Failed to register xdp rx queue info. RX queue num %d rc: %d\n", + rx_ring->qid, rc); + goto err; + } + + if (ENA_IS_XSK_RING(rx_ring)) { + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); + xsk_pool_set_rxq_info(rx_ring->xsk_pool, &rx_ring->xdp_rxq); + } else { + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, + NULL); + } + + if (rc) { + netif_err(rx_ring->adapter, ifup, rx_ring->netdev, + "Failed to register xdp rx queue info memory model. 
RX queue num %d rc: %d\n", + rx_ring->qid, rc); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + } + +err: + return rc; +} + +#ifdef ENA_AF_XDP_SUPPORT +void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) +{ + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + int i, xsk_frames = 0; + + for (i = 0; i < tx_ring->ring_size; i++) { + struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + + if (tx_info->last_jiffies) + xsk_frames++; + + tx_info->last_jiffies = 0; + } + + if (xsk_frames) + xsk_tx_completed(xsk_pool, xsk_frames); +} + +void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + int i = 0; + + for (i = 0; i < rx_ring->ring_size; i++) { + struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; + + if (rx_info->xdp) + xsk_buff_free(rx_info->xdp); + + rx_info->xdp = NULL; + } +} + +#endif /* ENA_AF_XDP_SUPPORT */ +void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) +{ + netif_dbg(rx_ring->adapter, ifdown, rx_ring->netdev, + "Unregistering RX info for queue %d", + rx_ring->qid); + xdp_rxq_info_unreg_mem_model(&rx_ring->xdp_rxq); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); +} + +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count) +{ + struct bpf_prog *old_bpf_prog; + struct ena_ring *rx_ring; + int i = 0; + + for (i = first; i < count; i++) { + rx_ring = &adapter->rx_ring[i]; + old_bpf_prog = xchg(&rx_ring->xdp_bpf_prog, prog); + + if (!old_bpf_prog && prog) { + rx_ring->rx_headroom = XDP_PACKET_HEADROOM; + } else if (old_bpf_prog && !prog) { + rx_ring->rx_headroom = NET_SKB_PAD; + } + } +} + +static void ena_xdp_exchange_program(struct ena_adapter *adapter, + struct bpf_prog *prog) +{ + struct bpf_prog *old_bpf_prog = xchg(&adapter->xdp_bpf_prog, prog); + + ena_xdp_exchange_program_rx_in_range(adapter, + prog, + 0, + adapter->num_io_queues); + + if (old_bpf_prog) + bpf_prog_put(old_bpf_prog); +} + +static int ena_destroy_and_free_all_xdp_queues(struct ena_adapter *adapter) +{ + bool was_up; + int rc; + + was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + if (was_up) + ena_down(adapter); + + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + ena_xdp_exchange_program(adapter, NULL); + if (was_up) { + rc = ena_up(adapter); + if (rc) + return rc; + } + return 0; +} + +static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct bpf_prog *prog = bpf->prog; + struct bpf_prog *old_bpf_prog; + int rc, prev_mtu; + bool is_up; + + is_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + rc = ena_xdp_allowed(adapter); + if (rc == ENA_XDP_ALLOWED) { + old_bpf_prog = adapter->xdp_bpf_prog; + if (prog) { + if (!is_up) { + ena_init_all_xdp_queues(adapter); + } else if (!old_bpf_prog) { + ena_down(adapter); + ena_init_all_xdp_queues(adapter); + } + ena_xdp_exchange_program(adapter, prog); + + netif_dbg(adapter, drv, adapter->netdev, "Set a new XDP program\n"); + + if (is_up && !old_bpf_prog) { + rc = ena_up(adapter); + if (rc) + return rc; + } + } else if (old_bpf_prog) { + netif_dbg(adapter, drv, adapter->netdev, + "Removing XDP program\n"); + + rc = ena_destroy_and_free_all_xdp_queues(adapter); + if (rc) + return rc; + } + + prev_mtu = netdev->max_mtu; + netdev->max_mtu = prog ? 
ENA_XDP_MAX_MTU : adapter->max_mtu; + + if (!old_bpf_prog) + netif_info(adapter, drv, adapter->netdev, + "XDP program is set, changing the max_mtu from %d to %d", + prev_mtu, netdev->max_mtu); + + } else if (rc == ENA_XDP_CURRENT_MTU_TOO_LARGE) { + netif_err(adapter, drv, adapter->netdev, + "Failed to set xdp program, the current MTU (%d) is larger than the maximum allowed MTU (%lu) while xdp is on", + netdev->mtu, ENA_XDP_MAX_MTU); + NL_SET_ERR_MSG_MOD(bpf->extack, + "Failed to set xdp program, the current MTU is larger than the maximum allowed MTU. Check the dmesg for more info"); + return -EINVAL; + } else if (rc == ENA_XDP_NO_ENOUGH_QUEUES) { + netif_err(adapter, drv, adapter->netdev, + "Failed to set xdp program, the Rx/Tx channel count should be at most half of the maximum allowed channel count. The current queue count (%d), the maximal queue count (%d)\n", + adapter->num_io_queues, adapter->max_num_io_queues); + NL_SET_ERR_MSG_MOD(bpf->extack, + "Failed to set xdp program, there is no enough space for allocating XDP queues, Check the dmesg for more info"); + return -EINVAL; + } + + return 0; +} + +#ifdef ENA_AF_XDP_SUPPORT +static bool ena_is_xsk_pool_params_allowed(struct xsk_buff_pool *pool) +{ + return xsk_pool_get_headroom(pool) == 0 && + xsk_pool_get_chunk_size(pool) == ENA_PAGE_SIZE; +} + +static int ena_xsk_pool_enable(struct ena_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + struct ena_ring *rx_ring, *tx_ring; + bool dev_was_up = false; + int err; + + if (!ena_xdp_legal_queue_count(adapter, qid)) { + netdev_err(adapter->netdev, + "Max qid for XSK pool is %d (received %d)\n", + adapter->max_num_io_queues >> 1, qid); + return -EINVAL; + } + + if (ena_is_xsk_pool_params_allowed(pool)) + return -EINVAL; + + rx_ring = &adapter->rx_ring[qid]; + tx_ring = &adapter->tx_ring[qid]; + + err = xsk_pool_dma_map(pool, adapter->ena_dev->dmadev, 0); + if (err) { + ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1, + &rx_ring->syncp); + netif_err(adapter, drv, adapter->netdev, + "Failed to DMA map XSK pool for qid %d\n", qid); + return err; + } + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + dev_was_up = true; + ena_down(adapter); + } + + rx_ring->xsk_pool = tx_ring->xsk_pool = pool; + + netif_dbg(adapter, drv, adapter->netdev, + "Setting XSK pool for queue %d\n", qid); + + return dev_was_up ? ena_up(adapter) : 0; +} + +static int ena_xsk_pool_disable(struct ena_adapter *adapter, + u16 qid) +{ + struct ena_ring *rx_ring, *tx_ring; + bool dev_was_up = false; + + if (qid >= adapter->num_io_queues) + return -EINVAL; + + rx_ring = &adapter->rx_ring[qid]; + tx_ring = &adapter->tx_ring[qid]; + + /* XSK pool isn't attached to this ring */ + if (!rx_ring->xsk_pool) + return 0; + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + dev_was_up = true; + ena_down(adapter); + } + + xsk_pool_dma_unmap(rx_ring->xsk_pool, 0); + + rx_ring->xsk_pool = tx_ring->xsk_pool = NULL; + + netif_dbg(adapter, drv, adapter->netdev, + "Removing XSK pool for queue %d\n", qid); + + return dev_was_up ? ena_up(adapter) : 0; +} + +static int ena_xsk_pool_setup(struct ena_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + return pool ? ena_xsk_pool_enable(adapter, pool, qid) : + ena_xsk_pool_disable(adapter, qid); +} + +#endif /* ENA_AF_XDP_SUPPORT */ +/* This is the main xdp callback, it's used by the kernel to set/unset the xdp + * program as well as to query the current xdp program id. 
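+ * Dispatch summary: XDP_SETUP_PROG attaches or removes a program,
+ * XDP_SETUP_XSK_POOL (when AF_XDP support is compiled in) binds an XSK buffer
+ * pool to a queue, and XDP_QUERY_PROG (older kernels only) reports the id of
+ * the currently attached program.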
+ */ +int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) +{ +#if !defined(ENA_XDP_QUERY_IN_KERNEL) || defined(ENA_AF_XDP_SUPPORT) + struct ena_adapter *adapter = netdev_priv(netdev); + +#endif /* ENA_XDP_QUERY_IN_KERNEL || ENA_AF_XDP_SUPPORT */ + switch (bpf->command) { + case XDP_SETUP_PROG: + return ena_xdp_set(netdev, bpf); +#ifdef ENA_AF_XDP_SUPPORT + case XDP_SETUP_XSK_POOL: + return ena_xsk_pool_setup(adapter, bpf->xsk.pool, bpf->xsk.queue_id); +#endif /* ENA_AF_XDP_SUPPORT */ +#ifndef ENA_XDP_QUERY_IN_KERNEL + case XDP_QUERY_PROG: + bpf->prog_id = adapter->xdp_bpf_prog ? + adapter->xdp_bpf_prog->aux->id : 0; + break; +#endif + default: + return -EINVAL; + } + return 0; +} + +#ifdef ENA_AF_XDP_SUPPORT +int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *tx_ring; + struct napi_struct *napi; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return -ENETDOWN; + + if (qid >= adapter->num_io_queues) + return -EINVAL; + + if (!adapter->xdp_bpf_prog) + return -ENXIO; + + tx_ring = &adapter->tx_ring[qid]; + + if (!ENA_IS_XSK_RING(tx_ring)) + return -ENXIO; + + ena_increase_stat(&tx_ring->tx_stats.xsk_wakeup_request, 1, + &tx_ring->syncp); + + napi = tx_ring->napi; + + napi_schedule(napi); + + return 0; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) +{ + + bool is_zc_q = ENA_IS_XSK_RING(tx_ring); + u32 total_done = 0; + u16 next_to_clean; + bool needs_wakeup; + u32 tx_bytes = 0; + int tx_pkts = 0; + u16 req_id; + int rc; + + if (unlikely(!tx_ring)) + return 0; + next_to_clean = tx_ring->next_to_clean; + + while (tx_pkts < budget) { + struct ena_tx_buffer *tx_info; + struct xdp_frame *xdpf; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, + &req_id); + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(tx_ring, req_id, NULL, + true); + break; + } + + /* validate that the request id points to a valid xdp_frame */ + rc = validate_xdp_req_id(tx_ring, req_id); + if (rc) + break; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + + tx_info->last_jiffies = 0; + + if (!is_zc_q) { + xdpf = tx_info->xdpf; + tx_info->xdpf = NULL; + ena_unmap_tx_buff(tx_ring, tx_info); + xdp_return_frame(xdpf); + } + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d pkt #%d req_id %d\n", tx_ring->qid, tx_pkts, req_id); + + tx_bytes += tx_info->total_tx_size; + tx_pkts++; + total_done += tx_info->tx_descs; + + tx_info->total_tx_size = 0; + + tx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + tx_ring->ring_size); + } + + tx_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d done. 
total pkts: %d\n", + tx_ring->qid, tx_pkts); + + needs_wakeup = tx_pkts < budget; +#ifdef ENA_AF_XDP_SUPPORT + if (is_zc_q) { + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + + if (tx_pkts) + xsk_tx_completed(xsk_pool, tx_pkts); + + if (xsk_uses_need_wakeup(xsk_pool)) { + if (needs_wakeup) + xsk_set_tx_need_wakeup(xsk_pool); + else + xsk_clear_tx_need_wakeup(xsk_pool); + } + } +#endif /* ENA_AF_XDP_SUPPORT */ + + return needs_wakeup; +} + +#ifdef ENA_AF_XDP_SUPPORT +static bool ena_xdp_xmit_irq_zc(struct ena_ring *tx_ring, + struct napi_struct *napi, + int budget) +{ + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + int size, rc, push_len = 0, work_done = 0; + struct ena_tx_buffer *tx_info; + struct ena_com_buf *ena_buf; + u16 next_to_use, req_id; + bool need_wakeup = true; + struct xdp_desc desc; + dma_addr_t dma; + + while (likely(work_done < budget)) { + struct ena_com_tx_ctx ena_tx_ctx = {}; + + /* We assume the maximum number of descriptors, which is two + * (meta data included) + */ + if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, 2))) + break; + + if (!xsk_tx_peek_desc(xsk_pool, &desc)) + break; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + + size = desc.len; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, tx_ring->tx_max_header_size); + ena_tx_ctx.push_header = xsk_buff_raw_get_data(xsk_pool, desc.addr); + ena_tx_ctx.header_len = push_len; + + size -= push_len; + if (!size) + goto xmit_desc; + } + + /* Pass the rest of the descriptor as a DMA address. Assuming + * single page descriptor. + */ + dma = xsk_buff_raw_get_dma(xsk_pool, desc.addr); + ena_buf = tx_info->bufs; + ena_buf->paddr = dma + push_len; + ena_buf->len = size; + + ena_tx_ctx.ena_bufs = ena_buf; + ena_tx_ctx.num_bufs = 1; + +xmit_desc: + ena_tx_ctx.req_id = req_id; + + netif_dbg(tx_ring->adapter, tx_queued, tx_ring->netdev, + "Queueing zc packet on q %d, %s DMA part (req-id %d)\n", + tx_ring->qid, ena_tx_ctx.num_bufs ? 
"with" : "without", req_id); + + rc = ena_xmit_common(tx_ring->adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + desc.len); + if (rc) + break; + + work_done++; + } + + if (work_done) { + xsk_tx_release(xsk_pool); + ena_ring_tx_doorbell(tx_ring); + } + + if (work_done == budget) { + need_wakeup = false; + if (xsk_uses_need_wakeup(xsk_pool)) + xsk_clear_tx_need_wakeup(xsk_pool); + } + + return need_wakeup; +} + +static struct sk_buff *ena_xdp_rx_skb_zc(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + u32 headroom, data_len; + struct sk_buff *skb; + void *data_addr; + + /* Assuming single-page packets for XDP */ + headroom = xdp->data - xdp->data_hard_start; + data_len = xdp->data_end - xdp->data; + data_addr = xdp->data; + + /* allocate a skb to store the frags */ + skb = __napi_alloc_skb(rx_ring->napi, + headroom + data_len, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) { + ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); + netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate skb in zc queue %d\n", rx_ring->qid); + return NULL; + } + + skb_reserve(skb, headroom); + memcpy(__skb_put(skb, data_len), data_addr, data_len); + + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + + return skb; +} + +static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, + struct napi_struct *napi, + int budget) +{ + int i, refill_required, work_done, refill_threshold, pkt_copy; + u16 next_to_clean = rx_ring->next_to_clean; + int xdp_verdict, req_id, rc, total_len; + struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; + bool xdp_prog_present; + struct xdp_buff *xdp; + struct sk_buff *skb; + u32 xdp_flags = 0; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "%s qid %d\n", __func__, rx_ring->qid); + + ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; + ena_rx_ctx.max_bufs = rx_ring->sgl_size; + + xdp_prog_present = ena_xdp_present_ring(rx_ring); + + work_done = 0; + total_len = 0; + pkt_copy = 0; + + do { + xdp_verdict = ENA_XDP_PASS; + + /* Poll a packet from HW */ + rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, + rx_ring->ena_com_io_sq, + &ena_rx_ctx); + if (unlikely(rc)) + break; + + /* Polled all RX packets */ + if (unlikely(ena_rx_ctx.descs == 0)) + break; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n", + rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, + ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + + /* First descriptor might have an offset set by the device */ + rx_info = &rx_ring->rx_buffer_info[ena_rx_ctx.ena_bufs[0].req_id]; + xdp = rx_info->xdp; + xdp->data += ena_rx_ctx.pkt_offset; + xdp->data_end = xdp->data + ena_rx_ctx.ena_bufs[0].len; + xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); + + /* XDP multi-buffer packets not supported */ + if (unlikely(ena_rx_ctx.descs > 1)) { + netdev_err_once(rx_ring->adapter->netdev, + "xdp: dropped multi-buffer packets. 
RX packets must be < %lu\n", + ENA_XDP_MAX_MTU); + ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); + xdp_verdict = ENA_XDP_DROP; + goto skip_xdp_prog; + } + + if (likely(xdp_prog_present)) + xdp_verdict = ena_xdp_execute(rx_ring, xdp); + +skip_xdp_prog: + /* Note that there can be several descriptors, since device + * might not honor MTU + */ + for (i = 0; i < ena_rx_ctx.descs; i++) { + req_id = rx_ring->ena_bufs[i].req_id; + rx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = + ENA_RX_RING_IDX_NEXT(next_to_clean, + rx_ring->ring_size); + } + + if (likely(xdp_verdict)) { + work_done++; + total_len += ena_rx_ctx.ena_bufs[0].len; + xdp_flags |= xdp_verdict; + + /* Mark buffer as consumed when it is redirected */ + if (likely(xdp_verdict & ENA_XDP_FORWARDED)) + rx_info->xdp = NULL; + + continue; + } + + /* XDP PASS */ + skb = ena_xdp_rx_skb_zc(rx_ring, xdp); + if (unlikely(!skb)) { + rc = -ENOMEM; + break; + } + + pkt_copy++; + work_done++; + total_len += ena_rx_ctx.ena_bufs[0].len; + ena_rx_checksum(rx_ring, &ena_rx_ctx, skb); + ena_set_rx_hash(rx_ring, &ena_rx_ctx, skb); + skb_record_rx_queue(skb, rx_ring->qid); + napi_gro_receive(napi, skb); + + } while (likely(work_done <= budget)); + + rx_ring->per_napi_packets += work_done; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bytes += total_len; + rx_ring->rx_stats.cnt += work_done; + rx_ring->rx_stats.zc_queue_pkt_copy += pkt_copy; + u64_stats_update_end(&rx_ring->syncp); + + rx_ring->next_to_clean = next_to_clean; + + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush_map(); + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + refill_threshold = + min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, + ENA_RX_REFILL_THRESH_PACKET); + /* Optimization, try to batch new rx buffers */ + if (refill_required > refill_threshold) + ena_refill_rx_bufs(rx_ring, refill_required); + + if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { + if (likely(rc || work_done < budget)) { + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + ena_increase_stat(&rx_ring->rx_stats.xsk_need_wakeup_set, 1, + &rx_ring->syncp); + } else { + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + } + } + + if (unlikely(rc)) { + struct ena_adapter *adapter = netdev_priv(rx_ring->netdev); + + if (rc == -ENOSPC) { + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, + &rx_ring->syncp); + ena_reset_device(adapter, + ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else if (rc == -EIO) { + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, + &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + } + + return 0; + } + + return work_done; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +/* This is the XDP napi callback. XDP queues use a separate napi callback + * than Rx/Tx queues. 
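+ * Each poll first reclaims TX completions; for zero-copy (XSK) rings it then
+ * transmits pending XSK descriptors and cleans the RX ring. The interrupt is
+ * unmasked only when all of that work fit within the NAPI budget.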
+ */ +int ena_xdp_io_poll(struct napi_struct *napi, int budget) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *rx_ring, *tx_ring; + bool needs_wakeup = true; + u32 rx_work_done = 0; + int ret; + + rx_ring = ena_napi->rx_ring; + tx_ring = ena_napi->tx_ring; + + if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) { + napi_complete_done(napi, 0); + return 0; + } + + needs_wakeup &= ena_clean_xdp_irq(tx_ring, budget); + +#ifdef ENA_AF_XDP_SUPPORT + if (!ENA_IS_XSK_RING(tx_ring)) + goto polling_done; + + needs_wakeup &= ena_xdp_xmit_irq_zc(tx_ring, napi, budget); + + rx_work_done = ena_xdp_clean_rx_irq_zc(rx_ring, napi, budget); + needs_wakeup &= rx_work_done < budget; + +polling_done: +#endif /* ENA_AF_XDP_SUPPORT */ + /* If the device is about to reset or down, avoid unmask + * the interrupt and return 0 so NAPI won't reschedule + */ + if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags))) { + napi_complete_done(napi, 0); + ret = 0; + } else if (needs_wakeup) { + ena_increase_stat(&tx_ring->tx_stats.napi_comp, 1, + &tx_ring->syncp); + if (napi_complete_done(napi, rx_work_done) && + READ_ONCE(ena_napi->interrupts_masked)) { + smp_rmb(); /* make sure interrupts_masked is read */ + WRITE_ONCE(ena_napi->interrupts_masked, false); + ena_unmask_interrupt(tx_ring, NULL); + } + + ena_update_ring_numa_node(tx_ring, NULL); + ret = rx_work_done; + } else { + ret = budget; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.tx_poll++; + u64_stats_update_end(&tx_ring->syncp); + tx_ring->tx_stats.last_napi_jiffies = jiffies; + + return ret; +} +#endif /* ENA_XDP_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h new file mode 100644 index 0000000000000..dde8f9053f707 --- /dev/null +++ b/drivers/amazon/net/ena/ena_xdp.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_XDP_H +#define ENA_XDP_H + +#include "ena_netdev.h" +#ifdef ENA_XDP_SUPPORT +#include +#ifdef ENA_AF_XDP_SUPPORT +#include +#endif /* ENA_AF_XDP_SUPPORT */ + +#ifdef ENA_AF_XDP_SUPPORT +#define ENA_IS_XSK_RING(ring) (!!(ring)->xsk_pool) +#endif /* ENA_AF_XDP_SUPPORT */ + +/* The max MTU size is configured to be the ethernet frame size without + * the overhead of the ethernet header, which can have a VLAN header, and + * a frame check sequence (FCS). 
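+ * As a rough worked example on a 4 KiB page with XDP_HAS_FRAME_SZ defined:
+ * 4096 - 14 (ETH_HLEN) - 4 (ETH_FCS_LEN) - 4 (VLAN_HLEN) - 256
+ * (XDP_PACKET_HEADROOM) - ~320 (aligned struct skb_shared_info on 64-bit)
+ * leaves a max MTU of about 3498 bytes; without the frame_sz term it is 3818.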
+ * The buffer size we share with the device is defined to be ENA_PAGE_SIZE + */ +#ifdef XDP_HAS_FRAME_SZ +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#else +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM) +#endif + +#define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ + ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) + +enum ENA_XDP_ACTIONS { + ENA_XDP_PASS = 0, + ENA_XDP_TX = BIT(0), + ENA_XDP_REDIRECT = BIT(1), + ENA_XDP_DROP = BIT(2) +}; + +#define ENA_XDP_FORWARDED (ENA_XDP_TX | ENA_XDP_REDIRECT) + +int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter); +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count); +int ena_xdp_io_poll(struct napi_struct *napi, int budget); +int ena_xdp_xmit_frame(struct ena_ring *tx_ring, + struct ena_adapter *adapter, + struct xdp_frame *xdpf, + int flags); +int ena_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags); +int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf); +int ena_xdp_register_rxq_info(struct ena_ring *rx_ring); +void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring); +#ifdef ENA_AF_XDP_SUPPORT +void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring); +void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid); +int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags); +#endif + +enum ena_xdp_errors_t { + ENA_XDP_ALLOWED = 0, + ENA_XDP_CURRENT_MTU_TOO_LARGE, + ENA_XDP_NO_ENOUGH_QUEUES, +}; + +static inline bool ena_xdp_present(struct ena_adapter *adapter) +{ + return !!adapter->xdp_bpf_prog; +} + +static inline bool ena_xdp_present_ring(struct ena_ring *ring) +{ + return !!ring->xdp_bpf_prog; +} + +static inline bool ena_xdp_legal_queue_count(struct ena_adapter *adapter, + u32 queues) +{ + return 2 * queues <= adapter->max_num_io_queues; +} + +static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) +{ + enum ena_xdp_errors_t rc = ENA_XDP_ALLOWED; + + if (adapter->netdev->mtu > ENA_XDP_MAX_MTU) + rc = ENA_XDP_CURRENT_MTU_TOO_LARGE; + else if (!ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) + rc = ENA_XDP_NO_ENOUGH_QUEUES; + + return rc; +} + +static inline u64 ena_ring_xdp_drops_cnt(struct ena_ring *rx_ring) +{ + return rx_ring->rx_stats.xdp_drop; +} + +#ifdef ENA_AF_XDP_SUPPORT +static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + if (ENA_IS_XSK_RING(&adapter->rx_ring[i])) + return true; + + return false; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +static inline int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + u32 verdict = ENA_XDP_PASS; + struct bpf_prog *xdp_prog; + struct ena_ring *xdp_ring; + struct xdp_frame *xdpf; + u64 *xdp_stat; + + xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); + + verdict = bpf_prog_run_xdp(xdp_prog, xdp); + + switch (verdict) { + case XDP_TX: +#ifdef XDP_CONVERT_TO_FRAME_NAME_CHANGED + xdpf = xdp_convert_buff_to_frame(xdp); +#else + xdpf = convert_to_xdp_frame(xdp); +#endif + if (unlikely(!xdpf)) { + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + } + + /* Find xmit queue */ + xdp_ring = rx_ring->xdp_ring; + + /* The XDP queues are shared 
between XDP_TX and XDP_REDIRECT */ + spin_lock(&xdp_ring->xdp_tx_lock); + + if (ena_xdp_xmit_frame(xdp_ring, rx_ring->adapter, xdpf, + XDP_XMIT_FLUSH)) + xdp_return_frame(xdpf); + + spin_unlock(&xdp_ring->xdp_tx_lock); + xdp_stat = &rx_ring->rx_stats.xdp_tx; + verdict = ENA_XDP_TX; + break; + case XDP_REDIRECT: + if (likely(!xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) { + xdp_stat = &rx_ring->rx_stats.xdp_redirect; + verdict = ENA_XDP_REDIRECT; + break; + } + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + case XDP_DROP: + xdp_stat = &rx_ring->rx_stats.xdp_drop; + verdict = ENA_XDP_DROP; + break; + case XDP_PASS: + xdp_stat = &rx_ring->rx_stats.xdp_pass; + verdict = ENA_XDP_PASS; + break; + default: + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_invalid; + verdict = ENA_XDP_DROP; + } + + ena_increase_stat(xdp_stat, 1, &rx_ring->syncp); + + return verdict; +} +#else /* ENA_XDP_SUPPORT */ + +#define ENA_IS_XDP_INDEX(adapter, index) (false) + +static inline bool ena_xdp_present_ring(struct ena_ring *ring) +{ + return false; +} + +static inline u64 ena_ring_xdp_drops_cnt(struct ena_ring *rx_ring) +{ + return 0; +} + +static inline int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) +{ + return 0; +} + +static inline void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) {} + +#endif /* ENA_XDP_SUPPORT */ +#ifndef ENA_AF_XDP_SUPPORT /* stabs for AF XDP code */ + +/* Define (or override if it's defined) these enum and function to make sure + * that the code that uses them would always compile. If AF XDP isn't supported, it + * won't be used anyway. + */ +#define MEM_TYPE_XSK_BUFF_POOL 0 +#define xsk_pool_set_rxq_info(pool, rxq) + +static inline void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) {} +static inline void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid) {} + +#define ENA_IS_XSK_RING(ring) false + +static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) +{ + return false; +} +#endif /* ENA_AF_XDP_SUPPORT */ +#endif /* ENA_XDP_H */ diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h new file mode 100644 index 0000000000000..62ddd400e787f --- /dev/null +++ b/drivers/amazon/net/ena/kcompat.h @@ -0,0 +1,1171 @@ +/******************************************************************************* +Modified by Amazon 2015-2016. +Copyright 2015-2016, Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Modifications subject to the terms and conditions of the GNU General +Public License, version 2. +*******************************************************************************/ + +/******************************************************************************* + +Intel 10 Gigabit PCI Express Linux driver +Copyright(c) 1999 - 2013 Intel Corporation. + +This program is free software; you can redistribute it and/or modify it +under the terms and conditions of the GNU General Public License, +version 2, as published by the Free Software Foundation. + +This program is distributed in the hope it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + +The full GNU General Public License is included in this distribution in +the file called "COPYING". + +Contact Information: +e1000-devel Mailing List +Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + +*******************************************************************************/ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#ifndef LINUX_VERSION_CODE +#include +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +#endif + +#include + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) +#include +#endif + +/* For ACCESS_ONCE, WRITE_ONCE and READ_ONCE macros */ +#include + +#ifndef SZ_256 +#define SZ_256 0x0000100 +#endif + +#ifndef SZ_4K +#define SZ_4K 0x00001000 +#endif + +#ifndef SZ_16K +#define SZ_16K 0x00004000 +#endif + +#ifdef HAVE_POLL_CONTROLLER +#define CONFIG_NET_POLL_CONTROLLER +#endif + +#ifndef __GFP_COLD +#define __GFP_COLD 0 +#endif + +#if defined(CONFIG_NET_RX_BUSY_POLL) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) +#define ENA_BUSY_POLL_SUPPORT +#endif + +/* Distribution kernel version comparison macros. + * Distribution kernel versioning format may be A.B.C-D.E.F and standard + * KERNEL_VERSION macro covers only the first 3 subversions. + * Using 20bit per subversion, as in some cases, subversion D may be a large + * number (6 digits). 
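+ * For example, checking that a UEK kernel is at least 4.1.12-124.43.1 is
+ * written as ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1): A.B.C and D.E.F
+ * are each packed into one value, 20 bits per subversion, and the two packed
+ * values are then compared in order.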
+ */ +#define ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) ((SV1 << 40) | (SV2 << 20) | (SV3)) +#define ENA_KERNEL_VERSION_MAJOR(SV1, SV2, SV3) ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) +#define ENA_KERNEL_VERSION_MINOR(SV1, SV2, SV3) ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) + +#define ENA_KERNEL_VERSION_GTE(SV1, SV2, SV3, SV4, SV5, SV6) \ + ((ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) > \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3))) || \ + (ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) == \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3)) && \ + ENA_KERNEL_VERSION_MINOR(ENA_KERNEL_SUBVERSION_4, ENA_KERNEL_SUBVERSION_5, ENA_KERNEL_SUBVERSION_6) >= \ + ENA_KERNEL_VERSION_MINOR((SV4), (SV5), (SV6)))) + +#define ENA_KERNEL_VERSION_LTE(SV1, SV2, SV3, SV4, SV5, SV6) \ + ((ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) < \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3))) || \ + (ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) == \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3)) && \ + ENA_KERNEL_VERSION_MINOR(ENA_KERNEL_SUBVERSION_4, ENA_KERNEL_SUBVERSION_5, ENA_KERNEL_SUBVERSION_6) <= \ + ENA_KERNEL_VERSION_MINOR((SV4), (SV5), (SV6)))) + +/******************************************************************************/ +/************************** Ubuntu macros *************************************/ +/******************************************************************************/ + +/* Ubuntu Release ABI is the 4th digit of their kernel version. You can find + * it in /usr/src/linux/$(uname -r)/include/generated/utsrelease.h for new + * enough versions of Ubuntu. Otherwise you can simply see it in the output of + * uname as the 4th digit of the kernel. The UTS_UBUNTU_RELEASE_ABI is not in + * the linux-source package, but in the linux-headers package. It begins to + * appear in later releases of 14.04 and 14.10. + * + * Ex: + * + * $uname -r + * 3.13.0-45-generic + * ABI is 45 + * + * + * $uname -r + * 3.16.0-23-generic + * ABI is 23 + */ +#ifdef UTS_UBUNTU_RELEASE_ABI + +#if UTS_UBUNTU_RELEASE_ABI > 255 +#undef UTS_UBUNTU_RELEASE_ABI +#define UTS_UBUNTU_RELEASE_ABI 0 +#endif /* UTS_UBUNTU_RELEASE_ABI > 255 */ + +/* Ubuntu does not provide actual release version macro, so we use the kernel + * version plus the ABI to generate a unique version code specific to Ubuntu. + * In addition, we mask the lower 8 bits of LINUX_VERSION_CODE in order to + * ignore differences in sublevel which are not important since we have the + * ABI value. Otherwise, it becomes impossible to correlate ABI to version for + * ordering checks. + */ +#define UBUNTU_VERSION_CODE (((LINUX_VERSION_CODE & ~0xFF) << 8) + (UTS_UBUNTU_RELEASE_ABI)) + +#endif /* UTS_UBUNTU_RELEASE_ABI */ + +/* Note that the 3rd digit is always zero, and will be ignored. This is + * because Ubuntu kernels are based on x.y.0-ABI values, and while their linux + * version codes are 3 digit, this 3rd digit is superseded by the ABI value. 
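+ * For example, on a 3.13.0-45-generic kernel the sublevel is masked off, the
+ * remaining version code is shifted left by 8 bits and the ABI is added, so
+ * UBUNTU_VERSION_CODE becomes 0x030d002d and compares equal to
+ * UBUNTU_VERSION(3,13,0,45).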
+ */ +#define UBUNTU_VERSION(a,b,c,d) ((KERNEL_VERSION(a,b,0) << 8) + (d)) + +/******************************************************************************/ +/**************************** SuSE macros *************************************/ +/******************************************************************************/ + +/* SuSE version macro is the same as Linux kernel version */ +#ifndef SLE_VERSION +#define SLE_VERSION(a,b,c) KERNEL_VERSION(a,b,c) +#endif +#ifdef CONFIG_SUSE_KERNEL +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 14) +#include +#endif +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,28) ) +/* SLES12 is at least 3.12.28+ based */ +#define SLE_VERSION_CODE SLE_VERSION(12,0,0) +#endif /* LINUX_VERSION_CODE == KERNEL_VERSION(x,y,z) */ +#endif /* CONFIG_SUSE_KERNEL */ +#ifndef SLE_VERSION_CODE +#define SLE_VERSION_CODE 0 +#endif /* SLE_VERSION_CODE */ +#ifndef SUSE_VERSION +#define SUSE_VERSION 0 +#endif /* SUSE_VERSION */ + +/******************************************************************************/ +/**************************** RHEL macros *************************************/ +/******************************************************************************/ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif +#ifndef AX_RELEASE_VERSION +#define AX_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif + +#ifndef AX_RELEASE_CODE +#define AX_RELEASE_CODE 0 +#endif + +#ifndef RHEL_RELEASE_CODE +#define RHEL_RELEASE_CODE 0 +#endif + +#if (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,0)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,0) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,1)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,1) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,2)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,3) +#endif + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,6)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RHEL6_NET_DEVICE_OPS_EXT +#endif + +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,4)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT +#endif /* RHEL >= 6.4 && RHEL < 7.0 */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) || \ + (SLE_VERSION_CODE && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,48))) +#define HAVE_MTU_MIN_MAX_IN_NET_DEVICE +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) || \ + (RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) || \ + (SLE_VERSION_CODE && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,50))) +#define NDO_GET_STATS_64_V2 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,5)) +#include +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ) +/* The function netif_set_real_num_tx_queues() doesn't return value for + * kernels < 2.6.37 + */ +static inline int _kc_netif_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) +{ + netif_set_real_num_tx_queues(dev, txq); + return 0; +} +#define netif_set_real_num_tx_queues(dev, txq) \ + _kc_netif_set_real_num_tx_queues(dev, txq) + +#endif /* < 2.6.37 */ + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ) +#if !(RHEL_RELEASE_CODE >= 
RHEL_RELEASE_VERSION(6,5)) +typedef u32 netdev_features_t; +#endif +#undef PCI_EXP_TYPE_RC_EC +#define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */ +#ifndef CONFIG_BQL +#define netdev_tx_completed_queue(_q, _p, _b) do {} while (0) +#define netdev_completed_queue(_n, _p, _b) do {} while (0) +#define netdev_tx_sent_queue(_q, _b) do {} while (0) +#define netdev_sent_queue(_n, _b) do {} while (0) +#define netdev_tx_reset_queue(_q) do {} while (0) +#define netdev_reset_queue(_n) do {} while (0) +#endif + +#endif /* < 3.3.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) ) +#ifdef NET_ADDR_RANDOM +#define eth_hw_addr_random(N) do { \ + eth_random_addr(N->dev_addr); \ + N->addr_assign_type |= NET_ADDR_RANDOM; \ + } while (0) +#else /* NET_ADDR_RANDOM */ +#define eth_hw_addr_random(N) eth_random_addr(N->dev_addr) +#endif /* NET_ADDR_RANDOM */ +#if !(RHEL_RELEASE_CODE) +/* If probe retry doesn't define, return no device */ +#define EPROBE_DEFER ENODEV +#endif +#endif /* >= 3.4.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ) +#if !(RHEL_RELEASE_CODE) +static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) +{ + const u16 *a = (const u16 *)addr1; + const u16 *b = (const u16 *)addr2; + + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; +} +#endif +#endif /* >= 3.5.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ) +#ifndef eth_random_addr +#define eth_random_addr _kc_eth_random_addr +static inline void _kc_eth_random_addr(u8 *addr) +{ + get_random_bytes(addr, ETH_ALEN); + addr[0] &= 0xfe; /* clear multicast */ + addr[0] |= 0x02; /* set local assignment */ +} +#endif +#endif /* < 3.6.0 */ + +/******************************************************************************/ +#ifndef CONFIG_NET_RX_BUSY_POLL +static inline void skb_mark_napi_id(struct sk_buff *skb, + struct napi_struct *napi) +{ + +} + +static inline void napi_hash_del(struct napi_struct *napi) +{ + +} + +static inline void napi_hash_add(struct napi_struct *napi) +{ + +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) ) +/* cpu_rmap is buggy on older version and causes dead lock */ +#ifdef CONFIG_RFS_ACCEL +#undef CONFIG_RFS_ACCEL +#endif + +#if !(RHEL_RELEASE_CODE) +static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) +{ + return index % n_rx_rings; +} +#endif +#endif /* >= 3.8.0 */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3 +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4,19,0)) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0))) || \ + (SUSE_VERSION && ((SUSE_VERSION == 12 && SUSE_PATCHLEVEL >= 5) || \ + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 1) || \ + (SUSE_VERSION > 15))) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 +#else + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) && \ + RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2))) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0) && \ + SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0))) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 +#endif + +#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) +#if defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 +#else +#define HAVE_NDO_SELECT_QUEUE_ACCEL +#endif +#endif /* >= 3.13 */ +#endif /* < 4.19 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) +#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +# define u64_stats_init(syncp) seqcount_init(syncp.seq) +#else +# define u64_stats_init(syncp) do { } while (0) +#endif + +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ + || (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) && \ + !defined(UEK3_RELEASE) +static inline void reinit_completion(struct completion *x) +{ + x->done = 0; +} +#endif /* SLE 12 */ + +#endif /* < 3.13.0 */ + +#if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) ) && \ + (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,0) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0))) \ + && !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))&& \ + !defined(UEK3_RELEASE))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) +static inline int pci_enable_msix_range(struct pci_dev *dev, + struct msix_entry *entries, + int minvec, + int maxvec) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + do { + rc = pci_enable_msix(dev, entries, nvec); + if (rc < 0) { + return rc; + } else if (rc > 0) { + if (rc < minvec) + return -ENOSPC; + nvec = rc; + } + } while (rc); + + return nvec; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +static inline void *devm_kcalloc(struct device *dev, + size_t n, size_t size, gfp_t flags) +{ + return devm_kzalloc(dev, n * size, flags | __GFP_ZERO); +} +#endif + +/*****************************************************************************/ +#if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,8) ) && \ + !RHEL_RELEASE_CODE && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) +enum pkt_hash_types { + PKT_HASH_TYPE_NONE, /* Undefined type */ + PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ + PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ + PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ +}; + +static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, + enum pkt_hash_types type) +{ + skb->l4_rxhash = (type == PKT_HASH_TYPE_L4); + skb->rxhash = hash; +} +#endif + +/*****************************************************************************/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(6,6)) \ + && !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,105)) +static inline int pci_msix_vec_count(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return -EINVAL; + + pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); + return (control & 0x7FF) + 1; +} +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,0)) +static inline void ether_addr_copy(u8 *dst, const u8 *src) +{ + memcpy(dst, 
src, 6); +} +#endif /* SLE 12 */ +#endif /* RHEL 7 */ +#endif + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6,8))) +#define napi_gro_flush(napi, flush_old) napi_gro_flush(napi) +#endif + +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,30))) || \ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) \ + && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,1)) +#else +static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return u64_stats_fetch_retry(syncp, start); +} + +static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) +{ + return u64_stats_fetch_begin(syncp); +} + +#endif + +static inline bool ena_u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) + return u64_stats_fetch_retry_irq(syncp, start); +#else + return u64_stats_fetch_retry(syncp, start); +#endif +} + +static inline unsigned int ena_u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) + return u64_stats_fetch_begin_irq(syncp); +#else + return u64_stats_fetch_begin(syncp); +#endif +} + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) + +#define smp_mb__before_atomic() smp_mb() + +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) +#undef GENMASK +#define GENMASK(h, l) (((U32_C(1) << ((h) - (l) + 1)) - 1) << (l)) +#undef GENMASK_ULL +#define GENMASK_ULL(h, l) (((U64_C(1) << ((h) - (l) + 1)) - 1) << (l)) +#endif +/*****************************************************************************/ + +#ifndef dma_rmb +#define dma_rmb rmb +#endif + +#ifndef writel_relaxed +#define writel_relaxed writel +#endif + +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ) \ + || (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) \ + || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0)) +#else +static inline void netdev_rss_key_fill(void *buffer, size_t len) +{ + get_random_bytes(buffer, len); +} +#endif + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) + +static inline void napi_schedule_irqoff(struct napi_struct *n) +{ + napi_schedule(n); +} + +static inline void __napi_schedule_irqoff(struct napi_struct *n) +{ + __napi_schedule(n); +} + +#ifndef READ_ONCE +#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var)))) +#endif +#endif /* Kernel 3.19 */ + +/*****************************************************************************/ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) \ + || (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,7)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ + || RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) \ + || (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,19,0,51)) +#else +static inline void napi_complete_done(struct napi_struct *n, int work_done) +{ + napi_complete(n); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) \ + || (defined(UBUNTU_VERSION_CODE) && \ + 
(UBUNTU_VERSION(3,13,0,126) <= UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,14,0,0))) \ + || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) + +#else + +static inline void ioremap_release(struct device *dev, void *res) +{ + iounmap(*(void __iomem **)res); +} + + +static inline void __iomem *devm_ioremap_wc(struct device *dev, + resource_size_t offset, + resource_size_t size) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = ioremap_wc(offset, size); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +#endif + +#if RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) +#define ndo_change_mtu ndo_change_mtu_rh74 +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) +#ifndef dma_zalloc_coherent +#define dma_zalloc_coherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) +#endif +#endif + +#ifndef dev_info_once +#ifdef CONFIG_PRINTK +#define dev_info_once(dev, fmt, ...) \ +do { \ + static bool __print_once __read_mostly; \ + \ + if (!__print_once) { \ + __print_once = true; \ + dev_info(dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) +#else +#define dev_info_once(dev, fmt, ...) \ +do { \ + if (0) \ + dev_info(dev, fmt, ##__VA_ARGS__); \ +} while (0) +#endif +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)) || \ + (RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 2)) +#define HAVE_NETDEV_XMIT_MORE +#endif + +#ifndef mmiowb +#define MMIOWB_NOT_DEFINED +#endif + +/* In the driver we currently only support CRC32 and Toeplitz. + * Since in kernel erlier than 4.12 the CRC32 define didn't exist + * We define it here to be XOR. Any user who wishes to select CRC32 + * as the hash function, can do so by choosing xor through ethtool. 
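+ * (e.g. "ethtool -X eth0 hfunc xor" on such kernels).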
+ */ +#ifndef ETH_RSS_HASH_CRC32 +#define ETH_RSS_HASH_CRC32 ETH_RSS_HASH_XOR +#endif + +#ifndef _ULL +#define _ULL(x) (_AC(x, ULL)) +#endif + +#ifndef ULL +#define ULL(x) (_ULL(x)) +#endif + +#ifndef BIT_ULL +#define BIT_ULL(nr) (ULL(1) << (nr)) +#endif + +#ifndef BITS_PER_TYPE +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#endif + +#ifndef DIV_ROUND_DOWN_ULL +#define DIV_ROUND_DOWN_ULL(ll, d) \ + ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) +#endif + +/* values are taken from here: https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md */ + +#if defined(CONFIG_BPF) && LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) +#define ENA_XDP_SUPPORT +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0)) || \ + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3) +#define XDP_HAS_FRAME_SZ +#define XDP_CONVERT_TO_FRAME_NAME_CHANGED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) +#define ENA_XDP_QUERY_IN_KERNEL +#endif + +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 3))) || \ + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3) + +#define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER +#endif + +#if defined(CONFIG_NET_DEVLINK) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) +#define ENA_DEVLINK_SUPPORT +#endif + +#if !defined(CONFIG_NET_DEVLINK) && !defined(CONFIG_NET_DEVLINK_MODULE) && !defined(CONFIG_MAY_USE_DEVLINK) +#define ENA_NO_DEVLINK_HEADERS +#endif + +#if defined(CONFIG_NET_DEVLINK) && \ + (KERNEL_VERSION(5, 1, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + !((SUSE_VERSION != 0) && (SUSE_VERSION == 15 && (SUSE_PATCHLEVEL < 2 || SUSE_PATCHLEVEL >= 4))) && \ + !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE > UBUNTU_VERSION(5, 16, 0, 0)) && \ + !(RHEL_RELEASE_CODE)) +#define ENA_DEVLINK_PUBLISH_REQUIRED +#endif + +#if defined(CONFIG_NET_DEVLINK) && \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || \ + (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18))) +#define ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED +#endif + +#if defined(CONFIG_NET_DEVLINK) && \ + (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))) +#define ENA_DEVLINK_RELOAD_ENABLING_REQUIRED +#endif + +#if defined(CONFIG_NET_DEVLINK) && \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0) || \ + (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18))) +#define ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT +#endif + +#if defined(CONFIG_NET_DEVLINK) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#define ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) +#define ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC +#endif + +#if (KERNEL_VERSION(5, 16, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0)) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0) && !(defined(FEDORA_RELEASE))) +#define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + 
!(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) +#define ENA_DEVLINK_CONFIGURE_AFTER_REGISTER +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) && \ + !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)) && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6, 6)))) && \ + !defined(UBUNTU_VERSION_CODE) && \ + !defined(UEK3_RELEASE) && (!defined(DEBIAN_VERSION) || DEBIAN_VERSION != 8) + +#define DO_ONCE(func, ...) \ + ({ \ + static bool ___done = false; \ + if (unlikely(!___done)) { \ + func(__VA_ARGS__); \ + ___done = true; \ + } \ + }) + +#define get_random_once(buf, nbytes) \ + DO_ONCE(get_random_bytes, (buf), (nbytes)) + +#define net_get_random_once(buf, nbytes) \ + get_random_once((buf), (nbytes)) + +/* RSS keys are 40 or 52 bytes long */ +#define NETDEV_RSS_KEY_LEN 52 +static u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +static inline void netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(netdev_rss_key)); + net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); + memcpy(buffer, netdev_rss_key, len); +} +#endif + +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) (ACCESS_ONCE(x) = val) +#endif +#ifndef READ_ONCE +#define READ_ONCE(x) ACCESS_ONCE(x) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9 ,0) +#define ENA_GENERIC_PM_OPS +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 6 ,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) +/* Linux versions 4.4.216 - 4.5 (non inclusive) back propagated + * page_ref_count() from kernel 4.6. + * Ubuntu didn't add these changes to its 4.4.* kernels. + * UEK added this function in kernel 4.1.12-124.43.1 + * Here is a figure that shows all of the cases: + * Legend: + * -------- page_ref_count() is present in the kernel + * ******** page_ref_count() is missing in the kernel + * + * Distro\Kernel 4.1.12-124.43.1 4.4.216 4.5 4.6 + * | | | | + * Upstrem kernel ***********|**************|--------|******| + * | | | | + * Ubuntu ***********|**************|********|******| + * | | | | + * UEK ***********|--------------|--------|------| + */ +#if (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) || \ + (defined(UBUNTU_VERSION_CODE)) || \ + (!defined(IS_UEK) && !defined(UBUNTU_VERSION_CODE) && \ + !(KERNEL_VERSION(4, 4, 216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))) +static inline int page_ref_count(struct page *page) +{ + return atomic_read(&page->_count); +} +#endif /* (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) ... 
*/ + +static inline void page_ref_inc(struct page *page) +{ + atomic_inc(&page->_count); +} +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +static inline struct page *dev_alloc_page(void) +{ + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; + + gfp_mask |= __GFP_COLD | __GFP_COMP; + + return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0); +} +#endif + +/* This entry might seem strange because of the #ifndef numa_mem_id(), + * but these defines were taken from the Linux kernel + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) +#ifndef numa_mem_id +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +static inline int numa_mem_id(void) +{ + return __this_cpu_read(_numa_mem_); +} +#else /* CONFIG_HAVE_MEMORYLESS_NODES */ +static inline int numa_mem_id(void) +{ + return numa_node_id(); +} +#endif /* CONFIG_HAVE_MEMORYLESS_NODES */ +#endif /* numa_mem_id */ +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) */ + +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) +#define AF_XDP_BUSY_POLL_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +#define ENA_LINEAR_FRAG_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) +#define ENA_NETDEV_LOGS_WITHOUT_RV +#endif + +#if defined(ENA_XDP_SUPPORT) && \ + (LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 49))) +static __always_inline void +xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) +{ + xdp->rxq = rxq; +#ifdef XDP_HAS_FRAME_SZ + xdp->frame_sz = frame_sz; +#endif +} + +static __always_inline void +xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, + int headroom, int data_len, const bool meta_valid) +{ + unsigned char *data = hard_start + headroom; + + xdp->data_hard_start = hard_start; + xdp->data = data; + xdp->data_end = data + data_len; + xdp->data_meta = meta_valid ? data : data + 1; +} + +#endif /* defined(ENA_XDP_SUPPORT) && (LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) && !SUSE_VERSION(...)) */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) +#define ethtool_sprintf(data, fmt, args...) 
\ + do { \ + snprintf(*data, ETH_GSTRING_LEN, fmt, ##args); \ + (*data) += ETH_GSTRING_LEN; \ + } while(0) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 13, 0) +#define ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ + !(LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 188) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0)) && \ + !(LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 251) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(5, 5, 0))) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 43)) +static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) +{ + memcpy(dev->dev_addr, addr, ETH_ALEN); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ + (defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) +#define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) || \ + (defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 5)) +#define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE +#endif + +#if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#define ENA_AF_XDP_SUPPORT +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +/* kernels older than 3.3.0 didn't have this function and + * used netif_tx_queue_stopped() for the same purpose + */ +static inline int netif_xmit_stopped(const struct netdev_queue *dev_queue) +{ + return netif_tx_queue_stopped(dev_queue); +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) +#define NAPIF_STATE_SCHED BIT(NAPI_STATE_SCHED) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0) +#define bpf_warn_invalid_xdp_action(netdev, xdp_prog, verdict) \ + bpf_warn_invalid_xdp_action(verdict) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +#define HAS_BPF_HEADER +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 7)))) +static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) +{ + if (cmp1.tv64 < cmp2.tv64) + return -1; + if (cmp1.tv64 > cmp2.tv64) + return 1; + return 0; +} +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 7)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 0)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)))) +static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) +{ + return ktime_compare(cmp1, cmp2) > 0; +} +#endif + +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) + +#if defined(ENA_PHC_INCLUDE) && ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) +#define ENA_PHC_SUPPORT +#endif /* ENA_PHC_SUPPORT */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 1, 0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 2)) +#define ENA_PHC_SUPPORT_GETTIME64 +#endif /* 
ENA_PHC_SUPPORT_GETTIME64 */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) || \ + (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 7)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(8, 0))) +#define ENA_PHC_SUPPORT_GETTIME64_EXTENDED +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6, 2, 0))) +#define ENA_PHC_SUPPORT_ADJFREQ +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4)))) +#define ptp_clock_register(info, parent) ptp_clock_register(info) +#endif + +#endif /* CONFIG_PTP_1588_CLOCK */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 2))) +static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, + unsigned int length) +{ + return netdev_alloc_skb_ip_align(napi->dev, length); +} +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 7))) +static inline ssize_t strscpy(char *dest, const char *src, size_t count) +{ + return (ssize_t)strlcpy(dest, src, count); +} +#endif + +static inline void ena_netif_napi_add(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int)) +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0)) && \ + !(RHEL_RELEASE_CODE && \ + ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 8)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0))) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 2))) +#ifndef NAPI_POLL_WEIGHT +#define NAPI_POLL_WEIGHT 64 +#endif + netif_napi_add(dev, napi, poll, NAPI_POLL_WEIGHT); +#else + netif_napi_add(dev, napi, poll); +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */ +} + +#if defined(ENA_DEVLINK_SUPPORT) && LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) +#define devl_param_driverinit_value_get devlink_param_driverinit_value_get +#define devl_param_driverinit_value_set devlink_param_driverinit_value_set +#endif + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 105, 0, 0)) +static inline void dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir, false); +} +#endif /* RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4)) */ + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 9)) && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 0)) && \ + (LINUX_VERSION_CODE != KERNEL_VERSION(4, 14, 0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 13)) +#define ENA_DMA_ATTR_SKIP_CPU_SYNC (1 << DMA_ATTR_SKIP_CPU_SYNC) +#elif (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(6, 10))) +#define ENA_DMA_ATTR_SKIP_CPU_SYNC 0 +#else +#define ENA_DMA_ATTR_SKIP_CPU_SYNC DMA_ATTR_SKIP_CPU_SYNC +#endif + +static inline void ena_dma_unmap_page_attrs(struct device *dev, + 
dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 9)) && \ + (LINUX_VERSION_CODE != KERNEL_VERSION(4, 14, 0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 13)) + struct dma_attrs dma_attrs; + + init_dma_attrs(&dma_attrs); + dma_attrs.flags[0] = attrs; + dma_unmap_page_attrs(dev, addr, size, dir, &dma_attrs); +#else + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +#endif +} + +#endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/ena/net_dim.c b/drivers/amazon/net/ena/net_dim.c new file mode 100644 index 0000000000000..af46903cd53e2 --- /dev/null +++ b/drivers/amazon/net/ena/net_dim.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include "dim.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +/* + * Net DIM profiles: + * There are different set of profiles for each CQ period mode. + * There are different set of profiles for RX/TX CQs. + * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + +struct dim_cq_moder +net_dim_get_rx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_rx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_rx_moderation(cq_period_mode, profile_ix); +} + +struct dim_cq_moder +net_dim_get_tx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_tx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? 
+ NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_tx_moderation(cq_period_mode, profile_ix); +} + +static int net_dim_step(struct dim *dim) +{ + if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) + return DIM_TOO_TIRED; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + break; + case DIM_GOING_LEFT: + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + break; + } + + dim->tired++; + return DIM_STEPPED; +} + +static void net_dim_exit_parking(struct dim *dim) +{ + dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT; + net_dim_step(dim); +} + +static int net_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + if (!prev->bpms) + return curr->bpms ? DIM_STATS_BETTER : DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) + return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->ppms) + return curr->ppms ? DIM_STATS_BETTER : + DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) + return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->epms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) + return (curr->epms < prev->epms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_state = dim->tune_state; + int prev_ix = dim->profile_ix; + int stats_res; + int step_res; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_SAME) + net_dim_exit_parking(dim); + break; + + case DIM_PARKING_TIRED: + dim->tired--; + if (!dim->tired) + net_dim_exit_parking(dim); + break; + + case DIM_GOING_RIGHT: + case DIM_GOING_LEFT: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_BETTER) + dim_turn(dim); + + if (dim_on_top(dim)) { + dim_park_on_top(dim); + break; + } + + step_res = net_dim_step(dim); + switch (step_res) { + case DIM_ON_EDGE: + dim_park_on_top(dim); + break; + case DIM_TOO_TIRED: + dim_park_tired(dim); + break; + } + + break; + } + + if (prev_state != DIM_PARKING_ON_TOP || + dim->tune_state != DIM_PARKING_ON_TOP) + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void net_dim(struct dim *dim, struct dim_sample end_sample) +{ + struct dim_stats curr_stats; + u16 nevents; + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = BIT_GAP(BITS_PER_TYPE(u16), + end_sample.event_ctr, + dim->start_sample.event_ctr); + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (net_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + /* fall through */ + case DIM_START_MEASURE: + dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr, + end_sample.byte_ctr, &dim->start_sample); + dim->state = DIM_MEASURE_IN_PROGRESS; + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ diff --git a/drivers/amazon/net/igb_uio/Makefile b/drivers/amazon/net/igb_uio/Makefile new file mode 100644 index 0000000000000..ebced2786f7c8 --- /dev/null +++ 
b/drivers/amazon/net/igb_uio/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_AMAZON_IGB_UIO) += igb_uio.o diff --git a/drivers/amazon/net/igb_uio/compat.h b/drivers/amazon/net/igb_uio/compat.h new file mode 100644 index 0000000000000..8dbb896ae1185 --- /dev/null +++ b/drivers/amazon/net/igb_uio/compat.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Minimal wrappers to allow compiling igb_uio on older kernels. + */ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a, b) (((a) << 8) + (b)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +#define pci_cfg_access_lock pci_block_user_cfg_access +#define pci_cfg_access_unlock pci_unblock_user_cfg_access +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +#define HAVE_PTE_MASK_PAGE_IOMAP +#endif + +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +/* + * for kernels < 2.6.38 and backported patch that moves MSI-X entry definition + * to pci_regs.h Those kernels has PCI_MSIX_ENTRY_SIZE defined but not + * PCI_MSIX_ENTRY_CTRL_MASKBIT + */ +#ifndef PCI_MSIX_ENTRY_CTRL_MASKBIT +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(5, 9))) + +static int pci_num_vf(struct pci_dev *dev) +{ + struct iov { + int pos; + int nres; + u32 cap; + u16 ctrl; + u16 total; + u16 initial; + u16 nr_virtfn; + } *iov = (struct iov *)dev->sriov; + + if (!dev->is_physfn) + return 0; + + return iov->nr_virtfn; +} + +#endif /* < 2.6.34 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 39) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) + +#define kstrtoul strict_strtoul + +#endif /* < 2.6.39 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 3))) + +/* Check if INTX works to control irq's. 
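+ * (compat fallback for kernels that do not provide pci_intx_mask_supported()).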
+ * Set's INTX_DISABLE flag and reads it back + */ +static bool pci_intx_mask_supported(struct pci_dev *pdev) +{ + bool mask_supported = false; + uint16_t orig, new; + + pci_block_user_cfg_access(pdev); + pci_read_config_word(pdev, PCI_COMMAND, &orig); + pci_write_config_word(pdev, PCI_COMMAND, + orig ^ PCI_COMMAND_INTX_DISABLE); + pci_read_config_word(pdev, PCI_COMMAND, &new); + + if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) { + dev_err(&pdev->dev, "Command register changed from " + "0x%x to 0x%x: driver or hardware bug?\n", orig, new); + } else if ((new ^ orig) & PCI_COMMAND_INTX_DISABLE) { + mask_supported = true; + pci_write_config_word(pdev, PCI_COMMAND, orig); + } + pci_unblock_user_cfg_access(pdev); + + return mask_supported; +} + +static bool pci_check_and_mask_intx(struct pci_dev *pdev) +{ + bool pending; + uint32_t status; + + pci_block_user_cfg_access(pdev); + pci_read_config_dword(pdev, PCI_COMMAND, &status); + + /* interrupt is not ours, goes to out */ + pending = (((status >> 16) & PCI_STATUS_INTERRUPT) != 0); + if (pending) { + uint16_t old, new; + + old = status; + if (status != 0) + new = old & (~PCI_COMMAND_INTX_DISABLE); + else + new = old | PCI_COMMAND_INTX_DISABLE; + + if (old != new) + pci_write_config_word(pdev, PCI_COMMAND, new); + } + pci_unblock_user_cfg_access(pdev); + + return pending; +} + +#endif /* < 3.3.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +#define HAVE_PCI_IS_BRIDGE_API 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +#define HAVE_MSI_LIST_IN_GENERIC_DEVICE 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define HAVE_PCI_MSI_MASK_IRQ 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) +#define HAVE_ALLOC_IRQ_VECTORS 1 +#endif + +static inline bool igbuio_kernel_is_locked_down(void) +{ +#ifdef CONFIG_LOCK_DOWN_KERNEL +#ifdef CONFIG_LOCK_DOWN_IN_EFI_SECURE_BOOT + return kernel_is_locked_down(NULL); +#elif defined(CONFIG_EFI_SECURE_BOOT_LOCK_DOWN) + return kernel_is_locked_down(); +#else + return false; +#endif +#else + return false; +#endif +} diff --git a/drivers/amazon/net/igb_uio/igb_uio.c b/drivers/amazon/net/igb_uio/igb_uio.c new file mode 100644 index 0000000000000..ea439d131de1a --- /dev/null +++ b/drivers/amazon/net/igb_uio/igb_uio.c @@ -0,0 +1,674 @@ +// SPDX-License-Identifier: GPL-2.0 +/*- + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * These enum and macro definitions are copied from the + * file rte_pci_dev_features.h + */ +enum rte_intr_mode { + RTE_INTR_MODE_NONE = 0, + RTE_INTR_MODE_LEGACY, + RTE_INTR_MODE_MSI, + RTE_INTR_MODE_MSIX +}; +#define RTE_INTR_MODE_NONE_NAME "none" +#define RTE_INTR_MODE_LEGACY_NAME "legacy" +#define RTE_INTR_MODE_MSI_NAME "msi" +#define RTE_INTR_MODE_MSIX_NAME "msix" + + +#include "compat.h" + +/** + * A structure describing the private information for a uio device. 
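+ * It bundles the uio_info registered with the UIO core, the backing pci_dev,
+ * the interrupt mode that was actually enabled for the device, and an atomic
+ * reference count.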
+ */ +struct rte_uio_pci_dev { + struct uio_info info; + struct pci_dev *pdev; + enum rte_intr_mode mode; + atomic_t refcnt; +}; + +static int wc_activate; +static char *intr_mode; +static enum rte_intr_mode igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; +/* sriov sysfs */ +static ssize_t +show_max_vfs(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 10, "%u\n", dev_num_vf(dev)); +} + +static ssize_t +store_max_vfs(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int err = 0; + unsigned long max_vfs; + struct pci_dev *pdev = to_pci_dev(dev); + + if (0 != kstrtoul(buf, 0, &max_vfs)) + return -EINVAL; + + if (0 == max_vfs) + pci_disable_sriov(pdev); + else if (0 == pci_num_vf(pdev)) + err = pci_enable_sriov(pdev, max_vfs); + else /* do nothing if change max_vfs number */ + err = -EINVAL; + + return err ? err : count; +} + +static DEVICE_ATTR(max_vfs, S_IRUGO | S_IWUSR, show_max_vfs, store_max_vfs); + +static struct attribute *dev_attrs[] = { + &dev_attr_max_vfs.attr, + NULL, +}; + +static const struct attribute_group dev_attr_grp = { + .attrs = dev_attrs, +}; + +#ifndef HAVE_PCI_MSI_MASK_IRQ +/* + * It masks the msix on/off of generating MSI-X messages. + */ +static void +igbuio_msix_mask_irq(struct msi_desc *desc, s32 state) +{ + u32 mask_bits = desc->masked; + unsigned int offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL; + + if (state != 0) + mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; + else + mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; + + if (mask_bits != desc->masked) { + writel(mask_bits, desc->mask_base + offset); + readl(desc->mask_base); + desc->masked = mask_bits; + } +} + +/* + * It masks the msi on/off of generating MSI messages. + */ +static void +igbuio_msi_mask_irq(struct pci_dev *pdev, struct msi_desc *desc, int32_t state) +{ + u32 mask_bits = desc->masked; + u32 offset = desc->irq - pdev->irq; + u32 mask = 1 << offset; + + if (!desc->msi_attrib.maskbit) + return; + + if (state != 0) + mask_bits &= ~mask; + else + mask_bits |= mask; + + if (mask_bits != desc->masked) { + pci_write_config_dword(pdev, desc->mask_pos, mask_bits); + desc->masked = mask_bits; + } +} + +static void +igbuio_mask_irq(struct pci_dev *pdev, enum rte_intr_mode mode, s32 irq_state) +{ + struct msi_desc *desc; + struct list_head *msi_list; + +#ifdef HAVE_MSI_LIST_IN_GENERIC_DEVICE + msi_list = &pdev->dev.msi_list; +#else + msi_list = &pdev->msi_list; +#endif + + if (mode == RTE_INTR_MODE_MSIX) { + list_for_each_entry(desc, msi_list, list) + igbuio_msix_mask_irq(desc, irq_state); + } else if (mode == RTE_INTR_MODE_MSI) { + list_for_each_entry(desc, msi_list, list) + igbuio_msi_mask_irq(pdev, desc, irq_state); + } +} +#endif + +/** + * This is the irqcontrol callback to be registered to uio_info. + * It can be used to disable/enable interrupt from user space processes. + * + * @param info + * pointer to uio_info. + * @param irq_state + * state value. 1 to enable interrupt, 0 to disable interrupt. + * + * @return + * - On success, 0. + * - On failure, a negative value. 
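+ *
+ * For MSI/MSI-X the mask is applied via pci_msi_mask_irq()/pci_msi_unmask_irq()
+ * when available, or by toggling the per-vector mask bits directly otherwise;
+ * for legacy INTx the command-register INTX_DISABLE bit is toggled.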
+ */ +static int +igbuio_pci_irqcontrol(struct uio_info *info, s32 irq_state) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *pdev = udev->pdev; + +#ifdef HAVE_PCI_MSI_MASK_IRQ + struct irq_data *irq = irq_get_irq_data(udev->info.irq); +#endif + + pci_cfg_access_lock(pdev); + + if (udev->mode == RTE_INTR_MODE_MSIX || udev->mode == RTE_INTR_MODE_MSI) { +#ifdef HAVE_PCI_MSI_MASK_IRQ + if (irq_state == 1) + pci_msi_unmask_irq(irq); + else + pci_msi_mask_irq(irq); +#else + igbuio_mask_irq(pdev, udev->mode, irq_state); +#endif + } + + if (udev->mode == RTE_INTR_MODE_LEGACY) + pci_intx(pdev, !!irq_state); + + pci_cfg_access_unlock(pdev); + + return 0; +} + +/** + * This is interrupt handler which will check if the interrupt is for the right device. + * If yes, disable it here and will be enable later. + */ +static irqreturn_t +igbuio_pci_irqhandler(int irq, void *dev_id) +{ + struct rte_uio_pci_dev *udev = (struct rte_uio_pci_dev *)dev_id; + struct uio_info *info = &udev->info; + + /* Legacy mode need to mask in hardware */ + if (udev->mode == RTE_INTR_MODE_LEGACY && + !pci_check_and_mask_intx(udev->pdev)) + return IRQ_NONE; + + uio_event_notify(info); + + /* Message signal mode, no share IRQ and automasked */ + return IRQ_HANDLED; +} + +static int +igbuio_pci_enable_interrupts(struct rte_uio_pci_dev *udev) +{ + int err = 0; +#ifndef HAVE_ALLOC_IRQ_VECTORS + struct msix_entry msix_entry; +#endif + + switch (igbuio_intr_mode_preferred) { + case RTE_INTR_MODE_MSIX: + /* Only 1 msi-x vector needed */ +#ifndef HAVE_ALLOC_IRQ_VECTORS + msix_entry.entry = 0; + if (pci_enable_msix(udev->pdev, &msix_entry, 1) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = msix_entry.vector; + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSIX) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#endif + + /* falls through - to MSI */ + case RTE_INTR_MODE_MSI: +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (pci_enable_msi(udev->pdev) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSI) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#endif + /* falls through - to INTX */ + case RTE_INTR_MODE_LEGACY: + if (pci_intx_mask_supported(udev->pdev)) { + dev_dbg(&udev->pdev->dev, "using INTX"); + udev->info.irq_flags = IRQF_SHARED | IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_LEGACY; + break; + } + dev_notice(&udev->pdev->dev, "PCI INTX mask not supported\n"); + /* falls through - to no IRQ */ + case RTE_INTR_MODE_NONE: + udev->mode = RTE_INTR_MODE_NONE; + udev->info.irq = UIO_IRQ_NONE; + break; + + default: + dev_err(&udev->pdev->dev, "invalid IRQ mode %u", + igbuio_intr_mode_preferred); + udev->info.irq = UIO_IRQ_NONE; + err = -EINVAL; + } + + if (udev->info.irq != UIO_IRQ_NONE) + err = request_irq(udev->info.irq, igbuio_pci_irqhandler, + udev->info.irq_flags, udev->info.name, + udev); + dev_info(&udev->pdev->dev, "uio device registered with irq %ld\n", + udev->info.irq); + + return err; +} + +static 
void +igbuio_pci_disable_interrupts(struct rte_uio_pci_dev *udev) +{ + if (udev->info.irq) { + free_irq(udev->info.irq, udev); + udev->info.irq = 0; + } + +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (udev->mode == RTE_INTR_MODE_MSIX) + pci_disable_msix(udev->pdev); + if (udev->mode == RTE_INTR_MODE_MSI) + pci_disable_msi(udev->pdev); +#else + if (udev->mode == RTE_INTR_MODE_MSIX || + udev->mode == RTE_INTR_MODE_MSI) + pci_free_irq_vectors(udev->pdev); +#endif +} + + +/** + * This gets called while opening uio device file. + */ +static int +igbuio_pci_open(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + int err; + + if (atomic_inc_return(&udev->refcnt) != 1) + return 0; + + /* set bus master, which was cleared by the reset function */ + pci_set_master(dev); + + /* enable interrupts */ + err = igbuio_pci_enable_interrupts(udev); + if (err) { + atomic_dec(&udev->refcnt); + dev_err(&dev->dev, "Enable interrupt fails\n"); + } + return err; +} + +static int +igbuio_pci_release(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + + if (atomic_dec_and_test(&udev->refcnt)) { + /* disable interrupts */ + igbuio_pci_disable_interrupts(udev); + + /* stop the device from further DMA */ + pci_clear_master(dev); + } + + return 0; +} + +/* Remap pci resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_iomem(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + void *internal_addr; + + if (n >= ARRAY_SIZE(info->mem)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -1; + if (wc_activate == 0) { + internal_addr = ioremap(addr, len); + if (internal_addr == NULL) + return -1; + } else { + internal_addr = NULL; + } + info->mem[n].name = name; + info->mem[n].addr = addr; + info->mem[n].internal_addr = internal_addr; + info->mem[n].size = len; + info->mem[n].memtype = UIO_MEM_PHYS; + return 0; +} + +/* Get pci port io resources described by bar #pci_bar in uio resource n. 
*/ +static int +igbuio_pci_setup_ioport(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + + if (n >= ARRAY_SIZE(info->port)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -EINVAL; + + info->port[n].name = name; + info->port[n].start = addr; + info->port[n].size = len; + info->port[n].porttype = UIO_PORT_X86; + + return 0; +} + +/* Unmap previously ioremap'd resources */ +static void +igbuio_pci_release_iomem(struct uio_info *info) +{ + int i; + + for (i = 0; i < MAX_UIO_MAPS; i++) { + if (info->mem[i].internal_addr) + iounmap(info->mem[i].internal_addr); + } +} + +static int +igbuio_setup_bars(struct pci_dev *dev, struct uio_info *info) +{ + int i, iom, iop, ret; + unsigned long flags; + static const char *bar_names[PCI_STD_RESOURCE_END + 1] = { + "BAR0", + "BAR1", + "BAR2", + "BAR3", + "BAR4", + "BAR5", + }; + + iom = 0; + iop = 0; + + for (i = 0; i < ARRAY_SIZE(bar_names); i++) { + if (pci_resource_len(dev, i) != 0 && + pci_resource_start(dev, i) != 0) { + flags = pci_resource_flags(dev, i); + if (flags & IORESOURCE_MEM) { + ret = igbuio_pci_setup_iomem(dev, info, iom, + i, bar_names[i]); + if (ret != 0) + return ret; + iom++; + } else if (flags & IORESOURCE_IO) { + ret = igbuio_pci_setup_ioport(dev, info, iop, + i, bar_names[i]); + if (ret != 0) + return ret; + iop++; + } + } + } + + return (iom != 0 || iop != 0) ? ret : -ENOENT; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0) +static int __devinit +#else +static int +#endif +igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct rte_uio_pci_dev *udev; + dma_addr_t map_dma_addr; + void *map_addr; + int err; + +#ifdef HAVE_PCI_IS_BRIDGE_API + if (pci_is_bridge(dev)) { + dev_warn(&dev->dev, "Ignoring PCI bridge device\n"); + return -ENODEV; + } +#endif + + udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL); + if (!udev) + return -ENOMEM; + + /* + * enable device: ask low-level code to enable I/O and + * memory + */ + err = pci_enable_device(dev); + if (err != 0) { + dev_err(&dev->dev, "Cannot enable PCI device\n"); + goto fail_free; + } + + /* enable bus mastering on the device */ + pci_set_master(dev); + + /* remap IO memory */ + err = igbuio_setup_bars(dev, &udev->info); + if (err != 0) + goto fail_release_iomem; + + /* set 64-bit DMA mask */ + err = pci_set_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set DMA mask\n"); + goto fail_release_iomem; + } + + err = pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set consistent DMA mask\n"); + goto fail_release_iomem; + } + + /* fill uio infos */ + udev->info.name = "igb_uio"; + udev->info.version = "0.1"; + udev->info.irqcontrol = igbuio_pci_irqcontrol; + udev->info.open = igbuio_pci_open; + udev->info.release = igbuio_pci_release; + udev->info.priv = udev; + udev->pdev = dev; + atomic_set(&udev->refcnt, 0); + + err = sysfs_create_group(&dev->dev.kobj, &dev_attr_grp); + if (err != 0) + goto fail_release_iomem; + + /* register uio driver */ + err = uio_register_device(&dev->dev, &udev->info); + if (err != 0) + goto fail_remove_group; + + pci_set_drvdata(dev, udev); + + /* + * Doing a harmless dma mapping for attaching the device to + * the iommu identity mapping if kernel boots with iommu=pt. + * Note this is not a problem if no IOMMU at all. 
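+ * The 1024-byte buffer allocated below is zeroed and then freed again
+ * immediately; only the side effect of creating the IOMMU mapping matters.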
+ */ + map_addr = dma_alloc_coherent(&dev->dev, 1024, &map_dma_addr, + GFP_KERNEL); + if (map_addr) + memset(map_addr, 0, 1024); + + if (!map_addr) + dev_info(&dev->dev, "dma mapping failed\n"); + else { + dev_info(&dev->dev, "mapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + + dma_free_coherent(&dev->dev, 1024, map_addr, map_dma_addr); + dev_info(&dev->dev, "unmapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + } + + return 0; + +fail_remove_group: + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); +fail_release_iomem: + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); +fail_free: + kfree(udev); + + return err; +} + +static void +igbuio_pci_remove(struct pci_dev *dev) +{ + struct rte_uio_pci_dev *udev = pci_get_drvdata(dev); + + igbuio_pci_release(&udev->info, NULL); + + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); + uio_unregister_device(&udev->info); + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); + pci_set_drvdata(dev, NULL); + kfree(udev); +} + +static int +igbuio_config_intr_mode(char *intr_str) +{ + if (!intr_str) { + pr_info("Use MSIX interrupt by default\n"); + return 0; + } + + if (!strcmp(intr_str, RTE_INTR_MODE_MSIX_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; + pr_info("Use MSIX interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_MSI_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSI; + pr_info("Use MSI interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_LEGACY_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_LEGACY; + pr_info("Use legacy interrupt\n"); + } else { + pr_info("Error: bad parameter - %s\n", intr_str); + return -EINVAL; + } + + return 0; +} + +static struct pci_driver igbuio_pci_driver = { + .name = "igb_uio", + .id_table = NULL, + .probe = igbuio_pci_probe, + .remove = igbuio_pci_remove, +}; + +static int __init +igbuio_pci_init_module(void) +{ + int ret; + + if (igbuio_kernel_is_locked_down()) { + pr_err("Not able to use module, kernel lock down is enabled\n"); + return -EINVAL; + } + + if (wc_activate != 0) + pr_info("wc_activate is set\n"); + + ret = igbuio_config_intr_mode(intr_mode); + if (ret < 0) + return ret; + + return pci_register_driver(&igbuio_pci_driver); +} + +static void __exit +igbuio_pci_exit_module(void) +{ + pci_unregister_driver(&igbuio_pci_driver); +} + +module_init(igbuio_pci_init_module); +module_exit(igbuio_pci_exit_module); + +module_param(intr_mode, charp, S_IRUGO); +MODULE_PARM_DESC(intr_mode, +"igb_uio interrupt mode (default=msix):\n" +" " RTE_INTR_MODE_MSIX_NAME " Use MSIX interrupt\n" +" " RTE_INTR_MODE_MSI_NAME " Use MSI interrupt\n" +" " RTE_INTR_MODE_LEGACY_NAME " Use Legacy interrupt\n" +"\n"); + +module_param(wc_activate, int, 0); +MODULE_PARM_DESC(wc_activate, +"Activate support for write combining (WC) (default=0)\n" +" 0 - disable\n" +" other - enable\n"); + +MODULE_DESCRIPTION("UIO driver for Intel IGB PCI cards"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/amazon/scsi/Makefile b/drivers/amazon/scsi/Makefile new file mode 100644 index 0000000000000..760bfe47e4cb5 --- /dev/null +++ b/drivers/amazon/scsi/Makefile @@ -0,0 +1,4 @@ +# +# Amazon Driver Updates +# +obj-$(CONFIG_AMAZON_SCSI_SMARTPQI) += smartpqi/ diff --git a/drivers/amazon/scsi/smartpqi/Makefile b/drivers/amazon/scsi/smartpqi/Makefile new file mode 100644 index 0000000000000..f4c5373e4513b --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/Makefile @@ -0,0 +1,7 @@ +# 
SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_AMAZON_SCSI_SMARTPQI) += smartpqi.o +smartpqi-objs := smartpqi_init.o smartpqi_sis.o smartpqi_sas_transport.o smartpqi_kernel_compat.o +EXTRA_CFLAGS += -DKCLASS5D +EXTRA_CFLAGS += -DKFEATURE_HAS_SCSI_CMD_PRIV -DKFEATURE_HAS_HOST_TAGSET_SUPPORT + diff --git a/drivers/amazon/scsi/smartpqi/smartpqi.h b/drivers/amazon/scsi/smartpqi/smartpqi.h new file mode 100644 index 0000000000000..942682598a107 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi.h @@ -0,0 +1,1719 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#if !defined(_SMARTPQI_H) +#define _SMARTPQI_H + +#define TORTUGA 0 + +#include +#include + +#pragma pack(1) + +#define PQI_DEVICE_SIGNATURE "PQI DREG" + +/* This structure is defined by the PQI specification. */ +struct pqi_device_registers { + __le64 signature; + u8 function_and_status_code; + u8 reserved[7]; + u8 max_admin_iq_elements; + u8 max_admin_oq_elements; + u8 admin_iq_element_length; /* in 16-byte units */ + u8 admin_oq_element_length; /* in 16-byte units */ + __le16 max_reset_timeout; /* in 100-millisecond units */ + u8 reserved1[2]; + __le32 legacy_intx_status; + __le32 legacy_intx_mask_set; + __le32 legacy_intx_mask_clear; + u8 reserved2[28]; + __le32 device_status; + u8 reserved3[4]; + __le64 admin_iq_pi_offset; + __le64 admin_oq_ci_offset; + __le64 admin_iq_element_array_addr; + __le64 admin_oq_element_array_addr; + __le64 admin_iq_ci_addr; + __le64 admin_oq_pi_addr; + u8 admin_iq_num_elements; + u8 admin_oq_num_elements; + __le16 admin_queue_int_msg_num; + u8 reserved4[4]; + __le32 device_error; + u8 reserved5[4]; + __le64 error_details; + __le32 device_reset; + __le32 power_action; + u8 reserved6[104]; +}; + +/* + * controller registers + * + * These are defined by the Microchip implementation. + * + * Some registers (those named sis_*) are only used when in + * legacy SIS mode before we transition the controller into + * PQI mode. There are a number of other SIS mode registers, + * but we don't use them, so only the SIS registers that we + * care about are defined here. The offsets mentioned in the + * comments are the offsets from the PCIe BAR 0. 
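+ *
+ * The reserved fields below are only padding that keeps each named
+ * register at its documented BAR 0 offset.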
+ */ +struct pqi_ctrl_registers { + u8 reserved[0x20]; + __le32 sis_host_to_ctrl_doorbell; /* 20h */ + u8 reserved1[0x34 - (0x20 + sizeof(__le32))]; + __le32 sis_interrupt_mask; /* 34h */ + u8 reserved2[0x9c - (0x34 + sizeof(__le32))]; + __le32 sis_ctrl_to_host_doorbell; /* 9Ch */ + u8 reserved3[0xa0 - (0x9c + sizeof(__le32))]; + __le32 sis_ctrl_to_host_doorbell_clear; /* A0h */ + u8 reserved4[0xb0 - (0xa0 + sizeof(__le32))]; + __le32 sis_driver_scratch; /* B0h */ + __le32 sis_product_identifier; /* B4h */ + u8 reserved5[0xbc - (0xb4 + sizeof(__le32))]; + __le32 sis_firmware_status; /* BCh */ + u8 reserved6[0xcc - (0xbc + sizeof(__le32))]; + __le32 sis_ctrl_shutdown_reason_code; /* CCh */ + u8 reserved7[0x1000 - (0xcc + sizeof(__le32))]; + __le32 sis_mailbox[8]; /* 1000h */ + u8 reserved8[0x4000 - (0x1000 + (sizeof(__le32) * 8))]; + /* + * The PQI spec states that the PQI registers should be at + * offset 0 from the PCIe BAR 0. However, we can't map + * them at offset 0 because that would break compatibility + * with the SIS registers. So we map them at offset 4000h. + */ + struct pqi_device_registers pqi_registers; /* 4000h */ +}; + +#define PQI_DEVICE_REGISTERS_OFFSET 0x4000 + +/* shutdown reasons for taking the controller offline */ +enum pqi_ctrl_shutdown_reason { + PQI_IQ_NOT_DRAINED_TIMEOUT = 1, + PQI_LUN_RESET_TIMEOUT = 2, + PQI_IO_PENDING_POST_LUN_RESET_TIMEOUT = 3, + PQI_NO_HEARTBEAT = 4, + PQI_FIRMWARE_KERNEL_NOT_UP = 5, + PQI_OFA_RESPONSE_TIMEOUT = 6, + PQI_INVALID_REQ_ID = 7, + PQI_UNMATCHED_REQ_ID = 8, + PQI_IO_PI_OUT_OF_RANGE = 9, + PQI_EVENT_PI_OUT_OF_RANGE = 10, + PQI_UNEXPECTED_IU_TYPE = 11 +}; + +enum pqi_io_path { + RAID_PATH = 0, + AIO_PATH = 1 +}; + +enum pqi_irq_mode { + IRQ_MODE_NONE, + IRQ_MODE_INTX, + IRQ_MODE_MSIX +}; + +struct pqi_sg_descriptor { + __le64 address; + __le32 length; + __le32 flags; +}; + +/* manifest constants for the flags field of pqi_sg_descriptor */ +#define CISS_SG_LAST 0x40000000 +#define CISS_SG_CHAIN 0x80000000 + +struct pqi_iu_header { + u8 iu_type; + u8 reserved; + __le16 iu_length; /* in bytes - does not include the length */ + /* of this header */ + __le16 response_queue_id; /* specifies the OQ where the */ + /* response IU is to be delivered */ + u16 driver_flags; /* reserved for driver use */ +}; + +/* manifest constants for pqi_iu_header.driver_flags */ +#define PQI_DRIVER_NONBLOCKABLE_REQUEST 0x1 + +/* + * According to the PQI spec, the IU header is only the first 4 bytes of our + * pqi_iu_header structure. 
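+ * PQI_REQUEST_HEADER_LENGTH below covers only those first 4 bytes.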
+ */ +#define PQI_REQUEST_HEADER_LENGTH 4 + +struct pqi_general_admin_request { + struct pqi_iu_header header; + __le16 request_id; + u8 function_code; + union { + struct { + u8 reserved[33]; + __le32 buffer_length; + struct pqi_sg_descriptor sg_descriptor; + } report_device_capability; + + struct { + u8 reserved; + __le16 queue_id; + u8 reserved1[2]; + __le64 element_array_addr; + __le64 ci_addr; + __le16 num_elements; + __le16 element_length; + u8 queue_protocol; + u8 reserved2[23]; + __le32 vendor_specific; + } create_operational_iq; + + struct { + u8 reserved; + __le16 queue_id; + u8 reserved1[2]; + __le64 element_array_addr; + __le64 pi_addr; + __le16 num_elements; + __le16 element_length; + u8 queue_protocol; + u8 reserved2[3]; + __le16 int_msg_num; + __le16 coalescing_count; + __le32 min_coalescing_time; + __le32 max_coalescing_time; + u8 reserved3[8]; + __le32 vendor_specific; + } create_operational_oq; + + struct { + u8 reserved; + __le16 queue_id; + u8 reserved1[50]; + } delete_operational_queue; + + struct { + u8 reserved; + __le16 queue_id; + u8 reserved1[46]; + __le32 vendor_specific; + } change_operational_iq_properties; + + } data; +}; + +struct pqi_general_admin_response { + struct pqi_iu_header header; + __le16 request_id; + u8 function_code; + u8 status; + union { + struct { + u8 status_descriptor[4]; + __le64 iq_pi_offset; + u8 reserved[40]; + } create_operational_iq; + + struct { + u8 status_descriptor[4]; + __le64 oq_ci_offset; + u8 reserved[40]; + } create_operational_oq; + } data; +}; + +struct pqi_iu_layer_descriptor { + u8 inbound_spanning_supported : 1; + u8 reserved : 7; + u8 reserved1[5]; + __le16 max_inbound_iu_length; + u8 outbound_spanning_supported : 1; + u8 reserved2 : 7; + u8 reserved3[5]; + __le16 max_outbound_iu_length; +}; + +struct pqi_device_capability { + __le16 data_length; + u8 reserved[6]; + u8 iq_arbitration_priority_support_bitmask; + u8 maximum_aw_a; + u8 maximum_aw_b; + u8 maximum_aw_c; + u8 max_arbitration_burst : 3; + u8 reserved1 : 4; + u8 iqa : 1; + u8 reserved2[2]; + u8 iq_freeze : 1; + u8 reserved3 : 7; + __le16 max_inbound_queues; + __le16 max_elements_per_iq; + u8 reserved4[4]; + __le16 max_iq_element_length; + __le16 min_iq_element_length; + u8 reserved5[2]; + __le16 max_outbound_queues; + __le16 max_elements_per_oq; + __le16 intr_coalescing_time_granularity; + __le16 max_oq_element_length; + __le16 min_oq_element_length; + u8 reserved6[24]; + struct pqi_iu_layer_descriptor iu_layer_descriptors[32]; +}; + +#define PQI_MAX_EMBEDDED_SG_DESCRIPTORS 4 +#define PQI_MAX_EMBEDDED_R56_SG_DESCRIPTORS 3 + +struct pqi_raid_path_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 nexus_id; + __le32 buffer_length; + u8 lun_number[8]; + __le16 protocol_specific; + u8 data_direction : 2; + u8 partial : 1; + u8 reserved1 : 4; + u8 fence : 1; + __le16 error_index; + u8 reserved2; + u8 task_attribute : 3; + u8 command_priority : 4; + u8 reserved3 : 1; + u8 reserved4 : 2; + u8 additional_cdb_bytes_usage : 3; + u8 reserved5 : 3; + u8 cdb[16]; + u8 reserved6[11]; + u8 ml_device_lun_number; + __le32 timeout; + struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_SG_DESCRIPTORS]; +}; + +struct pqi_aio_path_request { + struct pqi_iu_header header; + __le16 request_id; + u8 reserved1[2]; + __le32 nexus_id; + __le32 buffer_length; + u8 data_direction : 2; + u8 partial : 1; + u8 memory_type : 1; + u8 fence : 1; + u8 encryption_enable : 1; + u8 reserved2 : 2; + u8 task_attribute : 3; + u8 command_priority : 4; + u8 reserved3 : 1; + __le16 
data_encryption_key_index; + __le32 encrypt_tweak_lower; + __le32 encrypt_tweak_upper; + u8 cdb[16]; + __le16 error_index; + u8 num_sg_descriptors; + u8 cdb_length; + u8 lun_number[8]; + u8 reserved4[4]; + struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_SG_DESCRIPTORS]; +}; + +#define PQI_RAID1_NVME_XFER_LIMIT (32 * 1024) /* 32 KiB */ + +struct pqi_aio_r1_path_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 volume_id; /* ID of the RAID volume */ + __le32 it_nexus_1; /* IT nexus of the 1st drive in the RAID volume */ + __le32 it_nexus_2; /* IT nexus of the 2nd drive in the RAID volume */ + __le32 it_nexus_3; /* IT nexus of the 3rd drive in the RAID volume */ + __le32 data_length; /* total bytes to read/write */ + u8 data_direction : 2; + u8 partial : 1; + u8 memory_type : 1; + u8 fence : 1; + u8 encryption_enable : 1; + u8 reserved : 2; + u8 task_attribute : 3; + u8 command_priority : 4; + u8 reserved2 : 1; + __le16 data_encryption_key_index; + u8 cdb[16]; + __le16 error_index; + u8 num_sg_descriptors; + u8 cdb_length; + u8 num_drives; /* number of drives in the RAID volume (2 or 3) */ + u8 reserved3[3]; + __le32 encrypt_tweak_lower; + __le32 encrypt_tweak_upper; + struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_SG_DESCRIPTORS]; +}; + +#define PQI_DEFAULT_MAX_WRITE_RAID_5_6 (8 * 1024U) +#define PQI_DEFAULT_MAX_TRANSFER_ENCRYPTED_SAS_SATA (~0U) +#define PQI_DEFAULT_MAX_TRANSFER_ENCRYPTED_NVME (32 * 1024U) + +struct pqi_aio_r56_path_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 volume_id; /* ID of the RAID volume */ + __le32 data_it_nexus; /* IT nexus for the data drive */ + __le32 p_parity_it_nexus; /* IT nexus for the P parity drive */ + __le32 q_parity_it_nexus; /* IT nexus for the Q parity drive */ + __le32 data_length; /* total bytes to read/write */ + u8 data_direction : 2; + u8 partial : 1; + u8 mem_type : 1; /* 0 = PCIe, 1 = DDR */ + u8 fence : 1; + u8 encryption_enable : 1; + u8 reserved : 2; + u8 task_attribute : 3; + u8 command_priority : 4; + u8 reserved1 : 1; + __le16 data_encryption_key_index; + u8 cdb[16]; + __le16 error_index; + u8 num_sg_descriptors; + u8 cdb_length; + u8 xor_multiplier; + u8 reserved2[3]; + __le32 encrypt_tweak_lower; + __le32 encrypt_tweak_upper; + __le64 row; /* row = logical LBA/blocks per row */ + u8 reserved3[8]; + struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_R56_SG_DESCRIPTORS]; +}; + +struct pqi_io_response { + struct pqi_iu_header header; + __le16 request_id; + __le16 error_index; + u8 reserved2[4]; +}; + +struct pqi_general_management_request { + struct pqi_iu_header header; + __le16 request_id; + union { + struct { + u8 reserved[2]; + __le32 buffer_length; + struct pqi_sg_descriptor sg_descriptors[3]; + } report_event_configuration; + + struct { + __le16 global_event_oq_id; + __le32 buffer_length; + struct pqi_sg_descriptor sg_descriptors[3]; + } set_event_configuration; + } data; +}; + +struct pqi_event_descriptor { + u8 event_type; + u8 reserved; + __le16 oq_id; +}; + +struct pqi_event_config { + u8 reserved[2]; + u8 num_event_descriptors; + u8 reserved1; + struct pqi_event_descriptor descriptors[1]; +}; + +#define PQI_MAX_EVENT_DESCRIPTORS 255 + +#define PQI_EVENT_OFA_MEMORY_ALLOCATION 0x0 +#define PQI_EVENT_OFA_QUIESCE 0x1 +#define PQI_EVENT_OFA_CANCELED 0x2 + +struct pqi_event_response { + struct pqi_iu_header header; + u8 event_type; + u8 reserved2 : 7; + u8 request_acknowledge : 1; + __le16 event_id; + __le32 additional_event_id; + union { + struct { + __le32 
bytes_requested; + u8 reserved[12]; + } ofa_memory_allocation; + + struct { + __le16 reason; /* reason for cancellation */ + u8 reserved[14]; + } ofa_cancelled; + } data; +}; + +struct pqi_event_acknowledge_request { + struct pqi_iu_header header; + u8 event_type; + u8 reserved2; + __le16 event_id; + __le32 additional_event_id; +}; + +struct pqi_task_management_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 nexus_id; + u8 reserved; + u8 ml_device_lun_number; + __le16 timeout; + u8 lun_number[8]; + __le16 protocol_specific; + __le16 outbound_queue_id_to_manage; + __le16 request_id_to_manage; + u8 task_management_function; + u8 reserved2 : 7; + u8 fence : 1; +}; + +#define SOP_TASK_MANAGEMENT_LUN_RESET 0x8 + +struct pqi_task_management_response { + struct pqi_iu_header header; + __le16 request_id; + __le16 nexus_id; + u8 additional_response_info[3]; + u8 response_code; +}; + +struct pqi_vendor_general_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 function_code; + union { + struct { + __le16 first_section; + __le16 last_section; + u8 reserved[48]; + } config_table_update; + + struct { + __le64 buffer_address; + __le32 buffer_length; + u8 reserved[40]; + } ofa_memory_allocation; + } data; +}; + +struct pqi_vendor_general_response { + struct pqi_iu_header header; + __le16 request_id; + __le16 function_code; + __le16 status; + u8 reserved[2]; +}; + +#define PQI_VENDOR_GENERAL_CONFIG_TABLE_UPDATE 0 +#define PQI_VENDOR_GENERAL_HOST_MEMORY_UPDATE 1 + +#define PQI_OFA_VERSION 1 +#define PQI_OFA_SIGNATURE "OFA_QRM" +#define PQI_OFA_MAX_SG_DESCRIPTORS 64 + +struct pqi_ofa_memory { + __le64 signature; /* "OFA_QRM" */ + __le16 version; /* version of this struct (1 = 1st version) */ + u8 reserved[62]; + __le32 bytes_allocated; /* total allocated memory in bytes */ + __le16 num_memory_descriptors; + u8 reserved1[2]; + struct pqi_sg_descriptor sg_descriptor[PQI_OFA_MAX_SG_DESCRIPTORS]; +}; + +struct pqi_aio_error_info { + u8 status; + u8 service_response; + u8 data_present; + u8 reserved; + __le32 residual_count; + __le16 data_length; + __le16 reserved1; + u8 data[256]; +}; + +struct pqi_raid_error_info { + u8 data_in_result; + u8 data_out_result; + u8 reserved[3]; + u8 status; + __le16 status_qualifier; + __le16 sense_data_length; + __le16 response_data_length; + __le32 data_in_transferred; + __le32 data_out_transferred; + u8 data[256]; +}; + +#define PQI_REQUEST_IU_TASK_MANAGEMENT 0x13 +#define PQI_REQUEST_IU_RAID_PATH_IO 0x14 +#define PQI_REQUEST_IU_AIO_PATH_IO 0x15 +#define PQI_REQUEST_IU_AIO_PATH_RAID5_IO 0x18 +#define PQI_REQUEST_IU_AIO_PATH_RAID6_IO 0x19 +#define PQI_REQUEST_IU_AIO_PATH_RAID1_IO 0x1A +#define PQI_REQUEST_IU_GENERAL_ADMIN 0x60 +#define PQI_REQUEST_IU_REPORT_VENDOR_EVENT_CONFIG 0x72 +#define PQI_REQUEST_IU_SET_VENDOR_EVENT_CONFIG 0x73 +#define PQI_REQUEST_IU_VENDOR_GENERAL 0x75 +#define PQI_REQUEST_IU_ACKNOWLEDGE_VENDOR_EVENT 0xf6 + +#define PQI_RESPONSE_IU_GENERAL_MANAGEMENT 0x81 +#define PQI_RESPONSE_IU_TASK_MANAGEMENT 0x93 +#define PQI_RESPONSE_IU_GENERAL_ADMIN 0xe0 +#define PQI_RESPONSE_IU_RAID_PATH_IO_SUCCESS 0xf0 +#define PQI_RESPONSE_IU_AIO_PATH_IO_SUCCESS 0xf1 +#define PQI_RESPONSE_IU_RAID_PATH_IO_ERROR 0xf2 +#define PQI_RESPONSE_IU_AIO_PATH_IO_ERROR 0xf3 +#define PQI_RESPONSE_IU_AIO_PATH_DISABLED 0xf4 +#define PQI_RESPONSE_IU_VENDOR_EVENT 0xf5 +#define PQI_RESPONSE_IU_VENDOR_GENERAL 0xf7 + +#define PQI_GENERAL_ADMIN_FUNCTION_REPORT_DEVICE_CAPABILITY 0x0 +#define PQI_GENERAL_ADMIN_FUNCTION_CREATE_IQ 0x10 +#define 
PQI_GENERAL_ADMIN_FUNCTION_CREATE_OQ 0x11 +#define PQI_GENERAL_ADMIN_FUNCTION_DELETE_IQ 0x12 +#define PQI_GENERAL_ADMIN_FUNCTION_DELETE_OQ 0x13 +#define PQI_GENERAL_ADMIN_FUNCTION_CHANGE_IQ_PROPERTY 0x14 + +#define PQI_GENERAL_ADMIN_STATUS_SUCCESS 0x0 + +#define PQI_IQ_PROPERTY_IS_AIO_QUEUE 0x1 + +#define PQI_GENERAL_ADMIN_IU_LENGTH 0x3c +#define PQI_PROTOCOL_SOP 0x0 + +#define PQI_DATA_IN_OUT_GOOD 0x0 +#define PQI_DATA_IN_OUT_UNDERFLOW 0x1 +#define PQI_DATA_IN_OUT_BUFFER_ERROR 0x40 +#define PQI_DATA_IN_OUT_BUFFER_OVERFLOW 0x41 +#define PQI_DATA_IN_OUT_BUFFER_OVERFLOW_DESCRIPTOR_AREA 0x42 +#define PQI_DATA_IN_OUT_BUFFER_OVERFLOW_BRIDGE 0x43 +#define PQI_DATA_IN_OUT_PCIE_FABRIC_ERROR 0x60 +#define PQI_DATA_IN_OUT_PCIE_COMPLETION_TIMEOUT 0x61 +#define PQI_DATA_IN_OUT_PCIE_COMPLETER_ABORT_RECEIVED 0x62 +#define PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST_RECEIVED 0x63 +#define PQI_DATA_IN_OUT_PCIE_ECRC_CHECK_FAILED 0x64 +#define PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST 0x65 +#define PQI_DATA_IN_OUT_PCIE_ACS_VIOLATION 0x66 +#define PQI_DATA_IN_OUT_PCIE_TLP_PREFIX_BLOCKED 0x67 +#define PQI_DATA_IN_OUT_PCIE_POISONED_MEMORY_READ 0x6F +#define PQI_DATA_IN_OUT_ERROR 0xf0 +#define PQI_DATA_IN_OUT_PROTOCOL_ERROR 0xf1 +#define PQI_DATA_IN_OUT_HARDWARE_ERROR 0xf2 +#define PQI_DATA_IN_OUT_UNSOLICITED_ABORT 0xf3 +#define PQI_DATA_IN_OUT_ABORTED 0xf4 +#define PQI_DATA_IN_OUT_TIMEOUT 0xf5 + +#define CISS_CMD_STATUS_SUCCESS 0x0 +#define CISS_CMD_STATUS_TARGET_STATUS 0x1 +#define CISS_CMD_STATUS_DATA_UNDERRUN 0x2 +#define CISS_CMD_STATUS_DATA_OVERRUN 0x3 +#define CISS_CMD_STATUS_INVALID 0x4 +#define CISS_CMD_STATUS_PROTOCOL_ERROR 0x5 +#define CISS_CMD_STATUS_HARDWARE_ERROR 0x6 +#define CISS_CMD_STATUS_CONNECTION_LOST 0x7 +#define CISS_CMD_STATUS_ABORTED 0x8 +#define CISS_CMD_STATUS_ABORT_FAILED 0x9 +#define CISS_CMD_STATUS_UNSOLICITED_ABORT 0xa +#define CISS_CMD_STATUS_TIMEOUT 0xb +#define CISS_CMD_STATUS_UNABORTABLE 0xc +#define CISS_CMD_STATUS_TMF 0xd +#define CISS_CMD_STATUS_AIO_DISABLED 0xe + +#define PQI_CMD_STATUS_ABORTED CISS_CMD_STATUS_ABORTED + +#define PQI_NUM_EVENT_QUEUE_ELEMENTS 32 +#define PQI_EVENT_OQ_ELEMENT_LENGTH sizeof(struct pqi_event_response) + +#define PQI_EVENT_TYPE_HOTPLUG 0x1 +#define PQI_EVENT_TYPE_HARDWARE 0x2 +#define PQI_EVENT_TYPE_PHYSICAL_DEVICE 0x4 +#define PQI_EVENT_TYPE_LOGICAL_DEVICE 0x5 +#define PQI_EVENT_TYPE_OFA 0xfb +#define PQI_EVENT_TYPE_AIO_STATE_CHANGE 0xfd +#define PQI_EVENT_TYPE_AIO_CONFIG_CHANGE 0xfe + +#pragma pack() + +#define PQI_ERROR_BUFFER_ELEMENT_LENGTH \ + sizeof(struct pqi_raid_error_info) + +/* these values are based on our implementation */ +#define PQI_ADMIN_IQ_NUM_ELEMENTS 8 +#define PQI_ADMIN_OQ_NUM_ELEMENTS 20 +#define PQI_ADMIN_IQ_ELEMENT_LENGTH 64 +#define PQI_ADMIN_OQ_ELEMENT_LENGTH 64 + +#define PQI_OPERATIONAL_IQ_ELEMENT_LENGTH 128 +#define PQI_OPERATIONAL_OQ_ELEMENT_LENGTH 16 + +#define PQI_MIN_MSIX_VECTORS 1 +#define PQI_MAX_MSIX_VECTORS 64 + +/* these values are defined by the PQI spec */ +#define PQI_MAX_NUM_ELEMENTS_ADMIN_QUEUE 255 +#define PQI_MAX_NUM_ELEMENTS_OPERATIONAL_QUEUE 65535 + +#define PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT 64 +#define PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT 16 +#define PQI_ADMIN_INDEX_ALIGNMENT 64 +#define PQI_OPERATIONAL_INDEX_ALIGNMENT 4 + +#define PQI_MIN_OPERATIONAL_QUEUE_ID 1 +#define PQI_MAX_OPERATIONAL_QUEUE_ID 65535 + +#define PQI_AIO_SERV_RESPONSE_COMPLETE 0 +#define PQI_AIO_SERV_RESPONSE_FAILURE 1 +#define PQI_AIO_SERV_RESPONSE_TMF_COMPLETE 2 +#define PQI_AIO_SERV_RESPONSE_TMF_SUCCEEDED 3 +#define 
PQI_AIO_SERV_RESPONSE_TMF_REJECTED 4 +#define PQI_AIO_SERV_RESPONSE_TMF_INCORRECT_LUN 5 + +#define PQI_AIO_STATUS_IO_ERROR 0x1 +#define PQI_AIO_STATUS_IO_ABORTED 0x2 +#define PQI_AIO_STATUS_NO_PATH_TO_DEVICE 0x3 +#define PQI_AIO_STATUS_INVALID_DEVICE 0x4 +#define PQI_AIO_STATUS_AIO_PATH_DISABLED 0xe +#define PQI_AIO_STATUS_UNDERRUN 0x51 +#define PQI_AIO_STATUS_OVERRUN 0x75 + +typedef u32 pqi_index_t; + +/* SOP data direction flags */ +#define SOP_NO_DIRECTION_FLAG 0 +#define SOP_WRITE_FLAG 1 /* host writes data to Data-Out */ + /* buffer */ +#define SOP_READ_FLAG 2 /* host receives data from Data-In */ + /* buffer */ +#define SOP_BIDIRECTIONAL 3 /* data is transferred from the */ + /* Data-Out buffer and data is */ + /* transferred to the Data-In buffer */ + +#define SOP_TASK_ATTRIBUTE_SIMPLE 0 +#define SOP_TASK_ATTRIBUTE_HEAD_OF_QUEUE 1 +#define SOP_TASK_ATTRIBUTE_ORDERED 2 +#define SOP_TASK_ATTRIBUTE_ACA 4 + +#define SOP_TMF_COMPLETE 0x0 +#define SOP_TMF_REJECTED 0x4 +#define SOP_TMF_FUNCTION_SUCCEEDED 0x8 +#define SOP_RC_INCORRECT_LOGICAL_UNIT 0x9 + +/* additional CDB bytes usage field codes */ +#define SOP_ADDITIONAL_CDB_BYTES_0 0 /* 16-byte CDB */ +#define SOP_ADDITIONAL_CDB_BYTES_4 1 /* 20-byte CDB */ +#define SOP_ADDITIONAL_CDB_BYTES_8 2 /* 24-byte CDB */ +#define SOP_ADDITIONAL_CDB_BYTES_12 3 /* 28-byte CDB */ +#define SOP_ADDITIONAL_CDB_BYTES_16 4 /* 32-byte CDB */ + +/* + * The purpose of this structure is to obtain proper alignment of objects in + * an admin queue pair. + */ +struct pqi_admin_queues_aligned { + __aligned(PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT) + u8 iq_element_array[PQI_ADMIN_IQ_ELEMENT_LENGTH] + [PQI_ADMIN_IQ_NUM_ELEMENTS]; + __aligned(PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT) + u8 oq_element_array[PQI_ADMIN_OQ_ELEMENT_LENGTH] + [PQI_ADMIN_OQ_NUM_ELEMENTS]; + __aligned(PQI_ADMIN_INDEX_ALIGNMENT) pqi_index_t iq_ci; + __aligned(PQI_ADMIN_INDEX_ALIGNMENT) pqi_index_t oq_pi; +}; + +struct pqi_admin_queues { + void *iq_element_array; + void *oq_element_array; + pqi_index_t __iomem *iq_ci; + pqi_index_t __iomem *oq_pi; + dma_addr_t iq_element_array_bus_addr; + dma_addr_t oq_element_array_bus_addr; + dma_addr_t iq_ci_bus_addr; + dma_addr_t oq_pi_bus_addr; + __le32 __iomem *iq_pi; + pqi_index_t iq_pi_copy; + __le32 __iomem *oq_ci; + pqi_index_t oq_ci_copy; + struct task_struct *task; + u16 int_msg_num; +}; + +struct pqi_queue_group { + struct pqi_ctrl_info *ctrl_info; /* backpointer */ + u16 iq_id[2]; + u16 oq_id; + u16 int_msg_num; + void *iq_element_array[2]; + void *oq_element_array; + dma_addr_t iq_element_array_bus_addr[2]; + dma_addr_t oq_element_array_bus_addr; + __le32 __iomem *iq_pi[2]; + pqi_index_t iq_pi_copy[2]; + pqi_index_t __iomem *iq_ci[2]; + pqi_index_t __iomem *oq_pi; + dma_addr_t iq_ci_bus_addr[2]; + dma_addr_t oq_pi_bus_addr; + __le32 __iomem *oq_ci; + pqi_index_t oq_ci_copy; + spinlock_t submit_lock[2]; /* protect submission queue */ + struct list_head request_list[2]; +}; + +struct pqi_event_queue { + u16 oq_id; + u16 int_msg_num; + void *oq_element_array; + pqi_index_t __iomem *oq_pi; + dma_addr_t oq_element_array_bus_addr; + dma_addr_t oq_pi_bus_addr; + __le32 __iomem *oq_ci; + pqi_index_t oq_ci_copy; +}; + +#define PQI_DEFAULT_QUEUE_GROUP 0 +#if TORTUGA +#define PQI_MAX_QUEUE_GROUPS 1 +#else +#define PQI_MAX_QUEUE_GROUPS PQI_MAX_MSIX_VECTORS +#endif + +struct pqi_encryption_info { + u16 data_encryption_key_index; + u32 encrypt_tweak_lower; + u32 encrypt_tweak_upper; +}; + +#pragma pack(1) + +#define PQI_CONFIG_TABLE_SIGNATURE "CFGTABLE" +#define 
PQI_CONFIG_TABLE_MAX_LENGTH ((u16)~0) + +/* configuration table section IDs */ +#define PQI_CONFIG_TABLE_ALL_SECTIONS (-1) +#define PQI_CONFIG_TABLE_SECTION_GENERAL_INFO 0 +#define PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES 1 +#define PQI_CONFIG_TABLE_SECTION_FIRMWARE_ERRATA 2 +#define PQI_CONFIG_TABLE_SECTION_DEBUG 3 +#define PQI_CONFIG_TABLE_SECTION_HEARTBEAT 4 +#define PQI_CONFIG_TABLE_SECTION_SOFT_RESET 5 + +struct pqi_config_table { + u8 signature[8]; /* "CFGTABLE" */ + __le32 first_section_offset; /* offset in bytes from the base */ + /* address of this table to the */ + /* first section */ +}; + +struct pqi_config_table_section_header { + __le16 section_id; /* as defined by the */ + /* PQI_CONFIG_TABLE_SECTION_* */ + /* manifest constants above */ + __le16 next_section_offset; /* offset in bytes from base */ + /* address of the table of the */ + /* next section or 0 if last entry */ +}; + +struct pqi_config_table_general_info { + struct pqi_config_table_section_header header; + __le32 section_length; /* size of this section in bytes */ + /* including the section header */ + __le32 max_outstanding_requests; /* max. outstanding */ + /* commands supported by */ + /* the controller */ + __le32 max_sg_size; /* max. transfer size of a single */ + /* command */ + __le32 max_sg_per_request; /* max. number of scatter-gather */ + /* entries supported in a single */ + /* command */ +}; + +struct pqi_config_table_firmware_features { + struct pqi_config_table_section_header header; + __le16 num_elements; + u8 features_supported[]; +/* u8 features_requested_by_host[]; */ +/* u8 features_enabled[]; */ +/* The 2 fields below are only valid if the MAX_KNOWN_FEATURE bit is set. */ +/* __le16 firmware_max_known_feature; */ +/* __le16 host_max_known_feature; */ +}; + +#define PQI_FIRMWARE_FEATURE_OFA 0 +#define PQI_FIRMWARE_FEATURE_SMP 1 +#define PQI_FIRMWARE_FEATURE_MAX_KNOWN_FEATURE 2 +#define PQI_FIRMWARE_FEATURE_RAID_0_READ_BYPASS 3 +#define PQI_FIRMWARE_FEATURE_RAID_1_READ_BYPASS 4 +#define PQI_FIRMWARE_FEATURE_RAID_5_READ_BYPASS 5 +#define PQI_FIRMWARE_FEATURE_RAID_6_READ_BYPASS 6 +#define PQI_FIRMWARE_FEATURE_RAID_0_WRITE_BYPASS 7 +#define PQI_FIRMWARE_FEATURE_RAID_1_WRITE_BYPASS 8 +#define PQI_FIRMWARE_FEATURE_RAID_5_WRITE_BYPASS 9 +#define PQI_FIRMWARE_FEATURE_RAID_6_WRITE_BYPASS 10 +#define PQI_FIRMWARE_FEATURE_SOFT_RESET_HANDSHAKE 11 +#define PQI_FIRMWARE_FEATURE_UNIQUE_SATA_WWN 12 +#define PQI_FIRMWARE_FEATURE_RAID_IU_TIMEOUT 13 +#define PQI_FIRMWARE_FEATURE_TMF_IU_TIMEOUT 14 +#define PQI_FIRMWARE_FEATURE_RAID_BYPASS_ON_ENCRYPTED_NVME 15 +#define PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN 16 +#define PQI_FIRMWARE_FEATURE_FW_TRIAGE 17 +#define PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5 18 +#define PQI_FIRMWARE_FEATURE_MULTI_LUN_DEVICE_SUPPORT 21 +#define PQI_FIRMWARE_FEATURE_MAXIMUM 21 + +struct pqi_config_table_debug { + struct pqi_config_table_section_header header; + __le32 scratchpad; +}; + +struct pqi_config_table_heartbeat { + struct pqi_config_table_section_header header; + __le32 heartbeat_counter; +}; + +struct pqi_config_table_soft_reset { + struct pqi_config_table_section_header header; + u8 soft_reset_status; +}; + +#define PQI_SOFT_RESET_INITIATE 0x1 +#define PQI_SOFT_RESET_ABORT 0x2 + +enum pqi_soft_reset_status { + RESET_INITIATE_FIRMWARE, + RESET_INITIATE_DRIVER, + RESET_ABORT, + RESET_NORESPONSE, + RESET_TIMEDOUT +}; + +union pqi_reset_register { + struct { + u32 reset_type : 3; + u32 reserved : 2; + u32 reset_action : 3; + u32 hold_in_pd1 : 1; + u32 reserved2 : 
23; + } bits; + u32 all_bits; +}; + +#define PQI_RESET_ACTION_RESET 0x1 + +#define PQI_RESET_TYPE_NO_RESET 0x0 +#define PQI_RESET_TYPE_SOFT_RESET 0x1 +#define PQI_RESET_TYPE_FIRM_RESET 0x2 +#define PQI_RESET_TYPE_HARD_RESET 0x3 + +#define PQI_RESET_ACTION_COMPLETED 0x2 + +#define PQI_RESET_POLL_INTERVAL_MSECS 100 + +#if TORTUGA +#define PQI_MAX_OUTSTANDING_REQUESTS 32 +#define PQI_MAX_OUTSTANDING_REQUESTS_KDUMP PQI_MAX_OUTSTANDING_REQUESTS +#define PQI_MAX_TRANSFER_SIZE (512 * 1024U) +#define PQI_MAX_TRANSFER_SIZE_KDUMP PQI_MAX_TRANSFER_SIZE +#else +#define PQI_MAX_OUTSTANDING_REQUESTS ((u32)~0) +#define PQI_MAX_OUTSTANDING_REQUESTS_KDUMP 32 +#define PQI_MAX_TRANSFER_SIZE (4 * 1024U * 1024U) +#define PQI_MAX_TRANSFER_SIZE_KDUMP (512 * 1024U) +#endif + +#define RAID_MAP_MAX_ENTRIES 1024 +#define RAID_MAP_MAX_DATA_DISKS_PER_ROW 128 + +#define PQI_PHYSICAL_DEVICE_BUS 0 +#define PQI_RAID_VOLUME_BUS 1 +#define PQI_HBA_BUS 2 +#define PQI_EXTERNAL_RAID_VOLUME_BUS 3 +#define PQI_MAX_BUS PQI_EXTERNAL_RAID_VOLUME_BUS +#define PQI_VSEP_CISS_BTL 379 + +struct report_lun_header { + __be32 list_length; + u8 flags; + u8 reserved[3]; +}; + +/* for flags field of struct report_lun_header */ +#define CISS_REPORT_LOG_FLAG_UNIQUE_LUN_ID (1 << 0) +#define CISS_REPORT_LOG_FLAG_QUEUE_DEPTH (1 << 5) +#define CISS_REPORT_LOG_FLAG_DRIVE_TYPE_MIX (1 << 6) + +#define CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_2 0x2 +#define CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_4 0x4 +#define CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_MASK 0xf + +struct report_log_lun { + u8 lunid[8]; + u8 volume_id[16]; +}; + +struct report_log_lun_list { + struct report_lun_header header; + struct report_log_lun lun_entries[1]; +}; + +struct report_phys_lun_8byte_wwid { + u8 lunid[8]; + __be64 wwid; + u8 device_type; + u8 device_flags; + u8 lun_count; /* number of LUNs in a multi-LUN device */ + u8 redundant_paths; + u32 aio_handle; +}; + +struct report_phys_lun_16byte_wwid { + u8 lunid[8]; + u8 wwid[16]; + u8 device_type; + u8 device_flags; + u8 lun_count; /* number of LUNs in a multi-LUN device */ + u8 redundant_paths; + u32 aio_handle; +}; + +/* for device_flags field of struct report_phys_lun_extended_entry */ +#define CISS_REPORT_PHYS_DEV_FLAG_AIO_ENABLED 0x8 + +struct report_phys_lun_8byte_wwid_list { + struct report_lun_header header; + struct report_phys_lun_8byte_wwid lun_entries[1]; +}; + +struct report_phys_lun_16byte_wwid_list { + struct report_lun_header header; + struct report_phys_lun_16byte_wwid lun_entries[1]; +}; + +struct raid_map_disk_data { + u32 aio_handle; + u8 xor_mult[2]; + u8 reserved[2]; +}; + +/* for flags field of RAID map */ +#define RAID_MAP_ENCRYPTION_ENABLED 0x1 + +struct raid_map { + __le32 structure_size; /* size of entire structure in bytes */ + __le32 volume_blk_size; /* bytes / block in the volume */ + __le64 volume_blk_cnt; /* logical blocks on the volume */ + u8 phys_blk_shift; /* shift factor to convert between */ + /* units of logical blocks and */ + /* physical disk blocks */ + u8 parity_rotation_shift; /* shift factor to convert between */ + /* units of logical stripes and */ + /* physical stripes */ + __le16 strip_size; /* blocks used on each disk / stripe */ + __le64 disk_starting_blk; /* first disk block used in volume */ + __le64 disk_blk_cnt; /* disk blocks used by volume / disk */ + __le16 data_disks_per_row; /* data disk entries / row in the map */ + __le16 metadata_disks_per_row; /* mirror/parity disk entries / row */ + /* in the map */ + __le16 row_cnt; /* rows in each layout map */ + __le16 
layout_map_count; /* layout maps (1 map per */ + /* mirror parity group) */ + __le16 flags; + __le16 data_encryption_key_index; + u8 reserved[16]; + struct raid_map_disk_data disk_data[RAID_MAP_MAX_ENTRIES]; +}; + +#pragma pack() + +struct pqi_scsi_dev_raid_map_data { + bool is_write; + u8 raid_level; + u32 map_index; + u64 first_block; + u64 last_block; + u32 data_length; + u32 block_cnt; + u32 blocks_per_row; + u64 first_row; + u64 last_row; + u32 first_row_offset; + u32 last_row_offset; + u32 first_column; + u32 last_column; + u64 r5or6_first_row; + u64 r5or6_last_row; + u32 r5or6_first_row_offset; + u32 r5or6_last_row_offset; + u32 r5or6_first_column; + u32 r5or6_last_column; + u16 data_disks_per_row; + u32 total_disks_per_row; + u16 layout_map_count; + u32 stripesize; + u16 strip_size; + u32 first_group; + u32 last_group; + u32 map_row; + u32 aio_handle; + u64 disk_block; + u32 disk_block_cnt; + u8 cdb[16]; + u8 cdb_length; + + /* RAID 1 specific */ +#define NUM_RAID1_MAP_ENTRIES 3 + u32 num_it_nexus_entries; + u32 it_nexus[NUM_RAID1_MAP_ENTRIES]; + + /* RAID 5 / RAID 6 specific */ + u32 p_parity_it_nexus; /* aio_handle */ + u32 q_parity_it_nexus; /* aio_handle */ + u8 xor_mult; + u64 row; + u64 stripe_lba; + u32 p_index; + u32 q_index; +}; + +#define RAID_CTLR_LUNID "\0\0\0\0\0\0\0\0" + +#define NUM_STREAMS_PER_LUN 8 + +struct pqi_stream_data { + u64 next_lba; + u32 last_accessed; +}; + +#define PQI_MAX_LUNS_PER_DEVICE 256 + +struct pqi_scsi_dev { + int devtype; /* as reported by INQUIRY commmand */ + u8 device_type; /* as reported by */ + /* BMIC_IDENTIFY_PHYSICAL_DEVICE */ + /* only valid for devtype = TYPE_DISK */ + int bus; + int target; + int lun; + u8 scsi3addr[8]; + u8 wwid[16]; + u8 volume_id[16]; + u8 is_physical_device : 1; + u8 is_external_raid_device : 1; + u8 is_expander_smp_device : 1; + u8 target_lun_valid : 1; + u8 device_gone : 1; + u8 new_device : 1; + u8 keep_device : 1; + u8 volume_offline : 1; + u8 rescan : 1; + u8 ignore_device : 1; + u8 erase_in_progress : 1; + bool aio_enabled; /* only valid for physical disks */ + bool in_remove; + bool device_offline; + u8 vendor[8]; /* bytes 8-15 of inquiry data */ + u8 model[16]; /* bytes 16-31 of inquiry data */ + u64 sas_address; + u8 raid_level; + u16 queue_depth; /* max. 
queue_depth for this device */ + u16 advertised_queue_depth; + u32 aio_handle; + u8 volume_status; + u8 active_path_index; + u8 path_map; + u8 bay; + u8 box_index; + u8 phys_box_on_bus; + u8 phy_connected_dev_type; + u8 box[8]; + u16 phys_connector[8]; + u8 phy_id; + u8 ncq_prio_enable; + u8 ncq_prio_support; + u8 lun_count; + bool raid_bypass_configured; /* RAID bypass configured */ + bool raid_bypass_enabled; /* RAID bypass enabled */ + u32 next_bypass_group[RAID_MAP_MAX_DATA_DISKS_PER_ROW]; + struct raid_map *raid_map; /* RAID bypass map */ + u32 max_transfer_encrypted; + + struct pqi_sas_port *sas_port; + struct scsi_device *sdev; + + struct list_head scsi_device_list_entry; + struct list_head new_device_list_entry; + struct list_head add_list_entry; + struct list_head delete_list_entry; + + struct pqi_stream_data stream_data[NUM_STREAMS_PER_LUN]; + atomic_t scsi_cmds_outstanding[PQI_MAX_LUNS_PER_DEVICE]; + unsigned int raid_bypass_cnt; +}; + +/* VPD inquiry pages */ +#define CISS_VPD_LV_DEVICE_GEOMETRY 0xc1 /* vendor-specific page */ +#define CISS_VPD_LV_BYPASS_STATUS 0xc2 /* vendor-specific page */ +#define CISS_VPD_LV_STATUS 0xc3 /* vendor-specific page */ + +#define VPD_PAGE (1 << 8) + +#pragma pack(1) + +/* structure for CISS_VPD_LV_STATUS */ +struct ciss_vpd_logical_volume_status { + u8 peripheral_info; + u8 page_code; + u8 reserved; + u8 page_length; + u8 volume_status; + u8 reserved2[3]; + __be32 flags; +}; + +#pragma pack() + +/* constants for volume_status field of ciss_vpd_logical_volume_status */ +#define CISS_LV_OK 0 +#define CISS_LV_FAILED 1 +#define CISS_LV_NOT_CONFIGURED 2 +#define CISS_LV_DEGRADED 3 +#define CISS_LV_READY_FOR_RECOVERY 4 +#define CISS_LV_UNDERGOING_RECOVERY 5 +#define CISS_LV_WRONG_PHYSICAL_DRIVE_REPLACED 6 +#define CISS_LV_PHYSICAL_DRIVE_CONNECTION_PROBLEM 7 +#define CISS_LV_HARDWARE_OVERHEATING 8 +#define CISS_LV_HARDWARE_HAS_OVERHEATED 9 +#define CISS_LV_UNDERGOING_EXPANSION 10 +#define CISS_LV_NOT_AVAILABLE 11 +#define CISS_LV_QUEUED_FOR_EXPANSION 12 +#define CISS_LV_DISABLED_SCSI_ID_CONFLICT 13 +#define CISS_LV_EJECTED 14 +#define CISS_LV_UNDERGOING_ERASE 15 +/* state 16 not used */ +#define CISS_LV_READY_FOR_PREDICTIVE_SPARE_REBUILD 17 +#define CISS_LV_UNDERGOING_RPI 18 +#define CISS_LV_PENDING_RPI 19 +#define CISS_LV_ENCRYPTED_NO_KEY 20 +/* state 21 not used */ +#define CISS_LV_UNDERGOING_ENCRYPTION 22 +#define CISS_LV_UNDERGOING_ENCRYPTION_REKEYING 23 +#define CISS_LV_ENCRYPTED_IN_NON_ENCRYPTED_CONTROLLER 24 +#define CISS_LV_PENDING_ENCRYPTION 25 +#define CISS_LV_PENDING_ENCRYPTION_REKEYING 26 +#define CISS_LV_NOT_SUPPORTED 27 +#define CISS_LV_STATUS_UNAVAILABLE 255 + +/* constants for flags field of ciss_vpd_logical_volume_status */ +#define CISS_LV_FLAGS_NO_HOST_IO 0x1 /* volume not available for */ + /* host I/O */ + +/* for SAS hosts and SAS expanders */ +struct pqi_sas_node { + struct device *parent_dev; + struct list_head port_list_head; +}; + +struct pqi_sas_port { + struct list_head port_list_entry; + u64 sas_address; + struct pqi_scsi_dev *device; + struct sas_port *port; + int next_phy_index; + struct list_head phy_list_head; + struct pqi_sas_node *parent_node; + struct sas_rphy *rphy; +}; + +struct pqi_sas_phy { + struct list_head phy_list_entry; + struct sas_phy *phy; + struct pqi_sas_port *parent_port; + bool added_to_port; +}; + +struct pqi_io_request { + atomic_t refcount; + u16 index; + void (*io_complete_callback)(struct pqi_io_request *io_request, + void *context); + void *context; + u8 raid_bypass : 1; + int status; + struct 
pqi_queue_group *queue_group; + struct scsi_cmnd *scmd; + void *error_info; + struct pqi_sg_descriptor *sg_chain_buffer; + dma_addr_t sg_chain_buffer_dma_handle; + void *iu; + struct list_head request_list_entry; +}; + +#define PQI_NUM_SUPPORTED_EVENTS 7 + +struct pqi_event { + bool pending; + u8 event_type; + u16 event_id; + u32 additional_event_id; +}; + +#define PQI_RESERVED_IO_SLOTS_LUN_RESET 1 +#define PQI_RESERVED_IO_SLOTS_EVENT_ACK PQI_NUM_SUPPORTED_EVENTS +#define PQI_RESERVED_IO_SLOTS_SYNCHRONOUS_REQUESTS 3 +#define PQI_RESERVED_IO_SLOTS \ + (PQI_RESERVED_IO_SLOTS_LUN_RESET + PQI_RESERVED_IO_SLOTS_EVENT_ACK + \ + PQI_RESERVED_IO_SLOTS_SYNCHRONOUS_REQUESTS) + +#define PQI_CTRL_PRODUCT_ID_GEN1 0 +#define PQI_CTRL_PRODUCT_ID_GEN2 7 +#define PQI_CTRL_PRODUCT_REVISION_A 0 +#define PQI_CTRL_PRODUCT_REVISION_B 1 + +enum pqi_ctrl_removal_state { + PQI_CTRL_PRESENT = 0, + PQI_CTRL_GRACEFUL_REMOVAL, + PQI_CTRL_SURPRISE_REMOVAL +}; + +struct pqi_ctrl_info { + unsigned int ctrl_id; + struct pci_dev *pci_dev; + char firmware_version[32]; + char serial_number[17]; + char model[17]; + char vendor[9]; + u8 product_id; + u8 product_revision; + void __iomem *iomem_base; + struct pqi_ctrl_registers __iomem *registers; + struct pqi_device_registers __iomem *pqi_registers; + u32 max_sg_entries; + u32 config_table_offset; + u32 config_table_length; + u16 max_inbound_queues; + u16 max_elements_per_iq; + u16 max_iq_element_length; + u16 max_outbound_queues; + u16 max_elements_per_oq; + u16 max_oq_element_length; + u32 max_transfer_size; + u32 max_outstanding_requests; + u32 max_io_slots; + unsigned int scsi_ml_can_queue; + unsigned short sg_tablesize; + unsigned int max_sectors; + u32 error_buffer_length; + void *error_buffer; + dma_addr_t error_buffer_dma_handle; + size_t sg_chain_buffer_length; + unsigned int num_queue_groups; + u16 num_elements_per_iq; + u16 num_elements_per_oq; + u16 max_inbound_iu_length_per_firmware; + u16 max_inbound_iu_length; + unsigned int max_sg_per_iu; + unsigned int max_sg_per_r56_iu; + void *admin_queue_memory_base; + u32 admin_queue_memory_length; + dma_addr_t admin_queue_memory_base_dma_handle; + void *queue_memory_base; + u32 queue_memory_length; + dma_addr_t queue_memory_base_dma_handle; + struct pqi_admin_queues admin_queues; + struct pqi_queue_group queue_groups[PQI_MAX_QUEUE_GROUPS]; + struct pqi_event_queue event_queue; + enum pqi_irq_mode irq_mode; + int max_msix_vectors; + int num_msix_vectors_enabled; + int num_msix_vectors_initialized; + u32 msix_vectors[PQI_MAX_MSIX_VECTORS]; + void *intr_data[PQI_MAX_MSIX_VECTORS]; + int event_irq; + struct Scsi_Host *scsi_host; + + struct mutex scan_mutex; + struct mutex lun_reset_mutex; + bool controller_online; + bool block_requests; + bool scan_blocked; + u8 logical_volume_rescan_needed : 1; + u8 inbound_spanning_supported : 1; + u8 outbound_spanning_supported : 1; + u8 pqi_mode_enabled : 1; + u8 pqi_reset_quiesce_supported : 1; + u8 soft_reset_handshake_supported : 1; + u8 raid_iu_timeout_supported : 1; + u8 tmf_iu_timeout_supported : 1; + u8 firmware_triage_supported : 1; + u8 rpl_extended_format_4_5_supported : 1; + u8 multi_lun_device_supported : 1; + u8 enable_r1_writes : 1; + u8 enable_r5_writes : 1; + u8 enable_r6_writes : 1; + u8 lv_drive_type_mix_valid : 1; + u8 enable_stream_detection : 1; + u8 disable_managed_interrupts : 1; + u8 ciss_report_log_flags; + u32 max_transfer_encrypted_sas_sata; + u32 max_transfer_encrypted_nvme; + u32 max_write_raid_5_6; + u32 max_write_raid_1_10_2drive; + u32 
max_write_raid_1_10_3drive; + + struct list_head scsi_device_list; + spinlock_t scsi_device_list_lock; + + struct delayed_work rescan_work; + struct delayed_work update_time_work; + + struct pqi_sas_node *sas_host; + u64 sas_address; + + struct pqi_io_request *io_request_pool; +#if !defined(KFEATURE_HAS_HOST_TAGSET_SUPPORT) + u16 per_cpu_factor; +#endif + struct pqi_event events[PQI_NUM_SUPPORTED_EVENTS]; + struct work_struct event_work; + + atomic_t num_interrupts; + int previous_num_interrupts; + u32 previous_heartbeat_count; + __le32 __iomem *heartbeat_counter; + u8 __iomem *soft_reset_status; + struct timer_list heartbeat_timer; + struct work_struct ctrl_offline_work; + + struct semaphore sync_request_sem; + atomic_t num_busy_threads; + atomic_t num_blocked_threads; + wait_queue_head_t block_requests_wait; + + struct mutex ofa_mutex; + struct pqi_ofa_memory *pqi_ofa_mem_virt_addr; + dma_addr_t pqi_ofa_mem_dma_handle; + void **pqi_ofa_chunk_virt_addr; + struct work_struct ofa_memory_alloc_work; + struct work_struct ofa_quiesce_work; + u32 ofa_bytes_requested; + u16 ofa_cancel_reason; + enum pqi_ctrl_removal_state ctrl_removal_state; + +#if !defined(KFEATURE_HAS_HOST_TAGSET_SUPPORT) + atomic_t total_scmds_outstanding; +#endif +}; + +enum pqi_ctrl_mode { + SIS_MODE = 0, + PQI_MODE +}; + +/* + * assume worst case: SATA queue depth of 31 minus 4 internal firmware commands + */ +#define PQI_PHYSICAL_DISK_DEFAULT_MAX_QUEUE_DEPTH 27 + +/* CISS commands */ +#define CISS_READ 0xc0 +#define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */ +#define CISS_REPORT_PHYS 0xc3 /* Report Physical LUNs */ +#define CISS_GET_RAID_MAP 0xc8 + +/* BMIC commands */ +#define BMIC_IDENTIFY_CONTROLLER 0x11 +#define BMIC_IDENTIFY_PHYSICAL_DEVICE 0x15 +#define BMIC_READ 0x26 +#define BMIC_WRITE 0x27 +#define BMIC_SENSE_FEATURE 0x61 +#define BMIC_SENSE_CONTROLLER_PARAMETERS 0x64 +#define BMIC_SENSE_SUBSYSTEM_INFORMATION 0x66 +#define BMIC_CSMI_PASSTHRU 0x68 +#define BMIC_WRITE_HOST_WELLNESS 0xa5 +#define BMIC_FLUSH_CACHE 0xc2 +#define BMIC_SET_DIAG_OPTIONS 0xf4 +#define BMIC_SENSE_DIAG_OPTIONS 0xf5 + +#define CSMI_CC_SAS_SMP_PASSTHRU 0x17 + +#define SA_FLUSH_CACHE 0x1 + +#define MASKED_DEVICE(lunid) ((lunid)[3] & 0xc0) +#define CISS_GET_LEVEL_2_BUS(lunid) ((lunid)[7] & 0x3f) +#define CISS_GET_LEVEL_2_TARGET(lunid) ((lunid)[6]) +#define CISS_GET_DRIVE_NUMBER(lunid) \ + (((CISS_GET_LEVEL_2_BUS((lunid)) - 1) << 8) + \ + CISS_GET_LEVEL_2_TARGET((lunid))) + +#define LV_GET_DRIVE_TYPE_MIX(lunid) ((lunid)[6]) + +#define LV_DRIVE_TYPE_MIX_UNKNOWN 0 +#define LV_DRIVE_TYPE_MIX_NO_RESTRICTION 1 +#define LV_DRIVE_TYPE_MIX_SAS_HDD_ONLY 2 +#define LV_DRIVE_TYPE_MIX_SATA_HDD_ONLY 3 +#define LV_DRIVE_TYPE_MIX_SAS_OR_SATA_SSD_ONLY 4 +#define LV_DRIVE_TYPE_MIX_SAS_SSD_ONLY 5 +#define LV_DRIVE_TYPE_MIX_SATA_SSD_ONLY 6 +#define LV_DRIVE_TYPE_MIX_SAS_ONLY 7 +#define LV_DRIVE_TYPE_MIX_SATA_ONLY 8 +#define LV_DRIVE_TYPE_MIX_NVME_ONLY 9 + +#define NO_TIMEOUT ((unsigned long) -1) + +#pragma pack(1) + +struct bmic_identify_controller { + u8 configured_logical_drive_count; + __le32 configuration_signature; + u8 firmware_version_short[4]; + u8 reserved[145]; + __le16 extended_logical_unit_count; + u8 reserved1[34]; + __le16 firmware_build_number; + u8 reserved2[8]; + u8 vendor_id[8]; + u8 product_id[16]; + u8 reserved3[62]; + __le32 extra_controller_flags; + u8 reserved4[2]; + u8 controller_mode; + u8 spare_part_number[32]; + u8 firmware_version_long[32]; +}; + +/* constants for extra_controller_flags field of bmic_identify_controller */ +#define 
BMIC_IDENTIFY_EXTRA_FLAGS_LONG_FW_VERSION_SUPPORTED 0x20000000 + +struct bmic_sense_subsystem_info { + u8 reserved[44]; + u8 ctrl_serial_number[16]; +}; + +/* constants for device_type field */ +#define SA_DEVICE_TYPE_SATA 0x1 +#define SA_DEVICE_TYPE_SAS 0x2 +#define SA_DEVICE_TYPE_EXPANDER_SMP 0x5 +#define SA_DEVICE_TYPE_SES 0x6 +#define SA_DEVICE_TYPE_CONTROLLER 0x7 +#define SA_DEVICE_TYPE_NVME 0x9 + +struct bmic_identify_physical_device { + u8 scsi_bus; /* SCSI Bus number on controller */ + u8 scsi_id; /* SCSI ID on this bus */ + __le16 block_size; /* sector size in bytes */ + __le32 total_blocks; /* number for sectors on drive */ + __le32 reserved_blocks; /* controller reserved (RIS) */ + u8 model[40]; /* Physical Drive Model */ + u8 serial_number[40]; /* Drive Serial Number */ + u8 firmware_revision[8]; /* drive firmware revision */ + u8 scsi_inquiry_bits; /* inquiry byte 7 bits */ + u8 compaq_drive_stamp; /* 0 means drive not stamped */ + u8 last_failure_reason; + u8 flags; + u8 more_flags; + u8 scsi_lun; /* SCSI LUN for phys drive */ + u8 yet_more_flags; + u8 even_more_flags; + __le32 spi_speed_rules; + u8 phys_connector[2]; /* connector number on controller */ + u8 phys_box_on_bus; /* phys enclosure this drive resides */ + u8 phys_bay_in_box; /* phys drv bay this drive resides */ + __le32 rpm; /* drive rotational speed in RPM */ + u8 device_type; /* type of drive */ + u8 sata_version; /* only valid when device_type = */ + /* SA_DEVICE_TYPE_SATA */ + __le64 big_total_block_count; + __le64 ris_starting_lba; + __le32 ris_size; + u8 wwid[20]; + u8 controller_phy_map[32]; + __le16 phy_count; + u8 phy_connected_dev_type[256]; + u8 phy_to_drive_bay_num[256]; + __le16 phy_to_attached_dev_index[256]; + u8 box_index; + u8 reserved; + __le16 extra_physical_drive_flags; + u8 negotiated_link_rate[256]; + u8 phy_to_phy_map[256]; + u8 redundant_path_present_map; + u8 redundant_path_failure_map; + u8 active_path_number; + __le16 alternate_paths_phys_connector[8]; + u8 alternate_paths_phys_box_on_port[8]; + u8 multi_lun_device_lun_count; + u8 minimum_good_fw_revision[8]; + u8 unique_inquiry_bytes[20]; + u8 current_temperature_degrees; + u8 temperature_threshold_degrees; + u8 max_temperature_degrees; + u8 logical_blocks_per_phys_block_exp; + __le16 current_queue_depth_limit; + u8 switch_name[10]; + __le16 switch_port; + u8 alternate_paths_switch_name[40]; + u8 alternate_paths_switch_port[8]; + __le16 power_on_hours; + __le16 percent_endurance_used; + u8 drive_authentication; + u8 smart_carrier_authentication; + u8 smart_carrier_app_fw_version; + u8 smart_carrier_bootloader_fw_version; + u8 sanitize_flags; + u8 encryption_key_flags; + u8 encryption_key_name[64]; + __le32 misc_drive_flags; + __le16 dek_index; + __le16 hba_drive_encryption_flags; + __le16 max_overwrite_time; + __le16 max_block_erase_time; + __le16 max_crypto_erase_time; + u8 connector_info[5]; + u8 connector_name[8][8]; + u8 page_83_identifier[16]; + u8 maximum_link_rate[256]; + u8 negotiated_physical_link_rate[256]; + u8 box_connector_name[8]; + u8 padding_to_multiple_of_512[9]; +}; + +#define BMIC_SENSE_FEATURE_IO_PAGE 0x8 +#define BMIC_SENSE_FEATURE_IO_PAGE_AIO_SUBPAGE 0x2 + +struct bmic_sense_feature_buffer_header { + u8 page_code; + u8 subpage_code; + __le16 buffer_length; +}; + +struct bmic_sense_feature_page_header { + u8 page_code; + u8 subpage_code; + __le16 page_length; +}; + +struct bmic_sense_feature_io_page_aio_subpage { + struct bmic_sense_feature_page_header header; + u8 firmware_read_support; + u8 driver_read_support; + 
u8 firmware_write_support; + u8 driver_write_support; + __le16 max_transfer_encrypted_sas_sata; + __le16 max_transfer_encrypted_nvme; + __le16 max_write_raid_5_6; + __le16 max_write_raid_1_10_2drive; + __le16 max_write_raid_1_10_3drive; +}; + +struct bmic_smp_request { + u8 frame_type; + u8 function; + u8 allocated_response_length; + u8 request_length; + u8 additional_request_bytes[1016]; +}; + +struct bmic_smp_response { + u8 frame_type; + u8 function; + u8 function_result; + u8 response_length; + u8 additional_response_bytes[1016]; +}; + +struct bmic_csmi_ioctl_header { + __le32 header_length; + u8 signature[8]; + __le32 timeout; + __le32 control_code; + __le32 return_code; + __le32 length; +}; + +struct bmic_csmi_smp_passthru { + u8 phy_identifier; + u8 port_identifier; + u8 connection_rate; + u8 reserved; + __be64 destination_sas_address; + __le32 request_length; + struct bmic_smp_request request; + u8 connection_status; + u8 reserved1[3]; + __le32 response_length; + struct bmic_smp_response response; +}; + +struct bmic_csmi_smp_passthru_buffer { + struct bmic_csmi_ioctl_header ioctl_header; + struct bmic_csmi_smp_passthru parameters; +}; + +struct bmic_flush_cache { + u8 disable_flag; + u8 system_power_action; + u8 ndu_flush; + u8 shutdown_event; + u8 reserved[28]; +}; + +/* for shutdown_event member of struct bmic_flush_cache */ +enum bmic_flush_cache_shutdown_event { + NONE_CACHE_FLUSH_ONLY = 0, + SHUTDOWN = 1, + HIBERNATE = 2, + SUSPEND = 3, + RESTART = 4 +}; + +struct bmic_diag_options { + __le32 options; +}; + +#pragma pack() + +static inline struct pqi_ctrl_info *shost_to_hba(struct Scsi_Host *shost) +{ + void *hostdata = shost_priv(shost); + + return *((struct pqi_ctrl_info **)hostdata); +} + +void pqi_sas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy); +int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd); +int pqi_add_sas_host(struct Scsi_Host *shost, struct pqi_ctrl_info *ctrl_info); +void pqi_delete_sas_host(struct pqi_ctrl_info *ctrl_info); +int pqi_add_sas_device(struct pqi_sas_node *pqi_sas_node, + struct pqi_scsi_dev *device); +void pqi_remove_sas_device(struct pqi_scsi_dev *device); +struct pqi_scsi_dev *pqi_find_device_by_sas_rphy( + struct pqi_ctrl_info *ctrl_info, struct sas_rphy *rphy); +void pqi_prep_for_scsi_done(struct scsi_cmnd *scmd); +int pqi_csmi_smp_passthru(struct pqi_ctrl_info *ctrl_info, + struct bmic_csmi_smp_passthru_buffer *buffer, size_t buffer_length, + struct pqi_raid_error_info *error_info); + +extern struct sas_function_template pqi_sas_transport_functions; + +#endif /* _SMARTPQI_H */ diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_init.c b/drivers/amazon/scsi/smartpqi/smartpqi_init.c new file mode 100644 index 0000000000000..cd17a128c5133 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_init.c @@ -0,0 +1,10600 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. 
See the GNU General Public License for more details.
+ *
+ * Questions/Comments/Bugfixes to storagedev@microchip.com
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/rtc.h>
+#include <linux/bcd.h>
+#include <linux/reboot.h>
+#include <linux/cciss_ioctl.h>
+#include <linux/blk-mq-pci.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_eh.h>
+#include <scsi/scsi_transport_sas.h>
+#include <asm/unaligned.h>
+#include "smartpqi.h"
+#include "smartpqi_sis.h"
+#include "smartpqi_kernel_compat.h"
+
+#if !defined(BUILD_TIMESTAMP)
+#define BUILD_TIMESTAMP
+#endif
+
+#define DRIVER_VERSION "2.1.22-040"
+#define DRIVER_MAJOR 2
+#define DRIVER_MINOR 1
+#define DRIVER_RELEASE 22
+#define DRIVER_REVISION 32
+
+#define DRIVER_NAME "Microchip SmartPQI Driver (v" \
+	DRIVER_VERSION BUILD_TIMESTAMP ")"
+#define DRIVER_NAME_SHORT "smartpqi"
+
+#define PQI_EXTRA_SGL_MEMORY (12 * sizeof(struct pqi_sg_descriptor))
+#define PQI_1MB_SECTORS 2048 /* sectors */
+
+#define PQI_POST_RESET_DELAY_SECS 5
+#define PQI_POST_OFA_RESET_DELAY_UPON_TIMEOUT_SECS 10
+
+MODULE_AUTHOR("Microchip");
+#if TORTUGA
+MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version "
+	DRIVER_VERSION " (d-b7f1535/s-ed725ab)" " (d147/s325)");
+#else
+MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version "
+	DRIVER_VERSION " (d-b7f1535/s-ed725ab)");
+#endif
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+
+static void pqi_verify_structures(void);
+static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info,
+	enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason);
+static void pqi_ctrl_offline_worker(struct work_struct *work);
+static int pqi_scan_scsi_devices(struct pqi_ctrl_info *ctrl_info);
+static void pqi_scan_start(struct Scsi_Host *shost);
+static void pqi_start_io(struct pqi_ctrl_info *ctrl_info,
+	struct pqi_queue_group *queue_group, enum pqi_io_path path,
+	struct pqi_io_request *io_request);
+static int pqi_submit_raid_request_synchronous(struct pqi_ctrl_info *ctrl_info,
+	struct pqi_iu_header *request, unsigned int flags,
+	struct pqi_raid_error_info *error_info);
+static int pqi_aio_submit_io(struct pqi_ctrl_info *ctrl_info,
+	struct scsi_cmnd *scmd, u32 aio_handle, u8 *cdb,
+	unsigned int cdb_length, struct pqi_queue_group *queue_group,
+	struct pqi_encryption_info *encryption_info, bool raid_bypass, bool io_high_prio);
+static int pqi_aio_submit_r1_write_io(struct pqi_ctrl_info *ctrl_info,
+	struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group,
+	struct pqi_encryption_info *encryption_info, struct pqi_scsi_dev *device,
+	struct pqi_scsi_dev_raid_map_data *rmd);
+static int pqi_aio_submit_r56_write_io(struct pqi_ctrl_info *ctrl_info,
+	struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group,
+	struct pqi_encryption_info *encryption_info, struct pqi_scsi_dev *device,
+	struct pqi_scsi_dev_raid_map_data *rmd);
+static void pqi_ofa_ctrl_quiesce(struct pqi_ctrl_info *ctrl_info);
+static void pqi_ofa_ctrl_unquiesce(struct pqi_ctrl_info *ctrl_info);
+static int pqi_ofa_ctrl_restart(struct pqi_ctrl_info *ctrl_info, unsigned int delay_secs);
+static void pqi_ofa_setup_host_buffer(struct pqi_ctrl_info *ctrl_info);
+static void pqi_ofa_free_host_buffer(struct pqi_ctrl_info *ctrl_info);
+static int pqi_ofa_host_memory_update(struct pqi_ctrl_info *ctrl_info);
+static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info,
+	struct pqi_scsi_dev *device, u8 lun, unsigned long timeout_msecs);
+static void pqi_fail_all_outstanding_requests(struct pqi_ctrl_info *ctrl_info);
+
+/* for flags argument to pqi_submit_raid_request_synchronous() */
+#define PQI_SYNC_FLAGS_INTERRUPTABLE 0x1
+
+static struct scsi_transport_template *pqi_sas_transport_template; + +static atomic_t pqi_controller_count = ATOMIC_INIT(0); + +enum pqi_lockup_action { + NONE, + REBOOT, + PANIC +}; + +static enum pqi_lockup_action pqi_lockup_action = NONE; + +static struct { + enum pqi_lockup_action action; + char *name; +} pqi_lockup_actions[] = { + { + .action = NONE, + .name = "none", + }, + { + .action = REBOOT, + .name = "reboot", + }, + { + .action = PANIC, + .name = "panic", + }, +}; + +static unsigned int pqi_supported_event_types[] = { + PQI_EVENT_TYPE_HOTPLUG, + PQI_EVENT_TYPE_HARDWARE, + PQI_EVENT_TYPE_PHYSICAL_DEVICE, + PQI_EVENT_TYPE_LOGICAL_DEVICE, + PQI_EVENT_TYPE_OFA, + PQI_EVENT_TYPE_AIO_STATE_CHANGE, + PQI_EVENT_TYPE_AIO_CONFIG_CHANGE, +}; + +static int pqi_disable_device_id_wildcards; +module_param_named(disable_device_id_wildcards, + pqi_disable_device_id_wildcards, int, 0644); +MODULE_PARM_DESC(disable_device_id_wildcards, + "Disable device ID wildcards."); + +static int pqi_disable_heartbeat; +module_param_named(disable_heartbeat, + pqi_disable_heartbeat, int, 0644); +MODULE_PARM_DESC(disable_heartbeat, + "Disable heartbeat."); + +static int pqi_disable_ctrl_shutdown; +module_param_named(disable_ctrl_shutdown, + pqi_disable_ctrl_shutdown, int, 0644); +MODULE_PARM_DESC(disable_ctrl_shutdown, + "Disable controller shutdown when controller locked up."); + +static char *pqi_lockup_action_param; +module_param_named(lockup_action, + pqi_lockup_action_param, charp, 0644); +MODULE_PARM_DESC(lockup_action, "Action to take when controller locked up.\n" + "\t\tSupported: none, reboot, panic\n" + "\t\tDefault: none"); + +static int pqi_expose_ld_first; +module_param_named(expose_ld_first, + pqi_expose_ld_first, int, 0644); +MODULE_PARM_DESC(expose_ld_first, "Expose logical drives before physical drives."); + +static int pqi_hide_vsep; +module_param_named(hide_vsep, + pqi_hide_vsep, int, 0644); +MODULE_PARM_DESC(hide_vsep, "Hide the virtual SEP for direct attached drives."); + +static int pqi_limit_xfer_to_1MB; +module_param_named(limit_xfer_size_to_1MB, + pqi_limit_xfer_to_1MB, int, 0644); +MODULE_PARM_DESC(limit_xfer_size_to_1MB, "Limit max transfer size to 1MB."); + +static int pqi_disable_managed_interrupts; +module_param_named(disable_managed_interrupts, + pqi_disable_managed_interrupts, int, 0644); +MODULE_PARM_DESC(disable_managed_interrupts, + "Disable the kernel automatically assigning SMP affinity to IRQs."); + +static unsigned int pqi_ctrl_ready_timeout_secs; +module_param_named(ctrl_ready_timeout, + pqi_ctrl_ready_timeout_secs, uint, 0644); +MODULE_PARM_DESC(ctrl_ready_timeout, + "Timeout in seconds for driver to wait for controller ready."); + +static char *raid_levels[] = { + "RAID-0", + "RAID-4", + "RAID-1(1+0)", + "RAID-5", + "RAID-5+1", + "RAID-6", + "RAID-1(Triple)", +}; + +static char *pqi_raid_level_to_string(u8 raid_level) +{ + if (raid_level < ARRAY_SIZE(raid_levels)) + return raid_levels[raid_level]; + + return "RAID UNKNOWN"; +} + +#define SA_RAID_0 0 +#define SA_RAID_4 1 +#define SA_RAID_1 2 /* also used for RAID 10 */ +#define SA_RAID_5 3 /* also used for RAID 50 */ +#define SA_RAID_51 4 +#define SA_RAID_6 5 /* also used for RAID 60 */ +#define SA_RAID_TRIPLE 6 /* also used for RAID 1+0 Triple */ +#define SA_RAID_MAX SA_RAID_TRIPLE +#define SA_RAID_UNKNOWN 0xff + +static inline bool pqi_scsi3addr_equal(u8 *scsi3addr1, u8 *scsi3addr2) +{ + return memcmp(scsi3addr1, scsi3addr2, 8) == 0; +} + +static inline bool pqi_is_logical_device(struct pqi_scsi_dev *device) +{ + 
return !device->is_physical_device; +} + +static inline bool pqi_is_external_raid_addr(u8 *scsi3addr) +{ + return scsi3addr[2] != 0; +} + +static inline bool pqi_ctrl_offline(struct pqi_ctrl_info *ctrl_info) +{ + return !ctrl_info->controller_online; +} + +static inline void pqi_check_ctrl_health(struct pqi_ctrl_info *ctrl_info) +{ + if (ctrl_info->controller_online) + if (!sis_is_firmware_running(ctrl_info)) + pqi_take_ctrl_offline(ctrl_info, PQI_FIRMWARE_KERNEL_NOT_UP); +} + +static inline bool pqi_is_hba_lunid(u8 *scsi3addr) +{ + return pqi_scsi3addr_equal(scsi3addr, RAID_CTLR_LUNID); +} + +#define PQI_DRIVER_SCRATCH_PQI_MODE 0x1 +#define PQI_DRIVER_SCRATCH_FW_TRIAGE_SUPPORTED 0x2 + +static inline enum pqi_ctrl_mode pqi_get_ctrl_mode(struct pqi_ctrl_info *ctrl_info) +{ + return sis_read_driver_scratch(ctrl_info) & PQI_DRIVER_SCRATCH_PQI_MODE ? PQI_MODE : SIS_MODE; +} + +static inline void pqi_save_ctrl_mode(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_mode mode) +{ + u32 driver_scratch; + + driver_scratch = sis_read_driver_scratch(ctrl_info); + + if (mode == PQI_MODE) + driver_scratch |= PQI_DRIVER_SCRATCH_PQI_MODE; + else + driver_scratch &= ~PQI_DRIVER_SCRATCH_PQI_MODE; + + sis_write_driver_scratch(ctrl_info, driver_scratch); +} + +static inline bool pqi_is_fw_triage_supported(struct pqi_ctrl_info *ctrl_info) +{ + return (sis_read_driver_scratch(ctrl_info) & PQI_DRIVER_SCRATCH_FW_TRIAGE_SUPPORTED) != 0; +} + +static inline void pqi_save_fw_triage_setting(struct pqi_ctrl_info *ctrl_info, bool is_supported) +{ + u32 driver_scratch; + + driver_scratch = sis_read_driver_scratch(ctrl_info); + + if (is_supported) + driver_scratch |= PQI_DRIVER_SCRATCH_FW_TRIAGE_SUPPORTED; + else + driver_scratch &= ~PQI_DRIVER_SCRATCH_FW_TRIAGE_SUPPORTED; + + sis_write_driver_scratch(ctrl_info, driver_scratch); +} + +static inline void pqi_ctrl_block_scan(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->scan_blocked = true; + mutex_lock(&ctrl_info->scan_mutex); +} + +static inline void pqi_ctrl_unblock_scan(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->scan_blocked = false; + mutex_unlock(&ctrl_info->scan_mutex); +} + +static inline bool pqi_ctrl_scan_blocked(struct pqi_ctrl_info *ctrl_info) +{ + return ctrl_info->scan_blocked; +} + +static inline void pqi_ctrl_block_device_reset(struct pqi_ctrl_info *ctrl_info) +{ + mutex_lock(&ctrl_info->lun_reset_mutex); +} + +static inline void pqi_ctrl_unblock_device_reset(struct pqi_ctrl_info *ctrl_info) +{ + mutex_unlock(&ctrl_info->lun_reset_mutex); +} + +static inline void pqi_scsi_block_requests(struct pqi_ctrl_info *ctrl_info) +{ + struct Scsi_Host *shost; + unsigned int num_loops; + int msecs_sleep; + + shost = ctrl_info->scsi_host; + + scsi_block_requests(shost); + + num_loops = 0; + msecs_sleep = 20; + while (pqi_scsi_host_busy(shost)) { + num_loops++; + if (num_loops == 10) { + dev_warn(&ctrl_info->pci_dev->dev, + "shost %d Waited for %d milli seconds to be unbusy\n", + shost->host_no, num_loops * msecs_sleep); + msecs_sleep = 500; + } + msleep(msecs_sleep); + if(num_loops % 20 == 0) + dev_warn(&ctrl_info->pci_dev->dev, + "shost %d waited for %d more seconds to be unbusy\n", + shost->host_no, msecs_sleep * 20 / 1000); + } +} + +static inline void pqi_scsi_unblock_requests(struct pqi_ctrl_info *ctrl_info) +{ + scsi_unblock_requests(ctrl_info->scsi_host); +} + +static inline void pqi_ctrl_busy(struct pqi_ctrl_info *ctrl_info) +{ + atomic_inc(&ctrl_info->num_busy_threads); +} + +static inline void pqi_ctrl_unbusy(struct pqi_ctrl_info *ctrl_info) +{ + 
atomic_dec(&ctrl_info->num_busy_threads); +} + +static inline bool pqi_ctrl_blocked(struct pqi_ctrl_info *ctrl_info) +{ + return ctrl_info->block_requests; +} + +static inline void pqi_ctrl_block_requests(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->block_requests = true; +} + +static inline void pqi_ctrl_unblock_requests(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->block_requests = false; + wake_up_all(&ctrl_info->block_requests_wait); +} + +static void pqi_wait_if_ctrl_blocked(struct pqi_ctrl_info *ctrl_info) +{ + if (!pqi_ctrl_blocked(ctrl_info)) + return; + + atomic_inc(&ctrl_info->num_blocked_threads); + wait_event(ctrl_info->block_requests_wait, + !pqi_ctrl_blocked(ctrl_info)); + atomic_dec(&ctrl_info->num_blocked_threads); +} + +#define PQI_QUIESCE_WARNING_TIMEOUT_SECS 10 + +static inline void pqi_ctrl_wait_until_quiesced(struct pqi_ctrl_info *ctrl_info) +{ + unsigned long start_jiffies; + unsigned long warning_timeout; + bool displayed_warning; + + displayed_warning = false; + start_jiffies = jiffies; + warning_timeout = (PQI_QUIESCE_WARNING_TIMEOUT_SECS * HZ) + start_jiffies; + + while (atomic_read(&ctrl_info->num_busy_threads) > + atomic_read(&ctrl_info->num_blocked_threads)) { + if (time_after(jiffies, warning_timeout)) { + dev_warn(&ctrl_info->pci_dev->dev, + "waiting %u seconds for driver activity to quiesce\n", + jiffies_to_msecs(jiffies - start_jiffies) / 1000); + displayed_warning = true; + warning_timeout = (PQI_QUIESCE_WARNING_TIMEOUT_SECS * HZ) + jiffies; + } + msleep(1); + } + + if (displayed_warning) + dev_warn(&ctrl_info->pci_dev->dev, + "driver activity quiesced after waiting for %u seconds\n", + jiffies_to_msecs(jiffies - start_jiffies) / 1000); +} + +static inline bool pqi_device_offline(struct pqi_scsi_dev *device) +{ + return device->device_offline; +} + +static inline void pqi_ctrl_ofa_start(struct pqi_ctrl_info *ctrl_info) +{ + mutex_lock(&ctrl_info->ofa_mutex); +} + +static inline void pqi_ctrl_ofa_done(struct pqi_ctrl_info *ctrl_info) +{ + mutex_unlock(&ctrl_info->ofa_mutex); +} + +static inline void pqi_wait_until_ofa_finished(struct pqi_ctrl_info *ctrl_info) +{ + mutex_lock(&ctrl_info->ofa_mutex); + mutex_unlock(&ctrl_info->ofa_mutex); +} + +static inline bool pqi_ofa_in_progress(struct pqi_ctrl_info *ctrl_info) +{ + return mutex_is_locked(&ctrl_info->ofa_mutex); +} + +static inline void pqi_device_remove_start(struct pqi_scsi_dev *device) +{ + device->in_remove = true; +} + +static inline bool pqi_device_in_remove(struct pqi_scsi_dev *device) +{ + return device->in_remove; +} + +static inline int pqi_event_type_to_event_index(unsigned int event_type) +{ + int index; + + for (index = 0; index < ARRAY_SIZE(pqi_supported_event_types); index++) + if (event_type == pqi_supported_event_types[index]) + return index; + + return -1; +} + +static inline bool pqi_is_supported_event(unsigned int event_type) +{ + return pqi_event_type_to_event_index(event_type) != -1; +} + +static inline void pqi_schedule_rescan_worker_with_delay(struct pqi_ctrl_info *ctrl_info, + unsigned long delay) +{ + if (pqi_ctrl_offline(ctrl_info)) + return; + + schedule_delayed_work(&ctrl_info->rescan_work, delay); +} + +static inline void pqi_schedule_rescan_worker(struct pqi_ctrl_info *ctrl_info) +{ + pqi_schedule_rescan_worker_with_delay(ctrl_info, 0); +} + +#define PQI_RESCAN_WORK_DELAY (10 * HZ) + +static inline void pqi_schedule_rescan_worker_delayed(struct pqi_ctrl_info *ctrl_info) +{ + pqi_schedule_rescan_worker_with_delay(ctrl_info, PQI_RESCAN_WORK_DELAY); +} + +static inline 
void pqi_cancel_rescan_worker(struct pqi_ctrl_info *ctrl_info) +{ + cancel_delayed_work_sync(&ctrl_info->rescan_work); +} + +static inline u32 pqi_read_heartbeat_counter(struct pqi_ctrl_info *ctrl_info) +{ + if (!ctrl_info->heartbeat_counter) + return 0; + + return readl(ctrl_info->heartbeat_counter); +} + +static inline u8 pqi_read_soft_reset_status(struct pqi_ctrl_info *ctrl_info) +{ + return readb(ctrl_info->soft_reset_status); +} + +static inline void pqi_clear_soft_reset_status(struct pqi_ctrl_info *ctrl_info) +{ + u8 status; + + status = pqi_read_soft_reset_status(ctrl_info); + status &= ~PQI_SOFT_RESET_ABORT; + writeb(status, ctrl_info->soft_reset_status); +} + +static inline bool pqi_is_io_high_priority(struct pqi_scsi_dev *device, struct scsi_cmnd *scmd) +{ + bool io_high_prio; + int priority_class; + + io_high_prio = false; + + if (device->ncq_prio_enable) { + priority_class = + IOPRIO_PRIO_CLASS(req_get_ioprio(PQI_SCSI_REQUEST(scmd))); + if (priority_class == IOPRIO_CLASS_RT) { + /* Set NCQ priority for read/write commands. */ + switch (scmd->cmnd[0]) { + case WRITE_16: + case READ_16: + case WRITE_12: + case READ_12: + case WRITE_10: + case READ_10: + case WRITE_6: + case READ_6: + io_high_prio = true; + break; + } + } + } + + return io_high_prio; +} + +static int pqi_map_single(struct pci_dev *pci_dev, + struct pqi_sg_descriptor *sg_descriptor, void *buffer, + size_t buffer_length, enum dma_data_direction data_direction) +{ + dma_addr_t bus_address; + + if (!buffer || buffer_length == 0 || data_direction == DMA_NONE) + return 0; + + bus_address = dma_map_single(&pci_dev->dev, buffer, buffer_length, + data_direction); + if (dma_mapping_error(&pci_dev->dev, bus_address)) + return -ENOMEM; + + put_unaligned_le64((u64)bus_address, &sg_descriptor->address); + put_unaligned_le32(buffer_length, &sg_descriptor->length); + put_unaligned_le32(CISS_SG_LAST, &sg_descriptor->flags); + + return 0; +} + +static void pqi_pci_unmap(struct pci_dev *pci_dev, + struct pqi_sg_descriptor *descriptors, int num_descriptors, + enum dma_data_direction data_direction) +{ + int i; + + if (data_direction == DMA_NONE) + return; + + for (i = 0; i < num_descriptors; i++) + dma_unmap_single(&pci_dev->dev, + (dma_addr_t)get_unaligned_le64(&descriptors[i].address), + get_unaligned_le32(&descriptors[i].length), + data_direction); +} + +static int pqi_build_raid_path_request(struct pqi_ctrl_info *ctrl_info, + struct pqi_raid_path_request *request, u8 cmd, + u8 *scsi3addr, void *buffer, size_t buffer_length, + u16 vpd_page, enum dma_data_direction *dir) +{ + u8 *cdb; + size_t cdb_length = buffer_length; + + memset(request, 0, sizeof(*request)); + + request->header.iu_type = PQI_REQUEST_IU_RAID_PATH_IO; + put_unaligned_le16(offsetof(struct pqi_raid_path_request, + sg_descriptors[1]) - PQI_REQUEST_HEADER_LENGTH, + &request->header.iu_length); + put_unaligned_le32(buffer_length, &request->buffer_length); + memcpy(request->lun_number, scsi3addr, sizeof(request->lun_number)); + request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_0; + + cdb = request->cdb; + + switch (cmd) { + case INQUIRY: + request->data_direction = SOP_READ_FLAG; + cdb[0] = INQUIRY; + if (vpd_page & VPD_PAGE) { + cdb[1] = 0x1; + cdb[2] = (u8)vpd_page; + } + cdb[4] = (u8)cdb_length; + break; + case CISS_REPORT_LOG: + case CISS_REPORT_PHYS: + request->data_direction = SOP_READ_FLAG; + cdb[0] = cmd; + if (cmd == CISS_REPORT_PHYS) { + if (ctrl_info->rpl_extended_format_4_5_supported) + 
cdb[1] = CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_4; + else + cdb[1] = CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_2; + } else { + cdb[1] = ctrl_info->ciss_report_log_flags; + } + put_unaligned_be32(cdb_length, &cdb[6]); + break; + case CISS_GET_RAID_MAP: + request->data_direction = SOP_READ_FLAG; + cdb[0] = CISS_READ; + cdb[1] = CISS_GET_RAID_MAP; + put_unaligned_be32(cdb_length, &cdb[6]); + break; + case SA_FLUSH_CACHE: + request->header.driver_flags = PQI_DRIVER_NONBLOCKABLE_REQUEST; + request->data_direction = SOP_WRITE_FLAG; + cdb[0] = BMIC_WRITE; + cdb[6] = BMIC_FLUSH_CACHE; + put_unaligned_be16(cdb_length, &cdb[7]); + break; + case BMIC_SENSE_DIAG_OPTIONS: + cdb_length = 0; + /* fall through */ + case BMIC_IDENTIFY_CONTROLLER: + case BMIC_IDENTIFY_PHYSICAL_DEVICE: + case BMIC_SENSE_SUBSYSTEM_INFORMATION: + case BMIC_SENSE_FEATURE: + request->data_direction = SOP_READ_FLAG; + cdb[0] = BMIC_READ; + cdb[6] = cmd; + put_unaligned_be16(cdb_length, &cdb[7]); + break; + case BMIC_SET_DIAG_OPTIONS: + cdb_length = 0; + /* fall through */ + case BMIC_WRITE_HOST_WELLNESS: + request->data_direction = SOP_WRITE_FLAG; + cdb[0] = BMIC_WRITE; + cdb[6] = cmd; + put_unaligned_be16(cdb_length, &cdb[7]); + break; + case BMIC_CSMI_PASSTHRU: + request->data_direction = SOP_BIDIRECTIONAL; + cdb[0] = BMIC_WRITE; + cdb[5] = CSMI_CC_SAS_SMP_PASSTHRU; + cdb[6] = cmd; + put_unaligned_be16(cdb_length, &cdb[7]); + break; + default: + dev_err(&ctrl_info->pci_dev->dev, "unknown command 0x%c\n", cmd); + BUG(); + break; + } + + switch (request->data_direction) { + case SOP_READ_FLAG: + *dir = DMA_FROM_DEVICE; + break; + case SOP_WRITE_FLAG: + *dir = DMA_TO_DEVICE; + break; + case SOP_NO_DIRECTION_FLAG: + *dir = DMA_NONE; + break; + default: + *dir = DMA_BIDIRECTIONAL; + break; + } + + return pqi_map_single(ctrl_info->pci_dev, &request->sg_descriptors[0], + buffer, buffer_length, *dir); +} + +static inline void pqi_reinit_io_request(struct pqi_io_request *io_request) +{ + io_request->scmd = NULL; + io_request->status = 0; + io_request->error_info = NULL; + io_request->raid_bypass = false; +} + +static inline struct pqi_io_request *pqi_alloc_io_request(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) +{ + struct pqi_io_request *io_request; + + io_request = pqi_get_io_request(ctrl_info, scmd); + + if (io_request) + pqi_reinit_io_request(io_request); + + return io_request; +} + +static void pqi_free_io_request(struct pqi_io_request *io_request) +{ + atomic_dec(&io_request->refcount); +} + +static int pqi_send_scsi_raid_request(struct pqi_ctrl_info *ctrl_info, u8 cmd, + u8 *scsi3addr, void *buffer, size_t buffer_length, u16 vpd_page, + struct pqi_raid_error_info *error_info) +{ + int rc; + struct pqi_raid_path_request request; + enum dma_data_direction dir; + + rc = pqi_build_raid_path_request(ctrl_info, &request, cmd, scsi3addr, + buffer, buffer_length, vpd_page, &dir); + if (rc) + return rc; + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, error_info); + + pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, dir); + + return rc; +} + +/* helper functions for pqi_send_scsi_raid_request */ + +static inline int pqi_send_ctrl_raid_request(struct pqi_ctrl_info *ctrl_info, + u8 cmd, void *buffer, size_t buffer_length) +{ + return pqi_send_scsi_raid_request(ctrl_info, cmd, RAID_CTLR_LUNID, + buffer, buffer_length, 0, NULL); +} + +static inline int pqi_send_ctrl_raid_with_error(struct pqi_ctrl_info *ctrl_info, + u8 cmd, void *buffer, size_t buffer_length, + struct pqi_raid_error_info 
*error_info) +{ + return pqi_send_scsi_raid_request(ctrl_info, cmd, RAID_CTLR_LUNID, + buffer, buffer_length, 0, error_info); +} + +static inline int pqi_identify_controller(struct pqi_ctrl_info *ctrl_info, + struct bmic_identify_controller *buffer) +{ + return pqi_send_ctrl_raid_request(ctrl_info, BMIC_IDENTIFY_CONTROLLER, + buffer, sizeof(*buffer)); +} + +static inline int pqi_sense_subsystem_info(struct pqi_ctrl_info *ctrl_info, + struct bmic_sense_subsystem_info *sense_info) +{ + return pqi_send_ctrl_raid_request(ctrl_info, + BMIC_SENSE_SUBSYSTEM_INFORMATION, sense_info, + sizeof(*sense_info)); +} + +static inline int pqi_scsi_inquiry(struct pqi_ctrl_info *ctrl_info, + u8 *scsi3addr, u16 vpd_page, void *buffer, size_t buffer_length) +{ + return pqi_send_scsi_raid_request(ctrl_info, INQUIRY, scsi3addr, + buffer, buffer_length, vpd_page, NULL); +} + +static int pqi_identify_physical_device(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, + struct bmic_identify_physical_device *buffer, size_t buffer_length) +{ + int rc; + enum dma_data_direction dir; + u16 bmic_device_index; + struct pqi_raid_path_request request; + + rc = pqi_build_raid_path_request(ctrl_info, &request, + BMIC_IDENTIFY_PHYSICAL_DEVICE, RAID_CTLR_LUNID, buffer, + buffer_length, 0, &dir); + if (rc) + return rc; + + bmic_device_index = CISS_GET_DRIVE_NUMBER(device->scsi3addr); + request.cdb[2] = (u8)bmic_device_index; + request.cdb[9] = (u8)(bmic_device_index >> 8); + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); + + pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, dir); + + return rc; +} + +static inline u32 pqi_aio_limit_to_bytes(__le16 *limit) +{ + u32 bytes; + + bytes = get_unaligned_le16(limit); + if (bytes == 0) + bytes = ~0; + else + bytes *= 1024; + + return bytes; +} + +#pragma pack(1) + +struct bmic_sense_feature_buffer { + struct bmic_sense_feature_buffer_header header; + struct bmic_sense_feature_io_page_aio_subpage aio_subpage; +}; + +#pragma pack() + +#define MINIMUM_AIO_SUBPAGE_BUFFER_LENGTH \ + offsetofend(struct bmic_sense_feature_buffer, \ + aio_subpage.max_write_raid_1_10_3drive) + +#define MINIMUM_AIO_SUBPAGE_LENGTH \ + (offsetofend(struct bmic_sense_feature_io_page_aio_subpage, \ + max_write_raid_1_10_3drive) - \ + FIELD_SIZEOF(struct bmic_sense_feature_io_page_aio_subpage, header)) + +static int pqi_get_advanced_raid_bypass_config(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + enum dma_data_direction dir; + struct pqi_raid_path_request request; + struct bmic_sense_feature_buffer *buffer; + + buffer = kmalloc(sizeof(*buffer), GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + rc = pqi_build_raid_path_request(ctrl_info, &request, BMIC_SENSE_FEATURE, RAID_CTLR_LUNID, + buffer, sizeof(*buffer), 0, &dir); + if (rc) + goto error; + + request.cdb[2] = BMIC_SENSE_FEATURE_IO_PAGE; + request.cdb[3] = BMIC_SENSE_FEATURE_IO_PAGE_AIO_SUBPAGE; + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); + + pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, dir); + + if (rc) + goto error; + + if (buffer->header.page_code != BMIC_SENSE_FEATURE_IO_PAGE || + buffer->header.subpage_code != + BMIC_SENSE_FEATURE_IO_PAGE_AIO_SUBPAGE || + get_unaligned_le16(&buffer->header.buffer_length) < + MINIMUM_AIO_SUBPAGE_BUFFER_LENGTH || + buffer->aio_subpage.header.page_code != + BMIC_SENSE_FEATURE_IO_PAGE || + buffer->aio_subpage.header.subpage_code != + BMIC_SENSE_FEATURE_IO_PAGE_AIO_SUBPAGE || + 
get_unaligned_le16(&buffer->aio_subpage.header.page_length) < + MINIMUM_AIO_SUBPAGE_LENGTH) { + goto error; + } + + ctrl_info->max_transfer_encrypted_sas_sata = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_transfer_encrypted_sas_sata); + + ctrl_info->max_transfer_encrypted_nvme = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_transfer_encrypted_nvme); + + ctrl_info->max_write_raid_5_6 = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_write_raid_5_6); + + ctrl_info->max_write_raid_1_10_2drive = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_write_raid_1_10_2drive); + + ctrl_info->max_write_raid_1_10_3drive = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_write_raid_1_10_3drive); + +error: + kfree(buffer); + + return rc; +} + +static int pqi_flush_cache(struct pqi_ctrl_info *ctrl_info, + enum bmic_flush_cache_shutdown_event shutdown_event) +{ + int rc; + struct bmic_flush_cache *flush_cache; + + flush_cache = kzalloc(sizeof(*flush_cache), GFP_KERNEL); + if (!flush_cache) + return -ENOMEM; + + flush_cache->shutdown_event = shutdown_event; + + rc = pqi_send_ctrl_raid_request(ctrl_info, SA_FLUSH_CACHE, flush_cache, + sizeof(*flush_cache)); + + kfree(flush_cache); + + return rc; +} + +int pqi_csmi_smp_passthru(struct pqi_ctrl_info *ctrl_info, + struct bmic_csmi_smp_passthru_buffer *buffer, size_t buffer_length, + struct pqi_raid_error_info *error_info) +{ + return pqi_send_ctrl_raid_with_error(ctrl_info, BMIC_CSMI_PASSTHRU, + buffer, buffer_length, error_info); +} + +#define PQI_FETCH_PTRAID_DATA (1 << 31) + +static int pqi_set_diag_rescan(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_diag_options *diag; + + diag = kzalloc(sizeof(*diag), GFP_KERNEL); + if (!diag) + return -ENOMEM; + + rc = pqi_send_ctrl_raid_request(ctrl_info, BMIC_SENSE_DIAG_OPTIONS, + diag, sizeof(*diag)); + if (rc) + goto out; + + diag->options |= cpu_to_le32(PQI_FETCH_PTRAID_DATA); + + rc = pqi_send_ctrl_raid_request(ctrl_info, BMIC_SET_DIAG_OPTIONS, diag, + sizeof(*diag)); + +out: + kfree(diag); + + return rc; +} + +static inline int pqi_write_host_wellness(struct pqi_ctrl_info *ctrl_info, + void *buffer, size_t buffer_length) +{ + return pqi_send_ctrl_raid_request(ctrl_info, BMIC_WRITE_HOST_WELLNESS, + buffer, buffer_length); +} + +#pragma pack(1) + +struct bmic_host_wellness_driver_version { + u8 start_tag[4]; + u8 driver_version_tag[2]; + __le16 driver_version_length; + char driver_version[32]; + u8 dont_write_tag[2]; + u8 end_tag[2]; +}; + +#pragma pack() + +static int pqi_write_driver_version_to_host_wellness( + struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_host_wellness_driver_version *buffer; + size_t buffer_length; + + buffer_length = sizeof(*buffer); + + buffer = kmalloc(buffer_length, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + buffer->start_tag[0] = '<'; + buffer->start_tag[1] = 'H'; + buffer->start_tag[2] = 'W'; + buffer->start_tag[3] = '>'; + buffer->driver_version_tag[0] = 'D'; + buffer->driver_version_tag[1] = 'V'; + put_unaligned_le16(sizeof(buffer->driver_version), + &buffer->driver_version_length); + strncpy(buffer->driver_version, "Linux " DRIVER_VERSION, + sizeof(buffer->driver_version) - 1); + buffer->driver_version[sizeof(buffer->driver_version) - 1] = '\0'; + buffer->dont_write_tag[0] = 'D'; + buffer->dont_write_tag[1] = 'W'; + buffer->end_tag[0] = 'Z'; + buffer->end_tag[1] = 'Z'; + + rc = pqi_write_host_wellness(ctrl_info, buffer, buffer_length); + + kfree(buffer); + + return rc; +} + +#pragma pack(1) + +struct bmic_host_wellness_time { 
+ u8 start_tag[4]; + u8 time_tag[2]; + __le16 time_length; + u8 time[8]; + u8 dont_write_tag[2]; + u8 end_tag[2]; +}; + +#pragma pack() + +static int pqi_write_current_time_to_host_wellness( + struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_host_wellness_time *buffer; + size_t buffer_length; + unsigned long local_time; + unsigned int year; + struct tm tm; + + buffer_length = sizeof(*buffer); + + buffer = kmalloc(buffer_length, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + buffer->start_tag[0] = '<'; + buffer->start_tag[1] = 'H'; + buffer->start_tag[2] = 'W'; + buffer->start_tag[3] = '>'; + buffer->time_tag[0] = 'T'; + buffer->time_tag[1] = 'D'; + put_unaligned_le16(sizeof(buffer->time), + &buffer->time_length); + + local_time = ktime_get_real_seconds(); + time64_to_tm(local_time, -sys_tz.tz_minuteswest * 60, &tm); + year = tm.tm_year + 1900; + + buffer->time[0] = bin2bcd(tm.tm_hour); + buffer->time[1] = bin2bcd(tm.tm_min); + buffer->time[2] = bin2bcd(tm.tm_sec); + buffer->time[3] = 0; + buffer->time[4] = bin2bcd(tm.tm_mon + 1); + buffer->time[5] = bin2bcd(tm.tm_mday); + buffer->time[6] = bin2bcd(year / 100); + buffer->time[7] = bin2bcd(year % 100); + + buffer->dont_write_tag[0] = 'D'; + buffer->dont_write_tag[1] = 'W'; + buffer->end_tag[0] = 'Z'; + buffer->end_tag[1] = 'Z'; + + rc = pqi_write_host_wellness(ctrl_info, buffer, buffer_length); + + kfree(buffer); + + return rc; +} + +#define PQI_UPDATE_TIME_WORK_INTERVAL (24UL * 60 * 60 * HZ) + +static void pqi_update_time_worker(struct work_struct *work) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = container_of(to_delayed_work(work), struct pqi_ctrl_info, + update_time_work); + + rc = pqi_write_current_time_to_host_wellness(ctrl_info); + if (rc) + dev_warn(&ctrl_info->pci_dev->dev, + "error updating time on controller\n"); + + schedule_delayed_work(&ctrl_info->update_time_work, + PQI_UPDATE_TIME_WORK_INTERVAL); +} + +static inline void pqi_schedule_update_time_worker(struct pqi_ctrl_info *ctrl_info) +{ + schedule_delayed_work(&ctrl_info->update_time_work, 0); +} + +static inline void pqi_cancel_update_time_worker(struct pqi_ctrl_info *ctrl_info) +{ + cancel_delayed_work_sync(&ctrl_info->update_time_work); +} + +static inline int pqi_report_luns(struct pqi_ctrl_info *ctrl_info, u8 cmd, void *buffer, + size_t buffer_length) +{ + return pqi_send_ctrl_raid_request(ctrl_info, cmd, buffer, buffer_length); +} + +static int pqi_report_phys_logical_luns(struct pqi_ctrl_info *ctrl_info, u8 cmd, void **buffer) +{ + int rc; + size_t lun_list_length; + size_t lun_data_length; + size_t new_lun_list_length; + void *lun_data = NULL; + struct report_lun_header *report_lun_header; + + report_lun_header = kmalloc(sizeof(*report_lun_header), GFP_KERNEL); + if (!report_lun_header) { + rc = -ENOMEM; + goto out; + } + + rc = pqi_report_luns(ctrl_info, cmd, report_lun_header, sizeof(*report_lun_header)); + if (rc) + goto out; + + lun_list_length = get_unaligned_be32(&report_lun_header->list_length); + +again: + lun_data_length = sizeof(struct report_lun_header) + lun_list_length; + + lun_data = kmalloc(lun_data_length, GFP_KERNEL); + if (!lun_data) { + rc = -ENOMEM; + goto out; + } + + if (lun_list_length == 0) { + memcpy(lun_data, report_lun_header, sizeof(*report_lun_header)); + goto out; + } + + rc = pqi_report_luns(ctrl_info, cmd, lun_data, lun_data_length); + if (rc) + goto out; + + new_lun_list_length = + get_unaligned_be32(&((struct report_lun_header *)lun_data)->list_length); + + if (new_lun_list_length > lun_list_length) { 
+ lun_list_length = new_lun_list_length; + kfree(lun_data); + goto again; + } + +out: + kfree(report_lun_header); + + if (rc) { + kfree(lun_data); + lun_data = NULL; + } + + *buffer = lun_data; + + return rc; +} + +static inline int pqi_report_phys_luns(struct pqi_ctrl_info *ctrl_info, void **buffer) +{ + int rc; + unsigned int i; + u8 rpl_response_format; + u32 num_physicals; + size_t rpl_16byte_wwid_list_length; + void *rpl_list; + struct report_lun_header *rpl_header; + struct report_phys_lun_8byte_wwid_list *rpl_8byte_wwid_list; + struct report_phys_lun_16byte_wwid_list *rpl_16byte_wwid_list; + + rc = pqi_report_phys_logical_luns(ctrl_info, CISS_REPORT_PHYS, &rpl_list); + if (rc) + return rc; + + if (ctrl_info->rpl_extended_format_4_5_supported) { + rpl_header = rpl_list; + rpl_response_format = rpl_header->flags & CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_MASK; + if (rpl_response_format == CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_4) { + *buffer = rpl_list; + return 0; + } else if (rpl_response_format != CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_2) { + dev_err(&ctrl_info->pci_dev->dev, + "RPL returned unsupported data format %u\n", + rpl_response_format); + return -EINVAL; + } else { + dev_warn(&ctrl_info->pci_dev->dev, + "RPL returned extended format 2 instead of 4\n"); + } + } + + rpl_8byte_wwid_list = rpl_list; + num_physicals = get_unaligned_be32(&rpl_8byte_wwid_list->header.list_length) / sizeof(rpl_8byte_wwid_list->lun_entries[0]); + rpl_16byte_wwid_list_length = sizeof(struct report_lun_header) + (num_physicals * sizeof(struct report_phys_lun_16byte_wwid)); + + rpl_16byte_wwid_list = kmalloc(rpl_16byte_wwid_list_length, GFP_KERNEL); + if (!rpl_16byte_wwid_list) + return -ENOMEM; + + put_unaligned_be32(num_physicals * sizeof(struct report_phys_lun_16byte_wwid), + &rpl_16byte_wwid_list->header.list_length); + rpl_16byte_wwid_list->header.flags = rpl_8byte_wwid_list->header.flags; + + for (i = 0; i < num_physicals; i++) { + memcpy(&rpl_16byte_wwid_list->lun_entries[i].lunid, &rpl_8byte_wwid_list->lun_entries[i].lunid, sizeof(rpl_8byte_wwid_list->lun_entries[i].lunid)); + memcpy(&rpl_16byte_wwid_list->lun_entries[i].wwid[0], &rpl_8byte_wwid_list->lun_entries[i].wwid, sizeof(rpl_8byte_wwid_list->lun_entries[i].wwid)); + memset(&rpl_16byte_wwid_list->lun_entries[i].wwid[8], 0, 8); + rpl_16byte_wwid_list->lun_entries[i].device_type = rpl_8byte_wwid_list->lun_entries[i].device_type; + rpl_16byte_wwid_list->lun_entries[i].device_flags = rpl_8byte_wwid_list->lun_entries[i].device_flags; + rpl_16byte_wwid_list->lun_entries[i].lun_count = rpl_8byte_wwid_list->lun_entries[i].lun_count; + rpl_16byte_wwid_list->lun_entries[i].redundant_paths = rpl_8byte_wwid_list->lun_entries[i].redundant_paths; + rpl_16byte_wwid_list->lun_entries[i].aio_handle = rpl_8byte_wwid_list->lun_entries[i].aio_handle; + } + + kfree(rpl_8byte_wwid_list); + *buffer = rpl_16byte_wwid_list; + + return 0; +} + +static inline int pqi_report_logical_luns(struct pqi_ctrl_info *ctrl_info, void **buffer) +{ + return pqi_report_phys_logical_luns(ctrl_info, CISS_REPORT_LOG, buffer); +} + +static int pqi_get_device_lists(struct pqi_ctrl_info *ctrl_info, + struct report_phys_lun_16byte_wwid_list **physdev_list, + struct report_log_lun_list **logdev_list) +{ + int rc; + size_t logdev_list_length; + size_t logdev_data_length; + struct report_log_lun_list *internal_logdev_list; + struct report_log_lun_list *logdev_data; + struct report_lun_header report_lun_header; + + rc = pqi_report_phys_luns(ctrl_info, (void **)physdev_list); + if (rc) + 
dev_err(&ctrl_info->pci_dev->dev, + "report physical LUNs failed\n"); + + rc = pqi_report_logical_luns(ctrl_info, (void **)logdev_list); + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "report logical LUNs failed\n"); + + /* + * Tack the controller itself onto the end of the logical device list. + */ + + logdev_data = *logdev_list; + + if (logdev_data) { + logdev_list_length = + get_unaligned_be32(&logdev_data->header.list_length); + } else { + memset(&report_lun_header, 0, sizeof(report_lun_header)); + logdev_data = + (struct report_log_lun_list *)&report_lun_header; + logdev_list_length = 0; + } + + logdev_data_length = sizeof(struct report_lun_header) + + logdev_list_length; + + internal_logdev_list = kmalloc(logdev_data_length + + sizeof(struct report_log_lun), GFP_KERNEL); + if (!internal_logdev_list) { + kfree(*logdev_list); + *logdev_list = NULL; + return -ENOMEM; + } + + memcpy(internal_logdev_list, logdev_data, logdev_data_length); + memset((u8 *)internal_logdev_list + logdev_data_length, 0, + sizeof(struct report_log_lun)); + put_unaligned_be32(logdev_list_length + + sizeof(struct report_log_lun), + &internal_logdev_list->header.list_length); + + kfree(*logdev_list); + *logdev_list = internal_logdev_list; + + return 0; +} + +static inline void pqi_set_bus_target_lun(struct pqi_scsi_dev *device, + int bus, int target, int lun) +{ + device->bus = bus; + device->target = target; + device->lun = lun; +} + +static void pqi_assign_bus_target_lun(struct pqi_scsi_dev *device) +{ + u8 *scsi3addr; + u32 lunid; + int bus; + int target; + int lun; + + scsi3addr = device->scsi3addr; + lunid = get_unaligned_le32(scsi3addr); + + if (pqi_is_hba_lunid(scsi3addr)) { + /* The specified device is the controller. */ + pqi_set_bus_target_lun(device, PQI_HBA_BUS, 0, lunid & 0x3fff); + device->target_lun_valid = true; + return; + } + + if (pqi_is_logical_device(device)) { + if (device->is_external_raid_device) { + bus = PQI_EXTERNAL_RAID_VOLUME_BUS; + target = (lunid >> 16) & 0x3fff; + lun = lunid & 0xff; + } else { + bus = PQI_RAID_VOLUME_BUS; + target = 0; + lun = lunid & 0x3fff; + } + pqi_set_bus_target_lun(device, bus, target, lun); + device->target_lun_valid = true; + return; + } + + /* + * Defer target and LUN assignment for non-controller physical devices + * because the SAS transport layer will make these assignments later. 
+ */ + pqi_set_bus_target_lun(device, PQI_PHYSICAL_DEVICE_BUS, 0, 0); +} + +static void pqi_get_raid_level(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + u8 raid_level; + u8 *buffer; + + raid_level = SA_RAID_UNKNOWN; + + buffer = kmalloc(64, GFP_KERNEL); + if (buffer) { + rc = pqi_scsi_inquiry(ctrl_info, device->scsi3addr, + VPD_PAGE | CISS_VPD_LV_DEVICE_GEOMETRY, buffer, 64); + if (rc == 0) { + raid_level = buffer[8]; + if (raid_level > SA_RAID_MAX) + raid_level = SA_RAID_UNKNOWN; + } + kfree(buffer); + } + + device->raid_level = raid_level; +} + +static int pqi_validate_raid_map(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct raid_map *raid_map) +{ + char *err_msg; + u32 raid_map_size; + u32 r5or6_blocks_per_row; + + raid_map_size = get_unaligned_le32(&raid_map->structure_size); + + if (raid_map_size < offsetof(struct raid_map, disk_data)) { + err_msg = "RAID map too small"; + goto bad_raid_map; + } + + if (device->raid_level == SA_RAID_1) { + if (get_unaligned_le16(&raid_map->layout_map_count) != 2) { + err_msg = "invalid RAID-1 map"; + goto bad_raid_map; + } + } else if (device->raid_level == SA_RAID_TRIPLE) { + if (get_unaligned_le16(&raid_map->layout_map_count) != 3) { + err_msg = "invalid RAID-1(Triple) map"; + goto bad_raid_map; + } + } else if ((device->raid_level == SA_RAID_5 || + device->raid_level == SA_RAID_6) && + get_unaligned_le16(&raid_map->layout_map_count) > 1) { + /* RAID 50/60 */ + r5or6_blocks_per_row = + get_unaligned_le16(&raid_map->strip_size) * + get_unaligned_le16(&raid_map->data_disks_per_row); + if (r5or6_blocks_per_row == 0) { + err_msg = "invalid RAID-5 or RAID-6 map"; + goto bad_raid_map; + } + } + + return 0; + +bad_raid_map: + dev_warn(&ctrl_info->pci_dev->dev, + "logical device %08x%08x %s\n", + *((u32 *)&device->scsi3addr), + *((u32 *)&device->scsi3addr[4]), err_msg); + + return -EINVAL; +} + +static int pqi_get_raid_map(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) +{ + int rc; + u32 raid_map_size; + u32 structure_size; + struct raid_map *raid_map; + + raid_map_size = sizeof(*raid_map); + + while (1) { + raid_map = kmalloc(raid_map_size, GFP_KERNEL); + if (!raid_map) + return -ENOMEM; + + rc = pqi_send_scsi_raid_request(ctrl_info, CISS_GET_RAID_MAP, + device->scsi3addr, raid_map, raid_map_size, 0, NULL); + if (rc) + goto error; + + structure_size = get_unaligned_le32(&raid_map->structure_size); + if (structure_size <= raid_map_size) + break; + + kfree(raid_map); + raid_map_size = structure_size; + } + + rc = pqi_validate_raid_map(ctrl_info, device, raid_map); + if (rc) + goto error; + + device->raid_map = raid_map; + + return 0; + +error: + kfree(raid_map); + + return rc; +} + +static void pqi_set_max_transfer_encrypted(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + if (!ctrl_info->lv_drive_type_mix_valid) { + device->max_transfer_encrypted = ~0; + return; + } + + switch (LV_GET_DRIVE_TYPE_MIX(device->scsi3addr)) { + case LV_DRIVE_TYPE_MIX_SAS_HDD_ONLY: + case LV_DRIVE_TYPE_MIX_SATA_HDD_ONLY: + case LV_DRIVE_TYPE_MIX_SAS_OR_SATA_SSD_ONLY: + case LV_DRIVE_TYPE_MIX_SAS_SSD_ONLY: + case LV_DRIVE_TYPE_MIX_SATA_SSD_ONLY: + case LV_DRIVE_TYPE_MIX_SAS_ONLY: + case LV_DRIVE_TYPE_MIX_SATA_ONLY: + device->max_transfer_encrypted = + ctrl_info->max_transfer_encrypted_sas_sata; + break; + case LV_DRIVE_TYPE_MIX_NVME_ONLY: + device->max_transfer_encrypted = + ctrl_info->max_transfer_encrypted_nvme; + break; + case LV_DRIVE_TYPE_MIX_UNKNOWN: + case 
LV_DRIVE_TYPE_MIX_NO_RESTRICTION: + default: + device->max_transfer_encrypted = + min(ctrl_info->max_transfer_encrypted_sas_sata, + ctrl_info->max_transfer_encrypted_nvme); + break; + } +} + +static void pqi_get_raid_bypass_status(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + u8 *buffer; + u8 bypass_status; + + buffer = kmalloc(64, GFP_KERNEL); + if (!buffer) + return; + + rc = pqi_scsi_inquiry(ctrl_info, device->scsi3addr, + VPD_PAGE | CISS_VPD_LV_BYPASS_STATUS, buffer, 64); + if (rc) + goto out; + +#define RAID_BYPASS_STATUS 4 +#define RAID_BYPASS_CONFIGURED 0x1 +#define RAID_BYPASS_ENABLED 0x2 + + bypass_status = buffer[RAID_BYPASS_STATUS]; + device->raid_bypass_configured = + (bypass_status & RAID_BYPASS_CONFIGURED) != 0; + if (device->raid_bypass_configured && + (bypass_status & RAID_BYPASS_ENABLED) && + pqi_get_raid_map(ctrl_info, device) == 0) { + device->raid_bypass_enabled = true; + if (get_unaligned_le16(&device->raid_map->flags) & + RAID_MAP_ENCRYPTION_ENABLED) + pqi_set_max_transfer_encrypted(ctrl_info, device); + } + +out: + kfree(buffer); +} + +/* + * Use vendor-specific VPD to determine online/offline status of a volume. + */ + +static void pqi_get_volume_status(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + size_t page_length; + u8 volume_status = CISS_LV_STATUS_UNAVAILABLE; + bool volume_offline = true; + u32 volume_flags; + struct ciss_vpd_logical_volume_status *vpd; + + vpd = kmalloc(sizeof(*vpd), GFP_KERNEL); + if (!vpd) + goto no_buffer; + + rc = pqi_scsi_inquiry(ctrl_info, device->scsi3addr, + VPD_PAGE | CISS_VPD_LV_STATUS, vpd, sizeof(*vpd)); + if (rc) + goto out; + + if (vpd->page_code != CISS_VPD_LV_STATUS) + goto out; + + page_length = offsetof(struct ciss_vpd_logical_volume_status, + volume_status) + vpd->page_length; + if (page_length < sizeof(*vpd)) + goto out; + + volume_status = vpd->volume_status; + volume_flags = get_unaligned_be32(&vpd->flags); + volume_offline = (volume_flags & CISS_LV_FLAGS_NO_HOST_IO) != 0; + +out: + kfree(vpd); +no_buffer: + device->volume_status = volume_status; + device->volume_offline = volume_offline; +} + +#define PQI_DEVICE_NCQ_PRIO_SUPPORTED 0x01 +#define PQI_DEVICE_PHY_MAP_SUPPORTED 0x10 +#define PQI_DEVICE_ERASE_IN_PROGRESS 0x10 + +static int pqi_get_physical_device_info(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, + struct bmic_identify_physical_device *id_phys) +{ + int rc; + + memset(id_phys, 0, sizeof(*id_phys)); + + rc = pqi_identify_physical_device(ctrl_info, device, + id_phys, sizeof(*id_phys)); + if (rc) { + device->queue_depth = PQI_PHYSICAL_DISK_DEFAULT_MAX_QUEUE_DEPTH; + return rc; + } + + scsi_sanitize_inquiry_string(&id_phys->model[0], 8); + scsi_sanitize_inquiry_string(&id_phys->model[8], 16); + + memcpy(device->vendor, &id_phys->model[0], sizeof(device->vendor)); + memcpy(device->model, &id_phys->model[8], sizeof(device->model)); + + device->box_index = id_phys->box_index; + device->phys_box_on_bus = id_phys->phys_box_on_bus; + device->phy_connected_dev_type = id_phys->phy_connected_dev_type[0]; + device->queue_depth = + get_unaligned_le16(&id_phys->current_queue_depth_limit); + device->active_path_index = id_phys->active_path_number; + device->path_map = id_phys->redundant_path_present_map; + memcpy(&device->box, + &id_phys->alternate_paths_phys_box_on_port, + sizeof(device->box)); + memcpy(&device->phys_connector, + &id_phys->alternate_paths_phys_connector, + sizeof(device->phys_connector)); + device->bay = 
id_phys->phys_bay_in_box; + device->lun_count = id_phys->multi_lun_device_lun_count; + if ((id_phys->even_more_flags & PQI_DEVICE_PHY_MAP_SUPPORTED) && + id_phys->phy_count) + device->phy_id = + id_phys->phy_to_phy_map[device->active_path_index]; + else + device->phy_id = 0xFF; + + device->ncq_prio_support = + ((get_unaligned_le32(&id_phys->misc_drive_flags) >> 16) & + PQI_DEVICE_NCQ_PRIO_SUPPORTED); + + device->erase_in_progress = !!(get_unaligned_le16(&id_phys->extra_physical_drive_flags) & PQI_DEVICE_ERASE_IN_PROGRESS); + + return 0; +} + +static int pqi_get_logical_device_info(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + u8 *buffer; + + buffer = kmalloc(64, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + /* Send an inquiry to the device to see what it is. */ + rc = pqi_scsi_inquiry(ctrl_info, device->scsi3addr, 0, buffer, 64); + if (rc) + goto out; + + scsi_sanitize_inquiry_string(&buffer[8], 8); + scsi_sanitize_inquiry_string(&buffer[16], 16); + + device->devtype = buffer[0] & 0x1f; + memcpy(device->vendor, &buffer[8], sizeof(device->vendor)); + memcpy(device->model, &buffer[16], sizeof(device->model)); + + if (device->devtype == TYPE_DISK) { + if (device->is_external_raid_device) { + device->raid_level = SA_RAID_UNKNOWN; + device->volume_status = CISS_LV_OK; + device->volume_offline = false; + } else { + pqi_get_raid_level(ctrl_info, device); + pqi_get_raid_bypass_status(ctrl_info, device); + pqi_get_volume_status(ctrl_info, device); + } + } + +out: + kfree(buffer); + + return rc; +} + +/* + * Prevent adding drive to OS for some corner cases such as a drive + * undergoing a sanitize (erase) operation. Some OSes will continue to poll + * the drive until the sanitize completes, which can take hours, + * resulting in long bootup delays. Commands such as TUR, READ_CAP + * are allowed, but READ/WRITE cause check condition. So the OS + * cannot check/read the partition table. + * Note: devices that have completed sanitize must be re-enabled + * using the management utility. 
+ */ +static inline bool pqi_keep_device_offline(struct pqi_scsi_dev *device) +{ + return device->erase_in_progress; +} + +static int pqi_get_device_info_phys_logical(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, + struct bmic_identify_physical_device *id_phys) +{ + int rc; + + if (device->is_expander_smp_device) + return 0; + + if (pqi_is_logical_device(device)) + rc = pqi_get_logical_device_info(ctrl_info, device); + else + rc = pqi_get_physical_device_info(ctrl_info, device, id_phys); + + return rc; +} + +static int pqi_get_device_info(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, + struct bmic_identify_physical_device *id_phys) +{ + int rc; + + rc = pqi_get_device_info_phys_logical(ctrl_info, device, id_phys); + + if (rc == 0 && device->lun_count == 0) + device->lun_count = 1; + + return rc; +} + +static void pqi_show_volume_status(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + char *status; + static const char unknown_state_str[] = + "Volume is in an unknown state (%u)"; + char unknown_state_buffer[sizeof(unknown_state_str) + 10]; + + switch (device->volume_status) { + case CISS_LV_OK: + status = "Volume online"; + break; + case CISS_LV_FAILED: + status = "Volume failed"; + break; + case CISS_LV_NOT_CONFIGURED: + status = "Volume not configured"; + break; + case CISS_LV_DEGRADED: + status = "Volume degraded"; + break; + case CISS_LV_READY_FOR_RECOVERY: + status = "Volume ready for recovery operation"; + break; + case CISS_LV_UNDERGOING_RECOVERY: + status = "Volume undergoing recovery"; + break; + case CISS_LV_WRONG_PHYSICAL_DRIVE_REPLACED: + status = "Wrong physical drive was replaced"; + break; + case CISS_LV_PHYSICAL_DRIVE_CONNECTION_PROBLEM: + status = "A physical drive not properly connected"; + break; + case CISS_LV_HARDWARE_OVERHEATING: + status = "Hardware is overheating"; + break; + case CISS_LV_HARDWARE_HAS_OVERHEATED: + status = "Hardware has overheated"; + break; + case CISS_LV_UNDERGOING_EXPANSION: + status = "Volume undergoing expansion"; + break; + case CISS_LV_NOT_AVAILABLE: + status = "Volume waiting for transforming volume"; + break; + case CISS_LV_QUEUED_FOR_EXPANSION: + status = "Volume queued for expansion"; + break; + case CISS_LV_DISABLED_SCSI_ID_CONFLICT: + status = "Volume disabled due to SCSI ID conflict"; + break; + case CISS_LV_EJECTED: + status = "Volume has been ejected"; + break; + case CISS_LV_UNDERGOING_ERASE: + status = "Volume undergoing background erase"; + break; + case CISS_LV_READY_FOR_PREDICTIVE_SPARE_REBUILD: + status = "Volume ready for predictive spare rebuild"; + break; + case CISS_LV_UNDERGOING_RPI: + status = "Volume undergoing rapid parity initialization"; + break; + case CISS_LV_PENDING_RPI: + status = "Volume queued for rapid parity initialization"; + break; + case CISS_LV_ENCRYPTED_NO_KEY: + status = "Encrypted volume inaccessible - key not present"; + break; + case CISS_LV_UNDERGOING_ENCRYPTION: + status = "Volume undergoing encryption process"; + break; + case CISS_LV_UNDERGOING_ENCRYPTION_REKEYING: + status = "Volume undergoing encryption re-keying process"; + break; + case CISS_LV_ENCRYPTED_IN_NON_ENCRYPTED_CONTROLLER: + status = "Volume encrypted but encryption is disabled"; + break; + case CISS_LV_PENDING_ENCRYPTION: + status = "Volume pending migration to encrypted state"; + break; + case CISS_LV_PENDING_ENCRYPTION_REKEYING: + status = "Volume pending encryption rekeying"; + break; + case CISS_LV_NOT_SUPPORTED: + status = "Volume not supported on this controller"; + break; 
+ case CISS_LV_STATUS_UNAVAILABLE: + status = "Volume status not available"; + break; + default: + scnprintf(unknown_state_buffer, sizeof(unknown_state_buffer), + unknown_state_str, device->volume_status); + status = unknown_state_buffer; + break; + } + + dev_info(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d %s\n", + ctrl_info->scsi_host->host_no, + device->bus, device->target, device->lun, status); +} + +static void pqi_rescan_worker(struct work_struct *work) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = container_of(to_delayed_work(work), struct pqi_ctrl_info, + rescan_work); + + pqi_scan_scsi_devices(ctrl_info); +} + +static int pqi_add_device(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + + if (pqi_is_logical_device(device)) + rc = scsi_add_device(ctrl_info->scsi_host, device->bus, + device->target, device->lun); + else + rc = pqi_add_sas_device(ctrl_info->sas_host, device); + + return rc; +} + +#define PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS (20 * 1000) + +static inline void pqi_remove_device(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) +{ + int rc; + int lun; + + for (lun = 0; lun < device->lun_count; lun++) { + rc = pqi_device_wait_for_pending_io(ctrl_info, device, lun, + PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS); + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d removing device with %d outstanding command(s)\n", + ctrl_info->scsi_host->host_no, device->bus, + device->target, lun, + atomic_read(&device->scsi_cmds_outstanding[lun])); + } + + if (pqi_is_logical_device(device)) + scsi_remove_device(device->sdev); + else + pqi_remove_sas_device(device); + + pqi_device_remove_start(device); +} + +/* Assumes the SCSI device list lock is held. */ + +static struct pqi_scsi_dev *pqi_find_scsi_dev(struct pqi_ctrl_info *ctrl_info, + int bus, int target, int lun) +{ + struct pqi_scsi_dev *device; + + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) + if (device->bus == bus && device->target == target && device->lun == lun) + return device; + + return NULL; +} + +static inline bool pqi_device_equal(struct pqi_scsi_dev *dev1, struct pqi_scsi_dev *dev2) +{ + if (dev1->is_physical_device != dev2->is_physical_device) + return false; + + if (dev1->is_physical_device) + return memcmp(dev1->wwid, dev2->wwid, sizeof(dev1->wwid)) == 0; + + return memcmp(dev1->volume_id, dev2->volume_id, sizeof(dev1->volume_id)) == 0; +} + +enum pqi_find_result { + DEVICE_NOT_FOUND, + DEVICE_CHANGED, + DEVICE_SAME, +}; + +static enum pqi_find_result pqi_scsi_find_entry(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device_to_find, struct pqi_scsi_dev **matching_device) +{ + struct pqi_scsi_dev *device; + + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) { + if (pqi_scsi3addr_equal(device_to_find->scsi3addr, device->scsi3addr)) { + *matching_device = device; + if (pqi_device_equal(device_to_find, device)) { + if (device_to_find->volume_offline) + return DEVICE_CHANGED; + return DEVICE_SAME; + } + return DEVICE_CHANGED; + } + } + + return DEVICE_NOT_FOUND; +} + +static inline const char *pqi_device_type(struct pqi_scsi_dev *device) +{ + if (device->is_expander_smp_device) + return "Enclosure SMP "; + + return scsi_device_type(device->devtype); +} + +#define PQI_DEV_INFO_BUFFER_LENGTH 128 + +static void pqi_dev_info(struct pqi_ctrl_info *ctrl_info, + char *action, struct pqi_scsi_dev *device) +{ + ssize_t count; + char buffer[PQI_DEV_INFO_BUFFER_LENGTH]; + + count = 
scnprintf(buffer, PQI_DEV_INFO_BUFFER_LENGTH, + "%d:%d:", ctrl_info->scsi_host->host_no, device->bus); + + if (device->target_lun_valid) + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + "%d:%d", + device->target, + device->lun); + else + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + "-:-"); + + if (pqi_is_logical_device(device)) + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + " %08x%08x", + *((u32 *)&device->scsi3addr), + *((u32 *)&device->scsi3addr[4])); + else + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + " %016llx%016llx", + get_unaligned_be64(&device->wwid[0]), + get_unaligned_be64(&device->wwid[8])); + + count += scnprintf(buffer + count, PQI_DEV_INFO_BUFFER_LENGTH - count, + " %s %.8s %.16s ", + pqi_device_type(device), + device->vendor, + device->model); + + if (pqi_is_logical_device(device)) { + if (device->devtype == TYPE_DISK) + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + "SSDSmartPathCap%c En%c %-12s", + device->raid_bypass_configured ? '+' : '-', + device->raid_bypass_enabled ? '+' : '-', + pqi_raid_level_to_string(device->raid_level)); + } else { + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + "AIO%c", device->aio_enabled ? '+' : '-'); + if (device->devtype == TYPE_DISK || + device->devtype == TYPE_ZBC) + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + " qd=%-6d", device->queue_depth); + } + + dev_info(&ctrl_info->pci_dev->dev, "%s %s\n", action, buffer); +} + +static bool pqi_raid_maps_equal(struct raid_map *raid_map1, struct raid_map *raid_map2) +{ + u32 raid_map1_size; + u32 raid_map2_size; + + if (raid_map1 == NULL || raid_map2 == NULL) + return raid_map1 == raid_map2; + + raid_map1_size = get_unaligned_le32(&raid_map1->structure_size); + raid_map2_size = get_unaligned_le32(&raid_map2->structure_size); + + if (raid_map1_size != raid_map2_size) + return false; + + return memcmp(raid_map1, raid_map2, raid_map1_size) == 0; +} + +/* Assumes the SCSI device list lock is held. */ + +static void pqi_scsi_update_device(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *existing_device, struct pqi_scsi_dev *new_device) +{ + existing_device->device_type = new_device->device_type; + existing_device->bus = new_device->bus; + if (new_device->target_lun_valid) { + existing_device->target = new_device->target; + existing_device->lun = new_device->lun; + existing_device->target_lun_valid = true; + } + + /* By definition, the scsi3addr and wwid fields are already the same. 
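+ * (This path is only taken for the DEVICE_SAME case, where
+ * pqi_scsi_find_entry() matched on scsi3addr and pqi_device_equal()
+ * matched on wwid/volume_id, so neither field needs to be copied here.)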
*/ + + existing_device->is_physical_device = new_device->is_physical_device; + memcpy(existing_device->vendor, new_device->vendor, sizeof(existing_device->vendor)); + memcpy(existing_device->model, new_device->model, sizeof(existing_device->model)); + existing_device->sas_address = new_device->sas_address; + existing_device->queue_depth = new_device->queue_depth; + existing_device->device_offline = false; + existing_device->lun_count = new_device->lun_count; + + if (pqi_is_logical_device(existing_device)) { + existing_device->is_external_raid_device = new_device->is_external_raid_device; + + if (existing_device->devtype == TYPE_DISK) { + existing_device->raid_level = new_device->raid_level; + existing_device->volume_status = new_device->volume_status; + if (ctrl_info->logical_volume_rescan_needed) + existing_device->rescan = true; + memset(existing_device->next_bypass_group, 0, sizeof(existing_device->next_bypass_group)); + if (!pqi_raid_maps_equal(existing_device->raid_map, new_device->raid_map)) { + kfree(existing_device->raid_map); + existing_device->raid_map = new_device->raid_map; + /* To prevent this from being freed later. */ + new_device->raid_map = NULL; + } + existing_device->raid_bypass_configured = new_device->raid_bypass_configured; + existing_device->raid_bypass_enabled = new_device->raid_bypass_enabled; + } + } else { + existing_device->aio_enabled = new_device->aio_enabled; + existing_device->aio_handle = new_device->aio_handle; + existing_device->is_expander_smp_device = new_device->is_expander_smp_device; + existing_device->active_path_index = new_device->active_path_index; + existing_device->phy_id = new_device->phy_id; + existing_device->path_map = new_device->path_map; + existing_device->bay = new_device->bay; + existing_device->box_index = new_device->box_index; + existing_device->phys_box_on_bus = new_device->phys_box_on_bus; + existing_device->phy_connected_dev_type = new_device->phy_connected_dev_type; + memcpy(existing_device->box, new_device->box, sizeof(existing_device->box)); + memcpy(existing_device->phys_connector, new_device->phys_connector, sizeof(existing_device->phys_connector)); + } +} + +static inline void pqi_free_device(struct pqi_scsi_dev *device) +{ + if (device) { + kfree(device->raid_map); + kfree(device); + } +} + +/* + * Called when exposing a new device to the OS fails in order to re-adjust + * our internal SCSI device list to match the SCSI ML's view. + */ + +static inline void pqi_fixup_botched_add(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + unsigned long flags; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + list_del(&device->scsi_device_list_entry); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + /* Allow the device structure to be freed later. */ + device->keep_device = false; +} + +static inline bool pqi_is_device_added(struct pqi_scsi_dev *device) +{ + if (device->is_expander_smp_device) + return device->sas_port != NULL; + + return device->sdev != NULL; +} + +static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *new_device_list[], unsigned int num_new_devices) +{ + int rc; + unsigned int i; + unsigned long flags; + enum pqi_find_result find_result; + struct pqi_scsi_dev *device; + struct pqi_scsi_dev *next; + struct pqi_scsi_dev *matching_device; + LIST_HEAD(add_list); + LIST_HEAD(delete_list); + + /* + * The idea here is to do as little work as possible while holding the + * spinlock. 
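+ * The list lock is taken with spin_lock_irqsave(), so nothing that can
+ * sleep (e.g. scsi_add_device() or scsi_remove_device()) may run under
+ * it.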
That's why we go to great pains to defer anything other + * than updating the internal device list until after we release the + * spinlock. + */ + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + /* Assume that all devices in the existing list have gone away. */ + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) + device->device_gone = true; + + for (i = 0; i < num_new_devices; i++) { + device = new_device_list[i]; + + find_result = pqi_scsi_find_entry(ctrl_info, device, + &matching_device); + + switch (find_result) { + case DEVICE_SAME: + /* + * The newly found device is already in the existing + * device list. + */ + device->new_device = false; + matching_device->device_gone = false; + pqi_scsi_update_device(ctrl_info, matching_device, device); + break; + case DEVICE_NOT_FOUND: + /* + * The newly found device is NOT in the existing device + * list. + */ + device->new_device = true; + break; + case DEVICE_CHANGED: + /* + * The original device has gone away and we need to add + * the new device. + */ + device->new_device = true; + break; + default: + BUG(); + break; + } + } + + /* Process all devices that have gone away. */ + list_for_each_entry_safe(device, next, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (device->device_gone) { + list_del(&device->scsi_device_list_entry); + list_add_tail(&device->delete_list_entry, &delete_list); + } + } + + /* Process all new devices. */ + for (i = 0; i < num_new_devices; i++) { + device = new_device_list[i]; + if (!device->new_device) + continue; + if (device->volume_offline) + continue; + list_add_tail(&device->scsi_device_list_entry, + &ctrl_info->scsi_device_list); + list_add_tail(&device->add_list_entry, &add_list); + /* To prevent this device structure from being freed later. */ + device->keep_device = true; + } + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + /* + * If OFA is in progress and there are devices that need to be deleted, + * allow any pending reset operations to continue and unblock any SCSI + * requests before removal. + */ + if (pqi_ofa_in_progress(ctrl_info)) { + list_for_each_entry_safe(device, next, &delete_list, delete_list_entry) + if (pqi_is_device_added(device)) + pqi_device_remove_start(device); + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + } + + /* Remove all devices that have gone away. */ + list_for_each_entry_safe(device, next, &delete_list, delete_list_entry) { + if (device->volume_offline) { + pqi_dev_info(ctrl_info, "offline", device); + pqi_show_volume_status(ctrl_info, device); + } else { + pqi_dev_info(ctrl_info, "removed", device); + } + if (pqi_is_device_added(device)) + pqi_remove_device(ctrl_info, device); + list_del(&device->delete_list_entry); + pqi_free_device(device); + } + + /* + * Notify the SML of any existing device changes such as; + * queue depth, device size. + */ + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) { + if (device->sdev && device->queue_depth != device->advertised_queue_depth) { + device->advertised_queue_depth = device->queue_depth; + scsi_change_queue_depth(device->sdev, device->advertised_queue_depth); + } + if (device->rescan) { + scsi_rescan_device(&device->sdev->sdev_gendev); + device->rescan = false; + } + } + + /* Expose any new devices. 
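+ * Each entry on add_list is registered with the SCSI midlayer (logical
+ * devices) or the SAS transport (physical devices) via pqi_add_device();
+ * if that fails, pqi_fixup_botched_add() removes the entry from our
+ * internal list again so the two views stay consistent.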
*/ + list_for_each_entry_safe(device, next, &add_list, add_list_entry) { + if (!pqi_is_device_added(device)) { + rc = pqi_add_device(ctrl_info, device); + if (rc == 0) { + pqi_dev_info(ctrl_info, "added", device); + } else { + dev_warn(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d addition failed, device not added\n", + ctrl_info->scsi_host->host_no, + device->bus, device->target, + device->lun); + pqi_fixup_botched_add(ctrl_info, device); + } + } + } + + ctrl_info->logical_volume_rescan_needed = false; + +} + +static inline bool pqi_is_supported_device(struct pqi_scsi_dev *device) +{ + /* + * Only support the HBA controller itself as a RAID + * controller. If it's a RAID controller other than + * the HBA itself (an external RAID controller, for + * example), we don't support it. + */ + if (device->device_type == SA_DEVICE_TYPE_CONTROLLER && + !pqi_is_hba_lunid(device->scsi3addr)) + return false; + + return true; +} + +static inline bool pqi_skip_device(u8 *scsi3addr) +{ + /* Ignore all masked devices. */ + if (MASKED_DEVICE(scsi3addr)) + return true; + + return false; +} + +static inline void pqi_mask_device(u8 *scsi3addr) +{ + scsi3addr[3] |= 0xc0; +} + +static inline bool pqi_is_multipath_device(struct pqi_scsi_dev *device) +{ + if (pqi_is_logical_device(device)) + return false; + + return (device->path_map & (device->path_map - 1)) != 0; +} + +static inline bool pqi_expose_device(struct pqi_scsi_dev *device) +{ + return !device->is_physical_device || !pqi_skip_device(device->scsi3addr); +} + +static int pqi_update_scsi_devices(struct pqi_ctrl_info *ctrl_info) +{ + int i; + int rc; + LIST_HEAD(new_device_list_head); + struct report_phys_lun_16byte_wwid_list *physdev_list = NULL; + struct report_log_lun_list *logdev_list = NULL; + struct report_phys_lun_16byte_wwid *phys_lun; + struct report_log_lun *log_lun; + struct bmic_identify_physical_device *id_phys = NULL; + u32 num_physicals; + u32 num_logicals; + struct pqi_scsi_dev **new_device_list = NULL; + struct pqi_scsi_dev *device; + struct pqi_scsi_dev *next; + unsigned int num_new_devices; + unsigned int num_valid_devices; + bool is_physical_device; + u8 *scsi3addr; + unsigned int physical_index; + unsigned int logical_index; + static char *out_of_memory_msg = + "failed to allocate memory, device discovery stopped"; + + rc = pqi_get_device_lists(ctrl_info, &physdev_list, &logdev_list); + if (rc) + goto out; + + if (physdev_list) + num_physicals = + get_unaligned_be32(&physdev_list->header.list_length) + / sizeof(physdev_list->lun_entries[0]); + else + num_physicals = 0; + + if (logdev_list) + num_logicals = + get_unaligned_be32(&logdev_list->header.list_length) + / sizeof(logdev_list->lun_entries[0]); + else + num_logicals = 0; + + if (num_physicals) { + /* + * We need this buffer for calls to pqi_get_physical_disk_info() + * below. We allocate it here instead of inside + * pqi_get_physical_disk_info() because it's a fairly large + * buffer. 
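+ * One allocation is made here and the same buffer is then reused for
+ * every physical device in the discovery loop, instead of allocating
+ * once per device.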
+ */ + id_phys = kmalloc(sizeof(*id_phys), GFP_KERNEL); + if (!id_phys) { + dev_warn(&ctrl_info->pci_dev->dev, "%s\n", + out_of_memory_msg); + rc = -ENOMEM; + goto out; + } + + if (pqi_hide_vsep) { + for (i = num_physicals - 1; i >= 0; i--) { + phys_lun = &physdev_list->lun_entries[i]; + if (CISS_GET_DRIVE_NUMBER(phys_lun->lunid) == PQI_VSEP_CISS_BTL) { + pqi_mask_device(phys_lun->lunid); + break; + } + } + } + } + + if (num_logicals && + (logdev_list->header.flags & CISS_REPORT_LOG_FLAG_DRIVE_TYPE_MIX)) + ctrl_info->lv_drive_type_mix_valid = true; + + num_new_devices = num_physicals + num_logicals; + + new_device_list = kmalloc(sizeof(*new_device_list) * + num_new_devices, GFP_KERNEL); + if (!new_device_list) { + dev_warn(&ctrl_info->pci_dev->dev, "%s\n", out_of_memory_msg); + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < num_new_devices; i++) { + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + dev_warn(&ctrl_info->pci_dev->dev, "%s\n", + out_of_memory_msg); + rc = -ENOMEM; + goto out; + } + list_add_tail(&device->new_device_list_entry, + &new_device_list_head); + } + + device = NULL; + num_valid_devices = 0; + physical_index = 0; + logical_index = 0; + + for (i = 0; i < num_new_devices; i++) { + + if ((!pqi_expose_ld_first && i < num_physicals) || + (pqi_expose_ld_first && i >= num_logicals)) { + is_physical_device = true; + phys_lun = &physdev_list->lun_entries[physical_index++]; + log_lun = NULL; + scsi3addr = phys_lun->lunid; + } else { + is_physical_device = false; + phys_lun = NULL; + log_lun = &logdev_list->lun_entries[logical_index++]; + scsi3addr = log_lun->lunid; + } + + if (is_physical_device && pqi_skip_device(scsi3addr)) + continue; + + if (device) + device = list_next_entry(device, new_device_list_entry); + else + device = list_first_entry(&new_device_list_head, + struct pqi_scsi_dev, new_device_list_entry); + + memcpy(device->scsi3addr, scsi3addr, sizeof(device->scsi3addr)); + device->is_physical_device = is_physical_device; + if (is_physical_device) { + device->device_type = phys_lun->device_type; + if (device->device_type == SA_DEVICE_TYPE_EXPANDER_SMP) + device->is_expander_smp_device = true; + } else { + device->is_external_raid_device = + pqi_is_external_raid_addr(scsi3addr); + } + + if (!pqi_is_supported_device(device)) + continue; + + /* Gather information about the device. */ + rc = pqi_get_device_info(ctrl_info, device, id_phys); + if (rc == -ENOMEM) { + dev_warn(&ctrl_info->pci_dev->dev, "%s\n", + out_of_memory_msg); + goto out; + } + if (rc) { + if (device->is_physical_device) + dev_warn(&ctrl_info->pci_dev->dev, + "obtaining device info failed, skipping physical device %016llx%016llx\n", + get_unaligned_be64(&phys_lun->wwid[0]), + get_unaligned_be64(&phys_lun->wwid[8])); + else + dev_warn(&ctrl_info->pci_dev->dev, + "obtaining device info failed, skipping logical device %08x%08x\n", + *((u32 *)&device->scsi3addr), + *((u32 *)&device->scsi3addr[4])); + rc = 0; + continue; + } + + /* Do not present disks that the OS cannot fully probe. 
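+ * Currently this only applies while the drive reports an erase
+ * operation in progress (see pqi_keep_device_offline()).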
*/ + if (pqi_keep_device_offline(device)) + continue; + + pqi_assign_bus_target_lun(device); + + if (device->is_physical_device) { + memcpy(device->wwid, phys_lun->wwid, sizeof(device->wwid)); + if ((phys_lun->device_flags & + CISS_REPORT_PHYS_DEV_FLAG_AIO_ENABLED) && + phys_lun->aio_handle) { + device->aio_enabled = true; + device->aio_handle = + phys_lun->aio_handle; + } + } else { + memcpy(device->volume_id, log_lun->volume_id, + sizeof(device->volume_id)); + } + + device->sas_address = get_unaligned_be64(&device->wwid[0]); + + new_device_list[num_valid_devices++] = device; + } + + pqi_update_device_list(ctrl_info, new_device_list, num_valid_devices); + +out: + list_for_each_entry_safe(device, next, &new_device_list_head, + new_device_list_entry) { + if (device->keep_device) + continue; + list_del(&device->new_device_list_entry); + pqi_free_device(device); + } + + kfree(new_device_list); + kfree(physdev_list); + kfree(logdev_list); + kfree(id_phys); + + return rc; +} + +#if TORTUGA + +static int pqi_add_controller(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + unsigned long flags; + struct pqi_scsi_dev *device; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate memory for controller device\n"); + return -ENOMEM; + } + + device->devtype = TYPE_RAID; + pqi_assign_bus_target_lun(device); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + list_add_tail(&device->scsi_device_list_entry, &ctrl_info->scsi_device_list); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + rc = pqi_add_device(ctrl_info, device); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d addition failed, device not added\n", + ctrl_info->scsi_host->host_no, + device->bus, device->target, + device->lun); + goto error; + } + + return 0; + +error: + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + list_del(&device->scsi_device_list_entry); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + kfree(device); + + return rc; +} + +static int pqi_scan_scsi_devices(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + + if (list_empty(&ctrl_info->scsi_device_list)) + rc = pqi_add_controller(ctrl_info); + else + rc = 0; + + return rc; +} + +#else + +static int pqi_scan_scsi_devices(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + int mutex_acquired; + + if (pqi_ctrl_offline(ctrl_info)) + return -ENXIO; + + mutex_acquired = mutex_trylock(&ctrl_info->scan_mutex); + + if (!mutex_acquired) { + if (pqi_ctrl_scan_blocked(ctrl_info)) + return -EBUSY; + pqi_schedule_rescan_worker_delayed(ctrl_info); + return -EINPROGRESS; + } + + rc = pqi_update_scsi_devices(ctrl_info); + if (rc && !pqi_ctrl_scan_blocked(ctrl_info)) + pqi_schedule_rescan_worker_delayed(ctrl_info); + + mutex_unlock(&ctrl_info->scan_mutex); + + return rc; +} + +#endif + +static void pqi_scan_start(struct Scsi_Host *shost) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = shost_to_hba(shost); + + pqi_scan_scsi_devices(ctrl_info); +} + +/* Returns TRUE if scan is finished. */ + +static int pqi_scan_finished(struct Scsi_Host *shost, + unsigned long elapsed_time) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = shost_priv(shost); + + return !mutex_is_locked(&ctrl_info->scan_mutex); +} + +static inline void pqi_set_encryption_info(struct pqi_encryption_info *encryption_info, + struct raid_map *raid_map, u64 first_block) +{ + u32 volume_blk_size; + + /* + * Set the encryption tweak values based on logical block address. 
+ * If the block size is 512, the tweak value is equal to the LBA. + * For other block sizes, tweak value is (LBA * block size) / 512. + */ + volume_blk_size = get_unaligned_le32(&raid_map->volume_blk_size); + if (volume_blk_size != 512) + first_block = (first_block * volume_blk_size) / 512; + + encryption_info->data_encryption_key_index = + get_unaligned_le16(&raid_map->data_encryption_key_index); + encryption_info->encrypt_tweak_lower = lower_32_bits(first_block); + encryption_info->encrypt_tweak_upper = upper_32_bits(first_block); +} + +/* + * Attempt to perform RAID bypass mapping for a logical volume I/O. + */ + +static bool pqi_aio_raid_level_supported(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + bool is_supported = true; + + switch (rmd->raid_level) { + case SA_RAID_0: + break; + case SA_RAID_1: + if (rmd->is_write && (!ctrl_info->enable_r1_writes || + rmd->data_length > ctrl_info->max_write_raid_1_10_2drive)) + is_supported = false; + break; + case SA_RAID_TRIPLE: + if (rmd->is_write && (!ctrl_info->enable_r1_writes || + rmd->data_length > ctrl_info->max_write_raid_1_10_3drive)) + is_supported = false; + break; + case SA_RAID_5: + if (rmd->is_write && (!ctrl_info->enable_r5_writes || + rmd->data_length > ctrl_info->max_write_raid_5_6)) + is_supported = false; + break; + case SA_RAID_6: + if (rmd->is_write && (!ctrl_info->enable_r6_writes || + rmd->data_length > ctrl_info->max_write_raid_5_6)) + is_supported = false; + break; + default: + is_supported = false; + break; + } + + return is_supported; +} + +#define PQI_RAID_BYPASS_INELIGIBLE 1 + +static int pqi_get_aio_lba_and_block_count(struct scsi_cmnd *scmd, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + /* Check for valid opcode, get LBA and block count. */ + switch (scmd->cmnd[0]) { + case WRITE_6: + rmd->is_write = true; + /* fall through */ + case READ_6: + rmd->first_block = (u64)(((scmd->cmnd[1] & 0x1f) << 16) | + (scmd->cmnd[2] << 8) | scmd->cmnd[3]); + rmd->block_cnt = (u32)scmd->cmnd[4]; + if (rmd->block_cnt == 0) + rmd->block_cnt = 256; + break; + case WRITE_10: + rmd->is_write = true; + /* fall through */ + case READ_10: + rmd->first_block = (u64)get_unaligned_be32(&scmd->cmnd[2]); + rmd->block_cnt = (u32)get_unaligned_be16(&scmd->cmnd[7]); + break; + case WRITE_12: + rmd->is_write = true; + /* fall through */ + case READ_12: + rmd->first_block = (u64)get_unaligned_be32(&scmd->cmnd[2]); + rmd->block_cnt = get_unaligned_be32(&scmd->cmnd[6]); + break; + case WRITE_16: + rmd->is_write = true; + /* fall through */ + case READ_16: + rmd->first_block = get_unaligned_be64(&scmd->cmnd[2]); + rmd->block_cnt = get_unaligned_be32(&scmd->cmnd[10]); + break; + default: + /* Process via normal I/O path. */ + return PQI_RAID_BYPASS_INELIGIBLE; + } + + put_unaligned_le32(scsi_bufflen(scmd), &rmd->data_length); + + return 0; +} + +static int pci_get_aio_common_raid_map_values(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev_raid_map_data *rmd, struct raid_map *raid_map) +{ + rmd->last_block = rmd->first_block + rmd->block_cnt - 1; + + /* Check for invalid block or wraparound. */ + if (rmd->last_block >= + get_unaligned_le64(&raid_map->volume_blk_cnt) || + rmd->last_block < rmd->first_block) + return PQI_RAID_BYPASS_INELIGIBLE; + + rmd->data_disks_per_row = + get_unaligned_le16(&raid_map->data_disks_per_row); + rmd->strip_size = get_unaligned_le16(&raid_map->strip_size); + rmd->layout_map_count = get_unaligned_le16(&raid_map->layout_map_count); + + /* Calculate stripe information for the request. 
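+ * For example (hypothetical geometry): with strip_size = 128 blocks and
+ * data_disks_per_row = 4, blocks_per_row = 512.  A request for blocks
+ * 520-527 then maps to first_row = last_row = 1, first_row_offset = 8,
+ * last_row_offset = 15 and first_column = last_column = 0, so it stays
+ * within a single strip and remains bypass-eligible.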
*/ + rmd->blocks_per_row = rmd->data_disks_per_row * rmd->strip_size; + rmd->first_row = rmd->first_block / rmd->blocks_per_row; + rmd->last_row = rmd->last_block / rmd->blocks_per_row; + rmd->first_row_offset = (u32)(rmd->first_block - + (rmd->first_row * rmd->blocks_per_row)); + rmd->last_row_offset = (u32)(rmd->last_block - (rmd->last_row * + rmd->blocks_per_row)); + rmd->first_column = rmd->first_row_offset / rmd->strip_size; + rmd->last_column = rmd->last_row_offset / rmd->strip_size; + + /* If this isn't a single row/column then give to the controller. */ + if (rmd->first_row != rmd->last_row || + rmd->first_column != rmd->last_column) + return PQI_RAID_BYPASS_INELIGIBLE; + + /* Proceeding with driver mapping. */ + rmd->total_disks_per_row = rmd->data_disks_per_row + + get_unaligned_le16(&raid_map->metadata_disks_per_row); + rmd->map_row = ((u32)(rmd->first_row >> + raid_map->parity_rotation_shift)) % + get_unaligned_le16(&raid_map->row_cnt); + rmd->map_index = (rmd->map_row * rmd->total_disks_per_row) + + rmd->first_column; + + return 0; +} + +static int pqi_calc_aio_r5_or_r6(struct pqi_scsi_dev_raid_map_data *rmd, + struct raid_map *raid_map) +{ + /* RAID 50/60 */ + /* Verify first and last block are in same RAID group. */ + rmd->stripesize = rmd->blocks_per_row * rmd->layout_map_count; + rmd->first_group = (rmd->first_block % + rmd->stripesize) / rmd->blocks_per_row; + rmd->last_group = (rmd->last_block % + rmd->stripesize) / rmd->blocks_per_row; + if (rmd->first_group != rmd->last_group) + return PQI_RAID_BYPASS_INELIGIBLE; + + /* Verify request is in a single row of RAID 5/6. */ + rmd->first_row = rmd->r5or6_first_row = + rmd->first_block / rmd->stripesize; + rmd->r5or6_last_row = rmd->last_block / rmd->stripesize; + if (rmd->r5or6_first_row != rmd->r5or6_last_row) + return PQI_RAID_BYPASS_INELIGIBLE; + + /* Verify request is in a single column. */ + rmd->first_row_offset = rmd->r5or6_first_row_offset = + (u32)((rmd->first_block % rmd->stripesize) % + rmd->blocks_per_row); + + rmd->r5or6_last_row_offset = + (u32)((rmd->last_block % rmd->stripesize) % + rmd->blocks_per_row); + + rmd->first_column = + rmd->r5or6_first_row_offset / rmd->strip_size; + rmd->r5or6_first_column = rmd->first_column; + rmd->r5or6_last_column = rmd->r5or6_last_row_offset / rmd->strip_size; + if (rmd->r5or6_first_column != rmd->r5or6_last_column) + return PQI_RAID_BYPASS_INELIGIBLE; + + /* Request is eligible. */ + rmd->map_row = + ((u32)(rmd->first_row >> raid_map->parity_rotation_shift)) % + get_unaligned_le16(&raid_map->row_cnt); + + rmd->map_index = (rmd->first_group * + (get_unaligned_le16(&raid_map->row_cnt) * + rmd->total_disks_per_row)) + + (rmd->map_row * rmd->total_disks_per_row) + rmd->first_column; + + if (rmd->is_write) { + u32 index; + + /* + * p_parity_it_nexus and q_parity_it_nexus are pointers to the + * parity entries inside the device's raid_map. + * + * A device's RAID map is bounded by: number of RAID disks squared. + * + * The device's RAID map size is checked during device + * initialization. 
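+ * As a worked example (hypothetical geometry): with 4 data drives and
+ * 1 parity drive per row (total_disks_per_row = 5,
+ * metadata_disks_per_row = 1) and map_index = 7, the computation below
+ * gives DIV_ROUND_UP(8, 5) = 2, 2 * 5 = 10, 10 - 1 = 9, i.e. the parity
+ * entry at the end of map_index's row.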
+ */ + index = DIV_ROUND_UP(rmd->map_index + 1, rmd->total_disks_per_row); + index *= rmd->total_disks_per_row; + index -= get_unaligned_le16(&raid_map->metadata_disks_per_row); + + rmd->p_parity_it_nexus = raid_map->disk_data[index].aio_handle; + if (rmd->raid_level == SA_RAID_6) { + rmd->q_parity_it_nexus = raid_map->disk_data[index + 1].aio_handle; + rmd->xor_mult = raid_map->disk_data[rmd->map_index].xor_mult[1]; + } + if (rmd->blocks_per_row == 0) + return PQI_RAID_BYPASS_INELIGIBLE; + rmd->row = rmd->first_block / rmd->blocks_per_row; + } + + return 0; +} + +static void pqi_set_aio_cdb(struct pqi_scsi_dev_raid_map_data *rmd) +{ + /* Build the new CDB for the physical disk I/O. */ + if (rmd->disk_block > 0xffffffff) { + rmd->cdb[0] = rmd->is_write ? WRITE_16 : READ_16; + rmd->cdb[1] = 0; + put_unaligned_be64(rmd->disk_block, &rmd->cdb[2]); + put_unaligned_be32(rmd->disk_block_cnt, &rmd->cdb[10]); + rmd->cdb[14] = 0; + rmd->cdb[15] = 0; + rmd->cdb_length = 16; + } else { + rmd->cdb[0] = rmd->is_write ? WRITE_10 : READ_10; + rmd->cdb[1] = 0; + put_unaligned_be32((u32)rmd->disk_block, &rmd->cdb[2]); + rmd->cdb[6] = 0; + put_unaligned_be16((u16)rmd->disk_block_cnt, &rmd->cdb[7]); + rmd->cdb[9] = 0; + rmd->cdb_length = 10; + } +} + +static void pqi_calc_aio_r1_nexus(struct raid_map *raid_map, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + u32 index; + u32 group; + + group = rmd->map_index / rmd->data_disks_per_row; + + index = rmd->map_index - (group * rmd->data_disks_per_row); + rmd->it_nexus[0] = raid_map->disk_data[index].aio_handle; + index += rmd->data_disks_per_row; + rmd->it_nexus[1] = raid_map->disk_data[index].aio_handle; + if (rmd->layout_map_count > 2) { + index += rmd->data_disks_per_row; + rmd->it_nexus[2] = raid_map->disk_data[index].aio_handle; + } + + rmd->num_it_nexus_entries = rmd->layout_map_count; +} + +static int pqi_raid_bypass_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, + struct pqi_queue_group *queue_group) +{ + int rc; + struct raid_map *raid_map; + u32 group; + u32 next_bypass_group; + struct pqi_encryption_info *encryption_info_ptr; + struct pqi_encryption_info encryption_info; + struct pqi_scsi_dev_raid_map_data rmd = { 0 }; + + rc = pqi_get_aio_lba_and_block_count(scmd, &rmd); + if (rc) + return PQI_RAID_BYPASS_INELIGIBLE; + + rmd.raid_level = device->raid_level; + + if (!pqi_aio_raid_level_supported(ctrl_info, &rmd)) + return PQI_RAID_BYPASS_INELIGIBLE; + + if (unlikely(rmd.block_cnt == 0)) + return PQI_RAID_BYPASS_INELIGIBLE; + + raid_map = device->raid_map; + + rc = pci_get_aio_common_raid_map_values(ctrl_info, &rmd, raid_map); + if (rc) + return PQI_RAID_BYPASS_INELIGIBLE; + + if (device->raid_level == SA_RAID_1 || + device->raid_level == SA_RAID_TRIPLE) { + if (rmd.is_write) { + pqi_calc_aio_r1_nexus(raid_map, &rmd); + } else { + group = device->next_bypass_group[rmd.map_index]; + next_bypass_group = group + 1; + if (next_bypass_group >= rmd.layout_map_count) + next_bypass_group = 0; + device->next_bypass_group[rmd.map_index] = next_bypass_group; + rmd.map_index += group * rmd.data_disks_per_row; + } + } else if ((device->raid_level == SA_RAID_5 || + device->raid_level == SA_RAID_6) && + (rmd.layout_map_count > 1 || rmd.is_write)) { + rc = pqi_calc_aio_r5_or_r6(&rmd, raid_map); + if (rc) + return PQI_RAID_BYPASS_INELIGIBLE; + } + + if (unlikely(rmd.map_index >= RAID_MAP_MAX_ENTRIES)) + return PQI_RAID_BYPASS_INELIGIBLE; + + rmd.aio_handle = raid_map->disk_data[rmd.map_index].aio_handle; + 
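+ /*
+ * Translate the volume-relative LBA into an LBA on the selected member
+ * drive: the drive's starting block, plus one strip per full row above
+ * this one, plus the request's offset within its strip.
+ */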
rmd.disk_block = get_unaligned_le64(&raid_map->disk_starting_blk) + + rmd.first_row * rmd.strip_size + + (rmd.first_row_offset - rmd.first_column * rmd.strip_size); + rmd.disk_block_cnt = rmd.block_cnt; + + /* Handle differing logical/physical block sizes. */ + if (raid_map->phys_blk_shift) { + rmd.disk_block <<= raid_map->phys_blk_shift; + rmd.disk_block_cnt <<= raid_map->phys_blk_shift; + } + + if (unlikely(rmd.disk_block_cnt > 0xffff)) + return PQI_RAID_BYPASS_INELIGIBLE; + + pqi_set_aio_cdb(&rmd); + + if (get_unaligned_le16(&raid_map->flags) & RAID_MAP_ENCRYPTION_ENABLED) { + if (rmd.data_length > device->max_transfer_encrypted) + return PQI_RAID_BYPASS_INELIGIBLE; + pqi_set_encryption_info(&encryption_info, raid_map, rmd.first_block); + encryption_info_ptr = &encryption_info; + } else { + encryption_info_ptr = NULL; + } + + if (rmd.is_write) { + switch (device->raid_level) { + case SA_RAID_1: + case SA_RAID_TRIPLE: + return pqi_aio_submit_r1_write_io(ctrl_info, scmd, queue_group, + encryption_info_ptr, device, &rmd); + case SA_RAID_5: + case SA_RAID_6: + return pqi_aio_submit_r56_write_io(ctrl_info, scmd, queue_group, + encryption_info_ptr, device, &rmd); + } + } + + return pqi_aio_submit_io(ctrl_info, scmd, rmd.aio_handle, + rmd.cdb, rmd.cdb_length, queue_group, + encryption_info_ptr, true, false); +} + +#define PQI_STATUS_IDLE 0x0 + +#define PQI_CREATE_ADMIN_QUEUE_PAIR 1 +#define PQI_DELETE_ADMIN_QUEUE_PAIR 2 + +#define PQI_DEVICE_STATE_POWER_ON_AND_RESET 0x0 +#define PQI_DEVICE_STATE_STATUS_AVAILABLE 0x1 +#define PQI_DEVICE_STATE_ALL_REGISTERS_READY 0x2 +#define PQI_DEVICE_STATE_ADMIN_QUEUE_PAIR_READY 0x3 +#define PQI_DEVICE_STATE_ERROR 0x4 + +#define PQI_MODE_READY_TIMEOUT_SECS 30 +#define PQI_MODE_READY_POLL_INTERVAL_MSECS 1 + +static int pqi_wait_for_pqi_mode_ready(struct pqi_ctrl_info *ctrl_info) +{ + struct pqi_device_registers __iomem *pqi_registers; + unsigned long timeout; + u64 signature; + u8 status; + + pqi_registers = ctrl_info->pqi_registers; + timeout = (PQI_MODE_READY_TIMEOUT_SECS * HZ) + jiffies; + + while (1) { + signature = readq(&pqi_registers->signature); + if (memcmp(&signature, PQI_DEVICE_SIGNATURE, + sizeof(signature)) == 0) + break; + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for PQI signature\n"); + return -ETIMEDOUT; + } + msleep(PQI_MODE_READY_POLL_INTERVAL_MSECS); + } + + while (1) { + status = readb(&pqi_registers->function_and_status_code); + if (status == PQI_STATUS_IDLE) + break; + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for PQI IDLE\n"); + return -ETIMEDOUT; + } + msleep(PQI_MODE_READY_POLL_INTERVAL_MSECS); + } + + while (1) { + if (readl(&pqi_registers->device_status) == + PQI_DEVICE_STATE_ALL_REGISTERS_READY) + break; + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for PQI all registers ready\n"); + return -ETIMEDOUT; + } + msleep(PQI_MODE_READY_POLL_INTERVAL_MSECS); + } + + return 0; +} + +static inline void pqi_aio_path_disabled(struct pqi_io_request *io_request) +{ + struct pqi_scsi_dev *device; + + device = io_request->scmd->device->hostdata; + device->raid_bypass_enabled = false; + device->aio_enabled = false; +} + +static inline void pqi_take_device_offline(struct scsi_device *sdev, char *path) +{ + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + + device = sdev->hostdata; + if (device->device_offline) + return; + + device->device_offline = true; + ctrl_info = 
shost_to_hba(sdev->host); + pqi_schedule_rescan_worker(ctrl_info); + dev_err(&ctrl_info->pci_dev->dev, "re-scanning %s scsi %d:%d:%d:%d\n", + path, ctrl_info->scsi_host->host_no, device->bus, + device->target, device->lun); +} + +static void pqi_process_raid_io_error(struct pqi_io_request *io_request) +{ + u8 scsi_status; + u8 host_byte; + struct scsi_cmnd *scmd; + struct pqi_raid_error_info *error_info; + size_t sense_data_length; + int residual_count; + int xfer_count; + struct scsi_sense_hdr sshdr; + + scmd = io_request->scmd; + if (!scmd) + return; + + error_info = io_request->error_info; + scsi_status = error_info->status; + host_byte = DID_OK; + + switch (error_info->data_out_result) { + case PQI_DATA_IN_OUT_GOOD: + break; + case PQI_DATA_IN_OUT_UNDERFLOW: + xfer_count = + get_unaligned_le32(&error_info->data_out_transferred); + residual_count = scsi_bufflen(scmd) - xfer_count; + scsi_set_resid(scmd, residual_count); + if (xfer_count < scmd->underflow) + host_byte = DID_SOFT_ERROR; + break; + case PQI_DATA_IN_OUT_UNSOLICITED_ABORT: + case PQI_DATA_IN_OUT_ABORTED: + host_byte = DID_ABORT; + break; + case PQI_DATA_IN_OUT_TIMEOUT: + host_byte = DID_TIME_OUT; + break; + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW: + case PQI_DATA_IN_OUT_PROTOCOL_ERROR: + case PQI_DATA_IN_OUT_BUFFER_ERROR: + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW_DESCRIPTOR_AREA: + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW_BRIDGE: + case PQI_DATA_IN_OUT_ERROR: + case PQI_DATA_IN_OUT_HARDWARE_ERROR: + case PQI_DATA_IN_OUT_PCIE_FABRIC_ERROR: + case PQI_DATA_IN_OUT_PCIE_COMPLETION_TIMEOUT: + case PQI_DATA_IN_OUT_PCIE_COMPLETER_ABORT_RECEIVED: + case PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST_RECEIVED: + case PQI_DATA_IN_OUT_PCIE_ECRC_CHECK_FAILED: + case PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST: + case PQI_DATA_IN_OUT_PCIE_ACS_VIOLATION: + case PQI_DATA_IN_OUT_PCIE_TLP_PREFIX_BLOCKED: + case PQI_DATA_IN_OUT_PCIE_POISONED_MEMORY_READ: + default: + host_byte = DID_ERROR; + break; + } + + sense_data_length = get_unaligned_le16(&error_info->sense_data_length); + if (sense_data_length == 0) + sense_data_length = + get_unaligned_le16(&error_info->response_data_length); + if (sense_data_length) { + if (sense_data_length > sizeof(error_info->data)) + sense_data_length = sizeof(error_info->data); + + if (scsi_status == SAM_STAT_CHECK_CONDITION && + scsi_normalize_sense(error_info->data, + sense_data_length, &sshdr) && + sshdr.sense_key == HARDWARE_ERROR && + sshdr.asc == 0x3e && + sshdr.ascq == 0x1) { + pqi_take_device_offline(scmd->device, "RAID"); + host_byte = DID_NO_CONNECT; + } + + if (sense_data_length > SCSI_SENSE_BUFFERSIZE) + sense_data_length = SCSI_SENSE_BUFFERSIZE; + memcpy(scmd->sense_buffer, error_info->data, + sense_data_length); + } + + scmd->result = scsi_status; + set_host_byte(scmd, host_byte); +} + +static void pqi_process_aio_io_error(struct pqi_io_request *io_request) +{ + u8 scsi_status; + u8 host_byte; + struct scsi_cmnd *scmd; + struct pqi_aio_error_info *error_info; + size_t sense_data_length; + int residual_count; + int xfer_count; + bool device_offline; + struct pqi_scsi_dev *device; + + scmd = io_request->scmd; + error_info = io_request->error_info; + host_byte = DID_OK; + sense_data_length = 0; + device_offline = false; + device = scmd->device->hostdata; + + switch (error_info->service_response) { + case PQI_AIO_SERV_RESPONSE_COMPLETE: + scsi_status = error_info->status; + break; + case PQI_AIO_SERV_RESPONSE_FAILURE: + switch (error_info->status) { + case PQI_AIO_STATUS_IO_ABORTED: + scsi_status = SAM_STAT_TASK_ABORTED; 
+ break; + case PQI_AIO_STATUS_UNDERRUN: + scsi_status = SAM_STAT_GOOD; + residual_count = get_unaligned_le32( + &error_info->residual_count); + scsi_set_resid(scmd, residual_count); + xfer_count = scsi_bufflen(scmd) - residual_count; + if (xfer_count < scmd->underflow) + host_byte = DID_SOFT_ERROR; + break; + case PQI_AIO_STATUS_OVERRUN: + scsi_status = SAM_STAT_GOOD; + break; + case PQI_AIO_STATUS_AIO_PATH_DISABLED: + pqi_aio_path_disabled(io_request); + if (pqi_is_multipath_device(device)) { + pqi_device_remove_start(device); + host_byte = DID_NO_CONNECT; + scsi_status = SAM_STAT_CHECK_CONDITION; + } else { + scsi_status = SAM_STAT_GOOD; + io_request->status = -EAGAIN; + } + break; + case PQI_AIO_STATUS_NO_PATH_TO_DEVICE: + case PQI_AIO_STATUS_INVALID_DEVICE: + if (!io_request->raid_bypass) { + device_offline = true; + pqi_take_device_offline(scmd->device, "AIO"); + host_byte = DID_NO_CONNECT; + } + scsi_status = SAM_STAT_CHECK_CONDITION; + break; + case PQI_AIO_STATUS_IO_ERROR: + default: + scsi_status = SAM_STAT_CHECK_CONDITION; + break; + } + break; + case PQI_AIO_SERV_RESPONSE_TMF_COMPLETE: + case PQI_AIO_SERV_RESPONSE_TMF_SUCCEEDED: + scsi_status = SAM_STAT_GOOD; + break; + case PQI_AIO_SERV_RESPONSE_TMF_REJECTED: + case PQI_AIO_SERV_RESPONSE_TMF_INCORRECT_LUN: + default: + scsi_status = SAM_STAT_CHECK_CONDITION; + break; + } + + if (error_info->data_present) { + sense_data_length = + get_unaligned_le16(&error_info->data_length); + if (sense_data_length) { + if (sense_data_length > sizeof(error_info->data)) + sense_data_length = sizeof(error_info->data); + if (sense_data_length > SCSI_SENSE_BUFFERSIZE) + sense_data_length = SCSI_SENSE_BUFFERSIZE; + memcpy(scmd->sense_buffer, error_info->data, + sense_data_length); + } + } + + if (device_offline && sense_data_length == 0) + scsi_build_sense_buffer(0, scmd->sense_buffer, HARDWARE_ERROR, 0x3e, 0x1); + + scmd->result = scsi_status; + set_host_byte(scmd, host_byte); +} + +static void pqi_process_io_error(unsigned int iu_type, + struct pqi_io_request *io_request) +{ + switch (iu_type) { + case PQI_RESPONSE_IU_RAID_PATH_IO_ERROR: + pqi_process_raid_io_error(io_request); + break; + case PQI_RESPONSE_IU_AIO_PATH_IO_ERROR: + pqi_process_aio_io_error(io_request); + break; + } +} + +static int pqi_interpret_task_management_response(struct pqi_ctrl_info *ctrl_info, + struct pqi_task_management_response *response) +{ + int rc; + + switch (response->response_code) { + case SOP_TMF_COMPLETE: + case SOP_TMF_FUNCTION_SUCCEEDED: + rc = 0; + break; + case SOP_TMF_REJECTED: + rc = -EAGAIN; + break; + case SOP_RC_INCORRECT_LOGICAL_UNIT: + rc = -ENODEV; + break; + default: + rc = -EIO; + break; + } + + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "Task Management Function error: %d (response code: %u)\n", rc, response->response_code); + + return rc; +} + +static inline void pqi_invalid_response(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason) +{ + pqi_take_ctrl_offline(ctrl_info, ctrl_shutdown_reason); +} + +static int pqi_process_io_intr(struct pqi_ctrl_info *ctrl_info, struct pqi_queue_group *queue_group) +{ + int num_responses; + pqi_index_t oq_pi; + pqi_index_t oq_ci; + struct pqi_io_request *io_request; + struct pqi_io_response *response; + u16 request_id; + + num_responses = 0; + oq_ci = queue_group->oq_ci_copy; + + while (1) { + oq_pi = readl(queue_group->oq_pi); + if (oq_pi >= ctrl_info->num_elements_per_oq) { + pqi_invalid_response(ctrl_info, PQI_IO_PI_OUT_OF_RANGE); + dev_err(&ctrl_info->pci_dev->dev, + 
"I/O interrupt: producer index (%u) out of range (0-%u): consumer index: %u\n", + oq_pi, ctrl_info->num_elements_per_oq - 1, oq_ci); + return -1; + } + if (oq_pi == oq_ci) + break; + + num_responses++; + response = queue_group->oq_element_array + + (oq_ci * PQI_OPERATIONAL_OQ_ELEMENT_LENGTH); + + request_id = get_unaligned_le16(&response->request_id); + if (request_id >= ctrl_info->max_io_slots) { + pqi_invalid_response(ctrl_info, PQI_INVALID_REQ_ID); + dev_err(&ctrl_info->pci_dev->dev, + "request ID in response (%u) out of range (0-%u): producer index: %u consumer index: %u\n", + request_id, ctrl_info->max_io_slots - 1, oq_pi, oq_ci); + return -1; + } + + io_request = &ctrl_info->io_request_pool[request_id]; + if (atomic_read(&io_request->refcount) == 0) { + pqi_invalid_response(ctrl_info, PQI_UNMATCHED_REQ_ID); + dev_err(&ctrl_info->pci_dev->dev, + "request ID in response (%u) does not match an outstanding I/O request: producer index: %u consumer index: %u\n", + request_id, oq_pi, oq_ci); + return -1; + } + + switch (response->header.iu_type) { + case PQI_RESPONSE_IU_RAID_PATH_IO_SUCCESS: + case PQI_RESPONSE_IU_AIO_PATH_IO_SUCCESS: + if (io_request->scmd) + io_request->scmd->result = 0; + /* fall through */ + case PQI_RESPONSE_IU_GENERAL_MANAGEMENT: + break; + case PQI_RESPONSE_IU_VENDOR_GENERAL: + io_request->status = + get_unaligned_le16( + &((struct pqi_vendor_general_response *)response)->status); + break; + case PQI_RESPONSE_IU_TASK_MANAGEMENT: + io_request->status = pqi_interpret_task_management_response(ctrl_info, + (void *)response); + break; + case PQI_RESPONSE_IU_AIO_PATH_DISABLED: + pqi_aio_path_disabled(io_request); + io_request->status = -EAGAIN; + break; + case PQI_RESPONSE_IU_RAID_PATH_IO_ERROR: + case PQI_RESPONSE_IU_AIO_PATH_IO_ERROR: + io_request->error_info = ctrl_info->error_buffer + + (get_unaligned_le16(&response->error_index) * + PQI_ERROR_BUFFER_ELEMENT_LENGTH); + pqi_process_io_error(response->header.iu_type, io_request); + break; + default: + pqi_invalid_response(ctrl_info, PQI_UNEXPECTED_IU_TYPE); + dev_err(&ctrl_info->pci_dev->dev, + "unexpected IU type: 0x%x: producer index: %u consumer index: %u\n", + response->header.iu_type, oq_pi, oq_ci); + return -1; + } + + io_request->io_complete_callback(io_request, io_request->context); + + /* + * Note that the I/O request structure CANNOT BE TOUCHED after + * returning from the I/O completion callback! 
+ */ + oq_ci = (oq_ci + 1) % ctrl_info->num_elements_per_oq; + } + + if (num_responses) { + queue_group->oq_ci_copy = oq_ci; + writel(oq_ci, queue_group->oq_ci); + } + + return num_responses; +} + +static inline unsigned int pqi_num_elements_free(unsigned int pi, + unsigned int ci, unsigned int elements_in_queue) +{ + unsigned int num_elements_used; + + if (pi >= ci) + num_elements_used = pi - ci; + else + num_elements_used = elements_in_queue - ci + pi; + + return elements_in_queue - num_elements_used - 1; +} + +static void pqi_send_event_ack(struct pqi_ctrl_info *ctrl_info, + struct pqi_event_acknowledge_request *iu, size_t iu_length) +{ + pqi_index_t iq_pi; + pqi_index_t iq_ci; + unsigned long flags; + void *next_element; + struct pqi_queue_group *queue_group; + + queue_group = &ctrl_info->queue_groups[PQI_DEFAULT_QUEUE_GROUP]; + put_unaligned_le16(queue_group->oq_id, &iu->header.response_queue_id); + + while (1) { + spin_lock_irqsave(&queue_group->submit_lock[RAID_PATH], flags); + + iq_pi = queue_group->iq_pi_copy[RAID_PATH]; + iq_ci = readl(queue_group->iq_ci[RAID_PATH]); + + if (pqi_num_elements_free(iq_pi, iq_ci, + ctrl_info->num_elements_per_iq)) + break; + + spin_unlock_irqrestore( + &queue_group->submit_lock[RAID_PATH], flags); + + if (pqi_ctrl_offline(ctrl_info)) + return; + } + + next_element = queue_group->iq_element_array[RAID_PATH] + + (iq_pi * PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + memcpy(next_element, iu, iu_length); + + iq_pi = (iq_pi + 1) % ctrl_info->num_elements_per_iq; + queue_group->iq_pi_copy[RAID_PATH] = iq_pi; + + /* + * This write notifies the controller that an IU is available to be + * processed. + */ + writel(iq_pi, queue_group->iq_pi[RAID_PATH]); + + spin_unlock_irqrestore(&queue_group->submit_lock[RAID_PATH], flags); +} + +static void pqi_acknowledge_event(struct pqi_ctrl_info *ctrl_info, + struct pqi_event *event) +{ + struct pqi_event_acknowledge_request request; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_ACKNOWLEDGE_VENDOR_EVENT; + put_unaligned_le16(sizeof(request) - PQI_REQUEST_HEADER_LENGTH, + &request.header.iu_length); + request.event_type = event->event_type; + put_unaligned_le16(event->event_id, &request.event_id); + put_unaligned_le16(event->additional_event_id, &request.additional_event_id); + + pqi_send_event_ack(ctrl_info, &request, sizeof(request)); +} + +#define PQI_SOFT_RESET_STATUS_TIMEOUT_SECS 30 +#define PQI_SOFT_RESET_STATUS_POLL_INTERVAL_SECS 1 + +static enum pqi_soft_reset_status pqi_poll_for_soft_reset_status( + struct pqi_ctrl_info *ctrl_info) +{ + u8 status; + unsigned long timeout; + + timeout = (PQI_SOFT_RESET_STATUS_TIMEOUT_SECS * HZ) + jiffies; + + while (1) { + status = pqi_read_soft_reset_status(ctrl_info); + if (status & PQI_SOFT_RESET_INITIATE) + return RESET_INITIATE_DRIVER; + + if (status & PQI_SOFT_RESET_ABORT) + return RESET_ABORT; + + if (!sis_is_firmware_running(ctrl_info)) + return RESET_NORESPONSE; + + if (time_after(jiffies, timeout)) { + dev_warn(&ctrl_info->pci_dev->dev, + "timed out waiting for soft reset status\n"); + return RESET_TIMEDOUT; + } + + ssleep(PQI_SOFT_RESET_STATUS_POLL_INTERVAL_SECS); + } +} + +static void pqi_process_soft_reset(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + unsigned int delay_secs; + enum pqi_soft_reset_status reset_status; + + if (ctrl_info->soft_reset_handshake_supported) + reset_status = pqi_poll_for_soft_reset_status(ctrl_info); + else + reset_status = RESET_INITIATE_FIRMWARE; + + delay_secs = PQI_POST_RESET_DELAY_SECS; + + switch 
(reset_status) { + case RESET_TIMEDOUT: + delay_secs = PQI_POST_OFA_RESET_DELAY_UPON_TIMEOUT_SECS; + /* fall through */ + case RESET_INITIATE_DRIVER: + dev_info(&ctrl_info->pci_dev->dev, + "Online Firmware Activation: resetting controller\n"); + sis_soft_reset(ctrl_info); + /* fall through */ + case RESET_INITIATE_FIRMWARE: + ctrl_info->pqi_mode_enabled = false; + pqi_save_ctrl_mode(ctrl_info, SIS_MODE); + rc = pqi_ofa_ctrl_restart(ctrl_info, delay_secs); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + dev_info(&ctrl_info->pci_dev->dev, + "Online Firmware Activation: %s\n", + rc == 0 ? "SUCCESS" : "FAILED"); + break; + case RESET_ABORT: + dev_info(&ctrl_info->pci_dev->dev, + "Online Firmware Activation ABORTED\n"); + if (ctrl_info->soft_reset_handshake_supported) + pqi_clear_soft_reset_status(ctrl_info); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + pqi_ofa_ctrl_unquiesce(ctrl_info); + break; + case RESET_NORESPONSE: + /* fall through */ + default: + dev_err(&ctrl_info->pci_dev->dev, + "unexpected Online Firmware Activation reset status: 0x%x\n", + reset_status); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + pqi_ofa_ctrl_unquiesce(ctrl_info); + pqi_take_ctrl_offline(ctrl_info, PQI_OFA_RESPONSE_TIMEOUT); + break; + } +} + +static void pqi_ofa_memory_alloc_worker(struct work_struct *work) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = container_of(work, struct pqi_ctrl_info, ofa_memory_alloc_work); + + pqi_ctrl_ofa_start(ctrl_info); + pqi_ofa_setup_host_buffer(ctrl_info); + pqi_ofa_host_memory_update(ctrl_info); +} + +static void pqi_ofa_quiesce_worker(struct work_struct *work) +{ + struct pqi_ctrl_info *ctrl_info; + struct pqi_event *event; + + ctrl_info = container_of(work, struct pqi_ctrl_info, ofa_quiesce_work); + + event = &ctrl_info->events[pqi_event_type_to_event_index(PQI_EVENT_TYPE_OFA)]; + + pqi_ofa_ctrl_quiesce(ctrl_info); + pqi_acknowledge_event(ctrl_info, event); + pqi_process_soft_reset(ctrl_info); +} + +static bool pqi_ofa_process_event(struct pqi_ctrl_info *ctrl_info, + struct pqi_event *event) +{ + bool ack_event; + + ack_event = true; + + switch (event->event_id) { + case PQI_EVENT_OFA_MEMORY_ALLOCATION: + dev_info(&ctrl_info->pci_dev->dev, + "received Online Firmware Activation memory allocation request\n"); + schedule_work(&ctrl_info->ofa_memory_alloc_work); + break; + case PQI_EVENT_OFA_QUIESCE: + dev_info(&ctrl_info->pci_dev->dev, + "received Online Firmware Activation quiesce request\n"); + schedule_work(&ctrl_info->ofa_quiesce_work); + ack_event = false; + break; + case PQI_EVENT_OFA_CANCELED: + dev_info(&ctrl_info->pci_dev->dev, + "received Online Firmware Activation cancel request: reason: %u\n", + ctrl_info->ofa_cancel_reason); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + break; + default: + dev_err(&ctrl_info->pci_dev->dev, + "received unknown Online Firmware Activation request: event ID: %u\n", + event->event_id); + break; + } + + return ack_event; +} + +static void pqi_disable_raid_bypass(struct pqi_ctrl_info *ctrl_info) +{ + unsigned long flags; + struct pqi_scsi_dev *device; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) + if (device->raid_bypass_enabled) + device->raid_bypass_enabled = false; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); +} + +static void pqi_event_worker(struct work_struct *work) +{ + unsigned int i; + 
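+ /*
+ * Scheduled from pqi_process_event_intr(): drain every pending event,
+ * hand OFA events to their dedicated handlers, and schedule a delayed
+ * rescan for everything else.
+ */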
bool rescan_needed; + struct pqi_ctrl_info *ctrl_info; + struct pqi_event *event; + bool ack_event; + + ctrl_info = container_of(work, struct pqi_ctrl_info, event_work); + + pqi_ctrl_busy(ctrl_info); + pqi_wait_if_ctrl_blocked(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + goto out; + + rescan_needed = false; + event = ctrl_info->events; + for (i = 0; i < PQI_NUM_SUPPORTED_EVENTS; i++) { + if (event->pending) { + event->pending = false; + if (event->event_type == PQI_EVENT_TYPE_OFA) { + ack_event = pqi_ofa_process_event(ctrl_info, event); + } else { + ack_event = true; + rescan_needed = true; + if (event->event_type == PQI_EVENT_TYPE_LOGICAL_DEVICE) + ctrl_info->logical_volume_rescan_needed = true; + else if (event->event_type == PQI_EVENT_TYPE_AIO_STATE_CHANGE) + pqi_disable_raid_bypass(ctrl_info); + } + if (ack_event) + pqi_acknowledge_event(ctrl_info, event); + } + event++; + } + +#define PQI_RESCAN_WORK_FOR_EVENT_DELAY (5 * HZ) + + if (rescan_needed) + pqi_schedule_rescan_worker_with_delay(ctrl_info, + PQI_RESCAN_WORK_FOR_EVENT_DELAY); + +out: + pqi_ctrl_unbusy(ctrl_info); +} + +#define PQI_HEARTBEAT_TIMER_INTERVAL (10 * HZ) + +static void pqi_heartbeat_timer_handler(struct timer_list *t) +{ + int num_interrupts; + u32 heartbeat_count; + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = from_timer(ctrl_info, t, heartbeat_timer); + + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + return; + + num_interrupts = atomic_read(&ctrl_info->num_interrupts); + heartbeat_count = pqi_read_heartbeat_counter(ctrl_info); + + if (num_interrupts == ctrl_info->previous_num_interrupts) { + if (heartbeat_count == ctrl_info->previous_heartbeat_count) { + dev_err(&ctrl_info->pci_dev->dev, + "no heartbeat detected - last heartbeat count: %u\n", + heartbeat_count); + pqi_take_ctrl_offline(ctrl_info, PQI_NO_HEARTBEAT); + return; + } + } else { + ctrl_info->previous_num_interrupts = num_interrupts; + } + + ctrl_info->previous_heartbeat_count = heartbeat_count; + mod_timer(&ctrl_info->heartbeat_timer, + jiffies + PQI_HEARTBEAT_TIMER_INTERVAL); +} + +static void pqi_start_heartbeat_timer(struct pqi_ctrl_info *ctrl_info) +{ + if (!ctrl_info->heartbeat_counter) + return; + + ctrl_info->previous_num_interrupts = + atomic_read(&ctrl_info->num_interrupts); + ctrl_info->previous_heartbeat_count = + pqi_read_heartbeat_counter(ctrl_info); + + ctrl_info->heartbeat_timer.expires = + jiffies + PQI_HEARTBEAT_TIMER_INTERVAL; + add_timer(&ctrl_info->heartbeat_timer); +} + +static inline void pqi_stop_heartbeat_timer(struct pqi_ctrl_info *ctrl_info) +{ + del_timer_sync(&ctrl_info->heartbeat_timer); +} + +static void pqi_ofa_capture_event_payload(struct pqi_ctrl_info *ctrl_info, + struct pqi_event *event, struct pqi_event_response *response) +{ + switch (event->event_id) { + case PQI_EVENT_OFA_MEMORY_ALLOCATION: + ctrl_info->ofa_bytes_requested = + get_unaligned_le32(&response->data.ofa_memory_allocation.bytes_requested); + break; + case PQI_EVENT_OFA_CANCELED: + ctrl_info->ofa_cancel_reason = + get_unaligned_le16(&response->data.ofa_cancelled.reason); + break; + } +} + +static int pqi_process_event_intr(struct pqi_ctrl_info *ctrl_info) +{ + int num_events; + pqi_index_t oq_pi; + pqi_index_t oq_ci; + struct pqi_event_queue *event_queue; + struct pqi_event_response *response; + struct pqi_event *event; + int event_index; + + event_queue = &ctrl_info->event_queue; + num_events = 0; + oq_ci = event_queue->oq_ci_copy; + + while (1) { + oq_pi = readl(event_queue->oq_pi); + if (oq_pi >= 
PQI_NUM_EVENT_QUEUE_ELEMENTS) { + pqi_invalid_response(ctrl_info, PQI_EVENT_PI_OUT_OF_RANGE); + dev_err(&ctrl_info->pci_dev->dev, + "event interrupt: producer index (%u) out of range (0-%u): consumer index: %u\n", + oq_pi, PQI_NUM_EVENT_QUEUE_ELEMENTS - 1, oq_ci); + return -1; + } + + if (oq_pi == oq_ci) + break; + + num_events++; + response = event_queue->oq_element_array + (oq_ci * PQI_EVENT_OQ_ELEMENT_LENGTH); + + event_index = pqi_event_type_to_event_index(response->event_type); + + if (event_index >= 0 && response->request_acknowledge) { + event = &ctrl_info->events[event_index]; + event->pending = true; + event->event_type = response->event_type; + event->event_id = get_unaligned_le16(&response->event_id); + event->additional_event_id = + get_unaligned_le32(&response->additional_event_id); + if (event->event_type == PQI_EVENT_TYPE_OFA) + pqi_ofa_capture_event_payload(ctrl_info, event, response); + } + + oq_ci = (oq_ci + 1) % PQI_NUM_EVENT_QUEUE_ELEMENTS; + } + + if (num_events) { + event_queue->oq_ci_copy = oq_ci; + writel(oq_ci, event_queue->oq_ci); + schedule_work(&ctrl_info->event_work); + } + + return num_events; +} + +#define PQI_LEGACY_INTX_MASK 0x1 + +static inline void pqi_configure_legacy_intx(struct pqi_ctrl_info *ctrl_info, bool enable_intx) +{ + u32 intx_mask; + struct pqi_device_registers __iomem *pqi_registers; + volatile void __iomem *register_addr; + + pqi_registers = ctrl_info->pqi_registers; + + if (enable_intx) + register_addr = &pqi_registers->legacy_intx_mask_clear; + else + register_addr = &pqi_registers->legacy_intx_mask_set; + + intx_mask = readl(register_addr); + intx_mask |= PQI_LEGACY_INTX_MASK; + writel(intx_mask, register_addr); +} + +static void pqi_change_irq_mode(struct pqi_ctrl_info *ctrl_info, + enum pqi_irq_mode new_mode) +{ + switch (ctrl_info->irq_mode) { + case IRQ_MODE_MSIX: + switch (new_mode) { + case IRQ_MODE_MSIX: + break; + case IRQ_MODE_INTX: + pqi_configure_legacy_intx(ctrl_info, true); + sis_enable_intx(ctrl_info); + break; + case IRQ_MODE_NONE: + break; + } + break; + case IRQ_MODE_INTX: + switch (new_mode) { + case IRQ_MODE_MSIX: + pqi_configure_legacy_intx(ctrl_info, false); + sis_enable_msix(ctrl_info); + break; + case IRQ_MODE_INTX: + break; + case IRQ_MODE_NONE: + pqi_configure_legacy_intx(ctrl_info, false); + break; + } + break; + case IRQ_MODE_NONE: + switch (new_mode) { + case IRQ_MODE_MSIX: + sis_enable_msix(ctrl_info); + break; + case IRQ_MODE_INTX: + pqi_configure_legacy_intx(ctrl_info, true); + sis_enable_intx(ctrl_info); + break; + case IRQ_MODE_NONE: + break; + } + break; + } + + ctrl_info->irq_mode = new_mode; +} + +#define PQI_LEGACY_INTX_PENDING 0x1 + +static inline bool pqi_is_valid_irq(struct pqi_ctrl_info *ctrl_info) +{ + bool valid_irq; + u32 intx_status; + + switch (ctrl_info->irq_mode) { + case IRQ_MODE_MSIX: + valid_irq = true; + break; + case IRQ_MODE_INTX: + intx_status = readl(&ctrl_info->pqi_registers->legacy_intx_status); + if (intx_status & PQI_LEGACY_INTX_PENDING) + valid_irq = true; + else + valid_irq = false; + break; + case IRQ_MODE_NONE: + default: + valid_irq = false; + break; + } + + return valid_irq; +} + +static irqreturn_t pqi_irq_handler(int irq, void *data) +{ + struct pqi_ctrl_info *ctrl_info; + struct pqi_queue_group *queue_group; + int num_io_responses_handled; + int num_events_handled; + + queue_group = data; + ctrl_info = queue_group->ctrl_info; + + if (!pqi_is_valid_irq(ctrl_info)) + return IRQ_NONE; + + num_io_responses_handled = pqi_process_io_intr(ctrl_info, queue_group); + if 
(num_io_responses_handled < 0) + goto out; + + if (irq == ctrl_info->event_irq) { + num_events_handled = pqi_process_event_intr(ctrl_info); + if (num_events_handled < 0) + goto out; + } else { + num_events_handled = 0; + } + + if (num_io_responses_handled + num_events_handled > 0) + atomic_inc(&ctrl_info->num_interrupts); + + pqi_start_io(ctrl_info, queue_group, RAID_PATH, NULL); + pqi_start_io(ctrl_info, queue_group, AIO_PATH, NULL); + +out: + return IRQ_HANDLED; +} + + +static int pqi_request_irqs(struct pqi_ctrl_info *ctrl_info) +{ + struct pci_dev *pci_dev; + int i; + int rc; + + pci_dev = ctrl_info->pci_dev; + ctrl_info->event_irq = pqi_pci_irq_vector(pci_dev, 0); + + for (i = 0; i < ctrl_info->num_msix_vectors_enabled; i++) { + rc = request_irq(pqi_pci_irq_vector(pci_dev, i), pqi_irq_handler, 0, + DRIVER_NAME_SHORT, pqi_get_irq_cookie(ctrl_info, i)); + if (rc) { + dev_err(&pci_dev->dev, + "irq %u init failed with error %d\n", + pqi_pci_irq_vector(pci_dev, i), rc); + return rc; + } + ctrl_info->num_msix_vectors_initialized++; + } + + return 0; +} + +static void pqi_free_irqs(struct pqi_ctrl_info *ctrl_info) +{ + int i; + + for (i = 0; i < ctrl_info->num_msix_vectors_initialized; i++) + free_irq(pqi_pci_irq_vector(ctrl_info->pci_dev, i), + pqi_get_irq_cookie(ctrl_info, i)); + + ctrl_info->num_msix_vectors_initialized = 0; +} + +static int pqi_enable_msix_interrupts(struct pqi_ctrl_info *ctrl_info) +{ + int num_vectors_enabled; + unsigned int flags = PCI_IRQ_MSIX; + + if (!pqi_disable_managed_interrupts) + flags |= PCI_IRQ_AFFINITY; + + num_vectors_enabled = pqi_pci_alloc_irq_vectors(ctrl_info->pci_dev, + PQI_MIN_MSIX_VECTORS, ctrl_info->num_queue_groups, + flags); + + if (num_vectors_enabled < 0) { + dev_err(&ctrl_info->pci_dev->dev, + "MSI-X init failed with error %d\n", + num_vectors_enabled); + return num_vectors_enabled; + } + + ctrl_info->num_msix_vectors_enabled = num_vectors_enabled; + ctrl_info->irq_mode = IRQ_MODE_MSIX; + + return 0; +} + +static void pqi_disable_msix_interrupts(struct pqi_ctrl_info *ctrl_info) +{ + if (ctrl_info->num_msix_vectors_enabled) { + pqi_pci_free_irq_vectors(ctrl_info->pci_dev); + ctrl_info->num_msix_vectors_enabled = 0; + } +} + +static int pqi_alloc_operational_queues(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + size_t alloc_length; + size_t element_array_length_per_iq; + size_t element_array_length_per_oq; + void *element_array; + void __iomem *next_queue_index; + void *aligned_pointer; + unsigned int num_inbound_queues; + unsigned int num_outbound_queues; + unsigned int num_queue_indexes; + struct pqi_queue_group *queue_group; + + element_array_length_per_iq = + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH * + ctrl_info->num_elements_per_iq; + element_array_length_per_oq = + PQI_OPERATIONAL_OQ_ELEMENT_LENGTH * + ctrl_info->num_elements_per_oq; + num_inbound_queues = ctrl_info->num_queue_groups * 2; + num_outbound_queues = ctrl_info->num_queue_groups; + num_queue_indexes = (ctrl_info->num_queue_groups * 3) + 1; + + aligned_pointer = NULL; + + for (i = 0; i < num_inbound_queues; i++) { + aligned_pointer = PTR_ALIGN(aligned_pointer, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + aligned_pointer += element_array_length_per_iq; + } + + for (i = 0; i < num_outbound_queues; i++) { + aligned_pointer = PTR_ALIGN(aligned_pointer, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + aligned_pointer += element_array_length_per_oq; + } + + aligned_pointer = PTR_ALIGN(aligned_pointer, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + aligned_pointer += PQI_NUM_EVENT_QUEUE_ELEMENTS * + 
PQI_EVENT_OQ_ELEMENT_LENGTH; + + for (i = 0; i < num_queue_indexes; i++) { + aligned_pointer = PTR_ALIGN(aligned_pointer, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + aligned_pointer += sizeof(pqi_index_t); + } + + alloc_length = (size_t)aligned_pointer + + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT; + + alloc_length += PQI_EXTRA_SGL_MEMORY; + + ctrl_info->queue_memory_base = + dma_zalloc_coherent(&ctrl_info->pci_dev->dev, + alloc_length, + &ctrl_info->queue_memory_base_dma_handle, GFP_KERNEL); + + if (!ctrl_info->queue_memory_base) + return -ENOMEM; + + ctrl_info->queue_memory_length = alloc_length; + + element_array = PTR_ALIGN(ctrl_info->queue_memory_base, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + queue_group->iq_element_array[RAID_PATH] = element_array; + queue_group->iq_element_array_bus_addr[RAID_PATH] = + ctrl_info->queue_memory_base_dma_handle + + (element_array - ctrl_info->queue_memory_base); + element_array += element_array_length_per_iq; + element_array = PTR_ALIGN(element_array, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + queue_group->iq_element_array[AIO_PATH] = element_array; + queue_group->iq_element_array_bus_addr[AIO_PATH] = + ctrl_info->queue_memory_base_dma_handle + + (element_array - ctrl_info->queue_memory_base); + element_array += element_array_length_per_iq; + element_array = PTR_ALIGN(element_array, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + } + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + queue_group->oq_element_array = element_array; + queue_group->oq_element_array_bus_addr = + ctrl_info->queue_memory_base_dma_handle + + (element_array - ctrl_info->queue_memory_base); + element_array += element_array_length_per_oq; + element_array = PTR_ALIGN(element_array, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + } + + ctrl_info->event_queue.oq_element_array = element_array; + ctrl_info->event_queue.oq_element_array_bus_addr = + ctrl_info->queue_memory_base_dma_handle + + (element_array - ctrl_info->queue_memory_base); + element_array += PQI_NUM_EVENT_QUEUE_ELEMENTS * + PQI_EVENT_OQ_ELEMENT_LENGTH; + + next_queue_index = (void __iomem *)PTR_ALIGN(element_array, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + queue_group->iq_ci[RAID_PATH] = next_queue_index; + queue_group->iq_ci_bus_addr[RAID_PATH] = + ctrl_info->queue_memory_base_dma_handle + + (next_queue_index - + (void __iomem *)ctrl_info->queue_memory_base); + next_queue_index += sizeof(pqi_index_t); + next_queue_index = PTR_ALIGN(next_queue_index, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + queue_group->iq_ci[AIO_PATH] = next_queue_index; + queue_group->iq_ci_bus_addr[AIO_PATH] = + ctrl_info->queue_memory_base_dma_handle + + (next_queue_index - + (void __iomem *)ctrl_info->queue_memory_base); + next_queue_index += sizeof(pqi_index_t); + next_queue_index = PTR_ALIGN(next_queue_index, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + queue_group->oq_pi = next_queue_index; + queue_group->oq_pi_bus_addr = + ctrl_info->queue_memory_base_dma_handle + + (next_queue_index - + (void __iomem *)ctrl_info->queue_memory_base); + next_queue_index += sizeof(pqi_index_t); + next_queue_index = PTR_ALIGN(next_queue_index, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + } + + ctrl_info->event_queue.oq_pi = next_queue_index; + ctrl_info->event_queue.oq_pi_bus_addr = + ctrl_info->queue_memory_base_dma_handle + + (next_queue_index - + (void __iomem 
*)ctrl_info->queue_memory_base); + + return 0; +} + +static void pqi_init_operational_queues(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + u16 next_iq_id = PQI_MIN_OPERATIONAL_QUEUE_ID; + u16 next_oq_id = PQI_MIN_OPERATIONAL_QUEUE_ID; + + /* + * Initialize the backpointers to the controller structure in + * each operational queue group structure. + */ + for (i = 0; i < ctrl_info->num_queue_groups; i++) + ctrl_info->queue_groups[i].ctrl_info = ctrl_info; + + /* + * Assign IDs to all operational queues. Note that the IDs + * assigned to operational IQs are independent of the IDs + * assigned to operational OQs. + */ + ctrl_info->event_queue.oq_id = next_oq_id++; + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + ctrl_info->queue_groups[i].iq_id[RAID_PATH] = next_iq_id++; + ctrl_info->queue_groups[i].iq_id[AIO_PATH] = next_iq_id++; + ctrl_info->queue_groups[i].oq_id = next_oq_id++; + } + + /* + * Assign MSI-X table entry indexes to all queues. Note that the + * interrupt for the event queue is shared with the first queue group. + */ + ctrl_info->event_queue.int_msg_num = 0; + for (i = 0; i < ctrl_info->num_queue_groups; i++) + ctrl_info->queue_groups[i].int_msg_num = i; + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + spin_lock_init(&ctrl_info->queue_groups[i].submit_lock[0]); + spin_lock_init(&ctrl_info->queue_groups[i].submit_lock[1]); + INIT_LIST_HEAD(&ctrl_info->queue_groups[i].request_list[0]); + INIT_LIST_HEAD(&ctrl_info->queue_groups[i].request_list[1]); + } +} + +static int pqi_alloc_admin_queues(struct pqi_ctrl_info *ctrl_info) +{ + size_t alloc_length; + struct pqi_admin_queues_aligned *admin_queues_aligned; + struct pqi_admin_queues *admin_queues; + + alloc_length = sizeof(struct pqi_admin_queues_aligned) + + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT; + + ctrl_info->admin_queue_memory_base = + dma_zalloc_coherent(&ctrl_info->pci_dev->dev, + alloc_length, + &ctrl_info->admin_queue_memory_base_dma_handle, + GFP_KERNEL); + + if (!ctrl_info->admin_queue_memory_base) + return -ENOMEM; + + ctrl_info->admin_queue_memory_length = alloc_length; + + admin_queues = &ctrl_info->admin_queues; + admin_queues_aligned = PTR_ALIGN(ctrl_info->admin_queue_memory_base, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + admin_queues->iq_element_array = + &admin_queues_aligned->iq_element_array; + admin_queues->oq_element_array = + &admin_queues_aligned->oq_element_array; + admin_queues->iq_ci = &admin_queues_aligned->iq_ci; + admin_queues->oq_pi = + (pqi_index_t __iomem *)&admin_queues_aligned->oq_pi; + + admin_queues->iq_element_array_bus_addr = + ctrl_info->admin_queue_memory_base_dma_handle + + (admin_queues->iq_element_array - + ctrl_info->admin_queue_memory_base); + admin_queues->oq_element_array_bus_addr = + ctrl_info->admin_queue_memory_base_dma_handle + + (admin_queues->oq_element_array - + ctrl_info->admin_queue_memory_base); + admin_queues->iq_ci_bus_addr = + ctrl_info->admin_queue_memory_base_dma_handle + + ((void *)admin_queues->iq_ci - + ctrl_info->admin_queue_memory_base); + admin_queues->oq_pi_bus_addr = + ctrl_info->admin_queue_memory_base_dma_handle + + ((void __iomem *)admin_queues->oq_pi - + (void __iomem *)ctrl_info->admin_queue_memory_base); + + return 0; +} + +#define PQI_ADMIN_QUEUE_CREATE_TIMEOUT_JIFFIES HZ +#define PQI_ADMIN_QUEUE_CREATE_POLL_INTERVAL_MSECS 1 + +static int pqi_create_admin_queues(struct pqi_ctrl_info *ctrl_info) +{ + struct pqi_device_registers __iomem *pqi_registers; + struct pqi_admin_queues *admin_queues; + unsigned long timeout; + u8 status; + u32 reg; 
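+	/*
+	 * Hand the admin IQ/OQ element arrays and index addresses to the
+	 * controller, issue the "create admin queue pair" function, then poll
+	 * the function and status code register until the controller reports
+	 * idle or the (one second) create timeout expires.
+	 */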
+ + pqi_registers = ctrl_info->pqi_registers; + admin_queues = &ctrl_info->admin_queues; + + writeq((u64)admin_queues->iq_element_array_bus_addr, + &pqi_registers->admin_iq_element_array_addr); + writeq((u64)admin_queues->oq_element_array_bus_addr, + &pqi_registers->admin_oq_element_array_addr); + writeq((u64)admin_queues->iq_ci_bus_addr, + &pqi_registers->admin_iq_ci_addr); + writeq((u64)admin_queues->oq_pi_bus_addr, + &pqi_registers->admin_oq_pi_addr); + + reg = PQI_ADMIN_IQ_NUM_ELEMENTS | + (PQI_ADMIN_OQ_NUM_ELEMENTS << 8) | + (admin_queues->int_msg_num << 16); + writel(reg, &pqi_registers->admin_iq_num_elements); + + writel(PQI_CREATE_ADMIN_QUEUE_PAIR, + &pqi_registers->function_and_status_code); + + timeout = PQI_ADMIN_QUEUE_CREATE_TIMEOUT_JIFFIES + jiffies; + while (1) { + msleep(PQI_ADMIN_QUEUE_CREATE_POLL_INTERVAL_MSECS); + status = readb(&pqi_registers->function_and_status_code); + if (status == PQI_STATUS_IDLE) + break; + if (time_after(jiffies, timeout)) + return -ETIMEDOUT; + } + + /* + * The offset registers are not initialized to the correct + * offsets until *after* the create admin queue pair command + * completes successfully. + */ + admin_queues->iq_pi = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + readq(&pqi_registers->admin_iq_pi_offset); + admin_queues->oq_ci = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + readq(&pqi_registers->admin_oq_ci_offset); + + return 0; +} + +static void pqi_submit_admin_request(struct pqi_ctrl_info *ctrl_info, + struct pqi_general_admin_request *request) +{ + struct pqi_admin_queues *admin_queues; + void *next_element; + pqi_index_t iq_pi; + + admin_queues = &ctrl_info->admin_queues; + iq_pi = admin_queues->iq_pi_copy; + + next_element = admin_queues->iq_element_array + + (iq_pi * PQI_ADMIN_IQ_ELEMENT_LENGTH); + + memcpy(next_element, request, sizeof(*request)); + + iq_pi = (iq_pi + 1) % PQI_ADMIN_IQ_NUM_ELEMENTS; + admin_queues->iq_pi_copy = iq_pi; + + /* + * This write notifies the controller that an IU is available to be + * processed. 
+ */ + writel(iq_pi, admin_queues->iq_pi); +} + +#define PQI_ADMIN_REQUEST_TIMEOUT_SECS 60 + +static int pqi_poll_for_admin_response(struct pqi_ctrl_info *ctrl_info, + struct pqi_general_admin_response *response) +{ + struct pqi_admin_queues *admin_queues; + pqi_index_t oq_pi; + pqi_index_t oq_ci; + unsigned long timeout; + + admin_queues = &ctrl_info->admin_queues; + oq_ci = admin_queues->oq_ci_copy; + + timeout = (PQI_ADMIN_REQUEST_TIMEOUT_SECS * HZ) + jiffies; + + while (1) { + oq_pi = readl(admin_queues->oq_pi); + if (oq_pi != oq_ci) + break; + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for admin response\n"); + return -ETIMEDOUT; + } + if (!sis_is_firmware_running(ctrl_info)) + return -ENXIO; + msleep(1); + } + + memcpy(response, admin_queues->oq_element_array + + (oq_ci * PQI_ADMIN_OQ_ELEMENT_LENGTH), sizeof(*response)); + + oq_ci = (oq_ci + 1) % PQI_ADMIN_OQ_NUM_ELEMENTS; + admin_queues->oq_ci_copy = oq_ci; + writel(oq_ci, admin_queues->oq_ci); + + return 0; +} + +static void pqi_start_io(struct pqi_ctrl_info *ctrl_info, + struct pqi_queue_group *queue_group, enum pqi_io_path path, + struct pqi_io_request *io_request) +{ + struct pqi_io_request *next; + void *next_element; + pqi_index_t iq_pi; + pqi_index_t iq_ci; + size_t iu_length; + unsigned long flags; + unsigned int num_elements_needed; + unsigned int num_elements_to_end_of_queue; + size_t copy_count; + struct pqi_iu_header *request; + + spin_lock_irqsave(&queue_group->submit_lock[path], flags); + + if (io_request) { + io_request->queue_group = queue_group; + list_add_tail(&io_request->request_list_entry, + &queue_group->request_list[path]); + } + + iq_pi = queue_group->iq_pi_copy[path]; + + list_for_each_entry_safe(io_request, next, + &queue_group->request_list[path], request_list_entry) { + + request = io_request->iu; + + iu_length = get_unaligned_le16(&request->iu_length) + + PQI_REQUEST_HEADER_LENGTH; + num_elements_needed = + DIV_ROUND_UP(iu_length, + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + iq_ci = readl(queue_group->iq_ci[path]); + + if (num_elements_needed > pqi_num_elements_free(iq_pi, iq_ci, + ctrl_info->num_elements_per_iq)) + break; + + put_unaligned_le16(queue_group->oq_id, + &request->response_queue_id); + + next_element = queue_group->iq_element_array[path] + + (iq_pi * PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + num_elements_to_end_of_queue = + ctrl_info->num_elements_per_iq - iq_pi; + + if (num_elements_needed <= num_elements_to_end_of_queue) { + memcpy(next_element, request, iu_length); + } else { + copy_count = num_elements_to_end_of_queue * + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH; + memcpy(next_element, request, copy_count); + memcpy(queue_group->iq_element_array[path], + (u8 *)request + copy_count, + iu_length - copy_count); + } + + iq_pi = (iq_pi + num_elements_needed) % + ctrl_info->num_elements_per_iq; + + list_del(&io_request->request_list_entry); + } + + if (iq_pi != queue_group->iq_pi_copy[path]) { + queue_group->iq_pi_copy[path] = iq_pi; + /* + * This write notifies the controller that one or more IUs are + * available to be processed. 
+ */ + writel(iq_pi, queue_group->iq_pi[path]); + } + + spin_unlock_irqrestore(&queue_group->submit_lock[path], flags); +} + +#define PQI_WAIT_FOR_COMPLETION_IO_TIMEOUT_SECS 10 + +static int pqi_wait_for_completion_io(struct pqi_ctrl_info *ctrl_info, + struct completion *wait) +{ + int rc; + + while (1) { + if (wait_for_completion_io_timeout(wait, + PQI_WAIT_FOR_COMPLETION_IO_TIMEOUT_SECS * HZ)) { + rc = 0; + break; + } + + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) { + rc = -ENXIO; + break; + } + } + + return rc; +} + +static void pqi_raid_synchronous_complete(struct pqi_io_request *io_request, + void *context) +{ + struct completion *waiting = context; + + complete(waiting); +} + +static int pqi_process_raid_io_error_synchronous( + struct pqi_raid_error_info *error_info) +{ + int rc = -EIO; + + switch (error_info->data_out_result) { + case PQI_DATA_IN_OUT_GOOD: + if (error_info->status == SAM_STAT_GOOD) + rc = 0; + break; + case PQI_DATA_IN_OUT_UNDERFLOW: + if (error_info->status == SAM_STAT_GOOD || + error_info->status == SAM_STAT_CHECK_CONDITION) + rc = 0; + break; + case PQI_DATA_IN_OUT_ABORTED: + rc = PQI_CMD_STATUS_ABORTED; + break; + } + + return rc; +} + +static inline bool pqi_is_blockable_request(struct pqi_iu_header *request) +{ + return (request->driver_flags & PQI_DRIVER_NONBLOCKABLE_REQUEST) == 0; +} + +static int pqi_submit_raid_request_synchronous(struct pqi_ctrl_info *ctrl_info, + struct pqi_iu_header *request, unsigned int flags, + struct pqi_raid_error_info *error_info) +{ + int rc = 0; + struct pqi_io_request *io_request; + size_t iu_length; + DECLARE_COMPLETION_ONSTACK(wait); + + if (flags & PQI_SYNC_FLAGS_INTERRUPTABLE) { + if (down_interruptible(&ctrl_info->sync_request_sem)) + return -ERESTARTSYS; + } else { + down(&ctrl_info->sync_request_sem); + } + + pqi_ctrl_busy(ctrl_info); + /* + * Wait for other admin queue updates such as: + * config table changes, OFA memory updates, ... 
+ */ + if (pqi_is_blockable_request(request)) + pqi_wait_if_ctrl_blocked(ctrl_info); + + if (pqi_ctrl_offline(ctrl_info)) { + rc = -ENXIO; + goto out; + } + + io_request = pqi_alloc_io_request(ctrl_info, NULL); + + put_unaligned_le16(io_request->index, + &(((struct pqi_raid_path_request *)request)->request_id)); + + if (request->iu_type == PQI_REQUEST_IU_RAID_PATH_IO) + ((struct pqi_raid_path_request *)request)->error_index = + ((struct pqi_raid_path_request *)request)->request_id; + + iu_length = get_unaligned_le16(&request->iu_length) + + PQI_REQUEST_HEADER_LENGTH; + memcpy(io_request->iu, request, iu_length); + + io_request->io_complete_callback = pqi_raid_synchronous_complete; + io_request->context = &wait; + + pqi_start_io(ctrl_info, &ctrl_info->queue_groups[PQI_DEFAULT_QUEUE_GROUP], RAID_PATH, + io_request); + + pqi_wait_for_completion_io(ctrl_info, &wait); + + if (error_info) { + if (io_request->error_info) + memcpy(error_info, io_request->error_info, sizeof(*error_info)); + else + memset(error_info, 0, sizeof(*error_info)); + } else if (rc == 0 && io_request->error_info) { + rc = pqi_process_raid_io_error_synchronous(io_request->error_info); + } + + pqi_free_io_request(io_request); + +out: + pqi_ctrl_unbusy(ctrl_info); + up(&ctrl_info->sync_request_sem); + + return rc; +} + +static int pqi_validate_admin_response( + struct pqi_general_admin_response *response, u8 expected_function_code) +{ + if (response->header.iu_type != PQI_RESPONSE_IU_GENERAL_ADMIN) + return -EINVAL; + + if (get_unaligned_le16(&response->header.iu_length) != + PQI_GENERAL_ADMIN_IU_LENGTH) + return -EINVAL; + + if (response->function_code != expected_function_code) + return -EINVAL; + + if (response->status != PQI_GENERAL_ADMIN_STATUS_SUCCESS) + return -EINVAL; + + return 0; +} + +static int pqi_submit_admin_request_synchronous( + struct pqi_ctrl_info *ctrl_info, + struct pqi_general_admin_request *request, + struct pqi_general_admin_response *response) +{ + int rc; + + pqi_submit_admin_request(ctrl_info, request); + + rc = pqi_poll_for_admin_response(ctrl_info, response); + + if (rc == 0) + rc = pqi_validate_admin_response(response, request->function_code); + + return rc; +} + +static int pqi_report_device_capability(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct pqi_general_admin_request request; + struct pqi_general_admin_response response; + struct pqi_device_capability *capability; + struct pqi_iu_layer_descriptor *sop_iu_layer_descriptor; + + capability = kmalloc(sizeof(*capability), GFP_KERNEL); + if (!capability) + return -ENOMEM; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = + PQI_GENERAL_ADMIN_FUNCTION_REPORT_DEVICE_CAPABILITY; + put_unaligned_le32(sizeof(*capability), + &request.data.report_device_capability.buffer_length); + + rc = pqi_map_single(ctrl_info->pci_dev, + &request.data.report_device_capability.sg_descriptor, + capability, sizeof(*capability), + DMA_FROM_DEVICE); + if (rc) + goto out; + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, &response); + + pqi_pci_unmap(ctrl_info->pci_dev, + &request.data.report_device_capability.sg_descriptor, 1, + DMA_FROM_DEVICE); + + if (rc) + goto out; + + if (response.status != PQI_GENERAL_ADMIN_STATUS_SUCCESS) { + rc = -EIO; + goto out; + } + + ctrl_info->max_inbound_queues = + get_unaligned_le16(&capability->max_inbound_queues); + ctrl_info->max_elements_per_iq = + 
get_unaligned_le16(&capability->max_elements_per_iq); + ctrl_info->max_iq_element_length = + get_unaligned_le16(&capability->max_iq_element_length) + * 16; + ctrl_info->max_outbound_queues = + get_unaligned_le16(&capability->max_outbound_queues); + ctrl_info->max_elements_per_oq = + get_unaligned_le16(&capability->max_elements_per_oq); + ctrl_info->max_oq_element_length = + get_unaligned_le16(&capability->max_oq_element_length) + * 16; + + sop_iu_layer_descriptor = + &capability->iu_layer_descriptors[PQI_PROTOCOL_SOP]; + + ctrl_info->max_inbound_iu_length_per_firmware = + get_unaligned_le16( + &sop_iu_layer_descriptor->max_inbound_iu_length); + ctrl_info->inbound_spanning_supported = + sop_iu_layer_descriptor->inbound_spanning_supported; + ctrl_info->outbound_spanning_supported = + sop_iu_layer_descriptor->outbound_spanning_supported; + +out: + kfree(capability); + + return rc; +} + +static int pqi_validate_device_capability(struct pqi_ctrl_info *ctrl_info) +{ + if (ctrl_info->max_iq_element_length < + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) { + dev_err(&ctrl_info->pci_dev->dev, + "max. inbound queue element length of %d is less than the required length of %d\n", + ctrl_info->max_iq_element_length, + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + return -EINVAL; + } + + if (ctrl_info->max_oq_element_length < + PQI_OPERATIONAL_OQ_ELEMENT_LENGTH) { + dev_err(&ctrl_info->pci_dev->dev, + "max. outbound queue element length of %d is less than the required length of %d\n", + ctrl_info->max_oq_element_length, + PQI_OPERATIONAL_OQ_ELEMENT_LENGTH); + return -EINVAL; + } + + if (ctrl_info->max_inbound_iu_length_per_firmware < + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) { + dev_err(&ctrl_info->pci_dev->dev, + "max. inbound IU length of %u is less than the min. required length of %d\n", + ctrl_info->max_inbound_iu_length_per_firmware, + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + return -EINVAL; + } + + if (!ctrl_info->inbound_spanning_supported) { + dev_err(&ctrl_info->pci_dev->dev, + "the controller does not support inbound spanning\n"); + return -EINVAL; + } + + if (ctrl_info->outbound_spanning_supported) { + dev_err(&ctrl_info->pci_dev->dev, + "the controller supports outbound spanning but this driver does not\n"); + return -EINVAL; + } + + return 0; +} + +static int pqi_create_event_queue(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct pqi_event_queue *event_queue; + struct pqi_general_admin_request request; + struct pqi_general_admin_response response; + + event_queue = &ctrl_info->event_queue; + + /* + * Create OQ (Outbound Queue - device to host queue) to dedicate + * to events. 
+ */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CREATE_OQ; + put_unaligned_le16(event_queue->oq_id, + &request.data.create_operational_oq.queue_id); + put_unaligned_le64((u64)event_queue->oq_element_array_bus_addr, + &request.data.create_operational_oq.element_array_addr); + put_unaligned_le64((u64)event_queue->oq_pi_bus_addr, + &request.data.create_operational_oq.pi_addr); + put_unaligned_le16(PQI_NUM_EVENT_QUEUE_ELEMENTS, + &request.data.create_operational_oq.num_elements); + put_unaligned_le16(PQI_EVENT_OQ_ELEMENT_LENGTH / 16, + &request.data.create_operational_oq.element_length); + request.data.create_operational_oq.queue_protocol = PQI_PROTOCOL_SOP; + put_unaligned_le16(event_queue->int_msg_num, + &request.data.create_operational_oq.int_msg_num); + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, &response); + if (rc) + return rc; + + event_queue->oq_ci = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + get_unaligned_le64( + &response.data.create_operational_oq.oq_ci_offset); + + return 0; +} + +static int pqi_create_queue_group(struct pqi_ctrl_info *ctrl_info, + unsigned int group_number) +{ + int rc; + struct pqi_queue_group *queue_group; + struct pqi_general_admin_request request; + struct pqi_general_admin_response response; + + queue_group = &ctrl_info->queue_groups[group_number]; + + /* + * Create IQ (Inbound Queue - host to device queue) for + * RAID path. + */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CREATE_IQ; + put_unaligned_le16(queue_group->iq_id[RAID_PATH], + &request.data.create_operational_iq.queue_id); + put_unaligned_le64( + (u64)queue_group->iq_element_array_bus_addr[RAID_PATH], + &request.data.create_operational_iq.element_array_addr); + put_unaligned_le64((u64)queue_group->iq_ci_bus_addr[RAID_PATH], + &request.data.create_operational_iq.ci_addr); + put_unaligned_le16(ctrl_info->num_elements_per_iq, + &request.data.create_operational_iq.num_elements); + put_unaligned_le16(PQI_OPERATIONAL_IQ_ELEMENT_LENGTH / 16, + &request.data.create_operational_iq.element_length); + request.data.create_operational_iq.queue_protocol = PQI_PROTOCOL_SOP; + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, + &response); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating inbound RAID queue\n"); + return rc; + } + + queue_group->iq_pi[RAID_PATH] = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + get_unaligned_le64( + &response.data.create_operational_iq.iq_pi_offset); + + /* + * Create IQ (Inbound Queue - host to device queue) for + * Advanced I/O (AIO) path. 
+ */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CREATE_IQ; + put_unaligned_le16(queue_group->iq_id[AIO_PATH], + &request.data.create_operational_iq.queue_id); + put_unaligned_le64((u64)queue_group-> + iq_element_array_bus_addr[AIO_PATH], + &request.data.create_operational_iq.element_array_addr); + put_unaligned_le64((u64)queue_group->iq_ci_bus_addr[AIO_PATH], + &request.data.create_operational_iq.ci_addr); + put_unaligned_le16(ctrl_info->num_elements_per_iq, + &request.data.create_operational_iq.num_elements); + put_unaligned_le16(PQI_OPERATIONAL_IQ_ELEMENT_LENGTH / 16, + &request.data.create_operational_iq.element_length); + request.data.create_operational_iq.queue_protocol = PQI_PROTOCOL_SOP; + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, &response); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating inbound AIO queue\n"); + return rc; + } + + queue_group->iq_pi[AIO_PATH] = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + get_unaligned_le64( + &response.data.create_operational_iq.iq_pi_offset); + + /* + * Designate the 2nd IQ as the AIO path. By default, all IQs are + * assumed to be for RAID path I/O unless we change the queue's + * property. + */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CHANGE_IQ_PROPERTY; + put_unaligned_le16(queue_group->iq_id[AIO_PATH], + &request.data.change_operational_iq_properties.queue_id); + put_unaligned_le32(PQI_IQ_PROPERTY_IS_AIO_QUEUE, + &request.data.change_operational_iq_properties.vendor_specific); + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, + &response); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error changing queue property\n"); + return rc; + } + + /* + * Create OQ (Outbound Queue - device to host queue). 
+ */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CREATE_OQ; + put_unaligned_le16(queue_group->oq_id, + &request.data.create_operational_oq.queue_id); + put_unaligned_le64((u64)queue_group->oq_element_array_bus_addr, + &request.data.create_operational_oq.element_array_addr); + put_unaligned_le64((u64)queue_group->oq_pi_bus_addr, + &request.data.create_operational_oq.pi_addr); + put_unaligned_le16(ctrl_info->num_elements_per_oq, + &request.data.create_operational_oq.num_elements); + put_unaligned_le16(PQI_OPERATIONAL_OQ_ELEMENT_LENGTH / 16, + &request.data.create_operational_oq.element_length); + request.data.create_operational_oq.queue_protocol = PQI_PROTOCOL_SOP; + put_unaligned_le16(queue_group->int_msg_num, + &request.data.create_operational_oq.int_msg_num); + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, + &response); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating outbound queue\n"); + return rc; + } + + queue_group->oq_ci = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + get_unaligned_le64( + &response.data.create_operational_oq.oq_ci_offset); + + return 0; +} + +static int pqi_create_queues(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + unsigned int i; + + rc = pqi_create_event_queue(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating event queue\n"); + return rc; + } + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + rc = pqi_create_queue_group(ctrl_info, i); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating queue group number %u/%u\n", + i, ctrl_info->num_queue_groups); + return rc; + } + } + + return 0; +} + +#define PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH \ + (offsetof(struct pqi_event_config, descriptors) + \ + (PQI_MAX_EVENT_DESCRIPTORS * sizeof(struct pqi_event_descriptor))) + +static int pqi_configure_events(struct pqi_ctrl_info *ctrl_info, + bool enable_events) +{ + int rc; + unsigned int i; + struct pqi_event_config *event_config; + struct pqi_event_descriptor *event_descriptor; + struct pqi_general_management_request request; + + event_config = kmalloc(PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + GFP_KERNEL); + if (!event_config) + return -ENOMEM; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_REPORT_VENDOR_EVENT_CONFIG; + put_unaligned_le16(offsetof(struct pqi_general_management_request, + data.report_event_configuration.sg_descriptors[1]) - + PQI_REQUEST_HEADER_LENGTH, &request.header.iu_length); + put_unaligned_le32(PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + &request.data.report_event_configuration.buffer_length); + + rc = pqi_map_single(ctrl_info->pci_dev, + request.data.report_event_configuration.sg_descriptors, + event_config, PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + DMA_FROM_DEVICE); + if (rc) + goto out; + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); + + pqi_pci_unmap(ctrl_info->pci_dev, + request.data.report_event_configuration.sg_descriptors, 1, + DMA_FROM_DEVICE); + + if (rc) + goto out; + + for (i = 0; i < event_config->num_event_descriptors; i++) { + event_descriptor = &event_config->descriptors[i]; + if (enable_events && + pqi_is_supported_event(event_descriptor->event_type)) + put_unaligned_le16(ctrl_info->event_queue.oq_id, + &event_descriptor->oq_id); + else + put_unaligned_le16(0, &event_descriptor->oq_id); + } 
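+	/*
+	 * At this point the reported configuration has been edited in place:
+	 * when events are being enabled, each supported event type is pointed
+	 * at the driver's event queue and everything else is detached
+	 * (oq_id 0).  The modified buffer is written back below with a
+	 * SET_VENDOR_EVENT_CONFIG request, this time mapped DMA_TO_DEVICE.
+	 */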
+ + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_SET_VENDOR_EVENT_CONFIG; + put_unaligned_le16(offsetof(struct pqi_general_management_request, + data.report_event_configuration.sg_descriptors[1]) - + PQI_REQUEST_HEADER_LENGTH, &request.header.iu_length); + put_unaligned_le32(PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + &request.data.report_event_configuration.buffer_length); + + rc = pqi_map_single(ctrl_info->pci_dev, + request.data.report_event_configuration.sg_descriptors, + event_config, PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + DMA_TO_DEVICE); + if (rc) + goto out; + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); + + pqi_pci_unmap(ctrl_info->pci_dev, + request.data.report_event_configuration.sg_descriptors, 1, + DMA_TO_DEVICE); + +out: + kfree(event_config); + + return rc; +} + +static inline int pqi_enable_events(struct pqi_ctrl_info *ctrl_info) +{ + return pqi_configure_events(ctrl_info, true); +} + +static void pqi_free_all_io_requests(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct device *dev; + size_t sg_chain_buffer_length; + struct pqi_io_request *io_request; + + if (!ctrl_info->io_request_pool) + return; + + dev = &ctrl_info->pci_dev->dev; + sg_chain_buffer_length = ctrl_info->sg_chain_buffer_length; + io_request = ctrl_info->io_request_pool; + + for (i = 0; i < ctrl_info->max_io_slots; i++) { + kfree(io_request->iu); + if (!io_request->sg_chain_buffer) + break; + dma_free_coherent(dev, sg_chain_buffer_length, + io_request->sg_chain_buffer, + io_request->sg_chain_buffer_dma_handle); + io_request++; + } + + kfree(ctrl_info->io_request_pool); + ctrl_info->io_request_pool = NULL; +} + +static inline int pqi_alloc_error_buffer(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->error_buffer = dma_zalloc_coherent(&ctrl_info->pci_dev->dev, + ctrl_info->error_buffer_length, + &ctrl_info->error_buffer_dma_handle, GFP_KERNEL); + + if (!ctrl_info->error_buffer) + return -ENOMEM; + + return 0; +} + +static int pqi_alloc_io_resources(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + void *sg_chain_buffer; + size_t sg_chain_buffer_length; + dma_addr_t sg_chain_buffer_dma_handle; + struct device *dev; + struct pqi_io_request *io_request; + + ctrl_info->io_request_pool = kzalloc(ctrl_info->max_io_slots * + sizeof(ctrl_info->io_request_pool[0]), GFP_KERNEL); + + if (!ctrl_info->io_request_pool) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate I/O request pool\n"); + goto error; + } + + dev = &ctrl_info->pci_dev->dev; + sg_chain_buffer_length = ctrl_info->sg_chain_buffer_length; + io_request = ctrl_info->io_request_pool; + + for (i = 0; i < ctrl_info->max_io_slots; i++) { + io_request->iu = kmalloc(ctrl_info->max_inbound_iu_length, GFP_KERNEL); + + if (!io_request->iu) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate IU buffers\n"); + goto error; + } + + sg_chain_buffer = dma_alloc_coherent(dev, + sg_chain_buffer_length, &sg_chain_buffer_dma_handle, + GFP_KERNEL); + + if (!sg_chain_buffer) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate PQI scatter-gather chain buffers\n"); + goto error; + } + + io_request->index = i; + io_request->sg_chain_buffer = sg_chain_buffer; + io_request->sg_chain_buffer_dma_handle = sg_chain_buffer_dma_handle; + io_request++; + } + + return 0; + +error: + pqi_free_all_io_requests(ctrl_info); + + return -ENOMEM; +} + +/* + * Calculate required resources that are sized based on max. outstanding + * requests and max. transfer size. 
+ */ + +static void pqi_calculate_io_resources(struct pqi_ctrl_info *ctrl_info) +{ + u32 max_transfer_size; + u32 max_sg_entries; + + ctrl_info->scsi_ml_can_queue = + ctrl_info->max_outstanding_requests - PQI_RESERVED_IO_SLOTS; + ctrl_info->max_io_slots = ctrl_info->max_outstanding_requests; +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT + ctrl_info->per_cpu_factor = ctrl_info->max_io_slots / num_online_cpus(); +#endif + + ctrl_info->error_buffer_length = + ctrl_info->max_io_slots * PQI_ERROR_BUFFER_ELEMENT_LENGTH; + + if (reset_devices) + max_transfer_size = min(ctrl_info->max_transfer_size, + PQI_MAX_TRANSFER_SIZE_KDUMP); + else + max_transfer_size = min(ctrl_info->max_transfer_size, + PQI_MAX_TRANSFER_SIZE); + + max_sg_entries = max_transfer_size / PAGE_SIZE; + + /* +1 to cover when the buffer is not page-aligned. */ + max_sg_entries++; + + max_sg_entries = min(ctrl_info->max_sg_entries, max_sg_entries); + + max_transfer_size = (max_sg_entries - 1) * PAGE_SIZE; + + ctrl_info->sg_chain_buffer_length = + (max_sg_entries * sizeof(struct pqi_sg_descriptor)) + + PQI_EXTRA_SGL_MEMORY; + ctrl_info->sg_tablesize = max_sg_entries; + ctrl_info->max_sectors = max_transfer_size / 512; +} + +static void pqi_calculate_queue_resources(struct pqi_ctrl_info *ctrl_info) +{ + int num_queue_groups; + u16 num_elements_per_iq; + u16 num_elements_per_oq; + + if (reset_devices) { + num_queue_groups = 1; + } else { + int num_cpus; + int max_queue_groups; + + max_queue_groups = min(ctrl_info->max_inbound_queues / 2, + ctrl_info->max_outbound_queues - 1); + max_queue_groups = min(max_queue_groups, PQI_MAX_QUEUE_GROUPS); + + num_cpus = num_online_cpus(); + num_queue_groups = min(num_cpus, ctrl_info->max_msix_vectors); + num_queue_groups = min(num_queue_groups, max_queue_groups); + } + + ctrl_info->num_queue_groups = num_queue_groups; + + /* + * Make sure that the max. inbound IU length is an even multiple + * of our inbound element length. + */ + ctrl_info->max_inbound_iu_length = + (ctrl_info->max_inbound_iu_length_per_firmware / + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) * + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH; + + num_elements_per_iq = + (ctrl_info->max_inbound_iu_length / + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + /* Add one because one element in each queue is unusable. 
*/ + num_elements_per_iq++; + + num_elements_per_iq = min(num_elements_per_iq, + ctrl_info->max_elements_per_iq); + + num_elements_per_oq = ((num_elements_per_iq - 1) * 2) + 1; + num_elements_per_oq = min(num_elements_per_oq, + ctrl_info->max_elements_per_oq); + + ctrl_info->num_elements_per_iq = num_elements_per_iq; + ctrl_info->num_elements_per_oq = num_elements_per_oq; + + ctrl_info->max_sg_per_iu = + ((ctrl_info->max_inbound_iu_length - + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) / + sizeof(struct pqi_sg_descriptor)) + + PQI_MAX_EMBEDDED_SG_DESCRIPTORS; + + ctrl_info->max_sg_per_r56_iu = + ((ctrl_info->max_inbound_iu_length - + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) / + sizeof(struct pqi_sg_descriptor)) + + PQI_MAX_EMBEDDED_R56_SG_DESCRIPTORS; +} + +static inline void pqi_set_sg_descriptor(struct pqi_sg_descriptor *sg_descriptor, + struct scatterlist *sg) +{ + u64 address = (u64)sg_dma_address(sg); + unsigned int length = sg_dma_len(sg); + + put_unaligned_le64(address, &sg_descriptor->address); + put_unaligned_le32(length, &sg_descriptor->length); + put_unaligned_le32(0, &sg_descriptor->flags); +} + +static unsigned int pqi_build_sg_list(struct pqi_sg_descriptor *sg_descriptor, + struct scatterlist *sg, int sg_count, struct pqi_io_request *io_request, + int max_sg_per_iu, bool *chained) +{ + int i; + unsigned int num_sg_in_iu; + + *chained = false; + i = 0; + num_sg_in_iu = 0; + max_sg_per_iu--; /* Subtract 1 to leave room for chain marker. */ + + while (1) { + pqi_set_sg_descriptor(sg_descriptor, sg); + if (!*chained) + num_sg_in_iu++; + i++; + if (i == sg_count) + break; + sg_descriptor++; + if (i == max_sg_per_iu) { + put_unaligned_le64((u64)io_request->sg_chain_buffer_dma_handle, + &sg_descriptor->address); + put_unaligned_le32((sg_count - num_sg_in_iu) * sizeof(*sg_descriptor), + &sg_descriptor->length); + put_unaligned_le32(CISS_SG_CHAIN, &sg_descriptor->flags); + *chained = true; + num_sg_in_iu++; + sg_descriptor = io_request->sg_chain_buffer; + } + sg = sg_next(sg); + } + + put_unaligned_le32(CISS_SG_LAST, &sg_descriptor->flags); + + return num_sg_in_iu; +} + +static int pqi_build_raid_sg_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_raid_path_request *request, struct scsi_cmnd *scmd, + struct pqi_io_request *io_request) +{ + u16 iu_length; + int sg_count; + bool chained; + unsigned int num_sg_in_iu; + struct scatterlist *sg; + struct pqi_sg_descriptor *sg_descriptor; + + sg_count = scsi_dma_map(scmd); + if (sg_count < 0) + return sg_count; + + iu_length = offsetof(struct pqi_raid_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + + if (sg_count == 0) + goto out; + + sg = scsi_sglist(scmd); + sg_descriptor = request->sg_descriptors; + + num_sg_in_iu = pqi_build_sg_list(sg_descriptor, sg, sg_count, io_request, + ctrl_info->max_sg_per_iu, &chained); + + request->partial = chained; + iu_length += num_sg_in_iu * sizeof(*sg_descriptor); + +out: + put_unaligned_le16(iu_length, &request->header.iu_length); + + return 0; +} + +static int pqi_build_aio_r1_sg_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_aio_r1_path_request *request, struct scsi_cmnd *scmd, + struct pqi_io_request *io_request) +{ + u16 iu_length; + int sg_count; + bool chained; + unsigned int num_sg_in_iu; + struct scatterlist *sg; + struct pqi_sg_descriptor *sg_descriptor; + + sg_count = scsi_dma_map(scmd); + if (sg_count < 0) + return sg_count; + + iu_length = offsetof(struct pqi_aio_r1_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + num_sg_in_iu = 0; + + if (sg_count == 0) + goto out; + + sg 
= scsi_sglist(scmd); + sg_descriptor = request->sg_descriptors; + + num_sg_in_iu = pqi_build_sg_list(sg_descriptor, sg, sg_count, io_request, + ctrl_info->max_sg_per_iu, &chained); + + request->partial = chained; + iu_length += num_sg_in_iu * sizeof(*sg_descriptor); + +out: + put_unaligned_le16(iu_length, &request->header.iu_length); + request->num_sg_descriptors = num_sg_in_iu; + + return 0; +} + +static int pqi_build_aio_r56_sg_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_aio_r56_path_request *request, struct scsi_cmnd *scmd, + struct pqi_io_request *io_request) +{ + u16 iu_length; + int sg_count; + bool chained; + unsigned int num_sg_in_iu; + struct scatterlist *sg; + struct pqi_sg_descriptor *sg_descriptor; + + sg_count = scsi_dma_map(scmd); + if (sg_count < 0) + return sg_count; + + iu_length = offsetof(struct pqi_aio_r56_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + num_sg_in_iu = 0; + + if (sg_count == 0) + goto out; + + sg = scsi_sglist(scmd); + sg_descriptor = request->sg_descriptors; + + num_sg_in_iu = pqi_build_sg_list(sg_descriptor, sg, sg_count, io_request, + ctrl_info->max_sg_per_r56_iu, &chained); + + request->partial = chained; + iu_length += num_sg_in_iu * sizeof(*sg_descriptor); + +out: + put_unaligned_le16(iu_length, &request->header.iu_length); + request->num_sg_descriptors = num_sg_in_iu; + + return 0; +} + +static int pqi_build_aio_sg_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_aio_path_request *request, struct scsi_cmnd *scmd, + struct pqi_io_request *io_request) +{ + u16 iu_length; + int sg_count; + bool chained; + unsigned int num_sg_in_iu; + struct scatterlist *sg; + struct pqi_sg_descriptor *sg_descriptor; + + sg_count = scsi_dma_map(scmd); + if (sg_count < 0) + return sg_count; + + iu_length = offsetof(struct pqi_aio_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + num_sg_in_iu = 0; + + if (sg_count == 0) + goto out; + + sg = scsi_sglist(scmd); + sg_descriptor = request->sg_descriptors; + + num_sg_in_iu = pqi_build_sg_list(sg_descriptor, sg, sg_count, io_request, + ctrl_info->max_sg_per_iu, &chained); + + request->partial = chained; + iu_length += num_sg_in_iu * sizeof(*sg_descriptor); + +out: + put_unaligned_le16(iu_length, &request->header.iu_length); + request->num_sg_descriptors = num_sg_in_iu; + + return 0; +} + +static void pqi_raid_io_complete(struct pqi_io_request *io_request, + void *context) +{ + struct scsi_cmnd *scmd; + + scmd = io_request->scmd; + pqi_free_io_request(io_request); + scsi_dma_unmap(scmd); + pqi_scsi_done(scmd); +} + +static int pqi_raid_submit_io(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, + struct pqi_queue_group *queue_group, bool io_high_prio) +{ + int rc; + size_t cdb_length; + struct pqi_io_request *io_request; + struct pqi_raid_path_request *request; + + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return SCSI_MLQUEUE_HOST_BUSY; + + io_request->io_complete_callback = pqi_raid_io_complete; + io_request->scmd = scmd; + + request = io_request->iu; + memset(request, 0, offsetof(struct pqi_raid_path_request, sg_descriptors)); + + request->header.iu_type = PQI_REQUEST_IU_RAID_PATH_IO; + put_unaligned_le32(scsi_bufflen(scmd), &request->buffer_length); + request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + request->command_priority = io_high_prio; + put_unaligned_le16(io_request->index, &request->request_id); + request->error_index = request->request_id; + memcpy(request->lun_number, device->scsi3addr, 
sizeof(request->lun_number)); + request->ml_device_lun_number = (u8)scmd->device->lun; + + cdb_length = min_t(size_t, scmd->cmd_len, sizeof(request->cdb)); + memcpy(request->cdb, scmd->cmnd, cdb_length); + + switch (cdb_length) { + case 6: + case 10: + case 12: + case 16: + request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_0; + break; + case 20: + request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_4; + break; + case 24: + request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_8; + break; + case 28: + request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_12; + break; + case 32: + default: + request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_16; + break; + } + +#if TORTUGA + scmd->sc_data_direction = DMA_BIDIRECTIONAL; +#endif + + switch (scmd->sc_data_direction) { + case DMA_FROM_DEVICE: + request->data_direction = SOP_READ_FLAG; + break; + case DMA_TO_DEVICE: + request->data_direction = SOP_WRITE_FLAG; + break; + case DMA_NONE: + request->data_direction = SOP_NO_DIRECTION_FLAG; + break; + case DMA_BIDIRECTIONAL: + request->data_direction = SOP_BIDIRECTIONAL; + break; + default: + dev_err(&ctrl_info->pci_dev->dev, + "unknown data direction: %d\n", + scmd->sc_data_direction); + BUG(); + break; + } + + rc = pqi_build_raid_sg_list(ctrl_info, request, scmd, io_request); + if (rc) { + pqi_free_io_request(io_request); + return SCSI_MLQUEUE_HOST_BUSY; + } + + pqi_start_io(ctrl_info, queue_group, RAID_PATH, io_request); + + return 0; +} + +static inline int pqi_raid_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, + struct pqi_queue_group *queue_group) +{ + bool io_high_prio; + + io_high_prio = pqi_is_io_high_priority(device, scmd); + + return pqi_raid_submit_io(ctrl_info, device, scmd, queue_group, io_high_prio); +} + +static bool pqi_raid_bypass_retry_needed(struct pqi_io_request *io_request) +{ + struct scsi_cmnd *scmd; + struct pqi_scsi_dev *device; + struct pqi_ctrl_info *ctrl_info; + + if (!io_request->raid_bypass) + return false; + + scmd = io_request->scmd; + if ((scmd->result & 0xff) == SAM_STAT_GOOD) + return false; + if (host_byte(scmd->result) == DID_NO_CONNECT) + return false; + + device = scmd->device->hostdata; + if (pqi_device_offline(device) || pqi_device_in_remove(device)) + return false; + + ctrl_info = shost_to_hba(scmd->device->host); + if (pqi_ctrl_offline(ctrl_info)) + return false; + + return true; +} + +static void pqi_aio_io_complete(struct pqi_io_request *io_request, + void *context) +{ + struct scsi_cmnd *scmd; + + scmd = io_request->scmd; + scsi_dma_unmap(scmd); + if (io_request->status == -EAGAIN || pqi_raid_bypass_retry_needed(io_request)) { + set_host_byte(scmd, DID_IMM_RETRY); + PQI_SCSI_CMD_RESIDUAL(scmd)++; + } + + pqi_free_io_request(io_request); + pqi_scsi_done(scmd); +} + +static inline int pqi_aio_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, + struct pqi_queue_group *queue_group) +{ + bool io_high_prio; + + io_high_prio = pqi_is_io_high_priority(device, scmd); + + return pqi_aio_submit_io(ctrl_info, scmd, device->aio_handle, + scmd->cmnd, scmd->cmd_len, queue_group, NULL, + false, io_high_prio); +} + +static int pqi_aio_submit_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, u32 aio_handle, u8 *cdb, + unsigned int cdb_length, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, bool raid_bypass, + bool io_high_prio) +{ + int rc; + struct pqi_io_request 
*io_request; + struct pqi_aio_path_request *request; + struct pqi_scsi_dev *device; + + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return SCSI_MLQUEUE_HOST_BUSY; + + io_request->io_complete_callback = pqi_aio_io_complete; + io_request->scmd = scmd; + io_request->raid_bypass = raid_bypass; + + request = io_request->iu; + memset(request, 0, offsetof(struct pqi_aio_path_request, sg_descriptors)); + + request->header.iu_type = PQI_REQUEST_IU_AIO_PATH_IO; + put_unaligned_le32(aio_handle, &request->nexus_id); + put_unaligned_le32(scsi_bufflen(scmd), &request->buffer_length); + request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + request->command_priority = io_high_prio; + put_unaligned_le16(io_request->index, &request->request_id); + request->error_index = request->request_id; + device = scmd->device->hostdata; + if (!pqi_is_logical_device(device) && ctrl_info->multi_lun_device_supported) + put_unaligned_le64(((scmd->device->lun) << 8), &request->lun_number); + if (cdb_length > sizeof(request->cdb)) + cdb_length = sizeof(request->cdb); + request->cdb_length = cdb_length; + memcpy(request->cdb, cdb, cdb_length); + + switch (scmd->sc_data_direction) { + case DMA_TO_DEVICE: + request->data_direction = SOP_READ_FLAG; + break; + case DMA_FROM_DEVICE: + request->data_direction = SOP_WRITE_FLAG; + break; + case DMA_NONE: + request->data_direction = SOP_NO_DIRECTION_FLAG; + break; + case DMA_BIDIRECTIONAL: + request->data_direction = SOP_BIDIRECTIONAL; + break; + default: + dev_err(&ctrl_info->pci_dev->dev, + "unknown data direction: %d\n", + scmd->sc_data_direction); + BUG(); + break; + } + + if (encryption_info) { + request->encryption_enable = true; + put_unaligned_le16(encryption_info->data_encryption_key_index, + &request->data_encryption_key_index); + put_unaligned_le32(encryption_info->encrypt_tweak_lower, + &request->encrypt_tweak_lower); + put_unaligned_le32(encryption_info->encrypt_tweak_upper, + &request->encrypt_tweak_upper); + } + + rc = pqi_build_aio_sg_list(ctrl_info, request, scmd, io_request); + if (rc) { + pqi_free_io_request(io_request); + return SCSI_MLQUEUE_HOST_BUSY; + } + + pqi_start_io(ctrl_info, queue_group, AIO_PATH, io_request); + + return 0; +} + +static int pqi_aio_submit_r1_write_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, struct pqi_scsi_dev *device, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + int rc; + struct pqi_io_request *io_request; + struct pqi_aio_r1_path_request *r1_request; + + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return SCSI_MLQUEUE_HOST_BUSY; + + io_request->io_complete_callback = pqi_aio_io_complete; + io_request->scmd = scmd; + io_request->raid_bypass = true; + + r1_request = io_request->iu; + memset(r1_request, 0, offsetof(struct pqi_aio_r1_path_request, sg_descriptors)); + + r1_request->header.iu_type = PQI_REQUEST_IU_AIO_PATH_RAID1_IO; + put_unaligned_le16(*(u16 *)device->scsi3addr & 0x3fff, &r1_request->volume_id); + r1_request->num_drives = rmd->num_it_nexus_entries; + put_unaligned_le32(rmd->it_nexus[0], &r1_request->it_nexus_1); + put_unaligned_le32(rmd->it_nexus[1], &r1_request->it_nexus_2); + if (rmd->num_it_nexus_entries == 3) + put_unaligned_le32(rmd->it_nexus[2], &r1_request->it_nexus_3); + + put_unaligned_le32(scsi_bufflen(scmd), &r1_request->data_length); + r1_request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + put_unaligned_le16(io_request->index, &r1_request->request_id); 
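+	/* error_index mirrors request_id, as on the RAID and AIO I/O paths above. */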
+ r1_request->error_index = r1_request->request_id; + if (rmd->cdb_length > sizeof(r1_request->cdb)) + rmd->cdb_length = sizeof(r1_request->cdb); + r1_request->cdb_length = rmd->cdb_length; + memcpy(r1_request->cdb, rmd->cdb, rmd->cdb_length); + + /* + * The direction is always write. + * Note: a host write results in a controller read. + */ + r1_request->data_direction = SOP_READ_FLAG; + + if (encryption_info) { + r1_request->encryption_enable = true; + put_unaligned_le16(encryption_info->data_encryption_key_index, + &r1_request->data_encryption_key_index); + put_unaligned_le32(encryption_info->encrypt_tweak_lower, + &r1_request->encrypt_tweak_lower); + put_unaligned_le32(encryption_info->encrypt_tweak_upper, + &r1_request->encrypt_tweak_upper); + } + + rc = pqi_build_aio_r1_sg_list(ctrl_info, r1_request, scmd, io_request); + if (rc) { + pqi_free_io_request(io_request); + return SCSI_MLQUEUE_HOST_BUSY; + } + + pqi_start_io(ctrl_info, queue_group, AIO_PATH, io_request); + + return 0; +} + +static int pqi_aio_submit_r56_write_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, struct pqi_scsi_dev *device, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + int rc; + struct pqi_io_request *io_request; + struct pqi_aio_r56_path_request *r56_request; + + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return SCSI_MLQUEUE_HOST_BUSY; + io_request->io_complete_callback = pqi_aio_io_complete; + io_request->scmd = scmd; + io_request->raid_bypass = true; + + r56_request = io_request->iu; + memset(r56_request, 0, offsetof(struct pqi_aio_r56_path_request, sg_descriptors)); + + if (device->raid_level == SA_RAID_5 || device->raid_level == SA_RAID_51) + r56_request->header.iu_type = PQI_REQUEST_IU_AIO_PATH_RAID5_IO; + else + r56_request->header.iu_type = PQI_REQUEST_IU_AIO_PATH_RAID6_IO; + + put_unaligned_le16(*(u16 *)device->scsi3addr & 0x3fff, &r56_request->volume_id); + put_unaligned_le32(rmd->aio_handle, &r56_request->data_it_nexus); + put_unaligned_le32(rmd->p_parity_it_nexus, &r56_request->p_parity_it_nexus); + if (rmd->raid_level == SA_RAID_6) { + put_unaligned_le32(rmd->q_parity_it_nexus, &r56_request->q_parity_it_nexus); + r56_request->xor_multiplier = rmd->xor_mult; + } + put_unaligned_le32(scsi_bufflen(scmd), &r56_request->data_length); + r56_request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + put_unaligned_le64(rmd->row, &r56_request->row); + + put_unaligned_le16(io_request->index, &r56_request->request_id); + r56_request->error_index = r56_request->request_id; + + if (rmd->cdb_length > sizeof(r56_request->cdb)) + rmd->cdb_length = sizeof(r56_request->cdb); + r56_request->cdb_length = rmd->cdb_length; + memcpy(r56_request->cdb, rmd->cdb, rmd->cdb_length); + + /* + * The direction is always write. + * Note: a host write results in a controller read. 
+ */ + r56_request->data_direction = SOP_READ_FLAG; + + if (encryption_info) { + r56_request->encryption_enable = true; + put_unaligned_le16(encryption_info->data_encryption_key_index, + &r56_request->data_encryption_key_index); + put_unaligned_le32(encryption_info->encrypt_tweak_lower, + &r56_request->encrypt_tweak_lower); + put_unaligned_le32(encryption_info->encrypt_tweak_upper, + &r56_request->encrypt_tweak_upper); + } + + rc = pqi_build_aio_r56_sg_list(ctrl_info, r56_request, scmd, io_request); + if (rc) { + pqi_free_io_request(io_request); + return SCSI_MLQUEUE_HOST_BUSY; + } + + pqi_start_io(ctrl_info, queue_group, AIO_PATH, io_request); + + return 0; +} + +static inline bool pqi_is_bypass_eligible_request(struct scsi_cmnd *scmd) +{ + if (blk_rq_is_passthrough(PQI_SCSI_REQUEST(scmd))) + return false; + + return PQI_SCSI_CMD_RESIDUAL(scmd) == 0; +} + +/* + * This function gets called just before we hand the completed SCSI request + * back to the SML. + */ + +void pqi_prep_for_scsi_done(struct scsi_cmnd *scmd) +{ + struct pqi_scsi_dev *device; + struct pqi_ctrl_info *ctrl_info; + struct Scsi_Host *shost; + + if (!scmd->device) { + set_host_byte(scmd, DID_NO_CONNECT); + return; + } + + device = scmd->device->hostdata; + if (!device) { + set_host_byte(scmd, DID_NO_CONNECT); + return; + } + + shost = scmd->device->host; + ctrl_info = shost_to_hba(shost); + + atomic_dec(&device->scsi_cmds_outstanding[scmd->device->lun]); +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT + atomic_dec(&ctrl_info->total_scmds_outstanding); +#endif +} + +static bool pqi_is_parity_write_stream(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd) +{ + u32 oldest_jiffies; + u8 lru_index; + int i; + int rc; + struct pqi_scsi_dev *device; + struct pqi_stream_data *pqi_stream_data; + struct pqi_scsi_dev_raid_map_data rmd; + + if (!ctrl_info->enable_stream_detection) + return false; + + rc = pqi_get_aio_lba_and_block_count(scmd, &rmd); + if (rc) + return false; + + /* Check writes only. */ + if (!rmd.is_write) + return false; + + device = scmd->device->hostdata; + + /* Check for RAID 5/6 streams. */ + if (device->raid_level != SA_RAID_5 && device->raid_level != SA_RAID_6) + return false; + + /* + * If controller does not support AIO RAID{5,6} writes, need to send + * requests down non-AIO path. + */ + if ((device->raid_level == SA_RAID_5 && !ctrl_info->enable_r5_writes) || + (device->raid_level == SA_RAID_6 && !ctrl_info->enable_r6_writes)) + return true; + + lru_index = 0; + oldest_jiffies = INT_MAX; + for (i = 0; i < NUM_STREAMS_PER_LUN; i++) { + pqi_stream_data = &device->stream_data[i]; + /* + * Check for adjacent request or request is within + * the previous request. + */ + if ((pqi_stream_data->next_lba && + rmd.first_block >= pqi_stream_data->next_lba) && + rmd.first_block <= pqi_stream_data->next_lba + + rmd.block_cnt) { + pqi_stream_data->next_lba = rmd.first_block + + rmd.block_cnt; + pqi_stream_data->last_accessed = jiffies; + return true; + } + + /* unused entry */ + if (pqi_stream_data->last_accessed == 0) { + lru_index = i; + break; + } + + /* Find entry with oldest last accessed time. */ + if (pqi_stream_data->last_accessed <= oldest_jiffies) { + oldest_jiffies = pqi_stream_data->last_accessed; + lru_index = i; + } + } + + /* Set LRU entry. 
*/ + pqi_stream_data = &device->stream_data[lru_index]; + pqi_stream_data->last_accessed = jiffies; + pqi_stream_data->next_lba = rmd.first_block + rmd.block_cnt; + + return false; +} + +int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + u16 hw_queue; + struct pqi_queue_group *queue_group; + bool raid_bypassed; + + device = scmd->device->hostdata; + ctrl_info = shost_to_hba(shost); + + if (!device) { + set_host_byte(scmd, DID_NO_CONNECT); + pqi_scsi_done(scmd); + return 0; + } + + atomic_inc(&device->scsi_cmds_outstanding[scmd->device->lun]); +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT + if (atomic_inc_return(&ctrl_info->total_scmds_outstanding) > + ctrl_info->scsi_ml_can_queue) { + rc = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } +#endif + + if (pqi_ctrl_offline(ctrl_info) || pqi_device_in_remove(device)) { + set_host_byte(scmd, DID_NO_CONNECT); + pqi_scsi_done(scmd); + return 0; + } + + if (pqi_ctrl_blocked(ctrl_info)) { + rc = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + + /* + * This is necessary because the SML doesn't zero out this field during + * error recovery. + */ + scmd->result = 0; + + hw_queue = pqi_get_hw_queue(ctrl_info, scmd); + queue_group = &ctrl_info->queue_groups[hw_queue]; + + if (pqi_is_logical_device(device)) { + raid_bypassed = false; + if (device->raid_bypass_enabled && + pqi_is_bypass_eligible_request(scmd) && + !pqi_is_parity_write_stream(ctrl_info, scmd)) { + rc = pqi_raid_bypass_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); + if (rc == 0 || rc == SCSI_MLQUEUE_HOST_BUSY) { + raid_bypassed = true; + device->raid_bypass_cnt++; + } + } + if (!raid_bypassed) + rc = pqi_raid_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); + } else { + if (device->aio_enabled) + rc = pqi_aio_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); + else + rc = pqi_raid_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); + } + +out: + if (rc) { + atomic_dec(&device->scsi_cmds_outstanding[scmd->device->lun]); +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT + atomic_dec(&ctrl_info->total_scmds_outstanding); +#endif + } + + return rc; +} + +static unsigned int pqi_queued_io_count(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + unsigned int path; + unsigned long flags; + unsigned int queued_io_count; + struct pqi_queue_group *queue_group; + struct pqi_io_request *io_request; + + queued_io_count = 0; + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + for (path = 0; path < 2; path++) { + spin_lock_irqsave(&queue_group->submit_lock[path], flags); + list_for_each_entry(io_request, &queue_group->request_list[path], request_list_entry) + queued_io_count++; + spin_unlock_irqrestore(&queue_group->submit_lock[path], flags); + } + } + + return queued_io_count; +} + +static unsigned int pqi_nonempty_inbound_queue_count(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + unsigned int path; + unsigned int nonempty_inbound_queue_count; + struct pqi_queue_group *queue_group; + pqi_index_t iq_pi; + pqi_index_t iq_ci; + + nonempty_inbound_queue_count = 0; + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + for (path = 0; path < 2; path++) { + iq_pi = queue_group->iq_pi_copy[path]; + iq_ci = readl(queue_group->iq_ci[path]); + if (iq_ci != iq_pi) + nonempty_inbound_queue_count++; + } + } + + return nonempty_inbound_queue_count; +} + +#define PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS 10 
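For reference, here is a minimal standalone sketch of the producer/consumer index arithmetic that the drain checks above (and pqi_num_elements_free() in pqi_start_io()) rely on. It is illustrative only and not taken from the driver: the helper names are made up, and it assumes the usual ring-buffer convention suggested by the "one element in each queue is unusable" comment in pqi_calculate_queue_resources(), i.e. a queue is empty when the cached producer index equals the consumer index the controller writes back, and one slot stays unused so a full queue never looks empty.

/*
 * Standalone illustration only -- not part of the driver.  Shows the
 * producer/consumer index arithmetic for a circular queue in which one
 * element is deliberately left unused so "full" and "empty" remain
 * distinguishable.
 */
#include <stdio.h>

/* Empty when the producer index has not advanced past the consumer index. */
static int ring_is_empty(unsigned int pi, unsigned int ci)
{
	return pi == ci;
}

/* Free slots, reserving one element so pi == ci never means "full". */
static unsigned int ring_free_elements(unsigned int pi, unsigned int ci,
				       unsigned int num_elements)
{
	unsigned int used = (pi >= ci) ? pi - ci : num_elements - ci + pi;

	return num_elements - used - 1;
}

int main(void)
{
	unsigned int num_elements = 8;
	unsigned int pi = 6, ci = 2;

	printf("empty: %d free: %u\n",
	       ring_is_empty(pi, ci), ring_free_elements(pi, ci, num_elements));

	/* Submitting 3 more elements wraps the producer index. */
	pi = (pi + 3) % num_elements;
	printf("pi after wrap: %u free: %u\n",
	       pi, ring_free_elements(pi, ci, num_elements));

	return 0;
}

Built by itself with any C compiler, the example reports 3 free slots for pi=6/ci=2 in an 8-element ring, and 0 free once the producer wraps.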
+ +static int pqi_wait_until_inbound_queues_empty(struct pqi_ctrl_info *ctrl_info) +{ + unsigned long start_jiffies; + unsigned long warning_timeout; + unsigned int queued_io_count; + unsigned int nonempty_inbound_queue_count; + bool displayed_warning; + + displayed_warning = false; + start_jiffies = jiffies; + warning_timeout = (PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS * HZ) + start_jiffies; + + while (1) { + queued_io_count = pqi_queued_io_count(ctrl_info); + nonempty_inbound_queue_count = pqi_nonempty_inbound_queue_count(ctrl_info); + if (queued_io_count == 0 && nonempty_inbound_queue_count == 0) + break; + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + return -ENXIO; + if (time_after(jiffies, warning_timeout)) { + dev_warn(&ctrl_info->pci_dev->dev, + "waiting %u seconds for queued I/O to drain (queued I/O count: %u; non-empty inbound queue count: %u)\n", + jiffies_to_msecs(jiffies - start_jiffies) / 1000, queued_io_count, nonempty_inbound_queue_count); + displayed_warning = true; + warning_timeout = (PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS * HZ) + jiffies; + } + msleep(1); + } + + if (displayed_warning) + dev_warn(&ctrl_info->pci_dev->dev, + "queued I/O drained after waiting for %u seconds\n", + jiffies_to_msecs(jiffies - start_jiffies) / 1000); + + return 0; +} + +static void pqi_fail_io_queued_for_device(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + unsigned int i; + unsigned int path; + struct pqi_queue_group *queue_group; + unsigned long flags; + struct pqi_io_request *io_request; + struct pqi_io_request *next; + struct scsi_cmnd *scmd; + struct pqi_scsi_dev *scsi_device; + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + + for (path = 0; path < 2; path++) { + spin_lock_irqsave( + &queue_group->submit_lock[path], flags); + + list_for_each_entry_safe(io_request, next, + &queue_group->request_list[path], + request_list_entry) { + + scmd = io_request->scmd; + if (!scmd) + continue; + + scsi_device = scmd->device->hostdata; + if (scsi_device != device) + continue; + + list_del(&io_request->request_list_entry); + set_host_byte(scmd, DID_RESET); + pqi_free_io_request(io_request); + scsi_dma_unmap(scmd); + pqi_scsi_done(scmd); + } + + spin_unlock_irqrestore( + &queue_group->submit_lock[path], flags); + } + } +} + +#define PQI_PENDING_IO_WARNING_TIMEOUT_SECS 10 + +static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, u8 lun, unsigned long timeout_msecs) +{ + int cmds_outstanding; + unsigned long start_jiffies; + unsigned long warning_timeout; + unsigned long msecs_waiting; + + start_jiffies = jiffies; + warning_timeout = (PQI_PENDING_IO_WARNING_TIMEOUT_SECS * HZ) + start_jiffies; + + while ((cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding[lun])) > 0) { + if (ctrl_info->ctrl_removal_state != PQI_CTRL_GRACEFUL_REMOVAL) { + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + return -ENXIO; + } + msecs_waiting = jiffies_to_msecs(jiffies - start_jiffies); + if (msecs_waiting >= timeout_msecs) { + dev_err(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d: timed out after %lu seconds waiting for %d outstanding command(s)\n", + ctrl_info->scsi_host->host_no, device->bus, device->target, + lun, msecs_waiting / 1000, cmds_outstanding); + return -ETIMEDOUT; + } + if (time_after(jiffies, warning_timeout)) { + dev_warn(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d: waiting %lu seconds for %d outstanding 
command(s)\n", + ctrl_info->scsi_host->host_no, device->bus, device->target, + lun, msecs_waiting / 1000, cmds_outstanding); + warning_timeout = (PQI_PENDING_IO_WARNING_TIMEOUT_SECS * HZ) + jiffies; + } + msleep(1); + } + + return 0; +} + +static void pqi_lun_reset_complete(struct pqi_io_request *io_request, + void *context) +{ + struct completion *waiting = context; + + complete(waiting); +} + +#define PQI_LUN_RESET_POLL_COMPLETION_SECS 10 + +static int pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, u8 lun, struct completion *wait) +{ + int rc; + unsigned int wait_secs; + int cmds_outstanding; + + wait_secs = 0; + + while (1) { + if (wait_for_completion_io_timeout(wait, + PQI_LUN_RESET_POLL_COMPLETION_SECS * HZ)) { + rc = 0; + break; + } + + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) { + rc = -ENXIO; + break; + } + + wait_secs += PQI_LUN_RESET_POLL_COMPLETION_SECS; + cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding[lun]); + dev_warn(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d: waiting %u seconds for LUN reset to complete (%d command(s) outstanding)\n", + ctrl_info->scsi_host->host_no, device->bus, device->target, lun, wait_secs, cmds_outstanding); + } + + return rc; +} + +#define PQI_LUN_RESET_FIRMWARE_TIMEOUT_SECS 30 + +static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) +{ + int rc; + struct pqi_io_request *io_request; + DECLARE_COMPLETION_ONSTACK(wait); + struct pqi_task_management_request *request; + struct pqi_scsi_dev *device; + + device = scmd->device->hostdata; + io_request = pqi_alloc_io_request(ctrl_info, NULL); + io_request->io_complete_callback = pqi_lun_reset_complete; + io_request->context = &wait; + + request = io_request->iu; + memset(request, 0, sizeof(*request)); + + request->header.iu_type = PQI_REQUEST_IU_TASK_MANAGEMENT; + put_unaligned_le16(sizeof(*request) - PQI_REQUEST_HEADER_LENGTH, + &request->header.iu_length); + put_unaligned_le16(io_request->index, &request->request_id); + memcpy(request->lun_number, device->scsi3addr, + sizeof(request->lun_number)); + if (!pqi_is_logical_device(device) && ctrl_info->multi_lun_device_supported) + request->ml_device_lun_number = (u8)scmd->device->lun; + request->task_management_function = SOP_TASK_MANAGEMENT_LUN_RESET; + if (ctrl_info->tmf_iu_timeout_supported) + put_unaligned_le16(PQI_LUN_RESET_FIRMWARE_TIMEOUT_SECS, &request->timeout); + + pqi_start_io(ctrl_info, &ctrl_info->queue_groups[PQI_DEFAULT_QUEUE_GROUP], RAID_PATH, + io_request); + + rc = pqi_wait_for_lun_reset_completion(ctrl_info, device, (u8)scmd->device->lun, &wait); + if (rc == 0) + rc = io_request->status; + + pqi_free_io_request(io_request); + + return rc; +} + +#define PQI_LUN_RESET_RETRIES 3 +#define PQI_LUN_RESET_RETRY_INTERVAL_MSECS (10 * 1000) +#define PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS (10 * 60 * 1000) +#define PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS (2 * 60 * 1000) + +static int pqi_lun_reset_with_retries(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) +{ + int reset_rc; + int wait_rc; + unsigned int retries; + unsigned long timeout_msecs; + struct pqi_scsi_dev *device; + + device = scmd->device->hostdata; + for (retries = 0;;) { + reset_rc = pqi_lun_reset(ctrl_info, scmd); + if (reset_rc == 0 || reset_rc == -ENODEV || ++retries > PQI_LUN_RESET_RETRIES) + break; + msleep(PQI_LUN_RESET_RETRY_INTERVAL_MSECS); + } + + timeout_msecs = reset_rc ? 
PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS : + PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS; + + wait_rc = pqi_device_wait_for_pending_io(ctrl_info, device, scmd->device->lun, timeout_msecs); + if (wait_rc && reset_rc == 0) + reset_rc = wait_rc; + + return reset_rc == 0 ? SUCCESS : FAILED; +} + +static int pqi_device_reset(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd) +{ + int rc; + struct pqi_scsi_dev *device; + + device = scmd->device->hostdata; + pqi_ctrl_block_requests(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + pqi_fail_io_queued_for_device(ctrl_info, device); + rc = pqi_wait_until_inbound_queues_empty(ctrl_info); + if (rc) + rc = FAILED; + else + rc = pqi_lun_reset_with_retries(ctrl_info, scmd); + pqi_ctrl_unblock_requests(ctrl_info); + + return rc; +} + +static int pqi_eh_device_reset_handler(struct scsi_cmnd *scmd) +{ + int rc; + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + + shost = scmd->device->host; + ctrl_info = shost_to_hba(shost); + device = scmd->device->hostdata; + + mutex_lock(&ctrl_info->lun_reset_mutex); + + dev_err(&ctrl_info->pci_dev->dev, + "resetting scsi %d:%d:%d:%d due to cmd 0x%02x\n", + shost->host_no, + device->bus, device->target, (u32)scmd->device->lun, + scmd->cmd_len > 0 ? scmd->cmnd[0] : 0xff); + + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + rc = FAILED; + else + rc = pqi_device_reset(ctrl_info, scmd); + + dev_err(&ctrl_info->pci_dev->dev, + "reset of scsi %d:%d:%d:%d: %s\n", + shost->host_no, device->bus, device->target, (u32)scmd->device->lun, + rc == SUCCESS ? "SUCCESS" : "FAILED"); + + mutex_unlock(&ctrl_info->lun_reset_mutex); + + return rc; +} + +static int pqi_slave_alloc(struct scsi_device *sdev) +{ + struct pqi_scsi_dev *device; + unsigned long flags; + struct pqi_ctrl_info *ctrl_info; + struct scsi_target *starget; + struct sas_rphy *rphy; + + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + if (sdev_channel(sdev) == PQI_PHYSICAL_DEVICE_BUS) { + starget = scsi_target(sdev); + rphy = target_to_rphy(starget); + device = pqi_find_device_by_sas_rphy(ctrl_info, rphy); + if (device) { + if (device->target_lun_valid) { + device->ignore_device = true; + } else { + device->target = sdev_id(sdev); + device->lun = sdev->lun; + device->target_lun_valid = true; + } + } + } else { + device = pqi_find_scsi_dev(ctrl_info, sdev_channel(sdev), + sdev_id(sdev), sdev->lun); + } + + if (device) { + sdev->hostdata = device; + device->sdev = sdev; + if (device->queue_depth) { + device->advertised_queue_depth = device->queue_depth; + scsi_change_queue_depth(sdev, + device->advertised_queue_depth); + } + if (pqi_is_logical_device(device)) { + pqi_disable_write_same(sdev); + if (pqi_limit_xfer_to_1MB) + blk_queue_max_hw_sectors(sdev->request_queue, + PQI_1MB_SECTORS); + } else { + sdev->allow_restart = 1; + if (device->device_type == SA_DEVICE_TYPE_NVME) + pqi_disable_write_same(sdev); + } + } + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return 0; +} + +static inline bool pqi_is_tape_changer_device(struct pqi_scsi_dev *device) +{ + return device->devtype == TYPE_TAPE || device->devtype == TYPE_MEDIUM_CHANGER; +} + +static int pqi_slave_configure(struct scsi_device *sdev) +{ + int rc = 0; + struct pqi_scsi_dev *device; + + device = sdev->hostdata; + device->devtype = sdev->type; + + if (pqi_is_tape_changer_device(device) && device->ignore_device) { + rc = -ENXIO; + device->ignore_device = 
false; + } + + return rc; +} + +static void pqi_slave_destroy(struct scsi_device *sdev) +{ + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + int mutex_acquired; + unsigned long flags; + + ctrl_info = shost_to_hba(sdev->host); + + mutex_acquired = mutex_trylock(&ctrl_info->scan_mutex); + if (!mutex_acquired) + return; + + device = sdev->hostdata; + if (!device) { + mutex_unlock(&ctrl_info->scan_mutex); + return; + } + + device->lun_count--; + if (device->lun_count > 0) { + mutex_unlock(&ctrl_info->scan_mutex); + return; + } + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + list_del(&device->scsi_device_list_entry); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + mutex_unlock(&ctrl_info->scan_mutex); + + pqi_dev_info(ctrl_info, "removed", device); + pqi_free_device(device); +} + +static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info, void __user *arg) +{ + struct pci_dev *pci_dev; + u32 subsystem_vendor; + u32 subsystem_device; + cciss_pci_info_struct pciinfo; + + if (!arg) + return -EINVAL; + + pci_dev = ctrl_info->pci_dev; + + pciinfo.domain = pci_domain_nr(pci_dev->bus); + pciinfo.bus = pci_dev->bus->number; + pciinfo.dev_fn = pci_dev->devfn; + subsystem_vendor = pci_dev->subsystem_vendor; + subsystem_device = pci_dev->subsystem_device; + pciinfo.board_id = ((subsystem_device << 16) & 0xffff0000) | subsystem_vendor; + + if (copy_to_user(arg, &pciinfo, sizeof(pciinfo))) + return -EFAULT; + + return 0; +} + +static int pqi_getdrivver_ioctl(void __user *arg) +{ + u32 version; + + if (!arg) + return -EINVAL; + + version = (DRIVER_MAJOR << 28) | (DRIVER_MINOR << 24) | + (DRIVER_RELEASE << 16) | DRIVER_REVISION; + + if (copy_to_user(arg, &version, sizeof(version))) + return -EFAULT; + + return 0; +} + +struct ciss_error_info { + u8 scsi_status; + int command_status; + size_t sense_data_length; +}; + +static void pqi_error_info_to_ciss(struct pqi_raid_error_info *pqi_error_info, + struct ciss_error_info *ciss_error_info) +{ + int ciss_cmd_status; + size_t sense_data_length; + + switch (pqi_error_info->data_out_result) { + case PQI_DATA_IN_OUT_GOOD: + ciss_cmd_status = CISS_CMD_STATUS_SUCCESS; + break; + case PQI_DATA_IN_OUT_UNDERFLOW: + ciss_cmd_status = CISS_CMD_STATUS_DATA_UNDERRUN; + break; + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW: + ciss_cmd_status = CISS_CMD_STATUS_DATA_OVERRUN; + break; + case PQI_DATA_IN_OUT_PROTOCOL_ERROR: + case PQI_DATA_IN_OUT_BUFFER_ERROR: + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW_DESCRIPTOR_AREA: + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW_BRIDGE: + case PQI_DATA_IN_OUT_ERROR: + ciss_cmd_status = CISS_CMD_STATUS_PROTOCOL_ERROR; + break; + case PQI_DATA_IN_OUT_HARDWARE_ERROR: + case PQI_DATA_IN_OUT_PCIE_FABRIC_ERROR: + case PQI_DATA_IN_OUT_PCIE_COMPLETION_TIMEOUT: + case PQI_DATA_IN_OUT_PCIE_COMPLETER_ABORT_RECEIVED: + case PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST_RECEIVED: + case PQI_DATA_IN_OUT_PCIE_ECRC_CHECK_FAILED: + case PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST: + case PQI_DATA_IN_OUT_PCIE_ACS_VIOLATION: + case PQI_DATA_IN_OUT_PCIE_TLP_PREFIX_BLOCKED: + case PQI_DATA_IN_OUT_PCIE_POISONED_MEMORY_READ: + ciss_cmd_status = CISS_CMD_STATUS_HARDWARE_ERROR; + break; + case PQI_DATA_IN_OUT_UNSOLICITED_ABORT: + ciss_cmd_status = CISS_CMD_STATUS_UNSOLICITED_ABORT; + break; + case PQI_DATA_IN_OUT_ABORTED: + ciss_cmd_status = CISS_CMD_STATUS_ABORTED; + break; + case PQI_DATA_IN_OUT_TIMEOUT: + ciss_cmd_status = CISS_CMD_STATUS_TIMEOUT; + break; + default: + ciss_cmd_status = CISS_CMD_STATUS_TARGET_STATUS; + break; + 
} + + sense_data_length = + get_unaligned_le16(&pqi_error_info->sense_data_length); + if (sense_data_length == 0) + sense_data_length = + get_unaligned_le16(&pqi_error_info->response_data_length); + if (sense_data_length) + if (sense_data_length > sizeof(pqi_error_info->data)) + sense_data_length = sizeof(pqi_error_info->data); + + ciss_error_info->scsi_status = pqi_error_info->status; + ciss_error_info->command_status = ciss_cmd_status; + ciss_error_info->sense_data_length = sense_data_length; +} + +static int pqi_passthru_ioctl(struct pqi_ctrl_info *ctrl_info, void __user *arg) +{ + int rc; + char *kernel_buffer = NULL; + u16 iu_length; + size_t sense_data_length; + IOCTL_Command_struct iocommand; + struct pqi_raid_path_request request; + struct pqi_raid_error_info pqi_error_info; + struct ciss_error_info ciss_error_info; + + if (pqi_ctrl_offline(ctrl_info)) + return -ENXIO; + if (pqi_ofa_in_progress(ctrl_info) && pqi_ctrl_blocked(ctrl_info)) + return -EBUSY; + if (!arg) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (copy_from_user(&iocommand, arg, sizeof(iocommand))) + return -EFAULT; + if (iocommand.buf_size < 1 && + iocommand.Request.Type.Direction != XFER_NONE) + return -EINVAL; + if (iocommand.Request.CDBLen > sizeof(request.cdb)) + return -EINVAL; + if (iocommand.Request.Type.Type != TYPE_CMD) + return -EINVAL; + + switch (iocommand.Request.Type.Direction) { + case XFER_NONE: + case XFER_WRITE: + case XFER_READ: + case XFER_READ | XFER_WRITE: + break; + default: + return -EINVAL; + } + + if (iocommand.buf_size > 0) { + kernel_buffer = kmalloc(iocommand.buf_size, GFP_KERNEL); + if (!kernel_buffer) + return -ENOMEM; + if (iocommand.Request.Type.Direction & XFER_WRITE) { + if (copy_from_user(kernel_buffer, iocommand.buf, + iocommand.buf_size)) { + rc = -EFAULT; + goto out; + } + } else { + memset(kernel_buffer, 0, iocommand.buf_size); + } + } + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_RAID_PATH_IO; + iu_length = offsetof(struct pqi_raid_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + memcpy(request.lun_number, iocommand.LUN_info.LunAddrBytes, + sizeof(request.lun_number)); + memcpy(request.cdb, iocommand.Request.CDB, iocommand.Request.CDBLen); + request.additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_0; + + switch (iocommand.Request.Type.Direction) { + case XFER_NONE: + request.data_direction = SOP_NO_DIRECTION_FLAG; + break; + case XFER_WRITE: + request.data_direction = SOP_WRITE_FLAG; + break; + case XFER_READ: + request.data_direction = SOP_READ_FLAG; + break; + case XFER_READ | XFER_WRITE: + request.data_direction = SOP_BIDIRECTIONAL; + break; + } + + request.task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + + if (iocommand.buf_size > 0) { + put_unaligned_le32(iocommand.buf_size, &request.buffer_length); + + rc = pqi_map_single(ctrl_info->pci_dev, + &request.sg_descriptors[0], kernel_buffer, + iocommand.buf_size, DMA_BIDIRECTIONAL); + if (rc) + goto out; + + iu_length += sizeof(request.sg_descriptors[0]); + } + + put_unaligned_le16(iu_length, &request.header.iu_length); + + if (ctrl_info->raid_iu_timeout_supported) + put_unaligned_le32(iocommand.Request.Timeout, &request.timeout); + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, + PQI_SYNC_FLAGS_INTERRUPTABLE, &pqi_error_info); + + if (iocommand.buf_size > 0) + pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, + DMA_BIDIRECTIONAL); + + memset(&iocommand.error_info, 0, sizeof(iocommand.error_info)); + + if (rc == 
0) { + pqi_error_info_to_ciss(&pqi_error_info, &ciss_error_info); + iocommand.error_info.ScsiStatus = ciss_error_info.scsi_status; + iocommand.error_info.CommandStatus = + ciss_error_info.command_status; + sense_data_length = ciss_error_info.sense_data_length; + if (sense_data_length) { + if (sense_data_length > + sizeof(iocommand.error_info.SenseInfo)) + sense_data_length = + sizeof(iocommand.error_info.SenseInfo); + memcpy(iocommand.error_info.SenseInfo, + pqi_error_info.data, sense_data_length); + iocommand.error_info.SenseLen = sense_data_length; + } + } + + if (copy_to_user(arg, &iocommand, sizeof(iocommand))) { + rc = -EFAULT; + goto out; + } + + if (rc == 0 && iocommand.buf_size > 0 && + (iocommand.Request.Type.Direction & XFER_READ)) { + if (copy_to_user(iocommand.buf, kernel_buffer, + iocommand.buf_size)) { + rc = -EFAULT; + } + } + +out: + kfree(kernel_buffer); + + return rc; +} + +static int pqi_ioctl(struct scsi_device *sdev, IOCTL_INT cmd, void __user *arg) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = shost_to_hba(sdev->host); + + switch (cmd) { + case CCISS_DEREGDISK: + case CCISS_REGNEWDISK: + case CCISS_REGNEWD: + rc = pqi_scan_scsi_devices(ctrl_info); + break; + case CCISS_GETPCIINFO: + rc = pqi_getpciinfo_ioctl(ctrl_info, arg); + break; + case CCISS_GETDRIVVER: + rc = pqi_getdrivver_ioctl(arg); + break; + case CCISS_PASSTHRU: + rc = pqi_passthru_ioctl(ctrl_info, arg); + break; + default: + rc = -EINVAL; + break; + } + + return rc; +} + +static ssize_t pqi_firmware_version_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", ctrl_info->firmware_version); +} + +static ssize_t pqi_driver_version_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + return scnprintf(buffer, PAGE_SIZE, "%s\n", DRIVER_VERSION BUILD_TIMESTAMP); +} + +static ssize_t pqi_serial_number_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", ctrl_info->serial_number); +} + +static ssize_t pqi_model_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", ctrl_info->model); +} + +static ssize_t pqi_vendor_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", ctrl_info->vendor); +} + +static ssize_t pqi_host_rescan_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + + pqi_scan_start(shost); + + return count; +} + +static ssize_t pqi_lockup_action_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + int count = 0; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(pqi_lockup_actions); i++) { + if (pqi_lockup_actions[i].action == pqi_lockup_action) + count += scnprintf(buffer + count, PAGE_SIZE - count, + "[%s] ", pqi_lockup_actions[i].name); + else + count += 
scnprintf(buffer + count, PAGE_SIZE - count, + "%s ", pqi_lockup_actions[i].name); + } + + count += scnprintf(buffer + count, PAGE_SIZE - count, "\n"); + + return count; +} + +static ssize_t pqi_lockup_action_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + unsigned int i; + char *action_name; + char action_name_buffer[32]; + + strlcpy(action_name_buffer, buffer, sizeof(action_name_buffer)); + action_name = strstrip(action_name_buffer); + + for (i = 0; i < ARRAY_SIZE(pqi_lockup_actions); i++) { + if (strcmp(action_name, pqi_lockup_actions[i].name) == 0) { + pqi_lockup_action = pqi_lockup_actions[i].action; + return count; + } + } + + return -EINVAL; +} + +static ssize_t pqi_host_enable_stream_detection_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, 10, "%hhx\n", ctrl_info->enable_stream_detection); +} + +static ssize_t pqi_host_enable_stream_detection_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + u8 set_stream_detection; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + if (kstrtou8(buffer, 0, &set_stream_detection)) + return -EINVAL; + + ctrl_info->enable_stream_detection = set_stream_detection; + + return count; +} + +static ssize_t pqi_host_enable_r5_writes_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, 10, "%hhx\n", ctrl_info->enable_r5_writes); +} + +static ssize_t pqi_host_enable_r5_writes_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + u8 set_r5_writes; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + if (kstrtou8(buffer, 0, &set_r5_writes)) + return -EINVAL; + + ctrl_info->enable_r5_writes = set_r5_writes; + + return count; +} + +static ssize_t pqi_host_enable_r6_writes_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, 10, "%hhx\n", ctrl_info->enable_r6_writes); +} + +static ssize_t pqi_host_enable_r6_writes_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + u8 set_r6_writes; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + if (kstrtou8(buffer, 0, &set_r6_writes)) + return -EINVAL; + + ctrl_info->enable_r6_writes = set_r6_writes; + + return count; +} + +static DEVICE_ATTR(driver_version, 0444, pqi_driver_version_show, NULL); +static DEVICE_ATTR(firmware_version, 0444, pqi_firmware_version_show, NULL); +static DEVICE_ATTR(model, 0444, pqi_model_show, NULL); +static DEVICE_ATTR(serial_number, 0444, pqi_serial_number_show, NULL); +static DEVICE_ATTR(vendor, 0444, pqi_vendor_show, NULL); +static DEVICE_ATTR(rescan, 0200, NULL, pqi_host_rescan_store); +static DEVICE_ATTR(lockup_action, 0644, pqi_lockup_action_show, + pqi_lockup_action_store); +static DEVICE_ATTR(enable_stream_detection, 0644, + 
pqi_host_enable_stream_detection_show, + pqi_host_enable_stream_detection_store); +static DEVICE_ATTR(enable_r5_writes, 0644, + pqi_host_enable_r5_writes_show, pqi_host_enable_r5_writes_store); +static DEVICE_ATTR(enable_r6_writes, 0644, + pqi_host_enable_r6_writes_show, pqi_host_enable_r6_writes_store); + +static struct PQI_DEVICE_ATTRIBUTE *pqi_shost_attrs[] = { + PQI_ATTRIBUTE(&dev_attr_driver_version), + PQI_ATTRIBUTE(&dev_attr_firmware_version), + PQI_ATTRIBUTE(&dev_attr_model), + PQI_ATTRIBUTE(&dev_attr_serial_number), + PQI_ATTRIBUTE(&dev_attr_vendor), + PQI_ATTRIBUTE(&dev_attr_rescan), + PQI_ATTRIBUTE(&dev_attr_lockup_action), + PQI_ATTRIBUTE(&dev_attr_enable_stream_detection), + PQI_ATTRIBUTE(&dev_attr_enable_r5_writes), + PQI_ATTRIBUTE(&dev_attr_enable_r6_writes), + NULL +}; +PQI_ATTRIBUTE_GROUPS(pqi_shost); + +static ssize_t pqi_unique_id_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + u8 unique_id[16]; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + if (device->is_physical_device) + memcpy(unique_id, device->wwid, sizeof(device->wwid)); + else + memcpy(unique_id, device->volume_id, sizeof(device->volume_id)); + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, + "%02X%02X%02X%02X%02X%02X%02X%02X" + "%02X%02X%02X%02X%02X%02X%02X%02X\n", + unique_id[0], unique_id[1], unique_id[2], unique_id[3], + unique_id[4], unique_id[5], unique_id[6], unique_id[7], + unique_id[8], unique_id[9], unique_id[10], unique_id[11], + unique_id[12], unique_id[13], unique_id[14], unique_id[15]); +} + +static ssize_t pqi_lunid_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + u8 lunid[8]; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + memcpy(lunid, device->scsi3addr, sizeof(lunid)); + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "0x%8phN\n", lunid); +} + +#define MAX_PATHS 8 + +static ssize_t pqi_path_info_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + int i; + int output_len = 0; + u8 box; + u8 bay; + u8 path_map_index; + char *active; + u8 phys_connector[2]; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + bay = device->bay; + for (i = 0; i < MAX_PATHS; i++) { + path_map_index = 1 << i; + if (i == 
device->active_path_index) + active = "Active"; + else if (device->path_map & path_map_index) + active = "Inactive"; + else + continue; + + output_len += scnprintf(buffer + output_len, + PAGE_SIZE - output_len, + "[%d:%d:%d:%d] %20.20s ", + ctrl_info->scsi_host->host_no, + device->bus, device->target, + device->lun, + scsi_device_type(device->devtype)); + + if (device->devtype == TYPE_RAID || + pqi_is_logical_device(device)) + goto end_buffer; + + memcpy(&phys_connector, &device->phys_connector[i], + sizeof(phys_connector)); + if (phys_connector[0] < '0') + phys_connector[0] = '0'; + if (phys_connector[1] < '0') + phys_connector[1] = '0'; + + output_len += scnprintf(buffer + output_len, + PAGE_SIZE - output_len, + "PORT: %.2s ", phys_connector); + + box = device->box[i]; + if (box != 0 && box != 0xFF) + output_len += scnprintf(buffer + output_len, + PAGE_SIZE - output_len, + "BOX: %hhu ", box); + + if ((device->devtype == TYPE_DISK || + device->devtype == TYPE_ZBC) && + pqi_expose_device(device)) + output_len += scnprintf(buffer + output_len, + PAGE_SIZE - output_len, + "BAY: %hhu ", bay); + +end_buffer: + output_len += scnprintf(buffer + output_len, + PAGE_SIZE - output_len, + "%s\n", active); + } + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return output_len; +} + +static ssize_t pqi_sas_address_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + u64 sas_address; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + sas_address = device->sas_address; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "0x%016llx\n", sas_address); +} + +static ssize_t pqi_ssd_smart_path_enabled_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + buffer[0] = device->raid_bypass_enabled ? 
'1' : '0'; + buffer[1] = '\n'; + buffer[2] = '\0'; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return 2; +} + +static ssize_t pqi_raid_level_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + char *raid_level; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + if (pqi_is_logical_device(device) && device->devtype == TYPE_DISK) + raid_level = pqi_raid_level_to_string(device->raid_level); + else + raid_level = "N/A"; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", raid_level); +} + +static ssize_t pqi_raid_bypass_cnt_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + unsigned int raid_bypass_cnt; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + raid_bypass_cnt = device->raid_bypass_cnt; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "0x%x\n", raid_bypass_cnt); +} + +static ssize_t pqi_sas_ncq_prio_enable_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + bool ncq_prio_enable; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + ncq_prio_enable = device->ncq_prio_enable; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "%d\n", ncq_prio_enable); +} + +static ssize_t pqi_sas_ncq_prio_enable_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + u8 ncq_prio_enable; + + if (kstrtou8(buffer, 0, &ncq_prio_enable)) + return -EINVAL; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + if (!device->ncq_prio_support) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -EINVAL; + } + + device->ncq_prio_enable = ncq_prio_enable; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return count; +} + +static DEVICE_ATTR(lunid, 0444, pqi_lunid_show, NULL); +static DEVICE_ATTR(unique_id, 0444, pqi_unique_id_show, 
NULL); +static DEVICE_ATTR(path_info, 0444, pqi_path_info_show, NULL); +static DEVICE_ATTR(sas_address, 0444, pqi_sas_address_show, NULL); +static DEVICE_ATTR(ssd_smart_path_enabled, 0444, pqi_ssd_smart_path_enabled_show, NULL); +static DEVICE_ATTR(raid_level, 0444, pqi_raid_level_show, NULL); +static DEVICE_ATTR(raid_bypass_cnt, 0444, pqi_raid_bypass_cnt_show, NULL); +static DEVICE_ATTR(sas_ncq_prio_enable, 0644, + pqi_sas_ncq_prio_enable_show, pqi_sas_ncq_prio_enable_store); + +static struct PQI_DEVICE_ATTRIBUTE *pqi_sdev_attrs[] = { + PQI_ATTRIBUTE(&dev_attr_lunid), + PQI_ATTRIBUTE(&dev_attr_unique_id), + PQI_ATTRIBUTE(&dev_attr_path_info), + PQI_ATTRIBUTE(&dev_attr_sas_address), + PQI_ATTRIBUTE(&dev_attr_ssd_smart_path_enabled), + PQI_ATTRIBUTE(&dev_attr_raid_level), + PQI_ATTRIBUTE(&dev_attr_raid_bypass_cnt), + PQI_ATTRIBUTE(&dev_attr_sas_ncq_prio_enable), + NULL +}; +PQI_ATTRIBUTE_GROUPS(pqi_sdev); + +static struct scsi_host_template pqi_driver_template = { + .module = THIS_MODULE, + .name = DRIVER_NAME_SHORT, + .proc_name = DRIVER_NAME_SHORT, + .queuecommand = PQI_SCSI_QUEUE_COMMAND, + .scan_start = pqi_scan_start, + .scan_finished = pqi_scan_finished, + .this_id = -1, + .eh_device_reset_handler = pqi_eh_device_reset_handler, + .ioctl = pqi_ioctl, + .slave_alloc = pqi_slave_alloc, + .slave_configure = pqi_slave_configure, + .slave_destroy = pqi_slave_destroy, + PQI_SDEV_ATTRS, + PQI_SHOST_ATTRS, + PQI_CMD_PRIV +}; + +static int pqi_register_scsi(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct Scsi_Host *shost; + + pqi_compat_init_scsi_host_template(&pqi_driver_template); + + shost = scsi_host_alloc(&pqi_driver_template, sizeof(ctrl_info)); + if (!shost) { + dev_err(&ctrl_info->pci_dev->dev, "scsi_host_alloc failed\n"); + return -ENOMEM; + } + + shost->io_port = 0; + shost->n_io_port = 0; + shost->this_id = -1; + shost->max_channel = PQI_MAX_BUS; + shost->max_cmd_len = MAX_COMMAND_SIZE; + shost->max_lun = PQI_MAX_LUNS_PER_DEVICE; + shost->max_id = ~0; + shost->max_sectors = ctrl_info->max_sectors; + shost->can_queue = ctrl_info->scsi_ml_can_queue; + shost->cmd_per_lun = shost->can_queue; + shost->sg_tablesize = ctrl_info->sg_tablesize; + shost->transportt = pqi_sas_transport_template; + shost->irq = pqi_pci_irq_vector(ctrl_info->pci_dev, 0); + shost->unique_id = shost->irq; + shost->hostdata[0] = (unsigned long)ctrl_info; + PQI_SET_HOST_TAGSET(shost); + + pqi_compat_init_scsi_host(shost, ctrl_info); + + rc = scsi_add_host(shost, &ctrl_info->pci_dev->dev); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, "scsi_add_host failed\n"); + goto free_host; + } + + rc = pqi_add_sas_host(shost, ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, "add SAS host failed\n"); + goto remove_host; + } + + ctrl_info->scsi_host = shost; + + return 0; + +remove_host: + scsi_remove_host(shost); +free_host: + scsi_host_put(shost); + + return rc; +} + +static void pqi_unregister_scsi(struct pqi_ctrl_info *ctrl_info) +{ + struct Scsi_Host *shost; + + pqi_delete_sas_host(ctrl_info); + + shost = ctrl_info->scsi_host; + if (!shost) + return; + + scsi_remove_host(shost); + scsi_host_put(shost); +} + +static int pqi_wait_for_pqi_reset_completion(struct pqi_ctrl_info *ctrl_info) +{ + int rc = 0; + struct pqi_device_registers __iomem *pqi_registers; + unsigned long timeout; + unsigned int timeout_msecs; + union pqi_reset_register reset_reg; + + pqi_registers = ctrl_info->pqi_registers; + timeout_msecs = readw(&pqi_registers->max_reset_timeout) * 100; + timeout = msecs_to_jiffies(timeout_msecs) + 
jiffies; + + while (1) { + msleep(PQI_RESET_POLL_INTERVAL_MSECS); + reset_reg.all_bits = readl(&pqi_registers->device_reset); + if (reset_reg.bits.reset_action == PQI_RESET_ACTION_COMPLETED) + break; + if (!sis_is_firmware_running(ctrl_info)) { + rc = -ENXIO; + break; + } + if (time_after(jiffies, timeout)) { + rc = -ETIMEDOUT; + break; + } + } + + return rc; +} + +static int pqi_reset(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + union pqi_reset_register reset_reg; + + if (ctrl_info->pqi_reset_quiesce_supported) { + rc = sis_pqi_reset_quiesce(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "PQI reset failed during quiesce with error %d\n", rc); + return rc; + } + } + + reset_reg.all_bits = 0; + reset_reg.bits.reset_type = PQI_RESET_TYPE_HARD_RESET; + reset_reg.bits.reset_action = PQI_RESET_ACTION_RESET; + + writel(reset_reg.all_bits, &ctrl_info->pqi_registers->device_reset); + + rc = pqi_wait_for_pqi_reset_completion(ctrl_info); + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "PQI reset failed with error %d\n", rc); + + return rc; +} + +static int pqi_get_ctrl_serial_number(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_sense_subsystem_info *sense_info; + + sense_info = kzalloc(sizeof(*sense_info), GFP_KERNEL); + if (!sense_info) + return -ENOMEM; + + rc = pqi_sense_subsystem_info(ctrl_info, sense_info); + if (rc) + goto out; + + memcpy(ctrl_info->serial_number, sense_info->ctrl_serial_number, + sizeof(sense_info->ctrl_serial_number)); + ctrl_info->serial_number[sizeof(sense_info->ctrl_serial_number)] = '\0'; + +out: + kfree(sense_info); + + return rc; +} + +static int pqi_get_ctrl_product_details(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_identify_controller *identify; + + identify = kmalloc(sizeof(*identify), GFP_KERNEL); + if (!identify) + return -ENOMEM; + + rc = pqi_identify_controller(ctrl_info, identify); + if (rc) + goto out; + + if (get_unaligned_le32(&identify->extra_controller_flags) & + BMIC_IDENTIFY_EXTRA_FLAGS_LONG_FW_VERSION_SUPPORTED) { + memcpy(ctrl_info->firmware_version, + identify->firmware_version_long, + sizeof(identify->firmware_version_long)); + } else { + memcpy(ctrl_info->firmware_version, + identify->firmware_version_short, + sizeof(identify->firmware_version_short)); + ctrl_info->firmware_version + [sizeof(identify->firmware_version_short)] = '\0'; + scnprintf(ctrl_info->firmware_version + + strlen(ctrl_info->firmware_version), + sizeof(ctrl_info->firmware_version), + "-%u", + get_unaligned_le16(&identify->firmware_build_number)); + } + + memcpy(ctrl_info->model, identify->product_id, + sizeof(identify->product_id)); + ctrl_info->model[sizeof(identify->product_id)] = '\0'; + + memcpy(ctrl_info->vendor, identify->vendor_id, + sizeof(identify->vendor_id)); + ctrl_info->vendor[sizeof(identify->vendor_id)] = '\0'; + + dev_info(&ctrl_info->pci_dev->dev, + "Firmware version: %s\n", ctrl_info->firmware_version); + +out: + kfree(identify); + + return rc; +} + +struct pqi_config_table_section_info { + struct pqi_ctrl_info *ctrl_info; + void *section; + u32 section_offset; + void __iomem *section_iomem_addr; +}; + +static inline bool pqi_is_firmware_feature_supported( + struct pqi_config_table_firmware_features *firmware_features, + unsigned int bit_position) +{ + unsigned int byte_index; + + byte_index = bit_position / BITS_PER_BYTE; + + if (byte_index >= le16_to_cpu(firmware_features->num_elements)) + return false; + + return firmware_features->features_supported[byte_index] & + (1 << (bit_position % BITS_PER_BYTE)) ? 
true : false; +} + +static inline bool pqi_is_firmware_feature_enabled( + struct pqi_config_table_firmware_features *firmware_features, + void __iomem *firmware_features_iomem_addr, + unsigned int bit_position) +{ + unsigned int byte_index; + u8 __iomem *features_enabled_iomem_addr; + + byte_index = (bit_position / BITS_PER_BYTE) + + (le16_to_cpu(firmware_features->num_elements) * 2); + + features_enabled_iomem_addr = firmware_features_iomem_addr + + offsetof(struct pqi_config_table_firmware_features, + features_supported) + byte_index; + + return *((__force u8 *)features_enabled_iomem_addr) & + (1 << (bit_position % BITS_PER_BYTE)) ? true : false; +} + +static inline void pqi_request_firmware_feature( + struct pqi_config_table_firmware_features *firmware_features, + unsigned int bit_position) +{ + unsigned int byte_index; + + byte_index = (bit_position / BITS_PER_BYTE) + + le16_to_cpu(firmware_features->num_elements); + + firmware_features->features_supported[byte_index] |= + (1 << (bit_position % BITS_PER_BYTE)); +} + +static int pqi_config_table_update(struct pqi_ctrl_info *ctrl_info, + u16 first_section, u16 last_section) +{ + struct pqi_vendor_general_request request; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_VENDOR_GENERAL; + put_unaligned_le16(sizeof(request) - PQI_REQUEST_HEADER_LENGTH, + &request.header.iu_length); + put_unaligned_le16(PQI_VENDOR_GENERAL_CONFIG_TABLE_UPDATE, + &request.function_code); + put_unaligned_le16(first_section, + &request.data.config_table_update.first_section); + put_unaligned_le16(last_section, + &request.data.config_table_update.last_section); + + return pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); +} + +static int pqi_enable_firmware_features(struct pqi_ctrl_info *ctrl_info, + struct pqi_config_table_firmware_features *firmware_features, + void __iomem *firmware_features_iomem_addr) +{ + void *features_requested; + void __iomem *features_requested_iomem_addr; + void __iomem *host_max_known_feature_iomem_addr; + + features_requested = firmware_features->features_supported + + le16_to_cpu(firmware_features->num_elements); + + features_requested_iomem_addr = firmware_features_iomem_addr + + (features_requested - (void *)firmware_features); + + memcpy_toio(features_requested_iomem_addr, features_requested, + le16_to_cpu(firmware_features->num_elements)); + + if (pqi_is_firmware_feature_supported(firmware_features, + PQI_FIRMWARE_FEATURE_MAX_KNOWN_FEATURE)) { + host_max_known_feature_iomem_addr = + features_requested_iomem_addr + + (le16_to_cpu(firmware_features->num_elements) * 2) + + sizeof(__le16); + writeb(PQI_FIRMWARE_FEATURE_MAXIMUM & 0xFF, host_max_known_feature_iomem_addr); + writeb((PQI_FIRMWARE_FEATURE_MAXIMUM & 0xFF00) >> 8, host_max_known_feature_iomem_addr + 1); + } + + return pqi_config_table_update(ctrl_info, + PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES, + PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES); +} + +struct pqi_firmware_feature { + char *feature_name; + unsigned int feature_bit; + bool supported; + bool enabled; + void (*feature_status)(struct pqi_ctrl_info *ctrl_info, + struct pqi_firmware_feature *firmware_feature); +}; + +static void pqi_firmware_feature_status(struct pqi_ctrl_info *ctrl_info, + struct pqi_firmware_feature *firmware_feature) +{ + if (!firmware_feature->supported) + return; + + if (firmware_feature->enabled) { + dev_info(&ctrl_info->pci_dev->dev, + "%s enabled\n", firmware_feature->feature_name); + return; + } + + 
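+	/* Supported and requested, but the firmware did not enable it. */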
dev_err(&ctrl_info->pci_dev->dev, "failed to enable %s\n", + firmware_feature->feature_name); +} + +static void pqi_ctrl_update_feature_flags(struct pqi_ctrl_info *ctrl_info, + struct pqi_firmware_feature *firmware_feature) +{ + switch (firmware_feature->feature_bit) { + case PQI_FIRMWARE_FEATURE_RAID_1_WRITE_BYPASS: + ctrl_info->enable_r1_writes = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_RAID_5_WRITE_BYPASS: + ctrl_info->enable_r5_writes = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_RAID_6_WRITE_BYPASS: + ctrl_info->enable_r6_writes = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_SOFT_RESET_HANDSHAKE: + ctrl_info->soft_reset_handshake_supported = + firmware_feature->enabled && + ctrl_info->soft_reset_status; + break; + case PQI_FIRMWARE_FEATURE_RAID_IU_TIMEOUT: + ctrl_info->raid_iu_timeout_supported = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_TMF_IU_TIMEOUT: + ctrl_info->tmf_iu_timeout_supported = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_FW_TRIAGE: + ctrl_info->firmware_triage_supported = firmware_feature->enabled; + pqi_save_fw_triage_setting(ctrl_info, firmware_feature->enabled); + break; + case PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5: + ctrl_info->rpl_extended_format_4_5_supported = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_MULTI_LUN_DEVICE_SUPPORT: + ctrl_info->multi_lun_device_supported = + firmware_feature->enabled; + break; + } + + pqi_firmware_feature_status(ctrl_info, firmware_feature); +} + +static inline void pqi_firmware_feature_update(struct pqi_ctrl_info *ctrl_info, + struct pqi_firmware_feature *firmware_feature) +{ + if (firmware_feature->feature_status) + firmware_feature->feature_status(ctrl_info, firmware_feature); +} + +static DEFINE_MUTEX(pqi_firmware_features_mutex); + +static struct pqi_firmware_feature pqi_firmware_features[] = { + { + .feature_name = "Online Firmware Activation", + .feature_bit = PQI_FIRMWARE_FEATURE_OFA, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "Serial Management Protocol", + .feature_bit = PQI_FIRMWARE_FEATURE_SMP, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "Maximum Known Feature", + .feature_bit = PQI_FIRMWARE_FEATURE_MAX_KNOWN_FEATURE, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 0 Read Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_0_READ_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 1 Read Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_1_READ_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 5 Read Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_5_READ_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 6 Read Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_6_READ_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 0 Write Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_0_WRITE_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 1 Write Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_1_WRITE_BYPASS, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RAID 5 Write Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_5_WRITE_BYPASS, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RAID 6 Write Bypass", + .feature_bit = 
PQI_FIRMWARE_FEATURE_RAID_6_WRITE_BYPASS, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "New Soft Reset Handshake", + .feature_bit = PQI_FIRMWARE_FEATURE_SOFT_RESET_HANDSHAKE, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RAID IU Timeout", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_IU_TIMEOUT, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "TMF IU Timeout", + .feature_bit = PQI_FIRMWARE_FEATURE_TMF_IU_TIMEOUT, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RAID Bypass on encrypted logical volumes on NVMe", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_BYPASS_ON_ENCRYPTED_NVME, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "Firmware Triage", + .feature_bit = PQI_FIRMWARE_FEATURE_FW_TRIAGE, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RPL Extended Formats 4 and 5", + .feature_bit = PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "Multi-LUN Target", + .feature_bit = PQI_FIRMWARE_FEATURE_MULTI_LUN_DEVICE_SUPPORT, + .feature_status = pqi_ctrl_update_feature_flags, + }, +}; + +static void pqi_process_firmware_features( + struct pqi_config_table_section_info *section_info) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + struct pqi_config_table_firmware_features *firmware_features; + void __iomem *firmware_features_iomem_addr; + unsigned int i; + unsigned int num_features_supported; + + ctrl_info = section_info->ctrl_info; + firmware_features = section_info->section; + firmware_features_iomem_addr = section_info->section_iomem_addr; + + for (i = 0, num_features_supported = 0; + i < ARRAY_SIZE(pqi_firmware_features); i++) { + if (pqi_is_firmware_feature_supported(firmware_features, + pqi_firmware_features[i].feature_bit)) { + pqi_firmware_features[i].supported = true; + num_features_supported++; + } else { + pqi_firmware_feature_update(ctrl_info, + &pqi_firmware_features[i]); + } + } + + if (num_features_supported == 0) + return; + + for (i = 0; i < ARRAY_SIZE(pqi_firmware_features); i++) { + if (!pqi_firmware_features[i].supported) + continue; + pqi_request_firmware_feature(firmware_features, + pqi_firmware_features[i].feature_bit); + } + + rc = pqi_enable_firmware_features(ctrl_info, firmware_features, + firmware_features_iomem_addr); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to enable firmware features in PQI configuration table\n"); + for (i = 0; i < ARRAY_SIZE(pqi_firmware_features); i++) { + if (!pqi_firmware_features[i].supported) + continue; + pqi_firmware_feature_update(ctrl_info, + &pqi_firmware_features[i]); + } + return; + } + + for (i = 0; i < ARRAY_SIZE(pqi_firmware_features); i++) { + if (!pqi_firmware_features[i].supported) + continue; + if (pqi_is_firmware_feature_enabled(firmware_features, + firmware_features_iomem_addr, + pqi_firmware_features[i].feature_bit)) { + pqi_firmware_features[i].enabled = true; + } + pqi_firmware_feature_update(ctrl_info, + &pqi_firmware_features[i]); + } +} + +static void pqi_init_firmware_features(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(pqi_firmware_features); i++) { + pqi_firmware_features[i].supported = false; + pqi_firmware_features[i].enabled = false; + } +} + +static void pqi_process_firmware_features_section( + struct pqi_config_table_section_info *section_info) +{ + mutex_lock(&pqi_firmware_features_mutex); + pqi_init_firmware_features(); + 
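+	/*
+	 * Walk the shared feature table: request every feature the firmware
+	 * reports as supported, then record which ones it actually enabled.
+	 */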
+	pqi_process_firmware_features(section_info);
+	mutex_unlock(&pqi_firmware_features_mutex);
+}
+
+/*
+ * Reset all controller settings that can be initialized during the processing
+ * of the PQI Configuration Table.
+ */
+
+static void pqi_ctrl_reset_config(struct pqi_ctrl_info *ctrl_info)
+{
+	ctrl_info->heartbeat_counter = NULL;
+	ctrl_info->soft_reset_status = NULL;
+	ctrl_info->soft_reset_handshake_supported = false;
+	ctrl_info->enable_r1_writes = false;
+	ctrl_info->enable_r5_writes = false;
+	ctrl_info->enable_r6_writes = false;
+	ctrl_info->raid_iu_timeout_supported = false;
+	ctrl_info->tmf_iu_timeout_supported = false;
+	ctrl_info->firmware_triage_supported = false;
+	ctrl_info->rpl_extended_format_4_5_supported = false;
+	ctrl_info->multi_lun_device_supported = false;
+}
+
+static int pqi_process_config_table(struct pqi_ctrl_info *ctrl_info)
+{
+	u32 table_length;
+	u32 section_offset;
+	bool firmware_feature_section_present;
+	void __iomem *table_iomem_addr;
+	struct pqi_config_table *config_table;
+	struct pqi_config_table_section_header *section;
+	struct pqi_config_table_section_info section_info;
+	struct pqi_config_table_section_info feature_section_info = {0};
+
+	table_length = ctrl_info->config_table_length;
+	if (table_length == 0)
+		return 0;
+
+	config_table = kmalloc(table_length, GFP_KERNEL);
+	if (!config_table) {
+		dev_err(&ctrl_info->pci_dev->dev,
+			"failed to allocate memory for PQI configuration table\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * Copy the config table contents from I/O memory space into the
+	 * temporary buffer.
+	 */
+	table_iomem_addr = ctrl_info->iomem_base + ctrl_info->config_table_offset;
+	memcpy_fromio(config_table, table_iomem_addr, table_length);
+
+	firmware_feature_section_present = false;
+	section_info.ctrl_info = ctrl_info;
+	section_offset = get_unaligned_le32(&config_table->first_section_offset);
+
+	while (section_offset) {
+		section = (void *)config_table + section_offset;
+
+		section_info.section = section;
+		section_info.section_offset = section_offset;
+		section_info.section_iomem_addr = table_iomem_addr + section_offset;
+
+		switch (get_unaligned_le16(&section->section_id)) {
+		case PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES:
+			firmware_feature_section_present = true;
+			feature_section_info = section_info;
+			break;
+		case PQI_CONFIG_TABLE_SECTION_HEARTBEAT:
+			if (pqi_disable_heartbeat)
+				dev_warn(&ctrl_info->pci_dev->dev,
+					"heartbeat disabled by module parameter\n");
+			else
+				ctrl_info->heartbeat_counter =
+					table_iomem_addr +
+					section_offset +
+					offsetof(struct pqi_config_table_heartbeat,
+						heartbeat_counter);
+			break;
+		case PQI_CONFIG_TABLE_SECTION_SOFT_RESET:
+			ctrl_info->soft_reset_status =
+				table_iomem_addr +
+				section_offset +
+				offsetof(struct pqi_config_table_soft_reset,
+					soft_reset_status);
+			break;
+		}
+
+		section_offset = get_unaligned_le16(&section->next_section_offset);
+	}
+
+	/*
+	 * We process the firmware feature section after all other sections
+	 * have been processed so that the feature bit callbacks can take
+	 * into account the settings configured by other sections.
+	 */
+	if (firmware_feature_section_present)
+		pqi_process_firmware_features_section(&feature_section_info);
+
+	kfree(config_table);
+
+	return 0;
+}
+
+/* Switches the controller from PQI mode back into SIS mode.
*/ + +static int pqi_revert_to_sis_mode(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + + pqi_change_irq_mode(ctrl_info, IRQ_MODE_NONE); + rc = pqi_reset(ctrl_info); + if (rc) + return rc; + rc = sis_reenable_sis_mode(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "re-enabling SIS mode failed with error %d\n", rc); + return rc; + } + pqi_save_ctrl_mode(ctrl_info, SIS_MODE); + + return 0; +} + +/* + * If the controller isn't already in SIS mode, this function forces it into + * SIS mode. + */ + +static int pqi_force_sis_mode(struct pqi_ctrl_info *ctrl_info) +{ + if (!sis_is_firmware_running(ctrl_info)) + return -ENXIO; + + if (pqi_get_ctrl_mode(ctrl_info) == SIS_MODE) + return 0; + + if (sis_is_kernel_up(ctrl_info)) { + pqi_save_ctrl_mode(ctrl_info, SIS_MODE); + return 0; + } + + return pqi_revert_to_sis_mode(ctrl_info); +} + +static void pqi_perform_lockup_action(void) +{ + switch (pqi_lockup_action) { + case PANIC: + panic("FATAL: Smart Family Controller lockup detected"); + break; + case REBOOT: + emergency_restart(); + break; + case NONE: + default: + break; + } +} + +static int pqi_ctrl_init(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + u32 product_id; + + if (reset_devices) { + if (pqi_is_fw_triage_supported(ctrl_info)) { + rc = sis_wait_for_fw_triage_completion(ctrl_info); + if (rc) + return rc; + } + sis_soft_reset(ctrl_info); + ssleep(PQI_POST_RESET_DELAY_SECS); + } else { + rc = pqi_force_sis_mode(ctrl_info); + if (rc) + return rc; + } + + /* + * Wait until the controller is ready to start accepting SIS + * commands. + */ + rc = sis_wait_for_ctrl_ready(ctrl_info); + if (rc) { + if (reset_devices) { + dev_err(&ctrl_info->pci_dev->dev, + "kdump init failed with error %d\n", rc); + pqi_lockup_action = REBOOT; + pqi_perform_lockup_action(); + } + return rc; + } + + /* + * Get the controller properties. This allows us to determine + * whether or not it supports PQI mode. + */ + rc = sis_get_ctrl_properties(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining controller properties\n"); + return rc; + } + + rc = sis_get_pqi_capabilities(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining controller capabilities\n"); + return rc; + } + + product_id = sis_get_product_id(ctrl_info); + ctrl_info->product_id = (u8)product_id; + ctrl_info->product_revision = (u8)(product_id >> 8); + + if (ctrl_info->product_id != PQI_CTRL_PRODUCT_ID_GEN1) + ctrl_info->enable_stream_detection = true; + + if (reset_devices) { + if (ctrl_info->max_outstanding_requests > + PQI_MAX_OUTSTANDING_REQUESTS_KDUMP) + ctrl_info->max_outstanding_requests = + PQI_MAX_OUTSTANDING_REQUESTS_KDUMP; + } else { + if (ctrl_info->max_outstanding_requests > + PQI_MAX_OUTSTANDING_REQUESTS) + ctrl_info->max_outstanding_requests = + PQI_MAX_OUTSTANDING_REQUESTS; + } + + pqi_calculate_io_resources(ctrl_info); + + rc = pqi_alloc_error_buffer(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate PQI error buffer\n"); + return rc; + } + + /* + * If the function we are about to call succeeds, the + * controller will transition from legacy SIS mode + * into PQI mode. + */ + rc = sis_init_base_struct_addr(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error initializing PQI mode\n"); + return rc; + } + + /* Wait for the controller to complete the SIS -> PQI transition. 
*/ + rc = pqi_wait_for_pqi_mode_ready(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "transition to PQI mode failed\n"); + return rc; + } + + /* From here on, we are running in PQI mode. */ + ctrl_info->pqi_mode_enabled = true; + pqi_save_ctrl_mode(ctrl_info, PQI_MODE); + + rc = pqi_alloc_admin_queues(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate admin queues\n"); + return rc; + } + + rc = pqi_create_admin_queues(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating admin queues\n"); + return rc; + } + + rc = pqi_report_device_capability(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "obtaining device capability failed\n"); + return rc; + } + + rc = pqi_validate_device_capability(ctrl_info); + if (rc) + return rc; + + pqi_calculate_queue_resources(ctrl_info); + + rc = pqi_enable_msix_interrupts(ctrl_info); + if (rc) + return rc; + + if (ctrl_info->num_msix_vectors_enabled < ctrl_info->num_queue_groups) { + ctrl_info->max_msix_vectors = + ctrl_info->num_msix_vectors_enabled; + pqi_calculate_queue_resources(ctrl_info); + } + + rc = pqi_alloc_io_resources(ctrl_info); + if (rc) + return rc; + + rc = pqi_alloc_operational_queues(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate operational queues\n"); + return rc; + } + + pqi_init_operational_queues(ctrl_info); + + rc = pqi_create_queues(ctrl_info); + if (rc) + return rc; + + rc = pqi_request_irqs(ctrl_info); + if (rc) + return rc; + + pqi_change_irq_mode(ctrl_info, IRQ_MODE_MSIX); + + ctrl_info->controller_online = true; + + rc = pqi_process_config_table(ctrl_info); + if (rc) + return rc; + + pqi_start_heartbeat_timer(ctrl_info); + + if (ctrl_info->enable_r5_writes || ctrl_info->enable_r6_writes) { + rc = pqi_get_advanced_raid_bypass_config(ctrl_info); + if (rc) { /* Supported features not returned correctly. */ + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining advanced RAID bypass configuration\n"); + return rc; + } + ctrl_info->ciss_report_log_flags |= + CISS_REPORT_LOG_FLAG_DRIVE_TYPE_MIX; + } + + rc = pqi_enable_events(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error enabling events\n"); + return rc; + } + + /* Register with the SCSI subsystem. 
*/ + rc = pqi_register_scsi(ctrl_info); + if (rc) + return rc; + + rc = pqi_get_ctrl_product_details(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining product details\n"); + return rc; + } + + rc = pqi_get_ctrl_serial_number(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining ctrl serial number\n"); + return rc; + } + + rc = pqi_set_diag_rescan(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error enabling multi-lun rescan\n"); + return rc; + } + + rc = pqi_write_driver_version_to_host_wellness(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error updating host wellness\n"); + return rc; + } + + pqi_schedule_update_time_worker(ctrl_info); + + pqi_scan_scsi_devices(ctrl_info); + + return 0; +} + +static void pqi_reinit_queues(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct pqi_admin_queues *admin_queues; + struct pqi_event_queue *event_queue; + + admin_queues = &ctrl_info->admin_queues; + admin_queues->iq_pi_copy = 0; + admin_queues->oq_ci_copy = 0; + writel(0, admin_queues->oq_pi); + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + ctrl_info->queue_groups[i].iq_pi_copy[RAID_PATH] = 0; + ctrl_info->queue_groups[i].iq_pi_copy[AIO_PATH] = 0; + ctrl_info->queue_groups[i].oq_ci_copy = 0; + + writel(0, ctrl_info->queue_groups[i].iq_ci[RAID_PATH]); + writel(0, ctrl_info->queue_groups[i].iq_ci[AIO_PATH]); + writel(0, ctrl_info->queue_groups[i].oq_pi); + } + + event_queue = &ctrl_info->event_queue; + writel(0, event_queue->oq_pi); + event_queue->oq_ci_copy = 0; +} + +static int pqi_ctrl_init_resume(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + + rc = pqi_force_sis_mode(ctrl_info); + if (rc) + return rc; + + /* + * Wait until the controller is ready to start accepting SIS + * commands. + */ + rc = sis_wait_for_ctrl_ready_resume(ctrl_info); + if (rc) + return rc; + + /* + * Get the controller properties. This allows us to determine + * whether or not it supports PQI mode. + */ + rc = sis_get_ctrl_properties(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining controller properties\n"); + return rc; + } + + rc = sis_get_pqi_capabilities(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining controller capabilities\n"); + return rc; + } + + /* + * If the function we are about to call succeeds, the + * controller will transition from legacy SIS mode + * into PQI mode. + */ + rc = sis_init_base_struct_addr(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error initializing PQI mode\n"); + return rc; + } + + /* Wait for the controller to complete the SIS -> PQI transition. */ + rc = pqi_wait_for_pqi_mode_ready(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "transition to PQI mode failed\n"); + return rc; + } + + /* From here on, we are running in PQI mode. 
*/ + ctrl_info->pqi_mode_enabled = true; + pqi_save_ctrl_mode(ctrl_info, PQI_MODE); + + pqi_reinit_queues(ctrl_info); + + rc = pqi_create_admin_queues(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating admin queues\n"); + return rc; + } + + rc = pqi_create_queues(ctrl_info); + if (rc) + return rc; + + pqi_change_irq_mode(ctrl_info, IRQ_MODE_MSIX); + + ctrl_info->controller_online = true; + pqi_ctrl_unblock_requests(ctrl_info); + + pqi_ctrl_reset_config(ctrl_info); + + rc = pqi_process_config_table(ctrl_info); + if (rc) + return rc; + + pqi_start_heartbeat_timer(ctrl_info); + + if (ctrl_info->enable_r5_writes || ctrl_info->enable_r6_writes) { + rc = pqi_get_advanced_raid_bypass_config(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining advanced RAID bypass configuration\n"); + return rc; + } + ctrl_info->ciss_report_log_flags |= + CISS_REPORT_LOG_FLAG_DRIVE_TYPE_MIX; + } + + rc = pqi_enable_events(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error enabling events\n"); + return rc; + } + + rc = pqi_get_ctrl_product_details(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining product details\n"); + return rc; + } + + rc = pqi_set_diag_rescan(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error enabling multi-lun rescan\n"); + return rc; + } + + rc = pqi_write_driver_version_to_host_wellness(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error updating host wellness\n"); + return rc; + } + + if (pqi_ofa_in_progress(ctrl_info)) + pqi_ctrl_unblock_scan(ctrl_info); + + pqi_scan_scsi_devices(ctrl_info); + + return 0; +} + +static inline int pqi_set_pcie_completion_timeout(struct pci_dev *pci_dev, u16 timeout) +{ + return pcie_capability_clear_and_set_word(pci_dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_COMP_TIMEOUT, timeout); +} + +static int pqi_pci_init(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + u64 mask; + + rc = pci_enable_device(ctrl_info->pci_dev); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to enable PCI device\n"); + return rc; + } + + if (sizeof(dma_addr_t) > 4) + mask = DMA_BIT_MASK(64); + else + mask = DMA_BIT_MASK(32); + + rc = pqi_dma_set_mask_and_coherent(&ctrl_info->pci_dev->dev, mask); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, "failed to set DMA mask\n"); + goto disable_device; + } + + rc = pci_request_regions(ctrl_info->pci_dev, DRIVER_NAME_SHORT); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to obtain PCI resources\n"); + goto disable_device; + } + + ctrl_info->iomem_base = ioremap_nocache(pci_resource_start( + ctrl_info->pci_dev, 0), + pci_resource_len(ctrl_info->pci_dev, 0)); + if (!ctrl_info->iomem_base) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to map memory for controller registers\n"); + rc = -ENOMEM; + goto release_regions; + } + +#define PCI_EXP_COMP_TIMEOUT_65_TO_210_MS 0x6 + + /* Increase the PCIe completion timeout. */ + rc = pqi_set_pcie_completion_timeout(ctrl_info->pci_dev, + PCI_EXP_COMP_TIMEOUT_65_TO_210_MS); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to set PCIe completion timeout\n"); + goto release_regions; + } + + /* Enable bus mastering. 
*/ + pci_set_master(ctrl_info->pci_dev); + + ctrl_info->registers = ctrl_info->iomem_base; + ctrl_info->pqi_registers = &ctrl_info->registers->pqi_registers; + + pci_set_drvdata(ctrl_info->pci_dev, ctrl_info); + + return 0; + +release_regions: + pci_release_regions(ctrl_info->pci_dev); +disable_device: + pci_disable_device(ctrl_info->pci_dev); + + return rc; +} + +static void pqi_cleanup_pci_init(struct pqi_ctrl_info *ctrl_info) +{ + iounmap(ctrl_info->iomem_base); + pci_release_regions(ctrl_info->pci_dev); + if (pci_is_enabled(ctrl_info->pci_dev)) + pci_disable_device(ctrl_info->pci_dev); + pci_set_drvdata(ctrl_info->pci_dev, NULL); +} + +static struct pqi_ctrl_info *pqi_alloc_ctrl_info(int numa_node) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = kzalloc_node(sizeof(struct pqi_ctrl_info), + GFP_KERNEL, numa_node); + if (!ctrl_info) + return NULL; + + mutex_init(&ctrl_info->scan_mutex); + mutex_init(&ctrl_info->lun_reset_mutex); + mutex_init(&ctrl_info->ofa_mutex); + + INIT_LIST_HEAD(&ctrl_info->scsi_device_list); + spin_lock_init(&ctrl_info->scsi_device_list_lock); + + INIT_WORK(&ctrl_info->event_work, pqi_event_worker); + atomic_set(&ctrl_info->num_interrupts, 0); +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT + atomic_set(&ctrl_info->total_scmds_outstanding, 0); +#endif + INIT_DELAYED_WORK(&ctrl_info->rescan_work, pqi_rescan_worker); + INIT_DELAYED_WORK(&ctrl_info->update_time_work, pqi_update_time_worker); + + timer_setup(&ctrl_info->heartbeat_timer, pqi_heartbeat_timer_handler, 0); + INIT_WORK(&ctrl_info->ctrl_offline_work, pqi_ctrl_offline_worker); + + INIT_WORK(&ctrl_info->ofa_memory_alloc_work, pqi_ofa_memory_alloc_worker); + INIT_WORK(&ctrl_info->ofa_quiesce_work, pqi_ofa_quiesce_worker); + + sema_init(&ctrl_info->sync_request_sem, + PQI_RESERVED_IO_SLOTS_SYNCHRONOUS_REQUESTS); + init_waitqueue_head(&ctrl_info->block_requests_wait); + + ctrl_info->ctrl_id = atomic_inc_return(&pqi_controller_count) - 1; + ctrl_info->irq_mode = IRQ_MODE_NONE; + ctrl_info->max_msix_vectors = PQI_MAX_MSIX_VECTORS; + + ctrl_info->ciss_report_log_flags = CISS_REPORT_LOG_FLAG_UNIQUE_LUN_ID; + ctrl_info->max_transfer_encrypted_sas_sata = + PQI_DEFAULT_MAX_TRANSFER_ENCRYPTED_SAS_SATA; + ctrl_info->max_transfer_encrypted_nvme = + PQI_DEFAULT_MAX_TRANSFER_ENCRYPTED_NVME; + ctrl_info->max_write_raid_5_6 = PQI_DEFAULT_MAX_WRITE_RAID_5_6; + ctrl_info->max_write_raid_1_10_2drive = ~0; + ctrl_info->max_write_raid_1_10_3drive = ~0; + ctrl_info->disable_managed_interrupts = pqi_disable_managed_interrupts; + + return ctrl_info; +} + +static inline void pqi_free_ctrl_info(struct pqi_ctrl_info *ctrl_info) +{ + kfree(ctrl_info); +} + +static void pqi_free_interrupts(struct pqi_ctrl_info *ctrl_info) +{ + pqi_free_irqs(ctrl_info); + pqi_disable_msix_interrupts(ctrl_info); +} + +static void pqi_free_ctrl_resources(struct pqi_ctrl_info *ctrl_info) +{ + pqi_free_interrupts(ctrl_info); + if (ctrl_info->queue_memory_base) + dma_free_coherent(&ctrl_info->pci_dev->dev, + ctrl_info->queue_memory_length, + ctrl_info->queue_memory_base, + ctrl_info->queue_memory_base_dma_handle); + if (ctrl_info->admin_queue_memory_base) + dma_free_coherent(&ctrl_info->pci_dev->dev, + ctrl_info->admin_queue_memory_length, + ctrl_info->admin_queue_memory_base, + ctrl_info->admin_queue_memory_base_dma_handle); + pqi_free_all_io_requests(ctrl_info); + if (ctrl_info->error_buffer) + dma_free_coherent(&ctrl_info->pci_dev->dev, + ctrl_info->error_buffer_length, + ctrl_info->error_buffer, + ctrl_info->error_buffer_dma_handle); + if 
(ctrl_info->iomem_base) + pqi_cleanup_pci_init(ctrl_info); + pqi_free_ctrl_info(ctrl_info); +} + +static void pqi_remove_ctrl(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->controller_online = false; + pqi_stop_heartbeat_timer(ctrl_info); + pqi_ctrl_block_requests(ctrl_info); + pqi_cancel_rescan_worker(ctrl_info); + pqi_cancel_update_time_worker(ctrl_info); + if (ctrl_info->ctrl_removal_state == PQI_CTRL_SURPRISE_REMOVAL) { + pqi_fail_all_outstanding_requests(ctrl_info); + ctrl_info->pqi_mode_enabled = false; + } + pqi_unregister_scsi(ctrl_info); + if (ctrl_info->pqi_mode_enabled) + pqi_revert_to_sis_mode(ctrl_info); + pqi_free_ctrl_resources(ctrl_info); +} + +static void pqi_ofa_ctrl_quiesce(struct pqi_ctrl_info *ctrl_info) +{ + pqi_ctrl_block_scan(ctrl_info); + pqi_scsi_block_requests(ctrl_info); + pqi_ctrl_block_device_reset(ctrl_info); + pqi_ctrl_block_requests(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + pqi_stop_heartbeat_timer(ctrl_info); +} + +static void pqi_ofa_ctrl_unquiesce(struct pqi_ctrl_info *ctrl_info) +{ + pqi_start_heartbeat_timer(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + pqi_ctrl_unblock_scan(ctrl_info); +} + +static int pqi_ofa_alloc_mem(struct pqi_ctrl_info *ctrl_info, u32 total_size, u32 chunk_size) +{ + int i; + u32 sg_count; + struct device *dev; + struct pqi_ofa_memory *ofap; + struct pqi_sg_descriptor *mem_descriptor; + dma_addr_t dma_handle; + + ofap = ctrl_info->pqi_ofa_mem_virt_addr; + + sg_count = DIV_ROUND_UP(total_size, chunk_size); + if (sg_count == 0 || sg_count > PQI_OFA_MAX_SG_DESCRIPTORS) + goto out; + + ctrl_info->pqi_ofa_chunk_virt_addr = kmalloc(sg_count * sizeof(void *), GFP_KERNEL); + if (!ctrl_info->pqi_ofa_chunk_virt_addr) + goto out; + + dev = &ctrl_info->pci_dev->dev; + + for (i = 0; i < sg_count; i++) { + ctrl_info->pqi_ofa_chunk_virt_addr[i] = + dma_zalloc_coherent(dev, chunk_size, &dma_handle, GFP_KERNEL); + if (!ctrl_info->pqi_ofa_chunk_virt_addr[i]) + goto out_free_chunks; + mem_descriptor = &ofap->sg_descriptor[i]; + put_unaligned_le64((u64)dma_handle, &mem_descriptor->address); + put_unaligned_le32(chunk_size, &mem_descriptor->length); + } + + put_unaligned_le32(CISS_SG_LAST, &mem_descriptor->flags); + put_unaligned_le16(sg_count, &ofap->num_memory_descriptors); + put_unaligned_le32(sg_count * chunk_size, &ofap->bytes_allocated); + + return 0; + +out_free_chunks: + while (--i >= 0) { + mem_descriptor = &ofap->sg_descriptor[i]; + dma_free_coherent(dev, chunk_size, + ctrl_info->pqi_ofa_chunk_virt_addr[i], + get_unaligned_le64(&mem_descriptor->address)); + } + kfree(ctrl_info->pqi_ofa_chunk_virt_addr); + +out: + return -ENOMEM; +} + +static int pqi_ofa_alloc_host_buffer(struct pqi_ctrl_info *ctrl_info) +{ + u32 total_size; + u32 chunk_size; + u32 min_chunk_size; + + if (ctrl_info->ofa_bytes_requested == 0) + return 0; + + total_size = PAGE_ALIGN(ctrl_info->ofa_bytes_requested); + min_chunk_size = DIV_ROUND_UP(total_size, PQI_OFA_MAX_SG_DESCRIPTORS); + min_chunk_size = PAGE_ALIGN(min_chunk_size); + + for (chunk_size = total_size; chunk_size >= min_chunk_size;) { + if (pqi_ofa_alloc_mem(ctrl_info, total_size, chunk_size) == 0) + return 0; + chunk_size /= 2; + chunk_size = PAGE_ALIGN(chunk_size); + } + + return -ENOMEM; +} + +static void pqi_ofa_setup_host_buffer(struct pqi_ctrl_info *ctrl_info) +{ + struct device *dev; + struct pqi_ofa_memory *ofap; + + dev = &ctrl_info->pci_dev->dev; + + ofap = dma_zalloc_coherent(dev, sizeof(*ofap), 
+ &ctrl_info->pqi_ofa_mem_dma_handle, GFP_KERNEL); + if (!ofap) + return; + + ctrl_info->pqi_ofa_mem_virt_addr = ofap; + + if (pqi_ofa_alloc_host_buffer(ctrl_info) < 0) { + dev_err(dev, + "failed to allocate host buffer for Online Firmware Activation\n"); + dma_free_coherent(dev, sizeof(*ofap), ofap, ctrl_info->pqi_ofa_mem_dma_handle); + ctrl_info->pqi_ofa_mem_virt_addr = NULL; + return; + } + + put_unaligned_le16(PQI_OFA_VERSION, &ofap->version); + memcpy(&ofap->signature, PQI_OFA_SIGNATURE, sizeof(ofap->signature)); +} + +static void pqi_ofa_free_host_buffer(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct device *dev; + struct pqi_ofa_memory *ofap; + struct pqi_sg_descriptor *mem_descriptor; + unsigned int num_memory_descriptors; + + ofap = ctrl_info->pqi_ofa_mem_virt_addr; + if (!ofap) + return; + + dev = &ctrl_info->pci_dev->dev; + + if (get_unaligned_le32(&ofap->bytes_allocated) == 0) + goto out; + + mem_descriptor = ofap->sg_descriptor; + num_memory_descriptors = + get_unaligned_le16(&ofap->num_memory_descriptors); + + for (i = 0; i < num_memory_descriptors; i++) { + dma_free_coherent(dev, + get_unaligned_le32(&mem_descriptor[i].length), + ctrl_info->pqi_ofa_chunk_virt_addr[i], + get_unaligned_le64(&mem_descriptor[i].address)); + } + kfree(ctrl_info->pqi_ofa_chunk_virt_addr); + +out: + dma_free_coherent(dev, sizeof(*ofap), ofap, + ctrl_info->pqi_ofa_mem_dma_handle); + ctrl_info->pqi_ofa_mem_virt_addr = NULL; +} + +static int pqi_ofa_host_memory_update(struct pqi_ctrl_info *ctrl_info) +{ + u32 buffer_length; + struct pqi_vendor_general_request request; + struct pqi_ofa_memory *ofap; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_VENDOR_GENERAL; + put_unaligned_le16(sizeof(request) - PQI_REQUEST_HEADER_LENGTH, + &request.header.iu_length); + put_unaligned_le16(PQI_VENDOR_GENERAL_HOST_MEMORY_UPDATE, + &request.function_code); + + ofap = ctrl_info->pqi_ofa_mem_virt_addr; + + if (ofap) { + buffer_length = offsetof(struct pqi_ofa_memory, sg_descriptor) + + get_unaligned_le16(&ofap->num_memory_descriptors) * + sizeof(struct pqi_sg_descriptor); + + put_unaligned_le64((u64)ctrl_info->pqi_ofa_mem_dma_handle, + &request.data.ofa_memory_allocation.buffer_address); + put_unaligned_le32(buffer_length, + &request.data.ofa_memory_allocation.buffer_length); + } + + return pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); +} + +static int pqi_ofa_ctrl_restart(struct pqi_ctrl_info *ctrl_info, unsigned int delay_secs) +{ + ssleep(delay_secs); + + return pqi_ctrl_init_resume(ctrl_info); +} + +static struct pqi_raid_error_info pqi_ctrl_offline_raid_error_info = { + .data_out_result = PQI_DATA_IN_OUT_HARDWARE_ERROR, + .status = SAM_STAT_CHECK_CONDITION, +}; + +static void pqi_fail_all_outstanding_requests(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct pqi_io_request *io_request; + struct scsi_cmnd *scmd; + struct scsi_device *sdev; + + for (i = 0; i < ctrl_info->max_io_slots; i++) { + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_read(&io_request->refcount) == 0) + continue; + + scmd = io_request->scmd; + if (scmd) { + sdev = scmd->device; + if (!sdev || !scsi_device_online(sdev)) { + pqi_free_io_request(io_request); + continue; + } else { + set_host_byte(scmd, DID_NO_CONNECT); + } + } else { + io_request->status = -ENXIO; + io_request->error_info = + &pqi_ctrl_offline_raid_error_info; + } + + io_request->io_complete_callback(io_request, + io_request->context); + } +} + +static void 
pqi_take_ctrl_offline_deferred(struct pqi_ctrl_info *ctrl_info) +{ + pqi_perform_lockup_action(); + pqi_stop_heartbeat_timer(ctrl_info); + pqi_free_interrupts(ctrl_info); + pqi_cancel_rescan_worker(ctrl_info); + pqi_cancel_update_time_worker(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + pqi_fail_all_outstanding_requests(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); +} + +static void pqi_ctrl_offline_worker(struct work_struct *work) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = container_of(work, struct pqi_ctrl_info, ctrl_offline_work); + pqi_take_ctrl_offline_deferred(ctrl_info); +} + +static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason) +{ + if (!ctrl_info->controller_online) + return; + + ctrl_info->controller_online = false; + ctrl_info->pqi_mode_enabled = false; + pqi_ctrl_block_requests(ctrl_info); + if (!pqi_disable_ctrl_shutdown) + sis_shutdown_ctrl(ctrl_info, ctrl_shutdown_reason); + pci_disable_device(ctrl_info->pci_dev); + dev_err(&ctrl_info->pci_dev->dev, "controller offline\n"); + schedule_work(&ctrl_info->ctrl_offline_work); +} + +static void pqi_print_ctrl_info(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + char *ctrl_description; + + if (id->driver_data) + ctrl_description = (char *)id->driver_data; + else + ctrl_description = "Microchip Smart Family Controller"; + + dev_info(&pci_dev->dev, "%s found\n", ctrl_description); +} + +static int pqi_pci_probe(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + int rc; + int node; + struct pqi_ctrl_info *ctrl_info; + + pqi_print_ctrl_info(pci_dev, id); + + if (pqi_disable_device_id_wildcards && + id->subvendor == PCI_ANY_ID && + id->subdevice == PCI_ANY_ID) { + dev_warn(&pci_dev->dev, + "controller not probed because device ID wildcards are disabled\n"); + return -ENODEV; + } + + if (id->subvendor == PCI_ANY_ID || id->subdevice == PCI_ANY_ID) + dev_warn(&pci_dev->dev, + "controller device ID matched using wildcards\n"); + + node = dev_to_node(&pci_dev->dev); + if (node == NUMA_NO_NODE) { + node = cpu_to_node(0); + if (node == NUMA_NO_NODE) + node = 0; + set_dev_node(&pci_dev->dev, node); + } + + ctrl_info = pqi_alloc_ctrl_info(node); + if (!ctrl_info) { + dev_err(&pci_dev->dev, + "failed to allocate controller info block\n"); + return -ENOMEM; + } + + ctrl_info->pci_dev = pci_dev; + + rc = pqi_pci_init(ctrl_info); + if (rc) + goto error; + + rc = pqi_ctrl_init(ctrl_info); + if (rc) + goto error; + + return 0; + +error: + pqi_remove_ctrl(ctrl_info); + + return rc; +} + +static void pqi_pci_remove(struct pci_dev *pci_dev) +{ + struct pqi_ctrl_info *ctrl_info; + u16 vendor_id; + int rc; + + ctrl_info = pci_get_drvdata(pci_dev); + if (!ctrl_info) + return; + + pci_read_config_word(ctrl_info->pci_dev, PCI_SUBSYSTEM_VENDOR_ID, &vendor_id); + if (vendor_id == 0xffff) + ctrl_info->ctrl_removal_state = PQI_CTRL_SURPRISE_REMOVAL; + else + ctrl_info->ctrl_removal_state = PQI_CTRL_GRACEFUL_REMOVAL; + + if (ctrl_info->ctrl_removal_state == PQI_CTRL_GRACEFUL_REMOVAL) { + rc = pqi_flush_cache(ctrl_info, RESTART); + if (rc) + dev_err(&pci_dev->dev, + "unable to flush controller cache during remove\n"); + } + + pqi_remove_ctrl(ctrl_info); +} + +static void pqi_dump_request(struct pqi_ctrl_info *ctrl_info, + struct pqi_io_request *io_request) +{ + struct scsi_cmnd *scmd; + + scmd = io_request->scmd; + if (scmd) { + struct Scsi_Host *shost; + struct pqi_scsi_dev *device; + + if (scmd->device == NULL || scmd->device->host == NULL 
|| + scmd->device->hostdata == NULL) + return; + + shost = scmd->device->host; + device = scmd->device->hostdata; + + dev_warn(&ctrl_info->pci_dev->dev, + "%d:%d:%d:%d scsicmnd=[0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x] scmd=%p outstanding cmds = %d\n", + shost->host_no, device->bus, device->target, device->lun, + scmd->cmnd[0], scmd->cmnd[1], scmd->cmnd[2], scmd->cmnd[3], + scmd->cmnd[4], scmd->cmnd[5], scmd->cmnd[6], scmd->cmnd[7], + scmd->cmnd[8], scmd->cmnd[9], scmd->cmnd[10], scmd->cmnd[11], + scmd->cmnd[12], scmd->cmnd[13], scmd->cmnd[14], scmd->cmnd[15], + scmd, atomic_read(&device->scsi_cmds_outstanding[scmd->device->lun])); + } else { + struct pqi_iu_header *request; + + request = io_request->iu; + dev_warn(&ctrl_info->pci_dev->dev, + "sync cmd IU type = 0x%02x len = %u\n", + request->iu_type, get_unaligned_le16(&request->iu_length)); + } +} + +static void pqi_crash_if_pending_command(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct pqi_io_request *io_request; + bool pending = false; + + for (i = 0; i < ctrl_info->max_io_slots; i++) { + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_read(&io_request->refcount) == 0) + continue; + pqi_dump_request(ctrl_info, io_request); + pending = true; + } + BUG_ON(pending); +} + +static void pqi_shutdown(struct pci_dev *pci_dev) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + enum bmic_flush_cache_shutdown_event shutdown_event; + + ctrl_info = pci_get_drvdata(pci_dev); + if (!ctrl_info) { + dev_err(&pci_dev->dev, + "cache could not be flushed\n"); + return; + } + + pqi_wait_until_ofa_finished(ctrl_info); + + pqi_scsi_block_requests(ctrl_info); + pqi_ctrl_block_device_reset(ctrl_info); + pqi_ctrl_block_requests(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + + if (system_state == SYSTEM_RESTART) + shutdown_event = RESTART; + else + shutdown_event = SHUTDOWN; + + /* + * Write all data in the controller's battery-backed cache to + * storage. 
+ */ + rc = pqi_flush_cache(ctrl_info, shutdown_event); + if (rc) + dev_err(&pci_dev->dev, + "unable to flush controller cache\n"); + + pqi_crash_if_pending_command(ctrl_info); + pqi_reset(ctrl_info); +} + +static void pqi_process_lockup_action_param(void) +{ + unsigned int i; + + if (!pqi_lockup_action_param) + return; + + for (i = 0; i < ARRAY_SIZE(pqi_lockup_actions); i++) { + if (strcmp(pqi_lockup_action_param, + pqi_lockup_actions[i].name) == 0) { + pqi_lockup_action = pqi_lockup_actions[i].action; + return; + } + } + + pr_warn("%s: invalid lockup action setting \"%s\" - supported settings: none, reboot, panic\n", + DRIVER_NAME_SHORT, pqi_lockup_action_param); +} + +#define PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS 30 +#define PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS (30 * 60) + +static void pqi_process_ctrl_ready_timeout_param(void) +{ + if (pqi_ctrl_ready_timeout_secs == 0) + return; + + if (pqi_ctrl_ready_timeout_secs < PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS) { + pr_warn("%s: ctrl_ready_timeout parm of %u second(s) is less than minimum timeout of %d seconds - setting timeout to %d seconds\n", + DRIVER_NAME_SHORT, pqi_ctrl_ready_timeout_secs, PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS, PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS); + pqi_ctrl_ready_timeout_secs = PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS; + } else if (pqi_ctrl_ready_timeout_secs > PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS) { + pr_warn("%s: ctrl_ready_timeout parm of %u seconds is greater than maximum timeout of %d seconds - setting timeout to %d seconds\n", + DRIVER_NAME_SHORT, pqi_ctrl_ready_timeout_secs, PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS, PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS); + pqi_ctrl_ready_timeout_secs = PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS; + } + + sis_ctrl_ready_timeout_secs = pqi_ctrl_ready_timeout_secs; +} + +static void pqi_process_module_params(void) +{ + pqi_process_lockup_action_param(); + pqi_process_ctrl_ready_timeout_param(); +} + +#if defined(CONFIG_PM) + +static inline enum bmic_flush_cache_shutdown_event pqi_get_flush_cache_shutdown_event(struct pci_dev *pci_dev) +{ + if (pci_dev->subsystem_vendor == PCI_VENDOR_ID_ADAPTEC2 && pci_dev->subsystem_device == 0x1304) + return RESTART; + + return SUSPEND; +} + +static int pqi_suspend_or_freeze(struct device *dev, bool suspend) +{ + struct pci_dev *pci_dev; + struct pqi_ctrl_info *ctrl_info; + + pci_dev = to_pci_dev(dev); + ctrl_info = pci_get_drvdata(pci_dev); + + pqi_wait_until_ofa_finished(ctrl_info); + + pqi_ctrl_block_scan(ctrl_info); + pqi_scsi_block_requests(ctrl_info); + pqi_ctrl_block_device_reset(ctrl_info); + pqi_ctrl_block_requests(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + + if (suspend) { + enum bmic_flush_cache_shutdown_event shutdown_event; + + shutdown_event = pqi_get_flush_cache_shutdown_event(pci_dev); + pqi_flush_cache(ctrl_info, shutdown_event); + } + + pqi_stop_heartbeat_timer(ctrl_info); + pqi_crash_if_pending_command(ctrl_info); + pqi_free_irqs(ctrl_info); + + ctrl_info->controller_online = false; + ctrl_info->pqi_mode_enabled = false; + + return 0; +} + +static int pqi_suspend(struct device *dev) +{ + return pqi_suspend_or_freeze(dev, true); +} + +static int pqi_resume_or_restore(struct device *dev) +{ + int rc; + struct pci_dev *pci_dev; + struct pqi_ctrl_info *ctrl_info; + + pci_dev = to_pci_dev(dev); + ctrl_info = pci_get_drvdata(pci_dev); + + rc = pqi_request_irqs(ctrl_info); + if (rc) + return rc; + + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + 
pqi_ctrl_unblock_scan(ctrl_info); + + ssleep(PQI_POST_RESET_DELAY_SECS); + + return pqi_ctrl_init_resume(ctrl_info); +} + +static int pqi_freeze(struct device *dev) +{ + return pqi_suspend_or_freeze(dev, false); +} + +static int pqi_thaw(struct device *dev) +{ + int rc; + struct pci_dev *pci_dev; + struct pqi_ctrl_info *ctrl_info; + + pci_dev = to_pci_dev(dev); + ctrl_info = pci_get_drvdata(pci_dev); + + rc = pqi_request_irqs(ctrl_info); + if (rc) + return rc; + + ctrl_info->controller_online = true; + ctrl_info->pqi_mode_enabled = true; + + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + pqi_ctrl_unblock_scan(ctrl_info); + + return 0; +} + +static int pqi_poweroff(struct device *dev) +{ + struct pci_dev *pci_dev; + struct pqi_ctrl_info *ctrl_info; + enum bmic_flush_cache_shutdown_event shutdown_event; + + pci_dev = to_pci_dev(dev); + ctrl_info = pci_get_drvdata(pci_dev); + + shutdown_event = pqi_get_flush_cache_shutdown_event(pci_dev); + pqi_flush_cache(ctrl_info, shutdown_event); + + return 0; +} + +static const struct dev_pm_ops pqi_pm_ops = { + .suspend = pqi_suspend, + .resume = pqi_resume_or_restore, + .freeze = pqi_freeze, + .thaw = pqi_thaw, + .poweroff = pqi_poweroff, + .restore = pqi_resume_or_restore, +}; + +#endif /* CONFIG_PM */ + +/* Define the PCI IDs for the controllers that we support. */ +static const struct pci_device_id pqi_pci_id_table[] = { + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FOXCONN, 0x1211) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FOXCONN, 0x1321) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a22) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a23) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a24) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a36) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a37) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1104) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1105) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1106) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1107) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1108) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1109) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x110b) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x8460) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x8461) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0xc460) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0xc461) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0xf460) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0xf461) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0045) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0046) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0047) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 
PCI_VENDOR_ID_INSPUR, 0x0048) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x004a) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x004b) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x004c) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x004f) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0051) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0052) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0053) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0054) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x006b) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x006c) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x006d) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x006f) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0070) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0071) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0072) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0086) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0087) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0088) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0089) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd227) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd228) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd229) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd22a) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd22b) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd22c) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0110) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0608) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0659) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0800) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0801) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0802) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0803) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0804) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0805) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0806) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0807) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0808) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0809) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 
PCI_VENDOR_ID_ADAPTEC2, 0x080a) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0900) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0901) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0902) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0903) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0904) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0905) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0906) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0907) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0908) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x090a) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1200) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1201) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1202) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1280) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1281) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1282) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1300) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1301) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1302) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1303) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1304) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1380) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1400) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1402) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1410) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1411) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1412) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1420) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1430) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1440) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1441) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1450) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1452) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1460) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1461) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1462) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1463) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1470) + }, + { + 
PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1471) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1472) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1473) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1474) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1475) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1480) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1490) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1491) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a1) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a2) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a4) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a5) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a6) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14b0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14b1) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c1) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c2) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c3) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c4) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14d0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14e0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14f0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADVANTECH, 0x8312) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_DELL, 0x1fe0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0600) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0601) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0602) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0603) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0609) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0650) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0651) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0652) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0653) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0654) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0655) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0700) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0701) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, 
+ PCI_VENDOR_ID_HP, 0x1001) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x1002) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x1100) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x1101) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x0294) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x02db) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x02dc) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x032e) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x036f) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x0381) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x0382) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x0383) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FIBERHOME, 0x0800) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FIBERHOME, 0x0908) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FIBERHOME, 0x0806) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FIBERHOME, 0x0916) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_GIGABYTE, 0x1000) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_NTCOM, 0x3161) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_NT, 0x3161) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0804) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0805) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0806) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x5445) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x5446) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x5447) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x5449) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544A) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544B) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544D) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544E) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544F) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x54DA) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x54DB) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x54DC) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0b27) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0b29) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0b45) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_RAMAXEL, 0x0101) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_RAMAXEL, 0x0201) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0220) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0221) + }, 
+ { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0520) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0522) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0620) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0621) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0622) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0623) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_IBM, 0x0718) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1e93, 0x1000) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1e93, 0x1001) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1e93, 0x1002) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1e93, 0x1005) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1001) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1002) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1003) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1004) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1005) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1006) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1007) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1008) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1009) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x100A) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_ANY_ID, PCI_ANY_ID) + }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, pqi_pci_id_table); + +static struct pci_driver pqi_pci_driver = { + .name = DRIVER_NAME_SHORT, + .id_table = pqi_pci_id_table, + .probe = pqi_pci_probe, + .remove = pqi_pci_remove, + .shutdown = pqi_shutdown, +#if defined(CONFIG_PM) + .driver = { + .pm = &pqi_pm_ops + }, +#endif +}; + +static int __init pqi_init(void) +{ + int rc; + + pr_info(DRIVER_NAME "\n"); + pqi_verify_structures(); + sis_verify_structures(); + + pqi_sas_transport_template = sas_attach_transport(&pqi_sas_transport_functions); + if (!pqi_sas_transport_template) + return -ENODEV; + + pqi_process_module_params(); + + rc = pci_register_driver(&pqi_pci_driver); + if (rc) + sas_release_transport(pqi_sas_transport_template); + + return rc; +} + +static void __exit pqi_cleanup(void) +{ + pci_unregister_driver(&pqi_pci_driver); + sas_release_transport(pqi_sas_transport_template); +} + +module_init(pqi_init); +module_exit(pqi_cleanup); + +static void pqi_verify_structures(void) +{ + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_host_to_ctrl_doorbell) != 0x20); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_interrupt_mask) != 0x34); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_ctrl_to_host_doorbell) != 0x9c); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_ctrl_to_host_doorbell_clear) != 0xa0); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_driver_scratch) != 0xb0); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_product_identifier) != 0xb4); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_firmware_status) != 0xbc); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_ctrl_shutdown_reason_code) != 0xcc); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_mailbox) != 
0x1000); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + pqi_registers) != 0x4000); + + BUILD_BUG_ON(offsetof(struct pqi_iu_header, + iu_type) != 0x0); + BUILD_BUG_ON(offsetof(struct pqi_iu_header, + iu_length) != 0x2); + BUILD_BUG_ON(offsetof(struct pqi_iu_header, + response_queue_id) != 0x4); + BUILD_BUG_ON(offsetof(struct pqi_iu_header, + driver_flags) != 0x6); + BUILD_BUG_ON(sizeof(struct pqi_iu_header) != 0x8); + + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + status) != 0x0); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + service_response) != 0x1); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + data_present) != 0x2); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + reserved) != 0x3); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + residual_count) != 0x4); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + data_length) != 0x8); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + reserved1) != 0xa); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + data) != 0xc); + BUILD_BUG_ON(sizeof(struct pqi_aio_error_info) != 0x10c); + + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data_in_result) != 0x0); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data_out_result) != 0x1); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + reserved) != 0x2); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + status) != 0x5); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + status_qualifier) != 0x6); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + sense_data_length) != 0x8); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + response_data_length) != 0xa); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data_in_transferred) != 0xc); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data_out_transferred) != 0x10); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data) != 0x14); + BUILD_BUG_ON(sizeof(struct pqi_raid_error_info) != 0x114); + + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + signature) != 0x0); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + function_and_status_code) != 0x8); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + max_admin_iq_elements) != 0x10); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + max_admin_oq_elements) != 0x11); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_element_length) != 0x12); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_element_length) != 0x13); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + max_reset_timeout) != 0x14); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + legacy_intx_status) != 0x18); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + legacy_intx_mask_set) != 0x1c); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + legacy_intx_mask_clear) != 0x20); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + device_status) != 0x40); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_pi_offset) != 0x48); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_ci_offset) != 0x50); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_element_array_addr) != 0x58); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_element_array_addr) != 0x60); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_ci_addr) != 0x68); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_pi_addr) != 0x70); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_num_elements) != 0x78); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + 
admin_oq_num_elements) != 0x79); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_queue_int_msg_num) != 0x7a); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + device_error) != 0x80); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + error_details) != 0x88); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + device_reset) != 0x90); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + power_action) != 0x94); + BUILD_BUG_ON(sizeof(struct pqi_device_registers) != 0x100); + + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + header.driver_flags) != 6); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + function_code) != 10); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.report_device_capability.buffer_length) != 44); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.report_device_capability.sg_descriptor) != 48); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.queue_id) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.element_array_addr) != 16); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.ci_addr) != 24); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.num_elements) != 32); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.element_length) != 34); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.queue_protocol) != 36); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.queue_id) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.element_array_addr) != 16); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.pi_addr) != 24); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.num_elements) != 32); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.element_length) != 34); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.queue_protocol) != 36); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.int_msg_num) != 40); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.coalescing_count) != 42); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.min_coalescing_time) != 44); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.max_coalescing_time) != 48); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.delete_operational_queue.queue_id) != 12); + BUILD_BUG_ON(sizeof(struct pqi_general_admin_request) != 64); + BUILD_BUG_ON(FIELD_SIZEOF(struct pqi_general_admin_request, + data.create_operational_iq) != 64 - 11); + BUILD_BUG_ON(FIELD_SIZEOF(struct pqi_general_admin_request, + data.create_operational_oq) != 64 - 11); + BUILD_BUG_ON(FIELD_SIZEOF(struct pqi_general_admin_request, + data.delete_operational_queue) != 64 - 11); + + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + 
header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + header.driver_flags) != 6); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + function_code) != 10); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + status) != 11); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + data.create_operational_iq.status_descriptor) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + data.create_operational_iq.iq_pi_offset) != 16); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + data.create_operational_oq.status_descriptor) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + data.create_operational_oq.oq_ci_offset) != 16); + BUILD_BUG_ON(sizeof(struct pqi_general_admin_response) != 64); + + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + header.response_queue_id) != 4); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + header.driver_flags) != 6); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + nexus_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + buffer_length) != 12); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + lun_number) != 16); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + protocol_specific) != 24); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + error_index) != 27); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + cdb) != 32); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + timeout) != 60); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + sg_descriptors) != 64); + BUILD_BUG_ON(sizeof(struct pqi_raid_path_request) != + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + header.response_queue_id) != 4); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + header.driver_flags) != 6); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + nexus_id) != 12); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + buffer_length) != 16); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + data_encryption_key_index) != 22); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + encrypt_tweak_lower) != 24); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + encrypt_tweak_upper) != 28); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + cdb) != 32); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + error_index) != 48); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + num_sg_descriptors) != 50); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + cdb_length) != 51); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + lun_number) != 52); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + sg_descriptors) != 64); + BUILD_BUG_ON(sizeof(struct pqi_aio_path_request) != + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + BUILD_BUG_ON(offsetof(struct pqi_io_response, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_io_response, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_io_response, + 
request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_io_response, + error_index) != 10); + + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + header.response_queue_id) != 4); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.report_event_configuration.buffer_length) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.report_event_configuration.sg_descriptors) != 16); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.set_event_configuration.global_event_oq_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.set_event_configuration.buffer_length) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.set_event_configuration.sg_descriptors) != 16); + + BUILD_BUG_ON(offsetof(struct pqi_iu_layer_descriptor, + max_inbound_iu_length) != 6); + BUILD_BUG_ON(offsetof(struct pqi_iu_layer_descriptor, + max_outbound_iu_length) != 14); + BUILD_BUG_ON(sizeof(struct pqi_iu_layer_descriptor) != 16); + + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + data_length) != 0); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + iq_arbitration_priority_support_bitmask) != 8); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + maximum_aw_a) != 9); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + maximum_aw_b) != 10); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + maximum_aw_c) != 11); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_inbound_queues) != 16); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_elements_per_iq) != 18); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_iq_element_length) != 24); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + min_iq_element_length) != 26); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_outbound_queues) != 30); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_elements_per_oq) != 32); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + intr_coalescing_time_granularity) != 34); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_oq_element_length) != 36); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + min_oq_element_length) != 38); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + iu_layer_descriptors) != 64); + BUILD_BUG_ON(sizeof(struct pqi_device_capability) != 576); + + BUILD_BUG_ON(offsetof(struct pqi_event_descriptor, + event_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_event_descriptor, + oq_id) != 2); + BUILD_BUG_ON(sizeof(struct pqi_event_descriptor) != 4); + + BUILD_BUG_ON(offsetof(struct pqi_event_config, + num_event_descriptors) != 2); + BUILD_BUG_ON(offsetof(struct pqi_event_config, + descriptors) != 4); + + BUILD_BUG_ON(PQI_NUM_SUPPORTED_EVENTS != + ARRAY_SIZE(pqi_supported_event_types)); + + BUILD_BUG_ON(offsetof(struct pqi_event_response, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + event_type) != 8); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + event_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + additional_event_id) != 12); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + data) != 16); + 
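/*
+ * These BUILD_BUG_ON() checks turn any layout drift into a compile-time
+ * failure: if a member of one of these host/firmware interface structures
+ * ever moves from the byte offset the firmware expects, the driver stops
+ * building instead of sending a malformed IU at runtime.
+ */
+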
BUILD_BUG_ON(sizeof(struct pqi_event_response) != 32); + + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + event_type) != 8); + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + event_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + additional_event_id) != 12); + BUILD_BUG_ON(sizeof(struct pqi_event_acknowledge_request) != 16); + + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + nexus_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + timeout) != 14); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + lun_number) != 16); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + protocol_specific) != 24); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + outbound_queue_id_to_manage) != 26); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + request_id_to_manage) != 28); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + task_management_function) != 30); + BUILD_BUG_ON(sizeof(struct pqi_task_management_request) != 32); + + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + nexus_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + additional_response_info) != 12); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + response_code) != 15); + BUILD_BUG_ON(sizeof(struct pqi_task_management_response) != 16); + + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + configured_logical_drive_count) != 0); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + configuration_signature) != 1); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + firmware_version_short) != 5); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + extended_logical_unit_count) != 154); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + firmware_build_number) != 190); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + vendor_id) != 200); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + product_id) != 208); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + extra_controller_flags) != 286); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + controller_mode) != 292); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + spare_part_number) != 293); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + firmware_version_long) != 325); + + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + phys_bay_in_box) != 115); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + device_type) != 120); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + redundant_path_present_map) != 1736); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + active_path_number) != 1738); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + alternate_paths_phys_connector) != 1739); + 
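/*
+ * The BMIC identify buffers are much larger than the PQI IUs; only the
+ * handful of fields the driver actually consumes are spot-checked here.
+ */
+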
BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + alternate_paths_phys_box_on_port) != 1755); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + current_queue_depth_limit) != 1796); + BUILD_BUG_ON(sizeof(struct bmic_identify_physical_device) != 2560); + + BUILD_BUG_ON(sizeof(struct bmic_sense_feature_buffer_header) != 4); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_buffer_header, + page_code) != 0); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_buffer_header, + subpage_code) != 1); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_buffer_header, + buffer_length) != 2); + + BUILD_BUG_ON(sizeof(struct bmic_sense_feature_page_header) != 4); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_page_header, + page_code) != 0); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_page_header, + subpage_code) != 1); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_page_header, + page_length) != 2); + + BUILD_BUG_ON(sizeof(struct bmic_sense_feature_io_page_aio_subpage) + != 18); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + header) != 0); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + firmware_read_support) != 4); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + driver_read_support) != 5); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + firmware_write_support) != 6); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + driver_write_support) != 7); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_transfer_encrypted_sas_sata) != 8); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_transfer_encrypted_nvme) != 10); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_write_raid_5_6) != 12); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_write_raid_1_10_2drive) != 14); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_write_raid_1_10_3drive) != 16); + + BUILD_BUG_ON(PQI_ADMIN_IQ_NUM_ELEMENTS > 255); + BUILD_BUG_ON(PQI_ADMIN_OQ_NUM_ELEMENTS > 255); + BUILD_BUG_ON(PQI_ADMIN_IQ_ELEMENT_LENGTH % + PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT != 0); + BUILD_BUG_ON(PQI_ADMIN_OQ_ELEMENT_LENGTH % + PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT != 0); + BUILD_BUG_ON(PQI_OPERATIONAL_IQ_ELEMENT_LENGTH > 1048560); + BUILD_BUG_ON(PQI_OPERATIONAL_IQ_ELEMENT_LENGTH % + PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT != 0); + BUILD_BUG_ON(PQI_OPERATIONAL_OQ_ELEMENT_LENGTH > 1048560); + BUILD_BUG_ON(PQI_OPERATIONAL_OQ_ELEMENT_LENGTH % + PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT != 0); + + BUILD_BUG_ON(PQI_RESERVED_IO_SLOTS >= PQI_MAX_OUTSTANDING_REQUESTS); + BUILD_BUG_ON(PQI_RESERVED_IO_SLOTS >= + PQI_MAX_OUTSTANDING_REQUESTS_KDUMP); +} diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c new file mode 100644 index 0000000000000..da21e39d1e189 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c @@ -0,0 +1,460 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#include +#include +#include +#include +#include "smartpqi.h" +#include "smartpqi_kernel_compat.h" +#if KFEATURE_ENABLE_SCSI_MAP_QUEUES +#include +#endif +extern struct device_attribute *pqi_ncq_prio_sdev_attrs; + +#if !KFEATURE_HAS_2011_03_QUEUECOMMAND + +int pqi_scsi_queue_command_compat(struct scsi_cmnd *scmd, + void (*done)(struct scsi_cmnd *)) +{ + scmd->SCp.ptr = (char *)done; + + return pqi_scsi_queue_command(scmd->device->host, scmd); +} + +#endif /* !KFEATURE_HAS_2011_03_QUEUECOMMAND */ + +#if !KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE + +int pci_enable_msix_range(struct pci_dev *pci_dev, struct msix_entry *entries, + int minvec, int maxvec) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + do { + rc = pci_enable_msix(pci_dev, entries, nvec); + if (rc < 0) + return rc; + if (rc > 0) { + if (rc < minvec) + return -ENOSPC; + nvec = rc; + } + } while (rc); + + return nvec; +} + +#endif /* !KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE */ + +#if !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH + +int scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth) +{ + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), queue_depth); + + return queue_depth; +} + +static int pqi_change_queue_depth(struct scsi_device *sdev, int qdepth, + int reason) +{ + if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { + struct pqi_scsi_dev *device = sdev->hostdata; + + if (!device) + return -ENODEV; + + if (qdepth < 1) + qdepth = 1; + else if (qdepth > device->queue_depth) + qdepth = device->queue_depth; + + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); + + } else if (reason == SCSI_QDEPTH_QFULL) + scsi_track_queue_full(sdev, qdepth); + else + return -ENOTSUPP; + + return sdev->queue_depth; +} + +static int pqi_change_queue_type(struct scsi_device *sdev, int tag_type) +{ + if (sdev->tagged_supported) { + scsi_set_tag_type(sdev, tag_type); + if (tag_type) + scsi_activate_tcq(sdev, sdev->queue_depth); + else + scsi_deactivate_tcq(sdev, sdev->queue_depth); + } else { + tag_type = 0; + } + + return tag_type; +} + +#endif /* !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH */ + +#if KFEATURE_ENABLE_SCSI_MAP_QUEUES +static int pqi_map_queues(struct Scsi_Host *shost) +{ + struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + + if (!ctrl_info->disable_managed_interrupts) { +#if KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 + return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev); +#elif KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 + return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0); +#elif KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + ctrl_info->pci_dev, 0); +#else + #error "A version for KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES has not been defined." +#endif + } else { +#if KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 + return blk_mq_map_queues(&shost->tag_set); +#elif KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 + return blk_mq_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT]); +#else + #error "A version for KFEATURE_HAS_BLK_MQ_MAP_QUEUES has not been defined." 
+#endif + } +} +#endif /* KFEATURE_ENABLE_SCSI_MAP_QUEUES */ + +void pqi_compat_init_scsi_host_template(struct scsi_host_template *hostt) +{ +#if !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH + hostt->change_queue_depth = pqi_change_queue_depth; + hostt->change_queue_type = pqi_change_queue_type; +#endif /* !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH */ +#if KFEATURE_HAS_LOCKLESS_DISPATCH_IO + hostt->lockless = 1; +#endif +#if KFEATURE_HAS_USE_CLUSTERING + hostt->use_clustering = ENABLE_CLUSTERING; +#endif +#if KFEATURE_ENABLE_SCSI_MAP_QUEUES + hostt->map_queues = pqi_map_queues; +#endif +} + +void pqi_compat_init_scsi_host(struct Scsi_Host *shost, + struct pqi_ctrl_info *ctrl_info) +{ +#if KFEATURE_HAS_MQ_SUPPORT + shost->nr_hw_queues = ctrl_info->num_queue_groups; +#endif /* KFEATURE_HAS_MQ_SUPPORT */ +} + +#if !KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING + +void scsi_sanitize_inquiry_string(unsigned char *s, int len) +{ + bool terminated = false; + + for (; len > 0; (--len, ++s)) { + if (*s == 0) + terminated = true; + if (terminated || *s < 0x20 || *s > 0x7e) + *s = ' '; + } +} + +#endif /* !KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING */ + +#if !KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT + +#if defined(RHEL6U3) +/* + * Note that these accessor functions are only for the "PCI Express + * Capability" (see PCIe spec r3.0, sec 7.8). They do not apply to the + * other "PCI Express Extended Capabilities" (AER, VC, ACS, MFVC, etc.) + */ +int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val) +{ + int ret; + + *val = 0; + if (pos & 1) + return -EINVAL; + + ret = pci_read_config_word(dev, pci_pcie_cap(dev) + pos, val); + /* + * Reset *val to 0 if pci_read_config_word() fails, it may + * have been written as 0xFFFF if hardware error happens + * during pci_read_config_word(). 
+ */ + if (ret) + *val = 0; + return ret; +} + +int pcie_capability_write_word(struct pci_dev *dev, int pos, u16 val) +{ + if (pos & 1) + return -EINVAL; + + return pci_write_config_word(dev, pci_pcie_cap(dev) + pos, val); +} + +#endif /* RHEL6U3 */ + +int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos, + u16 clear, u16 set) +{ + int ret; + u16 val; + + ret = pcie_capability_read_word(dev, pos, &val); + if (!ret) { + val &= ~clear; + val |= set; + ret = pcie_capability_write_word(dev, pos, val); + } + + return ret; +} + +#endif + +#if !KFEATURE_HAS_BSG_JOB_SMP_HANDLER + +static int pqi_bsg_map_buffer(struct bsg_buffer *buf, struct request *req) +{ + size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments); + + if (!req->nr_phys_segments) { + WARN_ON(!req->nr_phys_segments); + return -EINVAL; + } + + buf->sg_list = kzalloc(sz, GFP_KERNEL); + if (!buf->sg_list) + return -ENOMEM; + sg_init_table(buf->sg_list, req->nr_phys_segments); + buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->payload_len = blk_rq_bytes(req); + return 0; +} + +static int pqi_bsg_prepare_job(struct bsg_job *job, struct request *rq) +{ + struct request *rsp = rq->next_rq; + int ret; +#if KFEATURE_HAS_SCSI_REQUEST + struct scsi_request *req = scsi_req(rq); +#else + struct request *req = rq; +#endif + + job->request = req->cmd; + job->request_len = req->cmd_len; + job->reply = req->sense; + + if (rq->bio) { + ret = pqi_bsg_map_buffer(&job->request_payload, rq); + if (ret) + goto failjob_rls_job; + } + + if (rsp && rsp->bio) { + ret = pqi_bsg_map_buffer(&job->reply_payload, rsp); + if (ret) + goto failjob_rls_rqst_payload; + } + + return 0; + +failjob_rls_rqst_payload: + kfree(job->request_payload.sg_list); +failjob_rls_job: + return -ENOMEM; +} + +struct bsg_return_data { + int result; + unsigned int reply_payload_rcv_len; +}; +static struct bsg_return_data bsg_ret; + +void pqi_bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + bsg_ret.result = result; + bsg_ret.reply_payload_rcv_len = reply_payload_rcv_len; + complete(job->dd_data); +} + +int pqi_sas_smp_handler_compat(struct Scsi_Host *shost, struct sas_rphy *rphy, + struct request *rq) +{ + struct bsg_job *job; + struct completion bsg_job; +#if KFEATURE_HAS_SCSI_REQUEST + struct scsi_request *req = scsi_req(rq); + struct scsi_request *resp = scsi_req(rq->next_rq); +#else + struct request *req = rq; + struct request *resp = req->next_rq; +#endif + + init_completion(&bsg_job); + job = kzalloc(sizeof(struct bsg_job), GFP_KERNEL); + if (!job) + return -ENOMEM; + job->dd_data = &bsg_job; + + pqi_bsg_prepare_job(job, rq); + pqi_sas_smp_handler(job, shost, rphy); + + wait_for_completion(&bsg_job); + + req->sense_len = job->reply_len; + memcpy(req->sense, job->reply, job->reply_len); + + resp->resid_len -= min(bsg_ret.reply_payload_rcv_len, resp->resid_len); + req->resid_len = 0; + + kfree(job); + return bsg_ret.result; +} + +#endif /* !KFEATURE_HAS_BSG_JOB_SMP_HANDLER */ + +int pqi_pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ +#if KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS + return pci_irq_vector(dev, nr); +#else + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = pci_get_drvdata(dev); + if (ctrl_info->irq_mode == IRQ_MODE_INTX) + return dev->irq; + else + return ctrl_info->msix_vectors[nr]; +#endif +} + +void pqi_pci_free_irq_vectors(struct pci_dev *dev) +{ +#if KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS + pci_free_irq_vectors(dev); +#else + pci_disable_msix(dev); +#endif +} + +int 
pqi_pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ +#if KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS + return pci_alloc_irq_vectors(dev, min_vecs, max_vecs, flags); +#else + unsigned int i; + int num_vectors_enabled; + struct pqi_ctrl_info *ctrl_info; + struct msix_entry msix_entries[PQI_MAX_MSIX_VECTORS]; + + ctrl_info = pci_get_drvdata(dev); + + for (i = 0; i < max_vecs; i++) + msix_entries[i].entry = i; + + num_vectors_enabled = pci_enable_msix_range(dev, msix_entries, min_vecs, + max_vecs); + + for (i = 0; i < num_vectors_enabled; i++) { + ctrl_info->msix_vectors[i] = msix_entries[i].vector; + ctrl_info->intr_data[i] = &ctrl_info->queue_groups[i]; + } + + return num_vectors_enabled; +#endif +} + +#if KFEATURE_HAS_SCSI_CMD_PRIV +struct pqi_cmd_priv *pqi_cmd_priv(struct scsi_cmnd *cmd) +{ + return scsi_cmd_priv(cmd); +} +#endif + +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT + +struct pqi_io_request *pqi_get_io_request(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) +{ + struct pqi_io_request *io_request; + u16 i = smp_processor_id() * ctrl_info->per_cpu_factor; + + while (1) { + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_inc_return(&io_request->refcount) == 1) + break; + atomic_dec(&io_request->refcount); + i = (i + 1) % ctrl_info->max_io_slots; + } + + return io_request; +} + +#else + +struct pqi_io_request *pqi_get_io_request(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) +{ + struct pqi_io_request *io_request; + u16 i; + + if (scmd) { + u32 blk_tag = blk_mq_unique_tag(PQI_SCSI_REQUEST(scmd)); + + i = blk_mq_unique_tag_to_tag(blk_tag); + if (i < 0 || i >= ctrl_info->scsi_ml_can_queue) + return NULL; + + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_inc_return(&io_request->refcount) > 1) { + atomic_dec(&io_request->refcount); + return NULL; + } + } else { + /* + * benignly racy - may have to wait for an open slot. + */ + i = 0; + while (1) { + io_request = &ctrl_info->io_request_pool[ctrl_info->scsi_ml_can_queue + i]; + if (atomic_inc_return(&io_request->refcount) == 1) + break; + atomic_dec(&io_request->refcount); + i = (i + 1) % PQI_RESERVED_IO_SLOTS; + } + } + + return io_request; +} +#endif diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h new file mode 100644 index 0000000000000..016cfeaf48237 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h @@ -0,0 +1,825 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +/* needed for struct definitions */ +#include + +#if !defined(_SMARTPQI_KERNEL_COMPAT_H) +#define _SMARTPQI_KERNEL_COMPAT_H + +/* #define RHEL6 */ +/* #define RHEL7 */ +/* default is kernel.org */ + +/* ----- RHEL6 variants --------- */ +#if \ + defined(RHEL6U0) || \ + defined(RHEL6U1) || \ + defined(RHEL6U2) || \ + defined(RHEL6U3) || \ + defined(RHEL6U4) || \ + defined(RHEL6U5) || \ + defined(RHEL6U6) || \ + defined(RHEL6U7) || \ + defined(RHEL6U8) || \ + defined(RHEL6U9) || \ + defined(RHEL6U10) +#define RHEL6 +#endif + +/* ----- RHEL7 variants --------- */ +#if \ + defined(RHEL7U0) || \ + defined(RHEL7U1) || \ + defined(RHEL7U2) || \ + defined(RHEL7U3) || \ + defined(RHEL7U4) || \ + defined(RHEL7U4ARM) || \ + defined(RHEL7U5) || \ + defined(RHEL7U5ARM) || \ + defined(RHEL7U6) || \ + defined(RHEL7U7) || \ + defined(RHEL7U8) || \ + defined(RHEL7U9) +#define RHEL7 +#endif + +/* ----- RHEL8 variants --------- */ +#if \ + defined(RHEL8U0) || \ + defined(RHEL8U1) || \ + defined(RHEL8U2) || \ + defined(RHEL8U3) || \ + defined(RHEL8U4) || \ + defined(RHEL8U5) || \ + defined(RHEL8U6) || \ + defined(RHEL8U7) +#define RHEL8 +#endif + +/* ----- RHEL9 variants --------- */ +#if \ + defined(RHEL9U0) || \ + defined(RHEL9U1) +#define RHEL9 +#endif + +/* ----- SLES11 variants --------- */ +#if \ + defined(SLES11SP0) || \ + defined(SLES11SP1) || \ + defined(SLES11SP2) || \ + defined(SLES11SP3) || \ + defined(SLES11SP4) +#define SLES11 +#endif + +/* ----- SLES12 variants --------- */ +#if \ + defined(SLES12SP0) || \ + defined(SLES12SP1) || \ + defined(SLES12SP2) || \ + defined(SLES12SP3) || \ + defined(SLES12SP4) || \ + defined(SLES12SP5) +#define SLES12 +#endif + +/* ----- SLES15 variants --------- */ +#if \ + defined(SLES15SP0) || \ + defined(SLES15SP1) || \ + defined(SLES15SP2) || \ + defined(SLES15SP3) || \ + defined(SLES15SP4) || \ + defined(SLES15SP5) +#define SLES15 +#endif + +/* ----- KCLASS5 variants --------- */ +#if \ + defined(KCLASS5A) || \ + defined(KCLASS5B) || \ + defined(KCLASS5C) || \ + defined(KCLASS5D) +#define KCLASS5 +#endif + +/* ----- KCLASS6 variants --------- */ +#if \ + defined(KCLASS6A) +#define KCLASS6 +#endif + +#include +#include +#include +#include + +#if defined(MSG_SIMPLE_TAG) +#define KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH 0 +#if !defined(RHEL7U3) +#define KFEATURE_HAS_MQ_SUPPORT 0 +#endif +#endif + +#if defined(CENTOS7ALTARM) +#define KFEATURE_HAS_MQ_SUPPORT 0 +#endif + +#if defined(XEN7) +#define KCLASS4A +#endif + +#if !defined(PCI_EXP_DEVCTL2_COMP_TIMEOUT) +#define PCI_EXP_DEVCTL2_COMP_TIMEOUT 0x000f +#if TORTUGA +#define KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT 1 +#else +#define KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT 0 +#endif +#endif + +#if defined(RHEL6) +#define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 0 +#define KFEATURE_HAS_2011_03_QUEUECOMMAND 0 +#define KFEATURE_HAS_NO_WRITE_SAME 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#if defined(RHEL6U3) || defined(RHEL6U4) || defined(RHEL6U5) +#if defined(RHEL6U3) +#define KFEATURE_HAS_DMA_ZALLOC_COHERENT 0 +#endif +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#endif +#if !defined(RHEL6U0) && !defined(RHEL6U1) +#define KFEATURE_HAS_LOCKLESS_DISPATCH_IO 1 +#endif +#if defined(RHEL6U5) +#define KFEATURE_HAS_DMA_MASK_AND_COHERENT 0 +#endif +#elif defined(RHEL7) +#if defined(RHEL7U0) +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#if defined(RHEL7U1) +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#if 
defined(RHEL7U4ARM) || defined(RHEL7U5ARM) +#endif +#elif defined(RHEL8) || defined(RHEL9) || defined(KCLASS5) || \ + defined(KCLASS6) || defined(OEULER2203) +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 +#define KFEATURE_HAS_MQ_SUPPORT 1 +#define shost_use_blk_mq(x) 1 +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 1 +#elif defined(SLES11) +#define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 0 +#define KFEATURE_HAS_NO_WRITE_SAME 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#if defined(SLES11SP0) || defined(SLES11SP1) +#define KFEATURE_HAS_2011_03_QUEUECOMMAND 0 +#endif +#if defined(SLES11SP3) +#define KFEATURE_HAS_DMA_ZALLOC_COHERENT 0 +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#endif +#elif defined(SLES12) +#if defined(SLES12SP2) || defined(SLES12SP3) +#define KFEATURE_HAS_KTIME_SECONDS 1 +#endif +#if defined(SLES12SP0) +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#if defined(SLES12SP1) +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#elif defined(SLES15) +#define KFEATURE_HAS_SCSI_REQUEST 1 +#define KFEATURE_HAS_KTIME_SECONDS 1 +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 +#define KFEATURE_HAS_MQ_SUPPORT 1 +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 +#if defined(SLES15SP0) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 +#elif defined(SLES15SP1) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 +#else +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 1 +#endif +#elif defined(OEULER2003) +#define dma_zalloc_coherent dma_alloc_coherent +#define KFEATURE_HAS_KTIME_SECONDS 1 +#define KFEATURE_HAS_SCSI_REQUEST 1 +#define KFEATURE_HAS_KTIME64 1 +#define KFEATURE_HAS_BSG_JOB_SMP_HANDLER 1 +#define KFEATURE_HAS_USE_CLUSTERING 0 +#define KFEATURE_HAS_NCQ_PRIO_SUPPORT 1 +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 +#define KFEATURE_HAS_MQ_SUPPORT 1 +#define shost_use_blk_mq(x) 1 +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 +#elif defined(UBUNTU1404) || TORTUGA || defined(KCLASS3C) +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#elif defined(OL7U2) || defined(KCLASS3B) +#define KFEATURE_HAS_DMA_MASK_AND_COHERENT 0 +#define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#if defined(KCLASS4A) +#define KFEATURE_HAS_KTIME_SECONDS 1 +#endif +#if defined(KCLASS4B) || defined(KCLASS4C) || defined(SLES12SP4) || \ + defined(SLES12SP5) || defined(RHEL8) || defined(KCLASS5A) || \ + defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ + defined(SLES15SP2) || defined(SLES15SP3) || defined(SLES15SP4) || \ + defined(SLES15SP5) || \ + defined(RHEL9) || defined (CENTOS7ALTARM) || defined(OEULER2203) || \ + defined(KCLASS6) || defined(K10SP2) +#define KFEATURE_HAS_KTIME_SECONDS 1 +#define KFEATURE_HAS_SCSI_REQUEST 1 +#define KFEATURE_HAS_KTIME64 1 +#endif +#if defined(KCLASS4C) || defined(RHEL8) || defined(SLES15SP1) || \ + defined(SLES15SP2) || defined(SLES15SP3) || defined(SLES15SP4) || \ + defined(SLES15SP5) || \ + defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ + defined(KCLASS5D) || defined(SLES12SP5) || defined (CENTOS7ALTARM) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) || \ + defined(K10SP2) +#define 
KFEATURE_HAS_BSG_JOB_SMP_HANDLER 1 +#endif +#if defined(RHEL8U3) || defined(RHEL8U4) || defined(RHEL8U5) || \ + defined(RHEL8U6) || defined(RHEL8U7) +#define KFEATURE_HAS_HOST_BUSY_FUNCTION 1 +#endif + +#if defined(KCLASS3D) +#define KFEATURE_HAS_KTIME_SECONDS 1 +#endif +#if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ + defined(KCLASS5D) || defined(KCLASS4D) || defined(SLES15SP2) || \ + defined(SLES15SP3) || defined(SLES15SP4) || defined(SLES15SP5) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) || \ + defined(K10SP2) +#define dma_zalloc_coherent dma_alloc_coherent +#define shost_use_blk_mq(x) 1 +#define KFEATURE_HAS_USE_CLUSTERING 0 +#endif + +#if defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ + defined(KCLASS4D) || defined(SLES15SP2) || defined(SLES15SP3) || \ + defined(SLES15SP4) || defined(SLES15SP5) || \ + defined(RHEL9) || defined(OEULER2003) || \ + defined(OEULER2203) || defined(KCLASS6) || defined(K10SP2) +#define IOCTL_INT unsigned int +#else +#define IOCTL_INT int +#endif + +#if defined(KCLASS5C) || defined(KCLASS5D) || defined(SLES15SP4) || \ + defined(SLES15SP5) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) +#define KFEATURE_HAS_HOST_BUSY_FUNCTION 1 +#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) +#define ioremap_nocache ioremap +#endif + +#if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ + defined(KCLASS5D) || defined(KCLASS4C) || defined(KCLASS4D) || \ + defined(RHEL8) || defined(SLES15) || defined(SLES15SP4) || \ + defined(SLES15SP5) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) || \ + defined(K10SP2) +#define KFEATURE_HAS_NCQ_PRIO_SUPPORT 1 +#endif + +#define KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING 0 + +#if !defined(from_timer) +#define KFEATURE_HAS_OLD_TIMER 1 +#endif + +/* default values */ +#if !defined(KFEATURE_HAS_WAIT_FOR_COMPLETION_IO) +#define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 1 +#endif +#if !defined(KFEATURE_HAS_2011_03_QUEUECOMMAND) +#define KFEATURE_HAS_2011_03_QUEUECOMMAND 1 +#endif +#if !defined(KFEATURE_HAS_DMA_ZALLOC_COHERENT) +#define KFEATURE_HAS_DMA_ZALLOC_COHERENT 1 +#endif +#if !defined(KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE) +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 1 +#endif +#if !defined(KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH) +#define KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH 1 +#endif +#if !defined(KFEATURE_HAS_MQ_SUPPORT) +#define KFEATURE_HAS_MQ_SUPPORT 1 +#endif +#if !defined(KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING) +#define KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING 1 +#endif +#if !defined(KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT) +#define KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT 1 +#endif +#if !defined(KFEATURE_HAS_NO_WRITE_SAME) +#define KFEATURE_HAS_NO_WRITE_SAME 1 +#endif +#if !defined(KFEATURE_HAS_BSG_JOB_SMP_HANDLER) +#define KFEATURE_HAS_BSG_JOB_SMP_HANDLER 0 +#endif +#if !defined(KFEATURE_HAS_HOST_BUSY_FUNCTION) +#define KFEATURE_HAS_HOST_BUSY_FUNCTION 0 +#endif +#if !defined(KFEATURE_HAS_SCSI_REQUEST) +#define KFEATURE_HAS_SCSI_REQUEST 0 +#endif +#if !defined(KFEATURE_HAS_LOCKLESS_DISPATCH_IO) +#define KFEATURE_HAS_LOCKLESS_DISPATCH_IO 0 +#endif +#if !defined(KFEATURE_HAS_USE_CLUSTERING) +#define KFEATURE_HAS_USE_CLUSTERING 1 +#define IOCTL_INT int +#endif +#if !defined(KFEATURE_HAS_OLD_TIMER) +#define KFEATURE_HAS_OLD_TIMER 0 +#endif +#if !defined(KFEATURE_HAS_KTIME_SECONDS) +#define KFEATURE_HAS_KTIME_SECONDS 0 +#endif +#if !defined(KFEATURE_HAS_KTIME64) +#define KFEATURE_HAS_KTIME64 0 +#endif +#if 
!defined(KFEATURE_HAS_DMA_MASK_AND_COHERENT) +#define KFEATURE_HAS_DMA_MASK_AND_COHERENT 1 +#endif +#if !defined(KFEATURE_HAS_ATOMIC_HOST_BUSY) +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 1 +#endif +#if !defined(KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS) +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 0 +#endif +#if !defined(KFEATURE_ENABLE_SCSI_MAP_QUEUES) +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 0 +#endif +#if !defined(KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 0 +#endif +#if !defined(KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 0 +#endif +#if !defined(KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 0 +#endif +#if !defined(KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1) +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 0 +#endif +#if !defined(KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2) +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 0 +#endif +#if !defined(KFEATURE_HAS_NCQ_PRIO_SUPPORT) +#define KFEATURE_HAS_NCQ_PRIO_SUPPORT 0 +#endif +#if !defined(KFEATURE_HAS_GLOBAL_SCSI_DONE) +#define KFEATURE_HAS_GLOBAL_SCSI_DONE 0 +#endif +#if !defined(KFEATURE_HAS_HOST_TAGSET_SUPPORT) +#define KFEATURE_HAS_HOST_TAGSET_SUPPORT 0 +#endif +/* Check for change in host device attributes are defined */ +#if !defined(KFEATURE_HAS_SDEV_GROUPS) +#define KFEATURE_HAS_SDEV_GROUPS 0 +# define PQI_DEVICE_ATTRIBUTE device_attribute +# define PQI_ATTR +# define PQI_ATTRIBUTE_GROUPS(x) +# define PQI_ATTRIBUTE(x) (x) +# define PQI_SDEV_ATTRS \ + .sdev_attrs = pqi_sdev_attrs +# define PQI_SHOST_ATTRS \ + .shost_attrs = pqi_shost_attrs +/* Newer device attribute groups defined */ +#else +# define PQI_DEVICE_ATTRIBUTE attribute +# define PQI_ATTRIBUTE_GROUPS(x) \ + ATTRIBUTE_GROUPS(x); +# define PQI_ATTRIBUTE(x) (x.attr) +# define PQI_SDEV_ATTRS \ + .sdev_groups = pqi_sdev_groups +# define PQI_SHOST_ATTRS \ + .shost_groups = pqi_shost_groups +#endif + +#if !defined(KFEATURE_HAS_SCSI_CMD_TO_RQ) +#define KFEATURE_HAS_SCSI_CMD_TO_RQ 0 +# define PQI_SCSI_REQUEST(x) \ + x->request +#else +# define PQI_SCSI_REQUEST(x) \ + scsi_cmd_to_rq(x) +#endif + +#if !defined(KFEATURE_HAS_SCSI_CMD_PRIV) +#define KFEATURE_HAS_SCSI_CMD_PRIV 0 +# define PQI_CMD_PRIV +# define PQI_SCSI_CMD_RESIDUAL(scmd) \ + (scmd->SCp.this_residual) +#else +# define PQI_CMD_PRIV \ + .cmd_size = sizeof(struct pqi_cmd_priv), + struct pqi_cmd_priv { + int this_residual; + }; + struct pqi_cmd_priv *pqi_cmd_priv(struct scsi_cmnd *cmd); +# define PQI_SCSI_CMD_RESIDUAL(scmd) \ + pqi_cmd_priv(scmd)->this_residual +#endif + +#if !defined(list_next_entry) +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) +#endif + +#if !defined(list_first_entry_or_null) +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? 
list_first_entry(ptr, type, member) : NULL) +#endif + +#if !defined(TYPE_ZBC) +#define TYPE_ZBC 0x14 +#endif + +#if !defined(readq) +#define readq readq +static inline u64 readq(const volatile void __iomem *addr) +{ + u32 lower32; + u32 upper32; + + lower32 = readl(addr); + upper32 = readl(addr + 4); + + return ((u64)upper32 << 32) | lower32; +} +#endif + +#if !defined(writeq) +#define writeq writeq +static inline void writeq(u64 value, volatile void __iomem *addr) +{ + u32 lower32; + u32 upper32; + + lower32 = lower_32_bits(value); + upper32 = upper_32_bits(value); + + writel(lower32, addr); + writel(upper32, addr + 4); +} +#endif + +static inline void pqi_disable_write_same(struct scsi_device *sdev) +{ +#if KFEATURE_HAS_NO_WRITE_SAME + sdev->no_write_same = 1; +#endif +} + +#if !defined(PCI_DEVICE_SUB) +#define PCI_DEVICE_SUB(vend, dev, subvend, subdev) \ + .vendor = (vend), .device = (dev), \ + .subvendor = (subvend), .subdevice = (subdev) +#endif + +#if !defined(PCI_VENDOR_ID_HPE) +#define PCI_VENDOR_ID_HPE 0x1590 +#endif + +#if !defined(PCI_VENDOR_ID_ADVANTECH) +#define PCI_VENDOR_ID_ADVANTECH 0x13fe +#endif + +#if !defined(PCI_VENDOR_ID_FIBERHOME) +#define PCI_VENDOR_ID_FIBERHOME 0x1d8d +#endif + +#if !defined(PCI_VENDOR_ID_GIGABYTE) +#define PCI_VENDOR_ID_GIGABYTE 0x1458 +#endif + +#if !defined(PCI_VENDOR_ID_FOXCONN) +#define PCI_VENDOR_ID_FOXCONN 0x105b +#endif + +#if !defined(PCI_VENDOR_ID_HUAWEI) +#define PCI_VENDOR_ID_HUAWEI 0x19e5 +#endif + +#if !defined(PCI_VENDOR_ID_H3C) +#define PCI_VENDOR_ID_H3C 0x193d +#endif + +#if !defined(PCI_VENDOR_ID_QUANTA) +#define PCI_VENDOR_ID_QUANTA 0x152d +#endif + +#if !defined(PCI_VENDOR_ID_INSPUR) +#define PCI_VENDOR_ID_INSPUR 0x1bd4 +#endif + +#if !defined(PCI_VENDOR_ID_NTCOM) +#define PCI_VENDOR_ID_NTCOM 0x1dfc +#endif + +#if !defined(PCI_VENDOR_ID_NT) +#define PCI_VENDOR_ID_NT 0x1f0c +#endif + +#if !defined(PCI_VENDOR_ID_ZTE) +#define PCI_VENDOR_ID_ZTE 0x1cf2 +#endif + +#if !defined(PCI_VENDOR_ID_RAMAXEL) +#define PCI_VENDOR_ID_RAMAXEL 0x1cc4 +#endif + +#if !defined(PCI_VENDOR_ID_LENOVO) +#define PCI_VENDOR_ID_LENOVO 0x1d49 +#endif + +#if !defined(PCI_VENDOR_ID_IBM) +#define PCI_VENDOR_ID_IBM 0x1014 +#endif + +#if !defined(offsetofend) +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) +#endif + +void pqi_compat_init_scsi_host_template(struct scsi_host_template *template); +void pqi_compat_init_scsi_host(struct Scsi_Host *shost, + struct pqi_ctrl_info *ctrl_info); + +#if !KFEATURE_HAS_WAIT_FOR_COMPLETION_IO + +static inline unsigned long wait_for_completion_io_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_completion_timeout(x, timeout); +} + +static inline unsigned long wait_for_completion_io(struct completion *x) +{ + wait_for_completion(x); + return 0; +} + +#endif /* !KFEATURE_HAS_WAIT_FOR_COMPLETION_IO */ + +#if KFEATURE_HAS_2011_03_QUEUECOMMAND + +#define PQI_SCSI_QUEUE_COMMAND pqi_scsi_queue_command + +static inline void pqi_scsi_done(struct scsi_cmnd *scmd) +{ + pqi_prep_for_scsi_done(scmd); +#if !KFEATURE_HAS_GLOBAL_SCSI_DONE + if (scmd && scmd->scsi_done) + scmd->scsi_done(scmd); +#else + if (scmd) + scsi_done(scmd); +#endif +} + +#else + +int pqi_scsi_queue_command_compat(struct scsi_cmnd *scmd, + void (*done)(struct scsi_cmnd *)); + +#define PQI_SCSI_QUEUE_COMMAND pqi_scsi_queue_command_compat + +static inline void pqi_scsi_done(struct scsi_cmnd *scmd) +{ + void (*scsi_done)(struct scsi_cmnd *); + + pqi_prep_for_scsi_done(scmd); + if (scmd) { + scsi_done = 
(void(*)(struct scsi_cmnd *))scmd->SCp.ptr; + scsi_done(scmd); + } +} + +#endif /* KFEATURE_HAS_2011_03_QUEUECOMMAND */ + +#if !KFEATURE_HAS_DMA_ZALLOC_COHERENT + +static inline void *dma_zalloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = dma_alloc_coherent(dev, size, dma_handle, + flag | __GFP_ZERO); + return ret; +} + +#endif /* !KFEATURE_HAS_DMA_ZALLOC_COHERENT */ + +#if !KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE + +int pci_enable_msix_range(struct pci_dev *pci_dev, struct msix_entry *entries, + int minvec, int maxvec); + +#endif /* !KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE */ + +#if !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH + +int scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); + +#endif /* !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH */ + +#if !KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING + +void scsi_sanitize_inquiry_string(unsigned char *s, int len); + +#endif /* !KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING */ + +#if !KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT + +#define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ + +int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos, + u16 clear, u16 set); + +#endif /* !KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT */ + +static inline u16 pqi_get_hw_queue(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd) +{ + u16 hw_queue; + +#if KFEATURE_HAS_MQ_SUPPORT + if (shost_use_blk_mq(scmd->device->host)) + hw_queue = blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(PQI_SCSI_REQUEST(scmd))); + else + hw_queue = smp_processor_id() % ctrl_info->num_queue_groups; +#else + hw_queue = smp_processor_id() % ctrl_info->num_queue_groups; +#endif + + return hw_queue; +} + +#ifdef KFEATURE_NEEDS_BLK_RQ_IS_PASSTHROUGH + +static inline bool blk_rq_is_passthrough(struct request *rq) +{ + return rq->cmd_type != REQ_TYPE_FS; +} + +#endif /* KFEATURE_NEEDS_BLK_RQ_IS_PASSTHROUGH */ + +#if !KFEATURE_HAS_BSG_JOB_SMP_HANDLER + +int pqi_sas_smp_handler_compat(struct Scsi_Host *shost, struct sas_rphy *rphy, + struct request *req); + +void pqi_bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len); + +#define PQI_SAS_SMP_HANDLER pqi_sas_smp_handler_compat + +#else + +#define PQI_SAS_SMP_HANDLER pqi_sas_smp_handler + +static inline void pqi_bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + bsg_job_done(job, result, reply_payload_rcv_len); +} + +#endif /* !KFEATURE_HAS_BSG_JOB_SMP_HANDLER */ + +#if KFEATURE_HAS_OLD_TIMER +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + +#if !defined(TIMER_DATA_TYPE) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) +#endif + +static inline void timer_setup (struct timer_list *timer, + void (*func) (struct timer_list *), unsigned long data) +{ + init_timer(timer); + timer->function = (TIMER_FUNC_TYPE) func; + timer->data = (unsigned long) timer; +} +#endif /* KFEATURE_HAS_OLD_TIMER */ + +#if !KFEATURE_HAS_KTIME64 +#define time64_to_tm time_to_tm +#endif + +#if !KFEATURE_HAS_KTIME_SECONDS +static inline unsigned long ktime_get_real_seconds(void) +{ + ktime_t tv; + struct timeval time; + + tv = ktime_get_real(); + time = ktime_to_timeval(tv); + + return time.tv_sec; +} +#endif + +#if !KFEATURE_HAS_DMA_MASK_AND_COHERENT + +static inline int pqi_dma_set_mask_and_coherent(struct device *device, u64 mask) +{ + return dma_set_mask(device, mask); +} + +#else + +static inline int pqi_dma_set_mask_and_coherent(struct device *device, 
u64 mask) +{ + return dma_set_mask_and_coherent(device, mask); +} + +#endif /* !KFEATURE_HAS_DMA_MASK_AND_COHERENT */ + +static inline bool pqi_scsi_host_busy(struct Scsi_Host *shost) +{ +#if KFEATURE_HAS_HOST_BUSY_FUNCTION + return scsi_host_busy(shost); +#else +#if KFEATURE_HAS_ATOMIC_HOST_BUSY + return atomic_read(&shost->host_busy) > 0; +#else + return shost->host_busy > 0; +#endif +#endif +} + +#if !KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS +#if !defined(PCI_IRQ_MSIX) +#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ +#endif +#if !defined(PCI_IRQ_AFFINITY) +#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ +#endif +#endif + +int pqi_pci_irq_vector(struct pci_dev *dev, unsigned int nr); +int pqi_pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); +void pqi_pci_free_irq_vectors(struct pci_dev *dev); +struct pqi_io_request *pqi_get_io_request(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd); + +static inline void *pqi_get_irq_cookie(struct pqi_ctrl_info *ctrl_info, unsigned int nr) +{ +#if KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS + return &ctrl_info->queue_groups[nr]; +#else + return ctrl_info->intr_data[nr]; +#endif +} + +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT +#define PQI_SET_HOST_TAGSET(s) +#else +#define PQI_SET_HOST_TAGSET(s) \ + s->host_tagset = 1; +#endif + +#endif /* _SMARTPQI_KERNEL_COMPAT_H */ diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c b/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c new file mode 100644 index 0000000000000..54dd32170eb61 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c @@ -0,0 +1,585 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#include +#include +#include +#include +#include +#include +#include "smartpqi.h" +#include "smartpqi_kernel_compat.h" + +static struct pqi_sas_phy *pqi_alloc_sas_phy(struct pqi_sas_port *pqi_sas_port) +{ + struct pqi_sas_phy *pqi_sas_phy; + struct sas_phy *phy; + + pqi_sas_phy = kzalloc(sizeof(*pqi_sas_phy), GFP_KERNEL); + if (!pqi_sas_phy) + return NULL; + + phy = sas_phy_alloc(pqi_sas_port->parent_node->parent_dev, + pqi_sas_port->next_phy_index); + if (!phy) { + kfree(pqi_sas_phy); + return NULL; + } + + pqi_sas_port->next_phy_index++; + pqi_sas_phy->phy = phy; + pqi_sas_phy->parent_port = pqi_sas_port; + + return pqi_sas_phy; +} + +static void pqi_free_sas_phy(struct pqi_sas_phy *pqi_sas_phy) +{ + struct sas_phy *phy = pqi_sas_phy->phy; + + sas_port_delete_phy(pqi_sas_phy->parent_port->port, phy); + if (pqi_sas_phy->added_to_port) + list_del(&pqi_sas_phy->phy_list_entry); + sas_phy_delete(phy); + kfree(pqi_sas_phy); +} + +static int pqi_sas_port_add_phy(struct pqi_sas_phy *pqi_sas_phy) +{ + int rc; + struct pqi_sas_port *pqi_sas_port; + struct sas_phy *phy; + struct sas_identify *identify; + + pqi_sas_port = pqi_sas_phy->parent_port; + phy = pqi_sas_phy->phy; + + identify = &phy->identify; + memset(identify, 0, sizeof(*identify)); + identify->sas_address = pqi_sas_port->sas_address; + identify->device_type = SAS_END_DEVICE; + identify->initiator_port_protocols = SAS_PROTOCOL_ALL; + identify->target_port_protocols = SAS_PROTOCOL_ALL; + phy->minimum_linkrate_hw = SAS_LINK_RATE_UNKNOWN; + phy->maximum_linkrate_hw = SAS_LINK_RATE_UNKNOWN; + phy->minimum_linkrate = SAS_LINK_RATE_UNKNOWN; + phy->maximum_linkrate = SAS_LINK_RATE_UNKNOWN; + phy->negotiated_linkrate = SAS_LINK_RATE_UNKNOWN; + + rc = sas_phy_add(pqi_sas_phy->phy); + if (rc) + return rc; + + sas_port_add_phy(pqi_sas_port->port, pqi_sas_phy->phy); + list_add_tail(&pqi_sas_phy->phy_list_entry, + &pqi_sas_port->phy_list_head); + pqi_sas_phy->added_to_port = true; + + return 0; +} + +static int pqi_sas_port_add_rphy(struct pqi_sas_port *pqi_sas_port, + struct sas_rphy *rphy) +{ + struct sas_identify *identify; + + identify = &rphy->identify; + identify->sas_address = pqi_sas_port->sas_address; + identify->phy_identifier = pqi_sas_port->device->phy_id; + + identify->initiator_port_protocols = SAS_PROTOCOL_ALL; + identify->target_port_protocols = SAS_PROTOCOL_STP; + + switch (pqi_sas_port->device->device_type) { + case SA_DEVICE_TYPE_SAS: + case SA_DEVICE_TYPE_SES: + case SA_DEVICE_TYPE_NVME: + identify->target_port_protocols = SAS_PROTOCOL_SSP; + break; + case SA_DEVICE_TYPE_EXPANDER_SMP: + identify->target_port_protocols = SAS_PROTOCOL_SMP; + break; + case SA_DEVICE_TYPE_SATA: + default: + break; + } + + return sas_rphy_add(rphy); +} + +static struct sas_rphy *pqi_sas_rphy_alloc(struct pqi_sas_port *pqi_sas_port) +{ + if (pqi_sas_port->device && pqi_sas_port->device->is_expander_smp_device) + return sas_expander_alloc(pqi_sas_port->port, + SAS_FANOUT_EXPANDER_DEVICE); + + return sas_end_device_alloc(pqi_sas_port->port); +} + +static struct pqi_sas_port *pqi_alloc_sas_port( + struct pqi_sas_node *pqi_sas_node, u64 sas_address, + struct pqi_scsi_dev *device) +{ + int rc; + struct pqi_sas_port *pqi_sas_port; + struct sas_port *port; + + pqi_sas_port = kzalloc(sizeof(*pqi_sas_port), GFP_KERNEL); + if (!pqi_sas_port) + return NULL; + + INIT_LIST_HEAD(&pqi_sas_port->phy_list_head); + pqi_sas_port->parent_node = pqi_sas_node; + + port = 
sas_port_alloc_num(pqi_sas_node->parent_dev); + if (!port) + goto free_pqi_port; + + rc = sas_port_add(port); + if (rc) + goto free_sas_port; + + pqi_sas_port->port = port; + pqi_sas_port->sas_address = sas_address; + pqi_sas_port->device = device; + list_add_tail(&pqi_sas_port->port_list_entry, + &pqi_sas_node->port_list_head); + + return pqi_sas_port; + +free_sas_port: + sas_port_free(port); +free_pqi_port: + kfree(pqi_sas_port); + + return NULL; +} + +static void pqi_free_sas_port(struct pqi_sas_port *pqi_sas_port) +{ + struct pqi_sas_phy *pqi_sas_phy; + struct pqi_sas_phy *next; + + list_for_each_entry_safe(pqi_sas_phy, next, + &pqi_sas_port->phy_list_head, phy_list_entry) + pqi_free_sas_phy(pqi_sas_phy); + + sas_port_delete(pqi_sas_port->port); + list_del(&pqi_sas_port->port_list_entry); + kfree(pqi_sas_port); +} + +static struct pqi_sas_node *pqi_alloc_sas_node(struct device *parent_dev) +{ + struct pqi_sas_node *pqi_sas_node; + + pqi_sas_node = kzalloc(sizeof(*pqi_sas_node), GFP_KERNEL); + if (pqi_sas_node) { + pqi_sas_node->parent_dev = parent_dev; + INIT_LIST_HEAD(&pqi_sas_node->port_list_head); + } + + return pqi_sas_node; +} + +static void pqi_free_sas_node(struct pqi_sas_node *pqi_sas_node) +{ + struct pqi_sas_port *pqi_sas_port; + struct pqi_sas_port *next; + + if (!pqi_sas_node) + return; + + list_for_each_entry_safe(pqi_sas_port, next, + &pqi_sas_node->port_list_head, port_list_entry) + pqi_free_sas_port(pqi_sas_port); + + kfree(pqi_sas_node); +} + +struct pqi_scsi_dev *pqi_find_device_by_sas_rphy( + struct pqi_ctrl_info *ctrl_info, struct sas_rphy *rphy) +{ + struct pqi_scsi_dev *device; + + list_for_each_entry(device, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (!device->sas_port) + continue; + if (device->sas_port->rphy == rphy) + return device; + } + + return NULL; +} + +int pqi_add_sas_host(struct Scsi_Host *shost, struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct device *parent_dev; + struct pqi_sas_node *pqi_sas_node; + struct pqi_sas_port *pqi_sas_port; + struct pqi_sas_phy *pqi_sas_phy; + + parent_dev = &shost->shost_dev; + + pqi_sas_node = pqi_alloc_sas_node(parent_dev); + if (!pqi_sas_node) + return -ENOMEM; + + pqi_sas_port = pqi_alloc_sas_port(pqi_sas_node, + ctrl_info->sas_address, NULL); + if (!pqi_sas_port) { + rc = -ENODEV; + goto free_sas_node; + } + + pqi_sas_phy = pqi_alloc_sas_phy(pqi_sas_port); + if (!pqi_sas_phy) { + rc = -ENODEV; + goto free_sas_port; + } + + rc = pqi_sas_port_add_phy(pqi_sas_phy); + if (rc) + goto free_sas_phy; + + ctrl_info->sas_host = pqi_sas_node; + + return 0; + +free_sas_phy: + pqi_free_sas_phy(pqi_sas_phy); +free_sas_port: + pqi_free_sas_port(pqi_sas_port); +free_sas_node: + pqi_free_sas_node(pqi_sas_node); + + return rc; +} + +void pqi_delete_sas_host(struct pqi_ctrl_info *ctrl_info) +{ + pqi_free_sas_node(ctrl_info->sas_host); +} + +int pqi_add_sas_device(struct pqi_sas_node *pqi_sas_node, + struct pqi_scsi_dev *device) +{ + int rc; + struct pqi_sas_port *pqi_sas_port; + struct sas_rphy *rphy; + + pqi_sas_port = pqi_alloc_sas_port(pqi_sas_node, + device->sas_address, device); + if (!pqi_sas_port) + return -ENOMEM; + + rphy = pqi_sas_rphy_alloc(pqi_sas_port); + if (!rphy) { + rc = -ENODEV; + goto free_sas_port; + } + + pqi_sas_port->rphy = rphy; + device->sas_port = pqi_sas_port; + + rc = pqi_sas_port_add_rphy(pqi_sas_port, rphy); + if (rc) + goto free_sas_rphy; + + return 0; + +free_sas_rphy: + sas_rphy_free(rphy); +free_sas_port: + pqi_free_sas_port(pqi_sas_port); + device->sas_port = NULL; + + return 
rc; +} + +void pqi_remove_sas_device(struct pqi_scsi_dev *device) +{ + if (device->sas_port) { + pqi_free_sas_port(device->sas_port); + device->sas_port = NULL; + } +} + +static int pqi_sas_get_linkerrors(struct sas_phy *phy) +{ + return 0; +} + +static int pqi_sas_get_enclosure_identifier(struct sas_rphy *rphy, + u64 *identifier) +{ + int rc; + unsigned long flags; + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *found_device; + struct pqi_scsi_dev *device; + + if (!rphy) + return -ENODEV; + + shost = rphy_to_shost(rphy); + ctrl_info = shost_to_hba(shost); + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + found_device = pqi_find_device_by_sas_rphy(ctrl_info, rphy); + + if (!found_device) { + rc = -ENODEV; + goto out; + } + + if (found_device->devtype == TYPE_ENCLOSURE) { + *identifier = get_unaligned_be64(&found_device->wwid[8]); + rc = 0; + goto out; + } + + if (found_device->box_index == 0xff || + found_device->phys_box_on_bus == 0 || + found_device->bay == 0xff) { + rc = -EINVAL; + goto out; + } + + list_for_each_entry(device, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (device->devtype == TYPE_ENCLOSURE && + device->box_index == found_device->box_index && + device->phys_box_on_bus == + found_device->phys_box_on_bus && + memcmp(device->phys_connector, + found_device->phys_connector, 2) == 0) { + *identifier = + get_unaligned_be64(&device->wwid[8]); + rc = 0; + goto out; + } + } + + if (found_device->phy_connected_dev_type != SA_DEVICE_TYPE_CONTROLLER) { + rc = -EINVAL; + goto out; + } + + list_for_each_entry(device, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (device->devtype == TYPE_ENCLOSURE && + CISS_GET_DRIVE_NUMBER(device->scsi3addr) == + PQI_VSEP_CISS_BTL) { + *identifier = get_unaligned_be64(&device->wwid[8]); + rc = 0; + goto out; + } + } + + rc = -EINVAL; +out: + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return rc; +} + +static int pqi_sas_get_bay_identifier(struct sas_rphy *rphy) +{ + int rc; + unsigned long flags; + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + struct Scsi_Host *shost; + + if (!rphy) + return -ENODEV; + + shost = rphy_to_shost(rphy); + ctrl_info = shost_to_hba(shost); + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + device = pqi_find_device_by_sas_rphy(ctrl_info, rphy); + + if (!device) { + rc = -ENODEV; + goto out; + } + + if (device->bay == 0xff) + rc = -EINVAL; + else + rc = device->bay; + +out: + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return rc; +} + +static int pqi_sas_phy_reset(struct sas_phy *phy, int hard_reset) +{ + return 0; +} + +static int pqi_sas_phy_enable(struct sas_phy *phy, int enable) +{ + return 0; +} + +static int pqi_sas_phy_setup(struct sas_phy *phy) +{ + return 0; +} + +static void pqi_sas_phy_release(struct sas_phy *phy) +{ +} + +static int pqi_sas_phy_speed(struct sas_phy *phy, + struct sas_phy_linkrates *rates) +{ + return -EINVAL; +} + +#define CSMI_IOCTL_TIMEOUT 60 +#define SMP_CRC_FIELD_LENGTH 4 + +static struct bmic_csmi_smp_passthru_buffer * +pqi_build_csmi_smp_passthru_buffer(struct sas_rphy *rphy, + struct bsg_job *job) +{ + struct bmic_csmi_smp_passthru_buffer *smp_buf; + struct bmic_csmi_ioctl_header *ioctl_header; + struct bmic_csmi_smp_passthru *parameters; + u32 req_size; + u32 resp_size; + + smp_buf = kzalloc(sizeof(*smp_buf), GFP_KERNEL); + if (!smp_buf) + return NULL; + + req_size = job->request_payload.payload_len; + resp_size = 
job->reply_payload.payload_len; + + ioctl_header = &smp_buf->ioctl_header; + put_unaligned_le32(sizeof(smp_buf->ioctl_header), + &ioctl_header->header_length); + put_unaligned_le32(CSMI_IOCTL_TIMEOUT, &ioctl_header->timeout); + put_unaligned_le32(CSMI_CC_SAS_SMP_PASSTHRU, + &ioctl_header->control_code); + put_unaligned_le32(sizeof(smp_buf->parameters), &ioctl_header->length); + + parameters = &smp_buf->parameters; + parameters->phy_identifier = rphy->identify.phy_identifier; + parameters->port_identifier = 0; + parameters->connection_rate = 0; + put_unaligned_be64(rphy->identify.sas_address, + &parameters->destination_sas_address); + + if (req_size > SMP_CRC_FIELD_LENGTH) + req_size -= SMP_CRC_FIELD_LENGTH; + + put_unaligned_le32(req_size, &parameters->request_length); + put_unaligned_le32(resp_size, &parameters->response_length); + + sg_copy_to_buffer(job->request_payload.sg_list, + job->reply_payload.sg_cnt, &parameters->request, + req_size); + + return smp_buf; +} + +static unsigned int pqi_build_sas_smp_handler_reply( + struct bmic_csmi_smp_passthru_buffer *smp_buf, struct bsg_job *job, + struct pqi_raid_error_info *error_info) +{ + sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, &smp_buf->parameters.response, + le32_to_cpu(smp_buf->parameters.response_length)); + + job->reply_len = le16_to_cpu(error_info->sense_data_length); + memcpy(job->reply, error_info->data, + le16_to_cpu(error_info->sense_data_length)); + + return job->reply_payload.payload_len - + get_unaligned_le32(&error_info->data_in_transferred); +} + +void pqi_sas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + struct bmic_csmi_smp_passthru_buffer *smp_buf; + struct pqi_raid_error_info error_info; + unsigned int reslen = 0; + + ctrl_info = shost_to_hba(shost); + + if (job->reply_payload.payload_len == 0) { + rc = -ENOMEM; + goto out; + } + + if (!rphy) { + rc = -EINVAL; + goto out; + } + + if (rphy->identify.device_type != SAS_FANOUT_EXPANDER_DEVICE) { + rc = -EINVAL; + goto out; + } + + if (job->request_payload.sg_cnt > 1 || job->reply_payload.sg_cnt > 1) { + rc = -EINVAL; + goto out; + } + + smp_buf = pqi_build_csmi_smp_passthru_buffer(rphy, job); + if (!smp_buf) { + rc = -ENOMEM; + goto out; + } + + rc = pqi_csmi_smp_passthru(ctrl_info, smp_buf, sizeof(*smp_buf), + &error_info); + if (rc) + goto out; + + reslen = pqi_build_sas_smp_handler_reply(smp_buf, job, &error_info); + +out: + pqi_bsg_job_done(job, rc, reslen); +} + +struct sas_function_template pqi_sas_transport_functions = { + .get_linkerrors = pqi_sas_get_linkerrors, + .get_enclosure_identifier = pqi_sas_get_enclosure_identifier, + .get_bay_identifier = pqi_sas_get_bay_identifier, + .phy_reset = pqi_sas_phy_reset, + .phy_enable = pqi_sas_phy_enable, + .phy_setup = pqi_sas_phy_setup, + .phy_release = pqi_sas_phy_release, + .set_phy_speed = pqi_sas_phy_speed, + .smp_handler = PQI_SAS_SMP_HANDLER, +}; + diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sis.c b/drivers/amazon/scsi/smartpqi/smartpqi_sis.c new file mode 100644 index 0000000000000..5381bfb39090e --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sis.c @@ -0,0 +1,516 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#include +#include +#include +#include +#include +#include +#include "smartpqi.h" +#include "smartpqi_sis.h" + +/* legacy SIS interface commands */ +#define SIS_CMD_GET_ADAPTER_PROPERTIES 0x19 +#define SIS_CMD_INIT_BASE_STRUCT_ADDRESS 0x1b +#define SIS_CMD_GET_PQI_CAPABILITIES 0x3000 + +/* for submission of legacy SIS commands */ +#define SIS_REENABLE_SIS_MODE 0x1 +#define SIS_ENABLE_MSIX 0x40 +#define SIS_ENABLE_INTX 0x80 +#define SIS_SOFT_RESET 0x100 +#define SIS_CMD_READY 0x200 +#define SIS_TRIGGER_SHUTDOWN 0x800000 +#define SIS_PQI_RESET_QUIESCE 0x1000000 + +#define SIS_CMD_COMPLETE 0x1000 +#define SIS_CLEAR_CTRL_TO_HOST_DOORBELL 0x1000 + +#define SIS_CMD_STATUS_SUCCESS 0x1 +#define SIS_CMD_COMPLETE_TIMEOUT_SECS 30 +#define SIS_CMD_COMPLETE_POLL_INTERVAL_MSECS 10 + +/* used with SIS_CMD_GET_ADAPTER_PROPERTIES command */ +#define SIS_EXTENDED_PROPERTIES_SUPPORTED 0x800000 +#define SIS_SMARTARRAY_FEATURES_SUPPORTED 0x2 +#define SIS_PQI_MODE_SUPPORTED 0x4 +#define SIS_PQI_RESET_QUIESCE_SUPPORTED 0x8 +#define SIS_REQUIRED_EXTENDED_PROPERTIES \ + (SIS_SMARTARRAY_FEATURES_SUPPORTED | SIS_PQI_MODE_SUPPORTED) + +/* used with SIS_CMD_INIT_BASE_STRUCT_ADDRESS command */ +#define SIS_BASE_STRUCT_REVISION 9 +#define SIS_BASE_STRUCT_ALIGNMENT 16 + +#define SIS_CTRL_KERNEL_FW_TRIAGE 0x3 +#define SIS_CTRL_KERNEL_UP 0x80 +#define SIS_CTRL_KERNEL_PANIC 0x100 +#if TORTUGA +#define SIS_CTRL_READY_TIMEOUT_SECS 150 +#else +#define SIS_CTRL_READY_TIMEOUT_SECS 180 +#endif +#define SIS_CTRL_READY_RESUME_TIMEOUT_SECS 90 +#define SIS_CTRL_READY_POLL_INTERVAL_MSECS 10 + +enum sis_fw_triage_status { + FW_TRIAGE_NOT_STARTED = 0, + FW_TRIAGE_STARTED, + FW_TRIAGE_COND_INVALID, + FW_TRIAGE_COMPLETED +}; + +#pragma pack(1) + +/* for use with SIS_CMD_INIT_BASE_STRUCT_ADDRESS command */ +struct sis_base_struct { + __le32 revision; /* revision of this structure */ + __le32 flags; /* reserved */ + __le32 error_buffer_paddr_low; /* lower 32 bits of physical memory */ + /* buffer for PQI error response */ + /* data */ + __le32 error_buffer_paddr_high; /* upper 32 bits of physical */ + /* memory buffer for PQI */ + /* error response data */ + __le32 error_buffer_element_length; /* length of each PQI error */ + /* response buffer element */ + /* in bytes */ + __le32 error_buffer_num_elements; /* total number of PQI error */ + /* response buffers available */ +}; + +#pragma pack() + +unsigned int sis_ctrl_ready_timeout_secs = SIS_CTRL_READY_TIMEOUT_SECS; + +static int sis_wait_for_ctrl_ready_with_timeout(struct pqi_ctrl_info *ctrl_info, + unsigned int timeout_secs) +{ + unsigned long timeout; + u32 status; + + timeout = (timeout_secs * HZ) + jiffies; + + while (1) { + status = readl(&ctrl_info->registers->sis_firmware_status); + if (status != ~0) { + if (status & SIS_CTRL_KERNEL_PANIC) { + dev_err(&ctrl_info->pci_dev->dev, + "controller is offline: status code 0x%x\n", + readl( + &ctrl_info->registers->sis_mailbox[7])); + return -ENODEV; + } + if (status & 
SIS_CTRL_KERNEL_UP) + break; + } + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "controller not ready after %u seconds\n", + timeout_secs); + return -ETIMEDOUT; + } + msleep(SIS_CTRL_READY_POLL_INTERVAL_MSECS); + } + + return 0; +} + +int sis_wait_for_ctrl_ready(struct pqi_ctrl_info *ctrl_info) +{ + return sis_wait_for_ctrl_ready_with_timeout(ctrl_info, + sis_ctrl_ready_timeout_secs); +} + +int sis_wait_for_ctrl_ready_resume(struct pqi_ctrl_info *ctrl_info) +{ + return sis_wait_for_ctrl_ready_with_timeout(ctrl_info, + SIS_CTRL_READY_RESUME_TIMEOUT_SECS); +} + +bool sis_is_firmware_running(struct pqi_ctrl_info *ctrl_info) +{ + bool running; + u32 status; + + status = readl(&ctrl_info->registers->sis_firmware_status); + + if (status != ~0 && (status & SIS_CTRL_KERNEL_PANIC)) + running = false; + else + running = true; + + if (!running) + dev_err(&ctrl_info->pci_dev->dev, + "controller is offline: status code 0x%x\n", + readl(&ctrl_info->registers->sis_mailbox[7])); + + return running; +} + +bool sis_is_kernel_up(struct pqi_ctrl_info *ctrl_info) +{ + return readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_UP; +} + +u32 sis_get_product_id(struct pqi_ctrl_info *ctrl_info) +{ + return readl(&ctrl_info->registers->sis_product_identifier); +} + +/* used for passing command parameters/results when issuing SIS commands */ +struct sis_sync_cmd_params { + u32 mailbox[6]; /* mailboxes 0-5 */ +}; + +static int sis_send_sync_cmd(struct pqi_ctrl_info *ctrl_info, + u32 cmd, struct sis_sync_cmd_params *params) +{ + struct pqi_ctrl_registers __iomem *registers; + unsigned int i; + unsigned long timeout; + u32 doorbell; + u32 cmd_status; + + registers = ctrl_info->registers; + + /* Write the command to mailbox 0. */ + writel(cmd, ®isters->sis_mailbox[0]); + + /* + * Write the command parameters to mailboxes 1-4 (mailbox 5 is not used + * when sending a command to the controller). + */ + for (i = 1; i <= 4; i++) + writel(params->mailbox[i], ®isters->sis_mailbox[i]); + + /* Clear the command doorbell. */ + writel(SIS_CLEAR_CTRL_TO_HOST_DOORBELL, + ®isters->sis_ctrl_to_host_doorbell_clear); + + /* Disable doorbell interrupts by masking all interrupts. */ + writel(~0, ®isters->sis_interrupt_mask); + usleep_range(1000, 2000); + + /* + * Force the completion of the interrupt mask register write before + * submitting the command. + */ + readl(®isters->sis_interrupt_mask); + + /* Submit the command to the controller. */ + writel(SIS_CMD_READY, ®isters->sis_host_to_ctrl_doorbell); + + /* + * Poll for command completion. Note that the call to msleep() is at + * the top of the loop in order to give the controller time to start + * processing the command before we start polling. + */ + timeout = (SIS_CMD_COMPLETE_TIMEOUT_SECS * HZ) + jiffies; + while (1) { + msleep(SIS_CMD_COMPLETE_POLL_INTERVAL_MSECS); + doorbell = readl(®isters->sis_ctrl_to_host_doorbell); + if (doorbell & SIS_CMD_COMPLETE) + break; + if (time_after(jiffies, timeout)) + return -ETIMEDOUT; + } + + /* Read the command status from mailbox 0. */ + cmd_status = readl(®isters->sis_mailbox[0]); + if (cmd_status != SIS_CMD_STATUS_SUCCESS) { + dev_err(&ctrl_info->pci_dev->dev, + "SIS command failed for command 0x%x: status = 0x%x\n", + cmd, cmd_status); + return -EINVAL; + } + + /* + * The command completed successfully, so save the command status and + * read the values returned in mailboxes 1-5. 
+ */ + params->mailbox[0] = cmd_status; + for (i = 1; i < ARRAY_SIZE(params->mailbox); i++) + params->mailbox[i] = readl(®isters->sis_mailbox[i]); + + return 0; +} + +/* + * This function verifies that we are talking to a controller that speaks PQI. + */ + +int sis_get_ctrl_properties(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + u32 properties; + u32 extended_properties; + struct sis_sync_cmd_params params; + + memset(¶ms, 0, sizeof(params)); + + rc = sis_send_sync_cmd(ctrl_info, SIS_CMD_GET_ADAPTER_PROPERTIES, + ¶ms); + if (rc) + return rc; + + properties = params.mailbox[1]; + + if (!(properties & SIS_EXTENDED_PROPERTIES_SUPPORTED)) + return -ENODEV; + + extended_properties = params.mailbox[4]; + + if ((extended_properties & SIS_REQUIRED_EXTENDED_PROPERTIES) != + SIS_REQUIRED_EXTENDED_PROPERTIES) + return -ENODEV; + + if (extended_properties & SIS_PQI_RESET_QUIESCE_SUPPORTED) + ctrl_info->pqi_reset_quiesce_supported = true; + + return 0; +} + +int sis_get_pqi_capabilities(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct sis_sync_cmd_params params; + + memset(¶ms, 0, sizeof(params)); + + rc = sis_send_sync_cmd(ctrl_info, SIS_CMD_GET_PQI_CAPABILITIES, + ¶ms); + if (rc) + return rc; + + ctrl_info->max_sg_entries = params.mailbox[1]; + ctrl_info->max_transfer_size = params.mailbox[2]; + ctrl_info->max_outstanding_requests = params.mailbox[3]; + ctrl_info->config_table_offset = params.mailbox[4]; + ctrl_info->config_table_length = params.mailbox[5]; + + return 0; +} + +int sis_init_base_struct_addr(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + void *base_struct_unaligned; + struct sis_base_struct *base_struct; + struct sis_sync_cmd_params params; + unsigned long error_buffer_paddr; + dma_addr_t bus_address; + + base_struct_unaligned = kzalloc(sizeof(*base_struct) + + SIS_BASE_STRUCT_ALIGNMENT - 1, GFP_KERNEL); + if (!base_struct_unaligned) + return -ENOMEM; + + base_struct = PTR_ALIGN(base_struct_unaligned, + SIS_BASE_STRUCT_ALIGNMENT); + error_buffer_paddr = (unsigned long)ctrl_info->error_buffer_dma_handle; + + put_unaligned_le32(SIS_BASE_STRUCT_REVISION, &base_struct->revision); + put_unaligned_le32(lower_32_bits(error_buffer_paddr), + &base_struct->error_buffer_paddr_low); + put_unaligned_le32(upper_32_bits(error_buffer_paddr), + &base_struct->error_buffer_paddr_high); + put_unaligned_le32(PQI_ERROR_BUFFER_ELEMENT_LENGTH, + &base_struct->error_buffer_element_length); + put_unaligned_le32(ctrl_info->max_io_slots, + &base_struct->error_buffer_num_elements); + + bus_address = dma_map_single(&ctrl_info->pci_dev->dev, base_struct, + sizeof(*base_struct), DMA_TO_DEVICE); + if (dma_mapping_error(&ctrl_info->pci_dev->dev, bus_address)) { + rc = -ENOMEM; + goto out; + } + + memset(¶ms, 0, sizeof(params)); + params.mailbox[1] = lower_32_bits((u64)bus_address); + params.mailbox[2] = upper_32_bits((u64)bus_address); + params.mailbox[3] = sizeof(*base_struct); + + rc = sis_send_sync_cmd(ctrl_info, SIS_CMD_INIT_BASE_STRUCT_ADDRESS, + ¶ms); + + dma_unmap_single(&ctrl_info->pci_dev->dev, bus_address, + sizeof(*base_struct), DMA_TO_DEVICE); +out: + kfree(base_struct_unaligned); + + return rc; +} + +#define SIS_DOORBELL_BIT_CLEAR_TIMEOUT_SECS 30 + +static int sis_wait_for_doorbell_bit_to_clear( + struct pqi_ctrl_info *ctrl_info, u32 bit) +{ + int rc = 0; + u32 doorbell_register; + unsigned long timeout; + + timeout = (SIS_DOORBELL_BIT_CLEAR_TIMEOUT_SECS * HZ) + jiffies; + + while (1) { + doorbell_register = + readl(&ctrl_info->registers->sis_host_to_ctrl_doorbell); + if ((doorbell_register & 
bit) == 0) + break; + if (readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_PANIC) { + rc = -ENODEV; + break; + } + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "doorbell register bit 0x%x not cleared\n", + bit); + rc = -ETIMEDOUT; + break; + } + msleep(1); + } + + return rc; +} + +static inline int sis_set_doorbell_bit(struct pqi_ctrl_info *ctrl_info, u32 bit) +{ + writel(bit, &ctrl_info->registers->sis_host_to_ctrl_doorbell); + usleep_range(1000, 2000); + + return sis_wait_for_doorbell_bit_to_clear(ctrl_info, bit); +} + +void sis_enable_msix(struct pqi_ctrl_info *ctrl_info) +{ + sis_set_doorbell_bit(ctrl_info, SIS_ENABLE_MSIX); +} + +void sis_enable_intx(struct pqi_ctrl_info *ctrl_info) +{ + sis_set_doorbell_bit(ctrl_info, SIS_ENABLE_INTX); +} + +void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason) +{ + if (readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_PANIC) + return; + + if (ctrl_info->firmware_triage_supported) + writel(ctrl_shutdown_reason, &ctrl_info->registers->sis_ctrl_shutdown_reason_code); + + writel(SIS_TRIGGER_SHUTDOWN, &ctrl_info->registers->sis_host_to_ctrl_doorbell); +} + +int sis_pqi_reset_quiesce(struct pqi_ctrl_info *ctrl_info) +{ + return sis_set_doorbell_bit(ctrl_info, SIS_PQI_RESET_QUIESCE); +} + +int sis_reenable_sis_mode(struct pqi_ctrl_info *ctrl_info) +{ + return sis_set_doorbell_bit(ctrl_info, SIS_REENABLE_SIS_MODE); +} + +void sis_write_driver_scratch(struct pqi_ctrl_info *ctrl_info, u32 value) +{ + writel(value, &ctrl_info->registers->sis_driver_scratch); + usleep_range(1000, 2000); +} + +u32 sis_read_driver_scratch(struct pqi_ctrl_info *ctrl_info) +{ + return readl(&ctrl_info->registers->sis_driver_scratch); +} + +static inline enum sis_fw_triage_status + sis_read_firmware_triage_status(struct pqi_ctrl_info *ctrl_info) +{ + return ((enum sis_fw_triage_status)(readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_FW_TRIAGE)); +} + +void sis_soft_reset(struct pqi_ctrl_info *ctrl_info) +{ + writel(SIS_SOFT_RESET, + &ctrl_info->registers->sis_host_to_ctrl_doorbell); +} + +#define SIS_FW_TRIAGE_STATUS_TIMEOUT_SECS 300 +#define SIS_FW_TRIAGE_STATUS_POLL_INTERVAL_SECS 1 + +int sis_wait_for_fw_triage_completion(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + enum sis_fw_triage_status status; + unsigned long timeout; + + timeout = (SIS_FW_TRIAGE_STATUS_TIMEOUT_SECS * HZ) + jiffies; + while (1) { + status = sis_read_firmware_triage_status(ctrl_info); + if (status == FW_TRIAGE_COND_INVALID) { + dev_err(&ctrl_info->pci_dev->dev, + "firmware triage condition invalid\n"); + rc = -EINVAL; + break; + } else if (status == FW_TRIAGE_NOT_STARTED || + status == FW_TRIAGE_COMPLETED) { + rc = 0; + break; + } + + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for firmware triage status\n"); + rc = -ETIMEDOUT; + break; + } + + ssleep(SIS_FW_TRIAGE_STATUS_POLL_INTERVAL_SECS); + } + + return rc; + +} + +void sis_verify_structures(void) +{ + BUILD_BUG_ON(offsetof(struct sis_base_struct, + revision) != 0x0); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + flags) != 0x4); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + error_buffer_paddr_low) != 0x8); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + error_buffer_paddr_high) != 0xc); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + error_buffer_element_length) != 0x10); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + error_buffer_num_elements) 
!= 0x14); + BUILD_BUG_ON(sizeof(struct sis_base_struct) != 0x18); +} + diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sis.h b/drivers/amazon/scsi/smartpqi/smartpqi_sis.h new file mode 100644 index 0000000000000..ad570d4cc16d6 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sis.h @@ -0,0 +1,45 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#if !defined(_SMARTPQI_SIS_H) +#define _SMARTPQI_SIS_H + +void sis_verify_structures(void); +int sis_wait_for_ctrl_ready(struct pqi_ctrl_info *ctrl_info); +int sis_wait_for_ctrl_ready_resume(struct pqi_ctrl_info *ctrl_info); +bool sis_is_firmware_running(struct pqi_ctrl_info *ctrl_info); +bool sis_is_kernel_up(struct pqi_ctrl_info *ctrl_info); +int sis_get_ctrl_properties(struct pqi_ctrl_info *ctrl_info); +int sis_get_pqi_capabilities(struct pqi_ctrl_info *ctrl_info); +int sis_init_base_struct_addr(struct pqi_ctrl_info *ctrl_info); +void sis_enable_msix(struct pqi_ctrl_info *ctrl_info); +void sis_enable_intx(struct pqi_ctrl_info *ctrl_info); +void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason); +int sis_pqi_reset_quiesce(struct pqi_ctrl_info *ctrl_info); +int sis_reenable_sis_mode(struct pqi_ctrl_info *ctrl_info); +void sis_write_driver_scratch(struct pqi_ctrl_info *ctrl_info, u32 value); +u32 sis_read_driver_scratch(struct pqi_ctrl_info *ctrl_info); +void sis_soft_reset(struct pqi_ctrl_info *ctrl_info); +u32 sis_get_product_id(struct pqi_ctrl_info *ctrl_info); +int sis_wait_for_fw_triage_completion(struct pqi_ctrl_info *ctrl_info); + +extern unsigned int sis_ctrl_ready_timeout_secs; + +#endif /* _SMARTPQI_SIS_H */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 49eb14271f287..770bbd3e9c205 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -169,30 +170,102 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } +static int memory_block_online(struct memory_block *mem) +{ + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + struct zone *zone; + int ret; + + zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages); + + /* + * Although vmemmap pages have a different lifecycle than the pages + * they describe (they remain until the memory is unplugged), doing + * their initialization and accounting at memory onlining/offlining + * stage helps to keep accounting easier to follow - e.g vmemmaps + * belong to the same zone as the memory they backed. 
+ */ + if (nr_vmemmap_pages) { + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); + if (ret) + return ret; + } + + ret = online_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages, zone); + if (ret) { + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + return ret; + } + + /* + * Account once onlining succeeded. If the zone was unpopulated, it is + * now already properly populated. + */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, nr_vmemmap_pages); + + return ret; +} + +static int memory_block_offline(struct memory_block *mem) +{ + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + struct zone *zone; + int ret; + + /* + * Unaccount before offlining, such that unpopulated zone and kthreads + * can properly be torn down in offline_pages(). + */ + if (nr_vmemmap_pages) { + zone = page_zone(pfn_to_page(start_pfn)); + adjust_present_page_count(zone, -nr_vmemmap_pages); + } + + ret = offline_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages); + if (ret) { + /* offline_pages() failed. Account back. */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, nr_vmemmap_pages); + return ret; + } + + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + +#ifdef CONFIG_PAGE_REPORTING + page_report_offline(start_pfn, nr_pages); +#endif + + return ret; +} + /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. */ static int -memory_block_action(unsigned long start_section_nr, unsigned long action, - int online_type, int nid) +memory_block_action(struct memory_block *mem, unsigned long action) { - unsigned long start_pfn; - unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; int ret; - start_pfn = section_nr_to_pfn(start_section_nr); - switch (action) { case MEM_ONLINE: - ret = online_pages(start_pfn, nr_pages, online_type, nid); + ret = memory_block_online(mem); break; case MEM_OFFLINE: - ret = offline_pages(start_pfn, nr_pages); + ret = memory_block_offline(mem); break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " - "%ld\n", __func__, start_section_nr, action, action); + "%ld\n", __func__, mem->start_section_nr, action, action); ret = -EINVAL; } @@ -210,9 +283,7 @@ static int memory_block_change_state(struct memory_block *mem, if (to_state == MEM_OFFLINE) mem->state = MEM_GOING_OFFLINE; - ret = memory_block_action(mem->start_section_nr, to_state, - mem->online_type, mem->nid); - + ret = memory_block_action(mem, to_state); mem->state = ret ? from_state_req : to_state; return ret; @@ -415,9 +486,10 @@ static DEVICE_ATTR_RW(auto_online_blocks); static ssize_t probe_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - u64 phys_addr; + u64 phys_addr, size; int nid, ret; unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; + mhp_t mhp_flags; ret = kstrtoull(buf, 0, &phys_addr); if (ret) @@ -430,10 +502,12 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, if (ret) return ret; + size = MIN_MEMORY_BLOCK_SIZE * sections_per_block; + mhp_flags = mhp_supports_memmap_on_memory(size) ? 
+ MHP_MEMMAP_ON_MEMORY : MHP_NONE; + nid = memory_add_physaddr_to_nid(phys_addr); - ret = __add_memory(nid, phys_addr, - MIN_MEMORY_BLOCK_SIZE * sections_per_block, - MHP_NONE); + ret = __add_memory(nid, phys_addr, size, mhp_flags); if (ret) goto out; @@ -447,6 +521,34 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_WO(probe); #endif +#ifdef CONFIG_ARCH_MEMORY_REMOVE +static ssize_t remove_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + u64 phys_addr; + int nid, ret; + unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; + + ret = kstrtoull(buf, 0, &phys_addr); + if (ret) + return ret; + + if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) + return -EINVAL; + + nid = memory_add_physaddr_to_nid(phys_addr); + ret = offline_and_remove_memory(nid, phys_addr, MIN_MEMORY_BLOCK_SIZE * sections_per_block); + + if (ret) + return ret; + + return count; +} + +static DEVICE_ATTR_WO(remove); +#endif + + #ifdef CONFIG_MEMORY_FAILURE /* * Support for offlining pages of memory @@ -566,7 +668,8 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(unsigned long block_id, unsigned long state) +static int init_memory_block(unsigned long block_id, unsigned long state, + unsigned long nr_vmemmap_pages) { struct memory_block *mem; int ret = 0; @@ -583,6 +686,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state) mem->start_section_nr = block_id * sections_per_block; mem->state = state; mem->nid = NUMA_NO_NODE; + mem->nr_vmemmap_pages = nr_vmemmap_pages; ret = register_memory(mem); @@ -602,7 +706,7 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return init_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE); + MEM_ONLINE, 0); } static void unregister_memory(struct memory_block *memory) @@ -624,7 +728,8 @@ static void unregister_memory(struct memory_block *memory) * * Called under device_hotplug_lock. */ -int create_memory_block_devices(unsigned long start, unsigned long size) +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); @@ -637,7 +742,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size) return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = init_memory_block(block_id, MEM_OFFLINE); + ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); if (ret) break; } @@ -691,6 +796,9 @@ static struct attribute *memory_root_attrs[] = { #ifdef CONFIG_ARCH_MEMORY_PROBE &dev_attr_probe.attr, #endif +#ifdef CONFIG_ARCH_MEMORY_REMOVE + &dev_attr_remove.attr, +#endif #ifdef CONFIG_MEMORY_FAILURE &dev_attr_soft_offline_page.attr, diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d68a8ca2161fb..61520f25695ea 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include #include @@ -81,6 +83,8 @@ enum blkif_state { BLKIF_STATE_CONNECTED, BLKIF_STATE_SUSPENDED, BLKIF_STATE_ERROR, + BLKIF_STATE_FREEZING, + BLKIF_STATE_FROZEN, }; struct grant { @@ -134,7 +138,7 @@ static LIST_HEAD(info_list); * by the backend driver. 
*/ -static unsigned int xen_blkif_max_segments = 32; +static unsigned int xen_blkif_max_segments = 64; module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444); MODULE_PARM_DESC(max_indirect_segments, "Maximum amount of segments in indirect requests (default is 32)"); @@ -229,6 +233,7 @@ struct blkfront_info struct list_head requests; struct bio_list bio_list; struct list_head info_list; + struct completion wait_backend_disconnected; }; static unsigned int nr_minors; @@ -270,6 +275,16 @@ static DEFINE_SPINLOCK(minor_lock); static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); static void blkfront_gather_backend_features(struct blkfront_info *info); static int negotiate_mq(struct blkfront_info *info); +static void __blkif_free(struct blkfront_info *info); + +static inline bool blkfront_ring_is_busy(struct blkif_front_ring *ring) +{ + if (RING_SIZE(ring) > RING_FREE_REQUESTS(ring) || + RING_HAS_UNCONSUMED_RESPONSES(ring)) + return true; + else + return false; +} #define for_each_rinfo(info, ptr, idx) \ for ((ptr) = (info)->rinfo, (idx) = 0; \ @@ -1026,6 +1041,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, info->sector_size = sector_size; info->physical_sector_size = physical_sector_size; blkif_set_queue_limits(info); + init_completion(&info->wait_backend_disconnected); return 0; } @@ -1249,6 +1265,8 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) /* Already hold rinfo->ring_lock. */ static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { + if (unlikely(rinfo->dev_info->connected == BLKIF_STATE_FREEZING)) + return; if (!RING_FULL(&rinfo->ring)) blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } @@ -1373,9 +1391,6 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) static void blkif_free(struct blkfront_info *info, int suspend) { - unsigned int i; - struct blkfront_ring_info *rinfo; - /* Prevent new requests being issued until we fix things up. */ info->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; @@ -1383,6 +1398,14 @@ static void blkif_free(struct blkfront_info *info, int suspend) if (info->rq) blk_mq_stop_hw_queues(info->rq); + __blkif_free(info); +} + +static void __blkif_free(struct blkfront_info *info) +{ + unsigned int i; + struct blkfront_ring_info *rinfo; + for_each_rinfo(info, rinfo, i) blkif_free_ring(rinfo); @@ -1594,8 +1617,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); - return IRQ_HANDLED; + if (info->connected != BLKIF_STATE_FREEZING) { + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); + return IRQ_HANDLED; + } } spin_lock_irqsave(&rinfo->ring_lock, flags); @@ -2113,6 +2138,7 @@ static int blkif_recover(struct blkfront_info *info) unsigned int segs; struct blkfront_ring_info *rinfo; + bool frozen = info->connected == BLKIF_STATE_FROZEN; blkfront_gather_backend_features(info); /* Reset limits changed by blk_mq_update_nr_hw_queues(). 
*/ blkif_set_queue_limits(info); @@ -2134,6 +2160,9 @@ static int blkif_recover(struct blkfront_info *info) kick_pending_request_queues(rinfo); } + if (frozen) + return 0; + list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); @@ -2447,6 +2476,7 @@ static void blkfront_connect(struct blkfront_info *info) return; case BLKIF_STATE_SUSPENDED: + case BLKIF_STATE_FROZEN: /* * If we are recovering from suspension, we need to wait * for the backend to announce it's features before @@ -2564,13 +2594,38 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + if (info->connected == BLKIF_STATE_FREEZING) { + __blkif_free(info); + info->connected = BLKIF_STATE_FROZEN; + complete(&info->wait_backend_disconnected); + break; + } + + break; + } + + /* + * We may somehow receive the backend's Closed state again while thawing + * or restoring, which causes the thaw or restore to fail. + * Ignore such an unexpected state anyway. + */ + if (info->connected == BLKIF_STATE_FROZEN && + dev->state == XenbusStateInitialised) { + dev_dbg(&dev->dev, + "ignore the backend's Closed state: %s", + dev->nodename); break; + } fallthrough; case XenbusStateClosing: - if (info) - blkfront_closing(info); - break; + if (info) { + if (info->connected == BLKIF_STATE_FREEZING) + xenbus_frontend_closed(dev); + else + blkfront_closing(info); + } + break; } } @@ -2713,6 +2768,94 @@ static void blkif_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&blkfront_mutex); } +static int blkfront_freeze(struct xenbus_device *dev) +{ + unsigned int i; + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + struct blkfront_ring_info *rinfo; + struct blkif_front_ring *ring; + /* This would be a reasonable timeout, as used in xenbus_dev_shutdown() */ + unsigned int timeout = 5 * HZ; + int err = 0; + + info->connected = BLKIF_STATE_FREEZING; + + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) { + rinfo = &info->rinfo[i]; + + gnttab_cancel_free_callback(&rinfo->callback); + flush_work(&rinfo->work); + } + + for (i = 0; i < info->nr_rings; i++) { + spinlock_t *lock; + bool busy; + unsigned long req_timeout_ms = 25; + unsigned long ring_timeout; + + rinfo = &info->rinfo[i]; + ring = &rinfo->ring; + + lock = &rinfo->ring_lock; + + ring_timeout = jiffies + + msecs_to_jiffies(req_timeout_ms * RING_SIZE(ring)); + + do { + spin_lock_irq(lock); + busy = blkfront_ring_is_busy(ring); + spin_unlock_irq(lock); + + if (busy) + msleep(req_timeout_ms); + else + break; + } while (time_is_after_jiffies(ring_timeout)); + + /* Timed out */ + if (busy) { + xenbus_dev_error(dev, err, "the ring is still busy"); + info->connected = BLKIF_STATE_CONNECTED; + return -EBUSY; + } + } + + /* Kick the backend to disconnect */ + xenbus_switch_state(dev, XenbusStateClosing); + + /* + * We don't want to move forward before the frontend is disconnected + * from the backend cleanly. 
+ */ + timeout = wait_for_completion_timeout(&info->wait_backend_disconnected, + timeout); + if (!timeout) { + err = -EBUSY; + xenbus_dev_error(dev, err, "Freezing timed out; " + "the device may be left in an inconsistent state"); + } + + return err; +} + +static int blkfront_restore(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + int err = 0; + + blkfront_gather_backend_features(info); + xlvbd_flush(info); + err = talk_to_blkback(dev, info); + if (err) + goto out; + blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings); + +out: + return err; +} + static const struct block_device_operations xlvbd_block_fops = { .owner = THIS_MODULE, @@ -2736,6 +2879,9 @@ static struct xenbus_driver blkfront_driver = { .resume = blkfront_resume, .otherend_changed = blkback_changed, .is_ready = blkfront_is_ready, + .freeze = blkfront_freeze, + .thaw = blkfront_restore, + .restore = blkfront_restore }; static void purge_persistent_grants(struct blkfront_info *info) diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index a7d9e4600d40e..5ed8ef408cc09 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -536,6 +536,19 @@ config HW_RANDOM_XIPHERA To compile this driver as a module, choose M here: the module will be called xiphera-trng. +config HW_RANDOM_GRAVITON + tristate "AWS Graviton Random Number Generator support" + depends on HW_RANDOM && ACPI && (ARM64 || COMPILE_TEST) + default HW_RANDOM + help + This driver provides kernel-side support for the Random Number + Generator SMC found on AWS Graviton systems. + + To compile this driver as a module, choose M here: the + module will be called graviton-rng. + + If unsure, say Y. + endif # HW_RANDOM config UML_RANDOM diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index 5da344509a4df..9ca4a225b0bca 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_HW_RANDOM_OPTEE) += optee-rng.o obj-$(CONFIG_HW_RANDOM_NPCM) += npcm-rng.o obj-$(CONFIG_HW_RANDOM_CCTRNG) += cctrng.o obj-$(CONFIG_HW_RANDOM_XIPHERA) += xiphera-trng.o +obj-$(CONFIG_HW_RANDOM_GRAVITON) += graviton-rng.o diff --git a/drivers/char/hw_random/graviton-rng.c b/drivers/char/hw_random/graviton-rng.c new file mode 100644 index 0000000000000..3a8f3fe35359b --- /dev/null +++ b/drivers/char/hw_random/graviton-rng.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AWS Graviton TRNG driver + * + * Copyright (C) 2019 Amazon Corp. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0x00ff) +#define AWS_GRAVITON_UUID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0xFF01) +#define AWS_GRAVITON_GET_VER \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0xFF03) + +#define AWS_GRAVITON_GET_RND \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_SIP, 0x60) +#define AWS_GRAVITON_GET_RND_LEGACY \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0x60) + +/** + * UID of the Graviton TRNG API: eb4af8a0-89d4-49c9-bc8c5b38dc54308e + */ +#define GRVTN_TRNG_UUID_0 0xa0f83aeb +#define GRVTN_TRNG_UUID_1 0xc949d489 +#define GRVTN_TRNG_UUID_2 0x385b8cbc +#define GRVTN_TRNG_UUID_3 0x8e3054dc + +struct grvtn_rng { + u64 call_id; + struct hwrng rng; +}; + +static void grvtn_smccc_conduit(u64 call_id, struct arm_smccc_res *res) +{ + if (acpi_psci_use_hvc()) + arm_smccc_1_1_hvc(call_id, res); + else + arm_smccc_1_1_smc(call_id, res); +} + +static int grvtn_probe_sip_feature(unsigned long feature) +{ + struct arm_smccc_res res = {}; + + if (acpi_psci_use_hvc()) + arm_smccc_1_1_hvc(ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE, + feature, 0, &res); + else + arm_smccc_1_1_smc(ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE, + feature, 0, &res); + + return res.a0; +} + +static int grvtn_trng_read(struct hwrng *rng, void *buf, size_t max, bool wait) +{ + struct grvtn_rng *priv = (struct grvtn_rng *)rng->priv; + struct arm_smccc_res res; + int err = 0; + /* timeout after one waiting period */ + int iter_remain = 2; + size_t count = max > sizeof(ulong) * 2 ? 
sizeof(ulong) * 2 : max; + size_t total = count; + + do { + if (err && wait) + /* Nominal wait is 5us */ + udelay(err); + + grvtn_smccc_conduit(priv->call_id, &res); + + /* In the unlikely event of rolling back to legacy after probe was issued */ + if (unlikely((res.a0 == SMCCC_RET_NOT_SUPPORTED) && (priv->call_id != AWS_GRAVITON_GET_RND_LEGACY))) { + grvtn_smccc_conduit(AWS_GRAVITON_GET_RND_LEGACY, &res); + priv->call_id = AWS_GRAVITON_GET_RND_LEGACY; + } + + err = (int) res.a0; + + if (err < 0) + return err; + + iter_remain--; + } while (iter_remain && err && wait); + + if (err) + return 0; + + if (count > sizeof(ulong)) { + memcpy(buf, &res.a1, sizeof(ulong)); + count -= sizeof(ulong); + buf += sizeof(ulong); + } + memcpy(buf, &res.a2, count); + return total; +} + +static int grvtn_trng_probe(struct platform_device *pdev) +{ + int version; + int err; + struct arm_smccc_res res; + struct grvtn_rng *priv; + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->rng.name = "graviton"; + priv->rng.read = grvtn_trng_read; + priv->rng.priv = (unsigned long)priv; + priv->rng.quality = 1024; /* all bits are sourced from a HW TRNG */ + priv->call_id = AWS_GRAVITON_GET_RND_LEGACY; /* default mode is legacy */ + + grvtn_smccc_conduit(AWS_GRAVITON_UUID, &res); + + if (res.a0 != GRVTN_TRNG_UUID_0 || res.a1 != GRVTN_TRNG_UUID_1 || + res.a2 != GRVTN_TRNG_UUID_2 || res.a3 != GRVTN_TRNG_UUID_3) { + dev_err(&pdev->dev, "failed to match UUID\n"); + return -ENXIO; + } + + grvtn_smccc_conduit(AWS_GRAVITON_GET_VER, &res); + dev_info(&pdev->dev, "Graviton TRNG, SMC version %d.%d\n", + (u32)res.a0, (u32)res.a1); + + version = grvtn_probe_sip_feature(AWS_GRAVITON_GET_RND); + if (version > 0) + priv->call_id = AWS_GRAVITON_GET_RND; + + platform_set_drvdata(pdev, priv); + err = devm_hwrng_register(&pdev->dev, &priv->rng); + if (err) + dev_err(&pdev->dev, "failed to register hwrng"); + return err; +} + +static const struct acpi_device_id grvtn_trng_acpi_match[] = { + { "AMZN0010", }, + {} +}; + +MODULE_DEVICE_TABLE(acpi, grvtn_trng_acpi_match); + +static struct platform_driver grvtn_trng_driver = { + .probe = grvtn_trng_probe, + .driver = { + .name = "graviton-rng", + .owner = THIS_MODULE, + .acpi_match_table = ACPI_PTR(grvtn_trng_acpi_match), + }, +}; + +module_platform_driver(grvtn_trng_driver); + +MODULE_AUTHOR("Amazon.com, Inc. 
or it's affiliates"); +MODULE_DESCRIPTION("Graviton TRNG driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index f4881764bf8f4..5aeab5445f0b4 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include #include +#include +#include #include #include @@ -191,6 +194,7 @@ static u64 arch_counter_read_cc(const struct cyclecounter *cc) static struct clocksource clocksource_counter = { .name = "arch_sys_counter", + .id = CSID_ARM_ARCH_COUNTER, .rating = 400, .read = arch_counter_read, .mask = CLOCKSOURCE_MASK(56), @@ -1657,3 +1661,35 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) } TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); #endif + +int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts, + struct clocksource **cs) +{ + struct arm_smccc_res hvc_res; + u32 ptp_counter; + ktime_t ktime; + + if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)) + return -EOPNOTSUPP; + + if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) + ptp_counter = KVM_PTP_VIRT_COUNTER; + else + ptp_counter = KVM_PTP_PHYS_COUNTER; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, + ptp_counter, &hvc_res); + + if ((int)(hvc_res.a0) < 0) + return -EOPNOTSUPP; + + ktime = (u64)hvc_res.a0 << 32 | hvc_res.a1; + *ts = ktime_to_timespec64(ktime); + if (cycle) + *cycle = (u64)hvc_res.a2 << 32 | hvc_res.a3; + if (cs) + *cs = &clocksource_counter; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp); diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 7f4bafcd9d335..58943f9290b40 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -134,6 +134,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, */ status = efi_random_alloc(*reserve_size, min_kimg_align, reserve_addr, phys_seed); + if (status != EFI_SUCCESS) + efi_warn("efi_random_alloc() failed: 0x%lx\n", status); } else { status = EFI_OUT_OF_RESOURCES; } diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 00af99b6f97c1..ffde5feb728d7 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -470,6 +471,7 @@ static int __init psci_probe(void) psci_init_cpu_suspend(); psci_init_system_suspend(); psci_init_system_reset2(); + kvm_init_hyp_services(); } return 0; diff --git a/drivers/firmware/smccc/Makefile b/drivers/firmware/smccc/Makefile index 72ab840428324..40d19144a8607 100644 --- a/drivers/firmware/smccc/Makefile +++ b/drivers/firmware/smccc/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 # -obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o +obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o kvm_guest.o obj-$(CONFIG_ARM_SMCCC_SOC_ID) += soc_id.o diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c new file mode 100644 index 0000000000000..2d3e866decaa6 --- /dev/null +++ b/drivers/firmware/smccc/kvm_guest.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "smccc: KVM: " fmt + +#include +#include +#include +#include + +#include + +static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { }; + +void __init kvm_init_hyp_services(void) +{ + struct 
arm_smccc_res res; + u32 val[4]; + + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_HVC) + return; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res); + if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 || + res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 || + res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 || + res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3) + return; + + memset(&res, 0, sizeof(res)); + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res); + + val[0] = lower_32_bits(res.a0); + val[1] = lower_32_bits(res.a1); + val[2] = lower_32_bits(res.a2); + val[3] = lower_32_bits(res.a3); + + bitmap_from_arr32(__kvm_arm_hyp_services, val, ARM_SMCCC_KVM_NUM_FUNCS); + + pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n", + res.a3, res.a2, res.a1, res.a0); +} + +bool kvm_arm_hyp_service_available(u32 func_id) +{ + if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS) + return false; + + return test_bit(func_id, __kvm_arm_hyp_services); +} +EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index b92b032fb6d13..8db9d2f7ee742 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -66,11 +66,17 @@ static struct cpuidle_driver intel_idle_driver = { /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; +static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; -static bool disable_promotion_to_c1e; + +static enum { + C1E_PROMOTION_PRESERVE, + C1E_PROMOTION_ENABLE, + C1E_PROMOTION_DISABLE +} c1e_promotion = C1E_PROMOTION_PRESERVE; struct idle_cpu { struct cpuidle_state *state_table; @@ -778,6 +784,35 @@ static struct cpuidle_state icx_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 290, + .target_residency = 800, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", @@ -1088,6 +1123,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_spr __initconst = { + .state_table = spr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1143,6 +1184,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &idle_cpu_skl), X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), @@ -1470,6 +1512,27 @@ 
static void __init sklh_idle_state_table_update(void) skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } +/** + * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. + */ +static void __init spr_idle_state_table_update(void) +{ + unsigned long long msr; + + /* + * By default, the C6 state assumes the worst-case scenario of package + * C6. However, if PC6 is disabled, we update the numbers to match + * core C6. + */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* Limit value 2 and above allow for PC6. */ + if ((msr & 0x7) < 2) { + spr_cstates[2].exit_latency = 190; + spr_cstates[2].target_residency = 600; + } +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; @@ -1501,6 +1564,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SKYLAKE: sklh_idle_state_table_update(); break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + spr_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { @@ -1578,6 +1644,15 @@ static void auto_demotion_disable(void) wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); } +static void c1e_promotion_enable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits |= 0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -1609,7 +1684,9 @@ static int intel_idle_cpu_init(unsigned int cpu) if (auto_demotion_disable_flags) auto_demotion_disable(); - if (disable_promotion_to_c1e) + if (c1e_promotion == C1E_PROMOTION_ENABLE) + c1e_promotion_enable(); + else if (c1e_promotion == C1E_PROMOTION_DISABLE) c1e_promotion_disable(); return 0; @@ -1688,7 +1765,8 @@ static int __init intel_idle_init(void) if (icpu) { cpuidle_state_table = icpu->state_table; auto_demotion_disable_flags = icpu->auto_demotion_disable_flags; - disable_promotion_to_c1e = icpu->disable_promotion_to_c1e; + if (icpu->disable_promotion_to_c1e) + c1e_promotion = C1E_PROMOTION_DISABLE; if (icpu->use_acpi || force_use_acpi) intel_idle_acpi_cst_extract(); } else if (!intel_idle_acpi_cst_extract()) { @@ -1747,3 +1825,14 @@ module_param(max_cstate, int, 0444); */ module_param_named(states_off, disabled_states_mask, uint, 0444); MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); +/* + * Some platforms come with mutually exclusive C-states, so that if one is + * enabled, the other C-states must not be used. Example: C1 and C1E on + * Sapphire Rapids platform. This parameter allows for selecting the + * preferred C-states among the groups of mutually exclusive C-states - the + * selected C-states will be registered, the other C-states from the mutually + * exclusive group won't be registered. If the platform has no mutually + * exclusive C-states, this parameter has no effect. 
+ */ +module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); +MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 04878caf6da49..806af9c742ab4 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -103,6 +103,10 @@ config IOMMU_DMA select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH +config IOMMU_DEFAULT_STRICT + def_bool n + depends on IOMMU_API && (ARM || ARM64) + config FSL_PAMU bool "Freescale IOMMU support" depends on PCI diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9d65557dfb2ce..1849078eb784c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -29,7 +29,12 @@ static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static unsigned int iommu_def_domain_type __read_mostly; + +#ifdef CONFIG_IOMMU_DEFAULT_STRICT static bool iommu_dma_strict __read_mostly = true; +#else +static bool iommu_dma_strict __read_mostly = false; +#endif static u32 iommu_cmd_line __read_mostly; struct iommu_group { diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index fafa8b0d80996..a2b7cae7a3595 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -456,6 +456,21 @@ config PVPANIC a paravirtualized device provided by QEMU; it lets a virtual machine (guest) communicate panic events to the host. +config SYSGENID + tristate "System Generation ID driver" + help + This is a System Generation ID driver which provides a system + generation counter. The driver exposes FS ops on /dev/sysgenid + through which it can provide information and notifications on system + generation changes that happen because of VM or container snapshots + or cloning. + This enables applications and libraries that store or cache + sensitive information, to know that they need to regenerate it + after process memory has been exposed to potential copying. + + To compile this driver as a module, choose M here: the + module will be called sysgenid. + config HISI_HIKEY_USB tristate "USB GPIO Hub on HiSilicon Hikey 960/970 Platform" depends on (OF && GPIOLIB) || COMPILE_TEST diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index d23231e733303..4b4933d0619dc 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -57,3 +57,4 @@ obj-$(CONFIG_HABANA_AI) += habanalabs/ obj-$(CONFIG_UACCE) += uacce/ obj-$(CONFIG_XILINX_SDFEC) += xilinx_sdfec.o obj-$(CONFIG_HISI_HIKEY_USB) += hisi_hikey_usb.o +obj-$(CONFIG_SYSGENID) += sysgenid.o diff --git a/drivers/misc/sysgenid.c b/drivers/misc/sysgenid.c new file mode 100644 index 0000000000000..ace292b83be4a --- /dev/null +++ b/drivers/misc/sysgenid.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * System Generation ID driver + * + * Copyright (C) 2020 Amazon. All rights reserved. 
+ * + * Authors: + * Adrian Catangiu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sysgenid_data { + unsigned long map_buf; + wait_queue_head_t read_waitq; + atomic_t generation_counter; + + unsigned int watchers; + atomic_t outdated_watchers; + wait_queue_head_t outdated_waitq; + spinlock_t lock; +}; +static struct sysgenid_data sysgenid_data; + +struct file_data { + bool tracked_watcher; + int acked_gen_counter; +}; + +static int equals_gen_counter(unsigned int counter) +{ + return counter == atomic_read(&sysgenid_data.generation_counter); +} + +static void _bump_generation(int min_gen) +{ + unsigned long flags; + int counter; + + spin_lock_irqsave(&sysgenid_data.lock, flags); + counter = max(min_gen, 1 + atomic_read(&sysgenid_data.generation_counter)); + atomic_set(&sysgenid_data.generation_counter, counter); + *((int *) sysgenid_data.map_buf) = counter; + atomic_set(&sysgenid_data.outdated_watchers, sysgenid_data.watchers); + + wake_up_interruptible(&sysgenid_data.read_waitq); + wake_up_interruptible(&sysgenid_data.outdated_waitq); + spin_unlock_irqrestore(&sysgenid_data.lock, flags); +} + +void sysgenid_bump_generation(void) +{ + _bump_generation(0); +} +EXPORT_SYMBOL_GPL(sysgenid_bump_generation); + +static void put_outdated_watchers(void) +{ + if (atomic_dec_and_test(&sysgenid_data.outdated_watchers)) + wake_up_interruptible(&sysgenid_data.outdated_waitq); +} + +static void start_fd_tracking(struct file_data *fdata) +{ + unsigned long flags; + + if (!fdata->tracked_watcher) { + /* enable tracking this fd as a watcher */ + spin_lock_irqsave(&sysgenid_data.lock, flags); + fdata->tracked_watcher = 1; + ++sysgenid_data.watchers; + if (!equals_gen_counter(fdata->acked_gen_counter)) + atomic_inc(&sysgenid_data.outdated_watchers); + spin_unlock_irqrestore(&sysgenid_data.lock, flags); + } +} + +static void stop_fd_tracking(struct file_data *fdata) +{ + unsigned long flags; + + if (fdata->tracked_watcher) { + /* stop tracking this fd as a watcher */ + spin_lock_irqsave(&sysgenid_data.lock, flags); + if (!equals_gen_counter(fdata->acked_gen_counter)) + put_outdated_watchers(); + --sysgenid_data.watchers; + fdata->tracked_watcher = 0; + spin_unlock_irqrestore(&sysgenid_data.lock, flags); + } +} + +static int sysgenid_open(struct inode *inode, struct file *file) +{ + struct file_data *fdata = kzalloc(sizeof(struct file_data), GFP_KERNEL); + + if (!fdata) + return -ENOMEM; + fdata->tracked_watcher = 0; + fdata->acked_gen_counter = atomic_read(&sysgenid_data.generation_counter); + file->private_data = fdata; + + return 0; +} + +static int sysgenid_close(struct inode *inode, struct file *file) +{ + struct file_data *fdata = file->private_data; + + stop_fd_tracking(fdata); + kfree(fdata); + + return 0; +} + +static ssize_t sysgenid_read(struct file *file, char __user *ubuf, + size_t nbytes, loff_t *ppos) +{ + struct file_data *fdata = file->private_data; + ssize_t ret; + int gen_counter; + + if (nbytes == 0) + return 0; + /* disallow partial reads */ + if (nbytes < sizeof(gen_counter)) + return -EINVAL; + + if (equals_gen_counter(fdata->acked_gen_counter)) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + ret = wait_event_interruptible( + sysgenid_data.read_waitq, + !equals_gen_counter(fdata->acked_gen_counter) + ); + if (ret) + return ret; + } + + gen_counter = atomic_read(&sysgenid_data.generation_counter); + ret = copy_to_user(ubuf, &gen_counter, sizeof(gen_counter)); + if (ret) + return -EFAULT; + + return 
sizeof(gen_counter); +} + +static ssize_t sysgenid_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct file_data *fdata = file->private_data; + unsigned int new_acked_gen; + unsigned long flags; + + /* disallow partial writes */ + if (count != sizeof(new_acked_gen)) + return -ENOBUFS; + if (copy_from_user(&new_acked_gen, ubuf, count)) + return -EFAULT; + + spin_lock_irqsave(&sysgenid_data.lock, flags); + /* wrong gen-counter acknowledged */ + if (!equals_gen_counter(new_acked_gen)) { + spin_unlock_irqrestore(&sysgenid_data.lock, flags); + return -EINVAL; + } + /* update acked gen-counter if necessary */ + if (!equals_gen_counter(fdata->acked_gen_counter)) { + fdata->acked_gen_counter = new_acked_gen; + if (fdata->tracked_watcher) + put_outdated_watchers(); + } + spin_unlock_irqrestore(&sysgenid_data.lock, flags); + + return (ssize_t)count; +} + +static __poll_t sysgenid_poll(struct file *file, poll_table *wait) +{ + __poll_t mask = 0; + struct file_data *fdata = file->private_data; + + if (!equals_gen_counter(fdata->acked_gen_counter)) + return EPOLLIN | EPOLLRDNORM; + + poll_wait(file, &sysgenid_data.read_waitq, wait); + + if (!equals_gen_counter(fdata->acked_gen_counter)) + mask = EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static long sysgenid_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct file_data *fdata = file->private_data; + bool tracking = !!arg; + unsigned long timeout_ns, min_gen; + ktime_t until; + int ret = 0; + + switch (cmd) { + case SYSGENID_SET_WATCHER_TRACKING: + if (tracking) + start_fd_tracking(fdata); + else + stop_fd_tracking(fdata); + break; + case SYSGENID_WAIT_WATCHERS: + timeout_ns = arg * NSEC_PER_MSEC; + until = timeout_ns ? ktime_set(0, timeout_ns) : KTIME_MAX; + + ret = wait_event_interruptible_hrtimeout( + sysgenid_data.outdated_waitq, + (!atomic_read(&sysgenid_data.outdated_watchers) || + !equals_gen_counter(fdata->acked_gen_counter)), + until + ); + if (!equals_gen_counter(fdata->acked_gen_counter)) + ret = -EINTR; + break; + case SYSGENID_TRIGGER_GEN_UPDATE: + if (!checkpoint_restore_ns_capable(current_user_ns())) + return -EACCES; + min_gen = arg; + _bump_generation(min_gen); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int sysgenid_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file_data *fdata = file->private_data; + + if (vma->vm_pgoff != 0 || vma_pages(vma) > 1) + return -EINVAL; + + if ((vma->vm_flags & VM_WRITE) != 0) + return -EPERM; + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_private_data = fdata; + + return vm_insert_page(vma, vma->vm_start, + virt_to_page(sysgenid_data.map_buf)); +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .mmap = sysgenid_mmap, + .open = sysgenid_open, + .release = sysgenid_close, + .read = sysgenid_read, + .write = sysgenid_write, + .poll = sysgenid_poll, + .unlocked_ioctl = sysgenid_ioctl, +}; + +static struct miscdevice sysgenid_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sysgenid", + .fops = &fops, +}; + +static int __init sysgenid_init(void) +{ + int ret; + + sysgenid_data.map_buf = get_zeroed_page(GFP_KERNEL); + if (!sysgenid_data.map_buf) + return -ENOMEM; + + atomic_set(&sysgenid_data.generation_counter, 0); + atomic_set(&sysgenid_data.outdated_watchers, 0); + init_waitqueue_head(&sysgenid_data.read_waitq); + init_waitqueue_head(&sysgenid_data.outdated_waitq); + spin_lock_init(&sysgenid_data.lock); + + ret = 
misc_register(&sysgenid_misc); + if (ret < 0) { + pr_err("misc_register() failed for sysgenid\n"); + goto err; + } + + return 0; + +err: + free_pages(sysgenid_data.map_buf, 0); + sysgenid_data.map_buf = 0; + + return ret; +} + +static void __exit sysgenid_exit(void) +{ + misc_deregister(&sysgenid_misc); + free_pages(sysgenid_data.map_buf, 0); + sysgenid_data.map_buf = 0; +} + +module_init(sysgenid_init); +module_exit(sysgenid_exit); + +MODULE_AUTHOR("Adrian Catangiu"); +MODULE_DESCRIPTION("System Generation ID"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1"); diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 3d149890fa36e..5b001a8ef7534 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,12 @@ #include #include +enum netif_freeze_state { + NETIF_FREEZE_STATE_UNFROZEN, + NETIF_FREEZE_STATE_FREEZING, + NETIF_FREEZE_STATE_FROZEN, +}; + /* Module parameters */ #define MAX_QUEUES_DEFAULT 8 static unsigned int xennet_max_queues; @@ -72,6 +79,12 @@ MODULE_PARM_DESC(trusted, "Is the backend trusted"); #define XENNET_TIMEOUT (5 * HZ) +static unsigned int netfront_freeze_timeout_secs = 10; +module_param_named(freeze_timeout_secs, + netfront_freeze_timeout_secs, uint, 0644); +MODULE_PARM_DESC(freeze_timeout_secs, + "timeout when freezing netfront device in seconds"); + static const struct ethtool_ops xennet_ethtool_ops; struct netfront_cb { @@ -183,6 +196,10 @@ struct netfront_info { bool bounce; atomic_t rx_gso_checksum_fixup; + + int freeze_state; + + struct completion wait_backend_disconnected; }; struct netfront_rx_info { @@ -913,6 +930,21 @@ static void xennet_set_rx_rsp_cons(struct netfront_queue *queue, RING_IDX val) spin_unlock_irqrestore(&queue->rx_cons_lock, flags); } +static int xennet_disable_interrupts(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + unsigned int num_queues = dev->real_num_tx_queues; + unsigned int i; + struct netfront_queue *queue; + + for (i = 0; i < num_queues; ++i) { + queue = &np->queues[i]; + disable_irq(queue->tx_irq); + disable_irq(queue->rx_irq); + } + return 0; +} + static void xennet_move_rx_slot(struct netfront_queue *queue, struct sk_buff *skb, grant_ref_t ref) { @@ -1724,6 +1756,8 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) np->queues = NULL; + init_completion(&np->wait_backend_disconnected); + err = -ENOMEM; np->rx_stats = netdev_alloc_pcpu_stats(struct netfront_stats); if (np->rx_stats == NULL) @@ -2280,6 +2314,50 @@ static int xennet_create_queues(struct netfront_info *info, return 0; } +static int netfront_freeze(struct xenbus_device *dev) +{ + struct netfront_info *info = dev_get_drvdata(&dev->dev); + unsigned long timeout = netfront_freeze_timeout_secs * HZ; + int err = 0; + + xennet_disable_interrupts(info->netdev); + + netif_device_detach(info->netdev); + + info->freeze_state = NETIF_FREEZE_STATE_FREEZING; + + /* Kick the backend to disconnect */ + xenbus_switch_state(dev, XenbusStateClosing); + + /* We don't want to move forward before the frontend is disconnected + * from the backend cleanly.
+ */ + timeout = wait_for_completion_timeout(&info->wait_backend_disconnected, + timeout); + if (!timeout) { + err = -EBUSY; + xenbus_dev_error(dev, err, "Freezing timed out; " + "the device may be left in an inconsistent state"); + return err; + } + + /* Tear down queues */ + xennet_disconnect_backend(info); + xennet_destroy_queues(info); + + info->freeze_state = NETIF_FREEZE_STATE_FROZEN; + + return err; +} + +static int netfront_restore(struct xenbus_device *dev) +{ + /* Kick the backend to re-connect */ + xenbus_switch_state(dev, XenbusStateInitialising); + + return 0; +} + /* Common code used when first setting up, and when resuming. */ static int talk_to_netback(struct xenbus_device *dev, struct netfront_info *info) @@ -2479,6 +2557,13 @@ static int xennet_connect(struct net_device *dev) device_unregister(&np->xbdev->dev); return err; } + } else { + /* + * In the resume / thaw case, the netif needs to be + * reattached, as it was detached in netfront_freeze(). + */ + if (np->freeze_state == NETIF_FREEZE_STATE_FROZEN) + netif_device_attach(dev); } rtnl_lock(); @@ -2512,6 +2597,8 @@ static int xennet_connect(struct net_device *dev) spin_unlock_bh(&queue->rx_lock); } + np->freeze_state = NETIF_FREEZE_STATE_UNFROZEN; + return 0; } @@ -2549,10 +2636,22 @@ static void netback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + /* dpm context is waiting for the backend */ + if (np->freeze_state == NETIF_FREEZE_STATE_FREEZING) + complete(&np->wait_backend_disconnected); break; + } fallthrough; /* Missed the backend's CLOSING state */ case XenbusStateClosing: + /* We may see an unexpected Closed or Closing from the backend. + * Just ignore it so that it does not prevent the frontend from being + * re-connected in the case of PM suspend or hibernation.
+ */ + if (np->freeze_state == NETIF_FREEZE_STATE_FROZEN && + dev->state == XenbusStateInitialising) { + break; + } xenbus_frontend_closed(dev); break; } @@ -2715,6 +2814,9 @@ static struct xenbus_driver netfront_driver = { .probe = netfront_probe, .remove = xennet_remove, .resume = netfront_resume, + .freeze = netfront_freeze, + .thaw = netfront_restore, + .restore = netfront_restore, .otherend_changed = netback_changed, }; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 07c41a149328a..1bb2bb840ffe3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1116,7 +1116,7 @@ EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned int timeout, bool vec) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -1135,8 +1135,22 @@ static int nvme_submit_user_cmd(struct request_queue *q, nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + if (!vec) + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, GFP_KERNEL); + else { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov = fast_iov; + struct iov_iter iter; + + ret = import_iovec(rq_data_dir(req), ubuffer, bufflen, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + goto out; + ret = blk_rq_map_user_iov(q, req, NULL, &iter, + GFP_KERNEL); + kfree(iov); + } if (ret) goto out; bio = req->bio; @@ -1599,7 +1613,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return nvme_submit_user_cmd(ns->queue, &c, nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); + metadata, meta_len, lower_32_bits(io.slba), NULL, 0, + false); } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1637,7 +1652,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); + 0, &result, timeout, false); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -1648,7 +1663,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) + struct nvme_passthru_cmd64 __user *ucmd, bool vec) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; @@ -1681,7 +1696,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); + 0, &cmd.result, timeout, vec); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -1746,7 +1761,7 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, ret = nvme_user_cmd(ctrl, NULL, argp); break; case NVME_IOCTL_ADMIN64_CMD: - ret = nvme_user_cmd64(ctrl, NULL, argp); + ret = nvme_user_cmd64(ctrl, NULL, argp, false); break; default: ret = sed_ioctl(ctrl->opal_dev, cmd, argp); @@ -1788,7 +1803,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, ret = nvme_submit_io(ns, argp); break; case NVME_IOCTL_IO64_CMD: - ret = nvme_user_cmd64(ns->ctrl, ns, argp); + ret = nvme_user_cmd64(ns->ctrl, ns, argp, false); + break; + case NVME_IOCTL_IO64_CMD_VEC: + ret = nvme_user_cmd64(ns->ctrl, ns, argp, true); break; default: if (ns->ndev) @@ -3336,7 +3354,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, false); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c3e4d9b6f9c0d..54f97335dd416 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -145,6 +145,12 @@ enum nvme_quirks { */ NVME_QUIRK_NO_NS_DESC_LIST = (1 << 15), + /* + * The controller does not properly handle DMA addresses over + * 48 bits. + */ + NVME_QUIRK_DMA_ADDRESS_BITS_48 = (1 << 16), + /* * The controller requires the command_id value be be limited, so skip * encoding the generation sequence number. 
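The nvme-core hunk above wires a vectored variant into the 64-bit passthrough path: when 'vec' is true, nvme_submit_user_cmd() hands cmd.addr to import_iovec() and blk_rq_map_user_iov(), so userspace passes a pointer to an array of struct iovec in 'addr' and the number of vectors (not a byte count) in 'data_len'. The sketch below is a hypothetical caller written against that interface; it assumes NVME_IOCTL_IO64_CMD_VEC and struct nvme_passthru_cmd64 are visible from the kernel's <linux/nvme_ioctl.h> uapi header on a kernel carrying this series, and the device path, namespace id, LBA and 4 KiB block size are illustrative only.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>	/* NVME_IOCTL_IO64_CMD_VEC, struct nvme_passthru_cmd64 (assumed present) */

int main(void)
{
	/* Two 4 KiB buffers filled by a single two-block read (assumes a 4096-byte LBA). */
	void *buf0, *buf1;
	struct iovec iov[2];
	struct nvme_passthru_cmd64 cmd;
	int fd, ret;

	if (posix_memalign(&buf0, 4096, 4096) || posix_memalign(&buf1, 4096, 4096))
		return 1;
	iov[0] = (struct iovec){ .iov_base = buf0, .iov_len = 4096 };
	iov[1] = (struct iovec){ .iov_base = buf1, .iov_len = 4096 };

	fd = open("/dev/nvme0n1", O_RDONLY);	/* passthrough needs CAP_SYS_ADMIN */
	if (fd < 0)
		return 1;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x02;			/* NVM read */
	cmd.nsid     = 1;			/* illustrative namespace id */
	cmd.addr     = (uintptr_t)iov;		/* vectored: pointer to the iovec array */
	cmd.data_len = 2;			/* vectored: iovec count, not a byte length */
	cmd.cdw10    = 0;			/* starting LBA, low 32 bits */
	cmd.cdw11    = 0;			/* starting LBA, high 32 bits */
	cmd.cdw12    = 2 - 1;			/* number of logical blocks, 0-based */

	ret = ioctl(fd, NVME_IOCTL_IO64_CMD_VEC, &cmd);
	printf("ioctl: %d, result: %llu\n", ret, (unsigned long long)cmd.result);

	close(fd);
	free(buf0);
	free(buf1);
	return ret ? 1 : 0;
}

Reusing data_len as the vector count keeps struct nvme_passthru_cmd64 unchanged on the wire, at the cost of the field meaning different things for the plain and vectored ioctls.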
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3aaead9b3a570..6ce3513eadb84 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2371,13 +2371,16 @@ static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); + int dma_address_bits = 64; if (pci_enable_device_mem(pdev)) return result; pci_set_master(pdev); - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64))) + if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) + dma_address_bits = 48; + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits))) goto disable; if (readl(dev->bar + NVME_REG_CSTS) == -1) { @@ -3268,6 +3271,20 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x1d97, 0x2263), /* SPCC */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), .driver_data = NVME_QUIRK_SINGLE_VECTOR }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, diff --git a/drivers/of/device.c b/drivers/of/device.c index 3a547793135c3..d21653a74e57e 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -7,6 +7,7 @@ #include #include /* for bus_dma_region */ #include +#include #include #include #include @@ -186,6 +187,11 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif + return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 3da69b26e6743..cc0683b9312fd 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -318,29 +318,14 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) /* Don't touch the hardware now */ } else if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); - bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT); if (!base) goto skip; - /* - * The specification mandates that the entry is masked - * when the message is modified: - * - * "If software changes the Address or Data value of an - * entry while the entry is unmasked, the result is - * undefined." 
- */ - if (unmasked) - __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT); - writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); writel(msg->data, base + PCI_MSIX_ENTRY_DATA); - if (unmasked) - __pci_msix_desc_mask_irq(entry, 0); - /* Ensure that the writes are visible in the device */ readl(base + PCI_MSIX_ENTRY_DATA); } else { diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 130327ff0b0ec..828a042d6a07b 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -43,7 +43,7 @@ config ARM_CCN config ARM_CMN tristate "Arm CMN-600 PMU support" - depends on ARM64 || (COMPILE_TEST && 64BIT) + depends on ARM64 || COMPILE_TEST help Support for PMU events monitoring on the Arm CMN-600 Coherent Mesh Network interconnect. diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index 87c4be9dd4125..8468f0e8b704e 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -306,7 +306,7 @@ static ssize_t cci400_pmu_cycle_event_show(struct device *dev, { struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "config=0x%lx\n", (unsigned long)eattr->var); + return sysfs_emit(buf, "config=0x%lx\n", (unsigned long)eattr->var); } static int cci400_get_event_idx(struct cci_pmu *cci_pmu, @@ -525,8 +525,8 @@ static ssize_t cci5xx_pmu_global_event_show(struct device *dev, struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); /* Global events have single fixed source code */ - return snprintf(buf, PAGE_SIZE, "event=0x%lx,source=0x%x\n", - (unsigned long)eattr->var, CCI5xx_PORT_GLOBAL); + return sysfs_emit(buf, "event=0x%lx,source=0x%x\n", + (unsigned long)eattr->var, CCI5xx_PORT_GLOBAL); } /* @@ -696,7 +696,7 @@ static ssize_t cci_pmu_format_show(struct device *dev, { struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "%s\n", (char *)eattr->var); + return sysfs_emit(buf, "%s\n", (char *)eattr->var); } static ssize_t cci_pmu_event_show(struct device *dev, @@ -705,8 +705,8 @@ static ssize_t cci_pmu_event_show(struct device *dev, struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); /* source parameter is mandatory for normal PMU events */ - return snprintf(buf, PAGE_SIZE, "source=?,event=0x%lx\n", - (unsigned long)eattr->var); + return sysfs_emit(buf, "source=?,event=0x%lx\n", + (unsigned long)eattr->var); } static int pmu_is_valid_counter(struct cci_pmu *cci_pmu, int idx) @@ -1376,7 +1376,7 @@ static struct attribute *pmu_attrs[] = { NULL, }; -static struct attribute_group pmu_attr_group = { +static const struct attribute_group pmu_attr_group = { .attrs = pmu_attrs, }; diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index a0a71c1df042a..3a2ddc0cc6c39 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -221,7 +221,7 @@ static ssize_t arm_ccn_pmu_format_show(struct device *dev, struct dev_ext_attribute *ea = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "%s\n", (char *)ea->var); + return sysfs_emit(buf, "%s\n", (char *)ea->var); } #define CCN_FORMAT_ATTR(_name, _config) \ @@ -476,7 +476,7 @@ static ssize_t arm_ccn_pmu_cmp_mask_show(struct device *dev, struct arm_ccn *ccn = pmu_to_arm_ccn(dev_get_drvdata(dev)); u64 *mask = arm_ccn_pmu_get_cmp_mask(ccn, attr->attr.name); - return mask ? 
snprintf(buf, PAGE_SIZE, "0x%016llx\n", *mask) : -EINVAL; + return mask ? sysfs_emit(buf, "0x%016llx\n", *mask) : -EINVAL; } static ssize_t arm_ccn_pmu_cmp_mask_store(struct device *dev, diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 36061aaf026c8..9c7a5533622f5 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -5,8 +5,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -23,7 +25,10 @@ #define CMN_NI_LOGICAL_ID GENMASK_ULL(47, 32) #define CMN_NODEID_DEVID(reg) ((reg) & 3) +#define CMN_NODEID_EXT_DEVID(reg) ((reg) & 1) #define CMN_NODEID_PID(reg) (((reg) >> 2) & 1) +#define CMN_NODEID_EXT_PID(reg) (((reg) >> 1) & 3) +#define CMN_NODEID_1x1_PID(reg) (((reg) >> 2) & 7) #define CMN_NODEID_X(reg, bits) ((reg) >> (3 + (bits))) #define CMN_NODEID_Y(reg, bits) (((reg) >> 3) & ((1U << (bits)) - 1)) @@ -31,35 +36,58 @@ #define CMN_CI_CHILD_COUNT GENMASK_ULL(15, 0) #define CMN_CI_CHILD_PTR_OFFSET GENMASK_ULL(31, 16) -#define CMN_CHILD_NODE_ADDR GENMASK(27,0) +#define CMN_CHILD_NODE_ADDR GENMASK(29, 0) #define CMN_CHILD_NODE_EXTERNAL BIT(31) -#define CMN_ADDR_NODE_PTR GENMASK(27, 14) +#define CMN_MAX_DIMENSION 12 +#define CMN_MAX_XPS (CMN_MAX_DIMENSION * CMN_MAX_DIMENSION) +#define CMN_MAX_DTMS (CMN_MAX_XPS + (CMN_MAX_DIMENSION - 1) * 4) -#define CMN_NODE_PTR_DEVID(ptr) (((ptr) >> 2) & 3) -#define CMN_NODE_PTR_PID(ptr) ((ptr) & 1) -#define CMN_NODE_PTR_X(ptr, bits) ((ptr) >> (6 + (bits))) -#define CMN_NODE_PTR_Y(ptr, bits) (((ptr) >> 6) & ((1U << (bits)) - 1)) - -#define CMN_MAX_XPS (8 * 8) - -/* The CFG node has one other useful purpose */ +/* The CFG node has various info besides the discovery tree */ #define CMN_CFGM_PERIPH_ID_2 0x0010 #define CMN_CFGM_PID2_REVISION GENMASK(7, 4) -/* PMU registers occupy the 3rd 4KB page of each node's 16KB space */ +#define CMN_CFGM_INFO_GLOBAL 0x900 +#define CMN_INFO_MULTIPLE_DTM_EN BIT_ULL(63) +#define CMN_INFO_RSP_VC_NUM GENMASK_ULL(53, 52) +#define CMN_INFO_DAT_VC_NUM GENMASK_ULL(51, 50) + +#define CMN_CFGM_INFO_GLOBAL_1 0x908 +#define CMN_INFO_SNP_VC_NUM GENMASK_ULL(3, 2) +#define CMN_INFO_REQ_VC_NUM GENMASK_ULL(1, 0) + +/* XPs also have some local topology info which has uses too */ +#define CMN_MXP__CONNECT_INFO_P0 0x0008 +#define CMN_MXP__CONNECT_INFO_P1 0x0010 +#define CMN_MXP__CONNECT_INFO_P2 0x0028 +#define CMN_MXP__CONNECT_INFO_P3 0x0030 +#define CMN_MXP__CONNECT_INFO_P4 0x0038 +#define CMN_MXP__CONNECT_INFO_P5 0x0040 +#define CMN__CONNECT_INFO_DEVICE_TYPE GENMASK_ULL(4, 0) + +/* PMU registers occupy the 3rd 4KB page of each node's region */ #define CMN_PMU_OFFSET 0x2000 /* For most nodes, this is all there is */ #define CMN_PMU_EVENT_SEL 0x000 -#define CMN_PMU_EVENTn_ID_SHIFT(n) ((n) * 8) +#define CMN__PMU_CBUSY_SNTHROTTLE_SEL GENMASK_ULL(44, 42) +#define CMN__PMU_CLASS_OCCUP_ID GENMASK_ULL(36, 35) +/* Technically this is 4 bits wide on DNs, but we only use 2 there anyway */ +#define CMN__PMU_OCCUP1_ID GENMASK_ULL(34, 32) + +/* HN-Ps are weird... 
*/ +#define CMN_HNP_PMU_EVENT_SEL 0x008 /* DTMs live in the PMU space of XP registers */ #define CMN_DTM_WPn(n) (0x1A0 + (n) * 0x18) #define CMN_DTM_WPn_CONFIG(n) (CMN_DTM_WPn(n) + 0x00) -#define CMN_DTM_WPn_CONFIG_WP_COMBINE BIT(6) -#define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE BIT(5) -#define CMN_DTM_WPn_CONFIG_WP_GRP BIT(4) +#define CMN_DTM_WPn_CONFIG_WP_CHN_NUM GENMASK_ULL(20, 19) +#define CMN_DTM_WPn_CONFIG_WP_DEV_SEL2 GENMASK_ULL(18, 17) +#define CMN_DTM_WPn_CONFIG_WP_COMBINE BIT(9) +#define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE BIT(8) +#define CMN600_WPn_CONFIG_WP_COMBINE BIT(6) +#define CMN600_WPn_CONFIG_WP_EXCLUSIVE BIT(5) +#define CMN_DTM_WPn_CONFIG_WP_GRP GENMASK_ULL(5, 4) #define CMN_DTM_WPn_CONFIG_WP_CHN_SEL GENMASK_ULL(3, 1) #define CMN_DTM_WPn_CONFIG_WP_DEV_SEL BIT(0) #define CMN_DTM_WPn_VAL(n) (CMN_DTM_WPn(n) + 0x08) @@ -81,7 +109,11 @@ #define CMN_DTM_PMEVCNTSR 0x240 +#define CMN_DTM_UNIT_INFO 0x0910 + #define CMN_DTM_NUM_COUNTERS 4 +/* Want more local counters? Why not replicate the whole DTM! Ugh... */ +#define CMN_DTM_OFFSET(n) ((n) * 0x200) /* The DTC node is where the magic happens */ #define CMN_DT_DTC_CTL 0x0a00 @@ -122,11 +154,11 @@ /* Event attributes */ -#define CMN_CONFIG_TYPE GENMASK(15, 0) -#define CMN_CONFIG_EVENTID GENMASK(23, 16) -#define CMN_CONFIG_OCCUPID GENMASK(27, 24) -#define CMN_CONFIG_BYNODEID BIT(31) -#define CMN_CONFIG_NODEID GENMASK(47, 32) +#define CMN_CONFIG_TYPE GENMASK_ULL(15, 0) +#define CMN_CONFIG_EVENTID GENMASK_ULL(26, 16) +#define CMN_CONFIG_OCCUPID GENMASK_ULL(30, 27) +#define CMN_CONFIG_BYNODEID BIT_ULL(31) +#define CMN_CONFIG_NODEID GENMASK_ULL(47, 32) #define CMN_EVENT_TYPE(event) FIELD_GET(CMN_CONFIG_TYPE, (event)->attr.config) #define CMN_EVENT_EVENTID(event) FIELD_GET(CMN_CONFIG_EVENTID, (event)->attr.config) @@ -134,13 +166,14 @@ #define CMN_EVENT_BYNODEID(event) FIELD_GET(CMN_CONFIG_BYNODEID, (event)->attr.config) #define CMN_EVENT_NODEID(event) FIELD_GET(CMN_CONFIG_NODEID, (event)->attr.config) -#define CMN_CONFIG_WP_COMBINE GENMASK(27, 24) -#define CMN_CONFIG_WP_DEV_SEL BIT(48) -#define CMN_CONFIG_WP_CHN_SEL GENMASK(50, 49) -#define CMN_CONFIG_WP_GRP BIT(52) -#define CMN_CONFIG_WP_EXCLUSIVE BIT(53) -#define CMN_CONFIG1_WP_VAL GENMASK(63, 0) -#define CMN_CONFIG2_WP_MASK GENMASK(63, 0) +#define CMN_CONFIG_WP_COMBINE GENMASK_ULL(27, 24) +#define CMN_CONFIG_WP_DEV_SEL GENMASK_ULL(50, 48) +#define CMN_CONFIG_WP_CHN_SEL GENMASK_ULL(55, 51) +/* Note that we don't yet support the tertiary match group on newer IPs */ +#define CMN_CONFIG_WP_GRP BIT_ULL(56) +#define CMN_CONFIG_WP_EXCLUSIVE BIT_ULL(57) +#define CMN_CONFIG1_WP_VAL GENMASK_ULL(63, 0) +#define CMN_CONFIG2_WP_MASK GENMASK_ULL(63, 0) #define CMN_EVENT_WP_COMBINE(event) FIELD_GET(CMN_CONFIG_WP_COMBINE, (event)->attr.config) #define CMN_EVENT_WP_DEV_SEL(event) FIELD_GET(CMN_CONFIG_WP_DEV_SEL, (event)->attr.config) @@ -155,7 +188,18 @@ #define CMN_WP_DOWN 2 -/* r0px probably don't exist in silicon, thankfully */ +enum cmn_model { + CMN600 = 1, + CMN650 = 2, + CMN700 = 4, + CI700 = 8, + /* ...and then we can use bitmap tricks for commonality */ + CMN_ANY = -1, + NOT_CMN600 = -2, + CMN_650ON = CMN650 | CMN700, +}; + +/* CMN-600 r0px shouldn't exist in silicon, thankfully */ enum cmn_revision { CMN600_R1P0, CMN600_R1P1, @@ -163,6 +207,18 @@ enum cmn_revision { CMN600_R1P3, CMN600_R2P0, CMN600_R3P0, + CMN600_R3P1, + CMN650_R0P0 = 0, + CMN650_R1P0, + CMN650_R1P1, + CMN650_R2P0, + CMN650_R1P2, + CMN700_R0P0 = 0, + CMN700_R1P0, + CMN700_R2P0, + CI700_R0P0 = 0, + CI700_R1P0, + CI700_R2P0, }; enum 
cmn_node_type { @@ -174,45 +230,63 @@ enum cmn_node_type { CMN_TYPE_HNF, CMN_TYPE_XP, CMN_TYPE_SBSX, - CMN_TYPE_RNI = 0xa, + CMN_TYPE_MPAM_S, + CMN_TYPE_MPAM_NS, + CMN_TYPE_RNI, CMN_TYPE_RND = 0xd, CMN_TYPE_RNSAM = 0xf, + CMN_TYPE_MTSX, + CMN_TYPE_HNP, CMN_TYPE_CXRA = 0x100, - CMN_TYPE_CXHA = 0x101, - CMN_TYPE_CXLA = 0x102, + CMN_TYPE_CXHA, + CMN_TYPE_CXLA, + CMN_TYPE_CCRA, + CMN_TYPE_CCHA, + CMN_TYPE_CCLA, + CMN_TYPE_CCLA_RNI, /* Not a real node type */ CMN_TYPE_WP = 0x7770 }; +enum cmn_filter_select { + SEL_NONE = -1, + SEL_OCCUP1ID, + SEL_CLASS_OCCUP_ID, + SEL_CBUSY_SNTHROTTLE_SEL, + SEL_MAX +}; + struct arm_cmn_node { void __iomem *pmu_base; u16 id, logid; enum cmn_node_type type; + int dtm; union { - /* Device node */ + /* DN/HN-F/CXHA */ struct { - int to_xp; - /* DN/HN-F/CXHA */ - unsigned int occupid_val; - unsigned int occupid_count; - }; + u8 val : 4; + u8 count : 4; + } occupid[SEL_MAX]; /* XP */ - struct { - int dtc; - u32 pmu_config_low; - union { - u8 input_sel[4]; - __le32 pmu_config_high; - }; - s8 wp_event[4]; - }; + u8 dtc; }; - union { u8 event[4]; __le32 event_sel; + u16 event_w[4]; + __le64 event_sel_w; + }; +}; + +struct arm_cmn_dtm { + void __iomem *base; + u32 pmu_config_low; + union { + u8 input_sel[4]; + __le32 pmu_config_high; }; + s8 wp_event[4]; }; struct arm_cmn_dtc { @@ -231,35 +305,246 @@ struct arm_cmn_dtc { struct arm_cmn { struct device *dev; void __iomem *base; + unsigned int state; enum cmn_revision rev; + enum cmn_model model; u8 mesh_x; u8 mesh_y; u16 num_xps; u16 num_dns; + bool multi_dtm; + u8 ports_used; + struct { + unsigned int rsp_vc_num : 2; + unsigned int dat_vc_num : 2; + unsigned int snp_vc_num : 2; + unsigned int req_vc_num : 2; + }; + struct arm_cmn_node *xps; struct arm_cmn_node *dns; + struct arm_cmn_dtm *dtms; struct arm_cmn_dtc *dtc; unsigned int num_dtcs; int cpu; struct hlist_node cpuhp_node; - unsigned int state; struct pmu pmu; + struct dentry *debug; }; #define to_cmn(p) container_of(p, struct arm_cmn, pmu) static int arm_cmn_hp_state; +struct arm_cmn_nodeid { + u8 x; + u8 y; + u8 port; + u8 dev; +}; + +static int arm_cmn_xyidbits(const struct arm_cmn *cmn) +{ + return fls((cmn->mesh_x - 1) | (cmn->mesh_y - 1) | 2); +} + +static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id) +{ + struct arm_cmn_nodeid nid; + + if (cmn->num_xps == 1) { + nid.x = 0; + nid.y = 0; + nid.port = CMN_NODEID_1x1_PID(id); + nid.dev = CMN_NODEID_DEVID(id); + } else { + int bits = arm_cmn_xyidbits(cmn); + + nid.x = CMN_NODEID_X(id, bits); + nid.y = CMN_NODEID_Y(id, bits); + if (cmn->ports_used & 0xc) { + nid.port = CMN_NODEID_EXT_PID(id); + nid.dev = CMN_NODEID_EXT_DEVID(id); + } else { + nid.port = CMN_NODEID_PID(id); + nid.dev = CMN_NODEID_DEVID(id); + } + } + return nid; +} + +static struct arm_cmn_node *arm_cmn_node_to_xp(const struct arm_cmn *cmn, + const struct arm_cmn_node *dn) +{ + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + int xp_idx = cmn->mesh_x * nid.y + nid.x; + + return cmn->xps + xp_idx; +} +static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, + enum cmn_node_type type) +{ + struct arm_cmn_node *dn; + + for (dn = cmn->dns; dn->type; dn++) + if (dn->type == type) + return dn; + return NULL; +} + +static struct dentry *arm_cmn_debugfs; + +#ifdef CONFIG_DEBUG_FS +static const char *arm_cmn_device_type(u8 type) +{ + switch(FIELD_GET(CMN__CONNECT_INFO_DEVICE_TYPE, type)) { + case 0x00: return " |"; + case 0x01: return " RN-I |"; + case 0x02: return " RN-D |"; + case 0x04: return " RN-F_B |"; + case 
0x05: return "RN-F_B_E|"; + case 0x06: return " RN-F_A |"; + case 0x07: return "RN-F_A_E|"; + case 0x08: return " HN-T |"; + case 0x09: return " HN-I |"; + case 0x0a: return " HN-D |"; + case 0x0b: return " HN-P |"; + case 0x0c: return " SN-F |"; + case 0x0d: return " SBSX |"; + case 0x0e: return " HN-F |"; + case 0x0f: return " SN-F_E |"; + case 0x10: return " SN-F_D |"; + case 0x11: return " CXHA |"; + case 0x12: return " CXRA |"; + case 0x13: return " CXRH |"; + case 0x14: return " RN-F_D |"; + case 0x15: return "RN-F_D_E|"; + case 0x16: return " RN-F_C |"; + case 0x17: return "RN-F_C_E|"; + case 0x18: return " RN-F_E |"; + case 0x19: return "RN-F_E_E|"; + case 0x1c: return " MTSX |"; + case 0x1d: return " HN-V |"; + case 0x1e: return " CCG |"; + default: return " ???? |"; + } +} + +static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d) +{ + struct arm_cmn *cmn = s->private; + struct arm_cmn_node *dn; + + for (dn = cmn->dns; dn->type; dn++) { + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + + if (dn->type == CMN_TYPE_XP) + continue; + /* Ignore the extra components that will overlap on some ports */ + if (dn->type < CMN_TYPE_HNI) + continue; + + if (nid.x != x || nid.y != y || nid.port != p || nid.dev != d) + continue; + + seq_printf(s, " #%-2d |", dn->logid); + return; + } + seq_puts(s, " |"); +} + +static int arm_cmn_map_show(struct seq_file *s, void *data) +{ + struct arm_cmn *cmn = s->private; + int x, y, p, pmax = fls(cmn->ports_used); + + seq_puts(s, " X"); + for (x = 0; x < cmn->mesh_x; x++) + seq_printf(s, " %d ", x); + seq_puts(s, "\nY P D+"); + y = cmn->mesh_y; + while (y--) { + int xp_base = cmn->mesh_x * y; + u8 port[6][CMN_MAX_DIMENSION]; + + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "--------+"); + + seq_printf(s, "\n%d |", y); + for (x = 0; x < cmn->mesh_x; x++) { + struct arm_cmn_node *xp = cmn->xps + xp_base + x; + void __iomem *base = xp->pmu_base - CMN_PMU_OFFSET; + + port[0][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P0); + port[1][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P1); + port[2][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P2); + port[3][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P3); + port[4][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P4); + port[5][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P5); + seq_printf(s, " XP #%-2d |", xp_base + x); + } + + seq_puts(s, "\n |"); + for (x = 0; x < cmn->mesh_x; x++) { + u8 dtc = cmn->xps[xp_base + x].dtc; + + if (dtc & (dtc - 1)) + seq_puts(s, " DTC ?? 
|"); + else + seq_printf(s, " DTC %ld |", __ffs(dtc)); + } + seq_puts(s, "\n |"); + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "........|"); + + for (p = 0; p < pmax; p++) { + seq_printf(s, "\n %d |", p); + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, arm_cmn_device_type(port[p][x])); + seq_puts(s, "\n 0|"); + for (x = 0; x < cmn->mesh_x; x++) + arm_cmn_show_logid(s, x, y, p, 0); + seq_puts(s, "\n 1|"); + for (x = 0; x < cmn->mesh_x; x++) + arm_cmn_show_logid(s, x, y, p, 1); + } + seq_puts(s, "\n-----+"); + } + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "--------+"); + seq_puts(s, "\n"); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(arm_cmn_map); + +static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) +{ + const char *name = "map"; + + if (id > 0) + name = devm_kasprintf(cmn->dev, GFP_KERNEL, "map_%d", id); + if (!name) + return; + + cmn->debug = debugfs_create_file(name, 0444, arm_cmn_debugfs, cmn, &arm_cmn_map_fops); +} +#else +static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} +#endif + struct arm_cmn_hw_event { struct arm_cmn_node *dn; - u64 dtm_idx[2]; + u64 dtm_idx[4]; unsigned int dtc_idx; u8 dtcs_used; u8 num_dns; + u8 dtm_offset; + bool wide_sel; + enum cmn_filter_select filter_sel; }; #define for_each_hw_dn(hw, dn, i) \ @@ -283,8 +568,10 @@ static unsigned int arm_cmn_get_index(u64 x[], unsigned int pos) struct arm_cmn_event_attr { struct device_attribute attr; + enum cmn_model model; enum cmn_node_type type; - u8 eventid; + enum cmn_filter_select fsel; + u16 eventid; u8 occupid; }; @@ -294,51 +581,17 @@ struct arm_cmn_format_attr { int config; }; -static int arm_cmn_xyidbits(const struct arm_cmn *cmn) -{ - return cmn->mesh_x > 4 || cmn->mesh_y > 4 ? 3 : 2; -} - -static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn, - struct arm_cmn_node *dn) -{ - int bits = arm_cmn_xyidbits(cmn); - int x = CMN_NODEID_X(dn->id, bits); - int y = CMN_NODEID_Y(dn->id, bits); - int xp_idx = cmn->mesh_x * y + x; - - dn->to_xp = (cmn->xps + xp_idx) - dn; -} - -static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn) -{ - return dn->type == CMN_TYPE_XP ? 
dn : dn + dn->to_xp; -} - -static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, - enum cmn_node_type type) -{ - int i; - - for (i = 0; i < cmn->num_dns; i++) - if (cmn->dns[i].type == type) - return &cmn->dns[i]; - return NULL; -} - -#define CMN_EVENT_ATTR(_name, _type, _eventid, _occupid) \ +#define _CMN_EVENT_ATTR(_model, _name, _type, _eventid, _occupid, _fsel)\ (&((struct arm_cmn_event_attr[]) {{ \ .attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL), \ + .model = _model, \ .type = _type, \ .eventid = _eventid, \ .occupid = _occupid, \ + .fsel = _fsel, \ }})[0].attr.attr) - -static bool arm_cmn_is_occup_event(enum cmn_node_type type, unsigned int id) -{ - return (type == CMN_TYPE_DVM && id == 0x05) || - (type == CMN_TYPE_HNF && id == 0x0f); -} +#define CMN_EVENT_ATTR(_model, _name, _type, _eventid) \ + _CMN_EVENT_ATTR(_model, _name, _type, _eventid, 0, SEL_NONE) static ssize_t arm_cmn_event_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -348,19 +601,19 @@ static ssize_t arm_cmn_event_show(struct device *dev, eattr = container_of(attr, typeof(*eattr), attr); if (eattr->type == CMN_TYPE_DTC) - return snprintf(buf, PAGE_SIZE, "type=0x%x\n", eattr->type); + return sysfs_emit(buf, "type=0x%x\n", eattr->type); if (eattr->type == CMN_TYPE_WP) - return snprintf(buf, PAGE_SIZE, - "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n", - eattr->type, eattr->eventid); + return sysfs_emit(buf, + "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n", + eattr->type, eattr->eventid); - if (arm_cmn_is_occup_event(eattr->type, eattr->eventid)) - return snprintf(buf, PAGE_SIZE, "type=0x%x,eventid=0x%x,occupid=0x%x\n", - eattr->type, eattr->eventid, eattr->occupid); + if (eattr->fsel > SEL_NONE) + return sysfs_emit(buf, "type=0x%x,eventid=0x%x,occupid=0x%x\n", + eattr->type, eattr->eventid, eattr->occupid); - return snprintf(buf, PAGE_SIZE, "type=0x%x,eventid=0x%x\n", - eattr->type, eattr->eventid); + return sysfs_emit(buf, "type=0x%x,eventid=0x%x\n", eattr->type, + eattr->eventid); } static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, @@ -371,59 +624,154 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, struct arm_cmn *cmn = to_cmn(dev_get_drvdata(dev)); struct arm_cmn_event_attr *eattr; enum cmn_node_type type; + u16 eventid; eattr = container_of(attr, typeof(*eattr), attr.attr); + + if (!(eattr->model & cmn->model)) + return 0; + type = eattr->type; + eventid = eattr->eventid; - /* Watchpoints aren't nodes */ + /* Watchpoints aren't nodes, so avoid confusion */ if (type == CMN_TYPE_WP) - type = CMN_TYPE_XP; + return attr->mode; - /* Revision-specific differences */ - if (cmn->rev < CMN600_R1P2) { - if (type == CMN_TYPE_HNF && eattr->eventid == 0x1b) + /* Hide XP events for unused interfaces/channels */ + if (type == CMN_TYPE_XP) { + unsigned int intf = (eventid >> 2) & 7; + unsigned int chan = eventid >> 5; + + if ((intf & 4) && !(cmn->ports_used & BIT(intf & 3))) + return 0; + + if (chan == 4 && cmn->model == CMN600) + return 0; + + if ((chan == 5 && cmn->rsp_vc_num < 2) || + (chan == 6 && cmn->dat_vc_num < 2) || + (chan == 7 && cmn->snp_vc_num < 2) || + (chan == 8 && cmn->req_vc_num < 2)) return 0; } + /* Revision-specific differences */ + if (cmn->model == CMN600) { + if (cmn->rev < CMN600_R1P3) { + if (type == CMN_TYPE_CXRA && eventid > 0x10) + return 0; + } + if (cmn->rev < CMN600_R1P2) { + if (type == CMN_TYPE_HNF && eventid == 0x1b) + return 0; + if (type == 
CMN_TYPE_CXRA || type == CMN_TYPE_CXHA) + return 0; + } + } else if (cmn->model == CMN650) { + if (cmn->rev < CMN650_R2P0 || cmn->rev == CMN650_R1P2) { + if (type == CMN_TYPE_HNF && eventid > 0x22) + return 0; + if (type == CMN_TYPE_SBSX && eventid == 0x17) + return 0; + if (type == CMN_TYPE_RNI && eventid > 0x10) + return 0; + } + } else if (cmn->model == CMN700) { + if (cmn->rev < CMN700_R2P0) { + if (type == CMN_TYPE_HNF && eventid > 0x2c) + return 0; + if (type == CMN_TYPE_CCHA && eventid > 0x74) + return 0; + if (type == CMN_TYPE_CCLA && eventid > 0x27) + return 0; + } + if (cmn->rev < CMN700_R1P0) { + if (type == CMN_TYPE_HNF && eventid > 0x2b) + return 0; + } + } + if (!arm_cmn_node(cmn, type)) return 0; return attr->mode; } -#define _CMN_EVENT_DVM(_name, _event, _occup) \ - CMN_EVENT_ATTR(dn_##_name, CMN_TYPE_DVM, _event, _occup) +#define _CMN_EVENT_DVM(_model, _name, _event, _occup, _fsel) \ + _CMN_EVENT_ATTR(_model, dn_##_name, CMN_TYPE_DVM, _event, _occup, _fsel) #define CMN_EVENT_DTC(_name) \ - CMN_EVENT_ATTR(dtc_##_name, CMN_TYPE_DTC, 0, 0) -#define _CMN_EVENT_HNF(_name, _event, _occup) \ - CMN_EVENT_ATTR(hnf_##_name, CMN_TYPE_HNF, _event, _occup) + CMN_EVENT_ATTR(CMN_ANY, dtc_##_name, CMN_TYPE_DTC, 0) +#define _CMN_EVENT_HNF(_model, _name, _event, _occup, _fsel) \ + _CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup, _fsel) #define CMN_EVENT_HNI(_name, _event) \ - CMN_EVENT_ATTR(hni_##_name, CMN_TYPE_HNI, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event) +#define CMN_EVENT_HNP(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, hnp_##_name, CMN_TYPE_HNP, _event) #define __CMN_EVENT_XP(_name, _event) \ - CMN_EVENT_ATTR(mxp_##_name, CMN_TYPE_XP, _event, 0) -#define CMN_EVENT_SBSX(_name, _event) \ - CMN_EVENT_ATTR(sbsx_##_name, CMN_TYPE_SBSX, _event, 0) -#define CMN_EVENT_RNID(_name, _event) \ - CMN_EVENT_ATTR(rnid_##_name, CMN_TYPE_RNI, _event, 0) - -#define CMN_EVENT_DVM(_name, _event) \ - _CMN_EVENT_DVM(_name, _event, 0) -#define CMN_EVENT_HNF(_name, _event) \ - _CMN_EVENT_HNF(_name, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, mxp_##_name, CMN_TYPE_XP, _event) +#define CMN_EVENT_SBSX(_model, _name, _event) \ + CMN_EVENT_ATTR(_model, sbsx_##_name, CMN_TYPE_SBSX, _event) +#define CMN_EVENT_RNID(_model, _name, _event) \ + CMN_EVENT_ATTR(_model, rnid_##_name, CMN_TYPE_RNI, _event) +#define CMN_EVENT_MTSX(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, mtsx_##_name, CMN_TYPE_MTSX, _event) +#define CMN_EVENT_CXRA(_model, _name, _event) \ + CMN_EVENT_ATTR(_model, cxra_##_name, CMN_TYPE_CXRA, _event) +#define CMN_EVENT_CXHA(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, cxha_##_name, CMN_TYPE_CXHA, _event) +#define CMN_EVENT_CCRA(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, ccra_##_name, CMN_TYPE_CCRA, _event) +#define CMN_EVENT_CCHA(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, ccha_##_name, CMN_TYPE_CCHA, _event) +#define CMN_EVENT_CCLA(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, ccla_##_name, CMN_TYPE_CCLA, _event) +#define CMN_EVENT_CCLA_RNI(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, ccla_rni_##_name, CMN_TYPE_CCLA_RNI, _event) + +#define CMN_EVENT_DVM(_model, _name, _event) \ + _CMN_EVENT_DVM(_model, _name, _event, 0, SEL_NONE) +#define CMN_EVENT_DVM_OCC(_model, _name, _event) \ + _CMN_EVENT_DVM(_model, _name##_all, _event, 0, SEL_OCCUP1ID), \ + _CMN_EVENT_DVM(_model, _name##_dvmop, _event, 1, SEL_OCCUP1ID), \ + _CMN_EVENT_DVM(_model, _name##_dvmsync, _event, 2, SEL_OCCUP1ID) +#define CMN_EVENT_HNF(_model, _name, _event) \ + _CMN_EVENT_HNF(_model, _name, _event, 0, 
SEL_NONE) +#define CMN_EVENT_HNF_CLS(_model, _name, _event) \ + _CMN_EVENT_HNF(_model, _name##_class0, _event, 0, SEL_CLASS_OCCUP_ID), \ + _CMN_EVENT_HNF(_model, _name##_class1, _event, 1, SEL_CLASS_OCCUP_ID), \ + _CMN_EVENT_HNF(_model, _name##_class2, _event, 2, SEL_CLASS_OCCUP_ID), \ + _CMN_EVENT_HNF(_model, _name##_class3, _event, 3, SEL_CLASS_OCCUP_ID) +#define CMN_EVENT_HNF_SNT(_model, _name, _event) \ + _CMN_EVENT_HNF(_model, _name##_all, _event, 0, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_group0_read, _event, 1, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_group0_write, _event, 2, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_group1_read, _event, 3, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_group1_write, _event, 4, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_read, _event, 5, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_write, _event, 6, SEL_CBUSY_SNTHROTTLE_SEL) + #define _CMN_EVENT_XP(_name, _event) \ __CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)), \ __CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)), \ __CMN_EVENT_XP(n_##_name, (_event) | (2 << 2)), \ __CMN_EVENT_XP(s_##_name, (_event) | (3 << 2)), \ __CMN_EVENT_XP(p0_##_name, (_event) | (4 << 2)), \ - __CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)) + __CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)), \ + __CMN_EVENT_XP(p2_##_name, (_event) | (6 << 2)), \ + __CMN_EVENT_XP(p3_##_name, (_event) | (7 << 2)) /* Good thing there are only 3 fundamental XP events... */ #define CMN_EVENT_XP(_name, _event) \ _CMN_EVENT_XP(req_##_name, (_event) | (0 << 5)), \ _CMN_EVENT_XP(rsp_##_name, (_event) | (1 << 5)), \ _CMN_EVENT_XP(snp_##_name, (_event) | (2 << 5)), \ - _CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)) + _CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)), \ + _CMN_EVENT_XP(pub_##_name, (_event) | (4 << 5)), \ + _CMN_EVENT_XP(rsp2_##_name, (_event) | (5 << 5)), \ + _CMN_EVENT_XP(dat2_##_name, (_event) | (6 << 5)), \ + _CMN_EVENT_XP(snp2_##_name, (_event) | (7 << 5)), \ + _CMN_EVENT_XP(req2_##_name, (_event) | (8 << 5)) static struct attribute *arm_cmn_event_attrs[] = { @@ -434,115 +782,315 @@ static struct attribute *arm_cmn_event_attrs[] = { * slot, but our lazy short-cut of using the DTM counter index for * the PMU index as well happens to avoid that by construction. 
*/ - CMN_EVENT_DVM(rxreq_dvmop, 0x01), - CMN_EVENT_DVM(rxreq_dvmsync, 0x02), - CMN_EVENT_DVM(rxreq_dvmop_vmid_filtered, 0x03), - CMN_EVENT_DVM(rxreq_retried, 0x04), - _CMN_EVENT_DVM(rxreq_trk_occupancy_all, 0x05, 0), - _CMN_EVENT_DVM(rxreq_trk_occupancy_dvmop, 0x05, 1), - _CMN_EVENT_DVM(rxreq_trk_occupancy_dvmsync, 0x05, 2), - - CMN_EVENT_HNF(cache_miss, 0x01), - CMN_EVENT_HNF(slc_sf_cache_access, 0x02), - CMN_EVENT_HNF(cache_fill, 0x03), - CMN_EVENT_HNF(pocq_retry, 0x04), - CMN_EVENT_HNF(pocq_reqs_recvd, 0x05), - CMN_EVENT_HNF(sf_hit, 0x06), - CMN_EVENT_HNF(sf_evictions, 0x07), - CMN_EVENT_HNF(dir_snoops_sent, 0x08), - CMN_EVENT_HNF(brd_snoops_sent, 0x09), - CMN_EVENT_HNF(slc_eviction, 0x0a), - CMN_EVENT_HNF(slc_fill_invalid_way, 0x0b), - CMN_EVENT_HNF(mc_retries, 0x0c), - CMN_EVENT_HNF(mc_reqs, 0x0d), - CMN_EVENT_HNF(qos_hh_retry, 0x0e), - _CMN_EVENT_HNF(qos_pocq_occupancy_all, 0x0f, 0), - _CMN_EVENT_HNF(qos_pocq_occupancy_read, 0x0f, 1), - _CMN_EVENT_HNF(qos_pocq_occupancy_write, 0x0f, 2), - _CMN_EVENT_HNF(qos_pocq_occupancy_atomic, 0x0f, 3), - _CMN_EVENT_HNF(qos_pocq_occupancy_stash, 0x0f, 4), - CMN_EVENT_HNF(pocq_addrhaz, 0x10), - CMN_EVENT_HNF(pocq_atomic_addrhaz, 0x11), - CMN_EVENT_HNF(ld_st_swp_adq_full, 0x12), - CMN_EVENT_HNF(cmp_adq_full, 0x13), - CMN_EVENT_HNF(txdat_stall, 0x14), - CMN_EVENT_HNF(txrsp_stall, 0x15), - CMN_EVENT_HNF(seq_full, 0x16), - CMN_EVENT_HNF(seq_hit, 0x17), - CMN_EVENT_HNF(snp_sent, 0x18), - CMN_EVENT_HNF(sfbi_dir_snp_sent, 0x19), - CMN_EVENT_HNF(sfbi_brd_snp_sent, 0x1a), - CMN_EVENT_HNF(snp_sent_untrk, 0x1b), - CMN_EVENT_HNF(intv_dirty, 0x1c), - CMN_EVENT_HNF(stash_snp_sent, 0x1d), - CMN_EVENT_HNF(stash_data_pull, 0x1e), - CMN_EVENT_HNF(snp_fwded, 0x1f), - - CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), - CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), - CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22), - CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23), - CMN_EVENT_HNI(wdb_occ_cnt_ovfl, 0x24), - CMN_EVENT_HNI(rrt_rd_alloc, 0x25), - CMN_EVENT_HNI(rrt_wr_alloc, 0x26), - CMN_EVENT_HNI(rdt_rd_alloc, 0x27), - CMN_EVENT_HNI(rdt_wr_alloc, 0x28), - CMN_EVENT_HNI(wdb_alloc, 0x29), - CMN_EVENT_HNI(txrsp_retryack, 0x2a), - CMN_EVENT_HNI(arvalid_no_arready, 0x2b), - CMN_EVENT_HNI(arready_no_arvalid, 0x2c), - CMN_EVENT_HNI(awvalid_no_awready, 0x2d), - CMN_EVENT_HNI(awready_no_awvalid, 0x2e), - CMN_EVENT_HNI(wvalid_no_wready, 0x2f), - CMN_EVENT_HNI(txdat_stall, 0x30), - CMN_EVENT_HNI(nonpcie_serialization, 0x31), - CMN_EVENT_HNI(pcie_serialization, 0x32), - - CMN_EVENT_XP(txflit_valid, 0x01), - CMN_EVENT_XP(txflit_stall, 0x02), - CMN_EVENT_XP(partial_dat_flit, 0x03), + CMN_EVENT_DVM(CMN600, rxreq_dvmop, 0x01), + CMN_EVENT_DVM(CMN600, rxreq_dvmsync, 0x02), + CMN_EVENT_DVM(CMN600, rxreq_dvmop_vmid_filtered, 0x03), + CMN_EVENT_DVM(CMN600, rxreq_retried, 0x04), + CMN_EVENT_DVM_OCC(CMN600, rxreq_trk_occupancy, 0x05), + CMN_EVENT_DVM(NOT_CMN600, dvmop_tlbi, 0x01), + CMN_EVENT_DVM(NOT_CMN600, dvmop_bpi, 0x02), + CMN_EVENT_DVM(NOT_CMN600, dvmop_pici, 0x03), + CMN_EVENT_DVM(NOT_CMN600, dvmop_vici, 0x04), + CMN_EVENT_DVM(NOT_CMN600, dvmsync, 0x05), + CMN_EVENT_DVM(NOT_CMN600, vmid_filtered, 0x06), + CMN_EVENT_DVM(NOT_CMN600, rndop_filtered, 0x07), + CMN_EVENT_DVM(NOT_CMN600, retry, 0x08), + CMN_EVENT_DVM(NOT_CMN600, txsnp_flitv, 0x09), + CMN_EVENT_DVM(NOT_CMN600, txsnp_stall, 0x0a), + CMN_EVENT_DVM(NOT_CMN600, trkfull, 0x0b), + CMN_EVENT_DVM_OCC(NOT_CMN600, trk_occupancy, 0x0c), + CMN_EVENT_DVM_OCC(CMN700, trk_occupancy_cxha, 0x0d), + CMN_EVENT_DVM_OCC(CMN700, trk_occupancy_pdn, 0x0e), + 
CMN_EVENT_DVM(CMN700, trk_alloc, 0x0f), + CMN_EVENT_DVM(CMN700, trk_cxha_alloc, 0x10), + CMN_EVENT_DVM(CMN700, trk_pdn_alloc, 0x11), + CMN_EVENT_DVM(CMN700, txsnp_stall_limit, 0x12), + CMN_EVENT_DVM(CMN700, rxsnp_stall_starv, 0x13), + CMN_EVENT_DVM(CMN700, txsnp_sync_stall_op, 0x14), + + CMN_EVENT_HNF(CMN_ANY, cache_miss, 0x01), + CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access, 0x02), + CMN_EVENT_HNF(CMN_ANY, cache_fill, 0x03), + CMN_EVENT_HNF(CMN_ANY, pocq_retry, 0x04), + CMN_EVENT_HNF(CMN_ANY, pocq_reqs_recvd, 0x05), + CMN_EVENT_HNF(CMN_ANY, sf_hit, 0x06), + CMN_EVENT_HNF(CMN_ANY, sf_evictions, 0x07), + CMN_EVENT_HNF(CMN_ANY, dir_snoops_sent, 0x08), + CMN_EVENT_HNF(CMN_ANY, brd_snoops_sent, 0x09), + CMN_EVENT_HNF(CMN_ANY, slc_eviction, 0x0a), + CMN_EVENT_HNF(CMN_ANY, slc_fill_invalid_way, 0x0b), + CMN_EVENT_HNF(CMN_ANY, mc_retries, 0x0c), + CMN_EVENT_HNF(CMN_ANY, mc_reqs, 0x0d), + CMN_EVENT_HNF(CMN_ANY, qos_hh_retry, 0x0e), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_all, 0x0f, 0, SEL_OCCUP1ID), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_read, 0x0f, 1, SEL_OCCUP1ID), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_write, 0x0f, 2, SEL_OCCUP1ID), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_atomic, 0x0f, 3, SEL_OCCUP1ID), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_stash, 0x0f, 4, SEL_OCCUP1ID), + CMN_EVENT_HNF(CMN_ANY, pocq_addrhaz, 0x10), + CMN_EVENT_HNF(CMN_ANY, pocq_atomic_addrhaz, 0x11), + CMN_EVENT_HNF(CMN_ANY, ld_st_swp_adq_full, 0x12), + CMN_EVENT_HNF(CMN_ANY, cmp_adq_full, 0x13), + CMN_EVENT_HNF(CMN_ANY, txdat_stall, 0x14), + CMN_EVENT_HNF(CMN_ANY, txrsp_stall, 0x15), + CMN_EVENT_HNF(CMN_ANY, seq_full, 0x16), + CMN_EVENT_HNF(CMN_ANY, seq_hit, 0x17), + CMN_EVENT_HNF(CMN_ANY, snp_sent, 0x18), + CMN_EVENT_HNF(CMN_ANY, sfbi_dir_snp_sent, 0x19), + CMN_EVENT_HNF(CMN_ANY, sfbi_brd_snp_sent, 0x1a), + CMN_EVENT_HNF(CMN_ANY, snp_sent_untrk, 0x1b), + CMN_EVENT_HNF(CMN_ANY, intv_dirty, 0x1c), + CMN_EVENT_HNF(CMN_ANY, stash_snp_sent, 0x1d), + CMN_EVENT_HNF(CMN_ANY, stash_data_pull, 0x1e), + CMN_EVENT_HNF(CMN_ANY, snp_fwded, 0x1f), + CMN_EVENT_HNF(NOT_CMN600, atomic_fwd, 0x20), + CMN_EVENT_HNF(NOT_CMN600, mpam_hardlim, 0x21), + CMN_EVENT_HNF(NOT_CMN600, mpam_softlim, 0x22), + CMN_EVENT_HNF(CMN_650ON, snp_sent_cluster, 0x23), + CMN_EVENT_HNF(CMN_650ON, sf_imprecise_evict, 0x24), + CMN_EVENT_HNF(CMN_650ON, sf_evict_shared_line, 0x25), + CMN_EVENT_HNF_CLS(CMN700, pocq_class_occup, 0x26), + CMN_EVENT_HNF_CLS(CMN700, pocq_class_retry, 0x27), + CMN_EVENT_HNF_CLS(CMN700, class_mc_reqs, 0x28), + CMN_EVENT_HNF_CLS(CMN700, class_cgnt_cmin, 0x29), + CMN_EVENT_HNF_SNT(CMN700, sn_throttle, 0x2a), + CMN_EVENT_HNF_SNT(CMN700, sn_throttle_min, 0x2b), + CMN_EVENT_HNF(CMN700, sf_precise_to_imprecise, 0x2c), + CMN_EVENT_HNF(CMN700, snp_intv_cln, 0x2d), + CMN_EVENT_HNF(CMN700, nc_excl, 0x2e), + CMN_EVENT_HNF(CMN700, excl_mon_ovfl, 0x2f), + + CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), + CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), + CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22), + CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23), + CMN_EVENT_HNI(wdb_occ_cnt_ovfl, 0x24), + CMN_EVENT_HNI(rrt_rd_alloc, 0x25), + CMN_EVENT_HNI(rrt_wr_alloc, 0x26), + CMN_EVENT_HNI(rdt_rd_alloc, 0x27), + CMN_EVENT_HNI(rdt_wr_alloc, 0x28), + CMN_EVENT_HNI(wdb_alloc, 0x29), + CMN_EVENT_HNI(txrsp_retryack, 0x2a), + CMN_EVENT_HNI(arvalid_no_arready, 0x2b), + CMN_EVENT_HNI(arready_no_arvalid, 0x2c), + CMN_EVENT_HNI(awvalid_no_awready, 0x2d), + CMN_EVENT_HNI(awready_no_awvalid, 0x2e), + CMN_EVENT_HNI(wvalid_no_wready, 0x2f), + CMN_EVENT_HNI(txdat_stall, 0x30), + 
CMN_EVENT_HNI(nonpcie_serialization, 0x31), + CMN_EVENT_HNI(pcie_serialization, 0x32), + + /* + * HN-P events squat on top of the HN-I similarly to DVM events, except + * for being crammed into the same physical node as well. And of course + * where would the fun be if the same events were in the same order... + */ + CMN_EVENT_HNP(rrt_wr_occ_cnt_ovfl, 0x01), + CMN_EVENT_HNP(rdt_wr_occ_cnt_ovfl, 0x02), + CMN_EVENT_HNP(wdb_occ_cnt_ovfl, 0x03), + CMN_EVENT_HNP(rrt_wr_alloc, 0x04), + CMN_EVENT_HNP(rdt_wr_alloc, 0x05), + CMN_EVENT_HNP(wdb_alloc, 0x06), + CMN_EVENT_HNP(awvalid_no_awready, 0x07), + CMN_EVENT_HNP(awready_no_awvalid, 0x08), + CMN_EVENT_HNP(wvalid_no_wready, 0x09), + CMN_EVENT_HNP(rrt_rd_occ_cnt_ovfl, 0x11), + CMN_EVENT_HNP(rdt_rd_occ_cnt_ovfl, 0x12), + CMN_EVENT_HNP(rrt_rd_alloc, 0x13), + CMN_EVENT_HNP(rdt_rd_alloc, 0x14), + CMN_EVENT_HNP(arvalid_no_arready, 0x15), + CMN_EVENT_HNP(arready_no_arvalid, 0x16), + + CMN_EVENT_XP(txflit_valid, 0x01), + CMN_EVENT_XP(txflit_stall, 0x02), + CMN_EVENT_XP(partial_dat_flit, 0x03), /* We treat watchpoints as a special made-up class of XP events */ - CMN_EVENT_ATTR(watchpoint_up, CMN_TYPE_WP, 0, 0), - CMN_EVENT_ATTR(watchpoint_down, CMN_TYPE_WP, 2, 0), - - CMN_EVENT_SBSX(rd_req, 0x01), - CMN_EVENT_SBSX(wr_req, 0x02), - CMN_EVENT_SBSX(cmo_req, 0x03), - CMN_EVENT_SBSX(txrsp_retryack, 0x04), - CMN_EVENT_SBSX(txdat_flitv, 0x05), - CMN_EVENT_SBSX(txrsp_flitv, 0x06), - CMN_EVENT_SBSX(rd_req_trkr_occ_cnt_ovfl, 0x11), - CMN_EVENT_SBSX(wr_req_trkr_occ_cnt_ovfl, 0x12), - CMN_EVENT_SBSX(cmo_req_trkr_occ_cnt_ovfl, 0x13), - CMN_EVENT_SBSX(wdb_occ_cnt_ovfl, 0x14), - CMN_EVENT_SBSX(rd_axi_trkr_occ_cnt_ovfl, 0x15), - CMN_EVENT_SBSX(cmo_axi_trkr_occ_cnt_ovfl, 0x16), - CMN_EVENT_SBSX(arvalid_no_arready, 0x21), - CMN_EVENT_SBSX(awvalid_no_awready, 0x22), - CMN_EVENT_SBSX(wvalid_no_wready, 0x23), - CMN_EVENT_SBSX(txdat_stall, 0x24), - CMN_EVENT_SBSX(txrsp_stall, 0x25), - - CMN_EVENT_RNID(s0_rdata_beats, 0x01), - CMN_EVENT_RNID(s1_rdata_beats, 0x02), - CMN_EVENT_RNID(s2_rdata_beats, 0x03), - CMN_EVENT_RNID(rxdat_flits, 0x04), - CMN_EVENT_RNID(txdat_flits, 0x05), - CMN_EVENT_RNID(txreq_flits_total, 0x06), - CMN_EVENT_RNID(txreq_flits_retried, 0x07), - CMN_EVENT_RNID(rrt_occ_ovfl, 0x08), - CMN_EVENT_RNID(wrt_occ_ovfl, 0x09), - CMN_EVENT_RNID(txreq_flits_replayed, 0x0a), - CMN_EVENT_RNID(wrcancel_sent, 0x0b), - CMN_EVENT_RNID(s0_wdata_beats, 0x0c), - CMN_EVENT_RNID(s1_wdata_beats, 0x0d), - CMN_EVENT_RNID(s2_wdata_beats, 0x0e), - CMN_EVENT_RNID(rrt_alloc, 0x0f), - CMN_EVENT_RNID(wrt_alloc, 0x10), - CMN_EVENT_RNID(rdb_unord, 0x11), - CMN_EVENT_RNID(rdb_replay, 0x12), - CMN_EVENT_RNID(rdb_hybrid, 0x13), - CMN_EVENT_RNID(rdb_ord, 0x14), + CMN_EVENT_ATTR(CMN_ANY, watchpoint_up, CMN_TYPE_WP, CMN_WP_UP), + CMN_EVENT_ATTR(CMN_ANY, watchpoint_down, CMN_TYPE_WP, CMN_WP_DOWN), + + CMN_EVENT_SBSX(CMN_ANY, rd_req, 0x01), + CMN_EVENT_SBSX(CMN_ANY, wr_req, 0x02), + CMN_EVENT_SBSX(CMN_ANY, cmo_req, 0x03), + CMN_EVENT_SBSX(CMN_ANY, txrsp_retryack, 0x04), + CMN_EVENT_SBSX(CMN_ANY, txdat_flitv, 0x05), + CMN_EVENT_SBSX(CMN_ANY, txrsp_flitv, 0x06), + CMN_EVENT_SBSX(CMN_ANY, rd_req_trkr_occ_cnt_ovfl, 0x11), + CMN_EVENT_SBSX(CMN_ANY, wr_req_trkr_occ_cnt_ovfl, 0x12), + CMN_EVENT_SBSX(CMN_ANY, cmo_req_trkr_occ_cnt_ovfl, 0x13), + CMN_EVENT_SBSX(CMN_ANY, wdb_occ_cnt_ovfl, 0x14), + CMN_EVENT_SBSX(CMN_ANY, rd_axi_trkr_occ_cnt_ovfl, 0x15), + CMN_EVENT_SBSX(CMN_ANY, cmo_axi_trkr_occ_cnt_ovfl, 0x16), + CMN_EVENT_SBSX(NOT_CMN600, rdb_occ_cnt_ovfl, 0x17), + CMN_EVENT_SBSX(CMN_ANY, arvalid_no_arready, 
0x21), + CMN_EVENT_SBSX(CMN_ANY, awvalid_no_awready, 0x22), + CMN_EVENT_SBSX(CMN_ANY, wvalid_no_wready, 0x23), + CMN_EVENT_SBSX(CMN_ANY, txdat_stall, 0x24), + CMN_EVENT_SBSX(CMN_ANY, txrsp_stall, 0x25), + + CMN_EVENT_RNID(CMN_ANY, s0_rdata_beats, 0x01), + CMN_EVENT_RNID(CMN_ANY, s1_rdata_beats, 0x02), + CMN_EVENT_RNID(CMN_ANY, s2_rdata_beats, 0x03), + CMN_EVENT_RNID(CMN_ANY, rxdat_flits, 0x04), + CMN_EVENT_RNID(CMN_ANY, txdat_flits, 0x05), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_total, 0x06), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_retried, 0x07), + CMN_EVENT_RNID(CMN_ANY, rrt_occ_ovfl, 0x08), + CMN_EVENT_RNID(CMN_ANY, wrt_occ_ovfl, 0x09), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_replayed, 0x0a), + CMN_EVENT_RNID(CMN_ANY, wrcancel_sent, 0x0b), + CMN_EVENT_RNID(CMN_ANY, s0_wdata_beats, 0x0c), + CMN_EVENT_RNID(CMN_ANY, s1_wdata_beats, 0x0d), + CMN_EVENT_RNID(CMN_ANY, s2_wdata_beats, 0x0e), + CMN_EVENT_RNID(CMN_ANY, rrt_alloc, 0x0f), + CMN_EVENT_RNID(CMN_ANY, wrt_alloc, 0x10), + CMN_EVENT_RNID(CMN600, rdb_unord, 0x11), + CMN_EVENT_RNID(CMN600, rdb_replay, 0x12), + CMN_EVENT_RNID(CMN600, rdb_hybrid, 0x13), + CMN_EVENT_RNID(CMN600, rdb_ord, 0x14), + CMN_EVENT_RNID(NOT_CMN600, padb_occ_ovfl, 0x11), + CMN_EVENT_RNID(NOT_CMN600, rpdb_occ_ovfl, 0x12), + CMN_EVENT_RNID(NOT_CMN600, rrt_occup_ovfl_slice1, 0x13), + CMN_EVENT_RNID(NOT_CMN600, rrt_occup_ovfl_slice2, 0x14), + CMN_EVENT_RNID(NOT_CMN600, rrt_occup_ovfl_slice3, 0x15), + CMN_EVENT_RNID(NOT_CMN600, wrt_throttled, 0x16), + CMN_EVENT_RNID(CMN700, ldb_full, 0x17), + CMN_EVENT_RNID(CMN700, rrt_rd_req_occup_ovfl_slice0, 0x18), + CMN_EVENT_RNID(CMN700, rrt_rd_req_occup_ovfl_slice1, 0x19), + CMN_EVENT_RNID(CMN700, rrt_rd_req_occup_ovfl_slice2, 0x1a), + CMN_EVENT_RNID(CMN700, rrt_rd_req_occup_ovfl_slice3, 0x1b), + CMN_EVENT_RNID(CMN700, rrt_burst_occup_ovfl_slice0, 0x1c), + CMN_EVENT_RNID(CMN700, rrt_burst_occup_ovfl_slice1, 0x1d), + CMN_EVENT_RNID(CMN700, rrt_burst_occup_ovfl_slice2, 0x1e), + CMN_EVENT_RNID(CMN700, rrt_burst_occup_ovfl_slice3, 0x1f), + CMN_EVENT_RNID(CMN700, rrt_burst_alloc, 0x20), + CMN_EVENT_RNID(CMN700, awid_hash, 0x21), + CMN_EVENT_RNID(CMN700, atomic_alloc, 0x22), + CMN_EVENT_RNID(CMN700, atomic_occ_ovfl, 0x23), + + CMN_EVENT_MTSX(tc_lookup, 0x01), + CMN_EVENT_MTSX(tc_fill, 0x02), + CMN_EVENT_MTSX(tc_miss, 0x03), + CMN_EVENT_MTSX(tdb_forward, 0x04), + CMN_EVENT_MTSX(tcq_hazard, 0x05), + CMN_EVENT_MTSX(tcq_rd_alloc, 0x06), + CMN_EVENT_MTSX(tcq_wr_alloc, 0x07), + CMN_EVENT_MTSX(tcq_cmo_alloc, 0x08), + CMN_EVENT_MTSX(axi_rd_req, 0x09), + CMN_EVENT_MTSX(axi_wr_req, 0x0a), + CMN_EVENT_MTSX(tcq_occ_cnt_ovfl, 0x0b), + CMN_EVENT_MTSX(tdb_occ_cnt_ovfl, 0x0c), + + CMN_EVENT_CXRA(CMN_ANY, rht_occ, 0x01), + CMN_EVENT_CXRA(CMN_ANY, sht_occ, 0x02), + CMN_EVENT_CXRA(CMN_ANY, rdb_occ, 0x03), + CMN_EVENT_CXRA(CMN_ANY, wdb_occ, 0x04), + CMN_EVENT_CXRA(CMN_ANY, ssb_occ, 0x05), + CMN_EVENT_CXRA(CMN_ANY, snp_bcasts, 0x06), + CMN_EVENT_CXRA(CMN_ANY, req_chains, 0x07), + CMN_EVENT_CXRA(CMN_ANY, req_chain_avglen, 0x08), + CMN_EVENT_CXRA(CMN_ANY, chirsp_stalls, 0x09), + CMN_EVENT_CXRA(CMN_ANY, chidat_stalls, 0x0a), + CMN_EVENT_CXRA(CMN_ANY, cxreq_pcrd_stalls_link0, 0x0b), + CMN_EVENT_CXRA(CMN_ANY, cxreq_pcrd_stalls_link1, 0x0c), + CMN_EVENT_CXRA(CMN_ANY, cxreq_pcrd_stalls_link2, 0x0d), + CMN_EVENT_CXRA(CMN_ANY, cxdat_pcrd_stalls_link0, 0x0e), + CMN_EVENT_CXRA(CMN_ANY, cxdat_pcrd_stalls_link1, 0x0f), + CMN_EVENT_CXRA(CMN_ANY, cxdat_pcrd_stalls_link2, 0x10), + CMN_EVENT_CXRA(CMN_ANY, external_chirsp_stalls, 0x11), + CMN_EVENT_CXRA(CMN_ANY, 
external_chidat_stalls, 0x12), + CMN_EVENT_CXRA(NOT_CMN600, cxmisc_pcrd_stalls_link0, 0x13), + CMN_EVENT_CXRA(NOT_CMN600, cxmisc_pcrd_stalls_link1, 0x14), + CMN_EVENT_CXRA(NOT_CMN600, cxmisc_pcrd_stalls_link2, 0x15), + + CMN_EVENT_CXHA(rddatbyp, 0x21), + CMN_EVENT_CXHA(chirsp_up_stall, 0x22), + CMN_EVENT_CXHA(chidat_up_stall, 0x23), + CMN_EVENT_CXHA(snppcrd_link0_stall, 0x24), + CMN_EVENT_CXHA(snppcrd_link1_stall, 0x25), + CMN_EVENT_CXHA(snppcrd_link2_stall, 0x26), + CMN_EVENT_CXHA(reqtrk_occ, 0x27), + CMN_EVENT_CXHA(rdb_occ, 0x28), + CMN_EVENT_CXHA(rdbyp_occ, 0x29), + CMN_EVENT_CXHA(wdb_occ, 0x2a), + CMN_EVENT_CXHA(snptrk_occ, 0x2b), + CMN_EVENT_CXHA(sdb_occ, 0x2c), + CMN_EVENT_CXHA(snphaz_occ, 0x2d), + + CMN_EVENT_CCRA(rht_occ, 0x41), + CMN_EVENT_CCRA(sht_occ, 0x42), + CMN_EVENT_CCRA(rdb_occ, 0x43), + CMN_EVENT_CCRA(wdb_occ, 0x44), + CMN_EVENT_CCRA(ssb_occ, 0x45), + CMN_EVENT_CCRA(snp_bcasts, 0x46), + CMN_EVENT_CCRA(req_chains, 0x47), + CMN_EVENT_CCRA(req_chain_avglen, 0x48), + CMN_EVENT_CCRA(chirsp_stalls, 0x49), + CMN_EVENT_CCRA(chidat_stalls, 0x4a), + CMN_EVENT_CCRA(cxreq_pcrd_stalls_link0, 0x4b), + CMN_EVENT_CCRA(cxreq_pcrd_stalls_link1, 0x4c), + CMN_EVENT_CCRA(cxreq_pcrd_stalls_link2, 0x4d), + CMN_EVENT_CCRA(cxdat_pcrd_stalls_link0, 0x4e), + CMN_EVENT_CCRA(cxdat_pcrd_stalls_link1, 0x4f), + CMN_EVENT_CCRA(cxdat_pcrd_stalls_link2, 0x50), + CMN_EVENT_CCRA(external_chirsp_stalls, 0x51), + CMN_EVENT_CCRA(external_chidat_stalls, 0x52), + CMN_EVENT_CCRA(cxmisc_pcrd_stalls_link0, 0x53), + CMN_EVENT_CCRA(cxmisc_pcrd_stalls_link1, 0x54), + CMN_EVENT_CCRA(cxmisc_pcrd_stalls_link2, 0x55), + CMN_EVENT_CCRA(rht_alloc, 0x56), + CMN_EVENT_CCRA(sht_alloc, 0x57), + CMN_EVENT_CCRA(rdb_alloc, 0x58), + CMN_EVENT_CCRA(wdb_alloc, 0x59), + CMN_EVENT_CCRA(ssb_alloc, 0x5a), + + CMN_EVENT_CCHA(rddatbyp, 0x61), + CMN_EVENT_CCHA(chirsp_up_stall, 0x62), + CMN_EVENT_CCHA(chidat_up_stall, 0x63), + CMN_EVENT_CCHA(snppcrd_link0_stall, 0x64), + CMN_EVENT_CCHA(snppcrd_link1_stall, 0x65), + CMN_EVENT_CCHA(snppcrd_link2_stall, 0x66), + CMN_EVENT_CCHA(reqtrk_occ, 0x67), + CMN_EVENT_CCHA(rdb_occ, 0x68), + CMN_EVENT_CCHA(rdbyp_occ, 0x69), + CMN_EVENT_CCHA(wdb_occ, 0x6a), + CMN_EVENT_CCHA(snptrk_occ, 0x6b), + CMN_EVENT_CCHA(sdb_occ, 0x6c), + CMN_EVENT_CCHA(snphaz_occ, 0x6d), + CMN_EVENT_CCHA(reqtrk_alloc, 0x6e), + CMN_EVENT_CCHA(rdb_alloc, 0x6f), + CMN_EVENT_CCHA(rdbyp_alloc, 0x70), + CMN_EVENT_CCHA(wdb_alloc, 0x71), + CMN_EVENT_CCHA(snptrk_alloc, 0x72), + CMN_EVENT_CCHA(sdb_alloc, 0x73), + CMN_EVENT_CCHA(snphaz_alloc, 0x74), + CMN_EVENT_CCHA(pb_rhu_req_occ, 0x75), + CMN_EVENT_CCHA(pb_rhu_req_alloc, 0x76), + CMN_EVENT_CCHA(pb_rhu_pcie_req_occ, 0x77), + CMN_EVENT_CCHA(pb_rhu_pcie_req_alloc, 0x78), + CMN_EVENT_CCHA(pb_pcie_wr_req_occ, 0x79), + CMN_EVENT_CCHA(pb_pcie_wr_req_alloc, 0x7a), + CMN_EVENT_CCHA(pb_pcie_reg_req_occ, 0x7b), + CMN_EVENT_CCHA(pb_pcie_reg_req_alloc, 0x7c), + CMN_EVENT_CCHA(pb_pcie_rsvd_req_occ, 0x7d), + CMN_EVENT_CCHA(pb_pcie_rsvd_req_alloc, 0x7e), + CMN_EVENT_CCHA(pb_rhu_dat_occ, 0x7f), + CMN_EVENT_CCHA(pb_rhu_dat_alloc, 0x80), + CMN_EVENT_CCHA(pb_rhu_pcie_dat_occ, 0x81), + CMN_EVENT_CCHA(pb_rhu_pcie_dat_alloc, 0x82), + CMN_EVENT_CCHA(pb_pcie_wr_dat_occ, 0x83), + CMN_EVENT_CCHA(pb_pcie_wr_dat_alloc, 0x84), + + CMN_EVENT_CCLA(rx_cxs, 0x21), + CMN_EVENT_CCLA(tx_cxs, 0x22), + CMN_EVENT_CCLA(rx_cxs_avg_size, 0x23), + CMN_EVENT_CCLA(tx_cxs_avg_size, 0x24), + CMN_EVENT_CCLA(tx_cxs_lcrd_backpressure, 0x25), + CMN_EVENT_CCLA(link_crdbuf_occ, 0x26), + CMN_EVENT_CCLA(link_crdbuf_alloc, 0x27), + 
CMN_EVENT_CCLA(pfwd_rcvr_cxs, 0x28), + CMN_EVENT_CCLA(pfwd_sndr_num_flits, 0x29), + CMN_EVENT_CCLA(pfwd_sndr_stalls_static_crd, 0x2a), + CMN_EVENT_CCLA(pfwd_sndr_stalls_dynmaic_crd, 0x2b), NULL }; @@ -560,12 +1108,12 @@ static ssize_t arm_cmn_format_show(struct device *dev, int lo = __ffs(fmt->field), hi = __fls(fmt->field); if (lo == hi) - return snprintf(buf, PAGE_SIZE, "config:%d\n", lo); + return sysfs_emit(buf, "config:%d\n", lo); if (!fmt->config) - return snprintf(buf, PAGE_SIZE, "config:%d-%d\n", lo, hi); + return sysfs_emit(buf, "config:%d-%d\n", lo, hi); - return snprintf(buf, PAGE_SIZE, "config%d:%d-%d\n", fmt->config, lo, hi); + return sysfs_emit(buf, "config%d:%d-%d\n", fmt->config, lo, hi); } #define _CMN_FORMAT_ATTR(_name, _cfg, _fld) \ @@ -616,7 +1164,7 @@ static struct attribute *arm_cmn_cpumask_attrs[] = { NULL, }; -static struct attribute_group arm_cmn_cpumask_attr_group = { +static const struct attribute_group arm_cmn_cpumask_attr_group = { .attrs = arm_cmn_cpumask_attrs, }; @@ -640,14 +1188,18 @@ static u32 arm_cmn_wp_config(struct perf_event *event) u32 grp = CMN_EVENT_WP_GRP(event); u32 exc = CMN_EVENT_WP_EXCLUSIVE(event); u32 combine = CMN_EVENT_WP_COMBINE(event); + bool is_cmn600 = to_cmn(event->pmu)->model == CMN600; config = FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL, dev) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_CHN_SEL, chn) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_GRP, grp) | - FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc); + FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL2, dev >> 1); + if (exc) + config |= is_cmn600 ? CMN600_WPn_CONFIG_WP_EXCLUSIVE : + CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE; if (combine && !grp) - config |= CMN_DTM_WPn_CONFIG_WP_COMBINE; - + config |= is_cmn600 ? CMN600_WPn_CONFIG_WP_COMBINE : + CMN_DTM_WPn_CONFIG_WP_COMBINE; return config; } @@ -679,18 +1231,19 @@ static void arm_cmn_pmu_disable(struct pmu *pmu) static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw, bool snapshot) { + struct arm_cmn_dtm *dtm = NULL; struct arm_cmn_node *dn; - unsigned int i, offset; - u64 count = 0; + unsigned int i, offset, dtm_idx; + u64 reg, count = 0; offset = snapshot ? 
CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT; for_each_hw_dn(hw, dn, i) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn); - int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); - u64 reg = readq_relaxed(xp->pmu_base + offset); - u16 dtm_count = reg >> (dtm_idx * 16); - - count += dtm_count; + if (dtm != &cmn->dtms[dn->dtm]) { + dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; + reg = readq_relaxed(dtm->base + offset); + } + dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); + count += (u16)(reg >> (dtm_idx * 16)); } return count; } @@ -756,6 +1309,42 @@ static void arm_cmn_event_read(struct perf_event *event) local64_add(delta, &event->count); } +static int arm_cmn_set_event_sel_hi(struct arm_cmn_node *dn, + enum cmn_filter_select fsel, u8 occupid) +{ + u64 reg; + + if (fsel == SEL_NONE) + return 0; + + if (!dn->occupid[fsel].count) { + dn->occupid[fsel].val = occupid; + reg = FIELD_PREP(CMN__PMU_CBUSY_SNTHROTTLE_SEL, + dn->occupid[SEL_CBUSY_SNTHROTTLE_SEL].val) | + FIELD_PREP(CMN__PMU_CLASS_OCCUP_ID, + dn->occupid[SEL_CLASS_OCCUP_ID].val) | + FIELD_PREP(CMN__PMU_OCCUP1_ID, + dn->occupid[SEL_OCCUP1ID].val); + writel_relaxed(reg >> 32, dn->pmu_base + CMN_PMU_EVENT_SEL + 4); + } else if (dn->occupid[fsel].val != occupid) { + return -EBUSY; + } + dn->occupid[fsel].count++; + return 0; +} + +static void arm_cmn_set_event_sel_lo(struct arm_cmn_node *dn, int dtm_idx, + int eventid, bool wide_sel) +{ + if (wide_sel) { + dn->event_w[dtm_idx] = eventid; + writeq_relaxed(le64_to_cpu(dn->event_sel_w), dn->pmu_base + CMN_PMU_EVENT_SEL); + } else { + dn->event[dtm_idx] = eventid; + writel_relaxed(le32_to_cpu(dn->event_sel), dn->pmu_base + CMN_PMU_EVENT_SEL); + } +} + static void arm_cmn_event_start(struct perf_event *event, int flags) { struct arm_cmn *cmn = to_cmn(event->pmu); @@ -774,14 +1363,16 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) u64 mask = CMN_EVENT_WP_MASK(event); for_each_hw_dn(hw, dn, i) { - writeq_relaxed(val, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx)); - writeq_relaxed(mask, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx)); + void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset); + + writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx)); + writeq_relaxed(mask, base + CMN_DTM_WPn_MASK(wp_idx)); } } else for_each_hw_dn(hw, dn, i) { int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); - dn->event[dtm_idx] = CMN_EVENT_EVENTID(event); - writel_relaxed(le32_to_cpu(dn->event_sel), dn->pmu_base + CMN_PMU_EVENT_SEL); + arm_cmn_set_event_sel_lo(dn, dtm_idx, CMN_EVENT_EVENTID(event), + hw->wide_sel); } } @@ -800,34 +1391,35 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) int wp_idx = arm_cmn_wp_idx(event); for_each_hw_dn(hw, dn, i) { - writeq_relaxed(0, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx)); - writeq_relaxed(~0ULL, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx)); + void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset); + + writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx)); + writeq_relaxed(~0ULL, base + CMN_DTM_WPn_VAL(wp_idx)); } } else for_each_hw_dn(hw, dn, i) { int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); - dn->event[dtm_idx] = 0; - writel_relaxed(le32_to_cpu(dn->event_sel), dn->pmu_base + CMN_PMU_EVENT_SEL); + arm_cmn_set_event_sel_lo(dn, dtm_idx, 0, hw->wide_sel); } arm_cmn_event_read(event); } struct arm_cmn_val { - u8 dtm_count[CMN_MAX_XPS]; - u8 occupid[CMN_MAX_XPS]; - u8 wp[CMN_MAX_XPS][4]; + u8 dtm_count[CMN_MAX_DTMS]; + u8 occupid[CMN_MAX_DTMS][SEL_MAX]; + u8 wp[CMN_MAX_DTMS][4]; int dtc_count; bool cycles; }; -static void 
arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *event) +static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val, + struct perf_event *event) { struct arm_cmn_hw_event *hw = to_cmn_hw(event); struct arm_cmn_node *dn; enum cmn_node_type type; int i; - u8 occupid; if (is_software_event(event)) return; @@ -839,34 +1431,31 @@ static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev } val->dtc_count++; - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) - occupid = CMN_EVENT_OCCUPID(event) + 1; - else - occupid = 0; for_each_hw_dn(hw, dn, i) { - int wp_idx, xp = arm_cmn_node_to_xp(dn)->logid; + int wp_idx, dtm = dn->dtm, sel = hw->filter_sel; + + val->dtm_count[dtm]++; - val->dtm_count[xp]++; - val->occupid[xp] = occupid; + if (sel > SEL_NONE) + val->occupid[dtm][sel] = CMN_EVENT_OCCUPID(event) + 1; if (type != CMN_TYPE_WP) continue; wp_idx = arm_cmn_wp_idx(event); - val->wp[xp][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1; + val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1; } } -static int arm_cmn_validate_group(struct perf_event *event) +static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event) { struct arm_cmn_hw_event *hw = to_cmn_hw(event); struct arm_cmn_node *dn; struct perf_event *sibling, *leader = event->group_leader; enum cmn_node_type type; - struct arm_cmn_val val; - int i; - u8 occupid; + struct arm_cmn_val *val; + int i, ret = -EINVAL; if (leader == event) return 0; @@ -874,54 +1463,73 @@ static int arm_cmn_validate_group(struct perf_event *event) if (event->pmu != leader->pmu && !is_software_event(leader)) return -EINVAL; - memset(&val, 0, sizeof(val)); + val = kzalloc(sizeof(*val), GFP_KERNEL); + if (!val) + return -ENOMEM; - arm_cmn_val_add_event(&val, leader); + arm_cmn_val_add_event(cmn, val, leader); for_each_sibling_event(sibling, leader) - arm_cmn_val_add_event(&val, sibling); + arm_cmn_val_add_event(cmn, val, sibling); type = CMN_EVENT_TYPE(event); - if (type == CMN_TYPE_DTC) - return val.cycles ? -EINVAL : 0; - - if (val.dtc_count == CMN_DT_NUM_COUNTERS) - return -EINVAL; + if (type == CMN_TYPE_DTC) { + ret = val->cycles ? 
-EINVAL : 0; + goto done; + } - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) - occupid = CMN_EVENT_OCCUPID(event) + 1; - else - occupid = 0; + if (val->dtc_count == CMN_DT_NUM_COUNTERS) + goto done; for_each_hw_dn(hw, dn, i) { - int wp_idx, wp_cmb, xp = arm_cmn_node_to_xp(dn)->logid; + int wp_idx, wp_cmb, dtm = dn->dtm, sel = hw->filter_sel; - if (val.dtm_count[xp] == CMN_DTM_NUM_COUNTERS) - return -EINVAL; + if (val->dtm_count[dtm] == CMN_DTM_NUM_COUNTERS) + goto done; - if (occupid && val.occupid[xp] && occupid != val.occupid[xp]) - return -EINVAL; + if (sel > SEL_NONE && val->occupid[dtm][sel] && + val->occupid[dtm][sel] != CMN_EVENT_OCCUPID(event) + 1) + goto done; if (type != CMN_TYPE_WP) continue; wp_idx = arm_cmn_wp_idx(event); - if (val.wp[xp][wp_idx]) - return -EINVAL; + if (val->wp[dtm][wp_idx]) + goto done; - wp_cmb = val.wp[xp][wp_idx ^ 1]; + wp_cmb = val->wp[dtm][wp_idx ^ 1]; if (wp_cmb && wp_cmb != CMN_EVENT_WP_COMBINE(event) + 1) - return -EINVAL; + goto done; } - return 0; + ret = 0; +done: + kfree(val); + return ret; +} + +static enum cmn_filter_select arm_cmn_filter_sel(enum cmn_model model, + enum cmn_node_type type, + unsigned int eventid) +{ + struct arm_cmn_event_attr *e; + int i; + + for (i = 0; i < ARRAY_SIZE(arm_cmn_event_attrs) - 1; i++) { + e = container_of(arm_cmn_event_attrs[i], typeof(*e), attr.attr); + if (e->model & model && e->type == type && e->eventid == eventid) + return e->fsel; + } + return SEL_NONE; } + static int arm_cmn_event_init(struct perf_event *event) { struct arm_cmn *cmn = to_cmn(event->pmu); struct arm_cmn_hw_event *hw = to_cmn_hw(event); + struct arm_cmn_node *dn; enum cmn_node_type type; - unsigned int i; bool bynodeid; u16 nodeid, eventid; @@ -940,45 +1548,54 @@ static int arm_cmn_event_init(struct perf_event *event) if (type == CMN_TYPE_DTC) return 0; + eventid = CMN_EVENT_EVENTID(event); /* For watchpoints we need the actual XP node here */ if (type == CMN_TYPE_WP) { type = CMN_TYPE_XP; /* ...and we need a "real" direction */ - eventid = CMN_EVENT_EVENTID(event); if (eventid != CMN_WP_UP && eventid != CMN_WP_DOWN) return -EINVAL; + /* ...but the DTM may depend on which port we're watching */ + if (cmn->multi_dtm) + hw->dtm_offset = CMN_EVENT_WP_DEV_SEL(event) / 2; + } else if (type == CMN_TYPE_XP && cmn->model == CMN700) { + hw->wide_sel = true; } + /* This is sufficiently annoying to recalculate, so cache it */ + hw->filter_sel = arm_cmn_filter_sel(cmn->model, type, eventid); + bynodeid = CMN_EVENT_BYNODEID(event); nodeid = CMN_EVENT_NODEID(event); hw->dn = arm_cmn_node(cmn, type); - for (i = hw->dn - cmn->dns; i < cmn->num_dns && cmn->dns[i].type == type; i++) { - if (!bynodeid) { - hw->num_dns++; - } else if (cmn->dns[i].id != nodeid) { + if (!hw->dn) + return -EINVAL; + for (dn = hw->dn; dn->type == type; dn++) { + if (bynodeid && dn->id != nodeid) { hw->dn++; - } else { - hw->num_dns = 1; - break; + continue; } + hw->num_dns++; + if (bynodeid) + break; } if (!hw->num_dns) { - int bits = arm_cmn_xyidbits(cmn); + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, nodeid); dev_dbg(cmn->dev, "invalid node 0x%x (%d,%d,%d,%d) type 0x%x\n", - nodeid, CMN_NODEID_X(nodeid, bits), CMN_NODEID_Y(nodeid, bits), - CMN_NODEID_PID(nodeid), CMN_NODEID_DEVID(nodeid), type); + nodeid, nid.x, nid.y, nid.port, nid.dev, type); return -EINVAL; } /* - * By assuming events count in all DTC domains, we cunningly avoid - * needing to know anything about how XPs are assigned to domains. 
+ * Keep assuming non-cycles events count in all DTC domains; turns out + * it's hard to make a worthwhile optimisation around this, short of + * going all-in with domain-local counter allocation as well. */ hw->dtcs_used = (1U << cmn->num_dtcs) - 1; - return arm_cmn_validate_group(event); + return arm_cmn_validate_group(cmn, event); } static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, @@ -988,17 +1605,17 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, enum cmn_node_type type = CMN_EVENT_TYPE(event); while (i--) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(hw->dn + i); + struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] + hw->dtm_offset; unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); if (type == CMN_TYPE_WP) - hw->dn[i].wp_event[arm_cmn_wp_idx(event)] = -1; + dtm->wp_event[arm_cmn_wp_idx(event)] = -1; - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) - hw->dn[i].occupid_count--; + if (hw->filter_sel > SEL_NONE) + hw->dn[i].occupid[hw->filter_sel].count--; - xp->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx); - writel_relaxed(xp->pmu_config_low, xp->pmu_base + CMN_DTM_PMU_CONFIG); + dtm->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx); + writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG); } memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx)); @@ -1040,12 +1657,12 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) /* ...then the local counters to feed it. */ for_each_hw_dn(hw, dn, i) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn); + struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; unsigned int dtm_idx, shift; u64 reg; dtm_idx = 0; - while (xp->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx)) + while (dtm->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx)) if (++dtm_idx == CMN_DTM_NUM_COUNTERS) goto free_dtms; @@ -1055,47 +1672,39 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) int tmp, wp_idx = arm_cmn_wp_idx(event); u32 cfg = arm_cmn_wp_config(event); - if (dn->wp_event[wp_idx] >= 0) + if (dtm->wp_event[wp_idx] >= 0) goto free_dtms; - tmp = dn->wp_event[wp_idx ^ 1]; + tmp = dtm->wp_event[wp_idx ^ 1]; if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) != CMN_EVENT_WP_COMBINE(dtc->counters[tmp])) goto free_dtms; input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx; - dn->wp_event[wp_idx] = dtc_idx; - writel_relaxed(cfg, dn->pmu_base + CMN_DTM_WPn_CONFIG(wp_idx)); + dtm->wp_event[wp_idx] = dtc_idx; + writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx)); } else { - unsigned int port = CMN_NODEID_PID(dn->id); - unsigned int dev = CMN_NODEID_DEVID(dn->id); + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + + if (cmn->multi_dtm) + nid.port %= 2; input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx + - (port << 4) + (dev << 2); - - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) { - int occupid = CMN_EVENT_OCCUPID(event); - - if (dn->occupid_count == 0) { - dn->occupid_val = occupid; - writel_relaxed(occupid, - dn->pmu_base + CMN_PMU_EVENT_SEL + 4); - } else if (dn->occupid_val != occupid) { - goto free_dtms; - } - dn->occupid_count++; - } + (nid.port << 4) + (nid.dev << 2); + + if (arm_cmn_set_event_sel_hi(dn, hw->filter_sel, CMN_EVENT_OCCUPID(event))) + goto free_dtms; } arm_cmn_set_index(hw->dtm_idx, i, dtm_idx); - xp->input_sel[dtm_idx] = input_sel; + dtm->input_sel[dtm_idx] = input_sel; shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx); - xp->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); - xp->pmu_config_low |= 
FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; - xp->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); - reg = (u64)le32_to_cpu(xp->pmu_config_high) << 32 | xp->pmu_config_low; - writeq_relaxed(reg, xp->pmu_base + CMN_DTM_PMU_CONFIG); + dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); + dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; + dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); + reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low; + writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG); } /* Go go go! */ @@ -1147,23 +1756,47 @@ static int arm_cmn_commit_txn(struct pmu *pmu) return 0; } -static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +static void arm_cmn_migrate(struct arm_cmn *cmn, unsigned int cpu) +{ + unsigned int i; + + perf_pmu_migrate_context(&cmn->pmu, cmn->cpu, cpu); + for (i = 0; i < cmn->num_dtcs; i++) + irq_set_affinity_hint(cmn->dtc[i].irq, cpumask_of(cpu)); + cmn->cpu = cpu; +} + +static int arm_cmn_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) { struct arm_cmn *cmn; - unsigned int i, target; + int node; - cmn = hlist_entry_safe(node, struct arm_cmn, cpuhp_node); - if (cpu != cmn->cpu) - return 0; + cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node); + node = dev_to_node(cmn->dev); + if (node != NUMA_NO_NODE && cpu_to_node(cmn->cpu) != node && cpu_to_node(cpu) == node) + arm_cmn_migrate(cmn, cpu); + return 0; +} + +static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct arm_cmn *cmn; + unsigned int target; + int node; + cpumask_t mask; - target = cpumask_any_but(cpu_online_mask, cpu); - if (target >= nr_cpu_ids) + cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node); + if (cpu != cmn->cpu) return 0; - perf_pmu_migrate_context(&cmn->pmu, cpu, target); - for (i = 0; i < cmn->num_dtcs; i++) - irq_set_affinity_hint(cmn->dtc[i].irq, cpumask_of(target)); - cmn->cpu = target; + node = dev_to_node(cmn->dev); + if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) && + cpumask_andnot(&mask, &mask, cpumask_of(cpu))) + target = cpumask_any(&mask); + else + target = cpumask_any_but(cpu_online_mask, cpu); + if (target < nr_cpu_ids) + arm_cmn_migrate(cmn, target); return 0; } @@ -1231,23 +1864,23 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn) return 0; } -static void arm_cmn_init_dtm(struct arm_cmn_node *xp) +static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, int idx) { int i; + dtm->base = xp->pmu_base + CMN_DTM_OFFSET(idx); + dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; + writeq_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG); for (i = 0; i < 4; i++) { - xp->wp_event[i] = -1; - writeq_relaxed(0, xp->pmu_base + CMN_DTM_WPn_MASK(i)); - writeq_relaxed(~0ULL, xp->pmu_base + CMN_DTM_WPn_VAL(i)); + dtm->wp_event[i] = -1; + writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(i)); + writeq_relaxed(~0ULL, dtm->base + CMN_DTM_WPn_VAL(i)); } - xp->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; - xp->dtc = -1; } static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx) { struct arm_cmn_dtc *dtc = cmn->dtc + idx; - struct arm_cmn_node *xp; dtc->base = dn->pmu_base - CMN_PMU_OFFSET; dtc->irq = platform_get_irq(to_platform_device(cmn->dev), idx); @@ -1259,10 +1892,6 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id writeq_relaxed(0, dtc->base + CMN_DT_PMCCNTR); writel_relaxed(0x1ff, dtc->base + 
CMN_DT_PMOVSR_CLR); - /* We do at least know that a DTC's XP must be in that DTC's domain */ - xp = arm_cmn_node_to_xp(dn); - xp->dtc = idx; - return 0; } @@ -1279,8 +1908,9 @@ static int arm_cmn_node_cmp(const void *a, const void *b) static int arm_cmn_init_dtcs(struct arm_cmn *cmn) { - struct arm_cmn_node *dn; + struct arm_cmn_node *dn, *xp; int dtc_idx = 0; + u8 dtcs_present = (1 << cmn->num_dtcs) - 1; cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL); if (!cmn->dtc) @@ -1290,18 +1920,34 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP); - for (dn = cmn->dns; dn < cmn->dns + cmn->num_dns; dn++) { - if (dn->type != CMN_TYPE_XP) - arm_cmn_init_node_to_xp(cmn, dn); - else if (cmn->num_dtcs == 1) - dn->dtc = 0; + for (dn = cmn->dns; dn->type; dn++) { + if (dn->type == CMN_TYPE_XP) { + dn->dtc &= dtcs_present; + continue; + } - if (dn->type == CMN_TYPE_DTC) - arm_cmn_init_dtc(cmn, dn, dtc_idx++); + xp = arm_cmn_node_to_xp(cmn, dn); + dn->dtm = xp->dtm; + if (cmn->multi_dtm) + dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2; + + if (dn->type == CMN_TYPE_DTC) { + int err; + /* We do at least know that a DTC's XP must be in that DTC's domain */ + if (xp->dtc == 0xf) + xp->dtc = 1 << dtc_idx; + err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); + if (err) + return err; + } /* To the PMU, RN-Ds don't add anything over RN-Is, so smoosh them together */ if (dn->type == CMN_TYPE_RND) dn->type = CMN_TYPE_RNI; + + /* We split the RN-I off already, so let the CCLA part match CCLA events */ + if (dn->type == CMN_TYPE_CCLA_RNI) + dn->type = CMN_TYPE_CCLA; } arm_cmn_set_state(cmn, CMN_STATE_DISABLED); @@ -1332,23 +1978,45 @@ static void arm_cmn_init_node_info(struct arm_cmn *cmn, u32 offset, struct arm_c node->type, node->logid, offset); } +static enum cmn_node_type arm_cmn_subtype(enum cmn_node_type type) +{ + switch (type) { + case CMN_TYPE_HNP: + return CMN_TYPE_HNI; + case CMN_TYPE_CCLA_RNI: + return CMN_TYPE_RNI; + default: + return CMN_TYPE_INVALID; + } +} + static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) { void __iomem *cfg_region; struct arm_cmn_node cfg, *dn; + struct arm_cmn_dtm *dtm; u16 child_count, child_poff; u32 xp_offset[CMN_MAX_XPS]; u64 reg; int i, j; + size_t sz; + + arm_cmn_init_node_info(cmn, rgn_offset, &cfg); + if (cfg.type != CMN_TYPE_CFG) + return -ENODEV; cfg_region = cmn->base + rgn_offset; reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2); cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg); - dev_dbg(cmn->dev, "periph_id_2 revision: %d\n", cmn->rev); - arm_cmn_init_node_info(cmn, rgn_offset, &cfg); - if (cfg.type != CMN_TYPE_CFG) - return -ENODEV; + reg = readq_relaxed(cfg_region + CMN_CFGM_INFO_GLOBAL); + cmn->multi_dtm = reg & CMN_INFO_MULTIPLE_DTM_EN; + cmn->rsp_vc_num = FIELD_GET(CMN_INFO_RSP_VC_NUM, reg); + cmn->dat_vc_num = FIELD_GET(CMN_INFO_DAT_VC_NUM, reg); + + reg = readq_relaxed(cfg_region + CMN_CFGM_INFO_GLOBAL_1); + cmn->snp_vc_num = FIELD_GET(CMN_INFO_SNP_VC_NUM, reg); + cmn->req_vc_num = FIELD_GET(CMN_INFO_REQ_VC_NUM, reg); reg = readq_relaxed(cfg_region + CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); @@ -1366,20 +2034,33 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->num_dns += FIELD_GET(CMN_CI_CHILD_COUNT, reg); } - /* Cheeky +1 to help terminate pointer-based iteration */ - cmn->dns = devm_kcalloc(cmn->dev, cmn->num_dns + 1, - sizeof(*cmn->dns), GFP_KERNEL); - if (!cmn->dns) + /* + * Some nodes effectively 
have two separate types, which we'll handle + * by creating one of each internally. For a (very) safe initial upper + * bound, account for double the number of non-XP nodes. + */ + dn = devm_kcalloc(cmn->dev, cmn->num_dns * 2 - cmn->num_xps, + sizeof(*dn), GFP_KERNEL); + if (!dn) + return -ENOMEM; + + /* Initial safe upper bound on DTMs for any possible mesh layout */ + i = cmn->num_xps; + if (cmn->multi_dtm) + i += cmn->num_xps + 1; + dtm = devm_kcalloc(cmn->dev, i, sizeof(*dtm), GFP_KERNEL); + if (!dtm) return -ENOMEM; /* Pass 2: now we can actually populate the nodes */ - dn = cmn->dns; + cmn->dns = dn; + cmn->dtms = dtm; for (i = 0; i < cmn->num_xps; i++) { void __iomem *xp_region = cmn->base + xp_offset[i]; struct arm_cmn_node *xp = dn++; + unsigned int xp_ports = 0; arm_cmn_init_node_info(cmn, xp_offset[i], xp); - arm_cmn_init_dtm(xp); /* * Thanks to the order in which XP logical IDs seem to be * assigned, we can handily infer the mesh X dimension by @@ -1389,6 +2070,40 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) if (xp->id == (1 << 3)) cmn->mesh_x = xp->logid; + if (cmn->model == CMN600) + xp->dtc = 0xf; + else + xp->dtc = 1 << readl_relaxed(xp_region + CMN_DTM_UNIT_INFO); + + xp->dtm = dtm - cmn->dtms; + arm_cmn_init_dtm(dtm++, xp, 0); + /* + * Keeping track of connected ports will let us filter out + * unnecessary XP events easily. We can also reliably infer the + * "extra device ports" configuration for the node ID format + * from this, since in that case we will see at least one XP + * with port 2 connected, for the HN-D. + */ + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P0)) + xp_ports |= BIT(0); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P1)) + xp_ports |= BIT(1); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P2)) + xp_ports |= BIT(2); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P3)) + xp_ports |= BIT(3); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P4)) + xp_ports |= BIT(4); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P5)) + xp_ports |= BIT(5); + + if (cmn->multi_dtm && (xp_ports & 0xc)) + arm_cmn_init_dtm(dtm++, xp, 1); + if (cmn->multi_dtm && (xp_ports & 0x30)) + arm_cmn_init_dtm(dtm++, xp, 2); + + cmn->ports_used |= xp_ports; + reg = readq_relaxed(xp_region + CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); @@ -1423,14 +2138,33 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) case CMN_TYPE_SBSX: case CMN_TYPE_RNI: case CMN_TYPE_RND: + case CMN_TYPE_MTSX: case CMN_TYPE_CXRA: case CMN_TYPE_CXHA: + case CMN_TYPE_CCRA: + case CMN_TYPE_CCHA: + case CMN_TYPE_CCLA: dn++; break; /* Nothing to see here */ + case CMN_TYPE_MPAM_S: + case CMN_TYPE_MPAM_NS: case CMN_TYPE_RNSAM: case CMN_TYPE_CXLA: break; + /* + * Split "optimised" combination nodes into separate + * types for the different event sets. Offsetting the + * base address lets us handle the second pmu_event_sel + * register via the normal mechanism later. 
+ */ + case CMN_TYPE_HNP: + case CMN_TYPE_CCLA_RNI: + dn[1] = dn[0]; + dn[0].pmu_base += CMN_HNP_PMU_EVENT_SEL; + dn[1].type = arm_cmn_subtype(dn->type); + dn += 2; + break; /* Something has gone horribly wrong */ default: dev_err(cmn->dev, "invalid device node type: 0x%x\n", dn->type); @@ -1439,9 +2173,20 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) } } - /* Correct for any nodes we skipped */ + /* Correct for any nodes we added or skipped */ cmn->num_dns = dn - cmn->dns; + /* Cheeky +1 to help terminate pointer-based iteration later */ + sz = (void *)(dn + 1) - (void *)cmn->dns; + dn = devm_krealloc(cmn->dev, cmn->dns, sz, GFP_KERNEL); + if (dn) + cmn->dns = dn; + + sz = (void *)dtm - (void *)cmn->dtms; + dtm = devm_krealloc(cmn->dev, cmn->dtms, sz, GFP_KERNEL); + if (dtm) + cmn->dtms = dtm; + /* * If mesh_x wasn't set during discovery then we never saw * an XP at (0,1), thus we must have an Nx1 configuration. @@ -1450,13 +2195,20 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->mesh_x = cmn->num_xps; cmn->mesh_y = cmn->num_xps / cmn->mesh_x; - dev_dbg(cmn->dev, "mesh %dx%d, ID width %d\n", - cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn)); + /* 1x1 config plays havoc with XP event encodings */ + if (cmn->num_xps == 1) + dev_warn(cmn->dev, "1x1 config not fully supported, translate XP events manually\n"); + + dev_dbg(cmn->dev, "model %d, periph_id_2 revision %d\n", cmn->model, cmn->rev); + reg = cmn->ports_used; + dev_dbg(cmn->dev, "mesh %dx%d, ID width %d, ports %6pbl%s\n", + cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn), ®, + cmn->multi_dtm ? ", multi-DTM" : ""); return 0; } -static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) +static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) { struct resource *cfg, *root; @@ -1483,21 +2235,11 @@ static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) return root->start - cfg->start; } -static int arm_cmn_of_probe(struct platform_device *pdev, struct arm_cmn *cmn) +static int arm_cmn600_of_probe(struct device_node *np) { - struct device_node *np = pdev->dev.of_node; u32 rootnode; - int ret; - cmn->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(cmn->base)) - return PTR_ERR(cmn->base); - - ret = of_property_read_u32(np, "arm,root-node", &rootnode); - if (ret) - return ret; - - return rootnode; + return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode; } static int arm_cmn_probe(struct platform_device *pdev) @@ -1505,19 +2247,26 @@ static int arm_cmn_probe(struct platform_device *pdev) struct arm_cmn *cmn; const char *name; static atomic_t id; - int err, rootnode; + int err, rootnode, this_id; cmn = devm_kzalloc(&pdev->dev, sizeof(*cmn), GFP_KERNEL); if (!cmn) return -ENOMEM; cmn->dev = &pdev->dev; + cmn->model = (unsigned long)device_get_match_data(cmn->dev); platform_set_drvdata(pdev, cmn); - if (has_acpi_companion(cmn->dev)) - rootnode = arm_cmn_acpi_probe(pdev, cmn); - else - rootnode = arm_cmn_of_probe(pdev, cmn); + if (cmn->model == CMN600 && has_acpi_companion(cmn->dev)) { + rootnode = arm_cmn600_acpi_probe(pdev, cmn); + } else { + rootnode = 0; + cmn->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(cmn->base)) + return PTR_ERR(cmn->base); + if (cmn->model == CMN600) + rootnode = arm_cmn600_of_probe(pdev->dev.of_node); + } if (rootnode < 0) return rootnode; @@ -1533,7 +2282,7 @@ static int arm_cmn_probe(struct platform_device *pdev) if (err) return 
err; - cmn->cpu = raw_smp_processor_id(); + cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev)); cmn->pmu = (struct pmu) { .module = THIS_MODULE, .attr_groups = arm_cmn_attr_groups, @@ -1552,7 +2301,8 @@ static int arm_cmn_probe(struct platform_device *pdev) .cancel_txn = arm_cmn_end_txn, }; - name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", atomic_fetch_inc(&id)); + this_id = atomic_fetch_inc(&id); + name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", this_id); if (!name) return -ENOMEM; @@ -1562,7 +2312,10 @@ static int arm_cmn_probe(struct platform_device *pdev) err = perf_pmu_register(&cmn->pmu, name, -1); if (err) - cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node); + cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); + else + arm_cmn_debugfs_init(cmn, this_id); + return err; } @@ -1574,7 +2327,8 @@ static int arm_cmn_remove(struct platform_device *pdev) writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL); perf_pmu_unregister(&cmn->pmu); - cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node); + cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); + debugfs_remove(cmn->debug); for (i = 0; i < cmn->num_dtcs; i++) irq_set_affinity_hint(cmn->dtc[i].irq, NULL); @@ -1584,7 +2338,10 @@ static int arm_cmn_remove(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id arm_cmn_of_match[] = { - { .compatible = "arm,cmn-600", }, + { .compatible = "arm,cmn-600", .data = (void *)CMN600 }, + { .compatible = "arm,cmn-650", .data = (void *)CMN650 }, + { .compatible = "arm,cmn-700", .data = (void *)CMN700 }, + { .compatible = "arm,ci-700", .data = (void *)CI700 }, {} }; MODULE_DEVICE_TABLE(of, arm_cmn_of_match); @@ -1592,7 +2349,9 @@ MODULE_DEVICE_TABLE(of, arm_cmn_of_match); #ifdef CONFIG_ACPI static const struct acpi_device_id arm_cmn_acpi_match[] = { - { "ARMHC600", }, + { "ARMHC600", CMN600 }, + { "ARMHC650", CMN650 }, + { "ARMHC700", CMN700 }, {} }; MODULE_DEVICE_TABLE(acpi, arm_cmn_acpi_match); @@ -1613,15 +2372,20 @@ static int __init arm_cmn_init(void) int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - "perf/arm/cmn:online", NULL, + "perf/arm/cmn:online", + arm_cmn_pmu_online_cpu, arm_cmn_pmu_offline_cpu); if (ret < 0) return ret; arm_cmn_hp_state = ret; + arm_cmn_debugfs = debugfs_create_dir("arm-cmn", NULL); + ret = platform_driver_register(&arm_cmn_driver); - if (ret) + if (ret) { cpuhp_remove_multi_state(arm_cmn_hp_state); + debugfs_remove(arm_cmn_debugfs); + } return ret; } @@ -1629,6 +2393,7 @@ static void __exit arm_cmn_exit(void) { platform_driver_unregister(&arm_cmn_driver); cpuhp_remove_multi_state(arm_cmn_hp_state); + debugfs_remove(arm_cmn_debugfs); } module_init(arm_cmn_init); diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 1db8eccc9735c..83bc031d79b79 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -136,8 +136,7 @@ static ssize_t dsu_pmu_sysfs_event_show(struct device *dev, { struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "event=0x%lx\n", - (unsigned long)eattr->var); + return sysfs_emit(buf, "event=0x%lx\n", (unsigned long)eattr->var); } static ssize_t dsu_pmu_sysfs_format_show(struct device *dev, @@ -146,7 +145,7 @@ static ssize_t dsu_pmu_sysfs_format_show(struct device *dev, { struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "%s\n", (char *)eattr->var); + 
return sysfs_emit(buf, "%s\n", (char *)eattr->var); } static ssize_t dsu_pmu_cpumask_show(struct device *dev, diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 7fd11ef5cb8a2..952264f4fd796 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -575,7 +575,7 @@ static struct attribute *armpmu_common_attrs[] = { NULL, }; -static struct attribute_group armpmu_common_attr_group = { +static const struct attribute_group armpmu_common_attr_group = { .attrs = armpmu_common_attrs, }; diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index f5a33dbe7acb9..da3d1d807c179 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -493,7 +493,7 @@ static struct attribute *smmu_pmu_cpumask_attrs[] = { NULL }; -static struct attribute_group smmu_pmu_cpumask_group = { +static const struct attribute_group smmu_pmu_cpumask_group = { .attrs = smmu_pmu_cpumask_attrs, }; @@ -548,7 +548,7 @@ static umode_t smmu_pmu_event_is_visible(struct kobject *kobj, return 0; } -static struct attribute_group smmu_pmu_events_group = { +static const struct attribute_group smmu_pmu_events_group = { .name = "events", .attrs = smmu_pmu_events, .is_visible = smmu_pmu_event_is_visible, @@ -568,7 +568,7 @@ static struct attribute *smmu_pmu_formats[] = { NULL }; -static struct attribute_group smmu_pmu_format_group = { +static const struct attribute_group smmu_pmu_format_group = { .name = "format", .attrs = smmu_pmu_formats, }; diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 6fbfcab4918cf..a9d001b2447f4 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -144,8 +144,7 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev, container_of(attr, struct dev_ext_attribute, attr); int cap = (long)ea->var; - return snprintf(buf, PAGE_SIZE, "%u\n", - arm_spe_pmu_cap_get(spe_pmu, cap)); + return sysfs_emit(buf, "%u\n", arm_spe_pmu_cap_get(spe_pmu, cap)); } #define SPE_EXT_ATTR_ENTRY(_name, _func, _var) \ @@ -164,7 +163,7 @@ static struct attribute *arm_spe_pmu_cap_attr[] = { NULL, }; -static struct attribute_group arm_spe_pmu_cap_group = { +static const struct attribute_group arm_spe_pmu_cap_group = { .name = "caps", .attrs = arm_spe_pmu_cap_attr, }; @@ -245,7 +244,7 @@ static struct attribute *arm_spe_pmu_formats_attr[] = { NULL, }; -static struct attribute_group arm_spe_pmu_format_group = { +static const struct attribute_group arm_spe_pmu_format_group = { .name = "format", .attrs = arm_spe_pmu_formats_attr, }; @@ -265,7 +264,7 @@ static struct attribute *arm_spe_pmu_attrs[] = { NULL, }; -static struct attribute_group arm_spe_pmu_group = { +static const struct attribute_group arm_spe_pmu_group = { .attrs = arm_spe_pmu_attrs, }; diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index e09bbf3890c49..b86f6be0f1cbe 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -116,8 +116,7 @@ static ssize_t ddr_perf_filter_cap_show(struct device *dev, container_of(attr, struct dev_ext_attribute, attr); int cap = (long)ea->var; - return snprintf(buf, PAGE_SIZE, "%u\n", - ddr_perf_filter_cap_get(pmu, cap)); + return sysfs_emit(buf, "%u\n", ddr_perf_filter_cap_get(pmu, cap)); } #define PERF_EXT_ATTR_ENTRY(_name, _func, _var) \ diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 3e377f3c69e5d..1b5834ac6ca0e 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -109,7 +109,7 @@ config PTP_1588_CLOCK_PCH config PTP_1588_CLOCK_KVM tristate "KVM virtual 
PTP clock" depends on PTP_1588_CLOCK - depends on KVM_GUEST && X86 + depends on (KVM_GUEST && X86) || (HAVE_ARM_SMCCC_DISCOVERY && ARM_ARCH_TIMER) default y help This driver adds support for using kvm infrastructure as a PTP diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 7aff75f745dca..9fa5ede44b2b2 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -4,6 +4,8 @@ # ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o +ptp_kvm-$(CONFIG_X86) := ptp_kvm_x86.o ptp_kvm_common.o +ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC) := ptp_kvm_arm.o ptp_kvm_common.o obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o obj-$(CONFIG_PTP_1588_CLOCK_DTE) += ptp_dte.o obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o diff --git a/drivers/ptp/ptp_kvm_arm.c b/drivers/ptp/ptp_kvm_arm.c new file mode 100644 index 0000000000000..b7d28c8dfb84e --- /dev/null +++ b/drivers/ptp/ptp_kvm_arm.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Virtual PTP 1588 clock for use with KVM guests + * Copyright (C) 2019 ARM Ltd. + * All Rights Reserved + */ + +#include +#include + +#include +#include + +int kvm_arch_ptp_init(void) +{ + int ret; + + ret = kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP); + if (ret <= 0) + return -EOPNOTSUPP; + + return 0; +} + +int kvm_arch_ptp_get_clock(struct timespec64 *ts) +{ + return kvm_arch_ptp_get_crosststamp(NULL, ts, NULL); +} diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm_common.c similarity index 60% rename from drivers/ptp/ptp_kvm.c rename to drivers/ptp/ptp_kvm_common.c index 658d33fc31952..fcae32f56f25a 100644 --- a/drivers/ptp/ptp_kvm.c +++ b/drivers/ptp/ptp_kvm_common.c @@ -8,11 +8,11 @@ #include #include #include +#include #include +#include #include #include -#include -#include #include #include @@ -24,56 +24,29 @@ struct kvm_ptp_clock { static DEFINE_SPINLOCK(kvm_ptp_lock); -static struct pvclock_vsyscall_time_info *hv_clock; - -static struct kvm_clock_pairing clock_pair; -static phys_addr_t clock_pair_gpa; - static int ptp_kvm_get_time_fn(ktime_t *device_time, struct system_counterval_t *system_counter, void *ctx) { - unsigned long ret; + long ret; + u64 cycle; struct timespec64 tspec; - unsigned version; - int cpu; - struct pvclock_vcpu_time_info *src; + struct clocksource *cs; spin_lock(&kvm_ptp_lock); preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - - do { - /* - * We are using a TSC value read in the hosts - * kvm_hc_clock_pairing handling. - * So any changes to tsc_to_system_mul - * and tsc_shift or any other pvclock - * data invalidate that measurement. 
- */ - version = pvclock_read_begin(src); - - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, - clock_pair_gpa, - KVM_CLOCK_PAIRING_WALLCLOCK); - if (ret != 0) { - pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); - spin_unlock(&kvm_ptp_lock); - preempt_enable_notrace(); - return -EOPNOTSUPP; - } - - tspec.tv_sec = clock_pair.sec; - tspec.tv_nsec = clock_pair.nsec; - ret = __pvclock_read_cycles(src, clock_pair.tsc); - } while (pvclock_read_retry(src, version)); + ret = kvm_arch_ptp_get_crosststamp(&cycle, &tspec, &cs); + if (ret) { + spin_unlock(&kvm_ptp_lock); + preempt_enable_notrace(); + return ret; + } preempt_enable_notrace(); - system_counter->cycles = ret; - system_counter->cs = &kvm_clock; + system_counter->cycles = cycle; + system_counter->cs = cs; *device_time = timespec64_to_ktime(tspec); @@ -111,22 +84,17 @@ static int ptp_kvm_settime(struct ptp_clock_info *ptp, static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { - unsigned long ret; + long ret; struct timespec64 tspec; spin_lock(&kvm_ptp_lock); - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, - clock_pair_gpa, - KVM_CLOCK_PAIRING_WALLCLOCK); - if (ret != 0) { - pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); + ret = kvm_arch_ptp_get_clock(&tspec); + if (ret) { spin_unlock(&kvm_ptp_lock); - return -EOPNOTSUPP; + return ret; } - tspec.tv_sec = clock_pair.sec; - tspec.tv_nsec = clock_pair.nsec; spin_unlock(&kvm_ptp_lock); memcpy(ts, &tspec, sizeof(struct timespec64)); @@ -168,19 +136,12 @@ static int __init ptp_kvm_init(void) { long ret; - if (!kvm_para_available()) - return -ENODEV; - - clock_pair_gpa = slow_virt_to_phys(&clock_pair); - hv_clock = pvclock_get_pvti_cpu0_va(); - - if (!hv_clock) - return -ENODEV; - - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, - KVM_CLOCK_PAIRING_WALLCLOCK); - if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP) - return -ENODEV; + ret = kvm_arch_ptp_init(); + if (ret) { + if (ret != -EOPNOTSUPP) + pr_err("fail to initialize ptp_kvm"); + return ret; + } kvm_ptp_clock.caps = ptp_kvm_caps; diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c new file mode 100644 index 0000000000000..3dd519dfc473c --- /dev/null +++ b/drivers/ptp/ptp_kvm_x86.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtual PTP 1588 clock for use with KVM guests + * + * Copyright (C) 2017 Red Hat Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pvclock_vsyscall_time_info *hv_clock; + +static phys_addr_t clock_pair_gpa; +static struct kvm_clock_pairing clock_pair; + +int kvm_arch_ptp_init(void) +{ + long ret; + + if (!kvm_para_available()) + return -ENODEV; + + clock_pair_gpa = slow_virt_to_phys(&clock_pair); + hv_clock = pvclock_get_pvti_cpu0_va(); + if (!hv_clock) + return -ENODEV; + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP) + return -ENODEV; + + return 0; +} + +int kvm_arch_ptp_get_clock(struct timespec64 *ts) +{ + long ret; + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, + clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret != 0) { + pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); + return -EOPNOTSUPP; + } + + ts->tv_sec = clock_pair.sec; + ts->tv_nsec = clock_pair.nsec; + + return 0; +} + +int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec, + struct clocksource **cs) +{ + struct pvclock_vcpu_time_info *src; + unsigned int version; + long ret; + int cpu; + + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + + do { + /* + * We are using a TSC value read in the hosts + * kvm_hc_clock_pairing handling. + * So any changes to tsc_to_system_mul + * and tsc_shift or any other pvclock + * data invalidate that measurement. + */ + version = pvclock_read_begin(src); + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, + clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret != 0) { + pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); + return -EOPNOTSUPP; + } + tspec->tv_sec = clock_pair.sec; + tspec->tv_nsec = clock_pair.nsec; + *cycle = __pvclock_read_cycles(src, clock_pair.tsc); + } while (pvclock_read_retry(src, version)); + + *cs = &kvm_clock; + + return 0; +} diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 6524e1fe54d2e..79677ed29b1fe 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -482,6 +482,7 @@ config SCSI_ARCMSR source "drivers/scsi/esas2r/Kconfig" source "drivers/scsi/megaraid/Kconfig.megaraid" source "drivers/scsi/mpt3sas/Kconfig" +source "drivers/scsi/mpi3mr/Kconfig" source "drivers/scsi/smartpqi/Kconfig" source "drivers/scsi/ufs/Kconfig" diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index c00e3dd57990c..c5c1249c1166c 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_MEGARAID_LEGACY) += megaraid.o obj-$(CONFIG_MEGARAID_NEWGEN) += megaraid/ obj-$(CONFIG_MEGARAID_SAS) += megaraid/ obj-$(CONFIG_SCSI_MPT3SAS) += mpt3sas/ +obj-$(CONFIG_SCSI_MPI3MR) += mpi3mr/ obj-$(CONFIG_SCSI_UFSHCD) += ufs/ obj-$(CONFIG_SCSI_ACARD) += atp870u.o obj-$(CONFIG_SCSI_SUNESP) += esp_scsi.o sun_esp.o diff --git a/drivers/scsi/mpi3mr/GPL_license.txt b/drivers/scsi/mpi3mr/GPL_license.txt new file mode 100644 index 0000000000000..3912109b5cd65 --- /dev/null +++ b/drivers/scsi/mpi3mr/GPL_license.txt @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). 
+Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary.
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/drivers/scsi/mpi3mr/Kconfig b/drivers/scsi/mpi3mr/Kconfig new file mode 100644 index 0000000000000..d9846c03effc0 --- /dev/null +++ b/drivers/scsi/mpi3mr/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config SCSI_MPI3MR + tristate "Broadcom MPI3 Storage Controller Device Driver" + depends on PCI && SCSI + help + This driver supports Broadcom's Unified MPI3 based Storage & RAID Controllers. + diff --git a/drivers/scsi/mpi3mr/Makefile b/drivers/scsi/mpi3mr/Makefile new file mode 100644 index 0000000000000..06e44afa0b189 --- /dev/null +++ b/drivers/scsi/mpi3mr/Makefile @@ -0,0 +1,9 @@ +# mpi3mr makefile +obj-$(CONFIG_SCSI_MPI3MR) += mpi3mr.o +mpi3mr-y += mpi3mr_os.o \ + mpi3mr_fw.o \ + mpi3mr_app.o \ + mpi3mr_debugfs.o \ + mpi3mr_transport.o + + diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_api.h b/drivers/scsi/mpi3mr/mpi/mpi30_api.h new file mode 100644 index 0000000000000..1a13a68e44d6b --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_api.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2019-2022 Broadcom Inc. All rights reserved. + * + */ +#ifndef MPI30_API_H +#define MPI30_API_H 1 +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h new file mode 100644 index 0000000000000..2500844772156 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h @@ -0,0 +1,2258 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved.
+ */ +#ifndef MPI30_CNFG_H +#define MPI30_CNFG_H 1 +#define MPI3_CONFIG_PAGETYPE_IO_UNIT (0x00) +#define MPI3_CONFIG_PAGETYPE_MANUFACTURING (0x01) +#define MPI3_CONFIG_PAGETYPE_IOC (0x02) +#define MPI3_CONFIG_PAGETYPE_DRIVER (0x03) +#define MPI3_CONFIG_PAGETYPE_SECURITY (0x04) +#define MPI3_CONFIG_PAGETYPE_ENCLOSURE (0x11) +#define MPI3_CONFIG_PAGETYPE_DEVICE (0x12) +#define MPI3_CONFIG_PAGETYPE_SAS_IO_UNIT (0x20) +#define MPI3_CONFIG_PAGETYPE_SAS_EXPANDER (0x21) +#define MPI3_CONFIG_PAGETYPE_SAS_PHY (0x23) +#define MPI3_CONFIG_PAGETYPE_SAS_PORT (0x24) +#define MPI3_CONFIG_PAGETYPE_PCIE_IO_UNIT (0x30) +#define MPI3_CONFIG_PAGETYPE_PCIE_SWITCH (0x31) +#define MPI3_CONFIG_PAGETYPE_PCIE_LINK (0x33) +#define MPI3_CONFIG_PAGEATTR_MASK (0xf0) +#define MPI3_CONFIG_PAGEATTR_READ_ONLY (0x00) +#define MPI3_CONFIG_PAGEATTR_CHANGEABLE (0x10) +#define MPI3_CONFIG_PAGEATTR_PERSISTENT (0x20) +#define MPI3_CONFIG_ACTION_PAGE_HEADER (0x00) +#define MPI3_CONFIG_ACTION_READ_DEFAULT (0x01) +#define MPI3_CONFIG_ACTION_READ_CURRENT (0x02) +#define MPI3_CONFIG_ACTION_WRITE_CURRENT (0x03) +#define MPI3_CONFIG_ACTION_READ_PERSISTENT (0x04) +#define MPI3_CONFIG_ACTION_WRITE_PERSISTENT (0x05) +#define MPI3_DEVICE_PGAD_FORM_MASK (0xf0000000) +#define MPI3_DEVICE_PGAD_FORM_GET_NEXT_HANDLE (0x00000000) +#define MPI3_DEVICE_PGAD_FORM_HANDLE (0x20000000) +#define MPI3_DEVICE_PGAD_HANDLE_MASK (0x0000ffff) +#define MPI3_SAS_EXPAND_PGAD_FORM_MASK (0xf0000000) +#define MPI3_SAS_EXPAND_PGAD_FORM_GET_NEXT_HANDLE (0x00000000) +#define MPI3_SAS_EXPAND_PGAD_FORM_HANDLE_PHY_NUM (0x10000000) +#define MPI3_SAS_EXPAND_PGAD_FORM_HANDLE (0x20000000) +#define MPI3_SAS_EXPAND_PGAD_PHYNUM_MASK (0x00ff0000) +#define MPI3_SAS_EXPAND_PGAD_PHYNUM_SHIFT (16) +#define MPI3_SAS_EXPAND_PGAD_HANDLE_MASK (0x0000ffff) +#define MPI3_SAS_PHY_PGAD_FORM_MASK (0xf0000000) +#define MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER (0x00000000) +#define MPI3_SAS_PHY_PGAD_PHY_NUMBER_MASK (0x000000ff) +#define MPI3_SASPORT_PGAD_FORM_MASK (0xf0000000) +#define MPI3_SASPORT_PGAD_FORM_GET_NEXT_PORT (0x00000000) +#define MPI3_SASPORT_PGAD_FORM_PORT_NUM (0x10000000) +#define MPI3_SASPORT_PGAD_PORT_NUMBER_MASK (0x000000ff) +#define MPI3_ENCLOS_PGAD_FORM_MASK (0xf0000000) +#define MPI3_ENCLOS_PGAD_FORM_GET_NEXT_HANDLE (0x00000000) +#define MPI3_ENCLOS_PGAD_FORM_HANDLE (0x10000000) +#define MPI3_ENCLOS_PGAD_HANDLE_MASK (0x0000ffff) +#define MPI3_PCIE_SWITCH_PGAD_FORM_MASK (0xf0000000) +#define MPI3_PCIE_SWITCH_PGAD_FORM_GET_NEXT_HANDLE (0x00000000) +#define MPI3_PCIE_SWITCH_PGAD_FORM_HANDLE_PORT_NUM (0x10000000) +#define MPI3_PCIE_SWITCH_PGAD_FORM_HANDLE (0x20000000) +#define MPI3_PCIE_SWITCH_PGAD_PORTNUM_MASK (0x00ff0000) +#define MPI3_PCIE_SWITCH_PGAD_PORTNUM_SHIFT (16) +#define MPI3_PCIE_SWITCH_PGAD_HANDLE_MASK (0x0000ffff) +#define MPI3_PCIE_LINK_PGAD_FORM_MASK (0xf0000000) +#define MPI3_PCIE_LINK_PGAD_FORM_GET_NEXT_LINK (0x00000000) +#define MPI3_PCIE_LINK_PGAD_FORM_LINK_NUM (0x10000000) +#define MPI3_PCIE_LINK_PGAD_LINKNUM_MASK (0x000000ff) +#define MPI3_SECURITY_PGAD_FORM_MASK (0xf0000000) +#define MPI3_SECURITY_PGAD_FORM_GET_NEXT_SLOT (0x00000000) +#define MPI3_SECURITY_PGAD_FORM_SOT_NUM (0x10000000) +#define MPI3_SECURITY_PGAD_SLOT_GROUP_MASK (0x0000ff00) +#define MPI3_SECURITY_PGAD_SLOT_MASK (0x000000ff) +struct mpi3_config_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 page_version; + u8 page_number; + u8 page_type; + u8 action; + __le32 
page_address; + __le16 page_length; + __le16 reserved16; + __le32 reserved18[2]; + union mpi3_sge_union sgl; +}; +struct mpi3_config_page_header { + u8 page_version; + u8 reserved01; + u8 page_number; + u8 page_attribute; + __le16 page_length; + u8 page_type; + u8 reserved07; +}; +#define MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK (0xf0) +#define MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT (4) +#define MPI3_SAS_NEG_LINK_RATE_PHYSICAL_MASK (0x0f) +#define MPI3_SAS_NEG_LINK_RATE_UNKNOWN_LINK_RATE (0x00) +#define MPI3_SAS_NEG_LINK_RATE_PHY_DISABLED (0x01) +#define MPI3_SAS_NEG_LINK_RATE_NEGOTIATION_FAILED (0x02) +#define MPI3_SAS_NEG_LINK_RATE_SATA_OOB_COMPLETE (0x03) +#define MPI3_SAS_NEG_LINK_RATE_PORT_SELECTOR (0x04) +#define MPI3_SAS_NEG_LINK_RATE_SMP_RESET_IN_PROGRESS (0x05) +#define MPI3_SAS_NEG_LINK_RATE_UNSUPPORTED_PHY (0x06) +#define MPI3_SAS_NEG_LINK_RATE_1_5 (0x08) +#define MPI3_SAS_NEG_LINK_RATE_3_0 (0x09) +#define MPI3_SAS_NEG_LINK_RATE_6_0 (0x0a) +#define MPI3_SAS_NEG_LINK_RATE_12_0 (0x0b) +#define MPI3_SAS_NEG_LINK_RATE_22_5 (0x0c) +#define MPI3_SAS_APHYINFO_INSIDE_ZPSDS_PERSISTENT (0x00000040) +#define MPI3_SAS_APHYINFO_REQUESTED_INSIDE_ZPSDS (0x00000020) +#define MPI3_SAS_APHYINFO_BREAK_REPLY_CAPABLE (0x00000010) +#define MPI3_SAS_APHYINFO_REASON_MASK (0x0000000f) +#define MPI3_SAS_APHYINFO_REASON_UNKNOWN (0x00000000) +#define MPI3_SAS_APHYINFO_REASON_POWER_ON (0x00000001) +#define MPI3_SAS_APHYINFO_REASON_HARD_RESET (0x00000002) +#define MPI3_SAS_APHYINFO_REASON_SMP_PHY_CONTROL (0x00000003) +#define MPI3_SAS_APHYINFO_REASON_LOSS_OF_SYNC (0x00000004) +#define MPI3_SAS_APHYINFO_REASON_MULTIPLEXING_SEQ (0x00000005) +#define MPI3_SAS_APHYINFO_REASON_IT_NEXUS_LOSS_TIMER (0x00000006) +#define MPI3_SAS_APHYINFO_REASON_BREAK_TIMEOUT (0x00000007) +#define MPI3_SAS_APHYINFO_REASON_PHY_TEST_STOPPED (0x00000008) +#define MPI3_SAS_APHYINFO_REASON_EXP_REDUCED_FUNC (0x00000009) +#define MPI3_SAS_PHYINFO_STATUS_MASK (0xc0000000) +#define MPI3_SAS_PHYINFO_STATUS_SHIFT (30) +#define MPI3_SAS_PHYINFO_STATUS_ACCESSIBLE (0x00000000) +#define MPI3_SAS_PHYINFO_STATUS_NOT_EXIST (0x40000000) +#define MPI3_SAS_PHYINFO_STATUS_VACANT (0x80000000) +#define MPI3_SAS_PHYINFO_PHY_POWER_CONDITION_MASK (0x18000000) +#define MPI3_SAS_PHYINFO_PHY_POWER_CONDITION_ACTIVE (0x00000000) +#define MPI3_SAS_PHYINFO_PHY_POWER_CONDITION_PARTIAL (0x08000000) +#define MPI3_SAS_PHYINFO_PHY_POWER_CONDITION_SLUMBER (0x10000000) +#define MPI3_SAS_PHYINFO_REQUESTED_INSIDE_ZPSDS_CHANGED_MASK (0x04000000) +#define MPI3_SAS_PHYINFO_REQUESTED_INSIDE_ZPSDS_CHANGED_SHIFT (26) +#define MPI3_SAS_PHYINFO_INSIDE_ZPSDS_PERSISTENT_MASK (0x02000000) +#define MPI3_SAS_PHYINFO_INSIDE_ZPSDS_PERSISTENT_SHIFT (25) +#define MPI3_SAS_PHYINFO_REQUESTED_INSIDE_ZPSDS_MASK (0x01000000) +#define MPI3_SAS_PHYINFO_REQUESTED_INSIDE_ZPSDS_SHIFT (24) +#define MPI3_SAS_PHYINFO_ZONE_GROUP_PERSISTENT (0x00400000) +#define MPI3_SAS_PHYINFO_INSIDE_ZPSDS_WITHIN (0x00200000) +#define MPI3_SAS_PHYINFO_ZONING_ENABLED (0x00100000) +#define MPI3_SAS_PHYINFO_REASON_MASK (0x000f0000) +#define MPI3_SAS_PHYINFO_REASON_UNKNOWN (0x00000000) +#define MPI3_SAS_PHYINFO_REASON_POWER_ON (0x00010000) +#define MPI3_SAS_PHYINFO_REASON_HARD_RESET (0x00020000) +#define MPI3_SAS_PHYINFO_REASON_SMP_PHY_CONTROL (0x00030000) +#define MPI3_SAS_PHYINFO_REASON_LOSS_OF_SYNC (0x00040000) +#define MPI3_SAS_PHYINFO_REASON_MULTIPLEXING_SEQ (0x00050000) +#define MPI3_SAS_PHYINFO_REASON_IT_NEXUS_LOSS_TIMER (0x00060000) +#define MPI3_SAS_PHYINFO_REASON_BREAK_TIMEOUT (0x00070000) +#define 
MPI3_SAS_PHYINFO_REASON_PHY_TEST_STOPPED (0x00080000) +#define MPI3_SAS_PHYINFO_REASON_EXP_REDUCED_FUNC (0x00090000) +#define MPI3_SAS_PHYINFO_SATA_PORT_ACTIVE (0x00004000) +#define MPI3_SAS_PHYINFO_SATA_PORT_SELECTOR_PRESENT (0x00002000) +#define MPI3_SAS_PHYINFO_VIRTUAL_PHY (0x00001000) +#define MPI3_SAS_PHYINFO_PARTIAL_PATHWAY_TIME_MASK (0x00000f00) +#define MPI3_SAS_PHYINFO_PARTIAL_PATHWAY_TIME_SHIFT (8) +#define MPI3_SAS_PHYINFO_ROUTING_ATTRIBUTE_MASK (0x000000f0) +#define MPI3_SAS_PHYINFO_ROUTING_ATTRIBUTE_DIRECT (0x00000000) +#define MPI3_SAS_PHYINFO_ROUTING_ATTRIBUTE_SUBTRACTIVE (0x00000010) +#define MPI3_SAS_PHYINFO_ROUTING_ATTRIBUTE_TABLE (0x00000020) +#define MPI3_SAS_PRATE_MAX_RATE_MASK (0xf0) +#define MPI3_SAS_PRATE_MAX_RATE_NOT_PROGRAMMABLE (0x00) +#define MPI3_SAS_PRATE_MAX_RATE_1_5 (0x80) +#define MPI3_SAS_PRATE_MAX_RATE_3_0 (0x90) +#define MPI3_SAS_PRATE_MAX_RATE_6_0 (0xa0) +#define MPI3_SAS_PRATE_MAX_RATE_12_0 (0xb0) +#define MPI3_SAS_PRATE_MAX_RATE_22_5 (0xc0) +#define MPI3_SAS_PRATE_MIN_RATE_MASK (0x0f) +#define MPI3_SAS_PRATE_MIN_RATE_NOT_PROGRAMMABLE (0x00) +#define MPI3_SAS_PRATE_MIN_RATE_1_5 (0x08) +#define MPI3_SAS_PRATE_MIN_RATE_3_0 (0x09) +#define MPI3_SAS_PRATE_MIN_RATE_6_0 (0x0a) +#define MPI3_SAS_PRATE_MIN_RATE_12_0 (0x0b) +#define MPI3_SAS_PRATE_MIN_RATE_22_5 (0x0c) +#define MPI3_SAS_HWRATE_MAX_RATE_MASK (0xf0) +#define MPI3_SAS_HWRATE_MAX_RATE_1_5 (0x80) +#define MPI3_SAS_HWRATE_MAX_RATE_3_0 (0x90) +#define MPI3_SAS_HWRATE_MAX_RATE_6_0 (0xa0) +#define MPI3_SAS_HWRATE_MAX_RATE_12_0 (0xb0) +#define MPI3_SAS_HWRATE_MAX_RATE_22_5 (0xc0) +#define MPI3_SAS_HWRATE_MIN_RATE_MASK (0x0f) +#define MPI3_SAS_HWRATE_MIN_RATE_1_5 (0x08) +#define MPI3_SAS_HWRATE_MIN_RATE_3_0 (0x09) +#define MPI3_SAS_HWRATE_MIN_RATE_6_0 (0x0a) +#define MPI3_SAS_HWRATE_MIN_RATE_12_0 (0x0b) +#define MPI3_SAS_HWRATE_MIN_RATE_22_5 (0x0c) +#define MPI3_SLOT_INVALID (0xffff) +#define MPI3_SLOT_INDEX_INVALID (0xffff) +#define MPI3_LINK_CHANGE_COUNT_INVALID (0xffff) +#define MPI3_RATE_CHANGE_COUNT_INVALID (0xffff) +#define MPI3_TEMP_SENSOR_LOCATION_INTERNAL (0x0) +#define MPI3_TEMP_SENSOR_LOCATION_INLET (0x1) +#define MPI3_TEMP_SENSOR_LOCATION_OUTLET (0x2) +#define MPI3_TEMP_SENSOR_LOCATION_DRAM (0x3) +#define MPI3_MFGPAGE_VENDORID_BROADCOM (0x1000) +#define MPI3_MFGPAGE_DEVID_SAS4116 (0x00a5) +struct mpi3_man_page0 { + struct mpi3_config_page_header header; + u8 chip_revision[8]; + u8 chip_name[32]; + u8 board_name[32]; + u8 board_assembly[32]; + u8 board_tracer_number[32]; + __le32 board_power; + __le32 reserved94; + __le32 reserved98; + u8 oem; + u8 profile_identifier; + __le16 flags; + u8 board_mfg_day; + u8 board_mfg_month; + __le16 board_mfg_year; + u8 board_rework_day; + u8 board_rework_month; + __le16 board_rework_year; + u8 board_revision[8]; + u8 e_pack_fru[16]; + u8 product_name[256]; +}; +#define MPI3_MAN0_PAGEVERSION (0x00) +#define MPI3_MAN0_FLAGS_SWITCH_PRESENT (0x0002) +#define MPI3_MAN0_FLAGS_EXPANDER_PRESENT (0x0001) +#define MPI3_MAN1_VPD_SIZE (512) +struct mpi3_man_page1 { + struct mpi3_config_page_header header; + __le32 reserved08[2]; + u8 vpd[MPI3_MAN1_VPD_SIZE]; +}; +#define MPI3_MAN1_PAGEVERSION (0x00) +struct mpi3_man_page2 { + struct mpi3_config_page_header header; + u8 flags; + u8 reserved09[3]; + __le32 reserved0c[3]; + u8 oem_board_tracer_number[32]; +}; +#define MPI3_MAN2_PAGEVERSION (0x00) +#define MPI3_MAN2_FLAGS_TRACER_PRESENT (0x01) +struct mpi3_man5_phy_entry { + __le64 ioc_wwid; + __le64 device_name; + __le64 sata_wwid; +}; +#ifndef MPI3_MAN5_PHY_MAX +#define 
MPI3_MAN5_PHY_MAX (1) +#endif +struct mpi3_man_page5 { + struct mpi3_config_page_header header; + u8 num_phys; + u8 reserved09[3]; + __le32 reserved0c; + struct mpi3_man5_phy_entry phy[MPI3_MAN5_PHY_MAX]; +}; +#define MPI3_MAN5_PAGEVERSION (0x00) +struct mpi3_man6_gpio_entry { + u8 function_code; + u8 function_flags; + __le16 flags; + u8 param1; + u8 param2; + __le16 reserved06; + __le32 param3; +}; +#define MPI3_MAN6_GPIO_FUNCTION_GENERIC (0x00) +#define MPI3_MAN6_GPIO_FUNCTION_ALTERNATE (0x01) +#define MPI3_MAN6_GPIO_FUNCTION_EXT_INTERRUPT (0x02) +#define MPI3_MAN6_GPIO_FUNCTION_GLOBAL_ACTIVITY (0x03) +#define MPI3_MAN6_GPIO_FUNCTION_OVER_TEMPERATURE (0x04) +#define MPI3_MAN6_GPIO_FUNCTION_PORT_STATUS_GREEN (0x05) +#define MPI3_MAN6_GPIO_FUNCTION_PORT_STATUS_YELLOW (0x06) +#define MPI3_MAN6_GPIO_FUNCTION_CABLE_MANAGEMENT (0x07) +#define MPI3_MAN6_GPIO_FUNCTION_BKPLANE_MGMT_TYPE (0x08) +#define MPI3_MAN6_GPIO_FUNCTION_ISTWI_RESET (0x0a) +#define MPI3_MAN6_GPIO_FUNCTION_BACKEND_PCIE_RESET (0x0b) +#define MPI3_MAN6_GPIO_FUNCTION_GLOBAL_FAULT (0x0c) +#define MPI3_MAN6_GPIO_FUNCTION_PBLP_STATUS_CHANGE (0x0d) +#define MPI3_MAN6_GPIO_FUNCTION_EPACK_ONLINE (0x0e) +#define MPI3_MAN6_GPIO_FUNCTION_EPACK_FAULT (0x0f) +#define MPI3_MAN6_GPIO_FUNCTION_CTRL_TYPE (0x10) +#define MPI3_MAN6_GPIO_FUNCTION_LICENSE (0x11) +#define MPI3_MAN6_GPIO_FUNCTION_REFCLK_CONTROL (0x12) +#define MPI3_MAN6_GPIO_FUNCTION_BACKEND_PCIE_RESET_CLAMP (0x13) +#define MPI3_MAN6_GPIO_FUNCTION_AUXILIARY_POWER (0x14) +#define MPI3_MAN6_GPIO_FUNCTION_RAID_DATA_CACHE_DIRTY (0x15) +#define MPI3_MAN6_GPIO_FUNCTION_BOARD_FAN_CONTROL (0x16) +#define MPI3_MAN6_GPIO_FUNCTION_BOARD_FAN_FAULT (0x17) +#define MPI3_MAN6_GPIO_FUNCTION_POWER_BRAKE (0x18) +#define MPI3_MAN6_GPIO_ISTWI_RESET_FUNCTIONFLAGS_DEVSELECT_MASK (0x01) +#define MPI3_MAN6_GPIO_ISTWI_RESET_FUNCTIONFLAGS_DEVSELECT_ISTWI (0x00) +#define MPI3_MAN6_GPIO_ISTWI_RESET_FUNCTIONFLAGS_DEVSELECT_RECEPTACLEID (0x01) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_SOURCE_MASK (0xf0) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_SOURCE_GENERIC (0x00) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_SOURCE_CABLE_MGMT (0x10) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_SOURCE_ACTIVE_CABLE_OVERCURRENT (0x20) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_MASK (0x01) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_EDGE (0x00) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_LEVEL (0x01) +#define MPI3_MAN6_GPIO_PORT_GREEN_PARAM1_PHY_STATUS_ALL_UP (0x00) +#define MPI3_MAN6_GPIO_PORT_GREEN_PARAM1_PHY_STATUS_ONE_OR_MORE_UP (0x01) +#define MPI3_MAN6_GPIO_CABLE_MGMT_PARAM1_INTERFACE_MODULE_PRESENT (0x00) +#define MPI3_MAN6_GPIO_CABLE_MGMT_PARAM1_INTERFACE_ACTIVE_CABLE_ENABLE (0x01) +#define MPI3_MAN6_GPIO_CABLE_MGMT_PARAM1_INTERFACE_CABLE_MGMT_ENABLE (0x02) +#define MPI3_MAN6_GPIO_LICENSE_PARAM1_TYPE_IBUTTON (0x00) +#define MPI3_MAN6_GPIO_FLAGS_SLEW_RATE_MASK (0x0100) +#define MPI3_MAN6_GPIO_FLAGS_SLEW_RATE_FAST_EDGE (0x0100) +#define MPI3_MAN6_GPIO_FLAGS_SLEW_RATE_SLOW_EDGE (0x0000) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_MASK (0x00c0) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_100OHM (0x0000) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_66OHM (0x0040) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_50OHM (0x0080) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_33OHM (0x00c0) +#define MPI3_MAN6_GPIO_FLAGS_ALT_DATA_SEL_MASK (0x0030) +#define MPI3_MAN6_GPIO_FLAGS_ALT_DATA_SEL_SHIFT (4) +#define MPI3_MAN6_GPIO_FLAGS_ACTIVE_HIGH (0x0008) +#define MPI3_MAN6_GPIO_FLAGS_BI_DIR_ENABLED (0x0004) 
+#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_MASK (0x0003) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_INPUT (0x0000) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_OPEN_DRAIN_OUTPUT (0x0001) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_OPEN_SOURCE_OUTPUT (0x0002) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_PUSH_PULL_OUTPUT (0x0003) +#ifndef MPI3_MAN6_GPIO_MAX +#define MPI3_MAN6_GPIO_MAX (1) +#endif +struct mpi3_man_page6 { + struct mpi3_config_page_header header; + __le16 flags; + __le16 reserved0a; + u8 num_gpio; + u8 reserved0d[3]; + struct mpi3_man6_gpio_entry gpio[MPI3_MAN6_GPIO_MAX]; +}; +#define MPI3_MAN6_PAGEVERSION (0x00) +#define MPI3_MAN6_FLAGS_HEARTBEAT_LED_DISABLED (0x0001) +struct mpi3_man7_receptacle_info { + __le32 name[4]; + u8 location; + u8 connector_type; + u8 ped_clk; + u8 connector_id; + __le32 reserved14; +}; +#define MPI3_MAN7_LOCATION_UNKNOWN (0x00) +#define MPI3_MAN7_LOCATION_INTERNAL (0x01) +#define MPI3_MAN7_LOCATION_EXTERNAL (0x02) +#define MPI3_MAN7_LOCATION_VIRTUAL (0x03) +#define MPI3_MAN7_PEDCLK_ROUTING_MASK (0x10) +#define MPI3_MAN7_PEDCLK_ROUTING_DIRECT (0x00) +#define MPI3_MAN7_PEDCLK_ROUTING_CLOCK_BUFFER (0x10) +#define MPI3_MAN7_PEDCLK_ID_MASK (0x0f) +#ifndef MPI3_MAN7_RECEPTACLE_INFO_MAX +#define MPI3_MAN7_RECEPTACLE_INFO_MAX (1) +#endif +struct mpi3_man_page7 { + struct mpi3_config_page_header header; + __le32 flags; + u8 num_receptacles; + u8 reserved0d[3]; + __le32 enclosure_name[4]; + struct mpi3_man7_receptacle_info receptacle_info[MPI3_MAN7_RECEPTACLE_INFO_MAX]; +}; +#define MPI3_MAN7_PAGEVERSION (0x00) +#define MPI3_MAN7_FLAGS_BASE_ENCLOSURE_LEVEL_MASK (0x01) +#define MPI3_MAN7_FLAGS_BASE_ENCLOSURE_LEVEL_0 (0x00) +#define MPI3_MAN7_FLAGS_BASE_ENCLOSURE_LEVEL_1 (0x01) +struct mpi3_man8_phy_info { + u8 receptacle_id; + u8 connector_lane; + __le16 reserved02; + __le16 slotx1; + __le16 slotx2; + __le16 slotx4; + __le16 reserved0a; + __le32 reserved0c; +}; +#define MPI3_MAN8_PHY_INFO_RECEPTACLE_ID_HOST_PHY (0xff) +#ifndef MPI3_MAN8_PHY_INFO_MAX +#define MPI3_MAN8_PHY_INFO_MAX (1) +#endif +struct mpi3_man_page8 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phys; + u8 reserved0d[3]; + struct mpi3_man8_phy_info phy_info[MPI3_MAN8_PHY_INFO_MAX]; +}; +#define MPI3_MAN8_PAGEVERSION (0x00) +struct mpi3_man9_rsrc_entry { + __le32 maximum; + __le32 decrement; + __le32 minimum; + __le32 actual; +}; +enum mpi3_man9_resources { + MPI3_MAN9_RSRC_OUTSTANDING_REQS = 0, + MPI3_MAN9_RSRC_TARGET_CMDS = 1, + MPI3_MAN9_RSRC_RESERVED02 = 2, + MPI3_MAN9_RSRC_NVME = 3, + MPI3_MAN9_RSRC_INITIATORS = 4, + MPI3_MAN9_RSRC_VDS = 5, + MPI3_MAN9_RSRC_ENCLOSURES = 6, + MPI3_MAN9_RSRC_ENCLOSURE_PHYS = 7, + MPI3_MAN9_RSRC_EXPANDERS = 8, + MPI3_MAN9_RSRC_PCIE_SWITCHES = 9, + MPI3_MAN9_RSRC_RESERVED10 = 10, + MPI3_MAN9_RSRC_HOST_PD_DRIVES = 11, + MPI3_MAN9_RSRC_ADV_HOST_PD_DRIVES = 12, + MPI3_MAN9_RSRC_RAID_PD_DRIVES = 13, + MPI3_MAN9_RSRC_DRV_DIAG_BUF = 14, + MPI3_MAN9_RSRC_NAMESPACE_COUNT = 15, + MPI3_MAN9_RSRC_NUM_RESOURCES +}; +#define MPI3_MAN9_MIN_OUTSTANDING_REQS (1) +#define MPI3_MAN9_MAX_OUTSTANDING_REQS (65000) +#define MPI3_MAN9_MIN_TARGET_CMDS (0) +#define MPI3_MAN9_MAX_TARGET_CMDS (65535) +#define MPI3_MAN9_MIN_NVME_TARGETS (0) +#define MPI3_MAN9_MIN_INITIATORS (0) +#define MPI3_MAN9_MIN_VDS (0) +#define MPI3_MAN9_MIN_ENCLOSURES (1) +#define MPI3_MAN9_MAX_ENCLOSURES (65535) +#define MPI3_MAN9_MIN_ENCLOSURE_PHYS (0) +#define MPI3_MAN9_MIN_EXPANDERS (0) +#define MPI3_MAN9_MAX_EXPANDERS (65535) +#define MPI3_MAN9_MIN_PCIE_SWITCHES (0) +#define MPI3_MAN9_MIN_HOST_PD_DRIVES 
(0) +#define MPI3_MAN9_ADV_HOST_PD_DRIVES (0) +#define MPI3_MAN9_RAID_PD_DRIVES (0) +#define MPI3_MAN9_DRIVER_DIAG_BUFFER (0) +#define MPI3_MAN9_MIN_NAMESPACE_COUNT (1) +#define MPI3_MAN9_MIN_EXPANDERS (0) +#define MPI3_MAN9_MAX_EXPANDERS (65535) +struct mpi3_man_page9 { + struct mpi3_config_page_header header; + u8 num_resources; + u8 reserved09; + __le16 reserved0a; + __le32 reserved0c; + __le32 reserved10; + __le32 reserved14; + __le32 reserved18; + __le32 reserved1c; + struct mpi3_man9_rsrc_entry resource[MPI3_MAN9_RSRC_NUM_RESOURCES]; +}; +#define MPI3_MAN9_PAGEVERSION (0x00) +struct mpi3_man10_istwi_ctrlr_entry { + __le16 slave_address; + __le16 flags; + u8 scl_low_override; + u8 scl_high_override; + __le16 reserved06; +}; +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_BUS_SPEED_MASK (0x000c) +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_BUS_SPEED_100K (0x0000) +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_BUS_SPEED_400K (0x0004) +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_SLAVE_ENABLED (0x0002) +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_MASTER_ENABLED (0x0001) +#ifndef MPI3_MAN10_ISTWI_CTRLR_MAX +#define MPI3_MAN10_ISTWI_CTRLR_MAX (1) +#endif +struct mpi3_man_page10 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_istwi_ctrl; + u8 reserved0d[3]; + struct mpi3_man10_istwi_ctrlr_entry istwi_controller[MPI3_MAN10_ISTWI_CTRLR_MAX]; +}; +#define MPI3_MAN10_PAGEVERSION (0x00) +struct mpi3_man11_mux_device_format { + u8 max_channel; + u8 reserved01[3]; + __le32 reserved04; +}; +struct mpi3_man11_temp_sensor_device_format { + u8 type; + u8 reserved01[3]; + u8 temp_channel[4]; +}; +#define MPI3_MAN11_TEMP_SENSOR_TYPE_MAX6654 (0x00) +#define MPI3_MAN11_TEMP_SENSOR_TYPE_EMC1442 (0x01) +#define MPI3_MAN11_TEMP_SENSOR_TYPE_ADT7476 (0x02) +#define MPI3_MAN11_TEMP_SENSOR_TYPE_SE97B (0x03) +#define MPI3_MAN11_TEMP_SENSOR_CHANNEL_LOCATION_MASK (0xe0) +#define MPI3_MAN11_TEMP_SENSOR_CHANNEL_LOCATION_SHIFT (5) +#define MPI3_MAN11_TEMP_SENSOR_CHANNEL_ENABLED (0x01) +struct mpi3_man11_seeprom_device_format { + u8 size; + u8 page_write_size; + __le16 reserved02; + __le32 reserved04; +}; +#define MPI3_MAN11_SEEPROM_SIZE_1KBITS (0x01) +#define MPI3_MAN11_SEEPROM_SIZE_2KBITS (0x02) +#define MPI3_MAN11_SEEPROM_SIZE_4KBITS (0x03) +#define MPI3_MAN11_SEEPROM_SIZE_8KBITS (0x04) +#define MPI3_MAN11_SEEPROM_SIZE_16KBITS (0x05) +#define MPI3_MAN11_SEEPROM_SIZE_32KBITS (0x06) +#define MPI3_MAN11_SEEPROM_SIZE_64KBITS (0x07) +#define MPI3_MAN11_SEEPROM_SIZE_128KBITS (0x08) +struct mpi3_man11_ddr_spd_device_format { + u8 channel; + u8 reserved01[3]; + __le32 reserved04; +}; +struct mpi3_man11_cable_mgmt_device_format { + u8 type; + u8 receptacle_id; + __le16 reserved02; + __le32 reserved04; +}; +#define MPI3_MAN11_CABLE_MGMT_TYPE_SFF_8636 (0x00) +struct mpi3_man11_bkplane_spec_ubm_format { + __le16 flags; + __le16 reserved02; +}; +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_REFCLK_POLICY_ALWAYS_ENABLED (0x0200) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_FORCE_POLLING (0x0100) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_MAX_FRU_MASK (0x00f0) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_MAX_FRU_SHIFT (4) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_POLL_INTERVAL_MASK (0x000f) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_POLL_INTERVAL_SHIFT (0) +struct mpi3_man11_bkplane_spec_non_ubm_format { + __le16 flags; + u8 reserved02; + u8 type; +}; +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_GROUP_MASK (0xf000) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_GROUP_SHIFT (12) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_REFCLK_POLICY_ALWAYS_ENABLED (0x0200) +#define 
MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_LINKWIDTH_MASK (0x00c0) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_LINKWIDTH_4 (0x0000) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_LINKWIDTH_2 (0x0040) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_LINKWIDTH_1 (0x0080) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_PRESENCE_DETECT_MASK (0x0030) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_PRESENCE_DETECT_GPIO (0x0000) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_PRESENCE_DETECT_REG (0x0010) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_POLL_INTERVAL_MASK (0x000f) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_POLL_INTERVAL_SHIFT (0) +#define MPI3_MAN11_BKPLANE_NON_UBM_TYPE_VPP (0x00) +union mpi3_man11_bkplane_spec_format { + struct mpi3_man11_bkplane_spec_ubm_format ubm; + struct mpi3_man11_bkplane_spec_non_ubm_format non_ubm; +}; +struct mpi3_man11_bkplane_mgmt_device_format { + u8 type; + u8 receptacle_id; + u8 reset_info; + u8 reserved03; + union mpi3_man11_bkplane_spec_format backplane_mgmt_specific; +}; +#define MPI3_MAN11_BKPLANE_MGMT_TYPE_UBM (0x00) +#define MPI3_MAN11_BKPLANE_MGMT_TYPE_NON_UBM (0x01) +#define MPI3_MAN11_BACKPLANE_RESETINFO_ASSERT_TIME_MASK (0xf0) +#define MPI3_MAN11_BACKPLANE_RESETINFO_ASSERT_TIME_SHIFT (4) +#define MPI3_MAN11_BACKPLANE_RESETINFO_READY_TIME_MASK (0x0f) +#define MPI3_MAN11_BACKPLANE_RESETINFO_READY_TIME_SHIFT (0) +struct mpi3_man11_gas_gauge_device_format { + u8 type; + u8 reserved01[3]; + __le32 reserved04; +}; +#define MPI3_MAN11_GAS_GAUGE_TYPE_STANDARD (0x00) +struct mpi3_man11_mgmt_ctrlr_device_format { + __le32 reserved00; + __le32 reserved04; +}; +struct mpi3_man11_board_fan_device_format { + u8 flags; + u8 reserved01; + u8 min_fan_speed; + u8 max_fan_speed; + __le32 reserved04; +}; +#define MPI3_MAN11_BOARD_FAN_FLAGS_FAN_CTRLR_TYPE_MASK (0x07) +#define MPI3_MAN11_BOARD_FAN_FLAGS_FAN_CTRLR_TYPE_AMC6821 (0x00) +union mpi3_man11_device_specific_format { + struct mpi3_man11_mux_device_format mux; + struct mpi3_man11_temp_sensor_device_format temp_sensor; + struct mpi3_man11_seeprom_device_format seeprom; + struct mpi3_man11_ddr_spd_device_format ddr_spd; + struct mpi3_man11_cable_mgmt_device_format cable_mgmt; + struct mpi3_man11_bkplane_mgmt_device_format bkplane_mgmt; + struct mpi3_man11_gas_gauge_device_format gas_gauge; + struct mpi3_man11_mgmt_ctrlr_device_format mgmt_controller; + struct mpi3_man11_board_fan_device_format board_fan; + __le32 words[2]; +}; +struct mpi3_man11_istwi_device_format { + u8 device_type; + u8 controller; + u8 reserved02; + u8 flags; + __le16 device_address; + u8 mux_channel; + u8 mux_index; + union mpi3_man11_device_specific_format device_specific; +}; +#define MPI3_MAN11_ISTWI_DEVTYPE_MUX (0x00) +#define MPI3_MAN11_ISTWI_DEVTYPE_TEMP_SENSOR (0x01) +#define MPI3_MAN11_ISTWI_DEVTYPE_SEEPROM (0x02) +#define MPI3_MAN11_ISTWI_DEVTYPE_DDR_SPD (0x03) +#define MPI3_MAN11_ISTWI_DEVTYPE_CABLE_MGMT (0x04) +#define MPI3_MAN11_ISTWI_DEVTYPE_BACKPLANE_MGMT (0x05) +#define MPI3_MAN11_ISTWI_DEVTYPE_GAS_GAUGE (0x06) +#define MPI3_MAN11_ISTWI_DEVTYPE_MGMT_CONTROLLER (0x07) +#define MPI3_MAN11_ISTWI_DEVTYPE_BOARD_FAN (0x08) +#define MPI3_MAN11_ISTWI_FLAGS_MUX_PRESENT (0x01) +#ifndef MPI3_MAN11_ISTWI_DEVICE_MAX +#define MPI3_MAN11_ISTWI_DEVICE_MAX (1) +#endif +struct mpi3_man_page11 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_istwi_dev; + u8 reserved0d[3]; + struct mpi3_man11_istwi_device_format istwi_device[MPI3_MAN11_ISTWI_DEVICE_MAX]; +}; +#define MPI3_MAN11_PAGEVERSION (0x00) +#ifndef MPI3_MAN12_NUM_SGPIO_MAX +#define MPI3_MAN12_NUM_SGPIO_MAX 
(1) +#endif +struct mpi3_man12_sgpio_info { + u8 slot_count; + u8 reserved01[3]; + __le32 reserved04; + u8 phy_order[32]; +}; +struct mpi3_man_page12 { + struct mpi3_config_page_header header; + __le32 flags; + __le32 s_clock_freq; + __le32 activity_modulation; + u8 num_sgpio; + u8 reserved15[3]; + __le32 reserved18; + __le32 reserved1c; + __le32 pattern[8]; + struct mpi3_man12_sgpio_info sgpio_info[MPI3_MAN12_NUM_SGPIO_MAX]; +}; +#define MPI3_MAN12_PAGEVERSION (0x00) +#define MPI3_MAN12_FLAGS_ERROR_PRESENCE_ENABLED (0x0400) +#define MPI3_MAN12_FLAGS_ACTIVITY_INVERT_ENABLED (0x0200) +#define MPI3_MAN12_FLAGS_GROUP_ID_DISABLED (0x0100) +#define MPI3_MAN12_FLAGS_SIO_CLK_FILTER_ENABLED (0x0004) +#define MPI3_MAN12_FLAGS_SCLOCK_SLOAD_TYPE_MASK (0x0002) +#define MPI3_MAN12_FLAGS_SCLOCK_SLOAD_TYPE_PUSH_PULL (0x0000) +#define MPI3_MAN12_FLAGS_SCLOCK_SLOAD_TYPE_OPEN_DRAIN (0x0002) +#define MPI3_MAN12_FLAGS_SDATAOUT_TYPE_MASK (0x0001) +#define MPI3_MAN12_FLAGS_SDATAOUT_TYPE_PUSH_PULL (0x0000) +#define MPI3_MAN12_FLAGS_SDATAOUT_TYPE_OPEN_DRAIN (0x0001) +#define MPI3_MAN12_SIO_CLK_FREQ_MIN (32) +#define MPI3_MAN12_SIO_CLK_FREQ_MAX (100000) +#define MPI3_MAN12_ACTIVITY_MODULATION_FORCE_OFF_MASK (0x0000f000) +#define MPI3_MAN12_ACTIVITY_MODULATION_FORCE_OFF_SHIFT (12) +#define MPI3_MAN12_ACTIVITY_MODULATION_MAX_ON_MASK (0x00000f00) +#define MPI3_MAN12_ACTIVITY_MODULATION_MAX_ON_SHIFT (8) +#define MPI3_MAN12_ACTIVITY_MODULATION_STRETCH_OFF_MASK (0x000000f0) +#define MPI3_MAN12_ACTIVITY_MODULATION_STRETCH_OFF_SHIFT (4) +#define MPI3_MAN12_ACTIVITY_MODULATION_STRETCH_ON_MASK (0x0000000f) +#define MPI3_MAN12_ACTIVITY_MODULATION_STRETCH_ON_SHIFT (0) +#define MPI3_MAN12_PATTERN_RATE_MASK (0xe0000000) +#define MPI3_MAN12_PATTERN_RATE_2_HZ (0x00000000) +#define MPI3_MAN12_PATTERN_RATE_4_HZ (0x20000000) +#define MPI3_MAN12_PATTERN_RATE_8_HZ (0x40000000) +#define MPI3_MAN12_PATTERN_RATE_16_HZ (0x60000000) +#define MPI3_MAN12_PATTERN_RATE_10_HZ (0x80000000) +#define MPI3_MAN12_PATTERN_RATE_20_HZ (0xa0000000) +#define MPI3_MAN12_PATTERN_RATE_40_HZ (0xc0000000) +#define MPI3_MAN12_PATTERN_LENGTH_MASK (0x1f000000) +#define MPI3_MAN12_PATTERN_LENGTH_SHIFT (24) +#define MPI3_MAN12_PATTERN_BIT_PATTERN_MASK (0x00ffffff) +#define MPI3_MAN12_PATTERN_BIT_PATTERN_SHIFT (0) +#ifndef MPI3_MAN13_NUM_TRANSLATION_MAX +#define MPI3_MAN13_NUM_TRANSLATION_MAX (1) +#endif +struct mpi3_man13_translation_info { + __le32 slot_status; + __le32 mask; + u8 activity; + u8 locate; + u8 error; + u8 reserved0b; +}; +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_FAULT (0x20000000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_DEVICE_OFF (0x10000000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_DEVICE_ACTIVITY (0x00800000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_DO_NOT_REMOVE (0x00400000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_DEVICE_MISSING (0x00100000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_INSERT (0x00080000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_REMOVAL (0x00040000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_IDENTIFY (0x00020000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_OK (0x00008000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_RESERVED_DEVICE (0x00004000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_HOT_SPARE (0x00002000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_CONSISTENCY_CHECK (0x00001000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_IN_CRITICAL_ARRAY (0x00000800) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_IN_FAILED_ARRAY (0x00000400) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_REBUILD_REMAP (0x00000200) +#define 
MPI3_MAN13_TRANSLATION_SLOTSTATUS_REBUILD_REMAP_ABORT (0x00000100) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_PREDICTED_FAILURE (0x00000040) +#define MPI3_MAN13_BLINK_PATTERN_FORCE_OFF (0x00) +#define MPI3_MAN13_BLINK_PATTERN_FORCE_ON (0x01) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_0 (0x02) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_1 (0x03) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_2 (0x04) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_3 (0x05) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_4 (0x06) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_5 (0x07) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_6 (0x08) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_7 (0x09) +#define MPI3_MAN13_BLINK_PATTERN_ACTIVITY (0x0a) +#define MPI3_MAN13_BLINK_PATTERN_ACTIVITY_TRAIL (0x0b) +struct mpi3_man_page13 { + struct mpi3_config_page_header header; + u8 num_trans; + u8 reserved09[3]; + __le32 reserved0c; + struct mpi3_man13_translation_info translation[MPI3_MAN13_NUM_TRANSLATION_MAX]; +}; +#define MPI3_MAN13_PAGEVERSION (0x00) +struct mpi3_man_page14 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_slot_groups; + u8 num_slots; + __le16 max_cert_chain_length; + __le32 sealed_slots; + __le32 populated_slots; + __le32 mgmt_pt_updatable_slots; +}; +#define MPI3_MAN14_PAGEVERSION (0x00) +#define MPI3_MAN14_NUMSLOTS_MAX (32) +#ifndef MPI3_MAN15_VERSION_RECORD_MAX +#define MPI3_MAN15_VERSION_RECORD_MAX 1 +#endif +struct mpi3_man15_version_record { + __le16 spdm_version; + __le16 reserved02; +}; +struct mpi3_man_page15 { + struct mpi3_config_page_header header; + u8 num_version_records; + u8 reserved09[3]; + __le32 reserved0c; + struct mpi3_man15_version_record version_record[MPI3_MAN15_VERSION_RECORD_MAX]; +}; +#define MPI3_MAN15_PAGEVERSION (0x00) +#ifndef MPI3_MAN16_CERT_ALGO_MAX +#define MPI3_MAN16_CERT_ALGO_MAX 1 +#endif +struct mpi3_man16_certificate_algorithm { + u8 slot_group; + u8 reserved01[3]; + __le32 base_asym_algo; + __le32 base_hash_algo; + __le32 reserved0c[3]; +}; +struct mpi3_man_page16 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_cert_algos; + u8 reserved0d[3]; + struct mpi3_man16_certificate_algorithm certificate_algorithm[MPI3_MAN16_CERT_ALGO_MAX]; +}; +#define MPI3_MAN16_PAGEVERSION (0x00) +#ifndef MPI3_MAN17_HASH_ALGORITHM_MAX +#define MPI3_MAN17_HASH_ALGORITHM_MAX 1 +#endif +struct mpi3_man17_hash_algorithm { + u8 meas_specification; + u8 reserved01[3]; + __le32 measurement_hash_algo; + __le32 reserved08[2]; +}; +struct mpi3_man_page17 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_hash_algos; + u8 reserved0d[3]; + struct mpi3_man17_hash_algorithm hash_algorithm[MPI3_MAN17_HASH_ALGORITHM_MAX]; +}; +#define MPI3_MAN17_PAGEVERSION (0x00) +struct mpi3_man_page20 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le32 nonpremium_features; + u8 allowed_personalities; + u8 reserved11[3]; +}; +#define MPI3_MAN20_PAGEVERSION (0x00) +#define MPI3_MAN20_ALLOWEDPERSON_RAID_MASK (0x02) +#define MPI3_MAN20_ALLOWEDPERSON_RAID_ALLOWED (0x02) +#define MPI3_MAN20_ALLOWEDPERSON_RAID_NOT_ALLOWED (0x00) +#define MPI3_MAN20_ALLOWEDPERSON_EHBA_MASK (0x01) +#define MPI3_MAN20_ALLOWEDPERSON_EHBA_ALLOWED (0x01) +#define MPI3_MAN20_ALLOWEDPERSON_EHBA_NOT_ALLOWED (0x00) +#define MPI3_MAN20_NONPREMUIM_DISABLE_PD_DEGRADED_MASK (0x01) +#define MPI3_MAN20_NONPREMUIM_DISABLE_PD_DEGRADED_ENABLED (0x00) +#define MPI3_MAN20_NONPREMUIM_DISABLE_PD_DEGRADED_DISABLED (0x01) +struct mpi3_man_page21 { + struct mpi3_config_page_header header; + __le32 
reserved08; + __le32 flags; +}; +#define MPI3_MAN21_PAGEVERSION (0x00) +#define MPI3_MAN21_FLAGS_HOST_METADATA_CAPABILITY_MASK (0x80) +#define MPI3_MAN21_FLAGS_HOST_METADATA_CAPABILITY_ENABLED (0x80) +#define MPI3_MAN21_FLAGS_HOST_METADATA_CAPABILITY_DISABLED (0x00) +#define MPI3_MAN21_FLAGS_UNCERTIFIED_DRIVES_MASK (0x60) +#define MPI3_MAN21_FLAGS_UNCERTIFIED_DRIVES_BLOCK (0x00) +#define MPI3_MAN21_FLAGS_UNCERTIFIED_DRIVES_ALLOW (0x20) +#define MPI3_MAN21_FLAGS_UNCERTIFIED_DRIVES_WARN (0x40) +#define MPI3_MAN21_FLAGS_BLOCK_SSD_WR_CACHE_CHANGE_MASK (0x08) +#define MPI3_MAN21_FLAGS_BLOCK_SSD_WR_CACHE_CHANGE_ALLOW (0x00) +#define MPI3_MAN21_FLAGS_BLOCK_SSD_WR_CACHE_CHANGE_PREVENT (0x08) +#define MPI3_MAN21_FLAGS_SES_VPD_ASSOC_MASK (0x01) +#define MPI3_MAN21_FLAGS_SES_VPD_ASSOC_DEFAULT (0x00) +#define MPI3_MAN21_FLAGS_SES_VPD_ASSOC_OEM_SPECIFIC (0x01) +#ifndef MPI3_MAN_PROD_SPECIFIC_MAX +#define MPI3_MAN_PROD_SPECIFIC_MAX (1) +#endif +struct mpi3_man_page_product_specific { + struct mpi3_config_page_header header; + __le32 product_specific_info[MPI3_MAN_PROD_SPECIFIC_MAX]; +}; +struct mpi3_io_unit_page0 { + struct mpi3_config_page_header header; + __le64 unique_value; + __le32 nvdata_version_default; + __le32 nvdata_version_persistent; +}; +#define MPI3_IOUNIT0_PAGEVERSION (0x00) +struct mpi3_io_unit_page1 { + struct mpi3_config_page_header header; + __le32 flags; + u8 dmd_io_delay; + u8 dmd_report_pcie; + u8 dmd_report_sata; + u8 dmd_report_sas; +}; +#define MPI3_IOUNIT1_PAGEVERSION (0x00) +#define MPI3_IOUNIT1_FLAGS_NVME_WRITE_CACHE_MASK (0x00000030) +#define MPI3_IOUNIT1_FLAGS_NVME_WRITE_CACHE_ENABLE (0x00000000) +#define MPI3_IOUNIT1_FLAGS_NVME_WRITE_CACHE_DISABLE (0x00000010) +#define MPI3_IOUNIT1_FLAGS_NVME_WRITE_CACHE_NO_MODIFY (0x00000020) +#define MPI3_IOUNIT1_FLAGS_ATA_SECURITY_FREEZE_LOCK (0x00000008) +#define MPI3_IOUNIT1_FLAGS_WRITE_SAME_BUFFER (0x00000004) +#define MPI3_IOUNIT1_FLAGS_SATA_WRITE_CACHE_MASK (0x00000003) +#define MPI3_IOUNIT1_FLAGS_SATA_WRITE_CACHE_ENABLE (0x00000000) +#define MPI3_IOUNIT1_FLAGS_SATA_WRITE_CACHE_DISABLE (0x00000001) +#define MPI3_IOUNIT1_FLAGS_SATA_WRITE_CACHE_UNCHANGED (0x00000002) +#define MPI3_IOUNIT1_DMD_REPORT_DELAY_TIME_MASK (0x7f) +#define MPI3_IOUNIT1_DMD_REPORT_UNIT_16_SEC (0x80) +#ifndef MPI3_IO_UNIT2_GPIO_VAL_MAX +#define MPI3_IO_UNIT2_GPIO_VAL_MAX (1) +#endif +struct mpi3_io_unit_page2 { + struct mpi3_config_page_header header; + u8 gpio_count; + u8 reserved09[3]; + __le16 gpio_val[MPI3_IO_UNIT2_GPIO_VAL_MAX]; +}; +#define MPI3_IOUNIT2_PAGEVERSION (0x00) +#define MPI3_IOUNIT2_GPIO_FUNCTION_MASK (0xfffc) +#define MPI3_IOUNIT2_GPIO_FUNCTION_SHIFT (2) +#define MPI3_IOUNIT2_GPIO_SETTING_MASK (0x0001) +#define MPI3_IOUNIT2_GPIO_SETTING_OFF (0x0000) +#define MPI3_IOUNIT2_GPIO_SETTING_ON (0x0001) +struct mpi3_io_unit3_sensor { + __le16 flags; + u8 threshold_margin; + u8 reserved03; + __le16 threshold[3]; + __le16 reserved0a; + __le32 reserved0c; + __le32 reserved10; + __le32 reserved14; +}; +#define MPI3_IOUNIT3_SENSOR_FLAGS_FATAL_EVENT_ENABLED (0x0010) +#define MPI3_IOUNIT3_SENSOR_FLAGS_FATAL_ACTION_ENABLED (0x0008) +#define MPI3_IOUNIT3_SENSOR_FLAGS_CRITICAL_EVENT_ENABLED (0x0004) +#define MPI3_IOUNIT3_SENSOR_FLAGS_CRITICAL_ACTION_ENABLED (0x0002) +#define MPI3_IOUNIT3_SENSOR_FLAGS_WARNING_EVENT_ENABLED (0x0001) +#ifndef MPI3_IO_UNIT3_SENSOR_MAX +#define MPI3_IO_UNIT3_SENSOR_MAX (1) +#endif +struct mpi3_io_unit_page3 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_sensors; + u8 nominal_poll_interval; + u8 
warning_poll_interval; + u8 reserved0f; + struct mpi3_io_unit3_sensor sensor[MPI3_IO_UNIT3_SENSOR_MAX]; +}; +#define MPI3_IOUNIT3_PAGEVERSION (0x00) +struct mpi3_io_unit4_sensor { + __le16 current_temperature; + __le16 reserved02; + u8 flags; + u8 reserved05[3]; + __le16 istwi_index; + u8 channel; + u8 reserved0b; + __le32 reserved0c; +}; +#define MPI3_IOUNIT4_SENSOR_FLAGS_LOC_MASK (0xe0) +#define MPI3_IOUNIT4_SENSOR_FLAGS_LOC_SHIFT (5) +#define MPI3_IOUNIT4_SENSOR_FLAGS_TEMP_VALID (0x01) +#define MPI3_IOUNIT4_SENSOR_ISTWI_INDEX_INTERNAL (0xffff) +#define MPI3_IOUNIT4_SENSOR_CHANNEL_RESERVED (0xff) +#ifndef MPI3_IO_UNIT4_SENSOR_MAX +#define MPI3_IO_UNIT4_SENSOR_MAX (1) +#endif +struct mpi3_io_unit_page4 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_sensors; + u8 reserved0d[3]; + struct mpi3_io_unit4_sensor sensor[MPI3_IO_UNIT4_SENSOR_MAX]; +}; +#define MPI3_IOUNIT4_PAGEVERSION (0x00) +struct mpi3_io_unit5_spinup_group { + u8 max_target_spinup; + u8 spinup_delay; + u8 spinup_flags; + u8 reserved03; +}; +#define MPI3_IOUNIT5_SPINUP_FLAGS_DISABLE (0x01) +#ifndef MPI3_IO_UNIT5_PHY_MAX +#define MPI3_IO_UNIT5_PHY_MAX (4) +#endif +struct mpi3_io_unit_page5 { + struct mpi3_config_page_header header; + struct mpi3_io_unit5_spinup_group spinup_group_parameters[4]; + __le32 reserved18; + __le32 reserved1c; + __le16 device_shutdown; + __le16 reserved22; + u8 pcie_device_wait_time; + u8 sata_device_wait_time; + u8 spinup_encl_drive_count; + u8 spinup_encl_delay; + u8 num_phys; + u8 pe_initial_spinup_delay; + u8 topology_stable_time; + u8 flags; + u8 phy[MPI3_IO_UNIT5_PHY_MAX]; +}; +#define MPI3_IOUNIT5_PAGEVERSION (0x00) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_NO_ACTION (0x00) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_DIRECT_ATTACHED (0x01) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_EXPANDER_ATTACHED (0x02) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SWITCH_ATTACHED (0x02) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_DIRECT_AND_EXPANDER (0x03) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_DIRECT_AND_SWITCH (0x03) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SATA_HDD_MASK (0x0300) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SATA_HDD_SHIFT (8) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SAS_HDD_MASK (0x00c0) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SAS_HDD_SHIFT (6) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_NVME_SSD_MASK (0x0030) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_NVME_SSD_SHIFT (4) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SATA_SSD_MASK (0x000c) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SATA_SSD_SHIFT (2) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SAS_SSD_MASK (0x0003) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SAA_SSD_SHIFT (0) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_MASK (0x0c) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_NOT_SUPPORTED (0x00) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_OS_CONTROLLED (0x04) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_APP_CONTROLLED (0x08) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_BLOCKED (0x0c) +#define MPI3_IOUNIT5_FLAGS_POWER_CAPABLE_SPINUP (0x02) +#define MPI3_IOUNIT5_FLAGS_AUTO_PORT_ENABLE (0x01) +#define MPI3_IOUNIT5_PHY_SPINUP_GROUP_MASK (0x03) +struct mpi3_io_unit_page6 { + struct mpi3_config_page_header header; + __le32 board_power_requirement; + __le32 pci_slot_power_allocation; + u8 flags; + u8 reserved11[3]; +}; +#define MPI3_IOUNIT6_PAGEVERSION (0x00) +#define MPI3_IOUNIT6_FLAGS_ACT_CABLE_PWR_EXC (0x01) +#ifndef MPI3_IOUNIT8_DIGEST_MAX +#define MPI3_IOUNIT8_DIGEST_MAX (1) +#endif +union mpi3_iounit8_digest { + __le32 dword[16]; + __le16 word[32]; + u8 byte[64]; +}; +struct mpi3_io_unit_page8 { + struct mpi3_config_page_header 
header; + u8 sb_mode; + u8 sb_state; + __le16 reserved0a; + u8 num_slots; + u8 slots_available; + u8 current_key_encryption_algo; + u8 key_digest_hash_algo; + __le32 reserved10[2]; + __le32 current_key[128]; + union mpi3_iounit8_digest digest[MPI3_IOUNIT8_DIGEST_MAX]; +}; +#define MPI3_IOUNIT8_PAGEVERSION (0x00) +#define MPI3_IOUNIT8_SBMODE_SECURE_DEBUG (0x04) +#define MPI3_IOUNIT8_SBMODE_HARD_SECURE (0x02) +#define MPI3_IOUNIT8_SBMODE_CONFIG_SECURE (0x01) +#define MPI3_IOUNIT8_SBSTATE_KEY_UPDATE_PENDING (0x02) +#define MPI3_IOUNIT8_SBSTATE_SECURE_BOOT_ENABLED (0x01) +struct mpi3_io_unit_page9 { + struct mpi3_config_page_header header; + __le32 flags; + __le16 first_device; + __le16 reserved0e; +}; +#define MPI3_IOUNIT9_PAGEVERSION (0x00) +#define MPI3_IOUNIT9_FLAGS_VDFIRST_ENABLED (0x01) +#define MPI3_IOUNIT9_FIRSTDEVICE_UNKNOWN (0xffff) +struct mpi3_io_unit_page10 { + struct mpi3_config_page_header header; + u8 flags; + u8 reserved09[3]; + __le32 silicon_id; + u8 fw_version_minor; + u8 fw_version_major; + u8 hw_version_minor; + u8 hw_version_major; + u8 part_number[16]; +}; +#define MPI3_IOUNIT10_PAGEVERSION (0x00) +#define MPI3_IOUNIT10_FLAGS_VALID (0x01) +#define MPI3_IOUNIT10_FLAGS_ACTIVEID_MASK (0x02) +#define MPI3_IOUNIT10_FLAGS_ACTIVEID_FIRST_REGION (0x00) +#define MPI3_IOUNIT10_FLAGS_ACTIVEID_SECOND_REGION (0x02) +#define MPI3_IOUNIT10_FLAGS_PBLP_EXPECTED (0x80) +#ifndef MPI3_IOUNIT11_PROFILE_MAX +#define MPI3_IOUNIT11_PROFILE_MAX (1) +#endif +struct mpi3_iounit11_profile { + u8 profile_identifier; + u8 reserved01[3]; + __le16 max_vds; + __le16 max_host_pds; + __le16 max_adv_host_pds; + __le16 max_raid_pds; + __le16 max_nvme; + __le16 max_outstanding_requests; + __le16 subsystem_id; + __le16 reserved12; + __le32 reserved14[2]; +}; +struct mpi3_io_unit_page11 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_profiles; + u8 current_profile_identifier; + __le16 reserved0e; + struct mpi3_iounit11_profile profile[MPI3_IOUNIT11_PROFILE_MAX]; +}; +#define MPI3_IOUNIT11_PAGEVERSION (0x00) +#ifndef MPI3_IOUNIT12_BUCKET_MAX +#define MPI3_IOUNIT12_BUCKET_MAX (1) +#endif +struct mpi3_iounit12_bucket { + u8 coalescing_depth; + u8 coalescing_timeout; + __le16 io_count_low_boundary; + __le32 reserved04; +}; +struct mpi3_io_unit_page12 { + struct mpi3_config_page_header header; + __le32 flags; + __le32 reserved0c[4]; + u8 num_buckets; + u8 reserved1d[3]; + struct mpi3_iounit12_bucket bucket[MPI3_IOUNIT12_BUCKET_MAX]; +}; +#define MPI3_IOUNIT12_PAGEVERSION (0x00) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_MASK (0x00000300) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_SHIFT (8) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_8 (0x00000000) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_16 (0x00000100) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_32 (0x00000200) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_64 (0x00000300) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_MASK (0x00000003) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_DISABLED (0x00000000) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_500US (0x00000001) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_1MS (0x00000002) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_2MS (0x00000003) +struct mpi3_ioc_page0 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le16 vendor_id; + __le16 device_id; + u8 revision_id; + u8 reserved11[3]; + __le32 class_code; + __le16 subsystem_vendor_id; + __le16 subsystem_id; +}; +#define MPI3_IOC0_PAGEVERSION (0x00) +struct mpi3_ioc_page1 { + struct mpi3_config_page_header header; + __le32 coalescing_timeout; + u8 coalescing_depth; + u8 obsolete; 
+ __le16 reserved0e; +}; +#define MPI3_IOC1_PAGEVERSION (0x00) +#ifndef MPI3_IOC2_EVENTMASK_WORDS +#define MPI3_IOC2_EVENTMASK_WORDS (4) +#endif +struct mpi3_ioc_page2 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le16 sas_broadcast_primitive_masks; + __le16 sas_notify_primitive_masks; + __le32 event_masks[MPI3_IOC2_EVENTMASK_WORDS]; +}; +#define MPI3_IOC2_PAGEVERSION (0x00) +#define MPI3_DRIVER_FLAGS_ADMINRAIDPD_BLOCKED (0x0010) +#define MPI3_DRIVER_FLAGS_OOBRAIDPD_BLOCKED (0x0008) +#define MPI3_DRIVER_FLAGS_OOBRAIDVD_BLOCKED (0x0004) +#define MPI3_DRIVER_FLAGS_OOBADVHOSTPD_BLOCKED (0x0002) +#define MPI3_DRIVER_FLAGS_OOBHOSTPD_BLOCKED (0x0001) +struct mpi3_allowed_cmd_scsi { + __le16 service_action; + u8 operation_code; + u8 command_flags; +}; +struct mpi3_allowed_cmd_ata { + u8 subcommand; + u8 reserved01; + u8 command; + u8 command_flags; +}; +struct mpi3_allowed_cmd_nvme { + u8 reserved00; + u8 nvme_cmd_flags; + u8 op_code; + u8 command_flags; +}; +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_SUBQ_TYPE_MASK (0x80) +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_SUBQ_TYPE_IO (0x00) +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_SUBQ_TYPE_ADMIN (0x80) +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_CMDSET_MASK (0x3f) +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_CMDSET_NVM (0x00) +union mpi3_allowed_cmd { + struct mpi3_allowed_cmd_scsi scsi; + struct mpi3_allowed_cmd_ata ata; + struct mpi3_allowed_cmd_nvme nvme; +}; +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_ADMINRAIDPD_BLOCKED (0x20) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_OOBRAIDPD_BLOCKED (0x10) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_OOBRAIDVD_BLOCKED (0x08) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_OOBADVHOSTPD_BLOCKED (0x04) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_OOBHOSTPD_BLOCKED (0x02) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_CHECKSUBCMD_ENABLED (0x01) +#ifndef MPI3_ALLOWED_CMDS_MAX +#define MPI3_ALLOWED_CMDS_MAX (1) +#endif +struct mpi3_driver_page0 { + struct mpi3_config_page_header header; + __le32 bsd_options; + u8 ssu_timeout; + u8 io_timeout; + u8 tur_retries; + u8 tur_interval; + u8 reserved10; + u8 security_key_timeout; + __le16 reserved12; + __le32 reserved14; + __le32 reserved18; +}; +#define MPI3_DRIVER0_PAGEVERSION (0x00) +#define MPI3_DRIVER0_BSDOPTS_DIS_HII_CONFIG_UTIL (0x00000004) +#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_MASK (0x00000003) +#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_AND_DEVS (0x00000000) +#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_ONLY (0x00000001) +struct mpi3_driver_page1 { + struct mpi3_config_page_header header; + __le32 flags; + __le32 reserved0c; + __le16 host_diag_trace_max_size; + __le16 host_diag_trace_min_size; + __le16 host_diag_trace_decrement_size; + __le16 reserved16; + __le16 host_diag_fw_max_size; + __le16 host_diag_fw_min_size; + __le16 host_diag_fw_decrement_size; + __le16 reserved1e; + __le16 host_diag_driver_max_size; + __le16 host_diag_driver_min_size; + __le16 host_diag_driver_decrement_size; + __le16 reserved26; +}; +#define MPI3_DRIVER1_PAGEVERSION (0x00) +#ifndef MPI3_DRIVER2_TRIGGER_MAX +#define MPI3_DRIVER2_TRIGGER_MAX (1) +#endif +struct mpi3_driver2_trigger_event { + u8 type; + u8 flags; + u8 reserved02; + u8 event; + __le32 reserved04[3]; +}; +struct mpi3_driver2_trigger_scsi_sense { + u8 type; + u8 flags; + __le16 reserved02; + u8 ascq; + u8 asc; + u8 sense_key; + u8 reserved07; + __le32 reserved08[2]; +}; +#define MPI3_DRIVER2_TRIGGER_SCSI_SENSE_ASCQ_MATCH_ALL (0xff) +#define MPI3_DRIVER2_TRIGGER_SCSI_SENSE_ASC_MATCH_ALL (0xff) +#define 
MPI3_DRIVER2_TRIGGER_SCSI_SENSE_SENSE_KEY_MATCH_ALL (0xff) +struct mpi3_driver2_trigger_reply { + u8 type; + u8 flags; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 ioc_log_info_mask; + __le32 reserved0c; +}; +#define MPI3_DRIVER2_TRIGGER_REPLY_IOCSTATUS_MATCH_ALL (0xffff) +union mpi3_driver2_trigger_element { + struct mpi3_driver2_trigger_event event; + struct mpi3_driver2_trigger_scsi_sense scsi_sense; + struct mpi3_driver2_trigger_reply reply; +}; +#define MPI3_DRIVER2_TRIGGER_TYPE_EVENT (0x00) +#define MPI3_DRIVER2_TRIGGER_TYPE_SCSI_SENSE (0x01) +#define MPI3_DRIVER2_TRIGGER_TYPE_REPLY (0x02) +#define MPI3_DRIVER2_TRIGGER_FLAGS_DIAG_TRACE_RELEASE (0x02) +#define MPI3_DRIVER2_TRIGGER_FLAGS_DIAG_FW_RELEASE (0x01) +struct mpi3_driver_page2 { + struct mpi3_config_page_header header; + __le64 master_trigger; + __le32 reserved10[3]; + u8 num_triggers; + u8 reserved1d[3]; + union mpi3_driver2_trigger_element trigger[MPI3_DRIVER2_TRIGGER_MAX]; +}; +#define MPI3_DRIVER2_PAGEVERSION (0x00) +#define MPI3_DRIVER2_MASTERTRIGGER_DIAG_TRACE_RELEASE (0x8000000000000000ULL) +#define MPI3_DRIVER2_MASTERTRIGGER_DIAG_FW_RELEASE (0x4000000000000000ULL) +#define MPI3_DRIVER2_MASTERTRIGGER_SNAPDUMP (0x2000000000000000ULL) +#define MPI3_DRIVER2_MASTERTRIGGER_DEVICE_REMOVAL_ENABLED (0x0000000000000004ULL) +#define MPI3_DRIVER2_MASTERTRIGGER_TASK_MANAGEMENT_ENABLED (0x0000000000000002ULL) +struct mpi3_driver_page10 { + struct mpi3_config_page_header header; + __le16 flags; + __le16 reserved0a; + u8 num_allowed_commands; + u8 reserved0d[3]; + union mpi3_allowed_cmd allowed_command[MPI3_ALLOWED_CMDS_MAX]; +}; +#define MPI3_DRIVER10_PAGEVERSION (0x00) +struct mpi3_driver_page20 { + struct mpi3_config_page_header header; + __le16 flags; + __le16 reserved0a; + u8 num_allowed_commands; + u8 reserved0d[3]; + union mpi3_allowed_cmd allowed_command[MPI3_ALLOWED_CMDS_MAX]; +}; +#define MPI3_DRIVER20_PAGEVERSION (0x00) +struct mpi3_driver_page30 { + struct mpi3_config_page_header header; + __le16 flags; + __le16 reserved0a; + u8 num_allowed_commands; + u8 reserved0d[3]; + union mpi3_allowed_cmd allowed_command[MPI3_ALLOWED_CMDS_MAX]; +}; +#define MPI3_DRIVER30_PAGEVERSION (0x00) +union mpi3_security_mac { + __le32 dword[16]; + __le16 word[32]; + u8 byte[64]; +}; +union mpi3_security_nonce { + __le32 dword[16]; + __le16 word[32]; + u8 byte[64]; +}; +union mpi3_security0_cert_chain { + __le32 dword[1024]; + __le16 word[2048]; + u8 byte[4096]; +}; +struct mpi3_security_page0 { + struct mpi3_config_page_header header; + u8 slot_num_group; + u8 slot_num; + __le16 cert_chain_length; + u8 cert_chain_flags; + u8 reserved0d[3]; + __le32 base_asym_algo; + __le32 base_hash_algo; + __le32 reserved18[4]; + union mpi3_security_mac mac; + union mpi3_security_nonce nonce; + union mpi3_security0_cert_chain certificate_chain; +}; +#define MPI3_SECURITY0_PAGEVERSION (0x00) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_AUTH_API_MASK (0x0e) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_AUTH_API_UNUSED (0x00) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_AUTH_API_CERBERUS (0x02) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_AUTH_API_SPDM (0x04) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_SEALED (0x01) +#ifndef MPI3_SECURITY1_KEY_RECORD_MAX +#define MPI3_SECURITY1_KEY_RECORD_MAX 1 +#endif +#ifndef MPI3_SECURITY1_PAD_MAX +#define MPI3_SECURITY1_PAD_MAX 1 +#endif +union mpi3_security1_key_data { + __le32 dword[128]; + __le16 word[256]; + u8 byte[512]; +}; +struct mpi3_security1_key_record { + u8 flags; + u8 consumer; + __le16 key_data_size; + __le32 
additional_key_data; + __le32 reserved08[2]; + union mpi3_security1_key_data key_data; +}; +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_MASK (0x1f) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_NOT_VALID (0x00) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_HMAC (0x01) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_AES (0x02) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_ECDSA_PRIVATE (0x03) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_ECDSA_PUBLIC (0x04) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_NOT_VALID (0x00) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_SAFESTORE (0x01) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_CERT_CHAIN (0x02) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_DEVICE_KEY (0x03) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_CACHE_OFFLOAD (0x04) +struct mpi3_security_page1 { + struct mpi3_config_page_header header; + __le32 reserved08[2]; + union mpi3_security_mac mac; + union mpi3_security_nonce nonce; + u8 num_keys; + u8 reserved91[3]; + __le32 reserved94[3]; + struct mpi3_security1_key_record key_record[MPI3_SECURITY1_KEY_RECORD_MAX]; + u8 pad[MPI3_SECURITY1_PAD_MAX]; +}; +#define MPI3_SECURITY1_PAGEVERSION (0x00) +struct mpi3_sas_io_unit0_phy_data { + u8 io_unit_port; + u8 port_flags; + u8 phy_flags; + u8 negotiated_link_rate; + __le16 controller_phy_device_info; + __le16 reserved06; + __le16 attached_dev_handle; + __le16 controller_dev_handle; + __le32 discovery_status; + __le32 reserved10; +}; +#ifndef MPI3_SAS_IO_UNIT0_PHY_MAX +#define MPI3_SAS_IO_UNIT0_PHY_MAX (1) +#endif +struct mpi3_sas_io_unit_page0 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phys; + u8 init_status; + __le16 reserved0e; + struct mpi3_sas_io_unit0_phy_data phy_data[MPI3_SAS_IO_UNIT0_PHY_MAX]; +}; +#define MPI3_SASIOUNIT0_PAGEVERSION (0x00) +#define MPI3_SASIOUNIT0_INITSTATUS_NO_ERRORS (0x00) +#define MPI3_SASIOUNIT0_INITSTATUS_NEEDS_INITIALIZATION (0x01) +#define MPI3_SASIOUNIT0_INITSTATUS_NO_TARGETS_ALLOCATED (0x02) +#define MPI3_SASIOUNIT0_INITSTATUS_BAD_NUM_PHYS (0x04) +#define MPI3_SASIOUNIT0_INITSTATUS_UNSUPPORTED_CONFIG (0x05) +#define MPI3_SASIOUNIT0_INITSTATUS_HOST_PHYS_ENABLED (0x06) +#define MPI3_SASIOUNIT0_INITSTATUS_PRODUCT_SPECIFIC_MIN (0xf0) +#define MPI3_SASIOUNIT0_INITSTATUS_PRODUCT_SPECIFIC_MAX (0xff) +#define MPI3_SASIOUNIT0_PORTFLAGS_DISC_IN_PROGRESS (0x08) +#define MPI3_SASIOUNIT0_PORTFLAGS_AUTO_PORT_CONFIG_MASK (0x03) +#define MPI3_SASIOUNIT0_PORTFLAGS_AUTO_PORT_CONFIG_IOUNIT1 (0x00) +#define MPI3_SASIOUNIT0_PORTFLAGS_AUTO_PORT_CONFIG_DYNAMIC (0x01) +#define MPI3_SASIOUNIT0_PORTFLAGS_AUTO_PORT_CONFIG_BACKPLANE (0x02) +#define MPI3_SASIOUNIT0_PHYFLAGS_INIT_PERSIST_CONNECT (0x40) +#define MPI3_SASIOUNIT0_PHYFLAGS_TARG_PERSIST_CONNECT (0x20) +#define MPI3_SASIOUNIT0_PHYFLAGS_PHY_DISABLED (0x08) +#define MPI3_SASIOUNIT0_PHYFLAGS_VIRTUAL_PHY (0x02) +#define MPI3_SASIOUNIT0_PHYFLAGS_HOST_PHY (0x01) +struct mpi3_sas_io_unit1_phy_data { + u8 io_unit_port; + u8 port_flags; + u8 phy_flags; + u8 max_min_link_rate; + __le16 controller_phy_device_info; + __le16 max_target_port_connect_time; + __le32 reserved08; +}; +#ifndef MPI3_SAS_IO_UNIT1_PHY_MAX +#define MPI3_SAS_IO_UNIT1_PHY_MAX (1) +#endif +struct mpi3_sas_io_unit_page1 { + struct mpi3_config_page_header header; + __le16 control_flags; + __le16 sas_narrow_max_queue_depth; + __le16 additional_control_flags; + __le16 sas_wide_max_queue_depth; + u8 num_phys; + u8 sata_max_q_depth; + __le16 reserved12; + struct mpi3_sas_io_unit1_phy_data phy_data[MPI3_SAS_IO_UNIT1_PHY_MAX]; +}; +#define 
MPI3_SASIOUNIT1_PAGEVERSION (0x00) +#define MPI3_SASIOUNIT1_CONTROL_CONTROLLER_DEVICE_SELF_TEST (0x8000) +#define MPI3_SASIOUNIT1_CONTROL_SATA_SW_PRESERVE (0x1000) +#define MPI3_SASIOUNIT1_CONTROL_SATA_48BIT_LBA_REQUIRED (0x0080) +#define MPI3_SASIOUNIT1_CONTROL_SATA_SMART_REQUIRED (0x0040) +#define MPI3_SASIOUNIT1_CONTROL_SATA_NCQ_REQUIRED (0x0020) +#define MPI3_SASIOUNIT1_CONTROL_SATA_FUA_REQUIRED (0x0010) +#define MPI3_SASIOUNIT1_CONTROL_TABLE_SUBTRACTIVE_ILLEGAL (0x0008) +#define MPI3_SASIOUNIT1_CONTROL_SUBTRACTIVE_ILLEGAL (0x0004) +#define MPI3_SASIOUNIT1_CONTROL_FIRST_LVL_DISC_ONLY (0x0002) +#define MPI3_SASIOUNIT1_CONTROL_HARD_RESET_MASK (0x0001) +#define MPI3_SASIOUNIT1_CONTROL_HARD_RESET_DEVICE_NAME (0x0000) +#define MPI3_SASIOUNIT1_CONTROL_HARD_RESET_SAS_ADDRESS (0x0001) +#define MPI3_SASIOUNIT1_ACONTROL_DA_PERSIST_CONNECT (0x0100) +#define MPI3_SASIOUNIT1_ACONTROL_MULTI_PORT_DOMAIN_ILLEGAL (0x0080) +#define MPI3_SASIOUNIT1_ACONTROL_SATA_ASYNCHROUNOUS_NOTIFICATION (0x0040) +#define MPI3_SASIOUNIT1_ACONTROL_INVALID_TOPOLOGY_CORRECTION (0x0020) +#define MPI3_SASIOUNIT1_ACONTROL_PORT_ENABLE_ONLY_SATA_LINK_RESET (0x0010) +#define MPI3_SASIOUNIT1_ACONTROL_OTHER_AFFILIATION_SATA_LINK_RESET (0x0008) +#define MPI3_SASIOUNIT1_ACONTROL_SELF_AFFILIATION_SATA_LINK_RESET (0x0004) +#define MPI3_SASIOUNIT1_ACONTROL_NO_AFFILIATION_SATA_LINK_RESET (0x0002) +#define MPI3_SASIOUNIT1_ACONTROL_ALLOW_TABLE_TO_TABLE (0x0001) +#define MPI3_SASIOUNIT1_PORT_FLAGS_AUTO_PORT_CONFIG (0x01) +#define MPI3_SASIOUNIT1_PHYFLAGS_INIT_PERSIST_CONNECT (0x40) +#define MPI3_SASIOUNIT1_PHYFLAGS_TARG_PERSIST_CONNECT (0x20) +#define MPI3_SASIOUNIT1_PHYFLAGS_PHY_DISABLE (0x08) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_MASK (0xf0) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_SHIFT (4) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_6_0 (0xa0) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_12_0 (0xb0) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_22_5 (0xc0) +#define MPI3_SASIOUNIT1_MMLR_MIN_RATE_MASK (0x0f) +#define MPI3_SASIOUNIT1_MMLR_MIN_RATE_6_0 (0x0a) +#define MPI3_SASIOUNIT1_MMLR_MIN_RATE_12_0 (0x0b) +#define MPI3_SASIOUNIT1_MMLR_MIN_RATE_22_5 (0x0c) +struct mpi3_sas_io_unit2_phy_pm_settings { + u8 control_flags; + u8 reserved01; + __le16 inactivity_timer_exponent; + u8 sata_partial_timeout; + u8 reserved05; + u8 sata_slumber_timeout; + u8 reserved07; + u8 sas_partial_timeout; + u8 reserved09; + u8 sas_slumber_timeout; + u8 reserved0b; +}; +#ifndef MPI3_SAS_IO_UNIT2_PHY_MAX +#define MPI3_SAS_IO_UNIT2_PHY_MAX (1) +#endif +struct mpi3_sas_io_unit_page2 { + struct mpi3_config_page_header header; + u8 num_phys; + u8 reserved09[3]; + __le32 reserved0c; + struct mpi3_sas_io_unit2_phy_pm_settings sas_phy_power_management_settings[MPI3_SAS_IO_UNIT2_PHY_MAX]; +}; +#define MPI3_SASIOUNIT2_PAGEVERSION (0x00) +#define MPI3_SASIOUNIT2_CONTROL_SAS_SLUMBER_ENABLE (0x08) +#define MPI3_SASIOUNIT2_CONTROL_SAS_PARTIAL_ENABLE (0x04) +#define MPI3_SASIOUNIT2_CONTROL_SATA_SLUMBER_ENABLE (0x02) +#define MPI3_SASIOUNIT2_CONTROL_SATA_PARTIAL_ENABLE (0x01) +#define MPI3_SASIOUNIT2_ITE_SAS_SLUMBER_MASK (0x7000) +#define MPI3_SASIOUNIT2_ITE_SAS_SLUMBER_SHIFT (12) +#define MPI3_SASIOUNIT2_ITE_SAS_PARTIAL_MASK (0x0700) +#define MPI3_SASIOUNIT2_ITE_SAS_PARTIAL_SHIFT (8) +#define MPI3_SASIOUNIT2_ITE_SATA_SLUMBER_MASK (0x0070) +#define MPI3_SASIOUNIT2_ITE_SATA_SLUMBER_SHIFT (4) +#define MPI3_SASIOUNIT2_ITE_SATA_PARTIAL_MASK (0x0007) +#define MPI3_SASIOUNIT2_ITE_SATA_PARTIAL_SHIFT (0) +#define MPI3_SASIOUNIT2_ITE_EXP_TEN_SECONDS (7) +#define MPI3_SASIOUNIT2_ITE_EXP_ONE_SECOND (6) 
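Aside (not part of the patch): the SAS IO Unit Page 2 per-phy power-management settings above pack four 3-bit exponent codes into the single little-endian inactivity_timer_exponent word, with the corresponding masks and shifts defined just before this point and the remaining MPI3_SASIOUNIT2_ITE_EXP_* unit values continuing below. A minimal sketch of how a consumer of these definitions might unpack that field follows; the helper name is hypothetical and the example assumes the configuration-page definitions in this file are in scope.

/* Illustration only -- hypothetical helper, not part of this patch. */
static void mpi3_decode_phy_pm_settings(const struct mpi3_sas_io_unit2_phy_pm_settings *pm)
{
	u16 ite = le16_to_cpu(pm->inactivity_timer_exponent);
	u8 sas_slumber_exp, sas_partial_exp, sata_slumber_exp, sata_partial_exp;

	/* Each 3-bit field selects one of the MPI3_SASIOUNIT2_ITE_EXP_* time units. */
	sas_slumber_exp = (ite & MPI3_SASIOUNIT2_ITE_SAS_SLUMBER_MASK) >>
			  MPI3_SASIOUNIT2_ITE_SAS_SLUMBER_SHIFT;
	sas_partial_exp = (ite & MPI3_SASIOUNIT2_ITE_SAS_PARTIAL_MASK) >>
			  MPI3_SASIOUNIT2_ITE_SAS_PARTIAL_SHIFT;
	sata_slumber_exp = (ite & MPI3_SASIOUNIT2_ITE_SATA_SLUMBER_MASK) >>
			   MPI3_SASIOUNIT2_ITE_SATA_SLUMBER_SHIFT;
	sata_partial_exp = (ite & MPI3_SASIOUNIT2_ITE_SATA_PARTIAL_MASK) >>
			   MPI3_SASIOUNIT2_ITE_SATA_PARTIAL_SHIFT;

	/* The per-protocol timeout bytes sit alongside the exponent codes. */
	pr_info("SAS slumber exp %u (timeout %u), SAS partial exp %u (timeout %u)\n",
		sas_slumber_exp, pm->sas_slumber_timeout,
		sas_partial_exp, pm->sas_partial_timeout);
	pr_info("SATA slumber exp %u (timeout %u), SATA partial exp %u (timeout %u)\n",
		sata_slumber_exp, pm->sata_slumber_timeout,
		sata_partial_exp, pm->sata_partial_timeout);
}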
+#define MPI3_SASIOUNIT2_ITE_EXP_HUNDRED_MILLISECONDS (5) +#define MPI3_SASIOUNIT2_ITE_EXP_TEN_MILLISECONDS (4) +#define MPI3_SASIOUNIT2_ITE_EXP_ONE_MILLISECOND (3) +#define MPI3_SASIOUNIT2_ITE_EXP_HUNDRED_MICROSECONDS (2) +#define MPI3_SASIOUNIT2_ITE_EXP_TEN_MICROSECONDS (1) +#define MPI3_SASIOUNIT2_ITE_EXP_ONE_MICROSECOND (0) +struct mpi3_sas_io_unit_page3 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le32 power_management_capabilities; +}; +#define MPI3_SASIOUNIT3_PAGEVERSION (0x00) +#define MPI3_SASIOUNIT3_PM_HOST_SAS_SLUMBER_MODE (0x00000800) +#define MPI3_SASIOUNIT3_PM_HOST_SAS_PARTIAL_MODE (0x00000400) +#define MPI3_SASIOUNIT3_PM_HOST_SATA_SLUMBER_MODE (0x00000200) +#define MPI3_SASIOUNIT3_PM_HOST_SATA_PARTIAL_MODE (0x00000100) +#define MPI3_SASIOUNIT3_PM_IOUNIT_SAS_SLUMBER_MODE (0x00000008) +#define MPI3_SASIOUNIT3_PM_IOUNIT_SAS_PARTIAL_MODE (0x00000004) +#define MPI3_SASIOUNIT3_PM_IOUNIT_SATA_SLUMBER_MODE (0x00000002) +#define MPI3_SASIOUNIT3_PM_IOUNIT_SATA_PARTIAL_MODE (0x00000001) +struct mpi3_sas_expander_page0 { + struct mpi3_config_page_header header; + u8 io_unit_port; + u8 report_gen_length; + __le16 enclosure_handle; + __le32 reserved0c; + __le64 sas_address; + __le32 discovery_status; + __le16 dev_handle; + __le16 parent_dev_handle; + __le16 expander_change_count; + __le16 expander_route_indexes; + u8 num_phys; + u8 sas_level; + __le16 flags; + __le16 stp_bus_inactivity_time_limit; + __le16 stp_max_connect_time_limit; + __le16 stp_smp_nexus_loss_time; + __le16 max_num_routed_sas_addresses; + __le64 active_zone_manager_sas_address; + __le16 zone_lock_inactivity_limit; + __le16 reserved3a; + u8 time_to_reduced_func; + u8 initial_time_to_reduced_func; + u8 max_reduced_func_time; + u8 exp_status; +}; +#define MPI3_SASEXPANDER0_PAGEVERSION (0x00) +#define MPI3_SASEXPANDER0_FLAGS_REDUCED_FUNCTIONALITY (0x2000) +#define MPI3_SASEXPANDER0_FLAGS_ZONE_LOCKED (0x1000) +#define MPI3_SASEXPANDER0_FLAGS_SUPPORTED_PHYSICAL_PRES (0x0800) +#define MPI3_SASEXPANDER0_FLAGS_ASSERTED_PHYSICAL_PRES (0x0400) +#define MPI3_SASEXPANDER0_FLAGS_ZONING_SUPPORT (0x0200) +#define MPI3_SASEXPANDER0_FLAGS_ENABLED_ZONING (0x0100) +#define MPI3_SASEXPANDER0_FLAGS_TABLE_TO_TABLE_SUPPORT (0x0080) +#define MPI3_SASEXPANDER0_FLAGS_CONNECTOR_END_DEVICE (0x0010) +#define MPI3_SASEXPANDER0_FLAGS_OTHERS_CONFIG (0x0004) +#define MPI3_SASEXPANDER0_FLAGS_CONFIG_IN_PROGRESS (0x0002) +#define MPI3_SASEXPANDER0_FLAGS_ROUTE_TABLE_CONFIG (0x0001) +#define MPI3_SASEXPANDER0_ES_NOT_RESPONDING (0x02) +#define MPI3_SASEXPANDER0_ES_RESPONDING (0x03) +#define MPI3_SASEXPANDER0_ES_DELAY_NOT_RESPONDING (0x04) +struct mpi3_sas_expander_page1 { + struct mpi3_config_page_header header; + u8 io_unit_port; + u8 reserved09[3]; + u8 num_phys; + u8 phy; + __le16 num_table_entries_programmed; + u8 programmed_link_rate; + u8 hw_link_rate; + __le16 attached_dev_handle; + __le32 phy_info; + __le16 attached_device_info; + __le16 reserved1a; + __le16 expander_dev_handle; + u8 change_count; + u8 negotiated_link_rate; + u8 phy_identifier; + u8 attached_phy_identifier; + u8 reserved22; + u8 discovery_info; + __le32 attached_phy_info; + u8 zone_group; + u8 self_config_status; + __le16 reserved2a; + __le16 slot; + __le16 slot_index; +}; +#define MPI3_SASEXPANDER1_PAGEVERSION (0x00) +#define MPI3_SASEXPANDER1_DISCINFO_BAD_PHY_DISABLED (0x04) +#define MPI3_SASEXPANDER1_DISCINFO_LINK_STATUS_CHANGE (0x02) +#define MPI3_SASEXPANDER1_DISCINFO_NO_ROUTING_ENTRIES (0x01) +#ifndef MPI3_SASEXPANDER2_MAX_NUM_PHYS +#define 
MPI3_SASEXPANDER2_MAX_NUM_PHYS (1) +#endif +struct mpi3_sasexpander2_phy_element { + u8 link_change_count; + u8 reserved01; + __le16 rate_change_count; + __le32 reserved04; +}; +struct mpi3_sas_expander_page2 { + struct mpi3_config_page_header header; + u8 num_phys; + u8 reserved09; + __le16 dev_handle; + __le32 reserved0c; + struct mpi3_sasexpander2_phy_element phy[MPI3_SASEXPANDER2_MAX_NUM_PHYS]; +}; +#define MPI3_SASEXPANDER2_PAGEVERSION (0x00) +struct mpi3_sas_port_page0 { + struct mpi3_config_page_header header; + u8 port_number; + u8 reserved09; + u8 port_width; + u8 reserved0b; + u8 zone_group; + u8 reserved0d[3]; + __le64 sas_address; + __le16 device_info; + __le16 reserved1a; + __le32 reserved1c; +}; +#define MPI3_SASPORT0_PAGEVERSION (0x00) +struct mpi3_sas_phy_page0 { + struct mpi3_config_page_header header; + __le16 owner_dev_handle; + __le16 reserved0a; + __le16 attached_dev_handle; + u8 attached_phy_identifier; + u8 reserved0f; + __le32 attached_phy_info; + u8 programmed_link_rate; + u8 hw_link_rate; + u8 change_count; + u8 flags; + __le32 phy_info; + u8 negotiated_link_rate; + u8 reserved1d[3]; + __le16 slot; + __le16 slot_index; +}; +#define MPI3_SASPHY0_PAGEVERSION (0x00) +#define MPI3_SASPHY0_FLAGS_SGPIO_DIRECT_ATTACH_ENC (0x01) +struct mpi3_sas_phy_page1 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le32 invalid_dword_count; + __le32 running_disparity_error_count; + __le32 loss_dword_synch_count; + __le32 phy_reset_problem_count; +}; +#define MPI3_SASPHY1_PAGEVERSION (0x00) +struct mpi3_sas_phy2_phy_event { + u8 phy_event_code; + u8 reserved01[3]; + __le32 phy_event_info; +}; +#ifndef MPI3_SAS_PHY2_PHY_EVENT_MAX +#define MPI3_SAS_PHY2_PHY_EVENT_MAX (1) +#endif +struct mpi3_sas_phy_page2 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phy_events; + u8 reserved0d[3]; + struct mpi3_sas_phy2_phy_event phy_event[MPI3_SAS_PHY2_PHY_EVENT_MAX]; +}; +#define MPI3_SASPHY2_PAGEVERSION (0x00) +struct mpi3_sas_phy3_phy_event_config { + u8 phy_event_code; + u8 reserved01[3]; + u8 counter_type; + u8 threshold_window; + u8 time_units; + u8 reserved07; + __le32 event_threshold; + __le16 threshold_flags; + __le16 reserved0e; +}; +#define MPI3_SASPHY3_EVENT_CODE_NO_EVENT (0x00) +#define MPI3_SASPHY3_EVENT_CODE_INVALID_DWORD (0x01) +#define MPI3_SASPHY3_EVENT_CODE_RUNNING_DISPARITY_ERROR (0x02) +#define MPI3_SASPHY3_EVENT_CODE_LOSS_DWORD_SYNC (0x03) +#define MPI3_SASPHY3_EVENT_CODE_PHY_RESET_PROBLEM (0x04) +#define MPI3_SASPHY3_EVENT_CODE_ELASTICITY_BUF_OVERFLOW (0x05) +#define MPI3_SASPHY3_EVENT_CODE_RX_ERROR (0x06) +#define MPI3_SASPHY3_EVENT_CODE_INV_SPL_PACKETS (0x07) +#define MPI3_SASPHY3_EVENT_CODE_LOSS_SPL_PACKET_SYNC (0x08) +#define MPI3_SASPHY3_EVENT_CODE_RX_ADDR_FRAME_ERROR (0x20) +#define MPI3_SASPHY3_EVENT_CODE_TX_AC_OPEN_REJECT (0x21) +#define MPI3_SASPHY3_EVENT_CODE_RX_AC_OPEN_REJECT (0x22) +#define MPI3_SASPHY3_EVENT_CODE_TX_RC_OPEN_REJECT (0x23) +#define MPI3_SASPHY3_EVENT_CODE_RX_RC_OPEN_REJECT (0x24) +#define MPI3_SASPHY3_EVENT_CODE_RX_AIP_PARTIAL_WAITING_ON (0x25) +#define MPI3_SASPHY3_EVENT_CODE_RX_AIP_CONNECT_WAITING_ON (0x26) +#define MPI3_SASPHY3_EVENT_CODE_TX_BREAK (0x27) +#define MPI3_SASPHY3_EVENT_CODE_RX_BREAK (0x28) +#define MPI3_SASPHY3_EVENT_CODE_BREAK_TIMEOUT (0x29) +#define MPI3_SASPHY3_EVENT_CODE_CONNECTION (0x2a) +#define MPI3_SASPHY3_EVENT_CODE_PEAKTX_PATHWAY_BLOCKED (0x2b) +#define MPI3_SASPHY3_EVENT_CODE_PEAKTX_ARB_WAIT_TIME (0x2c) +#define MPI3_SASPHY3_EVENT_CODE_PEAK_ARB_WAIT_TIME (0x2d) +#define 
MPI3_SASPHY3_EVENT_CODE_PEAK_CONNECT_TIME (0x2e) +#define MPI3_SASPHY3_EVENT_CODE_PERSIST_CONN (0x2f) +#define MPI3_SASPHY3_EVENT_CODE_TX_SSP_FRAMES (0x40) +#define MPI3_SASPHY3_EVENT_CODE_RX_SSP_FRAMES (0x41) +#define MPI3_SASPHY3_EVENT_CODE_TX_SSP_ERROR_FRAMES (0x42) +#define MPI3_SASPHY3_EVENT_CODE_RX_SSP_ERROR_FRAMES (0x43) +#define MPI3_SASPHY3_EVENT_CODE_TX_CREDIT_BLOCKED (0x44) +#define MPI3_SASPHY3_EVENT_CODE_RX_CREDIT_BLOCKED (0x45) +#define MPI3_SASPHY3_EVENT_CODE_TX_SATA_FRAMES (0x50) +#define MPI3_SASPHY3_EVENT_CODE_RX_SATA_FRAMES (0x51) +#define MPI3_SASPHY3_EVENT_CODE_SATA_OVERFLOW (0x52) +#define MPI3_SASPHY3_EVENT_CODE_TX_SMP_FRAMES (0x60) +#define MPI3_SASPHY3_EVENT_CODE_RX_SMP_FRAMES (0x61) +#define MPI3_SASPHY3_EVENT_CODE_RX_SMP_ERROR_FRAMES (0x63) +#define MPI3_SASPHY3_EVENT_CODE_HOTPLUG_TIMEOUT (0xd0) +#define MPI3_SASPHY3_EVENT_CODE_MISALIGNED_MUX_PRIMITIVE (0xd1) +#define MPI3_SASPHY3_EVENT_CODE_RX_AIP (0xd2) +#define MPI3_SASPHY3_EVENT_CODE_LCARB_WAIT_TIME (0xd3) +#define MPI3_SASPHY3_EVENT_CODE_RCVD_CONN_RESP_WAIT_TIME (0xd4) +#define MPI3_SASPHY3_EVENT_CODE_LCCONN_TIME (0xd5) +#define MPI3_SASPHY3_EVENT_CODE_SSP_TX_START_TRANSMIT (0xd6) +#define MPI3_SASPHY3_EVENT_CODE_SATA_TX_START (0xd7) +#define MPI3_SASPHY3_EVENT_CODE_SMP_TX_START_TRANSMT (0xd8) +#define MPI3_SASPHY3_EVENT_CODE_TX_SMP_BREAK_CONN (0xd9) +#define MPI3_SASPHY3_EVENT_CODE_SSP_RX_START_RECEIVE (0xda) +#define MPI3_SASPHY3_EVENT_CODE_SATA_RX_START_RECEIVE (0xdb) +#define MPI3_SASPHY3_EVENT_CODE_SMP_RX_START_RECEIVE (0xdc) +#define MPI3_SASPHY3_COUNTER_TYPE_WRAPPING (0x00) +#define MPI3_SASPHY3_COUNTER_TYPE_SATURATING (0x01) +#define MPI3_SASPHY3_COUNTER_TYPE_PEAK_VALUE (0x02) +#define MPI3_SASPHY3_TIME_UNITS_10_MICROSECONDS (0x00) +#define MPI3_SASPHY3_TIME_UNITS_100_MICROSECONDS (0x01) +#define MPI3_SASPHY3_TIME_UNITS_1_MILLISECOND (0x02) +#define MPI3_SASPHY3_TIME_UNITS_10_MILLISECONDS (0x03) +#define MPI3_SASPHY3_TFLAGS_PHY_RESET (0x0002) +#define MPI3_SASPHY3_TFLAGS_EVENT_NOTIFY (0x0001) +#ifndef MPI3_SAS_PHY3_PHY_EVENT_MAX +#define MPI3_SAS_PHY3_PHY_EVENT_MAX (1) +#endif +struct mpi3_sas_phy_page3 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phy_events; + u8 reserved0d[3]; + struct mpi3_sas_phy3_phy_event_config phy_event_config[MPI3_SAS_PHY3_PHY_EVENT_MAX]; +}; +#define MPI3_SASPHY3_PAGEVERSION (0x00) +struct mpi3_sas_phy_page4 { + struct mpi3_config_page_header header; + u8 reserved08[3]; + u8 flags; + u8 initial_frame[28]; +}; +#define MPI3_SASPHY4_PAGEVERSION (0x00) +#define MPI3_SASPHY4_FLAGS_FRAME_VALID (0x02) +#define MPI3_SASPHY4_FLAGS_SATA_FRAME (0x01) +#define MPI3_PCIE_LINK_RETIMERS_MASK (0x30) +#define MPI3_PCIE_LINK_RETIMERS_SHIFT (4) +#define MPI3_PCIE_NEG_LINK_RATE_MASK (0x0f) +#define MPI3_PCIE_NEG_LINK_RATE_UNKNOWN (0x00) +#define MPI3_PCIE_NEG_LINK_RATE_PHY_DISABLED (0x01) +#define MPI3_PCIE_NEG_LINK_RATE_2_5 (0x02) +#define MPI3_PCIE_NEG_LINK_RATE_5_0 (0x03) +#define MPI3_PCIE_NEG_LINK_RATE_8_0 (0x04) +#define MPI3_PCIE_NEG_LINK_RATE_16_0 (0x05) +#define MPI3_PCIE_NEG_LINK_RATE_32_0 (0x06) +#define MPI3_PCIE_ASPM_ENABLE_NONE (0x0) +#define MPI3_PCIE_ASPM_ENABLE_L0S (0x1) +#define MPI3_PCIE_ASPM_ENABLE_L1 (0x2) +#define MPI3_PCIE_ASPM_ENABLE_L0S_L1 (0x3) +#define MPI3_PCIE_ASPM_SUPPORT_NONE (0x0) +#define MPI3_PCIE_ASPM_SUPPORT_L0S (0x1) +#define MPI3_PCIE_ASPM_SUPPORT_L1 (0x2) +#define MPI3_PCIE_ASPM_SUPPORT_L0S_L1 (0x3) +struct mpi3_pcie_io_unit0_phy_data { + u8 link; + u8 link_flags; + u8 phy_flags; + u8 negotiated_link_rate; + __le16 
attached_dev_handle; + __le16 controller_dev_handle; + __le32 enumeration_status; + u8 io_unit_port; + u8 reserved0d[3]; +}; +#define MPI3_PCIEIOUNIT0_LINKFLAGS_CONFIG_SOURCE_MASK (0x10) +#define MPI3_PCIEIOUNIT0_LINKFLAGS_CONFIG_SOURCE_IOUNIT1 (0x00) +#define MPI3_PCIEIOUNIT0_LINKFLAGS_CONFIG_SOURCE_BKPLANE (0x10) +#define MPI3_PCIEIOUNIT0_LINKFLAGS_ENUM_IN_PROGRESS (0x08) +#define MPI3_PCIEIOUNIT0_PHYFLAGS_PHY_DISABLED (0x08) +#define MPI3_PCIEIOUNIT0_PHYFLAGS_HOST_PHY (0x01) +#define MPI3_PCIEIOUNIT0_ES_MAX_SWITCH_DEPTH_EXCEEDED (0x80000000) +#define MPI3_PCIEIOUNIT0_ES_MAX_SWITCHES_EXCEEDED (0x40000000) +#define MPI3_PCIEIOUNIT0_ES_MAX_ENDPOINTS_EXCEEDED (0x20000000) +#define MPI3_PCIEIOUNIT0_ES_INSUFFICIENT_RESOURCES (0x10000000) +#ifndef MPI3_PCIE_IO_UNIT0_PHY_MAX +#define MPI3_PCIE_IO_UNIT0_PHY_MAX (1) +#endif +struct mpi3_pcie_io_unit_page0 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phys; + u8 init_status; + u8 aspm; + u8 reserved0f; + struct mpi3_pcie_io_unit0_phy_data phy_data[MPI3_PCIE_IO_UNIT0_PHY_MAX]; +}; +#define MPI3_PCIEIOUNIT0_PAGEVERSION (0x00) +#define MPI3_PCIEIOUNIT0_INITSTATUS_NO_ERRORS (0x00) +#define MPI3_PCIEIOUNIT0_INITSTATUS_NEEDS_INITIALIZATION (0x01) +#define MPI3_PCIEIOUNIT0_INITSTATUS_NO_TARGETS_ALLOCATED (0x02) +#define MPI3_PCIEIOUNIT0_INITSTATUS_RESOURCE_ALLOC_FAILED (0x03) +#define MPI3_PCIEIOUNIT0_INITSTATUS_BAD_NUM_PHYS (0x04) +#define MPI3_PCIEIOUNIT0_INITSTATUS_UNSUPPORTED_CONFIG (0x05) +#define MPI3_PCIEIOUNIT0_INITSTATUS_HOST_PORT_MISMATCH (0x06) +#define MPI3_PCIEIOUNIT0_INITSTATUS_PHYS_NOT_CONSECUTIVE (0x07) +#define MPI3_PCIEIOUNIT0_INITSTATUS_BAD_CLOCKING_MODE (0x08) +#define MPI3_PCIEIOUNIT0_INITSTATUS_PROD_SPEC_START (0xf0) +#define MPI3_PCIEIOUNIT0_INITSTATUS_PROD_SPEC_END (0xff) +#define MPI3_PCIEIOUNIT0_ASPM_SWITCH_STATES_MASK (0xc0) +#define MPI3_PCIEIOUNIT0_ASPM_SWITCH_STATES_SHIFT (6) +#define MPI3_PCIEIOUNIT0_ASPM_DIRECT_STATES_MASK (0x30) +#define MPI3_PCIEIOUNIT0_ASPM_DIRECT_STATES_SHIFT (4) +#define MPI3_PCIEIOUNIT0_ASPM_SWITCH_SUPPORT_MASK (0x0c) +#define MPI3_PCIEIOUNIT0_ASPM_SWITCH_SUPPORT_SHIFT (2) +#define MPI3_PCIEIOUNIT0_ASPM_DIRECT_SUPPORT_MASK (0x03) +#define MPI3_PCIEIOUNIT0_ASPM_DIRECT_SUPPORT_SHIFT (0) +struct mpi3_pcie_io_unit1_phy_data { + u8 link; + u8 link_flags; + u8 phy_flags; + u8 max_min_link_rate; + __le32 reserved04; + __le32 reserved08; +}; +#define MPI3_PCIEIOUNIT1_LINKFLAGS_PCIE_CLK_MODE_MASK (0x03) +#define MPI3_PCIEIOUNIT1_LINKFLAGS_PCIE_CLK_MODE_DIS_SEPARATE_REFCLK (0x00) +#define MPI3_PCIEIOUNIT1_LINKFLAGS_PCIE_CLK_MODE_EN_SRIS (0x01) +#define MPI3_PCIEIOUNIT1_LINKFLAGS_PCIE_CLK_MODE_EN_SRNS (0x02) +#define MPI3_PCIEIOUNIT1_PHYFLAGS_PHY_DISABLE (0x08) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_MASK (0xf0) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_SHIFT (4) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_2_5 (0x20) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_5_0 (0x30) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_8_0 (0x40) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_16_0 (0x50) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_32_0 (0x60) +#ifndef MPI3_PCIE_IO_UNIT1_PHY_MAX +#define MPI3_PCIE_IO_UNIT1_PHY_MAX (1) +#endif +struct mpi3_pcie_io_unit_page1 { + struct mpi3_config_page_header header; + __le32 control_flags; + __le32 reserved0c; + u8 num_phys; + u8 reserved11; + u8 aspm; + u8 reserved13; + struct mpi3_pcie_io_unit1_phy_data phy_data[MPI3_PCIE_IO_UNIT1_PHY_MAX]; +}; +#define MPI3_PCIEIOUNIT1_PAGEVERSION (0x00) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_OVERRIDE_DISABLE (0x80) +#define 
MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_DISABLE (0x40) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_MASK (0x30) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_SHIFT (4) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_SRIS_SRNS_DISABLED (0x00) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_SRIS_ENABLED (0x10) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_SRNS_ENABLED (0x20) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MASK (0x0f) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_2_5 (0x02) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_5_0 (0x03) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_8_0 (0x04) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_16_0 (0x05) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_32_0 (0x06) +#define MPI3_PCIEIOUNIT1_ASPM_SWITCH_MASK (0x0c) +#define MPI3_PCIEIOUNIT1_ASPM_SWITCH_SHIFT (2) +#define MPI3_PCIEIOUNIT1_ASPM_DIRECT_MASK (0x03) +#define MPI3_PCIEIOUNIT1_ASPM_DIRECT_SHIFT (0) +struct mpi3_pcie_io_unit_page2 { + struct mpi3_config_page_header header; + __le16 nvme_max_q_dx1; + __le16 nvme_max_q_dx2; + u8 nvme_abort_to; + u8 reserved0d; + __le16 nvme_max_q_dx4; +}; +#define MPI3_PCIEIOUNIT2_PAGEVERSION (0x00) +#define MPI3_PCIEIOUNIT3_ERROR_RECEIVER_ERROR (0) +#define MPI3_PCIEIOUNIT3_ERROR_RECOVERY (1) +#define MPI3_PCIEIOUNIT3_ERROR_CORRECTABLE_ERROR_MSG (2) +#define MPI3_PCIEIOUNIT3_ERROR_BAD_DLLP (3) +#define MPI3_PCIEIOUNIT3_ERROR_BAD_TLP (4) +#define MPI3_PCIEIOUNIT3_NUM_ERROR_INDEX (5) +struct mpi3_pcie_io_unit3_error { + __le16 threshold_count; + __le16 reserved02; +}; +struct mpi3_pcie_io_unit_page3 { + struct mpi3_config_page_header header; + u8 threshold_window; + u8 threshold_action; + u8 escalation_count; + u8 escalation_action; + u8 num_errors; + u8 reserved0d[3]; + struct mpi3_pcie_io_unit3_error error[MPI3_PCIEIOUNIT3_NUM_ERROR_INDEX]; +}; +#define MPI3_PCIEIOUNIT3_PAGEVERSION (0x00) +#define MPI3_PCIEIOUNIT3_ACTION_NO_ACTION (0x00) +#define MPI3_PCIEIOUNIT3_ACTION_HOT_RESET (0x01) +#define MPI3_PCIEIOUNIT3_ACTION_REDUCE_LINK_RATE_ONLY (0x02) +#define MPI3_PCIEIOUNIT3_ACTION_REDUCE_LINK_RATE_NO_ACCESS (0x03) +struct mpi3_pcie_switch_page0 { + struct mpi3_config_page_header header; + u8 io_unit_port; + u8 switch_status; + u8 reserved0a[2]; + __le16 dev_handle; + __le16 parent_dev_handle; + u8 num_ports; + u8 pcie_level; + __le16 reserved12; + __le32 reserved14; + __le32 reserved18; + __le32 reserved1c; +}; +#define MPI3_PCIESWITCH0_PAGEVERSION (0x00) +#define MPI3_PCIESWITCH0_SS_NOT_RESPONDING (0x02) +#define MPI3_PCIESWITCH0_SS_RESPONDING (0x03) +#define MPI3_PCIESWITCH0_SS_DELAY_NOT_RESPONDING (0x04) +struct mpi3_pcie_switch_page1 { + struct mpi3_config_page_header header; + u8 io_unit_port; + u8 flags; + __le16 reserved0a; + u8 num_ports; + u8 port_num; + __le16 attached_dev_handle; + __le16 switch_dev_handle; + u8 negotiated_port_width; + u8 negotiated_link_rate; + __le16 slot; + __le16 slot_index; + __le32 reserved18; +}; +#define MPI3_PCIESWITCH1_PAGEVERSION (0x00) +#define MPI3_PCIESWITCH1_FLAGS_ASPMSTATE_MASK (0x0c) +#define MPI3_PCIESWITCH1_FLAGS_ASPMSTATE_SHIFT (2) +#define MPI3_PCIESWITCH1_FLAGS_ASPMSUPPORT_MASK (0x03) +#define MPI3_PCIESWITCH1_FLAGS_ASPMSUPPORT_SHIFT (0) +#ifndef MPI3_PCIESWITCH2_MAX_NUM_PORTS +#define MPI3_PCIESWITCH2_MAX_NUM_PORTS (1) +#endif +struct mpi3_pcieswitch2_port_element { + __le16 link_change_count; + __le16 rate_change_count; + __le32 
reserved04; +}; +struct mpi3_pcie_switch_page2 { + struct mpi3_config_page_header header; + u8 num_ports; + u8 reserved09; + __le16 dev_handle; + __le32 reserved0c; + struct mpi3_pcieswitch2_port_element port[MPI3_PCIESWITCH2_MAX_NUM_PORTS]; +}; +#define MPI3_PCIESWITCH2_PAGEVERSION (0x00) +struct mpi3_pcie_link_page0 { + struct mpi3_config_page_header header; + u8 link; + u8 reserved09[3]; + __le32 reserved0c; + __le32 receiver_error_count; + __le32 recovery_count; + __le32 corr_error_msg_count; + __le32 non_fatal_error_msg_count; + __le32 fatal_error_msg_count; + __le32 non_fatal_error_count; + __le32 fatal_error_count; + __le32 bad_dllp_count; + __le32 bad_tlp_count; +}; +#define MPI3_PCIELINK0_PAGEVERSION (0x00) +struct mpi3_enclosure_page0 { + struct mpi3_config_page_header header; + __le64 enclosure_logical_id; + __le16 flags; + __le16 enclosure_handle; + __le16 num_slots; + __le16 reserved16; + u8 io_unit_port; + u8 enclosure_level; + __le16 sep_dev_handle; + u8 chassis_slot; + u8 reserved1d[3]; +}; +#define MPI3_ENCLOSURE0_PAGEVERSION (0x00) +#define MPI3_ENCLS0_FLAGS_ENCL_TYPE_MASK (0xc000) +#define MPI3_ENCLS0_FLAGS_ENCL_TYPE_VIRTUAL (0x0000) +#define MPI3_ENCLS0_FLAGS_ENCL_TYPE_SAS (0x4000) +#define MPI3_ENCLS0_FLAGS_ENCL_TYPE_PCIE (0x8000) +#define MPI3_ENCLS0_FLAGS_CHASSIS_SLOT_VALID (0x0020) +#define MPI3_ENCLS0_FLAGS_ENCL_DEV_PRESENT_MASK (0x0010) +#define MPI3_ENCLS0_FLAGS_ENCL_DEV_NOT_FOUND (0x0000) +#define MPI3_ENCLS0_FLAGS_ENCL_DEV_PRESENT (0x0010) +#define MPI3_ENCLS0_FLAGS_MNG_MASK (0x000f) +#define MPI3_ENCLS0_FLAGS_MNG_UNKNOWN (0x0000) +#define MPI3_ENCLS0_FLAGS_MNG_IOC_SES (0x0001) +#define MPI3_ENCLS0_FLAGS_MNG_SES_ENCLOSURE (0x0002) +#define MPI3_DEVICE_DEVFORM_SAS_SATA (0x00) +#define MPI3_DEVICE_DEVFORM_PCIE (0x01) +#define MPI3_DEVICE_DEVFORM_VD (0x02) +struct mpi3_device0_sas_sata_format { + __le64 sas_address; + __le16 flags; + __le16 device_info; + u8 phy_num; + u8 attached_phy_identifier; + u8 max_port_connections; + u8 zone_group; +}; +#define MPI3_DEVICE0_SASSATA_FLAGS_WRITE_SAME_UNMAP_NCQ (0x0400) +#define MPI3_DEVICE0_SASSATA_FLAGS_SLUMBER_CAP (0x0200) +#define MPI3_DEVICE0_SASSATA_FLAGS_PARTIAL_CAP (0x0100) +#define MPI3_DEVICE0_SASSATA_FLAGS_ASYNC_NOTIFY (0x0080) +#define MPI3_DEVICE0_SASSATA_FLAGS_SW_PRESERVE (0x0040) +#define MPI3_DEVICE0_SASSATA_FLAGS_UNSUPP_DEV (0x0020) +#define MPI3_DEVICE0_SASSATA_FLAGS_48BIT_LBA (0x0010) +#define MPI3_DEVICE0_SASSATA_FLAGS_SMART_SUPP (0x0008) +#define MPI3_DEVICE0_SASSATA_FLAGS_NCQ_SUPP (0x0004) +#define MPI3_DEVICE0_SASSATA_FLAGS_FUA_SUPP (0x0002) +#define MPI3_DEVICE0_SASSATA_FLAGS_PERSIST_CAP (0x0001) +struct mpi3_device0_pcie_format { + u8 supported_link_rates; + u8 max_port_width; + u8 negotiated_port_width; + u8 negotiated_link_rate; + u8 port_num; + u8 controller_reset_to; + __le16 device_info; + __le32 maximum_data_transfer_size; + __le32 capabilities; + __le16 noiob; + u8 nvme_abort_to; + u8 page_size; + __le16 shutdown_latency; + u8 recovery_info; + u8 reserved17; +}; +#define MPI3_DEVICE0_PCIE_LINK_RATE_32_0_SUPP (0x10) +#define MPI3_DEVICE0_PCIE_LINK_RATE_16_0_SUPP (0x08) +#define MPI3_DEVICE0_PCIE_LINK_RATE_8_0_SUPP (0x04) +#define MPI3_DEVICE0_PCIE_LINK_RATE_5_0_SUPP (0x02) +#define MPI3_DEVICE0_PCIE_LINK_RATE_2_5_SUPP (0x01) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK (0x0007) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NO_DEVICE (0x0000) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE (0x0001) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_SWITCH_DEVICE (0x0002) +#define 
MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_SCSI_DEVICE (0x0003) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_ASPM_MASK (0x0030) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_ASPM_SHIFT (4) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_MASK (0x00c0) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_SHIFT (6) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_0 (0x0000) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_1 (0x0040) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_2 (0x0080) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_3 (0x00c0) +#define MPI3_DEVICE0_PCIE_CAP_SGL_EXTRA_LENGTH_SUPPORTED (0x00000020) +#define MPI3_DEVICE0_PCIE_CAP_METADATA_SEPARATED (0x00000010) +#define MPI3_DEVICE0_PCIE_CAP_SGL_DWORD_ALIGN_REQUIRED (0x00000008) +#define MPI3_DEVICE0_PCIE_CAP_SGL_FORMAT_SGL (0x00000004) +#define MPI3_DEVICE0_PCIE_CAP_SGL_FORMAT_PRP (0x00000000) +#define MPI3_DEVICE0_PCIE_CAP_BIT_BUCKET_SGL_SUPP (0x00000002) +#define MPI3_DEVICE0_PCIE_CAP_SGL_SUPP (0x00000001) +#define MPI3_DEVICE0_PCIE_CAP_ASPM_MASK (0x000000c0) +#define MPI3_DEVICE0_PCIE_CAP_ASPM_SHIFT (6) +#define MPI3_DEVICE0_PCIE_RECOVER_METHOD_MASK (0xe0) +#define MPI3_DEVICE0_PCIE_RECOVER_METHOD_NS_MGMT (0x00) +#define MPI3_DEVICE0_PCIE_RECOVER_METHOD_FORMAT (0x20) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_MASK (0x1f) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_NO_NS (0x00) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_NO_NSID_1 (0x01) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_TOO_MANY_NS (0x02) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_PROTECTION (0x03) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_METADATA_SZ (0x04) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_LBA_DATA_SZ (0x05) +struct mpi3_device0_vd_format { + u8 vd_state; + u8 raid_level; + __le16 device_info; + __le16 flags; + __le16 io_throttle_group; + __le16 io_throttle_group_low; + __le16 io_throttle_group_high; + __le32 reserved0c; +}; +#define MPI3_DEVICE0_VD_STATE_OFFLINE (0x00) +#define MPI3_DEVICE0_VD_STATE_PARTIALLY_DEGRADED (0x01) +#define MPI3_DEVICE0_VD_STATE_DEGRADED (0x02) +#define MPI3_DEVICE0_VD_STATE_OPTIMAL (0x03) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_0 (0) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_1 (1) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_5 (5) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_6 (6) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_10 (10) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_50 (50) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_60 (60) +#define MPI3_DEVICE0_VD_DEVICE_INFO_HDD (0x0010) +#define MPI3_DEVICE0_VD_DEVICE_INFO_SSD (0x0008) +#define MPI3_DEVICE0_VD_DEVICE_INFO_NVME (0x0004) +#define MPI3_DEVICE0_VD_DEVICE_INFO_SATA (0x0002) +#define MPI3_DEVICE0_VD_DEVICE_INFO_SAS (0x0001) +#define MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_MASK (0xf000) +#define MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_SHIFT (12) +#define MPI3_DEVICE0_VD_FLAGS_METADATA_MODE_MASK (0x0003) +#define MPI3_DEVICE0_VD_FLAGS_METADATA_MODE_NONE (0x0000) +#define MPI3_DEVICE0_VD_FLAGS_METADATA_MODE_HOST (0x0001) +#define MPI3_DEVICE0_VD_FLAGS_METADATA_MODE_IOC (0x0002) +union mpi3_device0_dev_spec_format { + struct mpi3_device0_sas_sata_format sas_sata_format; + struct mpi3_device0_pcie_format pcie_format; + struct mpi3_device0_vd_format vd_format; +}; +struct mpi3_device_page0 { + struct mpi3_config_page_header header; + __le16 dev_handle; + __le16 parent_dev_handle; + __le16 slot; + __le16 enclosure_handle; + __le64 wwid; + __le16 persistent_id; + u8 io_unit_port; + u8 access_status; + __le16 flags; + __le16 reserved1e; + __le16 slot_index; + __le16 queue_depth; + u8 reserved24[3]; + u8 device_form; + union 
mpi3_device0_dev_spec_format device_specific; +}; +#define MPI3_DEVICE0_PAGEVERSION (0x00) +#define MPI3_DEVICE0_PARENT_INVALID (0xffff) +#define MPI3_DEVICE0_ENCLOSURE_HANDLE_NO_ENCLOSURE (0x0000) +#define MPI3_DEVICE0_WWID_INVALID (0xffffffffffffffff) +#define MPI3_DEVICE0_PERSISTENTID_INVALID (0xffff) +#define MPI3_DEVICE0_IOUNITPORT_INVALID (0xff) +#define MPI3_DEVICE0_ASTATUS_NO_ERRORS (0x00) +#define MPI3_DEVICE0_ASTATUS_NEEDS_INITIALIZATION (0x01) +#define MPI3_DEVICE0_ASTATUS_CAP_UNSUPPORTED (0x02) +#define MPI3_DEVICE0_ASTATUS_DEVICE_BLOCKED (0x03) +#define MPI3_DEVICE0_ASTATUS_UNAUTHORIZED (0x04) +#define MPI3_DEVICE0_ASTATUS_DEVICE_MISSING_DELAY (0x05) +#define MPI3_DEVICE0_ASTATUS_PREPARE (0x06) +#define MPI3_DEVICE0_ASTATUS_SAFE_MODE (0x07) +#define MPI3_DEVICE0_ASTATUS_GENERIC_MAX (0x0f) +#define MPI3_DEVICE0_ASTATUS_SAS_UNKNOWN (0x10) +#define MPI3_DEVICE0_ASTATUS_ROUTE_NOT_ADDRESSABLE (0x11) +#define MPI3_DEVICE0_ASTATUS_SMP_ERROR_NOT_ADDRESSABLE (0x12) +#define MPI3_DEVICE0_ASTATUS_SAS_MAX (0x1f) +#define MPI3_DEVICE0_ASTATUS_SIF_UNKNOWN (0x20) +#define MPI3_DEVICE0_ASTATUS_SIF_AFFILIATION_CONFLICT (0x21) +#define MPI3_DEVICE0_ASTATUS_SIF_DIAG (0x22) +#define MPI3_DEVICE0_ASTATUS_SIF_IDENTIFICATION (0x23) +#define MPI3_DEVICE0_ASTATUS_SIF_CHECK_POWER (0x24) +#define MPI3_DEVICE0_ASTATUS_SIF_PIO_SN (0x25) +#define MPI3_DEVICE0_ASTATUS_SIF_MDMA_SN (0x26) +#define MPI3_DEVICE0_ASTATUS_SIF_UDMA_SN (0x27) +#define MPI3_DEVICE0_ASTATUS_SIF_ZONING_VIOLATION (0x28) +#define MPI3_DEVICE0_ASTATUS_SIF_NOT_ADDRESSABLE (0x29) +#define MPI3_DEVICE0_ASTATUS_SIF_MAX (0x2f) +#define MPI3_DEVICE0_ASTATUS_PCIE_UNKNOWN (0x30) +#define MPI3_DEVICE0_ASTATUS_PCIE_MEM_SPACE_ACCESS (0x31) +#define MPI3_DEVICE0_ASTATUS_PCIE_UNSUPPORTED (0x32) +#define MPI3_DEVICE0_ASTATUS_PCIE_MSIX_REQUIRED (0x33) +#define MPI3_DEVICE0_ASTATUS_PCIE_ECRC_REQUIRED (0x34) +#define MPI3_DEVICE0_ASTATUS_PCIE_MAX (0x3f) +#define MPI3_DEVICE0_ASTATUS_NVME_UNKNOWN (0x40) +#define MPI3_DEVICE0_ASTATUS_NVME_READY_TIMEOUT (0x41) +#define MPI3_DEVICE0_ASTATUS_NVME_DEVCFG_UNSUPPORTED (0x42) +#define MPI3_DEVICE0_ASTATUS_NVME_IDENTIFY_FAILED (0x43) +#define MPI3_DEVICE0_ASTATUS_NVME_QCONFIG_FAILED (0x44) +#define MPI3_DEVICE0_ASTATUS_NVME_QCREATION_FAILED (0x45) +#define MPI3_DEVICE0_ASTATUS_NVME_EVENTCFG_FAILED (0x46) +#define MPI3_DEVICE0_ASTATUS_NVME_GET_FEATURE_STAT_FAILED (0x47) +#define MPI3_DEVICE0_ASTATUS_NVME_IDLE_TIMEOUT (0x48) +#define MPI3_DEVICE0_ASTATUS_NVME_CTRL_FAILURE_STATUS (0x49) +#define MPI3_DEVICE0_ASTATUS_NVME_INSUFFICIENT_POWER (0x4a) +#define MPI3_DEVICE0_ASTATUS_NVME_DOORBELL_STRIDE (0x4b) +#define MPI3_DEVICE0_ASTATUS_NVME_MEM_PAGE_MIN_SIZE (0x4c) +#define MPI3_DEVICE0_ASTATUS_NVME_MEMORY_ALLOCATION (0x4d) +#define MPI3_DEVICE0_ASTATUS_NVME_COMPLETION_TIME (0x4e) +#define MPI3_DEVICE0_ASTATUS_NVME_BAR (0x4f) +#define MPI3_DEVICE0_ASTATUS_NVME_NS_DESCRIPTOR (0x50) +#define MPI3_DEVICE0_ASTATUS_NVME_INCOMPATIBLE_SETTINGS (0x51) +#define MPI3_DEVICE0_ASTATUS_NVME_TOO_MANY_ERRORS (0x52) +#define MPI3_DEVICE0_ASTATUS_NVME_MAX (0x5f) +#define MPI3_DEVICE0_ASTATUS_VD_UNKNOWN (0x80) +#define MPI3_DEVICE0_ASTATUS_VD_MAX (0x8f) +#define MPI3_DEVICE0_FLAGS_CONTROLLER_DEV_HANDLE (0x0080) +#define MPI3_DEVICE0_FLAGS_IO_THROTTLING_REQUIRED (0x0010) +#define MPI3_DEVICE0_FLAGS_HIDDEN (0x0008) +#define MPI3_DEVICE0_FLAGS_ATT_METHOD_VIRTUAL (0x0004) +#define MPI3_DEVICE0_FLAGS_ATT_METHOD_DIR_ATTACHED (0x0002) +#define MPI3_DEVICE0_FLAGS_DEVICE_PRESENT (0x0001) +#define MPI3_DEVICE0_QUEUE_DEPTH_NOT_APPLICABLE (0x0000) 
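Aside (not part of the patch): Device Page 0, defined above, carries a per-form payload in the device_specific union selected by device_form. As a rough illustration of how a reader of this page might dispatch on that field before the Device Page 1 definitions that follow, here is a sketch; the helper name is hypothetical and it assumes the definitions above are in scope.

/* Illustration only -- hypothetical helper, not part of this patch. */
static void mpi3_show_device_page0(const struct mpi3_device_page0 *dev_pg0)
{
	u16 flags = le16_to_cpu(dev_pg0->flags);

	if (!(flags & MPI3_DEVICE0_FLAGS_DEVICE_PRESENT))
		return;

	switch (dev_pg0->device_form) {
	case MPI3_DEVICE_DEVFORM_SAS_SATA:
		pr_info("handle 0x%04x: SAS/SATA device, sas_address 0x%016llx\n",
			le16_to_cpu(dev_pg0->dev_handle),
			(unsigned long long)le64_to_cpu(dev_pg0->device_specific.sas_sata_format.sas_address));
		break;
	case MPI3_DEVICE_DEVFORM_PCIE:
		pr_info("handle 0x%04x: PCIe device, maximum_data_transfer_size %u\n",
			le16_to_cpu(dev_pg0->dev_handle),
			le32_to_cpu(dev_pg0->device_specific.pcie_format.maximum_data_transfer_size));
		break;
	case MPI3_DEVICE_DEVFORM_VD:
		pr_info("handle 0x%04x: virtual disk, RAID level %u\n",
			le16_to_cpu(dev_pg0->dev_handle),
			dev_pg0->device_specific.vd_format.raid_level);
		break;
	default:
		break;
	}
}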
+struct mpi3_device1_sas_sata_format { + __le32 reserved00; +}; +struct mpi3_device1_pcie_format { + __le16 vendor_id; + __le16 device_id; + __le16 subsystem_vendor_id; + __le16 subsystem_id; + __le32 reserved08; + u8 revision_id; + u8 reserved0d; + __le16 pci_parameters; +}; +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_128B (0x0) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_256B (0x1) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_512B (0x2) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_1024B (0x3) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_2048B (0x4) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_4096B (0x5) +#define MPI3_DEVICE1_PCIE_PARAMS_MAX_READ_REQ_MASK (0x01c0) +#define MPI3_DEVICE1_PCIE_PARAMS_MAX_READ_REQ_SHIFT (6) +#define MPI3_DEVICE1_PCIE_PARAMS_CURR_MAX_PAYLOAD_MASK (0x0038) +#define MPI3_DEVICE1_PCIE_PARAMS_CURR_MAX_PAYLOAD_SHIFT (3) +#define MPI3_DEVICE1_PCIE_PARAMS_SUPP_MAX_PAYLOAD_MASK (0x0007) +#define MPI3_DEVICE1_PCIE_PARAMS_SUPP_MAX_PAYLOAD_SHIFT (0) +struct mpi3_device1_vd_format { + __le32 reserved00; +}; +union mpi3_device1_dev_spec_format { + struct mpi3_device1_sas_sata_format sas_sata_format; + struct mpi3_device1_pcie_format pcie_format; + struct mpi3_device1_vd_format vd_format; +}; +struct mpi3_device_page1 { + struct mpi3_config_page_header header; + __le16 dev_handle; + __le16 reserved0a; + __le16 link_change_count; + __le16 rate_change_count; + __le16 tm_count; + __le16 reserved12; + __le32 reserved14[10]; + u8 reserved3c[3]; + u8 device_form; + union mpi3_device1_dev_spec_format device_specific; +}; +#define MPI3_DEVICE1_PAGEVERSION (0x00) +#define MPI3_DEVICE1_COUNTER_MAX (0xfffe) +#define MPI3_DEVICE1_COUNTER_INVALID (0xffff) +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_image.h b/drivers/scsi/mpi3mr/mpi/mpi30_image.h new file mode 100644 index 0000000000000..0d329eb74e083 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_image.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_IMAGE_H +#define MPI30_IMAGE_H 1 +struct mpi3_comp_image_version { + __le16 build_num; + __le16 customer_id; + u8 phase_minor; + u8 phase_major; + u8 gen_minor; + u8 gen_major; +}; +struct mpi3_hash_exclusion_format { + __le32 offset; + __le32 size; +}; +#define MPI3_IMAGE_HASH_EXCUSION_NUM (4) +struct mpi3_component_image_header { + __le32 signature0; + __le32 load_address; + __le32 data_size; + __le32 start_offset; + __le32 signature1; + __le32 flash_offset; + __le32 image_size; + __le32 version_string_offset; + __le32 build_date_string_offset; + __le32 build_time_string_offset; + __le32 environment_variable_offset; + __le32 application_specific; + __le32 signature2; + __le32 header_size; + __le32 crc; + __le32 flags; + __le32 secondary_flash_offset; + __le32 etp_offset; + __le32 etp_size; + union mpi3_version_union rmc_interface_version; + union mpi3_version_union etp_interface_version; + struct mpi3_comp_image_version component_image_version; + struct mpi3_hash_exclusion_format hash_exclusion[MPI3_IMAGE_HASH_EXCUSION_NUM]; + __le32 next_image_header_offset; + union mpi3_version_union security_version; + __le32 reserved84[31]; +}; +#define MPI3_IMAGE_HEADER_SIGNATURE0_MPI3 (0xeb00003e) +#define MPI3_IMAGE_HEADER_LOAD_ADDRESS_INVALID (0x00000000) +#define MPI3_IMAGE_HEADER_SIGNATURE1_APPLICATION (0x20505041) +#define MPI3_IMAGE_HEADER_SIGNATURE1_FIRST_MUTABLE (0x20434d46) +#define MPI3_IMAGE_HEADER_SIGNATURE1_BSP (0x20505342) +#define MPI3_IMAGE_HEADER_SIGNATURE1_ROM_BIOS (0x534f4942) +#define MPI3_IMAGE_HEADER_SIGNATURE1_HII_X64 (0x4d494948) +#define MPI3_IMAGE_HEADER_SIGNATURE1_HII_ARM (0x41494948) +#define MPI3_IMAGE_HEADER_SIGNATURE1_CPLD (0x444c5043) +#define MPI3_IMAGE_HEADER_SIGNATURE1_SPD (0x20445053) +#define MPI3_IMAGE_HEADER_SIGNATURE1_GAS_GAUGE (0x20534147) +#define MPI3_IMAGE_HEADER_SIGNATURE1_PBLP (0x504c4250) +#define MPI3_IMAGE_HEADER_SIGNATURE1_MANIFEST (0x464e414d) +#define MPI3_IMAGE_HEADER_SIGNATURE1_OEM (0x204d454f) +#define MPI3_IMAGE_HEADER_SIGNATURE2_VALUE (0x50584546) +#define MPI3_IMAGE_HEADER_FLAGS_DEVICE_KEY_BASIS_MASK (0x00000030) +#define MPI3_IMAGE_HEADER_FLAGS_DEVICE_KEY_BASIS_CDI (0x00000000) +#define MPI3_IMAGE_HEADER_FLAGS_DEVICE_KEY_BASIS_DI (0x00000010) +#define MPI3_IMAGE_HEADER_FLAGS_SIGNED_NVDATA (0x00000008) +#define MPI3_IMAGE_HEADER_FLAGS_REQUIRES_ACTIVATION (0x00000004) +#define MPI3_IMAGE_HEADER_FLAGS_COMPRESSED (0x00000002) +#define MPI3_IMAGE_HEADER_FLAGS_FLASH (0x00000001) +#define MPI3_IMAGE_HEADER_SIGNATURE0_OFFSET (0x00) +#define MPI3_IMAGE_HEADER_LOAD_ADDRESS_OFFSET (0x04) +#define MPI3_IMAGE_HEADER_DATA_SIZE_OFFSET (0x08) +#define MPI3_IMAGE_HEADER_START_OFFSET_OFFSET (0x0c) +#define MPI3_IMAGE_HEADER_SIGNATURE1_OFFSET (0x10) +#define MPI3_IMAGE_HEADER_FLASH_OFFSET_OFFSET (0x14) +#define MPI3_IMAGE_HEADER_FLASH_SIZE_OFFSET (0x18) +#define MPI3_IMAGE_HEADER_VERSION_STRING_OFFSET_OFFSET (0x1c) +#define MPI3_IMAGE_HEADER_BUILD_DATE_STRING_OFFSET_OFFSET (0x20) +#define MPI3_IMAGE_HEADER_BUILD_TIME_OFFSET_OFFSET (0x24) +#define MPI3_IMAGE_HEADER_ENVIROMENT_VAR_OFFSET_OFFSET (0x28) +#define MPI3_IMAGE_HEADER_APPLICATION_SPECIFIC_OFFSET (0x2c) +#define MPI3_IMAGE_HEADER_SIGNATURE2_OFFSET (0x30) +#define MPI3_IMAGE_HEADER_HEADER_SIZE_OFFSET (0x34) +#define MPI3_IMAGE_HEADER_CRC_OFFSET (0x38) +#define MPI3_IMAGE_HEADER_FLAGS_OFFSET (0x3c) +#define MPI3_IMAGE_HEADER_SECONDARY_FLASH_OFFSET_OFFSET (0x40) +#define MPI3_IMAGE_HEADER_ETP_OFFSET_OFFSET (0x44) +#define MPI3_IMAGE_HEADER_ETP_SIZE_OFFSET (0x48) +#define 
MPI3_IMAGE_HEADER_RMC_INTERFACE_VER_OFFSET (0x4c) +#define MPI3_IMAGE_HEADER_ETP_INTERFACE_VER_OFFSET (0x50) +#define MPI3_IMAGE_HEADER_COMPONENT_IMAGE_VER_OFFSET (0x54) +#define MPI3_IMAGE_HEADER_HASH_EXCLUSION_OFFSET (0x5c) +#define MPI3_IMAGE_HEADER_NEXT_IMAGE_HEADER_OFFSET_OFFSET (0x7c) +#define MPI3_IMAGE_HEADER_SIZE (0x100) +#ifndef MPI3_CI_MANIFEST_MPI_MAX +#define MPI3_CI_MANIFEST_MPI_MAX (1) +#endif +struct mpi3_ci_manifest_mpi_comp_image_ref { + __le32 signature1; + __le32 reserved04[3]; + struct mpi3_comp_image_version component_image_version; + __le32 component_image_version_string_offset; + __le32 crc; +}; +struct mpi3_ci_manifest_mpi { + u8 manifest_type; + u8 reserved01[3]; + __le32 reserved04[3]; + u8 num_image_references; + u8 release_level; + __le16 reserved12; + __le16 reserved14; + __le16 flags; + __le32 reserved18[2]; + __le16 vendor_id; + __le16 device_id; + __le16 subsystem_vendor_id; + __le16 subsystem_id; + __le32 reserved28[2]; + union mpi3_version_union package_security_version; + __le32 reserved34; + struct mpi3_comp_image_version package_version; + __le32 package_version_string_offset; + __le32 package_build_date_string_offset; + __le32 package_build_time_string_offset; + __le32 reserved4c; + __le32 diag_authorization_identifier[16]; + struct mpi3_ci_manifest_mpi_comp_image_ref component_image_ref[MPI3_CI_MANIFEST_MPI_MAX]; +}; +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_DEV (0x00) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_PREALPHA (0x10) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_ALPHA (0x20) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_BETA (0x30) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_RC (0x40) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_GCA (0x50) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_POINT (0x60) +#define MPI3_CI_MANIFEST_MPI_FLAGS_DIAG_AUTHORIZATION (0x01) +#define MPI3_CI_MANIFEST_MPI_SUBSYSTEMID_IGNORED (0xffff) +#define MPI3_CI_MANIFEST_MPI_PKG_VER_STR_OFF_UNSPECIFIED (0x00000000) +#define MPI3_CI_MANIFEST_MPI_PKG_BUILD_DATE_STR_OFF_UNSPECIFIED (0x00000000) +#define MPI3_CI_MANIFEST_MPI_PKG_BUILD_TIME_STR_OFF_UNSPECIFIED (0x00000000) +union mpi3_ci_manifest { + struct mpi3_ci_manifest_mpi mpi; + __le32 dword[1]; +}; +#define MPI3_CI_MANIFEST_TYPE_MPI (0x00) +struct mpi3_extended_image_header { + u8 image_type; + u8 reserved01[3]; + __le32 checksum; + __le32 image_size; + __le32 next_image_header_offset; + __le32 reserved10[4]; + __le32 identify_string[8]; +}; +#define MPI3_EXT_IMAGE_IMAGETYPE_OFFSET (0x00) +#define MPI3_EXT_IMAGE_IMAGESIZE_OFFSET (0x08) +#define MPI3_EXT_IMAGE_NEXTIMAGE_OFFSET (0x0c) +#define MPI3_EXT_IMAGE_HEADER_SIZE (0x40) +#define MPI3_EXT_IMAGE_TYPE_UNSPECIFIED (0x00) +#define MPI3_EXT_IMAGE_TYPE_NVDATA (0x03) +#define MPI3_EXT_IMAGE_TYPE_SUPPORTED_DEVICES (0x07) +#define MPI3_EXT_IMAGE_TYPE_ENCRYPTED_HASH (0x09) +#define MPI3_EXT_IMAGE_TYPE_RDE (0x0a) +#define MPI3_EXT_IMAGE_TYPE_AUXILIARY_PROCESSOR (0x0b) +#define MPI3_EXT_IMAGE_TYPE_MIN_PRODUCT_SPECIFIC (0x80) +#define MPI3_EXT_IMAGE_TYPE_MAX_PRODUCT_SPECIFIC (0xff) +struct mpi3_supported_device { + __le16 device_id; + __le16 vendor_id; + __le16 device_id_mask; + __le16 reserved06; + u8 low_pci_rev; + u8 high_pci_rev; + __le16 reserved0a; + __le32 reserved0c; +}; +#ifndef MPI3_SUPPORTED_DEVICE_MAX +#define MPI3_SUPPORTED_DEVICE_MAX (1) +#endif +struct mpi3_supported_devices_data { + u8 image_version; + u8 reserved01; + u8 num_devices; + u8 reserved03; + __le32 reserved04; + struct mpi3_supported_device supported_device[MPI3_SUPPORTED_DEVICE_MAX]; +}; +#ifndef 
MPI3_ENCRYPTED_HASH_MAX +#define MPI3_ENCRYPTED_HASH_MAX (1) +#endif +struct mpi3_encrypted_hash_entry { + u8 hash_image_type; + u8 hash_algorithm; + u8 encryption_algorithm; + u8 reserved03; + __le32 reserved04; + __le32 encrypted_hash[MPI3_ENCRYPTED_HASH_MAX]; +}; +#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_SIGNATURE (0x03) +#define MPI3_HASH_ALGORITHM_VERSION_MASK (0xe0) +#define MPI3_HASH_ALGORITHM_VERSION_NONE (0x00) +#define MPI3_HASH_ALGORITHM_VERSION_SHA1 (0x20) +#define MPI3_HASH_ALGORITHM_VERSION_SHA2 (0x40) +#define MPI3_HASH_ALGORITHM_VERSION_SHA3 (0x60) +#define MPI3_HASH_ALGORITHM_SIZE_MASK (0x1f) +#define MPI3_HASH_ALGORITHM_SIZE_UNUSED (0x00) +#define MPI3_HASH_ALGORITHM_SIZE_SHA256 (0x01) +#define MPI3_HASH_ALGORITHM_SIZE_SHA512 (0x02) +#define MPI3_HASH_ALGORITHM_SIZE_SHA384 (0x03) +#define MPI3_ENCRYPTION_ALGORITHM_UNUSED (0x00) +#define MPI3_ENCRYPTION_ALGORITHM_RSA256 (0x01) +#define MPI3_ENCRYPTION_ALGORITHM_RSA512 (0x02) +#define MPI3_ENCRYPTION_ALGORITHM_RSA1024 (0x03) +#define MPI3_ENCRYPTION_ALGORITHM_RSA2048 (0x04) +#define MPI3_ENCRYPTION_ALGORITHM_RSA4096 (0x05) +#define MPI3_ENCRYPTION_ALGORITHM_RSA3072 (0x06) +#ifndef MPI3_PUBLIC_KEY_MAX +#define MPI3_PUBLIC_KEY_MAX (1) +#endif +struct mpi3_encrypted_key_with_hash_entry { + u8 hash_image_type; + u8 hash_algorithm; + u8 encryption_algorithm; + u8 reserved03; + __le32 reserved04; + __le32 public_key[MPI3_PUBLIC_KEY_MAX]; +}; +#ifndef MPI3_ENCRYPTED_HASH_ENTRY_MAX +#define MPI3_ENCRYPTED_HASH_ENTRY_MAX (1) +#endif +struct mpi3_encrypted_hash_data { + u8 image_version; + u8 num_hash; + __le16 reserved02; + __le32 reserved04; + struct mpi3_encrypted_hash_entry encrypted_hash_entry[MPI3_ENCRYPTED_HASH_ENTRY_MAX]; +}; +#ifndef MPI3_AUX_PROC_DATA_MAX +#define MPI3_AUX_PROC_DATA_MAX (1) +#endif +struct mpi3_aux_processor_data { + u8 boot_method; + u8 num_load_addr; + u8 reserved02; + u8 type; + __le32 version; + __le32 load_address[8]; + __le32 reserved28[22]; + __le32 aux_processor_data[MPI3_AUX_PROC_DATA_MAX]; +}; +#define MPI3_AUX_PROC_DATA_OFFSET (0x80) +#define MPI3_AUXPROCESSOR_BOOT_METHOD_MO_MSG (0x00) +#define MPI3_AUXPROCESSOR_BOOT_METHOD_MO_DOORBELL (0x01) +#define MPI3_AUXPROCESSOR_BOOT_METHOD_COMPONENT (0x02) +#define MPI3_AUXPROCESSOR_TYPE_ARM_A15 (0x00) +#define MPI3_AUXPROCESSOR_TYPE_ARM_M0 (0x01) +#define MPI3_AUXPROCESSOR_TYPE_ARM_R4 (0x02) +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_init.h b/drivers/scsi/mpi3mr/mpi/mpi30_init.h new file mode 100644 index 0000000000000..f3ae04ac86608 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_init.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_INIT_H +#define MPI30_INIT_H 1 +struct mpi3_scsi_io_cdb_eedp32 { + u8 cdb[20]; + __be32 primary_reference_tag; + __le16 primary_application_tag; + __le16 primary_application_tag_mask; + __le32 transfer_length; +}; +union mpi3_scsi_io_cdb_union { + u8 cdb32[32]; + struct mpi3_scsi_io_cdb_eedp32 eedp32; + struct mpi3_sge_common sge; +}; +struct mpi3_scsi_io_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le32 flags; + __le32 skip_count; + __le32 data_length; + u8 lun[8]; + union mpi3_scsi_io_cdb_union cdb; + union mpi3_sge_union sgl[4]; +}; +#define MPI3_SCSIIO_MSGFLAGS_METASGL_VALID (0x80) +#define MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE (0x40) +#define MPI3_SCSIIO_FLAGS_LARGE_CDB (0x60000000) +#define MPI3_SCSIIO_FLAGS_CDB_16_OR_LESS (0x00000000) +#define MPI3_SCSIIO_FLAGS_CDB_GREATER_THAN_16 (0x20000000) +#define MPI3_SCSIIO_FLAGS_CDB_IN_SEPARATE_BUFFER (0x40000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_MASK (0x07000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_SIMPLEQ (0x00000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_HEADOFQ (0x01000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_ORDEREDQ (0x02000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_ACAQ (0x04000000) +#define MPI3_SCSIIO_FLAGS_CMDPRI_MASK (0x00f00000) +#define MPI3_SCSIIO_FLAGS_CMDPRI_SHIFT (20) +#define MPI3_SCSIIO_FLAGS_DATADIRECTION_MASK (0x000c0000) +#define MPI3_SCSIIO_FLAGS_DATADIRECTION_NO_DATA_TRANSFER (0x00000000) +#define MPI3_SCSIIO_FLAGS_DATADIRECTION_WRITE (0x00040000) +#define MPI3_SCSIIO_FLAGS_DATADIRECTION_READ (0x00080000) +#define MPI3_SCSIIO_FLAGS_DMAOPERATION_MASK (0x00030000) +#define MPI3_SCSIIO_FLAGS_DMAOPERATION_HOST_PI (0x00010000) +#define MPI3_SCSIIO_FLAGS_DIVERT_REASON_MASK (0x000000f0) +#define MPI3_SCSIIO_FLAGS_DIVERT_REASON_IO_THROTTLING (0x00000010) +#define MPI3_SCSIIO_FLAGS_DIVERT_REASON_PROD_SPECIFIC (0x00000080) +#define MPI3_SCSIIO_METASGL_INDEX (3) +struct mpi3_scsi_io_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 scsi_status; + u8 scsi_state; + __le16 dev_handle; + __le32 transfer_count; + __le32 sense_count; + __le32 response_data; + __le16 task_tag; + __le16 scsi_status_qualifier; + __le32 eedp_error_offset; + __le16 eedp_observed_app_tag; + __le16 eedp_observed_guard; + __le32 eedp_observed_ref_tag; + __le64 sense_data_buffer_address; +}; +#define MPI3_SCSIIO_REPLY_MSGFLAGS_REFTAG_OBSERVED_VALID (0x01) +#define MPI3_SCSIIO_REPLY_MSGFLAGS_APPTAG_OBSERVED_VALID (0x02) +#define MPI3_SCSIIO_REPLY_MSGFLAGS_GUARD_OBSERVED_VALID (0x04) +#define MPI3_SCSI_STATUS_GOOD (0x00) +#define MPI3_SCSI_STATUS_CHECK_CONDITION (0x02) +#define MPI3_SCSI_STATUS_CONDITION_MET (0x04) +#define MPI3_SCSI_STATUS_BUSY (0x08) +#define MPI3_SCSI_STATUS_INTERMEDIATE (0x10) +#define MPI3_SCSI_STATUS_INTERMEDIATE_CONDMET (0x14) +#define MPI3_SCSI_STATUS_RESERVATION_CONFLICT (0x18) +#define MPI3_SCSI_STATUS_COMMAND_TERMINATED (0x22) +#define MPI3_SCSI_STATUS_TASK_SET_FULL (0x28) +#define MPI3_SCSI_STATUS_ACA_ACTIVE (0x30) +#define MPI3_SCSI_STATUS_TASK_ABORTED (0x40) +#define MPI3_SCSI_STATE_SENSE_MASK (0x03) +#define MPI3_SCSI_STATE_SENSE_VALID (0x00) +#define MPI3_SCSI_STATE_SENSE_FAILED (0x01) +#define MPI3_SCSI_STATE_SENSE_BUFF_Q_EMPTY (0x02) +#define MPI3_SCSI_STATE_SENSE_NOT_AVAILABLE (0x03) +#define 
MPI3_SCSI_STATE_NO_SCSI_STATUS (0x04) +#define MPI3_SCSI_STATE_TERMINATED (0x08) +#define MPI3_SCSI_STATE_RESPONSE_DATA_VALID (0x10) +#define MPI3_SCSI_RSP_RESPONSECODE_MASK (0x000000ff) +#define MPI3_SCSI_RSP_RESPONSECODE_SHIFT (0) +#define MPI3_SCSI_RSP_ARI2_MASK (0x0000ff00) +#define MPI3_SCSI_RSP_ARI2_SHIFT (8) +#define MPI3_SCSI_RSP_ARI1_MASK (0x00ff0000) +#define MPI3_SCSI_RSP_ARI1_SHIFT (16) +#define MPI3_SCSI_RSP_ARI0_MASK (0xff000000) +#define MPI3_SCSI_RSP_ARI0_SHIFT (24) +#define MPI3_SCSI_TASKTAG_UNKNOWN (0xffff) +struct mpi3_scsi_task_mgmt_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le16 task_host_tag; + u8 task_type; + u8 reserved0f; + __le16 task_request_queue_id; + __le16 reserved12; + __le32 reserved14; + u8 lun[8]; +}; +#define MPI3_SCSITASKMGMT_MSGFLAGS_DO_NOT_SEND_TASK_IU (0x08) +#define MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK (0x01) +#define MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK_SET (0x02) +#define MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET (0x03) +#define MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET (0x05) +#define MPI3_SCSITASKMGMT_TASKTYPE_CLEAR_TASK_SET (0x06) +#define MPI3_SCSITASKMGMT_TASKTYPE_QUERY_TASK (0x07) +#define MPI3_SCSITASKMGMT_TASKTYPE_CLEAR_ACA (0x08) +#define MPI3_SCSITASKMGMT_TASKTYPE_QUERY_TASK_SET (0x09) +#define MPI3_SCSITASKMGMT_TASKTYPE_QUERY_ASYNC_EVENT (0x0a) +#define MPI3_SCSITASKMGMT_TASKTYPE_I_T_NEXUS_RESET (0x0b) +struct mpi3_scsi_task_mgmt_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 termination_count; + __le32 response_data; + __le32 reserved18; +}; +#define MPI3_SCSITASKMGMT_RSPCODE_TM_COMPLETE (0x00) +#define MPI3_SCSITASKMGMT_RSPCODE_INVALID_FRAME (0x02) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_FUNCTION_NOT_SUPPORTED (0x04) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_FAILED (0x05) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_SUCCEEDED (0x08) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_INVALID_LUN (0x09) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_OVERLAPPED_TAG (0x0a) +#define MPI3_SCSITASKMGMT_RSPCODE_IO_QUEUED_ON_IOC (0x80) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_NVME_DENIED (0x81) +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h b/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h new file mode 100644 index 0000000000000..9fb27cfcf28b4 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h @@ -0,0 +1,1021 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_IOC_H +#define MPI30_IOC_H 1 +struct mpi3_ioc_init_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + union mpi3_version_union mpi_version; + __le64 time_stamp; + u8 reserved18; + u8 who_init; + __le16 reserved1a; + __le16 reply_free_queue_depth; + __le16 reserved1e; + __le64 reply_free_queue_address; + __le32 reserved28; + __le16 sense_buffer_free_queue_depth; + __le16 sense_buffer_length; + __le64 sense_buffer_free_queue_address; + __le64 driver_information_address; +}; +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_MASK (0x03) +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_NOT_USED (0x00) +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_SEPARATED (0x01) +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_INLINE (0x02) +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_BOTH (0x03) +#define MPI3_WHOINIT_NOT_INITIALIZED (0x00) +#define MPI3_WHOINIT_ROM_BIOS (0x02) +#define MPI3_WHOINIT_HOST_DRIVER (0x03) +#define MPI3_WHOINIT_MANUFACTURER (0x04) +struct mpi3_driver_info_layout { + __le32 information_length; + u8 driver_signature[12]; + u8 os_name[16]; + u8 os_version[12]; + u8 driver_name[20]; + u8 driver_version[32]; + u8 driver_release_date[20]; + __le32 driver_capabilities; +}; +struct mpi3_ioc_facts_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + __le32 reserved0c; + union mpi3_sge_union sgl; +}; +struct mpi3_ioc_facts_data { + __le16 ioc_facts_data_length; + __le16 reserved02; + union mpi3_version_union mpi_version; + struct mpi3_comp_image_version fw_version; + __le32 ioc_capabilities; + u8 ioc_number; + u8 who_init; + __le16 max_msix_vectors; + __le16 max_outstanding_requests; + __le16 product_id; + __le16 ioc_request_frame_size; + __le16 reply_frame_size; + __le16 ioc_exceptions; + __le16 max_persistent_id; + u8 sge_modifier_mask; + u8 sge_modifier_value; + u8 sge_modifier_shift; + u8 protocol_flags; + __le16 max_sas_initiators; + __le16 max_data_length; + __le16 max_sas_expanders; + __le16 max_enclosures; + __le16 min_dev_handle; + __le16 max_dev_handle; + __le16 max_pcie_switches; + __le16 max_nvme; + __le16 reserved38; + __le16 max_vds; + __le16 max_host_pds; + __le16 max_adv_host_pds; + __le16 max_raid_pds; + __le16 max_posted_cmd_buffers; + __le32 flags; + __le16 max_operational_request_queues; + __le16 max_operational_reply_queues; + __le16 shutdown_timeout; + __le16 reserved4e; + __le32 diag_trace_size; + __le32 diag_fw_size; + __le32 diag_driver_size; + u8 max_host_pd_ns_count; + u8 max_adv_host_pd_ns_count; + u8 max_raidpd_ns_count; + u8 max_devices_per_throttle_group; + __le16 io_throttle_data_length; + __le16 max_io_throttle_group; + __le16 io_throttle_low; + __le16 io_throttle_high; +}; +#define MPI3_IOCFACTS_CAPABILITY_NON_SUPERVISOR_MASK (0x80000000) +#define MPI3_IOCFACTS_CAPABILITY_SUPERVISOR_IOC (0x00000000) +#define MPI3_IOCFACTS_CAPABILITY_NON_SUPERVISOR_IOC (0x80000000) +#define MPI3_IOCFACTS_CAPABILITY_INT_COALESCE_MASK (0x00000600) +#define MPI3_IOCFACTS_CAPABILITY_INT_COALESCE_FIXED_THRESHOLD (0x00000000) +#define MPI3_IOCFACTS_CAPABILITY_INT_COALESCE_OUTSTANDING_IO (0x00000200) +#define MPI3_IOCFACTS_CAPABILITY_COMPLETE_RESET_CAPABLE (0x00000100) +#define MPI3_IOCFACTS_CAPABILITY_SEG_DIAG_TRACE_ENABLED (0x00000080) +#define MPI3_IOCFACTS_CAPABILITY_SEG_DIAG_FW_ENABLED (0x00000040) +#define 
MPI3_IOCFACTS_CAPABILITY_SEG_DIAG_DRIVER_ENABLED (0x00000020) +#define MPI3_IOCFACTS_CAPABILITY_ADVANCED_HOST_PD_ENABLED (0x00000010) +#define MPI3_IOCFACTS_CAPABILITY_RAID_CAPABLE (0x00000008) +#define MPI3_IOCFACTS_CAPABILITY_MULTIPATH_ENABLED (0x00000002) +#define MPI3_IOCFACTS_CAPABILITY_COALESCE_CTRL_SUPPORTED (0x00000001) +#define MPI3_IOCFACTS_PID_TYPE_MASK (0xf000) +#define MPI3_IOCFACTS_PID_TYPE_SHIFT (12) +#define MPI3_IOCFACTS_PID_PRODUCT_MASK (0x0f00) +#define MPI3_IOCFACTS_PID_PRODUCT_SHIFT (8) +#define MPI3_IOCFACTS_PID_FAMILY_MASK (0x00ff) +#define MPI3_IOCFACTS_PID_FAMILY_SHIFT (0) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_REKEY (0x2000) +#define MPI3_IOCFACTS_EXCEPT_SAS_DISABLED (0x1000) +#define MPI3_IOCFACTS_EXCEPT_SAFE_MODE (0x0800) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_MASK (0x0700) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_NONE (0x0000) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_LOCAL_VIA_MGMT (0x0100) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_EXT_VIA_MGMT (0x0200) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_DRIVE_EXT_VIA_MGMT (0x0300) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_LOCAL_VIA_OOB (0x0400) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_EXT_VIA_OOB (0x0500) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_DRIVE_EXT_VIA_OOB (0x0600) +#define MPI3_IOCFACTS_EXCEPT_PCIE_DISABLED (0x0080) +#define MPI3_IOCFACTS_EXCEPT_PARTIAL_MEMORY_FAILURE (0x0040) +#define MPI3_IOCFACTS_EXCEPT_MANUFACT_CHECKSUM_FAIL (0x0020) +#define MPI3_IOCFACTS_EXCEPT_FW_CHECKSUM_FAIL (0x0010) +#define MPI3_IOCFACTS_EXCEPT_CONFIG_CHECKSUM_FAIL (0x0008) +#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_MASK (0x0001) +#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_PRIMARY (0x0000) +#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_SECONDARY (0x0001) +#define MPI3_IOCFACTS_PROTOCOL_SAS (0x0010) +#define MPI3_IOCFACTS_PROTOCOL_SATA (0x0008) +#define MPI3_IOCFACTS_PROTOCOL_NVME (0x0004) +#define MPI3_IOCFACTS_PROTOCOL_SCSI_INITIATOR (0x0002) +#define MPI3_IOCFACTS_PROTOCOL_SCSI_TARGET (0x0001) +#define MPI3_IOCFACTS_MAX_DATA_LENGTH_NOT_REPORTED (0x0000) +#define MPI3_IOCFACTS_FLAGS_SIGNED_NVDATA_REQUIRED (0x00010000) +#define MPI3_IOCFACTS_FLAGS_DMA_ADDRESS_WIDTH_MASK (0x0000ff00) +#define MPI3_IOCFACTS_FLAGS_DMA_ADDRESS_WIDTH_SHIFT (8) +#define MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_MASK (0x00000030) +#define MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_NOT_STARTED (0x00000000) +#define MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_IN_PROGRESS (0x00000010) +#define MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_COMPLETE (0x00000020) +#define MPI3_IOCFACTS_FLAGS_PERSONALITY_MASK (0x0000000f) +#define MPI3_IOCFACTS_FLAGS_PERSONALITY_EHBA (0x00000000) +#define MPI3_IOCFACTS_FLAGS_PERSONALITY_RAID_DDR (0x00000002) +#define MPI3_IOCFACTS_IO_THROTTLE_DATA_LENGTH_NOT_REQUIRED (0x0000) +#define MPI3_IOCFACTS_MAX_IO_THROTTLE_GROUP_NOT_REQUIRED (0x0000) +struct mpi3_mgmt_passthrough_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + __le32 reserved0c[5]; + union mpi3_sge_union command_sgl; + union mpi3_sge_union response_sgl; +}; +struct mpi3_create_request_queue_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 flags; + u8 burst; + __le16 size; + __le16 queue_id; + __le16 reply_queue_id; + __le16 reserved12; + __le32 reserved14; + __le64 base_address; +}; +#define MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_MASK (0x80) +#define 
MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_SEGMENTED (0x80) +#define MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_CONTIGUOUS (0x00) +#define MPI3_CREATE_REQUEST_QUEUE_SIZE_MINIMUM (2) +struct mpi3_delete_request_queue_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 queue_id; +}; +struct mpi3_create_reply_queue_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 flags; + u8 reserved0b; + __le16 size; + __le16 queue_id; + __le16 msix_index; + __le16 reserved12; + __le32 reserved14; + __le64 base_address; +}; +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_SEGMENTED_MASK (0x80) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_SEGMENTED_SEGMENTED (0x80) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_SEGMENTED_CONTIGUOUS (0x00) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_COALESCE_DISABLE (0x02) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_INT_ENABLE_MASK (0x01) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_INT_ENABLE_DISABLE (0x00) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_INT_ENABLE_ENABLE (0x01) +#define MPI3_CREATE_REPLY_QUEUE_SIZE_MINIMUM (2) +struct mpi3_delete_reply_queue_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 queue_id; +}; +struct mpi3_port_enable_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; +}; +#define MPI3_EVENT_LOG_DATA (0x01) +#define MPI3_EVENT_CHANGE (0x02) +#define MPI3_EVENT_GPIO_INTERRUPT (0x04) +#define MPI3_EVENT_CABLE_MGMT (0x06) +#define MPI3_EVENT_DEVICE_ADDED (0x07) +#define MPI3_EVENT_DEVICE_INFO_CHANGED (0x08) +#define MPI3_EVENT_PREPARE_FOR_RESET (0x09) +#define MPI3_EVENT_COMP_IMAGE_ACT_START (0x0a) +#define MPI3_EVENT_ENCL_DEVICE_ADDED (0x0b) +#define MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE (0x0c) +#define MPI3_EVENT_DEVICE_STATUS_CHANGE (0x0d) +#define MPI3_EVENT_ENERGY_PACK_CHANGE (0x0e) +#define MPI3_EVENT_SAS_DISCOVERY (0x11) +#define MPI3_EVENT_SAS_BROADCAST_PRIMITIVE (0x12) +#define MPI3_EVENT_SAS_NOTIFY_PRIMITIVE (0x13) +#define MPI3_EVENT_SAS_INIT_DEVICE_STATUS_CHANGE (0x14) +#define MPI3_EVENT_SAS_INIT_TABLE_OVERFLOW (0x15) +#define MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST (0x16) +#define MPI3_EVENT_SAS_PHY_COUNTER (0x18) +#define MPI3_EVENT_SAS_DEVICE_DISCOVERY_ERROR (0x19) +#define MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST (0x20) +#define MPI3_EVENT_PCIE_ENUMERATION (0x22) +#define MPI3_EVENT_PCIE_ERROR_THRESHOLD (0x23) +#define MPI3_EVENT_HARD_RESET_RECEIVED (0x40) +#define MPI3_EVENT_DIAGNOSTIC_BUFFER_STATUS_CHANGE (0x50) +#define MPI3_EVENT_MIN_PRODUCT_SPECIFIC (0x60) +#define MPI3_EVENT_MAX_PRODUCT_SPECIFIC (0x7f) +#define MPI3_EVENT_NOTIFY_EVENTMASK_WORDS (4) +struct mpi3_event_notification_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + __le16 sas_broadcast_primitive_masks; + __le16 sas_notify_primitive_masks; + __le32 event_masks[MPI3_EVENT_NOTIFY_EVENTMASK_WORDS]; +}; +struct mpi3_event_notification_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 event_data_length; + u8 event; + __le16 
ioc_change_count; + __le32 event_context; + __le32 event_data[1]; +}; +#define MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_MASK (0x01) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_REQUIRED (0x01) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_NOT_REQUIRED (0x00) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_EVENT_ORIGINALITY_MASK (0x02) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_EVENT_ORIGINALITY_ORIGINAL (0x00) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_EVENT_ORIGINALITY_REPLAY (0x02) +struct mpi3_event_data_gpio_interrupt { + u8 gpio_num; + u8 reserved01[3]; +}; +struct mpi3_event_data_cable_management { + __le32 active_cable_power_requirement; + u8 status; + u8 receptacle_id; + __le16 reserved06; +}; +#define MPI3_EVENT_CABLE_MGMT_ACT_CABLE_PWR_INVALID (0xffffffff) +#define MPI3_EVENT_CABLE_MGMT_STATUS_INSUFFICIENT_POWER (0x00) +#define MPI3_EVENT_CABLE_MGMT_STATUS_PRESENT (0x01) +#define MPI3_EVENT_CABLE_MGMT_STATUS_DEGRADED (0x02) +struct mpi3_event_ack_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 event; + u8 reserved0d[3]; + __le32 event_context; +}; +struct mpi3_event_data_prepare_for_reset { + u8 reason_code; + u8 reserved01; + __le16 reserved02; +}; +#define MPI3_EVENT_PREPARE_RESET_RC_START (0x01) +#define MPI3_EVENT_PREPARE_RESET_RC_ABORT (0x02) +struct mpi3_event_data_comp_image_activation { + __le32 reserved00; +}; +struct mpi3_event_data_device_status_change { + __le16 task_tag; + u8 reason_code; + u8 io_unit_port; + __le16 parent_dev_handle; + __le16 dev_handle; + __le64 wwid; + u8 lun[8]; +}; +#define MPI3_EVENT_DEV_STAT_RC_MOVED (0x01) +#define MPI3_EVENT_DEV_STAT_RC_HIDDEN (0x02) +#define MPI3_EVENT_DEV_STAT_RC_NOT_HIDDEN (0x03) +#define MPI3_EVENT_DEV_STAT_RC_ASYNC_NOTIFICATION (0x04) +#define MPI3_EVENT_DEV_STAT_RC_INT_DEVICE_RESET_STRT (0x20) +#define MPI3_EVENT_DEV_STAT_RC_INT_DEVICE_RESET_CMP (0x21) +#define MPI3_EVENT_DEV_STAT_RC_INT_TASK_ABORT_STRT (0x22) +#define MPI3_EVENT_DEV_STAT_RC_INT_TASK_ABORT_CMP (0x23) +#define MPI3_EVENT_DEV_STAT_RC_INT_IT_NEXUS_RESET_STRT (0x24) +#define MPI3_EVENT_DEV_STAT_RC_INT_IT_NEXUS_RESET_CMP (0x25) +#define MPI3_EVENT_DEV_STAT_RC_PCIE_HOT_RESET_FAILED (0x30) +#define MPI3_EVENT_DEV_STAT_RC_EXPANDER_REDUCED_FUNC_STRT (0x40) +#define MPI3_EVENT_DEV_STAT_RC_EXPANDER_REDUCED_FUNC_CMP (0x41) +#define MPI3_EVENT_DEV_STAT_RC_VD_NOT_RESPONDING (0x50) +struct mpi3_event_data_energy_pack_change { + __le32 reserved00; + __le16 shutdown_timeout; + __le16 reserved06; +}; +struct mpi3_event_data_sas_discovery { + u8 flags; + u8 reason_code; + u8 io_unit_port; + u8 reserved03; + __le32 discovery_status; +}; +#define MPI3_EVENT_SAS_DISC_FLAGS_DEVICE_CHANGE (0x02) +#define MPI3_EVENT_SAS_DISC_FLAGS_IN_PROGRESS (0x01) +#define MPI3_EVENT_SAS_DISC_RC_STARTED (0x01) +#define MPI3_EVENT_SAS_DISC_RC_COMPLETED (0x02) +#define MPI3_SAS_DISC_STATUS_MAX_ENCLOSURES_EXCEED (0x80000000) +#define MPI3_SAS_DISC_STATUS_MAX_EXPANDERS_EXCEED (0x40000000) +#define MPI3_SAS_DISC_STATUS_MAX_DEVICES_EXCEED (0x20000000) +#define MPI3_SAS_DISC_STATUS_MAX_TOPO_PHYS_EXCEED (0x10000000) +#define MPI3_SAS_DISC_STATUS_INVALID_CEI (0x00010000) +#define MPI3_SAS_DISC_STATUS_FECEI_MISMATCH (0x00008000) +#define MPI3_SAS_DISC_STATUS_MULTIPLE_DEVICES_IN_SLOT (0x00004000) +#define MPI3_SAS_DISC_STATUS_NECEI_MISMATCH (0x00002000) +#define MPI3_SAS_DISC_STATUS_TOO_MANY_SLOTS (0x00001000) +#define MPI3_SAS_DISC_STATUS_EXP_MULTI_SUBTRACTIVE (0x00000800) +#define MPI3_SAS_DISC_STATUS_MULTI_PORT_DOMAIN 
(0x00000400) +#define MPI3_SAS_DISC_STATUS_TABLE_TO_SUBTRACTIVE_LINK (0x00000200) +#define MPI3_SAS_DISC_STATUS_UNSUPPORTED_DEVICE (0x00000100) +#define MPI3_SAS_DISC_STATUS_TABLE_LINK (0x00000080) +#define MPI3_SAS_DISC_STATUS_SUBTRACTIVE_LINK (0x00000040) +#define MPI3_SAS_DISC_STATUS_SMP_CRC_ERROR (0x00000020) +#define MPI3_SAS_DISC_STATUS_SMP_FUNCTION_FAILED (0x00000010) +#define MPI3_SAS_DISC_STATUS_SMP_TIMEOUT (0x00000008) +#define MPI3_SAS_DISC_STATUS_MULTIPLE_PORTS (0x00000004) +#define MPI3_SAS_DISC_STATUS_INVALID_SAS_ADDRESS (0x00000002) +#define MPI3_SAS_DISC_STATUS_LOOP_DETECTED (0x00000001) +struct mpi3_event_data_sas_broadcast_primitive { + u8 phy_num; + u8 io_unit_port; + u8 port_width; + u8 primitive; +}; +#define MPI3_EVENT_BROADCAST_PRIMITIVE_CHANGE (0x01) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_SES (0x02) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_EXPANDER (0x03) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_ASYNCHRONOUS_EVENT (0x04) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_RESERVED3 (0x05) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_RESERVED4 (0x06) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_CHANGE0_RESERVED (0x07) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_CHANGE1_RESERVED (0x08) +struct mpi3_event_data_sas_notify_primitive { + u8 phy_num; + u8 io_unit_port; + u8 reserved02; + u8 primitive; +}; +#define MPI3_EVENT_NOTIFY_PRIMITIVE_ENABLE_SPINUP (0x01) +#define MPI3_EVENT_NOTIFY_PRIMITIVE_POWER_LOSS_EXPECTED (0x02) +#define MPI3_EVENT_NOTIFY_PRIMITIVE_RESERVED1 (0x03) +#define MPI3_EVENT_NOTIFY_PRIMITIVE_RESERVED2 (0x04) +#ifndef MPI3_EVENT_SAS_TOPO_PHY_COUNT +#define MPI3_EVENT_SAS_TOPO_PHY_COUNT (1) +#endif +struct mpi3_event_sas_topo_phy_entry { + __le16 attached_dev_handle; + u8 link_rate; + u8 status; +}; +#define MPI3_EVENT_SAS_TOPO_LR_CURRENT_MASK (0xf0) +#define MPI3_EVENT_SAS_TOPO_LR_CURRENT_SHIFT (4) +#define MPI3_EVENT_SAS_TOPO_LR_PREV_MASK (0x0f) +#define MPI3_EVENT_SAS_TOPO_LR_PREV_SHIFT (0) +#define MPI3_EVENT_SAS_TOPO_LR_UNKNOWN_LINK_RATE (0x00) +#define MPI3_EVENT_SAS_TOPO_LR_PHY_DISABLED (0x01) +#define MPI3_EVENT_SAS_TOPO_LR_NEGOTIATION_FAILED (0x02) +#define MPI3_EVENT_SAS_TOPO_LR_SATA_OOB_COMPLETE (0x03) +#define MPI3_EVENT_SAS_TOPO_LR_PORT_SELECTOR (0x04) +#define MPI3_EVENT_SAS_TOPO_LR_SMP_RESET_IN_PROGRESS (0x05) +#define MPI3_EVENT_SAS_TOPO_LR_UNSUPPORTED_PHY (0x06) +#define MPI3_EVENT_SAS_TOPO_LR_RATE_6_0 (0x0a) +#define MPI3_EVENT_SAS_TOPO_LR_RATE_12_0 (0x0b) +#define MPI3_EVENT_SAS_TOPO_LR_RATE_22_5 (0x0c) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_MASK (0xc0) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_SHIFT (6) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_ACCESSIBLE (0x00) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_NO_EXIST (0x40) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_VACANT (0x80) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_MASK (0x0f) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_TARG_NOT_RESPONDING (0x02) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_PHY_CHANGED (0x03) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_NO_CHANGE (0x04) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_DELAY_NOT_RESPONDING (0x05) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_RESPONDING (0x06) +struct mpi3_event_data_sas_topology_change_list { + __le16 enclosure_handle; + __le16 expander_dev_handle; + u8 num_phys; + u8 reserved05[3]; + u8 num_entries; + u8 start_phy_num; + u8 exp_status; + u8 io_unit_port; + struct mpi3_event_sas_topo_phy_entry phy_entry[MPI3_EVENT_SAS_TOPO_PHY_COUNT]; +}; +#define MPI3_EVENT_SAS_TOPO_ES_NO_EXPANDER (0x00) +#define MPI3_EVENT_SAS_TOPO_ES_NOT_RESPONDING (0x02) +#define MPI3_EVENT_SAS_TOPO_ES_RESPONDING (0x03) +#define 
MPI3_EVENT_SAS_TOPO_ES_DELAY_NOT_RESPONDING (0x04) +struct mpi3_event_data_sas_phy_counter { + __le64 time_stamp; + __le32 reserved08; + u8 phy_event_code; + u8 phy_num; + __le16 reserved0e; + __le32 phy_event_info; + u8 counter_type; + u8 threshold_window; + u8 time_units; + u8 reserved17; + __le32 event_threshold; + __le16 threshold_flags; + __le16 reserved1e; +}; +struct mpi3_event_data_sas_device_disc_err { + __le16 dev_handle; + u8 reason_code; + u8 io_unit_port; + __le32 reserved04; + __le64 sas_address; +}; +#define MPI3_EVENT_SAS_DISC_ERR_RC_SMP_FAILED (0x01) +#define MPI3_EVENT_SAS_DISC_ERR_RC_SMP_TIMEOUT (0x02) +struct mpi3_event_data_pcie_enumeration { + u8 flags; + u8 reason_code; + u8 io_unit_port; + u8 reserved03; + __le32 enumeration_status; +}; +#define MPI3_EVENT_PCIE_ENUM_FLAGS_DEVICE_CHANGE (0x02) +#define MPI3_EVENT_PCIE_ENUM_FLAGS_IN_PROGRESS (0x01) +#define MPI3_EVENT_PCIE_ENUM_RC_STARTED (0x01) +#define MPI3_EVENT_PCIE_ENUM_RC_COMPLETED (0x02) +#define MPI3_EVENT_PCIE_ENUM_ES_MAX_SWITCH_DEPTH_EXCEED (0x80000000) +#define MPI3_EVENT_PCIE_ENUM_ES_MAX_SWITCHES_EXCEED (0x40000000) +#define MPI3_EVENT_PCIE_ENUM_ES_MAX_DEVICES_EXCEED (0x20000000) +#define MPI3_EVENT_PCIE_ENUM_ES_RESOURCES_EXHAUSTED (0x10000000) +#ifndef MPI3_EVENT_PCIE_TOPO_PORT_COUNT +#define MPI3_EVENT_PCIE_TOPO_PORT_COUNT (1) +#endif +struct mpi3_event_pcie_topo_port_entry { + __le16 attached_dev_handle; + u8 port_status; + u8 reserved03; + u8 current_port_info; + u8 reserved05; + u8 previous_port_info; + u8 reserved07; +}; +#define MPI3_EVENT_PCIE_TOPO_PS_NOT_RESPONDING (0x02) +#define MPI3_EVENT_PCIE_TOPO_PS_PORT_CHANGED (0x03) +#define MPI3_EVENT_PCIE_TOPO_PS_NO_CHANGE (0x04) +#define MPI3_EVENT_PCIE_TOPO_PS_DELAY_NOT_RESPONDING (0x05) +#define MPI3_EVENT_PCIE_TOPO_PS_RESPONDING (0x06) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_MASK (0xf0) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_UNKNOWN (0x00) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_1 (0x10) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_2 (0x20) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_4 (0x30) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_8 (0x40) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_16 (0x50) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_MASK (0x0f) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_UNKNOWN (0x00) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_DISABLED (0x01) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_2_5 (0x02) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_5_0 (0x03) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_8_0 (0x04) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_16_0 (0x05) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_32_0 (0x06) +struct mpi3_event_data_pcie_topology_change_list { + __le16 enclosure_handle; + __le16 switch_dev_handle; + u8 num_ports; + u8 reserved05[3]; + u8 num_entries; + u8 start_port_num; + u8 switch_status; + u8 io_unit_port; + __le32 reserved0c; + struct mpi3_event_pcie_topo_port_entry port_entry[MPI3_EVENT_PCIE_TOPO_PORT_COUNT]; +}; +#define MPI3_EVENT_PCIE_TOPO_SS_NO_PCIE_SWITCH (0x00) +#define MPI3_EVENT_PCIE_TOPO_SS_NOT_RESPONDING (0x02) +#define MPI3_EVENT_PCIE_TOPO_SS_RESPONDING (0x03) +#define MPI3_EVENT_PCIE_TOPO_SS_DELAY_NOT_RESPONDING (0x04) +struct mpi3_event_data_pcie_error_threshold { + __le64 timestamp; + u8 reason_code; + u8 port; + __le16 switch_dev_handle; + u8 error; + u8 action; + __le16 threshold_count; + __le16 attached_dev_handle; + __le16 reserved12; +}; +#define MPI3_EVENT_PCI_ERROR_RC_THRESHOLD_EXCEEDED (0x00) +#define MPI3_EVENT_PCI_ERROR_RC_ESCALATION (0x01) +struct mpi3_event_data_sas_init_dev_status_change { + u8 reason_code; + u8 io_unit_port; + __le16 
dev_handle; + __le32 reserved04; + __le64 sas_address; +}; +#define MPI3_EVENT_SAS_INIT_RC_ADDED (0x01) +#define MPI3_EVENT_SAS_INIT_RC_NOT_RESPONDING (0x02) +struct mpi3_event_data_sas_init_table_overflow { + __le16 max_init; + __le16 current_init; + __le32 reserved04; + __le64 sas_address; +}; +struct mpi3_event_data_hard_reset_received { + u8 reserved00; + u8 io_unit_port; + __le16 reserved02; +}; +struct mpi3_event_data_diag_buffer_status_change { + u8 type; + u8 reason_code; + __le16 reserved02; + __le32 reserved04; +}; +#define MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_RELEASED (0x01) +#define MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_PAUSED (0x02) +#define MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_RESUMED (0x03) +#define MPI3_PEL_LOCALE_FLAGS_NON_BLOCKING_BOOT_EVENT (0x0200) +#define MPI3_PEL_LOCALE_FLAGS_BLOCKING_BOOT_EVENT (0x0100) +#define MPI3_PEL_LOCALE_FLAGS_PCIE (0x0080) +#define MPI3_PEL_LOCALE_FLAGS_CONFIGURATION (0x0040) +#define MPI3_PEL_LOCALE_FLAGS_CONTROLER (0x0020) +#define MPI3_PEL_LOCALE_FLAGS_SAS (0x0010) +#define MPI3_PEL_LOCALE_FLAGS_EPACK (0x0008) +#define MPI3_PEL_LOCALE_FLAGS_ENCLOSURE (0x0004) +#define MPI3_PEL_LOCALE_FLAGS_PD (0x0002) +#define MPI3_PEL_LOCALE_FLAGS_VD (0x0001) +#define MPI3_PEL_CLASS_DEBUG (0x00) +#define MPI3_PEL_CLASS_PROGRESS (0x01) +#define MPI3_PEL_CLASS_INFORMATIONAL (0x02) +#define MPI3_PEL_CLASS_WARNING (0x03) +#define MPI3_PEL_CLASS_CRITICAL (0x04) +#define MPI3_PEL_CLASS_FATAL (0x05) +#define MPI3_PEL_CLASS_FAULT (0x06) +#define MPI3_PEL_CLEARTYPE_CLEAR (0x00) +#define MPI3_PEL_WAITTIME_INFINITE_WAIT (0x00) +#define MPI3_PEL_ACTION_GET_SEQNUM (0x01) +#define MPI3_PEL_ACTION_MARK_CLEAR (0x02) +#define MPI3_PEL_ACTION_GET_LOG (0x03) +#define MPI3_PEL_ACTION_GET_COUNT (0x04) +#define MPI3_PEL_ACTION_WAIT (0x05) +#define MPI3_PEL_ACTION_ABORT (0x06) +#define MPI3_PEL_ACTION_GET_PRINT_STRINGS (0x07) +#define MPI3_PEL_ACTION_ACKNOWLEDGE (0x08) +#define MPI3_PEL_STATUS_SUCCESS (0x00) +#define MPI3_PEL_STATUS_NOT_FOUND (0x01) +#define MPI3_PEL_STATUS_ABORTED (0x02) +#define MPI3_PEL_STATUS_NOT_READY (0x03) +struct mpi3_pel_seq { + __le32 newest; + __le32 oldest; + __le32 clear; + __le32 shutdown; + __le32 boot; + __le32 last_acknowledged; +}; +struct mpi3_pel_entry { + __le64 time_stamp; + __le32 sequence_number; + __le16 log_code; + __le16 arg_type; + __le16 locale; + u8 class; + u8 flags; + u8 ext_num; + u8 num_exts; + u8 arg_data_size; + u8 fixed_format_strings_size; + __le32 reserved18[2]; + __le32 pel_info[24]; +}; +#define MPI3_PEL_FLAGS_COMPLETE_RESET_NEEDED (0x02) +#define MPI3_PEL_FLAGS_ACK_NEEDED (0x01) +struct mpi3_pel_list { + __le32 log_count; + __le32 reserved04; + struct mpi3_pel_entry entry[1]; +}; +struct mpi3_pel_arg_map { + u8 arg_type; + u8 length; + __le16 start_location; +}; +#define MPI3_PEL_ARG_MAP_ARG_TYPE_APPEND_STRING (0x00) +#define MPI3_PEL_ARG_MAP_ARG_TYPE_INTEGER (0x01) +#define MPI3_PEL_ARG_MAP_ARG_TYPE_STRING (0x02) +#define MPI3_PEL_ARG_MAP_ARG_TYPE_BIT_FIELD (0x03) +struct mpi3_pel_print_string { + __le16 log_code; + __le16 string_length; + u8 num_arg_map; + u8 reserved05[3]; + struct mpi3_pel_arg_map arg_map[1]; +}; +struct mpi3_pel_print_string_list { + __le32 num_print_strings; + __le32 residual_bytes_remain; + __le32 reserved08[2]; + struct mpi3_pel_print_string print_string[1]; +}; +#ifndef MPI3_PEL_ACTION_SPECIFIC_MAX +#define MPI3_PEL_ACTION_SPECIFIC_MAX (1) +#endif +struct mpi3_pel_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 
msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 action_specific[MPI3_PEL_ACTION_SPECIFIC_MAX]; +}; +struct mpi3_pel_req_action_get_sequence_numbers { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 reserved0c[5]; + union mpi3_sge_union sgl; +}; +struct mpi3_pel_req_action_clear_log_marker { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + u8 clear_type; + u8 reserved0d[3]; +}; +struct mpi3_pel_req_action_get_log { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 starting_sequence_number; + __le16 locale; + u8 class; + u8 reserved13; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +struct mpi3_pel_req_action_get_count { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 starting_sequence_number; + __le16 locale; + u8 class; + u8 reserved13; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +struct mpi3_pel_req_action_wait { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 starting_sequence_number; + __le16 locale; + u8 class; + u8 reserved13; + __le16 wait_time; + __le16 reserved16; + __le32 reserved18[2]; +}; +struct mpi3_pel_req_action_abort { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 reserved0c; + __le16 abort_host_tag; + __le16 reserved12; + __le32 reserved14; +}; +struct mpi3_pel_req_action_get_print_strings { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 reserved0c; + __le16 start_log_code; + __le16 reserved12; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +struct mpi3_pel_req_action_acknowledge { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 sequence_number; + __le32 reserved10; +}; +#define MPI3_PELACKNOWLEDGE_MSGFLAGS_SAFE_MODE_EXIT_MASK (0x03) +#define MPI3_PELACKNOWLEDGE_MSGFLAGS_SAFE_MODE_EXIT_NO_GUIDANCE (0x00) +#define MPI3_PELACKNOWLEDGE_MSGFLAGS_SAFE_MODE_EXIT_CONTINUE_OP (0x01) +#define MPI3_PELACKNOWLEDGE_MSGFLAGS_SAFE_MODE_EXIT_TRANSITION_TO_FAULT (0x02) +struct mpi3_pel_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 action; + u8 reserved11; + __le16 reserved12; + __le16 pe_log_status; + __le16 reserved16; + __le32 transfer_length; +}; +struct mpi3_ci_download_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 signature1; + __le32 total_image_size; + __le32 image_offset; + __le32 segment_size; + __le32 reserved1c; 
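+	/*
+	 * Illustrative annotation, not in the original Broadcom header: the
+	 * sgl member that follows is presumably the scatter-gather list
+	 * describing the host buffer that holds the image segment selected
+	 * by image_offset and segment_size for the requested download action.
+	 */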
+ union mpi3_sge_union sgl; +}; +#define MPI3_CI_DOWNLOAD_MSGFLAGS_LAST_SEGMENT (0x80) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_FORCE_FMC_ENABLE (0x40) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_SIGNED_NVDATA (0x20) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_WRITE_CACHE_FLUSH_MASK (0x03) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_WRITE_CACHE_FLUSH_FAST (0x00) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_WRITE_CACHE_FLUSH_MEDIUM (0x01) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_WRITE_CACHE_FLUSH_SLOW (0x02) +#define MPI3_CI_DOWNLOAD_ACTION_DOWNLOAD (0x01) +#define MPI3_CI_DOWNLOAD_ACTION_ONLINE_ACTIVATION (0x02) +#define MPI3_CI_DOWNLOAD_ACTION_OFFLINE_ACTIVATION (0x03) +#define MPI3_CI_DOWNLOAD_ACTION_GET_STATUS (0x04) +#define MPI3_CI_DOWNLOAD_ACTION_CANCEL_OFFLINE_ACTIVATION (0x05) +struct mpi3_ci_download_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 flags; + u8 cache_dirty; + u8 pending_count; + u8 reserved13; +}; +#define MPI3_CI_DOWNLOAD_FLAGS_DOWNLOAD_IN_PROGRESS (0x80) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_FAILURE (0x40) +#define MPI3_CI_DOWNLOAD_FLAGS_OFFLINE_ACTIVATION_REQUIRED (0x20) +#define MPI3_CI_DOWNLOAD_FLAGS_KEY_UPDATE_PENDING (0x10) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_MASK (0x0e) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_NOT_NEEDED (0x00) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_AWAITING (0x02) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_ONLINE_PENDING (0x04) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_OFFLINE_PENDING (0x06) +#define MPI3_CI_DOWNLOAD_FLAGS_COMPATIBLE (0x01) +struct mpi3_ci_upload_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + __le32 signature1; + __le32 reserved10; + __le32 image_offset; + __le32 segment_size; + __le32 reserved1c; + union mpi3_sge_union sgl; +}; +#define MPI3_CI_UPLOAD_MSGFLAGS_LOCATION_MASK (0x01) +#define MPI3_CI_UPLOAD_MSGFLAGS_LOCATION_PRIMARY (0x00) +#define MPI3_CI_UPLOAD_MSGFLAGS_LOCATION_SECONDARY (0x01) +#define MPI3_CI_UPLOAD_MSGFLAGS_FORMAT_MASK (0x02) +#define MPI3_CI_UPLOAD_MSGFLAGS_FORMAT_FLASH (0x00) +#define MPI3_CI_UPLOAD_MSGFLAGS_FORMAT_EXECUTABLE (0x02) +#define MPI3_CTRL_OP_FORCE_FULL_DISCOVERY (0x01) +#define MPI3_CTRL_OP_LOOKUP_MAPPING (0x02) +#define MPI3_CTRL_OP_UPDATE_TIMESTAMP (0x04) +#define MPI3_CTRL_OP_GET_TIMESTAMP (0x05) +#define MPI3_CTRL_OP_GET_IOC_CHANGE_COUNT (0x06) +#define MPI3_CTRL_OP_CHANGE_PROFILE (0x07) +#define MPI3_CTRL_OP_REMOVE_DEVICE (0x10) +#define MPI3_CTRL_OP_CLOSE_PERSISTENT_CONNECTION (0x11) +#define MPI3_CTRL_OP_HIDDEN_ACK (0x12) +#define MPI3_CTRL_OP_CLEAR_DEVICE_COUNTERS (0x13) +#define MPI3_CTRL_OP_SEND_SAS_PRIMITIVE (0x20) +#define MPI3_CTRL_OP_SAS_PHY_CONTROL (0x21) +#define MPI3_CTRL_OP_READ_INTERNAL_BUS (0x23) +#define MPI3_CTRL_OP_WRITE_INTERNAL_BUS (0x24) +#define MPI3_CTRL_OP_PCIE_LINK_CONTROL (0x30) +#define MPI3_CTRL_OP_LOOKUP_MAPPING_PARAM8_LOOKUP_METHOD_INDEX (0x00) +#define MPI3_CTRL_OP_UPDATE_TIMESTAMP_PARAM64_TIMESTAMP_INDEX (0x00) +#define MPI3_CTRL_OP_CHANGE_PROFILE_PARAM8_PROFILE_ID_INDEX (0x00) +#define MPI3_CTRL_OP_REMOVE_DEVICE_PARAM16_DEVHANDLE_INDEX (0x00) +#define MPI3_CTRL_OP_CLOSE_PERSIST_CONN_PARAM16_DEVHANDLE_INDEX (0x00) +#define MPI3_CTRL_OP_HIDDEN_ACK_PARAM16_DEVHANDLE_INDEX (0x00) +#define MPI3_CTRL_OP_CLEAR_DEVICE_COUNTERS_PARAM16_DEVHANDLE_INDEX (0x00) +#define 
MPI3_CTRL_OP_SEND_SAS_PRIM_PARAM8_PHY_INDEX (0x00) +#define MPI3_CTRL_OP_SEND_SAS_PRIM_PARAM8_PRIMSEQ_INDEX (0x01) +#define MPI3_CTRL_OP_SEND_SAS_PRIM_PARAM32_PRIMITIVE_INDEX (0x00) +#define MPI3_CTRL_OP_SAS_PHY_CONTROL_PARAM8_ACTION_INDEX (0x00) +#define MPI3_CTRL_OP_SAS_PHY_CONTROL_PARAM8_PHY_INDEX (0x01) +#define MPI3_CTRL_OP_READ_INTERNAL_BUS_PARAM64_ADDRESS_INDEX (0x00) +#define MPI3_CTRL_OP_WRITE_INTERNAL_BUS_PARAM64_ADDRESS_INDEX (0x00) +#define MPI3_CTRL_OP_WRITE_INTERNAL_BUS_PARAM32_VALUE_INDEX (0x00) +#define MPI3_CTRL_OP_PCIE_LINK_CONTROL_PARAM8_ACTION_INDEX (0x00) +#define MPI3_CTRL_OP_PCIE_LINK_CONTROL_PARAM8_LINK_INDEX (0x01) +#define MPI3_CTRL_LOOKUP_METHOD_WWID_ADDRESS (0x01) +#define MPI3_CTRL_LOOKUP_METHOD_ENCLOSURE_SLOT (0x02) +#define MPI3_CTRL_LOOKUP_METHOD_SAS_DEVICE_NAME (0x03) +#define MPI3_CTRL_LOOKUP_METHOD_PERSISTENT_ID (0x04) +#define MPI3_CTRL_LOOKUP_METHOD_WWIDADDR_PARAM16_DEVH_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_WWIDADDR_PARAM64_WWID_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_ENCLSLOT_PARAM16_SLOTNUM_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_ENCLSLOT_PARAM64_ENCLOSURELID_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_SASDEVNAME_PARAM16_DEVH_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_SASDEVNAME_PARAM64_DEVNAME_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_PERSISTID_PARAM16_DEVH_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_PERSISTID_PARAM16_PERSISTENT_ID_INDEX (1) +#define MPI3_CTRL_LOOKUP_METHOD_VALUE16_DEVH_INDEX (0) +#define MPI3_CTRL_GET_TIMESTAMP_VALUE64_TIMESTAMP_INDEX (0) +#define MPI3_CTRL_GET_IOC_CHANGE_COUNT_VALUE16_CHANGECOUNT_INDEX (0) +#define MPI3_CTRL_READ_INTERNAL_BUS_VALUE32_VALUE_INDEX (0) +#define MPI3_CTRL_PRIMFLAGS_SINGLE (0x01) +#define MPI3_CTRL_PRIMFLAGS_TRIPLE (0x03) +#define MPI3_CTRL_PRIMFLAGS_REDUNDANT (0x06) +#define MPI3_CTRL_ACTION_NOP (0x00) +#define MPI3_CTRL_ACTION_LINK_RESET (0x01) +#define MPI3_CTRL_ACTION_HARD_RESET (0x02) +#define MPI3_CTRL_ACTION_CLEAR_ERROR_LOG (0x05) +struct mpi3_iounit_control_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 reserved0a; + u8 operation; + __le32 reserved0c; + __le64 param64[2]; + __le32 param32[4]; + __le16 param16[4]; + u8 param8[8]; +}; +struct mpi3_iounit_control_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le64 value64[2]; + __le32 value32[4]; + __le16 value16[4]; + u8 value8[8]; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_pci.h b/drivers/scsi/mpi3mr/mpi/mpi30_pci.h new file mode 100644 index 0000000000000..3daa16efcc3a3 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_pci.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_PCI_H +#define MPI30_PCI_H 1 +#ifndef MPI3_NVME_ENCAP_CMD_MAX +#define MPI3_NVME_ENCAP_CMD_MAX (1) +#endif +struct mpi3_nvme_encapsulated_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le16 encapsulated_command_length; + __le16 flags; + __le32 data_length; + __le32 reserved14[3]; + __le32 command[MPI3_NVME_ENCAP_CMD_MAX]; +}; +#define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_MASK (0x0002) +#define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_FAIL_ONLY (0x0000) +#define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_ALL (0x0002) +#define MPI3_NVME_FLAGS_SUBMISSIONQ_MASK (0x0001) +#define MPI3_NVME_FLAGS_SUBMISSIONQ_IO (0x0000) +#define MPI3_NVME_FLAGS_SUBMISSIONQ_ADMIN (0x0001) +struct mpi3_nvme_encapsulated_error_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 nvme_completion_entry[4]; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_raid.h b/drivers/scsi/mpi3mr/mpi/mpi30_raid.h new file mode 100644 index 0000000000000..7ce3c00d4fbd0 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_raid.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_RAID_H +#define MPI30_RAID_H 1 +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_sas.h b/drivers/scsi/mpi3mr/mpi/mpi30_sas.h new file mode 100644 index 0000000000000..78d8e0ad26757 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_sas.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_SAS_H +#define MPI30_SAS_H 1 +#define MPI3_SAS_DEVICE_INFO_SSP_TARGET (0x00000100) +#define MPI3_SAS_DEVICE_INFO_STP_SATA_TARGET (0x00000080) +#define MPI3_SAS_DEVICE_INFO_SMP_TARGET (0x00000040) +#define MPI3_SAS_DEVICE_INFO_SSP_INITIATOR (0x00000020) +#define MPI3_SAS_DEVICE_INFO_STP_INITIATOR (0x00000010) +#define MPI3_SAS_DEVICE_INFO_SMP_INITIATOR (0x00000008) +#define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK (0x00000007) +#define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_NO_DEVICE (0x00000000) +#define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_END_DEVICE (0x00000001) +#define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_EXPANDER (0x00000002) +struct mpi3_smp_passthrough_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 reserved0a; + u8 io_unit_port; + __le32 reserved0c[3]; + __le64 sas_address; + struct mpi3_sge_common request_sge; + struct mpi3_sge_common response_sge; +}; +struct mpi3_smp_passthrough_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le16 response_data_length; + __le16 reserved12; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_targ.h b/drivers/scsi/mpi3mr/mpi/mpi30_targ.h new file mode 100644 index 0000000000000..9fa30ca941f10 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_targ.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_TARG_H +#define MPI30_TARG_H 1 +struct mpi3_target_ssp_cmd_buffer { + u8 frame_type; + u8 reserved01; + __le16 initiator_connection_tag; + __le32 hashed_source_sas_address; + __le16 reserved08; + __le16 flags; + __le32 reserved0c; + __le16 tag; + __le16 target_port_transfer_tag; + __le32 data_offset; + u8 logical_unit_number[8]; + u8 reserved20; + u8 task_attribute; + u8 reserved22; + u8 additional_cdb_length; + u8 cdb[16]; +}; +struct mpi3_target_ssp_task_buffer { + u8 frame_type; + u8 reserved01; + __le16 initiator_connection_tag; + __le32 hashed_source_sas_address; + __le16 reserved08; + __le16 flags; + __le32 reserved0c; + __le16 tag; + __le16 target_port_transfer_tag; + __le32 data_offset; + u8 logical_unit_number[8]; + __le16 reserved20; + u8 task_management_function; + u8 reserved23; + __le16 managed_task_tag; + __le16 reserved26; + __le32 reserved28[3]; +}; +#define MPI3_TARGET_FRAME_TYPE_COMMAND (0x06) +#define MPI3_TARGET_FRAME_TYPE_TASK (0x16) +#define MPI3_TARGET_HASHED_SAS_ADDRESS_MASK (0xffffff00) +#define MPI3_TARGET_HASHED_SAS_ADDRESS_SHIFT (8) +struct mpi3_target_cmd_buf_post_base_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 buffer_post_flags; + u8 reserved0b; + __le16 min_reply_queue_id; + __le16 max_reply_queue_id; + __le64 base_address; + __le16 cmd_buffer_length; + __le16 total_cmd_buffers; + __le32 reserved1c; +}; +#define MPI3_CMD_BUF_POST_BASE_FLAGS_DLAS_MASK (0x0c) +#define MPI3_CMD_BUF_POST_BASE_FLAGS_DLAS_SYSTEM (0x00) +#define MPI3_CMD_BUF_POST_BASE_FLAGS_DLAS_IOCUDP (0x04) +#define MPI3_CMD_BUF_POST_BASE_FLAGS_DLAS_IOCCTL (0x08) +#define MPI3_CMD_BUF_POST_BASE_FLAGS_AUTO_POST_ALL (0x01) +#define MPI3_CMD_BUF_POST_BASE_MIN_BUF_LENGTH (0x34) +#define MPI3_CMD_BUF_POST_BASE_MAX_BUF_LENGTH (0x3fc) +struct mpi3_target_cmd_buf_post_list_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 cmd_buffer_count; + u8 reserved0d[3]; + __le16 io_index[2]; +}; +struct mpi3_target_cmd_buf_post_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 cmd_buffer_count; + u8 reserved11[3]; + __le16 io_index[2]; +}; +struct mpi3_target_assist_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le32 flags; + __le16 reserved10; + __le16 queue_tag; + __le16 io_index; + __le16 initiator_connection_tag; + __le32 skip_count; + __le32 data_length; + __le32 port_transfer_length; + __le32 primary_reference_tag; + __le16 primary_application_tag; + __le16 primary_application_tag_mask; + __le32 relative_offset; + union mpi3_sge_union sgl[5]; +}; +#define MPI3_TARGET_ASSIST_MSGFLAGS_METASGL_VALID (0x80) +#define MPI3_TARGET_ASSIST_FLAGS_REPOST_CMD_BUFFER (0x00200000) +#define MPI3_TARGET_ASSIST_FLAGS_AUTO_STATUS (0x00100000) +#define MPI3_TARGET_ASSIST_FLAGS_DATADIRECTION_MASK (0x000c0000) +#define MPI3_TARGET_ASSIST_FLAGS_DATADIRECTION_WRITE (0x00040000) +#define MPI3_TARGET_ASSIST_FLAGS_DATADIRECTION_READ (0x00080000) +#define MPI3_TARGET_ASSIST_FLAGS_DMAOPERATION_MASK (0x00030000) +#define MPI3_TARGET_ASSIST_FLAGS_DMAOPERATION_HOST_PI (0x00010000) +#define 
MPI3_TARGET_ASSIST_METASGL_INDEX (4) +struct mpi3_target_status_send_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le16 response_iu_length; + __le16 flags; + __le16 reserved10; + __le16 queue_tag; + __le16 io_index; + __le16 initiator_connection_tag; + __le32 ioc_use_only18[6]; + __le32 ioc_use_only30[4]; + union mpi3_sge_union sgl; +}; +#define MPI3_TSS_FLAGS_REPOST_CMD_BUFFER (0x0020) +#define MPI3_TSS_FLAGS_AUTO_SEND_GOOD_STATUS (0x0010) +struct mpi3_target_standard_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 transfer_count; +}; +struct mpi3_target_mode_abort_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 abort_type; + u8 reserved0b; + __le16 request_queue_id_to_abort; + __le16 host_tag_to_abort; + __le16 dev_handle; + __le16 reserved12; +}; +#define MPI3_TARGET_MODE_ABORT_ALL_CMD_BUFFERS (0x00) +#define MPI3_TARGET_MODE_ABORT_EXACT_IO_REQUEST (0x01) +#define MPI3_TARGET_MODE_ABORT_ALL_COMMANDS (0x02) +struct mpi3_target_mode_abort_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 abort_count; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_tool.h b/drivers/scsi/mpi3mr/mpi/mpi30_tool.h new file mode 100644 index 0000000000000..04c12874cea13 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_tool.h @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_TOOL_H +#define MPI30_TOOL_H 1 +struct mpi3_tool_clean_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + __le32 area; +}; +#define MPI3_TOOLBOX_TOOL_CLEAN (0x01) +#define MPI3_TOOLBOX_TOOL_ISTWI_READ_WRITE (0x02) +#define MPI3_TOOLBOX_TOOL_DIAGNOSTIC_CLI (0x03) +#define MPI3_TOOLBOX_TOOL_LANE_MARGINING (0x04) +#define MPI3_TOOLBOX_TOOL_RECOVER_DEVICE (0x05) +#define MPI3_TOOLBOX_TOOL_LOOPBACK (0x06) +#define MPI3_TOOLBOX_CLEAN_AREA_BIOS_BOOT_SERVICES (0x00000008) +#define MPI3_TOOLBOX_CLEAN_AREA_ALL_BUT_MFG (0x00000002) +#define MPI3_TOOLBOX_CLEAN_AREA_NVSTORE (0x00000001) +struct mpi3_tool_istwi_read_write_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 flags; + u8 dev_index; + u8 action; + __le16 reserved0e; + __le16 tx_data_length; + __le16 rx_data_length; + __le32 reserved14[3]; + struct mpi3_man11_istwi_device_format istwi_device; + union mpi3_sge_union sgl; +}; +#define MPI3_TOOLBOX_ISTWI_FLAGS_AUTO_RESERVE_RELEASE (0x80) +#define MPI3_TOOLBOX_ISTWI_FLAGS_ADDRESS_MODE_MASK (0x04) +#define MPI3_TOOLBOX_ISTWI_FLAGS_ADDRESS_MODE_DEVINDEX (0x00) +#define MPI3_TOOLBOX_ISTWI_FLAGS_ADDRESS_MODE_DEVICE_FIELD (0x04) +#define MPI3_TOOLBOX_ISTWI_FLAGS_PAGE_ADDRESS_MASK (0x03) +#define MPI3_TOOLBOX_ISTWI_ACTION_RESERVE_BUS (0x00) +#define MPI3_TOOLBOX_ISTWI_ACTION_RELEASE_BUS (0x01) +#define MPI3_TOOLBOX_ISTWI_ACTION_RESET (0x02) +#define MPI3_TOOLBOX_ISTWI_ACTION_READ_DATA (0x03) +#define MPI3_TOOLBOX_ISTWI_ACTION_WRITE_DATA (0x04) +#define MPI3_TOOLBOX_ISTWI_ACTION_SEQUENCE (0x05) +struct mpi3_tool_istwi_read_write_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le16 istwi_status; + __le16 reserved12; + __le16 tx_data_count; + __le16 rx_data_count; +}; +struct mpi3_tool_diagnostic_cli_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + __le32 command_data_length; + __le32 response_data_length; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +struct mpi3_tool_diagnostic_cli_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 returned_data_length; +}; +struct mpi3_tool_lane_margin_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + u8 action; + u8 switch_port; + __le16 dev_handle; + u8 start_lane; + u8 num_lanes; + __le16 reserved12; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +#define MPI3_TOOLBOX_LM_ACTION_ENTER (0x00) +#define MPI3_TOOLBOX_LM_ACTION_EXIT (0x01) +#define MPI3_TOOLBOX_LM_ACTION_READ (0x02) +#define MPI3_TOOLBOX_LM_ACTION_WRITE (0x03) +struct mpi3_lane_margin_element { + __le16 control; + __le16 status; +}; +struct mpi3_tool_lane_margin_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 
returned_data_length; +}; +struct mpi3_tool_recover_device_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + u8 action; + u8 reserved0d; + __le16 dev_handle; +}; +#define MPI3_TOOLBOX_RD_ACTION_START (0x01) +#define MPI3_TOOLBOX_RD_ACTION_GET_STATUS (0x02) +#define MPI3_TOOLBOX_RD_ACTION_ABORT (0x03) +struct mpi3_tool_recover_device_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 status; + u8 reserved11; + __le16 reserved1c; +}; +#define MPI3_TOOLBOX_RD_STATUS_NOT_NEEDED (0x01) +#define MPI3_TOOLBOX_RD_STATUS_NEEDED (0x02) +#define MPI3_TOOLBOX_RD_STATUS_IN_PROGRESS (0x03) +#define MPI3_TOOLBOX_RD_STATUS_ABORTING (0x04) +struct mpi3_tool_loopback_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + __le32 reserved0c; + __le64 phys; +}; +struct mpi3_tool_loopback_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le64 tested_phys; + __le64 failed_phys; +}; +struct mpi3_diag_buffer_post_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 type; + u8 reserved0d; + __le16 reserved0e; + __le64 address; + __le32 length; + __le32 reserved1c; +}; +#define MPI3_DIAG_BUFFER_POST_MSGFLAGS_SEGMENTED (0x01) +#define MPI3_DIAG_BUFFER_TYPE_TRACE (0x01) +#define MPI3_DIAG_BUFFER_TYPE_FW (0x02) +#define MPI3_DIAG_BUFFER_TYPE_DRIVER (0x10) +#define MPI3_DIAG_BUFFER_TYPE_FDL (0x20) +#define MPI3_DIAG_BUFFER_MIN_PRODUCT_SPECIFIC (0xf0) +#define MPI3_DIAG_BUFFER_MAX_PRODUCT_SPECIFIC (0xff) +struct mpi3_driver_buffer_header { + __le32 signature; + __le16 header_size; + __le16 rtt_file_header_offset; + __le32 flags; + __le32 circular_buffer_size; + __le32 logical_buffer_end; + __le32 logical_buffer_start; + __le32 ioc_use_only18[2]; + __le32 reserved20[760]; + __le32 reserved_rttrace[256]; +}; +#define MPI3_DRIVER_DIAG_BUFFER_HEADER_SIGNATURE_CIRCULAR (0x43495243) +#define MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_MASK (0x00000003) +#define MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_ASCII (0x00000000) +#define MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_RTTRACE (0x00000001) +struct mpi3_diag_buffer_manage_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 type; + u8 action; + __le16 reserved0e; +}; +#define MPI3_DIAG_BUFFER_ACTION_RELEASE (0x01) +#define MPI3_DIAG_BUFFER_ACTION_PAUSE (0x02) +#define MPI3_DIAG_BUFFER_ACTION_RESUME (0x03) +struct mpi3_diag_buffer_upload_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 type; + u8 flags; + __le16 reserved0e; + __le64 context; + __le32 reserved18; + __le32 reserved1c; + union mpi3_sge_union sgl; +}; +#define MPI3_DIAG_BUFFER_UPLOAD_FLAGS_FORMAT_MASK (0x01) +#define MPI3_DIAG_BUFFER_UPLOAD_FLAGS_FORMAT_DECODED 
(0x00) +#define MPI3_DIAG_BUFFER_UPLOAD_FLAGS_FORMAT_ENCODED (0x01) +struct mpi3_diag_buffer_upload_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le64 context; + __le32 returned_data_length; + __le32 reserved1c; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h new file mode 100644 index 0000000000000..fd6989c208e21 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h @@ -0,0 +1,454 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_TRANSPORT_H +#define MPI30_TRANSPORT_H 1 +struct mpi3_version_struct { + u8 dev; + u8 unit; + u8 minor; + u8 major; +}; +union mpi3_version_union { + struct mpi3_version_struct mpi3_version; + __le32 word; +}; +#define MPI3_VERSION_MAJOR (3) +#define MPI3_VERSION_MINOR (0) +#define MPI3_VERSION_UNIT (25) +#define MPI3_VERSION_DEV (0) +#define MPI3_DEVHANDLE_INVALID (0xffff) +struct mpi3_sysif_oper_queue_indexes { + __le16 producer_index; + __le16 reserved02; + __le16 consumer_index; + __le16 reserved06; +}; +struct mpi3_sysif_registers { + __le64 ioc_information; + union mpi3_version_union version; + __le32 reserved0c[2]; + __le32 ioc_configuration; + __le32 reserved18; + __le32 ioc_status; + __le32 reserved20; + __le32 admin_queue_num_entries; + __le64 admin_request_queue_address; + __le64 admin_reply_queue_address; + __le32 reserved38[2]; + __le32 coalesce_control; + __le32 reserved44[1007]; + __le16 admin_request_queue_pi; + __le16 reserved1002; + __le16 admin_reply_queue_ci; + __le16 reserved1006; + struct mpi3_sysif_oper_queue_indexes oper_queue_indexes[383]; + __le32 reserved1c00; + __le32 write_sequence; + __le32 host_diagnostic; + __le32 reserved1c0c; + __le32 fault; + __le32 fault_info[3]; + __le32 reserved1c20[4]; + __le64 hcb_address; + __le32 hcb_size; + __le32 reserved1c3c; + __le32 reply_free_host_index; + __le32 sense_buffer_free_host_index; + __le32 reserved1c48[2]; + __le64 diag_rw_data; + __le64 diag_rw_address; + __le16 diag_rw_control; + __le16 diag_rw_status; + __le32 reserved1c64[35]; + __le32 scratchpad[4]; + __le32 reserved1d00[192]; + __le32 device_assigned_registers[2048]; +}; +#define MPI3_SYSIF_IOC_INFO_LOW_OFFSET (0x00000000) +#define MPI3_SYSIF_IOC_INFO_HIGH_OFFSET (0x00000004) +#define MPI3_SYSIF_IOC_INFO_LOW_TIMEOUT_MASK (0xff000000) +#define MPI3_SYSIF_IOC_INFO_LOW_TIMEOUT_SHIFT (24) +#define MPI3_SYSIF_IOC_INFO_LOW_HCB_DISABLED (0x00000001) +#define MPI3_SYSIF_IOC_CONFIG_OFFSET (0x00000014) +#define MPI3_SYSIF_IOC_CONFIG_OPER_RPY_ENT_SZ (0x00f00000) +#define MPI3_SYSIF_IOC_CONFIG_OPER_RPY_ENT_SZ_SHIFT (20) +#define MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ (0x000f0000) +#define MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ_SHIFT (16) +#define MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_MASK (0x0000c000) +#define MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_NO (0x00000000) +#define MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_NORMAL (0x00004000) +#define MPI3_SYSIF_IOC_CONFIG_DEVICE_SHUTDOWN_SEND_REQ (0x00002000) +#define MPI3_SYSIF_IOC_CONFIG_DIAG_SAVE (0x00000010) +#define MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC (0x00000001) +#define MPI3_SYSIF_IOC_STATUS_OFFSET (0x0000001c) +#define MPI3_SYSIF_IOC_STATUS_RESET_HISTORY (0x00000010) +#define MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK (0x0000000c) +#define MPI3_SYSIF_IOC_STATUS_SHUTDOWN_SHIFT (0x00000002) +#define 
MPI3_SYSIF_IOC_STATUS_SHUTDOWN_NONE (0x00000000) +#define MPI3_SYSIF_IOC_STATUS_SHUTDOWN_IN_PROGRESS (0x00000004) +#define MPI3_SYSIF_IOC_STATUS_SHUTDOWN_COMPLETE (0x00000008) +#define MPI3_SYSIF_IOC_STATUS_FAULT (0x00000002) +#define MPI3_SYSIF_IOC_STATUS_READY (0x00000001) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_OFFSET (0x00000024) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_REQ_MASK (0x0fff) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_REPLY_OFFSET (0x00000026) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_REPLY_MASK (0x0fff0000) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_REPLY_SHIFT (16) +#define MPI3_SYSIF_ADMIN_REQ_Q_ADDR_LOW_OFFSET (0x00000028) +#define MPI3_SYSIF_ADMIN_REQ_Q_ADDR_HIGH_OFFSET (0x0000002c) +#define MPI3_SYSIF_ADMIN_REPLY_Q_ADDR_LOW_OFFSET (0x00000030) +#define MPI3_SYSIF_ADMIN_REPLY_Q_ADDR_HIGH_OFFSET (0x00000034) +#define MPI3_SYSIF_COALESCE_CONTROL_OFFSET (0x00000040) +#define MPI3_SYSIF_COALESCE_CONTROL_ENABLE_MASK (0xc0000000) +#define MPI3_SYSIF_COALESCE_CONTROL_ENABLE_NO_CHANGE (0x00000000) +#define MPI3_SYSIF_COALESCE_CONTROL_ENABLE_DISABLE (0x40000000) +#define MPI3_SYSIF_COALESCE_CONTROL_ENABLE_ENABLE (0xc0000000) +#define MPI3_SYSIF_COALESCE_CONTROL_VALID (0x20000000) +#define MPI3_SYSIF_COALESCE_CONTROL_MSIX_IDX_MASK (0x01ff0000) +#define MPI3_SYSIF_COALESCE_CONTROL_MSIX_IDX_SHIFT (16) +#define MPI3_SYSIF_COALESCE_CONTROL_TIMEOUT_MASK (0x0000ff00) +#define MPI3_SYSIF_COALESCE_CONTROL_TIMEOUT_SHIFT (8) +#define MPI3_SYSIF_COALESCE_CONTROL_DEPTH_MASK (0x000000ff) +#define MPI3_SYSIF_COALESCE_CONTROL_DEPTH_SHIFT (0) +#define MPI3_SYSIF_ADMIN_REQ_Q_PI_OFFSET (0x00001000) +#define MPI3_SYSIF_ADMIN_REPLY_Q_CI_OFFSET (0x00001004) +#define MPI3_SYSIF_OPER_REQ_Q_PI_OFFSET (0x00001008) +#define MPI3_SYSIF_OPER_REQ_Q_N_PI_OFFSET(N) (MPI3_SYSIF_OPER_REQ_Q_PI_OFFSET + (((N)-1)*8)) +#define MPI3_SYSIF_OPER_REPLY_Q_CI_OFFSET (0x0000100c) +#define MPI3_SYSIF_OPER_REPLY_Q_N_CI_OFFSET(N) (MPI3_SYSIF_OPER_REPLY_Q_CI_OFFSET + (((N)-1)*8)) +#define MPI3_SYSIF_WRITE_SEQUENCE_OFFSET (0x00001c04) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_MASK (0x0000000f) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_FLUSH (0x0) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_1ST (0xf) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_2ND (0x4) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_3RD (0xb) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_4TH (0x2) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_5TH (0x7) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_6TH (0xd) +#define MPI3_SYSIF_HOST_DIAG_OFFSET (0x00001c08) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_MASK (0x00000700) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_NO_RESET (0x00000000) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET (0x00000100) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_HOST_CONTROL_BOOT_RESET (0x00000200) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_COMPLETE_RESET (0x00000300) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT (0x00000700) +#define MPI3_SYSIF_HOST_DIAG_SAVE_IN_PROGRESS (0x00000080) +#define MPI3_SYSIF_HOST_DIAG_SECURE_BOOT (0x00000040) +#define MPI3_SYSIF_HOST_DIAG_CLEAR_INVALID_FW_IMAGE (0x00000020) +#define MPI3_SYSIF_HOST_DIAG_INVALID_FW_IMAGE (0x00000010) +#define MPI3_SYSIF_HOST_DIAG_HCBENABLE (0x00000008) +#define MPI3_SYSIF_HOST_DIAG_HCBMODE (0x00000004) +#define MPI3_SYSIF_HOST_DIAG_DIAG_RW_ENABLE (0x00000002) +#define MPI3_SYSIF_HOST_DIAG_DIAG_WRITE_ENABLE (0x00000001) +#define MPI3_SYSIF_FAULT_OFFSET (0x00001c10) +#define MPI3_SYSIF_FAULT_FUNC_AREA_MASK (0xff000000) +#define MPI3_SYSIF_FAULT_FUNC_AREA_SHIFT (24) +#define 
MPI3_SYSIF_FAULT_FUNC_AREA_MPI_DEFINED (0x00000000) +#define MPI3_SYSIF_FAULT_CODE_MASK (0x0000ffff) +#define MPI3_SYSIF_FAULT_CODE_DIAG_FAULT_RESET (0x0000f000) +#define MPI3_SYSIF_FAULT_CODE_CI_ACTIVATION_RESET (0x0000f001) +#define MPI3_SYSIF_FAULT_CODE_SOFT_RESET_IN_PROGRESS (0x0000f002) +#define MPI3_SYSIF_FAULT_CODE_COMPLETE_RESET_NEEDED (0x0000f003) +#define MPI3_SYSIF_FAULT_CODE_SOFT_RESET_NEEDED (0x0000f004) +#define MPI3_SYSIF_FAULT_CODE_POWER_CYCLE_REQUIRED (0x0000f005) +#define MPI3_SYSIF_FAULT_CODE_TEMP_THRESHOLD_EXCEEDED (0x0000f006) +#define MPI3_SYSIF_FAULT_INFO0_OFFSET (0x00001c14) +#define MPI3_SYSIF_FAULT_INFO1_OFFSET (0x00001c18) +#define MPI3_SYSIF_FAULT_INFO2_OFFSET (0x00001c1c) +#define MPI3_SYSIF_HCB_ADDRESS_LOW_OFFSET (0x00001c30) +#define MPI3_SYSIF_HCB_ADDRESS_HIGH_OFFSET (0x00001c34) +#define MPI3_SYSIF_HCB_SIZE_OFFSET (0x00001c38) +#define MPI3_SYSIF_HCB_SIZE_SIZE_MASK (0xfffff000) +#define MPI3_SYSIF_HCB_SIZE_SIZE_SHIFT (12) +#define MPI3_SYSIF_HCB_SIZE_HCDW_ENABLE (0x00000001) +#define MPI3_SYSIF_REPLY_FREE_HOST_INDEX_OFFSET (0x00001c40) +#define MPI3_SYSIF_SENSE_BUF_FREE_HOST_INDEX_OFFSET (0x00001c44) +#define MPI3_SYSIF_DIAG_RW_DATA_LOW_OFFSET (0x00001c50) +#define MPI3_SYSIF_DIAG_RW_DATA_HIGH_OFFSET (0x00001c54) +#define MPI3_SYSIF_DIAG_RW_ADDRESS_LOW_OFFSET (0x00001c58) +#define MPI3_SYSIF_DIAG_RW_ADDRESS_HIGH_OFFSET (0x00001c5c) +#define MPI3_SYSIF_DIAG_RW_CONTROL_OFFSET (0x00001c60) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_MASK (0x00000030) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_1BYTE (0x00000000) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_2BYTES (0x00000010) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_4BYTES (0x00000020) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_8BYTES (0x00000030) +#define MPI3_SYSIF_DIAG_RW_CONTROL_RESET (0x00000004) +#define MPI3_SYSIF_DIAG_RW_CONTROL_DIR_MASK (0x00000002) +#define MPI3_SYSIF_DIAG_RW_CONTROL_DIR_READ (0x00000000) +#define MPI3_SYSIF_DIAG_RW_CONTROL_DIR_WRITE (0x00000002) +#define MPI3_SYSIF_DIAG_RW_CONTROL_START (0x00000001) +#define MPI3_SYSIF_DIAG_RW_STATUS_OFFSET (0x00001c62) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_MASK (0x0000000e) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_SUCCESS (0x00000000) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_INV_ADDR (0x00000002) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_ACC_ERR (0x00000004) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_PAR_ERR (0x00000006) +#define MPI3_SYSIF_DIAG_RW_STATUS_BUSY (0x00000001) +#define MPI3_SYSIF_SCRATCHPAD0_OFFSET (0x00001cf0) +#define MPI3_SYSIF_SCRATCHPAD1_OFFSET (0x00001cf4) +#define MPI3_SYSIF_SCRATCHPAD2_OFFSET (0x00001cf8) +#define MPI3_SYSIF_SCRATCHPAD3_OFFSET (0x00001cfc) +#define MPI3_SYSIF_DEVICE_ASSIGNED_REGS_OFFSET (0x00002000) +#define MPI3_SYSIF_DIAG_SAVE_TIMEOUT (60) +struct mpi3_default_reply_descriptor { + __le32 descriptor_type_dependent1[2]; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 descriptor_type_dependent2; + __le16 reply_flags; +}; +#define MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK (0x0001) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_MASK (0xf000) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_ADDRESS_REPLY (0x0000) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_SUCCESS (0x1000) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_TARGET_COMMAND_BUFFER (0x2000) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_STATUS (0x3000) +#define MPI3_REPLY_DESCRIPT_REQUEST_QUEUE_ID_INVALID (0xffff) +struct mpi3_address_reply_descriptor { + __le64 reply_frame_address; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 reserved0c; + __le16 reply_flags; +}; +struct 
mpi3_success_reply_descriptor { + __le32 reserved00[2]; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 host_tag; + __le16 reply_flags; +}; +struct mpi3_target_command_buffer_reply_descriptor { + __le32 reserved00; + __le16 initiator_dev_handle; + u8 phy_num; + u8 reserved07; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 io_index; + __le16 reply_flags; +}; +struct mpi3_status_reply_descriptor { + __le16 ioc_status; + __le16 reserved02; + __le32 ioc_log_info; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 host_tag; + __le16 reply_flags; +}; +#define MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL (0x8000) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK (0x7fff) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCLOGINFO_TYPE_MASK (0xf0000000) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCLOGINFO_TYPE_NO_INFO (0x00000000) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCLOGINFO_TYPE_SAS (0x30000000) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCLOGINFO_DATA_MASK (0x0fffffff) +union mpi3_reply_descriptors_union { + struct mpi3_default_reply_descriptor default_reply; + struct mpi3_address_reply_descriptor address_reply; + struct mpi3_success_reply_descriptor success; + struct mpi3_target_command_buffer_reply_descriptor target_command_buffer; + struct mpi3_status_reply_descriptor status; + __le32 words[4]; +}; +struct mpi3_sge_common { + __le64 address; + __le32 length; + u8 reserved0c[3]; + u8 flags; +}; +struct mpi3_sge_bit_bucket { + __le64 reserved00; + __le32 length; + u8 reserved0c[3]; + u8 flags; +}; +struct mpi3_sge_extended_eedp { + u8 user_data_size; + u8 reserved01; + __le16 eedp_flags; + __le32 secondary_reference_tag; + __le16 secondary_application_tag; + __le16 application_tag_translation_mask; + __le16 reserved0c; + u8 extended_operation; + u8 flags; +}; +union mpi3_sge_union { + struct mpi3_sge_common simple; + struct mpi3_sge_common chain; + struct mpi3_sge_common last_chain; + struct mpi3_sge_bit_bucket bit_bucket; + struct mpi3_sge_extended_eedp eedp; + __le32 words[4]; +}; +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_MASK (0xf0) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE (0x00) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_BIT_BUCKET (0x10) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_CHAIN (0x20) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_LAST_CHAIN (0x30) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_EXTENDED (0xf0) +#define MPI3_SGE_FLAGS_END_OF_LIST (0x08) +#define MPI3_SGE_FLAGS_END_OF_BUFFER (0x04) +#define MPI3_SGE_FLAGS_DLAS_MASK (0x03) +#define MPI3_SGE_FLAGS_DLAS_SYSTEM (0x00) +#define MPI3_SGE_FLAGS_DLAS_IOC_UDP (0x01) +#define MPI3_SGE_FLAGS_DLAS_IOC_CTL (0x02) +#define MPI3_SGE_EXT_OPER_EEDP (0x00) +#define MPI3_EEDPFLAGS_INCR_PRI_REF_TAG (0x8000) +#define MPI3_EEDPFLAGS_INCR_SEC_REF_TAG (0x4000) +#define MPI3_EEDPFLAGS_INCR_PRI_APP_TAG (0x2000) +#define MPI3_EEDPFLAGS_INCR_SEC_APP_TAG (0x1000) +#define MPI3_EEDPFLAGS_ESC_PASSTHROUGH (0x0800) +#define MPI3_EEDPFLAGS_CHK_REF_TAG (0x0400) +#define MPI3_EEDPFLAGS_CHK_APP_TAG (0x0200) +#define MPI3_EEDPFLAGS_CHK_GUARD (0x0100) +#define MPI3_EEDPFLAGS_ESC_MODE_MASK (0x00c0) +#define MPI3_EEDPFLAGS_ESC_MODE_DO_NOT_DISABLE (0x0040) +#define MPI3_EEDPFLAGS_ESC_MODE_APPTAG_DISABLE (0x0080) +#define MPI3_EEDPFLAGS_ESC_MODE_APPTAG_REFTAG_DISABLE (0x00c0) +#define MPI3_EEDPFLAGS_HOST_GUARD_MASK (0x0030) +#define MPI3_EEDPFLAGS_HOST_GUARD_T10_CRC (0x0000) +#define MPI3_EEDPFLAGS_HOST_GUARD_IP_CHKSUM (0x0010) +#define MPI3_EEDPFLAGS_HOST_GUARD_OEM_SPECIFIC (0x0020) +#define MPI3_EEDPFLAGS_PT_REF_TAG (0x0008) +#define 
MPI3_EEDPFLAGS_EEDP_OP_MASK (0x0007) +#define MPI3_EEDPFLAGS_EEDP_OP_CHECK (0x0001) +#define MPI3_EEDPFLAGS_EEDP_OP_STRIP (0x0002) +#define MPI3_EEDPFLAGS_EEDP_OP_CHECK_REMOVE (0x0003) +#define MPI3_EEDPFLAGS_EEDP_OP_INSERT (0x0004) +#define MPI3_EEDPFLAGS_EEDP_OP_REPLACE (0x0006) +#define MPI3_EEDPFLAGS_EEDP_OP_CHECK_REGEN (0x0007) +#define MPI3_EEDP_UDS_512 (0x01) +#define MPI3_EEDP_UDS_520 (0x02) +#define MPI3_EEDP_UDS_4080 (0x03) +#define MPI3_EEDP_UDS_4088 (0x04) +#define MPI3_EEDP_UDS_4096 (0x05) +#define MPI3_EEDP_UDS_4104 (0x06) +#define MPI3_EEDP_UDS_4160 (0x07) +struct mpi3_request_header { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 function_dependent; +}; +struct mpi3_default_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; +}; +#define MPI3_HOST_TAG_INVALID (0xffff) +#define MPI3_FUNCTION_IOC_FACTS (0x01) +#define MPI3_FUNCTION_IOC_INIT (0x02) +#define MPI3_FUNCTION_PORT_ENABLE (0x03) +#define MPI3_FUNCTION_EVENT_NOTIFICATION (0x04) +#define MPI3_FUNCTION_EVENT_ACK (0x05) +#define MPI3_FUNCTION_CI_DOWNLOAD (0x06) +#define MPI3_FUNCTION_CI_UPLOAD (0x07) +#define MPI3_FUNCTION_IO_UNIT_CONTROL (0x08) +#define MPI3_FUNCTION_PERSISTENT_EVENT_LOG (0x09) +#define MPI3_FUNCTION_MGMT_PASSTHROUGH (0x0a) +#define MPI3_FUNCTION_CONFIG (0x10) +#define MPI3_FUNCTION_SCSI_IO (0x20) +#define MPI3_FUNCTION_SCSI_TASK_MGMT (0x21) +#define MPI3_FUNCTION_SMP_PASSTHROUGH (0x22) +#define MPI3_FUNCTION_NVME_ENCAPSULATED (0x24) +#define MPI3_FUNCTION_TARGET_ASSIST (0x30) +#define MPI3_FUNCTION_TARGET_STATUS_SEND (0x31) +#define MPI3_FUNCTION_TARGET_MODE_ABORT (0x32) +#define MPI3_FUNCTION_TARGET_CMD_BUF_POST_BASE (0x33) +#define MPI3_FUNCTION_TARGET_CMD_BUF_POST_LIST (0x34) +#define MPI3_FUNCTION_CREATE_REQUEST_QUEUE (0x70) +#define MPI3_FUNCTION_DELETE_REQUEST_QUEUE (0x71) +#define MPI3_FUNCTION_CREATE_REPLY_QUEUE (0x72) +#define MPI3_FUNCTION_DELETE_REPLY_QUEUE (0x73) +#define MPI3_FUNCTION_TOOLBOX (0x80) +#define MPI3_FUNCTION_DIAG_BUFFER_POST (0x81) +#define MPI3_FUNCTION_DIAG_BUFFER_MANAGE (0x82) +#define MPI3_FUNCTION_DIAG_BUFFER_UPLOAD (0x83) +#define MPI3_FUNCTION_MIN_IOC_USE_ONLY (0xc0) +#define MPI3_FUNCTION_MAX_IOC_USE_ONLY (0xef) +#define MPI3_FUNCTION_MIN_PRODUCT_SPECIFIC (0xf0) +#define MPI3_FUNCTION_MAX_PRODUCT_SPECIFIC (0xff) +#define MPI3_IOCSTATUS_LOG_INFO_AVAIL_MASK (0x8000) +#define MPI3_IOCSTATUS_LOG_INFO_AVAILABLE (0x8000) +#define MPI3_IOCSTATUS_STATUS_MASK (0x7fff) +#define MPI3_IOCSTATUS_SUCCESS (0x0000) +#define MPI3_IOCSTATUS_INVALID_FUNCTION (0x0001) +#define MPI3_IOCSTATUS_BUSY (0x0002) +#define MPI3_IOCSTATUS_INVALID_SGL (0x0003) +#define MPI3_IOCSTATUS_INTERNAL_ERROR (0x0004) +#define MPI3_IOCSTATUS_INSUFFICIENT_RESOURCES (0x0006) +#define MPI3_IOCSTATUS_INVALID_FIELD (0x0007) +#define MPI3_IOCSTATUS_INVALID_STATE (0x0008) +#define MPI3_IOCSTATUS_INSUFFICIENT_POWER (0x000a) +#define MPI3_IOCSTATUS_INVALID_CHANGE_COUNT (0x000b) +#define MPI3_IOCSTATUS_ALLOWED_CMD_BLOCK (0x000c) +#define MPI3_IOCSTATUS_SUPERVISOR_ONLY (0x000d) +#define MPI3_IOCSTATUS_FAILURE (0x001f) +#define MPI3_IOCSTATUS_CONFIG_INVALID_ACTION (0x0020) +#define MPI3_IOCSTATUS_CONFIG_INVALID_TYPE (0x0021) +#define MPI3_IOCSTATUS_CONFIG_INVALID_PAGE (0x0022) +#define MPI3_IOCSTATUS_CONFIG_INVALID_DATA (0x0023) +#define MPI3_IOCSTATUS_CONFIG_NO_DEFAULTS (0x0024) 
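For illustration only (not part of the patch), a minimal sketch of how a consumer of the reply definitions above might split the ioc_status word of a struct mpi3_default_reply into its status code and its log-info-available flag. The helper name is hypothetical; it assumes only the structure and MPI3_IOCSTATUS_* masks defined above plus the kernel's le16_to_cpu()/le32_to_cpu() byte-order helpers:

	/* Illustrative sketch only -- not part of the driver sources. */
	static inline bool example_reply_failed(const struct mpi3_default_reply *reply,
						u16 *status, u32 *log_info)
	{
		u16 ioc_status = le16_to_cpu(reply->ioc_status);

		/* The low 15 bits carry the status code proper. */
		*status = ioc_status & MPI3_IOCSTATUS_STATUS_MASK;

		/* Bit 15 indicates that ioc_log_info holds additional detail. */
		if (ioc_status & MPI3_IOCSTATUS_LOG_INFO_AVAIL_MASK)
			*log_info = le32_to_cpu(reply->ioc_log_info);
		else
			*log_info = 0;

		return *status != MPI3_IOCSTATUS_SUCCESS;
	}

A caller would typically further decode the returned log_info with the MPI3_IOCLOGINFO_* type and data masks defined just below before reporting it.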
+#define MPI3_IOCSTATUS_CONFIG_CANT_COMMIT (0x0025) +#define MPI3_IOCSTATUS_SCSI_RECOVERED_ERROR (0x0040) +#define MPI3_IOCSTATUS_SCSI_TM_NOT_SUPPORTED (0x0041) +#define MPI3_IOCSTATUS_SCSI_INVALID_DEVHANDLE (0x0042) +#define MPI3_IOCSTATUS_SCSI_DEVICE_NOT_THERE (0x0043) +#define MPI3_IOCSTATUS_SCSI_DATA_OVERRUN (0x0044) +#define MPI3_IOCSTATUS_SCSI_DATA_UNDERRUN (0x0045) +#define MPI3_IOCSTATUS_SCSI_IO_DATA_ERROR (0x0046) +#define MPI3_IOCSTATUS_SCSI_PROTOCOL_ERROR (0x0047) +#define MPI3_IOCSTATUS_SCSI_TASK_TERMINATED (0x0048) +#define MPI3_IOCSTATUS_SCSI_RESIDUAL_MISMATCH (0x0049) +#define MPI3_IOCSTATUS_SCSI_TASK_MGMT_FAILED (0x004a) +#define MPI3_IOCSTATUS_SCSI_IOC_TERMINATED (0x004b) +#define MPI3_IOCSTATUS_SCSI_EXT_TERMINATED (0x004c) +#define MPI3_IOCSTATUS_EEDP_GUARD_ERROR (0x004d) +#define MPI3_IOCSTATUS_EEDP_REF_TAG_ERROR (0x004e) +#define MPI3_IOCSTATUS_EEDP_APP_TAG_ERROR (0x004f) +#define MPI3_IOCSTATUS_TARGET_INVALID_IO_INDEX (0x0062) +#define MPI3_IOCSTATUS_TARGET_ABORTED (0x0063) +#define MPI3_IOCSTATUS_TARGET_NO_CONN_RETRYABLE (0x0064) +#define MPI3_IOCSTATUS_TARGET_NO_CONNECTION (0x0065) +#define MPI3_IOCSTATUS_TARGET_XFER_COUNT_MISMATCH (0x006a) +#define MPI3_IOCSTATUS_TARGET_DATA_OFFSET_ERROR (0x006d) +#define MPI3_IOCSTATUS_TARGET_TOO_MUCH_WRITE_DATA (0x006e) +#define MPI3_IOCSTATUS_TARGET_IU_TOO_SHORT (0x006f) +#define MPI3_IOCSTATUS_TARGET_ACK_NAK_TIMEOUT (0x0070) +#define MPI3_IOCSTATUS_TARGET_NAK_RECEIVED (0x0071) +#define MPI3_IOCSTATUS_SAS_SMP_REQUEST_FAILED (0x0090) +#define MPI3_IOCSTATUS_SAS_SMP_DATA_OVERRUN (0x0091) +#define MPI3_IOCSTATUS_DIAGNOSTIC_RELEASED (0x00a0) +#define MPI3_IOCSTATUS_CI_UNSUPPORTED (0x00b0) +#define MPI3_IOCSTATUS_CI_UPDATE_SEQUENCE (0x00b1) +#define MPI3_IOCSTATUS_CI_VALIDATION_FAILED (0x00b2) +#define MPI3_IOCSTATUS_CI_KEY_UPDATE_PENDING (0x00b3) +#define MPI3_IOCSTATUS_CI_KEY_UPDATE_NOT_POSSIBLE (0x00b4) +#define MPI3_IOCSTATUS_SECURITY_KEY_REQUIRED (0x00c0) +#define MPI3_IOCSTATUS_SECURITY_VIOLATION (0x00c1) +#define MPI3_IOCSTATUS_INVALID_QUEUE_ID (0x0f00) +#define MPI3_IOCSTATUS_INVALID_QUEUE_SIZE (0x0f01) +#define MPI3_IOCSTATUS_INVALID_MSIX_VECTOR (0x0f02) +#define MPI3_IOCSTATUS_INVALID_REPLY_QUEUE_ID (0x0f03) +#define MPI3_IOCSTATUS_INVALID_QUEUE_DELETION (0x0f04) +#define MPI3_IOCLOGINFO_TYPE_MASK (0xf0000000) +#define MPI3_IOCLOGINFO_TYPE_SHIFT (28) +#define MPI3_IOCLOGINFO_TYPE_NONE (0x0) +#define MPI3_IOCLOGINFO_TYPE_SAS (0x3) +#define MPI3_IOCLOGINFO_LOG_DATA_MASK (0x0fffffff) +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_type.h b/drivers/scsi/mpi3mr/mpi/mpi30_type.h new file mode 100644 index 0000000000000..36ec6a76d1a97 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_type.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_TYPE_H +#define MPI30_TYPE_H 1 +#endif diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h new file mode 100644 index 0000000000000..f668f4b8ef9d8 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr.h @@ -0,0 +1,1536 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. 
+ * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#ifndef MPI3MR_H_INCLUDED +#define MPI3MR_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8) && (RHEL_MINOR > 2)) || \ + (defined(CONFIG_SUSE_KERNEL) && \ + ((CONFIG_SUSE_VERSION == 15) && (CONFIG_SUSE_PATCHLEVEL >= 3))) || \ + (LINUX_VERSION_CODE > KERNEL_VERSION(5, 4, 0))) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "mpi/mpi30_transport.h" +#include "mpi/mpi30_cnfg.h" +#include "mpi/mpi30_image.h" +#include "mpi/mpi30_init.h" +#include "mpi/mpi30_ioc.h" +#include "mpi/mpi30_sas.h" +#include "mpi/mpi30_tool.h" +#include "mpi/mpi30_pci.h" +#include "mpi3mr_debug.h" +#include "mpi3mr_kernel_compat.h" + +/* Global list and lock for storing multiple adapters managed by the driver */ +extern spinlock_t mrioc_list_lock; +extern struct list_head mrioc_list; +extern atomic64_t event_counter; + +#define MPI3MR_DRIVER_VERSION "8.2.1.0.0" +#define MPI3MR_DRIVER_RELDATE "03-August-2022" + +#define MPI3MR_DRIVER_NAME "mpi3mr" +#define MPI3MR_DRIVER_LICENSE "GPL" +#define MPI3MR_DRIVER_AUTHOR "Broadcom Inc. " +#define MPI3MR_DRIVER_DESC "MPI3 Storage Controller Device Driver" + +#define MPI3MR_NAME_LENGTH 32 +#define IOCNAME "%s: " + +/* + * The maximum transfer size supported in single I/O is 1M and that is 2K + * in 512-byte sized sectors +*/ +#define MPI3MR_MAX_SECTORS 2048 + +/* Definitions for internal SGL and Chain SGL buffers */ +#define MPI3MR_PAGE_SIZE_4K 4096 +#define MPI3MR_CHAINSGE_SIZE MPI3MR_PAGE_SIZE_4K +#define MPI3MR_SG_DEPTH (MPI3MR_PAGE_SIZE_4K / \ + sizeof(struct mpi3_sge_common)) + +/* Definitions for MAX values for shost */ +#define MPI3MR_MAX_CMDS_LUN 128 +#define MPI3MR_MAX_CDB_LENGTH 32 + +/* Admin queue management definitions */ +#define MPI3MR_ADMIN_REQ_Q_SIZE (2 * MPI3MR_PAGE_SIZE_4K) +#define MPI3MR_ADMIN_REPLY_Q_SIZE (4 * MPI3MR_PAGE_SIZE_4K) +#define MPI3MR_ADMIN_REQ_FRAME_SZ 128 +#define MPI3MR_ADMIN_REPLY_FRAME_SZ 16 + +/* Operational queue management definitions */ +#define MPI3MR_OP_REQ_Q_QD 512 +#define MPI3MR_OP_REP_Q_QD 1024 +#define MPI3MR_OP_REP_Q_QD4K 4096 +#define MPI3MR_OP_REQ_Q_SEG_SIZE 4096 +#define MPI3MR_OP_REP_Q_SEG_SIZE 4096 +#define MPI3MR_MAX_SEG_LIST_SIZE 4096 + +/* Reserved Host Tag definitions */ +#define MPI3MR_HOSTTAG_INVALID 0xFFFF +#define MPI3MR_HOSTTAG_INITCMDS 1 +#define MPI3MR_HOSTTAG_BSG_CMDS 2 +#define MPI3MR_HOSTTAG_PEL_ABORT 3 +#define MPI3MR_HOSTTAG_PEL_WAIT 4 +#define MPI3MR_HOSTTAG_BLK_TMS 5 +#define MPI3MR_HOSTTAG_CFG_CMDS 6 +#define MPI3MR_HOSTTAG_TRANSPORT_CMDS 7 + +#define MPI3MR_NUM_DEVRMCMD 16 +#define MPI3MR_HOSTTAG_DEVRMCMD_MIN (MPI3MR_HOSTTAG_TRANSPORT_CMDS + 1) +#define MPI3MR_HOSTTAG_DEVRMCMD_MAX (MPI3MR_HOSTTAG_DEVRMCMD_MIN + \ + MPI3MR_NUM_DEVRMCMD - 1) + +#define MPI3MR_NUM_EVTACKCMD 4 +#define MPI3MR_HOSTTAG_EVTACKCMD_MIN (MPI3MR_HOSTTAG_DEVRMCMD_MAX + 1) +#define MPI3MR_HOSTTAG_EVTACKCMD_MAX (MPI3MR_HOSTTAG_EVTACKCMD_MIN + \ + MPI3MR_NUM_EVTACKCMD - 1) + +#define MPI3MR_NUM_SYSFS_TM 32 +#define MPI3MR_HOSTTAG_SYSFS_TM_MIN (MPI3MR_HOSTTAG_EVTACKCMD_MAX + 1) +#define MPI3MR_HOSTTAG_SYSFS_TM_MAX (MPI3MR_HOSTTAG_SYSFS_TM_MIN + \ + MPI3MR_NUM_SYSFS_TM - 1) + +#define MPI3MR_INTERNALCMDS_RESVD MPI3MR_HOSTTAG_SYSFS_TM_MAX + +/* Reduced resource count 
definition for crash kernel */ +#define MPI3MR_HOST_IOS_KDUMP 128 + +/* command/controller interaction timeout definitions in seconds */ +#define MPI3MR_INTADMCMD_TIMEOUT 60 +#define MPI3MR_PORTENABLE_TIMEOUT 300 +#define MPI3MR_PORTENABLE_POLL_INTERVAL 5 +#define MPI3MR_ABORTTM_TIMEOUT 60 +#define MPI3MR_RESETTM_TIMEOUT 60 +#define MPI3MR_TSUPDATE_INTERVAL 900 +#define MPI3MR_DEFAULT_SHUTDOWN_TIME 120 +#define MPI3MR_RAID_ERRREC_RESET_TIMEOUT 180 +#define MPI3MR_RESET_HOST_IOWAIT_TIMEOUT 5 +#define MPI3MR_PREPARE_FOR_RESET_TIMEOUT 180 +#define MPI3MR_RESET_ACK_TIMEOUT 30 + +#define MPI3MR_RESET_TOPOLOGY_SETTLE_TIME 10 + +#define MPI3MR_SCMD_TIMEOUT (60 * HZ) +#define MPI3MR_EH_SCMD_TIMEOUT (60 * HZ) + +#define MPI3MR_WATCHDOG_INTERVAL 1000 /* in milli seconds */ + +#define MPI3MR_DEFAULT_CFG_PAGE_SZ 1024 /*bytes*/ + +/* Internal admin command state definitions*/ +#define MPI3MR_CMD_NOTUSED 0x8000 +#define MPI3MR_CMD_COMPLETE 0x0001 +#define MPI3MR_CMD_PENDING 0x0002 +#define MPI3MR_CMD_REPLY_VALID 0x0004 +#define MPI3MR_CMD_RESET 0x0008 + +/* Definitions for Event replies and sense buffer allocated per controller */ +#define MPI3MR_NUM_EVT_REPLIES 32 +#define MPI3MR_SENSE_BUF_SZ 256 +#define MPI3MR_SENSEBUF_FACTOR 3 + +/* Invalid target device handle */ +#define MPI3MR_INVALID_DEV_HANDLE 0xFFFF + +/* Controller Reset related definitions */ +#define MPI3MR_HOSTDIAG_UNLOCK_RETRY_COUNT 5 +#define MPI3MR_MAX_RESET_RETRY_COUNT 3 + +#define MPI3MR_RI_MASK_RESPCODE (0x000000FF) + +#define MPI3MR_DEFAULT_MDTS (128 * 1024) +#define MPI3MR_DEFAULT_PGSZEXP (12) +#define MPI3MR_MAX_APP_XFER_SIZE (1 * 1024 * 1024) +#define MPI3MR_MAX_APP_XFER_SECTORS (2048 + 512) +#define MPI3MR_MAX_APP_XFER_SEGMENTS 512 + + +/* Command retry count definitions */ +#define MPI3MR_DEV_RMHS_RETRY_COUNT 3 +#define MPI3MR_PEL_RETRY_COUNT 3 + +/* Default target device queue depth */ +#define MPI3MR_DEFAULT_SDEV_QD 32 + +/* Definitions for Threaded IRQ poll*/ +#define MPI3MR_IRQ_POLL_SLEEP 2 +#define MPI3MR_IRQ_POLL_TRIGGER_IOCOUNT 8 + +/* Definitions for the controller security status*/ +#define MPI3MR_CTLR_SECURITY_STATUS_MASK 0x0C +#define MPI3MR_CTLR_SECURE_DBG_STATUS_MASK 0x02 + +#define MPI3MR_INVALID_DEVICE 0x00 +#define MPI3MR_CONFIG_SECURE_DEVICE 0x04 +#define MPI3MR_HARD_SECURE_DEVICE 0x08 +#define MPI3MR_TAMPERED_DEVICE 0x0C + +#define MPI3MR_DEFAULT_HDB_MAX_SZ (4 * 1024 * 1024) +#define MPI3MR_DEFAULT_HDB_DEC_SZ (1 * 1024 * 1024) +#define MPI3MR_DEFAULT_HDB_MIN_SZ (2 * 1024 * 1024) +#define MPI3MR_MAX_NUM_HDB 2 + +/* Driver Host Diag Buffer (drv_db) */ +#define MPI3MR_MIN_DIAG_HOST_BUFFER_SZ (32 * 1024) + \ + sizeof(struct mpi3_driver_buffer_header) +#define MPI3MR_DEFAULT_DIAG_HOST_BUFFER_SZ (512 * 1024) + \ + sizeof(struct mpi3_driver_buffer_header) +#define MPI3MR_UEFI_DIAG_HOST_BUFFER_OFFSET (16 * 1024) + +/* SGE Flag definition */ +#define MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST \ + (MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE | MPI3_SGE_FLAGS_DLAS_SYSTEM | \ + MPI3_SGE_FLAGS_END_OF_LIST) + +/* MSI Index from Reply Queue Index */ +#define REPLY_QUEUE_IDX_TO_MSIX_IDX(qidx, offset) (qidx + offset) + +/* HBA port flags*/ +#define MPI3MR_HBA_PORT_FLAG_DIRTY 0x01 +#define MPI3MR_HBA_PORT_FLAG_NEW 0x02 + +/* Driver diag buffer levels */ +enum mpi3mr_drv_db_level { + MRIOC_DRV_DB_DISABLED = 0, + MRIOC_DRV_DB_MINI = 1, + MRIOC_DRV_DB_FULL = 2, +}; +/* IOC State definitions */ +enum mpi3mr_iocstate { + MRIOC_STATE_READY = 1, + MRIOC_STATE_RESET, + MRIOC_STATE_FAULT, + MRIOC_STATE_BECOMING_READY, + 
MRIOC_STATE_RESET_REQUESTED, + MRIOC_STATE_UNRECOVERABLE, +}; + +/* Reset reason code definitions*/ +enum mpi3mr_reset_reason { + MPI3MR_RESET_FROM_BRINGUP = 1, + MPI3MR_RESET_FROM_FAULT_WATCH = 2, + MPI3MR_RESET_FROM_APP = 3, + MPI3MR_RESET_FROM_EH_HOS = 4, + MPI3MR_RESET_FROM_TM_TIMEOUT = 5, + MPI3MR_RESET_FROM_APP_TIMEOUT = 6, + MPI3MR_RESET_FROM_MUR_FAILURE = 7, + MPI3MR_RESET_FROM_CTLR_CLEANUP = 8, + MPI3MR_RESET_FROM_CIACTIV_FAULT = 9, + MPI3MR_RESET_FROM_PE_TIMEOUT = 10, + MPI3MR_RESET_FROM_TSU_TIMEOUT = 11, + MPI3MR_RESET_FROM_DELREQQ_TIMEOUT = 12, + MPI3MR_RESET_FROM_DELREPQ_TIMEOUT = 13, + MPI3MR_RESET_FROM_CREATEREPQ_TIMEOUT = 14, + MPI3MR_RESET_FROM_CREATEREQQ_TIMEOUT = 15, + MPI3MR_RESET_FROM_IOCFACTS_TIMEOUT = 16, + MPI3MR_RESET_FROM_IOCINIT_TIMEOUT = 17, + MPI3MR_RESET_FROM_EVTNOTIFY_TIMEOUT = 18, + MPI3MR_RESET_FROM_EVTACK_TIMEOUT = 19, + MPI3MR_RESET_FROM_CIACTVRST_TIMER = 20, + MPI3MR_RESET_FROM_GETPKGVER_TIMEOUT = 21, + MPI3MR_RESET_FROM_PELABORT_TIMEOUT = 22, + MPI3MR_RESET_FROM_SYSFS = 23, + MPI3MR_RESET_FROM_SYSFS_TIMEOUT = 24, + MPI3MR_RESET_FROM_DIAG_BUFFER_POST_TIMEOUT = 25, + MPI3MR_RESET_FROM_DIAG_BUFFER_RELEASE_TIMEOUT = 26, + MPI3MR_RESET_FROM_FIRMWARE = 27, + MPI3MR_RESET_FROM_DIAG_BUFFER_UPLOAD_TIMEOUT = 28, + MPI3MR_RESET_FROM_CFG_REQ_TIMEOUT = 29, + MPI3MR_RESET_FROM_SAS_TRANSPORT_TIMEOUT = 30, + MPI3MR_RESET_FROM_TRIGGER = 31, +}; + +/* Queue type definitions */ +enum queue_type { + MPI3MR_DEFAULT_QUEUE = 0, + MPI3MR_POLL_QUEUE, +}; + +/** + * struct mpi3mr_compimg_ver - replica of component image + * version defined in mpi30_image.h in host endianness + * + */ +struct mpi3mr_compimg_ver { + u16 build_num; + u16 cust_id; + u8 ph_minor; + u8 ph_major; + u8 gen_minor; + u8 gen_major; +}; + +/** + * struct mpi3mr_ioc_facts - replica of IOC facts data defined + * in mpi30_ioc.h in host endianness + * + */ +struct mpi3mr_ioc_facts { + u32 ioc_capabilities; + struct mpi3mr_compimg_ver fw_ver; + u32 mpi_version; + u32 diag_trace_sz; + u32 diag_fw_sz; + u32 diag_drvr_sz; + u16 max_reqs; + u16 product_id; + u16 op_req_sz; + u16 reply_sz; + u16 exceptions; + u16 max_perids; + u16 max_sasexpanders; + u16 max_sasinitiators; + u16 max_enclosures; + u16 max_pcie_switches; + u16 max_nvme; + u16 max_vds; + u16 max_hpds; + u16 max_advhpds; + u16 max_raid_pds; + u16 min_devhandle; + u16 max_devhandle; + u16 max_op_req_q; + u16 max_op_reply_q; + u16 shutdown_timeout; + u16 max_msix_vectors; + u8 ioc_num; + u8 who_init; + u8 personality; + u8 dma_mask; + u8 protocol_flags; + u8 sge_mod_mask; + u8 sge_mod_value; + u8 sge_mod_shift; + u8 max_dev_per_tg; + u16 max_io_throttle_group; + u16 io_throttle_data_length; + u16 io_throttle_low; + u16 io_throttle_high; + +}; + +/** + * struct mpi3mr_fwevt - Firmware event structure. 
+ * + * @list: list head + * @work: Work structure + * @mrioc: Adapter instance reference + * @event_id: MPI3 firmware event ID + * @send_ack: Event acknowledgment required or not + * @process_event: Bottomhalf processing required or not + * @event_context: Event context to send in Ack + * @event_data_size: size of the event data in bytes + * @pending_at_sml: waiting for device add/remove API to complete + * @discard: discard this event + * @ref_count: kref count + * @event_data: Actual MPI3 event data + */ +struct mpi3mr_fwevt { + struct list_head list; + struct work_struct work; + struct mpi3mr_ioc *mrioc; + u16 event_id; + bool send_ack; + bool process_event; + u32 event_context; + u16 event_data_size; + bool pending_at_sml; + bool discard; + struct kref ref_count; + char event_data[0] __aligned(4); +}; + +/** + * struct segments - memory descriptor structure to store + * virtual and dma addresses for operational queue segments. + * + * @segment: virtual address + * @segment_dma: dma address + */ +struct segments { + void *segment; + dma_addr_t segment_dma; +}; + +/** + * struct op_req_qinfo - Operational Request Queue Information + * + * @ci: consumer index + * @pi: producer index + * @num_request: Maximum number of entries in the queue + * @qid: Queue Id starting from 1 + * @reply_qid: Associated reply queue Id + * @num_segments: Number of discontiguous memory segments + * @segment_qd: Depth of each segments + * @q_lock: Concurrent queue access lock + * @q_segments: Segment descriptor pointer + * @q_segment_list: Segment list base virtual address + * @q_segment_list_dma: Segment list base DMA address + * @last_full_host_tag: hosttag of last IO returned to SML + * due to queue full + * @qfull_io_count: Number of IOs returned back to SML + * due to queue full + * @qfull_instances: Total queue full occurences. One occurence + * starts with queue full detection and ends + * with queue full breaks. 
+ * @dbgfs_req_queue: Per request queue debugfs directory + */ +struct op_req_qinfo { + u16 ci; + u16 pi; + u16 num_requests; + u16 qid; + u16 reply_qid; + u16 num_segments; + u16 segment_qd; + spinlock_t q_lock; + struct segments *q_segments; + void *q_segment_list; + dma_addr_t q_segment_list_dma; + + u16 last_full_host_tag; + u64 qfull_io_count; + u32 qfull_instances; +#ifdef CONFIG_DEBUG_FS + struct dentry *dbgfs_req_queue; +#endif +}; + +/** + * struct op_reply_qinfo - Operational Reply Queue Information + * + * @ci: consumer index + * @qid: Queue Id starting from 1 + * @num_replies: Maximum number of entries in the queue + * @num_segments: Number of discontiguous memory segments + * @segment_qd: Depth of each segments + * @q_segments: Segment descriptor pointer + * @q_segment_list: Segment list base virtual address + * @q_segment_list_dma: Segment list base DMA address + * @ephase: Expected phased identifier for the reply queue + * @pend_ios: Number of IOs pending in HW for this queue + * @enable_irq_poll: Flag to indicate polling is enabled + * @in_use: Queue is handled by poll/ISR + * @qtype: Type of queue (types defined in enum queue_type) + */ +struct op_reply_qinfo { + u16 ci; + u16 qid; + u16 num_replies; + u16 num_segments; + u16 segment_qd; + struct segments *q_segments; + void *q_segment_list; + dma_addr_t q_segment_list_dma; + u8 ephase; + atomic_t pend_ios; + bool enable_irq_poll; + atomic_t in_use; + enum queue_type qtype; +}; + +/** + * struct mpi3mr_intr_info - Interrupt cookie information + * + * @mrioc: Adapter instance reference + * @msix_index: MSIx index + * @op_reply_q: Associated operational reply queue + * @name: Dev name for the irq claiming device + */ +struct mpi3mr_intr_info { + struct mpi3mr_ioc *mrioc; + u16 msix_index; + struct op_reply_qinfo *op_reply_q; + char name[MPI3MR_NAME_LENGTH]; +}; + + +/** + * struct mpi3mr_throttle_group_info - Throttle group info + * + * @io_divert: Flag indicates io divert is on or off for the TG + * @needs_qd_reduction: Flag to indicate QD reduction is needed + * @qd_reduction: Queue Depth reduction in units of 10% + * @fw_qd: QueueDepth value reported by the firmware + * @modified_qd: Modified QueueDepth value due to throttling + * @id: Throttle Group ID. 
+ * @high: High limit to turn on throttling in 512 byte blocks + * @low: Low limit to turn off throttling in 512 byte blocks + * @pend_large_data_sz: Counter to track pending large data + */ +struct mpi3mr_throttle_group_info { + u8 io_divert; + u8 need_qd_reduction; + u8 qd_reduction; + u16 fw_qd; + u16 modified_qd; + u16 id; + u32 high; + u32 low; + atomic_t pend_large_data_sz; +}; + +/** + * struct mpi3mr_hba_port - HBA's port information + * @port_id: Port number + * @sas_address: SAS address of this port's attached device + * @phy_mask: HBA PHY's belonging to this port + * @flags: HBA port flags + */ +struct mpi3mr_hba_port { + struct list_head list; + u8 port_id; + u8 flags; +}; + +/** + * struct mpi3mr_sas_port - Internal SAS port information + * @port_list: List of ports belonging to a SAS node + * @num_phys: Number of phys associated with port + * @marked_responding: used while refresing the sas ports + * @lowest_phy: lowest phy ID of current sas port + * @phy_mask: phy_mask of current sas port + * @hba_port: HBA port entry + * @remote_identify: Attached device identification + * @rphy: SAS transport layer rphy object + * @port: SAS transport layer port object + * @phy_list: mpi3mr_sas_phy objects belonging to this port + */ +struct mpi3mr_sas_port { + struct list_head port_list; + u8 num_phys; + u8 marked_responding; + int lowest_phy; + u32 phy_mask; + struct mpi3mr_hba_port *hba_port; + struct sas_identify remote_identify; + struct sas_rphy *rphy; + struct sas_port *port; + struct list_head phy_list; +}; + +/** + * struct mpi3mr_sas_phy - Internal SAS Phy information + * @port_siblings: List of phys belonging to a port + * @identify: Phy identification + * @remote_identify: Attached device identification + * @phy: SAS transport layer Phy object + * @phy_id: Unique phy id within a port + * @handle: Firmware device handle for this phy + * @attached_handle: Firmware device handle for attached device + * @phy_belongs_to_port: Flag to indicate phy belongs to port + @hba_port: HBA port entry + */ +struct mpi3mr_sas_phy { + struct list_head port_siblings; + struct sas_identify identify; + struct sas_identify remote_identify; + struct sas_phy *phy; + u8 phy_id; + u16 handle; + u16 attached_handle; + u8 phy_belongs_to_port; + struct mpi3mr_hba_port *hba_port; +}; + +/** + * struct mpi3mr_sas_node - SAS host/expander information + * @list: List of sas nodes in a controller + * @parent_dev: Parent device class + * @num_phys: Number phys belonging to sas_node + * @sas_address: SAS address of sas_node + * @handle: Firmware device handle for this sas_host/expander + * @sas_address_parent: SAS address of parent expander or host + * @enclosure_handle: Firmware handle of enclosure of this node + * @device_info: Capabilities of this sas_host/expander + * @non_responding: used to refresh the expander devices during reset + * @host_node: Flag to indicate this is a host_node + * @hba_port: HBA port entry + * @phy: A list of phys that make up this sas_host/expander + * @sas_port_list: List of internal ports of this node + * @rphy: sas_rphy object of this expander node + */ +struct mpi3mr_sas_node { + struct list_head list; + struct device *parent_dev; + u8 num_phys; + u64 sas_address; + u16 handle; + u64 sas_address_parent; + u16 enclosure_handle; + u64 enclosure_logical_id; + u8 non_responding; + u8 host_node; + struct mpi3mr_hba_port *hba_port; + struct mpi3mr_sas_phy *phy; + struct list_head sas_port_list; + struct sas_rphy *rphy; +}; + +/** + * struct mpi3mr_enclosure_node - enclosure information 
+ * @list: List of enclosures + * @pg0: Enclosure page 0; + */ +struct mpi3mr_enclosure_node { + struct list_head list; + struct mpi3_enclosure_page0 pg0; +}; + +/** + * struct tgt_dev_sas_sata - SAS/SATA device specific + * information cached from firmware given data + * + * @sas_address: World wide unique SAS address + * @sas_address_parent: Sas address of parent expander or host + * @dev_info: Device information bits + * @phy_id: Phy identifier provided in device page 0 + * @phy_id: Attached phy identifier provided in device page 0 + * @sas_transport_attached: Is this device exposed to transport + * @pend_sas_rphy_add: Flag to check device is in process of add + * @hba_port: HBA port entry + * @rphy: SAS transport layer rphy object + */ +struct tgt_dev_sas_sata { + u64 sas_address; + u64 sas_address_parent; + u16 dev_info; + u8 phy_id; + u8 attached_phy_id; + u8 sas_transport_attached; + u8 pend_sas_rphy_add; + struct mpi3mr_hba_port *hba_port; + struct sas_rphy *rphy; +}; + +/** + * struct trigger_event_data - store trigger related + * information. + * + * @trace_hdb: Trace diag buffer descriptor reference + * @fw_hdb: FW diag buffer descriptor reference + * @trigger_type: Trigger type + * @trigger_specific_data: Trigger specific data + * @snapdump: Snapdump enable or disable flag + */ +struct trigger_event_data { + struct diag_buffer_desc *trace_hdb; + struct diag_buffer_desc *fw_hdb; + u8 trigger_type; + u64 trigger_specific_data; + bool snapdump; +}; + +/** + * struct tgt_dev_pcie - PCIe device specific information cached + * from firmware given data + * + * @mdts: Maximum data transfer size + * @capb: Device capabilities + * @pgsz: Device page size + * @abort_to: Timeout for abort TM + * @reset_to: Timeout for Target/LUN reset TM + * @dev_info: Device information bits + */ +struct tgt_dev_pcie { + u32 mdts; + u16 capb; + u8 pgsz; + u8 abort_to; + u8 reset_to; + u16 dev_info; +}; + +/** + * struct tgt_dev_vd - virtual device specific information + * cached from firmware given data + * + * @state: State of the VD + * @qd_reduction: Queue Depth reduction in units of 10% + * @tg_id: VDs throttle group ID + * @high: High limit to turn on throttling in 512 byte blocks + * @low: Low limit to turn off throttling in 512 byte blocks + * @tg: Pointer to throttle group info + */ +struct tgt_dev_vd { + u8 state; + u8 tg_qd_reduction; + u16 tg_id; + u32 tg_high; + u32 tg_low; + struct mpi3mr_throttle_group_info *tg; +}; + + +/** + * union _form_spec_inf - union of device specific information + */ +union _form_spec_inf { + struct tgt_dev_sas_sata sas_sata_inf; + struct tgt_dev_pcie pcie_inf; + struct tgt_dev_vd vd_inf; +}; + + +/** + * struct mpi3mr_tgt_dev - target device data structure + * + * @list: List pointer + * @starget: Scsi_target pointer + * @dev_handle: FW device handle + * @parent_handle: FW parent device handle + * @slot: Slot number + * @encl_handle: FW enclosure handle + * @perst_id: FW assigned Persistent ID + * @devpg0_flag: Device Page0 flag + * @dev_type: SAS/SATA/PCIE device type + * @is_hidden: Should be exposed to upper layers or not + * @host_exposed: Already exposed to host or not + * @io_unit_port: IO Unit port ID + * @non_stl: Is this device not to be attached with SAS TL + * @io_throttle_enabled: I/O throttling needed or not + * @q_depth: Device specific Queue Depth + * @wwid: World wide ID + * @enclosure_logical_id: Enclosure logical identifier + * @dev_spec: Device type specific information + * @ref_count: Reference count + */ +struct mpi3mr_tgt_dev { + struct 
list_head list; + struct scsi_target *starget; + u16 dev_handle; + u16 parent_handle; + u16 slot; + u16 encl_handle; + u16 perst_id; + u16 devpg0_flag; + u8 dev_type; + u8 is_hidden; + u8 host_exposed; + u8 io_unit_port; + u8 non_stl; + u8 io_throttle_enabled; + u16 q_depth; + u64 wwid; + u64 enclosure_logical_id; + union _form_spec_inf dev_spec; + struct kref ref_count; +}; + +/** + * mpi3mr_tgtdev_get - k reference incrementor + * @s: Target device reference + * + * Increment target device reference count. + */ +static inline void mpi3mr_tgtdev_get(struct mpi3mr_tgt_dev *s) +{ + kref_get(&s->ref_count); +} + +/** + * mpi3mr_free_tgtdev - target device memory dealloctor + * @r: k reference pointer of the target device + * + * Free target device memory when no reference. + */ +static inline void mpi3mr_free_tgtdev(struct kref *r) +{ + kfree(container_of(r, struct mpi3mr_tgt_dev, ref_count)); +} + +/** + * mpi3mr_tgtdev_put - k reference decrementor + * @s: Target device reference + * + * Decrement target device reference count. + */ +static inline void mpi3mr_tgtdev_put(struct mpi3mr_tgt_dev *s) +{ + kref_put(&s->ref_count, mpi3mr_free_tgtdev); +} + +/** + * struct mpi3mr_stgt_priv_data - SCSI target private structure + * + * @starget: Scsi_target pointer + * @dev_handle: FW device handle + * @perst_id: FW assigned Persistent ID + * @num_luns: Number of Logical Units + * @block_io: I/O blocked to the device or not + * @dev_removed: Device removed in the Firmware + * @dev_removedelay: Device is waiting to be removed in FW + * @dev_type: Device type + * @dev_nvme_dif: Device is NVMe DIF enabled + * @io_throttle_enabled: I/O throttling needed or not + * @io_divert: Flag indicates io divert is on or off for the dev + * @throttle_group: Pointer to throttle group info + * @tgt_dev: Internal target device pointer + * @pend_count: Counter to track pending I/Os during error + * handling + */ +struct mpi3mr_stgt_priv_data { + struct scsi_target *starget; + u16 dev_handle; + u16 perst_id; + u32 num_luns; + atomic_t block_io; + u8 dev_removed; + u8 dev_removedelay; + u8 dev_type; + u8 dev_nvme_dif; + u8 io_throttle_enabled; + u8 io_divert; + struct mpi3mr_throttle_group_info *throttle_group; + struct mpi3mr_tgt_dev *tgt_dev; + u32 pend_count; +}; + +/** + * struct mpi3mr_sdev_priv_data - SCSI device private structure + * + * @tgt_priv_data: Scsi_target private data pointer + * @lun_id: LUN ID of the device + * @ncq_prio_enable: NCQ priority enable for SATA device + * @pend_count: Counter to track pending I/Os during error + * handling + */ +struct mpi3mr_sdev_priv_data { + struct mpi3mr_stgt_priv_data *tgt_priv_data; + u32 lun_id; + u8 ncq_prio_enable; + u32 pend_count; +}; + +/** + * struct mpi3mr_drv_cmd - Internal command tracker + * + * @mutex: Command mutex + * @done: Completor for wakeup + * @reply: Firmware reply for internal commands + * @sensebuf: Sensebuf for SCSI IO commands + * @iou_rc: IO Unit control reason code + * @state: Command State + * @dev_handle: Firmware handle for device specific commands + * @ioc_status: IOC status from the firmware + * @ioc_loginfo:IOC log info from the firmware + * @is_waiting: Is the command issued in block mode + * @is_sense: Is Sense data present + * @retry_count: Retry count for retriable commands + * @host_tag: Host tag used by the command + * @callback: Callback for non blocking commands + */ +struct mpi3mr_drv_cmd { + struct mutex mutex; + struct completion done; + void *reply; + u8 *sensebuf; + u8 iou_rc; + u16 state; + u16 dev_handle; + u16 
ioc_status; + u32 ioc_loginfo; + u8 is_waiting; + u8 is_sense; + u8 retry_count; + u16 host_tag; + void (*callback)(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd); +}; + + +/** + * struct chain_element - memory descriptor structure to store + * virtual and dma addresses for chain elements. + * + * @addr: virtual address + * @dma_addr: dma address + */ +struct chain_element { + void *addr; + dma_addr_t dma_addr; +}; + +/** + * struct scmd_priv - SCSI command private data + * + * @host_tag: Host tag specific to operational queue + * @in_lld_scope: Command in LLD scope or not + * @meta_sg_valid: DIX command with meta data SGL or not + * @scmd: SCSI Command pointer + * @req_q_idx: Operational request queue index + * @chain_idx: Chain frame index + * @meta_chain_idx: Chain frame index of meta data SGL + * @mpi3mr_scsiio_req: MPI SCSI IO request + */ +struct scmd_priv { + u16 host_tag; + u8 in_lld_scope; + u8 meta_sg_valid; + struct scsi_cmnd *scmd; + u16 req_q_idx; + int chain_idx; + int meta_chain_idx; + u8 mpi3mr_scsiio_req[MPI3MR_ADMIN_REQ_FRAME_SZ]; +}; + +/** + * struct diag_buffer_desc - memory descriptor structure to + * store virtual, dma addresses, size, buffer status for host + * diagnostic buffers. + * + * @type: Buffer type + * @trigger_data: Trigger data + * @trigger_type: Trigger type + * @status: Buffer status + * @size: Buffer size + * @addr: Virtual address + * @dma_addr: Buffer DMA address + */ +struct diag_buffer_desc { + u8 type; + u64 trigger_data; + u8 trigger_type; + u8 status; + u32 size; + void *addr; + dma_addr_t dma_addr; +}; + +/** + * struct dma_memory_desc - memory descriptor structure to store + * virtual address, dma address and size for any generic dma + * memory allocations in the driver. + * + * @size: Buffer size + * @addr: Virtual address + * @dma_addr: DMA address + */ +struct dma_memory_desc { + u32 size; + void *addr; + dma_addr_t dma_addr; +}; + + +/** + * struct mpi3mr_ioc - Adapter anchor structure stored in shost + * private data + * + * @list: List pointer + * @pdev: PCI device pointer + * @shost: Scsi_Host pointer + * @id: Controller ID + * @cpu_count: Number of online CPUs + * @dbgfs_adapter: Debugfs directory per controller + * @name: Controller ASCII name + * @driver_name: Driver ASCII name + * @sysif_regs: System interface registers virtual address + * @sysif_regs_phys: System interface registers physical address + * @bars: PCI BARS + * @dma_mask: DMA mask + * @msix_count: Number of MSIX vectors used + * @intr_enabled: Is interrupts enabled + * @num_admin_req: Number of admin requests + * @admin_req_q_sz: Admin request queue size + * @admin_req_pi: Admin request queue producer index + * @admin_req_ci: Admin request queue consumer index + * @admin_req_base: Admin request queue base virtual address + * @admin_req_dma: Admin request queue base dma address + * @admin_req_lock: Admin queue access lock + * @num_admin_replies: Number of admin replies + * @admin_reply_q_sz: Admin reply queue size + * @admin_reply_ci: Admin reply queue consumer index + * @admin_reply_ephase:Admin reply queue expected phase + * @admin_reply_base: Admin reply queue base virtual address + * @admin_reply_dma: Admin reply queue base dma address + * @admin_reply_q_in_use: Queue is handled by poll/ISR + * @ready_timeout: Controller ready timeout + * @intr_info: Interrupt cookie pointer + * @intr_info_count: Number of interrupt cookies + * @is_intr_info_set: Flag to indicate intr info is setup + * @num_queues: Number of operational queues + * @is_segqueue_enabled: 
Flag to indicate segmented q is enabled + * @num_op_req_q: Number of operational request queues + * @req_qinfo: Operational request queue info pointer + * @num_op_reply_q: Number of operational reply queues + * @op_reply_qinfo: Operational reply queue info pointer + * @init_cmds: Command tracker for initialization commands + * @cfg_cmds: Command tracker for configuration requests + * @facts: Cached IOC facts data + * @op_reply_desc_sz: Operational reply descriptor size + * @num_reply_bufs: Number of reply buffers allocated + * @reply_buf_pool: Reply buffer pool + * @reply_buf: Reply buffer base virtual address + * @reply_buf_dma: Reply buffer DMA address + * @reply_buf_dma_max_address: Reply DMA address max limit + * @reply_free_qsz: Reply free queue size + * @reply_sz: Cached Reply size reported in IOC facts + * @reply_free_q_pool: Reply free queue pool + * @reply_free_q: Reply free queue base virtual address + * @reply_free_q_dma: Reply free queue base DMA address + * @reply_free_queue_lock: Reply free queue lock + * @reply_free_queue_host_index: Reply free queue host index + * @num_sense_bufs: Number of sense buffers + * @sense_buf_pool: Sense buffer pool + * @sense_buf: Sense buffer base virtual address + * @sense_buf_dma: Sense buffer base DMA address + * @sense_buf_q_sz: Sense buffer queue size + * @sense_buf_q_pool: Sense buffer queue pool + * @sense_buf_q: Sense buffer queue virtual address + * @sense_buf_q_dma: Sense buffer queue DMA address + * @sbq_lock: Sense buffer queue lock + * @sbq_host_index: Sense buffer queuehost index + * @event_masks: Event mask bitmap + * @fwevt_worker_name: Firmware event worker thread name + * @fwevt_worker_thread: Firmware event worker thread + * @fwevt_lock: Firmware event lock + * @fwevt_list: Firmware event list + * @watchdog_work_q_name: Fault watchdog worker thread name + * @watchdog_work_q: Fault watchdog worker thread + * @watchdog_work: Fault watchdog work + * @watchdog_lock: Fault watchdog lock + * @is_driver_loading: Is driver still loading + * @scan_started: Async scan started + * @scan_failed: Asycn scan failed + * @stop_drv_processing: Stop all command processing + * @device_refresh_on: Don't process the events untill devices are refreshed + * @max_host_ios: Maximum host I/O count + * @tgtdev_lock: Target device list lock + * @tgtdev_list: Target device lock + * @chain_buf_count: Chain buffer count + * @chain_buf_pool: Chain buffer pool + * @chain_sgl_list: Chain SGL list + * @chain_bitmap_sz: Chain buffer allocator bitmap size + * @chain_bitmap: Chain buffer allocator bitmap + * @chain_buf_lock: Chain buffer list lock + * @bsg_cmds: Command tracker for BSG command + * @host_tm_cmds: Command tracker for task management commands + * @dev_rmhs_cmds: Command tracker for device removal commands + * @evtack_cmds: Command tracker for event ack commands + * @sysfs_tm_cmds: Command tracker for sysfs TM commands + * @devrem_bitmap_sz: Device removal bitmap size + * @devrem_bitmap: Device removal bitmap + * @dev_handle_bitmap_sz: Device handle bitmap size + * @removepend_bitmap: Remove pending bitmap + * @delayed_rmhs_list: Delayed device removal list + * @evtack_cmds_bitmap_sz: Event Ack bitmap size + * @evtack_cmds_bitmap: Event Ack bitmap + * @delayed_evtack_cmds_list: Delayed event acknowledgment list + * @ts_update_counter: Timestamp update counter + * @reset_in_progress: Reset in progress flag + * @unrecoverable: Controller unrecoverable flag + * @block_bsgs: Block BSG flag + * @prev_reset_result: Result of previous reset + * @reset_mutex: 
Controller reset mutex + * @reset_waitq: Controller reset wait queue + * @prepare_for_reset: Prepare for reset event received + * @prepare_for_reset_timeout_counter: Prepare for reset timeout + * @prp_list_virt: NVMe encapsulated PRP list virtual base + * @prp_list_dma: NVMe encapsulated PRP list DMA + * @prp_sz: NVME encapsulated PRP list size + * @diagsave_timeout: Diagnostic information save timeout + * @logging_level: Controller debug logging level + * @flush_io_count: I/O count to flush after reset + * @current_event: Firmware event currently in process + * @driver_info: Driver, Kernel, OS information to firmware + * @change_count: Topology change count + * @pel_enabled: Persistent Event Log(PEL) enabled or not + * @pel_abort_requested: PEL abort is requested or not + * @pel_class: PEL Class identifier + * @pel_locale: PEL Locale identifier + * @pel_cmds: Command tracker for PEL wait command + * @pel_abort_cmd: Command tracker for PEL abort command + * @pel_newest_seqnum: Newest PEL sequenece number + * @pel_seqnum_virt: PEL sequence number virtual address + * @pel_seqnum_dma: PEL sequence number DMA address + * @pel_seqnum_sz: PEL sequenece number size + * @op_reply_q_offset: Operational reply queue offset with MSIx + * @sysfs_tm_pending: Pending TMs issued through SysFS + * @sysfs_tm_issued: TMs issued through SysFS + * @sysfs_tm_terminated_io_count:I/Os terminated by SysFS TMs + * @sysfs_pending_tm_wq: SysFS TM pending work queue + * @diag_buffers: Host diagnostic buffers + * @reply_trigger_present: Reply trigger present flag + * @event_trigger_present: Event trigger present flag + * @scsisense_trigger_present: Scsi sense trigger present flag + * @snapdump_trigger_active: Snapdump trigger active flag + * @fw_release_trigger_active: Fw release trigger active flag + * @trace_release_trigger_active: Trace trigger active flag + * @driver_pg2: Driver page 2 pointer + * @dump: kmsg dumper interface for snapdump + * @drv_diag_buffer: Diagnostic host buffer virtual address + * @drv_diag_buffer_dma: Diagnostic host buffer DMA address + * @drv_diag_buffer_sz: Diagnostic host buffer size + * @default_qcount: Total Default queues + * @active_poll_qcount: Currently active poll queue count + * @requested_poll_qcount: User requested poll queue count + * @check_xprotect_nvme: Flag to check xprotect for nvme or not + * @skip_uefi_snapdump: Skip copying UEFI logs into snapdump + * @logdata_buf: Circular buffer to store log data entries + * @logdata_buf_idx: Index of entry in buffer to store + * @logdata_entry_sz: log data entry size + * @adm_req_q_bar_writeq_lock: Admin request queue lock + * @adm_reply_q_bar_writeq_lock: Admin reply queue lock + * @pend_ios: Pending IO Count + * @cfg_page: Default memory for configuration pages + * @cfg_page_dma: Configuration page DMA address + * @cfg_page_sz: Default configuration page memory size + * @sas_transport_enabled: SAS transport enabled or not + * @scsi_device_channel: Channel ID for SCSI devices + * @transport_cmds: Command tracker for SAS transport commands + * @sas_hba: SAS node for the controller + * @sas_expander_list: SAS node list of expanders + * @sas_node_lock: Lock to protect SAS node list + * @hba_port_table_list: List of HBA Ports + * @enclosure_list: List of Enclosure objects + * @pend_large_data_sz: Counter to track pending large data + * @io_throttle_data_length: I/O size to track in 512b blocks + * @io_throttle_high: I/O size to start throttle in 512b blocks + * @io_throttle_low: I/O size to stop throttle in 512b blocks + * 
@num_io_throttle_group: Maximum number of throttle groups + * @throttle_groups: Pointer to throttle group info structures + * @bsg_dev: BSG device structure + * @bsg_queue: Request queue for BSG device + */ +struct mpi3mr_ioc { + struct list_head list; + struct pci_dev *pdev; + struct Scsi_Host *shost; + u8 id; + int cpu_count; + +#ifdef CONFIG_DEBUG_FS + struct dentry *dbgfs_adapter; + struct dentry *dmesg_dump; + struct dentry *uefi_logs_dump; +#endif + + char name[MPI3MR_NAME_LENGTH]; + char driver_name[MPI3MR_NAME_LENGTH]; + + volatile struct mpi3_sysif_registers __iomem *sysif_regs; + resource_size_t sysif_regs_phys; + int bars; + u64 dma_mask; + + u16 msix_count; + u8 intr_enabled; + + u16 num_admin_req; + u32 admin_req_q_sz; + u16 admin_req_pi; + u16 admin_req_ci; + void *admin_req_base; + dma_addr_t admin_req_dma; + spinlock_t admin_req_lock; + + u16 num_admin_replies; + u32 admin_reply_q_sz; + u16 admin_reply_ci; + u8 admin_reply_ephase; + void *admin_reply_base; + dma_addr_t admin_reply_dma; + atomic_t admin_reply_q_in_use; + + u32 ready_timeout; + + struct mpi3mr_intr_info *intr_info; + u16 intr_info_count; + bool is_intr_info_set; + + u16 num_queues; + bool is_segqueue_enabled; + u16 num_op_req_q; + struct op_req_qinfo *req_qinfo; + + u16 num_op_reply_q; + struct op_reply_qinfo *op_reply_qinfo; + + struct mpi3mr_drv_cmd init_cmds; + struct mpi3mr_drv_cmd cfg_cmds; + struct mpi3mr_ioc_facts facts; + u16 op_reply_desc_sz; + + u32 num_reply_bufs; + struct dma_pool *reply_buf_pool; + u8 *reply_buf; + dma_addr_t reply_buf_dma; + dma_addr_t reply_buf_dma_max_address; + + u16 reply_free_qsz; + u16 reply_sz; + struct dma_pool *reply_free_q_pool; + __le64 *reply_free_q; + dma_addr_t reply_free_q_dma; + spinlock_t reply_free_queue_lock; + u32 reply_free_queue_host_index; + + u32 num_sense_bufs; + struct dma_pool *sense_buf_pool; + u8 *sense_buf; + dma_addr_t sense_buf_dma; + + u16 sense_buf_q_sz; + struct dma_pool *sense_buf_q_pool; + __le64 *sense_buf_q; + dma_addr_t sense_buf_q_dma; + spinlock_t sbq_lock; + u32 sbq_host_index; + + u32 event_masks[MPI3_EVENT_NOTIFY_EVENTMASK_WORDS]; + + char fwevt_worker_name[MPI3MR_NAME_LENGTH]; + struct workqueue_struct *fwevt_worker_thread; + spinlock_t fwevt_lock; + struct list_head fwevt_list; + + char watchdog_work_q_name[20]; + struct workqueue_struct *watchdog_work_q; + struct delayed_work watchdog_work; + spinlock_t watchdog_lock; + + u8 is_driver_loading; + u8 scan_started; + u16 scan_failed; + u8 stop_drv_processing; + u8 device_refresh_on; + + u16 max_host_ios; + + spinlock_t tgtdev_lock; + struct list_head tgtdev_list; + + u32 chain_buf_count; + struct dma_pool *chain_buf_pool; + struct chain_element *chain_sgl_list; + u16 chain_bitmap_sz; + void *chain_bitmap; + spinlock_t chain_buf_lock; + + struct mpi3mr_drv_cmd bsg_cmds; + struct mpi3mr_drv_cmd host_tm_cmds; + struct mpi3mr_drv_cmd dev_rmhs_cmds[MPI3MR_NUM_DEVRMCMD]; + struct mpi3mr_drv_cmd evtack_cmds[MPI3MR_NUM_EVTACKCMD]; + struct mpi3mr_drv_cmd sysfs_tm_cmds[MPI3MR_NUM_SYSFS_TM]; + + u16 devrem_bitmap_sz; + void *devrem_bitmap; + u16 dev_handle_bitmap_sz; + void *removepend_bitmap; + struct list_head delayed_rmhs_list; + u16 evtack_cmds_bitmap_sz; + void *evtack_cmds_bitmap; + struct list_head delayed_evtack_cmds_list; + + u32 ts_update_counter; + + u8 reset_in_progress; + u8 unrecoverable; + u8 block_bsgs; + int prev_reset_result; + struct mutex reset_mutex; + wait_queue_head_t reset_waitq; + + u8 prepare_for_reset; + u16 prepare_for_reset_timeout_counter; + + void *prp_list_virt; 
+ dma_addr_t prp_list_dma; + u32 prp_sz; + + u16 diagsave_timeout; + int logging_level; + u16 flush_io_count; + + struct mpi3mr_fwevt *current_event; + struct mpi3_driver_info_layout driver_info; + u16 change_count; + + u8 pel_enabled; + u8 pel_abort_requested; + u8 pel_class; + u16 pel_locale; + struct mpi3mr_drv_cmd pel_cmds; + struct mpi3mr_drv_cmd pel_abort_cmd; + + u32 pel_newest_seqnum; + void *pel_seqnum_virt; + dma_addr_t pel_seqnum_dma; + u32 pel_seqnum_sz; + u16 op_reply_q_offset; + + atomic_t sysfs_tm_pending; + u16 sysfs_tm_issued; + u16 sysfs_tm_terminated_io_count; + wait_queue_head_t sysfs_pending_tm_wq; + + struct diag_buffer_desc diag_buffers[MPI3MR_MAX_NUM_HDB]; + bool reply_trigger_present; + bool event_trigger_present; + bool scsisense_trigger_present; + bool snapdump_trigger_active; + bool fw_release_trigger_active; + bool trace_release_trigger_active; + struct mpi3_driver_page2 *driver_pg2; + spinlock_t trigger_lock; + + struct mpi3mr_kmsg_dumper dump; + void *drv_diag_buffer; + dma_addr_t drv_diag_buffer_dma; + u32 drv_diag_buffer_sz; + + void *uefi_logs; + u32 uefi_logs_sz; + + u16 default_qcount; + u16 active_poll_qcount; + u16 requested_poll_qcount; + + bool check_xprotect_nvme; + bool skip_uefi_snapdump; + + u8 *logdata_buf; + u16 logdata_buf_idx; + u16 logdata_entry_sz; + spinlock_t adm_req_q_bar_writeq_lock; + spinlock_t adm_reply_q_bar_writeq_lock; + +#if defined(IO_COUNTER_SUPPORT) + atomic_t pend_ios; +#endif + + void *cfg_page; + dma_addr_t cfg_page_dma; + u16 cfg_page_sz; + + u8 sas_transport_enabled; + u8 scsi_device_channel; + struct mpi3mr_drv_cmd transport_cmds; + struct mpi3mr_sas_node sas_hba; + struct list_head sas_expander_list; + spinlock_t sas_node_lock; + struct list_head hba_port_table_list; + struct list_head enclosure_list; + + atomic_t pend_large_data_sz; + u32 io_throttle_data_length; + u32 io_throttle_high; + u32 io_throttle_low; + u16 num_io_throttle_group; + struct mpi3mr_throttle_group_info *throttle_groups; + + struct device *bsg_dev; + struct request_queue *bsg_queue; +}; + +int mpi3mr_setup_resources(struct mpi3mr_ioc *mrioc); +void mpi3mr_cleanup_resources(struct mpi3mr_ioc *mrioc); +int mpi3mr_init_ioc(struct mpi3mr_ioc *mrioc); +int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume); +void mpi3mr_cleanup_ioc(struct mpi3mr_ioc *mrioc); +int mpi3mr_issue_port_enable(struct mpi3mr_ioc *mrioc, u8 async); +int mpi3mr_admin_request_post(struct mpi3mr_ioc *mrioc, void *admin_req, + u16 admin_req_sz, u8 ignore_reset); +int mpi3mr_op_request_post(struct mpi3mr_ioc *mrioc, + struct op_req_qinfo *opreqq, u8 *req); +void mpi3mr_add_sg_single(void *paddr, u8 flags, u32 length, + dma_addr_t dma_addr); +void mpi3mr_build_zero_len_sge(void *paddr); +void *mpi3mr_get_sensebuf_virt_addr(struct mpi3mr_ioc *mrioc, + dma_addr_t phys_addr); +void *mpi3mr_get_reply_virt_addr(struct mpi3mr_ioc *mrioc, + dma_addr_t phys_addr); +void mpi3mr_repost_sense_buf(struct mpi3mr_ioc *mrioc, + u64 sense_buf_dma); +void mpi3mr_pel_get_seqnum_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd); +void mpi3mr_memset_buffers(struct mpi3mr_ioc *mrioc); +void mpi3mr_free_mem(struct mpi3mr_ioc *mrioc); + +void mpi3mr_os_handle_events(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply); +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_handle( + struct mpi3mr_ioc *mrioc, u16 handle); +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_perst_id( + struct mpi3mr_ioc *mrioc, u16 persist_id); +struct mpi3mr_tgt_dev 
*mpi3mr_get_tgtdev_from_tgtpriv( + struct mpi3mr_ioc *mrioc, struct mpi3mr_stgt_priv_data *tgt_priv); +void mpi3mr_process_op_reply_desc(struct mpi3mr_ioc *mrioc, + struct mpi3_default_reply_descriptor *reply_desc, u64 *reply_dma, u16 qidx); + +void mpi3mr_start_watchdog(struct mpi3mr_ioc *mrioc); +void mpi3mr_stop_watchdog(struct mpi3mr_ioc *mrioc); + +int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc, + u32 reset_reason, u8 snapdump); +int mpi3mr_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type, + u16 handle, uint lun, u16 htag, ulong timeout, + struct mpi3mr_drv_cmd *drv_cmd, + u8 *resp_code, struct scsi_cmnd *scmd); +int mpi3mr_diagfault_reset_handler(struct mpi3mr_ioc *mrioc, + u32 reset_reason); +void mpi3mr_ioc_disable_intr(struct mpi3mr_ioc *mrioc); +void mpi3mr_ioc_enable_intr(struct mpi3mr_ioc *mrioc); + +enum mpi3mr_iocstate mpi3mr_get_iocstate(struct mpi3mr_ioc *mrioc); +int mpi3mr_process_event_ack(struct mpi3mr_ioc *mrioc, u8 event, + u32 event_ctx); +int mpi3mr_pel_get_seqnum_post(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd); + +void mpi3mr_wait_for_host_io(struct mpi3mr_ioc *mrioc, u32 timeout); +void mpi3mr_cleanup_fwevt_list(struct mpi3mr_ioc *mrioc); +void mpi3mr_flush_host_io(struct mpi3mr_ioc *mrioc); +void mpi3mr_invalidate_devhandles(struct mpi3mr_ioc *mrioc); +void mpi3mr_refresh_tgtdevs(struct mpi3mr_ioc *mrioc); +void mpi3mr_flush_delayed_cmd_lists(struct mpi3mr_ioc *mrioc); + +void mpi3mr_bsg_init(struct mpi3mr_ioc *mrioc); +void mpi3mr_bsg_exit(struct mpi3mr_ioc *mrioc); +void mpi3mr_app_save_logdata(struct mpi3mr_ioc *mrioc, char *event_data, + u16 event_data_size); +int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc, + struct op_reply_qinfo *op_reply_q); + +#if (KERNEL_VERSION(5, 16, 0) > LINUX_VERSION_CODE) +extern struct device_attribute *mpi3mr_host_attrs[]; +extern struct device_attribute *mpi3mr_dev_attrs[]; +#else +extern const struct attribute_group *mpi3mr_host_groups[]; +extern const struct attribute_group *mpi3mr_dev_groups[]; +#endif + +u8 mpi3mr_scsih_ncq_prio_supp(struct scsi_device *sdev); + +void mpi3mr_print_fault_info(struct mpi3mr_ioc *mrioc); + +int mpi3mr_post_diag_bufs(struct mpi3mr_ioc *mrioc); +void mpi3mr_release_diag_bufs(struct mpi3mr_ioc *mrioc, u8 skip_rel_action); +void mpi3mr_hdb_trigger_data_event(struct mpi3mr_ioc *mrioc, + struct trigger_event_data *event_data); +void mpi3mr_alloc_diag_bufs(struct mpi3mr_ioc *mrioc); +int mpi3mr_refresh_trigger(struct mpi3mr_ioc *mrioc, u8 page_type); +void mpi3mr_master_trigger(struct mpi3mr_ioc *mrioc, u64 trigger_data); +void mpi3mr_scsisense_trigger(struct mpi3mr_ioc *mrioc, u8 senseky, u8 asc, + u8 ascq); +void mpi3mr_event_trigger(struct mpi3mr_ioc *mrioc, u8 event); +void mpi3mr_reply_trigger(struct mpi3mr_ioc *mrioc, u16 iocstatus, + u32 iocloginfo); +void mpi3mr_hdbstatuschg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply); +int mpi3mr_issue_diag_buf_release(struct mpi3mr_ioc *mrioc, + struct diag_buffer_desc *diag_buffer); +void mpi3mr_check_rh_fault_ioc(struct mpi3mr_ioc *mrioc, u32 reason_code); + +extern struct sas_function_template mpi3mr_transport_functions; +extern struct scsi_transport_template *mpi3mr_transport_template; + +int mpi3mr_cfg_get_dev_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_device_page0 *dev_pg0, u16 pg_sz, u32 form, u32 form_spec); +int mpi3mr_cfg_get_sas_phy_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_phy_page0 *phy_pg0, u16 pg_sz, u32 form, + u32 form_spec); +int 
mpi3mr_cfg_get_sas_phy_pg1(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_phy_page1 *phy_pg1, u16 pg_sz, u32 form, + u32 form_spec); +int mpi3mr_cfg_get_sas_exp_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_expander_page0 *exp_pg0, u16 pg_sz, u32 form, + u32 form_spec); +int mpi3mr_cfg_get_sas_exp_pg1(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_expander_page1 *exp_pg1, u16 pg_sz, u32 form, + u32 form_spec); +int mpi3mr_cfg_get_enclosure_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_enclosure_page0 *encl_pg0, u16 pg_sz, u32 form, + u32 form_spec); +int mpi3mr_cfg_get_sas_io_unit_pg0(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0, u16 pg_sz); +int mpi3mr_cfg_get_sas_io_unit_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1, u16 pg_sz); +int mpi3mr_cfg_set_sas_io_unit_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1, u16 pg_sz); +int mpi3mr_cfg_get_driver_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_driver_page1 *driver_pg1, u16 pg_sz); +int mpi3mr_cfg_get_driver_pg2(struct mpi3mr_ioc *mrioc, + struct mpi3_driver_page2 *driver_pg2, u16 pg_sz, u8 page_type); +void mpi3mr_remove_tgtdev_from_host(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev); +struct mpi3mr_enclosure_node *mpi3mr_enclosure_find_by_handle( + struct mpi3mr_ioc*mrioc, u16 handle); +u8 mpi3mr_is_expander_device(u16 device_info); +int mpi3mr_expander_add(struct mpi3mr_ioc *mrioc, u16 handle); +void mpi3mr_expander_remove(struct mpi3mr_ioc *mrioc, u64 sas_address, + struct mpi3mr_hba_port *hba_port); +struct mpi3mr_sas_node *__mpi3mr_expander_find_by_handle(struct mpi3mr_ioc + *mrioc, u16 handle); +struct mpi3mr_hba_port * mpi3mr_get_hba_port_by_id(struct mpi3mr_ioc *mrioc, + u8 port_id, u8 skip_dirty_flag); +void mpi3mr_sas_host_refresh(struct mpi3mr_ioc *mrioc); +void mpi3mr_sas_host_add(struct mpi3mr_ioc *mrioc); +int mpi3mr_report_tgtdev_to_sas_transport(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev); +void mpi3mr_remove_tgtdev_from_sas_transport(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev); +struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_addr_and_rphy( + struct mpi3mr_ioc *mrioc, u64 sas_address, struct sas_rphy *rphy); +void mpi3mr_update_links(struct mpi3mr_ioc *mrioc, + u64 sas_address_parent, u16 handle, u8 phy_number, u8 link_rate, + struct mpi3mr_hba_port *hba_port); +void mpi3mr_refresh_sas_ports(struct mpi3mr_ioc *mrioc); +void mpi3mr_refresh_expanders(struct mpi3mr_ioc *mrioc); +void mpi3mr_add_event_wait_for_device_refresh(struct mpi3mr_ioc *mrioc); +void mpi3mr_free_enclosure_list(struct mpi3mr_ioc *mrioc); + +void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc); +void mpi3mr_flush_cmds_for_unrecovered_controller(struct mpi3mr_ioc *mrioc); +void mpi3mr_set_trigger_data_in_hdb(struct diag_buffer_desc *hdb, + u8 type, u64 data, bool force); +void mpi3mr_set_trigger_data_in_all_hdb(struct mpi3mr_ioc *mrioc, + u8 type, u64 data, bool force); +int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc); +inline void mpi3mr_print_discard_event_notice(struct mpi3mr_ioc *mrioc, + bool device_add); +#endif /*MPI3MR_H_INCLUDED*/ diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.c b/drivers/scsi/mpi3mr/mpi3mr_app.c new file mode 100644 index 0000000000000..c7c35b6c41384 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_app.c @@ -0,0 +1,3346 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * 
Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include "mpi3mr.h" +#include "mpi3mr_app.h" + +/* SysFS task management type definitions*/ +enum mpi3mr_sysfs_tm { + MPI3MR_SYSFS_TM_SOFT_RESET = 1, + MPI3MR_SYSFS_TM_ABORT_TASK = 3, + MPI3MR_SYSFS_TM_TARGET_RESET, + MPI3MR_SYSFS_TM_LUN_RESET, + MPI3MR_SYSFS_TM_ABORT_TASK_SET, + MPI3MR_SYSFS_TM_DIAG_FAULT_RESET, +}; + +#define MPI3MR_SYSFS_TM_TIMEOUT 120 /*in seconds*/ + +/* Encapsulated NVMe command definitions */ +#define MPI3MR_NVME_PRP_SIZE 8 /* PRP size */ +#define MPI3MR_NVME_CMD_PRP1_OFFSET 24 /* PRP1 offset in NVMe cmd */ +#define MPI3MR_NVME_CMD_PRP2_OFFSET 32 /* PRP2 offset in NVMe cmd */ +#define MPI3MR_NVME_CMD_SGL_OFFSET 24 /* SGL offset in NVMe cmd */ +#define MPI3MR_NVME_DATA_FORMAT_PRP 0 +#define MPI3MR_NVME_DATA_FORMAT_SGL1 1 +#define MPI3MR_NVME_DATA_FORMAT_SGL2 2 + +/** + * struct mpi3mr_nvme_pt_sge - Structure to store SGEs for NVMe + * Encapsulated commands. + * + * @base_addr: Physical address + * @length: SGE length + * @rsvd: Reserved + * @rsvd1: Reserved + * @sgl_type: sgl type + */ +struct mpi3mr_nvme_pt_sge { + u64 base_addr; + u32 length; + u16 rsvd; + u8 rsvd1; + u8 sgl_type; +}; + + +/** + * struct mpi3mr_buf_map - local structure to + * track kernel and user buffers associated with an BSG + * structure. + * + * @bsg_buf: BSG buffer virtual address + * @bsg_buf_len: BSG buffer length + * @kern_buf: Kernel buffer virtual address + * @kern_buf_len: Kernel buffer length + * @kern_buf_dma: Kernel buffer DMA address + * @data_dir: Data direction. + * @is_dma: Whether DMA transfer applies to the buffer type + */ +struct mpi3mr_buf_map { + void *bsg_buf; + u32 bsg_buf_len; + void *kern_buf; + u32 kern_buf_len; + dma_addr_t kern_buf_dma; + u8 data_dir; + bool is_dma; +}; + +/** + * mpi3mr_diag_buffer_for_type - returns buffer desc for type + * @mrioc: Adapter instance reference + * @buffer_type: Diagnostic buffer type + * + * Identifies matching diag descriptor from mrioc for given diag + * buffer type. + * + * Return: diag buffer descriptor on success, NULL on failures. 
+ */ + +static inline struct diag_buffer_desc * +mpi3mr_diag_buffer_for_type(struct mpi3mr_ioc *mrioc, u8 buf_type) +{ + u8 i; + + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + if (mrioc->diag_buffers[i].type == buf_type) + return &mrioc->diag_buffers[i]; + } + return NULL; +} + + /** + * mpi3mr_set_trigger_data_in_hdb - Updates HDB trigger type and + * trigger data + * + * @hdb: HDB pointer + * @type: Trigger type + * @data: Trigger data + * @force: Trigger overwrite flag + * + * Updates trigger type and trigger data based on parameter + * passed to this function + * + * Return: Nothing + */ +void mpi3mr_set_trigger_data_in_hdb(struct diag_buffer_desc *hdb, + u8 type, u64 data, bool force) +{ + if ((!force) && (hdb->trigger_type != MPI3MR_HDB_TRIGGER_TYPE_UNKNOWN)) + return; + hdb->trigger_type = type; + hdb->trigger_data = data; +} + + /** + * mpi3mr_set_trigger_data_in_all_hdb - Updates HDB trigger type + * and trigger data for all HDB + * + * @type: Trigger type + * @data: Trigger data + * @force: Trigger overwrite flag + * + * Updates trigger type and trigger data based on parameter + * passed to this function + * + * Return: Nothing + */ +void mpi3mr_set_trigger_data_in_all_hdb(struct mpi3mr_ioc *mrioc, + u8 type, u64 data, bool force) +{ + struct diag_buffer_desc *hdb = NULL; + + hdb = mpi3mr_diag_buffer_for_type(mrioc, MPI3_DIAG_BUFFER_TYPE_TRACE); + if (hdb) + mpi3mr_set_trigger_data_in_hdb(hdb, type, data, force); + hdb = mpi3mr_diag_buffer_for_type(mrioc, MPI3_DIAG_BUFFER_TYPE_FW); + if (hdb) + mpi3mr_set_trigger_data_in_hdb(hdb, type, data, force); +} + + /** + * mpi3mr_hdbstatuschg_evt_th - HDB status change evt tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Modifies the status of the applicable diag buffer descriptors + * + * Return: Nothing + */ +void mpi3mr_hdbstatuschg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_diag_buffer_status_change *evtdata; + struct diag_buffer_desc *diag_buffer; + + evtdata = (struct mpi3_event_data_diag_buffer_status_change *) + event_reply->event_data; + + diag_buffer = mpi3mr_diag_buffer_for_type(mrioc, evtdata->type); + if (!diag_buffer) + return; + if ((diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED) && + (diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED)) + return; + switch (evtdata->reason_code) { + case MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_RELEASED: + { + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_RELEASED; + mpi3mr_set_trigger_data_in_hdb(diag_buffer, + MPI3MR_HDB_TRIGGER_TYPE_FW_RELEASED, 0, 0); + atomic64_inc(&event_counter); + break; + } + case MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_RESUMED: + { + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED; + break; + } + case MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_PAUSED: + { + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED; + break; + } + default: + dprint_event_th(mrioc, "%s: unknown reason_code(%d)\n", + __func__, evtdata->reason_code); + break; + } +} + +/** + * mpi3mr_alloc_diag_bufs - Allocate memory for diag buffers + * @mrioc: Adapter instance reference + * + * This functions checks whether the driver defined buffer sizes + * are greater than IOCFacts provided controller local buffer + * sizes and if the driver defined sizes are more then the + * driver allocates the specific buffer by reading driver page1 + * + * Return: Nothing. 
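+ *
+ * If a coherent allocation of the requested size fails, the size is
+ * stepped down by the decrement size read from driver page 1 (or the
+ * default decrement) and the allocation is retried until it either
+ * succeeds or falls below the minimum size.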
+ */ +void mpi3mr_alloc_diag_bufs(struct mpi3mr_ioc *mrioc) +{ + struct diag_buffer_desc *diag_buffer; + struct mpi3_driver_page1 driver_pg1; + u32 trace_dec_size, trace_min_size, fw_dec_size, fw_min_size, + trace_size, fw_size; + u16 pg_sz = sizeof(driver_pg1); + int retval = 0; + + if (mrioc->diag_buffers[0].addr || mrioc->diag_buffers[1].addr) + return; + + retval = mpi3mr_cfg_get_driver_pg1(mrioc, &driver_pg1, pg_sz); + if (retval) { + ioc_warn(mrioc, "%s: driver page 1 read failed, allocating " + "default trace/fw diag buffer sizes\n", __func__); + trace_size = fw_size = MPI3MR_DEFAULT_HDB_MAX_SZ; + trace_dec_size = fw_dec_size = MPI3MR_DEFAULT_HDB_DEC_SZ; + trace_min_size = fw_min_size = MPI3MR_DEFAULT_HDB_MIN_SZ; + + } else { + trace_size = driver_pg1.host_diag_trace_max_size * 1024; + trace_dec_size = driver_pg1.host_diag_trace_decrement_size + * 1024; + trace_min_size = driver_pg1.host_diag_trace_min_size * 1024; + fw_size = driver_pg1.host_diag_fw_max_size * 1024; + fw_dec_size = driver_pg1.host_diag_fw_decrement_size * 1024; + fw_min_size = driver_pg1.host_diag_fw_min_size * 1024; + if ((trace_size == 0) && (fw_size == 0)) { + dprint_init(mrioc, "%s:Invalid buffer sizes read from " + "driver page1 tracesize = %dKB," + "fwsize = %dKB\n", + __func__, trace_size, fw_size); + return; + } + } + +retry_trace: + + diag_buffer = &mrioc->diag_buffers[0]; + diag_buffer->type = MPI3_DIAG_BUFFER_TYPE_TRACE; + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_NOT_ALLOCATED; + if ((mrioc->facts.diag_trace_sz < trace_size) && (trace_size >= + trace_min_size)) { + diag_buffer->addr = dma_zalloc_coherent(&mrioc->pdev->dev, + trace_size, &diag_buffer->dma_addr, GFP_KERNEL); + if (diag_buffer->addr) { + dprint_init(mrioc, "%s: host diag trace memory " + "allocated = %dKB\n", __func__, trace_size / 1024); + diag_buffer->size = trace_size; + } else { + trace_size -= trace_dec_size; + goto retry_trace; + } + } + +retry_fw: + + diag_buffer = &mrioc->diag_buffers[1]; + + diag_buffer->type = MPI3_DIAG_BUFFER_TYPE_FW; + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_NOT_ALLOCATED; + if ((mrioc->facts.diag_fw_sz < fw_size) && (fw_size >= fw_min_size)) { + diag_buffer->addr = dma_zalloc_coherent(&mrioc->pdev->dev, + fw_size, &diag_buffer->dma_addr, GFP_KERNEL); + if (diag_buffer->addr) { + dprint_init(mrioc, "%s: host diag fw memory " + "allocated = %dKB\n", __func__, fw_size / 1024); + diag_buffer->size = fw_size; + } else { + fw_size -= fw_dec_size; + goto retry_fw; + } + } +} + +/** + * mpi3mr_issue_diag_buf_post - Send diag buffer post req + * @mrioc: Adapter instance reference + * @diag_buffer: Diagnostic buffer descriptor + * + * Issue diagnostic buffer post MPI request through admin queue + * and wait for the completion of it or time out. + * + * Return: 0 on success, non-zero on failures. 
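+ *
+ * The request is sent through the init_cmds command slot and is given
+ * MPI3MR_INTADMCMD_TIMEOUT seconds to complete; on a timeout the
+ * controller fault handling path is invoked, and on any failure the
+ * buffer status is rolled back to its previous value.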
+ */ +static int mpi3mr_issue_diag_buf_post(struct mpi3mr_ioc *mrioc, + struct diag_buffer_desc *diag_buffer) +{ + struct mpi3_diag_buffer_post_request diag_buf_post_req; + u8 prev_status; + int retval = 0; + + memset(&diag_buf_post_req, 0, sizeof(diag_buf_post_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + dprint_bsg_err(mrioc, "%s: command is in use\n", __func__); + mutex_unlock(&mrioc->init_cmds.mutex); + return -1; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + diag_buf_post_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + diag_buf_post_req.function = MPI3_FUNCTION_DIAG_BUFFER_POST; + diag_buf_post_req.type = diag_buffer->type; + diag_buf_post_req.address = le64_to_cpu(diag_buffer->dma_addr); + diag_buf_post_req.length = le32_to_cpu(diag_buffer->size); + + dprint_bsg_info(mrioc, "%s: posting diag buffer type %d\n", __func__, + diag_buffer->type); + prev_status = diag_buffer->status; + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED; + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &diag_buf_post_req, + sizeof(diag_buf_post_req), 1); + if (retval) { + dprint_bsg_err(mrioc, "%s: admin request post failed\n", + __func__); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + mrioc->init_cmds.is_waiting = 0; + dprint_bsg_err(mrioc, "%s: command timedout\n", __func__); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DIAG_BUFFER_POST_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "%s: command failed, buffer_type (%d) ioc_status(0x%04x) log_info(0x%08x)\n", + __func__, diag_buffer->type, + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + dprint_bsg_info(mrioc, "%s: diag buffer type %d posted successfully\n", + __func__, diag_buffer->type); + +out_unlock: + if (retval) + diag_buffer->status = prev_status; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + return retval; +} + +/** + * mpi3mr_post_diag_bufs - Post diag buffers to the controller + * @mrioc: Adapter instance reference + * + * This function calls helper function to post both trace and + * firmware buffers to the controller. + * + * Return: None + */ +int mpi3mr_post_diag_bufs(struct mpi3mr_ioc *mrioc) +{ + u8 i; + struct diag_buffer_desc *diag_buffer; + + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + diag_buffer = &mrioc->diag_buffers[i]; + if (!(diag_buffer->addr)) + continue; + if (mpi3mr_issue_diag_buf_post(mrioc, diag_buffer)) + return -1; + } + return 0; +} + +/** + * mpi3mr_issue_diag_buf_release - Send diag buffer release req + * @mrioc: Adapter instance reference + * @diag_buffer: Diagnostic buffer descriptor + * + * Issue diagnostic buffer manage MPI request with release + * action request through admin queue and wait for the + * completion of it or time out. + * + * Return: 0 on success, non-zero on failures. 
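+ *
+ * The release is skipped (and 0 returned) when the buffer is not in a
+ * posted state, so calling this for buffers that were never posted or
+ * were already released is safe.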
+ */ +int mpi3mr_issue_diag_buf_release(struct mpi3mr_ioc *mrioc, + struct diag_buffer_desc *diag_buffer) +{ + struct mpi3_diag_buffer_manage_request diag_buf_manage_req; + int retval = 0; + + if ((diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED) && + (diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED)) + return retval; + + memset(&diag_buf_manage_req, 0, sizeof(diag_buf_manage_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + dprint_reset(mrioc, "%s: command is in use\n", __func__); + mutex_unlock(&mrioc->init_cmds.mutex); + return -1; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + diag_buf_manage_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + diag_buf_manage_req.function = MPI3_FUNCTION_DIAG_BUFFER_MANAGE; + diag_buf_manage_req.type = diag_buffer->type; + diag_buf_manage_req.action = MPI3_DIAG_BUFFER_ACTION_RELEASE; + + + dprint_reset(mrioc, "%s: releasing diag buffer type %d\n", __func__, + diag_buffer->type); + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &diag_buf_manage_req, + sizeof(diag_buf_manage_req), 1); + if (retval) { + dprint_reset(mrioc, "%s: admin request post failed\n", __func__); + mpi3mr_set_trigger_data_in_hdb(diag_buffer, + MPI3MR_HDB_TRIGGER_TYPE_UNKNOWN, 0, 1); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + mrioc->init_cmds.is_waiting = 0; + dprint_reset(mrioc, "%s: command timedout\n", __func__); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DIAG_BUFFER_RELEASE_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + dprint_reset(mrioc, + "%s: command failed, buffer_type (%d) ioc_status(0x%04x) log_info(0x%08x)\n", + __func__, diag_buffer->type, + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + dprint_reset(mrioc, "%s: diag buffer type %d released successfully\n", + __func__, diag_buffer->type); + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + return retval; +} + +/** + * mpi3mr_process_trigger - Generic HDB Trigger handler + * @mrioc: Adapter instance reference + * @trigger_type: Trigger type + * @trigger_data: Trigger data + * @trigger_flags: Trigger flags + * + * This function checks validity of HDB, triggers and based on + * trigger information, creates an event to be processed in the + * firmware event worker thread . 
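+ * The queued event carries references to the trace and/or firmware HDB
+ * descriptors that are due for release and, when the snapdump master
+ * trigger bit is set, a snapdump request flag.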
+ * + * This function should be called with trigger spinlock held + * + * Return: Nothing + */ +static void mpi3mr_process_trigger(struct mpi3mr_ioc *mrioc, u8 trigger_type, + u64 trigger_data, u8 trigger_flags) +{ + struct trigger_event_data event_data; + struct diag_buffer_desc *trace_hdb = NULL; + struct diag_buffer_desc *fw_hdb = NULL; + u64 master_trigger; + + trace_hdb = mpi3mr_diag_buffer_for_type(mrioc, + MPI3_DIAG_BUFFER_TYPE_TRACE); + fw_hdb = mpi3mr_diag_buffer_for_type(mrioc, MPI3_DIAG_BUFFER_TYPE_FW); + + if (mrioc->snapdump_trigger_active || (mrioc->fw_release_trigger_active + && mrioc->trace_release_trigger_active) || + (!trace_hdb && !fw_hdb) || (!mrioc->driver_pg2) || + ((trigger_type == MPI3MR_HDB_TRIGGER_TYPE_ELEMENT) + && (!mrioc->driver_pg2->num_triggers))) + return; + memset(&event_data, 0, sizeof(event_data)); + event_data.trigger_type = trigger_type; + event_data.trigger_specific_data = trigger_data; + master_trigger = le64_to_cpu(mrioc->driver_pg2->master_trigger); + + if (master_trigger & MPI3_DRIVER2_MASTERTRIGGER_SNAPDUMP) { + event_data.snapdump = true; + event_data.trace_hdb = trace_hdb; + event_data.fw_hdb = fw_hdb; + mrioc->snapdump_trigger_active = true; + } else if (trigger_type == MPI3MR_HDB_TRIGGER_TYPE_MASTER) { + if ((trace_hdb) && (master_trigger & + MPI3_DRIVER2_MASTERTRIGGER_DIAG_TRACE_RELEASE) && + (!mrioc->trace_release_trigger_active)) { + event_data.trace_hdb = trace_hdb; + mrioc->trace_release_trigger_active = true; + } + if ((fw_hdb) && (master_trigger & + MPI3_DRIVER2_MASTERTRIGGER_DIAG_FW_RELEASE) && + (!mrioc->fw_release_trigger_active)) { + event_data.fw_hdb = fw_hdb; + mrioc->fw_release_trigger_active = true; + } + } else if (trigger_type == MPI3MR_HDB_TRIGGER_TYPE_ELEMENT) { + if ((trace_hdb) && (trigger_flags & + MPI3_DRIVER2_TRIGGER_FLAGS_DIAG_TRACE_RELEASE) && + (!mrioc->trace_release_trigger_active)) { + event_data.trace_hdb = trace_hdb; + mrioc->trace_release_trigger_active = true; + } + if ((fw_hdb) && (trigger_flags & + MPI3_DRIVER2_TRIGGER_FLAGS_DIAG_FW_RELEASE) && + (!mrioc->fw_release_trigger_active)) { + event_data.fw_hdb = fw_hdb; + mrioc->fw_release_trigger_active = true; + } + } + + if (event_data.trace_hdb || event_data.fw_hdb) + mpi3mr_hdb_trigger_data_event(mrioc, &event_data); +} + +/** + * mpi3mr_master_trigger - Master HDB trigger handler + * @mrioc: Adapter instance reference + * @trigger_data: Trigger data + * + * This function checks whether the given master trigger is + * enabled in the driver page 2 and if so calls generic trigger + * handler to queue event for HDB release. 
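+ * Both the check and the generic trigger handler run under the
+ * trigger_lock spinlock, which this function acquires itself.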
+ * + * Return: Nothing + */ +void mpi3mr_master_trigger(struct mpi3mr_ioc *mrioc, u64 trigger_data) +{ + u64 master_trigger; + unsigned long flags; + + spin_lock_irqsave(&mrioc->trigger_lock, flags); + master_trigger = le64_to_cpu(mrioc->driver_pg2->master_trigger); + if (master_trigger & trigger_data) + mpi3mr_process_trigger(mrioc, MPI3MR_HDB_TRIGGER_TYPE_MASTER, + trigger_data, 0); + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); +} +/** + * mpi3mr_scsisense_trigger - SCSI sense HDB trigger handler + * @mrioc: Adapter instance reference + * @sensekey: Sense Key + * @asc: Additional Sense Code + * @ascq: Additional Sense Code Qualifier + * + * This function compares SCSI sense trigger values with driver + * page 2 values and calls generic trigger handler to release + * HDBs if match found + * + * Return: Nothing + */ +void mpi3mr_scsisense_trigger(struct mpi3mr_ioc *mrioc, u8 sensekey, u8 asc, + u8 ascq) +{ + struct mpi3_driver2_trigger_scsi_sense *scsi_sense_trigger = NULL; + u64 i = 0; + unsigned long flags; + u8 num_triggers, trigger_flags; + + if (mrioc->scsisense_trigger_present) { + spin_lock_irqsave(&mrioc->trigger_lock, flags); + scsi_sense_trigger = (struct mpi3_driver2_trigger_scsi_sense *) + mrioc->driver_pg2->trigger; + num_triggers = mrioc->driver_pg2->num_triggers; + for (i = 0; i < num_triggers; i++, scsi_sense_trigger++) { + if (scsi_sense_trigger->type != + MPI3_DRIVER2_TRIGGER_TYPE_SCSI_SENSE) + continue; + if (!(scsi_sense_trigger->sense_key == + MPI3_DRIVER2_TRIGGER_SCSI_SENSE_SENSE_KEY_MATCH_ALL + || scsi_sense_trigger->sense_key == sensekey)) + continue; + if (!(scsi_sense_trigger->asc == + MPI3_DRIVER2_TRIGGER_SCSI_SENSE_ASC_MATCH_ALL || + scsi_sense_trigger->asc == asc)) + continue; + if (!(scsi_sense_trigger->ascq == + MPI3_DRIVER2_TRIGGER_SCSI_SENSE_ASCQ_MATCH_ALL || + scsi_sense_trigger->ascq == ascq)) + continue; + trigger_flags = scsi_sense_trigger->flags; + mpi3mr_process_trigger(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_ELEMENT, i, trigger_flags); + break; + } + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } +} +/** + * mpi3mr_event_trigger - MPI event HDB trigger handler + * @mrioc: Adapter instance reference + * @event: MPI Event + * + * This function compares event trigger values with driver page + * 2 values and calls generic trigger handler to release + * HDBs if match found. 
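+ * Matching is done against the event trigger elements of the cached
+ * driver page 2 and only when event triggers are marked present.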
+ * + * Return: Nothing + */ +void mpi3mr_event_trigger(struct mpi3mr_ioc *mrioc, u8 event) +{ + struct mpi3_driver2_trigger_event *event_trigger = NULL; + u64 i = 0; + unsigned long flags; + u8 num_triggers, trigger_flags; + + if (mrioc->event_trigger_present) { + spin_lock_irqsave(&mrioc->trigger_lock, flags); + event_trigger = (struct mpi3_driver2_trigger_event *) + mrioc->driver_pg2->trigger; + num_triggers = mrioc->driver_pg2->num_triggers; + + for (i = 0; i < num_triggers; i++, event_trigger++) { + if (event_trigger->type != + MPI3_DRIVER2_TRIGGER_TYPE_EVENT) + continue; + if (event_trigger->event != event) + continue; + trigger_flags = event_trigger->flags; + mpi3mr_process_trigger(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_ELEMENT, i, trigger_flags); + break; + } + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } +} +/** + * mpi3mr_reply_trigger - MPI Reply HDB trigger handler + * @mrioc: Adapter instance reference + * @iocstatus: Masked value of IOC Status from MPI Reply + * @iocloginfo: IOC Log Info from MPI Reply + * + * This function compares IOC status and IOC log info trigger + * values with driver page 2 values and calls generic trigger + * handler to release HDBs if match found. + * + * Return: Nothing + */ +void mpi3mr_reply_trigger(struct mpi3mr_ioc *mrioc, u16 ioc_status, + u32 ioc_loginfo) +{ + struct mpi3_driver2_trigger_reply *reply_trigger = NULL; + u64 i = 0; + unsigned long flags; + u8 num_triggers, trigger_flags; + + if (mrioc->reply_trigger_present) { + spin_lock_irqsave(&mrioc->trigger_lock, flags); + reply_trigger = (struct mpi3_driver2_trigger_reply *) + mrioc->driver_pg2->trigger; + num_triggers = mrioc->driver_pg2->num_triggers; + for (i = 0; i < num_triggers; i++, reply_trigger++) { + if (reply_trigger->type != + MPI3_DRIVER2_TRIGGER_TYPE_REPLY) + continue; + if ((le16_to_cpu(reply_trigger->ioc_status) != + ioc_status) + && (le16_to_cpu(reply_trigger->ioc_status) != + MPI3_DRIVER2_TRIGGER_REPLY_IOCSTATUS_MATCH_ALL)) + continue; + if ((le32_to_cpu(reply_trigger->ioc_log_info) != + (le32_to_cpu(reply_trigger->ioc_log_info_mask) & + ioc_loginfo))) + continue; + trigger_flags = reply_trigger->flags; + mpi3mr_process_trigger(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_ELEMENT, i, trigger_flags); + break; + } + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } +} +/** + * mpi3mr_get_num_trigger - Gets number of HDB triggers + * @mrioc: Adapter instance reference + * @num_triggers: Number of triggers + * @page_action: Page action + * + * This function reads number of triggers by reading driver page + * 2 + * + * Return: 0 on success and proper error codes on failure + */ +static int mpi3mr_get_num_trigger(struct mpi3mr_ioc *mrioc, u8 *num_triggers, + u8 page_action) +{ + struct mpi3_driver_page2 drvr_page2; + int retval = 0; + + *num_triggers = 0; + + retval = mpi3mr_cfg_get_driver_pg2(mrioc, &drvr_page2, + sizeof(struct mpi3_driver_page2), page_action); + + if (retval) { + dprint_init(mrioc, "%s: driver page 2 read failed\n", __func__); + return retval; + } + *num_triggers = drvr_page2.num_triggers; + return retval; +} +/** + * mpi3mr_refresh_trigger - Handler for Refresh trigger BSG + * @mrioc: Adapter instance reference + * @page_action: Page action + * + * This function caches the driver page 2 in the driver's memory + * by reading driver page 2 from the controller for a given page + * type and updates the HDB trigger values + * + * Return: 0 on success and proper error codes on failure + */ +int mpi3mr_refresh_trigger(struct mpi3mr_ioc *mrioc, u8 page_action) +{ + 
u16 pg_sz = sizeof(struct mpi3_driver_page2); + struct mpi3_driver_page2 *drvr_page2 = NULL; + u8 trigger_type, num_triggers; + int retval; + int i = 0; + unsigned long flags; + + retval = mpi3mr_get_num_trigger(mrioc, &num_triggers, page_action); + + if (retval) + goto out; + + pg_sz = offsetof(struct mpi3_driver_page2, trigger) + + (num_triggers * sizeof(union mpi3_driver2_trigger_element)); + drvr_page2 = kzalloc(pg_sz, GFP_KERNEL); + if (!drvr_page2) { + retval = -ENOMEM; + goto out; + } + + retval = mpi3mr_cfg_get_driver_pg2(mrioc, drvr_page2, pg_sz, page_action); + if (retval) { + dprint_init(mrioc, "%s: driver page 2 read failed\n", __func__); + kfree(drvr_page2); + goto out; + } + spin_lock_irqsave(&mrioc->trigger_lock, flags); + kfree(mrioc->driver_pg2); + mrioc->driver_pg2 = drvr_page2; + mrioc->reply_trigger_present = false; + mrioc->event_trigger_present = false; + mrioc->scsisense_trigger_present = false; + + for (i = 0; (i < mrioc->driver_pg2->num_triggers); i++) { + trigger_type = mrioc->driver_pg2->trigger[i].event.type; + switch (trigger_type) { + case MPI3_DRIVER2_TRIGGER_TYPE_REPLY: + mrioc->reply_trigger_present = true; + break; + case MPI3_DRIVER2_TRIGGER_TYPE_EVENT: + mrioc->event_trigger_present = true; + break; + case MPI3_DRIVER2_TRIGGER_TYPE_SCSI_SENSE: + mrioc->scsisense_trigger_present = true; + break; + default: + break; + } + } + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); +out: + return retval; +} + +/** + * mpi3mr_release_diag_bufs - Release diag buffers + * @mrioc: Adapter instance reference + * @skip_rel_action: Skip release action and set buffer state + * + * This function calls helper function to release both trace and + * firmware buffers from the controller. + * + * Return: None + */ +void mpi3mr_release_diag_bufs(struct mpi3mr_ioc *mrioc, u8 skip_rel_action) +{ + u8 i; + struct diag_buffer_desc *diag_buffer; + + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + diag_buffer = &mrioc->diag_buffers[i]; + if (!(diag_buffer->addr)) + continue; + if (diag_buffer->status == MPI3MR_HDB_BUFSTATUS_RELEASED) + continue; + if (!skip_rel_action) + mpi3mr_issue_diag_buf_release(mrioc, diag_buffer); + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_RELEASED; + atomic64_inc(&event_counter); + } +} +/** + * mpi3mr_bsg_pel_abort - sends PEL abort request + * @mrioc: Adapter instance reference + * + * This function sends PEL abort request to the firmware through + * admin request queue. 
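+ * The abort targets the outstanding PEL wait request (host tag
+ * MPI3MR_HOSTTAG_PEL_WAIT) and is rejected up front while a controller
+ * reset is in progress or while BSG requests are blocked.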
+ * + * Return: 0 on success, -1 on failure + */ +static int mpi3mr_bsg_pel_abort(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_pel_req_action_abort pel_abort_req; + struct mpi3_pel_reply *pel_reply; + int retval = 0; + u16 pe_log_status; + + if (mrioc->reset_in_progress) { + dprint_bsg_err(mrioc, "%s: reset in progress\n", __func__); + return -1; + } + if (mrioc->block_bsgs) { + dprint_bsg_err(mrioc, "%s: bsgs are blocked\n", __func__); + return -1; + } + + memset(&pel_abort_req, 0, sizeof(pel_abort_req)); + mutex_lock(&mrioc->pel_abort_cmd.mutex); + if (mrioc->pel_abort_cmd.state & MPI3MR_CMD_PENDING) { + dprint_bsg_err(mrioc, "%s: command is in use\n", __func__); + mutex_unlock(&mrioc->pel_abort_cmd.mutex); + return -1; + } + mrioc->pel_abort_cmd.state = MPI3MR_CMD_PENDING; + mrioc->pel_abort_cmd.is_waiting = 1; + mrioc->pel_abort_cmd.callback = NULL; + pel_abort_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_PEL_ABORT); + pel_abort_req.function = MPI3_FUNCTION_PERSISTENT_EVENT_LOG; + pel_abort_req.action = MPI3_PEL_ACTION_ABORT; + pel_abort_req.abort_host_tag = cpu_to_le16(MPI3MR_HOSTTAG_PEL_WAIT); + + mrioc->pel_abort_requested = 1; + init_completion(&mrioc->pel_abort_cmd.done); + retval = mpi3mr_admin_request_post(mrioc, &pel_abort_req, + sizeof(pel_abort_req), 0); + if (retval) { + retval = -1; + dprint_bsg_err(mrioc, "%s: admin request post failed\n", + __func__); + mrioc->pel_abort_requested = 0; + goto out_unlock; + } + + wait_for_completion_timeout(&mrioc->pel_abort_cmd.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->pel_abort_cmd.state & MPI3MR_CMD_COMPLETE)) { + mrioc->pel_abort_cmd.is_waiting = 0; + dprint_bsg_err(mrioc, "%s: command timedout\n", __func__); + if (!(mrioc->pel_abort_cmd.state & MPI3MR_CMD_RESET)) + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_PELABORT_TIMEOUT, 1); + retval = -1; + goto out_unlock; + } + if ((mrioc->pel_abort_cmd.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "%s: command failed, ioc_status(0x%04x) log_info(0x%08x)\n", + __func__, (mrioc->pel_abort_cmd.ioc_status & + MPI3_IOCSTATUS_STATUS_MASK), + mrioc->pel_abort_cmd.ioc_loginfo); + retval = -1; + goto out_unlock; + } + if (mrioc->pel_abort_cmd.state & MPI3MR_CMD_REPLY_VALID) { + pel_reply = (struct mpi3_pel_reply *)mrioc->pel_abort_cmd.reply; + pe_log_status = le16_to_cpu(pel_reply->pe_log_status); + if (pe_log_status != MPI3_PEL_STATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "%s: command failed, pel_status(0x%04x)\n", + __func__, pe_log_status); + retval = -1; + } + } + +out_unlock: + mrioc->pel_abort_cmd.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->pel_abort_cmd.mutex); + return retval; +} +/** + * mpi3mr_bsg_verify_adapter - verify adapter number is valid + * @ioc_number: Adapter number + * @mriocpp: Pointer to hold per adapter instance + * + * This function checks whether given adapter number matches + * with an adapter id in the driver's list and if so fills + * pointer to the per adapter instance in mriocpp else set that + * to NULL. + * + * Return: Nothing. 
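+ *
+ * The lookup walks the global mrioc_list under mrioc_list_lock.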
+ */ +static void mpi3mr_bsg_verify_adapter(int ioc_number, + struct mpi3mr_ioc **mriocpp) +{ + struct mpi3mr_ioc *mrioc; + + spin_lock(&mrioc_list_lock); + list_for_each_entry(mrioc, &mrioc_list, list) { + if (mrioc->id != ioc_number) + continue; + spin_unlock(&mrioc_list_lock); + *mriocpp = mrioc; + return; + } + spin_unlock(&mrioc_list_lock); + *mriocpp = NULL; +} + + +/** + * mpi3mr_bsg_refresh_hdb_triggers - Refresh HDB trigger data + * @mrioc: Adapter instance reference + * @job: BSG Job pointer + * + * This function reads the controller trigger config page as + * defined by the input page type and refreshes the driver's + * local trigger information structures with the controller's + * config page data. + * + * Return: 0 on success and proper error codes on failure + */ +static long +mpi3mr_bsg_refresh_hdb_triggers(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + struct mpi3mr_bsg_out_refresh_hdb_triggers refresh_triggers; + uint32_t data_out_sz; + u8 page_action; + long rval = -EINVAL; + + data_out_sz = job->request_payload.payload_len; + + if (data_out_sz != sizeof(refresh_triggers)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return rval; + } + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &refresh_triggers, sizeof(refresh_triggers)); + + switch (refresh_triggers.page_type) { + case MPI3MR_HDB_REFRESH_TYPE_CURRENT: + page_action = MPI3_CONFIG_ACTION_READ_CURRENT; + break; + case MPI3MR_HDB_REFRESH_TYPE_DEFAULT: + page_action = MPI3_CONFIG_ACTION_READ_DEFAULT; + break; + case MPI3MR_HDB_HDB_REFRESH_TYPE_PERSISTENT: + page_action = MPI3_CONFIG_ACTION_READ_PERSISTENT; + break; + default: + dprint_bsg_err(mrioc, + "%s: unsupported refresh trigger, page_type %d\n", + __func__, refresh_triggers.page_type); + return rval; + } + rval = mpi3mr_refresh_trigger(mrioc, page_action); + + return rval; +} + +/** + * mpi3mr_bsg_upload_hdb - Upload a specific HDB to user space + * @mrioc: Adapter instance reference + * @job: BSG Job pointer + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_upload_hdb(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + struct mpi3mr_bsg_out_upload_hdb upload_hdb; + struct diag_buffer_desc *diag_buffer; + uint32_t data_out_size; + uint32_t data_in_size; + + data_out_size = job->request_payload.payload_len; + data_in_size = job->reply_payload.payload_len; + + if (data_out_size != sizeof(upload_hdb)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return -EINVAL; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &upload_hdb, sizeof(upload_hdb)); + + if ((!upload_hdb.length) || (data_in_size != upload_hdb.length)) { + dprint_bsg_err(mrioc, "%s: invalid length argument\n", + __func__); + return -EINVAL; + } + diag_buffer = mpi3mr_diag_buffer_for_type(mrioc, upload_hdb.buf_type); + if ((!diag_buffer) || (!diag_buffer->addr)) { + dprint_bsg_err(mrioc, "%s: invalid buffer type %d\n", + __func__, upload_hdb.buf_type); + return -EINVAL; + } + + if ((diag_buffer->status != MPI3MR_HDB_BUFSTATUS_RELEASED) && + (diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED)) { + dprint_bsg_err(mrioc, + "%s: invalid buffer status %d for type %d\n", + __func__, diag_buffer->status, upload_hdb.buf_type); + return -EINVAL; + } + + if ((upload_hdb.start_offset + upload_hdb.length) > diag_buffer->size) { + dprint_bsg_err(mrioc, + "%s: invalid start offset %d, length %d for type %d\n", + __func__, 
upload_hdb.start_offset, upload_hdb.length, + upload_hdb.buf_type); + return -EINVAL; + } + if (job->reply_payload.payload_len >= upload_hdb.length) { + sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, + (diag_buffer->addr + upload_hdb.start_offset), upload_hdb.length); + return 0; + } + return -EFAULT; +} + + +/** + * mpi3mr_bsg_repost_hdb - Re-post HDB + * @mrioc: Adapter instance reference + * @job: BSG job pointer + * + * This function retrieves the HDB descriptor corresponding to a + * given buffer type and if the HDB is in released status then + * posts the HDB with the firmware. + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_repost_hdb(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + struct mpi3mr_bsg_out_repost_hdb repost_hdb; + struct diag_buffer_desc *diag_buffer; + uint32_t data_out_sz; + + data_out_sz = job->request_payload.payload_len; + + if (data_out_sz != sizeof(repost_hdb)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return -EINVAL; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &repost_hdb, sizeof(repost_hdb)); + + diag_buffer = mpi3mr_diag_buffer_for_type(mrioc, repost_hdb.buf_type); + if ((!diag_buffer) || (!diag_buffer->addr)) { + dprint_bsg_err(mrioc, "%s: invalid buffer type %d\n", + __func__, repost_hdb.buf_type); + return -EINVAL; + } + + if (diag_buffer->status != MPI3MR_HDB_BUFSTATUS_RELEASED) { + dprint_bsg_err(mrioc, + "%s: invalid buffer status %d for type %d\n", + __func__, diag_buffer->status, repost_hdb.buf_type); + return -EINVAL; + } + + if (mpi3mr_issue_diag_buf_post(mrioc, diag_buffer)) { + dprint_bsg_err(mrioc, "%s: post failed for type %d\n", + __func__, repost_hdb.buf_type); + return -EFAULT; + } + mpi3mr_set_trigger_data_in_hdb(diag_buffer, + MPI3MR_HDB_TRIGGER_TYPE_UNKNOWN, 0, 1); + + return 0; +} + +/** + * mpi3mr_bsg_query_hdb - Handler for query HDB command + * @mrioc: Adapter instance reference + * @job: BSG job pointer + * + * This function prepares and copies the host diagnostic buffer + * entries to the user buffer. 
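+ * One mpi3mr_hdb_entry is reported per supported HDB, carrying the
+ * buffer type, status, trigger type, trigger data and the buffer size
+ * in KB.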
+ * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_query_hdb(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = 0; + struct mpi3mr_bsg_in_hdb_status *hbd_status; + struct mpi3mr_hdb_entry *hbd_status_entry; + u32 length, min_length; + u8 i; + struct diag_buffer_desc *diag_buffer; + uint32_t data_in_sz = 0; + + data_in_sz = job->request_payload.payload_len; + + length = (sizeof(*hbd_status) + ((MPI3MR_MAX_NUM_HDB - 1) * + sizeof(*hbd_status_entry))); + hbd_status = kmalloc(length, GFP_KERNEL); + if (!hbd_status) + return -ENOMEM; + hbd_status_entry = &hbd_status->entry[0]; + + hbd_status->num_hdb_types = MPI3MR_MAX_NUM_HDB; + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + diag_buffer = &mrioc->diag_buffers[i]; + hbd_status_entry->buf_type = diag_buffer->type; + hbd_status_entry->status = diag_buffer->status; + hbd_status_entry->trigger_type = diag_buffer->trigger_type; + hbd_status_entry->trigger_data = diag_buffer->trigger_data; + hbd_status_entry->size = (diag_buffer->size / 1024); + hbd_status_entry++; + } + + if (data_in_sz < 4) { + dprint_bsg_err(mrioc, "%s: invalid size passed\n", __func__); + rval = -EINVAL; + goto out; + } + min_length = min(data_in_sz, length); + if (job->request_payload.payload_len >= min_length) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + hbd_status, min_length); + rval = 0; + } +out: + kfree(hbd_status); + return rval; +} +/** + * mpi3mr_enable_logdata - Handler for log data enable + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function enables log data caching in the driver if not + * already enabled and return the maximum number of log data + * entries that can be cached in the driver. + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_enable_logdata(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_logdata_enable logdata_enable; + + if (mrioc->logdata_buf) + goto copy_user_data; + + mrioc->logdata_entry_sz = + (mrioc->reply_sz - (sizeof(struct mpi3_event_notification_reply) - 4)) + + MPI3MR_BSG_LOGDATA_ENTRY_HEADER_SZ; + mrioc->logdata_buf_idx = 0; + + mrioc->logdata_buf = kcalloc(MPI3MR_BSG_LOGDATA_MAX_ENTRIES, + mrioc->logdata_entry_sz, GFP_KERNEL); + if (!mrioc->logdata_buf) + return -ENOMEM; + +copy_user_data: + memset(&logdata_enable, 0, sizeof(logdata_enable)); + logdata_enable.max_entries = + MPI3MR_BSG_LOGDATA_MAX_ENTRIES; + if (job->request_payload.payload_len >= sizeof(logdata_enable)) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &logdata_enable, sizeof(logdata_enable)); + rval = 0; + } + return rval; +} +/** + * mpi3mr_get_logdata - Handler for get log data + * @mrioc: Adapter instance reference + * @job: BSG job pointer + * This function copies the log data entries to the user buffer + * when log caching is enabled in the driver. 
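+ * The copy is limited to whole entries of mrioc->logdata_entry_sz
+ * bytes and to at most MPI3MR_BSG_LOGDATA_MAX_ENTRIES entries.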
+ * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_get_logdata(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + u16 num_entries, sz, entry_sz = mrioc->logdata_entry_sz; + + if ((!mrioc->logdata_buf) || (job->request_payload.payload_len < entry_sz)) + return -EINVAL; + + num_entries = job->request_payload.payload_len / entry_sz; + if (num_entries > MPI3MR_BSG_LOGDATA_MAX_ENTRIES) + num_entries = MPI3MR_BSG_LOGDATA_MAX_ENTRIES; + sz = num_entries * entry_sz; + + if (job->request_payload.payload_len >= sz) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + mrioc->logdata_buf, sz); + return 0; + } + return -EINVAL; +} + +/** + * mpi3mr_bsg_pel_enable - Handler for PEL enable driver + * @mrioc: Adapter instance reference + * @job: BSG job pointer + * + * This function is the handler for PEL enable driver. + * Validates the application given class and locale and if + * requires aborts the existing PEL wait request and/or issues + * new PEL wait request to the firmware and returns. + * + * Return: 0 on success and proper error codes on failure. + */ +static long mpi3mr_bsg_pel_enable(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_bsg_out_pel_enable pel_enable; + u8 issue_pel_wait; + u8 tmp_class; + u16 tmp_locale; + + if (job->request_payload.payload_len != sizeof(pel_enable)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return rval; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &pel_enable, sizeof(pel_enable)); + + if (pel_enable.pel_class > MPI3_PEL_CLASS_FAULT) { + dprint_bsg_err(mrioc, "%s: out of range class %d sent\n", + __func__, pel_enable.pel_class); + rval = 0; + goto out; + } + if (!mrioc->pel_enabled) + issue_pel_wait = 1; + else { + if ((mrioc->pel_class <= pel_enable.pel_class) && + !((mrioc->pel_locale & pel_enable.pel_locale) ^ + pel_enable.pel_locale)) { + issue_pel_wait = 0; + rval = 0; + } else { + pel_enable.pel_locale |= mrioc->pel_locale; + + if (mrioc->pel_class < pel_enable.pel_class) + pel_enable.pel_class = mrioc->pel_class; + + rval = mpi3mr_bsg_pel_abort(mrioc); + if (rval) { + dprint_bsg_err(mrioc, + "%s: pel_abort failed, status(%ld)\n", + __func__, rval); + goto out; + } + issue_pel_wait = 1; + } + } + if (issue_pel_wait) { + tmp_class = mrioc->pel_class; + tmp_locale = mrioc->pel_locale; + mrioc->pel_class = pel_enable.pel_class; + mrioc->pel_locale = pel_enable.pel_locale; + mrioc->pel_enabled = 1; + rval = mpi3mr_pel_get_seqnum_post(mrioc, NULL); + if (rval) { + mrioc->pel_class = tmp_class; + mrioc->pel_locale = tmp_locale; + mrioc->pel_enabled = 0; + dprint_bsg_err(mrioc, + "%s: pel get sequence number failed, status(%ld)\n", + __func__, rval); + } + } + +out: + return rval; +} + +/** + * mpi3mr_get_all_tgt_info - Get all target information + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function copies the driver managed target devices device + * handle, persistent ID, bus ID and taret ID to the user + * provided buffer for the specific controller. This function + * also provides the number of devices managed by the driver for + * the specific controller. 
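+ * If the user buffer only has room for the 4-byte device count, or no
+ * devices are currently managed, only the count is copied back.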
+ * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_get_all_tgt_info(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + u16 num_devices = 0, i = 0, size; + unsigned long flags; + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_device_map_info *devmap_info = NULL; + struct mpi3mr_all_tgt_info *alltgt_info = NULL; + uint32_t min_entrylen = 0, kern_entrylen = 0, usr_entrylen = 0; + + if (job->request_payload.payload_len < sizeof(u32)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return rval; + } + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + num_devices++; + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + if ((job->request_payload.payload_len == sizeof(u32)) || + list_empty(&mrioc->tgtdev_list)) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &num_devices, sizeof(num_devices)); + return 0; + } + + kern_entrylen = (num_devices - 1) * sizeof(*devmap_info); + size = sizeof(*alltgt_info) + kern_entrylen; + alltgt_info = kzalloc(size, GFP_KERNEL); + if (!alltgt_info) + return -ENOMEM; + + devmap_info = alltgt_info->dmi; + memset((u8 *)devmap_info, 0xFF, (kern_entrylen + sizeof(*devmap_info))); + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) { + if (i < num_devices) { + devmap_info[i].handle = tgtdev->dev_handle; + devmap_info[i].perst_id = tgtdev->perst_id; + if (tgtdev->host_exposed && tgtdev->starget) { + devmap_info[i].target_id = tgtdev->starget->id; + devmap_info[i].bus_id = + tgtdev->starget->channel; + } + i++; + } + } + num_devices = i; + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + memcpy(&alltgt_info->num_devices, &num_devices, sizeof(num_devices)); + + usr_entrylen = (job->request_payload.payload_len - sizeof(u32)) / sizeof(*devmap_info); + usr_entrylen *= sizeof(*devmap_info); + min_entrylen = min(usr_entrylen, kern_entrylen); + if (min_entrylen && (!memcpy(&alltgt_info->dmi, devmap_info, min_entrylen))) { + dprint_bsg_err(mrioc, "%s:%d: device map info copy failed\n", + __func__, __LINE__); + rval = -EFAULT; + goto out; + } + + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + alltgt_info, job->request_payload.payload_len); + rval = 0; +out: + kfree(alltgt_info); + return rval; +} +/** + * mpi3mr_get_change_count - Get topology change count + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function copies the toplogy change count provided by the + * driver in events and cached in the driver to the user + * provided buffer for the specific controller. 
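+ * The value copied back is the driver's cached change count
+ * (mrioc->change_count).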
+ * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_get_change_count(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_change_count chgcnt; + + memset(&chgcnt, 0, sizeof(chgcnt)); + chgcnt.change_count = mrioc->change_count; + if (job->request_payload.payload_len >= sizeof(chgcnt)) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &chgcnt, sizeof(chgcnt)); + rval = 0; + } + return rval; +} + +/** + * mpi3mr_bsg_adp_reset - Issue controller reset + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function identifies the user provided reset type and + * issues approporiate reset to the controller and wait for that + * to complete and reinitialize the controller and then returns + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_adp_reset(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + u8 save_snapdump; + struct mpi3mr_bsg_adp_reset adpreset; + + if (job->request_payload.payload_len != + sizeof(adpreset)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + goto out; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &adpreset, sizeof(adpreset)); + + switch (adpreset.reset_type) { + case MPI3MR_BSG_ADPRESET_SOFT: + save_snapdump = 0; + break; + case MPI3MR_BSG_ADPRESET_DIAG_FAULT: + save_snapdump = 1; + break; + default: + dprint_bsg_err(mrioc, "%s: unknown reset_type(%d)\n", + __func__, adpreset.reset_type); + goto out; + } + + rval = mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_APP, + save_snapdump); + + if (rval) + dprint_bsg_err(mrioc, + "%s: reset handler returned error(%ld) for reset type %d\n", + __func__, rval, adpreset.reset_type); +out: + return rval; +} + +/** + * mpi3mr_bsg_populate_adpinfo - Get adapter info command handler + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function provides adapter information for the given + * controller + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_populate_adpinfo(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + enum mpi3mr_iocstate ioc_state; + struct mpi3mr_bsg_in_adpinfo adpinfo; + + memset(&adpinfo, 0, sizeof(adpinfo)); + adpinfo.adp_type = MPI3MR_BSG_ADPTYPE_AVGFAMILY; + adpinfo.pci_dev_id = mrioc->pdev->device; + adpinfo.pci_dev_hw_rev = mrioc->pdev->revision; + adpinfo.pci_subsys_dev_id = mrioc->pdev->subsystem_device; + adpinfo.pci_subsys_ven_id = mrioc->pdev->subsystem_vendor; + adpinfo.pci_bus = mrioc->pdev->bus->number; + adpinfo.pci_dev = PCI_SLOT(mrioc->pdev->devfn); + adpinfo.pci_func = PCI_FUNC(mrioc->pdev->devfn); + adpinfo.pci_seg_id = pci_domain_nr(mrioc->pdev->bus); + adpinfo.app_intfc_ver = MPI3MR_IOCTL_VERSION; + + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state == MRIOC_STATE_UNRECOVERABLE) + adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE; + else if ((mrioc->reset_in_progress) || (mrioc->block_bsgs)) + adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_IN_RESET; + else if (ioc_state == MRIOC_STATE_FAULT) + adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_FAULT; + else + adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL; + + memcpy((u8 *)&adpinfo.driver_info, (u8 *)&mrioc->driver_info, + sizeof(adpinfo.driver_info)); + + if (job->request_payload.payload_len >= sizeof(adpinfo)) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &adpinfo, 
sizeof(adpinfo)); + return 0; + } + return -EINVAL; +} + +/** + * mpi3mr_bsg_process_drv_cmds - Driver Command handler + * @job: BSG job reference + * + * This function is the top level handler for driver commands, + * this does basic validation of the buffer and identifies the + * opcode and switches to correct sub handler. + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_process_drv_cmds(struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_ioc *mrioc = NULL; + struct mpi3mr_bsg_packet *bsg_req = NULL; + struct mpi3mr_bsg_drv_cmd *drvrcmd = NULL; + + bsg_req = job->request; + drvrcmd = &bsg_req->cmd.drvrcmd; + + mpi3mr_bsg_verify_adapter(drvrcmd->mrioc_id, &mrioc); + if (!mrioc) + return -ENODEV; + + if (drvrcmd->opcode == MPI3MR_DRVBSG_OPCODE_ADPINFO) { + rval = mpi3mr_bsg_populate_adpinfo(mrioc, job); + return rval; + } + + if (mutex_lock_interruptible(&mrioc->bsg_cmds.mutex)) + return -ERESTARTSYS; + + switch (drvrcmd->opcode) { + case MPI3MR_DRVBSG_OPCODE_ADPRESET: + rval = mpi3mr_bsg_adp_reset(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_ALLTGTDEVINFO: + rval = mpi3mr_get_all_tgt_info(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_GETCHGCNT: + rval = mpi3mr_get_change_count(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_LOGDATAENABLE: + rval = mpi3mr_enable_logdata(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_GETLOGDATA: + rval = mpi3mr_get_logdata(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_PELENABLE: + rval = mpi3mr_bsg_pel_enable(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_QUERY_HDB: + rval = mpi3mr_bsg_query_hdb(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_REPOST_HDB: + rval = mpi3mr_bsg_repost_hdb(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_UPLOAD_HDB: + rval = mpi3mr_bsg_upload_hdb(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_REFRESH_HDB_TRIGGERS: + rval = mpi3mr_bsg_refresh_hdb_triggers(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_UNKNOWN: + default: + pr_err("%s: unsupported driver command opcode %d\n", + MPI3MR_DRIVER_NAME, drvrcmd->opcode); + break; + } + mutex_unlock(&mrioc->bsg_cmds.mutex); + return rval; +} +/** + * mpi3mr_bsg_build_sgl - SGL construction for MPI commands + * @mpi_req: MPI request + * @sgl_offset: offset to start sgl in the MPI request + * @drv_bufs: DMA address of the buffers to be placed in sgl + * @bufcnt: Number of DMA buffers + * @is_rmc: Does the buffer list has management command buffer + * @is_rmr: Does the buffer list has management response buffer + * @num_datasges: Number of data buffers in the list + * + * This function places the DMA address of the given buffers in + * proper format as SGEs in the given MPI request. 
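+ * Management command and response buffers, when present, use the
+ * dedicated command/response SGLs of the management passthrough
+ * request; the remaining data buffers are appended as simple system
+ * SGEs terminated by an end-of-list flagged element.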
+ * + * Return: Nothing + */ +static void mpi3mr_bsg_build_sgl(u8 *mpi_req, uint32_t sgl_offset, + struct mpi3mr_buf_map *drv_bufs, u8 bufcnt, u8 is_rmc, + u8 is_rmr, u8 num_datasges) +{ + u8 *sgl = (mpi_req + sgl_offset), count = 0; + struct mpi3_mgmt_passthrough_request *rmgmt_req = + (struct mpi3_mgmt_passthrough_request *)mpi_req; + struct mpi3mr_buf_map *drv_buf_iter = drv_bufs; + u8 sgl_flags, sgl_flags_last; + + sgl_flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE | + MPI3_SGE_FLAGS_DLAS_SYSTEM | MPI3_SGE_FLAGS_END_OF_BUFFER; + sgl_flags_last = sgl_flags | MPI3_SGE_FLAGS_END_OF_LIST; + + if (is_rmc) { + mpi3mr_add_sg_single(&rmgmt_req->command_sgl, + sgl_flags_last, drv_buf_iter->kern_buf_len, + drv_buf_iter->kern_buf_dma); + sgl = (u8 *)drv_buf_iter->kern_buf + drv_buf_iter->bsg_buf_len; + drv_buf_iter++; + count++; + if (is_rmr) { + mpi3mr_add_sg_single(&rmgmt_req->response_sgl, + sgl_flags_last, drv_buf_iter->kern_buf_len, + drv_buf_iter->kern_buf_dma); + drv_buf_iter++; + count++; + } else + mpi3mr_build_zero_len_sge( + &rmgmt_req->response_sgl); + } + if (!num_datasges) { + mpi3mr_build_zero_len_sge(sgl); + return; + } + for (; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + if (num_datasges == 1 || !is_rmc) + mpi3mr_add_sg_single(sgl, sgl_flags_last, + drv_buf_iter->kern_buf_len, drv_buf_iter->kern_buf_dma); + else + mpi3mr_add_sg_single(sgl, sgl_flags, + drv_buf_iter->kern_buf_len, drv_buf_iter->kern_buf_dma); + sgl += sizeof(struct mpi3_sge_common); + num_datasges--; + } +} + +/** + * mpi3mr_get_nvme_data_fmt - returns the NVMe data format + * @nvme_encap_request: NVMe encapsulated MPI request + * + * This function returns the type of the data format specified + * in user provided NVMe command in NVMe encapsulated request. + * + * Return: Data format of the NVMe command (PRP/SGL etc) + */ +static unsigned int mpi3mr_get_nvme_data_fmt( + struct mpi3_nvme_encapsulated_request *nvme_encap_request) +{ + u8 format = 0; + + format = ((nvme_encap_request->command[0] & 0xc000) >> 14); + return format; + +} + +/** + * mpi3mr_build_nvme_sgl - SGL constructor for NVME + * encapsulated request + * @mrioc: Adapter instance reference + * @nvme_encap_request: NVMe encapsulated MPI request + * @drv_bufs: DMA address of the buffers to be placed in sgl + * @bufcnt: Number of DMA buffers + * + * This function places the DMA address of the given buffers in + * proper format as SGEs in the given NVMe encapsulated request. + * + * Return: 0 on success, -1 on failure + */ +static int mpi3mr_build_nvme_sgl(struct mpi3mr_ioc *mrioc, + struct mpi3_nvme_encapsulated_request *nvme_encap_request, + struct mpi3mr_buf_map *drv_bufs, u8 bufcnt) +{ + struct mpi3mr_nvme_pt_sge *nvme_sgl; + u64 sgl_ptr; + u8 count; + size_t length = 0; + struct mpi3mr_buf_map *drv_buf_iter = drv_bufs; + u64 sgemod_mask = ((u64)((mrioc->facts.sge_mod_mask) << + mrioc->facts.sge_mod_shift) << 32); + u64 sgemod_val = ((u64)(mrioc->facts.sge_mod_value) << + mrioc->facts.sge_mod_shift) << 32; + + /* + * Not all commands require a data transfer. If no data, just return + * without constructing any sgl. 
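+ * Otherwise a single SGL descriptor is written at
+ * MPI3MR_NVME_CMD_SGL_OFFSET in the command, with the controller's SGE
+ * modifier value folded into the buffer address.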
+ */ + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + sgl_ptr = (u64)drv_buf_iter->kern_buf_dma; + length = drv_buf_iter->kern_buf_len; + break; + } + if (!length) + return 0; + + if (sgl_ptr & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: SGL address collides with SGE modifier\n", + __func__); + return -1; + } + + sgl_ptr &= ~sgemod_mask; + sgl_ptr |= sgemod_val; + nvme_sgl = (struct mpi3mr_nvme_pt_sge *) + ((u8 *)(nvme_encap_request->command) + MPI3MR_NVME_CMD_SGL_OFFSET); + memset(nvme_sgl, 0, sizeof(struct mpi3mr_nvme_pt_sge)); + nvme_sgl->base_addr = sgl_ptr; + nvme_sgl->length = length; + return 0; +} + +/** + * mpi3mr_build_nvme_prp - PRP constructor for NVME + * encapsulated request + * @mrioc: Adapter instance reference + * @nvme_encap_request: NVMe encapsulated MPI request + * @drv_bufs: DMA address of the buffers to be placed in SGL + * @bufcnt: Number of DMA buffers + * + * This function places the DMA address of the given buffers in + * proper format as PRP entries in the given NVMe encapsulated + * request. + * + * Return: 0 on success, -1 on failure + */ +static int mpi3mr_build_nvme_prp(struct mpi3mr_ioc *mrioc, + struct mpi3_nvme_encapsulated_request *nvme_encap_request, + struct mpi3mr_buf_map *drv_bufs, u8 bufcnt) +{ + int prp_size = MPI3MR_NVME_PRP_SIZE; + __le64 *prp_entry, *prp1_entry, *prp2_entry; + __le64 *prp_page; + dma_addr_t prp_entry_dma, prp_page_dma, dma_addr; + u32 offset, entry_len, dev_pgsz; + u32 page_mask_result, page_mask; + size_t length = 0; + u8 count; + struct mpi3mr_buf_map *drv_buf_iter = drv_bufs; + u64 sgemod_mask = ((u64)((mrioc->facts.sge_mod_mask) << + mrioc->facts.sge_mod_shift) << 32); + u64 sgemod_val = ((u64)(mrioc->facts.sge_mod_value) << + mrioc->facts.sge_mod_shift) << 32; + u16 dev_handle = nvme_encap_request->dev_handle; + struct mpi3mr_tgt_dev *tgtdev; + + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (!tgtdev) { + dprint_bsg_err(mrioc, "%s: invalid device handle 0x%04x\n", + __func__, dev_handle); + return -1; + } + + if (tgtdev->dev_spec.pcie_inf.pgsz == 0) { + dprint_bsg_err(mrioc, + "%s: NVMe device page size is zero for handle 0x%04x\n", + __func__, dev_handle); + mpi3mr_tgtdev_put(tgtdev); + return -1; + } + + dev_pgsz = 1 << (tgtdev->dev_spec.pcie_inf.pgsz); + mpi3mr_tgtdev_put(tgtdev); + + /* + * Not all commands require a data transfer. If no data, just return + * without constructing any PRP. + */ + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + dma_addr = drv_buf_iter->kern_buf_dma; + length = drv_buf_iter->kern_buf_len; + break; + } + + if (!length) + return 0; + + mrioc->prp_sz = 0; + mrioc->prp_list_virt = dma_zalloc_coherent(&mrioc->pdev->dev, + dev_pgsz, &mrioc->prp_list_dma, GFP_KERNEL); + + if (!mrioc->prp_list_virt) + return -1; + mrioc->prp_sz = dev_pgsz; + + /* + * Set pointers to PRP1 and PRP2, which are in the NVMe command. + * PRP1 is located at a 24 byte offset from the start of the NVMe + * command. Then set the current PRP entry pointer to PRP1. + */ + prp1_entry = (__le64 *)((u8 *)(nvme_encap_request->command) + + MPI3MR_NVME_CMD_PRP1_OFFSET); + prp2_entry = (__le64 *)((u8 *)(nvme_encap_request->command) + + MPI3MR_NVME_CMD_PRP2_OFFSET); + prp_entry = prp1_entry; + /* + * For the PRP entries, use the specially allocated buffer of + * contiguous memory. 
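+ * The buffer is a single coherent allocation of one NVMe device page
+ * (dev_pgsz) whose DMA address seeds the PRP list pointer. For
+ * example, a transfer of three device pages starting on a page
+ * boundary ends up with PRP1 mapping the first page and PRP2 pointing
+ * at a two-entry PRP list in this buffer covering the remaining pages.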
+ */ + prp_page = (__le64 *)mrioc->prp_list_virt; + prp_page_dma = mrioc->prp_list_dma; + + /* + * Check if we are within 1 entry of a page boundary we don't + * want our first entry to be a PRP List entry. + */ + page_mask = dev_pgsz - 1; + page_mask_result = (uintptr_t)((u8 *)prp_page + prp_size) & page_mask; + if (!page_mask_result) { + dprint_bsg_err(mrioc, "%s: PRP page is not page aligned\n", + __func__); + goto err_out; + } + + /* + * Set PRP physical pointer, which initially points to the current PRP + * DMA memory page. + */ + prp_entry_dma = prp_page_dma; + + + /* Loop while the length is not zero. */ + while (length) { + page_mask_result = (prp_entry_dma + prp_size) & page_mask; + if (!page_mask_result && (length > dev_pgsz)) { + dprint_bsg_err(mrioc, + "%s: single PRP page is not sufficient\n", + __func__); + goto err_out; + } + + /* Need to handle if entry will be part of a page. */ + offset = dma_addr & page_mask; + entry_len = dev_pgsz - offset; + + if (prp_entry == prp1_entry) { + /* + * Must fill in the first PRP pointer (PRP1) before + * moving on. + */ + *prp1_entry = cpu_to_le64(dma_addr); + if (*prp1_entry & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: PRP1 address collides with SGE modifier\n", + __func__); + goto err_out; + } + *prp1_entry &= ~sgemod_mask; + *prp1_entry |= sgemod_val; + + /* + * Now point to the second PRP entry within the + * command (PRP2). + */ + prp_entry = prp2_entry; + } else if (prp_entry == prp2_entry) { + /* + * Should the PRP2 entry be a PRP List pointer or just + * a regular PRP pointer? If there is more than one + * more page of data, must use a PRP List pointer. + */ + if (length > dev_pgsz) { + /* + * PRP2 will contain a PRP List pointer because + * more PRP's are needed with this command. The + * list will start at the beginning of the + * contiguous buffer. + */ + *prp2_entry = cpu_to_le64(prp_entry_dma); + if (*prp2_entry & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: PRP list address collides with SGE modifier\n", + __func__); + goto err_out; + } + *prp2_entry &= ~sgemod_mask; + *prp2_entry |= sgemod_val; + + /* + * The next PRP Entry will be the start of the + * first PRP List. + */ + prp_entry = prp_page; + continue; + } else { + /* + * After this, the PRP Entries are complete. + * This command uses 2 PRP's and no PRP list. + */ + *prp2_entry = cpu_to_le64(dma_addr); + if (*prp2_entry & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: PRP2 collides with SGE modifier\n", + __func__); + goto err_out; + } + *prp2_entry &= ~sgemod_mask; + *prp2_entry |= sgemod_val; + } + } else { + /* + * Put entry in list and bump the addresses. + * + * After PRP1 and PRP2 are filled in, this will fill in + * all remaining PRP entries in a PRP List, one per + * each time through the loop. + */ + *prp_entry = cpu_to_le64(dma_addr); + if (*prp1_entry & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: PRP address collides with SGE modifier\n", + __func__); + goto err_out; + } + *prp_entry &= ~sgemod_mask; + *prp_entry |= sgemod_val; + prp_entry++; + prp_entry_dma++; + } + + /* + * Bump the phys address of the command's data buffer by the + * entry_len. + */ + dma_addr += entry_len; + + /* decrement length accounting for last partial page. 
*/ + if (entry_len > length) + length = 0; + else + length -= entry_len; + } + return 0; +err_out: + if (mrioc->prp_list_virt) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->prp_sz, + mrioc->prp_list_virt, mrioc->prp_list_dma); + mrioc->prp_list_virt = NULL; + } + return -1; +} + +/** + * mpi3mr_bsg_process_mpt_cmds - MPI Pass through BSG handler + * @job: BSG job reference + * + * This function is the top level handler for MPI Pass through + * command, this does basic validation of the input data buffers, + * identifies the given buffer types and MPI command, allocates + * DMAable memory for user given buffers, construstcs SGL + * properly and passes the command to the firmware. + * + * Once the MPI command is completed the driver copies the data + * if any and reply, sense information to user provided buffers. + * If the command is timed out then issues controller reset + * prior to returning. + * + * Return: 0 on success and proper error codes on failure + */ + +static long mpi3mr_bsg_process_mpt_cmds(struct bsg_job *job) +{ + long rval = -EINVAL; + + struct mpi3mr_ioc *mrioc = NULL; + u8 *mpi_req = NULL, *sense_buff_k = NULL; + u8 mpi_msg_size = 0; + struct mpi3mr_bsg_packet *bsg_req = NULL; + struct mpi3mr_bsg_mptcmd *karg; + struct mpi3mr_buf_entry *buf_entries = NULL; + struct mpi3mr_buf_map *drv_bufs = NULL, *drv_buf_iter = NULL; + u8 count, bufcnt = 0, is_rmcb = 0, is_rmrb = 0, din_cnt = 0, dout_cnt = 0; + u8 invalid_be = 0, erb_offset = 0xFF, mpirep_offset = 0xFF, sg_entries = 0; + u8 block_io = 0, nvme_fmt = 0, resp_code = 0; + struct mpi3_request_header *mpi_header = NULL; + struct mpi3_status_reply_descriptor *status_desc; + struct mpi3_scsi_task_mgmt_request *tm_req; + u32 erbsz = MPI3MR_SENSE_BUF_SZ, tmplen; + u16 dev_handle; + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *stgt_priv = NULL; + struct mpi3mr_bsg_in_reply_buf *bsg_reply_buf = NULL; + u32 din_size = 0, dout_size = 0; + u8 *din_buf = NULL, *dout_buf = NULL; + u8 *sgl_iter = NULL, *sgl_din_iter = NULL, *sgl_dout_iter = NULL; + + bsg_req = job->request; + karg = (struct mpi3mr_bsg_mptcmd *)&bsg_req->cmd.mptcmd; + + mpi3mr_bsg_verify_adapter(karg->mrioc_id, &mrioc); + if (!mrioc) + return -ENODEV; + + if (karg->timeout < MPI3MR_APP_DEFAULT_TIMEOUT) + karg->timeout = MPI3MR_APP_DEFAULT_TIMEOUT; + + mpi_req = kzalloc(MPI3MR_ADMIN_REQ_FRAME_SZ, GFP_KERNEL); + if (!mpi_req) + return -ENOMEM; + mpi_header = (struct mpi3_request_header *)mpi_req; + + bufcnt = karg->buf_entry_list.num_of_entries; + drv_bufs = kzalloc((sizeof(*drv_bufs) * bufcnt), GFP_KERNEL); + if (!drv_bufs) { + rval = -ENOMEM; + goto out; + } + + dout_buf = (uint8_t *)kzalloc(job->request_payload.payload_len, + GFP_KERNEL); + if (!dout_buf) { + rval = -ENOMEM; + goto out; + } + + din_buf = (uint8_t *)kzalloc(job->reply_payload.payload_len, + GFP_KERNEL); + if (!din_buf) { + rval = -ENOMEM; + goto out; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + dout_buf, job->request_payload.payload_len); + + buf_entries = karg->buf_entry_list.buf_entry; + sgl_din_iter = din_buf; + sgl_dout_iter = dout_buf; + drv_buf_iter = drv_bufs; + + for (count = 0; count < bufcnt; count++, buf_entries++, drv_buf_iter++) { + + switch (buf_entries->buf_type) { + case MPI3MR_BSG_BUFTYPE_RAIDMGMT_CMD: + sgl_iter = sgl_dout_iter; + drv_buf_iter->data_dir = DATA_OUT; + drv_buf_iter->is_dma = true; + is_rmcb = 1; + if (count != 0) + invalid_be = 1; + break; + case MPI3MR_BSG_BUFTYPE_RAIDMGMT_RESP: + sgl_iter = sgl_din_iter; + 
drv_buf_iter->data_dir = DATA_IN; + drv_buf_iter->is_dma = true; + is_rmrb = 1; + if (count != 1 || !is_rmcb) + invalid_be = 1; + break; + case MPI3MR_BSG_BUFTYPE_DATA_IN: + sgl_iter = sgl_din_iter; + drv_buf_iter->data_dir = DATA_IN; + drv_buf_iter->is_dma = true; + din_cnt++; + din_size += drv_buf_iter->bsg_buf_len; + if ((din_cnt > 1) && !is_rmcb) + invalid_be = 1; + break; + case MPI3MR_BSG_BUFTYPE_DATA_OUT: + sgl_iter = sgl_dout_iter; + drv_buf_iter->data_dir = DATA_OUT; + drv_buf_iter->is_dma = true; + dout_cnt++; + dout_size += drv_buf_iter->bsg_buf_len; + if ((dout_cnt > 1) && !is_rmcb) + invalid_be = 1; + break; + case MPI3MR_BSG_BUFTYPE_MPI_REPLY: + sgl_iter = sgl_din_iter; + drv_buf_iter->data_dir = DATA_IN; + drv_buf_iter->is_dma = false; + mpirep_offset = count; + break; + case MPI3MR_BSG_BUFTYPE_ERR_RESPONSE: + sgl_iter = sgl_din_iter; + drv_buf_iter->data_dir = DATA_IN; + drv_buf_iter->is_dma = false; + erb_offset = count; + break; + case MPI3MR_BSG_BUFTYPE_MPI_REQUEST: + sgl_iter = sgl_dout_iter; + drv_buf_iter->data_dir = DATA_OUT; + drv_buf_iter->is_dma = false; + mpi_msg_size = buf_entries->buf_len; + if ((!mpi_msg_size || (mpi_msg_size % 4)) || + (mpi_msg_size > MPI3MR_ADMIN_REQ_FRAME_SZ)) { + dprint_bsg_err(mrioc, "%s: invalid MPI message size\n", + __func__); + rval = -EINVAL; + goto out; + } + memcpy(mpi_req, sgl_iter, buf_entries->buf_len); + break; + default: + invalid_be = 1; + break; + } + if (invalid_be) { + dprint_bsg_err(mrioc, "%s: invalid buffer entries passed\n", + __func__); + rval = -EINVAL; + goto out; + } + + if ((drv_buf_iter->data_dir == DATA_OUT)) { + sgl_dout_iter += buf_entries->buf_len; + if (sgl_dout_iter > (dout_buf + job->request_payload.payload_len)) { + dprint_bsg_err(mrioc, "%s: data_out buffer length mismatch\n", + __func__); + rval = -EINVAL; + goto out; + } + } else { + sgl_din_iter += buf_entries->buf_len; + if (sgl_din_iter > (din_buf + job->reply_payload.payload_len)) { + dprint_bsg_err(mrioc, "%s: data_in buffer length mismatch\n", + __func__); + rval = -EINVAL; + goto out; + } + } + + drv_buf_iter->bsg_buf = sgl_iter; + drv_buf_iter->bsg_buf_len = buf_entries->buf_len; + + } + if (!is_rmcb && (dout_cnt || din_cnt)) { + sg_entries = dout_cnt + din_cnt; + if (((mpi_msg_size) + (sg_entries * + sizeof(struct mpi3_sge_common))) > MPI3MR_ADMIN_REQ_FRAME_SZ) { + dprint_bsg_err(mrioc, + "%s:%d: invalid message size passed\n", + __func__, __LINE__); + rval = -EINVAL; + goto out; + } + } + if (din_size > MPI3MR_MAX_APP_XFER_SIZE) { + dprint_bsg_err(mrioc, + "%s:%d: invalid data transfer size passed for function 0x%x din_size=%d\n", + __func__, __LINE__, mpi_header->function, din_size); + rval = -EINVAL; + goto out; + } + if (dout_size > MPI3MR_MAX_APP_XFER_SIZE) { + dprint_bsg_err(mrioc, + "%s:%d: invalid data transfer size passed for function 0x%x dout_size = %d\n", + __func__, __LINE__, mpi_header->function, dout_size); + rval = -EINVAL; + goto out; + } + + drv_buf_iter = drv_bufs; + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + + drv_buf_iter->kern_buf_len = drv_buf_iter->bsg_buf_len; + if (is_rmcb && !count) + drv_buf_iter->kern_buf_len += ((dout_cnt + din_cnt) * + sizeof(struct mpi3_sge_common)); + + if (!drv_buf_iter->kern_buf_len) + continue; + + drv_buf_iter->kern_buf = dma_zalloc_coherent(&mrioc->pdev->dev, + drv_buf_iter->kern_buf_len, &drv_buf_iter->kern_buf_dma, + GFP_KERNEL); + if (!drv_buf_iter->kern_buf) { + rval = -ENOMEM; + goto out; + } + if 
((drv_buf_iter->data_dir == DATA_OUT)) { + tmplen = min(drv_buf_iter->kern_buf_len, + drv_buf_iter->bsg_buf_len); + memcpy(drv_buf_iter->kern_buf, drv_buf_iter->bsg_buf, tmplen); + } + } + + if (erb_offset != 0xFF) { + sense_buff_k = kzalloc(erbsz, GFP_KERNEL); + if (!sense_buff_k) { + rval = -ENOMEM; + goto out; + } + } + + if (mutex_lock_interruptible(&mrioc->bsg_cmds.mutex)) { + rval = -ERESTARTSYS; + goto out; + } + if (mrioc->bsg_cmds.state & MPI3MR_CMD_PENDING) { + rval = -EAGAIN; + dprint_bsg_err(mrioc, "%s: command is in use\n", __func__); + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + if (mrioc->unrecoverable) { + dprint_bsg_err(mrioc, "%s: unrecoverable controller\n", + __func__); + rval = -EFAULT; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + if (mrioc->reset_in_progress) { + dprint_bsg_err(mrioc, "%s: reset in progress\n", __func__); + rval = -EAGAIN; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + if (mrioc->block_bsgs) { + dprint_bsg_err(mrioc, "%s: bsgs are blocked\n", __func__); + rval = -EAGAIN; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + + if (mpi_header->function != MPI3_FUNCTION_NVME_ENCAPSULATED) { + mpi3mr_bsg_build_sgl(mpi_req, (mpi_msg_size), + drv_bufs, bufcnt, is_rmcb, is_rmrb, + (dout_cnt + din_cnt)); + } + + if (mpi_header->function == MPI3_FUNCTION_NVME_ENCAPSULATED) { + nvme_fmt = mpi3mr_get_nvme_data_fmt( + (struct mpi3_nvme_encapsulated_request *)mpi_req); + if (nvme_fmt == MPI3MR_NVME_DATA_FORMAT_PRP) { + if (mpi3mr_build_nvme_prp(mrioc, + (struct mpi3_nvme_encapsulated_request *)mpi_req, + drv_bufs, bufcnt)) { + rval = -ENOMEM; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + } else if (nvme_fmt == MPI3MR_NVME_DATA_FORMAT_SGL1 || + nvme_fmt == MPI3MR_NVME_DATA_FORMAT_SGL2) { + if (mpi3mr_build_nvme_sgl(mrioc, + (struct mpi3_nvme_encapsulated_request *)mpi_req, + drv_bufs, bufcnt)) { + rval = -EINVAL; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + } else { + dprint_bsg_err(mrioc, + "%s:invalid NVMe command format\n", __func__); + rval = -EINVAL; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + } + if (mpi_header->function == MPI3_FUNCTION_SCSI_TASK_MGMT) { + tm_req = (struct mpi3_scsi_task_mgmt_request *)mpi_req; + if (tm_req->task_type != + MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK) { + dev_handle = tm_req->dev_handle; + block_io = 1; + } + } + if (block_io) { + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (tgtdev && tgtdev->starget && tgtdev->starget->hostdata) { + stgt_priv = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + atomic_inc(&stgt_priv->block_io); + mpi3mr_tgtdev_put(tgtdev); + } + } + + mrioc->bsg_cmds.state = MPI3MR_CMD_PENDING; + mrioc->bsg_cmds.is_waiting = 1; + mrioc->bsg_cmds.callback = NULL; + mrioc->bsg_cmds.is_sense = 0; + mrioc->bsg_cmds.sensebuf = sense_buff_k; + memset(mrioc->bsg_cmds.reply, 0, mrioc->reply_sz); + mpi_header->host_tag = cpu_to_le16(MPI3MR_HOSTTAG_BSG_CMDS); + if (mrioc->logging_level & MPI3_DEBUG_BSG_INFO) { + dprint_bsg_info(mrioc, + "%s: posting bsg request to the controller\n", __func__); + dprint_dump(mpi_req, MPI3MR_ADMIN_REQ_FRAME_SZ, + "bsg_mpi3_req"); + if (mpi_header->function == MPI3_FUNCTION_MGMT_PASSTHROUGH) { + drv_buf_iter = &drv_bufs[0]; + dprint_dump(drv_buf_iter->kern_buf, + drv_buf_iter->kern_buf_len, "mpi3_mgmt_req"); + } + } + + init_completion(&mrioc->bsg_cmds.done); + rval = mpi3mr_admin_request_post(mrioc, mpi_req, + MPI3MR_ADMIN_REQ_FRAME_SZ, 0); + + + if (rval) { + 
mrioc->bsg_cmds.is_waiting = 0; + dprint_bsg_err(mrioc, + "%s: posting bsg request is failed\n", __func__); + rval = -EAGAIN; + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->bsg_cmds.done, + (karg->timeout * HZ)); + if (block_io && stgt_priv) + atomic_dec(&stgt_priv->block_io); + if (!(mrioc->bsg_cmds.state & MPI3MR_CMD_COMPLETE)) { + mrioc->bsg_cmds.is_waiting = 0; + rval = -EAGAIN; + if (mrioc->bsg_cmds.state & MPI3MR_CMD_RESET) + goto out_unlock; + dprint_bsg_err(mrioc, + "%s: bsg request timedout after %d seconds\n", __func__, + karg->timeout); + if (mrioc->logging_level & MPI3_DEBUG_BSG_ERROR) { + dprint_dump(mpi_req, MPI3MR_ADMIN_REQ_FRAME_SZ, + "bsg_mpi3_req"); + if (mpi_header->function == + MPI3_FUNCTION_MGMT_PASSTHROUGH) { + drv_buf_iter = &drv_bufs[0]; + dprint_dump(drv_buf_iter->kern_buf, + drv_buf_iter->kern_buf_len, "mpi3_mgmt_req"); + } + } + if ((mpi_header->function == MPI3_FUNCTION_NVME_ENCAPSULATED) || + (mpi_header->function == MPI3_FUNCTION_SCSI_IO)) + mpi3mr_issue_tm(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET, + mpi_header->function_dependent, 0, + MPI3MR_HOSTTAG_BLK_TMS, MPI3MR_RESETTM_TIMEOUT, + &mrioc->host_tm_cmds, &resp_code, NULL); + if (!(mrioc->bsg_cmds.state & MPI3MR_CMD_COMPLETE) && + !(mrioc->bsg_cmds.state & MPI3MR_CMD_RESET)) + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_APP_TIMEOUT, 1); + goto out_unlock; + } + dprint_bsg_info(mrioc, "%s: bsg request is completed\n", __func__); + + if (mrioc->prp_list_virt) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->prp_sz, + mrioc->prp_list_virt, mrioc->prp_list_dma); + mrioc->prp_list_virt = NULL; + } + + if ((mrioc->bsg_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + dprint_bsg_info(mrioc, + "%s: command failed, ioc_status(0x%04x) log_info(0x%08x)\n", + __func__, + (mrioc->bsg_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->bsg_cmds.ioc_loginfo); + } + + if ((mpirep_offset != 0xFF) && + drv_bufs[mpirep_offset].bsg_buf_len) { + drv_buf_iter = &drv_bufs[mpirep_offset]; + drv_buf_iter->kern_buf_len = (sizeof(*bsg_reply_buf) - 1 + + mrioc->reply_sz); + bsg_reply_buf = kzalloc(drv_buf_iter->kern_buf_len, GFP_KERNEL); + + if (!bsg_reply_buf) { + rval = -ENOMEM; + goto out_unlock; + } + if (mrioc->bsg_cmds.state & MPI3MR_CMD_REPLY_VALID) { + bsg_reply_buf->mpi_reply_type = + MPI3MR_BSG_MPI_REPLY_BUFTYPE_ADDRESS; + memcpy(bsg_reply_buf->reply_buf, + mrioc->bsg_cmds.reply, mrioc->reply_sz); + } else { + bsg_reply_buf->mpi_reply_type = + MPI3MR_BSG_MPI_REPLY_BUFTYPE_STATUS; + status_desc = (struct mpi3_status_reply_descriptor *) + bsg_reply_buf->reply_buf; + status_desc->ioc_status = mrioc->bsg_cmds.ioc_status; + status_desc->ioc_log_info = mrioc->bsg_cmds.ioc_loginfo; + } + tmplen = min(drv_buf_iter->kern_buf_len, + drv_buf_iter->bsg_buf_len); + memcpy(drv_buf_iter->bsg_buf, bsg_reply_buf, tmplen); + } + + if (erb_offset != 0xFF && mrioc->bsg_cmds.sensebuf && + mrioc->bsg_cmds.is_sense) { + drv_buf_iter = &drv_bufs[erb_offset]; + tmplen = min(erbsz, drv_buf_iter->bsg_buf_len); + memcpy(drv_buf_iter->bsg_buf, sense_buff_k, tmplen); + } + + drv_buf_iter = drv_bufs; + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + if (drv_buf_iter->data_dir == DATA_IN) { + tmplen = min(drv_buf_iter->kern_buf_len, + drv_buf_iter->bsg_buf_len); + memcpy(drv_buf_iter->bsg_buf, + drv_buf_iter->kern_buf, tmplen); + } + } + +out_unlock: + if (din_buf) { + job->reply_payload_rcv_len = + 
sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, + din_buf, job->reply_payload.payload_len); + } + mrioc->bsg_cmds.is_sense = 0; + mrioc->bsg_cmds.sensebuf = NULL; + mrioc->bsg_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->bsg_cmds.mutex); +out: + kfree(sense_buff_k); + kfree(dout_buf); + kfree(din_buf); + kfree(mpi_req); + if (drv_bufs) { + drv_buf_iter = drv_bufs; + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->kern_buf && drv_buf_iter->kern_buf_dma) + dma_free_coherent(&mrioc->pdev->dev, + drv_buf_iter->kern_buf_len, + drv_buf_iter->kern_buf, + drv_buf_iter->kern_buf_dma); + } + kfree(drv_bufs); + } + kfree(bsg_reply_buf); + return rval; +} + +/** + * mpi3mr_bsg_request - bsg request entry point + * @job: BSG job reference + * + * This the the drivers entry point for bsg requests coming + * bsg layer + * + * Return: 0 on success and proper error codes on failure + */ +int mpi3mr_bsg_request(struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_bsg_packet *bsg_req = job->request; + + switch (bsg_req->cmd_type) { + case MPI3MRDRVCMD: + rval = mpi3mr_bsg_process_drv_cmds(job); + break; + case MPI3MRMPTCMD: + rval = mpi3mr_bsg_process_mpt_cmds(job); + break; + default: + pr_err("%s: unsupported BSG command(0x%08x)\n", + MPI3MR_DRIVER_NAME, bsg_req->cmd_type); + break; + } + + bsg_job_done(job, rval, job->reply_payload_rcv_len); + + return 0; +} + +/** + * mpi3mr_app_save_logdata - Save Log Data events + * @mrioc: Adapter instance reference + * @event_data: event data associated with log data event + * @event_data_size: event data size to copy + * + * If log data event caching is enabled by the applicatiobns, + * then this function saves the log data in the circular queue + * and Sends async signal SIGIO to indicate there is an async + * event from the firmware to the event monitoring applications. + * + * Return:Nothing + */ +void mpi3mr_app_save_logdata(struct mpi3mr_ioc *mrioc, char *event_data, + u16 event_data_size) +{ + u32 index = mrioc->logdata_buf_idx, sz; + struct mpi3mr_logdata_entry *entry; + + if (!(mrioc->logdata_buf)) + return; + + entry = (struct mpi3mr_logdata_entry *) + (mrioc->logdata_buf + (index * mrioc->logdata_entry_sz)); + entry->valid_entry = 1; + sz = min(mrioc->logdata_entry_sz, event_data_size); + memcpy(entry->data, event_data, sz); + mrioc->logdata_buf_idx = + ((++index) % MPI3MR_BSG_LOGDATA_MAX_ENTRIES); + atomic64_inc(&event_counter); +} + +/** + * mpi3mr_bsg_exit - de-registration from bsg layer + * + * This will be called during driver unload and all + * bsg resources allocated during load will be freed. 
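mpi3mr_app_save_logdata() above keeps only the newest events: the write index wraps at MPI3MR_BSG_LOGDATA_MAX_ENTRIES, so the oldest entries are overwritten once the ring is full. A stripped-down user-space model of that indexing (the entry size here is arbitrary; the real size is controller specific):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES	400	/* mirrors MPI3MR_BSG_LOGDATA_MAX_ENTRIES */
#define ENTRY_SZ	64	/* placeholder for the per-controller entry size */

static unsigned char ring[MAX_ENTRIES][ENTRY_SZ];
static uint32_t ring_idx;

static void save_entry(const void *data, uint32_t len)
{
	if (len > ENTRY_SZ)
		len = ENTRY_SZ;	/* mirrors min(logdata_entry_sz, event_data_size) */

	memcpy(ring[ring_idx], data, len);
	ring_idx = (ring_idx + 1) % MAX_ENTRIES;
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		save_entry(&i, sizeof(i));

	/* 1000 % 400 = 200 */
	printf("next slot after 1000 events: %u\n", (unsigned int)ring_idx);
	return 0;
}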
+ * + * Return:Nothing + */ +void mpi3mr_bsg_exit(struct mpi3mr_ioc *mrioc) +{ + if (!mrioc->bsg_queue) + return; + + bsg_remove_queue(mrioc->bsg_queue); + mrioc->bsg_queue = NULL; + + device_del(mrioc->bsg_dev); + kfree(mrioc->bsg_dev); + return; +} + +/** + * mpi3mr_bsg_node_release -release bsg device node + * @dev: bsg device node + * + * decrements bsg dev reference count + * + * Return:Nothing + */ +void mpi3mr_bsg_node_release(struct device *dev) +{ + put_device(dev); + return; +} + +/** + * mpi3mr_bsg_init - registration with bsg layer + * + * This will be called during driver load and it will + * register driver with bsg layer + * + * Return:Nothing + */ +void mpi3mr_bsg_init(struct mpi3mr_ioc *mrioc) +{ + mrioc->bsg_dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!mrioc->bsg_dev) { + ioc_err(mrioc, "bsg device mem allocation failed\n"); + return; + } + + device_initialize(mrioc->bsg_dev); + dev_set_name(mrioc->bsg_dev, "mpi3mrctl%u", mrioc->id); + + if (device_add(mrioc->bsg_dev)) { + ioc_err(mrioc, "%s: bsg device add failed\n", + dev_name(mrioc->bsg_dev)); + goto err_device_add; + } + + mrioc->bsg_dev->release = mpi3mr_bsg_node_release; + + mrioc->bsg_queue = bsg_setup_queue(mrioc->bsg_dev, dev_name(mrioc->bsg_dev), + mpi3mr_bsg_request, NULL, 0); + if (!mrioc->bsg_queue) { + ioc_err(mrioc, "%s: bsg registration failed\n", + dev_name(mrioc->bsg_dev)); + goto err_setup_queue; + } + + blk_queue_max_segments(mrioc->bsg_queue, MPI3MR_MAX_APP_XFER_SEGMENTS); + blk_queue_max_hw_sectors(mrioc->bsg_queue, MPI3MR_MAX_APP_XFER_SECTORS); + + return; + +err_setup_queue: + device_del(mrioc->bsg_dev); + +err_device_add: + kfree(mrioc->bsg_dev); + return; +} + +/* + * SCSI Host attributes under sysfs + */ +/** + * version_fw_show - SysFS callback for firmware version read + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying firmware version + */ +static ssize_t +version_fw_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + struct mpi3mr_compimg_ver *fwver = &mrioc->facts.fw_ver; + + return snprintf(buf, PAGE_SIZE, "%d.%d.%d.%d.%05d-%05d\n", + fwver->gen_major, fwver->gen_minor, fwver->ph_major, + fwver->ph_minor, fwver->cust_id, fwver->build_num); +} +static DEVICE_ATTR_RO(version_fw); + +/** + * fw_queue_depth_show - SysFS callback for firmware max cmds + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying firmware max commands + */ +static ssize_t +fw_queue_depth_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + + return snprintf(buf, PAGE_SIZE, "%d\n", mrioc->facts.max_reqs); +} +static DEVICE_ATTR_RO(fw_queue_depth); + +/** + * op_req_q_count_show - SysFS callback for request queue count + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying request queue count + */ +static ssize_t +op_req_q_count_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + + return snprintf(buf, PAGE_SIZE, "%d\n", mrioc->num_op_req_q); +} +static DEVICE_ATTR_RO(op_req_q_count); + +/** + * reply_queue_count_show - SysFS callback for reply queue 
count + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying reply queue count + */ +static ssize_t +reply_queue_count_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + + return snprintf(buf, PAGE_SIZE, "%d\n", mrioc->num_op_reply_q); +} + +static DEVICE_ATTR_RO(reply_queue_count); + +/** + * mpi3mr_app_logging_level_show - Show controller debug level + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * A sysfs 'read/write' shost attribute, to show the current + * debug log level used by the driver for the specific + * controller. + * + * Return: snprintf() return + */ +static ssize_t +mpi3mr_app_logging_level_show(struct device *dev, + struct device_attribute *attr, char *buf) + +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + return snprintf(buf, PAGE_SIZE, "%08xh\n", mrioc->logging_level); +} + +/** + * mpi3mr_app_logging_level_store- Change controller debug level + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * @count: size of the buffer + * + * A sysfs 'read/write' shost attribute, to change the current + * debug log level used by the driver for the specific + * controller. + * + * Return: strlen() return + */ +static ssize_t +mpi3mr_app_logging_level_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + int val = 0; + + if (sscanf(buf, "%x", &val) != 1) + return -EINVAL; + + mrioc->logging_level = val; + ioc_info(mrioc, "logging_level=%08xh\n", mrioc->logging_level); + return strlen(buf); +} +static DEVICE_ATTR(logging_level, 0644, + mpi3mr_app_logging_level_show, + mpi3mr_app_logging_level_store); + +/** + * adapter_state_show - SysFS callback for adapter state show + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying adapter state + */ +static ssize_t +adp_state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + enum mpi3mr_iocstate ioc_state; + uint8_t adp_state; + + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state == MRIOC_STATE_UNRECOVERABLE) + adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE; + else if ((mrioc->reset_in_progress) || (mrioc->block_bsgs)) + adp_state = MPI3MR_BSG_ADPSTATE_IN_RESET; + else if (ioc_state == MRIOC_STATE_FAULT) + adp_state = MPI3MR_BSG_ADPSTATE_FAULT; + else + adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL; + + return snprintf(buf, PAGE_SIZE, "%u\n", adp_state); +} +static DEVICE_ATTR_RO(adp_state); + + +/** + * mpi3mr_app_complete_tm - SysFS TM completion callback + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * This is a call back handler for the TM requests issued to the + * firmware through SysFS interface in non blocking mode. This + * functions wakes up pending TM wait queue event when all TMs + * issued are completed. 
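Once the host is registered, the shost attributes above are plain text files under sysfs. A small user-space sketch of driving them, assuming the usual /sys/class/scsi_host/host<N>/ location and host number 0; logging_level is parsed as hex, so writing "18000" selects MPI3_DEBUG_BSG_INFO | MPI3_DEBUG_BSG_ERROR:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_attr(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

static int read_attr(const char *path, char *buf, size_t len)
{
	int fd = open(path, O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = read(fd, buf, len - 1);
	close(fd);
	if (n < 0)
		return -1;
	buf[n] = '\0';
	return 0;
}

int main(void)
{
	char buf[32];

	/* enable BSG info + error messages (0x18000), written as hex text */
	write_attr("/sys/class/scsi_host/host0/logging_level", "18000");

	if (!read_attr("/sys/class/scsi_host/host0/adp_state", buf, sizeof(buf)))
		printf("adapter state: %s", buf);
	return 0;
}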
+ * + * Return: nothing + */ +static void mpi3mr_app_complete_tm(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_scsi_task_mgmt_reply *tm_reply = NULL; + u16 cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_SYSFS_TM_MIN; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto clear_drv_cmd; + + if (drv_cmd->state & MPI3MR_CMD_REPLY_VALID) { + tm_reply = (struct mpi3_scsi_task_mgmt_reply *)drv_cmd->reply; + ioc_info(mrioc, + "%s:TM[%d]: completed, handle(0x%04x), ioc_status(0x%04x), log_info(0x%08x), termination_count(%d), response_code(0x%02x)\n", + __func__, cmd_idx+1, drv_cmd->dev_handle, + drv_cmd->ioc_status, drv_cmd->ioc_loginfo, + le32_to_cpu(tm_reply->termination_count), + (le32_to_cpu(tm_reply->response_data) & + MPI3MR_RI_MASK_RESPCODE)); + mrioc->sysfs_tm_terminated_io_count += + le32_to_cpu(tm_reply->termination_count); + } +clear_drv_cmd: + atomic_dec(&mrioc->sysfs_tm_pending); + if (!atomic_read(&mrioc->sysfs_tm_pending)) + wake_up(&mrioc->sysfs_pending_tm_wq); + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + return; +} + +/** + * mpi3mr_app_issue_tm - sends Task Management request + * @mrioc: Adapter instance reference + * @tm_type: Task Management type + * @handle: Firmware device handle + * @lun: lun ID + * @cmd_priv: SCSI command private data + * + * This function sends Task Management request to the firmware + * through admin request queue. + * + * Return: 0 on success, -1 on failure + */ +static int mpi3mr_app_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type, + u16 handle, uint lun, struct scmd_priv *cmd_priv) +{ + struct mpi3_scsi_task_mgmt_request tm_req; + int r = -1; + struct op_req_qinfo *op_req_q = NULL; + struct mpi3mr_drv_cmd *drv_cmd; + + if ((mrioc->unrecoverable) || (mrioc->reset_in_progress) || + (mrioc->sysfs_tm_issued >= MPI3MR_NUM_SYSFS_TM)) { + return r; + } + drv_cmd = &mrioc->sysfs_tm_cmds[mrioc->sysfs_tm_issued]; + + if (drv_cmd->state & MPI3MR_CMD_PENDING) + return r; + + memset(&tm_req, 0, sizeof(tm_req)); + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_app_complete_tm; + drv_cmd->dev_handle = handle; + tm_req.dev_handle = cpu_to_le16(handle); + tm_req.host_tag = cpu_to_le16(drv_cmd->host_tag); + tm_req.function = MPI3_FUNCTION_SCSI_TASK_MGMT; + tm_req.task_type = tm_type; + + int_to_scsilun(lun, (struct scsi_lun *)tm_req.lun); + + if (cmd_priv) { + op_req_q = &mrioc->req_qinfo[cmd_priv->req_q_idx]; + tm_req.task_host_tag = cpu_to_le16(cmd_priv->host_tag); + tm_req.task_request_queue_id = cpu_to_le16(op_req_q->qid); + } + + ioc_info(mrioc, "%s: TM[%d] type (0x%02x) issued for handle (0x%04x)\n", + __func__, mrioc->sysfs_tm_issued + 1, tm_type, handle); + atomic_inc(&mrioc->sysfs_tm_pending); + r = mpi3mr_admin_request_post(mrioc, &tm_req, sizeof(tm_req), 1); + if (r) { + atomic_dec(&mrioc->sysfs_tm_pending); + ioc_err(mrioc, "%s : posting TM[%d] failed\n", __func__, + mrioc->sysfs_tm_issued + 1); + } else + mrioc->sysfs_tm_issued++; + + return r; +} + +/** + * mpi3mr_app_issue_abort_task - sends Task Abort + * @rq: Block I/O request + * @data: Adapter instance + * @reserved: Unused + * + * This function sends Abort Task Management request to the + * firmware, this is iterator callback for every I/O. 
+ * + * Return: 0 on success, -1 on failure + */ +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_app_issue_abort_task( + struct request *rq, void *data, bool reserved) +{ + struct mpi3mr_ioc *mrioc = (struct mpi3mr_ioc *)data; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv = NULL; + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + u16 dev_handle; + + if (scmd) { + sdev_priv_data = scmd->device->hostdata; + if (sdev_priv_data && sdev_priv_data->tgt_priv_data) { + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_handle = stgt_priv_data->dev_handle; + priv = scsi_cmd_priv(scmd); + } + } + + if (priv && priv->in_lld_scope && + (mrioc->sysfs_tm_issued < MPI3MR_NUM_SYSFS_TM)) { + mpi3mr_app_issue_tm(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK, dev_handle, + sdev_priv_data->lun_id, priv); + BLK_ITER_CALLBACK_RET_VAL(true); + } + + BLK_ITER_CALLBACK_RET_VAL(false); +} + +/** + * mpi3mr_app_tm_sysfs - sends TM of given type + * @mrioc: Adapter instance reference + * @tm_type: Task Management type + * + * This function checks TM type and issue appropriate number of + * specific TM to the devices/IO requests under the scope of the + * TM and the controller and waits the TM requests to complete. + * If TM requests are not completed within predefined timeout + * then issues controller reset + * + * Return: Nothing + */ +static void mpi3mr_app_tm_sysfs(struct mpi3mr_ioc *mrioc, u8 tm_type) +{ + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct scsi_device *sdev; + unsigned long flags, r; + + if ((mrioc->unrecoverable) || (mrioc->reset_in_progress)) + return; + + init_waitqueue_head(&mrioc->sysfs_pending_tm_wq); + atomic_set(&mrioc->sysfs_tm_pending, 0); + mrioc->sysfs_tm_issued = 0; + mrioc->sysfs_tm_terminated_io_count = 0; + + scsi_block_requests(mrioc->shost); + + switch (tm_type) { + case MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK: + blk_mq_tagset_busy_iter(&mrioc->shost->tag_set, + mpi3mr_app_issue_abort_task, (void *)mrioc); + break; + + case MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET: + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (mrioc->sysfs_tm_issued < MPI3MR_NUM_SYSFS_TM) + mpi3mr_app_issue_tm(mrioc, tm_type, + tgtdev->dev_handle, 0, NULL); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + break; + + case MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET: + case MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK_SET: + shost_for_each_device(sdev, mrioc->shost) { + /* wait for free hpr message frames */ + sdev_priv_data = sdev->hostdata; + if (!sdev_priv_data || + !sdev_priv_data->tgt_priv_data) + continue; + stgt_priv_data = sdev_priv_data->tgt_priv_data; + if (mrioc->sysfs_tm_issued < MPI3MR_NUM_SYSFS_TM) + mpi3mr_app_issue_tm(mrioc, tm_type, + stgt_priv_data->dev_handle, + sdev_priv_data->lun_id, NULL); + } + break; + } + scsi_unblock_requests(mrioc->shost); + + if (atomic_read(&mrioc->sysfs_tm_pending)) { + r = wait_event_timeout(mrioc->sysfs_pending_tm_wq, + !atomic_read(&mrioc->sysfs_tm_pending), + MPI3MR_SYSFS_TM_TIMEOUT*HZ); + if (!r) { + ioc_err(mrioc, + "%s: %d TM requests timed out\n", __func__, + atomic_read(&mrioc->sysfs_tm_pending)); + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_SYSFS_TIMEOUT, 1); + } + } + + ioc_info(mrioc, "%s: task management requests issued(%d)\n", __func__, + mrioc->sysfs_tm_issued); + ioc_info(mrioc, "%s: number of IOs terminated(%d)\n", 
__func__, + mrioc->sysfs_tm_terminated_io_count); +} + +/** + * mpi3mr_app_task_management_store- Issue a TM/controller reset + * @cdev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * @count: size of the buffer + * + * A sysfs 'read/write' shost attribute, to issue a set of TMs + * or a controller reset to validate the controller firmware for + * user applications. + * + * Return: strlen() return + */ +static ssize_t +mpi3mr_app_task_management_store(struct device *cdev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(cdev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + int opcode = 0; + + if (sscanf(buf, "%d", &opcode) != 1) + return -EINVAL; + if (mrioc->unrecoverable) + return -EINVAL; + + switch (opcode) { + + case MPI3MR_SYSFS_TM_SOFT_RESET: + scsi_block_requests(mrioc->shost); + ioc_info(mrioc, "%s: soft reset issued, status=%s\n", __func__, + ((!mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_SYSFS, + 0)) ? "SUCCESS" : "FAILED")); + scsi_unblock_requests(mrioc->shost); + break; + + case MPI3MR_SYSFS_TM_DIAG_FAULT_RESET: + scsi_block_requests(mrioc->shost); + ioc_info(mrioc, "%s: diag fault reset issued, status=%s\n", + __func__, + ((!mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_SYSFS, + 1)) ? "SUCCESS" : "FAILED")); + scsi_unblock_requests(mrioc->shost); + break; + + case MPI3MR_SYSFS_TM_ABORT_TASK: + ioc_info(mrioc, "%s: abort task issued\n", __func__); + mpi3mr_app_tm_sysfs(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK); + break; + + case MPI3MR_SYSFS_TM_TARGET_RESET: + ioc_info(mrioc, "%s: target reset issued\n", __func__); + mpi3mr_app_tm_sysfs(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET); + break; + + case MPI3MR_SYSFS_TM_LUN_RESET: + ioc_info(mrioc, "%s: lun reset issued\n", __func__); + mpi3mr_app_tm_sysfs(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET); + break; + + case MPI3MR_SYSFS_TM_ABORT_TASK_SET: + ioc_info(mrioc, "%s: abort task set issued\n", __func__); + mpi3mr_app_tm_sysfs(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK_SET); + break; + + default: + ioc_warn(mrioc, "%s: unsupported opcode(%d)\n", + __func__, opcode); + break; + }; + + return strlen(buf); +} +static DEVICE_ATTR(task_management, 0200, NULL, + mpi3mr_app_task_management_store); + +#if (KERNEL_VERSION(5, 16, 0) > LINUX_VERSION_CODE) +struct device_attribute *mpi3mr_host_attrs[] = { + &dev_attr_version_fw, + &dev_attr_fw_queue_depth, + &dev_attr_op_req_q_count, + &dev_attr_reply_queue_count, + &dev_attr_logging_level, + &dev_attr_adp_state, + &dev_attr_task_management, + NULL, +}; +#else +static struct attribute *mpi3mr_host_attrs[] = { + &dev_attr_version_fw.attr, + &dev_attr_fw_queue_depth.attr, + &dev_attr_op_req_q_count.attr, + &dev_attr_reply_queue_count.attr, + &dev_attr_logging_level.attr, + &dev_attr_adp_state.attr, + &dev_attr_task_management.attr, + NULL, +}; + +static const struct attribute_group mpi3mr_host_attr_group = { + .attrs = mpi3mr_host_attrs +}; + +const struct attribute_group *mpi3mr_host_groups[] = { + &mpi3mr_host_attr_group, + NULL, +}; +#endif + +/* + * SCSI Device attributes under sysfs + */ + +/** + * mpi3mr_scsih_ncq_prio_supp - Check ncq priority is supported + * @sdev: scsi device struct + * + * This function returns whether the given sdev is capable for + * setting NCQ priority or not. 
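The capability check that follows boils down to a single bit of the ATA Information VPD page (0x89). The same bit can be read from user space with an SG_IO INQUIRY; the sketch below is illustrative only, assumes the disk is visible as a /dev/sg node, and applies the same byte-213 test as the driver:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/sg.h>

int main(int argc, char **argv)
{
	/* INQUIRY, EVPD=1, page 0x89, 512-byte allocation length */
	unsigned char cdb[6] = { 0x12, 0x01, 0x89, 0x02, 0x00, 0x00 };
	unsigned char buf[512], sense[32];
	struct sg_io_hdr io;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* e.g. /dev/sg2 */
	if (fd < 0)
		return 1;

	memset(&io, 0, sizeof(io));
	io.interface_id = 'S';
	io.cmd_len = sizeof(cdb);
	io.cmdp = cdb;
	io.dxfer_direction = SG_DXFER_FROM_DEV;
	io.dxferp = buf;
	io.dxfer_len = sizeof(buf);
	io.sbp = sense;
	io.mx_sb_len = sizeof(sense);
	io.timeout = 5000;		/* milliseconds */

	if (ioctl(fd, SG_IO, &io) == 0 && io.status == 0) {
		unsigned int pg_len = ((buf[2] << 8) | buf[3]) + 4;

		if (pg_len > 213)	/* same bit the driver tests */
			printf("NCQ priority supported: %d\n",
			       (buf[213] >> 4) & 1);
	}

	close(fd);
	return 0;
}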
+ * + * Return: 0 when device doesn't support NCQ, 1 otherwise + */ +u8 mpi3mr_scsih_ncq_prio_supp(struct scsi_device *sdev) +{ + unsigned char *buf; + u8 ncq_prio_supp = 0; + + if (!scsi_device_supports_vpd(sdev)) + return ncq_prio_supp; + + buf = kmalloc(SCSI_VPD_PG_LEN, GFP_KERNEL); + if (!buf) + return ncq_prio_supp; + + if (!scsi_get_vpd_page(sdev, 0x89, buf, SCSI_VPD_PG_LEN)) + ncq_prio_supp = (buf[213] >> 4) & 1; + + kfree(buf); + return ncq_prio_supp; +} + +/** + * mpi3mr_app_device_ncq_prio_enable_show - NCQ priority value + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * A sysfs 'read/write' sdev attribute, to display NCQ priority + * value. only works with SATA. + * + * Return: snprintf() return + */ +static ssize_t +mpi3mr_app_device_ncq_prio_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata; + + return snprintf(buf, PAGE_SIZE, "%d\n", + sdev_priv_data->ncq_prio_enable); +} + +/** + * mpi3mr_app_device_ncq_prio_enable_store - NCQ priority change + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * @count: size of the buffer + * + * A sysfs 'read/write' sdev attribute, to store NCQ priority + * value. only works with SATA. + * + * Return: strlen() return + */ +static ssize_t +mpi3mr_app_device_ncq_prio_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata; + int ncq_prio_enable = 0; + + if (sscanf(buf, "%d", &ncq_prio_enable) != 1) + return -EINVAL; + + if (!mpi3mr_scsih_ncq_prio_supp(sdev)) + return -EINVAL; + + sdev_priv_data->ncq_prio_enable = ncq_prio_enable; + return strlen(buf); +} +static DEVICE_ATTR(sata_ncq_prio_enable, 0644, + mpi3mr_app_device_ncq_prio_enable_show, + mpi3mr_app_device_ncq_prio_enable_store); + +/** + * sas_address_show - SysFS callback for dev SASaddress display + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying SAS address of the + * specific SAS/SATA end device. + */ +static ssize_t +sas_address_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct mpi3mr_stgt_priv_data *tgt_priv_data; + struct mpi3mr_tgt_dev *tgtdev; + + sdev_priv_data = sdev->hostdata; + if (!sdev_priv_data) + return 0; + + tgt_priv_data = sdev_priv_data->tgt_priv_data; + if (!tgt_priv_data) + return 0; + tgtdev = tgt_priv_data->tgt_dev; + if (!tgtdev || tgtdev->dev_type != MPI3_DEVICE_DEVFORM_SAS_SATA) + return 0; + return snprintf(buf, PAGE_SIZE, "0x%016llx\n", + (unsigned long long)tgtdev->dev_spec.sas_sata_inf.sas_address); +} + +static DEVICE_ATTR_RO(sas_address); + +/** + * device_handle_show - SysFS callback for device handle display + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying firmware internal + * device handle of the specific device. 
+ */ +static ssize_t +device_handle_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct mpi3mr_stgt_priv_data *tgt_priv_data; + struct mpi3mr_tgt_dev *tgtdev; + + sdev_priv_data = sdev->hostdata; + if (!sdev_priv_data) + return 0; + + tgt_priv_data = sdev_priv_data->tgt_priv_data; + if (!tgt_priv_data) + return 0; + tgtdev = tgt_priv_data->tgt_dev; + if (!tgtdev) + return 0; + return snprintf(buf, PAGE_SIZE, "0x%04x\n", tgtdev->dev_handle); +} + +static DEVICE_ATTR_RO(device_handle); + +/** + * persistent_id_show - SysFS callback for persisten ID display + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying persistent ID of the + * of the specific device. + */ +static ssize_t +persistent_id_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct mpi3mr_stgt_priv_data *tgt_priv_data; + struct mpi3mr_tgt_dev *tgtdev; + + sdev_priv_data = sdev->hostdata; + if (!sdev_priv_data) + return 0; + + tgt_priv_data = sdev_priv_data->tgt_priv_data; + if (!tgt_priv_data) + return 0; + tgtdev = tgt_priv_data->tgt_dev; + if (!tgtdev) + return 0; + return snprintf(buf, PAGE_SIZE, "%d\n", tgtdev->perst_id); +} +static DEVICE_ATTR_RO(persistent_id); + +#if (KERNEL_VERSION(5, 16, 0) > LINUX_VERSION_CODE) +struct device_attribute *mpi3mr_dev_attrs[] = { + &dev_attr_sata_ncq_prio_enable, + &dev_attr_sas_address, + &dev_attr_device_handle, + &dev_attr_persistent_id, + NULL, +}; +#else +static struct attribute *mpi3mr_dev_attrs[] = { + &dev_attr_sata_ncq_prio_enable.attr, + &dev_attr_sas_address.attr, + &dev_attr_device_handle.attr, + &dev_attr_persistent_id.attr, + NULL, +}; + +static const struct attribute_group mpi3mr_dev_attr_group = { + .attrs = mpi3mr_dev_attrs +}; + +const struct attribute_group *mpi3mr_dev_groups[] = { + &mpi3mr_dev_attr_group, + NULL, +}; +#endif + diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.h b/drivers/scsi/mpi3mr/mpi3mr_app.h new file mode 100644 index 0000000000000..b68269336d357 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_app.h @@ -0,0 +1,450 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. 
+ * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#ifndef MPI3MR_APP_H_INCLUDED +#define MPI3MR_APP_H_INCLUDED + +#include + +/* Definitions for BSG commands */ +#define MPI3MR_DEV_NAME "mpi3mrctl" + +#define MPI3MR_IOCTL_VERSION 0x06 + +#define MPI3MR_APP_DEFAULT_TIMEOUT (60) /*seconds*/ + +#define MPI3MR_BSG_ADPTYPE_UNKNOWN 0 +#define MPI3MR_BSG_ADPTYPE_AVGFAMILY 1 + +#define MPI3MR_BSG_ADPSTATE_UNKNOWN 0 +#define MPI3MR_BSG_ADPSTATE_OPERATIONAL 1 +#define MPI3MR_BSG_ADPSTATE_FAULT 2 +#define MPI3MR_BSG_ADPSTATE_IN_RESET 3 +#define MPI3MR_BSG_ADPSTATE_UNRECOVERABLE 4 + +#define MPI3MR_BSG_ADPRESET_UNKNOWN 0 +#define MPI3MR_BSG_ADPRESET_SOFT 1 +#define MPI3MR_BSG_ADPRESET_DIAG_FAULT 2 + +#define MPI3MR_BSG_LOGDATA_MAX_ENTRIES 400 +#define MPI3MR_BSG_LOGDATA_ENTRY_HEADER_SZ 4 + +#define MPI3MR_DRVBSG_OPCODE_UNKNOWN 0 +#define MPI3MR_DRVBSG_OPCODE_ADPINFO 1 +#define MPI3MR_DRVBSG_OPCODE_ADPRESET 2 +#define MPI3MR_DRVBSG_OPCODE_ALLTGTDEVINFO 4 +#define MPI3MR_DRVBSG_OPCODE_GETCHGCNT 5 +#define MPI3MR_DRVBSG_OPCODE_LOGDATAENABLE 6 +#define MPI3MR_DRVBSG_OPCODE_PELENABLE 7 +#define MPI3MR_DRVBSG_OPCODE_GETLOGDATA 8 +#define MPI3MR_DRVBSG_OPCODE_QUERY_HDB 9 +#define MPI3MR_DRVBSG_OPCODE_REPOST_HDB 10 +#define MPI3MR_DRVBSG_OPCODE_UPLOAD_HDB 11 +#define MPI3MR_DRVBSG_OPCODE_REFRESH_HDB_TRIGGERS 12 + + +#define MPI3MR_BSG_BUFTYPE_UNKNOWN 0 +#define MPI3MR_BSG_BUFTYPE_RAIDMGMT_CMD 1 +#define MPI3MR_BSG_BUFTYPE_RAIDMGMT_RESP 2 +#define MPI3MR_BSG_BUFTYPE_DATA_IN 3 +#define MPI3MR_BSG_BUFTYPE_DATA_OUT 4 +#define MPI3MR_BSG_BUFTYPE_MPI_REPLY 5 +#define MPI3MR_BSG_BUFTYPE_ERR_RESPONSE 6 +#define MPI3MR_BSG_BUFTYPE_MPI_REQUEST 0xFE + +#define MPI3MR_BSG_MPI_REPLY_BUFTYPE_UNKNOWN 0 +#define MPI3MR_BSG_MPI_REPLY_BUFTYPE_STATUS 1 +#define MPI3MR_BSG_MPI_REPLY_BUFTYPE_ADDRESS 2 + +#define MPI3MR_HDB_BUFTYPE_UNKNOWN 0 +#define MPI3MR_HDB_BUFTYPE_TRACE 1 +#define MPI3MR_HDB_BUFTYPE_FIRMWARE 2 +#define MPI3MR_HDB_BUFTYPE_RESERVED 3 + +#define MPI3MR_HDB_BUFSTATUS_UNKNOWN 0 +#define MPI3MR_HDB_BUFSTATUS_NOT_ALLOCATED 1 +#define MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED 2 +#define MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED 3 +#define MPI3MR_HDB_BUFSTATUS_RELEASED 4 + +#define MPI3MR_HDB_TRIGGER_TYPE_UNKNOWN 0 +#define MPI3MR_HDB_TRIGGER_TYPE_FAULT 1 +#define MPI3MR_HDB_TRIGGER_TYPE_ELEMENT 2 +#define MPI3MR_HDB_TRIGGER_TYPE_MASTER 3 +#define MPI3MR_HDB_TRIGGER_TYPE_SOFT_RESET 4 +#define MPI3MR_HDB_TRIGGER_TYPE_FW_RELEASED 5 + +#define MPI3MR_HDB_REFRESH_TYPE_RESERVED 0 +#define MPI3MR_HDB_REFRESH_TYPE_CURRENT 1 +#define MPI3MR_HDB_REFRESH_TYPE_DEFAULT 2 +#define MPI3MR_HDB_HDB_REFRESH_TYPE_PERSISTENT 3 + +/* Supported BSG commands */ +enum command { + MPI3MRDRVCMD = 1, + MPI3MRMPTCMD = 2, +}; + +/* Data direction definitions */ +enum data_direction { + DATA_IN = 1, + DATA_OUT = 2, +}; + +/** + * struct mpi3mr_bsg_in_adpinfo - Adapter information request + * data returned by the driver. 
+ * + * @adp_type: Adapter type + * @rsvd1: Reserved + * @pci_dev_id: PCI device ID of the adapter + * @pci_dev_hw_rev: PCI revision of the adapter + * @pci_subsys_dev_id: PCI subsystem device ID of the adapter + * @pci_subsys_ven_id: PCI subsystem vendor ID of the adapter + * @pci_dev: PCI device + * @pci_func: PCI function + * @pci_bus: PCI bus + * @rsvd2: Reserved + * @pci_seg_id: PCI segment ID + * @app_intfc_ver: version of the application interface definition + * @rsvd3: Reserved + * @rsvd4: Reserved + * @rsvd5: Reserved + * @driver_info: Driver Information (Version/Name) + */ +struct mpi3mr_bsg_in_adpinfo { + uint32_t adp_type; + uint32_t rsvd1; + uint32_t pci_dev_id; + uint32_t pci_dev_hw_rev; + uint32_t pci_subsys_dev_id; + uint32_t pci_subsys_ven_id; + uint32_t pci_dev:5; + uint32_t pci_func:3; + uint32_t pci_bus:8; + uint16_t rsvd2; + uint32_t pci_seg_id; + uint32_t app_intfc_ver; + uint8_t adp_state; + uint8_t rsvd3; + uint16_t rsvd4; + uint32_t rsvd5[2]; + struct mpi3_driver_info_layout driver_info; +}; + +/** + * struct mpi3mr_bsg_adp_reset - Adapter reset request + * payload data to the driver. + * + * @reset_type: Reset type + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_bsg_adp_reset { + uint8_t reset_type; + uint8_t rsvd1; + uint16_t rsvd2; +}; + +/** + * struct mpi3mr_change_count - Topology change count + * returned by the driver. + * + * @change_count: Topology change count + * @rsvd: Reserved + */ +struct mpi3mr_change_count { + uint16_t change_count; + uint16_t rsvd; +}; + +/** + * struct mpi3mr_device_map_info - Target device mapping + * information + * + * @handle: Firmware device handle + * @perst_id: Persistent ID assigned by the firmware + * @target_id: Target ID assigned by the driver + * @bus_id: Bus ID assigned by the driver + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_device_map_info { + uint16_t handle; + uint16_t perst_id; + uint32_t target_id; + uint8_t bus_id; + uint8_t rsvd1; + uint16_t rsvd2; +}; + +/** + * struct mpi3mr_all_tgt_info - Target device mapping + * information returned by the driver + * + * @num_devices: The number of devices in driver's inventory + * @rsvd1: Reserved + * @rsvd2: Reserved + * @dmi: Variable length array of mapping information of targets + */ +struct mpi3mr_all_tgt_info { + uint16_t num_devices; //The number of devices in driver's inventory + uint16_t rsvd1; + uint32_t rsvd2; + struct mpi3mr_device_map_info dmi[1]; //Variable length Array +}; + +/** + * struct mpi3mr_logdata_enable - Number of log data + * entries saved by the driver returned as payload data for + * enable logdata BSG request by the driver. + * + * @max_entries: Number of log data entries cached by the driver + * @rsvd: Reserved + */ +struct mpi3mr_logdata_enable { + uint16_t max_entries; + uint16_t rsvd; +}; + +/** + * struct mpi3mr_bsg_out_pel_enable - PEL enable request payload + * data to the driver. + * + * @pel_locale: PEL locale to the firmware + * @pel_class: PEL class to the firmware + * @rsvd: Reserved + */ +struct mpi3mr_bsg_out_pel_enable { + uint16_t pel_locale; + uint8_t pel_class; + uint8_t rsvd; +}; + +/** + * struct mpi3mr_logdata_entry - Log data entry cached by the + * driver. 
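struct mpi3mr_all_tgt_info above is returned with a one-element dmi[] placeholder, so consumers size and walk it from num_devices. A minimal user-space sketch of that walk (the structures are local mirrors for illustration, not the header itself):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* local mirrors of the definitions above */
struct device_map_info {
	uint16_t handle;
	uint16_t perst_id;
	uint32_t target_id;
	uint8_t  bus_id;
	uint8_t  rsvd1;
	uint16_t rsvd2;
};

struct all_tgt_info {
	uint16_t num_devices;
	uint16_t rsvd1;
	uint32_t rsvd2;
	struct device_map_info dmi[];	/* flexible array in this sketch */
};

static void print_map(const struct all_tgt_info *info)
{
	for (uint16_t i = 0; i < info->num_devices; i++)
		printf("handle 0x%04x -> persistent id %u (bus %u, target %u)\n",
		       info->dmi[i].handle, info->dmi[i].perst_id,
		       info->dmi[i].bus_id, info->dmi[i].target_id);
}

int main(void)
{
	uint16_t n = 2;
	struct all_tgt_info *info;

	info = calloc(1, sizeof(*info) + n * sizeof(info->dmi[0]));
	if (!info)
		return 1;
	info->num_devices = n;
	info->dmi[0] = (struct device_map_info){ .handle = 0x0011, .perst_id = 1 };
	info->dmi[1] = (struct device_map_info){ .handle = 0x0012, .perst_id = 2,
						 .target_id = 1 };

	print_map(info);
	free(info);
	return 0;
}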
+ * + * @valid_entry: Is the entry valid + * @rsvd1: Reserved + * @rsvd2: Reserved + * @data: Log entry data of controller specific size + */ +struct mpi3mr_logdata_entry { + uint8_t valid_entry; + uint8_t rsvd1; + uint16_t rsvd2; + uint8_t data[1]; //Variable length Array +}; + +/** + * struct mpi3mr_bsg_in_log_data - Log data entries saved by + * the driver returned as payload data for Get logdata request + * by the driver. + * + * @entry: Log data entry + */ +struct mpi3mr_bsg_in_log_data { + struct mpi3mr_logdata_entry entry[1]; //Variable length Array +}; + +/** + * struct mpi3mr_hdb_entry - host diag buffer entry. + * + * @buf_type: Buffer type + * @status: Buffer status + * @trigger_type: Trigger type + * @rsvd1: Reserved + * @size: Buffer size + * @rsvd2: Reserved + * @trigger_data: Trigger specific data + * @rsvd3: Reserved + * @rsvd4: Reserved + */ +struct mpi3mr_hdb_entry { + uint8_t buf_type; + uint8_t status; + uint8_t trigger_type; + uint8_t rsvd1; + uint16_t size; + uint16_t rsvd2; + uint64_t trigger_data; + uint32_t rsvd3; + uint32_t rsvd4; +}; + + +/** + * struct mpi3mr_bsg_in_hdb_status - This structure contains + * return data for the BSG request to retrieve the number of host + * diagnostic buffers supported by the driver and their current + * status and additional status specific data if any in forms of + * multiple hdb entries. + * + * @num_hdb_types: Number of host diag buffer types supported + * @rsvd1: Reserved + * @rsvd2: Reserved + * @rsvd3: Reserved + * @entry: Diag buffer status entry + */ +struct mpi3mr_bsg_in_hdb_status { + uint8_t num_hdb_types; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t rsvd3; + struct mpi3mr_hdb_entry entry[1]; //Variable length Array +}; + +/** + * struct mpi3mr_bsg_out_repost_hdb - Repost host diagnostic + * buffer request payload data to the driver. + * + * @buf_type: Buffer type + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_bsg_out_repost_hdb { + uint8_t buf_type; + uint8_t rsvd1; + uint16_t rsvd2; +}; + +/** + * struct mpi3mr_bsg_out_upload_hdb - Upload host diagnostic + * buffer request payload data to the driver. + * + * @buf_type: Buffer type + * @rsvd1: Reserved + * @rsvd2: Reserved + * @start_offset: Start offset of the buffer from where to copy + * @length: Length of the buffer to copy + */ +struct mpi3mr_bsg_out_upload_hdb { + uint8_t buf_type; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t start_offset; + uint32_t length; +}; + +/** + * struct mpi3mr_bsg_out_refresh_hdb_triggers - Refresh host + * diagnostic buffer triggers request payload data to the driver. + * + * @page_type: Page type + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_bsg_out_refresh_hdb_triggers { + uint8_t page_type; + uint8_t rsvd1; + uint16_t rsvd2; +}; + +/** + * struct mpi3mr_bsg_drv_cmd - Generic bsg data + * structure for all driver specific requests. + * + * @mrioc_id: Controller ID + * @opcode: Driver specific opcode + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_bsg_drv_cmd { + uint8_t mrioc_id; + uint8_t opcode; + uint16_t rsvd1; + uint32_t rsvd2[4]; +}; +/** + * struct mpi3mr_bsg_in_reply_buf - MPI reply buffer returned + * for MPI Passthrough request . 
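Applications reach these structures through the mpi3mrctl bsg node using the sg_io_v4 SG_IO ioctl. The sketch below is illustrative only: it mirrors just the driver-command half of struct mpi3mr_bsg_packet (defined further down in this header), hard-codes the MPI3MRDRVCMD and ADPINFO values listed above, and assumes the generic protocol/subprotocol values from <linux/bsg.h> are accepted by the bsg layer. Real tools should include mpi3mr_app.h directly and size the reply buffer from struct mpi3mr_bsg_in_adpinfo.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/bsg.h>
#include <scsi/sg.h>		/* SG_IO */

/* minimal local mirrors of the request structures in this header */
struct bsg_drv_cmd {
	uint8_t  mrioc_id;
	uint8_t  opcode;	/* MPI3MR_DRVBSG_OPCODE_ADPINFO == 1 */
	uint16_t rsvd1;
	uint32_t rsvd2[4];
};

struct bsg_packet {
	uint8_t  cmd_type;	/* MPI3MRDRVCMD == 1 */
	uint8_t  rsvd1;
	uint16_t rsvd2;
	uint32_t rsvd3;
	struct bsg_drv_cmd drvrcmd;
};

int main(void)
{
	struct bsg_packet req = {
		.cmd_type = 1,				/* MPI3MRDRVCMD */
		.drvrcmd  = { .mrioc_id = 0, .opcode = 1 /* ADPINFO */ },
	};
	uint8_t adpinfo[1024];	/* should be sizeof(struct mpi3mr_bsg_in_adpinfo) */
	struct sg_io_v4 io;
	int fd;

	fd = open("/dev/bsg/mpi3mrctl0", O_RDWR);
	if (fd < 0)
		return 1;

	memset(&io, 0, sizeof(io));
	io.guard        = 'Q';
	io.protocol     = BSG_PROTOCOL_SCSI;
	io.subprotocol  = BSG_SUB_PROTOCOL_SCSI_TRANSPORT;
	io.request      = (uintptr_t)&req;
	io.request_len  = sizeof(req);
	io.din_xferp    = (uintptr_t)adpinfo;	/* DATA_IN payload */
	io.din_xfer_len = sizeof(adpinfo);
	io.timeout      = 30 * 1000;		/* milliseconds */

	if (ioctl(fd, SG_IO, &io) == 0)
		printf("adapter info returned %u bytes\n",
		       io.din_xfer_len - io.din_resid);

	close(fd);
	return 0;
}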
+ * + * @mpi_reply_type: Type of MPI reply + * @rsvd1: Reserved + * @rsvd2: Reserved + * @reply_buf: Variable Length buffer based on mpirep type + */ +struct mpi3mr_bsg_in_reply_buf { + uint8_t mpi_reply_type; + uint8_t rsvd1; + uint16_t rsvd2; + uint8_t reply_buf[1]; /*Variable Length buffer based on mpirep type*/ +}; +/** + * struct mpi3mr_buf_entry - User buffer descriptor for MPI + * Passthrough requests. + * + * @buf_type: Buffer type + * @rsvd1: Reserved + * @rsvd2: Reserved + * @buf_len: Buffer length + */ +struct mpi3mr_buf_entry { + uint8_t buf_type; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t buf_len; +}; +/** + * struct mpi3mr_bsg_buf_entry_list - list of user buffer + * descriptor for MPI Passthrough requests. + * + * @num_of_entries: Number of buffer descriptors + * @rsvd1: Reserved + * @rsvd2: Reserved + * @rsvd3: Reserved + * @buf_entry: Variable length array of buffer descriptors + */ +struct mpi3mr_buf_entry_list { + uint8_t num_of_entries; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t rsvd3; + struct mpi3mr_buf_entry buf_entry[1]; +}; +/** + * struct mpi3mr_bsg_mptcmd - Generic bsg data + * structure for all MPI Passthrough requests . + * + * @mrioc_id: Controller ID + * @rsvd1: Reserved + * @timeout: MPI request timeout + * @buf_entry_list: Buffer descriptor list + */ +struct mpi3mr_bsg_mptcmd { + uint8_t mrioc_id; + uint8_t rsvd1; + uint16_t timeout; + uint32_t rsvd2; + struct mpi3mr_buf_entry_list buf_entry_list; +}; + +/** + * struct mpi3mr_bsg_packet - Generic bsg data + * structure for all supported requests . + * + * @cmd_type: represents drvrcmd or mptcmd + * @rsvd1: Reserved + * @rsvd2: Reserved + * @drvrcmd: driver request structure + * @mptcmd: mpt request structure + */ +struct mpi3mr_bsg_packet { + uint8_t cmd_type; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t rsvd3; + union { + struct mpi3mr_bsg_drv_cmd drvrcmd; + struct mpi3mr_bsg_mptcmd mptcmd; + } cmd; +}; + +#endif diff --git a/drivers/scsi/mpi3mr/mpi3mr_debug.h b/drivers/scsi/mpi3mr/mpi3mr_debug.h new file mode 100644 index 0000000000000..010dfcfac94b2 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_debug.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#ifndef MPI3SAS_DEBUG_H_INCLUDED + +#define MPI3SAS_DEBUG_H_INCLUDED + +/* + * debug levels + */ + +#define MPI3_DEBUG_EVENT 0x00000001 +#define MPI3_DEBUG_EVENT_WORK_TASK 0x00000002 +#define MPI3_DEBUG_INIT 0x00000004 +#define MPI3_DEBUG_EXIT 0x00000008 +#define MPI3_DEBUG_TM 0x00000010 +#define MPI3_DEBUG_RESET 0x00000020 +#define MPI3_DEBUG_SCSI_ERROR 0x00000040 +#define MPI3_DEBUG_REPLY 0x00000080 +#define MPI3_DEBUG_CFG_ERROR 0x00000100 +#define MPI3_DEBUG_TRANSPORT_ERROR 0x00000200 +#define MPI3_DEBUG_BSG_ERROR 0x00008000 +#define MPI3_DEBUG_BSG_INFO 0x00010000 +#define MPI3_DEBUG_SCSI_INFO 0x00020000 +#define MPI3_DEBUG_CFG_INFO 0x00040000 +#define MPI3_DEBUG_TRANSPORT_INFO 0x00080000 +#define MPI3_DEBUG 0x01000000 +#define MPI3_DEBUG_SG 0x02000000 + + +/* + * debug macros + */ + +#define ioc_err(ioc, fmt, ...) \ + pr_err("%s: " fmt, (ioc)->name, ##__VA_ARGS__) +#define ioc_notice(ioc, fmt, ...) \ + pr_notice("%s: " fmt, (ioc)->name, ##__VA_ARGS__) +#define ioc_warn(ioc, fmt, ...) \ + pr_warn("%s: " fmt, (ioc)->name, ##__VA_ARGS__) +#define ioc_info(ioc, fmt, ...) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__) + +#define dprint(ioc, fmt, ...) 
\ + do { \ + if (ioc->logging_level & MPI3_DEBUG) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_event_th(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_EVENT) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_event_bh(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_EVENT_WORK_TASK) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_init(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_INIT) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_exit(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_EXIT) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_tm(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_TM) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_reply(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_REPLY) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_reset(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_RESET) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_scsi_info(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_SCSI_INFO) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_scsi_err(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_SCSI_ERROR) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_scsi_command(ioc, SCMD, LOG_LEVEL) \ + do { \ + if (ioc->logging_level & LOG_LEVEL) \ + scsi_print_command(SCMD); \ + } while (0) + + +#define dprint_bsg_info(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_BSG_INFO) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_bsg_err(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_BSG_ERROR) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_cfg_info(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_CFG_INFO) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_cfg_err(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_CFG_ERROR) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) +#define dprint_transport_info(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_transport_err(ioc, fmt, ...) 
\ + do { \ + if (ioc->logging_level & MPI3_DEBUG_TRANSPORT_ERROR) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#endif /* MPT3SAS_DEBUG_H_INCLUDED */ + +/** + * dprint_dump - print contents of a memory buffer + * @req: Pointer to a memory buffer + * @sz: Memory buffer size + * @namestr: Name String to identify the buffer type + */ +static inline void +dprint_dump(void *req, int sz, const char *name_string) +{ + int i; + __le32 *mfp = (__le32 *)req; + sz = sz/4; + + if (name_string) + pr_info("%s:\n\t", name_string); + else + pr_info("request:\n\t"); + for (i = 0; i < sz; i++) { + if (i && ((i % 8) == 0)) + pr_info("\n\t"); + pr_info("%08x ", le32_to_cpu(mfp[i])); + } + pr_info("\n"); +} + + diff --git a/drivers/scsi/mpi3mr/mpi3mr_debugfs.c b/drivers/scsi/mpi3mr/mpi3mr_debugfs.c new file mode 100644 index 0000000000000..62201826bcd9e --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_debugfs.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +#include "mpi3mr.h" + +#ifdef CONFIG_DEBUG_FS +#include + +struct dentry *mpi3mr_dbgfs_root; + +struct mpi3mr_debugfs_buffer { + void *buf; + u32 len; +}; + +static ssize_t +mpi3mr_debugfs_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) + +{ + struct mpi3mr_debugfs_buffer *debug = filp->private_data; + + if (!debug || !debug->buf) + return 0; + + return simple_read_from_buffer(ubuf, cnt, ppos, debug->buf, debug->len); +} + +static int +mpi3mr_debugfs_dmesg_open(struct inode *inode, struct file *file) +{ + struct mpi3mr_ioc *mrioc = inode->i_private; + struct mpi3mr_debugfs_buffer *debug; + + if (!mrioc->drv_diag_buffer) + return -EPERM; + + debug = kzalloc(sizeof(struct mpi3mr_debugfs_buffer), GFP_KERNEL); + if (!debug) + return -ENOMEM; + + debug->buf = (void *)mrioc->drv_diag_buffer + sizeof(struct mpi3_driver_buffer_header); + debug->len = mrioc->drv_diag_buffer_sz - sizeof(struct mpi3_driver_buffer_header); + + file->private_data = debug; + + return 0; +} + +static int +mpi3mr_debugfs_uefi_logs_open(struct inode *inode, struct file *file) +{ + struct mpi3mr_ioc *mrioc = inode->i_private; + struct mpi3mr_debugfs_buffer *debug; + + if (!mrioc->uefi_logs) + return -EPERM; + + debug = kzalloc(sizeof(struct mpi3mr_debugfs_buffer), GFP_KERNEL); + if (!debug) + return -ENOMEM; + + debug->buf = (void *)mrioc->uefi_logs; + debug->len = mrioc->uefi_logs_sz; + + file->private_data = debug; + + return 0; +} +static int +mpi3mr_debugfs_release(struct inode *inode, struct file *file) +{ + struct mpi3mr_debug_buffer *debug = file->private_data; + + if (!debug) + return 0; + + file->private_data = NULL; + kfree(debug); + return 0; +} + +static const struct file_operations mpi3mr_debugfs_dmesg_fops = { + .owner = THIS_MODULE, + .open = mpi3mr_debugfs_dmesg_open, + .read = mpi3mr_debugfs_read, + .release = mpi3mr_debugfs_release, +}; + +static const struct file_operations mpi3mr_debugfs_uefi_logs_fops = { + .owner = THIS_MODULE, + .open = mpi3mr_debugfs_uefi_logs_open, + .read = mpi3mr_debugfs_read, + .release = mpi3mr_debugfs_release, +}; + +/* + * mpi3mr_init_debugfs : Create debugfs root for mpi3mr driver + */ +void mpi3mr_init_debugfs(void) +{ + mpi3mr_dbgfs_root = debugfs_create_dir(MPI3MR_DRIVER_NAME, NULL); + if (!mpi3mr_dbgfs_root) + 
pr_info("Cannot create debugfs root\n"); +} + +/* + * mpi3mr_exit_debugfs : Remove debugfs root for mpi3mr driver + */ +void mpi3mr_exit_debugfs(void) +{ + debugfs_remove_recursive(mpi3mr_dbgfs_root); +} + +/* + * mpi3mr_setup_debugfs : Setup debugfs per adapter + * mrioc: Soft instance of adapter + */ +void +mpi3mr_setup_debugfs(struct mpi3mr_ioc *mrioc) +{ + char name[64]; + int i; + + snprintf(name, sizeof(name), "scsi_host%d", mrioc->shost->host_no); + + if (!mrioc->dbgfs_adapter) { + mrioc->dbgfs_adapter = + debugfs_create_dir(name, mpi3mr_dbgfs_root); + + if (!mrioc->dbgfs_adapter) { + ioc_err(mrioc, + "failed to create per adapter debugfs directory\n"); + return; + } + } + + for (i = 0; i < mrioc->num_queues; i++) { + snprintf(name, sizeof(name), "queue%d", mrioc->req_qinfo[i].qid); + mrioc->req_qinfo[i].dbgfs_req_queue = + debugfs_create_dir(name, mrioc->dbgfs_adapter); + + if (!mrioc->req_qinfo[i].dbgfs_req_queue) { + ioc_err(mrioc, + "failed to create per request queue debugfs directory\n"); + debugfs_remove_recursive(mrioc->dbgfs_adapter); + mrioc->dbgfs_adapter = NULL; + return; + } + + debugfs_create_u32("qfull_instances", 0444, + mrioc->req_qinfo[i].dbgfs_req_queue, + &mrioc->req_qinfo[i].qfull_instances); + + debugfs_create_u64("qfull_io_count", 0644, + mrioc->req_qinfo[i].dbgfs_req_queue, + &mrioc->req_qinfo[i].qfull_io_count); + } + + /* This interface to dump system logs in host space is for test/verify purpose only */ + snprintf(name, sizeof(name), "dmesg"); + mrioc->dmesg_dump = + debugfs_create_file(name, 0444, + mrioc->dbgfs_adapter, + mrioc, &mpi3mr_debugfs_dmesg_fops); + if (!mrioc->dmesg_dump) { + ioc_err(mrioc, "cannot create dmesg debugfs file\n"); + debugfs_remove(mrioc->dbgfs_adapter); + } + + snprintf(name, sizeof(name), "uefi_logs"); + mrioc->uefi_logs_dump = + debugfs_create_file(name, 0444, + mrioc->dbgfs_adapter, + mrioc, &mpi3mr_debugfs_uefi_logs_fops); + if (!mrioc->uefi_logs_dump) { + ioc_err(mrioc, "cannot create uefi debugfs file\n"); + debugfs_remove(mrioc->dbgfs_adapter); + } +} + +/* + * mpi3mr_destroy_debugfs : Destroy debugfs per adapter + * mrioc: Soft instance of adapter + */ +void mpi3mr_destroy_debugfs(struct mpi3mr_ioc *mrioc) +{ + debugfs_remove_recursive(mrioc->dbgfs_adapter); + mrioc->dbgfs_adapter = NULL; +} + +#else +void mpi3mr_init_debugfs(void) +{ +} +void mpi3mr_exit_debugfs(void) +{ +} +void mpi3mr_setup_debugfs(struct mpi3mr_ioc *mrioc) +{ +} +void mpi3mr_destroy_debugfs(struct mpi3mr_ioc *mrioc) +{ +} +#endif /*CONFIG_DEBUG_FS*/ diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c new file mode 100644 index 0000000000000..20bee864977c5 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -0,0 +1,6778 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include "mpi3mr.h" +#include "mpi3mr_app.h" + +int poll_queues; +module_param(poll_queues, int, 0444); +#if (KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE) +MODULE_PARM_DESC(poll_queues, "Number of queues for io_uring poll mode. 
(Range 1 - 126)"); +#else +MODULE_PARM_DESC(poll_queues, "This parameter is unused in this version of the kernel (Try kernel >= 5.13)"); +#endif + +bool enable_segqueue = true; +module_param(enable_segqueue, bool, 0444); +MODULE_PARM_DESC(enable_segqueue, + "Enable segmented operational request & reply queues in supported controllers (Default = 1)"); + +int drv_db_level = 1; +module_param(drv_db_level, int, 0444); +MODULE_PARM_DESC(drv_db_level, "Driver diagnostic buffer level(Default=1).\n\t\t" + "options:\n\t\t" + "0 = disabled: Driver diagnostic buffer not captured\n\t\t" + "1 = minidump: Driver diagnostic buffer captures prints\n\t\t" + "related to specific mrioc instance\n\t\t" + "2 = fulldump: Driver diagnostic buffer captures prints\n\t\t" + "related to specific mrioc instance and complete dmesg logs" + ); + +extern int enable_dix; + +static void mpi3mr_pel_wait_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd); + +#if defined(writeq) && defined(CONFIG_64BIT) +static inline void mpi3mr_writeq(__u64 b, volatile void __iomem *addr, + spinlock_t *write_queue_lock) +{ + writeq(b, addr); +} +#else +static inline void mpi3mr_writeq(__u64 b, volatile void __iomem *addr, + spinlock_t *write_queue_lock) +{ + __u64 data_out = b; + unsigned long flags; + + spin_lock_irqsave(write_queue_lock, flags); + + writel((u32)(data_out), addr); + writel((u32)(data_out >> 32), (addr + 4)); + + spin_unlock_irqrestore(write_queue_lock, flags); +} +#endif + +#if defined(readq) && defined(CONFIG_64BIT) +static inline __u64 mpi3mr_readq(const volatile void __iomem *addr) +{ + return readq(addr); +} +#else +static inline __u64 mpi3mr_readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + return low + ((u64)high << 32); +} +#endif +/** + * mpi3mr_check_req_qfull - Check request queue is full or not + * @op_req_q: Operational reply queue info + * + * Return: true when queue full, false otherwise. + */ +static inline bool +mpi3mr_check_req_qfull(struct op_req_qinfo *op_req_q) +{ + u16 pi, ci, max_entries; + bool is_qfull = false; + + pi = op_req_q->pi; + ci = READ_ONCE(op_req_q->ci); + max_entries = op_req_q->num_requests; + + if ((ci == (pi + 1)) || ((!ci) && (pi == (max_entries - 1)))) + is_qfull = true; + + return is_qfull; +} + +/** + * mpi3mr_sync_irqs - Synchronize all IRQs + * @mrioc: Adapter instance reference + * + * Return: Nothing. + */ +static void mpi3mr_sync_irqs(struct mpi3mr_ioc *mrioc) +{ + u16 i, max_vectors; + + max_vectors = mrioc->intr_info_count; + + for (i = 0; i < max_vectors; i++) + synchronize_irq(pci_irq_vector(mrioc->pdev, i)); +} + +/** + * mpi3mr_ioc_disable_intr - Disable controller interrupts + * @mrioc: Adapter instance reference + * + * Return: Nothing. + */ +void mpi3mr_ioc_disable_intr(struct mpi3mr_ioc *mrioc) +{ + mrioc->intr_enabled = 0; + mpi3mr_sync_irqs(mrioc); +} + +/** + * mpi3mr_ioc_enable_intr - Enable controller interrupts + * @mrioc: Adapter instance reference + * + * Return: Nothing. + */ +void mpi3mr_ioc_enable_intr(struct mpi3mr_ioc *mrioc) +{ + mrioc->intr_enabled = 1; +} + +/** + * mpi3mr_cleanup_isr - Cleanup IRQs + * @mrioc: Adapter instance reference + * + * Disable interrupts, Free all IRQs, free memory for interrupt + * information and free IRQ vectors. + * + * Return: Nothing. 
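As an aside on mpi3mr_check_req_qfull() above: the test is the classic one-slot-reserved ring check, where one entry is never used so that "full" and "empty" (pi == ci) stay distinguishable. A minimal standalone restatement, with a hypothetical helper name that is not part of the patch:

#include <linux/types.h>

/* Full when the producer index sits immediately behind the consumer index,
 * including the wrap-around case; empty is simply pi == ci.
 */
static inline bool example_ring_full(u16 pi, u16 ci, u16 num_entries)
{
	return (ci == (u16)(pi + 1)) ||
	       ((!ci) && (pi == (num_entries - 1)));
}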
+ */ +static void mpi3mr_cleanup_isr(struct mpi3mr_ioc *mrioc) +{ + u16 i; + + mpi3mr_ioc_disable_intr(mrioc); + + if (!mrioc->intr_info) + return; + + for (i = 0; i < mrioc->intr_info_count; i++) + free_irq(pci_irq_vector(mrioc->pdev, i), + (mrioc->intr_info + i)); + + kfree(mrioc->intr_info); + mrioc->intr_info = NULL; + mrioc->intr_info_count = 0; + mrioc->is_intr_info_set = false; + pci_free_irq_vectors(mrioc->pdev); +} + +/** + * mpi3mr_add_sg_single - Build a scatter gather element(sge) + * @paddr: SGE address + * @flags: SGE flags + * @length: SGE length + * @dma_addr: DMA address + * + * Set the SGE element in the given paddr. + * + * Return: Nothing. + */ +void mpi3mr_add_sg_single(void *paddr, u8 flags, u32 length, + dma_addr_t dma_addr) +{ + struct mpi3_sge_common *sgel = paddr; + + sgel->flags = flags; + sgel->length = cpu_to_le32(length); + sgel->address = cpu_to_le64(dma_addr); +} + +/** + * mpi3mr_build_zero_len_sge - Build zero length SGE + * @paddr: SGE address + * + * Set the length of SGE as 0 and address as all FFs to indicate + * this is a zero length SGE (for no data transfer). + * + * Return: Nothing. + */ +void mpi3mr_build_zero_len_sge(void *paddr) +{ + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + mpi3mr_add_sg_single(paddr, sgl_flags, 0, -1); + +} + +/** + * mpi3mr_get_reply_virt_addr - Map reply buffer DMA address + * @mrioc: Adapter instance reference + * @phys_addr: reply buffer DMA address + * + * Map reply buffer DMA address to virtual address. + * + * Return: NULL on failure, virtual address on success. + */ +void *mpi3mr_get_reply_virt_addr(struct mpi3mr_ioc *mrioc, + dma_addr_t phys_addr) +{ + if (!phys_addr) + return NULL; + + if ((phys_addr < mrioc->reply_buf_dma) || + (phys_addr > mrioc->reply_buf_dma_max_address)) + return NULL; + + return mrioc->reply_buf + (phys_addr - mrioc->reply_buf_dma); +} + +/** + * mpi3mr_get_sensebuf_virt_addr - Map sense buffer DMA address + * @mrioc: Adapter instance reference + * @phys_addr: Sense buffer DMA address + * + * Map sense buffer DMA address to virtual address. + * + * Return: NULL on failure, virtual address on success. + */ +void *mpi3mr_get_sensebuf_virt_addr(struct mpi3mr_ioc *mrioc, + dma_addr_t phys_addr) +{ + if (!phys_addr) + return NULL; + + return mrioc->sense_buf + (phys_addr - mrioc->sense_buf_dma); +} +/** + * mpi3mr_repost_reply_buf - Post replybuffer to queue + * @mrioc: Adapter instance reference + * @reply_dma: Reply buffer DMA address + * + * Store the reply buffer DMA address into a free element in the + * reply buffer free queue and write the host index to the + * reply_free_host_index to let the hardware know a free reply + * buffer is available. + * + * Return: Nothing. + */ +static void mpi3mr_repost_reply_buf(struct mpi3mr_ioc *mrioc, + u64 reply_dma) +{ + u32 old_idx = 0; + unsigned long flags; + + spin_lock_irqsave(&mrioc->reply_free_queue_lock, flags); + old_idx = mrioc->reply_free_queue_host_index; + mrioc->reply_free_queue_host_index = ( + (mrioc->reply_free_queue_host_index == + (mrioc->reply_free_qsz - 1)) ? 
0 : + (mrioc->reply_free_queue_host_index + 1)); + mrioc->reply_free_q[old_idx] = cpu_to_le64(reply_dma); + writel(mrioc->reply_free_queue_host_index, + &mrioc->sysif_regs->reply_free_host_index); + spin_unlock_irqrestore(&mrioc->reply_free_queue_lock, flags); +} + +/** + * mpi3mr_repost_sense_buf - Post sensebuffer to queue + * @mrioc: Adapter instance reference + * @sense_buf_dma: Sense buffer DMA address + * + * Store the sense buffer DMA address into a free element in the + * sense buffer free queue and write the host index to the + * sense_buffer_free_host_index to let the hardware know a free + * buffer is available. + * + * Return: Nothing. + */ +void mpi3mr_repost_sense_buf(struct mpi3mr_ioc *mrioc, + u64 sense_buf_dma) +{ + u32 old_idx = 0; + unsigned long flags; + + spin_lock_irqsave(&mrioc->sbq_lock, flags); + old_idx = mrioc->sbq_host_index; + mrioc->sbq_host_index = ((mrioc->sbq_host_index == + (mrioc->sense_buf_q_sz - 1)) ? 0 : + (mrioc->sbq_host_index + 1)); + mrioc->sense_buf_q[old_idx] = cpu_to_le64(sense_buf_dma); + writel(mrioc->sbq_host_index, + &mrioc->sysif_regs->sense_buffer_free_host_index); + spin_unlock_irqrestore(&mrioc->sbq_lock, flags); +} + + +/** + * mpi3mr_print_event_data - Print event details + * @mrioc: Adapter instance reference + * @event_reply: MPI3 event + * + * Prints the event details when debug level is enabled to print + * events. + * + * Return: Nothing. + */ +static void mpi3mr_print_event_data(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + char *desc = NULL; + u16 event; + + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT)) + return; + event = event_reply->event; + + switch (event) { + case MPI3_EVENT_LOG_DATA: + desc = "Log Data"; + break; + case MPI3_EVENT_CHANGE: + desc = "Event Change"; + break; + case MPI3_EVENT_GPIO_INTERRUPT: + desc = "GPIO Interrupt"; + break; + case MPI3_EVENT_CABLE_MGMT: + desc = "Cable Management"; + break; + case MPI3_EVENT_ENERGY_PACK_CHANGE: + desc = "Energy Pack Change"; + break; + case MPI3_EVENT_DEVICE_ADDED: + { + struct mpi3_device_page0 *event_data = + (struct mpi3_device_page0 *)event_reply->event_data; + ioc_info(mrioc, "Device Added: handle(0x%04x), perst_id(%d), form(0x%02x)\n", + le16_to_cpu(event_data->dev_handle), + le16_to_cpu(event_data->persistent_id), + event_data->device_form); + return; + } + case MPI3_EVENT_DEVICE_INFO_CHANGED: + { + struct mpi3_device_page0 *event_data = + (struct mpi3_device_page0 *)event_reply->event_data; + ioc_info(mrioc, "Device Info Changed: handle(0x%04x), perst_id(%d), form(0x%02x)\n", + le16_to_cpu(event_data->dev_handle), + le16_to_cpu(event_data->persistent_id), + event_data->device_form); + return; + } + case MPI3_EVENT_DEVICE_STATUS_CHANGE: + { + struct mpi3_event_data_device_status_change *event_data = + (struct mpi3_event_data_device_status_change *) + event_reply->event_data; + ioc_info(mrioc, "Device Status Change: handle(0x%04x), reason_code(0x%02x)\n", + le16_to_cpu(event_data->dev_handle), event_data->reason_code); + return; + } + case MPI3_EVENT_SAS_DISCOVERY: + { + struct mpi3_event_data_sas_discovery *event_data = + (struct mpi3_event_data_sas_discovery *) + event_reply->event_data; + ioc_info(mrioc, "SAS Discovery: (%s) status (0x%08x)", + (event_data->reason_code == MPI3_EVENT_SAS_DISC_RC_STARTED) + ? 
"start" : "stop", + le32_to_cpu(event_data->discovery_status)); + return; + } + case MPI3_EVENT_SAS_BROADCAST_PRIMITIVE: + desc = "SAS Broadcast Primitive"; + break; + case MPI3_EVENT_SAS_NOTIFY_PRIMITIVE: + desc = "SAS Notify Primitive"; + break; + case MPI3_EVENT_SAS_INIT_DEVICE_STATUS_CHANGE: + desc = "SAS Init Device Status Change"; + break; + case MPI3_EVENT_SAS_INIT_TABLE_OVERFLOW: + desc = "SAS Init Table Overflow"; + break; + case MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST: + desc = "SAS Topology Change List"; + break; + case MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE: + desc = "Enclosure Device Status Change"; + break; + case MPI3_EVENT_ENCL_DEVICE_ADDED: + desc = "Enclosure Added"; + break; + case MPI3_EVENT_HARD_RESET_RECEIVED: + desc = "Hard Reset Received"; + break; + case MPI3_EVENT_SAS_PHY_COUNTER: + desc = "SAS PHY Counter"; + break; + case MPI3_EVENT_SAS_DEVICE_DISCOVERY_ERROR: + desc = "SAS Device Discovery Error"; + break; + case MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST: + desc = "PCIE Topology Change List"; + break; + case MPI3_EVENT_PCIE_ENUMERATION: + { + struct mpi3_event_data_pcie_enumeration *event_data = + (struct mpi3_event_data_pcie_enumeration *) + event_reply->event_data; + ioc_info(mrioc, "PCIE Enumeration: (%s)", + (event_data->reason_code == + MPI3_EVENT_PCIE_ENUM_RC_STARTED) ? "start" : "stop"); + if (event_data->enumeration_status) + ioc_info(mrioc, "enumeration_status(0x%08x)\n", + le32_to_cpu(event_data->enumeration_status)); + return; + } + case MPI3_EVENT_PREPARE_FOR_RESET: + desc = "Prepare For Reset"; + break; + case MPI3_EVENT_DIAGNOSTIC_BUFFER_STATUS_CHANGE: + desc = "Diagnostic Buffer Status Change"; + break; + } + + if (!desc) + return; + + ioc_info(mrioc, "%s\n", desc); +} + +/** + * mpi3mr_handle_events - Handle events + * @mrioc: Adapter instance reference + * @def_reply: MPI3 default reply + * + * Prints the event details and call the consumer of the events. + * + * Return: Nothing. + */ +static void mpi3mr_handle_events(struct mpi3mr_ioc *mrioc, + struct mpi3_default_reply *def_reply) +{ + struct mpi3_event_notification_reply *event_reply = + (struct mpi3_event_notification_reply *)def_reply; + + mrioc->change_count = le16_to_cpu(event_reply->ioc_change_count); + mpi3mr_print_event_data(mrioc, event_reply); + + mpi3mr_os_handle_events(mrioc, event_reply); +} + +/** + * mpi3mr_get_drv_cmd - Get driver command from host tag + * @mrioc: Adapter instance reference + * @host_tag: Host tag + * @def_reply: MPI3 default reply + * + * Checks the host tag and if it is driver's internal identify + * the corresponding command tracker reference and return. If + * the hosttag is invalid then it is an MPI3 event and event + * processing routine is called and NULL is returned. If the + * host tag is unindentifiable then also NULL is returned. + * + * Return: Null for events/failure or internal command tracker. 
+ */ +static struct mpi3mr_drv_cmd * +mpi3mr_get_drv_cmd(struct mpi3mr_ioc *mrioc, u16 host_tag, + struct mpi3_default_reply *def_reply) +{ + u16 idx; + + switch (host_tag) { + case MPI3MR_HOSTTAG_INITCMDS: + return &mrioc->init_cmds; + case MPI3MR_HOSTTAG_CFG_CMDS: + return &mrioc->cfg_cmds; + case MPI3MR_HOSTTAG_BSG_CMDS: + return &mrioc->bsg_cmds; + case MPI3MR_HOSTTAG_BLK_TMS: + return &mrioc->host_tm_cmds; + case MPI3MR_HOSTTAG_PEL_ABORT: + return &mrioc->pel_abort_cmd; + case MPI3MR_HOSTTAG_PEL_WAIT: + return &mrioc->pel_cmds; + case MPI3MR_HOSTTAG_TRANSPORT_CMDS: + return &mrioc->transport_cmds; + case MPI3MR_HOSTTAG_INVALID: + if (def_reply && def_reply->function == + MPI3_FUNCTION_EVENT_NOTIFICATION) + mpi3mr_handle_events(mrioc, def_reply); + return NULL; + default: + break; + } + if (host_tag >= MPI3MR_HOSTTAG_DEVRMCMD_MIN && + host_tag <= MPI3MR_HOSTTAG_DEVRMCMD_MAX) { + idx = host_tag - MPI3MR_HOSTTAG_DEVRMCMD_MIN; + return &mrioc->dev_rmhs_cmds[idx]; + } + if (host_tag >= MPI3MR_HOSTTAG_SYSFS_TM_MIN && + host_tag <= MPI3MR_HOSTTAG_SYSFS_TM_MAX) { + idx = host_tag - MPI3MR_HOSTTAG_SYSFS_TM_MIN; + return &mrioc->sysfs_tm_cmds[idx]; + } + if (host_tag >= MPI3MR_HOSTTAG_EVTACKCMD_MIN && + host_tag <= MPI3MR_HOSTTAG_EVTACKCMD_MAX) { + idx = host_tag - MPI3MR_HOSTTAG_EVTACKCMD_MIN; + return &mrioc->evtack_cmds[idx]; + } + + return NULL; +} + +/** + * mpi3mr_process_admin_reply_desc - Admin reply descriptor + * handler + * @mrioc: Adapter instance reference + * @reply_desc: Reply descriptor + * @reply_dma: Place holder for reply frames dma + * + * Checks the type of the reply descriptor and infer the + * descriptor as defined in MPI3.0 specification and wake any of + * the functions waiting for the reply. + * + * Return: Nothing. + */ +static void mpi3mr_process_admin_reply_desc(struct mpi3mr_ioc *mrioc, + struct mpi3_default_reply_descriptor *reply_desc, u64 *reply_dma) +{ + u16 reply_desc_type, host_tag = 0; + u16 ioc_status = MPI3_IOCSTATUS_SUCCESS; + u32 ioc_loginfo = 0; + struct mpi3_status_reply_descriptor *status_desc; + struct mpi3_address_reply_descriptor *addr_desc; + struct mpi3_success_reply_descriptor *success_desc; + struct mpi3_default_reply *def_reply = NULL; + struct mpi3mr_drv_cmd *cmdptr = NULL; + struct mpi3_scsi_io_reply *scsi_reply; + u8 *sense_buf = NULL; + + *reply_dma = 0; + reply_desc_type = le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_TYPE_MASK; + switch (reply_desc_type) { + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_STATUS: + status_desc = (struct mpi3_status_reply_descriptor *)reply_desc; + host_tag = le16_to_cpu(status_desc->host_tag); + ioc_status = le16_to_cpu(status_desc->ioc_status); + if (ioc_status & + MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL) + ioc_loginfo = le32_to_cpu(status_desc->ioc_log_info); + ioc_status &= MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK; + mpi3mr_reply_trigger(mrioc, ioc_status, ioc_loginfo); + break; + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_ADDRESS_REPLY: + addr_desc = (struct mpi3_address_reply_descriptor *)reply_desc; + *reply_dma = le64_to_cpu(addr_desc->reply_frame_address); + def_reply = mpi3mr_get_reply_virt_addr(mrioc, *reply_dma); + if (!def_reply) + goto out; + host_tag = le16_to_cpu(def_reply->host_tag); + ioc_status = le16_to_cpu(def_reply->ioc_status); + if (ioc_status & + MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL) + ioc_loginfo = le32_to_cpu(def_reply->ioc_log_info); + ioc_status &= MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK; + if (def_reply->function == 
MPI3_FUNCTION_SCSI_IO) { + scsi_reply = (struct mpi3_scsi_io_reply *)def_reply; + sense_buf = mpi3mr_get_sensebuf_virt_addr(mrioc, + le64_to_cpu(scsi_reply->sense_data_buffer_address)); + } + mpi3mr_reply_trigger(mrioc, ioc_status, ioc_loginfo); + break; + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_SUCCESS: + success_desc = (struct mpi3_success_reply_descriptor *) + reply_desc; + host_tag = le16_to_cpu(success_desc->host_tag); + break; + default: + break; + } + + cmdptr = mpi3mr_get_drv_cmd(mrioc, host_tag, def_reply); + if (cmdptr) { + if (cmdptr->state & MPI3MR_CMD_PENDING) { + cmdptr->state |= MPI3MR_CMD_COMPLETE; + cmdptr->ioc_loginfo = ioc_loginfo; + cmdptr->ioc_status = ioc_status; + cmdptr->state &= ~MPI3MR_CMD_PENDING; + if (def_reply) { + cmdptr->state |= MPI3MR_CMD_REPLY_VALID; + memcpy((u8 *)cmdptr->reply, (u8 *)def_reply, + mrioc->reply_sz); + } + if (sense_buf && cmdptr->sensebuf) { + cmdptr->is_sense = 1; + memcpy(cmdptr->sensebuf, sense_buf, + MPI3MR_SENSE_BUF_SZ); + } + if (cmdptr->is_waiting) { + complete(&cmdptr->done); + cmdptr->is_waiting = 0; + } else if (cmdptr->callback) + cmdptr->callback(mrioc, cmdptr); + } + } +out: + if (sense_buf) + mpi3mr_repost_sense_buf(mrioc, + le64_to_cpu(scsi_reply->sense_data_buffer_address)); +} + +/** + * mpi3mr_process_admin_reply_q - Admin reply queue handler + * @mrioc: Adapter instance reference + * + * Checks the admin reply queue and drains the reply queue until + * the queue is empty and process the individual reply + * descriptors. Post the controller with proper consumer index. + * + * Return: Number of reply descriptors processed. + */ +int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc) +{ + u32 exp_phase = mrioc->admin_reply_ephase; + u32 admin_reply_ci = mrioc->admin_reply_ci; + u32 num_admin_replies = 0; + u64 reply_dma = 0; + struct mpi3_default_reply_descriptor *reply_desc; + + if (!atomic_add_unless(&mrioc->admin_reply_q_in_use, 1, 1)) + return 0; + + reply_desc = (struct mpi3_default_reply_descriptor *) + mrioc->admin_reply_base + admin_reply_ci; + + if ((le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase) { + atomic_dec(&mrioc->admin_reply_q_in_use); + return 0; + } + + do { + if (mrioc->unrecoverable) + break; + mrioc->admin_req_ci = le16_to_cpu(reply_desc->request_queue_ci); + mpi3mr_process_admin_reply_desc(mrioc, reply_desc, &reply_dma); + if (reply_dma) + mpi3mr_repost_reply_buf(mrioc, reply_dma); + num_admin_replies++; + if (++admin_reply_ci == mrioc->num_admin_replies) { + admin_reply_ci = 0; + exp_phase ^= 1; + } + reply_desc = + (struct mpi3_default_reply_descriptor *) + mrioc->admin_reply_base + admin_reply_ci; + if ((le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase) + break; + } while (1); + + writel(admin_reply_ci, &mrioc->sysif_regs->admin_reply_queue_ci); + mrioc->admin_reply_ci = admin_reply_ci; + mrioc->admin_reply_ephase = exp_phase; + atomic_dec(&mrioc->admin_reply_q_in_use); + + return num_admin_replies; +} + +/** + * mpi3mr_get_reply_desc - Get reply descriptor + * @op_reply_q: Operational reply queue info + * @reply_ci: Operational reply queue consumer index + * + * Get reply descriptor frame corresponding to a operational + * reply queue's consumer index + * + * Return: Reply descriptor address + */ +static inline struct mpi3_default_reply_descriptor * +mpi3mr_get_reply_desc(struct op_reply_qinfo *op_reply_q, u32 reply_ci) +{ + void *segment_base_addr; + struct segments *segments = op_reply_q->q_segments; + struct 
mpi3_default_reply_descriptor *reply_desc = NULL; + + segment_base_addr = + segments[reply_ci / op_reply_q->segment_qd].segment; + reply_desc = (struct mpi3_default_reply_descriptor *)segment_base_addr + + (reply_ci % op_reply_q->segment_qd); + return reply_desc; +} + +/** + * mpi3mr_process_op_reply_q - Operational reply queue handler + * @mrioc: Adapter instance reference + * @op_reply_q: Operational reply queue info + * + * Checks the specific operational reply queue and drains the + * reply queue entries until the queue is empty and process the + * individual reply descriptors. + * + * Return: 0 if queue is already processed,or number of reply + * descriptors processed. + */ +int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc, + struct op_reply_qinfo *op_reply_q) +{ + struct op_req_qinfo *op_req_q; + u32 exp_phase; + u32 reply_ci; + u32 num_op_reply = 0; + u64 reply_dma = 0; + struct mpi3_default_reply_descriptor *reply_desc; + u16 req_q_idx = 0, reply_qidx; + + reply_qidx = op_reply_q->qid - 1; + + if (!atomic_add_unless(&op_reply_q->in_use, 1, 1)) + return 0; + + exp_phase = op_reply_q->ephase; + reply_ci = op_reply_q->ci; + + reply_desc = mpi3mr_get_reply_desc(op_reply_q, reply_ci); + if ((le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase) { + atomic_dec(&op_reply_q->in_use); + return 0; + } + + do { + if (mrioc->unrecoverable) + break; + req_q_idx = le16_to_cpu(reply_desc->request_queue_id) - 1; + op_req_q = &mrioc->req_qinfo[req_q_idx]; + + WRITE_ONCE(op_req_q->ci, + le16_to_cpu(reply_desc->request_queue_ci)); + + mpi3mr_process_op_reply_desc(mrioc, reply_desc, &reply_dma, + reply_qidx); + atomic_dec(&op_reply_q->pend_ios); + +#if defined(IO_COUNTER_SUPPORT) + atomic_dec(&mrioc->pend_ios); +#endif + if (reply_dma) + mpi3mr_repost_reply_buf(mrioc, reply_dma); + num_op_reply++; + + if (++reply_ci == op_reply_q->num_replies) { + reply_ci = 0; + exp_phase ^= 1; + } + + reply_desc = mpi3mr_get_reply_desc(op_reply_q, reply_ci); + + if ((le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase) + break; +#ifndef CONFIG_PREEMPT_RT + /* + * Exit completion loop to avoid CPU lockup + * Ensure remaining completion happens from threaded ISR. + */ + if (num_op_reply > mrioc->max_host_ios) { + op_reply_q->enable_irq_poll = true; + break; + } +#endif + } while (1); + + + writel(reply_ci, + &mrioc->sysif_regs->oper_queue_indexes[reply_qidx].consumer_index); + op_reply_q->ci = reply_ci; + op_reply_q->ephase = exp_phase; + atomic_dec(&op_reply_q->in_use); + + return num_op_reply; +} + +#if (KERNEL_VERSION(5, 12, 0) <= LINUX_VERSION_CODE) +/** + * mpi3mr_blk_mq_poll - Operational reply queue handler + * @shost: SCSI Host reference + * @queue_num: Request queue number (w.r.t OS it is hardware context number) + * + * Checks the specific operational reply queue and drains the + * reply queue entries until the queue is empty and process the + * individual reply descriptors. + * + * Return: 0 if queue is already processed,or number of reply + * descriptors processed. 
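Both the admin and operational handlers drain their reply rings with the expected-phase technique described above: a descriptor is consumed only while its phase bit matches a locally tracked value, and that value toggles on every wrap of the consumer index. A stripped-down sketch under invented names (the real descriptors, masks and post-processing live in the driver):

#include <linux/types.h>
#include <asm/byteorder.h>

#define EXAMPLE_PHASE_MASK	0x0001	/* assume the phase bit is bit 0 */

struct example_reply_desc {
	__le16 reply_flags;
};

static u32 example_drain_reply_q(struct example_reply_desc *ring,
				 u32 num_entries, u32 *ci, u8 *exp_phase,
				 void (*consume)(struct example_reply_desc *))
{
	u32 done = 0;

	while ((le16_to_cpu(ring[*ci].reply_flags) &
		EXAMPLE_PHASE_MASK) == *exp_phase) {
		consume(&ring[*ci]);
		done++;
		if (++(*ci) == num_entries) {
			*ci = 0;
			*exp_phase ^= 1;	/* wrap: stale entries now mismatch */
		}
	}
	return done;
}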
+ */ +int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num) +{ + int num_entries = 0; + struct mpi3mr_ioc *mrioc; + + mrioc = (struct mpi3mr_ioc *)shost->hostdata; + + if ((mrioc->reset_in_progress || mrioc->prepare_for_reset + || mrioc->unrecoverable)) + return 0; + + num_entries = mpi3mr_process_op_reply_q(mrioc, + &mrioc->op_reply_qinfo[queue_num]); + + return num_entries; +} +#else +int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num) +{ + return 0; +} +#endif +/** + * mpi3mr_isr_primary - Interrupt Handler worker + * @irq: IRQ + * @privdata: Interrupt info + * + * Checks for the MSIx index, if it is 0 calls admin reply queue + * processing routine. If the MSIx has an associated operational + * reply queue, the operational reply processing routine is + * called too. + * + * Return: IRQ_NONE or IRQ_HANDLED + */ +static irqreturn_t mpi3mr_isr_primary(int irq, void *privdata) +{ + struct mpi3mr_intr_info *intr_info = privdata; + struct mpi3mr_ioc *mrioc; + u16 midx; + u32 num_admin_replies = 0, num_op_reply = 0; + + if (!intr_info) + return IRQ_NONE; + + mrioc = intr_info->mrioc; + + if (!mrioc->intr_enabled) + return IRQ_NONE; + + midx = intr_info->msix_index; + + if (!midx) + num_admin_replies = mpi3mr_process_admin_reply_q(mrioc); + if (intr_info->op_reply_q) + num_op_reply = mpi3mr_process_op_reply_q(mrioc, + intr_info->op_reply_q); + + if (num_admin_replies || num_op_reply) + return IRQ_HANDLED; + else + return IRQ_NONE; +} + +#ifndef CONFIG_PREEMPT_RT +/** + * mpi3mr_isr - Interrupt Handler + * @irq: IRQ + * @privdata: Interrupt info + * + * Executes reply queue draining and processing of reply + * descriptors by calling mpi3mr_isr_primary and if more replies + * are exepcted, schedule an IRQ polling thread. + * + * Return: IRQ_NONE or IRQ_HANDLED or IRQ_WAKE_THREAD + */ +static irqreturn_t mpi3mr_isr(int irq, void *privdata) +{ + struct mpi3mr_intr_info *intr_info = privdata; + struct mpi3mr_ioc *mrioc; + u16 midx; + int ret; + + if (!intr_info) + return IRQ_NONE; + + mrioc = intr_info->mrioc; + midx = intr_info->msix_index; + /* Call primary ISR routine */ + ret = mpi3mr_isr_primary(irq, privdata); + + /* + * If more IOs are expected, schedule IRQ polling thread. + * Otherwise exit from ISR. + */ + if (!intr_info->op_reply_q) + return ret; + + if (!intr_info->op_reply_q->enable_irq_poll || + !atomic_read(&intr_info->op_reply_q->pend_ios)) + return ret; + + disable_irq_nosync(pci_irq_vector(mrioc->pdev, midx)); + + return IRQ_WAKE_THREAD; +} + +/** + * mpi3mr_isr_poll - ISR thread. Reply queue polling routine + * @irq: IRQ + * @privdata: Interrupt info + * + * Threaded ISR, polls for pending I/O completions in a loop + * until pending I/Os present or controller queue depth I/Os are + * processed. 
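The split between mpi3mr_isr()/mpi3mr_isr_primary() and mpi3mr_isr_poll() is the standard hard-IRQ/threaded-IRQ handoff: when more completions are expected, the hard handler masks its vector and returns IRQ_WAKE_THREAD so the threaded handler can keep polling without re-entry. A minimal, hypothetical sketch (not part of the patch):

#include <linux/interrupt.h>

struct example_ctx {
	bool more_work_expected;	/* e.g. pending I/Os on this queue */
};

static irqreturn_t example_hardirq(int irq, void *data)
{
	struct example_ctx *ctx = data;

	/* handle the cheap part here, in hard-IRQ context */
	if (!ctx->more_work_expected)
		return IRQ_HANDLED;

	disable_irq_nosync(irq);	/* mask this vector until polling is done */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t example_threadfn(int irq, void *data)
{
	/* ... drain queues in a sleepable context ... */
	enable_irq(irq);
	return IRQ_HANDLED;
}

/* Registered with:
 * request_threaded_irq(irq, example_hardirq, example_threadfn,
 *			IRQF_SHARED, "example", ctx);
 */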
+ * + * Return: IRQ_NONE or IRQ_HANDLED + */ +static irqreturn_t mpi3mr_isr_poll(int irq, void *privdata) +{ + struct mpi3mr_intr_info *intr_info = privdata; + struct mpi3mr_ioc *mrioc; + u16 midx; + u32 num_op_reply = 0; + + if (!intr_info || !intr_info->op_reply_q) + return IRQ_NONE; + + mrioc = intr_info->mrioc; + midx = intr_info->msix_index; + + /* Poll for pending IOs completions */ + do { + if (!mrioc->intr_enabled || mrioc->unrecoverable) + break; + + if (!midx) + mpi3mr_process_admin_reply_q(mrioc); + if (intr_info->op_reply_q) + num_op_reply += + mpi3mr_process_op_reply_q(mrioc, + intr_info->op_reply_q); + + usleep_range(MPI3MR_IRQ_POLL_SLEEP, 10 * MPI3MR_IRQ_POLL_SLEEP); + + } while (atomic_read(&intr_info->op_reply_q->pend_ios) && + (num_op_reply < mrioc->max_host_ios)); + + /*SP2DO - There can be some IO timeout/pause if driver exit above loop + * because of num_orep < mrioc->max_host_ios check. + * It will happen only if application has stopped IO activity and above + * check this just prior to application stopped. Very difficult to + * reproduce. Investigate and fix this area later. + */ + intr_info->op_reply_q->enable_irq_poll = false; + enable_irq(pci_irq_vector(mrioc->pdev, midx)); + + return IRQ_HANDLED; +} +#endif + +/** + * mpi3mr_request_irq - Request IRQ and register ISR + * @mrioc: Adapter instance reference + * @index: IRQ vector index + * + * Request threaded ISR with primary ISR and secondary + * + * Return: 0 on success and non zero on failures. + */ +static inline int mpi3mr_request_irq(struct mpi3mr_ioc *mrioc, u16 index) +{ + struct pci_dev *pdev = mrioc->pdev; + struct mpi3mr_intr_info *intr_info = mrioc->intr_info + index; + int retval = 0; + + intr_info->mrioc = mrioc; + intr_info->msix_index = index; + intr_info->op_reply_q = NULL; + + snprintf(intr_info->name, MPI3MR_NAME_LENGTH, "%s%d-msix%d", + mrioc->driver_name, mrioc->id, index); + +#ifndef CONFIG_PREEMPT_RT + retval = request_threaded_irq(pci_irq_vector(pdev, index), mpi3mr_isr, + mpi3mr_isr_poll, IRQF_SHARED, intr_info->name, intr_info); +#else + retval = request_threaded_irq(pci_irq_vector(pdev, index), mpi3mr_isr_primary, + NULL, IRQF_SHARED, intr_info->name, intr_info); +#endif + if (retval) { + ioc_err(mrioc, "%s: unable to allocate interrupt %d!\n", + intr_info->name, pci_irq_vector(pdev, index)); + return retval; + } + + return retval; +} + +static void mpi3mr_calc_poll_queues(struct mpi3mr_ioc *mrioc, u16 max_vectors) +{ + if (!mrioc->requested_poll_qcount) + return; + + /* Reserved for Admin and Default Queue */ + if (max_vectors > 2 && + (mrioc->requested_poll_qcount < max_vectors - 2)) { + ioc_info(mrioc, + "enabled polled queues (%d) msix (%d)\n", + mrioc->requested_poll_qcount, max_vectors); + } else { + ioc_info(mrioc, + "disabled polled queues (%d) msix (%d) because of no resources for default queue\n", + mrioc->requested_poll_qcount, max_vectors); + mrioc->requested_poll_qcount = 0; + } +} + +/** + * mpi3mr_setup_isr - Setup ISR for the controller + * @mrioc: Adapter instance reference + * @setup_one: Request one IRQ or more + * + * Allocate IRQ vectors and call mpi3mr_request_irq to setup ISR + * + * Return: 0 on success and non zero on failures. 
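mpi3mr_setup_isr() relies on the managed-affinity allocator to spread most vectors across CPUs while keeping the admin/default vector and any polled-queue vectors out of the affinity set. A hedged sketch of that allocation call, with hypothetical counts:

#include <linux/pci.h>
#include <linux/interrupt.h>

static int example_alloc_vectors(struct pci_dev *pdev, int poll_queues,
				 int max_vectors)
{
	struct irq_affinity desc = {
		.pre_vectors  = 1,		/* admin/default queue vector */
		.post_vectors = poll_queues,	/* polled queues: no affinity spreading */
	};
	int min_vec = desc.pre_vectors + desc.post_vectors;

	return pci_alloc_irq_vectors_affinity(pdev, min_vec, max_vectors,
					      PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
					      &desc);
}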
+ */ +static int mpi3mr_setup_isr(struct mpi3mr_ioc *mrioc, u8 setup_one) +{ + unsigned int irq_flags = PCI_IRQ_MSIX; + int max_vectors, min_vec; + int retval; + int i; + struct irq_affinity desc = { .pre_vectors = 1, .post_vectors = 1 }; + + if (mrioc->is_intr_info_set) + return 0; + + mpi3mr_cleanup_isr(mrioc); + + if (setup_one || reset_devices) { + max_vectors = 1; + retval = pci_alloc_irq_vectors(mrioc->pdev, + 1, max_vectors, irq_flags); + if (retval < 0) { + ioc_err(mrioc, "cannot allocate irq vectors, ret %d\n", + retval); + goto out_failed; + } + } else { + max_vectors = + min_t(int, mrioc->cpu_count + 1 + + mrioc->requested_poll_qcount, mrioc->msix_count); + + mpi3mr_calc_poll_queues(mrioc, max_vectors); + + ioc_info(mrioc, + "MSI-X vectors supported: %d, no of cores: %d,", + mrioc->msix_count, mrioc->cpu_count); + ioc_info(mrioc, + "MSI-x vectors requested: %d poll_queues %d\n", + max_vectors, mrioc->requested_poll_qcount); + + desc.post_vectors = mrioc->requested_poll_qcount; + min_vec = desc.pre_vectors + desc.post_vectors; + irq_flags |= PCI_IRQ_AFFINITY | PCI_IRQ_ALL_TYPES; + + retval = pci_alloc_irq_vectors_affinity(mrioc->pdev, + min_vec, max_vectors, irq_flags, &desc); + + if (retval < 0) { + ioc_err(mrioc, "cannot allocate irq vectors, ret %d\n", + retval); + goto out_failed; + } + + + /* + * If only one MSI-x is allocated, then MSI-x 0 will be shared + * between Admin queue and operational queue + */ + if (retval == min_vec) + mrioc->op_reply_q_offset = 0; + else if (retval != (max_vectors)) { + ioc_info(mrioc, + "allocated vectors (%d) are less than configured (%d)\n", + retval, max_vectors); + } + + max_vectors = retval; + mrioc->op_reply_q_offset = (max_vectors > 1) ? 1 : 0; + + mpi3mr_calc_poll_queues(mrioc, max_vectors); + + } + + mrioc->intr_info = kzalloc(sizeof(struct mpi3mr_intr_info)*max_vectors, + GFP_KERNEL); + if (!mrioc->intr_info) { + retval = -ENOMEM; + pci_free_irq_vectors(mrioc->pdev); + goto out_failed; + } + for (i = 0; i < max_vectors; i++) { + retval = mpi3mr_request_irq(mrioc, i); + if (retval) { + mrioc->intr_info_count = i; + goto out_failed; + } + } + if (reset_devices || !setup_one) + mrioc->is_intr_info_set = true; + mrioc->intr_info_count = max_vectors; + mpi3mr_ioc_enable_intr(mrioc); + return 0; +out_failed: + mpi3mr_cleanup_isr(mrioc); + + return retval; +} + +static const struct { + enum mpi3mr_drv_db_level value; + char *name; +} mpi3mr_drv_db[] = { + { MRIOC_DRV_DB_DISABLED, "disabled (uefi dump is enabled)" }, + { MRIOC_DRV_DB_MINI, "minidump" }, + { MRIOC_DRV_DB_FULL, "fulldump" }, +}; +static const char *mpi3mr_drv_db_name(enum mpi3mr_drv_db_level drv_db_level) +{ + int i; + char *name = NULL; + + /* Start with Disabled */ + name = mpi3mr_drv_db[0].name; + + for (i = 0; i < ARRAY_SIZE(mpi3mr_drv_db); i++) { + if (mpi3mr_drv_db[i].value == drv_db_level) { + name = mpi3mr_drv_db[i].name; + break; + } + } + return name; +} + +static const struct { + enum mpi3mr_iocstate value; + char *name; +} mrioc_states[] = { + { MRIOC_STATE_READY, "ready" }, + { MRIOC_STATE_FAULT, "fault" }, + { MRIOC_STATE_RESET, "reset" }, + { MRIOC_STATE_BECOMING_READY, "becoming ready" }, + { MRIOC_STATE_RESET_REQUESTED, "reset requested" }, + { MRIOC_STATE_UNRECOVERABLE, "unrecoverable error" }, +}; + +static const char *mpi3mr_iocstate_name(enum mpi3mr_iocstate mrioc_state) +{ + int i; + char *name = NULL; + + for (i = 0; i < ARRAY_SIZE(mrioc_states); i++) { + if (mrioc_states[i].value == mrioc_state) { + name = mrioc_states[i].name; + break; + } + } + return 
name;
+}
+
+/* Reset reason to name mapper structure */
+static const struct {
+	enum mpi3mr_reset_reason value;
+	char *name;
+} mpi3mr_reset_reason_codes[] = {
+	{ MPI3MR_RESET_FROM_BRINGUP, "bringup" },
+	{ MPI3MR_RESET_FROM_FAULT_WATCH, "fault" },
+	{ MPI3MR_RESET_FROM_APP, "application invocation" },
+	{ MPI3MR_RESET_FROM_EH_HOS, "host reset from the OS" },
+	{ MPI3MR_RESET_FROM_TM_TIMEOUT, "task management timeout" },
+	{ MPI3MR_RESET_FROM_APP_TIMEOUT, "application command timeout" },
+	{ MPI3MR_RESET_FROM_MUR_FAILURE, "message unit reset failure" },
+	{ MPI3MR_RESET_FROM_CTLR_CLEANUP, "controller cleanup" },
+	{ MPI3MR_RESET_FROM_CIACTIV_FAULT, "component image activation fault" },
+	{ MPI3MR_RESET_FROM_PE_TIMEOUT, "port enable timeout" },
+	{ MPI3MR_RESET_FROM_TSU_TIMEOUT, "time stamp update timeout" },
+	{ MPI3MR_RESET_FROM_DELREQQ_TIMEOUT, "delete request queue timeout" },
+	{ MPI3MR_RESET_FROM_DELREPQ_TIMEOUT, "delete reply queue timeout" },
+	{
+		MPI3MR_RESET_FROM_CREATEREPQ_TIMEOUT,
+		"create reply queue timeout"
+	},
+	{
+		MPI3MR_RESET_FROM_CREATEREQQ_TIMEOUT,
+		"create request queue timeout"
+	},
+	{ MPI3MR_RESET_FROM_IOCFACTS_TIMEOUT, "ioc_facts timeout" },
+	{ MPI3MR_RESET_FROM_IOCINIT_TIMEOUT, "ioc_init timeout" },
+	{ MPI3MR_RESET_FROM_EVTNOTIFY_TIMEOUT, "event notification timeout" },
+	{ MPI3MR_RESET_FROM_EVTACK_TIMEOUT, "event acknowledgment timeout" },
+	{
+		MPI3MR_RESET_FROM_CIACTVRST_TIMER,
+		"component image activation timeout"
+	},
+	{
+		MPI3MR_RESET_FROM_GETPKGVER_TIMEOUT,
+		"get package version timeout"
+	},
+	{
+		MPI3MR_RESET_FROM_PELABORT_TIMEOUT,
+		"persistent event log abort timeout"
+	},
+	{ MPI3MR_RESET_FROM_SYSFS, "sysfs invocation" },
+	{ MPI3MR_RESET_FROM_SYSFS_TIMEOUT, "sysfs task management timeout" },
+	{
+		MPI3MR_RESET_FROM_DIAG_BUFFER_POST_TIMEOUT,
+		"diagnostic buffer post timeout"
+	},
+	{
+		MPI3MR_RESET_FROM_DIAG_BUFFER_RELEASE_TIMEOUT,
+		"diagnostic buffer release timeout"
+	},
+	{ MPI3MR_RESET_FROM_FIRMWARE, "firmware asynchronous reset" },
+	{
+		MPI3MR_RESET_FROM_DIAG_BUFFER_UPLOAD_TIMEOUT,
+		"diagnostic buffer upload timeout"
+	},
+	{ MPI3MR_RESET_FROM_CFG_REQ_TIMEOUT, "configuration request timeout"},
+	{
+		MPI3MR_RESET_FROM_SAS_TRANSPORT_TIMEOUT,
+		"timeout of a SAS transport layer request"
+	},
+	{ MPI3MR_RESET_FROM_TRIGGER, "automatic firmware diagnostic trigger"},
+};
+
+/**
+ * mpi3mr_reset_rc_name - get reset reason code name
+ * @reason_code: reset reason code value
+ *
+ * Map a reset reason to a NULL-terminated ASCII string
+ *
+ * Return: name corresponding to reset reason value or NULL.
+ */
+static const char *mpi3mr_reset_rc_name(enum mpi3mr_reset_reason reason_code)
+{
+	int i;
+	char *name = NULL;
+
+	for (i = 0; i < ARRAY_SIZE(mpi3mr_reset_reason_codes); i++) {
+		if (mpi3mr_reset_reason_codes[i].value == reason_code) {
+			name = mpi3mr_reset_reason_codes[i].name;
+			break;
+		}
+	}
+	return name;
+}
+
+/* Reset type to name mapper structure */
+static const struct {
+	u16 reset_type;
+	char *name;
+} mpi3mr_reset_types[] = {
+	{ MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, "soft" },
+	{ MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, "diag fault" },
+};
+
+/**
+ * mpi3mr_reset_type_name - get reset type name
+ * @reset_type: reset type value
+ *
+ * Map a reset type to a NULL-terminated ASCII string
+ *
+ * Return: name corresponding to reset type value or NULL.
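One way to make value-to-name tables like the two above harder to get wrong is to index the array by the enum itself with designated initializers, so an entry can never be paired with the wrong string and lookup becomes O(1). This is only an alternative sketch with invented values, not what the patch does:

enum example_reset_reason {
	EXAMPLE_RESET_FROM_BRINGUP = 1,
	EXAMPLE_RESET_FROM_FAULT_WATCH,
	EXAMPLE_RESET_REASON_COUNT
};

static const char * const example_reset_reason_names[] = {
	[EXAMPLE_RESET_FROM_BRINGUP]	 = "bringup",
	[EXAMPLE_RESET_FROM_FAULT_WATCH] = "fault",
};

static const char *example_reset_reason_name(enum example_reset_reason rc)
{
	if (rc >= EXAMPLE_RESET_REASON_COUNT || !example_reset_reason_names[rc])
		return "unknown";
	return example_reset_reason_names[rc];
}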
+ */ +static const char *mpi3mr_reset_type_name(u16 reset_type) +{ + int i; + char *name = NULL; + + for (i = 0; i < ARRAY_SIZE(mpi3mr_reset_types); i++) { + if (mpi3mr_reset_types[i].reset_type == reset_type) { + name = mpi3mr_reset_types[i].name; + break; + } + } + return name; +} + +/** + * mpi3mr_print_fault_info - Display fault information + * @mrioc: Adapter instance reference + * + * Display the controller fault information if there is a + * controller fault. + * + * Return: Nothing. + */ +void mpi3mr_print_fault_info(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_status, code, code1, code2, code3; + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + + if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) { + code = readl(&mrioc->sysif_regs->fault); + code1 = readl(&mrioc->sysif_regs->fault_info[0]); + code2 = readl(&mrioc->sysif_regs->fault_info[1]); + code3 = readl(&mrioc->sysif_regs->fault_info[2]); + + ioc_info(mrioc, + "fault code(0x%08X): additional code: (0x%08X:0x%08X:0x%08X)\n", + code, code1, code2, code3); + } +} + +/** + * mpi3mr_get_iocstate - Get IOC State + * @mrioc: Adapter instance reference + * + * Return a proper IOC state enum based on the IOC status and + * IOC configuration and unrecoverable state of the controller. + * + * Return: Current IOC state. + */ +enum mpi3mr_iocstate mpi3mr_get_iocstate(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_status, ioc_config; + u8 ready, enabled; + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + + if (mrioc->unrecoverable) + return MRIOC_STATE_UNRECOVERABLE; + if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) + return MRIOC_STATE_FAULT; + + ready = (ioc_status & MPI3_SYSIF_IOC_STATUS_READY); + enabled = (ioc_config & MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC); + + if (ready && enabled) + return MRIOC_STATE_READY; + if ((!ready) && (!enabled)) + return MRIOC_STATE_RESET; + if ((!ready) && (enabled)) + return MRIOC_STATE_BECOMING_READY; + + return MRIOC_STATE_RESET_REQUESTED; +} + +/** + * mpi3mr_do_uefi_dump - copy uefi logs + * @mrioc: Adapter instance reference + * + * Return: next available location in driver diag buffer. 
+ */ +static int mpi3mr_do_uefi_dump(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_driver_buffer_header *drv_buff_header = NULL; + int pos_uefi_dump = 0, pos_uefi_end = 0; + u32 data_len; + + if (!mrioc->uefi_logs) + return pos_uefi_dump; + + data_len = mrioc->uefi_logs_sz; + memcpy(mrioc->drv_diag_buffer, mrioc->uefi_logs, data_len); + drv_buff_header = + (struct mpi3_driver_buffer_header *)mrioc->drv_diag_buffer; + + pos_uefi_dump = sizeof(struct mpi3_driver_buffer_header); + if (drv_buff_header->signature == 0x43495243) { + pos_uefi_end = + min_t(int, + data_len - sizeof(struct mpi3_driver_buffer_header), + drv_buff_header->circular_buffer_size - 1); + ioc_info(mrioc, + "UEFI logs has valid header size %d\n", + drv_buff_header->circular_buffer_size); + pos_uefi_dump += pos_uefi_end; + } else { + pos_uefi_dump += + min_t(int, data_len, + MPI3MR_UEFI_DIAG_HOST_BUFFER_OFFSET); + ioc_info(mrioc, "UEFI logs has invalid header\n"); + } + + drv_buff_header->signature = 0x43495243; + drv_buff_header->logical_buffer_start = 0; + drv_buff_header->circular_buffer_size = mrioc->drv_diag_buffer_sz + - sizeof(struct mpi3_driver_buffer_header); + drv_buff_header->flags = + MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_ASCII; + drv_buff_header->logical_buffer_end = + pos_uefi_dump - sizeof(struct mpi3_driver_buffer_header); + ioc_info(mrioc, "UEFI Logs offset 0x%04x logical_buffer_end 0x%04x\n", + pos_uefi_dump, drv_buff_header->logical_buffer_end); + + return pos_uefi_dump; +} + +/** + * mpi3mr_do_mini_dump - copy system logs associated with mrioc. + * @mrioc: Adapter instance reference + * @prev_offset: offset returned from previous operation + * + * Read system logs and search for pattern mpi3mr%d and copy the lines + * into driver diag buffer + * + * Return: next available location in driver diag buffer. + */ +static int mpi3mr_do_mini_dump(struct mpi3mr_ioc *mrioc, int prev_offset) +{ + int n = 0, lines, pos_mini_dump; + struct mpi3mr_kmsg_dumper dumper; + size_t len; + char buf[201]; + char *mini_start = "<6> Minidump start\n"; + char *mini_end = "<6> Minidump end\n"; + + struct mpi3_driver_buffer_header *drv_buff_header = NULL; + + dumper = mrioc->dump; + mpi3mr_set_dumper_active(&dumper); + + kmsg_dump_rewind(&dumper.kdumper); + while (kmsg_dump_get_line(&dumper.kdumper, 1, NULL, 0, NULL)) + n++; + + lines = n; + kmsg_dump_rewind(&dumper.kdumper); + + drv_buff_header = (struct mpi3_driver_buffer_header *)mrioc->drv_diag_buffer; + drv_buff_header->signature = 0x43495243; + drv_buff_header->logical_buffer_start = 0; + drv_buff_header->circular_buffer_size = + mrioc->drv_diag_buffer_sz - sizeof(struct mpi3_driver_buffer_header); + drv_buff_header->flags = + MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_ASCII; + pos_mini_dump = + prev_offset ? prev_offset : sizeof(struct mpi3_driver_buffer_header); + + if ((pos_mini_dump + strlen(mini_start) + < mrioc->drv_diag_buffer_sz)) { + sprintf((char *)mrioc->drv_diag_buffer + pos_mini_dump, + "%s\n", mini_start); + pos_mini_dump += strlen(mini_start); + } else { + ioc_info(mrioc, "driver diag buffer is full. 
minidump is not started\n"); + goto out; + } + + while (kmsg_dump_get_line(&dumper.kdumper, 1, buf, sizeof(buf), &len)) { + if (!lines--) + break; + if (strstr(buf, mrioc->name) && + ((pos_mini_dump + len + strlen(mini_end)) + < mrioc->drv_diag_buffer_sz)) { + sprintf((char *)mrioc->drv_diag_buffer + + pos_mini_dump, "%s", buf); + pos_mini_dump += len; + } + } + + if ((pos_mini_dump + strlen(mini_end) + < mrioc->drv_diag_buffer_sz)) { + sprintf((char *)mrioc->drv_diag_buffer + pos_mini_dump, + "%s\n", mini_end); + pos_mini_dump += strlen(mini_end); + } + +out: + drv_buff_header->logical_buffer_end = + pos_mini_dump - sizeof(struct mpi3_driver_buffer_header); + + ioc_info(mrioc, "driver diag buffer base_address(including 4K header) 0x%016llx, end_address 0x%016llx\n", + (unsigned long long)mrioc->drv_diag_buffer_dma, + (unsigned long long)mrioc->drv_diag_buffer_dma + + mrioc->drv_diag_buffer_sz); + ioc_info(mrioc, "logical_buffer end_address 0x%016llx, logical_buffer_end 0x%08x\n", + (unsigned long long)mrioc->drv_diag_buffer_dma + + drv_buff_header->logical_buffer_end, + drv_buff_header->logical_buffer_end); + + return pos_mini_dump; +} + +/** + * mpi3mr_do_dump - copy system logs into driver diag buffer. + * @mrioc: Adapter instance reference + * + * Return: Nothing. + */ +static void mpi3mr_do_dump(struct mpi3mr_ioc *mrioc) +{ + int offset = 0, uefi_offset = 0; + size_t dump_size; + struct mpi3_driver_buffer_header *drv_buff_header = NULL; + + if (!mrioc->drv_diag_buffer) + return; + + memset(mrioc->drv_diag_buffer, 0, mrioc->drv_diag_buffer_sz); + + /* Copy uefi boot logs */ + if (mrioc->skip_uefi_snapdump == false) + uefi_offset = mpi3mr_do_uefi_dump(mrioc); + else + mrioc->skip_uefi_snapdump = true; + + if (drv_db_level == MRIOC_DRV_DB_DISABLED) + return; + + /* Copy controller specific logs */ + offset += mpi3mr_do_mini_dump(mrioc, uefi_offset); + if (drv_db_level != MRIOC_DRV_DB_FULL) + return; + + mpi3mr_set_dumper_active(&mrioc->dump); + kmsg_dump_rewind(&mrioc->dump.kdumper); + kmsg_dump_get_buffer(&mrioc->dump.kdumper, true, + mrioc->drv_diag_buffer + offset, + mrioc->drv_diag_buffer_sz - offset, &dump_size); + + drv_buff_header = (struct mpi3_driver_buffer_header *) + mrioc->drv_diag_buffer; + drv_buff_header->logical_buffer_end += dump_size; + ioc_info(mrioc, "logical_buffer end_address(0x%016llx), logical_buffer_end(0x%08x)\n", + (unsigned long long)mrioc->drv_diag_buffer_dma + + drv_buff_header->logical_buffer_end, + drv_buff_header->logical_buffer_end); +} + +/** + * mpi3mr_clear_reset_history - clear reset history + * @mrioc: Adapter instance reference + * + * Write the reset history bit in IOC status to clear the bit, + * if it is already set. + * + * Return: Nothing. + */ +static inline void mpi3mr_clear_reset_history(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_status; + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if (ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) + writel(ioc_status, &mrioc->sysif_regs->ioc_status); + +} + +/** + * mpi3mr_issue_and_process_mur - Message Unit Reset handler + * @mrioc: Adapter instance reference + * @reset_reason: Reset reason code + * + * Issue Message Unit Reset to the controller and wait for it to + * be complete. + * + * Return: 0 on success, -1 on failure. 
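mpi3mr_do_mini_dump() and mpi3mr_do_dump() above treat the driver diagnostic buffer as a small header followed by an ASCII log area, with logical_buffer_end tracking how many log bytes are valid. A simplified, hypothetical view of that bookkeeping (field names and semantics here are illustrative assumptions, not the MPI3 layout):

#include <linux/string.h>
#include <linux/types.h>

struct example_diag_header {
	u32 signature;
	u32 circular_buffer_size;	/* log bytes available after the header */
	u32 logical_buffer_start;
	u32 logical_buffer_end;		/* log bytes currently valid */
};

static bool example_diag_append(void *diag_buf, const char *line)
{
	struct example_diag_header *hdr = diag_buf;
	char *log = (char *)diag_buf + sizeof(*hdr);
	size_t len = strlen(line);

	if (hdr->logical_buffer_end + len > hdr->circular_buffer_size)
		return false;		/* full: the line is dropped */

	memcpy(log + hdr->logical_buffer_end, line, len);
	hdr->logical_buffer_end += len;
	return true;
}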
+ */ +static int mpi3mr_issue_and_process_mur(struct mpi3mr_ioc *mrioc, + u32 reset_reason) +{ + u32 ioc_config, timeout, ioc_status; + int retval = -1; + + ioc_info(mrioc, "issuing message unit reset(MUR)\n"); + if (mrioc->unrecoverable) { + ioc_info(mrioc, "controller is unrecoverable message unit reset is not issued\n"); + return retval; + } + mpi3mr_clear_reset_history(mrioc); + writel(reset_reason, &mrioc->sysif_regs->scratchpad[0]); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_config &= ~MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC; + writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); + + timeout = MPI3MR_RESET_ACK_TIMEOUT * 10; + do { + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY)) { + mpi3mr_clear_reset_history(mrioc); + break; + } + if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) { + mpi3mr_print_fault_info(mrioc); + break; + } + msleep(100); + } while (--timeout); + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + if (timeout && !((ioc_status & MPI3_SYSIF_IOC_STATUS_READY) || + (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) || + (ioc_config & MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC))) + retval = 0; + + ioc_info(mrioc, "ioc_status/ioc_config after %s message unit reset is (0x%x)/(0x%x)\n", + (!retval) ? "successful" : "failed", ioc_status, ioc_config); + return retval; +} + + +/** + * mpi3mr_soft_reset_success - Check softreset is success or not + * @ioc_status: IOC status register value + * @ioc_config: IOC config register value + * + * Check whether the soft reset is successful or not based on + * IOC status and IOC config register values. + * + * Return: True when the soft reset is success, false otherwise. + */ +static inline bool +mpi3mr_soft_reset_success(u32 ioc_status, u32 ioc_config) +{ + if (!((ioc_status & MPI3_SYSIF_IOC_STATUS_READY) || + (ioc_config & MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC))) + return true; + return false; +} + +/** + * mpi3mr_diagfault_success - Check diag fault is success or not + * @mrioc: Adapter reference + * @ioc_status: IOC status register value + * + * Check whether the controller hit diag reset fault code. + * + * Return: True when there is diag fault, false otherwise. + */ +static inline bool mpi3mr_diagfault_success(struct mpi3mr_ioc *mrioc, + u32 ioc_status) +{ + u32 fault; + + if (!(ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) + return false; + fault = readl(&mrioc->sysif_regs->fault) & MPI3_SYSIF_FAULT_CODE_MASK; + if (fault == MPI3_SYSIF_FAULT_CODE_DIAG_FAULT_RESET) { + mpi3mr_print_fault_info(mrioc); + return true; + } + return false; +} + +/** + * mpi3mr_set_diagsave - Set diag save bit for snapdump + * @mrioc: Adapter reference + * + * Set diag save bit in IOC configuration register to enable + * snapdump. + * + * Return: Nothing. + */ +static inline void mpi3mr_set_diagsave(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_config; + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_config |= MPI3_SYSIF_IOC_CONFIG_DIAG_SAVE; + writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); +} + +/** + * mpi3mr_issue_reset - Issue reset to the controller + * @mrioc: Adapter reference + * @reset_type: Reset type + * @reset_reason: Reset reason code + * + * Unlock the host diagnostic registers and write the specific + * reset type to that, wait for reset acknowledgment from the + * controller, if the reset is not successful retry for the + * predefined number of times. + * + * Return: 0 on success, non-zero on failure. 
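Both the MUR path above and the reset-acknowledgment loops in mpi3mr_issue_reset() poll a register roughly every 100 ms until a condition is met or a timeout expires. The generic shape of that loop, with hypothetical parameters:

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/types.h>

static int example_poll_reg(void __iomem *reg, u32 mask, u32 want,
			    u32 timeout_seconds)
{
	u32 retries = timeout_seconds * 10;	/* 100 ms per iteration */

	do {
		if ((readl(reg) & mask) == want)
			return 0;
		msleep(100);
	} while (--retries);

	return -ETIMEDOUT;
}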
+ */ +static int mpi3mr_issue_reset(struct mpi3mr_ioc *mrioc, u16 reset_type, + u32 reset_reason) +{ + int retval = -1; + u8 unlock_retry_count = 0; + u32 host_diagnostic, ioc_status, ioc_config; + u32 timeout = MPI3MR_RESET_ACK_TIMEOUT * 10; + + if ((reset_type != MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET) && + (reset_type != MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT)) + return retval; + if (mrioc->unrecoverable) + return retval; + if (reset_reason == MPI3MR_RESET_FROM_FIRMWARE) { + retval = 0; + return retval; + } + + ioc_info(mrioc, "%s reset due to %s(0x%x)\n", + mpi3mr_reset_type_name(reset_type), + mpi3mr_reset_rc_name(reset_reason), reset_reason); + + + mpi3mr_clear_reset_history(mrioc); + do { + ioc_info(mrioc, + "writing magic sequence to unlock host diag register (retry=%d)\n", + ++unlock_retry_count); + if (unlock_retry_count >= MPI3MR_HOSTDIAG_UNLOCK_RETRY_COUNT) { + ioc_err(mrioc, + "%s reset failed due to unlock failure, host_diagnostic(0x%08x)\n", + mpi3mr_reset_type_name(reset_type), + host_diagnostic); + mrioc->unrecoverable = 1; + return retval; + } + + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_FLUSH, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_1ST, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_2ND, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_3RD, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_4TH, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_5TH, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_6TH, + &mrioc->sysif_regs->write_sequence); + usleep_range(1000, 1100); + host_diagnostic = readl(&mrioc->sysif_regs->host_diagnostic); + ioc_info(mrioc, + "wrote magic sequence: retry_count(%d), host_diagnostic(0x%08x)\n", + unlock_retry_count, host_diagnostic); + } while (!(host_diagnostic & MPI3_SYSIF_HOST_DIAG_DIAG_WRITE_ENABLE)); + + writel(reset_reason, &mrioc->sysif_regs->scratchpad[0]); + writel(host_diagnostic | reset_type, + &mrioc->sysif_regs->host_diagnostic); + switch (reset_type) { + case MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET: + do { + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_config = + readl(&mrioc->sysif_regs->ioc_configuration); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) + && mpi3mr_soft_reset_success(ioc_status, ioc_config) + ) { + mpi3mr_clear_reset_history(mrioc); + retval = 0; + break; + } + msleep(100); + } while (--timeout); + mpi3mr_print_fault_info(mrioc); + break; + case MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT: + do { + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if (mpi3mr_diagfault_success(mrioc, ioc_status)) { + retval = 0; + break; + } + msleep(100); + } while (--timeout); + break; + default: + break; + } + + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_2ND, + &mrioc->sysif_regs->write_sequence); + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_info(mrioc, + "ioc_status/ioc_config after %s reset is (0x%x)/(0x%x)\n", + (!retval)?"successful":"failed", ioc_status, + ioc_config); + if (retval) + mrioc->unrecoverable = 1; + return retval; +} + +/** + * mpi3mr_admin_request_post - Post request to admin queue + * @mrioc: Adapter reference + * @admin_req: MPI3 request + * @admin_req_sz: Request size + * @ignore_reset: Ignore reset in process + * + * Post the MPI3 request into admin request queue and + * 
inform the controller, if the queue is full return + * appropriate error. + * + * Return: 0 on success, non-zero on failure. + */ +int mpi3mr_admin_request_post(struct mpi3mr_ioc *mrioc, void *admin_req, + u16 admin_req_sz, u8 ignore_reset) +{ + u16 areq_pi = 0, areq_ci = 0, max_entries = 0; + int retval = 0; + unsigned long flags; + u8 *areq_entry; + + if (mrioc->unrecoverable) { + ioc_err(mrioc, "admin request queue submission failed due to unrecoverable controller\n"); + return -EFAULT; + } + + spin_lock_irqsave(&mrioc->admin_req_lock, flags); + areq_pi = mrioc->admin_req_pi; + areq_ci = mrioc->admin_req_ci; + max_entries = mrioc->num_admin_req; + if ((areq_ci == (areq_pi + 1)) || ((!areq_ci) && + (areq_pi == (max_entries - 1)))) { + ioc_err(mrioc, "admin request queue submission failed due to queue full\n"); + retval = -EAGAIN; + goto out; + } + if (!ignore_reset && mrioc->reset_in_progress) { + ioc_err(mrioc, "admin request queue submission failed due to reset in progress\n"); + retval = -EAGAIN; + goto out; + } + areq_entry = (u8 *)mrioc->admin_req_base + + (areq_pi * MPI3MR_ADMIN_REQ_FRAME_SZ); + memset(areq_entry, 0, MPI3MR_ADMIN_REQ_FRAME_SZ); + memcpy(areq_entry, (u8 *)admin_req, admin_req_sz); + + if (++areq_pi == max_entries) + areq_pi = 0; + mrioc->admin_req_pi = areq_pi; + + writel(mrioc->admin_req_pi, &mrioc->sysif_regs->admin_request_queue_pi); + +out: + spin_unlock_irqrestore(&mrioc->admin_req_lock, flags); + + return retval; +} + +/** + * mpi3mr_op_request_post - Post request to operational queue + * @mrioc: Adapter reference + * @op_req_q: Operational request queue info + * @req: MPI3 request + * + * Post the MPI3 request into operational request queue and + * inform the controller, if the queue is full return + * appropriate error. + * + * Return: 0 on success, non-zero on failure. 
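mpi3mr_admin_request_post() and mpi3mr_op_request_post() share the same post shape: copy the request frame into the slot selected by the producer index, advance the index modulo the queue depth, then make the new index visible to the controller through a doorbell write. A compact sketch with invented names (frame_len is assumed to fit within slot_sz):

#include <linux/io.h>
#include <linux/string.h>
#include <linux/types.h>

static void example_post_frame(u8 *q_base, u16 *pi, u16 num_entries,
			       u16 slot_sz, const void *frame, u16 frame_len,
			       void __iomem *doorbell)
{
	u8 *slot = q_base + (*pi * slot_sz);

	memset(slot, 0, slot_sz);
	memcpy(slot, frame, frame_len);

	if (++(*pi) == num_entries)
		*pi = 0;

	writel(*pi, doorbell);	/* publish the new producer index */
}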
+ */
+int mpi3mr_op_request_post(struct mpi3mr_ioc *mrioc,
+	struct op_req_qinfo *op_req_q, u8 *req)
+{
+	u16 pi = 0, max_entries, reply_qidx = 0, midx;
+	int retval = 0;
+	unsigned long flags;
+	u8 *req_entry;
+	void *segment_base_addr;
+	u16 req_sz = mrioc->facts.op_req_sz;
+	struct mpi3_scsi_io_request *scsiio_req =
+		(struct mpi3_scsi_io_request *)req;
+	struct segments *segments = op_req_q->q_segments;
+
+	reply_qidx = op_req_q->reply_qid - 1;
+
+	if (mrioc->unrecoverable)
+		return -EFAULT;
+
+	spin_lock_irqsave(&op_req_q->q_lock, flags);
+	pi = op_req_q->pi;
+	max_entries = op_req_q->num_requests;
+
+	if (mpi3mr_check_req_qfull(op_req_q)) {
+		midx = REPLY_QUEUE_IDX_TO_MSIX_IDX(
+			reply_qidx, mrioc->op_reply_q_offset);
+		mpi3mr_process_op_reply_q(mrioc,
+			mrioc->intr_info[midx].op_reply_q);
+
+		if (mpi3mr_check_req_qfull(op_req_q)) {
+			if (op_req_q->last_full_host_tag ==
+			    MPI3MR_HOSTTAG_INVALID)
+				op_req_q->qfull_instances++;
+
+			op_req_q->last_full_host_tag = scsiio_req->host_tag;
+			op_req_q->qfull_io_count++;
+			retval = -EAGAIN;
+			goto out;
+		}
+	}
+
+	if (op_req_q->last_full_host_tag != MPI3MR_HOSTTAG_INVALID)
+		op_req_q->last_full_host_tag = MPI3MR_HOSTTAG_INVALID;
+
+	if (mrioc->reset_in_progress) {
+		ioc_err(mrioc, "operation request queue submission failed due to reset in progress\n");
+		retval = -EAGAIN;
+		goto out;
+	}
+
+	segment_base_addr = segments[pi / op_req_q->segment_qd].segment;
+	req_entry = (u8 *)segment_base_addr +
+		((pi % op_req_q->segment_qd) * req_sz);
+
+	memset(req_entry, 0, req_sz);
+	memcpy(req_entry, req, MPI3MR_ADMIN_REQ_FRAME_SZ);
+
+	if (++pi == max_entries)
+		pi = 0;
+	op_req_q->pi = pi;
+
+#ifndef CONFIG_PREEMPT_RT
+	if (atomic_inc_return(&mrioc->op_reply_qinfo[reply_qidx].pend_ios)
+	    > MPI3MR_IRQ_POLL_TRIGGER_IOCOUNT)
+		mrioc->op_reply_qinfo[reply_qidx].enable_irq_poll = true;
+#else
+	atomic_inc_return(&mrioc->op_reply_qinfo[reply_qidx].pend_ios);
+#endif
+
+#if defined(IO_COUNTER_SUPPORT)
+	atomic_inc(&mrioc->pend_ios);
+#endif
+
+	writel(op_req_q->pi,
+	    &mrioc->sysif_regs->oper_queue_indexes[reply_qidx].producer_index);
+
+out:
+	spin_unlock_irqrestore(&op_req_q->q_lock, flags);
+	return retval;
+}
+
+/**
+ * mpi3mr_check_rh_fault_ioc - check reset history and fault controller
+ * @mrioc: Adapter instance reference
+ * @reason_code: Reason code for the fault.
+ *
+ * This routine will save the snapdump and fault the controller with
+ * the given reason code if it is not already in the fault or
+ * not asynchronously reset. This will be used to handle
+ * initialization time faults/resets/timeouts as in those cases
+ * immediate soft reset invocation is not required.
+ *
+ * Return: None.
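The operational request queue above is segmented, so locating a slot is a two-step calculation: the producer index first selects a segment, then an offset inside it. A hedged restatement with illustrative types:

#include <linux/types.h>

struct example_segment {
	void *vaddr;		/* CPU address of one queue segment */
};

static void *example_segmented_slot(struct example_segment *segments,
				    u16 pi, u16 segment_qd, u16 frame_sz)
{
	void *seg_base = segments[pi / segment_qd].vaddr;

	return (u8 *)seg_base + ((pi % segment_qd) * frame_sz);
}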
+ */ +void mpi3mr_check_rh_fault_ioc(struct mpi3mr_ioc *mrioc, u32 reason_code) +{ + u32 ioc_status, host_diagnostic, timeout, fault; + + if (mrioc->unrecoverable) { + ioc_err(mrioc, "controller is unrecoverable\n"); + return; + } + + if (!pci_device_is_present(mrioc->pdev)) { + mrioc->unrecoverable = 1; + ioc_err(mrioc, "controller is not present\n"); + return; + } + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if (ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) { + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_FW_RELEASED, 0, 0); + return; + } else if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) { + fault = readl(&mrioc->sysif_regs->fault); + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_FAULT, fault, 0); + mpi3mr_print_fault_info(mrioc); + return; + } + mpi3mr_set_diagsave(mrioc); + mpi3mr_issue_reset(mrioc, MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, + reason_code); + fault = readl(&mrioc->sysif_regs->fault); + mpi3mr_set_trigger_data_in_all_hdb(mrioc, MPI3MR_HDB_TRIGGER_TYPE_FAULT, + fault, 0); + timeout = MPI3_SYSIF_DIAG_SAVE_TIMEOUT * 10; + do { + host_diagnostic = readl(&mrioc->sysif_regs->host_diagnostic); + if (!(host_diagnostic & MPI3_SYSIF_HOST_DIAG_SAVE_IN_PROGRESS)) + break; + msleep(100); + } while (--timeout); +} + +/** + * mpi3mr_sync_timestamp - Issue time stamp sync request + * @mrioc: Adapter reference + * + * Issue IO Unit Control MPI request to synchronize firmware + * timestamp with host time. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_sync_timestamp(struct mpi3mr_ioc *mrioc) +{ + ktime_t current_time; + struct mpi3_iounit_control_request iou_ctrl; + int retval = 0; + + memset(&iou_ctrl, 0, sizeof(iou_ctrl)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending time stamp update failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + iou_ctrl.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + iou_ctrl.function = MPI3_FUNCTION_IO_UNIT_CONTROL; + iou_ctrl.operation = MPI3_CTRL_OP_UPDATE_TIMESTAMP; + current_time = ktime_get_real(); + iou_ctrl.param64[0] = cpu_to_le64(ktime_to_ms(current_time)); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &iou_ctrl, + sizeof(iou_ctrl), 0); + if (retval) { + ioc_err(mrioc, "posting time stamp update failed\n"); + goto out_unlock; + } + + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "time stamp update timed out\n"); + mrioc->init_cmds.is_waiting = 0; + if (!(mrioc->init_cmds.state & MPI3MR_CMD_RESET)) + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_TSU_TIMEOUT, 1); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "time stamp update returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + +out: + return retval; +} + +/** + * mpi3mr_print_pkg_ver - display controller fw package version + * @mrioc: Adapter reference + * + * 
Retrieve firmware package version from the component image + * manifest of the controller flash and display it. + * + * Return: 0 on success and non-zero on failure. + */ +static int mpi3mr_print_pkg_ver(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_ci_upload_request ci_upload; + int retval = -1; + void *data = NULL; + dma_addr_t data_dma; + struct mpi3_ci_manifest_mpi *manifest; + u32 data_len = sizeof(struct mpi3_ci_manifest_mpi); + + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + data = dma_zalloc_coherent(&mrioc->pdev->dev, data_len, &data_dma, + GFP_KERNEL); + if (!data) + return -ENOMEM; + + memset(&ci_upload, 0, sizeof(ci_upload)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + ioc_err(mrioc, + "issue ci manifest upload failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + ci_upload.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + ci_upload.function = MPI3_FUNCTION_CI_UPLOAD; + ci_upload.msg_flags = MPI3_CI_UPLOAD_MSGFLAGS_LOCATION_PRIMARY; + ci_upload.signature1 = MPI3_IMAGE_HEADER_SIGNATURE1_MANIFEST; + ci_upload.image_offset = MPI3_IMAGE_HEADER_SIZE; + ci_upload.segment_size = data_len; + + mpi3mr_add_sg_single(&ci_upload.sgl, sgl_flags, data_len, + data_dma); + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &ci_upload, + sizeof(ci_upload), 1); + if (retval) { + ioc_err(mrioc, "issue ci manifest upload failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "issue ci manifest upload timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_GETPKGVER_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + == MPI3_IOCSTATUS_SUCCESS) { + manifest = (struct mpi3_ci_manifest_mpi *) data; + if (manifest->manifest_type == MPI3_CI_MANIFEST_TYPE_MPI) { + ioc_info(mrioc, + "firmware package version(%d.%d.%d.%d.%05d-%05d)\n", + manifest->package_version.gen_major, + manifest->package_version.gen_minor, + manifest->package_version.phase_major, + manifest->package_version.phase_minor, + manifest->package_version.customer_id, + manifest->package_version.build_num); + } + } + retval = 0; +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + +out: + if (data) + dma_free_coherent(&mrioc->pdev->dev, data_len, data, + data_dma); + return retval; +} + +/** + * mpi3mr_upload_drv_diag_buffer - upload the driver diag log + * @mrioc: Adapter reference + * + * Uploads the driver buffer to driver internal memory from the + * firmware which might have UEFI boot Services log and cache + * the returned data length from the upload into the per adapter + * structure. + * + * Return: Nothing. 
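+ *
+ * The upload length is capped at both the firmware advertised driver
+ * diag size and half of the driver diag buffer. For example (sizes
+ * below are illustrative, not defaults), with a 1 MiB drv_diag_buffer
+ * and a diag_drvr_sz of 256 KiB:
+ *
+ *	data_len = min_t(int, 256 KiB, 1 MiB / 2) = 256 KiB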
+ */ +static void +mpi3mr_upload_drv_diag_buffer(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_diag_buffer_upload_request diag_upload; + struct mpi3_diag_buffer_upload_reply *diag_upload_reply; + int retval = 0; + dma_addr_t data_dma; + void *data; + u32 data_len; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + data_dma = mrioc->drv_diag_buffer_dma; + data = mrioc->drv_diag_buffer; + + /* At max half of the diag buffer can be used for uefi logs */ + data_len = min_t(int, mrioc->facts.diag_drvr_sz, + mrioc->drv_diag_buffer_sz/2); + + memset(&diag_upload, 0, sizeof(diag_upload)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + ioc_err(mrioc, "sending driver diag upload failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + return; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + diag_upload.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + diag_upload.function = MPI3_FUNCTION_DIAG_BUFFER_UPLOAD; + diag_upload.msg_flags = 0; + diag_upload.change_count = 0; + diag_upload.type = MPI3_DIAG_BUFFER_TYPE_DRIVER; + diag_upload.flags = + MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_ASCII; + diag_upload.context = 0; + + mpi3mr_add_sg_single(&diag_upload.sgl, sgl_flags, data_len, + data_dma); + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &diag_upload, + sizeof(diag_upload), 1); + if (retval) { + ioc_err(mrioc, "posting driver diag upload failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "driver diag upload timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DIAG_BUFFER_UPLOAD_TIMEOUT); + goto out_unlock; + } + + mrioc->uefi_logs_sz = 0; + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "driver diag upload returned with ioc_status(0x%04x),log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + } else { + diag_upload_reply = (struct mpi3_diag_buffer_upload_reply *) + mrioc->init_cmds.reply; + mrioc->uefi_logs_sz = + le32_to_cpu(diag_upload_reply->returned_data_length); + if (mrioc->uefi_logs_sz) { + mrioc->uefi_logs = vzalloc(mrioc->uefi_logs_sz); + if (!mrioc->uefi_logs) + mrioc->uefi_logs_sz = 0; + else + memcpy(mrioc->uefi_logs, data, data_len); + } + ioc_info(mrioc, + "driver diag buffer upload is success size drv/fw/final(%d/%d/%d)\n", + data_len, + le32_to_cpu(diag_upload_reply->returned_data_length), + mrioc->uefi_logs_sz); + } + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + return; +} + +/** + * mpi3mr_watchdog_work - watchdog thread to monitor faults + * @work: work struct + * + * Watch dog work periodically executed (1 second interval) to + * monitor firmware fault and to issue periodic timer sync to + * the firmware. + * + * Return: Nothing. 
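+ *
+ * The rescheduling interval and the timestamp sync cadence combine as
+ * follows (values here are illustrative, not the driver defaults): with
+ * a 1000 ms MPI3MR_WATCHDOG_INTERVAL and an MPI3MR_TSUPDATE_INTERVAL of
+ * 900, the firmware timestamp is re-synced roughly every 15 minutes:
+ *
+ *	if (mrioc->ts_update_counter++ >= MPI3MR_TSUPDATE_INTERVAL) {
+ *		mrioc->ts_update_counter = 0;
+ *		mpi3mr_sync_timestamp(mrioc);
+ *	}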
+ */ +static void mpi3mr_watchdog_work(struct work_struct *work) +{ + struct mpi3mr_ioc *mrioc = + container_of(work, struct mpi3mr_ioc, watchdog_work.work); + unsigned long flags; + enum mpi3mr_iocstate ioc_state; + u32 fault, host_diagnostic, ioc_status; + u32 reset_reason = MPI3MR_RESET_FROM_FAULT_WATCH; + + if (mrioc->reset_in_progress) + return; + + if (!mrioc->unrecoverable && !pci_device_is_present(mrioc->pdev)) { + ioc_err(mrioc, "watchdog could not detect the controller\n"); + mrioc->unrecoverable = 1; + } + + if (mrioc->unrecoverable) { + ioc_err(mrioc, "flush pending commands for unrecoverable controller\n"); + mpi3mr_flush_cmds_for_unrecovered_controller(mrioc); + return; + } + + if (mrioc->ts_update_counter++ >= MPI3MR_TSUPDATE_INTERVAL) { + /* No need to capture uefi snapdump + * after certain time elapsed. */ + mrioc->skip_uefi_snapdump = true; + mrioc->ts_update_counter = 0; + mpi3mr_sync_timestamp(mrioc); + } + + if ((mrioc->prepare_for_reset) && + ((mrioc->prepare_for_reset_timeout_counter++) >= + MPI3MR_PREPARE_FOR_RESET_TIMEOUT)) { + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_CIACTVRST_TIMER, 1); + return; + } + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if (ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) { + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_FW_RELEASED, 0, 0); + mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_FIRMWARE, 0); + return; + } + + /*Check for fault state every one second and issue Soft reset*/ + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state != MRIOC_STATE_FAULT) + goto schedule_work; + + fault = readl(&mrioc->sysif_regs->fault); + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_FAULT, fault, 0); + fault = fault & MPI3_SYSIF_FAULT_CODE_MASK; + host_diagnostic = readl(&mrioc->sysif_regs->host_diagnostic); + if (host_diagnostic & MPI3_SYSIF_HOST_DIAG_SAVE_IN_PROGRESS) { + if (!mrioc->diagsave_timeout) { + mpi3mr_print_fault_info(mrioc); + ioc_warn(mrioc, "diag save in progress\n"); + mpi3mr_do_dump(mrioc); + } + if ((mrioc->diagsave_timeout++) <= MPI3_SYSIF_DIAG_SAVE_TIMEOUT) + goto schedule_work; + } + + mpi3mr_print_fault_info(mrioc); + mrioc->diagsave_timeout = 0; + + switch (fault) { + case MPI3_SYSIF_FAULT_CODE_COMPLETE_RESET_NEEDED: + case MPI3_SYSIF_FAULT_CODE_POWER_CYCLE_REQUIRED: + ioc_warn(mrioc, + "controller requires system power cycle, marking controller as unrecoverable\n"); + mrioc->unrecoverable = 1; + goto schedule_work; + case MPI3_SYSIF_FAULT_CODE_SOFT_RESET_IN_PROGRESS: + return; + case MPI3_SYSIF_FAULT_CODE_CI_ACTIVATION_RESET: + reset_reason = MPI3MR_RESET_FROM_CIACTIV_FAULT; + break; + default: + break; + } + mpi3mr_soft_reset_handler(mrioc, reset_reason, 0); + return; + +schedule_work: + spin_lock_irqsave(&mrioc->watchdog_lock, flags); + if (mrioc->watchdog_work_q) + queue_delayed_work(mrioc->watchdog_work_q, + &mrioc->watchdog_work, + msecs_to_jiffies(MPI3MR_WATCHDOG_INTERVAL)); + spin_unlock_irqrestore(&mrioc->watchdog_lock, flags); + return; +} + +/** + * mpi3mr_start_watchdog - Start watchdog + * @mrioc: Adapter instance reference + * + * Create and start the watchdog thread to monitor controller + * faults. + * + * Return: Nothing. 
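+ *
+ * Start/stop pairing sketch (where exactly the driver calls these is an
+ * assumption here, not shown in this hunk):
+ *
+ *	mpi3mr_start_watchdog(mrioc);	after controller initialization
+ *	...
+ *	mpi3mr_stop_watchdog(mrioc);	before teardown or reset handling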
+ */ +void mpi3mr_start_watchdog(struct mpi3mr_ioc *mrioc) +{ + if (mrioc->watchdog_work_q) + return; + + INIT_DELAYED_WORK(&mrioc->watchdog_work, mpi3mr_watchdog_work); + snprintf(mrioc->watchdog_work_q_name, + sizeof(mrioc->watchdog_work_q_name), "watchdog_%s%d", mrioc->name, + mrioc->id); + mrioc->watchdog_work_q = + create_singlethread_workqueue(mrioc->watchdog_work_q_name); + if (!mrioc->watchdog_work_q) { + ioc_err(mrioc, "%s: failed (line=%d)\n", __func__, __LINE__); + return; + } + + if (mrioc->watchdog_work_q) + queue_delayed_work(mrioc->watchdog_work_q, + &mrioc->watchdog_work, + msecs_to_jiffies(MPI3MR_WATCHDOG_INTERVAL)); +} + +/** + * mpi3mr_stop_watchdog - Stop watchdog + * @mrioc: Adapter instance reference + * + * Stop the watchdog thread created to monitor controller + * faults. + * + * Return: Nothing. + */ +void mpi3mr_stop_watchdog(struct mpi3mr_ioc *mrioc) +{ + unsigned long flags; + struct workqueue_struct *wq; + + spin_lock_irqsave(&mrioc->watchdog_lock, flags); + wq = mrioc->watchdog_work_q; + mrioc->watchdog_work_q = NULL; + spin_unlock_irqrestore(&mrioc->watchdog_lock, flags); + if (wq) { + if (!cancel_delayed_work_sync(&mrioc->watchdog_work)) + flush_workqueue(wq); + destroy_workqueue(wq); + } +} + + +/** + * mpi3mr_free_op_req_q_segments - free request memory segments + * @mrioc: Adapter instance reference + * @q_idx: operational request queue index + * + * Free memory segments allocated for operational request queue + * + * Return: Nothing. + */ +static void mpi3mr_free_op_req_q_segments(struct mpi3mr_ioc *mrioc, u16 q_idx) +{ + u16 j; + int size; + struct segments *segments; + + segments = mrioc->req_qinfo[q_idx].q_segments; + if (!segments) + return; + + if (mrioc->is_segqueue_enabled) { + size = MPI3MR_OP_REQ_Q_SEG_SIZE; + if (mrioc->req_qinfo[q_idx].q_segment_list) { + dma_free_coherent(&mrioc->pdev->dev, + MPI3MR_MAX_SEG_LIST_SIZE, + mrioc->req_qinfo[q_idx].q_segment_list, + mrioc->req_qinfo[q_idx].q_segment_list_dma); + mrioc->req_qinfo[q_idx].q_segment_list = NULL; + } + } else + size = mrioc->req_qinfo[q_idx].segment_qd * + mrioc->facts.op_req_sz; + + for (j = 0; j < mrioc->req_qinfo[q_idx].num_segments; j++) { + if (!segments[j].segment) + continue; + dma_free_coherent(&mrioc->pdev->dev, + size, segments[j].segment, segments[j].segment_dma); + segments[j].segment = NULL; + } + kfree(mrioc->req_qinfo[q_idx].q_segments); + mrioc->req_qinfo[q_idx].q_segments = NULL; + mrioc->req_qinfo[q_idx].qid = 0; +} + +/** + * mpi3mr_free_op_reply_q_segments - free reply memory segments + * @mrioc: Adapter instance reference + * @q_idx: operational reply queue index + * + * Free memory segments allocated for operational reply queue + * + * Return: Nothing. 
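+ *
+ * Teardown sketch freeing every reply queue's segments (the loop and
+ * its caller context are illustrative assumptions):
+ *
+ *	for (q_idx = 0; q_idx < mrioc->num_op_reply_q; q_idx++)
+ *		mpi3mr_free_op_reply_q_segments(mrioc, q_idx);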
+ */ +static void mpi3mr_free_op_reply_q_segments(struct mpi3mr_ioc *mrioc, u16 q_idx) +{ + u16 j; + int size; + struct segments *segments; + + segments = mrioc->op_reply_qinfo[q_idx].q_segments; + if (!segments) + return; + + if (mrioc->is_segqueue_enabled) { + size = MPI3MR_OP_REP_Q_SEG_SIZE; + if (mrioc->op_reply_qinfo[q_idx].q_segment_list) { + dma_free_coherent(&mrioc->pdev->dev, + MPI3MR_MAX_SEG_LIST_SIZE, + mrioc->op_reply_qinfo[q_idx].q_segment_list, + mrioc->op_reply_qinfo[q_idx].q_segment_list_dma); + mrioc->op_reply_qinfo[q_idx].q_segment_list = NULL; + } + } else + size = mrioc->op_reply_qinfo[q_idx].segment_qd * + mrioc->op_reply_desc_sz; + + for (j = 0; j < mrioc->op_reply_qinfo[q_idx].num_segments; j++) { + if (!segments[j].segment) + continue; + dma_free_coherent(&mrioc->pdev->dev, + size, segments[j].segment, segments[j].segment_dma); + segments[j].segment = NULL; + } + + kfree(mrioc->op_reply_qinfo[q_idx].q_segments); + mrioc->op_reply_qinfo[q_idx].q_segments = NULL; + mrioc->op_reply_qinfo[q_idx].qid = 0; +} + + +/** + * mpi3mr_delete_op_reply_q - delete operational reply queue + * @mrioc: Adapter instance reference + * @qidx: operational reply queue index + * + * Delete operational reply queue by issuing MPI request + * through admin queue. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_delete_op_reply_q(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct mpi3_delete_reply_queue_request delq_req; + struct op_reply_qinfo *op_reply_q = mrioc->op_reply_qinfo + qidx; + int retval = 0; + u16 reply_qid = 0, midx; + + reply_qid = op_reply_q->qid; + + midx = REPLY_QUEUE_IDX_TO_MSIX_IDX(qidx, mrioc->op_reply_q_offset); + + if (!reply_qid) { + retval = -1; + ioc_err(mrioc, "delete reply queue failed due to invalid reply queue id\n"); + goto out; + } + + (op_reply_q->qtype == MPI3MR_DEFAULT_QUEUE) ? 
mrioc->default_qcount-- : + mrioc->active_poll_qcount--; + + memset(&delq_req, 0, sizeof(delq_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending delete reply queue failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + delq_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + delq_req.function = MPI3_FUNCTION_DELETE_REPLY_QUEUE; + delq_req.queue_id = cpu_to_le16(reply_qid); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &delq_req, sizeof(delq_req), + 1); + if (retval) { + ioc_err(mrioc, "posting delete reply queue failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "delete reply queue timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DELREPQ_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "delete reply queue returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + mrioc->intr_info[midx].op_reply_q = NULL; + + mpi3mr_free_op_reply_q_segments(mrioc, qidx); +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + return retval; +} + +/** + * mpi3mr_alloc_op_reply_q_segments -Alloc segmented reply pool + * @mrioc: Adapter instance reference + * @qidx: request queue index + * + * Allocate segmented memory pools for operational reply + * queue. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_alloc_op_reply_q_segments(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct op_reply_qinfo *op_reply_q = mrioc->op_reply_qinfo + qidx; + int i, size; + u64 *q_segment_list_entry = NULL; + struct segments *segments; + + if (mrioc->is_segqueue_enabled) { + op_reply_q->segment_qd = + MPI3MR_OP_REP_Q_SEG_SIZE / mrioc->op_reply_desc_sz; + + size = MPI3MR_OP_REP_Q_SEG_SIZE; + + op_reply_q->q_segment_list = + dma_zalloc_coherent(&mrioc->pdev->dev, + MPI3MR_MAX_SEG_LIST_SIZE, + &op_reply_q->q_segment_list_dma, GFP_KERNEL); + if (!op_reply_q->q_segment_list) + return -ENOMEM; + q_segment_list_entry = (u64 *)op_reply_q->q_segment_list; + } else { + op_reply_q->segment_qd = op_reply_q->num_replies; + size = op_reply_q->num_replies * mrioc->op_reply_desc_sz; + } + + op_reply_q->num_segments = DIV_ROUND_UP(op_reply_q->num_replies, + op_reply_q->segment_qd); + + op_reply_q->q_segments = kcalloc(op_reply_q->num_segments, + sizeof(struct segments), GFP_KERNEL); + if (!op_reply_q->q_segments) + return -ENOMEM; + + segments = op_reply_q->q_segments; + for (i = 0; i < op_reply_q->num_segments; i++) { + segments[i].segment = + dma_zalloc_coherent(&mrioc->pdev->dev, + size, &segments[i].segment_dma, GFP_KERNEL); + if (!segments[i].segment) + return -ENOMEM; + if (mrioc->is_segqueue_enabled) + q_segment_list_entry[i] = + (unsigned long)segments[i].segment_dma; + } + + return 0; +} + +/** + * mpi3mr_alloc_op_req_q_segments - Alloc segmented req pool. 
+ * @mrioc: Adapter instance reference + * @qidx: request queue index + * + * Allocate segmented memory pools for operational request + * queue. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_alloc_op_req_q_segments(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct op_req_qinfo *op_req_q = mrioc->req_qinfo + qidx; + int i, size; + u64 *q_segment_list_entry = NULL; + struct segments *segments; + + if (mrioc->is_segqueue_enabled) { + op_req_q->segment_qd = + MPI3MR_OP_REQ_Q_SEG_SIZE / mrioc->facts.op_req_sz; + + size = MPI3MR_OP_REQ_Q_SEG_SIZE; + + op_req_q->q_segment_list = + dma_zalloc_coherent(&mrioc->pdev->dev, + MPI3MR_MAX_SEG_LIST_SIZE, + &op_req_q->q_segment_list_dma, GFP_KERNEL); + if (!op_req_q->q_segment_list) + return -ENOMEM; + q_segment_list_entry = (u64 *)op_req_q->q_segment_list; + + } else { + op_req_q->segment_qd = op_req_q->num_requests; + size = op_req_q->num_requests * mrioc->facts.op_req_sz; + } + + op_req_q->num_segments = DIV_ROUND_UP(op_req_q->num_requests, + op_req_q->segment_qd); + + op_req_q->q_segments = kcalloc(op_req_q->num_segments, + sizeof(struct segments), GFP_KERNEL); + if (!op_req_q->q_segments) + return -ENOMEM; + + segments = op_req_q->q_segments; + for (i = 0; i < op_req_q->num_segments; i++) { + segments[i].segment = + dma_zalloc_coherent(&mrioc->pdev->dev, + size, &segments[i].segment_dma, GFP_KERNEL); + if (!segments[i].segment) + return -ENOMEM; + if (mrioc->is_segqueue_enabled) + q_segment_list_entry[i] = + (unsigned long)segments[i].segment_dma; + } + + return 0; +} + +/** + * mpi3mr_create_op_reply_q - create operational reply queue + * @mrioc: Adapter instance reference + * @qidx: operational reply queue index + * + * Create operational reply queue by issuing MPI request + * through admin queue. + * + * Return: 0 on success, non-zero on failure. 
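+ *
+ * Queue IDs are derived from the index (reply_qid = qidx + 1), and each
+ * reply queue is paired with a request queue, as in the creation loop
+ * later in this file:
+ *
+ *	if (mpi3mr_create_op_reply_q(mrioc, i))
+ *		break;
+ *	if (mpi3mr_create_op_req_q(mrioc, i, mrioc->op_reply_qinfo[i].qid))
+ *		mpi3mr_delete_op_reply_q(mrioc, i);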
+ */ +static int mpi3mr_create_op_reply_q(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct mpi3_create_reply_queue_request create_req; + struct op_reply_qinfo *op_reply_q = mrioc->op_reply_qinfo + qidx; + int retval = 0; + u16 reply_qid = 0, midx; + + reply_qid = op_reply_q->qid; + + midx = REPLY_QUEUE_IDX_TO_MSIX_IDX(qidx, mrioc->op_reply_q_offset); + + if (reply_qid) { + retval = -1; + ioc_err(mrioc, "create reply queue failed due to duplicate qid(%d)\n", + reply_qid); + + return retval; + } + + reply_qid = qidx + 1; + op_reply_q->num_replies = MPI3MR_OP_REP_Q_QD; + if (!mrioc->pdev->revision) + op_reply_q->num_replies = MPI3MR_OP_REP_Q_QD4K; + op_reply_q->ci = 0; + op_reply_q->ephase = 1; + atomic_set(&op_reply_q->pend_ios, 0); + atomic_set(&op_reply_q->in_use, 0); + op_reply_q->enable_irq_poll = false; + + if (!op_reply_q->q_segments) { + retval = mpi3mr_alloc_op_reply_q_segments(mrioc, qidx); + if (retval) { + mpi3mr_free_op_reply_q_segments(mrioc, qidx); + goto out; + } + } + + memset(&create_req, 0, sizeof(create_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending create reply queue failed due to command in use\n"); + goto out_unlock; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + create_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + create_req.function = MPI3_FUNCTION_CREATE_REPLY_QUEUE; + create_req.queue_id = cpu_to_le16(reply_qid); + + if (midx < (mrioc->intr_info_count - mrioc->requested_poll_qcount)) + op_reply_q->qtype = MPI3MR_DEFAULT_QUEUE; + else + op_reply_q->qtype = MPI3MR_POLL_QUEUE; + + if (op_reply_q->qtype == MPI3MR_DEFAULT_QUEUE) { + create_req.flags = + MPI3_CREATE_REPLY_QUEUE_FLAGS_INT_ENABLE_ENABLE; + create_req.msix_index = + cpu_to_le16(mrioc->intr_info[midx].msix_index); + } else { + create_req.msix_index = cpu_to_le16(mrioc->intr_info_count - 1); + ioc_info(mrioc, "create reply queue(polled): for qid(%d), midx(%d)\n", + reply_qid, midx); + if (!mrioc->active_poll_qcount) + disable_irq_nosync(pci_irq_vector(mrioc->pdev, + mrioc->intr_info_count - 1)); + } + + if (mrioc->is_segqueue_enabled) { + create_req.flags |= + MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_SEGMENTED; + create_req.base_address = cpu_to_le64( + op_reply_q->q_segment_list_dma); + } else + create_req.base_address = cpu_to_le64( + op_reply_q->q_segments[0].segment_dma); + + create_req.size = cpu_to_le16(op_reply_q->num_replies); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &create_req, + sizeof(create_req), 1); + if (retval) { + ioc_err(mrioc, "posting create reply queue failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "create reply queue timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_CREATEREPQ_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "create reply queue returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + op_reply_q->qid = reply_qid; + if (midx < mrioc->intr_info_count) + mrioc->intr_info[midx].op_reply_q = op_reply_q; + + (op_reply_q->qtype == 
MPI3MR_DEFAULT_QUEUE) ? mrioc->default_qcount++ : + mrioc->active_poll_qcount++; + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + + return retval; +} + +/** + * mpi3mr_create_op_req_q - create operational request queue + * @mrioc: Adapter instance reference + * @idx: operational request queue index + * @reply_qid: Reply queue ID + * + * Create operational request queue by issuing MPI request + * through admin queue. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_create_op_req_q(struct mpi3mr_ioc *mrioc, u16 idx, + u16 reply_qid) +{ + struct mpi3_create_request_queue_request create_req; + struct op_req_qinfo *op_req_q = mrioc->req_qinfo + idx; + int retval = 0; + u16 req_qid = 0; + + + req_qid = op_req_q->qid; + + if (req_qid) { + retval = -1; + ioc_err(mrioc, "create request queue failed due to duplicate qid(%d)\n", + req_qid); + + return retval; + } + req_qid = idx + 1; + + op_req_q->num_requests = MPI3MR_OP_REQ_Q_QD; + op_req_q->ci = 0; + op_req_q->pi = 0; + op_req_q->reply_qid = reply_qid; + op_req_q->last_full_host_tag = MPI3MR_HOSTTAG_INVALID; + op_req_q->qfull_io_count = 0; + op_req_q->qfull_instances = 0; + spin_lock_init(&op_req_q->q_lock); + + if (!op_req_q->q_segments) { + retval = mpi3mr_alloc_op_req_q_segments(mrioc, idx); + if (retval) { + mpi3mr_free_op_req_q_segments(mrioc, idx); + goto out; + } + } + + memset(&create_req, 0, sizeof(create_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending create request queue failed due to command in use\n"); + goto out_unlock; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + create_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + create_req.function = MPI3_FUNCTION_CREATE_REQUEST_QUEUE; + create_req.queue_id = cpu_to_le16(req_qid); + if (mrioc->is_segqueue_enabled) { + create_req.flags = + MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_SEGMENTED; + create_req.base_address = cpu_to_le64( + op_req_q->q_segment_list_dma); + } else + create_req.base_address = cpu_to_le64( + op_req_q->q_segments[0].segment_dma); + create_req.reply_queue_id = cpu_to_le16(reply_qid); + create_req.size = cpu_to_le16(op_req_q->num_requests); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &create_req, + sizeof(create_req), 1); + if (retval) { + ioc_err(mrioc, "posting create request queue failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "create request queue timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_CREATEREQQ_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "create request queue returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + op_req_q->qid = req_qid; + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + + return retval; +} + +/** + * mpi3mr_create_op_queues - create operational queue pairs + * @mrioc: Adapter instance reference + * + * Allocate memory for operational queue meta data and call 
+ * create request and reply queue functions. + * + * Return: 0 on success, non-zero on failures. + */ +static int mpi3mr_create_op_queues(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u16 num_queues = 0, i = 0, msix_count_op_q = 1; + + num_queues = min_t(int, mrioc->facts.max_op_reply_q, + mrioc->facts.max_op_req_q); + + msix_count_op_q = + mrioc->intr_info_count - mrioc->op_reply_q_offset; + if (!mrioc->num_queues) + mrioc->num_queues = min_t(int, num_queues, msix_count_op_q); + /* + * During reset set the num_queues to the number of queues + * that was set before the reset. + */ + num_queues = mrioc->num_op_reply_q ? + mrioc->num_op_reply_q : mrioc->num_queues; + ioc_info(mrioc, "trying to create %d operational queue pairs\n", + num_queues); + + if (!mrioc->req_qinfo) { + mrioc->req_qinfo = kcalloc(num_queues, + sizeof(struct op_req_qinfo), GFP_KERNEL); + if (!mrioc->req_qinfo) { + retval = -1; + goto out_failed; + } + + mrioc->op_reply_qinfo = kzalloc(sizeof(struct op_reply_qinfo) * + num_queues, GFP_KERNEL); + if (!mrioc->op_reply_qinfo) { + retval = -1; + goto out_failed; + } + } + + if (mrioc->is_segqueue_enabled) + ioc_info(mrioc, + "allocating operational queues through segmented queues\n"); + + for (i = 0; i < num_queues; i++) { + if (mpi3mr_create_op_reply_q(mrioc, i)) { + ioc_err(mrioc, + "cannot create operational reply queue %d\n", i); + break; + } + if (mpi3mr_create_op_req_q(mrioc, i, + mrioc->op_reply_qinfo[i].qid)) { + ioc_err(mrioc, + "cannot create operational request queue %d\n", i); + mpi3mr_delete_op_reply_q(mrioc, i); + break; + } + } + + if (i == 0) { + /* Not even one queue is created successfully*/ + retval = -1; + goto out_failed; + } + mrioc->num_op_reply_q = mrioc->num_op_req_q = i; + ioc_info(mrioc, "successfully created %d operational queue pairs(default/polled) queue = (%d/%d)\n", + mrioc->num_op_reply_q, mrioc->default_qcount, + mrioc->active_poll_qcount); + + return retval; +out_failed: + kfree(mrioc->req_qinfo); + mrioc->req_qinfo = NULL; + + kfree(mrioc->op_reply_qinfo); + mrioc->op_reply_qinfo = NULL; + + + return retval; +} + +/** + * mpi3mr_setup_admin_qpair - Setup admin queue pair + * @mrioc: Adapter instance reference + * + * Allocate memory for admin queue pair if required and register + * the admin queue with the controller. + * + * Return: 0 on success, non-zero on failures. 
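+ *
+ * The admin queue depths are derived from fixed sizes, and both depths
+ * are programmed through a single register (replies in the upper 16
+ * bits, requests in the lower 16 bits), as done in the body below:
+ *
+ *	num_admin_req = MPI3MR_ADMIN_REQ_Q_SIZE / MPI3MR_ADMIN_REQ_FRAME_SZ;
+ *	num_admin_entries = (num_admin_replies << 16) | num_admin_req;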
+ */ +static int mpi3mr_setup_admin_qpair(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u32 num_admin_entries = 0; + + mrioc->admin_req_q_sz = MPI3MR_ADMIN_REQ_Q_SIZE; + mrioc->num_admin_req = mrioc->admin_req_q_sz / + MPI3MR_ADMIN_REQ_FRAME_SZ; + mrioc->admin_req_ci = mrioc->admin_req_pi = 0; + mrioc->admin_req_base = NULL; + + mrioc->admin_reply_q_sz = MPI3MR_ADMIN_REPLY_Q_SIZE; + mrioc->num_admin_replies = mrioc->admin_reply_q_sz / + MPI3MR_ADMIN_REPLY_FRAME_SZ; + mrioc->admin_reply_ci = 0; + mrioc->admin_reply_ephase = 1; + mrioc->admin_reply_base = NULL; + atomic_set(&mrioc->admin_reply_q_in_use, 0); + + if (!mrioc->admin_req_base) { + mrioc->admin_req_base = dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->admin_req_q_sz, &mrioc->admin_req_dma, GFP_KERNEL); + + if (!mrioc->admin_req_base) { + retval = -1; + goto out_failed; + } + + mrioc->admin_reply_base = dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->admin_reply_q_sz, &mrioc->admin_reply_dma, + GFP_KERNEL); + + if (!mrioc->admin_reply_base) { + retval = -1; + goto out_failed; + } + + } + + num_admin_entries = (mrioc->num_admin_replies << 16) | + (mrioc->num_admin_req); + writel(num_admin_entries, &mrioc->sysif_regs->admin_queue_num_entries); + mpi3mr_writeq(mrioc->admin_req_dma, + &mrioc->sysif_regs->admin_request_queue_address, + &mrioc->adm_req_q_bar_writeq_lock); + mpi3mr_writeq(mrioc->admin_reply_dma, + &mrioc->sysif_regs->admin_reply_queue_address, + &mrioc->adm_reply_q_bar_writeq_lock); + writel(mrioc->admin_req_pi, &mrioc->sysif_regs->admin_request_queue_pi); + writel(mrioc->admin_reply_ci, &mrioc->sysif_regs->admin_reply_queue_ci); + return retval; + +out_failed: + + if (mrioc->admin_reply_base) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->admin_reply_q_sz, + mrioc->admin_reply_base, mrioc->admin_reply_dma); + mrioc->admin_reply_base = NULL; + } + if (mrioc->admin_req_base) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->admin_req_q_sz, + mrioc->admin_req_base, mrioc->admin_req_dma); + mrioc->admin_req_base = NULL; + } + return retval; +} + +/** + * mpi3mr_process_factsdata - Process IOC facts data + * @mrioc: Adapter instance reference + * @facts_data: IOC facts data pointer + * + * Convert IOC facts data into cpu endianness and cache it in + * the driver . + * + * Return: Nothing. 
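+ *
+ * The operational request entry size is decoded from the IOC
+ * configuration register as a power of two:
+ *
+ *	req_sz = 1 << ((ioc_config & MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ)
+ *			>> MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ_SHIFT);
+ *
+ * so a field value of 7 (a hypothetical example, not a guaranteed
+ * value) would yield a 128 byte request frame.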
+ */ +static void mpi3mr_process_factsdata(struct mpi3mr_ioc *mrioc, + struct mpi3_ioc_facts_data *facts_data) +{ + u32 ioc_config, req_sz, facts_flags; + + if ((le16_to_cpu(facts_data->ioc_facts_data_length)) != + (sizeof(*facts_data) / 4)) + ioc_warn(mrioc, + "ioc_facts data length mismatch driver_sz(%ld), firmware_sz(%d)\n", + sizeof(*facts_data), + le16_to_cpu(facts_data->ioc_facts_data_length) * 4); + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + req_sz = 1 << ((ioc_config & MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ) >> + MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ_SHIFT); + if (le16_to_cpu(facts_data->ioc_request_frame_size) != (req_sz/4)) + ioc_warn(mrioc, + "ioc_facts request frame size mismatch hardware_size(%d), firmware_sz(%d)\n", + req_sz / 4, + le16_to_cpu(facts_data->ioc_request_frame_size)); + + memset(&mrioc->facts, 0, sizeof(mrioc->facts)); + + facts_flags = le32_to_cpu(facts_data->flags); + mrioc->facts.op_req_sz = req_sz; + mrioc->op_reply_desc_sz = 1 << ((ioc_config & + MPI3_SYSIF_IOC_CONFIG_OPER_RPY_ENT_SZ) >> + MPI3_SYSIF_IOC_CONFIG_OPER_RPY_ENT_SZ_SHIFT); + + mrioc->facts.ioc_num = facts_data->ioc_number; + mrioc->facts.who_init = facts_data->who_init; + mrioc->facts.max_msix_vectors = le16_to_cpu(facts_data->max_msix_vectors); + mrioc->facts.personality = (facts_flags & + MPI3_IOCFACTS_FLAGS_PERSONALITY_MASK); + mrioc->facts.dma_mask = (facts_flags & + MPI3_IOCFACTS_FLAGS_DMA_ADDRESS_WIDTH_MASK) >> + MPI3_IOCFACTS_FLAGS_DMA_ADDRESS_WIDTH_SHIFT; + mrioc->facts.protocol_flags = facts_data->protocol_flags; + mrioc->facts.mpi_version = le32_to_cpu(facts_data->mpi_version.word); + mrioc->facts.max_reqs = + le16_to_cpu(facts_data->max_outstanding_requests); + mrioc->facts.product_id = le16_to_cpu(facts_data->product_id); + mrioc->facts.reply_sz = le16_to_cpu(facts_data->reply_frame_size) * 4; + mrioc->facts.exceptions = le16_to_cpu(facts_data->ioc_exceptions); + mrioc->facts.max_perids = le16_to_cpu(facts_data->max_persistent_id); + mrioc->facts.max_vds = le16_to_cpu(facts_data->max_vds); + mrioc->facts.max_hpds = le16_to_cpu(facts_data->max_host_pds); + mrioc->facts.max_advhpds = le16_to_cpu(facts_data->max_adv_host_pds); + mrioc->facts.max_raid_pds = le16_to_cpu(facts_data->max_raid_pds); + mrioc->facts.max_nvme = le16_to_cpu(facts_data->max_nvme); + mrioc->facts.max_pcie_switches = + le16_to_cpu(facts_data->max_pcie_switches); + mrioc->facts.max_sasexpanders = + le16_to_cpu(facts_data->max_sas_expanders); + mrioc->facts.max_sasinitiators = + le16_to_cpu(facts_data->max_sas_initiators); + mrioc->facts.max_enclosures = le16_to_cpu(facts_data->max_enclosures); + mrioc->facts.min_devhandle = le16_to_cpu(facts_data->min_dev_handle); + mrioc->facts.max_devhandle = le16_to_cpu(facts_data->max_dev_handle); + mrioc->facts.max_op_req_q = + le16_to_cpu(facts_data->max_operational_request_queues); + mrioc->facts.max_op_reply_q = + le16_to_cpu(facts_data->max_operational_reply_queues); + mrioc->facts.ioc_capabilities = + le32_to_cpu(facts_data->ioc_capabilities); + mrioc->facts.fw_ver.build_num = + le16_to_cpu(facts_data->fw_version.build_num); + mrioc->facts.fw_ver.cust_id = + le16_to_cpu(facts_data->fw_version.customer_id); + mrioc->facts.fw_ver.ph_minor = facts_data->fw_version.phase_minor; + mrioc->facts.fw_ver.ph_major = facts_data->fw_version.phase_major; + mrioc->facts.fw_ver.gen_minor = facts_data->fw_version.gen_minor; + mrioc->facts.fw_ver.gen_major = facts_data->fw_version.gen_major; + mrioc->msix_count = min_t(int, mrioc->msix_count, + 
mrioc->facts.max_msix_vectors); + mrioc->facts.sge_mod_mask = facts_data->sge_modifier_mask; + mrioc->facts.sge_mod_value = facts_data->sge_modifier_value; + mrioc->facts.sge_mod_shift = facts_data->sge_modifier_shift; + mrioc->facts.shutdown_timeout = + le16_to_cpu(facts_data->shutdown_timeout); + mrioc->facts.diag_trace_sz = + le32_to_cpu(facts_data->diag_trace_size); + mrioc->facts.diag_fw_sz = + le32_to_cpu(facts_data->diag_fw_size); + mrioc->facts.diag_drvr_sz = le32_to_cpu(facts_data->diag_driver_size); + + mrioc->facts.max_dev_per_tg = + facts_data->max_devices_per_throttle_group; + mrioc->facts.io_throttle_data_length = + le16_to_cpu(facts_data->io_throttle_data_length); + mrioc->facts.max_io_throttle_group = + le16_to_cpu(facts_data->max_io_throttle_group); + mrioc->facts.io_throttle_low = le16_to_cpu(facts_data->io_throttle_low); + mrioc->facts.io_throttle_high = + le16_to_cpu(facts_data->io_throttle_high); + + /*Store in 512b block count*/ + if (mrioc->facts.io_throttle_data_length) + mrioc->io_throttle_data_length = + (mrioc->facts.io_throttle_data_length * 2 * 4); + else + /* set the length to 1MB + 1K to disable throttle*/ + mrioc->io_throttle_data_length = MPI3MR_MAX_SECTORS + 2; + + mrioc->io_throttle_high = (mrioc->facts.io_throttle_high * 2 * 1024); + mrioc->io_throttle_low = (mrioc->facts.io_throttle_low * 2 * 1024); + + ioc_info(mrioc, + "ioc_num(%d), max_op_req_queues (%d), max_op_reply_queues(%d), max_requests(%d), max_msix_vectors(%d)\n", + mrioc->facts.ioc_num, mrioc->facts.max_op_req_q, + mrioc->facts.max_op_reply_q, mrioc->facts.max_reqs, + mrioc->facts.max_msix_vectors); + ioc_info(mrioc, + "max_device_handles(%d), min_device_handles(%d), max_perst_ids(%d)\n", + mrioc->facts.max_devhandle, mrioc->facts.min_devhandle, + mrioc->facts.max_perids); + ioc_info(mrioc, + "sge_modifier_mask(0x%02x), sge_modifier_value(0x%02x), sge_modifier_shift(0x%02x)\n", + mrioc->facts.sge_mod_mask, mrioc->facts.sge_mod_value, + mrioc->facts.sge_mod_shift); + ioc_info(mrioc, "dma_mask(%d), initial_port_enable_status(0x%02x)\n", + mrioc->facts.dma_mask, (facts_flags & + MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_MASK)); + ioc_info(mrioc, + "diag_trace_sz(%dKB), diag_fw_size(%dKB), diag_drvr_sizez(%dKB)\n", + mrioc->facts.diag_trace_sz / 1024, mrioc->facts.diag_fw_sz / 1024, + mrioc->facts.diag_drvr_sz / 1024); + ioc_info(mrioc, + "max_dev_per_throttle_group(%d), max_throttle_groups(%d), io_throttle_data_len(%dKiB), io_throttle_high(%dMiB), io_throttle_low(%dMiB)\n", + mrioc->facts.max_dev_per_tg, mrioc->facts.max_io_throttle_group, + mrioc->facts.io_throttle_data_length * 4, + mrioc->facts.io_throttle_high, mrioc->facts.io_throttle_low); + +} + +/** + * mpi3mr_issue_iocfacts - Send IOC Facts + * @mrioc: Adapter instance reference + * @facts_data: IOC facts data pointer + * + * Issue IOC Facts MPI request through admin queue and wait for + * the completion of it or time out. + * + * Return: 0 on success, non-zero on failures. 
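+ *
+ * The facts data is returned through a single simple SGE pointing at a
+ * host DMA buffer sized for struct mpi3_ioc_facts_data, following the
+ * same init-command pattern used throughout this file:
+ *
+ *	mpi3mr_add_sg_single(&iocfacts_req.sgl, sgl_flags, data_len,
+ *	    data_dma);
+ *	retval = mpi3mr_admin_request_post(mrioc, &iocfacts_req,
+ *	    sizeof(iocfacts_req), 1);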
+ */ +static int mpi3mr_issue_iocfacts(struct mpi3mr_ioc *mrioc, + struct mpi3_ioc_facts_data *facts_data) +{ + struct mpi3_ioc_facts_request iocfacts_req; + void *data = NULL; + dma_addr_t data_dma; + u32 data_len = sizeof(*facts_data); + int retval = 0; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + data = dma_zalloc_coherent(&mrioc->pdev->dev, data_len, &data_dma, + GFP_KERNEL); + + if (!data) { + retval = -1; + goto out; + } + + memset(&iocfacts_req, 0, sizeof(iocfacts_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "getting ioc_facts failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + iocfacts_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + iocfacts_req.function = MPI3_FUNCTION_IOC_FACTS; + + mpi3mr_add_sg_single(&iocfacts_req.sgl, sgl_flags, data_len, + data_dma); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &iocfacts_req, + sizeof(iocfacts_req), 1); + if (retval) { + ioc_err(mrioc, "posting ioc_facts request failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "ioc_facts timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_IOCFACTS_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "ioc_facts returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + memcpy(facts_data, (u8 *)data, data_len); + mpi3mr_process_factsdata(mrioc, facts_data); +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + +out: + if (data) + dma_free_coherent(&mrioc->pdev->dev, data_len, data, data_dma); + + return retval; +} + +/** + * mpi3mr_check_reset_dma_mask - Check and set DMA mask + * @mrioc: Adapter instance reference + * + * Check whether the new DMA mask requested through IOCFacts by + * firmware needs to be set, and if so, set it. + * + * Return: 0 on success, non-zero on failure. + */ +static inline int mpi3mr_check_reset_dma_mask(struct mpi3mr_ioc *mrioc) +{ + struct pci_dev *pdev = mrioc->pdev; + int r; + u64 facts_dma_mask = DMA_BIT_MASK(mrioc->facts.dma_mask); + + if (!mrioc->facts.dma_mask || (mrioc->dma_mask <= facts_dma_mask)) + return 0; + + ioc_info(mrioc, "changing DMA mask from 0x%016llX to 0x%016llX\n", + mrioc->dma_mask, facts_dma_mask); + + r = dma_set_mask_and_coherent(&pdev->dev, facts_dma_mask); + if (r) { + ioc_err(mrioc, "setting DMA mask to 0x%016llX failed: %d\n", + facts_dma_mask, r); + return r; + } + mrioc->dma_mask = facts_dma_mask; + return r; +} + + +/** + * mpi3mr_alloc_reply_sense_bufs - Allocate reply and sense buffers + * @mrioc: Adapter instance reference + * + * Allocate and initialize the reply free buffers, sense + * buffers, reply free queue and sense buffer queue. + * + * Return: 0 on success, non-zero on failures.
+ */ +static int mpi3mr_alloc_reply_sense_bufs(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u32 sz, i; + + + if (mrioc->init_cmds.reply) + return retval; + + mrioc->init_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->init_cmds.reply) + goto out_failed; + + mrioc->bsg_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->bsg_cmds.reply) + goto out_failed; + + mrioc->host_tm_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->host_tm_cmds.reply) + goto out_failed; + + mrioc->pel_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->pel_cmds.reply) + goto out_failed; + + mrioc->pel_abort_cmd.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->pel_abort_cmd.reply) + goto out_failed; + + mrioc->transport_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->transport_cmds.reply) + goto out_failed; + + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) { + mrioc->dev_rmhs_cmds[i].reply = kzalloc(mrioc->reply_sz, + GFP_KERNEL); + if (!mrioc->dev_rmhs_cmds[i].reply) + goto out_failed; + } + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) { + mrioc->sysfs_tm_cmds[i].reply = kzalloc(mrioc->reply_sz, + GFP_KERNEL); + if (!mrioc->sysfs_tm_cmds[i].reply) + goto out_failed; + } + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) { + mrioc->evtack_cmds[i].reply = kzalloc(mrioc->reply_sz, + GFP_KERNEL); + if (!mrioc->evtack_cmds[i].reply) + goto out_failed; + } + mrioc->dev_handle_bitmap_sz = mrioc->facts.max_devhandle / 8; + if (mrioc->facts.max_devhandle % 8) + mrioc->dev_handle_bitmap_sz++; + mrioc->removepend_bitmap = kzalloc(mrioc->dev_handle_bitmap_sz, + GFP_KERNEL); + if (!mrioc->removepend_bitmap) + goto out_failed; + + mrioc->devrem_bitmap_sz = MPI3MR_NUM_DEVRMCMD / 8; + if (MPI3MR_NUM_DEVRMCMD % 8) + mrioc->devrem_bitmap_sz++; + mrioc->devrem_bitmap = kzalloc(mrioc->devrem_bitmap_sz, + GFP_KERNEL); + if (!mrioc->devrem_bitmap) + goto out_failed; + + mrioc->evtack_cmds_bitmap_sz = MPI3MR_NUM_EVTACKCMD / 8; + if (MPI3MR_NUM_EVTACKCMD % 8) + mrioc->evtack_cmds_bitmap_sz++; + mrioc->evtack_cmds_bitmap = kzalloc(mrioc->evtack_cmds_bitmap_sz, + GFP_KERNEL); + if (!mrioc->evtack_cmds_bitmap) + goto out_failed; + + mrioc->num_reply_bufs = mrioc->facts.max_reqs + MPI3MR_NUM_EVT_REPLIES; + mrioc->reply_free_qsz = mrioc->num_reply_bufs + 1; + mrioc->num_sense_bufs = mrioc->facts.max_reqs / MPI3MR_SENSEBUF_FACTOR; + mrioc->sense_buf_q_sz = mrioc->num_sense_bufs + 1; + + /* reply buffer pool, 16 byte align */ + sz = mrioc->num_reply_bufs * mrioc->reply_sz; + mrioc->reply_buf_pool = dma_pool_create("reply_buf pool", + &mrioc->pdev->dev, sz, 16, 0); + if (!mrioc->reply_buf_pool) { + ioc_err(mrioc, "reply buf pool: dma_pool_create failed\n"); + goto out_failed; + } + + mrioc->reply_buf = dma_pool_zalloc(mrioc->reply_buf_pool, GFP_KERNEL, + &mrioc->reply_buf_dma); + if (!mrioc->reply_buf) + goto out_failed; + + mrioc->reply_buf_dma_max_address = mrioc->reply_buf_dma + sz; + + /* reply free queue, 8 byte align */ + sz = mrioc->reply_free_qsz * 8; + mrioc->reply_free_q_pool = dma_pool_create("reply_free_q pool", + &mrioc->pdev->dev, sz, 8, 0); + if (!mrioc->reply_free_q_pool) { + ioc_err(mrioc, "reply_free_q pool: dma_pool_create failed\n"); + goto out_failed; + } + mrioc->reply_free_q = dma_pool_zalloc(mrioc->reply_free_q_pool, + GFP_KERNEL, &mrioc->reply_free_q_dma); + if (!mrioc->reply_free_q) + goto out_failed; + + /* sense buffer pool, 4 byte align */ + sz = mrioc->num_sense_bufs * MPI3MR_SENSE_BUF_SZ; + mrioc->sense_buf_pool = dma_pool_create("sense_buf 
pool", + &mrioc->pdev->dev, sz, 4, 0); + if (!mrioc->sense_buf_pool) { + ioc_err(mrioc, "sense_buf pool: dma_pool_create failed\n"); + goto out_failed; + } + mrioc->sense_buf = dma_pool_zalloc(mrioc->sense_buf_pool, GFP_KERNEL, + &mrioc->sense_buf_dma); + if (!mrioc->sense_buf) + goto out_failed; + + /* sense buffer queue, 8 byte align */ + sz = mrioc->sense_buf_q_sz * 8; + mrioc->sense_buf_q_pool = dma_pool_create("sense_buf_q pool", + &mrioc->pdev->dev, sz, 8, 0); + if (!mrioc->sense_buf_q_pool) { + ioc_err(mrioc, "sense_buf_q pool: dma_pool_create failed\n"); + goto out_failed; + } + mrioc->sense_buf_q = dma_pool_zalloc(mrioc->sense_buf_q_pool, + GFP_KERNEL, &mrioc->sense_buf_q_dma); + if (!mrioc->sense_buf_q) + goto out_failed; + + return retval; + +out_failed: + retval = -1; + return retval; +} + +/** + * mpimr_initialize_reply_sbuf_queues - initialize reply sense + * buffers + * @mrioc: Adapter instance reference + * + * Helper function to initialize reply and sense buffers along + * with some debug prints. + * + * Return: None. + */ +static void mpimr_initialize_reply_sbuf_queues(struct mpi3mr_ioc *mrioc) +{ + u32 sz, i; + dma_addr_t phy_addr; + + sz = mrioc->num_reply_bufs * mrioc->reply_sz; + ioc_info(mrioc, + "reply buf pool(0x%p): depth(%d), frame_size(%d), pool_size(%d kB), reply_dma(0x%llx)\n", + mrioc->reply_buf, mrioc->num_reply_bufs, mrioc->reply_sz, + (sz / 1024), (unsigned long long)mrioc->reply_buf_dma); + sz = mrioc->reply_free_qsz * 8; + ioc_info(mrioc, + "reply_free_q pool(0x%p): depth(%d), frame_size(%d), pool_size(%d kB), reply_dma(0x%llx)\n", + mrioc->reply_free_q, mrioc->reply_free_qsz, 8, (sz / 1024), + (unsigned long long)mrioc->reply_free_q_dma); + sz = mrioc->num_sense_bufs * MPI3MR_SENSE_BUF_SZ; + ioc_info(mrioc, + "sense_buf pool(0x%p): depth(%d), frame_size(%d), pool_size(%d kB), sense_dma(0x%llx)\n", + mrioc->sense_buf, mrioc->num_sense_bufs, MPI3MR_SENSE_BUF_SZ, + (sz / 1024), (unsigned long long)mrioc->sense_buf_dma); + sz = mrioc->sense_buf_q_sz * 8; + ioc_info(mrioc, + "sense_buf_q pool(0x%p): depth(%d), frame_size(%d), pool_size(%d kB), sense_dma(0x%llx)\n", + mrioc->sense_buf_q, mrioc->sense_buf_q_sz, 8, (sz / 1024), + (unsigned long long)mrioc->sense_buf_q_dma); + + /* initialize Reply buffer Queue */ + for (i = 0, phy_addr = mrioc->reply_buf_dma; + i < mrioc->num_reply_bufs; i++, phy_addr += mrioc->reply_sz) + mrioc->reply_free_q[i] = cpu_to_le64(phy_addr); + mrioc->reply_free_q[i] = cpu_to_le64(0); + + /* initialize Sense Buffer Queue */ + for (i = 0, phy_addr = mrioc->sense_buf_dma; + i < mrioc->num_sense_bufs; i++, phy_addr += MPI3MR_SENSE_BUF_SZ) + mrioc->sense_buf_q[i] = cpu_to_le64(phy_addr); + mrioc->sense_buf_q[i] = cpu_to_le64(0); +} + +/** + * mpi3mr_issue_iocinit - Send IOC Init + * @mrioc: Adapter instance reference + * + * Issue IOC Init MPI request through admin queue and wait for + * the completion of it or time out. + * + * Return: 0 on success, non-zero on failures. 
+ */ +static int mpi3mr_issue_iocinit(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_ioc_init_request iocinit_req; + struct mpi3_driver_info_layout *drv_info; + dma_addr_t data_dma; + u32 data_len = sizeof(*drv_info); + int retval = 0; + ktime_t current_time; + + drv_info = dma_zalloc_coherent(&mrioc->pdev->dev, data_len, &data_dma, + GFP_KERNEL); + if (!drv_info) { + retval = -1; + goto out; + } + mpimr_initialize_reply_sbuf_queues(mrioc); + + drv_info->information_length = cpu_to_le32(data_len); + strscpy(drv_info->driver_signature, "Broadcom", sizeof(drv_info->driver_signature)); + strscpy(drv_info->os_name, utsname()->sysname, sizeof(drv_info->os_name)); + strscpy(drv_info->os_version, utsname()->release, sizeof(drv_info->os_version)); + strscpy(drv_info->driver_name, MPI3MR_DRIVER_NAME, sizeof(drv_info->driver_name)); + strscpy(drv_info->driver_version, MPI3MR_DRIVER_VERSION, sizeof(drv_info->driver_version)); + strscpy(drv_info->driver_release_date, MPI3MR_DRIVER_RELDATE, + sizeof(drv_info->driver_release_date)); + drv_info->driver_capabilities = 0; + memcpy((u8 *)&mrioc->driver_info, (u8 *)drv_info, + sizeof(mrioc->driver_info)); + + memset(&iocinit_req, 0, sizeof(iocinit_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending ioc_init failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + iocinit_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + iocinit_req.function = MPI3_FUNCTION_IOC_INIT; + iocinit_req.mpi_version.mpi3_version.dev = MPI3_VERSION_DEV; + iocinit_req.mpi_version.mpi3_version.unit = MPI3_VERSION_UNIT; + iocinit_req.mpi_version.mpi3_version.major = MPI3_VERSION_MAJOR; + iocinit_req.mpi_version.mpi3_version.minor = MPI3_VERSION_MINOR; + iocinit_req.who_init = MPI3_WHOINIT_HOST_DRIVER; + iocinit_req.reply_free_queue_depth = cpu_to_le16(mrioc->reply_free_qsz); + iocinit_req.reply_free_queue_address = + cpu_to_le64(mrioc->reply_free_q_dma); + iocinit_req.sense_buffer_length = cpu_to_le16(MPI3MR_SENSE_BUF_SZ); + iocinit_req.sense_buffer_free_queue_depth = + cpu_to_le16(mrioc->sense_buf_q_sz); + iocinit_req.sense_buffer_free_queue_address = + cpu_to_le64(mrioc->sense_buf_q_dma); + iocinit_req.driver_information_address = cpu_to_le64(data_dma); + + current_time = ktime_get_real(); + iocinit_req.time_stamp = cpu_to_le64(ktime_to_ms(current_time)); + + if (enable_dix) + iocinit_req.msg_flags |= + MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_SEPARATED; + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &iocinit_req, + sizeof(iocinit_req), 1); + if (retval) { + ioc_err(mrioc, "posting ioc_init failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_IOCINIT_TIMEOUT); + ioc_err(mrioc, "ioc_init timed out\n"); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "ioc_init returned with ioc_status(0x%04x) log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + + mrioc->reply_free_queue_host_index = mrioc->num_reply_bufs; + 
writel(mrioc->reply_free_queue_host_index, + &mrioc->sysif_regs->reply_free_host_index); + + mrioc->sbq_host_index = mrioc->num_sense_bufs; + writel(mrioc->sbq_host_index, + &mrioc->sysif_regs->sense_buffer_free_host_index); +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + +out: + if (drv_info) + dma_free_coherent(&mrioc->pdev->dev, data_len, drv_info, + data_dma); + + return retval; +} + +/** + * mpi3mr_unmask_events - Unmask events in event mask bitmap + * @mrioc: Adapter instance reference + * @event: MPI event ID + * + * Unmask the specific event by resetting the corresponding bit in + * the event_masks bitmap. + * + * Return: Nothing. + */ +static void mpi3mr_unmask_events(struct mpi3mr_ioc *mrioc, u16 event) +{ + u32 desired_event; + u8 word; + + if (event >= 128) + return; + + desired_event = (1 << (event % 32)); + word = event / 32; + + mrioc->event_masks[word] &= ~desired_event; +} + +/** + * mpi3mr_issue_event_notification - Send event notification + * @mrioc: Adapter instance reference + * + * Issue event notification MPI request through admin queue and + * wait for the completion of it or time out. + * + * Return: 0 on success, non-zero on failures. + */ +static int mpi3mr_issue_event_notification(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_event_notification_request evtnotify_req; + int retval = 0; + u8 i; + + memset(&evtnotify_req, 0, sizeof(evtnotify_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending event notification failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + evtnotify_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + evtnotify_req.function = MPI3_FUNCTION_EVENT_NOTIFICATION; + for (i = 0; i < MPI3_EVENT_NOTIFY_EVENTMASK_WORDS; i++) + evtnotify_req.event_masks[i] = + cpu_to_le32(mrioc->event_masks[i]); + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &evtnotify_req, + sizeof(evtnotify_req), 1); + if (retval) { + ioc_err(mrioc, "posting event notification failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "event notification timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_EVTNOTIFY_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "event notification returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + return retval; +} + +/** + * mpi3mr_process_event_ack - Process event acknowledgment + * @mrioc: Adapter instance reference + * @event: MPI3 event ID + * @event_ctx: event context + * + * Send event acknowledgment through admin queue and wait for + * it to complete. + * + * Return: 0 on success, non-zero on failures.
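+ *
+ * Caller sketch from an event handler (the 'event_reply' variable and
+ * the ack-required check are assumptions used for illustration, not
+ * taken from this hunk):
+ *
+ *	if (ack_required)
+ *		mpi3mr_process_event_ack(mrioc, event_reply->event,
+ *		    le32_to_cpu(event_reply->event_context));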
+ */ +int mpi3mr_process_event_ack(struct mpi3mr_ioc *mrioc, u8 event, + u32 event_ctx) +{ + struct mpi3_event_ack_request evtack_req; + int retval = 0; + + memset(&evtack_req, 0, sizeof(evtack_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending blocking event ack failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + evtack_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + evtack_req.function = MPI3_FUNCTION_EVENT_ACK; + evtack_req.event = event; + evtack_req.event_context = cpu_to_le32(event_ctx); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &evtack_req, + sizeof(evtack_req), 1); + if (retval) { + ioc_err(mrioc, "posting event ack request failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "blocking event ack request timed out\n"); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_RESET)) + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_EVTACK_TIMEOUT, 1); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "blocking event ack returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + return retval; +} + + +/** + * mpi3mr_alloc_chain_bufs - Allocate chain buffers + * @mrioc: Adapter instance reference + * + * Allocate chain buffers and set a bitmap to indicate free + * chain buffers. Chain buffers are used to pass the SGE + * information along with MPI3 SCSI IO requests for host I/O. 
+ * + * Return: 0 on success, non-zero on failure + */ +static int mpi3mr_alloc_chain_bufs(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u32 sz, i; + u16 num_chains; + + if (mrioc->chain_sgl_list) + return retval; + + num_chains = mrioc->max_host_ios; + + if (enable_dix) + num_chains *= 2; + + mrioc->chain_buf_count = num_chains; + sz = sizeof(struct chain_element) * num_chains; + mrioc->chain_sgl_list = kzalloc(sz, GFP_KERNEL); + if (!mrioc->chain_sgl_list) + goto out_failed; + + sz = MPI3MR_CHAINSGE_SIZE; + mrioc->chain_buf_pool = dma_pool_create("chain_buf pool", + &mrioc->pdev->dev, sz, 16, 0); + if (!mrioc->chain_buf_pool) { + ioc_err(mrioc, "chain buf pool: dma_pool_create failed\n"); + goto out_failed; + } + + for (i = 0; i < num_chains; i++) { + mrioc->chain_sgl_list[i].addr = + dma_pool_zalloc(mrioc->chain_buf_pool, GFP_KERNEL, + &mrioc->chain_sgl_list[i].dma_addr); + + if (!mrioc->chain_sgl_list[i].addr) + goto out_failed; + } + mrioc->chain_bitmap_sz = num_chains / 8; + if (num_chains % 8) + mrioc->chain_bitmap_sz++; + mrioc->chain_bitmap = kzalloc(mrioc->chain_bitmap_sz, GFP_KERNEL); + if (!mrioc->chain_bitmap) + goto out_failed; + return retval; +out_failed: + retval = -1; + return retval; +} + + +/** + * mpi3mr_port_enable_complete - Mark port enable complete + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Call back for asynchronous port enable request sets the + * driver command to indicate port enable request is complete. + * + * Return: Nothing + */ +static void mpi3mr_port_enable_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + drv_cmd->callback = NULL; + mrioc->scan_started = 0; + if (drv_cmd->state & MPI3MR_CMD_RESET) + mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR; + else + mrioc->scan_failed = drv_cmd->ioc_status; + drv_cmd->state = MPI3MR_CMD_NOTUSED; +} + +/** + * mpi3mr_issue_port_enable - Issue Port Enable + * @mrioc: Adapter instance reference + * @async: Flag to wait for completion or not + * + * Issue Port Enable MPI request through admin queue and if the + * async flag is not set wait for the completion of the port + * enable or time out. + * + * Return: 0 on success, non-zero on failures. 
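mpi3mr_alloc_chain_bufs() above sizes chain_bitmap by rounding the chain count up to a whole number of bytes (num_chains / 8, plus one byte when there is a remainder). A tiny stand-alone sketch of that rounding using the equivalent (n + 7) / 8 form; the helper name is illustrative:

#include <stdio.h>

// Same result as nbits / 8 plus an extra byte when nbits % 8 != 0.
static unsigned int bitmap_bytes(unsigned int nbits)
{
    return (nbits + 7) / 8;
}

int main(void)
{
    unsigned int n;

    for (n = 1; n <= 17; n += 8)
        printf("%u bit(s) -> %u byte(s)\n", n, bitmap_bytes(n));  // 1->1, 9->2, 17->3
    return 0;
}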
+ */ +int mpi3mr_issue_port_enable(struct mpi3mr_ioc *mrioc, u8 async) +{ + struct mpi3_port_enable_request pe_req; + int retval = 0; + u32 pe_timeout = MPI3MR_PORTENABLE_TIMEOUT; + + memset(&pe_req, 0, sizeof(pe_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending port enable failed due to command is in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + if (async) { + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = mpi3mr_port_enable_complete; + } else { + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + init_completion(&mrioc->init_cmds.done); + } + pe_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + pe_req.function = MPI3_FUNCTION_PORT_ENABLE; + + retval = mpi3mr_admin_request_post(mrioc, &pe_req, sizeof(pe_req), 1); + if (retval) { + ioc_err(mrioc, "posting port enable failed\n"); + goto out_unlock; + } + if (async) { + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + + wait_for_completion_timeout(&mrioc->init_cmds.done, (pe_timeout * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "port enable timed out\n"); + retval = -1; + mpi3mr_check_rh_fault_ioc(mrioc, MPI3MR_RESET_FROM_PE_TIMEOUT); + goto out_unlock; + } + mpi3mr_port_enable_complete(mrioc, &mrioc->init_cmds); + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + return retval; +} + +/* Protocol type to name mapper structure */ +static const struct { + u8 protocol; + char *name; +} mpi3mr_protocols[] = { + { MPI3_IOCFACTS_PROTOCOL_SCSI_INITIATOR, "Initiator" }, + { MPI3_IOCFACTS_PROTOCOL_SCSI_TARGET, "Target" }, + { MPI3_IOCFACTS_PROTOCOL_NVME, "NVMe attachment" }, +}; + +/* Capability to name mapper structure */ +static const struct { + u32 capability; + char *name; +} mpi3mr_capabilities[] = { + { MPI3_IOCFACTS_CAPABILITY_RAID_CAPABLE, "RAID" }, + { MPI3_IOCFACTS_CAPABILITY_MULTIPATH_ENABLED, "MultiPath" }, +}; + +/** + * mpi3mr_print_ioc_info - Display controller information + * @mrioc: Adapter instance reference + * + * Display controller personality, capability, supported + * protocols etc. 
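mpi3mr_print_ioc_info(), documented above, folds the protocol and capability flag words into comma-separated name strings using the two mapper tables. A user-space sketch of that flag-to-name folding into a bounded buffer; the flag values and names below are made up for illustration and are not the MPI3 definitions:

#include <stdio.h>

struct flag_name {
    unsigned int flag;
    const char *name;
};

static void flags_to_string(unsigned int flags, const struct flag_name *tbl,
                            size_t entries, char *buf, size_t buflen)
{
    size_t used = 0;
    size_t i;

    buf[0] = '\0';
    for (i = 0; i < entries; i++) {
        if (!(flags & tbl[i].flag))
            continue;
        // prepend a comma once something is already in the buffer
        used += snprintf(buf + used, buflen - used, "%s%s",
                         used ? "," : "", tbl[i].name);
        if (used >= buflen)          // truncated; stop appending
            break;
    }
}

int main(void)
{
    static const struct flag_name tbl[] = {
        { 0x1, "Initiator" }, { 0x2, "Target" }, { 0x8, "NVMe attachment" },
    };
    char out[64];

    flags_to_string(0x9, tbl, 3, out, sizeof(out));
    printf("%s\n", out);             // Initiator,NVMe attachment
    return 0;
}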
+ * + * Return: Nothing + */ +static void +mpi3mr_print_ioc_info(struct mpi3mr_ioc *mrioc) +{ + int i = 0, bytes_wrote = 0; + char personality[16]; + char protocol[50] = {0}; + char capabilities[100] = {0}; + bool is_string_nonempty = false; + struct mpi3mr_compimg_ver *fwver = &mrioc->facts.fw_ver; + + switch (mrioc->facts.personality) { + case MPI3_IOCFACTS_FLAGS_PERSONALITY_EHBA: + strncpy(personality, "Enhanced HBA", sizeof(personality)); + break; + case MPI3_IOCFACTS_FLAGS_PERSONALITY_RAID_DDR: + strncpy(personality, "RAID", sizeof(personality)); + break; + default: + strncpy(personality, "Unknown", sizeof(personality)); + break; + } + + ioc_info(mrioc, "running in %s Personality", personality); + + ioc_info(mrioc, "firmware version(%d.%d.%d.%d.%05d-%05d)\n", + fwver->gen_major, fwver->gen_minor, fwver->ph_major, + fwver->ph_minor, fwver->cust_id, fwver->build_num); + + for (i = 0; i < ARRAY_SIZE(mpi3mr_protocols); i++) { + if (mrioc->facts.protocol_flags & + mpi3mr_protocols[i].protocol) { + if (is_string_nonempty && + (bytes_wrote < sizeof(protocol))) + bytes_wrote += snprintf(protocol + bytes_wrote, + (sizeof(protocol) - bytes_wrote), ","); + + if (bytes_wrote < sizeof(protocol)) + bytes_wrote += snprintf(protocol + bytes_wrote, + (sizeof(protocol) - bytes_wrote), "%s", + mpi3mr_protocols[i].name); + is_string_nonempty = true; + } + } + + bytes_wrote = 0; + is_string_nonempty = false; + for (i = 0; i < ARRAY_SIZE(mpi3mr_capabilities); i++) { + if (mrioc->facts.ioc_capabilities & + mpi3mr_capabilities[i].capability) { + if (is_string_nonempty && + (bytes_wrote < sizeof(capabilities))) + bytes_wrote += snprintf(capabilities + + bytes_wrote, + (sizeof(capabilities) - bytes_wrote), ","); + + if (bytes_wrote < sizeof(capabilities)) + bytes_wrote += snprintf(capabilities + + bytes_wrote, + (sizeof(capabilities) - bytes_wrote), "%s", + mpi3mr_capabilities[i].name); + is_string_nonempty = true; + } + } + + ioc_info(mrioc, "Protocol=(%s), Capabilities=(%s)\n", + protocol, capabilities); +} + +/** + * mpi3mr_cleanup_resources - Free PCI resources + * @mrioc: Adapter instance reference + * + * Unmap PCI device memory and disable PCI device. + * + * Return: 0 on success and non-zero on failure. + */ +void mpi3mr_cleanup_resources(struct mpi3mr_ioc *mrioc) +{ + struct pci_dev *pdev = mrioc->pdev; + + mpi3mr_cleanup_isr(mrioc); + + if (mrioc->sysif_regs) { + iounmap((void __iomem *)mrioc->sysif_regs); + mrioc->sysif_regs = NULL; + } + + if (pci_is_enabled(pdev)) { + if (mrioc->bars) + pci_release_selected_regions(pdev, mrioc->bars); + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + } +} + +/** + * mpi3mr_setup_resources - Enable PCI resources + * @mrioc: Adapter instance reference + * + * Enable PCI device memory, MSI-x registers and set DMA mask. + * + * Return: 0 on success and non-zero on failure. + */ +int mpi3mr_setup_resources(struct mpi3mr_ioc *mrioc) +{ + struct pci_dev *pdev = mrioc->pdev; + u32 memap_sz = 0; + int i, retval = 0, capb = 0; + u16 message_control; + u64 dma_mask = mrioc->dma_mask ? mrioc->dma_mask : + (((dma_get_required_mask(&pdev->dev) > DMA_BIT_MASK(32)) && + (sizeof(dma_addr_t) > 4)) ? 
DMA_BIT_MASK(64) : DMA_BIT_MASK(32)); + + if (pci_enable_device_mem(pdev)) { + ioc_err(mrioc, "pci_enable_device_mem: failed\n"); + retval = -ENODEV; + goto out_failed; + } + + capb = pci_find_capability(pdev, PCI_CAP_ID_MSIX); + if (!capb) { + ioc_err(mrioc, "unable to find MSI-X Capabilities\n"); + retval = -ENODEV; + goto out_failed; + } + mrioc->bars = pci_select_bars(pdev, IORESOURCE_MEM); + + if (pci_request_selected_regions(pdev, mrioc->bars, + mrioc->driver_name)) { + ioc_err(mrioc, "pci_request_selected_regions: failed\n"); + retval = -ENODEV; + goto out_failed; + } + + for (i = 0; (i < DEVICE_COUNT_RESOURCE); i++) { + if (pci_resource_flags(pdev, i) & IORESOURCE_MEM) { + mrioc->sysif_regs_phys = pci_resource_start(pdev, i); + memap_sz = pci_resource_len(pdev, i); + mrioc->sysif_regs = + ioremap(mrioc->sysif_regs_phys, memap_sz); + break; + } + } + + pci_enable_pcie_error_reporting(pdev); + + pci_set_master(pdev); + + retval = dma_set_mask_and_coherent(&pdev->dev, dma_mask); + if (retval) { + if (dma_mask != DMA_BIT_MASK(32)) { + ioc_warn(mrioc, "setting 64 bit DMA mask failed\n"); + dma_mask = DMA_BIT_MASK(32); + retval = dma_set_mask_and_coherent(&pdev->dev, + dma_mask); + } + if (retval) { + mrioc->dma_mask = 0; + ioc_err(mrioc, "setting 32 bit DMA mask also failed\n"); + goto out_failed; + } + } + mrioc->dma_mask = dma_mask; + + if (!mrioc->sysif_regs) { + ioc_err(mrioc, + "unable to map adapter memory or resource not found\n"); + retval = -EINVAL; + goto out_failed; + } + + pci_read_config_word(pdev, capb + 2, &message_control); + mrioc->msix_count = (message_control & 0x3FF) + 1; + + pci_save_state(pdev); + + pci_set_drvdata(pdev, mrioc->shost); + + mpi3mr_ioc_disable_intr(mrioc); + + ioc_info(mrioc, "iomem(0x%016llx), mapped(0x%p), size(%d)\n", + (unsigned long long)mrioc->sysif_regs_phys, + mrioc->sysif_regs, memap_sz); + ioc_info(mrioc, "number of MSI-X vectors found in capabilities: (%d)\n", + mrioc->msix_count); + +#if (KERNEL_VERSION(5, 12, 0) <= LINUX_VERSION_CODE) + if (!reset_devices && poll_queues > 0) + mrioc->requested_poll_qcount = min_t(int, poll_queues, + mrioc->msix_count - 2); +#endif + return retval; + +out_failed: + mpi3mr_cleanup_resources(mrioc); + return retval; +} + +/** + * mpi3mr_alloc_issue_host_diag_buf - Allocate and send host diag buffer + * @mrioc: Adapter instance reference + * + * Issue diagnostic buffer post (unconditional) MPI request through admin queue + * and wait for the completion of it or time out. 
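mpi3mr_setup_resources() above derives msix_count from the MSI-X capability's Message Control word, whose low bits encode the table size minus one. A short sketch of that decode, masking the field the same way the code above does:

#include <stdio.h>
#include <stdint.h>

static unsigned int msix_vectors(uint16_t message_control)
{
    return (message_control & 0x3FF) + 1;   // field stores "number of vectors - 1"
}

int main(void)
{
    printf("%u\n", msix_vectors(0x001F));   // 0x1F + 1 = 32 vectors
    return 0;
}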
+ * + * Return: 0 on success non-zero on failure + */ +static int mpi3mr_alloc_issue_host_diag_buf(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_diag_buffer_post_request diag_buf_post_req; + dma_addr_t buf_dma_addr; + u32 buf_sz; + int retval = -1; + + ioc_info(mrioc, "driver diag buffer level = %s.\n", + mpi3mr_drv_db_name(drv_db_level)); + + if (!mrioc->drv_diag_buffer) { + mrioc->drv_diag_buffer_sz = + MPI3MR_DEFAULT_DIAG_HOST_BUFFER_SZ; + mrioc->drv_diag_buffer = + dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->drv_diag_buffer_sz, + &mrioc->drv_diag_buffer_dma, GFP_KERNEL); + if (!mrioc->drv_diag_buffer) { + mrioc->drv_diag_buffer_sz = + MPI3MR_MIN_DIAG_HOST_BUFFER_SZ; + mrioc->drv_diag_buffer = + dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->drv_diag_buffer_sz, + &mrioc->drv_diag_buffer_dma, GFP_KERNEL); + } + if (!mrioc->drv_diag_buffer) { + ioc_warn(mrioc, "%s:%d:failed to allocate buffer\n", + __func__, __LINE__); + mrioc->drv_diag_buffer_sz = 0; + return retval; + } + /* TBD - memset to Zero once feature is stable */ + memset(mrioc->drv_diag_buffer, 0x55, mrioc->drv_diag_buffer_sz); + mpi3mr_upload_drv_diag_buffer(mrioc); + } + + buf_dma_addr = mrioc->drv_diag_buffer_dma; + buf_sz = mrioc->drv_diag_buffer_sz; + + memset(&diag_buf_post_req, 0, sizeof(diag_buf_post_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + ioc_err(mrioc, "sending driver diag buffer post is failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + return retval; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + diag_buf_post_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + diag_buf_post_req.function = MPI3_FUNCTION_DIAG_BUFFER_POST; + diag_buf_post_req.type = MPI3_DIAG_BUFFER_TYPE_DRIVER; + diag_buf_post_req.address = le64_to_cpu(buf_dma_addr); + diag_buf_post_req.length = le32_to_cpu(buf_sz); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &diag_buf_post_req, + sizeof(diag_buf_post_req), 1); + if (retval) { + ioc_err(mrioc, "posting driver diag buffer failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "posting driver diag buffer timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DIAG_BUFFER_POST_TIMEOUT); + retval = -1; + goto out_unlock; + } + retval = 0; + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) + ioc_warn(mrioc, + "driver diag buffer post returned with ioc_status(0x%04x) log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + else + ioc_info(mrioc, "driver diag buffer of size %dKB posted successfully\n", + mrioc->drv_diag_buffer_sz / 1024); + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + return retval; +} + +/** + * mpi3mr_revalidate_factsdata - validate IOCFacts parameters + * during reset/resume + * @mrioc: Adapter instance reference + * + * Return zero if the new IOCFacts parameters value is compatible with + * older values else return -EPERM + */ +static int +mpi3mr_revalidate_factsdata(struct mpi3mr_ioc *mrioc) +{ + u16 dev_handle_bitmap_sz; + void *removepend_bitmap; + + if (mrioc->facts.reply_sz > mrioc->reply_sz) { + ioc_err(mrioc, + "cannot increase reply size from %d to %d\n", 
+ mrioc->reply_sz, mrioc->facts.reply_sz); + return -EPERM; + } + if (mrioc->num_io_throttle_group != mrioc->facts.max_io_throttle_group) + { + ioc_err(mrioc, + "max io throttle group doesn't match old(%d), new(%d)\n", + mrioc->num_io_throttle_group, + mrioc->facts.max_io_throttle_group); + return -EPERM; + } + + + if (mrioc->facts.max_op_reply_q < mrioc->num_op_reply_q) { + ioc_err(mrioc, + "cannot reduce number of operational reply queues from %d to %d\n", + mrioc->num_op_reply_q, + mrioc->facts.max_op_reply_q); + return -EPERM; + } + + if (mrioc->facts.max_op_req_q < mrioc->num_op_req_q) { + ioc_err(mrioc, + "cannot reduce number of operational request queues from %d to %d\n", + mrioc->num_op_req_q, mrioc->facts.max_op_req_q); + return -EPERM; + } + + if ((mrioc->sas_transport_enabled) && (mrioc->facts.ioc_capabilities & + MPI3_IOCFACTS_CAPABILITY_MULTIPATH_ENABLED)) + ioc_err(mrioc, + "critical error: multipath capability is enabled at the " + "controller while sas transport support is enabled at the " + "driver, please reboot the system or reload the driver\n"); + + dev_handle_bitmap_sz = mrioc->facts.max_devhandle / 8; + if (mrioc->facts.max_devhandle % 8) + dev_handle_bitmap_sz++; + if (dev_handle_bitmap_sz > mrioc->dev_handle_bitmap_sz) { + removepend_bitmap = krealloc(mrioc->removepend_bitmap, + dev_handle_bitmap_sz, GFP_KERNEL); + if (!removepend_bitmap) { + ioc_err(mrioc, + "failed to increase removepend_bitmap sz from: %d to %d\n", + mrioc->dev_handle_bitmap_sz, dev_handle_bitmap_sz); + return -EPERM; + } + memset(removepend_bitmap + mrioc->dev_handle_bitmap_sz, 0, + dev_handle_bitmap_sz - mrioc->dev_handle_bitmap_sz); + mrioc->removepend_bitmap = removepend_bitmap; + ioc_info(mrioc, + "increased dev_handle_bitmap_sz from %d to %d\n", + mrioc->dev_handle_bitmap_sz, dev_handle_bitmap_sz); + mrioc->dev_handle_bitmap_sz = dev_handle_bitmap_sz; + } + + return 0; +} + +/** + * mpi3mr_bring_ioc_ready - Bring controller to ready state + * @mrioc: Adapter instance reference + * + * Set Enable IOC bit in IOC configuration register and wait for + * the controller to become ready. + * + * Return: 0 on success, appropriate error on failure. 
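mpi3mr_revalidate_factsdata() above grows removepend_bitmap with krealloc() and zeroes only the newly added bytes, so bits tracked before the reset keep their state. A user-space sketch of that grow-and-zero-the-tail step; the helper and its caller are illustrative, not driver code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned char *grow_bitmap(unsigned char *bmap, size_t old_sz, size_t new_sz)
{
    unsigned char *tmp;

    if (new_sz <= old_sz)
        return bmap;                          // nothing to grow
    tmp = realloc(bmap, new_sz);
    if (!tmp)
        return NULL;                          // caller keeps using the old buffer
    memset(tmp + old_sz, 0, new_sz - old_sz); // existing bytes are preserved by realloc
    return tmp;
}

int main(void)
{
    unsigned char *bmap = calloc(4, 1);
    unsigned char *bigger;

    if (!bmap)
        return 1;
    bmap[0] = 0x5a;
    bigger = grow_bitmap(bmap, 4, 8);
    if (bigger)
        printf("byte0=0x%02x byte7=0x%02x\n",
               (unsigned int)bigger[0], (unsigned int)bigger[7]);  // 0x5a 0x00
    free(bigger ? bigger : bmap);
    return 0;
}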
+ */ +static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_config, ioc_status, timeout; + int retval = 0; + enum mpi3mr_iocstate ioc_state; + u64 base_info; + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + base_info = mpi3mr_readq(&mrioc->sysif_regs->ioc_information); + ioc_info(mrioc, "ioc_status(0x%08x), ioc_config(0x%08x), ioc_info(0x%016llx) at the bringup\n", + ioc_status, ioc_config, base_info); + + /*The timeout value is in 2sec unit, changing it to seconds*/ + mrioc->ready_timeout = + ((base_info & MPI3_SYSIF_IOC_INFO_LOW_TIMEOUT_MASK) >> + MPI3_SYSIF_IOC_INFO_LOW_TIMEOUT_SHIFT) * 2; + + ioc_info(mrioc, "ready timeout: %d seconds\n", mrioc->ready_timeout); + + ioc_state = mpi3mr_get_iocstate(mrioc); + ioc_info(mrioc, "controller is in %s state during detection\n", + mpi3mr_iocstate_name(ioc_state)); + + if (ioc_state == MRIOC_STATE_BECOMING_READY || + ioc_state == MRIOC_STATE_RESET_REQUESTED) { + timeout = mrioc->ready_timeout * 10; + do { + msleep(100); + } while (--timeout); + if (!pci_device_is_present(mrioc->pdev)) + { + mrioc->unrecoverable = 1; + ioc_err(mrioc, "controller is not present while waiting to reset\n"); + goto out_device_not_present; + } + + ioc_state = mpi3mr_get_iocstate(mrioc); + ioc_info(mrioc, + "controller is in %s state after waiting to reset\n", + mpi3mr_iocstate_name(ioc_state)); + } + + if (ioc_state == MRIOC_STATE_READY) { + ioc_info(mrioc, "issuing message unit reset (MUR) to bring to reset state\n"); + retval = mpi3mr_issue_and_process_mur(mrioc, + MPI3MR_RESET_FROM_BRINGUP); + ioc_state = mpi3mr_get_iocstate(mrioc); + if (retval) + ioc_err(mrioc, + "message unit reset failed with error %d current state %s\n", + retval, mpi3mr_iocstate_name(ioc_state)); + } + if (ioc_state != MRIOC_STATE_RESET) { + mpi3mr_print_fault_info(mrioc); + ioc_info(mrioc, "issuing soft reset to bring to reset state\n"); + retval = mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, + MPI3MR_RESET_FROM_BRINGUP); + if (retval) { + ioc_err(mrioc, + "soft reset failed with error %d\n", retval); + goto out_failed; + } + } + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state != MRIOC_STATE_RESET) { + ioc_err(mrioc, + "cannot bring controller to reset state, current state: %s\n", + mpi3mr_iocstate_name(ioc_state)); + goto out_failed; + } + mpi3mr_clear_reset_history(mrioc); + retval = mpi3mr_setup_admin_qpair(mrioc); + if (retval) { + ioc_err(mrioc, "failed to setup admin queues: error %d\n", + retval); + goto out_failed; + } + + ioc_info(mrioc, "bringing controller to ready state\n"); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_config |= MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC; + writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); + + timeout = mrioc->ready_timeout * 10; + do { + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state == MRIOC_STATE_READY) + { + ioc_info(mrioc, + "successfully transistioned to %s state\n", + mpi3mr_iocstate_name(ioc_state)); + return 0; + } + if (!pci_device_is_present(mrioc->pdev)) + { + mrioc->unrecoverable = 1; + ioc_err(mrioc, "controller is not present at the bringup\n"); + goto out_device_not_present; + } + msleep(100); + } while (--timeout); + +out_failed: + ioc_state = mpi3mr_get_iocstate(mrioc); + ioc_err(mrioc, + "failed to bring to ready state, current state: %s\n", + mpi3mr_iocstate_name(ioc_state)); +out_device_not_present: + retval = -1; + return retval; +} + +/** + * mpi3mr_enable_events - Enable 
required events + * @mrioc: Adapter instance reference + * + * This routine unmasks the events required by the driver by + * sending the appropriate event mask bitmap through an event + * notification request. + * + * Return: 0 on success and non-zero on failure. + */ +int mpi3mr_enable_events(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u32 i; + + for (i = 0; i < MPI3_EVENT_NOTIFY_EVENTMASK_WORDS; i++) + mrioc->event_masks[i] = -1; + + mpi3mr_unmask_events(mrioc, MPI3_EVENT_DEVICE_ADDED); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_DEVICE_INFO_CHANGED); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_DEVICE_STATUS_CHANGE); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_ENCL_DEVICE_ADDED); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_SAS_DISCOVERY); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_SAS_DEVICE_DISCOVERY_ERROR); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_SAS_BROADCAST_PRIMITIVE); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_PCIE_ENUMERATION); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_PREPARE_FOR_RESET); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_CABLE_MGMT); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_ENERGY_PACK_CHANGE); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_LOG_DATA); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_DIAGNOSTIC_BUFFER_STATUS_CHANGE); + + retval = mpi3mr_issue_event_notification(mrioc); + if (retval) + ioc_err(mrioc, "failed to issue event notification %d\n", + retval); + return retval; +} + + +/** + * mpi3mr_init_ioc - Initialize the controller + * @mrioc: Adapter instance reference + * + * This is the controller initialization routine, executed from + * the pci probe callback. It creates admin and operational reply + * queue pairs, allocates required memory for the reply pool and + * sense buffer pool, issues the IOC init request to the firmware, + * and unmasks the events. + * + * Return: 0 on success and non-zero on failure.
+ */ +int mpi3mr_init_ioc(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u8 retry = 0; + struct mpi3_ioc_facts_data facts_data; + u32 sz; + +#if defined(IO_COUNTER_SUPPORT) + atomic_set(&mrioc->pend_ios, 0); +#endif + +retry_init: + dprint_init(mrioc, "bringing up the controller to ready state\n"); + retval = mpi3mr_bring_ioc_ready(mrioc); + if (retval) { + ioc_err(mrioc, "failed to bring to ready state\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "setting up single ISR\n"); + retval = mpi3mr_setup_isr(mrioc, 1); + if (retval) { + ioc_err(mrioc, "failed to setup ISR\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "getting ioc_facts\n"); + retval = mpi3mr_issue_iocfacts(mrioc, &facts_data); + if (retval) { + ioc_err(mrioc, "failed to get ioc_facts\n"); + goto out_failed; + } + + mrioc->max_host_ios = mrioc->facts.max_reqs - MPI3MR_INTERNALCMDS_RESVD; + if (!(mrioc->facts.ioc_capabilities & + MPI3_IOCFACTS_CAPABILITY_MULTIPATH_ENABLED)) { + mrioc->sas_transport_enabled = 1; + mrioc->scsi_device_channel = 1; + mrioc->shost->max_channel = 1; + mrioc->shost->transportt = mpi3mr_transport_template; + } + + mrioc->num_io_throttle_group = mrioc->facts.max_io_throttle_group; + atomic_set(&mrioc->pend_large_data_sz, 0); + + if (reset_devices) + mrioc->max_host_ios = min_t(int, mrioc->max_host_ios, + MPI3MR_HOST_IOS_KDUMP); + + mrioc->reply_sz = mrioc->facts.reply_sz; + + dprint_init(mrioc, "check and reset dma mask\n"); + retval = mpi3mr_check_reset_dma_mask(mrioc); + if (retval) { + ioc_err(mrioc, "resetting dma mask failed\n"); + goto out_failed_noretry; + } + + mpi3mr_print_ioc_info(mrioc); + + dprint_init(mrioc, "allocating config page buffers\n"); + mrioc->cfg_page = dma_zalloc_coherent(&mrioc->pdev->dev, + MPI3MR_DEFAULT_CFG_PAGE_SZ, &mrioc->cfg_page_dma, GFP_KERNEL); + if (!mrioc->cfg_page) + goto out_failed_noretry; + + mrioc->cfg_page_sz = MPI3MR_DEFAULT_CFG_PAGE_SZ; + + dprint_init(mrioc, "allocating host diag buffers\n"); + mpi3mr_alloc_diag_bufs(mrioc); + + dprint_init(mrioc, "posting host diag buffers\n"); + retval = mpi3mr_post_diag_bufs(mrioc); + if (retval) { + ioc_err(mrioc, "failed to post host diag buffers\n"); + goto out_failed; + } + + dprint_init(mrioc, "allocating reply and sense buffers\n"); + retval = mpi3mr_alloc_reply_sense_bufs(mrioc); + if (retval) { + ioc_err(mrioc, "failed to allocate reply and sense buffers\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "allocating chain buffers\n"); + retval = mpi3mr_alloc_chain_bufs(mrioc); + if (retval) { + ioc_err(mrioc, "failed to allocate chain buffers\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "sending ioc_init\n"); + retval = mpi3mr_issue_iocinit(mrioc); + if (retval) { + ioc_err(mrioc, "failed to send ioc_init\n"); + goto out_failed; + } + + dprint_init(mrioc, "getting package version\n"); + retval = mpi3mr_print_pkg_ver(mrioc); + if (retval) { + ioc_err(mrioc, "failed to get package version\n"); + goto out_failed; + } + + dprint_init(mrioc, "setting up multiple ISR\n"); + retval = mpi3mr_setup_isr(mrioc, 0); + if (retval) { + ioc_err(mrioc, "failed to re-setup ISR\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "creating operational queue pairs\n"); + retval = mpi3mr_create_op_queues(mrioc); + if (retval) { + ioc_err(mrioc, "failed to create operational queue pairs\n"); + goto out_failed; + } + + if (!mrioc->pel_seqnum_virt) { + dprint_init(mrioc, "allocating memory for pel_seqnum_virt\n"); + mrioc->pel_seqnum_sz = sizeof(struct mpi3_pel_seq); + 
mrioc->pel_seqnum_virt = dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->pel_seqnum_sz, &mrioc->pel_seqnum_dma, + GFP_KERNEL); + if (!mrioc->pel_seqnum_virt) + goto out_failed_noretry; + } + + if (!mrioc->throttle_groups && mrioc->num_io_throttle_group) { + dprint_init(mrioc, "allocating memory for throttle groups\n"); + sz = sizeof(struct mpi3mr_throttle_group_info); + mrioc->throttle_groups = (struct mpi3mr_throttle_group_info *) + kcalloc(mrioc->num_io_throttle_group, + sz, GFP_KERNEL); + if (!mrioc->throttle_groups) + goto out_failed_noretry; + } + + dprint_init(mrioc, "enabling events\n"); + retval = mpi3mr_enable_events(mrioc); + if (retval) { + ioc_err(mrioc, "failed to enable events\n"); + goto out_failed; + } + retval = mpi3mr_refresh_trigger(mrioc, MPI3_CONFIG_ACTION_READ_CURRENT); + if (retval) { + ioc_err(mrioc, "failed to refresh triggers\n"); + goto out_failed; + } + if (mrioc->facts.diag_drvr_sz) { + dprint_reset(mrioc, "posting driver diag buffer\n"); + retval = mpi3mr_alloc_issue_host_diag_buf(mrioc); + if (retval) { + ioc_err(mrioc, "failed to post driver diag buffer\n"); + goto out_failed; + } + } + + ioc_info(mrioc, "controller initialization completed successfully\n"); + return retval; +out_failed: + if (retry < 2) { + retry++; + ioc_warn(mrioc, "retrying controller initialization, retry_count:%d\n", + retry); + mpi3mr_memset_buffers(mrioc); + goto retry_init; + } +out_failed_noretry: + ioc_err(mrioc, "controller initialization failed\n"); + mpi3mr_issue_reset(mrioc, MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, + MPI3MR_RESET_FROM_CTLR_CLEANUP); + mrioc->unrecoverable = 1; + return retval; +} + +/** + * mpi3mr_reinit_ioc - Re-Initialize the controller + * @mrioc: Adapter instance reference + * @is_resume: Called from resume or reset path + * + * This the controller re-initialization routine, executed from + * the soft reset handler or resume callback. creates + * operational reply queue pairs, allocate required memory for + * reply pool, sense buffer pool, issue IOC init request to the + * firmware, unmask the events and issue port enable to discover + * SAS/SATA/NVMe devices and RAID volumes. + * + * Return: 0 on success and non-zero on failure. 
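The initialization path above retries at most twice on a retryable failure, scrubbing the driver buffers with mpi3mr_memset_buffers() between attempts. A compact stand-alone sketch of that bounded-retry shape; bring_up() is a stand-in, not a driver function:

#include <stdio.h>

static int attempts;

static int bring_up(void)
{
    return (++attempts < 3) ? -1 : 0;   // pretend the first two attempts fail
}

int main(void)
{
    int retval, retry = 0;

retry_init:
    retval = bring_up();
    if (retval) {
        if (retry < 2) {
            retry++;
            printf("retrying initialization, retry_count:%d\n", retry);
            // the real code re-zeroes its buffers here before retrying
            goto retry_init;
        }
        printf("initialization failed\n");
        return 1;
    }
    printf("initialization completed after %d retry(ies)\n", retry);
    return 0;
}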
+ */ +int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume) +{ + int retval = 0; + u8 retry = 0; + struct mpi3_ioc_facts_data facts_data; + u32 pe_timeout, ioc_status; + +retry_init: + pe_timeout = + ( MPI3MR_PORTENABLE_TIMEOUT / MPI3MR_PORTENABLE_POLL_INTERVAL); + dprint_reset(mrioc, "bringing up the controller to ready state\n"); + retval = mpi3mr_bring_ioc_ready(mrioc); + if (retval) { + ioc_err(mrioc, "failed to bring to ready state\n"); + goto out_failed_noretry; + } + + if (is_resume) { + dprint_reset(mrioc, "setting up single ISR\n"); + retval = mpi3mr_setup_isr(mrioc, 1); + if (retval) { + ioc_err(mrioc, "failed to setup ISR\n"); + goto out_failed_noretry; + } + } else + mpi3mr_ioc_enable_intr(mrioc); + + dprint_reset(mrioc, "getting ioc_facts\n"); + retval = mpi3mr_issue_iocfacts(mrioc, &facts_data); + if (retval) { + ioc_err(mrioc, "failed to get ioc_facts\n"); + goto out_failed; + } + + dprint_reset(mrioc, "validating ioc_facts\n"); + retval = mpi3mr_revalidate_factsdata(mrioc); + if (retval) { + ioc_err(mrioc, "failed to revalidate ioc_facts data\n"); + goto out_failed_noretry; + } + + mpi3mr_print_ioc_info(mrioc); + + if (is_resume) { + dprint_reset(mrioc, "posting host diag buffers\n"); + retval = mpi3mr_post_diag_bufs(mrioc); + if (retval) { + ioc_err(mrioc, "failed to post host diag buffers\n"); + goto out_failed; + } + } + + dprint_reset(mrioc, "sending ioc_init\n"); + retval = mpi3mr_issue_iocinit(mrioc); + if (retval) { + ioc_err(mrioc, "failed to send ioc_init\n"); + goto out_failed; + } + + dprint_reset(mrioc, "getting package version\n"); + retval = mpi3mr_print_pkg_ver(mrioc); + if (retval) { + ioc_err(mrioc, "failed to get package version\n"); + goto out_failed; + } + + if (is_resume) { + dprint_reset(mrioc, "setting up multiple ISR\n"); + retval = mpi3mr_setup_isr(mrioc, 0); + if (retval) { + ioc_err(mrioc, "failed to re-setup ISR\n"); + goto out_failed_noretry; + } + } + + dprint_reset(mrioc, "creating operational queue pairs\n"); + retval = mpi3mr_create_op_queues(mrioc); + if (retval) { + ioc_err(mrioc, "failed to create operational queue pairs\n"); + goto out_failed; + } + + if (mpi3mr_use_blk_mq(mrioc->shost) && + (mrioc->shost->nr_hw_queues > mrioc->num_op_reply_q)) { + ioc_err(mrioc, + "cannot create minimum number of operatioanl queues expected:%d created:%d\n", + mrioc->shost->nr_hw_queues, mrioc->num_op_reply_q); + goto out_failed_noretry; + } + + if (!mrioc->pel_seqnum_virt) { + dprint_reset(mrioc, "allocating memory for pel_seqnum_virt\n"); + mrioc->pel_seqnum_sz = sizeof(struct mpi3_pel_seq); + mrioc->pel_seqnum_virt = dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->pel_seqnum_sz, &mrioc->pel_seqnum_dma, + GFP_KERNEL); + if (!mrioc->pel_seqnum_virt) + goto out_failed_noretry; + } + + dprint_reset(mrioc, "enabling events\n"); + retval = mpi3mr_enable_events(mrioc); + if (retval) { + ioc_err(mrioc, "failed to enable events\n"); + goto out_failed; + } + + mrioc->device_refresh_on = 1; + mpi3mr_add_event_wait_for_device_refresh(mrioc); + + ioc_info(mrioc, "sending port enable\n"); + retval = mpi3mr_issue_port_enable(mrioc, 1); + if (retval) { + ioc_err(mrioc, "failed to issue port enable\n"); + goto out_failed; + } + do { + ssleep(MPI3MR_PORTENABLE_POLL_INTERVAL); + if (mrioc->init_cmds.state == MPI3MR_CMD_NOTUSED) + break; + if (!pci_device_is_present(mrioc->pdev)) + mrioc->unrecoverable = 1; + if (mrioc->unrecoverable) + goto out_failed_noretry; + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & 
MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) || + (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) { + mpi3mr_print_fault_info(mrioc); + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = NULL; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + goto out_failed; + } + } while (--pe_timeout); + + if (!pe_timeout) { + ioc_err(mrioc, "port enable timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_PE_TIMEOUT); + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = NULL; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + goto out_failed; + } else if (mrioc->scan_failed){ + ioc_err(mrioc, + "port enable failed with status=0x%04x\n", + mrioc->scan_failed); + } else + ioc_info(mrioc, "port enable completed successfully\n"); + + if (mrioc->facts.diag_drvr_sz) { + dprint_reset(mrioc, "posting driver diag buffer\n"); + retval = mpi3mr_alloc_issue_host_diag_buf(mrioc); + if (retval) { + ioc_err(mrioc, "failed to post driver diag buffer\n"); + goto out_failed; + } + } + + ioc_info(mrioc, "controller %s completed successfully\n", + (is_resume)?"resume":"re-initialization"); + return retval; +out_failed: + if (retry < 2) { + retry++; + ioc_warn(mrioc, "retrying controller %s, retry_count:%d\n", + (is_resume)?"resume":"re-initialization", retry); + mpi3mr_memset_buffers(mrioc); + goto retry_init; + } +out_failed_noretry: + ioc_err(mrioc, "controller %s is failed\n", + (is_resume)?"resume":"re-initialization"); + mpi3mr_issue_reset(mrioc, MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, + MPI3MR_RESET_FROM_CTLR_CLEANUP); + mrioc->unrecoverable = 1; + return retval; +} + +/** + * mpi3mr_memset_op_reply_q_buffers - memset the operational reply queue's + * segments + * @mrioc: Adapter instance reference + * @qidx: Operational reply queue index + * + * Return: Nothing. + */ +static void mpi3mr_memset_op_reply_q_buffers(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct op_reply_qinfo *op_reply_q = mrioc->op_reply_qinfo + qidx; + struct segments *segments; + int i, size; + + if (!op_reply_q->q_segments) + return; + + size = op_reply_q->segment_qd * mrioc->op_reply_desc_sz; + segments = op_reply_q->q_segments; + for (i = 0; i < op_reply_q->num_segments; i++) + memset(segments[i].segment, 0, size); +} + +/** + * mpi3mr_memset_op_req_q_buffers - memset the operational request queue's + * segments + * @mrioc: Adapter instance reference + * @qidx: Operational request queue index + * + * Return: Nothing. + */ +static void mpi3mr_memset_op_req_q_buffers(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct op_req_qinfo *op_req_q = mrioc->req_qinfo + qidx; + struct segments *segments; + int i, size; + + if (!op_req_q->q_segments) + return; + + size = op_req_q->segment_qd * mrioc->facts.op_req_sz; + segments = op_req_q->q_segments; + for (i = 0; i < op_req_q->num_segments; i++) + memset(segments[i].segment, 0, size); +} + +/** + * mpi3mr_memset_buffers - memset memory for a controller + * @mrioc: Adapter instance reference + * + * clear all the memory allocated for a controller, typically + * called post reset to reuse the memory allocated during the + * controller init. + * + * Return: Nothing. 
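The port-enable wait in mpi3mr_reinit_ioc() above polls once per interval and treats an exhausted countdown as a timeout, mirroring the while (--pe_timeout) shape. A small user-space sketch of that countdown-style polling; is_done() is a stand-in predicate:

#include <stdio.h>
#include <unistd.h>

#define POLL_INTERVAL_SEC 1
#define TIMEOUT_SEC       5

static int polls;

static int is_done(void)
{
    return ++polls >= 3;                // pretend the operation finishes on the 3rd poll
}

int main(void)
{
    unsigned int budget = TIMEOUT_SEC / POLL_INTERVAL_SEC;
    int done = 0;

    do {
        sleep(POLL_INTERVAL_SEC);
        done = is_done();
        if (done)
            break;
    } while (--budget);

    if (done)
        printf("completed with %u polling interval(s) to spare\n", budget);
    else
        printf("timed out\n");
    return done ? 0 : 1;
}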
+ */ +void mpi3mr_memset_buffers(struct mpi3mr_ioc *mrioc) +{ + u16 i; + struct mpi3mr_throttle_group_info *tg; + + mrioc->change_count = 0; + mrioc->active_poll_qcount = 0; + mrioc->default_qcount = 0; + if (mrioc->admin_req_base) + memset(mrioc->admin_req_base, 0, mrioc->admin_req_q_sz); + if (mrioc->admin_reply_base) + memset(mrioc->admin_reply_base, 0, mrioc->admin_reply_q_sz); + atomic_set(&mrioc->admin_reply_q_in_use, 0); + + if (mrioc->init_cmds.reply) { + memset(mrioc->init_cmds.reply, 0, + sizeof(*mrioc->init_cmds.reply)); + memset(mrioc->bsg_cmds.reply, 0, + sizeof(*mrioc->bsg_cmds.reply)); + memset(mrioc->host_tm_cmds.reply, 0, + sizeof(*mrioc->host_tm_cmds.reply)); + memset(mrioc->pel_cmds.reply, 0, + sizeof(*mrioc->pel_cmds.reply)); + memset(mrioc->pel_abort_cmd.reply, 0, + sizeof(*mrioc->pel_abort_cmd.reply)); + memset(mrioc->transport_cmds.reply, 0, + sizeof(*mrioc->transport_cmds.reply)); + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) + memset(mrioc->dev_rmhs_cmds[i].reply, 0, + sizeof(*mrioc->dev_rmhs_cmds[i].reply)); + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) + memset(mrioc->sysfs_tm_cmds[i].reply, 0, + sizeof(*mrioc->sysfs_tm_cmds[i].reply)); + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) + memset(mrioc->evtack_cmds[i].reply, 0, + sizeof(*mrioc->evtack_cmds[i].reply)); + memset(mrioc->removepend_bitmap, 0, + mrioc->dev_handle_bitmap_sz); + memset(mrioc->devrem_bitmap, 0, mrioc->devrem_bitmap_sz); + memset(mrioc->evtack_cmds_bitmap, 0, + mrioc->evtack_cmds_bitmap_sz); + } + + for (i = 0; i < mrioc->num_queues; i++) { + mrioc->op_reply_qinfo[i].qid = 0; + mrioc->op_reply_qinfo[i].ci = 0; + mrioc->op_reply_qinfo[i].num_replies = 0; + mrioc->op_reply_qinfo[i].ephase = 0; + atomic_set(&mrioc->op_reply_qinfo[i].pend_ios, 0); + atomic_set(&mrioc->op_reply_qinfo[i].in_use, 0); + mpi3mr_memset_op_reply_q_buffers(mrioc, i); + + mrioc->req_qinfo[i].ci = 0; + mrioc->req_qinfo[i].pi = 0; + mrioc->req_qinfo[i].num_requests = 0; + mrioc->req_qinfo[i].qid = 0; + mrioc->req_qinfo[i].reply_qid = 0; + spin_lock_init(&mrioc->req_qinfo[i].q_lock); + mrioc->req_qinfo[i].last_full_host_tag = 0; + mpi3mr_memset_op_req_q_buffers(mrioc, i); + } + + atomic_set(&mrioc->pend_large_data_sz, 0); + if (mrioc->throttle_groups) { + tg = mrioc->throttle_groups; + for (i = 0; i < mrioc->num_io_throttle_group; i++, tg++) { + tg->id = 0; + tg->fw_qd = 0; + tg->modified_qd = 0; + tg->io_divert= 0; + tg->need_qd_reduction= 0; + tg->high = 0; + tg->low = 0; + tg->qd_reduction= 0; + atomic_set(&tg->pend_large_data_sz, 0); + } + } +} + +/** + * mpi3mr_free_mem - Free memory allocated for a controller + * @mrioc: Adapter instance reference + * + * Free all the memory allocated for a controller. + * + * Return: Nothing. 
+ */ +void mpi3mr_free_mem(struct mpi3mr_ioc *mrioc) +{ + u16 i; + struct mpi3mr_intr_info *intr_info; + struct diag_buffer_desc *diag_buffer; + + dprint_exit(mrioc, "freeing up memory allocated for the controller\n"); + + mpi3mr_free_enclosure_list(mrioc); + + if (mrioc->sense_buf_pool) { + if (mrioc->sense_buf) + dma_pool_free(mrioc->sense_buf_pool, mrioc->sense_buf, + mrioc->sense_buf_dma); + dma_pool_destroy(mrioc->sense_buf_pool); + mrioc->sense_buf = NULL; + mrioc->sense_buf_pool = NULL; + } + if (mrioc->sense_buf_q_pool) { + if (mrioc->sense_buf_q) + dma_pool_free(mrioc->sense_buf_q_pool, + mrioc->sense_buf_q, mrioc->sense_buf_q_dma); + dma_pool_destroy(mrioc->sense_buf_q_pool); + mrioc->sense_buf_q = NULL; + mrioc->sense_buf_q_pool = NULL; + } + + if (mrioc->reply_buf_pool) { + if (mrioc->reply_buf) + dma_pool_free(mrioc->reply_buf_pool, mrioc->reply_buf, + mrioc->reply_buf_dma); + dma_pool_destroy(mrioc->reply_buf_pool); + mrioc->reply_buf = NULL; + mrioc->reply_buf_pool = NULL; + } + if (mrioc->reply_free_q_pool) { + if (mrioc->reply_free_q) + dma_pool_free(mrioc->reply_free_q_pool, + mrioc->reply_free_q, mrioc->reply_free_q_dma); + dma_pool_destroy(mrioc->reply_free_q_pool); + mrioc->reply_free_q = NULL; + mrioc->reply_free_q_pool = NULL; + } + + for (i = 0; i < mrioc->num_op_req_q; i++) + mpi3mr_free_op_req_q_segments(mrioc, i); + + for (i = 0; i < mrioc->num_op_reply_q; i++) + mpi3mr_free_op_reply_q_segments(mrioc, i); + + for (i = 0; i < mrioc->intr_info_count; i++) { + intr_info = mrioc->intr_info + i; + intr_info->op_reply_q = NULL; + } + + kfree(mrioc->req_qinfo); + mrioc->req_qinfo = NULL; + mrioc->num_op_req_q = 0; + + kfree(mrioc->op_reply_qinfo); + mrioc->op_reply_qinfo = NULL; + mrioc->num_op_reply_q = 0; + + kfree(mrioc->init_cmds.reply); + mrioc->init_cmds.reply = NULL; + + kfree(mrioc->bsg_cmds.reply); + mrioc->bsg_cmds.reply = NULL; + + kfree(mrioc->host_tm_cmds.reply); + mrioc->host_tm_cmds.reply = NULL; + + kfree(mrioc->pel_cmds.reply); + mrioc->pel_cmds.reply = NULL; + + kfree(mrioc->pel_abort_cmd.reply); + mrioc->pel_abort_cmd.reply = NULL; + + kfree(mrioc->transport_cmds.reply); + mrioc->transport_cmds.reply = NULL; + + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) { + kfree(mrioc->dev_rmhs_cmds[i].reply); + mrioc->dev_rmhs_cmds[i].reply = NULL; + } + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) { + kfree(mrioc->sysfs_tm_cmds[i].reply); + mrioc->sysfs_tm_cmds[i].reply = NULL; + } + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) { + kfree(mrioc->evtack_cmds[i].reply); + mrioc->evtack_cmds[i].reply = NULL; + } + + kfree(mrioc->removepend_bitmap); + mrioc->removepend_bitmap = NULL; + + kfree(mrioc->devrem_bitmap); + mrioc->devrem_bitmap = NULL; + + kfree(mrioc->evtack_cmds_bitmap); + mrioc->evtack_cmds_bitmap = NULL; + + kfree(mrioc->chain_bitmap); + mrioc->chain_bitmap = NULL; + + if (mrioc->chain_buf_pool) { + for (i = 0; i < mrioc->chain_buf_count; i++) { + if (mrioc->chain_sgl_list[i].addr) { + dma_pool_free(mrioc->chain_buf_pool, + mrioc->chain_sgl_list[i].addr, + mrioc->chain_sgl_list[i].dma_addr); + mrioc->chain_sgl_list[i].addr = NULL; + } + } + dma_pool_destroy(mrioc->chain_buf_pool); + mrioc->chain_buf_pool = NULL; + } + + kfree(mrioc->chain_sgl_list); + mrioc->chain_sgl_list = NULL; + + if (mrioc->admin_reply_base) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->admin_reply_q_sz, + mrioc->admin_reply_base, mrioc->admin_reply_dma); + mrioc->admin_reply_base = NULL; + } + if (mrioc->admin_req_base) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->admin_req_q_sz, + 
mrioc->admin_req_base, mrioc->admin_req_dma); + mrioc->admin_req_base = NULL; + } + + if (mrioc->prp_list_virt) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->prp_sz, + mrioc->prp_list_virt, mrioc->prp_list_dma); + mrioc->prp_list_virt = NULL; + } + + if (mrioc->pel_seqnum_virt) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->pel_seqnum_sz, + mrioc->pel_seqnum_virt, mrioc->pel_seqnum_dma); + mrioc->pel_seqnum_virt = NULL; + } + + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + diag_buffer = &mrioc->diag_buffers[i]; + if (diag_buffer->addr) { + dma_free_coherent(&mrioc->pdev->dev, + diag_buffer->size, diag_buffer->addr, + diag_buffer->dma_addr); + diag_buffer->addr = NULL; + diag_buffer->size = 0; + diag_buffer->type = 0; + diag_buffer->status = 0; + } + } + + if (mrioc->drv_diag_buffer) { + dma_free_coherent(&mrioc->pdev->dev, + mrioc->drv_diag_buffer_sz, mrioc->drv_diag_buffer, + mrioc->drv_diag_buffer_dma); + mrioc->drv_diag_buffer = NULL; + mrioc->drv_diag_buffer_sz = 0; + } + + if (mrioc->cfg_page) { + dma_free_coherent(&mrioc->pdev->dev, + mrioc->cfg_page_sz, mrioc->cfg_page, + mrioc->cfg_page_dma); + mrioc->cfg_page = NULL; + } + + vfree(mrioc->uefi_logs); + kfree(mrioc->logdata_buf); + mrioc->logdata_buf = NULL; + kfree(mrioc->driver_pg2); + mrioc->driver_pg2 = NULL; + dprint_exit(mrioc, "freed up memory allocated for the controller\n"); +} + +/** + * mpi3mr_issue_ioc_shutdown - shutdown controller + * @mrioc: Adapter instance reference + * + * Send shutodwn notification to the controller and wait for the + * shutdown_timeout for it to be completed. + * + * Return: Nothing. + */ +static void mpi3mr_issue_ioc_shutdown(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_config, ioc_status; + u8 retval = 1; + u32 timeout = MPI3MR_DEFAULT_SHUTDOWN_TIME * 10; + + ioc_info(mrioc, "sending shutdown notification\n"); + if (mrioc->unrecoverable) { + ioc_warn(mrioc, + "controller is unrecoverable, shutdown not issued\n"); + return; + } + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK) + == MPI3_SYSIF_IOC_STATUS_SHUTDOWN_IN_PROGRESS) { + ioc_warn(mrioc, "shutdown already in progress\n"); + return; + } + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_config |= MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_NORMAL; + ioc_config |= MPI3_SYSIF_IOC_CONFIG_DEVICE_SHUTDOWN_SEND_REQ; + + writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); + + if (mrioc->facts.shutdown_timeout) + timeout = mrioc->facts.shutdown_timeout * 10; + + do { + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK) + == MPI3_SYSIF_IOC_STATUS_SHUTDOWN_COMPLETE) { + retval = 0; + break; + } + msleep(100); + } while (--timeout); + + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + + if (retval) { + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK) + == MPI3_SYSIF_IOC_STATUS_SHUTDOWN_IN_PROGRESS) + ioc_warn(mrioc, + "shutdown still in progress after timeout\n"); + } + + ioc_info(mrioc, + "ioc_status/ioc_config after %s shutdown is (0x%x)/(0x%x)\n", + (!retval)?"successful":"failed", ioc_status, + ioc_config); +} + +/** + * mpi3mr_cleanup_ioc - Cleanup controller + * @mrioc: Adapter instance reference + + * controller cleanup handler, Message unit reset or soft reset + * and shutdown notification is issued to the controller. + * + * Return: Nothing. 
+ */ +void mpi3mr_cleanup_ioc(struct mpi3mr_ioc *mrioc) +{ + enum mpi3mr_iocstate ioc_state; + + dprint_exit(mrioc, "cleaning up the controller\n"); + + mpi3mr_ioc_disable_intr(mrioc); + + ioc_state = mpi3mr_get_iocstate(mrioc); + + if ((!mrioc->unrecoverable) && (!mrioc->reset_in_progress) && + (ioc_state == MRIOC_STATE_READY)) { + if (mrioc->is_segqueue_enabled && !mrioc->pdev->revision) + mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, + MPI3MR_RESET_FROM_CTLR_CLEANUP); + else if (mpi3mr_issue_and_process_mur(mrioc, + MPI3MR_RESET_FROM_CTLR_CLEANUP)) + mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, + MPI3MR_RESET_FROM_MUR_FAILURE); + mpi3mr_issue_ioc_shutdown(mrioc); + } + dprint_exit(mrioc, "controller cleanup completed\n"); +} + +/** + * mpi3mr_drv_cmd_comp_reset - Flush a internal driver command + * @mrioc: Adapter instance reference + * @cmdptr: Internal command tracker + * + * Complete an internal driver commands with state indicating it + * is completed due to reset. + * + * Return: Nothing. + */ +static inline void mpi3mr_drv_cmd_comp_reset(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *cmdptr) +{ + if (cmdptr->state & MPI3MR_CMD_PENDING) { + cmdptr->state |= MPI3MR_CMD_RESET; + cmdptr->state &= ~MPI3MR_CMD_PENDING; + if (cmdptr->is_waiting) { + complete(&cmdptr->done); + cmdptr->is_waiting = 0; + } else if (cmdptr->callback) + cmdptr->callback(mrioc, cmdptr); + } +} + +/** + * mpi3mr_flush_drv_cmds - Flush internal driver commands + * @mrioc: Adapter instance reference + * + * Flush all internal driver commands post reset + * + * Return: Nothing. + */ +void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_drv_cmd *cmdptr; + u8 i; + + dprint_reset(mrioc, "flushing internal commands\n"); + cmdptr = &mrioc->init_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->cfg_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->bsg_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->host_tm_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) { + cmdptr = &mrioc->dev_rmhs_cmds[i]; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + } + + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) { + cmdptr = &mrioc->sysfs_tm_cmds[i]; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + } + + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) { + cmdptr = &mrioc->evtack_cmds[i]; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + } + + cmdptr = &mrioc->pel_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->pel_abort_cmd; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->transport_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + init_waitqueue_head(&mrioc->sysfs_pending_tm_wq); + atomic_set(&mrioc->sysfs_tm_pending, 0); + mrioc->sysfs_tm_issued = 0; + mrioc->sysfs_tm_terminated_io_count = 0; +} + +/** + * mpi3mr_free_enclosure_list - release enclosures + * @mrioc: Adapter instance reference + * + * Free memory allocated during encloure add. + * + * Return nothing. 
+ */ +void mpi3mr_free_enclosure_list(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_enclosure_node *enclosure_dev, *enclosure_dev_next; + + list_for_each_entry_safe(enclosure_dev, + enclosure_dev_next, &mrioc->enclosure_list, list) { + list_del(&enclosure_dev->list); + kfree(enclosure_dev); + } +} + +/** + * mpi3mr_pel_wait_post - Issue PEL Wait + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Issue PEL Wait MPI request through admin queue and return. + * + * Return: Nothing. + */ +static void mpi3mr_pel_wait_post(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_pel_req_action_wait pel_wait; + + mrioc->pel_abort_requested = false; + + memset(&pel_wait, 0, sizeof(pel_wait)); + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_pel_wait_complete; + drv_cmd->ioc_status = 0; + drv_cmd->ioc_loginfo = 0; + pel_wait.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_PEL_WAIT); + pel_wait.function = MPI3_FUNCTION_PERSISTENT_EVENT_LOG; + pel_wait.action = MPI3_PEL_ACTION_WAIT; + pel_wait.starting_sequence_number = cpu_to_le32(mrioc->pel_newest_seqnum); + pel_wait.locale = cpu_to_le16(mrioc->pel_locale); + pel_wait.class = cpu_to_le16(mrioc->pel_class); + pel_wait.wait_time = MPI3_PEL_WAITTIME_INFINITE_WAIT; + dprint_bsg_info(mrioc, "sending pel_wait seqnum(%d), class(%d), locale(0x%08x)\n", + mrioc->pel_newest_seqnum, mrioc->pel_class, mrioc->pel_locale); + + if (mpi3mr_admin_request_post(mrioc, &pel_wait, sizeof(pel_wait), 0)) { + dprint_bsg_err(mrioc, + "Issuing PELWait: Admin post failed\n"); + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; + mrioc->pel_enabled = false; + } + return; +} + +/** + * mpi3mr_pel_get_seqnum_post - Issue PEL Get Sequence number + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Issue PEL get sequence number MPI request through admin queue + * and return. + * + * Return: 0 on success, non-zero on failure. + */ +int mpi3mr_pel_get_seqnum_post(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_pel_req_action_get_sequence_numbers pel_getseq_req; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + int retval = 0; + + memset(&pel_getseq_req, 0, sizeof(pel_getseq_req)); + mrioc->pel_cmds.state = MPI3MR_CMD_PENDING; + mrioc->pel_cmds.is_waiting = 0; + mrioc->pel_cmds.ioc_status = 0; + mrioc->pel_cmds.ioc_loginfo = 0; + mrioc->pel_cmds.callback = mpi3mr_pel_get_seqnum_complete; + pel_getseq_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_PEL_WAIT); + pel_getseq_req.function = MPI3_FUNCTION_PERSISTENT_EVENT_LOG; + pel_getseq_req.action = MPI3_PEL_ACTION_GET_SEQNUM; + mpi3mr_add_sg_single(&pel_getseq_req.sgl, sgl_flags, + mrioc->pel_seqnum_sz, mrioc->pel_seqnum_dma); + + retval = mpi3mr_admin_request_post(mrioc, &pel_getseq_req, + sizeof(pel_getseq_req), 0); + if (retval) { + if (drv_cmd) { + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; + } + mrioc->pel_enabled = false; + } + + return retval; +} + +/** + * mpi3mr_pel_wait_complete - PELWait Completion callback + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * This is a callback handler for the PELWait request and + * firmware completes a PELWait request when it is aborted or a + * new PEL entry is available. 
This sends AEN to the application + * and if the PELwait completion is not due to PELAbort then + * this will send a request for new PEL Sequence number + * + * Return: Nothing. + */ +static void mpi3mr_pel_wait_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_pel_reply *pel_reply = NULL; + u16 ioc_status, pe_log_status; + bool do_retry = false; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto cleanup_drv_cmd; + + ioc_status = drv_cmd->ioc_status & MPI3_IOCSTATUS_STATUS_MASK; + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "%s: Failed ioc_status(0x%04x) Loginfo(0x%08x)\n", + __func__, ioc_status, drv_cmd->ioc_loginfo); + dprint_bsg_err(mrioc, + "pel_wait: failed with ioc_status(0x%04x), log_info(0x%08x)\n", + ioc_status, drv_cmd->ioc_loginfo); + do_retry = true; + } + + if (drv_cmd->state & MPI3MR_CMD_REPLY_VALID) + pel_reply = (struct mpi3_pel_reply *)drv_cmd->reply; + + if (!pel_reply) { + dprint_bsg_err(mrioc, + "pel_wait: failed due to no reply\n"); + goto out_failed; + } + + pe_log_status = le16_to_cpu(pel_reply->pe_log_status); + if ((pe_log_status != MPI3_PEL_STATUS_SUCCESS) && + (pe_log_status != MPI3_PEL_STATUS_ABORTED)) { + ioc_err(mrioc, "%s: Failed pe_log_status(0x%04x)\n", + __func__, pe_log_status); + dprint_bsg_err(mrioc, + "pel_wait: failed due to pel_log_status(0x%04x)\n", + pe_log_status); + do_retry = true; + } + + if (do_retry) { + if (drv_cmd->retry_count < MPI3MR_PEL_RETRY_COUNT) { + drv_cmd->retry_count++; + dprint_bsg_err(mrioc, "pel_wait: retrying(%d)\n", + drv_cmd->retry_count); + mpi3mr_pel_wait_post(mrioc, drv_cmd); + return; + } + dprint_bsg_err(mrioc, + "pel_wait: failed after all retries(%d)\n", + drv_cmd->retry_count); + goto out_failed; + } + atomic64_inc(&event_counter); + if (!mrioc->pel_abort_requested) { + mrioc->pel_cmds.retry_count = 0; + mpi3mr_pel_get_seqnum_post(mrioc, &mrioc->pel_cmds); + } + + return; +out_failed: + mrioc->pel_enabled = false; +cleanup_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; +} + +/** + * mpi3mr_pel_get_seqnum_complete - PELGetSeqNum Completion callback + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * This is a callback handler for the PEL get sequence number + * request and a new PEL wait request will be issued to the + * firmware from this + * + * Return: Nothing. 
+ */ +void mpi3mr_pel_get_seqnum_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_pel_reply *pel_reply = NULL; + struct mpi3_pel_seq *pel_seqnum_virt; + u16 ioc_status; + bool do_retry = false; + + pel_seqnum_virt = (struct mpi3_pel_seq *)mrioc->pel_seqnum_virt; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto cleanup_drv_cmd; + + ioc_status = drv_cmd->ioc_status & MPI3_IOCSTATUS_STATUS_MASK; + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "pel_get_seqnum: failed with ioc_status(0x%04x), log_info(0x%08x)\n", + ioc_status, drv_cmd->ioc_loginfo); + do_retry = true; + } + + if (drv_cmd->state & MPI3MR_CMD_REPLY_VALID) + pel_reply = (struct mpi3_pel_reply *)drv_cmd->reply; + if (!pel_reply) { + dprint_bsg_err(mrioc, + "pel_get_seqnum: failed due to no reply\n"); + goto out_failed; + } + + if (le16_to_cpu(pel_reply->pe_log_status) != MPI3_PEL_STATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "pel_get_seqnum: failed due to pel_log_status(0x%04x)\n", + le16_to_cpu(pel_reply->pe_log_status)); + do_retry = true; + } + + if (do_retry) { + if (drv_cmd->retry_count < MPI3MR_PEL_RETRY_COUNT) { + drv_cmd->retry_count++; + dprint_bsg_err(mrioc, + "pel_get_seqnum: retrying(%d)\n", + drv_cmd->retry_count); + mpi3mr_pel_get_seqnum_post(mrioc, drv_cmd); + return; + } + + dprint_bsg_err(mrioc, + "pel_get_seqnum: failed after all retries(%d)\n", + drv_cmd->retry_count); + goto out_failed; + } + mrioc->pel_newest_seqnum = le32_to_cpu(pel_seqnum_virt->newest) + 1; + drv_cmd->retry_count = 0; + mpi3mr_pel_wait_post(mrioc, drv_cmd); + + return; +out_failed: + mrioc->pel_enabled = false; +cleanup_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; +} + +/** + * mpi3mr_soft_reset_handler - Reset the controller + * @mrioc: Adapter instance reference + * @reset_reason: Reset reason code + * @snapdump: Flag to generate snapdump in firmware or not + * + * This is an handler for recovering controller by issuing soft + * reset or diag fault reset. This is a blocking function and + * when one reset is executed if any other resets they will be + * blocked. All BSGs/IO will be blocked during the reset. If + * controller reset is successful then the controller will be + * reinitalized, otherwise the controller will be marked as not + * recoverable + * + * If snapdump bit is set, the controller is issued with diag + * fault reset so that the firmware can create a snap dump and + * post that the firmware will result in F000 fault and the + * driver will issue soft reset to recover from that. + * + * Return: 0 on success, non-zero on failure. 
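The reset handler documented above runs resets one at a time: a caller that cannot take the reset mutex waits for the in-flight reset and then reports that reset's result instead of starting another. A simplified pthread sketch of that single-flight gate; the wait for the in-flight reset is elided here and the names are illustrative:

// build with: cc -pthread
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t reset_mutex = PTHREAD_MUTEX_INITIALIZER;
static int prev_reset_result = -1;

static int do_reset(void)
{
    if (pthread_mutex_trylock(&reset_mutex) != 0) {
        // another reset holds the mutex; report its (previous) result
        printf("reset in progress, returning previous result %d\n",
               prev_reset_result);
        return prev_reset_result;
    }
    // ... the actual reset work would run here ...
    prev_reset_result = 0;
    pthread_mutex_unlock(&reset_mutex);
    return 0;
}

int main(void)
{
    pthread_mutex_lock(&reset_mutex);   // pretend a reset is already running
    do_reset();                         // gated path
    pthread_mutex_unlock(&reset_mutex);
    do_reset();                         // normal path: performs the reset
    return 0;
}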
+ */ +int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc, + u32 reset_reason, u8 snapdump) +{ + int retval = 0, i; + unsigned long flags; + u32 host_diagnostic, timeout = MPI3_SYSIF_DIAG_SAVE_TIMEOUT * 10; + u32 fault; + + /* Block the reset handler until diag save in progress*/ + dprint_reset(mrioc, + "soft_reset_handler: check and block on diagsave_timeout(%d)\n", + mrioc->diagsave_timeout); + while (mrioc->diagsave_timeout) + ssleep(1); + /* + * Block new resets until the currently executing one is finished and + * return the status of the existing reset for all blocked resets + */ + dprint_reset(mrioc, "soft_reset_handler: acquiring reset_mutex\n"); + if (!mutex_trylock(&mrioc->reset_mutex)) { + ioc_info(mrioc, + "controller reset triggered by %s is blocked due to another reset in progress\n", + mpi3mr_reset_rc_name(reset_reason)); + do { + ssleep(1); + } while (mrioc->reset_in_progress == 1); + ioc_info(mrioc, + "returning previous reset result(%d) for the reset triggered by %s\n", + mrioc->prev_reset_result, + mpi3mr_reset_rc_name(reset_reason)); + return mrioc->prev_reset_result; + } + ioc_info(mrioc, "controller reset is triggered by %s\n", + mpi3mr_reset_rc_name(reset_reason)); + + mrioc->device_refresh_on = 0; + mrioc->reset_in_progress = 1; + mrioc->block_bsgs = 1; + mrioc->prev_reset_result = -1; + + if ((!snapdump) && (reset_reason != MPI3MR_RESET_FROM_FAULT_WATCH) && + (reset_reason != MPI3MR_RESET_FROM_FIRMWARE) && + (reset_reason != MPI3MR_RESET_FROM_CIACTIV_FAULT)) { + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_SOFT_RESET, 0, 0); + dprint_reset(mrioc, + "soft_reset_handler: releasing host diagnostic buffers\n"); + mpi3mr_release_diag_bufs(mrioc, 0); + for (i = 0; i < MPI3_EVENT_NOTIFY_EVENTMASK_WORDS; i++) + mrioc->event_masks[i] = -1; + dprint_reset(mrioc, "soft_reset_handler: masking events\n"); + mpi3mr_issue_event_notification(mrioc); + } + + mpi3mr_wait_for_host_io(mrioc, MPI3MR_RESET_HOST_IOWAIT_TIMEOUT); + + mpi3mr_ioc_disable_intr(mrioc); + + if (snapdump) { + dprint_reset(mrioc, + "soft_reset_handler: saving snapdump\n"); + mpi3mr_do_dump(mrioc); + mpi3mr_set_diagsave(mrioc); + retval = mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, reset_reason); + if (!retval) { + fault = readl(&mrioc->sysif_regs->fault); + do { + host_diagnostic = + readl(&mrioc->sysif_regs->host_diagnostic); + if (!(host_diagnostic & + MPI3_SYSIF_HOST_DIAG_SAVE_IN_PROGRESS)) + break; + msleep(100); + } while (--timeout); + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_FAULT, fault, 0); + } + } + retval = mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, reset_reason); + if (retval) { + ioc_err(mrioc, + "failed to issue soft reset to the controller\n"); + goto out; + } + + mpi3mr_flush_delayed_cmd_lists(mrioc); + mpi3mr_flush_drv_cmds(mrioc); + memset(mrioc->devrem_bitmap, 0, mrioc->devrem_bitmap_sz); + memset(mrioc->removepend_bitmap, 0, mrioc->dev_handle_bitmap_sz); + memset(mrioc->evtack_cmds_bitmap, 0, mrioc->evtack_cmds_bitmap_sz); + mpi3mr_flush_host_io(mrioc); + mpi3mr_cleanup_fwevt_list(mrioc); + mpi3mr_invalidate_devhandles(mrioc); + mpi3mr_free_enclosure_list(mrioc); + + if (mrioc->prepare_for_reset) { + mrioc->prepare_for_reset = 0; + mrioc->prepare_for_reset_timeout_counter = 0; + } + +#if defined(IO_COUNTER_SUPPORT) + atomic_set(&mrioc->pend_ios, 0); +#endif + mpi3mr_memset_buffers(mrioc); + mpi3mr_release_diag_bufs(mrioc, 1); + mrioc->fw_release_trigger_active = false; + 
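+	/*
+	 * Clear the remaining diag buffer trigger bookkeeping (trace and
+	 * snapdump) so that trigger conditions are re-armed from a clean
+	 * state before the controller is reinitialized below.
+	 */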
mrioc->trace_release_trigger_active = false; + mrioc->snapdump_trigger_active = false; + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_SOFT_RESET, 0, 0); + + dprint_reset(mrioc, + "soft_reset_handler: reinitializing the controller\n"); + retval = mpi3mr_reinit_ioc(mrioc, 0); + if (retval) { + ioc_err(mrioc, "reinitialization after soft reset failed\n"); + goto out; + } + dprint_reset(mrioc, + "soft_reset_handler: waiting for device events to settle\n"); + ssleep(10); + +out: + if (!retval) { + mrioc->diagsave_timeout = 0; + mrioc->reset_in_progress = 0; + mrioc->pel_abort_requested = 0; + if (mrioc->pel_enabled) { + mrioc->pel_cmds.retry_count = 0; + mpi3mr_pel_wait_post(mrioc, &mrioc->pel_cmds); + } + + mrioc->device_refresh_on = 0; + + mrioc->ts_update_counter = 0; + spin_lock_irqsave(&mrioc->watchdog_lock, flags); + if (mrioc->watchdog_work_q) + queue_delayed_work(mrioc->watchdog_work_q, + &mrioc->watchdog_work, + msecs_to_jiffies(MPI3MR_WATCHDOG_INTERVAL)); + spin_unlock_irqrestore(&mrioc->watchdog_lock, flags); + mrioc->block_bsgs = 0; + if (mrioc->pel_enabled) + atomic64_inc(&event_counter); + } else { + dprint_reset(mrioc, + "soft_reset_handler failed, marking controller as unrecoverable\n"); + mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, reset_reason); + mrioc->device_refresh_on = 0; + mrioc->unrecoverable = 1; + mrioc->reset_in_progress = 0; + retval = -1; + mpi3mr_flush_cmds_for_unrecovered_controller(mrioc); + } + mrioc->prev_reset_result = retval; + mutex_unlock(&mrioc->reset_mutex); + ioc_info(mrioc, "controller reset is %s\n", + ((retval == 0) ? "successful" : "failed")); + return retval; +} + + +/** + * mpi3mr_free_config_dma_memory - free memory for config page + * @mrioc: Adapter instance reference + * @mem_desc: memory descriptor structure + * + * Check whether the size of the buffer specified by the memory + * descriptor is greater than the defaulpage size if so then + * free the memory pointed by the descriptor. + * + * Return: 0 on success, non-zero on failure. + */ +static void mpi3mr_free_config_dma_memory(struct mpi3mr_ioc *mrioc, + struct dma_memory_desc *mem_desc) +{ + if ((mem_desc->size > mrioc->cfg_page_sz) && mem_desc->addr) { + dma_free_coherent(&mrioc->pdev->dev, mem_desc->size, + mem_desc->addr, mem_desc->dma_addr); + mem_desc->addr = NULL; + } +} + + + +/** + * mpi3mr_alloc_config_dma_memory - Alloc memory for config page + * @mrioc: Adapter instance reference + * @mem_desc: Memory descriptor to hold dma memory info + * + * This function allocates new dmaable memory or provides the + * default config page dmaable memory based on the memory size + * described by the descriptor. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_alloc_config_dma_memory(struct mpi3mr_ioc *mrioc, + struct dma_memory_desc *mem_desc) +{ + if (mem_desc->size > mrioc->cfg_page_sz) { + mem_desc->addr = dma_zalloc_coherent(&mrioc->pdev->dev, + mem_desc->size, &mem_desc->dma_addr, GFP_KERNEL); + if (!mem_desc->addr) + return -ENOMEM; + } else { + mem_desc->addr = mrioc->cfg_page; + mem_desc->dma_addr = mrioc->cfg_page_dma; + memset(mem_desc->addr, 0, mrioc->cfg_page_sz); + } + return 0; +} + + +/** + * mpi3mr_post_cfg_req - Issue config requests and wait + * @mrioc: Adapter instance reference + * @cfg_request: Configuration request + * @timeout: Timeout in seconds + * @ioc_status: Pointer to return ioc status + * + * A generic function for posting MPI3 configuration request to + * the firmware. 
This blocks for the completion of request for + * timeout seconds and if the request times out this function + * faults the controller with proper reason code. + * + * On successful completion of the request this function returns + * appropriate ioc status from the firmware back to the caller. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_post_cfg_req(struct mpi3mr_ioc *mrioc, + struct mpi3_config_request *cfg_req, int timeout, u16 *ioc_status) +{ + int retval = 0; + + mutex_lock(&mrioc->cfg_cmds.mutex); + if (mrioc->cfg_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending config request failed due to command in use\n"); + mutex_unlock(&mrioc->cfg_cmds.mutex); + goto out; + } + mrioc->cfg_cmds.state = MPI3MR_CMD_PENDING; + mrioc->cfg_cmds.is_waiting = 1; + mrioc->cfg_cmds.callback = NULL; + mrioc->cfg_cmds.ioc_status = 0; + mrioc->cfg_cmds.ioc_loginfo = 0; + + cfg_req->host_tag = cpu_to_le16(MPI3MR_HOSTTAG_CFG_CMDS); + cfg_req->function = MPI3_FUNCTION_CONFIG; + + init_completion(&mrioc->cfg_cmds.done); + dprint_cfg_info(mrioc, "posting config request\n"); + if (mrioc->logging_level & MPI3_DEBUG_CFG_INFO) + dprint_dump(cfg_req, sizeof(struct mpi3_config_request), + "mpi3_cfg_req"); + retval = mpi3mr_admin_request_post(mrioc, cfg_req, sizeof(*cfg_req), 1); + if (retval) { + ioc_err(mrioc, "posting config request failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->cfg_cmds.done, (timeout * HZ)); + if (!(mrioc->cfg_cmds.state & MPI3MR_CMD_COMPLETE)) { + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_CFG_REQ_TIMEOUT); + ioc_err(mrioc, "config request timed out\n"); + retval = -1; + goto out_unlock; + } + *ioc_status = mrioc->cfg_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK; + if ((*ioc_status) != MPI3_IOCSTATUS_SUCCESS) + dprint_cfg_err(mrioc, + "cfg_page request returned with ioc_status(0x%04x), log_info(0x%08x)\n", + *ioc_status, mrioc->cfg_cmds.ioc_loginfo); + +out_unlock: + mrioc->cfg_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->cfg_cmds.mutex); + +out: + return retval; +} + +/** + * mpi3mr_process_cfg_req - config page request processor + * @mrioc: Adapter instance reference + * @cfg_request: Configuration request + * @cf_hdr: Configuration page header + * @timeout: Timeout in seconds + * @ioc_status: Pointer to return ioc status + * @cfg_buf: Memory pointer to copy config page or header + * @cfg_buf_sz: Size of the memory to get config page or header + * + * This is handler for config page read, write and config page + * header read operations. + * + * This function expects the cfg_req to be populated with page + * type, page number, action for the header read and with page + * address for all other operations. + * + * The cfg_hdr can be passed as null for reading required header + * details for read/write pages the cfg_hdr should point valid + * configuration page header. + * + * This allocates dmaable memory based on the size of the config + * buffer and set the SGE of the cfg_req. + * + * For write actions, the config page data has to be passed in + * the cfg_buf and size of the data has to be mentioned in the + * cfg_buf_sz. + * + * For read/header actions, on successful completion of the + * request with successful ioc_status the data will be copied + * into the cfg_buf limited to a minium of actual page size and + * cfg_buf_sz + * + * + * Return: 0 on success, non-zero on failure. 
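+ *
+ * Usage sketch (illustrative only; it mirrors the page readers defined
+ * below, with page_buf/page_buf_sz standing in for the caller's
+ * buffer):
+ *
+ *   cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER;
+ *   mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, timeout, &ioc_status,
+ *       &cfg_hdr, sizeof(cfg_hdr));
+ *   cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT;
+ *   mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, timeout,
+ *       &ioc_status, page_buf, page_buf_sz);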
+ */ +static int mpi3mr_process_cfg_req(struct mpi3mr_ioc *mrioc, + struct mpi3_config_request *cfg_req, + struct mpi3_config_page_header *cfg_hdr, int timeout, u16 *ioc_status, + void *cfg_buf, u32 cfg_buf_sz) +{ + struct dma_memory_desc mem_desc; + int retval = -1; + u8 invalid_action = 0; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + memset(&mem_desc, 0, sizeof(struct dma_memory_desc)); + + if (cfg_req->action == MPI3_CONFIG_ACTION_PAGE_HEADER) + mem_desc.size = sizeof(struct mpi3_config_page_header); + else { + if (!cfg_hdr) { + ioc_err(mrioc, "null config header passed for config action(%d), page_type(0x%02x), page_num(%d)\n", + cfg_req->action, cfg_req->page_type, + cfg_req->page_number); + goto out; + } + switch (cfg_hdr->page_attribute & MPI3_CONFIG_PAGEATTR_MASK) { + case MPI3_CONFIG_PAGEATTR_READ_ONLY: + if (cfg_req->action + != MPI3_CONFIG_ACTION_READ_CURRENT) + invalid_action = 1; + break; + case MPI3_CONFIG_PAGEATTR_CHANGEABLE: + if ((cfg_req->action == + MPI3_CONFIG_ACTION_READ_PERSISTENT) || + (cfg_req->action == + MPI3_CONFIG_ACTION_WRITE_PERSISTENT)) + invalid_action = 1; + break; + case MPI3_CONFIG_PAGEATTR_PERSISTENT: + default: + break; + } + if (invalid_action) { + ioc_err(mrioc, + "config action(%d) is not allowed for page_type(0x%02x), page_num(%d) with page_attribute(0x%02x)\n", + cfg_req->action, cfg_req->page_type, + cfg_req->page_number, cfg_hdr->page_attribute); + goto out; + } + mem_desc.size = le16_to_cpu(cfg_hdr->page_length) * 4; + cfg_req->page_length = cfg_hdr->page_length; + cfg_req->page_version = cfg_hdr->page_version; + } + if (mpi3mr_alloc_config_dma_memory(mrioc, &mem_desc)) + goto out; + + mpi3mr_add_sg_single(&cfg_req->sgl, sgl_flags, mem_desc.size, + mem_desc.dma_addr); + + if ((cfg_req->action == MPI3_CONFIG_ACTION_WRITE_PERSISTENT) || + (cfg_req->action == MPI3_CONFIG_ACTION_WRITE_CURRENT)) { + memcpy(mem_desc.addr, cfg_buf, min_t(u16, mem_desc.size, + cfg_buf_sz)); + dprint_cfg_info(mrioc, "config buffer to be written\n"); + if (mrioc->logging_level & MPI3_DEBUG_CFG_INFO) + dprint_dump(mem_desc.addr, mem_desc.size, "cfg_buf"); + } + + if (mpi3mr_post_cfg_req(mrioc, cfg_req, timeout, ioc_status)) + goto out; + + retval = 0; + if ((*ioc_status == MPI3_IOCSTATUS_SUCCESS) && + (cfg_req->action != MPI3_CONFIG_ACTION_WRITE_PERSISTENT) && + (cfg_req->action != MPI3_CONFIG_ACTION_WRITE_CURRENT)) { + memcpy(cfg_buf, mem_desc.addr, min_t(u16, mem_desc.size, + cfg_buf_sz)); + dprint_cfg_info(mrioc, "config buffer read\n"); + if (mrioc->logging_level & MPI3_DEBUG_CFG_INFO) + dprint_dump(mem_desc.addr, mem_desc.size, "cfg_buf"); + } + +out: + mpi3mr_free_config_dma_memory(mrioc, &mem_desc); + return retval; +} + +/** + * mpi3mr_cfg_get_dev_pg0 - Read current device page0 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @dev_pg0: Pointer to return device page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like device handle + * + * This is handler for config page read for a specific device + * page0. The ioc_status has the controller returned ioc_status. + * This routine doesn't check ioc_status to decide whether the + * page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
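+ *
+ * Example caller (illustrative; MPI3_DEVICE_PGAD_FORM_HANDLE is
+ * assumed to be the handle-addressed form from the MPI3 headers):
+ *
+ *   struct mpi3_device_page0 dev_pg0;
+ *   u16 ioc_status;
+ *
+ *   if (!mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &dev_pg0,
+ *       sizeof(dev_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, dev_handle) &&
+ *       ioc_status == MPI3_IOCSTATUS_SUCCESS)
+ *       ... consume dev_pg0 ...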
+ */ +int mpi3mr_cfg_get_dev_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_device_page0 *dev_pg0, u16 pg_sz, u32 form, u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(dev_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_DEVICE; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "device page0 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "device page0 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_DEVICE_PGAD_FORM_MASK) | + (form_spec & MPI3_DEVICE_PGAD_HANDLE_MASK)); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, dev_pg0, pg_sz)) { + ioc_err(mrioc, "device page0 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + + +/** + * mpi3mr_cfg_get_sas_phy_pg0 - Read current SAS Phy page0 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @phy_pg0: Pointer to return SAS Phy page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like phy number + * + * This is handler for config page read for a specific SAS Phy + * page0. The ioc_status has the controller returned ioc_status. + * This routine doesn't check ioc_status to decide whether the + * page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_sas_phy_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_phy_page0 *phy_pg0, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(phy_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_PHY; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas phy page0 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas phy page0 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_SAS_PHY_PGAD_FORM_MASK) | + (form_spec & MPI3_SAS_PHY_PGAD_PHY_NUMBER_MASK)); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, phy_pg0, pg_sz)) { + ioc_err(mrioc, "sas phy page0 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_sas_phy_pg1 - Read current SAS Phy page1 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @phy_pg1: Pointer to return SAS Phy page 1 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like phy number + * + * This is handler for config page read for a specific SAS Phy + * page1. The ioc_status has the controller returned ioc_status. + * This routine doesn't check ioc_status to decide whether the + * page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_sas_phy_pg1(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_phy_page1 *phy_pg1, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(phy_pg1, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_PHY; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas phy page1 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas phy page1 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_SAS_PHY_PGAD_FORM_MASK) | + (form_spec & MPI3_SAS_PHY_PGAD_PHY_NUMBER_MASK)); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, phy_pg1, pg_sz)) { + ioc_err(mrioc, "sas phy page1 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + + +/** + * mpi3mr_cfg_get_sas_exp_pg0 - Read current SAS Expander page0 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @exp_pg0: Pointer to return SAS Expander page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like device handle + * + * This is handler for config page read for a specific SAS + * Expander page0. The ioc_status has the controller returned + * ioc_status. This routine doesn't check ioc_status to decide + * whether the page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_sas_exp_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_expander_page0 *exp_pg0, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(exp_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_EXPANDER; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "expander page0 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "expander page0 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_SAS_EXPAND_PGAD_FORM_MASK) | + (form_spec & (MPI3_SAS_EXPAND_PGAD_PHYNUM_MASK | + MPI3_SAS_EXPAND_PGAD_HANDLE_MASK))); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, exp_pg0, pg_sz)) { + ioc_err(mrioc, "expander page0 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_sas_exp_pg1 - Read current SAS Expander page1 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @exp_pg1: Pointer to return SAS Expander page 1 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like phy number + * + * This is handler for config page read for a specific SAS + * Expander page1. The ioc_status has the controller returned + * ioc_status. This routine doesn't check ioc_status to decide + * whether the page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_sas_exp_pg1(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_expander_page1 *exp_pg1, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(exp_pg1, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_EXPANDER; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "expander page1 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "expander page1 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_SAS_EXPAND_PGAD_FORM_MASK) | + (form_spec & (MPI3_SAS_EXPAND_PGAD_PHYNUM_MASK | + MPI3_SAS_EXPAND_PGAD_HANDLE_MASK))); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, exp_pg1, pg_sz)) { + ioc_err(mrioc, "expander page1 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_enclosure_pg0 - Read current Enclosure page0 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @encl_pg0: Pointer to return Enclosure page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like device handle + * + * This is handler for config page read for a specific Enclosure + * page0. The ioc_status has the controller returned ioc_status. + * This routine doesn't check ioc_status to decide whether the + * page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_enclosure_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_enclosure_page0 *encl_pg0, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(encl_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_ENCLOSURE; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "enclosure page0 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "enclosure page0 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_ENCLOS_PGAD_FORM_MASK) | + (form_spec & MPI3_ENCLOS_PGAD_HANDLE_MASK)); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, encl_pg0, pg_sz)) { + ioc_err(mrioc, "enclosure page0 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + + +/** + * mpi3mr_cfg_get_sas_io_unit_pg0 - Read current SASIOUnit page0 + * @mrioc: Adapter instance reference + * @sas_io_unit_pg0: Pointer to return SAS IO Unit page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * + * This is handler for config page read for the SAS IO Unit + * page0. This routine checks ioc_status to decide whether the + * page read is success or not. + * + * Return: 0 on success, non-zero on failure. + */ +int mpi3mr_cfg_get_sas_io_unit_pg0(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0, u16 pg_sz) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(sas_io_unit_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_IO_UNIT; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas io unit page0 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page0 header read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, sas_io_unit_pg0, pg_sz)) { + ioc_err(mrioc, "sas io unit page0 read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page0 read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_sas_io_unit_pg1 - Read current SASIOUnit page1 + * @mrioc: Adapter instance reference + * @sas_io_unit_pg1: Pointer to return SAS IO Unit page 1 + * @pg_sz: Size of the memory allocated to the page pointer + * + * This is handler for config page read for the SAS IO Unit + * page1. 
This routine checks ioc_status to decide whether the + * page read is success or not. + * + * Return: 0 on success, non-zero on failure. + */ +int mpi3mr_cfg_get_sas_io_unit_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1, u16 pg_sz) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(sas_io_unit_pg1, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_IO_UNIT; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas io unit page1 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 header read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, sas_io_unit_pg1, pg_sz)) { + ioc_err(mrioc, "sas io unit page1 read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_set_sas_io_unit_pg1 - Write SASIOUnit page1 + * @mrioc: Adapter instance reference + * @sas_io_unit_pg1: Pointer to the SAS IO Unit page 1 to write + * @pg_sz: Size of the memory allocated to the page pointer + * + * This is handler for config page write for the SAS IO Unit + * page1. This routine checks ioc_status to decide whether the + * page read is success or not. This will modify both current + * and persistent page. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_set_sas_io_unit_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1, u16 pg_sz) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_IO_UNIT; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas io unit page1 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 header read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_WRITE_CURRENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, sas_io_unit_pg1, pg_sz)) { + ioc_err(mrioc, "sas io unit page1 write current failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 write current failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + + cfg_req.action = MPI3_CONFIG_ACTION_WRITE_PERSISTENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, sas_io_unit_pg1, pg_sz)) { + ioc_err(mrioc, "sas io unit page1 write persistent failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 write persistent failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_driver_pg1 - Read current Driver page1 + * @mrioc: Adapter instance reference + * @driver_pg1: Pointer to return Driver page 1 + * @pg_sz: Size of the memory allocated to the page pointer + * + * This is handler for config page read for the Driver page1. + * This routine checks ioc_status to decide whether the page + * read is success or not. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_driver_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_driver_page1 *driver_pg1, u16 pg_sz) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(driver_pg1, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_DRIVER; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "driver page1 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "driver page1 header read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, driver_pg1, pg_sz)) { + ioc_err(mrioc, "driver page1 read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "driver page1 read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} +/** + * mpi3mr_cfg_get_driver_pg2 - Read current driver page2 + * @mrioc: Adapter instance reference + * @driver_pg2: Pointer to return driver page 2 + * @pg_sz: Size of the memory allocated to the page pointer + * @page_action: Page action + * + * This is handler for config page read for the driver page2. + * This routine checks ioc_status to decide whether the page + * read is success or not. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_driver_pg2(struct mpi3mr_ioc *mrioc, + struct mpi3_driver_page2 *driver_pg2, u16 pg_sz, u8 page_action) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(driver_pg2, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_DRIVER; + cfg_req.page_number = 2; + cfg_req.page_address = 0; + cfg_req.page_version = MPI3_DRIVER2_PAGEVERSION; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "driver page2 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "driver page2 header read failed with" + "ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = page_action; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, driver_pg2, pg_sz)) { + ioc_err(mrioc, "driver page2 read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "driver page2 read failed with" + "ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + diff --git a/drivers/scsi/mpi3mr/mpi3mr_kernel_compat.h b/drivers/scsi/mpi3mr/mpi3mr_kernel_compat.h new file mode 100644 index 0000000000000..6fa51466a911a --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_kernel_compat.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. 
+ * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ +#include + +struct mpi3mr_kmsg_dumper { +#if ((KERNEL_VERSION(5,13,0) <= LINUX_VERSION_CODE) || \ + (defined(RHEL_MAJOR) && (RHEL_MAJOR == 8) && (RHEL_MINOR >= 6))) + struct kmsg_dump_iter kdumper; +#else + struct kmsg_dumper kdumper; +#endif +}; + +static inline void mpi3mr_set_dumper_active(struct mpi3mr_kmsg_dumper *dumper) +{ +#if ((KERNEL_VERSION(5,13,0) <= LINUX_VERSION_CODE) || \ + (defined(RHEL_MAJOR) && (RHEL_MAJOR == 8) && (RHEL_MINOR >= 6))) + return; +#else + dumper->kdumper.active = true; + return; +#endif +} + +#if (KERNEL_VERSION(5,15,0) <= LINUX_VERSION_CODE) +#define SCMD_GET_REQUEST(scmd) scsi_cmd_to_rq(scmd) +#else +#define SCMD_GET_REQUEST(scmd) scmd->request +#endif + +#if (KERNEL_VERSION(5,16,0) <= LINUX_VERSION_CODE) +#define SCMD_DONE(scmd) scsi_done(scmd) +#else +#define SCMD_DONE(scmd) scmd->scsi_done(scmd) +#endif + +static inline u32 mpi3mr_kc_prot_ref_tag(struct scsi_cmnd *scmd) +{ +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)) || \ + (defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)) || \ + (defined(CONFIG_SUSE_KERNEL) && ((CONFIG_SUSE_VERSION == 15) && \ + (CONFIG_SUSE_PATCHLEVEL >= 1)))) + return t10_pi_ref_tag(SCMD_GET_REQUEST(scmd)); +#else + return scsi_prot_ref_tag(scmd); +#endif +} + +static inline bool mpi3mr_use_blk_mq(struct Scsi_Host *shost) +{ +#if ((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0))) + return true; +#else + return shost_use_blk_mq(shost); +#endif +} + +/*Revisit enabling Shared HostTag for RHEL8x kernels*/ +#if ((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8) && (RHEL_MINOR >= 6)) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(5,16,0))) +#define HOST_TAGSET_SUPPORT +#endif + +#if (defined(SCMD_STATE_INFLIGHT) && !defined(HOST_TAGSET_SUPPORT)) +#define IO_COUNTER_SUPPORT +#endif + + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) +#define dma_zalloc_coherent dma_alloc_coherent +#endif + +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) || \ + (defined(RHEL_MAJOR) && (RHEL_MAJOR == 8))) +#define BLK_ITER_CALLBACK_RET_TYPE bool +#define BLK_ITER_CALLBACK_RET_VAL(x) return x +#else +#define BLK_ITER_CALLBACK_RET_TYPE void +#define BLK_ITER_CALLBACK_RET_VAL(x) return +#endif + +/** + * mpi3mr_scsi_build_sense - build sense data + * @scmd: scsi command object + * @desc: Sense format (non zero == descriptor format, + * 0 == fixed format) + * @key: Sense key + * @asc: Additional sense code + * @ascq: Additional sense code qualifier + **/ +static inline void mpi3mr_scsi_build_sense(struct scsi_cmnd *scmd, + int desc, u8 key, u8 asc, u8 ascq) +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,14,0)) + scsi_build_sense_buffer(desc, scmd->sense_buffer, key, asc, ascq); + scmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION; + set_host_byte(scmd, DID_OK); +#else + scsi_build_sense(scmd, desc, key, asc, ascq); +#endif +} + +#ifndef fallthrough +#define fallthrough +#endif diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c new file mode 100644 index 0000000000000..4718a01aa2563 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -0,0 +1,6019 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. 
+ * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include "mpi3mr.h" + +/* Global driver scope variables */ +LIST_HEAD(mrioc_list); +DEFINE_SPINLOCK(mrioc_list_lock); +static int mrioc_ids; +static int warn_non_secure_ctlr; +atomic64_t event_counter; + +MODULE_AUTHOR(MPI3MR_DRIVER_AUTHOR); +MODULE_DESCRIPTION(MPI3MR_DRIVER_DESC); +MODULE_LICENSE(MPI3MR_DRIVER_LICENSE); +MODULE_VERSION(MPI3MR_DRIVER_VERSION); + +/* Module parameters*/ +static int logging_level; +module_param(logging_level, int, 0444); +MODULE_PARM_DESC(logging_level, + " Enable additional logging info (default=0)"); + +static bool enable_dif = true; +module_param(enable_dif, bool, 0444); +MODULE_PARM_DESC(enable_dif, + "Enable Data Integrity Format (DIF) support (Default = 1)"); + +bool enable_dix; +module_param(enable_dix, bool, 0444); +MODULE_PARM_DESC(enable_dix, + "Enable Data Integrity Extensions (DIX) support (Default = 0)"); + +extern bool enable_segqueue; +extern int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num); +extern struct dentry *mpi3mr_debugfs_root; +extern void mpi3mr_init_debugfs(void); +extern void mpi3mr_exit_debugfs(void); +extern void mpi3mr_setup_debugfs(struct mpi3mr_ioc *mrioc); +extern void mpi3mr_destroy_debugfs(struct mpi3mr_ioc *mrioc); + +/* Forward declarations*/ +static int mpi3mr_change_queue_depth(struct scsi_device *sdev, + int q_depth); +static void mpi3mr_dev_rmhs_send_tm(struct mpi3mr_ioc *mrioc, u16 handle, + struct mpi3mr_drv_cmd *cmdparam, u8 iou_rc); +static void mpi3mr_send_event_ack(struct mpi3mr_ioc *mrioc, u8 event, + struct mpi3mr_drv_cmd *cmdparam, u32 event_ctx); +static void mpi3mr_fwevt_worker(struct work_struct *work); + +#define MPI3MR_DRIVER_EVENT_PROCESS_TRIGGER (0xFFFD) +#define MPI3MR_DRIVER_EVENT_WAIT_FOR_DEVICES_TO_REFRESH (0xFFFE) +#define MPI3MR_DRIVER_EVENT_TG_QD_REDUCTION (0xFFFF) + +/** + * struct delayed_dev_rmhs_node - Delayed device removal node + * + * @list: list head + * @handle: Device handle + * @iou_rc: IO Unit Control Reason Code + */ +struct delayed_dev_rmhs_node { + struct list_head list; + u16 handle; + u8 iou_rc; +}; + +/** + * struct delayed_evt_ack_node - Delayed event ack node + * + * @list: list head + * @event: MPI3 event ID + * @event_ctx: event context + */ +struct delayed_evt_ack_node { + struct list_head list; + u8 event; + u32 event_ctx; +}; + +/** + * mpi3mr_fwevt_free - firmware event memory dealloctor + * @r: k reference pointer of the firmware event + * + * Free firmware event memory when no reference. + */ +static void mpi3mr_fwevt_free(struct kref *r) +{ + kfree(container_of(r, struct mpi3mr_fwevt, ref_count)); +} + +/** + * mpi3mr_fwevt_get - k reference incrementor + * @fwevt: Firmware event reference + * + * Increment firmware event reference count. + */ +static void mpi3mr_fwevt_get(struct mpi3mr_fwevt *fwevt) +{ + kref_get(&fwevt->ref_count); +} + +/** + * mpi3mr_fwevt_put - k reference decrementor + * @fwevt: Firmware event reference + * + * decrement firmware event reference count. + */ +static void mpi3mr_fwevt_put(struct mpi3mr_fwevt *fwevt) +{ + kref_put(&fwevt->ref_count, mpi3mr_fwevt_free); +} + +/** + * mpi3mr_alloc_fwevt - Allocate firmware event + * @len: length of firmware event data to allocate + * + * Allocate firmware event with required length and initialize + * the reference counter. + * + * Return: firmware event reference. 
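+ *
+ * Note: the returned event carries one reference from kref_init();
+ * mpi3mr_fwevt_add_to_list() takes two more (one for the list, one for
+ * the queued work), which are dropped again by
+ * mpi3mr_fwevt_del_from_list() and, once the work has run, by the
+ * event processing path (see mpi3mr_cleanup_fwevt_list()).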
+ */ +static struct mpi3mr_fwevt *mpi3mr_alloc_fwevt(int len) +{ + struct mpi3mr_fwevt *fwevt; + + fwevt = kzalloc(sizeof(*fwevt) + len, GFP_ATOMIC); + if (!fwevt) + return NULL; + + kref_init(&fwevt->ref_count); + return fwevt; +} + +/** + * mpi3mr_fwevt_add_to_list - Add firmware event to the list + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Add the given firmware event to the firmware event list. + * + * Return: Nothing. + */ +static void mpi3mr_fwevt_add_to_list(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + unsigned long flags; + + if (!mrioc->fwevt_worker_thread) + return; + + spin_lock_irqsave(&mrioc->fwevt_lock, flags); + /* get fwevt reference count while adding it to fwevt_list */ + mpi3mr_fwevt_get(fwevt); + INIT_LIST_HEAD(&fwevt->list); + list_add_tail(&fwevt->list, &mrioc->fwevt_list); + INIT_WORK(&fwevt->work, mpi3mr_fwevt_worker); + /* get fwevt reference count while enqueueing it to worker queue */ + mpi3mr_fwevt_get(fwevt); + queue_work(mrioc->fwevt_worker_thread, &fwevt->work); + spin_unlock_irqrestore(&mrioc->fwevt_lock, flags); +} +/** + * mpi3mr_hdb_trigger_data_event - Add hdb trigger data event to + * the list + * @mrioc: Adapter instance reference + * @event_data: Event data + * + * Add the given hdb trigger data event to the firmware event + * list. + * + * Return: Nothing. + */ +void mpi3mr_hdb_trigger_data_event(struct mpi3mr_ioc *mrioc, + struct trigger_event_data *event_data) +{ + struct mpi3mr_fwevt *fwevt; + u16 sz = sizeof(*event_data); + + fwevt = mpi3mr_alloc_fwevt(sz); + if (!fwevt) { + ioc_warn(mrioc, "failed to queue hdb trigger data event\n"); + return; + } + + fwevt->mrioc = mrioc; + fwevt->event_id = MPI3MR_DRIVER_EVENT_PROCESS_TRIGGER; + fwevt->send_ack = 0; + fwevt->process_event = 1; + fwevt->event_context = 0; + fwevt->event_data_size = sz; + memcpy(fwevt->event_data, event_data, sz); + + mpi3mr_fwevt_add_to_list(mrioc, fwevt); +} + +/** + * mpi3mr_fwevt_del_from_list - Delete firmware event from list + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Delete the given firmware event from the firmware event list. + * + * Return: Nothing. + */ +static void mpi3mr_fwevt_del_from_list(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + unsigned long flags; + + spin_lock_irqsave(&mrioc->fwevt_lock, flags); + if (!list_empty(&fwevt->list)) { + list_del_init(&fwevt->list); + /* + * Put fwevt reference count after + * removing it from fwevt_list + */ + mpi3mr_fwevt_put(fwevt); + } + spin_unlock_irqrestore(&mrioc->fwevt_lock, flags); +} + +/** + * mpi3mr_dequeue_fwevt - Dequeue firmware event from the list + * @mrioc: Adapter instance reference + * + * Dequeue a firmware event from the firmware event list. + * + * Return: firmware event. + */ +static struct mpi3mr_fwevt *mpi3mr_dequeue_fwevt( + struct mpi3mr_ioc *mrioc) +{ + unsigned long flags; + struct mpi3mr_fwevt *fwevt = NULL; + + spin_lock_irqsave(&mrioc->fwevt_lock, flags); + if (!list_empty(&mrioc->fwevt_list)) { + fwevt = list_first_entry(&mrioc->fwevt_list, + struct mpi3mr_fwevt, list); + list_del_init(&fwevt->list); + /* + * Put fwevt reference count after + * removing it from fwevt_list + */ + mpi3mr_fwevt_put(fwevt); + } + spin_unlock_irqrestore(&mrioc->fwevt_lock, flags); + + return fwevt; +} + +/** + * mpi3mr_cleanup_fwevt_list - Cleanup firmware event list + * @mrioc: Adapter instance reference + * + * Flush all pending firmware events from the firmware event + * list. + * + * Return: Nothing. 
+ */ +void mpi3mr_cleanup_fwevt_list(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_fwevt *fwevt = NULL; + + if ((list_empty(&mrioc->fwevt_list) && !mrioc->current_event) || + !mrioc->fwevt_worker_thread) + return; + dprint_reset(mrioc, "flushing firmware events\n"); + while ((fwevt = mpi3mr_dequeue_fwevt(mrioc))) { + /* + * Wait on the fwevt to complete. If this returns 1, then + * the event was never executed, and we need a put for the + * reference the work had on the fwevt. + */ + if (cancel_work_sync(&fwevt->work)) { + /* + * Put fwevt reference count after + * dequeuing it from worker queue + */ + mpi3mr_fwevt_put(fwevt); + /* + * Put fwevt reference count to neutralize + * kref_init increment + */ + mpi3mr_fwevt_put(fwevt); + } + } + if (mrioc->current_event) { + fwevt = mrioc->current_event; + /* + * Don't call cancel_work_sync() API for the + * fwevt work if the controller reset is + * get called as part of processing the + * same fwevt work (or) when worker thread is + * waiting for device add/remove APIs to complete. + * Otherwise we will see deadlock. + */ + if (current_work() == &fwevt->work || fwevt->pending_at_sml) { + fwevt->discard = 1; + return; + } + + /* + * Wait on the fwevt to complete. If this returns 1, then + * the event was never executed, and we need a put for the + * reference the work had on the fwevt. + * + * If it did execute, we wait for it to finish, and the put will + * happen from mpi3mr_process_fwevt() + */ + if (cancel_work_sync(&fwevt->work)) { + /* + * Put fwevt reference count after + * dequeuing it from worker queue + */ + mpi3mr_fwevt_put(fwevt); + /* + * Put fwevt reference count to neutralize + * kref_init increment + */ + mpi3mr_fwevt_put(fwevt); + } + } +} + +/** + * mpi3mr_queue_qd_reduction_event -Queue TG QD reduction event + * @mrioc: Adapter instance reference + * @tg: Throttle group information pointer + * + * Accessor to queue on synthetically generated driver event to + * the event worker thread, the driver event will be used to + * reduce the QD of all VDs in the TG from the worker thread. + * + * Return: None. + */ +static void mpi3mr_queue_qd_reduction_event(struct mpi3mr_ioc *mrioc, + struct mpi3mr_throttle_group_info *tg) +{ + struct mpi3mr_fwevt *fwevt; + u16 sz = sizeof(struct mpi3mr_throttle_group_info *); + + /* If the QD reduction event is already queued due to throttle and if + the QD is not restored through device info change event + then dont queue further reduction events*/ + if (tg->fw_qd != tg->modified_qd) + return; + + fwevt = mpi3mr_alloc_fwevt(sz); + if (!fwevt) { + ioc_warn(mrioc,"failed to queue TG QD reduction event\n"); + return; + } + *(__le64 *)fwevt->event_data = (__le64)tg; + fwevt->mrioc = mrioc; + fwevt->event_id = MPI3MR_DRIVER_EVENT_TG_QD_REDUCTION; + fwevt->send_ack = 0; + fwevt->process_event = 1; + fwevt->event_context = 0; + fwevt->event_data_size = sz; + tg->modified_qd = max_t(u16, (tg->fw_qd * tg->qd_reduction) / 10, 8); + + dprint_event_bh(mrioc, "qd reduction event queued for tg_id(%d)\n", + tg->id); + mpi3mr_fwevt_add_to_list(mrioc, fwevt); +} + +/** + * mpi3mr_host_tag_for_scmd - Get host tag for a scmd + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * + * Calculate the host tag based on block tag for a given scmd. + * + * Return: Valid host tag or MPI3MR_HOSTTAG_INVALID. 
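+ *
+ * Worked example (blk-mq case, values illustrative): if
+ * blk_mq_unique_tag() reports hardware queue 2 and block tag 17, the
+ * command is bound to operational request queue index 2 and gets
+ * host_tag 18, since host_tag 0 is reserved as invalid and the block
+ * tag is therefore incremented by one.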
+ */ +static u16 mpi3mr_host_tag_for_scmd(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd) +{ + struct scmd_priv *priv = NULL; + u32 unique_tag; + u16 host_tag, hw_queue; + + unique_tag = blk_mq_unique_tag(SCMD_GET_REQUEST(scmd)); + + if (mpi3mr_use_blk_mq(mrioc->shost)) { + hw_queue = blk_mq_unique_tag_to_hwq(unique_tag); + if (hw_queue >= mrioc->num_op_reply_q) + return MPI3MR_HOSTTAG_INVALID; + host_tag = blk_mq_unique_tag_to_tag(unique_tag); + } else { + hw_queue = raw_smp_processor_id() % mrioc->num_op_reply_q; + host_tag = unique_tag & 0xFFFF; + } + + if (WARN_ON(host_tag >= mrioc->max_host_ios)) + return MPI3MR_HOSTTAG_INVALID; + + priv = scsi_cmd_priv(scmd); + /*host_tag 0 is invalid hence incrementing by 1*/ + priv->host_tag = host_tag + 1; + priv->scmd = scmd; + priv->in_lld_scope = 1; + priv->req_q_idx = hw_queue; + priv->meta_chain_idx = -1; + priv->chain_idx = -1; + priv->meta_sg_valid = 0; + return priv->host_tag; +} + +/** + * mpi3mr_scmd_from_host_tag - Get SCSI command from host tag + * @mrioc: Adapter instance reference + * @host_tag: Host tag + * @qidx: Operational queue index + * + * Identify the block tag from the host tag and queue index and + * retrieve associated scsi command using scsi_host_find_tag(). + * + * Return: SCSI command reference or NULL. + */ +static struct scsi_cmnd *mpi3mr_scmd_from_host_tag( + struct mpi3mr_ioc *mrioc, u16 host_tag, u16 qidx) +{ + struct scsi_cmnd *scmd = NULL; + struct scmd_priv *priv = NULL; + u32 unique_tag = host_tag - 1; + + if (WARN_ON(host_tag > mrioc->max_host_ios)) + goto out; + + if (mpi3mr_use_blk_mq(mrioc->shost)) + unique_tag |= (qidx << BLK_MQ_UNIQUE_TAG_BITS); + + scmd = scsi_host_find_tag(mrioc->shost, unique_tag); + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + scmd = NULL; + } +out: + return scmd; +} + +/** + * mpi3mr_clear_scmd_priv - Cleanup SCSI command private date + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * + * Invalidate the SCSI command private data to mark the command + * is not in LLD scope anymore. + * + * Return: Nothing. + */ +static void mpi3mr_clear_scmd_priv(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd) +{ + struct scmd_priv *priv = NULL; + + priv = scsi_cmd_priv(scmd); + + if (WARN_ON(priv->in_lld_scope == 0)) + return; + priv->host_tag = MPI3MR_HOSTTAG_INVALID; + priv->req_q_idx = 0xFFFF; + priv->scmd = NULL; + priv->in_lld_scope = 0; + priv->meta_sg_valid = 0; + if (priv->chain_idx >= 0) { + clear_bit(priv->chain_idx, mrioc->chain_bitmap); + priv->chain_idx = -1; + } + if (priv->meta_chain_idx >= 0) { + clear_bit(priv->meta_chain_idx, mrioc->chain_bitmap); + priv->meta_chain_idx = -1; + } +} + +/** + * mpi3mr_invalidate_devhandles -Invalidate device handles + * @mrioc: Adapter instance reference + * + * Invalidate the device handles in the target device structures + * . Called post reset prior to reinitializing the controller. + * + * Return: Nothing. 
+ */ +void mpi3mr_invalidate_devhandles(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *tgt_priv; + + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) { + tgtdev->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + if (tgtdev->starget && tgtdev->starget->hostdata) { + tgt_priv = tgtdev->starget->hostdata; + tgt_priv->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + tgt_priv->io_throttle_enabled = 0; + tgt_priv->io_divert = 0; + tgt_priv->throttle_group = NULL; + if (tgtdev->host_exposed) + atomic_set(&tgt_priv->block_io, 1); + } + } +} + +/** + * mpi3mr_print_scmd - print individual SCSI command + * @rq: Block request + * @data: Adapter instance reference + * @reserved: N/A. Currently not used + * + * Print the SCSI command details if it is in LLD scope. + * + * Return: true always. + */ +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_print_scmd(struct request *rq, + void *data, bool reserved) +{ + struct mpi3mr_ioc *mrioc = (struct mpi3mr_ioc *)data; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv = NULL; + + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + goto out; + dprint_reset(mrioc, "host_tag=%d, qid=%d\n", priv->host_tag, + priv->req_q_idx + 1); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_RESET); + } + +out: + BLK_ITER_CALLBACK_RET_VAL(true); +} + +/** + * mpi3mr_flush_scmd - Flush individual SCSI command + * @rq: Block request + * @data: Adapter instance reference + * @reserved: N/A. Currently not used + * + * Return the SCSI command to the upper layers if it is in LLD + * scope. + * + * Return: true always. + */ + +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_flush_scmd(struct request *rq, + void *data, bool reserved) +{ + struct mpi3mr_ioc *mrioc = (struct mpi3mr_ioc *)data; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv = NULL; + + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + goto out; + + if (priv->meta_sg_valid) + dma_unmap_sg(&mrioc->pdev->dev, scsi_prot_sglist(scmd), + scsi_prot_sg_count(scmd), scmd->sc_data_direction); + mpi3mr_clear_scmd_priv(mrioc, scmd); + scsi_dma_unmap(scmd); + if (mrioc->unrecoverable) + set_host_byte(scmd, DID_NO_CONNECT); + else + set_host_byte(scmd, DID_REQUEUE); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_RESET); + SCMD_DONE(scmd); + mrioc->flush_io_count++; + } + +out: + BLK_ITER_CALLBACK_RET_VAL(true); +} + +/** + * mpi3mr_count_dev_pending - Count commands pending for a lun + * @rq: Block request + * @data: SCSI device reference + * @reserved: Unused + * + * This is an iterator function called for each SCSI command in + * a host and if the command is pending in the LLD for the + * specific device(lun) then device specific pending I/O counter + * is updated in the device structure. + * + * Return: true always. 
+ */ + +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_count_dev_pending(struct request *rq, + void *data, bool reserved) +{ + struct scsi_device *sdev = (struct scsi_device *)data; + struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv; + + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + goto out; + if (scmd->device == sdev) + sdev_priv_data->pend_count++; + } + +out: + BLK_ITER_CALLBACK_RET_VAL(true); +} + +/** + * mpi3mr_count_tgt_pending - Count commands pending for target + * @rq: Block request + * @data: SCSI target reference + * @reserved: Unused + * + * This is an iterator function called for each SCSI command in + * a host and if the command is pending in the LLD for the + * specific target then target specific pending I/O counter is + * updated in the target structure. + * + * Return: true always. + */ + +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_count_tgt_pending(struct request *rq, + void *data, bool reserved) +{ + struct scsi_target *starget = (struct scsi_target *)data; + struct mpi3mr_stgt_priv_data *stgt_priv_data = starget->hostdata; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv; + + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + goto out; + if (scmd->device && (scsi_target(scmd->device) == starget)) + stgt_priv_data->pend_count++; + } + +out: + BLK_ITER_CALLBACK_RET_VAL(true); +} + +/** + * mpi3mr_flush_host_io - Flush host I/Os + * @mrioc: Adapter instance reference + * + * Flush all of the pending I/Os by calling + * blk_mq_tagset_busy_iter() for each possible tag. This is + * executed post controller reset + * + * Return: Nothing. + */ +void mpi3mr_flush_host_io(struct mpi3mr_ioc *mrioc) +{ + struct Scsi_Host *shost = mrioc->shost; + + mrioc->flush_io_count = 0; + ioc_info(mrioc, "flushing host I/O cmds post reset\n"); + blk_mq_tagset_busy_iter(&shost->tag_set, + mpi3mr_flush_scmd, (void *)mrioc); + ioc_info(mrioc, "flushed %d host I/O cmds\n", mrioc->flush_io_count); +} + +/** + * mpi3mr_flush_cmds_for_unrecovered_controller- Flush all pend cmds + * @mrioc: Adapter instance reference + * + * This function waits for currently running IO poll threads to + * exit and then flushes all host I/Os and any internal pending + * cmds. This is executed after controller is marked as + * unrecoverable. + * + * Return: Nothing. + */ +void mpi3mr_flush_cmds_for_unrecovered_controller(struct mpi3mr_ioc *mrioc) +{ + struct Scsi_Host *shost = mrioc->shost; + int i; + + if (!mrioc->unrecoverable) + return; + + if (mrioc->op_reply_qinfo) + { + for (i = 0; i < mrioc->num_queues; i++) { + while (atomic_read(&mrioc->op_reply_qinfo[i].in_use)) + udelay(500); + atomic_set(&mrioc->op_reply_qinfo[i].pend_ios, 0); + } + } + mrioc->flush_io_count = 0; + blk_mq_tagset_busy_iter(&shost->tag_set, + mpi3mr_flush_scmd, (void *)mrioc); + mpi3mr_flush_delayed_cmd_lists(mrioc); + mpi3mr_flush_drv_cmds(mrioc); +} + +/** + * mpi3mr_alloc_tgtdev - target device allocator + * @void: No arguments + * + * Allocate target device instance and initialize the reference + * count + * + * Return: target device instance. 
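+ *
+ * Note: as with the firmware events above, the kref_init() reference
+ * is expected to be paired with a final mpi3mr_tgtdev_put() elsewhere
+ * in the driver, while mpi3mr_tgtdev_add_to_list() and
+ * mpi3mr_tgtdev_del_from_list() take and drop an additional list
+ * reference.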
+ */ +static struct mpi3mr_tgt_dev *mpi3mr_alloc_tgtdev(void) +{ + struct mpi3mr_tgt_dev *tgtdev; + + tgtdev = kzalloc(sizeof(*tgtdev), GFP_ATOMIC); + if (!tgtdev) + return NULL; + kref_init(&tgtdev->ref_count); + return tgtdev; +} + +/** + * mpi3mr_tgtdev_add_to_list -Add tgtdevice to the list + * @mrioc: Adapter instance reference + * @tgtdev: Target device + * + * Add the target device to the target device list + * + * Return: Nothing. + */ +static void mpi3mr_tgtdev_add_to_list(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + mpi3mr_tgtdev_get(tgtdev); + INIT_LIST_HEAD(&tgtdev->list); + list_add_tail(&tgtdev->list, &mrioc->tgtdev_list); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); +} + +/** + * mpi3mr_tgtdev_del_from_list -Delete tgtdevice from the list + * @mrioc: Adapter instance reference + * @tgtdev: Target device + * + * Remove the target device from the target device list + * + * Return: Nothing. + */ +static void mpi3mr_tgtdev_del_from_list(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + if (!list_empty(&tgtdev->list)) { + list_del_init(&tgtdev->list); + mpi3mr_tgtdev_put(tgtdev); + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); +} + +/** + * __mpi3mr_get_tgtdev_by_handle -Get tgtdev from device handle + * @mrioc: Adapter instance reference + * @handle: Device handle + * + * Accessor to retrieve target device from the device handle. + * Non Lock version + * + * Return: Target device reference. + */ +static struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_handle( + struct mpi3mr_ioc *mrioc, u16 handle) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (tgtdev->dev_handle == handle) + goto found_tgtdev; + return NULL; + +found_tgtdev: + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + +/** + * mpi3mr_get_tgtdev_by_handle -Get tgtdev from device handle + * @mrioc: Adapter instance reference + * @handle: Device handle + * + * Accessor to retrieve target device from the device handle. + * Lock version + * + * Return: Target device reference. + */ +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_handle( + struct mpi3mr_ioc *mrioc, u16 handle) +{ + struct mpi3mr_tgt_dev *tgtdev; + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_handle(mrioc, handle); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return tgtdev; +} + +/** + * __mpi3mr_get_tgtdev_by_perst_id -Get tgtdev from persist ID + * @mrioc: Adapter instance reference + * @persist_id: Persistent ID + * + * Accessor to retrieve target device from the Persistent ID. + * Non Lock version + * + * Return: Target device reference. + */ +static struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_perst_id( + struct mpi3mr_ioc *mrioc, u16 persist_id) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (tgtdev->perst_id == persist_id) + goto found_tgtdev; + return NULL; + +found_tgtdev: + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + +/** + * mpi3mr_get_tgtdev_by_perst_id -Get tgtdev from persistent ID + * @mrioc: Adapter instance reference + * @persist_id: Persistent ID + * + * Accessor to retrieve target device from the Persistent ID. + * Lock version + * + * Return: Target device reference. 
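+ *         The reference count of the returned target device is elevated and
+ *         NULL is returned when no device with the given persistent ID
+ *         exists. Callers drop the reference with mpi3mr_tgtdev_put() once
+ *         done, for example as mpi3mr_create_tgtdev() does:
+ *
+ *             tgtdev = mpi3mr_get_tgtdev_by_perst_id(mrioc, perst_id);
+ *             if (tgtdev) {
+ *                     mpi3mr_update_tgtdev(mrioc, tgtdev, dev_pg0, true);
+ *                     mpi3mr_tgtdev_put(tgtdev);
+ *             }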
+ */ +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_perst_id( + struct mpi3mr_ioc *mrioc, u16 persist_id) +{ + struct mpi3mr_tgt_dev *tgtdev; + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, persist_id); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return tgtdev; +} + + +/** + * __mpi3mr_get_tgtdev_from_tgtpriv -Get tgtdev from tgt private + * @mrioc: Adapter instance reference + * @tgt_priv: Target private data + * + * Accessor to return target device from the target private + * data. Non Lock version + * + * Return: Target device reference. + */ +static struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_from_tgtpriv( + struct mpi3mr_ioc *mrioc, struct mpi3mr_stgt_priv_data *tgt_priv) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + tgtdev = tgt_priv->tgt_dev; + if (tgtdev) + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + +/** + * mpi3mr_get_tgtdev_from_tgtpriv -Get tgtdev from tgt priv data + * @mrioc: Adapter instance reference + * @tgt_priv: Target private data + * + * Accessor to return target device from the target private + * data. Lock version + * + * Return: Target device reference. + */ +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_from_tgtpriv( + struct mpi3mr_ioc *mrioc, struct mpi3mr_stgt_priv_data *tgt_priv) +{ + struct mpi3mr_tgt_dev *tgtdev; + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_from_tgtpriv(mrioc, tgt_priv); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return tgtdev; +} + +/** + * mpi3mr_set_io_divert_for_all_vd_in_tg -set divert for TG VDs + * @mrioc: Adapter instance reference + * @tg: Throttle group information pointer + * @divert_value: 1 or 0 + * + * Accessor to set io_divert flag for each device associated + * with the given throttle group with the given value. + * + * Return: None. + */ +static void mpi3mr_set_io_divert_for_all_vd_in_tg(struct mpi3mr_ioc *mrioc, + struct mpi3mr_throttle_group_info *tg, u8 divert_value) +{ + unsigned long flags; + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *tgt_priv; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (tgtdev->starget && tgtdev->starget->hostdata) { + tgt_priv = tgtdev->starget->hostdata; + if (tgt_priv->throttle_group == tg) + tgt_priv->io_divert = divert_value; + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); +} + +/** + * mpi3mr_print_discard_event_notice - print discarded evt info + * + * @mrioc: Adapter instance reference + * @device_add: true for device add event and false for device removal event + * + * Print notice related to post processing of discarded device + * event after controller reset. + * + * Return: None. + */ +inline void mpi3mr_print_discard_event_notice(struct mpi3mr_ioc *mrioc, + bool device_add) +{ + ioc_notice(mrioc, + "Device %s was under process before the reset and completed after reset\n", + (device_add ? "addition" : "removal")); + ioc_notice(mrioc, + "Verify whether the exposed devices are matched with attached devices for correctness\n"); +} + +/** + * mpi3mr_remove_tgtdev_from_host - Remove dev from upper layers + * @mrioc: Adapter instance reference + * @tgtdev: Target device structure + * + * Checks whether the device is exposed to upper layers and if + * it is, then removes the device from the upper layers by calling + * scsi_remove_target(). + * + * Return: Nothing.
+ */ +void mpi3mr_remove_tgtdev_from_host(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + struct mpi3mr_stgt_priv_data *tgt_priv; + + + ioc_info(mrioc, "removing handle(0x%04x), perst_id(%d)\n", + tgtdev->dev_handle, tgtdev->perst_id); + if (tgtdev->starget && tgtdev->starget->hostdata) { + tgt_priv = tgtdev->starget->hostdata; + atomic_set(&tgt_priv->block_io, 0); + tgt_priv->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + } + if (!mrioc->sas_transport_enabled || (tgtdev->dev_type != + MPI3_DEVICE_DEVFORM_SAS_SATA) || tgtdev->non_stl) { + if (tgtdev->starget) { + if (mrioc->current_event) + mrioc->current_event->pending_at_sml = 1; + scsi_remove_target(&tgtdev->starget->dev); + tgtdev->host_exposed = 0; + if (mrioc->current_event) { + mrioc->current_event->pending_at_sml = 0; + if (mrioc->current_event->discard) { + mpi3mr_print_discard_event_notice(mrioc, + false); + return; + } + } + } + } else if (tgtdev->starget) + mpi3mr_remove_tgtdev_from_sas_transport(mrioc, tgtdev); + mpi3mr_master_trigger(mrioc, + MPI3_DRIVER2_MASTERTRIGGER_DEVICE_REMOVAL_ENABLED); + + ioc_info(mrioc, "removed handle(0x%04x), perst_id(%d)\n", + tgtdev->dev_handle, tgtdev->perst_id); +} + + +/** + * mpi3mr_report_tgtdev_to_host - Expose device to upper layers + * @mrioc: Adapter instance reference + * @perst_id: Persistent ID of the device + * + * Checks whether the device can be exposed to upper layers and + * if it is not then expose the device to upper layers by + * calling scsi_scan_target(). + * + * Return: 0 on success, non zero on failure. + */ +static int mpi3mr_report_tgtdev_to_host(struct mpi3mr_ioc *mrioc, + u16 perst_id) +{ + int retval = 0; + struct mpi3mr_tgt_dev *tgtdev; + + if (mrioc->reset_in_progress) + return -1; + tgtdev = mpi3mr_get_tgtdev_by_perst_id(mrioc, perst_id); + if (!tgtdev) { + retval = -1; + goto out; + } + if (tgtdev->is_hidden || tgtdev->host_exposed) { + retval = -1; + goto out; + } + if (!mrioc->sas_transport_enabled || (tgtdev->dev_type != + MPI3_DEVICE_DEVFORM_SAS_SATA) || tgtdev->non_stl){ + tgtdev->host_exposed = 1; + ioc_info(mrioc, + "exposing target device with handle(0x%04x), perst_id(%d)\n", + tgtdev->dev_handle, perst_id); + if (mrioc->current_event) + mrioc->current_event->pending_at_sml = 1; + scsi_scan_target(&mrioc->shost->shost_gendev, + mrioc->scsi_device_channel, tgtdev->perst_id, + SCAN_WILD_CARD, SCSI_SCAN_INITIAL); + if (!tgtdev->starget) { + ioc_err(mrioc, + "exposing target device with handle(0x%04x), perst_id(%d) failed\n", + tgtdev->dev_handle, perst_id); + tgtdev->host_exposed = 0; + } + if (mrioc->current_event) { + mrioc->current_event->pending_at_sml = 0; + if (mrioc->current_event->discard) { + mpi3mr_print_discard_event_notice(mrioc, true); + goto out; + } + } + dprint_event_bh(mrioc, + "exposed target device with handle(0x%04x), perst_id(%d)\n", + tgtdev->dev_handle, perst_id); + goto out; + } else + mpi3mr_report_tgtdev_to_sas_transport(mrioc, tgtdev); + + + +out: + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + + return retval; +} + +/** + * mpi3mr_update_sdev - Update SCSI device information + * @sdev: SCSI device reference + * @data: target device reference + * + * This is an iterator function called for each SCSI device in a + * target to update the target specific information into each + * SCSI device. + * + * Return: Nothing. 
+ */ +static void +mpi3mr_update_sdev(struct scsi_device *sdev, void *data) +{ + struct mpi3mr_tgt_dev *tgtdev; + + tgtdev = (struct mpi3mr_tgt_dev *)data; + if (!tgtdev) + return; + + mpi3mr_change_queue_depth(sdev, tgtdev->q_depth); + switch (tgtdev->dev_type) { + case MPI3_DEVICE_DEVFORM_PCIE: + /*The block layer hw sector size = 512*/ + if ((tgtdev->dev_spec.pcie_inf.dev_info & + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) == + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE) { + blk_queue_max_hw_sectors(sdev->request_queue, + tgtdev->dev_spec.pcie_inf.mdts / 512); + if (tgtdev->dev_spec.pcie_inf.pgsz == 0) + blk_queue_virt_boundary(sdev->request_queue, + ((1 << MPI3MR_DEFAULT_PGSZEXP) - 1)); + else + blk_queue_virt_boundary(sdev->request_queue, + ((1 << tgtdev->dev_spec.pcie_inf.pgsz) - 1)); + } + + break; + default: + break; + } +} + +/** + * mpi3mr_refresh_tgtdevs - Refresh target device exposure + * @mrioc: Adapter instance reference + * + * This is executed post controller reset to identify any + * missing devices during reset and remove from the upper layers + * or expose any newly detected device to the upper layers. + * + * Return: Nothing. + */ + +void mpi3mr_refresh_tgtdevs(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_tgt_dev *tgtdev, *tgtdev_next; + + dprint_reset(mrioc, "refresh target devices: check for removals\n"); + list_for_each_entry_safe(tgtdev, tgtdev_next, &mrioc->tgtdev_list, + list) { + if (tgtdev->dev_handle == MPI3MR_INVALID_DEV_HANDLE) { + dprint_reset(mrioc, "removing target device with perst_id(%d)\n", + tgtdev->perst_id); + if (tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + } + } + + dprint_reset(mrioc, "refresh target devices: check for additions\n"); + tgtdev = NULL; + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) { + if ((tgtdev->dev_handle != MPI3MR_INVALID_DEV_HANDLE) && + !tgtdev->is_hidden && !tgtdev->host_exposed) + mpi3mr_report_tgtdev_to_host(mrioc, tgtdev->perst_id); + } + dprint_reset(mrioc, "refresh target devices: done\n"); +} + + +/** + * mpi3mr_debug_dump_devpg0 - Dump device page0 + * @mrioc: Adapter instance reference + * @dev_pg0: Device page 0. + * + * Prints pertinent details of the device page 0. + * + * Return: Nothing. 
+ */ +static void +mpi3mr_debug_dump_devpg0(struct mpi3mr_ioc *mrioc, struct mpi3_device_page0 *dev_pg0) +{ + + if (!(mrioc->logging_level & + (MPI3_DEBUG_EVENT | MPI3_DEBUG_EVENT_WORK_TASK))) + return; + + ioc_info(mrioc, + "device_pg0: handle(0x%04x), perst_id(%d), wwid(0x%016llx), encl_handle(0x%04x), slot(%d)\n", + le16_to_cpu(dev_pg0->dev_handle), + le16_to_cpu(dev_pg0->persistent_id), + le64_to_cpu(dev_pg0->wwid), le16_to_cpu(dev_pg0->enclosure_handle), + le16_to_cpu(dev_pg0->slot)); + ioc_info(mrioc, "device_pg0: access_status(0x%02x), flags(0x%04x), device_form(0x%02x), queue_depth(%d)\n", + dev_pg0->access_status, le16_to_cpu(dev_pg0->flags), + dev_pg0->device_form, le16_to_cpu(dev_pg0->queue_depth)); + ioc_info(mrioc, "device_pg0: parent_handle(0x%04x), iounit_port(%d)\n", + le16_to_cpu(dev_pg0->parent_dev_handle), dev_pg0->io_unit_port); + + switch (dev_pg0->device_form) { + case MPI3_DEVICE_DEVFORM_SAS_SATA: + { + struct mpi3_device0_sas_sata_format *sasinf = + &dev_pg0->device_specific.sas_sata_format; + ioc_info(mrioc, + "device_pg0: sas_sata: sas_address(0x%016llx),flags(0x%04x), device_info(0x%04x), phy_num(%d), attached_phy_id(%d)\n", + le64_to_cpu(sasinf->sas_address), + le16_to_cpu(sasinf->flags), + le16_to_cpu(sasinf->device_info), sasinf->phy_num, + sasinf->attached_phy_identifier); + break; + } + case MPI3_DEVICE_DEVFORM_PCIE: + { + struct mpi3_device0_pcie_format *pcieinf = + &dev_pg0->device_specific.pcie_format; + ioc_info(mrioc, + "device_pg0: pcie: port_num(%d), device_info(0x%04x), mdts(%d), page_sz(0x%02x)\n", + pcieinf->port_num, le16_to_cpu(pcieinf->device_info), + le32_to_cpu(pcieinf->maximum_data_transfer_size), + pcieinf->page_size); + ioc_info(mrioc, + "device_pg0: pcie: abort_timeout(%d), reset_timeout(%d)\n", + pcieinf->nvme_abort_to, pcieinf->controller_reset_to); + break; + } + case MPI3_DEVICE_DEVFORM_VD: + { + struct mpi3_device0_vd_format *vdinf = + &dev_pg0->device_specific.vd_format; + + ioc_info(mrioc, + "device_pg0: vd: state(0x%02x), raid_level(%d), flags(0x%04x), device_info(0x%04x)\n", + vdinf->vd_state, vdinf->raid_level, + le16_to_cpu(vdinf->flags), + le16_to_cpu(vdinf->device_info)); + ioc_info(mrioc, + "device_pg0: vd: tg_id(%d), high(%dMiB), low(%dMiB), qd_reduction_factor(%d)\n", + vdinf->io_throttle_group, + le16_to_cpu(vdinf->io_throttle_group_high), + le16_to_cpu(vdinf->io_throttle_group_low), + ((le16_to_cpu(vdinf->flags) & + MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_MASK) >> 12)); + break; + } + default: + break; + } +} + +/** + * mpi3mr_update_tgtdev - Update cached target device information + * @mrioc: Adapter instance reference + * @tgtdev: Target device internal structure + * @dev_pg0: New device page0 + * @is_added: Flag to indicate the device is just added + * + * Update the information from the device page0 into the driver + * cached target device structure. + * + * Return: Nothing.
+ */ +static void mpi3mr_update_tgtdev(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev, struct mpi3_device_page0 *dev_pg0, + bool is_added) +{ + u16 flags = 0; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + struct mpi3mr_enclosure_node *enclosure_dev = NULL; + + mpi3mr_debug_dump_devpg0(mrioc, dev_pg0); + + tgtdev->perst_id = le16_to_cpu(dev_pg0->persistent_id); + tgtdev->dev_handle = le16_to_cpu(dev_pg0->dev_handle); + tgtdev->dev_type = dev_pg0->device_form; + tgtdev->io_unit_port = dev_pg0->io_unit_port; + tgtdev->encl_handle = le16_to_cpu(dev_pg0->enclosure_handle); + tgtdev->parent_handle = le16_to_cpu(dev_pg0->parent_dev_handle); + tgtdev->slot = le16_to_cpu(dev_pg0->slot); + tgtdev->q_depth = le16_to_cpu(dev_pg0->queue_depth); + tgtdev->wwid = le64_to_cpu(dev_pg0->wwid); + tgtdev->devpg0_flag = le16_to_cpu(dev_pg0->flags); + + if (tgtdev->encl_handle) + enclosure_dev = mpi3mr_enclosure_find_by_handle(mrioc, + tgtdev->encl_handle); + if (enclosure_dev) + tgtdev->enclosure_logical_id = le64_to_cpu( + enclosure_dev->pg0.enclosure_logical_id); + + flags = tgtdev->devpg0_flag; + + tgtdev->is_hidden = (flags & MPI3_DEVICE0_FLAGS_HIDDEN); + + if (is_added == true) + tgtdev->io_throttle_enabled = + (flags & MPI3_DEVICE0_FLAGS_IO_THROTTLING_REQUIRED) ? 1 : 0; + + + if (tgtdev->starget && tgtdev->starget->hostdata) { + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + scsi_tgt_priv_data->perst_id = tgtdev->perst_id; + scsi_tgt_priv_data->dev_handle = tgtdev->dev_handle; + scsi_tgt_priv_data->dev_type = tgtdev->dev_type; + scsi_tgt_priv_data->io_throttle_enabled = + tgtdev->io_throttle_enabled; + if (is_added == true) + atomic_set(&scsi_tgt_priv_data->block_io, 0); + } + + switch (dev_pg0->access_status) { + case MPI3_DEVICE0_ASTATUS_NO_ERRORS: + case MPI3_DEVICE0_ASTATUS_PREPARE: + case MPI3_DEVICE0_ASTATUS_NEEDS_INITIALIZATION: + case MPI3_DEVICE0_ASTATUS_DEVICE_MISSING_DELAY: + break; + default: + tgtdev->is_hidden = 1; + break; + } + + switch (tgtdev->dev_type) { + case MPI3_DEVICE_DEVFORM_SAS_SATA: + { + struct mpi3_device0_sas_sata_format *sasinf = + &dev_pg0->device_specific.sas_sata_format; + u16 dev_info = le16_to_cpu(sasinf->device_info); + + tgtdev->dev_spec.sas_sata_inf.dev_info = dev_info; + tgtdev->dev_spec.sas_sata_inf.sas_address = + le64_to_cpu(sasinf->sas_address); + tgtdev->dev_spec.sas_sata_inf.phy_id = sasinf->phy_num; + tgtdev->dev_spec.sas_sata_inf.attached_phy_id = + sasinf->attached_phy_identifier; + if ((dev_info & MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK) + != MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_END_DEVICE) + tgtdev->is_hidden = 1; + else if (!(dev_info & + (MPI3_SAS_DEVICE_INFO_STP_SATA_TARGET | + MPI3_SAS_DEVICE_INFO_SSP_TARGET))) + tgtdev->is_hidden = 1; + + if (((tgtdev->devpg0_flag & + MPI3_DEVICE0_FLAGS_ATT_METHOD_DIR_ATTACHED) + && (tgtdev->devpg0_flag & + MPI3_DEVICE0_FLAGS_ATT_METHOD_VIRTUAL)) || + (tgtdev->parent_handle == 0xFFFF)) + tgtdev->non_stl = 1; + if (tgtdev->dev_spec.sas_sata_inf.hba_port) + tgtdev->dev_spec.sas_sata_inf.hba_port->port_id = dev_pg0->io_unit_port; + break; + } + case MPI3_DEVICE_DEVFORM_PCIE: + { + struct mpi3_device0_pcie_format *pcieinf = + &dev_pg0->device_specific.pcie_format; + u16 dev_info = le16_to_cpu(pcieinf->device_info); + + tgtdev->dev_spec.pcie_inf.dev_info = dev_info; + tgtdev->dev_spec.pcie_inf.capb = + le32_to_cpu(pcieinf->capabilities); + tgtdev->dev_spec.pcie_inf.mdts = MPI3MR_DEFAULT_MDTS; + if (dev_pg0->access_status == MPI3_DEVICE0_ASTATUS_NO_ERRORS) { + 
tgtdev->dev_spec.pcie_inf.mdts = + le32_to_cpu(pcieinf->maximum_data_transfer_size); + tgtdev->dev_spec.pcie_inf.pgsz = pcieinf->page_size; + tgtdev->dev_spec.pcie_inf.reset_to = + max_t(u8, pcieinf->controller_reset_to, + MPI3MR_INTADMCMD_TIMEOUT); + tgtdev->dev_spec.pcie_inf.abort_to = + max_t(u8, pcieinf->nvme_abort_to, + MPI3MR_INTADMCMD_TIMEOUT); + } + if (tgtdev->dev_spec.pcie_inf.mdts > (1024 * 1024)) + tgtdev->dev_spec.pcie_inf.mdts = (1024 * 1024); + if (((dev_info & MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) != + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE) && + ((dev_info & MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) != + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_SCSI_DEVICE)) + tgtdev->is_hidden = 1; + tgtdev->non_stl = 1; + break; + } + case MPI3_DEVICE_DEVFORM_VD: + { + struct mpi3_device0_vd_format *vdinf = + &dev_pg0->device_specific.vd_format; + struct mpi3mr_throttle_group_info *tg = NULL; + + tgtdev->dev_spec.vd_inf.state = vdinf->vd_state; + if (vdinf->vd_state == MPI3_DEVICE0_VD_STATE_OFFLINE) + tgtdev->is_hidden = 1; + tgtdev->non_stl = 1; + tgtdev->dev_spec.vd_inf.tg_id = vdinf->io_throttle_group; + tgtdev->dev_spec.vd_inf.tg_high = + le16_to_cpu(vdinf->io_throttle_group_high) * 2048; + tgtdev->dev_spec.vd_inf.tg_low = + le16_to_cpu(vdinf->io_throttle_group_low) * 2048; + tgtdev->dev_spec.vd_inf.tg_qd_reduction = + ((le16_to_cpu(vdinf->flags) & + MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_MASK) >> 12); + if (vdinf->io_throttle_group < mrioc->num_io_throttle_group) { + tg = mrioc->throttle_groups + vdinf->io_throttle_group; + tg->id = vdinf->io_throttle_group; + tg->high = tgtdev->dev_spec.vd_inf.tg_high; + tg->low = tgtdev->dev_spec.vd_inf.tg_low; + tg->qd_reduction = + tgtdev->dev_spec.vd_inf.tg_qd_reduction; + if (is_added == true) + tg->fw_qd = tgtdev->q_depth; + tg->modified_qd = tgtdev->q_depth; + } + tgtdev->dev_spec.vd_inf.tg = tg; + if (scsi_tgt_priv_data) + scsi_tgt_priv_data->throttle_group = tg; + break; + } + default: + break; + } + +} + +/** + * mpi3mr_devstatuschg_evt_bh - DevStatusChange evt bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event + * + * Process Device status Change event and based on device's new + * information, either expose the device to the upper layers, or + * remove the device from upper layers. + * + * Return: Nothing. 
+ */ +static void mpi3mr_devstatuschg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + u16 dev_handle = 0; + u8 uhide = 0, delete = 0, cleanup = 0; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3_event_data_device_status_change *evtdata = + (struct mpi3_event_data_device_status_change *)fwevt->event_data; + + + dev_handle = le16_to_cpu(evtdata->dev_handle); + dprint_event_bh(mrioc, + "processing device status change event bottom half for handle(0x%04x), rc(0x%02x)\n", + dev_handle, evtdata->reason_code); + switch (evtdata->reason_code) { + case MPI3_EVENT_DEV_STAT_RC_HIDDEN: + delete = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_NOT_HIDDEN: + uhide = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_VD_NOT_RESPONDING: + delete = 1; + cleanup = 1; + break; + default: + break; + } + + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (!tgtdev) { + dprint_event_bh(mrioc, + "processing device status change event bottom half, cannot identify target device for handle(0x%04x), rc(0x%02x)\n", + dev_handle, evtdata->reason_code); + goto out; + } + if (uhide) { + tgtdev->is_hidden = 0; + if (!tgtdev->host_exposed) + mpi3mr_report_tgtdev_to_host(mrioc, tgtdev->perst_id); + } + if (tgtdev->starget && tgtdev->starget->hostdata) { + if (delete) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + } + if (cleanup) { + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + } + +out: + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + +} + +/** + * mpi3mr_devinfochg_evt_bh - DeviceInfoChange evt bottomhalf + * @mrioc: Adapter instance reference + * @dev_pg0: New device page0 + * + * Process Device Info Change event and based on device's new + * information, either expose the device to the upper layers, or + * remove the device from upper layers or update the details of + * the device. + * + * Return: Nothing. 
+ */ +static void mpi3mr_devinfochg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3_device_page0 *dev_pg0) +{ + struct mpi3mr_tgt_dev *tgtdev = NULL; + u16 dev_handle = 0, perst_id = 0; + + perst_id = le16_to_cpu(dev_pg0->persistent_id); + dev_handle = le16_to_cpu(dev_pg0->dev_handle); + + dprint_event_bh(mrioc, + "processing device info change event bottom half for handle(0x%04x), perst_id(%d)\n", + dev_handle, perst_id); + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (!tgtdev) { + dprint_event_bh(mrioc, + "cannot identify target device for device info change event handle(0x%04x), perst_id(%d)\n", + dev_handle, perst_id); + goto out; + } + mpi3mr_update_tgtdev(mrioc, tgtdev, dev_pg0, false); + if (!tgtdev->is_hidden && !tgtdev->host_exposed) + mpi3mr_report_tgtdev_to_host(mrioc, perst_id); + if (tgtdev->is_hidden && tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + if (!tgtdev->is_hidden && tgtdev->host_exposed && tgtdev->starget) + starget_for_each_device(tgtdev->starget, (void *) tgtdev, + mpi3mr_update_sdev); +out: + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + +} + +/** + * mpi3mr_process_trigger_data_event_bh - Process trigger event data + * @mrioc: Adapter instance reference + * @event_data: Event data + * + * This function releases diag buffers or issues a diag fault + * based on trigger conditions. + * + * Return: Nothing + */ +static void mpi3mr_process_trigger_data_event_bh(struct mpi3mr_ioc *mrioc, + struct trigger_event_data *event_data) +{ + struct diag_buffer_desc *trace_hdb = event_data->trace_hdb; + struct diag_buffer_desc *fw_hdb = event_data->fw_hdb; + unsigned long flags; + u8 trigger_type = event_data->trigger_type; + u64 trigger_data = event_data->trigger_specific_data; + + if (event_data->snapdump) { + if (trace_hdb) + mpi3mr_set_trigger_data_in_hdb(trace_hdb, trigger_type, + trigger_data, 1); + if (fw_hdb) + mpi3mr_set_trigger_data_in_hdb(fw_hdb, trigger_type, + trigger_data, 1); + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_TRIGGER, 1); + return; + } + + if (trace_hdb) { + mpi3mr_set_trigger_data_in_hdb(trace_hdb, trigger_type, + trigger_data, 1); + mpi3mr_issue_diag_buf_release(mrioc, trace_hdb); + spin_lock_irqsave(&mrioc->trigger_lock, flags); + mrioc->trace_release_trigger_active = false; + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } + if (fw_hdb) { + mpi3mr_set_trigger_data_in_hdb(fw_hdb, trigger_type, + trigger_data, 1); + mpi3mr_issue_diag_buf_release(mrioc, fw_hdb); + spin_lock_irqsave(&mrioc->trigger_lock, flags); + mrioc->fw_release_trigger_active = false; + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } +} + +/** + * mpi3mr_encldev_add_chg_evt_debug - debug for enclosure event + * @mrioc: Adapter instance reference + * @encl_pg0: Enclosure page 0. + * @is_added: Added event or not + * + * Return: Nothing.
+ */ +static void mpi3mr_encldev_add_chg_evt_debug(struct mpi3mr_ioc *mrioc, + struct mpi3_enclosure_page0 *encl_pg0, u8 is_added) +{ + char *reason_str = NULL; + + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT_WORK_TASK)) + return; + + if (is_added) + reason_str = "enclosure added"; + else + reason_str = "enclosure dev status changed"; + + ioc_info(mrioc, "%s: handle(0x%04x), enclosure logical id(0x%016llx)" + " number of slots(%d), port(%d), flags(0x%04x), present(%d)\n", + reason_str, le16_to_cpu(encl_pg0->enclosure_handle), + (unsigned long long)le64_to_cpu(encl_pg0->enclosure_logical_id), + le16_to_cpu(encl_pg0->num_slots), encl_pg0->io_unit_port, + le16_to_cpu(encl_pg0->flags), + ((le16_to_cpu(encl_pg0->flags) & + MPI3_ENCLS0_FLAGS_ENCL_DEV_PRESENT_MASK) >> 4)); +} + +/** + * mpi3mr_encldev_add_chg_evt_bh - Enclosure evt bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Prints information about the Enclosure device status or + * Enclosure add events if logging is enabled and add or remove + * the enclosure from the controller's internal list of + * enclosures. + * + * Return: Nothing. + */ +static void mpi3mr_encldev_add_chg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + struct mpi3mr_enclosure_node *enclosure_dev = NULL; + struct mpi3_enclosure_page0 *encl_pg0; + u16 encl_handle; + u8 added, present; + + encl_pg0= (struct mpi3_enclosure_page0 *) fwevt->event_data; + added = (fwevt->event_id == MPI3_EVENT_ENCL_DEVICE_ADDED) ? 1 : 0; + mpi3mr_encldev_add_chg_evt_debug(mrioc, encl_pg0, added); + + + encl_handle= le16_to_cpu(encl_pg0->enclosure_handle); + present = ((le16_to_cpu(encl_pg0->flags) & + MPI3_ENCLS0_FLAGS_ENCL_DEV_PRESENT_MASK) >> 4); + + if (encl_handle) + enclosure_dev = mpi3mr_enclosure_find_by_handle(mrioc, + encl_handle); + if (!enclosure_dev && present) { + enclosure_dev = + kzalloc(sizeof(struct mpi3mr_enclosure_node), + GFP_KERNEL); + if (!enclosure_dev) + return; + list_add_tail(&enclosure_dev->list, + &mrioc->enclosure_list); + } + if (enclosure_dev) { + if (!present) { + list_del(&enclosure_dev->list); + kfree(enclosure_dev); + } else + memcpy(&enclosure_dev->pg0, encl_pg0, + sizeof(enclosure_dev->pg0)); + + } +} + +/** + * mpi3mr_sastopochg_evt_debug - SASTopoChange details + * @mrioc: Adapter instance reference + * @event_data: SAS topology change list event data + * + * Prints information about the SAS topology change event. + * + * Return: Nothing. 
+ */ +static void +mpi3mr_sastopochg_evt_debug(struct mpi3mr_ioc *mrioc, + struct mpi3_event_data_sas_topology_change_list *event_data) +{ + int i; + u16 handle; + u8 reason_code, phy_number; + char *status_str = NULL; + u8 link_rate, prev_link_rate; + + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT_WORK_TASK)) + return; + + switch (event_data->exp_status) { + case MPI3_EVENT_SAS_TOPO_ES_NOT_RESPONDING: + status_str = "remove"; + break; + case MPI3_EVENT_SAS_TOPO_ES_RESPONDING: + status_str = "responding"; + break; + case MPI3_EVENT_SAS_TOPO_ES_DELAY_NOT_RESPONDING: + status_str = "remove delay"; + break; + case MPI3_EVENT_SAS_TOPO_ES_NO_EXPANDER: + status_str = "direct attached"; + break; + default: + status_str = "unknown status"; + break; + } + ioc_info(mrioc, "%s :sas topology change: (%s)\n", + __func__, status_str); + ioc_info(mrioc, + "%s :\texpander_handle(0x%04x), port(%d), enclosure_handle(0x%04x) start_phy(%02d), num_entries(%d)\n", + __func__, le16_to_cpu(event_data->expander_dev_handle), + event_data->io_unit_port, + le16_to_cpu(event_data->enclosure_handle), + event_data->start_phy_num, event_data->num_entries); + for (i = 0; i < event_data->num_entries; i++) { + handle = + le16_to_cpu(event_data->phy_entry[i].attached_dev_handle); + if (!handle) + continue; + phy_number = event_data->start_phy_num + i; + reason_code = event_data->phy_entry[i].status & + MPI3_EVENT_SAS_TOPO_PHY_RC_MASK; + switch (reason_code) { + case MPI3_EVENT_SAS_TOPO_PHY_RC_TARG_NOT_RESPONDING: + status_str = "target remove"; + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_DELAY_NOT_RESPONDING: + status_str = "delay target remove"; + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_PHY_CHANGED: + status_str = "link status change"; + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_NO_CHANGE: + status_str = "link status no change"; + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_RESPONDING: + status_str = "target responding"; + break; + default: + status_str = "unknown"; + break; + } + link_rate = event_data->phy_entry[i].link_rate >> 4; + prev_link_rate = event_data->phy_entry[i].link_rate & 0xF; + ioc_info(mrioc, + "%s :\tphy(%02d), attached_handle(0x%04x): %s: link rate: new(0x%02x), old(0x%02x)\n", + __func__, phy_number, handle, status_str, link_rate, + prev_link_rate); + } +} + +/** + * mpi3mr_sastopochg_evt_bh - SASTopologyChange evt bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Prints information about the SAS topology change event and + * for "not responding" event code, removes the device from the + * upper layers. + * + * Return: Nothing. 
+ */ +static void mpi3mr_sastopochg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + struct mpi3_event_data_sas_topology_change_list *event_data = + (struct mpi3_event_data_sas_topology_change_list *)fwevt->event_data; + int i; + u16 handle; + u8 reason_code; + struct mpi3mr_tgt_dev *tgtdev = NULL; + u64 exp_sas_address = 0, parent_sas_address = 0; + struct mpi3mr_hba_port *hba_port = NULL; + struct mpi3mr_sas_node *sas_expander = NULL; + unsigned long flags; + u8 link_rate, prev_link_rate, parent_phy_number; + + mpi3mr_sastopochg_evt_debug(mrioc, event_data); + if (mrioc->sas_transport_enabled) { + hba_port = mpi3mr_get_hba_port_by_id(mrioc, + event_data->io_unit_port, 0); + if (le16_to_cpu(event_data->expander_dev_handle)) { + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + sas_expander = __mpi3mr_expander_find_by_handle(mrioc, + le16_to_cpu(event_data->expander_dev_handle)); + if (sas_expander) { + exp_sas_address = sas_expander->sas_address; + hba_port = sas_expander->hba_port; + } + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + parent_sas_address = exp_sas_address; + } else + parent_sas_address = mrioc->sas_hba.sas_address; + } + + for (i = 0; i < event_data->num_entries; i++) { + if (fwevt->discard) + return; + handle = + le16_to_cpu(event_data->phy_entry[i].attached_dev_handle); + if (!handle) + continue; + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + if (!tgtdev) + continue; + + reason_code = event_data->phy_entry[i].status & + MPI3_EVENT_SAS_TOPO_PHY_RC_MASK; + + switch (reason_code) { + case MPI3_EVENT_SAS_TOPO_PHY_RC_TARG_NOT_RESPONDING: + if (tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_RESPONDING: + case MPI3_EVENT_SAS_TOPO_PHY_RC_PHY_CHANGED: + case MPI3_EVENT_SAS_TOPO_PHY_RC_NO_CHANGE: + { + if (!mrioc->sas_transport_enabled || tgtdev->non_stl + || tgtdev->is_hidden) + break; + link_rate = event_data->phy_entry[i].link_rate >> 4; + prev_link_rate = event_data->phy_entry[i].link_rate + & 0xF; + if (link_rate == prev_link_rate) + break; + if (!parent_sas_address) + break; + parent_phy_number = event_data->start_phy_num + i; + mpi3mr_update_links(mrioc, parent_sas_address, handle, + parent_phy_number, link_rate, hba_port); + } + default: + break; + } + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + } + if (mrioc->sas_transport_enabled && (event_data->exp_status == + MPI3_EVENT_SAS_TOPO_ES_NOT_RESPONDING)) { + if (sas_expander) + mpi3mr_expander_remove(mrioc, exp_sas_address, + hba_port); + } + +} + +/** + * mpi3mr_pcietopochg_evt_debug - PCIeTopoChange details + * @mrioc: Adapter instance reference + * @event_data: PCIe topology change list event data + * + * Prints information about the PCIe topology change event. + * + * Return: Nothing. 
+ */ +static void +mpi3mr_pcietopochg_evt_debug(struct mpi3mr_ioc *mrioc, + struct mpi3_event_data_pcie_topology_change_list *event_data) +{ + int i; + u16 handle; + u16 reason_code; + u8 port_number; + char *status_str = NULL; + u8 link_rate, prev_link_rate; + + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT_WORK_TASK)) + return; + + switch (event_data->switch_status) { + case MPI3_EVENT_PCIE_TOPO_SS_NOT_RESPONDING: + status_str = "remove"; + break; + case MPI3_EVENT_PCIE_TOPO_SS_RESPONDING: + status_str = "responding"; + break; + case MPI3_EVENT_PCIE_TOPO_SS_DELAY_NOT_RESPONDING: + status_str = "remove delay"; + break; + case MPI3_EVENT_PCIE_TOPO_SS_NO_PCIE_SWITCH: + status_str = "direct attached"; + break; + default: + status_str = "unknown status"; + break; + } + ioc_info(mrioc, "%s :pcie topology change: (%s)\n", + __func__, status_str); + ioc_info(mrioc, + "%s :\tswitch_handle(0x%04x), enclosure_handle(0x%04x) start_port(%02d), num_entries(%d)\n", + __func__, le16_to_cpu(event_data->switch_dev_handle), + le16_to_cpu(event_data->enclosure_handle), + event_data->start_port_num, event_data->num_entries); + for (i = 0; i < event_data->num_entries; i++) { + handle = + le16_to_cpu(event_data->port_entry[i].attached_dev_handle); + if (!handle) + continue; + port_number = event_data->start_port_num + i; + reason_code = event_data->port_entry[i].port_status; + switch (reason_code) { + case MPI3_EVENT_PCIE_TOPO_PS_NOT_RESPONDING: + status_str = "target remove"; + break; + case MPI3_EVENT_PCIE_TOPO_PS_DELAY_NOT_RESPONDING: + status_str = "delay target remove"; + break; + case MPI3_EVENT_PCIE_TOPO_PS_PORT_CHANGED: + status_str = "link status change"; + break; + case MPI3_EVENT_PCIE_TOPO_PS_NO_CHANGE: + status_str = "link status no change"; + break; + case MPI3_EVENT_PCIE_TOPO_PS_RESPONDING: + status_str = "target responding"; + break; + default: + status_str = "unknown"; + break; + } + link_rate = event_data->port_entry[i].current_port_info & + MPI3_EVENT_PCIE_TOPO_PI_RATE_MASK; + prev_link_rate = event_data->port_entry[i].previous_port_info & + MPI3_EVENT_PCIE_TOPO_PI_RATE_MASK; + ioc_info(mrioc, + "%s :\tport(%02d), attached_handle(0x%04x): %s: link rate: new(0x%02x), old(0x%02x)\n", + __func__, port_number, handle, status_str, link_rate, + prev_link_rate); + } +} + +/** + * mpi3mr_pcietopochg_evt_bh - PCIeTopologyChange evt bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Prints information about the PCIe topology change event and + * for "not responding" event code, removes the device from the + * upper layers. + * + * Return: Nothing. 
+ */ +static void mpi3mr_pcietopochg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + struct mpi3_event_data_pcie_topology_change_list *event_data = + (struct mpi3_event_data_pcie_topology_change_list *) + fwevt->event_data; + int i; + u16 handle; + u8 reason_code; + struct mpi3mr_tgt_dev *tgtdev = NULL; + + mpi3mr_pcietopochg_evt_debug(mrioc, event_data); + + for (i = 0; i < event_data->num_entries; i++) { + if (fwevt->discard) + return; + handle = + le16_to_cpu(event_data->port_entry[i].attached_dev_handle); + if (!handle) + continue; + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + if (!tgtdev) + continue; + + reason_code = event_data->port_entry[i].port_status; + + switch (reason_code) { + case MPI3_EVENT_PCIE_TOPO_PS_NOT_RESPONDING: + if (tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + break; + default: + break; + } + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + } +} + +/** + * mpi3mr_logdata_evt_bh - Log data event bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Extracts the event data and calls application interfacing + * function to process the event further. + * + * Return: Nothing. + */ +static void mpi3mr_logdata_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + mpi3mr_app_save_logdata(mrioc, fwevt->event_data, + fwevt->event_data_size); +} + +/** + * mpi3mr_update_sdev_qd - Update SCSI device queue depath + * @sdev: SCSI device reference + * @data: Queue depth reference + * + * This is an iterator function called for each SCSI device in a + * target to update the QD of each SCSI device. + * + * Return: Nothing. + */ +static void mpi3mr_update_sdev_qd(struct scsi_device *sdev, void *data) +{ + u16 *q_depth = (u16 *)data; + scsi_change_queue_depth(sdev, (int)*q_depth); + sdev->max_queue_depth = sdev->queue_depth; +} +/** + * mpi3mr_set_qd_for_all_vd_in_tg -set QD for TG VDs + * @mrioc: Adapter instance reference + * @tg: Throttle group information pointer + * + * Accessor to reduce QD for each device associated with the + * given throttle group. + * + * Return: None. + */ +static void mpi3mr_set_qd_for_all_vd_in_tg(struct mpi3mr_ioc *mrioc, + struct mpi3mr_throttle_group_info *tg) +{ + unsigned long flags; + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *tgt_priv; + + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (tgtdev->starget && tgtdev->starget->hostdata) { + tgt_priv = tgtdev->starget->hostdata; + if (tgt_priv->throttle_group == tg) { + dprint_event_bh(mrioc, + "updating qd due to throttling for persist_id(%d) original_qd(%d), reduced_qd (%d)\n", + tgt_priv->perst_id, tgtdev->q_depth, + tg->modified_qd); + starget_for_each_device(tgtdev->starget, + (void *)&tg->modified_qd, + mpi3mr_update_sdev_qd); + } + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); +} + +/** + * mpi3mr_fwevt_bh - Firmware event bottomhalf handler + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Identifies the firmware event and calls corresponding bottom + * half handler and sends event acknowledgment if required. + * + * Return: Nothing. 
+ */ +static void mpi3mr_fwevt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + struct mpi3_device_page0 *dev_pg0 = NULL; + u16 perst_id, handle, dev_info; + struct mpi3_device0_sas_sata_format *sasinf = NULL; + struct mpi3mr_throttle_group_info *tg; + + mpi3mr_fwevt_del_from_list(mrioc, fwevt); + mrioc->current_event = fwevt; + + if (mrioc->stop_drv_processing) { + dprint_event_bh(mrioc, "ignoring event(0x%02x) in the bottom half handler due to stop_drv_processing\n", + fwevt->event_id); + goto out; + } + if (mrioc->unrecoverable) { + dprint_event_bh(mrioc, "ignoring event(0x%02x) in the bottom half handler due to unrecoverable controller\n", + fwevt->event_id); + goto out; + } + + if (!fwevt->process_event) + goto evt_ack; + + dprint_event_bh(mrioc, "processing event(0x%02x) in the bottom half handler\n", + fwevt->event_id); + switch (fwevt->event_id) { + case MPI3_EVENT_DEVICE_ADDED: + { + dev_pg0 = (struct mpi3_device_page0 *)fwevt->event_data; + perst_id = le16_to_cpu(dev_pg0->persistent_id); + handle = le16_to_cpu(dev_pg0->dev_handle); + if (perst_id != MPI3_DEVICE0_PERSISTENTID_INVALID) + mpi3mr_report_tgtdev_to_host(mrioc, perst_id); + else if (mrioc->sas_transport_enabled && + (dev_pg0->device_form == + MPI3_DEVICE_DEVFORM_SAS_SATA)) { + sasinf = &dev_pg0->device_specific.sas_sata_format; + dev_info = le16_to_cpu(sasinf->device_info); + if (!mrioc->sas_hba.num_phys) + mpi3mr_sas_host_add(mrioc); + else + mpi3mr_sas_host_refresh(mrioc); + if (mpi3mr_is_expander_device(dev_info)) + mpi3mr_expander_add(mrioc, handle); + } + break; + } + case MPI3_EVENT_DEVICE_INFO_CHANGED: + { + dev_pg0 = (struct mpi3_device_page0 *)fwevt->event_data; + perst_id = le16_to_cpu(dev_pg0->persistent_id); + if (perst_id != MPI3_DEVICE0_PERSISTENTID_INVALID) + mpi3mr_devinfochg_evt_bh(mrioc, dev_pg0); + break; + } + case MPI3_EVENT_DEVICE_STATUS_CHANGE: + { + mpi3mr_devstatuschg_evt_bh(mrioc, fwevt); + break; + } + case MPI3_EVENT_ENCL_DEVICE_ADDED: + case MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE: + { + mpi3mr_encldev_add_chg_evt_bh(mrioc, fwevt); + break; + } + + case MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST: + { + mpi3mr_sastopochg_evt_bh(mrioc, fwevt); + break; + } + case MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST: + { + mpi3mr_pcietopochg_evt_bh(mrioc, fwevt); + break; + } + case MPI3_EVENT_LOG_DATA: + { + mpi3mr_logdata_evt_bh(mrioc, fwevt); + break; + } + case MPI3MR_DRIVER_EVENT_WAIT_FOR_DEVICES_TO_REFRESH: + { + while (mrioc->device_refresh_on) { + msleep(500); + } + dprint_event_bh(mrioc, + "scan for non responding and newly added devices after soft reset started\n"); + if (mrioc->sas_transport_enabled) { + mpi3mr_refresh_sas_ports(mrioc); + mpi3mr_refresh_expanders(mrioc); + } + mpi3mr_refresh_tgtdevs(mrioc); + ioc_info(mrioc, + "scan for non responding and newly added devices after soft reset completed\n"); + break; + } + case MPI3MR_DRIVER_EVENT_TG_QD_REDUCTION: + { + tg = (struct mpi3mr_throttle_group_info *) + (*(__le64 *)fwevt->event_data); + dprint_event_bh(mrioc, + "qd reduction event processed for tg_id(%d) reduction_needed(%d)\n", + tg->id, tg->need_qd_reduction); + if (tg->need_qd_reduction) { + mpi3mr_set_qd_for_all_vd_in_tg(mrioc, tg); + tg->need_qd_reduction = 0; + } + break; + } + case MPI3MR_DRIVER_EVENT_PROCESS_TRIGGER: + { + mpi3mr_process_trigger_data_event_bh(mrioc, + (struct trigger_event_data *)fwevt->event_data); + break; + } + default: + break; + } + +evt_ack: + if (fwevt->send_ack) + mpi3mr_process_event_ack(mrioc, fwevt->event_id, + fwevt->event_context); +out: + /* Put 
fwevt reference count to neutralize kref_init increment */ + mpi3mr_fwevt_put(fwevt); + mrioc->current_event = NULL; + +} + +/** + * mpi3mr_fwevt_worker - Firmware event worker + * @work: Work struct containing firmware event + * + * Extracts the firmware event and calls mpi3mr_fwevt_bh. + * + * Return: Nothing. + */ +static void mpi3mr_fwevt_worker(struct work_struct *work) +{ + struct mpi3mr_fwevt *fwevt = container_of(work, struct mpi3mr_fwevt, + work); + mpi3mr_fwevt_bh(fwevt->mrioc, fwevt); + /* + * Put fwevt reference count after + * dequeuing it from worker queue + */ + mpi3mr_fwevt_put(fwevt); +} + + +/** + * mpi3mr_create_tgtdev - Create and add a target device + * @mrioc: Adapter instance reference + * @dev_pg0: Device Page 0 data + * + * If the device specified by the device page 0 data is not + * present in the driver's internal list, allocate the memory + * for the device, populate the data and add to the list, else + * update the device data. The key is persistent ID. + * + * Return: 0 on success, -ENOMEM on memory allocation failure + */ +static int mpi3mr_create_tgtdev(struct mpi3mr_ioc *mrioc, + struct mpi3_device_page0 *dev_pg0) +{ + int retval = 0; + struct mpi3mr_tgt_dev *tgtdev = NULL; + u16 perst_id = 0; + + perst_id = le16_to_cpu(dev_pg0->persistent_id); + if (perst_id == MPI3_DEVICE0_PERSISTENTID_INVALID) + return retval; + + tgtdev = mpi3mr_get_tgtdev_by_perst_id(mrioc, perst_id); + if (tgtdev) { + mpi3mr_update_tgtdev(mrioc, tgtdev, dev_pg0, true); + mpi3mr_tgtdev_put(tgtdev); + } else { + tgtdev = mpi3mr_alloc_tgtdev(); + if (!tgtdev) + return -ENOMEM; + mpi3mr_update_tgtdev(mrioc, tgtdev, dev_pg0, true); + mpi3mr_tgtdev_add_to_list(mrioc, tgtdev); + } + + return retval; +} + +/** + * mpi3mr_flush_delayed_cmd_lists - Flush pending commands + * @mrioc: Adapter instance reference + * + * Flush pending commands in the delayed lists due to a + * controller reset or driver removal as a cleanup. + * + * Return: Nothing + */ +void mpi3mr_flush_delayed_cmd_lists(struct mpi3mr_ioc *mrioc) +{ + struct delayed_dev_rmhs_node *_rmhs_node; + struct delayed_evt_ack_node *_evtack_node; + + dprint_reset(mrioc, "flushing delayed dev_remove_hs commands\n"); + while (!list_empty(&mrioc->delayed_rmhs_list)) { + _rmhs_node = list_entry(mrioc->delayed_rmhs_list.next, + struct delayed_dev_rmhs_node, list); + list_del(&_rmhs_node->list); + kfree(_rmhs_node); + } + dprint_reset(mrioc, "flushing delayed event ack commands\n"); + while (!list_empty(&mrioc->delayed_evtack_cmds_list)) { + _evtack_node = list_entry(mrioc->delayed_evtack_cmds_list.next, + struct delayed_evt_ack_node, list); + list_del(&_evtack_node->list); + kfree(_evtack_node); + } +} + +/** + * mpi3mr_dev_rmhs_complete_iou - Device removal IOUC completion + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Issues a target reset TM to the firmware from the device + * removal TM pend list or retry the removal handshake sequence + * based on the IOU control request IOC status. 
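+ * On success the device handle is also cleared from the remove pending
+ * bitmap. Any handshake queued on the delayed removal list is then issued
+ * next, reusing the same command tracker.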
+ * + * Return: Nothing + */ +static void mpi3mr_dev_rmhs_complete_iou(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + u16 cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_DEVRMCMD_MIN; + struct delayed_dev_rmhs_node *delayed_dev_rmhs = NULL; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto clear_drv_cmd; + + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): iounit control completed for handle(0x%04x), rc(%d), ioc_status(0x%04x), loginfo(0x%08x)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc, + drv_cmd->ioc_status, drv_cmd->ioc_loginfo); + + if (drv_cmd->ioc_status != MPI3_IOCSTATUS_SUCCESS) { + if (drv_cmd->retry_count < MPI3MR_DEV_RMHS_RETRY_COUNT) { + drv_cmd->retry_count++; + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): retrying for handle(0x%04x), rc(%d), ioc_status(0x%04x), loginfo(0x%08x), retry_count(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc, + drv_cmd->ioc_status, drv_cmd->ioc_loginfo, + drv_cmd->retry_count); + mpi3mr_dev_rmhs_send_tm(mrioc, drv_cmd->dev_handle, + drv_cmd, drv_cmd->iou_rc); + return; + } + ioc_err(mrioc, + "dev_remove_hs: cmd_idx(%d): failed for handle(0x%04x), rc(%d) after all retries(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc, + drv_cmd->retry_count); + } else { + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): completed successfully for handle(0x%04x), rc(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc); + clear_bit(drv_cmd->dev_handle, mrioc->removepend_bitmap); + } + + if (!list_empty(&mrioc->delayed_rmhs_list)) { + delayed_dev_rmhs = list_entry(mrioc->delayed_rmhs_list.next, + struct delayed_dev_rmhs_node, list); + drv_cmd->dev_handle = delayed_dev_rmhs->handle; + drv_cmd->retry_count = 0; + drv_cmd->iou_rc = delayed_dev_rmhs->iou_rc; + mpi3mr_dev_rmhs_send_tm(mrioc, drv_cmd->dev_handle, drv_cmd, + drv_cmd->iou_rc); + list_del(&delayed_dev_rmhs->list); + kfree(delayed_dev_rmhs); + return; + } + +clear_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; + drv_cmd->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + clear_bit(cmd_idx, mrioc->devrem_bitmap); +} + +/** + * mpi3mr_dev_rmhs_complete_tm - Device removal TM completion + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Issues a target reset TM to the firmware from the device + * removal TM pend list or issue IO Unit Control request as + * part of device removal or hidden acknowledgment handshake. 
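+ * The IO unit control request carries the reason code (remove device or
+ * hidden acknowledgment) that was stored in the command tracker when the
+ * handshake was started.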
+ * + * Return: Nothing + */ +static void mpi3mr_dev_rmhs_complete_tm(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_iounit_control_request iou_ctrl; + u16 cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_DEVRMCMD_MIN; + struct mpi3_scsi_task_mgmt_reply *tm_reply = NULL; + int retval; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto clear_drv_cmd; + + if (drv_cmd->state & MPI3MR_CMD_REPLY_VALID) + tm_reply = (struct mpi3_scsi_task_mgmt_reply *)drv_cmd->reply; + + if (tm_reply) + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): target reset completed for handle(0x%04x), ioc_status(0x%04x), log_info(0x%08x), termination_count(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->ioc_status, + drv_cmd->ioc_loginfo, + le32_to_cpu(tm_reply->termination_count)); + + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): sending iounit control for handle(0x%04x) rc(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc); + + memset(&iou_ctrl, 0, sizeof(iou_ctrl)); + + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_dev_rmhs_complete_iou; + iou_ctrl.operation = drv_cmd->iou_rc; + iou_ctrl.param16[0] = cpu_to_le16(drv_cmd->dev_handle); + iou_ctrl.host_tag = cpu_to_le16(drv_cmd->host_tag); + iou_ctrl.function = MPI3_FUNCTION_IO_UNIT_CONTROL; + + retval = mpi3mr_admin_request_post(mrioc, &iou_ctrl, sizeof(iou_ctrl), + 1); + if (retval) { + ioc_err(mrioc, + "dev_remove_hs: cmd_idx(%d): posting iounit control for handle(0x%04x) rc(%d) failed\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc); + goto clear_drv_cmd; + } + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): posted iounit control for handle(0x%04x) rc(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc); + + return; +clear_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + drv_cmd->retry_count = 0; + clear_bit(cmd_idx, mrioc->devrem_bitmap); +} + +/** + * mpi3mr_dev_rmhs_send_tm - Issue TM for device removal + * @mrioc: Adapter instance reference + * @handle: Device handle + * @cmdparam: Internal command tracker + * @iou_rc: IO unit reason code + * + * Issues a target reset TM to the firmware or add it to a pend + * list as part of device removal or hidden acknowledgment + * handshake. 
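+ * When no device removal command tracker is free, the handle and reason
+ * code are queued on the delayed removal list and the handshake is issued
+ * later from the completion path of an active one.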
+ * + * Return: Nothing + */ +static void mpi3mr_dev_rmhs_send_tm(struct mpi3mr_ioc *mrioc, u16 handle, + struct mpi3mr_drv_cmd *cmdparam, u8 iou_rc) +{ + struct mpi3_scsi_task_mgmt_request tm_req; + int retval = 0; + u16 cmd_idx = MPI3MR_NUM_DEVRMCMD; + u8 retrycount = 5; + struct mpi3mr_drv_cmd *drv_cmd = cmdparam; + struct delayed_dev_rmhs_node *delayed_dev_rmhs = NULL; + + if (drv_cmd) { + ioc_info(mrioc, + "dev_remove_hs: sending delayed target reset for handle(0x%04x) rc(%d)\n", + handle, iou_rc); + goto issue_cmd; + } + ioc_info(mrioc, + "dev_remove_hs: sending target reset for handle(0x%04x) rc(%d)\n", + handle, iou_rc); + + do { + cmd_idx = find_first_zero_bit(mrioc->devrem_bitmap, + MPI3MR_NUM_DEVRMCMD); + if (cmd_idx < MPI3MR_NUM_DEVRMCMD) { + if (!test_and_set_bit(cmd_idx, mrioc->devrem_bitmap)) + break; + cmd_idx = MPI3MR_NUM_DEVRMCMD; + } + } while (retrycount--); + + if (cmd_idx >= MPI3MR_NUM_DEVRMCMD) { + delayed_dev_rmhs = kzalloc(sizeof(*delayed_dev_rmhs), + GFP_ATOMIC); + if (!delayed_dev_rmhs) + return; + INIT_LIST_HEAD(&delayed_dev_rmhs->list); + delayed_dev_rmhs->handle = handle; + delayed_dev_rmhs->iou_rc = iou_rc; + list_add_tail(&delayed_dev_rmhs->list, + &mrioc->delayed_rmhs_list); + ioc_info(mrioc, + "dev_remove_hs: target reset for handle(0x%04x) rc(%d) is postponed\n", + handle, iou_rc); + return; + } + drv_cmd = &mrioc->dev_rmhs_cmds[cmd_idx]; + +issue_cmd: + cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_DEVRMCMD_MIN; + + memset(&tm_req, 0, sizeof(tm_req)); + if (drv_cmd->state & MPI3MR_CMD_PENDING) { + ioc_err(mrioc, + "dev_remove_hs: sending target reset for handle(0x%04x) rc(%d) is failed due to command in use\n", + handle, iou_rc); + goto out; + } + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_dev_rmhs_complete_tm; + drv_cmd->dev_handle = handle; + drv_cmd->iou_rc = iou_rc; + tm_req.dev_handle = cpu_to_le16(handle); + tm_req.task_type = MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET; + tm_req.host_tag = cpu_to_le16(drv_cmd->host_tag); + tm_req.task_host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INVALID); + tm_req.function = MPI3_FUNCTION_SCSI_TASK_MGMT; + + set_bit(handle, mrioc->removepend_bitmap); + retval = mpi3mr_admin_request_post(mrioc, &tm_req, sizeof(tm_req), 1); + if (retval) { + ioc_err(mrioc, + "dev_remove_hs: posting target reset for handle(0x%04x) rc(%d) is failed\n", + handle, iou_rc); + goto out_failed; + } + ioc_info(mrioc, + "dev_remove_hs: posted target reset for handle(0x%04x) rc(%d) with cmd_idx(%d)\n", + handle, iou_rc, cmd_idx); +out: + return; +out_failed: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + drv_cmd->retry_count = 0; + clear_bit(cmd_idx, mrioc->devrem_bitmap); +} + + +/** + * mpi3mr_complete_evt_ack - event ack request completion + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * This is the completion handler for non blocking event + * acknowledgment sent to the firmware and this will issue any + * pending event acknowledgment request. 
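+ * Acknowledgments queued on the delayed event ack list are issued one at
+ * a time, reusing the command tracker that has just completed.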
+ * + * Return: Nothing + */ +static void mpi3mr_complete_evt_ack(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + u16 cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_EVTACKCMD_MIN; + struct delayed_evt_ack_node *delayed_evtack = NULL; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto clear_drv_cmd; + + if (drv_cmd->ioc_status != MPI3_IOCSTATUS_SUCCESS) { + dprint_event_th(mrioc, + "immediate event ack failed with ioc_status(0x%04x) log_info(0x%08x)\n", + (drv_cmd->ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + drv_cmd->ioc_loginfo); + } + + if (!list_empty(&mrioc->delayed_evtack_cmds_list)) { + delayed_evtack = + list_entry(mrioc->delayed_evtack_cmds_list.next, + struct delayed_evt_ack_node, list); + mpi3mr_send_event_ack(mrioc, delayed_evtack->event, drv_cmd, + delayed_evtack->event_ctx); + list_del(&delayed_evtack->list); + kfree(delayed_evtack); + return; + } +clear_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + clear_bit(cmd_idx, mrioc->evtack_cmds_bitmap); +} + +/** + * mpi3mr_send_event_ack - Issue event acknwoledgment request + * @mrioc: Adapter instance reference + * @event: MPI3 event id + * @cmdparam: Internal command tracker + * @event_ctx: event context + * + * Issues event acknowledgment request to the firmware if there + * is a free command to send the event ack else it to a pend + * list so that it will be processed on a completion of a prior + * event acknowledgment . + * + * Return: Nothing + */ +static void mpi3mr_send_event_ack(struct mpi3mr_ioc *mrioc, u8 event, + struct mpi3mr_drv_cmd *cmdparam, u32 event_ctx) +{ + struct mpi3_event_ack_request evtack_req; + int retval = 0; + u8 retrycount = 5; + u16 cmd_idx = MPI3MR_NUM_EVTACKCMD; + struct mpi3mr_drv_cmd *drv_cmd = cmdparam; + struct delayed_evt_ack_node *delayed_evtack = NULL; + + if (drv_cmd) { + dprint_event_th(mrioc, + "sending delayed event ack in the top half for event(0x%02x), event_ctx(0x%08x)\n", + event, event_ctx); + goto issue_cmd; + } + dprint_event_th(mrioc, + "sending event ack in the top half for event(0x%02x), event_ctx(0x%08x)\n", + event, event_ctx); + do { + cmd_idx = find_first_zero_bit(mrioc->evtack_cmds_bitmap, + MPI3MR_NUM_EVTACKCMD); + if (cmd_idx < MPI3MR_NUM_EVTACKCMD) { + if (!test_and_set_bit(cmd_idx, + mrioc->evtack_cmds_bitmap)) + break; + cmd_idx = MPI3MR_NUM_EVTACKCMD; + } + } while (retrycount--); + + if (cmd_idx >= MPI3MR_NUM_EVTACKCMD) { + delayed_evtack = kzalloc(sizeof(*delayed_evtack), + GFP_ATOMIC); + if (!delayed_evtack) + return; + INIT_LIST_HEAD(&delayed_evtack->list); + delayed_evtack->event = event; + delayed_evtack->event_ctx = event_ctx; + list_add_tail(&delayed_evtack->list, + &mrioc->delayed_evtack_cmds_list); + dprint_event_th(mrioc, + "event ack in the top half for event(0x%02x), event_ctx(0x%08x) is postponed\n", + event, event_ctx); + return; + } + drv_cmd = &mrioc->evtack_cmds[cmd_idx]; + +issue_cmd: + cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_EVTACKCMD_MIN; + + memset(&evtack_req, 0, sizeof(evtack_req)); + if (drv_cmd->state & MPI3MR_CMD_PENDING) { + dprint_event_th(mrioc, + "sending event ack failed due to command in use\n"); + goto out; + } + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_complete_evt_ack; + evtack_req.host_tag = cpu_to_le16(drv_cmd->host_tag); + evtack_req.function = MPI3_FUNCTION_EVENT_ACK; + evtack_req.event = event; + evtack_req.event_context = cpu_to_le32(event_ctx); + retval = mpi3mr_admin_request_post(mrioc, &evtack_req, + sizeof(evtack_req), 1); + 
if (retval) { + dprint_event_th(mrioc, + "posting event ack request is failed\n"); + goto out_failed; + } + + dprint_event_th(mrioc, + "event ack in the top half for event(0x%02x), event_ctx(0x%08x) is posted\n", + event, event_ctx); +out: + return; +out_failed: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + clear_bit(cmd_idx, mrioc->evtack_cmds_bitmap); +} + +/** + * mpi3mr_pcietopochg_evt_th - PCIETopologyChange event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Checks for the reason code and based on that either block I/O + * to device, or unblock I/O to the device, or start the device + * removal handshake with reason as remove with the firmware for + * PCIe devices. + * + * Return: Nothing + */ +static void mpi3mr_pcietopochg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_pcie_topology_change_list *topo_evt = + (struct mpi3_event_data_pcie_topology_change_list *) event_reply->event_data; + int i; + u16 handle; + u8 reason_code; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + + for (i = 0; i < topo_evt->num_entries; i++) { + handle = le16_to_cpu(topo_evt->port_entry[i].attached_dev_handle); + if (!handle) + continue; + reason_code = topo_evt->port_entry[i].port_status; + scsi_tgt_priv_data = NULL; + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + if (tgtdev && tgtdev->starget && tgtdev->starget->hostdata) + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + switch (reason_code) { + case MPI3_EVENT_PCIE_TOPO_PS_NOT_RESPONDING: + if (scsi_tgt_priv_data) { + scsi_tgt_priv_data->dev_removed = 1; + scsi_tgt_priv_data->dev_removedelay = 0; + atomic_set(&scsi_tgt_priv_data->block_io, 0); + } + mpi3mr_dev_rmhs_send_tm(mrioc, handle, NULL, + MPI3_CTRL_OP_REMOVE_DEVICE); + break; + case MPI3_EVENT_PCIE_TOPO_PS_DELAY_NOT_RESPONDING: + if (scsi_tgt_priv_data) { + scsi_tgt_priv_data->dev_removedelay = 1; + atomic_inc(&scsi_tgt_priv_data->block_io); + } + break; + case MPI3_EVENT_PCIE_TOPO_PS_RESPONDING: + if (scsi_tgt_priv_data && + scsi_tgt_priv_data->dev_removedelay) { + scsi_tgt_priv_data->dev_removedelay = 0; + atomic_dec_if_positive + (&scsi_tgt_priv_data->block_io); + } + break; + case MPI3_EVENT_PCIE_TOPO_PS_PORT_CHANGED: + default: + break; + } + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + } +} + +/** + * mpi3mr_sastopochg_evt_th - SASTopologyChange event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Checks for the reason code and based on that either block I/O + * to device, or unblock I/O to the device, or start the device + * removal handshake with reason as remove with the firmware for + * SAS/SATA devices. 
+ * + * Return: Nothing + */ +static void mpi3mr_sastopochg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_sas_topology_change_list *topo_evt = + (struct mpi3_event_data_sas_topology_change_list *) + event_reply->event_data; + int i; + u16 handle; + u8 reason_code; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + + for (i = 0; i < topo_evt->num_entries; i++) { + handle = le16_to_cpu(topo_evt->phy_entry[i].attached_dev_handle); + if (!handle) + continue; + reason_code = topo_evt->phy_entry[i].status & + MPI3_EVENT_SAS_TOPO_PHY_RC_MASK; + scsi_tgt_priv_data = NULL; + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + if (tgtdev && tgtdev->starget && tgtdev->starget->hostdata) + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + switch (reason_code) { + case MPI3_EVENT_SAS_TOPO_PHY_RC_TARG_NOT_RESPONDING: + if (scsi_tgt_priv_data) { + scsi_tgt_priv_data->dev_removed = 1; + scsi_tgt_priv_data->dev_removedelay = 0; + atomic_set(&scsi_tgt_priv_data->block_io, 0); + } + mpi3mr_dev_rmhs_send_tm(mrioc, handle, NULL, + MPI3_CTRL_OP_REMOVE_DEVICE); + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_DELAY_NOT_RESPONDING: + if (scsi_tgt_priv_data) { + scsi_tgt_priv_data->dev_removedelay = 1; + atomic_inc(&scsi_tgt_priv_data->block_io); + } + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_RESPONDING: + if (scsi_tgt_priv_data && + scsi_tgt_priv_data->dev_removedelay) { + scsi_tgt_priv_data->dev_removedelay = 0; + atomic_dec_if_positive + (&scsi_tgt_priv_data->block_io); + } + case MPI3_EVENT_SAS_TOPO_PHY_RC_PHY_CHANGED: + default: + break; + } + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + } + +} + +/** + * mpi3mr_devstatuschg_evt_th - DeviceStatusChange event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Checks for the reason code and based on that either block I/O + * to device, or unblock I/O to the device, or start the device + * removal handshake with reason as remove/hide acknowledgment + * with the firmware. 
+ * + * Return: Nothing + */ +static void mpi3mr_devstatuschg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + u16 dev_handle = 0; + u8 ublock = 0, block = 0, hide = 0, delete = 0, remove = 0; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + struct mpi3_event_data_device_status_change *evtdata = + (struct mpi3_event_data_device_status_change *) + event_reply->event_data; + + if (mrioc->stop_drv_processing) + goto out; + + dev_handle = le16_to_cpu(evtdata->dev_handle); + dprint_event_th(mrioc, + "device status change event top half with rc(0x%02x) for handle(0x%04x)\n", + evtdata->reason_code, dev_handle); + + switch (evtdata->reason_code) { + case MPI3_EVENT_DEV_STAT_RC_INT_DEVICE_RESET_STRT: + case MPI3_EVENT_DEV_STAT_RC_INT_IT_NEXUS_RESET_STRT: + block = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_HIDDEN: + delete = 1; + hide = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_VD_NOT_RESPONDING: + delete = 1; + remove = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_INT_DEVICE_RESET_CMP: + case MPI3_EVENT_DEV_STAT_RC_INT_IT_NEXUS_RESET_CMP: + ublock = 1; + break; + default: + break; + } + + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (!tgtdev) { + dprint_event_th(mrioc, + "processing device status change event could not identify device for handle(0x%04x)\n", + dev_handle); + goto out; + } + if (hide) + tgtdev->is_hidden = hide; + if (tgtdev->starget && tgtdev->starget->hostdata) { + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + if (block) + atomic_inc(&scsi_tgt_priv_data->block_io); + if (delete) + scsi_tgt_priv_data->dev_removed = 1; + if (ublock) + atomic_dec_if_positive(&scsi_tgt_priv_data->block_io); + } + if (remove) + mpi3mr_dev_rmhs_send_tm(mrioc, dev_handle, NULL, + MPI3_CTRL_OP_REMOVE_DEVICE); + if (hide) + mpi3mr_dev_rmhs_send_tm(mrioc, dev_handle, NULL, + MPI3_CTRL_OP_HIDDEN_ACK); + +out: + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + +} + +/** + * mpi3mr_preparereset_evt_th - Prepare for reset event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Blocks and unblocks host level I/O based on the reason code + * + * Return: Nothing + */ +static void mpi3mr_preparereset_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_prepare_for_reset *evtdata = + (struct mpi3_event_data_prepare_for_reset *)event_reply->event_data; + + if (evtdata->reason_code == MPI3_EVENT_PREPARE_RESET_RC_START) { + dprint_event_th(mrioc, + "prepare for reset event top half with rc=start\n"); + if (mrioc->prepare_for_reset) + return; + mrioc->prepare_for_reset = 1; + mrioc->prepare_for_reset_timeout_counter = 0; + } else if (evtdata->reason_code == MPI3_EVENT_PREPARE_RESET_RC_ABORT) { + dprint_event_th(mrioc, + "prepare for reset top half with rc=abort\n"); + mrioc->prepare_for_reset = 0; + mrioc->prepare_for_reset_timeout_counter = 0; + } + if ((event_reply->msg_flags & MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_MASK) + == MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_REQUIRED) + mpi3mr_send_event_ack(mrioc, event_reply->event, NULL, + le32_to_cpu(event_reply->event_context)); +} + +/** + * mpi3mr_energypackchg_evt_th - Energy pack change event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Identifies the new shutdown timeout value and update. 
+ * + * Return: Nothing + */ +static void mpi3mr_energypackchg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_energy_pack_change *evtdata = + (struct mpi3_event_data_energy_pack_change *) + event_reply->event_data; + u16 shutdown_timeout = le16_to_cpu(evtdata->shutdown_timeout); + + if (shutdown_timeout <= 0) { + dprint_event_th(mrioc, + "invalid shutdown timeout(%d) in the energy pack change event\n", + shutdown_timeout); + return; + } + + dprint_event_th(mrioc, + "previous shutdown timeout(%d), new shutdown timeout(%d) in the energy pack change event\n", + mrioc->facts.shutdown_timeout, shutdown_timeout); + mrioc->facts.shutdown_timeout = shutdown_timeout; +} + + +/** + * mpi3mr_cablemgmt_evt_th - Cable management event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Displays Cable manegemt event details. + * + * Return: Nothing + */ +static void mpi3mr_cablemgmt_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_cable_management *evtdata = + (struct mpi3_event_data_cable_management *)event_reply->event_data; + + switch (evtdata->status) { + case MPI3_EVENT_CABLE_MGMT_STATUS_INSUFFICIENT_POWER: + { + ioc_info(mrioc, "An active cable with receptacle_id %d cannot be powered.\n" + "Devices connected to this cable are not detected.\n" + "This cable requires %d mW of power.\n", + evtdata->receptacle_id, + le32_to_cpu(evtdata->active_cable_power_requirement)); + break; + } + case MPI3_EVENT_CABLE_MGMT_STATUS_DEGRADED: + { + ioc_info(mrioc, "A cable with receptacle_id %d is not running at optimal speed\n", + evtdata->receptacle_id); + break; + } + default: + break; + } +} + +/** + * mpi3mr_add_event_wait_for_device_refresh - Add Wait for Device Refresh Event + * @mrioc: Adapter instance reference + * + * Add driver specific event to make sure that the driver won't process the + * events until all the devices are refreshed during soft reset. + * + * Return: Nothing + */ +void mpi3mr_add_event_wait_for_device_refresh(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_fwevt *fwevt = NULL; + + fwevt = mpi3mr_alloc_fwevt(0); + if (!fwevt) + { + dprint_event_th(mrioc, + "failed to schedule bottom half handler for event(0x%02x)\n", + MPI3MR_DRIVER_EVENT_WAIT_FOR_DEVICES_TO_REFRESH); + return; + } + fwevt->mrioc = mrioc; + fwevt->event_id = MPI3MR_DRIVER_EVENT_WAIT_FOR_DEVICES_TO_REFRESH; + fwevt->send_ack = 0; + fwevt->process_event = 1; + fwevt->event_context = 0; + fwevt->event_data_size = 0; + mpi3mr_fwevt_add_to_list(mrioc, fwevt); +} + +/** + * mpi3mr_os_handle_events - Firmware event handler + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Identify whether the event has to handled and acknowledged + * and either process the event in the tophalf and/or schedule a + * bottom half through mpi3mr_fwevt_worker. 
+ * + * Return: Nothing + */ +void mpi3mr_os_handle_events(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + u8 evt_type; + u16 sz; + struct mpi3mr_fwevt *fwevt = NULL; + bool ack_req = 0, process_event_bh = 0; + + if (mrioc->stop_drv_processing) + return; + + if ((event_reply->msg_flags & MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_MASK) + == MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_REQUIRED) + ack_req = 1; + + evt_type = event_reply->event; + mpi3mr_event_trigger(mrioc, event_reply->event); + + switch (evt_type) { + case MPI3_EVENT_DEVICE_ADDED: + { + struct mpi3_device_page0 *dev_pg0 = + (struct mpi3_device_page0 *)event_reply->event_data; + if (mpi3mr_create_tgtdev(mrioc, dev_pg0)) + dprint_event_th(mrioc, + "failed to process device added event for handle(0x%04x), perst_id(%d) in the event top half handler\n", + le16_to_cpu(dev_pg0->dev_handle), + le16_to_cpu(dev_pg0->persistent_id)); + else + process_event_bh = 1; + break; + } + case MPI3_EVENT_DEVICE_STATUS_CHANGE: + { + process_event_bh = 1; + mpi3mr_devstatuschg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST: + { + process_event_bh = 1; + mpi3mr_sastopochg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST: + { + process_event_bh = 1; + mpi3mr_pcietopochg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_PREPARE_FOR_RESET: + { + mpi3mr_preparereset_evt_th(mrioc, event_reply); + ack_req = 0; + break; + } + case MPI3_EVENT_DIAGNOSTIC_BUFFER_STATUS_CHANGE: + { + mpi3mr_hdbstatuschg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_DEVICE_INFO_CHANGED: + case MPI3_EVENT_LOG_DATA: + case MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE: + case MPI3_EVENT_ENCL_DEVICE_ADDED: + { + process_event_bh = 1; + break; + } + case MPI3_EVENT_ENERGY_PACK_CHANGE: + { + mpi3mr_energypackchg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_CABLE_MGMT: + { + mpi3mr_cablemgmt_evt_th(mrioc, event_reply); + break; + } + + case MPI3_EVENT_SAS_DISCOVERY: + case MPI3_EVENT_SAS_DEVICE_DISCOVERY_ERROR: + case MPI3_EVENT_SAS_BROADCAST_PRIMITIVE: + case MPI3_EVENT_PCIE_ENUMERATION: + break; + default: + break; + } + if (process_event_bh || ack_req) { + dprint_event_th(mrioc, + "scheduling bottom half handler for event(0x%02x), ack_required=%d\n", + evt_type, ack_req); + sz = event_reply->event_data_length * 4; + fwevt = mpi3mr_alloc_fwevt(sz); + if (!fwevt) + { + dprint_event_th(mrioc, + "failed to schedule bottom half handler for event(0x%02x), ack_required=%d\n", + evt_type, ack_req); + return; + } + memcpy(fwevt->event_data, event_reply->event_data, sz); + fwevt->mrioc = mrioc; + fwevt->event_id = evt_type; + fwevt->send_ack = ack_req; + fwevt->process_event = process_event_bh; + fwevt->event_context = + le32_to_cpu(event_reply->event_context); + fwevt->event_data_size = sz; + mpi3mr_fwevt_add_to_list(mrioc, fwevt); + } +} + +/** + * mpi3mr_get_fw_pending_ios - Calculate pending I/O count + * @mrioc: Adapter instance reference + * + * Calculate the pending I/Os for the controller and return. 
+ * + * Return: Number of pending I/Os + */ +static inline int mpi3mr_get_fw_pending_ios(struct mpi3mr_ioc *mrioc) +{ + u16 i; + uint pend_ios = 0; + + for (i = 0; i < mrioc->num_op_reply_q; i++) + pend_ios += atomic_read(&mrioc->op_reply_qinfo[i].pend_ios); + return pend_ios; +} + +/** + * mpi3mr_wait_for_host_io - block for I/Os to complete + * @mrioc: Adapter instance reference + * @timeout: time out in seconds + * Waits for pending I/Os for the given adapter to complete or + * to hit the timeout. + * + * Return: Nothing + */ +void mpi3mr_wait_for_host_io(struct mpi3mr_ioc *mrioc, u32 timeout) +{ + enum mpi3mr_iocstate iocstate; + int i = 0; + + iocstate = mpi3mr_get_iocstate(mrioc); + if (iocstate != MRIOC_STATE_READY) + return; + + if (!mpi3mr_get_fw_pending_ios(mrioc)) + return; + ioc_info(mrioc, + "waiting for maximum of %d seconds prior to reset for %d pending I/Os to complete\n", + timeout, mpi3mr_get_fw_pending_ios(mrioc)); + + for (i = 0; i < timeout; i++) { + if (!mpi3mr_get_fw_pending_ios(mrioc)) + break; + iocstate = mpi3mr_get_iocstate(mrioc); + if (iocstate != MRIOC_STATE_READY) + break; + msleep(1000); + } + + ioc_info(mrioc, "pending I/Os after wait is: %d\n", + mpi3mr_get_fw_pending_ios(mrioc)); +} + +/** + * mpi3mr_setup_nvme_eedp - Setup DIF info for NVMe IO request + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * @scsiio_req: MPI3 SCSI IO request + * @scsiio_flags: Pointer to MPI3 SCSI IO Flags + * + * Identifies the protection information flags from the SCSI + * command and set appropriate flags in the MPI3 SCSI IO request + * for the I/Os issued to the NVMe drives. + * + * Return: Nothing + */ +static void mpi3mr_setup_nvme_eedp(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd, struct mpi3_scsi_io_request *scsiio_req, + u32 *scsiio_flags) +{ + unsigned char prot_op = scsi_get_prot_op(scmd); + u8 host_md = 0, opcode = scmd->cmnd[0], sa = scmd->cmnd[9], xprt = 0; + + + if ((prot_op == SCSI_PROT_READ_PASS) || + (prot_op == SCSI_PROT_WRITE_PASS)) { + host_md = 1; + scsiio_req->msg_flags |= MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + } + + if (!mrioc->check_xprotect_nvme) + return; + + if (!((opcode == READ_10) || (opcode == WRITE_10) || + (opcode == READ_12) || (opcode == WRITE_12) || + (opcode == READ_16) || (opcode == WRITE_16) || + ((opcode == VARIABLE_LENGTH_CMD) && + ((sa == READ_32) || (sa == WRITE_32))))) + return; + if (opcode == VARIABLE_LENGTH_CMD) + xprt = scmd->cmnd[10] & 0xe0; + else + xprt = scmd->cmnd[1] & 0xe0; + if (!xprt) { + scsiio_req->msg_flags &= ~MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + scsiio_req->msg_flags |= + MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE; + *scsiio_flags |= MPI3_SCSIIO_FLAGS_DIVERT_REASON_PROD_SPECIFIC; + } else if (!host_md) { + scsiio_req->msg_flags |= + MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE; + *scsiio_flags |= MPI3_SCSIIO_FLAGS_DIVERT_REASON_PROD_SPECIFIC; + } +} +/** + * mpi3mr_setup_sas_eedp - Setup EEDP information for SAS IO Req + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * @scsiio_req: MPI3 SCSI IO request + * + * Identifies the protection information flags from the SCSI + * command and set appropriate flags in the MPI3 SCSI IO + * request. 
+ * + * Return: Nothing + */ +static void mpi3mr_setup_sas_eedp(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd, struct mpi3_scsi_io_request *scsiio_req) +{ + u16 eedp_flags = 0; + unsigned char prot_op = scsi_get_prot_op(scmd); + unsigned char prot_type = scsi_get_prot_type(scmd); + + scsiio_req->sgl[0].eedp.flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE; + switch (prot_op) { + case SCSI_PROT_NORMAL: + return; + case SCSI_PROT_READ_STRIP: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_CHECK_REMOVE; + break; + case SCSI_PROT_WRITE_INSERT: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_INSERT; + break; + case SCSI_PROT_READ_INSERT: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_INSERT; + scsiio_req->msg_flags |= MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + break; + case SCSI_PROT_WRITE_STRIP: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_CHECK_REMOVE; + scsiio_req->msg_flags |= MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_CHECK | + MPI3_EEDPFLAGS_CHK_REF_TAG | MPI3_EEDPFLAGS_CHK_APP_TAG | + MPI3_EEDPFLAGS_CHK_GUARD; + scsiio_req->msg_flags |= MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + break; + default: + return; + } + + switch (prot_type) { + case SCSI_PROT_DIF_TYPE0: + eedp_flags |= MPI3_EEDPFLAGS_INCR_PRI_REF_TAG; + scsiio_req->cdb.eedp32.primary_reference_tag = + cpu_to_be32(mpi3mr_kc_prot_ref_tag(scmd)); + break; + case SCSI_PROT_DIF_TYPE1: + case SCSI_PROT_DIF_TYPE2: + eedp_flags |= MPI3_EEDPFLAGS_INCR_PRI_REF_TAG | + MPI3_EEDPFLAGS_ESC_MODE_APPTAG_DISABLE | + MPI3_EEDPFLAGS_CHK_GUARD; + scsiio_req->cdb.eedp32.primary_reference_tag = + cpu_to_be32(mpi3mr_kc_prot_ref_tag(scmd)); + break; + case SCSI_PROT_DIF_TYPE3: + eedp_flags |= MPI3_EEDPFLAGS_CHK_GUARD | + MPI3_EEDPFLAGS_ESC_MODE_APPTAG_DISABLE; + break; + + default: + scsiio_req->msg_flags &= ~(MPI3_SCSIIO_MSGFLAGS_METASGL_VALID); + return; + } + + switch (scmd->device->sector_size) { + case 512: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_512; + break; + case 520: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_520; + break; + case 4080: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4080; + break; + case 4088: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4088; + break; + case 4096: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4096; + break; + case 4104: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4104; + break; + case 4160: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4160; + break; + default: + break; + } + + scsiio_req->sgl[0].eedp.eedp_flags = cpu_to_le16(eedp_flags); + scsiio_req->sgl[0].eedp.flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_EXTENDED; +} + + + +/** + * mpi3mr_build_sense_buffer - Map sense information + * @desc: Sense type + * @buf: Sense buffer to populate + * @key: Sense key + * @asc: Additional sense code + * @ascq: Additional sense code qualifier + * + * Maps the given sense information into either descriptor or + * fixed format sense data. 
+ * + * Return: Nothing + */ +static inline void mpi3mr_build_sense_buffer(int desc, u8 *buf, u8 key, + u8 asc, u8 ascq) +{ + if (desc) { + buf[0] = 0x72; /* descriptor, current */ + buf[1] = key; + buf[2] = asc; + buf[3] = ascq; + buf[7] = 0; + } else { + buf[0] = 0x70; /* fixed, current */ + buf[2] = key; + buf[7] = 0xa; + buf[12] = asc; + buf[13] = ascq; + } +} + +/** + * mpi3mr_map_eedp_error - Map EEDP errors from IOC status + * @scmd: SCSI command reference + * @ioc_status: status of MPI3 request + * + * Maps the EEDP error status of the SCSI IO request to sense + * data. + * + * Return: Nothing + */ +static void mpi3mr_map_eedp_error(struct scsi_cmnd *scmd, + u16 ioc_status) +{ + u8 ascq = 0; + + switch (ioc_status) { + case MPI3_IOCSTATUS_EEDP_GUARD_ERROR: + ascq = 0x01; + break; + case MPI3_IOCSTATUS_EEDP_APP_TAG_ERROR: + ascq = 0x02; + break; + case MPI3_IOCSTATUS_EEDP_REF_TAG_ERROR: + ascq = 0x03; + break; + default: + ascq = 0x00; + break; + } + + mpi3mr_scsi_build_sense(scmd, 0, ILLEGAL_REQUEST, 0x10, ascq); + set_host_byte(scmd, DID_ABORT); +} + +/** + * mpi3mr_process_op_reply_desc - reply descriptor handler + * @mrioc: Adapter instance reference + * @reply_desc: Operational reply descriptor + * @reply_dma: place holder for reply DMA address + * @qidx: Operational queue index + * + * Process the operational reply descriptor and identifies the + * descriptor type. Based on the descriptor map the MPI3 request + * status to a SCSI command status and calls scsi_done call + * back. + * + * Return: Nothing + */ +void mpi3mr_process_op_reply_desc(struct mpi3mr_ioc *mrioc, + struct mpi3_default_reply_descriptor *reply_desc, u64 *reply_dma, + u16 qidx) +{ + u16 reply_desc_type, host_tag = 0; + u16 ioc_status = MPI3_IOCSTATUS_SUCCESS; + u32 ioc_loginfo = 0; + struct mpi3_status_reply_descriptor *status_desc = NULL; + struct mpi3_address_reply_descriptor *addr_desc = NULL; + struct mpi3_success_reply_descriptor *success_desc = NULL; + struct mpi3_scsi_io_reply *scsi_reply = NULL; + struct scsi_cmnd *scmd = NULL; + struct scmd_priv *priv = NULL; + u8 *sense_buf = NULL; + u8 scsi_state = 0, scsi_status = 0, sense_state = 0; + u32 xfer_count = 0, sense_count = 0, resp_data = 0; + u16 dev_handle = 0xFFFF; + struct scsi_sense_hdr sshdr; + struct mpi3mr_stgt_priv_data *stgt_priv_data = NULL; + struct mpi3mr_sdev_priv_data *sdev_priv_data = NULL; + u32 ioc_pend_data_len = 0, tg_pend_data_len = 0, data_len_blks = 0; + struct mpi3mr_throttle_group_info *tg = NULL; + u8 throttle_enabled_dev = 0; + + *reply_dma = 0; + reply_desc_type = le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_TYPE_MASK; + switch (reply_desc_type) { + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_STATUS: + status_desc = (struct mpi3_status_reply_descriptor *)reply_desc; + host_tag = le16_to_cpu(status_desc->host_tag); + ioc_status = le16_to_cpu(status_desc->ioc_status); + if (ioc_status & + MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL) + ioc_loginfo = le32_to_cpu(status_desc->ioc_log_info); + ioc_status &= MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK; + mpi3mr_reply_trigger(mrioc, ioc_status, ioc_loginfo); + break; + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_ADDRESS_REPLY: + addr_desc = (struct mpi3_address_reply_descriptor *)reply_desc; + *reply_dma = le64_to_cpu(addr_desc->reply_frame_address); + scsi_reply = mpi3mr_get_reply_virt_addr(mrioc, + *reply_dma); + if (!scsi_reply) { + ioc_err(mrioc, "NULL address reply is received, qidx %d\n", + qidx); + goto out; + } + host_tag = le16_to_cpu(scsi_reply->host_tag); 
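/*
 * The three reply descriptor types handled by this switch differ in
 * how much completion state they carry:
 *
 *  TYPE_STATUS        - inline ioc_status (plus ioc_log_info when the
 *                       LOGINFOAVAIL bit is set) and the host_tag; no
 *                       reply frame is referenced.
 *  TYPE_ADDRESS_REPLY - carries the DMA address of a full
 *                       struct mpi3_scsi_io_reply, from which
 *                       scsi_status, scsi_state, dev_handle,
 *                       transfer_count, sense_count and the sense
 *                       buffer address are extracted below.
 *  TYPE_SUCCESS       - the command completed cleanly; only the
 *                       host_tag is needed and the result is set to
 *                       DID_OK further down.
 */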
+ ioc_status = le16_to_cpu(scsi_reply->ioc_status); + scsi_status = scsi_reply->scsi_status; + scsi_state = scsi_reply->scsi_state; + dev_handle = le16_to_cpu(scsi_reply->dev_handle); + sense_state = (scsi_state & MPI3_SCSI_STATE_SENSE_MASK); + xfer_count = le32_to_cpu(scsi_reply->transfer_count); + sense_count = le32_to_cpu(scsi_reply->sense_count); + resp_data = le32_to_cpu(scsi_reply->response_data); + sense_buf = mpi3mr_get_sensebuf_virt_addr(mrioc, + le64_to_cpu(scsi_reply->sense_data_buffer_address)); + if (ioc_status & + MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL) + ioc_loginfo = le32_to_cpu(scsi_reply->ioc_log_info); + ioc_status &= MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK; + if (sense_state == MPI3_SCSI_STATE_SENSE_BUFF_Q_EMPTY) + ioc_err(mrioc, + "controller cannot transfer sense data due to empty sense buffer queue\n"); + if (sense_buf) { + scsi_normalize_sense(sense_buf, sense_count, &sshdr); + mpi3mr_scsisense_trigger(mrioc, sshdr.sense_key, + sshdr.asc, sshdr.ascq); + } + mpi3mr_reply_trigger(mrioc, ioc_status, ioc_loginfo); + break; + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_SUCCESS: + success_desc = (struct mpi3_success_reply_descriptor *) + reply_desc; + host_tag = le16_to_cpu(success_desc->host_tag); + break; + default: + break; + } + + scmd = mpi3mr_scmd_from_host_tag(mrioc, host_tag, qidx); + if (!scmd) { + ioc_err(mrioc, "cannot identify scmd for host_tag %d\n", + host_tag); + goto out; + } + priv = scsi_cmd_priv(scmd); + + data_len_blks = scsi_bufflen(scmd) >> 9; + sdev_priv_data = scmd->device->hostdata; + if (sdev_priv_data) { + stgt_priv_data = sdev_priv_data->tgt_priv_data; + if (stgt_priv_data) { + tg = stgt_priv_data->throttle_group; + throttle_enabled_dev = + stgt_priv_data->io_throttle_enabled; + } + } + if (unlikely((data_len_blks >= mrioc->io_throttle_data_length) && + throttle_enabled_dev)) { + ioc_pend_data_len = atomic_sub_return(data_len_blks, + &mrioc->pend_large_data_sz); + if (tg) { + tg_pend_data_len = atomic_sub_return(data_len_blks, + &tg->pend_large_data_sz); +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "large vd_io completion persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), tg_pending(%d), ioc_low(%d), tg_low(%d)\n", + stgt_priv_data->perst_id, + stgt_priv_data->dev_handle, + data_len_blks, ioc_pend_data_len, + tg_pend_data_len, + mrioc->io_throttle_low, + tg->low); +#endif + if (tg->io_divert && ((ioc_pend_data_len <= + mrioc->io_throttle_low) && + (tg_pend_data_len <= tg->low))) { + tg->io_divert = 0; + mpi3mr_set_io_divert_for_all_vd_in_tg( + mrioc, tg, 0); + } + } else { +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "large pd_io completion persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), ioc_low(%d)\n", + stgt_priv_data->perst_id, + stgt_priv_data->dev_handle, + data_len_blks, ioc_pend_data_len, + mrioc->io_throttle_low); +#endif + if ( ioc_pend_data_len <= mrioc->io_throttle_low) + stgt_priv_data->io_divert = 0; + } + } else if (unlikely((stgt_priv_data && stgt_priv_data->io_divert))) { + ioc_pend_data_len = atomic_read(&mrioc->pend_large_data_sz); + if (!tg) { +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "pd_io completion persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), ioc_low(%d)\n", + stgt_priv_data->perst_id, + stgt_priv_data->dev_handle, + data_len_blks, ioc_pend_data_len, + mrioc->io_throttle_low); +#endif + if ( ioc_pend_data_len <= mrioc->io_throttle_low) + stgt_priv_data->io_divert = 0; + + } else if 
(ioc_pend_data_len <= mrioc->io_throttle_low) { + tg_pend_data_len = atomic_read(&tg->pend_large_data_sz); +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "vd_io completion persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), tg_pending(%d), ioc_low(%d), tg_low(%d)\n", + stgt_priv_data->perst_id, + stgt_priv_data->dev_handle, + data_len_blks, ioc_pend_data_len, + tg_pend_data_len, + mrioc->io_throttle_low, + tg->low); +#endif + if (tg->io_divert && (tg_pend_data_len <= tg->low)) { + tg->io_divert = 0; + mpi3mr_set_io_divert_for_all_vd_in_tg( + mrioc, tg, 0); + } + + } + } + + if (success_desc) { + set_host_byte(scmd, DID_OK); + goto out_success; + } + scsi_set_resid(scmd, scsi_bufflen(scmd) - xfer_count); + if (ioc_status == MPI3_IOCSTATUS_SCSI_DATA_UNDERRUN && + xfer_count == 0 && (scsi_status == MPI3_SCSI_STATUS_BUSY || + scsi_status == MPI3_SCSI_STATUS_RESERVATION_CONFLICT || + scsi_status == MPI3_SCSI_STATUS_TASK_SET_FULL)) + ioc_status = MPI3_IOCSTATUS_SUCCESS; + + if ((sense_state == MPI3_SCSI_STATE_SENSE_VALID) && sense_count && + sense_buf) { + u32 sz = min_t(u32, SCSI_SENSE_BUFFERSIZE, sense_count); + + memcpy(scmd->sense_buffer, sense_buf, sz); + } + + switch (ioc_status) { + case MPI3_IOCSTATUS_BUSY: + case MPI3_IOCSTATUS_INSUFFICIENT_RESOURCES: + scmd->result = SAM_STAT_BUSY; + break; + case MPI3_IOCSTATUS_SCSI_DEVICE_NOT_THERE: + set_host_byte(scmd, DID_NO_CONNECT); + break; + case MPI3_IOCSTATUS_SCSI_IOC_TERMINATED: + set_host_byte(scmd, DID_SOFT_ERROR); + break; + case MPI3_IOCSTATUS_SCSI_TASK_TERMINATED: + case MPI3_IOCSTATUS_SCSI_EXT_TERMINATED: + set_host_byte(scmd, DID_RESET); + break; + case MPI3_IOCSTATUS_SCSI_RESIDUAL_MISMATCH: + if ((xfer_count == 0) || (scmd->underflow > xfer_count)) + set_host_byte(scmd, DID_SOFT_ERROR); + else { + scmd->result |= scsi_status; + set_host_byte(scmd, DID_OK); + } + break; + case MPI3_IOCSTATUS_SCSI_DATA_UNDERRUN: + scmd->result |= scsi_status; + set_host_byte(scmd, DID_OK); + if (sense_state == MPI3_SCSI_STATE_SENSE_VALID) + break; + if (xfer_count < scmd->underflow) { + if (scsi_status == SAM_STAT_BUSY) + scmd->result |= SAM_STAT_BUSY; + else + set_host_byte(scmd, DID_SOFT_ERROR); + } else if ((scsi_state & (MPI3_SCSI_STATE_NO_SCSI_STATUS)) || + (sense_state != MPI3_SCSI_STATE_SENSE_NOT_AVAILABLE)) + set_host_byte(scmd, DID_SOFT_ERROR); + else if (scsi_state & MPI3_SCSI_STATE_TERMINATED) + set_host_byte(scmd, DID_RESET); + break; + case MPI3_IOCSTATUS_SCSI_DATA_OVERRUN: + scsi_set_resid(scmd, 0); + /* fall through */ + fallthrough; + case MPI3_IOCSTATUS_SCSI_RECOVERED_ERROR: + case MPI3_IOCSTATUS_SUCCESS: + scmd->result |= scsi_status; + set_host_byte(scmd, DID_OK); + if ((scsi_state & (MPI3_SCSI_STATE_NO_SCSI_STATUS)) || + (sense_state == MPI3_SCSI_STATE_SENSE_FAILED) || + (sense_state == MPI3_SCSI_STATE_SENSE_BUFF_Q_EMPTY)) + set_host_byte(scmd, DID_SOFT_ERROR); + else if (scsi_state & MPI3_SCSI_STATE_TERMINATED) + set_host_byte(scmd, DID_RESET); + break; + case MPI3_IOCSTATUS_EEDP_GUARD_ERROR: + case MPI3_IOCSTATUS_EEDP_REF_TAG_ERROR: + case MPI3_IOCSTATUS_EEDP_APP_TAG_ERROR: + mpi3mr_map_eedp_error(scmd, ioc_status); + break; + case MPI3_IOCSTATUS_SCSI_PROTOCOL_ERROR: + case MPI3_IOCSTATUS_INVALID_FUNCTION: + case MPI3_IOCSTATUS_INVALID_SGL: + case MPI3_IOCSTATUS_INTERNAL_ERROR: + case MPI3_IOCSTATUS_INVALID_FIELD: + case MPI3_IOCSTATUS_INVALID_STATE: + case MPI3_IOCSTATUS_SCSI_IO_DATA_ERROR: + case MPI3_IOCSTATUS_SCSI_TASK_MGMT_FAILED: + case MPI3_IOCSTATUS_INSUFFICIENT_POWER: + default: + 
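/*
 * Worked example for the MPI3_IOCSTATUS_EEDP_* cases above, assuming
 * mpi3mr_scsi_build_sense() takes the fixed-format (desc == 0) path of
 * mpi3mr_build_sense_buffer() shown earlier: a guard error results in
 *
 *     mpi3mr_scsi_build_sense(scmd, 0, ILLEGAL_REQUEST, 0x10, 0x01);
 *
 * which yields a sense buffer of
 *
 *     buf[0]  = 0x70   fixed format, current error
 *     buf[2]  = 0x05   ILLEGAL_REQUEST
 *     buf[7]  = 0x0a   additional sense length
 *     buf[12] = 0x10   ASC
 *     buf[13] = 0x01   ASCQ (guard check failed; app tag and ref tag
 *                      errors use ASCQ 0x02 and 0x03 respectively)
 *
 * and the host byte is set to DID_ABORT by mpi3mr_map_eedp_error().
 */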
set_host_byte(scmd, DID_SOFT_ERROR); + break; + } + + if ((mrioc->logging_level & MPI3_DEBUG_SCSI_ERROR) && + (scmd->result != (DID_OK << 16)) && (scmd->cmnd[0] != ATA_12) && + (scmd->cmnd[0] != ATA_16)) { + ioc_info(mrioc, + "host_tag(%d): qid(%d): command issued to handle(0x%04x) returned with ioc_status(0x%04x), log_info(0x%08x), scsi_state(0x%02x), scsi_status(0x%02x), xfer_count(%d), resp_data(0x%08x) scmd_result(0x%08x)\n", + host_tag, priv->req_q_idx+1, dev_handle, ioc_status, + ioc_loginfo, scsi_state, scsi_status, xfer_count, + resp_data, scmd->result); + if (sense_buf) + ioc_info(mrioc, + "host_tag(%d): qid(%d): sense_count(%d), sense_key(0x%x), ASC(0x%x,) ASCQ(0x%x)\n", + host_tag, priv->req_q_idx+1, sense_count, + sshdr.sense_key, sshdr.asc, sshdr.ascq); + scsi_print_command(scmd); + } +out_success: + if (priv->meta_sg_valid) { + dma_unmap_sg(&mrioc->pdev->dev, scsi_prot_sglist(scmd), + scsi_prot_sg_count(scmd), scmd->sc_data_direction); + } + mpi3mr_clear_scmd_priv(mrioc, scmd); + scsi_dma_unmap(scmd); + SCMD_DONE(scmd); +out: + if (sense_buf) + mpi3mr_repost_sense_buf(mrioc, + le64_to_cpu(scsi_reply->sense_data_buffer_address)); +} + +/** + * mpi3mr_get_chain_idx - get free chain buffer index + * @mrioc: Adapter instance reference + * + * Try to get a free chain buffer index from the free pool. + * + * Return: -1 on failure or the free chain buffer index + */ +static int mpi3mr_get_chain_idx(struct mpi3mr_ioc *mrioc) +{ + u8 retry_count = 5; + int cmd_idx = -1; + + do { + spin_lock(&mrioc->chain_buf_lock); + cmd_idx = find_first_zero_bit(mrioc->chain_bitmap, + mrioc->chain_buf_count); + if (cmd_idx < mrioc->chain_buf_count) { + set_bit(cmd_idx, mrioc->chain_bitmap); + spin_unlock(&mrioc->chain_buf_lock); + break; + } + spin_unlock(&mrioc->chain_buf_lock); + cmd_idx = -1; + } while (retry_count--); + return cmd_idx; +} + +/** + * mpi3mr_prepare_sg_scmd - build scatter gather list + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * @scsiio_req: MPI3 SCSI IO request + * + * This function maps SCSI command's data and protection SGEs to + * MPI request SGEs. If required additional 4K chain buffer is + * used to send the SGEs. 
+ * + * Return: 0 on success, -ENOMEM on dma_map_sg failure + */ +static int mpi3mr_prepare_sg_scmd(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd, struct mpi3_scsi_io_request *scsiio_req) +{ + dma_addr_t chain_dma; + struct scatterlist *sg_scmd; + void *sg_local, *chain; + u32 chain_length; + int sges_left, chain_idx; + u32 sges_in_segment; + u8 simple_sgl_flags; + u8 simple_sgl_flags_last; + u8 last_chain_sgl_flags; + struct chain_element *chain_req; + struct scmd_priv *priv = NULL; + u32 meta_sg = le32_to_cpu(scsiio_req->flags) & + MPI3_SCSIIO_FLAGS_DMAOPERATION_HOST_PI; + + priv = scsi_cmd_priv(scmd); + + simple_sgl_flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE | + MPI3_SGE_FLAGS_DLAS_SYSTEM; + simple_sgl_flags_last = simple_sgl_flags | + MPI3_SGE_FLAGS_END_OF_LIST; + last_chain_sgl_flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_LAST_CHAIN | + MPI3_SGE_FLAGS_DLAS_SYSTEM; + + if (meta_sg) + sg_local = &scsiio_req->sgl[MPI3_SCSIIO_METASGL_INDEX]; + else + sg_local = &scsiio_req->sgl; + + if (!scsiio_req->data_length && !meta_sg) { + mpi3mr_build_zero_len_sge(sg_local); + return 0; + } + + if (meta_sg) { + sg_scmd = scsi_prot_sglist(scmd); + sges_left = dma_map_sg(&mrioc->pdev->dev, + scsi_prot_sglist(scmd), + scsi_prot_sg_count(scmd), + scmd->sc_data_direction); + priv->meta_sg_valid = 1; /* To unmap meta sg DMA */ + } else { + sg_scmd = scsi_sglist(scmd); + sges_left = scsi_dma_map(scmd); + } + + if (sges_left < 0) + return -ENOMEM; + + if (sges_left > MPI3MR_SG_DEPTH) { + pr_err_ratelimited( + "sd %s: scsi_dma_map returned unsupported sge count %d!\n", + dev_name(&scmd->device->sdev_gendev), sges_left); + return -ENOMEM; + } + + sges_in_segment = (mrioc->facts.op_req_sz - + offsetof(struct mpi3_scsi_io_request, sgl)) / + sizeof(struct mpi3_sge_common); + + if (scsiio_req->sgl[0].eedp.flags == + MPI3_SGE_FLAGS_ELEMENT_TYPE_EXTENDED && !meta_sg) { + sg_local += sizeof(struct mpi3_sge_common); + sges_in_segment--; + /* Reserve 1st segment (scsiio_req->sgl[0]) for eedp */ + } + + if (scsiio_req->msg_flags == + MPI3_SCSIIO_MSGFLAGS_METASGL_VALID && !meta_sg) { + sges_in_segment--; + /* Reserve last segment (scsiio_req->sgl[3]) for meta sg */ + } + + if (meta_sg) + sges_in_segment = 1; + + if (sges_left <= sges_in_segment) + goto fill_in_last_segment; + + /* fill in main message segment when there is a chain following */ + while (sges_in_segment > 1) { + mpi3mr_add_sg_single(sg_local, simple_sgl_flags, + sg_dma_len(sg_scmd), sg_dma_address(sg_scmd)); + sg_scmd = sg_next(sg_scmd); + sg_local += sizeof(struct mpi3_sge_common); + sges_left--; + sges_in_segment--; + } + + chain_idx = mpi3mr_get_chain_idx(mrioc); + if (chain_idx < 0) + return -1; + chain_req = &mrioc->chain_sgl_list[chain_idx]; + if (meta_sg) + priv->meta_chain_idx = chain_idx; + else + priv->chain_idx = chain_idx; + + chain = chain_req->addr; + chain_dma = chain_req->dma_addr; + sges_in_segment = sges_left; + chain_length = sges_in_segment * sizeof(struct mpi3_sge_common); + + mpi3mr_add_sg_single(sg_local, last_chain_sgl_flags, + chain_length, chain_dma); + + sg_local = chain; + +fill_in_last_segment: + while (sges_left > 0) { + if (sges_left == 1) + mpi3mr_add_sg_single(sg_local, + simple_sgl_flags_last, sg_dma_len(sg_scmd), + sg_dma_address(sg_scmd)); + else + mpi3mr_add_sg_single(sg_local, simple_sgl_flags, + sg_dma_len(sg_scmd), sg_dma_address(sg_scmd)); + sg_scmd = sg_next(sg_scmd); + sg_local += sizeof(struct mpi3_sge_common); + sges_left--; + } + + return 0; +} + +/** + * mpi3mr_build_sg_scmd - build scatter gather list for SCSI 
IO + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * @scsiio_req: MPI3 SCSI IO request + * + * This function calls mpi3mr_prepare_sg_scmd for constructing + * both data SGEs and protection information SGEs in the MPI + * format from the SCSI Command as appropriate . + * + * Return: return value of mpi3mr_prepare_sg_scmd. + */ +static int mpi3mr_build_sg_scmd(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd, struct mpi3_scsi_io_request *scsiio_req) +{ + int ret; + + ret = mpi3mr_prepare_sg_scmd(mrioc, scmd, scsiio_req); + if (ret) + return ret; + + if (scsiio_req->msg_flags == MPI3_SCSIIO_MSGFLAGS_METASGL_VALID) { + /* There is a valid meta sg */ + scsiio_req->flags |= + cpu_to_le32(MPI3_SCSIIO_FLAGS_DMAOPERATION_HOST_PI); + ret = mpi3mr_prepare_sg_scmd(mrioc, scmd, scsiio_req); + } + + return ret; +} + +/** + * mpi3mr_tm_response_name - get TM response as a string + * @resp_code: TM response code + * + * Convert known task management response code as a readable + * string. + * + * Return: response code string. + */ +static const char* mpi3mr_tm_response_name(u8 resp_code) +{ + char *desc; + + switch (resp_code) { + case MPI3_SCSITASKMGMT_RSPCODE_TM_COMPLETE: + desc = "task management request completed"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_INVALID_FRAME: + desc = "invalid frame"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_FUNCTION_NOT_SUPPORTED: + desc = "task management request not supported"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_FAILED: + desc = "task management request failed"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_SUCCEEDED: + desc = "task management request succeeded"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_INVALID_LUN: + desc = "invalid LUN"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_OVERLAPPED_TAG: + desc = "overlapped tag attempted"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_IO_QUEUED_ON_IOC: + desc = "task queued, however not sent to target"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_NVME_DENIED: + desc = "task management request denied by NVMe device"; + break; + default: + desc = "unknown"; + break; + } + + return desc; +} + +inline void mpi3mr_poll_pend_io_completions(struct mpi3mr_ioc *mrioc) +{ + int i; + int num_of_reply_queues = + mrioc->num_op_reply_q + mrioc->op_reply_q_offset; + + for (i = mrioc->op_reply_q_offset; i < num_of_reply_queues; i++) + mpi3mr_process_op_reply_q(mrioc, + mrioc->intr_info[i].op_reply_q); +} + +/** + * mpi3mr_issue_tm - Issue Task Management request + * @mrioc: Adapter instance reference + * @tm_type: Task Management type + * @handle: Device handle + * @lun: lun ID + * @htag: Host tag of the TM request + * @timeout: TM timeout value + * @drv_cmd: Internal command tracker + * @resp_code: Response code place holder + * @scmd: SCSI command + * + * Issues a Task Management Request to the controller for a + * specified target, lun and command and wait for its completion + * and check TM response. Recover the TM if it timed out by + * issuing controller reset. 
+ * + * Return: 0 on success, non-zero on errors + */ +int mpi3mr_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type, + u16 handle, uint lun, u16 htag, ulong timeout, + struct mpi3mr_drv_cmd *drv_cmd, + u8 *resp_code, struct scsi_cmnd *scmd) +{ + struct mpi3_scsi_task_mgmt_request tm_req; + struct mpi3_scsi_task_mgmt_reply *tm_reply = NULL; + int retval = 0; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + struct op_req_qinfo *op_req_q = NULL; + struct scmd_priv *cmd_priv = NULL; + struct scsi_device *sdev = NULL; + struct mpi3mr_sdev_priv_data *sdev_priv_data = NULL; + + if (mrioc->unrecoverable) { + retval = -1; + dprint_tm(mrioc, "sending task management failed due to unrecoverable controller\n"); + goto out; + } + + memset(&tm_req, 0, sizeof(tm_req)); + mutex_lock(&drv_cmd->mutex); + if (drv_cmd->state & MPI3MR_CMD_PENDING) { + retval = -1; + dprint_tm(mrioc, "sending task management failed due to command in use\n"); + mutex_unlock(&drv_cmd->mutex); + goto out; + } + if (mrioc->reset_in_progress) { + retval = -1; + dprint_tm(mrioc, "sending task management failed due to controller reset\n"); + mutex_unlock(&drv_cmd->mutex); + goto out; + } + + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 1; + drv_cmd->callback = NULL; + tm_req.dev_handle = cpu_to_le16(handle); + tm_req.task_type = tm_type; + tm_req.host_tag = cpu_to_le16(htag); + + int_to_scsilun(lun, (struct scsi_lun *)tm_req.lun); + tm_req.function = MPI3_FUNCTION_SCSI_TASK_MGMT; + + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + + if (scmd) { + if (tm_type == MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK) { + cmd_priv = scsi_cmd_priv(scmd); + if (!cmd_priv) + goto out_unlock; + op_req_q = &mrioc->req_qinfo[cmd_priv->req_q_idx]; + tm_req.task_host_tag = cpu_to_le16(cmd_priv->host_tag); + tm_req.task_request_queue_id = + cpu_to_le16(op_req_q->qid); + } + + sdev = scmd->device; + sdev_priv_data = sdev->hostdata; + scsi_tgt_priv_data = ((sdev_priv_data) ? 
+ sdev_priv_data->tgt_priv_data : NULL); + } else { + if (tgtdev && tgtdev->starget && tgtdev->starget->hostdata) + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + } + + if (scsi_tgt_priv_data) + atomic_inc(&scsi_tgt_priv_data->block_io); + + if (tgtdev && (tgtdev->dev_type == MPI3_DEVICE_DEVFORM_PCIE)) { + if (cmd_priv && tgtdev->dev_spec.pcie_inf.abort_to) + timeout = tgtdev->dev_spec.pcie_inf.abort_to; + else if (!cmd_priv && tgtdev->dev_spec.pcie_inf.reset_to) + timeout = tgtdev->dev_spec.pcie_inf.reset_to; + } + + dprint_tm(mrioc, "posting task management request: type(%d), handle(0x%04x)\n", + tm_type, handle); + init_completion(&drv_cmd->done); + retval = mpi3mr_admin_request_post(mrioc, &tm_req, sizeof(tm_req), 1); + if (retval) { + dprint_tm(mrioc, "posting task management request is failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&drv_cmd->done, (timeout * HZ)); + + if (!(drv_cmd->state & MPI3MR_CMD_COMPLETE)) { + drv_cmd->is_waiting = 0; + retval = -1; + if (!(drv_cmd->state & MPI3MR_CMD_RESET)) { + dprint_tm(mrioc, + "task management request timed out after %ld seconds\n", + timeout); + if (mrioc->logging_level & MPI3_DEBUG_TM) + dprint_dump(&tm_req, sizeof(tm_req), + "mpi3_task_mgmt_req"); + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_TM_TIMEOUT, 1); + } + goto out_unlock; + } + + if (!(drv_cmd->state & MPI3MR_CMD_REPLY_VALID)) { + dprint_tm(mrioc, "invalid task management reply message\n"); + retval = -1; + goto out_unlock; + } + + tm_reply = (struct mpi3_scsi_task_mgmt_reply *)drv_cmd->reply; + + switch (drv_cmd->ioc_status) { + case MPI3_IOCSTATUS_SUCCESS: + *resp_code = le32_to_cpu(tm_reply->response_data) & + MPI3MR_RI_MASK_RESPCODE; + break; + case MPI3_IOCSTATUS_SCSI_IOC_TERMINATED: + *resp_code = MPI3_SCSITASKMGMT_RSPCODE_TM_COMPLETE; + break; + default: + dprint_tm(mrioc, + "task management request to handle(0x%04x) is failed with ioc_status(0x%04x) log_info(0x%08x)\n", + handle, drv_cmd->ioc_status, drv_cmd->ioc_loginfo); + retval = -1; + goto out_unlock; + } + + switch (*resp_code) { + case MPI3_SCSITASKMGMT_RSPCODE_TM_SUCCEEDED: + case MPI3_SCSITASKMGMT_RSPCODE_TM_COMPLETE: + break; + case MPI3_SCSITASKMGMT_RSPCODE_IO_QUEUED_ON_IOC: + if (tm_type != MPI3_SCSITASKMGMT_TASKTYPE_QUERY_TASK) + retval = -1; + break; + default: + retval = -1; + break; + } + + dprint_tm(mrioc, + "task management request type(%d) completed for handle(0x%04x) with ioc_status(0x%04x), log_info(0x%08x), termination_count(%d), response:%s(0x%x)\n", + tm_type, handle, drv_cmd->ioc_status, drv_cmd->ioc_loginfo, + le32_to_cpu(tm_reply->termination_count), + mpi3mr_tm_response_name(*resp_code), *resp_code); + + if (!retval) { + mpi3mr_ioc_disable_intr(mrioc); + mpi3mr_poll_pend_io_completions(mrioc); + mpi3mr_ioc_enable_intr(mrioc); + mpi3mr_poll_pend_io_completions(mrioc); + mpi3mr_process_admin_reply_q(mrioc); + } + switch (tm_type) { + case MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET: + if (!scsi_tgt_priv_data) + break; + scsi_tgt_priv_data->pend_count = 0; + blk_mq_tagset_busy_iter(&mrioc->shost->tag_set, + mpi3mr_count_tgt_pending, + (void *)scsi_tgt_priv_data->starget); + break; + case MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET: + if (!sdev_priv_data) + break; + sdev_priv_data->pend_count = 0; + blk_mq_tagset_busy_iter(&mrioc->shost->tag_set, + mpi3mr_count_dev_pending, (void *)sdev); + break; + default: + break; + } + mpi3mr_master_trigger(mrioc, + MPI3_DRIVER2_MASTERTRIGGER_TASK_MANAGEMENT_ENABLED); + +out_unlock: + 
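/*
 * For reference, the error handling callbacks later in this file issue
 * task management through this helper as, e.g.:
 *
 *     ret = mpi3mr_issue_tm(mrioc,
 *         MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET, dev_handle,
 *         sdev_priv_data->lun_id, MPI3MR_HOSTTAG_BLK_TMS,
 *         MPI3MR_RESETTM_TIMEOUT, &mrioc->host_tm_cmds, &resp_code,
 *         scmd);
 *
 * On a successful return the reply queues have already been polled in
 * the !retval block above, so the caller only needs to check the
 * per-target or per-LUN pend_count refreshed via
 * blk_mq_tagset_busy_iter() to decide whether the outstanding commands
 * were really cleared.
 */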
drv_cmd->state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&drv_cmd->mutex); + if (scsi_tgt_priv_data) + atomic_dec_if_positive(&scsi_tgt_priv_data->block_io); + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); +out: + return retval; +} + +/** + * mpi3mr_bios_param - BIOS param callback + * @sdev: SCSI device reference + * @bdev: Block device reference + * @capacity: Capacity in logical sectors + * @params: Parameter array + * + * Just the parameters with heads/sectors/cylinders. + * + * Return: 0 always + */ +static int mpi3mr_bios_param(struct scsi_device *sdev, + struct block_device *bdev, sector_t capacity, int params[]) +{ + int heads; + int sectors; + sector_t cylinders; + ulong dummy; + + heads = 64; + sectors = 32; + + dummy = heads * sectors; + cylinders = capacity; + sector_div(cylinders, dummy); + + if ((ulong)capacity >= 0x200000) { + heads = 255; + sectors = 63; + dummy = heads * sectors; + cylinders = capacity; + sector_div(cylinders, dummy); + } + + params[0] = heads; + params[1] = sectors; + params[2] = cylinders; + return 0; +} + +#if ((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)) || \ + (KERNEL_VERSION(5, 0, 0) <= LINUX_VERSION_CODE)) +static int mpi3mr_map_queues(struct Scsi_Host *shost) +{ + struct mpi3mr_ioc *mrioc = shost_priv(shost); + int i, qoff, offset; + struct blk_mq_queue_map *map = NULL; + + offset = mrioc->op_reply_q_offset; + + for (i = 0, qoff = 0; i < HCTX_MAX_TYPES; i++) { + map = &shost->tag_set.map[i]; + + map->nr_queues = 0; + + if (i == HCTX_TYPE_DEFAULT) + map->nr_queues = mrioc->default_qcount; + else if (i == HCTX_TYPE_POLL) + map->nr_queues = mrioc->active_poll_qcount; + + if (!map->nr_queues) { + BUG_ON(i == HCTX_TYPE_DEFAULT); + continue; + } + + /* + * The poll queue(s) doesn't have an IRQ (and hence IRQ + * affinity), so use the regular blk-mq cpu mapping + */ + map->queue_offset = qoff; + if (i != HCTX_TYPE_POLL) + blk_mq_pci_map_queues(map, mrioc->pdev, offset); + else + blk_mq_map_queues(map); + + qoff += map->nr_queues; + offset += map->nr_queues; + } + + return 0; + +} + +#endif + +/** + * mpi3mr_print_pending_host_io - print pending I/Os + * @mrioc: Adapter instance reference + * + * Print number of pending I/Os and each I/O details prior to + * reset for debug purpose. + * + * Return: Nothing + */ +static void mpi3mr_print_pending_host_io(struct mpi3mr_ioc *mrioc) +{ + struct Scsi_Host *shost = mrioc->shost; + + ioc_info(mrioc, "number of pending I/O requests prior to reset: %d\n", + mpi3mr_get_fw_pending_ios(mrioc)); + blk_mq_tagset_busy_iter(&shost->tag_set, + mpi3mr_print_scmd, (void *)mrioc); +} + +/** + * mpi3mr_eh_host_reset - Host reset error handling callback + * @scmd: SCSI command reference + * + * Issue controller reset if the scmd is for a Physical Device, + * if the scmd is for RAID volume, then wait for + * MPI3MR_RAID_ERRREC_RESET_TIMEOUT and checks whether any + * pending I/Os prior to issuing reset to the controller. + * + * Return: SUCCESS of successful reset else FAILED + */ +static int mpi3mr_eh_host_reset(struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(scmd->device->host); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + u8 dev_type = MPI3_DEVICE_DEVFORM_VD; + int retval = FAILED, ret; + + sdev_printk(KERN_INFO, scmd->device, + "%s: attempting host reset! 
scmd(%p)\n", mrioc->name, scmd); + + sdev_priv_data = scmd->device->hostdata; + if (sdev_priv_data && sdev_priv_data->tgt_priv_data) { + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_type = stgt_priv_data->dev_type; + } + + if (dev_type == MPI3_DEVICE_DEVFORM_VD) { + mpi3mr_wait_for_host_io(mrioc, + MPI3MR_RAID_ERRREC_RESET_TIMEOUT); + if (!mpi3mr_get_fw_pending_ios(mrioc)) { + while (mrioc->reset_in_progress || + mrioc->prepare_for_reset) + ssleep(1); + retval = SUCCESS; + goto out; + } + } + mpi3mr_print_pending_host_io(mrioc); + + ret = mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_EH_HOS, 1); + if (ret) + goto out; + + retval = SUCCESS; +out: + sdev_printk(KERN_INFO, scmd->device, + "%s: host reset is %s for scmd(%p)\n", mrioc->name, + ((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd); + + return retval; +} + +/** + * mpi3mr_eh_target_reset - Target reset error handling callback + * @scmd: SCSI command reference + * + * Issue Target reset Task Management and verify the scmd is + * terminated successfully and return status accordingly. + * + * Return: SUCCESS of successful termination of the scmd else + * FAILED + */ +static int mpi3mr_eh_target_reset(struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(scmd->device->host); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + u16 dev_handle; + u8 resp_code = 0; + int retval = FAILED, ret = 0; + + + sdev_printk(KERN_INFO, scmd->device, + "%s: attempting target reset! scmd(%p)\n", mrioc->name, scmd); + scsi_print_command(scmd); + + sdev_priv_data = scmd->device->hostdata; + if (!sdev_priv_data || !sdev_priv_data->tgt_priv_data) { + sdev_printk(KERN_INFO, scmd->device, + "%s: target is not available, target reset is not issued\n", + mrioc->name); + retval = SUCCESS; + goto out; + } + + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_handle = stgt_priv_data->dev_handle; + if (stgt_priv_data->dev_removed) { + sdev_printk(KERN_INFO, scmd->device, + "%s:target(handle = 0x%04x) is removed, target reset is not issued\n", + mrioc->name, dev_handle); + retval = FAILED; + goto out; + } + sdev_printk(KERN_INFO, scmd->device, + "%s: target reset is issued to handle(0x%04x)\n", + mrioc->name, dev_handle); + + ret = mpi3mr_issue_tm(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET, dev_handle, + sdev_priv_data->lun_id, MPI3MR_HOSTTAG_BLK_TMS, + MPI3MR_RESETTM_TIMEOUT, &mrioc->host_tm_cmds, &resp_code, scmd); + + if (ret) + goto out; + + if (stgt_priv_data->pend_count) { + sdev_printk(KERN_INFO, scmd->device, + "%s: target has %d pending commands, target reset is failed\n", + mrioc->name, stgt_priv_data->pend_count); + goto out; + } + + retval = SUCCESS; +out: + sdev_printk(KERN_INFO, scmd->device, + "%s: target reset is %s for scmd(%p)\n", mrioc->name, + ((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd); + + return retval; +} + +/** + * mpi3mr_eh_dev_reset- Device reset error handling callback + * @scmd: SCSI command reference + * + * Issue lun reset Task Management and verify the scmd is + * terminated successfully and return status accordingly. 
+ * + * Return: SUCCESS of successful termination of the scmd else + * FAILED + */ +static int mpi3mr_eh_dev_reset(struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(scmd->device->host); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + u16 dev_handle; + u8 resp_code = 0; + int retval = FAILED, ret = 0; + + sdev_printk(KERN_INFO, scmd->device, + "%s: attempting device(LUN) reset! scmd(%p)\n", mrioc->name, scmd); + scsi_print_command(scmd); + + sdev_priv_data = scmd->device->hostdata; + if (!sdev_priv_data || !sdev_priv_data->tgt_priv_data) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device is not available, device(LUN) reset is not issued\n", + mrioc->name); + retval = SUCCESS; + goto out; + } + + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_handle = stgt_priv_data->dev_handle; + if (stgt_priv_data->dev_removed) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device(handle = 0x%04x) is removed, device(LUN) reset is not issued\n", + mrioc->name, dev_handle); + retval = FAILED; + goto out; + } + sdev_printk(KERN_INFO, scmd->device, + "%s: device(LUN) reset is issued to handle(0x%04x)\n", + mrioc->name, dev_handle); + + ret = mpi3mr_issue_tm(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET, dev_handle, + sdev_priv_data->lun_id, MPI3MR_HOSTTAG_BLK_TMS, + MPI3MR_RESETTM_TIMEOUT, &mrioc->host_tm_cmds, &resp_code, scmd); + + if (ret) + goto out; + + if (sdev_priv_data->pend_count) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device has %d pending commands, device(LUN) reset is failed\n", + mrioc->name, sdev_priv_data->pend_count); + goto out; + } + retval = SUCCESS; +out: + sdev_printk(KERN_INFO, scmd->device, + "%s: device(LUN) reset is %s for scmd(%p)\n", mrioc->name, + ((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd); + + return retval; +} + +/** + * mpi3mr_eh_abort- Abort error handling callback + * @scmd: SCSI command reference + * + * Issue Abort Task Management if the command is in LLD scope + * and verify if it is aborted successfully and return status + * accordingly. + * + * Return: SUCCESS of successful abort the scmd else FAILED + */ +static int mpi3mr_eh_abort(struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(scmd->device->host); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct scmd_priv *cmd_priv; + u16 dev_handle; + u8 resp_code = 0; + int retval = FAILED, ret = 0; + + sdev_printk(KERN_INFO, scmd->device, + "%s: attempting abort task! 
scmd(%p)\n", mrioc->name, scmd); + scsi_print_command(scmd); + + sdev_priv_data = scmd->device->hostdata; + if (!sdev_priv_data || !sdev_priv_data->tgt_priv_data) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device is not available, abort task is not issued\n", + mrioc->name); + retval = SUCCESS; + goto out; + } + + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_handle = stgt_priv_data->dev_handle; + if (stgt_priv_data->dev_removed) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device(handle = 0x%04x) is removed, abort task is not issued\n", + mrioc->name, dev_handle); + retval = FAILED; + goto out; + } + + sdev_printk(KERN_INFO, scmd->device, + "%s: scmd(%p) to be aborted is issued to handle(0x%04x)\n", + mrioc->name, scmd, dev_handle); + + cmd_priv = scsi_cmd_priv(scmd); + if (!cmd_priv->in_lld_scope || + cmd_priv->host_tag == MPI3MR_HOSTTAG_INVALID) { + sdev_printk(KERN_INFO, scmd->device, + "%s: scmd is not in LLD scope, abort task is not issued\n", + mrioc->name); + retval = SUCCESS; + goto out; + } + + ret = mpi3mr_issue_tm(mrioc, MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK, + dev_handle, sdev_priv_data->lun_id, MPI3MR_HOSTTAG_BLK_TMS, + MPI3MR_ABORTTM_TIMEOUT, &mrioc->host_tm_cmds, &resp_code, scmd); + + if (ret) + goto out; + + if (cmd_priv->in_lld_scope) { + sdev_printk(KERN_INFO, scmd->device, + "%s: scmd was not terminated, abort task is failed\n", + mrioc->name); + goto out; + } + retval = SUCCESS; +out: + sdev_printk(KERN_INFO, scmd->device, + "%s: abort task is %s for scmd(%p)\n", mrioc->name, + ((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd); + + return retval; +} + +/** + * mpi3mr_change_queue_depth- Change QD callback handler + * @sdev: SCSI device reference + * @q_depth: Queue depth + * + * Validate and limit QD and call scsi_change_queue_depth. + * + * Return: return value of scsi_change_queue_depth + */ +static int mpi3mr_change_queue_depth(struct scsi_device *sdev, + int q_depth) +{ + struct scsi_target *starget = scsi_target(sdev); + struct Scsi_Host *shost = dev_to_shost(&starget->dev); + int retval = 0; + + if (!sdev->tagged_supported) + q_depth = 1; + if (q_depth > shost->can_queue) + q_depth = shost->can_queue; + else if (!q_depth) + q_depth = MPI3MR_DEFAULT_SDEV_QD; + retval = scsi_change_queue_depth(sdev, q_depth); + sdev->max_queue_depth = sdev->queue_depth; + + return retval; +} + +/** + * mpi3mr_scan_start - Scan start callback handler + * @shost: SCSI host reference + * + * Issue port enable request asynchronously. + * + * Return: Nothing + */ +static void mpi3mr_scan_start(struct Scsi_Host *shost) +{ + struct mpi3mr_ioc *mrioc = shost_priv(shost); + + mrioc->scan_started = 1; + ioc_info(mrioc, "scan started, issuing port enable\n"); + if (mpi3mr_issue_port_enable(mrioc, 1)) { + ioc_err(mrioc, "issuing port enable failed\n"); + mrioc->scan_started = 0; + mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR; + } + +} + +/** + * mpi3mr_scan_finished - Scan finished callback handler + * @shost: SCSI host reference + * @time: Jiffies from the scan start + * + * Checks whether the port enable is completed or timedout or + * failed and set the scan status accordingly after taking any + * recovery if required. 
+ * + * Return: 1 on scan finished or timed out, 0 for in progress + */ +static int mpi3mr_scan_finished(struct Scsi_Host *shost, + unsigned long time) +{ + struct mpi3mr_ioc *mrioc = shost_priv(shost); + u32 pe_timeout = MPI3MR_PORTENABLE_TIMEOUT; + u32 ioc_status = readl(&mrioc->sysif_regs->ioc_status); + + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) || + (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) { + ioc_err(mrioc, "port enable failed due to fault or reset\n"); + mpi3mr_print_fault_info(mrioc); + mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR; + mrioc->scan_started = 0; + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = NULL; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + } + + if (time >= (pe_timeout * HZ)) { + ioc_err(mrioc, "port enable failed due to time out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_PE_TIMEOUT); + mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR; + mrioc->scan_started = 0; + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = NULL; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + } + + if (mrioc->scan_started) + return 0; + + if (mrioc->scan_failed) { + ioc_err(mrioc, + "port enable failed with status=0x%04x\n", + mrioc->scan_failed); + } else + ioc_info(mrioc, "port enable is successfully completed\n"); + + mpi3mr_start_watchdog(mrioc); + mrioc->is_driver_loading = 0; + mrioc->block_bsgs = 0; + return 1; +} + +/** + * mpi3mr_slave_destroy - Slave destroy callback handler + * @sdev: SCSI device reference + * + * Cleanup and free per device(lun) private data. + * + * Return: Nothing. + */ +static void mpi3mr_slave_destroy(struct scsi_device *sdev) +{ + struct Scsi_Host *shost; + struct mpi3mr_ioc *mrioc; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data; + struct mpi3mr_tgt_dev *tgt_dev = NULL; + unsigned long flags; + struct scsi_target *starget; + struct sas_rphy *rphy = NULL; + + if (!sdev->hostdata) + return; + + starget = scsi_target(sdev); + shost = dev_to_shost(&starget->dev); + mrioc = shost_priv(shost); + scsi_tgt_priv_data = starget->hostdata; + + scsi_tgt_priv_data->num_luns--; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + if (starget->channel == mrioc->scsi_device_channel) + tgt_dev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, starget->id); + else if (mrioc->sas_transport_enabled && !starget->channel) { + rphy = dev_to_rphy(starget->dev.parent); + tgt_dev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + } + + if (tgt_dev && (!scsi_tgt_priv_data->num_luns)) + tgt_dev->starget = NULL; + if (tgt_dev) + mpi3mr_tgtdev_put(tgt_dev); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + kfree(sdev->hostdata); + sdev->hostdata = NULL; +} + +/** + * mpi3mr_target_destroy - Target destroy callback handler + * @starget: SCSI target reference + * + * Cleanup and free per target private data. + * + * Return: Nothing. 
+ */ +static void mpi3mr_target_destroy(struct scsi_target *starget) +{ + struct Scsi_Host *shost; + struct mpi3mr_ioc *mrioc; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data; + struct mpi3mr_tgt_dev *tgt_dev; + unsigned long flags; + + if (!starget->hostdata) + return; + + shost = dev_to_shost(&starget->dev); + mrioc = shost_priv(shost); + scsi_tgt_priv_data = starget->hostdata; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgt_dev = __mpi3mr_get_tgtdev_from_tgtpriv(mrioc, scsi_tgt_priv_data); + if (tgt_dev && (tgt_dev->starget == starget) && + (tgt_dev->perst_id == starget->id)) + tgt_dev->starget = NULL; + if (tgt_dev) { + scsi_tgt_priv_data->tgt_dev = NULL; + scsi_tgt_priv_data->perst_id = 0; + mpi3mr_tgtdev_put(tgt_dev); + mpi3mr_tgtdev_put(tgt_dev); + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + kfree(starget->hostdata); + starget->hostdata = NULL; + +} + +/** + * mpi3mr_slave_configure - Slave configure callback handler + * @sdev: SCSI device reference + * + * Configure queue depth, max hardware sectors and virt boundary + * as required + * + * Return: 0 always. + */ +static int mpi3mr_slave_configure(struct scsi_device *sdev) +{ + struct scsi_target *starget; + struct Scsi_Host *shost; + struct mpi3mr_ioc *mrioc; + struct mpi3mr_tgt_dev *tgt_dev = NULL; + unsigned long flags; + int retval = 0; + struct sas_rphy *rphy = NULL; + + starget = scsi_target(sdev); + shost = dev_to_shost(&starget->dev); + mrioc = shost_priv(shost); + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + if (starget->channel == mrioc->scsi_device_channel) + tgt_dev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, starget->id); + else if (mrioc->sas_transport_enabled && !starget->channel) { + rphy = dev_to_rphy(starget->dev.parent); + tgt_dev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + if (!tgt_dev) + return -ENXIO; + + sdev->eh_timeout = MPI3MR_EH_SCMD_TIMEOUT; + blk_queue_rq_timeout(sdev->request_queue, MPI3MR_SCMD_TIMEOUT); + + mpi3mr_change_queue_depth(sdev, tgt_dev->q_depth); + switch (tgt_dev->dev_type) { + case MPI3_DEVICE_DEVFORM_PCIE: + /*The block layer hw sector size = 512*/ + if ((tgt_dev->dev_spec.pcie_inf.dev_info & + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) == + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE) { + blk_queue_max_hw_sectors(sdev->request_queue, + tgt_dev->dev_spec.pcie_inf.mdts / 512); + if (tgt_dev->dev_spec.pcie_inf.pgsz == 0) + blk_queue_virt_boundary(sdev->request_queue, + ((1 << MPI3MR_DEFAULT_PGSZEXP) - 1)); + else + blk_queue_virt_boundary(sdev->request_queue, + ((1 << tgt_dev->dev_spec.pcie_inf.pgsz) + - 1)); + } + break; + default: + break; + } + mpi3mr_tgtdev_put(tgt_dev); + + return retval; +} + +/** + * mpi3mr_slave_alloc -Slave alloc callback handler + * @sdev: SCSI device reference + * + * Allocate per device(lun) private data and initialize it. + * + * Return: 0 on success -ENOMEM on memory allocation failure. 
+ */ +static int mpi3mr_slave_alloc(struct scsi_device *sdev) +{ + struct Scsi_Host *shost; + struct mpi3mr_ioc *mrioc; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data; + struct mpi3mr_tgt_dev *tgt_dev = NULL; + struct mpi3mr_sdev_priv_data *scsi_dev_priv_data; + unsigned long flags; + struct scsi_target *starget; + int retval = 0; + struct sas_rphy *rphy = NULL; + + starget = scsi_target(sdev); + shost = dev_to_shost(&starget->dev); + mrioc = shost_priv(shost); + scsi_tgt_priv_data = starget->hostdata; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + + if (starget->channel == mrioc->scsi_device_channel) + tgt_dev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, starget->id); + else if (mrioc->sas_transport_enabled && !starget->channel) { + rphy = dev_to_rphy(starget->dev.parent); + tgt_dev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + } + + if (tgt_dev) { + if (tgt_dev->starget == NULL) + tgt_dev->starget = starget; + mpi3mr_tgtdev_put(tgt_dev); + retval = 0; + } else { + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return -ENXIO; + } + + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + scsi_dev_priv_data = kzalloc(sizeof(*scsi_dev_priv_data), GFP_KERNEL); + if (!scsi_dev_priv_data) + return -ENOMEM; + + scsi_dev_priv_data->lun_id = sdev->lun; + scsi_dev_priv_data->tgt_priv_data = scsi_tgt_priv_data; + sdev->hostdata = scsi_dev_priv_data; + + scsi_tgt_priv_data->num_luns++; + + return retval; +} + +/** + * mpi3mr_target_alloc - Target alloc callback handler + * @starget: SCSI target reference + * + * Allocate per target private data and initialize it. + * + * Return: 0 on success -ENOMEM on memory allocation failure. + */ +static int mpi3mr_target_alloc(struct scsi_target *starget) +{ + struct Scsi_Host *shost = dev_to_shost(&starget->dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data; + struct mpi3mr_tgt_dev *tgt_dev; + unsigned long flags; + int retval = 0; + struct sas_rphy *rphy = NULL; + + scsi_tgt_priv_data = kzalloc(sizeof(*scsi_tgt_priv_data), GFP_KERNEL); + if (!scsi_tgt_priv_data) + return -ENOMEM; + + starget->hostdata = scsi_tgt_priv_data; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + if (starget->channel == mrioc->scsi_device_channel) { + tgt_dev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, starget->id); + if (tgt_dev && !tgt_dev->is_hidden) { + scsi_tgt_priv_data->starget = starget; + scsi_tgt_priv_data->dev_handle = tgt_dev->dev_handle; + scsi_tgt_priv_data->perst_id = tgt_dev->perst_id; + scsi_tgt_priv_data->dev_type = tgt_dev->dev_type; + scsi_tgt_priv_data->tgt_dev = tgt_dev; + tgt_dev->starget = starget; + atomic_set(&scsi_tgt_priv_data->block_io, 0); + retval = 0; + if ((tgt_dev->dev_type == + MPI3_DEVICE_DEVFORM_PCIE) && + ((tgt_dev->dev_spec.pcie_inf.dev_info & + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) == + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE) && + ((tgt_dev->dev_spec.pcie_inf.dev_info & + MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_MASK) != + MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_0)) + scsi_tgt_priv_data->dev_nvme_dif = 1; + scsi_tgt_priv_data->io_throttle_enabled = + tgt_dev->io_throttle_enabled; + if (tgt_dev->dev_type == MPI3_DEVICE_DEVFORM_VD) + scsi_tgt_priv_data->throttle_group = + tgt_dev->dev_spec.vd_inf.tg; + } else + retval = -ENXIO; + } else if (mrioc->sas_transport_enabled && !starget->channel) { + rphy = dev_to_rphy(starget->dev.parent); + tgt_dev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + if 
(tgt_dev && !tgt_dev->is_hidden && !tgt_dev->non_stl && + (tgt_dev->dev_type == MPI3_DEVICE_DEVFORM_SAS_SATA)) { + scsi_tgt_priv_data->starget = starget; + scsi_tgt_priv_data->dev_handle = tgt_dev->dev_handle; + scsi_tgt_priv_data->perst_id = tgt_dev->perst_id; + scsi_tgt_priv_data->dev_type = tgt_dev->dev_type; + scsi_tgt_priv_data->tgt_dev = tgt_dev; + scsi_tgt_priv_data->io_throttle_enabled = + tgt_dev->io_throttle_enabled; + tgt_dev->starget = starget; + atomic_set(&scsi_tgt_priv_data->block_io, 0); + retval = 0; + } else + retval = -ENXIO; + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return retval; +} + +/** + * mpi3mr_check_return_unmap - Whether an unmap is allowed + * @mrioc: Adapter instance reference + * @scmd: SCSI Command reference + * + * The controller hardware cannot handle certain unmap commands + * for NVMe drives, this routine checks those and return true + * and completes the SCSI command with proper status and sense + * data. + * + * Return: TRUE for not allowed unmap, FALSE otherwise. + */ +static bool mpi3mr_check_return_unmap(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd) +{ + unsigned char *buf; + u16 param_len, desc_len, trunc_param_len; + + trunc_param_len = param_len = get_unaligned_be16(scmd->cmnd + 7); + + if (!mrioc->pdev->revision) { + if (!param_len) { + dprint_scsi_err(mrioc, "CDB received with zero parameter length\n"); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + set_host_byte(scmd, DID_OK); + SCMD_DONE(scmd); + return true; + } + + if (param_len < 24) { + dprint_scsi_err(mrioc, + "CDB received with invalid param_len: %d\n", + param_len); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + mpi3mr_scsi_build_sense(scmd, 0, + ILLEGAL_REQUEST, 0x1A, 0); + SCMD_DONE(scmd); + return true; + } + if (param_len != scsi_bufflen(scmd)) { + dprint_scsi_err(mrioc, + "CDB received with param_len: %d bufflen: %d\n", + param_len, scsi_bufflen(scmd)); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + mpi3mr_scsi_build_sense(scmd, 0, + ILLEGAL_REQUEST, 0x1A, 0); + SCMD_DONE(scmd); + return true; + } + buf = kzalloc(scsi_bufflen(scmd), GFP_ATOMIC); + if (!buf) { + mpi3mr_scsi_build_sense(scmd, 0, + ILLEGAL_REQUEST, 0x55, 0x03); + SCMD_DONE(scmd); + return true; + } + scsi_sg_copy_to_buffer(scmd, buf, scsi_bufflen(scmd)); + desc_len = get_unaligned_be16(&buf[2]); + + if (desc_len < 16) { + dprint_scsi_err(mrioc, + "invalid descriptor length in parameter list: %d\n", + desc_len); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + mpi3mr_scsi_build_sense(scmd, 0, + ILLEGAL_REQUEST, 0x26, 0); + SCMD_DONE(scmd); + kfree(buf); + return true; + } + + if (param_len > (desc_len + 8)) { + trunc_param_len = desc_len + 8; + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + dprint_scsi_err(mrioc, + "truncating param_len(%d) to desc_len+8(%d)\n", + param_len, trunc_param_len); + put_unaligned_be16(trunc_param_len, scmd->cmnd + 7); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + } + + kfree(buf); + } else { + if ((param_len > 24) && ((param_len - 8) & 0xF)) { + trunc_param_len -= (param_len - 8) & 0xF; + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + dprint_scsi_err(mrioc, + "truncating param_len from (%d) to (%d)\n", + param_len, trunc_param_len); + put_unaligned_be16(trunc_param_len, scmd->cmnd + 7); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + } + } + return false; +} + +/** + * mpi3mr_allow_scmd_to_fw - Command is allowed during shutdown + * @scmd: SCSI Command 
reference + * + * Checks whether a cdb is allowed during shutdown or not. + * + * Return: TRUE for allowed commands, FALSE otherwise. + */ + +inline bool mpi3mr_allow_scmd_to_fw(struct scsi_cmnd *scmd) +{ + switch (scmd->cmnd[0]) { + case SYNCHRONIZE_CACHE: + case START_STOP: + return true; + default: + return false; + } +} + +/** + * mpi3mr_qcmd - I/O request dispatcher + * @shost: SCSI Host reference + * @scmd: SCSI Command reference + * + * Issues the SCSI Command as an MPI3 request. + * + * Return: 0 on successful queueing of the request or if the + * request is completed with failure. + * SCSI_MLQUEUE_DEVICE_BUSY when the device is busy. + * SCSI_MLQUEUE_HOST_BUSY when the host queue is full. + */ +static int mpi3mr_qcmd(struct Scsi_Host *shost, + struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(shost); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct scmd_priv *scmd_priv_data = NULL; + struct mpi3_scsi_io_request *scsiio_req = NULL; + struct op_req_qinfo *op_req_q = NULL; + int retval = 0; + u16 dev_handle; + u16 host_tag; + u32 scsiio_flags = 0, data_len_blks = 0; + struct request *rq = SCMD_GET_REQUEST(scmd); + int iprio_class; + u8 is_pcie_dev = 0; + struct chain_element *chain_req; + u32 tracked_io_sz = 0; + u32 ioc_pend_data_len = 0, tg_pend_data_len = 0; + struct mpi3mr_throttle_group_info *tg = NULL; + + + dprint_scsi_info(mrioc, "qcmd invoked for scmd(%p)\n", scmd); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_INFO); + + if (mrioc->unrecoverable) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + + sdev_priv_data = scmd->device->hostdata; + if (!sdev_priv_data || !sdev_priv_data->tgt_priv_data) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + + if (mrioc->stop_drv_processing && + !(mpi3mr_allow_scmd_to_fw(scmd))) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + + if (mrioc->reset_in_progress || mrioc->prepare_for_reset) { + retval = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + + stgt_priv_data = sdev_priv_data->tgt_priv_data; + + if (atomic_read(&stgt_priv_data->block_io)) { + if (mrioc->stop_drv_processing) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + retval = SCSI_MLQUEUE_DEVICE_BUSY; + goto out; + } + + dev_handle = stgt_priv_data->dev_handle; + if (dev_handle == MPI3MR_INVALID_DEV_HANDLE) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + if (stgt_priv_data->dev_removed) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + +#if defined(IO_COUNTER_SUPPORT) + if (atomic_read(&mrioc->pend_ios) >= shost->can_queue) { + retval = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } +#endif + + if (stgt_priv_data->dev_type == MPI3_DEVICE_DEVFORM_PCIE) + is_pcie_dev = 1; + if ((scmd->cmnd[0] == UNMAP) && is_pcie_dev && + (mrioc->pdev->device == MPI3_MFGPAGE_DEVID_SAS4116) && + mpi3mr_check_return_unmap(mrioc, scmd)) + goto out; + + host_tag = mpi3mr_host_tag_for_scmd(mrioc, scmd); + if (host_tag == MPI3MR_HOSTTAG_INVALID) { + set_host_byte(scmd, DID_ERROR); + SCMD_DONE(scmd); + goto out; + } + + if (scmd->sc_data_direction == DMA_FROM_DEVICE) + scsiio_flags = MPI3_SCSIIO_FLAGS_DATADIRECTION_READ; + else if (scmd->sc_data_direction == DMA_TO_DEVICE) + scsiio_flags = MPI3_SCSIIO_FLAGS_DATADIRECTION_WRITE; + else + scsiio_flags = MPI3_SCSIIO_FLAGS_DATADIRECTION_NO_DATA_TRANSFER; + + scsiio_flags |= MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_SIMPLEQ; + + if 
(sdev_priv_data->ncq_prio_enable) { + iprio_class = IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); + if (iprio_class == IOPRIO_CLASS_RT) + scsiio_flags |= 1 << MPI3_SCSIIO_FLAGS_CMDPRI_SHIFT; + } + + if (scmd->cmd_len > 16) + scsiio_flags |= MPI3_SCSIIO_FLAGS_CDB_GREATER_THAN_16; + + scmd_priv_data = scsi_cmd_priv(scmd); + memset(scmd_priv_data->mpi3mr_scsiio_req, 0, MPI3MR_ADMIN_REQ_FRAME_SZ); + scsiio_req = (struct mpi3_scsi_io_request *) + scmd_priv_data->mpi3mr_scsiio_req; + scsiio_req->function = MPI3_FUNCTION_SCSI_IO; + scsiio_req->host_tag = cpu_to_le16(host_tag); + + if (!is_pcie_dev) + mpi3mr_setup_sas_eedp(mrioc, scmd, scsiio_req); + else if (stgt_priv_data->dev_nvme_dif) + mpi3mr_setup_nvme_eedp(mrioc, scmd, scsiio_req, &scsiio_flags); + + memcpy(scsiio_req->cdb.cdb32, scmd->cmnd, scmd->cmd_len); + scsiio_req->data_length = cpu_to_le32(scsi_bufflen(scmd)); + scsiio_req->dev_handle = cpu_to_le16(dev_handle); + + int_to_scsilun(sdev_priv_data->lun_id, + (struct scsi_lun *)scsiio_req->lun); + + if (mpi3mr_build_sg_scmd(mrioc, scmd, scsiio_req)) { + mpi3mr_clear_scmd_priv(mrioc, scmd); + retval = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + if (mrioc->logging_level & MPI3_DEBUG_SG) { + dprint_dump(scmd_priv_data->mpi3mr_scsiio_req, + MPI3MR_ADMIN_REQ_FRAME_SZ, "mpi3_scsi_io_req"); + if (scmd_priv_data->chain_idx >= 0) { + chain_req = + &mrioc->chain_sgl_list[scmd_priv_data->chain_idx]; + dprint_dump(chain_req->addr, MPI3MR_CHAINSGE_SIZE, + "chain_sge"); + } + if (scmd_priv_data->meta_chain_idx > 0) { + chain_req = + &mrioc->chain_sgl_list[scmd_priv_data->meta_chain_idx]; + ioc_info(mrioc, "meta SGE\n"); + dprint_dump(chain_req->addr, MPI3MR_CHAINSGE_SIZE, + "meta_chain_sge"); + } + } + op_req_q = &mrioc->req_qinfo[scmd_priv_data->req_q_idx]; + data_len_blks = scsi_bufflen(scmd) >> 9; + if ((data_len_blks >= mrioc->io_throttle_data_length) && + stgt_priv_data->io_throttle_enabled) { + tracked_io_sz = data_len_blks; + tg = stgt_priv_data->throttle_group; + if (tg) { + ioc_pend_data_len = atomic_add_return(data_len_blks, + &mrioc->pend_large_data_sz); + tg_pend_data_len = atomic_add_return(data_len_blks, + &tg->pend_large_data_sz); +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "large vd_io persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), tg_pending(%d), ioc_high(%d), tg_high(%d)\n", + stgt_priv_data->perst_id, dev_handle, + data_len_blks, ioc_pend_data_len, + tg_pend_data_len, mrioc->io_throttle_high, + tg->high); +#endif + if (!tg->io_divert && ((ioc_pend_data_len >= + mrioc->io_throttle_high) || + (tg_pend_data_len >= tg->high))) { + tg->io_divert = 1; + tg->need_qd_reduction = 1; + mpi3mr_set_io_divert_for_all_vd_in_tg(mrioc, + tg, 1); + mpi3mr_queue_qd_reduction_event(mrioc, tg); + } + } else { + ioc_pend_data_len = atomic_add_return(data_len_blks, + &mrioc->pend_large_data_sz); +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "large pd_io persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), ioc_high(%d)\n", + stgt_priv_data->perst_id, dev_handle, + data_len_blks, ioc_pend_data_len, + mrioc->io_throttle_high); +#endif + if ( ioc_pend_data_len >= mrioc->io_throttle_high) + stgt_priv_data->io_divert = 1; + } + } + + if (stgt_priv_data->io_divert) { +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) { + scsi_print_command(scmd); + ioc_info(mrioc, "setting divert flag for host_tag(%d), qid(%d)\n", + host_tag, scmd_priv_data->req_q_idx); + } +#endif + scsiio_req->msg_flags |= + MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE; + 
scsiio_flags |= MPI3_SCSIIO_FLAGS_DIVERT_REASON_IO_THROTTLING; + } + scsiio_req->flags = cpu_to_le32(scsiio_flags); + + if (mpi3mr_op_request_post(mrioc, op_req_q, + scmd_priv_data->mpi3mr_scsiio_req)) { + mpi3mr_clear_scmd_priv(mrioc, scmd); + retval = SCSI_MLQUEUE_HOST_BUSY; + if (tracked_io_sz) { + atomic_sub(tracked_io_sz, &mrioc->pend_large_data_sz); + if (tg) + atomic_sub(tracked_io_sz, + &tg->pend_large_data_sz); + } + goto out; + } + dprint_scsi_info(mrioc, "sent scmd(%p) to the controller\n", scmd); + +out: + return retval; +} + +static struct scsi_host_template mpi3mr_driver_template = { + .module = THIS_MODULE, + .name = "MPI3 Storage Controller", + .proc_name = MPI3MR_DRIVER_NAME, + .queuecommand = mpi3mr_qcmd, + .target_alloc = mpi3mr_target_alloc, + .slave_alloc = mpi3mr_slave_alloc, + .slave_configure = mpi3mr_slave_configure, + .target_destroy = mpi3mr_target_destroy, + .slave_destroy = mpi3mr_slave_destroy, + .scan_finished = mpi3mr_scan_finished, + .scan_start = mpi3mr_scan_start, + .change_queue_depth = mpi3mr_change_queue_depth, + .eh_abort_handler = mpi3mr_eh_abort, + .eh_device_reset_handler = mpi3mr_eh_dev_reset, + .eh_target_reset_handler = mpi3mr_eh_target_reset, + .eh_host_reset_handler = mpi3mr_eh_host_reset, + .bios_param = mpi3mr_bios_param, +#if ((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)) || \ + (KERNEL_VERSION(5, 0, 0) <= LINUX_VERSION_CODE)) + .map_queues = mpi3mr_map_queues, +#endif +#if (KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE) + .mq_poll = mpi3mr_blk_mq_poll, +#endif + .no_write_same = 1, + .can_queue = 1, + .this_id = -1, + .sg_tablesize = MPI3MR_SG_DEPTH, + .max_sectors = MPI3MR_MAX_SECTORS, + .cmd_per_lun = MPI3MR_MAX_CMDS_LUN, +#if (KERNEL_VERSION(5, 0, 0) > LINUX_VERSION_CODE) + .use_clustering = ENABLE_CLUSTERING, +#endif +#if (KERNEL_VERSION(5, 3, 0) <= LINUX_VERSION_CODE) + .max_segment_size = 0xffffffff, +#endif +#if (KERNEL_VERSION(5, 16, 0) > LINUX_VERSION_CODE) + .shost_attrs = mpi3mr_host_attrs, + .sdev_attrs = mpi3mr_dev_attrs, +#else + .shost_groups = mpi3mr_host_groups, + .sdev_groups = mpi3mr_dev_groups, +#endif + .track_queue_depth = 1, + .cmd_size = sizeof(struct scmd_priv), +}; + + +/** + * mpi3mr_init_drv_cmd - Initialize internal command tracker + * @cmdptr: Internal command tracker + * @host_tag: Host tag used for the specific command + * + * Initialize the internal command tracker structure with + * specified host tag. + * + * Return: Nothing. + */ +static inline void mpi3mr_init_drv_cmd(struct mpi3mr_drv_cmd *cmdptr, + u16 host_tag) +{ + mutex_init(&cmdptr->mutex); + cmdptr->reply = NULL; + cmdptr->state = MPI3MR_CMD_NOTUSED; + cmdptr->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + cmdptr->host_tag = host_tag; +} + +/** + * osintfc_mrioc_security_status -Check controller secure status + * @pdev: PCI device instance + * + * Read the Device Serial Number capability from PCI config + * space and decide whether the controller is secure or not. + * + * Return: 0 on success, non-zero on failure. 
+ */ +static int +osintfc_mrioc_security_status(struct pci_dev *pdev) +{ + u32 cap_data; + int base; + u32 ctlr_status; + u32 debug_status; + int retval = 0; + + base = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DSN); + if (!base) { + dev_err(&pdev->dev, "PCI_EXT_CAP_ID_DSN is not supported\n"); + return -1; + } + + pci_read_config_dword(pdev, base + 4, &cap_data); + + debug_status = cap_data & MPI3MR_CTLR_SECURE_DBG_STATUS_MASK; + ctlr_status = cap_data & MPI3MR_CTLR_SECURITY_STATUS_MASK; + + switch (ctlr_status) { + case MPI3MR_INVALID_DEVICE: + dev_err(&pdev->dev, + "non secure controller (Invalid) is detected: DID: 0x%x: SVID: 0x%x: SDID: 0x%x\n", + pdev->device, pdev->subsystem_vendor, + pdev->subsystem_device); + retval = -1; + break; + case MPI3MR_CONFIG_SECURE_DEVICE: + if (!debug_status) + dev_info(&pdev->dev, + "configurable secure controller is detected\n"); + break; + case MPI3MR_HARD_SECURE_DEVICE: + break; + case MPI3MR_TAMPERED_DEVICE: + dev_err(&pdev->dev, + "non secure controller (Tampered) is detected: DID: 0x%x: SVID: 0x%x: SDID: 0x%x\n", + pdev->device, pdev->subsystem_vendor, + pdev->subsystem_device); + retval = -1; + break; + default: + retval = -1; + break; + } + + if (!retval && debug_status) { + dev_err(&pdev->dev, + "non secure controller (Secure Debug) is detected: DID: 0x%x: SVID: 0x%x: SDID: 0x%x\n", + pdev->device, pdev->subsystem_vendor, + pdev->subsystem_device); + retval = -1; + } + + return retval; +} + +/** + * mpi3mr_probe - PCI probe callback + * @pdev: PCI device instance + * @id: PCI device ID details + * + * controller initialization routine. Checks the security status + * of the controller and if it is invalid or tampered return the + * probe without initializing the controller. Otherwise, + * allocate per adapter instance through shost_priv and + * initialize controller specific data structures, initialize + * the controller hardware, add shost to the SCSI subsystem. + * + * Return: 0 on success, non-zero on failure. 
+ */ + +static int +mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mpi3mr_ioc *mrioc = NULL; + struct Scsi_Host *shost = NULL; + int retval = 0, i, prot_mask = 0; + + if (osintfc_mrioc_security_status(pdev)) { + warn_non_secure_ctlr = 1; + return 1; /* For Invalid and Tampered device */ + } + + shost = scsi_host_alloc(&mpi3mr_driver_template, + sizeof(struct mpi3mr_ioc)); + if (!shost) { + retval = -ENODEV; + goto shost_failed; + } + + mrioc = shost_priv(shost); + mrioc->id = mrioc_ids++; + if (!pdev->revision) + mrioc->is_segqueue_enabled = false; + else + mrioc->is_segqueue_enabled = enable_segqueue; + sprintf(mrioc->driver_name, "%s", MPI3MR_DRIVER_NAME); + sprintf(mrioc->name, "%s%d", mrioc->driver_name, mrioc->id); + dev_info(&pdev->dev, "PCI device is: %s\n", mrioc->name); + INIT_LIST_HEAD(&mrioc->list); + spin_lock(&mrioc_list_lock); + list_add_tail(&mrioc->list, &mrioc_list); + spin_unlock(&mrioc_list_lock); + + spin_lock_init(&mrioc->admin_req_lock); + spin_lock_init(&mrioc->reply_free_queue_lock); + spin_lock_init(&mrioc->sbq_lock); + spin_lock_init(&mrioc->fwevt_lock); + spin_lock_init(&mrioc->tgtdev_lock); + spin_lock_init(&mrioc->watchdog_lock); + spin_lock_init(&mrioc->chain_buf_lock); + spin_lock_init(&mrioc->adm_req_q_bar_writeq_lock); + spin_lock_init(&mrioc->adm_reply_q_bar_writeq_lock); + spin_lock_init(&mrioc->sas_node_lock); + spin_lock_init(&mrioc->trigger_lock); + + INIT_LIST_HEAD(&mrioc->fwevt_list); + INIT_LIST_HEAD(&mrioc->tgtdev_list); + INIT_LIST_HEAD(&mrioc->delayed_rmhs_list); + INIT_LIST_HEAD(&mrioc->delayed_evtack_cmds_list); + INIT_LIST_HEAD(&mrioc->sas_expander_list); + INIT_LIST_HEAD(&mrioc->hba_port_table_list); + INIT_LIST_HEAD(&mrioc->enclosure_list); + + mutex_init(&mrioc->reset_mutex); + + mpi3mr_init_drv_cmd(&mrioc->init_cmds, MPI3MR_HOSTTAG_INITCMDS); + mpi3mr_init_drv_cmd(&mrioc->cfg_cmds, MPI3MR_HOSTTAG_CFG_CMDS); + mpi3mr_init_drv_cmd(&mrioc->bsg_cmds, MPI3MR_HOSTTAG_BSG_CMDS); + mpi3mr_init_drv_cmd(&mrioc->host_tm_cmds, MPI3MR_HOSTTAG_BLK_TMS); + mpi3mr_init_drv_cmd(&mrioc->pel_abort_cmd, MPI3MR_HOSTTAG_PEL_ABORT); + mpi3mr_init_drv_cmd(&mrioc->pel_cmds, MPI3MR_HOSTTAG_PEL_WAIT); + mpi3mr_init_drv_cmd(&mrioc->transport_cmds, + MPI3MR_HOSTTAG_TRANSPORT_CMDS); + + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) + mpi3mr_init_drv_cmd(&mrioc->dev_rmhs_cmds[i], + MPI3MR_HOSTTAG_DEVRMCMD_MIN + i); + + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) + mpi3mr_init_drv_cmd(&mrioc->sysfs_tm_cmds[i], + MPI3MR_HOSTTAG_SYSFS_TM_MIN + i); + + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) + mpi3mr_init_drv_cmd(&mrioc->evtack_cmds[i], + MPI3MR_HOSTTAG_EVTACKCMD_MIN + i); + + init_waitqueue_head(&mrioc->reset_waitq); + + mrioc->logging_level = logging_level; + mrioc->shost = shost; + mrioc->pdev = pdev; + mrioc->block_bsgs = 1; + + /* init shost parameters */ + shost->max_cmd_len = MPI3MR_MAX_CDB_LENGTH; + shost->max_lun = -1; + shost->unique_id = mrioc->id; + + shost->max_channel = 0; + shost->max_id = 0xFFFFFFFF; + +#if defined(HOST_TAGSET_SUPPORT) + shost->host_tagset = 1; +#endif + + if (enable_dix) { + prot_mask = SHOST_DIF_TYPE1_PROTECTION + | SHOST_DIF_TYPE2_PROTECTION + | SHOST_DIF_TYPE3_PROTECTION + | SHOST_DIX_TYPE1_PROTECTION + | SHOST_DIX_TYPE2_PROTECTION + | SHOST_DIX_TYPE3_PROTECTION; + enable_dif = true; + } else if (enable_dif) + prot_mask = SHOST_DIF_TYPE1_PROTECTION + | SHOST_DIF_TYPE2_PROTECTION + | SHOST_DIF_TYPE3_PROTECTION; + else + prot_mask = 0; + + scsi_host_set_prot(shost, prot_mask); + + if (enable_dix && 
(pdev->device == MPI3_MFGPAGE_DEVID_SAS4116) && + pdev->revision) + mrioc->check_xprotect_nvme = true; + else + mrioc->check_xprotect_nvme = false; + + ioc_info(mrioc, + "host protection capabilities enabled %s%s%s%s%s%s\n", + (prot_mask & SHOST_DIF_TYPE1_PROTECTION) ? " DIF1" : "", + (prot_mask & SHOST_DIF_TYPE2_PROTECTION) ? " DIF2" : "", + (prot_mask & SHOST_DIF_TYPE3_PROTECTION) ? " DIF3" : "", + (prot_mask & SHOST_DIX_TYPE1_PROTECTION) ? " DIX1" : "", + (prot_mask & SHOST_DIX_TYPE2_PROTECTION) ? " DIX2" : "", + (prot_mask & SHOST_DIX_TYPE3_PROTECTION) ? " DIX3" : ""); + + scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); + + snprintf(mrioc->fwevt_worker_name, sizeof(mrioc->fwevt_worker_name), + "%s%d_fwevt_wrkr", mrioc->driver_name, mrioc->id); + mrioc->fwevt_worker_thread = alloc_ordered_workqueue( + mrioc->fwevt_worker_name, 0); + if (!mrioc->fwevt_worker_thread) { + ioc_err(mrioc, "firmware worker thread creation failed\n"); + retval = -ENODEV; + goto fwevt_thread_failed; + } + + mrioc->is_driver_loading = 1; + mrioc->cpu_count = num_online_cpus(); + + if (mpi3mr_setup_resources(mrioc)) { + ioc_err(mrioc, "setup resources failed\n"); + retval = -ENODEV; + goto resource_alloc_failed; + } + if (mpi3mr_init_ioc(mrioc)) { + ioc_err(mrioc, "initializing IOC failed\n"); + retval = -ENODEV; + goto init_ioc_failed; + } + + shost->nr_hw_queues = 1; + if (mpi3mr_use_blk_mq(mrioc->shost)) { + shost->nr_hw_queues = mrioc->num_op_reply_q; +#if (KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE) + if (mrioc->active_poll_qcount) + shost->nr_maps = 3; +#endif + } + + shost->can_queue = mrioc->max_host_ios; + shost->sg_tablesize = MPI3MR_SG_DEPTH; + shost->max_id = mrioc->facts.max_perids + 1; + + retval = scsi_add_host(shost, &pdev->dev); + if (retval) { + ioc_err(mrioc, "scsi_add_host failed error:%d\n", retval); + goto addhost_failed; + } + + scsi_scan_host(shost); + mpi3mr_setup_debugfs(mrioc); + mpi3mr_bsg_init(mrioc); + return retval; + +addhost_failed: + mpi3mr_stop_watchdog(mrioc); + mpi3mr_cleanup_ioc(mrioc); +init_ioc_failed: + mpi3mr_free_mem(mrioc); + mpi3mr_cleanup_resources(mrioc); +resource_alloc_failed: + destroy_workqueue(mrioc->fwevt_worker_thread); +fwevt_thread_failed: + spin_lock(&mrioc_list_lock); + list_del(&mrioc->list); + spin_unlock(&mrioc_list_lock); + scsi_host_put(shost); +shost_failed: + return retval; +} + +/** + * mpi3mr_remove - PCI remove callback + * @pdev: PCI device instance + * + * Cleanup the IOC by issuing MUR and shutdown notification. + * Free up all memory and resources associated with the + * controllerand target devices, unregister the shost. + * + * Return: Nothing. 
+ */
+static void mpi3mr_remove(struct pci_dev *pdev)
+{
+	struct Scsi_Host *shost = pci_get_drvdata(pdev);
+	struct mpi3mr_ioc *mrioc;
+	struct workqueue_struct *wq;
+	unsigned long flags;
+	struct mpi3mr_tgt_dev *tgtdev, *tgtdev_next;
+
+	if (!shost)
+		return;
+
+	mrioc = shost_priv(shost);
+	while (mrioc->reset_in_progress || mrioc->is_driver_loading)
+		ssleep(1);
+	if (!pci_device_is_present(mrioc->pdev)) {
+		mrioc->unrecoverable = 1;
+		mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
+	}
+
+	mpi3mr_bsg_exit(mrioc);
+	mpi3mr_destroy_debugfs(mrioc);
+	mrioc->stop_drv_processing = 1;
+
+	mpi3mr_cleanup_fwevt_list(mrioc);
+	spin_lock_irqsave(&mrioc->fwevt_lock, flags);
+	wq = mrioc->fwevt_worker_thread;
+	mrioc->fwevt_worker_thread = NULL;
+	spin_unlock_irqrestore(&mrioc->fwevt_lock, flags);
+	if (wq)
+		destroy_workqueue(wq);
+
+	if (mrioc->sas_transport_enabled)
+		sas_remove_host(shost);
+	scsi_remove_host(shost);
+
+	list_for_each_entry_safe(tgtdev, tgtdev_next, &mrioc->tgtdev_list,
+	    list) {
+		mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev);
+		mpi3mr_tgtdev_del_from_list(mrioc, tgtdev);
+		mpi3mr_tgtdev_put(tgtdev);
+	}
+	mpi3mr_stop_watchdog(mrioc);
+	mpi3mr_cleanup_ioc(mrioc);
+	mpi3mr_free_mem(mrioc);
+	mpi3mr_cleanup_resources(mrioc);
+
+	spin_lock(&mrioc_list_lock);
+	list_del(&mrioc->list);
+	spin_unlock(&mrioc_list_lock);
+
+	scsi_host_put(shost);
+}
+
+/**
+ * mpi3mr_shutdown - PCI shutdown callback
+ * @pdev: PCI device instance
+ *
+ * Cleanup the IOC by issuing MUR and shutdown notification.
+ * Free up all memory and resources associated with the
+ * controller.
+ *
+ * Return: Nothing.
+ */
+static void mpi3mr_shutdown(struct pci_dev *pdev)
+{
+	struct Scsi_Host *shost = pci_get_drvdata(pdev);
+	struct mpi3mr_ioc *mrioc;
+	struct workqueue_struct *wq;
+	unsigned long flags;
+
+	if (!shost)
+		return;
+
+	mrioc = shost_priv(shost);
+	while (mrioc->reset_in_progress || mrioc->is_driver_loading)
+		ssleep(1);
+	mrioc->stop_drv_processing = 1;
+
+	mpi3mr_cleanup_fwevt_list(mrioc);
+	spin_lock_irqsave(&mrioc->fwevt_lock, flags);
+	wq = mrioc->fwevt_worker_thread;
+	mrioc->fwevt_worker_thread = NULL;
+	spin_unlock_irqrestore(&mrioc->fwevt_lock, flags);
+	if (wq)
+		destroy_workqueue(wq);
+
+	mpi3mr_stop_watchdog(mrioc);
+	mpi3mr_cleanup_ioc(mrioc);
+	mpi3mr_cleanup_resources(mrioc);
+}
+
+/**
+ * mpi3mr_suspend - PCI power management suspend callback
+ * @dev: Device struct
+ *
+ * Change the power state to the given value and cleanup the IOC
+ * by issuing MUR and shutdown notification
+ *
+ * Return: 0 always. 
+ */
+static int __maybe_unused
+mpi3mr_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct Scsi_Host *shost = pci_get_drvdata(pdev);
+	struct mpi3mr_ioc *mrioc;
+
+	if (!shost)
+		return 0;
+
+	mrioc = shost_priv(shost);
+	while (mrioc->reset_in_progress || mrioc->is_driver_loading)
+		ssleep(1);
+	mrioc->stop_drv_processing = 1;
+	mpi3mr_cleanup_fwevt_list(mrioc);
+	scsi_block_requests(shost);
+	mpi3mr_stop_watchdog(mrioc);
+	mpi3mr_cleanup_ioc(mrioc);
+
+	ioc_info(mrioc,
+	    "suspending controller pdev=0x%p, slot=%s, entering operating state\n",
+	    pdev, pci_name(pdev));
+	mpi3mr_cleanup_resources(mrioc);
+
+	return 0;
+}
+
+/**
+ * mpi3mr_resume - PCI power management resume callback
+ * @dev: Device struct
+ *
+ * Restore the power state to D0 and reinitialize the controller
+ * and resume I/O operations to the target devices
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+static int __maybe_unused
+mpi3mr_resume(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct Scsi_Host *shost = pci_get_drvdata(pdev);
+	struct mpi3mr_ioc *mrioc;
+	pci_power_t device_state = pdev->current_state;
+	int r;
+
+	if (!shost)
+		return 0;
+
+	mrioc = shost_priv(shost);
+
+	ioc_info(mrioc,
+	    "resuming controller pdev=0x%p, slot=%s, previous operating state [D%d]\n",
+	    pdev, pci_name(pdev), device_state);
+	mrioc->pdev = pdev;
+	mrioc->cpu_count = num_online_cpus();
+	r = mpi3mr_setup_resources(mrioc);
+	if (r) {
+		ioc_err(mrioc, "setup resources failed[%d]\n", r);
+		return r;
+	}
+
+	mrioc->stop_drv_processing = 0;
+	mpi3mr_invalidate_devhandles(mrioc);
+	mpi3mr_free_enclosure_list(mrioc);
+	mpi3mr_memset_buffers(mrioc);
+	r = mpi3mr_reinit_ioc(mrioc, 1);
+	if (r) {
+		ioc_err(mrioc, "resuming controller failed[%d]\n", r);
+		return r;
+	}
+	ssleep(MPI3MR_RESET_TOPOLOGY_SETTLE_TIME);
+	scsi_unblock_requests(shost);
+	mrioc->device_refresh_on = 0;
+	mpi3mr_start_watchdog(mrioc);
+
+	return 0;
+}
+
+static ssize_t event_counter_show(struct device_driver *dd, char *buf)
+{
+	return sprintf(buf, "%llu\n", atomic64_read(&event_counter));
+}
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0))
+static DRIVER_ATTR_RO(event_counter);
+#else
+static DRIVER_ATTR(event_counter, S_IRUGO, event_counter_show, NULL);
+#endif
+
+/**
+ * mpi3mr_pcierr_detected - PCI error detected callback
+ * @pdev: PCI device instance
+ * @state: channel state
+ *
+ * Template function, need to implement actual handling
+ *
+ * Return: PCI_ERS_RESULT_NEED_RESET
+ */
+static pci_ers_result_t
+mpi3mr_pcierr_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	dev_info(&pdev->dev, "%s: callback invoked state(%d)\n", __func__,
+	    state);
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * mpi3mr_pcierr_slot_reset - PCI error recovery slot reset
+ * @pdev: PCI device instance
+ *
+ * Template function, need to implement actual handling
+ *
+ * Return: PCI_ERS_RESULT_DISCONNECT
+ */
+static pci_ers_result_t mpi3mr_pcierr_slot_reset(struct pci_dev *pdev)
+{
+	dev_info(&pdev->dev, "%s: callback invoked\n", __func__);
+	return PCI_ERS_RESULT_DISCONNECT;
+}
+
+/**
+ * mpi3mr_pcierr_resume - PCI error recovery resume callback
+ * @pdev: PCI device instance
+ *
+ * Template function, need to implement actual handling
+ *
+ * Return: Nothing. 
+ */ +static void mpi3mr_pcierr_resume(struct pci_dev *pdev) +{ + dev_info(&pdev->dev, "%s: callback invoked\n", __func__); +} + +/** + * mpi3mr_pcierr_mmio_enabled - PCI error recovery callback + * @pdev: PCI device instance + * + * Template function, need to implement actual handling + * + * Return: PCI_ERS_RESULT_RECOVERED + */ +static pci_ers_result_t mpi3mr_pcierr_mmio_enabled(struct pci_dev *pdev) +{ +/* + * This is called only if _pcierr_error_detected returns + * PCI_ERS_RESULT_CAN_RECOVER. Read/Write to the device still works and + * there is no need to reset the slot + */ + dev_info(&pdev->dev, "%s: callback invoked\n", __func__); + return PCI_ERS_RESULT_RECOVERED; +} + +static const struct pci_device_id mpi3mr_pci_id_table[] = { + { + PCI_DEVICE_SUB(MPI3_MFGPAGE_VENDORID_BROADCOM, + MPI3_MFGPAGE_DEVID_SAS4116, PCI_ANY_ID, PCI_ANY_ID) + }, + { 0 } +}; +MODULE_DEVICE_TABLE(pci, mpi3mr_pci_id_table); + +static struct pci_error_handlers mpi3mr_err_handler = { + .error_detected = mpi3mr_pcierr_detected, + .mmio_enabled = mpi3mr_pcierr_mmio_enabled, + .slot_reset = mpi3mr_pcierr_slot_reset, + .resume = mpi3mr_pcierr_resume, +}; + +static SIMPLE_DEV_PM_OPS(mpi3mr_pm_ops, mpi3mr_suspend, mpi3mr_resume); + +static struct pci_driver mpi3mr_pci_driver = { + .name = MPI3MR_DRIVER_NAME, + .id_table = mpi3mr_pci_id_table, + .probe = mpi3mr_probe, + .remove = mpi3mr_remove, + .shutdown = mpi3mr_shutdown, + .err_handler = &mpi3mr_err_handler, + .driver.pm = &mpi3mr_pm_ops, +}; + +/** + * mpi3mr_init - Module init entry point + * @void: No argument + * Registers character driver interface and PCI driver. + * + * Return: Success or failure of PCI driver registration + */ +static int __init mpi3mr_init(void) +{ + int ret_val; + + pr_info("Loading %s version %s\n", MPI3MR_DRIVER_NAME, + MPI3MR_DRIVER_VERSION); + + mpi3mr_transport_template = + sas_attach_transport(&mpi3mr_transport_functions); + if (!mpi3mr_transport_template) { + pr_err("%s failed to load due to sas transport attach failure\n", + MPI3MR_DRIVER_NAME); + return -ENODEV; + } + + mpi3mr_init_debugfs(); + + ret_val = pci_register_driver(&mpi3mr_pci_driver); + if (ret_val) { + pr_err("%s failed to load due to pci register driver failure\n", + MPI3MR_DRIVER_NAME); + goto err_pci_reg_fail; + } + + ret_val = driver_create_file(&mpi3mr_pci_driver.driver, + &driver_attr_event_counter); + if (ret_val) + goto err_event_counter; + + return ret_val; + +err_event_counter: + pci_unregister_driver(&mpi3mr_pci_driver); + +err_pci_reg_fail: + mpi3mr_exit_debugfs(); + sas_release_transport(mpi3mr_transport_template); + + return ret_val; +} + + +/** + * mpi3mr_exit - Module unload entry point + * @void: No argument + * + * Registers character driver interface and PCI driver. 
+ * + * Return: Nothing + */ +static void __exit mpi3mr_exit(void) +{ + if (warn_non_secure_ctlr) + pr_warn( + "Unloading %s version %s while managing a non secure controller\n", + MPI3MR_DRIVER_NAME, MPI3MR_DRIVER_VERSION); + else + pr_info("Unloading %s version %s\n", MPI3MR_DRIVER_NAME, + MPI3MR_DRIVER_VERSION); + + driver_remove_file(&mpi3mr_pci_driver.driver, &driver_attr_event_counter); + pci_unregister_driver(&mpi3mr_pci_driver); + mpi3mr_exit_debugfs(); + sas_release_transport(mpi3mr_transport_template); +} + +module_init(mpi3mr_init); +module_exit(mpi3mr_exit); diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c new file mode 100644 index 0000000000000..def047d6e231c --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c @@ -0,0 +1,3374 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include "mpi3mr.h" + +#define MPI3MR_MAX_PHYSICAL_PHYS 32 + +static void mpi3mr_expander_node_remove(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *sas_expander); + +/** + * mpi3mr_post_transport_req - Issue transport requests and wait + * @mrioc: Adapter instance reference + * @request: Properly populated MPI3 request + * @request_sz: Size of the MPI3 request + * @reply: Pointer to return MPI3 reply + * @reply_sz: Size of the MPI3 reply buffer + * @timeout: Timeout in seconds + * @ioc_status: Pointer to return ioc status + * + * A generic function for posting MPI3 requests from the SAS + * transport layer that uses transport command infrastructure. + * This blocks for the completion of request for timeout seconds + * and if the request times out this function faults the + * controller with proper reason code. + * + * On successful completion of the request this function returns + * appropriate ioc status from the firmware back to the caller. + * + * Return: 0 on success, non-zero on failure. 
+ */ +static int mpi3mr_post_transport_req(struct mpi3mr_ioc *mrioc, void *request, + u16 request_sz, void *reply, u16 reply_sz, int timeout, + u16 *ioc_status) +{ + int retval = 0; + + mutex_lock(&mrioc->transport_cmds.mutex); + if (mrioc->transport_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending transport request failed due to command in use\n"); + mutex_unlock(&mrioc->transport_cmds.mutex); + goto out; + } + mrioc->transport_cmds.state = MPI3MR_CMD_PENDING; + mrioc->transport_cmds.is_waiting = 1; + mrioc->transport_cmds.callback = NULL; + mrioc->transport_cmds.ioc_status = 0; + mrioc->transport_cmds.ioc_loginfo = 0; + + init_completion(&mrioc->transport_cmds.done); + dprint_cfg_info(mrioc, "posting transport request\n"); + if (mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO) + dprint_dump(request, request_sz,"transport_req"); + retval = mpi3mr_admin_request_post(mrioc, request, request_sz, 1); + if (retval) { + ioc_err(mrioc, "posting transport request failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->transport_cmds.done, + (timeout * HZ)); + if (!(mrioc->transport_cmds.state & MPI3MR_CMD_COMPLETE)) { + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_SAS_TRANSPORT_TIMEOUT); + ioc_err(mrioc, "transport request timed out\n"); + retval = -1; + goto out_unlock; + } + *ioc_status = mrioc->transport_cmds.ioc_status & + MPI3_IOCSTATUS_STATUS_MASK; + if ((*ioc_status) != MPI3_IOCSTATUS_SUCCESS) + dprint_transport_err(mrioc, + "transport request returned with ioc_status(0x%04x), log_info(0x%08x)\n", + *ioc_status, mrioc->transport_cmds.ioc_loginfo); + + if ((reply) && (mrioc->transport_cmds.state & MPI3MR_CMD_REPLY_VALID)) + memcpy((u8 *)reply, mrioc->transport_cmds.reply, reply_sz); + +out_unlock: + mrioc->transport_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->transport_cmds.mutex); + +out: + return retval; +} + +/** + * __mpi3mr_expander_find_by_handle - expander search by handle + * @mrioc: Adapter instance reference + * @handle: Firmware device handle of the expander + * + * Context: The caller should acquire sas_node_lock + * + * This searches for expander device based on handle, then + * returns the sas_node object. + * + * Return: Expander sas_node object reference or NULL + */ +struct mpi3mr_sas_node *__mpi3mr_expander_find_by_handle(struct mpi3mr_ioc + *mrioc, u16 handle) +{ + struct mpi3mr_sas_node *sas_expander, *r; + + r = NULL; + list_for_each_entry(sas_expander, &mrioc->sas_expander_list, list) { + if (sas_expander->handle != handle) + continue; + r = sas_expander; + goto out; + } + out: + return r; +} + +/** + * mpi3mr_enclosure_find_by_handle - enclosure search by handle + * @mrioc: Adapter instance reference + * @handle: Firmware device handle of the enclosure + + * This searches for enclosure device based on handle, then returns the + * enclosure object. + * + * Return: Enclosure object reference or NULL + */ +struct mpi3mr_enclosure_node *mpi3mr_enclosure_find_by_handle( + struct mpi3mr_ioc*mrioc, u16 handle) +{ + struct mpi3mr_enclosure_node *enclosure_dev, *r; + r = NULL; + + list_for_each_entry(enclosure_dev, &mrioc->enclosure_list, list) { + if (le16_to_cpu(enclosure_dev->pg0.enclosure_handle) != handle) + continue; + r = enclosure_dev; + goto out; + } +out: + return r; +} + + +/** + * mpi3mr_expander_node_add - insert an expander to the list. + * @mrioc: Adapter instance reference + * @sas_expander: Expander sas node + * Context: This function will acquire sas_node_lock. 
+ *
+ * Adds a new object to the mrioc->sas_expander_list.
+ *
+ * Return: None.
+ */
+static void mpi3mr_expander_node_add(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_node *sas_expander)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+	list_add_tail(&sas_expander->list, &mrioc->sas_expander_list);
+	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+}
+
+/**
+ * mpi3mr_is_expander_device - if device is an expander
+ * @device_info: Bitfield providing information about the device
+ *
+ * Return: 1 if the device is an expander device, else 0.
+ */
+u8 mpi3mr_is_expander_device(u16 device_info)
+{
+	if ((device_info & MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK) ==
+	    MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_EXPANDER)
+		return 1;
+	else
+		return 0;
+}
+
+/**
+ * mpi3mr_get_sas_address - retrieve sas_address for handle
+ * @mrioc: Adapter instance reference
+ * @handle: Firmware device handle
+ * @sas_address: Address to hold sas address
+ *
+ * This function issues a device page0 read for the given device
+ * handle, retrieves the SAS address and returns it back to the
+ * caller.
+ *
+ * Return: 0 for success, non-zero for failure
+ */
+static int mpi3mr_get_sas_address(struct mpi3mr_ioc *mrioc, u16 handle,
+	u64 *sas_address)
+{
+	struct mpi3_device_page0 dev_pg0;
+	u16 ioc_status;
+	struct mpi3_device0_sas_sata_format *sasinf;
+
+	*sas_address = 0;
+
+	if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &dev_pg0,
+	    sizeof(dev_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE,
+	    handle))) {
+		ioc_err(mrioc, "%s: device page0 read failed\n", __func__);
+		return -ENXIO;
+	}
+
+	if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+		ioc_err(mrioc, "device page read failed for handle(0x%04x), with ioc_status(0x%04x) failure at %s:%d/%s()!\n",
+		    handle, ioc_status, __FILE__, __LINE__, __func__);
+		return -ENXIO;
+	}
+
+	if (le16_to_cpu(dev_pg0.flags) &
+	    MPI3_DEVICE0_FLAGS_CONTROLLER_DEV_HANDLE)
+		*sas_address = mrioc->sas_hba.sas_address;
+	else if (dev_pg0.device_form == MPI3_DEVICE_DEVFORM_SAS_SATA) {
+		sasinf = &dev_pg0.device_specific.sas_sata_format;
+		*sas_address = le64_to_cpu(sasinf->sas_address);
+	} else {
+		ioc_err(mrioc, "%s: device_form(%d) is not SAS_SATA\n",
+		    __func__, dev_pg0.device_form);
+		return -ENXIO;
+	}
+	return 0;
+}
+
+/**
+ * __mpi3mr_get_tgtdev_by_addr - target device search
+ * @mrioc: Adapter instance reference
+ * @sas_address: SAS address of the device
+ * @hba_port: HBA port entry
+ *
+ * This searches for the target device using the sas address and
+ * hba port pointer and then returns the mpi3mr_tgt_dev object.
+ *
+ * Return: Valid tgt_dev or NULL
+ */
+struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_addr(struct mpi3mr_ioc *mrioc,
+	u64 sas_address, struct mpi3mr_hba_port *hba_port)
+{
+	struct mpi3mr_tgt_dev *tgtdev;
+
+	assert_spin_locked(&mrioc->tgtdev_lock);
+
+	list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list)
+		if ((tgtdev->dev_type == MPI3_DEVICE_DEVFORM_SAS_SATA) &&
+		    (tgtdev->dev_spec.sas_sata_inf.sas_address == sas_address)
+		    && (tgtdev->dev_spec.sas_sata_inf.hba_port == hba_port))
+			goto found_device;
+	return NULL;
+found_device:
+	mpi3mr_tgtdev_get(tgtdev);
+	return tgtdev;
+}
+
+/**
+ * mpi3mr_get_tgtdev_by_addr - target device search
+ * @mrioc: Adapter instance reference
+ * @sas_address: SAS address of the device
+ * @hba_port: HBA port entry
+ *
+ * This searches for the target device using the sas address and
+ * hba port pointer and then returns the mpi3mr_tgt_dev object.
+ *
+ * Context: This function will acquire tgtdev_lock and will
+ * release it before returning the mpi3mr_tgt_dev object.
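+ *
+ * This is a lock-taking wrapper around
+ * __mpi3mr_get_tgtdev_by_addr().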
+ * + * Return: Valid tget_dev or NULL + */ +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_addr(struct mpi3mr_ioc *mrioc, + u64 sas_address, struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_tgt_dev *tgtdev = NULL; + unsigned long flags; + + if(!hba_port) + goto out; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_addr(mrioc, sas_address, hba_port); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + +out: + return tgtdev; +} + +/** + * mpi3mr_remove_device_by_sas_address - remove the device + * @mrioc: Adapter instance reference + * @sas_address: SAS address of the device + * @hba_port: HBA port entry + * + * This searches for target device using sas address and hba + * port pointer then removes it from the OS. + * + * Return: None + */ +void mpi3mr_remove_device_by_sas_address(struct mpi3mr_ioc *mrioc, + u64 sas_address, struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_tgt_dev *tgtdev = NULL; + unsigned long flags; + u8 was_on_tgtdev_list = 0; + + if(!hba_port) + return; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_addr(mrioc, + sas_address, hba_port); + if (tgtdev) { + if (!list_empty(&tgtdev->list)) { + list_del_init(&tgtdev->list); + was_on_tgtdev_list = 1; + mpi3mr_tgtdev_put(tgtdev); + } + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + /*SP2DO -- Needs revisit of the removal logic*/ + if (was_on_tgtdev_list) { + if (tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + } +} + +/** + * __mpi3mr_get_tgtdev_by_addr_and_rphy - target device search + * @mrioc: Adapter instance reference + * @sas_address: SAS address of the device + * @rphy: SAS transport layer rphy object + * + * This searches for target device from sas address and rphy + * pointer then return mpi3mr_tgt_dev object. + * + * Return: Valid tget_dev or NULL + */ +struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_addr_and_rphy( + struct mpi3mr_ioc *mrioc, u64 sas_address, struct sas_rphy *rphy) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if ((tgtdev->dev_type == MPI3_DEVICE_DEVFORM_SAS_SATA) && + (tgtdev->dev_spec.sas_sata_inf.sas_address == sas_address) + && (tgtdev->dev_spec.sas_sata_inf.rphy == rphy)) + goto found_device; + return NULL; +found_device: + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + + + +/** + * mpi3mr_expander_find_by_sas_address - sas expander search + * @mrioc: Adapter instance reference + * @sas_address: SAS address of expander + * @hba_port: HBA port entry + * + * Return: A valid SAS expander node or NULL. + * + */ +struct mpi3mr_sas_node *mpi3mr_expander_find_by_sas_address( + struct mpi3mr_ioc *mrioc, u64 sas_address, + struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_sas_node *sas_expander, *r=NULL; + if (!hba_port) + goto out; + + list_for_each_entry(sas_expander, &mrioc->sas_expander_list, list) { + if ((sas_expander->sas_address != sas_address) || + (sas_expander->hba_port != hba_port)) + continue; + r = sas_expander; + goto out; + } +out: + return r; +} + +/** + * __mpi3mr_sas_node_find_by_sas_address - sas node search + * @mrioc: Adapter instance reference + * @sas_address: SAS address of expander or sas host + * @hba_port: HBA port entry + * Context: Caller should acquire mrioc->sas_node_lock. 
+ *
+ * If the SAS address indicates the device is directly attached
+ * to the controller (i.e. it is the controller's SAS address),
+ * the SAS node associated with the controller is returned;
+ * otherwise the SAS address and hba port are used to identify
+ * the exact expander and the associated sas_node object is
+ * returned. If there is no match, NULL is returned.
+ *
+ * Return: A valid SAS node or NULL.
+ */
+static struct mpi3mr_sas_node *__mpi3mr_sas_node_find_by_sas_address(
+	struct mpi3mr_ioc *mrioc, u64 sas_address,
+	struct mpi3mr_hba_port *hba_port)
+{
+	if (mrioc->sas_hba.sas_address == sas_address)
+		return &mrioc->sas_hba;
+	return mpi3mr_expander_find_by_sas_address(mrioc, sas_address,
+	    hba_port);
+}
+
+/**
+ * mpi3mr_get_port_id_by_sas_phy - Get port ID of the given phy
+ * @phy: SAS transport layer phy object
+ *
+ * Return: Port number for valid ID else 0xFF
+ */
+static inline u8 mpi3mr_get_port_id_by_sas_phy(struct sas_phy *phy)
+{
+	u8 port_id = 0xFF;
+	struct mpi3mr_hba_port *hba_port = phy->hostdata;
+
+	if (hba_port)
+		port_id = hba_port->port_id;
+
+	return port_id;
+}
+
+/**
+ * mpi3mr_parent_present - Is parent present for a phy
+ * @mrioc: Adapter instance reference
+ * @phy: SAS transport layer phy object
+ *
+ * Return: 0 if parent is present else non-zero
+ */
+static int mpi3mr_parent_present(struct mpi3mr_ioc *mrioc, struct sas_phy *phy)
+{
+	unsigned long flags;
+	struct mpi3mr_hba_port *hba_port = phy->hostdata;
+
+	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+	if (__mpi3mr_sas_node_find_by_sas_address(mrioc,
+	    phy->identify.sas_address,
+	    hba_port) == NULL) {
+		spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+		return -1;
+	}
+	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+	return 0;
+}
+
+/**
+ * mpi3mr_get_hba_port_by_id - find hba port by id
+ * @mrioc: Adapter instance reference
+ * @port_id: Port ID to search
+ * @skip_dirty_flag: Skip dirty ports that match port_id
+ *
+ * Return: mpi3mr_hba_port reference for the matched port
+ */
+struct mpi3mr_hba_port *mpi3mr_get_hba_port_by_id(struct mpi3mr_ioc *mrioc,
+	u8 port_id, u8 skip_dirty_flag)
+{
+	struct mpi3mr_hba_port *port, *port_next;
+
+	list_for_each_entry_safe(port, port_next,
+	    &mrioc->hba_port_table_list, list) {
+		if (port->port_id != port_id)
+			continue;
+		if (!skip_dirty_flag && (port->flags &
+		    MPI3MR_HBA_PORT_FLAG_DIRTY))
+			continue;
+		return port;
+	}
+
+	return NULL;
+}
+
+/**
+ * mpi3mr_get_port_id_by_rphy - Get Port number from SAS rphy
+ * @mrioc: Adapter instance reference
+ * @rphy: SAS transport layer remote phy object
+ *
+ * Retrieves the HBA port number to which the device pointed to
+ * by the rphy object is attached.
+ *
+ * Return: Valid port number on success else 0xFF.
+ */ +u8 mpi3mr_get_port_id_by_rphy(struct mpi3mr_ioc *mrioc, struct sas_rphy *rphy) +{ + struct mpi3mr_sas_node *sas_expander; + struct mpi3mr_tgt_dev *tgtdev; + unsigned long flags; + u8 port_id = 0xFF; + + if (!rphy) + return port_id; + + if (rphy->identify.device_type == SAS_EDGE_EXPANDER_DEVICE || + rphy->identify.device_type == SAS_FANOUT_EXPANDER_DEVICE) { + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + list_for_each_entry(sas_expander, &mrioc->sas_expander_list, + list) { + if (sas_expander->rphy == rphy) { + port_id = sas_expander->hba_port->port_id; + break; + } + } + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + } else if (rphy->identify.device_type == SAS_END_DEVICE) { + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + + tgtdev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + if (tgtdev) { + port_id = + tgtdev->dev_spec.sas_sata_inf.hba_port->port_id; + mpi3mr_tgtdev_put(tgtdev); + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + } + return port_id; +} + +/** + * mpi3mr_convert_phy_link_rate - + * @link_rate: link rate as defined in the MPI header + * + * Convert link_rate from mpi format into sas_transport layer + * form. + * + * Return: A valid SAS transport layer defined link rate + */ +static enum sas_linkrate mpi3mr_convert_phy_link_rate(u8 link_rate) +{ + enum sas_linkrate rc; + + switch (link_rate) { + case MPI3_SAS_NEG_LINK_RATE_1_5: + rc = SAS_LINK_RATE_1_5_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_3_0: + rc = SAS_LINK_RATE_3_0_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_6_0: + rc = SAS_LINK_RATE_6_0_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_12_0: + rc = SAS_LINK_RATE_12_0_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_22_5: + /*TODO: Once SAS TL included define for 22.5 replace this*/ + rc = SAS_LINK_RATE_12_0_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_PHY_DISABLED: + rc = SAS_PHY_DISABLED; + break; + case MPI3_SAS_NEG_LINK_RATE_NEGOTIATION_FAILED: + rc = SAS_LINK_RATE_FAILED; + break; + case MPI3_SAS_NEG_LINK_RATE_PORT_SELECTOR: + rc = SAS_SATA_PORT_SELECTOR; + break; + case MPI3_SAS_NEG_LINK_RATE_SMP_RESET_IN_PROGRESS: + rc = SAS_PHY_RESET_IN_PROGRESS; + break; + default: + case MPI3_SAS_NEG_LINK_RATE_SATA_OOB_COMPLETE: + case MPI3_SAS_NEG_LINK_RATE_UNKNOWN_LINK_RATE: + rc = SAS_LINK_RATE_UNKNOWN; + break; + } + return rc; +} + +/** + * mpi3mr_set_identify - set identify for phys and end devices + * @mrioc: Adapter instance reference + * @handle: Firmware device handle + * @identify: SAS transport layer's identify info + * + * Populates sas identify info for a specific device. + * + * Return: 0 for success, non-zero for failure. 
+ */ +static int mpi3mr_set_identify(struct mpi3mr_ioc *mrioc, u16 handle, + struct sas_identify *identify) +{ + + struct mpi3_device_page0 device_pg0; + struct mpi3_device0_sas_sata_format *sasinf; + u16 device_info; + u16 ioc_status; + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + return -EFAULT; + } + + if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &device_pg0, + sizeof(device_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, handle))) { + ioc_err(mrioc, "%s: device page0 read failed\n", __func__); + return -ENXIO; + } + + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "device page read failed for handle(0x%04x), with ioc_status(0x%04x) failure at %s:%d/%s()!\n", + handle, ioc_status, __FILE__, __LINE__, __func__); + return -EIO; + } + + memset(identify, 0, sizeof(struct sas_identify)); + sasinf = &device_pg0.device_specific.sas_sata_format; + device_info = le16_to_cpu(sasinf->device_info); + + /* sas_address */ + identify->sas_address = le64_to_cpu(sasinf->sas_address); + + /* phy number of the parent device this device is linked to */ + identify->phy_identifier = sasinf->phy_num; + + /* device_type */ + switch (device_info & MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK) { + case MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_NO_DEVICE: + identify->device_type = SAS_PHY_UNUSED; + break; + case MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_END_DEVICE: + identify->device_type = SAS_END_DEVICE; + break; + case MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_EXPANDER: + identify->device_type = SAS_EDGE_EXPANDER_DEVICE; + break; + /* MPI3.0 doesnt have define for FANOUT expander*/ + } + + /* initiator_port_protocols */ + if (device_info & MPI3_SAS_DEVICE_INFO_SSP_INITIATOR) + identify->initiator_port_protocols |= SAS_PROTOCOL_SSP; + /* MPI3.0 doesnt have define for SATA INIT so setting both here*/ + if (device_info & MPI3_SAS_DEVICE_INFO_STP_INITIATOR) + identify->initiator_port_protocols |= (SAS_PROTOCOL_STP | + SAS_PROTOCOL_SATA); + if (device_info & MPI3_SAS_DEVICE_INFO_SMP_INITIATOR) + identify->initiator_port_protocols |= SAS_PROTOCOL_SMP; + + /* target_port_protocols */ + if (device_info & MPI3_SAS_DEVICE_INFO_SSP_TARGET) + identify->target_port_protocols |= SAS_PROTOCOL_SSP; + /* MPI3.0 doesnt have define for STP Target so setting both here*/ + if (device_info & MPI3_SAS_DEVICE_INFO_STP_SATA_TARGET) + identify->target_port_protocols |= (SAS_PROTOCOL_STP | + SAS_PROTOCOL_SATA); + if (device_info & MPI3_SAS_DEVICE_INFO_SMP_TARGET) + identify->target_port_protocols |= SAS_PROTOCOL_SMP; + return 0; +} + +/* report manufacture request structure */ +struct rep_manu_request { + u8 smp_frame_type; + u8 function; + u8 reserved; + u8 request_length; +}; + +/* report manufacture reply structure */ +struct rep_manu_reply { + u8 smp_frame_type; /* 0x41 */ + u8 function; /* 0x01 */ + u8 function_result; + u8 response_length; + u16 expander_change_count; + u8 reserved0[2]; + u8 sas_format; + u8 reserved2[3]; + u8 vendor_id[SAS_EXPANDER_VENDOR_ID_LEN]; + u8 product_id[SAS_EXPANDER_PRODUCT_ID_LEN]; + u8 product_rev[SAS_EXPANDER_PRODUCT_REV_LEN]; + u8 component_vendor_id[SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN]; + u16 component_id; + u8 component_revision_id; + u8 reserved3; + u8 vendor_specific[8]; +}; + +/** + * mpi3mr_report_manufacture - obtain SMP report_manufacture + * @mrioc: Adapter instance reference + * @sas_address: SAS address of the expander device + * @edev: SAS transport layer sas_expander_device object + * @port_id: ID of the HBA port + * + * Fills in the sas_expander_device with 
manufacturing info. + * + * Return: 0 for success, non-zero for failure. + */ +static int mpi3mr_report_manufacture(struct mpi3mr_ioc *mrioc, + u64 sas_address, struct sas_expander_device *edev, u8 port_id) +{ + struct mpi3_smp_passthrough_request mpi_request; + struct mpi3_smp_passthrough_reply mpi_reply; + struct rep_manu_reply *manufacture_reply; + struct rep_manu_request *manufacture_request; + int rc = 0; + void *psge; + void *data_out = NULL; + dma_addr_t data_out_dma; + dma_addr_t data_in_dma; + size_t data_in_sz; + size_t data_out_sz; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + u16 request_sz = sizeof(struct mpi3_smp_passthrough_request); + u16 reply_sz = sizeof(struct mpi3_smp_passthrough_reply); + u16 ioc_status; + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + return -EFAULT; + } + + data_out_sz = sizeof(struct rep_manu_request); + data_in_sz = sizeof(struct rep_manu_reply); + data_out = dma_zalloc_coherent(&mrioc->pdev->dev, + data_out_sz + data_in_sz, &data_out_dma, GFP_KERNEL); + if (!data_out) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", __FILE__, + __LINE__, __func__); + rc = -ENOMEM; + goto out; + } + + data_in_dma = data_out_dma + data_out_sz; + manufacture_reply = data_out + data_out_sz; + + manufacture_request = data_out; + manufacture_request->smp_frame_type = 0x40; + manufacture_request->function = 1; + manufacture_request->reserved = 0; + manufacture_request->request_length = 0; + + memset(&mpi_request, 0, request_sz); + memset(&mpi_reply, 0, reply_sz); + mpi_request.host_tag = MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_SMP_PASSTHROUGH; + mpi_request.io_unit_port = (u8) port_id; + mpi_request.sas_address = cpu_to_le64(sas_address); + + psge = &mpi_request.request_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_out_sz, data_out_dma); + + psge = &mpi_request.response_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_in_sz, data_in_dma); + + dprint_transport_info(mrioc, + "sending report manufacturer SMP request to sas_address(0x%016llx), port(%d)\n", + (unsigned long long)sas_address, port_id); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) + goto out; + + dprint_transport_info(mrioc, + "report manufacturer SMP request completed with ioc_status(0x%04x)\n", + ioc_status); + + if (ioc_status == MPI3_IOCSTATUS_SUCCESS) { + u8 *tmp; + + dprint_transport_info(mrioc, + "report manufacturer - reply data transfer size(%d)\n", + le16_to_cpu(mpi_reply.response_data_length)); + + if (le16_to_cpu(mpi_reply.response_data_length) != + sizeof(struct rep_manu_reply)) + goto out; + + strscpy(edev->vendor_id, manufacture_reply->vendor_id, + SAS_EXPANDER_VENDOR_ID_LEN); + strscpy(edev->product_id, manufacture_reply->product_id, + SAS_EXPANDER_PRODUCT_ID_LEN); + strscpy(edev->product_rev, manufacture_reply->product_rev, + SAS_EXPANDER_PRODUCT_REV_LEN); + edev->level = manufacture_reply->sas_format & 1; + if (edev->level) { + strscpy(edev->component_vendor_id, + manufacture_reply->component_vendor_id, + SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN); + tmp = (u8 *)&manufacture_reply->component_id; + edev->component_id = tmp[0] << 8 | tmp[1]; + edev->component_revision_id = + manufacture_reply->component_revision_id; + } + } + +out: + if (data_out) + dma_free_coherent(&mrioc->pdev->dev, data_out_sz + data_in_sz, + data_out, data_out_dma); + + return rc; +} + + +/** + * mpi3mr_delete_sas_port - helper function to 
remove a port
+ * @mrioc: Adapter instance reference
+ * @mr_sas_port: Internal Port object
+ *
+ * Return: None.
+ */
+static void mpi3mr_delete_sas_port(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_port *mr_sas_port)
+{
+	u64 sas_address = mr_sas_port->remote_identify.sas_address;
+	struct mpi3mr_hba_port *hba_port = mr_sas_port->hba_port;
+	enum sas_device_type device_type =
+	    mr_sas_port->remote_identify.device_type;
+
+	dev_printk(KERN_INFO, &mr_sas_port->port->dev,
+	    "remove: sas_address(0x%016llx)\n",
+	    (unsigned long long) sas_address);
+
+	if (device_type == SAS_END_DEVICE)
+		mpi3mr_remove_device_by_sas_address(mrioc, sas_address,
+		    hba_port);
+	else if (device_type == SAS_EDGE_EXPANDER_DEVICE ||
+	    device_type == SAS_FANOUT_EXPANDER_DEVICE)
+		mpi3mr_expander_remove(mrioc, sas_address, hba_port);
+}
+
+/**
+ * mpi3mr_delete_sas_phy - Remove a single phy from port
+ * @mrioc: Adapter instance reference
+ * @mr_sas_port: Internal Port object
+ * @mr_sas_phy: Internal Phy object
+ *
+ * Return: None.
+ */
+static void mpi3mr_delete_sas_phy(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_port *mr_sas_port,
+	struct mpi3mr_sas_phy *mr_sas_phy)
+{
+	u64 sas_address = mr_sas_port->remote_identify.sas_address;
+
+	dev_printk(KERN_INFO, &mr_sas_phy->phy->dev,
+	    "remove: sas_address(0x%016llx), phy(%d)\n",
+	    (unsigned long long) sas_address, mr_sas_phy->phy_id);
+
+	list_del(&mr_sas_phy->port_siblings);
+	mr_sas_port->num_phys--;
+	mr_sas_port->phy_mask &= ~(1 << mr_sas_phy->phy_id);
+	if (mr_sas_port->lowest_phy == mr_sas_phy->phy_id)
+		mr_sas_port->lowest_phy = ffs(mr_sas_port->phy_mask) - 1;
+	sas_port_delete_phy(mr_sas_port->port, mr_sas_phy->phy);
+	mr_sas_phy->phy_belongs_to_port = 0;
+}
+
+/**
+ * mpi3mr_add_sas_phy - Add a single phy to a port
+ * @mrioc: Adapter instance reference
+ * @mr_sas_port: Internal Port object
+ * @mr_sas_phy: Internal Phy object
+ *
+ * Return: None.
+ */
+static void mpi3mr_add_sas_phy(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_port *mr_sas_port,
+	struct mpi3mr_sas_phy *mr_sas_phy)
+{
+	u64 sas_address = mr_sas_port->remote_identify.sas_address;
+
+	dev_printk(KERN_INFO, &mr_sas_phy->phy->dev,
+	    "add: sas_address(0x%016llx), phy(%d)\n", (unsigned long long)
+	    sas_address, mr_sas_phy->phy_id);
+
+	list_add_tail(&mr_sas_phy->port_siblings, &mr_sas_port->phy_list);
+	mr_sas_port->num_phys++;
+	mr_sas_port->phy_mask |= (1 << mr_sas_phy->phy_id);
+	if (mr_sas_phy->phy_id < mr_sas_port->lowest_phy)
+		mr_sas_port->lowest_phy = ffs(mr_sas_port->phy_mask) - 1;
+	sas_port_add_phy(mr_sas_port->port, mr_sas_phy->phy);
+	mr_sas_phy->phy_belongs_to_port = 1;
+}
+
+/**
+ * mpi3mr_add_phy_to_an_existing_port - add phy to existing port
+ * @mrioc: Adapter instance reference
+ * @mr_sas_node: Internal sas node object (expander or host)
+ * @mr_sas_phy: Internal Phy object
+ * @sas_address: SAS address of the device/expander where the phy
+ * needs to be added to
+ * @hba_port: HBA port entry
+ *
+ * Return: None.
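+ *
+ * Typical invocation (mirrors the call made from mpi3mr_update_links()
+ * once the phy's remote identify data has been refreshed):
+ *
+ *	mpi3mr_add_phy_to_an_existing_port(mrioc, mr_sas_node, mr_sas_phy,
+ *	    mr_sas_phy->remote_identify.sas_address, hba_port);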
+ */ +void mpi3mr_add_phy_to_an_existing_port(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *mr_sas_node, struct mpi3mr_sas_phy *mr_sas_phy, + u64 sas_address, struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_sas_port *mr_sas_port; + struct mpi3mr_sas_phy *srch_phy; + + if (mr_sas_phy->phy_belongs_to_port == 1) + return; + + if (!hba_port) + return; + + list_for_each_entry(mr_sas_port, &mr_sas_node->sas_port_list, + port_list) { + if (mr_sas_port->remote_identify.sas_address != + sas_address) + continue; + if (mr_sas_port->hba_port != hba_port) + continue; + list_for_each_entry(srch_phy, &mr_sas_port->phy_list, + port_siblings) { + if (srch_phy == mr_sas_phy) + return; + } + mpi3mr_add_sas_phy(mrioc, mr_sas_port, mr_sas_phy); + return; + } +} + +/** + * mpi3mr_del_phy_from_an_existing_port - del phy from a port + * @mrioc: Adapter instance reference + * @mr_sas_node: Internal sas node object (expander or host) + * @mr_sas_phy: Internal Phy object + * + * Return: None. + */ +void mpi3mr_del_phy_from_an_existing_port(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *mr_sas_node, struct mpi3mr_sas_phy *mr_sas_phy) +{ + struct mpi3mr_sas_port *mr_sas_port, *next; + struct mpi3mr_sas_phy *srch_phy; + + if (mr_sas_phy->phy_belongs_to_port == 0) + return; + + list_for_each_entry_safe(mr_sas_port, next, &mr_sas_node->sas_port_list, + port_list) { + list_for_each_entry(srch_phy, &mr_sas_port->phy_list, + port_siblings) { + if (srch_phy != mr_sas_phy) + continue; + if ((mr_sas_port->num_phys == 1) && + !mrioc->reset_in_progress) + mpi3mr_delete_sas_port(mrioc, mr_sas_port); + else + mpi3mr_delete_sas_phy(mrioc, mr_sas_port, + mr_sas_phy); + return; + } + } +} + +/** + * mpi3mr_sas_phy_sanity_check - sanity check while adding port + * @mrioc: Adapter instance reference + * @mr_sas_node: Internal sas node object (expander or host) + * @sas_address: SAS address of device/expander + * @hba_port: HBA port entry + * + * Verifies whether the Phys attached to a device with the given + * SAS address already belongs to an existing sas port if so + * will remove those phys from the sas port + * + * Return: None. + */ +static void mpi3mr_sas_port_sanity_check(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *mr_sas_node, u64 sas_address, + struct mpi3mr_hba_port *hba_port) +{ + int i; + + for (i = 0; i < mr_sas_node->num_phys; i++) { + if ((mr_sas_node->phy[i].remote_identify.sas_address != + sas_address) || (mr_sas_node->phy[i].hba_port != hba_port)) + continue; + if (mr_sas_node->phy[i].phy_belongs_to_port == 1) + mpi3mr_del_phy_from_an_existing_port(mrioc, + mr_sas_node, &mr_sas_node->phy[i]); + } +} + +/** + * mpi3mr_sas_port_add - Expose the SAS device to the SAS TL + * @mrioc: Adapter instance reference + * @handle: Firmware device handle of the attached device + * @sas_address_parent: sas address of parent expander or host + * @hba_port: HBA port entry + * + * This function creates a new sas port object for the given end + * device matching sas address and hba_port and adds it to the + * sas_node's sas_port_list and expose the attached sas device + * to the SAS transport layer through sas_rphy_add. + * + * Returns a valid mpi3mr_sas_port reference or NULL. 
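+ *
+ * Caller sketch (illustrative; the handle and the parent SAS address
+ * are normally taken from the firmware device page of the attached
+ * device):
+ *
+ *	if (!mpi3mr_sas_port_add(mrioc, handle, sas_address_parent,
+ *	    hba_port))
+ *		ioc_err(mrioc, "sas port add failed for handle(0x%04x)\n",
+ *		    handle);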
+ */ +struct mpi3mr_sas_port * mpi3mr_sas_port_add(struct mpi3mr_ioc *mrioc, + u16 handle, u64 sas_address_parent, struct mpi3mr_hba_port *hba_port) +{ + + struct mpi3mr_sas_phy *mr_sas_phy, *next; + struct mpi3mr_sas_port *mr_sas_port; + unsigned long flags; + struct mpi3mr_sas_node *mr_sas_node; + struct sas_rphy *rphy; + struct mpi3mr_tgt_dev *tgtdev = NULL; + int i; + struct sas_port *port; + + if (!hba_port) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return NULL; + } + + mr_sas_port = kzalloc(sizeof(struct mpi3mr_sas_port), GFP_KERNEL); + if (!mr_sas_port) + return NULL; + + INIT_LIST_HEAD(&mr_sas_port->port_list); + INIT_LIST_HEAD(&mr_sas_port->phy_list); + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + mr_sas_node = __mpi3mr_sas_node_find_by_sas_address(mrioc, + sas_address_parent, hba_port); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + if (!mr_sas_node) { + ioc_err(mrioc, "%s:could not find parent sas_address(0x%016llx)!\n", + __func__, (unsigned long long)sas_address_parent); + goto out_fail; + } + + if ((mpi3mr_set_identify(mrioc, handle, + &mr_sas_port->remote_identify))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + if (mr_sas_port->remote_identify.device_type == SAS_PHY_UNUSED) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + mr_sas_port->hba_port = hba_port; + mpi3mr_sas_port_sanity_check(mrioc, mr_sas_node, + mr_sas_port->remote_identify.sas_address, hba_port); + + for (i = 0; i < mr_sas_node->num_phys; i++) { + if ((mr_sas_node->phy[i].remote_identify.sas_address != + mr_sas_port->remote_identify.sas_address) || + (mr_sas_node->phy[i].hba_port != hba_port)) + continue; + list_add_tail(&mr_sas_node->phy[i].port_siblings, + &mr_sas_port->phy_list); + mr_sas_port->num_phys++; + mr_sas_port->phy_mask |= (1 << i); + } + + if (!mr_sas_port->num_phys) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + mr_sas_port->lowest_phy = ffs(mr_sas_port->phy_mask) - 1; + + if (mr_sas_port->remote_identify.device_type == SAS_END_DEVICE) { + tgtdev = mpi3mr_get_tgtdev_by_addr(mrioc, + mr_sas_port->remote_identify.sas_address, + mr_sas_port->hba_port); + + if (!tgtdev) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + tgtdev->dev_spec.sas_sata_inf.pend_sas_rphy_add = 1; + } + + if (!mr_sas_node->parent_dev) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + port = sas_port_alloc_num(mr_sas_node->parent_dev); + if ((sas_port_add(port))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + list_for_each_entry(mr_sas_phy, &mr_sas_port->phy_list, + port_siblings) { + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &port->dev, "add: handle(0x%04x)" + ", sas_address(0x%016llx), phy(%d)\n", handle, + (unsigned long long) + mr_sas_port->remote_identify.sas_address, + mr_sas_phy->phy_id); + sas_port_add_phy(port, mr_sas_phy->phy); + mr_sas_phy->phy_belongs_to_port = 1; + mr_sas_phy->hba_port = hba_port; + } + + mr_sas_port->port = port; + if (mr_sas_port->remote_identify.device_type == SAS_END_DEVICE) { + rphy = sas_end_device_alloc(port); + tgtdev->dev_spec.sas_sata_inf.rphy=rphy; + } else { + rphy = sas_expander_alloc(port, + mr_sas_port->remote_identify.device_type); + } + rphy->identify = 
mr_sas_port->remote_identify; + + if (mrioc->current_event) + mrioc->current_event->pending_at_sml = 1; + + if ((sas_rphy_add(rphy))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + } + + if (mr_sas_port->remote_identify.device_type == SAS_END_DEVICE) { + tgtdev->dev_spec.sas_sata_inf.pend_sas_rphy_add = 0; + tgtdev->dev_spec.sas_sata_inf.sas_transport_attached = 1; + mpi3mr_tgtdev_put(tgtdev); + } + + dev_printk(KERN_INFO, &rphy->dev, + "%s: added: handle(0x%04x), sas_address(0x%016llx)\n", + __func__, handle, (unsigned long long) + mr_sas_port->remote_identify.sas_address); + + mr_sas_port->rphy = rphy; + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + list_add_tail(&mr_sas_port->port_list, &mr_sas_node->sas_port_list); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + if (mrioc->current_event) { + mrioc->current_event->pending_at_sml = 0; + if (mrioc->current_event->discard) + mpi3mr_print_discard_event_notice(mrioc, true); + } + + /* fill in report manufacture */ + if (mr_sas_port->remote_identify.device_type == + SAS_EDGE_EXPANDER_DEVICE || + mr_sas_port->remote_identify.device_type == + SAS_FANOUT_EXPANDER_DEVICE) + mpi3mr_report_manufacture(mrioc, + mr_sas_port->remote_identify.sas_address, + rphy_to_expander_device(rphy), hba_port->port_id); + + return mr_sas_port; + + out_fail: + list_for_each_entry_safe(mr_sas_phy, next, &mr_sas_port->phy_list, + port_siblings) + list_del(&mr_sas_phy->port_siblings); + kfree(mr_sas_port); + return NULL; +} + +/** + * mpi3mr_sas_port_remove - remove port from the list + * @mrioc: Adapter instance reference + * @sas_address: SAS address of attached device + * @sas_address_parent: SAS address of parent expander or host + * @hba_port: HBA port entry + * + * Removing object and freeing associated memory from the + * sas_port_list. 
+ * + * Return: None + */ +void mpi3mr_sas_port_remove(struct mpi3mr_ioc *mrioc, u64 sas_address, + u64 sas_address_parent, struct mpi3mr_hba_port *hba_port) +{ + int i; + unsigned long flags; + struct mpi3mr_sas_port *mr_sas_port, *next; + struct mpi3mr_sas_node *mr_sas_node; + u8 found = 0; + struct mpi3mr_sas_phy *mr_sas_phy, *next_phy; + struct mpi3mr_hba_port *srch_port, *hba_port_next=NULL; + + + if (!hba_port) + return; + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + mr_sas_node = __mpi3mr_sas_node_find_by_sas_address(mrioc, + sas_address_parent, hba_port); + if (!mr_sas_node) { + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + return; + } + list_for_each_entry_safe(mr_sas_port, next, &mr_sas_node->sas_port_list, + port_list) { + if (mr_sas_port->remote_identify.sas_address != sas_address) + continue; + if (mr_sas_port->hba_port != hba_port) + continue; + found = 1; + list_del(&mr_sas_port->port_list); + goto out; + } + + out: + if (!found) { + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + return; + } + + if (mr_sas_node->host_node) { + list_for_each_entry_safe(srch_port, hba_port_next, + &mrioc->hba_port_table_list, list) { + if (srch_port != hba_port) + continue; + ioc_info(mrioc, + "removing hba_port entry: %p port: %d from hba_port list\n", + srch_port, srch_port->port_id); + list_del(&hba_port->list); + kfree(hba_port); + break; + } + } + + for (i = 0; i < mr_sas_node->num_phys; i++) { + if (mr_sas_node->phy[i].remote_identify.sas_address == + sas_address) + memset(&mr_sas_node->phy[i].remote_identify, 0 , + sizeof(struct sas_identify)); + } + + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + if (mrioc->current_event) + mrioc->current_event->pending_at_sml = 1; + + list_for_each_entry_safe(mr_sas_phy, next_phy, + &mr_sas_port->phy_list, port_siblings) { + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &mr_sas_port->port->dev, + "remove: sas_address(0x%016llx), phy(%d)\n", + (unsigned long long) + mr_sas_port->remote_identify.sas_address, + mr_sas_phy->phy_id); + mr_sas_phy->phy_belongs_to_port = 0; + if(!mrioc->stop_drv_processing) + sas_port_delete_phy(mr_sas_port->port, + mr_sas_phy->phy); + list_del(&mr_sas_phy->port_siblings); + } + if(!mrioc->stop_drv_processing) + sas_port_delete(mr_sas_port->port); + + ioc_info(mrioc, "%s: removed sas_address(0x%016llx)\n", + __func__, (unsigned long long)sas_address); + + if (mrioc->current_event) { + mrioc->current_event->pending_at_sml = 0; + if (mrioc->current_event->discard) + mpi3mr_print_discard_event_notice(mrioc, false); + } + + kfree(mr_sas_port); +} + +/** + * mpi3mr_add_host_phy - report sas_host phy to SAS transport + * @mrioc: Adapter instance reference + * @mr_sas_phy: Intenal Phy object + * @phy_pg0: SAS phy page 0 + * @parent_dev: Prent device class object + * + * Return: 0 for success, non-zero for failure. 
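+ *
+ * Illustrative caller sketch (assumes phy_pg0 was read for host phy i
+ * and that parent_dev is the HBA's SCSI host device):
+ *
+ *	if (mpi3mr_add_host_phy(mrioc, &mrioc->sas_hba.phy[i], phy_pg0,
+ *	    parent_dev))
+ *		ioc_err(mrioc, "failed to add host phy %d\n", i);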
+ */ +int mpi3mr_add_host_phy(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_phy *mr_sas_phy, struct mpi3_sas_phy_page0 phy_pg0, + struct device *parent_dev) +{ + struct sas_phy *phy; + int phy_index = mr_sas_phy->phy_id; + + + INIT_LIST_HEAD(&mr_sas_phy->port_siblings); + phy = sas_phy_alloc(parent_dev, phy_index); + if (!phy) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -1; + } + if ((mpi3mr_set_identify(mrioc, mr_sas_phy->handle, + &mr_sas_phy->identify))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + sas_phy_free(phy); + return -1; + } + phy->identify = mr_sas_phy->identify; + mr_sas_phy->attached_handle = le16_to_cpu(phy_pg0.attached_dev_handle); + if (mr_sas_phy->attached_handle) + mpi3mr_set_identify(mrioc, mr_sas_phy->attached_handle, + &mr_sas_phy->remote_identify); + phy->identify.phy_identifier = mr_sas_phy->phy_id; + phy->negotiated_linkrate = mpi3mr_convert_phy_link_rate( + (phy_pg0.negotiated_link_rate & + MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >> + MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT); + phy->minimum_linkrate_hw = mpi3mr_convert_phy_link_rate( + phy_pg0.hw_link_rate & MPI3_SAS_HWRATE_MIN_RATE_MASK); + phy->maximum_linkrate_hw = mpi3mr_convert_phy_link_rate( + phy_pg0.hw_link_rate >> 4); + phy->minimum_linkrate = mpi3mr_convert_phy_link_rate( + phy_pg0.programmed_link_rate & MPI3_SAS_PRATE_MIN_RATE_MASK); + phy->maximum_linkrate = mpi3mr_convert_phy_link_rate( + phy_pg0.programmed_link_rate >> 4); + phy->hostdata = mr_sas_phy->hba_port; + + if ((sas_phy_add(phy))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + sas_phy_free(phy); + return -1; + } + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &phy->dev, + "add: handle(0x%04x), sas_address(0x%016llx)\n" + "\tattached_handle(0x%04x), sas_address(0x%016llx)\n", + mr_sas_phy->handle, (unsigned long long) + mr_sas_phy->identify.sas_address, + mr_sas_phy->attached_handle, + (unsigned long long) + mr_sas_phy->remote_identify.sas_address); + mr_sas_phy->phy = phy; + return 0; +} + +/** + * mpi3mr_add_expander_phy - report expander phy to transport + * @mrioc: Adapter instance reference + * @mr_sas_phy: Intenal Phy object + * @expander_pg1: SAS Expander page 1 + * @parent_dev: Parent device class object + + * + * Return: 0 for success, non-zero for failure. 
+ */ +int mpi3mr_add_expander_phy(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_phy *mr_sas_phy, + struct mpi3_sas_expander_page1 expander_pg1, + struct device *parent_dev) +{ + struct sas_phy *phy; + int phy_index = mr_sas_phy->phy_id; + + INIT_LIST_HEAD(&mr_sas_phy->port_siblings); + phy = sas_phy_alloc(parent_dev, phy_index); + if (!phy) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -1; + } + if ((mpi3mr_set_identify(mrioc, mr_sas_phy->handle, + &mr_sas_phy->identify))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + sas_phy_free(phy); + return -1; + } + phy->identify = mr_sas_phy->identify; + mr_sas_phy->attached_handle = + le16_to_cpu(expander_pg1.attached_dev_handle); + if (mr_sas_phy->attached_handle) + mpi3mr_set_identify(mrioc, mr_sas_phy->attached_handle, + &mr_sas_phy->remote_identify); + phy->identify.phy_identifier = mr_sas_phy->phy_id; + phy->negotiated_linkrate = mpi3mr_convert_phy_link_rate( + (expander_pg1.negotiated_link_rate & + MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >> + MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT); + phy->minimum_linkrate_hw = mpi3mr_convert_phy_link_rate( + expander_pg1.hw_link_rate & MPI3_SAS_HWRATE_MIN_RATE_MASK); + phy->maximum_linkrate_hw = mpi3mr_convert_phy_link_rate( + expander_pg1.hw_link_rate >> 4); + phy->minimum_linkrate = mpi3mr_convert_phy_link_rate( + expander_pg1.programmed_link_rate & MPI3_SAS_PRATE_MIN_RATE_MASK); + phy->maximum_linkrate = mpi3mr_convert_phy_link_rate( + expander_pg1.programmed_link_rate >> 4); + phy->hostdata = mr_sas_phy->hba_port; + + if ((sas_phy_add(phy))) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + sas_phy_free(phy); + return -1; + } + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &phy->dev, + "add: handle(0x%04x), sas_address(0x%016llx)\n" + "\tattached_handle(0x%04x), sas_address(0x%016llx)\n", + mr_sas_phy->handle, (unsigned long long) + mr_sas_phy->identify.sas_address, + mr_sas_phy->attached_handle, + (unsigned long long) + mr_sas_phy->remote_identify.sas_address); + mr_sas_phy->phy = phy; + return 0; +} + + +/** + * mpi3mr_update_sas_links - refreshing SAS phy link changes + * @mrioc: Adapter instance reference + * @sas_address_parent: SAS address of parent expander or host + * @handle: Firmware device handle of attached device + * @phy_number: Phy number + * @link_rate: New link rate + * @hba_port: HBA port entry + * + * Return: None. 
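+ *
+ * Example invocation (sketch; mirrors how expander discovery reports a
+ * child device attached to phy 'i' of the parent with a new link rate):
+ *
+ *	mpi3mr_update_links(mrioc, sas_address_parent, attached_handle, i,
+ *	    link_rate, hba_port);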
+ */ +void mpi3mr_update_links(struct mpi3mr_ioc *mrioc, + u64 sas_address_parent, u16 handle, u8 phy_number, u8 link_rate, + struct mpi3mr_hba_port *hba_port) +{ + unsigned long flags; + struct mpi3mr_sas_node *mr_sas_node; + struct mpi3mr_sas_phy *mr_sas_phy; + + if (mrioc->reset_in_progress) + return; + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + mr_sas_node = __mpi3mr_sas_node_find_by_sas_address(mrioc, + sas_address_parent, hba_port); + if (!mr_sas_node) { + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + return; + } + + mr_sas_phy = &mr_sas_node->phy[phy_number]; + mr_sas_phy->attached_handle = handle; + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + if (handle && (link_rate >= MPI3_SAS_NEG_LINK_RATE_1_5)) { + mpi3mr_set_identify(mrioc, handle, + &mr_sas_phy->remote_identify); + mpi3mr_add_phy_to_an_existing_port(mrioc, mr_sas_node, + mr_sas_phy, mr_sas_phy->remote_identify.sas_address, + hba_port); + } else + memset(&mr_sas_phy->remote_identify, 0 , sizeof(struct + sas_identify)); + + if (mr_sas_phy->phy) + mr_sas_phy->phy->negotiated_linkrate = + mpi3mr_convert_phy_link_rate(link_rate); + + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &mr_sas_phy->phy->dev, + "refresh: parent sas_address(0x%016llx),\n" + "\tlink_rate(0x%02x), phy(%d)\n" + "\tattached_handle(0x%04x), sas_address(0x%016llx)\n", + (unsigned long long)sas_address_parent, + link_rate, phy_number, handle, (unsigned long long) + mr_sas_phy->remote_identify.sas_address); +} + +static inline struct mpi3mr_ioc *phy_to_mrioc(struct sas_phy *phy) +{ + struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); + return shost_priv(shost); +} + +static inline struct mpi3mr_ioc *rphy_to_mrioc(struct sas_rphy *rphy) +{ + struct Scsi_Host *shost = dev_to_shost(rphy->dev.parent->parent); + return shost_priv(shost); +} + +/* report phy error log structure */ +struct phy_error_log_request { + u8 smp_frame_type; /* 0x40 */ + u8 function; /* 0x11 */ + u8 allocated_response_length; + u8 request_length; /* 02 */ + u8 reserved_1[5]; + u8 phy_identifier; + u8 reserved_2[2]; +}; + +/* report phy error log reply structure */ +struct phy_error_log_reply { + u8 smp_frame_type; /* 0x41 */ + u8 function; /* 0x11 */ + u8 function_result; + u8 response_length; + __be16 expander_change_count; + u8 reserved_1[3]; + u8 phy_identifier; + u8 reserved_2[2]; + __be32 invalid_dword; + __be32 running_disparity_error; + __be32 loss_of_dword_sync; + __be32 phy_reset_problem; +}; + + +/** + * mpi3mr_get_expander_phy_error_log - return expander counters: + * @mrioc: Adapter instance reference + * @phy: The SAS transport layer phy object + * + * Return: 0 for success, non-zero for failure. 
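+ *
+ * The request sent is an SMP REPORT PHY ERROR LOG frame (function 0x11)
+ * and the four 32-bit counters of the reply are copied into the
+ * corresponding sas_phy fields (invalid_dword_count,
+ * running_disparity_error_count, loss_of_dword_sync_count and
+ * phy_reset_problem_count).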
+ * + */ +static int mpi3mr_get_expander_phy_error_log(struct mpi3mr_ioc *mrioc, + struct sas_phy *phy) +{ + struct mpi3_smp_passthrough_request mpi_request; + struct mpi3_smp_passthrough_reply mpi_reply; + struct phy_error_log_request *phy_error_log_request; + struct phy_error_log_reply *phy_error_log_reply; + int rc; + void *psge; + void *data_out = NULL; + dma_addr_t data_out_dma, data_in_dma; + u32 data_out_sz, data_in_sz, sz; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + u16 request_sz = sizeof(struct mpi3_smp_passthrough_request); + u16 reply_sz = sizeof(struct mpi3_smp_passthrough_reply); + u16 ioc_status; + + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + return -EFAULT; + } + + + data_out_sz = sizeof(struct phy_error_log_request); + data_in_sz = sizeof(struct phy_error_log_reply); + sz = data_out_sz + data_in_sz; + data_out = dma_zalloc_coherent(&mrioc->pdev->dev, sz, &data_out_dma, + GFP_KERNEL); + if (!data_out) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", __FILE__, + __LINE__, __func__); + rc = -ENOMEM; + goto out; + } + + data_in_dma = data_out_dma + data_out_sz; + phy_error_log_reply = data_out + data_out_sz; + + rc = -EINVAL; + memset(data_out, 0, sz); + phy_error_log_request = data_out; + phy_error_log_request->smp_frame_type = 0x40; + phy_error_log_request->function = 0x11; + phy_error_log_request->request_length = 2; + phy_error_log_request->allocated_response_length = 0; + phy_error_log_request->phy_identifier = phy->number; + + memset(&mpi_request, 0, request_sz); + memset(&mpi_reply, 0, reply_sz); + mpi_request.host_tag = MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_SMP_PASSTHROUGH; + mpi_request.io_unit_port = (u8) mpi3mr_get_port_id_by_sas_phy(phy); + mpi_request.sas_address = cpu_to_le64(phy->identify.sas_address); + + psge = &mpi_request.request_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_out_sz, data_out_dma); + + psge = &mpi_request.response_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_in_sz, data_in_dma); + + dprint_transport_info(mrioc, + "sending phy error log SMP request to sas_address(0x%016llx), phy_id(%d)\n", + (unsigned long long)phy->identify.sas_address, phy->number); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) + goto out; + + dprint_transport_info(mrioc, + "phy error log SMP request completed with ioc_status(0x%04x)\n", + ioc_status); + + + if (ioc_status == MPI3_IOCSTATUS_SUCCESS) { + dprint_transport_info(mrioc, + "phy error log - reply data transfer size(%d)\n", + le16_to_cpu(mpi_reply.response_data_length)); + + if (le16_to_cpu(mpi_reply.response_data_length) != + sizeof(struct phy_error_log_reply)) + goto out; + + + dprint_transport_info(mrioc, + "phy error log - function_result(%d)\n", + phy_error_log_reply->function_result); + + phy->invalid_dword_count = + be32_to_cpu(phy_error_log_reply->invalid_dword); + phy->running_disparity_error_count = + be32_to_cpu(phy_error_log_reply->running_disparity_error); + phy->loss_of_dword_sync_count = + be32_to_cpu(phy_error_log_reply->loss_of_dword_sync); + phy->phy_reset_problem_count = + be32_to_cpu(phy_error_log_reply->phy_reset_problem); + rc = 0; + } + +out: + if (data_out) + dma_free_coherent(&mrioc->pdev->dev, sz, data_out, + data_out_dma); + + return rc; +} + + +/** + * mpi3mr_transport_get_linkerrors - return phy error counters + * @phy: The SAS transport layer phy object + * + * This function retrieves the phy 
error log information of the + * HBA or expander for which the phy belongs to + * + * Return: 0 for success, non-zero for failure. + * + */ +static int mpi3mr_transport_get_linkerrors(struct sas_phy *phy) +{ + struct mpi3mr_ioc *mrioc = phy_to_mrioc(phy); + struct mpi3_sas_phy_page1 phy_pg1; + int rc = 0; + u16 ioc_status; + + rc = mpi3mr_parent_present(mrioc, phy); + if (rc) + return rc; + + if (phy->identify.sas_address != mrioc->sas_hba.sas_address) + return mpi3mr_get_expander_phy_error_log(mrioc, phy); + + memset(&phy_pg1, 0, sizeof(struct mpi3_sas_phy_page1)); + /* get hba phy error logs */ + if ((mpi3mr_cfg_get_sas_phy_pg1(mrioc, &ioc_status, &phy_pg1, + sizeof(struct mpi3_sas_phy_page1), + MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER, phy->number))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -ENXIO; + } + + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -ENXIO; + } + phy->invalid_dword_count = le32_to_cpu(phy_pg1.invalid_dword_count); + phy->running_disparity_error_count = + le32_to_cpu(phy_pg1.running_disparity_error_count); + phy->loss_of_dword_sync_count = + le32_to_cpu(phy_pg1.loss_dword_synch_count); + phy->phy_reset_problem_count = + le32_to_cpu(phy_pg1.phy_reset_problem_count); + return 0; +} + + +/** + * mpi3mr_transport_get_enclosure_identifier - Get Enclosure ID + * @rphy: The SAS transport layer remote phy object + * @identifier: Enclosure identifier to be returned + * + * Returns the enclosure id for the device pointed by the remote + * phy object. + * + * Return: 0 on success or -ENXIO + */ +static int +mpi3mr_transport_get_enclosure_identifier(struct sas_rphy *rphy, + u64 *identifier) +{ + struct mpi3mr_ioc *mrioc = rphy_to_mrioc(rphy); + struct mpi3mr_tgt_dev *tgtdev = NULL; + unsigned long flags; + int rc; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + if (tgtdev) { + *identifier = + tgtdev->enclosure_logical_id; + rc = 0; + mpi3mr_tgtdev_put(tgtdev); + } else { + *identifier = 0; + rc = -ENXIO; + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + return rc; +} + +/** + * mpi3mr_transport_get_bay_identifier - Get bay ID + * @rphy: The SAS transport layer remote phy object + * + * Returns the slot id for the device pointed by the remote phy + * object. 
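+ * The value returned is the slot number cached in the driver's target
+ * device entry (tgtdev->slot); no firmware request is issued here.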
+ * + * Return: Valid slot ID on success or -ENXIO + */ +static int +mpi3mr_transport_get_bay_identifier(struct sas_rphy *rphy) +{ + + struct mpi3mr_ioc *mrioc = rphy_to_mrioc(rphy); + struct mpi3mr_tgt_dev *tgtdev = NULL; + unsigned long flags; + int rc; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + if (tgtdev) { + rc = tgtdev->slot; + mpi3mr_tgtdev_put(tgtdev); + } else + rc = -ENXIO; + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + return rc; +} + +/* phy control request structure */ +struct phy_control_request { + u8 smp_frame_type; /* 0x40 */ + u8 function; /* 0x91 */ + u8 allocated_response_length; + u8 request_length; /* 0x09 */ + u16 expander_change_count; + u8 reserved_1[3]; + u8 phy_identifier; + u8 phy_operation; + u8 reserved_2[13]; + u64 attached_device_name; + u8 programmed_min_physical_link_rate; + u8 programmed_max_physical_link_rate; + u8 reserved_3[6]; +}; + +/* phy control reply structure */ +struct phy_control_reply { + u8 smp_frame_type; /* 0x41 */ + u8 function; /* 0x11 */ + u8 function_result; + u8 response_length; +}; + +#define SMP_PHY_CONTROL_LINK_RESET (0x01) +#define SMP_PHY_CONTROL_HARD_RESET (0x02) +#define SMP_PHY_CONTROL_DISABLE (0x03) + +/** + * mpi3mr_expander_phy_control - expander phy control + * @mrioc: Adapter instance reference + * @phy: The SAS transport layer phy object + * @phy_operation: The phy operation to be executed + * + * Issues SMP passthru phy control reuest to execute a specific + * phy operation for a given expander device. + * + * Return: 0 for success, non-zero for failure. + * + */ +static int +mpi3mr_expander_phy_control(struct mpi3mr_ioc *mrioc, + struct sas_phy *phy, u8 phy_operation) +{ + struct mpi3_smp_passthrough_request mpi_request; + struct mpi3_smp_passthrough_reply mpi_reply; + struct phy_control_request *phy_control_request; + struct phy_control_reply *phy_control_reply; + int rc; + void *psge; + void *data_out = NULL; + dma_addr_t data_out_dma; + dma_addr_t data_in_dma; + size_t data_in_sz; + size_t data_out_sz; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + u16 request_sz = sizeof(struct mpi3_smp_passthrough_request); + u16 reply_sz = sizeof(struct mpi3_smp_passthrough_reply); + u16 ioc_status; + u16 sz; + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + return -EFAULT; + } + + + data_out_sz = sizeof(struct phy_control_request); + data_in_sz = sizeof(struct phy_control_reply); + sz = data_out_sz + data_in_sz; + data_out = dma_zalloc_coherent(&mrioc->pdev->dev, sz, &data_out_dma, + GFP_KERNEL); + if (!data_out) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", __FILE__, + __LINE__, __func__); + rc = -ENOMEM; + goto out; + } + + data_in_dma = data_out_dma + data_out_sz; + phy_control_reply = data_out + data_out_sz; + + rc = -EINVAL; + memset(data_out, 0, sz); + + phy_control_request = data_out; + phy_control_request->smp_frame_type = 0x40; + phy_control_request->function = 0x91; + phy_control_request->request_length = 9; + phy_control_request->allocated_response_length = 0; + phy_control_request->phy_identifier = phy->number; + phy_control_request->phy_operation = phy_operation; + phy_control_request->programmed_min_physical_link_rate = + phy->minimum_linkrate << 4; + phy_control_request->programmed_max_physical_link_rate = + phy->maximum_linkrate << 4; + + memset(&mpi_request, 0, request_sz); + memset(&mpi_reply, 0, reply_sz); + mpi_request.host_tag = 
MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_SMP_PASSTHROUGH; + mpi_request.io_unit_port = (u8) mpi3mr_get_port_id_by_sas_phy(phy); + mpi_request.sas_address = cpu_to_le64(phy->identify.sas_address); + + psge = &mpi_request.request_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_out_sz, data_out_dma); + + psge = &mpi_request.response_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_in_sz, data_in_dma); + + dprint_transport_info(mrioc, + "sending phy control SMP request to sas_address(0x%016llx), phy_id(%d) opcode(%d)\n", + (unsigned long long)phy->identify.sas_address, phy->number, + phy_operation); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) + goto out; + + dprint_transport_info(mrioc, + "phy control SMP request completed with ioc_status(0x%04x)\n", + ioc_status); + + + if (ioc_status == MPI3_IOCSTATUS_SUCCESS) { + dprint_transport_info(mrioc, + "phy control - reply data transfer size(%d)\n", + le16_to_cpu(mpi_reply.response_data_length)); + + if (le16_to_cpu(mpi_reply.response_data_length) != + sizeof(struct phy_control_reply)) + goto out; + dprint_transport_info(mrioc, + "phy control - function_result(%d)\n", + phy_control_reply->function_result); + rc = 0; + } + out: + if (data_out) + dma_free_coherent(&mrioc->pdev->dev, sz, data_out, + data_out_dma); + + return rc; +} + +/** + * mpi3mr_transport_phy_reset - Reset a given phy + * @phy: The SAS transport layer phy object + * @hard_reset: Flag to indicate the type of reset + * + * Return: 0 for success, non-zero for failure. + */ +static int +mpi3mr_transport_phy_reset(struct sas_phy *phy, int hard_reset) +{ + struct mpi3mr_ioc *mrioc = phy_to_mrioc(phy); + struct mpi3_iounit_control_request mpi_request; + struct mpi3_iounit_control_reply mpi_reply; + u16 request_sz = sizeof(struct mpi3_iounit_control_request); + u16 reply_sz = sizeof(struct mpi3_iounit_control_reply); + int rc = 0; + u16 ioc_status; + + rc = mpi3mr_parent_present(mrioc, phy); + if (rc) + return rc; + + /* handle expander phys */ + if (phy->identify.sas_address != mrioc->sas_hba.sas_address) + return mpi3mr_expander_phy_control(mrioc, phy, + (hard_reset == 1) ? SMP_PHY_CONTROL_HARD_RESET : + SMP_PHY_CONTROL_LINK_RESET); + + /* handle hba phys */ + memset(&mpi_request, 0, request_sz); + mpi_request.host_tag = MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_IO_UNIT_CONTROL; + mpi_request.operation = MPI3_CTRL_OP_SAS_PHY_CONTROL; + mpi_request.param8[MPI3_CTRL_OP_SAS_PHY_CONTROL_PARAM8_ACTION_INDEX] = + (hard_reset ? 
MPI3_CTRL_ACTION_HARD_RESET : + MPI3_CTRL_ACTION_LINK_RESET); + mpi_request.param8[MPI3_CTRL_OP_SAS_PHY_CONTROL_PARAM8_PHY_INDEX] = + phy->number; + + dprint_transport_info(mrioc, + "sending phy reset request to sas_address(0x%016llx), phy_id(%d) hard_reset(%d)\n", + (unsigned long long)phy->identify.sas_address, phy->number, + hard_reset); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) { + rc = -EAGAIN; + goto out; + } + + dprint_transport_info(mrioc, + "phy reset request completed with ioc_status(0x%04x)\n", + ioc_status); +out: + return rc; +} + +/** + * mpi3mr_transport_phy_enable - enable/disable phys + * @phy: The SAS transport layer phy object + * @enable: flag to enable/disable, enable phy when true + * + * This function enables/disables a given by executing required + * configuration page changes or expander phy control command + * + * Return: 0 for success, non-zero for failure. + */ +static int +mpi3mr_transport_phy_enable(struct sas_phy *phy, int enable) +{ + struct mpi3mr_ioc *mrioc = phy_to_mrioc(phy); + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0 = NULL; + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1 = NULL; + u16 sz; + int rc = 0; + int i, discovery_active; + + rc = mpi3mr_parent_present(mrioc, phy); + if (rc) + return rc; + + /* handle expander phys */ + if (phy->identify.sas_address != mrioc->sas_hba.sas_address) + return mpi3mr_expander_phy_control(mrioc, phy, + (enable == 1) ? SMP_PHY_CONTROL_LINK_RESET : + SMP_PHY_CONTROL_DISABLE); + + /* handle hba phys */ + + /* read sas_iounit page 0 */ + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) { + rc = -ENOMEM; + goto out; + } + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + /* unable to enable/disable phys when when discovery is active */ + for (i = 0, discovery_active = 0; i < mrioc->sas_hba.num_phys ; i++) { + if (sas_io_unit_pg0->phy_data[i].port_flags & + MPI3_SASIOUNIT0_PORTFLAGS_DISC_IN_PROGRESS) { + ioc_err(mrioc, "discovery is active on " + "port = %d, phy = %d: unable to enable/disable " + "phys, try again later!\n", + sas_io_unit_pg0->phy_data[i].io_unit_port, i); + discovery_active = 1; + } + } + + if (discovery_active) { + rc = -EAGAIN; + goto out; + } + + if ((sas_io_unit_pg0->phy_data[phy->number].phy_flags & + ( MPI3_SASIOUNIT0_PHYFLAGS_HOST_PHY | + MPI3_SASIOUNIT0_PHYFLAGS_VIRTUAL_PHY))) + { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + + /* read sas_iounit page 1 */ + sz = offsetof(struct mpi3_sas_io_unit_page1, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit1_phy_data)); + sas_io_unit_pg1 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg1) { + rc = -ENOMEM; + goto out; + } + + if (mpi3mr_cfg_get_sas_io_unit_pg1(mrioc, sas_io_unit_pg1, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + if (enable) + sas_io_unit_pg1->phy_data[phy->number].phy_flags + &= ~MPI3_SASIOUNIT1_PHYFLAGS_PHY_DISABLE; + else + sas_io_unit_pg1->phy_data[phy->number].phy_flags + |= MPI3_SASIOUNIT1_PHYFLAGS_PHY_DISABLE; + + mpi3mr_cfg_set_sas_io_unit_pg1(mrioc, sas_io_unit_pg1, sz); + + /* link reset */ + if 
(enable) + mpi3mr_transport_phy_reset(phy, 0); + + out: + kfree(sas_io_unit_pg1); + kfree(sas_io_unit_pg0); + return rc; +} + +/** + * mpi3mr_transport_phy_speed - set phy min/max speed + * @phy: The SAS transport later phy object + * @rates: Rates defined as in sas_phy_linkrates + * + * This function sets the the link rates given in the rates + * argument to the given phy by executing required configuration + * page changes or expander phy control command + * + * Return: 0 for success, non-zero for failure. + */ +static int +mpi3mr_transport_phy_speed(struct sas_phy *phy, struct sas_phy_linkrates *rates) +{ + + struct mpi3mr_ioc *mrioc = phy_to_mrioc(phy); + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1 = NULL; + struct mpi3_sas_phy_page0 phy_pg0; + u16 sz, ioc_status; + int rc = 0; + + rc = mpi3mr_parent_present(mrioc, phy); + if (rc) + return rc; + + if (!rates->minimum_linkrate) + rates->minimum_linkrate = phy->minimum_linkrate; + else if (rates->minimum_linkrate < phy->minimum_linkrate_hw) + rates->minimum_linkrate = phy->minimum_linkrate_hw; + + if (!rates->maximum_linkrate) + rates->maximum_linkrate = phy->maximum_linkrate; + else if (rates->maximum_linkrate > phy->maximum_linkrate_hw) + rates->maximum_linkrate = phy->maximum_linkrate_hw; + + /* handle expander phys */ + if (phy->identify.sas_address != mrioc->sas_hba.sas_address) { + phy->minimum_linkrate = rates->minimum_linkrate; + phy->maximum_linkrate = rates->maximum_linkrate; + return mpi3mr_expander_phy_control(mrioc, phy, + SMP_PHY_CONTROL_LINK_RESET); + } + + /* handle hba phys */ + + /* sas_iounit page 1 */ + sz = offsetof(struct mpi3_sas_io_unit_page1, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit1_phy_data)); + sas_io_unit_pg1 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg1) { + rc = -ENOMEM; + goto out; + } + + if (mpi3mr_cfg_get_sas_io_unit_pg1(mrioc, sas_io_unit_pg1, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + sas_io_unit_pg1->phy_data[phy->number].max_min_link_rate = + (rates->minimum_linkrate + (rates->maximum_linkrate << 4)); + + if (mpi3mr_cfg_set_sas_io_unit_pg1(mrioc, sas_io_unit_pg1, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + /* link reset */ + mpi3mr_transport_phy_reset(phy, 0); + + /* read phy page 0, then update the rates in the sas transport phy */ + if (!mpi3mr_cfg_get_sas_phy_pg0(mrioc, &ioc_status, &phy_pg0, + sizeof(struct mpi3_sas_phy_page0), + MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER, phy->number) && + (ioc_status == MPI3_IOCSTATUS_SUCCESS)) { + phy->minimum_linkrate = mpi3mr_convert_phy_link_rate( + phy_pg0.programmed_link_rate & + MPI3_SAS_PRATE_MIN_RATE_MASK); + phy->maximum_linkrate = mpi3mr_convert_phy_link_rate( + phy_pg0.programmed_link_rate >> 4); + phy->negotiated_linkrate = + mpi3mr_convert_phy_link_rate( + (phy_pg0.negotiated_link_rate & + MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) + >> MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT); + } + +out: + kfree(sas_io_unit_pg1); + return rc; +} + + +/** + * mpi3mr_map_smp_buffer - map BSG dma buffer + * @dev: Generic device reference + * @buf: BSG buffer pointer + * @dma_addr: Phyiscal address holder + * @dma_len: Mapped DMA buffer length. 
+ * @p: Virtual address holder + * + * This function maps the DMAable buffer + * + * Return: 0 on success, non-zero on failure + */ + +static int +mpi3mr_map_smp_buffer(struct device *dev, struct bsg_buffer *buf, + dma_addr_t *dma_addr, size_t *dma_len, void **p) +{ + /* Check if the request is split across multiple segments */ + if (buf->sg_cnt > 1) { + *p = dma_zalloc_coherent(dev, buf->payload_len, dma_addr, + GFP_KERNEL); + if (!*p) + return -ENOMEM; + *dma_len = buf->payload_len; + } else { + if (!dma_map_sg(dev, buf->sg_list, 1, DMA_BIDIRECTIONAL)) + return -ENOMEM; + *dma_addr = sg_dma_address(buf->sg_list); + *dma_len = sg_dma_len(buf->sg_list); + *p = NULL; + } + + return 0; +} + +/** + * mpi3mr_unmap_smp_buffer - unmap BSG dma buffer + * @dev: Generic device reference + * @buf: BSG buffer pointer + * @dma_addr: Phyiscal address to be unmapped + * @p: Virtual address + * + * This function unmaps the DMAable buffer + */ + +static void +mpi3mr_unmap_smp_buffer(struct device *dev, struct bsg_buffer *buf, + dma_addr_t dma_addr, void *p) +{ + if (p) + dma_free_coherent(dev, buf->payload_len, p, dma_addr); + else + dma_unmap_sg(dev, buf->sg_list, 1, DMA_BIDIRECTIONAL); +} + +/** + * mpi3mr_transport_smp_handler - handler for smp passthru + * @job: BSG job reference + * @shost: SCSI host object reference + * @rphy: SAS transport rphy object pointing the expander + * + * This is used primarily by smp utils for sending the SMP + * commands to the expanders attached to the controller + */ +static void +mpi3mr_transport_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy) +{ + + struct mpi3mr_ioc *mrioc = shost_priv(shost); + struct mpi3_smp_passthrough_request mpi_request; + struct mpi3_smp_passthrough_reply mpi_reply; + int rc; + void *psge; + dma_addr_t dma_addr_in; + dma_addr_t dma_addr_out; + void *addr_in = NULL; + void *addr_out = NULL; + size_t dma_len_in; + size_t dma_len_out; + unsigned int reslen = 0; + u16 request_sz = sizeof(struct mpi3_smp_passthrough_request); + u16 reply_sz = sizeof(struct mpi3_smp_passthrough_reply); + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + u16 ioc_status; + + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + rc = -EFAULT; + goto out; + } + + rc = mpi3mr_map_smp_buffer(&mrioc->pdev->dev, &job->request_payload, + &dma_addr_out, &dma_len_out, &addr_out); + if (rc) + goto out; + + if (addr_out) + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, addr_out, + job->request_payload.payload_len); + + rc = mpi3mr_map_smp_buffer(&mrioc->pdev->dev, &job->reply_payload, + &dma_addr_in, &dma_len_in, &addr_in); + if (rc) + goto unmap_out; + + memset(&mpi_request, 0, request_sz); + memset(&mpi_reply, 0, reply_sz); + mpi_request.host_tag = MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_SMP_PASSTHROUGH; + mpi_request.io_unit_port = (u8) mpi3mr_get_port_id_by_rphy(mrioc, rphy); + mpi_request.sas_address = ((rphy) ? 
+ cpu_to_le64(rphy->identify.sas_address) : + cpu_to_le64(mrioc->sas_hba.sas_address)); + psge = &mpi_request.request_sge; + mpi3mr_add_sg_single(psge, sgl_flags, dma_len_out - 4, dma_addr_out); + + psge = &mpi_request.response_sge; + mpi3mr_add_sg_single(psge, sgl_flags, dma_len_in - 4, dma_addr_in); + + + dprint_transport_info(mrioc, "sending SMP request \n"); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) + goto unmap_in; + + dprint_transport_info(mrioc, + "SMP request completed with ioc_status(0x%04x)\n", ioc_status); + + + dprint_transport_info(mrioc, + "SMP request - reply data transfer size(%d)\n", + le16_to_cpu(mpi_reply.response_data_length)); + + memcpy(job->reply, &mpi_reply, reply_sz); + job->reply_len = reply_sz; + reslen = le16_to_cpu(mpi_reply.response_data_length); + + if (addr_in) + sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, addr_in, + job->reply_payload.payload_len); + + rc = 0; +unmap_in: + mpi3mr_unmap_smp_buffer(&mrioc->pdev->dev, &job->reply_payload, + dma_addr_in, addr_in); +unmap_out: + mpi3mr_unmap_smp_buffer(&mrioc->pdev->dev, &job->request_payload, + dma_addr_out, addr_out); +out: + bsg_job_done(job, rc, reslen); + +} + + + +struct sas_function_template mpi3mr_transport_functions = { + .get_linkerrors = mpi3mr_transport_get_linkerrors, + .get_enclosure_identifier = mpi3mr_transport_get_enclosure_identifier, + .get_bay_identifier = mpi3mr_transport_get_bay_identifier, + .phy_reset = mpi3mr_transport_phy_reset, + .phy_enable = mpi3mr_transport_phy_enable, + .set_phy_speed = mpi3mr_transport_phy_speed, + .smp_handler = mpi3mr_transport_smp_handler, +}; + +struct scsi_transport_template *mpi3mr_transport_template; + +/** + * struct host_port - host port details + * @sas_address: SAS Address of the attached device + * @phy_mask: phy mask of host port + * @handle: Device Handle of attached device + * @iounit_port_id: port ID + * @used: host port is already matched with sas port from sas_port_list + * lowest_phy: lowest phy ID of host port + */ +struct host_port { + u64 sas_address; + u32 phy_mask; + u16 handle; + u8 iounit_port_id; + u8 used; + u8 lowest_phy; +}; + + +/** + * mpi3mr_update_mr_sas_port - update sas port objects during reset + * @mrioc: Adapter instance reference + * @h_port: host_port object + * @mr_sas_port: sas_port objects which needs to be updated + * + * Update the port ID of sas port object. Also add the phys if new phys got + * added to current sas port and remove the phys if some phys are moved + * out of the current sas port. + * + * Return: Nothing. 
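+ *
+ * Worked example of the phy mask arithmetic used below: if the port's
+ * old phy_mask is 0x0F and the refreshed host port reports 0x3C, then
+ * the XOR is 0x33, so phys 0x30 are newly added to the port and phys
+ * 0x03 are removed from it.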
+ */ +void +mpi3mr_update_mr_sas_port(struct mpi3mr_ioc *mrioc, struct host_port *h_port, + struct mpi3mr_sas_port *mr_sas_port) +{ + struct mpi3mr_sas_phy *mr_sas_phy; + u32 phy_mask_xor, phys_to_be_added, phys_to_be_removed; + int i; + + h_port->used = 1; + mr_sas_port->marked_responding = 1; + + dev_printk(KERN_INFO, &mr_sas_port->port->dev, + "sas_address(0x%016llx), old: port_id %d phy_mask 0x%x, new: port_id %d phy_mask:0x%x\n", + mr_sas_port->remote_identify.sas_address, + mr_sas_port->hba_port->port_id, mr_sas_port->phy_mask, + h_port->iounit_port_id, h_port->phy_mask); + + mr_sas_port->hba_port->port_id = h_port->iounit_port_id; + mr_sas_port->hba_port->flags &= ~MPI3MR_HBA_PORT_FLAG_DIRTY; + + /* Get the newly added phys bit map & removed phys bit map */ + phy_mask_xor = mr_sas_port->phy_mask ^ h_port->phy_mask; + phys_to_be_added = h_port->phy_mask & phy_mask_xor; + phys_to_be_removed = mr_sas_port->phy_mask & phy_mask_xor; + + /* Register these new phys to current mr_sas_port's port. + * if these phys are previously registered with another port + * then delete these phys from that port first. + */ + for_each_set_bit(i, (ulong *) &phys_to_be_added, BITS_PER_TYPE(u32)) { + mr_sas_phy = &mrioc->sas_hba.phy[i]; + if (mr_sas_phy->phy_belongs_to_port) + mpi3mr_del_phy_from_an_existing_port(mrioc, + &mrioc->sas_hba, mr_sas_phy); + mpi3mr_add_phy_to_an_existing_port(mrioc, + &mrioc->sas_hba, mr_sas_phy, + mr_sas_port->remote_identify.sas_address, + mr_sas_port->hba_port); + } + + /* Delete the phys which are not part of current mr_sas_port's port. */ + for_each_set_bit(i, (ulong *) &phys_to_be_removed, BITS_PER_TYPE(u32)) { + mr_sas_phy = &mrioc->sas_hba.phy[i]; + if (mr_sas_phy->phy_belongs_to_port) + mpi3mr_del_phy_from_an_existing_port(mrioc, + &mrioc->sas_hba, mr_sas_phy); + } +} + +/** + * mpi3mr_refresh_sas_ports - update host's sas ports during reset + * @mrioc: Adapter instance reference + * + * Update the host's sas ports during reset by checking whether + * sas ports are still intact or not. Add/remove phys if any hba + * phys are (moved in)/(moved out) of sas port. Also update + * io_unit_port if it got changed during reset. + * + * Return: Nothing. 
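+ *
+ * Matching of the refreshed host ports against the existing sas_port
+ * objects is done in three passes: first by the lowest phy ID, then by
+ * any overlap in the phy masks, and finally by SAS address alone (for
+ * the case where the expander was recabled to a different HBA port
+ * during the reset).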
+ */ +void +mpi3mr_refresh_sas_ports(struct mpi3mr_ioc *mrioc) +{ + struct host_port h_port[32]; + int i, j, found, host_port_count = 0, port_idx, num_phys; + u16 sz, attached_handle, ioc_status; + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0 = NULL; + struct mpi3_device_page0 dev_pg0; + struct mpi3_device0_sas_sata_format *sasinf; + struct mpi3mr_sas_port *mr_sas_port; + + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) + return; + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + + /* Create a new expander port table */ + num_phys = min_t(int, + mrioc->sas_hba.num_phys, MPI3MR_MAX_PHYSICAL_PHYS); + for (i = 0; i < num_phys; i++) { + attached_handle = le16_to_cpu( + sas_io_unit_pg0->phy_data[i].attached_dev_handle); + if (!attached_handle) + continue; + found = 0; + for (j = 0; j < host_port_count; j++) { + if (h_port[j].handle == attached_handle) { + h_port[j].phy_mask |= (1 << i); + found = 1; + break; + } + } + if (found) + continue; + if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &dev_pg0, + sizeof(dev_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, + attached_handle))) { + dprint_reset(mrioc, + "failed to read dev_pg0 for handle(0x%04x) at %s:%d/%s()!\n", + attached_handle, __FILE__, __LINE__, __func__); + continue; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + dprint_reset(mrioc, + "ioc_status(0x%x) while reading dev_pg0 for handle(0x%04x) at %s:%d/%s()!\n", + ioc_status, attached_handle, + __FILE__, __LINE__, __func__); + continue; + } + sasinf = &dev_pg0.device_specific.sas_sata_format; + + port_idx = host_port_count; + h_port[port_idx].sas_address = le64_to_cpu(sasinf->sas_address); + h_port[port_idx].handle = attached_handle; + h_port[port_idx].phy_mask = (1 << i); + h_port[port_idx].iounit_port_id = sas_io_unit_pg0->phy_data[i].io_unit_port; + h_port[port_idx].lowest_phy = sasinf->phy_num; + h_port[port_idx].used = 0; + host_port_count++; + } + + if (!host_port_count) + goto out; + + if (mrioc->logging_level & MPI3_DEBUG_RESET) { + ioc_info(mrioc, "Host port details before reset\n"); + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + ioc_info(mrioc, + "port_id:%d, sas_address:(0x%016llx), phy_mask:(0x%x), lowest phy id:%d\n", + mr_sas_port->hba_port->port_id, + mr_sas_port->remote_identify.sas_address, + mr_sas_port->phy_mask, mr_sas_port->lowest_phy); + } + mr_sas_port = NULL; + ioc_info(mrioc, "Host port details after reset\n"); + for (i = 0; i < host_port_count; i++) { + ioc_info(mrioc, + "port_id:%d, sas_address:(0x%016llx), phy_mask:(0x%x), lowest phy id:%d\n", + h_port[i].iounit_port_id, h_port[i].sas_address, + h_port[i].phy_mask, h_port[i].lowest_phy); + } + } + + /* mark all host sas port entries as dirty */ + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + mr_sas_port->marked_responding = 0; + mr_sas_port->hba_port->flags |= MPI3MR_HBA_PORT_FLAG_DIRTY; + } + + /* First check for matching lowest phy */ + for (i = 0; i < host_port_count; i++) { + mr_sas_port = NULL; + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + if (mr_sas_port->marked_responding) + continue; + if (h_port[i].sas_address != mr_sas_port->remote_identify.sas_address) + continue; + if (h_port[i].lowest_phy == mr_sas_port->lowest_phy) { + 
mpi3mr_update_mr_sas_port(mrioc, &h_port[i], mr_sas_port); + break; + } + } + } + + /* In case if lowest phy is got enabled or disabled during reset */ + for (i = 0; i < host_port_count; i++) { + if (h_port[i].used) + continue; + mr_sas_port = NULL; + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + if (mr_sas_port->marked_responding) + continue; + if (h_port[i].sas_address != mr_sas_port->remote_identify.sas_address) + continue; + if (h_port[i].phy_mask & mr_sas_port->phy_mask) { + mpi3mr_update_mr_sas_port(mrioc, &h_port[i], mr_sas_port); + break; + } + } + } + + /* In case if expander cable is removed & connected to another HBA port during reset */ + for (i = 0; i < host_port_count; i++) { + if (h_port[i].used) + continue; + mr_sas_port = NULL; + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + if (mr_sas_port->marked_responding) + continue; + if (h_port[i].sas_address != mr_sas_port->remote_identify.sas_address) + continue; + mpi3mr_update_mr_sas_port(mrioc, &h_port[i], mr_sas_port); + break; + } + } +out: + kfree(sas_io_unit_pg0); +} + +/** + * mpi3mr_refresh_expanders - Refresh expander device exposure + * @mrioc: Adapter instance reference + * + * This is executed post controller reset to identify any + * missing expander devices during reset and remove from the upper layers + * or expose any newly detected expander device to the upper layers. + * + * Return: Nothing. + */ +void +mpi3mr_refresh_expanders(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_sas_node *sas_expander, *sas_expander_next; + struct mpi3_sas_expander_page0 expander_pg0; + u16 ioc_status, handle; + u64 sas_address; + int i; + unsigned long flags; + struct mpi3mr_hba_port *hba_port; + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + list_for_each_entry(sas_expander, &mrioc->sas_expander_list, list) { + sas_expander->non_responding = 1; + } + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + sas_expander = NULL; + + handle = 0xffff; + + /* Search for responding expander devices and add them if they are newly got added */ + while (true) { + if ((mpi3mr_cfg_get_sas_exp_pg0(mrioc, &ioc_status, &expander_pg0, + sizeof(struct mpi3_sas_expander_page0), + MPI3_SAS_EXPAND_PGAD_FORM_GET_NEXT_HANDLE, handle))) { + dprint_reset(mrioc, + "failed to read exp pg0 for handle(0x%04x) at %s:%d/%s()!\n", + handle, __FILE__, __LINE__, __func__); + break; + } + + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + dprint_reset(mrioc, + "ioc_status(0x%x) while reading exp pg0 for handle:(0x%04x), %s:%d/%s()!\n", + ioc_status, handle, __FILE__, __LINE__, __func__); + break; + } + + handle = le16_to_cpu(expander_pg0.dev_handle); + sas_address = le64_to_cpu(expander_pg0.sas_address); + hba_port = mpi3mr_get_hba_port_by_id(mrioc, expander_pg0.io_unit_port, 0); + + if (!hba_port) { + mpi3mr_sas_host_refresh(mrioc); + mpi3mr_expander_add(mrioc, handle); + continue; + } + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + sas_expander = + mpi3mr_expander_find_by_sas_address(mrioc, + sas_address, hba_port); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + if (!sas_expander) { + mpi3mr_sas_host_refresh(mrioc); + mpi3mr_expander_add(mrioc, handle); + continue; + } + + sas_expander->non_responding = 0; + if (sas_expander->handle == handle) + continue; + + sas_expander->handle = handle; + for (i = 0 ; i < sas_expander->num_phys ; i++) + sas_expander->phy[i].handle = handle; + } + + /* Delete non responding expander devices and the corresponding hba_port ( if + * the 
non responding expander device's parent device is a host node.
+ */
+
+	sas_expander = NULL;
+	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+	list_for_each_entry_safe_reverse(sas_expander, sas_expander_next,
+	    &mrioc->sas_expander_list, list) {
+		if (sas_expander->non_responding) {
+			spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+			mpi3mr_expander_node_remove(mrioc, sas_expander);
+			spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+		}
+	}
+	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+}
+
+/**
+ * mpi3mr_expander_add - Create expander object
+ * @mrioc: Adapter instance reference
+ * @handle: Expander firmware device handle
+ *
+ * This function creates an expander object, stores it in the
+ * sas_expander_list and exposes it to the SAS transport
+ * layer.
+ *
+ * Return: 0 for success, non-zero for failure.
+ */
+int mpi3mr_expander_add(struct mpi3mr_ioc *mrioc, u16 handle)
+{
+	struct mpi3mr_sas_node *sas_expander;
+	struct mpi3mr_enclosure_node *enclosure_dev;
+	struct mpi3_sas_expander_page0 expander_pg0;
+	struct mpi3_sas_expander_page1 expander_pg1;
+	u16 ioc_status, parent_handle, temp_handle;
+	u64 sas_address, sas_address_parent = 0;
+	int i;
+	unsigned long flags;
+	u8 port_id, link_rate;
+	struct mpi3mr_sas_port *mr_sas_port = NULL;
+	struct mpi3mr_hba_port *hba_port;
+	u32 phynum_handle;
+	int rc = 0;
+
+	if (!handle)
+		return -1;
+
+	if (mrioc->reset_in_progress)
+		return -1;
+
+	if ((mpi3mr_cfg_get_sas_exp_pg0(mrioc, &ioc_status, &expander_pg0,
+	    sizeof(expander_pg0), MPI3_SAS_EXPAND_PGAD_FORM_HANDLE, handle))) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		return -1;
+	}
+
+	if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		return -1;
+	}
+
+	parent_handle = le16_to_cpu(expander_pg0.parent_dev_handle);
+	if (mpi3mr_get_sas_address(mrioc, parent_handle, &sas_address_parent)
+	    != 0) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		return -1;
+	}
+
+	port_id = expander_pg0.io_unit_port;
+	hba_port = mpi3mr_get_hba_port_by_id(mrioc, port_id, 0);
+	if (!hba_port) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		return -1;
+	}
+	if (sas_address_parent != mrioc->sas_hba.sas_address) {
+		spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+		sas_expander =
+		    mpi3mr_expander_find_by_sas_address(mrioc,
+		    sas_address_parent, hba_port);
+		spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+		if (!sas_expander) {
+			rc = mpi3mr_expander_add(mrioc, parent_handle);
+			if (rc != 0)
+				return rc;
+		} else {
+			/*
+			 * When there is a parent expander present, update its
+			 * phys where the child expander is connected with the
+			 * link speed, attached dev handle and sas address.
+			 */
+			for (i = 0 ; i < sas_expander->num_phys ; i++) {
+				phynum_handle =
+				    (i << MPI3_SAS_EXPAND_PGAD_PHYNUM_SHIFT) |
+				    parent_handle;
+				if ((mpi3mr_cfg_get_sas_exp_pg1(mrioc,
+				    &ioc_status, &expander_pg1,
+				    sizeof(expander_pg1),
+				    MPI3_SAS_EXPAND_PGAD_FORM_HANDLE_PHY_NUM,
+				    phynum_handle))) {
+					ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+					    __FILE__, __LINE__, __func__);
+					rc = -1;
+					return rc;
+				}
+				if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+					ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+					    __FILE__, __LINE__, __func__);
+					rc = -1;
+					return rc;
+				}
+				link_rate = (expander_pg1.negotiated_link_rate &
+				    MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >>
+				    MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT;
+				mpi3mr_update_links(mrioc, sas_address_parent,
+				    handle, i, link_rate, hba_port);
+			}
+		}
+	}
+
+	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+	sas_address = le64_to_cpu(expander_pg0.sas_address);
+	sas_expander = mpi3mr_expander_find_by_sas_address(mrioc,
+	    sas_address, hba_port);
+	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+
+	if (sas_expander)
+		return 0;
+
+	sas_expander = kzalloc(sizeof(struct mpi3mr_sas_node),
+	    GFP_KERNEL);
+	if (!sas_expander) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		return -1;
+	}
+
+	sas_expander->handle = handle;
+	sas_expander->num_phys = expander_pg0.num_phys;
+	sas_expander->sas_address_parent = sas_address_parent;
+	sas_expander->sas_address = sas_address;
+	sas_expander->hba_port = hba_port;
+
+	ioc_info(mrioc, "expander_add: handle(0x%04x),"
+	    " parent(0x%04x), sas_addr(0x%016llx), phys(%d)\n",
+	    handle, parent_handle, (unsigned long long)
+	    sas_expander->sas_address, sas_expander->num_phys);
+
+	if (!sas_expander->num_phys) {
+		rc = -1;
+		goto out_fail;
+	}
+	sas_expander->phy = kcalloc(sas_expander->num_phys,
+	    sizeof(struct mpi3mr_sas_phy), GFP_KERNEL);
+	if (!sas_expander->phy) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		rc = -1;
+		goto out_fail;
+	}
+
+	INIT_LIST_HEAD(&sas_expander->sas_port_list);
+	mr_sas_port = mpi3mr_sas_port_add(mrioc, handle, sas_address_parent,
+	    sas_expander->hba_port);
+	if (!mr_sas_port) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		rc = -1;
+		goto out_fail;
+	}
+	sas_expander->parent_dev = &mr_sas_port->rphy->dev;
+	sas_expander->rphy = mr_sas_port->rphy;
+
+	for (i = 0 ; i < sas_expander->num_phys ; i++) {
+		phynum_handle = (i << MPI3_SAS_EXPAND_PGAD_PHYNUM_SHIFT) |
+		    handle;
+		if ((mpi3mr_cfg_get_sas_exp_pg1(mrioc, &ioc_status,
+		    &expander_pg1, sizeof(expander_pg1),
+		    MPI3_SAS_EXPAND_PGAD_FORM_HANDLE_PHY_NUM,
+		    phynum_handle))) {
+			ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+			    __FILE__, __LINE__, __func__);
+			rc = -1;
+			goto out_fail;
+		}
+		if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+			ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+			    __FILE__, __LINE__, __func__);
+			rc = -1;
+			goto out_fail;
+		}
+		sas_expander->phy[i].handle = handle;
+		sas_expander->phy[i].phy_id = i;
+		sas_expander->phy[i].hba_port = hba_port;
+
+		if ((mpi3mr_add_expander_phy(mrioc, &sas_expander->phy[i],
+		    expander_pg1, sas_expander->parent_dev))) {
+			ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+			    __FILE__, __LINE__, __func__);
+			rc = -1;
+			goto out_fail;
+		}
+	}
+
+	if (sas_expander->enclosure_handle) {
+		enclosure_dev =
+		    mpi3mr_enclosure_find_by_handle(mrioc,
+		    sas_expander->enclosure_handle);
+		if (enclosure_dev)
+			sas_expander->enclosure_logical_id = le64_to_cpu(
+			    enclosure_dev->pg0.enclosure_logical_id);
+	}
+
+	mpi3mr_expander_node_add(mrioc, sas_expander);
+	return 0;
+
+out_fail:
+
+	if (mr_sas_port)
+		mpi3mr_sas_port_remove(mrioc,
+		    sas_expander->sas_address,
+		    sas_address_parent, sas_expander->hba_port);
+	kfree(sas_expander->phy);
+	kfree(sas_expander);
+	return rc;
+}
+
+/**
+ * mpi3mr_expander_node_remove - recursive removal of expander.
+ * @mrioc: Adapter instance reference
+ * @sas_expander: Expander device object
+ *
+ * Removes the expander object from the sas_expander_list,
+ * frees the associated memory and removes it from the SAS
+ * transport layer. If one of the attached devices is an
+ * expander, that expander is removed recursively as well.
+ *
+ * Return: Nothing.
+ */
+static void mpi3mr_expander_node_remove(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_node *sas_expander)
+{
+	struct mpi3mr_sas_port *mr_sas_port, *next;
+	unsigned long flags;
+	u8 port_id;
+
+	/* remove sibling ports attached to this expander */
+	list_for_each_entry_safe(mr_sas_port, next,
+	   &sas_expander->sas_port_list, port_list) {
+		if (mrioc->reset_in_progress)
+			return;
+		if (mr_sas_port->remote_identify.device_type ==
+		    SAS_END_DEVICE)
+			mpi3mr_remove_device_by_sas_address(mrioc,
+			    mr_sas_port->remote_identify.sas_address,
+			    mr_sas_port->hba_port);
+		else if (mr_sas_port->remote_identify.device_type ==
+		    SAS_EDGE_EXPANDER_DEVICE ||
+		    mr_sas_port->remote_identify.device_type ==
+		    SAS_FANOUT_EXPANDER_DEVICE)
+			mpi3mr_expander_remove(mrioc,
+			    mr_sas_port->remote_identify.sas_address,
+			    mr_sas_port->hba_port);
+	}
+	port_id = sas_expander->hba_port->port_id;
+	mpi3mr_sas_port_remove(mrioc, sas_expander->sas_address,
+	    sas_expander->sas_address_parent, sas_expander->hba_port);
+
+	ioc_info(mrioc, "expander_remove: handle(0x%04x), sas_addr(0x%016llx),"
+	    " port:%d\n", sas_expander->handle,
+	    (unsigned long long)sas_expander->sas_address, port_id);
+
+	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+	list_del(&sas_expander->list);
+	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+
+	kfree(sas_expander->phy);
+	kfree(sas_expander);
+}
+
+/**
+ * mpi3mr_expander_remove - Remove expander object
+ * @mrioc: Adapter instance reference
+ * @sas_address: SAS address of the expander to remove
+ * @hba_port: HBA port reference
+ *
+ * This function removes the expander object stored in
+ * mrioc->sas_expander_list and removes it from the SAS
+ * transport layer by calling mpi3mr_expander_node_remove().
+ *
+ * Return: None
+ */
+void mpi3mr_expander_remove(struct mpi3mr_ioc *mrioc, u64 sas_address,
+	struct mpi3mr_hba_port *hba_port)
+{
+	struct mpi3mr_sas_node *sas_expander;
+	unsigned long flags;
+
+	if (mrioc->reset_in_progress)
+		return;
+
+	if (!hba_port)
+		return;
+
+	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+	sas_expander = mpi3mr_expander_find_by_sas_address(mrioc, sas_address,
+	    hba_port);
+	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+	if (sas_expander)
+		mpi3mr_expander_node_remove(mrioc, sas_expander);
+}
+
+/**
+ * mpi3mr_sas_host_refresh - refresh sas host object contents
+ * @mrioc: Adapter instance reference
+ *
+ * This function refreshes the controller's phy information and
+ * updates the SAS transport layer with the updated information.
+ * It is executed for each device addition or device info
+ * change event.
+ *
+ * Return: None.
+ */ +void mpi3mr_sas_host_refresh(struct mpi3mr_ioc *mrioc) +{ + int i; + u8 link_rate; + u16 sz, port_id, attached_handle; + struct mpi3mr_hba_port *hba_port; + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0 = NULL; + + dprint_transport_info(mrioc, + "updating handles for sas_host(0x%016llx)\n", + (unsigned long long)mrioc->sas_hba.sas_address); + + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) + return; + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + + mrioc->sas_hba.handle = 0; + for (i = 0; i < mrioc->sas_hba.num_phys; i++) { + if (sas_io_unit_pg0->phy_data[i].phy_flags & + (MPI3_SASIOUNIT0_PHYFLAGS_HOST_PHY | + MPI3_SASIOUNIT0_PHYFLAGS_VIRTUAL_PHY)) + continue; + link_rate = + sas_io_unit_pg0->phy_data[i].negotiated_link_rate >> 4; + if (!mrioc->sas_hba.handle) + mrioc->sas_hba.handle = le16_to_cpu(sas_io_unit_pg0-> + phy_data[i].controller_dev_handle); + port_id = sas_io_unit_pg0->phy_data[i].io_unit_port; + if (!(mpi3mr_get_hba_port_by_id(mrioc, port_id, 0))) { + hba_port = kzalloc(sizeof(struct mpi3mr_hba_port), + GFP_KERNEL); + if (!hba_port) + goto out; + hba_port->port_id = port_id; + ioc_info(mrioc, "hba_port entry: %p," + " port: %d is added to hba_port list\n", + hba_port, hba_port->port_id); + if (mrioc->reset_in_progress) + hba_port->flags = MPI3MR_HBA_PORT_FLAG_NEW; + list_add_tail(&hba_port->list, + &mrioc->hba_port_table_list); + } + + mrioc->sas_hba.phy[i].handle = mrioc->sas_hba.handle; + attached_handle = le16_to_cpu(sas_io_unit_pg0->phy_data[i]. + attached_dev_handle); + if (attached_handle && link_rate < MPI3_SAS_NEG_LINK_RATE_1_5) + link_rate = MPI3_SAS_NEG_LINK_RATE_1_5; + mrioc->sas_hba.phy[i].hba_port = + mpi3mr_get_hba_port_by_id(mrioc,port_id, 0); + mpi3mr_update_links(mrioc, mrioc->sas_hba.sas_address, + attached_handle, i, link_rate, + mrioc->sas_hba.phy[i].hba_port); + } + out: + kfree(sas_io_unit_pg0); +} + +/** + * mpi3mr_sas_host_add - create sas host object + * @mrioc: Adapter instance reference + * + * This function creates the controllers phy information and + * updates the SAS transport layer with updated information, + * this is excecuted for first device addition or device info + * change event. + * + * Return: None. 
+ */ +void mpi3mr_sas_host_add(struct mpi3mr_ioc *mrioc) +{ + int i; + u16 sz, num_phys = 1, port_id, ioc_status; + struct mpi3mr_hba_port *hba_port; + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0 = NULL; + struct mpi3_sas_phy_page0 phy_pg0; + struct mpi3_device_page0 dev_pg0; + struct mpi3_enclosure_page0 encl_pg0; + struct mpi3_device0_sas_sata_format *sasinf; + + + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (num_phys * sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) + return; + + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + num_phys = sas_io_unit_pg0->num_phys; + kfree(sas_io_unit_pg0); + + mrioc->sas_hba.host_node = 1; + INIT_LIST_HEAD(&mrioc->sas_hba.sas_port_list); + mrioc->sas_hba.parent_dev = &mrioc->shost->shost_gendev; + mrioc->sas_hba.phy = kcalloc(num_phys, + sizeof(struct mpi3mr_sas_phy), GFP_KERNEL); + if (!mrioc->sas_hba.phy) + return; + + mrioc->sas_hba.num_phys = num_phys; + + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (num_phys * sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) + return; + + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + + mrioc->sas_hba.handle = 0; + for (i = 0; i < mrioc->sas_hba.num_phys; i++) { + if (sas_io_unit_pg0->phy_data[i].phy_flags & + (MPI3_SASIOUNIT0_PHYFLAGS_HOST_PHY | + MPI3_SASIOUNIT0_PHYFLAGS_VIRTUAL_PHY)) + continue; + if (mpi3mr_cfg_get_sas_phy_pg0(mrioc, &ioc_status, &phy_pg0, + sizeof(struct mpi3_sas_phy_page0), + MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER, i)) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + + if (!mrioc->sas_hba.handle) + mrioc->sas_hba.handle = le16_to_cpu(sas_io_unit_pg0-> + phy_data[i].controller_dev_handle); + port_id = sas_io_unit_pg0->phy_data[i].io_unit_port; + + if (!(mpi3mr_get_hba_port_by_id(mrioc, port_id, 0))) { + hba_port = kzalloc(sizeof(struct mpi3mr_hba_port), + GFP_KERNEL); + if (!hba_port) + goto out; + hba_port->port_id = port_id; + ioc_info(mrioc, "hba_port entry: %p," + " port: %d is added to hba_port list\n", + hba_port, hba_port->port_id); + list_add_tail(&hba_port->list, + &mrioc->hba_port_table_list); + } + + mrioc->sas_hba.phy[i].handle = mrioc->sas_hba.handle; + mrioc->sas_hba.phy[i].phy_id = i; + mrioc->sas_hba.phy[i].hba_port = + mpi3mr_get_hba_port_by_id(mrioc, port_id, 0); + mpi3mr_add_host_phy(mrioc, &mrioc->sas_hba.phy[i], + phy_pg0, mrioc->sas_hba.parent_dev); + } + if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &dev_pg0, + sizeof(dev_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, + mrioc->sas_hba.handle))) { + ioc_err(mrioc, "%s: device page0 read failed\n", __func__); + goto out; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "device page read failed for handle(0x%04x), with ioc_status(0x%04x) failure at %s:%d/%s()!\n", + mrioc->sas_hba.handle, ioc_status, __FILE__, __LINE__, + __func__); + goto out; + } + mrioc->sas_hba.enclosure_handle = + le16_to_cpu(dev_pg0.enclosure_handle); + sasinf = &dev_pg0.device_specific.sas_sata_format; + mrioc->sas_hba.sas_address = + le64_to_cpu(sasinf->sas_address); + 
ioc_info(mrioc, "host_add: handle(0x%04x), "
+	    "sas_addr(0x%016llx), phys(%d)\n",
+	    mrioc->sas_hba.handle,
+	    (unsigned long long) mrioc->sas_hba.sas_address,
+	    mrioc->sas_hba.num_phys);
+
+	if (mrioc->sas_hba.enclosure_handle) {
+		if (!(mpi3mr_cfg_get_enclosure_pg0(mrioc, &ioc_status,
+		    &encl_pg0, sizeof(encl_pg0),
+		    MPI3_ENCLOS_PGAD_FORM_HANDLE,
+		    mrioc->sas_hba.enclosure_handle)) &&
+		    (ioc_status == MPI3_IOCSTATUS_SUCCESS))
+			mrioc->sas_hba.enclosure_logical_id =
+			    le64_to_cpu(encl_pg0.enclosure_logical_id);
+	}
+
+out:
+	kfree(sas_io_unit_pg0);
+}
+
+/**
+ * mpi3mr_get_sas_negotiated_logical_linkrate - get linkrate
+ * @mrioc: Adapter instance reference
+ * @tgtdev: Target device
+ *
+ * This function identifies whether the target device is
+ * attached directly or through an expander, issues sas phy
+ * page0 or expander phy page1 accordingly and gets the link
+ * rate. If there is any failure in reading the pages, a link
+ * rate of 1.5 is returned.
+ *
+ * Return: logical link rate.
+ */
+static u8 mpi3mr_get_sas_negotiated_logical_linkrate(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_tgt_dev *tgtdev)
+{
+	u8 link_rate = MPI3_SAS_NEG_LINK_RATE_1_5, phy_number;
+	struct mpi3_sas_expander_page1 expander_pg1;
+	struct mpi3_sas_phy_page0 phy_pg0;
+	u32 phynum_handle;
+	u16 ioc_status;
+
+	phy_number = tgtdev->dev_spec.sas_sata_inf.phy_id;
+	if (!(tgtdev->devpg0_flag & MPI3_DEVICE0_FLAGS_ATT_METHOD_DIR_ATTACHED))
+	{
+		phynum_handle =
+		    ((phy_number << MPI3_SAS_EXPAND_PGAD_PHYNUM_SHIFT) |
+		     tgtdev->parent_handle);
+		if (mpi3mr_cfg_get_sas_exp_pg1(mrioc, &ioc_status,
+		    &expander_pg1, sizeof(expander_pg1),
+		    MPI3_SAS_EXPAND_PGAD_FORM_HANDLE_PHY_NUM,
+		    phynum_handle)) {
+			ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+			    __FILE__, __LINE__, __func__);
+			goto out;
+		}
+		if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+			ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+			    __FILE__, __LINE__, __func__);
+			goto out;
+		}
+		link_rate = (expander_pg1.negotiated_link_rate &
+		    MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >>
+		    MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT;
+		goto out;
+	}
+	if (mpi3mr_cfg_get_sas_phy_pg0(mrioc, &ioc_status, &phy_pg0,
+	    sizeof(struct mpi3_sas_phy_page0),
+	    MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER, phy_number)) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		goto out;
+	}
+	if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		goto out;
+	}
+	link_rate = (phy_pg0.negotiated_link_rate &
+	    MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >>
+	    MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT;
+out:
+	return link_rate;
+}
+
+/**
+ * mpi3mr_report_tgtdev_to_sas_transport - expose dev to SAS TL
+ * @mrioc: Adapter instance reference
+ * @tgtdev: Target device
+ *
+ * This function exposes the target device after
+ * preparing host_phy, setting up link rate etc.
+ *
+ * Return: 0 on success, non-zero for failure.
+ */ +int mpi3mr_report_tgtdev_to_sas_transport(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + int retval = 0; + u8 link_rate, parent_phy_number; + u64 sas_address_parent, sas_address; + struct mpi3mr_hba_port *hba_port; + u8 port_id; + + if ((tgtdev->dev_type != MPI3_DEVICE_DEVFORM_SAS_SATA) || + !mrioc->sas_transport_enabled) + return -1; + + sas_address = tgtdev->dev_spec.sas_sata_inf.sas_address; + if (!mrioc->sas_hba.num_phys) + mpi3mr_sas_host_add(mrioc); + else + mpi3mr_sas_host_refresh(mrioc); + + if (mpi3mr_get_sas_address(mrioc, tgtdev->parent_handle, + &sas_address_parent) != 0) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -1; + } + tgtdev->dev_spec.sas_sata_inf.sas_address_parent = sas_address_parent; + + parent_phy_number = tgtdev->dev_spec.sas_sata_inf.phy_id; + port_id = tgtdev->io_unit_port; + + hba_port = mpi3mr_get_hba_port_by_id(mrioc, port_id, 0); + if (!hba_port) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -1; + } + tgtdev->dev_spec.sas_sata_inf.hba_port = hba_port; + + link_rate = mpi3mr_get_sas_negotiated_logical_linkrate(mrioc, tgtdev); + + mpi3mr_update_links(mrioc, sas_address_parent, tgtdev->dev_handle, + parent_phy_number, link_rate, hba_port); + + tgtdev->host_exposed = 1; + if (!mpi3mr_sas_port_add(mrioc, tgtdev->dev_handle, + sas_address_parent, hba_port)) { + tgtdev->host_exposed = 0; + retval = -1; + } else if ((!tgtdev->starget)) { + if (!mrioc->is_driver_loading) + mpi3mr_sas_port_remove(mrioc, sas_address, + sas_address_parent, hba_port); + tgtdev->host_exposed = 0; + retval = -1; + } + return retval; +} + +/** + * mpi3mr_remove_tgtdev_from_sas_transport - remove from SAS TL + * @mrioc: Adapter instance reference + * @tgtdev: Target device + * + * This function function removes the target device + * + * Return: None. + */ +void mpi3mr_remove_tgtdev_from_sas_transport(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + u64 sas_address_parent, sas_address; + struct mpi3mr_hba_port *hba_port; + + if ((tgtdev->dev_type != MPI3_DEVICE_DEVFORM_SAS_SATA) || + !mrioc->sas_transport_enabled) + return; + + hba_port = tgtdev->dev_spec.sas_sata_inf.hba_port; + sas_address = tgtdev->dev_spec.sas_sata_inf.sas_address; + sas_address_parent = tgtdev->dev_spec.sas_sata_inf.sas_address_parent; + mpi3mr_sas_port_remove(mrioc, sas_address, sas_address_parent, + hba_port); + tgtdev->host_exposed = 0; +} + + diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 58f66176bcb28..f0468990cefe9 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3189,15 +3189,22 @@ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); - struct scsi_device *sdp = sdkp->device; - struct request_queue *q = sdkp->disk->queue; - sector_t old_capacity = sdkp->capacity; + struct scsi_device *sdp; + struct request_queue *q; + sector_t old_capacity; unsigned char *buffer; unsigned int dev_max, rw_max; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); + if (WARN_ONCE((!sdkp), "Invalid scsi_disk from %p\n", disk)) + goto out; + + sdp = sdkp->device; + q = sdkp->disk->queue; + old_capacity = sdkp->capacity; + /* * If the device is offline, don't try and read capacity or any * of the other niceties. 
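The sd_revalidate_disk() hunk above defers the sdkp-derived initializers until after a WARN_ONCE() check, so a revalidate that races with device teardown warns once and bails out instead of dereferencing a NULL scsi_disk. The following sketch illustrates the same guard pattern in isolation; it is not part of the patch, and the names (struct widget, widget_from_cookie(), revalidate_widget()) are hypothetical stand-ins for the real SCSI structures and lookups.

#include <linux/bug.h>		/* WARN_ONCE() */
#include <linux/errno.h>
#include <linux/types.h>	/* sector_t */

/* Hypothetical per-device state; stands in for struct scsi_disk. */
struct widget {
	sector_t capacity;
};

/*
 * Hypothetical lookup; stands in for scsi_disk(disk) and may return
 * NULL while the device is being torn down.
 */
static struct widget *widget_from_cookie(void *cookie)
{
	return cookie;
}

static int revalidate_widget(void *cookie)
{
	struct widget *w = widget_from_cookie(cookie);
	sector_t old_capacity;

	/* Warn once and bail out rather than dereferencing a NULL pointer. */
	if (WARN_ONCE(!w, "invalid widget from %p\n", cookie))
		return 0;

	/* Members are read only after the guard, as in the sd.c hunk. */
	old_capacity = w->capacity;

	return old_capacity ? 0 : -ENODEV;
}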
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 2d0310448eba0..7ac7b71259169 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -70,6 +70,8 @@ source "drivers/staging/fwserial/Kconfig" source "drivers/staging/goldfish/Kconfig" +source "drivers/staging/lustrefsx/Kconfig" + source "drivers/staging/netlogic/Kconfig" source "drivers/staging/gs_fpgaboot/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 757a892ab5b9a..968031df9e110 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_STAGING_BOARD) += board/ obj-$(CONFIG_LTE_GDM724X) += gdm724x/ obj-$(CONFIG_FIREWIRE_SERIAL) += fwserial/ obj-$(CONFIG_GOLDFISH) += goldfish/ +obj-$(CONFIG_LUSTREFSX_LNET) += lustrefsx/ obj-$(CONFIG_GS_FPGABOOT) += gs_fpgaboot/ obj-$(CONFIG_UNISYSSPAR) += unisys/ obj-$(CONFIG_COMMON_CLK_XLNX_CLKWZRD) += clocking-wizard/ diff --git a/drivers/staging/lustrefsx/Kconfig b/drivers/staging/lustrefsx/Kconfig new file mode 100644 index 0000000000000..81e9bc1043d76 --- /dev/null +++ b/drivers/staging/lustrefsx/Kconfig @@ -0,0 +1,3 @@ +source "drivers/staging/lustrefsx/libcfs/Kconfig" +source "drivers/staging/lustrefsx/lnet/Kconfig" +source "drivers/staging/lustrefsx/lustre/Kconfig" diff --git a/drivers/staging/lustrefsx/Makefile b/drivers/staging/lustrefsx/Makefile new file mode 100644 index 0000000000000..20c7929213c3f --- /dev/null +++ b/drivers/staging/lustrefsx/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet/ +obj-$(CONFIG_LUSTREFSX_FS) += lustre/ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs/ diff --git a/drivers/staging/lustrefsx/Makefile.rules b/drivers/staging/lustrefsx/Makefile.rules new file mode 100644 index 0000000000000..ce56ffa5576a0 --- /dev/null +++ b/drivers/staging/lustrefsx/Makefile.rules @@ -0,0 +1,7 @@ +ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/undef.h +ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/config.h +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/libcfs/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include/uapi +ccflags-y += -Wno-format-truncation -Werror diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h new file mode 100644 index 0000000000000..9a97870d7fcc7 --- /dev/null +++ b/drivers/staging/lustrefsx/config.h @@ -0,0 +1,1193 @@ +/* config.h. Generated from config.h.in by configure. */ +/* config.h.in. Generated from configure.ac by autoheader. 
*/ + +/* enable libcfs CDEBUG, CWARN */ +#define CDEBUG_ENABLED 1 + +/* enable libcfs ENTRY/EXIT */ +#define CDEBUG_ENTRY_EXIT 1 + +/* enable page state tracking code */ +/* #undef CONFIG_DEBUG_PAGESTATE_TRACKING */ + +/* enable encryption for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_ENCRYPTION */ + +/* posix acls for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_POSIX_ACL */ + +/* enable rw access for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_RW */ + +/* fs security for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_SECURITY */ + +/* extened attributes for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_XATTR */ + +/* enable invariant checking */ +/* #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +/* kernel has cpu affinity support */ +#define CPU_AFFINITY 1 + +/* both i_dentry/d_alias uses list */ +/* #undef DATA_FOR_LLITE_IS_LIST */ + +/* name of ldiskfs debug program */ +#define DEBUGFS "debugfs" + +/* name of ldiskfs dump program */ +#define DUMPE2FS "dumpe2fs" + +/* name of ldiskfs fsck program */ +#define E2FSCK "e2fsck" + +/* name of ldiskfs e2fsprogs package */ +#define E2FSPROGS "e2fsprogs" + +/* name of ldiskfs label program */ +#define E2LABEL "e2label" + +/* do data checksums */ +#define ENABLE_CHECKSUM 1 + +/* enable flock by default */ +#define ENABLE_FLOCK 1 + +/* Use the Pinger */ +#define ENABLE_PINGER 1 + +/* aes-sha2 is supported by krb5 */ +/* #undef HAVE_AES_SHA2_SUPPORT */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ASM_TYPES_H 1 + +/* backing_dev_info exist */ +/* #undef HAVE_BACKING_DEV_INFO */ + +/* BDI_CAP_MAP_COPY exist */ +/* #undef HAVE_BDI_CAP_MAP_COPY */ + +/* bio_endio takes only one argument */ +#define HAVE_BIO_ENDIO_USES_ONE_ARG 1 + +/* bio_end_sector is defined */ +#define HAVE_BIO_END_SECTOR 1 + +/* 'bio_integrity_enabled' is available */ +/* #undef HAVE_BIO_INTEGRITY_ENABLED */ + +/* kernel has bio_integrity_prep_fn */ +/* #undef HAVE_BIO_INTEGRITY_PREP_FN */ + +/* bio_integrity_payload.bip_iter exist */ +#define HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD 1 + +/* 'bi_bdev' is available */ +/* #undef HAVE_BI_BDEV */ + +/* struct bio has bi_cnt */ +/* #undef HAVE_BI_CNT */ + +/* struct bio has bi_rw */ +/* #undef HAVE_BI_RW */ + +/* 'bi_status' is available */ +#define HAVE_BI_STATUS 1 + +/* blkdev_get_by_dev is exported by the kernel */ +#define HAVE_BLKDEV_GET_BY_DEV 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_BLKID_BLKID_H */ + +/* blk_plug struct exists */ +#define HAVE_BLK_PLUG 1 + +/* blk_queue_max_segments is defined */ +#define HAVE_BLK_QUEUE_MAX_SEGMENTS 1 + +/* kernel hash_64() is broken */ +/* #undef HAVE_BROKEN_HASH_64 */ + +/* kernel has struct bvec_iter */ +#define HAVE_BVEC_ITER 1 + +/* struct cache_detail has writers */ +#define HAVE_CACHE_DETAIL_WRITERS 1 + +/* if cache_detail->hash_lock is a spinlock */ +#define HAVE_CACHE_HASH_SPINLOCK 1 + +/* cache_head has hlist cache_list */ +#define HAVE_CACHE_HEAD_HLIST 1 + +/* have cache_register */ +/* #undef HAVE_CACHE_REGISTER */ + +/* cancel_dirty_page is still available */ +/* #undef HAVE_CANCEL_DIRTY_PAGE */ + +/* kernel has clean_bdev_aliases */ +/* #undef HAVE_CLEAN_BDEV_ALIASES */ + +/* 'clear_and_wake_up_bit' is available */ +#define HAVE_CLEAR_AND_WAKE_UP_BIT 1 + +/* have clear_inode */ +#define HAVE_CLEAR_INODE 1 + +/* compat rdma found */ +/* #undef HAVE_COMPAT_RDMA */ + +/* 'cpu_read_lock' exist */ +#define HAVE_CPUS_READ_LOCK 1 + +/* kernel compiled with CRC32 functions */ +#define HAVE_CRC32 1 + +/* crypto hash helper functions are available */ +#define HAVE_CRYPTO_HASH_HELPERS 1 + +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#define HAVE_CRYPTO_MAX_ALG_NAME_128 1 + +/* current_time() has replaced CURRENT_TIME */ +#define HAVE_CURRENT_TIME 1 + +/* dcache_lock is exist */ +/* #undef HAVE_DCACHE_LOCK */ + +/* kernel export delete_from_page_cache */ +#define HAVE_DELETE_FROM_PAGE_CACHE 1 + +/* dentry.d_child exist */ +#define HAVE_DENTRY_D_CHILD 1 + +/* hlist dentry.d_u.d_alias exist */ +#define HAVE_DENTRY_D_U_D_ALIAS 1 + +/* dentry_open uses struct path as first argument */ +#define HAVE_DENTRY_OPEN_USE_PATH 1 + +/* DES3 enctype is supported by krb5 */ +/* #undef HAVE_DES3_SUPPORT */ + +/* direct_IO need 2 arguments */ +#define HAVE_DIRECTIO_2ARGS 1 + +/* direct IO uses iov_iter */ +/* #undef HAVE_DIRECTIO_ITER */ + +/* dirty_inode super_operation takes flag */ +#define HAVE_DIRTY_INODE_HAS_FLAG 1 + +/* dir_context exist */ +#define HAVE_DIR_CONTEXT 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_DLFCN_H 1 + +/* Have dmu_object_alloc_dnsize in ZFS */ +/* #undef HAVE_DMU_OBJECT_ALLOC_DNSIZE */ + +/* Have dmu_objset_disown() with 3 args */ +/* #undef HAVE_DMU_OBJSET_DISOWN_3ARG */ + +/* Have dmu_objset_own() with 6 args */ +/* #undef HAVE_DMU_OBJSET_OWN_6ARG */ + +/* Have 6 argument dmu_pretch in ZFS */ +/* #undef HAVE_DMU_PREFETCH_6ARG */ + +/* Have dmu_read_by_dnode() in ZFS */ +/* #undef HAVE_DMU_READ_BY_DNODE */ + +/* Have dmu_tx_hold_write_by_dnode() in ZFS */ +/* #undef HAVE_DMU_TX_HOLD_WRITE_BY_DNODE */ + +/* Have dmu_tx_hold_zap_by_dnode() in ZFS */ +/* #undef HAVE_DMU_TX_HOLD_ZAP_BY_DNODE */ + +/* Have dmu_tx_mark_netfree */ +/* #undef HAVE_DMU_TX_MARK_NETFREE */ + +/* Have native dnode accounting in ZFS */ +/* #undef HAVE_DMU_USEROBJ_ACCOUNTING */ + +/* Have dmu_write_by_dnode() in ZFS */ +/* #undef HAVE_DMU_WRITE_BY_DNODE */ + +/* quotactl_ops.set_dqblk takes struct fs_disk_quota */ +/* #undef HAVE_DQUOT_FS_DISK_QUOTA */ + +/* quotactl_ops.set_dqblk takes struct kqid */ +#define HAVE_DQUOT_KQID 1 + +/* quotactl_ops.set_dqblk takes struct qc_dqblk */ +#define HAVE_DQUOT_QC_DQBLK 1 + +/* dquot_suspend is defined */ +#define HAVE_DQUOT_SUSPEND 1 + +/* Have dsl_pool_config_enter/exit in ZFS */ +/* #undef HAVE_DSL_POOL_CONFIG */ + +/* Have dsl_sync_task_do_nowait in ZFS */ +/* #undef HAVE_DSL_SYNC_TASK_DO_NOWAIT */ + +/* dump_trace want address argument */ +/* #undef HAVE_DUMP_TRACE_ADDRESS */ + +/* d_compare need 4 arguments */ +#define HAVE_D_COMPARE_4ARGS 1 + +/* d_compare need 5 arguments */ +/* #undef HAVE_D_COMPARE_5ARGS */ + +/* d_compare need 7 arguments */ +/* #undef HAVE_D_COMPARE_7ARGS */ + +/* d_count exist */ +#define HAVE_D_COUNT 1 + +/* d_delete first parameter declared is not const */ +#define HAVE_D_DELETE_CONST const + +/* d_hash_and_lookup is exported by the kernel */ +#define HAVE_D_HASH_AND_LOOKUP 1 + +/* have d_make_root */ +#define HAVE_D_MAKE_ROOT 1 + +/* have parent inode as parameter */ +#define HAVE_ENCODE_FH_PARENT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ENDIAN_H 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_EXT2FS_EXT2FS_H */ + +/* ext4_bread takes 4 arguments */ +/* #undef HAVE_EXT4_BREAD_4ARGS */ + +/* i_dquot is in ext4_inode_info */ +/* #undef HAVE_EXT4_INFO_DQUOT */ + +/* ext4_free_blocks do not require struct buffer_head */ +/* #undef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD */ + +/* Linux kernel has ext_pblock */ +/* #undef HAVE_EXT_PBLOCK */ + +/* file handle and related syscalls are supported */ +#define HAVE_FHANDLE_GLIBC_SUPPORT 1 + +/* kernel supports fhandles and related syscalls */ +#define HAVE_FHANDLE_SYSCALLS 1 + +/* kernel has file_dentry */ +#define HAVE_FILE_DENTRY 1 + +/* file_operations.fsync takes 2 arguments */ +/* #undef HAVE_FILE_FSYNC_2ARGS */ + +/* file_operations.fsync takes 4 arguments */ +#define HAVE_FILE_FSYNC_4ARGS 1 + +/* struct file has member f_inode */ +#define HAVE_FILE_F_INODE 1 + +/* file_inode() has been defined */ +#define HAVE_FILE_INODE 1 + +/* generic_file_llseek_size is exported by the kernel */ +#define HAVE_FILE_LLSEEK_SIZE 1 + +/* kernel has generic_file_llseek_size with 5 args */ +#define HAVE_FILE_LLSEEK_SIZE_5ARGS 1 + +/* file_operations.[read|write]_iter functions exist */ +#define HAVE_FILE_OPERATIONS_READ_WRITE_ITER 1 + +/* filldir_t needs struct dir_context as argument */ +#define HAVE_FILLDIR_USE_CTX 1 + +/* FMR pool API is available */ +/* #undef HAVE_FMR_POOL_API */ + +/* fpu/api.h is present */ +#define HAVE_FPU_API_HEADER 1 + +/* struct file_system_type has mount field */ +#define HAVE_FSTYPE_MOUNT 1 + +/* fs_struct.lock use rwlock */ +/* #undef HAVE_FS_STRUCT_RWLOCK */ + +/* fs_struct use seqcount */ +/* #undef HAVE_FS_STRUCT_SEQCOUNT */ + +/* full_name_hash need 3 arguments */ +#define HAVE_FULL_NAME_HASH_3ARGS 1 + +/* generic_permission taken 2 arguments */ +#define HAVE_GENERIC_PERMISSION_2ARGS 1 + +/* generic_permission taken 4 arguments */ +/* #undef HAVE_GENERIC_PERMISSION_4ARGS */ + +/* generic_write_sync need 2 arguments */ +#define HAVE_GENERIC_WRITE_SYNC_2ARGS 1 + +/* Define to 1 if you have the `gethostbyname' function. 
*/ +#define HAVE_GETHOSTBYNAME 1 + +/* 'get_acl' has a rcu argument */ +/* #undef HAVE_GET_ACL_RCU_ARG */ + +/* get_request_key_auth() is available */ +#define HAVE_GET_REQUEST_KEY_AUTH 1 + +/* get_user_pages takes 6 arguments */ +/* #undef HAVE_GET_USER_PAGES_6ARG */ + +/* get_user_pages takes gup_flags in arguments */ +#define HAVE_GET_USER_PAGES_GUP_FLAGS 1 + +/* struct group_info has member gid */ +#define HAVE_GROUP_INFO_GID 1 + +/* Define this is if you enable gss */ +/* #undef HAVE_GSS */ + +/* Define this if you enable gss keyring backend */ +/* #undef HAVE_GSS_KEYRING */ + +/* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ +/* #undef HAVE_GSS_KRB5_CCACHE_NAME */ + +/* '__rhashtable_insert_fast()' returns int */ +/* #undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT */ + +/* Define this if you have Heimdal Kerberos libraries */ +/* #undef HAVE_HEIMDAL */ + +/* hlist_add_after is available */ +/* #undef HAVE_HLIST_ADD_AFTER */ + +/* hlist_for_each_entry has 3 args */ +#define HAVE_HLIST_FOR_EACH_3ARG 1 + +/* hotplug state machine is supported */ +#define HAVE_HOTPLUG_STATE_MACHINE 1 + +/* ib_alloc_fast_reg_mr is defined */ +/* #undef HAVE_IB_ALLOC_FAST_REG_MR */ + +/* ib_alloc_pd has 2 arguments */ +#define HAVE_IB_ALLOC_PD_2ARGS 1 + +/* struct ib_cq_init_attr is used by ib_create_cq */ +#define HAVE_IB_CQ_INIT_ATTR 1 + +/* struct ib_device.attrs is defined */ +#define HAVE_IB_DEVICE_ATTRS 1 + +/* if struct ib_device_ops is defined */ +/* #undef HAVE_IB_DEVICE_OPS */ + +/* ib_get_dma_mr is defined */ +/* #undef HAVE_IB_GET_DMA_MR */ + +/* function ib_inc_rkey exist */ +#define HAVE_IB_INC_RKEY 1 + +/* ib_map_mr_sg exists */ +#define HAVE_IB_MAP_MR_SG 1 + +/* ib_map_mr_sg has 5 arguments */ +#define HAVE_IB_MAP_MR_SG_5ARGS 1 + +/* ib_post_send and ib_post_recv have const parameters */ +#define HAVE_IB_POST_SEND_RECV_CONST 1 + +/* struct ib_rdma_wr is defined */ +#define HAVE_IB_RDMA_WR 1 + +/* if ib_sg_dma_address wrapper exists */ +/* #undef HAVE_IB_SG_DMA_ADDRESS */ + +/* INIT_LIST_HEAD_RCU exists */ +#define HAVE_INIT_LIST_HEAD_RCU 1 + +/* inode_operations .getattr member function can gather advance stats */ +#define HAVE_INODEOPS_ENHANCED_GETATTR 1 + +/* inode_operations has .truncate member function */ +/* #undef HAVE_INODEOPS_TRUNCATE */ + +/* inode_operations use umode_t as parameter */ +#define HAVE_INODEOPS_USE_UMODE_T 1 + +/* inode->i_alloc_sem is killed and use inode_dio_wait */ +#define HAVE_INODE_DIO_WAIT 1 + +/* inode.i_rcu exists */ +#define HAVE_INODE_I_RCU 1 + +/* inode_lock is defined */ +#define HAVE_INODE_LOCK 1 + +/* inode_owner_or_capable exist */ +#define HAVE_INODE_OWNER_OR_CAPABLE 1 + +/* inode_operations->permission has two args */ +#define HAVE_INODE_PERMISION_2ARGS 1 + +/* inode times are using timespec64 */ +#define HAVE_INODE_TIMESPEC64 1 + +/* blk_integrity.interval exist */ +/* #undef HAVE_INTERVAL_BLK_INTEGRITY */ + +/* blk_integrity.interval_exp exist */ +#define HAVE_INTERVAL_EXP_BLK_INTEGRITY 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_INTTYPES_H 1 + +/* address_space_operations.invalidatepage needs 3 arguments */ +#define HAVE_INVALIDATE_RANGE 1 + +/* have in_compat_syscall */ +#define HAVE_IN_COMPAT_SYSCALL 1 + +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#define HAVE_IN_DEV_FOR_EACH_IFA_RTNL 1 + +/* inode_operations->rename need flags as argument */ +#define HAVE_IOPS_RENAME_WITH_FLAGS 1 + +/* have iop atomic_open */ +#define HAVE_IOP_ATOMIC_OPEN 1 + +/* generic_readlink has been removed */ +/* #undef HAVE_IOP_GENERIC_READLINK */ + +/* inode_operations has .get_acl member function */ +#define HAVE_IOP_GET_ACL 1 + +/* have iop get_link */ +#define HAVE_IOP_GET_LINK 1 + +/* inode_operations has .set_acl member function */ +#define HAVE_IOP_SET_ACL 1 + +/* inode_operations has {get,set,remove}xattr members */ +/* #undef HAVE_IOP_XATTR */ + +/* if iov_iter has member iter_type */ +#define HAVE_IOV_ITER_HAS_TYPE_MEMBER 1 + +/* iov_iter_init handles directional tag */ +#define HAVE_IOV_ITER_INIT_DIRECTION 1 + +/* iov_iter_rw exist */ +#define HAVE_IOV_ITER_RW 1 + +/* iov_iter_truncate exists */ +#define HAVE_IOV_ITER_TRUNCATE 1 + +/* if iov_iter_type exists */ +#define HAVE_IOV_ITER_TYPE 1 + +/* is_root_inode defined */ +#define HAVE_IS_ROOT_INODE 1 + +/* is_sxid is defined */ +#define HAVE_IS_SXID 1 + +/* 'iterate_shared' is available */ +#define HAVE_ITERATE_SHARED 1 + +/* struct address_space has i_pages */ +#define HAVE_I_PAGES 1 + +/* i_uid_read is present */ +#define HAVE_I_UID_READ 1 + +/* kallsyms_lookup_name is exported by kernel */ +/* #undef HAVE_KALLSYMS_LOOKUP_NAME */ + +/* kernel_locked is defined */ +/* #undef HAVE_KERNEL_LOCKED */ + +/* 'kernel_param_[un]lock' is available */ +#define HAVE_KERNEL_PARAM_LOCK 1 + +/* 'struct kernel_param_ops' is available */ +#define HAVE_KERNEL_PARAM_OPS 1 + +/* kernel_setsockopt still in use */ +/* #undef HAVE_KERNEL_SETSOCKOPT */ + +/* 'struct sock' accept function requires bool argument */ +#define HAVE_KERN_SOCK_ACCEPT_FLAG_ARG 1 + +/* 'getname' has two args */ +#define HAVE_KERN_SOCK_GETNAME_2ARGS 1 + +/* struct key_match_data exist */ +#define HAVE_KEY_MATCH_DATA 1 + +/* payload.data is an array */ +#define HAVE_KEY_PAYLOAD_DATA_ARRAY 1 + +/* key_type->instantiate has two args */ +#define HAVE_KEY_TYPE_INSTANTIATE_2ARGS 1 + +/* key.usage is of type refcount_t */ +#define HAVE_KEY_USAGE_REFCOUNT 1 + +/* ki_left exist */ +/* #undef HAVE_KIOCB_KI_LEFT */ + +/* ki_nbytes field exist */ +/* #undef HAVE_KI_NBYTES */ + +/* have kmap_atomic has only 1 argument */ +#define HAVE_KMAP_ATOMIC_HAS_1ARG 1 + +/* kmap_to_page is exported by the kernel */ +/* #undef HAVE_KMAP_TO_PAGE */ + +/* Define this if you have MIT Kerberos libraries */ +/* #undef HAVE_KRB5 */ + +/* Define this if the function krb5int_derive_key is available */ +/* #undef HAVE_KRB5INT_DERIVE_KEY */ + +/* Define this if the function krb5_derive_key is available */ +/* #undef HAVE_KRB5_DERIVE_KEY */ + +/* Define this if the function krb5_get_error_message is available */ +/* #undef HAVE_KRB5_GET_ERROR_MESSAGE */ + +/* Define this if the function krb5_get_init_creds_opt_set_addressless is + available */ +/* #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS */ + +/* kset_find_obj is exported by the kernel */ +#define HAVE_KSET_FIND_OBJ 1 + +/* kernel has kstrtobool_from_user */ +#define HAVE_KSTRTOBOOL_FROM_USER 1 + +/* kernel has kstrtoul */ +#define HAVE_KSTRTOUL 1 + +/* kthread_worker found */ +/* #undef HAVE_KTHREAD_WORK */ + +/* ktime_add is available */ +#define HAVE_KTIME_ADD 1 + +/* 
ktime_after is available */ +#define HAVE_KTIME_AFTER 1 + +/* ktime_before is available */ +#define HAVE_KTIME_BEFORE 1 + +/* ktime_compare is available */ +#define HAVE_KTIME_COMPARE 1 + +/* 'ktime_get_real_seconds' is available */ +#define HAVE_KTIME_GET_REAL_SECONDS 1 + +/* 'ktime_get_real_ts64' is available */ +#define HAVE_KTIME_GET_REAL_TS64 1 + +/* 'ktime_get_seconds' is available */ +#define HAVE_KTIME_GET_SECONDS 1 + +/* 'ktime_get_ts64' is available */ +#define HAVE_KTIME_GET_TS64 1 + +/* 'ktime_ms_delta' is available */ +#define HAVE_KTIME_MS_DELTA 1 + +/* 'ktime_to_timespec64' is available */ +#define HAVE_KTIME_TO_TIMESPEC64 1 + +/* enable use of ldiskfsprogs package */ +/* #undef HAVE_LDISKFSPROGS */ + +/* kernel has ext4_map_blocks */ +/* #undef HAVE_LDISKFS_MAP_BLOCKS */ + +/* Enable ldiskfs osd */ +/* #undef HAVE_LDISKFS_OSD */ + +/* libefence support is requested */ +/* #undef HAVE_LIBEFENCE */ + +/* Define to 1 if you have the `keyutils' library (-lkeyutils). */ +/* #undef HAVE_LIBKEYUTILS */ + +/* build with libmount */ +/* #undef HAVE_LIBMOUNT */ + +/* use libpthread for libcfs library */ +#define HAVE_LIBPTHREAD 1 + +/* readline library is available */ +/* #undef HAVE_LIBREADLINE */ + +/* linux/rhashtable.h is present */ +#define HAVE_LINUX_RHASHTABLE_H 1 + +/* if linux/selinux.h exists */ +/* #undef HAVE_LINUX_SELINUX_IS_ENABLED */ + +/* linux/stdarg.h is present */ +/* #undef HAVE_LINUX_STDARG_HEADER */ + +/* lock_manager_operations has lm_compare_owner */ +/* #undef HAVE_LM_COMPARE_OWNER */ + +/* lock-manager ops renamed to lm_xxx */ +#define HAVE_LM_XXX_LOCK_MANAGER_OPS 1 + +/* kernel has locks_lock_file_wait */ +#define HAVE_LOCKS_LOCK_FILE_WAIT 1 + +/* lookup_user_key() is available */ +#define HAVE_LOOKUP_USER_KEY 1 + +/* kernel has LOOP_CTL_GET_FREE */ +#define HAVE_LOOP_CTL_GET_FREE 1 + +/* Enable lru resize support */ +#define HAVE_LRU_RESIZE_SUPPORT 1 + +/* Define this if the Kerberos GSS library supports + gss_krb5_export_lucid_sec_context */ +/* #undef HAVE_LUCID_CONTEXT_SUPPORT */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* address_space_operations.migratepage has 4 args */ +#define HAVE_MIGRATEPAGE_4ARGS 1 + +/* kernel has include/linux/migrate.h */ +#define HAVE_MIGRATE_H 1 + +/* kernel has include/linux/migrate_mode.h */ +/* #undef HAVE_MIGRATE_MODE_H */ + +/* mmap_lock API is available. */ +#define HAVE_MMAP_LOCK 1 + +/* kernel module loading is possible */ +#define HAVE_MODULE_LOADING_SUPPORT 1 + +/* locking module param is supported */ +/* #undef HAVE_MODULE_PARAM_LOCKING */ + +/* Define to 1 if you have the `name_to_handle_at' function. */ +#define HAVE_NAME_TO_HANDLE_AT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H 1 + +/* cancel_dirty_page with one arguement is available */ +#define HAVE_NEW_CANCEL_DIRTY_PAGE 1 + +/* DEFINE_TIMER uses only 2 arguements */ +#define HAVE_NEW_DEFINE_TIMER 1 + +/* 'kernel_write' aligns with read/write helpers */ +#define HAVE_NEW_KERNEL_WRITE 1 + +/* NR_UNSTABLE_NFS is still in use. 
*/ +/* #undef HAVE_NR_UNSTABLE_NFS */ + +/* ns_to_timespec64() is available */ +#define HAVE_NS_TO_TIMESPEC64 1 + +/* with oldsize */ +/* #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE */ + +/* openssl-devel is present */ +/* #undef HAVE_OPENSSL_GETSEPOL */ + +/* OpenSSL HMAC functions needed for SSK */ +/* #undef HAVE_OPENSSL_SSK */ + +/* 'pagevec_init' takes one parameter */ +#define HAVE_PAGEVEC_INIT_ONE_PARAM 1 + +/* linux/panic_notifier.h is present */ +/* #undef HAVE_PANIC_NOTIFIER_H */ + +/* 'param_set_uint_minmax' is available */ +/* #undef HAVE_PARAM_SET_UINT_MINMAX */ + +/* have PCLMULQDQ instruction */ +#define HAVE_PCLMULQDQ 1 + +/* percpu_counter_init uses GFP_* flag */ +#define HAVE_PERCPU_COUNTER_INIT_GFP_FLAG 1 + +/* 'struct nsproxy' has 'pid_ns_for_children' */ +#define HAVE_PID_NS_FOR_CHILDREN 1 + +/* posix_acl_to_xattr takes struct user_namespace */ +#define HAVE_POSIXACL_USER_NS 1 + +/* 'posix_acl_update_mode' is available */ +#define HAVE_POSIX_ACL_UPDATE_MODE 1 + +/* posix_acl_valid takes struct user_namespace */ +#define HAVE_POSIX_ACL_VALID_USER_NS 1 + +/* 'prepare_to_wait_event' is available */ +#define HAVE_PREPARE_TO_WAIT_EVENT 1 + +/* struct proc_ops exists */ +#define HAVE_PROC_OPS 1 + +/* proc_remove is defined */ +#define HAVE_PROC_REMOVE 1 + +/* get_projid function exists */ +#define HAVE_PROJECT_QUOTA 1 + +/* inode->i_nlink is protected from direct modification */ +#define HAVE_PROTECT_I_NLINK 1 + +/* 'PTR_ERR_OR_ZERO' exist */ +#define HAVE_PTR_ERR_OR_ZERO 1 + +/* have quota64 */ +/* #undef HAVE_QUOTA64 */ + +/* radix_tree_exceptional_entry exist */ +/* #undef HAVE_RADIX_EXCEPTION_ENTRY */ + +/* rdma_connect_locked is defined */ +#define HAVE_RDMA_CONNECT_LOCKED 1 + +/* rdma_create_id wants 4 args */ +/* #undef HAVE_RDMA_CREATE_ID_4ARG */ + +/* rdma_create_id wants 5 args */ +#define HAVE_RDMA_CREATE_ID_5ARG 1 + +/* rdma_reject has 4 arguments */ +#define HAVE_RDMA_REJECT_4ARGS 1 + +/* kernel export remove_from_page_cache */ +/* #undef HAVE_REMOVE_FROM_PAGE_CACHE */ + +/* remove_proc_subtree is defined */ +#define HAVE_REMOVE_PROC_SUBTREE 1 + +/* rhashtable_lookup() is available */ +#define HAVE_RHASHTABLE_LOOKUP 1 + +/* rhashtable_lookup_get_insert_fast() is available */ +#define HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST 1 + +/* struct rhltable exist */ +#define HAVE_RHLTABLE 1 + +/* save_stack_trace_tsk is exported */ +/* #undef HAVE_SAVE_STACK_TRACE_TSK */ + +/* Have sa_spill_alloc in ZFS */ +/* #undef HAVE_SA_SPILL_ALLOC */ + +/* super_operations.evict_inode() is exist in kernel */ +#define HAVE_SBOPS_EVICT_INODE 1 + +/* kernel supports wrapped FS freeze functions */ +#define HAVE_SB_START_WRITE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* linux/sched header directory exist */ +#define HAVE_SCHED_HEADERS 1 + +/* security_dentry_init_security' is defined */ +#define HAVE_SECURITY_DENTRY_INIT_SECURITY 1 + +/* security_inode_init_security takes a callback to set xattrs */ +#define HAVE_SECURITY_IINITSEC_CALLBACK 1 + +/* security_inode_init_security takes a 'struct qstr' parameter */ +/* #undef HAVE_SECURITY_IINITSEC_QSTR */ + +/* security_inode_listsecurity() is available/exported */ +#define HAVE_SECURITY_INODE_LISTSECURITY 1 + +/* security_release_secctx has 1 arg. */ +/* #undef HAVE_SEC_RELEASE_SECCTX_1ARG */ + +/* support for selinux */ +#define HAVE_SELINUX 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SELINUX_SELINUX_H 1 + +/* support server */ +/* #undef HAVE_SERVER_SUPPORT */ + +/* Define to 1 if you have the `setns' function. */ +#define HAVE_SETNS 1 + +/* Define this if the Kerberos GSS library supports + gss_krb5_set_allowable_enctypes */ +/* #undef HAVE_SET_ALLOWABLE_ENCTYPES */ + +/* shrinker has count_objects member */ +#define HAVE_SHRINKER_COUNT 1 + +/* shrinker want self pointer in handler */ +/* #undef HAVE_SHRINKER_WANT_SHRINK_PTR */ + +/* shrink_control is present */ +#define HAVE_SHRINK_CONTROL 1 + +/* simple_setattr is exported by the kernel */ +#define HAVE_SIMPLE_SETATTR 1 + +/* sk_data_ready uses only one argument */ +#define HAVE_SK_DATA_READY_ONE_ARG 1 + +/* kernel has sk_sleep */ +#define HAVE_SK_SLEEP 1 + +/* sock_create_kern use net as first parameter */ +#define HAVE_SOCK_CREATE_KERN_USE_NET 1 + +/* Have spa_maxblocksize in ZFS */ +/* #undef HAVE_SPA_MAXBLOCKSIZE */ + +/* struct stacktrace_ops exists */ +/* #undef HAVE_STACKTRACE_OPS */ + +/* stacktrace_ops.warning is exist */ +/* #undef HAVE_STACKTRACE_WARNING */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* stringhash.h is present */ +#define HAVE_STRINGHASH 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* struct posix_acl_xattr_{header,entry} defined */ +#define HAVE_STRUCT_POSIX_ACL_XATTR 1 + +/* submit_bio takes two arguments */ +/* #undef HAVE_SUBMIT_BIO_2ARGS */ + +/* sunrpc_cache_pipe_upcall takes 3 args */ +/* #undef HAVE_SUNRPC_UPCALL_HAS_3ARGS */ + +/* super_operations use dentry as parameter */ +#define HAVE_SUPEROPS_USE_DENTRY 1 + +/* 'super_setup_bdi_name' is available */ +#define HAVE_SUPER_SETUP_BDI_NAME 1 + +/* symlink inode operations need struct nameidata argument */ +/* #undef HAVE_SYMLINK_OPS_USE_NAMEIDATA */ + +/* new_sync_[read|write] is exported by the kernel */ +/* #undef HAVE_SYNC_READ_WRITE */ + +/* ctl_table has ctl_name field */ +/* #undef HAVE_SYSCTL_CTLNAME */ + +/* Define to 1 if you have . */ +#define HAVE_SYS_QUOTA_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_TYPES_H 1 + +/* task_is_running() is defined */ +/* #undef HAVE_TASK_IS_RUNNING */ + +/* tcp_sendpage use socket as first parameter */ +/* #undef HAVE_TCP_SENDPAGE_USE_SOCKET */ + +/* 'tcp_sock_set_keepidle()' exists */ +#define HAVE_TCP_SOCK_SET_KEEPIDLE 1 + +/* 'tcp_sock_set_nodelay()' exists */ +#define HAVE_TCP_SOCK_SET_NODELAY 1 + +/* timer_setup has replaced setup_timer */ +#define HAVE_TIMER_SETUP 1 + +/* 'struct timespec64' is available */ +#define HAVE_TIMESPEC64 1 + +/* 'timespec64_sub' is available */ +#define HAVE_TIMESPEC64_SUB 1 + +/* 'timespec64_to_ktime' is available */ +#define HAVE_TIMESPEC64_TO_KTIME 1 + +/* topology_sibling_cpumask is available */ +#define HAVE_TOPOLOGY_SIBLING_CPUMASK 1 + +/* if totalram_pages is a function */ +#define HAVE_TOTALRAM_PAGES_AS_FUNC 1 + +/* kernel export truncate_complete_page */ +/* #undef HAVE_TRUNCATE_COMPLETE_PAGE */ + +/* kernel has truncate_inode_pages_final */ +#define HAVE_TRUNCATE_INODE_PAGES_FINAL 1 + +/* if MS_RDONLY was moved to uapi/linux/mount.h */ +#define HAVE_UAPI_LINUX_MOUNT_H 1 + +/* uidgid.h is present */ +#define HAVE_UIDGID_HEADER 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* 'inode_operations' members have user namespace argument */ +/* #undef HAVE_USER_NAMESPACE_ARG */ + +/* kernel has vfs_rename with 5 args */ +/* #undef HAVE_VFS_RENAME_5ARGS */ + +/* kernel has vfs_rename with 6 args */ +#define HAVE_VFS_RENAME_6ARGS 1 + +/* '__vfs_setxattr is available */ +#define HAVE_VFS_SETXATTR 1 + +/* kernel has vfs_unlink with 3 args */ +#define HAVE_VFS_UNLINK_3ARGS 1 + +/* __vmalloc only takes 2 args. */ +#define HAVE_VMALLOC_2ARGS 1 + +/* virtual_address has been replaced by address field */ +#define HAVE_VM_FAULT_ADDRESS 1 + +/* if VM_FAULT_RETRY is defined */ +#define HAVE_VM_FAULT_RETRY 1 + +/* if vm_fault_t type exists */ +#define HAVE_VM_FAULT_T 1 + +/* 'struct vm_operations' remove struct vm_area_struct argument */ +#define HAVE_VM_OPS_USE_VM_FAULT_ONLY 1 + +/* wait_bit.h is present */ +#define HAVE_WAIT_BIT_HEADER_H 1 + +/* 'wait_queue_entry_t' is available */ +#define HAVE_WAIT_QUEUE_ENTRY 1 + +/* linux wait_queue_head_t list_head is name head */ +#define HAVE_WAIT_QUEUE_ENTRY_LIST 1 + +/* 'wait_var_event' is available */ +#define HAVE_WAIT_VAR_EVENT 1 + +/* flags field exist */ +#define HAVE_XATTR_HANDLER_FLAGS 1 + +/* needs inode parameter */ +#define HAVE_XATTR_HANDLER_INODE_PARAM 1 + +/* xattr_handler has a name member */ +#define HAVE_XATTR_HANDLER_NAME 1 + +/* handler pointer is parameter */ +/* #undef HAVE_XATTR_HANDLER_SIMPLIFIED */ + +/* xa_is_value exist */ +#define HAVE_XA_IS_VALUE 1 + +/* Have zap_add_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_ADD_BY_DNODE */ + +/* Have zap_lookup_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_LOOKUP_BY_DNODE */ + +/* Have zap_remove_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE */ + +/* Have inode_timespec_t */ +/* #undef HAVE_ZFS_INODE_TIMESPEC */ + +/* Have multihost protection in ZFS */ +/* #undef HAVE_ZFS_MULTIHOST */ + +/* Enable zfs osd */ +/* #undef HAVE_ZFS_OSD */ + +/* Have zfs_refcount_add */ +/* #undef HAVE_ZFS_REFCOUNT_ADD */ + +/* __add_wait_queue_exclusive exists */ +/* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ + +/* ext4_journal_start takes 3 arguments */ +/* #undef JOURNAL_START_HAS_3ARGS */ + +/* Define this as the Kerberos version number */ +/* #undef KRB5_VERSION */ + +/* enable libcfs LASSERT, LASSERTF */ +#define LIBCFS_DEBUG 1 + +/* use dumplog on panic */ +/* #undef 
LNET_DUMP_ON_PANIC */ + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#define LT_OBJDIR ".libs/" + +/* Fourth number in the Lustre version */ +#define LUSTRE_FIX 0 + +/* First number in the Lustre version */ +#define LUSTRE_MAJOR 2 + +/* Second number in the Lustre version */ +#define LUSTRE_MINOR 12 + +/* Third number in the Lustre version */ +#define LUSTRE_PATCH 8 + +/* A copy of PACKAGE_VERSION */ +#define LUSTRE_VERSION_STRING "2.12.8_163_g540d104" + +/* maximum number of MDS threads */ +/* #undef MDS_MAX_THREADS */ + +/* Report minimum OST free space */ +/* #undef MIN_DF */ + +/* name of ldiskfs mkfs program */ +#define MKE2FS "mke2fs" + +/* need pclmulqdq based crc32c */ +/* #undef NEED_CRC32C_ACCEL */ + +/* need pclmulqdq based crc32 */ +/* #undef NEED_CRC32_ACCEL */ + +/* 'ktime_get_ns' is not available */ +/* #undef NEED_KTIME_GET_NS */ + +/* 'ktime_get_real_ns' is not available */ +/* #undef NEED_KTIME_GET_REAL_NS */ + +/* enable nodemap proc debug support */ +/* #undef NODEMAP_PROC_DEBUG */ + +/* Name of package */ +#define PACKAGE "lustre" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "https://jira.whamcloud.com/" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "Lustre" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "Lustre 2.12.8_163_g540d104" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "lustre" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2.12.8_163_g540d104" + +/* name of parallel fsck program */ +#define PFSCK "fsck" + +/* enable randomly alloc failure */ +#define RANDOM_FAIL_ALLOC 1 + +/* The size of `unsigned long long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG_LONG 8 + +/* use tunable backoff TCP */ +/* #undef SOCKNAL_BACKOFF */ + +/* tunable backoff TCP in ms */ +/* #undef SOCKNAL_BACKOFF_MS */ + +/* 'struct stacktrace_ops' address function returns an int */ +/* #undef STACKTRACE_OPS_ADDRESS_RETURN_INT */ + +/* 'struct stacktrace_ops' has 'walk_stack' field */ +/* #undef STACKTRACE_OPS_HAVE_WALK_STACK */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* name of ldiskfs tune program */ +#define TUNE2FS "tune2fs" + +/* Define this if the private function, gss_krb5_cache_name, must be used to + tell the Kerberos library which credentials cache to use. 
Otherwise, this + is done by setting the KRB5CCNAME environment variable */ +/* #undef USE_GSS_KRB5_CCACHE_NAME */ + +/* Write when Checking Health */ +/* #undef USE_HEALTH_CHECK_WRITE */ + +/* enable lu_ref reference tracking code */ +/* #undef USE_LU_REF */ + +/* Version number of package */ +#define VERSION "2.12.8_163_g540d104" + +/* zfs fix version */ +/* #undef ZFS_FIX */ + +/* zfs major version */ +/* #undef ZFS_MAJOR */ + +/* zfs minor version */ +/* #undef ZFS_MINOR */ + +/* zfs patch version */ +/* #undef ZFS_PATCH */ diff --git a/drivers/staging/lustrefsx/libcfs/Kconfig b/drivers/staging/lustrefsx/libcfs/Kconfig new file mode 100644 index 0000000000000..3675b8381af2e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/Kconfig @@ -0,0 +1,3 @@ +config LUSTREFSX_LIBCFS + depends on m + tristate "Lustre helper library" diff --git a/drivers/staging/lustrefsx/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/Makefile new file mode 100644 index 0000000000000..6c5ff83ac791a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs/ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h new file mode 100644 index 0000000000000..b4782c4b51094 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h @@ -0,0 +1,119 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#ifndef _LIBCFS_BITMAP_H_ +#define _LIBCFS_BITMAP_H_ + +#include +#include + +struct cfs_bitmap { + unsigned int size; + unsigned long data[0]; +}; + +#define CFS_BITMAP_SIZE(nbits) \ + (BITS_TO_LONGS(nbits) * sizeof(long) + sizeof(struct cfs_bitmap)) + +static inline +struct cfs_bitmap *CFS_ALLOCATE_BITMAP(int size) +{ + struct cfs_bitmap *ptr; + + LIBCFS_ALLOC(ptr, CFS_BITMAP_SIZE(size)); + if (ptr == NULL) + RETURN(ptr); + + ptr->size = size; + + RETURN(ptr); +} + +static inline void CFS_RESET_BITMAP(struct cfs_bitmap *bitmap) +{ + if (bitmap->size > 0) { + int nbits = bitmap->size; + + memset(bitmap, 0, CFS_BITMAP_SIZE(nbits)); + bitmap->size = nbits; + } +} + +#define CFS_FREE_BITMAP(ptr) LIBCFS_FREE(ptr, CFS_BITMAP_SIZE(ptr->size)) + +static inline +void cfs_bitmap_set(struct cfs_bitmap *bitmap, int nbit) +{ + set_bit(nbit, bitmap->data); +} + +static inline +void cfs_bitmap_clear(struct cfs_bitmap *bitmap, int nbit) +{ + test_and_clear_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_check(struct cfs_bitmap *bitmap, int nbit) +{ + return test_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_test_and_clear(struct cfs_bitmap *bitmap, int nbit) +{ + return test_and_clear_bit(nbit, bitmap->data); +} + +/* return 0 is bitmap has none set bits */ +static inline +int cfs_bitmap_check_empty(struct cfs_bitmap *bitmap) +{ + return find_first_bit(bitmap->data, bitmap->size) == bitmap->size; +} + +static inline +void cfs_bitmap_copy(struct cfs_bitmap *new, struct cfs_bitmap *old) +{ + size_t newsize; + + LASSERT(new->size >= old->size); + newsize = new->size; + memcpy(new, old, CFS_BITMAP_SIZE(old->size)); + new->size = newsize; +} + +#define cfs_foreach_bit(bitmap, pos) \ + for ((pos) = find_first_bit((bitmap)->data, bitmap->size); \ + (pos) < (bitmap)->size; \ + (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1)) + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h new file mode 100644 index 0000000000000..0f00c7219e75d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h @@ -0,0 +1,85 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
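/*
 * Illustrative usage sketch for the cfs_bitmap helpers above; it is not part
 * of the patch and the function name is hypothetical.
 */
static void demo_bitmap_walk(void)
{
	struct cfs_bitmap *bm = CFS_ALLOCATE_BITMAP(128);
	int bit;

	if (bm == NULL)
		return;

	cfs_bitmap_set(bm, 3);
	cfs_bitmap_set(bm, 64);

	cfs_foreach_bit(bm, bit)		/* visits bit 3, then bit 64 */
		CDEBUG(D_INFO, "bit %d is set\n", bit);

	CFS_FREE_BITMAP(bm);
}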
+ * + * libcfs/include/libcfs/curproc.h + * + * Lustre curproc API declaration + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_CURPROC_H__ +#define __LIBCFS_CURPROC_H__ + +/* check if task is running in compat mode.*/ +#define current_pid() (current->pid) +#define current_comm() (current->comm) + +typedef __u32 cfs_cap_t; + +#define CFS_CAP_CHOWN 0 +#define CFS_CAP_DAC_OVERRIDE 1 +#define CFS_CAP_DAC_READ_SEARCH 2 +#define CFS_CAP_FOWNER 3 +#define CFS_CAP_FSETID 4 +#define CFS_CAP_LINUX_IMMUTABLE 9 +#define CFS_CAP_SYS_ADMIN 21 +#define CFS_CAP_SYS_BOOT 23 +#define CFS_CAP_SYS_RESOURCE 24 + +#define CFS_CAP_FS_MASK ((1 << CFS_CAP_CHOWN) | \ + (1 << CFS_CAP_DAC_OVERRIDE) | \ + (1 << CFS_CAP_DAC_READ_SEARCH) | \ + (1 << CFS_CAP_FOWNER) | \ + (1 << CFS_CAP_FSETID ) | \ + (1 << CFS_CAP_LINUX_IMMUTABLE) | \ + (1 << CFS_CAP_SYS_ADMIN) | \ + (1 << CFS_CAP_SYS_BOOT) | \ + (1 << CFS_CAP_SYS_RESOURCE)) + +void cfs_cap_raise(cfs_cap_t cap); +void cfs_cap_lower(cfs_cap_t cap); +int cfs_cap_raised(cfs_cap_t cap); +cfs_cap_t cfs_curproc_cap_pack(void); +void cfs_curproc_cap_unpack(cfs_cap_t cap); +int cfs_capable(cfs_cap_t cap); + +/* __LIBCFS_CURPROC_H__ */ +#endif +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h new file mode 100644 index 0000000000000..9ae7b8405a94b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -0,0 +1,174 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LIBCFS_H__ +#define __LIBCFS_LIBCFS_H__ + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "curproc.h" + +#define LIBCFS_VERSION "0.7.1" + +#define PO2_ROUNDUP_TYPED(x, po2, type) (-(-(type)(x) & -(type)(po2))) +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +/* Sparse annotations */ +#if !defined(__must_hold) +# ifdef __CHECKER__ +# define __must_hold(x) __attribute__((context(x, 1, 1))) +# else /* __CHECKER__ */ +# define __must_hold(x) +# endif /* !__CHECKER__ */ +#endif /* !__must_hold */ + +/* libcfs watchdogs */ +struct lc_watchdog; + +/* Add a watchdog which fires after "time" milliseconds of delay. You have to + * touch it once to enable it. 
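/*
 * Illustrative capability check built on the CFS_CAP_* values and
 * cfs_capable() declared above; not part of the patch, name hypothetical.
 */
static int demo_require_admin(void)
{
	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
		return -EPERM;		/* caller lacks CAP_SYS_ADMIN */

	return 0;
}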
*/ +struct lc_watchdog *lc_watchdog_add(int time, + void (*cb)(pid_t pid, void *), + void *data); + +/* Enables a watchdog and resets its timer. */ +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout); +#define CFS_GET_TIMEOUT(svc) (max_t(int, obd_timeout, \ + AT_OFF ? 0 : at_get(&svc->srv_at_estimate)) * \ + svc->srv_watchdog_factor) + +/* Disable a watchdog; touch it to restart it. */ +void lc_watchdog_disable(struct lc_watchdog *lcw); + +/* Clean up the watchdog */ +void lc_watchdog_delete(struct lc_watchdog *lcw); + +#ifdef HAVE_TOTALRAM_PAGES_AS_FUNC + #ifndef cfs_totalram_pages + #define cfs_totalram_pages() totalram_pages() + #endif +#else + #ifndef cfs_totalram_pages + #define cfs_totalram_pages() totalram_pages + #endif +#endif + +typedef s32 timeout_t; + +/* need both kernel and user-land acceptor */ +#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 +#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 + +extern struct blocking_notifier_head libcfs_ioctl_list; +static inline int notifier_from_ioctl_errno(int err) +{ + if (err == -EINVAL) + return NOTIFY_OK; + return notifier_from_errno(err) | NOTIFY_STOP_MASK; +} + +/* + * Defined by platform + */ +int unshare_fs_struct(void); +sigset_t cfs_block_allsigs(void); +sigset_t cfs_block_sigs(unsigned long sigs); +sigset_t cfs_block_sigsinv(unsigned long sigs); +void cfs_restore_sigs(sigset_t); +void cfs_clear_sigpending(void); + +/* + * Random number handling + */ + +/* returns a random 32-bit integer */ +unsigned int cfs_rand(void); +/* seed the generator */ +void cfs_srand(unsigned int, unsigned int); +void cfs_get_random_bytes(void *buf, int size); + +int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); +int libcfs_ioctl(unsigned long cmd, void __user *uparam); + +/* container_of depends on "likely" which is defined in libcfs_private.h */ +static inline void *__container_of(const void *ptr, unsigned long shift) +{ + if (unlikely(IS_ERR(ptr) || ptr == NULL)) + return ERR_CAST(ptr); + else + return (char *)ptr - shift; +} + +#define container_of0(ptr, type, member) \ + ((type *)__container_of((ptr), offsetof(type, member))) + +struct lnet_debugfs_symlink_def { + const char *name; + const char *target; +}; + +void lnet_insert_debugfs(struct ctl_table *table); +void lnet_remove_debugfs(struct ctl_table *table); + +/* helper for sysctl handlers */ +int lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)); +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + +/* atomic-context safe vfree */ +#ifdef HAVE_LIBCFS_VFREE_ATOMIC +void libcfs_vfree_atomic(const void *addr); +#else +#define libcfs_vfree_atomic(ptr) vfree(ptr) +#endif + +#endif /* _LIBCFS_LIBCFS_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h new file mode 100644 index 0000000000000..4620dcc08cf80 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h @@ -0,0 +1,374 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
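/*
 * Illustrative use of container_of0() from libcfs.h above: unlike plain
 * container_of(), NULL and ERR_PTR() values are passed through unchanged.
 * Not part of the patch; 'struct demo_obj' is hypothetical.
 */
struct demo_obj {
	int			do_id;
	struct hlist_node	do_hnode;
};

static struct demo_obj *demo_obj_from_hnode(struct hlist_node *hnode)
{
	return container_of0(hnode, struct demo_obj, do_hnode);
}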
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_cpu.h + * + * CPU partition + * . CPU partition is virtual processing unit + * + * . CPU partition can present 1-N cores, or 1-N NUMA nodes, + * in other words, CPU partition is a processors pool. + * + * CPU Partition Table (CPT) + * . a set of CPU partitions + * + * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP + * + * . User can specify total number of CPU partitions while creating a + * CPT, ID of CPU partition is always start from 0. + * + * Example: if there are 8 cores on the system, while creating a CPT + * with cpu_npartitions=4: + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] + * + * cpu_npartitions=1: + * core[0, 1, ... 7] = partition[0] + * + * . User can also specify CPU partitions by string pattern + * + * Examples: cpu_partitions="0[0,1], 1[2,3]" + * cpu_partitions="N 0[0-3], 1[4-8]" + * + * The first character "N" means following numbers are numa ID + * + * . NUMA allocators, CPU affinity threads are built over CPU partitions, + * instead of HW CPUs or HW nodes. + * + * . By default, Lustre modules should refer to the global cfs_cpt_table, + * instead of accessing HW CPUs directly, so concurrency of Lustre can be + * configured by cpu_npartitions of the global cfs_cpt_table + * + * . 
If cpu_npartitions=1(all CPUs in one pool), lustre should work the + * same way as 2.2 or earlier versions + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_CPU_H__ +#define __LIBCFS_CPU_H__ + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SMP + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* NUMA distance between CPTs */ + unsigned int *cpt_distance; + /* spread rotor for NUMA allocator */ + unsigned int cpt_spread_rotor; + /* NUMA node if cpt_nodemask is empty */ + int cpt_node; +}; +#endif /* CONFIG_SMP */ + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { +#ifdef CONFIG_SMP + /* spread rotor for NUMA allocator */ + unsigned int ctb_spread_rotor; + /* maximum NUMA distance between all nodes in table */ + unsigned int ctb_distance; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* shadow HW node to CPU partition ID */ + int *ctb_node2cpt; + /* # of CPU partitions */ + int ctb_nparts; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +#else + nodemask_t ctb_nodemask; +#endif /* CONFIG_SMP */ + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; +}; + +/* any CPU partition */ +#define CFS_CPT_ANY (-1) + +extern struct cfs_cpt_table *cfs_cpt_table; + +/** + * destroy a CPU partition table + */ +void cfs_cpt_table_free(struct cfs_cpt_table *cptab); +/** + * create a cfs_cpt_table with \a ncpt number of partitions + */ +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt); +/** + * print string information of cpt-table + */ +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len); +/** + * print distance information of cpt-table + */ +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len); +/** + * return total number of CPU partitions in \a cptab + */ +int cfs_cpt_number(struct cfs_cpt_table *cptab); +/** + * return number of HW cores or hyper-threadings in a CPU partition \a cpt + */ +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); +/** + * is there any online CPU in CPU partition \a cpt + */ +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt); +/** + * return cpumask of CPU partition \a cpt + */ +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt); +/** + * return nodemask of CPU partition \a cpt + */ +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt); +/** + * shadow current HW processor ID to CPU-partition ID of \a cptab + */ +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap); +/** + * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu); +/** + * shadow HW node ID \a NODE to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node); +/** + * NUMA distance between \a cpt1 and \a cpt2 in \a cptab + */ +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); +/** + * bind current thread on a CPU-partition \a cpt of \a cptab + */ +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); +/** + * add \a cpu to CPU partition @cpt of \a cptab, return 1 for success, + * otherwise 0 is returned + */ +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * remove \a cpu from CPU partition \a cpt of \a cptab + */ +void 
cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * add all cpus in \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask); +/** + * remove all cpus in \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask); +/** + * add all cpus in NUMA node \a node to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); +/** + * remove all cpus in NUMA node \a node from CPU partition \a cpt + */ +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); + +/** + * add all cpus in node mask \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask); +/** + * remove all cpus in node mask \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask); +/** + * convert partition id \a cpt to numa node id, if there are more than one + * nodes in this partition, it might return a different node id each time. + */ +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); + +/* + * allocate per-cpu-partition data, returned value is an array of pointers, + * variable can be indexed by CPU ID. + * cptab != NULL: size of array is number of CPU partitions + * cptab == NULL: size of array is number of HW cores + */ +void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); +/* + * destroy per-cpu-partition variable + */ +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); + +#define cfs_percpt_for_each(var, i, vars) \ + for (i = 0; i < cfs_percpt_number(vars) && \ + ((var) = (vars)[i]) != NULL; i++) + +/* + * percpu partition lock + * + * There are some use-cases like this in Lustre: + * . each CPU partition has it's own private data which is frequently changed, + * and mostly by the local CPU partition. + * . all CPU partitions share some global data, these data are rarely changed. + * + * LNet is typical example. + * CPU partition lock is designed for this kind of use-cases: + * . each CPU partition has it's own private lock + * . change on private data just needs to take the private lock + * . read on shared data just needs to take _any_ of private locks + * . change on shared data needs to take _all_ private locks, + * which is slow and should be really rare. 
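/*
 * Illustrative per-CPT allocation, not part of the patch: one atomic counter
 * per CPU partition, visited with cfs_percpt_for_each() as declared above.
 */
static void demo_percpt_counters(struct cfs_cpt_table *cptab)
{
	atomic_t **counters;
	atomic_t *cnt;
	int i;

	counters = cfs_percpt_alloc(cptab, sizeof(**counters));
	if (counters == NULL)
		return;

	cfs_percpt_for_each(cnt, i, counters)
		atomic_set(cnt, 0);

	cfs_percpt_free(counters);
}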
+ */ +enum { + CFS_PERCPT_LOCK_EX = -1, /* negative */ +}; + +struct cfs_percpt_lock { + /* cpu-partition-table for this lock */ + struct cfs_cpt_table *pcl_cptab; + /* exclusively locked */ + unsigned int pcl_locked; + /* private lock table */ + spinlock_t **pcl_locks; +}; + +/* return number of private locks */ +#define cfs_percpt_lock_num(pcl) cfs_cpt_number(pcl->pcl_cptab) + +/* + * create a cpu-partition lock based on CPU partition table \a cptab, + * each private lock has extra \a psize bytes padding data + */ +struct cfs_percpt_lock *cfs_percpt_lock_create(struct cfs_cpt_table *cptab, + struct lock_class_key *keys); +/* destroy a cpu-partition lock */ +void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); + +/* lock private lock \a index of \a pcl */ +void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); + +/* unlock private lock \a index of \a pcl */ +void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); + +#define CFS_PERCPT_LOCK_KEYS 256 + +/* NB: don't allocate keys dynamically, lockdep needs them to be in ".data" */ +#define cfs_percpt_lock_alloc(cptab) \ +({ \ + static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ + struct cfs_percpt_lock *___lk; \ + \ + if (cfs_cpt_number(cptab) > CFS_PERCPT_LOCK_KEYS) \ + ___lk = cfs_percpt_lock_create(cptab, NULL); \ + else \ + ___lk = cfs_percpt_lock_create(cptab, ___keys); \ + ___lk; \ +}) + +/** + * allocate \a nr_bytes of physical memory from a contiguous region with the + * properties of \a flags which are bound to the partition id \a cpt. This + * function should only be used for the case when only a few pages of memory + * are need. + */ +static inline void * +cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes, + gfp_t flags) +{ + return kmalloc_node(nr_bytes, flags, + cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * allocate \a nr_bytes of virtually contiguous memory that is bound to the + * partition id \a cpt. + */ +static inline void * +cfs_cpt_vzalloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes) +{ + /* vzalloc_node() sets __GFP_FS by default but no current Kernel + * exported entry-point allows for both a NUMA node specification + * and a custom allocation flags mask. This may be an issue since + * __GFP_FS usage can cause some deadlock situations in our code, + * like when memory reclaim started, within the same context of a + * thread doing FS operations, that can also attempt conflicting FS + * operations, ... + */ + return vzalloc_node(nr_bytes, cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * allocate a single page of memory with the properties of \a flags were + * that page is bound to the partition id \a cpt. + */ +static inline struct page * +cfs_page_cpt_alloc(struct cfs_cpt_table *cptab, int cpt, gfp_t flags) +{ + return alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), flags, 0); +} + +/** + * allocate a chunck of memory from a memory pool that is bound to the + * partition id \a cpt with the properites of \a flags. 
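/*
 * Illustrative locking pattern for the percpu partition lock described
 * above; not part of the patch.  Per-CPT updates take one private lock,
 * changes to shared state take CFS_PERCPT_LOCK_EX (all private locks).
 */
static void demo_percpt_locking(struct cfs_percpt_lock *pcl, int cpt)
{
	cfs_percpt_lock(pcl, cpt);
	/* ... touch data private to this CPU partition ... */
	cfs_percpt_unlock(pcl, cpt);

	cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
	/* ... touch data shared by all partitions (rare, slow path) ... */
	cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
}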
+ */ +static inline void * +cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab, + int cpt, gfp_t flags) +{ + return kmem_cache_alloc_node(cachep, flags, + cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * iterate over all CPU partitions in \a cptab + */ +#define cfs_cpt_for_each(i, cptab) \ + for (i = 0; i < cfs_cpt_number(cptab); i++) + +int cfs_cpu_init(void); +void cfs_cpu_fini(void); + +#endif /* __LIBCFS_CPU_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h new file mode 100644 index 0000000000000..8271306ce6019 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h @@ -0,0 +1,316 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2014, Intel Corporation. + */ + +#ifndef _LIBCFS_CRYPTO_H +#define _LIBCFS_CRYPTO_H + +struct cfs_crypto_hash_type { + char *cht_name; /**< hash algorithm name, equal to + * format name for crypto api */ + unsigned int cht_key; /**< init key by default (vaild for + * 4 bytes context like crc32, adler */ + unsigned int cht_size; /**< hash digest size */ +}; + +struct cfs_crypto_crypt_type { + char *cct_name; /**< crypto algorithm name, equal to + * format name for crypto api */ + unsigned int cct_size; /**< crypto key size */ +}; + +enum cfs_crypto_hash_alg { + CFS_HASH_ALG_NULL = 0, + CFS_HASH_ALG_ADLER32, + CFS_HASH_ALG_CRC32, + CFS_HASH_ALG_CRC32C, + /* hashes before here will be speed-tested at module load */ + CFS_HASH_ALG_MD5, + CFS_HASH_ALG_SHA1, + CFS_HASH_ALG_SHA256, + CFS_HASH_ALG_SHA384, + CFS_HASH_ALG_SHA512, + CFS_HASH_ALG_MAX, + CFS_HASH_ALG_SPEED_MAX = CFS_HASH_ALG_MD5, + CFS_HASH_ALG_UNKNOWN = 0xff +}; + +enum cfs_crypto_crypt_alg { + CFS_CRYPT_ALG_NULL = 0, + CFS_CRYPT_ALG_AES256_CTR, + CFS_CRYPT_ALG_MAX, + CFS_CRYPT_ALG_UNKNOWN = 0xff +}; + +static struct cfs_crypto_hash_type hash_types[] = { + [CFS_HASH_ALG_NULL] = { + .cht_name = "null", + .cht_key = 0, + .cht_size = 0 + }, + [CFS_HASH_ALG_ADLER32] = { + .cht_name = "adler32", + .cht_key = 1, + .cht_size = 4 + }, + [CFS_HASH_ALG_CRC32] = { + .cht_name = "crc32", + .cht_key = ~0, + .cht_size = 4 + }, + [CFS_HASH_ALG_CRC32C] = { + .cht_name = "crc32c", + .cht_key = ~0, + .cht_size = 4 + }, + [CFS_HASH_ALG_MD5] = { + .cht_name = "md5", + .cht_key = 0, + .cht_size = 16 + }, + [CFS_HASH_ALG_SHA1] = { + .cht_name = "sha1", + .cht_key = 0, + .cht_size = 20 + }, + [CFS_HASH_ALG_SHA256] = { + .cht_name = "sha256", + .cht_key = 0, + .cht_size = 32 + }, + [CFS_HASH_ALG_SHA384] = { + .cht_name = "sha384", + .cht_key = 0, 
+ .cht_size = 48 + }, + [CFS_HASH_ALG_SHA512] = { + .cht_name = "sha512", + .cht_key = 0, + .cht_size = 64 + }, + [CFS_HASH_ALG_MAX] = { + .cht_name = NULL, + .cht_key = 0, + .cht_size = 64 + } +}; + +static struct cfs_crypto_crypt_type crypt_types[] = { + [CFS_CRYPT_ALG_NULL] = { + .cct_name = "null", + .cct_size = 0 + }, + [CFS_CRYPT_ALG_AES256_CTR] = { + .cct_name = "ctr(aes)", + .cct_size = 32 + } +}; + +/* Maximum size of hash_types[].cht_size */ +#define CFS_CRYPTO_HASH_DIGESTSIZE_MAX 64 + +/** + * Return hash algorithm information for the specified algorithm identifier + * + * Hash information includes algorithm name, initial seed, hash size. + * + * \retval cfs_crypto_hash_type for valid ID (CFS_HASH_ALG_*) + * \retval NULL for unknown algorithm identifier + */ +static inline const struct +cfs_crypto_hash_type *cfs_crypto_hash_type(enum cfs_crypto_hash_alg hash_alg) +{ + struct cfs_crypto_hash_type *ht; + + if (hash_alg < CFS_HASH_ALG_MAX) { + ht = &hash_types[hash_alg]; + if (ht->cht_name != NULL) + return ht; + } + return NULL; +} + +/** + * Return hash name for hash algorithm identifier + * + * \param[in] hash_alg hash alrgorithm id (CFS_HASH_ALG_*) + * + * \retval string name of known hash algorithm + * \retval "unknown" if hash algorithm is unknown + */ +static inline const +char *cfs_crypto_hash_name(enum cfs_crypto_hash_alg hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_name; + + return "unknown"; +} + +/** + * Return digest size for hash algorithm type + * + * \param[in] hash_alg hash alrgorithm id (CFS_HASH_ALG_*) + * + * \retval hash algorithm digest size in bytes + * \retval 0 if hash algorithm type is unknown + */ +static inline +unsigned int cfs_crypto_hash_digestsize(enum cfs_crypto_hash_alg hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht != NULL) + return ht->cht_size; + + return 0; +} + +/** + * Find hash algorithm ID for the specified algorithm name + * + * \retval hash algorithm ID for valid ID (CFS_HASH_ALG_*) + * \retval CFS_HASH_ALG_UNKNOWN for unknown algorithm name + */ +static inline unsigned char cfs_crypto_hash_alg(const char *algname) +{ + enum cfs_crypto_hash_alg hash_alg; + + for (hash_alg = 0; hash_alg < CFS_HASH_ALG_MAX; hash_alg++) + if (strcmp(hash_types[hash_alg].cht_name, algname) == 0) + return hash_alg; + + return CFS_HASH_ALG_UNKNOWN; +} + +/** + * Return crypt algorithm information for the specified algorithm identifier + * + * Crypt information includes algorithm name, key size. 
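/*
 * Illustrative lookup using the hash helpers above; not part of the patch.
 * Maps an algorithm name to its identifier and digest size.
 */
static unsigned int demo_digest_size(const char *name)
{
	enum cfs_crypto_hash_alg alg = cfs_crypto_hash_alg(name);

	if (alg == CFS_HASH_ALG_UNKNOWN)
		return 0;

	return cfs_crypto_hash_digestsize(alg);	/* e.g. 32 for "sha256" */
}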
+ * + * \retval cfs_crypto_crupt_type for valid ID (CFS_CRYPT_ALG_*) + * \retval NULL for unknown algorithm identifier + */ +static inline const struct +cfs_crypto_crypt_type *cfs_crypto_crypt_type( + enum cfs_crypto_crypt_alg crypt_alg) +{ + struct cfs_crypto_crypt_type *ct; + + if (crypt_alg < CFS_CRYPT_ALG_MAX) { + ct = &crypt_types[crypt_alg]; + if (ct->cct_name != NULL) + return ct; + } + return NULL; +} + +/** + * Return crypt name for crypt algorithm identifier + * + * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) + * + * \retval string name of known crypt algorithm + * \retval "unknown" if hash algorithm is unknown + */ +static inline const +char *cfs_crypto_crypt_name(enum cfs_crypto_crypt_alg crypt_alg) +{ + const struct cfs_crypto_crypt_type *ct; + + ct = cfs_crypto_crypt_type(crypt_alg); + if (ct) + return ct->cct_name; + + return "unknown"; +} + + +/** + * Return key size for crypto algorithm type + * + * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) + * + * \retval crypt algorithm key size in bytes + * \retval 0 if crypt algorithm type is unknown + */ +static inline +unsigned int cfs_crypto_crypt_keysize(enum cfs_crypto_crypt_alg crypt_alg) +{ + const struct cfs_crypto_crypt_type *ct; + + ct = cfs_crypto_crypt_type(crypt_alg); + if (ct != NULL) + return ct->cct_size; + + return 0; +} + +/** + * Find crypto algorithm ID for the specified algorithm name + * + * \retval crypto algorithm ID for valid ID (CFS_CRYPT_ALG_*) + * \retval CFS_CRYPT_ALG_UNKNOWN for unknown algorithm name + */ +static inline unsigned char cfs_crypto_crypt_alg(const char *algname) +{ + enum cfs_crypto_crypt_alg crypt_alg; + + for (crypt_alg = 0; crypt_alg < CFS_CRYPT_ALG_MAX; crypt_alg++) + if (strcmp(crypt_types[crypt_alg].cct_name, algname) == 0) + return crypt_alg; + + return CFS_CRYPT_ALG_UNKNOWN; +} + +int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len); + +/* cfs crypto hash descriptor */ +struct page; + +struct ahash_request * + cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, + unsigned char *key, unsigned int key_len); +int cfs_crypto_hash_update_page(struct ahash_request *req, + struct page *page, unsigned int offset, + unsigned int len); +int cfs_crypto_hash_update(struct ahash_request *req, const void *buf, + unsigned int buf_len); +int cfs_crypto_hash_final(struct ahash_request *req, + unsigned char *hash, unsigned int *hash_len); +int cfs_crypto_register(void); +void cfs_crypto_unregister(void); +int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg); +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h new file mode 100644 index 0000000000000..ac89d2cb60b55 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h @@ -0,0 +1,295 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
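/*
 * Illustrative one-shot digest using cfs_crypto_hash_digest() declared
 * above; not part of the patch.  Assumes the caller supplies a buffer of at
 * least 4 bytes for the crc32c result.
 */
static int demo_crc32c(const void *buf, unsigned int len, unsigned char *out)
{
	unsigned int out_len = 4;	/* crc32c digest size, see hash_types[] */

	return cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32C, buf, len,
				      NULL, 0, out, &out_len);
}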
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __LIBCFS_DEBUG_H__ +#define __LIBCFS_DEBUG_H__ + +#include +#include +#include + +/* + * Debugging + */ +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_stack; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_watchdog_ratelimit; +extern unsigned int libcfs_console_max_delay; +extern unsigned int libcfs_console_min_delay; +extern unsigned int libcfs_console_backoff; +extern unsigned int libcfs_debug_binary; +extern char libcfs_debug_file_path_arr[PATH_MAX]; + +int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); +int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); + +/* Has there been an LBUG? */ +extern unsigned int libcfs_catastrophe; +extern unsigned int libcfs_panic_on_lbug; + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600)) /* jiffies */ +#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */ +#define CDEBUG_DEFAULT_BACKOFF 2 +struct cfs_debug_limit_state { + unsigned long cdls_next; + unsigned int cdls_delay; + int cdls_count; +}; + +struct libcfs_debug_msg_data { + const char *msg_file; + const char *msg_fn; + int msg_subsys; + int msg_line; + int msg_mask; + struct cfs_debug_limit_state *msg_cdls; +}; + +#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls) \ +do { \ + (data)->msg_subsys = DEBUG_SUBSYSTEM; \ + (data)->msg_file = __FILE__; \ + (data)->msg_fn = __FUNCTION__; \ + (data)->msg_line = __LINE__; \ + (data)->msg_cdls = (cdls); \ + (data)->msg_mask = (mask); \ +} while (0) + +#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls) \ + static struct libcfs_debug_msg_data dataname = { \ + .msg_subsys = DEBUG_SUBSYSTEM, \ + .msg_file = __FILE__, \ + .msg_fn = __FUNCTION__, \ + .msg_line = __LINE__, \ + .msg_cdls = (cdls) }; \ + dataname.msg_mask = (mask); + +#ifdef CDEBUG_ENABLED + +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK(msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, \ + "maximum lustre stack %lu\n", \ + CDEBUG_STACK()); \ + (msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) +#else /* __x86_64__ */ +#define 
CFS_CHECK_STACK(msgdata, mask, cdls) do {} while (0) +#define CDEBUG_STACK() (0L) +#endif /* __x86_64__ */ + +/** + * Filters out logging messages based on mask and subsystem. + */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return mask & D_CANTMASK || + ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem)); +} + +# define __CDEBUG(cdls, mask, format, ...) \ +do { \ + static struct libcfs_debug_msg_data msgdata; \ + \ + CFS_CHECK_STACK(&msgdata, mask, cdls); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls); \ + libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__); \ + } \ +} while (0) + +# define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__) + +# define CDEBUG_LIMIT(mask, format, ...) \ +do { \ + static struct cfs_debug_limit_state cdls; \ + \ + __CDEBUG(&cdls, mask, format, ## __VA_ARGS__); \ +} while (0) + +# else /* !CDEBUG_ENABLED */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return 0; +} +# define CDEBUG(mask, format, ...) (void)(0) +# define CDEBUG_LIMIT(mask, format, ...) (void)(0) +# warning "CDEBUG IS DISABLED. THIS SHOULD NEVER BE DONE FOR PRODUCTION!" +# endif /* CDEBUG_ENABLED */ + +/* + * Lustre Error Checksum: calculates checksum + * of Hex number by XORing each bit. + */ +#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \ + ((hexnum) >> 8 & 0xf)) + +#define CWARN(format, ...) CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__) +#define CERROR(format, ...) CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__) +#define CNETERR(format, a...) CDEBUG_LIMIT(D_NETERROR, format, ## a) +#define CEMERG(format, ...) CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__) + +#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__) +#define LCONSOLE_INFO(format, ...) CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__) +#define LCONSOLE_WARN(format, ...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__) +#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \ + "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__) +#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__) + +#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) + +#if defined(CDEBUG_ENTRY_EXIT) + +void libcfs_log_goto(struct libcfs_debug_msg_data *goto_data, + const char *label, long rc); + +# define GOTO(label, rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(_goto_data, D_TRACE, NULL); \ + libcfs_log_goto(&_goto_data, #label, (long)(rc)); \ + } else { \ + (void)(rc); \ + } \ + \ + goto label; \ +} while (0) + + +long libcfs_log_return(struct libcfs_debug_msg_data *, long rc); +# if BITS_PER_LONG > 32 +# define RETURN(rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + return (typeof(rc))libcfs_log_return(&msgdata, \ + (long)(rc)); \ + } \ + \ + return (rc); \ +} while (0) +# else /* BITS_PER_LONG == 32 */ +/* We need an on-stack variable, because we cannot case a 32-bit pointer + * directly to (long long) without generating a complier warning/error, yet + * casting directly to (long) will truncate 64-bit return values. The log + * values will print as 32-bit values, but they always have been. 
LU-1436 + */ +# define RETURN(rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + typeof(rc) __rc = (rc); \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + libcfs_log_return(&msgdata, (long)__rc); \ + return __rc; \ + } \ + \ + return (rc); \ +} while (0) + +# endif /* BITS_PER_LONG > 32 */ + +# define ENTRY CDEBUG(D_TRACE, "Process entered\n") +# define EXIT CDEBUG(D_TRACE, "Process leaving\n") + +#else /* !CDEBUG_ENTRY_EXIT */ + +# define GOTO(label, rc) \ + do { \ + ((void)(rc)); \ + goto label; \ + } while (0) + +# define RETURN(rc) return (rc) +# define ENTRY do { } while (0) +# define EXIT do { } while (0) + +#endif /* CDEBUG_ENTRY_EXIT */ + +#define RETURN_EXIT \ +do { \ + EXIT; \ + return; \ +} while (0) + +extern int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format1, ...) + __attribute__ ((format (printf, 2, 3))); + +extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, + va_list args, const char *format2, ...) + __attribute__ ((format (printf, 4, 5))); + +/* other external symbols that tracefile provides: */ +extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, + int usr_buffer_nob); +extern int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append); + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +void cfs_debug_init(void); + +#endif /* __LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h new file mode 100644 index 0000000000000..203e470df88d0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h @@ -0,0 +1,216 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. 
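/*
 * Illustrative use of the tracing macros above (ENTRY/RETURN, CDEBUG, GOTO);
 * not part of the patch and the function is hypothetical.
 */
static int demo_debug_trace(int arg)
{
	int rc = 0;
	ENTRY;

	CDEBUG(D_INFO, "called with arg=%d\n", arg);
	if (arg < 0)
		GOTO(out, rc = -EINVAL);

	CWARN("arg %d accepted\n", arg);
out:
	RETURN(rc);
}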
+ */ + +#ifndef _LIBCFS_FAIL_H +#define _LIBCFS_FAIL_H + +extern unsigned long cfs_fail_loc; +extern unsigned int cfs_fail_val; +extern int cfs_fail_err; + +extern wait_queue_head_t cfs_race_waitq; +extern int cfs_race_state; + +int __cfs_fail_check_set(__u32 id, __u32 value, int set); +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set); + +enum { + CFS_FAIL_LOC_NOSET = 0, + CFS_FAIL_LOC_ORSET = 1, + CFS_FAIL_LOC_RESET = 2, + CFS_FAIL_LOC_VALUE = 3 +}; + +/* Failure injection control */ +#define CFS_FAIL_MASK_SYS 0x0000FF00 +#define CFS_FAIL_MASK_LOC (0x000000FF | CFS_FAIL_MASK_SYS) + +#define CFS_FAILED_BIT 30 +/* CFS_FAILED is 0x40000000 */ +#define CFS_FAILED (1 << CFS_FAILED_BIT) + +#define CFS_FAIL_ONCE_BIT 31 +/* CFS_FAIL_ONCE is 0x80000000 */ +#define CFS_FAIL_ONCE (1 << CFS_FAIL_ONCE_BIT) + +/* The following flags aren't made to be combined */ +#define CFS_FAIL_SKIP 0x20000000 /* skip N times then fail */ +#define CFS_FAIL_SOME 0x10000000 /* only fail N times */ +#define CFS_FAIL_RAND 0x08000000 /* fail 1/N of the times */ +#define CFS_FAIL_USR1 0x04000000 /* user flag */ + +/* CFS_FAULT may be combined with any one of the above flags. */ +#define CFS_FAULT 0x02000000 /* match any CFS_FAULT_CHECK */ + +static inline bool CFS_FAIL_PRECHECK(__u32 id) +{ + return cfs_fail_loc != 0 && + ((cfs_fail_loc & CFS_FAIL_MASK_LOC) == (id & CFS_FAIL_MASK_LOC) || + (cfs_fail_loc & id & CFS_FAULT)); +} + +static inline int cfs_fail_check_set(__u32 id, __u32 value, + int set, int quiet) +{ + int ret = 0; + + if (unlikely(CFS_FAIL_PRECHECK(id) && + (ret = __cfs_fail_check_set(id, value, set)))) { + if (quiet) { + CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } else { + LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } + } + + return ret; +} + +/* If id hit cfs_fail_loc, return 1, otherwise return 0 */ +#define CFS_FAIL_CHECK(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0) +#define CFS_FAIL_CHECK_QUIET(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1) + +/* If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_VALUE(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0) +#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_ORSET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0) +#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc = value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_RESET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0) +#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1) + +static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + if (unlikely(CFS_FAIL_PRECHECK(id))) + return __cfs_fail_timeout_set(id, value, ms, set); + else + return 0; +} + +/* If id hit cfs_fail_loc, sleep for seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT(id, secs) \ + cfs_fail_timeout_set(id, 0, (secs) * 1000, CFS_FAIL_LOC_NOSET) + +#define CFS_FAIL_TIMEOUT_MS(id, ms) \ + cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and + * sleep seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \ + 
cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_ORSET) + +#define CFS_FAIL_TIMEOUT_RESET(id, value, secs) \ + cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_RESET) + +#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ + cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET) + +#define CFS_FAULT_CHECK(id) \ + CFS_FAIL_CHECK(CFS_FAULT | (id)) + +/* The idea here is to synchronise two threads to force a race. The + * first thread that calls this with a matching fail_loc is put to + * sleep. The next thread that calls with the same fail_loc wakes up + * the first and continues. */ +static inline void cfs_race(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + rc = wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0); + CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); + } else { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} +#define CFS_RACE(id) cfs_race(id) + +/** + * Wait on race. + * + * The first thread that calls this with a matching fail_loc is put to sleep, + * but subseqent callers of this won't sleep. Until another thread that calls + * cfs_race_wakeup(), the first thread will be woken up and continue. + */ +static inline void cfs_race_wait(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + rc = wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0); + CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); + } + } +} +#define CFS_RACE_WAIT(id) cfs_race_wait(id) + +/** + * Wake up the thread that is waiting on the matching fail_loc. + */ +static inline void cfs_race_wakeup(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (likely(!__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} +#define CFS_RACE_WAKEUP(id) cfs_race_wakeup(id) + +#endif /* _LIBCFS_FAIL_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h new file mode 100644 index 0000000000000..0c385a337ce26 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h @@ -0,0 +1,857 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. 
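/*
 * Illustrative fault-injection hook built on the CFS_FAIL_* macros above;
 * not part of the patch.  The 0x1234 fail_loc value is hypothetical.
 */
#define DEMO_FAIL_SLOW_PATH	0x1234

static void demo_maybe_fail(void)
{
	if (CFS_FAIL_CHECK(DEMO_FAIL_SLOW_PATH))
		CERROR("injected failure on the slow path\n");

	/* sleep 2 seconds when cfs_fail_loc is set to 0x1234 */
	CFS_FAIL_TIMEOUT(DEMO_FAIL_SLOW_PATH, 2);
}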
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_hash.h + * + * Hashing routines + * + */ + +#ifndef __LIBCFS_HASH_H__ +#define __LIBCFS_HASH_H__ + +#include + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL + +/** disable debug */ +#define CFS_HASH_DEBUG_NONE 0 +/** record hash depth and output to console when it's too deep, + * computing overhead is low but consume more memory */ +#define CFS_HASH_DEBUG_1 1 +/** expensive, check key validation */ +#define CFS_HASH_DEBUG_2 2 + +#define CFS_HASH_DEBUG_LEVEL CFS_HASH_DEBUG_NONE + +struct cfs_hash_ops; +struct cfs_hash_lock_ops; +struct cfs_hash_hlist_ops; + +union cfs_hash_lock { + rwlock_t rw; /**< rwlock */ + spinlock_t spin; /**< spinlock */ +}; + +/** + * cfs_hash_bucket is a container of: + * - lock, counter ... + * - array of hash-head starting from hsb_head[0], hash-head can be one of + * . struct cfs_hash_head + * . struct cfs_hash_head_dep + * . struct cfs_hash_dhead + * . struct cfs_hash_dhead_dep + * which depends on requirement of user + * - some extra bytes (caller can require it while creating hash) + */ +struct cfs_hash_bucket { + union cfs_hash_lock hsb_lock; /**< bucket lock */ + __u32 hsb_count; /**< current entries */ + __u32 hsb_version; /**< change version */ + unsigned int hsb_index; /**< index of bucket */ + int hsb_depmax; /**< max depth on bucket */ + long hsb_head[0]; /**< hash-head array */ +}; + +/** + * cfs_hash bucket descriptor, it's normally in stack of caller + */ +struct cfs_hash_bd { + /**< address of bucket */ + struct cfs_hash_bucket *bd_bucket; + /**< offset in bucket */ + unsigned int bd_offset; +}; + +#define CFS_HASH_NAME_LEN 16 /**< default name length */ +#define CFS_HASH_BIGNAME_LEN 64 /**< bigname for param tree */ + +#define CFS_HASH_BKT_BITS 3 /**< default bits of bucket */ +#define CFS_HASH_BITS_MAX 30 /**< max bits of bucket */ +#define CFS_HASH_BITS_MIN CFS_HASH_BKT_BITS + +/** + * common hash attributes. + */ +enum cfs_hash_tag { + /** + * don't need any lock, caller will protect operations with it's + * own lock. With this flag: + * . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK + * will be ignored. + * . 
Some functions will be disabled with this flag, i.e: + * cfs_hash_for_each_empty, cfs_hash_rehash + */ + CFS_HASH_NO_LOCK = 1 << 0, + /** no bucket lock, use one spinlock to protect the whole hash */ + CFS_HASH_NO_BKTLOCK = 1 << 1, + /** rwlock to protect bucket */ + CFS_HASH_RW_BKTLOCK = 1 << 2, + /** spinlock to protect bucket */ + CFS_HASH_SPIN_BKTLOCK = 1 << 3, + /** always add new item to tail */ + CFS_HASH_ADD_TAIL = 1 << 4, + /** hash-table doesn't have refcount on item */ + CFS_HASH_NO_ITEMREF = 1 << 5, + /** big name for param-tree */ + CFS_HASH_BIGNAME = 1 << 6, + /** track global count */ + CFS_HASH_COUNTER = 1 << 7, + /** rehash item by new key */ + CFS_HASH_REHASH_KEY = 1 << 8, + /** Enable dynamic hash resizing */ + CFS_HASH_REHASH = 1 << 9, + /** can shrink hash-size */ + CFS_HASH_SHRINK = 1 << 10, + /** assert hash is empty on exit */ + CFS_HASH_ASSERT_EMPTY = 1 << 11, + /** record hlist depth */ + CFS_HASH_DEPTH = 1 << 12, + /** + * rehash is always scheduled in a different thread, so current + * change on hash table is non-blocking + */ + CFS_HASH_NBLK_CHANGE = 1 << 13, + /** NB, we typed hs_flags as __u16, please change it + * if you need to extend >=16 flags */ +}; + +/** most used attributes */ +#define CFS_HASH_DEFAULT (CFS_HASH_RW_BKTLOCK | \ + CFS_HASH_COUNTER | CFS_HASH_REHASH) + +/** + * cfs_hash is a hash-table implementation for general purpose, it can support: + * . two refcount modes + * hash-table with & without refcount + * . four lock modes + * nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock + * . general operations + * lookup, add(add_tail or add_head), delete + * . rehash + * grows or shrink + * . iteration + * locked iteration and unlocked iteration + * . bigname + * support long name hash + * . debug + * trace max searching depth + * + * Rehash: + * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker) + * is spawned to handle the rehash in the background, it's possible that other + * processes can concurrently perform additions, deletions, and lookups + * without being blocked on rehash completion, because rehash will release + * the global wrlock for each bucket. + * + * rehash and iteration can't run at the same time because it's too tricky + * to keep both of them safe and correct. + * As they are relatively rare operations, so: + * . if iteration is in progress while we try to launch rehash, then + * it just giveup, iterator will launch rehash at the end. + * . if rehash is in progress while we try to iterate the hash table, + * then we just wait (shouldn't be very long time), anyway, nobody + * should expect iteration of whole hash-table to be non-blocking. + * + * During rehashing, a (key,object) pair may be in one of two buckets, + * depending on whether the worker task has yet to transfer the object + * to its new location in the table. Lookups and deletions need to search both + * locations; additions must take care to only insert into the new bucket. 
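/*
 * Illustrative flag combination for a cfs_hash user, not part of the patch:
 * spinlock-protected buckets, a global item counter, dynamic rehash, and
 * tail insertion, chosen from the cfs_hash_tag values above.
 */
#define DEMO_HASH_FLAGS	(CFS_HASH_SPIN_BKTLOCK | CFS_HASH_COUNTER | \
			 CFS_HASH_REHASH | CFS_HASH_ADD_TAIL)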
+ */ + +struct cfs_hash { + /** serialize with rehash, or serialize all operations if + * the hash-table has CFS_HASH_NO_BKTLOCK */ + union cfs_hash_lock hs_lock; + /** hash operations */ + struct cfs_hash_ops *hs_ops; + /** hash lock operations */ + struct cfs_hash_lock_ops *hs_lops; + /** hash list operations */ + struct cfs_hash_hlist_ops *hs_hops; + /** hash buckets-table */ + struct cfs_hash_bucket **hs_buckets; + /** total number of items on this hash-table */ + atomic_t hs_count; + /** hash flags, see cfs_hash_tag for detail */ + __u16 hs_flags; + /** # of extra-bytes for bucket, for user saving extended attributes */ + __u16 hs_extra_bytes; + /** wants to iterate */ + __u8 hs_iterating; + /** hash-table is dying */ + __u8 hs_exiting; + /** current hash bits */ + __u8 hs_cur_bits; + /** min hash bits */ + __u8 hs_min_bits; + /** max hash bits */ + __u8 hs_max_bits; + /** bits for rehash */ + __u8 hs_rehash_bits; + /** bits for each bucket */ + __u8 hs_bkt_bits; + /** resize min threshold */ + __u16 hs_min_theta; + /** resize max threshold */ + __u16 hs_max_theta; + /** resize count */ + __u32 hs_rehash_count; + /** # of iterators (caller of cfs_hash_for_each_*) */ + __u32 hs_iterators; + /** rehash workitem */ + struct cfs_workitem hs_rehash_wi; + /** refcount on this hash table */ + atomic_t hs_refcount; + /** rehash buckets-table */ + struct cfs_hash_bucket **hs_rehash_buckets; +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + /** serialize debug members */ + spinlock_t hs_dep_lock; + /** max depth */ + unsigned int hs_dep_max; + /** id of the deepest bucket */ + unsigned int hs_dep_bkt; + /** offset in the deepest bucket */ + unsigned int hs_dep_off; + /** bits when we found the max depth */ + unsigned int hs_dep_bits; + /** workitem to output max depth */ + struct cfs_workitem hs_dep_wi; +#endif + /** name of htable */ + char hs_name[0]; +}; + +struct cfs_hash_lock_ops { + /** lock the hash table */ + void (*hs_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash table */ + void (*hs_unlock)(union cfs_hash_lock *lock, int exclusive); + /** lock the hash bucket */ + void (*hs_bkt_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash bucket */ + void (*hs_bkt_unlock)(union cfs_hash_lock *lock, int exclusive); +}; + +struct cfs_hash_hlist_ops { + /** return hlist_head of hash-head of @bd */ + struct hlist_head *(*hop_hhead)(struct cfs_hash *hs, struct cfs_hash_bd *bd); + /** return hash-head size */ + int (*hop_hhead_size)(struct cfs_hash *hs); + /** add @hnode to hash-head of @bd */ + int (*hop_hnode_add)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); + /** remove @hnode from hash-head of @bd */ + int (*hop_hnode_del)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +}; + +struct cfs_hash_ops { + /** return hashed value from @key */ + unsigned (*hs_hash)(struct cfs_hash *hs, const void *key, unsigned mask); + /** return key address of @hnode */ + void * (*hs_key)(struct hlist_node *hnode); + /** copy key from @hnode to @key */ + void (*hs_keycpy)(struct hlist_node *hnode, void *key); + /** + * compare @key with key of @hnode + * returns 1 on a match + */ + int (*hs_keycmp)(const void *key, struct hlist_node *hnode); + /** return object address of @hnode, i.e: container_of(...hnode) */ + void * (*hs_object)(struct hlist_node *hnode); + /** get refcount of item, always called with holding bucket-lock */ + void (*hs_get)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item 
*/ + void (*hs_put)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item, always called with holding bucket-lock */ + void (*hs_put_locked)(struct cfs_hash *hs, struct hlist_node *hnode); + /** it's called before removing of @hnode */ + void (*hs_exit)(struct cfs_hash *hs, struct hlist_node *hnode); +}; + +/** total number of buckets in @hs */ +#define CFS_HASH_NBKT(hs) \ + (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits)) + +/** total number of buckets in @hs while rehashing */ +#define CFS_HASH_RH_NBKT(hs) \ + (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits)) + +/** number of hlist for in bucket */ +#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits) + +/** total number of hlist in @hs */ +#define CFS_HASH_NHLIST(hs) (1U << (hs)->hs_cur_bits) + +/** total number of hlist in @hs while rehashing */ +#define CFS_HASH_RH_NHLIST(hs) (1U << (hs)->hs_rehash_bits) + +static inline int +cfs_hash_with_no_lock(struct cfs_hash *hs) +{ + /* caller will serialize all operations for this hash-table */ + return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0; +} + +static inline int +cfs_hash_with_no_bktlock(struct cfs_hash *hs) +{ + /* no bucket lock, one single lock to protect the hash-table */ + return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_bktlock(struct cfs_hash *hs) +{ + /* rwlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_spin_bktlock(struct cfs_hash *hs) +{ + /* spinlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_add_tail(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0; +} + +static inline int +cfs_hash_with_no_itemref(struct cfs_hash *hs) +{ + /* hash-table doesn't keep refcount on item, + * item can't be removed from hash unless it's + * ZERO refcount */ + return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0; +} + +static inline int +cfs_hash_with_bigname(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_BIGNAME) != 0; +} + +static inline int +cfs_hash_with_counter(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_COUNTER) != 0; +} + +static inline int +cfs_hash_with_rehash(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH) != 0; +} + +static inline int +cfs_hash_with_rehash_key(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0; +} + +static inline int +cfs_hash_with_shrink(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_SHRINK) != 0; +} + +static inline int +cfs_hash_with_assert_empty(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0; +} + +static inline int +cfs_hash_with_depth(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_DEPTH) != 0; +} + +static inline int +cfs_hash_with_nblk_change(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0; +} + +static inline int +cfs_hash_is_exiting(struct cfs_hash *hs) +{ /* cfs_hash_destroy is called */ + return hs->hs_exiting; +} + +static inline int +cfs_hash_is_rehashing(struct cfs_hash *hs) +{ /* rehash is launched */ + return hs->hs_rehash_bits != 0; +} + +static inline int +cfs_hash_is_iterating(struct cfs_hash *hs) +{ /* someone is calling cfs_hash_for_each_* */ + return hs->hs_iterating || hs->hs_iterators != 0; +} + +static inline int +cfs_hash_bkt_size(struct cfs_hash *hs) +{ + return offsetof(struct cfs_hash_bucket, hsb_head[0]) + + hs->hs_hops->hop_hhead_size(hs) * 
CFS_HASH_BKT_NHLIST(hs) + + hs->hs_extra_bytes; +} + +static inline unsigned +cfs_hash_id(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return hs->hs_ops->hs_hash(hs, key, mask); +} + +static inline void * +cfs_hash_key(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_key(hnode); +} + +static inline void +cfs_hash_keycpy(struct cfs_hash *hs, struct hlist_node *hnode, void *key) +{ + if (hs->hs_ops->hs_keycpy != NULL) + hs->hs_ops->hs_keycpy(hnode, key); +} + +/** + * Returns 1 on a match, + */ +static inline int +cfs_hash_keycmp(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_keycmp(key, hnode); +} + +static inline void * +cfs_hash_object(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_object(hnode); +} + +static inline void +cfs_hash_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_get(hs, hnode); +} + +static inline void +cfs_hash_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put_locked(hs, hnode); +} + +static inline void +cfs_hash_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put(hs, hnode); +} + +static inline void +cfs_hash_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + if (hs->hs_ops->hs_exit) + hs->hs_ops->hs_exit(hs, hnode); +} + +static inline void cfs_hash_lock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_lock(&hs->hs_lock, excl); +} + +static inline void cfs_hash_unlock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_unlock(&hs->hs_lock, excl); +} + +static inline int cfs_hash_dec_and_lock(struct cfs_hash *hs, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_no_bktlock(hs)); + return atomic_dec_and_lock(condition, &hs->hs_lock.spin); +} + +static inline void cfs_hash_bd_lock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl); +} + +static inline void cfs_hash_bd_unlock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl); +} + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are normally for hash-table without rehash + */ +void cfs_hash_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd); + +static inline void +cfs_hash_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd, int excl) +{ + cfs_hash_bd_get(hs, key, bd); + cfs_hash_bd_lock(hs, bd, excl); +} + +static inline unsigned +cfs_hash_bd_index_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits); +} + +static inline void +cfs_hash_bd_index_set(struct cfs_hash *hs, unsigned index, + struct cfs_hash_bd *bd) +{ + bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits]; + bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U); +} + +static inline void * +cfs_hash_bd_extra_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return (void *)bd->bd_bucket + + cfs_hash_bkt_size(hs) - hs->hs_extra_bytes; +} + +static inline __u32 +cfs_hash_bd_version_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_version; +} + +static inline __u32 +cfs_hash_bd_count_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_count; +} + +static inline int +cfs_hash_bd_depmax_get(struct cfs_hash_bd *bd) +{ + return bd->bd_bucket->hsb_depmax; +} + +static inline int 
+cfs_hash_bd_compare(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index) + return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index; + + if (bd1->bd_offset != bd2->bd_offset) + return bd1->bd_offset - bd2->bd_offset; + + return 0; +} + +void cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, + struct hlist_node *hnode); + +static inline int +cfs_hash_bd_dec_and_lock(struct cfs_hash *hs, struct cfs_hash_bd *bd, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_spin_bktlock(hs)); + return atomic_dec_and_lock(condition, &bd->bd_bucket->hsb_lock.spin); +} + +static inline struct hlist_head * +cfs_hash_bd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return hs->hs_hops->hop_hhead(hs, bd); +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key); +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key); +struct hlist_node * +cfs_hash_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + int insist_add); +struct hlist_node * +cfs_hash_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode); + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are safe for hash-table with rehash + */ +void cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds); +void cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + int excl); +void cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + int excl); + +static inline void +cfs_hash_dual_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_dual_bd_get(hs, key, bds); + cfs_hash_dual_bd_lock(hs, bds, excl); +} + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key); +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode, + int insist_add); +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode); + +/* Hash init/cleanup functions */ +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + struct cfs_hash_ops *ops, unsigned flags); + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs); +void cfs_hash_putref(struct cfs_hash *hs); + +/* Hash addition functions */ +void cfs_hash_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +int cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); + +/* Hash deletion functions */ +void *cfs_hash_del(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_del_key(struct cfs_hash *hs, const void *key); + +/* Hash lookup/for_each functions */ +#define CFS_HASH_LOOP_HOG 1024 + +typedef int (*cfs_hash_for_each_cb_t)(struct 
cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *node, + void *data); +void * +cfs_hash_lookup(struct cfs_hash *hs, const void *key); +void +cfs_hash_for_each(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +void +cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, cfs_hash_for_each_cb_t, + void *data, int start); +int +cfs_hash_for_each_empty(struct cfs_hash *hs, cfs_hash_for_each_cb_t, + void *data); +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t, void *data); +typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data); +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t, void *data); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_is_empty(struct cfs_hash *hs); +__u64 cfs_hash_size_get(struct cfs_hash *hs); + +/* + * Rehash - Theta is calculated to be the average chained + * hash depth assuming a perfectly uniform hash function. + */ +void cfs_hash_rehash_cancel_locked(struct cfs_hash *hs); +void cfs_hash_rehash_cancel(struct cfs_hash *hs); +int cfs_hash_rehash(struct cfs_hash *hs, int do_rehash); +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode); + +#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 +/* Validate hnode references the correct key */ +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + LASSERT(cfs_hash_keycmp(hs, key, hnode)); +} + +/* Validate hnode is in the correct bucket */ +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[2]; + + cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds); + LASSERT(bds[0].bd_bucket == bd->bd_bucket || + bds[1].bd_bucket == bd->bd_bucket); +} + +#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */ + +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) {} + +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) {} + +#endif /* CFS_HASH_DEBUG_LEVEL */ + +#define CFS_HASH_THETA_BITS 10 +#define CFS_HASH_MIN_THETA (1U << (CFS_HASH_THETA_BITS - 1)) +#define CFS_HASH_MAX_THETA (1U << (CFS_HASH_THETA_BITS + 1)) + +/* Return integer component of theta */ +static inline int __cfs_hash_theta_int(int theta) +{ + return (theta >> CFS_HASH_THETA_BITS); +} + +/* Return a fractional value between 0 and 999 */ +static inline int __cfs_hash_theta_frac(int theta) +{ + return ((theta * 1000) >> CFS_HASH_THETA_BITS) - + (__cfs_hash_theta_int(theta) * 1000); +} + +static inline int __cfs_hash_theta(struct cfs_hash *hs) +{ + return (atomic_read(&hs->hs_count) << + CFS_HASH_THETA_BITS) >> hs->hs_cur_bits; +} + +static inline void +__cfs_hash_set_theta(struct cfs_hash *hs, int min, int max) +{ + LASSERT(min < max); + hs->hs_min_theta = (__u16)min; + hs->hs_max_theta = (__u16)max; +} + +/* Generic debug formatting routines mainly for proc handler */ +struct seq_file; +void cfs_hash_debug_header(struct seq_file *m); +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m); + +/* + * Generic djb2 hash algorithm for character arrays. 
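+ * Each byte folds in as hash = hash * 33 + c, starting from 5381, and the
+ * result is masked with @mask by the function itself.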
+ */ +static inline unsigned +cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask) +{ + unsigned i, hash = 5381; + + LASSERT(key != NULL); + + for (i = 0; i < size; i++) + hash = hash * 33 + ((char *)key)[i]; + + return (hash & mask); +} + +/* + * Generic u32 hash algorithm. + */ +static inline unsigned +cfs_hash_u32_hash(const __u32 key, unsigned mask) +{ + return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); +} + +/* + * Generic u64 hash algorithm. + */ +static inline unsigned +cfs_hash_u64_hash(const __u64 key, unsigned mask) +{ + return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask); +} + +/** iterate over all buckets in @bds (array of struct cfs_hash_bd) */ +#define cfs_hash_for_each_bd(bds, n, i) \ + for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++) + +/** iterate over all buckets of @hs */ +#define cfs_hash_for_each_bucket(hs, bd, pos) \ + for (pos = 0; \ + pos < CFS_HASH_NBKT(hs) && \ + ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++) + +/** iterate over all hlist of bucket @bd */ +#define cfs_hash_bd_for_each_hlist(hs, bd, hlist) \ + for ((bd)->bd_offset = 0; \ + (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) && \ + (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL; \ + (bd)->bd_offset++) + +/* !__LIBCFS__HASH_H__ */ +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_heap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_heap.h new file mode 100644 index 0000000000000..239e9e0547214 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_heap.h @@ -0,0 +1,203 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + */ +/* + * libcfs/include/libcfs/heap.h + * + * Author: Eric Barton + * Liang Zhen + */ + +#ifndef __LIBCFS_HEAP_H__ +#define __LIBCFS_HEAP_H__ + +/** \defgroup heap Binary heap + * + * The binary heap is a scalable data structure created using a binary tree. It + * is capable of maintaining large sets of elements sorted usually by one or + * more element properties, but really based on anything that can be used as a + * binary predicate in order to determine the relevant ordering of any two nodes + * that belong to the set. There is no search operation, rather the intention is + * for the element of the lowest priority which will always be at the root of + * the tree (as this is an implementation of a min-heap) to be removed by users + * for consumption. 
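+ *
+ * A minimal usage sketch, with hypothetical names (my_heap_ops, elem,
+ * me_node); as described below, the element type embeds a
+ * struct cfs_binheap_node and my_heap_ops supplies at least hop_compare():
+ *
+ *	h = cfs_binheap_create(&my_heap_ops, CBH_FLAG_ATOMIC_GROW, 128,
+ *			       NULL, cptab, cptid);
+ *	cfs_binheap_insert(h, &elem->me_node);
+ *	e = cfs_binheap_remove_root(h);	(lowest-priority element, or NULL)
+ *	cfs_binheap_destroy(h);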
+ * + * Users of the heap should embed a \e struct cfs_binheap_node object instance + * on every object of the set that they wish the binary heap instance to handle, + * and (at a minimum) provide a struct cfs_binheap_ops::hop_compare() + * implementation which is used by the heap as the binary predicate during its + * internal sorting operations. + * + * The current implementation enforces no locking scheme, and so assumes the + * user caters for locking between calls to insert, delete and lookup + * operations. Since the only consumer for the data structure at this point + * are NRS policies, and these operate on a per-CPT basis, binary heap instances + * are tied to a specific CPT. + * @{ + */ + +/** + * Binary heap node. + * + * Objects of this type are embedded into objects of the ordered set that is to + * be maintained by a \e struct cfs_binheap instance. + */ +struct cfs_binheap_node { + /** Index into the binary tree */ + unsigned int chn_index; +}; + +#define CBH_SHIFT 9 +#define CBH_SIZE (1 << CBH_SHIFT) /* # ptrs per level */ +#define CBH_MASK (CBH_SIZE - 1) +#define CBH_NOB (CBH_SIZE * sizeof(struct cfs_binheap_node *)) + +#define CBH_POISON 0xdeadbeef + +/** + * Binary heap flags. + */ +enum { + CBH_FLAG_ATOMIC_GROW = 1, +}; + +struct cfs_binheap; + +/** + * Binary heap operations. + */ +struct cfs_binheap_ops { + /** + * Called right before inserting a node into the binary heap. + * + * Implementing this operation is optional. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 success + * \retval != 0 error + */ + int (*hop_enter)(struct cfs_binheap *h, + struct cfs_binheap_node *e); + /** + * Called right after removing a node from the binary heap. + * + * Implementing this operation is optional. + * + * \param[in] h The heap + * \param[in] e The node + */ + void (*hop_exit)(struct cfs_binheap *h, + struct cfs_binheap_node *e); + /** + * A binary predicate which is called during internal heap sorting + * operations, and used in order to determine the relevant ordering of + * two heap nodes. + * + * Implementing this operation is mandatory. + * + * \param[in] a The first heap node + * \param[in] b The second heap node + * + * \retval 0 Node a > node b + * \retval 1 Node a < node b + * + * \see cfs_binheap_bubble() + * \see cfs_biheap_sink() + */ + int (*hop_compare)(struct cfs_binheap_node *a, + struct cfs_binheap_node *b); +}; + +/** + * Binary heap object. 
+ * + * Sorts elements of type \e struct cfs_binheap_node + */ +struct cfs_binheap { + /** Triple indirect */ + struct cfs_binheap_node ****cbh_elements3; + /** double indirect */ + struct cfs_binheap_node ***cbh_elements2; + /** single indirect */ + struct cfs_binheap_node **cbh_elements1; + /** # elements referenced */ + unsigned int cbh_nelements; + /** high water mark */ + unsigned int cbh_hwm; + /** user flags */ + unsigned int cbh_flags; + /** operations table */ + struct cfs_binheap_ops *cbh_ops; + /** private data */ + void *cbh_private; + /** associated CPT table */ + struct cfs_cpt_table *cbh_cptab; + /** associated CPT id of this struct cfs_binheap::cbh_cptab */ + int cbh_cptid; +}; + +void cfs_binheap_destroy(struct cfs_binheap *h); +struct cfs_binheap * +cfs_binheap_create(struct cfs_binheap_ops *ops, unsigned int flags, + unsigned count, void *arg, struct cfs_cpt_table *cptab, + int cptid); +struct cfs_binheap_node * +cfs_binheap_find(struct cfs_binheap *h, unsigned int idx); +int cfs_binheap_insert(struct cfs_binheap *h, struct cfs_binheap_node *e); +void cfs_binheap_remove(struct cfs_binheap *h, struct cfs_binheap_node *e); +void cfs_binheap_relocate(struct cfs_binheap *h, struct cfs_binheap_node *e); + +static inline int +cfs_binheap_size(struct cfs_binheap *h) +{ + return h->cbh_nelements; +} + +static inline int +cfs_binheap_is_empty(struct cfs_binheap *h) +{ + return h->cbh_nelements == 0; +} + +static inline struct cfs_binheap_node * +cfs_binheap_root(struct cfs_binheap *h) +{ + return cfs_binheap_find(h, 0); +} + +static inline struct cfs_binheap_node * +cfs_binheap_remove_root(struct cfs_binheap *h) +{ + struct cfs_binheap_node *e = cfs_binheap_find(h, 0); + + if (e != NULL) + cfs_binheap_remove(h, e); + return e; +} + +/** @} heap */ + +#endif /* __LIBCFS_HEAP_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h new file mode 100644 index 0000000000000..1001362e75cd0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h @@ -0,0 +1,85 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_prim.h + * + * General primitives. 
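+ *
+ * A small sketch of the memory-pressure helpers defined below (the code
+ * around them is hypothetical):
+ *
+ *	int old = cfs_memory_pressure_get_and_set();
+ *
+ *	... allocate while allowed to dip into reserves (PF_MEMALLOC) ...
+ *
+ *	cfs_memory_pressure_restore(old);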
+ * + */ + +#ifndef __LIBCFS_PRIM_H__ +#define __LIBCFS_PRIM_H__ + +#include +#include + +/* + * Memory + */ +#if BITS_PER_LONG == 32 +/* limit to lowmem on 32-bit systems */ +# define NUM_CACHEPAGES \ + min(cfs_totalram_pages(), 1UL << (30 - PAGE_SHIFT) * 3 / 4) +#else +# define NUM_CACHEPAGES cfs_totalram_pages() +#endif + +static inline unsigned int memory_pressure_get(void) +{ + return current->flags & PF_MEMALLOC; +} + +static inline void memory_pressure_set(void) +{ + current->flags |= PF_MEMALLOC; +} + +static inline void memory_pressure_clr(void) +{ + current->flags &= ~PF_MEMALLOC; +} + +static inline int cfs_memory_pressure_get_and_set(void) +{ + int old = memory_pressure_get(); + + if (!old) + memory_pressure_set(); + return old; +} + +static inline void cfs_memory_pressure_restore(int old) +{ + if (old) + memory_pressure_set(); + else + memory_pressure_clr(); + return; +} +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h new file mode 100644 index 0000000000000..9a242839fd843 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h @@ -0,0 +1,428 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_private.h + * + * Various defines for libcfs. + * + */ + +#ifndef __LIBCFS_PRIVATE_H__ +#define __LIBCFS_PRIVATE_H__ + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#include +#include + +#ifdef LIBCFS_DEBUG + +/* + * When this is on, LASSERT macro includes check for assignment used instead + * of equality check, but doesn't have unlikely(). Turn this on from time to + * time to make test-builds. This shouldn't be on for production release. + */ +#define LASSERT_CHECKED (0) + +#if LASSERT_CHECKED +/* + * Assertion. + * + * Strange construction with empty "then" clause is used to trigger compiler + * warnings on the assertions of the form LASSERT(a = b); + * + * "warning: suggest parentheses around assignment used as truth value" + * + * requires -Wall. Unfortunately this rules out use of likely/unlikely. + */ +#define LASSERTF(cond, fmt, ...) 
\ +do { \ + if (cond) \ + ; \ + else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") + +#else /* !LASSERT_CHECKED */ + +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") +#endif /* !LASSERT_CHECKED */ +#else /* !LIBCFS_DEBUG */ +/* sizeof is to use expression without evaluating it. */ +# define LASSERT(e) ((void)sizeof!!(e)) +# define LASSERTF(cond, ...) ((void)sizeof!!(cond)) +#endif /* !LIBCFS_DEBUG */ + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +/** + * This is for more expensive checks that one doesn't want to be enabled all + * the time. LINVRNT() has to be explicitly enabled by --enable-invariants + * configure option. + */ +# define LINVRNT(exp) LASSERT(exp) +#else +# define LINVRNT(exp) ((void)sizeof!!(exp)) +#endif + +#define KLASSERT(e) LASSERT(e) + +void lbug_with_loc(struct libcfs_debug_msg_data *) __attribute__((noreturn)); + +#define LBUG() \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + lbug_with_loc(&msgdata); \ +} while(0) + +/* + * Memory + */ +#ifdef LIBCFS_DEBUG + +extern atomic_t libcfs_kmemory; + +# define libcfs_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_dec(ptr, size) \ +do { \ + atomic_sub(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_read() \ + atomic_read(&libcfs_kmemory) + +#else +# define libcfs_kmem_inc(ptr, size) do {} while (0) +# define libcfs_kmem_dec(ptr, size) do {} while (0) +# define libcfs_kmem_read() (0) +#endif /* LIBCFS_DEBUG */ + +#ifndef LIBCFS_VMALLOC_SIZE +#define LIBCFS_VMALLOC_SIZE (2 << PAGE_SHIFT) /* 2 pages */ +#endif + +#define LIBCFS_ALLOC_PRE(size, mask) \ +do { \ + LASSERT(!in_interrupt() || \ + ((size) <= LIBCFS_VMALLOC_SIZE && \ + ((mask) & GFP_ATOMIC)) != 0); \ +} while (0) + +#define LIBCFS_ALLOC_POST(ptr, size) \ +do { \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LNET: out of memory at %s:%d (tried to alloc '" \ + #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size)); \ + CERROR("LNET: %d total bytes allocated by lnet\n", \ + libcfs_kmem_read()); \ + } else { \ + libcfs_kmem_inc((ptr), (size)); \ + CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n", \ + (int)(size), (ptr), libcfs_kmem_read()); \ + } \ +} while (0) + +/** + * allocate memory with GFP flags @mask + * The allocated memory is zeroed-out. + */ +#define LIBCFS_ALLOC_GFP(ptr, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + kzalloc((size), (mask)) : vzalloc(size); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** + * default allocator + */ +#define LIBCFS_ALLOC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_NOFS) + +/** + * non-sleeping allocator + */ +#define LIBCFS_ALLOC_ATOMIC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC) + +/** + * allocate memory for specified CPU partition + * \a cptab != NULL, \a cpt is CPU partition id of \a cptab + * \a cptab == NULL, \a cpt is HW NUMA node id + * The allocated memory is zeroed-out. 
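+ *
+ * A minimal usage sketch (the struct and variable names are hypothetical);
+ * memory obtained this way is released with LIBCFS_FREE() using the same
+ * size:
+ *
+ *	struct my_struct *ptr;
+ *
+ *	LIBCFS_CPT_ALLOC(ptr, cptab, cpt, sizeof(*ptr));
+ *	if (ptr == NULL)
+ *		return -ENOMEM;
+ *	...
+ *	LIBCFS_FREE(ptr, sizeof(*ptr));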
+ */ +#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + cfs_cpt_malloc((cptab), (cpt), (size), (mask) | __GFP_ZERO) : \ + cfs_cpt_vzalloc((cptab), (cpt), (size)); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** default numa allocator */ +#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ + LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +#ifdef LLIST_HEAD +void init_libcfs_vfree_atomic(void); +void exit_libcfs_vfree_atomic(void); +#define HAVE_LIBCFS_VFREE_ATOMIC +#else +#define init_libcfs_vfree_atomic() do {} while(0) +#define exit_libcfs_vfree_atomic() do {} while(0) +#endif + +#define LIBCFS_FREE(ptr, size) \ +do { \ + int s = (size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + libcfs_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ + s, (ptr), libcfs_kmem_read()); \ + if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ + libcfs_vfree_atomic(ptr); \ + else \ + kfree(ptr); \ +} while (0) + +/******************************************************************************/ + +struct task_struct; + +void libcfs_debug_dumpstack(struct task_struct *tsk); +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); +int libcfs_debug_clear_buffer(void); +int libcfs_debug_mark_buffer(const char *text); + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by count. + */ +void *cfs_array_alloc(int count, unsigned int size); +void cfs_array_free(void *vars); + +#define LASSERT_ATOMIC_ENABLED (1) + +#if LASSERT_ATOMIC_ENABLED + +/** assert value of @a is equal to @v */ +#define LASSERT_ATOMIC_EQ(a, v) \ +do { \ + LASSERTF(atomic_read(a) == v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is unequal to @v */ +#define LASSERT_ATOMIC_NE(a, v) \ +do { \ + LASSERTF(atomic_read(a) != v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little than @v */ +#define LASSERT_ATOMIC_LT(a, v) \ +do { \ + LASSERTF(atomic_read(a) < v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little/equal to @v */ +#define LASSERT_ATOMIC_LE(a, v) \ +do { \ + LASSERTF(atomic_read(a) <= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v */ +#define LASSERT_ATOMIC_GT(a, v) \ +do { \ + LASSERTF(atomic_read(a) > v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great/equal to @v */ +#define LASSERT_ATOMIC_GE(a, v) \ +do { \ + LASSERTF(atomic_read(a) >= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great than @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is 
great/equal to @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +#else /* !LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_EQ(a, v) do {} while (0) +#define LASSERT_ATOMIC_NE(a, v) do {} while (0) +#define LASSERT_ATOMIC_LT(a, v) do {} while (0) +#define LASSERT_ATOMIC_LE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT(a, v) do {} while (0) +#define LASSERT_ATOMIC_GE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) do {} while (0) + +#endif /* LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_ZERO(a) LASSERT_ATOMIC_EQ(a, 0) +#define LASSERT_ATOMIC_POS(a) LASSERT_ATOMIC_GT(a, 0) + +#define CFS_ALLOC_PTR(ptr) LIBCFS_ALLOC(ptr, sizeof (*(ptr))); +#define CFS_FREE_PTR(ptr) LIBCFS_FREE(ptr, sizeof (*(ptr))); + +/** Compile-time assertion. + + * Check an invariant described by a constant expression at compile time by + * forcing a compiler error if it does not hold. \a cond must be a constant + * expression as defined by the ISO C Standard: + * + * 6.8.4.2 The switch statement + * .... + * [#3] The expression of each case label shall be an integer + * constant expression and no two of the case constant + * expressions in the same switch statement shall have the same + * value after conversion... + * + */ +#define CLASSERT(cond) do {switch (1) {case (cond): case 0: break; } } while (0) + +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) + +/* what used to be in portals_lib.h */ +#ifndef MIN +# define MIN(a,b) (((a)<(b)) ? (a): (b)) +#endif +#ifndef MAX +# define MAX(a,b) (((a)>(b)) ? (a): (b)) +#endif + +#define MKSTR(ptr) ((ptr))? (ptr) : "" + +static inline size_t cfs_size_round4(size_t val) +{ + return (val + 3) & (~0x3); +} + +#ifndef HAVE_CFS_SIZE_ROUND +static inline size_t cfs_size_round(size_t val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +static inline size_t cfs_size_round16(size_t val) +{ + return (val + 0xf) & (~0xf); +} + +static inline size_t cfs_size_round32(size_t val) +{ + return (val + 0x1f) & (~0x1f); +} + +static inline size_t cfs_size_round0(size_t val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t cfs_round_strlen(char *fset) +{ + return cfs_size_round(strlen(fset) + 1); +} + +extern struct cfs_psdev_ops libcfs_psdev_ops; +extern struct miscdevice libcfs_dev; +extern struct cfs_wi_sched *cfs_sched_rehash; + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h new file mode 100644 index 0000000000000..4d9dbde91e8a0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h @@ -0,0 +1,91 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_STRING_H__ +#define __LIBCFS_STRING_H__ + +/* libcfs_string.c */ +char *cfs_strrstr(const char *haystack, const char *needle); +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask); +/* trim leading and trailing space characters */ +char *cfs_firststr(char *str, size_t size); + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. + */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +char *cfs_trimwhite(char *str); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_print(char *buffer, int count, + struct cfs_expr_list *expr_list); +int cfs_expr_list_values(struct cfs_expr_list *expr_list, + int max, __u32 **values); +void cfs_expr_list_values_free(__u32 *values, int num); +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free_list(struct list_head *list); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h new file mode 100644 index 0000000000000..84da4d98591ee --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_workitem.h + * + * Author: Isaac Huang + * Liang Zhen + * + * A workitems is deferred work with these semantics: + * - a workitem always runs in thread context. + * - a workitem can be concurrent with other workitems but is strictly + * serialized with respect to itself. + * - no CPU affinity, a workitem does not necessarily run on the same CPU + * that schedules it. However, this might change in the future. + * - if a workitem is scheduled again before it has a chance to run, it + * runs only once. + * - if a workitem is scheduled while it runs, it runs again after it + * completes; this ensures that events occurring while other events are + * being processed receive due attention. This behavior also allows a + * workitem to reschedule itself. + * + * Usage notes: + * - a workitem can sleep but it should be aware of how that sleep might + * affect others. + * - a workitem runs inside a kernel thread so there's no user space to access. + * - do not use a workitem if the scheduling latency can't be tolerated. + * + * When wi_action returns non-zero, it means the workitem has either been + * freed or reused and workitem scheduler won't touch it any more. + */ + +#ifndef __LIBCFS_WORKITEM_H__ +#define __LIBCFS_WORKITEM_H__ + +struct cfs_wi_sched; + +void cfs_wi_sched_destroy(struct cfs_wi_sched *); +int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt, + int nthrs, struct cfs_wi_sched **); + +struct cfs_workitem; + +typedef int (*cfs_wi_action_t) (struct cfs_workitem *); + +struct cfs_workitem { + /** chain on runq or rerunq */ + struct list_head wi_list; + /** working function */ + cfs_wi_action_t wi_action; + /** arg for working function */ + void *wi_data; + /** in running */ + unsigned short wi_running:1; + /** scheduled */ + unsigned short wi_scheduled:1; +}; + +static inline void +cfs_wi_init(struct cfs_workitem *wi, void *data, cfs_wi_action_t action) +{ + INIT_LIST_HEAD(&wi->wi_list); + + wi->wi_running = 0; + wi->wi_scheduled = 0; + wi->wi_data = data; + wi->wi_action = action; +} + +void cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); +int cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); +void cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi); + +int cfs_wi_startup(void); +void cfs_wi_shutdown(void); + +/** # workitem scheduler loops before reschedule */ +#define CFS_WI_RESCHED 128 + +#endif /* __LIBCFS_WORKITEM_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h new file mode 100644 index 0000000000000..918f8daa8f4ca --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_LINUX_CPU_H__ +#define __LIBCFS_LINUX_CPU_H__ + +#include + +#ifndef HAVE_TOPOLOGY_SIBLING_CPUMASK +# define topology_sibling_cpumask(cpu) topology_thread_cpumask(cpu) +#endif /* HAVE_TOPOLOGY_SIBLING_CPUMASK */ + +#ifndef HAVE_CPUS_READ_LOCK +# define cpus_read_lock get_online_cpus +# define cpus_read_unlock put_online_cpus +#endif + +#endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h new file mode 100644 index 0000000000000..6346c59e516e7 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h @@ -0,0 +1,55 @@ + /* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/** + * Linux crypto hash specific functions. + */ + +/** + * Functions for start/stop shash CRC32 algorithm. + */ +int cfs_crypto_crc32_register(void); +void cfs_crypto_crc32_unregister(void); + +/** + * Functions for start/stop shash adler32 algorithm. + */ +int cfs_crypto_adler32_register(void); +void cfs_crypto_adler32_unregister(void); + +/** + * Functions for start/stop shash crc32 pclmulqdq + */ +int cfs_crypto_crc32_pclmul_register(void); +void cfs_crypto_crc32_pclmul_unregister(void); + +/** + * Functions for start/stop shash crc32c pclmulqdq + */ +int cfs_crypto_crc32c_pclmul_register(void); +void cfs_crypto_crc32c_pclmul_unregister(void); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h new file mode 100644 index 0000000000000..dd86d1947466b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-fs.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_FS_H__ +#define __LIBCFS_LINUX_CFS_FS_H__ + +#include +#include +#include +#include +#include + +#ifndef HAVE_FILE_DENTRY +static inline struct dentry *file_dentry(const struct file *file) +{ + return file->f_path.dentry; +} +#endif + +#ifndef QSTR_INIT +#define QSTR_INIT(n, l) { .len = l, .name = n } +#endif + +#if defined(HAVE_FILE_FSYNC_4ARGS) || defined(HAVE_FILE_FSYNC_2ARGS) +#define ll_vfs_fsync_range(fp, start, end, datasync) \ + vfs_fsync_range(fp, start, end, datasync) +#else +#define ll_vfs_fsync_range(fp, start, end, datasync) \ + vfs_fsync_range(fp, file_dentry(fp), start, end, datasync) +#endif + +#ifndef IFSHIFT +#define IFSHIFT 12 +#endif + +#ifndef IFTODT +#define IFTODT(type) (((type) & S_IFMT) >> IFSHIFT) +#endif +#ifndef DTTOIF +#define DTTOIF(dirtype) ((dirtype) << IFSHIFT) +#endif + +#ifndef HAVE_POSIXACL_USER_NS +/* + * Mask out &init_user_ns so we don't jump + * through hoops to define it somehow only + * to have it ignored anyway. + */ +#define posix_acl_from_xattr(a,b,c) posix_acl_from_xattr(b,c) +#define posix_acl_to_xattr(a,b,c,d) posix_acl_to_xattr(b,c,d) +#endif + +#ifndef HAVE_POSIX_ACL_VALID_USER_NS +#define posix_acl_valid(a,b) posix_acl_valid(b) +#endif + +#ifdef HAVE_PROC_OPS +#define PROC_OWNER(_fn) +#else +#define proc_ops file_operations +#define PROC_OWNER(_owner) .owner = (_owner), +#define proc_open open +#define proc_read read +#define proc_write write +#define proc_lseek llseek +#define proc_release release +#define proc_poll poll +#define proc_ioctl unlocked_ioctl +#define proc_compat_ioctl compat_ioctl +#define proc_mmap mmap +#define proc_get_unmapped_area get_unmapped_area +#endif + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h new file mode 100644 index 0000000000000..2721655306bbe --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h @@ -0,0 +1,247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_HASH_H__ +#define __LIBCFS_LINUX_HASH_H__ + +#include + +u64 cfs_hashlen_string(const void *salt, const char *name); + +#ifndef hashlen_hash +#define hashlen_hash(hashlen) ((u32)(hashlen)) +#endif + +#ifndef HAVE_STRINGHASH +#ifndef hashlen_create +#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) +#endif +#endif /* !HAVE_STRINGHASH */ + +#ifdef HAVE_LINUX_RHASHTABLE_H +#include + +#ifndef HAVE_RHLTABLE +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct rhltable { + struct rhashtable ht; +}; + +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + +static inline int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params) +{ + return rhashtable_init(&hlt->ht, params); +} + +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable *ht = &hlt->ht; + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return he ? container_of(he, struct rhlist_head, rhead) : NULL; + } + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} + +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ +#ifdef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + return __rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params); +#else + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params)); +#endif +} + +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return rhashtable_remove_fast(&hlt->ht, &list->rhead, params); +} + +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + rhltable_free_and_destroy(hlt, NULL, NULL); +} + +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + rhashtable_walk_init(&hlt->ht, iter); +} +#endif /* !HAVE_RHLTABLE */ + +#ifdef HAVE_BROKEN_HASH_64 + +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull + +static inline u32 cfs_hash_32(u32 val, unsigned int bits) +{ + /* High bits are more random, so use them. 
*/ + return (val * GOLDEN_RATIO_32) >> (32 - bits); +} + +static __always_inline u32 cfs_hash_64(u64 val, unsigned int bits) +{ +#if BITS_PER_LONG == 64 + /* 64x64-bit multiply is efficient on all 64-bit processors */ + return val * GOLDEN_RATIO_64 >> (64 - bits); +#else + /* Hash 64 bits using only 32x32-bit multiply. */ + return cfs_hash_32(((u32)val ^ ((val >> 32) * GOLDEN_RATIO_32)), bits); +#endif +} +#else + +#define cfs_hash_32 hash_32 +#define cfs_hash_64 hash_64 + +#endif /* HAVE_BROKEN_HASH_64 */ + +#ifndef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key; + void *ret; + int rc; + + rc = rhashtable_lookup_insert_fast(ht, obj, params); + switch (rc) { + case -EEXIST: + key = rht_obj(ht, obj); + ret = rhashtable_lookup_fast(ht, key, params); + break; + case 0: + ret = NULL; + break; + default: + ret = ERR_PTR(rc); + break; + } + return ret; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST */ + +#ifndef HAVE_RHASHTABLE_LOOKUP +/* + * The function rhashtable_lookup() and rhashtable_lookup_fast() + * are almost the same except rhashtable_lookup() doesn't + * take the RCU read lock. Since this is the case and only + * SLES12 SP3 lacks rhashtable_lookup() just duplicate the + * SLES12 SP3 rhashtable_lookup_fast() minus the RCU read lock. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + const struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return rht_obj(ht, he); + } + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP */ +#else +#define rhashtable_init(ht, param) 0 +#define rhashtable_destroy(ht) do {} while (0) +#endif /* HAVE_LINUX_RHASHTABLE_H */ + +#endif /* __LIBCFS_LINUX_HASH_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h new file mode 100644 index 0000000000000..e4a8e8d92c325 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h @@ -0,0 +1,52 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_LIST_H__ +#define __LIBCFS_LINUX_LIST_H__ + +#include + +#ifdef HAVE_HLIST_FOR_EACH_3ARG +#define cfs_hlist_for_each_entry(tpos, pos, head, member) \ + hlist_for_each_entry(tpos, head, member) +#define cfs_hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + hlist_for_each_entry_safe(tpos, n, head, member) +#define cfs_hlist_for_each_entry_continue(tpos, pos, member) \ + hlist_for_each_entry_continue(tpos, member) +#define cfs_hlist_for_each_entry_from(tpos, pos, member) \ + hlist_for_each_entry_from(tpos, member) +#else +#define cfs_hlist_for_each_entry(tpos, pos, head, member) \ + hlist_for_each_entry(tpos, pos, head, member) +#define cfs_hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + hlist_for_each_entry_safe(tpos, pos, n, head, member) +#define cfs_hlist_for_each_entry_continue(tpos, pos, member) \ + hlist_for_each_entry_continue(tpos, pos, member) +#define cfs_hlist_for_each_entry_from(tpos, pos, member) \ + hlist_for_each_entry_from(tpos, pos, member) +#endif + +#ifdef HAVE_HLIST_ADD_AFTER +#define hlist_add_behind(hnode, tail) hlist_add_after(tail, hnode) +#endif /* HAVE_HLIST_ADD_AFTER */ + +#endif /* __LIBCFS_LINUX_LIST_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h new file mode 100644 index 0000000000000..81e79dbf24852 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h @@ -0,0 +1,162 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. 
+ */ + +#ifndef __LIBCFS_LINUX_CFS_MEM_H__ +#define __LIBCFS_LINUX_CFS_MEM_H__ + +#include +#include +#include +#include +#ifdef HAVE_MM_INLINE +# include +#endif + +/* + * Shrinker + */ +#ifdef HAVE_SHRINK_CONTROL +# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask) \ + struct shrinker *shrinker, \ + struct shrink_control *sc +# define shrink_param(sc, var) ((sc)->var) +#else +struct shrink_control { + gfp_t gfp_mask; + unsigned long nr_to_scan; +}; +# ifdef HAVE_SHRINKER_WANT_SHRINK_PTR +# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask) \ + struct shrinker *shrinker, \ + int nr_to_scan, gfp_t gfp_mask +# else +# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask) \ + int nr_to_scan, gfp_t gfp_mask +# endif + /* avoid conflict with spl mm_compat.h */ +# define HAVE_SHRINK_CONTROL_STRUCT 1 +# define shrink_param(sc, var) (var) +#endif + +#ifdef HAVE_SHRINKER_COUNT +struct shrinker_var { + unsigned long (*count)(struct shrinker *, + struct shrink_control *sc); + unsigned long (*scan)(struct shrinker *, + struct shrink_control *sc); +}; +# define DEF_SHRINKER_VAR(name, shrink, count_obj, scan_obj) \ + struct shrinker_var name = { .count = count_obj, .scan = scan_obj } +#else +struct shrinker_var { + int (*shrink)(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)); +}; +# define DEF_SHRINKER_VAR(name, shrinker, count, scan) \ + struct shrinker_var name = { .shrink = shrinker } +# define SHRINK_STOP (~0UL) +#endif + +static inline +struct shrinker *set_shrinker(int seek, struct shrinker_var *var) +{ + struct shrinker *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return (NULL); + +#ifdef HAVE_SHRINKER_COUNT + s->count_objects = var->count; + s->scan_objects = var->scan; +#else + s->shrink = var->shrink; +#endif + s->seeks = seek; + + register_shrinker(s); + + return s; +} + +static inline +void remove_shrinker(struct shrinker *shrinker) +{ + if (shrinker == NULL) + return; + + unregister_shrinker(shrinker); + kfree(shrinker); +} + +#ifndef HAVE_MMAP_LOCK +static inline void mmap_write_lock(struct mm_struct *mm) +{ + down_write(&mm->mmap_sem); +} + +static inline bool mmap_write_trylock(struct mm_struct *mm) +{ + return down_write_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_write_unlock(struct mm_struct *mm) +{ + up_write(&mm->mmap_sem); +} + +static inline void mmap_read_lock(struct mm_struct *mm) +{ + down_read(&mm->mmap_sem); +} + +static inline bool mmap_read_trylock(struct mm_struct *mm) +{ + return down_read_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_read_unlock(struct mm_struct *mm) +{ + up_read(&mm->mmap_sem); +} +#endif + +#ifdef HAVE_VMALLOC_2ARGS +#define __ll_vmalloc(size, flags) __vmalloc(size, flags) +#else +#define __ll_vmalloc(size, flags) __vmalloc(size, flags, PAGE_KERNEL) +#endif + +#endif /* __LINUX_CFS_MEM_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h new file mode 100644 index 0000000000000..a55697b2cfbfe --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h @@ -0,0 +1,204 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
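/*
 * Illustrative sketch (not part of the patch): registering a cache shrinker
 * through the compat layer above so one source builds against both the old
 * single-callback and the newer count/scan shrinker APIs.  my_cache_count(),
 * my_cache_scan(), my_cache_shrink() and the my_cache_* helpers they call
 * are hypothetical.
 */
static unsigned long my_cache_count(struct shrinker *s,
				    struct shrink_control *sc)
{
	return my_cache_nr_items();		/* hypothetical item counter */
}

static unsigned long my_cache_scan(struct shrinker *s,
				   struct shrink_control *sc)
{
	return my_cache_reclaim(shrink_param(sc, nr_to_scan));
}

static int my_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
{
	/* combined callback used when HAVE_SHRINKER_COUNT is not defined */
	my_cache_reclaim(shrink_param(sc, nr_to_scan));
	return my_cache_nr_items();
}

static struct shrinker *my_shrinker;

static int my_cache_init(void)
{
	DEF_SHRINKER_VAR(var, my_cache_shrink, my_cache_count, my_cache_scan);

	my_shrinker = set_shrinker(DEFAULT_SEEKS, &var);
	return my_shrinker != NULL ? 0 : -ENOMEM;
}

static void my_cache_fini(void)
{
	remove_shrinker(my_shrinker);
}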
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_MISC_H__ +#define __LIBCFS_LINUX_MISC_H__ + +#include +#include +#include +#include +#include + +#ifdef HAVE_SYSCTL_CTLNAME +#define INIT_CTL_NAME .ctl_name = CTL_UNNUMBERED, +#define INIT_STRATEGY .strategy = &sysctl_intvec, +#else +#define INIT_CTL_NAME +#define INIT_STRATEGY +#endif + +#ifndef HAVE_IOV_ITER_TYPE +#ifdef HAVE_IOV_ITER_HAS_TYPE_MEMBER +#define iter_is_iovec(iter) ((iter)->type & ITER_IOVEC) +#define iov_iter_is_kvec(iter) ((iter)->type & ITER_KVEC) +#define iov_iter_is_bvec(iter) ((iter)->type & ITER_BVEC) +#define iov_iter_is_pipe(iter) ((iter)->type & ITER_PIPE) +#define iov_iter_is_discard(iter) ((iter)->type & ITER_DISCARD) +#else +#define iter_is_iovec(iter) 1 +#define iov_iter_is_kvec(iter) 0 +#define iov_iter_is_bvec(iter) 0 +#define iov_iter_is_pipe(iter) 0 +#define iov_iter_is_discard(iter) 0 +#endif +#endif /* HAVE_IOV_ITER_TYPE */ + +#ifndef HAVE_MODULE_PARAM_LOCKING +static DEFINE_MUTEX(param_lock); +#endif + +#ifndef HAVE_UIDGID_HEADER + +#ifndef _LINUX_UIDGID_H +#define _LINUX_UIDGID_H + +typedef uid_t kuid_t; +typedef gid_t kgid_t; + +#define INVALID_UID -1 +#define INVALID_GID -1 + +#define GLOBAL_ROOT_UID 0 +#define GLOBAL_ROOT_GID 0 + +static inline uid_t __kuid_val(kuid_t uid) +{ + return uid; +} + +static inline gid_t __kgid_val(kgid_t gid) +{ + return gid; +} + +static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) +{ + return uid; +} + +static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) +{ + return gid; +} + +static inline uid_t from_kuid(struct user_namespace *to, kuid_t uid) +{ + return uid; +} + +static inline gid_t from_kgid(struct user_namespace *to, kgid_t gid) +{ + return gid; +} + +static inline bool uid_eq(kuid_t left, kuid_t right) +{ + return left == right; +} + +static inline bool uid_valid(kuid_t uid) +{ + return uid != (typeof(uid))INVALID_UID; +} + +static inline bool gid_valid(kgid_t gid) +{ + return gid != (typeof(gid))INVALID_GID; +} +#endif /* _LINUX_UIDGID_H */ + +#endif + +int cfs_get_environ(const char *key, char *value, int *val_len); + +int cfs_kernel_write(struct file *filp, const void *buf, size_t count, + loff_t *pos); + +/* + * For RHEL6 struct kernel_parm_ops doesn't exist. Also + * the arguments for .set and .get take different + * parameters which is handled below + */ +#ifdef HAVE_KERNEL_PARAM_OPS +#define cfs_kernel_param_arg_t const struct kernel_param +#else +#define cfs_kernel_param_arg_t struct kernel_param_ops +#define kernel_param_ops kernel_param +#endif /* ! 
HAVE_KERNEL_PARAM_OPS */ + +#ifndef HAVE_KERNEL_PARAM_LOCK +static inline void kernel_param_unlock(struct module *mod) +{ +#ifndef HAVE_MODULE_PARAM_LOCKING + mutex_unlock(&param_lock); +#else + __kernel_param_unlock(); +#endif +} + +static inline void kernel_param_lock(struct module *mod) +{ +#ifndef HAVE_MODULE_PARAM_LOCKING + mutex_lock(&param_lock); +#else + __kernel_param_lock(); +#endif +} +#endif /* ! HAVE_KERNEL_PARAM_LOCK */ + +#ifndef HAVE_KSTRTOUL +static inline int kstrtoul(const char *s, unsigned int base, unsigned long *res) +{ + char *end = (char *)s; + + *res = simple_strtoul(s, &end, base); + if (end - s == 0) + return -EINVAL; + return 0; +} +#endif /* !HAVE_KSTRTOUL */ + +#ifndef HAVE_KSTRTOBOOL_FROM_USER + +#define kstrtobool strtobool + +int kstrtobool_from_user(const char __user *s, size_t count, bool *res); +#endif + +#ifndef HAVE_TASK_IS_RUNNING +#define task_is_running(task) (task->state == TASK_RUNNING) +#endif + +#ifdef HAVE_KALLSYMS_LOOKUP_NAME +static inline void *cfs_kallsyms_lookup_name(const char *name) +{ + return (void *)kallsyms_lookup_name(name); +} +#else +static inline void *cfs_kallsyms_lookup_name(const char *name) +{ + return NULL; +} +#endif + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h new file mode 100644 index 0000000000000..98951f7a5d4bb --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h @@ -0,0 +1,76 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code).
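/*
 * Illustrative sketch (not part of the patch): a module-parameter setter
 * written against cfs_kernel_param_arg_t and the kernel_param_lock() compat
 * above, assuming a kernel that provides struct kernel_param_ops and
 * module_param_cb().  'my_rate' and my_rate_set() are hypothetical.
 */
static unsigned long my_rate = 100;

static int my_rate_set(const char *val, cfs_kernel_param_arg_t *kp)
{
	unsigned long rate;
	int rc;

	rc = kstrtoul(val, 10, &rate);	/* falls back to simple_strtoul() */
	if (rc < 0)
		return rc;

	kernel_param_lock(THIS_MODULE);
	my_rate = rate;
	kernel_param_unlock(THIS_MODULE);

	return 0;
}

static const struct kernel_param_ops my_rate_ops = {
	.set = my_rate_set,
	.get = param_get_ulong,
};

module_param_cb(my_rate, &my_rate_ops, &my_rate, 0644);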
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_NET_H__ +#define __LIBCFS_LINUX_NET_H__ + +#ifdef HAVE_KERNEL_SETSOCKOPT + +#include + +static inline void tcp_sock_set_quickack(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof(opt)); +} + +#if !defined(HAVE_TCP_SOCK_SET_NODELAY) +static inline void tcp_sock_set_nodelay(struct sock *sk) +{ + int opt = 1; + struct socket *sock = sk->sk_socket; + + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_NODELAY */ + +#if !defined(HAVE_TCP_SOCK_SET_KEEPIDLE) +static inline int tcp_sock_set_keepidle(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_TCP_SOCK_SET_KEEPIDLE */ + +static inline int tcp_sock_set_keepintvl(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&opt, sizeof(opt)); +} + +static inline int tcp_sock_set_keepcnt(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_KERNEL_SETSOCKOPT */ + +#endif /* __LIBCFS_LINUX_NET_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h new file mode 100644 index 0000000000000..3934635dcd322 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -0,0 +1,251 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-time.h + * + * Implementation of portable time API for Linux (kernel and user-level). 
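/*
 * Illustrative sketch (not part of the patch): tuning a kernel TCP socket
 * with the wrappers above, which fall back to kernel_setsockopt() on kernels
 * that predate the native tcp_sock_set_*() helpers.  my_sock_tune() and the
 * chosen values are hypothetical.
 */
static void my_sock_tune(struct socket *sock)
{
	struct sock *sk = sock->sk;

	tcp_sock_set_nodelay(sk);		/* disable Nagle for RPC latency */
	tcp_sock_set_quickack(sk, 1);
	tcp_sock_set_keepidle(sk, 30);		/* seconds of idle before probes */
	tcp_sock_set_keepintvl(sk, 10);		/* seconds between probes */
	tcp_sock_set_keepcnt(sk, 3);		/* probes before giving up */
}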
+ * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_LINUX_LINUX_TIME_H__ +#define __LIBCFS_LINUX_LINUX_TIME_H__ + +/* Portable time API */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Generic kernel stuff + */ +#ifndef HAVE_TIMESPEC64 + +typedef __s64 time64_t; + +#if __BITS_PER_LONG == 64 + +# define timespec64 timespec + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + return ts; +} + +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts) +{ + return ts; +} + +#else +struct timespec64 { + time64_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + struct timespec64 ret; + + ret.tv_sec = ts.tv_sec; + ret.tv_nsec = ts.tv_nsec; + return ret; +} + +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + struct timespec ret; + + ret.tv_sec = (time_t)ts64.tv_sec; + ret.tv_nsec = ts64.tv_nsec; + return ret; +} +#endif /* __BITS_PER_LONG != 64 */ + +#endif /* HAVE_TIMESPEC64 */ + +#ifndef HAVE_NS_TO_TIMESPEC64 +static inline struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +#endif + +#ifndef HAVE_KTIME_ADD +# define ktime_add(lhs, rhs) ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; }) +#endif /* !HAVE_KTIME_ADD */ + +#ifndef HAVE_KTIME_AFTER +static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 > cmp2.tv64; +} +#endif /* !HAVE_KTIME_AFTER */ + +#ifndef HAVE_KTIME_BEFORE +static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 < cmp2.tv64; +} +#endif /* !HAVE_KTIME_BEFORE */ + +#ifndef HAVE_KTIME_COMPARE +static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) +{ + if (cmp1.tv64 < cmp2.tv64) + return -1; + if (cmp1.tv64 > cmp2.tv64) + return 1; + return 0; +} +#endif /* !HAVE_KTIME_COMPARE */ + +#ifndef HAVE_KTIME_GET_TS64 +void ktime_get_ts64(struct timespec64 *ts); +#endif /* HAVE_KTIME_GET_TS */ + +#ifndef HAVE_KTIME_GET_REAL_TS64 +void ktime_get_real_ts64(struct timespec64 *ts); +#endif /* HAVE_KTIME_GET_REAL_TS */ + +#ifndef HAVE_KTIME_GET_REAL_SECONDS +time64_t ktime_get_real_seconds(void); +#endif /* HAVE_KTIME_GET_REAL_SECONDS */ + +#ifndef HAVE_KTIME_GET_SECONDS +time64_t ktime_get_seconds(void); +#endif /* HAVE_KTIME_GET_SECONDS */ + +#ifdef NEED_KTIME_GET_NS +static inline u64 ktime_get_ns(void) +{ + return ktime_to_ns(ktime_get()); +} +#endif /* NEED_KTIME_GET_NS */ + +#ifdef NEED_KTIME_GET_REAL_NS +static inline u64 ktime_get_real_ns(void) +{ + return ktime_to_ns(ktime_get_real()); +} +#endif /* NEED_KTIME_GET_REAL_NS */ + +#ifndef HAVE_KTIME_MS_DELTA +static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) +{ + return ktime_to_ms(ktime_sub(later, earlier)); +} +#endif /* HAVE_KTIME_MS_DELTA */ + +#ifndef HAVE_KTIME_TO_TIMESPEC64 +static inline struct timespec64 ktime_to_timespec64(ktime_t kt) +{ + struct timespec ts = ns_to_timespec((kt).tv64); + + return timespec_to_timespec64(ts); +} +#endif /* HAVE_KTIME_TO_TIMESPEC64 */ + +#ifndef HAVE_TIMESPEC64_SUB +static inline struct timespec64 +timespec64_sub(struct timespec64 later, struct timespec64 earlier) +{ + struct 
timespec diff; + + diff = timespec_sub(timespec64_to_timespec(later), + timespec64_to_timespec(earlier)); + return timespec_to_timespec64(diff); +} +#endif + +#ifndef HAVE_TIMESPEC64_TO_KTIME +static inline ktime_t timespec64_to_ktime(struct timespec64 ts) +{ + return ktime_set(ts.tv_sec, ts.tv_nsec); +} +#endif + +static inline unsigned long cfs_time_seconds(time64_t seconds) +{ + return nsecs_to_jiffies(seconds * NSEC_PER_SEC); +} + +#ifdef HAVE_NEW_DEFINE_TIMER +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE struct timer_list * +# endif + +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function)) +#else +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE unsigned long +# endif + +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function), (_expires), (_data)) +#endif + +#ifdef HAVE_TIMER_SETUP +#define cfs_timer_cb_arg_t struct timer_list * +#define cfs_from_timer(var, callback_timer, timer_fieldname) \ + from_timer(var, callback_timer, timer_fieldname) +#define cfs_timer_setup(timer, callback, data, flags) \ + timer_setup((timer), (callback), (flags)) +#define cfs_timer_cb_arg(var, timer_fieldname) (&(var)->timer_fieldname) +#else +#define cfs_timer_cb_arg_t unsigned long +#define cfs_from_timer(var, data, timer_fieldname) (typeof(var))(data) +#define cfs_timer_setup(timer, callback, data, flags) \ + setup_timer((timer), (callback), (data)) +#define cfs_timer_cb_arg(var, timer_fieldname) (cfs_timer_cb_arg_t)(var) +#endif + +#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h new file mode 100644 index 0000000000000..fd154ba0f049f --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h @@ -0,0 +1,568 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LIBCFS_LINUX_WAIT_BIT_H +#define __LIBCFS_LINUX_WAIT_BIT_H + +/* Make sure we can see if we have TASK_NOLOAD */ +#include +/* + * Linux wait-bit related types and methods: + */ +#ifdef HAVE_WAIT_BIT_HEADER_H +#include +#endif +#include + +#ifndef HAVE_WAIT_QUEUE_ENTRY +#define wait_queue_entry_t wait_queue_t +#endif + +#ifndef HAVE_WAIT_BIT_HEADER_H +struct wait_bit_queue_entry { + struct wait_bit_key key; + wait_queue_entry_t wq_entry; +}; + +#define ___wait_is_interruptible(state) \ + (!__builtin_constant_p(state) || \ + state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ + +#endif /* ! HAVE_WAIT_BIT_HEADER_H */ + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT +extern long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state); +#endif + +/* ___wait_cond_timeout changed number of args in v3.12-rc1-78-g35a2af94c7ce + * so let's define our own ___wait_cond_timeout1 + */ + +#define ___wait_cond_timeout1(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ +}) + +#ifndef HAVE_CLEAR_AND_WAKE_UP_BIT +/** + * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit + * + * @bit: the bit of the word being waited on + * @word: the word being waited on, a kernel virtual address + * + * You can use this helper if bitflags are manipulated atomically rather than + * non-atomically under a lock. + */ +static inline void clear_and_wake_up_bit(int bit, void *word) +{ + clear_bit_unlock(bit, word); + /* See wake_up_bit() for which memory barrier you need to use. 
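/*
 * Illustrative sketch (not part of the patch): a periodic timer written once
 * against the cfs_timer_*() compat above so it builds with both timer_setup()
 * and setup_timer() kernels.  'struct my_service' and its fields are
 * hypothetical.
 */
struct my_service {
	struct timer_list	ms_timer;
	unsigned long		ms_period;	/* in jiffies */
};

static void my_service_tick(cfs_timer_cb_arg_t data)
{
	struct my_service *svc = cfs_from_timer(svc, data, ms_timer);

	/* ... periodic work ... */
	mod_timer(&svc->ms_timer, jiffies + svc->ms_period);
}

static void my_service_start(struct my_service *svc)
{
	svc->ms_period = cfs_time_seconds(5);
	cfs_timer_setup(&svc->ms_timer, my_service_tick,
			(unsigned long)svc, 0);
	mod_timer(&svc->ms_timer, jiffies + svc->ms_period);
}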
*/ + smp_mb__after_atomic(); + wake_up_bit(word, bit); +} +#endif /* ! HAVE_CLEAR_AND_WAKE_UP_BIT */ + +#ifndef HAVE_WAIT_VAR_EVENT +extern void __init wait_bit_init(void); +extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, + void *var, int flags); +extern void wake_up_var(void *var); +extern wait_queue_head_t *__var_waitqueue(void *p); + +#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_head_t *__wq_head = __var_waitqueue(var); \ + struct wait_bit_queue_entry __wbq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_var_entry(&__wbq_entry, var, \ + exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(__wq_head, \ + &__wbq_entry.wq_entry, \ + state); \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(__wq_head, &__wbq_entry.wq_entry); \ +__out: __ret; \ +}) + +#define __wait_var_event(var, condition) \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) + +#define wait_var_event(var, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_var_event(var, condition); \ +} while (0) + +#define __wait_var_event_killable(var, condition) \ + ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ + schedule()) + +#define wait_var_event_killable(var, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_var_event_killable(var, condition); \ + __ret; \ +}) + +#define __wait_var_event_timeout(var, condition, timeout) \ + ___wait_var_event(var, ___wait_cond_timeout1(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_var_event_timeout(var, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_var_event_timeout(var, condition, timeout); \ + __ret; \ +}) +#endif /* ! HAVE_WAIT_VAR_EVENT */ + +/* + * prepare_to_wait_event() does not support an exclusive + * lifo wait. + * However it will not relink the wait_queue_entry if + * it is already linked. So we link to the head of the + * queue here, and it will stay there. + */ +static inline void prepare_to_wait_exclusive_head( + wait_queue_head_t *waitq, wait_queue_entry_t *link) +{ + unsigned long flags; + + spin_lock_irqsave(&(waitq->lock), flags); +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + if (list_empty(&link->entry)) +#else + if (list_empty(&link->task_list)) +#endif + __add_wait_queue_exclusive(waitq, link); + spin_unlock_irqrestore(&((waitq)->lock), flags); +} + +#ifndef ___wait_event +/* + * The below macro ___wait_event() has an explicit shadow of the __ret + * variable when used from the wait_event_*() macros. + * + * This is so that both can use the ___wait_cond_timeout1() construct + * to wrap the condition. + * + * The type inconsistency of the wait_event_*() __ret variable is also + * on purpose; we use long where we can return timeout values and int + * otherwise. 
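/*
 * Illustrative sketch (not part of the patch): pairing the wait_var_event()
 * emulation above with wake_up_var().  'struct my_res' and its refcount are
 * hypothetical.
 */
struct my_res {
	atomic_t	mr_refcount;
};

static void my_res_put(struct my_res *res)
{
	if (atomic_dec_and_test(&res->mr_refcount))
		wake_up_var(&res->mr_refcount);
}

static void my_res_wait_unused(struct my_res *res)
{
	/* sleep until the last reference has been dropped */
	wait_var_event(&res->mr_refcount,
		       atomic_read(&res->mr_refcount) == 0);
}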
+ */ + +#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_entry_ __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + if (exclusive) \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE \ + for (;;) { \ + long __int = prepare_to_wait_event(&wq_head, \ + &__wq_entry, state); \ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ +__out: __ret; \ +}) +#endif + +#ifndef TASK_NOLOAD + +#define ___wait_event_idle(wq_head, condition, exclusive, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __blocked; \ + \ + __blocked = cfs_block_sigsinv(0); \ + init_wait(&__wq_entry); \ + if (exclusive) \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_event(&wq_head, \ + &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + cfs_restore_sigs(__blocked); \ + __ret; \ +}) + +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_idle(wq_head, condition, 0, 0, schedule());\ +} while (0) + +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_idle(wq_head, condition, 1, 0, schedule());\ +} while (0) + +#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)\ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 1, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout( \ + wq_head, condition, timeout); \ + __ret; \ +}) + +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) + +#else /* TASK_IDLE */ +#ifndef wait_event_idle +/** + * 
wait_event_idle - wait for a condition without contributing to system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive +/** + * wait_event_idle_exclusive - wait for a condition without contributing to + * system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive_timeout +/** + * wait_event_idle_exclusive_timeout - sleep without load until a condition + * becomes true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
+ */ +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) +#endif +#ifndef wait_event_idle_exclusive_timeout_cmd +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) +#endif + +#ifndef wait_event_idle_timeout + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_idle_timeout - sleep without load until a condition becomes + * true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
+ */ +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) +#endif +#endif /* TASK_IDLE */ + +/* ___wait_event_lifo is used for lifo exclusive 'idle' waits */ +#ifdef TASK_NOLOAD + +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, TASK_IDLE);\ + \ + if (condition) \ + break; \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#else +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __blocked; \ + \ + __blocked = cfs_block_sigsinv(0); \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* See justification in ___wait_event_idle */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + cfs_restore_sigs(__blocked); \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#endif + +#define wait_event_idle_exclusive_lifo(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_lifo(wq_head, condition, 0, schedule()); \ +} while (0) + +#define __wait_event_idle_lifo_timeout(wq_head, condition, timeout) \ + ___wait_event_lifo(wq_head, ___wait_cond_timeout1(condition), \ + timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_lifo_timeout(wq_head, condition, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_lifo_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) + +/* l_wait_event_abortable() is a bit like wait_event_killable() + * except there is a fixed set of signals which will abort: + * LUSTRE_FATAL_SIGS + */ +#define LUSTRE_FATAL_SIGS \ + (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \ + sigmask(SIGQUIT) | sigmask(SIGALRM)) + +#define l_wait_event_abortable(wq, condition) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_timeout(wq, condition, timeout) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_timeout(wq, condition, timeout);\ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_exclusive(wq, condition) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + 
sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_exclusive(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#endif /* __LICBFS_LINUX_WAIT_BIT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h new file mode 100644 index 0000000000000..45818dddedd94 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef _LINUX_HASH_H +#define _LINUX_HASH_H +/* Fast hashing routine for ints, longs and pointers. + (C) 2002 Nadia Yvette Chambers, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ + +#include + +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL + +#if __BITS_PER_LONG == 32 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 +#define hash_long(val, bits) hash_32(val, bits) +#elif __BITS_PER_LONG == 64 +#define hash_long(val, bits) hash_64(val, bits) +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 +#else +#error Wordsize not 32 or 64 +#endif + +static __always_inline __u64 hash_64(__u64 val, unsigned int bits) +{ + __u64 hash = val; + + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + __u64 n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; + + /* High bits are more random, so use them. */ + return hash >> (64 - bits); +} + +static inline __u32 hash_32(__u32 val, unsigned int bits) +{ + /* On some cpus multiply is faster, on others gcc will do shifts */ + __u32 hash = val * GOLDEN_RATIO_PRIME_32; + + /* High bits are more random, so use them. 
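/*
 * Illustrative sketch (not part of the patch): a service thread and a
 * syscall path using the idle/abortable wait macros above.  The queue,
 * my_queue_ready(), my_process_one() and the request fields are
 * hypothetical.
 */
static int my_thread_main(void *arg)
{
	struct my_queue *q = arg;

	while (!kthread_should_stop()) {
		/* sleep without contributing to the load average, but wake
		 * at least once per second to recheck for shutdown
		 */
		wait_event_idle_timeout(q->mq_waitq,
					my_queue_ready(q) ||
					kthread_should_stop(),
					cfs_time_seconds(1));
		if (my_queue_ready(q))
			my_process_one(q);
	}
	return 0;
}

/* In a syscall path, let only LUSTRE_FATAL_SIGS abort the sleep: */
static int my_wait_for_reply(struct my_request *req)
{
	int rc = l_wait_event_abortable(req->rq_waitq, req->rq_replied);

	return rc < 0 ? -EINTR : 0;
}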
*/ + return hash >> (32 - bits); +} + +static inline unsigned long hash_ptr(const void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr, bits); +} + +static inline __u32 hash32_ptr(const void *ptr) +{ + unsigned long val = (unsigned long)ptr; + +#if __BITS_PER_LONG == 64 + val ^= (val >> 32); +#endif + return (__u32)val; +} + +#endif /* _LINUX_HASH_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h new file mode 100644 index 0000000000000..a42e0c5fe4568 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h @@ -0,0 +1,68 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/util/ioctl.h + * + * Utility functions for calling ioctls. + * + */ +#ifndef _LIBCFS_IOCTL_H_ +#define _LIBCFS_IOCTL_H_ + +#include +#include + +/* Sparse annotation. */ +#define __user + +#include + +#define LIBCFS_IOC_INIT(data) \ +do { \ + memset(&(data), 0, sizeof(data)); \ + (data).ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION; \ + (data).ioc_hdr.ioc_len = sizeof(data); \ +} while (0) + +#define LIBCFS_IOC_INIT_V2(data, hdr) \ +do { \ + memset(&(data), 0, sizeof(data)); \ + (data).hdr.ioc_version = LIBCFS_IOCTL_VERSION2; \ + (data).hdr.ioc_len = sizeof(data); \ +} while (0) + +/* FIXME - rename these to libcfs_ */ +int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, int max); +void libcfs_ioctl_unpack(struct libcfs_ioctl_data *data, char *pbuf); +int register_ioc_dev(int dev_id, const char *dev_name); +void unregister_ioc_dev(int dev_id); +int l_ioctl(int dev_id, unsigned int opc, void *buf); +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h new file mode 100644 index 0000000000000..ef69efed6cf1e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h @@ -0,0 +1,499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
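/*
 * Illustrative sketch (not part of the patch): how a userspace tool might
 * use the ioctl helpers declared above.  MY_DEV_ID, MY_DEV_NAME and
 * MY_IOC_OPC are hypothetical placeholders for a registered character
 * device and one of its IOC_LIBCFS_* opcodes.
 */
static int my_send_ioctl(void)
{
	struct libcfs_ioctl_data data;
	int rc;

	rc = register_ioc_dev(MY_DEV_ID, MY_DEV_NAME);
	if (rc < 0)
		return rc;

	LIBCFS_IOC_INIT(data);		/* zeroes data, fills ioc_hdr */
	rc = l_ioctl(MY_DEV_ID, MY_IOC_OPC, &data);

	unregister_ioc_dev(MY_DEV_ID);
	return rc;
}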
See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_UTIL_LIST_H__ +#define __LIBCFS_UTIL_LIST_H__ + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +#define prefetch(a) ((void)a) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/** + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * Insert an entry at the start of a list. + * \param new new entry to be inserted + * \param head list to add it to + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * Insert an entry at the end of a list. + * \param new new entry to be inserted + * \param head list to add it to + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, + struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * Remove an entry from the list it is currently in. + * \param entry the entry to remove + * Note: list_empty(entry) does not return true after this, the entry is in an + * undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * Remove an entry from the list it is currently in and reinitialize it. + * \param entry the entry to remove. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * Remove an entry from the list it is currently in and insert it at the start + * of another list. + * \param list the entry to move + * \param head the list to move it to + */ +static inline void list_move(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * Remove an entry from the list it is currently in and insert it at the end of + * another list. 
+ * \param list the entry to move + * \param head the list to move it to + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * Test whether a list is empty + * \param head the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +/** + * Test whether a list is empty and not being modified + * \param head the list to test + * + * Tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * Join two lists + * \param list the new list to add. + * \param head the place to add it in the first list. + * + * The contents of \a list are added at the start of \a head. \a list is in an + * undefined state on return. + */ +static inline void list_splice(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +static inline void list_splice_tail(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev); +} + +/** + * Join two lists and reinitialise the emptied list. + * \param list the new list to add. + * \param head the place to add it in the first list. + * + * The contents of \a list are added at the start of \a head. \a list is empty + * on return. + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * Get the container of a list + * \param ptr the embedded list. + * \param type the type of the struct this is embedded in. + * \param member the member name of the list within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +/** + * Iterate over a list + * \param pos the iterator + * \param head the list to iterate over + * + * Behaviour is undefined if \a pos is removed from the list in the body of the + * loop. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + +/** + * Iterate over a list safely + * \param pos the iterator + * \param n temporary storage + * \param head the list to iterate over + * + * This is safe to use if \a pos could be removed from the list in the body of + * the loop. 
+ */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * Iterate over a list continuing after existing point + * \param pos the type * to use as a loop counter + * \param head the list head + * \param member the name of the list_struct within the struct + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * \defgroup hlist Hash List + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is too + * wasteful. You lose the ability to access the tail in O(1). + * @{ + */ + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +/* @} */ + +/* + * "NULL" might not be defined at this point + */ +#ifdef NULL +#define NULL_P NULL +#else +#define NULL_P ((void *)0) +#endif + +/** + * \addtogroup hlist + * @{ + */ + +#define HLIST_HEAD_INIT { NULL_P } +#define HLIST_HEAD(name) struct hlist_head name = { NULL_P } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL_P) +#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL_P, (ptr)->pprev = NULL_P) + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (n->pprev) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, + struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, + struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if(next->next) + next->next->pprev = &next->next; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos && (prefetch(pos->next), 1); \ + pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && (n = pos->next, 1); \ + pos = n) + +/** + * Iterate over an hlist of given type + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param head the head for your list. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist continuing after existing point + * \param tpos the type * to use as a loop counter. 
+ * \param pos the &struct hlist_node to use as a loop counter. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(tpos, pos, member) \ + for (pos = (pos)->next; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist continuing from an existing point + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_from(tpos, pos, member) \ + for (; pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist of given type safe against removal of list entry + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param n another &struct hlist_node to use as temporary storage + * \param head the head for your list. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + for (pos = (head)->first; \ + pos && ({ n = pos->next; 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = n) + +/* @} */ + +/** + * Iterate over a list in reverse order + * \param pos the &struct list_head to use as a loop counter. + * \param head the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + +/** + * Iterate over a list of given type + * \param pos the type * to use as a loop counter. + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) + +/** + * Iterate backwards over a list of given type. + * \param pos the type * to use as a loop counter. + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + prefetch(pos->member.prev), &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * Iterate over a list of given type safe against removal of list entry + * \param pos the type * to use as a loop counter. + * \param n another type * to use as temporary storage + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * Iterate backwards over a list of given type safely against removal of entry + * \param pos the type * to use as a loop counter. + * \param n another type * to use as temporary storage + * \param head the head for your list. + * \param member the name of the list_struct within the struct. 
+ */ +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +#endif /* __LIBCFS_UTIL_LIST_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h new file mode 100644 index 0000000000000..2fd1e36b07354 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h @@ -0,0 +1,40 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of the + * License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * LGPL HEADER END + * + * Copyright (c) 2015, James Simmons + * + * Author: + * James Simmons + */ +#ifndef _LIBCFS_UTIL_PARAM_H_ +#define _LIBCFS_UTIL_PARAM_H_ + +#include +#include + +static inline void cfs_free_param_data(glob_t *paths) +{ + globfree(paths); +} + +int cfs_get_param_paths(glob_t *paths, const char *pattern, ...) + __attribute__((__format__(__printf__, 2, 3))); + +#endif /* _LIBCFS_UTIL_PARAM_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h new file mode 100644 index 0000000000000..7bae8393a1916 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h @@ -0,0 +1,115 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/util/parser.h + * + * A command line parser. 
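+ *
+ * Typical use (illustrative sketch only; jt_ping(), the prompt string and
+ * the NULL-terminated command table are placeholders, not taken from this
+ * header):
+ *
+ *	static int jt_ping(int argc, char **argv);
+ *
+ *	static command_t cmdlist[] = {
+ *		{ "ping", jt_ping, NULL, "ping <target>" },
+ *		{ 0, 0, 0, NULL }
+ *	};
+ *
+ *	Parser_init("tool > ", cmdlist);
+ *	Parser_commands();
+ *	Parser_exit(0, NULL);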
+ * + */ + +#ifndef _PARSER_H_ +#define _PARSER_H_ + +#define HISTORY 100 /* Don't let history grow unbounded */ +#define MAXARGS 512 + +#define CMD_COMPLETE 0 +#define CMD_INCOMPLETE 1 +#define CMD_NONE 2 +#define CMD_AMBIG 3 +#define CMD_HELP 4 + +typedef struct parser_cmd { + char *pc_name; + int (* pc_func)(int, char **); + struct parser_cmd * pc_sub_cmd; + char *pc_help; +} command_t; + +typedef struct argcmd { + char *ac_name; + int (*ac_func)(int, char **); + char *ac_help; +} argcmd_t; + +typedef struct network { + char *type; + char *server; + int port; +} network_t; + +int Parser_quit(int argc, char **argv); +int Parser_version(int argc, char **argv); +void Parser_init(char *, command_t *); /* Set prompt and load command list */ +int Parser_commands(void); /* Start the command parser */ +void Parser_qhelp(int, char **); /* Quick help routine */ +int Parser_help(int, char **); /* Detailed help routine */ +void Parser_ignore_errors(int ignore); /* Set the ignore errors flag */ +void Parser_printhelp(char *); /* Detailed help routine */ +void Parser_exit(int, char **); /* Shuts down command parser */ +int Parser_execarg(int argc, char **argv, command_t cmds[]); +int execute_line(char * line); +int Parser_list_commands(const command_t *cmdlist, char *buffer, + size_t buf_size, const char *parent_cmd, + int col_start, int col_num); + +/* Converts a string to an integer */ +int Parser_int(char *, int *); + +/* Prompts for a string, with default values and a maximum length */ +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len); + +/* Prompts for an integer, with minimum, maximum and default values and base */ +int Parser_getint(const char *prompt, long min, long max, long deft, + int base); + +/* Prompts for a yes/no, with default */ +int Parser_getbool(const char *prompt, int deft); + +/* Extracts an integer from a string, or prompts if it cannot get one */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base); + +/* Extracts a word from the input, or propmts if it cannot get one */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len); + +/* Extracts an integer from a string with a base */ +int Parser_arg2int(const char *inp, long *result, int base); + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(unsigned long *sizep, char *str); + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool(int *b, char *str); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h new file mode 100644 index 0000000000000..065829b7161d6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h @@ -0,0 +1,90 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_UTIL_STRING_H__ +#define __LIBCFS_UTIL_STRING_H__ + +#include + +#include +#include + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. + */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +int cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_print(char *buffer, int count, + struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +void cfs_expr_list_free_list(struct list_head *list); +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_range_gen(__u32 *ip_list, int count, + struct list_head *ip_addr_expr); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_abs_path(const char *request_path, char **resolved_path); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile new file mode 100644 index 0000000000000..04b9fafaae920 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile @@ -0,0 +1,18 @@ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs.o + +libcfs-linux-objs := linux-tracefile.o linux-debug.o linux-prim.o +libcfs-linux-objs += linux-curproc.o linux-module.o linux-hash.o linux-wait.o +libcfs-linux-objs += linux-crypto.o linux-crypto-adler.o +libcfs-linux-objs += linux-crypto-crc32.o + +libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) + +libcfs-all-objs := debug.o fail.o module.o tracefile.o watchdog.o +libcfs-all-objs += libcfs_string.o hash.o prng.o workitem.o +libcfs-all-objs += libcfs_cpu.o libcfs_mem.o libcfs_lock.o heap.o + +libcfs-y += $(libcfs-linux-objs) $(libcfs-all-objs) + +ccflags-y += -I$(src) + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c new file mode 100644 index 0000000000000..65e5b4669d0d2 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c @@ -0,0 +1,534 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/debug.c + * + * Author: Phil Schwan + * + */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include "tracefile.h" + +static char debug_file_name[1024]; + +unsigned int libcfs_subsystem_debug = ~0; +module_param(libcfs_subsystem_debug, int, 0644); +MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask"); +EXPORT_SYMBOL(libcfs_subsystem_debug); + +unsigned int libcfs_debug = (D_CANTMASK | + D_NETERROR | D_HA | D_CONFIG | D_IOCTL | D_LFSCK); +module_param(libcfs_debug, int, 0644); +MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); +EXPORT_SYMBOL(libcfs_debug); + +static int libcfs_param_debug_mb_set(const char *val, + cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int num; + + rc = kstrtouint(val, 0, &num); + if (rc < 0) + return rc; + +/* + * RHEL6 does not support any kind of locking so we have to provide + * our own + */ +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_lock(THIS_MODULE); +#endif + if (!*((unsigned int *)kp->arg)) { + *((unsigned int *)kp->arg) = num; + +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_unlock(THIS_MODULE); +#endif + return 0; + } + + rc = cfs_trace_set_debug_mb(num); + + if (!rc) + *((unsigned int *)kp->arg) = cfs_trace_get_debug_mb(); + +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_unlock(THIS_MODULE); +#endif + return rc; +} + +/* + * While debug_mb setting look like unsigned int, in fact + * it needs quite a bunch of extra processing, so we define special + * debug_mb parameter type with corresponding methods to handle this case + */ +static struct kernel_param_ops param_ops_debug_mb = { + .set = libcfs_param_debug_mb_set, + .get = param_get_uint, +}; + +#define param_check_debug_mb(name, p) \ + __param_check(name, p, unsigned int) + +static unsigned int libcfs_debug_mb; +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_debug_mb, debug_mb, 0644); +#else +module_param_call(libcfs_debug_mb, libcfs_param_debug_mb_set, param_get_uint, + ¶m_ops_debug_mb, 0644); +#endif +MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); + +unsigned int libcfs_printk = D_CANTMASK; +module_param(libcfs_printk, uint, 0644); +MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask"); + +unsigned int libcfs_console_ratelimit = 1; +module_param(libcfs_console_ratelimit, uint, 0644); +MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); + +static int param_set_delay_minmax(const char *val, + cfs_kernel_param_arg_t *kp, + long min, long max) +{ + long d; + int sec; + int rc; + + rc = 
kstrtoint(val, 0, &sec); + if (rc) + return -EINVAL; + + /* The sysfs setting is in centiseconds */ + d = cfs_time_seconds(sec) / 100; + if (d < min || d > max) + return -EINVAL; + + *((unsigned int *)kp->arg) = d; + + return 0; +} + +static int param_get_delay(char *buffer, cfs_kernel_param_arg_t *kp) +{ + unsigned int d = *(unsigned int *)kp->arg; + + return sprintf(buffer, "%lu", jiffies_to_msecs(d * 10) / MSEC_PER_SEC); +} + +unsigned int libcfs_console_max_delay; +unsigned int libcfs_console_min_delay; + +static int param_set_console_max_delay(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_delay_minmax(val, kp, + libcfs_console_min_delay, INT_MAX); +} + +static struct kernel_param_ops param_ops_console_max_delay = { + .set = param_set_console_max_delay, + .get = param_get_delay, +}; + +#define param_check_console_max_delay(name, p) \ + __param_check(name, p, unsigned int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_max_delay, console_max_delay, 0644); +#else +module_param_call(libcfs_console_max_delay, param_set_console_max_delay, + param_get_delay, ¶m_ops_console_max_delay, 0644); +#endif +MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); + +static int param_set_console_min_delay(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_delay_minmax(val, kp, + 1, libcfs_console_max_delay); +} + +static struct kernel_param_ops param_ops_console_min_delay = { + .set = param_set_console_min_delay, + .get = param_get_delay, +}; + +#define param_check_console_min_delay(name, p) \ + __param_check(name, p, unsigned int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_min_delay, console_min_delay, 0644); +#else +module_param_call(libcfs_console_min_delay, param_set_console_min_delay, + param_get_delay, ¶m_ops_console_min_delay, 0644); +#endif +MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); + +#ifndef HAVE_PARAM_SET_UINT_MINMAX +static int param_set_uint_minmax(const char *val, + cfs_kernel_param_arg_t *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + + ret = kstrtouint(val, 0, &num); + if (ret < 0 || num < min || num > max) + return -EINVAL; + + *((unsigned int *)kp->arg) = num; + return 0; +} +#endif + +static int param_set_uintpos(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_uint_minmax(val, kp, 1, -1); +} + +static struct kernel_param_ops param_ops_uintpos = { + .set = param_set_uintpos, + .get = param_get_uint, +}; + +#define param_check_uintpos(name, p) \ + __param_check(name, p, unsigned int) + +unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_backoff, uintpos, 0644); +#else +module_param_call(libcfs_console_backoff, param_set_uintpos, param_get_uint, + ¶m_ops_uintpos, 0644); +#endif +MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); + +unsigned int libcfs_debug_binary = 1; + +unsigned int libcfs_stack = 3 * THREAD_SIZE / 4; +EXPORT_SYMBOL(libcfs_stack); + +unsigned int libcfs_catastrophe; +EXPORT_SYMBOL(libcfs_catastrophe); + +unsigned int libcfs_watchdog_ratelimit = 300; + +unsigned int libcfs_panic_on_lbug = 1; +module_param(libcfs_panic_on_lbug, uint, 0644); +MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG"); + +atomic_t libcfs_kmemory = ATOMIC_INIT(0); +EXPORT_SYMBOL(libcfs_kmemory); + +static wait_queue_head_t 
debug_ctlwq; + +char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; +EXPORT_SYMBOL(libcfs_debug_file_path_arr); + +/* We need to pass a pointer here, but elsewhere this must be a const */ +static char *libcfs_debug_file_path = LIBCFS_DEBUG_FILE_PATH_DEFAULT; +module_param(libcfs_debug_file_path, charp, 0644); +MODULE_PARM_DESC(libcfs_debug_file_path, + "Path for dumping debug logs, set 'NONE' to prevent log dumping"); + +int libcfs_panic_in_progress; + +/* + * libcfs_debug_token2mask() expects the returned + * string in lower-case + */ +static const char *libcfs_debug_subsys2str(int subsys) +{ + static const char *libcfs_debug_subsystems[] = LIBCFS_DEBUG_SUBSYS_NAMES; + + if (subsys >= ARRAY_SIZE(libcfs_debug_subsystems)) + return NULL; + + return libcfs_debug_subsystems[subsys]; +} + +/* + * libcfs_debug_token2mask() expects the returned + * string in lower-case + */ +static const char *libcfs_debug_dbg2str(int debug) +{ + static const char *libcfs_debug_masks[] = LIBCFS_DEBUG_MASKS_NAMES; + + if (debug >= ARRAY_SIZE(libcfs_debug_masks)) + return NULL; + + return libcfs_debug_masks[debug]; +} + +int +libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & (1 << i)) == 0) + continue; + + token = fn(i); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; +} + +int +libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + + t = sscanf(str, "%i%n", &m, &matched); + if (t >= 1 && matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the " + "mask - this will be deprecated in a future " + "release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 
0 : D_CANTMASK, + 0xffffffff); +} + +/** + * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages() + */ +void libcfs_debug_dumplog_internal(void *arg) +{ + static time64_t last_dump_time; + time64_t current_time; + void *journal_info; + + journal_info = current->journal_info; + current->journal_info = NULL; + current_time = ktime_get_real_seconds(); + + if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0 && + current_time > last_dump_time) { + last_dump_time = current_time; + snprintf(debug_file_name, sizeof(debug_file_name) - 1, + "%s.%lld.%ld", libcfs_debug_file_path_arr, + (s64)current_time, (uintptr_t)arg); + printk(KERN_ALERT "LustreError: dumping log to %s\n", + debug_file_name); + cfs_tracefile_dump_all_pages(debug_file_name); + libcfs_run_debug_log_upcall(debug_file_name); + } + current->journal_info = journal_info; +} + +static int libcfs_debug_dumplog_thread(void *arg) +{ + libcfs_debug_dumplog_internal(arg); + wake_up(&debug_ctlwq); + return 0; +} + +void libcfs_debug_dumplog(void) +{ + wait_queue_entry_t wait; + struct task_struct *dumper; + + ENTRY; + + /* + * we're being careful to ensure that the kernel thread is + * able to set our state to running as it exits before we + * get to schedule() + */ + init_waitqueue_entry(&wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&debug_ctlwq, &wait); + + dumper = kthread_run(libcfs_debug_dumplog_thread, + (void *)(long)current_pid(), + "libcfs_debug_dumper"); + if (IS_ERR(dumper)) + printk(KERN_ERR "LustreError: cannot start log dump thread:" + " %ld\n", PTR_ERR(dumper)); + else + schedule(); + + /* be sure to teardown if cfs_create_thread() failed */ + remove_wait_queue(&debug_ctlwq, &wait); + set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL(libcfs_debug_dumplog); + +int libcfs_debug_init(unsigned long bufsize) +{ + int rc = 0; + unsigned int max = libcfs_debug_mb; + + init_waitqueue_head(&debug_ctlwq); + + if (libcfs_console_max_delay <= 0 || /* not set by user or */ + libcfs_console_min_delay <= 0 || /* set to invalid values */ + libcfs_console_min_delay >= libcfs_console_max_delay) { + libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY; + libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY; + } + + if (libcfs_debug_file_path != NULL) { + strlcpy(libcfs_debug_file_path_arr, + libcfs_debug_file_path, + sizeof(libcfs_debug_file_path_arr)); + } + + /* + * If libcfs_debug_mb is set to an invalid value or uninitialized + * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES + */ + if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { + max = TCD_MAX_PAGES; + } else { + max = (max / num_possible_cpus()); + max = (max << (20 - PAGE_SHIFT)); + } + + rc = cfs_tracefile_init(max); + if (rc) + return rc; + + libcfs_register_panic_notifier(); + kernel_param_lock(THIS_MODULE); + libcfs_debug_mb = cfs_trace_get_debug_mb(); + kernel_param_unlock(THIS_MODULE); + return rc; +} + +int libcfs_debug_cleanup(void) +{ + libcfs_unregister_panic_notifier(); + kernel_param_lock(THIS_MODULE); + cfs_tracefile_exit(); + kernel_param_unlock(THIS_MODULE); + return 0; +} + +int libcfs_debug_clear_buffer(void) +{ + cfs_trace_flush_pages(); + return 0; +} + +/* + * Debug markers, although printed by S_LNET + * should not be be marked as such. 
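+ *
+ * (Aside, for illustration only: the buffer sizing in libcfs_debug_init()
+ * above means that with libcfs_debug_mb=400, 8 possible CPUs and 4KiB
+ * pages, each CPU gets (400 / 8) << (20 - 12) = 12800 trace pages, i.e.
+ * 50MiB per CPU and 400MiB in total.)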
+ */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int libcfs_debug_mark_buffer(const char *text) +{ + CDEBUG(D_TRACE, "**************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE, "**************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_LNET + +long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc) +{ + libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", + rc, rc, rc); + return rc; +} +EXPORT_SYMBOL(libcfs_log_return); + +void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata, const char *label, + long rc) +{ + libcfs_debug_msg(msgdata, "Process leaving via %s (rc=%lu : %ld" + " : %#lx)\n", label, rc, rc, rc); +} +EXPORT_SYMBOL(libcfs_log_goto); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/fail.c b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c new file mode 100644 index 0000000000000..13d31ab16fdf4 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c @@ -0,0 +1,144 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. + */ + +#include + +unsigned long cfs_fail_loc = 0; +unsigned int cfs_fail_val = 0; +int cfs_fail_err; +DECLARE_WAIT_QUEUE_HEAD(cfs_race_waitq); +int cfs_race_state; + +EXPORT_SYMBOL(cfs_fail_loc); +EXPORT_SYMBOL(cfs_fail_val); +EXPORT_SYMBOL(cfs_fail_err); +EXPORT_SYMBOL(cfs_race_waitq); +EXPORT_SYMBOL(cfs_race_state); + +int __cfs_fail_check_set(__u32 id, __u32 value, int set) +{ + static atomic_t cfs_fail_count = ATOMIC_INIT(0); + + LASSERT(!(id & CFS_FAIL_ONCE)); + + if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) == + (CFS_FAILED | CFS_FAIL_ONCE)) { + atomic_set(&cfs_fail_count, 0); /* paranoia */ + return 0; + } + + /* Fail 1/cfs_fail_val times */ + if (cfs_fail_loc & CFS_FAIL_RAND) { + if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0) + return 0; + } + + /* Skip the first cfs_fail_val, then fail */ + if (cfs_fail_loc & CFS_FAIL_SKIP) { + if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val) + return 0; + } + + /* check cfs_fail_val... 
*/ + if (set == CFS_FAIL_LOC_VALUE) { + if (cfs_fail_val != -1 && cfs_fail_val != value) + return 0; + } + + /* Fail cfs_fail_val times, overridden by FAIL_ONCE */ + if (cfs_fail_loc & CFS_FAIL_SOME && + (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) { + int count = atomic_inc_return(&cfs_fail_count); + + if (count >= cfs_fail_val) { + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + atomic_set(&cfs_fail_count, 0); + /* we are lost race to increase */ + if (count > cfs_fail_val) + return 0; + } + } + + /* Take into account the current call for FAIL_ONCE for ORSET only, + * as RESET is a new fail_loc, it does not change the current call */ + if ((set == CFS_FAIL_LOC_ORSET) && (value & CFS_FAIL_ONCE)) + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + /* Lost race to set CFS_FAILED_BIT. */ + if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { + /* If CFS_FAIL_ONCE is valid, only one process can fail, + * otherwise multi-process can fail at the same time. */ + if (cfs_fail_loc & CFS_FAIL_ONCE) + return 0; + } + + switch (set) { + case CFS_FAIL_LOC_NOSET: + case CFS_FAIL_LOC_VALUE: + break; + case CFS_FAIL_LOC_ORSET: + cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE); + break; + case CFS_FAIL_LOC_RESET: + cfs_fail_loc = value; + atomic_set(&cfs_fail_count, 0); + break; + default: + LASSERTF(0, "called with bad set %u\n", set); + break; + } + + return 1; +} +EXPORT_SYMBOL(__cfs_fail_check_set); + +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + ktime_t till = ktime_add_ms(ktime_get(), ms); + int ret = 0; + + ret = __cfs_fail_check_set(id, value, set); + if (ret && likely(ms > 0)) { + CERROR("cfs_fail_timeout id %x sleeping for %dms\n", id, ms); + while (ktime_before(ktime_get(), till)) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(1000) / 10); + set_current_state(TASK_RUNNING); + if (!cfs_fail_loc) { + CERROR("cfs_fail_timeout interrupted\n"); + break; + } + } + if (cfs_fail_loc) + CERROR("cfs_fail_timeout id %x awake\n", id); + } + return ret; +} +EXPORT_SYMBOL(__cfs_fail_timeout_set); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/hash.c new file mode 100644 index 0000000000000..228cf0b022a58 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/hash.c @@ -0,0 +1,2123 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/hash.c + * + * Implement a hash class for hash process in lustre system. 
+ * + * Author: YuZhangyong + * + * 2008-08-15: Brian Behlendorf + * - Simplified API and improved documentation + * - Added per-hash feature flags: + * * CFS_HASH_DEBUG additional validation + * * CFS_HASH_REHASH dynamic rehashing + * - Added per-hash statistics + * - General performance enhancements + * + * 2009-07-31: Liang Zhen + * - move all stuff to libcfs + * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH + * - ignore hs_rwlock if without CFS_HASH_REHASH setting + * - buckets are allocated one by one(instead of contiguous memory), + * to avoid unnecessary cacheline conflict + * + * 2010-03-01: Liang Zhen + * - "bucket" is a group of hlist_head now, user can specify bucket size + * by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share + * one lock for reducing memory overhead. + * + * - support lockless hash, caller will take care of locks: + * avoid lock overhead for hash tables that are already protected + * by locking in the caller for another reason + * + * - support both spin_lock/rwlock for bucket: + * overhead of spinlock contention is lower than read/write + * contention of rwlock, so using spinlock to serialize operations on + * bucket is more reasonable for those frequently changed hash tables + * + * - support one-single lock mode: + * one lock to protect all hash operations to avoid overhead of + * multiple locks if hash table is always small + * + * - removed a lot of unnecessary addref & decref on hash element: + * addref & decref are atomic operations in many use-cases which + * are expensive. + * + * - support non-blocking cfs_hash_add() and cfs_hash_findadd(): + * some lustre use-cases require these functions to be strictly + * non-blocking, we need to schedule required rehash on a different + * thread on those cases. + * + * - safer rehash on large hash table + * In old implementation, rehash function will exclusively lock the + * hash table and finish rehash in one batch, it's dangerous on SMP + * system because rehash millions of elements could take long time. + * New implemented rehash can release lock and relax CPU in middle + * of rehash, it's safe for another thread to search/change on the + * hash table even it's in rehasing. + * + * - support two different refcount modes + * . hash table has refcount on element + * . hash table doesn't change refcount on adding/removing element + * + * - support long name hash table (for param-tree) + * + * - fix a bug for cfs_hash_rehash_key: + * in old implementation, cfs_hash_rehash_key could screw up the + * hash-table because @key is overwritten without any protection. + * Now we need user to define hs_keycpy for those rehash enabled + * hash tables, cfs_hash_rehash_key will overwrite hash-key + * inside lock by calling hs_keycpy. + * + * - better hash iteration: + * Now we support both locked iteration & lockless iteration of hash + * table. Also, user can break the iteration by return 1 in callback. 
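+ *
+ * Illustrative call sequence (a sketch only; the ops table, key/object
+ * layout and theta values below are placeholders, not taken from this
+ * file):
+ *
+ *	hs = cfs_hash_create("example", 5, 10, 3, 0, min_theta, max_theta,
+ *			     &example_hash_ops, CFS_HASH_REHASH);
+ *	cfs_hash_add(hs, &obj->key, &obj->hnode);
+ *	...
+ *	obj = cfs_hash_del(hs, &obj->key, &obj->hnode);
+ *	cfs_hash_putref(hs);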
+ */ +#include +#include + +#include +#include + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static unsigned int warn_on_depth = 8; +module_param(warn_on_depth, uint, 0644); +MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high."); +#endif + +struct cfs_wi_sched *cfs_sched_rehash; + +static inline void +cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->spin) +{ + spin_lock(&lock->spin); +} + +static inline void +cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->spin) +{ + spin_unlock(&lock->spin); +} + +static inline void +cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->rw) +{ + if (!exclusive) + read_lock(&lock->rw); + else + write_lock(&lock->rw); +} + +static inline void +cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->rw) +{ + if (!exclusive) + read_unlock(&lock->rw); + else + write_unlock(&lock->rw); +} + +/** No lock hash */ +static struct cfs_hash_lock_ops cfs_hash_nl_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** no bucket lock, one spinlock to protect everything */ +static struct cfs_hash_lock_ops cfs_hash_nbl_lops = { + .hs_lock = cfs_hash_spin_lock, + .hs_unlock = cfs_hash_spin_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** spin bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_spin_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_rw_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** spin bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_spin_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_rw_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +static void +cfs_hash_lock_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs)) { + hs->hs_lops = &cfs_hash_nl_lops; + + } else if (cfs_hash_with_no_bktlock(hs)) { + hs->hs_lops = &cfs_hash_nbl_lops; + spin_lock_init(&hs->hs_lock.spin); + + } else if (cfs_hash_with_rehash(hs)) { + rwlock_init(&hs->hs_lock.rw); + + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_spin_lops; + else + LBUG(); + } else { + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_spin_lops; + else + LBUG(); + } +} + +/** + * Simple hash head without depth tracking + * new element is always added to head of hlist + */ +struct cfs_hash_head { + struct hlist_head hh_head; /**< entries list */ +}; + +static int 
+cfs_hash_hh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_head); +} + +static struct hlist_head * +cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_head *head; + + head = (struct cfs_hash_head *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hh_head; +} + +static int +cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd)); + return -1; /* unknown depth */ +} + +static int +cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_del_init(hnode); + return -1; /* unknown depth */ +} + +/** + * Simple hash head with depth tracking + * new element is always added to head of hlist + */ +struct cfs_hash_head_dep { + struct hlist_head hd_head; /**< entries list */ + unsigned int hd_depth; /**< list length */ +}; + +static int +cfs_hash_hd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_head_dep); +} + +static struct hlist_head * +cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_head_dep *head; + + head = (struct cfs_hash_head_dep *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hd_head; +} + +static int +cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_head_dep *hh; + + hh = container_of(cfs_hash_hd_hhead(hs, bd), + struct cfs_hash_head_dep, hd_head); + hlist_add_head(hnode, &hh->hd_head); + return ++hh->hd_depth; +} + +static int +cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_head_dep *hh; + + hh = container_of(cfs_hash_hd_hhead(hs, bd), + struct cfs_hash_head_dep, hd_head); + hlist_del_init(hnode); + return --hh->hd_depth; +} + +/** + * double links hash head without depth tracking + * new element is always added to tail of hlist + */ +struct cfs_hash_dhead { + struct hlist_head dh_head; /**< entries list */ + struct hlist_node *dh_tail; /**< the last entry */ +}; + +static int +cfs_hash_dh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_dhead); +} + +static struct hlist_head * +cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_dhead *head; + + head = (struct cfs_hash_dhead *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dh_head; +} + +static int +cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_dhead *dh; + + dh = container_of(cfs_hash_dh_hhead(hs, bd), + struct cfs_hash_dhead, dh_head); + if (dh->dh_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dh_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dh_head); + dh->dh_tail = hnode; + return -1; /* unknown depth */ +} + +static int +cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + struct cfs_hash_dhead *dh; + + dh = container_of(cfs_hash_dh_hhead(hs, bd), + struct cfs_hash_dhead, dh_head); + if (hnd->next == NULL) { /* it's the tail */ + dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? 
NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return -1; /* unknown depth */ +} + +/** + * double links hash head with depth tracking + * new element is always added to tail of hlist + */ +struct cfs_hash_dhead_dep { + struct hlist_head dd_head; /**< entries list */ + struct hlist_node *dd_tail; /**< the last entry */ + unsigned int dd_depth; /**< list length */ +}; + +static int +cfs_hash_dd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_dhead_dep); +} + +static struct hlist_head * +cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_dhead_dep *head; + + head = (struct cfs_hash_dhead_dep *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dd_head; +} + +static int +cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_dhead_dep *dh; + + dh = container_of(cfs_hash_dd_hhead(hs, bd), + struct cfs_hash_dhead_dep, dd_head); + if (dh->dd_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dd_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dd_head); + dh->dd_tail = hnode; + return ++dh->dd_depth; +} + +static int +cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + struct cfs_hash_dhead_dep *dh; + + dh = container_of(cfs_hash_dd_hhead(hs, bd), + struct cfs_hash_dhead_dep, dd_head); + if (hnd->next == NULL) { /* it's the tail */ + dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return --dh->dd_depth; +} + +static struct cfs_hash_hlist_ops cfs_hash_hh_hops = { + .hop_hhead = cfs_hash_hh_hhead, + .hop_hhead_size = cfs_hash_hh_hhead_size, + .hop_hnode_add = cfs_hash_hh_hnode_add, + .hop_hnode_del = cfs_hash_hh_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_hd_hops = { + .hop_hhead = cfs_hash_hd_hhead, + .hop_hhead_size = cfs_hash_hd_hhead_size, + .hop_hnode_add = cfs_hash_hd_hnode_add, + .hop_hnode_del = cfs_hash_hd_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_dh_hops = { + .hop_hhead = cfs_hash_dh_hhead, + .hop_hhead_size = cfs_hash_dh_hhead_size, + .hop_hnode_add = cfs_hash_dh_hnode_add, + .hop_hnode_del = cfs_hash_dh_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_dd_hops = { + .hop_hhead = cfs_hash_dd_hhead, + .hop_hhead_size = cfs_hash_dd_hhead_size, + .hop_hnode_add = cfs_hash_dd_hnode_add, + .hop_hnode_del = cfs_hash_dd_hnode_del, +}; + +static void +cfs_hash_hlist_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_add_tail(hs)) { + hs->hs_hops = cfs_hash_with_depth(hs) ? + &cfs_hash_dd_hops : &cfs_hash_dh_hops; + } else { + hs->hs_hops = cfs_hash_with_depth(hs) ? 
+ &cfs_hash_hd_hops : &cfs_hash_hh_hops; + } +} + +static void +cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts, + unsigned int bits, const void *key, struct cfs_hash_bd *bd) +{ + unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1); + + LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits); + + bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)]; + bd->bd_offset = index >> (bits - hs->hs_bkt_bits); +} + +void +cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (likely(hs->hs_rehash_buckets == NULL)) { + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, bd); + } else { + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, bd); + } +} +EXPORT_SYMBOL(cfs_hash_bd_get); + +static inline void +cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur) +{ + if (likely(dep_cur <= bd->bd_bucket->hsb_depmax)) + return; + + bd->bd_bucket->hsb_depmax = dep_cur; +# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + if (likely(warn_on_depth == 0 || + max(warn_on_depth, hs->hs_dep_max) >= dep_cur)) + return; + + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_max = dep_cur; + hs->hs_dep_bkt = bd->bd_bucket->hsb_index; + hs->hs_dep_off = bd->bd_offset; + hs->hs_dep_bits = hs->hs_cur_bits; + spin_unlock(&hs->hs_dep_lock); + + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi); +# endif +} + +void +cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + int rc; + + rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode); + cfs_hash_bd_dep_record(hs, bd, rc); + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + bd->bd_bucket->hsb_count++; + + if (cfs_hash_with_counter(hs)) + atomic_inc(&hs->hs_count); + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_get(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_add_locked); + +void +cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hs->hs_hops->hop_hnode_del(hs, bd, hnode); + + LASSERT(bd->bd_bucket->hsb_count > 0); + bd->bd_bucket->hsb_count--; + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + + if (cfs_hash_with_counter(hs)) { + LASSERT(atomic_read(&hs->hs_count) > 0); + atomic_dec(&hs->hs_count); + } + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_put_locked(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_del_locked); + +void +cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, struct hlist_node *hnode) +{ + struct cfs_hash_bucket *obkt = bd_old->bd_bucket; + struct cfs_hash_bucket *nbkt = bd_new->bd_bucket; + int rc; + + if (cfs_hash_bd_compare(bd_old, bd_new) == 0) + return; + + /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops + * in cfs_hash_bd_del/add_locked */ + hs->hs_hops->hop_hnode_del(hs, bd_old, hnode); + rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode); + cfs_hash_bd_dep_record(hs, bd_new, rc); + + LASSERT(obkt->hsb_count > 0); + obkt->hsb_count--; + obkt->hsb_version++; + if (unlikely(obkt->hsb_version == 0)) + obkt->hsb_version++; + nbkt->hsb_count++; + nbkt->hsb_version++; + if (unlikely(nbkt->hsb_version == 0)) + nbkt->hsb_version++; +} + +enum { + /** always set, for sanity (avoid ZERO intent) */ + CFS_HS_LOOKUP_MASK_FIND = 1 << 0, + /** return entry 
with a ref */ + CFS_HS_LOOKUP_MASK_REF = 1 << 1, + /** add entry if not existing */ + CFS_HS_LOOKUP_MASK_ADD = 1 << 2, + /** delete entry, ignore other masks */ + CFS_HS_LOOKUP_MASK_DEL = 1 << 3, +}; + +enum cfs_hash_lookup_intent { + /** return item w/o refcount */ + CFS_HS_LOOKUP_IT_PEEK = CFS_HS_LOOKUP_MASK_FIND, + /** return item with refcount */ + CFS_HS_LOOKUP_IT_FIND = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_REF), + /** return item w/o refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_ADD = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** return item with refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** delete if existed */ + CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_DEL) +}; + +static struct hlist_node * +cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + enum cfs_hash_lookup_intent intent) + +{ + struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd); + struct hlist_node *ehnode; + struct hlist_node *match; + int intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0; + + /* with this function, we can avoid a lot of useless refcount ops, + * which are expensive atomic operations most time. */ + match = intent_add ? NULL : hnode; + hlist_for_each(ehnode, hhead) { + if (!cfs_hash_keycmp(hs, key, ehnode)) + continue; + + if (match != NULL && match != ehnode) /* can't match */ + continue; + + /* match and ... */ + if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) { + cfs_hash_bd_del_locked(hs, bd, ehnode); + return ehnode; + } + + /* caller wants refcount? */ + if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0) + cfs_hash_get(hs, ehnode); + return ehnode; + } + /* no match item */ + if (!intent_add) + return NULL; + + LASSERT(hnode != NULL); + cfs_hash_bd_add_locked(hs, bd, hnode); + return hnode; +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_FIND); +} +EXPORT_SYMBOL(cfs_hash_bd_lookup_locked); + +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_PEEK); +} +EXPORT_SYMBOL(cfs_hash_bd_peek_locked); + +static void +cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + /** + * bds must be ascendantly ordered by bd->bd_bucket->hsb_index. + * NB: it's possible that several bds point to the same bucket but + * have different bd::bd_offset, so need take care of deadlock. 
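+ *
+ * For example (illustrative): if one caller locks buckets {3, 7} while
+ * another locks {3, 9}, both take bucket 3 first, so neither can block
+ * on a bucket the other already holds.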
+ */ + cfs_hash_for_each_bd(bds, n, i) { + if (prev == bds[i].bd_bucket) + continue; + + LASSERT(prev == NULL || + prev->hsb_index < bds[i].bd_bucket->hsb_index); + cfs_hash_bd_lock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } +} + +static void +cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + cfs_hash_for_each_bd(bds, n, i) { + if (prev != bds[i].bd_bucket) { + cfs_hash_bd_unlock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } + } +} + +static struct hlist_node * +cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL, + CFS_HS_LOOKUP_IT_FIND); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static struct hlist_node * +cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + int intent; + unsigned i; + + LASSERT(hnode != NULL); + intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF); + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, + NULL, intent); + if (ehnode != NULL) + return ehnode; + } + + if (i == 1) { /* only one bucket */ + cfs_hash_bd_add_locked(hs, &bds[0], hnode); + } else { + struct cfs_hash_bd mybd; + + cfs_hash_bd_get(hs, key, &mybd); + cfs_hash_bd_add_locked(hs, &mybd, hnode); + } + + return hnode; +} + +static struct hlist_node * +cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static void +cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + int rc; + + if (bd2->bd_bucket == NULL) + return; + + if (bd1->bd_bucket == NULL) { + *bd1 = *bd2; + bd2->bd_bucket = NULL; + return; + } + + rc = cfs_hash_bd_compare(bd1, bd2); + if (rc == 0) { + bd2->bd_bucket = NULL; + + } else if (rc > 0) { + swap(*bd1, *bd2); /* swab bd1 and bd2 */ + } +} + +void +cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds) +{ + /* NB: caller should hold hs_lock.rw if REHASH is set */ + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, &bds[0]); + if (likely(hs->hs_rehash_buckets == NULL)) { + /* no rehash or not rehashing */ + bds[1].bd_bucket = NULL; + return; + } + + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &bds[1]); + + cfs_hash_bd_order(&bds[0], &bds[1]); +} + +void +cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_lock(hs, bds, 2, excl); +} + +void +cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_unlock(hs, bds, 2, excl); +} + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key) +{ + return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key); +} + +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct 
hlist_node *hnode, + int noref) +{ + return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key, + hnode, noref); +} + +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode) +{ + return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode); +} + +static void +cfs_hash_buckets_free(struct cfs_hash_bucket **buckets, + int bkt_size, int prev_size, int size) +{ + int i; + + for (i = prev_size; i < size; i++) { + if (buckets[i] != NULL) + LIBCFS_FREE(buckets[i], bkt_size); + } + + LIBCFS_FREE(buckets, sizeof(buckets[0]) * size); +} + +/* + * Create or grow bucket memory. Return old_buckets if no allocation was + * needed, the newly allocated buckets if allocation was needed and + * successful, and NULL on error. + */ +static struct cfs_hash_bucket ** +cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts, + unsigned int old_size, unsigned int new_size) +{ + struct cfs_hash_bucket **new_bkts; + int i; + + LASSERT(old_size == 0 || old_bkts != NULL); + + if (old_bkts != NULL && old_size == new_size) + return old_bkts; + + LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size); + if (new_bkts == NULL) + return NULL; + + if (old_bkts != NULL) { + memcpy(new_bkts, old_bkts, + min(old_size, new_size) * sizeof(*old_bkts)); + } + + for (i = old_size; i < new_size; i++) { + struct hlist_head *hhead; + struct cfs_hash_bd bd; + + LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs)); + if (new_bkts[i] == NULL) { + cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs), + old_size, new_size); + return NULL; + } + + new_bkts[i]->hsb_index = i; + new_bkts[i]->hsb_version = 1; /* shouldn't be zero */ + new_bkts[i]->hsb_depmax = -1; /* unknown */ + bd.bd_bucket = new_bkts[i]; + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) + INIT_HLIST_HEAD(hhead); + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_no_bktlock(hs)) + continue; + + if (cfs_hash_with_rw_bktlock(hs)) + rwlock_init(&new_bkts[i]->hsb_lock.rw); + else if (cfs_hash_with_spin_bktlock(hs)) + spin_lock_init(&new_bkts[i]->hsb_lock.spin); + else + LBUG(); /* invalid use-case */ + } + return new_bkts; +} + +/** + * Initialize new libcfs hash, where: + * @name - Descriptive hash name + * @cur_bits - Initial hash table size, in bits + * @max_bits - Maximum allowed hash table resize, in bits + * @ops - Registered hash table operations + * @flags - CFS_HASH_REHASH enable synamic hash resizing + * - CFS_HASH_SORT enable chained hash sort + */ +static int cfs_hash_rehash_worker(struct cfs_workitem *wi); + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static int cfs_hash_dep_print(struct cfs_workitem *wi) +{ + struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_dep_wi); + int dep; + int bkt; + int off; + int bits; + + spin_lock(&hs->hs_dep_lock); + dep = hs->hs_dep_max; + bkt = hs->hs_dep_bkt; + off = hs->hs_dep_off; + bits = hs->hs_dep_bits; + spin_unlock(&hs->hs_dep_lock); + + LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n", + hs->hs_name, bits, dep, bkt, off); + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_bits = 0; /* mark as workitem done */ + spin_unlock(&hs->hs_dep_lock); + return 0; +} + +static void cfs_hash_depth_wi_init(struct cfs_hash *hs) +{ + spin_lock_init(&hs->hs_dep_lock); + cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print); +} + +static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) +{ + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi)) + return; + + spin_lock(&hs->hs_dep_lock); + while 
(hs->hs_dep_bits != 0) { + spin_unlock(&hs->hs_dep_lock); + cond_resched(); + spin_lock(&hs->hs_dep_lock); + } + spin_unlock(&hs->hs_dep_lock); +} + +#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */ + +static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {} +static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {} + +#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */ + +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + struct cfs_hash_ops *ops, unsigned flags) +{ + struct cfs_hash *hs; + int len; + + ENTRY; + + CLASSERT(CFS_HASH_THETA_BITS < 15); + + LASSERT(name != NULL); + LASSERT(ops != NULL); + LASSERT(ops->hs_key); + LASSERT(ops->hs_hash); + LASSERT(ops->hs_object); + LASSERT(ops->hs_keycmp); + LASSERT(ops->hs_get != NULL); + LASSERT(ops->hs_put != NULL || ops->hs_put_locked != NULL); + + if ((flags & CFS_HASH_REHASH) != 0) + flags |= CFS_HASH_COUNTER; /* must have counter */ + + LASSERT(cur_bits > 0); + LASSERT(cur_bits >= bkt_bits); + LASSERT(max_bits >= cur_bits && max_bits < 31); + LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits)); + LASSERT(ergo((flags & CFS_HASH_REHASH) != 0, + (flags & CFS_HASH_NO_LOCK) == 0)); + LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0, + ops->hs_keycpy != NULL)); + + len = (flags & CFS_HASH_BIGNAME) == 0 ? + CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN; + LIBCFS_ALLOC(hs, offsetof(struct cfs_hash, hs_name[len])); + if (hs == NULL) + RETURN(NULL); + + strlcpy(hs->hs_name, name, len); + hs->hs_flags = flags; + + atomic_set(&hs->hs_refcount, 1); + atomic_set(&hs->hs_count, 0); + + cfs_hash_lock_setup(hs); + cfs_hash_hlist_setup(hs); + + hs->hs_cur_bits = (__u8)cur_bits; + hs->hs_min_bits = (__u8)cur_bits; + hs->hs_max_bits = (__u8)max_bits; + hs->hs_bkt_bits = (__u8)bkt_bits; + + hs->hs_ops = ops; + hs->hs_extra_bytes = extra_bytes; + hs->hs_rehash_bits = 0; + cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker); + cfs_hash_depth_wi_init(hs); + + if (cfs_hash_with_rehash(hs)) + __cfs_hash_set_theta(hs, min_theta, max_theta); + + hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0, + CFS_HASH_NBKT(hs)); + if (hs->hs_buckets != NULL) + return hs; + + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[len])); + RETURN(NULL); +} +EXPORT_SYMBOL(cfs_hash_create); + +/** + * Cleanup libcfs hash @hs. + */ +static void +cfs_hash_destroy(struct cfs_hash *hs) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + int i; + ENTRY; + + LASSERT(hs != NULL); + LASSERT(!cfs_hash_is_exiting(hs) && + !cfs_hash_is_iterating(hs)); + + /** + * prohibit further rehashes, don't need any lock because + * I'm the only (last) one can change it. 
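+ *
+ * (Illustrative note: this path is only reached from cfs_hash_putref()
+ * once hs_refcount has dropped to zero, so no other user can still hold
+ * a reference to the hash at this point.)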
+ */ + hs->hs_exiting = 1; + if (cfs_hash_with_rehash(hs)) + cfs_hash_rehash_cancel(hs); + + cfs_hash_depth_wi_cancel(hs); + /* rehash should be done/canceled */ + LASSERT(hs->hs_buckets != NULL && + hs->hs_rehash_buckets == NULL); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + LASSERT(bd.bd_bucket != NULL); + /* no need to take this lock, just for consistent code */ + cfs_hash_bd_lock(hs, &bd, 1); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + LASSERTF(!cfs_hash_with_assert_empty(hs), + "hash %s bucket %u(%u) is not " + " empty: %u items left\n", + hs->hs_name, bd.bd_bucket->hsb_index, + bd.bd_offset, bd.bd_bucket->hsb_count); + /* can't assert key valicate, because we + * can interrupt rehash */ + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_exit(hs, hnode); + } + } + LASSERT(bd.bd_bucket->hsb_count == 0); + cfs_hash_bd_unlock(hs, &bd, 1); + cond_resched(); + } + + LASSERT(atomic_read(&hs->hs_count) == 0); + + cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs), + 0, CFS_HASH_NBKT(hs)); + i = cfs_hash_with_bigname(hs) ? + CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN; + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[i])); + + EXIT; +} + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs) +{ + if (atomic_inc_not_zero(&hs->hs_refcount)) + return hs; + return NULL; +} +EXPORT_SYMBOL(cfs_hash_getref); + +void cfs_hash_putref(struct cfs_hash *hs) +{ + if (atomic_dec_and_test(&hs->hs_refcount)) + cfs_hash_destroy(hs); +} +EXPORT_SYMBOL(cfs_hash_putref); + +static inline int +cfs_hash_rehash_bits(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs) || + !cfs_hash_with_rehash(hs)) + return -EOPNOTSUPP; + + if (unlikely(cfs_hash_is_exiting(hs))) + return -ESRCH; + + if (unlikely(cfs_hash_is_rehashing(hs))) + return -EALREADY; + + if (unlikely(cfs_hash_is_iterating(hs))) + return -EAGAIN; + + /* XXX: need to handle case with max_theta != 2.0 + * and the case with min_theta != 0.5 */ + if ((hs->hs_cur_bits < hs->hs_max_bits) && + (__cfs_hash_theta(hs) > hs->hs_max_theta)) + return hs->hs_cur_bits + 1; + + if (!cfs_hash_with_shrink(hs)) + return 0; + + if ((hs->hs_cur_bits > hs->hs_min_bits) && + (__cfs_hash_theta(hs) < hs->hs_min_theta)) + return hs->hs_cur_bits - 1; + + return 0; +} + +/** + * don't allow inline rehash if: + * - user wants non-blocking change (add/del) on hash table + * - too many elements + */ +static inline int +cfs_hash_rehash_inline(struct cfs_hash *hs) +{ + return !cfs_hash_with_nblk_change(hs) && + atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called when the item is added. 
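+ *
+ * For illustration only, a minimal usage sketch; the object layout and the
+ * 'jobs_hash' table below are hypothetical, not part of this API:
+ *
+ *	struct my_obj {
+ *		__u64			mo_key;
+ *		struct hlist_node	mo_hnode;
+ *	};
+ *
+ *	obj->mo_key = id;
+ *	cfs_hash_add(jobs_hash, &obj->mo_key, &obj->mo_hnode);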
+ */ +void +cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bd; + int bits; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_bd_get_and_lock(hs, key, &bd, 1); + + cfs_hash_key_validate(hs, key, hnode); + cfs_hash_bd_add_locked(hs, &bd, hnode); + + cfs_hash_bd_unlock(hs, &bd, 1); + + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); +} +EXPORT_SYMBOL(cfs_hash_add); + +static struct hlist_node * +cfs_hash_find_or_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + struct cfs_hash_bd bds[2]; + int bits = 0; + + LASSERTF(hlist_unhashed(hnode), "hnode = %p\n", hnode); + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + cfs_hash_key_validate(hs, key, hnode); + ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key, + hnode, noref); + cfs_hash_dual_bd_unlock(hs, bds, 1); + + if (ehnode == hnode) /* new item added */ + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return ehnode; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called if the item was added. + * Returns 0 on success or -EALREADY on key collisions. + */ +int +cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ? + -EALREADY : 0; +} +EXPORT_SYMBOL(cfs_hash_add_unique); + +/** + * Add item @hnode to libcfs hash @hs using @key. If this @key + * already exists in the hash then ops->hs_get will be called on the + * conflicting entry and that entry will be returned to the caller. + * Otherwise ops->hs_get is called on the item which was added. + */ +void * +cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + hnode = cfs_hash_find_or_add(hs, key, hnode, 0); + + return cfs_hash_object(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_findadd_unique); + +/** + * Delete item @hnode from the libcfs hash @hs using @key. The @key + * is required to ensure the correct hash bucket is locked since there + * is no direct linkage from the item to the bucket. The object + * removed from the hash will be returned and obs->hs_put is called + * on the removed object. + */ +void * +cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + void *obj = NULL; + int bits = 0; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + /* NB: do nothing if @hnode is not in hash table */ + if (hnode == NULL || !hlist_unhashed(hnode)) { + if (bds[1].bd_bucket == NULL && hnode != NULL) { + cfs_hash_bd_del_locked(hs, &bds[0], hnode); + } else { + hnode = cfs_hash_dual_bd_finddel_locked(hs, bds, + key, hnode); + } + } + + if (hnode != NULL) { + obj = cfs_hash_object(hs, hnode); + bits = cfs_hash_rehash_bits(hs); + } + + cfs_hash_dual_bd_unlock(hs, bds, 1); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_del); + +/** + * Delete item given @key in libcfs hash @hs. The first @key found in + * the hash will be removed, if the key exists multiple times in the hash + * @hs this function must be called once per key. The removed object + * will be returned and ops->hs_put is called on the removed object. 
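+ *
+ * A hedged usage sketch (the 'jobs_hash' table and the 'my_obj' layout are
+ * hypothetical, matching the cfs_hash_add() example above):
+ *
+ *	void *removed;
+ *
+ *	removed = cfs_hash_del(jobs_hash, &obj->mo_key, &obj->mo_hnode);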
+ */
+void *
+cfs_hash_del_key(struct cfs_hash *hs, const void *key)
+{
+	return cfs_hash_del(hs, key, NULL);
+}
+EXPORT_SYMBOL(cfs_hash_del_key);
+
+/**
+ * Lookup an item using @key in the libcfs hash @hs and return it.
+ * If the @key is found in the hash, hs->hs_get() is called and the
+ * matching object is returned.  It is the caller's responsibility
+ * to call the counterpart ops->hs_put using the cfs_hash_put() macro
+ * when finished with the object.  If the @key was not found
+ * in the hash @hs, NULL is returned.
+ */
+void *
+cfs_hash_lookup(struct cfs_hash *hs, const void *key)
+{
+	void *obj = NULL;
+	struct hlist_node *hnode;
+	struct cfs_hash_bd bds[2];
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+	hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+	if (hnode != NULL)
+		obj = cfs_hash_object(hs, hnode);
+
+	cfs_hash_dual_bd_unlock(hs, bds, 0);
+	cfs_hash_unlock(hs, 0);
+
+	return obj;
+}
+EXPORT_SYMBOL(cfs_hash_lookup);
+
+static void
+cfs_hash_for_each_enter(struct cfs_hash *hs)
+{
+	LASSERT(!cfs_hash_is_exiting(hs));
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	/*
+	 * NB: it's a race on cfs_hash_t::hs_iterating, but it doesn't matter
+	 * because it's just an unreliable signal to the rehash thread,
+	 * which will try to finish the rehash ASAP when seeing this.
+	 */
+	hs->hs_iterating = 1;
+
+	cfs_hash_lock(hs, 1);
+	hs->hs_iterators++;
+
+	/* NB: iteration is mostly called by service threads; rather than
+	 * blocking them we cancel any pending rehash request and relaunch
+	 * it after the iteration */
+	if (cfs_hash_is_rehashing(hs))
+		cfs_hash_rehash_cancel_locked(hs);
+	cfs_hash_unlock(hs, 1);
+}
+
+static void
+cfs_hash_for_each_exit(struct cfs_hash *hs)
+{
+	int remained;
+	int bits;
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	cfs_hash_lock(hs, 1);
+	remained = --hs->hs_iterators;
+	bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 1);
+	/* NB: it's a race on cfs_hash_t::hs_iterating, see above */
+	if (remained == 0)
+		hs->hs_iterating = 0;
+	if (bits > 0) {
+		cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
+				    CFS_HASH_LOOP_HOG);
+	}
+}
+
+/**
+ * For each item in the libcfs hash @hs call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ *
+ * a) the function may sleep!
+ * b) during the callback:
+ *    . the bucket lock is held so the callback must never sleep.
+ *    .
if @removal_safe is true, use can remove current item by + * cfs_hash_bd_del_locked + */ +static __u64 +cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int remove_safe) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + __u64 count = 0; + int excl = !!remove_safe; + int loop = 0; + int i; + ENTRY; + + cfs_hash_for_each_enter(hs); + + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, excl); + if (func == NULL) { /* only glimpse size */ + count += bd.bd_bucket->hsb_count; + cfs_hash_bd_unlock(hs, &bd, excl); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + cfs_hash_bucket_validate(hs, &bd, hnode); + count++; + loop++; + if (func(hs, &bd, hnode, data)) { + cfs_hash_bd_unlock(hs, &bd, excl); + goto out; + } + } + } + cfs_hash_bd_unlock(hs, &bd, excl); + if (loop < CFS_HASH_LOOP_HOG) + continue; + loop = 0; + cfs_hash_unlock(hs, 0); + cond_resched(); + cfs_hash_lock(hs, 0); + } + out: + cfs_hash_unlock(hs, 0); + + cfs_hash_for_each_exit(hs); + RETURN(count); +} + +struct cfs_hash_cond_arg { + cfs_hash_cond_opt_cb_t func; + void *arg; +}; + +static int +cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct cfs_hash_cond_arg *cond = data; + + if (cond->func(cfs_hash_object(hs, hnode), cond->arg)) + cfs_hash_bd_del_locked(hs, bd, hnode); + return 0; +} + +/** + * Delete item from the libcfs hash @hs when @func return true. + * The write lock being hold during loop for each bucket to avoid + * any object be reference. + */ +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data) +{ + struct cfs_hash_cond_arg arg = { + .func = func, + .arg = data, + }; + + cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1); +} +EXPORT_SYMBOL(cfs_hash_cond_del); + +void +cfs_hash_for_each(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each); + +void +cfs_hash_for_each_safe(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 1); +} +EXPORT_SYMBOL(cfs_hash_for_each_safe); + +static int +cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + *(int *)data = 0; + return 1; /* return 1 to break the loop */ +} + +int +cfs_hash_is_empty(struct cfs_hash *hs) +{ + int empty = 1; + + cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0); + return empty; +} +EXPORT_SYMBOL(cfs_hash_is_empty); + +__u64 +cfs_hash_size_get(struct cfs_hash *hs) +{ + return cfs_hash_with_counter(hs) ? + atomic_read(&hs->hs_count) : + cfs_hash_for_each_tight(hs, NULL, NULL, 0); +} +EXPORT_SYMBOL(cfs_hash_size_get); + +/* + * cfs_hash_for_each_relax: + * Iterate the hash table and call @func on each item without + * any lock. This function can't guarantee to finish iteration + * if these features are enabled: + * + * a. if rehash_key is enabled, an item can be moved from + * one bucket to another bucket + * b. user can remove non-zero-ref item from hash-table, + * so the item can be removed from hash-table, even worse, + * it's possible that user changed key and insert to another + * hash bucket. 
+ * there's no way for us to finish iteration correctly on previous + * two cases, so iteration has to be stopped on change. + */ +static int +cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int start) +{ + struct hlist_node *hnode; + struct hlist_node *next = NULL; + struct cfs_hash_bd bd; + __u32 version; + int count = 0; + int stop_on_change; + int has_put_locked; + int rc = 0; + int i, end = -1; + ENTRY; + + stop_on_change = cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs); + has_put_locked = hs->hs_ops->hs_put_locked != NULL; + cfs_hash_lock(hs, 0); +again: + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + if (i < start) + continue; + else if (end > 0 && i >= end) + break; + + cfs_hash_bd_lock(hs, &bd, 0); + version = cfs_hash_bd_version_get(&bd); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hnode = hhead->first; + if (hnode == NULL) + continue; + cfs_hash_get(hs, hnode); + for (; hnode != NULL; hnode = next) { + cfs_hash_bucket_validate(hs, &bd, hnode); + next = hnode->next; + if (next != NULL) + cfs_hash_get(hs, next); + cfs_hash_bd_unlock(hs, &bd, 0); + cfs_hash_unlock(hs, 0); + + rc = func(hs, &bd, hnode, data); + if (stop_on_change || !has_put_locked) + cfs_hash_put(hs, hnode); + + cond_resched(); + count++; + + cfs_hash_lock(hs, 0); + cfs_hash_bd_lock(hs, &bd, 0); + if (stop_on_change) { + if (version != + cfs_hash_bd_version_get(&bd)) + rc = -EINTR; + } else if (has_put_locked) { + cfs_hash_put_locked(hs, hnode); + } + if (rc) /* callback wants to break iteration */ + break; + } + if (next != NULL) { + if (has_put_locked) { + cfs_hash_put_locked(hs, next); + next = NULL; + } + break; + } else if (rc != 0) { + break; + } + } + cfs_hash_bd_unlock(hs, &bd, 0); + if (next != NULL && !has_put_locked) { + cfs_hash_put(hs, next); + next = NULL; + } + if (rc) /* callback wants to break iteration */ + break; + } + + if (start > 0 && rc == 0) { + end = start; + start = 0; + goto again; + } + + cfs_hash_unlock(hs, 0); + return count; +} + +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data, int start) +{ + ENTRY; + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs)) + RETURN(-EOPNOTSUPP); + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + RETURN(-EOPNOTSUPP); + + cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data, start); + cfs_hash_for_each_exit(hs); + + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_nolock); + +/** + * For each hash bucket in the libcfs hash @hs call the passed callback + * @func until all the hash buckets are empty. The passed callback @func + * or the previously registered callback hs->hs_put must remove the item + * from the hash. You may either use the cfs_hash_del() or hlist_del() + * functions. No rwlocks will be held during the callback @func it is + * safe to sleep if needed. This function will not terminate until the + * hash is empty. Note it is still possible to concurrently add new + * items in to the hash. It is the callers responsibility to ensure + * the required locking is in place to prevent concurrent insertions. 
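+ *
+ * As a hedged sketch (the callback name and object layout are hypothetical),
+ * a drain callback could remove each object through cfs_hash_del(), as the
+ * text above permits:
+ *
+ *	static int my_drop(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ *			   struct hlist_node *hnode, void *data)
+ *	{
+ *		struct my_obj *obj = cfs_hash_object(hs, hnode);
+ *
+ *		cfs_hash_del(hs, &obj->mo_key, hnode);
+ *		return 0;
+ *	}
+ *
+ *	cfs_hash_for_each_empty(hs, my_drop, NULL);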
+ */ +int +cfs_hash_for_each_empty(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + unsigned i = 0; + ENTRY; + + if (cfs_hash_with_no_lock(hs)) + return -EOPNOTSUPP; + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data, 0)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_empty); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_head *hhead; + struct hlist_node *hnode; + struct cfs_hash_bd bd; + + cfs_hash_for_each_enter(hs); + cfs_hash_lock(hs, 0); + if (hindex >= CFS_HASH_NHLIST(hs)) + goto out; + + cfs_hash_bd_index_set(hs, hindex, &bd); + + cfs_hash_bd_lock(hs, &bd, 0); + hhead = cfs_hash_bd_hhead(hs, &bd); + hlist_for_each(hnode, hhead) { + if (func(hs, &bd, hnode, data)) + break; + } + cfs_hash_bd_unlock(hs, &bd, 0); +out: + cfs_hash_unlock(hs, 0); + cfs_hash_for_each_exit(hs); +} + +EXPORT_SYMBOL(cfs_hash_hlist_for_each); + +/* + * For each item in the libcfs hash @hs which matches the @key call + * the passed callback @func and pass to it as an argument each hash + * item and the private @data. During the callback the bucket lock + * is held so the callback must never sleep. + */ +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + unsigned i; + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + cfs_hash_for_each_bd(bds, 2, i) { + struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]); + + hlist_for_each(hnode, hlist) { + cfs_hash_bucket_validate(hs, &bds[i], hnode); + + if (cfs_hash_keycmp(hs, key, hnode)) { + if (func(hs, &bds[i], hnode, data)) + break; + } + } + } + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each_key); + +/** + * Rehash the libcfs hash @hs to the given @bits. This can be used + * to grow the hash size when excessive chaining is detected, or to + * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH + * flag is set in @hs the libcfs hash may be dynamically rehashed + * during addition or removal if the hash's theta value exceeds + * either the hs->hs_min_theta or hs->max_theta values. By default + * these values are tuned to keep the chained hash depth small, and + * this approach assumes a reasonably uniform hashing function. The + * theta thresholds for @hs are tunable via cfs_hash_set_theta(). + */ +void +cfs_hash_rehash_cancel_locked(struct cfs_hash *hs) +{ + int i; + + /* need hold cfs_hash_lock(hs, 1) */ + LASSERT(cfs_hash_with_rehash(hs) && + !cfs_hash_with_no_lock(hs)); + + if (!cfs_hash_is_rehashing(hs)) + return; + + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) { + hs->hs_rehash_bits = 0; + return; + } + + for (i = 2; cfs_hash_is_rehashing(hs); i++) { + cfs_hash_unlock(hs, 1); + /* raise console warning while waiting too long */ + CDEBUG(is_power_of_2(i >> 3) ? 
D_WARNING : D_INFO, + "hash %s is still rehashing, rescheded %d\n", + hs->hs_name, i - 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } +} + +void +cfs_hash_rehash_cancel(struct cfs_hash *hs) +{ + cfs_hash_lock(hs, 1); + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} + +int +cfs_hash_rehash(struct cfs_hash *hs, int do_rehash) +{ + int rc; + + LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs)); + + cfs_hash_lock(hs, 1); + + rc = cfs_hash_rehash_bits(hs); + if (rc <= 0) { + cfs_hash_unlock(hs, 1); + return rc; + } + + hs->hs_rehash_bits = rc; + if (!do_rehash) { + /* launch and return */ + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi); + cfs_hash_unlock(hs, 1); + return 0; + } + + /* rehash right now */ + cfs_hash_unlock(hs, 1); + + return cfs_hash_rehash_worker(&hs->hs_rehash_wi); +} + +static int +cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old) +{ + struct cfs_hash_bd new; + struct hlist_head *hhead; + struct hlist_node *hnode; + struct hlist_node *pos; + void *key; + int c = 0; + + /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */ + cfs_hash_bd_for_each_hlist(hs, old, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + key = cfs_hash_key(hs, hnode); + LASSERT(key != NULL); + /* Validate hnode is in the correct bucket. */ + cfs_hash_bucket_validate(hs, old, hnode); + /* + * Delete from old hash bucket; move to new bucket. + * ops->hs_key must be defined. + */ + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &new); + cfs_hash_bd_move_locked(hs, old, &new, hnode); + c++; + } + } + return c; +} + +static int +cfs_hash_rehash_worker(struct cfs_workitem *wi) +{ + struct cfs_hash *hs = + container_of(wi, struct cfs_hash, hs_rehash_wi); + struct cfs_hash_bucket **bkts; + struct cfs_hash_bd bd; + unsigned int old_size; + unsigned int new_size; + int bsize; + int count = 0; + int rc = 0; + int i; + + LASSERT(hs != NULL && cfs_hash_with_rehash(hs)); + + cfs_hash_lock(hs, 0); + LASSERT(cfs_hash_is_rehashing(hs)); + + old_size = CFS_HASH_NBKT(hs); + new_size = CFS_HASH_RH_NBKT(hs); + + cfs_hash_unlock(hs, 0); + + /* + * don't need hs::hs_rwlock for hs::hs_buckets, + * because nobody can change bkt-table except me. 
+ */ + bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets, + old_size, new_size); + cfs_hash_lock(hs, 1); + if (bkts == NULL) { + rc = -ENOMEM; + goto out; + } + + if (bkts == hs->hs_buckets) { + bkts = NULL; /* do nothing */ + goto out; + } + + rc = __cfs_hash_theta(hs); + if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) { + /* free the new allocated bkt-table */ + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + rc = -EALREADY; + goto out; + } + + LASSERT(hs->hs_rehash_buckets == NULL); + hs->hs_rehash_buckets = bkts; + + rc = 0; + cfs_hash_for_each_bucket(hs, &bd, i) { + if (cfs_hash_is_exiting(hs)) { + rc = -ESRCH; + /* someone wants to destroy the hash, abort now */ + if (old_size < new_size) /* OK to free old bkt-table */ + break; + /* it's shrinking, need free new bkt-table */ + hs->hs_rehash_buckets = NULL; + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + goto out; + } + + count += cfs_hash_rehash_bd(hs, &bd); + if (count < CFS_HASH_LOOP_HOG || + cfs_hash_is_iterating(hs)) { /* need to finish ASAP */ + continue; + } + + count = 0; + cfs_hash_unlock(hs, 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } + + hs->hs_rehash_count++; + + bkts = hs->hs_buckets; + hs->hs_buckets = hs->hs_rehash_buckets; + hs->hs_rehash_buckets = NULL; + + hs->hs_cur_bits = hs->hs_rehash_bits; + out: + hs->hs_rehash_bits = 0; + if (rc == -ESRCH) /* never be scheduled again */ + cfs_wi_exit(cfs_sched_rehash, wi); + bsize = cfs_hash_bkt_size(hs); + cfs_hash_unlock(hs, 1); + /* can't refer to @hs anymore because it could be destroyed */ + if (bkts != NULL) + cfs_hash_buckets_free(bkts, bsize, new_size, old_size); + if (rc != 0) + CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc); + /* return 1 only if cfs_wi_exit is called */ + return rc == -ESRCH; +} + +/** + * Rehash the object referenced by @hnode in the libcfs hash @hs. The + * @old_key must be provided to locate the objects previous location + * in the hash, and the @new_key will be used to reinsert the object. + * Use this function instead of a cfs_hash_add() + cfs_hash_del() + * combo when it is critical that there is no window in time where the + * object is missing from the hash. When an object is being rehashed + * the registered cfs_hash_get() and cfs_hash_put() functions will + * not be called. 
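+ *
+ * For illustration (hypothetical key variables; requires a registered
+ * ops->hs_keycpy method, see cfs_hash_create()):
+ *
+ *	cfs_hash_rehash_key(hs, &old_id, &new_id, &obj->mo_hnode);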
+ */ +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[3]; + struct cfs_hash_bd old_bds[2]; + struct cfs_hash_bd new_bd; + + LASSERT(!hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get(hs, old_key, old_bds); + cfs_hash_bd_get(hs, new_key, &new_bd); + + bds[0] = old_bds[0]; + bds[1] = old_bds[1]; + bds[2] = new_bd; + + /* NB: bds[0] and bds[1] are ordered already */ + cfs_hash_bd_order(&bds[1], &bds[2]); + cfs_hash_bd_order(&bds[0], &bds[1]); + + cfs_hash_multi_bd_lock(hs, bds, 3, 1); + if (likely(old_bds[1].bd_bucket == NULL)) { + cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode); + } else { + cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode); + cfs_hash_bd_add_locked(hs, &new_bd, hnode); + } + /* overwrite key inside locks, otherwise may screw up with + * other operations, i.e: rehash */ + cfs_hash_keycpy(hs, hnode, new_key); + + cfs_hash_multi_bd_unlock(hs, bds, 3, 1); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_rehash_key); + +void cfs_hash_debug_header(struct seq_file *m) +{ + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count maxdep maxdepb distribution\n", + CFS_HASH_BIGNAME_LEN, "name"); +} +EXPORT_SYMBOL(cfs_hash_debug_header); + +static struct cfs_hash_bucket ** +cfs_hash_full_bkts(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return hs->hs_buckets; + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + hs->hs_rehash_buckets : hs->hs_buckets; +} + +static unsigned int +cfs_hash_full_nbkt(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return CFS_HASH_NBKT(hs); + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs); +} + +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m) +{ + int dist[8] = { 0, }; + int maxdep = -1; + int maxdepb = -1; + int total = 0; + int theta; + int i; + + cfs_hash_lock(hs, 0); + theta = __cfs_hash_theta(hs); + + seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d 0x%02x %6d ", + CFS_HASH_BIGNAME_LEN, hs->hs_name, + 1 << hs->hs_cur_bits, 1 << hs->hs_min_bits, + 1 << hs->hs_max_bits, + __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta), + __cfs_hash_theta_int(hs->hs_min_theta), + __cfs_hash_theta_frac(hs->hs_min_theta), + __cfs_hash_theta_int(hs->hs_max_theta), + __cfs_hash_theta_frac(hs->hs_max_theta), + hs->hs_flags, hs->hs_rehash_count); + + /* + * The distribution is a summary of the chained hash depth in + * each of the libcfs hash buckets. Each buckets hsb_count is + * divided by the hash theta value and used to generate a + * histogram of the hash distribution. A uniform hash will + * result in all hash buckets being close to the average thus + * only the first few entries in the histogram will be non-zero. + * If you hash function results in a non-uniform hash the will + * be observable by outlier bucks in the distribution histogram. 
+ * + * Uniform hash distribution: 128/128/0/0/0/0/0/0 + * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 + */ + for (i = 0; i < cfs_hash_full_nbkt(hs); i++) { + struct cfs_hash_bd bd; + + bd.bd_bucket = cfs_hash_full_bkts(hs)[i]; + cfs_hash_bd_lock(hs, &bd, 0); + if (maxdep < bd.bd_bucket->hsb_depmax) { + maxdep = bd.bd_bucket->hsb_depmax; + maxdepb = ffz(~maxdep); + } + total += bd.bd_bucket->hsb_count; + dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++; + cfs_hash_bd_unlock(hs, &bd, 0); + } + + seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb); + for (i = 0; i < 8; i++) + seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); + + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_debug_str); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/heap.c b/drivers/staging/lustrefsx/libcfs/libcfs/heap.c new file mode 100644 index 0000000000000..4efc4eba743b3 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/heap.c @@ -0,0 +1,499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + */ +/* + * libcfs/libcfs/heap.c + * + * Author: Eric Barton + * Liang Zhen + */ +/** \addtogroup heap + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#define CBH_ALLOC(ptr, h) \ +do { \ + if (h->cbh_cptab) { \ + if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW) \ + LIBCFS_CPT_ALLOC_GFP((ptr), h->cbh_cptab, \ + h->cbh_cptid, CBH_NOB, \ + GFP_ATOMIC); \ + else \ + LIBCFS_CPT_ALLOC((ptr), h->cbh_cptab, \ + h->cbh_cptid, CBH_NOB); \ + } else { \ + if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW) \ + LIBCFS_ALLOC_ATOMIC((ptr), CBH_NOB); \ + else \ + LIBCFS_ALLOC((ptr), CBH_NOB); \ + } \ +} while (0) + +#define CBH_FREE(ptr) LIBCFS_FREE(ptr, CBH_NOB) + +/** + * Grows the capacity of a binary heap so that it can handle a larger number of + * \e struct cfs_binheap_node objects. 
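+ * Each successful call raises the high-water mark by one CBH_SIZE chunk; with
+ * the three indirection levels the heap can hold up to
+ * CBH_SIZE + CBH_SIZE^2 + CBH_SIZE^3 elements.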
+ * + * \param[in] h The binary heap + * + * \retval 0 Successfully grew the heap + * \retval -ENOMEM OOM error + */ +static int +cfs_binheap_grow(struct cfs_binheap *h) +{ + struct cfs_binheap_node ***frag1 = NULL; + struct cfs_binheap_node **frag2; + int hwm = h->cbh_hwm; + + /* need a whole new chunk of pointers */ + LASSERT((h->cbh_hwm & CBH_MASK) == 0); + + if (hwm == 0) { + /* first use of single indirect */ + CBH_ALLOC(h->cbh_elements1, h); + if (h->cbh_elements1 == NULL) + return -ENOMEM; + + goto out; + } + + hwm -= CBH_SIZE; + if (hwm < CBH_SIZE * CBH_SIZE) { + /* not filled double indirect */ + CBH_ALLOC(frag2, h); + if (frag2 == NULL) + return -ENOMEM; + + if (hwm == 0) { + /* first use of double indirect */ + CBH_ALLOC(h->cbh_elements2, h); + if (h->cbh_elements2 == NULL) { + CBH_FREE(frag2); + return -ENOMEM; + } + } + + h->cbh_elements2[hwm >> CBH_SHIFT] = frag2; + goto out; + } + + hwm -= CBH_SIZE * CBH_SIZE; +#if (CBH_SHIFT * 3 < 32) + if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) { + /* filled triple indirect */ + return -ENOMEM; + } +#endif + CBH_ALLOC(frag2, h); + if (frag2 == NULL) + return -ENOMEM; + + if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) { + /* first use of this 2nd level index */ + CBH_ALLOC(frag1, h); + if (frag1 == NULL) { + CBH_FREE(frag2); + return -ENOMEM; + } + } + + if (hwm == 0) { + /* first use of triple indirect */ + CBH_ALLOC(h->cbh_elements3, h); + if (h->cbh_elements3 == NULL) { + CBH_FREE(frag2); + CBH_FREE(frag1); + return -ENOMEM; + } + } + + if (frag1 != NULL) { + LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL); + h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1; + } else { + frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)]; + LASSERT(frag1 != NULL); + } + + frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2; + + out: + h->cbh_hwm += CBH_SIZE; + return 0; +} + +/** + * Creates and initializes a binary heap instance. + * + * \param[in] ops The operations to be used + * \param[in] flags The heap flags + * \parm[in] count The initial heap capacity in # of elements + * \param[in] arg An optional private argument + * \param[in] cptab The CPT table this heap instance will operate over + * \param[in] cptid The CPT id of \a cptab this heap instance will operate over + * + * \retval valid-pointer A newly-created and initialized binary heap object + * \retval NULL error + */ +struct cfs_binheap * +cfs_binheap_create(struct cfs_binheap_ops *ops, unsigned int flags, + unsigned count, void *arg, struct cfs_cpt_table *cptab, + int cptid) +{ + struct cfs_binheap *h; + + LASSERT(ops != NULL); + LASSERT(ops->hop_compare != NULL); + if (cptab) { + LASSERT(cptid == CFS_CPT_ANY || + (cptid >= 0 && cptid < cptab->ctb_nparts)); + LIBCFS_CPT_ALLOC(h, cptab, cptid, sizeof(*h)); + } else { + LIBCFS_ALLOC(h, sizeof(*h)); + } + if (!h) + return NULL; + + h->cbh_ops = ops; + h->cbh_nelements = 0; + h->cbh_hwm = 0; + h->cbh_private = arg; + h->cbh_flags = flags & (~CBH_FLAG_ATOMIC_GROW); + h->cbh_cptab = cptab; + h->cbh_cptid = cptid; + + while (h->cbh_hwm < count) { /* preallocate */ + if (cfs_binheap_grow(h) != 0) { + cfs_binheap_destroy(h); + return NULL; + } + } + + h->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW; + + return h; +} +EXPORT_SYMBOL(cfs_binheap_create); + +/** + * Releases all resources associated with a binary heap instance. + * + * Deallocates memory for all indirection levels and the binary heap object + * itself. 
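+ *
+ * A typical lifecycle, as a hedged sketch ('my_ops' is a caller-provided
+ * struct cfs_binheap_ops and 'obj->node' an embedded struct cfs_binheap_node;
+ * both are hypothetical):
+ *
+ *	h = cfs_binheap_create(&my_ops, 0, 128, NULL, NULL, CFS_CPT_ANY);
+ *	if (h != NULL) {
+ *		cfs_binheap_insert(h, &obj->node);
+ *		...
+ *		cfs_binheap_remove(h, &obj->node);
+ *		cfs_binheap_destroy(h);
+ *	}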
+ * + * \param[in] h The binary heap object + */ +void +cfs_binheap_destroy(struct cfs_binheap *h) +{ + int idx0; + int idx1; + int n; + + LASSERT(h != NULL); + + n = h->cbh_hwm; + + if (n > 0) { + CBH_FREE(h->cbh_elements1); + n -= CBH_SIZE; + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + CBH_FREE(h->cbh_elements2[idx0]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements2); + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + + for (idx1 = 0; idx1 < CBH_SIZE && n > 0; idx1++) { + CBH_FREE(h->cbh_elements3[idx0][idx1]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements3[idx0]); + } + + CBH_FREE(h->cbh_elements3); + } + + LIBCFS_FREE(h, sizeof(*h)); +} +EXPORT_SYMBOL(cfs_binheap_destroy); + +/** + * Obtains a double pointer to a heap element, given its index into the binary + * tree. + * + * \param[in] h The binary heap instance + * \param[in] idx The requested node's index + * + * \retval valid-pointer A double pointer to a heap pointer entry + */ +static struct cfs_binheap_node ** +cfs_binheap_pointer(struct cfs_binheap *h, unsigned int idx) +{ + if (idx < CBH_SIZE) + return &(h->cbh_elements1[idx]); + + idx -= CBH_SIZE; + if (idx < CBH_SIZE * CBH_SIZE) + return &(h->cbh_elements2[idx >> CBH_SHIFT][idx & CBH_MASK]); + + idx -= CBH_SIZE * CBH_SIZE; + return &(h->cbh_elements3[idx >> (2 * CBH_SHIFT)]\ + [(idx >> CBH_SHIFT) & CBH_MASK]\ + [idx & CBH_MASK]); +} + +/** + * Obtains a pointer to a heap element, given its index into the binary tree. + * + * \param[in] h The binary heap + * \param[in] idx The requested node's index + * + * \retval valid-pointer The requested heap node + * \retval NULL Supplied index is out of bounds + */ +struct cfs_binheap_node * +cfs_binheap_find(struct cfs_binheap *h, unsigned int idx) +{ + if (idx >= h->cbh_nelements) + return NULL; + + return *cfs_binheap_pointer(h, idx); +} +EXPORT_SYMBOL(cfs_binheap_find); + +/** + * Moves a node upwards, towards the root of the binary tree. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +cfs_binheap_bubble(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + unsigned int cur_idx = e->chn_index; + struct cfs_binheap_node **cur_ptr; + unsigned int parent_idx; + struct cfs_binheap_node **parent_ptr; + int did_sth = 0; + + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx > 0) { + parent_idx = (cur_idx - 1) >> 1; + + parent_ptr = cfs_binheap_pointer(h, parent_idx); + LASSERT((*parent_ptr)->chn_index == parent_idx); + + if (h->cbh_ops->hop_compare(*parent_ptr, e)) + break; + + (*parent_ptr)->chn_index = cur_idx; + *cur_ptr = *parent_ptr; + cur_ptr = parent_ptr; + cur_idx = parent_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Moves a node downwards, towards the last level of the binary tree. 
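+ * At each step \a e is swapped with the higher-priority (per hop_compare())
+ * of its two children until the heap property holds again.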
+ * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +cfs_binheap_sink(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int child_idx; + struct cfs_binheap_node **child_ptr; + struct cfs_binheap_node *child; + unsigned int child2_idx; + struct cfs_binheap_node **child2_ptr; + struct cfs_binheap_node *child2; + unsigned int cur_idx; + struct cfs_binheap_node **cur_ptr; + int did_sth = 0; + + cur_idx = e->chn_index; + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx < n) { + child_idx = (cur_idx << 1) + 1; + if (child_idx >= n) + break; + + child_ptr = cfs_binheap_pointer(h, child_idx); + child = *child_ptr; + + child2_idx = child_idx + 1; + if (child2_idx < n) { + child2_ptr = cfs_binheap_pointer(h, child2_idx); + child2 = *child2_ptr; + + if (h->cbh_ops->hop_compare(child2, child)) { + child_idx = child2_idx; + child_ptr = child2_ptr; + child = child2; + } + } + + LASSERT(child->chn_index == child_idx); + + if (h->cbh_ops->hop_compare(e, child)) + break; + + child->chn_index = cur_idx; + *cur_ptr = child; + cur_ptr = child_ptr; + cur_idx = child_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Sort-inserts a node into the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 Element inserted successfully + * \retval != 0 error + */ +int +cfs_binheap_insert(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + struct cfs_binheap_node **new_ptr; + unsigned int new_idx = h->cbh_nelements; + int rc; + + if (new_idx == h->cbh_hwm) { + rc = cfs_binheap_grow(h); + if (rc != 0) + return rc; + } + + if (h->cbh_ops->hop_enter) { + rc = h->cbh_ops->hop_enter(h, e); + if (rc != 0) + return rc; + } + + e->chn_index = new_idx; + new_ptr = cfs_binheap_pointer(h, new_idx); + h->cbh_nelements++; + *new_ptr = e; + + cfs_binheap_bubble(h, e); + + return 0; +} +EXPORT_SYMBOL(cfs_binheap_insert); + +/** + * Removes a node from the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + */ +void +cfs_binheap_remove(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int cur_idx = e->chn_index; + struct cfs_binheap_node **cur_ptr; + struct cfs_binheap_node *last; + + LASSERT(cur_idx != CBH_POISON); + LASSERT(cur_idx < n); + + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + n--; + last = *cfs_binheap_pointer(h, n); + h->cbh_nelements = n; + if (last == e) + return; + + last->chn_index = cur_idx; + *cur_ptr = last; + cfs_binheap_relocate(h, *cur_ptr); + + e->chn_index = CBH_POISON; + if (h->cbh_ops->hop_exit) + h->cbh_ops->hop_exit(h, e); +} +EXPORT_SYMBOL(cfs_binheap_remove); + +/** + * Relocate a node in the binary heap. + * Should be called whenever a node's values + * which affects its ranking are changed. 
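+ *
+ * For example (hypothetical embedding), after updating a field that
+ * hop_compare() inspects:
+ *
+ *	req->rq_deadline = new_deadline;
+ *	cfs_binheap_relocate(heap, &req->rq_node);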
+ * + * \param[in] h The heap + * \param[in] e The node + */ +void +cfs_binheap_relocate(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + if (!cfs_binheap_bubble(h, e)) + cfs_binheap_sink(h, e); +} +EXPORT_SYMBOL(cfs_binheap_relocate); +/** @} heap */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c new file mode 100644 index 0000000000000..fff5a2217c6f5 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c @@ -0,0 +1,1320 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include + +/** Global CPU partition table */ +struct cfs_cpt_table *cfs_cpt_table __read_mostly; +EXPORT_SYMBOL(cfs_cpt_table); + +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); + +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. 
+ * + * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = "N"; +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); + +#ifdef CONFIG_SMP +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + struct cfs_cpt_table *cptab; + int i; + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (!cptab) + return NULL; + + cptab->ctb_nparts = ncpt; + + LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); + if (!cptab->ctb_cpumask) + goto failed_alloc_cpumask; + + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (!cptab->ctb_nodemask) + goto failed_alloc_nodemask; + + LIBCFS_ALLOC(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + if (!cptab->ctb_cpu2cpt) + goto failed_alloc_cpu2cpt; + + memset(cptab->ctb_cpu2cpt, -1, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + if (!cptab->ctb_node2cpt) + goto failed_alloc_node2cpt; + + memset(cptab->ctb_node2cpt, -1, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); + if (!cptab->ctb_parts) + goto failed_alloc_ctb_parts; + + memset(cptab->ctb_parts, -1, ncpt * sizeof(cptab->ctb_parts[0])); + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); + if (!part->cpt_cpumask) + goto failed_setting_ctb_parts; + + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (!part->cpt_nodemask) + goto failed_setting_ctb_parts; + + LIBCFS_ALLOC(part->cpt_distance, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); + if (!part->cpt_distance) + goto failed_setting_ctb_parts; + + memset(part->cpt_distance, -1, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); + } + + return cptab; + +failed_setting_ctb_parts: + while (i-- >= 0) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + + if (part->cpt_distance) { + LIBCFS_FREE(part->cpt_distance, + cptab->ctb_nparts * + sizeof(part->cpt_distance[0])); + } + } + + if (cptab->ctb_parts) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } +failed_alloc_ctb_parts: + if (cptab->ctb_node2cpt) { + LIBCFS_FREE(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + } +failed_alloc_node2cpt: + if (cptab->ctb_cpu2cpt) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + } +failed_alloc_cpu2cpt: + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); +failed_alloc_nodemask: + if (cptab->ctb_cpumask) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); +failed_alloc_cpumask: + LIBCFS_FREE(cptab, sizeof(*cptab)); + return NULL; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +void cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + int i; + + if (cptab->ctb_cpu2cpt) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + } + + if (cptab->ctb_node2cpt) { + LIBCFS_FREE(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + } + + for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + 
if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + + if (part->cpt_distance) { + LIBCFS_FREE(part->cpt_distance, + cptab->ctb_nparts * + sizeof(part->cpt_distance[0])); + } + } + + if (cptab->ctb_parts) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } + + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (cptab->ctb_cpumask) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; + + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, " %d", j); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; + + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for (j = 0; j < cptab->ctb_nparts; j++) { + rc = snprintf(tmp, len, " %d:%d", j, + cptab->ctb_parts[i].cpt_distance[j]); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; +} +EXPORT_SYMBOL(cfs_cpt_distance_print); + +int cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return cptab->ctb_nparts; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_weight(cptab->ctb_cpumask) : + cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_any_and(cptab->ctb_cpumask, + cpu_online_mask) < nr_cpu_ids : + cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, + cpu_online_mask) < nr_cpu_ids; +} +EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? 
+ cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) +{ + LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts)); + LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts)); + + if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY) + return cptab->ctb_distance; + + return cptab->ctb_parts[cpt1].cpt_distance[cpt2]; +} +EXPORT_SYMBOL(cfs_cpt_distance); + +/* + * Calculate the maximum NUMA distance between all nodes in the + * from_mask and all nodes in the to_mask. + */ +static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask, + nodemask_t *to_mask) +{ + unsigned int maximum; + unsigned int distance; + int from; + int to; + + maximum = 0; + for_each_node_mask(from, *from_mask) { + for_each_node_mask(to, *to_mask) { + distance = node_distance(from, to); + if (maximum < distance) + maximum = distance; + } + } + return maximum; +} + +static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cptab->ctb_cpu2cpt[cpu] = cpt; + + cpumask_set_cpu(cpu, cptab->ctb_cpumask); + cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); +} + +static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + cpumask_clear_cpu(cpu, cptab->ctb_cpumask); + + cptab->ctb_cpu2cpt[cpu] = -1; +} + +static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part; + + if (!node_isset(node, *cptab->ctb_nodemask)) { + unsigned int dist; + + /* first time node is added to the CPT table */ + node_set(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = cpt; + + dist = cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + cptab->ctb_distance = dist; + } + + part = &cptab->ctb_parts[cpt]; + if (!node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* first time node is added to this CPT */ + node_set(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } +} + +static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int cpu; + + for_each_cpu(cpu, part->cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* No more CPUs in the node for this CPT. */ + node_clear(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + if (node_isset(node, *part2->cpt_nodemask)) + cptab->ctb_node2cpt[node] = cpt2; + + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } + + for_each_cpu(cpu, cptab->ctb_cpumask) { + /* this CPT-table has other CPUs belonging to this node? 
*/ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) { + /* No more CPUs in the table for this node. */ + node_clear(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = -1; + cptab->ctb_distance = + cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + } +} + +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= nr_cpu_ids) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, + "Try to unset cpu %d which is not in CPT-table %p\n", + cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in CPU partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + int cpu; + + if (!cpumask_weight(mask) || + cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { + CDEBUG(D_INFO, + "No online CPU is found in the CPU mask for CPU partition %d\n", + cpt); + return 0; + } + + for_each_cpu(cpu, mask) { + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + int cpu; + + for_each_cpu(cpu, mask) { + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); + } +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_add_cpu(cptab, cpt, cpu); + + cfs_cpt_add_node(cptab, cpt, node); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + mask = cpumask_of_node(node); + + 
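+	/* drop every CPU of this node from the partition, then the node itself */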
for_each_cpu(cpu, mask) + cfs_cpt_del_cpu(cptab, cpt, cpu); + + cfs_cpt_del_node(cptab, cpt, node); +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_set_node(cptab, cpt, node); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_unset_node(cptab, cpt, node); +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *mask; + int weight; + unsigned int rotor; + int node = 0; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + node = cptab->ctb_parts[cpt].cpt_node; + } + + weight = nodes_weight(*mask); + if (weight > 0) { + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (!rotor--) + return node; + } + } + + return node; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + int cpu; + int cpt; + + preempt_disable(); + cpu = smp_processor_id(); + cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0 && remap) { + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID + */ + cpt = cpu % cptab->ctb_nparts; + } + preempt_enable(); + return cpt; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + LASSERT(cpu >= 0 && cpu < nr_cpu_ids); + + return cptab->ctb_cpu2cpt[cpu]; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) +{ + if (node < 0 || node > nr_node_ids) + return CFS_CPT_ANY; + + return cptab->ctb_node2cpt[node]; +} +EXPORT_SYMBOL(cfs_cpt_of_node); + +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *nodemask; + cpumask_t *cpumask; + int cpu; + int rc; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (!cpumask_intersects(cpumask, cpu_online_mask)) { + CDEBUG(D_INFO, + "No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", + cpt); + return -ENODEV; + } + + for_each_online_cpu(cpu) { + if (cpumask_test_cpu(cpu, cpumask)) + continue; + + rc = set_cpus_allowed_ptr(current, cpumask); + set_mems_allowed(*nodemask); + if (!rc) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. 
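+ * CPUs are consumed a whole core at a time (all sibling hyper-threads
+ * together) before moving to the next core of the same socket, so the
+ * threads of one core always land in the same partition.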
+ */ +static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node_mask, int number) +{ + cpumask_t *socket_mask = NULL; + cpumask_t *core_mask = NULL; + int rc = 0; + int cpu; + int i; + + LASSERT(number > 0); + + if (number >= cpumask_weight(node_mask)) { + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + cpumask_clear_cpu(cpu, node_mask); + + if (!cpu_online(cpu)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + } + return 0; + } + + /* allocate scratch buffer */ + LIBCFS_ALLOC(socket_mask, cpumask_size()); + LIBCFS_ALLOC(core_mask, cpumask_size()); + if (!socket_mask || !core_mask) { + rc = -ENOMEM; + goto out; + } + + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + + /* get cpumask for cores in the same socket */ + cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); + while (!cpumask_empty(socket_mask)) { + /* get cpumask for hts in the same core */ + cpumask_and(core_mask, topology_sibling_cpumask(cpu), + node_mask); + + for_each_cpu(i, core_mask) { + cpumask_clear_cpu(i, socket_mask); + cpumask_clear_cpu(i, node_mask); + + if (!cpu_online(i)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (!--number) + goto out; + } + cpu = cpumask_first(socket_mask); + } + } + +out: + if (core_mask) + LIBCFS_FREE(core_mask, cpumask_size()); + if (socket_mask) + LIBCFS_FREE(socket_mask, cpumask_size()); + return rc; +} + +#define CPT_WEIGHT_MIN 4 + +static int cfs_cpt_num_estimate(void) +{ + int nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); + int ncpu = num_online_cpus(); + int ncpt = 1; + + if (ncpu > CPT_WEIGHT_MIN) + for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++) + ; /* nothing */ + +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit system could consume + * too much memory + */ + ncpt = min(2, ncpt); +#endif + while (ncpu % ncpt) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_t *node_mask = NULL; + int cpt = 0; + int node; + int num; + int rem; + int rc = 0; + + num = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = num; + + if (ncpt > num_online_cpus()) { + rc = -EINVAL; + CERROR("libcfs: CPU partition count %d > cores %d: rc = %d\n", + ncpt, num_online_cpus(), rc); + goto failed; + } + + if (ncpt > 4 * num) { + CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", + ncpt, num); + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + rc = -ENOMEM; + goto failed; + } + + LIBCFS_ALLOC(node_mask, cpumask_size()); + if (!node_mask) { + CERROR("Failed to allocate scratch cpumask\n"); + rc = -ENOMEM; + goto failed; + } + + num = num_online_cpus() / ncpt; + rem = num_online_cpus() % ncpt; + for_each_online_node(node) { + cpumask_copy(node_mask, cpumask_of_node(node)); + + while (cpt < ncpt && !cpumask_empty(node_mask)) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int ncpu = cpumask_weight(part->cpt_cpumask); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, + (rem > 0) + num - ncpu); + if (rc < 0) { + rc = -EINVAL; + goto failed_mask; + } + + ncpu = cpumask_weight(part->cpt_cpumask); + if (ncpu == num + !!(rem > 0)) { + cpt++; + rem--; + } + } + } + + LIBCFS_FREE(node_mask, cpumask_size()); + 
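+	/* every online CPU is now assigned to one of the ncpt partitions */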
+ return cptab; + +failed_mask: + if (node_mask) + LIBCFS_FREE(node_mask, cpumask_size()); +failed: + CERROR("Failed (rc = %d) to setup CPU partition table with %d partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", + rc, ncpt, num_online_nodes(), num_online_cpus()); + + if (cptab) + cfs_cpt_table_free(cptab); + + return ERR_PTR(rc); +} + +static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) +{ + struct cfs_cpt_table *cptab; + char *pattern_dup; + char *bracket; + char *str; + int node = 0; + int ncpt = 0; + int cpt = 0; + int high; + int rc; + int c; + int i; + + pattern_dup = kstrdup(pattern, GFP_KERNEL); + if (!pattern_dup) { + CERROR("Failed to duplicate pattern '%s'\n", pattern); + return ERR_PTR(-ENOMEM); + } + + str = cfs_trimwhite(pattern_dup); + if (*str == 'n' || *str == 'N') { + str++; /* skip 'N' char */ + node = 1; /* NUMA pattern */ + if (*str == '\0') { + node = -1; + for_each_online_node(i) { + if (!cpumask_empty(cpumask_of_node(i))) + ncpt++; + } + if (ncpt == 1) { /* single NUMA node */ + kfree(pattern_dup); + return cfs_cpt_table_create(cpu_npartitions); + } + } + } + + if (!ncpt) { /* scanning bracket which is mark of partition */ + bracket = str; + while ((bracket = strchr(bracket, '['))) { + bracket++; + ncpt++; + } + } + + if (!ncpt || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern '%s', or too many partitions %d\n", + pattern_dup, ncpt); + rc = -EINVAL; + goto err_free_str; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU partition table\n"); + rc = -ENOMEM; + goto err_free_str; + } + + if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ + for_each_online_node(i) { + if (cpumask_empty(cpumask_of_node(i))) + continue; + + rc = cfs_cpt_set_node(cptab, cpt++, i); + if (!rc) { + rc = -EINVAL; + goto err_free_table; + } + } + kfree(pattern_dup); + return cptab; + } + + high = node ? nr_node_ids - 1 : nr_cpu_ids - 1; + + for (str = cfs_trimwhite(str), c = 0; /* until break */; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + int n; + + bracket = strchr(str, '['); + if (!bracket) { + if (*str) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } else if (c != ncpt) { + CERROR("Expect %d partitions but found %d\n", + ncpt, c); + rc = -EINVAL; + goto err_free_table; + } + break; + } + + if (sscanf(str, "%d%n", &cpt, &n) < 1) { + CERROR("Invalid CPU pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + rc = -EINVAL; + goto err_free_table; + } + + if (cfs_cpt_weight(cptab, cpt)) { + CERROR("Partition %d has already been set.\n", cpt); + rc = -EPERM; + goto err_free_table; + } + + str = cfs_trimwhite(str + n); + if (str != bracket) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + bracket = strchr(str, ']'); + if (!bracket) { + CERROR("Missing right bracket for partition %d in '%s'\n", + cpt, str); + rc = -EINVAL; + goto err_free_table; + } + + rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high, + &el); + if (rc) { + CERROR("Can't parse number range in '%s'\n", str); + rc = -ERANGE; + goto err_free_table; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride) + continue; + + rc = node ? 
cfs_cpt_set_node(cptab, cpt, i) + : cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + rc = -EINVAL; + goto err_free_table; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + rc = -ENODEV; + goto err_free_table; + } + + str = cfs_trimwhite(bracket + 1); + } + + kfree(pattern_dup); + return cptab; + +err_free_table: + cfs_cpt_table_free(cptab); +err_free_str: + kfree(pattern_dup); + return ERR_PTR(rc); +} + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE +static enum cpuhp_state lustre_cpu_online; + +static int cfs_cpu_online(unsigned int cpu) +{ + return 0; +} +#endif + +static int cfs_cpu_dead(unsigned int cpu) +{ + bool warn; + + /* if all HTs in a core are offline, it may break affinity */ + warn = cpumask_any_and(topology_sibling_cpumask(cpu), + cpu_online_mask) >= nr_cpu_ids; + CDEBUG(warn ? D_WARNING : D_INFO, + "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", + cpu); + return 0; +} + +#ifndef HAVE_HOTPLUG_STATE_MACHINE +static int cfs_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + default: + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { + CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", + cpu, action); + break; + } + + cfs_cpu_dead(cpu); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + +void cfs_cpu_fini(void) +{ + if (!IS_ERR_OR_NULL(cfs_cpt_table)) + cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ +} + +int cfs_cpu_init(void) +{ + int ret; + + LASSERT(!cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, + "fs/lustre/cfe:dead", NULL, + cfs_cpu_dead); + if (ret < 0) + goto failed_cpu_dead; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/lustre/cfe:online", + cfs_cpu_online, NULL); + if (ret < 0) + goto failed_cpu_online; + + lustre_cpu_online = ret; +#else + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + + cpus_read_lock(); + if (*cpu_pattern) { + cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); + if (IS_ERR(cfs_cpt_table)) { + CERROR("Failed to create cptab from pattern '%s'\n", + cpu_pattern); + ret = PTR_ERR(cfs_cpt_table); + goto failed_alloc_table; + } + + } else { + cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); + if (IS_ERR(cfs_cpt_table)) { + CERROR("Failed to create cptab with npartitions %d\n", + cpu_npartitions); + ret = PTR_ERR(cfs_cpt_table); + goto failed_alloc_table; + } + } + + cpus_read_unlock(); + + LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n", + num_online_nodes(), num_online_cpus(), + cfs_cpt_number(cfs_cpt_table)); + return 0; + +failed_alloc_table: + cpus_read_unlock(); + + if (!IS_ERR_OR_NULL(cfs_cpt_table)) + 
cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); +failed_cpu_online: + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +failed_cpu_dead: +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + return ret; +} + +#else /* ! CONFIG_SMP */ + +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + struct cfs_cpt_table *cptab; + + if (ncpt != 1) { + CERROR("Can't support cpu partition number %d\n", ncpt); + return NULL; + } + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (!cptab) + return NULL; + + cpumask_set_cpu(0, cptab->ctb_cpumask); + node_set(0, cptab->ctb_nodemask); + + return cptab; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc; + + rc = snprintf(buf, len, "0\t: 0\n"); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc; + + rc = snprintf(buf, len, "0\t: 0:1\n"); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_distance_print); + +void cfs_cpu_fini(void) +{ + if (cfs_cpt_table) { + cfs_cpt_table_free(cfs_cpt_table); + cfs_cpt_table = NULL; + } +} + +int cfs_cpu_init(void) +{ + cfs_cpt_table = cfs_cpt_table_alloc(1); + + return cfs_cpt_table ? 0 : -1; +} + +#endif /* !CONFIG_SMP */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c new file mode 100644 index 0000000000000..c6ba9e728b688 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** destroy cpu-partition lock, see libcfs_private.h for more detail */ +void +cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) +{ + LASSERT(pcl->pcl_locks != NULL); + LASSERT(!pcl->pcl_locked); + + cfs_percpt_free(pcl->pcl_locks); + LIBCFS_FREE(pcl, sizeof(*pcl)); +} +EXPORT_SYMBOL(cfs_percpt_lock_free); + +/** + * create cpu-partition lock, see libcfs_private.h for more detail. 
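+ *
+ * Illustrative usage sketch (hypothetical caller; error handling omitted,
+ * and passing NULL keys merely triggers the CWARN below):
+ *
+ *	struct cfs_percpt_lock *plock;
+ *
+ *	plock = cfs_percpt_lock_create(cfs_cpt_table, NULL);
+ *	cfs_percpt_lock(plock, cpt);	(take the private lock of one CPT)
+ *	cfs_percpt_unlock(plock, cpt);
+ *	cfs_percpt_lock(plock, CFS_PERCPT_LOCK_EX);	(exclusive, all CPTs)
+ *	cfs_percpt_unlock(plock, CFS_PERCPT_LOCK_EX);
+ *	cfs_percpt_lock_free(plock);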
+ * + * cpu-partition lock is designed for large-scale SMP system, so we need to + * reduce cacheline conflict as possible as we can, that's the + * reason we always allocate cacheline-aligned memory block. + */ +struct cfs_percpt_lock * +cfs_percpt_lock_create(struct cfs_cpt_table *cptab, + struct lock_class_key *keys) +{ + struct cfs_percpt_lock *pcl; + spinlock_t *lock; + int i; + + /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */ + LIBCFS_ALLOC(pcl, sizeof(*pcl)); + if (pcl == NULL) + return NULL; + + pcl->pcl_cptab = cptab; + pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock)); + if (pcl->pcl_locks == NULL) { + LIBCFS_FREE(pcl, sizeof(*pcl)); + return NULL; + } + + if (keys == NULL) { + CWARN("Cannot setup class key for percpt lock, you may see " + "recursive locking warnings which are actually fake.\n"); + } + + cfs_percpt_for_each(lock, i, pcl->pcl_locks) { + spin_lock_init(lock); + if (keys != NULL) + lockdep_set_class(lock, &keys[i]); + } + + return pcl; +} +EXPORT_SYMBOL(cfs_percpt_lock_create); + +/** + * lock a CPU partition + * + * \a index != CFS_PERCPT_LOCK_EX + * hold private lock indexed by \a index + * + * \a index == CFS_PERCPT_LOCK_EX + * exclusively lock @pcl and nobody can take private lock + */ +void +cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index) +__acquires(pcl->pcl_locks) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt); + + if (ncpt == 1) { + index = 0; + } else { /* serialize with exclusive lock */ + while (pcl->pcl_locked) + cpu_relax(); + } + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_lock(pcl->pcl_locks[index]); + return; + } + + /* exclusive lock request */ + for (i = 0; i < ncpt; i++) { + spin_lock(pcl->pcl_locks[i]); + if (i == 0) { + LASSERT(!pcl->pcl_locked); + /* nobody should take private lock after this + * so I wouldn't starve for too long time */ + pcl->pcl_locked = 1; + } + } +} +EXPORT_SYMBOL(cfs_percpt_lock); + +/** unlock a CPU partition */ +void +cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index) +__releases(pcl->pcl_locks) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + index = ncpt == 1 ? 0 : index; + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_unlock(pcl->pcl_locks[index]); + return; + } + + for (i = ncpt - 1; i >= 0; i--) { + if (i == 0) { + LASSERT(pcl->pcl_locked); + pcl->pcl_locked = 0; + } + spin_unlock(pcl->pcl_locks[i]); + } +} +EXPORT_SYMBOL(cfs_percpt_unlock); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c new file mode 100644 index 0000000000000..5f85219101eb0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c @@ -0,0 +1,235 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +struct cfs_var_array { + unsigned int va_count; /* # of buffers */ + unsigned int va_size; /* size of each var */ + struct cfs_cpt_table *va_cptab; /* cpu partition table */ + void *va_ptrs[0]; /* buffer addresses */ +}; + +/* + * free per-cpu data, see more detail in cfs_percpt_free + */ +void +cfs_percpt_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] != NULL) + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_percpt_free); + +/* + * allocate per cpu-partition variables, returned value is an array of pointers, + * variable can be indexed by CPU partition ID, i.e: + * + * arr = cfs_percpt_alloc(cfs_cpu_pt, size); + * then caller can access memory block for CPU 0 by arr[0], + * memory block for CPU 1 by arr[1]... + * memory block for CPU N by arr[N]... + * + * cacheline aligned. + */ +void * +cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) +{ + struct cfs_var_array *arr; + int count; + int i; + + count = cfs_cpt_number(cptab); + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_size = size = L1_CACHE_ALIGN(size); + arr->va_count = count; + arr->va_cptab = cptab; + + for (i = 0; i < count; i++) { + LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); + if (arr->va_ptrs[i] == NULL) { + cfs_percpt_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_percpt_alloc); + +/* + * return number of CPUs (or number of elements in per-cpu data) + * according to cptab of @vars + */ +int +cfs_percpt_number(void *vars) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + return arr->va_count; +} +EXPORT_SYMBOL(cfs_percpt_number); + +/* + * free variable array, see more detail in cfs_array_alloc + */ +void +cfs_array_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] == NULL) + continue; + + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_array_free); + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by @count, @size is size of each + * memory block in array. 
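+ *
+ * Illustrative usage (hypothetical sizes and names):
+ *
+ *	void **bufs = cfs_array_alloc(16, 4096);
+ *
+ *	if (bufs != NULL) {
+ *		... use bufs[0] .. bufs[15] ...
+ *		cfs_array_free(bufs);
+ *	}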
+ */ +void * +cfs_array_alloc(int count, unsigned int size) +{ + struct cfs_var_array *arr; + int i; + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_count = count; + arr->va_size = size; + + for (i = 0; i < count; i++) { + LIBCFS_ALLOC(arr->va_ptrs[i], size); + + if (arr->va_ptrs[i] == NULL) { + cfs_array_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_array_alloc); + +#ifdef HAVE_LIBCFS_VFREE_ATOMIC +#include +/* + * This is opencoding of vfree_atomic from Linux kernel added in 4.10 with + * minimum changes needed to work on some older kernels too. + * For RHEL6, just use vfree() directly since it is missing too much code. + */ + +#ifndef raw_cpu_ptr +#define raw_cpu_ptr(p) __this_cpu_ptr(p) +#endif + +#ifndef llist_for_each_safe +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) +#endif + +struct vfree_deferred { + struct llist_head list; + struct work_struct wq; +}; +static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); + +static void free_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + vfree((void *)llnode); +} + +void libcfs_vfree_atomic(const void *addr) +{ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (!addr) + return; + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} +EXPORT_SYMBOL(libcfs_vfree_atomic); + +void __init init_libcfs_vfree_atomic(void) +{ + int i; + + for_each_possible_cpu(i) { + struct vfree_deferred *p; + + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, free_work); + } +} + +void __exit exit_libcfs_vfree_atomic(void) +{ + flush_scheduled_work(); +} +#endif /* HAVE_LIBCFS_VFREE_ATOMIC */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c new file mode 100644 index 0000000000000..b460df3c4d9bc --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c @@ -0,0 +1,628 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * String manipulation functions. 
+ * + * libcfs/libcfs/libcfs_string.c + * + * Author: Nathan Rutman + */ + +#include +#include +#include + +char *cfs_strrstr(const char *haystack, const char *needle) +{ + char *ptr; + + if (unlikely(haystack == NULL || needle == NULL)) + return NULL; + + if (strlen(needle) == 1) + return strrchr(haystack, needle[0]); + + ptr = strstr(haystack, needle); + if (ptr != NULL) { + while (1) { + char *tmp; + + tmp = strstr(&ptr[1], needle); + if (tmp == NULL) + return ptr; + + ptr = tmp; + } + } + + return NULL; +} +EXPORT_SYMBOL(cfs_strrstr); + +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask) +{ + const char *debugstr; + char op = 0; + int newmask = minmask, i, len, found = 0; + ENTRY; + + /* must be a list of tokens separated by whitespace + * and optionally an operator ('+' or '-'). If an operator + * appears first in , '*oldmask' is used as the starting point + * (relative), otherwise minmask is used (absolute). An operator + * applies to all following tokens up to the next operator. */ + while (*str != 0) { + while (isspace(*str)) + str++; + if (*str == 0) + break; + if (*str == '+' || *str == '-') { + op = *str++; + if (!found) + /* only if first token is relative */ + newmask = *oldmask; + while (isspace(*str)) + str++; + if (*str == 0) /* trailing op */ + return -EINVAL; + } + + /* find token length */ + for (len = 0; str[len] != 0 && !isspace(str[len]) && + str[len] != '+' && str[len] != '-'; len++); + + /* match token */ + found = 0; + for (i = 0; i < 32; i++) { + debugstr = bit2str(i); + if (debugstr != NULL && + strlen(debugstr) == len && + strncasecmp(str, debugstr, len) == 0) { + if (op == '-') + newmask &= ~(1 << i); + else + newmask |= (1 << i); + found = 1; + break; + } + } + if (!found && len == 3 && + (strncasecmp(str, "ALL", len) == 0)) { + if (op == '-') + newmask = minmask; + else + newmask = allmask; + found = 1; + } + if (!found) { + CWARN("unknown mask '%.*s'.\n" + "mask usage: [+|-] ...\n", len, str); + return -EINVAL; + } + str += len; + } + + *oldmask = newmask; + return 0; +} +EXPORT_SYMBOL(cfs_str2mask); + +/* get the first string out of @str */ +char *cfs_firststr(char *str, size_t size) +{ + size_t i = 0; + char *end; + + /* trim leading spaces */ + while (i < size && *str && isspace(*str)) { + ++i; + ++str; + } + + /* string with all spaces */ + if (*str == '\0') + goto out; + + end = str; + while (i < size && *end != '\0' && !isspace(*end)) { + ++i; + ++end; + } + + *end= '\0'; +out: + return str; +} +EXPORT_SYMBOL(cfs_firststr); + +char * +cfs_trimwhite(char *str) +{ + char *end; + + while (isspace(*str)) + str++; + + end = str + strlen(str); + while (end > str) { + if (!isspace(end[-1])) + break; + end--; + } + + *end = 0; + return str; +} +EXPORT_SYMBOL(cfs_trimwhite); + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. 
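+ *
+ * For example, with \a next holding "0-7/2, 10" and \a delim set to ',',
+ * \a res is set to the token "0-7/2" (surrounding whitespace stripped) and
+ * \a next is advanced to just past the comma.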
+ * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} +EXPORT_SYMBOL(cfs_gettok); + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. + * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + bool all_numbers = true; + char *endp, cache; + int len; + int rc; + + endp = strim(str); + /** + * kstrouint can only handle strings composed + * of only numbers. We need to scan the string + * passed in for the first non-digit character + * and end the string at that location. If we + * don't find any non-digit character we still + * need to place a '\0' at position len since + * we are not interested in the rest of the + * string which is longer than len in size. + * After we are done the character at the + * position we placed '\0' must be restored. + */ + len = min((int)strlen(endp), nob); + for (; endp < str + len; endp++) { + if (!isxdigit(*endp) && *endp != '-' && + *endp != '+') { + all_numbers = false; + break; + } + } + + /* Eat trailing space */ + if (!all_numbers && isspace(*endp)) { + all_numbers = true; + endp--; + } + + cache = *endp; + *endp = '\0'; + + rc = kstrtouint(str, 0, num); + *endp = cache; + if (rc || !all_numbers) + return 0; + + return (*num >= min && *num <= max); +} +EXPORT_SYMBOL(cfs_str2num_check); + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + `* src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. 
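+ *
+ * For example, "7" parses to a single value and "*" to the whole
+ * [\a min, \a max] interval; with \a bracketed set, "0-7" parses to a range
+ * and "0-7/2" to a strided range covering 0, 2, 4 and 6.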
+ */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + LIBCFS_ALLOC(re, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + LIBCFS_FREE(re, sizeof(*re)); + return -EINVAL; +} + +/** + * Print the range expression \a re into specified \a buffer. + * If \a bracketed is true, expression does not need additional + * brackets. + * + * \retval number of characters written + */ +static int +cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr, + bool bracketed) +{ + int i; + char s[] = "["; + char e[] = "]"; + + if (bracketed) + s[0] = e[0] = '\0'; + + if (expr->re_lo == expr->re_hi) + i = scnprintf(buffer, count, "%u", expr->re_lo); + else if (expr->re_stride == 1) + i = scnprintf(buffer, count, "%s%u-%u%s", + s, expr->re_lo, expr->re_hi, e); + else + i = scnprintf(buffer, count, "%s%u-%u/%u%s", + s, expr->re_lo, expr->re_hi, + expr->re_stride, e); + return i; +} + +/** + * Print a list of range expressions (\a expr_list) into specified \a buffer. + * If the list contains several expressions, separate them with comma + * and surround the list with brackets. + * + * \retval number of characters written + */ +int +cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + int i = 0, j = 0; + int numexprs = 0; + + if (count <= 0) + return 0; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) + numexprs++; + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "["); + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (j++ != 0) + i += scnprintf(buffer + i, count - i, ","); + i += cfs_range_expr_print(buffer + i, count - i, expr, + numexprs > 1); + } + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "]"); + + return i; +} +EXPORT_SYMBOL(cfs_expr_list_print); + +/** + * Matches value (\a value) against ranges expression list \a expr_list. 
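+ *
+ * For example, against a list parsed from "[0-7/2,10]" the values 0, 2, 4,
+ * 6 and 10 match, while 1, 8 and 11 do not.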
+ * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} +EXPORT_SYMBOL(cfs_expr_list_match); + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) { + CERROR("Number of values %d exceeds max allowed %d\n", + max, count); + return -EINVAL; + } + + LIBCFS_ALLOC(val, sizeof(val[0]) * count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} +EXPORT_SYMBOL(cfs_expr_list_values); + +void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + LIBCFS_FREE(values, num * sizeof(values[0])); +} +EXPORT_SYMBOL(cfs_expr_list_values_free); + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link); + list_del(&expr->re_link); + LIBCFS_FREE(expr, sizeof(*expr)); + } + + LIBCFS_FREE(expr_list, sizeof(*expr_list)); +} +EXPORT_SYMBOL(cfs_expr_list_free); + +/** + * Parses \ token of the syntax. + * + * \retval 0 if \a str parses to \ | \ + * \retval -errno otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + LIBCFS_ALLOC(expr_list, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) { + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} +EXPORT_SYMBOL(cfs_expr_list_parse); + +/** + * Frees cfs_expr_list structures of \a list. 
+ * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. + * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} +EXPORT_SYMBOL(cfs_expr_list_free_list); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32-pclmul_asm.S b/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32-pclmul_asm.S new file mode 100644 index 0000000000000..ede54c7084d4d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32-pclmul_asm.S @@ -0,0 +1,243 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. 
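+ * The buffer is folded 64 bytes at a time with carry-less multiplications
+ * against the precomputed constants below, and the remaining 128 bits are
+ * then reduced to the final 32-bit CRC with a Barrett reduction.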
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas + * Alexander Boyko + */ + +#define __ASSEMBLY__ 1 +#include "inst.h" + +.align 16 +/* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ +.Lconstant_R2R1: + .octa 0x00000001c6e415960000000154442bd4 +/* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ +.Lconstant_R4R3: + .octa 0x00000000ccaa009e00000001751997d0 +/* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ +.Lconstant_R5: + .octa 0x00000000000000000000000163cd6124 +.Lconstant_mask32: + .octa 0x000000000000000000000000FFFFFFFF +/* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ +.Lconstant_RUpoly: + .octa 0x00000001F701164100000001DB710641 + +#define CONSTANT %xmm0 + +#ifdef __x86_64__ +#define BUF %rdi +#define LEN %rsi +#define CRC %edx +#else +#define BUF %eax +#define LEN %edx +#define CRC %ecx +#endif + +.text +/** + * Calculate crc32 + * BUF - buffer (16 bytes aligned) + * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pclmul_le_16(unsigned char const *buffer, + * size_t len, uint crc32) + */ +.globl crc32_pclmul_le_16 +.align 4, 0x90 +crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */ + movdqa (BUF), %xmm1 + movdqa 0x10(BUF), %xmm2 + movdqa 0x20(BUF), %xmm3 + movdqa 0x30(BUF), %xmm4 + movd CRC, CONSTANT + pxor CONSTANT, %xmm1 + sub $0x40, LEN + add $0x40, BUF +#ifndef __x86_64__ + /* This is for position independed code(-fPIC) support for 32bit */ + call delta +delta: + pop %ecx +#endif + cmp $0x40, LEN + jb less_64 + +#ifdef __x86_64__ + movdqa .Lconstant_R2R1(%rip), CONSTANT +#else + movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT +#endif + +loop_64:/* 64 bytes Full cache line folding */ + prefetchnta 0x40(BUF) + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 +#ifdef __x86_64__ + movdqa %xmm4, %xmm8 +#endif + PCLMULQDQ 00, CONSTANT, %xmm1 + PCLMULQDQ 00, CONSTANT, %xmm2 + PCLMULQDQ 00, CONSTANT, %xmm3 +#ifdef __x86_64__ + PCLMULQDQ 00, CONSTANT, %xmm4 +#endif + PCLMULQDQ 0x11, CONSTANT, %xmm5 + PCLMULQDQ 0x11, CONSTANT, %xmm6 + PCLMULQDQ 0x11, CONSTANT, %xmm7 +#ifdef __x86_64__ + PCLMULQDQ 0x11, CONSTANT, %xmm8 +#endif + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 +#ifdef __x86_64__ + pxor %xmm8, %xmm4 +#else + /* xmm8 unsupported for x32 */ + movdqa %xmm4, %xmm5 + PCLMULQDQ 00, CONSTANT, %xmm4 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm4 +#endif + + pxor (BUF), %xmm1 + pxor 0x10(BUF), %xmm2 + pxor 0x20(BUF), %xmm3 + pxor 0x30(BUF), %xmm4 + + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jge loop_64 +less_64:/* Folding cache line into 128bit */ +#ifdef __x86_64__ + movdqa .Lconstant_R4R3(%rip), CONSTANT +#else + movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT +#endif + prefetchnta (BUF) + + movdqa %xmm1, %xmm5 + 
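+	/* fold the four 128-bit accumulators (xmm1-xmm4) down into xmm1 */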
PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm2, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm3, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + + cmp $0x10, LEN + jb fold_64 +loop_16:/* Folding rest buffer into 128bit */ + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor (BUF), %xmm1 + sub $0x10, LEN + add $0x10, BUF + cmp $0x10, LEN + jge loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + psrldq $0x08, %xmm1 + pxor CONSTANT, %xmm1 + + /* final 32-bit fold */ + movdqa %xmm1, %xmm2 +#ifdef __x86_64__ + movdqa .Lconstant_R5(%rip), CONSTANT + movdqa .Lconstant_mask32(%rip), %xmm3 +#else + movdqa .Lconstant_R5 - delta(%ecx), CONSTANT + movdqa .Lconstant_mask32 - delta(%ecx), %xmm3 +#endif + psrldq $0x04, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +#ifdef __x86_64__ + movdqa .Lconstant_RUpoly(%rip), CONSTANT +#else + movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT +#endif + movdqa %xmm1, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x10, CONSTANT, %xmm1 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + PEXTRD 0x01, %xmm1, %eax + + ret diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S b/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S new file mode 100644 index 0000000000000..5c896b95024ea --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S @@ -0,0 +1,466 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white papers on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. + * + * Authors: + * Wajdi Feghali + * James Guilford + * David Cote + * Tim Chen + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "inst.h" + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JMPTBL_ENTRY i +.word crc_\i - crc_array +.endm + +.macro JNC_LESS_THAN j + jnc less_than_\j +.endm + +# Define threshold where buffers are considered "small" and routed to more +# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +# SMALL_SIZE can be no larger than 255. + +#define SMALL_SIZE 200 + +.if (SMALL_SIZE > 255) +.error "SMALL_SIZE must be < 256" +.endif + +# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + +ENTRY(crc_pcl) +#define bufp %rdi +#define bufp_dw %edi +#define bufp_w %di +#define bufp_b %dil +#define bufptmp %rcx +#define block_0 %rcx +#define block_1 %rdx +#define block_2 %r11 +#define len %rsi +#define len_dw %esi +#define len_w %si +#define len_b %sil +#define crc_init_arg %rdx +#define tmp %rbx +#define crc_init %r8 +#define crc_init_dw %r8d +#define crc1 %r9 +#define crc2 %r10 + + pushq %rbx + pushq %rdi + pushq %rsi + + ## Move crc_init for Linux to a different + mov crc_init_arg, crc_init + + ################################################################ + ## 1) ALIGN: + ################################################################ + + mov bufp, bufptmp # rdi = *buf + neg bufp + and $7, bufp # calculate the unalignment amount of + # the address + je proc_block # Skip if aligned + + ## If len is less than 8 and we're unaligned, we need to jump + ## to special code to avoid reading beyond the end of the buffer + cmp $8, len + jae do_align + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $32-3+1, len_dw + jmp less_than_8_post_shl1 + +do_align: + #### Calculate CRC of unaligned bytes of the buffer (if any) + movq (bufptmp), tmp # load a quadward from the buffer + add bufp, bufptmp # align buffer pointer for quadword + # processing + sub bufp, len # update buffer length +align_loop: + crc32b %bl, crc_init_dw # compute crc32 of 1-byte + shr $8, tmp # get next byte + dec bufp + jne align_loop + +proc_block: + + ################################################################ + ## 2) PROCESS BLOCKS: + ################################################################ + + ## compute num of bytes to be processed + movq len, tmp # save num bytes in tmp + + cmpq $128*24, len + jae full_block + +continue_block: + cmpq $SMALL_SIZE, len + jb small + + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) + mul len_dw + shrq $16, %rax + + ## eax contains floor(bytes / 24) = num 24-byte chunks to do + + ## process rax 24-byte chunks (128 >= rax >= 0) + + ## compute end address of each block + ## block 0 (base addr + RAX * 8) + ## block 1 (base addr + RAX * 16) + ## block 2 (base addr + RAX * 24) + lea (bufptmp, %rax, 8), block_0 + lea (block_0, %rax, 8), block_1 + lea (block_1, %rax, 8), block_2 + + xor crc1, crc1 + xor crc2, crc2 + + ## branch into array + lea jump_table(%rip), bufp + movzxw (bufp, %rax, 2), len + offset=crc_array-jump_table + lea offset(bufp, len, 1), bufp + jmp *bufp + + ################################################################ + ## 2a) PROCESS FULL BLOCKS: + 
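+	## (taken when len >= 128*24: split the data into three 128*8 byte
+	##  streams and run three independent crc32q chains that are merged
+	##  with PCLMULQDQ in step 4)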
################################################################ +full_block: + movq $128,%rax + lea 128*8*2(block_0), block_1 + lea 128*8*3(block_0), block_2 + add $128*8*1, block_0 + + xor crc1,crc1 + xor crc2,crc2 + + # Fall thruogh into top of crc array (crc_128) + + ################################################################ + ## 3) CRC Array: + ################################################################ + +crc_array: + i=128 +.rept 128-1 +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 + crc32q -i*8(block_2), crc2 + i=(i-1) +.endr + +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 +# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet + + mov block_2, block_0 + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-16)(%rip), bufp # first entry is for idx 1 + shlq $3, %rax # rax *= 8 + subq %rax, tmp # tmp -= rax*8 + shlq $1, %rax + subq %rax, tmp # tmp -= rax*16 + # (total tmp -= rax*24) + addq %rax, bufp + + movdqa (bufp), %xmm0 # 2 consts: K1:K2 + + movq crc_init, %xmm1 # CRC for block 1 + PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor -i*8(block_2), %rax + mov crc2, crc_init + crc32 %rax, crc_init + +################################################################ +## 5) Check for end: +################################################################ + +LABEL crc_ 0 + mov tmp, len + cmp $128*24, tmp + jae full_block + cmp $24, tmp + jae continue_block + +less_than_24: + shl $32-4, len_dw # less_than_16 expects length + # in upper 4 bits of len_dw + jnc less_than_16 + crc32q (bufptmp), crc_init + crc32q 8(bufptmp), crc_init + jz do_return + add $16, bufptmp + # len is less than 8 if we got here + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $2, len_dw + jmp less_than_8_post_shl1 + + ####################################################################### + ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) + ####################################################################### +small: + shl $32-8, len_dw # Prepare len_dw for less_than_256 + j=256 +.rept 5 # j = {256, 128, 64, 32, 16} +.altmacro +LABEL less_than_ %j # less_than_j: Length should be in + # upper lg(j) bits of len_dw + j=(j/2) + shl $1, len_dw # Get next MSB + JNC_LESS_THAN %j +.noaltmacro + i=0 +.rept (j/8) + crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data + i=i+8 +.endr + jz do_return # Return if remaining length is zero + add $j, bufptmp # Advance buf +.endr + +less_than_8: # Length should be stored in + # upper 3 bits of len_dw + shl $1, len_dw +less_than_8_post_shl1: + jnc less_than_4 + crc32l (bufptmp), crc_init_dw # CRC of 4 bytes + jz do_return # return if remaining data is zero + add $4, bufptmp +less_than_4: # Length should be stored in + # upper 2 bits of len_dw + shl $1, len_dw + jnc less_than_2 + crc32w (bufptmp), crc_init_dw # CRC of 2 bytes + jz do_return # return if remaining data is zero + add $2, bufptmp +less_than_2: # Length should be stored in the MSB + # of len_dw + shl $1, len_dw + jnc less_than_1 + crc32b (bufptmp), crc_init_dw # CRC of 1 byte +less_than_1: # Length should be zero +do_return: + movq 
crc_init, %rax + popq %rsi + popq %rdi + popq %rbx + ret + + ################################################################ + ## jump table Table is 129 entries x 2 bytes each + ################################################################ +.align 4 +jump_table: + i=0 +.rept 129 +.altmacro +JMPTBL_ENTRY %i +.noaltmacro + i=i+1 +.endr + +ENDPROC(crc_pcl) + + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 quad words each + ################################################################ +.data +.align 64 +K_table: + .quad 0x14cd00bd6,0x105ec76f0 + .quad 0x0ba4fc28e,0x14cd00bd6 + .quad 0x1d82c63da,0x0f20c0dfe + .quad 0x09e4addf8,0x0ba4fc28e + .quad 0x039d3b296,0x1384aa63a + .quad 0x102f9b8a2,0x1d82c63da + .quad 0x14237f5e6,0x01c291d04 + .quad 0x00d3b6092,0x09e4addf8 + .quad 0x0c96cfdc0,0x0740eef02 + .quad 0x18266e456,0x039d3b296 + .quad 0x0daece73e,0x0083a6eec + .quad 0x0ab7aff2a,0x102f9b8a2 + .quad 0x1248ea574,0x1c1733996 + .quad 0x083348832,0x14237f5e6 + .quad 0x12c743124,0x02ad91c30 + .quad 0x0b9e02b86,0x00d3b6092 + .quad 0x018b33a4e,0x06992cea2 + .quad 0x1b331e26a,0x0c96cfdc0 + .quad 0x17d35ba46,0x07e908048 + .quad 0x1bf2e8b8a,0x18266e456 + .quad 0x1a3e0968a,0x11ed1f9d8 + .quad 0x0ce7f39f4,0x0daece73e + .quad 0x061d82e56,0x0f1d0f55e + .quad 0x0d270f1a2,0x0ab7aff2a + .quad 0x1c3f5f66c,0x0a87ab8a8 + .quad 0x12ed0daac,0x1248ea574 + .quad 0x065863b64,0x08462d800 + .quad 0x11eef4f8e,0x083348832 + .quad 0x1ee54f54c,0x071d111a8 + .quad 0x0b3e32c28,0x12c743124 + .quad 0x0064f7f26,0x0ffd852c6 + .quad 0x0dd7e3b0c,0x0b9e02b86 + .quad 0x0f285651c,0x0dcb17aa4 + .quad 0x010746f3c,0x018b33a4e + .quad 0x1c24afea4,0x0f37c5aee + .quad 0x0271d9844,0x1b331e26a + .quad 0x08e766a0c,0x06051d5a2 + .quad 0x093a5f730,0x17d35ba46 + .quad 0x06cb08e5c,0x11d5ca20e + .quad 0x06b749fb2,0x1bf2e8b8a + .quad 0x1167f94f2,0x021f3d99c + .quad 0x0cec3662e,0x1a3e0968a + .quad 0x19329634a,0x08f158014 + .quad 0x0e6fc4e6a,0x0ce7f39f4 + .quad 0x08227bb8a,0x1a5e82106 + .quad 0x0b0cd4768,0x061d82e56 + .quad 0x13c2b89c4,0x188815ab2 + .quad 0x0d7a4825c,0x0d270f1a2 + .quad 0x10f5ff2ba,0x105405f3e + .quad 0x00167d312,0x1c3f5f66c + .quad 0x0f6076544,0x0e9adf796 + .quad 0x026f6a60a,0x12ed0daac + .quad 0x1a2adb74e,0x096638b34 + .quad 0x19d34af3a,0x065863b64 + .quad 0x049c3cc9c,0x1e50585a0 + .quad 0x068bce87a,0x11eef4f8e + .quad 0x1524fa6c6,0x19f1c69dc + .quad 0x16cba8aca,0x1ee54f54c + .quad 0x042d98888,0x12913343e + .quad 0x1329d9f7e,0x0b3e32c28 + .quad 0x1b1c69528,0x088f25a3a + .quad 0x02178513a,0x0064f7f26 + .quad 0x0e0ac139e,0x04e36f0b0 + .quad 0x0170076fa,0x0dd7e3b0c + .quad 0x141a1a2e2,0x0bd6f81f8 + .quad 0x16ad828b4,0x0f285651c + .quad 0x041d17b64,0x19425cbba + .quad 0x1fae1cc66,0x010746f3c + .quad 0x1a75b4b00,0x18db37e8a + .quad 0x0f872e54c,0x1c24afea4 + .quad 0x01e41e9fc,0x04c144932 + .quad 0x086d8e4d2,0x0271d9844 + .quad 0x160f7af7a,0x052148f02 + .quad 0x05bb8f1bc,0x08e766a0c + .quad 0x0a90fd27a,0x0a3c6f37a + .quad 0x0b3af077a,0x093a5f730 + .quad 0x04984d782,0x1d22c238e + .quad 0x0ca6ef3ac,0x06cb08e5c + .quad 0x0234e0b26,0x063ded06a + .quad 0x1d88abd4a,0x06b749fb2 + .quad 0x04597456a,0x04d56973c + .quad 0x0e9e28eb4,0x1167f94f2 + .quad 0x07b3ff57a,0x19385bf2e + .quad 0x0c9c8b782,0x0cec3662e + .quad 0x13a9cba9e,0x0e417f38a + .quad 0x093e106a4,0x19329634a + .quad 0x167001a9c,0x14e727980 + .quad 0x1ddffc5d4,0x0e6fc4e6a + .quad 0x00df04680,0x0d104b8fc + .quad 0x02342001e,0x08227bb8a + .quad 0x00a2a8d7e,0x05b397730 + .quad 0x168763fa6,0x0b0cd4768 + .quad 
0x1ed5a407a,0x0e78eb416 + .quad 0x0d2c3ed1a,0x13c2b89c4 + .quad 0x0995a5724,0x1641378f0 + .quad 0x19b1afbc4,0x0d7a4825c + .quad 0x109ffedc0,0x08d96551c + .quad 0x0f2271e60,0x10f5ff2ba + .quad 0x00b0bf8ca,0x00bf80dd2 + .quad 0x123888b7a,0x00167d312 + .quad 0x1e888f7dc,0x18dcddd1c + .quad 0x002ee03b2,0x0f6076544 + .quad 0x183e8d8fe,0x06a45d2b2 + .quad 0x133d7a042,0x026f6a60a + .quad 0x116b0f50c,0x1dd3e10e8 + .quad 0x05fabe670,0x1a2adb74e + .quad 0x130004488,0x0de87806c + .quad 0x000bcf5f6,0x19d34af3a + .quad 0x18f0c7078,0x014338754 + .quad 0x017f27698,0x049c3cc9c + .quad 0x058ca5f00,0x15e3e77ee + .quad 0x1af900c24,0x068bce87a + .quad 0x0b5cfca28,0x0dd07448e + .quad 0x0ded288f8,0x1524fa6c6 + .quad 0x059f229bc,0x1d8048348 + .quad 0x06d390dec,0x16cba8aca + .quad 0x037170390,0x0a3e3e02c + .quad 0x06353c1cc,0x042d98888 + .quad 0x0c4584f5c,0x0d73c7bea + .quad 0x1f16a3418,0x1329d9f7e + .quad 0x0531377e2,0x185137662 + .quad 0x1d8d9ca7c,0x1b1c69528 + .quad 0x0b25b29f2,0x18a08b5bc + .quad 0x19fb2a8b0,0x02178513a + .quad 0x1a08fe6ac,0x1da758ae0 + .quad 0x045cddf4e,0x0e0ac139e + .quad 0x1a91647f2,0x169cf9eb0 + .quad 0x1a0f717c4,0x0170076fa diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/inst.h b/drivers/staging/lustrefsx/libcfs/libcfs/linux/inst.h new file mode 100644 index 0000000000000..3e115273ed885 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/inst.h @@ -0,0 +1,310 @@ +/* + * Generate .byte code for some instructions not supported by old + * binutils. + */ +#ifndef X86_ASM_INST_H +#define X86_ASM_INST_H + +#ifdef __ASSEMBLY__ + +#define REG_NUM_INVALID 100 + +#define REG_TYPE_R32 0 +#define REG_TYPE_R64 1 +#define REG_TYPE_XMM 2 +#define REG_TYPE_INVALID 100 + + .macro R32_NUM opd r32 + \opd = REG_NUM_INVALID + .ifc \r32,%eax + \opd = 0 + .endif + .ifc \r32,%ecx + \opd = 1 + .endif + .ifc \r32,%edx + \opd = 2 + .endif + .ifc \r32,%ebx + \opd = 3 + .endif + .ifc \r32,%esp + \opd = 4 + .endif + .ifc \r32,%ebp + \opd = 5 + .endif + .ifc \r32,%esi + \opd = 6 + .endif + .ifc \r32,%edi + \opd = 7 + .endif +#ifdef CONFIG_X86_64 + .ifc \r32,%r8d + \opd = 8 + .endif + .ifc \r32,%r9d + \opd = 9 + .endif + .ifc \r32,%r10d + \opd = 10 + .endif + .ifc \r32,%r11d + \opd = 11 + .endif + .ifc \r32,%r12d + \opd = 12 + .endif + .ifc \r32,%r13d + \opd = 13 + .endif + .ifc \r32,%r14d + \opd = 14 + .endif + .ifc \r32,%r15d + \opd = 15 + .endif +#endif + .endm + + .macro R64_NUM opd r64 + \opd = REG_NUM_INVALID +#ifdef CONFIG_X86_64 + .ifc \r64,%rax + \opd = 0 + .endif + .ifc \r64,%rcx + \opd = 1 + .endif + .ifc \r64,%rdx + \opd = 2 + .endif + .ifc \r64,%rbx + \opd = 3 + .endif + .ifc \r64,%rsp + \opd = 4 + .endif + .ifc \r64,%rbp + \opd = 5 + .endif + .ifc \r64,%rsi + \opd = 6 + .endif + .ifc \r64,%rdi + \opd = 7 + .endif + .ifc \r64,%r8 + \opd = 8 + .endif + .ifc \r64,%r9 + \opd = 9 + .endif + .ifc \r64,%r10 + \opd = 10 + .endif + .ifc \r64,%r11 + \opd = 11 + .endif + .ifc \r64,%r12 + \opd = 12 + .endif + .ifc \r64,%r13 + \opd = 13 + .endif + .ifc \r64,%r14 + \opd = 14 + .endif + .ifc \r64,%r15 + \opd = 15 + .endif +#endif + .endm + + .macro XMM_NUM opd xmm + \opd = REG_NUM_INVALID + .ifc \xmm,%xmm0 + \opd = 0 + .endif + .ifc \xmm,%xmm1 + \opd = 1 + .endif + .ifc \xmm,%xmm2 + \opd = 2 + .endif + .ifc \xmm,%xmm3 + \opd = 3 + .endif + .ifc \xmm,%xmm4 + \opd = 4 + .endif + .ifc \xmm,%xmm5 + \opd = 5 + .endif + .ifc \xmm,%xmm6 + \opd = 6 + .endif + .ifc \xmm,%xmm7 + \opd = 7 + .endif + .ifc \xmm,%xmm8 + \opd = 8 + .endif + .ifc \xmm,%xmm9 + \opd = 9 + .endif + .ifc \xmm,%xmm10 + \opd = 10 + 
.endif + .ifc \xmm,%xmm11 + \opd = 11 + .endif + .ifc \xmm,%xmm12 + \opd = 12 + .endif + .ifc \xmm,%xmm13 + \opd = 13 + .endif + .ifc \xmm,%xmm14 + \opd = 14 + .endif + .ifc \xmm,%xmm15 + \opd = 15 + .endif + .endm + + .macro REG_TYPE type reg + R32_NUM reg_type_r32 \reg + R64_NUM reg_type_r64 \reg + XMM_NUM reg_type_xmm \reg + .if reg_type_r64 <> REG_NUM_INVALID + \type = REG_TYPE_R64 + .elseif reg_type_r32 <> REG_NUM_INVALID + \type = REG_TYPE_R32 + .elseif reg_type_xmm <> REG_NUM_INVALID + \type = REG_TYPE_XMM + .else + \type = REG_TYPE_INVALID + .endif + .endm + + .macro PFX_OPD_SIZE + .byte 0x66 + .endm + + .macro PFX_REX opd1 opd2 W=0 + .if ((\opd1 | \opd2) & 8) || \W + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) + .endif + .endm + + .macro MODRM mod opd1 opd2 + .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) + .endm + + .macro PSHUFB_XMM xmm1 xmm2 + XMM_NUM pshufb_opd1 \xmm1 + XMM_NUM pshufb_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX pshufb_opd1 pshufb_opd2 + .byte 0x0f, 0x38, 0x00 + MODRM 0xc0 pshufb_opd1 pshufb_opd2 + .endm + + .macro PCLMULQDQ imm8 xmm1 xmm2 + XMM_NUM clmul_opd1 \xmm1 + XMM_NUM clmul_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX clmul_opd1 clmul_opd2 + .byte 0x0f, 0x3a, 0x44 + MODRM 0xc0 clmul_opd1 clmul_opd2 + .byte \imm8 + .endm + + .macro PEXTRD imm8 xmm gpr + R32_NUM extrd_opd1 \gpr + XMM_NUM extrd_opd2 \xmm + PFX_OPD_SIZE + PFX_REX extrd_opd1 extrd_opd2 + .byte 0x0f, 0x3a, 0x16 + MODRM 0xc0 extrd_opd1 extrd_opd2 + .byte \imm8 + .endm + + .macro AESKEYGENASSIST rcon xmm1 xmm2 + XMM_NUM aeskeygen_opd1 \xmm1 + XMM_NUM aeskeygen_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aeskeygen_opd1 aeskeygen_opd2 + .byte 0x0f, 0x3a, 0xdf + MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2 + .byte \rcon + .endm + + .macro AESIMC xmm1 xmm2 + XMM_NUM aesimc_opd1 \xmm1 + XMM_NUM aesimc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesimc_opd1 aesimc_opd2 + .byte 0x0f, 0x38, 0xdb + MODRM 0xc0 aesimc_opd1 aesimc_opd2 + .endm + + .macro AESENC xmm1 xmm2 + XMM_NUM aesenc_opd1 \xmm1 + XMM_NUM aesenc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenc_opd1 aesenc_opd2 + .byte 0x0f, 0x38, 0xdc + MODRM 0xc0 aesenc_opd1 aesenc_opd2 + .endm + + .macro AESENCLAST xmm1 xmm2 + XMM_NUM aesenclast_opd1 \xmm1 + XMM_NUM aesenclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenclast_opd1 aesenclast_opd2 + .byte 0x0f, 0x38, 0xdd + MODRM 0xc0 aesenclast_opd1 aesenclast_opd2 + .endm + + .macro AESDEC xmm1 xmm2 + XMM_NUM aesdec_opd1 \xmm1 + XMM_NUM aesdec_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdec_opd1 aesdec_opd2 + .byte 0x0f, 0x38, 0xde + MODRM 0xc0 aesdec_opd1 aesdec_opd2 + .endm + + .macro AESDECLAST xmm1 xmm2 + XMM_NUM aesdeclast_opd1 \xmm1 + XMM_NUM aesdeclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdeclast_opd1 aesdeclast_opd2 + .byte 0x0f, 0x38, 0xdf + MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 + .endm + + .macro MOVQ_R64_XMM opd1 opd2 + REG_TYPE movq_r64_xmm_opd1_type \opd1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + XMM_NUM movq_r64_xmm_opd1 \opd1 + R64_NUM movq_r64_xmm_opd2 \opd2 + .else + R64_NUM movq_r64_xmm_opd1 \opd1 + XMM_NUM movq_r64_xmm_opd2 \opd2 + .endif + PFX_OPD_SIZE + PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + .byte 0x0f, 0x7e + .else + .byte 0x0f, 0x6e + .endif + MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 + .endm +#endif + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c new file mode 100644 index 0000000000000..7a19a5803ee8c --- /dev/null +++ 
b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c @@ -0,0 +1,137 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to zlib_adler32. + */ + +#include +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static int adler32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 1; + + return 0; +} + +static int adler32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) + return -EINVAL; + + *mctx = *(u32 *)key; + return 0; +} + +static int adler32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *cksump = shash_desc_ctx(desc); + + *cksump = *mctx; + + return 0; +} + +static int adler32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *cksump = shash_desc_ctx(desc); + + *cksump = zlib_adler32(*cksump, data, len); + return 0; +} +static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len, + u8 *out) +{ + *(u32 *)out = zlib_adler32(*cksump, data, len); + return 0; +} + +static int adler32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int adler32_final(struct shash_desc *desc, u8 *out) +{ + u32 *cksump = shash_desc_ctx(desc); + + *(u32 *)out = *cksump; + return 0; +} + +static int adler32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = adler32_setkey, + .init = adler32_init, + .update = adler32_update, + .final = adler32_final, + .finup = adler32_finup, + .digest = adler32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "adler32", + .cra_driver_name = "adler32-zlib", + .cra_priority = 100, +#ifdef CRYPTO_ALG_OPTIONAL_KEY + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = NULL, + .cra_init = adler32_cra_init, + } +}; + +int cfs_crypto_adler32_register(void) +{ + return crypto_register_shash(&alg); +} + +void cfs_crypto_adler32_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c new file mode 100644 index 0000000000000..c794e670ecfd9 --- /dev/null +++ 
b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c @@ -0,0 +1,150 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to crc32_le. + */ + +#include +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static u32 __crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le(crc, p, len); +} + +/** No default init with ~0 */ +static int crc32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) + return -EINVAL; + + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = __crc32_le(*crcp, data, len); + return 0; +} +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(__crc32_le(*crcp, data, len)); + return 0; +} + +static int crc32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(crcp); + return 0; +} + +static int crc32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update, + .final = crc32_final, + .finup = crc32_finup, + .digest = crc32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-table", + .cra_priority = 100, +#ifdef CRYPTO_ALG_OPTIONAL_KEY + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = NULL, + .cra_init = crc32_cra_init, + } +}; + +int cfs_crypto_crc32_register(void) +{ + return 
crypto_register_shash(&alg); +} + +void cfs_crypto_crc32_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c new file mode 100644 index 0000000000000..566ba882ede82 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c @@ -0,0 +1,161 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Wrappers for kernel crypto shash api to pclmulqdq crc32c imlementation. + * + * Author: James Simmons + */ +#include +#include +#include +#include +#ifdef HAVE_FPU_API_HEADER +#include +#else +#include +#endif +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, + unsigned int crc_init); + +static int crc32c_pclmul_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. 
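+ *
+ * Illustrative example (editorial sketch, not part of this patch): to
+ * resume a conventional CRC32C computation (seed ~0, final XOR ~0) from
+ * a value 'prev_crc' previously returned by this driver, undo the final
+ * XOR before keying the transform.  crypto_shash_setkey() is the
+ * generic kernel crypto entry point; 'tfm' and 'prev_crc' are assumed
+ * names for the example:
+ *
+ *	__le32 seed = cpu_to_le32(~prev_crc);	/* undo the final XOR */
+ *	err = crypto_shash_setkey(tfm, (const u8 *)&seed, sizeof(seed));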
+ */ +static int crc32c_pclmul_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) + return -EINVAL; + + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32c_pclmul_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + return 0; +} + +static int crc32c_pclmul_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + kernel_fpu_begin(); + *crcp = crc_pcl(data, len, *crcp); + kernel_fpu_end(); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32c_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + kernel_fpu_begin(); + *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); + kernel_fpu_end(); + return 0; +} + +static int crc32c_pclmul_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pclmul_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_pclmul_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static int crc32c_pclmul_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = ~cpu_to_le32p(crcp); + return 0; +} + +static struct shash_alg alg = { + .setkey = crc32c_pclmul_setkey, + .init = crc32c_pclmul_init, + .update = crc32c_pclmul_update, + .final = crc32c_pclmul_final, + .finup = crc32c_pclmul_finup, + .digest = crc32c_pclmul_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-pclmul", + .cra_priority = 150, +#ifdef CRYPTO_ALG_OPTIONAL_KEY + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = NULL, + .cra_init = crc32c_pclmul_cra_init, + } +}; + +#ifndef X86_FEATURE_XMM4_2 +#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ +#endif + +int cfs_crypto_crc32c_pclmul_register(void) +{ + if (!boot_cpu_has(X86_FEATURE_XMM4_2)) { + CDEBUG(D_INFO, "CRC32 instruction is not detected.\n"); + return -ENODEV; + } + return crypto_register_shash(&alg); +} + +void cfs_crypto_crc32c_pclmul_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c new file mode 100644 index 0000000000000..8d4cb640681f8 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c @@ -0,0 +1,197 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + * + * Author: Alexander Boyko + */ +#include +#include +#include +#include +#ifdef HAVE_FPU_API_HEADER +#include +#else +#include +#endif +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define PCLMUL_MIN_LEN 64L /* minimum size of buffer + * for crc32_pclmul_le_16 */ +#define SCALE_F 16L /* size of xmm register */ +#define SCALE_F_MASK (SCALE_F - 1) + +u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32); + +static u32 __attribute__((pure)) + crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int iquotient; + unsigned int iremainder; + unsigned int prealign; + + if (len < PCLMUL_MIN_LEN + SCALE_F_MASK) + return crc32_le(crc, p, len); + + if ((long)p & SCALE_F_MASK) { + /* align p to 16 byte */ + prealign = SCALE_F - ((long)p & SCALE_F_MASK); + + crc = crc32_le(crc, p, prealign); + len -= prealign; + p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) & + ~SCALE_F_MASK); + } + iquotient = len & (~SCALE_F_MASK); + iremainder = len & SCALE_F_MASK; + + kernel_fpu_begin(); + crc = crc32_pclmul_le_16(p, iquotient, crc); + kernel_fpu_end(); + + if (iremainder) + crc = crc32_le(crc, p + iquotient, iremainder); + + return crc; +} + +static int crc32_pclmul_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. 
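+ *
+ * Worked example of the buffer split done by crc32_pclmul_le() above
+ * (editorial illustration, not part of this patch): for a buffer whose
+ * address ends in 0x9 and len = 200, prealign = 16 - 9 = 7 bytes go
+ * through crc32_le(); of the remaining 193 bytes, iquotient =
+ * 193 & ~15 = 192 bytes (twelve 16-byte XMM chunks) are fed to
+ * crc32_pclmul_le_16() between kernel_fpu_begin()/kernel_fpu_end(), and
+ * iremainder = 193 & 15 = 1 byte is finished with crc32_le().  Buffers
+ * shorter than PCLMUL_MIN_LEN + 15 bytes skip PCLMUL entirely.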
+ */ +static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) + return -EINVAL; + + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_pclmul_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32_pclmul_le(*crcp, data, len); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len)); + return 0; +} + +static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_pclmul_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(crcp); + return 0; +} + +static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static struct shash_alg alg = { + .setkey = crc32_pclmul_setkey, + .init = crc32_pclmul_init, + .update = crc32_pclmul_update, + .final = crc32_pclmul_final, + .finup = crc32_pclmul_finup, + .digest = crc32_pclmul_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-pclmul", + .cra_priority = 200, +#ifdef CRYPTO_ALG_OPTIONAL_KEY + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = NULL, + .cra_init = crc32_pclmul_cra_init, + } +}; + +#ifndef X86_FEATURE_PCLMULQDQ +#define X86_FEATURE_PCLMULQDQ (4*32+1) /* PCLMULQDQ instruction */ +#endif + +int cfs_crypto_crc32_pclmul_register(void) +{ + if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + CDEBUG(D_INFO, "PCLMULQDQ-NI instructions are not detected.\n"); + return -ENODEV; + } + return crypto_register_shash(&alg); +} + +void cfs_crypto_crc32_pclmul_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c new file mode 100644 index 0000000000000..dce1734a4d500 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c @@ -0,0 +1,522 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include + +#ifndef HAVE_CRYPTO_HASH_HELPERS +static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_name(crypto_ahash_tfm(tfm)); +} + +static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); +} +#endif + +/** + * Array of hash algorithm speed in MByte per second + */ +static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; + +/** + * Initialize the state descriptor for the specified hash algorithm. + * + * An internal routine to allocate the hash-specific state in \a hdesc for + * use with cfs_crypto_hash_digest() to compute the hash of a single message, + * though possibly in multiple chunks. The descriptor internal state should + * be freed with cfs_crypto_hash_final(). + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * \param[out] type pointer to the hash description in hash_types[] array + * \param[in,out] req ahash request to be initialized + * \param[in] key initial hash value/state, NULL to use default value + * \param[in] key_len length of \a key + * + * \retval 0 on success + * \retval negative errno on failure + */ +static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, + const struct cfs_crypto_hash_type **type, + struct ahash_request **req, + unsigned char *key, + unsigned int key_len) +{ + struct crypto_ahash *tfm; + int err = 0; + + *type = cfs_crypto_hash_type(hash_alg); + if (!*type) { + CWARN("Unsupported hash algorithm id = %d, max id is %d\n", + hash_alg, CFS_HASH_ALG_MAX); + return -EINVAL; + } + + /* Keys are only supported for the hmac version */ + if (key && key_len > 0) { + char *algo_name; + + algo_name = kasprintf(GFP_KERNEL, "hmac(%s)", + (*type)->cht_name); + if (!algo_name) + return -ENOMEM; + + tfm = crypto_alloc_ahash(algo_name, 0, CRYPTO_ALG_ASYNC); + kfree(algo_name); + } else { + tfm = crypto_alloc_ahash((*type)->cht_name, 0, + CRYPTO_ALG_ASYNC); + } + if (IS_ERR(tfm)) { + CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", + (*type)->cht_name); + return PTR_ERR(tfm); + } + + *req = ahash_request_alloc(tfm, GFP_KERNEL); + if (!*req) { + CDEBUG(D_INFO, "Failed to alloc ahash_request for %s\n", + (*type)->cht_name); + GOTO(out_free_tfm, err = -ENOMEM); + } + + ahash_request_set_callback(*req, 0, NULL, NULL); + + if (key) + err = crypto_ahash_setkey(tfm, key, key_len); + else if ((*type)->cht_key != 0) + err = crypto_ahash_setkey(tfm, + (unsigned char *)&((*type)->cht_key), + (*type)->cht_size); + if (err) + GOTO(out_free_req, err); + + CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", + crypto_ahash_alg_name(tfm), crypto_ahash_driver_name(tfm), + cfs_crypto_hash_speeds[hash_alg]); + + err = crypto_ahash_init(*req); + if (err) { +out_free_req: + ahash_request_free(*req); +out_free_tfm: + crypto_free_ahash(tfm); + } + return err; +} + +/** + * Calculate hash digest for the passed buffer. + * + * This should be used when computing the hash on a single contiguous buffer. + * It combines the hash initialization, computation, and cleanup. 
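+ *
+ * Usage sketch (editorial illustration, not part of this patch; the
+ * algorithm id and buffer names are assumed for the example):
+ *
+ *	unsigned char sum[4];
+ *	unsigned int sum_len = sizeof(sum);
+ *	int rc = cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32C, buf, buf_len,
+ *					NULL, 0, sum, &sum_len);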
+ * + * \param[in] hash_alg id of hash algorithm (CFS_HASH_ALG_*) + * \param[in] buf data buffer on which to compute hash + * \param[in] buf_len length of \a buf in bytes + * \param[in] key initial value/state for algorithm, if \a key = NULL + * use default initial value + * \param[in] key_len length of \a key in bytes + * \param[out] hash pointer to computed hash value, if \a hash = NULL then + * \a hash_len is to digest size in bytes, retval -ENOSPC + * \param[in,out] hash_len size of \a hash buffer + * + * \retval -EINVAL \a buf, \a buf_len, \a hash_len, \a hash_alg invalid + * \retval -ENOENT \a hash_alg is unsupported + * \retval -ENOSPC \a hash is NULL, or \a hash_len less than digest size + * \retval 0 for success + * \retval negative errno for other errors from lower layers. + */ +int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len) +{ + struct scatterlist sl; + struct ahash_request *req; + int err; + const struct cfs_crypto_hash_type *type; + + if (!buf || buf_len == 0 || !hash_len) + return -EINVAL; + + err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); + if (err != 0) + return err; + + if (!hash || *hash_len < type->cht_size) { + *hash_len = type->cht_size; + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + return -ENOSPC; + } + sg_init_one(&sl, (void *)buf, buf_len); + + ahash_request_set_crypt(req, &sl, hash, sl.length); + err = crypto_ahash_digest(req); + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_digest); + +/** + * Allocate and initialize desriptor for hash algorithm. + * + * This should be used to initialize a hash descriptor for multiple calls + * to a single hash function when computing the hash across multiple + * separate buffers or pages using cfs_crypto_hash_update{,_page}(). + * + * The hash descriptor should be freed with cfs_crypto_hash_final(). 
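+ *
+ * Streaming usage sketch (editorial illustration, not part of this
+ * patch; the algorithm id and buffer names are assumed):
+ *
+ *	struct ahash_request *req;
+ *	unsigned char sum[4];
+ *	unsigned int sum_len = sizeof(sum);
+ *	int rc;
+ *
+ *	req = cfs_crypto_hash_init(CFS_HASH_ALG_ADLER32, NULL, 0);
+ *	if (IS_ERR(req))
+ *		return PTR_ERR(req);
+ *	rc = cfs_crypto_hash_update(req, hdr, hdr_len);
+ *	if (!rc)
+ *		rc = cfs_crypto_hash_update(req, body, body_len);
+ *	if (!rc)
+ *		rc = cfs_crypto_hash_final(req, sum, &sum_len);
+ *	else
+ *		cfs_crypto_hash_final(req, NULL, NULL);	/* just free req */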
+ * + * \param[in] hash_alg algorithm id (CFS_HASH_ALG_*) + * \param[in] key initial value/state for algorithm, if \a key = NULL + * use default initial value + * \param[in] key_len length of \a key in bytes + * + * \retval pointer to ahash request + * \retval ERR_PTR(errno) in case of error + */ +struct ahash_request * + cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, + unsigned char *key, unsigned int key_len) +{ + struct ahash_request *req; + int err; + const struct cfs_crypto_hash_type *type; + + err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); + if (err) + return ERR_PTR(err); + return req; +} +EXPORT_SYMBOL(cfs_crypto_hash_init); + +/** + * Update hash digest computed on data within the given \a page + * + * \param[in] req ahash request + * \param[in] page data page on which to compute the hash + * \param[in] offset offset within \a page at which to start hash + * \param[in] len length of data on which to compute hash + * + * \retval 0 for success + * \retval negative errno on failure + */ +int cfs_crypto_hash_update_page(struct ahash_request *req, + struct page *page, unsigned int offset, + unsigned int len) +{ + struct scatterlist sl; + + sg_init_table(&sl, 1); + sg_set_page(&sl, page, len, offset & ~PAGE_MASK); + + ahash_request_set_crypt(req, &sl, NULL, sl.length); + return crypto_ahash_update(req); +} +EXPORT_SYMBOL(cfs_crypto_hash_update_page); + +/** + * Update hash digest computed on the specified data + * + * \param[in] req ahash request + * \param[in] buf data buffer on which to compute the hash + * \param[in] buf_len length of \buf on which to compute hash + * + * \retval 0 for success + * \retval negative errno on failure + */ +int cfs_crypto_hash_update(struct ahash_request *req, + const void *buf, unsigned int buf_len) +{ + struct scatterlist sl; + + sg_init_one(&sl, (void *)buf, buf_len); + + ahash_request_set_crypt(req, &sl, NULL, sl.length); + return crypto_ahash_update(req); +} +EXPORT_SYMBOL(cfs_crypto_hash_update); + +/** + * Finish hash calculation, copy hash digest to buffer, clean up hash descriptor + * + * \param[in] req ahash request + * \param[out] hash pointer to hash buffer to store hash digest + * \param[in,out] hash_len pointer to hash buffer size, if \a hash == NULL + * or hash_len == NULL only free \a hdesc instead + * of computing the hash + * + * \retval 0 for success + * \retval -EOVERFLOW if hash_len is too small for the hash digest + * \retval negative errno for other errors from lower layers + */ +int cfs_crypto_hash_final(struct ahash_request *req, + unsigned char *hash, unsigned int *hash_len) +{ + int size = crypto_ahash_digestsize(crypto_ahash_reqtfm(req)); + int err; + + if (!hash || !hash_len) { + err = 0; + goto free; + } + if (*hash_len < size) { + err = -EOVERFLOW; + goto free; + } + + ahash_request_set_crypt(req, NULL, hash, 0); + err = crypto_ahash_final(req); + if (err == 0) + *hash_len = size; +free: + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_final); + +/** + * Compute the speed of specified hash function + * + * Run a speed test on the given hash algorithm on buffer using a 1MB buffer + * size. This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the cfs_crypto_hash_speeds[] array, and + * is available through the cfs_crypto_hash_speed() function. 
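+ *
+ * Worked example of the speed calculation below (editorial
+ * illustration): if bcount = 256 full passes over the 1 MiB buffer
+ * complete in end - start = 256 ms, then
+ * (256 * 1048576 / 256) * 1000 / (1024 * 1024) = 1000 MB/s is stored in
+ * cfs_crypto_hash_speeds[].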
+ * + * This function needs to stay the same as obd_t10_performance_test() so that + * the speeds are comparable. + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * \param[in] buf data buffer on which to compute the hash + * \param[in] buf_len length of \buf on which to compute hash + */ +static void cfs_crypto_performance_test(enum cfs_crypto_hash_alg hash_alg) +{ + int buf_len = max(PAGE_SIZE, 1048576UL); + void *buf; + unsigned long start, end; + int err = 0; + unsigned long bcount; + struct page *page; + unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + unsigned int hash_len = sizeof(hash); + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + err = -ENOMEM; + goto out_err; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), + bcount = 0; time_before(jiffies, end) && err == 0; bcount++) { + struct ahash_request *req; + int i; + + req = cfs_crypto_hash_init(hash_alg, NULL, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + for (i = 0; i < buf_len / PAGE_SIZE; i++) { + err = cfs_crypto_hash_update_page(req, page, 0, + PAGE_SIZE); + if (err != 0) + break; + } + + err = cfs_crypto_hash_final(req, hash, &hash_len); + if (err != 0) + break; + } + end = jiffies; + __free_page(page); +out_err: + if (err != 0) { + cfs_crypto_hash_speeds[hash_alg] = err; + CDEBUG(D_INFO, "Crypto hash algorithm %s test error: rc = %d\n", + cfs_crypto_hash_name(hash_alg), err); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + cfs_crypto_hash_speeds[hash_alg] = (int)tmp; + CDEBUG(D_CONFIG, "Crypto hash algorithm %s speed = %d MB/s\n", + cfs_crypto_hash_name(hash_alg), + cfs_crypto_hash_speeds[hash_alg]); + } +} + +/** + * hash speed in Mbytes per second for valid hash algorithm + * + * Return the performance of the specified \a hash_alg that was + * computed using cfs_crypto_performance_test(). If the performance + * has not yet been computed, do that when it is first requested. + * That avoids computing the speed when it is not actually needed. + * To avoid competing threads computing the checksum speed at the + * same time, only compute a single checksum speed at one time. + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * + * \retval positive speed of the hash function in MB/s + * \retval -ENOENT if \a hash_alg is unsupported + * \retval negative errno if \a hash_alg speed is unavailable + */ +int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg) +{ + if (hash_alg < CFS_HASH_ALG_MAX) { + if (unlikely(cfs_crypto_hash_speeds[hash_alg] == 0)) { + static DEFINE_MUTEX(crypto_hash_speed_mutex); + + mutex_lock(&crypto_hash_speed_mutex); + if (cfs_crypto_hash_speeds[hash_alg] == 0) + cfs_crypto_performance_test(hash_alg); + mutex_unlock(&crypto_hash_speed_mutex); + } + return cfs_crypto_hash_speeds[hash_alg]; + } + + return -ENOENT; +} +EXPORT_SYMBOL(cfs_crypto_hash_speed); + +/** + * Run the performance test for all hash algorithms. + * + * Run the cfs_crypto_performance_test() benchmark for some of the available + * hash functions at module load time. This can't be reliably done at runtime + * since the CPUs may be under load from thousands of connecting clients when + * the first client connects and the checksum speeds are needed. 
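+ *
+ * Illustrative use of the cached results (editorial sketch, not part of
+ * this patch; the algorithm ids are assumed): a caller can pick the
+ * faster of two checksum algorithms at runtime, e.g.
+ *
+ *	enum cfs_crypto_hash_alg alg = CFS_HASH_ALG_CRC32C;
+ *
+ *	if (cfs_crypto_hash_speed(CFS_HASH_ALG_ADLER32) >
+ *	    cfs_crypto_hash_speed(CFS_HASH_ALG_CRC32C))
+ *		alg = CFS_HASH_ALG_ADLER32;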
+ * + * Since the setup cost and computation speed of various hash algorithms is + * a function of the buffer size (and possibly internal contention of offload + * engines), this speed only represents an estimate of the actual speed under + * actual usage, but is reasonable for comparing available algorithms. + * + * The actual speeds are available via cfs_crypto_hash_speed() for later + * comparison. + * + * \retval 0 on success + * \retval -ENOMEM if no memory is available for test buffer + */ +static int cfs_crypto_test_hashes(void) +{ + enum cfs_crypto_hash_alg hash_alg; + + for (hash_alg = 1; hash_alg < CFS_HASH_ALG_SPEED_MAX; hash_alg++) + cfs_crypto_performance_test(hash_alg); + + return 0; +} + +static int adler32; + +#ifdef HAVE_CRC32 +static int crc32; +#endif +#ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL +static int crc32_pclmul; +#endif +#ifdef NEED_CRC32C_ACCEL +static int crc32c_pclmul; +#endif +#endif /* HAVE_PCLMULQDQ */ + +/** + * Register available hash functions + * + * \retval 0 + */ +int cfs_crypto_register(void) +{ + request_module("crc32c"); + + adler32 = cfs_crypto_adler32_register(); + +#ifdef HAVE_CRC32 + crc32 = cfs_crypto_crc32_register(); +#endif +#ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL + crc32_pclmul = cfs_crypto_crc32_pclmul_register(); +#endif +#ifdef NEED_CRC32C_ACCEL + crc32c_pclmul = cfs_crypto_crc32c_pclmul_register(); +#endif +#endif /* HAVE_PCLMULQDQ */ + + /* check all algorithms and do performance test */ + cfs_crypto_test_hashes(); + + return 0; +} + +/** + * Unregister previously registered hash functions + */ +void cfs_crypto_unregister(void) +{ + if (adler32 == 0) + cfs_crypto_adler32_unregister(); + +#ifdef HAVE_CRC32 + if (crc32 == 0) + cfs_crypto_crc32_unregister(); +#endif +#ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL + if (crc32_pclmul == 0) + cfs_crypto_crc32_pclmul_unregister(); +#endif +#ifdef NEED_CRC32C_ACCEL + if (crc32c_pclmul == 0) + cfs_crypto_crc32c_pclmul_unregister(); +#endif +#endif /* HAVE_PCLMULQDQ */ +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c new file mode 100644 index 0000000000000..799c40ea638ec --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c @@ -0,0 +1,308 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/libcfs/linux/linux-curproc.c + * + * Lustre curproc API implementation for Linux kernel + * + * Author: Nikita Danilov + */ + +#include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* + * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h) + * for Linux kernel. + */ + +/* Currently all the CFS_CAP_* defines match CAP_* ones. */ +#define cfs_cap_pack(cap) (cap) +#define cfs_cap_unpack(cap) (cap) + +void cfs_cap_raise(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cap_raise(cred->cap_effective, cfs_cap_unpack(cap)); + commit_creds(cred); + } +} + +void cfs_cap_lower(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cap_lower(cred->cap_effective, cfs_cap_unpack(cap)); + commit_creds(cred); + } +} + +int cfs_cap_raised(cfs_cap_t cap) +{ + return cap_raised(current_cap(), cfs_cap_unpack(cap)); +} + +static void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap) +{ +#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330 + *cap = cfs_cap_pack(kcap); +#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026 + *cap = cfs_cap_pack(kcap[0]); +#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522 + /* XXX lost high byte */ + *cap = cfs_cap_pack(kcap.cap[0]); +#else + #error "need correct _KERNEL_CAPABILITY_VERSION " +#endif +} + +static void cfs_kernel_cap_unpack(kernel_cap_t *kcap, cfs_cap_t cap) +{ +#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330 + *kcap = cfs_cap_unpack(cap); +#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026 + (*kcap)[0] = cfs_cap_unpack(cap); +#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522 + kcap->cap[0] = cfs_cap_unpack(cap); +#else + #error "need correct _KERNEL_CAPABILITY_VERSION " +#endif +} + +cfs_cap_t cfs_curproc_cap_pack(void) +{ + cfs_cap_t cap; + cfs_kernel_cap_pack(current_cap(), &cap); + return cap; +} + +void cfs_curproc_cap_unpack(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cfs_kernel_cap_unpack(&cred->cap_effective, cap); + commit_creds(cred); + } +} + +int cfs_capable(cfs_cap_t cap) +{ + return capable(cfs_cap_unpack(cap)); +} + +static int cfs_access_process_vm(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long addr, + void *buf, int len, int write) +{ + /* Just copied from kernel for the kernels which doesn't + * have access_process_vm() exported */ + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + /* Avoid deadlocks on mmap_lock if called from sys_mmap_pgoff(), + * which is already holding mmap_lock for writes. If some other + * thread gets the write lock in the meantime, this thread will + * block, but at least it won't deadlock on itself. LU-1735 */ + if (!mmap_read_trylock(mm)) + return -EDEADLK; + + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, rc, offset; + void *maddr; + +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) + rc = get_user_pages(addr, 1, write ? 
FOLL_WRITE : 0, &page, &vma); +#elif defined(HAVE_GET_USER_PAGES_6ARG) + rc = get_user_pages(addr, 1, write, 1, &page, &vma); +#else + rc = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); +#endif + if (rc <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + put_page(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + mmap_read_unlock(mm); + + return buf - old_buf; +} + +/* Read the environment variable of current process specified by @key. */ +int cfs_get_environ(const char *key, char *value, int *val_len) +{ + struct mm_struct *mm; + char *buffer; + int buf_len = PAGE_SIZE; + int key_len = strlen(key); + unsigned long addr; + int rc; + bool skip = false; + ENTRY; + + buffer = kmalloc(buf_len, GFP_USER); + if (!buffer) + RETURN(-ENOMEM); + + mm = get_task_mm(current); + if (!mm) { + kfree(buffer); + RETURN(-EINVAL); + } + + addr = mm->env_start; + while (addr < mm->env_end) { + int this_len, retval, scan_len; + char *env_start, *env_end; + + memset(buffer, 0, buf_len); + + this_len = min_t(int, mm->env_end - addr, buf_len); + retval = cfs_access_process_vm(current, mm, addr, buffer, + this_len, 0); + if (retval < 0) + GOTO(out, rc = retval); + else if (retval != this_len) + break; + + addr += retval; + + /* Parse the buffer to find out the specified key/value pair. + * The "key=value" entries are separated by '\0'. */ + env_start = buffer; + scan_len = this_len; + while (scan_len) { + char *entry; + int entry_len; + + env_end = memscan(env_start, '\0', scan_len); + LASSERT(env_end >= env_start && + env_end <= env_start + scan_len); + + /* The last entry of this buffer cross the buffer + * boundary, reread it in next cycle. */ + if (unlikely(env_end - env_start == scan_len)) { + /* Just skip the entry larger than page size, + * it can't be jobID env variable. */ + if (unlikely(scan_len == this_len)) + skip = true; + else + addr -= scan_len; + break; + } else if (unlikely(skip)) { + skip = false; + goto skip; + } + + entry = env_start; + entry_len = env_end - env_start; + CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry); + + /* Key length + length of '=' */ + if (entry_len > key_len + 1 && + entry[key_len] == '=' && + !memcmp(entry, key, key_len)) { + entry += key_len + 1; + entry_len -= key_len + 1; + + /* The 'value' buffer passed in is too small. + * Copy what fits, but return -EOVERFLOW. 
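+				 * (Editorial illustration: with
+				 * *val_len == 8 and a 12-character value
+				 * "abcdefghijkl", the caller receives
+				 * "abcdefg\0" and -EOVERFLOW, and
+				 * *val_len is left unchanged.)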
*/ + if (entry_len >= *val_len) { + memcpy(value, entry, *val_len); + value[*val_len - 1] = 0; + GOTO(out, rc = -EOVERFLOW); + } + + memcpy(value, entry, entry_len); + *val_len = entry_len; + GOTO(out, rc = 0); + } +skip: + scan_len -= (env_end - env_start + 1); + env_start = env_end + 1; + } + } + GOTO(out, rc = -ENOENT); + +out: + mmput(mm); + kfree((void *)buffer); + return rc; +} +EXPORT_SYMBOL(cfs_get_environ); + +EXPORT_SYMBOL(cfs_cap_raise); +EXPORT_SYMBOL(cfs_cap_lower); +EXPORT_SYMBOL(cfs_cap_raised); +EXPORT_SYMBOL(cfs_curproc_cap_pack); +EXPORT_SYMBOL(cfs_capable); + +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c new file mode 100644 index 0000000000000..5bb4f08ecefd7 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c @@ -0,0 +1,340 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-debug.c + * + * Author: Phil Schwan + */ + +#include +#include +#include +#include +#include +#ifdef HAVE_KERNEL_LOCKED +#include +#endif +#include +#include +#include +#include + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#ifdef HAVE_PANIC_NOTIFIER_H +#include +#endif + +#include "tracefile.h" + +char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; + +/** + * Upcall function once a Lustre log has been dumped. 
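+ *
+ * Illustrative invocation (editorial note, not part of this patch):
+ * with the default upcall path the kernel executes, via
+ * call_usermodehelper(),
+ *
+ *	/usr/lib/lustre/lnet_debug_log_upcall <path-of-dumped-log>
+ *
+ * with HOME=/ and PATH=/sbin:/bin:/usr/sbin:/usr/bin as the only
+ * environment, as set up below.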
+ * + * \param file path of the dumped log + */ +void libcfs_run_debug_log_upcall(char *file) +{ + char *argv[3]; + int rc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + ENTRY; + + argv[0] = lnet_debug_log_upcall; + + LASSERTF(file != NULL, "called on a null filename\n"); + argv[1] = file; //only need to pass the path of the file + + argv[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, envp, 1); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET debug log upcall %s %s; " + "check /proc/sys/lnet/debug_log_upcall\n", + rc, argv[0], argv[1]); + } else { + CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n", + argv[0], argv[1]); + } + + EXIT; +} + +/* coverity[+kill] */ +void lbug_with_loc(struct libcfs_debug_msg_data *msgdata) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(msgdata, "LBUG\n"); + + if (in_interrupt()) { + panic("LBUG in interrupt.\n"); + /* not reached */ + } + + libcfs_debug_dumpstack(NULL); + if (libcfs_panic_on_lbug) + panic("LBUG"); + else + libcfs_debug_dumplog(); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); +} +EXPORT_SYMBOL(lbug_with_loc); + +#ifdef CONFIG_STACKTRACE + +#ifndef HAVE_SAVE_STACK_TRACE_TSK +#define save_stack_trace_tsk(tsk, trace) \ +do { \ + if (tsk == current) \ + save_stack_trace(trace); \ + else \ + pr_info("No stack, save_stack_trace_tsk() not exported\n"); \ +} while (0) +#endif + +static void cfs_print_stack_trace(unsigned long *entries, unsigned int nr) +{ + unsigned int i; + + /* Prefer %pB for backtraced symbolic names since it was added in: + * Linux v2.6.38-6557-g0f77a8d37825 + * vsprintf: Introduce %pB format specifier + */ + for (i = 0; i < nr; i++) + pr_info("[<0>] %pB\n", (void *)entries[i]); +} + +#define MAX_ST_ENTRIES 100 +static DEFINE_SPINLOCK(st_lock); + +/* + * Linux v5.1-rc5 214d8ca6ee ("stacktrace: Provide common infrastructure") + * CONFIG_ARCH_STACKWALK indicates that save_stack_trace_tsk symbol is not + * exported. Use symbol_get() to find if save_stack_trace_tsk is available. 
+ */ +#ifdef CONFIG_ARCH_STACKWALK +typedef unsigned int (stack_trace_save_tsk_t)(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr); +static stack_trace_save_tsk_t *task_dump_stack; +#endif + +void __init cfs_debug_init(void) +{ +#ifdef CONFIG_ARCH_STACKWALK + task_dump_stack = (void *) + cfs_kallsyms_lookup_name("stack_trace_save_tsk"); + +#endif +} + +static void libcfs_call_trace(struct task_struct *tsk) +{ + static unsigned long entries[MAX_ST_ENTRIES]; +#ifdef CONFIG_ARCH_STACKWALK + unsigned int nr_entries; + + if (!task_dump_stack) + task_dump_stack = (stack_trace_save_tsk_t *) + symbol_get("stack_trace_save_tsk"); + + spin_lock(&st_lock); + pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, + init_utsname()->release, init_utsname()->version); + pr_info("Call Trace TBD:\n"); + if (task_dump_stack) { + nr_entries = task_dump_stack(tsk, entries, MAX_ST_ENTRIES, 0); + cfs_print_stack_trace(entries, nr_entries); + } + spin_unlock(&st_lock); +#else + struct stack_trace trace; + + trace.nr_entries = 0; + trace.max_entries = MAX_ST_ENTRIES; + trace.entries = entries; + trace.skip = 0; + + spin_lock(&st_lock); + pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, + init_utsname()->release, init_utsname()->version); + pr_info("Call Trace:\n"); + save_stack_trace_tsk(tsk, &trace); + cfs_print_stack_trace(trace.entries, trace.nr_entries); + spin_unlock(&st_lock); +#endif +} + +#else /* !CONFIG_STACKTRACE */ + +#ifdef CONFIG_X86 +#include +#include + +#ifdef HAVE_STACKTRACE_OPS +#ifdef HAVE_STACKTRACE_WARNING +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk("%s", (char *)data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} +#endif + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +#ifdef STACKTRACE_OPS_ADDRESS_RETURN_INT +static int +#else +static void +#endif +print_trace_address(void *data, unsigned long addr, int reliable) +{ + char fmt[32]; + + touch_nmi_watchdog(); + sprintf(fmt, " [<%016lx>] %s%%s\n", addr, reliable ? "": "? 
"); + __print_symbol(fmt, addr); +#ifdef STACKTRACE_OPS_ADDRESS_RETURN_INT + return 0; +#endif +} + +static const struct stacktrace_ops print_trace_ops = { +#ifdef HAVE_STACKTRACE_WARNING + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, +#endif + .stack = print_trace_stack, + .address = print_trace_address, +#ifdef STACKTRACE_OPS_HAVE_WALK_STACK + .walk_stack = print_context_stack, +#endif +}; +#endif /* HAVE_STACKTRACE_OPS */ + +static void libcfs_call_trace(struct task_struct *tsk) +{ +#ifdef HAVE_STACKTRACE_OPS + printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm); + printk("\nCall Trace:\n"); + dump_trace(tsk, NULL, NULL, +#ifdef HAVE_DUMP_TRACE_ADDRESS + 0, +#endif /* HAVE_DUMP_TRACE_ADDRESS */ + &print_trace_ops, NULL); + printk("\n"); +#else /* !HAVE_STACKTRACE_OPS */ + if (tsk == current) + dump_stack(); + else + CWARN("can't show stack: kernel doesn't export show_task\n"); +#endif /* HAVE_STACKTRACE_OPS */ +} + +#else /* !CONFIG_X86 */ + +static void libcfs_call_trace(struct task_struct *tsk) +{ + if (tsk == current) + dump_stack(); + else + CWARN("can't show stack: kernel doesn't export show_task\n"); +} + +#endif /* CONFIG_X86 */ + +#endif /* CONFIG_STACKTRACE */ + +void libcfs_debug_dumpstack(struct task_struct *tsk) +{ + libcfs_call_trace(tsk ?: current); +} +EXPORT_SYMBOL(libcfs_debug_dumpstack); + +static int panic_notifier(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (libcfs_panic_in_progress) + return 0; + + libcfs_panic_in_progress = 1; + mb(); + +#ifdef LNET_DUMP_ON_PANIC + /* This is currently disabled because it spews far too much to the + * console on the rare cases it is ever triggered. */ + + if (in_interrupt()) { + cfs_trace_debug_print(); + } else { +#ifdef HAVE_KERNEL_LOCKED + while (kernel_locked()) + unlock_kernel(); +#endif + libcfs_debug_dumplog_internal((void *)(long)current_pid()); + } +#endif + return 0; +} + +static struct notifier_block libcfs_panic_notifier = { + .notifier_call = panic_notifier, + .next = NULL, + .priority = 10000 +}; + +void libcfs_register_panic_notifier(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier); +} + +void libcfs_unregister_panic_notifier(void) +{ + atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c new file mode 100644 index 0000000000000..e4e67c20cee5d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#ifdef HAVE_STRINGHASH +#include +#else +#include +#endif +#include + +#include + +/* Return the "hash_len" (hash and length) of a null-terminated string */ +/* The kernel equivalent is in fs/namei.c but for some strange reason + * RHEL7.5 stuck it in dax/super.c instead. This placement never existed + * upstream so to make life easier we just have the equavilent + */ +u64 cfs_hashlen_string(const void *salt, const char *name) +{ +#ifdef HAVE_FULL_NAME_HASH_3ARGS + unsigned long hash = init_name_hash(salt); +#else + unsigned long hash = init_name_hash(); +#endif + unsigned long len = 0, c; + + c = (unsigned char)*name; + while (c) { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } + return hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(cfs_hashlen_string); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c new file mode 100644 index 0000000000000..7300af8018c69 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c @@ -0,0 +1,175 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
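+ *
+ * (Editorial note, illustrative only: libcfs_ioctl_packlen() below
+ * rounds each inline buffer length up to a multiple of 8, so a request
+ * with ioc_inllen1 = 13 and ioc_inllen2 = 1 packs to
+ * sizeof(struct libcfs_ioctl_data) + 16 + 8 bytes, and
+ * libcfs_ioctl_is_invalid() rejects any header whose ioc_len disagrees
+ * with that packed length.)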
+ */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include + +static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + size_t len = sizeof(*data); + + len += (data->ioc_inllen1 + 7) & ~7; + len += (data->ioc_inllen2 + 7) & ~7; + return len; +} + +static bool libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) +{ + if (data->ioc_hdr.ioc_len > BIT(30)) + return true; + + if (data->ioc_inllen1 > BIT(30)) + return true; + + if (data->ioc_inllen2 > BIT(30)) + return true; + + if (data->ioc_inlbuf1 && !data->ioc_inllen1) + return true; + + if (data->ioc_inlbuf2 && !data->ioc_inllen2) + return true; + + if (data->ioc_pbuf1 && !data->ioc_plen1) + return true; + + if (data->ioc_pbuf2 && !data->ioc_plen2) + return true; + + if (data->ioc_plen1 && !data->ioc_pbuf1) + return true; + + if (data->ioc_plen2 && !data->ioc_pbuf2) + return true; + + if (libcfs_ioctl_packlen(data) != data->ioc_hdr.ioc_len) + return true; + + if (data->ioc_inllen1 && + data->ioc_bulk[((data->ioc_inllen1 + 7) & ~7) + + data->ioc_inllen2 - 1] != '\0') + return true; + + return false; +} + +int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data) +{ + ENTRY; + + if (libcfs_ioctl_is_invalid(data)) { + CERROR("libcfs ioctl: parameter not correctly formatted\n"); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1 != 0) + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + + if (data->ioc_inllen2 != 0) + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + cfs_size_round(data->ioc_inllen1); + + RETURN(0); +} + +int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, + struct libcfs_ioctl_hdr __user *uhdr) +{ + struct libcfs_ioctl_hdr hdr; + int err = 0; + ENTRY; + + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != LIBCFS_IOCTL_VERSION && + hdr.ioc_version != LIBCFS_IOCTL_VERSION2) { + CERROR("libcfs ioctl: version mismatch expected %#x, got %#x\n", + LIBCFS_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct libcfs_ioctl_hdr)) { + CERROR("libcfs ioctl: user buffer too small for ioctl\n"); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > LIBCFS_IOC_DATA_MAX) { + CERROR("libcfs ioctl: user buffer is too large %d/%d\n", + hdr.ioc_len, LIBCFS_IOC_DATA_MAX); + RETURN(-EINVAL); + } + + LIBCFS_ALLOC(*hdr_pp, hdr.ioc_len); + if (*hdr_pp == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(*hdr_pp, uhdr, hdr.ioc_len)) + GOTO(failed, err = -EFAULT); + + RETURN(0); +failed: + LIBCFS_FREE(*hdr_pp, hdr.ioc_len); + RETURN(err); +} + +static long +libcfs_psdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + return -EINVAL; + } + + return libcfs_ioctl(cmd, (void __user *)arg); +} + +static struct file_operations libcfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = libcfs_psdev_ioctl, +}; + +struct miscdevice libcfs_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lnet", + .fops = &libcfs_fops +}; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c new file mode 100644 index 0000000000000..2ee18be5e59a6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c @@ -0,0 +1,237 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE 
COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include +#include +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif +#include + +#if defined(CONFIG_KGDB) +#include +#endif + +#include + +#ifndef HAVE_KTIME_GET_TS64 +void ktime_get_ts64(struct timespec64 *ts) +{ + struct timespec now; + + ktime_get_ts(&now); + *ts = timespec_to_timespec64(now); +} +EXPORT_SYMBOL(ktime_get_ts64); +#endif /* HAVE_KTIME_GET_TS64 */ + +#ifndef HAVE_KTIME_GET_REAL_TS64 +void ktime_get_real_ts64(struct timespec64 *ts) +{ + struct timespec now; + + getnstimeofday(&now); + *ts = timespec_to_timespec64(now); +} +EXPORT_SYMBOL(ktime_get_real_ts64); +#endif /* HAVE_KTIME_GET_REAL_TS64 */ + +#ifndef HAVE_KTIME_GET_REAL_SECONDS +/* + * Get the seconds portion of CLOCK_REALTIME (wall clock). + * This is the clock that can be altered by NTP and is + * independent of a reboot. + */ +time64_t ktime_get_real_seconds(void) +{ + return (time64_t)get_seconds(); +} +EXPORT_SYMBOL(ktime_get_real_seconds); +#endif /* HAVE_KTIME_GET_REAL_SECONDS */ + +#ifndef HAVE_KTIME_GET_SECONDS +/* + * Get the seconds portion of CLOCK_MONOTONIC + * This clock is immutable and is reset across + * reboots. For older platforms this is a + * wrapper around get_seconds which is valid + * until 2038. By that time this will be gone + * one would hope. 
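The compat wrappers in this block keep the CLOCK_REALTIME / CLOCK_MONOTONIC split intact on older kernels: ktime_get_real_seconds() is the NTP-adjustable wall clock, while ktime_get_seconds() counts from boot. A minimal userspace sketch of that distinction, using only standard POSIX calls (this is not libcfs code):

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec real, mono;

	/* wall clock: can be stepped by NTP or settimeofday() */
	clock_gettime(CLOCK_REALTIME, &real);
	/* monotonic clock: restarts at boot, never jumps backwards */
	clock_gettime(CLOCK_MONOTONIC, &mono);

	printf("wall-clock seconds since epoch: %lld\n", (long long)real.tv_sec);
	printf("seconds since boot (approx.):   %lld\n", (long long)mono.tv_sec);
	return 0;
}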
+ */ +time64_t ktime_get_seconds(void) +{ + struct timespec64 now; + + ktime_get_ts64(&now); + return now.tv_sec; +} +EXPORT_SYMBOL(ktime_get_seconds); +#endif /* HAVE_KTIME_GET_SECONDS */ + +static int (*cfs_apply_workqueue_attrs_t)(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); + +int cfs_apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + if (cfs_apply_workqueue_attrs_t) + return cfs_apply_workqueue_attrs_t(wq, attrs); + return 0; +} +EXPORT_SYMBOL_GPL(cfs_apply_workqueue_attrs); + +int cfs_kernel_write(struct file *filp, const void *buf, size_t count, + loff_t *pos) +{ +#ifdef HAVE_NEW_KERNEL_WRITE + return kernel_write(filp, buf, count, pos); +#else + mm_segment_t __old_fs = get_fs(); + int rc; + + set_fs(KERNEL_DS); + rc = vfs_write(filp, (__force const char __user *)buf, count, pos); + set_fs(__old_fs); + + return rc; +#endif +} +EXPORT_SYMBOL(cfs_kernel_write); + +#ifndef HAVE_KSET_FIND_OBJ +struct kobject *kset_find_obj(struct kset *kset, const char *name) +{ + struct kobject *ret = NULL; + struct kobject *k; + + spin_lock(&kset->list_lock); + + list_for_each_entry(k, &kset->list, entry) { + if (kobject_name(k) && !strcmp(kobject_name(k), name)) { + if (kref_get_unless_zero(&k->kref)) + ret = k; + break; + } + } + + spin_unlock(&kset->list_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kset_find_obj); +#endif + +#ifndef HAVE_KSTRTOBOOL_FROM_USER +int kstrtobool_from_user(const char __user *s, size_t count, bool *res) +{ + /* Longest string needed to differentiate, newline, terminator */ + char buf[4]; + + count = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, s, count)) + return -EFAULT; + buf[count] = '\0'; + return strtobool(buf, res); +} +EXPORT_SYMBOL(kstrtobool_from_user); +#endif /* !HAVE_KSTRTOBOOL_FROM_USER */ + +sigset_t +cfs_block_allsigs(void) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigfillset(¤t->blocked); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return old; +} +EXPORT_SYMBOL(cfs_block_allsigs); + +sigset_t cfs_block_sigs(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, sigs); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return old; +} +EXPORT_SYMBOL(cfs_block_sigs); + +/* Block all signals except for the @sigs */ +sigset_t cfs_block_sigsinv(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, ~sigs); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return old; +} +EXPORT_SYMBOL(cfs_block_sigsinv); + +void +cfs_restore_sigs(sigset_t old) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->blocked = old; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} +EXPORT_SYMBOL(cfs_restore_sigs); + +void +cfs_clear_sigpending(void) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + clear_tsk_thread_flag(current, TIF_SIGPENDING); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} +EXPORT_SYMBOL(cfs_clear_sigpending); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c new file mode 100644 index 
0000000000000..9685296266f04 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c @@ -0,0 +1,273 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE + +#include +#include +#include "tracefile.h" + +/* percents to share the total debug memory for each type */ +static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = { + 80, /* 80% pages for CFS_TCD_TYPE_PROC */ + 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */ + 10 /* 10% pages for CFS_TCD_TYPE_IRQ */ +}; + +char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; + +static DECLARE_RWSEM(cfs_tracefile_sem); + +int cfs_tracefile_init_arch() +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + /* initialize trace_data */ + memset(cfs_trace_data, 0, sizeof(cfs_trace_data)); + for (i = 0; i < CFS_TCD_TYPE_MAX; i++) { + cfs_trace_data[i] = + kmalloc(sizeof(union cfs_trace_data_union) * + num_possible_cpus(), GFP_KERNEL); + if (cfs_trace_data[i] == NULL) + goto out; + + } + + /* arch related info initialized */ + cfs_tcd_for_each(tcd, i, j) { + spin_lock_init(&tcd->tcd_lock); + tcd->tcd_pages_factor = pages_factor[i]; + tcd->tcd_type = i; + tcd->tcd_cpu = j; + } + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) { + cfs_trace_console_buffers[i][j] = + kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE, + GFP_KERNEL); + + if (cfs_trace_console_buffers[i][j] == NULL) + goto out; + } + + return 0; + +out: + cfs_tracefile_fini_arch(); + printk(KERN_ERR "lnet: Not enough memory\n"); + return -ENOMEM; +} + +void cfs_tracefile_fini_arch() +{ + int i; + int j; + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) + if (cfs_trace_console_buffers[i][j] != NULL) { + kfree(cfs_trace_console_buffers[i][j]); + cfs_trace_console_buffers[i][j] = NULL; + } + + for (i = 0; cfs_trace_data[i] != NULL; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } +} + +void cfs_tracefile_read_lock() +{ + down_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_read_unlock() +{ + up_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_lock() +{ + down_write(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_unlock() +{ + up_write(&cfs_tracefile_sem); +} + +enum cfs_trace_buf_type cfs_trace_buf_idx_get() +{ + if (in_irq()) + return CFS_TCD_TYPE_IRQ; + else if (in_softirq()) + return CFS_TCD_TYPE_SOFTIRQ; + else + return CFS_TCD_TYPE_PROC; +} + +/* + * The walking argument indicates the 
locking comes from all tcd types + * iterator and we must lock it and dissable local irqs to avoid deadlocks + * with other interrupt locks that might be happening. See LU-1311 + * for details. + */ +int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking) +__acquires(&tcd->tcd_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_lock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_lock_irq(&tcd->tcd_lock); + else + spin_lock(&tcd->tcd_lock); + return 1; +} + +void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking) +__releases(&tcd->tcd_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_unlock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_unlock_irq(&tcd->tcd_lock); + else + spin_unlock(&tcd->tcd_lock); +} + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage) +{ + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + return tcd->tcd_cpu == tage->cpu; +} + +void +cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *msgdata, + unsigned long stack) +{ + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + + header->ph_subsys = msgdata->msg_subsys; + header->ph_mask = msgdata->msg_mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); + /* y2038 safe since all user space treats this as unsigned, but + * will overflow in 2106 + */ + header->ph_sec = (u32)ts.tv_sec; + header->ph_usec = ts.tv_nsec / NSEC_PER_USEC; + header->ph_stack = stack; + header->ph_pid = current->pid; + header->ph_line_num = msgdata->msg_line; + header->ph_extern_pid = 0; + return; +} + +static char * +dbghdr_to_err_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNetError"; + default: + return "LustreError"; + } +} + +static char * +dbghdr_to_info_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNet"; + default: + return "Lustre"; + } +} + +void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn) +{ + char *prefix = "Lustre", *ptype = NULL; + + if ((mask & D_EMERG) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_EMERG; + } else if ((mask & D_ERROR) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_ERR; + } else if ((mask & D_WARNING) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_WARNING; + } else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_INFO; + } + + if ((mask & D_CONSOLE) != 0) { + printk("%s%s: %.*s", ptype, prefix, len, buf); + } else { + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num, + fn, len, buf); + } + return; +} + +int cfs_trace_max_debug_mb(void) +{ + int total_mb = (cfs_totalram_pages() >> (20 - PAGE_SHIFT)); + + return MAX(512, (total_mb * 80)/100); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c new file mode 100644 index 
0000000000000..5843d808bc332 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c @@ -0,0 +1,115 @@ +/* + * The implementation of the wait_bit*() and related waiting APIs: + */ +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT + +#define __add_wait_queue_entry_tail __add_wait_queue_tail + +long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state) +{ + unsigned long flags; + long ret = 0; + + spin_lock_irqsave(&wq_head->lock, flags); + if (unlikely(signal_pending_state(state, current))) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. + * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same wq_head->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. + */ + list_del_init(&wq_entry->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wq_entry->task_list)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(wq_head, wq_entry); + else + __add_wait_queue(wq_head, wq_entry); + } + set_current_state(state); + } + spin_unlock_irqrestore(&wq_head->lock, flags); + + return ret; +} +EXPORT_SYMBOL(prepare_to_wait_event); +#endif /* !HAVE_PREPARE_TO_WAIT_EVENT */ + +#ifndef HAVE_WAIT_VAR_EVENT + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *__var_waitqueue(void *p) +{ + return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(__var_waitqueue); + +static int +var_wake_function(wait_queue_entry_t *wq_entry, unsigned int mode, + int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) + return 0; + + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, + int flags) +{ + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), +#else + .task_list = LIST_HEAD_INIT(wbq_entry->wq_entry.task_list), +#endif + }, + }; +} +EXPORT_SYMBOL(init_wait_var_entry); + +void wake_up_var(void *var) +{ + __wake_up_bit(__var_waitqueue(var), var, -1); +} +EXPORT_SYMBOL(wake_up_var); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} +#endif /* ! HAVE_WAIT_VAR_EVENT */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/module.c b/drivers/staging/lustrefsx/libcfs/libcfs/module.c new file mode 100644 index 0000000000000..08f5a1c1a5655 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/module.c @@ -0,0 +1,806 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include "tracefile.h" + +static struct dentry *lnet_debugfs_root; + +BLOCKING_NOTIFIER_HEAD(libcfs_ioctl_list); +EXPORT_SYMBOL(libcfs_ioctl_list); + +int libcfs_ioctl(unsigned long cmd, void __user *uparam) +{ + struct libcfs_ioctl_data *data = NULL; + struct libcfs_ioctl_hdr *hdr; + int err; + ENTRY; + + /* 'cmd' and permissions get checked in our arch-specific caller */ + err = libcfs_ioctl_getdata(&hdr, uparam); + if (err != 0) { + CDEBUG_LIMIT(D_ERROR, + "libcfs ioctl: data header error %d\n", err); + RETURN(err); + } + + if (hdr->ioc_version == LIBCFS_IOCTL_VERSION) { + /* The libcfs_ioctl_data_adjust() function performs adjustment + * operations on the libcfs_ioctl_data structure to make + * it usable by the code. This doesn't need to be called + * for new data structures added. 
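A short aside on the layout that libcfs_ioctl_data_adjust() rebuilds here: both inline buffers live in the single trailing ioc_bulk[] array, each rounded up to 8 bytes, and libcfs_ioctl_packlen() must arrive at exactly ioc_len for the request to be accepted. A standalone userspace sketch of that arithmetic (the struct and names below are illustrative stand-ins, not the real libcfs_ioctl_data):

#include <stdio.h>
#include <stddef.h>

#define SIZE_ROUND8(n)	(((n) + 7U) & ~7U)	/* same round-up as cfs_size_round */

/* stand-in for struct libcfs_ioctl_data: fixed header plus one trailing bulk[] */
struct demo_ioctl_data {
	unsigned int inllen1;
	unsigned int inllen2;
	char	     bulk[];
};

int main(void)
{
	unsigned int inllen1 = 13, inllen2 = 6;
	size_t packed = sizeof(struct demo_ioctl_data) +
			SIZE_ROUND8(inllen1) + SIZE_ROUND8(inllen2);

	printf("buf1 offset in bulk: 0\n");
	printf("buf2 offset in bulk: %u\n", SIZE_ROUND8(inllen1));
	printf("total packed length: %zu\n", packed);
	return 0;
}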
*/ + data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); + err = libcfs_ioctl_data_adjust(data); + if (err != 0) + GOTO(out, err); + } + + CDEBUG(D_IOCTL, "libcfs ioctl cmd %lu\n", cmd); + switch (cmd) { + case IOC_LIBCFS_CLEAR_DEBUG: + libcfs_debug_clear_buffer(); + break; + case IOC_LIBCFS_MARK_DEBUG: + if (data == NULL || + data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + GOTO(out, err = -EINVAL); + + libcfs_debug_mark_buffer(data->ioc_inlbuf1); + break; + + default: + err = blocking_notifier_call_chain(&libcfs_ioctl_list, + cmd, hdr); + if (!(err & NOTIFY_STOP_MASK)) + /* No-one claimed the ioctl */ + err = -EINVAL; + else + err = notifier_to_errno(err); + if (copy_to_user(uparam, hdr, hdr->ioc_len) && !err) + err = -EFAULT; + break; + } +out: + LIBCFS_FREE(hdr, hdr->ioc_len); + RETURN(err); +} + +int lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)) +{ + int rc = handler(data, write, *ppos, buffer, *lenp); + + if (rc < 0) + return rc; + + if (write) { + *ppos += *lenp; + } else { + *lenp = rc; + *ppos += rc; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_call_handler); + +static int __proc_dobitmasks(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + const int tmpstrlen = 512; + char *tmpstr; + int rc; + unsigned int *mask = data; + int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; + int is_printk = (mask == &libcfs_printk) ? 1 : 0; + + rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen); + if (rc < 0) + return rc; + + if (!write) { + libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys); + rc = strlen(tmpstr); + + if (pos >= rc) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + } + } else { + rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob); + if (rc < 0) { + kfree(tmpstr); + return rc; + } + + rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys); + /* Always print LBUG/LASSERT to console, so keep this mask */ + if (is_printk) + *mask |= D_EMERG; + } + + kfree(tmpstr); + return rc; +} + +static int proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_dobitmasks); +} + +static int min_watchdog_ratelimit; /* disable ratelimiting */ +static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */ + +static int __proc_dump_kernel(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) + return 0; + + return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); +} + +static int proc_dump_kernel(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_dump_kernel); +} + +static int __proc_daemon_file(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) { + int len = strlen(cfs_tracefile); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, + cfs_tracefile + pos, "\n"); + } + + return cfs_trace_daemon_command_usrstr(buffer, nob); +} + +static int proc_daemon_file(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_daemon_file); +} + +static int libcfs_force_lbug(struct ctl_table 
*table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) + LBUG(); + return 0; +} + +static int proc_fail_loc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + long old_fail_loc = cfs_fail_loc; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + rc = kstrtoul(kbuf, 0, &cfs_fail_loc); + kfree(kbuf); + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%lu\n", cfs_fail_loc); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } + + if (old_fail_loc != cfs_fail_loc) { + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + return rc; +} + +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + int val; + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + rc = kstrtoint(kbuf, 0, &val); + kfree(kbuf); + if (!rc) { + if (table->extra1 && val < *(int *)table->extra1) + val = *(int *)table->extra1; + if (table->extra2 && val > *(int *)table->extra2) + val = *(int *)table->extra2; + *(int *)table->data = val; + } + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%u\n", *(int *)table->data); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } + + return rc; +} +EXPORT_SYMBOL(debugfs_doint); + +static int debugfs_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int len = *lenp; + char *kbuf = table->data; + + if (!len || *ppos) { + *lenp = 0; + return 0; + } + if (len > table->maxlen) + len = table->maxlen; + if (write) { + if (copy_from_user(kbuf, buffer, len)) + return -EFAULT; + memset(kbuf+len, 0, table->maxlen - len); + *ppos = *lenp; + } else { + len = strnlen(kbuf, len); + if (copy_to_user(buffer, kbuf, len)) + return -EFAULT; + if (len < *lenp) { + if (copy_to_user(buffer+len, "\n", 1)) + return -EFAULT; + len += 1; + } + *ppos += len; + *lenp -= len; + } + return len; +} + +static int __proc_cpt_table(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + LASSERT(cfs_cpt_table != NULL); + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_table_print(cfs_cpt_table, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); +out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static int proc_cpt_table(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_cpt_table); +} + +static int __proc_cpt_distance(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + LASSERT(cfs_cpt_table != NULL); + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_distance_print(cfs_cpt_table, buf, len); + if (rc >= 0) + break; + + 
if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); + out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static int proc_cpt_distance(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_cpt_distance); +} + +static struct ctl_table lnet_table[] = { + { + INIT_CTL_NAME + .procname = "debug", + .data = &libcfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME + .procname = "subsystem_debug", + .data = &libcfs_subsystem_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME + .procname = "printk", + .data = &libcfs_printk, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + { + INIT_CTL_NAME + .procname = "cpu_partition_distance", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_distance, + }, + { + INIT_CTL_NAME + .procname = "debug_log_upcall", + .data = lnet_debug_log_upcall, + .maxlen = sizeof(lnet_debug_log_upcall), + .mode = 0644, + .proc_handler = &debugfs_dostring, + }, + { + INIT_CTL_NAME + .procname = "lnet_memused", + .data = (int *)&libcfs_kmemory.counter, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &debugfs_doint, + }, + { + INIT_CTL_NAME + .procname = "catastrophe", + .data = &libcfs_catastrophe, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &debugfs_doint, + }, + { + INIT_CTL_NAME + .procname = "dump_kernel", + .maxlen = 256, + .mode = 0200, + .proc_handler = &proc_dump_kernel, + }, + { + INIT_CTL_NAME + .procname = "daemon_file", + .mode = 0644, + .maxlen = 256, + .proc_handler = &proc_daemon_file, + }, + { + INIT_CTL_NAME + .procname = "watchdog_ratelimit", + .data = &libcfs_watchdog_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &debugfs_doint, + .extra1 = &min_watchdog_ratelimit, + .extra2 = &max_watchdog_ratelimit, + }, + { + INIT_CTL_NAME + .procname = "force_lbug", + .data = NULL, + .maxlen = 0, + .mode = 0200, + .proc_handler = &libcfs_force_lbug + }, + { + INIT_CTL_NAME + .procname = "fail_loc", + .data = &cfs_fail_loc, + .maxlen = sizeof(cfs_fail_loc), + .mode = 0644, + .proc_handler = &proc_fail_loc + }, + { + INIT_CTL_NAME + .procname = "fail_val", + .data = &cfs_fail_val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &debugfs_doint + }, + { + INIT_CTL_NAME + .procname = "fail_err", + .data = &cfs_fail_err, + .maxlen = sizeof(cfs_fail_err), + .mode = 0644, + .proc_handler = &debugfs_doint, + }, + { + } +}; + +static const struct lnet_debugfs_symlink_def lnet_debugfs_symlinks[] = { + { .name = "console_ratelimit", + .target = "../../../module/libcfs/parameters/libcfs_console_ratelimit" }, + { .name = "debug_path", + .target = "../../../module/libcfs/parameters/libcfs_debug_file_path" }, + { .name = "panic_on_lbug", + .target = "../../../module/libcfs/parameters/libcfs_panic_on_lbug" }, + { .name = "console_backoff", + .target = "../../../module/libcfs/parameters/libcfs_console_backoff" }, + { .name = "debug_mb", + .target = "../../../module/libcfs/parameters/libcfs_debug_mb" }, + { .name = "console_min_delay_centisecs", + .target = 
"../../../module/libcfs/parameters/libcfs_console_min_delay" }, + { .name = "console_max_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_max_delay" }, + { .name = NULL }, +}; + +static ssize_t lnet_debugfs_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct ctl_table *table = filp->private_data; + ssize_t rc = -EINVAL; + + if (table) { + rc = table->proc_handler(table, 0, buf, &count, ppos); + if (!rc) + rc = count; + } + + return rc; +} + +static ssize_t lnet_debugfs_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ctl_table *table = filp->private_data; + ssize_t rc = -EINVAL; + + if (table) { + rc = table->proc_handler(table, 1, (void __user *)buf, &count, + ppos); + if (!rc) + rc = count; + } + + return rc; +} + +static const struct file_operations lnet_debugfs_file_operations_rw = { + .open = simple_open, + .read = lnet_debugfs_read, + .write = lnet_debugfs_write, + .llseek = default_llseek, +}; + +static const struct file_operations lnet_debugfs_file_operations_ro = { + .open = simple_open, + .read = lnet_debugfs_read, + .llseek = default_llseek, +}; + +static const struct file_operations lnet_debugfs_file_operations_wo = { + .open = simple_open, + .write = lnet_debugfs_write, + .llseek = default_llseek, +}; + +static const struct file_operations *lnet_debugfs_fops_select(umode_t mode) +{ + if (!(mode & S_IWUGO)) + return &lnet_debugfs_file_operations_ro; + + if (!(mode & S_IRUGO)) + return &lnet_debugfs_file_operations_wo; + + return &lnet_debugfs_file_operations_rw; +} + +void lnet_insert_debugfs(struct ctl_table *table) +{ + if (!lnet_debugfs_root) + lnet_debugfs_root = debugfs_create_dir("lnet", NULL); + + /* Even if we cannot create, just ignore it altogether) */ + if (IS_ERR_OR_NULL(lnet_debugfs_root)) + return; + + /* We don't save the dentry returned in next two calls, because + * we don't call debugfs_remove() but rather remove_recursive() + */ + for (; table && table->procname; table++) + debugfs_create_file(table->procname, table->mode, + lnet_debugfs_root, table, + lnet_debugfs_fops_select(table->mode)); +} +EXPORT_SYMBOL_GPL(lnet_insert_debugfs); + +static void lnet_insert_debugfs_links( + const struct lnet_debugfs_symlink_def *symlinks) +{ + for (; symlinks && symlinks->name; symlinks++) + debugfs_create_symlink(symlinks->name, lnet_debugfs_root, + symlinks->target); +} + +void lnet_remove_debugfs(struct ctl_table *table) +{ +#ifndef HAVE_D_HASH_AND_LOOKUP + debugfs_remove_recursive(lnet_debugfs_root); + lnet_debugfs_root = NULL; + return; +#endif + + for (; table && table->procname; table++) { + struct qstr dname = QSTR_INIT(table->procname, + strlen(table->procname)); + struct dentry *dentry; + + dentry = d_hash_and_lookup(lnet_debugfs_root, &dname); + debugfs_remove(dentry); + } +} +EXPORT_SYMBOL_GPL(lnet_remove_debugfs); + +static int __init libcfs_init(void) +{ + int rc; + +#ifndef HAVE_WAIT_VAR_EVENT + wait_bit_init(); +#endif + init_libcfs_vfree_atomic(); + + rc = libcfs_debug_init(5 * 1024 * 1024); + if (rc < 0) { + printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc); + return (rc); + } + + cfs_debug_init(); + + rc = cfs_cpu_init(); + if (rc != 0) + goto cleanup_debug; + + rc = misc_register(&libcfs_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_cpu; + } + + rc = cfs_wi_startup(); + if (rc) { + CERROR("initialize workitem: error %d\n", rc); + goto cleanup_deregister; + } + + /* max to 4 threads, should be enough for rehash 
*/ + rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4); + rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY, + rc, &cfs_sched_rehash); + if (rc != 0) { + CERROR("Startup workitem scheduler: error: %d\n", rc); + goto cleanup_deregister; + } + + rc = cfs_crypto_register(); + if (rc) { + CERROR("cfs_crypto_regster: error %d\n", rc); + goto cleanup_wi; + } + + lnet_insert_debugfs(lnet_table); + if (!IS_ERR_OR_NULL(lnet_debugfs_root)) + lnet_insert_debugfs_links(lnet_debugfs_symlinks); + + CDEBUG (D_OTHER, "portals setup OK\n"); + return 0; +cleanup_wi: + cfs_wi_shutdown(); +cleanup_deregister: + misc_deregister(&libcfs_dev); +cleanup_cpu: + cfs_cpu_fini(); +cleanup_debug: + libcfs_debug_cleanup(); + return rc; +} + +static void __exit libcfs_exit(void) +{ + int rc; + + /* Remove everthing */ + if (lnet_debugfs_root) { + debugfs_remove_recursive(lnet_debugfs_root); + lnet_debugfs_root = NULL; + } + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + if (cfs_sched_rehash != NULL) { + cfs_wi_sched_destroy(cfs_sched_rehash); + cfs_sched_rehash = NULL; + } + + cfs_crypto_unregister(); + cfs_wi_shutdown(); + + misc_deregister(&libcfs_dev); + + cfs_cpu_fini(); + + if (atomic_read(&libcfs_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&libcfs_kmemory)); + + rc = libcfs_debug_cleanup(); + if (rc) + printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", + rc); + + exit_libcfs_vfree_atomic(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre helper library"); +MODULE_VERSION(LIBCFS_VERSION); +MODULE_LICENSE("GPL"); + +module_init(libcfs_init); +module_exit(libcfs_exit); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/prng.c b/drivers/staging/lustrefsx/libcfs/libcfs/prng.c new file mode 100644 index 0000000000000..03931745c9003 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/prng.c @@ -0,0 +1,136 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/prng.c + * + * concatenation of following two 16-bit multiply with carry generators + * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16, + * number and carry packed within the same 32 bit integer. 
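The recurrence described in this header comment can be tried out on its own; below is a self-contained userspace version using the same constants and default seeds as cfs_rand() further down (sketch only, not the libcfs code):

#include <stdint.h>
#include <stdio.h>

/* two 16-bit multiply-with-carry generators; value in the low half,
 * carry in the high half of each 32-bit state word */
static uint32_t mwc_x = 521288629;
static uint32_t mwc_y = 362436069;

static uint32_t mwc_rand(void)
{
	mwc_x = 18030 * (mwc_x & 0xffff) + (mwc_x >> 16);
	mwc_y = 29013 * (mwc_y & 0xffff) + (mwc_y >> 16);

	return (mwc_x << 16) + (mwc_y & 0xffff);
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		printf("%u\n", mwc_rand());
	return 0;
}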
+ * algorithm recommended by Marsaglia +*/ + +#include +#include + +/* +From: George Marsaglia +Newsgroups: sci.math +Subject: Re: A RANDOM NUMBER GENERATOR FOR C +Date: Tue, 30 Sep 1997 05:29:35 -0700 + + * You may replace the two constants 36969 and 18000 by any + * pair of distinct constants from this list: + * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584 + * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243 + * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974 + * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114 + * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088 + * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834 + * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013 + * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083 + * (or any other 16-bit constants k for which both k*2^16-1 + * and k*2^15-1 are prime) */ + +#define RANDOM_CONST_A 18030 +#define RANDOM_CONST_B 29013 + +static unsigned int seed_x = 521288629; +static unsigned int seed_y = 362436069; + +/** + * cfs_rand - creates new seeds + * + * First it creates new seeds from the previous seeds. Then it generates a + * new psuedo random number for use. + * + * Returns a pseudo-random 32-bit integer + */ +unsigned int cfs_rand(void) +{ + seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16); + seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16); + + return ((seed_x << 16) + (seed_y & 65535)); +} +EXPORT_SYMBOL(cfs_rand); + +/** + * cfs_srand - sets the inital seed + * @seed1 : (seed_x) should have the most entropy in the low bits of the word + * @seed2 : (seed_y) should have the most entropy in the high bits of the word + * + * Replaces the original seeds with new values. Used to generate a new pseudo + * random numbers. + */ +void cfs_srand(unsigned int seed1, unsigned int seed2) +{ + if (seed1) + seed_x = seed1; /* use default seeds if parameter is 0 */ + if (seed2) + seed_y = seed2; +} +EXPORT_SYMBOL(cfs_srand); + +/** + * cfs_get_random_bytes - generate a bunch of random numbers + * @buf : buffer to fill with random numbers + * @size: size of passed in buffer + * + * Fills a buffer with random bytes + */ +void cfs_get_random_bytes(void *buf, int size) +{ + int *p = buf; + int rem, tmp; + + LASSERT(size >= 0); + + rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size); + if (rem) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, rem); + p = buf + rem; + size -= rem; + } + + while (size >= sizeof(int)) { + get_random_bytes(&tmp, sizeof(tmp)); + *p = cfs_rand() ^ tmp; + size -= sizeof(int); + p++; + } + buf = p; + if (size) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, size); + } +} +EXPORT_SYMBOL(cfs_get_random_bytes); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c new file mode 100644 index 0000000000000..f9d96d12f2555 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c @@ -0,0 +1,1152 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/tracefile.c + * + * Author: Zach Brown + * Author: Phil Schwan + */ + + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE +#include "tracefile.h" + +#include +#include +#include +#include +#include +#include +#include + +/* XXX move things up to the top, comment */ +union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned; + +char cfs_tracefile[TRACEFILE_NAME_SIZE]; +long long cfs_tracefile_size = CFS_TRACEFILE_SIZE; +static struct tracefiled_ctl trace_tctl; +static DEFINE_MUTEX(cfs_trace_thread_mutex); +static int thread_running = 0; + +static atomic_t cfs_tage_allocated = ATOMIC_INIT(0); + +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd); + +static inline struct cfs_trace_page * +cfs_tage_from_list(struct list_head *list) +{ + return list_entry(list, struct cfs_trace_page, linkage); +} + +static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp) +{ + struct page *page; + struct cfs_trace_page *tage; + + /* My caller is trying to free memory */ + if (!in_interrupt() && memory_pressure_get()) + return NULL; + + /* + * Don't spam console with allocation failures: they will be reported + * by upper layer anyway. + */ + gfp |= __GFP_NOWARN; + page = alloc_page(gfp); + if (page == NULL) + return NULL; + + tage = kmalloc(sizeof(*tage), gfp); + if (tage == NULL) { + __free_page(page); + return NULL; + } + + tage->page = page; + atomic_inc(&cfs_tage_allocated); + return tage; +} + +static void cfs_tage_free(struct cfs_trace_page *tage) +{ + __LASSERT(tage != NULL); + __LASSERT(tage->page != NULL); + + __free_page(tage->page); + kfree(tage); + atomic_dec(&cfs_tage_allocated); +} + +static void cfs_tage_to_tail(struct cfs_trace_page *tage, + struct list_head *queue) +{ + __LASSERT(tage != NULL); + __LASSERT(queue != NULL); + + list_move_tail(&tage->linkage, queue); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, + struct list_head *stock) +{ + int i; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
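The "refill a stock up to a cap" pattern that cfs_trace_refill_stock() implements here is easy to see in isolation: keep allocating until either the cap is reached or an allocation fails, and report how many entries were added so the caller can cope with a partial refill. A userspace sketch under assumed names (STOCK_CAP stands in for TCD_STOCK_PAGES; the list type is illustrative):

#include <stdlib.h>
#include <stdio.h>

#define STOCK_CAP 8

struct stock_entry {
	struct stock_entry *next;
};

static int refill_stock(struct stock_entry **stock, int cur)
{
	int added = 0;

	while (cur + added < STOCK_CAP) {
		struct stock_entry *e = malloc(sizeof(*e));

		if (e == NULL)
			break;		/* partial refill; caller decides what to do */
		e->next = *stock;
		*stock = e;
		added++;
	}
	return added;
}

int main(void)
{
	struct stock_entry *stock = NULL;

	printf("added %d entries\n", refill_stock(&stock, 3));

	while (stock) {			/* tidy up the demo list */
		struct stock_entry *n = stock->next;
		free(stock);
		stock = n;
	}
	return 0;
}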
+ */ + + for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) { + struct cfs_trace_page *tage; + + tage = cfs_tage_alloc(gfp); + if (tage == NULL) + break; + list_add_tail(&tage->linkage, stock); + } + return i; +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page * +cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len) +{ + struct cfs_trace_page *tage; + + if (tcd->tcd_cur_pages > 0) { + __LASSERT(!list_empty(&tcd->tcd_pages)); + tage = cfs_tage_from_list(tcd->tcd_pages.prev); + if (tage->used + len <= PAGE_SIZE) + return tage; + } + + if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { + if (tcd->tcd_cur_stock_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev); + --tcd->tcd_cur_stock_pages; + list_del_init(&tage->linkage); + } else { + tage = cfs_tage_alloc(GFP_ATOMIC); + if (unlikely(tage == NULL)) { + if ((!memory_pressure_get() || + in_interrupt()) && printk_ratelimit()) + printk(KERN_WARNING + "cannot allocate a tage (%ld)\n", + tcd->tcd_cur_pages); + return NULL; + } + } + + tage->used = 0; + tage->cpu = smp_processor_id(); + tage->type = tcd->tcd_type; + list_add_tail(&tage->linkage, &tcd->tcd_pages); + tcd->tcd_cur_pages++; + + if (tcd->tcd_cur_pages > 8 && thread_running) { + struct tracefiled_ctl *tctl = &trace_tctl; + /* + * wake up tracefiled to process some pages. + */ + wake_up(&tctl->tctl_waitq); + } + return tage; + } + return NULL; +} + +static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd) +{ + int pgcount = tcd->tcd_cur_pages / 10; + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (printk_ratelimit()) + printk(KERN_WARNING "debug daemon buffer overflowed; " + "discarding 10%% of pages (%d of %ld)\n", + pgcount + 1, tcd->tcd_cur_pages); + + INIT_LIST_HEAD(&pc.pc_pages); + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + if (pgcount-- == 0) + break; + + list_move_tail(&tage->linkage, &pc.pc_pages); + tcd->tcd_cur_pages--; + } + put_pages_on_tcd_daemon_list(&pc, tcd); +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd, + unsigned long len) +{ + struct cfs_trace_page *tage; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (len > PAGE_SIZE) { + printk(KERN_ERR + "cowardly refusing to write %lu bytes in a page\n", len); + return NULL; + } + + tage = cfs_trace_get_tage_try(tcd, len); + if (tage != NULL) + return tage; + if (thread_running) + cfs_tcd_shrink(tcd); + if (tcd->tcd_cur_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_pages.next); + tage->used = 0; + cfs_tage_to_tail(tage, &tcd->tcd_pages); + } + return tage; +} + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format, ...) +{ + va_list args; + int rc; + + va_start(args, format); + rc = libcfs_debug_vmsg2(msgdata, format, args, NULL); + va_end(args); + + return rc; +} +EXPORT_SYMBOL(libcfs_debug_msg); + +int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, va_list args, + const char *format2, ...) 
+{ + struct cfs_trace_cpu_data *tcd = NULL; + struct ptldebug_header header = {0}; + struct cfs_trace_page *tage; + /* string_buf is used only if tcd != NULL, and is always set then */ + char *string_buf = NULL; + char *debug_buf; + int known_size; + int needed = 85; /* average message length */ + int max_nob; + va_list ap; + int i; + int remain; + int mask = msgdata->msg_mask; + char *file = (char *)msgdata->msg_file; + struct cfs_debug_limit_state *cdls = msgdata->msg_cdls; + + if (strchr(file, '/')) + file = strrchr(file, '/') + 1; + + tcd = cfs_trace_get_tcd(); + + /* cfs_trace_get_tcd() grabs a lock, which disables preemption and + * pins us to a particular CPU. This avoids an smp_processor_id() + * warning on Linux when debugging is enabled. */ + cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK()); + + if (tcd == NULL) /* arch may not log in IRQ context */ + goto console; + + if (tcd->tcd_cur_pages == 0) + header.ph_flags |= PH_FLAG_FIRST_RECORD; + + if (tcd->tcd_shutting_down) { + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + known_size = strlen(file) + 1; + if (msgdata->msg_fn) + known_size += strlen(msgdata->msg_fn) + 1; + + if (libcfs_debug_binary) + known_size += sizeof(header); + + /*/ + * '2' used because vsnprintf return real size required for output + * _without_ terminating NULL. + * if needed is to small for this format. + */ + for (i = 0; i < 2; i++) { + tage = cfs_trace_get_tage(tcd, needed + known_size + 1); + if (tage == NULL) { + if (needed + known_size > PAGE_SIZE) + mask |= D_ERROR; + + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + string_buf = (char *)page_address(tage->page) + + tage->used + known_size; + + max_nob = PAGE_SIZE - tage->used - known_size; + if (max_nob <= 0) { + printk(KERN_EMERG "negative max_nob: %d\n", + max_nob); + mask |= D_ERROR; + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + needed = 0; + if (format1) { + va_copy(ap, args); + needed = vsnprintf(string_buf, max_nob, format1, ap); + va_end(ap); + } + + if (format2) { + remain = max_nob - needed; + if (remain < 0) + remain = 0; + + va_start(ap, format2); + needed += vsnprintf(string_buf + needed, remain, + format2, ap); + va_end(ap); + } + + if (needed < max_nob) /* well. printing ok.. 
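The retry loop in libcfs_debug_vmsg2() relies on vsnprintf() returning the size that would have been needed even when the buffer was too small, so a single retry with a corrected size always succeeds. A userspace sketch of that measure-then-retry pattern (the helper name and the 85-byte first guess are illustrative, not libcfs API):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *format_message(const char *fmt, ...)
{
	char *buf = NULL;
	int size = 85;			/* optimistic first guess */
	va_list ap;

	for (int pass = 0; pass < 2; pass++) {
		char *tmp = realloc(buf, size);
		int needed;

		if (tmp == NULL) {
			free(buf);
			return NULL;
		}
		buf = tmp;

		va_start(ap, fmt);
		needed = vsnprintf(buf, size, fmt, ap);
		va_end(ap);

		if (needed < size)	/* fit, including the trailing NUL */
			break;
		size = needed + 1;	/* retry once with the exact size */
	}
	return buf;
}

int main(void)
{
	char *msg = format_message("pid %d: %s", 42,
				   "a message longer than the first guess");

	puts(msg);
	free(msg);
	return 0;
}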
*/ + break; + } + + if (*(string_buf+needed-1) != '\n') + printk(KERN_INFO "format at %s:%d:%s doesn't end in " + "newline\n", file, msgdata->msg_line, msgdata->msg_fn); + + header.ph_len = known_size + needed; + debug_buf = (char *)page_address(tage->page) + tage->used; + + if (libcfs_debug_binary) { + memcpy(debug_buf, &header, sizeof(header)); + tage->used += sizeof(header); + debug_buf += sizeof(header); + } + + strcpy(debug_buf, file); + tage->used += strlen(file) + 1; + debug_buf += strlen(file) + 1; + + if (msgdata->msg_fn) { + strcpy(debug_buf, msgdata->msg_fn); + tage->used += strlen(msgdata->msg_fn) + 1; + debug_buf += strlen(msgdata->msg_fn) + 1; + } + + __LASSERT(debug_buf == string_buf); + + tage->used += needed; + __LASSERT(tage->used <= PAGE_SIZE); + +console: + if ((mask & libcfs_printk) == 0) { + /* no console output requested */ + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + time_before(jiffies, cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (time_after(jiffies, cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } + + if (cdls->cdls_delay < libcfs_console_min_delay) + cdls->cdls_delay = libcfs_console_min_delay; + else if (cdls->cdls_delay > libcfs_console_max_delay) + cdls->cdls_delay = libcfs_console_max_delay; + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1; + } + + if (tcd != NULL) { + cfs_print_to_console(&header, mask, string_buf, needed, file, + msgdata->msg_fn); + cfs_trace_put_tcd(tcd); + } else { + string_buf = cfs_trace_get_console_buffer(); + + needed = 0; + if (format1 != NULL) { + va_copy(ap, args); + needed = vsnprintf(string_buf, + CFS_TRACE_CONSOLE_BUFFER_SIZE, + format1, ap); + va_end(ap); + } + if (format2 != NULL) { + remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed; + if (remain > 0) { + va_start(ap, format2); + needed += vsnprintf(string_buf+needed, remain, + format2, ap); + va_end(ap); + } + } + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + put_cpu(); + } + + if (cdls != NULL && cdls->cdls_count != 0) { + string_buf = cfs_trace_get_console_buffer(); + + needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE, + "Skipped %d previous similar message%s\n", + cdls->cdls_count, + (cdls->cdls_count > 1) ? "s" : ""); + + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + put_cpu(); + cdls->cdls_count = 0; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_debug_vmsg2); + +void +cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *msgdata) +{ + struct ptldebug_header hdr; + + libcfs_panic_in_progress = 1; + libcfs_catastrophe = 1; + smp_mb(); + + cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK()); + + cfs_print_to_console(&hdr, D_EMERG, str, strlen(str), + msgdata->msg_file, msgdata->msg_fn); + + panic("Lustre debug assertion failure\n"); + + /* not reached */ +} + +static void +panic_collect_pages(struct page_collection *pc) +{ + /* Do the collect_pages job on a single CPU: assumes that all other + * CPUs have been stopped during a panic. 
If this isn't true for some + * arch, this will have to be implemented separately in each arch. */ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + INIT_LIST_HEAD(&pc->pc_pages); + + cfs_tcd_for_each(tcd, i, j) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } +} + +static void collect_pages_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } + } +} + +static void collect_pages(struct page_collection *pc) +{ + INIT_LIST_HEAD(&pc->pc_pages); + + if (libcfs_panic_in_progress) + panic_collect_pages(pc); + else + collect_pages_on_all_cpus(pc); +} + +static void put_pages_back_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + struct list_head *cur_head; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + cur_head = tcd->tcd_pages.next; + + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, + linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != cpu || tage->type != i) + continue; + + cfs_tage_to_tail(tage, cur_head); + tcd->tcd_cur_pages++; + } + } + } +} + +static void put_pages_back(struct page_collection *pc) +{ + if (!libcfs_panic_in_progress) + put_pages_back_on_all_cpus(pc); +} + +/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that + * we have a good amount of data at all times for dumping during an LBUG, even + * if we have been steadily writing (and otherwise discarding) pages via the + * debug daemon. 
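The bounded "keep only the most recent N" behaviour that put_pages_on_tcd_daemon_list() maintains below can be shown with a tiny userspace queue: new entries go on the tail, and once the cap is exceeded the oldest entry at the head is dropped. The array-backed ring and DAEMON_CAP below are illustrative; the kernel code uses list_head-linked trace pages and tcd_max_pages.

#include <stdio.h>

#define DAEMON_CAP 4

struct ring {
	int items[DAEMON_CAP];
	int head;			/* index of oldest entry */
	int count;
};

static void ring_add(struct ring *r, int item)
{
	if (r->count == DAEMON_CAP) {	/* evict the oldest, as the tcd does */
		r->head = (r->head + 1) % DAEMON_CAP;
		r->count--;
	}
	r->items[(r->head + r->count) % DAEMON_CAP] = item;
	r->count++;
}

int main(void)
{
	struct ring r = { .head = 0, .count = 0 };

	for (int i = 1; i <= 7; i++)
		ring_add(&r, i);

	for (int i = 0; i < r.count; i++)
		printf("%d ", r.items[(r.head + i) % DAEMON_CAP]);
	printf("\n");			/* prints: 4 5 6 7 */
	return 0;
}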
*/ +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd) +{ + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type) + continue; + + cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages); + tcd->tcd_cur_daemon_pages++; + + if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) { + struct cfs_trace_page *victim; + + __LASSERT(!list_empty(&tcd->tcd_daemon_pages)); + victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next); + + __LASSERT_TAGE_INVARIANT(victim); + + list_del(&victim->linkage); + cfs_tage_free(victim); + tcd->tcd_cur_daemon_pages--; + } + } +} + +static void put_pages_on_daemon_list(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) + put_pages_on_tcd_daemon_list(pc, tcd); + } +} + +void cfs_trace_debug_print(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + char *p, *file, *fn; + struct page *page; + + __LASSERT_TAGE_INVARIANT(tage); + + page = tage->page; + p = page_address(page); + while (p < ((char *)page_address(page) + tage->used)) { + struct ptldebug_header *hdr; + int len; + hdr = (void *)p; + p += sizeof(*hdr); + file = p; + p += strlen(file) + 1; + fn = p; + p += strlen(fn) + 1; + len = hdr->ph_len - (int)(p - (char *)hdr); + + cfs_print_to_console(hdr, D_EMERG, p, len, file, fn); + + p += len; + } + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_tracefile_dump_all_pages(char *filename) +{ + struct page_collection pc; + struct file *filp; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + char *buf; + int rc; + + cfs_tracefile_write_lock(); + + filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n", + filename, rc); + goto out; + } + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) { + rc = 0; + goto close; + } + + /* ok, for now, just write the pages. 
in the future we'll be building + * iobufs with the pages and calling generic_direct_IO */ + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + buf = kmap(tage->page); + rc = cfs_kernel_write(filp, buf, tage->used, &filp->f_pos); + kunmap(tage->page); + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u but wrote " + "%d\n", tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + list_del(&tage->linkage); + cfs_tage_free(tage); + } + + rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + printk(KERN_ERR "sync returns %d\n", rc); +close: + filp_close(filp, NULL); +out: + cfs_tracefile_write_unlock(); + return rc; +} + +void cfs_trace_flush_pages(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob) +{ + int nob; + + if (usr_buffer_nob > knl_buffer_nob) + return -EOVERFLOW; + + if (copy_from_user(knl_buffer, usr_buffer, usr_buffer_nob)) + return -EFAULT; + + nob = strnlen(knl_buffer, usr_buffer_nob); + while (nob-- >= 0) /* strip trailing whitespace */ + if (!isspace(knl_buffer[nob])) + break; + + if (nob < 0) /* empty string */ + return -EINVAL; + + if (nob == knl_buffer_nob) /* no space to terminate */ + return -EOVERFLOW; + + knl_buffer[nob + 1] = 0; /* terminate */ + return 0; +} +EXPORT_SYMBOL(cfs_trace_copyin_string); + +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append) +{ + /* NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and "" (i.e. 
a + * terminating zero byte) for sysctl entries */ + int nob = strlen(knl_buffer); + + if (nob > usr_buffer_nob) + nob = usr_buffer_nob; + + if (copy_to_user(usr_buffer, knl_buffer, nob)) + return -EFAULT; + + if (append != NULL && nob < usr_buffer_nob) { + if (copy_to_user(usr_buffer + nob, append, 1)) + return -EFAULT; + + nob++; + } + + return nob; +} +EXPORT_SYMBOL(cfs_trace_copyout_string); + +int cfs_trace_allocate_string_buffer(char **str, int nob) +{ + if (nob > 2 * PAGE_SIZE) /* string must be "sensible" */ + return -EINVAL; + + *str = kmalloc(nob, GFP_KERNEL | __GFP_ZERO); + if (*str == NULL) + return -ENOMEM; + + return 0; +} + +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc != 0) + goto out; + + if (str[0] != '/') { + rc = -EINVAL; + goto out; + } + rc = cfs_tracefile_dump_all_pages(str); +out: + kfree(str); + return rc; +} + +int cfs_trace_daemon_command(char *str) +{ + int rc = 0; + + cfs_tracefile_write_lock(); + + if (strcmp(str, "stop") == 0) { + cfs_tracefile_write_unlock(); + cfs_trace_stop_thread(); + cfs_tracefile_write_lock(); + memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); + + } else if (strncmp(str, "size=", 5) == 0) { + unsigned long tmp; + + rc = kstrtoul(str + 5, 10, &tmp); + if (!rc) { + if (tmp < 10 || tmp > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size = tmp << 20; + } + } else if (strlen(str) >= sizeof(cfs_tracefile)) { + rc = -ENAMETOOLONG; + } else if (str[0] != '/') { + rc = -EINVAL; + } else { + strcpy(cfs_tracefile, str); + + printk(KERN_INFO + "Lustre: debug daemon will attempt to start writing " + "to %s (%lukB max)\n", cfs_tracefile, + (long)(cfs_tracefile_size >> 10)); + + cfs_trace_start_thread(); + } + + cfs_tracefile_write_unlock(); + return rc; +} + +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc == 0) + rc = cfs_trace_daemon_command(str); + + kfree(str); + return rc; +} + +int cfs_trace_set_debug_mb(int mb) +{ + int i; + int j; + int pages; + int limit = cfs_trace_max_debug_mb(); + struct cfs_trace_cpu_data *tcd; + + if (mb < num_possible_cpus()) { + printk(KERN_WARNING + "Lustre: %d MB is too small for debug buffer size, " + "setting it to %d MB.\n", mb, num_possible_cpus()); + mb = num_possible_cpus(); + } + + if (mb > limit) { + printk(KERN_WARNING + "Lustre: %d MB is too large for debug buffer size, " + "setting it to %d MB.\n", mb, limit); + mb = limit; + } + + mb /= num_possible_cpus(); + pages = mb << (20 - PAGE_SHIFT); + + cfs_tracefile_write_lock(); + + cfs_tcd_for_each(tcd, i, j) + tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; + + cfs_tracefile_write_unlock(); + + return 0; +} + +int cfs_trace_get_debug_mb(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + int total_pages = 0; + + cfs_tracefile_read_lock(); + + cfs_tcd_for_each(tcd, i, j) + total_pages += tcd->tcd_max_pages; + + cfs_tracefile_read_unlock(); + + return (total_pages >> (20 - PAGE_SHIFT)) + 1; +} + +static int tracefiled(void *arg) +{ + struct page_collection pc; + struct tracefiled_ctl *tctl = arg; + struct cfs_trace_page *tage; + struct 
cfs_trace_page *tmp; + struct file *filp; + char *buf; + int last_loop = 0; + int rc; + + /* we're started late enough that we pick up init's fs context */ + /* this is so broken in uml? what on earth is going on? */ + + complete(&tctl->tctl_start); + + while (1) { + wait_queue_entry_t __wait; + + pc.pc_want_daemon_pages = 0; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) + goto end_loop; + + filp = NULL; + cfs_tracefile_read_lock(); + if (cfs_tracefile[0] != 0) { + filp = filp_open(cfs_tracefile, + O_CREAT | O_RDWR | O_LARGEFILE, + 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + printk(KERN_WARNING "couldn't open %s: " + "%d\n", cfs_tracefile, rc); + } + } + cfs_tracefile_read_unlock(); + if (filp == NULL) { + put_pages_on_daemon_list(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + goto end_loop; + } + + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + struct dentry *de = file_dentry(filp); + static loff_t f_pos; + + __LASSERT_TAGE_INVARIANT(tage); + + if (f_pos >= (off_t)cfs_tracefile_size) + f_pos = 0; + else if (f_pos > i_size_read(de->d_inode)) + f_pos = i_size_read(de->d_inode); + + buf = kmap(tage->page); + rc = cfs_kernel_write(filp, buf, tage->used, &f_pos); + kunmap(tage->page); + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u " + "but wrote %d\n", tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + } + + filp_close(filp, NULL); + put_pages_on_daemon_list(&pc); + if (!list_empty(&pc.pc_pages)) { + int i; + + printk(KERN_ALERT "Lustre: trace pages aren't " + " empty\n"); + printk(KERN_ERR "total cpus(%d): ", + num_possible_cpus()); + for (i = 0; i < num_possible_cpus(); i++) + if (cpu_online(i)) + printk(KERN_ERR "%d(on) ", i); + else + printk(KERN_ERR "%d(off) ", i); + printk(KERN_ERR "\n"); + + i = 0; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) + printk(KERN_ERR "page %d belongs to cpu " + "%d\n", ++i, tage->cpu); + printk(KERN_ERR "There are %d pages unwritten\n", + i); + } + __LASSERT(list_empty(&pc.pc_pages)); +end_loop: + if (atomic_read(&tctl->tctl_shutdown)) { + if (last_loop == 0) { + last_loop = 1; + continue; + } else { + break; + } + } + init_waitqueue_entry(&__wait, current); + add_wait_queue(&tctl->tctl_waitq, &__wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + remove_wait_queue(&tctl->tctl_waitq, &__wait); + } + complete(&tctl->tctl_stop); + return 0; +} + +int cfs_trace_start_thread(void) +{ + struct tracefiled_ctl *tctl = &trace_tctl; + int rc = 0; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) + goto out; + + init_completion(&tctl->tctl_start); + init_completion(&tctl->tctl_stop); + init_waitqueue_head(&tctl->tctl_waitq); + atomic_set(&tctl->tctl_shutdown, 0); + + if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) { + rc = -ECHILD; + goto out; + } + + wait_for_completion(&tctl->tctl_start); + thread_running = 1; +out: + mutex_unlock(&cfs_trace_thread_mutex); + return rc; +} + +void cfs_trace_stop_thread(void) +{ + struct tracefiled_ctl *tctl = &trace_tctl; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) { + printk(KERN_INFO + "Lustre: shutting down debug daemon thread...\n"); + atomic_set(&tctl->tctl_shutdown, 1); + wait_for_completion(&tctl->tctl_stop); + thread_running = 0; + } + mutex_unlock(&cfs_trace_thread_mutex); +} + +int cfs_tracefile_init(int max_pages) +{ + struct cfs_trace_cpu_data *tcd; + int i; + int j; + int rc; + int factor; + + rc = 
cfs_tracefile_init_arch(); + if (rc != 0) + return rc; + + cfs_tcd_for_each(tcd, i, j) { + /* tcd_pages_factor is initialized int tracefile_init_arch. */ + factor = tcd->tcd_pages_factor; + INIT_LIST_HEAD(&tcd->tcd_pages); + INIT_LIST_HEAD(&tcd->tcd_stock_pages); + INIT_LIST_HEAD(&tcd->tcd_daemon_pages); + tcd->tcd_cur_pages = 0; + tcd->tcd_cur_stock_pages = 0; + tcd->tcd_cur_daemon_pages = 0; + tcd->tcd_max_pages = (max_pages * factor) / 100; + LASSERT(tcd->tcd_max_pages > 0); + tcd->tcd_shutting_down = 0; + } + return 0; +} + +static void trace_cleanup_on_all_cpus(void) +{ + struct cfs_trace_cpu_data *tcd; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + tcd->tcd_shutting_down = 1; + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + tcd->tcd_cur_pages = 0; + } + } +} + +static void cfs_trace_cleanup(void) +{ + struct page_collection pc; + + INIT_LIST_HEAD(&pc.pc_pages); + + trace_cleanup_on_all_cpus(); + + cfs_tracefile_fini_arch(); +} + +void cfs_tracefile_exit(void) +{ + cfs_trace_stop_thread(); + cfs_trace_cleanup(); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h new file mode 100644 index 0000000000000..c6ca34d4fb08e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h @@ -0,0 +1,319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_TRACEFILE_H__ +#define __LIBCFS_TRACEFILE_H__ + +#include + +enum cfs_trace_buf_type { + CFS_TCD_TYPE_PROC = 0, + CFS_TCD_TYPE_SOFTIRQ, + CFS_TCD_TYPE_IRQ, + CFS_TCD_TYPE_MAX +}; + +/* trace file lock routines */ + +#define TRACEFILE_NAME_SIZE 1024 +extern char cfs_tracefile[TRACEFILE_NAME_SIZE]; +extern long long cfs_tracefile_size; + +/** + * The path of debug log dump upcall script. 
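+ * The upcall is run through libcfs_run_debug_log_upcall(), declared
+ * below, which is given the path of the dump file the script is
+ * expected to process (an inference from its prototype, not spelled
+ * out in this patch).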
+ */ +extern char lnet_debug_log_upcall[1024]; + +extern void libcfs_run_debug_log_upcall(char *file); + +int cfs_tracefile_init_arch(void); +void cfs_tracefile_fini_arch(void); + +void cfs_tracefile_read_lock(void); +void cfs_tracefile_read_unlock(void); +void cfs_tracefile_write_lock(void); +void cfs_tracefile_write_unlock(void); + +int cfs_tracefile_dump_all_pages(char *filename); +void cfs_trace_debug_print(void); +void cfs_trace_flush_pages(void); +int cfs_trace_start_thread(void); +void cfs_trace_stop_thread(void); +int cfs_tracefile_init(int max_pages); +void cfs_tracefile_exit(void); + + + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob); +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_str, char *append); +int cfs_trace_allocate_string_buffer(char **str, int nob); +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_daemon_command(char *str); +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_set_debug_mb(int mb); +int cfs_trace_get_debug_mb(void); + +extern void libcfs_debug_dumplog_internal(void *arg); +extern void libcfs_register_panic_notifier(void); +extern void libcfs_unregister_panic_notifier(void); +extern int libcfs_panic_in_progress; +extern int cfs_trace_max_debug_mb(void); + +#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) +#define CFS_TRACEFILE_SIZE (500 << 20) + +#ifdef LUSTRE_TRACEFILE_PRIVATE + +/* + * Private declare for tracefile + */ +#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) + +#define CFS_TRACEFILE_SIZE (500 << 20) + +/* Size of a buffer for sprinting console messages if we can't get a page + * from system */ +#define CFS_TRACE_CONSOLE_BUFFER_SIZE 1024 + +union cfs_trace_data_union { + struct cfs_trace_cpu_data { + /* + * Even though this structure is meant to be per-CPU, locking + * is needed because in some places the data may be accessed + * from other CPUs. This lock is directly used in trace_get_tcd + * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and + * tcd_for_each_type_lock + */ + spinlock_t tcd_lock; + unsigned long tcd_lock_flags; + + /* + * pages with trace records not yet processed by tracefiled. + */ + struct list_head tcd_pages; + /* number of pages on ->tcd_pages */ + unsigned long tcd_cur_pages; + + /* + * pages with trace records already processed by + * tracefiled. These pages are kept in memory, so that some + * portion of log can be written in the event of LBUG. This + * list is maintained in LRU order. + * + * Pages are moved to ->tcd_daemon_pages by tracefiled() + * (put_pages_on_daemon_list()). LRU pages from this list are + * discarded when list grows too large. + */ + struct list_head tcd_daemon_pages; + /* number of pages on ->tcd_daemon_pages */ + unsigned long tcd_cur_daemon_pages; + + /* + * Maximal number of pages allowed on ->tcd_pages and + * ->tcd_daemon_pages each. + * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current + * implementation. + */ + unsigned long tcd_max_pages; + + /* + * preallocated pages to write trace records into. Pages from + * ->tcd_stock_pages are moved to ->tcd_pages by + * portals_debug_msg(). + * + * This list is necessary, because on some platforms it's + * impossible to perform efficient atomic page allocation in a + * non-blockable context. 
+ * + * Such platforms fill ->tcd_stock_pages "on occasion", when + * tracing code is entered in blockable context. + * + * trace_get_tage_try() tries to get a page from + * ->tcd_stock_pages first and resorts to atomic page + * allocation only if this queue is empty. ->tcd_stock_pages + * is replenished when tracing code is entered in blocking + * context (darwin-tracefile.c:trace_get_tcd()). We try to + * maintain TCD_STOCK_PAGES (40 by default) pages in this + * queue. Atomic allocation is only required if more than + * TCD_STOCK_PAGES pagesful are consumed by trace records all + * emitted in non-blocking contexts. Which is quite unlikely. + */ + struct list_head tcd_stock_pages; + /* number of pages on ->tcd_stock_pages */ + unsigned long tcd_cur_stock_pages; + + unsigned short tcd_shutting_down; + unsigned short tcd_cpu; + unsigned short tcd_type; + /* The factors to share debug memory. */ + unsigned short tcd_pages_factor; + } tcd; + char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))]; +}; + +#define TCD_MAX_TYPES 8 +extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS]; + +#define cfs_tcd_for_each(tcd, i, j) \ + for (i = 0; cfs_trace_data[i] != NULL; i++) \ + for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd); \ + j < num_possible_cpus(); \ + j++, (tcd) = &(*cfs_trace_data[i])[j].tcd) + +#define cfs_tcd_for_each_type_lock(tcd, i, cpu) \ + for (i = 0; cfs_trace_data[i] && \ + (tcd = &(*cfs_trace_data[i])[cpu].tcd) && \ + cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++) + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct page_collection { + struct list_head pc_pages; + /* + * if this flag is set, collect_pages() will spill both + * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise, + * only ->tcd_pages are spilled. + */ + int pc_want_daemon_pages; +}; + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct tracefiled_ctl { + struct completion tctl_start; + struct completion tctl_stop; + wait_queue_head_t tctl_waitq; + pid_t tctl_pid; + atomic_t tctl_shutdown; +}; + +/* + * small data-structure for each page owned by tracefiled. 
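+ * Its basic invariants (non-NULL page, used <= PAGE_SIZE, positive
+ * page refcount) are asserted by __LASSERT_TAGE_INVARIANT() below.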
+ */ +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct cfs_trace_page { + /* + * page itself + */ + struct page *page; + /* + * linkage into one of the lists in trace_data_union or + * page_collection + */ + struct list_head linkage; + /* + * number of bytes used within this page + */ + unsigned int used; + /* + * cpu that owns this page + */ + unsigned short cpu; + /* + * type(context) of this page + */ + unsigned short type; +}; + +extern void cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *m, + unsigned long stack); +extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn); + +extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking); +extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking); + +extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; +extern enum cfs_trace_buf_type cfs_trace_buf_idx_get(void); + +static inline char *cfs_trace_get_console_buffer(void) +{ + unsigned int i = get_cpu(); + unsigned int j = cfs_trace_buf_idx_get(); + + return cfs_trace_console_buffers[i][j]; +} + +static inline struct cfs_trace_cpu_data *cfs_trace_get_tcd(void) +{ + struct cfs_trace_cpu_data *tcd = + &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd; + + cfs_trace_lock_tcd(tcd, 0); + + return tcd; +} + +static inline void cfs_trace_put_tcd(struct cfs_trace_cpu_data *tcd) +{ + cfs_trace_unlock_tcd(tcd, 0); + put_cpu(); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, + struct list_head *stock); + + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage); + +extern void cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *m); + +/* ASSERTION that is safe to use within the debug system */ +#define __LASSERT(cond) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + cfs_trace_assertion_failed("ASSERTION("#cond") failed", \ + &msgdata); \ + } \ +} while (0) + +#define __LASSERT_TAGE_INVARIANT(tage) \ +do { \ + __LASSERT(tage != NULL); \ + __LASSERT(tage->page != NULL); \ + __LASSERT(tage->used <= PAGE_SIZE); \ + __LASSERT(page_count(tage->page) > 0); \ +} while (0) + +#endif /* LUSTRE_TRACEFILE_PRIVATE */ + +#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c new file mode 100644 index 0000000000000..f1676aa8f7a4d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * Copyright (c) 2014, 2017, Intel Corporation. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define __USE_FILE_OFFSET64 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct ioc_dev { + const char *dev_name; + int dev_fd; +}; + +static struct ioc_dev ioc_dev_list[10]; + +static int +open_ioc_dev(int dev_id) +{ + const char * dev_name; + + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return -EINVAL; + + dev_name = ioc_dev_list[dev_id].dev_name; + if (dev_name == NULL) { + fprintf(stderr, "unknown device id: %d\n", dev_id); + return -EINVAL; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return fd; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; +} + + +int l_ioctl(int dev_id, unsigned int opc, void *buf) +{ + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) + return fd; + + rc = ioctl(fd, opc, buf); + + return rc; +} + +/* register a device to send ioctls to. */ +int +register_ioc_dev(int dev_id, const char *dev_name) +{ + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return -EINVAL; + + unregister_ioc_dev(dev_id); + + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + + return dev_id; +} + +void +unregister_ioc_dev(int dev_id) +{ + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return; + + if (ioc_dev_list[dev_id].dev_name != NULL && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); + + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; +} + +static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + size_t len = sizeof(*data); + + len += (data->ioc_inllen1 + 7) & ~7; + len += (data->ioc_inllen2 + 7) & ~7; + return len; +} + +int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, + int max) +{ + char *ptr; + struct libcfs_ioctl_data *overlay; + data->ioc_hdr.ioc_len = libcfs_ioctl_packlen(data); + data->ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION; + + if (*pbuf != NULL && libcfs_ioctl_packlen(data) > max) + return 1; + if (*pbuf == NULL) + *pbuf = malloc(data->ioc_hdr.ioc_len); + if (*pbuf == NULL) + return 1; + overlay = (struct libcfs_ioctl_data *)*pbuf; + memcpy(*pbuf, data, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1 != NULL) { + memcpy((char *)ptr, (const char *)data->ioc_inlbuf1, + data->ioc_inllen1); + ptr += ((data->ioc_inllen1 + 7) & ~7); + } + if (data->ioc_inlbuf2 != NULL) { + memcpy((char *)ptr, (const char *)data->ioc_inlbuf2, + data->ioc_inllen2); + ptr += ((data->ioc_inllen2 + 7) & ~7); + } + + return 0; +} + +void +libcfs_ioctl_unpack(struct libcfs_ioctl_data *data, char *pbuf) +{ + struct libcfs_ioctl_data *overlay = (struct libcfs_ioctl_data *)pbuf; + char *ptr; + + /* Preserve the caller's buffer pointers */ + overlay->ioc_inlbuf1 = data->ioc_inlbuf1; + overlay->ioc_inlbuf2 = data->ioc_inlbuf2; + + memcpy(data, pbuf, sizeof(*data)); + ptr = &overlay->ioc_bulk[0]; + + if (data->ioc_inlbuf1 != NULL) { + memcpy((char *)data->ioc_inlbuf1, (const char *)ptr, + data->ioc_inllen1); + ptr += ((data->ioc_inllen1 + 7) & ~7); + } + if (data->ioc_inlbuf2 != NULL) { + memcpy((char *)data->ioc_inlbuf2, (const char *)ptr, + data->ioc_inllen2); + ptr += ((data->ioc_inllen2 + 7) & ~7); + } +} diff --git 
a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c new file mode 100644 index 0000000000000..246d420354217 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c @@ -0,0 +1,1305 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/util/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_NETDB_H +# include +#endif + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +#define IPSTRING_LENGTH 16 + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +char * +libcfs_next_nidstring(void) +{ + char *str; + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == + sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0])) + libcfs_nidstring_idx = 0; + + return str; +} + +static int +libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void +libcfs_ip_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ +static int +libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? 
*/ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + +#ifdef HAVE_GETHOSTBYNAME + /* known hostname? */ + if (('a' <= str[0] && str[0] <= 'z') || + ('A' <= str[0] && str[0] <= 'Z')) { + char *tmp; + + tmp = calloc(1, nob + 1); + if (tmp != NULL) { + struct hostent *he; + + memcpy(tmp, str, nob); + tmp[nob] = 0; + + he = gethostbyname(tmp); + + free(tmp); + + if (he != NULL) { + __u32 ip = *(__u32 *)he->h_addr; + + *addr = ntohl(ip); + return 1; + } + } + } +#endif + return 0; +} + +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; +out: + cfs_expr_list_free_list(list); + + return rc; +} + +static int +libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + assert(j++ < 4); + if (i != 0) + i += snprintf(buffer + i, count - i, "."); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +static int +cfs_ip_addr_range_gen_recurse(__u32 *ip_list, int *count, int shift, + __u32 result, struct list_head *head_el, + struct cfs_expr_list *octet_el) +{ + __u32 value = 0; + int i; + struct cfs_expr_list *next_octet_el; + struct cfs_range_expr *octet_expr; + + /* + * each octet can have multiple expressions so we need to traverse + * all of the expressions + */ + list_for_each_entry(octet_expr, &octet_el->el_exprs, re_link) { + for (i = octet_expr->re_lo; i <= octet_expr->re_hi; i++) { + if (((i - octet_expr->re_lo) % octet_expr->re_stride) == 0) { + /* + * we have a hit calculate the result and + * pass it forward to the next iteration + * of the recursion. + */ + next_octet_el = + list_entry(octet_el->el_link.next, + typeof(*next_octet_el), + el_link); + value = result | (i << (shift * 8)); + if (next_octet_el->el_link.next != head_el) { + /* + * We still have more octets in + * the IP address so traverse + * that. We're doing a depth first + * recursion here. + */ + if (cfs_ip_addr_range_gen_recurse(ip_list, count, + shift - 1, value, + head_el, + next_octet_el) == -1) + return -1; + } else { + /* + * We have hit a leaf so store the + * calculated IP address in the + * list. If we have run out of + * space stop the recursion. + */ + if (*count == -1) + return -1; + /* add ip to the list */ + ip_list[*count] = value; + (*count)--; + } + } + } + } + return 0; +} + +/* + * only generate maximum of count ip addresses from the given expression + */ +int +cfs_ip_addr_range_gen(__u32 *ip_list, int count, struct list_head *ip_addr_expr) +{ + struct cfs_expr_list *octet_el; + int idx = count - 1; + + octet_el = list_entry(ip_addr_expr->next, typeof(*octet_el), el_link); + + (void) cfs_ip_addr_range_gen_recurse(ip_list, &idx, 3, 0, &octet_el->el_link, octet_el); + + return idx; +} + +/** + * Matches address (\a addr) against address set encoded in \a list. 
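+ * The list is expected to hold one cfs_expr_list per octet, in the
+ * order produced by cfs_ip_addr_parse(); for example (illustrative,
+ * not taken from this code), a list parsed from "10.0.[1-3].*" would
+ * match the address 10.0.2.7.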
+ * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} + +static void +libcfs_decnum_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u", addr); +} + +static int +libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +static int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +static int +libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0, j = 0; + + list_for_each_entry(el, list, el_link) { + assert(j++ < 1); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + assert(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min, __u32 *max); +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min, __u32 *max); + +static struct netstrfns libcfs_netstrfns[] = { + { + .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max + }, + { + .nf_type = SOCKLND, + .nf_name = "tcp", + .nf_modname = "ksocklnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max + }, + { + .nf_type = O2IBLND, + .nf_name = "o2ib", + .nf_modname = "ko2iblnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max + }, + { + .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max + }, + { + .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = 
cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max + }, + { + .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max + } +}; + +static const size_t libcfs_nnetstrfns = + sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]); + +static struct netstrfns * +libcfs_lnd2netstrfns(__u32 lnd) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (!strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(__u32 lnd) +{ + return libcfs_lnd2netstrfns(lnd) != NULL; +} + +char * +libcfs_lnd2modname(__u32 lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? NULL : nf->nf_modname; +} + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -1; +} + +char * +libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size) +{ + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "?%u?", lnd); + else + snprintf(buf, buf_size, "%s", nf->nf_name); + + return buf; +} + +char * +libcfs_net2str_r(__u32 net, char *buf, size_t buf_size) +{ + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "<%u:%u>", lnd, nnum); + else if (nnum == 0) + snprintf(buf, buf_size, "%s", nf->nf_name); + else + snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum); + + return buf; +} + +char * +libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + if (nid == LNET_NID_ANY) { + strncpy(buf, "", buf_size); + buf[buf_size - 1] = '\0'; + return buf; + } + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) { + snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum); + } else { + size_t addr_len; + + nf->nf_addr2str(addr, buf, buf_size); + addr_len = strlen(buf); + if (nnum == 0) + snprintf(buf + addr_len, buf_size - addr_len, "@%s", + nf->nf_name); + else + snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", + nf->nf_name, nnum); + } + + return buf; +} + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf = NULL; + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if 
(nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NIDNET(LNET_NID_ANY); +} + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + assert(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} + +char * +libcfs_id2str(struct lnet_process_id id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. + */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 0 if \a src parses to '*' | \ | \ + * \retval -errno otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 0; + } + + addrrange = calloc(1, sizeof(struct addrrange)); + if (addrrange == NULL) + return -ENOMEM; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. 
+ * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. "elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + nr = calloc(1, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct cfs_lstr tmp; + struct nidrange *nr; + + tmp = *src; + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; + failed: + fprintf(stderr, "can't parse nidrange: \"%.*s\"\n", + tmp.ls_len, tmp.ls_str); + return 0; +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + free(ar); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. + * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + free(nr); + } +} + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. 
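+ *
+ * Illustrative input (not taken from this code): parsing
+ * "*@tcp 192.168.0.[1-10/2]@o2ib1" yields two nidranges, one matching
+ * every NID on tcp0 and one matching the o2ib1 NIDs whose addresses
+ * are 192.168.0.1, .3, .5, .7 and .9.
+ *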
+ * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). + * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_network(char *buffer, int count, struct nidrange *nr) +{ + struct netstrfns *nf = nr->nr_netstrfns; + + if (nr->nr_netnum == 0) + return snprintf(buffer, count, "@%s", nf->nf_name); + else + return snprintf(buffer, count, "@%s%u", + nf->nf_name, nr->nr_netnum); +} + + +/** + * Print a list of addrrange (\a addrranges) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges, + struct nidrange *nr) +{ + int i = 0; + struct addrrange *ar; + struct netstrfns *nf = nr->nr_netstrfns; + + list_for_each_entry(ar, addrranges, ar_link) { + if (i != 0) + i += snprintf(buffer + i, count - i, " "); + i += nf->nf_print_addrlist(buffer + i, count - i, + &ar->ar_numaddr_ranges); + i += cfs_print_network(buffer + i, count - i, nr); + } + return i; +} + +/** + * Print a list of nidranges (\a nidlist) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * Nidranges are separated by a space character. 
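+ * The output follows the same syntax accepted by cfs_parse_nidlist(),
+ * so it can normally be parsed back into an equivalent nidlist.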
+ * + * \retval number of characters written + */ +int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist) +{ + int i = 0; + struct nidrange *nr; + + if (count <= 0) + return 0; + + list_for_each_entry(nr, nidlist, nr_link) { + if (i != 0) + i += snprintf(buffer + i, count - i, " "); + + if (nr->nr_all != 0) { + assert(list_empty(&nr->nr_addrranges)); + i += snprintf(buffer + i, count - i, "*"); + i += cfs_print_network(buffer + i, count - i, nr); + } else { + i += cfs_print_addrranges(buffer + i, count - i, + &nr->nr_addrranges, nr); + } + } + return i; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *range; + unsigned int min_ip[4] = {0}; + unsigned int max_ip[4] = {0}; + int cur_octet = 0; + bool expect_full_octet = false; + + list_for_each_entry(expr_list, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(range, &expr_list->el_exprs, re_link) { + /* XXX: add support for multiple & non-contig. re's */ + if (re_count > 0) + return -EINVAL; + + /* if a previous octet was ranged, then all remaining + * octets must be full for contiguous range */ + if (expect_full_octet && (range->re_lo != 0 || + range->re_hi != 255)) + return -ERANGE; + + if (range->re_stride != 1) + return -ERANGE; + + if (range->re_lo > range->re_hi) + return -EINVAL; + + if (range->re_lo != range->re_hi) + expect_full_octet = true; + + min_ip[cur_octet] = range->re_lo; + max_ip[cur_octet] = range->re_hi; + + re_count++; + } + + cur_octet++; + } + + if (min_nid != NULL) + *min_nid = ((min_ip[0] << 24) | (min_ip[1] << 16) | + (min_ip[2] << 8) | min_ip[3]); + + if (max_nid != NULL) + *max_nid = ((max_ip[0] << 24) | (max_ip[1] << 16) | + (max_ip[2] << 8) | max_ip[3]); + + return 0; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + */ +static int cfs_num_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *el; + struct cfs_range_expr *re; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + + list_for_each_entry(el, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(re, &el->el_exprs, re_link) { + if (re_count > 0) + return -EINVAL; + if (re->re_lo > re->re_hi) + return -EINVAL; + + if (re->re_lo < min_addr || min_addr == 0) + min_addr = re->re_lo; + if (re->re_hi > max_addr) + max_addr = re->re_hi; + + re_count++; + } + } + + if (min_nid != NULL) + *min_nid = min_addr; + if (max_nid != NULL) + *max_nid = max_addr; + + return 0; +} + +/** + * Takes a linked list of nidrange expressions, determines the minimum + * and maximum nid and creates appropriate nid structures + * + * \param *nidlist + * \param[out] *min_nid string representation of min NID + * \param[out] *max_nid string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, + char 
*max_nid, size_t nidstr_length)
+{
+	struct nidrange *first_nidrange;
+	int netnum;
+	struct netstrfns *nf;
+	char *lndname;
+	__u32 min_addr;
+	__u32 max_addr;
+	char min_addr_str[IPSTRING_LENGTH];
+	char max_addr_str[IPSTRING_LENGTH];
+	int rc;
+
+	first_nidrange = list_entry(nidlist->next, struct nidrange, nr_link);
+
+	netnum = first_nidrange->nr_netnum;
+	nf = first_nidrange->nr_netstrfns;
+	lndname = nf->nf_name;
+
+	rc = nf->nf_min_max(nidlist, &min_addr, &max_addr);
+	if (rc < 0)
+		return rc;
+
+	nf->nf_addr2str(min_addr, min_addr_str, sizeof(min_addr_str));
+	nf->nf_addr2str(max_addr, max_addr_str, sizeof(max_addr_str));
+
+	snprintf(min_nid, nidstr_length, "%s@%s%d", min_addr_str, lndname,
+		 netnum);
+	snprintf(max_nid, nidstr_length, "%s@%s%d", max_addr_str, lndname,
+		 netnum);
+
+	return 0;
+}
+
+/**
+ * Determines the min and max NID values for num LNDs
+ *
+ * \param *nidlist
+ * \param[out] *min_nid if provided, returns the numeric value of the min NID
+ * \param[out] *max_nid if provided, returns the numeric value of the max NID
+ * \retval -EINVAL unsupported LNET range
+ * \retval -ERANGE non-contiguous LNET range
+ */
+static int cfs_num_min_max(struct list_head *nidlist, __u32 *min_nid,
+			   __u32 *max_nid)
+{
+	struct nidrange *nr;
+	struct addrrange *ar;
+	unsigned int tmp_min_addr = 0;
+	unsigned int tmp_max_addr = 0;
+	unsigned int min_addr = 0;
+	unsigned int max_addr = 0;
+	int nidlist_count = 0;
+	int rc;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		/* only a single nidrange is supported for numeric LNDs */
+		if (nidlist_count > 0)
+			return -EINVAL;
+
+		list_for_each_entry(ar, &nr->nr_addrranges, ar_link) {
+			rc = cfs_num_ar_min_max(ar, &tmp_min_addr,
+						&tmp_max_addr);
+			if (rc < 0)
+				return rc;
+
+			if (tmp_min_addr < min_addr || min_addr == 0)
+				min_addr = tmp_min_addr;
+			if (tmp_max_addr > max_addr)
+				max_addr = tmp_max_addr;
+		}
+
+		nidlist_count++;
+	}
+	if (max_nid != NULL)
+		*max_nid = max_addr;
+	if (min_nid != NULL)
+		*min_nid = min_addr;
+
+	return 0;
+}
+
+/**
+ * Takes an nidlist and determines the minimum and maximum
+ * ip addresses.
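+ * A nidrange of the form '*@<net>' (nr_all set) yields the full range
+ * 0.0.0.0 through 255.255.255.255, as handled below.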
+ * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + __u32 tmp_min_ip_addr = 0; + __u32 tmp_max_ip_addr = 0; + __u32 min_ip_addr = 0; + __u32 max_ip_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + if (nr->nr_all) { + min_ip_addr = 0; + max_ip_addr = 0xffffffff; + break; + } + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_ip_ar_min_max(ar, &tmp_min_ip_addr, + &tmp_max_ip_addr); + if (rc < 0) + return rc; + + if (tmp_min_ip_addr < min_ip_addr || min_ip_addr == 0) + min_ip_addr = tmp_min_ip_addr; + if (tmp_max_ip_addr > max_ip_addr) + max_ip_addr = tmp_max_ip_addr; + } + + nidlist_count++; + } + + if (max_nid != NULL) + *max_nid = max_ip_addr; + if (min_nid != NULL) + *min_nid = min_ip_addr; + + return 0; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c new file mode 100644 index 0000000000000..18fe84dc53f6a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c @@ -0,0 +1,155 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the GNU Lesser General Public License + * (LGPL) version 2.1 or (at your discretion) any later version. + * (LGPL) version 2.1 accompanies this distribution, and is available at + * http://www.gnu.org/licenses/lgpl-2.1.html + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * LGPL HEADER END + */ +/* + * libcfs/libcfs/utils/param.c + * + * This code handles user interaction with the configuration interface + * to the Lustre file system to fine tune it. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Get parameter path matching the pattern + * + * \param[out] paths glob_t structure used to hold the final result + * \param[in] pattern the pattern containing sprintf format specifiers + * which will be used to create the path to match + * + * The \param pattern is appended to the default path glob to complete the + * absolute path to the file the caller is requesting. If the results point + * to one or more files that exist those results are stored in the \param + * paths glob_t structure that is passed by the caller. + * + * Lustre tunables traditionally were in /proc/{sys,fs}/{lnet,lustre} + * but in upstream kernels starting with Linux 4.2 these parameters + * have been moved to /sys/fs/lustre and /sys/kernel/debug/{lnet,lustre} + * so the user tools need to check both locations. + * + * \retval 0 for success, with results stored in \param paths. + * \retval -1 for failure with errno set to report the reason. + */ +int +cfs_get_param_paths(glob_t *paths, const char *pattern, ...) 
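+/*
+ * Typical use (illustrative sketch; the "version" parameter name and
+ * the resulting path are examples, not taken from this code):
+ *
+ *	glob_t paths;
+ *
+ *	if (cfs_get_param_paths(&paths, "version") == 0) {
+ *		... read paths.gl_pathv[0], e.g. /sys/fs/lustre/version ...
+ *		globfree(&paths);
+ *	}
+ */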
+{ + char topdir[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}," + "/proc/{fs,sys}/{lnet,lustre}}"; + static bool test_mounted = false; + char path[PATH_MAX]; + char buf[PATH_MAX]; + struct statfs statfsbuf; + va_list args; + int rc; + + + if (test_mounted) + goto skip_mounting; + test_mounted = true; + + rc = statfs("/sys/kernel/debug/", &statfsbuf); + if (rc == 0 && statfsbuf.f_type == DEBUGFS_MAGIC) + goto skip_mounting; + + if (mount("none", "/sys/kernel/debug", "debugfs", 0, "") == -1) { + /* Already mounted or don't have permission to mount is okay */ + if (errno != EPERM && errno != EBUSY) + fprintf(stderr, "Warning: failed to mount debug: %s\n", + strerror(errno)); + } else { + struct stat mtab; + + /* This is all for RHEL6 which is old school. Can be removed + * later when RHEL6 client support is dropped. */ + rc = lstat(_PATH_MOUNTED, &mtab); + if (!rc && !S_ISLNK(mtab.st_mode)) { + FILE *fp = setmntent(_PATH_MOUNTED, "r+"); + + if (fp != NULL) { + const struct mntent fs = { + .mnt_fsname = "debugfs", + .mnt_dir = "/sys/kernel/debug", + .mnt_type = "debugfs", + .mnt_opts = "rw,relatime", + }; + + rc = addmntent(fp, &fs); + if (rc) { + fprintf(stderr, + "failed to add debugfs to %s: %s\n", + _PATH_MOUNTED, strerror(errno)); + } + endmntent(fp); + } else { + fprintf(stderr, "could not open %s: %s\n", + _PATH_MOUNTED, strerror(errno)); + } + } + } +skip_mounting: + va_start(args, pattern); + rc = vsnprintf(buf, sizeof(buf), pattern, args); + va_end(args); + if (rc < 0) { + return rc; + } else if (rc >= sizeof(buf)) { + errno = EINVAL; + return -1; + } + + if (snprintf(path, sizeof(path), "%s/%s", topdir, buf) >= + sizeof(path)) { + errno = E2BIG; + return -1; + } + + rc = glob(path, GLOB_BRACE, NULL, paths); + if (rc != 0) { + switch (rc) { + case GLOB_NOSPACE: + errno = ENOMEM; + break; + case GLOB_ABORTED: + errno = ENODEV; + break; + case GLOB_NOMATCH: + default: + errno = ENOENT; + break; + } + rc = -1; + } + + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c new file mode 100644 index 0000000000000..861f97a3c51e6 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c @@ -0,0 +1,846 @@ +/* + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * Copyright (c) 2014, 2017, Intel Corporation. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_LIBREADLINE +# include +# include +#endif /* HAVE_LIBREADLINE */ +#include +#include + +#include +#include + +static command_t * top_level; /* Top level of commands, initialized by + * InitParser */ +static char * parser_prompt = NULL;/* Parser prompt, set by InitParser */ +static int done; /* Set to 1 if user types exit or quit */ +static int ignore_errors; /* Normally, the parser will quit when + an error occurs in non-interacive + mode. Setting this to non-zero will + force it to keep buggering on. */ + + +/* static functions */ +static char *skipwhitespace(char *s); +static char *skiptowhitespace(char *s); +static command_t *find_cmd(char *name, command_t cmds[], char **next); +static int process(char *s, char **next, command_t *lookup, command_t **result, + char **prev); + +static char * skipwhitespace(char * s) +{ + char * t; + int len; + + len = (int)strlen(s); + for (t = s; t <= s + len && isspace(*t); t++); + return(t); +} + + +static char * skiptowhitespace(char * s) +{ + char * t; + + for (t = s; *t && !isspace(*t); t++); + return(t); +} + +static int line2args(char *line, char **argv, int maxargs) +{ + char *arg; + int i = 0; + + arg = strtok(line, " \t"); + if (arg == NULL || maxargs < 1) + return 0; + + argv[i++] = arg; + while ((arg = strtok(NULL, " \t")) != NULL && i < maxargs) + argv[i++] = arg; + return i; +} + +/* find a command -- return it if unique otherwise print alternatives */ +static command_t *Parser_findargcmd(char *name, command_t cmds[]) +{ + command_t *cmd; + + for (cmd = cmds; cmd->pc_name; cmd++) { + if (strcmp(name, cmd->pc_name) == 0) + return cmd; + } + return NULL; +} + +void Parser_ignore_errors(int ignore) +{ + ignore_errors = ignore; +} + +int Parser_execarg(int argc, char **argv, command_t cmds[]) +{ + command_t *cmd; + + cmd = Parser_findargcmd(argv[0], cmds); + if (cmd != NULL && cmd->pc_func != NULL) { + int rc = (cmd->pc_func)(argc, argv); + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + return rc; + } else { + printf("Try interactive use without arguments or use one of:\n"); + for (cmd = cmds; cmd->pc_name; cmd++) + printf("\"%s\"\n", cmd->pc_name); + printf("as argument.\n"); + } + return -1; +} + +/* returns the command_t * (NULL if not found) corresponding to a + _partial_ match with the first token in name. It sets *next to + point to the following token. Does not modify *name. */ +static command_t * find_cmd(char * name, command_t cmds[], char ** next) +{ + int i, len; + + if (!cmds || !name ) + return NULL; + + /* This sets name to point to the first non-white space character, + and next to the first whitespace after name, len to the length: do + this with strtok*/ + name = skipwhitespace(name); + *next = skiptowhitespace(name); + len = (int)(*next - name); + if (len == 0) + return NULL; + + for (i = 0; cmds[i].pc_name; i++) { + if (strncasecmp(name, cmds[i].pc_name, len) == 0) { + *next = skipwhitespace(*next); + return(&cmds[i]); + } + } + return NULL; +} + +/* Recursively process a command line string s and find the command + corresponding to it. This can be ambiguous, full, incomplete, + non-existent. */ +static int process(char *s, char ** next, command_t *lookup, + command_t **result, char **prev) +{ + *result = find_cmd(s, lookup, next); + *prev = s; + + /* non existent */ + if (!*result) + return CMD_NONE; + + /* found entry: is it ambigous, i.e. 
not exact command name and + more than one command in the list matches. Note that find_cmd + points to the first ambiguous entry */ + if (strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name))) { + char *another_next; + command_t *another_result = find_cmd(s, (*result) + 1, + &another_next); + int found_another = 0; + + while (another_result) { + if (strncasecmp(s, another_result->pc_name, + strlen(another_result->pc_name)) == 0){ + *result = another_result; + *next = another_next; + goto got_it; + } + another_result = find_cmd(s, another_result + 1, + &another_next); + found_another = 1; + } + if (found_another) + return CMD_AMBIG; + } + +got_it: + /* found a unique command: component or full? */ + if ((*result)->pc_func != NULL) { + return CMD_COMPLETE; + } else { + if (**next == '\0') { + return CMD_INCOMPLETE; + } else { + return process(*next, next, (*result)->pc_sub_cmd, + result, prev); + } + } +} + +#ifdef HAVE_LIBREADLINE +static command_t * match_tbl; /* Command completion against this table */ +static char * command_generator(const char * text, int state) +{ + static int index, + len; + char *name; + + /* Do we have a match table? */ + if (!match_tbl) + return NULL; + + /* If this is the first time called on this word, state is 0 */ + if (!state) { + index = 0; + len = (int)strlen(text); + } + + /* Return next name in the command list that paritally matches test */ + while ( (name = (match_tbl + index)->pc_name) ) { + index++; + + if (strncasecmp(name, text, len) == 0) { + return(strdup(name)); + } + } + + /* No more matches */ + return NULL; +} + +/* probably called by readline */ +static char **command_completion(const char *text, int start, int end) +{ + command_t * table; + char * pos; + + match_tbl = top_level; + + for (table = find_cmd(rl_line_buffer, match_tbl, &pos); + table; table = find_cmd(pos, match_tbl, &pos)) + { + + if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd; + } + + return rl_completion_matches(text, command_generator); +} +#endif + +/* take a string and execute the function or print help */ +int execute_line(char * line) +{ + command_t *cmd, *ambig; + char *prev; + char *next, *tmp; + char *argv[MAXARGS]; + int i; + int rc = 0; + + switch (process(line, &next, top_level, &cmd, &prev)) { + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while ((ambig = find_cmd(prev, cmd, &tmp))) { + fprintf(stderr, "%s ", ambig->pc_name); + cmd = ambig + 1; + } + fprintf(stderr, "\n"); + break; + case CMD_NONE: + fprintf(stderr, "No such command, type help\n"); + break; + case CMD_INCOMPLETE: + fprintf(stderr, "'%s' incomplete command. Use '%s x' where " + "x is one of:\n", line, line); + fprintf(stderr, "\t"); + for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) + fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); + fprintf(stderr, "\n"); + break; + case CMD_COMPLETE: + optind = 0; + i = line2args(line, argv, MAXARGS); + rc = (cmd->pc_func)(i, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + + break; + } + + return rc; +} + +#ifdef HAVE_LIBREADLINE +static void noop_int_fn(int unused) { } +static void noop_void_fn(void) { } +#endif + +/* just in case you're ever in an airplane and discover you + * forgot to install readline-dev. 
:) */ +static int init_input(void) +{ + int interactive = isatty(fileno(stdin)); + +#ifdef HAVE_LIBREADLINE + using_history(); + stifle_history(HISTORY); + + if (!interactive) { + rl_prep_term_function = noop_int_fn; + rl_deprep_term_function = noop_void_fn; + } + + rl_attempted_completion_function = command_completion; + rl_completion_entry_function = command_generator; +#endif + return interactive; +} + +#ifndef HAVE_LIBREADLINE +#define add_history(s) +char * readline(char * prompt) +{ + int size = 2048; + char *line = malloc(size); + char *ptr = line; + int c; + int eof = 0; + + if (line == NULL) + return NULL; + if (prompt) + printf ("%s", prompt); + + while (1) { + if ((c = fgetc(stdin)) != EOF) { + if (c == '\n') + goto out; + *ptr++ = (char)c; + + if (ptr - line >= size - 1) { + char *tmp; + + size *= 2; + tmp = malloc(size); + if (tmp == NULL) + goto outfree; + memcpy(tmp, line, ptr - line); + ptr = tmp + (ptr - line); + free(line); + line = tmp; + } + } else { + eof = 1; + if (ferror(stdin) || feof(stdin)) + goto outfree; + goto out; + } + } +out: + *ptr = 0; + if (eof && (strlen(line) == 0)) { + free(line); + line = NULL; + } + return line; +outfree: + free(line); + return NULL; +} +#endif + +/* this is the command execution machine */ +int Parser_commands(void) +{ + char *line, *s; + int rc = 0, save_error = 0; + int interactive; + + interactive = init_input(); + + while(!done) { + line = readline(interactive ? parser_prompt : NULL); + + if (!line) break; + + s = skipwhitespace(line); + + if (*s) { + add_history(s); + rc = execute_line(s); + } + /* stop on error if not-interactive */ + if (rc != 0 && !interactive) { + if (save_error == 0) + save_error = rc; + if (!ignore_errors) + done = 1; + } + + free(line); + } + if (save_error) + rc = save_error; + return rc; +} + + +/* sets the parser prompt */ +void Parser_init(char * prompt, command_t * cmds) +{ + done = 0; + top_level = cmds; + if (parser_prompt) free(parser_prompt); + parser_prompt = strdup(prompt); +} + +/* frees the parser prompt */ +void Parser_exit(int argc, char *argv[]) +{ + done = 1; + free(parser_prompt); + parser_prompt = NULL; +} + +/* convert a string to an integer */ +int Parser_int(char *s, int *val) +{ + int ret; + + if (*s != '0') + ret = sscanf(s, "%d", val); + else if (*(s+1) != 'x') + ret = sscanf(s, "%o", val); + else { + s++; + ret = sscanf(++s, "%x", val); + } + + return(ret); +} + + +void Parser_qhelp(int argc, char *argv[]) { + + printf("usage: %s [COMMAND] [OPTIONS]... [ARGS]\n", + program_invocation_short_name); + printf("Without any parameters, interactive mode is invoked\n"); + + printf("Try '%s help ' or '%s --list-commands' for more information\n", + program_invocation_short_name, program_invocation_short_name); +} + +int Parser_help(int argc, char **argv) +{ + char line[1024]; + char *next, *prev, *tmp; + command_t *result, *ambig; + int i; + + if ( argc == 1 ) { + Parser_qhelp(argc, argv); + return 0; + } + + /* Joining command line arguments without space is not critical here + * because of this string is used for search a help topic and assume + * that only one argument will be (the name of topic). For example: + * lst > help ping run + * pingrun: Unknown command. */ + line[0] = '\0'; + for (i = 1; i < argc; i++) { + if (strlen(argv[i]) >= sizeof(line) - strlen(line)) + return -E2BIG; + /* The function strlcat() cannot be used here because of + * this function is used in LNet utils that is not linked + * with libcfs.a. 
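+	 * The length check just above guarantees that argv[i] still fits, +	 * so the bounded strncat() below cannot overflow 'line'.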
*/ + strncat(line, argv[i], sizeof(line) - strlen(line)); + } + + switch ( process(line, &next, top_level, &result, &prev) ) { + case CMD_COMPLETE: + fprintf(stderr, "%s: %s\n",line, result->pc_help); + break; + case CMD_NONE: + fprintf(stderr, "%s: Unknown command.\n", line); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; result->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, result, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + result = ambig + 1; + } + fprintf(stderr, "\n"); + break; + } + return 0; +} + + +void Parser_printhelp(char *cmd) +{ + char *argv[] = { "help", cmd }; + Parser_help(2, argv); +} + + +/************************************************************************* + * COMMANDS * + *************************************************************************/ + +/** + * Parser_list_commands() - Output a list of the supported commands. + * @cmdlist: Array of structures describing the commands. + * @buffer: String buffer used to temporarily store the output text. + * @buf_size: Length of the string buffer. + * @parent_cmd: When called recursively, contains the name of the parent cmd. + * @col_start: Column where printing should begin. + * @col_num: The number of commands printed in a single row. + * + * The commands and subcommands supported by the utility are printed, arranged + * into several columns for readability. If a command supports subcommands, the + * function is called recursively, and the name of the parent command is + * supplied so that it can be prepended to the names of the subcommands. + * + * Return: The number of items that were printed. 
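+ * + * Note: entries longer than the per-column width are truncated, so each + * printed row always fits within @buffer.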
+ */ +int Parser_list_commands(const command_t *cmdlist, char *buffer, + size_t buf_size, const char *parent_cmd, + int col_start, int col_num) +{ + int col = col_start; + int char_max; + int len; + int count = 0; + int rc; + + if (col_start >= col_num) + return 0; + + char_max = (buf_size - 1) / col_num; /* Reserve 1 char for NUL */ + + for (; cmdlist->pc_name != NULL; cmdlist++) { + if (cmdlist->pc_func == NULL && cmdlist->pc_sub_cmd == NULL) + break; + count++; + if (parent_cmd != NULL) + len = snprintf(&buffer[col * char_max], + char_max + 1, "%s %s", parent_cmd, + cmdlist->pc_name); + else + len = snprintf(&buffer[col * char_max], + char_max + 1, "%s", cmdlist->pc_name); + + /* Add trailing spaces to pad the entry to the column size */ + if (len < char_max) { + snprintf(&buffer[col * char_max] + len, + char_max - len + 1, "%*s", char_max - len, + " "); + } else { + buffer[(col + 1) * char_max - 1] = ' '; + } + + col++; + if (col >= col_num) { + fprintf(stdout, "%s\n", buffer); + col = 0; + buffer[0] = '\0'; + } + + if (cmdlist->pc_sub_cmd != NULL) { + rc = Parser_list_commands(cmdlist->pc_sub_cmd, buffer, + buf_size, cmdlist->pc_name, + col, col_num); + col = (col + rc) % col_num; + count += rc; + } + } + if (parent_cmd == NULL && col != 0) + fprintf(stdout, "%s\n", buffer); + return count; +} + +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len) +{ + char *line = NULL; + int size = strlen(prompt) + strlen(deft) + 8; + char *theprompt; + theprompt = malloc(size); + assert(theprompt); + + sprintf(theprompt, "%s [%s]: ", prompt, deft); + + line = readline(theprompt); + free(theprompt); + + /* The function strlcpy() cannot be used here because of + * this function is used in LNet utils that is not linked + * with libcfs.a. */ + if (line == NULL || *line == '\0') + strncpy(res, deft, len); + else + strncpy(res, line, len); + res[len - 1] = '\0'; + + if (line != NULL) { + free(line); + return res; + } + return NULL; +} + +/* get integer from prompt, loop forever to get it */ +int Parser_getint(const char *prompt, long min, long max, long deft, int base) +{ + int rc; + long result; + char *line; + int size = strlen(prompt) + 40; + char *theprompt = malloc(size); + assert(theprompt); + sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft); + + fflush(stdout); + + do { + line = NULL; + line = readline(theprompt); + if ( !line ) { + fprintf(stdout, "Please enter an integer.\n"); + fflush(stdout); + continue; + } + if ( *line == '\0' ) { + free(line); + result = deft; + break; + } + rc = Parser_arg2int(line, &result, base); + free(line); + if ( rc != 0 ) { + fprintf(stdout, "Invalid string.\n"); + fflush(stdout); + } else if ( result > max || result < min ) { + fprintf(stdout, "Error: response must lie between %ld and %ld.\n", + min, max); + fflush(stdout); + } else { + break; + } + } while ( 1 ) ; + + if (theprompt) + free(theprompt); + return result; + +} + +/* get boolean (starting with YyNn; loop forever */ +int Parser_getbool(const char *prompt, int deft) +{ + int result = 0; + char *line; + int size = strlen(prompt) + 8; + char *theprompt = malloc(size); + assert(theprompt); + + fflush(stdout); + + if ( deft != 0 && deft != 1 ) { + fprintf(stderr, "Error: Parser_getbool given bad default %d\n", + deft); + assert ( 0 ); + } + sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? 
"N" : "Y"); + + do { + line = NULL; + line = readline(theprompt); + if ( line == NULL ) { + result = deft; + break; + } + if ( *line == '\0' ) { + result = deft; + break; + } + if ( *line == 'y' || *line == 'Y' ) { + result = 1; + break; + } + if ( *line == 'n' || *line == 'N' ) { + result = 0; + break; + } + if ( line ) + free(line); + fprintf(stdout, "Invalid string. Must start with yY or nN\n"); + fflush(stdout); + } while ( 1 ); + + if ( line ) + free(line); + if ( theprompt ) + free(theprompt); + return result; +} + +/* parse int out of a string or prompt for it */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base) +{ + long result; + int rc; + + rc = Parser_arg2int(inp, &result, base); + + if ( rc == 0 ) { + return result; + } else { + return Parser_getint(prompt, deft, min, max, base); + } +} + +/* parse int out of a string or prompt for it */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len) +{ + if ( inp == NULL || *inp == '\0' ) { + return Parser_getstr(prompt, deft, answer, len); + } else + return inp; +} + +/* change a string into a number: return 0 on success. No invalid characters + allowed. The processing of base and validity follows strtol(3)*/ +int Parser_arg2int(const char *inp, long *result, int base) +{ + char *endptr; + + if ( (base !=0) && (base < 2 || base > 36) ) + return 1; + + *result = strtol(inp, &endptr, base); + + if ( *inp != '\0' && *endptr == '\0' ) + return 0; + else + return 1; +} + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(unsigned long *sizep, char *str) +{ + unsigned long size; + char mod[32]; + + switch (sscanf(str, "%lu%1[gGmMkK]", &size, mod)) { + default: + return -1; + + case 1: + *sizep = size; + return 0; + + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return 0; + + case 'm': + case 'M': + *sizep = size << 20; + return 0; + + case 'k': + case 'K': + *sizep = size << 10; + return 0; + + default: + *sizep = size; + return 0; + } + } +} + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool (int *b, char *str) { + if (!strcasecmp (str, "no") || + !strcasecmp (str, "n") || + !strcasecmp (str, "off") || + !strcasecmp (str, "down") || + !strcasecmp (str, "disable")) + { + *b = 0; + return (0); + } + + if (!strcasecmp (str, "yes") || + !strcasecmp (str, "y") || + !strcasecmp (str, "on") || + !strcasecmp (str, "up") || + !strcasecmp (str, "enable")) + { + *b = 1; + return (0); + } + + return (-1); +} + +int Parser_quit(int argc, char **argv) +{ + argc = argc; + argv = argv; + done = 1; + return 0; +} + +int Parser_version(int argc, char **argv) +{ + fprintf(stdout, "%s %s\n", program_invocation_short_name, + LUSTRE_VERSION_STRING); + return 0; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c new file mode 100644 index 0000000000000..2c1a24cacebb2 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c @@ -0,0 +1,526 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * String manipulation functions. + * + * libcfs/libcfs/util/string.c + * + * Author: Nathan Rutman + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. + * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. + * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + char *endp; + + *num = strtoul(str, &endp, 0); + if (endp == str) + return 0; + + for (; endp < str + nob; endp++) { + if (!isspace(*endp)) + return 0; + } + + return (*num >= min && *num <= max); +} + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + * src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. 
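+ * + * For example, '3' parses to the single value 3; with \a bracketed set, + * '0-7' covers the values 0 through 7 and '0-7/2' covers 0, 2, 4 and 6.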
+ */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + re = calloc(1, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + free(re); + return -EINVAL; +} + +/** + * Print the range expression \a re into specified \a buffer. + * If \a bracketed is true, expression does not need additional + * brackets. + * + * \retval number of characters written + */ +static int +cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr, + bool bracketed) +{ + int i; + char s[] = "["; + char e[] = "]"; + + if (bracketed) + s[0] = e[0] = '\0'; + + if (expr->re_lo == expr->re_hi) + i = snprintf(buffer, count, "%u", expr->re_lo); + else if (expr->re_stride == 1) + i = snprintf(buffer, count, "%s%u-%u%s", + s, expr->re_lo, expr->re_hi, e); + else + i = snprintf(buffer, count, "%s%u-%u/%u%s", + s, expr->re_lo, expr->re_hi, + expr->re_stride, e); + return i; +} + +/** + * Print a list of range expressions (\a expr_list) into specified \a buffer. + * If the list contains several expressions, separate them with comma + * and surround the list with brackets. + * + * \retval number of characters written + */ +int +cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + int i = 0, j = 0; + int numexprs = 0; + + if (count <= 0) + return 0; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) + numexprs++; + + if (numexprs > 1) + i += snprintf(buffer + i, count - i, "["); + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (j++ != 0) + i += snprintf(buffer + i, count - i, ","); + i += cfs_range_expr_print(buffer + i, count - i, expr, + numexprs > 1); + } + + if (numexprs > 1) + i += snprintf(buffer + i, count - i, "]"); + + return i; +} + +/** + * Matches value (\a value) against ranges expression list \a expr_list. 
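+ * A value matches when some expression's [re_lo, re_hi] range contains it + * and (value - re_lo) is a multiple of re_stride.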
+ * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) + return -EINVAL; + + val = calloc(sizeof(val[0]), count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} + +void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + free(values); +} + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link); + list_del(&expr->re_link); + free(expr); + } + + free(expr_list); +} + +/** + * Parses \ token of the syntax. + * + * \retval 0 if \a str parses to \ | \ + * \retval -errno otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + expr_list = calloc(1, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) { + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} + +/** + * Frees cfs_expr_list structures of \a list. + * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. 
+ * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} + +/** + * cfs_abs_path() - Get the absolute path of a relative path + * @request_path: The relative path to be resolved + * @resolved_path: Set to the resolved absolute path + * + * Returns the canonicalized absolute pathname. This function is a wrapper to + * realpath, but will work even if the target file does not exist. All + * directories in the path must exist. + * + * Return: On success, 0 is returned and resolved_path points to an allocated + * string containing the absolute pathname. On error, errno is set + * appropriately, -errno is returned, and resolved_path points to NULL. + */ +int cfs_abs_path(const char *request_path, char **resolved_path) +{ + char buf[PATH_MAX + 1] = ""; + char *path; + char *ptr; + int len; + int rc = 0; + const char *fmt; + + path = malloc(sizeof(buf)); + if (path == NULL) + return -ENOMEM; + + if (request_path[0] != '/') { + if (getcwd(path, sizeof(buf) - 1) == NULL) { + rc = -errno; + goto out; + } + len = snprintf(buf, sizeof(buf), "%s/%s", path, request_path); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + /* skip duplicate leading '/' */ + len = snprintf(buf, sizeof(buf), "%s", + request_path + strspn(request_path, "/") - 1); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + + /* if filename not in root directory, call realpath for parent path */ + ptr = strrchr(buf, '/'); + if (ptr != buf) { + *ptr = '\0'; + if (path != realpath(buf, path)) { + rc = -errno; + goto out; + } + /* add the filename back */ + len = strlen(path); + fmt = (path[len - 1] == '/') ? "%s" : "/%s"; + len = snprintf(path + len, sizeof(buf) - len, fmt, ptr + 1); + if (len >= sizeof(buf) - len) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + len = snprintf(path, sizeof(buf), "%s", buf); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + +out: + if (rc == 0) { + *resolved_path = path; + } else { + *resolved_path = NULL; + free(path); + } + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c new file mode 100644 index 0000000000000..dd451dd807bc1 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c @@ -0,0 +1,506 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/watchdog.c + * + * Author: Jacob Berkman + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "tracefile.h" + +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + +struct lc_watchdog { + spinlock_t lcw_lock; /* check or change lcw_list */ + int lcw_refcount; /* must hold lcw_pending_timers_lock */ + struct timer_list lcw_timer; /* kernel timer */ + struct list_head lcw_list; /* chain on pending list */ + ktime_t lcw_last_touched;/* last touched stamp */ + struct task_struct *lcw_task; /* owner task */ + void (*lcw_callback)(pid_t, void *); + void *lcw_data; + + pid_t lcw_pid; + + enum { + LC_WATCHDOG_DISABLED, + LC_WATCHDOG_ENABLED, + LC_WATCHDOG_EXPIRED + } lcw_state; +}; + +#ifdef WITH_WATCHDOG +/* + * The dispatcher will complete lcw_start_completion when it starts, + * and lcw_stop_completion when it exits. + * Wake lcw_event_waitq to signal timer callback dispatches. + */ +static struct completion lcw_start_completion; +static struct completion lcw_stop_completion; +static wait_queue_head_t lcw_event_waitq; + +/* + * Set this and wake lcw_event_waitq to stop the dispatcher. + */ +enum { + LCW_FLAG_STOP = 0 +}; +static unsigned long lcw_flags = 0; + +/* + * Number of outstanding watchdogs. + * When it hits 1, we start the dispatcher. + * When it hits 0, we stop the dispatcher. + */ +static __u32 lcw_refcount = 0; +static DEFINE_MUTEX(lcw_refcount_mutex); + +/* + * List of timers that have fired that need their callbacks run by the + * dispatcher. + */ +/* BH lock! */ +static DEFINE_SPINLOCK(lcw_pending_timers_lock); +static struct list_head lcw_pending_timers = LIST_HEAD_INIT(lcw_pending_timers); + +/* Last time a watchdog expired */ +static time64_t lcw_last_watchdog_time; +static int lcw_recent_watchdog_count; + +static void +lcw_dump(struct lc_watchdog *lcw) +{ + ENTRY; + rcu_read_lock(); + if (lcw->lcw_task == NULL) { + LCONSOLE_WARN("Process %d was not found in the task " + "list; watchdog callback may be incomplete\n", + (int)lcw->lcw_pid); + } else { + libcfs_debug_dumpstack(lcw->lcw_task); + } + + rcu_read_unlock(); + EXIT; +} + +static void lcw_cb(cfs_timer_cb_arg_t data) +{ + struct lc_watchdog *lcw = cfs_from_timer(lcw, data, lcw_timer); + ENTRY; + + if (lcw->lcw_state != LC_WATCHDOG_ENABLED) { + EXIT; + return; + } + + lcw->lcw_state = LC_WATCHDOG_EXPIRED; + + spin_lock_bh(&lcw->lcw_lock); + LASSERT(list_empty(&lcw->lcw_list)); + + spin_lock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount++; /* +1 for pending list */ + list_add(&lcw->lcw_list, &lcw_pending_timers); + wake_up(&lcw_event_waitq); + + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + EXIT; +} + +static int is_watchdog_fired(void) +{ + int rc; + + if (test_bit(LCW_FLAG_STOP, &lcw_flags)) + return 1; + + spin_lock_bh(&lcw_pending_timers_lock); + rc = !list_empty(&lcw_pending_timers); + spin_unlock_bh(&lcw_pending_timers_lock); + return rc; +} + +static void lcw_dump_stack(struct lc_watchdog *lcw) +{ + time64_t current_time = ktime_get_seconds(); + struct timespec64 timediff; + time64_t delta_time; + + timediff = ktime_to_timespec64(ktime_sub(ktime_get(), + lcw->lcw_last_touched)); + + /* LU-9235: Don't dump stack if the thread is just touched. */ + if (timediff.tv_sec == 0) + return; + + /* + * Check to see if we should throttle the watchdog timer to avoid + * too many dumps going to the console thus triggering an NMI. 
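+ * Once more than three traces have been dumped within a single + * libcfs_watchdog_ratelimit window, later expirations in that window + * only emit the short rate-limit warning.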
+ */ + delta_time = current_time - lcw_last_watchdog_time; + if (delta_time < libcfs_watchdog_ratelimit && + lcw_recent_watchdog_count > 3) { + LCONSOLE_WARN("Service thread pid %u was inactive for %llu.%.02lus. Watchdog stack traces are limited to 3 per %d seconds, skipping this one.\n", + (int)lcw->lcw_pid, + (unsigned long long)timediff.tv_sec, + timediff.tv_nsec / (NSEC_PER_SEC / 100), + libcfs_watchdog_ratelimit); + } else { + if (delta_time < libcfs_watchdog_ratelimit) { + lcw_recent_watchdog_count++; + } else { + memcpy(&lcw_last_watchdog_time, ¤t_time, + sizeof(current_time)); + lcw_recent_watchdog_count = 0; + } + + LCONSOLE_WARN("Service thread pid %u was inactive for %llu.%.02lus. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n", + (int)lcw->lcw_pid, + (unsigned long long)timediff.tv_sec, + timediff.tv_nsec / (NSEC_PER_SEC / 100)); + lcw_dump(lcw); + } +} + +/* + * Provided watchdog handlers + */ + +static void lc_watchdog_dumplog(pid_t pid, void *data) +{ + libcfs_debug_dumplog_internal((void *)((uintptr_t)pid)); +} + +static int lcw_dispatch_main(void *data) +{ + int rc = 0; + struct lc_watchdog *lcw; + struct list_head zombies = LIST_HEAD_INIT(zombies); + + ENTRY; + + complete(&lcw_start_completion); + + while (1) { + int dumplog = 1; + + rc = wait_event_interruptible(lcw_event_waitq, + is_watchdog_fired()); + CDEBUG(D_INFO, "Watchdog got woken up...\n"); + if (test_bit(LCW_FLAG_STOP, &lcw_flags)) { + CDEBUG(D_INFO, "LCW_FLAG_STOP set, shutting down...\n"); + + spin_lock_bh(&lcw_pending_timers_lock); + rc = !list_empty(&lcw_pending_timers); + spin_unlock_bh(&lcw_pending_timers_lock); + if (rc) { + CERROR("pending timers list was not empty at " + "time of watchdog dispatch shutdown\n"); + } + break; + } + + spin_lock_bh(&lcw_pending_timers_lock); + while (!list_empty(&lcw_pending_timers)) { + int is_dumplog; + + lcw = list_entry(lcw_pending_timers.next, + struct lc_watchdog, lcw_list); + /* +1 ref for callback to make sure lwc wouldn't be + * deleted after releasing lcw_pending_timers_lock */ + lcw->lcw_refcount++; + spin_unlock_bh(&lcw_pending_timers_lock); + + /* lock ordering */ + spin_lock_bh(&lcw->lcw_lock); + spin_lock_bh(&lcw_pending_timers_lock); + + if (list_empty(&lcw->lcw_list)) { + /* already removed from pending list */ + lcw->lcw_refcount--; /* -1 ref for callback */ + if (lcw->lcw_refcount == 0) + list_add(&lcw->lcw_list, &zombies); + spin_unlock_bh(&lcw->lcw_lock); + /* still hold lcw_pending_timers_lock */ + continue; + } + + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + + CDEBUG(D_INFO, "found lcw for pid %d\n", + lcw->lcw_pid); + lcw_dump_stack(lcw); + + is_dumplog = lcw->lcw_callback == lc_watchdog_dumplog; + if (lcw->lcw_state != LC_WATCHDOG_DISABLED && + (dumplog || !is_dumplog)) { + lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data); + if (dumplog && is_dumplog) + dumplog = 0; + } + + spin_lock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount--; /* -1 ref for callback */ + if (lcw->lcw_refcount == 0) + list_add(&lcw->lcw_list, &zombies); + } + spin_unlock_bh(&lcw_pending_timers_lock); + + while (!list_empty(&zombies)) { + lcw = list_entry(zombies.next, + struct lc_watchdog, lcw_list); + list_del_init(&lcw->lcw_list); + LIBCFS_FREE(lcw, sizeof(*lcw)); + } + } + + complete(&lcw_stop_completion); + + RETURN(rc); +} + +static void lcw_dispatch_start(void) +{ + struct 
task_struct *task; + + ENTRY; + LASSERT(lcw_refcount == 1); + + init_completion(&lcw_stop_completion); + init_completion(&lcw_start_completion); + init_waitqueue_head(&lcw_event_waitq); + + CDEBUG(D_INFO, "starting dispatch thread\n"); + task = kthread_run(lcw_dispatch_main, NULL, "lc_watchdogd"); + if (IS_ERR(task)) { + CERROR("error spawning watchdog dispatch thread: %ld\n", + PTR_ERR(task)); + EXIT; + return; + } + wait_for_completion(&lcw_start_completion); + CDEBUG(D_INFO, "watchdog dispatcher initialization complete.\n"); + + EXIT; +} + +static void lcw_dispatch_stop(void) +{ + ENTRY; + LASSERT(lcw_refcount == 0); + + CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n"); + + set_bit(LCW_FLAG_STOP, &lcw_flags); + wake_up(&lcw_event_waitq); + + wait_for_completion(&lcw_stop_completion); + clear_bit(LCW_FLAG_STOP, &lcw_flags); + + CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); + + EXIT; +} + +struct lc_watchdog *lc_watchdog_add(int timeout, + void (*callback)(pid_t, void *), + void *data) +{ + struct lc_watchdog *lcw = NULL; + ENTRY; + + LIBCFS_ALLOC(lcw, sizeof(*lcw)); + if (lcw == NULL) { + CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + + spin_lock_init(&lcw->lcw_lock); + lcw->lcw_refcount = 1; /* refcount for owner */ + lcw->lcw_task = current; + lcw->lcw_pid = current_pid(); + lcw->lcw_callback = (callback != NULL) ? callback : lc_watchdog_dumplog; + lcw->lcw_data = data; + lcw->lcw_state = LC_WATCHDOG_DISABLED; + + INIT_LIST_HEAD(&lcw->lcw_list); + cfs_timer_setup(&lcw->lcw_timer, lcw_cb, (unsigned long)lcw, 0); + + mutex_lock(&lcw_refcount_mutex); + if (++lcw_refcount == 1) + lcw_dispatch_start(); + mutex_unlock(&lcw_refcount_mutex); + + /* Keep this working in case we enable them by default */ + if (lcw->lcw_state == LC_WATCHDOG_ENABLED) { + lcw->lcw_last_touched = ktime_get(); + mod_timer(&lcw->lcw_timer, cfs_time_seconds(timeout) + + jiffies); + } + + RETURN(lcw); +} +EXPORT_SYMBOL(lc_watchdog_add); + +static void lcw_update_time(struct lc_watchdog *lcw, const char *message) +{ + ktime_t newtime = ktime_get(); + + if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) { + ktime_t lapse = ktime_sub(newtime, lcw->lcw_last_touched); + struct timespec64 timediff; + + timediff = ktime_to_timespec64(lapse); + LCONSOLE_WARN("Service thread pid %u %s after %llu.%.02lus. 
This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).\n", + lcw->lcw_pid, message, + (unsigned long long)timediff.tv_sec, + timediff.tv_nsec / (NSEC_PER_SEC / 100)); + } + lcw->lcw_last_touched = newtime; +} + +static void lc_watchdog_del_pending(struct lc_watchdog *lcw) +{ + spin_lock_bh(&lcw->lcw_lock); + if (unlikely(!list_empty(&lcw->lcw_list))) { + spin_lock_bh(&lcw_pending_timers_lock); + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + spin_unlock_bh(&lcw_pending_timers_lock); + } + + spin_unlock_bh(&lcw->lcw_lock); +} + +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout) +{ + ENTRY; + LASSERT(lcw != NULL); + + lc_watchdog_del_pending(lcw); + + lcw_update_time(lcw, "resumed"); + + mod_timer(&lcw->lcw_timer, jiffies + cfs_time_seconds(timeout)); + lcw->lcw_state = LC_WATCHDOG_ENABLED; + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_touch); + +void lc_watchdog_disable(struct lc_watchdog *lcw) +{ + ENTRY; + LASSERT(lcw != NULL); + + lc_watchdog_del_pending(lcw); + + lcw_update_time(lcw, "completed"); + lcw->lcw_state = LC_WATCHDOG_DISABLED; + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_disable); + +void lc_watchdog_delete(struct lc_watchdog *lcw) +{ + int dead; + + ENTRY; + LASSERT(lcw != NULL); + + del_timer(&lcw->lcw_timer); + + lcw_update_time(lcw, "stopped"); + + spin_lock_bh(&lcw->lcw_lock); + spin_lock_bh(&lcw_pending_timers_lock); + if (unlikely(!list_empty(&lcw->lcw_list))) { + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + } + + lcw->lcw_refcount--; /* -1 ref for owner */ + dead = lcw->lcw_refcount == 0; + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + + if (dead) + LIBCFS_FREE(lcw, sizeof(*lcw)); + + mutex_lock(&lcw_refcount_mutex); + if (--lcw_refcount == 0) + lcw_dispatch_stop(); + mutex_unlock(&lcw_refcount_mutex); + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_delete); + +#else /* !defined(WITH_WATCHDOG) */ + +struct lc_watchdog *lc_watchdog_add(int timeout, + void (*callback)(pid_t pid, void *), + void *data) +{ + static struct lc_watchdog watchdog; + return &watchdog; +} +EXPORT_SYMBOL(lc_watchdog_add); + +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout) +{ +} +EXPORT_SYMBOL(lc_watchdog_touch); + +void lc_watchdog_disable(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_disable); + +void lc_watchdog_delete(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_delete); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c new file mode 100644 index 0000000000000..f370ffab81677 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c @@ -0,0 +1,468 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/workitem.c + * + * Author: Isaac Huang + * Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#define CFS_WS_NAME_LEN 16 + +struct cfs_wi_sched { + struct list_head ws_list; /* chain on global list */ + /** serialised workitems */ + spinlock_t ws_lock; + /** where schedulers sleep */ + wait_queue_head_t ws_waitq; + /** concurrent workitems */ + struct list_head ws_runq; + /** rescheduled running-workitems, a workitem can be rescheduled + * while running in wi_action(), but we don't to execute it again + * unless it returns from wi_action(), so we put it on ws_rerunq + * while rescheduling, and move it to runq after it returns + * from wi_action() */ + struct list_head ws_rerunq; + /** CPT-table for this scheduler */ + struct cfs_cpt_table *ws_cptab; + /** CPT id for affinity */ + int ws_cpt; + /** number of scheduled workitems */ + int ws_nscheduled; + /** started scheduler thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_nthreads:30; + /** shutting down, protected by cfs_wi_data::wi_glock */ + unsigned int ws_stopping:1; + /** serialize starting thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_starting:1; + /** scheduler name */ + char ws_name[CFS_WS_NAME_LEN]; +}; + +static struct cfs_workitem_data { + /** serialize */ + spinlock_t wi_glock; + /** list of all schedulers */ + struct list_head wi_scheds; + /** WI module is initialized */ + int wi_init; + /** shutting down the whole WI module */ + int wi_stopping; +} cfs_wi_data; + +static inline int +cfs_wi_sched_cansleep(struct cfs_wi_sched *sched) +{ + spin_lock(&sched->ws_lock); + if (sched->ws_stopping) { + spin_unlock(&sched->ws_lock); + return 0; + } + + if (!list_empty(&sched->ws_runq)) { + spin_unlock(&sched->ws_lock); + return 0; + } + spin_unlock(&sched->ws_lock); + return 1; +} + +/* XXX: + * 0. it only works when called from wi->wi_action. + * 1. when it returns no one shall try to schedule the workitem. + */ +void +cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + spin_lock(&sched->ws_lock); + + LASSERT(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + } + + LASSERT(list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; /* LBUG future schedule attempts */ + spin_unlock(&sched->ws_lock); + + return; +} +EXPORT_SYMBOL(cfs_wi_exit); + +/** + * cancel schedule request of workitem \a wi + */ +int +cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + int rc; + + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + /* + * return 0 if it's running already, otherwise return 1, which + * means the workitem will not be scheduled and will not have + * any race with wi_action. 
+ */ + spin_lock(&sched->ws_lock); + + rc = !(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_scheduled = 0; + } + + LASSERT (list_empty(&wi->wi_list)); + + spin_unlock(&sched->ws_lock); + return rc; +} +EXPORT_SYMBOL(cfs_wi_deschedule); + +/* + * Workitem scheduled with (serial == 1) is strictly serialised not only with + * itself, but also with others scheduled this way. + * + * Now there's only one static serialised queue, but in the future more might + * be added, and even dynamic creation of serialised queues might be supported. + */ +void +cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + spin_lock(&sched->ws_lock); + + if (!wi->wi_scheduled) { + LASSERT (list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; + sched->ws_nscheduled++; + if (!wi->wi_running) { + list_add_tail(&wi->wi_list, &sched->ws_runq); + wake_up(&sched->ws_waitq); + } else { + list_add(&wi->wi_list, &sched->ws_rerunq); + } + } + + LASSERT (!list_empty(&wi->wi_list)); + spin_unlock(&sched->ws_lock); + return; +} +EXPORT_SYMBOL(cfs_wi_schedule); + +static int +cfs_wi_scheduler(void *arg) +{ + struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg; + + cfs_block_allsigs(); + + /* CPT affinity scheduler? */ + if (sched->ws_cptab != NULL) + if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0) + CWARN("Unable to bind %s on CPU partition %d\n", + sched->ws_name, sched->ws_cpt); + + spin_lock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_starting == 1); + sched->ws_starting--; + sched->ws_nthreads++; + + spin_unlock(&cfs_wi_data.wi_glock); + + spin_lock(&sched->ws_lock); + + while (!sched->ws_stopping) { + int nloops = 0; + int rc; + struct cfs_workitem *wi; + + while (!list_empty(&sched->ws_runq) && + nloops < CFS_WI_RESCHED) { + wi = list_entry(sched->ws_runq.next, + struct cfs_workitem, wi_list); + LASSERT(wi->wi_scheduled && !wi->wi_running); + + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_running = 1; + wi->wi_scheduled = 0; + + spin_unlock(&sched->ws_lock); + nloops++; + + rc = (*wi->wi_action) (wi); + + spin_lock(&sched->ws_lock); + if (rc != 0) /* WI should be dead, even be freed! 
*/ + continue; + + wi->wi_running = 0; + if (list_empty(&wi->wi_list)) + continue; + + LASSERT(wi->wi_scheduled); + /* wi is rescheduled, should be on rerunq now, we + * move it to runq so it can run action now */ + list_move_tail(&wi->wi_list, &sched->ws_runq); + } + + if (!list_empty(&sched->ws_runq)) { + spin_unlock(&sched->ws_lock); + /* don't sleep because some workitems still + * expect me to come back soon */ + cond_resched(); + spin_lock(&sched->ws_lock); + continue; + } + + spin_unlock(&sched->ws_lock); + rc = wait_event_interruptible_exclusive(sched->ws_waitq, + !cfs_wi_sched_cansleep(sched)); + spin_lock(&sched->ws_lock); + } + + spin_unlock(&sched->ws_lock); + + spin_lock(&cfs_wi_data.wi_glock); + sched->ws_nthreads--; + spin_unlock(&cfs_wi_data.wi_glock); + + return 0; +} + +void +cfs_wi_sched_destroy(struct cfs_wi_sched *sched) +{ + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + + spin_lock(&cfs_wi_data.wi_glock); + if (sched->ws_stopping) { + CDEBUG(D_INFO, "%s is in progress of stopping\n", + sched->ws_name); + spin_unlock(&cfs_wi_data.wi_glock); + return; + } + + LASSERT(!list_empty(&sched->ws_list)); + sched->ws_stopping = 1; + + spin_unlock(&cfs_wi_data.wi_glock); + + wake_up_all(&sched->ws_waitq); + + spin_lock(&cfs_wi_data.wi_glock); + { + int i = 2; + + while (sched->ws_nthreads > 0) { + CDEBUG(is_power_of_2(++i / 20) ? D_WARNING : D_NET, + "waiting %us for %d %s worker threads to exit\n", + i / 20, sched->ws_nthreads, sched->ws_name); + + spin_unlock(&cfs_wi_data.wi_glock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + } + + list_del(&sched->ws_list); + + spin_unlock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_nscheduled == 0); + + LIBCFS_FREE(sched, sizeof(*sched)); +} +EXPORT_SYMBOL(cfs_wi_sched_destroy); + +int +cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, + int cpt, int nthrs, struct cfs_wi_sched **sched_pp) +{ + struct cfs_wi_sched *sched; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || + (cpt >= 0 && cpt < cfs_cpt_number(cptab))); + + LIBCFS_ALLOC(sched, sizeof(*sched)); + if (sched == NULL) + return -ENOMEM; + + if (strlen(name) > sizeof(sched->ws_name)-1) { + LIBCFS_FREE(sched, sizeof(*sched)); + return -E2BIG; + } + strlcpy(sched->ws_name, name, sizeof(sched->ws_name)); + + sched->ws_cptab = cptab; + sched->ws_cpt = cpt; + + spin_lock_init(&sched->ws_lock); + init_waitqueue_head(&sched->ws_waitq); + + INIT_LIST_HEAD(&sched->ws_runq); + INIT_LIST_HEAD(&sched->ws_rerunq); + INIT_LIST_HEAD(&sched->ws_list); + + for (; nthrs > 0; nthrs--) { + char name[16]; + struct task_struct *task; + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_starting > 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule(); + spin_lock(&cfs_wi_data.wi_glock); + } + + sched->ws_starting++; + spin_unlock(&cfs_wi_data.wi_glock); + + if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { + snprintf(name, sizeof(name), "%s_%02d_%02d", + sched->ws_name, sched->ws_cpt, + sched->ws_nthreads); + } else { + snprintf(name, sizeof(name), "%s_%02d", + sched->ws_name, sched->ws_nthreads); + } + + task = kthread_run(cfs_wi_scheduler, sched, name); + if (IS_ERR(task)) { + int rc = PTR_ERR(task); + + CERROR("Failed to create thread for " + "WI scheduler %s: %d\n", name, rc); + + spin_lock(&cfs_wi_data.wi_glock); + + /* make up for cfs_wi_sched_destroy */ + list_add(&sched->ws_list, 
&cfs_wi_data.wi_scheds); + sched->ws_starting--; + + spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_destroy(sched); + return rc; + } + } + + spin_lock(&cfs_wi_data.wi_glock); + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + spin_unlock(&cfs_wi_data.wi_glock); + + *sched_pp = sched; + return 0; +} +EXPORT_SYMBOL(cfs_wi_sched_create); + +int +cfs_wi_startup(void) +{ + memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data)); + + spin_lock_init(&cfs_wi_data.wi_glock); + INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); + cfs_wi_data.wi_init = 1; + + return 0; +} + +void +cfs_wi_shutdown (void) +{ + struct cfs_wi_sched *sched; + + spin_lock(&cfs_wi_data.wi_glock); + cfs_wi_data.wi_stopping = 1; + spin_unlock(&cfs_wi_data.wi_glock); + + /* nobody should contend on this list */ + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + sched->ws_stopping = 1; + wake_up_all(&sched->ws_waitq); + } + + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + spin_lock(&cfs_wi_data.wi_glock); + + while (sched->ws_nthreads != 0) { + spin_unlock(&cfs_wi_data.wi_glock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + spin_unlock(&cfs_wi_data.wi_glock); + } + + while (!list_empty(&cfs_wi_data.wi_scheds)) { + sched = list_entry(cfs_wi_data.wi_scheds.next, + struct cfs_wi_sched, ws_list); + list_del(&sched->ws_list); + LIBCFS_FREE(sched, sizeof(*sched)); + } + + cfs_wi_data.wi_stopping = 0; + cfs_wi_data.wi_init = 0; +} diff --git a/drivers/staging/lustrefsx/lnet/Kconfig b/drivers/staging/lustrefsx/lnet/Kconfig new file mode 100644 index 0000000000000..0d0686a25fe1e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/Kconfig @@ -0,0 +1,37 @@ +config LUSTREFSX_LNET + tristate "Lustre networking subsystem (LNet)" + select LUSTREFSX_LIBCFS + depends on m + depends on INET + help + The Lustre network layer, also known as LNet, is a networking abstaction + level API that was initially created to allow Lustre Filesystem to utilize + very different networks like tcp and ib verbs in a uniform way. In the + case of Lustre routers only the LNet layer is required. Lately other + projects are also looking into using LNet as their networking API as well. + +config LUSTREFSX_LNET_SELFTEST + tristate "Lustre networking self testing" + depends on m + depends on LUSTREFSX_LNET + help + Choose Y here if you want to do lnet self testing. To compile this + as a module, choose M here: the module will be called lnet_selftest. + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LUSTREFSX_LNET_XPRT_IB + tristate "LNET infiniband support" + depends on m + depends on LUSTREFSX_LNET && INFINIBAND && INFINIBAND_ADDR_TRANS + default LUSTREFSX_LNET && INFINIBAND + help + This option allows the LNET users to use infiniband as an + RDMA-enabled transport. + + To compile this as a kernel module, choose M here and it will be + called ko2iblnd. + + If unsure, say N. diff --git a/drivers/staging/lustrefsx/lnet/LICENSE b/drivers/staging/lustrefsx/lnet/LICENSE new file mode 100644 index 0000000000000..92728f4d300d2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/LICENSE @@ -0,0 +1,363 @@ +Each file in this distribution should contain a header stating the +copyright owner(s), and the licensing terms for that module. Some +files are not eligible for copyright protection, and contain neither. + +All files in this subtree are licensed under the terms and conditions +of the GNU General Public License version 2. 
+ +Reproduced below is the GPL v2, and Linus's clarifying statement from +the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". + Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. 
These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. 
+ +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/drivers/staging/lustrefsx/lnet/Makefile b/drivers/staging/lustrefsx/lnet/Makefile new file mode 100644 index 0000000000000..7ee52eb559025 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet/ +obj-$(CONFIG_LUSTREFSX_LNET) += klnds/ +obj-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += selftest/ diff --git a/drivers/staging/lustrefsx/lnet/include/cyaml.h b/drivers/staging/lustrefsx/lnet/include/cyaml.h new file mode 100644 index 0000000000000..1537dbd19ed0c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/cyaml.h @@ -0,0 +1,257 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of the + * License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * LGPL HEADER END + * + * Copyright (c) 2014, 2017, Intel Corporation. + * + * Author: + * Amir Shehata + */ + +#ifndef CYAML_H +#define CYAML_H + +#include + +enum cYAML_object_type { + CYAML_TYPE_FALSE = 0, + CYAML_TYPE_TRUE, + CYAML_TYPE_NULL, + CYAML_TYPE_NUMBER, + CYAML_TYPE_STRING, + CYAML_TYPE_ARRAY, + CYAML_TYPE_OBJECT +}; + +struct cYAML { + /* next/prev allow you to walk array/object chains. */ + struct cYAML *cy_next, *cy_prev; + /* An array or object item will have a child pointer pointing + to a chain of the items in the array/object. */ + struct cYAML *cy_child; + /* The type of the item, as above. 
*/ + enum cYAML_object_type cy_type; + + /* The item's string, if type==CYAML_TYPE_STRING */ + char *cy_valuestring; + /* The item's number, if type==CYAML_TYPE_NUMBER */ + int cy_valueint; + /* The item's number, if type==CYAML_TYPE_NUMBER */ + double cy_valuedouble; + /* The item's name string, if this item is the child of, + or is in the list of subitems of an object. */ + char *cy_string; + /* user data which might need to be tracked per object */ + void *cy_user_data; +}; + +typedef void (*cYAML_user_data_free_cb)(void *); + +/* + * cYAML_walk_cb + * Callback called when recursing through the tree + * + * cYAML* - pointer to the node currently being visitied + * void* - user data passed to the callback. + * void** - output value from the callback + * + * Returns true to continue recursing. false to stop recursing + */ +typedef bool (*cYAML_walk_cb)(struct cYAML *, void *, void**); + +/* + * cYAML_build_tree + * Build a tree representation of the YAML formatted text passed in. + * + * yaml_file - YAML file to parse and build tree representation + * yaml_blk - blk of YAML. yaml_file takes precedence if both + * are defined. + * yaml_blk_size - length of the yaml block (obtained via strlen) + */ +struct cYAML *cYAML_build_tree(char *yaml_file, const char *yaml_blk, + size_t yaml_blk_size, + struct cYAML **err_str, bool debug); + +/* + * cYAML_print_tree + * Print the textual representation of a YAML tree to stderr + * + * node - Node where you want to start printing + */ +void cYAML_print_tree(struct cYAML *node); + +/* + * cYAML_print_tree2file + * Print the textual representation of a YAML tree to file + * + * f - file to print to + * node - Node where you want to start printing + */ +void cYAML_print_tree2file(FILE *f, struct cYAML *node); + +/* + * cYAML_free_tree + * Free the cYAML tree returned as part of the cYAML_build_tree + * + * node - root of the tree to be freed + */ +void cYAML_free_tree(struct cYAML *node); + +/* + * cYAML_get_object_item + * Returns the cYAML object which key correspods to the name passed in + * This function searches only through the current level. + * + * parent - is the parent object on which you want to conduct the search + * name - key name of the object you want to find. + */ +struct cYAML *cYAML_get_object_item(struct cYAML *parent, + const char *name); + +/* + * cYAML_get_next_seq_item + * Returns the next item in the YAML sequence. This function uses the + * itm parameter to keep track of its position in the sequence. If the + * itm parameter is reset to NULL between calls that resets and returns + * the first item in the sequence. + * This function returns NULL when there are no more items in the + * sequence. + * + * seq - is the head node of the YAML sequence + * itm - [OUT] next sequence item to continue looking from next time. + * + */ +struct cYAML *cYAML_get_next_seq_item(struct cYAML *seq, + struct cYAML **itm); + +/* + * cYAML_is_seq + * Returns 1 if the node provided is an ARRAY 0 otherwise + * + * node - the node to examine + * + */ +bool cYAML_is_sequence(struct cYAML *node); + +/* + * cYAML_find_object + * Returns the cYAML object which key correspods to the name passed in + * this function searches the entire tree. + * + * root - is the root of the tree on which you want to conduct the search + * name - key name of the object you want to find. 
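The cYAML helpers declared above are easiest to follow with a short usage sketch. The snippet below is illustrative only and not part of this patch: the "cyaml.h" include path, the YAML keys ("net", "name"), and the way the error tree is reported are assumptions made for the example; only the function signatures and struct fields come from the header itself.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include "cyaml.h"		/* assumed include path for the header above */

static int print_net_names(const char *yaml_blk)
{
	struct cYAML *err = NULL;
	struct cYAML *root, *seq, *cur, *item = NULL;

	/* Parse an in-memory YAML block (no file) into a cYAML tree. */
	root = cYAML_build_tree(NULL, yaml_blk, strlen(yaml_blk), &err, false);
	if (root == NULL) {
		if (err != NULL) {
			cYAML_print_tree2file(stderr, err);
			cYAML_free_tree(err);
		}
		return -1;
	}

	/* Look up a top-level sequence by key and walk its items. */
	seq = cYAML_get_object_item(root, "net");
	if (seq != NULL && cYAML_is_sequence(seq)) {
		while ((cur = cYAML_get_next_seq_item(seq, &item)) != NULL) {
			struct cYAML *name = cYAML_get_object_item(cur, "name");

			if (name != NULL && name->cy_valuestring != NULL)
				printf("net: %s\n", name->cy_valuestring);
		}
	}

	cYAML_free_tree(root);
	return 0;
}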
+ */ +struct cYAML *cYAML_find_object(struct cYAML *root, const char *key); + +/* + * cYAML_clean_usr_data + * walks the tree and for each node with some user data it calls the + * free_cb with the user data as a parameter. + * + * node: node to start the walk from + * free_cb: cb to call to cleanup the user data + */ +void cYAML_clean_usr_data(struct cYAML *node, + cYAML_user_data_free_cb free_cb); + +/* + * cYAML_create_object + * Creates a CYAML of type OBJECT + * + * parent - parent node + * key - node key + */ +struct cYAML *cYAML_create_object(struct cYAML *parent, char *key); + +/* + * cYAML_create_seq + * Creates a CYAML of type ARRAY + * Once this is created, more sequence items can be added. + * + * parent - parent node + * key - node key + */ +struct cYAML *cYAML_create_seq(struct cYAML *parent, char *key); + +/* + * cYAML_create_object + * Create a sequence item, which can have more entites added underneath + * it + * + * parent - parent node + */ +struct cYAML *cYAML_create_seq_item(struct cYAML *seq); + +/* + * cYAML_create_string + * Creates a cYAML node of type STRING + * + * parent - parent node + * key - node key + * value - value of node + */ +struct cYAML *cYAML_create_string(struct cYAML *parent, char *key, + char *value); + +/* + * cYAML_create_string + * Creates a cYAML node of type STRING + * + * parent - parent node + * key - node key + * value - value of node + */ +struct cYAML *cYAML_create_number(struct cYAML *parent, char *key, + double value); + +/* + * cYAML_insert_sibling + * inserts one cYAML object as a sibling to another + * + * root - root node to have a sibling added to + * sibling - sibling to be added + */ +void cYAML_insert_sibling(struct cYAML *root, struct cYAML *sibling); + +/* + * cYAML_insert_child + * inserts one cYAML object as a child to another + * + * parent - parent node to have a child added to + * child - child to be added + */ +void cYAML_insert_child(struct cYAML *parent, struct cYAML *node); + +/* + * cYAML_build_error + * Build a YAML error message given: + * + * rc - return code to add in the error + * seq_no - a sequence number to add in the error + * cmd - the command that failed. + * entity - command entity that failed. + * err_str - error string to add in the error + * root - the root to which to add the YAML error + */ +void cYAML_build_error(int rc, int seq_no, char *cmd, + char *entity, char *err_str, + struct cYAML **root); + + +#endif /* CYAML_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/api.h b/drivers/staging/lustrefsx/lnet/include/lnet/api.h new file mode 100644 index 0000000000000..1ce4a0056829d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/api.h @@ -0,0 +1,219 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_API_H__ +#define __LNET_API_H__ + +/** \defgroup lnet LNet + * + * The Lustre Networking subsystem. + * + * LNet is an asynchronous message-passing API, which provides an unreliable + * connectionless service that can't guarantee any order. It supports OFA IB, + * TCP/IP, and Cray Portals, and routes between heterogeneous networks. + * @{ + */ + +#ifndef __KERNEL__ +# error This include is only for kernel use. +#endif + +#include + +/** \defgroup lnet_init_fini Initialization and cleanup + * The LNet must be properly initialized before any LNet calls can be made. + * @{ */ +int LNetNIInit(lnet_pid_t requested_pid); +int LNetNIFini(void); +/** @} lnet_init_fini */ + +/** \defgroup lnet_addr LNet addressing and basic types + * + * Addressing scheme and basic data types of LNet. + * + * The LNet API is memory-oriented, so LNet must be able to address not only + * end-points but also memory region within a process address space. + * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process + * in a node. A portal represents an opening in the address space of a + * process. Match bits is criteria to identify a region of memory inside a + * portal, and offset specifies an offset within the memory region. + * + * LNet creates a table of portals for each process during initialization. + * This table has MAX_PORTALS entries and its size can't be dynamically + * changed. A portal stays empty until the owning process starts to add + * memory regions to it. A portal is sometimes called an index because + * it's an entry in the portals table of a process. + * + * \see LNetMEAttach + * @{ */ +int LNetGetId(unsigned int index, struct lnet_process_id *id); +int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); +lnet_nid_t LNetPrimaryNID(lnet_nid_t nid); +bool LNetIsPeerLocal(lnet_nid_t nid); + +/** @} lnet_addr */ + + +/** \defgroup lnet_me Match entries + * + * A match entry (abbreviated as ME) describes a set of criteria to accept + * incoming requests. + * + * A portal is essentially a match list plus a set of attributes. A match + * list is a chain of MEs. Each ME includes a pointer to a memory descriptor + * and a set of match criteria. The match criteria can be used to reject + * incoming requests based on process ID or the match bits provided in the + * request. MEs can be dynamically inserted into a match list by LNetMEAttach() + * and LNetMEInsert(), and removed from its list by LNetMEUnlink(). 
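As a concrete illustration of the ME/MD pairing described here, a minimal kernel-side sketch follows (not part of this patch). The portal number (8), the match bits (0x11), and the assumption that the caller already holds an EQ handle are example choices, and the struct lnet_md field names follow the usual LNet definitions rather than anything stated in this hunk.

#include <linux/string.h>
#include <lnet/api.h>	/* assumed include path for the prototypes below */

static int example_post_recv(void *buf, unsigned int len,
			     struct lnet_handle_eq eqh,
			     struct lnet_handle_md *mdh)
{
	struct lnet_process_id any_peer = {
		.nid = LNET_NID_ANY,	/* accept PUTs from any NID... */
		.pid = LNET_PID_ANY,	/* ...and from any PID */
	};
	struct lnet_handle_me meh;
	struct lnet_md md;
	int rc;

	/* Accept requests whose match bits are exactly 0x11 on portal 8. */
	rc = LNetMEAttach(8, any_peer, 0x11, 0, LNET_UNLINK, LNET_INS_AFTER,
			  &meh);
	if (rc != 0)
		return rc;

	/* Describe the receive buffer and hang it off the ME just created. */
	memset(&md, 0, sizeof(md));
	md.start = buf;
	md.length = len;
	md.threshold = 1;		/* retire after one matching PUT */
	md.options = LNET_MD_OP_PUT;	/* only PUTs may consume this MD */
	md.eq_handle = eqh;		/* completion event goes to this EQ */

	rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
	if (rc != 0)
		LNetMEUnlink(meh);	/* roll back the ME on failure */

	return rc;
}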
+ * @{ */ +int LNetMEAttach(unsigned int portal, + struct lnet_process_id match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + enum lnet_unlink unlink_in, + enum lnet_ins_pos pos_in, + struct lnet_handle_me *handle_out); + +int LNetMEInsert(struct lnet_handle_me current_in, + struct lnet_process_id match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + enum lnet_unlink unlink_in, + enum lnet_ins_pos position_in, + struct lnet_handle_me *handle_out); + +int LNetMEUnlink(struct lnet_handle_me current_in); +/** @} lnet_me */ + +/** \defgroup lnet_md Memory descriptors + * + * A memory descriptor contains information about a region of a user's + * memory (either in kernel or user space) and optionally points to an + * event queue where information about the operations performed on the + * memory descriptor are recorded. Memory descriptor is abbreviated as + * MD and can be used interchangeably with the memory region it describes. + * + * The LNet API provides two operations to create MDs: LNetMDAttach() + * and LNetMDBind(); one operation to unlink and release the resources + * associated with a MD: LNetMDUnlink(). + * @{ */ +int LNetMDAttach(struct lnet_handle_me current_in, + struct lnet_md md_in, + enum lnet_unlink unlink_in, + struct lnet_handle_md *md_handle_out); + +int LNetMDBind(struct lnet_md md_in, + enum lnet_unlink unlink_in, + struct lnet_handle_md *md_handle_out); + +int LNetMDUnlink(struct lnet_handle_md md_in); +/** @} lnet_md */ + +/** \defgroup lnet_eq Events and event queues + * + * Event queues (abbreviated as EQ) are used to log operations performed on + * local MDs. In particular, they signal the completion of a data transmission + * into or out of a MD. They can also be used to hold acknowledgments for + * completed PUT operations and indicate when a MD has been unlinked. Multiple + * MDs can share a single EQ. An EQ may have an optional event handler + * associated with it. If an event handler exists, it will be run for each + * event that is deposited into the EQ. + * + * In addition to the struct lnet_handle_eq, the LNet API defines two types + * associated with events: The ::lnet_event_kind defines the kinds of events + * that can be stored in an EQ. The struct lnet_event defines a structure that + * holds the information about with an event. + * + * There are five functions for dealing with EQs: LNetEQAlloc() is used to + * create an EQ and allocate the resources needed, while LNetEQFree() + * releases these resources and free the EQ. LNetEQGet() retrieves the next + * event from an EQ, and LNetEQWait() can be used to block a process until + * an EQ has at least one event. LNetEQPoll() can be used to test or wait + * on multiple EQs. + * @{ */ +int LNetEQAlloc(unsigned int count_in, + lnet_eq_handler_t handler, + struct lnet_handle_eq *handle_out); + +int LNetEQFree(struct lnet_handle_eq eventq_in); + +int LNetEQGet(struct lnet_handle_eq eventq_in, + struct lnet_event *event_out); + +int LNetEQWait(struct lnet_handle_eq eventq_in, + struct lnet_event *event_out); + +int LNetEQPoll(struct lnet_handle_eq *eventqs_in, + int neq_in, + signed long timeout, + struct lnet_event *event_out, + int *which_eq_out); +/** @} lnet_eq */ + +/** \defgroup lnet_data Data movement operations + * + * The LNet API provides two data movement operations: LNetPut() + * and LNetGet(). 
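Tying the EQ, MD, and data-movement pieces together, here is a hedged sketch of the send side (again not part of this patch): it allocates an EQ with a callback, binds a source buffer with LNetMDBind(), and pushes it with LNetPut(). The handler body, portal number, match bits, and the choice of LNET_NOACK_REQ are example values; real users would normally allocate the EQ once at module startup rather than per send.

#include <linux/string.h>
#include <lnet/api.h>	/* assumed include path for the prototypes below */

static void example_eq_callback(struct lnet_event *ev)
{
	/* Runs for every event deposited into the EQ allocated below. */
	CDEBUG(D_NET, "event type %d, status %d\n", ev->type, ev->status);
}

static int example_send(void *buf, unsigned int len,
			struct lnet_process_id peer)
{
	struct lnet_handle_eq eqh;
	struct lnet_handle_md mdh;
	struct lnet_md md;
	int rc;

	/* With a handler attached, events are delivered via the callback. */
	rc = LNetEQAlloc(0, example_eq_callback, &eqh);
	if (rc != 0)
		return rc;

	memset(&md, 0, sizeof(md));
	md.start = buf;
	md.length = len;
	md.threshold = 1;	/* one SEND event, then auto-unlink */
	md.eq_handle = eqh;

	rc = LNetMDBind(md, LNET_UNLINK, &mdh);
	if (rc != 0) {
		LNetEQFree(eqh);
		return rc;
	}

	/* No ACK requested; LNET_NID_ANY lets LNet choose the source NI. */
	rc = LNetPut(LNET_NID_ANY, mdh, LNET_NOACK_REQ, peer,
		     8 /* portal */, 0x11 /* match bits */, 0, 0);
	if (rc != 0)
		LNetMDUnlink(mdh);

	return rc;
}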
+ * @{ */ +int LNetPut(lnet_nid_t self, + struct lnet_handle_md md_in, + enum lnet_ack_req ack_req_in, + struct lnet_process_id target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + __u64 hdr_data_in); + +int LNetGet(lnet_nid_t self, + struct lnet_handle_md md_in, + struct lnet_process_id target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + bool recovery); +/** @} lnet_data */ + + +/** \defgroup lnet_misc Miscellaneous operations. + * Miscellaneous operations. + * @{ */ + +int LNetSetLazyPortal(int portal); +int LNetClearLazyPortal(int portal); +int LNetCtl(unsigned int cmd, void *arg); +void LNetDebugPeer(struct lnet_process_id id); +int LNetGetPeerDiscoveryStatus(void); + +/** @} lnet_misc */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h new file mode 100644 index 0000000000000..3115757aea5d6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -0,0 +1,1037 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-lnet.h + * + * Top level include for library side routines + */ + +#ifndef __LNET_LIB_LNET_H__ +#define __LNET_LIB_LNET_H__ + +#include + +#include +#include +#include +#include +#include +#include +#include + +extern struct lnet the_lnet; /* THE network */ + +#if (BITS_PER_LONG == 32) +/* 2 CPTs, allowing more CPTs might make us under memory pressure */ +# define LNET_CPT_MAX_BITS 1 + +#else /* 64-bit system */ +/* + * 256 CPTs for thousands of CPUs, allowing more CPTs might make us + * under risk of consuming all lh_cookie. 
+ */ +# define LNET_CPT_MAX_BITS 8 +#endif /* BITS_PER_LONG == 32 */ + +/* max allowed CPT number */ +#define LNET_CPT_MAX (1 << LNET_CPT_MAX_BITS) + +#define LNET_CPT_NUMBER (the_lnet.ln_cpt_number) +#define LNET_CPT_BITS (the_lnet.ln_cpt_bits) +#define LNET_CPT_MASK ((1ULL << LNET_CPT_BITS) - 1) + +/** exclusive lock */ +#define LNET_LOCK_EX CFS_PERCPT_LOCK_EX + +/* default timeout */ +#define DEFAULT_PEER_TIMEOUT 180 +#define LNET_LND_DEFAULT_TIMEOUT 5 + +#ifdef HAVE_KERN_SOCK_GETNAME_2ARGS +#define lnet_kernel_getpeername(sock, addr, addrlen) \ + kernel_getpeername(sock, addr) +#define lnet_kernel_getsockname(sock, addr, addrlen) \ + kernel_getsockname(sock, addr) +#else +#define lnet_kernel_getpeername(sock, addr, addrlen) \ + kernel_getpeername(sock, addr, addrlen) +#define lnet_kernel_getsockname(sock, addr, addrlen) \ + kernel_getsockname(sock, addr, addrlen) +#endif + +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + +static inline int lnet_is_route_alive(struct lnet_route *route) +{ + if (!route->lr_gateway->lpni_alive) + return 0; /* gateway is down */ + if ((route->lr_gateway->lpni_ping_feats & + LNET_PING_FEAT_NI_STATUS) == 0) + return 1; /* no NI status, assume it's alive */ + /* has NI status, check # down NIs */ + return route->lr_downis == 0; +} + +static inline int lnet_is_wire_handle_none(struct lnet_handle_wire *wh) +{ + return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE && + wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE); +} + +static inline int lnet_md_exhausted(struct lnet_libmd *md) +{ + return (md->md_threshold == 0 || + ((md->md_options & LNET_MD_MAX_SIZE) != 0 && + md->md_offset + md->md_max_size > md->md_length)); +} + +static inline int lnet_md_unlinkable(struct lnet_libmd *md) +{ + /* Should unlink md when its refcount is 0 and either: + * - md has been flagged for deletion (by auto unlink or LNetM[DE]Unlink, + * in the latter case md may not be exhausted). + * - auto unlink is on and md is exhausted. + */ + if (md->md_refcount != 0) + return 0; + + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0) + return 1; + + return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)); +} + +#define lnet_cpt_table() (the_lnet.ln_cpt_table) +#define lnet_cpt_current() cfs_cpt_current(the_lnet.ln_cpt_table, 1) + +static inline int +lnet_cpt_of_cookie(__u64 cookie) +{ + unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK; + + /* LNET_CPT_NUMBER doesn't have to be power2, which means we can + * get illegal cpt from it's invalid cookie */ + return cpt < LNET_CPT_NUMBER ? 
cpt : cpt % LNET_CPT_NUMBER; +} + +static inline void +lnet_res_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_res_lock, cpt); +} + +static inline void +lnet_res_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_res_lock, cpt); +} + +static inline int +lnet_res_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_res_lock(cpt); + return cpt; +} + +static inline void +lnet_net_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_net_lock, cpt); +} + +static inline void +lnet_net_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_net_lock, cpt); +} + +static inline int +lnet_net_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_net_lock(cpt); + return cpt; +} + +#define LNET_LOCK() lnet_net_lock(LNET_LOCK_EX) +#define LNET_UNLOCK() lnet_net_unlock(LNET_LOCK_EX) + +#define lnet_ptl_lock(ptl) spin_lock(&(ptl)->ptl_lock) +#define lnet_ptl_unlock(ptl) spin_unlock(&(ptl)->ptl_lock) +#define lnet_eq_wait_lock() spin_lock(&the_lnet.ln_eq_wait_lock) +#define lnet_eq_wait_unlock() spin_unlock(&the_lnet.ln_eq_wait_lock) +#define lnet_ni_lock(ni) spin_lock(&(ni)->ni_lock) +#define lnet_ni_unlock(ni) spin_unlock(&(ni)->ni_lock) + +#define MAX_PORTALS 64 + +#define LNET_SMALL_MD_SIZE offsetof(struct lnet_libmd, md_iov.iov[1]) +extern struct kmem_cache *lnet_mes_cachep; /* MEs kmem_cache */ +extern struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes + * MDs kmem_cache */ + +static inline struct lnet_eq * +lnet_eq_alloc (void) +{ + struct lnet_eq *eq; + + LIBCFS_ALLOC(eq, sizeof(*eq)); + return (eq); +} + +static inline void +lnet_eq_free(struct lnet_eq *eq) +{ + LIBCFS_FREE(eq, sizeof(*eq)); +} + +static inline struct lnet_libmd * +lnet_md_alloc(struct lnet_md *umd) +{ + struct lnet_libmd *md; + unsigned int size; + unsigned int niov; + + if ((umd->options & LNET_MD_KIOV) != 0) { + niov = umd->length; + size = offsetof(struct lnet_libmd, md_iov.kiov[niov]); + } else { + niov = ((umd->options & LNET_MD_IOVEC) != 0) ? 
+ umd->length : 1; + size = offsetof(struct lnet_libmd, md_iov.iov[niov]); + } + + if (size <= LNET_SMALL_MD_SIZE) { + md = kmem_cache_alloc(lnet_small_mds_cachep, + GFP_NOFS | __GFP_ZERO); + if (md) { + CDEBUG(D_MALLOC, "slab-alloced 'md' of size %u at " + "%p.\n", size, md); + } else { + CDEBUG(D_MALLOC, "failed to allocate 'md' of size %u\n", + size); + return NULL; + } + } else { + LIBCFS_ALLOC(md, size); + } + + if (md != NULL) { + /* Set here in case of early free */ + md->md_options = umd->options; + md->md_niov = niov; + INIT_LIST_HEAD(&md->md_list); + } + + return md; +} + +static inline void +lnet_md_free(struct lnet_libmd *md) +{ + unsigned int size; + + if ((md->md_options & LNET_MD_KIOV) != 0) + size = offsetof(struct lnet_libmd, md_iov.kiov[md->md_niov]); + else + size = offsetof(struct lnet_libmd, md_iov.iov[md->md_niov]); + + if (size <= LNET_SMALL_MD_SIZE) { + CDEBUG(D_MALLOC, "slab-freed 'md' at %p.\n", md); + kmem_cache_free(lnet_small_mds_cachep, md); + } else { + LIBCFS_FREE(md, size); + } +} + +static inline struct lnet_me * +lnet_me_alloc (void) +{ + struct lnet_me *me; + + me = kmem_cache_alloc(lnet_mes_cachep, GFP_NOFS | __GFP_ZERO); + + if (me) + CDEBUG(D_MALLOC, "slab-alloced 'me' at %p.\n", me); + else + CDEBUG(D_MALLOC, "failed to allocate 'me'\n"); + + return me; +} + +static inline void +lnet_me_free(struct lnet_me *me) +{ + CDEBUG(D_MALLOC, "slab-freed 'me' at %p.\n", me); + kmem_cache_free(lnet_mes_cachep, me); +} + +struct lnet_libhandle *lnet_res_lh_lookup(struct lnet_res_container *rec, + __u64 cookie); +void lnet_res_lh_initialize(struct lnet_res_container *rec, + struct lnet_libhandle *lh); +static inline void +lnet_res_lh_invalidate(struct lnet_libhandle *lh) +{ + /* ALWAYS called with resource lock held */ + /* NB: cookie is still useful, don't reset it */ + list_del(&lh->lh_hash_chain); +} + +static inline void +lnet_eq2handle(struct lnet_handle_eq *handle, struct lnet_eq *eq) +{ + if (eq == NULL) { + LNetInvalidateEQHandle(handle); + return; + } + + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline struct lnet_eq * +lnet_handle2eq(struct lnet_handle_eq *handle) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + + lh = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_eq, eq_lh); +} + +static inline void +lnet_md2handle(struct lnet_handle_md *handle, struct lnet_libmd *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline struct lnet_libmd * +lnet_handle2md(struct lnet_handle_md *handle) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_libmd, md_lh); +} + +static inline struct lnet_libmd * +lnet_wire_handle2md(struct lnet_handle_wire *wh) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) + return NULL; + + cpt = lnet_cpt_of_cookie(wh->wh_object_cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + wh->wh_object_cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_libmd, md_lh); +} + +static inline void +lnet_me2handle(struct lnet_handle_me *handle, struct lnet_me *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline struct lnet_me * 
+lnet_handle2me(struct lnet_handle_me *handle) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_me, me_lh); +} + +static inline void +lnet_peer_net_addref_locked(struct lnet_peer_net *lpn) +{ + atomic_inc(&lpn->lpn_refcount); +} + +extern void lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn); + +static inline void +lnet_peer_net_decref_locked(struct lnet_peer_net *lpn) +{ + if (atomic_dec_and_test(&lpn->lpn_refcount)) + lnet_destroy_peer_net_locked(lpn); +} + +static inline void +lnet_peer_addref_locked(struct lnet_peer *lp) +{ + atomic_inc(&lp->lp_refcount); +} + +extern void lnet_destroy_peer_locked(struct lnet_peer *lp); + +static inline void +lnet_peer_decref_locked(struct lnet_peer *lp) +{ + if (atomic_dec_and_test(&lp->lp_refcount)) + lnet_destroy_peer_locked(lp); +} + +static inline void +lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp) +{ + LASSERT(atomic_read(&lp->lpni_refcount) > 0); + atomic_inc(&lp->lpni_refcount); +} + +extern void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp); + +static inline void +lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp) +{ + LASSERT(atomic_read(&lp->lpni_refcount) > 0); + if (atomic_dec_and_test(&lp->lpni_refcount)) + lnet_destroy_peer_ni_locked(lp); +} + +static inline int +lnet_isrouter(struct lnet_peer_ni *lp) +{ + return lp->lpni_rtr_refcount != 0; +} + +static inline void +lnet_ni_addref_locked(struct lnet_ni *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] >= 0); + + (*ni->ni_refs[cpt])++; +} + +static inline void +lnet_ni_addref(struct lnet_ni *ni) +{ + lnet_net_lock(0); + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline void +lnet_ni_decref_locked(struct lnet_ni *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] > 0); + + (*ni->ni_refs[cpt])--; +} + +static inline void +lnet_ni_decref(struct lnet_ni *ni) +{ + lnet_net_lock(0); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline struct lnet_msg * +lnet_msg_alloc(void) +{ + struct lnet_msg *msg; + + LIBCFS_ALLOC(msg, sizeof(*msg)); + + /* no need to zero, LIBCFS_ALLOC does for us */ + return (msg); +} + +static inline void +lnet_msg_free(struct lnet_msg *msg) +{ + LASSERT(!msg->msg_onactivelist); + LIBCFS_FREE(msg, sizeof(*msg)); +} + +static inline struct lnet_rsp_tracker * +lnet_rspt_alloc(int cpt) +{ + struct lnet_rsp_tracker *rspt; + LIBCFS_ALLOC(rspt, sizeof(*rspt)); + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc++; + lnet_net_unlock(cpt); + return rspt; +} + +static inline void +lnet_rspt_free(struct lnet_rsp_tracker *rspt, int cpt) +{ + LIBCFS_FREE(rspt, sizeof(*rspt)); + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc--; + lnet_net_unlock(cpt); +} + +void lnet_ni_free(struct lnet_ni *ni); +void lnet_net_free(struct lnet_net *net); + +struct lnet_net * +lnet_net_alloc(__u32 net_type, struct list_head *netlist); + +struct lnet_ni * +lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, + char *iface); +struct lnet_ni * +lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts, + char *iface); + +static inline int +lnet_nid2peerhash(lnet_nid_t nid) +{ + return hash_long(nid, LNET_PEER_HASH_BITS); +} + +static inline struct list_head * 
+lnet_net2rnethash(__u32 net) +{ + return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) + + LNET_NETTYP(net)) & + ((1U << the_lnet.ln_remote_nets_hbits) - 1)]; +} + +extern struct lnet_lnd the_lolnd; +extern int avoid_asym_router_failure; + +extern unsigned int lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number); +extern int lnet_cpt_of_nid_locked(lnet_nid_t nid, struct lnet_ni *ni); +extern int lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni); +extern struct lnet_ni *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); +extern struct lnet_ni *lnet_nid2ni_addref(lnet_nid_t nid); +extern struct lnet_ni *lnet_net2ni_locked(__u32 net, int cpt); +extern struct lnet_ni *lnet_net2ni_addref(__u32 net); +struct lnet_net *lnet_get_net_locked(__u32 net_id); + +int lnet_lib_init(void); +void lnet_lib_exit(void); + +extern unsigned lnet_transaction_timeout; +extern unsigned lnet_retry_count; +extern unsigned int lnet_numa_range; +extern unsigned int lnet_health_sensitivity; +extern unsigned int lnet_recovery_interval; +extern unsigned int lnet_peer_discovery_disabled; +extern unsigned int lnet_drop_asym_route; +extern int portal_rotor; + +void lnet_mt_event_handler(struct lnet_event *event); + +int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, int alive, + time64_t when); +void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, + time64_t when); +int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid, + unsigned int priority); +int lnet_check_routes(void); +int lnet_del_route(__u32 net, lnet_nid_t gw_nid); +void lnet_destroy_routes(void); +int lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority); +int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg); +struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet, + struct lnet_ni *prev); +struct lnet_ni *lnet_get_ni_idx_locked(int idx); + +extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, + struct libcfs_ioctl_hdr __user *uparam); +extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep, + struct lnet_process_id __user *ids); +extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all); +extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni); + +void lnet_router_debugfs_init(void); +void lnet_router_debugfs_fini(void); +int lnet_rtrpools_alloc(int im_a_router); +void lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages); +int lnet_rtrpools_adjust(int tiny, int small, int large); +int lnet_rtrpools_enable(void); +void lnet_rtrpools_disable(void); +void lnet_rtrpools_free(int keep_pools); +struct lnet_remotenet *lnet_find_rnet_locked(__u32 net); +int lnet_dyn_add_net(struct lnet_ioctl_config_data *conf); +int lnet_dyn_del_net(__u32 net); +int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf); +int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf); +int lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason); +struct lnet_net *lnet_get_net_locked(__u32 net_id); + +int lnet_islocalnid(lnet_nid_t nid); +int lnet_islocalnet(__u32 net); + +void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, + unsigned int offset, unsigned int mlen); +void lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev); +void lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type); +void lnet_msg_commit(struct lnet_msg *msg, int cpt); +void lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status); + +void lnet_eq_enqueue_event(struct lnet_eq *eq, struct 
lnet_event *ev); +void lnet_prep_send(struct lnet_msg *msg, int type, + struct lnet_process_id target, unsigned int offset, + unsigned int len); +int lnet_send(lnet_nid_t nid, struct lnet_msg *msg, lnet_nid_t rtr_nid); +int lnet_send_ping(lnet_nid_t dest_nid, struct lnet_handle_md *mdh, int nnis, + void *user_ptr, struct lnet_handle_eq eqh, bool recovery); +void lnet_return_tx_credits_locked(struct lnet_msg *msg); +void lnet_return_rx_credits_locked(struct lnet_msg *msg); +void lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp); +void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt); + +struct list_head **lnet_create_array_of_queues(void); + +/* portals functions */ +/* portals attributes */ +static inline int +lnet_ptl_is_lazy(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_LAZY); +} + +static inline int +lnet_ptl_is_unique(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE); +} + +static inline int +lnet_ptl_is_wildcard(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD); +} + +static inline void +lnet_ptl_setopt(struct lnet_portal *ptl, int opt) +{ + ptl->ptl_options |= opt; +} + +static inline void +lnet_ptl_unsetopt(struct lnet_portal *ptl, int opt) +{ + ptl->ptl_options &= ~opt; +} + +/* match-table functions */ +struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable, + struct lnet_process_id id, __u64 mbits); +struct lnet_match_table *lnet_mt_of_attach(unsigned int index, + struct lnet_process_id id, + __u64 mbits, __u64 ignore_bits, + enum lnet_ins_pos pos); +int lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg); + +/* portals match/attach functions */ +void lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, + struct list_head *matches, struct list_head *drops); +void lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md); +int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg); + +/* initialized and finalize portals */ +int lnet_portals_create(void); +void lnet_portals_destroy(void); + +/* message functions */ +int lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, + lnet_nid_t fromnid, void *private, int rdma_req); +int lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg); +int lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg); + +void lnet_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen); +void lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, + unsigned int mlen, unsigned int rlen); +void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg); + +struct lnet_msg *lnet_create_reply_msg(struct lnet_ni *ni, + struct lnet_msg *get_msg); +void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, + unsigned int len); +void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); +void lnet_clean_zombie_rstqs(void); + +void lnet_finalize(struct lnet_msg *msg, int rc); +bool lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus); +void lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni); + +void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, + unsigned int nob, __u32 msg_type); +void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); +void lnet_recv_delayed_msg_list(struct list_head *head); + +int lnet_msg_container_setup(struct 
lnet_msg_container *container, int cpt); +void lnet_msg_container_cleanup(struct lnet_msg_container *container); +void lnet_msg_containers_destroy(void); +int lnet_msg_containers_create(void); + +char *lnet_health_error2str(enum lnet_msg_hstatus hstatus); +char *lnet_msgtyp2str(int type); +void lnet_print_hdr(struct lnet_hdr *hdr); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +/** \addtogroup lnet_fault_simulation @{ */ + +int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); +int lnet_fault_init(void); +void lnet_fault_fini(void); + +bool lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus); + +int lnet_delay_rule_add(struct lnet_fault_attr *attr); +int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); +int lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat); +void lnet_delay_rule_reset(void); +void lnet_delay_rule_check(void); +bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); + +/** @} lnet_fault_simulation */ + +void lnet_counters_get_common(struct lnet_counters_common *common); +void lnet_counters_get(struct lnet_counters *counters); +void lnet_counters_reset(void); + +unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); +int lnet_extract_iov(int dst_niov, struct kvec *dst, + int src_niov, struct kvec *src, + unsigned int offset, unsigned int len); + +unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov); +int lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, + unsigned int doffset, + unsigned int nsiov, struct kvec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, + unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov(unsigned int ndkiov, lnet_kiov_t *dkiov, + unsigned int doffset, + unsigned int nskiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob); + +static inline void +lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + struct kvec diov = { .iov_base = dest, .iov_len = dlen }; + + lnet_copy_iov2iov(1, &diov, doffset, + nsiov, siov, soffset, nob); +} + +static inline void +lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob) +{ + struct kvec diov = { .iov_base = dest, .iov_len = dlen }; + + lnet_copy_kiov2iov(1, &diov, doffset, + nsiov, skiov, soffset, nob); +} + +static inline void +lnet_copy_flat2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + int slen, void *src, unsigned int soffset, + unsigned int nob) +{ + struct kvec siov = { .iov_base = src, .iov_len = slen }; + lnet_copy_iov2iov(ndiov, diov, doffset, + 1, &siov, soffset, nob); +} + +static inline void +lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, + unsigned int doffset, int slen, void *src, + unsigned int soffset, unsigned int nob) +{ + struct kvec siov = { .iov_base = src, .iov_len = slen }; + lnet_copy_iov2kiov(ndiov, dkiov, doffset, + 1, &siov, soffset, nob); +} + +void 
lnet_me_unlink(struct lnet_me *me); + +void lnet_md_unlink(struct lnet_libmd *md); +void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd); +struct page *lnet_kvaddr_to_page(unsigned long vaddr); +int lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset); + +unsigned int lnet_get_lnd_timeout(void); +void lnet_register_lnd(struct lnet_lnd *lnd); +void lnet_unregister_lnd(struct lnet_lnd *lnd); + +int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port, struct net *ns); +void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int port); +int lnet_count_acceptor_nets(void); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +struct lnet_inetdev { + u32 li_cpt; + u32 li_flags; + u32 li_ipaddr; + u32 li_netmask; + char li_name[IFNAMSIZ]; +}; + +int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns); +void lnet_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); +void lnet_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); +int lnet_sock_getaddr(struct socket *socket, bool remote, __u32 *ip, int *port); +int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout); +int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout); + +int lnet_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog, + struct net *ns); +int lnet_sock_accept(struct socket **newsockp, struct socket *sock); +int lnet_sock_connect(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port, struct net *ns); + +int lnet_peers_start_down(void); +int lnet_peer_buffer_credits(struct lnet_net *net); + +int lnet_monitor_thr_start(void); +void lnet_monitor_thr_stop(void); + +bool lnet_router_checker_active(void); +void lnet_check_routers(void); +int lnet_router_pre_mt_start(void); +void lnet_router_post_mt_start(void); +void lnet_prune_rc_data(int wait_unlink); +void lnet_router_cleanup(void); +void lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net); +void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf); + +int lnet_ping_info_validate(struct lnet_ping_info *pinfo); +struct lnet_ping_buffer *lnet_ping_buffer_alloc(int nnis, gfp_t gfp); +void lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf); + +static inline void lnet_ping_buffer_addref(struct lnet_ping_buffer *pbuf) +{ + atomic_inc(&pbuf->pb_refcnt); +} + +static inline void lnet_ping_buffer_decref(struct lnet_ping_buffer *pbuf) +{ + if (atomic_dec_and_test(&pbuf->pb_refcnt)) + lnet_ping_buffer_free(pbuf); +} + +static inline int lnet_ping_buffer_numref(struct lnet_ping_buffer *pbuf) +{ + return atomic_read(&pbuf->pb_refcnt); +} + +static inline int lnet_push_target_resize_needed(void) +{ + return the_lnet.ln_push_target->pb_nnis < the_lnet.ln_push_target_nnis; +} + +int lnet_push_target_resize(void); +void lnet_peer_push_event(struct lnet_event *ev); + +int lnet_parse_ip2nets(char **networksp, char *ip2nets); +int lnet_parse_routes(char *route_str, int *im_a_router); +int lnet_parse_networks(struct list_head *nilist, char *networks, + bool use_tcp_bonding); +bool lnet_net_unique(__u32 net_id, struct list_head *nilist, + struct lnet_net **net); +bool lnet_ni_unique_net(struct list_head *nilist, char *iface); +void lnet_incr_dlc_seq(void); +__u32 lnet_get_dlc_seq_locked(void); + +struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct 
lnet_peer_net *peer_net, + struct lnet_peer_ni *prev); +struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, + int cpt); +struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt); +struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); +struct lnet_peer *lnet_find_peer(lnet_nid_t nid); +void lnet_peer_net_added(struct lnet_net *net); +lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid); +int lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block); +int lnet_peer_discovery_start(void); +void lnet_peer_discovery_stop(void); +void lnet_push_update_to_peers(int force); +void lnet_peer_tables_cleanup(struct lnet_net *net); +void lnet_peer_uninit(void); +int lnet_peer_tables_create(void); +void lnet_debug_peer(lnet_nid_t nid); +struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, + __u32 net_id); +bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid); +int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid); +int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); +int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid); +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk); +int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char alivness[LNET_MAX_STR_LEN], + __u32 *cpt_iter, __u32 *refcount, + __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, + __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, + __u32 *peer_tx_qnob); +int lnet_get_peer_ni_hstats(struct lnet_ioctl_peer_ni_hstats *stats); + +static inline struct lnet_peer_net * +lnet_find_peer_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + + return NULL; +} + +static inline void +lnet_peer_set_alive(struct lnet_peer_ni *lp) +{ + lp->lpni_last_alive = ktime_get_seconds(); + lp->lpni_last_query = lp->lpni_last_alive; + if (!lp->lpni_alive) + lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); +} + +static inline bool +lnet_peer_is_multi_rail(struct lnet_peer *lp) +{ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + return true; + return false; +} + +static inline bool +lnet_peer_ni_is_configured(struct lnet_peer_ni *lpni) +{ + if (lpni->lpni_peer_net->lpn_peer->lp_state & LNET_PEER_CONFIGURED) + return true; + return false; +} + +static inline bool +lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni) +{ + return lpni->lpni_nid == lpni->lpni_peer_net->lpn_peer->lp_primary_nid; +} + +bool lnet_peer_is_uptodate(struct lnet_peer *lp); +bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp); +bool lnet_is_discovery_disabled(struct lnet_peer *lp); + +static inline bool +lnet_peer_needs_push(struct lnet_peer *lp) +{ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) + return false; + if (lp->lp_state & LNET_PEER_FORCE_PUSH) + return true; + if (lp->lp_state & LNET_PEER_NO_DISCOVERY) + return false; + /* if discovery is not enabled then no need to push */ + if (lnet_peer_discovery_disabled) + return false; + if (lp->lp_node_seqno < atomic_read(&the_lnet.ln_ping_target_seqno)) + return true; + return false; +} + +static inline void +lnet_inc_healthv(atomic_t *healthv) +{ + atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE); +} + +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type); + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum 
lnet_stats_type stats_type); + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats); + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h new file mode 100644 index 0000000000000..496a1b0fe0f93 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h @@ -0,0 +1,1164 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef __LNET_LIB_TYPES_H__ +#define __LNET_LIB_TYPES_H__ + +#ifndef __KERNEL__ +# error This include is only for kernel use. +#endif + +#include +#include +#include +#include + +#include +#include + +/* Max payload size */ +#define LNET_MAX_PAYLOAD LNET_MTU + +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* + * This is the maximum health value. + * All local and peer NIs created have their health default to this value. + */ +#define LNET_MAX_HEALTH_VALUE 1000 + +/* forward refs */ +struct lnet_libmd; + +enum lnet_msg_hstatus { + LNET_MSG_STATUS_OK = 0, + LNET_MSG_STATUS_LOCAL_INTERRUPT, + LNET_MSG_STATUS_LOCAL_DROPPED, + LNET_MSG_STATUS_LOCAL_ABORTED, + LNET_MSG_STATUS_LOCAL_NO_ROUTE, + LNET_MSG_STATUS_LOCAL_ERROR, + LNET_MSG_STATUS_LOCAL_TIMEOUT, + LNET_MSG_STATUS_REMOTE_ERROR, + LNET_MSG_STATUS_REMOTE_DROPPED, + LNET_MSG_STATUS_REMOTE_TIMEOUT, + LNET_MSG_STATUS_NETWORK_TIMEOUT, + LNET_MSG_STATUS_END, +}; + +struct lnet_rsp_tracker { + /* chain on the waiting list */ + struct list_head rspt_on_list; + /* cpt to lock */ + int rspt_cpt; + /* nid of next hop */ + lnet_nid_t rspt_next_hop_nid; + /* deadline of the REPLY/ACK */ + ktime_t rspt_deadline; + /* parent MD */ + struct lnet_handle_md rspt_mdh; +}; + +struct lnet_msg { + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + struct lnet_process_id msg_target; + /* Primary NID of the source. */ + lnet_nid_t msg_initiator; + /* where is it from, it's only for building event */ + lnet_nid_t msg_from; + __u32 msg_type; + + /* + * hold parameters in case message is with held due + * to discovery + */ + lnet_nid_t msg_src_nid_param; + lnet_nid_t msg_rtr_nid_param; + + /* + * Deadline for the message after which it will be finalized if it + * has not completed. 
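+ *
+ * Purely as an illustration (not part of this header), a consumer of
+ * this field would typically compare it with the current monotonic
+ * time, for example
+ *	expired = ktime_compare(ktime_get(), msg->msg_deadline) > 0;
+ * and finalize the expired message via lnet_finalize(), declared
+ * earlier in this patch.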
+ */ + ktime_t msg_deadline; + + /* The message health status. */ + enum lnet_msg_hstatus msg_health_status; + /* This is a recovery message */ + bool msg_recovery; + /* the number of times a transmission has been retried */ + int msg_retry_count; + /* flag to indicate that we do not want to resend this message */ + bool msg_no_resend; + + /* committed for sending */ + unsigned int msg_tx_committed:1; + /* CPT # this message committed for sending */ + unsigned int msg_tx_cpt:15; + /* committed for receiving */ + unsigned int msg_rx_committed:1; + /* CPT # this message committed for receiving */ + unsigned int msg_rx_cpt:15; + /* queued for tx credit */ + unsigned int msg_tx_delayed:1; + /* queued for RX buffer */ + unsigned int msg_rx_delayed:1; + /* ready for pending on RX delay list */ + unsigned int msg_rx_ready_delay:1; + + unsigned int msg_vmflush:1; /* VM trying to free memory */ + unsigned int msg_target_is_router:1; /* sending to a router */ + unsigned int msg_routing:1; /* being forwarded */ + unsigned int msg_ack:1; /* ack on finalize (PUT) */ + unsigned int msg_sending:1; /* outgoing message */ + unsigned int msg_receiving:1; /* being received */ + unsigned int msg_txcredit:1; /* taken an NI send credit */ + unsigned int msg_peertxcredit:1; /* taken a peer send credit */ + unsigned int msg_rtrcredit:1; /* taken a globel router credit */ + unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ + unsigned int msg_onactivelist:1; /* on the activelist */ + unsigned int msg_rdma_get:1; + + struct lnet_peer_ni *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer_ni *msg_rxpeer; /* peer I received from */ + + void *msg_private; + struct lnet_libmd *msg_md; + /* the NI the message was sent or received over */ + struct lnet_ni *msg_txni; + struct lnet_ni *msg_rxni; + + unsigned int msg_len; + unsigned int msg_wanted; + unsigned int msg_offset; + unsigned int msg_niov; + struct kvec *msg_iov; + lnet_kiov_t *msg_kiov; + + struct lnet_event msg_ev; + struct lnet_hdr msg_hdr; +}; + +struct lnet_libhandle { + struct list_head lh_hash_chain; + __u64 lh_cookie; +}; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +struct lnet_eq { + struct list_head eq_list; + struct lnet_libhandle eq_lh; + unsigned long eq_enq_seq; + unsigned long eq_deq_seq; + unsigned int eq_size; + lnet_eq_handler_t eq_callback; + struct lnet_event *eq_events; + int **eq_refs; /* percpt refcount for EQ */ +}; + +struct lnet_me { + struct list_head me_list; + struct lnet_libhandle me_lh; + struct lnet_process_id me_match_id; + unsigned int me_portal; + unsigned int me_pos; /* hash offset in mt_hash */ + __u64 me_match_bits; + __u64 me_ignore_bits; + enum lnet_unlink me_unlink; + struct lnet_libmd *me_md; +}; + +struct lnet_libmd { + struct list_head md_list; + struct lnet_libhandle md_lh; + struct lnet_me *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + struct lnet_rsp_tracker *md_rspt_ptr; + struct lnet_eq *md_eq; + struct lnet_handle_md md_bulk_handle; + union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; + } md_iov; +}; + +#define LNET_MD_FLAG_ZOMBIE (1 << 0) +#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) +#define LNET_MD_FLAG_ABORTED (1 << 2) + +struct lnet_test_peer { + /* info about peers we are trying to fail 
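+ * (purely illustrative note: such entries are typically created through
+ * lnet_fail_nid(), declared earlier in this patch, and tp_threshold is
+ * consumed as matching messages are failed; exact behaviour assumed)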
*/ + struct list_head tp_list; /* ln_test_peers */ + lnet_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +}; + +#define LNET_COOKIE_TYPE_MD 1 +#define LNET_COOKIE_TYPE_ME 2 +#define LNET_COOKIE_TYPE_EQ 3 +#define LNET_COOKIE_TYPE_BITS 2 +#define LNET_COOKIE_MASK ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL) + +struct lnet_ni; /* forward ref */ +struct socket; + +struct lnet_lnd { + /* fields managed by portals */ + struct list_head lnd_list; /* stash in the LND table */ + int lnd_refcount; /* # active instances */ + + /* fields initialized by the LND */ + __u32 lnd_type; + + int (*lnd_startup)(struct lnet_ni *ni); + void (*lnd_shutdown)(struct lnet_ni *ni); + int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); + + /* In data movement APIs below, payload buffers are described as a set + * of 'niov' fragments which are... + * EITHER + * in virtual memory (struct kvec *iov != NULL) + * OR + * in pages (kernel only: lnet_kiov_t *kiov != NULL). + * The LND may NOT overwrite these fragment descriptors. + * An 'offset' may specify a byte offset within the set of + * fragments to start from. + */ + + /* Start sending a preformatted message. 'private' is NULL for PUT and + * GET messages; otherwise this is a response to an incoming message + * and 'private' is the 'private' passed to lnet_parse(). Return + * non-zero for immediate failure, otherwise complete later with + * lnet_finalize() */ + int (*lnd_send)(struct lnet_ni *ni, void *private, + struct lnet_msg *msg); + + /* Start receiving 'mlen' bytes of payload data, skipping the following + * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to + * lnet_parse(). Return non-zero for immediate failure, otherwise + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ + int (*lnd_recv)(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called.
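+ *
+ * Purely illustrative sketch (not part of this header): an LND that has
+ * no urgent resources to release could provide
+ *	static int my_lnd_eager_recv(struct lnet_ni *ni, void *private,
+ *				     struct lnet_msg *msg, void **new_privatep)
+ *	{
+ *		*new_privatep = private;   (hand back the same private data)
+ *		return 0;                  (success; credit is kept as above)
+ *	}
+ * where 'my_lnd_eager_recv' is a hypothetical name; real LNDs may simply
+ * leave this hook unset.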
*/ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, + struct lnet_msg *msg, void **new_privatep); + + /* notification of peer health */ + void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); + + /* query of peer aliveness */ + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time64_t *when); + + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); +}; + +struct lnet_tx_queue { + int tq_credits; /* # tx credits free */ + int tq_credits_min; /* lowest it's been */ + int tq_credits_max; /* total # tx credits */ + struct list_head tq_delayed; /* delayed TXs */ +}; + +enum lnet_net_state { + /* set when net block is allocated */ + LNET_NET_STATE_INIT = 0, + /* set when NIs in net are started successfully */ + LNET_NET_STATE_ACTIVE, + /* set if all NIs in net are in FAILED state */ + LNET_NET_STATE_INACTIVE, + /* set when shutting down a NET */ + LNET_NET_STATE_DELETING +}; + +enum lnet_ni_state { + /* initial state when NI is created */ + LNET_NI_STATE_INIT = 0, + /* set when NI is brought up */ + LNET_NI_STATE_ACTIVE, + /* set when NI is being shutdown */ + LNET_NI_STATE_DELETING, +}; + +#define LNET_NI_RECOVERY_PENDING BIT(0) +#define LNET_NI_RECOVERY_FAILED BIT(1) + +enum lnet_stats_type { + LNET_STATS_TYPE_SEND = 0, + LNET_STATS_TYPE_RECV, + LNET_STATS_TYPE_DROP +}; + +struct lnet_comm_count { + atomic_t co_get_count; + atomic_t co_put_count; + atomic_t co_reply_count; + atomic_t co_ack_count; + atomic_t co_hello_count; +}; + +struct lnet_element_stats { + struct lnet_comm_count el_send_stats; + struct lnet_comm_count el_recv_stats; + struct lnet_comm_count el_drop_stats; +}; + +struct lnet_health_local_stats { + atomic_t hlt_local_interrupt; + atomic_t hlt_local_dropped; + atomic_t hlt_local_aborted; + atomic_t hlt_local_no_route; + atomic_t hlt_local_timeout; + atomic_t hlt_local_error; +}; + +struct lnet_health_remote_stats { + atomic_t hlt_remote_dropped; + atomic_t hlt_remote_timeout; + atomic_t hlt_remote_error; + atomic_t hlt_network_timeout; +}; + +struct lnet_net { + /* chain on the ln_nets */ + struct list_head net_list; + + /* net ID, which is composed of + * (net_type << 16) | net_num. 
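+ * (illustratively, a net_type of 2 with a net_num of 3 would give a
+ * net_id of (2 << 16) | 3 == 0x20003; the values are assumed, not a
+ * real configuration)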
+ * net_type can be one of the enumerated types defined in + * lnet/include/lnet/nidstr.h */ + __u32 net_id; + + /* priority of the network */ + __u32 net_prio; + + /* total number of CPTs in the array */ + __u32 net_ncpts; + + /* cumulative CPTs of all NIs in this net */ + __u32 *net_cpts; + + /* network tunables */ + struct lnet_ioctl_config_lnd_cmn_tunables net_tunables; + + /* + * boolean to indicate that the tunables have been set and + * shouldn't be reset + */ + bool net_tunables_set; + + /* procedural interface */ + struct lnet_lnd *net_lnd; + + /* list of NIs on this net */ + struct list_head net_ni_list; + + /* list of NIs being added, but not started yet */ + struct list_head net_ni_added; + + /* dying LND instances */ + struct list_head net_ni_zombie; + + /* network state */ + enum lnet_net_state net_state; +}; + +struct lnet_ni { + /* chain on the lnet_net structure */ + struct list_head ni_netlist; + + /* chain on the recovery queue */ + struct list_head ni_recovery; + + /* MD handle for recovery ping */ + struct lnet_handle_md ni_ping_mdh; + + spinlock_t ni_lock; + + /* number of CPTs */ + int ni_ncpts; + + /* bond NI on some CPTs */ + __u32 *ni_cpts; + + /* interface's NID */ + lnet_nid_t ni_nid; + + /* instance-specific data */ + void *ni_data; + + /* per ni credits */ + atomic_t ni_tx_credits; + + /* percpt TX queues */ + struct lnet_tx_queue **ni_tx_queues; + + /* percpt reference count */ + int **ni_refs; + + /* when I was last alive */ + time64_t ni_last_alive; + + /* pointer to parent network */ + struct lnet_net *ni_net; + + /* my health status */ + struct lnet_ni_status *ni_status; + + /* NI FSM. Protected by lnet_ni_lock() */ + enum lnet_ni_state ni_state; + + /* Recovery state. Protected by lnet_ni_lock() */ + __u32 ni_recovery_state; + + /* per NI LND tunables */ + struct lnet_lnd_tunables ni_lnd_tunables; + + /* lnd tunables set explicitly */ + bool ni_lnd_tunables_set; + + /* NI statistics */ + struct lnet_element_stats ni_stats; + struct lnet_health_local_stats ni_hstats; + + /* physical device CPT */ + int ni_dev_cpt; + + /* sequence number used to round robin over nis within a net */ + __u32 ni_seq; + + /* + * health value + * initialized to LNET_MAX_HEALTH_VALUE + * Value is decremented every time we fail to send a message over + * this NI because of a NI specific failure. + * Value is incremented if we successfully send a message. + */ + atomic_t ni_healthv; + + /* + * Set to 1 by the LND when it receives an event telling it the device + * has gone into a fatal state. Set to 0 when the LND receives an + * even telling it the device is back online. + */ + atomic_t ni_fatal_error_on; + + /* + * equivalent interfaces to use + * This is an array because socklnd bonding can still be configured + */ + char *ni_interfaces[LNET_INTERFACES_NUM]; + struct net *ni_net_ns; /* original net namespace */ +}; + +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL + +/* + * Descriptor of a ping info buffer: keep a separate indicator of the + * size and a reference count. The type is used both as a source and + * sink of data, so we need to keep some information outside of the + * area that may be overwritten by network data. 
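+ *
+ * As a worked example (values assumed), a buffer able to describe four
+ * NIs is allocated with LNET_PING_BUFFER_SIZE(4) bytes, i.e.
+ * offsetof(struct lnet_ping_buffer, pb_info.pi_ni[4]), so the trailing
+ * pi_ni[] array of the embedded lnet_ping_info is sized at allocation
+ * time while pb_nnis and pb_refcnt stay in front of the overwritable
+ * region.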
+ */ +struct lnet_ping_buffer { + int pb_nnis; + atomic_t pb_refcnt; + struct lnet_ping_info pb_info; +}; + +#define LNET_PING_BUFFER_SIZE(NNIDS) \ + offsetof(struct lnet_ping_buffer, pb_info.pi_ni[NNIDS]) +#define LNET_PING_BUFFER_LONI(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_nid) +#define LNET_PING_BUFFER_SEQNO(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_status) + +#define LNET_PING_INFO_TO_BUFFER(PINFO) \ + container_of((PINFO), struct lnet_ping_buffer, pb_info) + +/* router checker data, per router */ +struct lnet_rc_data { + /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ + struct list_head rcd_list; + struct lnet_handle_md rcd_mdh; /* ping buffer MD */ + struct lnet_peer_ni *rcd_gateway; /* reference to gateway */ + struct lnet_ping_buffer *rcd_pingbuffer;/* ping buffer */ + int rcd_nnis; /* desired size of buffer */ +}; + +struct lnet_peer_ni { + /* chain on lpn_peer_nis */ + struct list_head lpni_peer_nis; + /* chain on remote peer list */ + struct list_head lpni_on_remote_peer_ni_list; + /* chain on recovery queue */ + struct list_head lpni_recovery; + /* chain on peer hash */ + struct list_head lpni_hashlist; + /* messages blocking for tx credits */ + struct list_head lpni_txq; + /* messages blocking for router credits */ + struct list_head lpni_rtrq; + /* chain on router list */ + struct list_head lpni_rtr_list; + /* pointer to peer net I'm part of */ + struct lnet_peer_net *lpni_peer_net; + /* statistics kept on each peer NI */ + struct lnet_element_stats lpni_stats; + struct lnet_health_remote_stats lpni_hstats; + /* spin lock protecting credits and lpni_txq / lpni_rtrq */ + spinlock_t lpni_lock; + /* # tx credits available */ + int lpni_txcredits; + /* low water mark */ + int lpni_mintxcredits; + /* # router credits */ + int lpni_rtrcredits; + /* low water mark */ + int lpni_minrtrcredits; + /* bytes queued for sending */ + long lpni_txqnob; + /* alive/dead? */ + bool lpni_alive; + /* notification outstanding? */ + bool lpni_notify; + /* outstanding notification for LND? */ + bool lpni_notifylnd; + /* some thread is handling notification */ + bool lpni_notifying; + /* SEND event outstanding from ping */ + bool lpni_ping_notsent; + /* # times router went dead<->alive. Protected with lpni_lock */ + int lpni_alive_count; + /* time of last aliveness news */ + time64_t lpni_timestamp; + /* time of last ping attempt */ + time64_t lpni_ping_timestamp; + /* != 0 if ping reply expected */ + time64_t lpni_ping_deadline; + /* when I was last alive */ + time64_t lpni_last_alive; + /* when lpni_ni was queried last time */ + time64_t lpni_last_query; + /* network peer is on */ + struct lnet_net *lpni_net; + /* peer's NID */ + lnet_nid_t lpni_nid; + /* # refs */ + atomic_t lpni_refcount; + /* health value for the peer */ + atomic_t lpni_healthv; + /* recovery ping mdh */ + struct lnet_handle_md lpni_recovery_ping_mdh; + /* CPT this peer attached on */ + int lpni_cpt; + /* state flags -- protected by lpni_lock */ + unsigned lpni_state; + /* # refs from lnet_route_t::lr_gateway */ + int lpni_rtr_refcount; + /* sequence number used to round robin over peer nis within a net */ + __u32 lpni_seq; + /* sequence number used to round robin over gateways */ + __u32 lpni_gw_seq; + /* health flag */ + bool lpni_healthy; + /* returned RC ping features. 
Protected with lpni_lock */ + unsigned int lpni_ping_feats; + /* routes on this peer */ + struct list_head lpni_routes; + /* preferred local nids: if only one, use lpni_pref.nid */ + union lpni_pref { + lnet_nid_t nid; + lnet_nid_t *nids; + } lpni_pref; + /* number of preferred NIDs in lnpi_pref_nids */ + __u32 lpni_pref_nnids; + /* router checker state */ + struct lnet_rc_data *lpni_rcd; +}; + +/* Preferred path added due to traffic on non-MR peer_ni */ +#define LNET_PEER_NI_NON_MR_PREF (1 << 0) +/* peer is being recovered. */ +#define LNET_PEER_NI_RECOVERY_PENDING (1 << 1) +/* recovery ping failed */ +#define LNET_PEER_NI_RECOVERY_FAILED (1 << 2) +/* peer is being deleted */ +#define LNET_PEER_NI_DELETING (1 << 3) + +struct lnet_peer { + /* chain on pt_peer_list */ + struct list_head lp_peer_list; + + /* list of peer nets */ + struct list_head lp_peer_nets; + + /* list of messages pending discovery*/ + struct list_head lp_dc_pendq; + + /* primary NID of the peer */ + lnet_nid_t lp_primary_nid; + + /* source NID to use during discovery */ + lnet_nid_t lp_disc_src_nid; + + /* CPT of peer_table */ + int lp_cpt; + + /* number of NIDs on this peer */ + int lp_nnis; + + /* reference count */ + atomic_t lp_refcount; + + /* lock protecting peer state flags */ + spinlock_t lp_lock; + + /* peer state flags */ + unsigned lp_state; + + /* buffer for data pushed by peer */ + struct lnet_ping_buffer *lp_data; + + /* MD handle for ping in progress */ + struct lnet_handle_md lp_ping_mdh; + + /* MD handle for push in progress */ + struct lnet_handle_md lp_push_mdh; + + /* number of NIDs for sizing push data */ + int lp_data_nnis; + + /* NI config sequence number of peer */ + __u32 lp_peer_seqno; + + /* Local NI config sequence number acked by peer */ + __u32 lp_node_seqno; + + /* Local NI config sequence number sent to peer */ + __u32 lp_node_seqno_sent; + + /* Ping error encountered during discovery. */ + int lp_ping_error; + + /* Push error encountered during discovery. */ + int lp_push_error; + + /* Error encountered during discovery. */ + int lp_dc_error; + + /* time it was put on the ln_dc_working queue */ + time64_t lp_last_queued; + + /* link on discovery-related lists */ + struct list_head lp_dc_list; + + /* tasks waiting on discovery of this peer */ + wait_queue_head_t lp_dc_waitq; +}; + +/* + * The status flags in lp_state. Their semantics have chosen so that + * lp_state can be zero-initialized. + * + * A peer is marked MULTI_RAIL in two cases: it was configured using DLC + * as multi-rail aware, or the LNET_PING_FEAT_MULTI_RAIL bit was set. + * + * A peer is marked NO_DISCOVERY if the LNET_PING_FEAT_DISCOVERY bit was + * NOT set when the peer was pinged by discovery. + */ +#define LNET_PEER_MULTI_RAIL (1 << 0) /* Multi-rail aware */ +#define LNET_PEER_NO_DISCOVERY (1 << 1) /* Peer disabled discovery */ +/* + * A peer is marked CONFIGURED if it was configured by DLC. + * + * In addition, a peer is marked DISCOVERED if it has fully passed + * through Peer Discovery. + * + * When Peer Discovery is disabled, the discovery thread will mark + * peers REDISCOVER to indicate that they should be re-examined if + * discovery is (re)enabled on the node. + * + * A peer that was created as the result of inbound traffic will not + * be marked at all. 
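+ *
+ * Illustrative only (not part of this header): lp_state is protected by
+ * lp_lock, so a flag such as LNET_PEER_MULTI_RAIL would be set along
+ * the lines of
+ *	spin_lock(&lp->lp_lock);
+ *	lp->lp_state |= LNET_PEER_MULTI_RAIL;
+ *	spin_unlock(&lp->lp_lock);
+ * and tested with (lp->lp_state & LNET_PEER_MULTI_RAIL), as the inline
+ * helpers declared earlier in this patch do.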
+ */ +#define LNET_PEER_CONFIGURED (1 << 2) /* Configured via DLC */ +#define LNET_PEER_DISCOVERED (1 << 3) /* Peer was discovered */ +#define LNET_PEER_REDISCOVER (1 << 4) /* Discovery was disabled */ +/* + * A peer is marked DISCOVERING when discovery is in progress. + * The other flags below correspond to stages of discovery. + */ +#define LNET_PEER_DISCOVERING (1 << 5) /* Discovering */ +#define LNET_PEER_DATA_PRESENT (1 << 6) /* Remote peer data present */ +#define LNET_PEER_NIDS_UPTODATE (1 << 7) /* Remote peer info uptodate */ +#define LNET_PEER_PING_SENT (1 << 8) /* Waiting for REPLY to Ping */ +#define LNET_PEER_PUSH_SENT (1 << 9) /* Waiting for ACK of Push */ +#define LNET_PEER_PING_FAILED (1 << 10) /* Ping send failure */ +#define LNET_PEER_PUSH_FAILED (1 << 11) /* Push send failure */ +/* + * A ping can be forced as a way to fix up state, or as a manual + * intervention by an admin. + * A push can be forced in circumstances that would normally not + * allow for one to happen. + */ +#define LNET_PEER_FORCE_PING (1 << 12) /* Forced Ping */ +#define LNET_PEER_FORCE_PUSH (1 << 13) /* Forced Push */ + +struct lnet_peer_net { + /* chain on lp_peer_nets */ + struct list_head lpn_peer_nets; + + /* list of peer_nis on this network */ + struct list_head lpn_peer_nis; + + /* pointer to the peer I'm part of */ + struct lnet_peer *lpn_peer; + + /* Net ID */ + __u32 lpn_net_id; + + /* reference count */ + atomic_t lpn_refcount; +}; + +/* peer hash size */ +#define LNET_PEER_HASH_BITS 9 +#define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) + +/* + * peer hash table - one per CPT + * + * protected by lnet_net_lock/EX for update + * pt_version + * pt_number + * pt_hash[...] + * pt_peer_list + * pt_peers + * protected by pt_zombie_lock: + * pt_zombie_list + * pt_zombies + * + * pt_zombie lock nests inside lnet_net_lock + */ +struct lnet_peer_table { + int pt_version; /* /proc validity stamp */ + int pt_number; /* # peers_ni extant */ + struct list_head *pt_hash; /* NID->peer hash */ + struct list_head pt_peer_list; /* peers */ + int pt_peers; /* # peers */ + struct list_head pt_zombie_list; /* zombie peer_ni */ + int pt_zombies; /* # zombie peers_ni */ + spinlock_t pt_zombie_lock; /* protect list and count */ +}; + +/* peer aliveness is enabled only on routers for peers in a network where the + * struct lnet_ni::ni_peertimeout has been set to a positive value + */ +#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ + ((lp)->lpni_net) && \ + (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) + +struct lnet_route { + struct list_head lr_list; /* chain on net */ + struct list_head lr_gwlist; /* chain on gateway */ + struct lnet_peer_ni *lr_gateway; /* router node */ + __u32 lr_net; /* remote network number */ + int lr_seq; /* sequence for round-robin */ + unsigned int lr_downis; /* number of down NIs */ + __u32 lr_hops; /* how far I am */ + unsigned int lr_priority; /* route priority */ +}; + +#define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) +#define LNET_REMOTE_NETS_HASH_MAX (1U << 16) +#define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) + +struct lnet_remotenet { + /* chain on ln_remote_nets_hash */ + struct list_head lrn_list; + /* routes to me */ + struct list_head lrn_routes; + /* my net number */ + __u32 lrn_net; +}; + +/** lnet message has credit and can be submitted to lnd for send/receive */ +#define LNET_CREDIT_OK 0 +/** lnet message is waiting for credit */ +#define LNET_CREDIT_WAIT 1 +/** lnet message is waiting for discovery */ +#define 
LNET_DC_WAIT 2 + +struct lnet_rtrbufpool { + /* my free buffer pool */ + struct list_head rbp_bufs; + /* messages blocking for a buffer */ + struct list_head rbp_msgs; + /* # pages in each buffer */ + int rbp_npages; + /* requested number of buffers */ + int rbp_req_nbuffers; + /* # buffers actually allocated */ + int rbp_nbuffers; + /* # free buffers / blocked messages */ + int rbp_credits; + /* low water mark */ + int rbp_mincredits; +}; + +struct lnet_rtrbuf { + struct list_head rb_list; /* chain on rbp_bufs */ + struct lnet_rtrbufpool *rb_pool; /* owning pool */ + lnet_kiov_t rb_kiov[0]; /* the buffer space */ +}; + +#define LNET_PEER_HASHSIZE 503 /* prime! */ + +enum lnet_match_flags { + /* Didn't match anything */ + LNET_MATCHMD_NONE = (1 << 0), + /* Matched OK */ + LNET_MATCHMD_OK = (1 << 1), + /* Must be discarded */ + LNET_MATCHMD_DROP = (1 << 2), + /* match and buffer is exhausted */ + LNET_MATCHMD_EXHAUSTED = (1 << 3), + /* match or drop */ + LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), +}; + +/* Options for struct lnet_portal::ptl_options */ +#define LNET_PTL_LAZY (1 << 0) +#define LNET_PTL_MATCH_UNIQUE (1 << 1) /* unique match, for RDMA */ +#define LNET_PTL_MATCH_WILDCARD (1 << 2) /* wildcard match, request portal */ + +/* parameter for matching operations (GET, PUT) */ +struct lnet_match_info { + __u64 mi_mbits; + struct lnet_process_id mi_id; + unsigned int mi_cpt; + unsigned int mi_opc; + unsigned int mi_portal; + unsigned int mi_rlength; + unsigned int mi_roffset; +}; + +/* ME hash of RDMA portal */ +#define LNET_MT_HASH_BITS 8 +#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) +#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) +/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, + * the last entry is reserved for MEs with ignore-bits */ +#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE +/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which + * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the + * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */ +#define LNET_MT_BITS_U64 6 /* 2^6 bits */ +#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) +#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) + +/* portal match table */ +struct lnet_match_table { + /* reserved for upcoming patches, CPU partition ID */ + unsigned int mt_cpt; + unsigned int mt_portal; /* portal index */ + /* match table is set as "enabled" if there's non-exhausted MD + * attached on mt_mhash, it's only valid for wildcard portal */ + unsigned int mt_enabled; + /* bitmap to flag whether MEs on mt_hash are exhausted or not */ + __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; + struct list_head *mt_mhash; /* matching hash */ +}; + +/* these are only useful for wildcard portal */ +/* Turn off message rotor for wildcard portals */ +#define LNET_PTL_ROTOR_OFF 0 +/* round-robin dispatch all PUT messages for wildcard portals */ +#define LNET_PTL_ROTOR_ON 1 +/* round-robin dispatch routed PUT message for wildcard portals */ +#define LNET_PTL_ROTOR_RR_RT 2 +/* dispatch routed PUT message by hashing source NID for wildcard portals */ +#define LNET_PTL_ROTOR_HASH_RT 3 + +struct lnet_portal { + spinlock_t ptl_lock; + unsigned int ptl_index; /* portal ID, reserved */ + /* flags on this portal: lazy, unique... 
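+ * (these are the LNET_PTL_* bits defined above; illustratively, a lazy
+ * portal is detected with lnet_ptl_is_lazy(ptl), i.e.
+ * !!(ptl->ptl_options & LNET_PTL_LAZY), and bits are set or cleared
+ * through lnet_ptl_setopt() and lnet_ptl_unsetopt(), all declared
+ * earlier in this patch)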
*/ + unsigned int ptl_options; + /* list of messages which are stealing buffer */ + struct list_head ptl_msg_stealing; + /* messages blocking for MD */ + struct list_head ptl_msg_delayed; + /* Match table for each CPT */ + struct lnet_match_table **ptl_mtables; + /* spread rotor of incoming "PUT" */ + unsigned int ptl_rotor; + /* # active entries for this portal */ + int ptl_mt_nmaps; + /* array of active entries' cpu-partition-id */ + int ptl_mt_maps[0]; +}; + +#define LNET_LH_HASH_BITS 12 +#define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) +#define LNET_LH_HASH_MASK (LNET_LH_HASH_SIZE - 1) + +/* resource container (ME, MD, EQ) */ +struct lnet_res_container { + unsigned int rec_type; /* container type */ + __u64 rec_lh_cookie; /* cookie generator */ + struct list_head rec_active; /* active resource list */ + struct list_head *rec_lh_hash; /* handle hash */ +}; + +/* message container */ +struct lnet_msg_container { + int msc_init; /* initialized or not */ + /* max # threads finalizing */ + int msc_nfinalizers; + /* msgs waiting to complete finalizing */ + struct list_head msc_finalizing; + /* msgs waiting to be resent */ + struct list_head msc_resending; + struct list_head msc_active; /* active message list */ + /* threads doing finalization */ + void **msc_finalizers; + /* threads doing resends */ + void **msc_resenders; +}; + +/* Peer Discovery states */ +#define LNET_DC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_DC_STATE_RUNNING 1 /* started up OK */ +#define LNET_DC_STATE_STOPPING 2 /* telling thread to stop */ + +/* Router Checker states */ +#define LNET_MT_STATE_SHUTDOWN 0 /* not started */ +#define LNET_MT_STATE_RUNNING 1 /* started up OK */ +#define LNET_MT_STATE_STOPPING 2 /* telling thread to stop */ + +/* LNet states */ +#define LNET_STATE_SHUTDOWN 0 /* not started */ +#define LNET_STATE_RUNNING 1 /* started up OK */ +#define LNET_STATE_STOPPING 2 /* telling thread to stop */ + +struct lnet { + /* CPU partition table of LNet */ + struct cfs_cpt_table *ln_cpt_table; + /* number of CPTs in ln_cpt_table */ + unsigned int ln_cpt_number; + unsigned int ln_cpt_bits; + + /* protect LNet resources (ME/MD/EQ) */ + struct cfs_percpt_lock *ln_res_lock; + /* # portals */ + int ln_nportals; + /* the vector of portals */ + struct lnet_portal **ln_portals; + /* percpt ME containers */ + struct lnet_res_container **ln_me_containers; + /* percpt MD container */ + struct lnet_res_container **ln_md_containers; + + /* Event Queue container */ + struct lnet_res_container ln_eq_container; + wait_queue_head_t ln_eq_waitq; + spinlock_t ln_eq_wait_lock; + + unsigned int ln_remote_nets_hbits; + + /* protect NI, peer table, credits, routers, rtrbuf... 
*/ + struct cfs_percpt_lock *ln_net_lock; + /* percpt message containers for active/finalizing/freed message */ + struct lnet_msg_container **ln_msg_containers; + struct lnet_counters **ln_counters; + struct lnet_peer_table **ln_peer_tables; + /* list of peer nis not on a local network */ + struct list_head ln_remote_peer_ni_list; + /* failure simulation */ + struct list_head ln_test_peers; + struct list_head ln_drop_rules; + struct list_head ln_delay_rules; + /* LND instances */ + struct list_head ln_nets; + /* the loopback NI */ + struct lnet_ni *ln_loni; + /* network zombie list */ + struct list_head ln_net_zombie; + /* resend messages list */ + struct list_head ln_msg_resend; + /* spin lock to protect the msg resend list */ + spinlock_t ln_msg_resend_lock; + + /* remote networks with routes to them */ + struct list_head *ln_remote_nets_hash; + /* validity stamp */ + __u64 ln_remote_nets_version; + /* list of all known routers */ + struct list_head ln_routers; + /* validity stamp */ + __u64 ln_routers_version; + /* percpt router buffer pools */ + struct lnet_rtrbufpool **ln_rtrpools; + + /* + * Ping target / Push source + * + * The ping target and push source share a single buffer. The + * ln_ping_target is protected against concurrent updates by + * ln_api_mutex. + */ + struct lnet_handle_md ln_ping_target_md; + struct lnet_handle_eq ln_ping_target_eq; + struct lnet_ping_buffer *ln_ping_target; + atomic_t ln_ping_target_seqno; + + /* + * Push Target + * + * ln_push_nnis contains the desired size of the push target. + * The lnet_net_lock is used to handle update races. The old + * buffer may linger a while after it has been unlinked, in + * which case the event handler cleans up. + */ + struct lnet_handle_eq ln_push_target_eq; + struct lnet_handle_md ln_push_target_md; + struct lnet_ping_buffer *ln_push_target; + int ln_push_target_nnis; + + /* discovery event queue handle */ + struct lnet_handle_eq ln_dc_eqh; + /* discovery requests */ + struct list_head ln_dc_request; + /* discovery working list */ + struct list_head ln_dc_working; + /* discovery expired list */ + struct list_head ln_dc_expired; + /* discovery thread wait queue */ + wait_queue_head_t ln_dc_waitq; + /* discovery startup/shutdown state */ + int ln_dc_state; + + /* monitor thread startup/shutdown state */ + int ln_mt_state; + /* router checker's event queue */ + struct lnet_handle_eq ln_rc_eqh; + /* rcd still pending on net */ + struct list_head ln_rcd_deathrow; + /* rcd ready for free */ + struct list_head ln_rcd_zombie; + /* serialise startup/shutdown */ + struct semaphore ln_mt_signal; + + struct mutex ln_api_mutex; + struct mutex ln_lnd_mutex; + /* Have I called LNetNIInit myself? */ + int ln_niinit_self; + /* LNetNIInit/LNetNIFini counter */ + int ln_refcount; + /* SHUTDOWN/RUNNING/STOPPING */ + int ln_state; + + int ln_routing; /* am I a router? */ + lnet_pid_t ln_pid; /* requested pid */ + /* uniquely identifies this ni in this epoch */ + __u64 ln_interface_cookie; + /* registered LNDs */ + struct list_head ln_lnds; + + /* test protocol compatibility flags */ + int ln_testprotocompat; + + /* 0 - load the NIs from the mod params + * 1 - do not load the NIs from the mod params + * Reverse logic to ensure that other calls to LNetNIInit + * need no change + */ + bool ln_nis_from_mod_params; + + /* + * waitq for the monitor thread. The monitor thread takes care of + * checking routes, timedout messages and resending messages. 
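+ *
+ * Illustrative only (not part of this header): a monitor thread would
+ * typically sleep on this queue with something like
+ *	wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
+ *					 stop_requested, timeout);
+ * and be woken through wake_up(&the_lnet.ln_mt_waitq); the wake-up
+ * condition shown ('stop_requested') is an assumption.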
+ */ + wait_queue_head_t ln_mt_waitq; + + /* per-cpt resend queues */ + struct list_head **ln_mt_resendqs; + /* local NIs to recover */ + struct list_head ln_mt_localNIRecovq; + /* local NIs to recover */ + struct list_head ln_mt_peerNIRecovq; + /* + * An array of queues for GET/PUT waiting for REPLY/ACK respectively. + * There are CPT number of queues. Since response trackers will be + * added on the fast path we can't afford to grab the exclusive + * net lock to protect these queues. The CPT will be calculated + * based on the mdh cookie. + */ + struct list_head **ln_mt_rstq; + /* + * A response tracker becomes a zombie when the associated MD is queued + * for unlink before the response tracker is detached from the MD. An + * entry on a zombie list can be freed when either the remaining + * operations on the MD complete or when LNet has shut down. + */ + struct list_head **ln_mt_zombie_rstqs; + /* recovery eq handler */ + struct lnet_handle_eq ln_mt_eqh; + +}; + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h new file mode 100644 index 0000000000000..e2c19f2a4ed35 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h @@ -0,0 +1,80 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/socklnd.h + */ +#ifndef __LNET_LNET_SOCKLND_H__ +#define __LNET_LNET_SOCKLND_H__ + +#include +#include + +struct ksock_hello_msg { + __u32 kshm_magic; /* magic number of socklnd message */ + __u32 kshm_version; /* version of socklnd message */ + lnet_nid_t kshm_src_nid; /* sender's nid */ + lnet_nid_t kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* connection type */ + __u32 kshm_nips; /* # IP addrs */ + __u32 kshm_ips[0]; /* IP addrs */ +} WIRE_ATTR; + +struct ksock_lnet_msg { + struct lnet_hdr ksnm_hdr; /* lnet hdr */ + + /* + * ksnm_payload is removed because of winnt compiler's limitation: + * zero-sized array can only be placed at the tail of [nested] + * structure definitions. 
lnet payload will be stored just after + * the body of structure struct ksock_lnet_msg + */ +} WIRE_ATTR; + +struct ksock_msg { + __u32 ksm_type; /* type of socklnd message */ + __u32 ksm_csum; /* checksum if != 0 */ + __u64 ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */ + union { + struct ksock_lnet_msg lnetmsg; /* lnet message, it's empty if it's NOOP */ + } WIRE_ATTR ksm_u; +} WIRE_ATTR; + +#define KSOCK_MSG_NOOP 0xc0 /* ksm_u empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +/* We need to know this number to parse hello msg from ksocklnd in + * other LND (usocklnd, for example) */ +#define KSOCK_PROTO_V2 2 +#define KSOCK_PROTO_V3 3 + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h new file mode 100644 index 0000000000000..2672fe7ae103d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h @@ -0,0 +1,151 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __UAPI_LIBCFS_DEBUG_H__ +#define __UAPI_LIBCFS_DEBUG_H__ + +#include + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + /* time_t overflow in 2106 */ + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +#define S_UNDEFINED 0x00000001 +#define S_MDC 0x00000002 +#define S_MDS 0x00000004 +#define S_OSC 0x00000008 +#define S_OST 0x00000010 +#define S_CLASS 0x00000020 +#define S_LOG 0x00000040 +#define S_LLITE 0x00000080 +#define S_RPC 0x00000100 +#define S_MGMT 0x00000200 +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ +#define S_PINGER 0x00001000 +#define S_FILTER 0x00002000 +#define S_LIBCFS 0x00004000 +#define S_ECHO 0x00008000 +#define S_LDLM 0x00010000 +#define S_LOV 0x00020000 +#define S_LQUOTA 0x00040000 +#define S_OSD 0x00080000 +#define S_LFSCK 0x00100000 +#define S_SNAPSHOT 0x00200000 +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ + +#define LIBCFS_DEBUG_SUBSYS_NAMES { \ + "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ + "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", \ + "libcfs", "echo", "ldlm", "lov", "lquota", "osd", "lfsck", \ + "snapshot", "", "lmv", "", "sec", "gss", "", "mgc", "mgs", \ + "fid", "fld", NULL } + +/* Debugging masks (32 bits, non-overlapping) */ +#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ +#define D_INODE 0x00000002 +#define D_SUPER 0x00000004 +#define D_EXT2 0x00000008 /* anything from ext2_debug */ +#define D_MALLOC 0x00000010 /* print malloc, free information */ +#define D_CACHE 0x00000020 /* cache-related items */ +#define D_INFO 0x00000040 /* general information */ +#define D_IOCTL 0x00000080 /* ioctl related information */ +#define D_NETERROR 0x00000100 /* network errors */ +#define D_NET 0x00000200 /* network communications */ +#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ +#define D_BUFFS 0x00000800 +#define D_OTHER 0x00001000 +#define D_DENTRY 0x00002000 +#define D_NETTRACE 0x00004000 +#define D_PAGE 0x00008000 /* bulk page handling */ +#define D_DLMTRACE 0x00010000 +#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ +#define D_HA 0x00080000 /* recovery and failover */ +#define D_RPCTRACE 0x00100000 /* for distributed debugging */ +#define D_VFSTRACE 0x00200000 +#define D_READA 0x00400000 /* read-ahead */ +#define D_MMAP 0x00800000 +#define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 +#define D_QUOTA 0x04000000 +#define D_SEC 0x08000000 +#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ +#define D_HSM 0x20000000 +#define D_SNAPSHOT 0x40000000 /* snapshot */ +#define D_LAYOUT 0x80000000 + +#define LIBCFS_DEBUG_MASKS_NAMES { \ + "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ + "ioctl", "neterror", "net", "warning", "buffs", "other", \ + "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ + "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ + "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ + NULL } + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __UAPI_LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h new file mode 100644 index 0000000000000..cdac10f572408 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Low-level ioctl data structures. Kernel ioctl functions declared here, + * and user space functions are in libcfs/util/ioctl.h. 
+ * + */ + +#ifndef __UAPI_LIBCFS_IOCTL_H__ +#define __UAPI_LIBCFS_IOCTL_H__ + +#include +#include + +#define LIBCFS_IOCTL_VERSION 0x0001000a +#define LIBCFS_IOCTL_VERSION2 0x0001000b + +struct libcfs_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +/** max size to copy from userspace */ +#define LIBCFS_IOC_DATA_MAX (128 * 1024) + +struct libcfs_ioctl_data { + struct libcfs_ioctl_hdr ioc_hdr; + + __u64 ioc_nid; + __u64 ioc_u64[1]; + + __u32 ioc_flags; + __u32 ioc_count; + __u32 ioc_net; + __u32 ioc_u32[7]; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + + __u32 ioc_plen1; /* buffers in userspace */ + void __user *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + void __user *ioc_pbuf2; + + char ioc_bulk[0]; +}; + +struct libcfs_debug_ioctl_data { + struct libcfs_ioctl_hdr hdr; + unsigned int subs; + unsigned int debug; +}; + +/* 'f' ioctls are defined in lustre_ioctl.h and lustre_user.h except for: */ +#define LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) +#define IOCTL_LIBCFS_TYPE long + +#define IOC_LIBCFS_TYPE ('e') +#define IOC_LIBCFS_MIN_NR 30 +/* libcfs ioctls */ +/* IOC_LIBCFS_PANIC obsolete in 2.8.0, was _IOWR('e', 30, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, IOCTL_LIBCFS_TYPE) +/* IOC_LIBCFS_MEMHOG obsolete in 2.8.0, was _IOWR('e', 36, IOCTL_LIBCFS_TYPE) */ +/* lnet ioctls */ +#define IOC_LIBCFS_GET_NI _IOWR('e', 50, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, IOCTL_LIBCFS_TYPE) +/* IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LNET_FAULT _IOWR('e', 64, IOCTL_LIBCFS_TYPE) +/* lnd ioctls */ +#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DISCOVER _IOWR('e', 77, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) + + +/* + * DLC Specific IOCTL numbers. + * In order to maintain backward compatibility with any possible external + * tools which might be accessing the IOCTL numbers, a new group of IOCTL + * number have been allocated. 
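+ *
+ * Illustrative only (not part of this header): a userspace tool would
+ * issue one of these ioctls roughly as
+ *	struct lnet_ioctl_config_data data = { 0 };
+ *	data.cfg_hdr.ioc_len = sizeof(data);
+ *	data.cfg_hdr.ioc_version = LIBCFS_IOCTL_VERSION2;
+ *	rc = ioctl(fd, IOC_LIBCFS_GET_NET, &data);
+ * where 'fd' is an open handle on the LNet control device and the exact
+ * version value expected by the kernel is an assumption.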
+ */ +#define IOCTL_CONFIG_SIZE struct lnet_ioctl_config_data +#define IOC_LIBCFS_ADD_ROUTE _IOWR(IOC_LIBCFS_TYPE, 81, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_ROUTE _IOWR(IOC_LIBCFS_TYPE, 82, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_ROUTE _IOWR(IOC_LIBCFS_TYPE, 83, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_NET _IOWR(IOC_LIBCFS_TYPE, 84, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_NET _IOWR(IOC_LIBCFS_TYPE, 85, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_NET _IOWR(IOC_LIBCFS_TYPE, 86, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_CONFIG_RTR _IOWR(IOC_LIBCFS_TYPE, 87, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_BUF _IOWR(IOC_LIBCFS_TYPE, 88, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_BUF _IOWR(IOC_LIBCFS_TYPE, 89, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_INFO _IOWR(IOC_LIBCFS_TYPE, 90, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LNET_STATS _IOWR(IOC_LIBCFS_TYPE, 91, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 92, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 93, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 94, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 95, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 96, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_LIST _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_HEALHV _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_HSTATS _IOWR(IOC_LIBCFS_TYPE, 103, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_RECOVERY_QUEUE _IOWR(IOC_LIBCFS_TYPE, 104, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_MAX_NR 104 + +extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); + +#endif /* __UAPI_LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h new file mode 100644 index 0000000000000..f10cbc3309176 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h @@ -0,0 +1,294 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + * + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * Author: Amir Shehata + */ + +#ifndef __UAPI_LNET_DLC_H_ +#define __UAPI_LNET_DLC_H_ + +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. 
+ */ +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif + +#define MAX_NUM_SHOW_ENTRIES 32 +#define LNET_MAX_STR_LEN 128 +#define LNET_MAX_SHOW_NUM_CPT 128 +#define LNET_MAX_SHOW_NUM_NID 128 +#define LNET_UNDEFINED_HOPS ((__u32) -1) + +/* + * To allow for future enhancements to extend the tunables + * add a hdr to this structure, so that the version can be set + * and checked for backwards compatibility. Newer versions of LNet + * can still work with older versions of lnetctl. The restriction is + * that the structure can be added to and not removed from in order + * to not invalidate older lnetctl utilities. Moreover, the order of + * fields must remain the same, and new fields appended to the structure + * + * That said all existing LND tunables will be added in this structure + * to avoid future changes. + */ +struct lnet_ioctl_config_lnd_cmn_tunables { + __u32 lct_version; + __s32 lct_peer_timeout; + __s32 lct_peer_tx_credits; + __s32 lct_peer_rtr_credits; + __s32 lct_max_tx_credits; +}; + +struct lnet_ioctl_config_o2iblnd_tunables { + __u32 lnd_version; + __u32 lnd_peercredits_hiw; + __u32 lnd_map_on_demand; + __u32 lnd_concurrent_sends; + __u32 lnd_fmr_pool_size; + __u32 lnd_fmr_flush_trigger; + __u32 lnd_fmr_cache; + __u16 lnd_conns_per_peer; + __u16 lnd_ntx; +}; + +struct lnet_lnd_tunables { + union { + struct lnet_ioctl_config_o2iblnd_tunables lnd_o2ib; + } lnd_tun_u; +}; + +struct lnet_ioctl_config_lnd_tunables { + struct lnet_ioctl_config_lnd_cmn_tunables lt_cmn; + struct lnet_lnd_tunables lt_tun; +}; + +struct lnet_ioctl_net_config { + char ni_interfaces[LNET_INTERFACES_NUM][LNET_MAX_STR_LEN]; + __u32 ni_status; + __u32 ni_cpts[LNET_MAX_SHOW_NUM_CPT]; + char cfg_bulk[0]; +}; + +#define LNET_TINY_BUF_IDX 0 +#define LNET_SMALL_BUF_IDX 1 +#define LNET_LARGE_BUF_IDX 2 + +/* # different router buffer pools */ +#define LNET_NRBPOOLS (LNET_LARGE_BUF_IDX + 1) + +struct lnet_ioctl_pool_cfg { + struct { + __u32 pl_npages; + __u32 pl_nbuffers; + __u32 pl_credits; + __u32 pl_mincredits; + } pl_pools[LNET_NRBPOOLS]; + __u32 pl_routing; +}; + +struct lnet_ioctl_ping_data { + struct libcfs_ioctl_hdr ping_hdr; + + __u32 op_param; + __u32 ping_count; + __u32 ping_flags; + bool mr_info; + struct lnet_process_id ping_id; + struct lnet_process_id __user *ping_buf; +}; + +struct lnet_ioctl_config_data { + struct libcfs_ioctl_hdr cfg_hdr; + + __u32 cfg_net; + __u32 cfg_count; + __u64 cfg_nid; + __u32 cfg_ncpts; + + union { + struct { + __u32 rtr_hop; + __u32 rtr_priority; + __u32 rtr_flags; + } cfg_route; + struct { + char net_intf[LNET_MAX_STR_LEN]; + __s32 net_peer_timeout; + __s32 net_peer_tx_credits; + __s32 net_peer_rtr_credits; + __s32 net_max_tx_credits; + __u32 net_cksum_algo; + __u32 net_interface_count; + } cfg_net; + struct { + __u32 buf_enable; + __s32 buf_tiny; + __s32 buf_small; + __s32 buf_large; + } cfg_buffers; + } cfg_config_u; + + char cfg_bulk[0]; +}; + +struct lnet_ioctl_comm_count { + __u32 ico_get_count; + __u32 ico_put_count; + __u32 ico_reply_count; + __u32 ico_ack_count; + __u32 ico_hello_count; +}; + +struct lnet_ioctl_element_stats { + __u32 iel_send_count; + __u32 iel_recv_count; + __u32 iel_drop_count; +}; + +enum lnet_health_type { + LNET_HEALTH_TYPE_LOCAL_NI = 0, + LNET_HEALTH_TYPE_PEER_NI, +}; + +struct lnet_ioctl_local_ni_hstats { + struct libcfs_ioctl_hdr hlni_hdr; + lnet_nid_t hlni_nid; + __u32 hlni_local_interrupt; + __u32 hlni_local_dropped; + __u32 hlni_local_aborted; + __u32 hlni_local_no_route; + __u32 hlni_local_timeout; + __u32 
hlni_local_error; + __s32 hlni_health_value; +}; + +struct lnet_ioctl_peer_ni_hstats { + __u32 hlpni_remote_dropped; + __u32 hlpni_remote_timeout; + __u32 hlpni_remote_error; + __u32 hlpni_network_timeout; + __s32 hlpni_health_value; +}; + +struct lnet_ioctl_element_msg_stats { + struct libcfs_ioctl_hdr im_hdr; + __u32 im_idx; + struct lnet_ioctl_comm_count im_send_stats; + struct lnet_ioctl_comm_count im_recv_stats; + struct lnet_ioctl_comm_count im_drop_stats; +}; + +/* + * lnet_ioctl_config_ni + * This structure describes an NI configuration. There are multiple components + * when configuring an NI: Net, Interfaces, CPT list and LND tunables + * A network is passed as a string to the DLC and translated using + * libcfs_str2net() + * An interface is the name of the system configured interface + * (ex eth0, ib1) + * CPT is the list of CPTS LND tunables are passed in the lic_bulk area + */ +struct lnet_ioctl_config_ni { + struct libcfs_ioctl_hdr lic_cfg_hdr; + lnet_nid_t lic_nid; + char lic_ni_intf[LNET_INTERFACES_NUM][LNET_MAX_STR_LEN]; + char lic_legacy_ip2nets[LNET_MAX_STR_LEN]; + __u32 lic_cpts[LNET_MAX_SHOW_NUM_CPT]; + __u32 lic_ncpts; + __u32 lic_status; + __u32 lic_tcp_bonding; + __u32 lic_idx; + __s32 lic_dev_cpt; + char pad[4]; + char lic_bulk[0]; +}; + +struct lnet_peer_ni_credit_info { + char cr_aliveness[LNET_MAX_STR_LEN]; + __u32 cr_refcount; + __s32 cr_ni_peer_tx_credits; + __s32 cr_peer_tx_credits; + __s32 cr_peer_min_tx_credits; + __u32 cr_peer_tx_qnob; + __s32 cr_peer_rtr_credits; + __s32 cr_peer_min_rtr_credits; + __u32 cr_ncpt; +}; + +struct lnet_ioctl_peer { + struct libcfs_ioctl_hdr pr_hdr; + __u32 pr_count; + __u32 pr_pad; + lnet_nid_t pr_nid; + + union { + struct lnet_peer_ni_credit_info pr_peer_credits; + } pr_lnd_u; +}; + +struct lnet_ioctl_peer_cfg { + struct libcfs_ioctl_hdr prcfg_hdr; + lnet_nid_t prcfg_prim_nid; + lnet_nid_t prcfg_cfg_nid; + __u32 prcfg_count; + bool prcfg_mr; + __u32 prcfg_state; + __u32 prcfg_size; + void __user *prcfg_bulk; +}; + +struct lnet_ioctl_reset_health_cfg { + struct libcfs_ioctl_hdr rh_hdr; + enum lnet_health_type rh_type; + bool rh_all; + int rh_value; + lnet_nid_t rh_nid; +}; + +struct lnet_ioctl_recovery_list { + struct libcfs_ioctl_hdr rlst_hdr; + enum lnet_health_type rlst_type; + int rlst_num_nids; + lnet_nid_t rlst_nid_array[LNET_MAX_SHOW_NUM_NID]; +}; + +struct lnet_ioctl_set_value { + struct libcfs_ioctl_hdr sv_hdr; + __u32 sv_value; +}; + +struct lnet_ioctl_lnet_stats { + struct libcfs_ioctl_hdr st_hdr; + struct lnet_counters st_cntrs; +}; + +#endif /* _LNET_DLC_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h new file mode 100644 index 0000000000000..1f7828c8c9c15 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h @@ -0,0 +1,746 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __UAPI_LNET_TYPES_H__ +#define __UAPI_LNET_TYPES_H__ + +/** \addtogroup lnet + * @{ */ + +#include +/** \addtogroup lnet_addr + * @{ */ + +#define LNET_VERSION "0.7.0" + +/** Portal reserved for LNet's own use. + * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments. + */ +#define LNET_RESERVED_PORTAL 0 + +/** + * Address of an end-point in an LNet network. + * + * A node can have multiple end-points and hence multiple addresses. + * An LNet network can be a simple network (e.g. tcp0) or a network of + * LNet networks connected by LNet routers. Therefore an end-point address + * has two parts: network ID, and address within a network. + * + * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID. + */ +typedef __u64 lnet_nid_t; + +/** + * ID of a process in a node. Shortened as PID to distinguish from + * lnet_process_id, the global process ID. + */ +typedef __u32 lnet_pid_t; + +/** wildcard NID that matches any end-point address */ +#define LNET_NID_ANY ((lnet_nid_t) -1) +/** wildcard PID that matches any lnet_pid_t */ +#define LNET_PID_ANY ((lnet_pid_t) -1) + +#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ +#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ +#define LNET_PID_LUSTRE 12345 + +/* how an LNET NID encodes net:address */ +/** extract the address part of an lnet_nid_t */ + +static inline __u32 LNET_NIDADDR(lnet_nid_t nid) +{ + return nid & 0xffffffff; +} + +static inline __u32 LNET_NIDNET(lnet_nid_t nid) +{ + return (nid >> 32) & 0xffffffff; +} + +static inline lnet_nid_t LNET_MKNID(__u32 net, __u32 addr) +{ + return (((__u64)net) << 32) | addr; +} + +static inline __u32 LNET_NETNUM(__u32 net) +{ + return net & 0xffff; +} + +static inline __u32 LNET_NETTYP(__u32 net) +{ + return (net >> 16) & 0xffff; +} + +static inline __u32 LNET_MKNET(__u32 type, __u32 num) +{ + return (type << 16) | num; +} + +/** The lolnd NID (i.e. myself) */ +#define LNET_NID_LO_0 LNET_MKNID(LNET_MKNET(LOLND, 0), 0) + +#define WIRE_ATTR __attribute__((packed)) + +/* Packed version of struct lnet_process_id to transfer via network */ +struct lnet_process_id_packed { + lnet_nid_t nid; + lnet_pid_t pid; /* node id / process id */ +} WIRE_ATTR; + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +struct lnet_handle_wire { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} WIRE_ATTR; + +enum lnet_msg_type { + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +}; + +/* The variant fields of the portals message header are aligned on an 8 + * byte boundary in the message header. Note that all types used in these + * wire structs MUST be fixed size and the smaller types are placed at the + * end. 
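+ *
+ * As an illustration of the rule, grounded only in the structs that
+ * follow: struct lnet_put keeps its __u64 members (match_bits,
+ * hdr_data) ahead of its __u32 members (ptl_index, offset), and the
+ * union in struct lnet_hdr starts on an 8 byte boundary.  A purely
+ * hypothetical compile-time check of that boundary (not part of this
+ * interface) could look like:
+ *
+ *      BUILD_BUG_ON(offsetof(struct lnet_hdr, msg) & 7);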
*/ +struct lnet_ack { + struct lnet_handle_wire dst_wmd; + __u64 match_bits; + __u32 mlength; +} WIRE_ATTR; + +struct lnet_put { + struct lnet_handle_wire ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; +} WIRE_ATTR; + +struct lnet_get { + struct lnet_handle_wire return_wmd; + __u64 match_bits; + __u32 ptl_index; + __u32 src_offset; + __u32 sink_length; +} WIRE_ATTR; + +struct lnet_reply { + struct lnet_handle_wire dst_wmd; +} WIRE_ATTR; + +struct lnet_hello { + __u64 incarnation; + __u32 type; +} WIRE_ATTR; + +struct lnet_hdr { + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* enum lnet_msg_type */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union { + struct lnet_ack ack; + struct lnet_put put; + struct lnet_get get; + struct lnet_reply reply; + struct lnet_hello hello; + } msg; +} WIRE_ATTR; + +/* A HELLO message contains a magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * LNET_MSG_HELLO in the type field. All other common fields are zero + * (including payload_size; i.e. no payload). + * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID. These LNDs should + * exchange HELLO messages when a connection is first established. Individual + * LNDs can put whatever else they fancy in lnet_hdr::msg. + */ +struct lnet_magicversion { + __u32 magic; /* LNET_PROTO_TCP_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} WIRE_ATTR; + +/* PROTO MAGIC for LNDs */ +#define LNET_PROTO_IB_MAGIC 0x0be91b91 +#define LNET_PROTO_GNI_MAGIC 0xb00fbabe /* ask Kim */ +#define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 +#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond + * with a "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! 
*/ + +#define LNET_PROTO_TCP_VERSION_MAJOR 1 +#define LNET_PROTO_TCP_VERSION_MINOR 0 + +/* Acceptor connection request */ +struct lnet_acceptor_connreq { + __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ + __u32 acr_version; /* protocol version */ + __u64 acr_nid; /* target NID */ +} WIRE_ATTR; + +#define LNET_PROTO_ACCEPTOR_VERSION 1 + +struct lnet_counters_common { + __u32 lcc_msgs_alloc; + __u32 lcc_msgs_max; + __u32 lcc_errors; + __u32 lcc_send_count; + __u32 lcc_recv_count; + __u32 lcc_route_count; + __u32 lcc_drop_count; + __u64 lcc_send_length; + __u64 lcc_recv_length; + __u64 lcc_route_length; + __u64 lcc_drop_length; +} WIRE_ATTR; + +struct lnet_counters_health { + __u32 lch_rst_alloc; + __u32 lch_resend_count; + __u32 lch_response_timeout_count; + __u32 lch_local_interrupt_count; + __u32 lch_local_dropped_count; + __u32 lch_local_aborted_count; + __u32 lch_local_no_route_count; + __u32 lch_local_timeout_count; + __u32 lch_local_error_count; + __u32 lch_remote_dropped_count; + __u32 lch_remote_error_count; + __u32 lch_remote_timeout_count; + __u32 lch_network_timeout_count; +}; + +struct lnet_counters { + struct lnet_counters_common lct_common; + struct lnet_counters_health lct_health; +}; + +#define LNET_NI_STATUS_UP 0x15aac0de +#define LNET_NI_STATUS_DOWN 0xdeadface +#define LNET_NI_STATUS_INVALID 0x00000000 + +struct lnet_ni_status { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR; + +/* + * NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue + */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +#define LNET_PING_FEAT_MULTI_RAIL (1 << 3) /* Multi-Rail aware */ +#define LNET_PING_FEAT_DISCOVERY (1 << 4) /* Supports Discovery */ + +/* + * All ping feature bits fit to hit the wire. + * In lnet_assert_wire_constants() this is compared against its open-coded + * value, and in lnet_ping_target_update() it is used to verify that no + * unknown bits have been set. + * New feature bits can be added, just be aware that this does change the + * over-the-wire protocol. + */ +#define LNET_PING_FEAT_BITS (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS | \ + LNET_PING_FEAT_RTE_DISABLED | \ + LNET_PING_FEAT_MULTI_RAIL | \ + LNET_PING_FEAT_DISCOVERY) + +struct lnet_ping_info { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + struct lnet_ni_status pi_ni[0]; +} WIRE_ATTR; + +#define LNET_PING_INFO_SIZE(NNIDS) \ + offsetof(struct lnet_ping_info, pi_ni[NNIDS]) +#define LNET_PING_INFO_LONI(PINFO) ((PINFO)->pi_ni[0].ns_nid) +#define LNET_PING_INFO_SEQNO(PINFO) ((PINFO)->pi_ni[0].ns_status) + +/* + * This is a hard-coded limit on the number of interfaces supported by + * the interface bonding implemented by the ksocknal LND. It must be + * defined here because it is used in LNet data structures that are + * common to all LNDs. + */ +#define LNET_INTERFACES_NUM 16 + +/* The minimum number of interfaces per node supported by LNet. */ +#define LNET_INTERFACES_MIN 16 +/* The default - arbitrary - value of the lnet_max_interfaces tunable. */ +#define LNET_INTERFACES_MAX_DEFAULT 200 + +/** + * Objects maintained by the LNet are accessed through handles. 
Handle types + * have names of the form lnet_handle_xx, where xx is one of the two letter + * object type codes ('eq' for event queue, 'md' for memory descriptor, and + * 'me' for match entry). Each type of object is given a unique handle type + * to enhance type checking. + */ +#define LNET_WIRE_HANDLE_COOKIE_NONE (-1) + +struct lnet_handle_eq { + __u64 cookie; +}; + +/** + * Invalidate eq handle \a h. + */ +static inline void LNetInvalidateEQHandle(struct lnet_handle_eq *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Check whether eq handle \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetEQHandleIsInvalid(struct lnet_handle_eq h) +{ + return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); +} + +struct lnet_handle_md { + __u64 cookie; +}; + +/** + * Invalidate md handle \a h. + */ +static inline void LNetInvalidateMDHandle(struct lnet_handle_md *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Check whether eq handle \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetMDHandleIsInvalid(struct lnet_handle_md h) +{ + return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); +} + +struct lnet_handle_me { + __u64 cookie; +}; + +/** + * Global process ID. + */ +struct lnet_process_id { + /** node id */ + lnet_nid_t nid; + /** process id */ + lnet_pid_t pid; +}; +/** @} lnet_addr */ + +/** \addtogroup lnet_me + * @{ */ + +/** + * Specifies whether the match entry or memory descriptor should be unlinked + * automatically (LNET_UNLINK) or not (LNET_RETAIN). + */ +enum lnet_unlink { + LNET_RETAIN = 0, + LNET_UNLINK +}; + +/** + * Values of the type enum lnet_ins_pos are used to control where a new match + * entry is inserted. The value LNET_INS_BEFORE is used to insert the new + * entry before the current entry or before the head of the list. The value + * LNET_INS_AFTER is used to insert the new entry after the current entry + * or after the last item in the list. + */ +enum lnet_ins_pos { + /** insert ME before current position or head of the list */ + LNET_INS_BEFORE, + /** insert ME after current position or tail of the list */ + LNET_INS_AFTER, + /** attach ME at tail of local CPU partition ME list */ + LNET_INS_LOCAL +}; + +/** @} lnet_me */ + +/** \addtogroup lnet_md + * @{ */ + +/** + * Defines the visible parts of a memory descriptor. Values of this type + * are used to initialize memory descriptors. + */ +struct lnet_md { + /** + * Specify the memory region associated with the memory descriptor. + * If the options field has: + * - LNET_MD_KIOV bit set: The start field points to the starting + * address of an array of lnet_kiov_t and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based + * fragments that are not necessarily mapped in virtal memory. + * - LNET_MD_IOVEC bit set: The start field points to the starting + * address of an array of struct kvec and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The struct kvec is used to describe fragments + * that have virtual addresses. + * - Otherwise: The memory region is contiguous. The start field + * specifies the starting address for the memory region and the + * length field specifies its length. 
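+ *
+ *   A minimal illustrative sketch (buf, len and eqh are placeholders
+ *   the caller would declare; they are not part of this header): a
+ *   contiguous region that accepts both PUT and GET operations could
+ *   be described as
+ *
+ *      struct lnet_md md = {
+ *              .start     = buf,
+ *              .length    = len,
+ *              .threshold = LNET_MD_THRESH_INF,
+ *              .options   = LNET_MD_OP_PUT | LNET_MD_OP_GET,
+ *              .eq_handle = eqh,
+ *      };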
+ * + * When the memory region is fragmented, all fragments but the first + * one must start on page boundary, and all but the last must end on + * page boundary. + */ + void *start; + unsigned int length; + /** + * Specifies the maximum number of operations that can be performed + * on the memory descriptor. An operation is any action that could + * possibly generate an event. In the usual case, the threshold value + * is decremented for each operation on the MD. When the threshold + * drops to zero, the MD becomes inactive and does not respond to + * operations. A threshold value of LNET_MD_THRESH_INF indicates that + * there is no bound on the number of operations that may be applied + * to a MD. + */ + int threshold; + /** + * Specifies the largest incoming request that the memory descriptor + * should respond to. When the unused portion of a MD (length - + * local offset) falls below this value, the MD becomes inactive and + * does not respond to further operations. This value is only used + * if the LNET_MD_MAX_SIZE option is set. + */ + int max_size; + /** + * Specifies the behavior of the memory descriptor. A bitwise OR + * of the following values can be used: + * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD. + * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD. + * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory + * region is provided by the incoming request. By default, the + * offset is maintained locally. When maintained locally, the + * offset is incremented by the length of the request so that + * the next operation (PUT or GET) will access the next part of + * the memory region. Note that only one offset variable exists + * per memory descriptor. If both PUT and GET operations are + * performed on a memory descriptor, the offset is updated each time. + * - LNET_MD_TRUNCATE: The length provided in the incoming request can + * be reduced to match the memory available in the region (determined + * by subtracting the offset from the length of the memory region). + * By default, if the length in the incoming operation is greater + * than the amount of memory available, the operation is rejected. + * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for + * incoming PUT operations, even if requested. By default, + * acknowledgments are sent for PUT operations that request an + * acknowledgment. Acknowledgments are never sent for GET operations. + * The data sent in the REPLY serves as an implicit acknowledgment. + * - LNET_MD_KIOV: The start and length fields specify an array of + * lnet_kiov_t. + * - LNET_MD_IOVEC: The start and length fields specify an array of + * struct iovec. + * - LNET_MD_MAX_SIZE: The max_size field is valid. + * - LNET_MD_BULK_HANDLE: The bulk_handle field is valid. + * + * Note: + * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather + * capability for memory descriptors. They can't be both set. + * - When LNET_MD_MAX_SIZE is set, the total length of the memory + * region (i.e. sum of all fragment lengths) must not be less than + * \a max_size. + */ + unsigned int options; + /** + * A user-specified value that is associated with the memory + * descriptor. The value does not need to be a pointer, but must fit + * in the space used by a pointer. This value is recorded in events + * associated with operations on this MD. + */ + void *user_ptr; + /** + * A handle for the event queue used to log the operations performed on + * the memory region. If this argument is a NULL handle (i.e. 
nullified + * by LNetInvalidateHandle()), operations performed on this memory + * descriptor are not logged. + */ + struct lnet_handle_eq eq_handle; + /** + * The bulk MD handle which was registered to describe the buffers + * either to be used to transfer data to the peer or receive data + * from the peer. This allows LNet to properly determine the NUMA + * node on which the memory was allocated and use that to select the + * nearest local network interface. This value is only used + * if the LNET_MD_BULK_HANDLE option is set. + */ + struct lnet_handle_md bulk_handle; +}; + +/* Max Transfer Unit (minimum supported everywhere). + * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) + * these limits are system wide and not interface-local. */ +#define LNET_MTU_BITS 20 +#define LNET_MTU (1 << LNET_MTU_BITS) + +/** + * Options for the MD structure. See struct lnet_md::options. + */ +#define LNET_MD_OP_PUT (1 << 0) +/** See struct lnet_md::options. */ +#define LNET_MD_OP_GET (1 << 1) +/** See struct lnet_md::options. */ +#define LNET_MD_MANAGE_REMOTE (1 << 2) +/* unused (1 << 3) */ +/** See struct lnet_md::options. */ +#define LNET_MD_TRUNCATE (1 << 4) +/** See struct lnet_md::options. */ +#define LNET_MD_ACK_DISABLE (1 << 5) +/** See struct lnet_md::options. */ +#define LNET_MD_IOVEC (1 << 6) +/** See struct lnet_md::options. */ +#define LNET_MD_MAX_SIZE (1 << 7) +/** See struct lnet_md::options. */ +#define LNET_MD_KIOV (1 << 8) +/** See struct lnet_md::options. */ +#define LNET_MD_BULK_HANDLE (1 << 9) + +/* For compatibility with Cray Portals */ +#define LNET_MD_PHYS 0 + +/** Infinite threshold on MD operations. See struct lnet_md::threshold */ +#define LNET_MD_THRESH_INF (-1) + +/** + * A page-based fragment of a MD. + */ +typedef struct { + /** Pointer to the page where the fragment resides */ + struct page *kiov_page; + /** Length in bytes of the fragment */ + unsigned int kiov_len; + /** + * Starting offset of the fragment within the page. Note that the + * end of the fragment must not pass the end of the page; i.e., + * kiov_len + kiov_offset <= PAGE_SIZE. + */ + unsigned int kiov_offset; +} lnet_kiov_t; +/** @} lnet_md */ + +/** \addtogroup lnet_eq + * @{ */ + +/** + * Six types of events can be logged in an event queue. + */ +enum lnet_event_kind { + /** An incoming GET operation has completed on the MD. */ + LNET_EVENT_GET = 1, + /** + * An incoming PUT operation has completed on the MD. The + * underlying layers will not alter the memory (on behalf of this + * operation) once this event has been logged. + */ + LNET_EVENT_PUT, + /** + * A REPLY operation has completed. This event is logged after the + * data (if any) from the REPLY has been written into the MD. + */ + LNET_EVENT_REPLY, + /** An acknowledgment has been received. */ + LNET_EVENT_ACK, + /** + * An outgoing send (PUT or GET) operation has completed. This event + * is logged after the entire buffer has been sent and it is safe for + * the caller to reuse the buffer. + * + * Note: + * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can + * happen even when the message has not yet been put out on wire. + * - It's unsafe to assume that in an outgoing GET operation + * the LNET_EVENT_SEND event would happen before the + * LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and + * LNET_EVENT_ACK events in an outgoing PUT operation. + */ + LNET_EVENT_SEND, + /** + * A MD has been unlinked. Note that LNetMDUnlink() does not + * necessarily trigger an LNET_EVENT_UNLINK event. 
+ * \see LNetMDUnlink + */ + LNET_EVENT_UNLINK, +}; + +#define LNET_SEQ_GT(a, b) (((signed long)((a) - (b))) > 0) + +/** + * Information about an event on a MD. + */ +struct lnet_event { + /** The identifier (nid, pid) of the target. */ + struct lnet_process_id target; + /** The identifier (nid, pid) of the initiator. */ + struct lnet_process_id initiator; + /** The source NID on the initiator. */ + struct lnet_process_id source; + /** + * The NID of the immediate sender. If the request has been forwarded + * by routers, this is the NID of the last hop; otherwise it's the + * same as the source. + */ + lnet_nid_t sender; + /** Indicates the type of the event. */ + enum lnet_event_kind type; + /** The portal table index specified in the request */ + unsigned int pt_index; + /** A copy of the match bits specified in the request. */ + __u64 match_bits; + /** The length (in bytes) specified in the request. */ + unsigned int rlength; + /** + * The length (in bytes) of the data that was manipulated by the + * operation. For truncated operations, the manipulated length will be + * the number of bytes specified by the MD (possibly with an offset, + * see struct lnet_md). For all other operations, the manipulated length + * will be the length of the requested operation, i.e. rlength. + */ + unsigned int mlength; + /** + * The handle to the MD associated with the event. The handle may be + * invalid if the MD has been unlinked. + */ + struct lnet_handle_md md_handle; + /** + * A snapshot of the state of the MD immediately after the event has + * been processed. In particular, the threshold field in md will + * reflect the value of the threshold after the operation occurred. + */ + struct lnet_md md; + /** + * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. + * \see LNetPut + */ + __u64 hdr_data; + /** + * The message type, to ensure a handler for LNET_EVENT_SEND can + * distinguish between LNET_MSG_GET and LNET_MSG_PUT. + */ + __u32 msg_type; + /** + * Indicates the completion status of the operation. It's 0 for + * successful operations, otherwise it's an error code. + */ + int status; + /** + * Indicates whether the MD has been unlinked. Note that: + * - An event with unlinked set is the last event on the MD. + * - This field is also set for an explicit LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + int unlinked; + /** + * The displacement (in bytes) into the memory region that the + * operation used. The offset can be determined by the operation for + * a remote managed MD or by the local MD. + * \see struct lnet_md::options + */ + unsigned int offset; + /** + * The sequence number for this event. Sequence numbers are unique + * to each event. + */ + volatile unsigned long sequence; +}; + +/** + * Event queue handler function type. + * + * The EQ handler runs for each event that is deposited into the EQ. The + * handler is supplied with a pointer to the event that triggered the + * handler invocation. + * + * The handler must not block, must be reentrant, and must not call any LNet + * API functions. It should return as quickly as possible. + */ +typedef void (*lnet_eq_handler_t)(struct lnet_event *event); +#define LNET_EQ_HANDLER_NONE NULL +/** @} lnet_eq */ + +/** \addtogroup lnet_data + * @{ */ + +/** + * Specify whether an acknowledgment should be sent by target when the PUT + * operation completes (i.e., when the data has been written to a MD of the + * target process). 
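+ *
+ * As an illustration (treat the exact LNetPut() argument list as an
+ * assumption; its prototype lives in the kernel LNet API headers, not
+ * in this file), a sender that does not want an acknowledgment passes
+ * LNET_NOACK_REQ as the ack argument:
+ *
+ *      rc = LNetPut(LNET_NID_ANY, mdh, LNET_NOACK_REQ, target,
+ *                   portal, match_bits, 0, 0);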
+ * + * \see struct lnet_md::options for the discussion on LNET_MD_ACK_DISABLE + * by which acknowledgments can be disabled for a MD. + */ +enum lnet_ack_req { + /** Request an acknowledgment */ + LNET_ACK_REQ, + /** Request that no acknowledgment should be generated. */ + LNET_NOACK_REQ +}; +/** @} lnet_data */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h new file mode 100644 index 0000000000000..cb4f153e377d1 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h @@ -0,0 +1,158 @@ +/* + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * header for lnet ioctl + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +#ifndef __UAPI_LNETCTL_H_ +#define __UAPI_LNETCTL_H_ + +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif + +#include + +/** \addtogroup lnet_fault_simulation + * @{ */ + +enum { + LNET_CTL_DROP_ADD, + LNET_CTL_DROP_DEL, + LNET_CTL_DROP_RESET, + LNET_CTL_DROP_LIST, + LNET_CTL_DELAY_ADD, + LNET_CTL_DELAY_DEL, + LNET_CTL_DELAY_RESET, + LNET_CTL_DELAY_LIST, +}; + +#define LNET_ACK_BIT (1 << 0) +#define LNET_PUT_BIT (1 << 1) +#define LNET_GET_BIT (1 << 2) +#define LNET_REPLY_BIT (1 << 3) + +#define HSTATUS_END 11 +#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1) +#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2) +#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3) +#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4) +#define HSTATUS_LOCAL_ERROR_BIT (1 << 5) +#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6) +#define HSTATUS_REMOTE_ERROR_BIT (1 << 7) +#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8) +#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9) +#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10) +#define HSTATUS_RANDOM 0xffffffff + +/** ioctl parameter for LNet fault simulation */ +struct lnet_fault_attr { + /** + * source NID of drop rule + * LNET_NID_ANY is wildcard for all sources + * 255.255.255.255@net is wildcard for all addresses from @net + */ + lnet_nid_t fa_src; + /** destination NID of drop rule, see \a dr_src for details */ + lnet_nid_t fa_dst; + /** + * Portal mask to drop, -1 means all portals, for example: + * fa_ptl_mask = (1 << _LDLM_CB_REQUEST_PORTAL ) | + * (1 << LDLM_CANCEL_REQUEST_PORTAL) + * + * If it is non-zero then only PUT and GET will be filtered, otherwise + * there is no portal filter, all matched messages will be checked. + */ + __u64 fa_ptl_mask; + /** + * message types to drop, for example: + * dra_type = LNET_DROP_ACK_BIT | LNET_DROP_PUT_BIT + * + * If it is non-zero then only specified message types are filtered, + * otherwise all message types will be checked. 
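+ *
+ * The example above uses the older LNET_DROP_* spelling; with the
+ * message-type bits actually defined earlier in this header (attr
+ * being a hypothetical struct lnet_fault_attr instance) it would
+ * read:
+ *
+ *      attr.fa_msg_mask = LNET_PUT_BIT | LNET_GET_BIT;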
+ */ + __u32 fa_msg_mask; + union { + /** message drop simulation */ + struct { + /** drop rate of this rule */ + __u32 da_rate; + /** + * time interval of message drop, it is exclusive + * with da_rate + */ + __u32 da_interval; + /** error type mask */ + __u32 da_health_error_mask; + /** randomize error generation */ + bool da_random; + } drop; + /** message latency simulation */ + struct { + __u32 la_rate; + /** + * time interval of message delay, it is exclusive + * with la_rate + */ + __u32 la_interval; + /** latency to delay */ + __u32 la_latency; + } delay; + __u64 space[8]; + } u; + +}; + +/** fault simluation stats */ +struct lnet_fault_stat { + /** total # matched messages */ + __u64 fs_count; + /** # dropped LNET_MSG_PUT by this rule */ + __u64 fs_put; + /** # dropped LNET_MSG_ACK by this rule */ + __u64 fs_ack; + /** # dropped LNET_MSG_GET by this rule */ + __u64 fs_get; + /** # dropped LNET_MSG_REPLY by this rule */ + __u64 fs_reply; + union { + struct { + /** total # dropped messages */ + __u64 ds_dropped; + } drop; + struct { + /** total # delayed messages */ + __u64 ls_delayed; + } delay; + __u64 space[8]; + } u; +}; + +/** @} lnet_fault_simulation */ + +#define LNET_DEV_ID 0 +#define LNET_DEV_PATH "/dev/lnet" + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h new file mode 100644 index 0000000000000..ca871cac02b7b --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h @@ -0,0 +1,521 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * Author: Liang Zhen + */ + +#ifndef __UAPI_LNET_ST_H__ +#define __UAPI_LNET_ST_H__ + +#include + +#define LST_FEAT_NONE (0) +#define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ + +#define LST_FEATS_EMPTY (LST_FEAT_NONE) +#define LST_FEATS_MASK (LST_FEAT_NONE | LST_FEAT_BULK_LEN) + +#define LST_NAME_SIZE 32 /* max name buffer length */ + +#define LSTIO_DEBUG 0xC00 /* debug */ +#define LSTIO_SESSION_NEW 0xC01 /* create session */ +#define LSTIO_SESSION_END 0xC02 /* end session */ +#define LSTIO_SESSION_INFO 0xC03 /* query session */ +#define LSTIO_GROUP_ADD 0xC10 /* add group */ +#define LSTIO_GROUP_LIST 0xC11 /* list all groups in session */ +#define LSTIO_GROUP_INFO 0xC12 /* query defailt infomation of specified group */ +#define LSTIO_GROUP_DEL 0xC13 /* delete group */ +#define LSTIO_NODES_ADD 0xC14 /* add nodes to specified group */ +#define LSTIO_GROUP_UPDATE 0xC15 /* update group */ +#define LSTIO_BATCH_ADD 0xC20 /* add batch */ +#define LSTIO_BATCH_START 0xC21 /* start batch */ +#define LSTIO_BATCH_STOP 0xC22 /* stop batch */ +#define LSTIO_BATCH_DEL 0xC23 /* delete batch */ +#define LSTIO_BATCH_LIST 0xC24 /* show all batches in the session */ +#define LSTIO_BATCH_INFO 0xC25 /* show defail of specified batch */ +#define LSTIO_TEST_ADD 0xC26 /* add test (to batch) */ +#define LSTIO_BATCH_QUERY 0xC27 /* query batch status */ +#define LSTIO_STAT_QUERY 0xC30 /* get stats */ + +struct lst_sid { + lnet_nid_t ses_nid; /* nid of console node */ + __s64 ses_stamp; /* time stamp in milliseconds */ +}; /*** session id */ + +extern struct lst_sid LST_INVALID_SID; + +struct lst_bid { + __u64 bat_id; /* unique id in session */ +}; + +/* Status of test node */ +#define LST_NODE_ACTIVE 0x1 /* node in this session */ +#define LST_NODE_BUSY 0x2 /* node is taken by other session */ +#define LST_NODE_DOWN 0x4 /* node is down */ +#define LST_NODE_UNKNOWN 0x8 /* node not in session */ + +struct lstcon_node_ent { + struct lnet_process_id nde_id; /* id of node */ + int nde_state; /* state of node */ +}; /*** node entry, for list_group command */ + +struct lstcon_ndlist_ent { + int nle_nnode; /* # of nodes */ + int nle_nactive; /* # of active nodes */ + int nle_nbusy; /* # of busy nodes */ + int nle_ndown; /* # of down nodes */ + int nle_nunknown; /* # of unknown nodes */ +}; /*** node_list entry, for list_batch command */ + +struct lstcon_test_ent { + int tse_type; /* test type */ + int tse_loop; /* loop count */ + int tse_concur; /* concurrency of test */ +}; /*** test summary entry, for list_batch command */ + +struct lstcon_batch_ent { + int bae_state; /* batch status */ + int bae_timeout; /* batch timeout */ + int bae_ntest; /* # of tests in the batch */ +}; /*** batch summary entry, for list_batch command */ + +struct lstcon_test_batch_ent { + struct lstcon_ndlist_ent tbe_cli_nle; /* client (group) node_list entry */ + struct lstcon_ndlist_ent tbe_srv_nle; /* server (group) node_list entry */ + union { + struct lstcon_test_ent tbe_test; /* test entry */ + struct lstcon_batch_ent tbe_batch; /* batch entry */ + } u; +}; /*** test/batch verbose information entry, + *** for list_batch command */ + +struct lstcon_rpc_ent { + struct list_head rpe_link; /* link chain */ + struct lnet_process_id rpe_peer; /* peer's id */ + /* This has not been used since Lustre 2.2 so its safe to use. 
+ * Update to allow future use of timespec64 + */ + struct { + __s64 tv_sec; + __s64 tv_nsec; + } rpe_stamp; /* time stamp of RPC */ + int rpe_state; /* peer's state */ + int rpe_rpc_errno; /* RPC errno */ + + struct lst_sid rpe_sid; /* peer's session id */ + int rpe_fwk_errno; /* framework errno */ + int rpe_priv[4]; /* private data */ + char rpe_payload[0]; /* private reply payload */ +}; + +struct lstcon_trans_stat { + int trs_rpc_stat[4]; /* RPCs stat (0: total, 1: failed, 2: finished, 4: reserved */ + int trs_rpc_errno; /* RPC errno */ + int trs_fwk_stat[8]; /* framework stat */ + int trs_fwk_errno; /* errno of the first remote error */ + void *trs_fwk_private; /* private framework stat */ +}; + +static inline int +lstcon_rpc_stat_total(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0]; +} + +static inline int +lstcon_rpc_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1]; +} + +static inline int +lstcon_rpc_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2]; +} + +static inline int +lstcon_sesop_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesop_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_active(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesqry_stat_busy(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_unknown(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_tsbop_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbop_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_idle(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbqry_stat_run(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_statqry_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_statqry_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +/* create a session */ +struct lstio_session_new_args { + int lstio_ses_key; /* IN: local key */ + int lstio_ses_timeout; /* IN: session timeout */ + int lstio_ses_force; /* IN: force create ? 
*/ + /** IN: session features */ + unsigned lstio_ses_feats; + struct lst_sid __user *lstio_ses_idp; /* OUT: session id */ + int lstio_ses_nmlen; /* IN: name length */ + char __user *lstio_ses_namep; /* IN: session name */ +}; + +/* query current session */ +struct lstio_session_info_args { + struct lst_sid __user *lstio_ses_idp; /* OUT: session id */ + int __user *lstio_ses_keyp; /* OUT: local key */ + /** OUT: session features */ + unsigned __user *lstio_ses_featp; + struct lstcon_ndlist_ent __user *lstio_ses_ndinfo; /* OUT: */ + int lstio_ses_nmlen; /* IN: name length */ + char __user *lstio_ses_namep; /* OUT: session name */ +}; + +/* delete a session */ +struct lstio_session_end_args { + int lstio_ses_key; /* IN: session key */ +}; + +#define LST_OPC_SESSION 1 +#define LST_OPC_GROUP 2 +#define LST_OPC_NODES 3 +#define LST_OPC_BATCHCLI 4 +#define LST_OPC_BATCHSRV 5 + +struct lstio_debug_args { + int lstio_dbg_key; /* IN: session key */ + int lstio_dbg_type; /* IN: debug sessin|batch|group|nodes list */ + int lstio_dbg_flags; /* IN: reserved debug flags */ + int lstio_dbg_timeout; /* IN: timeout of debug */ + + int lstio_dbg_nmlen; /* IN: len of name */ + char __user *lstio_dbg_namep; /* IN: name of group|batch */ + int lstio_dbg_count; /* IN: # of test nodes to debug */ + struct lnet_process_id __user *lstio_dbg_idsp; /* IN: id of test nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_dbg_resultp; +}; + +struct lstio_group_add_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ +}; + +struct lstio_group_del_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ +}; + +#define LST_GROUP_CLEAN 1 /* remove inactive nodes in the group */ +#define LST_GROUP_REFRESH 2 /* refresh inactive nodes in the group */ +#define LST_GROUP_RMND 3 /* delete nodes from the group */ + +struct lstio_group_update_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_opc; /* IN: OPC */ + int lstio_grp_args; /* IN: arguments */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes id */ + struct lnet_process_id __user *lstio_grp_idsp; /* IN: array of nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_grp_resultp; +}; + +struct lstio_group_nodes_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes */ + /** OUT: session features */ + unsigned __user *lstio_grp_featp; + struct lnet_process_id __user *lstio_grp_idsp; /* IN: nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_grp_resultp; +}; + +struct lstio_group_list_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_idx; /* IN: group idx */ + int lstio_grp_nmlen; /* IN: name len */ + char __user *lstio_grp_namep; /* OUT: name */ +}; + +struct lstio_group_info_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name len */ + char __user *lstio_grp_namep; /* IN: name */ + struct lstcon_ndlist_ent __user *lstio_grp_entp;/* OUT: description of group */ + + int __user *lstio_grp_idxp; /* IN/OUT: node index */ + int __user *lstio_grp_ndentp; /* IN/OUT: # of nodent */ + struct lstcon_node_ent __user *lstio_grp_dentsp;/* 
OUT: nodent array */ +}; + +#define LST_DEFAULT_BATCH "batch" /* default batch name */ + +struct lstio_batch_add_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_del_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_run_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: timeout for the batch */ + int lstio_bat_timeout; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_stop_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: abort unfinished test RPC */ + int lstio_bat_force; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_query_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: test index */ + int lstio_bat_testidx; + /* IN: is test client? */ + int lstio_bat_client; + /* IN: timeout for waiting */ + int lstio_bat_timeout; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_list_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_idx; /* IN: index */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_info_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: name */ + int lstio_bat_server; /* IN: query server or not */ + int lstio_bat_testidx; /* IN: test index */ + struct lstcon_test_batch_ent __user *lstio_bat_entp;/* OUT: batch ent */ + + int __user *lstio_bat_idxp; /* IN/OUT: index of node */ + int __user *lstio_bat_ndentp; /* IN/OUT: # of nodent */ + struct lstcon_node_ent __user *lstio_bat_dentsp;/* array of nodent */ +}; + +/* add stat in session */ +struct lstio_stat_args { + /* IN: session key */ + int lstio_sta_key; + /* IN: timeout for stat requst */ + int lstio_sta_timeout; + /* IN: group name length */ + int lstio_sta_nmlen; + /* IN: group name */ + char __user *lstio_sta_namep; + /* IN: # of pid */ + int lstio_sta_count; + /* IN: pid */ + struct lnet_process_id __user *lstio_sta_idsp; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_sta_resultp; +}; + +enum lst_test_type { + LST_TEST_BULK = 1, + LST_TEST_PING = 2 +}; + +/* create a test in a batch */ +#define LST_MAX_CONCUR 1024 /* Max concurrency of test */ + +struct lstio_test_args { + int lstio_tes_key; /* IN: session key */ + int lstio_tes_bat_nmlen; /* IN: batch name len */ + char __user *lstio_tes_bat_name; /* IN: batch name */ + int lstio_tes_type; /* IN: test type */ + int lstio_tes_oneside; /* IN: one sided test */ + int lstio_tes_loop; /* IN: loop count */ + int lstio_tes_concur; /* IN: concurrency */ + + int lstio_tes_dist; /* IN: node distribution in destination groups */ + int lstio_tes_span; /* IN: node span in destination groups */ + int lstio_tes_sgrp_nmlen; /* IN: source group name length */ + char __user *lstio_tes_sgrp_name; /* IN: group name */ + int 
lstio_tes_dgrp_nmlen; /* IN: destination group name length */ + char __user *lstio_tes_dgrp_name; /* IN: group name */ + + /* IN: param buffer len */ + int lstio_tes_param_len; + /* IN: parameter for specified test: + lstio_bulk_param_t, + lstio_ping_param_t, + ... more */ + void __user *lstio_tes_param; + /* OUT: private returned value */ + int __user *lstio_tes_retp; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_tes_resultp; +}; + +enum lst_brw_type { + LST_BRW_READ = 1, + LST_BRW_WRITE = 2 +}; + +enum lst_brw_flags { + LST_BRW_CHECK_NONE = 1, + LST_BRW_CHECK_SIMPLE = 2, + LST_BRW_CHECK_FULL = 3 +}; + +struct lst_test_bulk_param { + int blk_opc; /* bulk operation code */ + int blk_size; /* size (bytes) */ + int blk_time; /* time of running the test*/ + int blk_flags; /* reserved flags */ + int blk_cli_off; /* bulk offset on client */ + int blk_srv_off; /* reserved: bulk offset on server */ +}; + +struct lst_test_ping_param { + int png_size; /* size of ping message */ + int png_time; /* time */ + int png_loop; /* loop */ + int png_flags; /* reserved flags */ +}; + +struct srpc_counters { + __u32 errors; + __u32 rpcs_sent; + __u32 rpcs_rcvd; + __u32 rpcs_dropped; + __u32 rpcs_expired; + __u64 bulk_get; + __u64 bulk_put; +} WIRE_ATTR; + +struct sfw_counters { + /** milliseconds since current session started */ + __u32 running_ms; + __u32 active_batches; + __u32 zombie_sessions; + __u32 brw_errors; + __u32 ping_errors; +} WIRE_ATTR; + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h new file mode 100644 index 0000000000000..c41b9158ecd7d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h @@ -0,0 +1,122 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2017, Intel Corporation. + */ +#ifndef _LNET_NIDSTRINGS_H +#define _LNET_NIDSTRINGS_H + +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif + +/** + * Lustre Network Driver types. + */ +enum { + /* Only add to these values (i.e. don't ever change or redefine them): + * network addresses depend on them... 
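+ *
+ * As an illustrative example (the "tcp"/"o2ib" name mapping is
+ * implemented by the nid string helpers declared below, not shown
+ * here): with LNET_MKNET() from lnet-types.h, the "tcp0" network is
+ * encoded as LNET_MKNET(SOCKLND, 0) and "o2ib1" as
+ * LNET_MKNET(O2IBLND, 1), so renumbering an entry here would silently
+ * change every NID on that network type.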
*/ + /*QSWLND = 1, removed v2_7_50 */ + SOCKLND = 2, + /*GMLND = 3, removed v2_0_0-rc1a-16-gc660aac */ + /*PTLLND = 4, removed v2_7_50 */ + O2IBLND = 5, + /*CIBLND = 6, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*OPENIBLND = 7, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*IIBLND = 8, removed v2_0_0-rc1a-175-gd2b8a0e */ + LOLND = 9, + /*RALND = 10, removed v2_7_50_0-34-g8be9e41 */ + /*VIBLND = 11, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*MXLND = 12, removed v2_7_50_0-34-g8be9e41 */ + GNILND = 13, + GNIIPLND = 14, + PTL4LND = 15, +}; + +struct list_head; + +#define LNET_NIDSTR_COUNT 1024 /* # of nidstrings */ +#define LNET_NIDSTR_SIZE 32 /* size of each one (see below for usage) */ + +/* support decl needed by both kernel and user space */ +char *libcfs_next_nidstring(void); +int libcfs_isknown_lnd(__u32 lnd); +char *libcfs_lnd2modname(__u32 lnd); +char *libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size); +static inline char *libcfs_lnd2str(__u32 lnd) +{ + return libcfs_lnd2str_r(lnd, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +int libcfs_str2lnd(const char *str); +char *libcfs_net2str_r(__u32 net, char *buf, size_t buf_size); +static inline char *libcfs_net2str(__u32 net) +{ + return libcfs_net2str_r(net, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +char *libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size); +static inline char *libcfs_nid2str(lnet_nid_t nid) +{ + return libcfs_nid2str_r(nid, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +__u32 libcfs_str2net(const char *str); +lnet_nid_t libcfs_str2nid(const char *str); +int libcfs_str2anynid(lnet_nid_t *nid, const char *str); +char *libcfs_id2str(struct lnet_process_id id); +void cfs_free_nidlist(struct list_head *list); +int cfs_parse_nidlist(char *str, int len, struct list_head *list); +int cfs_print_nidlist(char *buffer, int count, struct list_head *list); +int cfs_match_nid(lnet_nid_t nid, struct list_head *list); + +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, + char *max_nid, size_t nidstr_length); + +struct netstrfns { + __u32 nf_type; + char *nf_name; + char *nf_modname; + void (*nf_addr2str)(__u32 addr, char *str, size_t size); + int (*nf_str2addr)(const char *str, int nob, __u32 *addr); + int (*nf_parse_addrlist)(char *str, int len, + struct list_head *list); + int (*nf_print_addrlist)(char *buffer, int count, + struct list_head *list); + int (*nf_match_addr)(__u32 addr, struct list_head *list); + int (*nf_min_max)(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid); +}; + +#endif /* _LNET_NIDSTRINGS_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h new file mode 100644 index 0000000000000..6453e053fa99d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * #defines shared between socknal implementation and utilities + */ +#ifndef __UAPI_LNET_SOCKLND_H__ +#define __UAPI_LNET_SOCKLND_H__ + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN + +#endif diff --git a/drivers/staging/lustrefsx/lnet/klnds/Makefile b/drivers/staging/lustrefsx/lnet/klnds/Makefile new file mode 100644 index 0000000000000..cd375ca2cc67f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/Makefile @@ -0,0 +1,2 @@ +obj-y += o2iblnd/ +obj-y += socklnd/ diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile new file mode 100644 index 0000000000000..5ce6dc99ffe1a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_LNET_XPRT_IB) += ko2iblnd.o + +ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c new file mode 100644 index 0000000000000..68b83585dc300 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -0,0 +1,3424 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd.c + * + * Author: Eric Barton + */ + +#include +#include + +#include "o2iblnd.h" + +static struct lnet_lnd the_o2iblnd; + +struct kib_data kiblnd_data; + +static __u32 +kiblnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 
1 : sum; +} + +static char * +kiblnd_msgtype2str(int type) +{ + switch (type) { + case IBLND_MSG_CONNREQ: + return "CONNREQ"; + + case IBLND_MSG_CONNACK: + return "CONNACK"; + + case IBLND_MSG_NOOP: + return "NOOP"; + + case IBLND_MSG_IMMEDIATE: + return "IMMEDIATE"; + + case IBLND_MSG_PUT_REQ: + return "PUT_REQ"; + + case IBLND_MSG_PUT_NAK: + return "PUT_NAK"; + + case IBLND_MSG_PUT_ACK: + return "PUT_ACK"; + + case IBLND_MSG_PUT_DONE: + return "PUT_DONE"; + + case IBLND_MSG_GET_REQ: + return "GET_REQ"; + + case IBLND_MSG_GET_DONE: + return "GET_DONE"; + + default: + return "???"; + } +} + +static int +kiblnd_msgtype2size(int type) +{ + const int hdr_size = offsetof(struct kib_msg, ibm_u); + + switch (type) { + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + return hdr_size + sizeof(struct kib_connparams); + + case IBLND_MSG_NOOP: + return hdr_size; + + case IBLND_MSG_IMMEDIATE: + return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); + + case IBLND_MSG_PUT_REQ: + return hdr_size + sizeof(struct kib_putreq_msg); + + case IBLND_MSG_PUT_ACK: + return hdr_size + sizeof(struct kib_putack_msg); + + case IBLND_MSG_GET_REQ: + return hdr_size + sizeof(struct kib_get_msg); + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + return hdr_size + sizeof(struct kib_completion_msg); + default: + return -1; + } +} + +static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) +{ + struct kib_rdma_desc *rd; + int nob; + int n; + int i; + + LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ || + msg->ibm_type == IBLND_MSG_PUT_ACK); + + rd = msg->ibm_type == IBLND_MSG_GET_REQ ? + &msg->ibm_u.get.ibgm_rd : + &msg->ibm_u.putack.ibpam_rd; + + if (flip) { + __swab32s(&rd->rd_key); + __swab32s(&rd->rd_nfrags); + } + + n = rd->rd_nfrags; + + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return 1; + } + + nob = offsetof(struct kib_msg, ibm_u) + + kiblnd_rd_msg_size(rd, msg->ibm_type, n); + + if (msg->ibm_nob < nob) { + CERROR("Short %s: %d(%d)\n", + kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); + return 1; + } + + if (!flip) + return 0; + + for (i = 0; i < n; i++) { + __swab32s(&rd->rd_frags[i].rf_nob); + __swab64s(&rd->rd_frags[i].rf_addr); + } + + return 0; +} + +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) +{ + struct kib_net *net = ni->ni_data; + + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. */ + msg->ibm_magic = IBLND_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = ni->ni_nid; + msg->ibm_srcstamp = net->ibn_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + + if (*kiblnd_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); + } +} + +int kiblnd_unpack_msg(struct kib_msg *msg, int nob) +{ + const int hdr_size = offsetof(struct kib_msg, ibm_u); + __u32 msg_cksum; + __u16 version; + int msg_nob; + int flip; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBLND_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } + + version = flip ? 
__swab16(msg->ibm_version) : msg->ibm_version; + if (version != IBLND_MSG_VERSION && + version != IBLND_MSG_VERSION_1) { + CERROR("Bad version: %x\n", version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kiblnd_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer_ni endianness */ + msg->ibm_version = version; + CLASSERT (sizeof(msg->ibm_type) == 1); + CLASSERT (sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { + CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), + msg_nob, kiblnd_msgtype2size(msg->ibm_type)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBLND_MSG_NOOP: + case IBLND_MSG_IMMEDIATE: + case IBLND_MSG_PUT_REQ: + break; + + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_GET_REQ: + if (kiblnd_unpack_rd(msg, flip)) + return -EPROTO; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + if (flip) { + __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + } + break; + } + return 0; +} + +int +kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, + lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct kib_net *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid, ni); + unsigned long flags; + + LASSERT(net != NULL); + LASSERT(nid != LNET_NID_ANY); + + LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni)); + if (peer_ni == NULL) { + CERROR("Cannot allocate peer_ni\n"); + return -ENOMEM; + } + + peer_ni->ibp_ni = ni; + peer_ni->ibp_nid = nid; + peer_ni->ibp_error = 0; + peer_ni->ibp_last_alive = 0; + peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS; + peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits; + atomic_set(&peer_ni->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD(&peer_ni->ibp_list); /* not in the peer_ni table yet */ + INIT_LIST_HEAD(&peer_ni->ibp_conns); + INIT_LIST_HEAD(&peer_ni->ibp_tx_queue); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + /* npeers only grows with the global lock held */ + atomic_inc(&net->ibn_npeers); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + *peerp = peer_ni; + return 0; +} + +void +kiblnd_destroy_peer(struct kib_peer_ni *peer_ni) +{ + struct kib_net *net = peer_ni->ibp_ni->ni_data; + + LASSERT(net != 
NULL); + LASSERT (atomic_read(&peer_ni->ibp_refcount) == 0); + LASSERT(!kiblnd_peer_active(peer_ni)); + LASSERT(kiblnd_peer_idle(peer_ni)); + LASSERT(list_empty(&peer_ni->ibp_tx_queue)); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + + /* NB a peer_ni's connections keep a reference on their peer_ni until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer_ni has been cleaned up when its refcount drops to + * zero. */ + atomic_dec(&net->ibn_npeers); +} + +struct kib_peer_ni * +kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) +{ + /* the caller is responsible for accounting the additional reference + * that this creates */ + struct list_head *peer_list = kiblnd_nid2peerlist(nid); + struct list_head *tmp; + struct kib_peer_ni *peer_ni; + + list_for_each(tmp, peer_list) { + + peer_ni = list_entry(tmp, struct kib_peer_ni, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + /* + * Match a peer if its NID and the NID of the local NI it + * communicates over are the same. Otherwise don't match + * the peer, which will result in a new lnd peer being + * created. + */ + if (peer_ni->ibp_nid != nid || + peer_ni->ibp_ni->ni_nid != ni->ni_nid) + continue; + + CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d) version: %x\n", + peer_ni, libcfs_nid2str(nid), + atomic_read(&peer_ni->ibp_refcount), + peer_ni->ibp_version); + return peer_ni; + } + return NULL; +} + +void +kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni) +{ + LASSERT(list_empty(&peer_ni->ibp_conns)); + + LASSERT (kiblnd_peer_active(peer_ni)); + list_del_init(&peer_ni->ibp_list); + /* lose peerlist's ref */ + kiblnd_peer_decref(peer_ni); +} + +static int +kiblnd_get_peer_info(struct lnet_ni *ni, int index, + lnet_nid_t *nidp, int *count) +{ + struct kib_peer_ni *peer_ni; + struct list_head *ptmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + + list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { + + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (index-- > 0) + continue; + + *nidp = peer_ni->ibp_nid; + *count = atomic_read(&peer_ni->ibp_refcount); + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return 0; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return -ENOENT; +} + +static void +kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni) +{ + struct list_head *ctmp; + struct list_head *cnxt; + struct kib_conn *conn; + + if (list_empty(&peer_ni->ibp_conns)) { + kiblnd_unlink_peer_locked(peer_ni); + } else { + list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { + conn = list_entry(ctmp, struct kib_conn, ibc_list); + + kiblnd_close_conn_locked(conn, 0); + } + /* NB closing peer_ni's last conn unlinked it. */ + } + /* NB peer_ni now unlinked; might even be freed if the peer_ni table had the + * last ref on it. 
*/ +} + +static int +kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + struct kib_peer_ni *peer_ni; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + } else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || peer_ni->ibp_nid == nid)) + continue; + + if (!list_empty(&peer_ni->ibp_tx_queue)) { + LASSERT(list_empty(&peer_ni->ibp_conns)); + + list_splice_init(&peer_ni->ibp_tx_queue, + &zombies); + } + + kiblnd_del_peer_locked(peer_ni); + rc = 0; /* matched something */ + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(&zombies, -EIO, LNET_MSG_STATUS_LOCAL_ERROR); + + return rc; +} + +static struct kib_conn * +kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) +{ + struct kib_peer_ni *peer_ni; + struct list_head *ptmp; + struct kib_conn *conn; + struct list_head *ctmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { + + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + list_for_each(ctmp, &peer_ni->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, struct kib_conn, ibc_list); + kiblnd_conn_addref(conn); + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return conn; + } + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return NULL; +} + +static void +kiblnd_debug_rx(struct kib_rx *rx) +{ + CDEBUG(D_CONSOLE, " %p status %d msg_type %x cred %d\n", + rx, rx->rx_status, rx->rx_msg->ibm_type, + rx->rx_msg->ibm_credits); +} + +static void +kiblnd_debug_tx(struct kib_tx *tx) +{ + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lld " + "cookie %#llx msg %s%s type %x cred %d\n", + tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, + tx->tx_status, ktime_to_ns(tx->tx_deadline), tx->tx_cookie, + tx->tx_lntmsg[0] == NULL ? "-" : "!", + tx->tx_lntmsg[1] == NULL ? 
"-" : "!", + tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); +} + +void +kiblnd_debug_conn(struct kib_conn *conn) +{ + struct list_head *tmp; + int i; + + spin_lock(&conn->ibc_lock); + + CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s:\n", + atomic_read(&conn->ibc_refcount), conn, + conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + CDEBUG(D_CONSOLE, " state %d nposted %d/%d cred %d o_cred %d " + " r_cred %d\n", conn->ibc_state, conn->ibc_noops_posted, + conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_outstanding_credits, conn->ibc_reserved_credits); + CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error); + + CDEBUG(D_CONSOLE, " early_rxs:\n"); + list_for_each(tmp, &conn->ibc_early_rxs) + kiblnd_debug_rx(list_entry(tmp, struct kib_rx, rx_list)); + + CDEBUG(D_CONSOLE, " tx_noops:\n"); + list_for_each(tmp, &conn->ibc_tx_noops) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_nocred) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue:\n"); + list_for_each(tmp, &conn->ibc_tx_queue) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " active_txs:\n"); + list_for_each(tmp, &conn->ibc_active_txs) + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + + CDEBUG(D_CONSOLE, " rxs:\n"); + for (i = 0; i < IBLND_RX_MSGS(conn); i++) + kiblnd_debug_rx(&conn->ibc_rxs[i]); + + spin_unlock(&conn->ibc_lock); +} + +int +kiblnd_translate_mtu(int value) +{ + switch (value) { + default: + return -1; + case 0: + return 0; + case 256: + return IB_MTU_256; + case 512: + return IB_MTU_512; + case 1024: + return IB_MTU_1024; + case 2048: + return IB_MTU_2048; + case 4096: + return IB_MTU_4096; + } +} + +static void +kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) +{ + int mtu; + + /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ + if (cmid->route.path_rec == NULL) + return; + + mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); + LASSERT (mtu >= 0); + if (mtu != 0) + cmid->route.path_rec->mtu = mtu; +} + +static int +kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) +{ + cpumask_t *mask; + int vectors; + int off; + int i; + lnet_nid_t ibp_nid; + + vectors = conn->ibc_cmid->device->num_comp_vectors; + if (vectors <= 1) + return 0; + + mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); + + /* hash NID to CPU id in this partition... */ + ibp_nid = conn->ibc_peer->ibp_nid; + off = do_div(ibp_nid, cpumask_weight(mask)); + for_each_cpu(i, mask) { + if (off-- == 0) + return i % vectors; + } + + LBUG(); + return 1; +} + +/* + * Get the scheduler bound to this CPT. If the scheduler has no + * threads, which means that the CPT has no CPUs, then grab the + * next scheduler that we can use. + * + * This case would be triggered if a NUMA node is configured with + * no associated CPUs. + */ +static struct kib_sched_info * +kiblnd_get_scheduler(int cpt) +{ + struct kib_sched_info *sched; + int i; + + sched = kiblnd_data.kib_scheds[cpt]; + + if (sched->ibs_nthreads > 0) + return sched; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + if (sched->ibs_nthreads > 0) { + CDEBUG(D_NET, "scheduler[%d] has no threads. 
selected scheduler[%d]\n", + cpt, sched->ibs_cpt); + return sched; + } + } + + return NULL; +} + +static unsigned int kiblnd_send_wrs(struct kib_conn *conn) +{ + /* + * One WR for the LNet message + * And ibc_max_frags for the transfer WRs + */ + int ret; + int multiplier = 1 + conn->ibc_max_frags; + enum kib_dev_caps dev_caps = conn->ibc_hdev->ibh_dev->ibd_dev_caps; + + /* FastReg needs two extra WRs for map and invalidate */ + if (dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) + multiplier += 2; + + /* account for a maximum of ibc_queue_depth in-flight transfers */ + ret = multiplier * conn->ibc_queue_depth; + + if (ret > conn->ibc_hdev->ibh_max_qp_wr) { + CDEBUG(D_NET, "peer_credits %u will result in send work " + "request size %d larger than maximum %d device " + "can handle\n", conn->ibc_queue_depth, ret, + conn->ibc_hdev->ibh_max_qp_wr); + conn->ibc_queue_depth = + conn->ibc_hdev->ibh_max_qp_wr / multiplier; + } + + /* don't go beyond the maximum the device can handle */ + return min(ret, conn->ibc_hdev->ibh_max_qp_wr); +} + +struct kib_conn * +kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, + int state, int version) +{ + /* CAVEAT EMPTOR: + * If the new conn is created successfully it takes over the caller's + * ref on 'peer_ni'. It also "owns" 'cmid' and destroys it when it itself + * is destroyed. On failure, the caller's ref on 'peer_ni' remains and + * she must dispose of 'cmid'. (Actually I'd block forever if I tried + * to destroy 'cmid' here since I'm called from the CM which still has + * its ref on 'cmid'). */ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev; + struct ib_qp_init_attr *init_qp_attr; + struct kib_sched_info *sched; +#ifdef HAVE_IB_CQ_INIT_ATTR + struct ib_cq_init_attr cq_attr = {}; +#endif + struct kib_conn *conn; + struct ib_cq *cq; + unsigned long flags; + int cpt; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + + dev = net->ibn_dev; + + cpt = lnet_cpt_of_nid(peer_ni->ibp_nid, peer_ni->ibp_ni); + sched = kiblnd_get_scheduler(cpt); + + if (sched == NULL) { + CERROR("no schedulers available. node is unhealthy\n"); + goto failed_0; + } + + /* + * The cpt might have changed if we ended up selecting a non cpt + * native scheduler. So use the scheduler's cpt instead. 
+ */ + cpt = sched->ibs_cpt; + + LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, + sizeof(*init_qp_attr)); + if (init_qp_attr == NULL) { + CERROR("Can't allocate qp_attr for %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + goto failed_0; + } + + LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); + if (conn == NULL) { + CERROR("Can't allocate connection for %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + goto failed_1; + } + + conn->ibc_state = IBLND_CONN_INIT; + conn->ibc_version = version; + conn->ibc_peer = peer_ni; /* I take the caller's ref */ + cmid->context = conn; /* for future CM callbacks */ + conn->ibc_cmid = cmid; + conn->ibc_max_frags = peer_ni->ibp_max_frags; + conn->ibc_queue_depth = peer_ni->ibp_queue_depth; + + INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_noops); + INIT_LIST_HEAD(&conn->ibc_tx_queue); + INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); + INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); + INIT_LIST_HEAD(&conn->ibc_active_txs); + INIT_LIST_HEAD(&conn->ibc_zombie_txs); + spin_lock_init(&conn->ibc_lock); + + LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, + sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed_2; + } + + write_lock_irqsave(glock, flags); + if (dev->ibd_failover) { + write_unlock_irqrestore(glock, flags); + CERROR("%s: failover in progress\n", dev->ibd_ifname); + goto failed_2; + } + + if (dev->ibd_hdev->ibh_ibdev != cmid->device) { + /* wakeup failover thread and teardown connection */ + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + write_unlock_irqrestore(glock, flags); + CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", + cmid->device->name, dev->ibd_ifname); + goto failed_2; + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + conn->ibc_hdev = dev->ibd_hdev; + + kiblnd_setup_mtu_locked(cmid); + + write_unlock_irqrestore(glock, flags); + + LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, + IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed_2; + } + + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, + IBLND_RX_MSG_PAGES(conn)); + if (rc != 0) + goto failed_2; + + kiblnd_map_rx_descs(conn); + +#ifdef HAVE_IB_CQ_INIT_ATTR + cq_attr.cqe = IBLND_CQ_ENTRIES(conn); + cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt); + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + &cq_attr); +#else + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES(conn), + kiblnd_get_completion_vector(conn, cpt)); +#endif + if (IS_ERR(cq)) { + /* + * on MLX-5 (possibly MLX-4 as well) this error could be + * hit if the concurrent_sends and/or peer_tx_credits is set + * too high. 
Or due to an MLX-5 bug which tries to + * allocate 256kb via kmalloc for WR cookie array + */ + CERROR("Failed to create CQ with %d CQEs: %ld\n", + IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); + goto failed_2; + } + + conn->ibc_cq = cq; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc != 0) { + CERROR("Can't request completion notification: %d\n", rc); + goto failed_2; + } + + init_qp_attr->event_handler = kiblnd_qp_event; + init_qp_attr->qp_context = conn; + init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge; + init_qp_attr->cap.max_recv_sge = 1; + init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr->qp_type = IB_QPT_RC; + init_qp_attr->send_cq = cq; + init_qp_attr->recv_cq = cq; + /* + * kiblnd_send_wrs() can change the connection's queue depth if + * the maximum work requests for the device is maxed out + */ + init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); + + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); + if (rc) { + CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " + "send_sge: %d, recv_sge: %d\n", + rc, init_qp_attr->cap.max_send_wr, + init_qp_attr->cap.max_recv_wr, + init_qp_attr->cap.max_send_sge, + init_qp_attr->cap.max_recv_sge); + goto failed_2; + } + + conn->ibc_sched = sched; + + if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth) + CWARN("peer %s - queue depth reduced from %u to %u" + " to allow for qp creation\n", + libcfs_nid2str(peer_ni->ibp_nid), + peer_ni->ibp_queue_depth, + conn->ibc_queue_depth); + + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + + /* 1 ref for caller and each rxmsg */ + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); + conn->ibc_nrx = IBLND_RX_MSGS(conn); + + /* post receives */ + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); + + /* Make posted receives complete */ + kiblnd_abort_receives(conn); + + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ + spin_lock_irqsave(&sched->ibs_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + /* cmid will be destroyed by CM(ofed) after cm_callback + * returned, so we can't refer it anymore + * (by kiblnd_connd()->kiblnd_destroy_conn) */ + rdma_destroy_qp(conn->ibc_cmid); + conn->ibc_cmid = NULL; + + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS(conn)) + kiblnd_conn_decref(conn); + + return NULL; + } + } + + /* Init successful! 
*/ + LASSERT (state == IBLND_CONN_ACTIVE_CONNECT || + state == IBLND_CONN_PASSIVE_WAIT); + conn->ibc_state = state; + + /* 1 more conn */ + atomic_inc(&net->ibn_nconns); + return conn; + + failed_2: + kiblnd_destroy_conn(conn); + LIBCFS_FREE(conn, sizeof(*conn)); + failed_1: + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + failed_0: + return NULL; +} + +void +kiblnd_destroy_conn(struct kib_conn *conn) +{ + struct rdma_cm_id *cmid = conn->ibc_cmid; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + + LASSERT (!in_interrupt()); + LASSERT (atomic_read(&conn->ibc_refcount) == 0); + LASSERT(list_empty(&conn->ibc_early_rxs)); + LASSERT(list_empty(&conn->ibc_tx_noops)); + LASSERT(list_empty(&conn->ibc_tx_queue)); + LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); + LASSERT(list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_noops_posted == 0); + LASSERT (conn->ibc_nsends_posted == 0); + + switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + + case IBLND_CONN_DISCONNECTED: + /* connvars should have been freed already */ + LASSERT (conn->ibc_connvars == NULL); + break; + + case IBLND_CONN_INIT: + break; + } + + /* conn->ibc_cmid might be destroyed by CM already */ + if (cmid != NULL && cmid->qp != NULL) + rdma_destroy_qp(cmid); + + if (conn->ibc_cq) + ib_destroy_cq(conn->ibc_cq); + + kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED, + LNET_MSG_STATUS_OK); + + if (conn->ibc_rx_pages != NULL) + kiblnd_unmap_rx_descs(conn); + + if (conn->ibc_rxs != NULL) { + LIBCFS_FREE(conn->ibc_rxs, + IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); + } + + if (conn->ibc_connvars != NULL) + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + + if (conn->ibc_hdev != NULL) + kiblnd_hdev_decref(conn->ibc_hdev); + + /* See CAVEAT EMPTOR above in kiblnd_create_conn */ + if (conn->ibc_state != IBLND_CONN_INIT) { + struct kib_net *net = peer_ni->ibp_ni->ni_data; + + kiblnd_peer_decref(peer_ni); + rdma_destroy_id(cmid); + atomic_dec(&net->ibn_nconns); + } +} + +int +kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why) +{ + struct kib_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { + conn = list_entry(ctmp, struct kib_conn, ibc_list); + + CDEBUG(D_NET, "Closing conn -> %s, " + "version: %x, reason: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_version, why); + + kiblnd_close_conn_locked(conn, why); + count++; + } + + return count; +} + +int +kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, + int version, __u64 incarnation) +{ + struct kib_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { + conn = list_entry(ctmp, struct kib_conn, ibc_list); + + if (conn->ibc_version == version && + conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn -> %s version: %x, " + "incarnation:%#llx(%x, %#llx)\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_version, conn->ibc_incarnation, + version, incarnation); + + kiblnd_close_conn_locked(conn, -ESTALE); + count++; + } + + return count; +} + +static int +kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + unsigned long flags; + int count = 0; + + 
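/* Sweep every peer hash bucket under the global write lock; each matching peer_ni has all of its conns closed via kiblnd_close_peer_conns_locked(). */ +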
write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || nid == peer_ni->ibp_nid)) + continue; + + count += kiblnd_close_peer_conns_locked(peer_ni, 0); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == LNET_NID_ANY) + return 0; + + return (count == 0) ? -ENOENT : 0; +} + +static int +kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int count = 0; + + rc = kiblnd_get_peer_info(ni, data->ioc_count, + &nid, &count); + data->ioc_nid = nid; + data->ioc_count = count; + break; + } + + case IOC_LIBCFS_DEL_PEER: { + rc = kiblnd_del_peer(ni, data->ioc_nid); + break; + } + case IOC_LIBCFS_GET_CONN: { + struct kib_conn *conn; + + rc = 0; + conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); + if (conn == NULL) { + rc = -ENOENT; + break; + } + + LASSERT(conn->ibc_cmid != NULL); + data->ioc_nid = conn->ibc_peer->ibp_nid; + if (conn->ibc_cmid->route.path_rec == NULL) + data->ioc_u32[0] = 0; /* iWarp has no path MTU */ + else + data->ioc_u32[0] = + ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); + kiblnd_conn_decref(conn); + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kiblnd_close_matching_conns(ni, data->ioc_nid); + break; + } + + default: + break; + } + + return rc; +} + +static void +kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) +{ + time64_t last_alive = 0; + time64_t now = ktime_get_seconds(); + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_peer_ni *peer_ni; + unsigned long flags; + + read_lock_irqsave(glock, flags); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL) + last_alive = peer_ni->ibp_last_alive; + + read_unlock_irqrestore(glock, flags); + + if (last_alive != 0) + *when = last_alive; + + /* peer_ni is not persistent in hash, trigger peer_ni creation + * and connection establishment with a NULL tx */ + if (peer_ni == NULL) + kiblnd_launch_tx(ni, NULL, nid); + + CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago\n", + libcfs_nid2str(nid), peer_ni, + last_alive ? 
now - last_alive : -1); + return; +} + +static void +kiblnd_free_pages(struct kib_pages *p) +{ + int npages = p->ibp_npages; + int i; + + for (i = 0; i < npages; i++) { + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + } + + LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages])); +} + +int +kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) +{ + struct kib_pages *p; + int i; + + LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, + offsetof(struct kib_pages, ibp_pages[npages])); + if (p == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", npages); + return -ENOMEM; + } + + memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + GFP_NOFS); + if (p->ibp_pages[i] == NULL) { + CERROR("Can't allocate page %d of %d\n", i, npages); + kiblnd_free_pages(p); + return -ENOMEM; + } + } + + *pp = p; + return 0; +} + +void +kiblnd_unmap_rx_descs(struct kib_conn *conn) +{ + struct kib_rx *rx; + int i; + + LASSERT (conn->ibc_rxs != NULL); + LASSERT (conn->ibc_hdev != NULL); + + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rx = &conn->ibc_rxs[i]; + + LASSERT(rx->rx_nob >= 0); /* not posted */ + + kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(rx, rx_msgunmap, + rx->rx_msgaddr), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } + + kiblnd_free_pages(conn->ibc_rx_pages); + + conn->ibc_rx_pages = NULL; +} + +void +kiblnd_map_rx_descs(struct kib_conn *conn) +{ + struct kib_rx *rx; + struct page *pg; + int pg_off; + int ipg; + int i; + + for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) { + pg = conn->ibc_rx_pages->ibp_pages[ipg]; + rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); + + rx->rx_msgaddr = + kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, + rx->rx_msg, IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, + rx->rx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); + + CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", + i, rx->rx_msg, rx->rx_msgaddr, + (__u64)(page_to_phys(pg) + pg_off)); + + pg_off += IBLND_MSG_SIZE; + LASSERT(pg_off <= PAGE_SIZE); + + if (pg_off == PAGE_SIZE) { + pg_off = 0; + ipg++; + LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn)); + } + } +} + +static void +kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) +{ + struct kib_hca_dev *hdev = tpo->tpo_hdev; + struct kib_tx *tx; + int i; + + LASSERT (tpo->tpo_pool.po_allocated == 0); + + if (hdev == NULL) + return; + + for (i = 0; i < tpo->tpo_pool.po_size; i++) { + tx = &tpo->tpo_tx_descs[i]; + kiblnd_dma_unmap_single(hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(tx, tx_msgunmap, + tx->tx_msgaddr), + IBLND_MSG_SIZE, DMA_TO_DEVICE); + } + + kiblnd_hdev_decref(hdev); + tpo->tpo_hdev = NULL; +} + +static struct kib_hca_dev * +kiblnd_current_hdev(struct kib_dev *dev) +{ + struct kib_hca_dev *hdev; + unsigned long flags; + int i = 0; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (dev->ibd_failover) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + if (i++ % 50 == 0) + CDEBUG(D_NET, "%s: Wait for failover\n", + dev->ibd_ifname); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 100); + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + hdev = dev->ibd_hdev; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + 
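+ /* the hdev reference taken above is handed back to the caller, who drops it with kiblnd_hdev_decref() */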
+ return hdev; +} + +static void +kiblnd_map_tx_pool(struct kib_tx_pool *tpo) +{ + struct kib_pages *txpgs = tpo->tpo_tx_pages; + struct kib_pool *pool = &tpo->tpo_pool; + struct kib_net *net = pool->po_owner->ps_net; + struct kib_dev *dev; + struct page *page; + struct kib_tx *tx; + int page_offset; + int ipage; + int i; + + LASSERT (net != NULL); + + dev = net->ibn_dev; + + /* pre-mapped messages are not bigger than 1 page */ + CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0); + + tpo->tpo_hdev = kiblnd_current_hdev(dev); + + for (ipage = page_offset = i = 0; i < pool->po_size; i++) { + page = txpgs->ibp_pages[ipage]; + tx = &tpo->tpo_tx_descs[i]; + + tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + + page_offset); + + tx->tx_msgaddr = kiblnd_dma_map_single(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msg, + IBLND_MSG_SIZE, + DMA_TO_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); + + list_add(&tx->tx_list, &pool->po_free_list); + + page_offset += IBLND_MSG_SIZE; + LASSERT(page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT(ipage <= txpgs->ibp_npages); + } + } +} + +static void +kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) +{ + LASSERT(fpo->fpo_map_count == 0); + +#ifdef HAVE_FMR_POOL_API + if (fpo->fpo_is_fmr && fpo->fmr.fpo_fmr_pool) { + ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); + } else +#endif /* HAVE_FMR_POOL_API */ + { + struct kib_fast_reg_descriptor *frd, *tmp; + int i = 0; + + list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, + frd_list) { + list_del(&frd->frd_list); +#ifndef HAVE_IB_MAP_MR_SG + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + ib_dereg_mr(frd->frd_mr); + LIBCFS_FREE(frd, sizeof(*frd)); + i++; + } + if (i < fpo->fast_reg.fpo_pool_size) + CERROR("FastReg pool still has %d regions registered\n", + fpo->fast_reg.fpo_pool_size - i); + } + + if (fpo->fpo_hdev) + kiblnd_hdev_decref(fpo->fpo_hdev); + + LIBCFS_FREE(fpo, sizeof(*fpo)); +} + +static void +kiblnd_destroy_fmr_pool_list(struct list_head *head) +{ + struct kib_fmr_pool *fpo, *tmp; + + list_for_each_entry_safe(fpo, tmp, head, fpo_list) { + list_del(&fpo->fpo_list); + kiblnd_destroy_fmr_pool(fpo); + } +} + +static int +kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables, + int ncpts) +{ + int size = tunables->lnd_fmr_pool_size / ncpts; + + return max(IBLND_FMR_POOL, size); +} + +static int +kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, + int ncpts) +{ + int size = tunables->lnd_fmr_flush_trigger / ncpts; + + return max(IBLND_FMR_POOL_FLUSH, size); +} + +#ifdef HAVE_FMR_POOL_API +static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo) +{ + struct ib_fmr_pool_param param = { + .max_pages_per_fmr = LNET_MAX_IOV, + .page_shift = PAGE_SHIFT, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + .pool_size = fps->fps_pool_size, + .dirty_watermark = fps->fps_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = !!fps->fps_cache }; + int rc = 0; + + fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, + ¶m); + if (IS_ERR(fpo->fmr.fpo_fmr_pool)) { + rc = PTR_ERR(fpo->fmr.fpo_fmr_pool); + if (rc != -ENOSYS) + CERROR("Failed to create FMR pool: %d\n", rc); + else + CERROR("FMRs are not supported\n"); + } + 
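/* this pool uses the FMR API rather than FastReg; on error the caller sees rc and tears the pool down */ +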
fpo->fpo_is_fmr = true; + + return rc; +} +#endif /* HAVE_FMR_POOL_API */ + +static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo, + enum kib_dev_caps dev_caps) +{ + struct kib_fast_reg_descriptor *frd, *tmp; + int i, rc; + +#ifdef HAVE_FMR_POOL_API + fpo->fpo_is_fmr = false; +#endif + + INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); + fpo->fast_reg.fpo_pool_size = 0; + for (i = 0; i < fps->fps_pool_size; i++) { + LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt, + sizeof(*frd)); + if (!frd) { + CERROR("Failed to allocate a new fast_reg descriptor\n"); + rc = -ENOMEM; + goto out; + } + frd->frd_mr = NULL; + +#ifndef HAVE_IB_MAP_MR_SG + frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev, + LNET_MAX_IOV); + if (IS_ERR(frd->frd_frpl)) { + rc = PTR_ERR(frd->frd_frpl); + CERROR("Failed to allocate ib_fast_reg_page_list: %d\n", + rc); + frd->frd_frpl = NULL; + goto out_middle; + } +#endif + +#ifdef HAVE_IB_ALLOC_FAST_REG_MR + frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd, + LNET_MAX_IOV); +#else + /* + * it is expected to get here if this is an MLX-5 card. + * MLX-4 cards will always use FMR and MLX-5 cards will + * always use fast_reg. It turns out that some MLX-5 cards + * (possibly due to older FW versions) do not natively support + * gaps. So we will need to track them here. + */ + frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, +#ifdef IB_MR_TYPE_SG_GAPS + ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) ? + IB_MR_TYPE_SG_GAPS : + IB_MR_TYPE_MEM_REG, +#else + IB_MR_TYPE_MEM_REG, +#endif + LNET_MAX_IOV); + if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) + CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n"); +#endif + if (IS_ERR(frd->frd_mr)) { + rc = PTR_ERR(frd->frd_mr); + CERROR("Failed to allocate ib_fast_reg_mr: %d\n", rc); + frd->frd_mr = NULL; + goto out_middle; + } + + /* There appears to be a bug in MLX5 code where you must + * invalidate the rkey of a new FastReg pool before first + * using it. Thus, I am marking the FRD invalid here. 
*/ + frd->frd_valid = false; + + list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); + fpo->fast_reg.fpo_pool_size++; + } + + return 0; + +out_middle: + if (frd->frd_mr) + ib_dereg_mr(frd->frd_mr); +#ifndef HAVE_IB_MAP_MR_SG + if (frd->frd_frpl) + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + LIBCFS_FREE(frd, sizeof(*frd)); + +out: + list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, + frd_list) { + list_del(&frd->frd_list); +#ifndef HAVE_IB_MAP_MR_SG + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + ib_dereg_mr(frd->frd_mr); + LIBCFS_FREE(frd, sizeof(*frd)); + } + + return rc; +} + +static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool **pp_fpo) +{ + struct kib_dev *dev = fps->fps_net->ibn_dev; + struct kib_fmr_pool *fpo; + int rc; + + LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); + if (!fpo) { + return -ENOMEM; + } + memset(fpo, 0, sizeof(*fpo)); + + fpo->fpo_hdev = kiblnd_current_hdev(dev); + +#ifdef HAVE_FMR_POOL_API + if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + rc = kiblnd_alloc_fmr_pool(fps, fpo); + else +#endif /* HAVE_FMR_POOL_API */ + rc = kiblnd_alloc_freg_pool(fps, fpo, dev->ibd_dev_caps); + if (rc) + goto out_fpo; + + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + fpo->fpo_owner = fps; + *pp_fpo = fpo; + + return 0; + +out_fpo: + kiblnd_hdev_decref(fpo->fpo_hdev); + LIBCFS_FREE(fpo, sizeof(*fpo)); + return rc; +} + +static void +kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies) +{ + if (fps->fps_net == NULL) /* intialized? */ + return; + + spin_lock(&fps->fps_lock); + + while (!list_empty(&fps->fps_pool_list)) { + struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next, + struct kib_fmr_pool, + fpo_list); + + fpo->fpo_failed = 1; + list_del(&fpo->fpo_list); + if (fpo->fpo_map_count == 0) + list_add(&fpo->fpo_list, zombies); + else + list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); + } + + spin_unlock(&fps->fps_lock); +} + +static void +kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) +{ + if (fps->fps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + } +} + +static int +kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, + struct kib_net *net, + struct lnet_ioctl_config_o2iblnd_tunables *tunables) +{ + struct kib_fmr_pool *fpo; + int rc; + + memset(fps, 0, sizeof(struct kib_fmr_poolset)); + + fps->fps_net = net; + fps->fps_cpt = cpt; + + fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts); + fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts); + fps->fps_cache = tunables->lnd_fmr_cache; + + spin_lock_init(&fps->fps_lock); + INIT_LIST_HEAD(&fps->fps_pool_list); + INIT_LIST_HEAD(&fps->fps_failed_pool_list); + + rc = kiblnd_create_fmr_pool(fps, &fpo); + if (rc == 0) + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + + return rc; +} + +static int +kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, time64_t now) +{ + if (fpo->fpo_map_count != 0) /* still in use */ + return 0; + if (fpo->fpo_failed) + return 1; + return now >= fpo->fpo_deadline; +} + +#if defined(HAVE_FMR_POOL_API) || !defined(HAVE_IB_MAP_MR_SG) +static int +kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) +{ + struct kib_hca_dev *hdev; + __u64 *pages = tx->tx_pages; + int npages; + int size; + int i; + + hdev = tx->tx_pool->tpo_hdev; + + for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { + for (size = 0; size < rd->rd_frags[i].rf_nob; + size += hdev->ibh_page_size) { + pages[npages++] = (rd->rd_frags[i].rf_addr & + hdev->ibh_page_mask) + size; + } + } + + return npages; +} +#endif + +void +kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct kib_fmr_pool *fpo = fmr->fmr_pool; + struct kib_fmr_poolset *fps; + time64_t now = ktime_get_seconds(); + struct kib_fmr_pool *tmp; + + if (!fpo) + return; + + fps = fpo->fpo_owner; + +#ifdef HAVE_FMR_POOL_API + if (fpo->fpo_is_fmr) { + if (fmr->fmr_pfmr) { + ib_fmr_pool_unmap(fmr->fmr_pfmr); + fmr->fmr_pfmr = NULL; + } + + if (status) { + int rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); + LASSERT(!rc); + } + } else +#endif /* HAVE_FMR_POOL_API */ + { + struct kib_fast_reg_descriptor *frd = fmr->fmr_frd; + + if (frd) { + frd->frd_valid = false; + frd->frd_posted = false; + fmr->fmr_frd = NULL; + spin_lock(&fps->fps_lock); + list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); + spin_unlock(&fps->fps_lock); + } + } + fmr->fmr_pool = NULL; + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; /* decref the pool */ + + list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { + /* the first pool is persistent */ + if (fps->fps_pool_list.next == &fpo->fpo_list) + continue; + + if (kiblnd_fmr_pool_is_idle(fpo, now)) { + list_move(&fpo->fpo_list, &zombies); + fps->fps_version++; + } + } + spin_unlock(&fps->fps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_fmr_pool_list(&zombies); +} + +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr) +{ + struct kib_fmr_pool *fpo; + __u64 version; + bool is_rx = (rd != tx->tx_rd); +#ifdef HAVE_FMR_POOL_API + __u64 *pages = tx->tx_pages; + bool tx_pages_mapped = 0; + int npages = 0; +#endif + int rc; + +again: + spin_lock(&fps->fps_lock); + version = fps->fps_version; + list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + fpo->fpo_map_count++; + +#ifdef HAVE_FMR_POOL_API + 
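/* map through the kernel FMR pool when fpo_is_fmr is set; otherwise fall through to the FastReg descriptor path below */ +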
fmr->fmr_pfmr = NULL; + if (fpo->fpo_is_fmr) { + struct ib_pool_fmr *pfmr; + + spin_unlock(&fps->fps_lock); + + if (!tx_pages_mapped) { + npages = kiblnd_map_tx_pages(tx, rd); + tx_pages_mapped = 1; + } + + pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool, + pages, npages, iov); + if (likely(!IS_ERR(pfmr))) { + fmr->fmr_key = is_rx ? pfmr->fmr->rkey + : pfmr->fmr->lkey; + fmr->fmr_frd = NULL; + fmr->fmr_pfmr = pfmr; + fmr->fmr_pool = fpo; + return 0; + } + rc = PTR_ERR(pfmr); + } else +#endif /* HAVE_FMR_POOL_API */ + { + if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { + struct kib_fast_reg_descriptor *frd; +#ifdef HAVE_IB_MAP_MR_SG + struct ib_reg_wr *wr; + int n; +#else + struct ib_rdma_wr *wr; + struct ib_fast_reg_page_list *frpl; +#endif + struct ib_mr *mr; + + frd = list_first_entry(&fpo->fast_reg.fpo_pool_list, + struct kib_fast_reg_descriptor, + frd_list); + list_del(&frd->frd_list); + spin_unlock(&fps->fps_lock); + +#ifndef HAVE_IB_MAP_MR_SG + frpl = frd->frd_frpl; +#endif + mr = frd->frd_mr; + + if (!frd->frd_valid) { + struct ib_rdma_wr *inv_wr; + __u32 key = is_rx ? mr->rkey : mr->lkey; + + inv_wr = &frd->frd_inv_wr; + memset(inv_wr, 0, sizeof(*inv_wr)); + + inv_wr->wr.opcode = IB_WR_LOCAL_INV; + inv_wr->wr.wr_id = IBLND_WID_MR; + inv_wr->wr.ex.invalidate_rkey = key; + + /* Bump the key */ + key = ib_inc_rkey(key); + ib_update_fast_reg_key(mr, key); + } + +#ifdef HAVE_IB_MAP_MR_SG +#ifdef HAVE_IB_MAP_MR_SG_5ARGS + n = ib_map_mr_sg(mr, tx->tx_frags, + rd->rd_nfrags, NULL, PAGE_SIZE); +#else + n = ib_map_mr_sg(mr, tx->tx_frags, + rd->rd_nfrags, PAGE_SIZE); +#endif /* HAVE_IB_MAP_MR_SG_5ARGS */ + if (unlikely(n != rd->rd_nfrags)) { + CERROR("Failed to map mr %d/%d " + "elements\n", n, rd->rd_nfrags); + return n < 0 ? n : -EINVAL; + } + + wr = &frd->frd_fastreg_wr; + memset(wr, 0, sizeof(*wr)); + + wr->wr.opcode = IB_WR_REG_MR; + wr->wr.wr_id = IBLND_WID_MR; + wr->wr.num_sge = 0; + wr->wr.send_flags = 0; + wr->mr = mr; + wr->key = is_rx ? mr->rkey : mr->lkey; + wr->access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#else /* HAVE_IB_MAP_MR_SG */ + if (!tx_pages_mapped) { + npages = kiblnd_map_tx_pages(tx, rd); + tx_pages_mapped = 1; + } + + LASSERT(npages <= frpl->max_page_list_len); + memcpy(frpl->page_list, pages, + sizeof(*pages) * npages); + + /* Prepare FastReg WR */ + wr = &frd->frd_fastreg_wr; + memset(wr, 0, sizeof(*wr)); + + wr->wr.opcode = IB_WR_FAST_REG_MR; + wr->wr.wr_id = IBLND_WID_MR; + + wr->wr.wr.fast_reg.iova_start = iov; + wr->wr.wr.fast_reg.page_list = frpl; + wr->wr.wr.fast_reg.page_list_len = npages; + wr->wr.wr.fast_reg.page_shift = PAGE_SHIFT; + wr->wr.wr.fast_reg.length = nob; + wr->wr.wr.fast_reg.rkey = + is_rx ? mr->rkey : mr->lkey; + wr->wr.wr.fast_reg.access_flags = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#endif /* HAVE_IB_MAP_MR_SG */ + + fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; + fmr->fmr_frd = frd; + fmr->fmr_pool = fpo; + frd->frd_posted = false; + return 0; + } + spin_unlock(&fps->fps_lock); + rc = -EAGAIN; + } + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; + if (rc != -EAGAIN) { + spin_unlock(&fps->fps_lock); + return rc; + } + + /* EAGAIN and ... 
*/ + if (version != fps->fps_version) { + spin_unlock(&fps->fps_lock); + goto again; + } + } + + if (fps->fps_increasing) { + spin_unlock(&fps->fps_lock); + CDEBUG(D_NET, "Another thread is allocating new " + "FMR pool, waiting for her to complete\n"); + schedule(); + goto again; + + } + + if (ktime_get_seconds() < fps->fps_next_retry) { + /* someone failed recently */ + spin_unlock(&fps->fps_lock); + return -EAGAIN; + } + + fps->fps_increasing = 1; + spin_unlock(&fps->fps_lock); + + CDEBUG(D_NET, "Allocate new FMR pool\n"); + rc = kiblnd_create_fmr_pool(fps, &fpo); + spin_lock(&fps->fps_lock); + fps->fps_increasing = 0; + if (rc == 0) { + fps->fps_version++; + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + } else { + fps->fps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; + } + spin_unlock(&fps->fps_lock); + + goto again; +} + +static void +kiblnd_fini_pool(struct kib_pool *pool) +{ + LASSERT(list_empty(&pool->po_free_list)); + LASSERT(pool->po_allocated == 0); + + CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); +} + +static void +kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size) +{ + CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); + + memset(pool, 0, sizeof(struct kib_pool)); + INIT_LIST_HEAD(&pool->po_free_list); + pool->po_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + pool->po_owner = ps; + pool->po_size = size; +} + +static void +kiblnd_destroy_pool_list(struct list_head *head) +{ + struct kib_pool *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, struct kib_pool, po_list); + list_del(&pool->po_list); + + LASSERT(pool->po_owner != NULL); + pool->po_owner->ps_pool_destroy(pool); + } +} + +static void +kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) +{ + if (ps->ps_net == NULL) /* intialized? */ + return; + + spin_lock(&ps->ps_lock); + while (!list_empty(&ps->ps_pool_list)) { + struct kib_pool *po = list_entry(ps->ps_pool_list.next, + struct kib_pool, po_list); + + po->po_failed = 1; + list_del(&po->po_list); + if (po->po_allocated == 0) + list_add(&po->po_list, zombies); + else + list_add(&po->po_list, &ps->ps_failed_pool_list); + } + spin_unlock(&ps->ps_lock); +} + +static void +kiblnd_fini_poolset(struct kib_poolset *ps) +{ + if (ps->ps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); + kiblnd_destroy_pool_list(&ps->ps_pool_list); + } +} + +static int +kiblnd_init_poolset(struct kib_poolset *ps, int cpt, + struct kib_net *net, char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) +{ + struct kib_pool *pool; + int rc; + + memset(ps, 0, sizeof(struct kib_poolset)); + + ps->ps_cpt = cpt; + ps->ps_net = net; + ps->ps_pool_create = po_create; + ps->ps_pool_destroy = po_destroy; + ps->ps_node_init = nd_init; + ps->ps_node_fini = nd_fini; + ps->ps_pool_size = size; + if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) + >= sizeof(ps->ps_name)) + return -E2BIG; + spin_lock_init(&ps->ps_lock); + INIT_LIST_HEAD(&ps->ps_pool_list); + INIT_LIST_HEAD(&ps->ps_failed_pool_list); + + rc = ps->ps_pool_create(ps, size, &pool); + if (rc == 0) + list_add(&pool->po_list, &ps->ps_pool_list); + else + CERROR("Failed to create the first pool for %s\n", ps->ps_name); + + return rc; +} + +static int +kiblnd_pool_is_idle(struct kib_pool *pool, time64_t now) +{ + if (pool->po_allocated != 0) /* still in use */ + return 0; + if (pool->po_failed) + return 1; + return now >= pool->po_deadline; +} + +void +kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct kib_poolset *ps = pool->po_owner; + struct kib_pool *tmp; + time64_t now = ktime_get_seconds(); + + spin_lock(&ps->ps_lock); + + if (ps->ps_node_fini != NULL) + ps->ps_node_fini(pool, node); + + LASSERT(pool->po_allocated > 0); + list_add(node, &pool->po_free_list); + pool->po_allocated--; + + list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { + /* the first pool is persistent */ + if (ps->ps_pool_list.next == &pool->po_list) + continue; + + if (kiblnd_pool_is_idle(pool, now)) + list_move(&pool->po_list, &zombies); + } + spin_unlock(&ps->ps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_pool_list(&zombies); +} + +struct list_head * +kiblnd_pool_alloc_node(struct kib_poolset *ps) +{ + struct list_head *node; + struct kib_pool *pool; + int rc; + unsigned int interval = 1; + ktime_t time_before; + unsigned int trips = 0; + +again: + spin_lock(&ps->ps_lock); + list_for_each_entry(pool, &ps->ps_pool_list, po_list) { + if (list_empty(&pool->po_free_list)) + continue; + + pool->po_allocated++; + pool->po_deadline = ktime_get_seconds() + + IBLND_POOL_DEADLINE; + node = pool->po_free_list.next; + list_del(node); + + if (ps->ps_node_init != NULL) { + /* still hold the lock */ + ps->ps_node_init(pool, node); + } + spin_unlock(&ps->ps_lock); + return node; + } + + /* no available tx pool and ... */ + if (ps->ps_increasing) { + /* another thread is allocating a new pool */ + spin_unlock(&ps->ps_lock); + trips++; + CDEBUG(D_NET, "Another thread is allocating new " + "%s pool, waiting %d HZs for her to complete." 
+ "trips = %d\n", + ps->ps_name, interval, trips); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(interval); + if (interval < cfs_time_seconds(1)) + interval *= 2; + + goto again; + } + + if (ktime_get_seconds() < ps->ps_next_retry) { + /* someone failed recently */ + spin_unlock(&ps->ps_lock); + return NULL; + } + + ps->ps_increasing = 1; + spin_unlock(&ps->ps_lock); + + CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); + time_before = ktime_get(); + rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); + CDEBUG(D_NET, "ps_pool_create took %lld ms to complete", + ktime_ms_delta(ktime_get(), time_before)); + + spin_lock(&ps->ps_lock); + ps->ps_increasing = 0; + if (rc == 0) { + list_add_tail(&pool->po_list, &ps->ps_pool_list); + } else { + ps->ps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; + CERROR("Can't allocate new %s pool because out of memory\n", + ps->ps_name); + } + spin_unlock(&ps->ps_lock); + + goto again; +} + +static void +kiblnd_destroy_tx_pool(struct kib_pool *pool) +{ + struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, + tpo_pool); + int i; + + LASSERT (pool->po_allocated == 0); + + if (tpo->tpo_tx_pages != NULL) { + kiblnd_unmap_tx_pool(tpo); + kiblnd_free_pages(tpo->tpo_tx_pages); + } + + if (tpo->tpo_tx_descs == NULL) + goto out; + + for (i = 0; i < pool->po_size; i++) { + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; + int wrq_sge = *kiblnd_tunables.kib_wrq_sge; + + list_del(&tx->tx_list); + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, + LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_frags != NULL) + LIBCFS_FREE(tx->tx_frags, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_frags)); + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_sge != NULL) + LIBCFS_FREE(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge * + sizeof(*tx->tx_sge)); + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(struct kib_rdma_desc, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } + + LIBCFS_FREE(tpo->tpo_tx_descs, + pool->po_size * sizeof(struct kib_tx)); +out: + kiblnd_fini_pool(pool); + LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool)); +} + +static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int ntx; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + ntx = tunables->lnd_ntx / ncpts; + + return max(IBLND_TX_POOL, ntx); +} + +static int +kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) +{ + int i; + int npg; + struct kib_pool *pool; + struct kib_tx_pool *tpo; + + LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); + if (tpo == NULL) { + CERROR("Failed to allocate TX pool\n"); + return -ENOMEM; + } + + pool = &tpo->tpo_pool; + kiblnd_init_pool(ps, pool, size); + tpo->tpo_tx_descs = NULL; + tpo->tpo_tx_pages = NULL; + + npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool)); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, + size * sizeof(struct kib_tx)); + if (tpo->tpo_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", size); + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); + + for (i = 0; i < size; i++) { + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; + 
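/* per-descriptor buffers are allocated on the pool's CPT; if any allocation fails the loop breaks and ps_pool_destroy() frees the lot */ +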
int wrq_sge = *kiblnd_tunables.kib_wrq_sge; + + tx->tx_pool = tpo; + if (ps->ps_net->ibn_fmr_ps != NULL) { + LIBCFS_CPT_ALLOC(tx->tx_pages, + lnet_cpt_table(), ps->ps_cpt, + LNET_MAX_IOV * sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1); + + LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, + offsetof(struct kib_rdma_desc, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; + } + + if (i == size) { + kiblnd_map_tx_pool(tpo); + *pp_po = pool; + return 0; + } + + ps->ps_pool_destroy(pool); + return -ENOMEM; +} + +static void +kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) +{ + struct kib_tx_poolset *tps = container_of(pool->po_owner, + struct kib_tx_poolset, + tps_poolset); + struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); + + tx->tx_cookie = tps->tps_next_tx_cookie++; +} + +static void +kiblnd_net_fini_pools(struct kib_net *net) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + struct kib_tx_poolset *tps; + struct kib_fmr_poolset *fps; + + if (net->ibn_tx_ps != NULL) { + tps = net->ibn_tx_ps[i]; + kiblnd_fini_poolset(&tps->tps_poolset); + } + + if (net->ibn_fmr_ps != NULL) { + fps = net->ibn_fmr_ps[i]; + kiblnd_fini_fmr_poolset(fps); + } + } + + if (net->ibn_tx_ps != NULL) { + cfs_percpt_free(net->ibn_tx_ps); + net->ibn_tx_ps = NULL; + } + + if (net->ibn_fmr_ps != NULL) { + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + } +} + +static int +kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, + int ncpts) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; +#ifdef HAVE_IB_GET_DMA_MR + unsigned long flags; +#endif + int cpt; + int rc; + int i; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + +#ifdef HAVE_IB_GET_DMA_MR + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + /* + * if lnd_map_on_demand is zero then we have effectively disabled + * FMR or FastReg and we're using global memory regions + * exclusively. + */ + if (!tunables->lnd_map_on_demand) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + goto create_tx_pool; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +#endif + + if (tunables->lnd_fmr_pool_size < tunables->lnd_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + tunables->lnd_fmr_pool_size, + tunables->lnd_ntx / 4); + rc = -EINVAL; + goto failed; + } + + /* TX pool must be created later than FMR, see LU-2268 + * for details */ + LASSERT(net->ibn_tx_ps == NULL); + + /* premapping can fail if ibd_nmr > 1, so we always create + * FMR pool and map-on-demand if premapping failed */ + + net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct kib_fmr_poolset)); + if (net->ibn_fmr_ps == NULL) { + CERROR("Failed to allocate FMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? 
i : cpts[i]; + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts, + net, tunables); + if (rc != 0) { + CERROR("Can't initialize FMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + if (i > 0) + LASSERT(i == ncpts); + +#ifdef HAVE_IB_GET_DMA_MR + create_tx_pool: +#endif + net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct kib_tx_poolset)); + if (net->ibn_tx_ps == NULL) { + CERROR("Failed to allocate tx pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, + cpt, net, "TX", + kiblnd_tx_pool_size(ni, ncpts), + kiblnd_create_tx_pool, + kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc != 0) { + CERROR("Can't initialize TX pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + return 0; + failed: + kiblnd_net_fini_pools(net); + LASSERT(rc != 0); + return rc; +} + +static int +kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) +{ + struct ib_device_attr *dev_attr; + int rc = 0; + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + +#ifndef HAVE_IB_DEVICE_ATTRS + LIBCFS_ALLOC(dev_attr, sizeof(*dev_attr)); + if (dev_attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_device(hdev->ibh_ibdev, dev_attr); + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + goto out_clean_attr; + } +#else + dev_attr = &hdev->ibh_ibdev->attrs; +#endif + + hdev->ibh_mr_size = dev_attr->max_mr_size; + hdev->ibh_max_qp_wr = dev_attr->max_qp_wr; + + /* Setup device Memory Registration capabilities */ +#ifdef HAVE_FMR_POOL_API +#ifdef HAVE_IB_DEVICE_OPS + if (hdev->ibh_ibdev->ops.alloc_fmr && + hdev->ibh_ibdev->ops.dealloc_fmr && + hdev->ibh_ibdev->ops.map_phys_fmr && + hdev->ibh_ibdev->ops.unmap_fmr) { +#else + if (hdev->ibh_ibdev->alloc_fmr && + hdev->ibh_ibdev->dealloc_fmr && + hdev->ibh_ibdev->map_phys_fmr && + hdev->ibh_ibdev->unmap_fmr) { +#endif + LCONSOLE_INFO("Using FMR for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FMR_ENABLED; + } else +#endif /* HAVE_FMR_POOL_API */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + LCONSOLE_INFO("Using FastReg for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_ENABLED; +#ifndef HAVE_IB_ALLOC_FAST_REG_MR +#ifdef IB_DEVICE_SG_GAPS_REG + if (dev_attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT; +#endif +#endif + } else { + rc = -ENOSYS; + } + + if (rc == 0 && hdev->ibh_mr_size == ~0ULL) + hdev->ibh_mr_shift = 64; + else if (rc != 0) + rc = -EINVAL; + +#ifndef HAVE_IB_DEVICE_ATTRS +out_clean_attr: + LIBCFS_FREE(dev_attr, sizeof(*dev_attr)); +#endif + + if (rc == -ENOSYS) + CERROR("IB device does not support FMRs nor FastRegs, can't " + "register memory: %d\n", rc); + else if (rc == -EINVAL) + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return rc; +} + +#ifdef HAVE_IB_GET_DMA_MR +static void +kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) +{ + if (hdev->ibh_mrs == NULL) + return; + + ib_dereg_mr(hdev->ibh_mrs); + + hdev->ibh_mrs = NULL; +} +#endif + +void +kiblnd_hdev_destroy(struct kib_hca_dev *hdev) +{ +#ifdef HAVE_IB_GET_DMA_MR + kiblnd_hdev_cleanup_mrs(hdev); +#endif + + if (hdev->ibh_pd != NULL) + 
ib_dealloc_pd(hdev->ibh_pd); + + if (hdev->ibh_cmid != NULL) + rdma_destroy_id(hdev->ibh_cmid); + + LIBCFS_FREE(hdev, sizeof(*hdev)); +} + +#ifdef HAVE_IB_GET_DMA_MR +static int +kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev) +{ + struct ib_mr *mr; + int acflags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE; + + mr = ib_get_dma_mr(hdev->ibh_pd, acflags); + if (IS_ERR(mr)) { + CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + hdev->ibh_mrs = mr; + + return 0; +} +#endif + +static int +kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ /* DUMMY */ + return 0; +} + +static int +kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns) +{ + struct rdma_cm_id *cmid; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + if (dev->ibd_hdev == NULL || /* initializing */ + dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ + *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ + return 1; + + /* XXX: it's UGLY, but I don't have better way to find + * ib-bonding HCA failover because: + * + * a. no reliable CM event for HCA failover... + * b. no OFED API to get ib_device for current net_device... + * + * We have only two choices at this point: + * + * a. rdma_bind_addr(), it will conflict with listener cmid + * b. rdma_resolve_addr() to zero addr */ + cmid = kiblnd_rdma_create_id(ns, kiblnd_dummy_callback, dev, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + return rc; + } + + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, 1); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + return rc; + } + + rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */ + rdma_destroy_id(cmid); + return rc; +} + +int +kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) +{ + struct list_head zombie_tpo = LIST_HEAD_INIT(zombie_tpo); + struct list_head zombie_ppo = LIST_HEAD_INIT(zombie_ppo); + struct list_head zombie_fpo = LIST_HEAD_INIT(zombie_fpo); + struct rdma_cm_id *cmid = NULL; + struct kib_hca_dev *hdev = NULL; + struct kib_hca_dev *old; + struct ib_pd *pd; + struct kib_net *net; + struct sockaddr_in addr; + unsigned long flags; + int rc = 0; + int i; + + LASSERT (*kiblnd_tunables.kib_dev_failover > 1 || + dev->ibd_can_failover || + dev->ibd_hdev == NULL); + + rc = kiblnd_dev_need_failover(dev, ns); + if (rc <= 0) + goto out; + + if (dev->ibd_hdev != NULL && + dev->ibd_hdev->ibh_cmid != NULL) { + /* XXX it's not good to close old listener at here, + * because we can fail to create new listener. + * But we have to close it now, otherwise rdma_bind_addr + * will return EADDRINUSE... How crap! 
*/ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + cmid = dev->ibd_hdev->ibh_cmid; + /* make next schedule of kiblnd_dev_need_failover() + * return 1 for me */ + dev->ibd_hdev->ibh_cmid = NULL; + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + rdma_destroy_id(cmid); + } + + cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + addr.sin_port = htons(*kiblnd_tunables.kib_service); + + /* Bind to failover device or port */ + rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + goto out; + } + + LIBCFS_ALLOC(hdev, sizeof(*hdev)); + if (hdev == NULL) { + CERROR("Failed to allocate kib_hca_dev\n"); + rdma_destroy_id(cmid); + rc = -ENOMEM; + goto out; + } + + atomic_set(&hdev->ibh_ref, 1); + hdev->ibh_dev = dev; + hdev->ibh_cmid = cmid; + hdev->ibh_ibdev = cmid->device; + +#ifdef HAVE_IB_ALLOC_PD_2ARGS + pd = ib_alloc_pd(cmid->device, 0); +#else + pd = ib_alloc_pd(cmid->device); +#endif + if (IS_ERR(pd)) { + rc = PTR_ERR(pd); + CERROR("Can't allocate PD: %d\n", rc); + goto out; + } + + hdev->ibh_pd = pd; + + rc = rdma_listen(cmid, 0); + if (rc != 0) { + CERROR("Can't start new listener: %d\n", rc); + goto out; + } + + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) { + CERROR("Can't get device attributes: %d\n", rc); + goto out; + } + +#ifdef HAVE_IB_GET_DMA_MR + rc = kiblnd_hdev_setup_mrs(hdev); + if (rc != 0) { + CERROR("Can't setup device: %d\n", rc); + goto out; + } +#endif + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + old = dev->ibd_hdev; + dev->ibd_hdev = hdev; /* take over the refcount */ + hdev = old; + + list_for_each_entry(net, &dev->ibd_nets, ibn_list) { + cfs_cpt_for_each(i, lnet_cpt_table()) { + kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, + &zombie_tpo); + + if (net->ibn_fmr_ps != NULL) + kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], + &zombie_fpo); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + out: + if (!list_empty(&zombie_tpo)) + kiblnd_destroy_pool_list(&zombie_tpo); + if (!list_empty(&zombie_ppo)) + kiblnd_destroy_pool_list(&zombie_ppo); + if (!list_empty(&zombie_fpo)) + kiblnd_destroy_fmr_pool_list(&zombie_fpo); + if (hdev != NULL) + kiblnd_hdev_decref(hdev); + + if (rc != 0) + dev->ibd_failed_failover++; + else + dev->ibd_failed_failover = 0; + + return rc; +} + +void +kiblnd_destroy_dev(struct kib_dev *dev) +{ + LASSERT(dev->ibd_nnets == 0); + LASSERT(list_empty(&dev->ibd_nets)); + + list_del(&dev->ibd_fail_list); + list_del(&dev->ibd_list); + + if (dev->ibd_hdev != NULL) + kiblnd_hdev_decref(dev->ibd_hdev); + + LIBCFS_FREE(dev, sizeof(*dev)); +} + +static void +kiblnd_base_shutdown(void) +{ + struct kib_sched_info *sched; + int i; + + LASSERT(list_empty(&kiblnd_data.kib_devs)); + + CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + switch (kiblnd_data.kib_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + case IBLND_INIT_DATA: + LASSERT (kiblnd_data.kib_peers != NULL); + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + LASSERT(list_empty(&kiblnd_data.kib_peers[i])); + } + 
LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); + LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); + LASSERT(list_empty(&kiblnd_data.kib_reconn_list)); + LASSERT(list_empty(&kiblnd_data.kib_reconn_wait)); + + /* flag threads to terminate; wake and wait for them to die */ + kiblnd_data.kib_shutdown = 1; + + /* NB: we really want to stop scheduler threads net by net + * instead of the whole module, this should be improved + * with dynamic configuration LNet */ + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) + wake_up_all(&sched->ibs_waitq); + + wake_up_all(&kiblnd_data.kib_connd_waitq); + wake_up_all(&kiblnd_data.kib_failover_waitq); + + i = 2; + while (atomic_read(&kiblnd_data.kib_nthreads) != 0) { + i++; + /* power of 2? */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + fallthrough; + + case IBLND_INIT_NOTHING: + break; + } + + if (kiblnd_data.kib_peers != NULL) { + LIBCFS_FREE(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + } + + if (kiblnd_data.kib_scheds != NULL) + cfs_percpt_free(kiblnd_data.kib_scheds); + + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + kiblnd_data.kib_init = IBLND_INIT_NOTHING; + module_put(THIS_MODULE); +} + +static void +kiblnd_shutdown(struct lnet_ni *ni) +{ + struct kib_net *net = ni->ni_data; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + int i; + unsigned long flags; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); + + if (net == NULL) + goto out; + + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + write_lock_irqsave(g_lock, flags); + net->ibn_shutdown = 1; + write_unlock_irqrestore(g_lock, flags); + + switch (net->ibn_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); + + /* Wait for all peer_ni state to clean up */ + i = 2; + while (atomic_read(&net->ibn_npeers) != 0) { + i++; + /* power of 2? */ + CDEBUG(((i & (-i)) == i) ? 
D_WARNING : D_NET, + "%s: waiting for %d peers to disconnect\n", + libcfs_nid2str(ni->ni_nid), + atomic_read(&net->ibn_npeers)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + kiblnd_net_fini_pools(net); + + write_lock_irqsave(g_lock, flags); + LASSERT(net->ibn_dev->ibd_nnets > 0); + net->ibn_dev->ibd_nnets--; + list_del(&net->ibn_list); + write_unlock_irqrestore(g_lock, flags); + + fallthrough; + + case IBLND_INIT_NOTHING: + LASSERT (atomic_read(&net->ibn_nconns) == 0); + + if (net->ibn_dev != NULL && + net->ibn_dev->ibd_nnets == 0) + kiblnd_destroy_dev(net->ibn_dev); + + break; + } + + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + net->ibn_init = IBLND_INIT_NOTHING; + ni->ni_data = NULL; + + LIBCFS_FREE(net, sizeof(*net)); + +out: + if (list_empty(&kiblnd_data.kib_devs)) + kiblnd_base_shutdown(); + return; +} + +static int +kiblnd_base_startup(struct net *ns) +{ + struct kib_sched_info *sched; + int rc; + int i; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); + + try_module_get(THIS_MODULE); + memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */ + + rwlock_init(&kiblnd_data.kib_global_lock); + + INIT_LIST_HEAD(&kiblnd_data.kib_devs); + INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); + + kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; + LIBCFS_ALLOC(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + if (kiblnd_data.kib_peers == NULL) + goto failed; + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); + + spin_lock_init(&kiblnd_data.kib_connd_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); + INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); + INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); + + init_waitqueue_head(&kiblnd_data.kib_connd_waitq); + init_waitqueue_head(&kiblnd_data.kib_failover_waitq); + + kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (kiblnd_data.kib_scheds == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + int nthrs; + + spin_lock_init(&sched->ibs_lock); + INIT_LIST_HEAD(&sched->ibs_conns); + init_waitqueue_head(&sched->ibs_waitq); + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); + } else { + /* max to half of CPUs, another half is reserved for + * upper layer modules */ + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + } + + sched->ibs_nthreads_max = nthrs; + sched->ibs_cpt = i; + } + + kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; + + /* lists/ptrs/locks initialised */ + kiblnd_data.kib_init = IBLND_INIT_DATA; + /*****************************************************/ + + rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); + if (rc != 0) { + CERROR("Can't spawn o2iblnd connd: %d\n", rc); + goto failed; + } + + if (*kiblnd_tunables.kib_dev_failover != 0) + rc = kiblnd_thread_start(kiblnd_failover_thread, ns, + "kiblnd_failover"); + + if (rc != 0) { + CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + kiblnd_data.kib_init = IBLND_INIT_ALL; + /*****************************************************/ + + return 0; + + failed: + kiblnd_base_shutdown(); + return -ENETDOWN; +} + +static int +kiblnd_start_schedulers(struct kib_sched_info *sched) +{ + int rc = 0; + int nthrs; + 
int i; + + if (sched->ibs_nthreads == 0) { + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = sched->ibs_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->ibs_cpt); + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + nthrs = min(IBLND_N_SCHED_HIGH, nthrs); + } + } else { + LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); + /* increase one thread if there is new interface */ + nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); + snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", + KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); + rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->ibs_cpt, sched->ibs_nthreads + i, rc); + break; + } + + sched->ibs_nthreads += i; + return rc; +} + +static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts, + int ncpts) +{ + int cpt; + int rc; + int i; + + for (i = 0; i < ncpts; i++) { + struct kib_sched_info *sched; + + cpt = (cpts == NULL) ? i : cpts[i]; + sched = kiblnd_data.kib_scheds[cpt]; + + if (!newdev && sched->ibs_nthreads > 0) + continue; + + rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); + if (rc != 0) { + CERROR("Failed to start scheduler threads for %s\n", + dev->ibd_ifname); + return rc; + } + } + return 0; +} + +static struct kib_dev * +kiblnd_dev_search(char *ifname) +{ + struct kib_dev *alias = NULL; + struct kib_dev *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + +static int +kiblnd_startup(struct lnet_ni *ni) +{ + char *ifname = NULL; + struct lnet_inetdev *ifaces = NULL; + struct kib_dev *ibdev = NULL; + struct kib_net *net = NULL; + unsigned long flags; + int rc; + int i; + bool newdev; + + LASSERT(ni->ni_net->net_lnd == &the_o2iblnd); + + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + rc = kiblnd_base_startup(ni->ni_net_ns); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) { + rc = -ENOMEM; + goto failed; + } + + net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC; + + kiblnd_tunables_setup(ni); + + /* + * ni_interfaces is only to support legacy pre Multi-Rail + * tcp bonding for ksocklnd. Multi-Rail wants each secondary + * IP to be treated as an unique 'struct ni' interfaces instead. 
+ */ + if (ni->ni_interfaces[0] != NULL) { + /* Use the IPoIB interface specified in 'networks=' */ + if (ni->ni_interfaces[1] != NULL) { + CERROR("ko2iblnd: Multiple interfaces not supported\n"); + rc = -EINVAL; + goto failed; + } + + ifname = ni->ni_interfaces[0]; + } else { + ifname = *kiblnd_tunables.kib_default_ipif; + } + + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + rc = -E2BIG; + goto failed; + } + + rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); + if (rc < 0) + goto failed; + + for (i = 0; i < rc; i++) { + if (strcmp(ifname, ifaces[i].li_name) == 0) + break; + } + + if (i == rc) { + CERROR("ko2iblnd: No matching interfaces\n"); + rc = -ENOENT; + goto failed; + } + + ibdev = kiblnd_dev_search(ifname); + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) { + LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); + if (!ibdev) { + rc = -ENOMEM; + goto failed; + } + + ibdev->ibd_ifip = ifaces[i].li_ipaddr; + strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, + sizeof(ibdev->ibd_ifname)); + ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); + + INIT_LIST_HEAD(&ibdev->ibd_nets); + INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&ibdev->ibd_fail_list); + + /* initialize the device */ + rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); + if (rc) { + CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc); + goto failed; + } + + list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + } + + net->ibn_dev = ibdev; + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); + + ni->ni_dev_cpt = ifaces[i].li_cpt; + + rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto failed; + + rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + ibdev->ibd_nnets++; + list_add_tail(&net->ibn_list, &ibdev->ibd_nets); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + net->ibn_init = IBLND_INIT_ALL; + + return 0; + +failed: + if (net != NULL && net->ibn_dev == NULL && ibdev != NULL) + kiblnd_destroy_dev(ibdev); + + kfree(ifaces); + kiblnd_shutdown(ni); + + CDEBUG(D_NET, "Configuration of device %s failed: rc = %d\n", + ifname ? ifname : "", rc); + + return -ENETDOWN; +} + +static struct lnet_lnd the_o2iblnd = { + .lnd_type = O2IBLND, + .lnd_startup = kiblnd_startup, + .lnd_shutdown = kiblnd_shutdown, + .lnd_ctl = kiblnd_ctl, + .lnd_query = kiblnd_query, + .lnd_send = kiblnd_send, + .lnd_recv = kiblnd_recv, +}; + +static void __exit ko2iblnd_exit(void) +{ + lnet_unregister_lnd(&the_o2iblnd); +} + +static int __init ko2iblnd_init(void) +{ + int rc; + + CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE); + CLASSERT(offsetof(struct kib_msg, + ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <= + IBLND_MSG_SIZE); + CLASSERT(offsetof(struct kib_msg, + ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + + rc = kiblnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_o2iblnd); + + return 0; +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(ko2iblnd_init); +module_exit(ko2iblnd_exit); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h new file mode 100644 index 0000000000000..3e24405c2c31e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -0,0 +1,1230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd.h + * + * Author: Eric Barton + */ + +#ifdef HAVE_COMPAT_RDMA +#include + +#ifdef LINUX_3_17_COMPAT_H +#undef NEED_KTIME_GET_REAL_NS +#endif + +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#ifdef HAVE_FMR_POOL_API +#include +#endif + +#define DEBUG_SUBSYSTEM S_LND + +#include +#include + +#define IBLND_PEER_HASH_SIZE 101 /* # peer_ni lists */ +/* # scheduler loops before reschedule */ +#define IBLND_RESCHED 100 + +#define IBLND_N_SCHED 2 +#define IBLND_N_SCHED_HIGH 4 + +struct kib_tunables { + int *kib_dev_failover; /* HCA failover */ + unsigned int *kib_service; /* IB service number */ + int *kib_min_reconnect_interval; /* first failed connection retry... */ + int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kib_cksum; /* checksum struct kib_msg? 
*/ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_keepalive; /* keepalive timeout (seconds) */ + int *kib_ntx; /* # tx descs */ + char **kib_default_ipif; /* default IPoIB interface */ + int *kib_retry_count; + int *kib_rnr_retry_count; + int *kib_ib_mtu; /* IB MTU */ + int *kib_require_priv_port;/* accept only privileged ports */ + int *kib_use_priv_port; /* use privileged port for active connect */ + /* # threads on each CPT */ + int *kib_nscheds; + int *kib_wrq_sge; /* # sg elements per wrq */ + int *kib_use_fastreg_gaps; /* enable discontiguous fastreg fragment support */ +}; + +extern struct kib_tunables kiblnd_tunables; + +#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ +#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ + +#define IBLND_CREDITS_DEFAULT 8 /* default # of peer_ni credits */ +#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ + +/* when eagerly to return credits */ +#define IBLND_CREDITS_HIGHWATER(t, conn) ((conn->ibc_version) == IBLND_MSG_VERSION_1 ? \ + IBLND_CREDIT_HIGHWATER_V1 : \ + min(t->lnd_peercredits_hiw, (__u32)conn->ibc_queue_depth - 1)) + +#ifdef HAVE_RDMA_CREATE_ID_5ARG +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id((ns) ? (ns) : &init_net, cb, dev, ps, qpt) +#else +# ifdef HAVE_RDMA_CREATE_ID_4ARG +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps, qpt) +# else +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps) +# endif +#endif + +/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ +#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) +#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) + +#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */ + +/************************/ +/* derived constants... 
*/ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 + +/* RX messages (per connection) */ +#define IBLND_RX_MSGS(c) \ + ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version)) +#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES(c) \ + ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE) + +/* WRs and CQEs (per connection) */ +#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) + +/* 2 = LNet msg + Transfer chain */ +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) + +struct kib_hca_dev; + +/* o2iblnd can run over aliased interface */ +#ifdef IFALIASZ +#define KIB_IFNAME_SIZE IFALIASZ +#else +#define KIB_IFNAME_SIZE 256 +#endif + +enum kib_dev_caps { + IBLND_DEV_CAPS_FASTREG_ENABLED = BIT(0), + IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT = BIT(1), +#ifdef HAVE_FMR_POOL_API + IBLND_DEV_CAPS_FMR_ENABLED = BIT(2), +#endif +}; + +struct kib_dev { + struct list_head ibd_list; /* chain on kib_devs */ + struct list_head ibd_fail_list; /* chain on kib_failed_devs */ + __u32 ibd_ifip; /* IPoIB interface IP */ + /** IPoIB interface name */ + char ibd_ifname[KIB_IFNAME_SIZE]; + int ibd_nnets; /* # nets extant */ + + time64_t ibd_next_failover; + /* # failover failures */ + int ibd_failed_failover; + /* failover in progress */ + unsigned int ibd_failover; + /* IPoIB interface is a bonding master */ + unsigned int ibd_can_failover; + struct list_head ibd_nets; + struct kib_hca_dev *ibd_hdev; + enum kib_dev_caps ibd_dev_caps; +}; + +struct kib_hca_dev { + struct rdma_cm_id *ibh_cmid; /* listener cmid */ + struct ib_device *ibh_ibdev; /* IB device */ + int ibh_page_shift; /* page shift of current HCA */ + int ibh_page_size; /* page size of current HCA */ + __u64 ibh_page_mask; /* page mask of current HCA */ + int ibh_mr_shift; /* bits shift of max MR size */ + __u64 ibh_mr_size; /* size of MR */ + int ibh_max_qp_wr; /* maximum work requests size */ +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *ibh_mrs; /* global MR */ +#endif + struct ib_pd *ibh_pd; /* PD */ + struct kib_dev *ibh_dev; /* owner */ + atomic_t ibh_ref; /* refcount */ +}; + +/** # of seconds to keep pool alive */ +#define IBLND_POOL_DEADLINE 300 +/** # of seconds to retry if allocation failed */ +#define IBLND_POOL_RETRY 1 + +struct kib_pages { + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; /* page array */ +}; + +struct kib_pool; +struct kib_poolset; + +typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); +typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); + +struct kib_net; + +#define IBLND_POOL_NAME_LEN 32 + +struct kib_poolset { + /* serialize */ + spinlock_t ps_lock; + /* network it belongs to */ + struct kib_net *ps_net; + /* pool set name */ + char ps_name[IBLND_POOL_NAME_LEN]; + /* list of pools */ + struct list_head ps_pool_list; + /* failed pool list */ + struct list_head ps_failed_pool_list; + /* time stamp for retry if failed to allocate */ + time64_t ps_next_retry; + /* is allocating new pool */ + int ps_increasing; + /* new pool size */ + int ps_pool_size; + /* CPT id */ + int ps_cpt; + + /* create a new pool */ + kib_ps_pool_create_t ps_pool_create; + /* destroy a pool */ + 
kib_ps_pool_destroy_t ps_pool_destroy; + /* initialize new allocated node */ + kib_ps_node_init_t ps_node_init; + /* finalize node */ + kib_ps_node_fini_t ps_node_fini; +}; + +struct kib_pool { + /* chain on pool list */ + struct list_head po_list; + /* pre-allocated node */ + struct list_head po_free_list; + /* pool_set of this pool */ + struct kib_poolset *po_owner; + /* deadline of this pool */ + time64_t po_deadline; + /* # of elements in use */ + int po_allocated; + /* pool is created on failed HCA */ + int po_failed; + /* # of pre-allocated elements */ + int po_size; +}; + +struct kib_tx_poolset { + struct kib_poolset tps_poolset; /* pool-set */ + __u64 tps_next_tx_cookie; /* cookie of TX */ +}; + +struct kib_tx_pool { + struct kib_pool tpo_pool; /* pool */ + struct kib_hca_dev *tpo_hdev; /* device for this pool */ + struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ + struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */ +}; + +struct kib_fmr_poolset { + spinlock_t fps_lock; /* serialize */ + struct kib_net *fps_net; /* IB network */ + struct list_head fps_pool_list; /* FMR pool list */ + struct list_head fps_failed_pool_list; /* FMR pool list */ + __u64 fps_version; /* validity stamp */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + int fps_cache; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + time64_t fps_next_retry; +}; + +#ifndef HAVE_IB_RDMA_WR +struct ib_rdma_wr { + struct ib_send_wr wr; +}; +#endif + +struct kib_fast_reg_descriptor { /* For fast registration */ + struct list_head frd_list; + struct ib_rdma_wr frd_inv_wr; +#ifdef HAVE_IB_MAP_MR_SG + struct ib_reg_wr frd_fastreg_wr; +#else + struct ib_rdma_wr frd_fastreg_wr; + struct ib_fast_reg_page_list *frd_frpl; +#endif + struct ib_mr *frd_mr; + bool frd_valid; + bool frd_posted; +}; + +struct kib_fmr_pool { + struct list_head fpo_list; /* chain on pool list */ + struct kib_hca_dev *fpo_hdev; /* device for this pool */ + struct kib_fmr_poolset *fpo_owner; /* owner of this pool */ +#ifdef HAVE_FMR_POOL_API + union { + struct { + struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ + } fmr; +#endif + struct { /* For fast registration */ + struct list_head fpo_pool_list; + int fpo_pool_size; + } fast_reg; +#ifdef HAVE_FMR_POOL_API + }; + bool fpo_is_fmr; /* True if FMR pools allocated */ +#endif + time64_t fpo_deadline; /* deadline of this pool */ + int fpo_failed; /* fmr pool is failed */ + int fpo_map_count; /* # of mapped FMR */ +}; + +struct kib_fmr { + struct kib_fmr_pool *fmr_pool; /* pool of FMR */ +#ifdef HAVE_FMR_POOL_API + struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ +#endif /* HAVE_FMR_POOL_API */ + struct kib_fast_reg_descriptor *fmr_frd; + u32 fmr_key; +}; + +struct kib_net { + /* chain on struct kib_dev::ibd_nets */ + struct list_head ibn_list; + __u64 ibn_incarnation;/* my epoch */ + int ibn_init; /* initialisation state */ + int ibn_shutdown; /* shutting down? 
*/ + + atomic_t ibn_npeers; /* # peers extant */ + atomic_t ibn_nconns; /* # connections extant */ + + struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */ + struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */ + + struct kib_dev *ibn_dev; /* underlying IB device */ +}; + +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + spinlock_t ibs_lock; + /* schedulers sleep here */ + wait_queue_head_t ibs_waitq; + /* conns to check for rx completions */ + struct list_head ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + +struct kib_data { + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + struct list_head kib_devs; /* IB devices extant */ + /* list head of failed devices */ + struct list_head kib_failed_devs; + /* schedulers sleep here */ + wait_queue_head_t kib_failover_waitq; + atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer_ni/conn ops */ + rwlock_t kib_global_lock; + /* hash table of all my known peers */ + struct list_head *kib_peers; + /* size of kib_peers */ + int kib_peer_hash_size; + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + struct list_head kib_connd_conns; + /* connections with zero refcount */ + struct list_head kib_connd_zombies; + /* connections to reconnect */ + struct list_head kib_reconn_list; + /* peers wait for reconnection */ + struct list_head kib_reconn_wait; + /* + * The second that peers are pulled out from \a kib_reconn_wait + * for reconnection. + */ + time64_t kib_reconn_sec; + /* connection daemon sleeps here */ + wait_queue_head_t kib_connd_waitq; + spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; +}; + +#define IBLND_INIT_NOTHING 0 +#define IBLND_INIT_DATA 1 +#define IBLND_INIT_ALL 2 + +/************************************************************************ + * IB Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +struct kib_connparams { + __u16 ibcp_queue_depth; + __u16 ibcp_max_frags; + __u32 ibcp_max_msg_size; +} WIRE_ATTR; + +struct kib_immediate_msg { + struct lnet_hdr ibim_hdr; /* portals header */ + char ibim_payload[0];/* piggy-backed payload */ +} WIRE_ATTR; + +struct kib_rdma_frag { + __u32 rf_nob; /* # bytes this frag */ + __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! 
*/ +} WIRE_ATTR; + +struct kib_rdma_desc { + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + struct kib_rdma_frag rd_frags[0]; /* buffer frags */ +} WIRE_ATTR; + +struct kib_putreq_msg { + struct lnet_hdr ibprm_hdr; /* portals header */ + __u64 ibprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR; + +struct kib_putack_msg { + __u64 ibpam_src_cookie; /* reflected completion cookie */ + __u64 ibpam_dst_cookie; /* opaque completion cookie */ + struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR; + +struct kib_get_msg { + struct lnet_hdr ibgm_hdr; /* portals header */ + __u64 ibgm_cookie; /* opaque completion cookie */ + struct kib_rdma_desc ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR; + +struct kib_completion_msg { + __u64 ibcm_cookie; /* opaque completion cookie */ + __s32 ibcm_status; /* < 0 failure: >= 0 length */ +} WIRE_ATTR; + +struct kib_msg { + /* First 2 fields fixed FOR ALL TIME */ + __u32 ibm_magic; /* I'm an ibnal message */ + __u16 ibm_version; /* this is my version number */ + + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ + __u32 ibm_nob; /* # bytes in whole message */ + __u32 ibm_cksum; /* checksum (0 == no checksum) */ + __u64 ibm_srcnid; /* sender's NID */ + __u64 ibm_srcstamp; /* sender's incarnation */ + __u64 ibm_dstnid; /* destination's NID */ + __u64 ibm_dststamp; /* destination's incarnation */ + + union { + struct kib_connparams connparams; + struct kib_immediate_msg immediate; + struct kib_putreq_msg putreq; + struct kib_putack_msg putack; + struct kib_get_msg get; + struct kib_completion_msg completion; + } WIRE_ATTR ibm_u; +} WIRE_ATTR; + +#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ + +#define IBLND_MSG_VERSION_1 0x11 +#define IBLND_MSG_VERSION_2 0x12 +#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 + +#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ +#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ +#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +struct kib_rej { + __u32 ibr_magic; /* sender's magic */ + __u16 ibr_version; /* sender's version */ + __u8 ibr_why; /* reject reason */ + __u8 ibr_padding; /* padding */ + __u64 ibr_incarnation; /* incarnation of peer_ni */ + struct kib_connparams ibr_cp; /* connection parameters */ +} WIRE_ATTR; + +/* connection rejection reasons */ +#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ +#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ +#define IBLND_REJECT_FATAL 3 /* Anything else */ + +#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer_ni */ +#define IBLND_REJECT_CONN_STALE 5 /* stale peer_ni */ + +/* peer_ni's rdma frags doesn't match mine */ +#define IBLND_REJECT_RDMA_FRAGS 6 +/* peer_ni's msg queue size doesn't match mine */ +#define IBLND_REJECT_MSG_QUEUE_SIZE 7 +#define IBLND_REJECT_INVALID_SRV_ID 8 + +/***********************************************************************/ + +struct kib_rx { /* receive message */ + /* queue for attention */ + struct list_head rx_list; + /* owning conn */ + struct kib_conn *rx_conn; + /* # bytes received 
(-1 while posted) */ + int rx_nob; + /* completion status */ + enum ib_wc_status rx_status; + /* message buffer (host vaddr) */ + struct kib_msg *rx_msg; + /* message buffer (I/O addr) */ + __u64 rx_msgaddr; + /* for dma_unmap_single() */ + DEFINE_DMA_UNMAP_ADDR(rx_msgunmap); + /* receive work item... */ + struct ib_recv_wr rx_wrq; + /* ...and its memory */ + struct ib_sge rx_sge; +}; + +#define IBLND_POSTRX_DONT_POST 0 /* don't post */ +#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer_ni back 1 credit */ +#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ + +struct kib_tx { /* transmit message */ + /* queue on idle_txs ibc_tx_queue etc. */ + struct list_head tx_list; + /* pool I'm from */ + struct kib_tx_pool *tx_pool; + /* owning conn */ + struct kib_conn *tx_conn; + /* # tx callbacks outstanding */ + short tx_sending; + /* queued for sending */ + short tx_queued; + /* waiting for peer_ni */ + short tx_waiting; + /* LNET completion status */ + int tx_status; + /* health status of the transmit */ + enum lnet_msg_hstatus tx_hstatus; + /* completion deadline */ + ktime_t tx_deadline; + /* completion cookie */ + __u64 tx_cookie; + /* lnet msgs to finalize on completion */ + struct lnet_msg *tx_lntmsg[2]; + /* message buffer (host vaddr) */ + struct kib_msg *tx_msg; + /* message buffer (I/O addr) */ + __u64 tx_msgaddr; + /* for dma_unmap_single() */ + DEFINE_DMA_UNMAP_ADDR(tx_msgunmap); + /** sge for tx_msgaddr */ + struct ib_sge tx_msgsge; + /* # send work items */ + int tx_nwrq; + /* # used scatter/gather elements */ + int tx_nsge; + /* send work items... */ + struct ib_rdma_wr *tx_wrq; + /* ...and their memory */ + struct ib_sge *tx_sge; + /* rdma descriptor */ + struct kib_rdma_desc *tx_rd; + /* # entries in... 
*/ + int tx_nfrags; + /* dma_map_sg descriptor */ + struct scatterlist *tx_frags; + /* rdma phys page addrs */ + __u64 *tx_pages; + /* gaps in fragments */ + bool tx_gaps; + /* FMR */ + struct kib_fmr tx_fmr; + /* dma direction */ + int tx_dmadir; +}; + +struct kib_connvars { + /* connection-in-progress variables */ + struct kib_msg cv_msg; +}; + +struct kib_conn { + /* scheduler information */ + struct kib_sched_info *ibc_sched; + /* owning peer_ni */ + struct kib_peer_ni *ibc_peer; + /* HCA bound on */ + struct kib_hca_dev *ibc_hdev; + /* stash on peer_ni's conn list */ + struct list_head ibc_list; + /* schedule for attention */ + struct list_head ibc_sched_list; + /* version of connection */ + __u16 ibc_version; + /* reconnect later */ + __u16 ibc_reconnect:1; + /* which instance of the peer */ + __u64 ibc_incarnation; + /* # users */ + atomic_t ibc_refcount; + /* what's happening */ + int ibc_state; + /* # uncompleted sends */ + int ibc_nsends_posted; + /* # uncompleted NOOPs */ + int ibc_noops_posted; + /* # credits I have */ + int ibc_credits; + /* # credits to return */ + int ibc_outstanding_credits; + /* # ACK/DONE msg credits */ + int ibc_reserved_credits; + /* set on comms error */ + int ibc_comms_error; + /* connections queue depth */ + __u16 ibc_queue_depth; + /* connections max frags */ + __u16 ibc_max_frags; + /* receive buffers owned */ + unsigned int ibc_nrx:16; + /* scheduled for attention */ + unsigned int ibc_scheduled:1; + /* CQ callback fired */ + unsigned int ibc_ready:1; + /* time of last send */ + ktime_t ibc_last_send; + /** link chain for kiblnd_check_conns only */ + struct list_head ibc_connd_list; + /** rxs completed before ESTABLISHED */ + struct list_head ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + struct list_head ibc_tx_noops; + /* sends that need a credit */ + struct list_head ibc_tx_queue; + /* sends that don't need a credit */ + struct list_head ibc_tx_queue_nocred; + /* sends that need to reserve an ACK/DONE msg */ + struct list_head ibc_tx_queue_rsrvd; + /* active tx awaiting completion */ + struct list_head ibc_active_txs; + /* zombie tx awaiting done */ + struct list_head ibc_zombie_txs; + /* serialise */ + spinlock_t ibc_lock; + /* the rx descs */ + struct kib_rx *ibc_rxs; + /* premapped rx msg pages */ + struct kib_pages *ibc_rx_pages; + + /* CM id */ + struct rdma_cm_id *ibc_cmid; + /* completion queue */ + struct ib_cq *ibc_cq; + + /* in-progress connection state */ + struct kib_connvars *ibc_connvars; +}; + +#define IBLND_CONN_INIT 0 /* being initialised */ +#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ +#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ +#define IBLND_CONN_ESTABLISHED 3 /* connection established */ +#define IBLND_CONN_CLOSING 4 /* being closed */ +#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ + +struct kib_peer_ni { + /* stash on global peer_ni list */ + struct list_head ibp_list; + /* who's on the other end(s) */ + lnet_nid_t ibp_nid; + /* LNet interface */ + struct lnet_ni *ibp_ni; + /* all active connections */ + struct list_head ibp_conns; + /* next connection to send on for round robin */ + struct kib_conn *ibp_next_conn; + /* msgs waiting for a conn */ + struct list_head ibp_tx_queue; + /* incarnation of peer_ni */ + __u64 ibp_incarnation; + /* when (in seconds) I was last alive */ + time64_t ibp_last_alive; + /* # users */ + atomic_t ibp_refcount; + /* version of peer_ni */ + __u16 ibp_version; + /* current passive connection attempts */ + unsigned short 
ibp_accepting; + /* current active connection attempts */ + unsigned short ibp_connecting; + /* reconnect this peer_ni later */ + unsigned char ibp_reconnecting; + /* counter of how many times we triggered a conn race */ + unsigned char ibp_races; + /* # consecutive reconnection attempts to this peer */ + unsigned int ibp_reconnected; + /* number of total active retries */ + unsigned int ibp_retries; + /* errno on closing this peer_ni */ + int ibp_error; + /* max map_on_demand */ + __u16 ibp_max_frags; + /* max_peer_credits */ + __u16 ibp_queue_depth; +}; + +#ifndef HAVE_IB_INC_RKEY +/** + * ib_inc_rkey - increments the key portion of the given rkey. Can be used + * for calculating a new rkey for type 2 memory windows. + * @rkey - the rkey to increment. + */ +static inline u32 ib_inc_rkey(u32 rkey) +{ + const u32 mask = 0x000000ff; + return ((rkey + 1) & mask) | (rkey & ~mask); +} +#endif + +extern struct kib_data kiblnd_data; + +extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev); + +int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); + +static inline int +kiblnd_concurrent_sends(int version, struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int concurrent_sends; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + concurrent_sends = tunables->lnd_concurrent_sends; + + if (version == IBLND_MSG_VERSION_1) { + if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) + return IBLND_MSG_QUEUE_SIZE_V1 * 2; + + if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) + return IBLND_MSG_QUEUE_SIZE_V1 / 2; + } + + return concurrent_sends; +} + +static inline void +kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev) +{ + LASSERT(atomic_read(&hdev->ibh_ref) > 0); + atomic_inc(&hdev->ibh_ref); +} + +static inline void +kiblnd_hdev_decref(struct kib_hca_dev *hdev) +{ + LASSERT(atomic_read(&hdev->ibh_ref) > 0); + if (atomic_dec_and_test(&hdev->ibh_ref)) + kiblnd_hdev_destroy(hdev); +} + +static inline int +kiblnd_dev_can_failover(struct kib_dev *dev) +{ + if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ + return 1; + + return dev->ibd_can_failover; +} + +#define kiblnd_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kiblnd_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kiblnd_data.kib_connd_zombies); \ + wake_up(&kiblnd_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ + } \ +} while (0) + +#define kiblnd_peer_addref(peer_ni) \ +do { \ + CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n", \ + (peer_ni), libcfs_nid2str((peer_ni)->ibp_nid), \ + atomic_read (&(peer_ni)->ibp_refcount)); \ + atomic_inc(&(peer_ni)->ibp_refcount); \ +} while (0) + +#define kiblnd_peer_decref(peer_ni) \ +do { \ + CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n", \ + (peer_ni), libcfs_nid2str((peer_ni)->ibp_nid), \ + atomic_read (&(peer_ni)->ibp_refcount)); \ + LASSERT_ATOMIC_POS(&(peer_ni)->ibp_refcount); \ + if (atomic_dec_and_test(&(peer_ni)->ibp_refcount)) \ + 
kiblnd_destroy_peer(peer_ni); \ +} while (0) + +static inline bool +kiblnd_peer_connecting(struct kib_peer_ni *peer_ni) +{ + return peer_ni->ibp_connecting != 0 || + peer_ni->ibp_reconnecting != 0 || + peer_ni->ibp_accepting != 0; +} + +static inline bool +kiblnd_peer_idle(struct kib_peer_ni *peer_ni) +{ + return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns); +} + +static inline struct list_head * +kiblnd_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = + ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size; + + return &kiblnd_data.kib_peers[hash]; +} + +static inline int +kiblnd_peer_active(struct kib_peer_ni *peer_ni) +{ + /* Am I in the peer_ni hash table? */ + return !list_empty(&peer_ni->ibp_list); +} + +static inline struct kib_conn * +kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni) +{ + struct list_head *next; + + LASSERT(!list_empty(&peer_ni->ibp_conns)); + + /* Advance to next connection, be sure to skip the head node */ + if (!peer_ni->ibp_next_conn || + peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns) + next = peer_ni->ibp_conns.next; + else + next = peer_ni->ibp_next_conn->ibc_list.next; + peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list); + + return peer_ni->ibp_next_conn; +} + +static inline int +kiblnd_send_keepalive(struct kib_conn *conn) +{ + s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC; + + return (*kiblnd_tunables.kib_keepalive > 0) && + ktime_after(ktime_get(), + ktime_add_ns(conn->ibc_last_send, keepalive_ns)); +} + +static inline int +kiblnd_need_noop(struct kib_conn *conn) +{ + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + if (conn->ibc_outstanding_credits < + IBLND_CREDITS_HIGHWATER(tunables, conn) && + !kiblnd_send_keepalive(conn)) + return 0; /* No need to send NOOP */ + + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) + return 0; /* NOOP can be piggybacked */ + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); + } + + if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ + conn->ibc_credits == 0) /* no credit */ + return 0; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); +} + +static inline void +kiblnd_abort_receives(struct kib_conn *conn) +{ + ib_modify_qp(conn->ibc_cmid->qp, + &kiblnd_data.kib_error_qpa, IB_QP_STATE); +} + +static inline const char * +kiblnd_queue2str(struct kib_conn *conn, struct list_head *q) +{ + if (q == &conn->ibc_tx_queue) + return "tx_queue"; + + if (q == &conn->ibc_tx_queue_rsrvd) + return "tx_queue_rsrvd"; + + if (q == &conn->ibc_tx_queue_nocred) + return "tx_queue_nocred"; + + if (q == &conn->ibc_active_txs) + return "active_txs"; + + LBUG(); + return NULL; +} + +/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the + * lowest bits of the work request id to stash the work item type. 
*/ + +#define IBLND_WID_INVAL 0 +#define IBLND_WID_TX 1 +#define IBLND_WID_RX 2 +#define IBLND_WID_RDMA 3 +#define IBLND_WID_MR 4 +#define IBLND_WID_MASK 7UL + +static inline __u64 +kiblnd_ptr2wreqid (void *ptr, int type) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & IBLND_WID_MASK) == 0); + LASSERT ((type & ~IBLND_WID_MASK) == 0); + return (__u64)(lptr | type); +} + +static inline void * +kiblnd_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); +} + +static inline int +kiblnd_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBLND_WID_MASK); +} + +static inline void +kiblnd_set_conn_state(struct kib_conn *conn, int state) +{ + conn->ibc_state = state; + smp_mb(); +} + +static inline void +kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob; +} + +static inline int +kiblnd_rd_size(struct kib_rdma_desc *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrags; i++) + size += rd->rd_frags[i].rf_nob; + + return size; +} + +static inline __u64 +kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_frags[index].rf_addr; +} + +static inline __u32 +kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_frags[index].rf_nob; +} + +static inline __u32 +kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index) +{ + return rd->rd_key; +} + +static inline int +kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) +{ + if (nob < rd->rd_frags[index].rf_nob) { + rd->rd_frags[index].rf_addr += nob; + rd->rd_frags[index].rf_nob -= nob; + } else { + index ++; + } + + return index; +} + +static inline int +kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) +{ + LASSERT (msgtype == IBLND_MSG_GET_REQ || + msgtype == IBLND_MSG_PUT_ACK); + + return msgtype == IBLND_MSG_GET_REQ ? 
+ offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) : + offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]); +} + +static inline __u64 +kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return ib_dma_mapping_error(dev, dma_addr); +} + +static inline __u64 kiblnd_dma_map_single(struct ib_device *dev, + void *msg, size_t size, + enum dma_data_direction direction) +{ + return ib_dma_map_single(dev, msg, size, direction); +} + +static inline void kiblnd_dma_unmap_single(struct ib_device *dev, + __u64 addr, size_t size, + enum dma_data_direction direction) +{ + ib_dma_unmap_single(dev, addr, size, direction); +} + +#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0) +#define KIBLND_UNMAP_ADDR(p, m, a) (a) + +static inline int kiblnd_dma_map_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + return ib_dma_map_sg(dev, sg, nents, direction); +} + +static inline void kiblnd_dma_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + ib_dma_unmap_sg(dev, sg, nents, direction); +} + +#ifndef HAVE_IB_SG_DMA_ADDRESS +#include +#define ib_sg_dma_address(dev, sg) sg_dma_address(sg) +#define ib_sg_dma_len(dev, sg) sg_dma_len(sg) +#endif + +static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_address(dev, sg); +} + +static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_len(dev, sg); +} + +#ifndef HAVE_RDMA_CONNECT_LOCKED +#define rdma_connect_locked(cmid, cpp) rdma_connect(cmid, cpp) +#endif + +/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly + * right because OFED1.2 defines it as const, to use it we have to add + * (void *) cast to overcome "const" */ + +#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) +#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) + +void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs); +void kiblnd_map_rx_descs(struct kib_conn *conn); +void kiblnd_unmap_rx_descs(struct kib_conn *conn); +void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps); + +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr); +void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status); + +int kiblnd_tunables_setup(struct lnet_ni *ni); +int kiblnd_tunables_init(void); + +int kiblnd_connd (void *arg); +int kiblnd_scheduler(void *arg); +int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); +int kiblnd_failover_thread (void *arg); + +int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages); + +int kiblnd_cm_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event); +int kiblnd_translate_mtu(int value); + +int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns); +int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, + lnet_nid_t nid); +void kiblnd_destroy_peer(struct kib_peer_ni *peer); +bool kiblnd_reconnect_peer(struct kib_peer_ni *peer); +void kiblnd_destroy_dev(struct kib_dev *dev); +void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni); +struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); +int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, + int version, u64 incarnation); +int kiblnd_close_peer_conns_locked(struct 
kib_peer_ni *peer_ni, int why); + +struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni, + struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn(struct kib_conn *conn); +void kiblnd_close_conn(struct kib_conn *conn, int error); +void kiblnd_close_conn_locked(struct kib_conn *conn, int error); + +void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid); +void kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus); + +void kiblnd_qp_event(struct ib_event *event, void *arg); +void kiblnd_cq_event(struct ib_event *event, void *arg); +void kiblnd_cq_completion(struct ib_cq *cq, void *arg); + +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp); +int kiblnd_unpack_msg(struct kib_msg *msg, int nob); +int kiblnd_post_rx(struct kib_rx *rx, int credit); + +int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, struct kvec *iov, + lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, + unsigned int rlen); + diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c new file mode 100644 index 0000000000000..72079ead79c4d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -0,0 +1,4023 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/klnds/o2iblnd/o2iblnd_cb.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +#define MAX_CONN_RACES_BEFORE_ABORT 20 + +static void kiblnd_peer_alive(struct kib_peer_ni *peer_ni); +static void kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error); +static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, + int type, int body_nob); +static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie); +static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); +static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); + +static void kiblnd_unmap_tx(struct kib_tx *tx); +static void kiblnd_check_sends_locked(struct kib_conn *conn); + +void +kiblnd_tx_done(struct kib_tx *tx) +{ + struct lnet_msg *lntmsg[2]; + int rc; + int i; + + LASSERT (!in_interrupt()); + LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer_ni response */ + LASSERT (tx->tx_pool != NULL); + + kiblnd_unmap_tx(tx); + + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; + + if (tx->tx_conn != NULL) { + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nwrq = tx->tx_nsge = 0; + tx->tx_status = 0; + + kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + /* propagate health status to LNet for requests */ + if (i == 0 && lntmsg[i]) + lntmsg[i]->msg_health_status = tx->tx_hstatus; + + lnet_finalize(lntmsg[i], rc); + } +} + +void +kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus) +{ + struct kib_tx *tx; + + while (!list_empty(txlist)) { + tx = list_entry(txlist->next, struct kib_tx, tx_list); + + list_del(&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + if (hstatus != LNET_MSG_STATUS_OK) + tx->tx_hstatus = hstatus; + kiblnd_tx_done(tx); + } +} + +static struct kib_tx * +kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) +{ + struct kib_net *net = ni->ni_data; + struct list_head *node; + struct kib_tx *tx; + struct kib_tx_poolset *tps; + + tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)]; + node = kiblnd_pool_alloc_node(&tps->tps_poolset); + if (node == NULL) + return NULL; + tx = container_of(node, struct kib_tx, tx_list); + + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + LASSERT (tx->tx_nfrags == 0); + + tx->tx_gaps = false; + tx->tx_hstatus = LNET_MSG_STATUS_OK; + + return tx; +} + +static void +kiblnd_drop_rx(struct kib_rx *rx) +{ + struct kib_conn *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + spin_lock_irqsave(&sched->ibs_lock, flags); + LASSERT(conn->ibc_nrx > 0); + conn->ibc_nrx--; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_conn_decref(conn); +} + +int +kiblnd_post_rx(struct kib_rx *rx, int credit) +{ + struct kib_conn *conn = rx->rx_conn; + struct kib_net *net = 
conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; +#endif + int rc; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + LASSERT (credit == IBLND_POSTRX_NO_CREDIT || + credit == IBLND_POSTRX_PEER_CREDIT || + credit == IBLND_POSTRX_RSRVD_CREDIT); +#ifdef HAVE_IB_GET_DMA_MR + LASSERT(mr != NULL); + + rx->rx_sge.lkey = mr->lkey; +#else + rx->rx_sge.lkey = conn->ibc_hdev->ibh_pd->local_dma_lkey; +#endif + rx->rx_sge.addr = rx->rx_msgaddr; + rx->rx_sge.length = IBLND_MSG_SIZE; + + rx->rx_wrq.next = NULL; + rx->rx_wrq.sg_list = &rx->rx_sge; + rx->rx_wrq.num_sge = 1; + rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); + + LASSERT (conn->ibc_state >= IBLND_CONN_INIT); + LASSERT (rx->rx_nob >= 0); /* not posted */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return 0; + } + + rx->rx_nob = -1; /* flag posted */ + + /* NB: need an extra reference after ib_post_recv because we don't + * own this rx (and rx::rx_conn) anymore, LU-5678. + */ + kiblnd_conn_addref(conn); +#ifdef HAVE_IB_POST_SEND_RECV_CONST + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, + (const struct ib_recv_wr **)&bad_wrq); +#else + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); +#endif + if (unlikely(rc != 0)) { + CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); + rx->rx_nob = 0; + } + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ + goto out; + + if (unlikely(rc != 0)) { + kiblnd_close_conn(conn, rc); + kiblnd_drop_rx(rx); /* No more posts for this rx */ + goto out; + } + + if (credit == IBLND_POSTRX_NO_CREDIT) + goto out; + + spin_lock(&conn->ibc_lock); + if (credit == IBLND_POSTRX_PEER_CREDIT) + conn->ibc_outstanding_credits++; + else + conn->ibc_reserved_credits++; + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + +out: + kiblnd_conn_decref(conn); + return rc; +} + +static struct kib_tx * +kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, u64 cookie) +{ + struct list_head *tmp; + + list_for_each(tmp, &conn->ibc_active_txs) { + struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list); + + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_sending != 0 || tx->tx_waiting); + + if (tx->tx_cookie != cookie) + continue; + + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; + + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} + +static void +kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, u64 cookie) +{ + struct kib_tx *tx; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int idle; + + spin_lock(&conn->ibc_lock); + + tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie %#llx from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } + + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? 
*/ + tx->tx_status = status; + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(tx); +} + +static void +kiblnd_send_completion(struct kib_conn *conn, int type, int status, u64 cookie) +{ + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg)); + + kiblnd_queue_tx(tx, conn); +} + +static void +kiblnd_handle_rx(struct kib_rx *rx) +{ + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int credits = msg->ibm_credits; + struct kib_tx *tx; + int rc = 0; + int rc2; + int post_credit; + + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? */ + spin_lock(&conn->ibc_lock); + + if (conn->ibc_credits + credits > + conn->ibc_queue_depth) { + rc2 = conn->ibc_credits; + spin_unlock(&conn->ibc_lock); + + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, + conn->ibc_queue_depth); + + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } + + conn->ibc_credits += credits; + + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP && + !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ + conn->ibc_outstanding_credits++; + + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + } + + switch (msg->ibm_type) { + default: + CERROR("Bad IBLND message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_NO_CREDIT; + rc = -EPROTO; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + if (credits != 0) /* credit already posted */ + post_credit = IBLND_POSTRX_NO_CREDIT; + else /* a keepalive NOOP */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_IMMEDIATE: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_NAK: + CWARN ("PUT_NACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_PUT_ACK: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + + spin_lock(&conn->ibc_lock); + tx = 
kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); + + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } + + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer_ni has received it! + * (b) tx_waiting set tells tx_complete() it's not done. */ + + tx->tx_nwrq = tx->tx_nsge = 0; /* overwrite PUT_REQ */ + + rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, + kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBLND_MSG_PUT_DONE: + post_credit = IBLND_POSTRX_PEER_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_GET_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_GET_DONE: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + if (rc < 0) /* protocol error */ + kiblnd_close_conn(conn, rc); + + if (post_credit != IBLND_POSTRX_DONT_POST) + kiblnd_post_rx(rx, post_credit); +} + +static void +kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) +{ + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_net *net = ni->ni_data; + int rc; + int err = -EIO; + + LASSERT (net != NULL); + LASSERT (rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) + goto ignore; + + if (status != IB_WC_SUCCESS) { + CNETERR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), status); + goto failed; + } + + LASSERT (nob >= 0); + rx->rx_nob = nob; + + rc = kiblnd_unpack_msg(msg, rx->rx_nob); + if (rc != 0) { + CERROR ("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != ni->ni_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != net->ibn_incarnation) { + CERROR ("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; + goto failed; + } + + /* set time last known alive */ + kiblnd_peer_alive(conn->ibc_peer); + + /* racing with connection establishment/teardown! 
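+ * If the connection isn't fully established yet, stash this rx on
+ * ibc_early_rxs under the global write lock; kiblnd_handle_early_rxs()
+ * will replay it through kiblnd_handle_rx() once the connection is up.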
*/ + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + + write_lock_irqsave(g_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(g_lock, flags); + return; + } + write_unlock_irqrestore(g_lock, flags); + } + kiblnd_handle_rx(rx); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kiblnd_close_conn(conn, err); + ignore: + kiblnd_drop_rx(rx); /* Don't re-post rx. */ +} + +static int +kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob) +{ + struct kib_hca_dev *hdev; + struct kib_dev *dev; + struct kib_fmr_poolset *fps; + int cpt; + int rc; + int i; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + dev = net->ibn_dev; + hdev = tx->tx_pool->tpo_hdev; + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + /* + * If we're dealing with FastReg, but the device doesn't + * support GAPS and the tx has GAPS, then there is no real point + * in trying to map the memory, because it'll just fail. So + * preemptively fail with an appropriate message + */ + if ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) && + !(dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT) && + tx->tx_gaps) { + CERROR("Using FastReg with no GAPS support, but tx has gaps. " + "Try setting use_fastreg_gaps to 1\n"); + return -EPROTONOSUPPORT; + } + +#ifdef HAVE_FMR_POOL_API + /* + * FMR does not support gaps but the tx has gaps then + * we should make sure that the number of fragments we'll be sending + * over fits within the number of fragments negotiated on the + * connection, otherwise, we won't be able to RDMA the data. + * We need to maintain the number of fragments negotiation on the + * connection for backwards compatibility. + */ + if (tx->tx_gaps && (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)) { + if (tx->tx_conn && + tx->tx_conn->ibc_max_frags <= rd->rd_nfrags) { + CERROR("TX number of frags (%d) is <= than connection" + " number of frags (%d). Consider setting peer's" + " map_on_demand to 256\n", tx->tx_nfrags, + tx->tx_conn->ibc_max_frags); + return -EFBIG; + } + } +#endif + + fps = net->ibn_fmr_ps[cpt]; + rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->tx_fmr); + if (rc != 0) { + CERROR("Can't map %u bytes (%u/%u)s: %d\n", nob, + tx->tx_nfrags, rd->rd_nfrags, rc); + return rc; + } + + /* + * If rd is not tx_rd, it's going to get sent to a peer_ni, who will + * need the rkey + */ + rd->rd_key = tx->tx_fmr.fmr_key; + /* + * for FastReg or FMR with no gaps we can accumulate all + * the fragments in one FastReg or FMR fragment. + */ + if ( +#ifdef HAVE_FMR_POOL_API + ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + && !tx->tx_gaps) || +#endif + (dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)) { + /* FMR requires zero based address */ +#ifdef HAVE_FMR_POOL_API + if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; +#endif + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; + } else { + /* + * We're transmitting with gaps using FMR. + * We'll need to use multiple fragments and identify the + * zero based address of each fragment. 
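+ * Each fragment keeps only its offset within its page (rf_addr masked with
+ * ~ibh_page_mask) and is then rebased onto page 'i' of the zero-based
+ * mapping, so fragment i lands at (i << ibh_page_shift) plus that offset.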
+ */ + for (i = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[i].rf_addr += i << hdev->ibh_page_shift; + } + } + + return 0; +} + +static void +kiblnd_unmap_tx(struct kib_tx *tx) +{ + if ( +#ifdef HAVE_FMR_POOL_API + tx->tx_fmr.fmr_pfmr || +#endif + tx->tx_fmr.fmr_frd) + kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status); + + if (tx->tx_nfrags != 0) { + kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + tx->tx_nfrags = 0; + } +} + +#ifdef HAVE_IB_GET_DMA_MR +static struct ib_mr * +kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd) +{ + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* + * if map-on-demand is turned on and the device supports + * either FMR or FastReg then use that. Otherwise use global + * memory regions. If that's not available either, then you're + * dead in the water and fail the operation. + */ + if (tunables->lnd_map_on_demand && + (net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED +#ifdef HAVE_FMR_POOL_API + || net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED +#endif + )) + return NULL; + + /* + * hdev->ibh_mrs can be NULL. This case is dealt with gracefully + * in the call chain. The mapping will fail with appropriate error + * message. + */ + return hdev->ibh_mrs; +} +#endif + +static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nfrags) +{ + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = NULL; +#endif + __u32 nob; + int i; + + /* If rd is not tx_rd, it's going to get sent to a peer_ni and I'm the + * RDMA sink */ + tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + tx->tx_nfrags = nfrags; + + rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags, + tx->tx_nfrags, tx->tx_dmadir); + + for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( + hdev->ibh_ibdev, &tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( + hdev->ibh_ibdev, &tx->tx_frags[i]); + nob += rd->rd_frags[i].rf_nob; + } + +#ifdef HAVE_IB_GET_DMA_MR + mr = kiblnd_find_rd_dma_mr(ni, rd); + if (mr != NULL) { + /* found pre-mapping MR */ + rd->rd_key = (rd != tx->tx_rd) ? 
mr->rkey : mr->lkey; + return 0; + } +#endif + + if (net->ibn_fmr_ps != NULL) + return kiblnd_fmr_map_tx(net, tx, rd, nob); + + return -EINVAL; +} + +static int kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, unsigned int niov, + struct kvec *iov, int offset, int nob) +{ + struct kib_net *net = ni->ni_data; + struct page *page; + struct scatterlist *sg; + unsigned long vaddr; + int fragnob; + int page_offset; + unsigned int max_niov; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (net != NULL); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + max_niov = niov; + + sg = tx->tx_frags; + do { + LASSERT(niov > 0); + + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = lnet_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page\n"); + return -EFAULT; + } + + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + /* + * We're allowed to start at a non-aligned page offset in + * the first fragment and end at a non-aligned page offset + * in the last fragment. + */ + if ((fragnob < (int)PAGE_SIZE - page_offset) && + (niov < max_niov) && nob > fragnob) { + CDEBUG(D_NET, "fragnob %d < available page %d: with" + " remaining %d iovs with %d nob left\n", + fragnob, (int)PAGE_SIZE - page_offset, niov, + nob); + tx->tx_gaps = true; + } + + sg_set_page(sg, page, fragnob, page_offset); + sg = sg_next(sg); + if (!sg) { + CERROR("lacking enough sg entries to map tx\n"); + return -EFAULT; + } + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; + } + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nkiov, + lnet_kiov_t *kiov, int offset, int nob) +{ + struct kib_net *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + int max_nkiov; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(net != NULL); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + max_nkiov = nkiov; + + sg = tx->tx_frags; + do { + LASSERT(nkiov > 0); + + fragnob = min((int)(kiov->kiov_len - offset), nob); + + /* + * We're allowed to start at a non-aligned page offset in + * the first fragment and end at a non-aligned page offset + * in the last fragment. 
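+ * If a fragment stops short of its kiov while more kiovs and data remain,
+ * the payload has gaps; record that in tx_gaps so kiblnd_fmr_map_tx() can
+ * reject or compensate for it when mapping the region.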
+ */ + if ((fragnob < (int)(kiov->kiov_len - offset)) && + nkiov < max_nkiov && nob > fragnob) { + CDEBUG(D_NET, "fragnob %d < available page %d: with" + " remaining %d kiovs with %d nob left\n", + fragnob, (int)(kiov->kiov_len - offset), + nkiov, nob); + tx->tx_gaps = true; + } + + sg_set_page(sg, kiov->kiov_page, fragnob, + kiov->kiov_offset + offset); + sg = sg_next(sg); + if (!sg) { + CERROR("lacking enough sg entries to map tx\n"); + return -EFAULT; + } + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +static int +kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) +__must_hold(&conn->ibc_lock) +{ + struct kib_msg *msg = tx->tx_msg; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd; + int ver = conn->ibc_version; + int rc; + int done; + + LASSERT(tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT(tx->tx_nwrq > 0 && tx->tx_nsge >= 0); + LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags); + + LASSERT(credit == 0 || credit == 1); + LASSERT(conn->ibc_outstanding_credits >= 0); + LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth); + LASSERT(conn->ibc_credits >= 0); + LASSERT(conn->ibc_credits <= conn->ibc_queue_depth); + + if (conn->ibc_nsends_posted == + kiblnd_concurrent_sends(ver, ni)) { + /* tx completions outstanding... */ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && !IBLND_OOB_CAPABLE(ver) && + conn->ibc_credits == 1 && /* last credit reserved */ + msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + /* NB don't drop ibc_lock before bumping tx_sending */ + list_del(&tx->tx_list); + tx->tx_queued = 0; + + if (msg->ibm_type == IBLND_MSG_NOOP && + (!kiblnd_need_noop(conn) || /* redundant NOOP */ + (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ + conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { + /* OK to drop when posted enough NOOPs, since + * kiblnd_check_sends_locked will queue NOOP again when + * posted NOOPs complete */ + spin_unlock(&conn->ibc_lock); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_noops_posted); + return 0; + } + + kiblnd_pack_msg(peer_ni->ibp_ni, msg, ver, conn->ibc_outstanding_credits, + peer_ni->ibp_nid, conn->ibc_incarnation); + + conn->ibc_credits -= credit; + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted++; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() + * from the first send; hence the ++ rather than = below. */ + tx->tx_sending++; + list_add(&tx->tx_list, &conn->ibc_active_txs); + + /* I'm still holding ibc_lock! 
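+ * Time to post: if the tx uses a FastReg descriptor that isn't registered
+ * yet, the (optional) invalidate and fastreg work requests are chained
+ * ahead of the message work request so the HCA executes them first.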
*/ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { + struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; + struct ib_send_wr *wr = &tx->tx_wrq[0].wr; + + if (frd != NULL && !frd->frd_posted) { + if (!frd->frd_valid) { + wr = &frd->frd_inv_wr.wr; + wr->next = &frd->frd_fastreg_wr.wr; + } else { + wr = &frd->frd_fastreg_wr.wr; + } + frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr; + } + + LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX), + "bad wr_id %#llx, opc %d, flags %d, peer_ni: %s\n", + bad->wr_id, bad->opcode, bad->send_flags, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + bad = NULL; + if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus)) + rc = -EINVAL; + else +#ifdef HAVE_IB_POST_SEND_RECV_CONST + rc = ib_post_send(conn->ibc_cmid->qp, wr, + (const struct ib_send_wr **)&bad); +#else + rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); +#endif + } + + conn->ibc_last_send = ktime_get(); + + if (rc == 0) { + if (frd != NULL) + frd->frd_posted = true; + return 0; + } + + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_credits += credit; + conn->ibc_outstanding_credits += msg->ibm_credits; + conn->ibc_nsends_posted--; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + tx->tx_status = rc; + tx->tx_waiting = 0; + tx->tx_sending--; + + done = (tx->tx_sending == 0); + if (done) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CERROR("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer_ni->ibp_nid)); + else + CDEBUG(D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer_ni->ibp_nid)); + + kiblnd_close_conn(conn, rc); + + if (done) + kiblnd_tx_done(tx); + + spin_lock(&conn->ibc_lock); + + return -EIO; +} + +static void +kiblnd_check_sends_locked(struct kib_conn *conn) +{ + int ver = conn->ibc_version; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_tx *tx; + + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + LASSERT(conn->ibc_nsends_posted <= + kiblnd_concurrent_sends(ver, ni)); + LASSERT (!IBLND_OOB_CAPABLE(ver) || + conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + struct kib_tx, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + + if (kiblnd_need_noop(conn)) { + spin_unlock(&conn->ibc_lock); + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx != NULL) + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); + + spin_lock(&conn->ibc_lock); + if (tx != NULL) + kiblnd_queue_tx_locked(tx, conn); + } + + for (;;) { + int credit; + + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + credit = 0; + tx = list_entry(conn->ibc_tx_queue_nocred.next, + struct kib_tx, tx_list); + } else if (!list_empty(&conn->ibc_tx_noops)) { + LASSERT (!IBLND_OOB_CAPABLE(ver)); + credit = 1; + tx = list_entry(conn->ibc_tx_noops.next, + struct kib_tx, tx_list); + } else if (!list_empty(&conn->ibc_tx_queue)) { + 
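+ /* ordinary message from ibc_tx_queue: consumes one peer_ni credit */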
credit = 1; + tx = list_entry(conn->ibc_tx_queue.next, + struct kib_tx, tx_list); + } else + break; + + if (kiblnd_post_tx_locked(conn, tx, credit) != 0) + break; + } +} + +static void +kiblnd_tx_complete(struct kib_tx *tx, int status) +{ + int failed = (status != IB_WC_SUCCESS); + struct kib_conn *conn = tx->tx_conn; + int idle; + + if (tx->tx_sending <= 0) { + CERROR("Received an event on a freed tx: %p status %d\n", + tx, tx->tx_status); + return; + } + + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CNETERR("Tx -> %s cookie %#llx" + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); + + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } + + spin_lock(&conn->ibc_lock); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. */ + + tx->tx_sending--; + conn->ibc_nsends_posted--; + if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + if (failed) { + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + tx->tx_waiting = 0; /* don't wait for peer_ni */ + tx->tx_status = -EIO; + } + + idle = (tx->tx_sending == 0) && /* This is the final callback */ + !tx->tx_waiting && /* Not waiting for peer_ni */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ + if (idle) + list_del(&tx->tx_list); + + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(tx); +} + +static void +kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, + int body_nob) +{ + struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev; + struct ib_sge *sge = &tx->tx_msgsge; + struct ib_rdma_wr *wrq; + int nob = offsetof(struct kib_msg, ibm_u) + body_nob; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = hdev->ibh_mrs; +#endif + + LASSERT(tx->tx_nwrq >= 0); + LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT(nob <= IBLND_MSG_SIZE); +#ifdef HAVE_IB_GET_DMA_MR + LASSERT(mr != NULL); +#endif + + kiblnd_init_msg(tx->tx_msg, type, body_nob); + +#ifdef HAVE_IB_GET_DMA_MR + sge->lkey = mr->lkey; +#else + sge->lkey = hdev->ibh_pd->local_dma_lkey; +#endif + sge->addr = tx->tx_msgaddr; + sge->length = nob; + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + memset(wrq, 0, sizeof(*wrq)); + + wrq->wr.next = NULL; + wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX); + wrq->wr.sg_list = sge; + wrq->wr.num_sge = 1; + wrq->wr.opcode = IB_WR_SEND; + wrq->wr.send_flags = IB_SEND_SIGNALED; + + tx->tx_nwrq++; +} + +static int +kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie) +{ + struct kib_msg *ibmsg = tx->tx_msg; + struct kib_rdma_desc *srcrd = tx->tx_rd; + struct ib_rdma_wr *wrq = NULL; + struct ib_sge *sge; + int rc = resid; + int srcidx; + int dstidx; + int sge_nob; + int wrq_sge; + + LASSERT(!in_interrupt()); + LASSERT(tx->tx_nwrq == 0 && tx->tx_nsge == 0); + LASSERT(type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE); + + for (srcidx = dstidx = wrq_sge = sge_nob = 0; + resid > 0; resid -= sge_nob) { + int prev = dstidx; + + if (srcidx >= srcrd->rd_nfrags) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; + } + + if (dstidx >= dstrd->rd_nfrags) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq >= conn->ibc_max_frags) { + CERROR("RDMA has too many fragments for peer_ni %s (%d), " + "src idx/frags: %d/%d dst 
idx/frags: %d/%d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_max_frags, + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } + + sge_nob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx), + kiblnd_rd_frag_size(dstrd, dstidx)), resid); + + sge = &tx->tx_sge[tx->tx_nsge]; + sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); + sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx); + sge->length = sge_nob; + + if (wrq_sge == 0) { + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->wr.next = &(wrq + 1)->wr; + wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->wr.sg_list = sge; + wrq->wr.opcode = IB_WR_RDMA_WRITE; + wrq->wr.send_flags = 0; + +#ifdef HAVE_IB_RDMA_WR + wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, + dstidx); + wrq->rkey = kiblnd_rd_frag_key(dstrd, + dstidx); +#else + wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, + dstidx); + wrq->wr.wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, + dstidx); +#endif + } + + srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, sge_nob); + dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, sge_nob); + + wrq_sge++; + if (wrq_sge == *kiblnd_tunables.kib_wrq_sge || dstidx != prev) { + tx->tx_nwrq++; + wrq->wr.num_sge = wrq_sge; + wrq_sge = 0; + } + tx->tx_nsge++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = tx->tx_nsge = 0; + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, + type, sizeof(struct kib_completion_msg)); + + return rc; +} + +static void +kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) +{ + struct list_head *q; + s64 timeout_ns; + + LASSERT(tx->tx_nwrq > 0); /* work items set up */ + LASSERT(!tx->tx_queued); /* not queued for sending already */ + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (conn->ibc_state >= IBLND_CONN_DISCONNECTED) { + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + if (tx->tx_conn != NULL) { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT(tx->tx_conn == conn); + LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + tx->tx_conn = NULL; + kiblnd_conn_decref(conn); + } + list_add(&tx->tx_list, &conn->ibc_zombie_txs); + + return; + } + + timeout_ns = lnet_get_lnd_timeout() * NSEC_PER_SEC; + tx->tx_queued = 1; + tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + if (tx->tx_conn == NULL) { + kiblnd_conn_addref(conn); + tx->tx_conn = conn; + LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); + } else { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT (tx->tx_conn == conn); + LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + } + + switch (tx->tx_msg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_PUT_REQ: + case IBLND_MSG_GET_REQ: + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + q = &conn->ibc_tx_queue_nocred; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) + q = &conn->ibc_tx_queue_nocred; + else + q = &conn->ibc_tx_noops; + break; + + case IBLND_MSG_IMMEDIATE: + q = &conn->ibc_tx_queue; + break; + } + + list_add_tail(&tx->tx_list, q); +} + +static void +kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn) +{ + spin_lock(&conn->ibc_lock); + kiblnd_queue_tx_locked(tx, conn); + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); +} + +static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct 
sockaddr_in *dstaddr, + int timeout_ms) +{ + unsigned short port; + int rc; + + /* allow the port to be reused */ + rc = rdma_set_reuseaddr(cmid, 1); + if (rc != 0) { + CERROR("Unable to set reuse on cmid: %d\n", rc); + return rc; + } + + /* look for a free privileged port */ + for (port = PROT_SOCK-1; port > 0; port--) { + srcaddr->sin_port = htons(port); + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)srcaddr, + (struct sockaddr *)dstaddr, + timeout_ms); + if (rc == 0) { + CDEBUG(D_NET, "bound to port %hu\n", port); + return 0; + } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { + CDEBUG(D_NET, "bind to port %hu failed: %d\n", + port, rc); + } else { + return rc; + } + } + + CERROR("Failed to bind to a free privileged port\n"); + return rc; +} + +static void +kiblnd_connect_peer(struct kib_peer_ni *peer_ni) +{ + struct rdma_cm_id *cmid; + struct kib_dev *dev; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + LASSERT (net != NULL); + LASSERT (peer_ni->ibp_connecting > 0); + + cmid = kiblnd_rdma_create_id(peer_ni->ibp_ni->ni_net_ns, + kiblnd_cm_callback, peer_ni, + RDMA_PS_TCP, IB_QPT_RC); + + if (IS_ERR(cmid)) { + CERROR("Can't create CMID for %s: %ld\n", + libcfs_nid2str(peer_ni->ibp_nid), PTR_ERR(cmid)); + rc = PTR_ERR(cmid); + goto failed; + } + + dev = net->ibn_dev; + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); + dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer_ni->ibp_nid)); + + kiblnd_peer_addref(peer_ni); /* cmid's ref */ + + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + lnet_get_lnd_timeout() * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + lnet_get_lnd_timeout() * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed2; + } + + return; + + failed2: + kiblnd_peer_connect_failed(peer_ni, 1, rc); + kiblnd_peer_decref(peer_ni); /* cmid's ref */ + rdma_destroy_id(cmid); + return; + failed: + kiblnd_peer_connect_failed(peer_ni, 1, rc); +} + +bool +kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + char *reason = NULL; + struct list_head txs; + unsigned long flags; + + INIT_LIST_HEAD(&txs); + + write_lock_irqsave(glock, flags); + if (peer_ni->ibp_reconnecting == 0) { + if (peer_ni->ibp_accepting) + reason = "accepting"; + else if (peer_ni->ibp_connecting) + reason = "connecting"; + else if (!list_empty(&peer_ni->ibp_conns)) + reason = "connected"; + else /* connected then closed */ + reason = "closed"; + + goto no_reconnect; + } + + if (peer_ni->ibp_accepting) + CNETERR("Detecting race between accepting and reconnecting\n"); + peer_ni->ibp_reconnecting--; + + if (!kiblnd_peer_active(peer_ni)) { + list_splice_init(&peer_ni->ibp_tx_queue, &txs); + reason = "unlinked"; + goto no_reconnect; + } + + peer_ni->ibp_connecting++; + peer_ni->ibp_reconnected++; + + write_unlock_irqrestore(glock, flags); + + kiblnd_connect_peer(peer_ni); + return true; + + no_reconnect: + write_unlock_irqrestore(glock, flags); + + CWARN("Abort reconnection of %s: %s\n", + libcfs_nid2str(peer_ni->ibp_nid), reason); + kiblnd_txlist_done(&txs, 
-ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ABORTED); + return false; +} + +void +kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) +{ + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + int rc; + int i; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */ + + /* First time, just use a read lock since I expect to find my peer_ni + * connected */ + read_lock_irqsave(g_lock, flags); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL && !list_empty(&peer_ni->ibp_conns)) { + /* Found a peer_ni with an established connection */ + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + read_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } + + read_unlock(g_lock); + /* Re-try with a write lock */ + write_lock(g_lock); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL) { + if (list_empty(&peer_ni->ibp_conns)) { + /* found a peer_ni, but it's still connecting... */ + LASSERT(kiblnd_peer_connecting(peer_ni)); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer_ni->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } + + write_unlock_irqrestore(g_lock, flags); + + /* Allocate a peer_ni ready to add to the peer_ni table and retry */ + rc = kiblnd_create_peer(ni, &peer_ni, nid); + if (rc != 0) { + CERROR("Can't create peer_ni %s\n", libcfs_nid2str(nid)); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + } + return; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (list_empty(&peer2->ibp_conns)) { + /* found a peer_ni, but it's still connecting... */ + LASSERT(kiblnd_peer_connecting(peer2)); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer2->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer2); + kiblnd_conn_addref(conn); /* 1 ref for me... 
*/ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + + kiblnd_peer_decref(peer_ni); + return; + } + + /* Brand new peer_ni */ + LASSERT(peer_ni->ibp_connecting == 0); + tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + peer_ni->ibp_connecting = tunables->lnd_conns_per_peer; + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0); + + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); + + kiblnd_peer_addref(peer_ni); + list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + + for (i = 0; i < tunables->lnd_conns_per_peer; i++) + kiblnd_connect_peer(peer_ni); + kiblnd_peer_decref(peer_ni); +} + +int +kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + struct lnet_hdr *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + struct lnet_process_id target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + struct kib_msg *ibmsg; + struct kib_rdma_desc *rd; + struct kib_tx *tx; + int nob; + int rc; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + + /* Thread context */ + LASSERT (!in_interrupt()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return (-EIO); + + case LNET_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? 
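+ * If the eventual reply would fit in an immediate message there's no point
+ * setting up an RDMA sink; otherwise map the local MD into ibgm_rd so the
+ * peer can RDMA the reply payload straight into it (GET_REQ/GET_DONE).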
*/ + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate txd for GET to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + rd = &ibmsg->ibm_u.get.ibgm_rd; + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + rc = kiblnd_setup_rd_iov(ni, tx, rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, + 0, lntmsg->msg_md->md_length); + else + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("Can't setup GET sink for %s: %d\n", + libcfs_nid2str(target.nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + return -EIO; + } + + nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]); + ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; + ibmsg->ibm_u.get.ibgm_hdr = *hdr; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET -> %s\n", + libcfs_nid2str(target.nid)); + kiblnd_tx_done(tx); + return -EIO; + } + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ + tx->tx_waiting = 1; /* waiting for GET_DONE */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? */ + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == LNET_MSG_PUT ? 
"PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + if (payload_kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(tx); + return -EIO; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, + sizeof(struct kib_putreq_msg)); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + } + + /* send IMMEDIATE */ + LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR ("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); + + nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; +} + +static void +kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) +{ + struct lnet_process_id target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct kvec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + struct kib_tx *tx; + int rc; + + tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; + } + + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + niov, iov, offset, nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + niov, kiov, offset, nob); + + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + rc = kiblnd_init_rdma(rx->rx_conn, tx, + IBLND_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (nob == 0) { + /* No RDMA: local completion may happen now! 
*/ + lnet_finalize(lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kiblnd_queue_tx(tx, rx->rx_conn); + return; + + +failed_1: + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); +failed_0: + lnet_finalize(lntmsg, -EIO); +} + +int +kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + struct kib_rx *rx = private; + struct kib_msg *rxmsg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct kib_tx *tx; + __u64 ibprm_cookie; + int nob; + int post_credit = IBLND_POSTRX_PEER_CREDIT; + int rc = 0; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_IMMEDIATE: + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize(lntmsg, 0); + break; + + case IBLND_MSG_PUT_REQ: { + struct kib_msg *txmsg; + struct kib_rdma_desc *rd; + ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + + if (mlen == 0) { + lnet_finalize(lntmsg, 0); + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, + 0, ibprm_cookie); + break; + } + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; + } + + txmsg = tx->tx_msg; + rd = &txmsg->ibm_u.putack.ibpam_rd; + if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, rd, + niov, iov, offset, mlen); + else + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + kiblnd_tx_done(tx); + /* tell peer_ni it's over */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, + rc, ibprm_cookie); + break; + } + + nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]); + txmsg->ibm_u.putack.ibpam_src_cookie = ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kiblnd_queue_tx(tx, conn); + + /* reposted buffer reserved for PUT_DONE */ + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + case IBLND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kiblnd_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; + } + + kiblnd_post_rx(rx, post_credit); + return rc; +} + +int +kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = 
kthread_run(fn, arg, name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + atomic_inc(&kiblnd_data.kib_nthreads); + return 0; +} + +static void +kiblnd_thread_fini (void) +{ + atomic_dec (&kiblnd_data.kib_nthreads); +} + +static void +kiblnd_peer_alive(struct kib_peer_ni *peer_ni) +{ + /* This is racy, but everyone's only writing ktime_get_seconds() */ + peer_ni->ibp_last_alive = ktime_get_seconds(); + smp_mb(); +} + +static void +kiblnd_peer_notify(struct kib_peer_ni *peer_ni) +{ + int error = 0; + time64_t last_alive = 0; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (kiblnd_peer_idle(peer_ni) && peer_ni->ibp_error != 0) { + error = peer_ni->ibp_error; + peer_ni->ibp_error = 0; + + last_alive = peer_ni->ibp_last_alive; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(peer_ni->ibp_ni, + peer_ni->ibp_nid, 0, last_alive); +} + +void +kiblnd_close_conn_locked(struct kib_conn *conn, int error) +{ + /* This just does the immediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). + * Caller holds kib_global_lock exclusively in irq context */ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_dev *dev; + unsigned long flags; + + LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + return; /* already being handled */ + + if (error == 0 && + list_empty(&conn->ibc_tx_noops) && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + } else { + CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", + libcfs_nid2str(peer_ni->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? + "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? + "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)"); + } + + dev = ((struct kib_net *)peer_ni->ibp_ni->ni_data)->ibn_dev; + if (peer_ni->ibp_next_conn == conn) + /* clear next_conn so it won't be used */ + peer_ni->ibp_next_conn = NULL; + list_del(&conn->ibc_list); + /* connd (see below) takes over ibc_list's ref */ + + if (list_empty(&peer_ni->ibp_conns) && /* no more conns */ + kiblnd_peer_active(peer_ni)) { /* still in peer_ni table */ + kiblnd_unlink_peer_locked(peer_ni); + + /* set/clear error on last conn */ + peer_ni->ibp_error = conn->ibc_comms_error; + } + + kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); + + if (error != 0 && + kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); + wake_up(&kiblnd_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); +} + +void +kiblnd_close_conn(struct kib_conn *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + kiblnd_close_conn_locked(conn, error); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_handle_early_rxs(struct kib_conn *conn) +{ + unsigned long flags; + struct kib_rx *rx; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (!list_empty(&conn->ibc_early_rxs)) { + rx = list_entry(conn->ibc_early_rxs.next, + struct kib_rx, rx_list); + list_del(&rx->rx_list); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_handle_rx(rx); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +void +kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head *tmp; + struct list_head *nxt; + struct kib_tx *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_safe(tmp, nxt, txs) { + tx = list_entry(tmp, struct kib_tx, tx_list); + + if (txs == &conn->ibc_active_txs) { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || + tx->tx_sending != 0); + if (conn->ibc_comms_error == -ETIMEDOUT) { + if (tx->tx_waiting && !tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_REMOTE_TIMEOUT; + else if (tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_NETWORK_TIMEOUT; + } + } else { + LASSERT(tx->tx_queued); + if (conn->ibc_comms_error == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + /* + * TODO: This makes an assumption that + * kiblnd_tx_complete() will be called for each tx. If + * that event is dropped we could end up with stale + * connections floating around. We'd like to deal with + * that in a better way. + * + * Also that means we can exceed the timeout by many + * seconds. + */ + if (tx->tx_sending == 0) { + tx->tx_queued = 0; + list_del(&tx->tx_list); + list_add(&tx->tx_list, &zombies); + } + } + + spin_unlock(&conn->ibc_lock); + + /* + * aborting transmits occurs when finalizing the connection. + * The connection is finalized on error. + * Passing LNET_MSG_STATUS_OK to txlist_done() will not + * override the value already set in tx->tx_hstatus above. 
+ */ + kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK); +} + +static void +kiblnd_finalise_conn(struct kib_conn *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state > IBLND_CONN_INIT); + + /* abort_receives moves QP state to IB_QPS_ERR. This is only required + * for connections that didn't get as far as being connected, because + * rdma_disconnect() does this for free. */ + kiblnd_abort_receives(conn); + + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + + /* Complete all tx descs not waiting for sends to complete. + * NB we should be safe from RDMA now that the QP has changed state */ + + kiblnd_abort_txs(conn, &conn->ibc_tx_noops); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kiblnd_abort_txs(conn, &conn->ibc_active_txs); + + kiblnd_handle_early_rxs(conn); +} + +static void +kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + unsigned long flags; + + LASSERT (error != 0); + LASSERT (!in_interrupt()); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (active) { + LASSERT(peer_ni->ibp_connecting > 0); + peer_ni->ibp_connecting--; + } else { + LASSERT (peer_ni->ibp_accepting > 0); + peer_ni->ibp_accepting--; + } + + if (kiblnd_peer_connecting(peer_ni)) { + /* another connection attempt under way... */ + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return; + } + + peer_ni->ibp_reconnected = 0; + if (list_empty(&peer_ni->ibp_conns)) { + /* Take peer_ni's blocked transmits to complete with error */ + list_splice_init(&peer_ni->ibp_tx_queue, &zombies); + + if (kiblnd_peer_active(peer_ni)) + kiblnd_unlink_peer_locked(peer_ni); + + peer_ni->ibp_error = error; + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT(list_empty(&peer_ni->ibp_tx_queue)); + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_peer_notify(peer_ni); + + if (list_empty(&zombies)) + return; + + CNETERR("Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer_ni->ibp_nid)); + + kiblnd_txlist_done(&zombies, error, + LNET_MSG_STATUS_LOCAL_DROPPED); +} + +static void +kiblnd_connreq_done(struct kib_conn *conn, int status) +{ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_tx *tx; + struct list_head txs; + unsigned long flags; + int active; + + active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n", + libcfs_nid2str(peer_ni->ibp_nid), active, + conn->ibc_version, status); + + LASSERT (!in_interrupt()); + LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && + peer_ni->ibp_connecting > 0) || + (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && + peer_ni->ibp_accepting > 0)); + + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; + + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } + + /* connection established */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* reset retry count */ + peer_ni->ibp_retries = 0; + + conn->ibc_last_send = ktime_get(); + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer_ni); + + /* Add conn to peer_ni's list and nuke any dangling conns from a different + * peer_ni instance... 
*/ + kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer_ni->ibp_conns); + peer_ni->ibp_reconnected = 0; + if (active) + peer_ni->ibp_connecting--; + else + peer_ni->ibp_accepting--; + + if (peer_ni->ibp_version == 0) { + peer_ni->ibp_version = conn->ibc_version; + peer_ni->ibp_incarnation = conn->ibc_incarnation; + } + + if (peer_ni->ibp_version != conn->ibc_version || + peer_ni->ibp_incarnation != conn->ibc_incarnation) { + kiblnd_close_stale_conns_locked(peer_ni, conn->ibc_version, + conn->ibc_incarnation); + peer_ni->ibp_version = conn->ibc_version; + peer_ni->ibp_incarnation = conn->ibc_incarnation; + } + + /* grab pending txs while I have the lock */ + INIT_LIST_HEAD(&txs); + list_splice_init(&peer_ni->ibp_tx_queue, &txs); + + if (!kiblnd_peer_active(peer_ni) || /* peer_ni has been deleted */ + conn->ibc_comms_error != 0) { /* error has happened already */ + + /* start to shut down connection */ + kiblnd_close_conn_locked(conn, -ECONNABORTED); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ERROR); + + return; + } + + /* +1 ref for myself, this connection is visible to other threads + * now, refcount of peer:ibp_conns can be released by connection + * close from either a different thread, or the calling of + * kiblnd_check_sends_locked() below. See bz21911 for details. + */ + kiblnd_conn_addref(conn); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Schedule blocked txs + * Note: if we are running with conns_per_peer > 1, these blocked + * txs will all get scheduled to the first connection which gets + * scheduled. We won't be using round robin on this first batch. + */ + spin_lock(&conn->ibc_lock); + while (!list_empty(&txs)) { + tx = list_entry(txs.next, struct kib_tx, tx_list); + list_del(&tx->tx_list); + + kiblnd_queue_tx_locked(tx, conn); + } + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + /* schedule blocked rxs */ + kiblnd_handle_early_rxs(conn); + kiblnd_conn_decref(conn); +} + +static void +kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) +{ + int rc; + +#ifdef HAVE_RDMA_REJECT_4ARGS + rc = rdma_reject(cmid, rej, sizeof(*rej), IB_CM_REJ_CONSUMER_DEFINED); +#else + rc = rdma_reject(cmid, rej, sizeof(*rej)); +#endif + + if (rc != 0) + CWARN("Error %d sending reject\n", rc); +} + +static int +kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct kib_msg *reqmsg = priv; + struct kib_msg *ackmsg; + struct kib_dev *ibdev; + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + struct lnet_ni *ni = NULL; + struct kib_net *net = NULL; + lnet_nid_t nid; + struct rdma_conn_param cp; + struct kib_rej rej; + int version = IBLND_MSG_VERSION; + unsigned long flags; + int rc; + struct sockaddr_in *peer_addr; + LASSERT (!in_interrupt()); + + /* cmid inherits 'context' from the corresponding listener id */ + ibdev = cmid->context; + LASSERT(ibdev); + + memset(&rej, 0, sizeof(rej)); + rej.ibr_magic = IBLND_MSG_MAGIC; + rej.ibr_why = IBLND_REJECT_FATAL; + rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; + + peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); + if (*kiblnd_tunables.kib_require_priv_port && + ntohs(peer_addr->sin_port) >= PROT_SOCK) { + __u32 ip = ntohl(peer_addr->sin_addr.s_addr); + CERROR("peer_ni's port (%pI4h:%hu) is not privileged\n", + &ip, ntohs(peer_addr->sin_port)); + goto 
failed; + } + + if (priv_nob < offsetof(struct kib_msg, ibm_type)) { + CERROR("Short connection request\n"); + goto failed; + } + + /* Future protocol version compatibility support! If the + * o2iblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer_ni which protocol I + * speak. */ + if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || + reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + goto failed; + if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && + reqmsg->ibm_version != IBLND_MSG_VERSION && + reqmsg->ibm_version != IBLND_MSG_VERSION_1) + goto failed; + if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) + goto failed; + + rc = kiblnd_unpack_msg(reqmsg, priv_nob); + if (rc != 0) { + CERROR("Can't parse connection request: %d\n", rc); + goto failed; + } + + nid = reqmsg->ibm_srcnid; + ni = lnet_nid2ni_addref(reqmsg->ibm_dstnid); + + if (ni != NULL) { + net = (struct kib_net *)ni->ni_data; + rej.ibr_incarnation = net->ibn_incarnation; + } + + if (ni == NULL || /* no matching net */ + ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ + net->ibn_dev != ibdev) { /* wrong device */ + CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): " + "bad dst nid %s\n", libcfs_nid2str(nid), + ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid), + ibdev->ibd_ifname, ibdev->ibd_nnets, + &ibdev->ibd_ifip, + libcfs_nid2str(reqmsg->ibm_dstnid)); + + goto failed; + } + + /* check time stamp as soon as possible */ + if (reqmsg->ibm_dststamp != 0 && + reqmsg->ibm_dststamp != net->ibn_incarnation) { + CWARN("Stale connection request\n"); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* I can accept peer_ni's version */ + version = reqmsg->ibm_version; + + if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { + CERROR("Unexpected connreq msg type: %x from %s\n", + reqmsg->ibm_type, libcfs_nid2str(nid)); + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_queue_depth > + kiblnd_msg_queue_size(version, ni)) { + CERROR("Can't accept conn from %s, queue depth too large: " + " %d (<=%d wanted)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_queue_depth, + kiblnd_msg_queue_size(version, ni)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_frags > + IBLND_MAX_RDMA_FRAGS) { + CWARN("Can't accept conn from %s (version %x): " + "max_frags %d too large (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_MAX_RDMA_FRAGS); + + if (version >= IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < + IBLND_MAX_RDMA_FRAGS && + net->ibn_fmr_ps == NULL) { + CWARN("Can't accept conn from %s (version %x): " + "max_frags %d incompatible without FMR pool " + "(%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_MAX_RDMA_FRAGS); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + goto failed; + } + + /* assume 
'nid' is a new peer_ni; create */ + rc = kiblnd_create_peer(ni, &peer_ni, nid); + if (rc != 0) { + CERROR("Can't create peer_ni for %s\n", libcfs_nid2str(nid)); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* We have validated the peer's parameters so use those */ + peer_ni->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags; + peer_ni->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth; + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (peer2->ibp_version == 0) { + peer2->ibp_version = version; + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + } + + /* not the guy I've talked with */ + if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || + peer2->ibp_version != version) { + kiblnd_close_peer_conns_locked(peer2, -ESTALE); + + if (kiblnd_peer_active(peer2)) { + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + peer2->ibp_version = version; + } + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n", + libcfs_nid2str(nid), peer2->ibp_version, version, + peer2->ibp_incarnation, reqmsg->ibm_srcstamp); + + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* Tie-break connection race in favour of the higher NID. + * If we keep running into a race condition multiple times, + * we have to assume that the connection attempt with the + * higher NID is stuck in a connecting state and will never + * recover. As such, we pass through this if-block and let + * the lower NID connection win so we can move forward. + */ + if (peer2->ibp_connecting != 0 && + nid < ni->ni_nid && peer2->ibp_races < + MAX_CONN_RACES_BEFORE_ABORT) { + peer2->ibp_races++; + write_unlock_irqrestore(g_lock, flags); + + CDEBUG(D_NET, "Conn race %s\n", + libcfs_nid2str(peer2->ibp_nid)); + + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_RACE; + goto failed; + } + if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT) + CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n", + libcfs_nid2str(peer2->ibp_nid), + MAX_CONN_RACES_BEFORE_ABORT); + /* + * passive connection is allowed even this peer_ni is waiting for + * reconnection. 
+ */ + peer2->ibp_reconnecting = 0; + peer2->ibp_races = 0; + peer2->ibp_accepting++; + kiblnd_peer_addref(peer2); + + /* Race with kiblnd_launch_tx (active connect) to create peer_ni + * so copy validated parameters since we now know what the + * peer_ni's limits are */ + peer2->ibp_max_frags = peer_ni->ibp_max_frags; + peer2->ibp_queue_depth = peer_ni->ibp_queue_depth; + + write_unlock_irqrestore(g_lock, flags); + kiblnd_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* Brand new peer_ni */ + LASSERT (peer_ni->ibp_accepting == 0); + LASSERT (peer_ni->ibp_version == 0 && + peer_ni->ibp_incarnation == 0); + + peer_ni->ibp_accepting = 1; + peer_ni->ibp_version = version; + peer_ni->ibp_incarnation = reqmsg->ibm_srcstamp; + + /* I have a ref on ni that prevents it being shutdown */ + LASSERT (net->ibn_shutdown == 0); + + kiblnd_peer_addref(peer_ni); + list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + } + + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_PASSIVE_WAIT, version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer_ni, 0, -ENOMEM); + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* conn now "owns" cmid, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. */ + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = conn->ibc_queue_depth; + conn->ibc_reserved_credits = conn->ibc_queue_depth; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn)); + + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); + + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); + ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + rej.ibr_version = version; + rej.ibr_why = IBLND_REJECT_FATAL; + + kiblnd_reject(cmid, &rej); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + lnet_ni_decref(ni); + return 0; + + failed: + if (ni != NULL) { + rej.ibr_cp.ibcp_queue_depth = + kiblnd_msg_queue_size(version, ni); + rej.ibr_cp.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS; + lnet_ni_decref(ni); + } + + rej.ibr_version = version; + kiblnd_reject(cmid, &rej); + + return -ECONNREFUSED; +} + +static void +kiblnd_check_reconnect(struct kib_conn *conn, int version, + u64 incarnation, int why, struct kib_connparams *cp) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + char *reason; + int msg_size = IBLND_MSG_SIZE; + int frag_num = -1; + int queue_dep = -1; + bool reconnect; + unsigned long flags; + + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + LASSERT(peer_ni->ibp_connecting > 0); /* 'conn' at least */ + + if (cp) { + msg_size = cp->ibcp_max_msg_size; + frag_num = 
cp->ibcp_max_frags; + queue_dep = cp->ibcp_queue_depth; + } + + write_lock_irqsave(glock, flags); + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress + * NB: reconnect is still needed even when ibp_tx_queue is + * empty if ibp_version != version because reconnect may be + * initiated by kiblnd_query() */ + reconnect = (!list_empty(&peer_ni->ibp_tx_queue) || + peer_ni->ibp_version != version) && + peer_ni->ibp_connecting && + peer_ni->ibp_accepting == 0; + if (!reconnect) { + reason = "no need"; + goto out; + } + + if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) { + reason = "retry count exceeded due to no listener"; + goto out; + } + + switch (why) { + default: + reason = "Unknown"; + break; + + case IBLND_REJECT_RDMA_FRAGS: { + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + if (!cp) { + reason = "can't negotiate max frags"; + goto out; + } + tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; +#ifdef HAVE_IB_GET_DMA_MR + /* + * This check only makes sense if the kernel supports global + * memory registration. Otherwise, map_on_demand will never == 0 + */ + if (!tunables->lnd_map_on_demand) { + reason = "map_on_demand must be enabled"; + goto out; + } +#endif + if (conn->ibc_max_frags <= frag_num) { + reason = "unsupported max frags"; + goto out; + } + + peer_ni->ibp_max_frags = frag_num; + reason = "rdma fragments"; + break; + } + case IBLND_REJECT_MSG_QUEUE_SIZE: + if (!cp) { + reason = "can't negotiate queue depth"; + goto out; + } + if (conn->ibc_queue_depth <= queue_dep) { + reason = "unsupported queue depth"; + goto out; + } + + peer_ni->ibp_queue_depth = queue_dep; + reason = "queue depth"; + break; + + case IBLND_REJECT_CONN_STALE: + reason = "stale"; + break; + + case IBLND_REJECT_CONN_RACE: + reason = "conn race"; + break; + + case IBLND_REJECT_CONN_UNCOMPAT: + reason = "version negotiation"; + break; + + case IBLND_REJECT_INVALID_SRV_ID: + reason = "invalid service id"; + break; + } + + conn->ibc_reconnect = 1; + peer_ni->ibp_reconnecting++; + peer_ni->ibp_version = version; + if (incarnation != 0) + peer_ni->ibp_incarnation = incarnation; + out: + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n", + libcfs_nid2str(peer_ni->ibp_nid), + reconnect ? "reconnect" : "don't reconnect", + reason, IBLND_MSG_VERSION, version, msg_size, + conn->ibc_queue_depth, queue_dep, + conn->ibc_max_frags, frag_num); + /* + * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer_ni + * while destroying the zombie + */ +} + +static void +kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) +{ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + switch (reason) { + case IB_CM_REJ_STALE_CONN: + kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, + IBLND_REJECT_CONN_STALE, NULL); + break; + + case IB_CM_REJ_INVALID_SERVICE_ID: + peer_ni->ibp_retries++; + kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, + IBLND_REJECT_INVALID_SRV_ID, NULL); + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { + struct kib_rej *rej = priv; + struct kib_connparams *cp = NULL; + int flip = 0; + __u64 incarnation = -1; + + /* NB. 
default incarnation is -1 because: + * a) V1 will ignore dst incarnation in connreq. + * b) V2 will provide incarnation while rejecting me, + * -1 will be overwritten. + * + * if I try to connect to a V1 peer_ni with the V2 protocol, + * it rejects me and then upgrades to V2; I know nothing + * about the upgrade and try to reconnect with V1, and the + * upgraded V2 peer_ni can then tell I'm talking the old + * protocol and rejects me (incarnation is -1). + */ + + if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || + rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { + __swab32s(&rej->ibr_magic); + __swab16s(&rej->ibr_version); + flip = 1; + } + + if (priv_nob >= sizeof(struct kib_rej) && + rej->ibr_version > IBLND_MSG_VERSION_1) { + /* priv_nob is always 148 in current version + * of OFED, so we still need to check version. + * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */ + cp = &rej->ibr_cp; + + if (flip) { + __swab64s(&rej->ibr_incarnation); + __swab16s(&cp->ibcp_queue_depth); + __swab16s(&cp->ibcp_max_frags); + __swab32s(&cp->ibcp_max_msg_size); + } + + incarnation = rej->ibr_incarnation; + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION && + rej->ibr_version != IBLND_MSG_VERSION_1) { + CERROR("%s rejected: o2iblnd version %x error\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_version); + break; + } + + if (rej->ibr_why == IBLND_REJECT_FATAL && + rej->ibr_version == IBLND_MSG_VERSION_1) { + CDEBUG(D_NET, "rejected by old version peer_ni %s: %x\n", + libcfs_nid2str(peer_ni->ibp_nid), rej->ibr_version); + + if (conn->ibc_version != IBLND_MSG_VERSION_1) + rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + case IBLND_REJECT_CONN_STALE: + case IBLND_REJECT_CONN_UNCOMPAT: + case IBLND_REJECT_MSG_QUEUE_SIZE: + case IBLND_REJECT_RDMA_FRAGS: + kiblnd_check_reconnect(conn, rej->ibr_version, + incarnation, rej->ibr_why, cp); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + default: + CERROR("%s rejected: o2iblnd reason %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_why); + break; + } + break; + } + fallthrough; + default: + CNETERR("%s rejected: reason %d, size %d\n", + libcfs_nid2str(peer_ni->ibp_nid), reason, priv_nob); + break; + } + + kiblnd_connreq_done(conn, -ECONNREFUSED); +} + +static void +kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) +{ + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_net *net = ni->ni_data; + struct kib_msg *msg = priv; + int ver = conn->ibc_version; + int rc = kiblnd_unpack_msg(msg, priv_nob); + unsigned long flags; + + LASSERT (net != NULL); + + if (rc != 0) { + CERROR("Can't unpack connack from %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed; + } + + if (msg->ibm_type != IBLND_MSG_CONNACK) { + CERROR("Unexpected message %d from %s\n", + msg->ibm_type, libcfs_nid2str(peer_ni->ibp_nid)); + rc = -EPROTO; + goto failed; + } + + if (ver != msg->ibm_version) { + CERROR("%s replied version %x is different from " + "requested version %x\n", + libcfs_nid2str(peer_ni->ibp_nid), msg->ibm_version, ver); + rc = -EPROTO; + goto
failed; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth > + conn->ibc_queue_depth) { + CERROR("%s has incompatible queue depth %d (<=%d wanted)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_queue_depth, + conn->ibc_queue_depth); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_frags > + conn->ibc_max_frags) { + CERROR("%s has incompatible max_frags %d (<=%d wanted)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_max_frags, + conn->ibc_max_frags); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("%s max message size %d too big (%d max)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + rc = -EPROTO; + goto failed; + } + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (msg->ibm_dstnid == ni->ni_nid && + msg->ibm_dststamp == net->ibn_incarnation) + rc = 0; + else + rc = -ESTALE; + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Bad connection reply from %s, rc = %d, " + "version: %x max_frags: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc, + msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); + goto failed; + } + + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); + + kiblnd_connreq_done(conn, 0); + return; + + failed: + /* NB My QP has already established itself, so I handle anything going + * wrong here by setting ibc_comms_error. + * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then + * immediately tears it down. */ + + LASSERT (rc != 0); + conn->ibc_comms_error = rc; + kiblnd_connreq_done(conn, 0); +} + +static int +kiblnd_active_connect(struct rdma_cm_id *cmid) +{ + struct kib_peer_ni *peer_ni = cmid->context; + struct kib_conn *conn; + struct kib_msg *msg; + struct rdma_conn_param cp; + int version; + __u64 incarnation; + unsigned long flags; + int rc; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + incarnation = peer_ni->ibp_incarnation; + version = (peer_ni->ibp_version == 0) ? IBLND_MSG_VERSION : + peer_ni->ibp_version; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_ACTIVE_CONNECT, + version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer_ni, 1, -ENOMEM); + kiblnd_peer_decref(peer_ni); /* lose cmid's ref */ + return -ENOMEM; + } + + /* conn "owns" cmid now, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
conn also takes over cmid's ref + * on peer_ni */ + + msg = &conn->ibc_connvars->cv_msg; + + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(peer_ni->ibp_ni, msg, version, + 0, peer_ni->ibp_nid, incarnation); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = msg; + cp.private_data_len = msg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + LASSERT(cmid->context == (void *)conn); + LASSERT(conn->ibc_cmid == cmid); + rc = rdma_connect_locked(cmid, &cp); + if (rc != 0) { + CERROR("Can't connect to %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + return 0; +} + +int +kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + int rc; + + switch (event->event) { + default: + CERROR("Unexpected event: %d, status: %d\n", + event->event, event->status); + LBUG(); + + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* destroy cmid on failure */ + rc = kiblnd_passive_connect(cmid, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + CDEBUG(D_NET, "connreq: %d\n", rc); + return rc; + + case RDMA_CM_EVENT_ADDR_ERROR: + peer_ni = cmid->context; + CNETERR("%s: ADDR ERROR %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer_ni); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ADDR_RESOLVED: + peer_ni = cmid->context; + + CDEBUG(D_NET,"%s Addr resolved: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + + if (event->status != 0) { + CNETERR("Can't resolve address for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + rc = event->status; + } else { + rc = rdma_resolve_route( + cmid, lnet_get_lnd_timeout() * 1000); + if (rc == 0) { + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev = net->ibn_dev; + + CDEBUG(D_NET, "%s: connection bound to "\ + "%s:%pI4h:%s\n", + libcfs_nid2str(peer_ni->ibp_nid), + dev->ibd_ifname, + &dev->ibd_ifip, cmid->device->name); + + return 0; + } + + /* Can't initiate route resolution */ + CERROR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + } + kiblnd_peer_connect_failed(peer_ni, 1, rc); + kiblnd_peer_decref(peer_ni); + return rc; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_ERROR: + peer_ni = cmid->context; + CNETERR("%s: ROUTE ERROR %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer_ni); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + peer_ni = cmid->context; + CDEBUG(D_NET,"%s Route resolved: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + + if (event->status == 0) + return kiblnd_active_connect(cmid); + + CNETERR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, event->status); + kiblnd_peer_decref(peer_ni); + return event->status; /* rc != 0 destroys 
cmid */ + + case RDMA_CM_EVENT_UNREACHABLE: + conn = cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: UNREACHABLE %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENETDOWN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_CONNECT_ERROR: + conn = cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: CONNECT ERROR %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENOTCONN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_REJECTED: + conn = cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CERROR ("%s: REJECTED %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status); + kiblnd_connreq_done(conn, -ECONNRESET); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + kiblnd_rejected(conn, event->status, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + conn = cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, 0); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + CDEBUG(D_NET, "ESTABLISHED(active): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_check_connreply(conn, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + /* net keeps its ref on conn! */ + return 0; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); + return 0; + + case RDMA_CM_EVENT_DISCONNECTED: + conn = cmid->context; + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("%s DISCONNECTED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, -ECONNRESET); + } else { + kiblnd_close_conn(conn, 0); + } + kiblnd_conn_decref(conn); + cmid->context = NULL; + return 0; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR_MSG(0x131, + "Received notification of device removal\n" + "Please shutdown LNET to allow this to proceed\n"); + /* Can't remove network from underneath LNET for now, so I have + * to ignore this */ + return 0; + + case RDMA_CM_EVENT_ADDR_CHANGE: + LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); + return 0; + } +} + +static int +kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) +{ + struct kib_tx *tx; + struct list_head *ttmp; + + list_for_each(ttmp, txs) { + tx = list_entry(ttmp, struct kib_tx, tx_list); + + if (txs != &conn->ibc_active_txs) { + LASSERT(tx->tx_queued); + } else { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || tx->tx_sending != 0); + } + + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CERROR("Timed out tx: %s, %lld seconds\n", + kiblnd_queue2str(conn, txs), + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); + return 1; + } + } + + return 0; +} + +static int +kiblnd_conn_timed_out_locked(struct kib_conn *conn) +{ + return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); +} + +static void 
+kiblnd_check_conns (int idx) +{ + struct list_head closes = LIST_HEAD_INIT(closes); + struct list_head checksends = LIST_HEAD_INIT(checksends); + struct list_head timedout_txs = LIST_HEAD_INIT(timedout_txs); + struct list_head *peers = &kiblnd_data.kib_peers[idx]; + struct list_head *ptmp; + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + struct kib_tx *tx, *tx_tmp; + struct list_head *ctmp; + unsigned long flags; + + /* NB. We expect to have a look at all the peers and not find any + * RDMAs to time out, so we only hold the global write lock while we + * take a look... */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + list_for_each(ptmp, peers) { + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + + /* Check tx_deadline */ + list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) { + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CWARN("Timed out tx for %s: %lld seconds\n", + libcfs_nid2str(peer_ni->ibp_nid), + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); + list_move(&tx->tx_list, &timedout_txs); + } + } + + list_for_each(ctmp, &peer_ni->ibp_conns) { + int timedout; + int sendnoop; + + conn = list_entry(ctmp, struct kib_conn, ibc_list); + + LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); + + spin_lock(&conn->ibc_lock); + + sendnoop = kiblnd_need_noop(conn); + timedout = kiblnd_conn_timed_out_locked(conn); + if (!sendnoop && !timedout) { + spin_unlock(&conn->ibc_lock); + continue; + } + + if (timedout) { + CERROR("Timed out RDMA with %s (%lld): " + "c: %u, oc: %u, rc: %u\n", + libcfs_nid2str(peer_ni->ibp_nid), + ktime_get_seconds() - peer_ni->ibp_last_alive, + conn->ibc_credits, + conn->ibc_outstanding_credits, + conn->ibc_reserved_credits); + list_add(&conn->ibc_connd_list, &closes); + } else { + list_add(&conn->ibc_connd_list, &checksends); + } + /* +ref for 'closes' or 'checksends' */ + kiblnd_conn_addref(conn); + + spin_unlock(&conn->ibc_lock); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (!list_empty(&timedout_txs)) + kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT, + LNET_MSG_STATUS_LOCAL_TIMEOUT); + + /* Handle timeout by closing the whole + * connection. We can only be sure RDMA activity + * has ceased once the QP has been modified. */ + while (!list_empty(&closes)) { + conn = list_entry(closes.next, + struct kib_conn, ibc_connd_list); + list_del(&conn->ibc_connd_list); + kiblnd_close_conn(conn, -ETIMEDOUT); + kiblnd_conn_decref(conn); + } + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + while (!list_empty(&checksends)) { + conn = list_entry(checksends.next, + struct kib_conn, ibc_connd_list); + list_del(&conn->ibc_connd_list); + + spin_lock(&conn->ibc_lock); + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + kiblnd_conn_decref(conn); + } +} + +static void +kiblnd_disconnect_conn(struct kib_conn *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (current == kiblnd_data.kib_connd); + LASSERT (conn->ibc_state == IBLND_CONN_CLOSING); + + rdma_disconnect(conn->ibc_cmid); + kiblnd_finalise_conn(conn); + + kiblnd_peer_notify(conn->ibc_peer); +} + +/* + * High-water mark for reconnections to the same peer_ni; reconnection attempts + * should be delayed after trying more than KIB_RECONN_HIGH_RACE times. + */ +#define KIB_RECONN_HIGH_RACE 10 +/* + * Allow connd to take a break and handle other things after consecutive + * reconnection attempts.
+ */ +#define KIB_RECONN_BREAK 100 + +int +kiblnd_connd (void *arg) +{ + spinlock_t *lock= &kiblnd_data.kib_connd_lock; + wait_queue_entry_t wait; + unsigned long flags; + struct kib_conn *conn; + int timeout; + int i; + int dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + kiblnd_data.kib_connd = current; + + spin_lock_irqsave(lock, flags); + + while (!kiblnd_data.kib_shutdown) { + int reconn = 0; + + dropped_lock = 0; + + if (!list_empty(&kiblnd_data.kib_connd_zombies)) { + struct kib_peer_ni *peer_ni = NULL; + + conn = list_entry(kiblnd_data.kib_connd_zombies.next, + struct kib_conn, ibc_list); + list_del(&conn->ibc_list); + if (conn->ibc_reconnect) { + peer_ni = conn->ibc_peer; + kiblnd_peer_addref(peer_ni); + } + + spin_unlock_irqrestore(lock, flags); + dropped_lock = 1; + + kiblnd_destroy_conn(conn); + + spin_lock_irqsave(lock, flags); + if (!peer_ni) { + LIBCFS_FREE(conn, sizeof(*conn)); + continue; + } + + conn->ibc_peer = peer_ni; + if (peer_ni->ibp_reconnected < KIB_RECONN_HIGH_RACE) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_reconn_list); + else + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_reconn_wait); + } + + if (!list_empty(&kiblnd_data.kib_connd_conns)) { + conn = list_entry(kiblnd_data.kib_connd_conns.next, + struct kib_conn, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(lock, flags); + dropped_lock = 1; + + kiblnd_disconnect_conn(conn); + kiblnd_conn_decref(conn); + + spin_lock_irqsave(lock, flags); + } + + while (reconn < KIB_RECONN_BREAK) { + if (kiblnd_data.kib_reconn_sec != + ktime_get_real_seconds()) { + kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); + list_splice_init(&kiblnd_data.kib_reconn_wait, + &kiblnd_data.kib_reconn_list); + } + + if (list_empty(&kiblnd_data.kib_reconn_list)) + break; + + conn = list_entry(kiblnd_data.kib_reconn_list.next, + struct kib_conn, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(lock, flags); + dropped_lock = 1; + + reconn += kiblnd_reconnect_peer(conn->ibc_peer); + kiblnd_peer_decref(conn->ibc_peer); + LIBCFS_FREE(conn, sizeof(*conn)); + + spin_lock_irqsave(lock, flags); + } + + /* careful with the jiffy wrap... */ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = kiblnd_data.kib_peer_hash_size; + unsigned int lnd_timeout; + + spin_unlock_irqrestore(lock, flags); + dropped_lock = 1; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer_ni table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. 
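+ * For example, if lnet_get_lnd_timeout() returns 50 then, with n = 4 + * and p = 1 below, chunk becomes hash_size * 4 / 50, i.e. roughly 8% + * of the peer_ni hash table is scanned on each one-second pass.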
*/ + + lnd_timeout = lnet_get_lnd_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kiblnd_check_conns(peer_index); + peer_index = (peer_index + 1) % + kiblnd_data.kib_peer_hash_size; + } + + deadline += msecs_to_jiffies(p * MSEC_PER_SEC); + spin_lock_irqsave(lock, flags); + } + + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore(lock, flags); + + schedule_timeout(timeout); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_lock_irqsave(lock, flags); + } + + spin_unlock_irqrestore(lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +void +kiblnd_qp_event(struct ib_event *event, void *arg) +{ + struct kib_conn *conn = arg; + + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* We received a packet but connection isn't established + * probably handshake packet was lost, so free to + * force make connection established */ + rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); + return; + + case IB_EVENT_PORT_ERR: + case IB_EVENT_DEVICE_FATAL: + CERROR("Fatal device error for NI %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1); + return; + + case IB_EVENT_PORT_ACTIVE: + CERROR("Port reactivated for NI %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } +} + +static void +kiblnd_complete (struct ib_wc *wc) +{ + switch (kiblnd_wreqid2type(wc->wr_id)) { + default: + LBUG(); + + case IBLND_WID_MR: + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) + CNETERR("FastReg failed: %d\n", wc->status); + return; + + case IBLND_WID_RDMA: + /* We only get RDMA completion notification if it fails. All + * subsequent work items, including the final SEND will fail + * too. However we can't print out any more info about the + * failing RDMA because 'tx' might be back on the idle list or + * even reused already if we didn't manage to post all our work + * items */ + CNETERR("RDMA (tx: %p) failed: %d\n", + kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_TX: + kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_RX: + kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, + wc->byte_len); + return; + } +} + +void +kiblnd_cq_completion(struct ib_cq *cq, void *arg) +{ + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. 
*/ + struct kib_conn *conn = arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + LASSERT(cq == conn->ibc_cq); + + spin_lock_irqsave(&sched->ibs_lock, flags); + + conn->ibc_ready = 1; + + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); + + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); +} + +void +kiblnd_cq_event(struct ib_event *event, void *arg) +{ + struct kib_conn *conn = arg; + + CERROR("%s: async CQ event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); +} + +int +kiblnd_scheduler(void *arg) +{ + long id = (long)arg; + struct kib_sched_info *sched; + struct kib_conn *conn; + wait_queue_entry_t wait; + unsigned long flags; + struct ib_wc wc; + int did_something; + int busy_loops = 0; + int rc; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + + sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); + if (rc != 0) { + CWARN("Unable to bind on CPU partition %d, please verify " + "whether all CPUs are healthy and reload modules if " + "necessary, otherwise your system might under risk of " + "low performance\n", sched->ibs_cpt); + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + if (busy_loops++ >= IBLND_RESCHED) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + did_something = 0; + + if (!list_empty(&sched->ibs_conns)) { + conn = list_entry(sched->ibs_conns.next, + struct kib_conn, ibc_sched_list); + /* take over kib_sched_conns' ref on conn... */ + LASSERT(conn->ibc_scheduled); + list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + wc.wr_id = IBLND_WID_INVAL; + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + if (rc < 0) { + CWARN("%s: ib_req_notify_cq failed: %d, " + "closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, + flags); + continue; + } + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) { + LCONSOLE_ERROR( + "ib_poll_cq (rc: %d) returned invalid " + "wr_id, opcode %d, status: %d, " + "vendor_err: %d, conn: %s status: %d\n" + "please upgrade firmware and OFED or " + "contact vendor.\n", rc, + wc.opcode, wc.status, wc.vendor_err, + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_state); + rc = -EINVAL; + } + + if (rc < 0) { + CWARN("%s: ib_poll_cq failed: %d, " + "closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, flags); + continue; + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... 
*/ + /* +1 ref for sched_conns */ + kiblnd_conn_addref(conn); + list_add_tail(&conn->ibc_sched_list, + &sched->ibs_conns); + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + kiblnd_complete(&wc); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + kiblnd_conn_decref(conn); /* ...drop my ref from above */ + did_something = 1; + } + + if (did_something) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&sched->ibs_waitq, &wait); + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + schedule(); + busy_loops = 0; + + remove_wait_queue(&sched->ibs_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +int +kiblnd_failover_thread(void *arg) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_dev *dev; + struct net *ns = arg; + wait_queue_entry_t wait; + unsigned long flags; + int rc; + + LASSERT(*kiblnd_tunables.kib_dev_failover != 0); + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + write_lock_irqsave(glock, flags); + + while (!kiblnd_data.kib_shutdown) { + int do_failover = 0; + int long_sleep; + + list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, + ibd_fail_list) { + if (ktime_get_seconds() < dev->ibd_next_failover) + continue; + do_failover = 1; + break; + } + + if (do_failover) { + list_del_init(&dev->ibd_fail_list); + dev->ibd_failover = 1; + write_unlock_irqrestore(glock, flags); + + rc = kiblnd_dev_failover(dev, ns); + + write_lock_irqsave(glock, flags); + + LASSERT (dev->ibd_failover); + dev->ibd_failover = 0; + if (rc >= 0) { /* Device is OK or failover succeed */ + dev->ibd_next_failover = ktime_get_seconds() + 3; + continue; + } + + /* failed to failover, retry later */ + dev->ibd_next_failover = ktime_get_seconds() + + min(dev->ibd_failed_failover, 10); + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + + continue; + } + + /* long sleep if no more pending failover */ + long_sleep = list_empty(&kiblnd_data.kib_failed_devs); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_unlock_irqrestore(glock, flags); + + rc = schedule_timeout(long_sleep ? 
cfs_time_seconds(10) : + cfs_time_seconds(1)); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_lock_irqsave(glock, flags); + + if (!long_sleep || rc != 0) + continue; + + /* have a long sleep, routinely check all active devices; + * we need a check like this because if there is no active + * connection on the dev and no SEND from the local node, we may + * listen on the wrong HCA forever while a bonding failover happens */ + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + } + } + + write_unlock_irqrestore(glock, flags); + + kiblnd_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c new file mode 100644 index 0000000000000..39f9a620d04a4 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -0,0 +1,333 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd_modparams.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +#define CURRENT_LND_VERSION 1 + +static int service = 987; +module_param(service, int, 0444); +MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); + +static int cksum = 0; +module_param(cksum, int, 0644); +MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +module_param(timeout, int, 0644); +MODULE_PARM_DESC(timeout, "timeout (seconds)"); + +/* Number of threads in each scheduler pool, which is per-CPT; + * we will estimate a reasonable value based on CPUs if it's set to zero.
*/ +static int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); + +static unsigned int conns_per_peer = 1; +module_param(conns_per_peer, uint, 0444); +MODULE_PARM_DESC(conns_per_peer, "number of connections per peer"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int ntx = 512; +module_param(ntx, int, 0444); +MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool"); + +/* NB: this value is shared by all CPTs */ +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_credits_hiw = 0; +module_param(peer_credits_hiw, int, 0444); +MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits"); + +static int peer_buffer_credits = 0; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = DEFAULT_PEER_TIMEOUT; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +static char *ipif_name = "ib0"; +module_param(ipif_name, charp, 0444); +MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); + +static int retry_count = 5; +module_param(retry_count, int, 0644); +MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations"); + +static int rnr_retry_count = 6; +module_param(rnr_retry_count, int, 0644); +MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions"); + +static int keepalive = 100; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive"); + +static int ib_mtu; +module_param(ib_mtu, int, 0444); +MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096"); + +static int concurrent_sends; +module_param(concurrent_sends, int, 0444); +MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); + +static int use_fastreg_gaps; +module_param(use_fastreg_gaps, int, 0444); +MODULE_PARM_DESC(use_fastreg_gaps, "Enable discontiguous fastreg fragment support. Expect performance drop"); + +/* + * map_on_demand is a flag used to determine if we can use FMR or FastReg. + * This is applicable for kernels which support global memory regions. For + * later kernels this flag is always enabled, since we will always either + * use FMR or FastReg + * For kernels which support global memory regions map_on_demand defaults + * to 0 which means we will be using global memory regions exclusively. + * If it is set to a value other than 0, then we will behave as follows: + * 1. Always default the number of fragments to IBLND_MAX_RDMA_FRAGS + * 2. Create FMR/FastReg pools + * 3. Negotiate the supported number of fragments per connection + * 4. Attempt to transmit using global memory regions only if + * map-on-demand is not turned on, otherwise use FMR or FastReg + * 5. In case of transmitting tx with GAPS over FMR we will need to + * transmit it with multiple fragments. Look at the comments in + * kiblnd_fmr_map_tx() for an explanation of the behavior. + * + * For later kernels we default map_on_demand to 1 and not allow + * it to be set to 0, since there is no longer support for global memory + * regions. Behavior: + * 1. Default the number of fragments to IBLND_MAX_RDMA_FRAGS + * 2. Create FMR/FastReg pools + * 3. 
Negotiate the supported number of fragments per connection + * 4. Look at the comments in kiblnd_fmr_map_tx() for an explanation of + * the behavior when transmit with GAPS verses contiguous. + */ +#ifdef HAVE_IB_GET_DMA_MR +#define IBLND_DEFAULT_MAP_ON_DEMAND 0 +#define MOD_STR "map on demand" +#else +#define IBLND_DEFAULT_MAP_ON_DEMAND 1 +#define MOD_STR "map on demand (obsolete)" +#endif +static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; +module_param(map_on_demand, int, 0444); +MODULE_PARM_DESC(map_on_demand, MOD_STR); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_pool_size = 512; +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_flush_trigger = 384; +module_param(fmr_flush_trigger, int, 0444); +MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush"); + +static int fmr_cache = 1; +module_param(fmr_cache, int, 0444); +MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching"); + +/* + * 0: disable failover + * 1: enable failover if necessary + * 2: force to failover (for debug) + */ +static int dev_failover = 0; +module_param(dev_failover, int, 0444); +MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)"); + +static int require_privileged_port; +module_param(require_privileged_port, int, 0644); +MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection"); + +static int use_privileged_port = 1; +module_param(use_privileged_port, int, 0644); +MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection"); + +static unsigned int wrq_sge = 2; +module_param(wrq_sge, uint, 0444); +MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request"); + +struct kib_tunables kiblnd_tunables = { + .kib_dev_failover = &dev_failover, + .kib_service = &service, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_default_ipif = &ipif_name, + .kib_retry_count = &retry_count, + .kib_rnr_retry_count = &rnr_retry_count, + .kib_ib_mtu = &ib_mtu, + .kib_require_priv_port = &require_privileged_port, + .kib_use_priv_port = &use_privileged_port, + .kib_nscheds = &nscheds, + .kib_wrq_sge = &wrq_sge, + .kib_use_fastreg_gaps = &use_fastreg_gaps, +}; + +static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; + +/* # messages/RDMAs in-flight */ +int +kiblnd_msg_queue_size(int version, struct lnet_ni *ni) +{ + if (version == IBLND_MSG_VERSION_1) + return IBLND_MSG_QUEUE_SIZE_V1; + else if (ni) + return ni->ni_net->net_tunables.lct_peer_tx_credits; + else + return peer_credits; +} + +int +kiblnd_tunables_setup(struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + + /* + * if there was no tunables specified, setup the tunables to be + * defaulted + */ + if (!ni->ni_lnd_tunables_set) + memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib, + &default_tunables, sizeof(*tunables)); + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* Current API version */ + tunables->lnd_version = CURRENT_LND_VERSION; + + if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { + CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", + *kiblnd_tunables.kib_ib_mtu); + return -EINVAL; + } + + net_tunables = &ni->ni_net->net_tunables; + + if 
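/*
 * Worked example (illustration only, paraphrasing the clamping below):
 * with the module defaults peer_credits = 8 and peer_credits_hiw = 0,
 * lct_peer_tx_credits ends up at 8 (assuming IBLND_CREDITS_DEFAULT does
 * not raise it), lnd_peercredits_hiw is lifted to 8 / 2 = 4, and
 * lnd_concurrent_sends defaults to the full 8 credits, i.e. one
 * in-flight send per peer credit.
 */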
(net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = peer_timeout; + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = credits; + + if (net_tunables->lct_peer_tx_credits == -1) + net_tunables->lct_peer_tx_credits = peer_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = peer_buffer_credits; + + if (net_tunables->lct_peer_tx_credits < IBLND_CREDITS_DEFAULT) + net_tunables->lct_peer_tx_credits = IBLND_CREDITS_DEFAULT; + + if (net_tunables->lct_peer_tx_credits > IBLND_CREDITS_MAX) + net_tunables->lct_peer_tx_credits = IBLND_CREDITS_MAX; + + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + +#ifndef HAVE_IB_GET_DMA_MR + /* + * For kernels which do not support global memory regions, always + * enable map_on_demand + */ + if (tunables->lnd_map_on_demand == 0) + tunables->lnd_map_on_demand = 1; +#endif + + if (!tunables->lnd_peercredits_hiw) + tunables->lnd_peercredits_hiw = peer_credits_hiw; + + if (tunables->lnd_peercredits_hiw < net_tunables->lct_peer_tx_credits / 2) + tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits / 2; + + if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits) + tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1; + + if (tunables->lnd_concurrent_sends == 0) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits; + + if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2; + + if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits / 2) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits / 2; + + if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits) { + CWARN("Concurrent sends %d is lower than message " + "queue size: %d, performance may drop slightly.\n", + tunables->lnd_concurrent_sends, + net_tunables->lct_peer_tx_credits); + } + + if (!tunables->lnd_fmr_pool_size) + tunables->lnd_fmr_pool_size = fmr_pool_size; + if (!tunables->lnd_fmr_flush_trigger) + tunables->lnd_fmr_flush_trigger = fmr_flush_trigger; + if (!tunables->lnd_fmr_cache) + tunables->lnd_fmr_cache = fmr_cache; + if (!tunables->lnd_ntx) + tunables->lnd_ntx = ntx; + if (!tunables->lnd_conns_per_peer) { + tunables->lnd_conns_per_peer = (conns_per_peer) ? 
+ conns_per_peer : 1; + } + + return 0; +} + +int +kiblnd_tunables_init(void) +{ + default_tunables.lnd_version = CURRENT_LND_VERSION; + default_tunables.lnd_peercredits_hiw = peer_credits_hiw; + default_tunables.lnd_map_on_demand = map_on_demand; + default_tunables.lnd_concurrent_sends = concurrent_sends; + default_tunables.lnd_fmr_pool_size = fmr_pool_size; + default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger; + default_tunables.lnd_fmr_cache = fmr_cache; + default_tunables.lnd_ntx = ntx; + default_tunables.lnd_conns_per_peer = conns_per_peer; + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile b/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile new file mode 100644 index 0000000000000..6e6ec925b891f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += ksocklnd.o + +ksocklnd-y := socklnd.o socklnd_cb.o socklnd_lib.o +ksocklnd-y += socklnd_modparams.o socklnd_proto.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 0000000000000..32dda0a5769b3 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,2862 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/socklnd/socklnd.c + * + * Author: Zach Brown + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#include "socklnd.h" +#include + +static struct lnet_lnd the_ksocklnd; +struct ksock_nal_data ksocknal_data; + +static struct ksock_interface * +ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) +{ + struct ksock_net *net = ni->ni_data; + int i; + struct ksock_interface *iface; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(i < LNET_INTERFACES_NUM); + iface = &net->ksnn_interfaces[i]; + + if (iface->ksni_ipaddr == ip) + return iface; + } + + return NULL; +} + +static struct ksock_route * +ksocknal_create_route(__u32 ipaddr, int port) +{ + struct ksock_route *route; + + LIBCFS_ALLOC (route, sizeof (*route)); + if (route == NULL) + return (NULL); + + atomic_set (&route->ksnr_refcount, 1); + route->ksnr_peer = NULL; + route->ksnr_retry_interval = 0; /* OK to connect at any time */ + route->ksnr_ipaddr = ipaddr; + route->ksnr_port = port; + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + route->ksnr_connected = 0; + route->ksnr_deleted = 0; + route->ksnr_conn_count = 0; + route->ksnr_share_count = 0; + + return (route); +} + +void +ksocknal_destroy_route(struct ksock_route *route) +{ + LASSERT (atomic_read(&route->ksnr_refcount) == 0); + + if (route->ksnr_peer != NULL) + ksocknal_peer_decref(route->ksnr_peer); + + LIBCFS_FREE (route, sizeof (*route)); +} + +static int +ksocknal_create_peer(struct ksock_peer_ni **peerp, struct lnet_ni *ni, + struct lnet_process_id id) +{ + int cpt = lnet_cpt_of_nid(id.nid, ni); + struct ksock_net *net = ni->ni_data; + struct ksock_peer_ni *peer_ni; + + LASSERT(id.nid != LNET_NID_ANY); + LASSERT(id.pid != LNET_PID_ANY); + LASSERT(!in_interrupt()); + + LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni)); + if (peer_ni == NULL) + return -ENOMEM; + + peer_ni->ksnp_ni = ni; + peer_ni->ksnp_id = id; + atomic_set(&peer_ni->ksnp_refcount, 1); /* 1 ref for caller */ + peer_ni->ksnp_closing = 0; + peer_ni->ksnp_accepting = 0; + peer_ni->ksnp_proto = NULL; + peer_ni->ksnp_last_alive = 0; + peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + INIT_LIST_HEAD(&peer_ni->ksnp_conns); + INIT_LIST_HEAD(&peer_ni->ksnp_routes); + INIT_LIST_HEAD(&peer_ni->ksnp_tx_queue); + INIT_LIST_HEAD(&peer_ni->ksnp_zc_req_list); + spin_lock_init(&peer_ni->ksnp_lock); + + spin_lock_bh(&net->ksnn_lock); + + if (net->ksnn_shutdown) { + spin_unlock_bh(&net->ksnn_lock); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + CERROR("Can't create peer_ni: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; + + spin_unlock_bh(&net->ksnn_lock); + + *peerp = peer_ni; + return 0; +} + +void +ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni) +{ + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + + CDEBUG (D_NET, "peer_ni %s %p deleted\n", + libcfs_id2str(peer_ni->ksnp_id), peer_ni); + + LASSERT(atomic_read(&peer_ni->ksnp_refcount) == 0); + LASSERT(peer_ni->ksnp_accepting == 0); + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(list_empty(&peer_ni->ksnp_routes)); + LASSERT(list_empty(&peer_ni->ksnp_tx_queue)); + LASSERT(list_empty(&peer_ni->ksnp_zc_req_list)); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + + /* NB a peer_ni's connections and routes keep a reference on their peer_ni + * until they are destroyed, so we can be assured that _all_ state to + * do with this peer_ni has been cleaned up when its refcount drops to + * zero. 
*/ + spin_lock_bh(&net->ksnn_lock); + net->ksnn_npeers--; + spin_unlock_bh(&net->ksnn_lock); +} + +struct ksock_peer_ni * +ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) +{ + struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); + struct list_head *tmp; + struct ksock_peer_ni *peer_ni; + + list_for_each(tmp, peer_list) { + peer_ni = list_entry(tmp, struct ksock_peer_ni, ksnp_list); + + LASSERT(!peer_ni->ksnp_closing); + + if (peer_ni->ksnp_ni != ni) + continue; + + if (peer_ni->ksnp_id.nid != id.nid || + peer_ni->ksnp_id.pid != id.pid) + continue; + + CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d)\n", + peer_ni, libcfs_id2str(id), + atomic_read(&peer_ni->ksnp_refcount)); + return peer_ni; + } + return NULL; +} + +struct ksock_peer_ni * +ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) +{ + struct ksock_peer_ni *peer_ni; + + read_lock(&ksocknal_data.ksnd_global_lock); + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) /* +1 ref for caller? */ + ksocknal_peer_addref(peer_ni); + read_unlock(&ksocknal_data.ksnd_global_lock); + + return (peer_ni); +} + +static void +ksocknal_unlink_peer_locked(struct ksock_peer_ni *peer_ni) +{ + int i; + __u32 ip; + struct ksock_interface *iface; + + for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) { + LASSERT(i < LNET_INTERFACES_NUM); + ip = peer_ni->ksnp_passive_ips[i]; + + iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip); + /* + * All IPs in peer_ni->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. + */ + LASSERT(iface != NULL); + + CDEBUG(D_NET, "peer_ni=%p iface=%p ksni_nroutes=%d\n", + peer_ni, iface, iface->ksni_nroutes); + iface->ksni_npeers--; + } + + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(list_empty(&peer_ni->ksnp_routes)); + LASSERT(!peer_ni->ksnp_closing); + peer_ni->ksnp_closing = 1; + list_del(&peer_ni->ksnp_list); + /* lose peerlist's ref */ + ksocknal_peer_decref(peer_ni); +} + +static int +ksocknal_get_peer_info(struct lnet_ni *ni, int index, + struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) +{ + struct ksock_peer_ni *peer_ni; + struct list_head *ptmp; + struct ksock_route *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); + + if (peer_ni->ksnp_ni != ni) + continue; + + if (peer_ni->ksnp_n_passive_ips == 0 && + list_empty(&peer_ni->ksnp_routes)) { + if (index-- > 0) + continue; + + *id = peer_ni->ksnp_id; + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + *id = peer_ni->ksnp_id; + *myip = peer_ni->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + list_for_each(rtmp, &peer_ni->ksnp_routes) { + if (index-- > 0) + continue; + + route = list_entry(rtmp, struct ksock_route, + ksnr_list); + + *id = peer_ni->ksnp_id; + *myip = route->ksnr_myipaddr; + *peer_ip = route->ksnr_ipaddr; + *port = route->ksnr_port; + *conn_count = route->ksnr_conn_count; + *share_count = route->ksnr_share_count; + rc = 0; + goto out; + } + } + } +out: + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; +} + +static void 
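/*
 * Note on the association helper below (illustrative): linking a
 * connection to its route records which connection types are live by
 * setting the per-type bit, roughly
 *
 *     route->ksnr_connected |= (1 << conn->ksnc_type);
 *     route->ksnr_conn_count++;
 *
 * and a successful connection clears ksnr_retry_interval so further
 * connect attempts may proceed immediately.
 */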
+ksocknal_associate_route_conn_locked(struct ksock_route *route, struct ksock_conn *conn) +{ + struct ksock_peer_ni *peer_ni = route->ksnr_peer; + int type = conn->ksnc_type; + struct ksock_interface *iface; + + conn->ksnc_route = route; + ksocknal_route_addref(route); + + if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { + if (route->ksnr_myipaddr == 0) { + /* route wasn't bound locally yet (the initial route) */ + CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n", + libcfs_id2str(peer_ni->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_myipaddr); + } else { + CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h " + "to %pI4h\n", libcfs_id2str(peer_ni->ksnp_id), + &route->ksnr_ipaddr, + &route->ksnr_myipaddr, + &conn->ksnc_myipaddr); + + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + route->ksnr_myipaddr = conn->ksnc_myipaddr; + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes++; + } + + route->ksnr_connected |= (1<ksnr_conn_count++; + + /* Successful connection => further attempts can + * proceed immediately */ + route->ksnr_retry_interval = 0; +} + +static void +ksocknal_add_route_locked(struct ksock_peer_ni *peer_ni, struct ksock_route *route) +{ + struct list_head *tmp; + struct ksock_conn *conn; + struct ksock_route *route2; + + LASSERT(!peer_ni->ksnp_closing); + LASSERT(route->ksnr_peer == NULL); + LASSERT(!route->ksnr_scheduled); + LASSERT(!route->ksnr_connecting); + LASSERT(route->ksnr_connected == 0); + + /* LASSERT(unique) */ + list_for_each(tmp, &peer_ni->ksnp_routes) { + route2 = list_entry(tmp, struct ksock_route, ksnr_list); + + if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { + CERROR("Duplicate route %s %pI4h\n", + libcfs_id2str(peer_ni->ksnp_id), + &route->ksnr_ipaddr); + LBUG(); + } + } + + route->ksnr_peer = peer_ni; + ksocknal_peer_addref(peer_ni); + /* peer_ni's routelist takes over my ref on 'route' */ + list_add_tail(&route->ksnr_list, &peer_ni->ksnp_routes); + + list_for_each(tmp, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, struct ksock_conn, ksnc_list); + + if (conn->ksnc_ipaddr != route->ksnr_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + /* keep going (typed routes) */ + } +} + +static void +ksocknal_del_route_locked(struct ksock_route *route) +{ + struct ksock_peer_ni *peer_ni = route->ksnr_peer; + struct ksock_interface *iface; + struct ksock_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; + + LASSERT(!route->ksnr_deleted); + + /* Close associated conns */ + list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); + + if (conn->ksnc_route != route) + continue; + + ksocknal_close_conn_locked(conn, 0); + } + + if (route->ksnr_myipaddr != 0) { + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + + route->ksnr_deleted = 1; + list_del(&route->ksnr_list); + ksocknal_route_decref(route); /* drop peer_ni's ref */ + + if (list_empty(&peer_ni->ksnp_routes) && + list_empty(&peer_ni->ksnp_conns)) { + /* I've just removed the last route to a peer_ni with no active + * connections */ + ksocknal_unlink_peer_locked(peer_ni); + } +} + +int +ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, + int port) +{ + struct list_head *tmp; + struct ksock_peer_ni *peer_ni; + struct ksock_peer_ni *peer2; + struct ksock_route *route; + struct 
ksock_route *route2; + int rc; + + if (id.nid == LNET_NID_ANY || + id.pid == LNET_PID_ANY) + return (-EINVAL); + + /* Have a brand new peer_ni ready... */ + rc = ksocknal_create_peer(&peer_ni, ni, id); + if (rc != 0) + return rc; + + route = ksocknal_create_route (ipaddr, port); + if (route == NULL) { + ksocknal_peer_decref(peer_ni); + return (-ENOMEM); + } + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT(((struct ksock_net *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, id); + if (peer2 != NULL) { + ksocknal_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* peer_ni table takes my ref on peer_ni */ + list_add_tail(&peer_ni->ksnp_list, + ksocknal_nid2peerlist(id.nid)); + } + + route2 = NULL; + list_for_each(tmp, &peer_ni->ksnp_routes) { + route2 = list_entry(tmp, struct ksock_route, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; + } + if (route2 == NULL) { + ksocknal_add_route_locked(peer_ni, route); + route->ksnr_share_count++; + } else { + ksocknal_route_decref(route); + route2->ksnr_share_count++; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return 0; +} + +static void +ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) +{ + struct ksock_conn *conn; + struct ksock_route *route; + struct list_head *tmp; + struct list_head *nxt; + int nshared; + + LASSERT(!peer_ni->ksnp_closing); + + /* Extra ref prevents peer_ni disappearing until I'm done with it */ + ksocknal_peer_addref(peer_ni); + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { + route = list_entry(tmp, struct ksock_route, ksnr_list); + + /* no match */ + if (!(ip == 0 || route->ksnr_ipaddr == ip)) + continue; + + route->ksnr_share_count = 0; + /* This deletes associated conns too */ + ksocknal_del_route_locked(route); + } + + nshared = 0; + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { + route = list_entry(tmp, struct ksock_route, ksnr_list); + nshared += route->ksnr_share_count; + } + + if (nshared == 0) { + /* remove everything else if there are no explicit entries + * left */ + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { + route = list_entry(tmp, struct ksock_route, ksnr_list); + + /* we should only be removing auto-entries */ + LASSERT(route->ksnr_share_count == 0); + ksocknal_del_route_locked(route); + } + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, struct ksock_conn, ksnc_list); + + ksocknal_close_conn_locked(conn, 0); + } + } + + ksocknal_peer_decref(peer_ni); + /* NB peer_ni unlinks itself when last conn/route is removed */ +} + +static int +ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + struct ksock_peer_ni *peer_ni; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) { + hi = (int)(ksocknal_nid2peerlist(id.nid) - + ksocknal_data.ksnd_peers); + lo = hi; + } else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); + + if (peer_ni->ksnp_ni != ni) + continue; + + if (!((id.nid == LNET_NID_ANY || + peer_ni->ksnp_id.nid == id.nid) && + (id.pid == LNET_PID_ANY || + peer_ni->ksnp_id.pid == 
id.pid))) + continue; + + ksocknal_peer_addref(peer_ni); /* a ref for me... */ + + ksocknal_del_peer_locked(peer_ni, ip); + + if (peer_ni->ksnp_closing && + !list_empty(&peer_ni->ksnp_tx_queue)) { + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(list_empty(&peer_ni->ksnp_routes)); + + list_splice_init(&peer_ni->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer_ni); /* ...till here */ + + rc = 0; /* matched! */ + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, -ENETDOWN); + + return rc; +} + +static struct ksock_conn * +ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) +{ + struct ksock_peer_ni *peer_ni; + struct list_head *ptmp; + struct ksock_conn *conn; + struct list_head *ctmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); + + LASSERT(!peer_ni->ksnp_closing); + + if (peer_ni->ksnp_ni != ni) + continue; + + list_for_each(ctmp, &peer_ni->ksnp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, struct ksock_conn, + ksnc_list); + ksocknal_conn_addref(conn); + read_unlock(&ksocknal_data. \ + ksnd_global_lock); + return conn; + } + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return NULL; +} + +static struct ksock_sched * +ksocknal_choose_scheduler_locked(unsigned int cpt) +{ + struct ksock_sched *sched = ksocknal_data.ksnd_schedulers[cpt]; + int i; + + if (sched->kss_nthreads == 0) { + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + if (sched->kss_nthreads > 0) { + CDEBUG(D_NET, "scheduler[%d] has no threads. selected scheduler[%d]\n", + cpt, sched->kss_cpt); + return sched; + } + } + return NULL; + } + + return sched; +} + +static int +ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) +{ + struct ksock_net *net = ni->ni_data; + int i; + int nip; + + read_lock(&ksocknal_data.ksnd_global_lock); + + nip = net->ksnn_ninterfaces; + LASSERT(nip <= LNET_INTERFACES_NUM); + + /* + * Only offer interfaces for additional connections if I have + * more than one. + */ + if (nip < 2) { + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + + for (i = 0; i < nip; i++) { + ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; + LASSERT(ipaddrs[i] != 0); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return nip; +} + +static int +ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips) +{ + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; + + for (i = 0; i < nips; i++) { + if (ips[i] == 0) + continue; + + this_xor = (ips[i] ^ iface->ksni_ipaddr); + this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 
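/*
 * Illustrative note on this match loop: the peer IP that best fits the
 * interface wins -- a same-subnet address (netmask match) always beats a
 * foreign one, and ties are broken by the smallest XOR distance from the
 * interface address.  E.g. for an interface at 192.168.1.10/24,
 * 192.168.1.20 beats 10.0.0.20, and 192.168.1.11 beats 192.168.1.200.
 */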
1 : 0; + + if (!(best < 0 || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_xor > this_xor))) + continue; + + best = i; + best_netmatch = this_netmatch; + best_xor = this_xor; + } + + LASSERT (best >= 0); + return (best); +} + +static int +ksocknal_select_ips(struct ksock_peer_ni *peer_ni, __u32 *peerips, int n_peerips) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + struct ksock_interface *iface; + struct ksock_interface *best_iface; + int n_ips; + int i; + int j; + int k; + u32 ip; + u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness shouldn't matter */ + + /* Also note that I'm not going to return more than n_peerips + * interfaces, even if I have more myself */ + + write_lock_bh(global_lock); + + LASSERT(n_peerips <= LNET_INTERFACES_NUM); + LASSERT(net->ksnn_ninterfaces <= LNET_INTERFACES_NUM); + + /* Only match interfaces for additional connections + * if I have > 1 interface */ + n_ips = (net->ksnn_ninterfaces < 2) ? 0 : + MIN(n_peerips, net->ksnn_ninterfaces); + + for (i = 0; peer_ni->ksnp_n_passive_ips < n_ips; i++) { + /* ^ yes really... */ + + /* If we have any new interfaces, first tick off all the + * peer_ni IPs that match old interfaces, then choose new + * interfaces to match the remaining peer_ni IPS. + * We don't forget interfaces we've stopped using; we might + * start using them again... */ + + if (i < peer_ni->ksnp_n_passive_ips) { + /* Old interface. */ + ip = peer_ni->ksnp_passive_ips[i]; + best_iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip); + + /* peer_ni passive ips are kept up to date */ + LASSERT(best_iface != NULL); + } else { + /* choose a new interface */ + LASSERT (i == peer_ni->ksnp_n_passive_ips); + + best_iface = NULL; + best_netmatch = 0; + best_npeers = 0; + + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + ip = iface->ksni_ipaddr; + + for (k = 0; k < peer_ni->ksnp_n_passive_ips; k++) + if (peer_ni->ksnp_passive_ips[k] == ip) + break; + + if (k < peer_ni->ksnp_n_passive_ips) /* using it already */ + continue; + + k = ksocknal_match_peerip(iface, peerips, n_peerips); + xor = (ip ^ peerips[k]); + this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 
1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_npeers > iface->ksni_npeers))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_npeers = iface->ksni_npeers; + } + + LASSERT(best_iface != NULL); + + best_iface->ksni_npeers++; + ip = best_iface->ksni_ipaddr; + peer_ni->ksnp_passive_ips[i] = ip; + peer_ni->ksnp_n_passive_ips = i+1; + } + + /* mark the best matching peer_ni IP used */ + j = ksocknal_match_peerip(best_iface, peerips, n_peerips); + peerips[j] = 0; + } + + /* Overwrite input peer_ni IP addresses */ + memcpy(peerips, peer_ni->ksnp_passive_ips, n_ips * sizeof(*peerips)); + + write_unlock_bh(global_lock); + + return (n_ips); +} + +static void +ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, + __u32 *peer_ipaddrs, int npeer_ipaddrs) +{ + struct ksock_route *newroute = NULL; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct lnet_ni *ni = peer_ni->ksnp_ni; + struct ksock_net *net = ni->ni_data; + struct list_head *rtmp; + struct ksock_route *route; + struct ksock_interface *iface; + struct ksock_interface *best_iface; + int best_netmatch; + int this_netmatch; + int best_nroutes; + int i; + int j; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness here shouldn't matter */ + + write_lock_bh(global_lock); + + if (net->ksnn_ninterfaces < 2) { + /* Only create additional connections + * if I have > 1 interface */ + write_unlock_bh(global_lock); + return; + } + + LASSERT(npeer_ipaddrs <= LNET_INTERFACES_NUM); + + for (i = 0; i < npeer_ipaddrs; i++) { + if (newroute != NULL) { + newroute->ksnr_ipaddr = peer_ipaddrs[i]; + } else { + write_unlock_bh(global_lock); + + newroute = ksocknal_create_route(peer_ipaddrs[i], port); + if (newroute == NULL) + return; + + write_lock_bh(global_lock); + } + + if (peer_ni->ksnp_closing) { + /* peer_ni got closed under me */ + break; + } + + /* Already got a route? */ + route = NULL; + list_for_each(rtmp, &peer_ni->ksnp_routes) { + route = list_entry(rtmp, struct ksock_route, ksnr_list); + + if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + best_iface = NULL; + best_nroutes = 0; + best_netmatch = 0; + + LASSERT(net->ksnn_ninterfaces <= LNET_INTERFACES_NUM); + + /* Select interface to connect from */ + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + + /* Using this interface already? */ + list_for_each(rtmp, &peer_ni->ksnp_routes) { + route = list_entry(rtmp, struct ksock_route, + ksnr_list); + + if (route->ksnr_myipaddr == iface->ksni_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + this_netmatch = (((iface->ksni_ipaddr ^ + newroute->ksnr_ipaddr) & + iface->ksni_netmask) == 0) ? 
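/*
 * Illustrative note: when binding a new route to a local interface, a
 * subnet match is preferred first and, among equally good matches, the
 * interface currently carrying the fewest routes is chosen, spreading
 * the connections across the available interfaces.
 */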
1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_nroutes > iface->ksni_nroutes))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_nroutes = iface->ksni_nroutes; + } + + if (best_iface == NULL) + continue; + + newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; + best_iface->ksni_nroutes++; + + ksocknal_add_route_locked(peer_ni, newroute); + newroute = NULL; + } + + write_unlock_bh(global_lock); + if (newroute != NULL) + ksocknal_route_decref(newroute); +} + +int +ksocknal_accept(struct lnet_ni *ni, struct socket *sock) +{ + struct ksock_connreq *cr; + int rc; + u32 peer_ip; + int peer_port; + + rc = lnet_sock_getaddr(sock, true, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from " + "%pI4h: memory exhausted\n", &peer_ip); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + return 0; +} + +static int +ksocknal_connecting(struct ksock_peer_ni *peer_ni, __u32 ipaddr) +{ + struct ksock_route *route; + + list_for_each_entry(route, &peer_ni->ksnp_routes, ksnr_list) { + if (route->ksnr_ipaddr == ipaddr) + return route->ksnr_connecting; + } + return 0; +} + +int +ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, + struct socket *sock, int type) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct lnet_process_id peerid; + struct list_head *tmp; + u64 incarnation; + struct ksock_conn *conn; + struct ksock_conn *conn2; + struct ksock_peer_ni *peer_ni = NULL; + struct ksock_peer_ni *peer2; + struct ksock_sched *sched; + struct ksock_hello_msg *hello; + int cpt; + struct ksock_tx *tx; + struct ksock_tx *txtmp; + int rc; + int rc2; + int active; + char *warn = NULL; + + active = (route != NULL); + + LASSERT (active == (type != SOCKLND_CONN_NONE)); + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + conn->ksnc_peer = NULL; + conn->ksnc_route = NULL; + conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + atomic_set (&conn->ksnc_sock_refcount, 2); + conn->ksnc_type = type; + ksocknal_lib_save_callback(sock, conn); + atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + + INIT_LIST_HEAD(&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_carrier = NULL; + atomic_set (&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_INTERFACES_NUM])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + + /* stash conn's local and remote addrs */ + rc = ksocknal_lib_get_conn_addrs (conn); + if (rc != 0) + goto failed_1; + + /* Find out/confirm peer_ni's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. 
+ * Passive connections use the listener timeout since the peer_ni sends + * eagerly */ + + if (active) { + peer_ni = route->ksnr_peer; + LASSERT(ni == peer_ni->ksnp_ni); + + /* Active connection sends HELLO eagerly */ + hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); + peerid = peer_ni->ksnp_id; + + write_lock_bh(global_lock); + conn->ksnc_proto = peer_ni->ksnp_proto; + write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } + + rc = ksocknal_send_hello (ni, conn, peerid.nid, hello); + if (rc != 0) + goto failed_1; + } else { + peerid.nid = LNET_NID_ANY; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer_ni */ + conn->ksnc_proto = NULL; + } + + rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation); + if (rc < 0) + goto failed_1; + + LASSERT (rc == 0 || active); + LASSERT (conn->ksnc_proto != NULL); + LASSERT (peerid.nid != LNET_NID_ANY); + + cpt = lnet_cpt_of_nid(peerid.nid, ni); + + if (active) { + ksocknal_peer_addref(peer_ni); + write_lock_bh(global_lock); + } else { + rc = ksocknal_create_peer(&peer_ni, ni, peerid); + if (rc != 0) + goto failed_1; + + write_lock_bh(global_lock); + + /* called with a ref on ni, so shutdown can't have started */ + LASSERT(((struct ksock_net *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, peerid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer_ni in the peer_ni + * table (which takes my ref) */ + list_add_tail(&peer_ni->ksnp_list, + ksocknal_nid2peerlist(peerid.nid)); + } else { + ksocknal_peer_decref(peer_ni); + peer_ni = peer2; + } + + /* +1 ref for me */ + ksocknal_peer_addref(peer_ni); + peer_ni->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... */ + if (peerid.nid < ni->ni_nid && + ksocknal_connecting(peer_ni, conn->ksnc_ipaddr)) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } + } + + if (peer_ni->ksnp_closing || + (active && route->ksnr_deleted)) { + /* peer_ni/route got closed under me */ + rc = -ESTALE; + warn = "peer_ni/route removed"; + goto failed_2; + } + + if (peer_ni->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer_ni + * wants a different protocol than the one I asked for. + */ + LASSERT(list_empty(&peer_ni->ksnp_conns)); + + peer_ni->ksnp_proto = conn->ksnc_proto; + peer_ni->ksnp_incarnation = incarnation; + } + + if (peer_ni->ksnp_proto != conn->ksnc_proto || + peer_ni->ksnp_incarnation != incarnation) { + /* peer_ni rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer_ni, 0, 0); + + peer_ni->ksnp_proto = NULL; + rc = ESTALE; + warn = peer_ni->ksnp_incarnation != incarnation ? 
+ "peer_ni rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; + goto failed_2; + } + + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ + if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { + list_for_each(tmp, &peer_ni->ksnp_conns) { + conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); + + if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || + conn2->ksnc_myipaddr != conn->ksnc_myipaddr || + conn2->ksnc_type != conn->ksnc_type) + continue; + + /* Reply on a passive connection attempt so the peer_ni + * realises we're connected. */ + LASSERT (rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; + goto failed_2; + } + } + + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. */ + if (active && + route->ksnr_ipaddr != conn->ksnc_ipaddr) { + CERROR("Route %s %pI4h connected to %pI4h\n", + libcfs_id2str(peer_ni->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_ipaddr); + } + + /* Search for a route corresponding to the new connection and + * create an association. This allows incoming connections created + * by routes in my peer_ni to match my own route entries so I don't + * continually create duplicate routes. */ + list_for_each(tmp, &peer_ni->ksnp_routes) { + route = list_entry(tmp, struct ksock_route, ksnr_list); + + if (route->ksnr_ipaddr != conn->ksnc_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + break; + } + + conn->ksnc_peer = peer_ni; /* conn takes my ref on peer_ni */ + peer_ni->ksnp_last_alive = ktime_get_seconds(); + peer_ni->ksnp_send_keepalive = 0; + peer_ni->ksnp_error = 0; + + sched = ksocknal_choose_scheduler_locked(cpt); + if (!sched) { + CERROR("no schedulers available. node is unhealthy\n"); + goto failed_2; + } + /* + * The cpt might have changed if we ended up selecting a non cpt + * native scheduler. So use the scheduler's cpt instead. + */ + cpt = sched->kss_cpt; + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + conn->ksnc_tx_last_post = ktime_get_seconds(); + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + smp_mb(); /* order with adding to peer_ni's conn list */ + + list_add(&conn->ksnc_list, &peer_ni->ksnp_conns); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. */ + list_for_each_entry_safe(tx, txtmp, &peer_ni->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == + SOCKNAL_MATCH_NO) + continue; + + list_del(&tx->tx_list); + ksocknal_queue_tx_locked(tx, conn); + } + + write_unlock_bh(global_lock); + + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. 
+ */ + + CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d" + " incarnation:%lld sched[%d]\n", + libcfs_id2str(peerid), conn->ksnc_proto->pro_version, + &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, + conn->ksnc_port, incarnation, cpt); + + if (active) { + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer_ni, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + } else { + hello->kshm_nips = ksocknal_select_ips(peer_ni, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_INTERFACES_NUM])); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer_ni->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. */ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + +failed_2: + if (!peer_ni->ksnp_closing && + list_empty(&peer_ni->ksnp_conns) && + list_empty(&peer_ni->ksnp_routes)) { + list_add(&zombies, &peer_ni->ksnp_tx_queue); + list_del_init(&peer_ni->ksnp_tx_queue); + ksocknal_unlink_peer_locked(peer_ni); + } + + write_unlock_bh(global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + write_lock_bh(global_lock); + peer_ni->ksnp_accepting--; + write_unlock_bh(global_lock); + } + + /* + * If we get here without an error code, just use -EALREADY. + * Depending on how we got here, the error may be positive + * or negative. Normalize the value for ksocknal_txlist_done(). + */ + rc2 = (rc == 0 ? -EALREADY : (rc > 0 ? -rc : rc)); + ksocknal_txlist_done(ni, &zombies, rc2); + ksocknal_peer_decref(peer_ni); + +failed_1: + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_INTERFACES_NUM])); + + LIBCFS_FREE(conn, sizeof(*conn)); + +failed_0: + sock_release(sock); + return rc; +} + +void +ksocknal_close_conn_locked(struct ksock_conn *conn, int error) +{ + /* This just does the immmediate housekeeping, and queues the + * connection for the reaper to terminate. 
+ * Caller holds ksnd_global_lock exclusively in irq context */ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_route *route; + struct ksock_conn *conn2; + struct list_head *tmp; + + LASSERT(peer_ni->ksnp_error == 0); + LASSERT(!conn->ksnc_closing); + conn->ksnc_closing = 1; + + /* ksnd_deathrow_conns takes over peer_ni's ref */ + list_del(&conn->ksnc_list); + + route = conn->ksnc_route; + if (route != NULL) { + /* dissociate conn from route... */ + LASSERT(!route->ksnr_deleted); + LASSERT((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); + + conn2 = NULL; + list_for_each(tmp, &peer_ni->ksnp_conns) { + conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); + + if (conn2->ksnc_route == route && + conn2->ksnc_type == conn->ksnc_type) + break; + + conn2 = NULL; + } + if (conn2 == NULL) + route->ksnr_connected &= ~(1 << conn->ksnc_type); + + conn->ksnc_route = NULL; + + ksocknal_route_decref(route); /* drop conn's ref on route */ + } + + if (list_empty(&peer_ni->ksnp_conns)) { + /* No more connections to this peer_ni */ + + if (!list_empty(&peer_ni->ksnp_tx_queue)) { + struct ksock_tx *tx; + + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); + + /* throw them to the last connection..., + * these TXs will be send to /dev/null by scheduler */ + list_for_each_entry(tx, &peer_ni->ksnp_tx_queue, + tx_list) + ksocknal_tx_prep(conn, tx); + + spin_lock_bh(&conn->ksnc_scheduler->kss_lock); + list_splice_init(&peer_ni->ksnp_tx_queue, + &conn->ksnc_tx_queue); + spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); + } + + /* renegotiate protocol version */ + peer_ni->ksnp_proto = NULL; + /* stash last conn close reason */ + peer_ni->ksnp_error = error; + + if (list_empty(&peer_ni->ksnp_routes)) { + /* I've just closed last conn belonging to a + * peer_ni with no routes to it */ + ksocknal_unlink_peer_locked(peer_ni); + } + } + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, + &ksocknal_data.ksnd_deathrow_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_peer_failed(struct ksock_peer_ni *peer_ni) +{ + int notify = 0; + time64_t last_alive = 0; + + /* There has been a connection failure or comms error; but I'll only + * tell LNET I think the peer_ni is dead if it's to another kernel and + * there are no connections or connection attempts in existence. 
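 * The notification below passes alive = 0 together with the last
 * recorded ksnp_last_alive timestamp (an illustrative reading; the
 * exact handling is up to lnet_notify()).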
*/ + + read_lock(&ksocknal_data.ksnd_global_lock); + + if ((peer_ni->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer_ni->ksnp_conns) && + peer_ni->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer_ni) == NULL) { + notify = 1; + last_alive = peer_ni->ksnp_last_alive; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify(peer_ni->ksnp_ni, peer_ni->ksnp_id.nid, 0, + last_alive); +} + +void +ksocknal_finalize_zcreq(struct ksock_conn *conn) +{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT(conn->ksnc_sock == NULL); + + spin_lock(&peer_ni->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer_ni->ksnp_zc_req_list, tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_zc_aborted = 1; /* mark it as not-acked */ + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + } + + spin_unlock(&peer_ni->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, struct ksock_tx, tx_zc_list); + + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } +} + +void +ksocknal_terminate_conn(struct ksock_conn *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. + * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. */ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_sched *sched = conn->ksnc_scheduler; + int failed = 0; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_bh(&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)) { + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + /* serialise with callbacks */ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); + + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... */ + conn->ksnc_scheduler->kss_nconns--; + + if (peer_ni->ksnp_error != 0) { + /* peer_ni's last conn closed in error */ + LASSERT(list_empty(&peer_ni->ksnp_conns)); + failed = 1; + peer_ni->ksnp_error = 0; /* avoid multiple notifications */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer_ni); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. 
*/ + ksocknal_connsock_decref(conn); +} + +void +ksocknal_queue_zombie_conn(struct ksock_conn *conn) +{ + /* Queue the conn for the reaper to destroy */ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_destroy_conn(struct ksock_conn *conn) +{ + time64_t last_rcv; + + /* Final coup-de-grace of the reaper */ + CDEBUG (D_NET, "connection %p\n", conn); + + LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0); + LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0); + LASSERT (conn->ksnc_sock == NULL); + LASSERT (conn->ksnc_route == NULL); + LASSERT (!conn->ksnc_tx_scheduled); + LASSERT (!conn->ksnc_rx_scheduled); + LASSERT(list_empty(&conn->ksnc_tx_queue)); + + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + lnet_get_lnd_timeout(); + CERROR("Completing partial receive from %s[%d], " + "ip %pI4h:%d, with error, wanted: %d, left: %d, " + "last alive is %lld secs ago\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + ktime_get_seconds() - last_rcv); + if (conn->ksnc_lnet_msg) + conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, -EIO); + break; + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s, " + "ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from %s, " + "ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s, " + "ip %pI4h:%d, with error\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + break; + default: + LBUG (); + break; + } + + ksocknal_peer_decref(conn->ksnc_peer); + + LIBCFS_FREE (conn, sizeof (*conn)); +} + +int +ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr, int why) +{ + struct ksock_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); + + if (ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr) { + count++; + ksocknal_close_conn_locked (conn, why); + } + } + + return (count); +} + +int +ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) +{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + u32 ipaddr = conn->ksnc_ipaddr; + int count; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + count = ksocknal_close_peer_conns_locked (peer_ni, ipaddr, why); + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return (count); +} + +int +ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) +{ + struct ksock_peer_ni *peer_ni; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + 
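/*
 * Illustrative note: a wildcard request (LNET_NID_ANY, LNET_PID_ANY or
 * ipaddr == 0) walks the whole peer hash and always reports success,
 * whereas a fully specified request returns -ENOENT when no matching
 * connection was found to close.
 */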
write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { + + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); + + if (!((id.nid == LNET_NID_ANY || id.nid == peer_ni->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || id.pid == peer_ni->ksnp_id.pid))) + continue; + + count += ksocknal_close_peer_conns_locked (peer_ni, ipaddr, 0); + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + /* wildcards always succeed */ + if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0) + return (0); + + return (count == 0 ? -ENOENT : 0); +} + +void +ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... + */ + struct lnet_process_id id = { + .nid = gw_nid, + .pid = LNET_PID_ANY, + }; + + CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), + alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_matching_conns (id, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autroutes, and these connect on demand. */ +} + +void +ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) +{ + int connect = 1; + time64_t last_alive = 0; + time64_t now = ktime_get_seconds(); + struct ksock_peer_ni *peer_ni = NULL; + rwlock_t *glock = &ksocknal_data.ksnd_global_lock; + struct lnet_process_id id = { + .nid = nid, + .pid = LNET_PID_LUSTRE, + }; + + read_lock(glock); + + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) { + struct list_head *tmp; + struct ksock_conn *conn; + int bufnob; + + list_for_each(tmp, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, struct ksock_conn, ksnc_list); + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + + if (bufnob < conn->ksnc_tx_bufnob) { + /* something got ACKed */ + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + peer_ni->ksnp_last_alive = now; + conn->ksnc_tx_bufnob = bufnob; + } + } + + last_alive = peer_ni->ksnp_last_alive; + if (ksocknal_find_connectable_route_locked(peer_ni) == NULL) + connect = 0; + } + + read_unlock(glock); + + if (last_alive != 0) + *when = last_alive; + + CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago, connect %d\n", + libcfs_nid2str(nid), peer_ni, + last_alive ? 
now - last_alive : -1, + connect); + + if (!connect) + return; + + ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); + + write_lock_bh(glock); + + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) + ksocknal_launch_all_connections_locked(peer_ni); + + write_unlock_bh(glock); + return; +} + +static void +ksocknal_push_peer(struct ksock_peer_ni *peer_ni) +{ + int index; + int i; + struct list_head *tmp; + struct ksock_conn *conn; + + for (index = 0; ; index++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + i = 0; + conn = NULL; + + list_for_each(tmp, &peer_ni->ksnp_conns) { + if (i++ == index) { + conn = list_entry(tmp, struct ksock_conn, + ksnc_list); + ksocknal_conn_addref(conn); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (conn == NULL) + break; + + ksocknal_lib_push_conn (conn); + ksocknal_conn_decref(conn); + } +} + +static int +ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) +{ + struct list_head *start; + struct list_head *end; + struct list_head *tmp; + int rc = -ENOENT; + unsigned int hsize = ksocknal_data.ksnd_peer_hash_size; + + if (id.nid == LNET_NID_ANY) { + start = &ksocknal_data.ksnd_peers[0]; + end = &ksocknal_data.ksnd_peers[hsize - 1]; + } else { + start = end = ksocknal_nid2peerlist(id.nid); + } + + for (tmp = start; tmp <= end; tmp++) { + int peer_off; /* searching offset in peer_ni hash table */ + + for (peer_off = 0; ; peer_off++) { + struct ksock_peer_ni *peer_ni; + int i = 0; + + read_lock(&ksocknal_data.ksnd_global_lock); + list_for_each_entry(peer_ni, tmp, ksnp_list) { + if (!((id.nid == LNET_NID_ANY || + id.nid == peer_ni->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || + id.pid == peer_ni->ksnp_id.pid))) + continue; + + if (i++ == peer_off) { + ksocknal_peer_addref(peer_ni); + break; + } + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (i == 0) /* no match */ + break; + + rc = 0; + ksocknal_push_peer(peer_ni); + ksocknal_peer_decref(peer_ni); + } + } + return rc; +} + +static int +ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) +{ + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; + int rc; + int i; + int j; + struct list_head *ptmp; + struct ksock_peer_ni *peer_ni; + struct list_head *rtmp; + struct ksock_route *route; + + if (ipaddress == 0 || + netmask == 0) + return -EINVAL; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + iface = ksocknal_ip2iface(ni, ipaddress); + if (iface != NULL) { + /* silently ignore dups */ + rc = 0; + } else if (net->ksnn_ninterfaces == LNET_INTERFACES_NUM) { + rc = -ENOSPC; + } else { + iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; + + iface->ksni_ipaddr = ipaddress; + iface->ksni_netmask = netmask; + iface->ksni_nroutes = 0; + iface->ksni_npeers = 0; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(ptmp, struct ksock_peer_ni, + ksnp_list); + + for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) + if (peer_ni->ksnp_passive_ips[j] == ipaddress) + iface->ksni_npeers++; + + list_for_each(rtmp, &peer_ni->ksnp_routes) { + route = list_entry(rtmp, + struct ksock_route, + ksnr_list); + + if (route->ksnr_myipaddr == ipaddress) + iface->ksni_nroutes++; + } + } + } + + rc = 0; + /* NB only new connections will pay attention to the new interface! 
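+		 * Existing connections keep using whatever interface they
+		 * were bound to until they are re-established.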
*/ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +static void +ksocknal_peer_del_interface_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr) +{ + struct list_head *tmp; + struct list_head *nxt; + struct ksock_route *route; + struct ksock_conn *conn; + int i; + int j; + + for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) + if (peer_ni->ksnp_passive_ips[i] == ipaddr) { + for (j = i+1; j < peer_ni->ksnp_n_passive_ips; j++) + peer_ni->ksnp_passive_ips[j-1] = + peer_ni->ksnp_passive_ips[j]; + peer_ni->ksnp_n_passive_ips--; + break; + } + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { + route = list_entry(tmp, struct ksock_route, ksnr_list); + + if (route->ksnr_myipaddr != ipaddr) + continue; + + if (route->ksnr_share_count != 0) { + /* Manually created; keep, but unbind */ + route->ksnr_myipaddr = 0; + } else { + ksocknal_del_route_locked(route); + } + } + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, struct ksock_conn, ksnc_list); + + if (conn->ksnc_myipaddr == ipaddr) + ksocknal_close_conn_locked (conn, 0); + } +} + +static int +ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) +{ + struct ksock_net *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + struct ksock_peer_ni *peer_ni; + u32 this_ip; + int i; + int j; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + this_ip = net->ksnn_interfaces[i].ksni_ipaddr; + + if (!(ipaddress == 0 || + ipaddress == this_ip)) + continue; + + rc = 0; + + for (j = i+1; j < net->ksnn_ninterfaces; j++) + net->ksnn_interfaces[j-1] = + net->ksnn_interfaces[j]; + + net->ksnn_ninterfaces--; + + for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { + list_for_each_safe(tmp, nxt, + &ksocknal_data.ksnd_peers[j]) { + peer_ni = list_entry(tmp, struct ksock_peer_ni, + ksnp_list); + + if (peer_ni->ksnp_ni != ni) + continue; + + ksocknal_peer_del_interface_locked(peer_ni, this_ip); + } + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +int +ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) +{ + struct lnet_process_id id = {0}; + struct libcfs_ioctl_data *data = arg; + int rc; + + switch(cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; + + read_lock(&ksocknal_data.ksnd_global_lock); + + if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) { + rc = -ENOENT; + } else { + rc = 0; + iface = &net->ksnn_interfaces[data->ioc_count]; + + data->ioc_u32[0] = iface->ksni_ipaddr; + data->ioc_u32[1] = iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; + } + + case IOC_LIBCFS_ADD_INTERFACE: + return ksocknal_add_interface(ni, + data->ioc_u32[0], /* IP address */ + data->ioc_u32[1]); /* net mask */ + + case IOC_LIBCFS_DEL_INTERFACE: + return ksocknal_del_interface(ni, + data->ioc_u32[0]); /* IP address */ + + case IOC_LIBCFS_GET_PEER: { + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id.pid; + return 0; + } + + 
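+	/* The remaining ioctls manage peers and connections: most fill in
+	 * 'id' from the request and delegate to a ksocknal_*() helper. */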
case IOC_LIBCFS_ADD_PEER: + id.nid = data->ioc_nid; + id.pid = LNET_PID_LUSTRE; + return ksocknal_add_peer (ni, id, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ + + case IOC_LIBCFS_DEL_PEER: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_del_peer (ni, id, + data->ioc_u32[0]); /* IP */ + + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + struct ksock_conn *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); + + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; + data->ioc_flags = nagle; + data->ioc_u32[0] = conn->ksnc_ipaddr; + data->ioc_u32[1] = conn->ksnc_port; + data->ioc_u32[2] = conn->ksnc_myipaddr; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_cpt; + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; + } + + case IOC_LIBCFS_CLOSE_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_close_matching_conns (id, + data->ioc_u32[0]); + + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) + return 0; + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + + case IOC_LIBCFS_PUSH_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_push(ni, id); + + default: + return -EINVAL; + } + /* not reached */ +} + +static void +ksocknal_free_buffers (void) +{ + LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); + + if (ksocknal_data.ksnd_schedulers != NULL) + cfs_percpt_free(ksocknal_data.ksnd_schedulers); + + LIBCFS_FREE (ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + struct list_head zlist; + struct ksock_tx *tx; + + list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); + list_del_init(&ksocknal_data.ksnd_idle_noop_txs); + spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, struct ksock_tx, tx_list); + list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } +} + +static void +ksocknal_base_shutdown(void) +{ + struct ksock_sched *sched; + int i; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&libcfs_kmemory)); + LASSERT (ksocknal_data.ksnd_nnets == 0); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT(0); + fallthrough; + + case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT(ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); + + LASSERT(list_empty(&ksocknal_data.ksnd_nets)); + LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); + + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) { + + LASSERT(list_empty(&sched->kss_tx_conns)); + LASSERT(list_empty(&sched->kss_rx_conns)); + LASSERT(list_empty(&sched->kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } + } + + /* flag threads 
to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all(&ksocknal_data.ksnd_connd_waitq); + wake_up_all(&ksocknal_data.ksnd_reaper_waitq); + + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) + wake_up_all(&sched->kss_waitq); + } + + i = 4; + read_lock(&ksocknal_data.ksnd_global_lock); + while (ksocknal_data.ksnd_nthreads != 0) { + i++; + /* power of 2? */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "waiting for %d threads to terminate\n", + ksocknal_data.ksnd_nthreads); + read_unlock(&ksocknal_data.ksnd_global_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + read_lock(&ksocknal_data.ksnd_global_lock); + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&libcfs_kmemory)); + + module_put(THIS_MODULE); +} + +static int +ksocknal_base_startup(void) +{ + struct ksock_sched *sched; + int rc; + int i; + + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT (ksocknal_data.ksnd_nnets == 0); + + memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; + LIBCFS_ALLOC(ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + if (ksocknal_data.ksnd_peers == NULL) + return -ENOMEM; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + + rwlock_init(&ksocknal_data.ksnd_global_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); + + spin_lock_init(&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + spin_lock_init(&ksocknal_data.ksnd_connd_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); + + spin_lock_init(&ksocknal_data.ksnd_tx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs); + + /* NB memset above zeros whole of ksocknal_data */ + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + try_module_get(THIS_MODULE); + + /* Create a scheduler block per available CPT */ + ksocknal_data.ksnd_schedulers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (ksocknal_data.ksnd_schedulers == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + int nthrs; + + /* + * make sure not to allocate more threads than there are + * cores/CPUs in teh CPT + */ + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); + } else { + /* + * max to half of CPUs, assume another half should be + * reserved for upper layer modules + */ + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + } + + sched->kss_nthreads_max = nthrs; + sched->kss_cpt = i; + + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); + } + + ksocknal_data.ksnd_connd_starting = 0; + ksocknal_data.ksnd_connd_failed_stamp = 0; 
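+	/* connd pool bookkeeping; SOCKNAL_CONND_RESV daemons stay reserved
+	 * for accepting connections and spawning additional connds */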
+ ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds(); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) + *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; + + if (*ksocknal_tunables.ksnd_nconnds_max < + *ksocknal_tunables.ksnd_nconnds) { + ksocknal_tunables.ksnd_nconnds_max = + ksocknal_tunables.ksnd_nconnds; + } + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + char name[16]; + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + + snprintf(name, sizeof(name), "socknal_cd%02d", i); + rc = ksocknal_thread_start(ksocknal_connd, + (void *)((uintptr_t)i), name); + if (rc != 0) { + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting--; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; + } + } + + rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); + if (rc != 0) { + CERROR ("Can't spawn socknal reaper: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + return 0; + + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} + +static void +ksocknal_debug_peerhash(struct lnet_ni *ni) +{ + struct ksock_peer_ni *peer_ni = NULL; + struct list_head *tmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(tmp, struct ksock_peer_ni, ksnp_list); + + if (peer_ni->ksnp_ni == ni) break; + + peer_ni = NULL; + } + } + + if (peer_ni != NULL) { + struct ksock_route *route; + struct ksock_conn *conn; + + CWARN ("Active peer_ni on shutdown: %s, ref %d, scnt %d, " + "closing %d, accepting %d, err %d, zcookie %llu, " + "txq %d, zc_req %d\n", libcfs_id2str(peer_ni->ksnp_id), + atomic_read(&peer_ni->ksnp_refcount), + peer_ni->ksnp_sharecount, peer_ni->ksnp_closing, + peer_ni->ksnp_accepting, peer_ni->ksnp_error, + peer_ni->ksnp_zc_next_cookie, + !list_empty(&peer_ni->ksnp_tx_queue), + !list_empty(&peer_ni->ksnp_zc_req_list)); + + list_for_each(tmp, &peer_ni->ksnp_routes) { + route = list_entry(tmp, struct ksock_route, ksnr_list); + CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, " + "del %d\n", atomic_read(&route->ksnr_refcount), + route->ksnr_scheduled, route->ksnr_connecting, + route->ksnr_connected, route->ksnr_deleted); + } + + list_for_each(tmp, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, struct ksock_conn, ksnc_list); + CWARN ("Conn: ref %d, sref %d, t %d, c %d\n", + atomic_read(&conn->ksnc_conn_refcount), + atomic_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; +} + +void +ksocknal_shutdown(struct lnet_ni *ni) +{ + struct ksock_net *net = ni->ni_data; + struct lnet_process_id anyid = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY, + }; + int i; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + spin_lock_bh(&net->ksnn_lock); + net->ksnn_shutdown = 1; /* prevent new peers */ + spin_unlock_bh(&net->ksnn_lock); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer_ni state to clean up */ + i = 2; + spin_lock_bh(&net->ksnn_lock); + while (net->ksnn_npeers != 0) { + 
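+		/* drop ksnn_lock while sleeping so remaining peer_ni
+		 * cleanup can make progress */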
spin_unlock_bh(&net->ksnn_lock); + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d peers to disconnect\n", + net->ksnn_npeers); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + ksocknal_debug_peerhash(ni); + + spin_lock_bh(&net->ksnn_lock); + } + spin_unlock_bh(&net->ksnn_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0); + LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0); + } + + list_del(&net->ksnn_list); + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +static int +ksocknal_search_new_ipif(struct ksock_net *net) +{ + int new_ipif = 0; + int i; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + struct ksock_net *tmp; + int j; + + if (colon != NULL) /* ignore alias device */ + *colon = 0; + + list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, + ksnn_list) { + for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) { + char *ifnam2 = &tmp->ksnn_interfaces[j].\ + ksni_name[0]; + char *colon2 = strchr(ifnam2, ':'); + + if (colon2 != NULL) + *colon2 = 0; + + found = strcmp(ifnam, ifnam2) == 0; + if (colon2 != NULL) + *colon2 = ':'; + } + if (found) + break; + } + + new_ipif += !found; + if (colon != NULL) + *colon = ':'; + } + + return new_ipif; +} + +static int +ksocknal_start_schedulers(struct ksock_sched *sched) +{ + int nthrs; + int rc = 0; + int i; + + if (sched->kss_nthreads == 0) { + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = sched->kss_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->kss_cpt); + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); + } + nthrs = min(nthrs, sched->kss_nthreads_max); + } else { + LASSERT(sched->kss_nthreads <= sched->kss_nthreads_max); + /* increase two threads if there is new interface */ + nthrs = min(2, sched->kss_nthreads_max - sched->kss_nthreads); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + + id = KSOCK_THREAD_ID(sched->kss_cpt, sched->kss_nthreads + i); + snprintf(name, sizeof(name), "socknal_sd%02d_%02d", + sched->kss_cpt, (int)KSOCK_THREAD_SID(id)); + + rc = ksocknal_thread_start(ksocknal_scheduler, + (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->kss_cpt, (int) KSOCK_THREAD_SID(id), rc); + break; + } + + sched->kss_nthreads += i; + return rc; +} + +static int +ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) +{ + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; + + if (ncpts > 0 && ncpts > cfs_cpt_number(lnet_cpt_table())) + return -EINVAL; + + for (i = 0; i < ncpts; i++) { + struct ksock_sched *sched; + int cpt = (cpts == NULL) ? 
i : cpts[i]; + + LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); + sched = ksocknal_data.ksnd_schedulers[cpt]; + + if (!newif && sched->kss_nthreads > 0) + continue; + + rc = ksocknal_start_schedulers(sched); + if (rc != 0) + return rc; + } + return 0; +} + +int +ksocknal_startup(struct lnet_ni *ni) +{ + struct ksock_net *net; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + struct ksock_interface *ksi = NULL; + struct lnet_inetdev *ifaces = NULL; + int i = 0; + int rc; + + LASSERT (ni->ni_net->net_lnd == &the_ksocklnd); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + + spin_lock_init(&net->ksnn_lock); + net->ksnn_incarnation = ktime_get_real_ns(); + ni->ni_data = net; + net_tunables = &ni->ni_net->net_tunables; + + if (net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = + *ksocknal_tunables.ksnd_peertimeout; + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = + *ksocknal_tunables.ksnd_credits; + + if (net_tunables->lct_peer_tx_credits == -1) + net_tunables->lct_peer_tx_credits = + *ksocknal_tunables.ksnd_peertxcredits; + + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = + *ksocknal_tunables.ksnd_peerrtrcredits; + + rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); + if (rc < 0) + goto fail_1; + + if (!ni->ni_interfaces[0]) { + ksi = &net->ksnn_interfaces[0]; + + /* Use the first discovered interface */ + net->ksnn_ninterfaces = 1; + ni->ni_dev_cpt = ifaces[0].li_cpt; + ksi->ksni_ipaddr = ifaces[0].li_ipaddr; + ksi->ksni_netmask = ifaces[0].li_netmask; + strlcpy(ksi->ksni_name, ifaces[0].li_name, + sizeof(ksi->ksni_name)); + } else { + /* Before Multi-Rail ksocklnd would manage + * multiple interfaces with its own tcp bonding. + * If we encounter an old configuration using + * this tcp bonding approach then we need to + * handle more than one ni_interfaces. + * + * In Multi-Rail configuration only ONE ni_interface + * should exist. Each IP alias should be mapped to + * each 'struct net_ni'. 
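+		 * For example, eth0 and an alias such as eth0:1 are expected
+		 * to be configured as two separate NIs rather than bonded by
+		 * the LND.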
+ */ + for (i = 0; i < LNET_INTERFACES_NUM; i++) { + int j; + + if (!ni->ni_interfaces[i]) + break; + + for (j = 0; j < LNET_INTERFACES_NUM; j++) { + if (i != j && ni->ni_interfaces[j] && + strcmp(ni->ni_interfaces[i], + ni->ni_interfaces[j]) == 0) { + rc = -EEXIST; + CERROR("ksocklnd: found duplicate %s at %d and %d, rc = %d\n", + ni->ni_interfaces[i], i, j, rc); + goto fail_1; + } + } + + for (j = 0; j < rc; j++) { + if (strcmp(ifaces[j].li_name, + ni->ni_interfaces[i]) != 0) + continue; + + ksi = &net->ksnn_interfaces[j]; + ni->ni_dev_cpt = ifaces[j].li_cpt; + ksi->ksni_ipaddr = ifaces[j].li_ipaddr; + ksi->ksni_netmask = ifaces[j].li_netmask; + strlcpy(ksi->ksni_name, ifaces[j].li_name, + sizeof(ksi->ksni_name)); + net->ksnn_ninterfaces++; + break; + } + } + /* ni_interfaces don't map to all network interfaces */ + if (!ksi || net->ksnn_ninterfaces != i) { + CERROR("ksocklnd: requested %d but only %d interfaces found\n", + i, net->ksnn_ninterfaces); + goto fail_1; + } + } + + /* call it before add it to ksocknal_data.ksnd_nets */ + rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto fail_1; + + LASSERT(ksi); + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ksi->ksni_ipaddr); + list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); + + ksocknal_data.ksnd_nnets++; + + return 0; + + fail_1: + LIBCFS_FREE(net, sizeof(*net)); + fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; +} + + +static void __exit ksocklnd_exit(void) +{ + lnet_unregister_lnd(&the_ksocklnd); +} + +static int __init ksocklnd_init(void) +{ + int rc; + + /* check ksnr_connected/connecting field large enough */ + CLASSERT(SOCKLND_CONN_NTYPES <= 4); + CLASSERT(SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN); + + /* initialize the_ksocklnd */ + the_ksocklnd.lnd_type = SOCKLND; + the_ksocklnd.lnd_startup = ksocknal_startup; + the_ksocklnd.lnd_shutdown = ksocknal_shutdown; + the_ksocklnd.lnd_ctl = ksocknal_ctl; + the_ksocklnd.lnd_send = ksocknal_send; + the_ksocklnd.lnd_recv = ksocknal_recv; + the_ksocklnd.lnd_notify = ksocknal_notify; + the_ksocklnd.lnd_query = ksocknal_query; + the_ksocklnd.lnd_accept = ksocknal_accept; + + rc = ksocknal_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_ksocklnd); + + return 0; +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("TCP Socket LNet Network Driver"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(ksocklnd_init); +module_exit(ksocklnd_exit); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 0000000000000..cbc40f7347d4d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef _SOCKLND_SOCKLND_H_ +#define _SOCKLND_SOCKLND_H_ + +#define DEBUG_PORTAL_ALLOC +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef HAVE_TCP_SENDPAGE_USE_SOCKET +# define cfs_tcp_sendpage(sk, page, offset, size, flags) \ + tcp_sendpage((sk)->sk_socket, page, offset, size, flags) +#else /* !HAVE_TCP_SENDPAGE_USE_SOCKET */ +# define cfs_tcp_sendpage(sk, page, offset, size, flags) \ + tcp_sendpage(sk, page, offset, size, flags) +#endif /* HAVE_TCP_SENDPAGE_USE_SOCKET */ + +#include + +#ifndef NETIF_F_CSUM_MASK +# define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM +#endif + +/* assume one thread for each connection type */ +#define SOCKNAL_NSCHEDS 3 +#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) + +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer_ni lists */ +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ +#define SOCKNAL_ENOMEM_RETRY 1 /* seconds between retries */ + +#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ +#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ + +#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ + +/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). + * no risk if we're not running on a CONFIG_HIGHMEM platform. */ +#ifdef CONFIG_HIGHMEM +# define SOCKNAL_RISK_KMAP_DEADLOCK 0 +#else +# define SOCKNAL_RISK_KMAP_DEADLOCK 1 +#endif + +/* per scheduler state */ +struct ksock_sched { + /* serialise */ + spinlock_t kss_lock; + /* conn waiting to be written */ + struct list_head kss_rx_conns; + struct list_head kss_tx_conns; + /* zombie noop tx list */ + struct list_head kss_zombie_noop_txs; + /* where scheduler sleeps */ + wait_queue_head_t kss_waitq; + /* # connections assigned to this scheduler */ + int kss_nconns; + /* max allowed threads */ + int kss_nthreads_max; + /* number of threads */ + int kss_nthreads; + /* CPT id */ + int kss_cpt; +}; + +#define KSOCK_CPT_SHIFT 16 +#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) +#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) +#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) + +struct ksock_interface { /* in-use interface */ + __u32 ksni_ipaddr; /* interface's IP address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ + char ksni_name[IFNAMSIZ]; /* interface name */ +}; + +struct ksock_tunables { + /* "stuck" socket timeout (seconds) */ + int *ksnd_timeout; + /* # scheduler threads in each pool while starting */ + int *ksnd_nscheds; + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_nconnds_max; /* max # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... */ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? 
*/ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_round_robin; /* round robin for multiple interfaces */ + int *ksnd_keepalive; /* # secs for sending keepalive NOOP */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peertxcredits; /* # concurrent sends to 1 peer_ni */ + int *ksnd_peerrtrcredits; /* # per-peer_ni router buffer credits */ + int *ksnd_peertimeout; /* seconds to consider peer_ni dead */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ + unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ +#ifdef CPU_AFFINITY + int *ksnd_irq_affinity; /* enable IRQ affinity? */ +#endif +#ifdef SOCKNAL_BACKOFF + int *ksnd_backoff_init; /* initial TCP backoff */ + int *ksnd_backoff_max; /* maximum TCP backoff */ +#endif +#if SOCKNAL_VERSION_DEBUG + int *ksnd_protocol; /* protocol version */ +#endif +}; + +struct ksock_net { + __u64 ksnn_incarnation; /* my epoch */ + spinlock_t ksnn_lock; /* serialise */ + struct list_head ksnn_list; /* chain on global list */ + int ksnn_npeers; /* # peers */ + int ksnn_shutdown; /* shutting down? */ + int ksnn_ninterfaces; /* IP interfaces */ + struct ksock_interface ksnn_interfaces[LNET_INTERFACES_NUM]; +}; + +/** connd timeout */ +#define SOCKNAL_CONND_TIMEOUT 120 +/** reserved thread for accepting & creating new connd */ +#define SOCKNAL_CONND_RESV 1 + +struct ksock_nal_data { + int ksnd_init; /* initialisation state */ + int ksnd_nnets; /* # networks set up */ + struct list_head ksnd_nets; /* list of nets */ + /* stabilize peer_ni/conn ops */ + rwlock_t ksnd_global_lock; + /* hash table of all my known peers */ + struct list_head *ksnd_peers; + int ksnd_peer_hash_size; /* size of ksnd_peers */ + + int ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + /* schedulers information */ + struct ksock_sched **ksnd_schedulers; + + atomic_t ksnd_nactive_txs; /* #active txs */ + + /* conns to close: reaper_lock*/ + struct list_head ksnd_deathrow_conns; + /* conns to free: reaper_lock */ + struct list_head ksnd_zombie_conns; + /* conns to retry: reaper_lock*/ + struct list_head ksnd_enomem_conns; + /* reaper sleeps here */ + wait_queue_head_t ksnd_reaper_waitq; + /* when reaper will wake */ + time64_t ksnd_reaper_waketime; + /* serialise */ + spinlock_t ksnd_reaper_lock; + + int ksnd_enomem_tx; /* test ENOMEM sender */ + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + /* incoming connection requests */ + struct list_head ksnd_connd_connreqs; + /* routes waiting to be connected */ + struct list_head ksnd_connd_routes; + /* connds sleep here */ + wait_queue_head_t ksnd_connd_waitq; + /* # connds connecting */ + int ksnd_connd_connecting; + /** time stamp of the last failed connecting attempt */ + time64_t ksnd_connd_failed_stamp; + /** # starting connd */ + unsigned ksnd_connd_starting; + /** time stamp of the last starting connd */ + 
time64_t ksnd_connd_starting_stamp; + /** # running connd */ + unsigned ksnd_connd_running; + /* serialise */ + spinlock_t ksnd_connd_lock; + + /* list head for freed noop tx */ + struct list_head ksnd_idle_noop_txs; + /* serialise, g_lock unsafe */ + spinlock_t ksnd_tx_lock; +}; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_ALL 2 + +/* A packet just assembled for transmission is represented by 1 or more + * struct kvec fragments (the first frag contains the portals header), + * followed by 0 or more lnet_kiov_t fragments. + * + * On the receive side, initially 1 struct kvec fragment is posted for + * receive (the header). Once the header has been received, the payload is + * received into either struct kvec or lnet_kiov_t fragments, depending on + * what the header matched or whether the message needs forwarding. */ + +struct ksock_conn; /* forward ref */ +struct ksock_peer; /* forward ref */ +struct ksock_route; /* forward ref */ +struct ksock_proto; /* forward ref */ + +struct ksock_tx { /* transmit packet */ + struct list_head tx_list; /* queue on conn for transmission etc */ + struct list_head tx_zc_list; /* queue on peer_ni for ZC request */ + atomic_t tx_refcount; /* tx reference count */ + int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ + int tx_niov; /* # packet kvec frags */ + struct kvec *tx_iov; /* packet kvec frags */ + int tx_nkiov; /* # packet page frags */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ + lnet_kiov_t *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() */ + time64_t tx_deadline; /* when (in secs) tx times out */ + struct ksock_msg tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + enum lnet_msg_hstatus tx_hstatus; /* health status of tx */ + union { + struct { + struct kvec iov; /* virt hdr */ + lnet_kiov_t kiov[0]; /* paged payload */ + } paged; + struct { + struct kvec iov[1]; /* virt hdr + payload */ + } virt; + } tx_frags; +}; + +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(struct ksock_tx, tx_frags.paged.kiov[0])) + +/* network zero copy callback descriptor embedded in struct ksock_tx */ + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or up to LNET_MAX_IOV frags of payload of either type. 
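+ * The union below lets one per-conn scratch area back either
+ * representation without allocating both.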
*/ +union ksock_rxiovspace { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +}; + +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ + +struct ksock_conn { + struct ksock_peer_ni *ksnc_peer; /* owning peer_ni */ + struct ksock_route *ksnc_route; /* owning route */ + struct list_head ksnc_list; /* stash on peer_ni's conn list */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + atomic_t ksnc_conn_refcount; /* conn refcount */ + atomic_t ksnc_sock_refcount; /* sock refcount */ + struct ksock_sched *ksnc_scheduler; /* who schedules this connection */ + __u32 ksnc_myipaddr; /* my IP */ + __u32 ksnc_ipaddr; /* peer_ni's IP */ + int ksnc_port; /* peer_ni's port */ + signed int ksnc_type:3; /* type of connection, + * should be signed value */ + unsigned int ksnc_closing:1; /* being shut down */ + unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ + unsigned int ksnc_zc_capable:1; /* enable to ZC */ + struct ksock_proto *ksnc_proto; /* protocol for the connection */ + + /* READER */ + + /* where I enq waiting input or a forwarding descriptor */ + struct list_head ksnc_rx_list; + time64_t ksnc_rx_deadline; /* when (in seconds) receive times out */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled;/* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # kvec frags */ + struct kvec *ksnc_rx_iov; /* the kvec frags */ + int ksnc_rx_nkiov; /* # page frags */ + lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ + union ksock_rxiovspace ksnc_rx_iov_space;/* space for frag descriptors */ + __u32 ksnc_rx_csum; /* partial checksum for incoming data */ + struct lnet_msg *ksnc_lnet_msg; /* rx lnet_finalize arg*/ + struct ksock_msg ksnc_msg; /* incoming message buffer: + * V2.x message takes the + * whole struct + * V1.x message is a bare + * struct lnet_hdr, it's stored + * in ksnc_msg.ksm_u.lnetmsg + */ + /* -- WRITER -- */ + /* where I enq waiting for output space */ + struct list_head ksnc_tx_list; + /* packets waiting to be sent */ + struct list_head ksnc_tx_queue; + /* next TX that can carry a LNet message or ZC-ACK */ + struct ksock_tx *ksnc_tx_carrier; + /* when (in seconds) tx times out */ + time64_t ksnc_tx_deadline; + /* send buffer marker */ + int ksnc_tx_bufnob; + /* # bytes queued */ + atomic_t ksnc_tx_nob; + /* write space */ + int ksnc_tx_ready; + /* being progressed */ + int ksnc_tx_scheduled; + /* time stamp of the last posted TX */ + time64_t ksnc_tx_last_post; +}; + +struct ksock_route { + struct list_head ksnr_list; /* chain on peer_ni route list */ + struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ + struct ksock_peer_ni *ksnr_peer; /* owning peer_ni */ + atomic_t ksnr_refcount; /* # users */ + time64_t ksnr_timeout; /* when (in secs) reconnection can happen next */ + time64_t ksnr_retry_interval; /* how long between retries */ + __u32 
ksnr_myipaddr; /* my IP */ + __u32 ksnr_ipaddr; /* IP address to connect to */ + int ksnr_port; /* port to connect to */ + unsigned int ksnr_scheduled:1; /* scheduled for attention */ + unsigned int ksnr_connecting:1;/* connection establishment in progress */ + unsigned int ksnr_connected:4; /* connections established by type */ + unsigned int ksnr_deleted:1; /* been removed from peer_ni? */ + unsigned int ksnr_share_count; /* created explicitly? */ + int ksnr_conn_count; /* # conns established by this route */ +}; + +#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ + +struct ksock_peer_ni { + struct list_head ksnp_list; /* stash on global peer_ni list */ + time64_t ksnp_last_alive;/* when (in seconds) I was last alive */ + struct lnet_process_id ksnp_id; /* who's on the other end(s) */ + atomic_t ksnp_refcount; /* # users */ + int ksnp_sharecount; /* lconf usage counter */ + int ksnp_closing; /* being closed */ + int ksnp_accepting;/* # passive connections pending */ + int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ + __u64 ksnp_incarnation; /* latest known peer_ni incarnation */ + struct ksock_proto *ksnp_proto; /* latest known peer_ni protocol */ + struct list_head ksnp_conns; /* all active connections */ + struct list_head ksnp_routes; /* routes */ + struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ + /* zero copy requests wait for ACK */ + struct list_head ksnp_zc_req_list; + time64_t ksnp_send_keepalive; /* time to send keepalive */ + struct lnet_ni *ksnp_ni; /* which network */ + int ksnp_n_passive_ips; /* # of... */ + __u32 ksnp_passive_ips[LNET_INTERFACES_NUM]; /* preferred local interfaces */ +}; + +struct ksock_connreq { + /* stash on ksnd_connd_connreqs */ + struct list_head ksncr_list; + /* chosen NI */ + struct lnet_ni *ksncr_ni; + /* accepted socket */ + struct socket *ksncr_sock; +}; + +extern struct ksock_nal_data ksocknal_data; +extern struct ksock_tunables ksocknal_tunables; + +#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ +#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ +#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ + +struct ksock_proto { + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *); /* handshake function */ + int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int);/* handshake function */ + void (*pro_pack)(struct ksock_tx *); /* message pack */ + void (*pro_unpack)(struct ksock_msg *); /* message unpack */ + struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(struct ksock_conn *, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int); /* msg type matches the connection type: + * return value: + * return MATCH_NO : no + * return MATCH_YES : matching type + * return MATCH_MAY : can be backup */ +}; + +extern struct ksock_proto ksocknal_protocol_v1x; +extern struct ksock_proto ksocknal_protocol_v2x; +extern struct ksock_proto ksocknal_protocol_v3x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR 
LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR + +#ifndef CPU_MASK_NONE +#define CPU_MASK_NONE 0UL +#endif + +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + + return crc; +#endif +} + +static inline int +ksocknal_route_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return (1 << SOCKLND_CONN_ANY); + + return ((1 << SOCKLND_CONN_CONTROL) | + (1 << SOCKLND_CONN_BULK_IN) | + (1 << SOCKLND_CONN_BULK_OUT)); +} + +static inline struct list_head * +ksocknal_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; + + return (&ksocknal_data.ksnd_peers [hash]); +} + +static inline void +ksocknal_conn_addref(struct ksock_conn *conn) +{ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + atomic_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn(struct ksock_conn *conn); +extern void ksocknal_finalize_zcreq(struct ksock_conn *conn); + +static inline void +ksocknal_conn_decref(struct ksock_conn *conn) +{ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + +static inline int +ksocknal_connsock_addref(struct ksock_conn *conn) +{ + int rc = -ESHUTDOWN; + + read_lock(&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + atomic_inc(&conn->ksnc_sock_refcount); + rc = 0; + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +static inline void +ksocknal_connsock_decref(struct ksock_conn *conn) +{ + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT (conn->ksnc_closing); + sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + ksocknal_finalize_zcreq(conn); + } +} + +static inline void +ksocknal_tx_addref(struct ksock_tx *tx) +{ + LASSERT(atomic_read(&tx->tx_refcount) > 0); + atomic_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx); +extern void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int error); + +static inline void +ksocknal_tx_decref(struct ksock_tx *tx) +{ + LASSERT(atomic_read(&tx->tx_refcount) > 0); + if (atomic_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx, 0); +} + +static inline void +ksocknal_route_addref(struct ksock_route *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) > 0); + atomic_inc(&route->ksnr_refcount); +} + +extern void ksocknal_destroy_route(struct ksock_route *route); + +static inline void +ksocknal_route_decref(struct ksock_route *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) > 0); + if (atomic_dec_and_test(&route->ksnr_refcount)) + ksocknal_destroy_route (route); +} + +static inline void +ksocknal_peer_addref(struct ksock_peer_ni *peer_ni) +{ + LASSERT(atomic_read(&peer_ni->ksnp_refcount) > 0); + atomic_inc(&peer_ni->ksnp_refcount); +} + +extern void ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni); + +static inline void +ksocknal_peer_decref(struct ksock_peer_ni *peer_ni) +{ + LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); + if (atomic_dec_and_test(&peer_ni->ksnp_refcount)) + ksocknal_destroy_peer(peer_ni); +} + +int ksocknal_startup(struct lnet_ni *ni); +void ksocknal_shutdown(struct lnet_ni *ni); +int ksocknal_ctl(struct lnet_ni *ni, 
unsigned int cmd, void *arg); +int ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); + +int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip, + int port); +struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni, + struct lnet_process_id id); +struct ksock_peer_ni *ksocknal_find_peer(struct lnet_ni *ni, + struct lnet_process_id id); +extern void ksocknal_peer_failed(struct ksock_peer_ni *peer_ni); +extern int ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, + struct socket *sock, int type); +extern void ksocknal_close_conn_locked(struct ksock_conn *conn, int why); +extern void ksocknal_terminate_conn(struct ksock_conn *conn); +extern void ksocknal_destroy_conn(struct ksock_conn *conn); +extern int ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); +int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr); +extern struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, + struct ksock_tx *tx, int nonblk); + +extern int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, + struct lnet_process_id id); +extern struct ksock_tx *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx(struct ksock_tx *tx); +extern struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern void ksocknal_next_tx_carrier(struct ksock_conn *conn); +extern void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn); +extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, + int error); +extern void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive); +extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when); +extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); +extern void ksocknal_thread_fini(void); +extern void ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_route *ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_route *ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni); +extern int ksocknal_new_packet(struct ksock_conn *conn, int skip); +extern int ksocknal_scheduler(void *arg); +extern int ksocknal_connd(void *arg); +extern int ksocknal_reaper(void *arg); +int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, + lnet_nid_t peer_nid, struct ksock_hello_msg *hello); +int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, + struct ksock_hello_msg *hello, + struct lnet_process_id *id, + __u64 *incarnation); +extern void ksocknal_read_callback(struct ksock_conn *conn); +extern void ksocknal_write_callback(struct ksock_conn *conn); + +extern int ksocknal_lib_zc_capable(struct ksock_conn *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn); +extern void ksocknal_lib_reset_callback(struct socket *sock, + struct ksock_conn *conn); +extern void ksocknal_lib_push_conn(struct ksock_conn *conn); +extern int 
ksocknal_lib_get_conn_addrs(struct ksock_conn *conn); +extern int ksocknal_lib_setup_sock(struct socket *so); +extern int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov); +extern int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov); +extern void ksocknal_lib_eager_ack(struct ksock_conn *conn); +extern int ksocknal_lib_recv_iov(struct ksock_conn *conn, + struct kvec *scratchiov); +extern int ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov); +extern int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, + int *rxmem, int *nagle); + +extern int ksocknal_tunables_init(void); + +extern void ksocknal_lib_csum_tx(struct ksock_tx *tx); + +extern int ksocknal_lib_memory_pressure(struct ksock_conn *conn); +extern int ksocknal_lib_bind_thread_to_cpu(int id); + +#endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 0000000000000..69e275e18adde --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,2736 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "socklnd.h" + +struct ksock_tx * +ksocknal_alloc_tx(int type, int size) +{ + struct ksock_tx *tx = NULL; + + if (type == KSOCK_MSG_NOOP) { + LASSERT(size == KSOCK_NOOP_TX_SIZE); + + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next, + struct ksock_tx, tx_list); + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); + + if (tx == NULL) + return NULL; + + atomic_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; + tx->tx_zc_capable = 0; + tx->tx_zc_checked = 0; + tx->tx_hstatus = LNET_MSG_STATUS_OK; + tx->tx_desc_size = size; + + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +struct ksock_tx * +ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) +{ + struct ksock_tx *tx; + + tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return NULL; + } + + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1; + tx->tx_nonblk = nonblk; + + tx->tx_msg.ksm_csum = 0; + tx->tx_msg.ksm_type = KSOCK_MSG_NOOP; + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + + return tx; +} + + +void +ksocknal_free_tx(struct ksock_tx *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } +} + +static int +ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + struct kvec *iov = tx->tx_iov; + int nob; + int rc; + + LASSERT(tx->tx_niov > 0); + + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx, scratch_iov); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" iov */ + do { + LASSERT(tx->tx_niov > 0); + + if (nob < (int) iov->iov_len) { + iov->iov_base += nob; + iov->iov_len -= nob; + return rc; + } + + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; + + LASSERT(tx->tx_niov == 0); + LASSERT(tx->tx_nkiov > 0); + + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx, scratch_iov); + + if (rc <= 0) /* sent nothing? 
*/ + return rc; + + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); + + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } + + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + int rc; + int bufnob; + + if (ksocknal_data.ksnd_stall_tx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + } + + LASSERT(tx->tx_resid != 0); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } + + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov(conn, tx, scratch_iov); + } else { + rc = ksocknal_send_kiov(conn, tx, scratch_iov); + } + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + if (rc > 0) /* sent something? */ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_tx_bufnob = bufnob; + smp_mb(); + } + + if (rc <= 0) { /* Didn't write anything? */ + /* some stacks return 0 instead of -EAGAIN */ + if (rc == 0) + rc = -EAGAIN; + + /* Check if EAGAIN is due to memory pressure */ + if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } + + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); + rc = 0; + + } while (tx->tx_resid != 0); + + ksocknal_connsock_decref(conn); + return rc; +} + +static int +ksocknal_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) +{ + struct kvec *iov = conn->ksnc_rx_iov; + int nob; + int rc; + + LASSERT(conn->ksnc_rx_niov > 0); + + /* Never touch conn->ksnc_rx_iov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn, scratchiov); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + smp_mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT(conn->ksnc_rx_niov > 0); + + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; + iov->iov_base += nob; + return -EAGAIN; + } + + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_recv_kiov(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT(conn->ksnc_rx_nkiov > 0); + + /* Never touch conn->ksnc_rx_kiov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn, rx_scratch_pgs, scratch_iov); + + if (rc <= 0) + return rc; + + /* received something... 
*/ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + smp_mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT(conn->ksnc_rx_nkiov > 0); + + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } + + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); + + return 1; +} + +static int +ksocknal_receive(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; + ENTRY; + + if (ksocknal_data.ksnd_stall_rx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); + } + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } + + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov(conn, scratch_iov); + else + rc = ksocknal_recv_kiov(conn, rx_scratch_pgs, + scratch_iov); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } + + /* Completed a fragment */ + + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + RETURN(rc); +} + +void +ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc) +{ + struct lnet_msg *lnetmsg = tx->tx_lnetmsg; + enum lnet_msg_hstatus hstatus = tx->tx_hstatus; + ENTRY; + + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) { + rc = -EIO; + if (hstatus == LNET_MSG_STATUS_OK) + hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); + + ksocknal_free_tx(tx); + if (lnetmsg != NULL) { /* KSOCK_MSG_NOOP go without lnetmsg */ + lnetmsg->msg_health_status = hstatus; + lnet_finalize(lnetmsg, rc); + } + + EXIT; +} + +void +ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) +{ + struct ksock_tx *tx; + + while (!list_empty(txlist)) { + tx = list_entry(txlist->next, struct ksock_tx, tx_list); + + if (error && tx->tx_lnetmsg != NULL) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type), + le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } + + list_del(&tx->tx_list); + + if (tx->tx_hstatus == LNET_MSG_STATUS_OK) { + if (error == -ETIMEDOUT) + tx->tx_hstatus = + LNET_MSG_STATUS_LOCAL_TIMEOUT; + else if (error == -ENETDOWN || + error == -EHOSTUNREACH || + error == -ENETUNREACH || + error == -ECONNREFUSED || + error == -ECONNRESET) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + /* + * for all other errors we don't want to + * retransmit + */ + else if (error) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + LASSERT(atomic_read(&tx->tx_refcount) == 1); + ksocknal_tx_done(ni, tx, error); + } +} + +static void +ksocknal_check_zc_req(struct ksock_tx *tx) +{ + struct ksock_conn *conn = tx->tx_conn; + struct 
ksock_peer_ni *peer_ni = conn->ksnc_peer; + + /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx + * to ksnp_zc_req_list if some fragment of this message should be sent + * zero-copy. Our peer_ni will send an ACK containing this cookie when + * she has received this message to tell us we can signal completion. + * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on + * ksnp_zc_req_list. */ + LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT (tx->tx_zc_capable); + + tx->tx_zc_checked = 1; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x || + !conn->ksnc_zc_capable) + return; + + /* assign cookie and queue tx to pending list, it will be released when + * a matching ack is received. See ksocknal_handle_zcack() */ + + ksocknal_tx_addref(tx); + + spin_lock(&peer_ni->ksnp_lock); + + /* ZC_REQ is going to be pinned to the peer_ni */ + tx->tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + + tx->tx_msg.ksm_zc_cookies[0] = peer_ni->ksnp_zc_next_cookie++; + + if (peer_ni->ksnp_zc_next_cookie == 0) + peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + list_add_tail(&tx->tx_zc_list, &peer_ni->ksnp_zc_req_list); + + spin_unlock(&peer_ni->ksnp_lock); +} + +static void +ksocknal_uncheck_zc_req(struct ksock_tx *tx) +{ + struct ksock_peer_ni *peer_ni = tx->tx_conn->ksnc_peer; + + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_zc_capable); + + tx->tx_zc_checked = 0; + + spin_lock(&peer_ni->ksnp_lock); + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* Not waiting for an ACK */ + spin_unlock(&peer_ni->ksnp_lock); + return; + } + + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + + spin_unlock(&peer_ni->ksnp_lock); + + ksocknal_tx_decref(tx); +} + +static int +ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) +{ + int rc; + bool error_sim = false; + + if (lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) { + error_sim = true; + rc = -EINVAL; + goto simulate_error; + } + + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); + + rc = ksocknal_transmit(conn, tx, scratch_iov); + + CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); + + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT(rc == 0); + + return 0; + } + + if (rc == -EAGAIN) + return rc; + + if (rc == -ENOMEM) { + static int counter; + + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter, conn, atomic_read(&libcfs_kmemory)); + + /* Queue on ksnd_enomem_conns for retry after a timeout */ + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* enomem list takes over scheduler's ref... 
*/ + LASSERT(conn->ksnc_tx_scheduled); + list_add_tail(&conn->ksnc_tx_list, + &ksocknal_data.ksnd_enomem_conns); + if (ktime_get_seconds() + SOCKNAL_ENOMEM_RETRY < + ksocknal_data.ksnd_reaper_waketime) + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + return (rc); + } + +simulate_error: + + /* Actual error */ + LASSERT(rc < 0); + + if (!error_sim) { + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + if (rc == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %pI4h reset our connection " + "while we were sending data; it may have " + "rebooted.\n", + &conn->ksnc_ipaddr); + break; + default: + LCONSOLE_WARN("There was an unexpected network error " + "while writing to %pI4h: %d.\n", + &conn->ksnc_ipaddr, rc); + break; + } + CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", + conn, rc, libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + } + + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings(conn, + (conn->ksnc_closing) ? 0 : rc); + + return rc; +} + +static void +ksocknal_launch_connection_locked(struct ksock_route *route) +{ + + /* called holding write lock on ksnd_global_lock */ + + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0); + + route->ksnr_scheduled = 1; /* scheduling conn for connd */ + ksocknal_route_addref(route); /* extra ref for connd */ + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&route->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); +} + +void +ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni) +{ + struct ksock_route *route; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + route = ksocknal_find_connectable_route_locked(peer_ni); + if (route == NULL) + return; + + ksocknal_launch_connection_locked(route); + } +} + +struct ksock_conn * +ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, int nonblk) +{ + struct list_head *tmp; + struct ksock_conn *conn; + struct ksock_conn *typed = NULL; + struct ksock_conn *fallback = NULL; + int tnob = 0; + int fnob = 0; + + list_for_each(tmp, &peer_ni->ksnp_conns) { + struct ksock_conn *c = list_entry(tmp, struct ksock_conn, + ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; + + LASSERT (!c->ksnc_closing); + LASSERT (c->ksnc_proto != NULL && + c->ksnc_proto->pro_match_tx != NULL); + + rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); + + switch (rc) { + default: + LBUG(); + case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ + continue; + + case SOCKNAL_MATCH_YES: /* typed connection */ + if (typed == NULL || tnob > nob || + (tnob == nob && *ksocknal_tunables.ksnd_round_robin && + typed->ksnc_tx_last_post > c->ksnc_tx_last_post)) { + typed = c; + tnob = nob; + } + break; + + case SOCKNAL_MATCH_MAY: /* fallback 
connection */ + if (fallback == NULL || fnob > nob || + (fnob == nob && *ksocknal_tunables.ksnd_round_robin && + fallback->ksnc_tx_last_post > c->ksnc_tx_last_post)) { + fallback = c; + fnob = nob; + } + break; + } + } + + /* prefer the typed selection */ + conn = (typed != NULL) ? typed : fallback; + + if (conn != NULL) + conn->ksnc_tx_last_post = ktime_get_seconds(); + + return conn; +} + +void +ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) +{ + conn->ksnc_proto->pro_pack(tx); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + ksocknal_conn_addref(conn); /* +1 ref for tx */ + tx->tx_conn = conn; +} + +void +ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) +{ + struct ksock_sched *sched = conn->ksnc_scheduler; + struct ksock_msg *msg = &tx->tx_msg; + struct ksock_tx *ztx = NULL; + int bufnob = 0; + + /* called holding global lock (read or irq-write) and caller may + * not have dropped this lock between finding conn and calling me, + * so we don't need the {get,put}connsock dance to deref + * ksnc_sock... */ + LASSERT(!conn->ksnc_closing); + + CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + + ksocknal_tx_prep(conn, tx); + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete ksocknal message header. */ + LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) + + lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == + (unsigned int)tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_resid == tx->tx_nob); + + CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", + tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type: + KSOCK_MSG_NOOP, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + spin_lock_bh(&sched->kss_lock); + + if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { + /* First packet starts the timeout */ + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_tx_bufnob = 0; + smp_mb(); /* order with adding to tx_queue */ + } + + if (msg->ksm_type == KSOCK_MSG_NOOP) { + /* The packet is noop ZC ACK, try to piggyback the ack_cookie + * on a normal packet so I don't need to send it */ + LASSERT (msg->ksm_zc_cookies[1] != 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) + ztx = tx; /* ZC ACK piggybacked on ztx release tx later */ + + } else { + /* It's a normal packet - can it piggback a noop zc-ack that + * has been queued already? 
*/ + LASSERT (msg->ksm_zc_cookies[1] == 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL); + + ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); + /* ztx will be released later */ + } + + if (ztx != NULL) { + atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob); + list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); + } + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + /* +1 ref for scheduler */ + ksocknal_conn_addref(conn); + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + + +struct ksock_route * +ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni) +{ + time64_t now = ktime_get_seconds(); + struct list_head *tmp; + struct ksock_route *route; + + list_for_each(tmp, &peer_ni->ksnp_routes) { + route = list_entry(tmp, struct ksock_route, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) /* connections being established */ + continue; + + /* all route types connected ? */ + if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) + continue; + + if (!(route->ksnr_retry_interval == 0 || /* first attempt */ + now >= route->ksnr_timeout)) { + CDEBUG(D_NET, + "Too soon to retry route %pI4h " + "(cnted %d, interval %lld, %lld secs later)\n", + &route->ksnr_ipaddr, + route->ksnr_connected, + route->ksnr_retry_interval, + route->ksnr_timeout - now); + continue; + } + + return (route); + } + + return (NULL); +} + +struct ksock_route * +ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni) +{ + struct list_head *tmp; + struct ksock_route *route; + + list_for_each(tmp, &peer_ni->ksnp_routes) { + route = list_entry(tmp, struct ksock_route, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) + return (route); + } + + return (NULL); +} + +int +ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, + struct lnet_process_id id) +{ + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + rwlock_t *g_lock; + int retry; + int rc; + + LASSERT (tx->tx_conn == NULL); + + g_lock = &ksocknal_data.ksnd_global_lock; + + for (retry = 0;; retry = 1) { + read_lock(g_lock); + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) { + if (ksocknal_find_connectable_route_locked(peer_ni) == NULL) { + conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk); + if (conn != NULL) { + /* I've got no routes that need to be + * connecting and I do have an actual + * connection... */ + ksocknal_queue_tx_locked (tx, conn); + read_unlock(g_lock); + return (0); + } + } + } + + /* I'll need a write lock... 
*/ + read_unlock(g_lock); + + write_lock_bh(g_lock); + + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) + break; + + write_unlock_bh(g_lock); + + if ((id.pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to " + "userspace process %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer_ni %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + rc = ksocknal_add_peer(ni, id, + LNET_NIDADDR(id.nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer_ni %s: %d\n", + libcfs_id2str(id), rc); + return rc; + } + } + + ksocknal_launch_all_connections_locked(peer_ni); + + conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk); + if (conn != NULL) { + /* Connection exists; queue message on it */ + ksocknal_queue_tx_locked (tx, conn); + write_unlock_bh(g_lock); + return (0); + } + + if (peer_ni->ksnp_accepting > 0 || + ksocknal_find_connecting_route_locked (peer_ni) != NULL) { + /* the message is going to be pinned to the peer_ni */ + tx->tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + + /* Queue the message until a connection is established */ + list_add_tail(&tx->tx_list, &peer_ni->ksnp_tx_queue); + write_unlock_bh(g_lock); + return 0; + } + + write_unlock_bh(g_lock); + + /* NB Routes may be ignored if connections to them failed recently */ + CNETERR("No usable routes to %s\n", libcfs_id2str(id)); + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; + return (-EHOSTUNREACH); +} + +int +ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + int mpflag = 1; + int type = lntmsg->msg_type; + struct lnet_process_id target = lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + struct ksock_tx *tx; + int desc_size; + int rc; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it... 
*/ + + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + LASSERT (!in_interrupt ()); + + if (payload_iov != NULL) + desc_size = offsetof(struct ksock_tx, + tx_frags.virt.iov[1 + payload_niov]); + else + desc_size = offsetof(struct ksock_tx, + tx_frags.paged.kiov[payload_niov]); + + if (lntmsg->msg_vmflush) + mpflag = cfs_memory_pressure_get_and_set(); + tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + if (lntmsg->msg_vmflush) + cfs_memory_pressure_restore(mpflag); + return (-ENOMEM); + } + + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; + + if (payload_iov != NULL) { + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1 + + lnet_extract_iov(payload_niov, &tx->tx_iov[1], + payload_niov, payload_iov, + payload_offset, payload_nob); + } else { + tx->tx_niov = 1; + tx->tx_iov = &tx->tx_frags.paged.iov; + tx->tx_kiov = tx->tx_frags.paged.kiov; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); + + if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) + tx->tx_zc_capable = 1; + } + + tx->tx_msg.ksm_csum = 0; + tx->tx_msg.ksm_type = KSOCK_MSG_LNET; + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_msg.ksm_zc_cookies[1] = 0; + + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + if (!mpflag) + cfs_memory_pressure_restore(mpflag); + + if (rc == 0) + return (0); + + lntmsg->msg_health_status = tx->tx_hstatus; + ksocknal_free_tx(tx); + return (-EIO); +} + +int +ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = kthread_run(fn, arg, name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads++; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return 0; +} + +void +ksocknal_thread_fini (void) +{ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads--; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + int nob; + unsigned int niov; + int skipped; + + LASSERT(conn->ksnc_proto != NULL); + + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... 
*/ + ksocknal_lib_eager_ack(conn); + } + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + smp_mb(); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + case KSOCK_PROTO_V3: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg; + + conn->ksnc_rx_nob_wanted = offsetof(struct ksock_msg, ksm_u); + conn->ksnc_rx_nob_left = offsetof(struct ksock_msg, ksm_u); + conn->ksnc_rx_iov[0].iov_len = offsetof(struct ksock_msg, ksm_u); + break; + + case KSOCK_PROTO_V1: + /* Receiving bare struct lnet_hdr */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(struct lnet_hdr); + conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(struct lnet_hdr); + break; + + default: + LBUG (); + } + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return (1); + } + + /* Set up to skip as much as possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct kvec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +static int +ksocknal_process_receive(struct ksock_conn *conn, + struct page **rx_scratch_pgs, + struct kvec *scratch_iov) +{ + struct lnet_hdr *lhdr; + struct lnet_process_id *id; + int rc; + + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + + /* NB: sched lock NOT held */ + /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ + LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + again: + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn, rx_scratch_pgs, + scratch_iov); + + if (rc <= 0) { + struct lnet_process_id ksnp_id; + + ksnp_id = conn->ksnc_peer->ksnp_id; + + LASSERT(rc != -EAGAIN); + if (rc == 0) + CDEBUG(D_NET, "[%p] EOF from %s " + "ip %pI4h:%d\n", conn, + libcfs_id2str(ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + else if (!conn->ksnc_closing) + CERROR("[%p] Error %d on read from %s " + "ip %pI4h:%d\n", conn, rc, + libcfs_id2str(ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + return (rc == 0 ? 
-ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return (-EAGAIN); + } + } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); + } + + if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { + CERROR("%s: Unknown message type: %x\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_type); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EPROTO); + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EIO); + } + + if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) { + __u64 cookie = 0; + + LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x); + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) + cookie = conn->ksnc_msg.ksm_zc_cookies[0]; + + rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, + conn->ksnc_msg.ksm_zc_cookies[1]); + + if (rc != 0) { + CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + cookie, conn->ksnc_msg.ksm_zc_cookies[1]); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (rc); + } + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { + ksocknal_new_packet (conn, 0); + return 0; /* NOOP is done and just return */ + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(struct ksock_lnet_msg); + conn->ksnc_rx_nob_left = sizeof(struct ksock_lnet_msg); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(struct ksock_lnet_msg); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer_ni */ + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + /* Substitute process ID assigned at connection time */ + lhdr->src_pid = cpu_to_le32(id->pid); + lhdr->src_nid = cpu_to_le64(id->nid); + } + + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, + conn->ksnc_peer->ksnp_id.nid, conn, 0); + if (rc < 0) { + /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + ksocknal_conn_decref(conn); + return (-EPROTO); + } + + /* I'm racing with ksocknal_recv() */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 
0; + + if (conn->ksnc_rx_nob_left == 0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; + } + + if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) { + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + rc = conn->ksnc_proto->pro_handle_zcreq(conn, + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != id->nid); + } + + if (rc && conn->ksnc_lnet_msg) + conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, rc); + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + return (-EPROTO); + } + fallthrough; + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ + + default: + break; + } + + /* Not Reached */ + LBUG (); + return (-EINVAL); /* keep gcc happy */ +} + +int +ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int niov, struct kvec *iov, + lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, + unsigned int rlen) +{ + struct ksock_conn *conn = private; + struct ksock_sched *sched = conn->ksnc_scheduler; + + LASSERT (mlen <= rlen); + LASSERT (niov <= LNET_MAX_IOV); + + conn->ksnc_lnet_msg = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + if (mlen == 0 || iov != NULL) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = + lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + + LASSERT (mlen == + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + LASSERT (conn->ksnc_rx_scheduled); + + spin_lock_bh(&sched->kss_lock); + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + wake_up(&sched->kss_waitq); + LASSERT(conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh(&sched->kss_lock); + ksocknal_conn_decref(conn); + return 0; +} + +static inline int +ksocknal_sched_cansleep(struct ksock_sched *sched) +{ + int rc; + + spin_lock_bh(&sched->kss_lock); + + rc = (!ksocknal_data.ksnd_shuttingdown && + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns)); + + spin_unlock_bh(&sched->kss_lock); + return rc; +} + +int ksocknal_scheduler(void *arg) +{ + struct ksock_sched *sched; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; + int nloops = 0; + long id = (long)arg; + struct page **rx_scratch_pgs; + struct kvec *scratch_iov; + + sched = ksocknal_data.ksnd_schedulers[KSOCK_THREAD_CPT(id)]; + + LIBCFS_CPT_ALLOC(rx_scratch_pgs, lnet_cpt_table(), sched->kss_cpt, + 
sizeof(*rx_scratch_pgs) * LNET_MAX_IOV); + if (!rx_scratch_pgs) { + CERROR("Unable to allocate scratch pages\n"); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(scratch_iov, lnet_cpt_table(), sched->kss_cpt, + sizeof(*scratch_iov) * LNET_MAX_IOV); + if (!scratch_iov) { + CERROR("Unable to allocate scratch iov\n"); + return -ENOMEM; + } + + cfs_block_allsigs(); + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->kss_cpt); + if (rc != 0) { + CWARN("Can't set CPU partition affinity to %d: %d\n", + sched->kss_cpt, rc); + } + + spin_lock_bh(&sched->kss_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty(&sched->kss_rx_conns)) { + conn = list_entry(sched->kss_rx_conns.next, + struct ksock_conn, ksnc_rx_list); + list_del(&conn->ksnc_rx_list); + + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); + + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. */ + conn->ksnc_rx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + rc = ksocknal_process_receive(conn, rx_scratch_pgs, + scratch_iov); + + spin_lock_bh(&sched->kss_lock); + + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); + + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + + if (!list_empty(&sched->kss_tx_conns)) { + struct list_head zlist = LIST_HEAD_INIT(zlist); + + if (!list_empty(&sched->kss_zombie_noop_txs)) { + list_add(&zlist, + &sched->kss_zombie_noop_txs); + list_del_init(&sched->kss_zombie_noop_txs); + } + + conn = list_entry(sched->kss_tx_conns.next, + struct ksock_conn, ksnc_tx_list); + list_del(&conn->ksnc_tx_list); + + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + + tx = list_entry(conn->ksnc_tx_queue.next, + struct ksock_tx, tx_list); + + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); + + /* dequeue now so empty list => more to send */ + list_del(&tx->tx_list); + + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. */ + conn->ksnc_tx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + if (!list_empty(&zlist)) { + /* free zombie noop txs, it's fast because + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } + + rc = ksocknal_process_transmit(conn, tx, scratch_iov); + + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ + spin_lock_bh(&sched->kss_lock); + list_add(&tx->tx_list, + &conn->ksnc_tx_queue); + } else { + /* Complete send; tx -ref */ + ksocknal_tx_decref(tx); + + spin_lock_bh(&sched->kss_lock); + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } + + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. 
*/ + } else if (conn->ksnc_tx_ready && + !list_empty(&conn->ksnc_tx_queue)) { + /* reschedule for tx */ + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ + spin_unlock_bh(&sched->kss_lock); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched)); + LASSERT (rc == 0); + } else { + cond_resched(); + } + + spin_lock_bh(&sched->kss_lock); + } + } + + spin_unlock_bh(&sched->kss_lock); + LIBCFS_FREE(rx_scratch_pgs, sizeof(*rx_scratch_pgs) * + LNET_MAX_IOV); + LIBCFS_FREE(scratch_iov, sizeof(*scratch_iov) * + LNET_MAX_IOV); + ksocknal_thread_fini(); + return 0; +} + +/* + * Add connection to kss_rx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_read_callback(struct ksock_conn *conn) +{ + struct ksock_sched *sched; + ENTRY; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + spin_unlock_bh(&sched->kss_lock); + + EXIT; +} + +/* + * Add connection to kss_tx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_write_callback(struct ksock_conn *conn) +{ + struct ksock_sched *sched; + ENTRY; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && /* not being progressed */ + !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ + list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + EXIT; +} + +static struct ksock_proto * +ksocknal_parse_proto_version (struct ksock_hello_msg *hello) +{ + __u32 version = 0; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + version = hello->kshm_version; + else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) + version = __swab32(hello->kshm_version); + + if (version != 0) { +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 1) + return NULL; + + if (*ksocknal_tunables.ksnd_protocol == 2 && + version == KSOCK_PROTO_V3) + return NULL; +#endif + if (version == KSOCK_PROTO_V2) + return &ksocknal_protocol_v2x; + + if (version == KSOCK_PROTO_V3) + return &ksocknal_protocol_v3x; + + return NULL; + } + + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + struct lnet_magicversion *hmv; + + CLASSERT(sizeof(struct lnet_magicversion) == + offsetof(struct ksock_hello_msg, kshm_src_nid)); + + hmv = (struct lnet_magicversion *)hello; + + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } + + return NULL; +} + +int +ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, + lnet_nid_t peer_nid, struct ksock_hello_msg *hello) +{ + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + struct ksock_net *net = (struct ksock_net *)ni->ni_data; + + LASSERT(hello->kshm_nips <= LNET_INTERFACES_NUM); + + /* rely on caller 
to hold a ref on socket so it wouldn't disappear */
+ LASSERT(conn->ksnc_proto != NULL);
+
+ hello->kshm_src_nid = ni->ni_nid;
+ hello->kshm_dst_nid = peer_nid;
+ hello->kshm_src_pid = the_lnet.ln_pid;
+
+ hello->kshm_src_incarnation = net->ksnn_incarnation;
+ hello->kshm_ctype = conn->ksnc_type;
+
+ return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
+static int
+ksocknal_invert_type(int type)
+{
+ switch (type)
+ {
+ case SOCKLND_CONN_ANY:
+ case SOCKLND_CONN_CONTROL:
+ return (type);
+ case SOCKLND_CONN_BULK_IN:
+ return SOCKLND_CONN_BULK_OUT;
+ case SOCKLND_CONN_BULK_OUT:
+ return SOCKLND_CONN_BULK_IN;
+ default:
+ return (SOCKLND_CONN_NONE);
+ }
+}
+
+int
+ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn,
+ struct ksock_hello_msg *hello,
+ struct lnet_process_id *peerid,
+ __u64 *incarnation)
+{
+ /* Return < 0 fatal error
+ * 0 success
+ * EALREADY lost connection race
+ * EPROTO protocol version mismatch
+ */
+ struct socket *sock = conn->ksnc_sock;
+ int active = (conn->ksnc_proto != NULL);
+ int timeout;
+ int proto_match;
+ int rc;
+ struct ksock_proto *proto;
+ struct lnet_process_id recv_id;
+
+ /* socket type set on active connections - not set on passive */
+ LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+ timeout = active ? lnet_get_lnd_timeout() :
+ lnet_acceptor_timeout();
+
+ rc = lnet_sock_read(sock, &hello->kshm_magic,
+ sizeof(hello->kshm_magic), timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading HELLO from %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT (rc < 0);
+ return rc;
+ }
+
+ if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+ hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+ hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+ /* Unexpected magic! */
+ CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+ "%pI4h\n", __cpu_to_le32 (hello->kshm_magic),
+ LNET_PROTO_TCP_MAGIC, &conn->ksnc_ipaddr);
+ return -EPROTO;
+ }
+
+ rc = lnet_sock_read(sock, &hello->kshm_version,
+ sizeof(hello->kshm_version), timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading HELLO from %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT(rc < 0);
+ return rc;
+ }
+
+ proto = ksocknal_parse_proto_version(hello);
+ if (proto == NULL) {
+ if (!active) {
+ /* unknown protocol from peer_ni, tell peer_ni my protocol */
+ conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol == 2)
+ conn->ksnc_proto = &ksocknal_protocol_v2x;
+ else if (*ksocknal_tunables.ksnd_protocol == 1)
+ conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+ hello->kshm_nips = 0;
+ ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+ }
+
+ CERROR("Unknown protocol version (%d.x expected) from %pI4h\n",
+ conn->ksnc_proto->pro_version, &conn->ksnc_ipaddr);
+
+ return -EPROTO;
+ }
+
+ proto_match = (conn->ksnc_proto == proto);
+ conn->ksnc_proto = proto;
+
+ /* receive the rest of hello message anyway */
+ rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading or checking hello from %pI4h\n",
+ rc, &conn->ksnc_ipaddr);
+ LASSERT (rc < 0);
+ return rc;
+ }
+
+ *incarnation = hello->kshm_src_incarnation;
+
+ if (hello->kshm_src_nid == LNET_NID_ANY) {
+ CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY "
+ "from %pI4h\n", &conn->ksnc_ipaddr);
+ return -EPROTO;
+ }
+
+ if (!active &&
+ conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+ /* Userspace NAL assigns peer_ni process ID from socket */
+ recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+ recv_id.nid =
LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr); + } else { + recv_id.nid = hello->kshm_src_nid; + recv_id.pid = hello->kshm_src_pid; + } + + if (!active) { + *peerid = recv_id; + + /* peer_ni determines type */ + conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype); + if (conn->ksnc_type == SOCKLND_CONN_NONE) { + CERROR("Unexpected type %d from %s ip %pI4h\n", + hello->kshm_ctype, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr); + return -EPROTO; + } + return 0; + } + + if (peerid->pid != recv_id.pid || + peerid->nid != recv_id.nid) { + LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host" + " %pI4h, but they claimed they were " + "%s; please check your Lustre " + "configuration.\n", + libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + libcfs_id2str(recv_id)); + return -EPROTO; + } + + if (hello->kshm_ctype == SOCKLND_CONN_NONE) { + /* Possible protocol mismatch or I lost the connection race */ + return proto_match ? EALREADY : EPROTO; + } + + if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { + CERROR("Mismatched types: me %d, %s ip %pI4h %d\n", + conn->ksnc_type, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + hello->kshm_ctype); + return -EPROTO; + } + return 0; +} + +static int +ksocknal_connect(struct ksock_route *route) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct ksock_peer_ni *peer_ni = route->ksnr_peer; + int type; + int wanted; + struct socket *sock; + time64_t deadline; + int retry_later = 0; + int rc = 0; + + deadline = ktime_get_seconds() + lnet_get_lnd_timeout(); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + LASSERT (route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + + route->ksnr_connecting = 1; + + for (;;) { + wanted = ksocknal_route_mask() & ~route->ksnr_connected; + + /* stop connecting if peer_ni/route got closed under me, or + * route got connected while queued */ + if (peer_ni->ksnp_closing || route->ksnr_deleted || + wanted == 0) { + retry_later = 0; + break; + } + + /* reschedule if peer_ni is connecting to me */ + if (peer_ni->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer_ni %s(%d) already connecting to me, retry later.\n", + libcfs_nid2str(peer_ni->ksnp_id.nid), peer_ni->ksnp_accepting); + retry_later = 1; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) { + type = SOCKLND_CONN_BULK_IN; + } else { + LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0); + type = SOCKLND_CONN_BULK_OUT; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (ktime_get_seconds() >= deadline) { + rc = -ETIMEDOUT; + lnet_connect_console_error(rc, peer_ni->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, + route->ksnr_myipaddr, + route->ksnr_ipaddr, route->ksnr_port, + peer_ni->ksnp_ni->ni_net_ns); + if (rc != 0) + goto failed; + + rc = ksocknal_create_conn(peer_ni->ksnp_ni, route, sock, type); + if (rc < 0) { + lnet_connect_console_error(rc, peer_ni->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + /* A +ve RC means I have to retry because I lost the connection + * race or I have to renegotiate protocol version */ + retry_later = (rc != 0); + if (retry_later) + CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n", + libcfs_nid2str(peer_ni->ksnp_id.nid)); + + 
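+ /* retake the global lock before looping back to re-check which
+ * connection types (if any) are still wanted on this route */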
write_lock_bh(&ksocknal_data.ksnd_global_lock);
+ }
+
+ route->ksnr_scheduled = 0;
+ route->ksnr_connecting = 0;
+
+ if (retry_later) {
+ /* re-queue for attention; this frees me up to handle
+ * the peer_ni's incoming connection request */
+
+ if (rc == EALREADY ||
+ (rc == 0 && peer_ni->ksnp_accepting > 0)) {
+ /* We want to introduce a delay before the next
+ * connection attempt if we lost the connection race,
+ * but the race is usually resolved quickly, so
+ * min_reconnectms should be a good heuristic */
+ route->ksnr_retry_interval = *ksocknal_tunables.ksnd_min_reconnectms / 1000;
+ route->ksnr_timeout = ktime_get_seconds() +
+ route->ksnr_retry_interval;
+ }
+
+ ksocknal_launch_connection_locked(route);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+ return retry_later;
+
+ failed:
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ route->ksnr_scheduled = 0;
+ route->ksnr_connecting = 0;
+
+ /* This is a retry rather than a new connection */
+ route->ksnr_retry_interval *= 2;
+ route->ksnr_retry_interval =
+ max_t(time64_t, route->ksnr_retry_interval,
+ *ksocknal_tunables.ksnd_min_reconnectms / 1000);
+ route->ksnr_retry_interval =
+ min_t(time64_t, route->ksnr_retry_interval,
+ *ksocknal_tunables.ksnd_max_reconnectms / 1000);
+
+ LASSERT(route->ksnr_retry_interval);
+ route->ksnr_timeout = ktime_get_seconds() + route->ksnr_retry_interval;
+
+ if (!list_empty(&peer_ni->ksnp_tx_queue) &&
+ peer_ni->ksnp_accepting == 0 &&
+ ksocknal_find_connecting_route_locked(peer_ni) == NULL) {
+ struct ksock_conn *conn;
+
+ /* ksnp_tx_queue is queued on a conn on successful
+ * connection for V1.x and V2.x */
+ if (!list_empty(&peer_ni->ksnp_conns)) {
+ conn = list_entry(peer_ni->ksnp_conns.next,
+ struct ksock_conn, ksnc_list);
+ LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+ }
+
+ /* take all the blocked packets while I've got the lock and
+ * complete below... */
+ list_splice_init(&peer_ni->ksnp_tx_queue, &zombies);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_peer_failed(peer_ni);
+ ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, rc);
+ return 0;
+}
+
+/*
+ * Check whether we need to create more connds.
+ * It will try to create a new thread if necessary; @timeout can be
+ * updated if thread creation fails, so the caller won't keep retrying
+ * while running out of resources.
+ */ +static int +ksocknal_connd_check_start(time64_t sec, long *timeout) +{ + char name[16]; + int rc; + int total = ksocknal_data.ksnd_connd_starting + + ksocknal_data.ksnd_connd_running; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (total >= *ksocknal_tunables.ksnd_nconnds_max || + total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) { + /* can't create more connd, or still have enough + * threads to handle more connecting */ + return 0; + } + + if (list_empty(&ksocknal_data.ksnd_connd_routes)) { + /* no pending connecting request */ + return 0; + } + + if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) { + /* may run out of resource, retry later */ + *timeout = cfs_time_seconds(1); + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* serialize starting to avoid flood */ + return 0; + } + + ksocknal_data.ksnd_connd_starting_stamp = sec; + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + /* NB: total is the next id */ + snprintf(name, sizeof(name), "socknal_cd%02d", total); + rc = ksocknal_thread_start(ksocknal_connd, NULL, name); + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + if (rc == 0) + return 1; + + /* we tried ... */ + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds(); + + return 1; +} + +/* + * check whether current thread can exit, it will return 1 if there are too + * many threads and no creating in past 120 seconds. + * Also, this function may update @timeout to make caller come back + * again to recheck these conditions. + */ +static int +ksocknal_connd_check_stop(time64_t sec, long *timeout) +{ + int val; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* in progress of starting new thread */ + return 0; + } + + if (ksocknal_data.ksnd_connd_running <= + *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ + return 0; + } + + /* created thread in past 120 seconds? */ + val = (int)(ksocknal_data.ksnd_connd_starting_stamp + + SOCKNAL_CONND_TIMEOUT - sec); + + *timeout = (val > 0) ? 
cfs_time_seconds(val) : + cfs_time_seconds(SOCKNAL_CONND_TIMEOUT); + if (val > 0) + return 0; + + /* no creating in past 120 seconds */ + + return ksocknal_data.ksnd_connd_running > + ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; +} + +/* Go through connd_routes queue looking for a route that we can process + * right now, @timeout_p can be updated if we need to come back later */ +static struct ksock_route * +ksocknal_connd_get_route_locked(signed long *timeout_p) +{ + time64_t now = ktime_get_seconds(); + struct ksock_route *route; + + /* connd_routes can contain both pending and ordinary routes */ + list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes, + ksnr_connd_list) { + + if (route->ksnr_retry_interval == 0 || + now >= route->ksnr_timeout) + return route; + + if (*timeout_p == MAX_SCHEDULE_TIMEOUT || + *timeout_p > cfs_time_seconds(route->ksnr_timeout - now)) + *timeout_p = cfs_time_seconds(route->ksnr_timeout - now); + } + + return NULL; +} + +int +ksocknal_connd(void *arg) +{ + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + struct ksock_connreq *cr; + wait_queue_entry_t wait; + int nloops = 0; + int cons_retry = 0; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + + spin_lock_bh(connd_lock); + + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_running++; + + while (!ksocknal_data.ksnd_shuttingdown) { + struct ksock_route *route = NULL; + time64_t sec = ktime_get_real_seconds(); + long timeout = MAX_SCHEDULE_TIMEOUT; + int dropped_lock = 0; + + if (ksocknal_connd_check_stop(sec, &timeout)) { + /* wakeup another one to check stop */ + wake_up(&ksocknal_data.ksnd_connd_waitq); + break; + } + + if (ksocknal_connd_check_start(sec, &timeout)) { + /* created new thread */ + dropped_lock = 1; + } + + if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { + /* Connection accepted by the listener */ + cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next, + struct ksock_connreq, ksncr_list); + + list_del(&cr->ksncr_list); + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + ksocknal_create_conn(cr->ksncr_ni, NULL, + cr->ksncr_sock, SOCKLND_CONN_NONE); + lnet_ni_decref(cr->ksncr_ni); + LIBCFS_FREE(cr, sizeof(*cr)); + + spin_lock_bh(connd_lock); + } + + /* Only handle an outgoing connection request if there + * is a thread left to handle incoming connections and + * create new connd */ + if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < + ksocknal_data.ksnd_connd_running) { + route = ksocknal_connd_get_route_locked(&timeout); + } + if (route != NULL) { + list_del(&route->ksnr_connd_list); + ksocknal_data.ksnd_connd_connecting++; + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + if (ksocknal_connect(route)) { + /* consecutive retry */ + if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { + CWARN("massive consecutive " + "re-connecting to %pI4h\n", + &route->ksnr_ipaddr); + cons_retry = 0; + } + } else { + cons_retry = 0; + } + + ksocknal_route_decref(route); + + spin_lock_bh(connd_lock); + ksocknal_data.ksnd_connd_connecting--; + } + + if (dropped_lock) { + if (++nloops < SOCKNAL_RESCHED) + continue; + spin_unlock_bh(connd_lock); + nloops = 0; + cond_resched(); + spin_lock_bh(connd_lock); + continue; + } + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_unlock_bh(connd_lock); + + nloops = 0; + schedule_timeout(timeout); + + set_current_state(TASK_RUNNING); + 
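+ /* woken up or timed out: take this thread off the connd wait queue
+ * and retake connd_lock before re-checking for work */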
remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+ spin_lock_bh(connd_lock);
+ }
+ ksocknal_data.ksnd_connd_running--;
+ spin_unlock_bh(connd_lock);
+
+ ksocknal_thread_fini();
+ return 0;
+}
+
+static struct ksock_conn *
+ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni)
+{
+ /* We're called with a shared lock on ksnd_global_lock */
+ struct ksock_conn *conn;
+ struct list_head *ctmp;
+ struct ksock_tx *tx;
+
+ list_for_each(ctmp, &peer_ni->ksnp_conns) {
+ int error;
+
+ conn = list_entry(ctmp, struct ksock_conn, ksnc_list);
+
+ /* Don't need the {get,put}connsock dance to deref ksnc_sock */
+ LASSERT (!conn->ksnc_closing);
+
+ error = conn->ksnc_sock->sk->sk_err;
+ if (error != 0) {
+ ksocknal_conn_addref(conn);
+
+ switch (error) {
+ case ECONNRESET:
+ CNETERR("A connection with %s "
+ "(%pI4h:%d) was reset; "
+ "it may have rebooted.\n",
+ libcfs_id2str(peer_ni->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ break;
+ case ETIMEDOUT:
+ CNETERR("A connection with %s "
+ "(%pI4h:%d) timed out; the "
+ "network or node may be down.\n",
+ libcfs_id2str(peer_ni->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ break;
+ default:
+ CNETERR("An unexpected network error %d "
+ "occurred with %s "
+ "(%pI4h:%d)\n", error,
+ libcfs_id2str(peer_ni->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port);
+ break;
+ }
+
+ return (conn);
+ }
+
+ if (conn->ksnc_rx_started &&
+ ktime_get_seconds() >= conn->ksnc_rx_deadline) {
+ /* Timed out incomplete incoming message */
+ ksocknal_conn_addref(conn);
+ CNETERR("Timeout receiving from %s (%pI4h:%d), "
+ "state %d wanted %d left %d\n",
+ libcfs_id2str(peer_ni->ksnp_id),
+ &conn->ksnc_ipaddr,
+ conn->ksnc_port,
+ conn->ksnc_rx_state,
+ conn->ksnc_rx_nob_wanted,
+ conn->ksnc_rx_nob_left);
+ return (conn);
+ }
+
+ if ((!list_empty(&conn->ksnc_tx_queue) ||
+ conn->ksnc_sock->sk->sk_wmem_queued != 0) &&
+ ktime_get_seconds() >= conn->ksnc_tx_deadline) {
+ /* Timed out messages queued for sending or
+ * buffered in the socket's send buffer */
+ ksocknal_conn_addref(conn);
+ list_for_each_entry(tx, &conn->ksnc_tx_queue,
+ tx_list)
+ tx->tx_hstatus =
+ LNET_MSG_STATUS_LOCAL_TIMEOUT;
+ CNETERR("Timeout sending data to %s (%pI4h:%d); "
+ "the network or that node may be down.\n",
+ libcfs_id2str(peer_ni->ksnp_id),
+ &conn->ksnc_ipaddr, conn->ksnc_port);
+ return (conn);
+ }
+ }
+
+ return (NULL);
+}
+
+static inline void
+ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni)
+{
+ struct ksock_tx *tx;
+ struct list_head stale_txs = LIST_HEAD_INIT(stale_txs);
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ while (!list_empty(&peer_ni->ksnp_tx_queue)) {
+ tx = list_entry(peer_ni->ksnp_tx_queue.next,
+ struct ksock_tx, tx_list);
+
+ if (ktime_get_seconds() < tx->tx_deadline)
+ break;
+
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT;
+
+ list_del(&tx->tx_list);
+ list_add_tail(&tx->tx_list, &stale_txs);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, -ETIMEDOUT);
+}
+
+static int
+ksocknal_send_keepalive_locked(struct ksock_peer_ni *peer_ni)
+__must_hold(&ksocknal_data.ksnd_global_lock)
+{
+ struct ksock_sched *sched;
+ struct ksock_conn *conn;
+ struct ksock_tx *tx;
+
+ /* last_alive will be updated by create_conn */
+ if (list_empty(&peer_ni->ksnp_conns))
+ return 0;
+
+ if (peer_ni->ksnp_proto != &ksocknal_protocol_v3x)
+ return 0;
+
+ if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+ ktime_get_seconds() < peer_ni->ksnp_last_alive +
+
*ksocknal_tunables.ksnd_keepalive) + return 0; + + if (ktime_get_seconds() < peer_ni->ksnp_send_keepalive) + return 0; + + /* retry 10 secs later, so we wouldn't put pressure + * on this peer_ni if we failed to send keepalive this time */ + peer_ni->ksnp_send_keepalive = ktime_get_seconds() + 10; + + conn = ksocknal_find_conn_locked(peer_ni, NULL, 1); + if (conn != NULL) { + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + if (!list_empty(&conn->ksnc_tx_queue)) { + spin_unlock_bh(&sched->kss_lock); + /* there is an queued ACK, don't need keepalive */ + return 0; + } + + spin_unlock_bh(&sched->kss_lock); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* cookie = 1 is reserved for keepalive PING */ + tx = ksocknal_alloc_tx_noop(1, 1); + if (tx == NULL) { + read_lock(&ksocknal_data.ksnd_global_lock); + return -ENOMEM; + } + + if (ksocknal_launch_packet(peer_ni->ksnp_ni, tx, peer_ni->ksnp_id) == 0) { + read_lock(&ksocknal_data.ksnd_global_lock); + return 1; + } + + ksocknal_free_tx(tx); + read_lock(&ksocknal_data.ksnd_global_lock); + + return -EIO; +} + + +static void +ksocknal_check_peer_timeouts(int idx) +{ + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + struct ksock_tx *tx; + + again: + /* NB. We expect to have a look at all the peers and not find any + * connections to time out, so we just use a shared lock while we + * take a look... */ + read_lock(&ksocknal_data.ksnd_global_lock); + + list_for_each_entry(peer_ni, peers, ksnp_list) { + struct ksock_tx *tx_stale; + time64_t deadline = 0; + int resid = 0; + int n = 0; + + if (ksocknal_send_keepalive_locked(peer_ni) != 0) { + read_unlock(&ksocknal_data.ksnd_global_lock); + goto again; + } + + conn = ksocknal_find_timed_out_conn (peer_ni); + + if (conn != NULL) { + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + + /* NB we won't find this one again, but we can't + * just proceed with the next peer_ni, since we dropped + * ksnd_global_lock and it might be dead already! 
*/ + ksocknal_conn_decref(conn); + goto again; + } + + /* we can't process stale txs right here because we're + * holding only shared lock */ + if (!list_empty(&peer_ni->ksnp_tx_queue)) { + struct ksock_tx *tx; + + tx = list_entry(peer_ni->ksnp_tx_queue.next, + struct ksock_tx, tx_list); + if (ktime_get_seconds() >= tx->tx_deadline) { + ksocknal_peer_addref(peer_ni); + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer_ni); + + ksocknal_peer_decref(peer_ni); + goto again; + } + } + + if (list_empty(&peer_ni->ksnp_zc_req_list)) + continue; + + tx_stale = NULL; + spin_lock(&peer_ni->ksnp_lock); + list_for_each_entry(tx, &peer_ni->ksnp_zc_req_list, tx_zc_list) { + if (ktime_get_seconds() < tx->tx_deadline) + break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; + n++; + if (tx_stale == NULL) + tx_stale = tx; + } + + if (tx_stale == NULL) { + spin_unlock(&peer_ni->ksnp_lock); + continue; + } + + deadline = tx_stale->tx_deadline; + resid = tx_stale->tx_resid; + conn = tx_stale->tx_conn; + ksocknal_conn_addref(conn); + + spin_unlock(&peer_ni->ksnp_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer_ni %s detected; the " + "oldest(%p) timed out %lld secs ago, " + "resid: %d, wmem: %d\n", + n, libcfs_nid2str(peer_ni->ksnp_id.nid), tx_stale, + ktime_get_seconds() - deadline, + resid, conn->ksnc_sock->sk->sk_wmem_queued); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +int ksocknal_reaper(void *arg) +{ + wait_queue_entry_t wait; + struct ksock_conn *conn; + struct ksock_sched *sched; + struct list_head enomem_conns; + int nenomem_conns; + time64_t timeout; + int i; + int peer_index = 0; + time64_t deadline = ktime_get_seconds(); + + cfs_block_allsigs (); + + INIT_LIST_HEAD(&enomem_conns); + init_waitqueue_entry(&wait, current); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) { + conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next, + struct ksock_conn, ksnc_list); + list_del(&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_terminate_conn(conn); + ksocknal_conn_decref(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) { + conn = list_entry(ksocknal_data.ksnd_zombie_conns.next, + struct ksock_conn, ksnc_list); + list_del(&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_destroy_conn(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty(&ksocknal_data.ksnd_enomem_conns)) { + list_add(&enomem_conns, + &ksocknal_data.ksnd_enomem_conns); + list_del_init(&ksocknal_data.ksnd_enomem_conns); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* reschedule all the connections that stalled with ENOMEM... 
*/ + nenomem_conns = 0; + while (!list_empty(&enomem_conns)) { + conn = list_entry(enomem_conns.next, + struct ksock_conn, ksnc_tx_list); + list_del(&conn->ksnc_tx_list); + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + LASSERT(conn->ksnc_tx_scheduled); + conn->ksnc_tx_ready = 1; + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + wake_up(&sched->kss_waitq); + + spin_unlock_bh(&sched->kss_lock); + nenomem_conns++; + } + + /* careful with the jiffy wrap... */ + while ((timeout = deadline - ktime_get_seconds()) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + unsigned int lnd_timeout; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer_ni + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + lnd_timeout = lnet_get_lnd_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } + + deadline += p; + } + + if (nenomem_conns != 0) { + /* Reduce my timeout if I rescheduled ENOMEM conns. + * This also prevents me getting woken immediately + * if any go back on my enomem list. */ + timeout = SOCKNAL_ENOMEM_RETRY; + } + ksocknal_data.ksnd_reaper_waketime = ktime_get_seconds() + + timeout; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty(&ksocknal_data.ksnd_deathrow_conns) && + list_empty(&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout(cfs_time_seconds(timeout)); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c new file mode 100644 index 0000000000000..72f2bd526613e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -0,0 +1,715 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#include "socklnd.h" + +int +ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) +{ + int rc = lnet_sock_getaddr(conn->ksnc_sock, true, + &conn->ksnc_ipaddr, + &conn->ksnc_port); + + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); + + if (rc != 0) { + CERROR ("Error %d getting sock peer_ni IP\n", rc); + return rc; + } + + rc = lnet_sock_getaddr(conn->ksnc_sock, false, + &conn->ksnc_myipaddr, NULL); + if (rc != 0) { + CERROR ("Error %d getting sock local IP\n", rc); + return rc; + } + + return 0; +} + +int +ksocknal_lib_zc_capable(struct ksock_conn *conn) +{ + int caps = conn->ksnc_sock->sk->sk_route_caps; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x) + return 0; + + /* ZC if the socket supports scatter/gather and doesn't need software + * checksums */ + return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_CSUM_MASK) != 0); +} + +int +ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratchiov) +{ + struct socket *sock = conn->ksnc_sock; + int nob; + int rc; + + if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ + conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ + tx->tx_nob == tx->tx_resid && /* frist sending */ + tx->tx_msg.ksm_csum == 0) /* not checksummed */ + ksocknal_lib_csum_tx(tx); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + + { +#if SOCKNAL_SINGLE_FRAG_TX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + unsigned int niov = tx->tx_niov; +#endif + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = tx->tx_iov[i]; + nob += scratchiov[i].iov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); + } + return rc; +} + +int +ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratchiov) +{ + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; + + /* Not NOOP message */ + LASSERT(tx->tx_lnetmsg != NULL); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); + + if (!list_empty(&conn->ksnc_tx_queue) || + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } + } else { +#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
+#endif + unsigned int niov = tx->tx_nkiov; +#endif + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); + + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + return rc; +} + +void +ksocknal_lib_eager_ack(struct ksock_conn *conn) +{ + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK on, + * introducing delay in completing zero-copy sends in my peer_ni. + */ + + tcp_sock_set_quickack(sock->sk, 1); +} + +int +ksocknal_lib_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct kvec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + int fragnob; + int sum; + __u32 saved_csum; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + LASSERT (niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, niov, nob, + MSG_DONTWAIT); + + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct kvec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT (niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != + PAGE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; +} + +int +ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov) +{ +#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
+#endif + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + void *base; + void *addr; + int sum; + int fragnob; + int n; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { + nob = scratchiov[0].iov_len; + n = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + n = niov; + } + + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, n, nob, + MSG_DONTWAIT); + + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + /* Dang! have to kmap again because I have nowhere to stash the + * mapped address. But by doing it while the page is still + * mapped, the kernel just bumps the map count and returns me + * the address it stashed. */ + base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; + fragnob = kiov[i].kiov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].kiov_page); + } + } + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + + return (rc); +} + +void +ksocknal_lib_csum_tx(struct ksock_tx *tx) +{ + int i; + __u32 csum; + void *base; + + LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); + + tx->tx_msg.ksm_csum = 0; + + csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base, + tx->tx_iov[0].iov_len); + + if (tx->tx_kiov != NULL) { + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].kiov_page) + + tx->tx_kiov[i].kiov_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); + + kunmap(tx->tx_kiov[i].kiov_page); + } + } else { + for (i = 1; i < tx->tx_niov; i++) + csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, + tx->tx_iov[i].iov_len); + } + + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; + } + + tx->tx_msg.ksm_csum = csum; +} + +int +ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle) +{ + struct socket *sock = conn->ksnc_sock; + struct tcp_sock *tp = tcp_sk(sock->sk); + + if (ksocknal_connsock_addref(conn) < 0) { + LASSERT(conn->ksnc_closing); + *txmem = 0; + *rxmem = 0; + *nagle = 0; + return -ESHUTDOWN; + } + + lnet_sock_getbuf(sock, txmem, rxmem); + + *nagle = !(tp->nonagle & TCP_NAGLE_OFF); + + ksocknal_connsock_decref(conn); + + + return 0; +} + +int +ksocknal_lib_setup_sock (struct socket *sock) +{ + int rc; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct tcp_sock *tp = tcp_sk(sock->sk); + + sock->sk->sk_allocation = GFP_NOFS; + + /* Ensure this socket aborts active sends immediately when closed. 
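+	 * SOCK_LINGER is cleared so close() does not linger waiting for queued
+	 * data, and the negative linger2 set below lets TCP drop the connection
+	 * rather than sit in FIN_WAIT2.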
*/ + sock_reset_flag(sock->sk, SOCK_LINGER); + + tp->linger2 = -1; + + if (!*ksocknal_tunables.ksnd_nagle) + tcp_sock_set_nodelay(sock->sk); + + lnet_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ +#ifdef SOCKNAL_BACKOFF + if (*ksocknal_tunables.ksnd_backoff_init > 0) { + int option = *ksocknal_tunables.ksnd_backoff_init; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_INIT, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set initial tcp backoff %d: %d\n", + option, rc); + return rc; + } + } + + if (*ksocknal_tunables.ksnd_backoff_max > 0) { + int option = *ksocknal_tunables.ksnd_backoff_max; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_MAX, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set maximum tcp backoff %d: %d\n", + option, rc); + return rc; + } + } +#endif + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + +#ifdef HAVE_KERNEL_SETSOCKOPT + /* open-coded version doesn't work in all kernels, and + * there is no helper function, so call kernel_setsockopt() + * directly. + */ + { + int option = (do_keepalive ? 1 : 0); + kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof(option)); + } +#else + if (sock->sk->sk_prot->keepalive) + sock->sk->sk_prot->keepalive(sock->sk, do_keepalive); + if (do_keepalive) + sock_set_flag(sock->sk, SOCK_KEEPOPEN); + else + sock_reset_flag(sock->sk, SOCK_KEEPOPEN); +#endif /* HAVE_KERNEL_SETSOCKOPT */ + + if (!do_keepalive) + return (0); + + rc = tcp_sock_set_keepidle(sock->sk, keep_idle); + if (rc != 0) { + CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); + return rc; + } + + rc = tcp_sock_set_keepintvl(sock->sk, keep_intvl); + if (rc != 0) { + CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); + return rc; + } + + rc = tcp_sock_set_keepcnt(sock->sk, keep_count); + if (rc != 0) { + CERROR("Can't set TCP_KEEPCNT: %d\n", rc); + return rc; + } + + return (0); +} + +void +ksocknal_lib_push_conn(struct ksock_conn *conn) +{ + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = tcp_sk(sk); + + lock_sock(sk); + nonagle = tp->nonagle; + tp->nonagle = TCP_NAGLE_OFF; + release_sock(sk); + + tcp_sock_set_nodelay(conn->ksnc_sock->sk); + + lock_sock(sk); + tp->nonagle = nonagle; + release_sock(sk); + + ksocknal_connsock_decref(conn); +} + +void ksocknal_read_callback(struct ksock_conn *conn); +void ksocknal_write_callback(struct ksock_conn *conn); +/* + * socket call back in Linux + */ +static void +#ifdef HAVE_SK_DATA_READY_ONE_ARG +ksocknal_data_ready(struct sock *sk) +#else +ksocknal_data_ready(struct sock *sk, int n) +#endif +{ + struct ksock_conn *conn; + ENTRY; + + /* interleave correctly with closing sockets... 
*/ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT(sk->sk_data_ready != &ksocknal_data_ready); +#ifdef HAVE_SK_DATA_READY_ONE_ARG + sk->sk_data_ready(sk); +#else + sk->sk_data_ready(sk, n); +#endif + } else + ksocknal_read_callback(conn); + + read_unlock(&ksocknal_data.ksnd_global_lock); + + EXIT; +} + +static void +ksocknal_write_space (struct sock *sk) +{ + struct ksock_conn *conn; + int wspace; + int min_wpace; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + wspace = sk_stream_wspace(sk); + min_wpace = sk_stream_min_wspace(sk); + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, wspace, min_wpace, conn, + (conn == NULL) ? "" : (conn->ksnc_tx_ready ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT (sk->sk_write_space != &ksocknal_write_space); + sk->sk_write_space (sk); + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; + } + + if (wspace >= min_wpace) { /* got enough space */ + ksocknal_write_callback(conn); + + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). */ + + clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +void +ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn) +{ + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; +} + +void +ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) +{ + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; + return; +} + +void +ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) +{ + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + sock->sk->sk_user_data = NULL; + + return ; +} + +int +ksocknal_lib_memory_pressure(struct ksock_conn *conn) +{ + int rc = 0; + struct ksock_sched *sched; + + sched = conn->ksnc_scheduler; + spin_lock_bh(&sched->kss_lock); + + if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). 
If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + spin_unlock_bh(&sched->kss_lock); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 0000000000000..df9d96e6e4cfc --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +static int sock_timeout = 50; +module_param(sock_timeout, int, 0644); +MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); + +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = DEFAULT_PEER_TIMEOUT; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +/* Number of daemons in each thread pool which is percpt, + * we will estimate reasonable value based on CPUs if it's not set. 
*/ +static unsigned int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting"); + +static int nconnds = 4; +module_param(nconnds, int, 0444); +MODULE_PARM_DESC(nconnds, "# connection daemons while starting"); + +static int nconnds_max = 64; +module_param(nconnds_max, int, 0444); +MODULE_PARM_DESC(nconnds_max, "max # connection daemons"); + +static int min_reconnectms = 1000; +module_param(min_reconnectms, int, 0644); +MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)"); + +static int max_reconnectms = 60000; +module_param(max_reconnectms, int, 0644); +MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)"); + +static int eager_ack; +module_param(eager_ack, int, 0644); +MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly"); + +static int typed_conns = 1; +module_param(typed_conns, int, 0444); +MODULE_PARM_DESC(typed_conns, "use different sockets for bulk"); + +static int min_bulk = (1<<10); +module_param(min_bulk, int, 0644); +MODULE_PARM_DESC(min_bulk, "smallest 'large' message"); + +# define DEFAULT_BUFFER_SIZE 0 +static int tx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(tx_buffer_size, int, 0644); +MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)"); + +static int rx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(rx_buffer_size, int, 0644); +MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)"); + +static int nagle = 0; +module_param(nagle, int, 0644); +MODULE_PARM_DESC(nagle, "enable NAGLE?"); + +static int round_robin = 1; +module_param(round_robin, int, 0644); +MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces"); + +static int keepalive = 30; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "# seconds before send keepalive"); + +static int keepalive_idle = 30; +module_param(keepalive_idle, int, 0644); +MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe"); + +#define DEFAULT_KEEPALIVE_COUNT 5 +static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; +module_param(keepalive_count, int, 0644); +MODULE_PARM_DESC(keepalive_count, "# missed probes == dead"); + +static int keepalive_intvl = 5; +module_param(keepalive_intvl, int, 0644); +MODULE_PARM_DESC(keepalive_intvl, "seconds between probes"); + +static int enable_csum = 0; +module_param(enable_csum, int, 0644); +MODULE_PARM_DESC(enable_csum, "enable check sum"); + +static int inject_csum_error = 0; +module_param(inject_csum_error, int, 0644); +MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error"); + +#ifdef CPU_AFFINITY +static int enable_irq_affinity = 0; +module_param(enable_irq_affinity, int, 0644); +MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity"); +#endif + +static int nonblk_zcack = 1; +module_param(nonblk_zcack, int, 0644); +MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection"); + +static unsigned int zc_min_payload = (16 << 10); +module_param(zc_min_payload, int, 0644); +MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy"); + +static unsigned int zc_recv = 0; +module_param(zc_recv, int, 0644); +MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver"); + +static unsigned int zc_recv_min_nfrags = 16; +module_param(zc_recv_min_nfrags, int, 0644); +MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv"); + +#ifdef SOCKNAL_BACKOFF +static int backoff_init = 3; +module_param(backoff_init, int, 0644); 
+MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff"); + +static int backoff_max = 3; +module_param(backoff_max, int, 0644); +MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff"); +#endif + +#if SOCKNAL_VERSION_DEBUG +static int protocol = 3; +module_param(protocol, int, 0644); +MODULE_PARM_DESC(protocol, "protocol version"); +#endif + +struct ksock_tunables ksocknal_tunables; + +int ksocknal_tunables_init(void) +{ + + /* initialize ksocknal_tunables structure */ + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nscheds = &nscheds; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_round_robin = &round_robin; + ksocknal_tunables.ksnd_keepalive = &keepalive; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peertxcredits = &peer_credits; + ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; + ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + +#ifdef CPU_AFFINITY + if (enable_irq_affinity) { + CWARN("irq_affinity is removed from socklnd because modern " + "computer always has fast CPUs and more cores than " + "# NICs, although you still can set irq_affinity by " + "another way, please check manual for details.\n"); + } + ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity; +#endif + +#ifdef SOCKNAL_BACKOFF + ksocknal_tunables.ksnd_backoff_init = &backoff_init; + ksocknal_tunables.ksnd_backoff_max = &backoff_max; +#endif + +#if SOCKNAL_VERSION_DEBUG + ksocknal_tunables.ksnd_protocol = &protocol; +#endif + + if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) + *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10); + + return 0; +}; diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c new file mode 100644 index 0000000000000..6dd648a2299cc --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2017, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +/* + * Protocol entries : + * pro_send_hello : send hello message + * pro_recv_hello : receive hello message + * pro_pack : pack message header + * pro_unpack : unpack message header + * pro_queue_tx_zcack() : Called holding BH lock: kss_lock + * return 1 if ACK is piggybacked, otherwise return 0 + * pro_queue_tx_msg() : Called holding BH lock: kss_lock + * return the ACK that piggybacked by my message, or NULL + * pro_handle_zcreq() : handler of incoming ZC-REQ + * pro_handle_zcack() : handler of incoming ZC-ACK + * pro_match_tx() : Called holding glock + */ + +static struct ksock_tx * +ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg) +{ + /* V1.x, just enqueue it */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; +} + +void +ksocknal_next_tx_carrier(struct ksock_conn *conn) +{ + struct ksock_tx *tx = conn->ksnc_tx_carrier; + + /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + LASSERT(tx != NULL); + + /* Next TX that can carry ZC-ACK or LNet message */ + if (tx->tx_list.next == &conn->ksnc_tx_queue) { + /* no more packets queued */ + conn->ksnc_tx_carrier = NULL; + } else { + conn->ksnc_tx_carrier = list_entry(tx->tx_list.next, + struct ksock_tx, tx_list); + LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == + tx->tx_msg.ksm_type); + } +} + +static int +ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) +{ + struct ksock_tx *tx = conn->ksnc_tx_carrier; + + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* + * Enqueue or piggyback tx_ack / cookie + * . no tx can piggyback cookie of tx_ack (or cookie), just + * enqueue the tx_ack (if tx_ack != NUL) and return NULL. + * . There is tx can piggyback cookie of tx_ack (or cookie), + * piggyback the cookie and return the tx. + */ + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* tx is noop zc-ack, can't piggyback zc-ack cookie */ + if (tx_ack != NULL) + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + return 0; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); + LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0); + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + /* piggyback the zc-ack cookie */ + tx->tx_msg.ksm_zc_cookies[1] = cookie; + /* move on to the next TX which can carry cookie */ + ksocknal_next_tx_carrier(conn); + + return 1; +} + +static struct ksock_tx * +ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) +{ + struct ksock_tx *tx = conn->ksnc_tx_carrier; + + /* + * Enqueue tx_msg: + * . If there is no NOOP on the connection, just enqueue + * tx_msg and return NULL + * . If there is NOOP on the connection, piggyback the cookie + * and replace the NOOP tx, and return the NOOP tx. 
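+ *   (e.g. a queued NOOP already carrying ZC-ACK cookie 5 hands its cookie
+ *    to tx_msg, tx_msg takes the NOOP's place in the queue, and the
+ *    displaced NOOP is returned to the caller)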
+ */ + if (tx == NULL) { /* nothing on queue */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_msg; + return NULL; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; + } + + LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* There is a noop zc-ack can be piggybacked */ + tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; + ksocknal_next_tx_carrier(conn); + + /* use new_tx to replace the noop zc-ack packet */ + list_add(&tx_msg->tx_list, &tx->tx_list); + list_del(&tx->tx_list); + + return tx; +} + +static int +ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) +{ + struct ksock_tx *tx; + + if (conn->ksnc_type != SOCKLND_CONN_ACK) + return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); + + /* non-blocking ZC-ACK (to router) */ + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + if ((tx = conn->ksnc_tx_carrier) == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + /* conn->ksnc_tx_carrier != NULL */ + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ + return 1; + + if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { + /* replace the keepalive PING with a real ACK */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] || + cookie == tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX return error in the future */ + } + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */ + if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { + tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + } else { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + } + + if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { + /* not likely to carry more ACKs, skip it to simplify logic */ + ksocknal_next_tx_carrier(conn); + } + + return 1; + } + + /* takes two or more cookies already */ + + if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) { + __u64 tmp = 0; + + /* two separated cookies: (a+2, a) or (a+1, a) */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] <= 2); + + if (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] == 2) { + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) + tmp = cookie; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { + tmp = tx->tx_msg.ksm_zc_cookies[1]; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { + tmp = tx->tx_msg.ksm_zc_cookies[0]; + } + + if (tmp != 0) { + /* range of cookies */ + tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; + tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; + return 1; + } + + } else { + /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */ + if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && + cookie <= tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX: return error in the future */ + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; 
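+			/* e.g. an existing range (3, 7) has just grown to (3, 8) */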
+ } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + return 1; + } + } + + /* failed to piggyback ZC-ACK */ + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); + /* the next tx can piggyback at least 1 ACK */ + ksocknal_next_tx_carrier(conn); + } + + return 0; +} + +static int +ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +{ + int nob; + +#if SOCKNAL_VERSION_DEBUG + if (!*ksocknal_tunables.ksnd_typed_conns) + return SOCKNAL_MATCH_YES; +#endif + + if (tx == NULL || tx->tx_lnetmsg == NULL) { + /* noop packet */ + nob = offsetof(struct ksock_msg, ksm_u); + } else { + nob = tx->tx_lnetmsg->msg_len + + ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? + sizeof(struct lnet_hdr) : sizeof(struct ksock_msg)); + } + + /* default checking for typed connection */ + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_BULK_IN: + return SOCKNAL_MATCH_MAY; + + case SOCKLND_CONN_BULK_OUT: + if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +{ + int nob; + + if (tx == NULL || tx->tx_lnetmsg == NULL) + nob = offsetof(struct ksock_msg, ksm_u); + else + nob = tx->tx_lnetmsg->msg_len + sizeof(struct ksock_msg); + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote) +{ + struct ksock_peer_ni *peer_ni = c->ksnc_peer; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; + + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = ksocknal_find_conn_locked(peer_ni, NULL, !!remote); + if (conn != NULL) { + struct ksock_sched *sched = conn->ksnc_scheduler; + + LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + spin_lock_bh(&sched->kss_lock); + + rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); + + spin_unlock_bh(&sched->kss_lock); + + if (rc) { /* piggybacked */ + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* ACK connection is not ready, or can't piggyback the ACK */ + tx = ksocknal_alloc_tx_noop(cookie, !!remote); + if (tx == NULL) + return -ENOMEM; + + if ((rc = ksocknal_launch_packet(peer_ni->ksnp_ni, tx, peer_ni->ksnp_id)) == 0) + return 0; + + ksocknal_free_tx(tx); + return rc; +} + +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) 
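+/* Sketch of the cookie encoding handled here (mirroring how the peer's
+ * pro_queue_tx_zcack packs cookies): cookie1 == 0 acknowledges the single
+ * cookie2; cookie1 > cookie2 acknowledges exactly those two cookies
+ * (count == 2); otherwise [cookie1, cookie2] is an inclusive range,
+ * e.g. (3, 7) acknowledges cookies 3..7 (count == 5). */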
+{ + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); + int count; + + if (cookie1 == 0) + cookie1 = cookie2; + + count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); + + if (cookie2 == SOCKNAL_KEEPALIVE_PING && + conn->ksnc_proto == &ksocknal_protocol_v3x) { + /* keepalive PING for V3.x, just ignore it */ + return count == 1 ? 0 : -EPROTO; + } + + spin_lock(&peer_ni->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, + &peer_ni->ksnp_zc_req_list, tx_zc_list) { + __u64 c = tx->tx_msg.ksm_zc_cookies[0]; + + if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) { + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + + if (--count == 0) + break; + } + } + + spin_unlock(&peer_ni->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, struct ksock_tx, tx_zc_list); + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + + return count == 0 ? 0 : -EPROTO; +} + +static int +ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + struct lnet_hdr *hdr; + struct lnet_magicversion *hmv; + int rc; + int i; + + CLASSERT(sizeof(struct lnet_magicversion) == + offsetof(struct lnet_hdr, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate struct lnet_hdr\n"); + return -ENOMEM; + } + + hmv = (struct lnet_magicversion *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (struct lnet_hdr) + * header and send out */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hmv->version_major++; /* just different! 
*/ + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + hmv->magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + + hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid); + hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid); + hdr->type = cpu_to_le32 (LNET_MSG_HELLO); + hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32)); + hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype); + hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation); + + rc = lnet_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]); + } + + rc = lnet_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d)" + " to %pI4h/%d\n", rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hello->kshm_version++; /* just different! */ + the_lnet.ln_testprotocompat &= ~1; + } + LNET_UNLOCK(); + } + + rc = lnet_sock_write(sock, hello, offsetof(struct ksock_hello_msg, kshm_ips), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + return rc; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = lnet_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d)" + " to %pI4h/%d\n", rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } + + return rc; +} + +static int +ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + struct lnet_hdr *hdr; + int rc; + int i; + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate struct lnet_hdr\n"); + return -ENOMEM; + } + + rc = lnet_sock_read(sock, &hdr->src_nid, + sizeof(*hdr) - offsetof(struct lnet_hdr, src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading rest of HELLO hdr from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + /* ...and check we got what we expected */ + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { + CERROR ("Expecting a HELLO hdr," + " but got type %d from %pI4h\n", + le32_to_cpu (hdr->type), + &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + hello->kshm_src_nid = le64_to_cpu (hdr->src_nid); + hello->kshm_src_pid = le32_to_cpu (hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu (hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu (hdr->payload_length) / + sizeof (__u32); + + if (hello->kshm_nips > LNET_INTERFACES_NUM) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, 
&conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + rc = lnet_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + rc = -EPROTO; + break; + } + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = lnet_sock_read(sock, &hello->kshm_src_nid, + offsetof(struct ksock_hello_msg, kshm_ips) - + offsetof(struct ksock_hello_msg, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab64s(&hello->kshm_src_nid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_dst_nid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + __swab32s(&hello->kshm_nips); + } + + if (hello->kshm_nips > LNET_INTERFACES_NUM) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + return -EPROTO; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = lnet_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + return -EPROTO; + } + } + + return 0; +} + +static void +ksocknal_pack_msg_v1(struct ksock_tx *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(struct lnet_hdr); + + tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr); + tx->tx_resid = tx->tx_nob; +} + +static void +ksocknal_pack_msg_v2(struct ksock_tx *tx) +{ + tx->tx_iov[0].iov_base = (void *)&tx->tx_msg; + + if (tx->tx_lnetmsg != NULL) { + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + + tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(struct ksock_msg); + tx->tx_resid = tx->tx_nob = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len; + } else { + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx->tx_iov[0].iov_len = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); + tx->tx_resid = tx->tx_nob = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); + } + /* Don't checksum before start sending, because packet can be piggybacked with ACK */ +} + +static void +ksocknal_unpack_msg_v1(struct ksock_msg *msg) +{ + msg->ksm_csum = 0; + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +static void 
+ksocknal_unpack_msg_v2(struct ksock_msg *msg) +{ + return; /* Do nothing */ +} + +struct ksock_proto ksocknal_protocol_v1x = +{ + .pro_version = KSOCK_PROTO_V1, + .pro_send_hello = ksocknal_send_hello_v1, + .pro_recv_hello = ksocknal_recv_hello_v1, + .pro_pack = ksocknal_pack_msg_v1, + .pro_unpack = ksocknal_unpack_msg_v1, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, + .pro_handle_zcreq = NULL, + .pro_handle_zcack = NULL, + .pro_queue_tx_zcack = NULL, + .pro_match_tx = ksocknal_match_tx +}; + +struct ksock_proto ksocknal_protocol_v2x = +{ + .pro_version = KSOCK_PROTO_V2, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx +}; + +struct ksock_proto ksocknal_protocol_v3x = +{ + .pro_version = KSOCK_PROTO_V3, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v3 +}; + diff --git a/drivers/staging/lustrefsx/lnet/lnet/Makefile b/drivers/staging/lustrefsx/lnet/lnet/Makefile new file mode 100644 index 0000000000000..330de0a670651 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet.o + +lnet-y := api-ni.o config.o nidstrings.o +lnet-y += lib-me.o lib-msg.o lib-eq.o lib-md.o lib-ptl.o +lnet-y += lib-socket.o lib-move.o module.o lo.o +lnet-y += router.o router_proc.o acceptor.o peer.o net_fault.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c new file mode 100644 index 0000000000000..5be1dd88a6b2f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c @@ -0,0 +1,526 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +static int accept_port = 988; +static int accept_backlog = 127; +static int accept_timeout = 5; + +static struct { + int pta_shutdown; + struct socket *pta_sock; + struct completion pta_signal; + struct net *pta_ns; +} lnet_acceptor_state = { + .pta_shutdown = 1 +}; + +int +lnet_acceptor_port(void) +{ + return accept_port; +} + +static inline int +lnet_accept_magic(__u32 magic, __u32 constant) +{ + return (magic == constant || + magic == __swab32(constant)); +} + +EXPORT_SYMBOL(lnet_acceptor_port); + +static char *accept = "secure"; + +module_param(accept, charp, 0444); +MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)"); +module_param(accept_port, int, 0444); +MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)"); +module_param(accept_backlog, int, 0444); +MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog"); +module_param(accept_timeout, int, 0644); +MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)"); + +static char *accept_type = NULL; + +static int +lnet_acceptor_get_tunables(void) +{ + /* Userland acceptor uses 'accept_type' instead of 'accept', due to + * conflict with 'accept(2)', but kernel acceptor still uses 'accept' + * for compatibility. Hence the trick. */ + accept_type = accept; + return 0; +} + +int +lnet_acceptor_timeout(void) +{ + return accept_timeout; +} +EXPORT_SYMBOL(lnet_acceptor_timeout); + +void +lnet_connect_console_error (int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int peer_port) +{ + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + CNETERR("Connection to %s at host %pI4h on port %d was " + "refused: check that Lustre is running on that node.\n", + libcfs_nid2str(peer_nid), &peer_ip, peer_port); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + CNETERR("Connection to %s at host %pI4h " + "was unreachable: the network or that node may " + "be down, or Lustre may be misconfigured.\n", + libcfs_nid2str(peer_nid), &peer_ip); + break; + case -ETIMEDOUT: + CNETERR("Connection to %s at host %pI4h on " + "port %d took too long: that node may be hung " + "or experiencing high load.\n", + libcfs_nid2str(peer_nid), &peer_ip, peer_port); + break; + case -ECONNRESET: + LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %pI4h" + " on port %d was reset: " + "is it running a compatible version of " + "Lustre and is %s one of its NIDs?\n", + libcfs_nid2str(peer_nid), &peer_ip, + peer_port, libcfs_nid2str(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at " + "host %pI4h on port %d: is it running " + "a compatible version of Lustre?\n", + libcfs_nid2str(peer_nid), &peer_ip, + peer_port); + break; + case -EADDRINUSE: + LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to " + "connect to %s at host %pI4h on port " + "%d\n", libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + default: + LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s" + " at host %pI4h on port %d\n", rc, + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +int +lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port, struct net *ns) +{ + struct lnet_acceptor_connreq cr; + struct socket *sock; + int rc; + int port; + int fatal; + + CLASSERT(sizeof(cr) <= 16); /* not too big to be on the stack */ + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + 
--port) { + /* Iterate through reserved ports. */ + + rc = lnet_sock_connect(&sock, &fatal, + local_ip, port, + peer_ip, peer_port, ns); + if (rc != 0) { + if (fatal) + goto failed; + continue; + } + + CLASSERT(LNET_PROTO_ACCEPTOR_VERSION == 1); + + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr.acr_nid = peer_nid; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + lnet_net_lock(LNET_LOCK_EX); + if ((the_lnet.ln_testprotocompat & 4) != 0) { + cr.acr_version++; + the_lnet.ln_testprotocompat &= ~4; + } + if ((the_lnet.ln_testprotocompat & 8) != 0) { + cr.acr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~8; + } + lnet_net_unlock(LNET_LOCK_EX); + } + + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + if (rc != 0) + goto failed_sock; + + *sockp = sock; + return 0; + } + + rc = -EADDRINUSE; + goto failed; + +failed_sock: + sock_release(sock); +failed: + lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port); + return rc; +} +EXPORT_SYMBOL(lnet_connect); + +static int +lnet_accept(struct socket *sock, __u32 magic) +{ + struct lnet_acceptor_connreq cr; + __u32 peer_ip; + int peer_port; + int rc; + int flip; + struct lnet_ni *ni; + char *str; + + LASSERT(sizeof(cr) <= 16); /* not too big for the stack */ + + rc = lnet_sock_getaddr(sock, true, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response " + "to LNET magic from %pI4h: %d\n", + &peer_ip, rc); + return -EPROTO; + } + + if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) + str = "'old' socknal/tcpnal"; + else + str = "unrecognised"; + + LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %pI4h" + " magic %08x: %s acceptor protocol\n", + &peer_ip, magic, str); + return -EPROTO; + } + + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); + + rc = lnet_sock_read(sock, &cr.acr_version, + sizeof(cr.acr_version), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request version from " + "%pI4h\n", rc, &peer_ip); + return -EIO; + } + + if (flip) + __swab32s(&cr.acr_version); + + if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) { + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old".
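 + * After replying, the connection is refused with -EPROTO.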
 */ + int peer_version = cr.acr_version; + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response " + "to version %d from %pI4h: %d\n", + peer_version, &peer_ip, rc); + return -EPROTO; + } + + rc = lnet_sock_read(sock, &cr.acr_nid, + sizeof(cr) - + offsetof(struct lnet_acceptor_connreq, acr_nid), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from " + "%pI4h\n", rc, &peer_ip); + return -EIO; + } + + if (flip) + __swab64s(&cr.acr_nid); + + ni = lnet_nid2ni_addref(cr.acr_nid); + if (ni == NULL || /* no matching net */ + ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */ + if (ni != NULL) + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %pI4h " + "for %s: No matching NI\n", + &peer_ip, libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + if (ni->ni_net->net_lnd->lnd_accept == NULL) { + /* This catches a request for the loopback LND */ + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %pI4h " + "for %s: NI does not accept IP connections\n", + &peer_ip, libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + CDEBUG(D_NET, "Accept %s from %pI4h\n", + libcfs_nid2str(cr.acr_nid), &peer_ip); + + rc = ni->ni_net->net_lnd->lnd_accept(ni, sock); + + lnet_ni_decref(ni); + return rc; +} + +static int +lnet_acceptor(void *arg) +{ + struct socket *newsock; + int rc; + __u32 magic; + __u32 peer_ip; + int peer_port; + int secure = (int)((uintptr_t)arg); + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + cfs_block_allsigs(); + + rc = lnet_sock_listen(&lnet_acceptor_state.pta_sock, + 0, accept_port, accept_backlog, + lnet_acceptor_state.pta_ns); + if (rc != 0) { + if (rc == -EADDRINUSE) + LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port" + " %d: port already in use\n", + accept_port); + else + LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port " + "%d: unexpected error %d\n", + accept_port, rc); + + lnet_acceptor_state.pta_sock = NULL; + } else { + LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port); + } + + /* set init status and unblock parent */ + lnet_acceptor_state.pta_shutdown = rc; + complete(&lnet_acceptor_state.pta_signal); + + if (rc != 0) + return rc; + + while (!lnet_acceptor_state.pta_shutdown) { + + rc = lnet_sock_accept(&newsock, lnet_acceptor_state.pta_sock); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error %d: pausing...\n", rc); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + continue; + } + + /* maybe we're woken up by lnet_sock_abort_accept() */ + if (lnet_acceptor_state.pta_shutdown) { + sock_release(newsock); + break; + } + + rc = lnet_sock_getaddr(newsock, true, &peer_ip, &peer_port); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + goto failed; + } + + if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + CERROR("Refusing connection from %pI4h: " + "insecure port %d\n", &peer_ip, peer_port); + goto failed; + } + + rc = lnet_sock_read(newsock, &magic, sizeof(magic), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from " + "%pI4h\n", rc, &peer_ip); + goto failed; + } + + rc = lnet_accept(newsock, magic); + if (rc != 0) + goto failed; + + continue; + +failed: + sock_release(newsock); + } + + sock_release(lnet_acceptor_state.pta_sock); +
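 + /* the listening socket has been released; clear the stale pointer + * before signalling lnet_acceptor_stop() below */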
lnet_acceptor_state.pta_sock = NULL; + + CDEBUG(D_NET, "Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + complete(&lnet_acceptor_state.pta_signal); + return 0; +} + +static inline int +accept2secure(const char *acc, long *sec) +{ + if (!strcmp(acc, "secure")) { + *sec = 1; + return 1; + } else if (!strcmp(acc, "all")) { + *sec = 0; + return 1; + } else if (!strcmp(acc, "none")) { + return 0; + } else { + LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", + acc); + return -EINVAL; + } +} + +int +lnet_acceptor_start(void) +{ + struct task_struct *task; + int rc; + long rc2; + long secure; + + /* if acceptor is already running return immediately */ + if (!lnet_acceptor_state.pta_shutdown) + return 0; + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + rc = lnet_acceptor_get_tunables(); + if (rc != 0) + return rc; + + init_completion(&lnet_acceptor_state.pta_signal); + rc = accept2secure(accept_type, &secure); + if (rc <= 0) + return rc; + + if (lnet_count_acceptor_nets() == 0) /* not required */ + return 0; + if (current->nsproxy && current->nsproxy->net_ns) + lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; + else + lnet_acceptor_state.pta_ns = &init_net; + task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, + "acceptor_%03ld", secure); + if (IS_ERR(task)) { + rc2 = PTR_ERR(task); + CERROR("Can't start acceptor thread: %ld\n", rc2); + return -ESRCH; + } + + /* wait for acceptor to startup */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + if (!lnet_acceptor_state.pta_shutdown) { + /* started OK */ + LASSERT(lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + struct sock *sk; + + if (lnet_acceptor_state.pta_shutdown) /* not running */ + return; + + lnet_acceptor_state.pta_shutdown = 1; + + sk = lnet_acceptor_state.pta_sock->sk; + + /* awake any sleepers using safe method */ + sk->sk_state_change(sk); + + /* block until acceptor signals exit */ + wait_for_completion(&lnet_acceptor_state.pta_signal); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c new file mode 100644 index 0000000000000..24e7d7aa59cd0 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c @@ -0,0 +1,4354 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include + +#include + +#define D_LNI D_CONSOLE + +/* + * initialize ln_api_mutex statically, since it needs to be used in + * discovery_set callback. That module parameter callback can be called + * before module init completes. The mutex needs to be ready for use then. + */ +struct lnet the_lnet = { + .ln_api_mutex = __MUTEX_INITIALIZER(the_lnet.ln_api_mutex), +}; /* THE state of the network */ +EXPORT_SYMBOL(the_lnet); + +static char *ip2nets = ""; +module_param(ip2nets, charp, 0444); +MODULE_PARM_DESC(ip2nets, "LNET network <- IP table"); + +static char *networks = ""; +module_param(networks, charp, 0444); +MODULE_PARM_DESC(networks, "local networks"); + +static char *routes = ""; +module_param(routes, charp, 0444); +MODULE_PARM_DESC(routes, "routes to non-local networks"); + +static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; +module_param(rnet_htable_size, int, 0444); +MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); + +static int use_tcp_bonding = false; +module_param(use_tcp_bonding, int, 0444); +MODULE_PARM_DESC(use_tcp_bonding, + "use_tcp_bonding parameter has been deprecated"); + +unsigned int lnet_numa_range = 0; +module_param(lnet_numa_range, uint, 0444); +MODULE_PARM_DESC(lnet_numa_range, + "NUMA range to consider during Multi-Rail selection"); + +/* + * lnet_health_sensitivity determines by how much we decrement the health + * value on sending error. The value defaults to 100, which means an + * interface's health is decremented by 100 points on every failure. + */ +unsigned int lnet_health_sensitivity = 100; +static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_health_sensitivity = { + .set = sensitivity_set, + .get = param_get_int, +}; +#define param_check_health_sensitivity(name, p) \ + __param_check(name, p, int) +module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int, + &lnet_health_sensitivity, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_health_sensitivity, + "Value to decrement the health value by on error"); + +/* + * lnet_recovery_interval determines how often we should perform recovery + * on unhealthy interfaces.
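 + * The interval is expressed in seconds and defaults to 1.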
+ */ +unsigned int lnet_recovery_interval = 1; +static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_recovery_interval = { + .set = recovery_interval_set, + .get = param_get_int, +}; +#define param_check_recovery_interval(name, p) \ + __param_check(name, p, int) +module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int, + &lnet_recovery_interval, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_recovery_interval, + "Interval to recover unhealthy interfaces in seconds"); + +static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT; +static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_interfaces_max = { + .set = intf_max_set, + .get = param_get_int, +}; + +#define param_check_interfaces_max(name, p) \ + __param_check(name, p, int) + +module_param(lnet_interfaces_max, interfaces_max, 0644); +#else +module_param_call(lnet_interfaces_max, intf_max_set, param_get_int, + &lnet_interfaces_max, 0644); +#endif +MODULE_PARM_DESC(lnet_interfaces_max, + "Maximum number of interfaces in a node."); + +unsigned lnet_peer_discovery_disabled = 0; +static int discovery_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_discovery_disabled = { + .set = discovery_set, + .get = param_get_int, +}; + +#define param_check_discovery_disabled(name, p) \ + __param_check(name, p, int) +module_param(lnet_peer_discovery_disabled, discovery_disabled, 0644); +#else +module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int, + &lnet_peer_discovery_disabled, 0644); +#endif +MODULE_PARM_DESC(lnet_peer_discovery_disabled, + "Set to 1 to disable peer discovery on this node."); + +unsigned int lnet_drop_asym_route; +static int drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_drop_asym_route = { + .set = drop_asym_route_set, + .get = param_get_int, +}; + +#define param_check_drop_asym_route(name, p) \ + __param_check(name, p, int) +module_param(lnet_drop_asym_route, drop_asym_route, 0644); +#else +module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int, + &lnet_drop_asym_route, 0644); +#endif +MODULE_PARM_DESC(lnet_drop_asym_route, + "Set to 1 to drop asymmetrical route messages."); + +#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50 +#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 50 + +unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; +static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_transaction_timeout = { + .set = transaction_to_set, + .get = param_get_int, +}; + +#define param_check_transaction_timeout(name, p) \ + __param_check(name, p, int) +module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, + &lnet_transaction_timeout, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_transaction_timeout, + "Maximum number of seconds to wait for a peer response."); + +#define LNET_RETRY_COUNT_HEALTH_DEFAULT 2 +unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; +static int retry_count_set(const char 
*val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_retry_count = { + .set = retry_count_set, + .get = param_get_int, +}; + +#define param_check_retry_count(name, p) \ + __param_check(name, p, int) +module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_retry_count, retry_count_set, param_get_int, + &lnet_retry_count, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_retry_count, + "Maximum number of times to retry transmitting a message"); + + +unsigned lnet_lnd_timeout = LNET_LND_DEFAULT_TIMEOUT; + +/* + * This sequence number keeps track of how many times DLC was used to + * update the local NIs. It is incremented when a NI is added or + * removed and checked when sending a message to determine if there is + * a need to re-run the selection algorithm. See lnet_select_pathway() + * for more details on its usage. + */ +static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0); + +static int lnet_ping(struct lnet_process_id id, signed long timeout, + struct lnet_process_id __user *ids, int n_ids); + +static int lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids); + +static int +sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *sensitivity = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_health_sensitivity'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value > LNET_MAX_HEALTH_VALUE) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid health value. Maximum: %d value = %lu\n", + LNET_MAX_HEALTH_VALUE, value); + return -EINVAL; + } + + /* + * if we're turning on health then use the health timeout + * defaults. + */ + if (*sensitivity == 0 && value != 0) { + lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; + lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; + /* + * if we're turning off health then use the no health timeout + * default. + */ + } else if (*sensitivity != 0 && value == 0) { + lnet_transaction_timeout = + LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT; + lnet_retry_count = 0; + } + + *sensitivity = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *interval = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n"); + return rc; + } + + if (value < 1) { + CERROR("lnet_recovery_interval must be at least 1 second\n"); + return -EINVAL; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + *interval = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +discovery_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *discovery = (unsigned *)kp->arg; + unsigned long value; + struct lnet_ping_buffer *pbuf; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_peer_discovery_disabled'\n"); + return rc; + } + + value = (value) ? 
1 : 0; + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *discovery) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *discovery = value; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + /* tell peers that discovery setting has changed */ + lnet_net_lock(LNET_LOCK_EX); + pbuf = the_lnet.ln_ping_target; + if (value) + pbuf->pb_info.pi_features &= ~LNET_PING_FEAT_DISCOVERY; + else + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + lnet_net_unlock(LNET_LOCK_EX); + + lnet_push_update_to_peers(1); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int *drop_asym_route = (unsigned int *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for " + "'lnet_drop_asym_route'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *drop_asym_route) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *drop_asym_route = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *transaction_to = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_transaction_timeout'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value < lnet_retry_count || value == 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_transaction_timeout (%lu). " + "Has to be greater than lnet_retry_count (%u)\n", + value, lnet_retry_count); + return -EINVAL; + } + + if (value == *transaction_to) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *transaction_to = value; + if (lnet_retry_count == 0) + lnet_lnd_timeout = value; + else + lnet_lnd_timeout = value / lnet_retry_count; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *retry_count = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_retry_count'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (lnet_health_sensitivity == 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Can not set retry_count when health feature is turned off\n"); + return -EINVAL; + } + + if (value > lnet_transaction_timeout) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_retry_count (%lu). 
" + "Has to be smaller than lnet_transaction_timeout (%u)\n", + value, lnet_transaction_timeout); + return -EINVAL; + } + + *retry_count = value; + + if (value == 0) + lnet_lnd_timeout = lnet_transaction_timeout; + else + lnet_lnd_timeout = lnet_transaction_timeout / value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +intf_max_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int value, rc; + + rc = kstrtoint(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_interfaces_max'\n"); + return rc; + } + + if (value < LNET_INTERFACES_MIN) { + CWARN("max interfaces provided are too small, setting to %d\n", + LNET_INTERFACES_MAX_DEFAULT); + value = LNET_INTERFACES_MAX_DEFAULT; + } + + *(int *)kp->arg = value; + + return 0; +} + +static char * +lnet_get_routes(void) +{ + return routes; +} + +static char * +lnet_get_networks(void) +{ + char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or " + "'ip2nets' but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +static void +lnet_init_locks(void) +{ + spin_lock_init(&the_lnet.ln_eq_wait_lock); + spin_lock_init(&the_lnet.ln_msg_resend_lock); + init_waitqueue_head(&the_lnet.ln_eq_waitq); + init_waitqueue_head(&the_lnet.ln_mt_waitq); + mutex_init(&the_lnet.ln_lnd_mutex); +} + +static void +lnet_fini_locks(void) +{ +} + +struct kmem_cache *lnet_mes_cachep; /* MEs kmem_cache */ +struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes + * MDs kmem_cache */ + +static int +lnet_descriptor_setup(void) +{ + /* create specific kmem_cache for MEs and small MDs (i.e., originally + * allocated in kmem_cache). 
+ */ + lnet_mes_cachep = kmem_cache_create("lnet_MEs", sizeof(struct lnet_me), + 0, 0, NULL); + if (!lnet_mes_cachep) + return -ENOMEM; + + lnet_small_mds_cachep = kmem_cache_create("lnet_small_MDs", + LNET_SMALL_MD_SIZE, 0, 0, + NULL); + if (!lnet_small_mds_cachep) + return -ENOMEM; + + return 0; +} + +static void +lnet_descriptor_cleanup(void) +{ + + if (lnet_small_mds_cachep) { + kmem_cache_destroy(lnet_small_mds_cachep); + lnet_small_mds_cachep = NULL; + } + + if (lnet_mes_cachep) { + kmem_cache_destroy(lnet_mes_cachep); + lnet_mes_cachep = NULL; + } +} + +static int +lnet_create_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + LASSERT(the_lnet.ln_remote_nets_hash == NULL); + LASSERT(the_lnet.ln_remote_nets_hbits > 0); + LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create remote nets hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash[i]); + the_lnet.ln_remote_nets_hash = hash; + return 0; +} + +static void +lnet_destroy_remote_nets_table(void) +{ + int i; + + if (the_lnet.ln_remote_nets_hash == NULL) + return; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); + + LIBCFS_FREE(the_lnet.ln_remote_nets_hash, + LNET_REMOTE_NETS_HASH_SIZE * + sizeof(the_lnet.ln_remote_nets_hash[0])); + the_lnet.ln_remote_nets_hash = NULL; +} + +static void +lnet_destroy_locks(void) +{ + if (the_lnet.ln_res_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_res_lock); + the_lnet.ln_res_lock = NULL; + } + + if (the_lnet.ln_net_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_net_lock); + the_lnet.ln_net_lock = NULL; + } + + lnet_fini_locks(); +} + +static int +lnet_create_locks(void) +{ + lnet_init_locks(); + + the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_res_lock == NULL) + goto failed; + + the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_net_lock == NULL) + goto failed; + + return 0; + + failed: + lnet_destroy_locks(); + return -ENOMEM; +} + +static void lnet_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ + + /* Constants... 
*/ + CLASSERT(LNET_PROTO_TCP_MAGIC == 0xeebc0ded); + CLASSERT(LNET_PROTO_TCP_VERSION_MAJOR == 1); + CLASSERT(LNET_PROTO_TCP_VERSION_MINOR == 0); + CLASSERT(LNET_MSG_ACK == 0); + CLASSERT(LNET_MSG_PUT == 1); + CLASSERT(LNET_MSG_GET == 2); + CLASSERT(LNET_MSG_REPLY == 3); + CLASSERT(LNET_MSG_HELLO == 4); + + /* Checks for struct lnet_handle_wire */ + CLASSERT((int)sizeof(struct lnet_handle_wire) == 16); + CLASSERT((int)offsetof(struct lnet_handle_wire, wh_interface_cookie) == 0); + CLASSERT((int)sizeof(((struct lnet_handle_wire *)0)->wh_interface_cookie) == 8); + CLASSERT((int)offsetof(struct lnet_handle_wire, wh_object_cookie) == 8); + CLASSERT((int)sizeof(((struct lnet_handle_wire *)0)->wh_object_cookie) == 8); + + /* Checks for struct struct lnet_magicversion */ + CLASSERT((int)sizeof(struct lnet_magicversion) == 8); + CLASSERT((int)offsetof(struct lnet_magicversion, magic) == 0); + CLASSERT((int)sizeof(((struct lnet_magicversion *)0)->magic) == 4); + CLASSERT((int)offsetof(struct lnet_magicversion, version_major) == 4); + CLASSERT((int)sizeof(((struct lnet_magicversion *)0)->version_major) == 2); + CLASSERT((int)offsetof(struct lnet_magicversion, version_minor) == 6); + CLASSERT((int)sizeof(((struct lnet_magicversion *)0)->version_minor) == 2); + + /* Checks for struct struct lnet_hdr */ + CLASSERT((int)sizeof(struct lnet_hdr) == 72); + CLASSERT((int)offsetof(struct lnet_hdr, dest_nid) == 0); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->dest_nid) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, src_nid) == 8); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->src_nid) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, dest_pid) == 16); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->dest_pid) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, src_pid) == 20); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->src_pid) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, type) == 24); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->type) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, payload_length) == 28); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->payload_length) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, msg) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg) == 40); + + /* Ack */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.ack.dst_wmd) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.ack.dst_wmd) == 16); + CLASSERT((int)offsetof(struct lnet_hdr, msg.ack.match_bits) == 48); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.ack.match_bits) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.ack.mlength) == 56); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.ack.mlength) == 4); + + /* Put */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.ack_wmd) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.ack_wmd) == 16); + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.match_bits) == 48); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.match_bits) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.hdr_data) == 56); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.hdr_data) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.ptl_index) == 64); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.ptl_index) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.offset) == 68); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.offset) == 4); + + /* Get */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.get.return_wmd) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.return_wmd) == 16); + 
CLASSERT((int)offsetof(struct lnet_hdr, msg.get.match_bits) == 48); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.match_bits) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.get.ptl_index) == 56); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.ptl_index) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, msg.get.src_offset) == 60); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.src_offset) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, msg.get.sink_length) == 64); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.sink_length) == 4); + + /* Reply */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.reply.dst_wmd) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.reply.dst_wmd) == 16); + + /* Hello */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.hello.incarnation) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.incarnation) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.hello.type) == 40); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.type) == 4); + + /* Checks for struct lnet_ni_status and related constants */ + CLASSERT(LNET_NI_STATUS_INVALID == 0x00000000); + CLASSERT(LNET_NI_STATUS_UP == 0x15aac0de); + CLASSERT(LNET_NI_STATUS_DOWN == 0xdeadface); + + /* Checks for struct lnet_ni_status */ + CLASSERT((int)sizeof(struct lnet_ni_status) == 16); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_nid) == 0); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_nid) == 8); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_status) == 8); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_status) == 4); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_unused) == 12); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_unused) == 4); + + /* Checks for struct lnet_ping_info and related constants */ + CLASSERT(LNET_PROTO_PING_MAGIC == 0x70696E67); + CLASSERT(LNET_PING_FEAT_INVAL == 0); + CLASSERT(LNET_PING_FEAT_BASE == 1); + CLASSERT(LNET_PING_FEAT_NI_STATUS == 2); + CLASSERT(LNET_PING_FEAT_RTE_DISABLED == 4); + CLASSERT(LNET_PING_FEAT_MULTI_RAIL == 8); + CLASSERT(LNET_PING_FEAT_DISCOVERY == 16); + CLASSERT(LNET_PING_FEAT_BITS == 31); + + /* Checks for struct lnet_ping_info */ + CLASSERT((int)sizeof(struct lnet_ping_info) == 16); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_magic) == 0); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_magic) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_features) == 4); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_features) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_pid) == 8); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_pid) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_nnis) == 12); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_nnis) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_ni) == 16); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_ni) == 0); +} + +static struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) +{ + struct lnet_lnd *lnd; + struct list_head *tmp; + + /* holding lnd mutex */ + list_for_each(tmp, &the_lnet.ln_lnds) { + lnd = list_entry(tmp, struct lnet_lnd, lnd_list); + + if (lnd->lnd_type == type) + return lnd; + } + return NULL; +} + +unsigned int +lnet_get_lnd_timeout(void) +{ + return lnet_lnd_timeout; +} +EXPORT_SYMBOL(lnet_get_lnd_timeout); + +void +lnet_register_lnd(struct lnet_lnd *lnd) +{ + mutex_lock(&the_lnet.ln_lnd_mutex); + + LASSERT(libcfs_isknown_lnd(lnd->lnd_type)); + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL); + + 
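 + /* lnd_type is known and not registered yet, so it is safe to add the + * LND to the global list while ln_lnd_mutex is held */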
list_add_tail(&lnd->lnd_list, &the_lnet.ln_lnds); + lnd->lnd_refcount = 0; + + CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type)); + + mutex_unlock(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_register_lnd); + +void +lnet_unregister_lnd(struct lnet_lnd *lnd) +{ + mutex_lock(&the_lnet.ln_lnd_mutex); + + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd); + LASSERT(lnd->lnd_refcount == 0); + + list_del(&lnd->lnd_list); + CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type)); + + mutex_unlock(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_unregister_lnd); + +void +lnet_counters_get_common(struct lnet_counters_common *common) +{ + struct lnet_counters *ctr; + int i; + + memset(common, 0, sizeof(*common)); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + common->lcc_msgs_max += ctr->lct_common.lcc_msgs_max; + common->lcc_msgs_alloc += ctr->lct_common.lcc_msgs_alloc; + common->lcc_errors += ctr->lct_common.lcc_errors; + common->lcc_send_count += ctr->lct_common.lcc_send_count; + common->lcc_recv_count += ctr->lct_common.lcc_recv_count; + common->lcc_route_count += ctr->lct_common.lcc_route_count; + common->lcc_drop_count += ctr->lct_common.lcc_drop_count; + common->lcc_send_length += ctr->lct_common.lcc_send_length; + common->lcc_recv_length += ctr->lct_common.lcc_recv_length; + common->lcc_route_length += ctr->lct_common.lcc_route_length; + common->lcc_drop_length += ctr->lct_common.lcc_drop_length; + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get_common); + +void +lnet_counters_get(struct lnet_counters *counters) +{ + struct lnet_counters *ctr; + struct lnet_counters_health *health = &counters->lct_health; + int i; + + memset(counters, 0, sizeof(*counters)); + + lnet_counters_get_common(&counters->lct_common); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + health->lch_rst_alloc += ctr->lct_health.lch_rst_alloc; + health->lch_resend_count += ctr->lct_health.lch_resend_count; + health->lch_response_timeout_count += + ctr->lct_health.lch_response_timeout_count; + health->lch_local_interrupt_count += + ctr->lct_health.lch_local_interrupt_count; + health->lch_local_dropped_count += + ctr->lct_health.lch_local_dropped_count; + health->lch_local_aborted_count += + ctr->lct_health.lch_local_aborted_count; + health->lch_local_no_route_count += + ctr->lct_health.lch_local_no_route_count; + health->lch_local_timeout_count += + ctr->lct_health.lch_local_timeout_count; + health->lch_local_error_count += + ctr->lct_health.lch_local_error_count; + health->lch_remote_dropped_count += + ctr->lct_health.lch_remote_dropped_count; + health->lch_remote_error_count += + ctr->lct_health.lch_remote_error_count; + health->lch_remote_timeout_count += + ctr->lct_health.lch_remote_timeout_count; + health->lch_network_timeout_count += + ctr->lct_health.lch_network_timeout_count; + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get); + +void +lnet_counters_reset(void) +{ + struct lnet_counters *counters; + int i; + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) + memset(counters, 0, sizeof(struct lnet_counters)); + + lnet_net_unlock(LNET_LOCK_EX); +} + +static char * +lnet_res_type2str(int type) +{ + switch (type) { + default: + LBUG(); + case LNET_COOKIE_TYPE_MD: + return "MD"; + case LNET_COOKIE_TYPE_ME: + return "ME"; + case LNET_COOKIE_TYPE_EQ: + return "EQ"; + } +} + +static void 
+lnet_res_container_cleanup(struct lnet_res_container *rec) +{ + int count = 0; + + if (rec->rec_type == 0) /* not set yet, it's uninitialized */ + return; + + while (!list_empty(&rec->rec_active)) { + struct list_head *e = rec->rec_active.next; + + list_del_init(e); + if (rec->rec_type == LNET_COOKIE_TYPE_EQ) { + lnet_eq_free(list_entry(e, struct lnet_eq, eq_list)); + + } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) { + lnet_md_free(list_entry(e, struct lnet_libmd, md_list)); + + } else { /* NB: Active MEs should be attached on portals */ + LBUG(); + } + count++; + } + + if (count > 0) { + /* Found alive MD/ME/EQ, user really should unlink/free + * all of them before finalize LNet, but if someone didn't, + * we have to recycle garbage for him */ + CERROR("%d active elements on exit of %s container\n", + count, lnet_res_type2str(rec->rec_type)); + } + + if (rec->rec_lh_hash != NULL) { + LIBCFS_FREE(rec->rec_lh_hash, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + rec->rec_lh_hash = NULL; + } + + rec->rec_type = 0; /* mark it as finalized */ +} + +static int +lnet_res_container_setup(struct lnet_res_container *rec, int cpt, int type) +{ + int rc = 0; + int i; + + LASSERT(rec->rec_type == 0); + + rec->rec_type = type; + INIT_LIST_HEAD(&rec->rec_active); + + rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; + + /* Arbitrary choice of hash table size */ + LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + if (rec->rec_lh_hash == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < LNET_LH_HASH_SIZE; i++) + INIT_LIST_HEAD(&rec->rec_lh_hash[i]); + + return 0; + +out: + CERROR("Failed to setup %s resource container\n", + lnet_res_type2str(type)); + lnet_res_container_cleanup(rec); + return rc; +} + +static void +lnet_res_containers_destroy(struct lnet_res_container **recs) +{ + struct lnet_res_container *rec; + int i; + + cfs_percpt_for_each(rec, i, recs) + lnet_res_container_cleanup(rec); + + cfs_percpt_free(recs); +} + +static struct lnet_res_container ** +lnet_res_containers_create(int type) +{ + struct lnet_res_container **recs; + struct lnet_res_container *rec; + int rc; + int i; + + recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); + if (recs == NULL) { + CERROR("Failed to allocate %s resource containers\n", + lnet_res_type2str(type)); + return NULL; + } + + cfs_percpt_for_each(rec, i, recs) { + rc = lnet_res_container_setup(rec, i, type); + if (rc != 0) { + lnet_res_containers_destroy(recs); + return NULL; + } + } + + return recs; +} + +struct lnet_libhandle * +lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) +{ + /* ALWAYS called with lnet_res_lock held */ + struct list_head *head; + struct lnet_libhandle *lh; + unsigned int hash; + + if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) + return NULL; + + hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); + head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; + + list_for_each_entry(lh, head, lh_hash_chain) { + if (lh->lh_cookie == cookie) + return lh; + } + + return NULL; +} + +void +lnet_res_lh_initialize(struct lnet_res_container *rec, + struct lnet_libhandle *lh) +{ + /* ALWAYS called with lnet_res_lock held */ + unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; + unsigned int hash; + + lh->lh_cookie = rec->rec_lh_cookie; + rec->rec_lh_cookie += 1 << ibits; + + hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; + + list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); +} + +struct list_head ** 
+lnet_create_array_of_queues(void) +{ + struct list_head **qs; + struct list_head *q; + int i; + + qs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct list_head)); + if (!qs) { + CERROR("Failed to allocate queues\n"); + return NULL; + } + + cfs_percpt_for_each(q, i, qs) + INIT_LIST_HEAD(q); + + return qs; +} + +static int lnet_unprepare(void); + +static int +lnet_prepare(lnet_pid_t requested_pid) +{ + /* Prepare to bring up the network */ + struct lnet_res_container **recs; + int rc = 0; + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET just for me */ + return -ENETDOWN; + } + + LASSERT(the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + + LASSERT((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; + + INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_remote_peer_ni_list); + INIT_LIST_HEAD(&the_lnet.ln_nets); + INIT_LIST_HEAD(&the_lnet.ln_routers); + INIT_LIST_HEAD(&the_lnet.ln_drop_rules); + INIT_LIST_HEAD(&the_lnet.ln_delay_rules); + INIT_LIST_HEAD(&the_lnet.ln_dc_request); + INIT_LIST_HEAD(&the_lnet.ln_dc_working); + INIT_LIST_HEAD(&the_lnet.ln_dc_expired); + INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq); + INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq); + init_waitqueue_head(&the_lnet.ln_dc_waitq); + LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + + rc = lnet_descriptor_setup(); + if (rc != 0) + goto failed; + + rc = lnet_create_remote_nets_table(); + if (rc != 0) + goto failed; + + /* + * NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid after reboot. + */ + the_lnet.ln_interface_cookie = ktime_get_real_ns(); + + the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_counters)); + if (the_lnet.ln_counters == NULL) { + CERROR("Failed to allocate counters for LNet\n"); + rc = -ENOMEM; + goto failed; + } + + rc = lnet_peer_tables_create(); + if (rc != 0) + goto failed; + + rc = lnet_msg_containers_create(); + if (rc != 0) + goto failed; + + rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, + LNET_COOKIE_TYPE_EQ); + if (rc != 0) + goto failed; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_me_containers = recs; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_md_containers = recs; + + rc = lnet_portals_create(); + if (rc != 0) { + CERROR("Failed to create portals for LNet: %d\n", rc); + goto failed; + } + + the_lnet.ln_mt_zombie_rstqs = lnet_create_array_of_queues(); + if (!the_lnet.ln_mt_zombie_rstqs) { + rc = -ENOMEM; + goto failed; + } + + return 0; + + failed: + lnet_unprepare(); + return rc; +} + +static int +lnet_unprepare (void) +{ + int rc; + + /* NB no LNET_LOCK since this is the last reference. 
All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ + + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_test_peers)); + LASSERT(list_empty(&the_lnet.ln_nets)); + + if (the_lnet.ln_mt_zombie_rstqs) { + lnet_clean_zombie_rstqs(); + the_lnet.ln_mt_zombie_rstqs = NULL; + } + + if (!LNetEQHandleIsInvalid(the_lnet.ln_mt_eqh)) { + rc = LNetEQFree(the_lnet.ln_mt_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + LASSERT(rc == 0); + } + + lnet_portals_destroy(); + + if (the_lnet.ln_md_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_md_containers); + the_lnet.ln_md_containers = NULL; + } + + if (the_lnet.ln_me_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_me_containers); + the_lnet.ln_me_containers = NULL; + } + + lnet_res_container_cleanup(&the_lnet.ln_eq_container); + + lnet_msg_containers_destroy(); + lnet_peer_uninit(); + lnet_rtrpools_free(0); + + if (the_lnet.ln_counters != NULL) { + cfs_percpt_free(the_lnet.ln_counters); + the_lnet.ln_counters = NULL; + } + lnet_destroy_remote_nets_table(); + lnet_descriptor_cleanup(); + + return 0; +} + +struct lnet_ni * +lnet_net2ni_locked(__u32 net_id, int cpt) +{ + struct lnet_ni *ni; + struct lnet_net *net; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_id == net_id) { + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_net2ni_addref(__u32 net) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_net2ni_locked(net, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_net2ni_addref); + +struct lnet_net * +lnet_get_net_locked(__u32 net_id) +{ + struct lnet_net *net; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_id == net_id) + return net; + } + + return NULL; +} + +unsigned int +lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number) +{ + __u64 key = nid; + unsigned int val; + + LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); + + if (number == 1) + return 0; + + val = hash_long(key, LNET_CPT_BITS); + /* NB: LNET_CP_NUMBER doesn't have to be PO2 */ + if (val < number) + return val; + + return (unsigned int)(key + val + (val >> 1)) % number; +} + +int +lnet_cpt_of_nid_locked(lnet_nid_t nid, struct lnet_ni *ni) +{ + struct lnet_net *net; + + /* must called with hold of lnet_net_lock */ + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + /* + * If NI is provided then use the CPT identified in the NI cpt + * list if one exists. If one doesn't exist, then that NI is + * associated with all CPTs and it follows that the net it belongs + * to is implicitly associated with all CPTs, so just hash the nid + * and return that. 
+ */ + if (ni != NULL) { + if (ni->ni_cpts != NULL) + return ni->ni_cpts[lnet_nid_cpt_hash(nid, + ni->ni_ncpts)]; + else + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + } + + /* no NI provided so look at the net */ + net = lnet_get_net_locked(LNET_NIDNET(nid)); + + if (net != NULL && net->net_cpts != NULL) { + return net->net_cpts[lnet_nid_cpt_hash(nid, net->net_ncpts)]; + } + + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); +} + +int +lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni) +{ + int cpt; + int cpt2; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + cpt = lnet_net_lock_current(); + + cpt2 = lnet_cpt_of_nid_locked(nid, ni); + + lnet_net_unlock(cpt); + + return cpt2; +} +EXPORT_SYMBOL(lnet_cpt_of_nid); + +int +lnet_islocalnet(__u32 net_id) +{ + struct lnet_net *net; + int cpt; + bool local; + + cpt = lnet_net_lock_current(); + + net = lnet_get_net_locked(net_id); + + local = net != NULL; + + lnet_net_unlock(cpt); + + return local; +} + +struct lnet_ni * +lnet_nid2ni_locked(lnet_nid_t nid, int cpt) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid) + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_nid2ni_addref(lnet_nid_t nid) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid2ni_locked(nid, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_nid2ni_addref); + +int +lnet_islocalnid(lnet_nid_t nid) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + lnet_net_unlock(cpt); + + return ni != NULL; +} + +int +lnet_count_acceptor_nets(void) +{ + /* Return the # of NIs that need the acceptor. 
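 + * An NI needs the acceptor when its LND provides an lnd_accept + * callback (i.e. socklnd-type networks).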
*/ + int count = 0; + struct lnet_net *net; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + /* all socklnd type networks should have the acceptor + * thread started */ + if (net->net_lnd->lnd_accept != NULL) + count++; + } + + lnet_net_unlock(cpt); + + return count; +} + +struct lnet_ping_buffer * +lnet_ping_buffer_alloc(int nnis, gfp_t gfp) +{ + struct lnet_ping_buffer *pbuf; + + LIBCFS_ALLOC_GFP(pbuf, LNET_PING_BUFFER_SIZE(nnis), gfp); + if (pbuf) { + pbuf->pb_nnis = nnis; + atomic_set(&pbuf->pb_refcnt, 1); + } + + return pbuf; +} + +void +lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf) +{ + LASSERT(lnet_ping_buffer_numref(pbuf) == 0); + LIBCFS_FREE(pbuf, LNET_PING_BUFFER_SIZE(pbuf->pb_nnis)); +} + +static struct lnet_ping_buffer * +lnet_ping_target_create(int nnis) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (pbuf == NULL) { + CERROR("Can't allocate ping source [%d]\n", nnis); + return NULL; + } + + pbuf->pb_info.pi_nnis = nnis; + pbuf->pb_info.pi_pid = the_lnet.ln_pid; + pbuf->pb_info.pi_magic = LNET_PROTO_PING_MAGIC; + pbuf->pb_info.pi_features = + LNET_PING_FEAT_NI_STATUS | LNET_PING_FEAT_MULTI_RAIL; + + return pbuf; +} + +static inline int +lnet_get_net_ni_count_locked(struct lnet_net *net) +{ + struct lnet_ni *ni; + int count = 0; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + count++; + + return count; +} + +static inline int +lnet_get_net_ni_count_pre(struct lnet_net *net) +{ + struct lnet_ni *ni; + int count = 0; + + list_for_each_entry(ni, &net->net_ni_added, ni_netlist) + count++; + + return count; +} + +static inline int +lnet_get_ni_count(void) +{ + struct lnet_ni *ni; + struct lnet_net *net; + int count = 0; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + count++; + } + + lnet_net_unlock(0); + + return count; +} + +int +lnet_ping_info_validate(struct lnet_ping_info *pinfo) +{ + if (!pinfo) + return -EINVAL; + if (pinfo->pi_magic != LNET_PROTO_PING_MAGIC) + return -EPROTO; + if (!(pinfo->pi_features & LNET_PING_FEAT_NI_STATUS)) + return -EPROTO; + /* Loopback is guaranteed to be present */ + if (pinfo->pi_nnis < 1 || pinfo->pi_nnis > lnet_interfaces_max) + return -ERANGE; + if (LNET_PING_INFO_LONI(pinfo) != LNET_NID_LO_0) + return -EPROTO; + return 0; +} + +static void +lnet_ping_target_destroy(void) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + lnet_ni_lock(ni); + ni->ni_status = NULL; + lnet_ni_unlock(ni); + } + } + + lnet_ping_buffer_decref(the_lnet.ln_ping_target); + the_lnet.ln_ping_target = NULL; + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_ping_target_event_handler(struct lnet_event *event) +{ + struct lnet_ping_buffer *pbuf = event->md.user_ptr; + + if (event->unlinked) + lnet_ping_buffer_decref(pbuf); +} + +static int +lnet_ping_target_setup(struct lnet_ping_buffer **ppbuf, + struct lnet_handle_md *ping_mdh, + int ni_count, bool set_eq) +{ + struct lnet_process_id id = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY + }; + struct lnet_handle_me me_handle; + struct lnet_md md = { NULL }; + int rc, rc2; + + if (set_eq) { + rc = LNetEQAlloc(0, lnet_ping_target_event_handler, + &the_lnet.ln_ping_target_eq); + if (rc != 0) { + CERROR("Can't allocate ping buffer EQ: %d\n", rc); + return rc; 
+ } + } + + *ppbuf = lnet_ping_target_create(ni_count); + if (*ppbuf == NULL) { + rc = -ENOMEM; + goto fail_free_eq; + } + + /* Ping target ME/MD */ + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &me_handle); + if (rc != 0) { + CERROR("Can't create ping target ME: %d\n", rc); + goto fail_decref_ping_buffer; + } + + /* initialize md content */ + md.start = &(*ppbuf)->pb_info; + md.length = LNET_PING_INFO_SIZE((*ppbuf)->pb_nnis); + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.eq_handle = the_lnet.ln_ping_target_eq; + md.user_ptr = *ppbuf; + + rc = LNetMDAttach(me_handle, md, LNET_RETAIN, ping_mdh); + if (rc != 0) { + CERROR("Can't attach ping target MD: %d\n", rc); + goto fail_unlink_ping_me; + } + lnet_ping_buffer_addref(*ppbuf); + + return 0; + +fail_unlink_ping_me: + rc2 = LNetMEUnlink(me_handle); + LASSERT(rc2 == 0); +fail_decref_ping_buffer: + LASSERT(lnet_ping_buffer_numref(*ppbuf) == 1); + lnet_ping_buffer_decref(*ppbuf); + *ppbuf = NULL; +fail_free_eq: + if (set_eq) { + rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc2 == 0); + } + return rc; +} + +static void +lnet_ping_md_unlink(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *ping_mdh) +{ + sigset_t blocked = cfs_block_allsigs(); + + LNetMDUnlink(*ping_mdh); + LNetInvalidateMDHandle(ping_mdh); + + /* NB the MD could be busy; this just starts the unlink */ + while (lnet_ping_buffer_numref(pbuf) > 1) { + CDEBUG(D_NET, "Still waiting for ping data MD to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + cfs_restore_sigs(blocked); +} + +static void +lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) +{ + struct lnet_ni *ni; + struct lnet_net *net; + struct lnet_ni_status *ns; + int i; + int rc; + + i = 0; + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + LASSERT(i < pbuf->pb_nnis); + + ns = &pbuf->pb_info.pi_ni[i]; + + ns->ns_nid = ni->ni_nid; + + lnet_ni_lock(ni); + ns->ns_status = (ni->ni_status != NULL) ? + ni->ni_status->ns_status : + LNET_NI_STATUS_UP; + ni->ni_status = ns; + lnet_ni_unlock(ni); + + i++; + } + } + /* + * We (ab)use the ns_status of the loopback interface to + * transmit the sequence number. The first interface listed + * must be the loopback interface. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + LCONSOLE_EMERG("Invalid ping target: %d\n", rc); + LBUG(); + } + LNET_PING_BUFFER_SEQNO(pbuf) = + atomic_inc_return(&the_lnet.ln_ping_target_seqno); +} + +static void +lnet_ping_target_update(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md ping_mdh) +{ + struct lnet_ping_buffer *old_pbuf = NULL; + struct lnet_handle_md old_ping_md; + + /* switch the NIs to point to the new ping info created */ + lnet_net_lock(LNET_LOCK_EX); + + if (!the_lnet.ln_routing) + pbuf->pb_info.pi_features |= LNET_PING_FEAT_RTE_DISABLED; + if (!lnet_peer_discovery_disabled) + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + + /* Ensure only known feature bits have been set. 
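 + * The checks below require at least one known bit to be set and no + * bits outside LNET_PING_FEAT_BITS to be present.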
*/ + LASSERT(pbuf->pb_info.pi_features & LNET_PING_FEAT_BITS); + LASSERT(!(pbuf->pb_info.pi_features & ~LNET_PING_FEAT_BITS)); + + lnet_ping_target_install_locked(pbuf); + + if (the_lnet.ln_ping_target) { + old_pbuf = the_lnet.ln_ping_target; + old_ping_md = the_lnet.ln_ping_target_md; + } + the_lnet.ln_ping_target_md = ping_mdh; + the_lnet.ln_ping_target = pbuf; + + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pbuf) { + /* unlink and free the old ping info */ + lnet_ping_md_unlink(old_pbuf, &old_ping_md); + lnet_ping_buffer_decref(old_pbuf); + } + + lnet_push_update_to_peers(0); +} + +static void +lnet_ping_target_fini(void) +{ + int rc; + + lnet_ping_md_unlink(the_lnet.ln_ping_target, + &the_lnet.ln_ping_target_md); + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc == 0); + + lnet_ping_target_destroy(); +} + +/* Resize the push target. */ +int lnet_push_target_resize(void) +{ + struct lnet_process_id id = { LNET_NID_ANY, LNET_PID_ANY }; + struct lnet_md md = { NULL }; + struct lnet_handle_me meh; + struct lnet_handle_md mdh; + struct lnet_handle_md old_mdh; + struct lnet_ping_buffer *pbuf; + struct lnet_ping_buffer *old_pbuf; + int nnis = the_lnet.ln_push_target_nnis; + int rc; + + if (nnis <= 0) { + rc = -EINVAL; + goto fail_return; + } +again: + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + rc = -ENOMEM; + goto fail_return; + } + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc) { + CERROR("Can't create push target ME: %d\n", rc); + goto fail_decref_pbuf; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(nnis); + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_PUT | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.user_ptr = pbuf; + md.eq_handle = the_lnet.ln_push_target_eq; + + rc = LNetMDAttach(meh, md, LNET_RETAIN, &mdh); + if (rc) { + CERROR("Can't attach push MD: %d\n", rc); + goto fail_unlink_meh; + } + lnet_ping_buffer_addref(pbuf); + + lnet_net_lock(LNET_LOCK_EX); + old_pbuf = the_lnet.ln_push_target; + old_mdh = the_lnet.ln_push_target_md; + the_lnet.ln_push_target = pbuf; + the_lnet.ln_push_target_md = mdh; + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pbuf) { + LNetMDUnlink(old_mdh); + lnet_ping_buffer_decref(old_pbuf); + } + + if (nnis < the_lnet.ln_push_target_nnis) + goto again; + + CDEBUG(D_NET, "nnis %d success\n", nnis); + + return 0; + +fail_unlink_meh: + LNetMEUnlink(meh); +fail_decref_pbuf: + lnet_ping_buffer_decref(pbuf); +fail_return: + CDEBUG(D_NET, "nnis %d error %d\n", nnis, rc); + return rc; +} + +static void lnet_push_target_event_handler(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf = ev->md.user_ptr; + + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + lnet_peer_push_event(ev); + if (ev->unlinked) + lnet_ping_buffer_decref(pbuf); +} + +/* Initialize the push target. */ +static int lnet_push_target_init(void) +{ + int rc; + + if (the_lnet.ln_push_target) + return -EALREADY; + + rc = LNetEQAlloc(0, lnet_push_target_event_handler, + &the_lnet.ln_push_target_eq); + if (rc) { + CERROR("Can't allocated push target EQ: %d\n", rc); + return rc; + } + + /* Start at the required minimum, we'll enlarge if required. 
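 + * lnet_push_target_resize() below keeps re-allocating the buffer until + * it is large enough for ln_push_target_nnis entries.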
*/ + the_lnet.ln_push_target_nnis = LNET_INTERFACES_MIN; + + rc = lnet_push_target_resize(); + + if (rc) { + LNetEQFree(the_lnet.ln_push_target_eq); + LNetInvalidateEQHandle(&the_lnet.ln_push_target_eq); + } + + return rc; +} + +/* Clean up the push target. */ +static void lnet_push_target_fini(void) +{ + if (!the_lnet.ln_push_target) + return; + + /* Unlink and invalidate to prevent new references. */ + LNetMDUnlink(the_lnet.ln_push_target_md); + LNetInvalidateMDHandle(&the_lnet.ln_push_target_md); + + /* Wait for the unlink to complete. */ + while (lnet_ping_buffer_numref(the_lnet.ln_push_target) > 1) { + CDEBUG(D_NET, "Still waiting for ping data MD to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + lnet_ping_buffer_decref(the_lnet.ln_push_target); + the_lnet.ln_push_target = NULL; + the_lnet.ln_push_target_nnis = 0; + + LNetEQFree(the_lnet.ln_push_target_eq); + LNetInvalidateEQHandle(&the_lnet.ln_push_target_eq); +} + +static int +lnet_ni_tq_credits(struct lnet_ni *ni) +{ + int credits; + + LASSERT(ni->ni_ncpts >= 1); + + if (ni->ni_ncpts == 1) + return ni->ni_net->net_tunables.lct_max_tx_credits; + + credits = ni->ni_net->net_tunables.lct_max_tx_credits / ni->ni_ncpts; + credits = max(credits, 8 * ni->ni_net->net_tunables.lct_peer_tx_credits); + credits = min(credits, ni->ni_net->net_tunables.lct_max_tx_credits); + + return credits; +} + +static void +lnet_ni_unlink_locked(struct lnet_ni *ni) +{ + /* move it to zombie list and nobody can find it anymore */ + LASSERT(!list_empty(&ni->ni_netlist)); + list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); + lnet_ni_decref_locked(ni, 0); +} + +static void +lnet_clear_zombies_nis_locked(struct lnet_net *net) +{ + int i; + int islo; + struct lnet_ni *ni; + struct list_head *zombie_list = &net->net_ni_zombie; + + /* + * Now wait for the NIs I just nuked to show up on the zombie + * list and shut them down in guaranteed thread context + */ + i = 2; + while (!list_empty(zombie_list)) { + int *ref; + int j; + + ni = list_entry(zombie_list->next, + struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + /* the ni should be in deleting state. 
If it's not it's + * a bug */ + LASSERT(ni->ni_state == LNET_NI_STATE_DELETING); + cfs_percpt_for_each(ref, j, ni->ni_refs) { + if (*ref == 0) + continue; + /* still busy, add it back to zombie list */ + list_add(&ni->ni_netlist, zombie_list); + break; + } + + if (!list_empty(&ni->ni_netlist)) { + /* Unlock mutex while waiting to allow other + * threads to read the LNet state and fall through + * to avoid deadlock + */ + lnet_net_unlock(LNET_LOCK_EX); + mutex_unlock(&the_lnet.ln_api_mutex); + + ++i; + if ((i & (-i)) == i) { + CDEBUG(D_WARNING, + "Waiting for zombie LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + mutex_lock(&the_lnet.ln_api_mutex); + lnet_net_lock(LNET_LOCK_EX); + continue; + } + + lnet_net_unlock(LNET_LOCK_EX); + + islo = ni->ni_net->net_lnd->lnd_type == LOLND; + + LASSERT(!in_interrupt()); + (net->net_lnd->lnd_shutdown)(ni); + + if (!islo) + CDEBUG(D_LNI, "Removed LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_free(ni); + i = 2; + lnet_net_lock(LNET_LOCK_EX); + } +} + +/* shutdown down the NI and release refcount */ +static void +lnet_shutdown_lndni(struct lnet_ni *ni) +{ + int i; + struct lnet_net *net = ni->ni_net; + + lnet_net_lock(LNET_LOCK_EX); + lnet_ni_lock(ni); + ni->ni_state = LNET_NI_STATE_DELETING; + lnet_ni_unlock(ni); + lnet_ni_unlink_locked(ni); + lnet_incr_dlc_seq(); + lnet_net_unlock(LNET_LOCK_EX); + + /* clear messages for this NI on the lazy portal */ + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_clear_lazy_portal(ni, i, "Shutting down NI"); + + lnet_net_lock(LNET_LOCK_EX); + lnet_clear_zombies_nis_locked(net); + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_shutdown_lndnet(struct lnet_net *net) +{ + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + + net->net_state = LNET_NET_STATE_DELETING; + + list_del_init(&net->net_list); + + while (!list_empty(&net->net_ni_list)) { + ni = list_entry(net->net_ni_list.next, + struct lnet_ni, ni_netlist); + lnet_net_unlock(LNET_LOCK_EX); + lnet_shutdown_lndni(ni); + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); + + /* Do peer table cleanup for this net */ + lnet_peer_tables_cleanup(net); + + lnet_net_lock(LNET_LOCK_EX); + /* + * decrement ref count on lnd only when the entire network goes + * away + */ + net->net_lnd->lnd_refcount--; + + lnet_net_unlock(LNET_LOCK_EX); + + lnet_net_free(net); +} + +static void +lnet_shutdown_lndnets(void) +{ + struct lnet_net *net; + struct list_head resend; + struct lnet_msg *msg, *tmp; + + INIT_LIST_HEAD(&resend); + + /* NB called holding the global mutex */ + + /* All quiet on the API front */ + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + LASSERT(the_lnet.ln_refcount == 0); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_STOPPING; + + while (!list_empty(&the_lnet.ln_nets)) { + /* + * move the nets to the zombie list to avoid them being + * picked up for new work. LONET is also included in the + * Nets that will be moved to the zombie list + */ + net = list_entry(the_lnet.ln_nets.next, + struct lnet_net, net_list); + list_move(&net->net_list, &the_lnet.ln_net_zombie); + } + + /* Drop the cached loopback Net. 
*/ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni, 0); + the_lnet.ln_loni = NULL; + } + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through the net zombie list and delete each net */ + while (!list_empty(&the_lnet.ln_net_zombie)) { + net = list_entry(the_lnet.ln_net_zombie.next, + struct lnet_net, net_list); + lnet_shutdown_lndnet(net); + } + + spin_lock(&the_lnet.ln_msg_resend_lock); + list_splice(&the_lnet.ln_msg_resend, &resend); + spin_unlock(&the_lnet.ln_msg_resend_lock); + + list_for_each_entry_safe(msg, tmp, &resend, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) +{ + int rc = -EINVAL; + struct lnet_tx_queue *tq; + int i; + struct lnet_net *net = ni->ni_net; + + mutex_lock(&the_lnet.ln_lnd_mutex); + + if (tun) { + memcpy(&ni->ni_lnd_tunables, tun, sizeof(*tun)); + ni->ni_lnd_tunables_set = true; + } + + rc = (net->net_lnd->lnd_startup)(ni); + + mutex_unlock(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n", + rc, libcfs_lnd2str(net->net_lnd->lnd_type)); + lnet_net_lock(LNET_LOCK_EX); + net->net_lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + goto failed0; + } + + lnet_ni_lock(ni); + ni->ni_state = LNET_NI_STATE_ACTIVE; + lnet_ni_unlock(ni); + + /* We keep a reference on the loopback net through the loopback NI */ + if (net->net_lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT(the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + ni->ni_net->net_tunables.lct_peer_tx_credits = 0; + ni->ni_net->net_tunables.lct_peer_rtr_credits = 0; + ni->ni_net->net_tunables.lct_max_tx_credits = 0; + ni->ni_net->net_tunables.lct_peer_timeout = 0; + return 0; + } + + if (ni->ni_net->net_tunables.lct_peer_tx_credits == 0 || + ni->ni_net->net_tunables.lct_max_tx_credits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(net->net_lnd->lnd_type), + ni->ni_net->net_tunables.lct_peer_tx_credits == 0 ? + "" : "per-peer "); + /* shutdown the NI since if we get here then it must've already + * been started + */ + lnet_shutdown_lndni(ni); + return -EINVAL; + } + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + atomic_set(&ni->ni_tx_credits, + lnet_ni_tq_credits(ni) * ni->ni_ncpts); + atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE); + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nid2str(ni->ni_nid), + ni->ni_net->net_tunables.lct_peer_tx_credits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_net->net_tunables.lct_peer_rtr_credits, + ni->ni_net->net_tunables.lct_peer_timeout); + + return 0; +failed0: + lnet_ni_free(ni); + return rc; +} + +static int +lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) +{ + struct lnet_ni *ni; + struct lnet_net *net_l = NULL; + struct list_head local_ni_list; + int rc; + int ni_count = 0; + __u32 lnd_type; + struct lnet_lnd *lnd; + int peer_timeout = + net->net_tunables.lct_peer_timeout; + int maxtxcredits = + net->net_tunables.lct_max_tx_credits; + int peerrtrcredits = + net->net_tunables.lct_peer_rtr_credits; + + INIT_LIST_HEAD(&local_ni_list); + + /* + * make sure that this net is unique. 
If it isn't then
+ * we are adding interfaces to an already existing network, and
+ * 'net' is just a convenient way to pass in the list.
+ * If it is unique we need to find the LND and load it if
+ * necessary.
+ */
+ if (lnet_net_unique(net->net_id, &the_lnet.ln_nets, &net_l)) {
+ lnd_type = LNET_NETTYP(net->net_id);
+
+ mutex_lock(&the_lnet.ln_lnd_mutex);
+ lnd = lnet_find_lnd_by_type(lnd_type);
+
+ if (lnd == NULL) {
+ mutex_unlock(&the_lnet.ln_lnd_mutex);
+ rc = request_module("%s", libcfs_lnd2modname(lnd_type));
+ mutex_lock(&the_lnet.ln_lnd_mutex);
+
+ lnd = lnet_find_lnd_by_type(lnd_type);
+ if (lnd == NULL) {
+ mutex_unlock(&the_lnet.ln_lnd_mutex);
+ CERROR("Can't load LND %s, module %s, rc=%d\n",
+ libcfs_lnd2str(lnd_type),
+ libcfs_lnd2modname(lnd_type), rc);
+#ifndef HAVE_MODULE_LOADING_SUPPORT
+ LCONSOLE_ERROR_MSG(0x104, "Your kernel must be "
+ "compiled with kernel module "
+ "loading support.");
+#endif
+ rc = -EINVAL;
+ goto failed0;
+ }
+ }
+
+ lnet_net_lock(LNET_LOCK_EX);
+ lnd->lnd_refcount++;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ net->net_lnd = lnd;
+
+ mutex_unlock(&the_lnet.ln_lnd_mutex);
+
+ net_l = net;
+ }
+
+ /*
+ * net_l: if the network being added is unique then net_l
+ * will point to that network;
+ * if the network being added is not unique then
+ * net_l points to the existing network.
+ *
+ * When we enter the loop below, we'll pick NIs off the
+ * network being added and start them up, then add them to
+ * a local ni list. Once we've successfully started all
+ * the NIs then we join the local NI list (of started up
+ * networks) with the net_l->net_ni_list, which should
+ * point to the correct network to add the new ni list to.
+ *
+ * If any of the new NIs fail to start up, then we want to
+ * iterate through the local ni list, which should include
+ * any NIs which were successfully started up, and shut
+ * them down.
+ *
+ * After that we want to delete the network being added,
+ * to avoid a memory leak.
+ */
+
+ /*
+ * When a network uses TCP bonding then all its interfaces
+ * must be specified when the network is first defined: the
+ * TCP bonding code doesn't allow for interfaces to be added
+ * or removed.
+ */
+ if (net_l != net && net_l != NULL && use_tcp_bonding &&
+ LNET_NETTYP(net_l->net_id) == SOCKLND) {
+ rc = -EINVAL;
+ goto failed0;
+ }
+
+ while (!list_empty(&net->net_ni_added)) {
+ ni = list_entry(net->net_ni_added.next, struct lnet_ni,
+ ni_netlist);
+ list_del_init(&ni->ni_netlist);
+
+ /* make sure that the NI we're about to start
+ * up is actually unique. If it's not, fail. */
+ if (!lnet_ni_unique_net(&net_l->net_ni_list,
+ ni->ni_interfaces[0])) {
+ rc = -EEXIST;
+ goto failed1;
+ }
+
+ /* adjust the parent network pointer, just in case
+ * the net is a duplicate */
+ ni->ni_net = net_l;
+
+ rc = lnet_startup_lndni(ni, tun);
+
+ LASSERT(ni->ni_net->net_tunables.lct_peer_timeout <= 0 ||
+ ni->ni_net->net_lnd->lnd_query != NULL);
+
+ if (rc < 0)
+ goto failed1;
+
+ lnet_ni_addref(ni);
+ list_add_tail(&ni->ni_netlist, &local_ni_list);
+
+ ni_count++;
+ }
+
+ lnet_net_lock(LNET_LOCK_EX);
+ list_splice_tail(&local_ni_list, &net_l->net_ni_list);
+ lnet_incr_dlc_seq();
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ /* if the network is not unique then we don't want to keep
+ * it around after we're done. Free it. Otherwise add that
+ * net to the global the_lnet.ln_nets */
+ if (net_l != net && net_l != NULL) {
+ /*
+ * TODO - note.
currently the tunables can not be updated + * once added + */ + lnet_net_free(net); + } else { + net->net_state = LNET_NET_STATE_ACTIVE; + /* + * restore tunables after it has been overwitten by the + * lnd + */ + if (peer_timeout != -1) + net->net_tunables.lct_peer_timeout = peer_timeout; + if (maxtxcredits != -1) + net->net_tunables.lct_max_tx_credits = maxtxcredits; + if (peerrtrcredits != -1) + net->net_tunables.lct_peer_rtr_credits = peerrtrcredits; + + lnet_net_lock(LNET_LOCK_EX); + list_add_tail(&net->net_list, &the_lnet.ln_nets); + lnet_net_unlock(LNET_LOCK_EX); + } + + return ni_count; + +failed1: + /* + * shutdown the new NIs that are being started up + * free the NET being started + */ + while (!list_empty(&local_ni_list)) { + ni = list_entry(local_ni_list.next, struct lnet_ni, + ni_netlist); + + lnet_shutdown_lndni(ni); + } + +failed0: + lnet_net_free(net); + + return rc; +} + +static int +lnet_startup_lndnets(struct list_head *netlist) +{ + struct lnet_net *net; + int rc; + int ni_count = 0; + + /* + * Change to running state before bringing up the LNDs. This + * allows lnet_shutdown_lndnets() to assert that we've passed + * through here. + */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(netlist)) { + net = list_entry(netlist->next, struct lnet_net, net_list); + list_del_init(&net->net_list); + + rc = lnet_startup_lndnet(net, NULL); + + if (rc < 0) + goto failed; + + ni_count += rc; + } + + return ni_count; +failed: + lnet_shutdown_lndnets(); + + return rc; +} + +/** + * Initialize LNet library. + * + * Automatically called at module loading time. Caller has to call + * lnet_lib_exit() after a call to lnet_lib_init(), if and only if the + * latter returned 0. It must be called exactly once. + * + * \retval 0 on success + * \retval -ve on failures. + */ +int lnet_lib_init(void) +{ + int rc; + + lnet_assert_wire_constants(); + + /* refer to global cfs_cpt_table for now */ + the_lnet.ln_cpt_table = cfs_cpt_table; + the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); + + LASSERT(the_lnet.ln_cpt_number > 0); + if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { + /* we are under risk of consuming all lh_cookie */ + CERROR("Can't have %d CPTs for LNet (max allowed is %d), " + "please change setting of CPT-table and retry\n", + the_lnet.ln_cpt_number, LNET_CPT_MAX); + return -E2BIG; + } + + while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) + the_lnet.ln_cpt_bits++; + + rc = lnet_create_locks(); + if (rc != 0) { + CERROR("Can't create LNet global locks: %d\n", rc); + return rc; + } + + the_lnet.ln_refcount = 0; + LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh); + INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_net_zombie); + INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); + INIT_LIST_HEAD(&the_lnet.ln_msg_resend); + INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); + + /* The hash table size is the number of bits it takes to express the set + * ln_num_routes, minus 1 (better to under estimate than over so we + * don't waste memory). */ + if (rnet_htable_size <= 0) + rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; + else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) + rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; + the_lnet.ln_remote_nets_hbits = max_t(int, 1, + order_base_2(rnet_htable_size) - 1); + + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. 
*/ + lnet_register_lnd(&the_lolnd); + return 0; +} + +/** + * Finalize LNet library. + * + * \pre lnet_lib_init() called with success. + * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. + */ +void lnet_lib_exit(void) +{ + LASSERT(the_lnet.ln_refcount == 0); + + while (!list_empty(&the_lnet.ln_lnds)) + lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next, + struct lnet_lnd, lnd_list)); + lnet_destroy_locks(); +} + +/** + * Set LNet PID and start LNet interfaces, routing, and forwarding. + * + * Users must call this function at least once before any other functions. + * For each successful call there must be a corresponding call to + * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is + * ignored. + * + * The PID used by LNet may be different from the one requested. + * See LNetGetId(). + * + * \param requested_pid PID requested by the caller. + * + * \return >= 0 on success, and < 0 error code on failures. + */ +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + int ni_count; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + struct list_head net_head; + struct lnet_net *net; + + INIT_LIST_HEAD(&net_head); + + mutex_lock(&the_lnet.ln_api_mutex); + + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + /* create a network for Loopback network */ + net = lnet_net_alloc(LNET_MKNET(LOLND, 0), &net_head); + if (net == NULL) { + rc = -ENOMEM; + goto err_empty_list; + } + + /* Add in the loopback NI */ + if (lnet_ni_alloc(net, NULL, NULL) == NULL) { + rc = -ENOMEM; + goto err_empty_list; + } + + if (use_tcp_bonding) + CWARN("'use_tcp_bonding' option has been deprecated. See LU-13641\n"); + + /* If LNet is being initialized via DLC it is possible + * that the user requests not to load module parameters (ones which + * are supported by DLC) on initialization. Therefore, make sure not + * to load networks, routes and forwarding from module parameters + * in this case. On cleanup in case of failure only clean up + * routes if it has been loaded */ + if (!the_lnet.ln_nis_from_mod_params) { + rc = lnet_parse_networks(&net_head, lnet_get_networks(), + use_tcp_bonding); + if (rc < 0) + goto err_empty_list; + } + + ni_count = lnet_startup_lndnets(&net_head); + if (ni_count < 0) { + rc = ni_count; + goto err_empty_list; + } + + if (!the_lnet.ln_nis_from_mod_params) { + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto err_shutdown_lndnis; + + rc = lnet_check_routes(); + if (rc != 0) + goto err_destroy_routes; + + rc = lnet_rtrpools_alloc(im_a_router); + if (rc != 0) + goto err_destroy_routes; + } + + rc = lnet_acceptor_start(); + if (rc != 0) + goto err_destroy_routes; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... 
*/ + + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, ni_count, true); + if (rc != 0) + goto err_acceptor_stop; + + lnet_ping_target_update(pbuf, ping_mdh); + + rc = LNetEQAlloc(0, lnet_mt_event_handler, &the_lnet.ln_mt_eqh); + if (rc != 0) { + CERROR("Can't allocate monitor thread EQ: %d\n", rc); + goto err_stop_ping; + } + + rc = lnet_monitor_thr_start(); + if (rc != 0) + goto err_stop_ping; + + rc = lnet_push_target_init(); + if (rc != 0) + goto err_stop_monitor_thr; + + rc = lnet_peer_discovery_start(); + if (rc != 0) + goto err_destroy_push_target; + + lnet_fault_init(); + lnet_router_debugfs_init(); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; + +err_destroy_push_target: + lnet_push_target_fini(); +err_stop_monitor_thr: + lnet_monitor_thr_stop(); +err_stop_ping: + lnet_ping_target_fini(); +err_acceptor_stop: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); +err_destroy_routes: + if (!the_lnet.ln_nis_from_mod_params) + lnet_destroy_routes(); +err_shutdown_lndnis: + lnet_shutdown_lndnets(); +err_empty_list: + lnet_unprepare(); + LASSERT(rc < 0); + mutex_unlock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + struct lnet_net *net; + + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} +EXPORT_SYMBOL(LNetNIInit); + +/** + * Stop LNet interfaces, routing, and forwarding. + * + * Users must call this function once for each successful call to LNetNIInit(). + * Once the LNetNIFini() operation has been started, the results of pending + * API operations are undefined. + * + * \return always 0 for current implementation. + */ +int +LNetNIFini() +{ + mutex_lock(&the_lnet.ln_api_mutex); + + LASSERT(the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT(!the_lnet.ln_niinit_self); + + lnet_fault_fini(); + + lnet_router_debugfs_fini(); + lnet_peer_discovery_stop(); + lnet_push_target_fini(); + lnet_monitor_thr_stop(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnets(); + lnet_unprepare(); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; +} +EXPORT_SYMBOL(LNetNIFini); + +/** + * Grabs the ni data from the ni structure and fills the out + * parameters + * + * \param[in] ni network interface structure + * \param[out] cfg_ni NI config information + * \param[out] tun network and LND tunables + */ +static void +lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, + struct lnet_ioctl_config_lnd_tunables *tun, + struct lnet_ioctl_element_stats *stats, + __u32 tun_size) +{ + size_t min_size = 0; + int i; + + if (!ni || !cfg_ni || !tun) + return; + + if (ni->ni_interfaces[0] != NULL) { + for (i = 0; i < ARRAY_SIZE(ni->ni_interfaces); i++) { + if (ni->ni_interfaces[i] != NULL) { + strncpy(cfg_ni->lic_ni_intf[i], + ni->ni_interfaces[i], + sizeof(cfg_ni->lic_ni_intf[i])); + } + } + } + + cfg_ni->lic_nid = ni->ni_nid; + if (ni->ni_nid == LNET_NID_LO_0) + cfg_ni->lic_status = LNET_NI_STATUS_UP; + else + cfg_ni->lic_status = ni->ni_status->ns_status; + cfg_ni->lic_tcp_bonding = use_tcp_bonding; + cfg_ni->lic_dev_cpt = ni->ni_dev_cpt; + + memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn)); + + if (stats) { + stats->iel_send_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_SEND); + stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats, + 
LNET_STATS_TYPE_RECV); + stats->iel_drop_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_DROP); + } + + /* + * tun->lt_tun will always be present, but in order to be + * backwards compatible, we need to deal with the cases when + * tun->lt_tun is smaller than what the kernel has, because it + * comes from an older version of a userspace program, then we'll + * need to copy as much information as we have available space. + */ + min_size = tun_size - sizeof(tun->lt_cmn); + memcpy(&tun->lt_tun, &ni->ni_lnd_tunables, min_size); + + /* copy over the cpts */ + if (ni->ni_ncpts == LNET_CPT_NUMBER && + ni->ni_cpts == NULL) { + for (i = 0; i < ni->ni_ncpts; i++) + cfg_ni->lic_cpts[i] = i; + } else { + for (i = 0; + ni->ni_cpts != NULL && i < ni->ni_ncpts && + i < LNET_MAX_SHOW_NUM_CPT; + i++) + cfg_ni->lic_cpts[i] = ni->ni_cpts[i]; + } + cfg_ni->lic_ncpts = ni->ni_ncpts; +} + +/** + * NOTE: This is a legacy function left in the code to be backwards + * compatible with older userspace programs. It should eventually be + * removed. + * + * Grabs the ni data from the ni structure and fills the out + * parameters + * + * \param[in] ni network interface structure + * \param[out] config config information + */ +static void +lnet_fill_ni_info_legacy(struct lnet_ni *ni, + struct lnet_ioctl_config_data *config) +{ + struct lnet_ioctl_net_config *net_config; + struct lnet_ioctl_config_lnd_tunables *lnd_cfg = NULL; + size_t min_size, tunable_size = 0; + int i; + + if (!ni || !config) + return; + + net_config = (struct lnet_ioctl_net_config *) config->cfg_bulk; + if (!net_config) + return; + + BUILD_BUG_ON(ARRAY_SIZE(ni->ni_interfaces) != + ARRAY_SIZE(net_config->ni_interfaces)); + + for (i = 0; i < ARRAY_SIZE(ni->ni_interfaces); i++) { + if (!ni->ni_interfaces[i]) + break; + + strncpy(net_config->ni_interfaces[i], + ni->ni_interfaces[i], + sizeof(net_config->ni_interfaces[i])); + } + + config->cfg_nid = ni->ni_nid; + config->cfg_config_u.cfg_net.net_peer_timeout = + ni->ni_net->net_tunables.lct_peer_timeout; + config->cfg_config_u.cfg_net.net_max_tx_credits = + ni->ni_net->net_tunables.lct_max_tx_credits; + config->cfg_config_u.cfg_net.net_peer_tx_credits = + ni->ni_net->net_tunables.lct_peer_tx_credits; + config->cfg_config_u.cfg_net.net_peer_rtr_credits = + ni->ni_net->net_tunables.lct_peer_rtr_credits; + + if (ni->ni_nid == LNET_NID_LO_0) + net_config->ni_status = LNET_NI_STATUS_UP; + else + net_config->ni_status = ni->ni_status->ns_status; + + if (ni->ni_cpts) { + int num_cpts = min(ni->ni_ncpts, LNET_MAX_SHOW_NUM_CPT); + + for (i = 0; i < num_cpts; i++) + net_config->ni_cpts[i] = ni->ni_cpts[i]; + + config->cfg_ncpts = num_cpts; + } + + /* + * See if user land tools sent in a newer and larger version + * of struct lnet_tunables than what the kernel uses. 
+ */ + min_size = sizeof(*config) + sizeof(*net_config); + + if (config->cfg_hdr.ioc_len > min_size) + tunable_size = config->cfg_hdr.ioc_len - min_size; + + /* Don't copy too much data to user space */ + min_size = min(tunable_size, sizeof(ni->ni_lnd_tunables)); + lnd_cfg = (struct lnet_ioctl_config_lnd_tunables *)net_config->cfg_bulk; + + if (lnd_cfg && min_size) { + memcpy(&lnd_cfg->lt_tun, &ni->ni_lnd_tunables, min_size); + config->cfg_config_u.cfg_net.net_interface_count = 1; + + /* Tell user land that kernel side has less data */ + if (tunable_size > sizeof(ni->ni_lnd_tunables)) { + min_size = tunable_size - sizeof(ni->ni_lnd_tunables); + config->cfg_hdr.ioc_len -= min_size; + } + } +} + +struct lnet_ni * +lnet_get_ni_idx_locked(int idx) +{ + struct lnet_ni *ni; + struct lnet_net *net; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (idx-- == 0) + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) +{ + struct lnet_ni *ni; + struct lnet_net *net = mynet; + + /* + * It is possible that the net has been cleaned out while there is + * a message being sent. This function accessed the net without + * checking if the list is empty + */ + if (prev == NULL) { + if (net == NULL) + net = list_entry(the_lnet.ln_nets.next, struct lnet_net, + net_list); + if (list_empty(&net->net_ni_list)) + return NULL; + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + + return ni; + } + + if (prev->ni_netlist.next == &prev->ni_net->net_ni_list) { + /* if you reached the end of the ni list and the net is + * specified, then there are no more nis in that net */ + if (net != NULL) + return NULL; + + /* we reached the end of this net ni list. move to the + * next net */ + if (prev->ni_net->net_list.next == &the_lnet.ln_nets) + /* no more nets and no more NIs. 
*/ + return NULL; + + /* get the next net */ + net = list_entry(prev->ni_net->net_list.next, struct lnet_net, + net_list); + if (list_empty(&net->net_ni_list)) + return NULL; + /* get the ni on it */ + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + + return ni; + } + + if (list_empty(&prev->ni_netlist)) + return NULL; + + /* there are more nis left */ + ni = list_entry(prev->ni_netlist.next, struct lnet_ni, ni_netlist); + + return ni; +} + +int +lnet_get_net_config(struct lnet_ioctl_config_data *config) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + int idx = config->cfg_count; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(idx); + + if (ni != NULL) { + rc = 0; + lnet_ni_lock(ni); + lnet_fill_ni_info_legacy(ni, config); + lnet_ni_unlock(ni); + } + + lnet_net_unlock(cpt); + return rc; +} + +int +lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni, + struct lnet_ioctl_config_lnd_tunables *tun, + struct lnet_ioctl_element_stats *stats, + __u32 tun_size) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + + if (!cfg_ni || !tun || !stats) + return -EINVAL; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(cfg_ni->lic_idx); + + if (ni) { + rc = 0; + lnet_ni_lock(ni); + lnet_fill_ni_info(ni, cfg_ni, tun, stats, tun_size); + lnet_ni_unlock(ni); + } + + lnet_net_unlock(cpt); + return rc; +} + +int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + + if (!msg_stats) + return -EINVAL; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(msg_stats->im_idx); + + if (ni) { + lnet_usr_translate_stats(msg_stats, &ni->ni_stats); + rc = 0; + } + + lnet_net_unlock(cpt); + + return rc; +} + +static int lnet_add_net_common(struct lnet_net *net, + struct lnet_ioctl_config_lnd_tunables *tun) +{ + __u32 net_id; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + int rc; + struct lnet_remotenet *rnet; + int net_ni_count; + int num_acceptor_nets; + + lnet_net_lock(LNET_LOCK_EX); + rnet = lnet_find_rnet_locked(net->net_id); + lnet_net_unlock(LNET_LOCK_EX); + /* + * make sure that the net added doesn't invalidate the current + * configuration LNet is keeping + */ + if (rnet) { + CERROR("Adding net %s will invalidate routing configuration\n", + libcfs_net2str(net->net_id)); + lnet_net_free(net); + return -EUSERS; + } + + /* + * make sure you calculate the correct number of slots in the ping + * buffer. Since the ping info is a flattened list of all the NIs, + * we should allocate enough slots to accomodate the number of NIs + * which will be added. + * + * since ni hasn't been configured yet, use + * lnet_get_net_ni_count_pre() which checks the net_ni_added list + */ + net_ni_count = lnet_get_net_ni_count_pre(net); + + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + net_ni_count + lnet_get_ni_count(), + false); + if (rc < 0) { + lnet_net_free(net); + return rc; + } + + if (tun) + memcpy(&net->net_tunables, + &tun->lt_cmn, sizeof(net->net_tunables)); + else + memset(&net->net_tunables, -1, sizeof(net->net_tunables)); + + /* + * before starting this network get a count of the current TCP + * networks which require the acceptor thread running. If that + * count is == 0 before we start up this network, then we'd want to + * start up the acceptor thread after starting up this network + */ + num_acceptor_nets = lnet_count_acceptor_nets(); + + net_id = net->net_id; + + rc = lnet_startup_lndnet(net, + (tun) ? 
&tun->lt_tun : NULL); + if (rc < 0) + goto failed; + + lnet_net_lock(LNET_LOCK_EX); + net = lnet_get_net_locked(net_id); + lnet_net_unlock(LNET_LOCK_EX); + + LASSERT(net); + + /* + * Start the acceptor thread if this is the first network + * being added that requires the thread. + */ + if (net->net_lnd->lnd_accept && num_acceptor_nets == 0) { + rc = lnet_acceptor_start(); + if (rc < 0) { + /* shutdown the net that we just started */ + CERROR("Failed to start up acceptor thread\n"); + lnet_shutdown_lndnet(net); + goto failed; + } + } + + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_net_added(net); + lnet_net_unlock(LNET_LOCK_EX); + + lnet_ping_target_update(pbuf, ping_mdh); + + return 0; + +failed: + lnet_ping_md_unlink(pbuf, &ping_mdh); + lnet_ping_buffer_decref(pbuf); + return rc; +} + +static int lnet_handle_legacy_ip2nets(char *ip2nets, + struct lnet_ioctl_config_lnd_tunables *tun) +{ + struct lnet_net *net; + char *nets; + int rc; + struct list_head net_head; + + INIT_LIST_HEAD(&net_head); + + rc = lnet_parse_ip2nets(&nets, ip2nets); + if (rc < 0) + return rc; + + rc = lnet_parse_networks(&net_head, nets, use_tcp_bonding); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + rc = lnet_add_net_common(net, tun); + if (rc < 0) + goto out; + } + +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + while (!list_empty(&net_head)) { + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} + +int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) +{ + struct lnet_net *net; + struct lnet_ni *ni; + struct lnet_ioctl_config_lnd_tunables *tun = NULL; + int rc, i; + __u32 net_id, lnd_type; + + /* get the tunables if they are available */ + if (conf->lic_cfg_hdr.ioc_len >= + sizeof(*conf) + sizeof(*tun)) + tun = (struct lnet_ioctl_config_lnd_tunables *) + conf->lic_bulk; + + /* handle legacy ip2nets from DLC */ + if (conf->lic_legacy_ip2nets[0] != '\0') + return lnet_handle_legacy_ip2nets(conf->lic_legacy_ip2nets, + tun); + + net_id = LNET_NIDNET(conf->lic_nid); + lnd_type = LNET_NETTYP(net_id); + + if (!libcfs_isknown_lnd(lnd_type)) { + CERROR("No valid net and lnd information provided\n"); + return -EINVAL; + } + + net = lnet_net_alloc(net_id, NULL); + if (!net) + return -ENOMEM; + + for (i = 0; i < conf->lic_ncpts; i++) { + if (conf->lic_cpts[i] >= LNET_CPT_NUMBER) + return -EINVAL; + } + + ni = lnet_ni_alloc_w_cpt_array(net, conf->lic_cpts, conf->lic_ncpts, + conf->lic_ni_intf[0]); + if (!ni) + return -ENOMEM; + + mutex_lock(&the_lnet.ln_api_mutex); + + rc = lnet_add_net_common(net, tun); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) +{ + struct lnet_net *net; + struct lnet_ni *ni; + __u32 net_id = LNET_NIDNET(conf->lic_nid); + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + int rc; + int net_count; + __u32 addr; + + /* don't allow userspace to shutdown the LOLND */ + if (LNET_NETTYP(net_id) == LOLND) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + + lnet_net_lock(0); + + net = lnet_get_net_locked(net_id); + if (!net) { + CERROR("net %s not found\n", + libcfs_net2str(net_id)); + rc = -ENOENT; + goto unlock_net; + } + + addr = LNET_NIDADDR(conf->lic_nid); + if (addr == 0) { + /* remove the entire net */ + net_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + 
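/* The whole net is going away: the replacement ping target is sized
* without this net's NIs, and the acceptor is stopped below if no
* remaining net needs it. */
+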
/* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - net_count, + false); + if (rc != 0) + goto unlock_api_mutex; + + lnet_shutdown_lndnet(net); + + if (lnet_count_acceptor_nets() == 0) + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + + goto unlock_api_mutex; + } + + ni = lnet_nid2ni_locked(conf->lic_nid, 0); + if (!ni) { + CERROR("nid %s not found\n", + libcfs_nid2str(conf->lic_nid)); + rc = -ENOENT; + goto unlock_net; + } + + net_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - 1, false); + if (rc != 0) + goto unlock_api_mutex; + + lnet_shutdown_lndni(ni); + + if (lnet_count_acceptor_nets() == 0) + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + + /* check if the net is empty and remove it if it is */ + if (net_count == 1) + lnet_shutdown_lndnet(net); + + goto unlock_api_mutex; + +unlock_net: + lnet_net_unlock(0); +unlock_api_mutex: + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +/* + * lnet_dyn_add_net and lnet_dyn_del_net are now deprecated. + * They are only expected to be called for unique networks. + * That can be as a result of older DLC library + * calls. Multi-Rail DLC and beyond no longer uses these APIs. + */ +int +lnet_dyn_add_net(struct lnet_ioctl_config_data *conf) +{ + struct lnet_net *net; + struct list_head net_head; + int rc; + struct lnet_ioctl_config_lnd_tunables tun; + char *nets = conf->cfg_config_u.cfg_net.net_intf; + + INIT_LIST_HEAD(&net_head); + + /* Create a net/ni structures for the network string */ + rc = lnet_parse_networks(&net_head, nets, use_tcp_bonding); + if (rc <= 0) + return rc == 0 ? 
-EINVAL : rc; + + mutex_lock(&the_lnet.ln_api_mutex); + + if (rc > 1) { + rc = -EINVAL; /* only add one network per call */ + goto out_unlock_clean; + } + + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + + LASSERT(lnet_net_unique(net->net_id, &the_lnet.ln_nets, NULL)); + + memset(&tun, 0, sizeof(tun)); + + tun.lt_cmn.lct_peer_timeout = + conf->cfg_config_u.cfg_net.net_peer_timeout; + tun.lt_cmn.lct_peer_tx_credits = + conf->cfg_config_u.cfg_net.net_peer_tx_credits; + tun.lt_cmn.lct_peer_rtr_credits = + conf->cfg_config_u.cfg_net.net_peer_rtr_credits; + tun.lt_cmn.lct_max_tx_credits = + conf->cfg_config_u.cfg_net.net_max_tx_credits; + + rc = lnet_add_net_common(net, &tun); + +out_unlock_clean: + mutex_unlock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + /* net_head list is empty in success case */ + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} + +int +lnet_dyn_del_net(__u32 net_id) +{ + struct lnet_net *net; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; + int rc; + int net_ni_count; + + /* don't allow userspace to shutdown the LOLND */ + if (LNET_NETTYP(net_id) == LOLND) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + + lnet_net_lock(0); + + net = lnet_get_net_locked(net_id); + if (net == NULL) { + lnet_net_unlock(0); + rc = -EINVAL; + goto out; + } + + net_ni_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - net_ni_count, false); + if (rc != 0) + goto out; + + lnet_shutdown_lndnet(net); + + if (lnet_count_acceptor_nets() == 0) + lnet_acceptor_stop(); + + lnet_ping_target_update(pbuf, ping_mdh); + +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +void lnet_incr_dlc_seq(void) +{ + atomic_inc(&lnet_dlc_seq_no); +} + +__u32 lnet_get_dlc_seq_locked(void) +{ + return atomic_read(&lnet_dlc_seq_no); +} + +static void +lnet_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid || all) { + atomic_set(&ni->ni_healthv, value); + if (list_empty(&ni->ni_recovery) && + value < LNET_MAX_HEALTH_VALUE) { + CERROR("manually adding local NI %s to recovery\n", + libcfs_nid2str(ni->ni_nid)); + list_add_tail(&ni->ni_recovery, + &the_lnet.ln_mt_localNIRecovq); + lnet_ni_addref_locked(ni, 0); + } + if (!all) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_get_local_ni_hstats(struct lnet_ioctl_local_ni_hstats *stats) +{ + int cpt, rc = 0; + struct lnet_ni *ni; + lnet_nid_t nid = stats->hlni_nid; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + + if (!ni) { + rc = -ENOENT; + goto unlock; + } + + stats->hlni_local_interrupt = atomic_read(&ni->ni_hstats.hlt_local_interrupt); + stats->hlni_local_dropped = atomic_read(&ni->ni_hstats.hlt_local_dropped); + stats->hlni_local_aborted = atomic_read(&ni->ni_hstats.hlt_local_aborted); + stats->hlni_local_no_route = atomic_read(&ni->ni_hstats.hlt_local_no_route); + stats->hlni_local_timeout = atomic_read(&ni->ni_hstats.hlt_local_timeout); + stats->hlni_local_error = atomic_read(&ni->ni_hstats.hlt_local_error); + 
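/* Alongside the raw failure counters, report the NI's current
* health score. */
+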
stats->hlni_health_value = atomic_read(&ni->ni_healthv); + +unlock: + lnet_net_unlock(cpt); + + return rc; +} + +static int +lnet_get_local_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_ni *ni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(ni, &the_lnet.ln_mt_localNIRecovq, ni_recovery) { + list->rlst_nid_array[i] = ni->ni_nid; + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + +static int +lnet_get_peer_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_peer_ni *lpni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(lpni, &the_lnet.ln_mt_peerNIRecovq, lpni_recovery) { + list->rlst_nid_array[i] = lpni->lpni_nid; + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + +/** + * LNet ioctl handler. + * + */ +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + struct lnet_ioctl_config_data *config; + struct lnet_process_id id = {0}; + struct lnet_ni *ni; + int rc; + + BUILD_BUG_ON(sizeof(struct lnet_ioctl_net_config) + + sizeof(struct lnet_ioctl_config_data) > LIBCFS_IOC_DATA_MAX); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = id.nid; + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_add_route(config->cfg_net, + config->cfg_config_u.cfg_route.rtr_hop, + config->cfg_nid, + config->cfg_config_u.cfg_route. + rtr_priority); + if (rc == 0) { + rc = lnet_check_routes(); + if (rc != 0) + lnet_del_route(config->cfg_net, + config->cfg_nid); + } + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_DEL_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_route(config->cfg_net, config->cfg_nid); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_GET_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_route(config->cfg_count, + &config->cfg_net, + &config->cfg_config_u.cfg_route.rtr_hop, + &config->cfg_nid, + &config->cfg_config_u.cfg_route.rtr_flags, + &config->cfg_config_u.cfg_route. 
+ rtr_priority); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_GET_LOCAL_NI: { + struct lnet_ioctl_config_ni *cfg_ni; + struct lnet_ioctl_config_lnd_tunables *tun = NULL; + struct lnet_ioctl_element_stats *stats; + __u32 tun_size; + + cfg_ni = arg; + + /* get the tunables if they are available */ + if (cfg_ni->lic_cfg_hdr.ioc_len < + sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun)) + return -EINVAL; + + stats = (struct lnet_ioctl_element_stats *) + cfg_ni->lic_bulk; + tun = (struct lnet_ioctl_config_lnd_tunables *) + (cfg_ni->lic_bulk + sizeof(*stats)); + + tun_size = cfg_ni->lic_cfg_hdr.ioc_len - sizeof(*cfg_ni) - + sizeof(*stats); + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_config(cfg_ni, tun, stats, tun_size); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: { + struct lnet_ioctl_element_msg_stats *msg_stats = arg; + + if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_stats(msg_stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_NET: { + size_t total = sizeof(*config) + + sizeof(struct lnet_ioctl_net_config); + config = arg; + + if (config->cfg_hdr.ioc_len < total) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_net_config(config); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LNET_STATS: + { + struct lnet_ioctl_lnet_stats *lnet_stats = arg; + + if (lnet_stats->st_hdr.ioc_len < sizeof(*lnet_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + lnet_counters_get(&lnet_stats->st_cntrs); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_CONFIG_RTR: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (config->cfg_config_u.cfg_buffers.buf_enable) { + rc = lnet_rtrpools_enable(); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + lnet_rtrpools_disable(); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + + case IOC_LIBCFS_ADD_BUF: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_rtrpools_adjust(config->cfg_config_u.cfg_buffers. + buf_tiny, + config->cfg_config_u.cfg_buffers. + buf_small, + config->cfg_config_u.cfg_buffers. 
+ buf_large); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_SET_NUMA_RANGE: { + struct lnet_ioctl_set_value *numa; + numa = arg; + if (numa->sv_hdr.ioc_len != sizeof(*numa)) + return -EINVAL; + lnet_net_lock(LNET_LOCK_EX); + lnet_numa_range = numa->sv_value; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + } + + case IOC_LIBCFS_GET_NUMA_RANGE: { + struct lnet_ioctl_set_value *numa; + numa = arg; + if (numa->sv_hdr.ioc_len != sizeof(*numa)) + return -EINVAL; + numa->sv_value = lnet_numa_range; + return 0; + } + + case IOC_LIBCFS_GET_BUF: { + struct lnet_ioctl_pool_cfg *pool_cfg; + size_t total = sizeof(*config) + sizeof(*pool_cfg); + + config = arg; + + if (config->cfg_hdr.ioc_len < total) + return -EINVAL; + + pool_cfg = (struct lnet_ioctl_pool_cfg *)config->cfg_bulk; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_rtr_pool_cfg(config->cfg_count, pool_cfg); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LOCAL_HSTATS: { + struct lnet_ioctl_local_ni_hstats *stats = arg; + + if (stats->hlni_hdr.ioc_len < sizeof(*stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_local_ni_hstats(stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_RECOVERY_QUEUE: { + struct lnet_ioctl_recovery_list *list = arg; + if (list->rlst_hdr.ioc_len < sizeof(*list)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (list->rlst_type == LNET_HEALTH_TYPE_LOCAL_NI) + rc = lnet_get_local_ni_recovery_list(list); + else + rc = lnet_get_peer_ni_recovery_list(list); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_ADD_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_add_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid, + cfg->prcfg_mr); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_DEL_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_INFO: { + struct lnet_ioctl_peer *peer_info = arg; + + if (peer_info->pr_hdr.ioc_len < sizeof(*peer_info)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_ni_info( + peer_info->pr_count, + &peer_info->pr_nid, + peer_info->pr_lnd_u.pr_peer_credits.cr_aliveness, + &peer_info->pr_lnd_u.pr_peer_credits.cr_ncpt, + &peer_info->pr_lnd_u.pr_peer_credits.cr_refcount, + &peer_info->pr_lnd_u.pr_peer_credits.cr_ni_peer_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_rtr_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_min_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_qnob); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_info(cfg, + (void __user *)cfg->prcfg_bulk); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_LIST: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + 
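/* prcfg_bulk is a userspace buffer; the peer list is copied
* directly into it. */
+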
rc = lnet_get_peer_list(&cfg->prcfg_count, &cfg->prcfg_size, + (struct lnet_process_id __user *)cfg->prcfg_bulk); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_SET_HEALHV: { + struct lnet_ioctl_reset_health_cfg *cfg = arg; + int value; + if (cfg->rh_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + if (cfg->rh_value < 0 || + cfg->rh_value > LNET_MAX_HEALTH_VALUE) + value = LNET_MAX_HEALTH_VALUE; + else + value = cfg->rh_value; + CDEBUG(D_NET, "Manually setting healthv to %d for %s:%s. all = %d\n", + value, (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) ? + "local" : "peer", libcfs_nid2str(cfg->rh_nid), cfg->rh_all); + mutex_lock(&the_lnet.ln_api_mutex); + if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) + lnet_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + else + lnet_peer_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_NOTIFY_ROUTER: { + time64_t deadline = ktime_get_real_seconds() - data->ioc_u64[0]; + + /* The deadline passed in by the user should be some time in + * seconds in the future since the UNIX epoch. We have to map + * that deadline to the wall clock. + */ + deadline += ktime_get_seconds(); + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, + deadline); + } + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_testprotocompat = data->ioc_flags; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + + case IOC_LIBCFS_LNET_FAULT: + return lnet_fault_ctl(data->ioc_flags, data); + + case IOC_LIBCFS_PING: { + signed long timeout; + + id.nid = data->ioc_nid; + id.pid = data->ioc_u32[0]; + + /* If timeout is negative then set default of 3 minutes */ + if (((s32)data->ioc_u32[1] <= 0) || + data->ioc_u32[1] > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = msecs_to_jiffies(DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC); + else + timeout = msecs_to_jiffies(data->ioc_u32[1]); + + rc = lnet_ping(id, timeout, data->ioc_pbuf1, + data->ioc_plen1 / sizeof(struct lnet_process_id)); + + if (rc < 0) + return rc; + + data->ioc_count = rc; + return 0; + } + + case IOC_LIBCFS_PING_PEER: { + struct lnet_ioctl_ping_data *ping = arg; + struct lnet_peer *lp; + signed long timeout; + + /* If timeout is negative then set default of 3 minutes */ + if (((s32)ping->op_param) <= 0 || + ping->op_param > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = msecs_to_jiffies(DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC); + else + timeout = msecs_to_jiffies(ping->op_param); + + rc = lnet_ping(ping->ping_id, timeout, + ping->ping_buf, + ping->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer(ping->ping_id.nid); + if (lp) { + ping->ping_id.nid = lp->lp_primary_nid; + ping->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + ping->ping_count = rc; + return 0; + } + + case IOC_LIBCFS_DISCOVER: { + struct lnet_ioctl_ping_data *discover = arg; + struct lnet_peer *lp; + + rc = lnet_discover(discover->ping_id, discover->op_param, + discover->ping_buf, + discover->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer(discover->ping_id.nid); + if (lp) { + discover->ping_id.nid = lp->lp_primary_nid; + discover->mr_info = lnet_peer_is_multi_rail(lp); + 
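/* Drop the reference taken by lnet_find_peer() above. */
+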
lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + discover->ping_count = rc; + return 0; + } + + default: + ni = lnet_net2ni_addref(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_net->net_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = ni->ni_net->net_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; + } + /* not reached */ +} +EXPORT_SYMBOL(LNetCtl); + +void LNetDebugPeer(struct lnet_process_id id) +{ + lnet_debug_peer(id.nid); +} +EXPORT_SYMBOL(LNetDebugPeer); + +/** + * Determine if the specified peer \a nid is on the local node. + * + * \param nid peer nid to check + * + * \retval true If peer NID is on the local node. + * \retval false If peer NID is not on the local node. + */ +bool LNetIsPeerLocal(lnet_nid_t nid) +{ + struct lnet_net *net; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid) { + lnet_net_unlock(cpt); + return true; + } + } + } + lnet_net_unlock(cpt); + + return false; +} +EXPORT_SYMBOL(LNetIsPeerLocal); + +/** + * Retrieve the struct lnet_process_id ID of LNet interface at \a index. + * Note that all interfaces share a same PID, as requested by LNetNIInit(). + * + * \param index Index of the interface to look up. + * \param id On successful return, this location will hold the + * struct lnet_process_id ID of the interface. + * + * \retval 0 If an interface exists at \a index. + * \retval -ENOENT If no interface has been found. + */ +int +LNetGetId(unsigned int index, struct lnet_process_id *id) +{ + struct lnet_ni *ni; + struct lnet_net *net; + int cpt; + int rc = -ENOENT; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (index-- != 0) + continue; + + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; + } + } + + lnet_net_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetGetId); + +static int lnet_ping(struct lnet_process_id id, signed long timeout, + struct lnet_process_id __user *ids, int n_ids) +{ + struct lnet_handle_eq eqh; + struct lnet_handle_md mdh; + struct lnet_event event; + struct lnet_md md = { NULL }; + int which; + int unlinked = 0; + int replied = 0; + const signed long a_long_time = msecs_to_jiffies(60 * MSEC_PER_SEC); + struct lnet_ping_buffer *pbuf; + struct lnet_process_id tmpid; + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; + + /* n_ids limit is arbitrary */ + if (n_ids <= 0 || id.nid == LNET_NID_ANY) + return -EINVAL; + + /* + * if the user buffer has more space than the lnet_interfaces_max + * then only fill it up to lnet_interfaces_max + */ + if (n_ids > lnet_interfaces_max) + n_ids = lnet_interfaces_max; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + pbuf = lnet_ping_buffer_alloc(n_ids, GFP_NOFS); + if (!pbuf) + return -ENOMEM; + + /* NB 2 events max (including any unlink event) */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + goto fail_ping_buffer_decref; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(n_ids); + md.threshold = 2; /* GET/REPLY */ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = NULL; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, &mdh); + if (rc != 0) { + 
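/* Binding failed; the EQ allocated above is freed on the error path. */
+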
CERROR("Can't bind MD: %d\n", rc); + goto fail_free_eq; + } + + rc = LNetGet(LNET_NID_ANY, mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, false); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + rc2 = LNetMDUnlink(mdh); + LASSERT(rc2 == 0); + + /* NB must wait for the UNLINK event below... */ + unlinked = 1; + timeout = a_long_time; + } + + do { + /* MUST block for unlink to complete */ + if (unlinked) + blocked = cfs_block_allsigs(); + + rc2 = LNetEQPoll(&eqh, 1, timeout, &event, &which); + + if (unlinked) + cfs_restore_sigs(blocked); + + CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, + (rc2 <= 0) ? -1 : event.type, + (rc2 <= 0) ? -1 : event.status, + (rc2 > 0 && event.unlinked) ? " unlinked" : ""); + + LASSERT(rc2 != -EOVERFLOW); /* can't miss anything */ + + if (rc2 <= 0 || event.status != 0) { + /* timeout or error */ + if (!replied && rc == 0) + rc = (rc2 < 0) ? rc2 : + (rc2 == 0) ? -ETIMEDOUT : + event.status; + + if (!unlinked) { + /* Ensure completion in finite time... */ + LNetMDUnlink(mdh); + /* No assertion (racing with network) */ + unlinked = 1; + timeout = a_long_time; + } else if (rc2 == 0) { + /* timed out waiting for unlink */ + CWARN("ping %s: late network completion\n", + libcfs_id2str(id)); + } + } else if (event.type == LNET_EVENT_REPLY) { + replied = 1; + rc = event.mlength; + } + } while (rc2 <= 0 || !event.unlinked); + + if (!replied) { + if (rc >= 0) + CWARN("%s: Unexpected rc >= 0 but no reply!\n", + libcfs_id2str(id)); + rc = -EIO; + goto fail_free_eq; + } + + nob = rc; + LASSERT(nob >= 0 && nob <= LNET_PING_INFO_SIZE(n_ids)); + + rc = -EPROTO; /* if I can't parse... */ + + if (nob < 8) { + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto fail_free_eq; + } + + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(pbuf); + } else if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), pbuf->pb_info.pi_magic); + goto fail_free_eq; + } + + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + CERROR("%s: ping w/o NI status: 0x%x\n", + libcfs_id2str(id), pbuf->pb_info.pi_features); + goto fail_free_eq; + } + + if (nob < LNET_PING_INFO_SIZE(0)) { + CERROR("%s: Short reply %d(%d min)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(0)); + goto fail_free_eq; + } + + if (pbuf->pb_info.pi_nnis < n_ids) + n_ids = pbuf->pb_info.pi_nnis; + + if (nob < LNET_PING_INFO_SIZE(n_ids)) { + CERROR("%s: Short reply %d(%d expected)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(n_ids)); + goto fail_free_eq; + } + + rc = -EFAULT; /* if I segv in copy_to_user()... 
*/ + + memset(&tmpid, 0, sizeof(tmpid)); + for (i = 0; i < n_ids; i++) { + tmpid.pid = pbuf->pb_info.pi_pid; + tmpid.nid = pbuf->pb_info.pi_ni[i].ns_nid; + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto fail_free_eq; + } + rc = pbuf->pb_info.pi_nnis; + + fail_free_eq: + rc2 = LNetEQFree(eqh); + if (rc2 != 0) + CERROR("rc2 %d\n", rc2); + LASSERT(rc2 == 0); + + fail_ping_buffer_decref: + lnet_ping_buffer_decref(pbuf); + return rc; +} + +static int +lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *p; + struct lnet_peer *lp; + struct lnet_process_id *buf; + int cpt; + int i; + int rc; + int max_intf = lnet_interfaces_max; + size_t buf_size; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY) + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + /* + * if the user buffer has more space than the max_intf + * then only fill it up to max_intf + */ + if (n_ids > max_intf) + n_ids = max_intf; + + buf_size = n_ids * sizeof(*buf); + + LIBCFS_ALLOC(buf, buf_size); + if (!buf) + return -ENOMEM; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(id.nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out; + } + + /* + * Clearing the NIDS_UPTODATE flag ensures the peer will + * be discovered, provided discovery has not been disabled. + */ + lp = lpni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + /* If the force flag is set, force a PING and PUSH as well. */ + if (force) + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + + /* Peer may have changed. */ + lp = lpni->lpni_peer_net->lpn_peer; + if (lp->lp_nnis < n_ids) + n_ids = lp->lp_nnis; + + i = 0; + p = NULL; + while ((p = lnet_get_next_peer_ni_locked(lp, NULL, p)) != NULL) { + buf[i].pid = id.pid; + buf[i].nid = p->lpni_nid; + if (++i >= n_ids) + break; + } + + lnet_net_unlock(cpt); + + rc = -EFAULT; + if (copy_to_user(ids, buf, n_ids * sizeof(*buf))) + goto out_relock; + rc = n_ids; +out_relock: + lnet_net_lock(cpt); +out_decref: + lnet_peer_ni_decref_locked(lpni); +out: + lnet_net_unlock(cpt); + + LIBCFS_FREE(buf, buf_size); + + return rc; +} + +/** + * Retrieve peer discovery status. + * + * \retval 1 if lnet_peer_discovery_disabled is 0 + * \retval 0 if lnet_peer_discovery_disabled is 1 + */ +int +LNetGetPeerDiscoveryStatus(void) +{ + return !lnet_peer_discovery_disabled; +} +EXPORT_SYMBOL(LNetGetPeerDiscoveryStatus); diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c new file mode 100644 index 0000000000000..741711af0813f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -0,0 +1,1742 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#include + +/* tmp struct for parsing routes */ +struct lnet_text_buf { + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +}; + +static int lnet_tbnob = 0; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +#define SPACESTR " \t\v\r\n" +#define DELIMITERS ":()[]" + +static void +lnet_syntax(const char *name, const char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 0 : width - 1, dashes); +} + +static int +lnet_issep (char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +bool +lnet_net_unique(__u32 net_id, struct list_head *netlist, + struct lnet_net **net) +{ + struct lnet_net *net_l; + + if (!netlist) + return true; + + list_for_each_entry(net_l, netlist, net_list) { + if (net_l->net_id == net_id) { + if (net != NULL) + *net = net_l; + return false; + } + } + + return true; +} + +/* check that the NI is unique within the list of NIs already added to + * a network */ +bool +lnet_ni_unique_net(struct list_head *nilist, char *iface) +{ + struct list_head *tmp; + struct lnet_ni *ni; + + list_for_each(tmp, nilist) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + + if (ni->ni_interfaces[0] != NULL && + strncmp(ni->ni_interfaces[0], iface, strlen(iface)) == 0) + return false; + } + + return true; +} + +/* check that the NI is unique to the interfaces with in the same NI. 
+ * This is only a consideration if use_tcp_bonding is set */ +static bool +lnet_ni_unique_ni(char *iface_list[LNET_INTERFACES_NUM], char *iface) +{ + int i; + for (i = 0; i < LNET_INTERFACES_NUM; i++) { + if (iface_list[i] != NULL && + strncmp(iface_list[i], iface, strlen(iface)) == 0) + return false; + } + + return true; +} + +static bool +in_array(__u32 *array, __u32 size, __u32 value) +{ + int i; + + for (i = 0; i < size; i++) { + if (array[i] == value) + return false; + } + + return true; +} + +static int +lnet_net_append_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net) +{ + __u32 *added_cpts = NULL; + int i, j = 0, rc = 0; + + /* + * no need to go futher since a subset of the NIs already exist on + * all CPTs + */ + if (net->net_ncpts == LNET_CPT_NUMBER) + return 0; + + if (cpts == NULL) { + /* there is an NI which will exist on all CPTs */ + if (net->net_cpts != NULL) + LIBCFS_FREE(net->net_cpts, sizeof(*net->net_cpts) * + net->net_ncpts); + net->net_cpts = NULL; + net->net_ncpts = LNET_CPT_NUMBER; + return 0; + } + + if (net->net_cpts == NULL) { + LIBCFS_ALLOC(net->net_cpts, sizeof(*net->net_cpts) * ncpts); + if (net->net_cpts == NULL) + return -ENOMEM; + memcpy(net->net_cpts, cpts, ncpts * sizeof(*net->net_cpts)); + net->net_ncpts = ncpts; + return 0; + } + + LIBCFS_ALLOC(added_cpts, sizeof(*added_cpts) * LNET_CPT_NUMBER); + if (added_cpts == NULL) + return -ENOMEM; + + for (i = 0; i < ncpts; i++) { + if (!in_array(net->net_cpts, net->net_ncpts, cpts[i])) { + added_cpts[j] = cpts[i]; + j++; + } + } + + /* append the new cpts if any to the list of cpts in the net */ + if (j > 0) { + __u32 *array = NULL, *loc; + __u32 total_entries = j + net->net_ncpts; + + LIBCFS_ALLOC(array, sizeof(*net->net_cpts) * total_entries); + if (array == NULL) { + rc = -ENOMEM; + goto failed; + } + + memcpy(array, net->net_cpts, net->net_ncpts); + loc = array + net->net_ncpts; + memcpy(loc, added_cpts, j); + + LIBCFS_FREE(net->net_cpts, sizeof(*net->net_cpts) * + net->net_ncpts); + net->net_ncpts = total_entries; + net->net_cpts = array; + } + +failed: + LIBCFS_FREE(added_cpts, sizeof(*added_cpts) * LNET_CPT_NUMBER); + + return rc; +} + +static void +lnet_net_remove_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net) +{ + struct lnet_ni *ni; + int rc; + + /* + * Operation Assumption: + * This function is called after an NI has been removed from + * its parent net. + * + * if we're removing an NI which exists on all CPTs then + * we have to check if any of the other NIs on this net also + * exists on all CPTs. If none, then we need to build our Net CPT + * list based on the remaining NIs. + * + * If the NI being removed exist on a subset of the CPTs then we + * alo rebuild the Net CPT list based on the remaining NIs, which + * should resutl in the expected Net CPT list. + */ + + /* + * sometimes this function can be called due to some failure + * creating an NI, before any of the cpts are allocated, so check + * for that case and don't do anything + */ + if (ncpts == 0) + return; + + if (ncpts == LNET_CPT_NUMBER) { + /* + * first iteration through the NI list in the net to see + * if any of the NIs exist on all the CPTs. If one is + * found then our job is done. + */ + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_ncpts == LNET_CPT_NUMBER) + return; + } + } + + /* + * Rebuild the Net CPT list again, thereby only including only the + * CPTs which the remaining NIs are associated with. 
+ */ + if (net->net_cpts != NULL) { + LIBCFS_FREE(net->net_cpts, + sizeof(*net->net_cpts) * net->net_ncpts); + net->net_cpts = NULL; + } + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, + net); + if (rc != 0) { + CERROR("Out of Memory\n"); + /* + * do our best to keep on going. Delete + * the net cpts and set it to NULL. This + * way we can keep on going but less + * efficiently, since memory accesses might be + * accross CPT lines. + */ + if (net->net_cpts != NULL) { + LIBCFS_FREE(net->net_cpts, + sizeof(*net->net_cpts) * + net->net_ncpts); + net->net_cpts = NULL; + net->net_ncpts = LNET_CPT_NUMBER; + } + return; + } + } +} + +void +lnet_ni_free(struct lnet_ni *ni) +{ + int i; + + lnet_net_remove_cpts(ni->ni_cpts, ni->ni_ncpts, ni->ni_net); + + if (ni->ni_refs != NULL) + cfs_percpt_free(ni->ni_refs); + + if (ni->ni_tx_queues != NULL) + cfs_percpt_free(ni->ni_tx_queues); + + if (ni->ni_cpts != NULL) + cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); + + for (i = 0; i < LNET_INTERFACES_NUM && + ni->ni_interfaces[i] != NULL; i++) { + LIBCFS_FREE(ni->ni_interfaces[i], + strlen(ni->ni_interfaces[i]) + 1); + } + + /* release reference to net namespace */ + if (ni->ni_net_ns != NULL) + put_net(ni->ni_net_ns); + + LIBCFS_FREE(ni, sizeof(*ni)); +} + +void +lnet_net_free(struct lnet_net *net) +{ + struct list_head *tmp, *tmp2; + struct lnet_ni *ni; + + LASSERT(list_empty(&net->net_ni_zombie)); + + /* + * delete any nis that haven't been added yet. This could happen + * if there is a failure on net startup + */ + list_for_each_safe(tmp, tmp2, &net->net_ni_added) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + lnet_ni_free(ni); + } + + /* delete any nis which have been started. */ + list_for_each_safe(tmp, tmp2, &net->net_ni_list) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + lnet_ni_free(ni); + } + + if (net->net_cpts != NULL) + LIBCFS_FREE(net->net_cpts, + sizeof(*net->net_cpts) * net->net_ncpts); + + LIBCFS_FREE(net, sizeof(*net)); +} + +struct lnet_net * +lnet_net_alloc(__u32 net_id, struct list_head *net_list) +{ + struct lnet_net *net; + + if (!lnet_net_unique(net_id, net_list, NULL)) { + CERROR("Duplicate net %s. Ignore\n", + libcfs_net2str(net_id)); + return NULL; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) { + CERROR("Out of memory creating network %s\n", + libcfs_net2str(net_id)); + return NULL; + } + + INIT_LIST_HEAD(&net->net_list); + INIT_LIST_HEAD(&net->net_ni_list); + INIT_LIST_HEAD(&net->net_ni_added); + INIT_LIST_HEAD(&net->net_ni_zombie); + + net->net_id = net_id; + net->net_state = LNET_NET_STATE_INIT; + + /* initialize global paramters to undefiend */ + net->net_tunables.lct_peer_timeout = -1; + net->net_tunables.lct_max_tx_credits = -1; + net->net_tunables.lct_peer_tx_credits = -1; + net->net_tunables.lct_peer_rtr_credits = -1; + + if (net_list) + list_add_tail(&net->net_list, net_list); + + return net; +} + +static int +lnet_ni_add_interface(struct lnet_ni *ni, char *iface) +{ + int niface = 0; + + if (ni == NULL) + return -ENOMEM; + + if (!lnet_ni_unique_ni(ni->ni_interfaces, iface)) + return -EINVAL; + + /* Allocate a separate piece of memory and copy + * into it the string, so we don't have + * a depencency on the tokens string. This way we + * can free the tokens at the end of the function. 
+ * The newly allocated ni_interfaces[] can be + * freed when freeing the NI */ + while (niface < LNET_INTERFACES_NUM && + ni->ni_interfaces[niface] != NULL) + niface++; + + if (niface >= LNET_INTERFACES_NUM) { + LCONSOLE_ERROR_MSG(0x115, "Too many interfaces " + "for net %s\n", + libcfs_net2str(LNET_NIDNET(ni->ni_nid))); + return -EINVAL; + } + + LIBCFS_ALLOC(ni->ni_interfaces[niface], + strlen(iface) + 1); + + if (ni->ni_interfaces[niface] == NULL) { + CERROR("Can't allocate net interface name\n"); + return -ENOMEM; + } + + strncpy(ni->ni_interfaces[niface], iface, + strlen(iface) + 1); + + return 0; +} + +static struct lnet_ni * +lnet_ni_alloc_common(struct lnet_net *net, char *iface) +{ + struct lnet_tx_queue *tq; + struct lnet_ni *ni; + int i; + + if (iface != NULL) + /* make sure that this NI is unique in the net it's + * being added to */ + if (!lnet_ni_unique_net(&net->net_ni_added, iface)) + return NULL; + + LIBCFS_ALLOC(ni, sizeof(*ni)); + if (ni == NULL) { + CERROR("Out of memory creating network interface %s%s\n", + libcfs_net2str(net->net_id), + (iface != NULL) ? iface : ""); + return NULL; + } + + spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_netlist); + INIT_LIST_HEAD(&ni->ni_recovery); + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_refs[0])); + if (ni->ni_refs == NULL) + goto failed; + + ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_tx_queues[0])); + if (ni->ni_tx_queues == NULL) + goto failed; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) + INIT_LIST_HEAD(&tq->tq_delayed); + + ni->ni_net = net; + /* LND will fill in the address part of the NID */ + ni->ni_nid = LNET_MKNID(net->net_id, 0); + + /* Store net namespace in which current ni is being created */ + if (current->nsproxy && current->nsproxy->net_ns) + ni->ni_net_ns = get_net(current->nsproxy->net_ns); + else + ni->ni_net_ns = get_net(&init_net); + + ni->ni_last_alive = ktime_get_real_seconds(); + ni->ni_state = LNET_NI_STATE_INIT; + list_add_tail(&ni->ni_netlist, &net->net_ni_added); + + /* + * if an interface name is provided then make sure to add in that + * interface name in NI + */ + if (iface) + if (lnet_ni_add_interface(ni, iface) != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +/* allocate and add to the provided network */ +struct lnet_ni * +lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, char *iface) +{ + struct lnet_ni *ni; + int rc; + + ni = lnet_ni_alloc_common(net, iface); + if (!ni) + return NULL; + + if (!el) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); + if (rc <= 0) { + CERROR("Failed to set CPTs for NI %s(%s): %d\n", + libcfs_net2str(net->net_id), + (iface != NULL) ? 
iface : "", rc); + goto failed; + } + + LASSERT(rc <= LNET_CPT_NUMBER); + if (rc == LNET_CPT_NUMBER) { + LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0])); + ni->ni_cpts = NULL; + } + + ni->ni_ncpts = rc; + } + + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net); + if (rc != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +struct lnet_ni * +lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts, + char *iface) +{ + struct lnet_ni *ni; + int rc; + + ni = lnet_ni_alloc_common(net, iface); + if (!ni) + return NULL; + + if (ncpts == 0) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + size_t array_size = ncpts * sizeof(ni->ni_cpts[0]); + LIBCFS_ALLOC(ni->ni_cpts, array_size); + if (ni->ni_cpts == NULL) + goto failed; + memcpy(ni->ni_cpts, cpts, array_size); + ni->ni_ncpts = ncpts; + } + + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net); + if (rc != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +/* + * Parse the networks string and create the matching set of NIs on the + * nilist. + */ +int +lnet_parse_networks(struct list_head *netlist, char *networks, + bool use_tcp_bonding) +{ + struct cfs_expr_list *net_el = NULL; + struct cfs_expr_list *ni_el = NULL; + int tokensize; + char *tokens; + char *str; + struct lnet_net *net; + struct lnet_ni *ni = NULL; + __u32 net_id; + int nnets = 0; + + if (networks == NULL) { + CERROR("networks string is undefined\n"); + return -EINVAL; + } + + if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) { + /* _WAY_ conservative */ + LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too " + "long\n"); + return -EINVAL; + } + + tokensize = strlen(networks) + 1; + + LIBCFS_ALLOC(tokens, tokensize); + if (tokens == NULL) { + CERROR("Can't allocate net tokens\n"); + return -ENOMEM; + } + + memcpy(tokens, networks, tokensize); + str = tokens; + + /* + * Main parser loop. + * + * NB we don't check interface conflicts here; it's the LNDs + * responsibility (if it cares at all) + */ + do { + char *nistr; + char *elstr; + char *name; + int rc; + + /* + * Parse a network string into its components. + * + * {"("...")"}{"[""]"} + */ + + /* Network name (mandatory) */ + while (isspace(*str)) + *str++ = '\0'; + if (!*str) + break; + name = str; + str += strcspn(str, SPACESTR ":()[],"); + while (isspace(*str)) + *str++ = '\0'; + + /* Interface list (optional) */ + if (*str == '(') { + *str++ = '\0'; + nistr = str; + str += strcspn(str, ")"); + if (*str != ')') { + str = nistr; + goto failed_syntax; + } + do { + *str++ = '\0'; + } while (isspace(*str)); + } else { + nistr = NULL; + } + + /* CPT expression (optional) */ + if (*str == '[') { + elstr = str; + str += strcspn(str, "]"); + if (*str != ']') { + str = elstr; + goto failed_syntax; + } + rc = cfs_expr_list_parse(elstr, str - elstr + 1, + 0, LNET_CPT_NUMBER - 1, + &net_el); + if (rc != 0) { + str = elstr; + goto failed_syntax; + } + *elstr = '\0'; + do { + *str++ = '\0'; + } while (isspace(*str)); + } + + /* Bad delimiters */ + if (*str && (strchr(DELIMITERS, *str) != NULL)) + goto failed_syntax; + + /* go to the next net if it exits */ + str += strcspn(str, ","); + if (*str == ',') + *str++ = '\0'; + + /* + * At this point the name is properly terminated. 
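+ * For reference, a typical value accepted by this loop has the form
+ * networks="tcp0(eth0,eth1)[0-3],o2ib0(ib0)" (an illustrative
+ * example, not taken from a real configuration).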
+ */ + net_id = libcfs_str2net(name); + if (net_id == LNET_NIDNET(LNET_NID_ANY)) { + LCONSOLE_ERROR_MSG(0x113, + "Unrecognised network type\n"); + str = name; + goto failed_syntax; + } + + if (LNET_NETTYP(net_id) == LOLND) { + /* Loopback is implicit, and there can be only one. */ + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + /* Should we error out instead? */ + continue; + } + + /* + * All network paramaters are now known. + */ + nnets++; + + /* always allocate a net, since we will eventually add an + * interface to it, or we will fail, in which case we'll + * just delete it */ + net = lnet_net_alloc(net_id, netlist); + if (IS_ERR_OR_NULL(net)) + goto failed; + + if (!nistr || + (use_tcp_bonding && LNET_NETTYP(net_id) == SOCKLND)) { + /* + * No interface list was specified, allocate a + * ni using the defaults. + */ + ni = lnet_ni_alloc(net, net_el, NULL); + if (IS_ERR_OR_NULL(ni)) + goto failed; + + if (!nistr) { + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + continue; + } + } + + do { + elstr = NULL; + + /* Interface name (mandatory) */ + while (isspace(*nistr)) + *nistr++ = '\0'; + name = nistr; + nistr += strcspn(nistr, SPACESTR "[],"); + while (isspace(*nistr)) + *nistr++ = '\0'; + + /* CPT expression (optional) */ + if (*nistr == '[') { + elstr = nistr; + nistr += strcspn(nistr, "]"); + if (*nistr != ']') { + str = elstr; + goto failed_syntax; + } + rc = cfs_expr_list_parse(elstr, + nistr - elstr + 1, + 0, LNET_CPT_NUMBER - 1, + &ni_el); + if (rc != 0) { + str = elstr; + goto failed_syntax; + } + *elstr = '\0'; + do { + *nistr++ = '\0'; + } while (isspace(*nistr)); + } else { + ni_el = net_el; + } + + /* + * End of single interface specificaton, + * advance to the start of the next one, if + * any. + */ + if (*nistr == ',') { + do { + *nistr++ = '\0'; + } while (isspace(*nistr)); + if (!*nistr) { + str = nistr; + goto failed_syntax; + } + } else if (*nistr) { + str = nistr; + goto failed_syntax; + } + + /* + * At this point the name is properly terminated. + */ + if (!*name) { + str = name; + goto failed_syntax; + } + + if (use_tcp_bonding && + LNET_NETTYP(net->net_id) == SOCKLND) { + rc = lnet_ni_add_interface(ni, name); + if (rc != 0) + goto failed; + } else { + ni = lnet_ni_alloc(net, ni_el, name); + if (IS_ERR_OR_NULL(ni)) + goto failed; + } + + if (ni_el) { + if (ni_el != net_el) { + cfs_expr_list_free(ni_el); + ni_el = NULL; + } + } + } while (*nistr); + + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + } while (*str); + + LIBCFS_FREE(tokens, tokensize); + return nnets; + + failed_syntax: + lnet_syntax("networks", networks, (int)(str - tokens), strlen(str)); + failed: + /* free the net list and all the nis on each net */ + while (!list_empty(netlist)) { + net = list_entry(netlist->next, struct lnet_net, net_list); + + list_del_init(&net->net_list); + lnet_net_free(net); + } + + if (ni_el && ni_el != net_el) + cfs_expr_list_free(ni_el); + if (net_el) + cfs_expr_list_free(net_el); + + LIBCFS_FREE(tokens, tokensize); + + return -EINVAL; +} + +static struct lnet_text_buf *lnet_new_text_buf(int str_len) +{ + struct lnet_text_buf *ltb; + int nob; + + /* NB allocate space for the terminating 0 */ + nob = offsetof(struct lnet_text_buf, ltb_text[str_len + 1]); + if (nob > LNET_SINGLE_TEXTBUF_NOB) { + /* _way_ conservative for "route net gateway..." 
*/ + CERROR("text buffer too big\n"); + return NULL; + } + + if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) { + CERROR("Too many text buffers\n"); + return NULL; + } + + LIBCFS_ALLOC(ltb, nob); + if (ltb == NULL) + return NULL; + + ltb->ltb_size = nob; + ltb->ltb_text[0] = 0; + lnet_tbnob += nob; + return ltb; +} + +static void +lnet_free_text_buf(struct lnet_text_buf *ltb) +{ + lnet_tbnob -= ltb->ltb_size; + LIBCFS_FREE(ltb, ltb->ltb_size); +} + +static void +lnet_free_text_bufs(struct list_head *tbs) +{ + struct lnet_text_buf *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list); + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } +} + +void +lnet_print_text_bufs(struct list_head *tbs) +{ + struct list_head *tmp; + struct lnet_text_buf *ltb; + + list_for_each(tmp, tbs) { + ltb = list_entry(tmp, struct lnet_text_buf, ltb_list); + + CDEBUG(D_WARNING, "%s\n", ltb->ltb_text); + } + + CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob); +} + +static int +lnet_str2tbs_sep(struct list_head *tbs, char *str) +{ + struct list_head pending; + char *sep; + int nob; + int i; + struct lnet_text_buf *ltb; + + INIT_LIST_HEAD(&pending); + + /* Split 'str' into separate commands */ + for (;;) { + /* skip leading whitespace */ + while (isspace(*str)) + str++; + + /* scan for separator or comment */ + for (sep = str; *sep != 0; sep++) + if (lnet_issep(*sep) || *sep == '#') + break; + + nob = (int)(sep - str); + if (nob > 0) { + ltb = lnet_new_text_buf(nob); + if (ltb == NULL) { + lnet_free_text_bufs(&pending); + return -ENOMEM; + } + + for (i = 0; i < nob; i++) + if (isspace(str[i])) + ltb->ltb_text[i] = ' '; + else + ltb->ltb_text[i] = str[i]; + + ltb->ltb_text[nob] = 0; + + list_add_tail(<b->ltb_list, &pending); + } + + if (*sep == '#') { + /* scan for separator */ + do { + sep++; + } while (*sep != 0 && !lnet_issep(*sep)); + } + + if (*sep == 0) + break; + + str = sep + 1; + } + + list_splice(&pending, tbs->prev); + return 0; +} + +static int +lnet_expand1tb(struct list_head *list, + char *str, char *sep1, char *sep2, + char *item, int itemlen) +{ + int len1 = (int)(sep1 - str); + int len2 = strlen(sep2 + 1); + struct lnet_text_buf *ltb; + + LASSERT (*sep1 == '['); + LASSERT (*sep2 == ']'); + + ltb = lnet_new_text_buf(len1 + itemlen + len2); + if (ltb == NULL) + return -ENOMEM; + + memcpy(ltb->ltb_text, str, len1); + memcpy(<b->ltb_text[len1], item, itemlen); + memcpy(<b->ltb_text[len1+itemlen], sep2 + 1, len2); + ltb->ltb_text[len1 + itemlen + len2] = 0; + + list_add_tail(<b->ltb_list, list); + return 0; +} + +static int +lnet_str2tbs_expand(struct list_head *tbs, char *str) +{ + char num[16]; + struct list_head pending; + char *sep; + char *sep2; + char *parsed; + char *enditem; + int lo; + int hi; + int stride; + int i; + int nob; + int scanned; + + INIT_LIST_HEAD(&pending); + + sep = strchr(str, '['); + if (sep == NULL) /* nothing to expand */ + return 0; + + sep2 = strchr(sep, ']'); + if (sep2 == NULL) + goto failed; + + for (parsed = sep; parsed < sep2; parsed = enditem) { + + enditem = ++parsed; + while (enditem < sep2 && *enditem != ',') + enditem++; + + if (enditem == parsed) /* no empty items */ + goto failed; + + if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) { + + if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) { + + /* simple string enumeration */ + if (lnet_expand1tb(&pending, str, sep, sep2, + parsed, (int)(enditem - parsed)) != 0) + goto failed; + + continue; + } + + stride = 1; + } + + /* range expansion */ + + if 
(enditem != parsed + scanned) /* no trailing junk */ + goto failed; + + if (hi < 0 || lo < 0 || stride < 0 || hi < lo || + (hi - lo) % stride != 0) + goto failed; + + for (i = lo; i <= hi; i += stride) { + + snprintf(num, sizeof(num), "%d", i); + nob = strlen(num); + if (nob + 1 == sizeof(num)) + goto failed; + + if (lnet_expand1tb(&pending, str, sep, sep2, + num, nob) != 0) + goto failed; + } + } + + list_splice(&pending, tbs->prev); + return 1; + + failed: + lnet_free_text_bufs(&pending); + return -EINVAL; +} + +static int +lnet_parse_hops (char *str, unsigned int *hops) +{ + int len = strlen(str); + int nob = len; + + return (sscanf(str, "%u%n", hops, &nob) >= 1 && + nob == len && + *hops > 0 && *hops < 256); +} + +#define LNET_PRIORITY_SEPARATOR (':') + +static int +lnet_parse_priority(char *str, unsigned int *priority, char **token) +{ + int nob; + char *sep; + int len; + + sep = strchr(str, LNET_PRIORITY_SEPARATOR); + if (sep == NULL) { + *priority = 0; + return 0; + } + len = strlen(sep + 1); + + if ((sscanf((sep+1), "%u%n", priority, &nob) < 1) || (len != nob)) { + /* Update the caller's token pointer so it treats the found + priority as the token to report in the error message. */ + *token += sep - str + 1; + return -EINVAL; + } + + CDEBUG(D_NET, "gateway %s, priority %d, nob %d\n", str, *priority, nob); + + /* + * Change priority separator to \0 to be able to parse NID + */ + *sep = '\0'; + return 0; +} + +static int +lnet_parse_route(char *str, int *im_a_router) +{ + /* static scratch buffer OK (single threaded) */ + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head nets; + struct list_head gateways; + struct list_head *tmp1; + struct list_head *tmp2; + __u32 net; + lnet_nid_t nid; + struct lnet_text_buf *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + __u32 hops; + int got_hops = 0; + unsigned int priority = 0; + + INIT_LIST_HEAD(&gateways); + INIT_LIST_HEAD(&nets); + + /* save a copy of the string for error messages */ + strncpy(cmd, str, sizeof(cmd)); + cmd[sizeof(cmd) - 1] = '\0'; + + sep = str; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) { + if (ntokens < (got_hops ? 3 : 2)) + goto token_error; + break; + } + + ntokens++; + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens == 1) { + tmp2 = &nets; /* expanding nets */ + } else if (ntokens == 2 && + lnet_parse_hops(token, &hops)) { + got_hops = 1; /* got a hop count */ + continue; + } else { + tmp2 = &gateways; /* expanding gateways */ + } + + ltb = lnet_new_text_buf(strlen(token)); + if (ltb == NULL) + goto out; + + strcpy(ltb->ltb_text, token); + tmp1 = <b->ltb_list; + list_add_tail(tmp1, tmp2); + + while (tmp1 != tmp2) { + ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list); + + rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); + if (rc < 0) + goto token_error; + + tmp1 = tmp1->next; + + if (rc > 0) { /* expanded! 
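+ * e.g. an element such as "192.168.0.[2-4]@tcp" (an
+ * illustrative gateway NID) was expanded into one text
+ * buffer per NID, so drop the original entry below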
*/ + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + continue; + } + + if (ntokens == 1) { + net = libcfs_str2net(ltb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND) + goto token_error; + } else { + rc = lnet_parse_priority(ltb->ltb_text, + &priority, &token); + if (rc < 0) + goto token_error; + + nid = libcfs_str2nid(ltb->ltb_text); + if (nid == LNET_NID_ANY || nid == LNET_NID_LO_0) + goto token_error; + } + } + } + + /* if there are no hops set then we want to flag this value as + * unset since hops is an optional parameter */ + if (!got_hops) + hops = LNET_UNDEFINED_HOPS; + + LASSERT(!list_empty(&nets)); + LASSERT(!list_empty(&gateways)); + + list_for_each(tmp1, &nets) { + ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list); + net = libcfs_str2net(ltb->ltb_text); + LASSERT (net != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(tmp2, &gateways) { + ltb = list_entry(tmp2, struct lnet_text_buf, ltb_list); + nid = libcfs_str2nid(ltb->ltb_text); + LASSERT(nid != LNET_NID_ANY); + + if (lnet_islocalnid(nid)) { + *im_a_router = 1; + continue; + } + + rc = lnet_add_route(net, hops, nid, priority); + if (rc != 0 && rc != -EEXIST && rc != -EHOSTUNREACH) { + CERROR("Can't create route " + "to %s via %s\n", + libcfs_net2str(net), + libcfs_nid2str(nid)); + goto out; + } + } + } + + myrc = 0; + goto out; + +token_error: + lnet_syntax("routes", cmd, (int)(token - str), strlen(token)); +out: + lnet_free_text_bufs(&nets); + lnet_free_text_bufs(&gateways); + return myrc; +} + +static int +lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) +{ + struct lnet_text_buf *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list); + + if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { + lnet_free_text_bufs(tbs); + return -EINVAL; + } + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } + + return 0; +} + +int +lnet_parse_routes (char *routes, int *im_a_router) +{ + struct list_head tbs; + int rc = 0; + + *im_a_router = 0; + + INIT_LIST_HEAD(&tbs); + + if (lnet_str2tbs_sep(&tbs, routes) < 0) { + CERROR("Error parsing routes\n"); + rc = -EINVAL; + } else { + rc = lnet_parse_route_tbs(&tbs, im_a_router); + } + + LASSERT (lnet_tbnob == 0); + return rc; +} + +static int +lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip) +{ + struct list_head list = LIST_HEAD_INIT(list); + int rc; + int i; + + rc = cfs_ip_addr_parse(token, len, &list); + if (rc != 0) + return rc; + + for (rc = i = 0; !rc && i < nip; i++) + rc = cfs_ip_addr_match(ipaddrs[i], &list); + + cfs_expr_list_free_list(&list); + + return rc; +} + +static int +lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) +{ + static char tokens[LNET_SINGLE_TEXTBUF_NOB]; + + int matched = 0; + int ntokens = 0; + int len; + char *net = NULL; + char *sep; + char *token; + int rc; + + LASSERT(strlen(net_entry) < sizeof(tokens)); + + /* work on a copy of the string */ + strcpy(tokens, net_entry); + sep = tokens; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) + break; + + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens++ == 0) { + net = token; + continue; + } + + len = strlen(token); + + rc = lnet_match_network_token(token, len, ipaddrs, nip); + if (rc < 0) { + lnet_syntax("ip2nets", net_entry, + (int)(token - tokens), len); + return rc; + } + + matched |= (rc != 0); + } + + if (!matched) + return 0; + + 
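+ /* at least one local IP matched; keep only the leading net spec,
+  * e.g. an ip2nets entry like "tcp(eth0) 192.168.0.[2-10]"
+  * (illustrative) collapses to just "tcp(eth0)" */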
strcpy(net_entry, net); /* replace with matched net */ + return 1; +} + +static __u32 +lnet_netspec2net(char *netspec) +{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +static int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + struct lnet_text_buf *tb; + struct lnet_text_buf *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT(!list_empty(nets)); + LASSERT(nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, struct lnet_text_buf, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (int)(bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, struct lnet_text_buf, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += (int)(sep - tb->ltb_text); + len = strlen(sep); + tb2 = lnet_new_text_buf(len); + if (tb2 == NULL) + return -ENOMEM; + + strncpy(tb2->ltb_text, sep, len); + tb2->ltb_text[len] = '\0'; + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +} + +static int +lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) +{ + static char networks[LNET_SINGLE_TEXTBUF_NOB]; + static char source[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head raw_entries; + struct list_head matched_nets; + struct list_head current_nets; + struct list_head *t; + struct list_head *t2; + struct lnet_text_buf *tb; + struct lnet_text_buf *tb2; + __u32 net1; + __u32 net2; + int len; + int count; + int dup; + int rc; + + INIT_LIST_HEAD(&raw_entries); + if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { + CERROR("Error parsing ip2nets\n"); + LASSERT(lnet_tbnob == 0); + return -EINVAL; + } + + INIT_LIST_HEAD(&matched_nets); + INIT_LIST_HEAD(¤t_nets); + networks[0] = 0; + count = 0; + len = 0; + rc = 0; + + while (!list_empty(&raw_entries)) { + tb = list_entry(raw_entries.next, struct lnet_text_buf, + ltb_list); + + strncpy(source, tb->ltb_text, sizeof(source)); + source[sizeof(source) - 1] = '\0'; + + /* replace ltb_text with the network(s) add on match */ + rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); + if (rc < 0) + break; + + list_del(&tb->ltb_list); + + if (rc == 0) { /* no match */ + lnet_free_text_buf(tb); + continue; + } + + /* split into separate networks */ + INIT_LIST_HEAD(¤t_nets); + list_add(&tb->ltb_list, ¤t_nets); + rc = lnet_splitnets(source, ¤t_nets); + if (rc < 0) + break; + + dup = 0; + list_for_each(t, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf, ltb_list); + net1 = lnet_netspec2net(tb->ltb_text); + LASSERT(net1 != LNET_NIDNET(LNET_NID_ANY)); + + 
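+ /* reject this entry if any of its nets was already produced by an
+  * earlier matched ip2nets entry */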
list_for_each(t2, &matched_nets) { + tb2 = list_entry(t2, struct lnet_text_buf, + ltb_list); + net2 = lnet_netspec2net(tb2->ltb_text); + LASSERT(net2 != LNET_NIDNET(LNET_NID_ANY)); + + if (net1 == net2) { + dup = 1; + break; + } + } + + if (dup) + break; + } + + if (dup) { + lnet_free_text_bufs(¤t_nets); + continue; + } + + list_for_each_safe(t, t2, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf, ltb_list); + + list_del(&tb->ltb_list); + list_add_tail(&tb->ltb_list, &matched_nets); + + len += snprintf(networks + len, sizeof(networks) - len, + "%s%s", (len == 0) ? "" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT(lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} +/* + * kernel 5.3: commit ef11db3310e272d3d8dbe8739e0770820dd20e52 + * kernel 4.18.0-193.el8: + * added in_dev_for_each_ifa_rtnl and in_dev_for_each_ifa_rcu + * and removed for_ifa and endfor_ifa. + * Use the _rntl variant as the current locking is rtnl. + */ +#ifdef HAVE_IN_DEV_FOR_EACH_IFA_RTNL +#define DECLARE_CONST_IN_IFADDR(ifa) const struct in_ifaddr *ifa +#define endfor_ifa(in_dev) +#else +#define DECLARE_CONST_IN_IFADDR(ifa) +#define in_dev_for_each_ifa_rtnl(ifa, in_dev) for_ifa((in_dev)) +#endif + +int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns) +{ + struct lnet_inetdev *ifaces = NULL; + struct net_device *dev; + int nalloc = 0; + int nip = 0; + DECLARE_CONST_IN_IFADDR(ifa); + + rtnl_lock(); + for_each_netdev(ns, dev) { + int flags = dev_get_flags(dev); + struct in_device *in_dev; + int node_id; + int cpt; + + if (flags & IFF_LOOPBACK) /* skip the loopback IF */ + continue; + + if (!(flags & IFF_UP)) { + CWARN("lnet: Ignoring interface %s: it's down\n", + dev->name); + continue; + } + + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { + CWARN("lnet: Interface %s has no IPv4 status.\n", + dev->name); + continue; + } + + node_id = dev_to_node(&dev->dev); + cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id); + + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (nip >= nalloc) { + struct lnet_inetdev *tmp; + + nalloc += LNET_INTERFACES_NUM; + tmp = krealloc(ifaces, nalloc * sizeof(*tmp), + GFP_KERNEL); + if (!tmp) { + kfree(ifaces); + ifaces = NULL; + nip = -ENOMEM; + goto unlock_rtnl; + } + ifaces = tmp; + } + + ifaces[nip].li_cpt = cpt; + ifaces[nip].li_flags = flags; + ifaces[nip].li_ipaddr = ntohl(ifa->ifa_local); + ifaces[nip].li_netmask = ntohl(ifa->ifa_mask); + strlcpy(ifaces[nip].li_name, ifa->ifa_label, + sizeof(ifaces[nip].li_name)); + nip++; + } + endfor_ifa(in_dev); + } +unlock_rtnl: + rtnl_unlock(); + + if (nip == 0) { + CERROR("lnet: Can't find any usable interfaces, rc = -ENOENT\n"); + nip = -ENOENT; + } + + *dev_list = ifaces; + return nip; +} +EXPORT_SYMBOL(lnet_inet_enumerate); + +int +lnet_parse_ip2nets (char **networksp, char *ip2nets) +{ + struct lnet_inetdev *ifaces = NULL; + __u32 *ipaddrs = NULL; + int nip; + int rc; + int i; + + if (current->nsproxy && current->nsproxy->net_ns) + nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); + else + nip = lnet_inet_enumerate(&ifaces, &init_net); + if (nip < 0) { + if (nip != -ENOENT) { + LCONSOLE_ERROR_MSG(0x117, + "Error %d enumerating local IP interfaces for ip2nets to match\n", + nip); + } else { + LCONSOLE_ERROR_MSG(0x118, + "No local IP interfaces for 
ip2nets to match\n"); + } + return nip; + } + + LIBCFS_ALLOC(ipaddrs, nip * sizeof(*ipaddrs)); + if (!ipaddrs) { + rc = -ENOMEM; + CERROR("lnet: Can't allocate ipaddrs[%d], rc = %d\n", + nip, rc); + goto out_free_addrs; + } + + for (i = 0; i < nip; i++) + ipaddrs[i] = ifaces[i].li_ipaddr; + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + if (rc < 0) { + LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); + } else if (rc == 0) { + LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match " + "any local IP interfaces\n"); + rc = -ENOENT; + } + LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); +out_free_addrs: + kfree(ifaces); + return rc > 0 ? 0 : rc; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c new file mode 100644 index 0000000000000..354c9768a3a1d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c @@ -0,0 +1,421 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-eq.c + * + * Library level Event queue management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +/** + * Create an event queue that has room for \a count number of events. + * + * The event queue is circular and older events will be overwritten by new + * ones if they are not removed in time by the user using the functions + * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to + * determine the appropriate size of the event queue to prevent this loss + * of events. Note that when EQ handler is specified in \a callback, no + * event loss can happen, since the handler is run for each event deposited + * into the EQ. + * + * \param count The number of events to be stored in the event queue. It + * will be rounded up to the next power of two. + * \param callback A handler function that runs when an event is deposited + * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to + * indicate that no event handler is desired. + * \param handle On successful return, this location will hold a handle for + * the newly created EQ. + * + * \retval 0 On success. + * \retval -EINVAL If an parameter is not valid. + * \retval -ENOMEM If memory for the EQ can't be allocated. + * + * \see lnet_eq_handler_t for the discussion on EQ handler semantics. 
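+ *
+ * A minimal polling-style allocation might look like the following
+ * (illustrative sketch only; error handling omitted):
+ *
+ *   struct lnet_handle_eq eqh;
+ *   int rc;
+ *
+ *   rc = LNetEQAlloc(64, LNET_EQ_HANDLER_NONE, &eqh);
+ *
+ * MDs subsequently pass \a eqh in their eq_handle field so that their
+ * events are deposited into this queue.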
+ */ +int +LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, + struct lnet_handle_eq *handle) +{ + struct lnet_eq *eq; + + LASSERT(the_lnet.ln_refcount > 0); + + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparent capacity at all times */ + + if (count) + count = roundup_pow_of_two(count); + + if (callback != LNET_EQ_HANDLER_NONE && count != 0) { + CWARN("EQ callback is guaranteed to get every event, " + "do you still want to set eqcount %d for polling " + "event which will have locking overhead? " + "Please contact with developer to confirm\n", count); + } + + /* count can be 0 if only need callback, we can eliminate + * overhead of enqueue event */ + if (count == 0 && callback == LNET_EQ_HANDLER_NONE) + return -EINVAL; + + eq = lnet_eq_alloc(); + if (eq == NULL) + return -ENOMEM; + + if (count != 0) { + LIBCFS_ALLOC(eq->eq_events, count * sizeof(struct lnet_event)); + if (eq->eq_events == NULL) + goto failed; + /* NB allocator has set all event sequence numbers to 0, + * so all them should be earlier than eq_deq_seq */ + } + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; + eq->eq_callback = callback; + + eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*eq->eq_refs[0])); + if (eq->eq_refs == NULL) + goto failed; + + /* MUST hold both exclusive lnet_res_lock */ + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); + list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); + + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_eq2handle(handle, eq); + return 0; + +failed: + if (eq->eq_events != NULL) + LIBCFS_FREE(eq->eq_events, count * sizeof(struct lnet_event)); + + if (eq->eq_refs != NULL) + cfs_percpt_free(eq->eq_refs); + + lnet_eq_free(eq); + return -ENOMEM; +} +EXPORT_SYMBOL(LNetEQAlloc); + +/** + * Release the resources associated with an event queue if it's idle; + * otherwise do nothing and it's up to the user to try again. + * + * \param eqh A handle for the event queue to be released. + * + * \retval 0 If the EQ is not in use and freed. + * \retval -ENOENT If \a eqh does not point to a valid EQ. + * \retval -EBUSY If the EQ is still in use by some MDs. 
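+ *
+ * Note that -EBUSY only indicates that some MD still holds a reference
+ * to the EQ; the call can simply be retried once those MDs have been
+ * unlinked.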
+ */ +int +LNetEQFree(struct lnet_handle_eq eqh) +{ + struct lnet_eq *eq; + struct lnet_event *events = NULL; + int **refs = NULL; + int *ref; + int rc = 0; + int size = 0; + int i; + + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + eq = lnet_handle2eq(&eqh); + if (eq == NULL) { + rc = -ENOENT; + goto out; + } + + cfs_percpt_for_each(ref, i, eq->eq_refs) { + LASSERT(*ref >= 0); + if (*ref == 0) + continue; + + CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", + i, *ref); + rc = -EBUSY; + goto out; + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + refs = eq->eq_refs; + + lnet_res_lh_invalidate(&eq->eq_lh); + list_del(&eq->eq_list); + lnet_eq_free(eq); + out: + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + if (events != NULL) + LIBCFS_FREE(events, size * sizeof(struct lnet_event)); + if (refs != NULL) + cfs_percpt_free(refs); + + return rc; +} +EXPORT_SYMBOL(LNetEQFree); + +void +lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev) +{ + /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ + int index; + + if (eq->eq_size == 0) { + LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); + eq->eq_callback(ev); + return; + } + + lnet_eq_wait_lock(); + ev->sequence = eq->eq_enq_seq++; + + LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); + index = ev->sequence & (eq->eq_size - 1); + + eq->eq_events[index] = *ev; + + if (eq->eq_callback != LNET_EQ_HANDLER_NONE) + eq->eq_callback(ev); + + /* Wake anyone waiting in LNetEQPoll() */ + if (waitqueue_active(&the_lnet.ln_eq_waitq)) + wake_up_all(&the_lnet.ln_eq_waitq); + lnet_eq_wait_unlock(); +} + +static int +lnet_eq_dequeue_event(struct lnet_eq *eq, struct lnet_event *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + struct lnet_event *new_event = &eq->eq_events[new_index]; + int rc; + ENTRY; + + /* must called with lnet_eq_wait_lock hold */ + if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) + RETURN(0); + + /* We've got a new event... */ + *ev = *new_event; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + /* ...but did it overwrite an event we've not seen yet? */ + if (eq->eq_deq_seq == new_event->sequence) { + rc = 1; + } else { + /* don't complain with CERROR: some EQs are sized small + * anyway; if it's important, the caller should complain */ + CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = -EOVERFLOW; + } + + eq->eq_deq_seq = new_event->sequence + 1; + RETURN(rc); +} + +/** + * A nonblocking function that can be used to get the next event in an EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. The event is removed from the queue. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 0 No pending event in the EQ. + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. 
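+ *
+ * A consumer draining the queue might loop as follows (illustrative
+ * sketch only; handle_event() is a hypothetical callback, and a
+ * return of -EOVERFLOW still delivers a valid event):
+ *
+ *   while ((rc = LNetEQGet(eqh, &ev)) != 0) {
+ *           if (rc == -ENOENT)
+ *                   break;
+ *           handle_event(&ev);
+ *   }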
+ */ +int +LNetEQGet(struct lnet_handle_eq eventq, struct lnet_event *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, 0, + event, &which); +} +EXPORT_SYMBOL(LNetEQGet); + +/** + * Block the calling process until there is an event in the EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. This function returns the next event + * in the EQ and removes it from the EQ. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. + */ +int +LNetEQWait(struct lnet_handle_eq eventq, struct lnet_event *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, MAX_SCHEDULE_TIMEOUT, + event, &which); +} +EXPORT_SYMBOL(LNetEQWait); + +static int +lnet_eq_wait_locked(signed long *timeout) +__must_hold(&the_lnet.ln_eq_wait_lock) +{ + signed long tms = *timeout; + wait_queue_entry_t wl; + int wait; + + if (tms == 0) + return -ENXIO; /* don't want to wait and no new event */ + + init_waitqueue_entry(&wl, current); + add_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + lnet_eq_wait_unlock(); + + tms = schedule_timeout_interruptible(tms); + wait = tms != 0; /* might need to call here again */ + *timeout = tms; + + lnet_eq_wait_lock(); + remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + return wait; +} + +/** + * Block the calling process until there's an event from a set of EQs or + * timeout happens. + * + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully, in which case the corresponding event + * is consumed. + * + * LNetEQPoll() provides a timeout to allow applications to poll, block for a + * fixed period, or block indefinitely. + * + * \param eventqs,neq An array of EQ handles, and size of the array. + * \param timeout Time in jiffies to wait for an event to occur on + * one of the EQs. The constant MAX_SCHEDULE_TIMEOUT can be used to indicate an + * infinite timeout. + * \param event,which On successful return (1 or -EOVERFLOW), \a event will + * hold the next event in the EQs, and \a which will contain the index of the + * EQ from which the event was taken. + * + * \retval 0 No pending event in the EQs after timeout. + * \retval 1 Indicates success. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ indicated by \a which has been dropped due to limited space in the EQ. + * \retval -ENOENT If there's an invalid handle in \a eventqs. 
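+ *
+ * For example (illustrative sketch only, with eqs being a
+ * caller-supplied array of two valid EQ handles), waiting up to one
+ * second for an event:
+ *
+ *   struct lnet_event ev;
+ *   int which;
+ *   int rc = LNetEQPoll(eqs, 2, msecs_to_jiffies(1000), &ev, &which);
+ *
+ * A return of 1 or -EOVERFLOW means \a ev holds an event taken from
+ * eqs[which]; a return of 0 means the timeout expired first.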
+ */ +int +LNetEQPoll(struct lnet_handle_eq *eventqs, int neq, signed long timeout, + struct lnet_event *event, int *which) +{ + int wait = 1; + int rc; + int i; + ENTRY; + + LASSERT(the_lnet.ln_refcount > 0); + + if (neq < 1) + RETURN(-ENOENT); + + lnet_eq_wait_lock(); + + for (;;) { + for (i = 0; i < neq; i++) { + struct lnet_eq *eq = lnet_handle2eq(&eventqs[i]); + + if (eq == NULL) { + lnet_eq_wait_unlock(); + RETURN(-ENOENT); + } + + rc = lnet_eq_dequeue_event(eq, event); + if (rc != 0) { + lnet_eq_wait_unlock(); + *which = i; + RETURN(rc); + } + } + + if (wait == 0) + break; + + /* + * return value of lnet_eq_wait_locked: + * -1 : did nothing and it's sure no new event + * 1 : sleep inside and wait until new event + * 0 : don't want to wait anymore, but might have new event + * so need to call dequeue again + */ + wait = lnet_eq_wait_locked(&timeout); + if (wait < 0) /* no new event */ + break; + } + + lnet_eq_wait_unlock(); + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c new file mode 100644 index 0000000000000..9bf890c9477b6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c @@ -0,0 +1,557 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-md.c + * + * Memory Descriptor management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* must be called with lnet_res_lock held */ +void +lnet_md_unlink(struct lnet_libmd *md) +{ + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... 
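+ * mark the MD as a zombie and detach it from its ME and from the
+ * handle table; the structure itself is only freed further below,
+ * once md_refcount has dropped to zero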
*/ + struct lnet_me *me = md->md_me; + + md->md_flags |= LNET_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), and unlink it if it was created + * with LNET_UNLINK */ + if (me != NULL) { + /* detach MD from portal */ + lnet_ptl_detach_md(me, md); + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); + } + + /* ensure all future handle lookups fail */ + lnet_res_lh_invalidate(&md->md_lh); + } + + if (md->md_refcount != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if (md->md_eq != NULL) { + int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + + LASSERT(*md->md_eq->eq_refs[cpt] > 0); + (*md->md_eq->eq_refs[cpt])--; + } + + LASSERT(!list_empty(&md->md_list)); + list_del_init(&md->md_list); + lnet_md_free(md); +} + +struct page * +lnet_kvaddr_to_page(unsigned long vaddr) +{ + if (is_vmalloc_addr((void *)vaddr)) + return vmalloc_to_page((void *)vaddr); + +#ifdef CONFIG_HIGHMEM + +#ifdef HAVE_KMAP_TO_PAGE + /* + * This ifdef is added to handle the kernel versions + * which have kmap_to_page() function exported. If so, + * we should use it. Otherwise, remain with the legacy check. + */ + return kmap_to_page((void *)vaddr); +#else + + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } + return virt_to_page(vaddr); +#endif /* HAVE_KMAP_TO_PAGE */ +#else + + return virt_to_page(vaddr); +#endif /* CONFIG_HIGHMEM */ +} +EXPORT_SYMBOL(lnet_kvaddr_to_page); + +int +lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset) +{ + int cpt = CFS_CPT_ANY; + unsigned int niov; + + /* + * if the md_options has a bulk handle then we want to look at the + * bulk md because that's the data which we will be DMAing + */ + if (md && (md->md_options & LNET_MD_BULK_HANDLE) != 0 && + !LNetMDHandleIsInvalid(md->md_bulk_handle)) + md = lnet_handle2md(&md->md_bulk_handle); + + if (!md || md->md_niov == 0) + return CFS_CPT_ANY; + + niov = md->md_niov; + + /* + * There are three cases to handle: + * 1. The MD is using lnet_kiov_t + * 2. The MD is using struct kvec + * 3. Contiguous buffer allocated via vmalloc + * + * in case 2 we can use virt_to_page() macro to get the page + * address of the memory kvec describes. + * + * in case 3 use is_vmalloc_addr() and vmalloc_to_page() + * + * The offset provided can be within the first iov/kiov entry or + * it could go beyond it. In that case we need to make sure to + * look at the page which actually contains the data that will be + * DMAed. 
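+ *
+ * For example (illustrative numbers only): with three 4 KiB kiov
+ * entries and an offset of 9000, the walk below skips the first two
+ * entries and lands in the third at offset 808; that entry's page
+ * then determines the CPT.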
+ */ + if ((md->md_options & LNET_MD_KIOV) != 0) { + lnet_kiov_t *kiov = md->md_iov.kiov; + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + niov--; + kiov++; + if (niov == 0) { + CERROR("offset %d goes beyond kiov\n", offset); + goto out; + } + } + + cpt = cfs_cpt_of_node(lnet_cpt_table(), + page_to_nid(kiov->kiov_page)); + } else { + struct kvec *iov = md->md_iov.iov; + unsigned long vaddr; + struct page *page; + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + if (niov == 0) { + CERROR("offset %d goes beyond iov\n", offset); + goto out; + } + } + + vaddr = ((unsigned long)iov->iov_base) + offset; + page = lnet_kvaddr_to_page(vaddr); + if (!page) { + CERROR("Couldn't resolve vaddr 0x%lx to page\n", vaddr); + goto out; + } + cpt = cfs_cpt_of_node(lnet_cpt_table(), page_to_nid(page)); + } + +out: + return cpt; +} + +static int +lnet_md_build(struct lnet_libmd *lmd, struct lnet_md *umd, int unlink) +{ + int i; + unsigned int niov; + int total_length = 0; + + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_eq = NULL; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0; + lmd->md_bulk_handle = umd->bulk_handle; + + if ((umd->options & LNET_MD_IOVEC) != 0) { + + if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */ + return -EINVAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof(lmd->md_iov.iov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the base address on trust */ + if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return -EINVAL; + + total_length += lmd->md_iov.iov[i].iov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return -EINVAL; + + } else if ((umd->options & LNET_MD_KIOV) != 0) { + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof(lmd->md_iov.kiov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the page pointer on trust */ + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE) + return -EINVAL; /* invalid length */ + + total_length += lmd->md_iov.kiov[i].kiov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return -EINVAL; + } else { /* contiguous */ + lmd->md_length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > (int)umd->length)) // illegal max_size + return -EINVAL; + } + + return 0; +} + +/* must be called with resource lock held */ +static int +lnet_md_link(struct lnet_libmd *md, struct lnet_handle_eq eq_handle, int cpt) +{ + struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; + + /* NB we are passed an allocated, but inactive md. + * if we return success, caller may lnet_md_unlink() it. + * otherwise caller may only lnet_md_free() it. + */ + /* This implementation doesn't know how to create START events or + * disable END events. 
Best to LASSERT our caller is compliant so + * we find out quickly... */ + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) + * LASSERT (eq == NULL); + */ + if (!LNetEQHandleIsInvalid(eq_handle)) { + md->md_eq = lnet_handle2eq(&eq_handle); + + if (md->md_eq == NULL) + return -ENOENT; + + (*md->md_eq->eq_refs[cpt])++; + } + + lnet_res_lh_initialize(container, &md->md_lh); + + LASSERT(list_empty(&md->md_list)); + list_add(&md->md_list, &container->rec_active); + + return 0; +} + +/* must be called with lnet_res_lock held */ +void +lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + umd->start = lmd->md_start; + umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ? + lmd->md_length : lmd->md_niov; + umd->threshold = lmd->md_threshold; + umd->max_size = lmd->md_max_size; + umd->options = lmd->md_options; + umd->user_ptr = lmd->md_user_ptr; + lnet_eq2handle(&umd->eq_handle, lmd->md_eq); +} + +static int +lnet_md_validate(struct lnet_md *umd) +{ + if (umd->start == NULL && umd->length != 0) { + CERROR("MD start pointer can not be NULL with length %u\n", + umd->length); + return -EINVAL; + } + + if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %u, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +/** + * Create a memory descriptor and attach it to a ME + * + * \param meh A handle for a ME to associate the new MD with. + * \param umd Provides initial values for the user-visible parts of a MD. + * Other than its use for initialization, there is no linkage between this + * structure and the MD maintained by the LNet. + * \param unlink A flag to indicate whether the MD is automatically unlinked + * when it becomes inactive, either because the operation threshold drops to + * zero or because the available memory becomes less than \a umd.max_size. + * (Note that the check for unlinking a MD only occurs after the completion + * of a successful operation on the MD.) The value LNET_UNLINK enables auto + * unlinking; the value LNET_RETAIN disables it. + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(). + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a + * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by + * calling LNetInvalidateHandle() on it. + * \retval -EBUSY If the ME pointed to by \a meh is already associated with + * a MD. 
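+ *
+ * Illustrative sketch only (not lifted from a real caller): 'buf',
+ * 'buf_len', 'meh' and 'eqh' are assumed to have been prepared
+ * elsewhere, e.g. 'meh' via LNetMEAttach().
+ *
+ *	struct lnet_md umd = {
+ *		.start     = buf,
+ *		.length    = buf_len,
+ *		.threshold = 1,
+ *		.options   = LNET_MD_OP_PUT,
+ *		.eq_handle = eqh,
+ *	};
+ *	struct lnet_handle_md mdh;
+ *	int rc = LNetMDAttach(meh, umd, LNET_UNLINK, &mdh);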
+ */ +int +LNetMDAttach(struct lnet_handle_me meh, struct lnet_md umd, + enum lnet_unlink unlink, struct lnet_handle_md *handle) +{ + struct list_head matches = LIST_HEAD_INIT(matches); + struct list_head drops = LIST_HEAD_INIT(drops); + struct lnet_me *me; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + if (rc != 0) + goto out_free; + + cpt = lnet_cpt_of_cookie(meh.cookie); + + lnet_res_lock(cpt); + + me = lnet_handle2me(&meh); + if (me == NULL) + rc = -ENOENT; + else if (me->me_md != NULL) + rc = -EBUSY; + else + rc = lnet_md_link(md, umd.eq_handle, cpt); + + if (rc != 0) + goto out_unlock; + + /* attach this MD to portal of ME and check if it matches any + * blocked msgs on this portal */ + lnet_ptl_attach_md(me, md, &matches, &drops); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + + lnet_drop_delayed_msg_list(&drops, "Bad match"); + lnet_recv_delayed_msg_list(&matches); + + return 0; + +out_unlock: + lnet_res_unlock(cpt); +out_free: + lnet_md_free(md); + return rc; +} +EXPORT_SYMBOL(LNetMDAttach); + +/** + * Create a "free floating" memory descriptor - a MD that is not associated + * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. + * + * \param umd,unlink See the discussion for LNetMDAttach(). + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), + * and LNetGet() operations. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that + * it's OK to supply a NULL \a umd.eq_handle by calling + * LNetInvalidateHandle() on it. + */ +int +LNetMDBind(struct lnet_md umd, enum lnet_unlink unlink, + struct lnet_handle_md *handle) +{ + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + if (rc != 0) + goto out_free; + + cpt = lnet_res_lock_current(); + + rc = lnet_md_link(md, umd.eq_handle, cpt); + if (rc != 0) + goto out_unlock; + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + return 0; + + out_unlock: + lnet_res_unlock(cpt); + + out_free: + lnet_md_free(md); + return rc; +} +EXPORT_SYMBOL(LNetMDBind); + +/** + * Unlink the memory descriptor from any ME it may be linked to and release + * the internal resources associated with it. As a result, active messages + * associated with the MD may get aborted. + * + * This function does not free the memory region associated with the MD; + * i.e., the memory the user allocated for this MD. If the ME associated with + * this MD is not NULL and was created with auto unlink enabled, the ME is + * unlinked as well (see LNetMEAttach()). 
+ * + * Explicitly unlinking a MD via this function call has the same behavior as + * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK + * is generated in the latter case. + * + * An unlinked event can be reported in two ways: + * - If there's no pending operations on the MD, it's unlinked immediately + * and an LNET_EVENT_UNLINK event is logged before this function returns. + * - Otherwise, the MD is only marked for deletion when this function + * returns, and the unlinked event will be piggybacked on the event of + * the completion of the last operation by setting the unlinked field of + * the event. No dedicated LNET_EVENT_UNLINK event is generated. + * + * Note that in both cases the unlinked field of the event is always set; no + * more event will happen on the MD after such an event is logged. + * + * \param mdh A handle for the MD to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a mdh does not point to a valid MD object. + */ +int +LNetMDUnlink(struct lnet_handle_md mdh) +{ + struct lnet_event ev; + struct lnet_libmd *md; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md->md_flags |= LNET_MD_FLAG_ABORTED; + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the LND is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMDUnlink); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-me.c b/drivers/staging/lustrefsx/lnet/lnet/lib-me.c new file mode 100644 index 0000000000000..1a1d9b1bdb671 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-me.c @@ -0,0 +1,291 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-me.c + * + * Match Entry management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** + * Create and attach a match entry to the match list of \a portal. The new + * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() + * can be used to attach a MD to an empty ME. + * + * \param portal The portal table index where the ME should be attached. 
+ * \param match_id Specifies the match criteria for the process ID of + * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be + * used to wildcard either of the identifiers in the struct lnet_process_id + * structure. + * \param match_bits,ignore_bits Specify the match criteria to apply + * to the match bits in the incoming request. The ignore bits are used + * to mask out insignificant bits in the incoming match bits. The resulting + * bits are then compared to the ME's match bits to determine if the + * incoming request meets the match criteria. + * \param unlink Indicates whether the ME should be unlinked when the memory + * descriptor associated with it is unlinked (Note that the check for + * unlinking a ME only occurs when the memory descriptor is unlinked.). + * Valid values are LNET_RETAIN and LNET_UNLINK. + * \param pos Indicates whether the new ME should be prepended or + * appended to the match list. Allowed constants: LNET_INS_BEFORE, + * LNET_INS_AFTER. + * \param handle On successful returns, a handle to the newly created ME + * object is saved here. This handle can be used later in LNetMEInsert(), + * LNetMEUnlink(), or LNetMDAttach() functions. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is invalid. + * \retval -ENOMEM If new ME object cannot be allocated. + */ +int +LNetMEAttach(unsigned int portal, + struct lnet_process_id match_id, + __u64 match_bits, __u64 ignore_bits, + enum lnet_unlink unlink, enum lnet_ins_pos pos, + struct lnet_handle_me *handle) +{ + struct lnet_match_table *mtable; + struct lnet_me *me; + struct list_head *head; + + LASSERT(the_lnet.ln_refcount > 0); + + if ((int)portal >= the_lnet.ln_nportals) + return -EINVAL; + + mtable = lnet_mt_of_attach(portal, match_id, + match_bits, ignore_bits, pos); + if (mtable == NULL) /* can't match portal type */ + return -EPERM; + + me = lnet_me_alloc(); + if (me == NULL) + return -ENOMEM; + + lnet_res_lock(mtable->mt_cpt); + + me->me_portal = portal; + me->me_match_id = match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt], + &me->me_lh); + if (ignore_bits != 0) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, match_id, match_bits); + + me->me_pos = head - &mtable->mt_mhash[0]; + if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) + list_add_tail(&me->me_list, head); + else + list_add(&me->me_list, head); + + lnet_me2handle(handle, me); + + lnet_res_unlock(mtable->mt_cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEAttach); + +/** + * Create and a match entry and insert it before or after the ME pointed to by + * \a current_meh. The new ME is empty, i.e. not associated with a memory + * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME. + * + * This function is identical to LNetMEAttach() except for the position + * where the new ME is inserted. + * + * \param current_meh A handle for a ME. The new ME will be inserted + * immediately before or immediately after this ME. + * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion + * for LNetMEAttach(). + * + * \retval 0 On success. + * \retval -ENOMEM If new ME object cannot be allocated. + * \retval -ENOENT If \a current_meh does not point to a valid match entry. 
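+ * \retval -EPERM If \a pos is LNET_INS_LOCAL, or if the portal of
+ * \a current_meh accepts only unique matches.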
+ */ +int +LNetMEInsert(struct lnet_handle_me current_meh, + struct lnet_process_id match_id, + __u64 match_bits, __u64 ignore_bits, + enum lnet_unlink unlink, enum lnet_ins_pos pos, + struct lnet_handle_me *handle) +{ + struct lnet_me *current_me; + struct lnet_me *new_me; + struct lnet_portal *ptl; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + + if (pos == LNET_INS_LOCAL) + return -EPERM; + + new_me = lnet_me_alloc(); + if (new_me == NULL) + return -ENOMEM; + + cpt = lnet_cpt_of_cookie(current_meh.cookie); + + lnet_res_lock(cpt); + + current_me = lnet_handle2me(¤t_meh); + if (current_me == NULL) { + lnet_me_free(new_me); + + lnet_res_unlock(cpt); + return -ENOENT; + } + + LASSERT(current_me->me_portal < the_lnet.ln_nportals); + + ptl = the_lnet.ln_portals[current_me->me_portal]; + if (lnet_ptl_is_unique(ptl)) { + /* nosense to insertion on unique portal */ + lnet_me_free(new_me); + lnet_res_unlock(cpt); + return -EPERM; + } + + new_me->me_pos = current_me->me_pos; + new_me->me_portal = current_me->me_portal; + new_me->me_match_id = match_id; + new_me->me_match_bits = match_bits; + new_me->me_ignore_bits = ignore_bits; + new_me->me_unlink = unlink; + new_me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh); + + if (pos == LNET_INS_AFTER) + list_add(&new_me->me_list, ¤t_me->me_list); + else + list_add_tail(&new_me->me_list, ¤t_me->me_list); + + lnet_me2handle(handle, new_me); + + lnet_res_unlock(cpt); + + return 0; +} +EXPORT_SYMBOL(LNetMEInsert); + +/** + * Unlink a match entry from its match list. + * + * This operation also releases any resources associated with the ME. If a + * memory descriptor is attached to the ME, then it will be unlinked as well + * and an unlink event will be generated. It is an error to use the ME handle + * after calling LNetMEUnlink(). + * + * \param meh A handle for the ME to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a meh does not point to a valid ME. + * \see LNetMDUnlink() for the discussion on delivering unlink event. 
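+ *
+ * If a MD is still attached and its EQ has no operation in flight, the
+ * unlink event is enqueued here, before the ME and MD are released.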
+ */ +int +LNetMEUnlink(struct lnet_handle_me meh) +{ + struct lnet_me *me; + struct lnet_libmd *md; + struct lnet_event ev; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(meh.cookie); + lnet_res_lock(cpt); + + me = lnet_handle2me(&meh); + if (me == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md = me->me_md; + if (md != NULL) { + md->md_flags |= LNET_MD_FLAG_ABORTED; + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + } + + lnet_me_unlink(me); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEUnlink); + +/* call with lnet_res_lock please */ +void +lnet_me_unlink(struct lnet_me *me) +{ + list_del(&me->me_list); + + if (me->me_md != NULL) { + struct lnet_libmd *md = me->me_md; + + /* detach MD from portal of this ME */ + lnet_ptl_detach_md(me, md); + lnet_md_unlink(md); + } + + lnet_res_lh_invalidate(&me->me_lh); + lnet_me_free(me); +} + +#if 0 +static void +lib_me_dump(struct lnet_me *me) +{ + CWARN("Match Entry %p (%#llx)\n", me, + me->me_lh.lh_cookie); + + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->me_match_bits, me->me_ignore_bits); + + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, struct lnet_me, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, struct lnet_me, me_list)); +} +#endif diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c new file mode 100644 index 0000000000000..f52621c56b3de --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c @@ -0,0 +1,5065 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-move.c + * + * Data movement routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#include +#include +#include + +static int local_nid_dist_zero = 1; +module_param(local_nid_dist_zero, int, 0444); +MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); + +struct lnet_send_data { + struct lnet_ni *sd_best_ni; + struct lnet_peer_ni *sd_best_lpni; + struct lnet_peer_ni *sd_final_dst_lpni; + struct lnet_peer *sd_peer; + struct lnet_peer *sd_gw_peer; + struct lnet_peer_ni *sd_gw_lpni; + struct lnet_peer_net *sd_peer_net; + struct lnet_msg *sd_msg; + lnet_nid_t sd_dst_nid; + lnet_nid_t sd_src_nid; + lnet_nid_t sd_rtr_nid; + int sd_cpt; + int sd_md_cpt; + __u32 sd_send_case; +}; + +static inline struct lnet_comm_count * +get_stats_counts(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + switch (stats_type) { + case LNET_STATS_TYPE_SEND: + return &stats->el_send_stats; + case LNET_STATS_TYPE_RECV: + return &stats->el_recv_stats; + case LNET_STATS_TYPE_DROP: + return &stats->el_drop_stats; + default: + CERROR("Unknown stats type\n"); + } + + return NULL; +} + +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return; + + switch (msg_type) { + case LNET_MSG_ACK: + atomic_inc(&counts->co_ack_count); + break; + case LNET_MSG_PUT: + atomic_inc(&counts->co_put_count); + break; + case LNET_MSG_GET: + atomic_inc(&counts->co_get_count); + break; + case LNET_MSG_REPLY: + atomic_inc(&counts->co_reply_count); + break; + case LNET_MSG_HELLO: + atomic_inc(&counts->co_hello_count); + break; + default: + CERROR("There is a BUG in the code. Unknown message type\n"); + break; + } +} + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return 0; + + return (atomic_read(&counts->co_ack_count) + + atomic_read(&counts->co_put_count) + + atomic_read(&counts->co_get_count) + + atomic_read(&counts->co_reply_count) + + atomic_read(&counts->co_hello_count)); +} + +static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats, + struct lnet_comm_count *counts) +{ + msg_stats->ico_get_count = atomic_read(&counts->co_get_count); + msg_stats->ico_put_count = atomic_read(&counts->co_put_count); + msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count); + msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count); + msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count); +} + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats) +{ + struct lnet_comm_count *counts; + + LASSERT(msg_stats); + LASSERT(stats); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND); + if (!counts) + return; + assign_stats(&msg_stats->im_send_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV); + if (!counts) + return; + assign_stats(&msg_stats->im_recv_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP); + if (!counts) + return; + assign_stats(&msg_stats->im_drop_stats, counts); +} + +int +lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) +{ + struct lnet_test_peer *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + if (threshold != 0) { + /* Adding a new entry */ 
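+		/* the new entry makes the next 'threshold' messages for this
+		 * nid fail (LNET_MD_THRESH_INF means fail indefinitely; see
+		 * fail_peer() below). */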
+ LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; + + tp->tp_nid = nid; + tp->tp_threshold = threshold; + + lnet_net_lock(0); + list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); + lnet_net_unlock(0); + return 0; + } + + /* removing entries */ + INIT_LIST_HEAD(&cull); + + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, struct lnet_test_peer, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + nid == LNET_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) { /* matched this one */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); + + list_del(&tp->tp_list); + LIBCFS_FREE(tp, sizeof(*tp)); + } + return 0; +} + +static int +fail_peer (lnet_nid_t nid, int outgoing) +{ + struct lnet_test_peer *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD(&cull); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, struct lnet_test_peer, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != LNET_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + } + break; + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); + list_del(&tp->tp_list); + + LIBCFS_FREE(tp, sizeof(*tp)); + } + + return fail; +} + +unsigned int +lnet_iov_nob(unsigned int niov, struct kvec *iov) +{ + unsigned int nob = 0; + + LASSERT(niov == 0 || iov != NULL); + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_iov_nob); + +void +lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT(ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT(nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = MIN(diov->iov_len - doffset, + siov->iov_len - soffset); + this_nob = MIN(this_nob, nob); + + memcpy((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iov); + +int +lnet_extract_iov(int dst_niov, struct kvec *dst, + int 
src_niov, struct kvec *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_iov); + + +unsigned int +lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) +{ + unsigned int nob = 0; + + LASSERT(niov == 0 || kiov != NULL); + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_kiov_nob); + +void +lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (ndiov > 0); + while (doffset >= diov->kiov_len) { + doffset -= diov->kiov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + LASSERT(nsiov > 0); + while (soffset >= siov->kiov_len) { + soffset -= siov->kiov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = MIN(diov->kiov_len - doffset, + siov->kiov_len - soffset); + this_nob = MIN(this_nob, nob); + + if (daddr == NULL) + daddr = ((char *)kmap(diov->kiov_page)) + + diov->kiov_offset + doffset; + if (saddr == NULL) + saddr = ((char *)kmap(siov->kiov_page)) + + siov->kiov_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. 
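+	 * (the copy loop keeps at most one source and one destination page
+	 * kmapped at any time).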
+ * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy (daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->kiov_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + kunmap(diov->kiov_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->kiov_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + kunmap(siov->kiov_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + kunmap(diov->kiov_page); + if (saddr != NULL) + kunmap(siov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2kiov); + +void +lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + do { + LASSERT(niov > 0); + LASSERT(nkiov > 0); + this_nob = MIN(iov->iov_len - iovoffset, + kiov->kiov_len - kiovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2iov); + +void +lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nob) +{ + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + do { + LASSERT(nkiov > 0); + LASSERT(niov > 0); + this_nob = MIN(kiov->kiov_len - kiovoffset, + iov->iov_len - iovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_iov2kiov); + +int +lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, 
unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + return niov; + } + + dst->kiov_len = frag_len; + LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_kiov); + +void +lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen) +{ + unsigned int niov = 0; + struct kvec *iov = NULL; + lnet_kiov_t *kiov = NULL; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + LASSERT(msg->msg_offset == offset); + LASSERT(msg->msg_wanted == mlen); + + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; + + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, + niov, iov, kiov, offset, mlen, + rlen); + if (rc < 0) + lnet_finalize(msg, rc); +} + +static void +lnet_setpayloadbuffer(struct lnet_msg *msg) +{ + struct lnet_libmd *md = msg->msg_md; + + LASSERT(msg->msg_len > 0); + LASSERT(!msg->msg_routing); + LASSERT(md != NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_iov == NULL); + LASSERT(msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; +} + +void +lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + /* dest_nid will be overwritten by lnet_select_pathway() */ + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + /* src_nid will be set later */ + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.payload_length = cpu_to_le32(len); +} + +void +lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT(!in_interrupt()); + LASSERT(ni->ni_nid == LNET_NID_LO_0 || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) { + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } +} + +static int +lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc; + + LASSERT(!msg->msg_sending); + LASSERT(msg->msg_receiving); + 
LASSERT(!msg->msg_rx_ready_delay); + LASSERT(ni->ni_net->net_lnd->lnd_eager_recv != NULL); + + msg->msg_rx_ready_delay = 1; + rc = (ni->ni_net->net_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: " + "eager_recv failed %d\n", + libcfs_nid2str(msg->msg_rxpeer->lpni_nid), + libcfs_id2str(msg->msg_target), rc); + LASSERT(rc < 0); /* required by my callers */ + } + + return rc; +} + +/* + * This function can be called from two paths: + * 1. when sending a message + * 2. when decommiting a message (lnet_msg_decommit_tx()) + * In both these cases the peer_ni should have it's reference count + * acquired by the caller and therefore it is safe to drop the spin + * lock before calling lnd_query() + */ +static void +lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) +{ + time64_t last_alive = 0; + int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni); + + LASSERT(lnet_peer_aliveness_enabled(lp)); + LASSERT(ni->ni_net->net_lnd->lnd_query != NULL); + + lnet_net_unlock(cpt); + (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive); + lnet_net_lock(cpt); + + lp->lpni_last_query = ktime_get_seconds(); + + if (last_alive != 0) /* NI has updated timestamp */ + lp->lpni_last_alive = last_alive; +} + +/* NB: always called with lnet_net_lock held */ +static inline int +lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now) +{ + int alive; + time64_t deadline; + + LASSERT (lnet_peer_aliveness_enabled(lp)); + + /* + * Trust lnet_notify() if it has more recent aliveness news, but + * ignore the initial assumed death (see lnet_peers_start_down()). + */ + spin_lock(&lp->lpni_lock); + if (!lp->lpni_alive && lp->lpni_alive_count > 0 && + lp->lpni_timestamp >= lp->lpni_last_alive) { + spin_unlock(&lp->lpni_lock); + return 0; + } + + deadline = lp->lpni_last_alive + + lp->lpni_net->net_tunables.lct_peer_timeout; + alive = deadline > now; + + /* + * Update obsolete lp_alive except for routers assumed to be dead + * initially, because router checker would update aliveness in this + * case, and moreover lpni_last_alive at peer creation is assumed. + */ + if (alive && !lp->lpni_alive && + !(lnet_isrouter(lp) && lp->lpni_alive_count == 0)) { + spin_unlock(&lp->lpni_lock); + lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); + } else { + spin_unlock(&lp->lpni_lock); + } + + return alive; +} + + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the lnet_net_lock */ +static int +lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp, + struct lnet_msg *msg) +{ + time64_t now = ktime_get_seconds(); + + if (!lnet_peer_aliveness_enabled(lp)) + return -ENODEV; + + if (lnet_peer_is_alive(lp, now)) + return 1; + + /* + * If we're resending a message, let's attempt to send it even if + * the peer is down to fulfill our resend quota on the message + */ + if (msg->msg_retry_count > 0) + return 1; + + /* + * Peer appears dead, but we should avoid frequent NI queries (at + * most once per lnet_queryinterval seconds). 
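+	 * lnet_queryinterval is the one-second constant defined in the
+	 * block just below.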
+ */ + if (lp->lpni_last_query != 0) { + static const int lnet_queryinterval = 1; + time64_t next_query; + + next_query = lp->lpni_last_query + lnet_queryinterval; + + if (now < next_query) { + if (lp->lpni_alive) + CWARN("Unexpected aliveness of peer %s: " + "%lld < %lld (%d/%d)\n", + libcfs_nid2str(lp->lpni_nid), + now, next_query, + lnet_queryinterval, + lp->lpni_net->net_tunables.lct_peer_timeout); + return 0; + } + } + + /* query NI for latest aliveness news */ + lnet_ni_query_locked(ni, lp); + + if (lnet_peer_is_alive(lp, now)) + return 1; + + lnet_notify_locked(lp, 0, 0, lp->lpni_last_alive); + return 0; +} + +/** + * \param msg The message to be sent. + * \param do_send True if lnet_ni_send() should be called in this function. + * lnet_send() is going to lnet_net_unlock immediately after this, so + * it sets do_send FALSE and I don't do the unlock/send/lock bit. + * + * \retval LNET_CREDIT_OK If \a msg sent or OK to send. + * \retval LNET_CREDIT_WAIT If \a msg blocked for credit. + * \retval -EHOSTUNREACH If the next hop of the message appears dead. + * \retval -ECANCELED If the MD of the message has been unlinked. + */ +static int +lnet_post_send_locked(struct lnet_msg *msg, int do_send) +{ + struct lnet_peer_ni *lp = msg->msg_txpeer; + struct lnet_ni *ni = msg->msg_txni; + int cpt = msg->msg_tx_cpt; + struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; + + /* non-lnet_send() callers have checked before */ + LASSERT(!do_send || msg->msg_tx_delayed); + LASSERT(!msg->msg_receiving); + LASSERT(msg->msg_tx_committed); + /* can't get here if we're sending to the loopback interface */ + LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); + + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(ni, lp, msg) == 0) { + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + msg->msg_len; + lnet_net_unlock(cpt); + if (msg->msg_txpeer) + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); + if (msg->msg_txni) + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); + + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_id2str(msg->msg_target)); + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED; + if (do_send) + lnet_finalize(msg, -EHOSTUNREACH); + + lnet_net_lock(cpt); + return -EHOSTUNREACH; + } + + if (msg->msg_md != NULL && + (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED) != 0) { + lnet_net_unlock(cpt); + + CNETERR("Aborting message for %s: LNetM[DE]Unlink() already " + "called on the MD/ME.\n", + libcfs_id2str(msg->msg_target)); + if (do_send) { + msg->msg_no_resend = true; + CDEBUG(D_NET, "msg %p to %s canceled and will not be resent\n", + msg, libcfs_id2str(msg->msg_target)); + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(cpt); + return -ECANCELED; + } + + if (!msg->msg_peertxcredit) { + spin_lock(&lp->lpni_lock); + LASSERT((lp->lpni_txcredits < 0) == + !list_empty(&lp->lpni_txq)); + + msg->msg_peertxcredit = 1; + lp->lpni_txqnob += msg->msg_len + sizeof(struct lnet_hdr); + lp->lpni_txcredits--; + + if (lp->lpni_txcredits < lp->lpni_mintxcredits) + lp->lpni_mintxcredits = lp->lpni_txcredits; + + if (lp->lpni_txcredits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lpni_txq); + spin_unlock(&lp->lpni_lock); + return LNET_CREDIT_WAIT; + } + spin_unlock(&lp->lpni_lock); + } + + if (!msg->msg_txcredit) { + LASSERT((tq->tq_credits < 0) == + 
!list_empty(&tq->tq_delayed)); + + msg->msg_txcredit = 1; + tq->tq_credits--; + atomic_dec(&ni->ni_tx_credits); + + if (tq->tq_credits < tq->tq_credits_min) + tq->tq_credits_min = tq->tq_credits; + + if (tq->tq_credits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &tq->tq_delayed); + return LNET_CREDIT_WAIT; + } + } + + if (unlikely(!list_empty(&the_lnet.ln_delay_rules)) && + lnet_delay_rule_match_locked(&msg->msg_hdr, msg)) { + msg->msg_tx_delayed = 1; + return LNET_CREDIT_WAIT; + } + + /* unset the tx_delay flag as we're going to send it now */ + msg->msg_tx_delayed = 0; + + if (do_send) { + lnet_net_unlock(cpt); + lnet_ni_send(ni, msg); + lnet_net_lock(cpt); + } + return LNET_CREDIT_OK; +} + + +static struct lnet_rtrbufpool * +lnet_msg2bufpool(struct lnet_msg *msg) +{ + struct lnet_rtrbufpool *rbp; + int cpt; + + LASSERT(msg->msg_rx_committed); + + cpt = msg->msg_rx_cpt; + rbp = &the_lnet.ln_rtrpools[cpt][0]; + + LASSERT(msg->msg_len <= LNET_MTU); + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_SIZE) { + rbp++; + LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); + } + + return rbp; +} + +static int +lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) +{ + /* lnet_parse is going to lnet_net_unlock immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. + * I return LNET_CREDIT_WAIT if msg blocked and LNET_CREDIT_OK if + * received or OK to receive */ + struct lnet_peer_ni *lp = msg->msg_rxpeer; + struct lnet_rtrbufpool *rbp; + struct lnet_rtrbuf *rb; + + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_routing); + LASSERT (msg->msg_receiving); + LASSERT (!msg->msg_sending); + + /* non-lnet_parse callers only receive delayed messages */ + LASSERT(!do_recv || msg->msg_rx_delayed); + + if (!msg->msg_peerrtrcredit) { + spin_lock(&lp->lpni_lock); + LASSERT((lp->lpni_rtrcredits < 0) == + !list_empty(&lp->lpni_rtrq)); + + msg->msg_peerrtrcredit = 1; + lp->lpni_rtrcredits--; + if (lp->lpni_rtrcredits < lp->lpni_minrtrcredits) + lp->lpni_minrtrcredits = lp->lpni_rtrcredits; + + if (lp->lpni_rtrcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lpni_rtrq); + spin_unlock(&lp->lpni_lock); + return LNET_CREDIT_WAIT; + } + spin_unlock(&lp->lpni_lock); + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return LNET_CREDIT_WAIT; + } + } + + LASSERT(!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + /* unset the msg-rx_delayed flag since we're receiving the message */ + msg->msg_rx_delayed = 0; + + if (do_recv) { + int cpt = msg->msg_rx_cpt; + + lnet_net_unlock(cpt); + lnet_ni_recv(msg->msg_rxni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + lnet_net_lock(cpt); + } + return LNET_CREDIT_OK; +} + +void +lnet_return_tx_credits_locked(struct lnet_msg *msg) +{ + struct lnet_peer_ni *txpeer = msg->msg_txpeer; + struct lnet_ni *txni = 
msg->msg_txni; + struct lnet_msg *msg2; + + if (msg->msg_txcredit) { + struct lnet_ni *ni = msg->msg_txni; + struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; + + /* give back NI txcredits */ + msg->msg_txcredit = 0; + + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + tq->tq_credits++; + atomic_inc(&ni->ni_tx_credits); + if (tq->tq_credits <= 0) { + msg2 = list_entry(tq->tq_delayed.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txni == ni); + LASSERT(msg2->msg_tx_delayed); + LASSERT(msg2->msg_tx_cpt == msg->msg_tx_cpt); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + spin_lock(&txpeer->lpni_lock); + LASSERT((txpeer->lpni_txcredits < 0) == + !list_empty(&txpeer->lpni_txq)); + + txpeer->lpni_txqnob -= msg->msg_len + sizeof(struct lnet_hdr); + LASSERT(txpeer->lpni_txqnob >= 0); + + txpeer->lpni_txcredits++; + if (txpeer->lpni_txcredits <= 0) { + int msg2_cpt; + + msg2 = list_entry(txpeer->lpni_txq.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + spin_unlock(&txpeer->lpni_lock); + + LASSERT(msg2->msg_txpeer == txpeer); + LASSERT(msg2->msg_tx_delayed); + + msg2_cpt = msg2->msg_tx_cpt; + + /* + * The msg_cpt can be different from the msg2_cpt + * so we need to make sure we lock the correct cpt + * for msg2. + * Once we call lnet_post_send_locked() it is no + * longer safe to access msg2, since it could've + * been freed by lnet_finalize(), but we still + * need to relock the correct cpt, so we cache the + * msg2_cpt for the purpose of the check that + * follows the call to lnet_pose_send_locked(). + */ + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg->msg_tx_cpt); + lnet_net_lock(msg2_cpt); + } + (void) lnet_post_send_locked(msg2, 1); + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg2_cpt); + lnet_net_lock(msg->msg_tx_cpt); + } + } else { + spin_unlock(&txpeer->lpni_lock); + } + } + + if (txni != NULL) { + msg->msg_txni = NULL; + lnet_ni_decref_locked(txni, msg->msg_tx_cpt); + } + + if (txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_ni_decref_locked(txpeer); + } +} + +void +lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp) +{ + struct lnet_msg *msg; + + if (list_empty(&rbp->rbp_msgs)) + return; + msg = list_entry(rbp->rbp_msgs.next, + struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + (void)lnet_post_routed_recv_locked(msg, 1); +} + +void +lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) +{ + struct lnet_msg *msg; + struct lnet_msg *tmp; + + lnet_net_unlock(cpt); + + list_for_each_entry_safe(msg, tmp, list, msg_list) { + lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, + 0, 0, 0, msg->msg_hdr.payload_length); + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(cpt); +} + +void +lnet_return_rx_credits_locked(struct lnet_msg *msg) +{ + struct lnet_peer_ni *rxpeer = msg->msg_rxpeer; + struct lnet_ni *rxni = msg->msg_rxni; + struct lnet_msg *msg2; + + if (msg->msg_rtrcredit) { + /* give back global router credits */ + struct lnet_rtrbuf *rb; + struct lnet_rtrbufpool *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT(msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, struct lnet_rtrbuf, rb_kiov[0]); + rbp = rb->rb_pool; + 
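+		/* msg_kiov pointed into rb's own rb_kiov[] array (see the
+		 * list_entry() above), so clear it before the buffer is
+		 * reused or destroyed. */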
+ msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT(rbp == lnet_msg2bufpool(msg)); + + LASSERT((rbp->rbp_credits > 0) == + !list_empty(&rbp->rbp_bufs)); + + /* If routing is now turned off, we just drop this buffer and + * don't bother trying to return credits. */ + if (!the_lnet.ln_routing) { + lnet_destroy_rtrbuf(rb, rbp->rbp_npages); + goto routing_off; + } + + /* It is possible that a user has lowered the desired number of + * buffers in this pool. Make sure we never put back + * more buffers than the stated number. */ + if (unlikely(rbp->rbp_credits >= rbp->rbp_req_nbuffers)) { + /* Discard this buffer so we don't have too + * many. */ + lnet_destroy_rtrbuf(rb, rbp->rbp_npages); + rbp->rbp_nbuffers--; + } else { + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) + lnet_schedule_blocked_locked(rbp); + } + } + +routing_off: + if (msg->msg_peerrtrcredit) { + /* give back peer router credits */ + msg->msg_peerrtrcredit = 0; + + spin_lock(&rxpeer->lpni_lock); + LASSERT((rxpeer->lpni_rtrcredits < 0) == + !list_empty(&rxpeer->lpni_rtrq)); + + rxpeer->lpni_rtrcredits++; + + /* drop all messages which are queued to be routed on that + * peer. */ + if (!the_lnet.ln_routing) { + struct list_head drop; + INIT_LIST_HEAD(&drop); + list_splice_init(&rxpeer->lpni_rtrq, &drop); + spin_unlock(&rxpeer->lpni_lock); + lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); + } else if (rxpeer->lpni_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lpni_rtrq.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + spin_unlock(&rxpeer->lpni_lock); + (void) lnet_post_routed_recv_locked(msg2, 1); + } else { + spin_unlock(&rxpeer->lpni_lock); + } + } + if (rxni != NULL) { + msg->msg_rxni = NULL; + lnet_ni_decref_locked(rxni, msg->msg_rx_cpt); + } + if (rxpeer != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_ni_decref_locked(rxpeer); + } +} + +static int +lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) +{ + if (p1->lpni_txqnob < p2->lpni_txqnob) + return 1; + + if (p1->lpni_txqnob > p2->lpni_txqnob) + return -1; + + if (p1->lpni_txcredits > p2->lpni_txcredits) + return 1; + + if (p1->lpni_txcredits < p2->lpni_txcredits) + return -1; + + return 0; +} + +static int +lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) +{ + struct lnet_peer_ni *p1 = r1->lr_gateway; + struct lnet_peer_ni *p2 = r2->lr_gateway; + int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; + int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 
1 : r2->lr_hops; + int rc; + + if (r1->lr_priority < r2->lr_priority) + return 1; + + if (r1->lr_priority > r2->lr_priority) + return -1; + + if (r1_hops < r2_hops) + return 1; + + if (r1_hops > r2_hops) + return -1; + + rc = lnet_compare_peers(p1, p2); + if (rc) + return rc; + + if (r1->lr_seq - r2->lr_seq <= 0) + return 1; + + return -1; +} + +static struct lnet_peer_ni * +lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, + lnet_nid_t rtr_nid) +{ + struct lnet_remotenet *rnet; + struct lnet_route *route; + struct lnet_route *best_route; + struct lnet_route *last_route; + struct lnet_peer_ni *lpni_best; + struct lnet_peer_ni *lp; + int rc; + + /* If @rtr_nid is not LNET_NID_ANY, return the gateway with + * rtr_nid nid, otherwise find the best gateway I can use */ + + rnet = lnet_find_rnet_locked(remote_net); + if (rnet == NULL) + return NULL; + + lpni_best = NULL; + best_route = last_route = NULL; + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + lp = route->lr_gateway; + + if (!lnet_is_route_alive(route)) + continue; + + if (net != NULL && lp->lpni_net != net) + continue; + + if (lp->lpni_nid == rtr_nid) /* it's pre-determined router */ + return lp; + + if (lpni_best == NULL) { + best_route = last_route = route; + lpni_best = lp; + continue; + } + + /* no protection on below fields, but it's harmless */ + if (last_route->lr_seq - route->lr_seq < 0) + last_route = route; + + rc = lnet_compare_routes(route, best_route); + if (rc < 0) + continue; + + best_route = route; + lpni_best = lp; + } + + /* set sequence number on the best router to the latest sequence + 1 + * so we can round-robin all routers, it's race and inaccurate but + * harmless and functional */ + if (best_route != NULL) + best_route->lr_seq = last_route->lr_seq + 1; + return lpni_best; +} + +static struct lnet_ni * +lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, + struct lnet_peer *peer, struct lnet_peer_net *peer_net, + int md_cpt) +{ + struct lnet_ni *ni = NULL; + unsigned int shortest_distance; + int best_credits; + int best_healthv; + + /* + * If there is no peer_ni that we can send to on this network, + * then there is no point in looking for a new best_ni here. + */ + if (!lnet_get_next_peer_ni_locked(peer, peer_net, NULL)) + return best_ni; + + if (best_ni == NULL) { + shortest_distance = UINT_MAX; + best_credits = INT_MIN; + best_healthv = 0; + } else { + shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, + best_ni->ni_dev_cpt); + best_credits = atomic_read(&best_ni->ni_tx_credits); + best_healthv = atomic_read(&best_ni->ni_healthv); + } + + while ((ni = lnet_get_next_ni_locked(local_net, ni))) { + unsigned int distance; + int ni_credits; + int ni_healthv; + int ni_fatal; + + ni_credits = atomic_read(&ni->ni_tx_credits); + ni_healthv = atomic_read(&ni->ni_healthv); + ni_fatal = atomic_read(&ni->ni_fatal_error_on); + + /* + * calculate the distance from the CPT on which + * the message memory is allocated to the CPT of + * the NI's physical device + */ + distance = cfs_cpt_distance(lnet_cpt_table(), + md_cpt, + ni->ni_dev_cpt); + + CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n", + libcfs_nid2str(ni->ni_nid), ni_credits, distance, + ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid) + : "not seleced", best_credits, shortest_distance, + (best_ni) ? best_ni->ni_seq : 0); + + /* + * All distances smaller than the NUMA range + * are treated equally. 
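+	 * This keeps every NI within lnet_numa_range equally eligible;
+	 * the choice then falls through to health, available credits and
+	 * round-robin below.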
+ */ + if (distance < lnet_numa_range) + distance = lnet_numa_range; + + /* + * Select on health, shorter distance, available + * credits, then round-robin. + */ + if (ni_fatal) { + continue; + } else if (ni_healthv < best_healthv) { + continue; + } else if (ni_healthv > best_healthv) { + best_healthv = ni_healthv; + /* + * If we're going to prefer this ni because it's + * the healthiest, then we should set the + * shortest_distance in the algorithm in case + * there are multiple NIs with the same health but + * different distances. + */ + if (distance < shortest_distance) + shortest_distance = distance; + } else if (distance > shortest_distance) { + continue; + } else if (distance < shortest_distance) { + shortest_distance = distance; + } else if (ni_credits < best_credits) { + continue; + } else if (ni_credits == best_credits) { + if (best_ni && best_ni->ni_seq <= ni->ni_seq) + continue; + } + best_ni = ni; + best_credits = ni_credits; + } + + CDEBUG(D_NET, "selected best_ni %s\n", + (best_ni) ? libcfs_nid2str(best_ni->ni_nid) : "no selection"); + + return best_ni; +} + +/* + * Traffic to the LNET_RESERVED_PORTAL may not trigger peer discovery, + * because such traffic is required to perform discovery. We therefore + * exclude all GET and PUT on that portal. We also exclude all ACK and + * REPLY traffic, but that is because the portal is not tracked in the + * message structure for these message types. We could restrict this + * further by also checking for LNET_PROTO_PING_MATCHBITS. + */ +static bool +lnet_msg_discovery(struct lnet_msg *msg) +{ + if (msg->msg_type == LNET_MSG_PUT) { + if (msg->msg_hdr.msg.put.ptl_index != LNET_RESERVED_PORTAL) + return true; + } else if (msg->msg_type == LNET_MSG_GET) { + if (msg->msg_hdr.msg.get.ptl_index != LNET_RESERVED_PORTAL) + return true; + } + return false; +} + +#define SRC_SPEC 0x0001 +#define SRC_ANY 0x0002 +#define LOCAL_DST 0x0004 +#define REMOTE_DST 0x0008 +#define MR_DST 0x0010 +#define NMR_DST 0x0020 +#define SND_RESP 0x0040 + +/* The following to defines are used for return codes */ +#define REPEAT_SEND 0x1000 +#define PASS_THROUGH 0x2000 + +/* The different cases lnet_select pathway needs to handle */ +#define SRC_SPEC_LOCAL_MR_DST (SRC_SPEC | LOCAL_DST | MR_DST) +#define SRC_SPEC_ROUTER_MR_DST (SRC_SPEC | REMOTE_DST | MR_DST) +#define SRC_SPEC_LOCAL_NMR_DST (SRC_SPEC | LOCAL_DST | NMR_DST) +#define SRC_SPEC_ROUTER_NMR_DST (SRC_SPEC | REMOTE_DST | NMR_DST) +#define SRC_ANY_LOCAL_MR_DST (SRC_ANY | LOCAL_DST | MR_DST) +#define SRC_ANY_ROUTER_MR_DST (SRC_ANY | REMOTE_DST | MR_DST) +#define SRC_ANY_LOCAL_NMR_DST (SRC_ANY | LOCAL_DST | NMR_DST) +#define SRC_ANY_ROUTER_NMR_DST (SRC_ANY | REMOTE_DST | NMR_DST) + +static int +lnet_handle_lo_send(struct lnet_send_data *sd) +{ + struct lnet_msg *msg = sd->sd_msg; + int cpt = sd->sd_cpt; + + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(the_lnet.ln_loni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(the_lnet.ln_loni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = + cpu_to_le64(the_lnet.ln_loni->ni_nid); + msg->msg_target.nid = the_lnet.ln_loni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = the_lnet.ln_loni; + + return LNET_CREDIT_OK; +} + +static int +lnet_handle_send(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = sd->sd_best_ni; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + struct lnet_peer_ni *final_dst_lpni = sd->sd_final_dst_lpni; + struct lnet_msg *msg = sd->sd_msg; + int cpt2; + __u32 send_case = sd->sd_send_case; + int rc; + __u32 
routing = send_case & REMOTE_DST; + struct lnet_rsp_tracker *rspt; + + /* + * Increment sequence number of the selected peer so that we + * pick the next one in Round Robin. + */ + best_lpni->lpni_seq++; + + /* + * grab a reference on the peer_ni so it sticks around even if + * we need to drop and relock the lnet_net_lock below. + */ + lnet_peer_ni_addref_locked(best_lpni); + + /* + * Use lnet_cpt_of_nid() to determine the CPT used to commit the + * message. This ensures that we get a CPT that is correct for + * the NI when the NI has been restricted to a subset of all CPTs. + * If the selected CPT differs from the one currently locked, we + * must unlock and relock the lnet_net_lock(), and then check whether + * the configuration has changed. We don't have a hold on the best_ni + * yet, and it may have vanished. + */ + cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); + if (sd->sd_cpt != cpt2) { + __u32 seq = lnet_get_dlc_seq_locked(); + lnet_net_unlock(sd->sd_cpt); + sd->sd_cpt = cpt2; + lnet_net_lock(sd->sd_cpt); + if (seq != lnet_get_dlc_seq_locked()) { + lnet_peer_ni_decref_locked(best_lpni); + return REPEAT_SEND; + } + } + + /* + * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions + */ + msg->msg_txpeer = best_lpni; + msg->msg_txni = best_ni; + + /* + * grab a reference for the best_ni since now it's in use in this + * send. The reference will be dropped in lnet_finalize() + */ + lnet_ni_addref_locked(msg->msg_txni, sd->sd_cpt); + + /* + * Always set the target.nid to the best peer picked. Either the + * NID will be one of the peer NIDs selected, or the same NID as + * what was originally set in the target or it will be the NID of + * a router if this message should be routed + */ + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + + /* + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits + */ + lnet_msg_commit(msg, sd->sd_cpt); + + /* + * If we are routing the message then we keep the src_nid that was + * set by the originator. If we are not routing then we are the + * originator and set it here. + */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); + + if (routing) { + msg->msg_target_is_router = 1; + msg->msg_target.pid = LNET_PID_LUSTRE; + /* + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. + * + * final_dst_lpni is set at the beginning of the + * lnet_select_pathway() function and is never changed. + * It's safe to use it here. + */ + msg->msg_hdr.dest_nid = cpu_to_le64(final_dst_lpni->lpni_nid); + } else { + /* + * if we're not routing set the dest_nid to the best peer + * ni NID that we picked earlier in the algorithm. 
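+	 *
+	 * Net effect on the wire header:
+	 *   routed:     dest_nid = the final destination, while
+	 *               msg_target.nid is the next-hop gateway NI
+	 *   not routed: dest_nid = msg_target.nid = the peer NI selected
+	 *               above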
+ */ + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); + } + + /* + * if we have response tracker block update it with the next hop + * nid + */ + if (msg->msg_md) { + rspt = msg->msg_md->md_rspt_ptr; + if (rspt) { + rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid; + CDEBUG(D_NET, "rspt_next_hop_nid = %s\n", + libcfs_nid2str(rspt->rspt_next_hop_nid)); + } + } + + rc = lnet_post_send_locked(msg, 0); + + if (!rc) + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s try# %d\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_txni->ni_nid), + libcfs_nid2str(sd->sd_src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); + + return rc; +} + +static struct lnet_peer_ni * +lnet_select_peer_ni(struct lnet_send_data *sd, struct lnet_peer *peer, + struct lnet_peer_net *peer_net) +{ + /* + * Look at the peer NIs for the destination peer that connect + * to the chosen net. If a peer_ni is preferred when using the + * best_ni to communicate, we use that one. If there is no + * preferred peer_ni, or there are multiple preferred peer_ni, + * the available transmit credits are used. If the transmit + * credits are equal, we round-robin over the peer_ni. + */ + struct lnet_peer_ni *lpni = NULL; + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_ni *best_ni = sd->sd_best_ni; + lnet_nid_t dst_nid = sd->sd_dst_nid; + int best_lpni_credits = INT_MIN; + bool preferred = false; + bool ni_is_pref; + int best_lpni_healthv = 0; + int lpni_healthv; + + while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { + /* + * if the best_ni we've chosen aleady has this lpni + * preferred, then let's use it + */ + ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, + best_ni->ni_nid); + + lpni_healthv = atomic_read(&lpni->lpni_healthv); + + CDEBUG(D_NET, "%s ni_is_pref = %d\n", + libcfs_nid2str(best_ni->ni_nid), ni_is_pref); + + if (best_lpni) + CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n", + libcfs_nid2str(lpni->lpni_nid), + lpni->lpni_txcredits, best_lpni_credits, + lpni->lpni_seq, best_lpni->lpni_seq); + + /* pick the healthiest peer ni */ + if (lpni_healthv < best_lpni_healthv) { + continue; + } else if (lpni_healthv > best_lpni_healthv) { + best_lpni_healthv = lpni_healthv; + /* if this is a preferred peer use it */ + } else if (!preferred && ni_is_pref) { + preferred = true; + } else if (preferred && !ni_is_pref) { + /* + * this is not the preferred peer so let's ignore + * it. + */ + continue; + } else if (lpni->lpni_txcredits < best_lpni_credits) { + /* + * We already have a peer that has more credits + * available than this one. No need to consider + * this peer further. + */ + continue; + } else if (lpni->lpni_txcredits == best_lpni_credits) { + /* + * The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round + * Robin + */ + if (best_lpni) { + if (best_lpni->lpni_seq <= lpni->lpni_seq) + continue; + } + } + + best_lpni = lpni; + best_lpni_credits = lpni->lpni_txcredits; + } + + /* if we still can't find a peer ni then we can't reach it */ + if (!best_lpni) { + __u32 net_id = (peer_net) ? 
peer_net->lpn_net_id : + LNET_NIDNET(dst_nid); + CDEBUG(D_NET, "no peer_ni found on peer net %s\n", + libcfs_net2str(net_id)); + return NULL; + } + + CDEBUG(D_NET, "sd_best_lpni = %s\n", + libcfs_nid2str(best_lpni->lpni_nid)); + + return best_lpni; +} + +/* + * Prerequisite: the best_ni should already be set in the sd + */ +static inline struct lnet_peer_ni * +lnet_find_best_lpni_on_net(struct lnet_send_data *sd, struct lnet_peer *peer, + __u32 net_id) +{ + struct lnet_peer_net *peer_net; + + /* + * The gateway is Multi-Rail capable so now we must select the + * proper peer_ni + */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + + if (!peer_net) { + CERROR("gateway peer %s has no NI on net %s\n", + libcfs_nid2str(peer->lp_primary_nid), + libcfs_net2str(net_id)); + return NULL; + } + + return lnet_select_peer_ni(sd, peer, peer_net); +} + +static inline void +lnet_set_non_mr_pref_nid(struct lnet_send_data *sd) +{ + if (sd->sd_send_case & NMR_DST && + sd->sd_msg->msg_type != LNET_MSG_REPLY && + sd->sd_msg->msg_type != LNET_MSG_ACK && + sd->sd_best_lpni->lpni_pref_nnids == 0) { + CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", + libcfs_nid2str(sd->sd_best_ni->ni_nid), + libcfs_nid2str(sd->sd_best_lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni, + sd->sd_best_ni->ni_nid); + } +} + +/* + * Source Specified + * Local Destination + * non-mr peer + * + * use the source and destination NIDs as the pathway + */ +static int +lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) +{ + /* the destination lpni is set before we get here. */ + + /* find local NI */ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + /* + * the preferred NID will only be set for NMR peers + */ + lnet_set_non_mr_pref_nid(sd); + + return lnet_handle_send(sd); +} + +/* + * Source Specified + * Local Destination + * MR Peer + * + * Don't run the selection algorithm on the peer NIs. By specifying the + * local NID, we're also saying that we should always use the destination NID + * provided. This handles the case where we should be using the same + * destination NID for the all the messages which belong to the same RPC + * request. + */ +static int +lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) +{ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + if (sd->sd_best_lpni && + sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) + return lnet_handle_lo_send(sd); + else if (sd->sd_best_lpni) + return lnet_handle_send(sd); + + CERROR("can't send to %s. no NI on %s\n", + libcfs_nid2str(sd->sd_dst_nid), + libcfs_net2str(sd->sd_best_ni->ni_net->net_id)); + + return -EHOSTUNREACH; +} + +struct lnet_ni * +lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, + struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + int cpt, + bool incr_seq) +{ + struct lnet_net *local_net; + struct lnet_ni *best_ni; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net) + return NULL; + + /* + * Iterate through the NIs in this local Net and select + * the NI to send from. The selection is determined by + * these 3 criterion in the following priority: + * 1. NUMA + * 2. 
NI available credits + * 3. Round Robin + */ + best_ni = lnet_get_best_ni(local_net, cur_best_ni, + peer, peer_net, cpt); + + if (incr_seq && best_ni) + best_ni->ni_seq++; + + return best_ni; +} + +static int +lnet_handle_find_routed_path(struct lnet_send_data *sd, + lnet_nid_t dst_nid, + struct lnet_peer_ni **gw_lpni, + struct lnet_peer **gw_peer) +{ + struct lnet_peer_ni *gw; + lnet_nid_t src_nid = sd->sd_src_nid; + + gw = lnet_find_route_locked(NULL, LNET_NIDNET(dst_nid), + sd->sd_rtr_nid); + if (!gw) { + CERROR("no route to %s from %s\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); + return -EHOSTUNREACH; + } + + /* get the peer of the gw_ni */ + LASSERT(gw->lpni_peer_net); + LASSERT(gw->lpni_peer_net->lpn_peer); + + *gw_peer = gw->lpni_peer_net->lpn_peer; + + if (!sd->sd_best_ni) + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, *gw_peer, + gw->lpni_peer_net, + sd->sd_md_cpt, + true); + + if (!sd->sd_best_ni) { + CERROR("Internal Error. Expected local ni on %s " + "but non found :%s\n", + libcfs_net2str(gw->lpni_peer_net->lpn_net_id), + libcfs_nid2str(sd->sd_src_nid)); + return -EFAULT; + } + + /* + * if gw is MR let's find its best peer_ni + */ + if (lnet_peer_is_multi_rail(*gw_peer)) { + gw = lnet_find_best_lpni_on_net(sd, *gw_peer, + sd->sd_best_ni->ni_net->net_id); + /* + * We've already verified that the gw has an NI on that + * desired net, but we're not finding it. Something is + * wrong. + */ + if (!gw) { + CERROR("Internal Error. Route expected to %s from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EFAULT; + } + } + + *gw_lpni = gw; + + return 0; +} + +/* + * Handle two cases: + * + * Case 1: + * Source specified + * Remote destination + * Non-MR destination + * + * Case 2: + * Source specified + * Remote destination + * MR destination + * + * The handling of these two cases is similar. Even though the destination + * can be MR or non-MR, we'll deal directly with the router. + */ +static int +lnet_handle_spec_router_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* find local NI */ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + if (sd->sd_send_case & NMR_DST) + /* + * since the final destination is non-MR let's set its preferred + * NID before we send + */ + lnet_set_non_mr_pref_nid(sd); + + /* + * We're going to send to the gw found so let's set its + * info + */ + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +struct lnet_ni * +lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) +{ + struct lnet_peer_net *peer_net = NULL; + struct lnet_ni *best_ni = NULL; + + /* + * The peer can have multiple interfaces, some of them can be on + * the local network and others on a routed network. We should + * prefer the local network. However if the local network is not + * available then we need to try the routed network + */ + + /* go through all the peer nets and find the best_ni */ + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + /* + * The peer's list of nets can contain non-local nets. We + * want to only examine the local ones. 
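+		 *
+		 * lnet_get_net_locked() returns NULL for a net we have no
+		 * local NI on, so the check below is what filters the peer's
+		 * nets down to the directly reachable ones.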
+ */ + if (!lnet_get_net_locked(peer_net->lpn_net_id)) + continue; + best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, + peer_net, md_cpt, false); + } + + if (best_ni) + /* increment sequence number so we can round robin */ + best_ni->ni_seq++; + + return best_ni; +} + +static struct lnet_ni * +lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = NULL; + struct lnet_peer_net *peer_net; + struct lnet_peer *peer = sd->sd_peer; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + struct lnet_peer_ni *lpni; + int cpt = sd->sd_cpt; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + + /* Get the target peer_ni */ + peer_net = lnet_peer_get_net_locked(peer, + LNET_NIDNET(best_lpni->lpni_nid)); + LASSERT(peer_net != NULL); + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni->lpni_pref_nnids == 0) + continue; + LASSERT(lpni->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked( + lpni->lpni_pref.nid, cpt); + break; + } + + return best_ni; +} + +/* Prerequisite: sd->sd_peer and sd->sd_best_lpni should be set */ +static int +lnet_select_preferred_best_ni(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = NULL; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + + best_ni = lnet_find_existing_preferred_best_ni(sd); + + /* if best_ni is still not set just pick one */ + if (!best_ni) { + best_ni = + lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_md_cpt, true); + /* If there is no best_ni we don't have a route */ + if (!best_ni) { + CERROR("no path to %s from net %s\n", + libcfs_nid2str(best_lpni->lpni_nid), + libcfs_net2str(best_lpni->lpni_net->net_id)); + return -EHOSTUNREACH; + } + } + + sd->sd_best_ni = best_ni; + + /* Set preferred NI if necessary. */ + lnet_set_non_mr_pref_nid(sd); + + return 0; +} + + +/* + * Source not specified + * Local destination + * Non-MR Peer + * + * always use the same source NID for NMR peers + * If we've talked to that peer before then we already have a preferred + * source NI associated with it. Otherwise, we select a preferred local NI + * and store it in the peer + */ +static int +lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) +{ + int rc; + + /* sd->sd_best_lpni is already set to the final destination */ + + /* + * At this point we should've created the peer ni and peer. If we + * can't find it, then something went wrong. Instead of assert + * output a relevant message and fail the send + */ + if (!sd->sd_best_lpni) { + CERROR("Internal fault. Unable to send msg %s to %s. 
" + "NID not known\n", + lnet_msgtyp2str(sd->sd_msg->msg_type), + libcfs_nid2str(sd->sd_dst_nid)); + return -EFAULT; + } + + rc = lnet_select_preferred_best_ni(sd); + if (!rc) + rc = lnet_handle_send(sd); + + return rc; +} + +static int +lnet_handle_any_mr_dsta(struct lnet_send_data *sd) +{ + /* + * NOTE we've already handled the remote peer case. So we only + * need to worry about the local case here. + * + * if we're sending a response, ACK or reply, we need to send it + * to the destination NID given to us. At this point we already + * have the peer_ni we're suppose to send to, so just find the + * best_ni on the peer net and use that. Since we're sending to an + * MR peer then we can just run the selection algorithm on our + * local NIs and pick the best one. + */ + if (sd->sd_send_case & SND_RESP) { + sd->sd_best_ni = + lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_md_cpt, true); + + if (!sd->sd_best_ni) { + /* + * We're not going to deal with not able to send + * a response to the provided final destination + */ + CERROR("Can't send response to %s. " + "No local NI available\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + return lnet_handle_send(sd); + } + + /* + * If we get here that means we're sending a fresh request, PUT or + * GET, so we need to run our standard selection algorithm. + * First find the best local interface that's on any of the peer's + * networks. + */ + sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer, + sd->sd_md_cpt); + if (sd->sd_best_ni) { + sd->sd_best_lpni = + lnet_find_best_lpni_on_net(sd, sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); + + /* + * if we're successful in selecting a peer_ni on the local + * network, then send to it. Otherwise fall through and + * try and see if we can reach it over another routed + * network + */ + if (sd->sd_best_lpni && + sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) { + /* + * in case we initially started with a routed + * destination, let's reset to local + */ + sd->sd_send_case &= ~REMOTE_DST; + sd->sd_send_case |= LOCAL_DST; + return lnet_handle_lo_send(sd); + } else if (sd->sd_best_lpni) { + /* + * in case we initially started with a routed + * destination, let's reset to local + */ + sd->sd_send_case &= ~REMOTE_DST; + sd->sd_send_case |= LOCAL_DST; + return lnet_handle_send(sd); + } + + CERROR("Internal Error. Expected to have a best_lpni: " + "%s -> %s\n", + libcfs_nid2str(sd->sd_src_nid), + libcfs_nid2str(sd->sd_dst_nid)); + + return -EFAULT; + } + + /* + * Peer doesn't have a local network. Let's see if there is + * a remote network we can reach it on. + */ + return PASS_THROUGH; +} + +/* + * Case 1: + * Source NID not specified + * Local destination + * MR peer + * + * Case 2: + * Source NID not speified + * Remote destination + * MR peer + * + * In both of these cases if we're sending a response, ACK or REPLY, then + * we need to send to the destination NID provided. + * + * In the remote case let's deal with MR routers. 
+ * + */ + +static int +lnet_handle_any_mr_dst(struct lnet_send_data *sd) +{ + int rc = 0; + struct lnet_peer *gw_peer = NULL; + struct lnet_peer_ni *gw_lpni = NULL; + + /* + * handle sending a response to a remote peer here so we don't + * have to worry about it if we hit lnet_handle_any_mr_dsta() + */ + if (sd->sd_send_case & REMOTE_DST && + sd->sd_send_case & SND_RESP) { + struct lnet_peer_ni *gw; + struct lnet_peer *gw_peer; + + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw, + &gw_peer); + if (rc < 0) { + CERROR("Can't send response to %s. " + "No route available\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + sd->sd_best_lpni = gw; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); + } + + /* + * Even though the NID for the peer might not be on a local network, + * since the peer is MR there could be other interfaces on the + * local network. In that case we'd still like to prefer the local + * network over the routed network. If we're unable to do that + * then we select the best router among the different routed networks, + * and if the router is MR then we can deal with it as such. + */ + rc = lnet_handle_any_mr_dsta(sd); + if (rc != PASS_THROUGH) + return rc; + + /* + * TODO; One possible enhancement is to run the selection + * algorithm on the peer. However for remote peers the credits are + * not decremented, so we'll be basically going over the peer NIs + * in round robin. An MR router will run the selection algorithm + * on the next-hop interfaces. + */ + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + sd->sd_send_case &= ~LOCAL_DST; + sd->sd_send_case |= REMOTE_DST; + + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +/* + * Source not specified + * Remote destination + * Non-MR peer + * + * Must send to the specified peer NID using the same source NID that + * we've used before. If it's the first time to talk to that peer then + * find the source NI and assign it as preferred to that peer + */ +static int +lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* + * Let's set if we have a preferred NI to talk to this NMR peer + */ + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); + + /* + * find the router and that'll find the best NI if we didn't find + * it already. + */ + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + /* + * set the best_ni we've chosen as the preferred one for + * this peer + */ + lnet_set_non_mr_pref_nid(sd); + + /* we'll be sending to the gw */ + sd->sd_best_lpni = gw_lpni; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); +} + +static int +lnet_handle_send_case_locked(struct lnet_send_data *sd) +{ + /* + * turn off the SND_RESP bit. + * It will be checked in the case handling + */ + __u32 send_case = sd->sd_send_case &= ~SND_RESP ; + + CDEBUG(D_NET, "Source %s%s to %s %s %s destination\n", + (send_case & SRC_SPEC) ? "Specified: " : "ANY", + (send_case & SRC_SPEC) ? libcfs_nid2str(sd->sd_src_nid) : "", + (send_case & MR_DST) ? "MR: " : "NMR: ", + libcfs_nid2str(sd->sd_dst_nid), + (send_case & LOCAL_DST) ? 
"local" : "routed"); + + switch (send_case) { + /* + * For all cases where the source is specified, we should always + * use the destination NID, whether it's an MR destination or not, + * since we're continuing a series of related messages for the + * same RPC + */ + case SRC_SPEC_LOCAL_NMR_DST: + return lnet_handle_spec_local_nmr_dst(sd); + case SRC_SPEC_LOCAL_MR_DST: + return lnet_handle_spec_local_mr_dst(sd); + case SRC_SPEC_ROUTER_NMR_DST: + case SRC_SPEC_ROUTER_MR_DST: + return lnet_handle_spec_router_dst(sd); + case SRC_ANY_LOCAL_NMR_DST: + return lnet_handle_any_local_nmr_dst(sd); + case SRC_ANY_LOCAL_MR_DST: + case SRC_ANY_ROUTER_MR_DST: + return lnet_handle_any_mr_dst(sd); + case SRC_ANY_ROUTER_NMR_DST: + return lnet_handle_any_router_nmr_dst(sd); + default: + CERROR("Unknown send case\n"); + return -1; + } +} + +static int +lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, + struct lnet_msg *msg, lnet_nid_t rtr_nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; + + memset(&send_data, 0, sizeof(send_data)); + + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration change. If none, + * then we proceed, if there is, then we restart the operation. + */ + cpt = lnet_net_lock_current(); + + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); + if (md_cpt == CFS_CPT_ANY) + md_cpt = cpt; + +again: + + /* + * If we're being asked to send to the loopback interface, there + * is no need to go through any selection. We can just shortcut + * the entire process and send over lolnd + */ + send_data.sd_msg = msg; + send_data.sd_cpt = cpt; + if (dst_nid == LNET_NID_LO_0) { + rc = lnet_handle_lo_send(&send_data); + lnet_net_unlock(cpt); + return rc; + } + + /* + * find an existing peer_ni, or create one and mark it as having been + * created due to network traffic. This call will create the + * peer->peer_net->peer_ni tree. + */ + lpni = lnet_nid2peerni_locked(dst_nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + return PTR_ERR(lpni); + } + + /* + * Cache the original src_nid. If we need to resend the message + * then we'll need to know whether the src_nid was originally + * specified for this message. If it was originally specified, + * then we need to keep using the same src_nid since it's + * continuing the same sequence of messages. + */ + msg->msg_src_nid_param = src_nid; + + /* + * Now that we have a peer_ni, check if we want to discover + * the peer. Traffic to the LNET_RESERVED_PORTAL should not + * trigger discovery. + */ + peer = lpni->lpni_peer_net->lpn_peer; + if (lnet_msg_discovery(msg) && !lnet_peer_is_uptodate(peer)) { + lnet_nid_t primary_nid; + rc = lnet_discover_peer_locked(lpni, cpt, false); + if (rc) { + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + return rc; + } + /* The peer may have changed. 
*/ + peer = lpni->lpni_peer_net->lpn_peer; + spin_lock(&peer->lp_lock); + if (lnet_peer_is_uptodate_locked(peer)) { + spin_unlock(&peer->lp_lock); + } else { + /* queue message and return */ + msg->msg_rtr_nid_param = rtr_nid; + msg->msg_sending = 0; + list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); + primary_nid = peer->lp_primary_nid; + spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "%s pending discovery\n", + libcfs_nid2str(primary_nid)); + + return LNET_DC_WAIT; + } + } + lnet_peer_ni_decref_locked(lpni); + peer = lpni->lpni_peer_net->lpn_peer; + + /* + * Identify the different send cases + */ + if (src_nid == LNET_NID_ANY) + send_case |= SRC_ANY; + else + send_case |= SRC_SPEC; + + if (lnet_get_net_locked(LNET_NIDNET(dst_nid))) + send_case |= LOCAL_DST; + else + send_case |= REMOTE_DST; + + /* + * if this is a non-MR peer or if we're recovering a peer ni then + * let's consider this an NMR case so we can hit the destination + * NID. + */ + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery) + send_case |= NMR_DST; + else + send_case |= MR_DST; + + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK) + send_case |= SND_RESP; + + /* assign parameters to the send_data */ + send_data.sd_rtr_nid = rtr_nid; + send_data.sd_src_nid = src_nid; + send_data.sd_dst_nid = dst_nid; + send_data.sd_best_lpni = lpni; + /* + * keep a pointer to the final destination in case we're going to + * route, so we'll need to access it later + */ + send_data.sd_final_dst_lpni = lpni; + send_data.sd_peer = peer; + send_data.sd_md_cpt = md_cpt; + send_data.sd_send_case = send_case; + + rc = lnet_handle_send_case_locked(&send_data); + + /* + * Update the local cpt since send_data.sd_cpt might've been + * updated as a result of calling lnet_handle_send_case_locked(). + */ + cpt = send_data.sd_cpt; + + if (rc == REPEAT_SEND) + goto again; + + lnet_net_unlock(cpt); + + return rc; +} + +int +lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + int rc; + + /* + * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future + */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT(msg->msg_txpeer == NULL); + LASSERT(msg->msg_txni == NULL); + LASSERT(!msg->msg_sending); + LASSERT(!msg->msg_target_is_router); + LASSERT(!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); + if (rc < 0) + return rc; + + if (rc == LNET_CREDIT_OK) + lnet_ni_send(msg->msg_txni, msg); + + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT or LNET_DC_WAIT */ + return 0; +} + +enum lnet_mt_event_type { + MT_TYPE_LOCAL_NI = 0, + MT_TYPE_PEER_NI +}; + +struct lnet_mt_event_info { + enum lnet_mt_event_type mt_type; + lnet_nid_t mt_nid; +}; + +/* called with res_lock held */ +void +lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt) +{ + struct lnet_rsp_tracker *rspt; + + /* + * msg has a refcount on the MD so the MD is not going away. + * The rspt queue for the cpt is protected by + * the lnet_net_lock(cpt). cpt is the cpt of the MD cookie. 
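+	 *
+	 * Two cases are handled below: if the monitor thread has already
+	 * invalidated the handle, the tracker is sitting on the zombie
+	 * list and can be freed right here; otherwise we only invalidate
+	 * the handle and let the monitor thread do the actual cleanup.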
+ */ + if (!md->md_rspt_ptr) + return; + + rspt = md->md_rspt_ptr; + + /* debug code */ + LASSERT(rspt->rspt_cpt == cpt); + + md->md_rspt_ptr = NULL; + + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + /* + * The monitor thread has invalidated this handle because the + * response timed out, but it failed to lookup the MD. That + * means this response tracker is on the zombie list. We can + * safely remove it under the resource lock (held by caller) and + * free the response tracker block. + */ + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, cpt); + } else { + /* + * invalidate the handle to indicate that a response has been + * received, which will then lead the monitor thread to clean up + * the rspt block. + */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + } +} + +void +lnet_clean_zombie_rstqs(void) +{ + struct lnet_rsp_tracker *rspt, *tmp; + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + list_for_each_entry_safe(rspt, tmp, + the_lnet.ln_mt_zombie_rstqs[i], + rspt_on_list) { + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + } + } + + cfs_percpt_free(the_lnet.ln_mt_zombie_rstqs); +} + +static void +lnet_finalize_expired_responses(void) +{ + struct lnet_libmd *md; + struct list_head local_queue; + struct lnet_rsp_tracker *rspt, *tmp; + ktime_t now; + int i; + + if (the_lnet.ln_mt_rstq == NULL) + return; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + INIT_LIST_HEAD(&local_queue); + + lnet_net_lock(i); + if (!the_lnet.ln_mt_rstq[i]) { + lnet_net_unlock(i); + continue; + } + list_splice_init(the_lnet.ln_mt_rstq[i], &local_queue); + lnet_net_unlock(i); + + now = ktime_get(); + + list_for_each_entry_safe(rspt, tmp, &local_queue, rspt_on_list) { + /* + * The rspt mdh will be invalidated when a response + * is received or whenever we want to discard the + * block the monitor thread will walk the queue + * and clean up any rsts with an invalid mdh. + * The monitor thread will walk the queue until + * the first unexpired rspt block. This means that + * some rspt blocks which received their + * corresponding responses will linger in the + * queue until they are cleaned up eventually. + */ + lnet_res_lock(i); + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + lnet_res_unlock(i); + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + continue; + } + + if (ktime_compare(now, rspt->rspt_deadline) >= 0 || + the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) { + struct lnet_peer_ni *lpni; + lnet_nid_t nid; + + md = lnet_handle2md(&rspt->rspt_mdh); + if (!md) { + /* MD has been queued for unlink, but + * rspt hasn't been detached (Note we've + * checked above that the rspt_mdh is + * valid). Since we cannot lookup the MD + * we're unable to detach the rspt + * ourselves. Thus, move the rspt to the + * zombie list where we'll wait for + * either: + * 1. The remaining operations on the + * MD to complete. In this case the + * final operation will result in + * lnet_msg_detach_md()-> + * lnet_detach_rsp_tracker() where + * we will clean up this response + * tracker. + * 2. LNet to shutdown. In this case + * we'll wait until after all LND Nets + * have shutdown and then we can + * safely free any remaining response + * tracker blocks on the zombie list. + * Note: We need to hold the resource + * lock when adding to the zombie list + * because we may have concurrent access + * with lnet_detach_rsp_tracker(). 
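+				 *
+				 * To summarise the hand-off: the handle is
+				 * invalidated first so that
+				 * lnet_detach_rsp_tracker() can tell this
+				 * tracker has timed out, and the block is
+				 * parked on the per-CPT zombie list until
+				 * either the MD is finally detached or LNet
+				 * shuts down and lnet_clean_zombie_rstqs()
+				 * frees whatever is left.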
+ */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + list_move(&rspt->rspt_on_list, + the_lnet.ln_mt_zombie_rstqs[i]); + lnet_res_unlock(i); + continue; + } + LASSERT(md->md_rspt_ptr == rspt); + md->md_rspt_ptr = NULL; + lnet_res_unlock(i); + + LNetMDUnlink(rspt->rspt_mdh); + + nid = rspt->rspt_next_hop_nid; + + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + + /* If we're shutting down we just want to clean + * up the rspt blocks + */ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + continue; + + lnet_net_lock(i); + the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++; + lnet_net_unlock(i); + + CDEBUG(D_NET, + "Response timeout: md = %p: nid = %s\n", + md, libcfs_nid2str(nid)); + + /* + * If there is a timeout on the response + * from the next hop decrement its health + * value so that we don't use it + */ + lnet_net_lock(0); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lnet_handle_remote_failure_locked(lpni); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(0); + } else { + lnet_res_unlock(i); + break; + } + } + + if (!list_empty(&local_queue)) { + lnet_net_lock(i); + list_splice(&local_queue, the_lnet.ln_mt_rstq[i]); + lnet_net_unlock(i); + } + } +} + +static void +lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) +{ + struct lnet_msg *msg; + + while (!list_empty(resendq)) { + struct lnet_peer_ni *lpni; + + msg = list_entry(resendq->next, struct lnet_msg, + msg_list); + + list_del_init(&msg->msg_list); + + lpni = lnet_find_peer_ni_locked(msg->msg_hdr.dest_nid); + if (!lpni) { + lnet_net_unlock(cpt); + CERROR("Expected that a peer is already created for %s\n", + libcfs_nid2str(msg->msg_hdr.dest_nid)); + msg->msg_no_resend = true; + lnet_finalize(msg, -EFAULT); + lnet_net_lock(cpt); + } else { + struct lnet_peer *peer; + int rc; + lnet_nid_t src_nid = LNET_NID_ANY; + + /* + * if this message is not being routed and the + * peer is non-MR then we must use the same + * src_nid that was used in the original send. + * Otherwise if we're routing the message (IE + * we're a router) then we can use any of our + * local interfaces. It doesn't matter to the + * final destination. + */ + peer = lpni->lpni_peer_net->lpn_peer; + if (!msg->msg_routing && + !lnet_peer_is_multi_rail(peer)) + src_nid = le64_to_cpu(msg->msg_hdr.src_nid); + + /* + * If we originally specified a src NID, then we + * must attempt to reuse it in the resend as well. 
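+			 *
+			 * Note the precedence: an explicitly requested source
+			 * (msg_src_nid_param) overrides the NMR fallback chosen
+			 * above, and LNET_NID_ANY is only kept when neither
+			 * applies, leaving lnet_send() free to pick any local
+			 * interface.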
+ */ + if (msg->msg_src_nid_param != LNET_NID_ANY) + src_nid = msg->msg_src_nid_param; + lnet_peer_ni_decref_locked(lpni); + + lnet_net_unlock(cpt); + CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", + libcfs_nid2str(src_nid), + libcfs_id2str(msg->msg_target), + lnet_msgtyp2str(msg->msg_type), + msg->msg_recovery, + msg->msg_retry_count); + rc = lnet_send(src_nid, msg, LNET_NID_ANY); + if (rc) { + CERROR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + lnet_net_lock(cpt); + if (!rc) + the_lnet.ln_counters[cpt]->lct_health.lch_resend_count++; + } + } +} + +static void +lnet_resend_pending_msgs(void) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + lnet_resend_pending_msgs_locked(the_lnet.ln_mt_resendqs[i], i); + lnet_net_unlock(i); + } +} + +/* called with cpt and ni_lock held */ +static void +lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING || + force) { + recovery_mdh = ni->ni_ping_mdh; + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + } + lnet_ni_unlock(ni); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + lnet_ni_lock(ni); +} + +static void +lnet_recover_local_nis(void) +{ + struct lnet_mt_event_info *ev_info; + struct list_head processed_list; + struct list_head local_queue; + struct lnet_handle_md mdh; + struct lnet_ni *tmp; + struct lnet_ni *ni; + lnet_nid_t nid; + int healthv; + int rc; + + INIT_LIST_HEAD(&local_queue); + INIT_LIST_HEAD(&processed_list); + + /* + * splice the recovery queue on a local queue. We will iterate + * through the local queue and update it as needed. Once we're + * done with the traversal, we'll splice the local queue back on + * the head of the ln_mt_localNIRecovq. Any newly added local NIs + * will be traversed in the next iteration. + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_localNIRecovq, + &local_queue); + lnet_net_unlock(0); + + list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { + /* + * if an NI is being deleted or it is now healthy, there + * is no need to keep it around in the recovery queue. + * The monitor thread is the only thread responsible for + * removing the NI from the recovery queue. + * Multiple threads can be adding NIs to the recovery + * queue. + */ + healthv = atomic_read(&ni->ni_healthv); + + lnet_net_lock(0); + lnet_ni_lock(ni); + if (ni->ni_state != LNET_NI_STATE_ACTIVE || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&ni->ni_recovery); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, false); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + continue; + } + + /* + * if the local NI failed recovery we must unlink the md. + * But we want to keep the local_ni on the recovery queue + * so we can continue the attempts to recover it. 
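+		 *
+		 * i.e. a failed attempt only tears down the stale ping MD and
+		 * clears LNET_NI_RECOVERY_FAILED; the NI itself stays queued
+		 * and a fresh ping is issued further down on this same pass.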
+ */ + if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) { + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; + } + + lnet_ni_unlock(ni); + lnet_net_unlock(0); + + + CDEBUG(D_NET, "attempting to recover local ni: %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_lock(ni); + if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) { + ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nid2str(ni->ni_nid)); + lnet_ni_lock(ni); + ni->ni_recovery_state &= + ~LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + continue; + } + + mdh = ni->ni_ping_mdh; + /* + * Invalidate the ni mdh in case it's deleted. + * We'll unlink the mdh in this case below. + */ + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + nid = ni->ni_nid; + + /* + * remove the NI from the local queue and drop the + * reference count to it while we're recovering + * it. The reason for that, is that the NI could + * be deleted, and the way the code is structured + * is if we don't drop the NI, then the deletion + * code will enter a loop waiting for the + * reference count to be removed while holding the + * ln_mutex_lock(). When we look up the peer to + * send to in lnet_select_pathway() we will try to + * lock the ln_mutex_lock() as well, leading to + * a deadlock. By dropping the refcount and + * removing it from the list, we allow for the NI + * to be removed, then we use the cached NID to + * look it up again. If it's gone, then we just + * continue examining the rest of the queue. + */ + lnet_net_lock(0); + list_del_init(&ni->ni_recovery); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_LOCAL_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_eqh, true); + /* lookup the nid again */ + lnet_net_lock(0); + ni = lnet_nid2ni_locked(nid, 0); + if (!ni) { + /* + * the NI has been deleted when we dropped + * the ref count + */ + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } + /* + * Same note as in lnet_recover_peer_nis(). When + * we're sending the ping, the NI is free to be + * deleted or manipulated. By this point it + * could've been added back on the recovery queue, + * and a refcount taken on it. + * So we can't just add it blindly again or we'll + * corrupt the queue. We must check under lock if + * it's not on any list and if not then add it + * to the processed list, which will eventually be + * spliced back on to the recovery queue. + */ + ni->ni_ping_mdh = mdh; + if (list_empty(&ni->ni_recovery)) { + list_add_tail(&ni->ni_recovery, &processed_list); + lnet_ni_addref_locked(ni, 0); + } + lnet_net_unlock(0); + + lnet_ni_lock(ni); + if (rc) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + } + lnet_ni_unlock(ni); + } + + /* + * put back the remaining NIs on the ln_mt_localNIRecovq to be + * reexamined in the next iteration. 
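+	 *
+	 * list_splice() adds our entries without disturbing anything other
+	 * threads queued on ln_mt_localNIRecovq in the meantime, so newly
+	 * added NIs are picked up on the next pass as well.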
+ */ + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_localNIRecovq); + lnet_net_unlock(0); +} + +static int +lnet_resendqs_create(void) +{ + struct list_head **resendqs; + resendqs = lnet_create_array_of_queues(); + + if (!resendqs) + return -ENOMEM; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_resendqs = resendqs; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +static void +lnet_clean_local_ni_recoveryq(void) +{ + struct lnet_ni *ni; + + /* This is only called when the monitor thread has stopped */ + lnet_net_lock(0); + + while (!list_empty(&the_lnet.ln_mt_localNIRecovq)) { + ni = list_entry(the_lnet.ln_mt_localNIRecovq.next, + struct lnet_ni, ni_recovery); + list_del_init(&ni->ni_recovery); + lnet_ni_lock(ni); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + } + + lnet_net_unlock(0); +} + +static void +lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt, + bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) { + recovery_mdh = lpni->lpni_recovery_ping_mdh; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + } + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + spin_lock(&lpni->lpni_lock); +} + +static void +lnet_clean_peer_ni_recoveryq(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_mt_peerNIRecovq, + lpni_recovery) { + list_del_init(&lpni->lpni_recovery); + spin_lock(&lpni->lpni_lock); + lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_clean_resendqs(void) +{ + struct lnet_msg *msg, *tmp; + struct list_head msgs; + int i; + + INIT_LIST_HEAD(&msgs); + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); + lnet_net_unlock(i); + list_for_each_entry_safe(msg, tmp, &msgs, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ESHUTDOWN); + } + } + + cfs_percpt_free(the_lnet.ln_mt_resendqs); +} + +static void +lnet_recover_peer_nis(void) +{ + struct lnet_mt_event_info *ev_info; + struct list_head processed_list; + struct list_head local_queue; + struct lnet_handle_md mdh; + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *tmp; + lnet_nid_t nid; + int healthv; + int rc; + + INIT_LIST_HEAD(&local_queue); + INIT_LIST_HEAD(&processed_list); + + /* + * Always use cpt 0 for locking across all interactions with + * ln_mt_peerNIRecovq + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_peerNIRecovq, + &local_queue); + lnet_net_unlock(0); + + list_for_each_entry_safe(lpni, tmp, &local_queue, + lpni_recovery) { + /* + * The same protection strategy is used here as is in the + * local recovery case. 
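+		 *
+		 * That is: cpt 0 serialises access to the recovery queue, the
+		 * lpni spinlock protects lpni_state, and the peer NI is
+		 * dropped from the queue (with its ref released) while the
+		 * ping is in flight, then looked up again by NID afterwards.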
+ */ + lnet_net_lock(0); + healthv = atomic_read(&lpni->lpni_healthv); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_DELETING || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&lpni->lpni_recovery); + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + continue; + } + + /* + * If the peer NI has failed recovery we must unlink the + * md. But we want to keep the peer ni on the recovery + * queue so we can try to continue recovering it + */ + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) { + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED; + } + + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(0); + + /* + * NOTE: we're racing with peer deletion from user space. + * It's possible that a peer is deleted after we check its + * state. In this case the recovery can create a new peer + */ + spin_lock(&lpni->lpni_lock); + if (!(lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) && + !(lpni->lpni_state & LNET_PEER_NI_DELETING)) { + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nid2str(lpni->lpni_nid)); + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + continue; + } + + /* look at the comments in lnet_recover_local_nis() */ + mdh = lpni->lpni_recovery_ping_mdh; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + nid = lpni->lpni_nid; + lnet_net_lock(0); + list_del_init(&lpni->lpni_recovery); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_PEER_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_eqh, true); + lnet_net_lock(0); + /* + * lnet_find_peer_ni_locked() grabs a refcount for + * us. No need to take it explicitly. + */ + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } + + lpni->lpni_recovery_ping_mdh = mdh; + /* + * While we're unlocked the lpni could've been + * readded on the recovery queue. In this case we + * don't need to add it to the local queue, since + * it's already on there and the thread that added + * it would've incremented the refcount on the + * peer, which means we need to decref the refcount + * that was implicitly grabbed by find_peer_ni_locked. + * Otherwise, if the lpni is still not on + * the recovery queue, then we'll add it to the + * processed list. + */ + if (list_empty(&lpni->lpni_recovery)) + list_add_tail(&lpni->lpni_recovery, &processed_list); + else + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + + spin_lock(&lpni->lpni_lock); + if (rc) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + } + spin_unlock(&lpni->lpni_lock); + } + + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_peerNIRecovq); + lnet_net_unlock(0); +} + +static int +lnet_monitor_thread(void *arg) +{ + time64_t recovery_timeout = 0; + time64_t rsp_timeout = 0; + int interval; + time64_t now; + + /* + * The monitor thread takes care of the following: + * 1. Checks the aliveness of routers + * 2. Checks if there are messages on the resend queue to resend + * them. + * 3. 
Check if there are any NIs on the local recovery queue and + * pings them + * 4. Checks if there are any NIs on the remote recovery queue + * and pings them. + */ + cfs_block_allsigs(); + + while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { + now = ktime_get_real_seconds(); + + if (lnet_router_checker_active()) + lnet_check_routers(); + + lnet_resend_pending_msgs(); + + if (now >= rsp_timeout) { + lnet_finalize_expired_responses(); + rsp_timeout = now + (lnet_transaction_timeout / 2); + } + + if (now >= recovery_timeout) { + lnet_recover_local_nis(); + lnet_recover_peer_nis(); + recovery_timeout = now + lnet_recovery_interval; + } + + /* + * TODO do we need to check if we should sleep without + * timeout? Technically, an active system will always + * have messages in flight so this check will always + * evaluate to false. And on an idle system do we care + * if we wake up every 1 second? Although, we've seen + * cases where we get a complaint that an idle thread + * is waking up unnecessarily. + */ + interval = min(lnet_recovery_interval, + lnet_transaction_timeout / 2); + wait_event_interruptible_timeout(the_lnet.ln_mt_waitq, + false, + cfs_time_seconds(interval)); + } + + /* clean up the router checker */ + lnet_prune_rc_data(1); + + /* Shutting down */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); + + /* signal that the monitor thread is exiting */ + up(&the_lnet.ln_mt_signal); + + return 0; +} + +/* + * lnet_send_ping + * Sends a ping. + * Returns == 0 if success + * Returns > 0 if LNetMDBind or prior fails + * Returns < 0 if LNetGet fails + */ +int +lnet_send_ping(lnet_nid_t dest_nid, + struct lnet_handle_md *mdh, int nnis, + void *user_data, struct lnet_handle_eq eqh, bool recovery) +{ + struct lnet_md md = { NULL }; + struct lnet_process_id id; + struct lnet_ping_buffer *pbuf; + int rc; + + if (dest_nid == LNET_NID_ANY) { + rc = -EHOSTUNREACH; + goto fail_error; + } + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + rc = ENOMEM; + goto fail_error; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(nnis); + md.threshold = 2; /* GET/REPLY */ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = user_data; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind MD: %d\n", rc); + rc = -rc; /* change the rc to positive */ + goto fail_error; + } + id.pid = LNET_PID_LUSTRE; + id.nid = dest_nid; + + rc = LNetGet(LNET_NID_ANY, *mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, recovery); + + if (rc) + goto fail_unlink_md; + + return 0; + +fail_unlink_md: + LNetMDUnlink(*mdh); + LNetInvalidateMDHandle(mdh); +fail_error: + return rc; +} + +static void +lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, + int status, bool unlink_event) +{ + lnet_nid_t nid = ev_info->mt_nid; + + if (ev_info->mt_type == MT_TYPE_LOCAL_NI) { + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid2ni_locked(nid, 0); + if (!ni) { + lnet_net_unlock(0); + return; + } + lnet_ni_lock(ni); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (status) + ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; + lnet_ni_unlock(ni); + lnet_net_unlock(0); + + if (status != 0) { + CERROR("local NI (%s) recovery failed with %d\n", + libcfs_nid2str(nid), status); + return; + } + /* + * need to increment healthv for the ni here, because in + * the lnet_finalize() path we don't 
have access to this + * NI. And in order to get access to it, we'll need to + * carry forward too much information. + * In the peer case, it'll naturally be incremented + */ + if (!unlink_event) + lnet_inc_healthv(&ni->ni_healthv); + } else { + struct lnet_peer_ni *lpni; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(cpt); + return; + } + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (status) + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + if (status != 0) + CERROR("peer NI (%s) recovery failed with %d\n", + libcfs_nid2str(nid), status); + } +} + +void +lnet_mt_event_handler(struct lnet_event *event) +{ + struct lnet_mt_event_info *ev_info = event->md.user_ptr; + struct lnet_ping_buffer *pbuf; + + /* TODO: remove assert */ + LASSERT(event->type == LNET_EVENT_REPLY || + event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_UNLINK); + + CDEBUG(D_NET, "Received event: %d status: %d\n", event->type, + event->status); + + switch (event->type) { + case LNET_EVENT_UNLINK: + CDEBUG(D_NET, "%s recovery ping unlinked\n", + libcfs_nid2str(ev_info->mt_nid)); + fallthrough; + case LNET_EVENT_REPLY: + lnet_handle_recovery_reply(ev_info, event->status, + event->type == LNET_EVENT_UNLINK); + break; + case LNET_EVENT_SEND: + CDEBUG(D_NET, "%s recovery message sent %s:%d\n", + libcfs_nid2str(ev_info->mt_nid), + (event->status) ? "unsuccessfully" : + "successfully", event->status); + break; + default: + CERROR("Unexpected event: %d\n", event->type); + break; + } + if (event->unlinked) { + LIBCFS_FREE(ev_info, sizeof(*ev_info)); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + lnet_ping_buffer_decref(pbuf); + } +} + +static int +lnet_rsp_tracker_create(void) +{ + struct list_head **rstqs; + rstqs = lnet_create_array_of_queues(); + + if (!rstqs) + return -ENOMEM; + + the_lnet.ln_mt_rstq = rstqs; + + return 0; +} + +static void +lnet_rsp_tracker_clean(void) +{ + lnet_finalize_expired_responses(); + + cfs_percpt_free(the_lnet.ln_mt_rstq); + the_lnet.ln_mt_rstq = NULL; +} + +int lnet_monitor_thr_start(void) +{ + int rc = 0; + struct task_struct *task; + + if (the_lnet.ln_mt_state != LNET_MT_STATE_SHUTDOWN) + return -EALREADY; + + rc = lnet_resendqs_create(); + if (rc) + return rc; + + rc = lnet_rsp_tracker_create(); + if (rc) + goto clean_queues; + + /* Pre monitor thread start processing */ + rc = lnet_router_pre_mt_start(); + if (rc) + goto free_mem; + + sema_init(&the_lnet.ln_mt_signal, 0); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start monitor thread: %d\n", rc); + goto clean_thread; + } + + /* post monitor thread start processing */ + lnet_router_post_mt_start(); + + return 0; + +clean_thread: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + /* block until event callback signals exit */ + down(&the_lnet.ln_mt_signal); + /* clean up */ + lnet_router_cleanup(); +free_mem: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + 
LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + return rc; +clean_queues: + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + return rc; +} + +void lnet_monitor_thr_stop(void) +{ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + + /* tell the monitor thread that we're shutting down */ + wake_up(&the_lnet.ln_mt_waitq); + + /* block until monitor thread signals that it's done */ + down(&the_lnet.ln_mt_signal); + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); + + /* perform cleanup tasks */ + lnet_router_cleanup(); + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + + return; +} + +void +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, + __u32 msg_type) +{ + lnet_net_lock(cpt); + lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += nob; + lnet_net_unlock(cpt); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); +} + +static void +lnet_recv_put(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + + if (msg->msg_wanted != 0) + lnet_setpayloadbuffer(msg); + + lnet_build_msg_event(msg, LNET_EVENT_PUT); + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, + msg->msg_offset, msg->msg_wanted, hdr->payload_length); +} + +static int +lnet_parse_put(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_match_info info; + int rc; + bool ready_delay; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + /* Primary peer NID. 
*/ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + info.mi_cpt = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni); + + msg->msg_rx_ready_delay = ni->ni_net->net_lnd->lnd_eager_recv == NULL; + ready_delay = msg->msg_rx_ready_delay; + + again: + rc = lnet_ptl_match_md(&info, msg); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + lnet_recv_put(ni, msg); + return 0; + + case LNET_MATCHMD_NONE: + if (ready_delay) + /* no eager_recv or has already called it, should + * have been attached on delayed list */ + return 0; + + rc = lnet_ni_eager_recv(ni, msg); + if (rc == 0) { + ready_delay = true; + goto again; + } + fallthrough; + + case LNET_MATCHMD_DROP: + CNETERR("Dropping PUT from %s portal %d match %llu" + " offset %d length %d: %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); + + return -ENOENT; /* -ve: OK but no match */ + } +} + +static int +lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) +{ + struct lnet_match_info info; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id source_id; + struct lnet_handle_wire reply_wmd; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + source_id.nid = hdr->src_nid; + source_id.pid = hdr->src_pid; + /* Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_GET; + info.mi_portal = hdr->msg.get.ptl_index; + info.mi_rlength = hdr->msg.get.sink_length; + info.mi_roffset = hdr->msg.get.src_offset; + info.mi_mbits = hdr->msg.get.match_bits; + info.mi_cpt = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni); + + rc = lnet_ptl_match_md(&info, msg); + if (rc == LNET_MATCHMD_DROP) { + CNETERR("Dropping GET from %s portal %d match %llu" + " offset %d length %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength); + return -ENOENT; /* -ve: OK but no match */ + } + + LASSERT(rc == LNET_MATCHMD_OK); + + lnet_build_msg_event(msg, LNET_EVENT_GET); + + reply_wmd = hdr->msg.get.return_wmd; + + lnet_prep_send(msg, LNET_MSG_REPLY, source_id, + msg->msg_offset, msg->msg_wanted); + + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } + + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nid2str(ni->ni_nid), + libcfs_id2str(info.mi_id), rc); + + lnet_finalize(msg, rc); + } + + return 0; +} + +static int +lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) +{ + void *private = msg->msg_private; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id src = {0}; + struct lnet_libmd *md; + int rlength; + int mlength; + int cpt; + + cpt = 
lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s " + "MD %#llx.%#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return -ENOENT; /* -ve: OK but no match */ + } + + LASSERT(md->md_offset == 0); + + rlength = hdr->payload_length; + mlength = MIN(rlength, (int)md->md_length); + + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d " + "for MD %#llx would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + lnet_res_unlock(cpt); + return -ENOENT; /* -ve: OK but no match */ + } + + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, mlength); + + if (mlength != 0) + lnet_setpayloadbuffer(msg); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; +} + +static int +lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id src = {0}; + struct lnet_libmd *md; + int cpt; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + + cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return -ENOENT; /* -ve! 
*/ + } + + CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + hdr->msg.ack.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_ACK); + + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; +} + +/** + * \retval LNET_CREDIT_OK If \a msg is forwarded + * \retval LNET_CREDIT_WAIT If \a msg is blocked because w/o buffer + * \retval -ve error code + */ +int +lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc = 0; + + if (!the_lnet.ln_routing) + return -ECANCELED; + + if (msg->msg_rxpeer->lpni_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + if (ni->ni_net->net_lnd->lnd_eager_recv == NULL) { + msg->msg_rx_ready_delay = 1; + } else { + lnet_net_unlock(msg->msg_rx_cpt); + rc = lnet_ni_eager_recv(ni, msg); + lnet_net_lock(msg->msg_rx_cpt); + } + } + + if (rc == 0) + rc = lnet_post_routed_recv_locked(msg, 0); + return rc; +} + +int +lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc; + + switch (msg->msg_type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); + break; + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); + break; + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, msg->msg_rdma_get); + break; + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); + break; + default: /* prevent an unused label if !kernel */ + LASSERT(0); + return -EPROTO; + } + + LASSERT(rc == 0 || rc == -ENOENT); + return rc; +} + +char * +lnet_msgtyp2str (int type) +{ + switch (type) { + case LNET_MSG_ACK: + return ("ACK"); + case LNET_MSG_PUT: + return ("PUT"); + case LNET_MSG_GET: + return ("GET"); + case LNET_MSG_REPLY: + return ("REPLY"); + case LNET_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } +} + +void +lnet_print_hdr(struct lnet_hdr *hdr) +{ + struct lnet_process_id src = { + .nid = hdr->src_nid, + .pid = hdr->src_pid, + }; + struct lnet_process_id dst = { + .nid = hdr->dest_nid, + .pid = hdr->dest_pid, + }; + char *type_str = lnet_msgtyp2str(hdr->type); + + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From %s\n", libcfs_id2str(src)); + CWARN(" To %s\n", libcfs_id2str(dst)); + + switch (hdr->type) { + default: + break; + + case LNET_MSG_PUT: + CWARN(" Ptl index %d, ack md %#llx.%#llx, " + "match bits %llu\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data %#llx\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case LNET_MSG_GET: + CWARN(" Ptl index %d, return md %#llx.%#llx, " + "match bits %llu\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case LNET_MSG_ACK: + CWARN(" dst md %#llx.%#llx, " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case LNET_MSG_REPLY: + CWARN(" dst md %#llx.%#llx, " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); + } + +} + +int +lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, + void *private, int rdma_req) +{ + int rc = 0; + int cpt; + int 
for_me; + struct lnet_msg *msg; + lnet_pid_t dest_pid; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + struct lnet_peer_ni *lpni; + __u32 payload_length; + __u32 type; + + LASSERT (!in_interrupt ()); + + type = le32_to_cpu(hdr->type); + src_nid = le64_to_cpu(hdr->src_nid); + dest_nid = le64_to_cpu(hdr->dest_nid); + dest_pid = le32_to_cpu(hdr->dest_pid); + payload_length = le32_to_cpu(hdr->payload_length); + + for_me = (ni->ni_nid == dest_nid); + cpt = lnet_cpt_of_nid(from_nid, ni); + + CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s - %s\n", + libcfs_nid2str(dest_nid), + libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + (for_me) ? "for me" : "routed"); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > + (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d " + "(%d max expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), type); + return -EPROTO; + } + + if (the_lnet.ln_routing && + ni->ni_last_alive != ktime_get_real_seconds()) { + /* NB: so far here is the only place to set NI status to "up */ + lnet_ni_lock(ni); + ni->ni_last_alive = ktime_get_real_seconds(); + if (ni->ni_status != NULL && + ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) + ni->ni_status->ns_status = LNET_NI_STATUS_UP; + lnet_ni_unlock(ni); + } + + /* Regard a bad destination NID as a protocol error. Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { + /* should have gone direct */ + CERROR("%s, src %s: Bad dest nid %s " + "(should have been sent direct)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (lnet_islocalnid(dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR("%s, src %s: Bad dest nid %s " + "(it's my nid but on a different network)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR("%s, src %s: Bad optimized GET for %s " + "(final destination must be me)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR("%s, src %s: Dropping message for %s " + "(routing not enabled)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + goto drop; + } + } + + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(src_nid, 0)) { /* shall we now? 
*/ + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + if (!list_empty(&the_lnet.ln_drop_rules) && + lnet_drop_rule_match(hdr, NULL)) { + CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate" + "silent message loss\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid), lnet_msgtyp2str(type)); + goto drop; + } + + if (lnet_drop_asym_route && for_me && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + struct lnet_net *net; + struct lnet_remotenet *rnet; + bool found = true; + + /* we are dealing with a routed message, + * so see if route to reach src_nid goes through from_nid + */ + lnet_net_lock(cpt); + net = lnet_get_net_locked(LNET_NIDNET(ni->ni_nid)); + if (!net) { + lnet_net_unlock(cpt); + CERROR("net %s not found\n", + libcfs_net2str(LNET_NIDNET(ni->ni_nid))); + return -EPROTO; + } + + rnet = lnet_find_rnet_locked(LNET_NIDNET(src_nid)); + if (rnet) { + struct lnet_peer_ni *gw = NULL; + struct lnet_route *route; + + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + found = false; + gw = route->lr_gateway; + if (gw->lpni_net != net) + continue; + if (gw->lpni_nid == from_nid) { + found = true; + break; + } + } + } + lnet_net_unlock(cpt); + if (!found) { + /* we would not use from_nid to route a message to + * src_nid + * => asymmetric routing detected but forbidden + */ + CERROR("%s, src %s: Dropping asymmetrical route %s\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); + goto drop; + } + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, + * pointers NULL etc */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_rdma_get = rdma_req; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + /* for building message event */ + msg->msg_from = from_nid; + if (!for_me) { + msg->msg_target.pid = dest_pid; + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + + } else { + /* convert common msg->hdr fields to host byteorder */ + msg->msg_hdr.type = type; + msg->msg_hdr.src_nid = src_nid; + msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); + msg->msg_hdr.dest_nid = dest_nid; + msg->msg_hdr.dest_pid = dest_pid; + msg->msg_hdr.payload_length = payload_length; + } + + lnet_net_lock(cpt); + lpni = lnet_nid2peerni_locked(from_nid, ni->ni_nid, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + CERROR("%s, src %s: Dropping %s " + "(error %ld looking up sender)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), PTR_ERR(lpni)); + lnet_msg_free(msg); + if (rc == -ESHUTDOWN) + /* We are shutting down. Don't do anything more */ + return 0; + goto drop; + } + msg->msg_rxpeer = lpni; + msg->msg_rxni = ni; + lnet_ni_addref_locked(ni, cpt); + /* Multi-Rail: Primary NID of source. */ + msg->msg_initiator = lnet_peer_primary_nid_locked(src_nid); + + if (lnet_isrouter(msg->msg_rxpeer)) { + lnet_peer_set_alive(msg->msg_rxpeer); + if (avoid_asym_router_failure && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + /* received a remote message from router, update + * remote NI status on this router. + * NB: multi-hop routed message will be ignored. 
+ */ + lnet_router_ni_update_locked(msg->msg_rxpeer, + LNET_NIDNET(src_nid)); + } + } + + lnet_msg_commit(msg, cpt); + + /* message delay simulation */ + if (unlikely(!list_empty(&the_lnet.ln_delay_rules) && + lnet_delay_rule_match_locked(hdr, msg))) { + lnet_net_unlock(cpt); + return 0; + } + + if (!for_me) { + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + if (rc < 0) + goto free_drop; + + if (rc == LNET_CREDIT_OK) { + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + } + return 0; + } + + lnet_net_unlock(cpt); + + rc = lnet_parse_local(ni, msg); + if (rc != 0) + goto free_drop; + return 0; + + free_drop: + LASSERT(msg->msg_md == NULL); + lnet_finalize(msg, rc); + + drop: + lnet_drop_message(ni, cpt, private, payload_length, type); + return 0; +} +EXPORT_SYMBOL(lnet_parse); + +void +lnet_drop_delayed_msg_list(struct list_head *head, char *reason) +{ + while (!list_empty(head)) { + struct lnet_process_id id = {0}; + struct lnet_msg *msg; + + msg = list_entry(head->next, struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_md == NULL); + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match %llu" + " offset %d length %d: %s\n", + libcfs_id2str(id), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, + msg->msg_private, msg->msg_len, + msg->msg_type); + + msg->msg_no_resend = true; + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. + */ + lnet_finalize(msg, -ENOENT); + } +} + +void +lnet_recv_delayed_msg_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct lnet_msg *msg; + struct lnet_process_id id; + + msg = list_entry(head->next, struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + /* md won't disappear under me, since each msg + * holds a ref on it */ + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_md != NULL); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_rxni != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match %llu offset %d length %d.\n", + libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length); + + lnet_recv_put(msg->msg_rxni, msg); + } +} + +static void +lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, + struct lnet_libmd *md, struct lnet_handle_md mdh) +{ + s64 timeout_ns; + bool new_entry = true; + struct lnet_rsp_tracker *local_rspt; + + /* + * MD has a refcount taken by message so it's not going away. + * The MD however can be looked up. We need to secure the access + * to the md_rspt_ptr by taking the res_lock. + * The rspt can be accessed without protection up to when it gets + * added to the list. 
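+ *
+ * Note the lock ordering below: lnet_res_lock(cpt) is taken first,
+ * then lnet_net_lock(cpt) is held around the list_add_tail() onto
+ * ln_mt_rstq[cpt], and the two locks are dropped in the reverse
+ * order.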
+ */ + + lnet_res_lock(cpt); + local_rspt = md->md_rspt_ptr; + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + if (local_rspt != NULL) { + /* + * we already have an rspt attached to the md, so we'll + * update the deadline on that one. + */ + LIBCFS_FREE(rspt, sizeof(*rspt)); + new_entry = false; + } else { + /* new md */ + rspt->rspt_mdh = mdh; + rspt->rspt_cpt = cpt; + /* store the rspt so we can access it when we get the REPLY */ + md->md_rspt_ptr = rspt; + local_rspt = rspt; + } + local_rspt->rspt_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + /* + * add to the list of tracked responses. It's added to tail of the + * list in order to expire all the older entries first. + */ + lnet_net_lock(cpt); + if (!new_entry && !list_empty(&local_rspt->rspt_on_list)) + list_del_init(&local_rspt->rspt_on_list); + list_add_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); + lnet_net_unlock(cpt); + lnet_res_unlock(cpt); +} + +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. + * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see struct lnet_event::hdr_data and lnet_event_kind_t. 
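+ *
+ * Illustrative sketch only; 'my_mdh', 'peer_nid', 'MY_PORTAL' and
+ * 'MY_MATCH_BITS' are hypothetical caller-side names, not defined by
+ * this code. A caller that has bound a free-floating MD with
+ * LNetMDBind() might request an acknowledged PUT roughly as follows:
+ *
+ *	struct lnet_process_id tgt;
+ *	int rc;
+ *
+ *	tgt.nid = peer_nid;
+ *	tgt.pid = LNET_PID_LUSTRE;
+ *	rc = LNetPut(LNET_NID_ANY, my_mdh, LNET_ACK_REQ, tgt,
+ *		     MY_PORTAL, MY_MATCH_BITS, 0, 0);
+ *	if (rc != 0)
+ *		CERROR("PUT to %s failed: %d\n", libcfs_id2str(tgt), rc);
+ *
+ * Completion is then signalled by LNET_EVENT_SEND (and LNET_EVENT_ACK
+ * when the target MD enables acknowledgments) on the MD's EQ.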
+ */ +int +LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, + struct lnet_process_id target, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + struct lnet_rsp_tracker *rspt = NULL; + + LASSERT(the_lnet.ln_refcount > 0); + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(target.nid, 1)) { /* shall we now? */ + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on struct lnet_msg\n", + libcfs_id2str(target)); + return -ENOMEM; + } + msg->msg_vmflush = !!memory_pressure_get(); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + + if (ack == LNET_ACK_REQ) { + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping PUT to %s: ENOMEM on response tracker\n", + libcfs_id2str(target)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + } + + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + lnet_res_unlock(cpt); + + LIBCFS_FREE(rspt, sizeof(*rspt)); + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; + + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + } else { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + } + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + if (ack == LNET_ACK_REQ) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc != 0) { + CNETERR("Error sending PUT to %s: %d\n", + libcfs_id2str(target), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetPut); + +/* + * The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. 
+ * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first + */ +struct lnet_msg * +lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) +{ + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + struct lnet_process_id peer_id = getmsg->msg_target; + int cpt; + + LASSERT(!getmsg->msg_target_is_router); + LASSERT(!getmsg->msg_routing); + + if (msg == NULL) { + CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); + goto drop; + } + + cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); + lnet_res_lock(cpt); + + LASSERT(getmd->md_refcount > 0); + + if (getmd->md_threshold == 0) { + CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), + getmd); + lnet_res_unlock(cpt); + goto drop; + } + + LASSERT(getmd->md_offset == 0); + + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); + + /* setup information for lnet_build_msg_event */ + msg->msg_initiator = getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid; + msg->msg_from = peer_id.nid; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ + msg->msg_hdr.src_nid = peer_id.nid; + msg->msg_hdr.payload_length = getmd->md_length; + msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ + + lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); + lnet_res_unlock(cpt); + + cpt = lnet_cpt_of_nid(peer_id.nid, ni); + + lnet_net_lock(cpt); + lnet_msg_commit(msg, cpt); + lnet_net_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + return msg; + + drop: + cpt = lnet_cpt_of_nid(peer_id.nid, ni); + + lnet_net_lock(cpt); + lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + getmd->md_length; + lnet_net_unlock(cpt); + + if (msg != NULL) + lnet_msg_free(msg); + + return NULL; +} +EXPORT_SYMBOL(lnet_create_reply_msg); + +void +lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *reply, + unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT(reply != NULL); + LASSERT(reply->msg_type == LNET_MSG_GET); + LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT(len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} +EXPORT_SYMBOL(lnet_set_reply_msg_len); + +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. + * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. 
+ * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + */ +int +LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, + struct lnet_process_id target, unsigned int portal, + __u64 match_bits, unsigned int offset, bool recovery) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + struct lnet_rsp_tracker *rspt; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(target.nid, 1)) /* shall we now? */ + { + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (!msg) { + CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", + libcfs_id2str(target)); + return -ENOMEM; + } + + cpt = lnet_cpt_of_cookie(mdh.cookie); + + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping GET to %s: ENOMEM on response tracker\n", + libcfs_id2str(target)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + + msg->msg_recovery = recovery; + + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + LIBCFS_FREE(rspt, sizeof(*rspt)); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); + + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc < 0) { + CNETERR("Error sending GET to %s: %d\n", + libcfs_id2str(target), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetGet); + +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. 
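+ *
+ * Illustrative sketch only; 'dest' is a hypothetical caller-side NID.
+ * A caller probing reachability might use it like this:
+ *
+ *	lnet_nid_t src_nid;
+ *	__u32 order;
+ *	int dist;
+ *
+ *	dist = LNetDist(dest, &src_nid, &order);
+ *	if (dist < 0)
+ *		CERROR("%s unreachable: %d\n", libcfs_nid2str(dest), dist);
+ *
+ * A result of 0 means 'dest' is a local interface (with the default
+ * local_nid_dist_zero), and a positive result is the hop count plus
+ * one, with the suggested local interface saved in 'src_nid'.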
+ */ +int +LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) +{ + struct list_head *e; + struct lnet_ni *ni = NULL; + struct lnet_remotenet *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + if (ni->ni_nid == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (dstnid == LNET_NID_LO_0) + *orderp = 0; + else + *orderp = 1; + } + lnet_net_unlock(cpt); + + return local_nid_dist_zero ? 0 : 1; + } + + if (LNET_NIDNET(ni->ni_nid) == dstnet) { + /* Check if ni was originally created in + * current net namespace. + * If not, assign order above 0xffff0000, + * to make this ni not a priority. */ + if (current->nsproxy && + !net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) + order += 0xffff0000; + if (srcnidp != NULL) + *srcnidp = ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return 1; + } + + order++; + } + + rn_list = lnet_net2rnethash(dstnet); + list_for_each(e, rn_list) { + rnet = list_entry(e, struct lnet_remotenet, lrn_list); + + if (rnet->lrn_net == dstnet) { + struct lnet_route *route; + struct lnet_route *shortest = NULL; + __u32 shortest_hops = LNET_UNDEFINED_HOPS; + __u32 route_hops; + + LASSERT(!list_empty(&rnet->lrn_routes)); + + list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { + route_hops = route->lr_hops; + if (route_hops == LNET_UNDEFINED_HOPS) + route_hops = 1; + if (shortest == NULL || + route_hops < shortest_hops) { + shortest = route; + shortest_hops = route_hops; + } + } + + LASSERT(shortest != NULL); + hops = shortest_hops; + if (srcnidp != NULL) { + ni = lnet_get_next_ni_locked( + shortest->lr_gateway->lpni_net, + NULL); + *srcnidp = ni->ni_nid; + } + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return hops + 1; + } + order++; + } + + lnet_net_unlock(cpt); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(LNetDist); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c new file mode 100644 index 0000000000000..959c370d2d4da --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c @@ -0,0 +1,1198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-msg.c + * + * Message decoding, parsing and finalizing routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +void +lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev) +{ + ENTRY; + + memset(ev, 0, sizeof(*ev)); + + ev->status = 0; + ev->unlinked = 1; + ev->type = LNET_EVENT_UNLINK; + lnet_md_deconstruct(md, &ev->md); + lnet_md2handle(&ev->md_handle, md); + EXIT; +} + +/* + * Don't need any lock, must be called after lnet_commit_md + */ +void +lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(!msg->msg_routing); + + ev->type = ev_type; + ev->msg_type = msg->msg_type; + + if (ev_type == LNET_EVENT_SEND) { + /* event for active message */ + ev->target.nid = le64_to_cpu(hdr->dest_nid); + ev->target.pid = le32_to_cpu(hdr->dest_pid); + ev->initiator.nid = LNET_NID_ANY; + ev->initiator.pid = the_lnet.ln_pid; + ev->source.nid = LNET_NID_ANY; + ev->source.pid = the_lnet.ln_pid; + ev->sender = LNET_NID_ANY; + } else { + /* event for passive message */ + ev->target.pid = hdr->dest_pid; + ev->target.nid = hdr->dest_nid; + ev->initiator.pid = hdr->src_pid; + /* Multi-Rail: resolve src_nid to "primary" peer NID */ + ev->initiator.nid = msg->msg_initiator; + /* Multi-Rail: track source NID. */ + ev->source.pid = hdr->src_pid; + ev->source.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; + ev->sender = msg->msg_from; + ev->mlength = msg->msg_wanted; + ev->offset = msg->msg_offset; + } + + switch (ev_type) { + default: + LBUG(); + + case LNET_EVENT_PUT: /* passive PUT */ + ev->pt_index = hdr->msg.put.ptl_index; + ev->match_bits = hdr->msg.put.match_bits; + ev->hdr_data = hdr->msg.put.hdr_data; + return; + + case LNET_EVENT_GET: /* passive GET */ + ev->pt_index = hdr->msg.get.ptl_index; + ev->match_bits = hdr->msg.get.match_bits; + ev->hdr_data = 0; + return; + + case LNET_EVENT_ACK: /* ACK */ + ev->match_bits = hdr->msg.ack.match_bits; + ev->mlength = hdr->msg.ack.mlength; + return; + + case LNET_EVENT_REPLY: /* REPLY */ + return; + + case LNET_EVENT_SEND: /* active message */ + if (msg->msg_type == LNET_MSG_PUT) { + ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); + ev->offset = le32_to_cpu(hdr->msg.put.offset); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->payload_length); + ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); + + } else { + LASSERT(msg->msg_type == LNET_MSG_GET); + ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); + ev->offset = le32_to_cpu(hdr->msg.get.src_offset); + ev->hdr_data = 0; + } + return; + } +} + +void +lnet_msg_commit(struct lnet_msg *msg, int cpt) +{ + struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; + struct lnet_counters_common *common; + s64 timeout_ns; + + /* set the message deadline */ + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + msg->msg_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + /* routed message can be committed for both receiving and sending */ + LASSERT(!msg->msg_tx_committed); + + if (msg->msg_sending) { + LASSERT(!msg->msg_receiving); + msg->msg_tx_cpt = cpt; + msg->msg_tx_committed = 1; + if (msg->msg_rx_committed) { /* routed message REPLY 
*/ + LASSERT(msg->msg_onactivelist); + return; + } + } else { + LASSERT(!msg->msg_sending); + msg->msg_rx_cpt = cpt; + msg->msg_rx_committed = 1; + } + + LASSERT(!msg->msg_onactivelist); + + msg->msg_onactivelist = 1; + list_add_tail(&msg->msg_activelist, &container->msc_active); + + common = &the_lnet.ln_counters[cpt]->lct_common; + common->lcc_msgs_alloc++; + if (common->lcc_msgs_alloc > common->lcc_msgs_max) + common->lcc_msgs_max = common->lcc_msgs_alloc; +} + +static void +lnet_msg_decommit_tx(struct lnet_msg *msg, int status) +{ + struct lnet_counters_common *common; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(msg->msg_tx_committed); + if (status != 0) + goto out; + + common = &(the_lnet.ln_counters[msg->msg_tx_cpt]->lct_common); + switch (ev->type) { + default: /* routed message */ + LASSERT(msg->msg_routing); + LASSERT(msg->msg_rx_committed); + LASSERT(ev->type == 0); + + common->lcc_route_length += msg->msg_len; + common->lcc_route_count++; + goto incr_stats; + + case LNET_EVENT_PUT: + /* should have been decommitted */ + LASSERT(!msg->msg_rx_committed); + /* overwritten while sending ACK */ + LASSERT(msg->msg_type == LNET_MSG_ACK); + msg->msg_type = LNET_MSG_PUT; /* fix type */ + break; + + case LNET_EVENT_SEND: + LASSERT(!msg->msg_rx_committed); + if (msg->msg_type == LNET_MSG_PUT) + common->lcc_send_length += msg->msg_len; + break; + + case LNET_EVENT_GET: + LASSERT(msg->msg_rx_committed); + /* overwritten while sending reply, we should never be + * here for optimized GET */ + LASSERT(msg->msg_type == LNET_MSG_REPLY); + msg->msg_type = LNET_MSG_GET; /* fix type */ + break; + } + + common->lcc_send_count++; + +incr_stats: + if (msg->msg_txpeer) + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); + if (msg->msg_txni) + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); + out: + lnet_return_tx_credits_locked(msg); + msg->msg_tx_committed = 0; +} + +static void +lnet_msg_decommit_rx(struct lnet_msg *msg, int status) +{ + struct lnet_counters_common *common; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ + LASSERT(msg->msg_rx_committed); + + if (status != 0) + goto out; + + common = &(the_lnet.ln_counters[msg->msg_rx_cpt]->lct_common); + switch (ev->type) { + default: + LASSERT(ev->type == 0); + LASSERT(msg->msg_routing); + goto incr_stats; + + case LNET_EVENT_ACK: + LASSERT(msg->msg_type == LNET_MSG_ACK); + break; + + case LNET_EVENT_GET: + /* type is "REPLY" if it's an optimized GET on passive side, + * because optimized GET will never be committed for sending, + * so message type wouldn't be changed back to "GET" by + * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ + LASSERT(msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_GET); + common->lcc_send_length += msg->msg_wanted; + break; + + case LNET_EVENT_PUT: + LASSERT(msg->msg_type == LNET_MSG_PUT); + break; + + case LNET_EVENT_REPLY: + /* type is "GET" if it's an optimized GET on active side, + * see details in lnet_create_reply_msg() */ + LASSERT(msg->msg_type == LNET_MSG_GET || + msg->msg_type == LNET_MSG_REPLY); + break; + } + + common->lcc_recv_count++; + +incr_stats: + if (msg->msg_rxpeer) + lnet_incr_stats(&msg->msg_rxpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); + if (msg->msg_rxni) + lnet_incr_stats(&msg->msg_rxni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); + if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) + 
common->lcc_recv_length += msg->msg_wanted; + + out: + lnet_return_rx_credits_locked(msg); + msg->msg_rx_committed = 0; +} + +void +lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status) +{ + int cpt2 = cpt; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + LASSERT(msg->msg_onactivelist); + + if (msg->msg_tx_committed) { /* always decommit for sending first */ + LASSERT(cpt == msg->msg_tx_cpt); + lnet_msg_decommit_tx(msg, status); + } + + if (msg->msg_rx_committed) { + /* forwarding msg committed for both receiving and sending */ + if (cpt != msg->msg_rx_cpt) { + lnet_net_unlock(cpt); + cpt2 = msg->msg_rx_cpt; + lnet_net_lock(cpt2); + } + lnet_msg_decommit_rx(msg, status); + } + + list_del(&msg->msg_activelist); + msg->msg_onactivelist = 0; + + the_lnet.ln_counters[cpt2]->lct_common.lcc_msgs_alloc--; + + if (cpt2 != cpt) { + lnet_net_unlock(cpt2); + lnet_net_lock(cpt); + } +} + +void +lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, + unsigned int offset, unsigned int mlen) +{ + /* NB: @offset and @len are only useful for receiving */ + /* Here, we attach the MD on lnet_msg and mark it busy and + * decrementing its threshold. Come what may, the lnet_msg "owns" + * the MD until a call to lnet_msg_detach_md or lnet_finalize() + * signals completion. */ + LASSERT(!msg->msg_routing); + + msg->msg_md = md; + if (msg->msg_receiving) { /* committed for receiving */ + msg->msg_offset = offset; + msg->msg_wanted = mlen; + } + + md->md_refcount++; + if (md->md_threshold != LNET_MD_THRESH_INF) { + LASSERT(md->md_threshold > 0); + md->md_threshold--; + } + + /* build umd in event */ + lnet_md2handle(&msg->msg_ev.md_handle, md); + lnet_md_deconstruct(md, &msg->msg_ev.md); +} + +static int +lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) +{ + struct lnet_handle_wire ack_wmd; + int rc; + int status = msg->msg_ev.status; + + LASSERT(msg->msg_onactivelist); + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_msg_decommit(msg, cpt, 0); + + msg->msg_ack = 0; + lnet_net_unlock(cpt); + + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); + + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.source, 0, 0); + + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + + /* NB: we probably want to use NID of msg::msg_from as 3rd + * parameter (router NID) if it's routed message */ + rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either + * because CPT for sending can be different with CPT for + * receiving, so we should return back to lnet_finalize() + * to make sure we are locking the correct partition. + */ + return rc; + + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { + /* not forwarded */ + LASSERT(!msg->msg_receiving); /* called back recv already */ + lnet_net_unlock(cpt); + + rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. 
+ * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either: + * - The rule is message must decommit for sending first if + * the it's committed for both sending and receiving + * - CPT for sending can be different with CPT for receiving, + * so we should return back to lnet_finalize() to make + * sure we are locking the correct partition. + */ + return rc; + } + + lnet_msg_decommit(msg, cpt, status); + lnet_msg_free(msg); + return 0; +} + +static void +lnet_dec_healthv_locked(atomic_t *healthv) +{ + int h = atomic_read(healthv); + + if (h < lnet_health_sensitivity) { + atomic_set(healthv, 0); + } else { + h -= lnet_health_sensitivity; + atomic_set(healthv, h); + } +} + +static void +lnet_handle_local_failure(struct lnet_msg *msg) +{ + struct lnet_ni *local_ni; + + local_ni = msg->msg_txni; + + /* + * the lnet_net_lock(0) is used to protect the addref on the ni + * and the recovery queue. + */ + lnet_net_lock(0); + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(0); + return; + } + + lnet_dec_healthv_locked(&local_ni->ni_healthv); + /* + * add the NI to the recovery queue if it's not already there + * and it's health value is actually below the maximum. It's + * possible that the sensitivity might be set to 0, and the health + * value will not be reduced. In this case, there is no reason to + * invoke recovery + */ + if (list_empty(&local_ni->ni_recovery) && + atomic_read(&local_ni->ni_healthv) < LNET_MAX_HEALTH_VALUE) { + CDEBUG(D_NET, "ni %s added to recovery queue. Health = %d\n", + libcfs_nid2str(local_ni->ni_nid), + atomic_read(&local_ni->ni_healthv)); + list_add_tail(&local_ni->ni_recovery, + &the_lnet.ln_mt_localNIRecovq); + lnet_ni_addref_locked(local_ni, 0); + } + lnet_net_unlock(0); +} + +void +lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni) +{ + /* lpni could be NULL if we're in the LOLND case */ + if (!lpni) + return; + + lnet_dec_healthv_locked(&lpni->lpni_healthv); + /* + * add the peer NI to the recovery queue if it's not already there + * and it's health value is actually below the maximum. It's + * possible that the sensitivity might be set to 0, and the health + * value will not be reduced. 
In this case, there is no reason to + * invoke recovery + */ + lnet_peer_ni_add_to_recoveryq_locked(lpni); +} + +static void +lnet_handle_remote_failure(struct lnet_peer_ni *lpni) +{ + /* lpni could be NULL if we're in the LOLND case */ + if (!lpni) + return; + + lnet_net_lock(0); + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(0); + return; + } + lnet_handle_remote_failure_locked(lpni); + lnet_net_unlock(0); +} + +static void +lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus) +{ + struct lnet_ni *ni = msg->msg_txni; + struct lnet_peer_ni *lpni = msg->msg_txpeer; + struct lnet_counters_health *health; + + health = &the_lnet.ln_counters[0]->lct_health; + + switch (hstatus) { + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + atomic_inc(&ni->ni_hstats.hlt_local_interrupt); + health->lch_local_interrupt_count++; + break; + case LNET_MSG_STATUS_LOCAL_DROPPED: + atomic_inc(&ni->ni_hstats.hlt_local_dropped); + health->lch_local_dropped_count++; + break; + case LNET_MSG_STATUS_LOCAL_ABORTED: + atomic_inc(&ni->ni_hstats.hlt_local_aborted); + health->lch_local_aborted_count++; + break; + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + atomic_inc(&ni->ni_hstats.hlt_local_no_route); + health->lch_local_no_route_count++; + break; + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + atomic_inc(&ni->ni_hstats.hlt_local_timeout); + health->lch_local_timeout_count++; + break; + case LNET_MSG_STATUS_LOCAL_ERROR: + atomic_inc(&ni->ni_hstats.hlt_local_error); + health->lch_local_error_count++; + break; + case LNET_MSG_STATUS_REMOTE_DROPPED: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped); + health->lch_remote_dropped_count++; + break; + case LNET_MSG_STATUS_REMOTE_ERROR: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_error); + health->lch_remote_error_count++; + break; + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout); + health->lch_remote_timeout_count++; + break; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_network_timeout); + health->lch_network_timeout_count++; + break; + case LNET_MSG_STATUS_OK: + break; + default: + LBUG(); + } +} + +static void +lnet_resend_msg_locked(struct lnet_msg *msg) +{ + msg->msg_retry_count++; + + /* + * remove message from the active list and reset it to prepare + * for a resend. Two exceptions to this + * + * 1. the router case. When a message is being routed it is + * committed for rx when received and committed for tx when + * forwarded. We don't want to remove it from the active list, since + * code which handles receiving expects it to remain on the active + * list. + * + * 2. The REPLY case. Reply messages use the same message + * structure for the GET that was received. + */ + if (!msg->msg_routing && msg->msg_type != LNET_MSG_REPLY) { + list_del_init(&msg->msg_activelist); + msg->msg_onactivelist = 0; + } + /* + * The msg_target.nid which was originally set + * when calling LNetGet() or LNetPut() might've + * been overwritten if we're routing this message. + * Call lnet_msg_decommit_tx() to return the credit + * this message consumed. The message will + * consume another credit when it gets resent. 
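+ *
+ * The actual resend is not performed here: the message is queued on
+ * ln_mt_resendqs[msg_tx_cpt] below and ln_mt_waitq is woken, so the
+ * monitor thread drains it via lnet_resend_pending_msgs().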
+ */ + msg->msg_target.nid = msg->msg_hdr.dest_nid; + lnet_msg_decommit_tx(msg, -EAGAIN); + msg->msg_sending = 0; + msg->msg_receiving = 0; + msg->msg_target_is_router = 0; + + CDEBUG(D_NET, "%s->%s:%s:%s - queuing msg (%p) for resend\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(msg->msg_health_status), msg); + + list_add_tail(&msg->msg_list, the_lnet.ln_mt_resendqs[msg->msg_tx_cpt]); + + wake_up(&the_lnet.ln_mt_waitq); +} + +int +lnet_check_finalize_recursion_locked(struct lnet_msg *msg, + struct list_head *containerq, + int nworkers, void **workers) +{ + int my_slot = -1; + int i; + + list_add_tail(&msg->msg_list, containerq); + + for (i = 0; i < nworkers; i++) { + if (workers[i] == current) + break; + + if (my_slot < 0 && workers[i] == NULL) + my_slot = i; + } + + if (i < nworkers || my_slot < 0) + return -1; + + workers[my_slot] = current; + + return my_slot; +} + +int +lnet_attempt_msg_resend(struct lnet_msg *msg) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + + /* we can only resend tx_committed messages */ + LASSERT(msg->msg_tx_committed); + + /* don't resend recovery messages */ + if (msg->msg_recovery) { + CDEBUG(D_NET, "msg %s->%s is a recovery ping. retry# %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* + * if we explicitly indicated we don't want to resend then just + * return + */ + if (msg->msg_no_resend) { + CDEBUG(D_NET, "msg %s->%s requested no resend. retry# %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* check if the message has exceeded the number of retries */ + if (msg->msg_retry_count >= lnet_retry_count) { + CNETERR("msg %s->%s exceeded retry count %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + cpt = msg->msg_tx_cpt; + lnet_net_lock(cpt); + + /* check again under lock */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + container = the_lnet.ln_msg_containers[cpt]; + my_slot = + lnet_check_finalize_recursion_locked(msg, + &container->msc_resending, + container->msc_nfinalizers, + container->msc_resenders); + + /* enough threads are resending */ + if (my_slot == -1) { + lnet_net_unlock(cpt); + return 0; + } + + while (!list_empty(&container->msc_resending)) { + msg = list_entry(container->msc_resending.next, + struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + /* + * resending the message will require us to call + * lnet_msg_decommit_tx() which will return the credit + * which this message holds. This could trigger another + * queued message to be sent. If that message fails and + * requires a resend we will recurse. + * But since at this point the slot is taken, the message + * will be queued in the container and dealt with + * later. This breaks the recursion. + */ + lnet_resend_msg_locked(msg); + } + + /* + * msc_resenders is an array of process pointers. Each entry holds + * a pointer to the current process operating on the message. An + * array entry is created per CPT. If the array slot is already + * set, then it means that there is a thread on the CPT currently + * resending a message. + * Once the thread finishes clear the slot to enable the thread to + * take on more resend work. 
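+ *
+ * The slot index used here was handed out by
+ * lnet_check_finalize_recursion_locked() above; when that call
+ * returned -1 (this thread already registered, or every slot busy),
+ * the message was simply left queued on msc_resending for an active
+ * resender and we returned early.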
+ */ + container->msc_resenders[my_slot] = NULL; + lnet_net_unlock(cpt); + + return 0; +} + +/* + * Do a health check on the message: + * return -1 if we're not going to handle the error or + * if we've reached the maximum number of retries. + * success case will return -1 as well + * return 0 if it the message is requeued for send + */ +static int +lnet_health_check(struct lnet_msg *msg) +{ + enum lnet_msg_hstatus hstatus = msg->msg_health_status; + bool lo = false; + + /* if we're shutting down no point in handling health. */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return -1; + + LASSERT(msg->msg_txni); + + /* + * if we're sending to the LOLND then the msg_txpeer will not be + * set. So no need to sanity check it. + */ + if (msg->msg_txni->ni_nid != LNET_NID_LO_0) + LASSERT(msg->msg_txpeer); + else + lo = true; + + if (hstatus != LNET_MSG_STATUS_OK && + ktime_compare(ktime_get(), msg->msg_deadline) >= 0) + return -1; + + /* + * stats are only incremented for errors so avoid wasting time + * incrementing statistics if there is no error. + */ + if (hstatus != LNET_MSG_STATUS_OK) { + lnet_net_lock(0); + lnet_incr_hstats(msg, hstatus); + lnet_net_unlock(0); + } + + CDEBUG(D_NET, "health check: %s->%s: %s: %s\n", + libcfs_nid2str(msg->msg_txni->ni_nid), + (lo) ? "self" : libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(hstatus)); + + switch (hstatus) { + case LNET_MSG_STATUS_OK: + lnet_inc_healthv(&msg->msg_txni->ni_healthv); + /* + * It's possible msg_txpeer is NULL in the LOLND + * case. + */ + if (msg->msg_txpeer) + lnet_inc_healthv(&msg->msg_txpeer->lpni_healthv); + + /* we can finalize this message */ + return -1; + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + case LNET_MSG_STATUS_LOCAL_DROPPED: + case LNET_MSG_STATUS_LOCAL_ABORTED: + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + lnet_handle_local_failure(msg); + /* add to the re-send queue */ + return lnet_attempt_msg_resend(msg); + + /* + * These errors will not trigger a resend so simply + * finalize the message + */ + case LNET_MSG_STATUS_LOCAL_ERROR: + lnet_handle_local_failure(msg); + return -1; + + /* + * TODO: since the remote dropped the message we can + * attempt a resend safely. 
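+ * The case below already does that; lnet_attempt_msg_resend() enforces
+ * the lnet_retry_count budget and the no-resend/recovery exclusions.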
+ */ + case LNET_MSG_STATUS_REMOTE_DROPPED: + lnet_handle_remote_failure(msg->msg_txpeer); + return lnet_attempt_msg_resend(msg); + + case LNET_MSG_STATUS_REMOTE_ERROR: + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + lnet_handle_remote_failure(msg->msg_txpeer); + return -1; + default: + LBUG(); + } + + /* no resend is needed */ + return -1; +} + +static void +lnet_msg_detach_md(struct lnet_msg *msg, int cpt, int status) +{ + struct lnet_libmd *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink || (md->md_refcount == 0 && + md->md_threshold == LNET_MD_THRESH_INF)) + lnet_detach_rsp_tracker(md, cpt); + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + +static bool +lnet_is_health_check(struct lnet_msg *msg) +{ + bool hc; + int status = msg->msg_ev.status; + + if ((!msg->msg_tx_committed && !msg->msg_rx_committed) || + !msg->msg_onactivelist) { + CDEBUG(D_NET, "msg %p not committed for send or receive\n", + msg); + return false; + } + + if ((msg->msg_tx_committed && !msg->msg_txpeer) || + (msg->msg_rx_committed && !msg->msg_rxpeer)) { + CDEBUG(D_NET, "msg %p failed too early to retry and send\n", + msg); + return false; + } + + /* + * perform a health check for any message committed for transmit + */ + hc = msg->msg_tx_committed; + + /* Check for status inconsistencies */ + if (hc && + ((!status && msg->msg_health_status != LNET_MSG_STATUS_OK) || + (status && msg->msg_health_status == LNET_MSG_STATUS_OK))) { + CDEBUG(D_NET, "Msg %p is in inconsistent state, don't perform health " + "checking (%d, %d)\n", msg, status, + msg->msg_health_status); + hc = false; + } + + CDEBUG(D_NET, "health check = %d, status = %d, hstatus = %d\n", + hc, status, msg->msg_health_status); + + return hc; +} + +char * +lnet_health_error2str(enum lnet_msg_hstatus hstatus) +{ + switch (hstatus) { + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + return "LOCAL_INTERRUPT"; + case LNET_MSG_STATUS_LOCAL_DROPPED: + return "LOCAL_DROPPED"; + case LNET_MSG_STATUS_LOCAL_ABORTED: + return "LOCAL_ABORTED"; + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + return "LOCAL_NO_ROUTE"; + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + return "LOCAL_TIMEOUT"; + case LNET_MSG_STATUS_LOCAL_ERROR: + return "LOCAL_ERROR"; + case LNET_MSG_STATUS_REMOTE_DROPPED: + return "REMOTE_DROPPED"; + case LNET_MSG_STATUS_REMOTE_ERROR: + return "REMOTE_ERROR"; + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + return "REMOTE_TIMEOUT"; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + return "NETWORK_TIMEOUT"; + case LNET_MSG_STATUS_OK: + return "OK"; + default: + return ""; + } +} + +bool +lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus) +{ + if (!msg) + return false; + + if (list_empty(&the_lnet.ln_drop_rules)) + return false; + + /* match only health rules */ + if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus)) + return false; + + CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(*hstatus)); + + return true; +} +EXPORT_SYMBOL(lnet_send_error_simulation); + +void +lnet_finalize(struct lnet_msg *msg, int status) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; + 
+ LASSERT(!in_interrupt()); + + if (msg == NULL) + return; + + msg->msg_ev.status = status; + + if (lnet_is_health_check(msg)) { + /* + * Check the health status of the message. If it has one + * of the errors that we're supposed to handle, and it has + * not timed out, then + * 1. Decrement the appropriate health_value + * 2. queue the message on the resend queue + + * if the message send is success, timed out or failed in the + * health check for any reason then we'll just finalize the + * message. Otherwise just return since the message has been + * put on the resend queue. + */ + if (!lnet_health_check(msg)) + return; + } + + /* + * We're not going to resend this message so detach its MD and invoke + * the appropriate callbacks + */ + if (msg->msg_md != NULL) { + cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + lnet_res_lock(cpt); + lnet_msg_detach_md(msg, cpt, status); + lnet_res_unlock(cpt); + } + +again: + if (!msg->msg_tx_committed && !msg->msg_rx_committed) { + /* not committed to network yet */ + LASSERT(!msg->msg_onactivelist); + lnet_msg_free(msg); + return; + } + + /* + * NB: routed message can be committed for both receiving and sending, + * we should finalize in LIFO order and keep counters correct. + * (finalize sending first then finalize receiving) + */ + cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt; + lnet_net_lock(cpt); + + container = the_lnet.ln_msg_containers[cpt]; + + /* Recursion breaker. Don't complete the message here if I am (or + * enough other threads are) already completing messages */ + my_slot = lnet_check_finalize_recursion_locked(msg, + &container->msc_finalizing, + container->msc_nfinalizers, + container->msc_finalizers); + + /* enough threads are resending */ + if (my_slot == -1) { + lnet_net_unlock(cpt); + return; + } + + rc = 0; + while (!list_empty(&container->msc_finalizing)) { + msg = list_entry(container->msc_finalizing.next, + struct lnet_msg, msg_list); + + list_del_init(&msg->msg_list); + + /* NB drops and regains the lnet lock if it actually does + * anything, so my finalizing friends can chomp along too */ + rc = lnet_complete_msg_locked(msg, cpt); + if (rc != 0) + break; + } + + if (unlikely(!list_empty(&the_lnet.ln_delay_rules))) { + lnet_net_unlock(cpt); + lnet_delay_rule_check(); + lnet_net_lock(cpt); + } + + container->msc_finalizers[my_slot] = NULL; + lnet_net_unlock(cpt); + + if (rc != 0) + goto again; +} +EXPORT_SYMBOL(lnet_finalize); + +void +lnet_msg_container_cleanup(struct lnet_msg_container *container) +{ + int count = 0; + + if (container->msc_init == 0) + return; + + while (!list_empty(&container->msc_active)) { + struct lnet_msg *msg; + + msg = list_entry(container->msc_active.next, + struct lnet_msg, msg_activelist); + LASSERT(msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del_init(&msg->msg_activelist); + lnet_msg_free(msg); + count++; + } + + if (count > 0) + CERROR("%d active msg on exit\n", count); + + if (container->msc_finalizers != NULL) { + LIBCFS_FREE(container->msc_finalizers, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + container->msc_finalizers = NULL; + } + + if (container->msc_resenders != NULL) { + LIBCFS_FREE(container->msc_resenders, + container->msc_nfinalizers * + sizeof(*container->msc_resenders)); + container->msc_resenders = NULL; + } + container->msc_init = 0; +} + +int +lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) +{ + int rc = 0; + + container->msc_init = 1; + + INIT_LIST_HEAD(&container->msc_active); + 
INIT_LIST_HEAD(&container->msc_finalizing); + INIT_LIST_HEAD(&container->msc_resending); + + /* number of CPUs */ + container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); + if (container->msc_nfinalizers == 0) + container->msc_nfinalizers = 1; + + LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + + if (container->msc_finalizers == NULL) { + CERROR("Failed to allocate message finalizers\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(container->msc_resenders, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_resenders)); + + if (container->msc_resenders == NULL) { + CERROR("Failed to allocate message resenders\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + return rc; +} + +void +lnet_msg_containers_destroy(void) +{ + struct lnet_msg_container *container; + int i; + + if (the_lnet.ln_msg_containers == NULL) + return; + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) + lnet_msg_container_cleanup(container); + + cfs_percpt_free(the_lnet.ln_msg_containers); + the_lnet.ln_msg_containers = NULL; +} + +int +lnet_msg_containers_create(void) +{ + struct lnet_msg_container *container; + int rc; + int i; + + the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*container)); + + if (the_lnet.ln_msg_containers == NULL) { + CERROR("Failed to allocate cpu-partition data for network\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { + rc = lnet_msg_container_setup(container, i); + if (rc != 0) { + lnet_msg_containers_destroy(); + return rc; + } + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c new file mode 100644 index 0000000000000..75a352dec6ff8 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c @@ -0,0 +1,983 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-ptl.c + * + * portal & match routines + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* NB: add /proc interfaces in upcoming patches */ +int portal_rotor = LNET_PTL_ROTOR_HASH_RT; +module_param(portal_rotor, int, 0644); +MODULE_PARM_DESC(portal_rotor, "redirect PUTs to different cpu-partitions"); + +static int +lnet_ptl_match_type(unsigned int index, struct lnet_process_id match_id, + __u64 mbits, __u64 ignore_bits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[index]; + int unique; + + unique = ignore_bits == 0 && + match_id.nid != LNET_NID_ANY && + match_id.pid != LNET_PID_ANY; + + LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); + + /* prefer to check w/o any lock */ + if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) + goto match; + + /* unset, new portal */ + lnet_ptl_lock(ptl); + /* check again with lock */ + if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { + lnet_ptl_unlock(ptl); + goto match; + } + + /* still not set */ + if (unique) + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); + else + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); + + lnet_ptl_unlock(ptl); + + return 1; + + match: + if ((lnet_ptl_is_unique(ptl) && !unique) || + (lnet_ptl_is_wildcard(ptl) && unique)) + return 0; + return 1; +} + +static void +lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + mtable->mt_enabled = 1; + + ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; + for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { + LASSERT(ptl->ptl_mt_maps[i] != cpt); + if (ptl->ptl_mt_maps[i] < cpt) + break; + + /* swap to order */ + ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; + ptl->ptl_mt_maps[i] = cpt; + } + + ptl->ptl_mt_nmaps++; +} + +static void +lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + if (LNET_CPT_NUMBER == 1) + return; /* never disable the only match-table */ + + mtable->mt_enabled = 0; + + LASSERT(ptl->ptl_mt_nmaps > 0 && + ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); + + /* remove it from mt_maps */ + ptl->ptl_mt_nmaps--; + for (i = 0; i < ptl->ptl_mt_nmaps; i++) { + if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ + ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; + } +} + +static int +lnet_try_match_md(struct lnet_libmd *md, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + struct lnet_me *me = md->md_me; + + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; + + /* mismatched MD op */ + if ((md->md_options & info->mi_opc) == 0) + return LNET_MATCHMD_NONE; + + /* mismatched ME nid/pid? */ + if (me->me_match_id.nid != LNET_NID_ANY && + me->me_match_id.nid != info->mi_id.nid) + return LNET_MATCHMD_NONE; + + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != info->mi_id.pid) + return LNET_MATCHMD_NONE; + + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; + + /* Hurrah! This _is_ a match; check it out... 
*/ + + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = info->mi_roffset; + + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT(md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } + + if (info->mi_rlength <= mlength) { /* fits in allowed space */ + mlength = info->mi_rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match %llu" + " length %d too big: %d left, %d allowed\n", + libcfs_id2str(info->mi_id), info->mi_mbits, + info->mi_rlength, md->md_length - offset, mlength); + + return LNET_MATCHMD_DROP; + } + + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of " + "length %d/%d into md %#llx [%d] + %d\n", + (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", + info->mi_portal, libcfs_id2str(info->mi_id), mlength, + info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_msg_attach_md(msg, md, offset, mlength); + md->md_offset = offset + mlength; + + if (!lnet_md_exhausted(md)) + return LNET_MATCHMD_OK; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0) + lnet_md_unlink(md); + + return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; +} + +static struct lnet_match_table * +lnet_match2mt(struct lnet_portal *ptl, struct lnet_process_id id, __u64 mbits) +{ + if (LNET_CPT_NUMBER == 1) + return ptl->ptl_mtables[0]; /* the only one */ + + /* if it's a unique portal, return match-table hashed by NID */ + return lnet_ptl_is_unique(ptl) ? + ptl->ptl_mtables[lnet_cpt_of_nid(id.nid, NULL)] : NULL; +} + +struct lnet_match_table * +lnet_mt_of_attach(unsigned int index, struct lnet_process_id id, + __u64 mbits, __u64 ignore_bits, enum lnet_ins_pos pos) +{ + struct lnet_portal *ptl; + struct lnet_match_table *mtable; + + /* NB: called w/o lock */ + LASSERT(index < the_lnet.ln_nportals); + + if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) + return NULL; + + ptl = the_lnet.ln_portals[index]; + + mtable = lnet_match2mt(ptl, id, mbits); + if (mtable != NULL) /* unique portal or only one match-table */ + return mtable; + + /* it's a wildcard portal */ + switch (pos) { + default: + return NULL; + case LNET_INS_BEFORE: + case LNET_INS_AFTER: + /* posted by no affinity thread, always hash to specific + * match-table to avoid buffer stealing which is heavy */ + return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; + case LNET_INS_LOCAL: + /* posted by cpu-affinity thread */ + return ptl->ptl_mtables[lnet_cpt_current()]; + } +} + +static struct lnet_match_table * +lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + unsigned int nmaps; + unsigned int rotor; + unsigned int cpt; + bool routed; + + /* NB: called w/o lock */ + LASSERT(info->mi_portal < the_lnet.ln_nportals); + ptl = the_lnet.ln_portals[info->mi_portal]; + + LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); + + mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits); + if (mtable != NULL) + return mtable; + + /* it's a wildcard portal */ + routed = LNET_NIDNET(msg->msg_hdr.src_nid) != + LNET_NIDNET(msg->msg_hdr.dest_nid); + + if (portal_rotor == LNET_PTL_ROTOR_OFF || + (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { + cpt = 
lnet_cpt_current(); + if (ptl->ptl_mtables[cpt]->mt_enabled) + return ptl->ptl_mtables[cpt]; + } + + rotor = ptl->ptl_rotor++; /* get round-robin factor */ + if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) + cpt = info->mi_cpt; + else + cpt = rotor % LNET_CPT_NUMBER; + + if (!ptl->ptl_mtables[cpt]->mt_enabled) { + /* is there any active entry for this portal? */ + nmaps = ptl->ptl_mt_nmaps; + /* map to an active mtable to avoid heavy "stealing" */ + if (nmaps != 0) { + /* NB: there is possibility that ptl_mt_maps is being + * changed because we are not under protection of + * lnet_ptl_lock, but it shouldn't hurt anything */ + cpt = ptl->ptl_mt_maps[rotor % nmaps]; + } + } + + return ptl->ptl_mtables[cpt]; +} + +static int +lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) +{ + __u64 *bmap; + int i; + + if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + return 0; + + if (pos < 0) { /* check all bits */ + for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { + if (mtable->mt_exhausted[i] != (__u64)(-1)) + return 0; + } + return 1; + } + + LASSERT(pos <= LNET_MT_HASH_IGNORE); + /* mtable::mt_mhash[pos] is marked as exhausted or not */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + return ((*bmap) & (1ULL << pos)) != 0; +} + +static void +lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) +{ + __u64 *bmap; + + LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); + LASSERT(pos <= LNET_MT_HASH_IGNORE); + + /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + if (!exhausted) + *bmap &= ~(1ULL << pos); + else + *bmap |= 1ULL << pos; +} + +struct list_head * +lnet_mt_match_head(struct lnet_match_table *mtable, + struct lnet_process_id id, __u64 mbits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; + + if (lnet_ptl_is_wildcard(ptl)) { + return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK]; + } else { + unsigned long hash = mbits + id.nid + id.pid; + + LASSERT(lnet_ptl_is_unique(ptl)); + hash = hash_long(hash, LNET_MT_HASH_BITS); + return &mtable->mt_mhash[hash & LNET_MT_HASH_MASK]; + } +} + +int +lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct list_head *head; + struct lnet_me *me; + struct lnet_me *tmp; + int exhausted = 0; + int rc; + + /* any ME with ignore bits? 
*/ + if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + again: + /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ + if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + exhausted = LNET_MATCHMD_EXHAUSTED; + + list_for_each_entry_safe(me, tmp, head, me_list) { + /* ME attached but MD not attached yet */ + if (me->me_md == NULL) + continue; + + LASSERT(me == me->me_md->md_me); + + rc = lnet_try_match_md(me->me_md, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) == 0) + exhausted = 0; /* mlist is not empty */ + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* don't return EXHAUSTED bit because we don't know + * whether the mlist is empty or not */ + return rc & ~LNET_MATCHMD_EXHAUSTED; + } + } + + if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ + lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); + if (!lnet_mt_test_exhausted(mtable, -1)) + exhausted = 0; + } + + if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + goto again; /* re-check MEs w/o ignore-bits */ + } + + if (info->mi_opc == LNET_MD_OP_GET || + !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) + return LNET_MATCHMD_DROP | exhausted; + + return LNET_MATCHMD_NONE | exhausted; +} + +static int +lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) +{ + int rc; + + /* message arrived before any buffer posting on this portal, + * simply delay or drop this message */ + if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) + return 0; + + lnet_ptl_lock(ptl); + /* check it again with hold of lock */ + if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { + lnet_ptl_unlock(ptl); + return 0; + } + + if (lnet_ptl_is_lazy(ptl)) { + if (msg->msg_rx_ready_delay) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + } + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + + lnet_ptl_unlock(ptl); + return rc; +} + +static int +lnet_ptl_match_delay(struct lnet_portal *ptl, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ + int rc = 0; + int i; + + /* + * Steal buffer from other CPTs, and delay msg if nothing to + * steal. This function is more expensive than a regular + * match, but we don't expect it can happen a lot. The return + * code contains one of LNET_MATCHMD_OK, LNET_MATCHMD_DROP, or + * LNET_MATCHMD_NONE. + */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + for (i = 0; i < LNET_CPT_NUMBER; i++) { + struct lnet_match_table *mtable; + int cpt; + + cpt = (first + i) % LNET_CPT_NUMBER; + mtable = ptl->ptl_mtables[cpt]; + if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) + continue; + + lnet_res_lock(cpt); + lnet_ptl_lock(ptl); + + if (i == 0) { + /* The first try, add to stealing list. */ + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_stealing); + } + + if (!list_empty(&msg->msg_list)) { + /* On stealing list. */ + rc = lnet_mt_match_md(mtable, info, msg); + + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && + mtable->mt_enabled) + lnet_ptl_disable_mt(ptl, cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* Match found, remove from stealing list. 
*/ + list_del_init(&msg->msg_list); + } else if (i == LNET_CPT_NUMBER - 1 || /* (1) */ + ptl->ptl_mt_nmaps == 0 || /* (2) */ + (ptl->ptl_mt_nmaps == 1 && /* (3) */ + ptl->ptl_mt_maps[0] == cpt)) { + /* + * No match found, and this is either + * (1) the last cpt to check, or + * (2) there is no active cpt, or + * (3) this is the only active cpt. + * There is nothing to steal: delay or + * drop the message. + */ + list_del_init(&msg->msg_list); + + if (lnet_ptl_is_lazy(ptl)) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + } else { + /* Do another iteration. */ + rc = 0; + } + } else { + /* + * No longer on stealing list: another thread + * matched the message in lnet_ptl_attach_md(). + * We are now expected to handle the message. + */ + rc = msg->msg_md == NULL ? + LNET_MATCHMD_DROP : LNET_MATCHMD_OK; + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(cpt); + + /* + * Note that test (1) above ensures that we always + * exit the loop through this break statement. + * + * LNET_MATCHMD_NONE means msg was added to the + * delayed queue, and we may no longer reference it + * after lnet_ptl_unlock() and lnet_res_unlock(). + */ + if (rc & (LNET_MATCHMD_FINISH | LNET_MATCHMD_NONE)) + break; + } + + return rc; +} + +int +lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int rc; + + CDEBUG(D_NET, "Request from %s of length %d into portal %d " + "MB=%#llx\n", libcfs_id2str(info->mi_id), + info->mi_rlength, info->mi_portal, info->mi_mbits); + + if (info->mi_portal >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + info->mi_portal, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + ptl = the_lnet.ln_portals[info->mi_portal]; + rc = lnet_ptl_match_early(ptl, msg); + if (rc != 0) /* matched or delayed early message */ + return rc; + + mtable = lnet_mt_of_match(info, msg); + lnet_res_lock(mtable->mt_cpt); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = LNET_MATCHMD_DROP; + goto out1; + } + + rc = lnet_mt_match_md(mtable, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) { + lnet_ptl_lock(ptl); + lnet_ptl_disable_mt(ptl, mtable->mt_cpt); + lnet_ptl_unlock(ptl); + } + + if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */ + goto out1; + + if (!msg->msg_rx_ready_delay) + goto out1; + + LASSERT(lnet_ptl_is_lazy(ptl)); + LASSERT(!msg->msg_rx_delayed); + + /* NB: we don't expect "delay" can happen a lot */ + if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { + lnet_ptl_lock(ptl); + + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(mtable->mt_cpt); + rc = LNET_MATCHMD_NONE; + } else { + lnet_res_unlock(mtable->mt_cpt); + rc = lnet_ptl_match_delay(ptl, info, msg); + } + + /* LNET_MATCHMD_NONE means msg was added to the delay queue */ + if (rc & LNET_MATCHMD_NONE) { + CDEBUG(D_NET, + "Delaying %s from %s ptl %d MB %#llx off %d len %d\n", + info->mi_opc == LNET_MD_OP_PUT ? 
"PUT" : "GET", + libcfs_id2str(info->mi_id), info->mi_portal, + info->mi_mbits, info->mi_roffset, info->mi_rlength); + } + goto out0; + out1: + lnet_res_unlock(mtable->mt_cpt); + out0: + /* EXHAUSTED bit is only meaningful for internal functions */ + return rc & ~LNET_MATCHMD_EXHAUSTED; +} + +void +lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md) +{ + LASSERT(me->me_md == md && md->md_me == me); + + me->me_md = NULL; + md->md_me = NULL; +} + +/* called with lnet_res_lock held */ +void +lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, + struct list_head *matches, struct list_head *drops) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; + struct lnet_match_table *mtable; + struct list_head *head; + struct lnet_msg *tmp; + struct lnet_msg *msg; + int exhausted = 0; + int cpt; + + LASSERT(md->md_refcount == 0); /* a brand new MD */ + + me->me_md = md; + md->md_me = me; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + mtable = ptl->ptl_mtables[cpt]; + + if (list_empty(&ptl->ptl_msg_stealing) && + list_empty(&ptl->ptl_msg_delayed) && + !lnet_mt_test_exhausted(mtable, me->me_pos)) + return; + + lnet_ptl_lock(ptl); + head = &ptl->ptl_msg_stealing; + again: + list_for_each_entry_safe(msg, tmp, head, msg_list) { + struct lnet_match_info info; + struct lnet_hdr *hdr; + int rc; + + LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); + + hdr = &msg->msg_hdr; + /* Multi-Rail: Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + rc = lnet_try_match_md(md, &info, msg); + + exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0; + if ((rc & LNET_MATCHMD_NONE) != 0) { + if (exhausted) + break; + continue; + } + + /* Hurrah! 
This _is_ a match */ + LASSERT((rc & LNET_MATCHMD_FINISH) != 0); + list_del_init(&msg->msg_list); + + if (head == &ptl->ptl_msg_stealing) { + if (exhausted) + break; + /* stealing thread will handle the message */ + continue; + } + + if ((rc & LNET_MATCHMD_OK) != 0) { + list_add_tail(&msg->msg_list, matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match %llu offset %d length %d.\n", + libcfs_id2str(info.mi_id), + info.mi_portal, info.mi_mbits, + info.mi_roffset, info.mi_rlength); + } else { + list_add_tail(&msg->msg_list, drops); + } + + if (exhausted) + break; + } + + if (!exhausted && head == &ptl->ptl_msg_stealing) { + head = &ptl->ptl_msg_delayed; + goto again; + } + + if (lnet_ptl_is_wildcard(ptl) && !exhausted) { + lnet_mt_set_exhausted(mtable, me->me_pos, 0); + if (!mtable->mt_enabled) + lnet_ptl_enable_mt(ptl, cpt); + } + + lnet_ptl_unlock(ptl); +} + +static void +lnet_ptl_cleanup(struct lnet_portal *ptl) +{ + struct lnet_match_table *mtable; + int i; + + if (ptl->ptl_mtables == NULL) /* uninitialized portal */ + return; + + LASSERT(list_empty(&ptl->ptl_msg_delayed)); + LASSERT(list_empty(&ptl->ptl_msg_stealing)); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + struct list_head *mhash; + struct lnet_me *me; + int j; + + if (mtable->mt_mhash == NULL) /* uninitialized match-table */ + continue; + + mhash = mtable->mt_mhash; + /* cleanup ME */ + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { + while (!list_empty(&mhash[j])) { + me = list_entry(mhash[j].next, + struct lnet_me, me_list); + CERROR("Active ME %p on exit\n", me); + list_del(&me->me_list); + lnet_me_free(me); + } + } + /* the extra entry is for MEs with ignore bits */ + LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + } + + cfs_percpt_free(ptl->ptl_mtables); + ptl->ptl_mtables = NULL; +} + +static int +lnet_ptl_setup(struct lnet_portal *ptl, int index) +{ + struct lnet_match_table *mtable; + struct list_head *mhash; + int i; + int j; + + ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_match_table)); + if (ptl->ptl_mtables == NULL) { + CERROR("Failed to create match table for portal %d\n", index); + return -ENOMEM; + } + + ptl->ptl_index = index; + INIT_LIST_HEAD(&ptl->ptl_msg_delayed); + INIT_LIST_HEAD(&ptl->ptl_msg_stealing); + spin_lock_init(&ptl->ptl_lock); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + /* the extra entry is for MEs with ignore bits */ + LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i, + sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + if (mhash == NULL) { + CERROR("Failed to create match hash for portal %d\n", + index); + goto failed; + } + + memset(&mtable->mt_exhausted[0], -1, + sizeof(mtable->mt_exhausted[0]) * + LNET_MT_EXHAUSTED_BMAP); + mtable->mt_mhash = mhash; + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) + INIT_LIST_HEAD(&mhash[j]); + + mtable->mt_portal = index; + mtable->mt_cpt = i; + } + + return 0; + failed: + lnet_ptl_cleanup(ptl); + return -ENOMEM; +} + +void +lnet_portals_destroy(void) +{ + int i; + + if (the_lnet.ln_portals == NULL) + return; + + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_ptl_cleanup(the_lnet.ln_portals[i]); + + cfs_array_free(the_lnet.ln_portals); + the_lnet.ln_portals = NULL; +} + +int +lnet_portals_create(void) +{ + int size; + int i; + + size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]); + + the_lnet.ln_nportals = MAX_PORTALS; + the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size); + if (the_lnet.ln_portals == NULL) { + CERROR("Failed to allocate portals 
table\n"); + return -ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) { + lnet_portals_destroy(); + return -ENOMEM; + } + } + + return 0; +} + +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * It would prevent dropped requests, however it should be regarded as the + * last line of defense - i.e. users must keep a close watch on active + * buffers on a lazy portal and once it becomes too low post more buffers as + * soon as possible. This is because delayed requests usually have detrimental + * effects on underlying network connections. A few delayed requests often + * suffice to bring an underlying connection to a complete halt, due to flow + * control mechanisms. + * + * There's also a DOS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetSetLazyPortal(int portal) +{ + struct lnet_portal *ptl; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + lnet_ptl_setopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + return 0; +} +EXPORT_SYMBOL(LNetSetLazyPortal); + +int +lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason) +{ + struct lnet_portal *ptl; + struct list_head zombies = LIST_HEAD_INIT(zombies); + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + if (!lnet_ptl_is_lazy(ptl)) { + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + return 0; + } + + if (ni != NULL) { + struct lnet_msg *msg, *tmp; + + /* grab all messages which are on the NI passed in */ + list_for_each_entry_safe(msg, tmp, &ptl->ptl_msg_delayed, + msg_list) { + if (msg->msg_txni == ni || msg->msg_rxni == ni) + list_move(&msg->msg_list, &zombies); + } + } else { + if (the_lnet.ln_state != LNET_STATE_RUNNING) + CWARN("Active lazy portal %d on exit\n", portal); + else + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); + + /* grab all the blocked messages atomically */ + list_splice_init(&ptl->ptl_msg_delayed, &zombies); + + lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_drop_delayed_msg_list(&zombies, reason); + + return 0; +} + +/** + * Turn off the lazy portal attribute. Delayed requests on the portal, + * if any, will be all dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. 
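+ *
+ * Usage sketch (hypothetical caller; REQUEST_PORTAL is illustrative):
+ *
+ *	LNetSetLazyPortal(REQUEST_PORTAL);
+ *	... post match-all request buffers and run the service ...
+ *	LNetClearLazyPortal(REQUEST_PORTAL);
+ *
+ * Any PUTs still delayed on the portal are dropped by the second call.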
+ */ +int +LNetClearLazyPortal(int portal) +{ + return lnet_clear_lazy_portal(NULL, portal, + "Clearing lazy portal attr"); +} +EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c new file mode 100644 index 0000000000000..ba330c6d2af1c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -0,0 +1,393 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +/* For sys_open & sys_close */ +#include +#include + +#include +#include +#include +#include + +/* + * kernel 5.1: commit 7f1bc6e95d7840d4305595b3e4025cddda88cee5 + * Y2038 64-bit time. + * SO_TIMESTAMP, SO_TIMESTAMPNS and SO_TIMESTAMPING options, the + * way they are currently defined, are not y2038 safe. + * Subsequent patches in the series add new y2038 safe versions + * of these options which provide 64 bit timestamps on all + * architectures uniformly. + * Hence, rename existing options with OLD tag suffixes. + * + * NOTE: When updating to timespec64 change change these to '_NEW'. + * + */ +#ifndef SO_SNDTIMEO +#define SO_SNDTIMEO SO_SNDTIMEO_OLD +#endif + +#ifndef SO_RCVTIMEO +#define SO_RCVTIMEO SO_RCVTIMEO_OLD +#endif + +int +lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); + unsigned long then; + + LASSERT(nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = (timeout == 0) ? 
MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + struct sock *sk = sock->sk; + + /* Set send timeout to remaining time */ + lock_sock(sk); + sk->sk_sndtimeo = jiffies_left; + release_sock(sk); + } + + then = jiffies; + rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); + jiffies_left -= jiffies - then; + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR("Unexpected zero rc\n"); + return -ECONNABORTED; + } + + if (jiffies_left <= 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + return 0; +} +EXPORT_SYMBOL(lnet_sock_write); + +int +lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); + unsigned long then; + + LASSERT(nob > 0); + LASSERT(jiffies_left > 0); + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = 0 + }; + struct sock *sk = sock->sk; + + /* Set receive timeout to remaining time */ + lock_sock(sk); + sk->sk_rcvtimeo = jiffies_left; + release_sock(sk); + + then = jiffies; + rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); + jiffies_left -= jiffies - then; + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNRESET; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (jiffies_left <= 0) + return -ETIMEDOUT; + } +} +EXPORT_SYMBOL(lnet_sock_read); + +static int +lnet_sock_create(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, struct net *ns) +{ + struct sockaddr_in locaddr; + struct socket *sock; + int rc; + + /* All errors are fatal except bind failure if the port is in use */ + *fatal = 1; + +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(ns, PF_INET, SOCK_STREAM, 0, &sock); +#else + rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, &sock); +#endif + *sockp = sock; + if (rc != 0) { + CERROR("Can't create socket: %d\n", rc); + return rc; + } + + sock->sk->sk_reuseport = 1; + + if (local_ip != 0 || local_port != 0) { + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = (local_ip == 0) ? + INADDR_ANY : htonl(local_ip); + + rc = kernel_bind(sock, (struct sockaddr *)&locaddr, + sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto failed; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto failed; + } + } + return 0; + +failed: + sock_release(sock); + return rc; +} + +void +lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize) +{ + struct sock *sk = sock->sk; + + if (txbufsize != 0) { + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = txbufsize; + sk->sk_write_space(sk); + } + + if (rxbufsize != 0) { + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_sndbuf = rxbufsize; + } +} +EXPORT_SYMBOL(lnet_sock_setbuf); + +int +lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port) +{ + struct sockaddr_in sin; + int rc; +#ifndef HAVE_KERN_SOCK_GETNAME_2ARGS + int len = sizeof(sin); +#endif + + if (remote) + rc = lnet_kernel_getpeername(sock, + (struct sockaddr *)&sin, &len); + else + rc = lnet_kernel_getsockname(sock, + (struct sockaddr *)&sin, &len); + if (rc < 0) { + CERROR("Error %d getting sock %s IP/port\n", + rc, remote ? 
"peer" : "local"); + return rc; + } + + if (ip != NULL) + *ip = ntohl(sin.sin_addr.s_addr); + + if (port != NULL) + *port = ntohs(sin.sin_port); + + return 0; +} +EXPORT_SYMBOL(lnet_sock_getaddr); + +void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) +{ + if (txbufsize != NULL) + *txbufsize = sock->sk->sk_sndbuf; + + if (rxbufsize != NULL) + *rxbufsize = sock->sk->sk_rcvbuf; +} +EXPORT_SYMBOL(lnet_sock_getbuf); + +int +lnet_sock_listen(struct socket **sockp, + __u32 local_ip, int local_port, int backlog, struct net *ns) +{ + int fatal; + int rc; + + rc = lnet_sock_create(sockp, &fatal, local_ip, local_port, ns); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + } + + rc = kernel_listen(*sockp, backlog); + if (rc == 0) + return 0; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(*sockp); + return rc; +} + +#ifndef HAVE_SK_SLEEP +static inline wait_queue_head_t *sk_sleep(struct sock *sk) +{ + return sk->sk_sleep; +} +#endif + +int +lnet_sock_accept(struct socket **newsockp, struct socket *sock) +{ + wait_queue_entry_t wait; + struct socket *newsock; + int rc; + + /* XXX this should add a ref to sock->ops->owner, if + * TCP could be a module */ + rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock); + if (rc) { + CERROR("Can't allocate socket\n"); + return rc; + } + + newsock->ops = sock->ops; + +#ifdef HAVE_KERN_SOCK_ACCEPT_FLAG_ARG + rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false); +#else + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); +#endif + if (rc == -EAGAIN) { + /* Nothing ready, so wait for activity */ + init_waitqueue_entry(&wait, current); + add_wait_queue(sk_sleep(sock->sk), &wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + remove_wait_queue(sk_sleep(sock->sk), &wait); +#ifdef HAVE_KERN_SOCK_ACCEPT_FLAG_ARG + rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false); +#else + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); +#endif + } + + if (rc != 0) + goto failed; + + *newsockp = newsock; + return 0; + +failed: + sock_release(newsock); + return rc; +} + +int +lnet_sock_connect(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port, + struct net *ns) +{ + struct sockaddr_in srvaddr; + int rc; + + rc = lnet_sock_create(sockp, fatal, local_ip, local_port, ns); + if (rc != 0) + return rc; + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(peer_port); + srvaddr.sin_addr.s_addr = htonl(peer_ip); + + rc = kernel_connect(*sockp, (struct sockaddr *)&srvaddr, + sizeof(srvaddr), 0); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *fatal = !(rc == -EADDRNOTAVAIL); + + CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET, + "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc, + &local_ip, local_port, &peer_ip, peer_port); + + sock_release(*sockp); + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lo.c b/drivers/staging/lustrefsx/lnet/lnet/lo.c new file mode 100644 index 0000000000000..a11ecddb08349 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lo.c @@ -0,0 +1,116 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +static int +lolnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + LASSERT(!lntmsg->msg_routing); + LASSERT(!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0); +} + +static int +lolnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + struct lnet_msg *sendmsg = private; + + if (lntmsg != NULL) { /* not discarding */ + if (sendmsg->msg_iov != NULL) { + if (iov != NULL) + lnet_copy_iov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + else + lnet_copy_iov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + } else { + if (iov != NULL) + lnet_copy_kiov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + else + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + } + + lnet_finalize(lntmsg, 0); + } + + lnet_finalize(sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +static void +lolnd_shutdown(struct lnet_ni *ni) +{ + CDEBUG (D_NET, "shutdown\n"); + LASSERT(lolnd_instanced); + + lolnd_instanced = 0; +} + +static int +lolnd_startup(struct lnet_ni *ni) +{ + LASSERT (ni->ni_net->net_lnd == &the_lolnd); + LASSERT (!lolnd_instanced); + lolnd_instanced = 1; + + return (0); +} + +struct lnet_lnd the_lolnd = { + .lnd_list = { + .next = &the_lolnd.lnd_list, + .prev = &the_lolnd.lnd_list + }, + .lnd_type = LOLND, + .lnd_startup = lolnd_startup, + .lnd_shutdown = lolnd_shutdown, + .lnd_send = lolnd_send, + .lnd_recv = lolnd_recv +}; diff --git a/drivers/staging/lustrefsx/lnet/lnet/module.c b/drivers/staging/lustrefsx/lnet/lnet/module.c new file mode 100644 index 0000000000000..676f7345ca576 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/module.c @@ -0,0 +1,276 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +static int config_on_load = 0; +module_param(config_on_load, int, 0444); +MODULE_PARM_DESC(config_on_load, "configure network at module load"); + +static struct mutex lnet_config_mutex; + +static int +lnet_configure(void *arg) +{ + /* 'arg' only there so I can be passed to cfs_create_thread() */ + int rc = 0; + + mutex_lock(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = try_module_get(THIS_MODULE); + + if (rc != 1) + goto out; + + rc = LNetNIInit(LNET_PID_LUSTRE); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; + } else { + module_put(THIS_MODULE); + } + } + +out: + mutex_unlock(&lnet_config_mutex); + return rc; +} + +static int +lnet_unconfigure (void) +{ + int refcount; + + mutex_lock(&lnet_config_mutex); + + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); + module_put(THIS_MODULE); + } + + mutex_lock(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + mutex_unlock(&the_lnet.ln_api_mutex); + + mutex_unlock(&lnet_config_mutex); + + return (refcount == 0) ? 
0 : -EBUSY; +} + +static int +lnet_dyn_configure_net(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_data *conf = + (struct lnet_ioctl_config_data *)hdr; + int rc; + + if (conf->cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_add_net(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_unconfigure_net(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_data *conf = + (struct lnet_ioctl_config_data *) hdr; + int rc; + + if (conf->cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_del_net(conf->cfg_net); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_configure_ni(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_ni *conf = + (struct lnet_ioctl_config_ni *)hdr; + int rc; + + if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_add_ni(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_unconfigure_ni(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_ni *conf = + (struct lnet_ioctl_config_ni *) hdr; + int rc; + + if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_del_ni(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_ioctl(struct notifier_block *nb, + unsigned long cmd, void *vdata) +{ + struct libcfs_ioctl_hdr *hdr = vdata; + int rc; + + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: { + struct libcfs_ioctl_data *data = + (struct libcfs_ioctl_data *)hdr; + + if (data->ioc_hdr.ioc_len < sizeof(*data)) { + rc = -EINVAL; + } else { + the_lnet.ln_nis_from_mod_params = data->ioc_flags; + rc = lnet_configure(NULL); + } + break; + } + + case IOC_LIBCFS_UNCONFIGURE: + rc = lnet_unconfigure(); + break; + + case IOC_LIBCFS_ADD_NET: + rc = lnet_dyn_configure_net(hdr); + break; + + case IOC_LIBCFS_DEL_NET: + rc = lnet_dyn_unconfigure_net(hdr); + break; + + case IOC_LIBCFS_ADD_LOCAL_NI: + rc = lnet_dyn_configure_ni(hdr); + break; + + case IOC_LIBCFS_DEL_LOCAL_NI: + rc = lnet_dyn_unconfigure_ni(hdr); + break; + + default: + /* Passing LNET_PID_ANY only gives me a ref if the net is up + * already; I'll need it to ensure the net can't go down while + * I'm called into it */ + rc = LNetNIInit(LNET_PID_ANY); + if (rc >= 0) { + rc = LNetCtl(cmd, hdr); + LNetNIFini(); + } + break; + } + return notifier_from_ioctl_errno(rc); +} + +static struct notifier_block lnet_ioctl_handler = { + .notifier_call = lnet_ioctl, +}; + +static int __init lnet_init(void) +{ + int rc; + ENTRY; + + mutex_init(&lnet_config_mutex); + + rc = lnet_lib_init(); + if (rc != 0) { + CERROR("lnet_lib_init: error %d\n", rc); + RETURN(rc); + } + + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lnet_ioctl_handler); + LASSERT(rc == 0); + + if (config_on_load) { + /* Have to schedule a separate thread to avoid deadlocking + * in modload */ + (void)kthread_run(lnet_configure, NULL, "lnet_initd"); + } + + RETURN(0); +} + +static void __exit lnet_exit(void) +{ + int rc; + + rc = blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lnet_ioctl_handler); + LASSERT(rc == 0); + + lnet_lib_exit(); +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre Networking layer"); +MODULE_VERSION(LNET_VERSION); +MODULE_LICENSE("GPL"); + +module_init(lnet_init); +module_exit(lnet_exit); diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c new file mode 100644 index 0000000000000..4013ac47ab096 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c @@ -0,0 +1,1108 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/net_fault.c + * + * Lustre network fault simulation + * + * Author: liang.zhen@intel.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#define LNET_MSG_MASK (LNET_PUT_BIT | LNET_ACK_BIT | \ + LNET_GET_BIT | LNET_REPLY_BIT) + +struct lnet_drop_rule { + /** link chain on the_lnet.ln_drop_rules */ + struct list_head dr_link; + /** attributes of this rule */ + struct lnet_fault_attr dr_attr; + /** lock to protect \a dr_drop_at and \a dr_stat */ + spinlock_t dr_lock; + /** + * the message sequence to drop, which means message is dropped when + * dr_stat.drs_count == dr_drop_at + */ + unsigned long dr_drop_at; + /** + * seconds to drop the next message, it's exclusive with dr_drop_at + */ + time64_t dr_drop_time; + /** baseline to caculate dr_drop_time */ + time64_t dr_time_base; + /** statistic of dropped messages */ + struct lnet_fault_stat dr_stat; +}; + +static bool +lnet_fault_nid_match(lnet_nid_t nid, lnet_nid_t msg_nid) +{ + if (nid == msg_nid || nid == LNET_NID_ANY) + return true; + + if (LNET_NIDNET(nid) != LNET_NIDNET(msg_nid)) + return false; + + /* 255.255.255.255@net is wildcard for all addresses in a network */ + return LNET_NIDADDR(nid) == LNET_NIDADDR(LNET_NID_ANY); +} + +static bool +lnet_fault_attr_match(struct lnet_fault_attr *attr, lnet_nid_t src, + lnet_nid_t dst, unsigned int type, unsigned int portal) +{ + if (!lnet_fault_nid_match(attr->fa_src, src) || + !lnet_fault_nid_match(attr->fa_dst, dst)) + return false; + + if (!(attr->fa_msg_mask & (1 << type))) + return false; + + /* NB: ACK and REPLY have no portal, but they should have been + * rejected by message mask */ + if (attr->fa_ptl_mask != 0 && /* has portal filter */ + !(attr->fa_ptl_mask & (1ULL << portal))) + return false; + + return true; +} + +static int +lnet_fault_attr_validate(struct lnet_fault_attr *attr) +{ + if (attr->fa_msg_mask == 0) + attr->fa_msg_mask = LNET_MSG_MASK; /* all message types */ + + if (attr->fa_ptl_mask == 0) /* no portal filter */ + return 0; + + /* NB: only PUT and GET can be filtered if portal filter has 
been set */ + attr->fa_msg_mask &= LNET_GET_BIT | LNET_PUT_BIT; + if (attr->fa_msg_mask == 0) { + CDEBUG(D_NET, "can't find valid message type bits %x\n", + attr->fa_msg_mask); + return -EINVAL; + } + return 0; +} + +static void +lnet_fault_stat_inc(struct lnet_fault_stat *stat, unsigned int type) +{ + /* NB: fs_counter is NOT updated by this function */ + switch (type) { + case LNET_MSG_PUT: + stat->fs_put++; + return; + case LNET_MSG_ACK: + stat->fs_ack++; + return; + case LNET_MSG_GET: + stat->fs_get++; + return; + case LNET_MSG_REPLY: + stat->fs_reply++; + return; + } +} + +/** + * LNet message drop simulation + */ + +/** + * Add a new drop rule to LNet + * There is no check for duplicated drop rule, all rules will be checked for + * incoming message. + */ +static int +lnet_drop_rule_add(struct lnet_fault_attr *attr) +{ + struct lnet_drop_rule *rule; + ENTRY; + + if (!((attr->u.drop.da_rate == 0) ^ (attr->u.drop.da_interval == 0))) { + CDEBUG(D_NET, + "please provide either drop rate or drop interval, " + "but not both at the same time %d/%d\n", + attr->u.drop.da_rate, attr->u.drop.da_interval); + RETURN(-EINVAL); + } + + if (lnet_fault_attr_validate(attr) != 0) + RETURN(-EINVAL); + + CFS_ALLOC_PTR(rule); + if (rule == NULL) + RETURN(-ENOMEM); + + spin_lock_init(&rule->dr_lock); + + rule->dr_attr = *attr; + if (attr->u.drop.da_interval != 0) { + rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; + rule->dr_drop_time = ktime_get_seconds() + + cfs_rand() % attr->u.drop.da_interval; + } else { + rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; + } + + lnet_net_lock(LNET_LOCK_EX); + list_add(&rule->dr_link, &the_lnet.ln_drop_rules); + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "Added drop rule: src %s, dst %s, rate %d, interval %d\n", + libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_src), + attr->u.drop.da_rate, attr->u.drop.da_interval); + RETURN(0); +} + +/** + * Remove matched drop rules from lnet, all rules that can match \a src and + * \a dst will be removed. 
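Illustrative only: a short userspace sketch of the rate-based scheduling used by the drop rules above, with plain rand() standing in for cfs_rand() and all names invented. One drop point is chosen at random inside every window of 'rate' messages, so the long-run drop ratio is 1/rate while the exact position of each drop stays unpredictable.

#include <stdio.h>
#include <stdlib.h>

/* Pick one drop point per window of 'rate' messages, mirroring how
 * lnet_drop_rule_add() seeds dr_drop_at and drop_rule_match() re-arms it. */
static unsigned long next_drop_at(unsigned long window_start, unsigned int rate)
{
	return window_start + rand() % rate;
}

int main(void)
{
	unsigned int rate = 8;			/* drop 1 message in 8 on average */
	unsigned long drop_at = next_drop_at(0, rate);
	unsigned long count;

	for (count = 0; count < 32; count++) {
		if (count == drop_at)
			printf("message %lu dropped\n", count);
		if ((count + 1) % rate == 0)	/* a new window starts */
			drop_at = next_drop_at(count + 1, rate);
	}
	return 0;
}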
+ * If \a src is zero, then all rules have \a dst as destination will be remove + * If \a dst is zero, then all rules have \a src as source will be removed + * If both of them are zero, all rules will be removed + */ +static int +lnet_drop_rule_del(lnet_nid_t src, lnet_nid_t dst) +{ + struct lnet_drop_rule *rule; + struct lnet_drop_rule *tmp; + struct list_head zombies; + int n = 0; + ENTRY; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry_safe(rule, tmp, &the_lnet.ln_drop_rules, dr_link) { + if (rule->dr_attr.fa_src != src && src != 0) + continue; + + if (rule->dr_attr.fa_dst != dst && dst != 0) + continue; + + list_move(&rule->dr_link, &zombies); + } + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &zombies, dr_link) { + CDEBUG(D_NET, "Remove drop rule: src %s->dst: %s (1/%d, %d)\n", + libcfs_nid2str(rule->dr_attr.fa_src), + libcfs_nid2str(rule->dr_attr.fa_dst), + rule->dr_attr.u.drop.da_rate, + rule->dr_attr.u.drop.da_interval); + + list_del(&rule->dr_link); + CFS_FREE_PTR(rule); + n++; + } + + RETURN(n); +} + +/** + * List drop rule at position of \a pos + */ +static int +lnet_drop_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat) +{ + struct lnet_drop_rule *rule; + int cpt; + int i = 0; + int rc = -ENOENT; + ENTRY; + + cpt = lnet_net_lock_current(); + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + if (i++ < pos) + continue; + + spin_lock(&rule->dr_lock); + *attr = rule->dr_attr; + *stat = rule->dr_stat; + spin_unlock(&rule->dr_lock); + rc = 0; + break; + } + + lnet_net_unlock(cpt); + RETURN(rc); +} + +/** + * reset counters for all drop rules + */ +static void +lnet_drop_rule_reset(void) +{ + struct lnet_drop_rule *rule; + int cpt; + ENTRY; + + cpt = lnet_net_lock_current(); + + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + struct lnet_fault_attr *attr = &rule->dr_attr; + + spin_lock(&rule->dr_lock); + + memset(&rule->dr_stat, 0, sizeof(rule->dr_stat)); + if (attr->u.drop.da_rate != 0) { + rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; + } else { + rule->dr_drop_time = ktime_get_seconds() + + cfs_rand() % attr->u.drop.da_interval; + rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; + } + spin_unlock(&rule->dr_lock); + } + + lnet_net_unlock(cpt); + EXIT; +} + +static void +lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask) +{ + unsigned int random; + int choice; + int delta; + int best_delta; + int i; + + /* assign a random failure */ + random = cfs_rand(); + choice = random % (LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK); + if (choice == 0) + choice++; + + if (mask == HSTATUS_RANDOM) { + *hstatus = choice; + return; + } + + if (mask & (1 << choice)) { + *hstatus = choice; + return; + } + + /* round to the closest ON bit */ + i = HSTATUS_END; + best_delta = HSTATUS_END; + while (i > 0) { + if (mask & (1 << i)) { + delta = choice - i; + if (delta < 0) + delta *= -1; + if (delta < best_delta) { + best_delta = delta; + choice = i; + } + } + i--; + } + + *hstatus = choice; +} + +/** + * check source/destination NID, portal, message type and drop rate, + * decide whether should drop this message or not + */ +static bool +drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, + lnet_nid_t dst, unsigned int type, unsigned int portal, + enum lnet_msg_hstatus *hstatus) +{ + struct lnet_fault_attr *attr = &rule->dr_attr; + bool drop; + + if (!lnet_fault_attr_match(attr, src, dst, type, portal)) + return false; + + /* + * if 
we're trying to match a health status error but it hasn't + * been set in the rule, then don't match + */ + if ((hstatus && !attr->u.drop.da_health_error_mask) || + (!hstatus && attr->u.drop.da_health_error_mask)) + return false; + + /* match this rule, check drop rate now */ + spin_lock(&rule->dr_lock); + if (attr->u.drop.da_random) { + int value = cfs_rand() % attr->u.drop.da_interval; + if (value >= (attr->u.drop.da_interval / 2)) + drop = true; + else + drop = false; + } else if (rule->dr_drop_time != 0) { /* time based drop */ + time64_t now = ktime_get_seconds(); + + rule->dr_stat.fs_count++; + drop = now >= rule->dr_drop_time; + if (drop) { + if (now > rule->dr_time_base) + rule->dr_time_base = now; + + rule->dr_drop_time = rule->dr_time_base + + cfs_rand() % attr->u.drop.da_interval; + rule->dr_time_base += attr->u.drop.da_interval; + + CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %lld\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), + rule->dr_drop_time); + } + + } else { /* rate based drop */ + __u64 count; + + drop = rule->dr_stat.fs_count++ == rule->dr_drop_at; + count = rule->dr_stat.fs_count; + if (do_div(count, attr->u.drop.da_rate) == 0) { + rule->dr_drop_at = rule->dr_stat.fs_count + + cfs_rand() % attr->u.drop.da_rate; + CDEBUG(D_NET, "Drop Rule %s->%s: next drop: %lu\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), rule->dr_drop_at); + } + } + + if (drop) { /* drop this message, update counters */ + if (hstatus) + lnet_fault_match_health(hstatus, + attr->u.drop.da_health_error_mask); + lnet_fault_stat_inc(&rule->dr_stat, type); + rule->dr_stat.u.drop.ds_dropped++; + } + + spin_unlock(&rule->dr_lock); + return drop; +} + +/** + * Check if message from \a src to \a dst can match any existed drop rule + */ +bool +lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus) +{ + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + struct lnet_drop_rule *rule; + unsigned int ptl = -1; + bool drop = false; + int cpt; + + /* NB: if Portal is specified, then only PUT and GET will be + * filtered by drop rule */ + if (typ == LNET_MSG_PUT) + ptl = le32_to_cpu(hdr->msg.put.ptl_index); + else if (typ == LNET_MSG_GET) + ptl = le32_to_cpu(hdr->msg.get.ptl_index); + + cpt = lnet_net_lock_current(); + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + drop = drop_rule_match(rule, src, dst, typ, ptl, + hstatus); + if (drop) + break; + } + lnet_net_unlock(cpt); + + return drop; +} + +/** + * LNet Delay Simulation + */ +/** timestamp (second) to send delayed message */ +#define msg_delay_send msg_ev.hdr_data + +struct lnet_delay_rule { + /** link chain on the_lnet.ln_delay_rules */ + struct list_head dl_link; + /** link chain on delay_dd.dd_sched_rules */ + struct list_head dl_sched_link; + /** attributes of this rule */ + struct lnet_fault_attr dl_attr; + /** lock to protect \a below members */ + spinlock_t dl_lock; + /** refcount of delay rule */ + atomic_t dl_refcount; + /** + * the message sequence to delay, which means message is delayed when + * dl_stat.fs_count == dl_delay_at + */ + unsigned long dl_delay_at; + /** + * seconds to delay the next message, it's exclusive with dl_delay_at + */ + time64_t dl_delay_time; + /** baseline to caculate dl_delay_time */ + time64_t dl_time_base; + /** jiffies to send the next delayed message */ + unsigned long dl_msg_send; + /** delayed message list */ + struct list_head dl_msg_list; + /** 
statistic of delayed messages */ + struct lnet_fault_stat dl_stat; + /** timer to wakeup delay_daemon */ + struct timer_list dl_timer; +}; + +struct delay_daemon_data { + /** serialise rule add/remove */ + struct mutex dd_mutex; + /** protect rules on \a dd_sched_rules */ + spinlock_t dd_lock; + /** scheduled delay rules (by timer) */ + struct list_head dd_sched_rules; + /** deamon thread sleeps at here */ + wait_queue_head_t dd_waitq; + /** controler (lctl command) wait at here */ + wait_queue_head_t dd_ctl_waitq; + /** deamon is running */ + unsigned int dd_running; + /** deamon stopped */ + unsigned int dd_stopped; +}; + +static struct delay_daemon_data delay_dd; + +static void +delay_rule_decref(struct lnet_delay_rule *rule) +{ + if (atomic_dec_and_test(&rule->dl_refcount)) { + LASSERT(list_empty(&rule->dl_sched_link)); + LASSERT(list_empty(&rule->dl_msg_list)); + LASSERT(list_empty(&rule->dl_link)); + + CFS_FREE_PTR(rule); + } +} + +/** + * check source/destination NID, portal, message type and delay rate, + * decide whether should delay this message or not + */ +static bool +delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, + lnet_nid_t dst, unsigned int type, unsigned int portal, + struct lnet_msg *msg) +{ + struct lnet_fault_attr *attr = &rule->dl_attr; + bool delay; + time64_t now = ktime_get_seconds(); + + if (!lnet_fault_attr_match(attr, src, dst, type, portal)) + return false; + + /* match this rule, check delay rate now */ + spin_lock(&rule->dl_lock); + if (rule->dl_delay_time != 0) { /* time based delay */ + rule->dl_stat.fs_count++; + delay = now >= rule->dl_delay_time; + if (delay) { + if (now > rule->dl_time_base) + rule->dl_time_base = now; + + rule->dl_delay_time = rule->dl_time_base + + cfs_rand() % attr->u.delay.la_interval; + rule->dl_time_base += attr->u.delay.la_interval; + + CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %lld\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), + rule->dl_delay_time); + } + + } else { /* rate based delay */ + __u64 count; + + delay = rule->dl_stat.fs_count++ == rule->dl_delay_at; + /* generate the next random rate sequence */ + count = rule->dl_stat.fs_count; + if (do_div(count, attr->u.delay.la_rate) == 0) { + rule->dl_delay_at = rule->dl_stat.fs_count + + cfs_rand() % attr->u.delay.la_rate; + CDEBUG(D_NET, "Delay Rule %s->%s: next delay: %lu\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), rule->dl_delay_at); + } + } + + if (!delay) { + spin_unlock(&rule->dl_lock); + return false; + } + + /* delay this message, update counters */ + lnet_fault_stat_inc(&rule->dl_stat, type); + rule->dl_stat.u.delay.ls_delayed++; + + list_add_tail(&msg->msg_list, &rule->dl_msg_list); + msg->msg_delay_send = now + attr->u.delay.la_latency; + if (rule->dl_msg_send == -1) { + rule->dl_msg_send = msg->msg_delay_send; + mod_timer(&rule->dl_timer, + jiffies + cfs_time_seconds(attr->u.delay.la_latency)); + } + + spin_unlock(&rule->dl_lock); + return true; +} + +/** + * check if \a msg can match any Delay Rule, receiving of this message + * will be delayed if there is a match. 
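A toy userspace model of the invariant that delay_rule_match() and delayed_msg_check() maintain around dl_msg_list and dl_timer: every queued message gets the same fixed latency, so the list stays ordered by deadline, the timer only ever needs to track the head, and it is re-armed (or left idle) whenever due messages are drained. All names below are invented stand-ins, not the kernel structures.

#include <stddef.h>

struct toy_msg {
	long deadline;			/* msg_delay_send analogue */
	struct toy_msg *next;
};

struct toy_rule {
	struct toy_msg *head, *tail;	/* dl_msg_list analogue */
	long timer_deadline;		/* -1 means the timer is idle */
};

static void arm_timer(struct toy_rule *r, long deadline)
{
	r->timer_deadline = deadline;	/* mod_timer() in the kernel code */
}

/* delay_rule_match() analogue: append, and arm only if the timer was idle;
 * otherwise an earlier or equal deadline is already programmed. */
static void toy_delay(struct toy_rule *r, struct toy_msg *m, long now, long latency)
{
	m->deadline = now + latency;
	m->next = NULL;
	if (r->tail)
		r->tail->next = m;
	else
		r->head = m;
	r->tail = m;
	if (r->timer_deadline == -1)
		arm_timer(r, m->deadline);
}

/* delayed_msg_check() analogue: drain everything that is due, then either
 * re-arm for the new head or mark the timer idle. */
static struct toy_msg *toy_expire(struct toy_rule *r, long now)
{
	struct toy_msg *due = NULL, **tail = &due;

	while (r->head && r->head->deadline <= now) {
		struct toy_msg *m = r->head;

		r->head = m->next;
		m->next = NULL;
		*tail = m;
		tail = &m->next;
	}
	if (!r->head) {
		r->tail = NULL;
		r->timer_deadline = -1;
	} else {
		arm_timer(r, r->head->deadline);
	}
	return due;
}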
+ */ +bool +lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg) +{ + struct lnet_delay_rule *rule; + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + unsigned int ptl = -1; + + /* NB: called with hold of lnet_net_lock */ + + /* NB: if Portal is specified, then only PUT and GET will be + * filtered by delay rule */ + if (typ == LNET_MSG_PUT) + ptl = le32_to_cpu(hdr->msg.put.ptl_index); + else if (typ == LNET_MSG_GET) + ptl = le32_to_cpu(hdr->msg.get.ptl_index); + + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + if (delay_rule_match(rule, src, dst, typ, ptl, msg)) + return true; + } + + return false; +} + +/** check out delayed messages for send */ +static void +delayed_msg_check(struct lnet_delay_rule *rule, bool all, + struct list_head *msg_list) +{ + struct lnet_msg *msg; + struct lnet_msg *tmp; + time64_t now = ktime_get_seconds(); + + if (!all && rule->dl_msg_send > now) + return; + + spin_lock(&rule->dl_lock); + list_for_each_entry_safe(msg, tmp, &rule->dl_msg_list, msg_list) { + if (!all && msg->msg_delay_send > now) + break; + + msg->msg_delay_send = 0; + list_move_tail(&msg->msg_list, msg_list); + } + + if (list_empty(&rule->dl_msg_list)) { + del_timer(&rule->dl_timer); + rule->dl_msg_send = -1; + + } else if (!list_empty(msg_list)) { + /* dequeued some timedout messages, update timer for the + * next delayed message on rule */ + msg = list_entry(rule->dl_msg_list.next, + struct lnet_msg, msg_list); + rule->dl_msg_send = msg->msg_delay_send; + mod_timer(&rule->dl_timer, + jiffies + + cfs_time_seconds(msg->msg_delay_send - now)); + } + spin_unlock(&rule->dl_lock); +} + +static void +delayed_msg_process(struct list_head *msg_list, bool drop) +{ + struct lnet_msg *msg; + + while (!list_empty(msg_list)) { + struct lnet_ni *ni; + int cpt; + int rc; + + msg = list_entry(msg_list->next, struct lnet_msg, msg_list); + + if (msg->msg_sending) { + /* Delayed send */ + list_del_init(&msg->msg_list); + ni = msg->msg_txni; + CDEBUG(D_NET, "TRACE: msg %p %s -> %s : %s\n", msg, + libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type)); + lnet_ni_send(ni, msg); + continue; + } + + /* Delayed receive */ + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_rxni != NULL); + + ni = msg->msg_rxni; + cpt = msg->msg_rx_cpt; + + list_del_init(&msg->msg_list); + if (drop) { + rc = -ECANCELED; + + } else if (!msg->msg_routing) { + rc = lnet_parse_local(ni, msg); + if (rc == 0) + continue; + + } else { + lnet_net_lock(cpt); + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + switch (rc) { + case LNET_CREDIT_OK: + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, msg->msg_len, msg->msg_len); + fallthrough; + case LNET_CREDIT_WAIT: + continue; + default: /* failures */ + break; + } + } + + lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len, + msg->msg_type); + lnet_finalize(msg, rc); + } +} + +/** + * Process delayed messages for scheduled rules + * This function can either be called by delay_rule_daemon, or by lnet_finalise + */ +void +lnet_delay_rule_check(void) +{ + struct lnet_delay_rule *rule; + struct list_head msgs; + + INIT_LIST_HEAD(&msgs); + while (1) { + if (list_empty(&delay_dd.dd_sched_rules)) + break; + + spin_lock_bh(&delay_dd.dd_lock); + if (list_empty(&delay_dd.dd_sched_rules)) { + spin_unlock_bh(&delay_dd.dd_lock); + break; + } + + rule = list_entry(delay_dd.dd_sched_rules.next, + struct 
lnet_delay_rule, dl_sched_link); + list_del_init(&rule->dl_sched_link); + spin_unlock_bh(&delay_dd.dd_lock); + + delayed_msg_check(rule, false, &msgs); + delay_rule_decref(rule); /* -1 for delay_dd.dd_sched_rules */ + } + + if (!list_empty(&msgs)) + delayed_msg_process(&msgs, false); +} + +/** deamon thread to handle delayed messages */ +static int +lnet_delay_rule_daemon(void *arg) +{ + delay_dd.dd_running = 1; + wake_up(&delay_dd.dd_ctl_waitq); + + while (delay_dd.dd_running) { + wait_event_interruptible(delay_dd.dd_waitq, + !delay_dd.dd_running || + !list_empty(&delay_dd.dd_sched_rules)); + lnet_delay_rule_check(); + } + + /* in case more rules have been enqueued after my last check */ + lnet_delay_rule_check(); + delay_dd.dd_stopped = 1; + wake_up(&delay_dd.dd_ctl_waitq); + + return 0; +} + +static void +delay_timer_cb(cfs_timer_cb_arg_t data) +{ + struct lnet_delay_rule *rule = cfs_from_timer(rule, data, dl_timer); + + spin_lock_bh(&delay_dd.dd_lock); + if (list_empty(&rule->dl_sched_link) && delay_dd.dd_running) { + atomic_inc(&rule->dl_refcount); + list_add_tail(&rule->dl_sched_link, &delay_dd.dd_sched_rules); + wake_up(&delay_dd.dd_waitq); + } + spin_unlock_bh(&delay_dd.dd_lock); +} + +/** + * Add a new delay rule to LNet + * There is no check for duplicated delay rule, all rules will be checked for + * incoming message. + */ +int +lnet_delay_rule_add(struct lnet_fault_attr *attr) +{ + struct lnet_delay_rule *rule; + int rc = 0; + ENTRY; + + if (!((attr->u.delay.la_rate == 0) ^ + (attr->u.delay.la_interval == 0))) { + CDEBUG(D_NET, + "please provide either delay rate or delay interval, " + "but not both at the same time %d/%d\n", + attr->u.delay.la_rate, attr->u.delay.la_interval); + RETURN(-EINVAL); + } + + if (attr->u.delay.la_latency == 0) { + CDEBUG(D_NET, "delay latency cannot be zero\n"); + RETURN(-EINVAL); + } + + if (lnet_fault_attr_validate(attr) != 0) + RETURN(-EINVAL); + + CFS_ALLOC_PTR(rule); + if (rule == NULL) + RETURN(-ENOMEM); + + mutex_lock(&delay_dd.dd_mutex); + if (!delay_dd.dd_running) { + struct task_struct *task; + + /* NB: although LND threads will process delayed message + * in lnet_finalize, but there is no guarantee that LND + * threads will be waken up if no other message needs to + * be handled. + * Only one daemon thread, performance is not the concern + * of this simualation module. 
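The callback/daemon split above, where delay_timer_cb() only queues the rule and wakes the thread while lnet_delay_rule_daemon() does the actual processing in process context, can be sketched in plain pthreads. Everything below is an invented userspace stand-in, not LNet code.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool pending;			/* "dd_sched_rules non-empty" analogue */
static bool running = true;		/* dd_running analogue */

static void timer_cb(void)		/* delay_timer_cb() analogue */
{
	pthread_mutex_lock(&lock);
	pending = true;			/* queue the rule ... */
	pthread_cond_signal(&cond);	/* ... and wake the daemon */
	pthread_mutex_unlock(&lock);
}

static void *daemon_fn(void *arg)	/* lnet_delay_rule_daemon() analogue */
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (running) {
		while (!pending && running)
			pthread_cond_wait(&cond, &lock);
		pending = false;
		pthread_mutex_unlock(&lock);
		/* lnet_delay_rule_check(): push out the due messages */
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, daemon_fn, NULL);
	timer_cb();			/* as if the delay timer had fired */

	pthread_mutex_lock(&lock);	/* shutdown, as lnet_delay_rule_del() does */
	running = false;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}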
+ */ + task = kthread_run(lnet_delay_rule_daemon, NULL, "lnet_dd"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + GOTO(failed, rc); + } + wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_running); + } + + cfs_timer_setup(&rule->dl_timer, delay_timer_cb, + (unsigned long)rule, 0); + + spin_lock_init(&rule->dl_lock); + INIT_LIST_HEAD(&rule->dl_msg_list); + INIT_LIST_HEAD(&rule->dl_sched_link); + + rule->dl_attr = *attr; + if (attr->u.delay.la_interval != 0) { + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; + rule->dl_delay_time = ktime_get_seconds() + + cfs_rand() % attr->u.delay.la_interval; + } else { + rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; + } + + rule->dl_msg_send = -1; + + lnet_net_lock(LNET_LOCK_EX); + atomic_set(&rule->dl_refcount, 1); + list_add(&rule->dl_link, &the_lnet.ln_delay_rules); + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "Added delay rule: src %s, dst %s, rate %d\n", + libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_src), + attr->u.delay.la_rate); + + mutex_unlock(&delay_dd.dd_mutex); + RETURN(0); + failed: + mutex_unlock(&delay_dd.dd_mutex); + CFS_FREE_PTR(rule); + return rc; +} + +/** + * Remove matched Delay Rules from lnet, if \a shutdown is true or both \a src + * and \a dst are zero, all rules will be removed, otherwise only matched rules + * will be removed. + * If \a src is zero, then all rules have \a dst as destination will be remove + * If \a dst is zero, then all rules have \a src as source will be removed + * + * When a delay rule is removed, all delayed messages of this rule will be + * processed immediately. + */ +int +lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown) +{ + struct lnet_delay_rule *rule; + struct lnet_delay_rule *tmp; + struct list_head rule_list; + struct list_head msg_list; + int n = 0; + bool cleanup; + ENTRY; + + INIT_LIST_HEAD(&rule_list); + INIT_LIST_HEAD(&msg_list); + + if (shutdown) + src = dst = 0; + + mutex_lock(&delay_dd.dd_mutex); + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &the_lnet.ln_delay_rules, dl_link) { + if (rule->dl_attr.fa_src != src && src != 0) + continue; + + if (rule->dl_attr.fa_dst != dst && dst != 0) + continue; + + CDEBUG(D_NET, "Remove delay rule: src %s->dst: %s (1/%d, %d)\n", + libcfs_nid2str(rule->dl_attr.fa_src), + libcfs_nid2str(rule->dl_attr.fa_dst), + rule->dl_attr.u.delay.la_rate, + rule->dl_attr.u.delay.la_interval); + /* refcount is taken over by rule_list */ + list_move(&rule->dl_link, &rule_list); + } + + /* check if we need to shutdown delay_daemon */ + cleanup = list_empty(&the_lnet.ln_delay_rules) && + !list_empty(&rule_list); + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &rule_list, dl_link) { + list_del_init(&rule->dl_link); + + del_timer_sync(&rule->dl_timer); + delayed_msg_check(rule, true, &msg_list); + delay_rule_decref(rule); /* -1 for the_lnet.ln_delay_rules */ + n++; + } + + if (cleanup) { /* no more delay rule, shutdown delay_daemon */ + LASSERT(delay_dd.dd_running); + delay_dd.dd_running = 0; + wake_up(&delay_dd.dd_waitq); + + while (!delay_dd.dd_stopped) + wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_stopped); + } + mutex_unlock(&delay_dd.dd_mutex); + + if (!list_empty(&msg_list)) + delayed_msg_process(&msg_list, shutdown); + + RETURN(n); +} + +/** + * List Delay Rule at position of \a pos + */ +int +lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat) +{ + struct lnet_delay_rule *rule; + int cpt; + int i = 0; + int rc = 
-ENOENT; + ENTRY; + + cpt = lnet_net_lock_current(); + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + if (i++ < pos) + continue; + + spin_lock(&rule->dl_lock); + *attr = rule->dl_attr; + *stat = rule->dl_stat; + spin_unlock(&rule->dl_lock); + rc = 0; + break; + } + + lnet_net_unlock(cpt); + RETURN(rc); +} + +/** + * reset counters for all Delay Rules + */ +void +lnet_delay_rule_reset(void) +{ + struct lnet_delay_rule *rule; + int cpt; + ENTRY; + + cpt = lnet_net_lock_current(); + + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + struct lnet_fault_attr *attr = &rule->dl_attr; + + spin_lock(&rule->dl_lock); + + memset(&rule->dl_stat, 0, sizeof(rule->dl_stat)); + if (attr->u.delay.la_rate != 0) { + rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; + } else { + rule->dl_delay_time = ktime_get_seconds() + + cfs_rand() % attr->u.delay.la_interval; + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; + } + spin_unlock(&rule->dl_lock); + } + + lnet_net_unlock(cpt); + EXIT; +} + +int +lnet_fault_ctl(int opc, struct libcfs_ioctl_data *data) +{ + struct lnet_fault_attr *attr; + struct lnet_fault_stat *stat; + + attr = (struct lnet_fault_attr *)data->ioc_inlbuf1; + + switch (opc) { + default: + return -EINVAL; + + case LNET_CTL_DROP_ADD: + if (attr == NULL) + return -EINVAL; + + return lnet_drop_rule_add(attr); + + case LNET_CTL_DROP_DEL: + if (attr == NULL) + return -EINVAL; + + data->ioc_count = lnet_drop_rule_del(attr->fa_src, + attr->fa_dst); + return 0; + + case LNET_CTL_DROP_RESET: + lnet_drop_rule_reset(); + return 0; + + case LNET_CTL_DROP_LIST: + stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; + if (attr == NULL || stat == NULL) + return -EINVAL; + + return lnet_drop_rule_list(data->ioc_count, attr, stat); + + case LNET_CTL_DELAY_ADD: + if (attr == NULL) + return -EINVAL; + + return lnet_delay_rule_add(attr); + + case LNET_CTL_DELAY_DEL: + if (attr == NULL) + return -EINVAL; + + data->ioc_count = lnet_delay_rule_del(attr->fa_src, + attr->fa_dst, false); + return 0; + + case LNET_CTL_DELAY_RESET: + lnet_delay_rule_reset(); + return 0; + + case LNET_CTL_DELAY_LIST: + stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; + if (attr == NULL || stat == NULL) + return -EINVAL; + + return lnet_delay_rule_list(data->ioc_count, attr, stat); + } +} + +int +lnet_fault_init(void) +{ + CLASSERT(LNET_PUT_BIT == 1 << LNET_MSG_PUT); + CLASSERT(LNET_ACK_BIT == 1 << LNET_MSG_ACK); + CLASSERT(LNET_GET_BIT == 1 << LNET_MSG_GET); + CLASSERT(LNET_REPLY_BIT == 1 << LNET_MSG_REPLY); + + mutex_init(&delay_dd.dd_mutex); + spin_lock_init(&delay_dd.dd_lock); + init_waitqueue_head(&delay_dd.dd_waitq); + init_waitqueue_head(&delay_dd.dd_ctl_waitq); + INIT_LIST_HEAD(&delay_dd.dd_sched_rules); + + return 0; +} + +void +lnet_fault_fini(void) +{ + lnet_drop_rule_del(0, 0); + lnet_delay_rule_del(0, 0, true); + + LASSERT(list_empty(&the_lnet.ln_drop_rules)); + LASSERT(list_empty(&the_lnet.ln_delay_rules)); + LASSERT(list_empty(&delay_dd.dd_sched_rules)); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c new file mode 100644 index 0000000000000..fe3add7b9701c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c @@ -0,0 +1,1200 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +#define IPSTRING_LENGTH 16 + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +static DEFINE_SPINLOCK(libcfs_nidstring_lock); + +static struct netstrfns *libcfs_namenum2netstrfns(const char *name); + +char * +libcfs_next_nidstring(void) +{ + char *str; + unsigned long flags; + + spin_lock_irqsave(&libcfs_nidstring_lock, flags); + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings)) + libcfs_nidstring_idx = 0; + + spin_unlock_irqrestore(&libcfs_nidstring_lock, flags); + return str; +} +EXPORT_SYMBOL(libcfs_next_nidstring); + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. 
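For illustration, a few nid range lists of the form the parser below is written to accept (assumed examples, not taken from the patch): address ranges use per-octet expressions, '*' matches every address on a network, and multiple ranges are separated by spaces.

/* Assumed, illustrative inputs for cfs_parse_nidlist() */
static const char *example_nidlists[] = {
	"10.0.2.17@tcp",		/* one nid on network tcp (number 0) */
	"10.0.2.[1-100]@tcp1",		/* last octet ranged, network tcp1 */
	"10.0.[2-3].[0-255/2]@o2ib",	/* ranges, one with a stride of 2 */
	"*@o2ib1",			/* every address on o2ib1 */
	"192.168.0.[5,7,9]@tcp 42@gni",	/* two ranges separated by a space */
};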
+ */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 0 if \a src parses to '*' | \ | \ + * \retval -errno otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 0; + } + + LIBCFS_ALLOC(addrrange, sizeof(struct addrrange)); + if (addrrange == NULL) + return -ENOMEM; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. + * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. "elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + LIBCFS_ALLOC(nr, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct nidrange *nr; + + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; +failed: + return 0; +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + LIBCFS_FREE(ar, sizeof(struct addrrange)); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. 
+ * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + LIBCFS_FREE(nr, sizeof(struct nidrange)); + } +} +EXPORT_SYMBOL(cfs_free_nidlist); + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. + * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} +EXPORT_SYMBOL(cfs_parse_nidlist); + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). + * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} +EXPORT_SYMBOL(cfs_match_nid); + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_network(char *buffer, int count, struct nidrange *nr) +{ + struct netstrfns *nf = nr->nr_netstrfns; + + if (nr->nr_netnum == 0) + return scnprintf(buffer, count, "@%s", nf->nf_name); + else + return scnprintf(buffer, count, "@%s%u", + nf->nf_name, nr->nr_netnum); +} + +/** + * Print a list of addrrange (\a addrranges) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges, + struct nidrange *nr) +{ + int i = 0; + struct addrrange *ar; + struct netstrfns *nf = nr->nr_netstrfns; + + list_for_each_entry(ar, addrranges, ar_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + i += nf->nf_print_addrlist(buffer + i, count - i, + &ar->ar_numaddr_ranges); + i += cfs_print_network(buffer + i, count - i, nr); + } + return i; +} + +/** + * Print a list of nidranges (\a nidlist) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * Nidranges are separated by a space character. 
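A hypothetical caller of the helpers above, for illustration only (nid_is_allowed() is invented and not part of this patch; error handling is trimmed). It compiles a nid range list, tests one nid against it, and prints the compiled form back.

static bool nid_is_allowed(char *range, lnet_nid_t nid)
{
	char buf[4 * LNET_NIDSTR_SIZE];
	LIST_HEAD(nidlist);
	bool allowed = false;

	/* e.g. range = "10.0.2.[1-100]@tcp *@o2ib1" */
	if (cfs_parse_nidlist(range, strlen(range), &nidlist)) {
		allowed = cfs_match_nid(nid, &nidlist);
		cfs_print_nidlist(buf, sizeof(buf), &nidlist);
		CDEBUG(D_NET, "nidlist '%s' %s %s\n", buf,
		       allowed ? "matches" : "does not match",
		       libcfs_nid2str(nid));
		cfs_free_nidlist(&nidlist);
	}
	return allowed;
}

The nid being tested would typically come from libcfs_str2nid(), defined later in this file.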
+ * + * \retval number of characters written + */ +int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist) +{ + int i = 0; + struct nidrange *nr; + + if (count <= 0) + return 0; + + list_for_each_entry(nr, nidlist, nr_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + + if (nr->nr_all != 0) { + LASSERT(list_empty(&nr->nr_addrranges)); + i += scnprintf(buffer + i, count - i, "*"); + i += cfs_print_network(buffer + i, count - i, nr); + } else { + i += cfs_print_addrranges(buffer + i, count - i, + &nr->nr_addrranges, nr); + } + } + return i; +} +EXPORT_SYMBOL(cfs_print_nidlist); + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *range; + unsigned int min_ip[4] = {0}; + unsigned int max_ip[4] = {0}; + int cur_octet = 0; + bool expect_full_octet = false; + + list_for_each_entry(expr_list, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(range, &expr_list->el_exprs, re_link) { + /* XXX: add support for multiple & non-contig. re's */ + if (re_count > 0) + return -EINVAL; + + /* if a previous octet was ranged, then all remaining + * octets must be full for contiguous range */ + if (expect_full_octet && (range->re_lo != 0 || + range->re_hi != 255)) + return -ERANGE; + + if (range->re_stride != 1) + return -ERANGE; + + if (range->re_lo > range->re_hi) + return -EINVAL; + + if (range->re_lo != range->re_hi) + expect_full_octet = true; + + min_ip[cur_octet] = range->re_lo; + max_ip[cur_octet] = range->re_hi; + + re_count++; + } + + cur_octet++; + } + + if (min_nid != NULL) + *min_nid = ((min_ip[0] << 24) | (min_ip[1] << 16) | + (min_ip[2] << 8) | min_ip[3]); + + if (max_nid != NULL) + *max_nid = ((max_ip[0] << 24) | (max_ip[1] << 16) | + (max_ip[2] << 8) | max_ip[3]); + + return 0; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + */ +static int cfs_num_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *el; + struct cfs_range_expr *re; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + + list_for_each_entry(el, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(re, &el->el_exprs, re_link) { + if (re_count > 0) + return -EINVAL; + if (re->re_lo > re->re_hi) + return -EINVAL; + + if (re->re_lo < min_addr || min_addr == 0) + min_addr = re->re_lo; + if (re->re_hi > max_addr) + max_addr = re->re_hi; + + re_count++; + } + } + + if (min_nid != NULL) + *min_nid = min_addr; + if (max_nid != NULL) + *max_nid = max_addr; + + return 0; +} + +/** + * Takes a linked list of nidrange expressions, determines the minimum + * and maximum nid and creates appropriate nid structures + * + * \param[out] *min_nid string representation of min NID + * \param[out] *max_nid string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +int cfs_nidrange_find_min_max(struct list_head *nidlist, char 
*min_nid, + char *max_nid, size_t nidstr_length) +{ + struct nidrange *first_nidrange; + int netnum; + struct netstrfns *nf; + char *lndname; + __u32 min_addr; + __u32 max_addr; + char min_addr_str[IPSTRING_LENGTH]; + char max_addr_str[IPSTRING_LENGTH]; + int rc; + + first_nidrange = list_entry(nidlist->next, struct nidrange, nr_link); + + netnum = first_nidrange->nr_netnum; + nf = first_nidrange->nr_netstrfns; + lndname = nf->nf_name; + + rc = nf->nf_min_max(nidlist, &min_addr, &max_addr); + if (rc < 0) + return rc; + + nf->nf_addr2str(min_addr, min_addr_str, sizeof(min_addr_str)); + nf->nf_addr2str(max_addr, max_addr_str, sizeof(max_addr_str)); + + snprintf(min_nid, nidstr_length, "%s@%s%d", min_addr_str, lndname, + netnum); + snprintf(max_nid, nidstr_length, "%s@%s%d", max_addr_str, lndname, + netnum); + + return 0; +} +EXPORT_SYMBOL(cfs_nidrange_find_min_max); + +/** + * Determines the min and max NID values for num LNDs + * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + unsigned int tmp_min_addr = 0; + unsigned int tmp_max_addr = 0; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_num_ar_min_max(ar, &tmp_min_addr, + &tmp_max_addr); + if (rc < 0) + return rc; + + if (tmp_min_addr < min_addr || min_addr == 0) + min_addr = tmp_min_addr; + if (tmp_max_addr > max_addr) + max_addr = tmp_min_addr; + } + } + if (max_nid != NULL) + *max_nid = max_addr; + if (min_nid != NULL) + *min_nid = min_addr; + + return 0; +} + +/** + * Takes an nidlist and determines the minimum and maximum + * ip addresses. 
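A worked example of the packing cfs_ip_ar_min_max() performs, assuming the address range 10.0.[2-3].[0-255]; a range such as 10.0.[2-3].7 would instead fail with -ERANGE, because once an octet is ranged every later octet must cover the full 0-255 span for the result to be contiguous.

#include <stdio.h>

int main(void)
{
	unsigned int min_ip[4] = { 10, 0, 2, 0 };	/* low end per octet */
	unsigned int max_ip[4] = { 10, 0, 3, 255 };	/* high end per octet */
	unsigned int min_nid = (min_ip[0] << 24) | (min_ip[1] << 16) |
			       (min_ip[2] << 8) | min_ip[3];
	unsigned int max_nid = (max_ip[0] << 24) | (max_ip[1] << 16) |
			       (max_ip[2] << 8) | max_ip[3];

	/* prints: min 0x0a000200 max 0x0a0003ff, i.e. 10.0.2.0 .. 10.0.3.255 */
	printf("min 0x%08x max 0x%08x\n", min_nid, max_nid);
	return 0;
}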
+ * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + __u32 tmp_min_ip_addr = 0; + __u32 tmp_max_ip_addr = 0; + __u32 min_ip_addr = 0; + __u32 max_ip_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + if (nr->nr_all) { + min_ip_addr = 0; + max_ip_addr = 0xffffffff; + break; + } + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_ip_ar_min_max(ar, &tmp_min_ip_addr, + &tmp_max_ip_addr); + if (rc < 0) + return rc; + + if (tmp_min_ip_addr < min_ip_addr || min_ip_addr == 0) + min_ip_addr = tmp_min_ip_addr; + if (tmp_max_ip_addr > max_ip_addr) + max_ip_addr = tmp_max_ip_addr; + } + + nidlist_count++; + } + + if (max_nid != NULL) + *max_nid = max_ip_addr; + if (min_nid != NULL) + *min_nid = min_ip_addr; + + return 0; +} + +static int +libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void +libcfs_ip_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ +static int +libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? */ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + return 0; +} + +/* Used by lnet/config.c so it can't be static */ +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; +out: + cfs_expr_list_free_list(list); + + return rc; +} + +static int +libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + LASSERT(j++ < 4); + if (i != 0) + i += scnprintf(buffer + i, count - i, "."); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/** + * Matches address (\a addr) against address set encoded in \a list. 
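The CAVEAT EMPTOR note above explains the "%n" idiom used by libcfs_ip_str2addr(); below is a standalone userspace demonstration (parse_ip() is an invented mirror of that function, with a plain 0-255 bound check in place of the mask test).

#include <stdio.h>

/* Accept the parse only when exactly 'nob' characters were consumed. */
static int parse_ip(const char *str, int nob, unsigned int *addr)
{
	unsigned int a, b, c, d;
	int n = nob;	/* pre-set in case the scan stops at the NUL before %n */

	if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
	    n == nob && a <= 255 && b <= 255 && c <= 255 && d <= 255) {
		*addr = (a << 24) | (b << 16) | (c << 8) | d;
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned int addr;

	printf("%d\n", parse_ip("10.0.2.17", 9, &addr));	/* 1: clean parse */
	printf("%d\n", parse_ip("10.0.2.17x", 10, &addr));	/* 0: trailing junk */
	printf("%d\n", parse_ip("10.0.2", 6, &addr));		/* 0: too few octets */
	return 0;
}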
+ * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. + * + * \retval number of characters written + */ +static void +libcfs_decnum_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u", addr); +} + +static int +libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +static int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +static int +libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + LASSERT(j++ < 1); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + LASSERT(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +static struct netstrfns libcfs_netstrfns[] = { + { .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max }, + { .nf_type = SOCKLND, + .nf_name = "tcp", + .nf_modname = "ksocklnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max }, + { .nf_type = O2IBLND, + .nf_name = "o2ib", + .nf_modname = "ko2iblnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max }, + { .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max }, + { .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = 
libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max }, + { .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max}, +}; + +static const size_t libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns); + +static struct netstrfns * +libcfs_lnd2netstrfns(__u32 lnd) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (!strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(__u32 lnd) +{ + return libcfs_lnd2netstrfns(lnd) != NULL; +} +EXPORT_SYMBOL(libcfs_isknown_lnd); + +char * +libcfs_lnd2modname(__u32 lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? NULL : nf->nf_modname; +} +EXPORT_SYMBOL(libcfs_lnd2modname); + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -ENXIO; +} +EXPORT_SYMBOL(libcfs_str2lnd); + +char * +libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size) +{ + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "?%u?", lnd); + else + snprintf(buf, buf_size, "%s", nf->nf_name); + + return buf; +} +EXPORT_SYMBOL(libcfs_lnd2str_r); + +char * +libcfs_net2str_r(__u32 net, char *buf, size_t buf_size) +{ + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "<%u:%u>", lnd, nnum); + else if (nnum == 0) + snprintf(buf, buf_size, "%s", nf->nf_name); + else + snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum); + + return buf; +} +EXPORT_SYMBOL(libcfs_net2str_r); + +char * +libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + if (nid == LNET_NID_ANY) { + strncpy(buf, "", buf_size); + buf[buf_size - 1] = '\0'; + return buf; + } + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) { + snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum); + } else { + size_t addr_len; + + nf->nf_addr2str(addr, buf, buf_size); + addr_len = strlen(buf); + if (nnum == 0) + snprintf(buf + addr_len, buf_size - addr_len, "@%s", + nf->nf_name); + else + snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", + nf->nf_name, nnum); + } + + return buf; +} +EXPORT_SYMBOL(libcfs_nid2str_r); + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf = NULL; + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == 
libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NIDNET(LNET_NID_ANY); +} +EXPORT_SYMBOL(libcfs_str2net); + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} +EXPORT_SYMBOL(libcfs_str2nid); + +char * +libcfs_id2str(struct lnet_process_id id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} +EXPORT_SYMBOL(libcfs_id2str); + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} +EXPORT_SYMBOL(libcfs_str2anynid); diff --git a/drivers/staging/lustrefsx/lnet/lnet/peer.c b/drivers/staging/lustrefsx/lnet/lnet/peer.c new file mode 100644 index 0000000000000..c2d64d140702e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/peer.c @@ -0,0 +1,3620 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/peer.c + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + +#include +#include + +/* Value indicating that recovery needs to re-check a peer immediately. 
*/ +#define LNET_REDISCOVER_PEER (1) + +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp); + +static void +lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni) +{ + if (!list_empty(&lpni->lpni_on_remote_peer_ni_list)) { + list_del_init(&lpni->lpni_on_remote_peer_ni_list); + lnet_peer_ni_decref_locked(lpni); + } +} + +void +lnet_peer_net_added(struct lnet_net *net) +{ + struct lnet_peer_ni *lpni, *tmp; + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, + lpni_on_remote_peer_ni_list) { + + if (LNET_NIDNET(lpni->lpni_nid) == net->net_id) { + lpni->lpni_net = net; + + spin_lock(&lpni->lpni_lock); + lpni->lpni_txcredits = + lpni->lpni_net->net_tunables.lct_peer_tx_credits; + lpni->lpni_mintxcredits = lpni->lpni_txcredits; + lpni->lpni_rtrcredits = + lnet_peer_buffer_credits(lpni->lpni_net); + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + spin_unlock(&lpni->lpni_lock); + + lnet_peer_remove_from_remote_list(lpni); + } + } +} + +static void +lnet_peer_tables_destroy(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + if (!the_lnet.ln_peer_tables) + return; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + hash = ptable->pt_hash; + if (!hash) /* not intialized */ + break; + + LASSERT(list_empty(&ptable->pt_zombie_list)); + + ptable->pt_hash = NULL; + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + LASSERT(list_empty(&hash[j])); + + LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash)); + } + + cfs_percpt_free(the_lnet.ln_peer_tables); + the_lnet.ln_peer_tables = NULL; +} + +int +lnet_peer_tables_create(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ptable)); + if (the_lnet.ln_peer_tables == NULL) { + CERROR("Failed to allocate cpu-partition peer tables\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i, + LNET_PEER_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create peer hash table\n"); + lnet_peer_tables_destroy(); + return -ENOMEM; + } + + spin_lock_init(&ptable->pt_zombie_lock); + INIT_LIST_HEAD(&ptable->pt_zombie_list); + + INIT_LIST_HEAD(&ptable->pt_peer_list); + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + INIT_LIST_HEAD(&hash[j]); + ptable->pt_hash = hash; /* sign of initialization */ + } + + return 0; +} + +static struct lnet_peer_ni * +lnet_peer_ni_alloc(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_net *net; + int cpt; + + cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + LIBCFS_CPT_ALLOC(lpni, lnet_cpt_table(), cpt, sizeof(*lpni)); + if (!lpni) + return NULL; + + INIT_LIST_HEAD(&lpni->lpni_txq); + INIT_LIST_HEAD(&lpni->lpni_rtrq); + INIT_LIST_HEAD(&lpni->lpni_routes); + INIT_LIST_HEAD(&lpni->lpni_hashlist); + INIT_LIST_HEAD(&lpni->lpni_peer_nis); + INIT_LIST_HEAD(&lpni->lpni_recovery); + INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list); + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + + spin_lock_init(&lpni->lpni_lock); + + lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ + lpni->lpni_last_alive = ktime_get_seconds(); /* assumes alive */ + lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL; + lpni->lpni_nid = nid; + lpni->lpni_cpt = cpt; + atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE); + + net = lnet_get_net_locked(LNET_NIDNET(nid)); + lpni->lpni_net = net; + if (net) { + lpni->lpni_txcredits = net->net_tunables.lct_peer_tx_credits; + lpni->lpni_mintxcredits = lpni->lpni_txcredits; + lpni->lpni_rtrcredits = lnet_peer_buffer_credits(net); + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + } else { + /* + * This peer_ni is not on a local network, so we + * cannot add the credits here. In case the net is + * added later, add the peer_ni to the remote peer ni + * list so it can be easily found and revisited. + */ + /* FIXME: per-net implementation instead? */ + atomic_inc(&lpni->lpni_refcount); + list_add_tail(&lpni->lpni_on_remote_peer_ni_list, + &the_lnet.ln_remote_peer_ni_list); + } + + CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); + + return lpni; +} + +static struct lnet_peer_net * +lnet_peer_net_alloc(__u32 net_id) +{ + struct lnet_peer_net *lpn; + + LIBCFS_CPT_ALLOC(lpn, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lpn)); + if (!lpn) + return NULL; + + INIT_LIST_HEAD(&lpn->lpn_peer_nets); + INIT_LIST_HEAD(&lpn->lpn_peer_nis); + lpn->lpn_net_id = net_id; + + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + + return lpn; +} + +void +lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn) +{ + struct lnet_peer *lp; + + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + + LASSERT(atomic_read(&lpn->lpn_refcount) == 0); + LASSERT(list_empty(&lpn->lpn_peer_nis)); + LASSERT(list_empty(&lpn->lpn_peer_nets)); + lp = lpn->lpn_peer; + lpn->lpn_peer = NULL; + LIBCFS_FREE(lpn, sizeof(*lpn)); + + lnet_peer_decref_locked(lp); +} + +static struct lnet_peer * +lnet_peer_alloc(lnet_nid_t nid) +{ + struct lnet_peer *lp; + + LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lp)); + if (!lp) + return NULL; + + INIT_LIST_HEAD(&lp->lp_peer_list); + INIT_LIST_HEAD(&lp->lp_peer_nets); + INIT_LIST_HEAD(&lp->lp_dc_list); + INIT_LIST_HEAD(&lp->lp_dc_pendq); + init_waitqueue_head(&lp->lp_dc_waitq); + spin_lock_init(&lp->lp_lock); + lp->lp_primary_nid = nid; + lp->lp_disc_src_nid = LNET_NID_ANY; + + /* + * Turn off discovery for loopback peer. If you're creating a peer + * for the loopback interface then that was initiated when we + * attempted to send a message over the loopback. There is no need + * to ever use a different interface when sending messages to + * myself. + */ + if (nid == LNET_NID_LO_0) + lp->lp_state = LNET_PEER_NO_DISCOVERY; + lp->lp_cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); + + return lp; +} + +void +lnet_destroy_peer_locked(struct lnet_peer *lp) +{ + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); + + LASSERT(atomic_read(&lp->lp_refcount) == 0); + LASSERT(list_empty(&lp->lp_peer_nets)); + LASSERT(list_empty(&lp->lp_peer_list)); + LASSERT(list_empty(&lp->lp_dc_list)); + + if (lp->lp_data) + lnet_ping_buffer_decref(lp->lp_data); + + /* + * if there are messages still on the pending queue, then make + * sure to queue them on the ln_msg_resend list so they can be + * resent at a later point if the discovery thread is still + * running. 
+ * If the discovery thread has stopped, then the wakeup will be a + * no-op, and it is expected the lnet_shutdown_lndnets() will + * eventually be called, which will traverse this list and + * finalize the messages on the list. + * We can not resend them now because we're holding the cpt lock. + * Releasing the lock can cause an inconsistent state + */ + spin_lock(&the_lnet.ln_msg_resend_lock); + spin_lock(&lp->lp_lock); + list_splice(&lp->lp_dc_pendq, &the_lnet.ln_msg_resend); + spin_unlock(&lp->lp_lock); + spin_unlock(&the_lnet.ln_msg_resend_lock); + wake_up(&the_lnet.ln_dc_waitq); + + LIBCFS_FREE(lp, sizeof(*lp)); +} + +/* + * Detach a peer_ni from its peer_net. If this was the last peer_ni on + * that peer_net, detach the peer_net from the peer. + * + * Call with lnet_net_lock/EX held + */ +static void +lnet_peer_detach_peer_ni_locked(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + struct lnet_peer *lp; + + /* + * Belts and suspenders: gracefully handle teardown of a + * partially connected peer_ni. + */ + lpn = lpni->lpni_peer_net; + + list_del_init(&lpni->lpni_peer_nis); + /* + * If there are no lpni's left, we detach lpn from + * lp_peer_nets, so it cannot be found anymore. + */ + if (list_empty(&lpn->lpn_peer_nis)) + list_del_init(&lpn->lpn_peer_nets); + + /* Update peer NID count. */ + lp = lpn->lpn_peer; + lp->lp_nnis--; + + /* + * If there are no more peer nets, make the peer unfindable + * via the peer_tables. + * + * Otherwise, if the peer is DISCOVERED, tell discovery to + * take another look at it. This is a no-op if discovery for + * this peer did the detaching. + */ + if (list_empty(&lp->lp_peer_nets)) { + list_del_init(&lp->lp_peer_list); + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + ptable->pt_peers--; + } else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + /* Discovery isn't running, nothing to do here. */ + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + lnet_peer_queue_for_discovery(lp); + wake_up(&the_lnet.ln_dc_waitq); + } + CDEBUG(D_NET, "peer %s NID %s\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(lpni->lpni_nid)); +} + +/* called with lnet_net_lock LNET_LOCK_EX held */ +static int +lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_table *ptable = NULL; + + /* don't remove a peer_ni if it's also a gateway */ + if (lpni->lpni_rtr_refcount > 0) { + CERROR("Peer NI %s is a gateway. Can not delete it\n", + libcfs_nid2str(lpni->lpni_nid)); + return -EBUSY; + } + + lnet_peer_remove_from_remote_list(lpni); + + /* remove peer ni from the hash list. */ + list_del_init(&lpni->lpni_hashlist); + + /* + * indicate the peer is being deleted so the monitor thread can + * remove it from the recovery queue. + */ + spin_lock(&lpni->lpni_lock); + lpni->lpni_state |= LNET_PEER_NI_DELETING; + spin_unlock(&lpni->lpni_lock); + + /* decrement the ref count on the peer table */ + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + LASSERT(ptable->pt_number > 0); + ptable->pt_number--; + + /* + * The peer_ni can no longer be found with a lookup. But there + * can be current users, so keep track of it on the zombie + * list until the reference count has gone to zero. + * + * The last reference may be lost in a place where the + * lnet_net_lock locks only a single cpt, and that cpt may not + * be lpni->lpni_cpt. So the zombie list of lnet_peer_table + * has its own lock. 
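+ *
+ * lnet_peer_ni_finalize_wait() below polls ptable->pt_zombies under
+ * this same pt_zombie_lock until the count drains to zero, so the
+ * counter and the list insertion must be updated together under the
+ * lock, as done here.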
+ */ + spin_lock(&ptable->pt_zombie_lock); + list_add(&lpni->lpni_hashlist, &ptable->pt_zombie_list); + ptable->pt_zombies++; + spin_unlock(&ptable->pt_zombie_lock); + + /* no need to keep this peer_ni on the hierarchy anymore */ + lnet_peer_detach_peer_ni_locked(lpni); + + /* remove hashlist reference on peer_ni */ + lnet_peer_ni_decref_locked(lpni); + + return 0; +} + +void lnet_peer_uninit(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + /* remove all peer_nis from the remote peer and the hash list */ + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, + lpni_on_remote_peer_ni_list) + lnet_peer_ni_del_locked(lpni); + + lnet_peer_tables_destroy(); + + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_peer_del_locked(struct lnet_peer *peer) +{ + struct lnet_peer_ni *lpni = NULL, *lpni2; + int rc = 0, rc2 = 0; + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(peer->lp_primary_nid)); + + lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); + while (lpni != NULL) { + lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); + rc = lnet_peer_ni_del_locked(lpni); + if (rc != 0) + rc2 = rc; + lpni = lpni2; + } + + return rc2; +} + +static int +lnet_peer_del(struct lnet_peer *peer) +{ + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_del_locked(peer); + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +/* + * Delete a NID from a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC deletion from DLC-configured peer. + * -ENOENT: No lnet_peer_ni corresponding to the nid. + * -ECHILD: The lnet_peer_ni isn't connected to the peer. + * -EBUSY: The lnet_peer_ni is the primary, and not the only peer_ni. + */ +static int +lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = lp->lp_primary_nid; + int rc = 0; + + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = -ENOENT; + goto out; + } + lnet_peer_ni_decref_locked(lpni); + if (lp != lpni->lpni_peer_net->lpn_peer) { + rc = -ECHILD; + goto out; + } + + /* + * This function only allows deletion of the primary NID if it + * is the only NID. + */ + if (nid == lp->lp_primary_nid && lp->lp_nnis != 1) { + rc = -EBUSY; + goto out; + } + + lnet_net_lock(LNET_LOCK_EX); + + rc = lnet_peer_ni_del_locked(lpni); + + lnet_net_unlock(LNET_LOCK_EX); + +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nid2str(primary_nid), libcfs_nid2str(nid), flags, rc); + + return rc; +} + +static void +lnet_peer_table_cleanup_locked(struct lnet_net *net, + struct lnet_peer_table *ptable) +{ + int i; + struct lnet_peer_ni *next; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + + for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { + list_for_each_entry_safe(lpni, next, &ptable->pt_hash[i], + lpni_hashlist) { + if (net != NULL && net != lpni->lpni_net) + continue; + + peer = lpni->lpni_peer_net->lpn_peer; + if (peer->lp_primary_nid != lpni->lpni_nid) { + lnet_peer_ni_del_locked(lpni); + continue; + } + /* + * Removing the primary NID implies removing + * the entire peer. Advance next beyond any + * peer_ni that belongs to the same peer. 
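+ *
+ * For example, if the hash chain holds A0, A1, B0 (where A0 is
+ * peer A's primary NID and A1 a second NID of the same peer),
+ * deleting peer A frees both A0 and A1; 'next', which currently
+ * points at A1, must first be advanced to B0 so the _safe
+ * iteration does not step onto freed memory.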
+ */ + list_for_each_entry_from(next, &ptable->pt_hash[i], + lpni_hashlist) { + if (next->lpni_peer_net->lpn_peer != peer) + break; + } + lnet_peer_del_locked(peer); + } + } +} + +static void +lnet_peer_ni_finalize_wait(struct lnet_peer_table *ptable) +{ + int i = 3; + + spin_lock(&ptable->pt_zombie_lock); + while (ptable->pt_zombies) { + spin_unlock(&ptable->pt_zombie_lock); + + if (is_power_of_2(i)) { + CDEBUG(D_WARNING, + "Waiting for %d zombies on peer table\n", + ptable->pt_zombies); + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) >> 1); + spin_lock(&ptable->pt_zombie_lock); + } + spin_unlock(&ptable->pt_zombie_lock); +} + +static void +lnet_peer_table_del_rtrs_locked(struct lnet_net *net, + struct lnet_peer_table *ptable) +{ + struct lnet_peer_ni *lp; + struct lnet_peer_ni *tmp; + lnet_nid_t lpni_nid; + int i; + + for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { + list_for_each_entry_safe(lp, tmp, &ptable->pt_hash[i], + lpni_hashlist) { + if (net != lp->lpni_net) + continue; + + if (lp->lpni_rtr_refcount == 0) + continue; + + lpni_nid = lp->lpni_nid; + + lnet_net_unlock(LNET_LOCK_EX); + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), lpni_nid); + lnet_net_lock(LNET_LOCK_EX); + } + } +} + +void +lnet_peer_tables_cleanup(struct lnet_net *net) +{ + int i; + struct lnet_peer_table *ptable; + + LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL); + /* If just deleting the peers for a NI, get rid of any routes these + * peers are gateways for. */ + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_table_del_rtrs_locked(net, ptable); + lnet_net_unlock(LNET_LOCK_EX); + } + + /* Start the cleanup process */ + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_table_cleanup_locked(net, ptable); + lnet_net_unlock(LNET_LOCK_EX); + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) + lnet_peer_ni_finalize_wait(ptable); +} + +static struct lnet_peer_ni * +lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) +{ + struct list_head *peers; + struct lnet_peer_ni *lp; + + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + + peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; + list_for_each_entry(lp, peers, lpni_hashlist) { + if (lp->lpni_nid == nid) { + lnet_peer_ni_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_find_peer_ni_locked(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_table *ptable; + int cpt; + + cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + ptable = the_lnet.ln_peer_tables[cpt]; + lpni = lnet_get_peer_ni_locked(ptable, nid); + + return lpni; +} + +struct lnet_peer * +lnet_find_peer(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *lp = NULL; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_addref_locked(lp); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(cpt); + + return lp; +} + +struct lnet_peer_ni * +lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_net *net = peer_net; + + if (!prev) { + if (!net) { + if (list_empty(&peer->lp_peer_nets)) + return NULL; + + net = list_entry(peer->lp_peer_nets.next, + struct lnet_peer_net, + lpn_peer_nets); + } + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + 
lpni_peer_nis); + + return lpni; + } + + if (prev->lpni_peer_nis.next == &prev->lpni_peer_net->lpn_peer_nis) { + /* + * if you reached the end of the peer ni list and the peer + * net is specified then there are no more peer nis in that + * net. + */ + if (net) + return NULL; + + /* + * we reached the end of this net ni list. move to the + * next net + */ + if (prev->lpni_peer_net->lpn_peer_nets.next == + &peer->lp_peer_nets) + /* no more nets and no more NIs. */ + return NULL; + + /* get the next net */ + net = list_entry(prev->lpni_peer_net->lpn_peer_nets.next, + struct lnet_peer_net, + lpn_peer_nets); + /* get the ni on it */ + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_peer_nis); + + return lpni; + } + + /* there are more nis left */ + lpni = list_entry(prev->lpni_peer_nis.next, + struct lnet_peer_ni, lpni_peer_nis); + + return lpni; +} + +/* Call with the ln_api_mutex held */ +int lnet_get_peer_list(u32 *countp, u32 *sizep, struct lnet_process_id __user *ids) +{ + struct lnet_process_id id; + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + __u32 count = 0; + __u32 size = 0; + int lncpt; + int cpt; + __u32 i; + int rc; + + rc = -ESHUTDOWN; + if (the_lnet.ln_state != LNET_STATE_RUNNING) + goto done; + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Count the number of peers, and return E2BIG if the buffer + * is too small. We'll also return the desired size. + */ + rc = -E2BIG; + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + count += ptable->pt_peers; + } + size = count * sizeof(*ids); + if (size > *sizep) + goto done; + + /* + * Walk the peer lists and copy out the primary nids. + * This is safe because the peer lists are only modified + * while the ln_api_mutex is held. So we don't need to + * hold the lnet_net_lock as well, and can therefore + * directly call copy_to_user(). + */ + rc = -EFAULT; + memset(&id, 0, sizeof(id)); + id.pid = LNET_PID_LUSTRE; + i = 0; + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + if (i >= count) + goto done; + id.nid = lp->lp_primary_nid; + if (copy_to_user(&ids[i], &id, sizeof(id))) + goto done; + i++; + } + } + rc = 0; +done: + *countp = count; + *sizep = size; + return rc; +} + +/* + * Start pushes to peers that need to be updated for a configuration + * change on this node. + */ +void +lnet_push_update_to_peers(int force) +{ + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + int lncpt; + int cpt; + + lnet_net_lock(LNET_LOCK_EX); + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + if (force) { + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + lp->lp_state |= LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + } + if (lnet_peer_needs_push(lp)) + lnet_peer_queue_for_discovery(lp); + } + } + lnet_net_unlock(LNET_LOCK_EX); + wake_up(&the_lnet.ln_dc_waitq); +} + +/* + * Test whether a ni is a preferred ni for this peer_ni, e.g, whether + * this is a preferred point-to-point path. Call with lnet_net_lock in + * shared mmode. 
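+ *
+ * A minimal caller sketch (the local NI and its ni_nid field are
+ * illustrative here; real callers sit in the NI selection path):
+ *
+ *	cpt = lnet_net_lock_current();
+ *	if (lnet_peer_is_pref_nid_locked(lpni, ni->ni_nid))
+ *		... prefer sending from this NI ...
+ *	lnet_net_unlock(cpt);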
+ */ +bool +lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid) +{ + int i; + + if (lpni->lpni_pref_nnids == 0) + return false; + if (lpni->lpni_pref_nnids == 1) + return lpni->lpni_pref.nid == nid; + for (i = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref.nids[i] == nid) + return true; + } + return false; +} + +/* + * Set a single ni as preferred, provided no preferred ni is already + * defined. Only to be used for non-multi-rail peer_ni. + */ +int +lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +{ + int rc = 0; + + spin_lock(&lpni->lpni_lock); + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + } else if (lpni->lpni_pref_nnids > 0) { + rc = -EPERM; + } else if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = nid; + lpni->lpni_pref_nnids = 1; + lpni->lpni_state |= LNET_PEER_NI_NON_MR_PREF; + } + spin_unlock(&lpni->lpni_lock); + + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lpni->lpni_nid), libcfs_nid2str(nid), rc); + return rc; +} + +/* + * Clear the preferred NID from a non-multi-rail peer_ni, provided + * this preference was set by lnet_peer_ni_set_non_mr_pref_nid(). + */ +int +lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) +{ + int rc = 0; + + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF) { + lpni->lpni_pref_nnids = 0; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + } else if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + } else { + rc = -EPERM; + } + spin_unlock(&lpni->lpni_lock); + + CDEBUG(D_NET, "peer %s: %d\n", + libcfs_nid2str(lpni->lpni_nid), rc); + return rc; +} + +/* + * Clear the preferred NIDs from a non-multi-rail peer. + */ +void +lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp) +{ + struct lnet_peer_ni *lpni = NULL; + + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + lnet_peer_ni_clr_non_mr_pref_nid(lpni); +} + +int +lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +{ + lnet_nid_t *nids = NULL; + lnet_nid_t *oldnids = NULL; + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + int size; + int i; + int rc = 0; + + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + goto out; + } + + if (lpni->lpni_pref_nnids == 1 && lpni->lpni_pref.nid == nid) { + rc = -EEXIST; + goto out; + } + + /* A non-MR node may have only one preferred NI per peer_ni */ + if (lpni->lpni_pref_nnids > 0) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -EPERM; + goto out; + } + } + + if (lpni->lpni_pref_nnids != 0) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); + LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); + if (!nids) { + rc = -ENOMEM; + goto out; + } + for (i = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref.nids[i] == nid) { + LIBCFS_FREE(nids, size); + rc = -EEXIST; + goto out; + } + nids[i] = lpni->lpni_pref.nids[i]; + } + nids[i] = nid; + } + + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = nid; + } else { + oldnids = lpni->lpni_pref.nids; + lpni->lpni_pref.nids = nids; + } + lpni->lpni_pref_nnids++; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); + + if (oldnids) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); + LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); + } +out: + if (rc == -EEXIST && (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF)) { + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + 
spin_unlock(&lpni->lpni_lock);
+	}
+	CDEBUG(D_NET, "peer %s nid %s: %d\n",
+	       libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc);
+	return rc;
+}
+
+int
+lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
+{
+	lnet_nid_t *nids = NULL;
+	lnet_nid_t *oldnids = NULL;
+	struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer;
+	int size;
+	int i, j;
+	int rc = 0;
+
+	if (lpni->lpni_pref_nnids == 0) {
+		rc = -ENOENT;
+		goto out;
+	}
+
+	if (lpni->lpni_pref_nnids == 1) {
+		if (lpni->lpni_pref.nid != nid) {
+			rc = -ENOENT;
+			goto out;
+		}
+	} else if (lpni->lpni_pref_nnids == 2) {
+		if (lpni->lpni_pref.nids[0] != nid &&
+		    lpni->lpni_pref.nids[1] != nid) {
+			rc = -ENOENT;
+			goto out;
+		}
+	} else {
+		size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1);
+		LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size);
+		if (!nids) {
+			rc = -ENOMEM;
+			goto out;
+		}
+		/* Copy every NID except the one being removed. */
+		for (i = 0, j = 0; i < lpni->lpni_pref_nnids; i++) {
+			if (lpni->lpni_pref.nids[i] == nid)
+				continue;
+			nids[j++] = lpni->lpni_pref.nids[i];
+		}
+		/* Check if we actually removed a nid. */
+		if (j == lpni->lpni_pref_nnids) {
+			LIBCFS_FREE(nids, size);
+			rc = -ENOENT;
+			goto out;
+		}
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	spin_lock(&lpni->lpni_lock);
+	if (lpni->lpni_pref_nnids == 1) {
+		lpni->lpni_pref.nid = LNET_NID_ANY;
+	} else if (lpni->lpni_pref_nnids == 2) {
+		oldnids = lpni->lpni_pref.nids;
+		if (oldnids[0] == nid)
+			lpni->lpni_pref.nid = oldnids[1];
+		else
+			lpni->lpni_pref.nid = oldnids[0];
+	} else {
+		oldnids = lpni->lpni_pref.nids;
+		lpni->lpni_pref.nids = nids;
+	}
+	lpni->lpni_pref_nnids--;
+	lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF;
+	spin_unlock(&lpni->lpni_lock);
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (oldnids) {
+		size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1);
+		LIBCFS_FREE(oldnids, sizeof(*oldnids) * size);
+	}
+out:
+	CDEBUG(D_NET, "peer %s nid %s: %d\n",
+	       libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc);
+	return rc;
+}
+
+lnet_nid_t
+lnet_peer_primary_nid_locked(lnet_nid_t nid)
+{
+	struct lnet_peer_ni *lpni;
+	lnet_nid_t primary_nid = nid;
+
+	lpni = lnet_find_peer_ni_locked(nid);
+	if (lpni) {
+		primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid;
+		lnet_peer_ni_decref_locked(lpni);
+	}
+
+	return primary_nid;
+}
+
+bool
+lnet_is_discovery_disabled_locked(struct lnet_peer *lp)
+{
+	if (lnet_peer_discovery_disabled)
+		return true;
+
+	if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) ||
+	    (lp->lp_state & LNET_PEER_NO_DISCOVERY)) {
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Peer Discovery
+ */
+bool
+lnet_is_discovery_disabled(struct lnet_peer *lp)
+{
+	bool rc = false;
+
+	spin_lock(&lp->lp_lock);
+	rc = lnet_is_discovery_disabled_locked(lp);
+	spin_unlock(&lp->lp_lock);
+
+	return rc;
+}
+
+lnet_nid_t
+LNetPrimaryNID(lnet_nid_t nid)
+{
+	struct lnet_peer *lp;
+	struct lnet_peer_ni *lpni;
+	lnet_nid_t primary_nid = nid;
+	int rc = 0;
+	int cpt;
+
+	if (nid == LNET_NID_LO_0)
+		return LNET_NID_LO_0;
+
+	cpt = lnet_net_lock_current();
+	lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt);
+	if (IS_ERR(lpni)) {
+		rc = PTR_ERR(lpni);
+		goto out_unlock;
+	}
+	lp = lpni->lpni_peer_net->lpn_peer;
+
+	while (!lnet_peer_is_uptodate(lp)) {
+		spin_lock(&lp->lp_lock);
+		/* force a full discovery cycle */
+		lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH;
+		spin_unlock(&lp->lp_lock);
+
+		rc = lnet_discover_peer_locked(lpni, cpt, true);
+		if (rc)
+			goto out_decref;
+		lp = lpni->lpni_peer_net->lpn_peer;
+
+		/* Only try once if discovery is disabled */
+		if
(lnet_is_discovery_disabled(lp)) + break; + } + primary_nid = lp->lp_primary_nid; +out_decref: + lnet_peer_ni_decref_locked(lpni); +out_unlock: + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), + libcfs_nid2str(primary_nid), rc); + return primary_nid; +} +EXPORT_SYMBOL(LNetPrimaryNID); + +struct lnet_peer_net * +lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + return NULL; +} + +/* + * Attach a peer_ni to a peer_net and peer. This function assumes + * peer_ni is not already attached to the peer_net/peer. The peer_ni + * may be attached to a different peer, in which case it will be + * properly detached first. The whole operation is done atomically. + * + * Always returns 0. This is the last function called from functions + * that do return an int, so returning 0 here allows the compiler to + * do a tail call. + */ +static int +lnet_peer_attach_peer_ni(struct lnet_peer *lp, + struct lnet_peer_net *lpn, + struct lnet_peer_ni *lpni, + unsigned flags) +{ + struct lnet_peer_table *ptable; + + /* Install the new peer_ni */ + lnet_net_lock(LNET_LOCK_EX); + /* Add peer_ni to global peer table hash, if necessary. */ + if (list_empty(&lpni->lpni_hashlist)) { + int hash = lnet_nid2peerhash(lpni->lpni_nid); + + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]); + ptable->pt_version++; + ptable->pt_number++; + /* This is the 1st refcount on lpni. */ + atomic_inc(&lpni->lpni_refcount); + } + + /* Detach the peer_ni from an existing peer, if necessary. */ + if (lpni->lpni_peer_net) { + LASSERT(lpni->lpni_peer_net != lpn); + LASSERT(lpni->lpni_peer_net->lpn_peer != lp); + lnet_peer_detach_peer_ni_locked(lpni); + lnet_peer_net_decref_locked(lpni->lpni_peer_net); + lpni->lpni_peer_net = NULL; + } + + /* Add peer_ni to peer_net */ + lpni->lpni_peer_net = lpn; + list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + lnet_peer_net_addref_locked(lpn); + + /* Add peer_net to peer */ + if (!lpn->lpn_peer) { + lpn->lpn_peer = lp; + list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + lnet_peer_addref_locked(lp); + } + + /* Add peer to global peer list, if necessary */ + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + if (list_empty(&lp->lp_peer_list)) { + list_add_tail(&lp->lp_peer_list, &ptable->pt_peer_list); + ptable->pt_peers++; + } + + + /* Update peer state */ + spin_lock(&lp->lp_lock); + if (flags & LNET_PEER_CONFIGURED) { + if (!(lp->lp_state & LNET_PEER_CONFIGURED)) + lp->lp_state |= LNET_PEER_CONFIGURED; + } + if (flags & LNET_PEER_MULTI_RAIL) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } + spin_unlock(&lp->lp_lock); + + lp->lp_nnis++; + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "peer %s NID %s flags %#x\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(lpni->lpni_nid), flags); + + return 0; +} + +/* + * Create a new peer, with nid as its primary nid. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_add(lnet_nid_t nid, unsigned flags) +{ + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int rc = 0; + + LASSERT(nid != LNET_NID_ANY); + + /* + * No need for the lnet_net_lock here, because the + * lnet_api_mutex is held. 
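+ *
+ * For example, a peer_ni created implicitly by traffic (via
+ * lnet_peer_ni_traffic_add()) is not CONFIGURED; when the same NID
+ * is later added explicitly (e.g. through "lnetctl peer add"), the
+ * lookup below finds it, the implicit peer is deleted, and a
+ * DLC-configured peer is created in its place.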
+ */ + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + /* A peer with this NID already exists. */ + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(lpni); + /* + * This is an error if the peer was configured and the + * primary NID differs or an attempt is made to change + * the Multi-Rail flag. Otherwise the assumption is + * that an existing peer is being modified. + */ + if (lp->lp_state & LNET_PEER_CONFIGURED) { + if (lp->lp_primary_nid != nid) + rc = -EEXIST; + else if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) + rc = -EPERM; + goto out; + } + /* Delete and recreate as a configured peer. */ + lnet_peer_del(lp); + } + + /* Create peer, peer_net, and peer_ni. */ + rc = -ENOMEM; + lp = lnet_peer_alloc(nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_free_lpn; + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s NID flags %#x: %d\n", + libcfs_nid2str(nid), flags, rc); + return rc; +} + +/* + * Add a NID to a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC addition to a DLC-configured peer. + * -EEXIST: The NID was configured by DLC for a different peer. + * -ENOMEM: Out of memory. + * -ENOTUNIQ: Adding a second peer NID on a single network on a + * non-multi-rail peer. + */ +static int +lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int rc = 0; + + LASSERT(lp); + LASSERT(nid != LNET_NID_ANY); + + /* A configured peer can only be updated through configuration. */ + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + + /* + * The MULTI_RAIL flag can be set but not cleared, because + * that would leave the peer struct in an invalid state. + */ + if (flags & LNET_PEER_MULTI_RAIL) { + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + spin_unlock(&lp->lp_lock); + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + rc = -EPERM; + goto out; + } + + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + /* + * A peer_ni already exists. This is only a problem if + * it is not connected to this peer and was configured + * by DLC. + */ + lnet_peer_ni_decref_locked(lpni); + if (lpni->lpni_peer_net->lpn_peer == lp) + goto out; + if (lnet_peer_ni_is_configured(lpni)) { + rc = -EEXIST; + goto out; + } + /* If this is the primary NID, destroy the peer. */ + if (lnet_peer_ni_is_primary(lpni)) { + lnet_peer_del(lpni->lpni_peer_net->lpn_peer); + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) { + rc = -ENOMEM; + goto out; + } + } + } else { + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) { + rc = -ENOMEM; + goto out; + } + } + + /* + * Get the peer_net. Check that we're not adding a second + * peer_ni on a peer_net of a non-multi-rail peer. 
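+ *
+ * For example (NIDs purely illustrative): if a non-multi-rail peer
+ * was configured with primary NID 10.0.0.1@tcp, an attempt to add
+ * 10.0.0.2@tcp to it finds the existing tcp peer_net below and
+ * fails with -ENOTUNIQ, whereas on a multi-rail peer the second
+ * NID would simply be attached to that peer_net.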
+ */ + lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid)); + if (!lpn) { + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) { + rc = -ENOMEM; + goto out_free_lpni; + } + } else if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -ENOTUNIQ; + goto out_free_lpni; + } + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpni: + /* If the peer_ni was allocated above its peer_net pointer is NULL */ + if (!lpni->lpni_peer_net) + LIBCFS_FREE(lpni, sizeof(*lpni)); +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), + flags, rc); + return rc; +} + +/* + * Update the primary NID of a peer, if possible. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) +{ + lnet_nid_t old = lp->lp_primary_nid; + int rc = 0; + + if (lp->lp_primary_nid == nid) + goto out; + rc = lnet_peer_add_nid(lp, nid, flags); + if (rc) + goto out; + lp->lp_primary_nid = nid; +out: + CDEBUG(D_NET, "peer %s NID %s: %d\n", + libcfs_nid2str(old), libcfs_nid2str(nid), rc); + return rc; +} + +/* + * lpni creation initiated due to traffic either sending or receiving. + */ +static int +lnet_peer_ni_traffic_add(lnet_nid_t nid, lnet_nid_t pref) +{ + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + unsigned flags = 0; + int rc = 0; + + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + goto out; + } + + /* lnet_net_lock is not needed here because ln_api_lock is held */ + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + /* + * We must have raced with another thread. Since we + * know next to nothing about a peer_ni created by + * traffic, we just assume everything is ok and + * return. + */ + lnet_peer_ni_decref_locked(lpni); + goto out; + } + + /* Create peer, peer_net, and peer_ni. */ + rc = -ENOMEM; + lp = lnet_peer_alloc(nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_free_lpn; + if (pref != LNET_NID_ANY) + lnet_peer_ni_set_non_mr_pref_nid(lpni, pref); + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(nid), rc); + return rc; +} + +/* + * Implementation of IOC_LIBCFS_ADD_PEER_NI. + * + * This API handles the following combinations: + * Create a peer with its primary NI if only the prim_nid is provided + * Add a NID to a peer identified by the prim_nid. The peer identified + * by the prim_nid must already exist. + * The peer being created may be non-MR. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being created/modified/deleted by a different thread. + */ +int +lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr) +{ + struct lnet_peer *lp = NULL; + struct lnet_peer_ni *lpni; + unsigned flags; + + /* The prim_nid must always be specified */ + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + flags = LNET_PEER_CONFIGURED; + if (mr) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * If nid isn't specified, we must create a new peer with + * prim_nid as its primary nid. + */ + if (nid == LNET_NID_ANY) + return lnet_peer_add(prim_nid, flags); + + /* Look up the prim_nid, which must exist. 
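+ *
+ * A typical configuration sequence (sketch only, NIDs illustrative,
+ * ln_api_mutex held by the ioctl path):
+ *
+ *	rc = lnet_add_peer_ni(libcfs_str2nid("10.0.0.1@tcp"),
+ *			      LNET_NID_ANY, true);
+ *	if (!rc)
+ *		rc = lnet_add_peer_ni(libcfs_str2nid("10.0.0.1@tcp"),
+ *				      libcfs_str2nid("10.0.1.1@o2ib"),
+ *				      true);
+ *
+ * The first call creates the multi-rail peer with its primary NID;
+ * the second attaches an additional NID to it and lands here.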
*/ + lpni = lnet_find_peer_ni_locked(prim_nid); + if (!lpni) + return -ENOENT; + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; + + /* Peer must have been configured. */ + if (!(lp->lp_state & LNET_PEER_CONFIGURED)) { + CDEBUG(D_NET, "peer %s was not configured\n", + libcfs_nid2str(prim_nid)); + return -ENOENT; + } + + /* Primary NID must match */ + if (lp->lp_primary_nid != prim_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nid2str(lp->lp_primary_nid)); + return -ENODEV; + } + + /* Multi-Rail flag must match. */ + if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "multi-rail state mismatch for peer %s\n", + libcfs_nid2str(prim_nid)); + return -EPERM; + } + + return lnet_peer_add_nid(lp, nid, flags); +} + +/* + * Implementation of IOC_LIBCFS_DEL_PEER_NI. + * + * This API handles the following combinations: + * Delete a NI from a peer if both prim_nid and nid are provided. + * Delete a peer if only prim_nid is provided. + * Delete a peer if its primary nid is provided. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being modified/deleted by a different thread. + */ +int +lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) +{ + struct lnet_peer *lp; + struct lnet_peer_ni *lpni; + unsigned flags; + + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + lpni = lnet_find_peer_ni_locked(prim_nid); + if (!lpni) + return -ENOENT; + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; + + if (prim_nid != lp->lp_primary_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nid2str(lp->lp_primary_nid)); + return -ENODEV; + } + + if (nid == LNET_NID_ANY || nid == lp->lp_primary_nid) + return lnet_peer_del(lp); + + flags = LNET_PEER_CONFIGURED; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + return lnet_peer_del_nid(lp, nid, flags); +} + +void +lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + + CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); + + LASSERT(atomic_read(&lpni->lpni_refcount) == 0); + LASSERT(lpni->lpni_rtr_refcount == 0); + LASSERT(list_empty(&lpni->lpni_txq)); + LASSERT(lpni->lpni_txqnob == 0); + LASSERT(list_empty(&lpni->lpni_peer_nis)); + LASSERT(list_empty(&lpni->lpni_on_remote_peer_ni_list)); + + lpn = lpni->lpni_peer_net; + lpni->lpni_peer_net = NULL; + lpni->lpni_net = NULL; + + /* remove the peer ni from the zombie list */ + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + spin_lock(&ptable->pt_zombie_lock); + list_del_init(&lpni->lpni_hashlist); + ptable->pt_zombies--; + spin_unlock(&ptable->pt_zombie_lock); + + if (lpni->lpni_pref_nnids > 1) { + LIBCFS_FREE(lpni->lpni_pref.nids, + sizeof(*lpni->lpni_pref.nids) * lpni->lpni_pref_nnids); + } + LIBCFS_FREE(lpni, sizeof(*lpni)); + + lnet_peer_net_decref_locked(lpn); +} + +struct lnet_peer_ni * +lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) +{ + struct lnet_peer_ni *lpni = NULL; + int rc; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return ERR_PTR(-ESHUTDOWN); + + /* + * find if a peer_ni already exists. + * If so then just return that. 
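+ *
+ * Note that failures are reported as ERR_PTR() values, so callers
+ * are expected to check the result along the lines of:
+ *
+ *	lpni = lnet_nid2peerni_ex(nid, cpt);
+ *	if (IS_ERR(lpni))
+ *		return PTR_ERR(lpni);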
+ */ + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) + return lpni; + + lnet_net_unlock(cpt); + + rc = lnet_peer_ni_traffic_add(nid, LNET_NID_ANY); + if (rc) { + lpni = ERR_PTR(rc); + goto out_net_relock; + } + + lpni = lnet_find_peer_ni_locked(nid); + LASSERT(lpni); + +out_net_relock: + lnet_net_lock(cpt); + + return lpni; +} + +/* + * Get a peer_ni for the given nid, create it if necessary. Takes a + * hold on the peer_ni. + */ +struct lnet_peer_ni * +lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, int cpt) +{ + struct lnet_peer_ni *lpni = NULL; + int rc; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return ERR_PTR(-ESHUTDOWN); + + /* + * find if a peer_ni already exists. + * If so then just return that. + */ + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) + return lpni; + + /* + * Slow path: + * use the lnet_api_mutex to serialize the creation of the peer_ni + * and the creation/deletion of the local ni/net. When a local ni is + * created, if there exists a set of peer_nis on that network, + * they need to be traversed and updated. When a local NI is + * deleted, which could result in a network being deleted, then + * all peer nis on that network need to be removed as well. + * + * Creation through traffic should also be serialized with + * creation through DLC. + */ + lnet_net_unlock(cpt); + mutex_lock(&the_lnet.ln_api_mutex); + /* + * Shutdown is only set under the ln_api_lock, so a single + * check here is sufficent. + */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lpni = ERR_PTR(-ESHUTDOWN); + goto out_mutex_unlock; + } + + rc = lnet_peer_ni_traffic_add(nid, pref); + if (rc) { + lpni = ERR_PTR(rc); + goto out_mutex_unlock; + } + + lpni = lnet_find_peer_ni_locked(nid); + LASSERT(lpni); + +out_mutex_unlock: + mutex_unlock(&the_lnet.ln_api_mutex); + lnet_net_lock(cpt); + + /* Lock has been dropped, check again for shutdown. */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + if (!IS_ERR(lpni)) + lnet_peer_ni_decref_locked(lpni); + lpni = ERR_PTR(-ESHUTDOWN); + } + + return lpni; +} + +bool +lnet_peer_is_uptodate(struct lnet_peer *lp) +{ + bool rc; + + spin_lock(&lp->lp_lock); + rc = lnet_peer_is_uptodate_locked(lp); + spin_unlock(&lp->lp_lock); + return rc; +} + +/* + * Is a peer uptodate from the point of view of discovery? + * + * If it is currently being processed, obviously not. + * A forced Ping or Push is also handled by the discovery thread. + * + * Otherwise look at whether the peer needs rediscovering. + */ +bool +lnet_peer_is_uptodate_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + bool rc; + + if (lp->lp_state & (LNET_PEER_DISCOVERING | + LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_REDISCOVER) { + rc = false; + } else if (lnet_peer_needs_push(lp)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) + rc = true; + else + rc = false; + } else { + rc = false; + } + + return rc; +} + +/* + * Queue a peer for the attention of the discovery thread. Call with + * lnet_net_lock/EX held. Returns 0 if the peer was queued, and + * -EALREADY if the peer was already queued. 
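+ *
+ * Callers in this file use the -EALREADY return to detect a peer
+ * that was already queued and put it back on the request queue
+ * explicitly, e.g. (as in the push event handler below):
+ *
+ *	if (!lnet_peer_is_uptodate(lp) && lnet_peer_queue_for_discovery(lp)) {
+ *		list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request);
+ *		wake_up(&the_lnet.ln_dc_waitq);
+ *	}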
+ */ +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp) +{ + int rc; + + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_state |= LNET_PEER_DISCOVERING; + spin_unlock(&lp->lp_lock); + if (list_empty(&lp->lp_dc_list)) { + lnet_peer_addref_locked(lp); + list_add_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + rc = 0; + } else { + rc = -EALREADY; + } + + CDEBUG(D_NET, "Queue peer %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + + return rc; +} + +/* + * Discovery of a peer is complete. Wake all waiters on the peer. + * Call with lnet_net_lock/EX held. + */ +static void lnet_peer_discovery_complete(struct lnet_peer *lp) +{ + struct lnet_msg *msg, *tmp; + int rc = 0; + struct list_head pending_msgs; + + INIT_LIST_HEAD(&pending_msgs); + + CDEBUG(D_NET, "Discovery complete. Dequeue peer %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + + list_del_init(&lp->lp_dc_list); + spin_lock(&lp->lp_lock); + list_splice_init(&lp->lp_dc_pendq, &pending_msgs); + spin_unlock(&lp->lp_lock); + wake_up_all(&lp->lp_dc_waitq); + + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through all pending messages and send them again */ + list_for_each_entry_safe(msg, tmp, &pending_msgs, msg_list) { + list_del_init(&msg->msg_list); + if (lp->lp_dc_error) { + lnet_finalize(msg, lp->lp_dc_error); + continue; + } + + CDEBUG(D_NET, "sending pending message %s to target %s\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target)); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); + if (rc < 0) { + CNETERR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target), rc); + lnet_finalize(msg, rc); + } + } + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); +} + +/* + * Handle inbound push. + * Like any event handler, called with lnet_res_lock/CPT held. + */ +void lnet_peer_push_event(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf = ev->md.user_ptr; + struct lnet_peer *lp; + + /* lnet_find_peer() adds a refcount */ + lp = lnet_find_peer(ev->source.nid); + if (!lp) { + CDEBUG(D_NET, "Push Put from unknown %s (source %s). Ignoring...\n", + libcfs_nid2str(ev->initiator.nid), + libcfs_nid2str(ev->source.nid)); + return; + } + + /* Ensure peer state remains consistent while we modify it. */ + spin_lock(&lp->lp_lock); + + /* + * If some kind of error happened the contents of the message + * cannot be used. Clear the NIDS_UPTODATE and set the + * FORCE_PING flag to trigger a ping. + */ + if (ev->status) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Push Put error %d from %s (source %s)\n", + ev->status, + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(ev->source.nid)); + goto out; + } + + /* + * A push with invalid or corrupted info. Clear the UPTODATE + * flag to trigger a ping. + */ + if (lnet_ping_info_validate(&pbuf->pb_info)) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Corrupted Push from %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + goto out; + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * A non-Multi-Rail peer is not supposed to be capable of + * sending a push. 
+ */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)) { + CERROR("Push from non-Multi-Rail peer %s dropped\n", + libcfs_nid2str(lp->lp_primary_nid)); + goto out; + } + + /* + * The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the push. If the peer + * was configured with DLC then the setting should match what + * DLC put in. + * NB: We verified above that the MR feature bit is set in pi_features + */ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Push says %s is Multi-Rail, DLC says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + + /* + * Check for truncation of the Put message. Clear the + * NIDS_UPTODATE flag and set FORCE_PING to trigger a ping, + * and tell discovery to allocate a bigger buffer. + */ + if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Truncated Push from %s (%d nids)\n", + libcfs_nid2str(lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* always assume new data */ + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + + /* + * If there is data present that hasn't been processed yet, + * we'll replace it if the Put contained newer data and it + * fits. We're racing with a Ping or earlier Push in this + * case. + */ + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + if (LNET_PING_BUFFER_SEQNO(pbuf) > + LNET_PING_BUFFER_SEQNO(lp->lp_data) && + pbuf->pb_info.pi_nnis <= lp->lp_data->pb_nnis) { + memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); + CDEBUG(D_NET, "Ping/Push race from %s: %u vs %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + LNET_PING_BUFFER_SEQNO(lp->lp_data)); + } + goto out; + } + + /* + * Allocate a buffer to copy the data. On a failure we drop + * the Push and set FORCE_PING to force the discovery + * thread to fix the problem by pinging the peer. 
+ */ + lp->lp_data = lnet_ping_buffer_alloc(lp->lp_data_nnis, GFP_ATOMIC); + if (!lp->lp_data) { + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Cannot allocate Push buffer for %s %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + goto out; + } + + /* Success */ + memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); + lp->lp_state |= LNET_PEER_DATA_PRESENT; + CDEBUG(D_NET, "Received Push %s %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + +out: + /* + * Queue the peer for discovery if not done, force it on the request + * queue and wake the discovery thread if the peer was already queued, + * because its status changed. + */ + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + if (!lnet_peer_is_uptodate(lp) && lnet_peer_queue_for_discovery(lp)) { + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + } + /* Drop refcount from lookup */ + lnet_peer_decref_locked(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Clear the discovery error state, unless we're already discovering + * this peer, in which case the error is current. + */ +static void lnet_peer_clear_discovery_error(struct lnet_peer *lp) +{ + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_dc_error = 0; + spin_unlock(&lp->lp_lock); +} + +/* + * Peer discovery slow path. The ln_api_mutex is held on entry, and + * dropped/retaken within this function. An lnet_peer_ni is passed in + * because discovery could tear down an lnet_peer. + */ +int +lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block) +{ + DEFINE_WAIT(wait); + struct lnet_peer *lp; + int rc = 0; + int count = 0; + +again: + lnet_net_unlock(cpt); + lnet_net_lock(LNET_LOCK_EX); + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_clear_discovery_error(lp); + + /* + * We're willing to be interrupted. The lpni can become a + * zombie if we race with DLC, so we must check for that. + */ + for (;;) { + /* Keep lp alive when the lnet_net_lock is unlocked */ + lnet_peer_addref_locked(lp); + prepare_to_wait(&lp->lp_dc_waitq, &wait, TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + break; + /* + * Don't repeat discovery if discovery is disabled. This is + * done to ensure we can use discovery as a standard ping as + * well for backwards compatibility with routers which do not + * have discovery or have discovery disabled + */ + if (lnet_is_discovery_disabled(lp) && count > 0) + break; + if (lp->lp_dc_error) + break; + if (lnet_peer_is_uptodate(lp)) + break; + lnet_peer_queue_for_discovery(lp); + count++; + CDEBUG(D_NET, "Discovery attempt # %d\n", count); + + /* + * If caller requested a non-blocking operation then + * return immediately. Once discovery is complete any + * pending messages that were stopped due to discovery + * will be transmitted. + */ + if (!block) + break; + + lnet_net_unlock(LNET_LOCK_EX); + schedule(); + finish_wait(&lp->lp_dc_waitq, &wait); + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); + /* Peer may have changed */ + lp = lpni->lpni_peer_net->lpn_peer; + } + finish_wait(&lp->lp_dc_waitq, &wait); + + lnet_net_unlock(LNET_LOCK_EX); + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + /* + * The peer may have changed, so re-check and rediscover if that turns + * out to have been the case. 
The reference count on lp ensured that + * even if it was unlinked from lpni the memory could not be recycled. + * Thus the check below is sufficient to determine whether the peer + * changed. If the peer changed, then lp must not be dereferenced. + */ + if (lp != lpni->lpni_peer_net->lpn_peer) + goto again; + + if (signal_pending(current)) + rc = -EINTR; + else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + rc = -ESHUTDOWN; + else if (lp->lp_dc_error) + rc = lp->lp_dc_error; + else if (!block) + CDEBUG(D_NET, "non-blocking discovery\n"); + else if (!lnet_peer_is_uptodate(lp) && !lnet_is_discovery_disabled(lp)) + goto again; + + CDEBUG(D_NET, "peer %s NID %s: %d. %s\n", + (lp ? libcfs_nid2str(lp->lp_primary_nid) : "(none)"), + libcfs_nid2str(lpni->lpni_nid), rc, + (!block) ? "pending discovery" : "discovery complete"); + + return rc; +} + +/* Handle an incoming ack for a push. */ +static void +lnet_discovery_event_ack(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_push_error = ev->status; + if (ev->status) + lp->lp_state |= LNET_PEER_PUSH_FAILED; + else + lp->lp_node_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + spin_unlock(&lp->lp_lock); + + CDEBUG(D_NET, "peer %s ev->status %d\n", + libcfs_nid2str(lp->lp_primary_nid), ev->status); +} + +/* Handle a Reply message. This is the reply to a Ping message. */ +static void +lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + int rc; + + spin_lock(&lp->lp_lock); + + lp->lp_disc_src_nid = ev->target.nid; + + /* + * If some kind of error happened the contents of message + * cannot be used. Set PING_FAILED to trigger a retry. + */ + if (ev->status) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + CDEBUG(D_NET, "Ping Reply error %d from %s (source %s)\n", + ev->status, + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(ev->source.nid)); + goto out; + } + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + /* + * A reply with invalid or corrupted info. Set PING_FAILED to + * trigger a retry. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Corrupted Ping Reply from %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + goto out; + } + + /* The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) && + !lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } else { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the reply. If the peer + * was configured with DLC then the setting should match what + * DLC put in. 
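+ *
+ * In summary, for a reply that advertises LNET_PING_FEAT_MULTI_RAIL:
+ * an already-MR peer is left alone; a DLC-configured non-MR peer
+ * stays non-MR (with a warning); if discovery is disabled locally or
+ * by the peer it also stays non-MR; otherwise MULTI_RAIL is set and
+ * the non-MR preferred NIDs are cleared.  If the reply does not
+ * advertise multi-rail but the peer is marked MR, a DLC-configured
+ * peer keeps the flag (with a warning), while a discovered peer has
+ * it cleared.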
+ */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Reply says %s is Multi-Rail, DLC says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("DLC says %s is Multi-Rail, Reply says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else { + CERROR("Multi-Rail state vanished from %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_MULTI_RAIL; + } + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * Check for truncation of the Reply. Clear PING_SENT and set + * PING_FAILED to trigger a retry. + */ + if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Truncated Reply from %s (%d nids)\n", + libcfs_nid2str(lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* + * Check the sequence numbers in the reply. These are only + * available if the reply came from a Multi-Rail peer. + */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL && + pbuf->pb_info.pi_nnis > 1 && + lp->lp_primary_nid == pbuf->pb_info.pi_ni[1].ns_nid) { + if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) + CDEBUG(D_NET, "peer %s: seq# got %u have %u. peer rebooted?\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + lp->lp_peer_seqno); + + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } + + /* We're happy with the state of the data in the buffer. */ + CDEBUG(D_NET, "peer %s data present %u\n", + libcfs_nid2str(lp->lp_primary_nid), lp->lp_peer_seqno); + if (lp->lp_state & LNET_PEER_DATA_PRESENT) + lnet_ping_buffer_decref(lp->lp_data); + else + lp->lp_state |= LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_addref(pbuf); + lp->lp_data = pbuf; +out: + lp->lp_state &= ~LNET_PEER_PING_SENT; + spin_unlock(&lp->lp_lock); +} + +/* + * Send event handling. Only matters for error cases, where we clean + * up state on the peer and peer_ni that would otherwise be updated in + * the REPLY event handler for a successful Ping, and the ACK event + * handler for a successful Push. 
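+ *
+ * The mapping applied below is, in short:
+ *
+ *	failed LNET_MSG_GET (Ping) send:
+ *		clear LNET_PEER_PING_SENT, set LNET_PEER_PING_FAILED
+ *	failed LNET_MSG_PUT (Push) send:
+ *		clear LNET_PEER_PUSH_SENT, set LNET_PEER_PUSH_FAILED
+ *
+ * and LNET_REDISCOVER_PEER is returned so the event handler requeues
+ * the peer for another discovery attempt.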
+ */
+static int
+lnet_discovery_event_send(struct lnet_peer *lp, struct lnet_event *ev)
+{
+ int rc = 0;
+
+ if (!ev->status)
+ goto out;
+
+ spin_lock(&lp->lp_lock);
+ if (ev->msg_type == LNET_MSG_GET) {
+ lp->lp_state &= ~LNET_PEER_PING_SENT;
+ lp->lp_state |= LNET_PEER_PING_FAILED;
+ lp->lp_ping_error = ev->status;
+ } else { /* ev->msg_type == LNET_MSG_PUT */
+ lp->lp_state &= ~LNET_PEER_PUSH_SENT;
+ lp->lp_state |= LNET_PEER_PUSH_FAILED;
+ lp->lp_push_error = ev->status;
+ }
+ spin_unlock(&lp->lp_lock);
+ rc = LNET_REDISCOVER_PEER;
+out:
+ CDEBUG(D_NET, "%s Send to %s: %d\n",
+ (ev->msg_type == LNET_MSG_GET ? "Ping" : "Push"),
+ libcfs_nid2str(ev->target.nid), rc);
+ return rc;
+}
+
+/*
+ * Unlink event handling. This event is only seen if a call to
+ * LNetMDUnlink() caused the event to be unlinked. If this call was
+ * made after the event was set up in LNetGet() or LNetPut() then we
+ * assume the Ping or Push timed out.
+ */
+static void
+lnet_discovery_event_unlink(struct lnet_peer *lp, struct lnet_event *ev)
+{
+ spin_lock(&lp->lp_lock);
+ /* We've passed through LNetGet() */
+ if (lp->lp_state & LNET_PEER_PING_SENT) {
+ lp->lp_state &= ~LNET_PEER_PING_SENT;
+ lp->lp_state |= LNET_PEER_PING_FAILED;
+ lp->lp_ping_error = -ETIMEDOUT;
+ CDEBUG(D_NET, "Ping Unlink for message to peer %s\n",
+ libcfs_nid2str(lp->lp_primary_nid));
+ }
+ /* We've passed through LNetPut() */
+ if (lp->lp_state & LNET_PEER_PUSH_SENT) {
+ lp->lp_state &= ~LNET_PEER_PUSH_SENT;
+ lp->lp_state |= LNET_PEER_PUSH_FAILED;
+ lp->lp_push_error = -ETIMEDOUT;
+ CDEBUG(D_NET, "Push Unlink for message to peer %s\n",
+ libcfs_nid2str(lp->lp_primary_nid));
+ }
+ spin_unlock(&lp->lp_lock);
+}
+
+/*
+ * Event handler for the discovery EQ.
+ *
+ * Called with lnet_res_lock(cpt) held. The cpt is the
+ * lnet_cpt_of_cookie() of the md handle cookie.
+ */
+static void lnet_discovery_event_handler(struct lnet_event *event)
+{
+ struct lnet_peer *lp = event->md.user_ptr;
+ struct lnet_ping_buffer *pbuf;
+ int rc;
+
+ /* discovery needs to take another look */
+ rc = LNET_REDISCOVER_PEER;
+
+ CDEBUG(D_NET, "Received event: %d\n", event->type);
+
+ switch (event->type) {
+ case LNET_EVENT_ACK:
+ lnet_discovery_event_ack(lp, event);
+ break;
+ case LNET_EVENT_REPLY:
+ lnet_discovery_event_reply(lp, event);
+ break;
+ case LNET_EVENT_SEND:
+ /* Only send failure triggers a retry. */
+ rc = lnet_discovery_event_send(lp, event);
+ break;
+ case LNET_EVENT_UNLINK:
+ /* LNetMDUnlink() was called */
+ lnet_discovery_event_unlink(lp, event);
+ break;
+ default:
+ /* Invalid events. */
+ LBUG();
+ }
+ lnet_net_lock(LNET_LOCK_EX);
+ if (event->unlinked) {
+ pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start);
+ lnet_ping_buffer_decref(pbuf);
+ lnet_peer_decref_locked(lp);
+ }
+
+ /* put peer back at end of request queue, if discovery not already
+ * done */
+ if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp) &&
+ lnet_peer_queue_for_discovery(lp)) {
+ list_move_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request);
+ wake_up(&the_lnet.ln_dc_waitq);
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+
+/*
+ * Build a peer from incoming data.
+ *
+ * The NIDs in the incoming data are supposed to be structured as follows:
+ * - loopback
+ * - primary NID
+ * - other NIDs in same net
+ * - NIDs in second net
+ * - NIDs in third net
+ * - ...
+ * This is due to the way the list of NIDs in the data is created.
+ *
+ * Note that this function will mark the peer uptodate unless an
+ * ENOMEM is encountered. 
All other errors are due to a conflict + * between the DLC configuration and what discovery sees. We treat DLC + * as binding, and therefore set the NIDS_UPTODATE flag to prevent the + * peer from becoming stuck in discovery. + */ +static int lnet_peer_merge_data(struct lnet_peer *lp, + struct lnet_ping_buffer *pbuf) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t *curnis = NULL; + lnet_nid_t *addnis = NULL; + lnet_nid_t *delnis = NULL; + unsigned flags; + int ncurnis; + int naddnis; + int ndelnis; + int nnis = 0; + int i; + int j; + int rc; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + nnis = MAX(lp->lp_nnis, pbuf->pb_info.pi_nnis); + LIBCFS_ALLOC(curnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_ALLOC(addnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_ALLOC(delnis, nnis * sizeof(lnet_nid_t)); + if (!curnis || !addnis || !delnis) { + rc = -ENOMEM; + goto out; + } + ncurnis = 0; + naddnis = 0; + ndelnis = 0; + + /* Construct the list of NIDs present in peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + curnis[ncurnis++] = lpni->lpni_nid; + + /* + * Check for NIDs in pbuf not present in curnis[]. + * The loop starts at 1 to skip the loopback NID. + */ + for (i = 1; i < pbuf->pb_info.pi_nnis; i++) { + for (j = 0; j < ncurnis; j++) + if (pbuf->pb_info.pi_ni[i].ns_nid == curnis[j]) + break; + if (j == ncurnis) + addnis[naddnis++] = pbuf->pb_info.pi_ni[i].ns_nid; + } + /* + * Check for NIDs in curnis[] not present in pbuf. + * The nested loop starts at 1 to skip the loopback NID. + * + * But never add the loopback NID to delnis[]: if it is + * present in curnis[] then this peer is for this node. + */ + for (i = 0; i < ncurnis; i++) { + if (curnis[i] == LNET_NID_LO_0) + continue; + for (j = 1; j < pbuf->pb_info.pi_nnis; j++) + if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) + break; + if (j == pbuf->pb_info.pi_nnis) + delnis[ndelnis++] = curnis[i]; + } + + rc = 0; + if (lnet_is_discovery_disabled(lp)) + goto out; + + for (i = 0; i < naddnis; i++) { + rc = lnet_peer_add_nid(lp, addnis[i], flags); + if (rc) { + CERROR("Error adding NID %s to peer %s: %d\n", + libcfs_nid2str(addnis[i]), + libcfs_nid2str(lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + } + for (i = 0; i < ndelnis; i++) { + rc = lnet_peer_del_nid(lp, delnis[i], flags); + if (rc) { + CERROR("Error deleting NID %s from peer %s: %d\n", + libcfs_nid2str(delnis[i]), + libcfs_nid2str(lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + } + /* + * Errors other than -ENOMEM are due to peers having been + * configured with DLC. Ignore these because DLC overrides + * Discovery. + */ + rc = 0; +out: + LIBCFS_FREE(curnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_FREE(addnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_FREE(delnis, nnis * sizeof(lnet_nid_t)); + lnet_ping_buffer_decref(pbuf); + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + + if (rc) { + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + } + return rc; +} + +/* + * The data in pbuf says lp is its primary peer, but the data was + * received by a different peer. Try to update lp with the data. + */ +static int +lnet_peer_set_primary_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) +{ + struct lnet_handle_md mdh; + + /* Queue lp for discovery, and force it on the request queue. 
*/ + lnet_net_lock(LNET_LOCK_EX); + if (lnet_peer_queue_for_discovery(lp)) + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + lnet_net_unlock(LNET_LOCK_EX); + + LNetInvalidateMDHandle(&mdh); + + /* + * Decide whether we can move the peer to the DATA_PRESENT state. + * + * We replace stale data for a multi-rail peer, repair PING_FAILED + * status, and preempt FORCE_PING. + * + * If after that we have DATA_PRESENT, we merge it into this peer. + */ + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_peer_seqno < LNET_PING_BUFFER_SEQNO(pbuf)) { + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } else if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_decref(pbuf); + pbuf = lp->lp_data; + lp->lp_data = NULL; + } + } + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lnet_ping_buffer_decref(lp->lp_data); + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + } + if (lp->lp_state & LNET_PEER_PING_FAILED) { + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + } + if (lp->lp_state & LNET_PEER_FORCE_PING) + lp->lp_state &= ~LNET_PEER_FORCE_PING; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + if (pbuf) + return lnet_peer_merge_data(lp, pbuf); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + return 0; +} + +static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinfo) +{ + int i; + + for (i = 0; i < pinfo->pi_nnis; i++) { + if (pinfo->pi_ni[i].ns_nid == nid) + return true; + } + + return false; +} + +/* + * Update a peer using the data received. + */ +static int lnet_peer_data_present(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_peer_ni *lpni; + lnet_nid_t nid = LNET_NID_ANY; + unsigned flags; + int rc = 0; + + pbuf = lp->lp_data; + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + /* + * Modifications of peer structures are done while holding the + * ln_api_mutex. A global lock is required because we may be + * modifying multiple peer structures, and a mutex greatly + * simplifies memory management. + * + * The actual changes to the data structures must also protect + * against concurrent lookups, for which the lnet_net_lock in + * LNET_LOCK_EX mode is used. + */ + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + + /* + * If this peer is not on the peer list then it is being torn + * down, and our reference count may be all that is keeping it + * alive. Don't do any work on it. + */ + if (list_empty(&lp->lp_peer_list)) + goto out; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * Check whether the primary NID in the message matches the + * primary NID of the peer. If it does, update the peer, if + * it it does not, check whether there is already a peer with + * that primary NID. If no such peer exists, try to update + * the primary NID of the current peer (allowed if it was + * created due to message traffic) and complete the update. + * If the peer did exist, hand off the data to it. 
+ * + * The peer for the loopback interface is a special case: this + * is the peer for the local node, and we want to set its + * primary NID to the correct value here. Moreover, this peer + * can show up with only the loopback NID in the ping buffer. + */ + if (pbuf->pb_info.pi_nnis <= 1) + goto out; + nid = pbuf->pb_info.pi_ni[1].ns_nid; + if (lp->lp_primary_nid == LNET_NID_LO_0) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (!rc) + rc = lnet_peer_merge_data(lp, pbuf); + } else if (lp->lp_primary_nid == nid || + (lnet_is_nid_in_ping_info(lp->lp_primary_nid, &pbuf->pb_info) && + lnet_is_discovery_disabled(lp))) { + rc = lnet_peer_merge_data(lp, pbuf); + } else { + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (rc) { + CERROR("Primary NID error %s versus %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(nid), rc); + } else { + rc = lnet_peer_merge_data(lp, pbuf); + } + } else { + struct lnet_peer *new_lp; + + new_lp = lpni->lpni_peer_net->lpn_peer; + /* if lp has discovery/MR enabled that means new_lp + * should have discovery/MR enabled as well, since + * it's the same peer, which we're about to merge + */ + if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) + new_lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + new_lp->lp_state |= LNET_PEER_MULTI_RAIL; + rc = lnet_peer_set_primary_data( + lpni->lpni_peer_net->lpn_peer, pbuf); + lnet_peer_ni_decref_locked(lpni); + } + } +out: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + /* Tell discovery to re-check the peer immediately. */ + if (!rc) + rc = LNET_REDISCOVER_PEER; + return rc; +} + +/* + * A ping failed. Clear the PING_FAILED state and set the + * FORCE_PING state, to ensure a retry even if discovery is + * disabled. This avoids being left with incorrect state. + */ +static int lnet_peer_ping_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_state |= LNET_PEER_FORCE_PING; + rc = lp->lp_ping_error; + lp->lp_ping_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s:%d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* + * Select NID to send a Ping or Push to. + */ +static lnet_nid_t lnet_peer_select_nid(struct lnet_peer *lp) +{ + struct lnet_peer_ni *lpni; + + /* Look for a direct-connected NID for this peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id)) + continue; + break; + } + if (lpni) + return lpni->lpni_nid; + + /* Look for a routed-connected NID for this peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + if (!lnet_find_rnet_locked(lpni->lpni_peer_net->lpn_net_id)) + continue; + break; + } + if (lpni) + return lpni->lpni_nid; + + return LNET_NID_ANY; +} + +/* Active side of ping. 
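The peer is marked PING_SENT and an LNet GET is sent to a NID chosen by lnet_peer_select_nid() via lnet_send_ping(); the Reply is handled by lnet_discovery_event_reply(), and a hard send error terminates discovery for the peer.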
*/ +static int lnet_peer_send_ping(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lnet_nid_t pnid; + int nnis; + int rc; + int cpt; + + lp->lp_state |= LNET_PEER_PING_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + pnid = lnet_peer_select_nid(lp); + lnet_net_unlock(cpt); + + nnis = MAX(lp->lp_data_nnis, LNET_INTERFACES_MIN); + + rc = lnet_send_ping(pnid, &lp->lp_ping_mdh, nnis, lp, + the_lnet.ln_dc_eqh, false); + + /* + * if LNetMDBind in lnet_send_ping fails we need to decrement the + * refcount on the peer, otherwise LNetMDUnlink will be called + * which will eventually do that. + */ + if (rc > 0) { + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + lnet_net_unlock(cpt); + rc = -rc; /* change the rc to negative value */ + goto fail_error; + } else if (rc < 0) { + goto fail_error; + } + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_error: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PING_SENT, but do + * not set either PING_FAILED or FORCE_PING. In fact we need + * to clear PING_FAILED, because the unlink event handler will + * have set it if we called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PING_SENT | LNET_PEER_PING_FAILED); + return rc; +} + +/* + * This function exists because you cannot call LNetMDUnlink() from an + * event handler. + */ +static int lnet_peer_push_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + lp->lp_state &= ~LNET_PEER_PUSH_FAILED; + rc = lp->lp_push_error; + lp->lp_push_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* + * Mark the peer as discovered. + */ +static int lnet_peer_discovered(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lp->lp_state |= LNET_PEER_DISCOVERED; + lp->lp_state &= ~(LNET_PEER_DISCOVERING | + LNET_PEER_REDISCOVER); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + return 0; +} + +/* Active side of push. */ +static int lnet_peer_send_push(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_process_id id; + struct lnet_md md; + int cpt; + int rc; + + /* Don't push to a non-multi-rail peer. 
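A push only makes sense for a Multi-Rail peer; otherwise FORCE_PUSH is simply cleared and, if the peer's NIDs are already up to date, the peer is marked discovered.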
*/ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + /* if peer's NIDs are uptodate then peer is discovered */ + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { + rc = lnet_peer_discovered(lp); + return rc; + } + + return 0; + } + + lp->lp_state |= LNET_PEER_PUSH_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + pbuf = the_lnet.ln_ping_target; + lnet_ping_buffer_addref(pbuf); + lnet_net_unlock(cpt); + + /* Push source MD */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); + md.threshold = 2; /* Put/Ack */ + md.max_size = 0; + md.options = 0; + md.eq_handle = the_lnet.ln_dc_eqh; + md.user_ptr = lp; + + rc = LNetMDBind(md, LNET_UNLINK, &lp->lp_push_mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind push source MD: %d\n", rc); + goto fail_error; + } + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + id.pid = LNET_PID_LUSTRE; + id.nid = lnet_peer_select_nid(lp); + lnet_net_unlock(cpt); + + if (id.nid == LNET_NID_ANY) { + rc = -EHOSTUNREACH; + goto fail_unlink; + } + + rc = LNetPut(lp->lp_disc_src_nid, lp->lp_push_mdh, + LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, 0); + + /* + * reset the discovery nid. There is no need to restrict sending + * from that source, if we call lnet_push_update_to_peers(). It'll + * get set to a specific NID, if we initiate discovery from the + * scratch + */ + lp->lp_disc_src_nid = LNET_NID_ANY; + + if (rc) + goto fail_unlink; + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_unlink: + LNetMDUnlink(lp->lp_push_mdh); + LNetInvalidateMDHandle(&lp->lp_push_mdh); +fail_error: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PUSH_SENT, but do + * not set PUSH_FAILED. In fact we need to clear PUSH_FAILED, + * because the unlink event handler will have set it if we + * called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PUSH_SENT | LNET_PEER_PUSH_FAILED); + return rc; +} + +/* + * An unrecoverable error was encountered during discovery. + * Set error status in peer and abort discovery. + */ +static void lnet_peer_discovery_error(struct lnet_peer *lp, int error) +{ + CDEBUG(D_NET, "Discovery error %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), error); + + spin_lock(&lp->lp_lock); + lp->lp_dc_error = error; + lp->lp_state &= ~LNET_PEER_DISCOVERING; + lp->lp_state |= LNET_PEER_REDISCOVER; + spin_unlock(&lp->lp_lock); +} + +/* + * Discovering this peer is taking too long. Cancel any Ping or Push + * that discovery is waiting on by unlinking the relevant MDs. The + * lnet_discovery_event_handler() will proceed from here and complete + * the cleanup. 
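+ * The resulting unlink events set PING_FAILED or PUSH_FAILED with a
+ * -ETIMEDOUT error, so the peer is then handled like any other timed
+ * out Ping or Push.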
+ */
+static void lnet_peer_cancel_discovery(struct lnet_peer *lp)
+{
+ struct lnet_handle_md ping_mdh;
+ struct lnet_handle_md push_mdh;
+
+ LNetInvalidateMDHandle(&ping_mdh);
+ LNetInvalidateMDHandle(&push_mdh);
+
+ spin_lock(&lp->lp_lock);
+ if (lp->lp_state & LNET_PEER_PING_SENT) {
+ ping_mdh = lp->lp_ping_mdh;
+ LNetInvalidateMDHandle(&lp->lp_ping_mdh);
+ }
+ if (lp->lp_state & LNET_PEER_PUSH_SENT) {
+ push_mdh = lp->lp_push_mdh;
+ LNetInvalidateMDHandle(&lp->lp_push_mdh);
+ }
+ spin_unlock(&lp->lp_lock);
+
+ if (!LNetMDHandleIsInvalid(ping_mdh))
+ LNetMDUnlink(ping_mdh);
+ if (!LNetMDHandleIsInvalid(push_mdh))
+ LNetMDUnlink(push_mdh);
+}
+
+/*
+ * Wait for work to be queued or some other change that must be
+ * attended to. Returns non-zero if the discovery thread should shut
+ * down.
+ */
+static int lnet_peer_discovery_wait_for_work(void)
+{
+ int cpt;
+ int rc = 0;
+
+ DEFINE_WAIT(wait);
+
+ cpt = lnet_net_lock_current();
+ for (;;) {
+ prepare_to_wait(&the_lnet.ln_dc_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING)
+ break;
+ if (lnet_push_target_resize_needed())
+ break;
+ if (!list_empty(&the_lnet.ln_dc_request))
+ break;
+ if (!list_empty(&the_lnet.ln_msg_resend))
+ break;
+ lnet_net_unlock(cpt);
+
+ /*
+ * wake up at least once a second to check for peers that
+ * have been stuck on the working queue for longer than
+ * the peer timeout.
+ */
+ schedule_timeout(cfs_time_seconds(1));
+ finish_wait(&the_lnet.ln_dc_waitq, &wait);
+ cpt = lnet_net_lock_current();
+ }
+ finish_wait(&the_lnet.ln_dc_waitq, &wait);
+
+ if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING)
+ rc = -ESHUTDOWN;
+
+ lnet_net_unlock(cpt);
+
+ CDEBUG(D_NET, "woken: %d\n", rc);
+
+ return rc;
+}
+
+/*
+ * Messages that were pending on a destroyed peer will be put on a global
+ * resend list. The resend list is checked by the discovery thread when
+ * it wakes up, and the messages on it are resent. These messages can
+ * still be sendable if the lpni that originally caused the message to be
+ * re-queued was transferred to another peer.
+ *
+ * It is possible that LNet could be shut down while we're iterating
+ * through the list. lnet_shutdown_lndnets() will attempt to access the
+ * resend list, but will have to wait until the spinlock is released, by
+ * which time there shouldn't be any more messages on the resend list.
+ * During shutdown lnet_send() will fail and lnet_finalize() will be called
+ * for the messages so they can be released. The other case is that
+ * lnet_shutdown_lndnets() can finalize all the messages before this
+ * function can visit the resend list, in which case this function will be
+ * a no-op.
+ */
+static void lnet_resend_msgs(void)
+{
+ struct lnet_msg *msg, *tmp;
+ struct list_head resend;
+ int rc;
+
+ INIT_LIST_HEAD(&resend);
+
+ spin_lock(&the_lnet.ln_msg_resend_lock);
+ list_splice(&the_lnet.ln_msg_resend, &resend);
+ spin_unlock(&the_lnet.ln_msg_resend_lock);
+
+ list_for_each_entry_safe(msg, tmp, &resend, msg_list) {
+ list_del_init(&msg->msg_list);
+ rc = lnet_send(msg->msg_src_nid_param, msg,
+ msg->msg_rtr_nid_param);
+ if (rc < 0) {
+ CNETERR("Error sending %s to %s: %d\n",
+ lnet_msgtyp2str(msg->msg_type),
+ libcfs_id2str(msg->msg_target), rc);
+ lnet_finalize(msg, rc);
+ }
+ }
+}
+
+/* The discovery thread. 
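Its main loop waits for work, resends queued messages, resizes the push target when needed, and then runs each peer on ln_dc_request through the state checks below; a peer returning LNET_REDISCOVER_PEER goes back on the request queue, while any other error aborts discovery for that peer.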
*/ +static int lnet_peer_discovery(void *arg) +{ + struct lnet_peer *lp; + int rc; + + CDEBUG(D_NET, "started\n"); + cfs_block_allsigs(); + + for (;;) { + if (lnet_peer_discovery_wait_for_work()) + break; + + lnet_resend_msgs(); + + if (lnet_push_target_resize_needed()) + lnet_push_target_resize(); + + lnet_net_lock(LNET_LOCK_EX); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) { + lnet_net_unlock(LNET_LOCK_EX); + break; + } + + /* + * Process all incoming discovery work requests. When + * discovery must wait on a peer to change state, it + * is added to the tail of the ln_dc_working queue. A + * timestamp keeps track of when the peer was added, + * so we can time out discovery requests that take too + * long. + */ + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_working); + /* + * set the time the peer was put on the dc_working + * queue. It shouldn't remain on the queue + * forever, in case the GET message (for ping) + * doesn't get a REPLY or the PUT message (for + * push) doesn't get an ACK. + */ + lp->lp_last_queued = ktime_get_real_seconds(); + lnet_net_unlock(LNET_LOCK_EX); + + /* + * Select an action depending on the state of + * the peer and whether discovery is disabled. + * The check whether discovery is disabled is + * done after the code that handles processing + * for arrived data, cleanup for failures, and + * forcing a Ping or Push. + */ + spin_lock(&lp->lp_lock); + CDEBUG(D_NET, "peer %s state %#x\n", + libcfs_nid2str(lp->lp_primary_nid), + lp->lp_state); + if (lp->lp_state & LNET_PEER_DATA_PRESENT) + rc = lnet_peer_data_present(lp); + else if (lp->lp_state & LNET_PEER_PING_FAILED) + rc = lnet_peer_ping_failed(lp); + else if (lp->lp_state & LNET_PEER_PUSH_FAILED) + rc = lnet_peer_push_failed(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PING) + rc = lnet_peer_send_ping(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PUSH) + rc = lnet_peer_send_push(lp); + else if (!(lp->lp_state & LNET_PEER_NIDS_UPTODATE)) + rc = lnet_peer_send_ping(lp); + else if (lnet_peer_needs_push(lp)) + rc = lnet_peer_send_push(lp); + else + rc = lnet_peer_discovered(lp); + CDEBUG(D_NET, "peer %s state %#x rc %d\n", + libcfs_nid2str(lp->lp_primary_nid), + lp->lp_state, rc); + spin_unlock(&lp->lp_lock); + + lnet_net_lock(LNET_LOCK_EX); + if (rc == LNET_REDISCOVER_PEER) { + list_move(&lp->lp_dc_list, + &the_lnet.ln_dc_request); + } else if (rc) { + lnet_peer_discovery_error(lp, rc); + } + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lnet_peer_discovery_complete(lp); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + } + + lnet_net_unlock(LNET_LOCK_EX); + } + + CDEBUG(D_NET, "stopping\n"); + /* + * Clean up before telling lnet_peer_discovery_stop() that + * we're done. Use wake_up() below to somewhat reduce the + * size of the thundering herd if there are multiple threads + * waiting on discovery of a single peer. + */ + + /* Queue cleanup 1: stop all pending pings and pushes. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_working)) { + lp = list_first_entry(&the_lnet.ln_dc_working, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); + lnet_net_unlock(LNET_LOCK_EX); + lnet_peer_cancel_discovery(lp); + lnet_net_lock(LNET_LOCK_EX); + } + lnet_net_unlock(LNET_LOCK_EX); + + /* Queue cleanup 2: wait for the expired queue to clear. 
*/ + while (!list_empty(&the_lnet.ln_dc_expired)) + schedule_timeout(cfs_time_seconds(1)); + + /* Queue cleanup 3: clear the request queue. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + lnet_peer_discovery_error(lp, -ESHUTDOWN); + lnet_peer_discovery_complete(lp); + } + lnet_net_unlock(LNET_LOCK_EX); + + LNetEQFree(the_lnet.ln_dc_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + wake_up(&the_lnet.ln_dc_waitq); + + CDEBUG(D_NET, "stopped\n"); + + return 0; +} + +/* ln_api_mutex is held on entry. */ +int lnet_peer_discovery_start(void) +{ + struct task_struct *task; + int rc; + + if (the_lnet.ln_dc_state != LNET_DC_STATE_SHUTDOWN) + return -EALREADY; + + rc = LNetEQAlloc(0, lnet_discovery_event_handler, &the_lnet.ln_dc_eqh); + if (rc != 0) { + CERROR("Can't allocate discovery EQ: %d\n", rc); + return rc; + } + + the_lnet.ln_dc_state = LNET_DC_STATE_RUNNING; + task = kthread_run(lnet_peer_discovery, NULL, "lnet_discovery"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start peer discovery thread: %d\n", rc); + + LNetEQFree(the_lnet.ln_dc_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + } + + CDEBUG(D_NET, "discovery start: %d\n", rc); + + return rc; +} + +/* ln_api_mutex is held on entry. */ +void lnet_peer_discovery_stop(void) +{ + if (the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); + the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; + wake_up(&the_lnet.ln_dc_waitq); + + wait_event(the_lnet.ln_dc_waitq, + the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); + + LASSERT(list_empty(&the_lnet.ln_dc_request)); + LASSERT(list_empty(&the_lnet.ln_dc_working)); + LASSERT(list_empty(&the_lnet.ln_dc_expired)); + + CDEBUG(D_NET, "discovery stopped\n"); +} + +/* Debugging */ + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + struct lnet_peer_ni *lp; + int cpt; + + cpt = lnet_cpt_of_nid(nid, NULL); + lnet_net_lock(cpt); + + lp = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); + if (IS_ERR(lp)) { + lnet_net_unlock(cpt); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + aliveness = lp->lpni_alive ? "up" : "down"; + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nid2str(lp->lpni_nid), atomic_read(&lp->lpni_refcount), + aliveness, lp->lpni_net->net_tunables.lct_peer_tx_credits, + lp->lpni_rtrcredits, lp->lpni_minrtrcredits, + lp->lpni_txcredits, lp->lpni_mintxcredits, lp->lpni_txqnob); + + lnet_peer_ni_decref_locked(lp); + + lnet_net_unlock(cpt); +} + +/* Gathering information for userspace. 
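lnet_get_peer_ni_info() reports credit and aliveness information for the peer NI selected by peer_index, while lnet_get_peer_info() copies per-NI credit, statistics and health data for one peer out to a userspace buffer.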
*/ + +int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char aliveness[LNET_MAX_STR_LEN], + __u32 *cpt_iter, __u32 *refcount, + __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, + __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credits, + __u32 *peer_tx_qnob) +{ + struct lnet_peer_table *peer_table; + struct lnet_peer_ni *lp; + int j; + int lncpt; + bool found = false; + + /* get the number of CPTs */ + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* if the cpt number to be examined is >= the number of cpts in + * the system then indicate that there are no more cpts to examin + */ + if (*cpt_iter >= lncpt) + return -ENOENT; + + /* get the current table */ + peer_table = the_lnet.ln_peer_tables[*cpt_iter]; + /* if the ptable is NULL then there are no more cpts to examine */ + if (peer_table == NULL) + return -ENOENT; + + lnet_net_lock(*cpt_iter); + + for (j = 0; j < LNET_PEER_HASH_SIZE && !found; j++) { + struct list_head *peers = &peer_table->pt_hash[j]; + + list_for_each_entry(lp, peers, lpni_hashlist) { + if (peer_index-- > 0) + continue; + + snprintf(aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lp) || + lnet_peer_aliveness_enabled(lp)) + snprintf(aliveness, LNET_MAX_STR_LEN, + lp->lpni_alive ? "up" : "down"); + + *nid = lp->lpni_nid; + *refcount = atomic_read(&lp->lpni_refcount); + *ni_peer_tx_credits = + lp->lpni_net->net_tunables.lct_peer_tx_credits; + *peer_tx_credits = lp->lpni_txcredits; + *peer_rtr_credits = lp->lpni_rtrcredits; + *peer_min_rtr_credits = lp->lpni_mintxcredits; + *peer_tx_qnob = lp->lpni_txqnob; + + found = true; + } + + } + lnet_net_unlock(*cpt_iter); + + *cpt_iter = lncpt; + + return found ? 0 : -ENOENT; +} + +/* ln_api_mutex is held, which keeps the peer list stable */ +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) +{ + struct lnet_ioctl_element_stats *lpni_stats; + struct lnet_ioctl_element_msg_stats *lpni_msg_stats; + struct lnet_ioctl_peer_ni_hstats *lpni_hstats; + struct lnet_peer_ni_credit_info *lpni_info; + struct lnet_peer_ni *lpni; + struct lnet_peer *lp; + lnet_nid_t nid; + __u32 size; + int rc; + + lp = lnet_find_peer(cfg->prcfg_prim_nid); + + if (!lp) { + rc = -ENOENT; + goto out; + } + + size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats) + + sizeof(*lpni_msg_stats) + sizeof(*lpni_hstats); + size *= lp->lp_nnis; + if (size > cfg->prcfg_size) { + cfg->prcfg_size = size; + rc = -E2BIG; + goto out_lp_decref; + } + + cfg->prcfg_prim_nid = lp->lp_primary_nid; + cfg->prcfg_mr = lnet_peer_is_multi_rail(lp); + cfg->prcfg_cfg_nid = lp->lp_primary_nid; + cfg->prcfg_count = lp->lp_nnis; + cfg->prcfg_size = size; + cfg->prcfg_state = lp->lp_state; + + /* Allocate helper buffers. 
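These are filled in for each peer NI in turn and copied out to the userspace bulk buffer, so a single allocation of each is enough.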
*/ + rc = -ENOMEM; + LIBCFS_ALLOC(lpni_info, sizeof(*lpni_info)); + if (!lpni_info) + goto out_lp_decref; + LIBCFS_ALLOC(lpni_stats, sizeof(*lpni_stats)); + if (!lpni_stats) + goto out_free_info; + LIBCFS_ALLOC(lpni_msg_stats, sizeof(*lpni_msg_stats)); + if (!lpni_msg_stats) + goto out_free_stats; + LIBCFS_ALLOC(lpni_hstats, sizeof(*lpni_hstats)); + if (!lpni_hstats) + goto out_free_msg_stats; + + + lpni = NULL; + rc = -EFAULT; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + nid = lpni->lpni_nid; + if (copy_to_user(bulk, &nid, sizeof(nid))) + goto out_free_hstats; + bulk += sizeof(nid); + + memset(lpni_info, 0, sizeof(*lpni_info)); + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lpni) || + lnet_peer_aliveness_enabled(lpni)) + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, + lpni->lpni_alive ? "up" : "down"); + + lpni_info->cr_refcount = atomic_read(&lpni->lpni_refcount); + lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? + lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; + lpni_info->cr_peer_tx_credits = lpni->lpni_txcredits; + lpni_info->cr_peer_rtr_credits = lpni->lpni_rtrcredits; + lpni_info->cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; + lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits; + lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob; + if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info))) + goto out_free_hstats; + bulk += sizeof(*lpni_info); + + memset(lpni_stats, 0, sizeof(*lpni_stats)); + lpni_stats->iel_send_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_SEND); + lpni_stats->iel_recv_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_RECV); + lpni_stats->iel_drop_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_DROP); + if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_stats); + lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats); + if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_msg_stats); + lpni_hstats->hlpni_network_timeout = + atomic_read(&lpni->lpni_hstats.hlt_network_timeout); + lpni_hstats->hlpni_remote_dropped = + atomic_read(&lpni->lpni_hstats.hlt_remote_dropped); + lpni_hstats->hlpni_remote_timeout = + atomic_read(&lpni->lpni_hstats.hlt_remote_timeout); + lpni_hstats->hlpni_remote_error = + atomic_read(&lpni->lpni_hstats.hlt_remote_error); + lpni_hstats->hlpni_health_value = + atomic_read(&lpni->lpni_healthv); + if (copy_to_user(bulk, lpni_hstats, sizeof(*lpni_hstats))) + goto out_free_hstats; + bulk += sizeof(*lpni_hstats); + } + rc = 0; + +out_free_hstats: + LIBCFS_FREE(lpni_hstats, sizeof(*lpni_hstats)); +out_free_msg_stats: + LIBCFS_FREE(lpni_msg_stats, sizeof(*lpni_msg_stats)); +out_free_stats: + LIBCFS_FREE(lpni_stats, sizeof(*lpni_stats)); +out_free_info: + LIBCFS_FREE(lpni_info, sizeof(*lpni_info)); +out_lp_decref: + lnet_peer_decref_locked(lp); +out: + return rc; +} + +void +lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni) +{ + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return; + + if (list_empty(&lpni->lpni_recovery) && + atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) { + CDEBUG(D_NET, "lpni %s added to recovery queue. 
Health = %d\n", + libcfs_nid2str(lpni->lpni_nid), + atomic_read(&lpni->lpni_healthv)); + list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq); + lnet_peer_ni_addref_locked(lpni); + } +} + +/* Call with the ln_api_mutex held */ +void +lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int lncpt; + int cpt; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return; + + if (!all) { + lnet_net_lock(LNET_LOCK_EX); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + atomic_set(&lpni->lpni_healthv, value); + lnet_peer_ni_add_to_recoveryq_locked(lpni); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + return; + } + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Walk all the peers and reset the healhv for each one to the + * maximum value. + */ + lnet_net_lock(LNET_LOCK_EX); + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + list_for_each_entry(lpni, &lpn->lpn_peer_nis, + lpni_peer_nis) { + atomic_set(&lpni->lpni_healthv, value); + lnet_peer_ni_add_to_recoveryq_locked(lpni); + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + diff --git a/drivers/staging/lustrefsx/lnet/lnet/router.c b/drivers/staging/lustrefsx/lnet/lnet/router.c new file mode 100644 index 0000000000000..e2966cf77c561 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/router.c @@ -0,0 +1,1835 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ +#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) +#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ +#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) +#define LNET_NRB_SMALL_PAGES 1 +#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ +#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) +#define LNET_NRB_LARGE_PAGES ((LNET_MTU + PAGE_SIZE - 1) >> \ + PAGE_SHIFT) + +static char *forwarding = ""; +module_param(forwarding, charp, 0444); +MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); + +static int tiny_router_buffers; +module_param(tiny_router_buffers, int, 0444); +MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router"); +static int small_router_buffers; +module_param(small_router_buffers, int, 0444); +MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers; +module_param(large_router_buffers, int, 0444); +MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router"); +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer"); + +static int auto_down = 1; +module_param(auto_down, int, 0444); +MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error"); + +int +lnet_peer_buffer_credits(struct lnet_net *net) +{ + /* NI option overrides LNet default */ + if (net->net_tunables.lct_peer_rtr_credits > 0) + return net->net_tunables.lct_peer_rtr_credits; + if (peer_buffer_credits > 0) + return peer_buffer_credits; + + /* As an approximation, allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + return net->net_tunables.lct_peer_tx_credits; +} + +static int check_routers_before_use; +module_param(check_routers_before_use, int, 0444); +MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); + +int avoid_asym_router_failure = 1; +module_param(avoid_asym_router_failure, int, 0644); +MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)"); + +static int dead_router_check_interval = 60; +module_param(dead_router_check_interval, int, 0644); +MODULE_PARM_DESC(dead_router_check_interval, "Seconds between dead router health checks (<= 0 to disable)"); + +static int live_router_check_interval = 60; +module_param(live_router_check_interval, int, 0644); +MODULE_PARM_DESC(live_router_check_interval, "Seconds between live router health checks (<= 0 to disable)"); + +static int router_ping_timeout = 50; +module_param(router_ping_timeout, int, 0644); +MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query"); + +int +lnet_peers_start_down(void) +{ + return check_routers_before_use; +} + +void +lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, + time64_t when) +{ + if (lp->lpni_timestamp > when) { /* out of date information */ + CDEBUG(D_NET, "Out of date\n"); + return; + } + + /* + * This function can be called with different cpt locks being + * held. lpni_alive_count modification needs to be properly protected. 
+ * Significant reads to lpni_alive_count are also protected with + * the same lock + */ + spin_lock(&lp->lpni_lock); + + lp->lpni_timestamp = when; /* update timestamp */ + lp->lpni_ping_deadline = 0; /* disable ping timeout */ + + if (lp->lpni_alive_count != 0 && /* got old news */ + (!lp->lpni_alive) == (!alive)) { /* new date for old news */ + spin_unlock(&lp->lpni_lock); + CDEBUG(D_NET, "Old news\n"); + return; + } + + /* Flag that notification is outstanding */ + + lp->lpni_alive_count++; + lp->lpni_alive = (alive) ? 1 : 0; + lp->lpni_notify = 1; + lp->lpni_notifylnd = notifylnd; + if (lp->lpni_alive) + lp->lpni_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ + + spin_unlock(&lp->lpni_lock); + + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lpni_nid), alive); +} + +/* + * This function will always be called with lp->lpni_cpt lock held. + */ +static void +lnet_ni_notify_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) +{ + int alive; + int notifylnd; + + /* Notify only in 1 thread at any time to ensure ordered notification. + * NB individual events can be missed; the only guarantee is that you + * always get the most recent news */ + + spin_lock(&lp->lpni_lock); + + if (lp->lpni_notifying || ni == NULL) { + spin_unlock(&lp->lpni_lock); + return; + } + + lp->lpni_notifying = 1; + + /* + * lp->lpni_notify needs to be protected because it can be set in + * lnet_notify_locked(). + */ + while (lp->lpni_notify) { + alive = lp->lpni_alive; + notifylnd = lp->lpni_notifylnd; + + lp->lpni_notifylnd = 0; + lp->lpni_notify = 0; + + if (notifylnd && ni->ni_net->net_lnd->lnd_notify != NULL) { + spin_unlock(&lp->lpni_lock); + lnet_net_unlock(lp->lpni_cpt); + + /* A new notification could happen now; I'll handle it + * when control returns to me */ + + (ni->ni_net->net_lnd->lnd_notify)(ni, lp->lpni_nid, + alive); + + lnet_net_lock(lp->lpni_cpt); + spin_lock(&lp->lpni_lock); + } + } + + lp->lpni_notifying = 0; + spin_unlock(&lp->lpni_lock); +} + +static void +lnet_rtr_addref_locked(struct lnet_peer_ni *lp) +{ + LASSERT(atomic_read(&lp->lpni_refcount) > 0); + LASSERT(lp->lpni_rtr_refcount >= 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lpni_rtr_refcount++; + if (lp->lpni_rtr_refcount == 1) { + struct list_head *pos; + + /* a simple insertion sort */ + list_for_each_prev(pos, &the_lnet.ln_routers) { + struct lnet_peer_ni *rtr; + + rtr = list_entry(pos, struct lnet_peer_ni, + lpni_rtr_list); + if (rtr->lpni_nid < lp->lpni_nid) + break; + } + + list_add(&lp->lpni_rtr_list, pos); + /* addref for the_lnet.ln_routers */ + lnet_peer_ni_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(struct lnet_peer_ni *lp) +{ + LASSERT(atomic_read(&lp->lpni_refcount) > 0); + LASSERT(lp->lpni_rtr_refcount > 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lpni_rtr_refcount--; + if (lp->lpni_rtr_refcount == 0) { + LASSERT(list_empty(&lp->lpni_routes)); + + if (lp->lpni_rcd != NULL) { + list_add(&lp->lpni_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lpni_rcd = NULL; + } + + list_del(&lp->lpni_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_ni_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +struct lnet_remotenet * +lnet_find_rnet_locked(__u32 net) +{ + struct lnet_remotenet *rnet; + struct list_head *tmp; + struct list_head *rn_list; + + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + + rn_list = lnet_net2rnethash(net); + list_for_each(tmp, rn_list) { + rnet = list_entry(tmp, struct lnet_remotenet, lrn_list); + 
+ if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +static void lnet_shuffle_seed(void) +{ + static int seeded; + __u32 lnd_type; + __u32 seed[2]; + struct timespec64 ts; + struct lnet_ni *ni = NULL; + + if (seeded) + return; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + if (lnd_type != LOLND) + seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type); + } + + ktime_get_ts64(&ts); + cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); + seeded = 1; + return; +} + +/* NB expects LNET_LOCK held */ +static void +lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) +{ + unsigned int len = 0; + unsigned int offset = 0; + struct list_head *e; + + lnet_shuffle_seed(); + + list_for_each(e, &rnet->lrn_routes) { + len++; + } + + /* len+1 positions to add a new entry, also prevents division by 0 */ + offset = cfs_rand() % (len + 1); + list_for_each(e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + list_add(&route->lr_gwlist, &route->lr_gateway->lpni_routes); + + the_lnet.ln_remote_nets_version++; + lnet_rtr_addref_locked(route->lr_gateway); +} + +int +lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, + unsigned int priority) +{ + struct list_head *e; + struct lnet_remotenet *rnet; + struct lnet_remotenet *rnet2; + struct lnet_route *route; + struct lnet_ni *ni; + struct lnet_peer_ni *lpni; + int add_route; + int rc; + + CDEBUG(D_NET, "Add route: net %s hops %d priority %u gw %s\n", + libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway)); + + if (gateway == LNET_NID_ANY || + gateway == LNET_NID_LO_0 || + net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND || + LNET_NIDNET(gateway) == net || + (hops != LNET_UNDEFINED_HOPS && (hops < 1 || hops > 255))) + return -EINVAL; + + if (lnet_islocalnet(net)) /* it's a local network */ + return -EEXIST; + + if (!lnet_islocalnet(LNET_NIDNET(gateway))) { + CERROR("Cannot add route with gateway %s. There is no local interface configured on LNet %s\n", + libcfs_nid2str(gateway), + libcfs_net2str(LNET_NIDNET(gateway))); + return -EHOSTUNREACH; + } + + /* Assume net, route, all new */ + LIBCFS_ALLOC(route, sizeof(*route)); + LIBCFS_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + if (route != NULL) + LIBCFS_FREE(route, sizeof(*route)); + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + route->lr_hops = hops; + route->lr_net = net; + route->lr_priority = priority; + + lnet_net_lock(LNET_LOCK_EX); + + lpni = lnet_nid2peerni_ex(gateway, LNET_LOCK_EX); + if (IS_ERR(lpni)) { + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = PTR_ERR(lpni); + if (rc == -EHOSTUNREACH) /* gateway is not on a local net. 
*/ + return rc; /* ignore the route entry */ + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, + libcfs_nid2str(gateway)); + return rc; + } + route->lr_gateway = lpni; + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + + rnet2 = lnet_find_rnet_locked(net); + if (rnet2 == NULL) { + /* new network */ + list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); + rnet2 = rnet; + } + + /* Search for a duplicate route (it's a NOOP if it is) */ + add_route = 1; + list_for_each(e, &rnet2->lrn_routes) { + struct lnet_route *route2; + + route2 = list_entry(e, struct lnet_route, lr_list); + if (route2->lr_gateway == route->lr_gateway) { + add_route = 0; + break; + } + + /* our lookups must be true */ + LASSERT(route2->lr_gateway->lpni_nid != gateway); + } + + if (add_route) { + lnet_peer_ni_addref_locked(route->lr_gateway); /* +1 for notify */ + lnet_add_route_to_rnet(rnet2, route); + + ni = lnet_get_next_ni_locked(route->lr_gateway->lpni_net, NULL); + lnet_net_unlock(LNET_LOCK_EX); + + /* XXX Assume alive */ + if (ni->ni_net->net_lnd->lnd_notify != NULL) + (ni->ni_net->net_lnd->lnd_notify)(ni, gateway, 1); + + lnet_net_lock(LNET_LOCK_EX); + } + + /* -1 for notify or !add_route */ + lnet_peer_ni_decref_locked(route->lr_gateway); + lnet_net_unlock(LNET_LOCK_EX); + + rc = 0; + + if (!add_route) { + rc = -EEXIST; + LIBCFS_FREE(route, sizeof(*route)); + } + + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + /* kick start the monitor thread to handle the added route */ + wake_up(&the_lnet.ln_mt_waitq); + + return rc; +} + +int +lnet_check_routes(void) +{ + struct lnet_remotenet *rnet; + struct lnet_route *route; + struct lnet_route *route2; + struct list_head *e1; + struct list_head *e2; + int cpt; + struct list_head *rn_list; + int i; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, struct lnet_remotenet, lrn_list); + + route2 = NULL; + list_for_each(e2, &rnet->lrn_routes) { + lnet_nid_t nid1; + lnet_nid_t nid2; + int net; + + route = list_entry(e2, struct lnet_route, + lr_list); + + if (route2 == NULL) { + route2 = route; + continue; + } + + if (route->lr_gateway->lpni_net == + route2->lr_gateway->lpni_net) + continue; + + nid1 = route->lr_gateway->lpni_nid; + nid2 = route2->lr_gateway->lpni_nid; + net = rnet->lrn_net; + + lnet_net_unlock(cpt); + + CERROR("Routes to %s via %s and %s not " + "supported\n", + libcfs_net2str(net), + libcfs_nid2str(nid1), + libcfs_nid2str(nid2)); + return -EINVAL; + } + } + } + + lnet_net_unlock(cpt); + return 0; +} + +int +lnet_del_route(__u32 net, lnet_nid_t gw_nid) +{ + struct lnet_peer_ni *gateway; + struct lnet_remotenet *rnet; + struct lnet_route *route; + struct list_head *e1; + struct list_head *e2; + int rc = -ENOENT; + struct list_head *rn_list; + int idx = 0; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nid2str(gw_nid)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry actual NIDs) */ + + lnet_net_lock(LNET_LOCK_EX); + if (net == LNET_NIDNET(LNET_NID_ANY)) + rn_list = &the_lnet.ln_remote_nets_hash[0]; + else + rn_list = lnet_net2rnethash(net); + +again: + list_for_each(e1, rn_list) { + rnet = list_entry(e1, struct lnet_remotenet, lrn_list); + + if (!(net == LNET_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) + continue; + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, struct 
lnet_route, lr_list); + + gateway = route->lr_gateway; + if (!(gw_nid == LNET_NID_ANY || + gw_nid == gateway->lpni_nid)) + continue; + + list_del(&route->lr_list); + list_del(&route->lr_gwlist); + the_lnet.ln_remote_nets_version++; + + if (list_empty(&rnet->lrn_routes)) + list_del(&rnet->lrn_list); + else + rnet = NULL; + + lnet_rtr_decref_locked(gateway); + lnet_peer_ni_decref_locked(gateway); + + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = 0; + lnet_net_lock(LNET_LOCK_EX); + goto again; + } + } + + if (net == LNET_NIDNET(LNET_NID_ANY) && + ++idx < LNET_REMOTE_NETS_HASH_SIZE) { + rn_list = &the_lnet.ln_remote_nets_hash[idx]; + goto again; + } + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_destroy_routes (void) +{ + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); +} + +int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg) +{ + struct lnet_rtrbufpool *rbp; + int i, rc = -ENOENT, j; + + if (the_lnet.ln_rtrpools == NULL) + return rc; + + + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + if (i != cpt) + continue; + + lnet_net_lock(i); + for (j = 0; j < LNET_NRBPOOLS; j++) { + pool_cfg->pl_pools[j].pl_npages = rbp[j].rbp_npages; + pool_cfg->pl_pools[j].pl_nbuffers = rbp[j].rbp_nbuffers; + pool_cfg->pl_pools[j].pl_credits = rbp[j].rbp_credits; + pool_cfg->pl_pools[j].pl_mincredits = rbp[j].rbp_mincredits; + } + lnet_net_unlock(i); + rc = 0; + break; + } + + lnet_net_lock(LNET_LOCK_EX); + pool_cfg->pl_routing = the_lnet.ln_routing; + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +int +lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority) +{ + struct list_head *e1; + struct list_head *e2; + struct lnet_remotenet *rnet; + struct lnet_route *route; + int cpt; + int i; + struct list_head *rn_list; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, struct lnet_remotenet, lrn_list); + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, struct lnet_route, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *hops = route->lr_hops; + *priority = route->lr_priority; + *gateway = route->lr_gateway->lpni_nid; + *alive = lnet_is_route_alive(route); + lnet_net_unlock(cpt); + return 0; + } + } + } + } + + lnet_net_unlock(cpt); + return -ENOENT; +} + +void +lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) +{ + struct lnet_ni_status *stat; + int nnis; + int i; + + __swab32s(&pbuf->pb_info.pi_magic); + __swab32s(&pbuf->pb_info.pi_features); + __swab32s(&pbuf->pb_info.pi_pid); + __swab32s(&pbuf->pb_info.pi_nnis); + nnis = pbuf->pb_info.pi_nnis; + if (nnis > pbuf->pb_nnis) + nnis = pbuf->pb_nnis; + for (i = 0; i < nnis; i++) { + stat = &pbuf->pb_info.pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } + return; +} + +/** + * parse router-checker pinginfo, record number of down NIs for remote + * networks on that router. + */ +static void +lnet_parse_rc_info(struct lnet_rc_data *rcd) +{ + struct lnet_ping_buffer *pbuf = rcd->rcd_pingbuffer; + struct lnet_peer_ni *gw = rcd->rcd_gateway; + struct lnet_route *rte; + int nnis; + + if (!gw->lpni_alive || !pbuf) + return; + + /* + * Protect gw->lpni_ping_feats. 
This can be set from + * lnet_notify_locked with different locks being held + */ + spin_lock(&gw->lpni_lock); + + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + /* NB always racing with network! */ + if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { + CDEBUG(D_NET, "%s: Unexpected magic %08x\n", + libcfs_nid2str(gw->lpni_nid), pbuf->pb_info.pi_magic); + gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; + goto out; + } + + gw->lpni_ping_feats = pbuf->pb_info.pi_features; + + /* Without NI status info there's nothing more to do. */ + if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) + goto out; + + /* Determine the number of NIs for which there is data. */ + nnis = pbuf->pb_info.pi_nnis; + if (pbuf->pb_nnis < nnis) { + if (rcd->rcd_nnis < nnis) + rcd->rcd_nnis = nnis; + nnis = pbuf->pb_nnis; + } + + list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) { + int down = 0; + int up = 0; + int i; + + /* If routing disabled then the route is down. */ + if ((gw->lpni_ping_feats & LNET_PING_FEAT_RTE_DISABLED) != 0) { + rte->lr_downis = 1; + continue; + } + + for (i = 0; i < nnis; i++) { + struct lnet_ni_status *stat = &pbuf->pb_info.pi_ni[i]; + lnet_nid_t nid = stat->ns_nid; + + if (nid == LNET_NID_ANY) { + CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", + libcfs_nid2str(gw->lpni_nid)); + gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; + goto out; + } + + if (nid == LNET_NID_LO_0) + continue; + + if (stat->ns_status == LNET_NI_STATUS_DOWN) { + down++; + continue; + } + + if (stat->ns_status == LNET_NI_STATUS_UP) { + if (LNET_NIDNET(nid) == rte->lr_net) { + up = 1; + break; + } + continue; + } + + CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", + libcfs_nid2str(gw->lpni_nid), stat->ns_status); + gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; + goto out; + } + + if (up) { /* ignore downed NIs if NI for dest network is up */ + rte->lr_downis = 0; + continue; + } + /* if @down is zero and this route is single-hop, it means + * we can't find NI for target network */ + if (down == 0 && rte->lr_hops == 1) + down = 1; + + rte->lr_downis = down; + } +out: + spin_unlock(&gw->lpni_lock); +} + +static void +lnet_router_checker_event(struct lnet_event *event) +{ + struct lnet_rc_data *rcd = event->md.user_ptr; + struct lnet_peer_ni *lp; + + LASSERT(rcd != NULL); + + if (event->unlinked) { + LNetInvalidateMDHandle(&rcd->rcd_mdh); + return; + } + + LASSERT(event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_REPLY); + + lp = rcd->rcd_gateway; + LASSERT(lp != NULL); + + /* NB: it's called with holding lnet_res_lock, we have a few + * places need to hold both locks at the same time, please take + * care of lock ordering */ + lnet_net_lock(lp->lpni_cpt); + if (!lnet_isrouter(lp) || lp->lpni_rcd != rcd) { + /* ignore if no longer a router or rcd is replaced */ + goto out; + } + + if (event->type == LNET_EVENT_SEND) { + lp->lpni_ping_notsent = 0; + if (event->status == 0) + goto out; + } + + /* LNET_EVENT_REPLY */ + /* A successful REPLY means the router is up. If _any_ comms + * to the router fail I assume it's down (this will happen if + * we ping alive routers to try to detect router death before + * apps get burned). */ + + lnet_notify_locked(lp, 1, !event->status, ktime_get_seconds()); + /* The router checker will wake up very shortly and do the + * actual notification. + * XXX If 'lp' stops being a router before then, it will still + * have the notification pending!!! 
*/ + + if (avoid_asym_router_failure && event->status == 0) + lnet_parse_rc_info(rcd); + + out: + lnet_net_unlock(lp->lpni_cpt); +} + +static void +lnet_wait_known_routerstate(void) +{ + struct lnet_peer_ni *rtr; + struct list_head *entry; + int all_known; + + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + + /* the_lnet.ln_api_mutex must be locked */ + for (;;) { + int cpt = lnet_net_lock_current(); + + all_known = 1; + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer_ni, + lpni_rtr_list); + + spin_lock(&rtr->lpni_lock); + + if (rtr->lpni_alive_count == 0) { + all_known = 0; + spin_unlock(&rtr->lpni_lock); + break; + } + spin_unlock(&rtr->lpni_lock); + } + + lnet_net_unlock(cpt); + + if (all_known) + return; + + mutex_unlock(&the_lnet.ln_api_mutex); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + mutex_lock(&the_lnet.ln_api_mutex); + } +} + +void +lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net) +{ + struct lnet_route *rte; + + if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) { + if (rte->lr_net == net) { + rte->lr_downis = 0; + break; + } + } + } +} + +static void +lnet_update_ni_status_locked(void) +{ + struct lnet_ni *ni = NULL; + time64_t now; + time64_t timeout; + + LASSERT(the_lnet.ln_routing); + + timeout = router_ping_timeout + + MAX(live_router_check_interval, dead_router_check_interval); + + now = ktime_get_real_seconds(); + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + if (ni->ni_net->net_lnd->lnd_type == LOLND) + continue; + + if (now < ni->ni_last_alive + timeout) + continue; + + lnet_ni_lock(ni); + /* re-check with lock */ + if (now < ni->ni_last_alive + timeout) { + lnet_ni_unlock(ni); + continue; + } + + LASSERT(ni->ni_status != NULL); + + if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { + CDEBUG(D_NET, "NI(%s:%lld) status changed to down\n", + libcfs_nid2str(ni->ni_nid), timeout); + /* NB: so far, this is the only place to set + * NI status to "down" */ + ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; + } + lnet_ni_unlock(ni); + } +} + +static void +lnet_destroy_rc_data(struct lnet_rc_data *rcd) +{ + LASSERT(list_empty(&rcd->rcd_list)); + /* detached from network */ + LASSERT(LNetMDHandleIsInvalid(rcd->rcd_mdh)); + + if (rcd->rcd_gateway != NULL) { + int cpt = rcd->rcd_gateway->lpni_cpt; + + lnet_net_lock(cpt); + lnet_peer_ni_decref_locked(rcd->rcd_gateway); + lnet_net_unlock(cpt); + } + + if (rcd->rcd_pingbuffer != NULL) + lnet_ping_buffer_decref(rcd->rcd_pingbuffer); + + LIBCFS_FREE(rcd, sizeof(*rcd)); +} + +static struct lnet_rc_data * +lnet_update_rc_data_locked(struct lnet_peer_ni *gateway) +{ + struct lnet_handle_md mdh; + struct lnet_rc_data *rcd; + struct lnet_ping_buffer *pbuf = NULL; + int nnis = LNET_INTERFACES_MIN; + int rc; + int i; + + rcd = gateway->lpni_rcd; + if (rcd) { + nnis = rcd->rcd_nnis; + mdh = rcd->rcd_mdh; + LNetInvalidateMDHandle(&rcd->rcd_mdh); + pbuf = rcd->rcd_pingbuffer; + rcd->rcd_pingbuffer = NULL; + } else { + LNetInvalidateMDHandle(&mdh); + } + + lnet_net_unlock(gateway->lpni_cpt); + + if (rcd) { + LNetMDUnlink(mdh); + lnet_ping_buffer_decref(pbuf); + } else { + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; + + LNetInvalidateMDHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); + rcd->rcd_nnis = nnis; + } + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (pbuf == NULL) + goto out; + + for (i = 0; i < nnis; i++) { + 
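/* every slot starts out invalid; a ping REPLY fills in real entries */
+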
pbuf->pb_info.pi_ni[i].ns_nid = LNET_NID_ANY; + pbuf->pb_info.pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + } + rcd->rcd_pingbuffer = pbuf; + + LASSERT(!LNetEQHandleIsInvalid(the_lnet.ln_rc_eqh)); + rc = LNetMDBind((struct lnet_md){.start = &pbuf->pb_info, + .user_ptr = rcd, + .length = LNET_PING_INFO_SIZE(nnis), + .threshold = LNET_MD_THRESH_INF, + .options = LNET_MD_TRUNCATE, + .eq_handle = the_lnet.ln_rc_eqh}, + LNET_UNLINK, + &rcd->rcd_mdh); + if (rc < 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out_ping_buffer_decref; + } + LASSERT(rc == 0); + + lnet_net_lock(gateway->lpni_cpt); + /* Check if this is still a router. */ + if (!lnet_isrouter(gateway)) + goto out_unlock; + /* Check if someone else installed router data. */ + if (gateway->lpni_rcd && gateway->lpni_rcd != rcd) + goto out_unlock; + + /* Install and/or update the router data. */ + if (!gateway->lpni_rcd) { + lnet_peer_ni_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lpni_rcd = rcd; + } + gateway->lpni_ping_notsent = 0; + + return rcd; + +out_unlock: + lnet_net_unlock(gateway->lpni_cpt); + rc = LNetMDUnlink(mdh); + LASSERT(rc == 0); +out_ping_buffer_decref: + lnet_ping_buffer_decref(pbuf); +out: + if (rcd && rcd != gateway->lpni_rcd) + lnet_destroy_rc_data(rcd); + lnet_net_lock(gateway->lpni_cpt); + return gateway->lpni_rcd; +} + +static int +lnet_router_check_interval(struct lnet_peer_ni *rtr) +{ + int secs; + + secs = rtr->lpni_alive ? live_router_check_interval : + dead_router_check_interval; + if (secs < 0) + secs = 0; + + return secs; +} + +static void +lnet_ping_router_locked(struct lnet_peer_ni *rtr) +{ + struct lnet_rc_data *rcd = NULL; + time64_t now = ktime_get_seconds(); + time64_t secs; + struct lnet_ni *ni; + + lnet_peer_ni_addref_locked(rtr); + + if (rtr->lpni_ping_deadline != 0 && /* ping timed out? */ + now > rtr->lpni_ping_deadline) + lnet_notify_locked(rtr, 1, 0, now); + + /* Run any outstanding notifications */ + ni = lnet_get_next_ni_locked(rtr->lpni_net, NULL); + lnet_ni_notify_locked(ni, rtr); + + if (!lnet_isrouter(rtr) || + the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + /* router table changed or router checker is shutting down */ + lnet_peer_ni_decref_locked(rtr); + return; + } + + rcd = rtr->lpni_rcd; + + /* + * The response to the router checker ping could've timed out and + * the mdh might've been invalidated, so we need to update it + * again. 
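+	 * The same path also grows the ping buffer when an earlier REPLY
+	 * advertised more NIs than the current buffer holds
+	 * (rcd_nnis > pb_nnis).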
+ */ + if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis || + LNetMDHandleIsInvalid(rcd->rcd_mdh)) + rcd = lnet_update_rc_data_locked(rtr); + if (rcd == NULL) + return; + + secs = lnet_router_check_interval(rtr); + + CDEBUG(D_NET, + "rtr %s %lld: deadline %lld ping_notsent %d alive %d " + "alive_count %d lpni_ping_timestamp %lld\n", + libcfs_nid2str(rtr->lpni_nid), secs, + rtr->lpni_ping_deadline, rtr->lpni_ping_notsent, + rtr->lpni_alive, rtr->lpni_alive_count, rtr->lpni_ping_timestamp); + + if (secs != 0 && !rtr->lpni_ping_notsent && + now > rtr->lpni_ping_timestamp + secs) { + int rc; + struct lnet_process_id id; + struct lnet_handle_md mdh; + + id.nid = rtr->lpni_nid; + id.pid = LNET_PID_LUSTRE; + CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); + + rtr->lpni_ping_notsent = 1; + rtr->lpni_ping_timestamp = now; + + mdh = rcd->rcd_mdh; + + if (rtr->lpni_ping_deadline == 0) { + rtr->lpni_ping_deadline = ktime_get_seconds() + + router_ping_timeout; + } + + lnet_net_unlock(rtr->lpni_cpt); + + rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, false); + + lnet_net_lock(rtr->lpni_cpt); + if (rc != 0) + rtr->lpni_ping_notsent = 0; /* no event pending */ + } + + lnet_peer_ni_decref_locked(rtr); + return; +} + +int lnet_router_pre_mt_start(void) +{ + int rc; + + if (check_routers_before_use && + dead_router_check_interval <= 0) { + LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be" + " set if 'check_routers_before_use' is set" + "\n"); + return -EINVAL; + } + + rc = LNetEQAlloc(0, lnet_router_checker_event, &the_lnet.ln_rc_eqh); + if (rc != 0) { + CERROR("Can't allocate EQ(0): %d\n", rc); + return -ENOMEM; + } + + return 0; +} + +void lnet_router_post_mt_start(void) +{ + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. 
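+		 * lnet_wait_known_routerstate() below blocks until every known
+		 * router has a determined alive state, dropping ln_api_mutex
+		 * around each one-second sleep.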
*/ + lnet_wait_known_routerstate(); + } +} + +void +lnet_router_cleanup(void) +{ + int rc; + + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); + return; +} + +void +lnet_prune_rc_data(int wait_unlink) +{ + struct lnet_rc_data *rcd; + struct lnet_rc_data *tmp; + struct lnet_peer_ni *lp; + struct list_head head; + int i = 2; + + if (likely(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING && + list_empty(&the_lnet.ln_rcd_deathrow) && + list_empty(&the_lnet.ln_rcd_zombie))) + return; + + INIT_LIST_HEAD(&head); + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + /* router checker is stopping, prune all */ + list_for_each_entry(lp, &the_lnet.ln_routers, + lpni_rtr_list) { + if (lp->lpni_rcd == NULL) + continue; + + LASSERT(list_empty(&lp->lpni_rcd->rcd_list)); + list_add(&lp->lpni_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lpni_rcd = NULL; + } + } + + /* unlink all RCDs on deathrow list */ + list_splice_init(&the_lnet.ln_rcd_deathrow, &head); + + if (!list_empty(&head)) { + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry(rcd, &head, rcd_list) + LNetMDUnlink(rcd->rcd_mdh); + + lnet_net_lock(LNET_LOCK_EX); + } + + list_splice_init(&head, &the_lnet.ln_rcd_zombie); + + /* release all zombie RCDs */ + while (!list_empty(&the_lnet.ln_rcd_zombie)) { + list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie, + rcd_list) { + if (LNetMDHandleIsInvalid(rcd->rcd_mdh)) + list_move(&rcd->rcd_list, &head); + } + + wait_unlink = wait_unlink && + !list_empty(&the_lnet.ln_rcd_zombie); + + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(&head)) { + rcd = list_entry(head.next, + struct lnet_rc_data, rcd_list); + list_del_init(&rcd->rcd_list); + lnet_destroy_rc_data(rcd); + } + + if (!wait_unlink) + return; + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for rc buffers to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 4); + + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * This function is called from the monitor thread to check if there are + * any active routers that need to be checked. 
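+ * Note it also returns true while LNet is shutting down and while rcd
+ * cleanup (the deathrow/zombie lists) is still pending, so the final
+ * pruning is not skipped.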
+ */ +inline bool +lnet_router_checker_active(void) +{ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return true; + + /* Router Checker thread needs to run when routing is enabled in + * order to call lnet_update_ni_status_locked() */ + if (the_lnet.ln_routing) + return true; + + /* if there are routers that need to be cleaned up then do so */ + if (!list_empty(&the_lnet.ln_rcd_deathrow) || + !list_empty(&the_lnet.ln_rcd_zombie)) + return true; + + return !list_empty(&the_lnet.ln_routers) && + (live_router_check_interval > 0 || + dead_router_check_interval > 0); +} + +void +lnet_check_routers(void) +{ + struct lnet_peer_ni *rtr; + struct list_head *entry; + __u64 version; + int cpt; + int cpt2; + + cpt = lnet_net_lock_current(); +rescan: + version = the_lnet.ln_routers_version; + + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer_ni, + lpni_rtr_list); + + cpt2 = rtr->lpni_cpt; + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) + goto rescan; + } + + lnet_ping_router_locked(rtr); + + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } + + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); + + lnet_net_unlock(cpt); + + lnet_prune_rc_data(0); /* don't wait for UNLINK */ +} + +void +lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages) +{ + int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); + + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].kiov_page); + + LIBCFS_FREE(rb, sz); +} + +static struct lnet_rtrbuf * +lnet_new_rtrbuf(struct lnet_rtrbufpool *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); + struct page *page; + struct lnet_rtrbuf *rb; + int i; + + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + GFP_KERNEL | __GFP_ZERO); + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].kiov_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].kiov_len = PAGE_SIZE; + rb->rb_kiov[i].kiov_offset = 0; + rb->rb_kiov[i].kiov_page = page; + } + + return rb; +} + +static void +lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + struct lnet_rtrbuf *rb; + struct list_head tmp; + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; + + INIT_LIST_HEAD(&tmp); + + lnet_net_lock(cpt); + list_splice_init(&rbp->rbp_msgs, &tmp); + lnet_drop_routed_msgs_locked(&tmp, cpt); + list_splice_init(&rbp->rbp_bufs, &tmp); + rbp->rbp_req_nbuffers = 0; + rbp->rbp_nbuffers = rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; + lnet_net_unlock(cpt); + + /* Free buffers on the free list. 
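+	 * The queued messages spliced onto 'tmp' were dropped above while the
+	 * lock was held; the entries left on 'tmp' are the pool's idle buffers.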
*/ + while (!list_empty(&tmp)) { + rb = list_entry(tmp.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + } +} + +static int +lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt) +{ + struct list_head rb_list; + struct lnet_rtrbuf *rb; + int num_rb; + int num_buffers = 0; + int old_req_nbufs; + int npages = rbp->rbp_npages; + + lnet_net_lock(cpt); + /* If we are called for less buffers than already in the pool, we + * just lower the req_nbuffers number and excess buffers will be + * thrown away as they are returned to the free list. Credits + * then get adjusted as well. + * If we already have enough buffers allocated to serve the + * increase requested, then we can treat that the same way as we + * do the decrease. */ + num_rb = nbufs - rbp->rbp_nbuffers; + if (nbufs <= rbp->rbp_req_nbuffers || num_rb <= 0) { + rbp->rbp_req_nbuffers = nbufs; + lnet_net_unlock(cpt); + return 0; + } + /* store the older value of rbp_req_nbuffers and then set it to + * the new request to prevent lnet_return_rx_credits_locked() from + * freeing buffers that we need to keep around */ + old_req_nbufs = rbp->rbp_req_nbuffers; + rbp->rbp_req_nbuffers = nbufs; + lnet_net_unlock(cpt); + + INIT_LIST_HEAD(&rb_list); + + /* allocate the buffers on a local list first. If all buffers are + * allocated successfully then join this list to the rbp buffer + * list. If not then free all allocated buffers. */ + while (num_rb-- > 0) { + rb = lnet_new_rtrbuf(rbp, cpt); + if (rb == NULL) { + CERROR("Failed to allocate %d route bufs of %d pages\n", + nbufs, npages); + + lnet_net_lock(cpt); + rbp->rbp_req_nbuffers = old_req_nbufs; + lnet_net_unlock(cpt); + + goto failed; + } + + list_add(&rb->rb_list, &rb_list); + num_buffers++; + } + + lnet_net_lock(cpt); + + list_splice_tail(&rb_list, &rbp->rbp_bufs); + rbp->rbp_nbuffers += num_buffers; + rbp->rbp_credits += num_buffers; + rbp->rbp_mincredits = rbp->rbp_credits; + /* We need to schedule blocked msg using the newly + * added buffers. 
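+	 * Keep going while the pool still has both a free buffer and a
+	 * blocked message waiting for one.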
*/ + while (!list_empty(&rbp->rbp_bufs) && + !list_empty(&rbp->rbp_msgs)) + lnet_schedule_blocked_locked(rbp); + + lnet_net_unlock(cpt); + + return 0; + +failed: + while (!list_empty(&rb_list)) { + rb = list_entry(rb_list.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + } + + return -ENOMEM; +} + +static void +lnet_rtrpool_init(struct lnet_rtrbufpool *rbp, int npages) +{ + INIT_LIST_HEAD(&rbp->rbp_msgs); + INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_rtrpools_free(int keep_pools) +{ + struct lnet_rtrbufpool *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[LNET_TINY_BUF_IDX], i); + lnet_rtrpool_free_bufs(&rtrp[LNET_SMALL_BUF_IDX], i); + lnet_rtrpool_free_bufs(&rtrp[LNET_LARGE_BUF_IDX], i); + } + + if (!keep_pools) { + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; + } +} + +static int +lnet_nrb_tiny_calculate(void) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + return -EINVAL; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(void) +{ + int nrbs = LNET_NRB_SMALL; + + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + return -EINVAL; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); +} + +static int +lnet_nrb_large_calculate(void) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + return -EINVAL; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} + +int +lnet_rtrpools_alloc(int im_a_router) +{ + struct lnet_rtrbufpool *rtrp; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either " + "'enabled' or 'disabled'\n"); + return -EINVAL; + } + + nrb_tiny = lnet_nrb_tiny_calculate(); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(struct lnet_rtrbufpool)); + if (the_lnet.ln_rtrpools == NULL) { + LCONSOLE_ERROR_MSG(0x10c, + "Failed to initialize router buffe pool\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_init(&rtrp[LNET_TINY_BUF_IDX], 0); + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], + nrb_tiny, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[LNET_SMALL_BUF_IDX], + LNET_NRB_SMALL_PAGES); + rc = 
lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX], + nrb_small, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[LNET_LARGE_BUF_IDX], + LNET_NRB_LARGE_PAGES); + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX], + nrb_large, i); + if (rc != 0) + goto failed; + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + + failed: + lnet_rtrpools_free(0); + return rc; +} + +static int +lnet_rtrpools_adjust_helper(int tiny, int small, int large) +{ + int nrb = 0; + int rc = 0; + int i; + struct lnet_rtrbufpool *rtrp; + + /* If the provided values for each buffer pool are different than the + * configured values, we need to take action. */ + if (tiny >= 0) { + tiny_router_buffers = tiny; + nrb = lnet_nrb_tiny_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + if (small >= 0) { + small_router_buffers = small; + nrb = lnet_nrb_small_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + if (large >= 0) { + large_router_buffers = large; + nrb = lnet_nrb_large_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + + return 0; +} + +int +lnet_rtrpools_adjust(int tiny, int small, int large) +{ + /* this function doesn't revert the changes if adding new buffers + * failed. It's up to the user space caller to revert the + * changes. */ + + if (!the_lnet.ln_routing) + return 0; + + return lnet_rtrpools_adjust_helper(tiny, small, large); +} + +int +lnet_rtrpools_enable(void) +{ + int rc = 0; + + if (the_lnet.ln_routing) + return 0; + + if (the_lnet.ln_rtrpools == NULL) + /* If routing is turned off, and we have never + * initialized the pools before, just call the + * standard buffer pool allocation routine as + * if we are just configuring this for the first + * time. */ + rc = lnet_rtrpools_alloc(1); + else + rc = lnet_rtrpools_adjust_helper(0, 0, 0); + if (rc != 0) + return rc; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + + the_lnet.ln_ping_target->pb_info.pi_features &= + ~LNET_PING_FEAT_RTE_DISABLED; + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_rtrpools_disable(void) +{ + if (!the_lnet.ln_routing) + return; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 0; + the_lnet.ln_ping_target->pb_info.pi_features |= + LNET_PING_FEAT_RTE_DISABLED; + + tiny_router_buffers = 0; + small_router_buffers = 0; + large_router_buffers = 0; + lnet_net_unlock(LNET_LOCK_EX); + lnet_rtrpools_free(1); +} + +int +lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, time64_t when) +{ + struct lnet_peer_ni *lp = NULL; + time64_t now = ktime_get_seconds(); + int cpt = lnet_cpt_of_nid(nid, ni); + + LASSERT (!in_interrupt ()); + + CDEBUG (D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), + alive ? "up" : "down"); + + if (ni != NULL && + LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... 
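+	 * an aliveness update stamped in the future is rejected outright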
*/ + if (when > now) { + CWARN("Ignoring prediction from %s of %s %s " + "%lld seconds in the future\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down", when - now); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + lnet_net_lock(cpt); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + lp = lnet_find_peer_ni_locked(nid); + if (lp == NULL) { + /* nid not found */ + lnet_net_unlock(cpt); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + /* + * It is possible for this function to be called for the same peer + * but with different NIs. We want to synchronize the notification + * between the different calls. So we will use the lpni_cpt to + * grab the net lock. + */ + if (lp->lpni_cpt != cpt) { + lnet_net_unlock(cpt); + cpt = lp->lpni_cpt; + lnet_net_lock(cpt); + } + + /* We can't fully trust LND on reporting exact peer last_alive + * if he notifies us about dead peer. For example ksocklnd can + * call us with when == _time_when_the_node_was_booted_ if + * no connections were successfully established */ + if (ni != NULL && !alive && when < lp->lpni_last_alive) + when = lp->lpni_last_alive; + + lnet_notify_locked(lp, ni == NULL, alive, when); + + if (ni != NULL) + lnet_ni_notify_locked(ni, lp); + + lnet_peer_ni_decref_locked(lp); + + lnet_net_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(lnet_notify); diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c new file mode 100644 index 0000000000000..2e60609ee229d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -0,0 +1,966 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * This file is part of Lustre, https://wiki.whamcloud.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#include +#include + +/* This is really lnet_proc.c. You might need to update sanity test 215 + * if any file format is changed. 
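+ *
+ * The seq position (*ppos) used by the handlers below is a packed cookie
+ * of CPT index, table version, hash bucket and offset, built and decoded
+ * with the LNET_PROC_* macros that follow; with a 64-bit loff_t the
+ * version field is 16 bits wide and the sign bit is never used.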
*/ + +#define LNET_LOFFT_BITS (sizeof(loff_t) * 8) +/* + * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system + */ +#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) +/* change version, 16 bits or 8 bits */ +#define LNET_PROC_VER_BITS MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8) + +#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS +/* + * bits for peer hash offset + * NB: we don't use the highest bit of *ppos because it's signed + */ +#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ + LNET_PROC_CPT_BITS - \ + LNET_PROC_VER_BITS - \ + LNET_PROC_HASH_BITS - 1) +/* bits for hash index + position */ +#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS) +/* bits for peer hash table + hash version */ +#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS) + +#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1) +#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1) +#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1) +#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1) + +#define LNET_PROC_CPT_GET(pos) \ + (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK) + +#define LNET_PROC_VER_GET(pos) \ + (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK) + +#define LNET_PROC_HASH_GET(pos) \ + (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK) + +#define LNET_PROC_HOFF_GET(pos) \ + (int)((pos) & LNET_PROC_HOFF_MASK) + +#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \ + (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \ + ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \ + ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \ + ((off) & LNET_PROC_HOFF_MASK)) + +#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) + +static int __proc_lnet_stats(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + int rc; + struct lnet_counters *ctrs; + struct lnet_counters_common common; + int len; + char *tmpstr; + const int tmpsiz = 256; /* 7 %u and 4 __u64 */ + + if (write) { + lnet_counters_reset(); + return 0; + } + + /* read */ + + LIBCFS_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) { + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return -ENOMEM; + } + + lnet_counters_get(ctrs); + common = ctrs->lct_common; + + len = snprintf(tmpstr, tmpsiz, + "%u %u %u %u %u %u %u %llu %llu " + "%llu %llu", + common.lcc_msgs_alloc, common.lcc_msgs_max, + common.lcc_errors, + common.lcc_send_count, common.lcc_recv_count, + common.lcc_route_count, common.lcc_drop_count, + common.lcc_send_length, common.lcc_recv_length, + common.lcc_route_length, common.lcc_drop_length); + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + + LIBCFS_FREE(tmpstr, tmpsiz); + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +static int +proc_lnet_stats(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_stats); +} + +static int +proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + char *tmpstr; + char *s; + int rc = 0; + int len; + int ver; + int off; + + CLASSERT(sizeof(loff_t) >= 4); + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 
0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + LASSERT(tmpstr + tmpsiz - s > 0); + + s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %8s %7s %s\n", + "net", "hops", "priority", "state", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_remote_nets_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *n; + struct list_head *r; + struct lnet_route *route = NULL; + struct lnet_remotenet *rnet = NULL; + int skip = off - 1; + struct list_head *rn_list; + int i; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { + lnet_net_unlock(0); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL; + i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + n = rn_list->next; + + while (n != rn_list && route == NULL) { + rnet = list_entry(n, struct lnet_remotenet, + lrn_list); + + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + struct lnet_route *re = + list_entry(r, struct lnet_route, + lr_list); + if (skip == 0) { + route = re; + break; + } + + skip--; + r = r->next; + } + + n = n->next; + } + } + + if (route != NULL) { + __u32 net = rnet->lrn_net; + __u32 hops = route->lr_hops; + unsigned int priority = route->lr_priority; + lnet_nid_t nid = route->lr_gateway->lpni_nid; + int alive = lnet_is_route_alive(route); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-8s %4d %8u %7s %s\n", + libcfs_net2str(net), hops, + priority, + alive ? 
"up" : "down", + libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int +proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc = 0; + char *tmpstr; + char *s; + const int tmpsiz = 256; + int len; + int ver; + int off; + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", + "ref", "rtr_ref", "alive_cnt", "state", + "last_ping", "ping_sent", "deadline", + "down_ni", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_routers_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *r; + struct lnet_peer_ni *peer = NULL; + int skip = off - 1; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { + lnet_net_unlock(0); + + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + r = the_lnet.ln_routers.next; + + while (r != &the_lnet.ln_routers) { + struct lnet_peer_ni *lp = + list_entry(r, struct lnet_peer_ni, + lpni_rtr_list); + + if (skip == 0) { + peer = lp; + break; + } + + skip--; + r = r->next; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lpni_nid; + time64_t now = ktime_get_seconds(); + time64_t deadline = peer->lpni_ping_deadline; + int nrefs = atomic_read(&peer->lpni_refcount); + int nrtrrefs = peer->lpni_rtr_refcount; + int alive_cnt = peer->lpni_alive_count; + int alive = peer->lpni_alive; + int pingsent = !peer->lpni_ping_notsent; + time64_t last_ping = now - peer->lpni_ping_timestamp; + int down_ni = 0; + struct lnet_route *rtr; + + if ((peer->lpni_ping_feats & + LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rtr, &peer->lpni_routes, + lr_gwlist) { + /* downis on any route should be the + * number of downis on the gateway */ + if (rtr->lr_downis != 0) { + down_ni = rtr->lr_downis; + break; + } + } + } + + if (deadline == 0) + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12llu %9d %8s %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, "NA", down_ni, + libcfs_nid2str(nid)); + else + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12llu %9d %8llu %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? 
"up" : "down", last_ping, + pingsent, + deadline - now, + down_ni, libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +/* TODO: there should be no direct access to ptable. We should add a set + * of APIs that give access to the ptable and its members */ +static int +proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + struct lnet_peer_table *ptable; + char *tmpstr = NULL; + char *s; + int cpt = LNET_PROC_CPT_GET(*ppos); + int ver = LNET_PROC_VER_GET(*ppos); + int hash = LNET_PROC_HASH_GET(*ppos); + int hoff = LNET_PROC_HOFF_GET(*ppos); + int rc = 0; + int len; + + if (write) { + int i; + struct lnet_peer_ni *peer; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(i); + for (hash = 0; hash < LNET_PEER_HASH_SIZE; hash++) { + list_for_each_entry(peer, + &ptable->pt_hash[hash], + lpni_hashlist) { + peer->lpni_mintxcredits = + peer->lpni_txcredits; + peer->lpni_minrtrcredits = + peer->lpni_rtrcredits; + } + } + lnet_net_unlock(i); + } + *ppos += *lenp; + return 0; + } + + if (*lenp == 0) + return 0; + + CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS); + + if (cpt >= LNET_CPT_NUMBER) { + *lenp = 0; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", + "rtr", "min", "tx", "min", "queue"); + LASSERT(tmpstr + tmpsiz - s > 0); + + hoff++; + } else { + struct lnet_peer_ni *peer; + struct list_head *p; + int skip; + + again: + p = NULL; + peer = NULL; + skip = hoff - 1; + + lnet_net_lock(cpt); + ptable = the_lnet.ln_peer_tables[cpt]; + if (hoff == 1) + ver = LNET_PROC_VERSION(ptable->pt_version); + + if (ver != LNET_PROC_VERSION(ptable->pt_version)) { + lnet_net_unlock(cpt); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + while (hash < LNET_PEER_HASH_SIZE) { + if (p == NULL) + p = ptable->pt_hash[hash].next; + + while (p != &ptable->pt_hash[hash]) { + struct lnet_peer_ni *lp = + list_entry(p, struct lnet_peer_ni, + lpni_hashlist); + if (skip == 0) { + peer = lp; + + /* minor optimization: start from idx+1 + * on next iteration if we've just + * drained lpni_hashlist */ + if (lp->lpni_hashlist.next == + &ptable->pt_hash[hash]) { + hoff = 1; + hash++; + } else { + hoff++; + } + + break; + } + + skip--; + p = lp->lpni_hashlist.next; + } + + if (peer != NULL) + break; + + p = NULL; + hoff = 1; + hash++; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lpni_nid; + int nrefs = atomic_read(&peer->lpni_refcount); + time64_t lastalive = -1; + char *aliveness = "NA"; + int maxcr = (peer->lpni_net) ? 
+ peer->lpni_net->net_tunables.lct_peer_tx_credits : 0; + int txcr = peer->lpni_txcredits; + int mintxcr = peer->lpni_mintxcredits; + int rtrcr = peer->lpni_rtrcredits; + int minrtrcr = peer->lpni_minrtrcredits; + int txqnob = peer->lpni_txqnob; + + if (lnet_isrouter(peer) || + lnet_peer_aliveness_enabled(peer)) + aliveness = peer->lpni_alive ? "up" : "down"; + + if (lnet_peer_aliveness_enabled(peer)) { + time64_t now = ktime_get_seconds(); + + lastalive = now - peer->lpni_last_alive; + + /* No need to mess up peers contents with + * arbitrarily long integers - it suffices to + * know that lastalive is more than 10000s old + */ + if (lastalive >= 10000) + lastalive = 9999; + } + + lnet_net_unlock(cpt); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4d %5s %5lld %5d %5d %5d %5d %5d %d\n", + libcfs_nid2str(nid), nrefs, aliveness, + lastalive, maxcr, rtrcr, minrtrcr, txcr, + mintxcr, txqnob); + LASSERT(tmpstr + tmpsiz - s > 0); + + } else { /* peer is NULL */ + lnet_net_unlock(cpt); + } + + if (hash == LNET_PEER_HASH_SIZE) { + cpt++; + hash = 0; + hoff = 1; + if (peer == NULL && cpt < LNET_CPT_NUMBER) + goto again; + } + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int __proc_lnet_buffers(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *s; + char *tmpstr; + int tmpsiz; + int idx; + int len; + int rc; + int i; + + LASSERT(!write); + + /* (4 %d) * 4 * LNET_CPT_NUMBER */ + tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + s += snprintf(s, tmpstr + tmpsiz - s, + "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + LASSERT(tmpstr + tmpsiz - s > 0); + + if (the_lnet.ln_rtrpools == NULL) + goto out; /* I'm not a router */ + + for (idx = 0; idx < LNET_NRBPOOLS; idx++) { + struct lnet_rtrbufpool *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%5d %5d %7d %7d\n", + rbp[idx].rbp_npages, + rbp[idx].rbp_nbuffers, + rbp[idx].rbp_credits, + rbp[idx].rbp_mincredits); + LASSERT(tmpstr + tmpsiz - s > 0); + } + lnet_net_unlock(LNET_LOCK_EX); + } + + out: + len = s - tmpstr; + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, NULL); + + LIBCFS_FREE(tmpstr, tmpsiz); + return rc; +} + +static int +proc_lnet_buffers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_buffers); +} + +static int +proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int tmpsiz = 128 * LNET_CPT_NUMBER; + int rc = 0; + char *tmpstr; + char *s; + int len; + + if (*lenp == 0) + return 0; + + if (write) { + /* Just reset the min stat. 
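+		 * (a write just resets tq_credits_min to the current tq_credits
+		 * on the NIs' TX queues)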
*/ + struct lnet_ni *ni; + struct lnet_net *net; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + struct lnet_tx_queue *tq; + int i; + int j; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + tq->tq_credits_min = tq->tq_credits; + if (i != 0) + lnet_net_unlock(i); + } + } + } + lnet_net_unlock(0); + *ppos += *lenp; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); + LASSERT (tmpstr + tmpsiz - s > 0); + } else { + struct lnet_ni *ni = NULL; + int skip = *ppos - 1; + + lnet_net_lock(0); + + ni = lnet_get_ni_idx_locked(skip); + + if (ni != NULL) { + struct lnet_tx_queue *tq; + char *stat; + time64_t now = ktime_get_real_seconds(); + time64_t last_alive = -1; + int i; + int j; + + if (the_lnet.ln_routing) + last_alive = now - ni->ni_last_alive; + + /* @lo forever alive */ + if (ni->ni_net->net_lnd->lnd_type == LOLND) + last_alive = 0; + + lnet_ni_lock(ni); + LASSERT(ni->ni_status != NULL); + stat = (ni->ni_status->ns_status == + LNET_NI_STATUS_UP) ? "up" : "down"; + lnet_ni_unlock(ni); + + /* we actually output credits information for + * TX queue of each partition */ + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5lld %4d %4d %4d %5d %5d %5d\n", + libcfs_nid2str(ni->ni_nid), stat, + last_alive, *ni->ni_refs[i], + ni->ni_net->net_tunables.lct_peer_tx_credits, + ni->ni_net->net_tunables.lct_peer_rtr_credits, + tq->tq_credits_max, + tq->tq_credits, tq->tq_credits_min); + if (i != 0) + lnet_net_unlock(i); + } + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos += 1; + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +struct lnet_portal_rotors { + int pr_value; + const char *pr_name; + const char *pr_desc; +}; + +static struct lnet_portal_rotors portal_rotors[] = { + { + .pr_value = LNET_PTL_ROTOR_OFF, + .pr_name = "OFF", + .pr_desc = "Turn off message rotor for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_ON, + .pr_name = "ON", + .pr_desc = "round-robin dispatch all PUT messages for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_RR_RT, + .pr_name = "RR_RT", + .pr_desc = "round-robin dispatch routed PUT message for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_HASH_RT, + .pr_name = "HASH_RT", + .pr_desc = "dispatch routed PUT message by hashing source " + "NID for wildcard portals" + }, + { + .pr_value = -1, + .pr_name = NULL, + .pr_desc = NULL + }, +}; + +static int __proc_lnet_portal_rotor(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + const int 
buf_len = 128; + char *buf; + char *tmp; + int rc; + int i; + + LIBCFS_ALLOC(buf, buf_len); + if (buf == NULL) + return -ENOMEM; + + if (!write) { + lnet_res_lock(0); + + for (i = 0; portal_rotors[i].pr_value >= 0; i++) { + if (portal_rotors[i].pr_value == portal_rotor) + break; + } + + LASSERT(portal_rotors[i].pr_value == portal_rotor); + lnet_res_unlock(0); + + rc = snprintf(buf, buf_len, + "{\n\tportals: all\n" + "\trotor: %s\n\tdescription: %s\n}", + portal_rotors[i].pr_name, + portal_rotors[i].pr_desc); + + if (pos >= min_t(int, rc, buf_len)) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + buf + pos, "\n"); + } + goto out; + } + + rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob); + if (rc < 0) + goto out; + + tmp = cfs_trimwhite(buf); + + rc = -EINVAL; + lnet_res_lock(0); + for (i = 0; portal_rotors[i].pr_name != NULL; i++) { + if (strncasecmp(portal_rotors[i].pr_name, tmp, + strlen(portal_rotors[i].pr_name)) == 0) { + portal_rotor = portal_rotors[i].pr_value; + rc = 0; + break; + } + } + lnet_res_unlock(0); +out: + LIBCFS_FREE(buf, buf_len); + return rc; +} + +static int +proc_lnet_portal_rotor(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_portal_rotor); +} + + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + INIT_CTL_NAME + .procname = "stats", + .mode = 0644, + .proc_handler = &proc_lnet_stats, + }, + { + INIT_CTL_NAME + .procname = "routes", + .mode = 0444, + .proc_handler = &proc_lnet_routes, + }, + { + INIT_CTL_NAME + .procname = "routers", + .mode = 0444, + .proc_handler = &proc_lnet_routers, + }, + { + INIT_CTL_NAME + .procname = "peers", + .mode = 0644, + .proc_handler = &proc_lnet_peers, + }, + { + INIT_CTL_NAME + .procname = "buffers", + .mode = 0444, + .proc_handler = &proc_lnet_buffers, + }, + { + INIT_CTL_NAME + .procname = "nis", + .mode = 0644, + .proc_handler = &proc_lnet_nis, + }, + { + INIT_CTL_NAME + .procname = "portal_rotor", + .mode = 0644, + .proc_handler = &proc_lnet_portal_rotor, + }, + { .procname = NULL } +}; + +void lnet_router_debugfs_init(void) +{ + lnet_insert_debugfs(lnet_table); +} + +void lnet_router_debugfs_fini(void) +{ + lnet_remove_debugfs(lnet_table); +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/Makefile b/drivers/staging/lustrefsx/lnet/selftest/Makefile new file mode 100644 index 0000000000000..5380812715f7f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += lnet_selftest.o + +lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o +lnet_selftest-y += rpc.o module.o ping_test.o brw_test.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c new file mode 100644 index 0000000000000..a03f6078c0589 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c @@ -0,0 +1,527 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/brw_test.c + * + * Author: Isaac Huang + */ + +#include "selftest.h" + +static int brw_srv_workitems = SFW_TEST_WI_MAX; +module_param(brw_srv_workitems, int, 0644); +MODULE_PARM_DESC(brw_srv_workitems, "# BRW server workitems"); + +static int brw_inject_errors; +module_param(brw_inject_errors, int, 0644); +MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by default"); + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +static void +brw_client_fini(struct sfw_test_instance *tsi) +{ + struct srpc_bulk *bulk; + struct sfw_test_unit *tsu; + + LASSERT(tsi->tsi_is_client); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = tsu->tsu_private; + if (bulk == NULL) + continue; + + srpc_free_bulk(bulk); + tsu->tsu_private = NULL; + } +} + +static int +brw_client_init(struct sfw_test_instance *tsi) +{ + struct sfw_session *sn = tsi->tsi_batch->bat_session; + int flags; + int off; + int npg; + int len; + int opc; + struct srpc_bulk *bulk; + struct sfw_test_unit *tsu; + + LASSERT(sn != NULL); + LASSERT(tsi->tsi_is_client); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + /* NB: this is not going to work for variable page size, + * but we have to keep it for compatibility */ + len = npg * PAGE_SIZE; + off = 0; + + } else { + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + off = breq->blk_offset & ~PAGE_MASK; + npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + if (off % BRW_MSIZE != 0) + return -EINVAL; + + if (npg > LNET_MAX_IOV || npg <= 0) + return -EINVAL; + + if (opc != LST_BRW_READ && opc != LST_BRW_WRITE) + return -EINVAL; + + if (flags != LST_BRW_CHECK_NONE && + flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE) + return -EINVAL; + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid, NULL), + off, npg, len, opc == LST_BRW_READ); + if (bulk == NULL) { + brw_client_fini(tsi); + return -ENOMEM; + } + + tsu->tsu_private = bulk; + } + + return 0; +} + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +static int brw_inject_one_error(void) +{ + struct timespec64 ts; + + if (brw_inject_errors <= 0) return 0; + + ktime_get_ts64(&ts); + + if 
(((ts.tv_nsec / NSEC_PER_USEC) & 1) == 0) + return 0; + + return brw_inject_errors--; +} + +static void +brw_fill_page(struct page *pg, int off, int len, int pattern, __u64 magic) +{ + char *addr = page_address(pg) + off; + int i; + + LASSERT(addr != NULL); + LASSERT(off % BRW_MSIZE == 0 && len % BRW_MSIZE == 0); + + if (pattern == LST_BRW_CHECK_NONE) + return; + + if (magic == BRW_MAGIC) + magic += brw_inject_one_error(); + + if (pattern == LST_BRW_CHECK_SIMPLE) { + memcpy(addr, &magic, BRW_MSIZE); + if (len > BRW_MSIZE) { + addr += len - BRW_MSIZE; + memcpy(addr, &magic, BRW_MSIZE); + } + return; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < len; i += BRW_MSIZE) + memcpy(addr + i, &magic, BRW_MSIZE); + return; + } + LBUG(); +} + +static int +brw_check_page(struct page *pg, int off, int len, int pattern, __u64 magic) +{ + char *addr = page_address(pg) + off; + __u64 data = 0; /* make compiler happy */ + int i; + + LASSERT(addr != NULL); + LASSERT(off % BRW_MSIZE == 0 && len % BRW_MSIZE == 0); + + if (pattern == LST_BRW_CHECK_NONE) + return 0; + + if (pattern == LST_BRW_CHECK_SIMPLE) { + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + + if (len > BRW_MSIZE) { + addr += len - BRW_MSIZE; + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + } + return 0; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < len; i += BRW_MSIZE) { + data = *(__u64 *)(addr + i); + if (data != magic) + goto bad_data; + } + return 0; + } + + LBUG(); + +bad_data: + CERROR ("Bad data in page %p: %#llx, %#llx expected\n", + pg, data, magic); + return 1; +} + +static void +brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + int off; + int len; + + pg = bk->bk_iovs[i].kiov_page; + off = bk->bk_iovs[i].kiov_offset; + len = bk->bk_iovs[i].kiov_len; + brw_fill_page(pg, off, len, pattern, magic); + } +} + +static int +brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + int off; + int len; + + pg = bk->bk_iovs[i].kiov_page; + off = bk->bk_iovs[i].kiov_offset; + len = bk->bk_iovs[i].kiov_len; + if (brw_check_page(pg, off, len, pattern, magic) != 0) { + CERROR("Bulk page %p (%d/%d) is corrupted!\n", + pg, i, bk->bk_niov); + return 1; + } + } + + return 0; +} + +static int +brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpcpp) +{ + struct srpc_bulk *bulk = tsu->tsu_private; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_client_rpc *rpc; + struct srpc_brw_reqst *req; + int flags; + int npg; + int len; + int opc; + int rc; + + LASSERT(sn != NULL); + LASSERT(bulk != NULL); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + len = npg * PAGE_SIZE; + + } else { + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; + int off; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + off = breq->blk_offset; + npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc); + if (rc != 0) + return rc; + + 
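/* copy the preallocated bulk descriptor (header plus npg iov entries)
+	 * into the new RPC, then pre-fill its pages: the test pattern for
+	 * writes, poison for reads so the returned data can be verified */
+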
memcpy(&rpc->crpc_bulk, bulk, offsetof(struct srpc_bulk, bk_iovs[npg])); + if (opc == LST_BRW_WRITE) + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); + else + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON); + + req = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + req->brw_flags = flags; + req->brw_rw = opc; + req->brw_len = len; + + *rpcpp = rpc; + return 0; +} + +static void +brw_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) +{ + __u64 magic = BRW_MAGIC; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_msg *msg = &rpc->crpc_replymsg; + struct srpc_brw_reply *reply = &msg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + CERROR("BRW RPC to %s failed with %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_brw_errors); + return; + } + + if (msg->msg_magic != SRPC_MSG_MAGIC) { + __swab64s(&magic); + __swab32s(&reply->brw_status); + } + + CDEBUG(reply->brw_status ? D_WARNING : D_NET, + "BRW RPC to %s finished with brw_status: %d\n", + libcfs_id2str(rpc->crpc_dest), reply->brw_status); + + if (reply->brw_status != 0) { + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -(int)reply->brw_status; + return; + } + + if (reqst->brw_rw == LST_BRW_WRITE) + return; + + if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->crpc_dest)); + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -EBADMSG; + } + + return; +} + +static void +brw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_bulk *blk = rpc->srpc_bulk; + + if (blk == NULL) + return; + + if (rpc->srpc_status != 0) + CERROR("Bulk transfer %s %s has failed: %d\n", + blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer), rpc->srpc_status); + else + CDEBUG(D_NET, "Transferred %d pages bulk data %s %s\n", + blk->bk_niov, blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer)); + + sfw_free_pages(rpc); +} + +static int +brw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + __u64 magic = BRW_MAGIC; + struct srpc_brw_reply *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + struct srpc_brw_reqst *reqst; + struct srpc_msg *reqstmsg; + + LASSERT (rpc->srpc_bulk != NULL); + LASSERT (rpc->srpc_reqstbuf != NULL); + + reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + reqst = &reqstmsg->msg_body.brw_reqst; + + if (status != 0) { + CERROR ("BRW bulk %s failed for RPC from %s: %d\n", + reqst->brw_rw == LST_BRW_READ ? 
"READ" : "WRITE", + libcfs_id2str(rpc->srpc_peer), status); + return -EIO; + } + + if (reqst->brw_rw == LST_BRW_READ) + return 0; + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) + __swab64s(&magic); + + if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR ("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->srpc_peer)); + reply->brw_status = EBADMSG; + } + + return 0; +} + +static int +brw_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *replymsg = &rpc->srpc_replymsg; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_brw_reply *reply = &replymsg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; + + LASSERT (sv->sv_id == SRPC_SERVICE_BRW); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&reqst->brw_rw); + __swab32s(&reqst->brw_len); + __swab32s(&reqst->brw_flags); + __swab64s(&reqst->brw_rpyid); + __swab64s(&reqst->brw_bulkid); + } + LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id)); + + reply->brw_status = 0; + rpc->srpc_done = brw_server_rpc_done; + + if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) || + (reqst->brw_flags != LST_BRW_CHECK_NONE && + reqst->brw_flags != LST_BRW_CHECK_FULL && + reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) { + reply->brw_status = EINVAL; + return 0; + } + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + reply->brw_status = EPROTO; + return 0; + } + + if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + /* compat with old version */ + if ((reqst->brw_len & ~PAGE_MASK) != 0) { + reply->brw_status = EINVAL; + return 0; + } + npg = reqst->brw_len >> PAGE_SHIFT; + + } else { + npg = (reqst->brw_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) { + reply->brw_status = EINVAL; + return 0; + } + + rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg, + reqst->brw_len, + reqst->brw_rw == LST_BRW_WRITE); + if (rc != 0) + return rc; + + if (reqst->brw_rw == LST_BRW_READ) + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC); + else + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON); + + return 0; +} + +struct sfw_test_client_ops brw_test_client; + +void brw_init_test_client(void) +{ + brw_test_client.tso_init = brw_client_init; + brw_test_client.tso_fini = brw_client_fini; + brw_test_client.tso_prep_rpc = brw_client_prep_rpc; + brw_test_client.tso_done_rpc = brw_client_done_rpc; +}; + +struct srpc_service brw_test_service; + +void brw_init_test_service(void) +{ + brw_test_service.sv_id = SRPC_SERVICE_BRW; + brw_test_service.sv_name = "brw_test"; + brw_test_service.sv_handler = brw_server_handle; + brw_test_service.sv_bulk_ready = brw_bulk_ready; + brw_test_service.sv_wi_total = brw_srv_workitems; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c new file mode 100644 index 0000000000000..189435b4375f9 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -0,0 +1,930 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * IOC handle in kernel + * + * Author: Liang Zhen + */ + +#include +#include +#include "console.h" + +static int +lst_session_new_ioctl(struct lstio_session_new_args *args) +{ + char *name; + int rc; + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_ses_namep, + args->lstio_ses_nmlen)) { + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_ses_nmlen] = 0; + + rc = lstcon_session_new(name, + args->lstio_ses_key, + args->lstio_ses_feats, + args->lstio_ses_timeout, + args->lstio_ses_force, + args->lstio_ses_idp); + + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return rc; +} + +static int +lst_session_end_ioctl(struct lstio_session_end_args *args) +{ + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; + + return lstcon_session_end(); +} + +static int +lst_session_info_ioctl(struct lstio_session_info_args *args) +{ + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for ouput sid */ + args->lstio_ses_keyp == NULL || /* address for ouput key */ + args->lstio_ses_featp == NULL || /* address for ouput features */ + args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ + args->lstio_ses_namep == NULL || /* address for ouput name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, + args->lstio_ses_featp, + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); +} + +static int +lst_debug_ioctl(struct lstio_debug_args *args) +{ + char *name = NULL; + int client = 1; + int rc; + + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_dbg_namep, + args->lstio_dbg_nmlen)) { + 
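+ /* copying the batch/group name from user space failed: release the temporary buffer before returning */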
LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_dbg_nmlen] = 0; + } + + rc = -EINVAL; + + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; + + case LST_OPC_BATCHSRV: + client = 0; + fallthrough; + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; + + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; + + case LST_OPC_GROUP: + if (name == NULL) + goto out; + + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; + + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; + + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; + + default: + break; + } + +out: + if (name != NULL) + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return rc; +} + +static int +lst_group_add_ioctl(struct lstio_group_add_args *args) +{ + char *name; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_add(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_del_ioctl(struct lstio_group_del_args *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_del(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_update_ioctl(struct lstio_group_update_args *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return 
rc; +} + +static int +lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) +{ + unsigned int feats; + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || + args->lstio_grp_featp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_nodes_add(name, args->lstio_grp_count, + args->lstio_grp_idsp, &feats, + args->lstio_grp_resultp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + if (rc == 0 && + copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) { + return -EINVAL; + } + + return rc; +} + +static int +lst_group_list_ioctl(struct lstio_group_list_args *args) +{ + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); +} + +static int +lst_group_info_ioctl(struct lstio_group_info_args *args) +{ + char *name; + int ndent; + int index; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&ndent, args->lstio_grp_ndentp, + sizeof(ndent)) || + copy_from_user(&index, args->lstio_grp_idxp, + sizeof(index))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_grp_dentsp != NULL && + (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent)))) + return -EFAULT; + + return 0; +} + +static int +lst_batch_add_ioctl(struct lstio_batch_add_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + 
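+ /* NUL-terminate the user-supplied batch name before registering it */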
name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_add(name); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_run_ioctl(struct lstio_batch_run_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_query_ioctl(struct lstio_batch_query_args *args) +{ + char *name; + int rc; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_testidx < 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_list_ioctl(struct lstio_batch_list_args *args) +{ + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); +} + +static int +lst_batch_info_ioctl(struct lstio_batch_info_args *args) +{ + char *name; + int rc; + int index; + int ndent; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + 
args->lstio_bat_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&index, args->lstio_bat_idxp, + sizeof(index)) || + copy_from_user(&ndent, args->lstio_bat_ndentp, + sizeof(ndent))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_bat_dentsp != NULL && + (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + return rc; +} + +static int +lst_stat_query_ioctl(struct lstio_stat_args *args) +{ + int rc; + char *name = NULL; + + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_sta_resultp == NULL) + return -EINVAL; + + if (args->lstio_sta_idsp != NULL) { + if (args->lstio_sta_count <= 0) + return -EINVAL; + + rc = lstcon_nodes_stat(args->lstio_sta_count, + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); + } else if (args->lstio_sta_namep != NULL) { + if (args->lstio_sta_nmlen <= 0 || + args->lstio_sta_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + rc = copy_from_user(name, args->lstio_sta_namep, + args->lstio_sta_nmlen); + if (rc == 0) + rc = lstcon_group_stat(name, args->lstio_sta_timeout, + args->lstio_sta_resultp); + else + rc = -EFAULT; + + } else { + rc = -EINVAL; + } + + if (name != NULL) + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + return rc; +} + +static int lst_test_add_ioctl(struct lstio_test_args *args) +{ + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; + + if (args->lstio_tes_resultp == NULL || + args->lstio_tes_retp == NULL || + args->lstio_tes_bat_name == NULL || /* no specified batch */ + args->lstio_tes_bat_nmlen <= 0 || + args->lstio_tes_bat_nmlen > LST_NAME_SIZE || + args->lstio_tes_sgrp_name == NULL || /* no source group */ + args->lstio_tes_sgrp_nmlen <= 0 || + args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE || + args->lstio_tes_dgrp_name == NULL || /* no target group */ + args->lstio_tes_dgrp_nmlen <= 0 || + args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_tes_loop == 0 || /* negative is infinite */ + args->lstio_tes_concur <= 0 || + args->lstio_tes_dist <= 0 || + args->lstio_tes_span <= 0) + return -EINVAL; + + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || + args->lstio_tes_param_len > + PAGE_SIZE - sizeof(struct lstcon_test))) + return -EINVAL; + + LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); + if (batch_name == NULL) + return rc; + + LIBCFS_ALLOC(src_name, args->lstio_tes_sgrp_nmlen + 1); + if (src_name == NULL) + goto out; + + LIBCFS_ALLOC(dst_name, args->lstio_tes_dgrp_nmlen + 1); + if (dst_name == NULL) + goto out; + + if (args->lstio_tes_param != NULL) { + LIBCFS_ALLOC(param, args->lstio_tes_param_len); + if (param == 
NULL) + goto out; + if (copy_from_user(param, args->lstio_tes_param, + args->lstio_tes_param_len)) { + rc = -EFAULT; + goto out; + } + } + + rc = -EFAULT; + if (copy_from_user(batch_name, args->lstio_tes_bat_name, + args->lstio_tes_bat_nmlen) || + copy_from_user(src_name, args->lstio_tes_sgrp_name, + args->lstio_tes_sgrp_nmlen) || + copy_from_user(dst_name, args->lstio_tes_dgrp_name, + args->lstio_tes_dgrp_nmlen)) + goto out; + + rc = lstcon_test_add(batch_name, + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) + rc = (copy_to_user(args->lstio_tes_retp, &ret, + sizeof(ret))) ? -EFAULT : 0; +out: + if (batch_name != NULL) + LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); + + if (src_name != NULL) + LIBCFS_FREE(src_name, args->lstio_tes_sgrp_nmlen + 1); + + if (dst_name != NULL) + LIBCFS_FREE(dst_name, args->lstio_tes_dgrp_nmlen + 1); + + if (param != NULL) + LIBCFS_FREE(param, args->lstio_tes_param_len); + + return rc; +} + +int +lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata) +{ + struct libcfs_ioctl_hdr *hdr = vdata; + struct libcfs_ioctl_data *data; + char *buf = NULL; + int rc = -EINVAL; + int opc; + + if (cmd != IOC_LIBCFS_LNETST) + goto err; + + data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); + + opc = data->ioc_u32[0]; + + if (data->ioc_plen1 > PAGE_SIZE) + goto err; + + LIBCFS_ALLOC(buf, data->ioc_plen1); + if (buf == NULL) { + rc = -ENOMEM; + goto err; + } + + /* copy in parameter */ + if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { + rc = -EFAULT; + goto out_free_buf; + } + + mutex_lock(&console_session.ses_mutex); + + console_session.ses_laststamp = ktime_get_real_seconds(); + + if (console_session.ses_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + if (console_session.ses_expired) + lstcon_session_end(); + + if (opc != LSTIO_SESSION_NEW && + console_session.ses_state == LST_SESSION_NONE) { + CDEBUG(D_NET, "LST no active session\n"); + rc = -ESRCH; + goto out; + } + + memset(&console_session.ses_trans_stat, 0, + sizeof(struct lstcon_trans_stat)); + + switch (opc) { + case LSTIO_SESSION_NEW: + rc = lst_session_new_ioctl((struct lstio_session_new_args *)buf); + break; + case LSTIO_SESSION_END: + rc = lst_session_end_ioctl((struct lstio_session_end_args *)buf); + break; + case LSTIO_SESSION_INFO: + rc = lst_session_info_ioctl((struct lstio_session_info_args *)buf); + break; + case LSTIO_DEBUG: + rc = lst_debug_ioctl((struct lstio_debug_args *)buf); + break; + case LSTIO_GROUP_ADD: + rc = lst_group_add_ioctl((struct lstio_group_add_args *)buf); + break; + case LSTIO_GROUP_DEL: + rc = lst_group_del_ioctl((struct lstio_group_del_args *)buf); + break; + case LSTIO_GROUP_UPDATE: + rc = lst_group_update_ioctl((struct lstio_group_update_args *)buf); + break; + case LSTIO_NODES_ADD: + rc = lst_nodes_add_ioctl((struct lstio_group_nodes_args *)buf); + break; + case LSTIO_GROUP_LIST: + rc = lst_group_list_ioctl((struct lstio_group_list_args *)buf); + break; + case LSTIO_GROUP_INFO: + rc = lst_group_info_ioctl((struct lstio_group_info_args *)buf); + break; + case LSTIO_BATCH_ADD: + rc = lst_batch_add_ioctl((struct lstio_batch_add_args *)buf); + break; + case LSTIO_BATCH_START: + rc = lst_batch_run_ioctl((struct lstio_batch_run_args *)buf); + break; + case LSTIO_BATCH_STOP: + rc = lst_batch_stop_ioctl((struct lstio_batch_stop_args *)buf); + break; + 
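+ /* the remaining opcodes follow the same pattern: cast the copied-in ioctl buffer to the opcode-specific args structure and dispatch */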
case LSTIO_BATCH_QUERY: + rc = lst_batch_query_ioctl((struct lstio_batch_query_args *)buf); + break; + case LSTIO_BATCH_LIST: + rc = lst_batch_list_ioctl((struct lstio_batch_list_args *)buf); + break; + case LSTIO_BATCH_INFO: + rc = lst_batch_info_ioctl((struct lstio_batch_info_args *)buf); + break; + case LSTIO_TEST_ADD: + rc = lst_test_add_ioctl((struct lstio_test_args *)buf); + break; + case LSTIO_STAT_QUERY: + rc = lst_stat_query_ioctl((struct lstio_stat_args *)buf); + break; + default: + rc = -EINVAL; + goto out; + } + + if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, + sizeof(struct lstcon_trans_stat))) + rc = -EFAULT; +out: + mutex_unlock(&console_session.ses_mutex); +out_free_buf: + LIBCFS_FREE(buf, data->ioc_plen1); +err: + return notifier_from_ioctl_errno(rc); +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c new file mode 100644 index 0000000000000..b39756f724a2a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -0,0 +1,1407 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * Console framework rpcs + * + * Author: Liang Zhen + */ + + +#include +#include +#include "timer.h" +#include "conrpc.h" +#include "console.h" + +void lstcon_rpc_stat_reply(struct lstcon_rpc_trans *, struct srpc_msg *, + struct lstcon_node *, struct lstcon_trans_stat *); + +static void +lstcon_rpc_done(struct srpc_client_rpc *rpc) +{ + struct lstcon_rpc *crpc = rpc->crpc_priv; + + LASSERT(crpc != NULL && rpc == crpc->crp_rpc); + LASSERT(crpc->crp_posted && !crpc->crp_finished); + + spin_lock(&rpc->crpc_lock); + + if (crpc->crp_trans == NULL) { + /* Orphan RPC is not in any transaction, + * I'm just a poor body and nobody loves me */ + spin_unlock(&rpc->crpc_lock); + + /* release it */ + lstcon_rpc_put(crpc); + return; + } + + /* not an orphan RPC */ + crpc->crp_finished = 1; + + if (crpc->crp_stamp_ns == 0) { + /* not aborted */ + LASSERT(crpc->crp_status == 0); + + crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_status = rpc->crpc_status; + } + + /* wakeup (transaction)thread if I'm the last RPC in the transaction */ + if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining)) + wake_up(&crpc->crp_trans->tas_waitq); + + spin_unlock(&rpc->crpc_lock); +} + +static int +lstcon_rpc_init(struct lstcon_node *nd, int service, unsigned int feats, + int bulk_npg, int bulk_len, int embedded, + struct lstcon_rpc *crpc) +{ + memset(crpc, 0, sizeof(*crpc)); + + crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, + feats, bulk_npg, bulk_len, + lstcon_rpc_done, (void *)crpc); + if (crpc->crp_rpc == NULL) + return -ENOMEM; + + crpc->crp_node = nd; + crpc->crp_embedded = embedded; + INIT_LIST_HEAD(&crpc->crp_link); + + atomic_inc(&console_session.ses_rpc_counter); + + return 0; +} + +static int +lstcon_rpc_prep(struct lstcon_node *nd, int service, unsigned int feats, + int bulk_npg, int bulk_len, struct lstcon_rpc **crpcpp) +{ + struct lstcon_rpc *crpc = NULL; + int rc; + + spin_lock(&console_session.ses_rpc_lock); + + if (!list_empty(&console_session.ses_rpc_freelist)) { + crpc = list_entry(console_session.ses_rpc_freelist.next, + struct lstcon_rpc, crp_link); + list_del_init(&crpc->crp_link); + } + + spin_unlock(&console_session.ses_rpc_lock); + + if (crpc == NULL) { + LIBCFS_ALLOC(crpc, sizeof(*crpc)); + if (crpc == NULL) + return -ENOMEM; + } + + rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc); + if (rc == 0) { + *crpcpp = crpc; + return 0; + } + + LIBCFS_FREE(crpc, sizeof(*crpc)); + + return rc; +} + +void +lstcon_rpc_put(struct lstcon_rpc *crpc) +{ + struct srpc_bulk *bulk = &crpc->crp_rpc->crpc_bulk; + int i; + + LASSERT(list_empty(&crpc->crp_link)); + + for (i = 0; i < bulk->bk_niov; i++) { + if (bulk->bk_iovs[i].kiov_page == NULL) + continue; + + __free_page(bulk->bk_iovs[i].kiov_page); + } + + srpc_client_rpc_decref(crpc->crp_rpc); + + if (crpc->crp_embedded) { + /* embedded RPC, don't recycle it */ + memset(crpc, 0, sizeof(*crpc)); + crpc->crp_embedded = 1; + + } else { + spin_lock(&console_session.ses_rpc_lock); + + list_add(&crpc->crp_link, + &console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + } + + /* RPC is not alive now */ + atomic_dec(&console_session.ses_rpc_counter); +} + +static void +lstcon_rpc_post(struct lstcon_rpc *crpc) +{ + struct lstcon_rpc_trans *trans = crpc->crp_trans; + + LASSERT (trans != NULL); + + atomic_inc(&trans->tas_remaining); + crpc->crp_posted = 1; + + sfw_post_rpc(crpc->crp_rpc); +} + +static char * +lstcon_rpc_trans_name(int transop) +{ + if (transop == LST_TRANS_SESNEW) 
+ return "SESNEW"; + + if (transop == LST_TRANS_SESEND) + return "SESEND"; + + if (transop == LST_TRANS_SESQRY) + return "SESQRY"; + + if (transop == LST_TRANS_SESPING) + return "SESPING"; + + if (transop == LST_TRANS_TSBCLIADD) + return "TSBCLIADD"; + + if (transop == LST_TRANS_TSBSRVADD) + return "TSBSRVADD"; + + if (transop == LST_TRANS_TSBRUN) + return "TSBRUN"; + + if (transop == LST_TRANS_TSBSTOP) + return "TSBSTOP"; + + if (transop == LST_TRANS_TSBCLIQRY) + return "TSBCLIQRY"; + + if (transop == LST_TRANS_TSBSRVQRY) + return "TSBSRVQRY"; + + if (transop == LST_TRANS_STATQRY) + return "STATQRY"; + + return "Unknown"; +} + +int +lstcon_rpc_trans_prep(struct list_head *translist, int transop, + struct lstcon_rpc_trans **transpp) +{ + struct lstcon_rpc_trans *trans; + + if (translist != NULL) { + list_for_each_entry(trans, translist, tas_link) { + /* Can't enqueue two private transaction on + * the same object */ + if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE) + return -EPERM; + } + } + + /* create a trans group */ + LIBCFS_ALLOC(trans, sizeof(*trans)); + if (trans == NULL) + return -ENOMEM; + + trans->tas_opc = transop; + + if (translist == NULL) + INIT_LIST_HEAD(&trans->tas_olink); + else + list_add_tail(&trans->tas_olink, translist); + + list_add_tail(&trans->tas_link, &console_session.ses_trans_list); + + INIT_LIST_HEAD(&trans->tas_rpcs_list); + atomic_set(&trans->tas_remaining, 0); + init_waitqueue_head(&trans->tas_waitq); + + spin_lock(&console_session.ses_rpc_lock); + trans->tas_features = console_session.ses_features; + spin_unlock(&console_session.ses_rpc_lock); + + *transpp = trans; + return 0; +} + +void +lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, struct lstcon_rpc *crpc) +{ + list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); + crpc->crp_trans = trans; +} + +void +lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) +{ + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_node *nd; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + if (!crpc->crp_posted || /* not posted */ + crpc->crp_stamp_ns != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp_ns == 0) { + crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_status = -EINTR; + } + spin_unlock(&rpc->crpc_lock); + continue; + } + + crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_status = error; + + spin_unlock(&rpc->crpc_lock); + + sfw_abort_rpc(rpc); + + if (error != -ETIMEDOUT) + continue; + + nd = crpc->crp_node; + if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + continue; + + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + nd->nd_state = LST_NODE_DOWN; + } +} + +static int +lstcon_rpc_trans_check(struct lstcon_rpc_trans *trans) +{ + if (console_session.ses_shutdown && + !list_empty(&trans->tas_olink)) /* Not an end session RPC */ + return 1; + + return (atomic_read(&trans->tas_remaining) == 0) ? 
1: 0; +} + +int +lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout) +{ + struct lstcon_rpc *crpc; + int rc; + + if (list_empty(&trans->tas_rpcs_list)) + return 0; + + if (timeout < LST_TRANS_MIN_TIMEOUT) + timeout = LST_TRANS_MIN_TIMEOUT; + + CDEBUG(D_NET, "Transaction %s started\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + /* post all requests */ + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + LASSERT(!crpc->crp_posted); + + lstcon_rpc_post(crpc); + } + + mutex_unlock(&console_session.ses_mutex); + + rc = wait_event_interruptible_timeout(trans->tas_waitq, + lstcon_rpc_trans_check(trans), + cfs_time_seconds(timeout)); + + rc = (rc > 0)? 0: ((rc < 0)? -EINTR: -ETIMEDOUT); + + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown) + rc = -ESHUTDOWN; + + if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) { + /* treat short timeout as canceled */ + if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2) + rc = -EINTR; + + lstcon_rpc_trans_abort(trans, rc); + } + + CDEBUG(D_NET, "Transaction %s stopped: %d\n", + lstcon_rpc_trans_name(trans->tas_opc), rc); + + lstcon_rpc_trans_stat(trans, lstcon_trans_stat()); + + return rc; +} + +static int +lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) +{ + struct lstcon_node *nd = crpc->crp_node; + struct srpc_client_rpc *rpc = crpc->crp_rpc; + struct srpc_generic_reply *rep; + + LASSERT(nd != NULL && rpc != NULL); + LASSERT(crpc->crp_stamp_ns != 0); + + if (crpc->crp_status != 0) { + *msgpp = NULL; + return crpc->crp_status; + } + + *msgpp = &rpc->crpc_replymsg; + if (!crpc->crp_unpacked) { + sfw_unpack_message(*msgpp); + crpc->crp_unpacked = 1; + } + + if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + return 0; + + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + rep = &(*msgpp)->msg_body.reply; + + if (rep->sid.ses_nid == LNET_NID_ANY) + nd->nd_state = LST_NODE_UNKNOWN; + else if (lstcon_session_match(rep->sid)) + nd->nd_state = LST_NODE_ACTIVE; + else + nd->nd_state = LST_NODE_BUSY; + + return 0; +} + +void +lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + struct lstcon_trans_stat *stat) +{ + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + int error; + + LASSERT(stat != NULL); + + memset(stat, 0, sizeof(*stat)); + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + lstcon_rpc_stat_total(stat, 1); + + LASSERT(crpc->crp_stamp_ns != 0); + + error = lstcon_rpc_get_reply(crpc, &rep); + if (error != 0) { + lstcon_rpc_stat_failure(stat, 1); + if (stat->trs_rpc_errno == 0) + stat->trs_rpc_errno = -error; + + continue; + } + + lstcon_rpc_stat_success(stat, 1); + + lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat); + } + + if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) { + stat->trs_fwk_errno = + lstcon_session_feats_check(trans->tas_features); + } + + CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, " + "RPC error(%d), Framework error(%d)\n", + lstcon_rpc_trans_name(trans->tas_opc), + lstcon_rpc_stat_success(stat, 0), + lstcon_rpc_stat_failure(stat, 0), + lstcon_rpc_stat_total(stat, 0), + stat->trs_rpc_errno, stat->trs_fwk_errno); + + return; +} + +int +lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, + struct list_head __user *head_up, + lstcon_rpc_readent_func_t readent) +{ + struct list_head tmp; + struct list_head __user *next; + struct lstcon_rpc_ent *ent; + struct srpc_generic_reply *rep; + struct lstcon_rpc *crpc; + struct srpc_msg *msg; + struct lstcon_node *nd; + struct 
timespec64 ts; + int error; + s64 dur; + + LASSERT(head_up != NULL); + + next = head_up; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + if (copy_from_user(&tmp, next, + sizeof(struct list_head))) + return -EFAULT; + + if (tmp.next == head_up) + return 0; + + next = tmp.next; + + ent = list_entry(next, struct lstcon_rpc_ent, rpe_link); + + LASSERT(crpc->crp_stamp_ns != 0); + + error = lstcon_rpc_get_reply(crpc, &msg); + + nd = crpc->crp_node; + + dur = crpc->crp_stamp_ns - + console_session.ses_id.ses_stamp * NSEC_PER_MSEC; + ts = ns_to_timespec64(dur); + + if (copy_to_user(&ent->rpe_peer, + &nd->nd_id, sizeof(struct lnet_process_id)) || + copy_to_user(&ent->rpe_stamp, &ts, sizeof(ts)) || + copy_to_user(&ent->rpe_state, + &nd->nd_state, sizeof(nd->nd_state)) || + copy_to_user(&ent->rpe_rpc_errno, &error, + sizeof(error))) + return -EFAULT; + + if (error != 0) + continue; + + /* RPC is done */ + rep = (struct srpc_generic_reply *)&msg->msg_body.reply; + + if (copy_to_user(&ent->rpe_sid, + &rep->sid, sizeof(rep->sid)) || + copy_to_user(&ent->rpe_fwk_errno, + &rep->status, sizeof(rep->status))) + return -EFAULT; + + if (readent == NULL) + continue; + + error = readent(trans->tas_opc, msg, ent); + if (error != 0) + return error; + } + + return 0; +} + +void +lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans) +{ + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_rpc *tmp; + int count = 0; + + list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + /* free it if not posted or finished already */ + if (!crpc->crp_posted || crpc->crp_finished) { + spin_unlock(&rpc->crpc_lock); + + list_del_init(&crpc->crp_link); + lstcon_rpc_put(crpc); + + continue; + } + + /* rpcs can be still not callbacked (even LNetMDUnlink is + * called) because huge timeout for inaccessible network, + * don't make user wait for them, just abandon them, they + * will be recycled in callback */ + + LASSERT(crpc->crp_status != 0); + + crpc->crp_node = NULL; + crpc->crp_trans = NULL; + list_del_init(&crpc->crp_link); + count++; + + spin_unlock(&rpc->crpc_lock); + + atomic_dec(&trans->tas_remaining); + } + + LASSERT(atomic_read(&trans->tas_remaining) == 0); + + list_del(&trans->tas_link); + if (!list_empty(&trans->tas_olink)) + list_del(&trans->tas_olink); + + CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n", + lstcon_rpc_trans_name(trans->tas_opc), count); + + LIBCFS_FREE(trans, sizeof(*trans)); + + return; +} + +int +lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned int feats, struct lstcon_rpc **crpc) +{ + struct srpc_mksn_reqst *msrq; + struct srpc_rmsn_reqst *rsrq; + int rc; + + switch (transop) { + case LST_TRANS_SESNEW: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst; + msrq->mksn_sid = console_session.ses_id; + msrq->mksn_force = console_session.ses_force; + strlcpy(msrq->mksn_name, console_session.ses_name, + sizeof(msrq->mksn_name)); + break; + + case LST_TRANS_SESEND: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst; + rsrq->rmsn_sid = console_session.ses_id; + break; + + default: + LBUG(); + } + + return 0; +} + +int +lstcon_dbgrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) +{ + struct 
srpc_debug_reqst *drq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + return rc; +} + +int +lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc) +{ + struct lstcon_batch *batch; + struct srpc_batch_reqst *brq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst; + + brq->bar_sid = console_session.ses_id; + brq->bar_bid = tsb->tsb_id; + brq->bar_testidx = tsb->tsb_index; + brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN : + (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP: + SRPC_BATCH_OPC_QUERY); + + if (transop != LST_TRANS_TSBRUN && + transop != LST_TRANS_TSBSTOP) + return 0; + + LASSERT (tsb->tsb_index == 0); + + batch = (struct lstcon_batch *)tsb; + brq->bar_arg = batch->bat_arg; + + return 0; +} + +int +lstcon_statrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) +{ + struct srpc_stat_reqst *srq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst; + + srq->str_sid = console_session.ses_id; + srq->str_type = 0; /* XXX remove it */ + + return 0; +} + +static struct lnet_process_id_packed * +lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) +{ + struct lnet_process_id_packed *pid; + int i; + + i = idx / SFW_ID_PER_PAGE; + + LASSERT (i < nkiov); + + pid = (struct lnet_process_id_packed *)page_address(kiov[i].kiov_page); + + return &pid[idx % SFW_ID_PER_PAGE]; +} + +static int +lstcon_dstnodes_prep(struct lstcon_group *grp, int idx, + int dist, int span, int nkiov, lnet_kiov_t *kiov) +{ + struct lnet_process_id_packed *pid; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int start; + int end; + int i = 0; + + LASSERT (dist >= 1); + LASSERT (span >= 1); + LASSERT (grp->grp_nnode >= 1); + + if (span > grp->grp_nnode) + return -EINVAL; + + start = ((idx / dist) * span) % grp->grp_nnode; + end = ((idx / dist) * span + span - 1) % grp->grp_nnode; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + nd = ndl->ndl_node; + if (i < start) { + i++; + continue; + } + + if (i > (end >= start ? 
end : grp->grp_nnode)) + break; + + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + if (start <= end) /* done */ + return 0; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + if (i > grp->grp_nnode + end) + break; + + nd = ndl->ndl_node; + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + return 0; +} + +static int +lstcon_pingrpc_prep(struct lst_test_ping_param *param, + struct srpc_test_reqst *req) +{ + struct test_ping_req *prq = &req->tsr_u.ping; + + prq->png_size = param->png_size; + prq->png_flags = param->png_flags; + /* TODO dest */ + return 0; +} + +static int +lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, + struct srpc_test_reqst *req) +{ + struct test_bulk_req *brq = &req->tsr_u.bulk_v0; + + brq->blk_opc = param->blk_opc; + brq->blk_npg = (param->blk_size + PAGE_SIZE - 1) / + PAGE_SIZE; + brq->blk_flags = param->blk_flags; + + return 0; +} + +static int +lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, + struct srpc_test_reqst *req) +{ + struct test_bulk_req_v1 *brq = &req->tsr_u.bulk_v1; + + brq->blk_opc = param->blk_opc; + brq->blk_flags = param->blk_flags; + brq->blk_len = param->blk_size; + brq->blk_offset = is_client ? param->blk_cli_off : param->blk_srv_off; + + return 0; +} + +int +lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_test *test, struct lstcon_rpc **crpc) +{ + struct lstcon_group *sgrp = test->tes_src_grp; + struct lstcon_group *dgrp = test->tes_dst_grp; + struct srpc_test_reqst *trq; + struct srpc_bulk *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; + + if (transop == LST_TRANS_TSBCLIADD) { + npg = sfw_id_pages(test->tes_span); + nob = (feats & LST_FEAT_BULK_LEN) == 0 ? + npg * PAGE_SIZE : + sizeof(struct lnet_process_id_packed) * test->tes_span; + } + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc); + if (rc != 0) + return rc; + + trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst; + + if (transop == LST_TRANS_TSBSRVADD) { + int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist; + int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span; + int nmax = (ndist + nspan - 1) / nspan; + + trq->tsr_ndest = 0; + trq->tsr_loop = nmax * test->tes_dist * test->tes_concur; + + } else { + bulk = &(*crpc)->crp_rpc->crpc_bulk; + + for (i = 0; i < npg; i++) { + int len; + + LASSERT(nob > 0); + + len = (feats & LST_FEAT_BULK_LEN) == 0 ? + PAGE_SIZE : min_t(int, nob, PAGE_SIZE); + nob -= len; + + bulk->bk_iovs[i].kiov_offset = 0; + bulk->bk_iovs[i].kiov_len = len; + bulk->bk_iovs[i].kiov_page = + alloc_page(GFP_KERNEL); + + if (bulk->bk_iovs[i].kiov_page == NULL) { + lstcon_rpc_put(*crpc); + return -ENOMEM; + } + } + + bulk->bk_sink = 0; + + LASSERT (transop == LST_TRANS_TSBCLIADD); + + rc = lstcon_dstnodes_prep(test->tes_dst_grp, + test->tes_cliidx++, + test->tes_dist, + test->tes_span, + npg, &bulk->bk_iovs[0]); + if (rc != 0) { + lstcon_rpc_put(*crpc); + return rc; + } + + trq->tsr_ndest = test->tes_span; + trq->tsr_loop = test->tes_loop; + } + + trq->tsr_sid = console_session.ses_id; + trq->tsr_bid = test->tes_hdr.tsb_id; + trq->tsr_concur = test->tes_concur; + trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 
1 : 0; + trq->tsr_stop_onerr = !!test->tes_stop_onerr; + + switch (test->tes_type) { + case LST_TEST_PING: + trq->tsr_service = SRPC_SERVICE_PING; + rc = lstcon_pingrpc_prep((struct lst_test_ping_param *) + &test->tes_param[0], trq); + break; + + case LST_TEST_BULK: + trq->tsr_service = SRPC_SERVICE_BRW; + if ((feats & LST_FEAT_BULK_LEN) == 0) { + rc = lstcon_bulkrpc_v0_prep((struct lst_test_bulk_param *) + &test->tes_param[0], trq); + } else { + rc = lstcon_bulkrpc_v1_prep((struct lst_test_bulk_param *) + &test->tes_param[0], + trq->tsr_is_client, trq); + } + + break; + default: + LBUG(); + break; + } + + return rc; +} + +static int +lstcon_sesnew_stat_reply(struct lstcon_rpc_trans *trans, + struct lstcon_node *nd, struct srpc_msg *reply) +{ + struct srpc_mksn_reply *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; + + if (status == 0 && + (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + mksn_rep->mksn_status = EPROTO; + status = EPROTO; + } + + if (status == EPROTO) { + CNETERR("session protocol error from %s: %u\n", + libcfs_nid2str(nd->nd_id.nid), + reply->msg_ses_feats); + } + + if (status != 0) + return status; + + if (!trans->tas_feats_updated) { + spin_lock(&console_session.ses_rpc_lock); + if (!trans->tas_feats_updated) { /* recheck with lock */ + trans->tas_feats_updated = 1; + trans->tas_features = reply->msg_ses_feats; + } + spin_unlock(&console_session.ses_rpc_lock); + } + + if (reply->msg_ses_feats != trans->tas_features) { + CNETERR("Framework features %x from %s is different with " + "features on this transaction: %x\n", + reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid), + trans->tas_features); + status = mksn_rep->mksn_status = EPROTO; + } + + if (status == 0) { + /* session timeout on remote node */ + nd->nd_timeout = mksn_rep->mksn_timeout; + } + + return status; +} + +void +lstcon_rpc_stat_reply(struct lstcon_rpc_trans *trans, struct srpc_msg *msg, + struct lstcon_node *nd, struct lstcon_trans_stat *stat) +{ + struct srpc_rmsn_reply *rmsn_rep; + struct srpc_debug_reply *dbg_rep; + struct srpc_batch_reply *bat_rep; + struct srpc_test_reply *test_rep; + struct srpc_stat_reply *stat_rep; + int rc = 0; + + switch (trans->tas_opc) { + case LST_TRANS_SESNEW: + rc = lstcon_sesnew_stat_reply(trans, nd, msg); + if (rc == 0) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + break; + + case LST_TRANS_SESEND: + rmsn_rep = &msg->msg_body.rmsn_reply; + /* ESRCH is not an error for end session */ + if (rmsn_rep->rmsn_status == 0 || + rmsn_rep->rmsn_status == ESRCH) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + rc = rmsn_rep->rmsn_status; + break; + + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + dbg_rep = &msg->msg_body.dbg_reply; + + if (dbg_rep->dbg_status == ESRCH) { + lstcon_sesqry_stat_unknown(stat, 1); + return; + } + + if (lstcon_session_match(dbg_rep->dbg_sid)) + lstcon_sesqry_stat_active(stat, 1); + else + lstcon_sesqry_stat_busy(stat, 1); + return; + + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + if (bat_rep->bar_status == EPERM && + trans->tas_opc == LST_TRANS_TSBSTOP) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + bat_rep = &msg->msg_body.bat_reply; + + if 
(bat_rep->bar_active != 0) + lstcon_tsbqry_stat_run(stat, 1); + else + lstcon_tsbqry_stat_idle(stat, 1); + + if (bat_rep->bar_status == 0) + return; + + lstcon_tsbqry_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + test_rep = &msg->msg_body.tes_reply; + + if (test_rep->tsr_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = test_rep->tsr_status; + break; + + case LST_TRANS_STATQRY: + stat_rep = &msg->msg_body.stat_reply; + + if (stat_rep->str_status == 0) { + lstcon_statqry_stat_success(stat, 1); + return; + } + + lstcon_statqry_stat_failure(stat, 1); + rc = stat_rep->str_status; + break; + + default: + LBUG(); + } + + if (stat->trs_fwk_errno == 0) + stat->trs_fwk_errno = rc; + + return; +} + +int +lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + struct lstcon_rpc_trans **transpp) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + struct lstcon_rpc *rpc; + unsigned int feats; + int rc; + + /* Creating session RPG for list of nodes */ + + rc = lstcon_rpc_trans_prep(translist, transop, &trans); + if (rc != 0) { + CERROR("Can't create transaction %d: %d\n", transop, rc); + return rc; + } + + feats = trans->tas_features; + list_for_each_entry(ndl, ndlist, ndl_link) { + rc = condition == NULL ? 1 : + condition(transop, ndl->ndl_node, arg); + + if (rc == 0) + continue; + + if (rc < 0) { + CDEBUG(D_NET, "Condition error while creating RPC " + " for transaction %d: %d\n", transop, rc); + break; + } + + nd = ndl->ndl_node; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc); + break; + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + rc = lstcon_dbgrpc_prep(nd, feats, &rpc); + break; + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + rc = lstcon_testrpc_prep(nd, transop, feats, + (struct lstcon_test *)arg, + &rpc); + break; + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + rc = lstcon_batrpc_prep(nd, transop, feats, + (struct lstcon_tsb_hdr *)arg, + &rpc); + break; + case LST_TRANS_STATQRY: + rc = lstcon_statrpc_prep(nd, feats, &rpc); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) { + CERROR("Failed to create RPC for transaction %s: %d\n", + lstcon_rpc_trans_name(transop), rc); + break; + } + + lstcon_rpc_trans_addreq(trans, rpc); + } + + if (rc == 0) { + *transpp = trans; + return 0; + } + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static void +lstcon_rpc_pinger(void *arg) +{ + struct stt_timer *ptimer = arg; + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + struct srpc_debug_reqst *drq; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int intv; + int count = 0; + int rc; + + /* RPC pinger is a special case of transaction, + * it's called by timer at 8 seconds interval. 
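+ * On each pass it ends the session on all active nodes if the console
+ * has been idle past its timeout; otherwise it re-posts a debug RPC to
+ * every active node that has not replied for more than half of the
+ * node's timeout.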
+ */ + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown || console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + if (!console_session.ses_expired && + ktime_get_real_seconds() - console_session.ses_laststamp > + (time64_t)console_session.ses_timeout) + console_session.ses_expired = 1; + + trans = console_session.ses_ping; + + LASSERT(trans != NULL); + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) { + nd = ndl->ndl_node; + + if (console_session.ses_expired) { + /* idle console, end session on all nodes */ + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND, + trans->tas_features, &crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + continue; + } + + crpc = &nd->nd_ping; + + if (crpc->crp_rpc != NULL) { + LASSERT(crpc->crp_trans == trans); + LASSERT(!list_empty(&crpc->crp_link)); + + spin_lock(&crpc->crp_rpc->crpc_lock); + + LASSERT(crpc->crp_posted); + + if (!crpc->crp_finished) { + /* in flight */ + spin_unlock(&crpc->crp_rpc->crpc_lock); + continue; + } + + spin_unlock(&crpc->crp_rpc->crpc_lock); + + lstcon_rpc_get_reply(crpc, &rep); + + list_del_init(&crpc->crp_link); + + lstcon_rpc_put(crpc); + } + + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + intv = div_u64(ktime_ms_delta(ktime_get(), nd->nd_stamp), + MSEC_PER_SEC); + if (intv < nd->nd_timeout / 2) + continue; + + rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, + trans->tas_features, 0, 0, 1, crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + count++; + } + + if (console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + CDEBUG(D_NET, "Ping %d nodes in session\n", count); + + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; + stt_add_timer(ptimer); + + mutex_unlock(&console_session.ses_mutex); +} + +int +lstcon_rpc_pinger_start(void) +{ + struct stt_timer *ptimer; + int rc; + + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); + + rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING, + &console_session.ses_ping); + if (rc != 0) { + CERROR("Failed to create console pinger\n"); + return rc; + } + + ptimer = &console_session.ses_ping_timer; + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; + + stt_add_timer(ptimer); + + return 0; +} + +void +lstcon_rpc_pinger_stop(void) +{ + LASSERT (console_session.ses_shutdown); + + stt_del_timer(&console_session.ses_ping_timer); + + lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN); + lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat()); + lstcon_rpc_trans_destroy(console_session.ses_ping); + + memset(lstcon_trans_stat(), 0, sizeof(struct lstcon_trans_stat)); + + console_session.ses_ping = NULL; +} + +void +lstcon_rpc_cleanup_wait(void) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct list_head *pacer; + struct list_head zlist; + + /* Called with hold of global mutex */ + + LASSERT(console_session.ses_shutdown); + + while (!list_empty(&console_session.ses_trans_list)) { + list_for_each(pacer, &console_session.ses_trans_list) { + trans = list_entry(pacer, struct lstcon_rpc_trans, + 
tas_link); + + CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + wake_up(&trans->tas_waitq); + } + + mutex_unlock(&console_session.ses_mutex); + + CWARN("Session is shutting down, " + "waiting for termination of transactions\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + mutex_lock(&console_session.ses_mutex); + } + + spin_lock(&console_session.ses_rpc_lock); + + lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0), + console_session.ses_rpc_lock, + "Network is not accessable or target is down, " + "waiting for %d console RPCs to being recycled\n", + atomic_read(&console_session.ses_rpc_counter)); + + list_add(&zlist, &console_session.ses_rpc_freelist); + list_del_init(&console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + + while (!list_empty(&zlist)) { + crpc = list_entry(zlist.next, struct lstcon_rpc, crp_link); + + list_del(&crpc->crp_link); + LIBCFS_FREE(crpc, sizeof(*crpc)); + } +} + +int +lstcon_rpc_module_init(void) +{ + INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list); + console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger; + console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer; + + console_session.ses_ping = NULL; + + spin_lock_init(&console_session.ses_rpc_lock); + atomic_set(&console_session.ses_rpc_counter, 0); + INIT_LIST_HEAD(&console_session.ses_rpc_freelist); + + return 0; +} + +void +lstcon_rpc_module_fini(void) +{ + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); +} + diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h new file mode 100644 index 0000000000000..51d4ee90e07cc --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h @@ -0,0 +1,145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * /lnet/selftest/conrpc.h + * + * Console rpc + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONRPC_H__ +#define __LST_CONRPC_H__ + +#include +#include +#include "rpc.h" +#include "selftest.h" + +/* Console rpc and rpc transaction */ +#define LST_TRANS_TIMEOUT 30 +#define LST_TRANS_MIN_TIMEOUT 3 + +#define LST_VALIDATE_TIMEOUT(t) MIN(MAX(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT) + +#define LST_PING_INTERVAL 8 + +struct lstcon_rpc_trans; +struct lstcon_tsb_hdr; +struct lstcon_test; +struct lstcon_node; + +struct lstcon_rpc { + struct list_head crp_link; /* chain on rpc transaction */ + struct srpc_client_rpc *crp_rpc; /* client rpc */ + struct lstcon_node *crp_node; /* destination node */ + struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ + + unsigned int crp_posted:1; /* rpc is posted */ + unsigned int crp_finished:1; /* rpc is finished */ + unsigned int crp_unpacked:1; /* reply is unpacked */ + /** RPC is embedded in other structure and can't free it */ + unsigned int crp_embedded:1; + int crp_status; /* console rpc errors */ + s64 crp_stamp_ns; /* replied time stamp */ +}; + +struct lstcon_rpc_trans { + /* link chain on owner list */ + struct list_head tas_olink; + /* link chain on global list */ + struct list_head tas_link; + /* operation code of transaction */ + int tas_opc; + /* features mask is uptodate */ + unsigned tas_feats_updated; + /* test features mask */ + unsigned tas_features; + wait_queue_head_t tas_waitq; /* wait queue head */ + atomic_t tas_remaining; /* # of un-scheduled rpcs */ + struct list_head tas_rpcs_list; /* queued requests */ +}; + +#define LST_TRANS_PRIVATE 0x1000 + +#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01) +#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02) +#define LST_TRANS_SESQRY 0x03 +#define LST_TRANS_SESPING 0x04 + +#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11) +#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12) +#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13) +#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14) +#define LST_TRANS_TSBCLIQRY 0x15 +#define LST_TRANS_TSBSRVQRY 0x16 + +#define LST_TRANS_STATQRY 0x21 + +typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (*lstcon_rpc_readent_func_t)(int, struct srpc_msg *, + struct lstcon_rpc_ent __user *); + +int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned int version, struct lstcon_rpc **crpc); +int lstcon_dbgrpc_prep(struct lstcon_node *nd, + unsigned int version, struct lstcon_rpc **crpc); +int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc); +int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_test *test, struct lstcon_rpc **crpc); +int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, + struct lstcon_rpc **crpc); +void lstcon_rpc_put(struct lstcon_rpc *crpc); +int lstcon_rpc_trans_prep(struct list_head *translist, + int transop, struct lstcon_rpc_trans **transpp); +int lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + struct lstcon_rpc_trans **transpp); +void lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + struct lstcon_trans_stat *stat); +int lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, + struct list_head __user *head_up, + lstcon_rpc_readent_func_t readent); +void lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error); +void 
lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans); +void lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, + struct lstcon_rpc *req); +int lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout); +int lstcon_rpc_pinger_start(void); +void lstcon_rpc_pinger_stop(void); +void lstcon_rpc_cleanup_wait(void); +int lstcon_rpc_module_init(void); +void lstcon_rpc_module_fini(void); + + +#endif diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.c b/drivers/staging/lustrefsx/lnet/selftest/console.c new file mode 100644 index 0000000000000..1e37454732cd1 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/console.c @@ -0,0 +1,2115 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * Infrastructure of LST console + * + * Author: Liang Zhen + */ + +#include +#include +#include "console.h" +#include "conrpc.h" + +#define LST_NODE_STATE_COUNTER(nd, p) \ +do { \ + if ((nd)->nd_state == LST_NODE_ACTIVE) \ + (p)->nle_nactive ++; \ + else if ((nd)->nd_state == LST_NODE_BUSY) \ + (p)->nle_nbusy ++; \ + else if ((nd)->nd_state == LST_NODE_DOWN) \ + (p)->nle_ndown ++; \ + else \ + (p)->nle_nunknown ++; \ + (p)->nle_nnode ++; \ +} while (0) + +struct lstcon_session console_session; + +static void +lstcon_node_get(struct lstcon_node *nd) +{ + LASSERT (nd->nd_ref >= 1); + + nd->nd_ref++; +} + +static int +lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp, + int create) +{ + struct lstcon_ndlink *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + + LASSERT(id.nid != LNET_NID_ANY); + + list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], + ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + lstcon_node_get(ndl->ndl_node); + *ndpp = ndl->ndl_node; + return 0; + } + + if (!create) + return -ENOENT; + + LIBCFS_ALLOC(*ndpp, sizeof(**ndpp) + sizeof(*ndl)); + if (*ndpp == NULL) + return -ENOMEM; + + ndl = (struct lstcon_ndlink *)(*ndpp + 1); + + ndl->ndl_node = *ndpp; + + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = ktime_get(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(ndl->ndl_node->nd_ping)); + + /* queued in global hash & list, no refcount is taken by + * global hash & list, if caller release his refcount, + * node will be released */ + list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list); + + return 0; +} + +static void +lstcon_node_put(struct lstcon_node *nd) +{ + struct lstcon_ndlink *ndl; + + LASSERT(nd->nd_ref > 0); + + if (--nd->nd_ref > 0) + return; + + ndl = (struct lstcon_ndlink *)(nd + 1); + + LASSERT(!list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + /* remove from session */ + list_del(&ndl->ndl_link); + list_del(&ndl->ndl_hlink); + + LIBCFS_FREE(nd, sizeof(*nd) + sizeof(*ndl)); +} + +static int +lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, + struct lstcon_ndlink **ndlpp, int create) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int rc; + + if (id.nid == LNET_NID_ANY) + return -EINVAL; + + /* search in hash */ + list_for_each_entry(ndl, &hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + *ndlpp = ndl; + return 0; + } + + if (create == 0) + return -ENOENT; + + /* find or create in session hash */ + rc = lstcon_node_find(id, &nd, (create == 1) ? 
1 : 0); + if (rc != 0) + return rc; + + LIBCFS_ALLOC(ndl, sizeof(*ndl)); + if (ndl == NULL) { + lstcon_node_put(nd); + return -ENOMEM; + } + + *ndlpp = ndl; + + ndl->ndl_node = nd; + INIT_LIST_HEAD(&ndl->ndl_link); + list_add_tail(&ndl->ndl_hlink, &hash[idx]); + + return 0; +} + +static void +lstcon_ndlink_release(struct lstcon_ndlink *ndl) +{ + LASSERT(list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + list_del(&ndl->ndl_hlink); /* delete from hash */ + lstcon_node_put(ndl->ndl_node); + + LIBCFS_FREE(ndl, sizeof(*ndl)); +} + +static int +lstcon_group_alloc(char *name, struct lstcon_group **grpp) +{ + struct lstcon_group *grp; + int i; + + LIBCFS_ALLOC(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); + if (grp == NULL) + return -ENOMEM; + + grp->grp_ref = 1; + if (name != NULL) { + if (strlen(name) > sizeof(grp->grp_name)-1) { + LIBCFS_FREE(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); + return -E2BIG; + } + strncpy(grp->grp_name, name, sizeof(grp->grp_name)); + } + + INIT_LIST_HEAD(&grp->grp_link); + INIT_LIST_HEAD(&grp->grp_ndl_list); + INIT_LIST_HEAD(&grp->grp_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + INIT_LIST_HEAD(&grp->grp_ndl_hash[i]); + + *grpp = grp; + + return 0; +} + +static void +lstcon_group_addref(struct lstcon_group *grp) +{ + grp->grp_ref++; +} + +static void lstcon_group_ndlink_release(struct lstcon_group *, + struct lstcon_ndlink *); + +static void +lstcon_group_drain(struct lstcon_group *grp, int keep) +{ + struct lstcon_ndlink *ndl; + struct lstcon_ndlink *tmp; + + list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { + if ((ndl->ndl_node->nd_state & keep) == 0) + lstcon_group_ndlink_release(grp, ndl); + } +} + +static void +lstcon_group_decref(struct lstcon_group *grp) +{ + int i; + + if (--grp->grp_ref > 0) + return; + + if (!list_empty(&grp->grp_link)) + list_del(&grp->grp_link); + + lstcon_group_drain(grp, 0); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + LASSERT(list_empty(&grp->grp_ndl_hash[i])); + + LIBCFS_FREE(grp, offsetof(struct lstcon_group, + grp_ndl_hash[LST_NODE_HASHSIZE])); +} + +static int +lstcon_group_find(const char *name, struct lstcon_group **grpp) +{ + struct lstcon_group *grp; + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) + continue; + + lstcon_group_addref(grp); /* +1 ref for caller */ + *grpp = grp; + return 0; + } + + return -ENOENT; +} + +static int +lstcon_group_ndlink_find(struct lstcon_group *grp, struct lnet_process_id id, + struct lstcon_ndlink **ndlpp, int create) +{ + int rc; + + rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create); + if (rc != 0) + return rc; + + if (!list_empty(&(*ndlpp)->ndl_link)) + return 0; + + list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list); + grp->grp_nnode++; + + return 0; +} + +static void +lstcon_group_ndlink_release(struct lstcon_group *grp, struct lstcon_ndlink *ndl) +{ + list_del_init(&ndl->ndl_link); + lstcon_ndlink_release(ndl); + grp->grp_nnode--; +} + +static void +lstcon_group_ndlink_move(struct lstcon_group *old, + struct lstcon_group *new, struct lstcon_ndlink *ndl) +{ + unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % + LST_NODE_HASHSIZE; + + list_del(&ndl->ndl_hlink); + list_del(&ndl->ndl_link); + old->grp_nnode--; + + list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &new->grp_ndl_list); + new->grp_nnode++; + + return; +} + +static void 
+lstcon_group_move(struct lstcon_group *old, struct lstcon_group *new) +{ + struct lstcon_ndlink *ndl; + + while (!list_empty(&old->grp_ndl_list)) { + ndl = list_entry(old->grp_ndl_list.next, + struct lstcon_ndlink, ndl_link); + lstcon_group_ndlink_move(old, new, ndl); + } +} + +static int +lstcon_sesrpc_condition(int transop, struct lstcon_node *nd, void *arg) +{ + struct lstcon_group *grp = arg; + + switch (transop) { + case LST_TRANS_SESNEW: + if (nd->nd_state == LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_SESEND: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + + if (grp != NULL && nd->nd_ref > 1) + return 0; + break; + + case LST_TRANS_SESQRY: + break; + + default: + LBUG(); + } + + return 1; +} + +static int +lstcon_sesrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_debug_reply *rep; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + return 0; + + case LST_TRANS_SESQRY: + rep = &msg->msg_body.dbg_reply; + + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->dbg_timeout, sizeof(int)) || + copy_to_user(&ent_up->rpe_payload[0], + &rep->dbg_name, LST_NAME_SIZE)) + return -EFAULT; + + return 0; + + default: + LBUG(); + } + + return 0; +} + +static int +lstcon_group_nodes_add(struct lstcon_group *grp, + int count, struct lnet_process_id __user *ids_up, + unsigned int *featp, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* skip if it's in this group already */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 0); + if (rc == 0) + continue; + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create ndlink, out of memory\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(tmp); + return rc; + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESNEW, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_decref(tmp); + return rc; + } + + /* post all RPCs */ + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + *featp = trans->tas_features; + + /* destroy all RPGs */ + lstcon_rpc_trans_destroy(trans); + + lstcon_group_move(tmp, grp); + lstcon_group_decref(tmp); + + return rc; +} + +static int +lstcon_group_nodes_remove(struct lstcon_group *grp, + int count, struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int rc; + int i; + + /* End session and remove node from the group */ + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + goto error; + } + + /* move node to tmp group */ + if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0) + lstcon_group_ndlink_move(grp, tmp, ndl); + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESEND, + tmp, 
lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + goto error; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* release nodes anyway, because we can't rollback status */ + lstcon_group_decref(tmp); + + return rc; +error: + lstcon_group_move(tmp, grp); + lstcon_group_decref(tmp); + + return rc; +} + +int +lstcon_group_add(char *name) +{ + struct lstcon_group *grp; + int rc; + + rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0; + if (rc != 0) { + /* find a group with same name */ + lstcon_group_decref(grp); + return rc; + } + + rc = lstcon_group_alloc(name, &grp); + if (rc != 0) { + CERROR("Can't allocate descriptor for group %s\n", name); + return -ENOMEM; + } + + list_add_tail(&grp->grp_link, &console_session.ses_grp_list); + + return rc; +} + +int +lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, + unsigned *featp, struct list_head __user *result_up) +{ + struct lstcon_group *grp; + int rc; + + LASSERT (count > 0); + LASSERT (ids_up != NULL); + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by other threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + + return -EBUSY; + } + + rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_del(char *name) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by others threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESEND, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_decref(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + + lstcon_group_decref(grp); + /* -ref for session, it's destroyed, + * status can't be rolled back, destroy group anway */ + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_clean(char *name, int args) +{ + struct lstcon_group *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + args = (LST_NODE_ACTIVE | LST_NODE_BUSY | + LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args; + + lstcon_group_drain(grp, args); + + lstcon_group_decref(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_decref(grp); + + return 0; +} + +int +lstcon_nodes_remove(char *name, int count, + struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + struct lstcon_group *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; 
+ } + + rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up); + + lstcon_group_decref(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_refresh(char *name, struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + /* re-invite all inactive nodes int the group */ + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESNEW, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + /* local error, return */ + CDEBUG(D_NET, "Can't create transaction: %d\n", rc); + lstcon_group_decref(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* -ref for me */ + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_list(int index, int len, char __user *name_up) +{ + struct lstcon_group *grp; + + LASSERT(index >= 0); + LASSERT(name_up != NULL); + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (index-- == 0) { + return copy_to_user(name_up, grp->grp_name, len) ? + -EFAULT : 0; + } + } + + return -ENOENT; +} + +static int +lstcon_nodes_getent(struct list_head *head, int *index_p, + int *count_p, struct lstcon_node_ent __user *dents_up) +{ + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int count = 0; + int index = 0; + + LASSERT(index_p != NULL && count_p != NULL); + LASSERT(dents_up != NULL); + LASSERT(*index_p >= 0); + LASSERT(*count_p > 0); + + list_for_each_entry(ndl, head, ndl_link) { + if (index++ < *index_p) + continue; + + if (count >= *count_p) + break; + + nd = ndl->ndl_node; + if (copy_to_user(&dents_up[count].nde_id, + &nd->nd_id, sizeof(nd->nd_id)) || + copy_to_user(&dents_up[count].nde_state, + &nd->nd_state, sizeof(nd->nd_state))) + return -EFAULT; + + count ++; + } + + if (index <= *index_p) + return -ENOENT; + + *count_p = count; + *index_p = index; + + return 0; +} + +int +lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, + int *index_p, int *count_p, + struct lstcon_node_ent __user *dents_up) +{ + struct lstcon_ndlist_ent *gentp; + struct lstcon_group *grp; + struct lstcon_ndlink *ndl; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (dents_up != NULL) { + /* verbose query */ + rc = lstcon_nodes_getent(&grp->grp_ndl_list, + index_p, count_p, dents_up); + lstcon_group_decref(grp); + + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(gentp, sizeof(struct lstcon_ndlist_ent)); + if (gentp == NULL) { + CERROR("Can't allocate ndlist_ent\n"); + lstcon_group_decref(grp); + + return -ENOMEM; + } + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp); + + rc = copy_to_user(gents_p, gentp, + sizeof(struct lstcon_ndlist_ent)) ? 
-EFAULT : 0; + + LIBCFS_FREE(gentp, sizeof(struct lstcon_ndlist_ent)); + + lstcon_group_decref(grp); + + return 0; +} + +static int +lstcon_batch_find(const char *name, struct lstcon_batch **batpp) +{ + struct lstcon_batch *bat; + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { + *batpp = bat; + return 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_add(char *name) +{ + struct lstcon_batch *bat; + int i; + int rc; + + rc = (lstcon_batch_find(name, &bat) == 0)? -EEXIST: 0; + if (rc != 0) { + CDEBUG(D_NET, "Batch %s already exists\n", name); + return rc; + } + + LIBCFS_ALLOC(bat, sizeof(*bat)); + if (bat == NULL) { + CERROR("Can't allocate descriptor for batch %s\n", name); + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(*bat)); + + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_srv_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(*bat)); + + return -ENOMEM; + } + + if (strlen(name) > sizeof(bat->bat_name)-1) { + LIBCFS_FREE(bat->bat_srv_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(*bat)); + return -E2BIG; + } + strncpy(bat->bat_name, name, sizeof(bat->bat_name)); + bat->bat_hdr.tsb_index = 0; + bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie; + + bat->bat_ntest = 0; + bat->bat_state = LST_BATCH_IDLE; + + INIT_LIST_HEAD(&bat->bat_cli_list); + INIT_LIST_HEAD(&bat->bat_srv_list); + INIT_LIST_HEAD(&bat->bat_test_list); + INIT_LIST_HEAD(&bat->bat_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + INIT_LIST_HEAD(&bat->bat_cli_hash[i]); + INIT_LIST_HEAD(&bat->bat_srv_hash[i]); + } + + list_add_tail(&bat->bat_link, &console_session.ses_bat_list); + + return rc; +} + +int +lstcon_batch_list(int index, int len, char __user *name_up) +{ + struct lstcon_batch *bat; + + LASSERT(name_up != NULL); + LASSERT(index >= 0); + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (index-- == 0) { + return copy_to_user(name_up, bat->bat_name, len) ? + -EFAULT : 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, + int server, int testidx, int *index_p, int *ndent_p, + struct lstcon_node_ent __user *dents_up) +{ + struct lstcon_test_batch_ent *entp; + struct list_head *clilst; + struct list_head *srvlst; + struct lstcon_test *test = NULL; + struct lstcon_batch *bat; + struct lstcon_ndlink *ndl; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + if (testidx > 0) { + /* query test, test index start from 1 */ + list_for_each_entry(test, &bat->bat_test_list, tes_link) { + if (testidx-- == 1) + break; + } + + if (testidx > 0) { + CDEBUG(D_NET, "Can't find specified test in batch\n"); + return -ENOENT; + } + } + + clilst = (test == NULL) ? &bat->bat_cli_list : + &test->tes_src_grp->grp_ndl_list; + srvlst = (test == NULL) ? &bat->bat_srv_list : + &test->tes_dst_grp->grp_ndl_list; + + if (dents_up != NULL) { + rc = lstcon_nodes_getent((server ? 
srvlst: clilst), + index_p, ndent_p, dents_up); + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(entp, sizeof(struct lstcon_test_batch_ent)); + if (entp == NULL) + return -ENOMEM; + + if (test == NULL) { + entp->u.tbe_batch.bae_ntest = bat->bat_ntest; + entp->u.tbe_batch.bae_state = bat->bat_state; + + } else { + + entp->u.tbe_test.tse_type = test->tes_type; + entp->u.tbe_test.tse_loop = test->tes_loop; + entp->u.tbe_test.tse_concur = test->tes_concur; + } + + list_for_each_entry(ndl, clilst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle); + + list_for_each_entry(ndl, srvlst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle); + + rc = copy_to_user(ent_up, entp, + sizeof(struct lstcon_test_batch_ent)) ? -EFAULT : 0; + + LIBCFS_FREE(entp, sizeof(struct lstcon_test_batch_ent)); + + return rc; +} + +static int +lstcon_batrpc_condition(int transop, struct lstcon_node *nd, void *arg) +{ + switch (transop) { + case LST_TRANS_TSBRUN: + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + break; + + case LST_TRANS_TSBSTOP: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + break; + } + + return 1; +} + +static int +lstcon_batch_op(struct lstcon_batch *bat, int transop, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, + &bat->bat_trans_list, transop, + bat, lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) +{ + struct lstcon_batch *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = timeout; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up); + + /* mark batch as running if it's started in any node */ + if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0) + bat->bat_state = LST_BATCH_RUNNING; + + return rc; +} + +int +lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) +{ + struct lstcon_batch *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = force; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up); + + /* mark batch as stopped if all RPCs finished */ + if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0) + bat->bat_state = LST_BATCH_IDLE; + + return rc; +} + +static void +lstcon_batch_destroy(struct lstcon_batch *bat) +{ + struct lstcon_ndlink *ndl; + struct lstcon_test *test; + int i; + + list_del(&bat->bat_link); + + while (!list_empty(&bat->bat_test_list)) { + test = list_entry(bat->bat_test_list.next, + struct lstcon_test, tes_link); + LASSERT(list_empty(&test->tes_trans_list)); + + list_del(&test->tes_link); + + lstcon_group_decref(test->tes_src_grp); + lstcon_group_decref(test->tes_dst_grp); + + LIBCFS_FREE(test, offsetof(struct lstcon_test, + tes_param[test->tes_paramlen])); + } + + LASSERT(list_empty(&bat->bat_trans_list)); + + while (!list_empty(&bat->bat_cli_list)) { + ndl = list_entry(bat->bat_cli_list.next, + struct lstcon_ndlink, ndl_link); + list_del_init(&ndl->ndl_link); + + 
lstcon_ndlink_release(ndl); + } + + while (!list_empty(&bat->bat_srv_list)) { + ndl = list_entry(bat->bat_srv_list.next, + struct lstcon_ndlink, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&bat->bat_cli_hash[i])); + LASSERT(list_empty(&bat->bat_srv_hash[i])); + } + + LIBCFS_FREE(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(*bat)); +} + +static int +lstcon_testrpc_condition(int transop, struct lstcon_node *nd, void *arg) +{ + struct lstcon_test *test = arg; + struct lstcon_batch *batch; + struct lstcon_ndlink *ndl; + struct list_head *hash; + struct list_head *head; + + LASSERT(test != NULL); + + batch = test->tes_batch; + LASSERT(batch != NULL); + + if (test->tes_oneside && + transop == LST_TRANS_TSBSRVADD) + return 0; + + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + + if (transop == LST_TRANS_TSBCLIADD) { + hash = batch->bat_cli_hash; + head = &batch->bat_cli_list; + + } else { + LASSERT (transop == LST_TRANS_TSBSRVADD); + + hash = batch->bat_srv_hash; + head = &batch->bat_srv_list; + } + + LASSERT (nd->nd_id.nid != LNET_NID_ANY); + + if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0) + return -ENOMEM; + + if (list_empty(&ndl->ndl_link)) + list_add_tail(&ndl->ndl_link, head); + + return 1; +} + +static int +lstcon_test_nodes_add(struct lstcon_test *test, + struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int transop; + int rc; + + LASSERT (test->tes_src_grp != NULL); + LASSERT (test->tes_dst_grp != NULL); + + transop = LST_TRANS_TSBSRVADD; + grp = test->tes_dst_grp; +again: + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &test->tes_trans_list, transop, + test, lstcon_testrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) { + lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* return if any error */ + CDEBUG(D_NET, "Failed to add test %s, " + "RPC error %d, framework error %d\n", + transop == LST_TRANS_TSBCLIADD ? 
"client" : "server", + lstcon_trans_stat()->trs_rpc_errno, + lstcon_trans_stat()->trs_fwk_errno); + + return rc; + } + + lstcon_rpc_trans_destroy(trans); + + if (transop == LST_TRANS_TSBCLIADD) + return rc; + + transop = LST_TRANS_TSBCLIADD; + grp = test->tes_src_grp; + test->tes_cliidx = 0; + + /* requests to test clients */ + goto again; +} + +static int +lstcon_verify_batch(const char *name, struct lstcon_batch **batch) +{ + int rc; + + rc = lstcon_batch_find(name, batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return rc; + } + + if ((*batch)->bat_state != LST_BATCH_IDLE) { + CDEBUG(D_NET, "Can't change running batch %s\n", name); + return -EINVAL; + } + + return 0; +} + +static int +lstcon_verify_group(const char *name, struct lstcon_group **grp) +{ + int rc; + struct lstcon_ndlink *ndl; + + rc = lstcon_group_find(name, grp); + if (rc != 0) { + CDEBUG(D_NET, "can't find group %s\n", name); + return rc; + } + + list_for_each_entry(ndl, &(*grp)->grp_ndl_list, ndl_link) { + if (ndl->ndl_node->nd_state == LST_NODE_ACTIVE) { + return 0; + } + } + + CDEBUG(D_NET, "Group %s has no ACTIVE nodes\n", name); + + return -EINVAL; +} + +int +lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head __user *result_up) +{ + struct lstcon_test *test = NULL; + int rc; + struct lstcon_group *src_grp = NULL; + struct lstcon_group *dst_grp = NULL; + struct lstcon_batch *batch = NULL; + + /* + * verify that a batch of the given name exists, and the groups + * that will be part of the batch exist and have at least one + * active node + */ + rc = lstcon_verify_batch(batch_name, &batch); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(src_name, &src_grp); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(dst_name, &dst_grp); + if (rc != 0) + goto out; + + if (dst_grp->grp_userland) + *retp = 1; + + LIBCFS_ALLOC(test, offsetof(struct lstcon_test, tes_param[paramlen])); + if (!test) { + CERROR("Can't allocate test descriptor\n"); + rc = -ENOMEM; + + goto out; + } + + test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id; + test->tes_batch = batch; + test->tes_type = type; + test->tes_oneside = 0; /* TODO */ + test->tes_loop = loop; + test->tes_concur = concur; + test->tes_stop_onerr = 1; /* TODO */ + test->tes_span = span; + test->tes_dist = dist; + test->tes_cliidx = 0; /* just used for creating RPC */ + test->tes_src_grp = src_grp; + test->tes_dst_grp = dst_grp; + INIT_LIST_HEAD(&test->tes_trans_list); + + if (param != NULL) { + test->tes_paramlen = paramlen; + memcpy(&test->tes_param[0], param, paramlen); + } + + rc = lstcon_test_nodes_add(test, result_up); + + if (rc != 0) + goto out; + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) + CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, + batch_name); + + /* add to test list anyway, so user can check what's going on */ + list_add_tail(&test->tes_link, &batch->bat_test_list); + + batch->bat_ntest++; + test->tes_hdr.tsb_index = batch->bat_ntest; + + /* hold groups so nobody can change them */ + return rc; +out: + if (test != NULL) + LIBCFS_FREE(test, offsetof(struct lstcon_test, + tes_param[paramlen])); + + if (dst_grp != NULL) + lstcon_group_decref(dst_grp); + + if (src_grp != NULL) + lstcon_group_decref(src_grp); + + return rc; +} + +static int +lstcon_test_find(struct lstcon_batch *batch, int idx, + struct lstcon_test **testpp) +{ + struct lstcon_test 
*test; + + list_for_each_entry(test, &batch->bat_test_list, tes_link) { + if (idx == test->tes_hdr.tsb_index) { + *testpp = test; + return 0; + } + } + + return -ENOENT; +} + +static int +lstcon_tsbrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; + + LASSERT (transop == LST_TRANS_TSBCLIQRY || + transop == LST_TRANS_TSBSRVQRY); + + /* positive errno, framework error code */ + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->bar_active, sizeof(rep->bar_active))) + return -EFAULT; + + return 0; +} + +int +lstcon_test_batch_query(char *name, int testidx, int client, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + struct list_head *translist; + struct list_head *ndlist; + struct lstcon_tsb_hdr *hdr; + struct lstcon_batch *batch; + struct lstcon_test *test = NULL; + int transop; + int rc; + + rc = lstcon_batch_find(name, &batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch: %s\n", name); + return rc; + } + + if (testidx == 0) { + translist = &batch->bat_trans_list; + ndlist = &batch->bat_cli_list; + hdr = &batch->bat_hdr; + + } else { + /* query specified test only */ + rc = lstcon_test_find(batch, testidx, &test); + if (rc != 0) { + CDEBUG(D_NET, "Can't find test: %d\n", testidx); + return rc; + } + + translist = &test->tes_trans_list; + ndlist = &test->tes_src_grp->grp_ndl_list; + hdr = &test->tes_hdr; + } + + transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr, + lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, timeout); + + if (testidx == 0 && /* query a batch, not a test */ + lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 && + lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) { + /* all RPCs finished, and no active test */ + batch->bat_state = LST_BATCH_IDLE; + } + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_tsbrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static int +lstcon_statrpc_readent(int transop, struct srpc_msg *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + struct sfw_counters __user *sfwk_stat; + struct srpc_counters __user *srpc_stat; + struct lnet_counters_common __user *lnet_stat; + + if (rep->str_status != 0) + return 0; + + sfwk_stat = (struct sfw_counters __user *)&ent_up->rpe_payload[0]; + srpc_stat = (struct srpc_counters __user *) + ((char __user *)sfwk_stat + sizeof(*sfwk_stat)); + lnet_stat = (struct lnet_counters_common __user *) + ((char __user *)srpc_stat + sizeof(*srpc_stat)); + + if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || + copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) || + copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat))) + return -EFAULT; + + return 0; +} + +static int +lstcon_ndlist_stat(struct list_head *ndlist, + int timeout, struct list_head __user *result_up) +{ + struct list_head head; + struct lstcon_rpc_trans *trans; + int rc; + + INIT_LIST_HEAD(&head); + + rc = lstcon_rpc_trans_ndlist(ndlist, &head, + LST_TRANS_STATQRY, NULL, NULL, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_statrpc_readent); + 
lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_group_stat(char *grp_name, int timeout, + struct list_head __user *result_up) +{ + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(grp_name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", grp_name); + return rc; + } + + rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2); + if (rc != 0) { + CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET, + "Failed to find or create %s: %d\n", + libcfs_id2str(id), rc); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(tmp); + return rc; + } + + rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up); + + lstcon_group_decref(tmp); + + return rc; +} + +static int +lstcon_debug_ndlist(struct list_head *ndlist, + struct list_head *translist, + int timeout, struct list_head __user *result_up) +{ + struct lstcon_rpc_trans *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, + NULL, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_session_debug(int timeout, struct list_head __user *result_up) +{ + return lstcon_debug_ndlist(&console_session.ses_ndl_list, + NULL, timeout, result_up); +} + +int +lstcon_batch_debug(int timeout, char *name, + int client, struct list_head __user *result_up) +{ + struct lstcon_batch *bat; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(client ? 
&bat->bat_cli_list : + &bat->bat_srv_list, + NULL, timeout, result_up); + + return rc; +} + +int +lstcon_group_debug(int timeout, char *name, + struct list_head __user *result_up) +{ + struct lstcon_group *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_nodes_debug(int timeout, int count, + struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + struct lnet_process_id id; + struct lstcon_ndlink *ndl; + struct lstcon_group *grp; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Out of memory\n"); + return rc; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* node is added to tmp group */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create node link\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(grp); + return rc; + } + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_session_match(struct lst_sid sid) +{ + return (console_session.ses_id.ses_nid == sid.ses_nid && + console_session.ses_id.ses_stamp == sid.ses_stamp) ? 1: 0; +} + +static void +lstcon_new_session_id(struct lst_sid *sid) +{ + struct lnet_process_id id; + + LASSERT(console_session.ses_state == LST_SESSION_NONE); + + LNetGetId(1, &id); + sid->ses_nid = id.nid; + sid->ses_stamp = div_u64(ktime_get_ns(), NSEC_PER_MSEC); +} + +int +lstcon_session_new(char *name, int key, unsigned feats, + int timeout, int force, struct lst_sid __user *sid_up) +{ + int rc = 0; + int i; + + if (console_session.ses_state != LST_SESSION_NONE) { + /* session exists */ + if (!force) { + CNETERR("Session %s already exists\n", + console_session.ses_name); + return -EEXIST; + } + + rc = lstcon_session_end(); + + /* lstcon_session_end() only return local error */ + if (rc != 0) + return rc; + } + + if ((feats & ~LST_FEATS_MASK) != 0) { + CNETERR("Unknown session features %x\n", + (feats & ~LST_FEATS_MASK)); + return -EINVAL; + } + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + lstcon_new_session_id(&console_session.ses_id); + + console_session.ses_key = key; + console_session.ses_state = LST_SESSION_ACTIVE; + console_session.ses_force = !!force; + console_session.ses_features = feats; + console_session.ses_feats_updated = 0; + console_session.ses_timeout = (timeout <= 0) ? 
+ LST_CONSOLE_TIMEOUT : timeout; + + if (strlen(name) > sizeof(console_session.ses_name)-1) + return -E2BIG; + strlcpy(console_session.ses_name, name, + sizeof(console_session.ses_name)); + + rc = lstcon_batch_add(LST_DEFAULT_BATCH); + if (rc != 0) + return rc; + + rc = lstcon_rpc_pinger_start(); + if (rc != 0) { + struct lstcon_batch *bat = NULL; + + lstcon_batch_find(LST_DEFAULT_BATCH, &bat); + lstcon_batch_destroy(bat); + + return rc; + } + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(struct lst_sid)) == 0) + return rc; + + lstcon_session_end(); + + return -EFAULT; +} + +int +lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, + unsigned __user *featp, + struct lstcon_ndlist_ent __user *ndinfo_up, + char __user *name_up, int len) +{ + struct lstcon_ndlist_ent *entp; + struct lstcon_ndlink *ndl; + int rc = 0; + + if (console_session.ses_state != LST_SESSION_ACTIVE) + return -ESRCH; + + LIBCFS_ALLOC(entp, sizeof(*entp)); + if (entp == NULL) + return -ENOMEM; + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, entp); + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(struct lst_sid)) || + copy_to_user(key_up, &console_session.ses_key, + sizeof(*key_up)) || + copy_to_user(featp, &console_session.ses_features, + sizeof(*featp)) || + copy_to_user(ndinfo_up, entp, sizeof(*entp)) || + copy_to_user(name_up, console_session.ses_name, len)) + rc = -EFAULT; + + LIBCFS_FREE(entp, sizeof(*entp)); + + return rc; +} + +int +lstcon_session_end() +{ + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + struct lstcon_batch *bat; + int rc = 0; + + LASSERT (console_session.ses_state == LST_SESSION_ACTIVE); + + rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list, + NULL, LST_TRANS_SESEND, NULL, + lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + console_session.ses_shutdown = 1; + + lstcon_rpc_pinger_stop(); + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + /* User can do nothing even rpc failed, so go on */ + + /* waiting for orphan rpcs to die */ + lstcon_rpc_cleanup_wait(); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_key = 0; + console_session.ses_force = 0; + console_session.ses_feats_updated = 0; + + /* destroy all batches */ + while (!list_empty(&console_session.ses_bat_list)) { + bat = list_entry(console_session.ses_bat_list.next, + struct lstcon_batch, bat_link); + + lstcon_batch_destroy(bat); + } + + /* destroy all groups */ + while (!list_empty(&console_session.ses_grp_list)) { + grp = list_entry(console_session.ses_grp_list.next, + struct lstcon_group, grp_link); + LASSERT(grp->grp_ref == 1); + + lstcon_group_decref(grp); + } + + /* all nodes should be released */ + LASSERT(list_empty(&console_session.ses_ndl_list)); + + console_session.ses_shutdown = 0; + console_session.ses_expired = 0; + + return rc; +} + +int +lstcon_session_feats_check(unsigned feats) +{ + int rc = 0; + + if ((feats & ~LST_FEATS_MASK) != 0) { + CERROR("Can't support these features: %x\n", + (feats & ~LST_FEATS_MASK)); + return -EPROTO; + } + + spin_lock(&console_session.ses_rpc_lock); + + if (!console_session.ses_feats_updated) { + console_session.ses_feats_updated = 1; + console_session.ses_features = feats; + } + + if (console_session.ses_features != feats) + rc = -EPROTO; + + spin_unlock(&console_session.ses_rpc_lock); 
+ + if (rc != 0) { + CERROR("remote features %x do not match with " + "session features %x of console\n", + feats, console_session.ses_features); + } + + return rc; +} + +static int +lstcon_acceptor_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_msg *rep = &rpc->srpc_replymsg; + struct srpc_msg *req = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_join_reqst *jreq = &req->msg_body.join_reqst; + struct srpc_join_reply *jrep = &rep->msg_body.join_reply; + struct lstcon_group *grp = NULL; + struct lstcon_ndlink *ndl; + int rc = 0; + + sfw_unpack_message(req); + + mutex_lock(&console_session.ses_mutex); + + jrep->join_sid = console_session.ses_id; + + if (console_session.ses_id.ses_nid == LNET_NID_ANY) { + jrep->join_status = ESRCH; + goto out; + } + + if (lstcon_session_feats_check(req->msg_ses_feats) != 0) { + jrep->join_status = EPROTO; + goto out; + } + + if (jreq->join_sid.ses_nid != LNET_NID_ANY && + !lstcon_session_match(jreq->join_sid)) { + jrep->join_status = EBUSY; + goto out; + } + + if (lstcon_group_find(jreq->join_group, &grp) != 0) { + rc = lstcon_group_alloc(jreq->join_group, &grp); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + list_add_tail(&grp->grp_link, + &console_session.ses_grp_list); + lstcon_group_addref(grp); + } + + if (grp->grp_ref > 2) { + /* Group in using */ + jrep->join_status = EBUSY; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0); + if (rc == 0) { + jrep->join_status = EEXIST; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + ndl->ndl_node->nd_state = LST_NODE_ACTIVE; + ndl->ndl_node->nd_timeout = console_session.ses_timeout; + + if (grp->grp_userland == 0) + grp->grp_userland = 1; + + strlcpy(jrep->join_session, console_session.ses_name, + sizeof(jrep->join_session)); + jrep->join_timeout = console_session.ses_timeout; + jrep->join_status = 0; + +out: + rep->msg_ses_feats = console_session.ses_features; + if (grp != NULL) + lstcon_group_decref(grp); + + mutex_unlock(&console_session.ses_mutex); + + return rc; +} + +static struct srpc_service lstcon_acceptor_service; + +static void lstcon_init_acceptor_service(void) +{ + /* initialize selftest console acceptor service table */ + lstcon_acceptor_service.sv_name = "join session"; + lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle; + lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN; + lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; +} + +static struct notifier_block lstcon_ioctl_handler = { + .notifier_call = lstcon_ioctl_entry, +}; + +/* initialize console */ +int +lstcon_console_init(void) +{ + int i; + int rc; + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_timeout = 0; + console_session.ses_force = 0; + console_session.ses_expired = 0; + console_session.ses_feats_updated = 0; + console_session.ses_features = LST_FEATS_MASK; + console_session.ses_laststamp = ktime_get_real_seconds(); + + mutex_init(&console_session.ses_mutex); + + INIT_LIST_HEAD(&console_session.ses_ndl_list); + INIT_LIST_HEAD(&console_session.ses_grp_list); + INIT_LIST_HEAD(&console_session.ses_bat_list); + INIT_LIST_HEAD(&console_session.ses_trans_list); + + LIBCFS_ALLOC(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + if (console_session.ses_ndl_hash == NULL) + return -ENOMEM; + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]); 
+ + + /* initialize acceptor service table */ + lstcon_init_acceptor_service(); + + rc = srpc_add_service(&lstcon_acceptor_service); + LASSERT(rc != -EBUSY); + if (rc != 0) { + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + return rc; + } + + rc = srpc_service_add_buffers(&lstcon_acceptor_service, + lstcon_acceptor_service.sv_wi_total); + if (rc != 0) { + rc = -ENOMEM; + goto out; + } + + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lstcon_ioctl_handler); + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } + +out: + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return rc; +} + +int +lstcon_console_fini(void) +{ + int i; + + blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lstcon_ioctl_handler); + + mutex_lock(&console_session.ses_mutex); + + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + if (console_session.ses_state != LST_SESSION_NONE) + lstcon_session_end(); + + lstcon_rpc_module_fini(); + + mutex_unlock(&console_session.ses_mutex); + + LASSERT(list_empty(&console_session.ses_ndl_list)); + LASSERT(list_empty(&console_session.ses_grp_list)); + LASSERT(list_empty(&console_session.ses_bat_list)); + LASSERT(list_empty(&console_session.ses_trans_list)); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return 0; +} + diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.h b/drivers/staging/lustrefsx/lnet/selftest/console.h new file mode 100644 index 0000000000000..02c76a89627e6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/console.h @@ -0,0 +1,263 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/console.h + * + * kernel structure for LST console + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONSOLE_H__ +#define __LST_CONSOLE_H__ + +#include + +#include +#include +#include "selftest.h" +#include "conrpc.h" + +/* node descriptor */ +struct lstcon_node { + struct lnet_process_id nd_id; /* id of the node */ + int nd_ref; /* reference count */ + int nd_state; /* state of the node */ + int nd_timeout; /* session timeout */ + ktime_t nd_stamp; /* last RPC reply timestamp */ + struct lstcon_rpc nd_ping; /* ping rpc */ +}; + +/* node link descriptor */ +struct lstcon_ndlink { + struct list_head ndl_link; /* chain on list */ + struct list_head ndl_hlink; /* chain on hash */ + struct lstcon_node *ndl_node; /* pointer to node */ +}; + +/* (alias of nodes) group descriptor */ +struct lstcon_group { + struct list_head grp_link; /* chain on global group list */ + int grp_ref; /* reference count */ + int grp_userland; /* has userland nodes */ + int grp_nnode; /* # of nodes */ + char grp_name[LST_NAME_SIZE]; /* group name */ + + struct list_head grp_trans_list; /* transaction list */ + struct list_head grp_ndl_list; /* nodes list */ + struct list_head grp_ndl_hash[0];/* hash table for nodes */ +}; + +#define LST_BATCH_IDLE 0xB0 /* idle batch */ +#define LST_BATCH_RUNNING 0xB1 /* running batch */ + +struct lstcon_tsb_hdr { + struct lst_bid tsb_id; /* batch ID */ + int tsb_index; /* test index */ +}; + +/* (tests ) batch descriptor */ +struct lstcon_batch { + /* test_batch header */ + struct lstcon_tsb_hdr bat_hdr; + /* chain on session's batches list */ + struct list_head bat_link; + /* # of test */ + int bat_ntest; + /* state of the batch */ + int bat_state; + /* parameter for run|stop, timeout for run, force for stop */ + int bat_arg; + /* name of batch */ + char bat_name[LST_NAME_SIZE]; + + /* list head of tests (lstcon_test_t) */ + struct list_head bat_test_list; + /* list head of transaction */ + struct list_head bat_trans_list; + /* list head of client nodes (struct lstcon_node) */ + struct list_head bat_cli_list; + /* hash table of client nodes */ + struct list_head *bat_cli_hash; + /* list head of server nodes */ + struct list_head bat_srv_list; + /* hash table of server nodes */ + struct list_head *bat_srv_hash; +}; + +/* a single test descriptor */ +struct lstcon_test { + /* test batch header */ + struct lstcon_tsb_hdr tes_hdr; + /* chain on batch's tests list */ + struct list_head tes_link; + /* pointer to batch */ + struct lstcon_batch *tes_batch; + + int tes_type; /* type of the test, i.e: bulk, ping */ + int tes_stop_onerr; /* stop on error */ + int tes_oneside; /* one-sided test */ + int tes_concur; /* concurrency */ + int tes_loop; /* loop count */ + int tes_dist; /* nodes distribution of target group */ + int tes_span; /* nodes span of target group */ + int tes_cliidx; /* client index, used for RPC creating */ + + struct list_head tes_trans_list; /* transaction list */ + struct lstcon_group *tes_src_grp; /* group run the test */ + struct lstcon_group *tes_dst_grp; /* target group */ + + int tes_paramlen; /* test parameter length */ + char tes_param[0]; /* test parameter */ +}; + +#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ +#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ + +#define LST_SESSION_NONE 0x0 /* no session */ +#define LST_SESSION_ACTIVE 0x1 /* working session */ + +#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ + +struct lstcon_session { + struct mutex ses_mutex; /* only 1 thread in 
session */ + struct lst_sid ses_id; /* global session id */ + int ses_key; /* local session key */ + int ses_state; /* state of session */ + int ses_timeout; /* timeout in seconds */ + time64_t ses_laststamp; /* last operation stamp (seconds) */ + /** tests features of the session */ + unsigned ses_features; + /** features are synced with remote test nodes */ + unsigned ses_feats_updated:1; + /** force creating */ + unsigned ses_force:1; + /** session is shutting down */ + unsigned ses_shutdown:1; + /** console is timedout */ + unsigned ses_expired:1; + __u64 ses_id_cookie; /* batch id cookie */ + char ses_name[LST_NAME_SIZE]; /* session name */ + struct lstcon_rpc_trans *ses_ping; /* session pinger */ + struct stt_timer ses_ping_timer; /* timer for pinger */ + struct lstcon_trans_stat ses_trans_stat;/* transaction stats */ + + struct list_head ses_trans_list; /* global list of transaction */ + struct list_head ses_grp_list; /* global list of groups */ + struct list_head ses_bat_list; /* global list of batches */ + struct list_head ses_ndl_list; /* global list of nodes */ + struct list_head *ses_ndl_hash; /* hash table of nodes */ + + spinlock_t ses_rpc_lock; /* serialize */ + atomic_t ses_rpc_counter;/* # of initialized RPCs */ + struct list_head ses_rpc_freelist;/* idle console rpc */ +}; /* session descriptor */ + +extern struct lstcon_session console_session; + +static inline struct lstcon_trans_stat * +lstcon_trans_stat(void) +{ + return &console_session.ses_trans_stat; +} + +static inline struct list_head * +lstcon_id2hash(struct lnet_process_id id, struct list_head *hash) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + + return &hash[idx]; +} + +extern int lstcon_session_match(struct lst_sid sid); +extern int lstcon_session_new(char *name, int key, unsigned version, + int timeout, int flags, struct lst_sid __user *sid_up); +extern int lstcon_session_info(struct lst_sid __user *sid_up, int __user *key, + unsigned __user *verp, + struct lstcon_ndlist_ent __user *entp, + char __user *name_up, int len); +extern int lstcon_session_end(void); +extern int lstcon_session_debug(int timeout, + struct list_head __user *result_up); +extern int lstcon_session_feats_check(unsigned feats); +extern int lstcon_batch_debug(int timeout, char *name, + int client, struct list_head __user *result_up); +extern int lstcon_group_debug(int timeout, char *name, + struct list_head __user *result_up); +extern int lstcon_nodes_debug(int timeout, int nnd, + struct lnet_process_id __user *nds_up, + struct list_head __user *result_up); +extern int lstcon_group_add(char *name); +extern int lstcon_group_del(char *name); +extern int lstcon_group_clean(char *name, int args); +extern int lstcon_group_refresh(char *name, struct list_head __user *result_up); +extern int lstcon_nodes_add(char *name, int nnd, + struct lnet_process_id __user *nds_up, + unsigned *featp, + struct list_head __user *result_up); +extern int lstcon_nodes_remove(char *name, int nnd, + struct lnet_process_id __user *nds_up, + struct list_head __user *result_up); +extern int lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gent_up, + int *index_p, int *ndent_p, + struct lstcon_node_ent __user *ndents_up); +extern int lstcon_group_list(int idx, int len, char __user *name_up); +extern int lstcon_batch_add(char *name); +extern int lstcon_batch_run(char *name, int timeout, + struct list_head __user *result_up); +extern int lstcon_batch_stop(char *name, int force, + struct list_head __user *result_up); +extern int 
lstcon_test_batch_query(char *name, int testidx, + int client, int timeout, + struct list_head __user *result_up); +extern int lstcon_batch_del(char *name); +extern int lstcon_batch_list(int idx, int namelen, char __user *name_up); +extern int lstcon_batch_info(char *name, + struct lstcon_test_batch_ent __user *ent_up, + int server, int testidx, int *index_p, + int *ndent_p, + struct lstcon_node_ent __user *dents_up); +extern int lstcon_group_stat(char *grp_name, int timeout, + struct list_head __user *result_up); +extern int lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, + int timeout, struct list_head __user *result_up); +extern int lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head __user *result_up); + +int lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata); +int lstcon_console_init(void); +int lstcon_console_fini(void); + +#endif diff --git a/drivers/staging/lustrefsx/lnet/selftest/framework.c b/drivers/staging/lustrefsx/lnet/selftest/framework.c new file mode 100644 index 0000000000000..000fca9d34e33 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/framework.c @@ -0,0 +1,1783 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/framework.c + * + * Author: Isaac Huang + * Author: Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +struct lst_sid LST_INVALID_SID = { .ses_nid = LNET_NID_ANY, .ses_stamp = -1}; + +static int session_timeout = 100; +module_param(session_timeout, int, 0444); +MODULE_PARM_DESC(session_timeout, "test session timeout in seconds (100 by default, 0 == never)"); + +static int rpc_timeout = 64; +module_param(rpc_timeout, int, 0644); +MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never)"); + +#define sfw_unpack_id(id) \ +do { \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ +} while (0) + +#define sfw_unpack_sid(sid) \ +do { \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ +} while (0) + +#define sfw_unpack_fw_counters(fc) \ +do { \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ +} while (0) + +#define sfw_unpack_rpc_counters(rc) \ +do { \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ +} while (0) + +#define sfw_unpack_lnet_counters(lc) \ +do { \ + __swab32s(&(lc).lcc_errors); \ + __swab32s(&(lc).lcc_msgs_max); \ + __swab32s(&(lc).lcc_msgs_alloc); \ + __swab32s(&(lc).lcc_send_count); \ + __swab32s(&(lc).lcc_recv_count); \ + __swab32s(&(lc).lcc_drop_count); \ + __swab32s(&(lc).lcc_route_count); \ + __swab64s(&(lc).lcc_send_length); \ + __swab64s(&(lc).lcc_recv_length); \ + __swab64s(&(lc).lcc_drop_length); \ + __swab64s(&(lc).lcc_route_length); \ +} while (0) + +#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) +#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0) + +static struct smoketest_framework { + /* RPCs to be recycled */ + struct list_head fw_zombie_rpcs; + /* stopping sessions */ + struct list_head fw_zombie_sessions; + /* registered test cases */ + struct list_head fw_tests; + /* # zombie sessions */ + atomic_t fw_nzombies; + /* serialise */ + spinlock_t fw_lock; + /* _the_ session */ + struct sfw_session *fw_session; + /* shutdown in progress */ + int fw_shuttingdown; + /* running RPC */ + struct srpc_server_rpc *fw_active_srpc; +} sfw_data; + +/* forward ref's */ +static int sfw_stop_batch(struct sfw_batch *tsb, int force); +static void sfw_destroy_session(struct sfw_session *sn); + +static inline struct sfw_test_case * +sfw_find_test_case(int id) +{ + struct sfw_test_case *tsc; + + LASSERT(id <= SRPC_SERVICE_MAX_ID); + LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + if (tsc->tsc_srv_service->sv_id == id) + return tsc; + } + + return NULL; +} + +static int +sfw_register_test(struct srpc_service *service, + struct sfw_test_client_ops *cliops) +{ + struct sfw_test_case *tsc; + + if (sfw_find_test_case(service->sv_id) != NULL) { + CERROR ("Failed to register test %s (%d)\n", + service->sv_name, service->sv_id); + return -EEXIST; + } + + LIBCFS_ALLOC(tsc, sizeof(*tsc)); + if (tsc == NULL) + return -ENOMEM; + + tsc->tsc_cli_ops = cliops; + tsc->tsc_srv_service = service; + + list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests); + return 0; +} + +static void +sfw_add_session_timer (void) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct stt_timer *timer = &sn->sn_timer; + + LASSERT 
(!sfw_data.fw_shuttingdown); + + if (sn == NULL || sn->sn_timeout == 0) + return; + + LASSERT (!sn->sn_timer_active); + + sn->sn_timer_active = 1; + timer->stt_expires = ktime_get_real_seconds()+ sn->sn_timeout; + stt_add_timer(timer); + return; +} + +static int +sfw_del_session_timer (void) +{ + struct sfw_session *sn = sfw_data.fw_session; + + if (sn == NULL || !sn->sn_timer_active) + return 0; + + LASSERT (sn->sn_timeout != 0); + + if (stt_del_timer(&sn->sn_timer)) { /* timer defused */ + sn->sn_timer_active = 0; + return 0; + } + + return EBUSY; /* racing with sfw_session_expired() */ +} + +/* called with sfw_data.fw_lock held */ +static void +sfw_deactivate_session (void) +__must_hold(&sfw_data.fw_lock) +{ + struct sfw_session *sn = sfw_data.fw_session; + int nactive = 0; + struct sfw_batch *tsb; + struct sfw_test_case *tsc; + + if (sn == NULL) return; + + LASSERT(!sn->sn_timer_active); + + sfw_data.fw_session = NULL; + atomic_inc(&sfw_data.fw_nzombies); + list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions); + + spin_unlock(&sfw_data.fw_lock); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + srpc_abort_service(tsc->tsc_srv_service); + } + + spin_lock(&sfw_data.fw_lock); + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + nactive++; + sfw_stop_batch(tsb, 1); + } + } + + if (nactive != 0) + return; /* wait for active batches to stop */ + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + + spin_lock(&sfw_data.fw_lock); +} + + +static void +sfw_session_expired (void *data) +{ + struct sfw_session *sn = data; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (sn->sn_timer_active); + LASSERT (sn == sfw_data.fw_session); + + CWARN ("Session expired! sid: %s-%llu, name: %s\n", + libcfs_nid2str(sn->sn_id.ses_nid), + sn->sn_id.ses_stamp, &sn->sn_name[0]); + + sn->sn_timer_active = 0; + sfw_deactivate_session(); + + spin_unlock(&sfw_data.fw_lock); +} + +static inline void +sfw_init_session(struct sfw_session *sn, struct lst_sid sid, + unsigned features, const char *name) +{ + struct stt_timer *timer = &sn->sn_timer; + + memset(sn, 0, sizeof(struct sfw_session)); + INIT_LIST_HEAD(&sn->sn_list); + INIT_LIST_HEAD(&sn->sn_batches); + atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ + atomic_set(&sn->sn_brw_errors, 0); + atomic_set(&sn->sn_ping_errors, 0); + strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); + + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = ktime_get(); + + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; + INIT_LIST_HEAD(&timer->stt_list); +} + +/* completion handler for incoming framework RPCs */ +static void +sfw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int status = rpc->srpc_status; + + CDEBUG (D_NET, + "Incoming framework RPC done: " + "service %s, peer %s, status %s:%d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + status); + + if (rpc->srpc_bulk != NULL) + sfw_free_pages(rpc); + return; +} + +static void +sfw_client_rpc_fini(struct srpc_client_rpc *rpc) +{ + LASSERT(rpc->crpc_bulk.bk_niov == 0); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(atomic_read(&rpc->crpc_refcount) == 0); + + CDEBUG(D_NET, "Outgoing framework RPC done: " + "service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), + 
rpc->crpc_aborted, rpc->crpc_status); + + spin_lock(&sfw_data.fw_lock); + + /* my callers must finish all RPCs before shutting me down */ + LASSERT(!sfw_data.fw_shuttingdown); + list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs); + + spin_unlock(&sfw_data.fw_lock); +} + +static struct sfw_batch * +sfw_find_batch(struct lst_bid bid) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_batch *bat; + + LASSERT(sn != NULL); + + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (bat->bat_id.bat_id == bid.bat_id) + return bat; + } + + return NULL; +} + +static struct sfw_batch * +sfw_bid2batch(struct lst_bid bid) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_batch *bat; + + LASSERT (sn != NULL); + + bat = sfw_find_batch(bid); + if (bat != NULL) + return bat; + + LIBCFS_ALLOC(bat, sizeof(*bat)); + if (bat == NULL) + return NULL; + + bat->bat_error = 0; + bat->bat_session = sn; + bat->bat_id = bid; + atomic_set(&bat->bat_nactive, 0); + INIT_LIST_HEAD(&bat->bat_tests); + + list_add_tail(&bat->bat_list, &sn->sn_batches); + return bat; +} + +static int +sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_counters *cnt = &reply->str_fw; + struct sfw_batch *bat; + + reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->str_sid.ses_nid == LNET_NID_ANY) { + reply->str_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) { + reply->str_status = ESRCH; + return 0; + } + + lnet_counters_get_common(&reply->str_lnet); + srpc_get_counters(&reply->str_rpc); + + /* send over the msecs since the session was started + - with 32 bits to send, this is ~49 days */ + cnt->running_ms = ktime_ms_delta(ktime_get(), sn->sn_started); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); + + cnt->active_batches = 0; + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (atomic_read(&bat->bat_nactive) > 0) + cnt->active_batches++; + } + + reply->str_status = 0; + return 0; +} + +int +sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_msg *msg = container_of(request, struct srpc_msg, + msg_body.mksn_reqst); + int cplen = 0; + + if (request->mksn_sid.ses_nid == LNET_NID_ANY) { + reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + reply->mksn_status = EINVAL; + return 0; + } + + if (sn != NULL) { + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + + if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) { + atomic_inc(&sn->sn_refcount); + return 0; + } + + if (!request->mksn_force) { + reply->mksn_status = EBUSY; + cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0], + sizeof(reply->mksn_name)); + if (cplen >= sizeof(reply->mksn_name)) + return -E2BIG; + return 0; + } + } + + /* reject the request if it requires unknown features + * NB: old version will always accept all features because it's not + * aware of struct srpc_msg::msg_ses_feats, it's a defect but it's also + * harmless because it will return zero feature to console, and it's + * console's responsibility to make sure all nodes in a session have + * same feature mask. 
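+ * Any feature bits outside LST_FEATS_MASK are unknown to this node and
+ * are rejected with EPROTO just below.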
*/ + if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + reply->mksn_status = EPROTO; + return 0; + } + + /* brand new or create by force */ + LIBCFS_ALLOC(sn, sizeof(*sn)); + if (sn == NULL) { + CERROR("dropping RPC mksn under memory pressure\n"); + return -ENOMEM; + } + + sfw_init_session(sn, request->mksn_sid, + msg->msg_ses_feats, &request->mksn_name[0]); + + spin_lock(&sfw_data.fw_lock); + + sfw_deactivate_session(); + LASSERT(sfw_data.fw_session == NULL); + sfw_data.fw_session = sn; + + spin_unlock(&sfw_data.fw_lock); + + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + return 0; +} + +static int +sfw_remove_session(struct srpc_rmsn_reqst *request, + struct srpc_rmsn_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + + reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->rmsn_sid.ses_nid == LNET_NID_ANY) { + reply->rmsn_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) { + reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY; + return 0; + } + + if (!atomic_dec_and_test(&sn->sn_refcount)) { + reply->rmsn_status = 0; + return 0; + } + + spin_lock(&sfw_data.fw_lock); + sfw_deactivate_session(); + spin_unlock(&sfw_data.fw_lock); + + reply->rmsn_status = 0; + reply->rmsn_sid = LST_INVALID_SID; + LASSERT(sfw_data.fw_session == NULL); + return 0; +} + +static int +sfw_debug_session(struct srpc_debug_reqst *request, + struct srpc_debug_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + + if (sn == NULL) { + reply->dbg_status = ESRCH; + reply->dbg_sid = LST_INVALID_SID; + return 0; + } + + reply->dbg_status = 0; + reply->dbg_sid = sn->sn_id; + reply->dbg_timeout = sn->sn_timeout; + if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name)) + >= sizeof(reply->dbg_name)) + return -E2BIG; + + return 0; +} + +static void +sfw_test_rpc_fini(struct srpc_client_rpc *rpc) +{ + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; + + /* Called with hold of tsi->tsi_lock */ + LASSERT(list_empty(&rpc->crpc_list)); + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); +} + +static inline int +sfw_test_buffers(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + LASSERT(svc != NULL); + + nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts; + return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA); +} + +static int +sfw_load_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + int rc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + nbuf = sfw_test_buffers(tsi); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + + if (tsi->tsi_is_client) { + tsi->tsi_ops = tsc->tsc_cli_ops; + return 0; + } + + rc = srpc_service_add_buffers(svc, nbuf); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc); + /* NB: this error handler is not strictly correct, because + * it may release more buffers than already allocated, + * but it doesn't matter because request portal should + * be lazy portal and will grow buffers if necessary. */ + srpc_service_remove_buffers(svc, nbuf); + return -ENOMEM; + } + + CDEBUG(D_NET, "Reserved %d buffers for test %s\n", + nbuf * (srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name); + return 0; +} + +static void +sfw_unload_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + LASSERT(tsc != NULL); + + if (tsi->tsi_is_client) + return; + + /* shrink buffers, because request portal is lazy portal + * which can grow buffers at runtime so we may leave + * some buffers behind, but never mind... */ + srpc_service_remove_buffers(tsc->tsc_srv_service, + sfw_test_buffers(tsi)); + return; +} + +static void +sfw_destroy_test_instance(struct sfw_test_instance *tsi) +{ + struct srpc_client_rpc *rpc; + struct sfw_test_unit *tsu; + + if (!tsi->tsi_is_client) goto clean; + + tsi->tsi_ops->tso_fini(tsi); + + LASSERT(!tsi->tsi_stopping); + LASSERT(list_empty(&tsi->tsi_active_rpcs)); + LASSERT(!sfw_test_active(tsi)); + + while (!list_empty(&tsi->tsi_units)) { + tsu = list_entry(tsi->tsi_units.next, + struct sfw_test_unit, tsu_list); + list_del(&tsu->tsu_list); + LIBCFS_FREE(tsu, sizeof(*tsu)); + } + + while (!list_empty(&tsi->tsi_free_rpcs)) { + rpc = list_entry(tsi->tsi_free_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + +clean: + sfw_unload_test(tsi); + LIBCFS_FREE(tsi, sizeof(*tsi)); + return; +} + +static void +sfw_destroy_batch(struct sfw_batch *tsb) +{ + struct sfw_test_instance *tsi; + + LASSERT(!sfw_batch_active(tsb)); + LASSERT(list_empty(&tsb->bat_list)); + + while (!list_empty(&tsb->bat_tests)) { + tsi = list_entry(tsb->bat_tests.next, + struct sfw_test_instance, tsi_list); + list_del_init(&tsi->tsi_list); + sfw_destroy_test_instance(tsi); + } + + LIBCFS_FREE(tsb, sizeof(*tsb)); + return; +} + +static void +sfw_destroy_session(struct sfw_session *sn) +{ + struct sfw_batch *batch; + + LASSERT(list_empty(&sn->sn_list)); + LASSERT(sn != sfw_data.fw_session); + + while (!list_empty(&sn->sn_batches)) { + batch = list_entry(sn->sn_batches.next, + struct sfw_batch, bat_list); + list_del_init(&batch->bat_list); + sfw_destroy_batch(batch); + } + + LIBCFS_FREE(sn, sizeof(*sn)); + atomic_dec(&sfw_data.fw_nzombies); + return; +} + +static void +sfw_unpack_addtest_req(struct srpc_msg *msg) +{ + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + + LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST); + LASSERT (req->tsr_is_client); + + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (req->tsr_service == SRPC_SERVICE_BRW) { + if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + struct test_bulk_req *bulk = &req->tsr_u.bulk_v0; + + __swab32s(&bulk->blk_opc); + __swab32s(&bulk->blk_npg); + __swab32s(&bulk->blk_flags); + + } else { + struct test_bulk_req_v1 *bulk = &req->tsr_u.bulk_v1; + + __swab16s(&bulk->blk_opc); + __swab16s(&bulk->blk_flags); + __swab32s(&bulk->blk_offset); + __swab32s(&bulk->blk_len); + } + + return; + } + + if (req->tsr_service == SRPC_SERVICE_PING) { + struct test_ping_req *ping = &req->tsr_u.ping; + + __swab32s(&ping->png_size); + __swab32s(&ping->png_flags); + return; + } + + LBUG(); + return; +} + +static int +sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) +{ + struct srpc_msg *msg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + struct srpc_bulk *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; + int i; + int rc; + + 
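+	/* Allocate the test instance and fill it from the (possibly
+	 * byte-swapped) ADD_TEST request; client instances additionally get
+	 * one test unit per destination per concurrency slot further down. */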
LIBCFS_ALLOC(tsi, sizeof(*tsi)); + if (tsi == NULL) { + CERROR ("Can't allocate test instance for batch: %llu\n", + tsb->bat_id.bat_id); + return -ENOMEM; + } + + spin_lock_init(&tsi->tsi_lock); + atomic_set(&tsi->tsi_nactive, 0); + INIT_LIST_HEAD(&tsi->tsi_units); + INIT_LIST_HEAD(&tsi->tsi_free_rpcs); + INIT_LIST_HEAD(&tsi->tsi_active_rpcs); + + tsi->tsi_stopping = 0; + tsi->tsi_batch = tsb; + tsi->tsi_loop = req->tsr_loop; + tsi->tsi_concur = req->tsr_concur; + tsi->tsi_service = req->tsr_service; + tsi->tsi_is_client = !!(req->tsr_is_client); + tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); + + rc = sfw_load_test(tsi); + if (rc != 0) { + LIBCFS_FREE(tsi, sizeof(*tsi)); + return rc; + } + + LASSERT (!sfw_batch_active(tsb)); + + if (!tsi->tsi_is_client) { + /* it's test server, just add it to tsb */ + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + + LASSERT (bk != NULL); + LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest); + LASSERT((unsigned int)bk->bk_len >= + sizeof(struct lnet_process_id_packed) * ndest); + + sfw_unpack_addtest_req(msg); + memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u)); + + for (i = 0; i < ndest; i++) { + struct lnet_process_id_packed *dests; + struct lnet_process_id_packed id; + int j; + + dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page); + LASSERT (dests != NULL); /* my pages are within KVM always */ + id = dests[i % SFW_ID_PER_PAGE]; + if (msg->msg_magic != SRPC_MSG_MAGIC) + sfw_unpack_id(id); + + for (j = 0; j < tsi->tsi_concur; j++) { + LIBCFS_ALLOC(tsu, sizeof(*tsu)); + if (tsu == NULL) { + rc = -ENOMEM; + CERROR ("Can't allocate tsu for %d\n", + tsi->tsi_service); + goto error; + } + + tsu->tsu_dest.nid = id.nid; + tsu->tsu_dest.pid = id.pid; + tsu->tsu_instance = tsi; + tsu->tsu_private = NULL; + list_add_tail(&tsu->tsu_list, &tsi->tsi_units); + } + } + + rc = tsi->tsi_ops->tso_init(tsi); + if (rc == 0) { + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + +error: + LASSERT(rc != 0); + sfw_destroy_test_instance(tsi); + return rc; +} + +static void +sfw_test_unit_done(struct sfw_test_unit *tsu) +{ + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_batch *tsb = tsi->tsi_batch; + struct sfw_session *sn = tsb->bat_session; + + LASSERT (sfw_test_active(tsi)); + + if (!atomic_dec_and_test(&tsi->tsi_nactive)) + return; + + /* the test instance is done */ + spin_lock(&tsi->tsi_lock); + + tsi->tsi_stopping = 0; + + spin_unlock(&tsi->tsi_lock); + + spin_lock(&sfw_data.fw_lock); + + if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */ + sn == sfw_data.fw_session) { /* sn also active */ + spin_unlock(&sfw_data.fw_lock); + return; + } + + LASSERT(!list_empty(&sn->sn_list)); /* I'm a zombie! 
*/ + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + spin_unlock(&sfw_data.fw_lock); + return; + } + } + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + return; +} + +static void +sfw_test_rpc_done(struct srpc_client_rpc *rpc) +{ + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; + int done = 0; + + tsi->tsi_ops->tso_done_rpc(tsu, rpc); + + spin_lock(&tsi->tsi_lock); + + LASSERT(sfw_test_active(tsi)); + LASSERT(!list_empty(&rpc->crpc_list)); + + list_del_init(&rpc->crpc_list); + + /* batch is stopping or loop is done or get error */ + if (tsi->tsi_stopping || + tsu->tsu_loop == 0 || + (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr)) + done = 1; + + /* dec ref for poster */ + srpc_client_rpc_decref(rpc); + + spin_unlock(&tsi->tsi_lock); + + if (!done) { + swi_schedule_workitem(&tsu->tsu_worker); + return; + } + + sfw_test_unit_done(tsu); + return; +} + +int +sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, + unsigned features, int nblk, int blklen, + struct srpc_client_rpc **rpcpp) +{ + struct srpc_client_rpc *rpc = NULL; + struct sfw_test_instance *tsi = tsu->tsu_instance; + + spin_lock(&tsi->tsi_lock); + + LASSERT (sfw_test_active(tsi)); + + if (!list_empty(&tsi->tsi_free_rpcs)) { + /* pick request from buffer */ + rpc = list_entry(tsi->tsi_free_rpcs.next, + struct srpc_client_rpc, crpc_list); + LASSERT(nblk == rpc->crpc_bulk.bk_niov); + list_del_init(&rpc->crpc_list); + } + + spin_unlock(&tsi->tsi_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } else { + srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } + + if (rpc == NULL) { + CERROR("Can't create rpc for test %d\n", tsi->tsi_service); + return -ENOMEM; + } + + rpc->crpc_reqstmsg.msg_ses_feats = features; + *rpcpp = rpc; + + return 0; +} + +static int +sfw_run_test(struct swi_workitem *wi) +{ + struct sfw_test_unit *tsu = wi->swi_workitem.wi_data; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct srpc_client_rpc *rpc = NULL; + + LASSERT (wi == &tsu->tsu_worker); + + if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) { + LASSERT (rpc == NULL); + goto test_done; + } + + LASSERT (rpc != NULL); + + spin_lock(&tsi->tsi_lock); + + if (tsi->tsi_stopping) { + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); + spin_unlock(&tsi->tsi_lock); + goto test_done; + } + + if (tsu->tsu_loop > 0) + tsu->tsu_loop--; + + list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs); + spin_unlock(&tsi->tsi_lock); + + spin_lock(&rpc->crpc_lock); + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + spin_unlock(&rpc->crpc_lock); + return 0; + +test_done: + /* + * No one can schedule me now since: + * - previous RPC, if any, has done and + * - no new RPC is initiated. + * - my batch is still active; no one can run it again now. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + swi_exit_workitem(wi); + sfw_test_unit_done(tsu); + return 1; +} + +static int +sfw_run_batch(struct sfw_batch *tsb) +{ + struct swi_workitem *wi; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; + + if (sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch already active: %llu (%d)\n", + tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive)); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (!tsi->tsi_is_client) /* skip server instances */ + continue; + + LASSERT(!tsi->tsi_stopping); + LASSERT(!sfw_test_active(tsi)); + + atomic_inc(&tsb->bat_nactive); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + atomic_inc(&tsi->tsi_nactive); + tsu->tsu_loop = tsi->tsi_loop; + wi = &tsu->tsu_worker; + swi_init_workitem(wi, tsu, sfw_run_test, + lst_sched_test[\ + lnet_cpt_of_nid(tsu->tsu_dest.nid, + NULL)]); + swi_schedule_workitem(wi); + } + } + + return 0; +} + +static int +sfw_stop_batch(struct sfw_batch *tsb, int force) +{ + struct sfw_test_instance *tsi; + struct srpc_client_rpc *rpc; + + if (!sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + spin_lock(&tsi->tsi_lock); + + if (!tsi->tsi_is_client || + !sfw_test_active(tsi) || tsi->tsi_stopping) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + tsi->tsi_stopping = 1; + + if (!force) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + /* abort launched rpcs in the test */ + list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) { + spin_lock(&rpc->crpc_lock); + + srpc_abort_rpc(rpc, -EINTR); + + spin_unlock(&rpc->crpc_lock); + } + + spin_unlock(&tsi->tsi_lock); + } + + return 0; +} + +static int +sfw_query_batch(struct sfw_batch *tsb, int testidx, + struct srpc_batch_reply *reply) +{ + struct sfw_test_instance *tsi; + + if (testidx < 0) + return -EINVAL; + + if (testidx == 0) { + reply->bar_active = atomic_read(&tsb->bat_nactive); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (testidx-- > 1) + continue; + + reply->bar_active = atomic_read(&tsi->tsi_nactive); + return 0; + } + + return -ENOENT; +} + +void +sfw_free_pages(struct srpc_server_rpc *rpc) +{ + srpc_free_bulk(rpc->srpc_bulk); + rpc->srpc_bulk = NULL; +} + +int +sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink) +{ + LASSERT(rpc->srpc_bulk == NULL); + LASSERT(npages > 0 && npages <= LNET_MAX_IOV); + + rpc->srpc_bulk = srpc_alloc_bulk(cpt, 0, npages, len, sink); + if (rpc->srpc_bulk == NULL) + return -ENOMEM; + + return 0; +} + +static int +sfw_add_test(struct srpc_server_rpc *rpc) +{ + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_test_reply *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + struct srpc_test_reqst *request; + int rc; + struct sfw_batch *bat; + + request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; + reply->tsr_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; + + if (request->tsr_loop == 0 || + request->tsr_concur == 0 || + request->tsr_sid.ses_nid == LNET_NID_ANY || + request->tsr_ndest > SFW_MAX_NDESTS || + (request->tsr_is_client && request->tsr_ndest == 0) || + request->tsr_concur > SFW_MAX_CONCUR || + request->tsr_service > SRPC_SERVICE_MAX_ID || + request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) { + reply->tsr_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) || + sfw_find_test_case(request->tsr_service) == NULL) { + reply->tsr_status = ENOENT; + return 0; + } + + bat = sfw_bid2batch(request->tsr_bid); + if (bat == NULL) { + CERROR("dropping RPC %s from %s under memory pressure\n", + rpc->srpc_scd->scd_svc->sv_name, + libcfs_id2str(rpc->srpc_peer)); + return -ENOMEM; + } + + if (sfw_batch_active(bat)) { + reply->tsr_status = EBUSY; + return 0; + } + + if (request->tsr_is_client && rpc->srpc_bulk == NULL) { + /* rpc will be resumed later in sfw_bulk_ready */ + int npg = sfw_id_pages(request->tsr_ndest); + int len; + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + len = npg * PAGE_SIZE; + + } else { + len = sizeof(struct lnet_process_id_packed) * + request->tsr_ndest; + } + + return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1); + } + + rc = sfw_add_test_instance(bat, rpc); + CDEBUG (rc == 0 ? D_NET : D_WARNING, + "%s test: sv %d %s, loop %d, concur %d, ndest %d\n", + rc == 0 ? "Added" : "Failed to add", request->tsr_service, + request->tsr_is_client ? "client" : "server", + request->tsr_loop, request->tsr_concur, request->tsr_ndest); + + reply->tsr_status = (rc < 0) ? -rc : rc; + return 0; +} + +static int +sfw_control_batch(struct srpc_batch_reqst *request, + struct srpc_batch_reply *reply) +{ + struct sfw_session *sn = sfw_data.fw_session; + int rc = 0; + struct sfw_batch *bat; + + reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) { + reply->bar_status = ESRCH; + return 0; + } + + bat = sfw_find_batch(request->bar_bid); + if (bat == NULL) { + reply->bar_status = ENOENT; + return 0; + } + + switch (request->bar_opc) { + case SRPC_BATCH_OPC_RUN: + rc = sfw_run_batch(bat); + break; + + case SRPC_BATCH_OPC_STOP: + rc = sfw_stop_batch(bat, request->bar_arg); + break; + + case SRPC_BATCH_OPC_QUERY: + rc = sfw_query_batch(bat, request->bar_testidx, reply); + break; + + default: + return -EINVAL; /* drop it */ + } + + reply->bar_status = (rc < 0) ? 
-rc : rc; + return 0; +} + +static int +sfw_handle_server_rpc(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *reply = &rpc->srpc_replymsg; + struct srpc_msg *request = &rpc->srpc_reqstbuf->buf_msg; + unsigned features = LST_FEATS_MASK; + int rc = 0; + + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&sfw_data.fw_lock); + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + /* Remove timer to avoid racing with it or expiring active session */ + if (sfw_del_session_timer() != 0) { + CERROR("dropping RPC %s from %s: racing with expiry timer\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + sfw_unpack_message(request); + LASSERT(request->msg_type == srpc_service2request(sv->sv_id)); + + /* rpc module should have checked this */ + LASSERT(request->msg_version == SRPC_MSG_VERSION); + + if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && + sv->sv_id != SRPC_SERVICE_DEBUG) { + struct sfw_session *sn = sfw_data.fw_session; + + if (sn != NULL && + sn->sn_features != request->msg_ses_feats) { + CNETERR("Features of framework RPC don't match " + "features of current session: %x/%x\n", + request->msg_ses_feats, sn->sn_features); + reply->msg_body.reply.status = EPROTO; + reply->msg_body.reply.sid = sn->sn_id; + goto out; + } + + } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + /* NB: at this point, old version will ignore features and + * create new session anyway, so console should be able + * to handle this */ + reply->msg_body.reply.status = EPROTO; + goto out; + } + + switch(sv->sv_id) { + default: + LBUG (); + case SRPC_SERVICE_TEST: + rc = sfw_add_test(rpc); + break; + + case SRPC_SERVICE_BATCH: + rc = sfw_control_batch(&request->msg_body.bat_reqst, + &reply->msg_body.bat_reply); + break; + + case SRPC_SERVICE_QUERY_STAT: + rc = sfw_get_stats(&request->msg_body.stat_reqst, + &reply->msg_body.stat_reply); + break; + + case SRPC_SERVICE_DEBUG: + rc = sfw_debug_session(&request->msg_body.dbg_reqst, + &reply->msg_body.dbg_reply); + break; + + case SRPC_SERVICE_MAKE_SESSION: + rc = sfw_make_session(&request->msg_body.mksn_reqst, + &reply->msg_body.mksn_reply); + break; + + case SRPC_SERVICE_REMOVE_SESSION: + rc = sfw_remove_session(&request->msg_body.rmsn_reqst, + &reply->msg_body.rmsn_reply); + break; + } + + if (sfw_data.fw_session != NULL) + features = sfw_data.fw_session->sn_features; + out: + reply->msg_ses_feats = features; + rpc->srpc_done = sfw_server_rpc_done; + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +static int +sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int rc; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(sv->sv_id == SRPC_SERVICE_TEST); + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client); + + spin_lock(&sfw_data.fw_lock); + + if (status != 0) { + CERROR("Bulk transfer failed for RPC: " + "service %s, peer %s, status %d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), status); + spin_unlock(&sfw_data.fw_lock); + return -EIO; + } + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } 
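+	/* As in sfw_handle_server_rpc(), defuse the session timer before
+	 * touching session state; if the timer is already firing, drop the
+	 * RPC with -EAGAIN instead of racing with session expiry. */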
+ + if (sfw_del_session_timer() != 0) { + CERROR("dropping RPC %s from %s: racing with expiry timer\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + rc = sfw_add_test(rpc); + + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +struct srpc_client_rpc * +sfw_create_rpc(struct lnet_process_id peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(struct srpc_client_rpc *), void *priv) +{ + struct srpc_client_rpc *rpc = NULL; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (!sfw_data.fw_shuttingdown); + LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + + srpc_init_client_rpc(rpc, peer, service, 0, 0, + done, sfw_client_rpc_fini, priv); + } + + spin_unlock(&sfw_data.fw_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, service, + nbulkiov, bulklen, done, + nbulkiov != 0 ? NULL : + sfw_client_rpc_fini, + priv); + } + + if (rpc != NULL) /* "session" is concept in framework */ + rpc->crpc_reqstmsg.msg_ses_feats = features; + + return rpc; +} + +void +sfw_unpack_message(struct srpc_msg *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* srpc module should guarantee I wouldn't get crap */ + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (msg->msg_type == SRPC_MSG_STAT_REQST) { + struct srpc_stat_reqst *req = &msg->msg_body.stat_reqst; + + __swab32s(&req->str_type); + __swab64s(&req->str_rpyid); + sfw_unpack_sid(req->str_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_STAT_REPLY) { + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + + __swab32s(&rep->str_status); + sfw_unpack_sid(rep->str_sid); + sfw_unpack_fw_counters(rep->str_fw); + sfw_unpack_rpc_counters(rep->str_rpc); + sfw_unpack_lnet_counters(rep->str_lnet); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REQST) { + struct srpc_mksn_reqst *req = &msg->msg_body.mksn_reqst; + + __swab64s(&req->mksn_rpyid); + __swab32s(&req->mksn_force); + sfw_unpack_sid(req->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { + struct srpc_mksn_reply *rep = &msg->msg_body.mksn_reply; + + __swab32s(&rep->mksn_status); + __swab32s(&rep->mksn_timeout); + sfw_unpack_sid(rep->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REQST) { + struct srpc_rmsn_reqst *req = &msg->msg_body.rmsn_reqst; + + __swab64s(&req->rmsn_rpyid); + sfw_unpack_sid(req->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { + struct srpc_rmsn_reply *rep = &msg->msg_body.rmsn_reply; + + __swab32s(&rep->rmsn_status); + sfw_unpack_sid(rep->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REQST) { + struct srpc_debug_reqst *req = &msg->msg_body.dbg_reqst; + + __swab64s(&req->dbg_rpyid); + __swab32s(&req->dbg_flags); + sfw_unpack_sid(req->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { + struct srpc_debug_reply *rep = &msg->msg_body.dbg_reply; + + __swab32s(&rep->dbg_nbatch); + __swab32s(&rep->dbg_timeout); + sfw_unpack_sid(rep->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REQST) { + struct srpc_batch_reqst *req = 
&msg->msg_body.bat_reqst; + + __swab32s(&req->bar_opc); + __swab64s(&req->bar_rpyid); + __swab32s(&req->bar_testidx); + __swab32s(&req->bar_arg); + sfw_unpack_sid(req->bar_sid); + __swab64s(&req->bar_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; + + __swab32s(&rep->bar_status); + sfw_unpack_sid(rep->bar_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REQST) { + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + + __swab64s(&req->tsr_rpyid); + __swab64s(&req->tsr_bulkid); + __swab32s(&req->tsr_loop); + __swab32s(&req->tsr_ndest); + __swab32s(&req->tsr_concur); + __swab32s(&req->tsr_service); + sfw_unpack_sid(req->tsr_sid); + __swab64s(&req->tsr_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REPLY) { + struct srpc_test_reply *rep = &msg->msg_body.tes_reply; + + __swab32s(&rep->tsr_status); + sfw_unpack_sid(rep->tsr_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REQST) { + struct srpc_join_reqst *req = &msg->msg_body.join_reqst; + + __swab64s(&req->join_rpyid); + sfw_unpack_sid(req->join_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { + struct srpc_join_reply *rep = &msg->msg_body.join_reply; + + __swab32s(&rep->join_status); + __swab32s(&rep->join_timeout); + sfw_unpack_sid(rep->join_sid); + return; + } + + LBUG (); + return; +} + +void +sfw_abort_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT(atomic_read(&rpc->crpc_refcount) > 0); + LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, -EINTR); + spin_unlock(&rpc->crpc_lock); + return; +} + +void +sfw_post_rpc(struct srpc_client_rpc *rpc) +{ + spin_lock(&rpc->crpc_lock); + + LASSERT(!rpc->crpc_closed); + LASSERT(!rpc->crpc_aborted); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(!sfw_data.fw_shuttingdown); + + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + + spin_unlock(&rpc->crpc_lock); + return; +} + +static struct srpc_service sfw_services[] = { + { .sv_id = SRPC_SERVICE_DEBUG, .sv_name = "debug", }, + { .sv_id = SRPC_SERVICE_QUERY_STAT, .sv_name = "query stats", }, + { .sv_id = SRPC_SERVICE_MAKE_SESSION, .sv_name = "make session", }, + { .sv_id = SRPC_SERVICE_REMOVE_SESSION, .sv_name = "remove session", }, + { .sv_id = SRPC_SERVICE_BATCH, .sv_name = "batch service", }, + { .sv_id = SRPC_SERVICE_TEST, .sv_name = "test service", }, + { .sv_id = 0, } }; + +int +sfw_startup (void) +{ + int i; + int rc; + int error; + struct srpc_service *sv; + struct sfw_test_case *tsc; + + + if (session_timeout < 0) { + CERROR ("Session timeout must be non-negative: %d\n", + session_timeout); + return -EINVAL; + } + + if (rpc_timeout < 0) { + CERROR ("RPC timeout must be non-negative: %d\n", + rpc_timeout); + return -EINVAL; + } + + if (session_timeout == 0) + CWARN ("Zero session_timeout specified " + "- test sessions never expire.\n"); + + if (rpc_timeout == 0) + CWARN ("Zero rpc_timeout specified " + "- test RPC never expire.\n"); + + memset(&sfw_data, 0, sizeof(struct smoketest_framework)); + + sfw_data.fw_session = NULL; + sfw_data.fw_active_srpc = NULL; + spin_lock_init(&sfw_data.fw_lock); + atomic_set(&sfw_data.fw_nzombies, 0); + INIT_LIST_HEAD(&sfw_data.fw_tests); + INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs); + INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions); + + brw_init_test_client(); + brw_init_test_service(); + rc = sfw_register_test(&brw_test_service, &brw_test_client); + LASSERT (rc == 0); + + ping_init_test_client(); + 
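+	/* same pattern as the brw test above: set up the client ops and the
+	 * server-side service, then register them as one test case */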
ping_init_test_service(); + rc = sfw_register_test(&ping_test_service, &ping_test_client); + LASSERT (rc == 0); + + error = 0; + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + + rc = srpc_add_service(sv); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CWARN("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) break; + + sv->sv_bulk_ready = NULL; + sv->sv_handler = sfw_handle_server_rpc; + sv->sv_wi_total = SFW_FRWK_WI_MAX; + if (sv->sv_id == SRPC_SERVICE_TEST) + sv->sv_bulk_ready = sfw_bulk_ready; + + rc = srpc_add_service(sv); + LASSERT (rc != -EBUSY); + if (rc != 0) { + CWARN ("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + + /* about to sfw_shutdown, no need to add buffer */ + if (error) continue; + + rc = srpc_service_add_buffers(sv, sv->sv_wi_total); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", + sv->sv_name, sv->sv_wi_total, rc); + error = -ENOMEM; + } + } + + if (error != 0) + sfw_shutdown(); + return error; +} + +void +sfw_shutdown (void) +{ + struct srpc_service *sv; + struct sfw_test_case *tsc; + int i; + + spin_lock(&sfw_data.fw_lock); + + sfw_data.fw_shuttingdown = 1; + lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock, + "waiting for active RPC to finish.\n"); + + if (sfw_del_session_timer() != 0) + lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock, + "waiting for session timer to explode.\n"); + + sfw_deactivate_session(); + lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0, + sfw_data.fw_lock, + "waiting for %d zombie sessions to die.\n", + atomic_read(&sfw_data.fw_nzombies)); + + spin_unlock(&sfw_data.fw_lock); + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + while (!list_empty(&sfw_data.fw_zombie_rpcs)) { + struct srpc_client_rpc *rpc; + + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + struct srpc_client_rpc, crpc_list); + list_del(&rpc->crpc_list); + + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_wait_service_shutdown(sv); + } + + while (!list_empty(&sfw_data.fw_tests)) { + tsc = list_entry(sfw_data.fw_tests.next, + struct sfw_test_case, tsc_list); + + srpc_wait_service_shutdown(tsc->tsc_srv_service); + + list_del(&tsc->tsc_list); + LIBCFS_FREE(tsc, sizeof(*tsc)); + } + + return; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/module.c b/drivers/staging/lustrefsx/lnet/selftest/module.c new file mode 100644 index 0000000000000..e0baadb6b9202 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/module.c @@ -0,0 +1,168 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" +#include "console.h" + +enum { + LST_INIT_NONE = 0, + LST_INIT_WI_SERIAL, + LST_INIT_WI_TEST, + LST_INIT_RPC, + LST_INIT_FW, + LST_INIT_CONSOLE +}; + +static int lst_init_step = LST_INIT_NONE; + +struct cfs_wi_sched *lst_sched_serial; +struct cfs_wi_sched **lst_sched_test; + +static void +lnet_selftest_exit(void) +{ + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + fallthrough; + case LST_INIT_FW: + sfw_shutdown(); + fallthrough; + case LST_INIT_RPC: + srpc_shutdown(); + fallthrough; + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + LIBCFS_FREE(lst_sched_test, + sizeof(lst_sched_test[0]) * + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + fallthrough; + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + fallthrough; + case LST_INIT_NONE: + break; + default: + LBUG(); + } +} + +void +lnet_selftest_structure_assertion(void) +{ + CLASSERT(sizeof(struct srpc_msg) == 160); + CLASSERT(sizeof(struct srpc_test_reqst) == 70); + CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_concur) == 72); + CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_ndest) == 78); + CLASSERT(sizeof(struct srpc_stat_reply) == 136); + CLASSERT(sizeof(struct srpc_stat_reqst) == 28); +} + +static int __init +lnet_selftest_init(void) +{ + int nscheds; + int rc; + int i; + + rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, + 1, &lst_sched_serial); + if (rc != 0) { + CERROR("Failed to create serial WI scheduler for LST\n"); + return rc; + } + lst_init_step = LST_INIT_WI_SERIAL; + + nscheds = cfs_cpt_number(lnet_cpt_table()); + LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds); + if (lst_sched_test == NULL) + goto error; + + lst_init_step = LST_INIT_WI_TEST; + for (i = 0; i < nscheds; i++) { + int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + + /* reserve at least one CPU for LND */ + nthrs = max(nthrs - 1, 1); + rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, + nthrs, &lst_sched_test[i]); + if (rc != 0) { + CERROR("Failed to create CPU partition affinity WI scheduler %d for LST\n", + i); + goto error; + } + } + + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } + lst_init_step = LST_INIT_CONSOLE; + return 0; +error: + lnet_selftest_exit(); + return rc; +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("LNet Selftest"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(lnet_selftest_init); +module_exit(lnet_selftest_exit); diff --git a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c new file mode 100644 index 0000000000000..2d1403b34c7bc --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c @@ -0,0 +1,228 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * Test client & Server + * + * Author: Liang Zhen + */ + +#include "selftest.h" + +#define LST_PING_TEST_MAGIC 0xbabeface + +static int ping_srv_workitems = SFW_TEST_WI_MAX; +module_param(ping_srv_workitems, int, 0644); +MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); + +struct lst_ping_data { + spinlock_t pnd_lock; /* serialize */ + int pnd_counter; /* sequence counter */ +}; + +static struct lst_ping_data lst_ping_data; + +static int +ping_client_init(struct sfw_test_instance *tsi) +{ + struct sfw_session *sn = tsi->tsi_batch->bat_session; + + LASSERT(tsi->tsi_is_client); + LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); + + spin_lock_init(&lst_ping_data.pnd_lock); + lst_ping_data.pnd_counter = 0; + + return 0; +} + +static void +ping_client_fini(struct sfw_test_instance *tsi) +{ + struct sfw_session *sn = tsi->tsi_batch->bat_session; + int errors; + + LASSERT (sn != NULL); + LASSERT (tsi->tsi_is_client); + + errors = atomic_read(&sn->sn_ping_errors); + if (errors) + CWARN ("%d pings have failed.\n", errors); + else + CDEBUG (D_NET, "Ping test finished OK.\n"); +} + +static int +ping_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpc) +{ + struct srpc_ping_reqst *req; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct timespec64 ts; + int rc; + + LASSERT(sn != NULL); + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc); + if (rc != 0) + return rc; + + req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst; + + req->pnr_magic = LST_PING_TEST_MAGIC; + + spin_lock(&lst_ping_data.pnd_lock); + req->pnr_seq = lst_ping_data.pnd_counter++; + spin_unlock(&lst_ping_data.pnd_lock); + + ktime_get_real_ts64(&ts); + req->pnr_time_sec = ts.tv_sec; + req->pnr_time_nsec = ts.tv_nsec; + + return rc; +} + +static void +ping_client_done_rpc(struct 
sfw_test_unit *tsu, struct srpc_client_rpc *rpc) +{ + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_ping_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + struct srpc_ping_reply *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct timespec64 ts; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_ping_errors); + CERROR ("Unable to ping %s (%d): %d\n", + libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq, rpc->crpc_status); + return; + } + + if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { + __swab32s(&reply->pnr_seq); + __swab32s(&reply->pnr_magic); + __swab32s(&reply->pnr_status); + } + + if (reply->pnr_magic != LST_PING_TEST_MAGIC) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad magic %u from %s, %u expected.\n", + reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), + LST_PING_TEST_MAGIC); + return; + } + + if (reply->pnr_seq != reqst->pnr_seq) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad seq %u from %s, %u expected.\n", + reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq); + return; + } + + ktime_get_real_ts64(&ts); + CDEBUG(D_NET, "%d reply in %llu nsec\n", reply->pnr_seq, + (u64)((ts.tv_sec - reqst->pnr_time_sec) * NSEC_PER_SEC + + (ts.tv_nsec - reqst->pnr_time_nsec))); + return; +} + +static int +ping_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_msg *replymsg = &rpc->srpc_replymsg; + struct srpc_ping_reqst *req = &reqstmsg->msg_body.ping_reqst; + struct srpc_ping_reply *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + + LASSERT (sv->sv_id == SRPC_SERVICE_PING); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&req->pnr_seq); + __swab32s(&req->pnr_magic); + __swab64s(&req->pnr_time_sec); + __swab64s(&req->pnr_time_nsec); + } + LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id)); + + if (req->pnr_magic != LST_PING_TEST_MAGIC) { + CERROR ("Unexpect magic %08x from %s\n", + req->pnr_magic, libcfs_id2str(rpc->srpc_peer)); + return -EINVAL; + } + + rep->pnr_seq = req->pnr_seq; + rep->pnr_magic = LST_PING_TEST_MAGIC; + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + rep->pnr_status = EPROTO; + return 0; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + CDEBUG(D_NET, "Get ping %d from %s\n", + req->pnr_seq, libcfs_id2str(rpc->srpc_peer)); + return 0; +} + +struct sfw_test_client_ops ping_test_client; + +void ping_init_test_client(void) +{ + ping_test_client.tso_init = ping_client_init; + ping_test_client.tso_fini = ping_client_fini; + ping_test_client.tso_prep_rpc = ping_client_prep_rpc; + ping_test_client.tso_done_rpc = ping_client_done_rpc; +} + +struct srpc_service ping_test_service; + +void ping_init_test_service(void) +{ + ping_test_service.sv_id = SRPC_SERVICE_PING; + ping_test_service.sv_name = "ping_test"; + ping_test_service.sv_handler = ping_server_handle; + ping_test_service.sv_wi_total = ping_srv_workitems; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.c b/drivers/staging/lustrefsx/lnet/selftest/rpc.c new file mode 100644 index 0000000000000..bd7a2d5ec0757 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.c @@ -0,0 
+1,1699 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/rpc.c + * + * Author: Isaac Huang + * + * 2012-05-13: Liang Zhen + * - percpt data for service to improve smp performance + * - code cleanup + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +enum srpc_state { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +}; + +static struct smoketest_rpc { + spinlock_t rpc_glock; /* global lock */ + struct srpc_service *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ + enum srpc_state rpc_state; + struct srpc_counters rpc_counters; + __u64 rpc_matchbits; /* matchbits counter */ +} srpc_data; + +static inline int +srpc_serv_portal(int svc_id) +{ + return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ? 
+ SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL; +} + +/* forward ref's */ +static int srpc_handle_rpc(struct swi_workitem *wi); + +void srpc_get_counters(struct srpc_counters *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + *cnt = srpc_data.rpc_counters; + spin_unlock(&srpc_data.rpc_glock); +} + +void srpc_set_counters(const struct srpc_counters *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters = *cnt; + spin_unlock(&srpc_data.rpc_glock); +} + +static int +srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int off, + int nob) +{ + LASSERT(off < PAGE_SIZE); + LASSERT(nob > 0 && nob <= PAGE_SIZE); + + bk->bk_iovs[i].kiov_offset = off; + bk->bk_iovs[i].kiov_page = pg; + bk->bk_iovs[i].kiov_len = nob; + return nob; +} + +void +srpc_free_bulk(struct srpc_bulk *bk) +{ + int i; + struct page *pg; + + LASSERT(bk != NULL); + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (pg == NULL) + break; + + __free_page(pg); + } + + LIBCFS_FREE(bk, offsetof(struct srpc_bulk, bk_iovs[bk->bk_niov])); + return; +} + +struct srpc_bulk * +srpc_alloc_bulk(int cpt, unsigned bulk_off, unsigned bulk_npg, + unsigned bulk_len, int sink) +{ + struct srpc_bulk *bk; + int i; + + LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); + + LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, + offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); + if (bk == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); + return NULL; + } + + memset(bk, 0, offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); + bk->bk_sink = sink; + bk->bk_len = bulk_len; + bk->bk_niov = bulk_npg; + + for (i = 0; i < bulk_npg; i++) { + struct page *pg; + int nob; + + pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL); + if (pg == NULL) { + CERROR("Can't allocate page %d of %d\n", i, bulk_npg); + srpc_free_bulk(bk); + return NULL; + } + + nob = min_t(unsigned, bulk_off + bulk_len, PAGE_SIZE) - + bulk_off; + + srpc_add_bulk_page(bk, pg, i, bulk_off, nob); + bulk_len -= nob; + bulk_off = 0; + } + + return bk; +} + +static inline __u64 +srpc_next_id (void) +{ + __u64 id; + + spin_lock(&srpc_data.rpc_glock); + id = srpc_data.rpc_matchbits++; + spin_unlock(&srpc_data.rpc_glock); + return id; +} + +static void +srpc_init_server_rpc(struct srpc_server_rpc *rpc, + struct srpc_service_cd *scd, + struct srpc_buffer *buffer) +{ + memset(rpc, 0, sizeof(*rpc)); + swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc, + srpc_serv_is_framework(scd->scd_svc) ? 
+ lst_sched_serial : lst_sched_test[scd->scd_cpt]); + + rpc->srpc_ev.ev_fired = 1; /* no event expected now */ + + rpc->srpc_scd = scd; + rpc->srpc_reqstbuf = buffer; + rpc->srpc_peer = buffer->buf_peer; + rpc->srpc_self = buffer->buf_self; + LNetInvalidateMDHandle(&rpc->srpc_replymdh); +} + +static void +srpc_service_fini(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; + + if (svc->sv_cpt_data == NULL) + return; + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + while (1) { + if (!list_empty(&scd->scd_buf_posted)) + q = &scd->scd_buf_posted; + else if (!list_empty(&scd->scd_buf_blocked)) + q = &scd->scd_buf_blocked; + else + break; + + while (!list_empty(q)) { + buf = list_entry(q->next, + struct srpc_buffer, + buf_list); + list_del(&buf->buf_list); + LIBCFS_FREE(buf, sizeof(*buf)); + } + } + + LASSERT(list_empty(&scd->scd_rpc_active)); + + while (!list_empty(&scd->scd_rpc_free)) { + rpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&rpc->srpc_list); + LIBCFS_FREE(rpc, sizeof(*rpc)); + } + } + + cfs_percpt_free(svc->sv_cpt_data); + svc->sv_cpt_data = NULL; +} + +static int +srpc_service_nrpcs(struct srpc_service *svc) +{ + int nrpcs = svc->sv_wi_total / svc->sv_ncpts; + + return srpc_serv_is_framework(svc) ? + max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); +} + +int srpc_add_buffer(struct swi_workitem *wi); + +static int +srpc_service_init(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; + + svc->sv_shuttingdown = 0; + + svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct srpc_service_cd)); + if (svc->sv_cpt_data == NULL) + return -ENOMEM; + + svc->sv_ncpts = srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(lnet_cpt_table()); + nrpcs = srpc_service_nrpcs(svc); + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + scd->scd_cpt = i; + scd->scd_svc = svc; + spin_lock_init(&scd->scd_lock); + INIT_LIST_HEAD(&scd->scd_rpc_free); + INIT_LIST_HEAD(&scd->scd_rpc_active); + INIT_LIST_HEAD(&scd->scd_buf_posted); + INIT_LIST_HEAD(&scd->scd_buf_blocked); + + scd->scd_ev.ev_data = scd; + scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; + + /* NB: don't use lst_sched_serial for adding buffer, + * see details in srpc_service_add_buffers() */ + swi_init_workitem(&scd->scd_buf_wi, scd, + srpc_add_buffer, lst_sched_test[i]); + + if (i != 0 && srpc_serv_is_framework(svc)) { + /* NB: framework service only needs srpc_service_cd for + * one partition, but we allocate for all to make + * it easier to implement, it will waste a little + * memory but nobody should care about this */ + continue; + } + + for (j = 0; j < nrpcs; j++) { + LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(), + i, sizeof(*rpc)); + if (rpc == NULL) { + srpc_service_fini(svc); + return -ENOMEM; + } + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + } + + return 0; +} + +int +srpc_add_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID); + + if (srpc_service_init(sv) != 0) + return -ENOMEM; + + spin_lock(&srpc_data.rpc_glock); + + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + if (srpc_data.rpc_services[id] != NULL) { + spin_unlock(&srpc_data.rpc_glock); + goto failed; + } + + srpc_data.rpc_services[id] = sv; + spin_unlock(&srpc_data.rpc_glock); + + CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); + return 0; + +failed: + srpc_service_fini(sv); + return -EBUSY; +} + +int +srpc_remove_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + spin_lock(&srpc_data.rpc_glock); + + if (srpc_data.rpc_services[id] != sv) { + spin_unlock(&srpc_data.rpc_glock); + return -ENOENT; + } + + srpc_data.rpc_services[id] = NULL; + spin_unlock(&srpc_data.rpc_glock); + return 0; +} + +static int +srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, + int len, int options, struct lnet_process_id peer, + struct lnet_handle_md *mdh, struct srpc_event *ev) +{ + int rc; + struct lnet_md md; + struct lnet_handle_me meh; + + rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, + local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.eq_handle = srpc_data.rpc_lnet_eq; + + rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + + rc = LNetMEUnlink(meh); + LASSERT(rc == 0); + return -ENOMEM; + } + + CDEBUG(D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + return 0; +} + +static int +srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, + int options, struct lnet_process_id peer, + lnet_nid_t self, struct lnet_handle_md *mdh, + struct srpc_event *ev) +{ + int rc; + struct lnet_md md; + + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.eq_handle = srpc_data.rpc_lnet_eq; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 
2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDBind failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... + */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0, false); + } + + if (rc != 0) { + CERROR("LNet%s(%s, %d, %lld) failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. + */ + rc = LNetMDUnlink(*mdh); + LASSERT(rc == 0); + } else { + CDEBUG(D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; +} + +static int +srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, + struct lnet_handle_md *mdh, struct srpc_event *ev) +{ + struct lnet_process_id any = {0}; + + any.nid = LNET_NID_ANY; + any.pid = LNET_PID_ANY; + + return srpc_post_passive_rdma(srpc_serv_portal(service), + local, service, buf, len, + LNET_MD_OP_PUT, any, mdh, ev); +} + +static int +srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) +__must_hold(&scd->scd_lock) +{ + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; + + LNetInvalidateMDHandle(&buf->buf_mdh); + list_add(&buf->buf_list, &scd->scd_buf_posted); + scd->scd_buf_nposted++; + spin_unlock(&scd->scd_lock); + + rc = srpc_post_passive_rqtbuf(sv->sv_id, + !srpc_serv_is_framework(sv), + msg, sizeof(*msg), &buf->buf_mdh, + &scd->scd_ev); + + /* At this point, a RPC (new or delayed) may have arrived in + * msg and its event handler has been called. 
So we must add + * buf to scd_buf_posted _before_ dropping scd_lock */ + + spin_lock(&scd->scd_lock); + + if (rc == 0) { + if (!sv->sv_shuttingdown) + return 0; + + spin_unlock(&scd->scd_lock); + /* srpc_shutdown_service might have tried to unlink me + * when my buf_mdh was still invalid */ + LNetMDUnlink(buf->buf_mdh); + spin_lock(&scd->scd_lock); + return 0; + } + + scd->scd_buf_nposted--; + if (sv->sv_shuttingdown) + return rc; /* don't allow to change scd_buf_posted */ + + list_del(&buf->buf_list); + spin_unlock(&scd->scd_lock); + + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + return rc; +} + +int +srpc_add_buffer(struct swi_workitem *wi) +{ + struct srpc_service_cd *scd = container_of(wi, struct srpc_service_cd, + scd_buf_wi); + struct srpc_buffer *buf; + int rc = 0; + + /* it's called by workitem scheduler threads, these threads + * should have been set CPT affinity, so buffers will be posted + * on CPT local list of Portal */ + spin_lock(&scd->scd_lock); + + while (scd->scd_buf_adjust > 0 && + !scd->scd_svc->sv_shuttingdown) { + scd->scd_buf_adjust--; /* consume it */ + scd->scd_buf_posting++; + + spin_unlock(&scd->scd_lock); + + LIBCFS_ALLOC(buf, sizeof(*buf)); + if (buf == NULL) { + CERROR("Failed to add new buf to service: %s\n", + scd->scd_svc->sv_name); + spin_lock(&scd->scd_lock); + rc = -ENOMEM; + break; + } + + spin_lock(&scd->scd_lock); + if (scd->scd_svc->sv_shuttingdown) { + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + rc = -ESHUTDOWN; + break; + } + + rc = srpc_service_post_buffer(scd, buf); + if (rc != 0) + break; /* buf has been freed inside */ + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + scd->scd_buf_total++; + scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4); + } + + if (rc != 0) { + scd->scd_buf_err_stamp = ktime_get_real_seconds(); + scd->scd_buf_err = rc; + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + } + + spin_unlock(&scd->scd_lock); + return 0; +} + +int +srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int rc = 0; + int i; + + LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + scd->scd_buf_err = 0; + scd->scd_buf_err_stamp = 0; + scd->scd_buf_posting = 0; + scd->scd_buf_adjust = nbuffer; + /* start to post buffers */ + swi_schedule_workitem(&scd->scd_buf_wi); + spin_unlock(&scd->scd_lock); + + /* framework service only post buffer for one partition */ + if (srpc_serv_is_framework(sv)) + break; + } + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + /* + * NB: srpc_service_add_buffers() can be called inside + * thread context of lst_sched_serial, and we don't normally + * allow to sleep inside thread context of WI scheduler + * because it will block current scheduler thread from doing + * anything else, even worse, it could deadlock if it's + * waiting on result from another WI of the same scheduler. + * However, it's safe at here because scd_buf_wi is scheduled + * by thread in a different WI scheduler (lst_sched_test), + * so we don't have any risk of deadlock, though this could + * block all WIs pending on lst_sched_serial for a moment + * which is not good but not fatal. 
+ */ + lst_wait_until(scd->scd_buf_err != 0 || + (scd->scd_buf_adjust == 0 && + scd->scd_buf_posting == 0), + scd->scd_lock, "waiting for adding buffer\n"); + + if (scd->scd_buf_err != 0 && rc == 0) + rc = scd->scd_buf_err; + + spin_unlock(&scd->scd_lock); + } + + return rc; +} + +void +srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int num; + int i; + + LASSERT(!sv->sv_shuttingdown); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + num = scd->scd_buf_total + scd->scd_buf_posting; + scd->scd_buf_adjust -= min(nbuffer, num); + + spin_unlock(&scd->scd_lock); + } +} + +/* returns 1 if sv has finished, otherwise 0 */ +int +srpc_finish_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + if (!swi_deschedule_workitem(&scd->scd_buf_wi)) { + spin_unlock(&scd->scd_lock); + return 0; + } + + if (scd->scd_buf_nposted > 0) { + CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n", + scd->scd_buf_nposted); + spin_unlock(&scd->scd_lock); + return 0; + } + + if (list_empty(&scd->scd_rpc_active)) { + spin_unlock(&scd->scd_lock); + continue; + } + + rpc = list_entry(scd->scd_rpc_active.next, + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + rpc->srpc_wi.swi_workitem.wi_scheduled, + rpc->srpc_wi.swi_workitem.wi_running, + rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type, + rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet); + spin_unlock(&scd->scd_lock); + return 0; + } + + /* no lock needed from now on */ + srpc_service_fini(sv); + return 1; +} + +/* called with sv->sv_lock held */ +static void +srpc_service_recycle_buffer(struct srpc_service_cd *scd, + struct srpc_buffer *buf) +__must_hold(&scd->scd_lock) +{ + if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { + if (srpc_service_post_buffer(scd, buf) != 0) { + CWARN("Failed to post %s buffer\n", + scd->scd_svc->sv_name); + } + return; + } + + /* service is shutting down, or we want to recycle some buffers */ + scd->scd_buf_total--; + + if (scd->scd_buf_adjust < 0) { + scd->scd_buf_adjust++; + if (scd->scd_buf_adjust < 0 && + scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) { + CDEBUG(D_INFO, + "Try to recycle %d buffers but nothing left\n", + scd->scd_buf_adjust); + scd->scd_buf_adjust = 0; + } + } + + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + spin_lock(&scd->scd_lock); +} + +void +srpc_abort_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + CDEBUG(D_NET, "Aborting service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the abort, NB: + * racing with incoming RPCs; complete fix should make test + * RPCs carry session ID in its headers + */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { + rpc->srpc_aborted = 1; + swi_schedule_workitem(&rpc->srpc_wi); + } + + spin_unlock(&scd->scd_lock); + } +} + +void +srpc_shutdown_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer
*buf; + int i; + + CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_lock(&scd->scd_lock); + + sv->sv_shuttingdown = 1; /* i.e. no new active RPC */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_unlock(&scd->scd_lock); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the shutdown */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) + swi_schedule_workitem(&rpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + + /* OK to traverse scd_buf_posted without lock, since no one + * touches scd_buf_posted now + */ + list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) + LNetMDUnlink(buf->buf_mdh); + } +} + +static int +srpc_send_request(struct srpc_client_rpc *rpc) +{ + struct srpc_event *ev = &rpc->crpc_reqstev; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; + + rc = srpc_post_active_rdma(srpc_serv_portal(rpc->crpc_service), + rpc->crpc_service, &rpc->crpc_reqstmsg, + sizeof(struct srpc_msg), LNET_MD_OP_PUT, + rpc->crpc_dest, LNET_NID_ANY, + &rpc->crpc_reqstmdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_reply(struct srpc_client_rpc *rpc) +{ + struct srpc_event *ev = &rpc->crpc_replyev; + u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &rpc->crpc_replymsg, + sizeof(struct srpc_msg), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_bulk(struct srpc_client_rpc *rpc) +{ + struct srpc_bulk *bk = &rpc->crpc_bulk; + struct srpc_event *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk->bk_niov <= LNET_MAX_IOV); + + /* nothing to do */ + if (bk->bk_niov == 0) + return 0; + + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_do_bulk(struct srpc_server_rpc *rpc) +{ + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_bulk *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? 
SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* only called from srpc_handle_rpc */ +static void +srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + struct srpc_buffer *buffer; + + LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + + rpc->srpc_status = status; + + CDEBUG_LIMIT(status == 0 ? D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); + + if (status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_dropped++; + spin_unlock(&srpc_data.rpc_glock); + } + + if (rpc->srpc_done != NULL) + (*rpc->srpc_done) (rpc); + LASSERT(rpc->srpc_bulk == NULL); + + spin_lock(&scd->scd_lock); + + if (rpc->srpc_reqstbuf != NULL) { + /* NB might drop sv_lock in srpc_service_recycle_buffer, but + * sv won't go away for scd_rpc_active must not be empty + */ + srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); + rpc->srpc_reqstbuf = NULL; + } + + list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */ + + /* + * No one can schedule me now since: + * - I'm not on scd_rpc_active. + * - all LNet events have been fired. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(rpc->srpc_ev.ev_fired); + swi_exit_workitem(&rpc->srpc_wi); + + if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { + buffer = list_entry(scd->scd_buf_blocked.next, + struct srpc_buffer, buf_list); + list_del(&buffer->buf_list); + + srpc_init_server_rpc(rpc, scd, buffer); + list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active); + swi_schedule_workitem(&rpc->srpc_wi); + } else { + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + + spin_unlock(&scd->scd_lock); + return; +} + +/* handles an incoming RPC */ +static int srpc_handle_rpc(struct swi_workitem *wi) +{ + struct srpc_server_rpc *rpc = container_of(wi, struct srpc_server_rpc, + srpc_wi); + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + struct srpc_event *ev = &rpc->srpc_ev; + int rc = 0; + + LASSERT(wi == &rpc->srpc_wi); + + spin_lock(&scd->scd_lock); + + if (sv->sv_shuttingdown || rpc->srpc_aborted) { + spin_unlock(&scd->scd_lock); + + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); + + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } + + spin_unlock(&scd->scd_lock); + + switch (wi->swi_state) { + default: + LBUG(); + fallthrough; + case SWI_STATE_NEWBORN: { + struct srpc_msg *msg; + struct srpc_generic_reply *reply; + + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; + + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ + srpc_server_rpc_done(rpc, EBADMSG); + return 1; + } + + srpc_unpack_msg_hdr(msg); + if (msg->msg_version != SRPC_MSG_VERSION) { + CWARN("Version mismatch: %u, %u expected, from %s\n", + msg->msg_version, SRPC_MSG_VERSION, + libcfs_id2str(rpc->srpc_peer)); + reply->status = EPROTO; + /* drop through and send reply */ + } else { + reply->status = 0; + rc = (*sv->sv_handler)(rpc); + LASSERT(reply->status 
== 0 || !rpc->srpc_bulk); + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT(ev->ev_fired); + ev->ev_status = rc; + } + } + fallthrough; + case SWI_STATE_BULK_STARTED: + LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", + rpc, rpc->srpc_bulk, sv->sv_id); + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT(ev->ev_fired); + } + + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } + + return 0; +} + +static void +srpc_client_rpc_expired (void *data) +{ + struct srpc_client_rpc *rpc = data; + + CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_timeout = 0; + srpc_abort_rpc(rpc, -ETIMEDOUT); + + spin_unlock(&rpc->crpc_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_expired++; + spin_unlock(&srpc_data.rpc_glock); +} + +static void +srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) +{ + struct stt_timer *timer = &rpc->crpc_timer; + + if (rpc->crpc_timeout == 0) + return; + + INIT_LIST_HEAD(&timer->stt_list); + timer->stt_data = rpc; + timer->stt_func = srpc_client_rpc_expired; + timer->stt_expires = ktime_get_real_seconds() + rpc->crpc_timeout; + stt_add_timer(timer); + return; +} + +/* + * Called with rpc->crpc_lock held. + * + * Upon exit the RPC expiry timer is not queued and the handler is not + * running on any CPU. + */ +static void +srpc_del_client_rpc_timer(struct srpc_client_rpc *rpc) +{ + /* timer not planted or already exploded */ + if (rpc->crpc_timeout == 0) + return; + + /* timer successfully defused */ + if (stt_del_timer(&rpc->crpc_timer)) + return; + + /* timer detonated, wait for it to explode */ + while (rpc->crpc_timeout != 0) { + spin_unlock(&rpc->crpc_lock); + + schedule(); + + spin_lock(&rpc->crpc_lock); + } +} + +static void +srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) +{ + struct swi_workitem *wi = &rpc->crpc_wi; + + LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(!srpc_event_pending(rpc)); + swi_exit_workitem(wi); + + spin_unlock(&rpc->crpc_lock); + + (*rpc->crpc_done)(rpc); + return; +} + +/* sends an outgoing RPC */ +int +srpc_send_rpc(struct swi_workitem *wi) +{ + int rc = 0; + struct srpc_client_rpc *rpc; + struct srpc_msg *reply; + int do_bulk; + + LASSERT(wi != NULL); + + rpc = wi->swi_workitem.wi_data; + + LASSERT(rpc != NULL); + LASSERT(wi == &rpc->crpc_wi); + + reply = &rpc->crpc_replymsg; + do_bulk = rpc->crpc_bulk.bk_niov > 0; + + spin_lock(&rpc->crpc_lock); + + if (rpc->crpc_aborted) { + spin_unlock(&rpc->crpc_lock); + goto abort; + } + + spin_unlock(&rpc->crpc_lock); + + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: + LASSERT(!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } + + rc = srpc_prepare_bulk(rpc); + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; + + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. + */ + if (!rpc->crpc_reqstev.ev_fired) + break; + + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) + break; + + wi->swi_state = SWI_STATE_REQUEST_SENT; + fallthrough; + case SWI_STATE_REQUEST_SENT: { + enum srpc_msg_type type; + + type = srpc_service2reply(rpc->crpc_service); + + if (!rpc->crpc_replyev.ev_fired) + break; + + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) + break; + + srpc_unpack_msg_hdr(reply); + if (reply->msg_type != type || + (reply->msg_magic != SRPC_MSG_MAGIC && + reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + fallthrough; + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) + break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
+ */ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } + + if (rc != 0) { + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, rc); + spin_unlock(&rpc->crpc_lock); + } + +abort: + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; +} + +struct srpc_client_rpc * +srpc_create_client_rpc(struct lnet_process_id peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) +{ + struct srpc_client_rpc *rpc; + + LIBCFS_ALLOC(rpc, offsetof(struct srpc_client_rpc, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; + + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; +} + +/* called with rpc->crpc_lock held */ +void +srpc_abort_rpc(struct srpc_client_rpc *rpc, int why) +{ + LASSERT(why != 0); + + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; + + CDEBUG(D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); + + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + +/* called with rpc->crpc_lock held */ +void +srpc_post_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT(!rpc->crpc_aborted); + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); + + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + + +int +srpc_send_reply(struct srpc_server_rpc *rpc) +{ + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; + + LASSERT(buffer != NULL); + rpyid = buffer->buf_msg.msg_body.reqst.rpyid; + + spin_lock(&scd->scd_lock); + + if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { + /* Repost buffer before replying since test client + * might send me another RPC once it gets the reply + */ + if (srpc_service_post_buffer(scd, buffer) != 0) + CWARN("Failed to repost %s buffer\n", sv->sv_name); + rpc->srpc_reqstbuf = NULL; + } + + spin_unlock(&scd->scd_lock); + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* when in kernel always called with LNET_LOCK() held, and in thread context */ +static void +srpc_lnet_ev_handler(struct lnet_event *ev) +{ + struct srpc_service_cd *scd; + struct srpc_event *rpcev = ev->md.user_ptr; + struct srpc_client_rpc *crpc; + struct srpc_server_rpc *srpc; + struct srpc_buffer *buffer; + struct srpc_service *sv; + 
struct srpc_msg *msg; + enum srpc_msg_type type; + + LASSERT(!in_interrupt()); + + if (ev->status != 0) { + __u32 errors; + + spin_lock(&srpc_data.rpc_glock); + if (ev->status != -ECANCELED) /* cancellation is not error */ + srpc_data.rpc_counters.errors++; + errors = srpc_data.rpc_counters.errors; + spin_unlock(&srpc_data.rpc_glock); + + CNETERR("LNet event status %d type %d, RPC errors %u\n", + ev->status, ev->type, errors); + } + + rpcev->ev_lnet = ev->type; + + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + fallthrough; + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_sent++; + spin_unlock(&srpc_data.rpc_glock); + } + fallthrough; + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, + rpcev->ev_lnet); + LBUG(); + } + + spin_lock(&crpc->crpc_lock); + + LASSERT(rpcev->ev_fired == 0); + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&crpc->crpc_wi); + + spin_unlock(&crpc->crpc_lock); + break; + + case SRPC_REQUEST_RCVD: + scd = rpcev->ev_data; + sv = scd->scd_svc; + + LASSERT(rpcev == &scd->scd_ev); + + spin_lock(&scd->scd_lock); + + LASSERT(ev->unlinked); + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); + + buffer = container_of(ev->md.start, struct srpc_buffer, + buf_msg); + buffer->buf_peer = ev->source; + buffer->buf_self = ev->target.nid; + + LASSERT(scd->scd_buf_nposted > 0); + scd->scd_buf_nposted--; + + if (sv->sv_shuttingdown) { + /* Leave buffer on scd->scd_buf_nposted since + * srpc_finish_service needs to traverse it. + */ + spin_unlock(&scd->scd_lock); + break; + } + + if (scd->scd_buf_err_stamp != 0 && + scd->scd_buf_err_stamp < ktime_get_real_seconds()) { + /* re-enable adding buffer */ + scd->scd_buf_err_stamp = 0; + scd->scd_buf_err = 0; + } + + if (scd->scd_buf_err == 0 && /* adding buffer is enabled */ + scd->scd_buf_adjust == 0 && + scd->scd_buf_nposted < scd->scd_buf_low) { + scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2, + SFW_TEST_WI_MIN); + swi_schedule_workitem(&scd->scd_buf_wi); + } + + list_del(&buffer->buf_list); /* from scd->scd_buf_posted */ + msg = &buffer->buf_msg; + type = srpc_service2request(sv->sv_id); + + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_id2str(ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC + */ + msg->msg_magic = 0; + } + + if (!list_empty(&scd->scd_rpc_free)) { + srpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&srpc->srpc_list); + + srpc_init_server_rpc(srpc, scd, buffer); + list_add_tail(&srpc->srpc_list, + &scd->scd_rpc_active); + swi_schedule_workitem(&srpc->srpc_wi); + } else { + list_add_tail(&buffer->buf_list, + &scd->scd_buf_blocked); + } + + spin_unlock(&scd->scd_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_rcvd++; + spin_unlock(&srpc_data.rpc_glock); + break; + + case SRPC_BULK_GET_RPLD: + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); + + if (!ev->unlinked) + break; /* wait for final event */ + fallthrough; + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + + if (rpcev->ev_type == SRPC_BULK_GET_RPLD) + srpc_data.rpc_counters.bulk_get += ev->mlength; + else + srpc_data.rpc_counters.bulk_put += ev->mlength; + + spin_unlock(&srpc_data.rpc_glock); + } + fallthrough; + case SRPC_REPLY_SENT: + srpc = rpcev->ev_data; + scd = srpc->srpc_scd; + + LASSERT(rpcev == &srpc->srpc_ev); + + spin_lock(&scd->scd_lock); + + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&srpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + break; + } +} + + +int +srpc_startup (void) +{ + int rc; + + memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); + spin_lock_init(&srpc_data.rpc_glock); + + /* 1 second pause to avoid timestamp reuse */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + srpc_data.rpc_matchbits = ((__u64) ktime_get_real_seconds()) << 48; + + srpc_data.rpc_state = SRPC_STATE_NONE; + + rc = LNetNIInit(LNET_PID_LUSTRE); + if (rc < 0) { + CERROR("LNetNIInit() has failed: %d\n", rc); + return rc; + } + + srpc_data.rpc_state = SRPC_STATE_NI_INIT; + + LNetInvalidateEQHandle(&srpc_data.rpc_lnet_eq); + rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); + if (rc != 0) { + CERROR("LNetEQAlloc() has failed: %d\n", rc); + goto bail; + } + + rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + + rc = stt_startup(); + +bail: + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; + + return rc; +} + +void +srpc_shutdown (void) +{ + int i; + int rc; + int state; + + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; + + switch (state) { + default: + LBUG(); + fallthrough; + case SRPC_STATE_RUNNING: + spin_lock(&srpc_data.rpc_glock); + + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + struct srpc_service *sv = srpc_data.rpc_services[i]; + + LASSERTF(sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } + + spin_unlock(&srpc_data.rpc_glock); + + stt_shutdown(); + fallthrough; + + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetEQFree(srpc_data.rpc_lnet_eq); + LASSERT(rc == 0); /* the EQ should have no user by now */ + fallthrough; + + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } + + return; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.h 
b/drivers/staging/lustrefsx/lnet/selftest/rpc.h new file mode 100644 index 0000000000000..8cc8c434645d5 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.h @@ -0,0 +1,297 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __SELFTEST_RPC_H__ +#define __SELFTEST_RPC_H__ + +#include + +/* + * LST wired structures + * + * XXX: *REPLY == *REQST + 1 + */ +enum srpc_msg_type { + SRPC_MSG_MKSN_REQST = 0, + SRPC_MSG_MKSN_REPLY = 1, + SRPC_MSG_RMSN_REQST = 2, + SRPC_MSG_RMSN_REPLY = 3, + SRPC_MSG_BATCH_REQST = 4, + SRPC_MSG_BATCH_REPLY = 5, + SRPC_MSG_STAT_REQST = 6, + SRPC_MSG_STAT_REPLY = 7, + SRPC_MSG_TEST_REQST = 8, + SRPC_MSG_TEST_REPLY = 9, + SRPC_MSG_DEBUG_REQST = 10, + SRPC_MSG_DEBUG_REPLY = 11, + SRPC_MSG_BRW_REQST = 12, + SRPC_MSG_BRW_REPLY = 13, + SRPC_MSG_PING_REQST = 14, + SRPC_MSG_PING_REPLY = 15, + SRPC_MSG_JOIN_REQST = 16, + SRPC_MSG_JOIN_REPLY = 17, +}; + +/* CAVEAT EMPTOR: + * All struct srpc_*_reqst's 1st field must be matchbits of reply buffer, + * and 2nd field matchbits of bulk buffer if any. + * + * All struct srpc_*_reply's 1st field must be a __u32 status, and 2nd field + * session id if needed. 
+ */ +struct srpc_generic_reqst { + __u64 rpyid; /* reply buffer matchbits */ + __u64 bulkid; /* bulk buffer matchbits */ +} WIRE_ATTR; + +struct srpc_generic_reply { + __u32 status; + struct lst_sid sid; +} WIRE_ATTR; + +/* FRAMEWORK RPCs */ +struct srpc_mksn_reqst { + __u64 mksn_rpyid; /* reply buffer matchbits */ + struct lst_sid mksn_sid; /* session id */ + __u32 mksn_force; /* use brute force */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR; /* make session request */ + +struct srpc_mksn_reply { + __u32 mksn_status; /* session status */ + struct lst_sid mksn_sid; /* session id */ + __u32 mksn_timeout; /* session timeout */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR; /* make session reply */ + +struct srpc_rmsn_reqst { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR; /* remove session request */ + +struct srpc_rmsn_reply { + __u32 rmsn_status; + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR; /* remove session reply */ + +struct srpc_join_reqst { + __u64 join_rpyid; /* reply buffer matchbits */ + struct lst_sid join_sid; /* session id to join */ + char join_group[LST_NAME_SIZE]; /* group name */ +} WIRE_ATTR; + +struct srpc_join_reply { + __u32 join_status; /* returned status */ + struct lst_sid join_sid; /* session id */ + __u32 join_timeout; /* # seconds' inactivity to expire */ + char join_session[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR; + +struct srpc_debug_reqst { + __u64 dbg_rpyid; /* reply buffer matchbits */ + struct lst_sid dbg_sid; /* session id */ + __u32 dbg_flags; /* bitmap of debug */ +} WIRE_ATTR; + +struct srpc_debug_reply { + __u32 dbg_status; /* returned code */ + struct lst_sid dbg_sid; /* session id */ + __u32 dbg_timeout; /* session timeout */ + __u32 dbg_nbatch; /* # of batches in the node */ + char dbg_name[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR; + +#define SRPC_BATCH_OPC_RUN 1 +#define SRPC_BATCH_OPC_STOP 2 +#define SRPC_BATCH_OPC_QUERY 3 + +struct srpc_batch_reqst { + __u64 bar_rpyid; /* reply buffer matchbits */ + struct lst_sid bar_sid; /* session id */ + struct lst_bid bar_bid; /* batch id */ + __u32 bar_opc; /* create/start/stop batch */ + __u32 bar_testidx; /* index of test */ + __u32 bar_arg; /* parameters */ +} WIRE_ATTR; + +struct srpc_batch_reply { + __u32 bar_status; /* status of request */ + struct lst_sid bar_sid; /* session id */ + __u32 bar_active; /* # of active tests in batch/test */ + __u32 bar_time; /* remained time */ +} WIRE_ATTR; + +struct srpc_stat_reqst { + __u64 str_rpyid; /* reply buffer matchbits */ + struct lst_sid str_sid; /* session id */ + __u32 str_type; /* type of stat */ +} WIRE_ATTR; + +struct srpc_stat_reply { + __u32 str_status; + struct lst_sid str_sid; + struct sfw_counters str_fw; + struct srpc_counters str_rpc; + struct lnet_counters_common str_lnet; +} WIRE_ATTR; + +struct test_bulk_req { + __u32 blk_opc; /* bulk operation code */ + __u32 blk_npg; /* # of pages */ + __u32 blk_flags; /* reserved flags */ +} WIRE_ATTR; + +struct test_bulk_req_v1 { + /** bulk operation code */ + __u16 blk_opc; + /** data check flags */ + __u16 blk_flags; + /** data length */ + __u32 blk_len; + /** bulk offset */ + __u32 blk_offset; +} WIRE_ATTR; + +struct test_ping_req { + __u32 png_size; /* size of ping message */ + __u32 png_flags; /* reserved flags */ +} WIRE_ATTR; + +struct srpc_test_reqst { + __u64 tsr_rpyid; /* reply buffer matchbits */ + __u64 tsr_bulkid; /* bulk buffer matchbits */ + struct lst_sid tsr_sid; /* session id */ + struct lst_bid 
tsr_bid; /* batch id */ + __u32 tsr_service; /* test type: bulk|ping|... */ + /* test client loop count or # server buffers needed */ + __u32 tsr_loop; + __u32 tsr_concur; /* concurrency of test */ + __u8 tsr_is_client; /* is test client or not */ + __u8 tsr_stop_onerr; /* stop on error */ + __u32 tsr_ndest; /* # of dest nodes */ + + union { + struct test_ping_req ping; + struct test_bulk_req bulk_v0; + struct test_bulk_req_v1 bulk_v1; + } tsr_u; +} WIRE_ATTR; + +struct srpc_test_reply { + __u32 tsr_status; /* returned code */ + struct lst_sid tsr_sid; +} WIRE_ATTR; + +/* TEST RPCs */ +struct srpc_ping_reqst { + __u64 pnr_rpyid; + __u32 pnr_magic; + __u32 pnr_seq; + __u64 pnr_time_sec; + __u64 pnr_time_nsec; +} WIRE_ATTR; + +struct srpc_ping_reply { + __u32 pnr_status; + __u32 pnr_magic; + __u32 pnr_seq; +} WIRE_ATTR; + +struct srpc_brw_reqst { + __u64 brw_rpyid; /* reply buffer matchbits */ + __u64 brw_bulkid; /* bulk buffer matchbits */ + __u32 brw_rw; /* read or write */ + __u32 brw_len; /* bulk data len */ + __u32 brw_flags; /* bulk data patterns */ +} WIRE_ATTR; /* bulk r/w request */ + +struct srpc_brw_reply { + __u32 brw_status; +} WIRE_ATTR; /* bulk r/w reply */ + +#define SRPC_MSG_MAGIC 0xeeb0f00d +#define SRPC_MSG_VERSION 1 + +struct srpc_msg { + /** magic number */ + __u32 msg_magic; + /** message version number */ + __u32 msg_version; + /** type of message body: enum srpc_msg_type */ + __u32 msg_type; + __u32 msg_reserved0; + __u32 msg_reserved1; + /** test session features */ + __u32 msg_ses_feats; + union { + struct srpc_generic_reqst reqst; + struct srpc_generic_reply reply; + + struct srpc_mksn_reqst mksn_reqst; + struct srpc_mksn_reply mksn_reply; + struct srpc_rmsn_reqst rmsn_reqst; + struct srpc_rmsn_reply rmsn_reply; + struct srpc_debug_reqst dbg_reqst; + struct srpc_debug_reply dbg_reply; + struct srpc_batch_reqst bat_reqst; + struct srpc_batch_reply bat_reply; + struct srpc_stat_reqst stat_reqst; + struct srpc_stat_reply stat_reply; + struct srpc_test_reqst tes_reqst; + struct srpc_test_reply tes_reply; + struct srpc_join_reqst join_reqst; + struct srpc_join_reply join_reply; + + struct srpc_ping_reqst ping_reqst; + struct srpc_ping_reply ping_reply; + struct srpc_brw_reqst brw_reqst; + struct srpc_brw_reply brw_reply; + } msg_body; +} WIRE_ATTR; + +static inline void +srpc_unpack_msg_hdr(struct srpc_msg *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* We do not swap the magic number here as it is needed to + determine whether the body needs to be swapped. */ + /* __swab32s(&msg->msg_magic); */ + __swab32s(&msg->msg_type); + __swab32s(&msg->msg_version); + __swab32s(&msg->msg_ses_feats); + __swab32s(&msg->msg_reserved0); + __swab32s(&msg->msg_reserved1); +} + +#endif /* __SELFTEST_RPC_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/selftest.h b/drivers/staging/lustrefsx/lnet/selftest/selftest.h new file mode 100644 index 0000000000000..3f7c295e9a90c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/selftest.h @@ -0,0 +1,618 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/selftest.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_SELFTEST_H__ +#define __SELFTEST_SELFTEST_H__ + +#define LNET_ONLY + +#include +#include +#include +#include +#include + +#include "rpc.h" +#include "timer.h" + +#ifndef MADE_WITHOUT_COMPROMISE +#define MADE_WITHOUT_COMPROMISE +#endif + + +#define SWI_STATE_NEWBORN 0 +#define SWI_STATE_REPLY_SUBMITTED 1 +#define SWI_STATE_REPLY_SENT 2 +#define SWI_STATE_REQUEST_SUBMITTED 3 +#define SWI_STATE_REQUEST_SENT 4 +#define SWI_STATE_REPLY_RECEIVED 5 +#define SWI_STATE_BULK_STARTED 6 +#define SWI_STATE_DONE 10 + +/* forward refs */ +struct srpc_service; +struct srpc_service_cd; +struct sfw_test_unit; +struct sfw_test_instance; + +/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework + * services, e.g. create/modify session. + */ +#define SRPC_SERVICE_DEBUG 0 +#define SRPC_SERVICE_MAKE_SESSION 1 +#define SRPC_SERVICE_REMOVE_SESSION 2 +#define SRPC_SERVICE_BATCH 3 +#define SRPC_SERVICE_TEST 4 +#define SRPC_SERVICE_QUERY_STAT 5 +#define SRPC_SERVICE_JOIN 6 +#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10 +/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */ +#define SRPC_SERVICE_BRW 11 +#define SRPC_SERVICE_PING 12 +#define SRPC_SERVICE_MAX_ID 12 + +#define SRPC_REQUEST_PORTAL 50 +/* a lazy portal for framework RPC requests */ +#define SRPC_FRAMEWORK_REQUEST_PORTAL 51 +/* all reply/bulk RDMAs go to this portal */ +#define SRPC_RDMA_PORTAL 52 + +static inline enum srpc_msg_type +srpc_service2request (int service) +{ + switch (service) { + default: + LBUG (); + case SRPC_SERVICE_DEBUG: + return SRPC_MSG_DEBUG_REQST; + + case SRPC_SERVICE_MAKE_SESSION: + return SRPC_MSG_MKSN_REQST; + + case SRPC_SERVICE_REMOVE_SESSION: + return SRPC_MSG_RMSN_REQST; + + case SRPC_SERVICE_BATCH: + return SRPC_MSG_BATCH_REQST; + + case SRPC_SERVICE_TEST: + return SRPC_MSG_TEST_REQST; + + case SRPC_SERVICE_QUERY_STAT: + return SRPC_MSG_STAT_REQST; + + case SRPC_SERVICE_BRW: + return SRPC_MSG_BRW_REQST; + + case SRPC_SERVICE_PING: + return SRPC_MSG_PING_REQST; + + case SRPC_SERVICE_JOIN: + return SRPC_MSG_JOIN_REQST; + } +} + +static inline enum srpc_msg_type +srpc_service2reply (int service) +{ + return srpc_service2request(service) + 1; +} + +enum srpc_event_type { + SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ + SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ + SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ + SRPC_REPLY_RCVD = 4, /* incoming reply received */ + SRPC_REPLY_SENT = 5, /* outgoing reply sent */ + SRPC_REQUEST_RCVD = 6, /* incoming request received */ + SRPC_REQUEST_SENT = 7, /* outgoing request sent */ +}; + +/* RPC event */ +struct srpc_event 
{ + enum srpc_event_type ev_type; /* what's up */ + enum lnet_event_kind ev_lnet; /* LNet event type */ + int ev_fired; /* LNet event fired? */ + int ev_status; /* LNet event status */ + void *ev_data; /* owning server/client RPC */ +}; + +/* bulk descriptor */ +struct srpc_bulk { + int bk_len; /* len of bulk data */ + struct lnet_handle_md bk_mdh; + int bk_sink; /* sink/source */ + int bk_niov; /* # iov in bk_iovs */ + lnet_kiov_t bk_iovs[0]; +}; + +/* message buffer descriptor */ +struct srpc_buffer { + struct list_head buf_list; /* chain on srpc_service::*_msgq */ + struct srpc_msg buf_msg; + struct lnet_handle_md buf_mdh; + lnet_nid_t buf_self; + struct lnet_process_id buf_peer; +}; + +struct swi_workitem; +typedef int (*swi_action_t)(struct swi_workitem *); + +struct swi_workitem { + struct cfs_wi_sched *swi_sched; + struct cfs_workitem swi_workitem; + swi_action_t swi_action; + int swi_state; +}; + +/* server-side state of a RPC */ +struct srpc_server_rpc { + /* chain on srpc_service::*_rpcq */ + struct list_head srpc_list; + struct srpc_service_cd *srpc_scd; + struct swi_workitem srpc_wi; + struct srpc_event srpc_ev; /* bulk/reply event */ + lnet_nid_t srpc_self; + struct lnet_process_id srpc_peer; + struct srpc_msg srpc_replymsg; + struct lnet_handle_md srpc_replymdh; + struct srpc_buffer *srpc_reqstbuf; + struct srpc_bulk *srpc_bulk; + + unsigned int srpc_aborted; /* being given up */ + int srpc_status; + void (*srpc_done)(struct srpc_server_rpc *); +}; + +/* client-side state of a RPC */ +struct srpc_client_rpc { + struct list_head crpc_list; /* chain on user's lists */ + spinlock_t crpc_lock; /* serialize */ + int crpc_service; + atomic_t crpc_refcount; + /* # seconds to wait for reply */ + int crpc_timeout; + struct stt_timer crpc_timer; + struct swi_workitem crpc_wi; + struct lnet_process_id crpc_dest; + + void (*crpc_done)(struct srpc_client_rpc *); + void (*crpc_fini)(struct srpc_client_rpc *); + int crpc_status; /* completion status */ + void *crpc_priv; /* caller data */ + + /* state flags */ + unsigned int crpc_aborted:1; /* being given up */ + unsigned int crpc_closed:1; /* completed */ + + /* RPC events */ + struct srpc_event crpc_bulkev; /* bulk event */ + struct srpc_event crpc_reqstev; /* request event */ + struct srpc_event crpc_replyev; /* reply event */ + + /* bulk, request(reqst), and reply exchanged on wire */ + struct srpc_msg crpc_reqstmsg; + struct srpc_msg crpc_replymsg; + struct lnet_handle_md crpc_reqstmdh; + struct lnet_handle_md crpc_replymdh; + struct srpc_bulk crpc_bulk; +}; + +#define srpc_client_rpc_size(rpc) \ +offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) + +#define srpc_client_rpc_addref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + atomic_inc(&(rpc)->crpc_refcount); \ +} while (0) + +#define srpc_client_rpc_decref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \ + srpc_destroy_client_rpc(rpc); \ +} while (0) + +#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \ + (rpc)->crpc_reqstev.ev_fired == 0 || \ + (rpc)->crpc_replyev.ev_fired == 0) + +/* CPU partition data of srpc service */ +struct srpc_service_cd { + /** serialize */ + spinlock_t scd_lock; + 
/** backref to service */ + struct srpc_service *scd_svc; + /** event buffer */ + struct srpc_event scd_ev; + /** free RPC descriptors */ + struct list_head scd_rpc_free; + /** in-flight RPCs */ + struct list_head scd_rpc_active; + /** workitem for posting buffer */ + struct swi_workitem scd_buf_wi; + /** CPT id */ + int scd_cpt; + /** error code for scd_buf_wi */ + int scd_buf_err; + /** timestamp for scd_buf_err */ + time64_t scd_buf_err_stamp; + /** total # request buffers */ + int scd_buf_total; + /** # posted request buffers */ + int scd_buf_nposted; + /** in progress of buffer posting */ + int scd_buf_posting; + /** allocate more buffers if scd_buf_nposted < scd_buf_low */ + int scd_buf_low; + /** increase/decrease some buffers */ + int scd_buf_adjust; + /** posted message buffers */ + struct list_head scd_buf_posted; + /** blocked for RPC descriptor */ + struct list_head scd_buf_blocked; +}; + +/* number of server workitems (mini-thread) for testing service */ +#define SFW_TEST_WI_MIN 256 +#define SFW_TEST_WI_MAX 2048 +/* extra buffers for tolerating buggy peers, or unbalanced number + * of peers between partitions */ +#define SFW_TEST_WI_EXTRA 64 + +/* number of server workitems (mini-thread) for framework service */ +#define SFW_FRWK_WI_MIN 16 +#define SFW_FRWK_WI_MAX 256 + +struct srpc_service { + int sv_id; /* service id */ + const char *sv_name; /* human readable name */ + int sv_wi_total; /* total server workitems */ + int sv_shuttingdown; + int sv_ncpts; + /* percpt data for srpc_service */ + struct srpc_service_cd **sv_cpt_data; + /* Service callbacks: + * - sv_handler: process incoming RPC request + * - sv_bulk_ready: notify bulk data + */ + int (*sv_handler)(struct srpc_server_rpc *); + int (*sv_bulk_ready)(struct srpc_server_rpc *, int); +}; + +struct sfw_session { + /* chain on fw_zombie_sessions */ + struct list_head sn_list; + struct lst_sid sn_id; /* unique identifier */ + /* # seconds' inactivity to expire */ + unsigned int sn_timeout; + int sn_timer_active; + unsigned int sn_features; + struct stt_timer sn_timer; + struct list_head sn_batches; /* list of batches */ + char sn_name[LST_NAME_SIZE]; + atomic_t sn_refcount; + atomic_t sn_brw_errors; + atomic_t sn_ping_errors; + ktime_t sn_started; +}; + +#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ + (sid0).ses_stamp == (sid1).ses_stamp) + +struct sfw_batch { + struct list_head bat_list; /* chain on sn_batches */ + struct lst_bid bat_id; /* batch id */ + int bat_error; /* error code of batch */ + struct sfw_session *bat_session; /* batch's session */ + atomic_t bat_nactive; /* # of active tests */ + struct list_head bat_tests; /* test instances */ +}; + +struct sfw_test_client_ops { + int (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, + struct lnet_process_id dest, + struct srpc_client_rpc **rpc); /* prep a tests rpc */ + void (*tso_done_rpc)(struct sfw_test_unit *tsu, + struct srpc_client_rpc *rpc); /* done a test rpc */ +}; + +struct sfw_test_instance { + struct list_head tsi_list; /* chain on batch */ + int tsi_service; /* test type */ + struct sfw_batch *tsi_batch; /* batch */ + struct sfw_test_client_ops *tsi_ops; /* test client operations */ + + /* public parameter for all test units */ + unsigned int tsi_is_client:1; /* is test client */ + unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */ + int tsi_concur; /* concurrency */ + int
tsi_loop; /* loop count */ + + /* status of test instance */ + spinlock_t tsi_lock; /* serialize */ + unsigned int tsi_stopping:1; /* test is stopping */ + atomic_t tsi_nactive; /* # of active test unit */ + struct list_head tsi_units; /* test units */ + struct list_head tsi_free_rpcs; /* free rpcs */ + struct list_head tsi_active_rpcs;/* active rpcs */ + + union { + struct test_ping_req ping; /* ping parameter */ + struct test_bulk_req bulk_v0; /* bulk parameter */ + struct test_bulk_req_v1 bulk_v1; /* bulk v1 parameter */ + } tsi_u; +}; + +/* XXX: trailing (PAGE_SIZE % sizeof(struct lnet_process_id)) bytes at + * the end of pages are not used */ +#define SFW_MAX_CONCUR LST_MAX_CONCUR +#define SFW_ID_PER_PAGE (PAGE_SIZE / sizeof(struct lnet_process_id_packed)) +#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) +#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) + +struct sfw_test_unit { + struct list_head tsu_list; /* chain on lst_test_instance */ + struct lnet_process_id tsu_dest; /* id of dest node */ + int tsu_loop; /* loop count of the test */ + struct sfw_test_instance *tsu_instance; /* pointer to test instance */ + void *tsu_private; /* private data */ + struct swi_workitem tsu_worker; /* workitem of the test unit */ +}; + +struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + struct srpc_service *tsc_srv_service; /* test service */ + struct sfw_test_client_ops *tsc_cli_ops; /* ops of test client */ +}; + +struct srpc_client_rpc * +sfw_create_rpc(struct lnet_process_id peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(struct srpc_client_rpc *), void *priv); +int sfw_create_test_rpc(struct sfw_test_unit *tsu, + struct lnet_process_id peer, unsigned int features, + int nblk, int blklen, struct srpc_client_rpc **rpc); +void sfw_abort_rpc(struct srpc_client_rpc *rpc); +void sfw_post_rpc(struct srpc_client_rpc *rpc); +void sfw_client_rpc_done(struct srpc_client_rpc *rpc); +void sfw_unpack_message(struct srpc_msg *msg); +void sfw_free_pages(struct srpc_server_rpc *rpc); +void sfw_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i); +int sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink); +int sfw_make_session(struct srpc_mksn_reqst *request, + struct srpc_mksn_reply *reply); + +struct srpc_client_rpc * +srpc_create_client_rpc(struct lnet_process_id peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv); +void srpc_post_rpc(struct srpc_client_rpc *rpc); +void srpc_abort_rpc(struct srpc_client_rpc *rpc, int why); +void srpc_free_bulk(struct srpc_bulk *bk); +struct srpc_bulk *srpc_alloc_bulk(int cpt, unsigned int off, + unsigned int bulk_npg, unsigned int bulk_len, + int sink); +int srpc_send_rpc(struct swi_workitem *wi); +int srpc_send_reply(struct srpc_server_rpc *rpc); +int srpc_add_service(struct srpc_service *sv); +int srpc_remove_service(struct srpc_service *sv); +void srpc_shutdown_service(struct srpc_service *sv); +void srpc_abort_service(struct srpc_service *sv); +int srpc_finish_service(struct srpc_service *sv); +int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer); +void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer); +void srpc_get_counters(struct srpc_counters *cnt); +void srpc_set_counters(const struct srpc_counters *cnt); + +extern struct cfs_wi_sched *lst_sched_serial; +extern struct cfs_wi_sched **lst_sched_test; + +static 
inline int +srpc_serv_is_framework(struct srpc_service *svc) +{ + return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; +} + +static inline int +swi_wi_action(struct cfs_workitem *wi) +{ + struct swi_workitem *swi; + + swi = container_of(wi, struct swi_workitem, swi_workitem); + return swi->swi_action(swi); +} + +static inline void +swi_init_workitem(struct swi_workitem *swi, void *data, + swi_action_t action, struct cfs_wi_sched *sched) +{ + swi->swi_sched = sched; + swi->swi_action = action; + swi->swi_state = SWI_STATE_NEWBORN; + cfs_wi_init(&swi->swi_workitem, data, swi_wi_action); +} + +static inline void +swi_schedule_workitem(struct swi_workitem *wi) +{ + cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); +} + +static inline void +swi_exit_workitem(struct swi_workitem *swi) +{ + cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); +} + +static inline int +swi_deschedule_workitem(struct swi_workitem *swi) +{ + return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); +} + +int sfw_startup(void); +int srpc_startup(void); +void sfw_shutdown(void); +void srpc_shutdown(void); + +static inline void +srpc_destroy_client_rpc(struct srpc_client_rpc *rpc) +{ + LASSERT (rpc != NULL); + LASSERT (!srpc_event_pending(rpc)); + LASSERT (atomic_read(&rpc->crpc_refcount) == 0); + + if (rpc->crpc_fini == NULL) { + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } else { + (*rpc->crpc_fini) (rpc); + } + + return; +} + +static inline void +srpc_init_client_rpc(struct srpc_client_rpc *rpc, struct lnet_process_id peer, + int service, int nbulkiov, int bulklen, + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) +{ + LASSERT(nbulkiov <= LNET_MAX_IOV); + + memset(rpc, 0, offsetof(struct srpc_client_rpc, + crpc_bulk.bk_iovs[nbulkiov])); + + INIT_LIST_HEAD(&rpc->crpc_list); + swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc, + lst_sched_test[lnet_cpt_of_nid(peer.nid, NULL)]); + spin_lock_init(&rpc->crpc_lock); + atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ + + rpc->crpc_dest = peer; + rpc->crpc_priv = priv; + rpc->crpc_service = service; + rpc->crpc_bulk.bk_len = bulklen; + rpc->crpc_bulk.bk_niov = nbulkiov; + rpc->crpc_done = rpc_done; + rpc->crpc_fini = rpc_fini; + LNetInvalidateMDHandle(&rpc->crpc_reqstmdh); + LNetInvalidateMDHandle(&rpc->crpc_replymdh); + LNetInvalidateMDHandle(&rpc->crpc_bulk.bk_mdh); + + /* no event is expected at this point */ + rpc->crpc_bulkev.ev_fired = + rpc->crpc_reqstev.ev_fired = + rpc->crpc_replyev.ev_fired = 1; + + rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC; + rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION; + rpc->crpc_reqstmsg.msg_type = srpc_service2request(service); + return; +} + +static inline const char * +swi_state2str (int state) +{ +#define STATE2STR(x) case x: return #x + switch(state) { + default: + LBUG(); + STATE2STR(SWI_STATE_NEWBORN); + STATE2STR(SWI_STATE_REPLY_SUBMITTED); + STATE2STR(SWI_STATE_REPLY_SENT); + STATE2STR(SWI_STATE_REQUEST_SUBMITTED); + STATE2STR(SWI_STATE_REQUEST_SENT); + STATE2STR(SWI_STATE_REPLY_RECEIVED); + STATE2STR(SWI_STATE_BULK_STARTED); + STATE2STR(SWI_STATE_DONE); + } +#undef STATE2STR +} + +#define lst_wait_until(cond, lock, fmt, ...) \ +do { \ + int __I = 2; \ + while (!(cond)) { \ + CDEBUG(is_power_of_2(++__I) ? 
D_WARNING : D_NET, \ + fmt, ## __VA_ARGS__); \ + spin_unlock(&(lock)); \ + \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + schedule_timeout(cfs_time_seconds(1) / 10); \ + \ + spin_lock(&(lock)); \ + } \ +} while (0) + +static inline void +srpc_wait_service_shutdown(struct srpc_service *sv) +{ + int i = 2; + + LASSERT(sv->sv_shuttingdown); + + while (srpc_finish_service(sv) == 0) { + i++; + CDEBUG(((i & -i) == i) ? D_WARNING : D_NET, + "Waiting for %s service to shutdown...\n", + sv->sv_name); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 10); + } +} + +extern struct sfw_test_client_ops ping_test_client; +extern struct srpc_service ping_test_service; +void ping_init_test_client(void); +void ping_init_test_service(void); + +extern struct sfw_test_client_ops brw_test_client; +extern struct srpc_service brw_test_service; +void brw_init_test_client(void); +void brw_init_test_service(void); + +#endif /* __SELFTEST_SELFTEST_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.c b/drivers/staging/lustrefsx/lnet/selftest/timer.c new file mode 100644 index 0000000000000..3ceec81bf1b08 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.c @@ -0,0 +1,247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.c + * + * Author: Isaac Huang + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + + +/* + * Timers are implemented as a sorted queue of expiry times. The queue + * is slotted, with each slot holding timers which expire in a + * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are + * sorted by increasing expiry time. The number of slots is 2**7 (128), + * to cover a time period of 1024 seconds into the future before wrapping. 
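+ *
+ * Illustration only (the macros below are authoritative): with
+ * STTIMER_MINPOLL = 3 and 128 slots, a timer expiring at t = 1005
+ * seconds hashes via STTIMER_SLOT(1005) to
+ * stt_data.stt_hash[(1005 >> 3) & 127], i.e. slot 125, which holds
+ * every expiry in the 8-second window [1000, 1008). Within a slot,
+ * stt_add_timer() keeps entries ordered by increasing expiry time.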
+ */ +#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */ +#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL) +#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1)) +#define STTIMER_NSLOTS (1 << 7) +#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \ + (STTIMER_NSLOTS - 1))]) + +static struct st_timer_data { + spinlock_t stt_lock; + /* start time of the slot processed previously */ + time64_t stt_prev_slot; + struct list_head stt_hash[STTIMER_NSLOTS]; + int stt_shuttingdown; + wait_queue_head_t stt_waitq; + int stt_nthreads; +} stt_data; + +void +stt_add_timer(struct stt_timer *timer) +{ + struct list_head *pos; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + LASSERT(timer->stt_func != NULL); + LASSERT(list_empty(&timer->stt_list)); + LASSERT(timer->stt_expires > ktime_get_real_seconds()); + + /* a simple insertion sort */ + list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { + struct stt_timer *old = list_entry(pos, struct stt_timer, + stt_list); + + if (timer->stt_expires >= old->stt_expires) + break; + } + list_add(&timer->stt_list, pos); + + spin_unlock(&stt_data.stt_lock); +} + +/* + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + * + * CAVEAT EMPTOR: + * When 0 is returned, it is possible that timer->stt_func _is_ running on + * another CPU. + */ +int +stt_del_timer(struct stt_timer *timer) +{ + int ret = 0; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + + if (!list_empty(&timer->stt_list)) { + ret = 1; + list_del_init(&timer->stt_list); + } + + spin_unlock(&stt_data.stt_lock); + return ret; +} + +/* called with stt_data.stt_lock held */ +static int +stt_expire_list(struct list_head *slot, time64_t now) +{ + int expired = 0; + struct stt_timer *timer; + + while (!list_empty(slot)) { + timer = list_entry(slot->next, struct stt_timer, stt_list); + + if (timer->stt_expires > now) + break; + + list_del_init(&timer->stt_list); + spin_unlock(&stt_data.stt_lock); + + expired++; + (*timer->stt_func) (timer->stt_data); + + spin_lock(&stt_data.stt_lock); + } + + return expired; +} + +static int +stt_check_timers(time64_t *last) +{ + int expired = 0; + time64_t now; + time64_t this_slot; + + now = ktime_get_real_seconds(); + this_slot = now & STTIMER_SLOTTIMEMASK; + + spin_lock(&stt_data.stt_lock); + + while (this_slot >= *last) { + expired += stt_expire_list(STTIMER_SLOT(this_slot), now); + this_slot = this_slot - STTIMER_SLOTTIME; + } + + *last = now & STTIMER_SLOTTIMEMASK; + spin_unlock(&stt_data.stt_lock); + return expired; +} + + +static int +stt_timer_main (void *arg) +{ + int rc = 0; + + cfs_block_allsigs(); + + while (!stt_data.stt_shuttingdown) { + stt_check_timers(&stt_data.stt_prev_slot); + + rc = wait_event_timeout(stt_data.stt_waitq, + stt_data.stt_shuttingdown, + cfs_time_seconds(STTIMER_SLOTTIME)); + } + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads--; + spin_unlock(&stt_data.stt_lock); + return rc; +} + +static int +stt_start_timer_thread (void) +{ + struct task_struct *task; + + LASSERT(!stt_data.stt_shuttingdown); + + task = kthread_run(stt_timer_main, NULL, "st_timer"); + if (IS_ERR(task)) + return PTR_ERR(task); + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads++; + spin_unlock(&stt_data.stt_lock); + return 0; +} + + +int +stt_startup (void) +{ + int rc = 0; 
+ int i; + + stt_data.stt_shuttingdown = 0; + stt_data.stt_prev_slot = ktime_get_real_seconds() & STTIMER_SLOTTIMEMASK; + + spin_lock_init(&stt_data.stt_lock); + for (i = 0; i < STTIMER_NSLOTS; i++) + INIT_LIST_HEAD(&stt_data.stt_hash[i]); + + stt_data.stt_nthreads = 0; + init_waitqueue_head(&stt_data.stt_waitq); + rc = stt_start_timer_thread(); + if (rc != 0) + CERROR ("Can't spawn timer thread: %d\n", rc); + + return rc; +} + +void +stt_shutdown(void) +{ + int i; + + spin_lock(&stt_data.stt_lock); + + for (i = 0; i < STTIMER_NSLOTS; i++) + LASSERT(list_empty(&stt_data.stt_hash[i])); + + stt_data.stt_shuttingdown = 1; + + wake_up(&stt_data.stt_waitq); + lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock, + "waiting for %d threads to terminate\n", + stt_data.stt_nthreads); + + spin_unlock(&stt_data.stt_lock); +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.h b/drivers/staging/lustrefsx/lnet/selftest/timer.h new file mode 100644 index 0000000000000..e769c4cc9ebd7 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.h @@ -0,0 +1,49 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_TIMER_H__ +#define __SELFTEST_TIMER_H__ + +struct stt_timer { + struct list_head stt_list; + time64_t stt_expires; + void (*stt_func)(void *); + void *stt_data; +}; + +void stt_add_timer(struct stt_timer *timer); +int stt_del_timer(struct stt_timer *timer); +int stt_startup(void); +void stt_shutdown(void); + +#endif /* __SELFTEST_TIMER_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/Kconfig b/drivers/staging/lustrefsx/lustre/Kconfig new file mode 100644 index 0000000000000..c565c870d805b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/Kconfig @@ -0,0 +1,41 @@ +config LUSTREFSX_FS + tristate "Lustre file system client support" + depends on m + select LUSTREFSX_LIBCFS + depends on LUSTREFSX_LNET + select CRYPTO + select CRYPTO_CRC32 + select CRYPTO_CRC32_PCLMUL if X86 + select CRYPTO_CRC32C + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + depends on MULTIUSER + help + This option enables Lustre file system client support. Choose Y + here if you want to access a Lustre file system cluster. To compile + this file system support as a module, choose M here: the module will + be called lustre. 
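+
+	  As a usage example (all names here are placeholders), a client
+	  mounts a Lustre file system with a command of the form:
+
+	    mount -t lustre <mgsnode>@tcp:/<fsname> /mnt/lustre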
+ + To mount Lustre file systems, you also need to install the user space + mount.lustre and other user space commands which can be found in the + lustre-client package. + + Lustre file system is the most popular cluster file system in high + performance computing. Source code of both kernel space and user space + Lustre components can also be found at + http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LUSTRE_DEBUG_EXPENSIVE_CHECK + bool "Enable Lustre DEBUG checks" + depends on LUSTREFSX_FS + help + This option is mainly for debug purpose. It enables Lustre code to do + expensive checks that may have a performance impact. + + Use with caution. If unsure, say N. diff --git a/drivers/staging/lustrefsx/lustre/LICENSE b/drivers/staging/lustrefsx/lustre/LICENSE new file mode 100644 index 0000000000000..edb73cdedca6a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/LICENSE @@ -0,0 +1,372 @@ +Each file in this distribution contains a header stating the copyright +owner(s), and the licensing terms for that file. Some files are not +eligible for copyright protection, and contain neither. + +There are many files which may be covered by a separate license that +you signed or otherwise agreed to before downloading this software. +If you did not agree to such an agreement, or if the file does not +mention that license, then you can redistribute and/or modify it under +the terms of version 2 of the GNU General Public License. Each file +is very clear about which license is applicable. + +In any case, Lustre is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the license +text for more details. + +Reproduced below is the GNU General Public License version 2, and +Linus's clarifying statement from the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". + Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. 
+ +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/drivers/staging/lustrefsx/lustre/Makefile b/drivers/staging/lustrefsx/lustre/Makefile new file mode 100644 index 0000000000000..207cab53c0633 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_FS) += fid/ +obj-$(CONFIG_LUSTREFSX_FS) += obdclass/ +obj-$(CONFIG_LUSTREFSX_FS) += ptlrpc/ +obj-$(CONFIG_LUSTREFSX_FS) += obdecho/ +obj-$(CONFIG_LUSTREFSX_FS) += mgc/ +obj-$(CONFIG_LUSTREFSX_FS) += lov/ osc/ mdc/ lmv/ llite/ fld/ + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/fid/Makefile b/drivers/staging/lustrefsx/lustre/fid/Makefile new file mode 100644 index 0000000000000..22be6773ba08f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += fid.o + +fid-y := fid_request.o lproc_fid.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c new file mode 100644 index 0000000000000..8676ec223548d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c @@ -0,0 +1,645 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_handler.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +/* Assigns client to sequence controller node. */ +int seq_server_set_cli(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_client_seq *cli) +{ + int rc = 0; + ENTRY; + + /* + * Ask client for new range, assign that range to ->seq_space and write + * seq state to backing store should be atomic. + */ + mutex_lock(&seq->lss_mutex); + + if (cli == NULL) { + CDEBUG(D_INFO, "%s: Detached sequence client\n", seq->lss_name); + seq->lss_cli = NULL; + GOTO(out_up, rc = 0); + } + + if (seq->lss_cli != NULL) { + CDEBUG(D_HA, "%s: Sequence controller is already " + "assigned\n", seq->lss_name); + GOTO(out_up, rc = -EEXIST); + } + + CDEBUG(D_INFO, "%s: Attached sequence controller %s\n", + seq->lss_name, cli->lcs_name); + + seq->lss_cli = cli; + cli->lcs_space.lsr_index = seq->lss_site->ss_node_id; + EXIT; +out_up: + mutex_unlock(&seq->lss_mutex); + return rc; +} +EXPORT_SYMBOL(seq_server_set_cli); +/* + * allocate \a w units of sequence from range \a from. 
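+ *
+ * Worked example (illustrative values only): if \a from covers
+ * [0x1000, 0x2000) and \a width is 0x100, then \a to becomes
+ * [0x1000, 0x1100) and \a from is advanced to [0x1100, 0x2000).
+ * The width is first clamped to lu_seq_range_space(\a from) when the
+ * remaining space is smaller.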
+ */ +static inline void range_alloc(struct lu_seq_range *to, + struct lu_seq_range *from, + __u64 width) +{ + width = min(lu_seq_range_space(from), width); + to->lsr_start = from->lsr_start; + to->lsr_end = from->lsr_start + width; + from->lsr_start += width; +} + +/** + * On controller node, allocate new super sequence for regular sequence server. + * As this super sequence controller, this node suppose to maintain fld + * and update index. + * \a out range always has currect mds node number of requester. + */ + +static int __seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc; + ENTRY; + + LASSERT(lu_seq_range_is_sane(space)); + + if (lu_seq_range_is_exhausted(space)) { + CERROR("%s: Sequences space is exhausted\n", + seq->lss_name); + RETURN(-ENOSPC); + } else { + range_alloc(out, space, seq->lss_width); + } + + rc = seq_store_update(env, seq, out, 1 /* sync */); + + LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n", + seq->lss_name, rc, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_super(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +int seq_server_alloc_spec(struct lu_server_seq *seq, + struct lu_seq_range *spec, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = -ENOSPC; + ENTRY; + + /* + * In some cases (like recovery after a disaster) + * we may need to allocate sequences manually + * Notice some sequences can be lost if requested + * range doesn't start at the beginning of current + * free space. Also notice it's not possible now + * to allocate sequences out of natural order. + */ + if (spec->lsr_start >= spec->lsr_end) + RETURN(-EINVAL); + if (spec->lsr_flags != LU_SEQ_RANGE_MDT && + spec->lsr_flags != LU_SEQ_RANGE_OST) + RETURN(-EINVAL); + + mutex_lock(&seq->lss_mutex); + if (spec->lsr_start >= space->lsr_start) { + space->lsr_start = spec->lsr_end; + rc = seq_store_update(env, seq, spec, 1 /* sync */); + + LCONSOLE_INFO("%s: "DRANGE" sequences allocated: rc = %d \n", + seq->lss_name, PRANGE(spec), rc); + } + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +static int __seq_set_init(const struct lu_env *env, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc; + + range_alloc(&seq->lss_lowater_set, space, seq->lss_set_width); + range_alloc(&seq->lss_hiwater_set, space, seq->lss_set_width); + + rc = seq_store_update(env, seq, NULL, 1); + + return rc; +} + +/* + * This function implements new seq allocation algorithm using async + * updates to seq file on disk. ref bug 18857 for details. + * there are four variable to keep track of this process + * + * lss_space; - available lss_space + * lss_lowater_set; - lu_seq_range for all seqs before barrier, i.e. safe to use + * lss_hiwater_set; - lu_seq_range after barrier, i.e. allocated but may be + * not yet committed + * + * when lss_lowater_set reaches the end it is replaced with hiwater one and + * a write operation is initiated to allocate new hiwater range. + * if last seq write opearion is still not committed, current operation is + * flaged as sync write op. 
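+ *
+ * Rough sketch of the flow (illustration, not a specification): while
+ * lss_lowater_set still has room, each call simply carves lss_width
+ * sequences out of it with no on-disk update. Once it is exhausted,
+ * lss_hiwater_set (reserved by an earlier, possibly still uncommitted,
+ * update) becomes the new low-water set, a fresh high-water set is
+ * carved out of lss_space, and only that new reservation is written
+ * back, synchronously if the previous write has not committed yet.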
+ */ +static int range_alloc_set(const struct lu_env *env, + struct lu_seq_range *out, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + struct lu_seq_range *loset = &seq->lss_lowater_set; + struct lu_seq_range *hiset = &seq->lss_hiwater_set; + int rc = 0; + + if (lu_seq_range_is_zero(loset)) + __seq_set_init(env, seq); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */ + loset->lsr_start = loset->lsr_end; + + if (lu_seq_range_is_exhausted(loset)) { + /* reached high water mark. */ + struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev; + int obd_num_clients = dev->ld_obd->obd_num_exports; + __u64 set_sz; + + /* calculate new seq width based on number of clients */ + set_sz = max(seq->lss_set_width, + obd_num_clients * seq->lss_width); + set_sz = min(lu_seq_range_space(space), set_sz); + + /* Switch to hiwater range now */ + *loset = *hiset; + /* allocate new hiwater range */ + range_alloc(hiset, space, set_sz); + + /* update ondisk seq with new *space */ + rc = seq_store_update(env, seq, NULL, seq->lss_need_sync); + } + + LASSERTF(!lu_seq_range_is_exhausted(loset) || + lu_seq_range_is_sane(loset), + DRANGE"\n", PRANGE(loset)); + + if (rc == 0) + range_alloc(out, loset, seq->lss_width); + + RETURN(rc); +} + +/** + * Check if the sequence server has sequence avaible + * + * Check if the sequence server has sequence avaible, if not, then + * allocating super sequence from sequence manager (MDT0). + * + * \param[in] env execution environment + * \param[in] seq server sequence + * + * \retval negative errno if allocating new sequence fails + * \retval 0 if there is enough sequence or allocating + * new sequence succeeds + */ +int seq_server_check_and_alloc_super(const struct lu_env *env, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = 0; + + ENTRY; + + /* Check if available space ends and allocate new super seq */ + if (lu_seq_range_is_exhausted(space)) { + if (!seq->lss_cli) { + CERROR("%s: No sequence controller is attached.\n", + seq->lss_name); + RETURN(-ENODEV); + } + + rc = seq_client_alloc_super(seq->lss_cli, env); + if (rc) { + CDEBUG(D_HA, "%s: Can't allocate super-sequence:" + " rc %d\n", seq->lss_name, rc); + RETURN(rc); + } + + /* Saving new range to allocation space. 
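+ * When the sequence controller runs on a remote node
+ * (lcs_srv == NULL), the granted range is also inserted into the
+ * local FLDB below so that FLD lookups for the new sequences can be
+ * resolved locally.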
*/ + *space = seq->lss_cli->lcs_space; + LASSERT(lu_seq_range_is_sane(space)); + if (seq->lss_cli->lcs_srv == NULL) { + struct lu_server_fld *fld; + + /* Insert it to the local FLDB */ + fld = seq->lss_site->ss_server_fld; + mutex_lock(&fld->lsf_lock); + rc = fld_insert_entry(env, fld, space); + mutex_unlock(&fld->lsf_lock); + } + } + + if (lu_seq_range_is_zero(&seq->lss_lowater_set)) + __seq_set_init(env, seq); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_check_and_alloc_super); + +static int __seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = 0; + + ENTRY; + + LASSERT(lu_seq_range_is_sane(space)); + + rc = seq_server_check_and_alloc_super(env, seq); + if (rc < 0) { + if (rc == -EINPROGRESS) { + static int printed; + + if (printed++ % 8 == 0) + LCONSOLE_INFO("%s: Waiting to contact MDT0000 " + "to allocate super-sequence\n", + seq->lss_name); + } else { + CERROR("%s: Allocated super-sequence failed: rc = %d\n", + seq->lss_name, rc); + } + RETURN(rc); + } + + rc = range_alloc_set(env, out, seq); + if (rc != 0) { + CERROR("%s: Allocated meta-sequence failed: rc = %d\n", + seq->lss_name, rc); + RETURN(rc); + } + + CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n", + seq->lss_name, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_meta(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_alloc_meta); + +static int seq_server_handle(struct lu_site *site, + const struct lu_env *env, + __u32 opc, struct lu_seq_range *out) +{ + int rc; + struct seq_server_site *ss_site; + struct dt_device *dev; + ENTRY; + + ss_site = lu_site2seq(site); + + switch (opc) { + case SEQ_ALLOC_META: + if (!ss_site->ss_server_seq) { + CERROR("Sequence server is not " + "initialized\n"); + RETURN(-EINVAL); + } + + dev = lu2dt_dev(ss_site->ss_server_seq->lss_obj->do_lu.lo_dev); + if (dev->dd_rdonly) + RETURN(-EROFS); + + rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env); + break; + case SEQ_ALLOC_SUPER: + if (!ss_site->ss_control_seq) { + CERROR("Sequence controller is not " + "initialized\n"); + RETURN(-EINVAL); + } + + dev = lu2dt_dev(ss_site->ss_control_seq->lss_obj->do_lu.lo_dev); + if (dev->dd_rdonly) + RETURN(-EROFS); + + rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env); + break; + default: + rc = -EINVAL; + break; + } + + RETURN(rc); +} + +static int seq_handler(struct tgt_session_info *tsi) +{ + struct lu_seq_range *out, *tmp; + struct lu_site *site; + int rc; + __u32 *opc; + + ENTRY; + + LASSERT(!(lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY)); + site = tsi->tsi_exp->exp_obd->obd_lu_dev->ld_site; + LASSERT(site != NULL); + + opc = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_OPC); + if (opc != NULL) { + out = req_capsule_server_get(tsi->tsi_pill, &RMF_SEQ_RANGE); + if (out == NULL) + RETURN(err_serious(-EPROTO)); + + tmp = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_RANGE); + + /* seq client passed mdt id, we need to pass that using out + * range parameter */ + + out->lsr_index = tmp->lsr_index; + out->lsr_flags = tmp->lsr_flags; + rc = seq_server_handle(site, tsi->tsi_env, *opc, out); + } else { + rc = err_serious(-EPROTO); + } + + RETURN(rc); +} + +struct tgt_handler seq_handlers[] = { +TGT_SEQ_HDL(HABEO_REFERO, SEQ_QUERY, 
seq_handler), +}; +EXPORT_SYMBOL(seq_handlers); + +/* context key constructor/destructor: seq_key_init, seq_key_fini */ +LU_KEY_INIT_FINI(seq, struct seq_thread_info); + +/* context key: seq_thread_key */ +LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); + +extern const struct file_operations seq_fld_debugfs_seq_fops; + +static void seq_server_debugfs_fini(struct lu_server_seq *seq) +{ + if (!IS_ERR_OR_NULL(seq->lss_debugfs_entry)) + ldebugfs_remove(&seq->lss_debugfs_entry); +} + +static int seq_server_debugfs_init(struct lu_server_seq *seq) +{ + int rc; + ENTRY; + + seq->lss_debugfs_entry = ldebugfs_register(seq->lss_name, + seq_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(seq->lss_debugfs_entry)) { + rc = seq->lss_debugfs_entry ? PTR_ERR(seq->lss_debugfs_entry) + : -ENOMEM; + seq->lss_debugfs_entry = NULL; + RETURN(rc); + } + + rc = ldebugfs_add_vars(seq->lss_debugfs_entry, + seq_server_debugfs_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager debugfs, rc %d\n", + seq->lss_name, rc); + GOTO(out_cleanup, rc); + } + + if (seq->lss_type == LUSTRE_SEQ_CONTROLLER) { + rc = ldebugfs_seq_create(seq->lss_debugfs_entry, "fldb", 0644, + &seq_fld_debugfs_seq_fops, seq); + if (rc) { + CERROR("%s: Can't create fldb for sequence manager debugfs: rc = %d\n", + seq->lss_name, rc); + GOTO(out_cleanup, rc); + } + } + + RETURN(0); + +out_cleanup: + seq_server_debugfs_fini(seq); + return rc; +} + +int seq_server_init(const struct lu_env *env, + struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss) +{ + int rc, is_srv = (type == LUSTRE_SEQ_SERVER); + ENTRY; + + LASSERT(dev != NULL); + LASSERT(prefix != NULL); + LASSERT(ss != NULL); + LASSERT(ss->ss_lu != NULL); + + /* A compile-time check for FIDs that used to be in lustre_idl.h + * but is moved here to remove CLASSERT/LASSERT in that header. + * Check all lu_fid fields are converted in fid_cpu_to_le() and friends + * and that there is no padding added by compiler to the struct. */ + { + struct lu_fid tst; + + CLASSERT(sizeof(tst) == sizeof(tst.f_seq) + + sizeof(tst.f_oid) + sizeof(tst.f_ver)); + } + + seq->lss_cli = NULL; + seq->lss_type = type; + seq->lss_site = ss; + lu_seq_range_init(&seq->lss_space); + + lu_seq_range_init(&seq->lss_lowater_set); + lu_seq_range_init(&seq->lss_hiwater_set); + seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH; + + mutex_init(&seq->lss_mutex); + + seq->lss_width = is_srv ? + LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH; + + snprintf(seq->lss_name, sizeof(seq->lss_name), + "%s-%s", (is_srv ? "srv" : "ctl"), prefix); + + rc = seq_store_init(seq, env, dev); + if (rc) + GOTO(out, rc); + /* Request backing store for saved sequence info. */ + rc = seq_store_read(seq, env); + if (rc == -ENODATA) { + + /* Nothing is read, init by default value. */ + seq->lss_space = is_srv ? + LUSTRE_SEQ_ZERO_RANGE: + LUSTRE_SEQ_SPACE_RANGE; + + seq->lss_space.lsr_index = ss->ss_node_id; + LCONSOLE_INFO("%s: No data found " + "on store. 
Initialize space\n", + seq->lss_name); + + rc = seq_store_update(env, seq, NULL, 0); + if (rc) { + CERROR("%s: Can't write space data, " + "rc %d\n", seq->lss_name, rc); + } + } else if (rc) { + CERROR("%s: Can't read space data, rc %d\n", + seq->lss_name, rc); + GOTO(out, rc); + } + + if (is_srv) { + LASSERT(lu_seq_range_is_sane(&seq->lss_space)); + } else { + LASSERT(!lu_seq_range_is_zero(&seq->lss_space) && + lu_seq_range_is_sane(&seq->lss_space)); + } + + rc = seq_server_debugfs_init(seq); + if (rc) + GOTO(out, rc); + + EXIT; +out: + if (rc) + seq_server_fini(seq, env); + return rc; +} +EXPORT_SYMBOL(seq_server_init); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env) +{ + ENTRY; + + seq_server_debugfs_fini(seq); + seq_store_fini(seq, env); + + EXIT; +} +EXPORT_SYMBOL(seq_server_fini); + +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss) +{ + if (ss == NULL) + RETURN(0); + + if (ss->ss_server_seq) { + seq_server_fini(ss->ss_server_seq, env); + OBD_FREE_PTR(ss->ss_server_seq); + ss->ss_server_seq = NULL; + } + + if (ss->ss_control_seq) { + seq_server_fini(ss->ss_control_seq, env); + OBD_FREE_PTR(ss->ss_control_seq); + ss->ss_control_seq = NULL; + } + + if (ss->ss_client_seq) { + seq_client_fini(ss->ss_client_seq); + OBD_FREE_PTR(ss->ss_client_seq); + ss->ss_client_seq = NULL; + } + + RETURN(0); +} +EXPORT_SYMBOL(seq_site_fini); + +int fid_server_mod_init(void) +{ + LU_CONTEXT_KEY_INIT(&seq_thread_key); + return lu_context_key_register(&seq_thread_key); +} + +void fid_server_mod_exit(void) +{ + lu_context_key_degister(&seq_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h new file mode 100644 index 0000000000000..1c6587d43b52b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h @@ -0,0 +1,94 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_internal.h + * + * Author: Yury Umanets + */ +#ifndef __FID_INTERNAL_H +#define __FID_INTERNAL_H + +#include + +#ifdef HAVE_SERVER_SUPPORT +# define HAVE_SEQ_SERVER + +struct req_capsule; + +struct seq_thread_info { + struct req_capsule *sti_pill; + struct lu_seq_range sti_space; + struct lu_buf sti_buf; +}; + +enum { + SEQ_TXN_STORE_CREDITS = 20 +}; + +extern struct lu_context_key seq_thread_key; + +extern struct ldebugfs_vars seq_server_debugfs_list[]; + +/* Store API functions. 
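+ * These persist the server-side sequence state: seq_store_read()
+ * loads the saved range at startup (returning -ENODATA when nothing
+ * has been stored yet) and seq_store_update() writes the current
+ * state back, synchronously when a non-zero 'sync' is passed.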
*/ +struct dt_device; + +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt); + +void seq_store_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync); + +int seq_server_alloc_spec(struct lu_server_seq *seq, + struct lu_seq_range *spec, + const struct lu_env *env); + +int fid_server_mod_init(void); + +void fid_server_mod_exit(void); + +# endif /* HAVE_SERVER_SUPPORT */ + +/* Functions used internally in module. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env); + +extern struct dentry *seq_debugfs_dir; + +extern struct ldebugfs_vars seq_client_debugfs_list[]; + +#endif /* __FID_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c new file mode 100644 index 0000000000000..ab3a59820abc7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_lib.c + * + * Miscellaneous fid functions. + * + * Author: Nikita Danilov + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include + +/** + * A cluster-wide range from which fid-sequences are granted to servers and + * then clients. + * + * Fid namespace: + *
+ * Normal FID:        seq:64 [2^33,2^64-1]      oid:32          ver:32
+ * IGIF      :        0:32, ino:32              gen:32          0:32
+ * IDIF      :        0:31, 1:1, ost-index:16,  objid:48        0:32
+ * 
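+ * (Read column-wise against the seq:64 / oid:32 / ver:32 split of a FID:
+ * an IGIF maps an inode number and generation into the seq and oid fields,
+ * while an IDIF packs the OST index and object id there; both leave the
+ * version field at zero.)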
+ * + * The first 0x400 sequences of normal FID are reserved for special purpose. + * FID_SEQ_START + 1 is for local file id generation. + * FID_SEQ_START + 2 is for .lustre directory and its objects + */ +const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { + .lsr_start = FID_SEQ_NORMAL, + .lsr_end = (__u64)~0ULL, +}; + +/* Zero range, used for init and other purposes. */ +const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { + .lsr_start = 0, +}; + +/* Lustre Big Fs Lock fid. */ +const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL, + .f_oid = FID_OID_SPECIAL_BFL, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LUSTRE_BFL_FID); + +/** Special fid for ".lustre" directory */ +const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_DOT_LUSTRE_FID); + +/** Special fid for "fid" special object in .lustre */ +const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_OBF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_OBF_FID); + +/** Special fid for "lost+found" special object in .lustre */ +const struct lu_fid LU_LPF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_LPF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_LPF_FID); + +/** "/lost+found" - special FID for ldiskfs backend, invislbe to client. */ +const struct lu_fid LU_BACKEND_LPF_FID = { .f_seq = FID_SEQ_LOCAL_FILE, + .f_oid = OSD_LPF_OID, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_BACKEND_LPF_FID); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_request.c b/drivers/staging/lustrefsx/lustre/fid/fid_request.c new file mode 100644 index 0000000000000..93f6402a12232 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_request.c @@ -0,0 +1,615 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fid/fid_request.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +/* mdc RPC locks */ +#include +#include "fid_internal.h" + +struct dentry *seq_debugfs_dir; + +static int seq_client_rpc(struct lu_client_seq *seq, + struct lu_seq_range *output, __u32 opc, + const char *opcname) +{ + struct obd_export *exp = seq->lcs_exp; + struct ptlrpc_request *req; + struct lu_seq_range *out, *in; + __u32 *op; + unsigned int debug_mask; + int rc; + ENTRY; + + LASSERT(exp != NULL && !IS_ERR(exp)); + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY, + LUSTRE_MDS_VERSION, SEQ_QUERY); + if (req == NULL) + RETURN(-ENOMEM); + + /* Init operation code */ + op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC); + *op = opc; + + /* Zero out input range, this is not recovery yet. */ + in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE); + lu_seq_range_init(in); + + ptlrpc_request_set_replen(req); + + in->lsr_index = seq->lcs_space.lsr_index; + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + fld_range_set_mdt(in); + else + fld_range_set_ost(in); + + if (opc == SEQ_ALLOC_SUPER) { + req->rq_request_portal = SEQ_CONTROLLER_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + /* During allocating super sequence for data object, + * the current thread might hold the export of MDT0(MDT0 + * precreating objects on this OST), and it will send the + * request to MDT0 here, so we can not keep resending the + * request here, otherwise if MDT0 is failed(umounted), + * it can not release the export of MDT0 */ + if (seq->lcs_type == LUSTRE_SEQ_DATA) + req->rq_no_delay = req->rq_no_resend = 1; + debug_mask = D_CONSOLE; + } else { + if (seq->lcs_type == LUSTRE_SEQ_METADATA) { + req->rq_reply_portal = MDC_REPLY_PORTAL; + req->rq_request_portal = SEQ_METADATA_PORTAL; + } else { + req->rq_reply_portal = OSC_REPLY_PORTAL; + req->rq_request_portal = SEQ_DATA_PORTAL; + } + + debug_mask = D_INFO; + } + + /* Allow seq client RPC during recovery time. */ + req->rq_allow_replay = 1; + + ptlrpc_at_set_req_timeout(req); + + rc = ptlrpc_queue_wait(req); + + if (rc) + GOTO(out_req, rc); + + out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE); + *output = *out; + + if (!lu_seq_range_is_sane(output)) { + CERROR("%s: Invalid range received from server: " + DRANGE"\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + if (lu_seq_range_is_exhausted(output)) { + CERROR("%s: Range received from server is exhausted: " + DRANGE"]\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n", + seq->lcs_name, opcname, PRANGE(output)); + + EXIT; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +/* Request sequence-controller node to allocate new super-sequence. 
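+ * If this node hosts the sequence server itself (lcs_srv != NULL), the range
+ * is taken from the local server; otherwise it is requested from the
+ * controller with a SEQ_ALLOC_SUPER RPC. -EINPROGRESS is returned when the
+ * connection to the controller has not been set up yet.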
*/ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lcs_mutex); + + if (seq->lcs_srv) { +#ifdef HAVE_SEQ_SERVER + LASSERT(env != NULL); + rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space, + env); +#else + rc = 0; +#endif + } else { + /* Check whether the connection to seq controller has been + * setup (lcs_exp != NULL) */ + if (seq->lcs_exp == NULL) { + mutex_unlock(&seq->lcs_mutex); + RETURN(-EINPROGRESS); + } + + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_SUPER, "super"); + } + mutex_unlock(&seq->lcs_mutex); + RETURN(rc); +} + +/* Request sequence-controller node to allocate new meta-sequence. */ +static int seq_client_alloc_meta(const struct lu_env *env, + struct lu_client_seq *seq) +{ + int rc; + ENTRY; + + if (seq->lcs_srv) { +#ifdef HAVE_SEQ_SERVER + LASSERT(env != NULL); + rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env); +#else + rc = 0; +#endif + } else { + do { + /* If meta server return -EINPROGRESS or EAGAIN, + * it means meta server might not be ready to + * allocate super sequence from sequence controller + * (MDT0)yet */ + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_META, "meta"); + if (rc == -EINPROGRESS || rc == -EAGAIN) { + wait_queue_head_t waitq; + struct l_wait_info lwi; + + /* MDT0 is not ready, let's wait for 2 + * seconds and retry. */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, + NULL); + l_wait_event(waitq, 0, &lwi); + } + } while (rc == -EINPROGRESS || rc == -EAGAIN); + } + + RETURN(rc); +} + +/* Allocate new sequence for client. */ +static int seq_client_alloc_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + int rc; + ENTRY; + + LASSERT(lu_seq_range_is_sane(&seq->lcs_space)); + + if (lu_seq_range_is_exhausted(&seq->lcs_space)) { + rc = seq_client_alloc_meta(env, seq); + if (rc) { + if (rc != -EINPROGRESS) + CERROR("%s: Can't allocate new meta-sequence," + "rc = %d\n", seq->lcs_name, rc); + RETURN(rc); + } else { + CDEBUG(D_INFO, "%s: New range - "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + } else { + rc = 0; + } + + LASSERT(!lu_seq_range_is_exhausted(&seq->lcs_space)); + *seqnr = seq->lcs_space.lsr_start; + seq->lcs_space.lsr_start += 1; + + CDEBUG(D_INFO, "%s: Allocated sequence [%#llx]\n", seq->lcs_name, + *seqnr); + + RETURN(rc); +} + +static int seq_fid_alloc_prep(struct lu_client_seq *seq, + wait_queue_entry_t *link) +{ + if (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + schedule(); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_RUNNING); + return -EAGAIN; + } + + ++seq->lcs_update; + mutex_unlock(&seq->lcs_mutex); + + return 0; +} + +static void seq_fid_alloc_fini(struct lu_client_seq *seq, __u64 seqnr, + bool whole) +{ + LASSERT(seq->lcs_update == 1); + + mutex_lock(&seq->lcs_mutex); + if (seqnr != 0) { + CDEBUG(D_INFO, "%s: New sequence [0x%16.16llx]\n", + seq->lcs_name, seqnr); + + seq->lcs_fid.f_seq = seqnr; + if (whole) { + /* Since the caller require the whole seq, + * so marked this seq to be used */ + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + seq->lcs_fid.f_oid = + LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH; + } else { + seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID; + } + seq->lcs_fid.f_ver = 0; + } + + --seq->lcs_update; + wake_up_all(&seq->lcs_waitq); +} + +/** + 
* Allocate the whole non-used seq to the caller. + * + * \param[in] env pointer to the thread context + * \param[in,out] seq pointer to the client sequence manager + * \param[out] seqnr to hold the new allocated sequence + * + * \retval 0 for new sequence allocated. + * \retval Negative error number on failure. + */ +int seq_client_get_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + wait_queue_entry_t link; + int rc; + + LASSERT(seqnr != NULL); + + mutex_lock(&seq->lcs_mutex); + init_waitqueue_entry(&link, current); + + /* To guarantee that we can get a whole non-used sequence. */ + while (seq_fid_alloc_prep(seq, &link) != 0); + + rc = seq_client_alloc_seq(env, seq, seqnr); + seq_fid_alloc_fini(seq, rc ? 0 : *seqnr, true); + if (rc) + CERROR("%s: Can't allocate new sequence: rc = %d\n", + seq->lcs_name, rc); + mutex_unlock(&seq->lcs_mutex); + + return rc; +} +EXPORT_SYMBOL(seq_client_get_seq); + +/** + * Allocate new fid on passed client @seq and save it to @fid. + * + * \param[in] env pointer to the thread context + * \param[in,out] seq pointer to the client sequence manager + * \param[out] fid to hold the new allocated fid + * + * \retval 1 for notify the caller that sequence switch + * is performed to allow it to setup FLD for it. + * \retval 0 for new FID allocated in current sequence. + * \retval Negative error number on failure. + */ +int seq_client_alloc_fid(const struct lu_env *env, + struct lu_client_seq *seq, struct lu_fid *fid) +{ + wait_queue_entry_t link; + int rc; + ENTRY; + + LASSERT(seq != NULL); + LASSERT(fid != NULL); + + init_waitqueue_entry(&link, current); + mutex_lock(&seq->lcs_mutex); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST)) + seq->lcs_fid.f_oid = seq->lcs_width; + + while (1) { + u64 seqnr; + + if (unlikely(!fid_is_zero(&seq->lcs_fid) && + fid_oid(&seq->lcs_fid) < seq->lcs_width)) { + /* Just bump last allocated fid and return to caller. */ + seq->lcs_fid.f_oid++; + rc = 0; + break; + } + + /* Release seq::lcs_mutex via seq_fid_alloc_prep() to avoid + * deadlock during seq_client_alloc_seq(). */ + rc = seq_fid_alloc_prep(seq, &link); + if (rc) + continue; + + rc = seq_client_alloc_seq(env, seq, &seqnr); + /* Re-take seq::lcs_mutex via seq_fid_alloc_fini(). */ + seq_fid_alloc_fini(seq, rc ? 0 : seqnr, false); + if (rc) { + if (rc != -EINPROGRESS) + CERROR("%s: Can't allocate new sequence: " + "rc = %d\n", seq->lcs_name, rc); + mutex_unlock(&seq->lcs_mutex); + + RETURN(rc); + } + + rc = 1; + break; + } + + *fid = seq->lcs_fid; + mutex_unlock(&seq->lcs_mutex); + + CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, PFID(fid)); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_client_alloc_fid); + +/* + * Finish the current sequence due to disconnect. + * See mdc_import_event() + */ +void seq_client_flush(struct lu_client_seq *seq) +{ + wait_queue_entry_t link; + + LASSERT(seq != NULL); + init_waitqueue_entry(&link, current); + mutex_lock(&seq->lcs_mutex); + + while (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + schedule(); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_RUNNING); + } + + fid_zero(&seq->lcs_fid); + /** + * this id shld not be used for seq range allocation. + * set to -1 for dgb check. 
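+	 * That is, the index must not be reused for sequence range allocation;
+	 * the -1 value only serves as a marker for debug checks.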
+ */ + + seq->lcs_space.lsr_index = -1; + + lu_seq_range_init(&seq->lcs_space); + mutex_unlock(&seq->lcs_mutex); +} +EXPORT_SYMBOL(seq_client_flush); + +static void seq_client_debugfs_fini(struct lu_client_seq *seq) +{ + if (!IS_ERR_OR_NULL(seq->lcs_debugfs_entry)) + ldebugfs_remove(&seq->lcs_debugfs_entry); +} + +static int seq_client_debugfs_init(struct lu_client_seq *seq) +{ + int rc; + + seq->lcs_debugfs_entry = ldebugfs_register(seq->lcs_name, + seq_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(seq->lcs_debugfs_entry)) { + CERROR("%s: LdebugFS failed in seq-init\n", seq->lcs_name); + rc = seq->lcs_debugfs_entry ? PTR_ERR(seq->lcs_debugfs_entry) + : -ENOMEM; + seq->lcs_debugfs_entry = NULL; + RETURN(rc); + } + + rc = ldebugfs_add_vars(seq->lcs_debugfs_entry, + seq_client_debugfs_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager debugfs, rc %d\n", + seq->lcs_name, rc); + GOTO(out_cleanup, rc); + } + + RETURN(0); + +out_cleanup: + seq_client_debugfs_fini(seq); + return rc; +} + +void seq_client_fini(struct lu_client_seq *seq) +{ + ENTRY; + + seq_client_debugfs_fini(seq); + + if (seq->lcs_exp != NULL) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; + EXIT; +} +EXPORT_SYMBOL(seq_client_fini); + +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv) +{ + int rc; + ENTRY; + + LASSERT(seq != NULL); + LASSERT(prefix != NULL); + + seq->lcs_srv = srv; + seq->lcs_type = type; + + mutex_init(&seq->lcs_mutex); + if (type == LUSTRE_SEQ_METADATA) + seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH; + + init_waitqueue_head(&seq->lcs_waitq); + /* Make sure that things are clear before work is started. 
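+	 * seq_client_flush() below resets lcs_fid and lcs_space to their
+	 * initial, unallocated state.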
*/ + seq_client_flush(seq); + + if (exp != NULL) + seq->lcs_exp = class_export_get(exp); + + snprintf(seq->lcs_name, sizeof(seq->lcs_name), + "cli-%s", prefix); + + rc = seq_client_debugfs_init(seq); + if (rc) + seq_client_fini(seq); + RETURN(rc); +} +EXPORT_SYMBOL(seq_client_init); + +int client_fid_init(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type) +{ + struct client_obd *cli = &obd->u.cli; + char *prefix; + int rc; + ENTRY; + + down_write(&cli->cl_seq_rwsem); + OBD_ALLOC_PTR(cli->cl_seq); + if (!cli->cl_seq) + GOTO(out, rc = -ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (!prefix) + GOTO(out, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL); + OBD_FREE(prefix, MAX_OBD_NAME + 5); + + GOTO(out, rc); + +out: + if (rc && cli->cl_seq) { + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + up_write(&cli->cl_seq_rwsem); + + return rc; +} +EXPORT_SYMBOL(client_fid_init); + +int client_fid_fini(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + down_write(&cli->cl_seq_rwsem); + if (cli->cl_seq) { + seq_client_fini(cli->cl_seq); + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + up_write(&cli->cl_seq_rwsem); + + RETURN(0); +} +EXPORT_SYMBOL(client_fid_fini); + +static int __init fid_init(void) +{ +#ifdef HAVE_SERVER_SUPPORT + int rc = fid_server_mod_init(); + + if (rc) + return rc; +#endif + seq_debugfs_dir = ldebugfs_register(LUSTRE_SEQ_NAME, + debugfs_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(seq_debugfs_dir); +} + +static void __exit fid_exit(void) +{ +# ifdef HAVE_SERVER_SUPPORT + fid_server_mod_exit(); +# endif + if (!IS_ERR_OR_NULL(seq_debugfs_dir)) + ldebugfs_remove(&seq_debugfs_dir); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre File IDentifier"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(fid_init); +module_exit(fid_exit); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_store.c b/drivers/staging/lustrefsx/lustre/fid/fid_store.c new file mode 100644 index 0000000000000..1565d80811d29 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_store.c @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fid/fid_store.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include "fid_internal.h" + +static struct lu_buf *seq_store_buf(struct seq_thread_info *info) +{ + struct lu_buf *buf; + + buf = &info->sti_buf; + buf->lb_buf = &info->sti_space; + buf->lb_len = sizeof(info->sti_space); + return buf; +} + +struct seq_update_callback { + struct dt_txn_commit_cb suc_cb; + struct lu_server_seq *suc_seq; +}; + +void seq_update_cb(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct seq_update_callback *ccb; + + ccb = container_of0(cb, struct seq_update_callback, suc_cb); + + LASSERT(ccb->suc_seq != NULL); + + ccb->suc_seq->lss_need_sync = 0; + OBD_FREE_PTR(ccb); +} + +int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq) +{ + struct seq_update_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->suc_seq = seq; + seq->lss_need_sync = 1; + + dcb = &ccb->suc_cb; + dcb->dcb_func = seq_update_cb; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "seq_update_cb", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) + OBD_FREE_PTR(ccb); + return rc; +} + +/* This function implies that caller takes care about locking. */ +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync) +{ + struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev); + struct seq_thread_info *info; + struct thandle *th; + loff_t pos = 0; + int rc; + + if (dt_dev->dd_rdonly) + RETURN(0); + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + th = dt_trans_create(env, dt_dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + /* Store ranges in le format. */ + range_cpu_to_le(&info->sti_space, &seq->lss_space); + + rc = dt_declare_record_write(env, seq->lss_obj, + seq_store_buf(info), 0, th); + if (rc) + GOTO(exit, rc); + + if (out != NULL) { + rc = fld_declare_server_create(env, + seq->lss_site->ss_server_fld, + out, th); + if (rc) + GOTO(exit, rc); + } + + rc = dt_trans_start_local(env, dt_dev, th); + if (rc) + GOTO(exit, rc); + + rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th); + if (rc) { + CERROR("%s: Can't write space data, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } else if (out != NULL) { + rc = fld_server_create(env, seq->lss_site->ss_server_fld, out, + th); + if (rc) { + CERROR("%s: Can't Update fld database, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } + } + /* next sequence update will need sync until this update is committed + * in case of sync operation this is not needed obviously */ + if (!sync) + /* if callback can't be added then sync always */ + sync = !!seq_update_cb_add(th, seq); + + th->th_sync |= sync; +exit: + dt_trans_stop(env, dt_dev, th); + return rc; +} + +/* + * This function implies that caller takes care about locking or locking is not + * needed (init time). 
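+ *
+ * Returns 0 and fills seq->lss_space on success, -ENODATA when the backing
+ * object is still empty (first start), or -EIO on a short read.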
+ */ +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env) +{ + struct seq_thread_info *info; + loff_t pos = 0; + int rc; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + rc = dt_read(env, seq->lss_obj, seq_store_buf(info), &pos); + + if (rc == sizeof(info->sti_space)) { + range_le_to_cpu(&seq->lss_space, &info->sti_space); + CDEBUG(D_INFO, "%s: Space - "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); + rc = 0; + } else if (rc == 0) { + rc = -ENODATA; + } else if (rc > 0) { + CERROR("%s: Read only %d bytes of %d\n", seq->lss_name, + rc, (int)sizeof(info->sti_space)); + rc = -EIO; + } + + RETURN(rc); +} + +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt) +{ + struct dt_object *dt_obj; + struct lu_fid fid; + struct lu_attr attr; + struct dt_object_format dof; + const char *name; + int rc; + ENTRY; + + name = seq->lss_type == LUSTRE_SEQ_SERVER ? + LUSTRE_SEQ_SRV_NAME : LUSTRE_SEQ_CTL_NAME; + + if (seq->lss_type == LUSTRE_SEQ_SERVER) + lu_local_obj_fid(&fid, FID_SEQ_SRV_OID); + else + lu_local_obj_fid(&fid, FID_SEQ_CTL_OID); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | 0666; + dof.dof_type = DFT_REGULAR; + + dt_obj = dt_find_or_create(env, dt, &fid, &dof, &attr); + if (!IS_ERR(dt_obj)) { + seq->lss_obj = dt_obj; + rc = 0; + } else { + CERROR("%s: Can't find \"%s\" obj %d\n", + seq->lss_name, name, (int)PTR_ERR(dt_obj)); + rc = PTR_ERR(dt_obj); + } + + RETURN(rc); +} + +void seq_store_fini(struct lu_server_seq *seq, const struct lu_env *env) +{ + ENTRY; + + if (seq->lss_obj != NULL) { + if (!IS_ERR(seq->lss_obj)) + dt_object_put(env, seq->lss_obj); + seq->lss_obj = NULL; + } + + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c new file mode 100644 index 0000000000000..5ac2b883d0861 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -0,0 +1,621 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fid/lproc_fid.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +/* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ +#define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) +/** + * Reduce the SEQ range allocated to a node to a strict subset of the range + * currently-allocated SEQ range. If the specified range is "clear", then + * drop all allocated sequences and request a new one from the master. + * + * Note: this function should only be used for testing, it is not necessarily + * safe for production use. + */ +static int +ldebugfs_fid_write_common(const char __user *buffer, size_t count, + struct lu_seq_range *range) +{ + char kernbuf[MAX_FID_RANGE_STRLEN]; + struct lu_seq_range tmp = { + .lsr_start = 0, + }; + int rc; + + ENTRY; + LASSERT(range); + + if (count >= sizeof(kernbuf)) + RETURN(-EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + + kernbuf[count] = 0; + + if (count == 5 && strcmp(kernbuf, "clear") == 0) { + memset(range, 0, sizeof(*range)); + RETURN(count); + } + + /* of the form "[0x0000000240000400 - 0x000000028000400]" */ + rc = sscanf(kernbuf, "[%llx - %llx]\n", + (unsigned long long *)&tmp.lsr_start, + (unsigned long long *)&tmp.lsr_end); + if (rc != 2) + RETURN(-EINVAL); + if (!lu_seq_range_is_sane(&tmp) || lu_seq_range_is_zero(&tmp) || + tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) + RETURN(-EINVAL); + *range = tmp; + RETURN(0); +} + +#ifdef HAVE_SERVER_SUPPORT +/* + * Server side debugfs stuff. + */ +static ssize_t +ldebugfs_server_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_server_seq *seq; + int rc; + + ENTRY; + seq = ((struct seq_file *)file->private_data)->private; + + mutex_lock(&seq->lss_mutex); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lss_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + seq->lss_name, PRANGE(&seq->lss_space)); + } + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +ldebugfs_server_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + ENTRY; + + mutex_lock(&seq->lss_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lss_space)); + mutex_unlock(&seq->lss_mutex); + + RETURN(0); +} + +static int +ldebugfs_server_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + struct client_obd *cli; + ENTRY; + + if (seq->lss_cli) { + if (seq->lss_cli->lcs_exp != NULL) { + cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); + } else { + seq_printf(m, "%s\n", seq->lss_cli->lcs_srv->lss_name); + } + } else { + seq_puts(m, "\n"); + } + + RETURN(0); +} + +static ssize_t ldebugfs_server_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_server_seq *seq = m->private; + int rc; + + ENTRY; + mutex_lock(&seq->lss_mutex); + + rc = kstrtoull_from_user(buffer, count, 0, &seq->lss_width); + if (rc) { + CERROR("%s: invalid FID sequence width: rc = %d\n", + seq->lss_name, rc); + GOTO(out_unlock, count = rc); + } + + CDEBUG(D_INFO, "%s: Width: %llu\n", + seq->lss_name, seq->lss_width); +out_unlock: + mutex_unlock(&seq->lss_mutex); 
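+	/*
+	 * At this point count holds either the number of bytes consumed or a
+	 * negative errno set in the error path above.
+	 */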
+ + RETURN(count); +} + +static int +ldebugfs_server_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lss_mutex); + seq_printf(m, "%llu\n", seq->lss_width); + mutex_unlock(&seq->lss_mutex); + + RETURN(0); +} + +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_server_fid_server); + +struct ldebugfs_vars seq_server_debugfs_list[] = { + { .name = "space", + .fops = &ldebugfs_server_fid_space_fops }, + { .name = "width", + .fops = &ldebugfs_server_fid_width_fops }, + { .name = "server", + .fops = &ldebugfs_server_fid_server_fops}, + { NULL } +}; + +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + struct lu_server_seq *fsp_seq; + unsigned int fsp_stop:1; +}; + +/* + * XXX: below is a copy of the functions in lustre/fld/lproc_fld.c. + * we want to avoid this duplication either by exporting the + * functions or merging fid and fld into a single module. + */ +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct dt_key *key; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->load(¶m->fsp_env, param->fsp_it, *pos); + if (rc <= 0) + return NULL; + + key = iops->key(¶m->fsp_env, param->fsp_it); + if (IS_ERR(key)) + return NULL; + + *pos = be64_to_cpu(*(__u64 *)key); + + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct lu_seq_range fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)&fld_rec, 0); + if (rc != 0) { + CERROR("%s: read record error: rc = %d\n", + fld->lsf_name, rc); + } else if (fld_rec.lsr_start != 0) { + range_be_to_cpu(&fld_rec, &fld_rec); + seq_printf(p, DRANGE"\n", PRANGE(&fld_rec)); + } + + return rc; +} + +struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int 
fldb_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct lu_server_seq *ss = inode->i_private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + fld = ss->lss_site->ss_server_fld; + LASSERT(fld != NULL); + + rc = seq_open(file, &fldb_sops); + if (rc) + return rc; + + obj = fld->lsf_obj; + if (obj == NULL) { + seq = file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0); + if (IS_ERR(param->fsp_it)) + GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_seq = ss; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + seq_release(inode, file); + + return 0; +} + +static ssize_t fldb_seq_write(struct file *file, const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_seq_range range; + int rc = 0; + char _buffer[MAX_FID_RANGE_STRLEN]; + char *buffer = _buffer; + ENTRY; + + param = seq->private; + if (param == NULL) + RETURN(-EINVAL); + + if (len >= sizeof(_buffer)) + RETURN(-EINVAL); + + if (copy_from_user(buffer, buf, len)) + GOTO(out, rc = -EFAULT); + buffer[len] = 0; + + /* + * format - [0x0000000200000007-0x0000000200000008):0:mdt + */ + if (*buffer != '[') + GOTO(out, rc = -EINVAL); + buffer++; + + range.lsr_start = simple_strtoull(buffer, &buffer, 0); + if (*buffer != '-') + GOTO(out, rc = -EINVAL); + buffer++; + + range.lsr_end = simple_strtoull(buffer, &buffer, 0); + if (*buffer != ')') + GOTO(out, rc = -EINVAL); + buffer++; + if (*buffer != ':') + GOTO(out, rc = -EINVAL); + buffer++; + + range.lsr_index = simple_strtoul(buffer, &buffer, 0); + if (*buffer != ':') + GOTO(out, rc = -EINVAL); + buffer++; + + if (strncmp(buffer, "mdt", 3) == 0) + range.lsr_flags = LU_SEQ_RANGE_MDT; + else if (strncmp(buffer, "ost", 3) == 0) + range.lsr_flags = LU_SEQ_RANGE_OST; + else + GOTO(out, rc = -EINVAL); + + rc = seq_server_alloc_spec(param->fsp_seq->lss_site->ss_control_seq, + &range, ¶m->fsp_env); + +out: + RETURN(rc < 0 ? 
rc : len); +} + +const struct file_operations seq_fld_debugfs_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .write = fldb_seq_write, + .release = fldb_seq_release, +}; + +#endif /* HAVE_SERVER_SUPPORT */ + +/* Client side debugfs stuff */ +static ssize_t +ldebugfs_client_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_seq *seq; + int rc; + + ENTRY; + seq = ((struct seq_file *)file->private_data)->private; + + mutex_lock(&seq->lcs_mutex); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lcs_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + + mutex_unlock(&seq->lcs_mutex); + + RETURN(count); +} + +static int ldebugfs_client_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static ssize_t ldebugfs_client_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct lu_client_seq *seq = m->private; + u64 val; + u64 max; + int rc; + + ENTRY; + rc = kstrtoull_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + mutex_lock(&seq->lcs_mutex); + if (seq->lcs_type == LUSTRE_SEQ_DATA) + max = LUSTRE_DATA_SEQ_MAX_WIDTH; + else + max = LUSTRE_METADATA_SEQ_MAX_WIDTH; + + if (val <= max) { + seq->lcs_width = val; + + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", seq->lcs_name, + seq->lcs_width); + } else { + count = -ERANGE; + } + + mutex_unlock(&seq->lcs_mutex); + RETURN(count); +} + +static int +ldebugfs_client_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "%llu\n", seq->lcs_width); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static int +ldebugfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + + ENTRY; + mutex_lock(&seq->lcs_mutex); + seq_printf(m, DFID"\n", PFID(&seq->lcs_fid)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static int +ldebugfs_client_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + struct client_obd *cli; + ENTRY; + + if (seq->lcs_exp) { + cli = &seq->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); +#ifdef HAVE_SERVER_SUPPORT + } else { + seq_printf(m, "%s\n", seq->lcs_srv->lss_name); +#endif /* HAVE_SERVER_SUPPORT */ + } + + RETURN(0); +} + +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_server); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_fid); + +struct ldebugfs_vars seq_client_debugfs_list[] = { + { .name = "space", + .fops = &ldebugfs_client_fid_space_fops }, + { .name = "width", + .fops = &ldebugfs_client_fid_width_fops }, + { .name = "server", + .fops = &ldebugfs_client_fid_server_fops}, + { .name = "fid", + .fops = &ldebugfs_client_fid_fid_fops }, + { NULL } +}; diff --git a/drivers/staging/lustrefsx/lustre/fld/Makefile b/drivers/staging/lustrefsx/lustre/fld/Makefile new file mode 100644 index 0000000000000..722c19fe30409 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/fld/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += fld.o + +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/include + +fld-y := fld_request.o fld_cache.o lproc_fld.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules + diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c new file mode 100644 index 0000000000000..f638e0dcd1ea4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c @@ -0,0 +1,541 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_cache.c + * + * FLD (Fids Location Database) + * + * Author: Pravin Shelar + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/** + * create fld cache. + */ +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold) +{ + struct fld_cache *cache; + ENTRY; + + LASSERT(name != NULL); + LASSERT(cache_threshold < cache_size); + + OBD_ALLOC_PTR(cache); + if (cache == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&cache->fci_entries_head); + INIT_LIST_HEAD(&cache->fci_lru); + + cache->fci_cache_count = 0; + rwlock_init(&cache->fci_lock); + + strlcpy(cache->fci_name, name, + sizeof(cache->fci_name)); + + cache->fci_cache_size = cache_size; + cache->fci_threshold = cache_threshold; + + /* Init fld cache info. */ + memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); + + CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", + cache->fci_name, cache_size, cache_threshold); + + RETURN(cache); +} + +/** + * destroy fld cache. + */ +void fld_cache_fini(struct fld_cache *cache) +{ + LASSERT(cache != NULL); + fld_cache_flush(cache); + + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); + CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); + CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + + OBD_FREE_PTR(cache); +} + +/** + * delete given node from list. + */ +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node) +{ + list_del(&node->fce_list); + list_del(&node->fce_lru); + cache->fci_cache_count--; + OBD_FREE_PTR(node); +} + +/** + * fix list by checking new entry with NEXT entry in order. 
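+ *
+ * Touching entries with the same index are merged, overlapping entries are
+ * collapsed or trimmed, exact duplicates are dropped, and the scan restarts
+ * whenever an overlap had to be resolved.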
+ */ +static void fld_fix_new_list(struct fld_cache *cache) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *f_next; + struct lu_seq_range *c_range; + struct lu_seq_range *n_range; + struct list_head *head = &cache->fci_entries_head; + ENTRY; + +restart_fixup: + + list_for_each_entry_safe(f_curr, f_next, head, fce_list) { + c_range = &f_curr->fce_range; + n_range = &f_next->fce_range; + + LASSERT(lu_seq_range_is_sane(c_range)); + if (&f_next->fce_list == head) + break; + + if (c_range->lsr_flags != n_range->lsr_flags) + continue; + + LASSERTF(c_range->lsr_start <= n_range->lsr_start, + "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n", + PRANGE(c_range), PRANGE(n_range)); + + /* check merge possibility with next range */ + if (c_range->lsr_end == n_range->lsr_start) { + if (c_range->lsr_index != n_range->lsr_index) + continue; + n_range->lsr_start = c_range->lsr_start; + fld_cache_entry_delete(cache, f_curr); + continue; + } + + /* check if current range overlaps with next range. */ + if (n_range->lsr_start < c_range->lsr_end) { + if (c_range->lsr_index == n_range->lsr_index) { + n_range->lsr_start = c_range->lsr_start; + n_range->lsr_end = max(c_range->lsr_end, + n_range->lsr_end); + fld_cache_entry_delete(cache, f_curr); + } else { + if (n_range->lsr_end <= c_range->lsr_end) { + *n_range = *c_range; + fld_cache_entry_delete(cache, f_curr); + } else + n_range->lsr_start = c_range->lsr_end; + } + + /* we could have overlap over next + * range too. better restart. */ + goto restart_fixup; + } + + /* kill duplicates */ + if (c_range->lsr_start == n_range->lsr_start && + c_range->lsr_end == n_range->lsr_end) + fld_cache_entry_delete(cache, f_curr); + } + + EXIT; +} + +/** + * add node to fld cache + */ +static inline void fld_cache_entry_add(struct fld_cache *cache, + struct fld_cache_entry *f_new, + struct list_head *pos) +{ + list_add(&f_new->fce_list, pos); + list_add(&f_new->fce_lru, &cache->fci_lru); + + cache->fci_cache_count++; + fld_fix_new_list(cache); +} + +/** + * Check if cache needs to be shrunk. If so - do it. + * Remove one entry in list and so on until cache is shrunk enough. + */ +static int fld_cache_shrink(struct fld_cache *cache) +{ + struct fld_cache_entry *flde; + struct list_head *curr; + int num = 0; + ENTRY; + + LASSERT(cache != NULL); + + if (cache->fci_cache_count < cache->fci_cache_size) + RETURN(0); + + curr = cache->fci_lru.prev; + + while (cache->fci_cache_count + cache->fci_threshold > + cache->fci_cache_size && curr != &cache->fci_lru) { + + flde = list_entry(curr, struct fld_cache_entry, fce_lru); + curr = curr->prev; + fld_cache_entry_delete(cache, flde); + num++; + } + + CDEBUG(D_INFO, "%s: FLD cache - Shrunk by " + "%d entries\n", cache->fci_name, num); + + RETURN(0); +} + +/** + * kill all fld cache entries. + */ +void fld_cache_flush(struct fld_cache *cache) +{ + ENTRY; + + write_lock(&cache->fci_lock); + cache->fci_cache_size = 0; + fld_cache_shrink(cache); + write_unlock(&cache->fci_lock); + + EXIT; +} + +/** + * punch hole in existing range. divide this range and add new + * entry accordingly. 
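+ *
+ * The existing entry is split around the new one, producing three ranges in
+ * order: the head of the old range, the new range and the tail of the old
+ * range. The tail entry is allocated with GFP_ATOMIC; if that allocation
+ * fails, the new entry is dropped, since overlapping entries are not allowed
+ * in the cache.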
+ */ + +static void fld_cache_punch_hole(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + struct fld_cache_entry *fldt; + + ENTRY; + OBD_ALLOC_GFP(fldt, sizeof *fldt, GFP_ATOMIC); + if (!fldt) { + OBD_FREE_PTR(f_new); + EXIT; + /* overlap is not allowed, so dont mess up list. */ + return; + } + /* break f_curr RANGE into three RANGES: + * f_curr, f_new , fldt + */ + + /* f_new = *range */ + + /* fldt */ + fldt->fce_range.lsr_start = new_end; + fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end; + fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index; + + /* f_curr */ + f_curr->fce_range.lsr_end = new_start; + + /* add these two entries to list */ + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + fld_cache_entry_add(cache, fldt, &f_new->fce_list); + + /* no need to fixup */ + EXIT; +} + +/** + * handle range overlap in fld cache. + */ +static void fld_cache_overlap_handle(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + const u32 mdt = range->lsr_index; + + /* this is overlap case, these case are checking overlapping with + * prev range only. fixup will handle overlaping with next range. */ + + if (f_curr->fce_range.lsr_index == mdt) { + f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start, + new_start); + + f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end, + new_end); + + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (new_start <= f_curr->fce_range.lsr_start && + f_curr->fce_range.lsr_end <= new_end) { + /* case 1: new range completely overshadowed existing range. + * e.g. whole range migrated. update fld cache entry */ + + f_curr->fce_range = *range; + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (f_curr->fce_range.lsr_start < new_start && + new_end < f_curr->fce_range.lsr_end) { + /* case 2: new range fit within existing range. */ + + fld_cache_punch_hole(cache, f_curr, f_new); + + } else if (new_end <= f_curr->fce_range.lsr_end) { + /* case 3: overlap: + * [new_start [c_start new_end) c_end) + */ + + LASSERT(new_start <= f_curr->fce_range.lsr_start); + + f_curr->fce_range.lsr_start = new_end; + fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev); + + } else if (f_curr->fce_range.lsr_start <= new_start) { + /* case 4: overlap: + * [c_start [new_start c_end) new_end) + */ + + LASSERT(f_curr->fce_range.lsr_end <= new_end); + + f_curr->fce_range.lsr_end = new_start; + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + } else + CERROR("NEW range ="DRANGE" curr = "DRANGE"\n", + PRANGE(range),PRANGE(&f_curr->fce_range)); +} + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range) +{ + struct fld_cache_entry *f_new; + + LASSERT(lu_seq_range_is_sane(range)); + + OBD_ALLOC_PTR(f_new); + if (!f_new) + RETURN(ERR_PTR(-ENOMEM)); + + f_new->fce_range = *range; + RETURN(f_new); +} + +/** + * Insert FLD entry in FLD cache. + * + * This function handles all cases of merging and breaking up of + * ranges. 
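+ *
+ * The cache is shrunk first if needed (unless fci_no_shrink is set), then the
+ * sorted entry list is scanned: an entry overlapping the new range is handed
+ * to fld_cache_overlap_handle(), otherwise the new entry is linked in at its
+ * sorted position. Always returns 0.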
+ */ +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *n; + struct list_head *head; + struct list_head *prev = NULL; + const u64 new_start = f_new->fce_range.lsr_start; + const u64 new_end = f_new->fce_range.lsr_end; + __u32 new_flags = f_new->fce_range.lsr_flags; + ENTRY; + + /* + * Duplicate entries are eliminated in insert op. + * So we don't need to search new entry before starting + * insertion loop. + */ + + if (!cache->fci_no_shrink) + fld_cache_shrink(cache); + + head = &cache->fci_entries_head; + + list_for_each_entry_safe(f_curr, n, head, fce_list) { + /* add list if next is end of list */ + if (new_end < f_curr->fce_range.lsr_start || + (new_end == f_curr->fce_range.lsr_start && + new_flags != f_curr->fce_range.lsr_flags)) + break; + + prev = &f_curr->fce_list; + /* check if this range is to left of new range. */ + if (new_start < f_curr->fce_range.lsr_end && + new_flags == f_curr->fce_range.lsr_flags) { + fld_cache_overlap_handle(cache, f_curr, f_new); + goto out; + } + } + + if (prev == NULL) + prev = head; + + CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range)); + /* Add new entry to cache and lru list. */ + fld_cache_entry_add(cache, f_new, prev); +out: + RETURN(0); +} + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + int rc; + + flde = fld_cache_entry_create(range); + if (IS_ERR(flde)) + RETURN(PTR_ERR(flde)); + + write_lock(&cache->fci_lock); + rc = fld_cache_insert_nolock(cache, flde); + write_unlock(&cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); + + RETURN(rc); +} + +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *tmp; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry_safe(flde, tmp, head, fce_list) { + /* add list if next is end of list */ + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + fld_cache_entry_delete(cache, flde); + break; + } + } +} + +/** + * Delete FLD entry in FLD cache. + * + */ +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + write_lock(&cache->fci_lock); + fld_cache_delete_nolock(cache, range); + write_unlock(&cache->fci_lock); +} + +struct fld_cache_entry * +fld_cache_entry_lookup_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *got = NULL; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry(flde, head, fce_list) { + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + got = flde; + break; + } + } + + RETURN(got); +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +struct fld_cache_entry * +fld_cache_entry_lookup(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *got = NULL; + ENTRY; + + read_lock(&cache->fci_lock); + got = fld_cache_entry_lookup_nolock(cache, range); + read_unlock(&cache->fci_lock); + + RETURN(got); +} + +/** + * lookup \a seq sequence for range in fld cache. 
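+ *
+ * Returns 0 and copies the matching range into \a range when \a seq falls
+ * within a cached entry, -ENOENT otherwise. Note that on a miss the closest
+ * preceding cached range, if any, is still copied into \a range.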
+ */ +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *prev = NULL; + struct list_head *head; + ENTRY; + + read_lock(&cache->fci_lock); + head = &cache->fci_entries_head; + + cache->fci_stat.fst_count++; + list_for_each_entry(flde, head, fce_list) { + if (flde->fce_range.lsr_start > seq) { + if (prev != NULL) + *range = prev->fce_range; + break; + } + + prev = flde; + if (lu_seq_range_within(&flde->fce_range, seq)) { + *range = flde->fce_range; + + cache->fci_stat.fst_cache++; + read_unlock(&cache->fci_lock); + RETURN(0); + } + } + read_unlock(&cache->fci_lock); + RETURN(-ENOENT); +} diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c new file mode 100644 index 0000000000000..42f00da7f1363 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c @@ -0,0 +1,497 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_handler.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + * Author: WangDi + * Author: Pravin Shelar + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include + +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/* context key constructor/destructor: fld_key_init, fld_key_fini */ +LU_KEY_INIT_FINI(fld, struct fld_thread_info); + +/* context key: fld_thread_key */ +/* MGS thread may create llog file causing FLD lookup */ +LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD); + +int fld_server_mod_init(void) +{ + LU_CONTEXT_KEY_INIT(&fld_thread_key); + return lu_context_key_register(&fld_thread_key); +} + +void fld_server_mod_exit(void) +{ + lu_context_key_degister(&fld_thread_key); +} + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range, + struct thandle *th) +{ + int rc; + + rc = fld_declare_index_create(env, fld, range, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_declare_server_create); + +/** + * Insert FLD index entry and update FLD cache. + * + * This function is called from the sequence allocator when a super-sequence + * is granted to a server. 
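+ *
+ * This is the locked counterpart of fld_declare_server_create() above: it
+ * takes lsf_lock and inserts \a range into the FLD index through
+ * fld_index_create().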
+ */ +int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *range, struct thandle *th) +{ + int rc; + + mutex_lock(&fld->lsf_lock); + rc = fld_index_create(env, fld, range, th); + mutex_unlock(&fld->lsf_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_create); + +/** + * Extract index information from fld name like srv-fsname-MDT0000 + **/ +int fld_name_to_index(const char *name, u32 *index) +{ + char *dash; + int rc; + + ENTRY; + + CDEBUG(D_INFO, "get index from %s\n", name); + dash = strrchr(name, '-'); + if (!dash) + RETURN(-EINVAL); + dash++; + rc = target_name2index(dash, index, NULL); + RETURN(rc); +} + +/** + * Retrieve fldb entry from MDT0 and add to local FLDB and cache. + **/ +int fld_update_from_controller(const struct lu_env *env, + struct lu_server_fld *fld) +{ + struct fld_thread_info *info; + struct lu_seq_range *range; + struct lu_seq_range_array *lsra; + u32 index; + struct ptlrpc_request *req; + int rc; + int i; + + ENTRY; + + /* + * Update only happens during initalization, i.e. local FLDB + * does not exist yet + */ + if (!fld->lsf_new) + RETURN(0); + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + RETURN(rc); + + /* No need update fldb for MDT0 */ + if (index == 0) + RETURN(0); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + range = &info->fti_lrange; + memset(range, 0, sizeof(*range)); + range->lsr_index = index; + fld_range_set_mdt(range); + + do { + rc = fld_client_rpc(fld->lsf_control_exp, range, FLD_READ, + &req); + if (rc != 0 && rc != -EAGAIN) + GOTO(out, rc); + + LASSERT(req != NULL); + lsra = (struct lu_seq_range_array *)req_capsule_server_get( + &req->rq_pill, &RMF_GENERIC_DATA); + if (!lsra) + GOTO(out, rc = -EPROTO); + + range_array_le_to_cpu(lsra, lsra); + for (i = 0; i < lsra->lsra_count; i++) { + int rc1; + + if (lsra->lsra_lsr[i].lsr_flags != LU_SEQ_RANGE_MDT) + GOTO(out, rc = -EINVAL); + + if (lsra->lsra_lsr[i].lsr_index != index) + GOTO(out, rc = -EINVAL); + + mutex_lock(&fld->lsf_lock); + rc1 = fld_insert_entry(env, fld, &lsra->lsra_lsr[i]); + mutex_unlock(&fld->lsf_lock); + + if (rc1 != 0) + GOTO(out, rc = rc1); + } + if (rc == -EAGAIN) + *range = lsra->lsra_lsr[lsra->lsra_count - 1]; + } while (rc == -EAGAIN); + + fld->lsf_new = 1; +out: + if (req) + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_update_from_controller); + +/** + * Lookup sequece in local cache/fldb. + **/ +int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + struct lu_seq_range *erange; + struct fld_thread_info *info; + int rc; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + erange = &info->fti_lrange; + + /* Lookup it in the cache. */ + rc = fld_cache_lookup(fld->lsf_cache, seq, erange); + if (rc == 0) { + if (unlikely(fld_range_type(erange) != fld_range_type(range) && + !fld_range_is_any(range))) { + CERROR("%s: FLD cache range "DRANGE" does not match requested flag %x: rc = %d\n", + fld->lsf_name, PRANGE(erange), range->lsr_flags, + -EIO); + RETURN(-EIO); + } + *range = *erange; + RETURN(0); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_local_lookup); + +/** + * Lookup MDT/OST by seq, returns a range for given seq. + * + * If that entry is not cached in fld cache, request is sent to super + * sequence controller node (MDT0). All other MDT[1...N] and client + * cache fld entries, but this cache is not persistent. 
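+ *
+ * On MDT0 itself a cache miss is an error (-ENOENT), since all entries are
+ * expected to be present locally; other servers forward the query to the
+ * controller with an FLD_QUERY RPC and cache the returned range.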
+ */ +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + u32 index; + int rc; + + ENTRY; + + rc = fld_local_lookup(env, fld, seq, range); + if (likely(rc == 0)) + RETURN(rc); + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + RETURN(rc); + + if (index == 0 && rc == LDD_F_SV_TYPE_MDT) { + /* + * On server side, all entries should be in cache. + * If we can not find it in cache, just return error + */ + CERROR("%s: Cannot find sequence %#llx: rc = %d\n", + fld->lsf_name, seq, -ENOENT); + RETURN(-ENOENT); + } else { + if (!fld->lsf_control_exp) { + CERROR("%s: lookup %#llx, but not connects to MDT0 yet: rc = %d.\n", + fld->lsf_name, seq, -EIO); + RETURN(-EIO); + } + /* + * send request to mdt0 i.e. super seq. controller. + * This is temporary solution, long term solution is fld + * replication on all mdt servers. + */ + range->lsr_start = seq; + rc = fld_client_rpc(fld->lsf_control_exp, + range, FLD_QUERY, NULL); + if (rc == 0) + fld_cache_insert(fld->lsf_cache, range); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_lookup); + +/** + * All MDT server handle fld lookup operation. But only MDT0 has fld index. + * if entry is not found in cache we need to forward lookup request to MDT0 + */ +static int fld_handle_lookup(struct tgt_session_info *tsi) +{ + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_server_fld *fld; + struct lu_seq_range *in; + struct lu_seq_range *out; + int rc; + + ENTRY; + + in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!in) + RETURN(err_serious(-EPROTO)); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (unlikely(rc != 0)) + RETURN(err_serious(rc)); + + out = req_capsule_server_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!out) + RETURN(err_serious(-EPROTO)); + *out = *in; + + fld = lu_site2seq(site)->ss_server_fld; + + rc = fld_server_lookup(tsi->tsi_env, fld, in->lsr_start, out); + + CDEBUG(D_INFO, "%s: FLD req handle: error %d (range: "DRANGE")\n", + fld->lsf_name, rc, PRANGE(out)); + + RETURN(rc); +} + +static int fld_handle_read(struct tgt_session_info *tsi) +{ + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_seq_range *in; + void *data; + int rc; + + ENTRY; + + req_capsule_set(tsi->tsi_pill, &RQF_FLD_READ); + + in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (!in) + RETURN(err_serious(-EPROTO)); + + req_capsule_set_size(tsi->tsi_pill, &RMF_GENERIC_DATA, RCL_SERVER, + PAGE_SIZE); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (unlikely(rc != 0)) + RETURN(err_serious(rc)); + + data = req_capsule_server_get(tsi->tsi_pill, &RMF_GENERIC_DATA); + + rc = fld_server_read(tsi->tsi_env, lu_site2seq(site)->ss_server_fld, + in, data, PAGE_SIZE); + RETURN(rc); +} + +static int fld_handle_query(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + req_capsule_set(tsi->tsi_pill, &RQF_FLD_QUERY); + + rc = fld_handle_lookup(tsi); + + RETURN(rc); +} + +/* + * Returns true, if fid is local to this server node. + * + * WARNING: this function is *not* guaranteed to return false if fid is + * remote: it makes an educated conservative guess only. + * + * fid_is_local() is supposed to be used in assertion checks only. 
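+ *
+ * For illustration ('env', 'site' and 'fid' as in the prototype below), the
+ * intended use is an assertion rather than a functional branch:
+ *
+ *	LASSERT(fid_is_local(env, site, fid));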
+ */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid) +{ + int result; + struct seq_server_site *ss_site; + struct lu_seq_range *range; + struct fld_thread_info *info; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; + + result = 1; /* conservatively assume fid is local */ + ss_site = lu_site2seq(site); + if (ss_site->ss_client_fld) { + int rc; + + rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache, + fid_seq(fid), range); + if (rc == 0) + result = (range->lsr_index == ss_site->ss_node_id); + } + return result; +} +EXPORT_SYMBOL(fid_is_local); + +static void fld_server_debugfs_fini(struct lu_server_fld *fld) +{ + if (!IS_ERR_OR_NULL(fld->lsf_debugfs_entry)) + ldebugfs_remove(&fld->lsf_debugfs_entry); +} + +static int fld_server_debugfs_init(struct lu_server_fld *fld) +{ + int rc = 0; + + ENTRY; + fld->lsf_debugfs_entry = ldebugfs_register(fld->lsf_name, + fld_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(fld->lsf_debugfs_entry)) { + rc = fld->lsf_debugfs_entry ? PTR_ERR(fld->lsf_debugfs_entry) + : -ENOMEM; + fld->lsf_debugfs_entry = NULL; + RETURN(rc); + } + + rc = ldebugfs_seq_create(fld->lsf_debugfs_entry, "fldb", 0444, + &fld_debugfs_seq_fops, fld); + if (rc) + ldebugfs_remove(&fld->lsf_debugfs_entry); + + RETURN(rc); +} + +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int type) +{ + int cache_size, cache_threshold; + int rc; + + ENTRY; + + snprintf(fld->lsf_name, sizeof(fld->lsf_name), "srv-%s", prefix); + + cache_size = FLD_SERVER_CACHE_SIZE / sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * FLD_SERVER_CACHE_THRESHOLD / 100; + + mutex_init(&fld->lsf_lock); + fld->lsf_cache = fld_cache_init(fld->lsf_name, cache_size, + cache_threshold); + if (IS_ERR(fld->lsf_cache)) { + rc = PTR_ERR(fld->lsf_cache); + fld->lsf_cache = NULL; + RETURN(rc); + } + + rc = fld_index_init(env, fld, dt, type); + if (rc) + GOTO(out_cache, rc); + + rc = fld_server_debugfs_init(fld); + if (rc) + GOTO(out_index, rc); + + fld->lsf_control_exp = NULL; + fld->lsf_seq_lookup = fld_server_lookup; + + fld->lsf_seq_lookup = fld_server_lookup; + RETURN(0); +out_index: + fld_index_fini(env, fld); +out_cache: + fld_cache_fini(fld->lsf_cache); + return rc; +} +EXPORT_SYMBOL(fld_server_init); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + + fld_server_debugfs_fini(fld); + fld_index_fini(env, fld); + + if (fld->lsf_cache) { + if (!IS_ERR(fld->lsf_cache)) + fld_cache_fini(fld->lsf_cache); + fld->lsf_cache = NULL; + } + + EXIT; +} +EXPORT_SYMBOL(fld_server_fini); + +struct tgt_handler fld_handlers[] = { +TGT_FLD_HDL_VAR(0, FLD_QUERY, fld_handle_query), +TGT_FLD_HDL_VAR(0, FLD_READ, fld_handle_read), +}; +EXPORT_SYMBOL(fld_handlers); diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_index.c b/drivers/staging/lustrefsx/lustre/fld/fld_index.c new file mode 100644 index 0000000000000..f2079cb5b1f49 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_index.c @@ -0,0 +1,534 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_index.c + * + * Author: WangDi + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +static const char fld_index_name[] = "fld"; + +static const struct lu_seq_range IGIF_FLD_RANGE = { + .lsr_start = FID_SEQ_IGIF, + .lsr_end = FID_SEQ_IGIF_MAX + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = { + .lsr_start = FID_SEQ_DOT_LUSTRE, + .lsr_end = FID_SEQ_DOT_LUSTRE + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct lu_seq_range ROOT_FLD_RANGE = { + .lsr_start = FID_SEQ_ROOT, + .lsr_end = FID_SEQ_ROOT + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct dt_index_features fld_index_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(u64), + .dif_keysize_max = sizeof(u64), + .dif_recsize_min = sizeof(struct lu_seq_range), + .dif_recsize_max = sizeof(struct lu_seq_range), + .dif_ptrsize = 4 +}; + +extern struct lu_context_key fld_thread_key; + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new_range, + struct thandle *th) +{ + struct lu_seq_range *tmp; + struct lu_seq_range *range; + struct fld_thread_info *info; + int rc = 0; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; + tmp = &info->fti_irange; + memset(range, 0, sizeof(*range)); + + rc = fld_index_lookup(env, fld, new_range->lsr_start, range); + if (rc == 0) { + /* In case of duplicate entry, the location must be same */ + LASSERT((lu_seq_range_compare_loc(new_range, range) == 0)); + GOTO(out, rc = -EEXIST); + } + + if (rc != -ENOENT) { + CERROR("%s: lookup range "DRANGE" error: rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + GOTO(out, rc); + } + + /* + * Check for merge case, since the fld entry can only be increamental, + * so we will only check whether it can be merged from the left. + */ + if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && + lu_seq_range_compare_loc(new_range, range) == 0) { + range_cpu_to_be(tmp, range); + rc = dt_declare_delete(env, fld->lsf_obj, + (struct dt_key *)&tmp->lsr_start, th); + if (rc) { + CERROR("%s: declare record "DRANGE" failed: rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + GOTO(out, rc); + } + *tmp = *new_range; + tmp->lsr_start = range->lsr_start; + } else { + *tmp = *new_range; + } + + range_cpu_to_be(tmp, tmp); + rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, + (struct dt_key *)&tmp->lsr_start, th); +out: + RETURN(rc); +} + +/** + * insert range in fld store. 
+ * + * \param range range to be inserted + * \param th transaction for this operation as it could compound + * transaction. + * + * \retval 0 success + * \retval -ve error + * + * The whole fld index insertion is protected by seq->lss_mutex (see + * seq_server_alloc_super), i.e. only one thread will access fldb each + * time, so we do not need worry the fld file and cache will being + * changed between declare and create. + * Because the fld entry can only be increamental, so we will only check + * whether it can be merged from the left. + * + * Caller must hold fld->lsf_lock + **/ +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new_range, struct thandle *th) +{ + struct lu_seq_range *range; + struct lu_seq_range *tmp; + struct fld_thread_info *info; + int rc = 0; + int deleted = 0; + struct fld_cache_entry *flde; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + + LASSERT(mutex_is_locked(&fld->lsf_lock)); + + range = &info->fti_lrange; + memset(range, 0, sizeof(*range)); + tmp = &info->fti_irange; + rc = fld_index_lookup(env, fld, new_range->lsr_start, range); + if (rc != -ENOENT) { + rc = rc == 0 ? -EEXIST : rc; + GOTO(out, rc); + } + + if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && + lu_seq_range_compare_loc(new_range, range) == 0) { + range_cpu_to_be(tmp, range); + rc = dt_delete(env, fld->lsf_obj, + (struct dt_key *)&tmp->lsr_start, th); + if (rc != 0) + GOTO(out, rc); + *tmp = *new_range; + tmp->lsr_start = range->lsr_start; + deleted = 1; + } else { + *tmp = *new_range; + } + + range_cpu_to_be(tmp, tmp); + rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, + (struct dt_key *)&tmp->lsr_start, th); + if (rc != 0) { + CERROR("%s: insert range "DRANGE" failed: rc = %d\n", + fld->lsf_name, PRANGE(new_range), rc); + GOTO(out, rc); + } + + flde = fld_cache_entry_create(new_range); + if (IS_ERR(flde)) + GOTO(out, rc = PTR_ERR(flde)); + + write_lock(&fld->lsf_cache->fci_lock); + if (deleted) + fld_cache_delete_nolock(fld->lsf_cache, new_range); + rc = fld_cache_insert_nolock(fld->lsf_cache, flde); + write_unlock(&fld->lsf_cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); +out: + RETURN(rc); +} + +/** + * lookup range for a seq passed. note here we only care about the start/end, + * caller should handle the attached location data (flags, index). + * + * \param seq seq for lookup. + * \param range result of lookup. + * + * \retval 0 found, \a range is the matched range; + * \retval -ENOENT not found, \a range is the left-side range; + * \retval -ve other error; + */ +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + struct lu_seq_range *fld_rec; + struct fld_thread_info *info; + int rc; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + fld_rec = &info->fti_rec; + + rc = fld_cache_lookup(fld->lsf_cache, seq, fld_rec); + if (rc == 0) { + *range = *fld_rec; + if (lu_seq_range_within(range, seq)) + rc = 0; + else + rc = -ENOENT; + } + + CDEBUG(D_INFO, "%s: lookup seq = %#llx range : "DRANGE" rc = %d\n", + fld->lsf_name, seq, PRANGE(range), rc); + + RETURN(rc); +} + +/** + * insert entry in fld store. 
+ * + * \param env relevant lu_env + * \param fld fld store + * \param range range to be inserted + * + * \retval 0 success + * \retval -ve error + * + * Caller must hold fld->lsf_lock + **/ + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range) +{ + struct thandle *th; + struct dt_device *dt = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); + int rc; + + ENTRY; + + LASSERT(mutex_is_locked(&fld->lsf_lock)); + + if (dt->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = fld_declare_index_create(env, fld, range, th); + if (rc != 0) { + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out, rc); + + rc = fld_index_create(env, fld, range, th); + if (rc == -EEXIST) + rc = 0; +out: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_insert_entry); + +static int fld_insert_special_entries(const struct lu_env *env, + struct lu_server_fld *fld) +{ + int rc; + + rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE); + + RETURN(rc); +} + +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, int type) +{ + struct dt_object *dt_obj = NULL; + struct lu_fid fid; + struct lu_attr *attr = NULL; + struct lu_seq_range *range = NULL; + struct fld_thread_info *info; + struct dt_object_format dof; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + u32 index; + int range_count = 0; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + + lu_local_obj_fid(&fid, FLD_INDEX_OID); + OBD_ALLOC_PTR(attr); + if (!attr) + RETURN(-ENOMEM); + + memset(attr, 0, sizeof(*attr)); + attr->la_valid = LA_MODE; + attr->la_mode = S_IFREG | 0666; + dof.dof_type = DFT_INDEX; + dof.u.dof_idx.di_feat = &fld_index_features; + + dt_obj = dt_locate(env, dt, &fid); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + dt_obj = NULL; + GOTO(out, rc); + } + + LASSERT(dt_obj != NULL); + if (!dt_object_exists(dt_obj)) { + dt_object_put(env, dt_obj); + dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr); + fld->lsf_new = 1; + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name, + fld_index_name, rc); + dt_obj = NULL; + GOTO(out, rc); + } + } + + fld->lsf_obj = dt_obj; + rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features); + if (rc != 0) { + CERROR("%s: File \"%s\" is not an index: rc = %d!\n", + fld->lsf_name, fld_index_name, rc); + GOTO(out, rc); + } + + range = &info->fti_rec; + /* Load fld entry to cache */ + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0); + if (IS_ERR(it)) + GOTO(out, rc = PTR_ERR(it)); + + rc = iops->load(env, it, 0); + if (rc > 0) + rc = 0; + else if (rc == 0) + rc = iops->next(env, it); + + if (rc < 0) + GOTO(out_it_fini, rc); + + while (rc == 0) { + rc = iops->rec(env, it, (struct dt_rec *)range, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + range_be_to_cpu(range, range); + + /* + * Newly created ldiskfs IAM indexes may include a + * zeroed-out key and record. Ignore it here. 
+ */ + if (range->lsr_start < range->lsr_end) { + rc = fld_cache_insert(fld->lsf_cache, range); + if (rc != 0) + GOTO(out_it_put, rc); + + range_count++; + } + + rc = iops->next(env, it); + if (rc < 0) + GOTO(out_it_fini, rc); + } + + if (range_count == 0) + fld->lsf_new = 1; + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + GOTO(out_it_put, rc); + else + rc = 0; + + if (index == 0 && type == LU_SEQ_RANGE_MDT) { + /* + * Note: fld_insert_entry will detect whether these + * special entries already exist inside FLDB + */ + mutex_lock(&fld->lsf_lock); + rc = fld_insert_special_entries(env, fld); + mutex_unlock(&fld->lsf_lock); + if (rc != 0) { + CERROR("%s: insert special entries failed!: rc = %d\n", + fld->lsf_name, rc); + GOTO(out_it_put, rc); + } + } +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); +out: + if (attr) + OBD_FREE_PTR(attr); + + if (rc < 0) { + if (dt_obj) + dt_object_put(env, dt_obj); + fld->lsf_obj = NULL; + } + RETURN(rc); +} + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + if (fld->lsf_obj) { + if (!IS_ERR(fld->lsf_obj)) + dt_object_put(env, fld->lsf_obj); + fld->lsf_obj = NULL; + } + EXIT; +} + +int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, void *data, int data_len) +{ + struct lu_seq_range_array *lsra = data; + struct fld_thread_info *info; + struct dt_object *dt_obj = fld->lsf_obj; + struct lu_seq_range *entry; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + + ENTRY; + + lsra->lsra_count = 0; + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, range->lsr_end); + if (rc <= 0) + GOTO(out_it_fini, rc); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + entry = &info->fti_rec; + do { + rc = iops->rec(env, it, (struct dt_rec *)entry, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + if (offsetof(typeof(*lsra), lsra_lsr[lsra->lsra_count + 1]) > + data_len) + GOTO(out, rc = -EAGAIN); + + range_be_to_cpu(entry, entry); + if (entry->lsr_index == range->lsr_index && + entry->lsr_flags == range->lsr_flags && + entry->lsr_start > range->lsr_start) { + lsra->lsra_lsr[lsra->lsra_count] = *entry; + lsra->lsra_count++; + } + + rc = iops->next(env, it); + } while (rc == 0); + if (rc > 0) + rc = 0; +out: + range_array_cpu_to_le(lsra, lsra); +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h new file mode 100644 index 0000000000000..48337e0b6839b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -0,0 +1,230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_internal.h + * + * Subsystem Description: + * FLD is FID Location Database, which stores where (IE, on which MDT) + * FIDs are located. + * The database is basically a record file, each record consists of a FID + * sequence range, MDT/OST index, and flags. The FLD for the whole FS + * is only stored on the sequence controller(MDT0) right now, but each target + * also has its local FLD, which only stores the local sequence. + * + * The FLD subsystem usually has two tasks: + * 1. maintain the database, i.e. when the sequence controller allocates + * new sequence ranges to some nodes, it will call the FLD API to insert the + * location information in FLDB. + * + * 2. Handle requests from other nodes, i.e. if client needs to know where + * the FID is located, if it can not find the information in the local cache, + * it will send a FLD lookup RPC to the FLD service, and the FLD service will + * look up the FLDB entry and return the location information to client. + * + * Author: Yury Umanets + * Author: Tom WangDi + */ +#ifndef __FLD_INTERNAL_H +#define __FLD_INTERNAL_H + +#include +#include +#include + +struct fld_stats { + __u64 fst_count; + __u64 fst_cache; +}; + +typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64); + +typedef struct lu_fld_target * +(*fld_scan_func_t) (struct lu_client_fld *, __u64); + +struct lu_fld_hash { + const char *fh_name; + fld_hash_func_t fh_hash_func; + fld_scan_func_t fh_scan_func; +}; + +struct fld_cache_entry { + struct list_head fce_lru; + struct list_head fce_list; + /** + * fld cache entries are sorted on range->lsr_start field. */ + struct lu_seq_range fce_range; +}; + +struct fld_cache { + /** + * Cache guard, protects fci_hash mostly because others immutable after + * init is finished. + */ + rwlock_t fci_lock; + + /** + * Cache shrink threshold */ + int fci_threshold; + + /** + * Prefered number of cached entries */ + int fci_cache_size; + + /** + * Current number of cached entries. Protected by \a fci_lock */ + int fci_cache_count; + + /** + * LRU list fld entries. */ + struct list_head fci_lru; + + /** + * sorted fld entries. */ + struct list_head fci_entries_head; + + /** + * Cache statistics. */ + struct fld_stats fci_stat; + + /** + * Cache name used for debug and messages. */ + char fci_name[80]; + unsigned int fci_no_shrink:1; +}; + +enum { + /* 4M of FLD cache will not hurt client a lot. */ + FLD_SERVER_CACHE_SIZE = (4 * 0x100000), + + /* 1M of FLD cache will not hurt client a lot. */ + FLD_CLIENT_CACHE_SIZE = (1 * 0x100000) +}; + +enum { + /* Cache threshold is 10 percent of size. */ + FLD_SERVER_CACHE_THRESHOLD = 10, + + /* Cache threshold is 10 percent of size. 
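+	 *
+	 * For illustration, fld_server_init() and fld_client_init() turn
+	 * these byte sizes and percentages into entry counts as:
+	 *
+	 *	cache_size = FLD_CLIENT_CACHE_SIZE /
+	 *		     sizeof(struct fld_cache_entry);
+	 *	cache_threshold = cache_size *
+	 *			  FLD_CLIENT_CACHE_THRESHOLD / 100;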
*/ + FLD_CLIENT_CACHE_THRESHOLD = 10 +}; + +extern struct lu_fld_hash fld_hash[]; + +# ifdef HAVE_SERVER_SUPPORT +struct fld_thread_info { + struct lu_seq_range fti_rec; + struct lu_seq_range fti_lrange; + struct lu_seq_range fti_irange; +}; + +extern struct lu_context_key fld_thread_key; + +struct dt_device; +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, int type); + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new_range, + struct thandle *th); + +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new_range, struct thandle *th); + +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_name_to_index(const char *name, __u32 *index); + +int fld_server_mod_init(void); +void fld_server_mod_exit(void); + +int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, void *data, int data_len); + +extern const struct file_operations fld_debugfs_seq_fops; +extern struct dentry *fld_debugfs_dir; + +# endif /* HAVE_SERVER_SUPPORT */ + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op, + struct ptlrpc_request **reqp); + +extern struct ldebugfs_vars fld_client_debugfs_list[]; + +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold); + +void fld_cache_fini(struct fld_cache *cache); + +void fld_cache_flush(struct fld_cache *cache); + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range); + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range); + +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new); +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range); +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range); + +struct fld_cache_entry * +fld_cache_entry_lookup(struct fld_cache *cache, + const struct lu_seq_range *range); + +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node); + +struct fld_cache_entry * +fld_cache_entry_lookup_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); + +static inline const char * +fld_target_name(const struct lu_fld_target *tar) +{ +#ifdef HAVE_SERVER_SUPPORT + if (tar->ft_srv != NULL) + return tar->ft_srv->lsf_name; +#endif + + return tar->ft_exp->exp_obd->obd_name; +} + +#endif /* __FLD_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_request.c b/drivers/staging/lustrefsx/lustre/fld/fld_request.c new file mode 100644 index 0000000000000..3dd616e0a6e94 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_request.c @@ -0,0 +1,556 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_request.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) +{ + LASSERT(fld->lcf_count > 0); + return do_div(seq, fld->lcf_count); +} + +static struct lu_fld_target * +fld_rrb_scan(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + int hash; + + ENTRY; + + /* + * Because almost all of special sequence located in MDT0, + * it should go to index 0 directly, instead of calculating + * hash again, and also if other MDTs is not being connected, + * the fld lookup requests(for seq on MDT0) should not be + * blocked because of other MDTs + */ + if (fid_seq_is_norm(seq)) + hash = fld_rrb_hash(fld, seq); + else + hash = 0; + +again: + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == hash) + RETURN(target); + } + + if (hash != 0) { + /* + * It is possible the remote target(MDT) are not connected to + * with client yet, so we will refer this to MDT0, which should + * be connected during mount + */ + hash = 0; + goto again; + } + + CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n", + fld->lcf_name, hash, seq, fld->lcf_count); + + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? + (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; + + CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. + */ + LBUG(); + RETURN(NULL); +} + +struct lu_fld_hash fld_hash[] = { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { + NULL, + } +}; + +static struct lu_fld_target * +fld_client_get_target(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + + ENTRY; + + LASSERT(fld->lcf_hash != NULL); + + spin_lock(&fld->lcf_lock); + target = fld->lcf_hash->fh_scan_func(fld, seq); + spin_unlock(&fld->lcf_lock); + + if (target) { + CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n", + fld->lcf_name, target->ft_idx, seq); + } + + RETURN(target); +} + +/* + * Add export to FLD. This is usually done by CMM and LMV as they are main users + * of FLD module. 
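+ *
+ * Illustrative registration of a single target (sketch only; 'exp' is an
+ * export already obtained by the caller and 'idx' the index it chose; the
+ * function copies the fields into its own allocation, so a stack-allocated
+ * descriptor is fine):
+ *
+ *	struct lu_fld_target target = {
+ *		.ft_exp = exp,
+ *		.ft_srv = NULL,
+ *		.ft_idx = idx,
+ *	};
+ *
+ *	rc = fld_client_add_target(fld, &target);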
+ */ +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar) +{ + const char *name; + struct lu_fld_target *target, *tmp; + + ENTRY; + + LASSERT(tar != NULL); + name = fld_target_name(tar); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + + CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", fld->lcf_name, + name, tar->ft_idx); + + OBD_ALLOC_PTR(target); + if (!target) + RETURN(-ENOMEM); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { + if (tmp->ft_idx == tar->ft_idx) { + spin_unlock(&fld->lcf_lock); + OBD_FREE_PTR(target); + CERROR("Target %s exists in FLD and known as %s:#%llu\n", + name, fld_target_name(tmp), tmp->ft_idx); + RETURN(-EEXIST); + } + } + + target->ft_exp = tar->ft_exp; + if (target->ft_exp) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; + + list_add_tail(&target->ft_chain, &fld->lcf_targets); + + fld->lcf_count++; + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} +EXPORT_SYMBOL(fld_client_add_target); + +/* Remove export from FLD */ +int fld_client_del_target(struct lu_client_fld *fld, u64 idx) +{ + struct lu_fld_target *target, *tmp; + + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == idx) { + fld->lcf_count--; + list_del(&target->ft_chain); + spin_unlock(&fld->lcf_lock); + + if (target->ft_exp) + class_export_put(target->ft_exp); + + OBD_FREE_PTR(target); + RETURN(0); + } + } + spin_unlock(&fld->lcf_lock); + RETURN(-ENOENT); +} + +struct dentry *fld_debugfs_dir; + +static int fld_client_debugfs_init(struct lu_client_fld *fld) +{ + int rc; + + ENTRY; + fld->lcf_debugfs_entry = ldebugfs_register(fld->lcf_name, + fld_debugfs_dir, + fld_client_debugfs_list, + fld); + if (IS_ERR_OR_NULL(fld->lcf_debugfs_entry)) { + CERROR("%s: LdebugFS failed in fld-init\n", fld->lcf_name); + rc = fld->lcf_debugfs_entry ? 
PTR_ERR(fld->lcf_debugfs_entry) + : -ENOMEM; + fld->lcf_debugfs_entry = NULL; + RETURN(rc); + } + + return 0; +} + +void fld_client_debugfs_fini(struct lu_client_fld *fld) +{ + if (!IS_ERR_OR_NULL(fld->lcf_debugfs_entry)) + ldebugfs_remove(&fld->lcf_debugfs_entry); +} +EXPORT_SYMBOL(fld_client_debugfs_fini); + +static inline int hash_is_sane(int hash) +{ + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); +} + +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash) +{ + int cache_size, cache_threshold; + int rc; + + ENTRY; + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); + + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + RETURN(-EINVAL); + } + + fld->lcf_count = 0; + spin_lock_init(&fld->lcf_lock); + fld->lcf_hash = &fld_hash[hash]; + INIT_LIST_HEAD(&fld->lcf_targets); + + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; + + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + GOTO(out, rc); + } + + rc = fld_client_debugfs_init(fld); + if (rc) + GOTO(out, rc); + EXIT; +out: + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; +} +EXPORT_SYMBOL(fld_client_init); + +void fld_client_fini(struct lu_client_fld *fld) +{ + struct lu_fld_target *target, *tmp; + + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { + fld->lcf_count--; + list_del(&target->ft_chain); + if (target->ft_exp) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } + spin_unlock(&fld->lcf_lock); + + if (fld->lcf_cache) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } + + EXIT; +} +EXPORT_SYMBOL(fld_client_fini); + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, u32 fld_op, + struct ptlrpc_request **reqp) +{ + struct ptlrpc_request *req = NULL; + struct lu_seq_range *prange; + u32 *op; + int rc = 0; + struct obd_import *imp; + + ENTRY; + + LASSERT(exp != NULL); + + imp = class_exp2cliimp(exp); + switch (fld_op) { + case FLD_QUERY: + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, + LUSTRE_MDS_VERSION, FLD_QUERY); + if (!req) + RETURN(-ENOMEM); + + /* + * XXX: only needed when talking to old server(< 2.6), it should + * be removed when < 2.6 server is not supported + */ + op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); + *op = FLD_LOOKUP; + + /* + * For MDS_MDS seq lookup, it will always use LWP connection, + * but LWP will be evicted after restart, so cause the error. + * so we will set no_delay for seq lookup request, once the + * request fails because of the eviction. 
always retry here + */ + if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { + req->rq_allow_replay = 1; + req->rq_no_delay = 1; + } + break; + case FLD_READ: + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ, + LUSTRE_MDS_VERSION, FLD_READ); + if (!req) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, + RCL_SERVER, PAGE_SIZE); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) + RETURN(rc); + + prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); + *prange = *range; + ptlrpc_request_set_replen(req); + req->rq_request_portal = FLD_REQUEST_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) { + /* the same error returned by ptlrpc_import_delay_req */ + rc = -EWOULDBLOCK; + req->rq_status = rc; + } else { + obd_get_request_slot(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(&exp->exp_obd->u.cli); + } + + if (rc == -ENOENT) { + /* Don't loop forever on non-existing FID sequences. */ + GOTO(out_req, rc); + } + + if (rc != 0) { + if (imp->imp_state != LUSTRE_IMP_CLOSED && + !imp->imp_deactive && + imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS && + OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) && + rc != -ENOTSUPP) { + /* + * Since LWP is not replayable, so notify the caller + * to retry if needed after a while. + */ + rc = -EAGAIN; + } + GOTO(out_req, rc); + } + + if (fld_op == FLD_QUERY) { + prange = req_capsule_server_get(&req->rq_pill, + &RMF_FLD_MDFLD); + if (!prange) + GOTO(out_req, rc = -EFAULT); + *range = *prange; + } + + EXIT; +out_req: + if (rc != 0 || !reqp) { + ptlrpc_req_finished(req); + req = NULL; + } + + if (reqp) + *reqp = req; + + return rc; +} + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + u32 flags, const struct lu_env *env) +{ + struct lu_seq_range res = { 0 }; + struct lu_fld_target *target; + struct lu_fld_target *origin; + int rc; + + ENTRY; + + rc = fld_cache_lookup(fld->lcf_cache, seq, &res); + if (rc == 0) { + *mds = res.lsr_index; + RETURN(0); + } + + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); + origin = target; +again: + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n", + fld->lcf_name, seq, fld_target_name(target), target->ft_idx); + + res.lsr_start = seq; + fld_range_set_type(&res, flags); + +#ifdef HAVE_SERVER_SUPPORT + if (target->ft_srv) { + LASSERT(env != NULL); + rc = fld_server_lookup(env, target->ft_srv, seq, &res); + } else +#endif /* HAVE_SERVER_SUPPORT */ + { + rc = fld_client_rpc(target->ft_exp, &res, FLD_QUERY, NULL); + } + + if (rc == -ESHUTDOWN) { + /* + * If fld lookup failed because the target has been shutdown, + * then try next target in the list, until trying all targets + * or fld lookup succeeds + */ + spin_lock(&fld->lcf_lock); + /* + * If the next entry in the list is the head of the list, + * move to the next entry after the head and retrieve + * the target. Else retreive the next target entry. 
+ */ + if (target->ft_chain.next == &fld->lcf_targets) + target = list_entry(target->ft_chain.next->next, + struct lu_fld_target, ft_chain); + else + target = list_entry(target->ft_chain.next, + struct lu_fld_target, + ft_chain); + spin_unlock(&fld->lcf_lock); + if (target != origin) + goto again; + } + if (rc == 0) { + *mds = res.lsr_index; + fld_cache_insert(fld->lcf_cache, &res); + } + + RETURN(rc); +} +EXPORT_SYMBOL(fld_client_lookup); + +void fld_client_flush(struct lu_client_fld *fld) +{ + fld_cache_flush(fld->lcf_cache); +} + +static int __init fld_init(void) +{ +#ifdef HAVE_SERVER_SUPPORT + int rc; + + rc = fld_server_mod_init(); + if (rc) + return rc; +#endif /* HAVE_SERVER_SUPPORT */ + + fld_debugfs_dir = ldebugfs_register(LUSTRE_FLD_NAME, + debugfs_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(fld_debugfs_dir); +} + +static void __exit fld_exit(void) +{ +#ifdef HAVE_SERVER_SUPPORT + fld_server_mod_exit(); +#endif /* HAVE_SERVER_SUPPORT */ + + if (!IS_ERR_OR_NULL(fld_debugfs_dir)) + ldebugfs_remove(&fld_debugfs_dir); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre FID Location Database"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(fld_init); +module_exit(fld_exit); diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c new file mode 100644 index 0000000000000..a555889f57730 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -0,0 +1,358 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fld/lproc_fld.c + * + * FLD (FIDs Location Database) + * + * Author: Yury Umanets + * Di Wang + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include + +#ifdef HAVE_SERVER_SUPPORT +#include +#endif +#include +#include +#include +#include "fld_internal.h" + +static int +fld_debugfs_targets_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + struct lu_fld_target *target; + + ENTRY; + spin_lock(&fld->lcf_lock); + list_for_each_entry(target, &fld->lcf_targets, ft_chain) + seq_printf(m, "%s\n", fld_target_name(target)); + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} + +static int +fld_debugfs_hash_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + + ENTRY; + spin_lock(&fld->lcf_lock); + seq_printf(m, "%s\n", fld->lcf_hash->fh_name); + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} + +static ssize_t +fld_debugfs_hash_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_fld *fld; + struct lu_fld_hash *hash = NULL; + char fh_name[8]; + int i; + + if (count > sizeof(fh_name)) + return -ENAMETOOLONG; + + if (copy_from_user(fh_name, buffer, count) != 0) + return -EFAULT; + + fld = ((struct seq_file *)file->private_data)->private; + + for (i = 0; fld_hash[i].fh_name; i++) { + if (count != strlen(fld_hash[i].fh_name)) + continue; + + if (!strncmp(fld_hash[i].fh_name, fh_name, count)) { + hash = &fld_hash[i]; + break; + } + } + + if (hash) { + spin_lock(&fld->lcf_lock); + fld->lcf_hash = hash; + spin_unlock(&fld->lcf_lock); + + CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n", + fld->lcf_name, hash->fh_name); + } + + return count; +} + +static ssize_t ldebugfs_cache_flush_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *pos) +{ + struct seq_file *m = file->private_data; + struct lu_client_fld *fld = m->private; + + ENTRY; + fld_cache_flush(fld->lcf_cache); + + CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); + + RETURN(count); +} + +LDEBUGFS_SEQ_FOPS_RO(fld_debugfs_targets); +LDEBUGFS_SEQ_FOPS(fld_debugfs_hash); +LDEBUGFS_FOPS_WR_ONLY(fld, cache_flush); + +struct ldebugfs_vars fld_client_debugfs_list[] = { + { .name = "targets", + .fops = &fld_debugfs_targets_fops }, + { .name = "hash", + .fops = &fld_debugfs_hash_fops }, + { .name = "cache_flush", + .fops = &fld_cache_flush_fops }, + { NULL } +}; + +#ifdef HAVE_SERVER_SUPPORT +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + unsigned int fsp_stop:1; +}; + +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct dt_key *key; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->load(¶m->fsp_env, param->fsp_it, *pos); + if (rc <= 0) + return NULL; + + key = iops->key(¶m->fsp_env, param->fsp_it); + if (IS_ERR(key)) + return NULL; + + *pos = be64_to_cpu(*(__u64 *)key); + + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); 
+ iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_thread_info *info; + struct lu_seq_range *fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + info = lu_context_key_get(¶m->fsp_env.le_ctx, + &fld_thread_key); + fld_rec = &info->fti_rec; + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)fld_rec, 0); + if (rc != 0) { + CERROR("%s:read record error: rc %d\n", + fld->lsf_name, rc); + } else if (fld_rec->lsr_start != 0) { + range_be_to_cpu(fld_rec, fld_rec); + seq_printf(p, DRANGE"\n", PRANGE(fld_rec)); + } + + return rc; +} + +struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int fldb_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct lu_server_fld *fld = inode->i_private; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + rc = seq_open(file, &fldb_sops); + if (rc) + GOTO(out, rc); + + obj = fld->lsf_obj; + if (obj == NULL) { + seq = file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0); + if (IS_ERR(param->fsp_it)) + GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + lprocfs_seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + lprocfs_seq_release(inode, file); + + return 0; +} + +const struct file_operations fld_debugfs_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .release = fldb_seq_release, +}; + +# endif /* HAVE_SERVER_SUPPORT */ diff --git 
a/drivers/staging/lustrefsx/lustre/include/cl_object.h b/drivers/staging/lustrefsx/lustre/include/cl_object.h new file mode 100644 index 0000000000000..f0c8a5b4bfda0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/cl_object.h @@ -0,0 +1,2555 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. + * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). + * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct obd_info; +struct inode; + +struct cl_device; + +struct cl_object; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req_attr; + +/** + * Device in the client stack. 
+ * + * \see vvp_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time64_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time64_t cat_atime; + /** Change time. Measured in seconds since epoch. */ + time64_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; + + /* nlink of the directory */ + __u64 cat_nlink; + + /* Project identifier for quota purpose. */ + __u32 cat_projid; +}; + +/** + * Fields in cl_attr that are being set. + */ +enum cl_attr_valid { + CAT_SIZE = 1 << 0, + CAT_KMS = 1 << 1, + CAT_MTIME = 1 << 3, + CAT_ATIME = 1 << 4, + CAT_CTIME = 1 << 5, + CAT_BLOCKS = 1 << 6, + CAT_UID = 1 << 7, + CAT_GID = 1 << 8, + CAT_PROJID = 1 << 9 +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. + * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. + * + * \see vvp_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; + /** offset of page slice in cl_page buffer */ + int co_slice_off; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lu_buf coc_layout; + /** + * Description of particular stripe location in the + * cluster. 
This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; + /** + * Layout lock handle. + */ + struct ldlm_lock *coc_lock; + /** + * Operation to handle layout, OBJECT_CONF_XYZ. + */ + int coc_opc; +}; + +enum { + /** configure layout, set up a new stripe, must be called while + * holding layout lock. */ + OBJECT_CONF_SET = 0, + /** invalidate the current stripe configuration due to losing + * layout lock. */ + OBJECT_CONF_INVALIDATE = 1, + /** wait for old layout to go away so that new layout can be + * set up. */ + OBJECT_CONF_WAIT = 2 +}; + +enum { + CL_LAYOUT_GEN_NONE = (u32)-2, /* layout lock was cancelled */ + CL_LAYOUT_GEN_EMPTY = (u32)-1, /* for empty layout */ +}; + +struct cl_layout { + /** the buffer to return the layout in lov_mds_md format. */ + struct lu_buf cl_buf; + /** size of layout in lov_mds_md format. */ + size_t cl_size; + /** Layout generation. */ + u32 cl_layout_gen; + /** whether layout is a composite one */ + bool cl_is_composite; +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. + * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + */ + int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach then to the lock + * by calling cl_lock_slice_add(). Mandatory. + */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. + */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. 
+ * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. + */ + int (*coo_attr_update)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see vvp_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); + /** + * Object prune method. Called when the layout is going to change on + * this object, therefore each layer has to clean up their cache, + * mainly pages and locks. + */ + int (*coo_prune)(const struct lu_env *env, struct cl_object *obj); + /** + * Object getstripe method. + */ + int (*coo_getstripe)(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size); + /** + * Get FIEMAP mapping from the object. + */ + int (*coo_fiemap)(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen); + /** + * Get layout and generation of the object. + */ + int (*coo_layout_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *layout); + /** + * Get maximum size of the object. + */ + loff_t (*coo_maxbytes)(struct cl_object *obj); + /** + * Set request attributes. + */ + void (*coo_req_attr_set)(const struct lu_env *env, + struct cl_object *obj, + struct cl_req_attr *attr); + /** + * Flush \a obj data corresponding to \a lock. Used for DoM + * locks in llite's cancelling blocking ast callback. + */ + int (*coo_object_flush)(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock); +}; + +/** + * Extended header for client object. + */ +struct cl_object_header { + /** Standard lu_object_header. cl_object::co_lu::lo_header points + * here. */ + struct lu_object_header coh_lu; + + /** + * Parent object. It is assumed that an object has a well-defined + * parent, but not a well-defined child (there may be multiple + * sub-objects, for the same top-object). cl_object_header::coh_parent + * field allows certain code to be written generically, without + * limiting possible cl_object layouts unduly. + */ + struct cl_object_header *coh_parent; + /** + * Protects consistency between cl_attr of parent object and + * attributes of sub-objects, that the former is calculated ("merged") + * from. + * + * \todo XXX this can be read/write lock if needed. + */ + spinlock_t coh_attr_guard; + /** + * Size of cl_page + page slices + */ + unsigned short coh_page_bufsize; + /** + * Number of objects above this one: 0 for a top-object, 1 for its + * sub-object, etc. + */ + unsigned char coh_nesting; +}; + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer top-to-bottom to \a slice. + */ +#define cl_object_for_each(slice, obj) \ + list_for_each_entry((slice), \ + &(obj)->co_lu.lo_header->loh_layers,\ + co_lu.lo_linkage) + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer bottom-to-top to \a slice. 
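+ *
+ * Usage sketch (layer_teardown() is a hypothetical per-layer helper, shown
+ * only to illustrate the iteration itself; \a slice is a struct cl_object *):
+ *
+ *	struct cl_object *scan;
+ *
+ *	cl_object_for_each_reverse(scan, obj)
+ *		layer_teardown(env, scan);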
+ */ +#define cl_object_for_each_reverse(slice, obj) \ + list_for_each_entry_reverse((slice), \ + &(obj)->co_lu.lo_header->loh_layers,\ + co_lu.lo_linkage) + +/** @} cl_object */ + +#define CL_PAGE_EOF ((pgoff_t)~0ull) + +/** \addtogroup cl_page cl_page + * @{ */ + +/** \struct cl_page + * Layered client page. + * + * cl_page: represents a portion of a file, cached in the memory. All pages + * of the given file are of the same size, and are kept in the radix tree + * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects + * of the top-level file object are first class cl_objects, they have their + * own radix trees of pages and hence page is implemented as a sequence of + * struct cl_pages's, linked into double-linked list through + * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the + * corresponding radix tree at the corresponding logical offset. + * + * cl_page is associated with VM page of the hosting environment (struct + * page in Linux kernel, for example), struct page. It is assumed, that this + * association is implemented by one of cl_page layers (top layer in the + * current design) that + * + * - intercepts per-VM-page call-backs made by the environment (e.g., + * memory pressure), + * + * - translates state (page flag bits) and locking between lustre and + * environment. + * + * The association between cl_page and struct page is immutable and + * established when cl_page is created. + * + * cl_page can be "owned" by a particular cl_io (see below), guaranteeing + * this io an exclusive access to this page w.r.t. other io attempts and + * various events changing page state (such as transfer completion, or + * eviction of the page from the memory). Note, that in general cl_io + * cannot be identified with a particular thread, and page ownership is not + * exactly equal to the current thread holding a lock on the page. Layer + * implementing association between cl_page and struct page has to implement + * ownership on top of available synchronization mechanisms. + * + * While lustre client maintains the notion of an page ownership by io, + * hosting MM/VM usually has its own page concurrency control + * mechanisms. For example, in Linux, page access is synchronized by the + * per-page PG_locked bit-lock, and generic kernel code (generic_file_*()) + * takes care to acquire and release such locks as necessary around the + * calls to the file system methods (->readpage(), ->prepare_write(), + * ->commit_write(), etc.). This leads to the situation when there are two + * different ways to own a page in the client: + * + * - client code explicitly and voluntary owns the page (cl_page_own()); + * + * - VM locks a page and then calls the client, that has "to assume" + * the ownership from the VM (cl_page_assume()). + * + * Dual methods to release ownership are cl_page_disown() and + * cl_page_unassume(). + * + * cl_page is reference counted (cl_page::cp_ref). When reference counter + * drops to 0, the page is returned to the cache, unless it is in + * cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. 
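+ *
+ * A rough sketch of the two ownership paths described above (error handling
+ * omitted; \a env, \a io and \a pg are assumed to be set up by the caller):
+ *
+ *	if (cl_page_own(env, io, pg) == 0) {
+ *		... exclusive use of the page ...
+ *		cl_page_disown(env, io, pg);
+ *	}
+ *
+ * and, when the hosting VM has already locked the page:
+ *
+ *	cl_page_assume(env, io, pg);
+ *	... exclusive use of the page ...
+ *	cl_page_unassume(env, io, pg);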
+ * + * The general logic guaranteeing the absence of "existential races" for + * pages is the following: + * + * - there are fixed known ways for a thread to obtain a new reference + * to a page: + * + * - by doing a lookup in the cl_object radix tree, protected by the + * spin-lock; + * + * - by starting from VM-locked struct page and following some + * hosting environment method (e.g., following ->private pointer in + * the case of Linux kernel), see cl_vmpage_page(); + * + * - when the page enters cl_page_state::CPS_FREEING state, all these + * ways are severed with the proper synchronization + * (cl_page_delete()); + * + * - entry into cl_page_state::CPS_FREEING is serialized by the VM page + * lock; + * + * - no new references to the page in cl_page_state::CPS_FREEING state + * are allowed (checked in cl_page_get()). + * + * Together this guarantees that when last reference to a + * cl_page_state::CPS_FREEING page is released, it is safe to destroy the + * page, as neither references to it can be acquired at that point, nor + * ones exist. + * + * cl_page is a state machine. States are enumerated in enum + * cl_page_state. Possible state transitions are enumerated in + * cl_page_state_set(). State transition process (i.e., actual changing of + * cl_page::cp_state field) is protected by the lock on the underlying VM + * page. + * + * Linux Kernel implementation. + * + * Binding between cl_page and struct page (which is a typedef for + * struct page) is implemented in the vvp layer. cl_page is attached to the + * ->private pointer of the struct page, together with the setting of + * PG_private bit in page->flags, and acquiring additional reference on the + * struct page (much like struct buffer_head, or any similar file system + * private data structures). + * + * PG_locked lock is used to implement both ownership and transfer + * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}} + * states. No additional references are acquired for the duration of the + * transfer. + * + * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where + * write-out is "protected" by the special PG_writeback bit. + */ + +/** + * States of cl_page. cl_page.c assumes particular order here. + * + * The page state machine is rather crude, as it doesn't recognize finer page + * states like "dirty" or "up to date". This is because such states are not + * always well defined for the whole stack (see, for example, the + * implementation of the read-ahead, that hides page up-to-dateness to track + * cache hits accurately). Such sub-states are maintained by the layers that + * are interested in them. + */ +enum cl_page_state { + /** + * Page is in the cache, un-owned. Page leaves cached state in the + * following cases: + * + * - [cl_page_state::CPS_OWNED] io comes across the page and + * owns it; + * + * - [cl_page_state::CPS_PAGEOUT] page is dirty, the + * req-formation engine decides that it wants to include this page + * into an RPC being constructed, and yanks it from the cache; + * + * - [cl_page_state::CPS_FREEING] VM callback is executed to + * evict the page form the memory; + * + * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_CACHED, + /** + * Page is exclusively owned by some cl_io. 
Page may end up in this + * state as a result of + * + * - io creating new page and immediately owning it; + * + * - [cl_page_state::CPS_CACHED] io finding existing cached page + * and owning it; + * + * - [cl_page_state::CPS_OWNED] io finding existing owned page + * and waiting for owner to release the page; + * + * Page leaves owned state in the following cases: + * + * - [cl_page_state::CPS_CACHED] io decides to leave the page in + * the cache, doing nothing; + * + * - [cl_page_state::CPS_PAGEIN] io starts read transfer for + * this page; + * + * - [cl_page_state::CPS_PAGEOUT] io starts immediate write + * transfer for this page; + * + * - [cl_page_state::CPS_FREEING] io decides to destroy this + * page (e.g., as part of truncate or extent lock cancellation). + * + * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL + */ + CPS_OWNED, + /** + * Page is being written out, as a part of a transfer. This state is + * entered when req-formation logic decided that it wants this page to + * be sent through the wire _now_. Specifically, it means that once + * this state is achieved, transfer completion handler (with either + * success or failure indication) is guaranteed to be executed against + * this page independently of any locks and any scheduling decisions + * made by the hosting environment (that effectively means that the + * page is never put into cl_page_state::CPS_PAGEOUT state "in + * advance". This property is mentioned, because it is important when + * reasoning about possible dead-locks in the system). The page can + * enter this state as a result of + * + * - [cl_page_state::CPS_OWNED] an io requesting an immediate + * write-out of this page, or + * + * - [cl_page_state::CPS_CACHED] req-forming engine deciding + * that it has enough dirty pages cached to issue a "good" + * transfer. + * + * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer + * is completed---it is moved into cl_page_state::CPS_CACHED state. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEOUT, + /** + * Page is being read in, as a part of a transfer. This is quite + * similar to the cl_page_state::CPS_PAGEOUT state, except that + * read-in is always "immediate"---there is no such thing a sudden + * construction of read request from cached, presumably not up to date, + * pages. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEIN, + /** + * Page is being destroyed. This state is entered when client decides + * that page has to be deleted from its host object, as, e.g., a part + * of truncate. + * + * Once this state is reached, there is no way to escape it. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_FREEING, + CPS_NR +}; + +enum cl_page_type { + /** Host page, the page is from the host inode which the cl_page + * belongs to. */ + CPT_CACHEABLE = 1, + + /** Transient page, the transient cl_page is used to bind a cl_page + * to vmpage which is not belonging to the same object of cl_page. + * it is used in DirectIO and lockless IO. */ + CPT_TRANSIENT, +}; + +/** + * Fields are protected by the lock on struct page, except for atomics and + * immutables. + * + * \invariant Data type invariants are in cl_page_invariant(). 
Basically: + * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked + * list, consistent with the parent/child pointers in the cl_page::cp_obj and + * cl_page::cp_owner (when set). + */ +struct cl_page { + /** Reference counter. */ + atomic_t cp_ref; + /** An object this page is a part of. Immutable after creation. */ + struct cl_object *cp_obj; + /** vmpage */ + struct page *cp_vmpage; + /** Linkage of pages within group. Pages must be owned */ + struct list_head cp_batch; + /** List of slices. Immutable after creation. */ + struct list_head cp_layers; + /** + * Page state. This field is const to avoid accidental update, it is + * modified only internally within cl_page.c. Protected by a VM lock. + */ + const enum cl_page_state cp_state; + /** + * Page type. Only CPT_TRANSIENT is used so far. Immutable after + * creation. + */ + enum cl_page_type cp_type; + + /** + * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned + * by sub-io. Protected by a VM lock. + */ + struct cl_io *cp_owner; + /** List of references to this page, for debugging. */ + struct lu_ref cp_reference; + /** Link to an object, for debugging. */ + struct lu_ref_link cp_obj_ref; + /** Link to a queue, for debugging. */ + struct lu_ref_link cp_queue_ref; + /** Assigned if doing a sync_io */ + struct cl_sync_io *cp_sync_io; +}; + +/** + * Per-layer part of cl_page. + * + * \see vvp_page, lov_page, osc_page + */ +struct cl_page_slice { + struct cl_page *cpl_page; + pgoff_t cpl_index; + /** + * Object slice corresponding to this page slice. Immutable after + * creation. + */ + struct cl_object *cpl_obj; + const struct cl_page_operations *cpl_ops; + /** Linkage into cl_page::cp_layers. Immutable after creation. */ + struct list_head cpl_linkage; +}; + +/** + * Lock mode. For the client extent locks. + * + * \ingroup cl_lock + */ +enum cl_lock_mode { + CLM_READ, + CLM_WRITE, + CLM_GROUP, + CLM_MAX, +}; + +/** + * Requested transfer type. + */ +enum cl_req_type { + CRT_READ, + CRT_WRITE, + CRT_NR +}; + +/** + * Per-layer page operations. + * + * Methods taking an \a io argument are for the activity happening in the + * context of given \a io. Page is assumed to be owned by that io, except for + * the obvious cases (like cl_page_operations::cpo_own()). + * + * \see vvp_page_ops, lov_page_ops, osc_page_ops + */ +struct cl_page_operations { + /** + * cl_page<->struct page methods. Only one layer in the stack has to + * implement these. Current code assumes that this functionality is + * provided by the topmost layer, see cl_page_disown0() as an example. + */ + + /** + * Called when \a io acquires this page into the exclusive + * ownership. When this method returns, it is guaranteed that the is + * not owned by other io, and no transfer is going on against + * it. Optional. + * + * \see cl_page_own() + * \see vvp_page_own(), lov_page_own() + */ + int (*cpo_own)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); + /** Called when ownership it yielded. Optional. + * + * \see cl_page_disown() + * \see vvp_page_disown() + */ + void (*cpo_disown)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Called for a page that is already "owned" by \a io from VM point of + * view. Optional. + * + * \see cl_page_assume() + * \see vvp_page_assume(), lov_page_assume() + */ + void (*cpo_assume)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** Dual to cl_page_operations::cpo_assume(). 
Optional. Called + * bottom-to-top when IO releases a page without actually unlocking + * it. + * + * \see cl_page_unassume() + * \see vvp_page_unassume() + */ + void (*cpo_unassume)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Announces whether the page contains valid data or not by \a uptodate. + * + * \see cl_page_export() + * \see vvp_page_export() + */ + void (*cpo_export)(const struct lu_env *env, + const struct cl_page_slice *slice, int uptodate); + /** + * Checks whether underlying VM page is locked (in the suitable + * sense). Used for assertions. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. (Should never happen.) + */ + int (*cpo_is_vmlocked)(const struct lu_env *env, + const struct cl_page_slice *slice); + + /** + * Update file attributes when all we have is this page. Used for tiny + * writes to update attributes when we don't have a full cl_io. + */ + void (*cpo_page_touch)(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to); + /** + * Page destruction. + */ + + /** + * Called when page is truncated from the object. Optional. + * + * \see cl_page_discard() + * \see vvp_page_discard(), osc_page_discard() + */ + void (*cpo_discard)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Called when page is removed from the cache, and is about to being + * destroyed. Optional. + * + * \see cl_page_delete() + * \see vvp_page_delete(), osc_page_delete() + */ + void (*cpo_delete)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** Destructor. Frees resources and slice itself. */ + void (*cpo_fini)(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec); + /** + * Optional debugging helper. Prints given page slice. + * + * \see cl_page_print() + */ + int (*cpo_print)(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p); + /** + * \name transfer + * + * Transfer methods. + * + * @{ + */ + /** + * Request type dependent vector of operations. + * + * Transfer operations depend on transfer mode (cl_req_type). To avoid + * passing transfer mode to each and every of these methods, and to + * avoid branching on request type inside of the methods, separate + * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are + * provided. That is, method invocation usually looks like + * + * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...); + */ + struct { + /** + * Called when a page is submitted for a transfer as a part of + * cl_page_list. + * + * \return 0 : page is eligible for submission; + * \return -EALREADY : skip this page; + * \return -ve : error. + * + * \see cl_page_prep() + */ + int (*cpo_prep)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Completion handler. This is guaranteed to be eventually + * fired after cl_page_operations::cpo_prep() or + * cl_page_operations::cpo_make_ready() call. + * + * This method can be called in a non-blocking context. It is + * guaranteed however, that the page involved and its object + * are pinned in memory (and, hence, calling cl_page_put() is + * safe). 
+ * + * \see cl_page_completion() + */ + void (*cpo_completion)(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret); + /** + * Called when cached page is about to be added to the + * ptlrpc request as a part of req formation. + * + * \return 0 : proceed with this page; + * \return -EAGAIN : skip this page; + * \return -ve : error. + * + * \see cl_page_make_ready() + */ + int (*cpo_make_ready)(const struct lu_env *env, + const struct cl_page_slice *slice); + } io[CRT_NR]; + /** + * Tell transfer engine that only [to, from] part of a page should be + * transmitted. + * + * This is used for immediate transfers. + * + * \todo XXX this is not very good interface. It would be much better + * if all transfer parameters were supplied as arguments to + * cl_io_operations::cio_submit() call, but it is not clear how to do + * this for page queues. + * + * \see cl_page_clip() + */ + void (*cpo_clip)(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to); + /** + * \pre the page was queued for transferring. + * \post page is removed from client's pending list, or -EBUSY + * is returned if it has already been in transferring. + * + * This is one of seldom page operation which is: + * 0. called from top level; + * 1. don't have vmpage locked; + * 2. every layer should synchronize execution of its ->cpo_cancel() + * with completion handlers. Osc uses client obd lock for this + * purpose. Based on there is no vvp_page_cancel and + * lov_page_cancel(), cpo_cancel is defacto protected by client lock. + * + * \see osc_page_cancel(). + */ + int (*cpo_cancel)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Write out a page by kernel. This is only called by ll_writepage + * right now. + * + * \see cl_page_flush() + */ + int (*cpo_flush)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** @} transfer */ +}; + +/** + * Helper macro, dumping detailed information about \a page into a log. + */ +#define CL_PAGE_DEBUG(mask, env, page, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_page_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Helper macro, dumping shorter information about \a page into a log. + */ +#define CL_PAGE_HEADER(mask, env, page, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +static inline struct page *cl_page_vmpage(const struct cl_page *page) +{ + LASSERT(page->cp_vmpage != NULL); + return page->cp_vmpage; +} + +/** + * Check if a cl_page is in use. + * + * Client cache holds a refcount, this refcount will be dropped when + * the page is taken out of cache, see vvp_page_delete(). + */ +static inline bool __page_in_use(const struct cl_page *page, int refc) +{ + return (atomic_read(&page->cp_ref) > refc + 1); +} + +/** + * Caller itself holds a refcount of cl_page. + */ +#define cl_page_in_use(pg) __page_in_use(pg, 1) +/** + * Caller doesn't hold a refcount. + */ +#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) + +/** @} cl_page */ + +/** \addtogroup cl_lock cl_lock + * @{ */ +/** \struct cl_lock + * + * Extent locking on the client. 
+ * + * LAYERING + * + * The locking model of the new client code is built around + * + * struct cl_lock + * + * data-type representing an extent lock on a regular file. cl_lock is a + * layered object (much like cl_object and cl_page), it consists of a header + * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to + * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. + * + * Typical cl_lock consists of one layer: + * + * - lov_lock (lov specific data). + * + * lov_lock contains an array of sub-locks. Each of these sub-locks is a + * normal cl_lock: it has a header (struct cl_lock) and a list of layers: + * + * - osc_lock + * + * Each sub-lock is associated with a cl_object (representing stripe + * sub-object or the file to which top-level cl_lock is associated to), and is + * linked into that cl_object::coh_locks. In this respect cl_lock is similar to + * cl_object (that at lov layer also fans out into multiple sub-objects), and + * is different from cl_page, that doesn't fan out (there is usually exactly + * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock + * a "top-lock" and its lovsub-osc portion a "sub-lock". + * + * LIFE CYCLE + * + * cl_lock is a cacheless data container for the requirements of locks to + * complete the IO. cl_lock is created before I/O starts and destroyed when the + * I/O is complete. + * + * cl_lock depends on LDLM lock to fulfill lock semantics. LDLM lock is attached + * to cl_lock at OSC layer. LDLM lock is still cacheable. + * + * INTERFACE AND USAGE + * + * Two major methods are supported for cl_lock: clo_enqueue and clo_cancel. A + * cl_lock is enqueued by cl_lock_request(), which will call clo_enqueue() + * methods for each layer to enqueue the lock. At the LOV layer, if a cl_lock + * consists of multiple sub cl_locks, each sub locks will be enqueued + * correspondingly. At OSC layer, the lock enqueue request will tend to reuse + * cached LDLM lock; otherwise a new LDLM lock will have to be requested from + * OST side. + * + * cl_lock_cancel() must be called to release a cl_lock after use. clo_cancel() + * method will be called for each layer to release the resource held by this + * lock. At OSC layer, the reference count of LDLM lock, which is held at + * clo_enqueue time, is released. + * + * LDLM lock can only be canceled if there is no cl_lock using it. + * + * Overall process of the locking during IO operation is as following: + * + * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock() + * is called on each layer. Responsibility of this method is to add locks, + * needed by a given layer into cl_io.ci_lockset. + * + * - once locks for all layers were collected, they are sorted to avoid + * dead-locks (cl_io_locks_sort()), and enqueued. + * + * - when all locks are acquired, IO is performed; + * + * - locks are released after IO is complete. + * + * Striping introduces major additional complexity into locking. The + * fundamental problem is that it is generally unsafe to actively use (hold) + * two locks on the different OST servers at the same time, as this introduces + * inter-server dependency and can lead to cascading evictions. + * + * Basic solution is to sub-divide large read/write IOs into smaller pieces so + * that no multi-stripe locks are taken (note that this design abandons POSIX + * read/write semantics). Such pieces ideally can be executed concurrently. At + * the same time, certain types of IO cannot be sub-divived, without + * sacrificing correctness. 
This includes: + * + * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee + * atomicity; + * + * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken. + * + * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where + * buf is a part of memory mapped Lustre file, a lock or locks protecting buf + * has to be held together with the usual lock on [offset, offset + count]. + * + * Interaction with DLM + * + * In the expected setup, cl_lock is ultimately backed up by a collection of + * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is + * implemented in osc layer, that also matches DLM events (ASTs, cancellation, + * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed + * description of interaction with DLM. + */ + +/** + * Lock description. + */ +struct cl_lock_descr { + /** Object this lock is granted for. */ + struct cl_object *cld_obj; + /** Index of the first page protected by this lock. */ + pgoff_t cld_start; + /** Index of the last page (inclusive) protected by this lock. */ + pgoff_t cld_end; + /** Group ID, for group lock */ + __u64 cld_gid; + /** Lock mode. */ + enum cl_lock_mode cld_mode; + /** + * flags to enqueue lock. A combination of bit-flags from + * enum cl_enq_flags. + */ + __u32 cld_enq_flags; +}; + +#define DDESCR "%s(%d):[%lu, %lu]:%x" +#define PDESCR(descr) \ + cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ + (descr)->cld_start, (descr)->cld_end, (descr)->cld_enq_flags + +const char *cl_lock_mode_name(const enum cl_lock_mode mode); + +/** + * Layered client lock. + */ +struct cl_lock { + /** List of slices. Immutable after creation. */ + struct list_head cll_layers; + /** lock attribute, extent, cl_object, etc. */ + struct cl_lock_descr cll_descr; +}; + +/** + * Per-layer part of cl_lock + * + * \see lov_lock, osc_lock + */ +struct cl_lock_slice { + struct cl_lock *cls_lock; + /** Object slice corresponding to this lock slice. Immutable after + * creation. */ + struct cl_object *cls_obj; + const struct cl_lock_operations *cls_ops; + /** Linkage into cl_lock::cll_layers. Immutable after creation. */ + struct list_head cls_linkage; +}; + +/** + * + * \see lov_lock_ops, osc_lock_ops + */ +struct cl_lock_operations { + /** @{ */ + /** + * Attempts to enqueue the lock. Called top-to-bottom. + * + * \retval 0 this layer has enqueued the lock successfully + * \retval >0 this layer has enqueued the lock, but need to wait on + * @anchor for resources + * \retval -ve failure + * + * \see lov_lock_enqueue(), osc_lock_enqueue() + */ + int (*clo_enqueue)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, struct cl_sync_io *anchor); + /** + * Cancel a lock, release its DLM lock ref, while does not cancel the + * DLM lock + */ + void (*clo_cancel)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} */ + /** + * Destructor. Frees resources and the slice. + * + * \see lov_lock_fini(), osc_lock_fini() + */ + void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); + /** + * Optional debugging helper. Prints given lock slice. + */ + int (*clo_print)(const struct lu_env *env, + void *cookie, lu_printer_t p, + const struct cl_lock_slice *slice); +}; + +#define CL_LOCK_DEBUG(mask, env, lock, format, ...) 
\ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +#define CL_LOCK_ASSERT(expr, env, lock) do { \ + if (likely(expr)) \ + break; \ + \ + CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ + LBUG(); \ +} while (0) + +/** @} cl_lock */ + +/** \addtogroup cl_page_list cl_page_list + * Page list used to perform collective operations on a group of pages. + * + * Pages are added to the list one by one. cl_page_list acquires a reference + * for every page in it. Page list is used to perform collective operations on + * pages: + * + * - submit pages for an immediate transfer, + * + * - own pages on behalf of certain io (waiting for each page in turn), + * + * - discard pages. + * + * When list is finalized, it releases references on all pages it still has. + * + * \todo XXX concurrency control. + * + * @{ + */ +struct cl_page_list { + unsigned pl_nr; + struct list_head pl_pages; + struct task_struct *pl_owner; +}; + +/** + * A 2-queue of pages. A convenience data-type for common use case, 2-queue + * contains an incoming page list and an outgoing page list. + */ +struct cl_2queue { + struct cl_page_list c2_qin; + struct cl_page_list c2_qout; +}; + +/** @} cl_page_list */ + +/** \addtogroup cl_io cl_io + * @{ */ +/** \struct cl_io + * I/O + * + * cl_io represents a high level I/O activity like + * read(2)/write(2)/truncate(2) system call, or cancellation of an extent + * lock. + * + * cl_io is a layered object, much like cl_{object,page,lock} but with one + * important distinction. We want to minimize number of calls to the allocator + * in the fast path, e.g., in the case of read(2) when everything is cached: + * client already owns the lock over region being read, and data are cached + * due to read-ahead. To avoid allocation of cl_io layers in such situations, + * per-layer io state is stored in the session, associated with the io, see + * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized + * by using free-lists, see cl_env_get(). + * + * There is a small predefined number of possible io types, enumerated in enum + * cl_io_type. + * + * cl_io is a state machine, that can be advanced concurrently by the multiple + * threads. It is up to these threads to control the concurrency and, + * specifically, to detect when io is done, and its state can be safely + * released. + * + * For read/write io overall execution plan is as following: + * + * (0) initialize io state through all layers; + * + * (1) loop: prepare chunk of work to do + * + * (2) call all layers to collect locks they need to process current chunk + * + * (3) sort all locks to avoid dead-locks, and acquire them + * + * (4) process the chunk: call per-page methods + * cl_io_operations::cio_prepare_write(), + * cl_io_operations::cio_commit_write() for write) + * + * (5) release locks + * + * (6) repeat loop. + * + * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to + * address allocation efficiency issues mentioned above), and returns with the + * special error condition from per-page method when current sub-io has to + * block. This causes io loop to be repeated, and lov switches to the next + * sub-io in its cl_io_operations::cio_iter_init() implementation. 
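+ *
+ * In terms of the per-layer methods defined below, a single iteration of the
+ * above plan roughly corresponds to (a conceptual sketch, not the literal
+ * body of cl_io_loop()):
+ *
+ *	cio_iter_init();
+ *	cio_lock();
+ *	cio_start();
+ *	cio_end();
+ *	cio_unlock();
+ *	cio_iter_fini();
+ *
+ * repeated until the io is complete, with a single cio_fini() at the end.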
+ */ + +/** IO types */ +enum cl_io_type { + /** read system call */ + CIT_READ = 1, + /** write system call */ + CIT_WRITE, + /** truncate, utime system calls */ + CIT_SETATTR, + /** get data version */ + CIT_DATA_VERSION, + /** + * page fault handling + */ + CIT_FAULT, + /** + * fsync system call handling + * To write out a range of file + */ + CIT_FSYNC, + /** + * glimpse. An io context to acquire glimpse lock. + */ + CIT_GLIMPSE, + /** + * Miscellaneous io. This is used for occasional io activity that + * doesn't fit into other types. Currently this is used for: + * + * - cancellation of an extent lock. This io exists as a context + * to write dirty pages from under the lock being canceled back + * to the server; + * + * - VM induced page write-out. An io context for writing page out + * for memory cleansing; + * + * - grouplock. An io context to acquire group lock. + * + * CIT_MISC io is used simply as a context in which locks and pages + * are manipulated. Such io has no internal "process", that is, + * cl_io_loop() is never called for it. + */ + CIT_MISC, + /** + * ladvise handling + * To give advice about access of a file + */ + CIT_LADVISE, + CIT_OP_NR +}; + +/** + * States of cl_io state machine + */ +enum cl_io_state { + /** Not initialized. */ + CIS_ZERO, + /** Initialized. */ + CIS_INIT, + /** IO iteration started. */ + CIS_IT_STARTED, + /** Locks taken. */ + CIS_LOCKED, + /** Actual IO is in progress. */ + CIS_IO_GOING, + /** IO for the current iteration finished. */ + CIS_IO_FINISHED, + /** Locks released. */ + CIS_UNLOCKED, + /** Iteration completed. */ + CIS_IT_ENDED, + /** cl_io finalized. */ + CIS_FINI +}; + +/** + * IO state private for a layer. + * + * This is usually embedded into layer session data, rather than allocated + * dynamically. + * + * \see vvp_io, lov_io, osc_io + */ +struct cl_io_slice { + struct cl_io *cis_io; + /** corresponding object slice. Immutable after creation. */ + struct cl_object *cis_obj; + /** io operations. Immutable after creation. */ + const struct cl_io_operations *cis_iop; + /** + * linkage into a list of all slices for a given cl_io, hanging off + * cl_io::ci_layers. Immutable after creation. + */ + struct list_head cis_linkage; +}; + +typedef void (*cl_commit_cbt)(const struct lu_env *, struct cl_io *, + struct cl_page *); + +struct cl_read_ahead { + /* Maximum page index the readahead window will end. + * This is determined DLM lock coverage, RPC and stripe boundary. + * cra_end is included. */ + pgoff_t cra_end; + /* optimal RPC size for this read, by pages */ + unsigned long cra_rpc_size; + /* Release callback. If readahead holds resources underneath, this + * function should be called to release it. */ + void (*cra_release)(const struct lu_env *env, void *cbdata); + /* Callback data for cra_release routine */ + void *cra_cbdata; +}; + +static inline void cl_read_ahead_release(const struct lu_env *env, + struct cl_read_ahead *ra) +{ + if (ra->cra_release != NULL) + ra->cra_release(env, ra->cra_cbdata); + memset(ra, 0, sizeof(*ra)); +} + + +/** + * Per-layer io operations. + * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops + */ +struct cl_io_operations { + /** + * Vector of io state transition methods for every io type. + * + * \see cl_page_operations::io + */ + struct { + /** + * Prepare io iteration at a given layer. + * + * Called top-to-bottom at the beginning of each iteration of + * "io loop" (if it makes sense for this type of io). Here + * layer selects what work it will do during this iteration. 
+ * + * \see cl_io_operations::cio_iter_fini() + */ + int (*cio_iter_init) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize io iteration. + * + * Called bottom-to-top at the end of each iteration of "io + * loop". Here layers can decide whether IO has to be + * continued. + * + * \see cl_io_operations::cio_iter_init() + */ + void (*cio_iter_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Collect locks for the current iteration of io. + * + * Called top-to-bottom to collect all locks necessary for + * this iteration. This methods shouldn't actually enqueue + * anything, instead it should post a lock through + * cl_io_lock_add(). Once all locks are collected, they are + * sorted and enqueued in the proper order. + */ + int (*cio_lock) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize unlocking. + * + * Called bottom-to-top to finish layer specific unlocking + * functionality, after generic code released all locks + * acquired by cl_io_operations::cio_lock(). + */ + void (*cio_unlock)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Start io iteration. + * + * Once all locks are acquired, called top-to-bottom to + * commence actual IO. In the current implementation, + * top-level vvp_io_{read,write}_start() does all the work + * synchronously by calling generic_file_*(), so other layers + * are called when everything is done. + */ + int (*cio_start)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called top-to-bottom at the end of io loop. Here layer + * might wait for an unfinished asynchronous io. + */ + void (*cio_end) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called bottom-to-top to notify layers that read/write IO + * iteration finished, with \a nob bytes transferred. + */ + void (*cio_advance)(const struct lu_env *env, + const struct cl_io_slice *slice, + size_t nob); + /** + * Called once per io, bottom-to-top to release io resources. + */ + void (*cio_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + } op[CIT_OP_NR]; + + /** + * Submit pages from \a queue->c2_qin for IO, and move + * successfully submitted pages into \a queue->c2_qout. Return + * non-zero if failed to submit even the single page. If + * submission failed after some pages were moved into \a + * queue->c2_qout, completion callback with non-zero ioret is + * executed on them. + */ + int (*cio_submit)(const struct lu_env *env, + const struct cl_io_slice *slice, + enum cl_req_type crt, + struct cl_2queue *queue); + /** + * Queue async page for write. + * The difference between cio_submit and cio_queue is that + * cio_submit is for urgent request. + */ + int (*cio_commit_async)(const struct lu_env *env, + const struct cl_io_slice *slice, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb); + /** + * Decide maximum read ahead extent + * + * \pre io->ci_type == CIT_READ + */ + int (*cio_read_ahead)(const struct lu_env *env, + const struct cl_io_slice *slice, + pgoff_t start, struct cl_read_ahead *ra); + /** + * Optional debugging helper. Print given io slice. + */ + int (*cio_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_io_slice *slice); +}; + +/** + * Flags to lock enqueue procedure. + * \ingroup cl_lock + */ +enum cl_enq_flags { + /** + * instruct server to not block, if conflicting lock is found. Instead + * -EWOULDBLOCK is returned immediately. 
+ */ + CEF_NONBLOCK = 0x00000001, + /** + * Tell lower layers this is a glimpse request, translated to + * LDLM_FL_HAS_INTENT at LDLM layer. + * + * Also, because glimpse locks never block other locks, we count this + * as automatically compatible with other osc locks. + * (see osc_lock_compatible) + */ + CEF_GLIMPSE = 0x00000002, + /** + * tell the server to instruct (though a flag in the blocking ast) an + * owner of the conflicting lock, that it can drop dirty pages + * protected by this lock, without sending them to the server. + */ + CEF_DISCARD_DATA = 0x00000004, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks, glimpse locks, manually requested locks + * (LU_LADVISE_LOCKAHEAD) that must never be converted into lockless + * mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock, cl_request_lock(). + */ + CEF_MUST = 0x00000008, + /** + * tell the sub layers that never request a `real' lock. This flag is + * not used currently. + * + * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless + * conversion policy: ci_lockreq describes generic information of lock + * requirement for this IO, especially for locks which belong to the + * object doing IO; however, lock itself may have precise requirements + * that are described by the enqueue flags. + */ + CEF_NEVER = 0x00000010, + /** + * tell the dlm layer this is a speculative lock request + * speculative lock requests are locks which are not requested as part + * of an I/O operation. Instead, they are requested because we expect + * to use them in the future. They are requested asynchronously at the + * ptlrpc layer. + * + * Currently used for asynchronous glimpse locks and manually requested + * locks (LU_LADVISE_LOCKAHEAD). + */ + CEF_SPECULATIVE = 0x00000020, + /** + * enqueue a lock to test DLM lock existence. + */ + CEF_PEEK = 0x00000040, + /** + * Lock match only. Used by group lock in I/O as group lock + * is known to exist. + */ + CEF_LOCK_MATCH = 0x00000080, + /** + * tell the DLM layer to lock only the requested range + */ + CEF_LOCK_NO_EXPAND = 0x00000100, + /** + * mask of enq_flags. + */ + CEF_MASK = 0x000001ff, +}; + +/** + * Link between lock and io. Intermediate structure is needed, because the + * same lock can be part of multiple io's simultaneously. + */ +struct cl_io_lock_link { + /** linkage into one of cl_lockset lists. */ + struct list_head cill_linkage; + struct cl_lock cill_lock; + /** optional destructor */ + void (*cill_fini)(const struct lu_env *env, + struct cl_io_lock_link *link); +}; +#define cill_descr cill_lock.cll_descr + +/** + * Lock-set represents a collection of locks, that io needs at a + * time. 
Generally speaking, client tries to avoid holding multiple locks when + * possible, because + * + * - holding extent locks over multiple ost's introduces the danger of + * "cascading timeouts"; + * + * - holding multiple locks over the same ost is still dead-lock prone, + * see comment in osc_lock_enqueue(), + * + * but there are certain situations where this is unavoidable: + * + * - O_APPEND writes have to take [0, EOF] lock for correctness; + * + * - truncate has to take [new-size, EOF] lock for correctness; + * + * - SNS has to take locks across full stripe for correctness; + * + * - in the case when user level buffer, supplied to {read,write}(file0), + * is a part of a memory mapped lustre file, client has to take a dlm + * locks on file0, and all files that back up the buffer (or a part of + * the buffer, that is being processed in the current chunk, in any + * case, there are situations where at least 2 locks are necessary). + * + * In such cases we at least try to take locks in the same consistent + * order. To this end, all locks are first collected, then sorted, and then + * enqueued. + */ +struct cl_lockset { + /** locks to be acquired. */ + struct list_head cls_todo; + /** locks acquired. */ + struct list_head cls_done; +}; + +/** + * Lock requirements(demand) for IO. It should be cl_io_lock_req, + * but 'req' is always to be thought as 'request' :-) + */ +enum cl_io_lock_dmd { + /** Always lock data (e.g., O_APPEND). */ + CILR_MANDATORY = 0, + /** Layers are free to decide between local and global locking. */ + CILR_MAYBE, + /** Never lock: there is no cache (e.g., liblustre). */ + CILR_NEVER +}; + +enum cl_fsync_mode { + /** start writeback, do not wait for them to finish */ + CL_FSYNC_NONE = 0, + /** start writeback and wait for them to finish */ + CL_FSYNC_LOCAL = 1, + /** discard all of dirty pages in a specific file range */ + CL_FSYNC_DISCARD = 2, + /** start writeback and make sure they have reached storage before + * return. OST_SYNC RPC must be issued and finished */ + CL_FSYNC_ALL = 3 +}; + +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; +}; + +/** + * State for io. + * + * cl_io is shared by all threads participating in this IO (in current + * implementation only one thread advances IO, but parallel IO design and + * concurrent copy_*_user() require multiple threads acting on the same IO. It + * is up to these threads to serialize their activities, including updates to + * mutable cl_io fields. + */ +struct cl_io { + /** type of this IO. Immutable after creation. */ + enum cl_io_type ci_type; + /** current state of cl_io state machine. */ + enum cl_io_state ci_state; + /** main object this io is against. Immutable after creation. */ + struct cl_object *ci_obj; + /** + * Upper layer io, of which this io is a part of. Immutable after + * creation. + */ + struct cl_io *ci_parent; + /** List of slices. Immutable after creation. */ + struct list_head ci_layers; + /** list of locks (to be) acquired by this io. */ + struct cl_lockset ci_lockset; + /** lock requirements, this is just a help info for sublayers. 
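+	 * For example (illustrative only), an O_APPEND write could set
+	 *
+	 *	io->ci_lockreq = CILR_MANDATORY;
+	 *
+	 * (cf. enum cl_io_lock_dmd above), while io without such a
+	 * requirement can use CILR_MAYBE and let the layers decide.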
*/ + enum cl_io_lock_dmd ci_lockreq; + /** layout version when this IO occurs */ + __u32 ci_layout_version; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; + struct cl_setattr_io { + struct ost_lvb sa_attr; + unsigned int sa_attr_flags; + unsigned int sa_avalid; /* ATTR_* */ + unsigned int sa_xvalid; /* OP_XVALID */ + int sa_stripe_index; + struct ost_layout sa_layout; + const struct lu_fid *sa_parent_fid; + } ci_setattr; + struct cl_data_version_io { + u64 dv_data_version; + u32 dv_layout_version; + int dv_flags; + } ci_data_version; + struct cl_fault_io { + /** page index within file. */ + pgoff_t ft_index; + /** bytes valid byte on a faulted page. */ + size_t ft_nob; + /** writable page? for nopage() only */ + int ft_writable; + /** page of an executable? */ + int ft_executable; + /** page_mkwrite() */ + int ft_mkwrite; + /** resulting page */ + struct cl_page *ft_page; + } ci_fault; + struct cl_fsync_io { + loff_t fi_start; + loff_t fi_end; + /** file system level fid */ + struct lu_fid *fi_fid; + enum cl_fsync_mode fi_mode; + /* how many pages were written/discarded */ + unsigned int fi_nr_written; + } ci_fsync; + struct cl_ladvise_io { + __u64 li_start; + __u64 li_end; + /** file system level fid */ + struct lu_fid *li_fid; + enum lu_ladvise_type li_advice; + __u64 li_flags; + } ci_ladvise; + } u; + struct cl_2queue ci_queue; + size_t ci_nob; + int ci_result; + unsigned int ci_continue:1, + /** + * This io has held grouplock, to inform sublayers that + * don't do lockless i/o. + */ + ci_no_srvlock:1, + /** + * The whole IO need to be restarted because layout has been changed + */ + ci_need_restart:1, + /** + * to not refresh layout - the IO issuer knows that the layout won't + * change(page operations, layout change causes all page to be + * discarded), or it doesn't matter if it changes(sync). + */ + ci_ignore_layout:1, + /** + * Need MDS intervention to complete a write. + * Write intent is required for the following cases: + * 1. component being written is not initialized, or + * 2. the mirrored files are NOT in WRITE_PENDING state. + */ + ci_need_write_intent:1, + /** + * Check if layout changed after the IO finishes. Mainly for HSM + * requirement. If IO occurs to openning files, it doesn't need to + * verify layout because HSM won't release openning files. + * Right now, only two opertaions need to verify layout: glimpse + * and setattr. + */ + ci_verify_layout:1, + /** + * file is released, restore has to to be triggered by vvp layer + */ + ci_restore_needed:1, + /** + * O_NOATIME + */ + ci_noatime:1, + /* Tell sublayers not to expand LDLM locks requested for this IO */ + ci_lock_no_expand:1, + /** + * Set if non-delay RPC should be used for this IO. + * + * If this file has multiple mirrors, and if the OSTs of the current + * mirror is inaccessible, non-delay RPC would error out quickly so + * that the upper layer can try to access the next mirror. + */ + ci_ndelay:1, + /** + * Set if we've tried all mirrors for this read IO, if it's not set, + * the read IO will check to-be-read OSCs' status, and make fast-switch + * another mirror if some of the OSTs are not healthy. + */ + ci_tried_all_mirrors:1; + /** + * Bypass quota check + */ + unsigned ci_noquota:1; + /** + * How many times the read has retried before this one. + * Set by the top level and consumed by the LOV. 
+ */ + unsigned ci_ndelay_tried; + /** + * Designated mirror index for this I/O. + */ + unsigned ci_designated_mirror; + /** + * Number of pages owned by this IO. For invariant checking. + */ + unsigned ci_owned_nr; + /** + * Range of write intent. Valid if ci_need_write_intent is set. + */ + struct lu_extent ci_write_intent; +}; + +/** @} cl_io */ + +/** + * Per-transfer attributes. + */ +struct cl_req_attr { + enum cl_req_type cra_type; + u64 cra_flags; + struct cl_page *cra_page; + /** Generic attributes for the server consumption. */ + struct obdo *cra_oa; + /** Jobid */ + char cra_jobid[LUSTRE_JOBID_SIZE]; +}; + +enum cache_stats_item { + /** how many cache lookups were performed */ + CS_lookup = 0, + /** how many times cache lookup resulted in a hit */ + CS_hit, + /** how many entities are in the cache right now */ + CS_total, + /** how many entities in the cache are actively used (and cannot be + * evicted) right now */ + CS_busy, + /** how many entities were created at all */ + CS_create, + CS_NR +}; + +#define CS_NAMES { "lookup", "hit", "total", "busy", "create" } + +/** + * Stats for a generic cache (similar to inode, lu_object, etc. caches). + */ +struct cache_stats { + const char *cs_name; + atomic_t cs_stats[CS_NR]; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); + +/** + * Client-side site. This represents particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + atomic_t cs_pages_state[CPS_NR]; +}; + +int cl_site_init(struct cl_site *s, struct cl_device *top); +void cl_site_fini(struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of0(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of0(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of0(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of0(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, pgoff_t index, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +int cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); +int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size); +int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, struct fiemap *fiemap, + size_t *buflen); +int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *cl); +loff_t cl_object_maxbytes(struct cl_object *obj); +int cl_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); + + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. + */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +static inline void cl_object_page_init(struct cl_object *clob, int size) +{ + clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; + cl_object_header(clob)->coh_page_bufsize += cfs_size_round(size); + WARN_ON(cl_object_header(clob)->coh_page_bufsize > 512); +} + +static inline void *cl_object_page_slice(struct cl_object *clob, + struct cl_page *page) +{ + return (void *)((char *)page + clob->co_slice_off); +} + +/** + * Return refcount of cl_object. 
+ */ +static inline int cl_object_refc(struct cl_object *clob) +{ + struct lu_object_header *header = clob->co_lu.lo_header; + return atomic_read(&header->loh_ref); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +enum { + CLP_GANG_OKAY = 0, + CLP_GANG_RESCHED, + CLP_GANG_AGAIN, + CLP_GANG_ABORT +}; +/* callback of cl_page_gang_lookup() */ + +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +struct cl_page *cl_page_alloc (const struct lu_env *env, + struct cl_object *o, pgoff_t ind, + struct page *vmpage, + enum cl_page_type type); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_pagevec_put (const struct lu_env *env, + struct cl_page *page, + struct pagevec *pvec); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_own_try (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. + */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_cancel (const struct lu_env *env, struct cl_page *page); +int cl_page_flush (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. 
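/*
 * Editor's illustration only: a sketch of the page ownership protocol
 * documented above.  A page is looked up (or created) with cl_page_find(),
 * owned by the IO for the duration of the access, then disowned and
 * released.  The page type is taken as a parameter because the
 * enum cl_page_type values are defined earlier in this header.
 */
static int example_with_page(const struct lu_env *env, struct cl_io *io,
			     struct cl_object *obj, pgoff_t idx,
			     struct page *vmpage, enum cl_page_type type)
{
	struct cl_page *pg;
	int rc;

	pg = cl_page_find(env, obj, idx, vmpage, type);
	if (IS_ERR(pg))
		return PTR_ERR(pg);

	rc = cl_page_own(env, io, pg);
	if (rc == 0) {
		/* ... access the page on behalf of @io ... */
		cl_page_disown(env, io, pg);
	}
	cl_page_put(env, pg);
	return rc;
}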
+ */ +/** @{ */ +void cl_page_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +void cl_page_delete(const struct lu_env *env, struct cl_page *pg); +int cl_page_is_vmlocked(const struct lu_env *env, + const struct cl_page *pg); +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to); +void cl_page_export(const struct lu_env *env, + struct cl_page *pg, int uptodate); +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); +pgoff_t cl_index(const struct cl_object *obj, loff_t offset); +size_t cl_page_size(const struct cl_object *obj); + +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock); +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr); +/* @} helper */ + +/** + * Data structure managing a client's cached pages. A count of + * "unstable" pages is maintained, and an LRU of clean pages is + * maintained. "unstable" pages are pages pinned by the ptlrpc + * layer for recovery purposes. + */ +struct cl_client_cache { + /** + * # of client cache refcount + * # of users (OSCs) + 2 (held by llite and lov) + */ + atomic_t ccc_users; + /** + * # of threads are doing shrinking + */ + unsigned int ccc_lru_shrinkers; + /** + * # of LRU entries available + */ + atomic_long_t ccc_lru_left; + /** + * List of entities(OSCs) for this LRU cache + */ + struct list_head ccc_lru; + /** + * Max # of LRU entries + */ + unsigned long ccc_lru_max; + /** + * Lock to protect ccc_lru list + */ + spinlock_t ccc_lru_lock; + /** + * Set if unstable check is enabled + */ + unsigned int ccc_unstable_check:1; + /** + * # of unstable pages for this mount point + */ + atomic_long_t ccc_unstable_nr; + /** + * Waitq for awaiting unstable pages to reach zero. 
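/*
 * Editor's illustration only: a sketch of the offset/index helpers declared
 * above.  cl_index() is assumed to round a byte offset down to the page
 * index and cl_offset() to return the byte offset at which that page
 * starts, with cl_page_size() giving the page size used for the conversion.
 */
static loff_t example_round_down_to_page(const struct cl_object *obj,
					 loff_t pos)
{
	pgoff_t idx = cl_index(obj, pos);

	/* start offset of the page containing @pos */
	return cl_offset(obj, idx);
}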
+ * Used at umounting time and signaled on BRW commit + */ + wait_queue_head_t ccc_unstable_waitq; +}; +/** + * cl_cache functions + */ +struct cl_client_cache *cl_cache_init(unsigned long lru_page_max); +void cl_cache_incref(struct cl_client_cache *cache); +void cl_cache_decref(struct cl_client_cache *cache); + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ +int cl_lock_request(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock); +int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, + const struct cl_io *io); +void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock); +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock); + +int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock, struct cl_sync_io *anchor); +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + +void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start (const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout); +int cl_io_commit_async (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb); +int cl_io_read_ahead (const struct lu_env *env, struct cl_io *io, + pgoff_t start, struct cl_read_ahead *ra); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); +int cl_io_cancel (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; +} + +static inline int cl_io_is_sync_write(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; +} + +static inline int cl_io_is_mkwrite(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; +} + +/** + * True, iff \a io is a truncate(2). 
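/*
 * Editor's illustration only: a sketch of the top-level lifecycle expected
 * for a read/write cl_io using the functions declared above.
 * cl_io_rw_init() prepares the IO for the byte range, cl_io_loop() drives
 * the iterate/lock/start/end cycle, and cl_io_fini() is called in every
 * case.  The cl_io storage is assumed to be provided by the caller (it is
 * too large for the stack), and CIT_READ is one of the enum cl_io_type
 * values defined earlier in this header.
 */
static int example_read_io(const struct lu_env *env, struct cl_io *io,
			   loff_t pos, size_t count)
{
	int rc;

	rc = cl_io_rw_init(env, io, CIT_READ, pos, count);
	if (rc == 0)
		rc = cl_io_loop(env, io);

	cl_io_fini(env, io);
	return rc;
}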
+ */ +static inline int cl_io_is_trunc(const struct cl_io *io) +{ + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_avalid & ATTR_SIZE); +} + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); + +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + memset(&__foo_io->base, 0, \ + sizeof(*__foo_io) - offsetof(typeof(*__foo_io), base)); \ +} while (0) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Last page in the page list. + */ +static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch); +} + +static inline struct cl_page *cl_page_list_first(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.next, struct cl_page, cp_batch); +} + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. + */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init (struct cl_page_list *plist); +void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice (struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del (const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init (struct cl_2queue *queue); +void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page); +void cl_2queue_disown (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_assume (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_discard (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_fini (const struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +/** + * Anchor for synchronous transfer. This is allocated on a stack by thread + * doing synchronous transfer, and a pointer to this structure is set up in + * every page submitted for transfer. Transfer completion routine updates + * anchor and wakes up waiting thread when transfer is complete. + */ +struct cl_sync_io { + /** number of pages yet to be transferred. */ + atomic_t csi_sync_nr; + /** error code. 
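/*
 * Editor's illustration only: a sketch of walking and draining a page list
 * with the helpers declared above.  The exact reference-counting behaviour
 * of cl_page_list_fini() is not restated here; this only shows the
 * intended calling order.
 */
static void example_drain_plist(const struct lu_env *env, struct cl_io *io,
				struct cl_page_list *plist)
{
	struct cl_page *page;

	/* read-only walk; use cl_page_list_for_each_safe() when removing */
	cl_page_list_for_each(page, plist) {
		/* ... inspect @page ... */
	}

	/* hand the pages back to the VM/IO and release the list */
	cl_page_list_disown(env, io, plist);
	cl_page_list_fini(env, plist);
}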
*/ + int csi_sync_rc; + /** barrier of destroy this structure */ + atomic_t csi_barrier; + /** completion to be signaled when transfer is complete. */ + wait_queue_head_t csi_waitq; + /** callback to invoke when this IO is finished */ + void (*csi_end_io)(const struct lu_env *, + struct cl_sync_io *); +}; + +void cl_sync_io_init(struct cl_sync_io *anchor, int nr, + void (*end)(const struct lu_env *, struct cl_sync_io *)); +int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout); +void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, + int ioret); +void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor); + +/** @} cl_sync_io */ + +/** \defgroup cl_env cl_env + * + * lu_env handling for a client. + * + * lu_env is an environment within which lustre code executes. Its major part + * is lu_context---a fast memory allocation mechanism that is used to conserve + * precious kernel stack space. Originally lu_env was designed for a server, + * where + * + * - there is a (mostly) fixed number of threads, and + * + * - call chains have no non-lustre portions inserted between lustre code. + * + * On a client both these assumtpion fails, because every user thread can + * potentially execute lustre code as part of a system call, and lustre calls + * into VFS or MM that call back into lustre. + * + * To deal with that, cl_env wrapper functions implement the following + * optimizations: + * + * - allocation and destruction of environment is amortized by caching no + * longer used environments instead of destroying them; + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct lu_env *cl_env_get(__u16 *refcheck); +struct lu_env *cl_env_alloc(__u16 *refcheck, __u32 tags); +void cl_env_put(struct lu_env *env, __u16 *refcheck); +unsigned cl_env_cache_purge(unsigned nr); +struct lu_env *cl_env_percpu_get(void); +void cl_env_percpu_put(struct lu_env *env); + +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +int cl_global_init(void); +void cl_global_fini(void); + +#endif /* _LINUX_CL_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/dt_object.h b/drivers/staging/lustrefsx/lustre/include/dt_object.h new file mode 100644 index 0000000000000..f16895ddafba6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/dt_object.h @@ -0,0 +1,2860 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
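/*
 * Editor's illustration only: a sketch of a synchronous submitter using the
 * cl_sync_io and cl_env interfaces declared above.  The anchor is
 * initialized for the number of pages about to be submitted, the transfer
 * completion path is expected to call cl_sync_io_note() once per page, and
 * the submitter blocks in cl_sync_io_wait().  cl_sync_io_end() is used as
 * the end callback because its signature matches; the timeout semantics
 * are left to the caller.
 */
static int example_sync_wait(int nr_pages, long timeout)
{
	struct cl_sync_io anchor;
	struct lu_env *env;
	__u16 refcheck;
	int rc;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	cl_sync_io_init(&anchor, nr_pages, cl_sync_io_end);
	/* ... submit @nr_pages pages, each carrying a pointer to &anchor ... */
	rc = cl_sync_io_wait(env, &anchor, timeout);

	cl_env_put(env, &refcheck);
	return rc;
}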
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_DT_OBJECT_H +#define __LUSTRE_DT_OBJECT_H + +/** \defgroup dt dt + * Sub-class of lu_object with methods common for "data" objects in OST stack. + * + * Data objects behave like regular files: you can read/write them, get and + * set their attributes. Implementation of dt interface is supposed to + * implement some form of garbage collection, normally reference counting + * (nlink) based one. + * + * Examples: osd (lustre/osd) is an implementation of dt interface. + * @{ + */ + +#include +/* + * super-class definitions. + */ +#include + +#include + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; + +struct thandle; +struct dt_device; +struct dt_object; +struct dt_index_features; +struct niobuf_local; +struct niobuf_remote; +struct ldlm_enqueue_info; + +typedef enum { + MNTOPT_USERXATTR = 0x00000001, + MNTOPT_ACL = 0x00000002, +} mntopt_t; + +struct dt_device_param { + unsigned ddp_max_name_len; + unsigned ddp_max_nlink; + unsigned ddp_symlink_max; + mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; + unsigned ddp_mount_type; + unsigned long long ddp_maxbytes; + /* per-inode space consumption */ + short ddp_inodespace; + /* maximum number of blocks in an extent */ + unsigned ddp_max_extent_blks; + /* per-extent insertion overhead to be used by client for grant + * calculation */ + unsigned int ddp_extent_tax; + unsigned int ddp_brw_size; /* optimal RPC size */ + /* T10PI checksum type, zero if not supported */ + enum cksum_types ddp_t10_cksum_type; +}; + +/** + * Per-transaction commit callback function + */ +struct dt_txn_commit_cb; +typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err); +/** + * Special per-transaction callback for cases when just commit callback + * is needed and per-device callback are not convenient to use + */ +#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a +#define MAX_COMMIT_CB_STR_LEN 32 + +#define DCB_TRANS_STOP 0x1 +struct dt_txn_commit_cb { + struct list_head dcb_linkage; + dt_cb_t dcb_func; + void *dcb_data; + __u32 dcb_magic; + __u32 dcb_flags; + char dcb_name[MAX_COMMIT_CB_STR_LEN]; +}; + +/** + * Operations on dt device. + */ +struct dt_device_operations { + /** + * Return device-wide statistics. + * + * Return device-wide stats including block size, total and + * free blocks, total and free objects, etc. See struct obd_statfs + * for the details. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] osfs stats information + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_statfs)(const struct lu_env *env, + struct dt_device *dev, + struct obd_statfs *osfs); + + /** + * Create transaction. + * + * Create in-memory structure representing the transaction for the + * caller. The structure returned will be used by the calling thread + * to specify the transaction the updates belong to. Once created + * successfully ->dt_trans_stop() must be called in any case (with + * ->dt_trans_start() and updates or not) so that the transaction + * handle and other resources can be released by the layers below. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval pointer to handle if creation succeeds + * \retval ERR_PTR(errno) if creation fails + */ + struct thandle *(*dt_trans_create)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Start transaction. + * + * Start the transaction. The transaction described by \a th can be + * started only once. Another start is considered as an error. + * A thread is not supposed to start a transaction while another + * transaction isn't closed by the thread (though multiple handles + * can be created). The caller should start the transaction once + * all possible updates are declared (see the ->do_declare_* methods + * below) and all the needed resources are reserved. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_start)(const struct lu_env *env, + struct dt_device *dev, + struct thandle *th); + + /** + * Stop transaction. + * + * Once stopped the transaction described by \a th is complete (all + * the needed updates are applied) and further processing such as + * flushing to disk, sending to another target, etc, is handled by + * lower layers. The caller can't access this transaction by the + * handle anymore (except from the commit callbacks, see below). + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_stop)(const struct lu_env *env, + struct dt_device *dev, + struct thandle *th); + + /** + * Add commit callback to the transaction. + * + * Add a commit callback to the given transaction handle. The callback + * will be called when the associated transaction is stored. I.e. the + * transaction will survive an event like power off if the callback did + * run. The number of callbacks isn't limited, but you should note that + * some disk filesystems do handle the commit callbacks in the thread + * handling commit/flush of all the transactions, meaning that new + * transactions are blocked from commit and flush until all the + * callbacks are done. Also, note multiple callbacks can be running + * concurrently using multiple CPU cores. The callbacks will be running + * in a special environment which can not be used to pass data around. + * + * \param[in] th transaction handle + * \param[in] dcb commit callback description + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_cb_add)(struct thandle *th, + struct dt_txn_commit_cb *dcb); + + /** + * Return FID of root index object. + * + * Return the FID of the root object in the filesystem. This object + * is usually provided as a bootstrap point by a disk filesystem. + * This is up to the implementation which FID to use, though + * [FID_SEQ_ROOT:1:0] is reserved for this purpose. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] fid FID of the root object + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_root_get)(const struct lu_env *env, + struct dt_device *dev, + struct lu_fid *f); + + /** + * Return device configuration data. + * + * Return device (disk fs, actually) specific configuration. + * The configuration isn't subject to change at runtime. 
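/*
 * Editor's illustration only: a sketch of the transaction calling
 * convention described above.  A transaction is created, all updates are
 * declared, the transaction is started, the updates are applied, and
 * ->dt_trans_stop() is called in every case.  The dt_device ops field name
 * ("dd_ops") is not visible in this hunk and is assumed; real callers
 * normally go through inline wrappers defined later in this header.
 */
static int example_transaction(const struct lu_env *env,
			       struct dt_device *dev,
			       struct dt_txn_commit_cb *dcb)
{
	struct thandle *th;
	int rc, rc2;

	th = dev->dd_ops->dt_trans_create(env, dev);
	if (IS_ERR(th))
		return PTR_ERR(th);

	/* ... ->do_declare_*() calls for every planned update go here ... */

	rc = dev->dd_ops->dt_trans_start(env, dev, th);
	if (rc == 0) {
		/* ... apply the declared updates ... */
		if (dcb)
			rc = dev->dd_ops->dt_trans_cb_add(th, dcb);
	}

	rc2 = dev->dd_ops->dt_trans_stop(env, dev, th);
	return rc ? rc : rc2;
}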
+ * See struct dt_device_param for the details. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] param configuration parameters + */ + void (*dt_conf_get)(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param); + + /** + * Return device's super block. + * + * \param[in] dev dt device + */ + struct super_block *(*dt_mnt_sb_get)(const struct dt_device *dev); + + /** + * Sync the device. + * + * Sync all the cached state (dirty buffers, pages, etc) to the + * persistent storage. The method returns control once the sync is + * complete. This operation may incur significant I/O to disk and + * should be reserved for cases where a global sync is strictly + * necessary. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_sync)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Make device read-only. + * + * Prevent new modifications to the device. This is a very specific + * state where all the changes are accepted successfully and the + * commit callbacks are called, but persistent state never changes. + * Used only in the tests to simulate power-off scenario. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_ro)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Start transaction commit asynchronously. + * + + * Provide a hint to the underlying filesystem that it should start + * committing soon. The control returns immediately. It's up to the + * layer implementing the method how soon to start committing. Usually + * this should be throttled to some extent, otherwise the number of + * aggregated transaction goes too high causing performance drop. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_commit_async)(const struct lu_env *env, + struct dt_device *dev); +}; + +struct dt_index_features { + /** required feature flags from enum dt_index_flags */ + __u32 dif_flags; + /** minimal required key size */ + size_t dif_keysize_min; + /** maximal required key size, 0 if no limit */ + size_t dif_keysize_max; + /** minimal required record size */ + size_t dif_recsize_min; + /** maximal required record size, 0 if no limit */ + size_t dif_recsize_max; + /** pointer size for record */ + size_t dif_ptrsize; +}; + +enum dt_index_flags { + /** index supports variable sized keys */ + DT_IND_VARKEY = 1 << 0, + /** index supports variable sized records */ + DT_IND_VARREC = 1 << 1, + /** index can be modified */ + DT_IND_UPDATE = 1 << 2, + /** index supports records with non-unique (duplicate) keys */ + DT_IND_NONUNQ = 1 << 3, + /** + * index support fixed-size keys sorted with natural numerical way + * and is able to return left-side value if no exact value found + */ + DT_IND_RANGE = 1 << 4, +}; + +/** + * Features, required from index to support file system directories (mapping + * names to fids). 
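/*
 * Editor's illustration only: how a caller of ->do_index_try() might
 * describe an updatable index with fixed 64-bit keys and fixed 32-byte
 * records using struct dt_index_features above.  The record size is an
 * arbitrary example value.
 */
static const struct dt_index_features example_index_features = {
	.dif_flags	 = DT_IND_UPDATE,
	.dif_keysize_min = sizeof(__u64),
	.dif_keysize_max = sizeof(__u64),
	.dif_recsize_min = 32,
	.dif_recsize_max = 32,
};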
+ */ +extern const struct dt_index_features dt_directory_features; +extern const struct dt_index_features dt_otable_features; +extern const struct dt_index_features dt_lfsck_layout_orphan_features; +extern const struct dt_index_features dt_lfsck_layout_dangling_features; +extern const struct dt_index_features dt_lfsck_namespace_features; + +/* index features supported by the accounting objects */ +extern const struct dt_index_features dt_acct_features; + +/* index features supported by the quota global indexes */ +extern const struct dt_index_features dt_quota_glb_features; + +/* index features supported by the quota slave indexes */ +extern const struct dt_index_features dt_quota_slv_features; + +/* index features supported by the nodemap index */ +extern const struct dt_index_features dt_nodemap_features; + +/** + * This is a general purpose dt allocation hint. + * It now contains the parent object. + * It can contain any allocation hint in the future. + */ +struct dt_allocation_hint { + struct dt_object *dah_parent; + const void *dah_eadata; + int dah_eadata_len; + __u32 dah_mode; + int dah_append_stripes; + bool dah_can_block; + char *dah_append_pool; +}; + +/** + * object type specifier. + */ + +enum dt_format_type { + DFT_REGULAR, + DFT_DIR, + /** for mknod */ + DFT_NODE, + /** for special index */ + DFT_INDEX, + /** for symbolic link */ + DFT_SYM, +}; + +/** + * object format specifier. + */ +struct dt_object_format { + /** type for dt object */ + enum dt_format_type dof_type; + union { + struct dof_regular { + int striped; + } dof_reg; + struct dof_dir { + } dof_dir; + struct dof_node { + } dof_node; + /** + * special index need feature as parameter to create + * special idx + */ + struct dof_index { + const struct dt_index_features *di_feat; + } dof_idx; + } u; +}; + +enum dt_format_type dt_mode_to_dft(__u32 mode); + +typedef __u64 dt_obj_version_t; + +union ldlm_policy_data; + +struct md_layout_change; + +/** + * A dt_object provides common operations to create and destroy + * objects and to manage regular and extended attributes. + */ +struct dt_object_operations { + /** + * Get read lock on object. + * + * Read lock is compatible with other read locks, so it's shared. + * Read lock is not compatible with write lock which is exclusive. + * The lock is blocking and can't be used from an interrupt context. + * + * \param[in] env execution environment for this thread + * \param[in] dt object to lock for reading + * \param[in] role a hint to debug locks (see kernel's mutexes) + */ + void (*do_read_lock)(const struct lu_env *env, + struct dt_object *dt, + unsigned role); + + /* + * Get write lock on object. + * + * Write lock is exclusive and cannot be shared. The lock is blocking + * and can't be used from an interrupt context. + * + * \param[in] env execution environment for this thread + * \param[in] dt object to lock for writing + * \param[in] role a hint to debug locks (see kernel's mutexes) + * + */ + void (*do_write_lock)(const struct lu_env *env, + struct dt_object *dt, + unsigned role); + + /** + * Release read lock. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + */ + void (*do_read_unlock)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Release write lock. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + */ + void (*do_write_unlock)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Check whether write lock is held. 
+ * + * The caller can learn whether write lock is held on the object + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * + * \retval 0 no write lock + * \retval 1 write lock is held + */ + int (*do_write_locked)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Declare intention to request reqular attributes. + * + * Notity the underlying filesystem that the caller may request regular + * attributes with ->do_attr_get() soon. This allows OSD to implement + * prefetching logic in an object-oriented manner. The implementation + * can be noop. This method should avoid expensive delays such as + * waiting on disk I/O, otherwise the goal of enabling a performance + * optimization would be defeated. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_attr_get)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Return regular attributes. + * + * The object must exist. Currently all the attributes should be + * returned, but in the future this can be improved so that only + * a selected set is returned. This can improve performance as in + * some cases attributes are stored in different places and + * getting them all can be an iterative and expensive process. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] attr attributes to fill + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_attr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr); + + /** + * Declare intention to change regular object's attributes. + * + * Notify the underlying filesystem that the regular attributes may + * change in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. Note that + * the la_valid field of \a attr specifies which attributes will change. + * The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes to change specified in attr.la_valid + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *th); + + /** + * Change regular attributes. + * + * Change regular attributes in the given transaction. Note only + * attributes flagged by attr.la_valid change. The object must + * exist. If the layer implementing this method is responsible for + * quota, then the method should maintain object accounting for the + * given credentials when la_uid/la_gid changes. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr new attributes to apply + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *th); + + /** + * Declare intention to request extented attribute. + * + * Notify the underlying filesystem that the caller may request extended + * attribute with ->do_xattr_get() soon. This allows OSD to implement + * prefetching logic in an object-oriented manner. The implementation + * can be noop. 
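/*
 * Editor's illustration only: reading the regular attributes under the
 * object read lock, following the locking and ->do_attr_get() methods
 * documented above.  The dt_object ops field name ("do_ops") is not
 * visible in this hunk and is assumed; real callers normally use the
 * inline wrappers defined later in this header.
 */
static int example_attr_get(const struct lu_env *env, struct dt_object *dt,
			    struct lu_attr *attr)
{
	int rc;

	dt->do_ops->do_read_lock(env, dt, 0 /* lock debugging role */);
	rc = dt->do_ops->do_attr_get(env, dt, attr);
	dt->do_ops->do_read_unlock(env, dt);
	return rc;
}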
This method should avoid expensive delays such as + * waiting on disk I/O, otherwise the goal of enabling a performance + * optimization would be defeated. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf unused, may be removed in the future + * \param[in] name name of the extended attribute + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name); + + /** + * Return a value of an extended attribute. + * + * The object must exist. If the buffer is NULL, then the method + * must return the size of the value. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer in which to store the value + * \param[in] name name of the extended attribute + * + * \retval 0 on success + * \retval -ERANGE if \a buf is too small + * \retval negative negated errno on error + * \retval positive value's size if \a buf is NULL or has zero size + */ + int (*do_xattr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name); + + /** + * Declare intention to change an extended attribute. + * + * Notify the underlying filesystem that the extended attribute may + * change in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. The object + * need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer storing new value of the attribute + * \param[in] name name of the attribute + * \param[in] fl LU_XATTR_CREATE - fail if EA exists + * LU_XATTR_REPLACE - fail if EA doesn't exist + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, + int fl, + struct thandle *th); + + /** + * Set an extended attribute. + * + * Change or replace the specified extended attribute (EA). + * The flags passed in \a fl dictate whether the EA is to be + * created or replaced, as follows. + * LU_XATTR_CREATE - fail if EA exists + * LU_XATTR_REPLACE - fail if EA doesn't exist + * The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer storing new value of the attribute + * \param[in] name name of the attribute + * \param[in] fl flags indicating EA creation or replacement + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, + int fl, + struct thandle *th); + + /** + * Declare intention to delete an extended attribute. + * + * Notify the underlying filesystem that the extended attribute may + * be deleted in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. 
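/*
 * Editor's illustration only: the declare/apply pairing for an extended
 * attribute update, following the pattern documented above.  The same
 * transaction handle is used for both phases; the transaction is assumed
 * to be created but not yet started when the declaration is made.  The
 * "do_ops" field name is assumed, and lu_buf is the buffer/length pair
 * defined elsewhere in the tree.
 */
static int example_set_xattr(const struct lu_env *env, struct dt_object *dt,
			     const struct lu_buf *buf, const char *name,
			     struct thandle *th)
{
	int rc;

	/* before the transaction starts: reserve resources for the update */
	rc = dt->do_ops->do_declare_xattr_set(env, dt, buf, name,
					      LU_XATTR_CREATE, th);
	if (rc)
		return rc;

	/* ... ->dt_trans_start() happens here in a real caller ... */

	/* after the transaction is started: apply the update */
	return dt->do_ops->do_xattr_set(env, dt, buf, name,
					LU_XATTR_CREATE, th);
}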
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] name name of the attribute + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th); + + /** + * Delete an extended attribute. + * + * This method deletes the specified extended attribute. The object + * must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] name name of the attribute + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th); + + /** + * Return a list of the extended attributes. + * + * Fills the passed buffer with a list of the extended attributes + * found in the object. The names are separated with '\0'. + * The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer to put the list in + * + * \retval positive bytes used/required in the buffer + * \retval negative negated errno on error + */ + int (*do_xattr_list)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf); + + /** + * Prepare allocation hint for a new object. + * + * This method is used by the caller to inform OSD of the parent-child + * relationship between two objects and enable efficient object + * allocation. Filled allocation hint will be passed to ->do_create() + * later. + * + * \param[in] env execution environment for this thread + * \param[out] ah allocation hint + * \param[in] parent parent object (can be NULL) + * \param[in] child child object + * \param[in] _mode type of the child object + */ + void (*do_ah_init)(const struct lu_env *env, + struct dt_allocation_hint *ah, + struct dt_object *parent, + struct dt_object *child, + umode_t mode); + + /** + * Declare intention to create a new object. + * + * Notify the underlying filesystem that the object may be created + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. + * + * If the layer implementing this method is responsible for quota, + * then the method should reserve an object for the given credentials + * and return an error if quota is over. If object creation later + * fails for some reason, then the reservation should be released + * properly (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes of the new object + * \param[in] hint allocation hint + * \param[in] dof object format + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + * Create new object. + * + * The method creates the object passed with the specified attributes + * and object format. Object allocation procedure can use information + * stored in the allocation hint. 
Different object formats are supported + * (see enum dt_format_type and struct dt_object_format) depending on + * the device. If creation succeeds, then LOHA_EXISTS flag must be set + * in the LU-object header attributes. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain object accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes of the new object + * \param[in] hint allocation hint + * \param[in] dof object format + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + * Declare intention to destroy an object. + * + * Notify the underlying filesystem that the object may be destroyed + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. The object + * need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Destroy an object. + * + * This method destroys the object and all the resources associated + * with the object (data, key/value pairs, extended attributes, etc). + * The object must exist. If destroy is successful, then flag + * LU_OBJECT_HEARD_BANSHEE should be set to forbid access to this + * instance of in-core object. Any subsequent access to the same FID + * should get another instance with no LOHA_EXIST flag set. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain object accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Try object as an index. + * + * Announce that this object is going to be used as an index. This + * operation checks that object supports indexing operations and + * installs appropriate dt_index_operations vector on success. + * Also probes for features. Operation is successful if all required + * features are supported. It's not possible to access the object + * with index methods before ->do_index_try() returns success. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] feat index features + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_index_try)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_index_features *feat); + + /** + * Declare intention to increment nlink count. + * + * Notify the underlying filesystem that the nlink regular attribute + * be changed in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. 
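/*
 * Editor's illustration only: creating a regular object with the
 * ->do_ah_init()/->do_declare_create()/->do_create() methods documented
 * above.  The declare step runs before the transaction is started and the
 * create step after it.  The "do_ops" field name is assumed, and the mode
 * is taken from the caller-prepared attributes.
 */
static int example_create(const struct lu_env *env, struct dt_object *child,
			  struct dt_object *parent, struct lu_attr *attr,
			  struct thandle *th)
{
	struct dt_allocation_hint ah = { 0 };
	struct dt_object_format dof = { .dof_type = DFT_REGULAR };
	int rc;

	child->do_ops->do_ah_init(env, &ah, parent, child, attr->la_mode);

	rc = child->do_ops->do_declare_create(env, child, attr, &ah, &dof, th);
	if (rc)
		return rc;

	/* ... ->dt_trans_start() happens here in a real caller ... */

	return child->do_ops->do_create(env, child, attr, &ah, &dof, th);
}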
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_ref_add)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Increment nlink. + * + * Increment nlink (from the regular attributes set) in the given + * transaction. Note the absolute limit for nlink should be learnt + * from struct dt_device_param::ddp_max_nlink. The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + + /** + * Declare intention to decrement nlink count. + * + * Notify the underlying filesystem that the nlink regular attribute + * be changed in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_ref_del)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Decrement nlink. + * + * Decrement nlink (from the regular attributes set) in the given + * transaction. The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_ref_del)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Sync obect. + * + * The method is called to sync specified range of the object to a + * persistent storage. The control is returned once the operation is + * complete. The difference from ->do_sync() is that the object can + * be in-sync with the persistent storage (nothing to flush), then + * the method returns quickly with no I/O overhead. So, this method + * should be preferred over ->do_sync() where possible. Also note that + * if the object isn't clean, then some disk filesystems will call + * ->do_sync() to maintain overall consistency, in which case it's + * still very expensive. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start start of the range to sync + * \param[in] end end of the range to sync + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_sync)(const struct lu_env *env, struct dt_object *obj, + __u64 start, __u64 end); + + /** + * Lock object. + * + * Lock object(s) using Distributed Lock Manager (LDLM). + * + * Get LDLM locks for the object. Currently used to lock "remote" + * objects in DNE configuration - a service running on MDTx needs + * to lock an object on MDTy. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] lh lock handle, sometimes used, sometimes not + * \param[in] einfo ldlm callbacks, locking type and mode + * \param[out] einfo private data to be passed to unlock later + * \param[in] policy inodebits data + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + /** + * Unlock object. + * + * Release LDLM lock(s) granted with ->do_object_lock(). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] einfo lock handles, from ->do_object_lock() + * \param[in] policy inodebits data + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_unlock)(const struct lu_env *env, + struct dt_object *dt, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + /** + * Invalidate attribute cache. + * + * This method invalidate attribute cache of the object, which is on OSP + * only. + * + * \param[in] env execution envionment for this thread + * \param[in] dt object + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_invalidate)(const struct lu_env *env, struct dt_object *dt); + + /** + * Declare intention to instaintiate extended layout component. + * + * \param[in] env execution environment + * \param[in] dt DT object + * \param[in] layout data structure to describe the changes to + * the DT object's layout + * \param[in] buf buffer containing client's lovea or empty + * + * \retval 0 success + * \retval -ne error code + */ + int (*do_declare_layout_change)(const struct lu_env *env, + struct dt_object *dt, + struct md_layout_change *mlc, + struct thandle *th); + + /** + * Client is trying to write to un-instantiated layout component. + * + * \param[in] env execution environment + * \param[in] dt DT object + * \param[in] layout data structure to describe the changes to + * the DT object's layout + * \param[in] buf buffer containing client's lovea or empty + * + * \retval 0 success + * \retval -ne error code + */ + int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt, + struct md_layout_change *mlc, + struct thandle *th); +}; + +enum dt_bufs_type { + DT_BUFS_TYPE_READ = 0x0000, + DT_BUFS_TYPE_WRITE = 0x0001, + DT_BUFS_TYPE_READAHEAD = 0x0002, + DT_BUFS_TYPE_LOCAL = 0x0004, +}; + +/** + * Per-dt-object operations on "file body" - unstructure raw data. + */ +struct dt_body_operations { + /** + * Read data. + * + * Read unstructured data from an existing regular object. + * Only data before attr.la_size is returned. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer (including size) to copy data in + * \param[in] pos position in the object to start + * \param[out] pos original value of \a pos + bytes returned + * + * \retval positive bytes read on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_read)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + loff_t *pos); + + /** + * Declare intention to write data to object. + * + * Notify the underlying filesystem that data may be written in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). 
This method should be called + * between creating the transaction and starting it. The object need + * not exist. If the layer implementing this method is responsible for + * quota, then the method should reserve space for the given credentials + * and return an error if quota is over. If the write later fails + * for some reason, then the reserve should be released properly + * (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer (including size) to copy data from + * \param[in] pos position in the object to start + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_declare_write)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t pos, + struct thandle *th); + + /** + * Write unstructured data to regular existing object. + * + * The method allocates space and puts data in. Also, the method should + * maintain attr.la_size properly. Partial writes are possible. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain space accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer (including size) to copy data from + * \param[in] pos position in the object to start + * \param[out] pos \a pos + bytes written + * \param[in] th transaction handle + * + * \retval positive bytes written on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_write)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t *pos, + struct thandle *th); + + /** + * Return buffers for data. + * + * This method is used to access data with no copying. It's so-called + * zero-copy I/O. The method returns the descriptors for the internal + * buffers where data are managed by the disk filesystem. For example, + * pagecache in case of ext4 or ARC with ZFS. Then other components + * (e.g. networking) can transfer data from or to the buffers with no + * additional copying. + * + * The method should fill an array of struct niobuf_local, where + * each element describes a full or partial page for data at specific + * offset. The caller should use page/lnb_page_offset/len to find data + * at object's offset lnb_file_offset. + * + * The memory referenced by the descriptors can't change its purpose + * until the complementary ->dbo_bufs_put() is called. The caller should + * specify if the buffers are used to read or modify data so that OSD + * can decide how to initialize the buffers: bring all the data for + * reads or just bring partial buffers for write. Note: the method does + * not check whether output array is large enough. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] pos position in the object to start + * \param[in] len size of region in bytes + * \param[out] lb array of descriptors to fill + * \param[in] maxlnb max slots in @lnb array + * \param[in] rw 0 if used to read, 1 if used for write + * + * \retval positive number of descriptors on success + * \retval negative negated errno on error + */ + int (*dbo_bufs_get)(const struct lu_env *env, + struct dt_object *dt, + loff_t pos, + ssize_t len, + struct niobuf_local *lb, + int maxlnb, + enum dt_bufs_type rw); + + /** + * Release reference granted by ->dbo_bufs_get(). 
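/*
 * Editor's illustration only: the buffered (copying) write path built from
 * ->dbo_declare_write() and ->dbo_write() above; the write is declared
 * before the transaction starts and applied afterwards.  The body ops
 * field name ("do_body_ops") is not visible in this hunk and is assumed.
 */
static ssize_t example_write(const struct lu_env *env, struct dt_object *dt,
			     const struct lu_buf *buf, loff_t pos,
			     struct thandle *th)
{
	const struct dt_body_operations *dbo = dt->do_body_ops;
	ssize_t rc;

	rc = dbo->dbo_declare_write(env, dt, buf, pos, th);
	if (rc)
		return rc;

	/* ... ->dt_trans_start() happens here in a real caller ... */

	return dbo->dbo_write(env, dt, buf, &pos, th);
}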
+ * + * Release the reference granted by the previous ->dbo_bufs_get(). + * Note the references are counted. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] lb array of descriptors to fill + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_bufs_put)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr); + + /** + * Prepare buffers for reading. + * + * The method is called on the given buffers to fill them with data + * if that wasn't done in ->dbo_bufs_get(). The idea is that the + * caller should be able to get few buffers for discontiguous regions + * using few calls to ->dbo_bufs_get() and then request them all for + * the preparation with a single call, so that OSD can fire many I/Os + * to run concurrently. It's up to the specific OSD whether to implement + * this logic in ->dbo_read_prep() or just use ->dbo_bufs_get() to + * prepare data for every requested region individually. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lnb array of buffer descriptors + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_read_prep)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lnb, + int nr); + + /** + * Prepare buffers for write. + * + * This method is called on the given buffers to ensure the partial + * buffers contain correct data. The underlying idea is the same as + * in ->db_read_prep(). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of buffer descriptors + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_write_prep)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr); + + /** + * Declare intention to write data stored in the buffers. + * + * Notify the underlying filesystem that data may be written in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called + * between creating the transaction and starting it. + * + * If the layer implementing this method is responsible for quota, + * then the method should be reserving a space for the given + * credentials and return an error if quota is exceeded. If the write + * later fails for some reason, then the reserve should be released + * properly (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of descriptors + * \param[in] nr size of the array + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_declare_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr, + struct thandle *th); + + /** + * Write to existing object. + * + * This method is used to write data to a persistent storage using + * the buffers returned by ->dbo_bufs_get(). The caller puts new + * data into the buffers using own mechanisms (e.g. direct transfer + * from a NIC). The method should maintain attr.la_size. Also, + * attr.la_blocks should be maintained but this can be done in lazy + * manner, when actual allocation happens. 
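/*
 * Editor's illustration only: a sketch of the zero-copy write path built
 * from the buffer methods above; descriptors are obtained, partial buffers
 * prepared, the commit declared and then (once the transaction is started
 * and the data copied in by the caller) committed, with the buffers
 * released in every case.  The ordering shown is an assumption based on
 * the documentation above, and "do_body_ops" is an assumed field name.
 */
static int example_zero_copy_write(const struct lu_env *env,
				   struct dt_object *dt, loff_t pos,
				   ssize_t len, struct niobuf_local *lnb,
				   int maxlnb, struct thandle *th)
{
	const struct dt_body_operations *dbo = dt->do_body_ops;
	int nr, rc;

	nr = dbo->dbo_bufs_get(env, dt, pos, len, lnb, maxlnb,
			       DT_BUFS_TYPE_WRITE);
	if (nr < 0)
		return nr;

	rc = dbo->dbo_write_prep(env, dt, lnb, nr);
	if (rc == 0)
		rc = dbo->dbo_declare_write_commit(env, dt, lnb, nr, th);

	/* ... caller fills the buffers and starts @th ... */

	if (rc == 0)
		rc = dbo->dbo_write_commit(env, dt, lnb, nr, th);

	dbo->dbo_bufs_put(env, dt, lnb, nr);
	return rc;
}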
+ * + * If the layer implementing this method is responsible for quota, + * then the method should maintain space accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of descriptors for the buffers + * \param[in] nr size of the array + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr, + struct thandle *th); + + /** + * Return logical to physical block mapping for a given extent + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] fm describe the region to map and the output buffer + * see the details in include/linux/fiemap.h + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_fiemap_get)(const struct lu_env *env, + struct dt_object *dt, + struct fiemap *fm); + + /** + * Declare intention to deallocate space from an object. + * + * Notify the underlying filesystem that space may be deallocated in + * this transactions. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called between + * creating the transaction and starting it. The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region to deallocate + * \param[in] end the end of the region to deallocate + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_declare_punch)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + struct thandle *th); + + /** + * Deallocate specified region in an object. + * + * This method is used to deallocate (release) space possibly consumed + * by the given region of the object. If the layer implementing this + * method is responsible for quota, then the method should maintain + * space accounting for the given credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region to deallocate + * \param[in] end the end of the region to deallocate + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_punch)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + struct thandle *th); + /** + * Give advices on specified region in an object. + * + * This method is used to give advices about access pattern on an + * given region of the object. The disk filesystem understands + * the advices and tunes cache/read-ahead policies. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region affected + * \param[in] end the end of the region affected + * \param[in] advice advice type + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_ladvise)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + enum lu_ladvise_type advice); +}; + +/** + * Incomplete type of index record. + */ +struct dt_rec; + +/** + * Incomplete type of index key. + */ +struct dt_key; + +/** + * Incomplete type of dt iterator. + */ +struct dt_it; + +/** + * Per-dt-object operations on object as index. 
Index is a set of key/value
+ * pairs abstracted from an on-disk representation. An index supports a
+ * number of operations, including lookup by key, insert and delete. Also,
+ * an index can be iterated to find the pairs one by one, from the beginning
+ * or from a specified point.
+ */
+struct dt_index_operations {
+	/**
+	 * Lookup in an index by key.
+	 *
+	 * The method returns a value for the given key. Key/value format
+	 * and size should have been negotiated with ->do_index_try() before.
+	 * Thus it's the caller's responsibility to provide the method with
+	 * a proper key and a big enough buffer. No external locking is
+	 * required, all the internal consistency should be implemented by
+	 * the method or lower layers. The object should have been created
+	 * with type DFT_INDEX or DFT_DIR.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[out] rec	buffer where value will be stored
+	 * \param[in] key	key
+	 *
+	 * \retval 0		on success
+	 * \retval -ENOENT	if key isn't found
+	 * \retval negative	negated errno on error
+	 */
+	int (*dio_lookup)(const struct lu_env *env,
+			  struct dt_object *dt,
+			  struct dt_rec *rec,
+			  const struct dt_key *key);
+
+	/**
+	 * Declare intention to insert a key/value into an index.
+	 *
+	 * Notify the underlying filesystem that a new key/value pair may be
+	 * inserted in this transaction. This enables the layer below to
+	 * prepare resources (e.g. journal credits in ext4). This method
+	 * should be called between creating the transaction and starting it.
+	 * Key/value format and size are subject to ->do_index_try().
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] rec	buffer storing value
+	 * \param[in] key	key
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dio_declare_insert)(const struct lu_env *env,
+				  struct dt_object *dt,
+				  const struct dt_rec *rec,
+				  const struct dt_key *key,
+				  struct thandle *th);
+
+	/**
+	 * Insert a new key/value pair into an index.
+	 *
+	 * The method inserts the specified key/value pair into the given
+	 * index object. The internal consistency is maintained by the
+	 * method or the functionality below. The format and size of the
+	 * key/value should have been negotiated before using
+	 * ->do_index_try(), no additional information can be specified to
+	 * the method. The keys are unique in a given index.
+	 *
+	 * \param[in] env	execution environment for this thread
+	 * \param[in] dt	object
+	 * \param[in] rec	buffer storing value
+	 * \param[in] key	key
+	 * \param[in] th	transaction handle
+	 *
+	 * \retval 0		on success
+	 * \retval negative	negated errno on error
+	 */
+	int (*dio_insert)(const struct lu_env *env,
+			  struct dt_object *dt,
+			  const struct dt_rec *rec,
+			  const struct dt_key *key,
+			  struct thandle *th);
+
+	/**
+	 * Declare intention to delete a key/value from an index.
+	 *
+	 * Notify the underlying filesystem that the key/value may be deleted
+	 * in this transaction. This enables the layer below to prepare
+	 * resources (e.g. journal credits in ext4). This method should be
+	 * called between creating the transaction and starting it. Key/value
+	 * format and size are subject to ->do_index_try(). The object need
+	 * not exist.
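+	 *
+	 * As an illustration only (the variables are placeholders and error
+	 * handling is omitted), a delete is expected to follow the usual
+	 * declare/start/execute pattern using the dt_* wrappers declared
+	 * later in this header:
+	 *
+	 *	th = dt_trans_create(env, dev);
+	 *	rc = dt_declare_delete(env, obj, key, th);
+	 *	rc = dt_trans_start(env, dev, th);
+	 *	rc = dt_delete(env, obj, key, th);
+	 *	rc = dt_trans_stop(env, dev, th);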
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_declare_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th); + + /** + * Delete key/value pair from an index. + * + * The method deletes specified key and corresponding value from the + * given index object. The internal consistency is maintained by the + * method or the functionality below. The format and size of the key + * should have been negotiated before using ->do_index_try(), no + * additional information can be specified to the method. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th); + + /** + * Iterator interface. + * + * Methods to iterate over an existing index, list the keys stored and + * associated values, get key/value size, etc. + */ + struct dt_it_ops { + /** + * Allocate and initialize new iterator. + * + * The iterator is a handler to be used in the subsequent + * methods to access index's content. Note the position is + * not defined at this point and should be initialized with + * ->get() or ->load() method. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr ask the iterator to return part of + the records, see LUDA_* for details + * + * \retval pointer iterator pointer on success + * \retval ERR_PTR(errno) on error + */ + struct dt_it *(*init)(const struct lu_env *env, + struct dt_object *dt, + __u32 attr); + + /** + * Release iterator. + * + * Release the specified iterator and all the resources + * associated (e.g. the object, index cache, etc). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator to release + */ + void (*fini)(const struct lu_env *env, + struct dt_it *di); + + /** + * Move position of iterator. + * + * Move the position of the specified iterator to the specified + * key. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] key key to position to + * + * \retval 0 if exact key is found + * \retval 1 if at the record with least key + * not larger than the key + * \retval negative negated errno on error + */ + int (*get)(const struct lu_env *env, + struct dt_it *di, + const struct dt_key *key); + + /** + * Release position + * + * Complimentary method for dt_it_ops::get() above. Some + * implementation can increase a reference on the iterator in + * dt_it_ops::get(). So the caller should be able to release + * with dt_it_ops::put(). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + */ + void (*put)(const struct lu_env *env, + struct dt_it *di); + + /** + * Move to next record. + * + * Moves the position of the iterator to a next record + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval 1 if no more records + * \retval 0 on success, the next record is found + * \retval negative negated errno on error + */ + int (*next)(const struct lu_env *env, + struct dt_it *di); + + /** + * Return key. 
+ * + * Returns a pointer to a buffer containing the key of the + * record at the current position. The pointer is valid and + * retains data until ->get(), ->load() and ->fini() methods + * are called. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval pointer to key on success + * \retval ERR_PTR(errno) on error + */ + struct dt_key *(*key)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Return key size. + * + * Returns size of the key at the current position. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval key's size on success + * \retval negative negated errno on error + */ + int (*key_size)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Return record. + * + * Stores the value of the record at the current position. The + * buffer must be big enough (as negotiated with + * ->do_index_try() or ->rec_size()). The caller can specify + * she is interested only in part of the record, using attr + * argument (see LUDA_* definitions for the details). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[out] rec buffer to store value in + * \param[in] attr specify part of the value to copy + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*rec)(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, + __u32 attr); + + /** + * Return record size. + * + * Returns size of the record at the current position. The + * \a attr can be used to specify only the parts of the record + * needed to be returned. (see LUDA_* definitions for the + * details). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] attr part of the record to return + * + * \retval record's size on success + * \retval negative negated errno on error + */ + int (*rec_size)(const struct lu_env *env, + const struct dt_it *di, + __u32 attr); + + /** + * Return a cookie (hash). + * + * Returns the cookie (usually hash) of the key at the current + * position. This allows the caller to resume iteration at this + * position later. The exact value is specific to implementation + * and should not be interpreted by the caller. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval cookie/hash of the key + */ + __u64 (*store)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Initialize position using cookie/hash. + * + * Initializes the current position of the iterator to one + * described by the cookie/hash as returned by ->store() + * previously. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] hash cookie/hash value + * + * \retval positive if current position points to + * record with least cookie not larger + * than cookie + * \retval 0 if current position matches cookie + * \retval negative negated errno on error + */ + int (*load)(const struct lu_env *env, + const struct dt_it *di, + __u64 hash); + + /** + * Not used + */ + int (*key_rec)(const struct lu_env *env, + const struct dt_it *di, + void *key_rec); + } dio_it; +}; + +enum dt_otable_it_valid { + DOIV_ERROR_HANDLE = 0x0001, + DOIV_DRYRUN = 0x0002, +}; + +enum dt_otable_it_flags { + /* Exit when fail. */ + DOIF_FAILOUT = 0x0001, + + /* Reset iteration position to the device beginning. */ + DOIF_RESET = 0x0002, + + /* There is up layer component uses the iteration. 
*/ + DOIF_OUTUSED = 0x0004, + + /* Check only without repairing. */ + DOIF_DRYRUN = 0x0008, +}; + +/* otable based iteration needs to use the common DT iteration APIs. + * To initialize the iteration, it needs call dio_it::init() firstly. + * Here is how the otable based iteration should prepare arguments to + * call dt_it_ops::init(). + * + * For otable based iteration, the 32-bits 'attr' for dt_it_ops::init() + * is composed of two parts: + * low 16-bits is for valid bits, high 16-bits is for flags bits. */ +#define DT_OTABLE_IT_FLAGS_SHIFT 16 +#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000 + +struct dt_device { + struct lu_device dd_lu_dev; + const struct dt_device_operations *dd_ops; + + /** + * List of dt_txn_callback (see below). This is not protected in any + * way, because callbacks are supposed to be added/deleted only during + * single-threaded start-up shut-down procedures. + */ + struct list_head dd_txn_callbacks; + unsigned int dd_record_fid_accessed:1, + dd_rdonly:1; + + /* sysfs and debugfs handling */ + struct dentry *dd_debugfs_entry; + + const struct attribute **dd_def_attrs; + struct kobject dd_kobj; + struct kobj_type dd_ktype; + struct completion dd_kobj_unregister; +}; + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t); +void dt_device_fini(struct dt_device *dev); + +static inline int lu_device_is_dt(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT); +} + +static inline struct dt_device * lu2dt_dev(struct lu_device *l) +{ + LASSERT(lu_device_is_dt(l)); + return container_of0(l, struct dt_device, dd_lu_dev); +} + +struct dt_object { + struct lu_object do_lu; + const struct dt_object_operations *do_ops; + const struct dt_body_operations *do_body_ops; + const struct dt_index_operations *do_index_ops; +}; + +/* + * In-core representation of per-device local object OID storage + */ +struct local_oid_storage { + /* all initialized llog systems on this node linked by this */ + struct list_head los_list; + + /* how many handle's reference this los has */ + atomic_t los_refcount; + struct dt_device *los_dev; + struct dt_object *los_obj; + + /* data used to generate new fids */ + struct mutex los_id_lock; + __u64 los_seq; + __u32 los_last_oid; +}; + +static inline struct lu_device *dt2lu_dev(struct dt_device *d) +{ + return &d->dd_lu_dev; +} + +static inline struct dt_object *lu2dt(struct lu_object *l) +{ + LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev)); + return container_of0(l, struct dt_object, do_lu); +} + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d); + +void dt_object_fini(struct dt_object *obj); + +static inline int dt_object_exists(const struct dt_object *dt) +{ + return lu_object_exists(&dt->do_lu); +} + +static inline int dt_object_remote(const struct dt_object *dt) +{ + return lu_object_remote(&dt->do_lu); +} + +static inline struct dt_object *lu2dt_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev))); + return container_of0(o, struct dt_object, do_lu); +} + +static inline struct dt_object *dt_object_child(struct dt_object *o) +{ + return container_of0(lu_object_next(&(o)->do_lu), + struct dt_object, do_lu); +} + +/** + * This is the general purpose transaction handle. + * 1. Transaction Life Cycle + * This transaction handle is allocated upon starting a new transaction, + * and deallocated after this transaction is committed. + * 2. Transaction Nesting + * We do _NOT_ support nested transaction. 
So, every thread should only
+ * have one active transaction, and a transaction only belongs to one
+ * thread. Due to this, the transaction handle needs no reference count.
+ * 3. Transaction & dt_object locking
+ * dt_object locks should be taken inside a transaction.
+ * 4. Transaction & RPC
+ * No RPC request should be issued inside a transaction.
+ */
+struct thandle {
+	/** the dt device on which the transactions are executed */
+	struct dt_device *th_dev;
+
+	/* point to the top thandle, XXX this is a bit hacky right now,
+	 * but normal device trans callback triggered by the bottom
+	 * device (OSP/OSD == sub thandle layer) needs to get the
+	 * top_thandle (see dt_txn_hook_start/stop()), so we put the
+	 * top thandle here for now, will fix it when we have better
+	 * callback mechanism */
+	struct thandle *th_top;
+
+	/** the last operation result in this transaction.
+	 * this value is used in recovery */
+	__s32 th_result;
+
+	/** whether we need sync commit */
+	unsigned int th_sync:1,
+	/* local transaction, no need to inform other layers */
+		     th_local:1,
+	/* whether we need to wait for the transaction to be submitted
+	 * (sent to the remote target) */
+		     th_wait_submit:1,
+	/* complex transaction which will track updates on all targets,
+	 * including OSTs */
+		     th_complex:1,
+	/* whether to ignore quota */
+		     th_ignore_quota:1;
+};
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by the osd (or the underlying transaction engine) when
+ * the transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and to
+ * perform some actions for each transaction state transition. A typical
+ * example is mdt registering a call-back to write into the last-received
+ * file before each transaction commit.
+ */
+struct dt_txn_callback {
+	int (*dtc_txn_start)(const struct lu_env *env,
+			     struct thandle *txn, void *cookie);
+	int (*dtc_txn_stop)(const struct lu_env *env,
+			    struct thandle *txn, void *cookie);
+	void			*dtc_cookie;
+	__u32			dtc_tag;
+	struct list_head	dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
+
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
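+ *
+ * A callback of this type is passed to dt_path_parser() below; presumably
+ * it is invoked once for each parsed name. As an illustration only
+ * (count_entry and its counter are hypothetical, not part of this header):
+ *
+ *	static int count_entry(const struct lu_env *env, const char *name,
+ *			       void *pvt)
+ *	{
+ *		(*(unsigned int *)pvt)++;
+ *		return 0;
+ *	}
+ *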
+ * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid); + +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *attr); + +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev, + const struct lu_object_conf *conf); + +static inline struct dt_object * +dt_locate(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + return dt_locate_at(env, dev, fid, + dev->dd_lu_dev.ld_site->ls_top_dev, NULL); +} + +static inline struct dt_object * +dt_object_locate(struct dt_object *dto, struct dt_device *dt_dev) +{ + struct lu_object *lo; + + list_for_each_entry(lo, &dto->do_lu.lo_header->loh_layers, lo_linkage) { + if (lo->lo_dev == &dt_dev->dd_lu_dev) + return container_of(lo, struct dt_object, do_lu); + } + return NULL; +} + +static inline void dt_object_put(const struct lu_env *env, + struct dt_object *dto) +{ + lu_object_put(env, &dto->do_lu); +} + +static inline void dt_object_put_nocache(const struct lu_env *env, + struct dt_object *dto) +{ + lu_object_put_nocache(env, &dto->do_lu); +} + +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los); +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los); +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid); +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th); +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *th); +struct dt_object *local_file_find(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name); +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode); +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode); +struct dt_object * +local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name); + +static inline int dt_object_lock(const struct lu_env *env, 
+ struct dt_object *o, struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(o != NULL); + LASSERT(o->do_ops != NULL); + LASSERT(o->do_ops->do_object_lock != NULL); + return o->do_ops->do_object_lock(env, o, lh, einfo, policy); +} + +static inline int dt_object_unlock(const struct lu_env *env, + struct dt_object *o, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(o != NULL); + LASSERT(o->do_ops != NULL); + LASSERT(o->do_ops->do_object_unlock != NULL); + return o->do_ops->do_object_unlock(env, o, einfo, policy); +} + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid); + +static inline int dt_object_sync(const struct lu_env *env, struct dt_object *o, + __u64 start, __u64 end) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_sync); + return o->do_ops->do_object_sync(env, o, start, end); +} + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th); +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th); +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o); + + +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th); +typedef int (*dt_index_page_build_t)(const struct lu_env *env, + union lu_page *lp, size_t nob, + const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg); +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg); +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); + +static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + LASSERT(d->dd_ops->dt_trans_create); + return d->dd_ops->dt_trans_create(env, d); +} + +static inline int dt_trans_start(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + return d->dd_ops->dt_trans_start(env, d, th); +} + +/* for this transaction hooks shouldn't be called */ +static inline int dt_trans_start_local(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + th->th_local = 1; + return d->dd_ops->dt_trans_start(env, d, th); +} + +static inline int dt_trans_stop(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_stop); + return d->dd_ops->dt_trans_stop(env, d, th); +} + +static inline int dt_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + LASSERT(th->th_dev->dd_ops->dt_trans_cb_add); + dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC; + return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb); +} +/** @} dt */ + + +static inline int dt_declare_record_write(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t pos, + struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_write); + rc = dt->do_body_ops->dbo_declare_write(env, dt, buf, pos, th); + 
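+	/* the declaration above only reserves resources (e.g. journal
+	 * credits); the record itself is written later with
+	 * dt_record_write() once the transaction has been started */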
return rc; +} + +static inline int dt_declare_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_create); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_CREATE)) + return cfs_fail_err; + + return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_create); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_CREATE)) + return cfs_fail_err; + + return dt->do_ops->do_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_declare_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_destroy); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_DESTROY)) + return cfs_fail_err; + + return dt->do_ops->do_declare_destroy(env, dt, th); +} + +static inline int dt_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_destroy); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DESTROY)) + return cfs_fail_err; + + return dt->do_ops->do_destroy(env, dt, th); +} + +static inline void dt_read_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_lock); + dt->do_ops->do_read_lock(env, dt, role); +} + +static inline void dt_write_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_lock); + dt->do_ops->do_write_lock(env, dt, role); +} + +static inline void dt_read_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_unlock); + dt->do_ops->do_read_unlock(env, dt); +} + +static inline void dt_write_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_unlock); + dt->do_ops->do_write_unlock(env, dt); +} + +static inline int dt_write_locked(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_locked); + return dt->do_ops->do_write_locked(env, dt); +} + +static inline int dt_declare_attr_get(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_ATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_attr_get(env, dt); +} + +static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *la) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_ATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_attr_get(env, dt, la); +} + +static inline int dt_declare_attr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *la, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_ATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_attr_set(env, dt, la, 
th); +} + +static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *la, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_ATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_attr_set(env, dt, la, th); +} + +static inline int dt_declare_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_add); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_REF_ADD)) + return cfs_fail_err; + + return dt->do_ops->do_declare_ref_add(env, dt, th); +} + +static inline int dt_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_add); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_REF_ADD)) + return cfs_fail_err; + + return dt->do_ops->do_ref_add(env, dt, th); +} + +static inline int dt_declare_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_REF_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_declare_ref_del(env, dt, th); +} + +static inline int dt_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_REF_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_ref_del(env, dt, th); +} + +static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, + struct niobuf_remote *rnb, + struct niobuf_local *lnb, int maxlnb, + enum dt_bufs_type rw) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_get); + return d->do_body_ops->dbo_bufs_get(env, d, rnb->rnb_offset, + rnb->rnb_len, lnb, maxlnb, rw); +} + +static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_put); + return d->do_body_ops->dbo_bufs_put(env, d, lnb, n); +} + +static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_prep); + return d->do_body_ops->dbo_write_prep(env, d, lnb, n); +} + +static inline int dt_declare_write_commit(const struct lu_env *env, + struct dt_object *d, + struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERTF(d != NULL, "dt is NULL when we want to declare write\n"); + LASSERT(th != NULL); + return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th); +} + + +static inline int dt_write_commit(const struct lu_env *env, + struct dt_object *d, struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_commit); + return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th); +} + +static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_read_prep); + return d->do_body_ops->dbo_read_prep(env, d, lnb, n); +} + +static inline int dt_declare_write(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, loff_t pos, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + 
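+	/* the OSD is expected to provide ->dbo_declare_write for objects
+	 * with body operations */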
LASSERT(dt->do_body_ops->dbo_declare_write); + return dt->do_body_ops->dbo_declare_write(env, dt, buf, pos, th); +} + +static inline ssize_t dt_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + return dt->do_body_ops->dbo_write(env, dt, buf, pos, th); +} + +static inline int dt_declare_punch(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_punch); + return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th); +} + +static inline int dt_punch(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_punch); + return dt->do_body_ops->dbo_punch(env, dt, start, end, th); +} + +static inline int dt_ladvise(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, int advice) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_ladvise); + return dt->do_body_ops->dbo_ladvise(env, dt, start, end, advice); +} + +static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d, + struct fiemap *fm) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_fiemap_get == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_fiemap_get(env, d, fm); +} + +static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev, + struct obd_statfs *osfs) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_statfs); + return dev->dd_ops->dt_statfs(env, dev, osfs); +} + +static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev, + struct lu_fid *f) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_root_get); + return dev->dd_ops->dt_root_get(env, dev, f); +} + +static inline void dt_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_conf_get); + return dev->dd_ops->dt_conf_get(env, dev, param); +} + +static inline struct super_block *dt_mnt_sb_get(const struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + if (dev->dd_ops->dt_mnt_sb_get) + return dev->dd_ops->dt_mnt_sb_get(dev); + + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_sync); + return dev->dd_ops->dt_sync(env, dev); +} + +static inline int dt_ro(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_ro); + return dev->dd_ops->dt_ro(env, dev); +} + +static inline int dt_declare_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_insert); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_INSERT)) + return cfs_fail_err; + + return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th); +} + +static inline int dt_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + 
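+	/* index key/value format and size are negotiated beforehand via
+	 * ->do_index_try(); see the dt_index_operations comments above */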
LASSERT(dt->do_index_ops->dio_insert); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_INSERT)) + return cfs_fail_err; + + return dt->do_index_ops->dio_insert(env, dt, rec, key, th); +} + +static inline int dt_declare_xattr_del(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_del(env, dt, name, th); +} + +static inline int dt_xattr_del(const struct lu_env *env, + struct dt_object *dt, const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_del(env, dt, name, th); +} + +static inline int dt_declare_xattr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_xattr_set(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_declare_xattr_get(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_get(env, dt, buf, name); +} + +static inline int dt_xattr_get(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + const char *name) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_get(env, dt, buf, name); +} + +static inline int dt_xattr_list(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_list); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_LIST)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_list(env, dt, buf); +} + +static inline int dt_invalidate(const struct lu_env *env, struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_invalidate); + + return dt->do_ops->do_invalidate(env, dt); +} + +static inline int dt_declare_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_delete); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_DELETE)) + return cfs_fail_err; + + return dt->do_index_ops->dio_declare_delete(env, dt, key, th); +} + +static inline int dt_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_delete); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DELETE)) + return cfs_fail_err; 
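+	/* CFS_FAULT_CHECK() above is a fault-injection hook used by tests;
+	 * the normal path dispatches to the OSD's dio_delete() method */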
+ + return dt->do_index_ops->dio_delete(env, dt, key, th); +} + +static inline int dt_commit_async(const struct lu_env *env, + struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_commit_async); + return dev->dd_ops->dt_commit_async(env, dev); +} + +static inline int dt_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key) +{ + int ret; + + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_lookup); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_LOOKUP)) + return cfs_fail_err; + + ret = dt->do_index_ops->dio_lookup(env, dt, rec, key); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -ENOENT; + return ret; +} + +static inline int dt_declare_layout_change(const struct lu_env *env, + struct dt_object *o, + struct md_layout_change *mlc, + struct thandle *th) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_declare_layout_change); + return o->do_ops->do_declare_layout_change(env, o, mlc, th); +} + +static inline int dt_layout_change(const struct lu_env *env, + struct dt_object *o, + struct md_layout_change *mlc, + struct thandle *th) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_layout_change); + return o->do_ops->do_layout_change(env, o, mlc, th); +} + +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + +struct dt_insert_rec { + union { + const struct lu_fid *rec_fid; + void *rec_data; + }; + union { + struct { + __u32 rec_type; + __u32 rec_padding; + }; + __u64 rec_misc; + }; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct dt_find_hint dti_dfh; + struct lu_attr dti_attr; + struct lu_fid dti_fid; + struct dt_object_format dti_dof; + struct lustre_mdt_attrs dti_lma; + struct lu_buf dti_lb; + struct lu_object_conf dti_conf; + loff_t dti_off; + struct dt_insert_rec dti_dt_rec; +}; + +extern struct lu_context_key dt_key; + +static inline struct dt_thread_info *dt_info(const struct lu_env *env) +{ + struct dt_thread_info *dti; + + dti = lu_context_key_get(&env->le_ctx, &dt_key); + LASSERT(dti); + return dti; +} + +int dt_global_init(void); +void dt_global_fini(void); +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list); +int dt_tunables_fini(struct dt_device *dt); + +# ifdef CONFIG_PROC_FS +int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytestotal_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytesfree_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytesavail_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_filestotal_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v); +# endif /* CONFIG_PROC_FS */ + +#endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/interval_tree.h b/drivers/staging/lustrefsx/lustre/include/interval_tree.h new file mode 100644 index 0000000000000..1598119aba5b5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/interval_tree.h @@ -0,0 +1,131 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/interval_tree.h + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#ifndef _INTERVAL_H__ +#define _INTERVAL_H__ + +#include +#include +#include + +struct interval_node { + struct interval_node *in_left; + struct interval_node *in_right; + struct interval_node *in_parent; + unsigned in_color:1, + in_intree:1, /** set if the node is in tree */ + in_res1:30; + __u8 in_res2[4]; /** tags, 8-bytes aligned */ + __u64 in_max_high; + struct interval_node_extent { + __u64 start; + __u64 end; + } in_extent; +}; + +enum interval_iter { + INTERVAL_ITER_CONT = 1, + INTERVAL_ITER_STOP = 2 +}; + +static inline int interval_is_intree(struct interval_node *node) +{ + return node->in_intree == 1; +} + +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + +static inline __u64 interval_high(struct interval_node *node) +{ + return node->in_extent.end; +} + +static inline int interval_set(struct interval_node *node, + __u64 start, __u64 end) +{ + if (start > end) + return -ERANGE; + node->in_extent.start = start; + node->in_extent.end = end; + node->in_max_high = end; + return 0; +} + +static inline void interval_init(struct interval_node *node) +{ + memset(node, 0, sizeof(*node)); +} + +int node_equal(struct interval_node *n1, struct interval_node *n2); + +/* Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. + */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root); +void interval_erase(struct interval_node *node, struct interval_node **root); + +/* Search the extents in the tree and call @func for each overlapped + * extents. */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + +/* Iterate every node in the tree - by reverse order or regular order. 
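+ *
+ * A minimal usage sketch (illustrative only; count_cb and its counter are
+ * not part of this header):
+ *
+ *	static enum interval_iter count_cb(struct interval_node *node,
+ *					   void *args)
+ *	{
+ *		(*(int *)args)++;
+ *		return INTERVAL_ITER_CONT;
+ *	}
+ *
+ *	int count = 0;
+ *	interval_iterate(root, count_cb, &count);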
*/ +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, void *data); +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func,void *data); + +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter); +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ex); +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex); +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/llog_swab.h b/drivers/staging/lustrefsx/lustre/include/llog_swab.h new file mode 100644 index 0000000000000..6fe62bce3bcb3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/llog_swab.h @@ -0,0 +1,68 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each type has corresponding 'lustre_swab_xxxtypexxx()' routines + * are implemented in ptlrpc/pack_generic.c. These 'swabbers' convert the + * type from "other" endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. 
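+ *
+ * As an illustration only (struct foo and its fields are hypothetical, not
+ * part of this header), a fixed-size swabber follows this shape, using the
+ * kernel's in-place byte-swap helpers (__swab32s() and friends):
+ *
+ *	void lustre_swab_foo(struct foo *f)
+ *	{
+ *		__swab32s(&f->foo_magic);
+ *		__swab64s(&f->foo_id);
+ *	}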
+ */ + +#ifndef _LLOG_SWAB_H_ +#define _LLOG_SWAB_H_ + +#include +struct lustre_cfg; + +void lustre_swab_lu_fid(struct lu_fid *fid); +void lustre_swab_ost_id(struct ost_id *oid); +void lustre_swab_ll_fid(struct ll_fid *fid); +void lustre_swab_llogd_body(struct llogd_body *d); +void lustre_swab_llog_hdr(struct llog_log_hdr *h); +void lustre_swab_llogd_conn_body(struct llogd_conn_body *d); +void lustre_swab_llog_rec(struct llog_rec_hdr *rec); +void lustre_swab_llog_id(struct llog_logid *lid); +void lustre_swab_lu_seq_range(struct lu_seq_range *range); +void lustre_swab_update_ops(struct update_ops *uops, unsigned int op_count); +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); +void lustre_swab_cfg_marker(struct cfg_marker *marker, + int swab, int size); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h new file mode 100644 index 0000000000000..6d032dfec8029 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -0,0 +1,1102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lprocfs_status.h + * + * Top level header file for LProc + * + * Author: Hariharan Thantry thantry@users.sourceforge.net + */ +#ifndef _LPROCFS_STATUS_H +#define _LPROCFS_STATUS_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Liuux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 + * Now that proc and debugfs use separate operation vector types + * separate containers are also needed. + */ +struct lprocfs_vars { + const char *name; + const struct proc_ops *fops; + void *data; + /** /proc file mode. */ + mode_t proc_mode; +}; + +/** Provide a debugfs container */ +struct ldebugfs_vars { + const char *name; + const struct file_operations *fops; + void *data; + /** debugfs file mode. */ + mode_t proc_mode; +}; + +static inline unsigned int pct(unsigned long a, unsigned long b) +{ + return b ? a * 100 / b : 0; +} + +#define PAGES_TO_MiB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MiB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(port, flag) \ + do { \ + if ((port)->port##_##flag) { \ + seq_printf(m, "%s" #flag, first ? 
"" : ", "); \ + first = false; \ + } \ + } while (0) + +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep); +void obd_connect_data_seqprint(struct seq_file *m, + struct obd_connect_data *ocd); + +/* if we find more consumers this could be generalized */ +#define OBD_HIST_MAX 32 +struct obd_histogram { + spinlock_t oh_lock; + unsigned long oh_buckets[OBD_HIST_MAX]; +}; + +enum { + BRW_R_PAGES = 0, + BRW_W_PAGES, + BRW_R_RPC_HIST, + BRW_W_RPC_HIST, + BRW_R_IO_TIME, + BRW_W_IO_TIME, + BRW_R_DISCONT_PAGES, + BRW_W_DISCONT_PAGES, + BRW_R_DISCONT_BLOCKS, + BRW_W_DISCONT_BLOCKS, + BRW_R_DISK_IOSIZE, + BRW_W_DISK_IOSIZE, + BRW_R_DIO_FRAGS, + BRW_W_DIO_FRAGS, + BRW_LAST, +}; + +struct brw_stats { + struct obd_histogram hist[BRW_LAST]; +}; + +enum { + RENAME_SAMEDIR_SIZE = 0, + RENAME_CROSSDIR_SRC_SIZE, + RENAME_CROSSDIR_TGT_SIZE, + RENAME_LAST, +}; + +struct rename_stats { + struct obd_histogram hist[RENAME_LAST]; +}; + +/* An lprocfs counter can be configured using the enum bit masks below. + * + * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already + * protects this counter from concurrent updates. If not specified, + * lprocfs an internal per-counter lock variable. External locks are + * not used to protect counter increments, but are used to protect + * counter readout and resets. + * + * LPROCFS_CNTR_AVGMINMAX indicates a multi-valued counter samples, + * (i.e. counter can be incremented by more than "1"). When specified, + * the counter maintains min, max and sum in addition to a simple + * invocation count. This allows averages to be be computed. + * If not specified, the counter is an increment-by-1 counter. + * min, max, sum, etc. are not maintained. + * + * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of + * squares (for multi-valued counter samples only). This allows + * external computation of standard deviation, but involves a 64-bit + * multiply per counter increment. + */ + +enum { + LPROCFS_CNTR_EXTERNALLOCK = 0x0001, + LPROCFS_CNTR_AVGMINMAX = 0x0002, + LPROCFS_CNTR_STDDEV = 0x0004, + + /* counter data type */ + LPROCFS_TYPE_REGS = 0x0100, + LPROCFS_TYPE_BYTES = 0x0200, + LPROCFS_TYPE_PAGES = 0x0400, + LPROCFS_TYPE_CYCLE = 0x0800, +}; + +#define LC_MIN_INIT ((~(__u64)0) >> 1) + +struct lprocfs_counter_header { + unsigned int lc_config; + const char *lc_name; /* must be static */ + const char *lc_units; /* must be static */ +}; + +struct lprocfs_counter { + __s64 lc_count; + __s64 lc_min; + __s64 lc_max; + __s64 lc_sumsquare; + /* + * Every counter has lc_array_sum[0], while lc_array_sum[1] is only + * for irq context counter, i.e. 
stats with + * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need + * lc_array_sum[1] + */ + __s64 lc_array_sum[1]; +}; +#define lc_sum lc_array_sum[0] +#define lc_sum_irq lc_array_sum[1] + +struct lprocfs_percpu { + struct lprocfs_counter lp_cntr[0]; +}; + +enum lprocfs_stats_lock_ops { + LPROCFS_GET_NUM_CPU = 0x0001, /* number allocated per-CPU stats */ + LPROCFS_GET_SMP_ID = 0x0002, /* current stat to be updated */ +}; + +enum lprocfs_stats_flags { + LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ + LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu + * area and need locking */ + LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ +}; + +enum lprocfs_fields_flags { + LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, + LPROCFS_FIELDS_FLAGS_SUM = 0x0002, + LPROCFS_FIELDS_FLAGS_MIN = 0x0003, + LPROCFS_FIELDS_FLAGS_MAX = 0x0004, + LPROCFS_FIELDS_FLAGS_AVG = 0x0005, + LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, + LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, +}; + +struct lprocfs_stats { + /* # of counters */ + unsigned short ls_num; + /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ + unsigned short ls_biggest_alloc_num; + enum lprocfs_stats_flags ls_flags; + /* Lock used when there are no percpu stats areas; For percpu stats, + * it is used to protect ls_biggest_alloc_num change */ + spinlock_t ls_lock; + + /* has ls_num of counter headers */ + struct lprocfs_counter_header *ls_cnt_header; + struct lprocfs_percpu *ls_percpu[0]; +}; + +#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) + +/* Pack all opcodes down into a single monotonically increasing index */ +static inline int opcode_offset(__u32 opc) { + if (opc < OST_LAST_OPC) { + /* OST opcode */ + return (opc - OST_FIRST_OPC); + } else if (opc < MDS_LAST_OPC) { + /* MDS opcode */ + return (opc - MDS_FIRST_OPC + + OPC_RANGE(OST)); + } else if (opc < LDLM_LAST_OPC) { + /* LDLM Opcode */ + return (opc - LDLM_FIRST_OPC + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < MGS_LAST_OPC) { + /* MGS Opcode */ + return (opc - MGS_FIRST_OPC + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OBD_LAST_OPC) { + /* OBD Ping */ + return (opc - OBD_FIRST_OPC + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LLOG_LAST_OPC) { + /* LLOG Opcode */ + return (opc - LLOG_FIRST_OPC + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEC_LAST_OPC) { + /* SEC opcode */ + return (opc - SEC_FIRST_OPC + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OUT_UPDATE_LAST_OPC) { + /* update opcode */ + return (opc - OUT_UPDATE_FIRST_OPC + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + 
OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LFSCK_LAST_OPC) { + /* LFSCK opcode */ + return (opc - LFSCK_FIRST_OPC + + OPC_RANGE(OUT_UPDATE) + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else { + /* Unknown Opcode */ + return -1; + } +} + + +#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST) + \ + OPC_RANGE(MDS) + \ + OPC_RANGE(LDLM) + \ + OPC_RANGE(MGS) + \ + OPC_RANGE(OBD) + \ + OPC_RANGE(LLOG) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(SEQ) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(FLD) + \ + OPC_RANGE(OUT_UPDATE) + \ + OPC_RANGE(LFSCK)) + +#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ + OPC_RANGE(EXTRA)) + +enum { + PTLRPC_REQWAIT_CNTR = 0, + PTLRPC_REQQDEPTH_CNTR, + PTLRPC_REQACTIVE_CNTR, + PTLRPC_TIMEOUT, + PTLRPC_REQBUF_AVAIL_CNTR, + PTLRPC_LAST_CNTR +}; + +#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR + +enum lprocfs_extra_opc { + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + MDS_REINT_RESYNC, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC +}; + +#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE +/* class_obd.c */ +extern struct proc_dir_entry *proc_lustre_root; +extern struct dentry *debugfs_lustre_root; +extern struct kset *lustre_kset; + +struct obd_device; +struct obd_histogram; + +#define JOBSTATS_JOBID_VAR_MAX_LEN 20 +#define JOBSTATS_DISABLE "disable" +#define JOBSTATS_PROCNAME_UID "procname_uid" +#define JOBSTATS_NODELOCAL "nodelocal" + +typedef void (*cntr_init_callback)(struct lprocfs_stats *stats); + +struct obd_job_stats { + struct cfs_hash *ojs_hash; /* hash of jobids */ + struct list_head ojs_list; /* list of job_stat structs */ + rwlock_t ojs_lock; /* protect ojs_list/js_list */ + unsigned int ojs_cleanup_interval;/* seconds before expiry */ + time64_t ojs_last_cleanup; /* previous cleanup time */ + cntr_init_callback ojs_cntr_init_fn;/* lprocfs_stats initializer */ + unsigned short ojs_cntr_num; /* number of stats in struct */ + bool ojs_cleaning; /* currently expiring stats */ +}; + +#ifdef CONFIG_PROC_FS + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, + unsigned int cpuid); +int lprocfs_stats_lock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags); +void lprocfs_stats_unlock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags); + +static inline unsigned int +lprocfs_stats_counter_size(struct lprocfs_stats *stats) +{ + unsigned int percpusize; + + percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]); + + /* irq safe stats need lc_array_sum[1] */ + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpusize += stats->ls_num * sizeof(__s64); + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0) + percpusize = L1_CACHE_ALIGN(percpusize); + + return percpusize; +} + +static inline struct lprocfs_counter * +lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid, + int index) +{ + struct lprocfs_counter *cntr; + + cntr = &stats->ls_percpu[cpuid]->lp_cntr[index]; + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + cntr = (void *)cntr 
+ index * sizeof(__s64); + + return cntr; +} + +/* Two optimized LPROCFS counter increment functions are provided: + * lprocfs_counter_incr(cntr, value) - optimized for by-one counters + * lprocfs_counter_add(cntr) - use for multi-valued counters + * Counter data layout allows config flag, counter lock and the + * count itself to reside within a single cache line. + */ + +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); + +#define lprocfs_counter_incr(stats, idx) \ + lprocfs_counter_add(stats, idx, 1) +#define lprocfs_counter_decr(stats, idx) \ + lprocfs_counter_sub(stats, idx, 1) + +extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field); +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field); + +extern struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); +extern void lprocfs_clear_stats(struct lprocfs_stats *stats); +extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); +extern int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_stats); +extern int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, + const char *units); +extern void lprocfs_free_obd_stats(struct obd_device *obddev); +extern void lprocfs_free_md_stats(struct obd_device *obddev); +struct obd_export; +struct nid_stat; +extern int lprocfs_add_clear_entry(struct obd_device * obd, + struct proc_dir_entry *entry); +#ifdef HAVE_SERVER_SUPPORT +extern int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *peer_nid); +extern int lprocfs_exp_cleanup(struct obd_export *exp); +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...); +#else +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +#endif +struct dentry *ldebugfs_add_simple(struct dentry *root, char *name, void *data, + const struct file_operations *fops); +extern struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct proc_ops *ops); +extern struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...); +extern void lprocfs_free_per_client_stats(struct obd_device *obd); +#ifdef HAVE_SERVER_SUPPORT +extern ssize_t +lprocfs_nid_stats_clear_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_nid_stats_clear_seq_show(struct seq_file *file, void *data); +#endif +extern int ldebugfs_register_stats(struct dentry *parent, const char *name, + struct lprocfs_stats *stats); +extern int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats); +extern const struct file_operations ldebugfs_stats_seq_fops; + +/* lprocfs_status.c */ +extern int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *var, + void *data); +extern int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, void *data); + +extern struct dentry *ldebugfs_register(const char *name, + struct dentry *parent, + struct ldebugfs_vars *list, + void 
*data); +extern struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data); + +extern void ldebugfs_remove(struct dentry **entryp); +extern void lprocfs_remove(struct proc_dir_entry **root); +extern void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent); +#ifndef HAVE_REMOVE_PROC_SUBTREE +extern int remove_proc_subtree(const char *name, + struct proc_dir_entry *parent); +#define PDE_DATA(inode) (PDE(inode)->data) + +static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) +{ + struct proc_dir_entry *dp = PDE(inode); + int deleted = 0; + + spin_lock(&(dp)->pde_unload_lock); + if (dp->proc_fops == NULL) + deleted = 1; + spin_unlock(&(dp)->pde_unload_lock); + if (deleted) + return -ENODEV; + return 0; +} +#else +static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) +{ return 0; } +#endif + +extern int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only); +extern int lprocfs_obd_cleanup(struct obd_device *obd); +#ifdef HAVE_SERVER_SUPPORT +extern const struct file_operations lprocfs_evict_client_fops; +#endif + +int ldebugfs_seq_create(struct dentry *parent, const char *name, umode_t mode, + const struct file_operations *seq_fops, void *data); +extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, + mode_t mode, const struct proc_ops *seq_fops, + void *data); +extern int lprocfs_obd_seq_create(struct obd_device *obd, const char *name, + mode_t mode, const struct proc_ops *seq_fops, + void *data); + +/* Generic callbacks */ +extern int lprocfs_uuid_seq_show(struct seq_file *m, void *data); +extern int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data); +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf); +extern int lprocfs_import_seq_show(struct seq_file *m, void *data); +extern int lprocfs_state_seq_show(struct seq_file *m, void *data); +extern int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data); +#ifdef HAVE_SERVER_SUPPORT +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf); +#endif +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at); +extern int lprocfs_timeouts_seq_show(struct seq_file *m, void *data); +extern ssize_t +lprocfs_timeouts_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#ifdef HAVE_SERVER_SUPPORT +extern ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#endif +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer); + +extern ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +static inline ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + return ldebugfs_import_seq_write(file, buffer, count, off); +} + +extern int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data); +extern ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +int lu_str_to_s64(char *buffer, unsigned long count, __s64 *val, char defunit); +extern int lprocfs_str_with_units_to_s64(const char __user *buffer, + unsigned long count, __s64 *val, + char defunit); + +char *lprocfs_strnstr(const char *s1, const 
char *s2, size_t len); +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count); +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_clear(struct obd_histogram *oh); +unsigned long lprocfs_oh_sum(struct obd_histogram *oh); + +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt); + +#ifdef HAVE_SERVER_SUPPORT +/* lprocfs_status.c: recovery status */ +int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data); + +/* lprocfs_status.c: hash statistics */ +int lprocfs_hash_seq_show(struct seq_file *m, void *data); + +/* lprocfs_status.c: IR factor */ +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +#endif + +/* lprocfs_status.c: dump pages on cksum error */ +int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +extern int lprocfs_single_release(struct inode *, struct file *); +extern int lprocfs_seq_release(struct inode *, struct file *); + +/* You must use these macros when you want to refer to + * the import in a client obd_device for a lprocfs entry */ +#define LPROCFS_CLIMP_CHECK(obd) do { \ + typecheck(struct obd_device *, obd); \ + down_read(&(obd)->u.cli.cl_sem); \ + if ((obd)->u.cli.cl_import == NULL) { \ + up_read(&(obd)->u.cli.cl_sem); \ + return -ENODEV; \ + } \ +} while(0) +#define LPROCFS_CLIMP_EXIT(obd) \ + up_read(&(obd)->u.cli.cl_sem); + +/* write the name##_seq_show function, call LDEBUGFS_SEQ_FOPS_RO for read-only + * debugfs entries; otherwise, you will define name##_seq_write function also + * for a read-write debugfs entry, and then call LDEBUGFS_SEQ_FOPS instead. 
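+ * As an illustration only (the read-only 'foo' entry below is hypothetical,
+ * not something defined in this header), the pattern is roughly:
+ *
+ *     static int foo_seq_show(struct seq_file *m, void *v)
+ *     {
+ *             seq_printf(m, "%u\n", 0);
+ *             return 0;
+ *     }
+ *     LDEBUGFS_SEQ_FOPS_RO(foo);
+ *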
+ * Finally, call ldebugfs_seq_create(obd, filename, 0444, &name#_fops, data); + */ +#define __LDEBUGFS_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_seq_show, inode->i_private); \ +} \ +static const struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +#define LDEBUGFS_SEQ_FOPS_RO(name) __LDEBUGFS_SEQ_FOPS(name, NULL) +#define LDEBUGFS_SEQ_FOPS(name) __LDEBUGFS_SEQ_FOPS(name, \ + name##_seq_write) + +#define LDEBUGFS_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + LDEBUGFS_SEQ_FOPS_RO(name##_##type) + +#define LDEBUGFS_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return ldebugfs_##type##_seq_write(file, buffer, count, \ + seq->private); \ + } \ + LDEBUGFS_SEQ_FOPS(name##_##type); + +#define LDEBUGFS_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return ldebugfs_##type##_seq_write(file, buffer, count, \ + off); \ + } \ + static int name##_##type##_open(struct inode *inode, \ + struct file *file) \ + { \ + return single_open(file, NULL, inode->i_private); \ + } \ + static const struct file_operations name##_##type##_fops = { \ + .open = name##_##type##_open, \ + .write = name##_##type##_write, \ + .release = single_release, \ + }; + +/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only + * proc entries; otherwise, you will define name##_seq_write function also for + * a read-write proc entry, and then call LPROC_SEQ_FOPS instead. Finally, + * call ldebugfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); + */ +#define __LPROC_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + int rc; \ + \ + rc = LPROCFS_ENTRY_CHECK(inode); \ + if (rc < 0) \ + return rc; \ + \ + return single_open(file, name##_seq_show, \ + inode->i_private ? 
inode->i_private : \ + PDE_DATA(inode)); \ +} \ +static const struct proc_ops name##_fops = { \ + PROC_OWNER(THIS_MODULE) \ + .proc_open = name##_single_open, \ + .proc_read = seq_read, \ + .proc_write = custom_seq_write, \ + .proc_lseek = seq_lseek, \ + .proc_release = lprocfs_single_release, \ +} + +#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) +#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) + +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + LPROC_SEQ_FOPS_RO(name##_##type) + +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return lprocfs_##type##_seq_write(file, buffer, \ + count, seq->private); \ + } \ + LPROC_SEQ_FOPS(name##_##type); + +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return lprocfs_##type##_seq_write(file, buffer, count, off);\ + } \ + static int name##_##type##_open(struct inode *inode, struct file *file)\ + { \ + return single_open(file, NULL, \ + inode->i_private ? inode->i_private : \ + PDE_DATA(inode)); \ + } \ + static const struct proc_ops name##_##type##_fops = { \ + .proc_open = name##_##type##_open, \ + .proc_write = name##_##type##_write, \ + .proc_release = lprocfs_single_release, \ + }; + +struct lustre_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); +}; + +#define LUSTRE_ATTR(name, mode, show, store) \ +static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) + +#define LUSTRE_WO_ATTR(name) LUSTRE_ATTR(name, 0200, NULL, name##_store) +#define LUSTRE_RO_ATTR(name) LUSTRE_ATTR(name, 0444, name##_show, NULL) +#define LUSTRE_RW_ATTR(name) LUSTRE_ATTR(name, 0644, name##_show, name##_store) + +ssize_t lustre_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + +extern const struct sysfs_ops lustre_sysfs_ops; + +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + +#ifdef HAVE_SERVER_SUPPORT +/* lprocfs_jobstats.c */ +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount); +void lprocfs_job_stats_fini(struct obd_device *obd); +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn); +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +/* lproc_status_server.c */ +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t 
recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf); +#endif +/* lproc_status.c */ +int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +int lprocfs_obd_short_io_bytes_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_obd_short_io_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); + +struct root_squash_info; +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name); +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name); + +#else /* !CONFIG_PROC_FS */ + +#define proc_lustre_root NULL + +static inline void lprocfs_counter_add(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, + int index, unsigned conf, + const char *name, const char *units) +{ return; } + +static inline __u64 lc_read_helper(struct lprocfs_counter *lc, + enum lprocfs_fields_flags field) +{ return 0; } + +/* NB: we return !NULL to satisfy error checker */ +static inline struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) +{ return (struct lprocfs_stats *)1; } +static inline void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_free_stats(struct lprocfs_stats **stats) +{ return; } +static inline int lprocfs_register_stats(struct proc_dir_entry *root, + const char *name, + struct lprocfs_stats *stats) +{ return 0; } +static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ return; } +static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_stats) +{ return 0; } +static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline void lprocfs_free_obd_stats(struct obd_device *obddev) +{ return; } +static inline void lprocfs_free_md_stats(struct obd_device *obddev) +{ return; } + +struct obd_export; +static inline int lprocfs_add_clear_entry(struct obd_export *exp) +{ return 0; } +static inline void lprocfs_free_per_client_stats(struct obd_device *obd) +{ return; } +#ifdef HAVE_SERVER_SUPPORT +static inline +ssize_t lprocfs_nid_stats_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{return 0;} +static inline +int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) +{return 0;} +static inline int lprocfs_exp_setup(struct obd_export *exp,lnet_nid_t *peer_nid) +{ return 0; } +#endif +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_add_simple(struct 
proc_dir_entry *root, char *name, + void *data, const struct file_operations *fops) +{return 0; } +static inline struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...) +{return NULL; } +static inline int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, void *data) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ return NULL; } +static inline void lprocfs_remove(struct proc_dir_entry **root) +{ return; } +static inline void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline int lprocfs_obd_setup(struct obd_device *dev, bool uuid_only) +{ return 0; } +static inline int lprocfs_obd_cleanup(struct obd_device *dev) +{ return 0; } +static inline int lprocfs_uuid_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_server_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_import_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_state_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) +{ return 0; } +#ifdef HAVE_SERVER_SUPPORT +static inline int lprocfs_num_exports_seq_show(struct seq_file *m, void *data) +{ return 0; } +#endif +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_timeouts_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline ssize_t +lprocfs_timeouts_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +#ifdef HAVE_SERVER_SUPPORT +static inline ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +#endif +static inline ssize_t +lprocfs_ping_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int +lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } + +/* Statfs helpers */ +static inline +int lprocfs_blksize_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_filestotal_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_filesfree_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_clear(struct obd_histogram *oh) +{ return; } 
+static inline +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ return 0; } +static inline +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ return; } +static inline +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ return (__u64)0; } + +#define LPROC_SEQ_FOPS_RO(name) +#define LPROC_SEQ_FOPS(name) +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) + +/* lprocfs_jobstats.c */ +static inline +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event, + long amount) +{ return 0; } +static inline +void lprocfs_job_stats_fini(struct obd_device *obd) +{ return; } +static inline +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn) +{ return 0; } + + +/* lproc_ptlrpc.c */ +#define target_print_req NULL + +#endif /* CONFIG_PROC_FS */ + +#endif /* LPROCFS_STATUS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_object.h b/drivers/staging/lustrefsx/lustre/include/lu_object.h new file mode 100644 index 0000000000000..7734bab329e89 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_object.h @@ -0,0 +1,1634 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_LU_OBJECT_H +#define __LUSTRE_LU_OBJECT_H + +#ifdef HAVE_LINUX_STDARG_HEADER +#include +#else +#include +#endif +#include +#include +#include +#include + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; +struct lprocfs_stats; + +/** \defgroup lu lu + * lu_* data-types represent server-side entities shared by data and meta-data + * stacks. + * + * Design goals: + * + * -# support for layering. + * + * Server side object is split into layers, one per device in the + * corresponding device stack. Individual layer is represented by struct + * lu_object. Compound layered object --- by struct lu_object_header. Most + * interface functions take lu_object as an argument and operate on the + * whole compound object. This decision was made due to the following + * reasons: + * + * - it's envisaged that lu_object will be used much more often than + * lu_object_header; + * + * - we want lower (non-top) layers to be able to initiate operations + * on the whole object. 
+ * + * Generic code supports layering more complex than simple stacking, e.g., + * it is possible that at some layer object "spawns" multiple sub-objects + * on the lower layer. + * + * -# fid-based identification. + * + * Compound object is uniquely identified by its fid. Objects are indexed + * by their fids (hash table is used for index). + * + * -# caching and life-cycle management. + * + * Object's life-time is controlled by reference counting. When reference + * count drops to 0, object is returned to cache. Cached objects still + * retain their identity (i.e., fid), and can be recovered from cache. + * + * Objects are kept in the global LRU list, and lu_site_purge() function + * can be used to reclaim given number of unused objects from the tail of + * the LRU. + * + * -# avoiding recursion. + * + * Generic code tries to replace recursion through layers by iterations + * where possible. Additionally to the end of reducing stack consumption, + * data, when practically possible, are allocated through lu_context_key + * interface rather than on stack. + * @{ + */ + +struct lu_site; +struct lu_object; +struct lu_device; +struct lu_object_header; +struct lu_context; +struct lu_env; + +/** + * Operations common for data and meta-data devices. + */ +struct lu_device_operations { + /** + * Allocate object for the given device (without lower-layer + * parts). This is called by lu_object_operations::loo_object_init() + * from the parent layer, and should setup at least lu_object::lo_dev + * and lu_object::lo_ops fields of resulting lu_object. + * + * Object creation protocol. + * + * Due to design goal of avoiding recursion, object creation (see + * lu_object_alloc()) is somewhat involved: + * + * - first, lu_device_operations::ldo_object_alloc() method of the + * top-level device in the stack is called. It should allocate top + * level object (including lu_object_header), but without any + * lower-layer sub-object(s). + * + * - then lu_object_alloc() sets fid in the header of newly created + * object. + * + * - then lu_object_operations::loo_object_init() is called. It has + * to allocate lower-layer object(s). To do this, + * lu_object_operations::loo_object_init() calls ldo_object_alloc() + * of the lower-layer device(s). + * + * - for all new objects allocated by + * lu_object_operations::loo_object_init() (and inserted into object + * stack), lu_object_operations::loo_object_init() is called again + * repeatedly, until no new objects are created. + * + * \post ergo(!IS_ERR(result), result->lo_dev == d && + * result->lo_ops != NULL); + */ + struct lu_object *(*ldo_object_alloc)(const struct lu_env *env, + const struct lu_object_header *h, + struct lu_device *d); + /** + * process config specific for device. + */ + int (*ldo_process_config)(const struct lu_env *env, + struct lu_device *, struct lustre_cfg *); + int (*ldo_recovery_complete)(const struct lu_env *, + struct lu_device *); + + /** + * initialize local objects for device. this method called after layer has + * been initialized (after LCFG_SETUP stage) and before it starts serving + * user requests. + */ + + int (*ldo_prepare)(const struct lu_env *, + struct lu_device *parent, + struct lu_device *dev); + +}; + +/** + * For lu_object_conf flags + */ +typedef enum { + /* This is a new object to be allocated, or the file + * corresponding to the object does not exists. */ + LOC_F_NEW = 0x00000001, +} loc_flags_t; + +/** + * Object configuration, describing particulars of object being created. 
On + * server this is not used, as server objects are full identified by fid. On + * client configuration contains struct lustre_md. + */ +struct lu_object_conf { + /** + * Some hints for obj find and alloc. + */ + loc_flags_t loc_flags; +}; + +/** + * Type of "printer" function used by lu_object_operations::loo_object_print() + * method. + * + * Printer function is needed to provide some flexibility in (semi-)debugging + * output: possible implementations: printk, CDEBUG, sysfs/seq_file + */ +typedef int (*lu_printer_t)(const struct lu_env *env, + void *cookie, const char *format, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Operations specific for particular lu_object. + */ +struct lu_object_operations { + + /** + * Allocate lower-layer parts of the object by calling + * lu_device_operations::ldo_object_alloc() of the corresponding + * underlying device. + * + * This method is called once for each object inserted into object + * stack. It's responsibility of this method to insert lower-layer + * object(s) it create into appropriate places of object stack. + */ + int (*loo_object_init)(const struct lu_env *env, + struct lu_object *o, + const struct lu_object_conf *conf); + /** + * Called (in top-to-bottom order) during object allocation after all + * layers were allocated and initialized. Can be used to perform + * initialization depending on lower layers. + */ + int (*loo_object_start)(const struct lu_env *env, + struct lu_object *o); + /** + * Called before lu_object_operations::loo_object_free() to signal + * that object is being destroyed. Dual to + * lu_object_operations::loo_object_init(). + */ + void (*loo_object_delete)(const struct lu_env *env, + struct lu_object *o); + /** + * Dual to lu_device_operations::ldo_object_alloc(). Called when + * object is removed from memory. + */ + void (*loo_object_free)(const struct lu_env *env, + struct lu_object *o); + /** + * Called when last active reference to the object is released (and + * object returns to the cache). This method is optional. + */ + void (*loo_object_release)(const struct lu_env *env, + struct lu_object *o); + /** + * Optional debugging helper. Print given object. + */ + int (*loo_object_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + /** + * Optional debugging method. Returns true iff method is internally + * consistent. + */ + int (*loo_object_invariant)(const struct lu_object *o); +}; + +/** + * Type of lu_device. + */ +struct lu_device_type; + +/** + * Device: a layer in the server side abstraction stacking. + */ +struct lu_device { + /** + * reference count. This is incremented, in particular, on each object + * created at this layer. + * + * \todo XXX which means that atomic_t is probably too small. + */ + atomic_t ld_ref; + /** + * Pointer to device type. Never modified once set. + */ + struct lu_device_type *ld_type; + /** + * Operation vector for this device. + */ + const struct lu_device_operations *ld_ops; + /** + * Stack this device belongs to. + */ + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; + + /** \todo XXX: temporary back pointer into obd. */ + struct obd_device *ld_obd; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref ld_reference; + /** + * Link the device to the site. + **/ + struct list_head ld_linkage; +}; + +struct lu_device_type_operations; + +/** + * Tag bits for device type. They are used to distinguish certain groups of + * device types. 
+ */ +enum lu_device_tag { + /** this is meta-data device */ + LU_DEVICE_MD = (1 << 0), + /** this is data device */ + LU_DEVICE_DT = (1 << 1), + /** data device in the client stack */ + LU_DEVICE_CL = (1 << 2) +}; + +/** + * Type of device. + */ +struct lu_device_type { + /** + * Tag bits. Taken from enum lu_device_tag. Never modified once set. + */ + __u32 ldt_tags; + /** + * Name of this class. Unique system-wide. Never modified once set. + */ + char *ldt_name; + /** + * Operations for this type. + */ + const struct lu_device_type_operations *ldt_ops; + /** + * \todo XXX: temporary pointer to associated obd_type. + */ + struct obd_type *ldt_obd_type; + /** + * \todo XXX: temporary: context tags used by obd_*() calls. + */ + __u32 ldt_ctx_tags; + /** + * Number of existing device type instances. + */ + atomic_t ldt_device_nr; +}; + +/** + * Operations on a device type. + */ +struct lu_device_type_operations { + /** + * Allocate new device. + */ + struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *lcfg); + /** + * Free device. Dual to + * lu_device_type_operations::ldto_device_alloc(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_free)(const struct lu_env *, + struct lu_device *); + + /** + * Initialize the devices after allocation + */ + int (*ldto_device_init)(const struct lu_env *env, + struct lu_device *, const char *, + struct lu_device *); + /** + * Finalize device. Dual to + * lu_device_type_operations::ldto_device_init(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_fini)(const struct lu_env *env, + struct lu_device *); + /** + * Initialize device type. This is called on module load. + */ + int (*ldto_init)(struct lu_device_type *t); + /** + * Finalize device type. Dual to + * lu_device_type_operations::ldto_init(). Called on module unload. + */ + void (*ldto_fini)(struct lu_device_type *t); + /** + * Called when the first device is created. + */ + void (*ldto_start)(struct lu_device_type *t); + /** + * Called when number of devices drops to 0. + */ + void (*ldto_stop)(struct lu_device_type *t); +}; + +static inline int lu_device_is_md(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD); +} + +/** + * Common object attributes. + */ +struct lu_attr { + /** + * valid bits + * + * \see enum la_valid + */ + __u64 la_valid; + /** size in bytes */ + __u64 la_size; + /** modification time in seconds since Epoch */ + s64 la_mtime; + /** access time in seconds since Epoch */ + s64 la_atime; + /** change time in seconds since Epoch */ + s64 la_ctime; + /** 512-byte blocks allocated to object */ + __u64 la_blocks; + /** permission bits and file type */ + __u32 la_mode; + /** owner id */ + __u32 la_uid; + /** group id */ + __u32 la_gid; + /** object flags */ + __u32 la_flags; + /** number of persistent references to this object */ + __u32 la_nlink; + /** blk bits of the object*/ + __u32 la_blkbits; + /** blk size of the object*/ + __u32 la_blksize; + /** real device */ + __u32 la_rdev; + /** project id */ + __u32 la_projid; + /** set layout version to OST objects. */ + __u32 la_layout_version; +}; + +/** + * Layer in the layered object. + */ +struct lu_object { + /** + * Header for this object. + */ + struct lu_object_header *lo_header; + /** + * Device for this layer. + */ + struct lu_device *lo_dev; + /** + * Operations for this object. 
+ */ + const struct lu_object_operations *lo_ops; + /** + * Linkage into list of all layers. + */ + struct list_head lo_linkage; + /** + * Link to the device, for debugging. + */ + struct lu_ref_link lo_dev_ref; +}; + +enum lu_object_header_flags { + /** + * Don't keep this object in cache. Object will be destroyed as soon + * as last reference to it is released. This flag cannot be cleared + * once set. + */ + LU_OBJECT_HEARD_BANSHEE = 0, + /** + * Mark this object has already been taken out of cache. + */ + LU_OBJECT_UNHASHED = 1, + /** + * Object is initialized, when object is found in cache, it may not be + * intialized yet, the object allocator will initialize it. + */ + LU_OBJECT_INITED = 2 +}; + +enum lu_object_header_attr { + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + LOHA_HAS_AGENT_ENTRY = 1 << 2, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ +}; + +/** + * "Compound" object, consisting of multiple layers. + * + * Compound object with given fid is unique with given lu_site. + * + * Note, that object does *not* necessary correspond to the real object in the + * persistent storage: object is an anchor for locking and method calling, so + * it is created for things like not-yet-existing child created by mkdir or + * create calls. lu_object_operations::loo_exists() can be used to check + * whether object is backed by persistent storage entity. + */ +struct lu_object_header { + /** + * Fid, uniquely identifying this object. + */ + struct lu_fid loh_fid; + /** + * Object flags from enum lu_object_header_flags. Set and checked + * atomically. + */ + unsigned long loh_flags; + /** + * Object reference count. Protected by lu_site::ls_guard. + */ + atomic_t loh_ref; + /** + * Common object attributes, cached for efficiency. From enum + * lu_object_header_attr. + */ + __u32 loh_attr; + /** + * Linkage into per-site hash table. Protected by lu_site::ls_guard. + */ + struct hlist_node loh_hash; + /** + * Linkage into per-site LRU list. Protected by lu_site::ls_guard. + */ + struct list_head loh_lru; + /** + * Linkage into list of layers. Never modified once set (except lately + * during object destruction). No locking is necessary. + */ + struct list_head loh_layers; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref loh_reference; +}; + +struct fld; + +enum { + LU_SS_CREATED = 0, + LU_SS_CACHE_HIT, + LU_SS_CACHE_MISS, + LU_SS_CACHE_RACE, + LU_SS_CACHE_DEATH_RACE, + LU_SS_LRU_PURGED, + LU_SS_LAST_STAT +}; + +/** + * lu_site is a "compartment" within which objects are unique, and LRU + * discipline is maintained. + * + * lu_site exists so that multiple layered stacks can co-exist in the same + * address space. + * + * lu_site has the same relation to lu_device as lu_object_header to + * lu_object. + */ +struct lu_site { + /** + * objects hash table + */ + struct cfs_hash *ls_obj_hash; + /** + * index of bucket on hash table while purging + */ + unsigned int ls_purge_start; + /** + * Top-level device for this stack. + */ + struct lu_device *ls_top_dev; + /** + * Bottom-level device for this stack + */ + struct lu_device *ls_bottom_dev; + /** + * Linkage into global list of sites. + */ + struct list_head ls_linkage; + /** + * List for lu device for this site, protected + * by ls_ld_lock. + **/ + struct list_head ls_ld_linkage; + spinlock_t ls_ld_lock; + /** + * Lock to serialize site purge. 
+ */ + struct mutex ls_purge_mutex; + /** + * lu_site stats + */ + struct lprocfs_stats *ls_stats; + /** + * XXX: a hack! fld has to find md_site via site, remove when possible + */ + struct seq_server_site *ld_seq_site; + /** + * Pointer to the lu_target for this site. + */ + struct lu_target *ls_tgt; + + /** + * Number of objects in lsb_lru_lists - used for shrinking + */ + struct percpu_counter ls_lru_len_counter; +}; + +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid); + +static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) +{ + return s->ld_seq_site; +} + +/** \name ctors + * Constructors/destructors. + * @{ + */ + +int lu_site_init (struct lu_site *s, struct lu_device *d); +void lu_site_fini (struct lu_site *s); +int lu_site_init_finish (struct lu_site *s); +void lu_stack_fini (const struct lu_env *env, struct lu_device *top); +void lu_device_get (struct lu_device *d); +void lu_device_put (struct lu_device *d); +int lu_device_init (struct lu_device *d, struct lu_device_type *t); +void lu_device_fini (struct lu_device *d); +int lu_object_header_init(struct lu_object_header *h); +void lu_object_header_fini(struct lu_object_header *h); +int lu_object_init (struct lu_object *o, + struct lu_object_header *h, struct lu_device *d); +void lu_object_fini (struct lu_object *o); +void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); +void lu_object_add (struct lu_object *before, struct lu_object *o); + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); + +/** + * Helpers to initialize and finalize device types. + */ + +int lu_device_type_init(struct lu_device_type *ldt); +void lu_device_type_fini(struct lu_device_type *ldt); + +/** @} ctors */ + +/** \name caching + * Caching and reference counting. + * @{ + */ + +/** + * Acquire additional reference to the given object. This function is used to + * attain additional reference. To acquire initial reference use + * lu_object_find(). + */ +static inline void lu_object_get(struct lu_object *o) +{ + LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); + atomic_inc(&o->lo_header->loh_ref); +} + +/** + * Return true if object will not be cached after last reference to it is + * released. + */ +static inline int lu_object_is_dying(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); +} + +/** + * Return true if object is initialized. 
+ */ +static inline int lu_object_is_inited(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_INITED, &h->loh_flags); +} + +void lu_object_put(const struct lu_env *env, struct lu_object *o); +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); +void lu_object_unhash(const struct lu_env *env, struct lu_object *o); +int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, int nr, + int canblock); + +static inline int lu_site_purge(const struct lu_env *env, struct lu_site *s, + int nr) +{ + return lu_site_purge_objects(env, s, nr, 1); +} + +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer); +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +/** @} caching */ + +/** \name helpers + * Helpers. + * @{ + */ + +/** + * First (topmost) sub-object of given compound object + */ +static inline struct lu_object *lu_object_top(struct lu_object_header *h) +{ + LASSERT(!list_empty(&h->loh_layers)); + return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); +} + +/** + * Next sub-object in the layering + */ +static inline struct lu_object *lu_object_next(const struct lu_object *o) +{ + return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); +} + +/** + * Pointer to the fid of this object. + */ +static inline const struct lu_fid *lu_object_fid(const struct lu_object *o) +{ + return &o->lo_header->loh_fid; +} + +/** + * return device operations vector for this object + */ +static const inline struct lu_device_operations * +lu_object_ops(const struct lu_object *o) +{ + return o->lo_dev->ld_ops; +} + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype); + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...); + +/** + * Print object description followed by a user-supplied message. + */ +#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + lu_object_print(env, &msgdata, lu_cdebug_printer, object);\ + CDEBUG(mask, format "\n", ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Print short object description followed by a user-supplied message. + */ +#define LU_OBJECT_HEADER(mask, env, object, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + lu_object_header_print(env, &msgdata, lu_cdebug_printer,\ + (object)->lo_header); \ + lu_cdebug_printer(env, &msgdata, "\n"); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +void lu_object_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o); +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr); + +/** + * Check object consistency. 
+ */ +int lu_object_invariant(const struct lu_object *o); + + +/** + * Check whether object exists, no matter on local or remote storage. + * Note: LOHA_EXISTS will be set once some one created the object, + * and it does not needs to be committed to storage. + */ +#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS) + +/** + * Check whether object on the remote storage. + */ +#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) + +/** + * Check whether the object as agent entry on current target + */ +#define lu_object_has_agent_entry(o) \ + unlikely((o)->lo_header->loh_attr & LOHA_HAS_AGENT_ENTRY) + +static inline void lu_object_set_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr |= LOHA_HAS_AGENT_ENTRY; +} + +static inline void lu_object_clear_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr &= ~LOHA_HAS_AGENT_ENTRY; +} + +static inline int lu_object_assert_exists(const struct lu_object *o) +{ + return lu_object_exists(o); +} + +static inline int lu_object_assert_not_exists(const struct lu_object *o) +{ + return !lu_object_exists(o); +} + +/** + * Attr of this object. + */ +static inline __u32 lu_object_attr(const struct lu_object *o) +{ + LASSERT(lu_object_exists(o) != 0); + + return o->lo_header->loh_attr & S_IFMT; +} + +static inline void lu_object_ref_add(struct lu_object *o, + const char *scope, + const void *source) +{ + lu_ref_add(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_add_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ + lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source); +} + +static inline void lu_object_ref_del(struct lu_object *o, + const char *scope, const void *source) +{ + lu_ref_del(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, const void *source) +{ + lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source); +} + +/** input params, should be filled out by mdt */ +struct lu_rdpg { + /** hash */ + __u64 rp_hash; + /** count in bytes */ + unsigned int rp_count; + /** number of pages */ + unsigned int rp_npages; + /** requested attr */ + __u32 rp_attrs; + /** pointers to pages */ + struct page **rp_pages; +}; + +enum lu_xattr_flags { + LU_XATTR_REPLACE = (1 << 0), + LU_XATTR_CREATE = (1 << 1), + LU_XATTR_MERGE = (1 << 2), + LU_XATTR_SPLIT = (1 << 3), +}; + +/** @} helpers */ + +/** \name lu_context + * @{ */ + +/** For lu_context health-checks */ +enum lu_context_state { + LCS_INITIALIZED = 1, + LCS_ENTERED, + LCS_LEFT, + LCS_FINALIZED +}; + +/** + * lu_context. Execution context for lu_object methods. Currently associated + * with thread. + * + * All lu_object methods, except device and device type methods (called during + * system initialization and shutdown) are executed "within" some + * lu_context. This means, that pointer to some "current" lu_context is passed + * as an argument to all methods. + * + * All service ptlrpc threads create lu_context as part of their + * initialization. It is possible to create "stand-alone" context for other + * execution environments (like system calls). + * + * lu_object methods mainly use lu_context through lu_context_key interface + * that allows each layer to associate arbitrary pieces of data with each + * context (see pthread_key_create(3) for similar interface). + * + * On a client, lu_context is bound to a thread, see cl_env_get(). 
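+ *
+ * As a rough usage sketch only (here 'dev' and 'fid' stand for an already
+ * set-up lu_device and lu_fid; neither is defined in this header), a
+ * stand-alone environment is typically handled as:
+ *
+ *     struct lu_env env;
+ *     struct lu_object *obj;
+ *     int rc;
+ *
+ *     rc = lu_env_init(&env, LCT_LOCAL);
+ *     if (rc != 0)
+ *             return rc;
+ *     obj = lu_object_find(&env, dev, &fid, NULL);
+ *     if (!IS_ERR(obj))
+ *             lu_object_put(&env, obj);
+ *     lu_env_fini(&env);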
+ * + * \see lu_context_key + */ +struct lu_context { + /** + * lu_context is used on the client side too. Yet we don't want to + * allocate values of server-side keys for the client contexts and + * vice versa. + * + * To achieve this, set of tags in introduced. Contexts and keys are + * marked with tags. Key value are created only for context whose set + * of tags has non-empty intersection with one for key. Tags are taken + * from enum lu_context_tag. + */ + __u32 lc_tags; + enum lu_context_state lc_state; + /** + * Pointer to the home service thread. NULL for other execution + * contexts. + */ + struct ptlrpc_thread *lc_thread; + /** + * Pointer to an array with key values. Internal implementation + * detail. + */ + void **lc_value; + /** + * Linkage into a list of all remembered contexts. Only + * `non-transient' contexts, i.e., ones created for service threads + * are placed here. + */ + struct list_head lc_remember; + /** + * Version counter used to skip calls to lu_context_refill() when no + * keys were registered. + */ + unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; +}; + +/** + * lu_context_key interface. Similar to pthread_key. + */ + +enum lu_context_tag { + /** + * Thread on md server + */ + LCT_MD_THREAD = 1 << 0, + /** + * Thread on dt server + */ + LCT_DT_THREAD = 1 << 1, + /** + * Thread on client + */ + LCT_CL_THREAD = 1 << 3, + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = 1 << 4, + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = 1 << 5, + /** + * MGS device thread + */ + LCT_MG_THREAD = 1 << 6, + /** + * Context for local operations + */ + LCT_LOCAL = 1 << 7, + /** + * session for server thread + **/ + LCT_SERVER_SESSION = 1 << 8, + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = 1 << 28, + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = 1 << 29, + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = 1 << 30, + /** + * Context should be remembered. + */ + LCT_REMEMBER = 1 << 31, + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF +}; + +/** + * Key. Represents per-context value slot. + * + * Keys are usually registered when module owning the key is initialized, and + * de-registered when module is unloaded. Once key is registered, all new + * contexts with matching tags, will get key value. "Old" contexts, already + * initialized at the time of key registration, can be forced to get key value + * by calling lu_context_refill(). + * + * Every key value is counted in lu_context_key::lct_used and acquires a + * reference on an owning module. This means, that all key values have to be + * destroyed before module can be unloaded. This is usually achieved by + * stopping threads started by the module, that created contexts in their + * entry functions. Situation is complicated by the threads shared by multiple + * modules, like ptlrpcd daemon on a client. To work around this problem, + * contexts, created in such threads, are `remembered' (see + * LCT_REMEMBER)---i.e., added into a global list. 
When module is preparing + * for unloading it does the following: + * + * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT) + * preventing new key values from being allocated in the new contexts, + * and + * + * - scans a list of remembered contexts, destroying values of module + * keys, thus releasing references to the module. + * + * This is done by lu_context_key_quiesce(). If module is re-activated + * before key has been de-registered, lu_context_key_revive() call clears + * `quiescent' marker. + * + * lu_context code doesn't provide any internal synchronization for these + * activities---it's assumed that startup (including threads start-up) and + * shutdown are serialized by some external means. + * + * \see lu_context + */ +struct lu_context_key { + /** + * Set of tags for which values of this key are to be instantiated. + */ + __u32 lct_tags; + /** + * Value constructor. This is called when new value is created for a + * context. Returns pointer to new value of error pointer. + */ + void *(*lct_init)(const struct lu_context *ctx, + struct lu_context_key *key); + /** + * Value destructor. Called when context with previously allocated + * value of this slot is destroyed. \a data is a value that was returned + * by a matching call to lu_context_key::lct_init(). + */ + void (*lct_fini)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Optional method called on lu_context_exit() for all allocated + * keys. Can be used by debugging code checking that locks are + * released, etc. + */ + void (*lct_exit)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Internal implementation detail: index within lu_context::lc_value[] + * reserved for this key. + */ + int lct_index; + /** + * Internal implementation detail: number of values created for this + * key. + */ + atomic_t lct_used; + /** + * Internal implementation detail: module for this key. + */ + struct module *lct_owner; + /** + * References to this key. For debugging. + */ + struct lu_ref lct_reference; +}; + +#define LU_KEY_INIT(mod, type) \ + static void *mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ + \ + CLASSERT(PAGE_SIZE >= sizeof(*value)); \ + \ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init { ; } /* semicolon catcher */ + +#define LU_KEY_FINI(mod, type) \ + static void mod##_key_fini(const struct lu_context *ctx, \ + struct lu_context_key *key, void* data) \ + { \ + type *info = data; \ + \ + OBD_FREE_PTR(info); \ + } \ + struct __##mod##__dummy_fini {;} /* semicolon catcher */ + +#define LU_KEY_INIT_FINI(mod, type) \ + LU_KEY_INIT(mod,type); \ + LU_KEY_FINI(mod,type) + +#define LU_CONTEXT_KEY_DEFINE(mod, tags) \ + struct lu_context_key mod##_thread_key = { \ + .lct_tags = tags, \ + .lct_init = mod##_key_init, \ + .lct_fini = mod##_key_fini \ + } + +#define LU_CONTEXT_KEY_INIT(key) \ +do { \ + (key)->lct_owner = THIS_MODULE; \ +} while (0) + +int lu_context_key_register(struct lu_context_key *key); +void lu_context_key_degister(struct lu_context_key *key); +void *lu_context_key_get (const struct lu_context *ctx, + const struct lu_context_key *key); +void lu_context_key_quiesce (struct lu_context_key *key); +void lu_context_key_revive (struct lu_context_key *key); + + +/* + * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an + * owning module. 
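+ *
+ * For illustration only (the 'foo' layer and its struct foo_thread_info
+ * are hypothetical), a layer normally combines the helpers above as:
+ *
+ *     LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ *     LU_CONTEXT_KEY_DEFINE(foo, LCT_MD_THREAD);
+ *
+ * then registers the resulting foo_thread_key (directly with
+ * lu_context_key_register(), or via the LU_TYPE_INIT_FINI() helpers defined
+ * below) and looks up its per-context value with
+ * lu_context_key_get(&env->le_ctx, &foo_thread_key).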
+ */ + +#define LU_KEY_INIT_GENERIC(mod) \ + static void mod##_key_init_generic(struct lu_context_key *k, ...) \ + { \ + struct lu_context_key *key = k; \ + va_list args; \ + \ + va_start(args, k); \ + do { \ + LU_CONTEXT_KEY_INIT(key); \ + key = va_arg(args, struct lu_context_key *); \ + } while (key != NULL); \ + va_end(args); \ + } + +#define LU_TYPE_INIT(mod, ...) \ + LU_KEY_INIT_GENERIC(mod) \ + static int mod##_type_init(struct lu_device_type *t) \ + { \ + mod##_key_init_generic(__VA_ARGS__, NULL); \ + return lu_context_key_register_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_init {;} + +#define LU_TYPE_FINI(mod, ...) \ + static void mod##_type_fini(struct lu_device_type *t) \ + { \ + lu_context_key_degister_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_fini {;} + +#define LU_TYPE_START(mod, ...) \ + static void mod##_type_start(struct lu_device_type *t) \ + { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_start {;} + +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop {;} + + + +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ + LU_TYPE_FINI(mod, __VA_ARGS__); \ + LU_TYPE_START(mod, __VA_ARGS__); \ + LU_TYPE_STOP(mod, __VA_ARGS__) + +int lu_context_init (struct lu_context *ctx, __u32 tags); +void lu_context_fini (struct lu_context *ctx); +void lu_context_enter (struct lu_context *ctx); +void lu_context_exit (struct lu_context *ctx); +int lu_context_refill(struct lu_context *ctx); + +/* + * Helper functions to operate on multiple keys. These are used by the default + * device type operations, defined by LU_TYPE_INIT_FINI(). + */ + +int lu_context_key_register_many(struct lu_context_key *k, ...); +void lu_context_key_degister_many(struct lu_context_key *k, ...); +void lu_context_key_revive_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many (struct lu_context_key *k, ...); + +/* + * update/clear ctx/ses tags. + */ +void lu_context_tags_update(__u32 tags); +void lu_context_tags_clear(__u32 tags); +void lu_session_tags_update(__u32 tags); +void lu_session_tags_clear(__u32 tags); + +/** + * Environment. + */ +struct lu_env { + /** + * "Local" context, used to store data instead of stack. + */ + struct lu_context le_ctx; + /** + * "Session" context for per-request data. + */ + struct lu_context *le_ses; +}; + +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); + +static inline void* lu_env_info(const struct lu_env *env, + const struct lu_context_key *key) +{ + void *info; + info = lu_context_key_get(&env->le_ctx, key); + if (!info) { + if (!lu_env_refill((struct lu_env *)env)) + info = lu_context_key_get(&env->le_ctx, key); + } + LASSERT(info); + return info; +} + +#ifdef HAVE_SERVER_SUPPORT +struct lu_env *lu_env_find(void); +int lu_env_add(struct lu_env *env); +void lu_env_remove(struct lu_env *env); +#else +static inline struct lu_env *lu_env_find(void) +{ + return NULL; +} +static inline int lu_env_add(struct lu_env *env) +{ + return 0; +} +static inline void lu_env_remove(struct lu_env *env) +{ +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** @} lu_context */ + +/** + * Output site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. 
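+ *
+ * A minimal sketch of such a wrapper (the 'foo' prefix is hypothetical, and
+ * the lu_site is assumed to have been passed as the seq_file private data):
+ *
+ *     static int foo_site_stats_seq_show(struct seq_file *m, void *v)
+ *     {
+ *             struct lu_site *site = m->private;
+ *
+ *             return lu_site_stats_seq_print(site, m);
+ *     }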
+ */ +int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m); + +/** + * Common name structure to be passed around for various name related methods. + */ +struct lu_name { + const char *ln_name; + int ln_namelen; +}; + +static inline bool name_is_dot_or_dotdot(const char *name, int namelen) +{ + return name[0] == '.' && + (namelen == 1 || (namelen == 2 && name[1] == '.')); +} + +static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname) +{ + return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen); +} + +static inline bool lu_name_is_valid_len(const char *name, size_t name_len) +{ + return name != NULL && + name_len > 0 && + name_len < INT_MAX && + strlen(name) == name_len && + memchr(name, '/', name_len) == NULL; +} + +/** + * Validate names (path components) + * + * To be valid \a name must be non-empty, '\0' terminated of length \a + * name_len, and not contain '/'. The maximum length of a name (before + * say -ENAMETOOLONG will be returned) is really controlled by llite + * and the server. We only check for something insane coming from bad + * integer handling here. + */ +static inline bool lu_name_is_valid_2(const char *name, size_t name_len) +{ + return lu_name_is_valid_len(name, name_len) && name[name_len] == '\0'; +} + +static inline bool lu_name_is_valid(const struct lu_name *ln) +{ + return lu_name_is_valid_2(ln->ln_name, ln->ln_namelen); +} + +#define DNAME "%.*s" +#define PNAME(ln) \ + (lu_name_is_valid(ln) ? (ln)->ln_namelen : 0), \ + (lu_name_is_valid(ln) ? (ln)->ln_name : "") + +/** + * Common buffer structure to be passed around for various xattr_{s,g}et() + * methods. + */ +struct lu_buf { + void *lb_buf; + size_t lb_len; +}; + +#define DLUBUF "(%p %zu)" +#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len + +/* read buffer params, should be filled out by out */ +struct lu_rdbuf { + /** number of buffers */ + unsigned int rb_nbufs; + /** pointers to buffers */ + struct lu_buf rb_bufs[]; +}; + +/** + * One-time initializers, called at obdclass module initialization, not + * exported. + */ + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void); + +/** + * Dual to lu_global_init(). 
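+ *
+ * A minimal sketch of the expected pairing (the callers are the obdclass
+ * module init/exit paths mentioned above; error handling elided):
+ *
+ * \code
+ * rc = lu_global_init();
+ * if (rc != 0)
+ *         return rc;
+ * ...
+ * lu_global_fini();
+ * \endcode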
+ */ +void lu_global_fini(void); + +struct lu_kmem_descr { + struct kmem_cache **ckd_cache; + const char *ckd_name; + const size_t ckd_size; +}; + +int lu_kmem_init(struct lu_kmem_descr *caches); +void lu_kmem_fini(struct lu_kmem_descr *caches); + +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid); +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf); + +/** null buffer */ +extern struct lu_buf LU_BUF_NULL; + +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, size_t size); +void lu_buf_realloc(struct lu_buf *buf, size_t size); + +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len); + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; + +static inline bool lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline bool lu_object_is_cl(const struct lu_object *o) +{ + return lu_device_is_cl(o->lo_dev); +} + +/* Generic subset of tgts */ +struct lu_tgt_pool { + __u32 *op_array; /* array of index of + * lov_obd->lov_tgts */ + unsigned int op_count; /* number of tgts in the array */ + unsigned int op_size; /* allocated size of op_array */ + struct rw_semaphore op_rw_sem; /* to protect lu_tgt_pool use */ +}; + +/* bitflags used in rr / qos allocation */ +enum lq_flag { + LQ_DIRTY = 0, /* recalc qos data */ + LQ_SAME_SPACE, /* the OSTs all have approx. + * the same space avail */ + LQ_RESET, /* zero current penalties */ +}; + +/* round-robin QoS data for LOD/LMV */ +struct lu_qos_rr { + spinlock_t lqr_alloc; /* protect allocation index */ + __u32 lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx;/* aliasing for start_idx */ + int lqr_start_count;/* reseed counter */ + struct lu_tgt_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_flags; +}; + +/* QoS data per MDS/OSS */ +struct lu_svr_qos { + struct obd_uuid lsq_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lsq_svr_list; /* link to lq_svr_list */ + __u64 lsq_bavail; /* total bytes avail on svr */ + __u64 lsq_iavail; /* total inode avail on svr */ + __u64 lsq_penalty; /* current penalty */ + __u64 lsq_penalty_per_obj; /* penalty decrease + * every obj*/ + time64_t lsq_used; /* last used time, seconds */ + __u32 lsq_tgt_count; /* number of tgts on this svr */ + __u32 lsq_id; /* unique svr id */ +}; + +/* QoS data per MDT/OST */ +struct lu_tgt_qos { + struct lu_svr_qos *ltq_svr; /* svr info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease + * every obj*/ + __u64 ltq_avail; /* bytes/inode avail */ + __u64 ltq_weight; /* net weighting */ + time64_t ltq_used; /* last used time, seconds */ + bool ltq_usable:1; /* usable for striping */ +}; + +/* target descriptor */ +struct lu_tgt_desc { + union { + struct dt_device *ltd_tgt; + struct obd_device *ltd_obd; + }; + struct obd_export *ltd_exp; + struct obd_uuid ltd_uuid; + __u32 ltd_index; + __u32 ltd_gen; + struct list_head ltd_kill; + struct ptlrpc_thread *ltd_recovery_thread; + struct mutex ltd_fid_mutex; + struct lu_tgt_qos ltd_qos; /* qos info per target */ + struct obd_statfs ltd_statfs; + time64_t ltd_statfs_age; + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1, /* should this target be deleted */ + ltd_got_update_log:1, /* Already 
got update log */ + ltd_connecting:1; /* target is connecting */ +}; + +/* number of pointers at 1st level */ +#define TGT_PTRS (PAGE_SIZE / sizeof(void *)) +/* number of pointers at 2nd level */ +#define TGT_PTRS_PER_BLOCK (PAGE_SIZE / sizeof(void *)) + +struct lu_tgt_desc_idx { + struct lu_tgt_desc *ldi_tgt[TGT_PTRS_PER_BLOCK]; +}; + +/* QoS data for LOD/LMV */ +struct lu_qos { + struct list_head lq_svr_list; /* lu_svr_qos list */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_svr_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ + struct lu_qos_rr lq_rr; /* round robin qos data */ + unsigned long lq_flags; +#if 0 + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the servers all have approx. + * the same space avail */ + lq_reset:1; /* zero current penalties */ +#endif +}; + +struct lu_tgt_descs { + union { + struct lov_desc ltd_lov_desc; + struct lmv_desc ltd_lmv_desc; + }; + /* list of known TGTs */ + struct lu_tgt_desc_idx *ltd_tgt_idx[TGT_PTRS]; + /* Size of the lu_tgts array, granted to be a power of 2 */ + __u32 ltd_tgts_size; + /* bitmap of TGTs available */ + struct cfs_bitmap *ltd_tgt_bitmap; + /* TGTs scheduled to be deleted */ + __u32 ltd_death_row; + /* Table refcount used for delayed deletion */ + int ltd_refcount; + /* mutex to serialize concurrent updates to the tgt table */ + struct mutex ltd_mutex; + /* read/write semaphore used for array relocation */ + struct rw_semaphore ltd_rw_sem; + /* QoS */ + struct lu_qos ltd_qos; + /* all tgts in a packed array */ + struct lu_tgt_pool ltd_tgt_pool; + /* true if tgt is MDT */ + bool ltd_is_mdt; +}; + +#define LTD_TGT(ltd, index) \ + (ltd)->ltd_tgt_idx[(index) / \ + TGT_PTRS_PER_BLOCK]->ldi_tgt[(index) % TGT_PTRS_PER_BLOCK] + +u64 lu_prandom_u64_max(u64 ep_ro); +void lu_qos_rr_init(struct lu_qos_rr *lqr); +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt); + +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt); +void lu_tgt_descs_fini(struct lu_tgt_descs *ltd); +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd); +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt); + +/** + * Whether MDT inode and space usages are balanced. + */ +static inline bool ltd_qos_is_balanced(struct lu_tgt_descs *ltd) +{ + return !test_bit(LQ_DIRTY, <d->ltd_qos.lq_flags) && + test_bit(LQ_SAME_SPACE, <d->ltd_qos.lq_flags); +} + +/** + * Whether QoS data is up-to-date and QoS can be applied. + */ +static inline bool ltd_qos_is_usable(struct lu_tgt_descs *ltd) +{ + if (ltd_qos_is_balanced(ltd)) + return false; + + if (ltd->ltd_lov_desc.ld_active_tgt_count < 2) + return false; + + return true; +} + +static inline struct lu_tgt_desc *ltd_first_tgt(struct lu_tgt_descs *ltd) +{ + int index; + + index = find_first_bit(ltd->ltd_tgt_bitmap->data, + ltd->ltd_tgt_bitmap->size); + return (index < ltd->ltd_tgt_bitmap->size) ? LTD_TGT(ltd, index) : NULL; +} + +static inline struct lu_tgt_desc *ltd_next_tgt(struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt) +{ + int index; + + if (!tgt) + return NULL; + + index = tgt->ltd_index; + LASSERT(index < ltd->ltd_tgt_bitmap->size); + index = find_next_bit(ltd->ltd_tgt_bitmap->data, + ltd->ltd_tgt_bitmap->size, index + 1); + return (index < ltd->ltd_tgt_bitmap->size) ? 
LTD_TGT(ltd, index) : NULL; +} + +#define ltd_foreach_tgt(ltd, tgt) \ + for (tgt = ltd_first_tgt(ltd); tgt; tgt = ltd_next_tgt(ltd, tgt)) + +#define ltd_foreach_tgt_safe(ltd, tgt, tmp) \ + for (tgt = ltd_first_tgt(ltd), tmp = ltd_next_tgt(ltd, tgt); tgt; \ + tgt = tmp, tmp = ltd_next_tgt(ltd, tgt)) + +/** @} lu */ +#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_ref.h b/drivers/staging/lustrefsx/lustre/include/lu_ref.h new file mode 100644 index 0000000000000..c7366c0481320 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_ref.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + * + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LUSTRE_LU_REF_H +#define __LUSTRE_LU_REF_H + +#include + +/** \defgroup lu_ref lu_ref + * + * An interface to track references between objects. Mostly for debugging. + * + * Suppose there is a reference counted data-structure struct foo. To track + * who acquired references to instance of struct foo, add lu_ref field to it: + * + * \code + * struct foo { + * atomic_t foo_refcount; + * struct lu_ref foo_reference; + * ... + * }; + * \endcode + * + * foo::foo_reference has to be initialized by calling + * lu_ref_init(). Typically there will be functions or macros to increment and + * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) + * and foo_put(struct foo *foo), respectively. + * + * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() + * has to be called to insert into foo::foo_reference a record, describing + * acquired reference. Dually, lu_ref_del() removes matching record. Typical + * usages are: + * + * \code + * struct bar *bar; + * + * // bar owns a reference to foo. + * bar->bar_foo = foo_get(foo); + * lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del(&foo->foo_reference, "bar", bar); + * foo_put(bar->bar_foo); + * + * + * // current thread acquired a temporary reference to foo. + * foo_get(foo); + * lu_ref_add(&foo->reference, __FUNCTION__, current); + * + * ... + * + * // temporary reference is released. + * lu_ref_del(&foo->reference, __FUNCTION__, current); + * foo_put(foo); + * \endcode + * + * \e Et \e cetera. Often it makes sense to include lu_ref_add() and + * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct + * foo is destroyed, lu_ref_fini() has to be called that checks that no + * pending references remain. lu_ref_print() can be used to dump a list of + * pending references, while hunting down a leak. 
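+ *
+ * A sketch of that destruction path (purely illustrative, reusing the
+ * struct foo example above):
+ *
+ * \code
+ * // foo is about to be freed; report any leaked references first.
+ * if (atomic_read(&foo->foo_refcount) != 0)
+ *         lu_ref_print(&foo->foo_reference);
+ * lu_ref_fini(&foo->foo_reference);
+ * \endcode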
+ * + * For objects to which a large number of references can be acquired, + * lu_ref_del() can become cpu consuming, as it has to scan the list of + * references. To work around this, remember result of lu_ref_add() (usually + * in the same place where pointer to struct foo is stored), and use + * lu_ref_del_at(): + * + * \code + * // There is a large number of bar's for a single foo. + * bar->bar_foo = foo_get(foo); + * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); + * foo_put(bar->bar_foo); + * \endcode + * + * lu_ref interface degrades gracefully in case of memory shortages. + * + * @{ + */ + +#ifdef USE_LU_REF + +/** + * Data-structure to keep track of references to a given object. This is used + * for debugging. + * + * lu_ref is embedded into an object which other entities (objects, threads, + * etc.) refer to. + */ +struct lu_ref { + /** + * Spin-lock protecting lu_ref::lf_list. + */ + spinlock_t lf_guard; + /** + * List of all outstanding references (each represented by struct + * lu_ref_link), pointing to this object. + */ + struct list_head lf_list; + /** + * # of links. + */ + short lf_refs; + /** + * Flag set when lu_ref_add() failed to allocate lu_ref_link. It is + * used to mask spurious failure of the following lu_ref_del(). + */ + short lf_failed; + /** + * flags - attribute for the lu_ref, for pad and future use. + */ + short lf_flags; + /** + * Where was I initialized? + */ + short lf_line; + const char *lf_func; + /** + * Linkage into a global list of all lu_ref's (lu_ref_refs). + */ + struct list_head lf_linkage; +}; + +struct lu_ref_link { + struct lu_ref *ll_ref; + struct list_head ll_linkage; + const char *ll_scope; + const void *ll_source; +}; + +void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line); +void lu_ref_fini (struct lu_ref *ref); +#define lu_ref_init(ref) lu_ref_init_loc(ref, __FUNCTION__, __LINE__) + +void lu_ref_add(struct lu_ref *ref, const char *scope, const void *source); + +void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, + const void *source); + +void lu_ref_add_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source); + +void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source); + +void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, const void *source1); + +void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source); + +void lu_ref_print(const struct lu_ref *ref); + +void lu_ref_print_all(void); + +int lu_ref_global_init(void); + +void lu_ref_global_fini(void); + +#else /* !USE_LU_REF */ + +struct lu_ref { +}; + +struct lu_ref_link { +}; + +static inline void lu_ref_init(struct lu_ref *ref) +{ +} + +static inline void lu_ref_fini(struct lu_ref *ref) +{ +} + +static inline void lu_ref_add(struct lu_ref *ref, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_add_atomic(struct lu_ref *ref, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_add_at(struct lu_ref *ref, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_del(struct lu_ref *ref, const char *scope, + const void *source) +{ +} + +static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, + 
const void *source1) +{ +} + +static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ +} + +static inline int lu_ref_global_init(void) +{ + return 0; +} + +static inline void lu_ref_global_fini(void) +{ +} + +static inline void lu_ref_print(const struct lu_ref *ref) +{ +} + +static inline void lu_ref_print_all(void) +{ +} +#endif /* USE_LU_REF */ + +/** @} lu */ + +#endif /* __LUSTRE_LU_REF_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_target.h b/drivers/staging/lustrefsx/lustre/include/lu_target.h new file mode 100644 index 0000000000000..0810fbea8b55e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_target.h @@ -0,0 +1,719 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_LU_TARGET_H +#define _LUSTRE_LU_TARGET_H + +#include +#include +#include +#include +#include + +/* Each one represents a distribute transaction replay + * operation, and updates on each MDTs are linked to + * dtr_sub_list */ +struct distribute_txn_replay_req { + /* update record, may be vmalloc'd */ + struct llog_update_record *dtrq_lur; + int dtrq_lur_size; + + /* linked to the distribute transaction replay + * list (tdtd_replay_list) */ + struct list_head dtrq_list; + __u64 dtrq_master_transno; + __u64 dtrq_batchid; + __u64 dtrq_xid; + + /* all of sub updates are linked here */ + struct list_head dtrq_sub_list; + spinlock_t dtrq_sub_list_lock; + + /* If the local update has been executed during replay */ + __u32 dtrq_local_update_executed:1; +}; + +/* Each one represents a sub replay item under a distribute + * transaction. 
A distribute transaction will be operated in + * two or more MDTs, and updates on each MDT will be represented + * by this structure */ +struct distribute_txn_replay_req_sub { + __u32 dtrqs_mdt_index; + + /* All of cookies for the update will be linked here */ + spinlock_t dtrqs_cookie_list_lock; + struct list_head dtrqs_cookie_list; + struct list_head dtrqs_list; +}; + +struct target_distribute_txn_data; +typedef int (*distribute_txn_replay_handler_t)(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq); +typedef char *(*target_show_update_logs_retrievers_t)(void *data, int *size, + int *count); +struct target_distribute_txn_data { + /* Distribution ID is used to identify updates log on different + * MDTs for one operation */ + spinlock_t tdtd_batchid_lock; + __u64 tdtd_batchid; + struct lu_target *tdtd_lut; + struct dt_object *tdtd_batchid_obj; + struct dt_device *tdtd_dt; + + /* Committed batchid for distribute transaction */ + __u64 tdtd_committed_batchid; + + /* List for distribute transaction */ + struct list_head tdtd_list; + + /* Threads to manage distribute transaction */ + wait_queue_head_t tdtd_commit_thread_waitq; + atomic_t tdtd_refcount; + + /* recovery update */ + distribute_txn_replay_handler_t tdtd_replay_handler; + struct list_head tdtd_replay_list; + struct list_head tdtd_replay_finish_list; + spinlock_t tdtd_replay_list_lock; + /* last replay update transno */ + __u32 tdtd_replay_ready:1; + + /* Manage the llog recovery threads */ + atomic_t tdtd_recovery_threads_count; + wait_queue_head_t tdtd_recovery_threads_waitq; + target_show_update_logs_retrievers_t + tdtd_show_update_logs_retrievers; + void *tdtd_show_retrievers_cbdata; +}; + +struct tg_grants_data { + /* grants: all values in bytes */ + /* grant lock to protect all grant counters */ + spinlock_t tgd_grant_lock; + /* total amount of dirty data reported by clients in incoming obdo */ + u64 tgd_tot_dirty; + /* sum of filesystem space granted to clients for async writes */ + u64 tgd_tot_granted; + /* grant used by I/Os in progress (between prepare and commit) */ + u64 tgd_tot_pending; + /* amount of available space in percentage that is never used for + * grants, used on MDT to always keep space for metadata. */ + u64 tgd_reserved_pcnt; + /* number of clients using grants */ + int tgd_tot_granted_clients; + /* shall we grant space to clients not + * supporting OBD_CONNECT_GRANT_PARAM? */ + unsigned int tgd_grant_compat_disable:1; + /* protect all statfs-related counters */ + spinlock_t tgd_osfs_lock; + time64_t tgd_osfs_age; + int tgd_blockbits; + /* counters used during statfs update, protected by ofd_osfs_lock. + * record when some statfs refresh are in progress */ + int tgd_statfs_inflight; + /* writes between prep & commit which might be accounted twice in + * ofd_osfs.os_bavail */ + u64 tgd_osfs_unstable; + /* track writes completed while statfs refresh is underway. 
+ * tracking is only effective when ofd_statfs_inflight > 1 */ + u64 tgd_osfs_inflight; + /* statfs optimization: we cache a bit */ + struct obd_statfs tgd_osfs; +}; + +struct lu_target { + struct obd_device *lut_obd; + struct dt_device *lut_bottom; + struct dt_device_param lut_dt_conf; + + struct target_distribute_txn_data *lut_tdtd; + struct ptlrpc_thread lut_tdtd_commit_thread; + + /* supported opcodes and handlers for this target */ + struct tgt_opc_slice *lut_slice; + __u32 lut_reply_fail_id; + __u32 lut_request_fail_id; + + /* sptlrpc rules */ + rwlock_t lut_sptlrpc_lock; + struct sptlrpc_rule_set lut_sptlrpc_rset; + spinlock_t lut_flags_lock; + unsigned int lut_syncjournal:1, + lut_sync_lock_cancel:2, + /* e.g. OST node */ + lut_no_reconstruct:1; + /** last_rcvd file */ + struct dt_object *lut_last_rcvd; + /* transaction callbacks */ + struct dt_txn_callback lut_txn_cb; + /** server data in last_rcvd file */ + struct lr_server_data lut_lsd; + /** Server last transaction number */ + __u64 lut_last_transno; + /** Lock protecting last transaction number */ + spinlock_t lut_translock; + /** Lock protecting client bitmap */ + spinlock_t lut_client_bitmap_lock; + /** Bitmap of known clients */ + unsigned long *lut_client_bitmap; + /* Number of clients supporting multiple modify RPCs + * recorded in the bitmap */ + atomic_t lut_num_clients; + /* Client generation to identify client slot reuse */ + atomic_t lut_client_generation; + /** reply_data file */ + struct dt_object *lut_reply_data; + /** Bitmap of used slots in the reply data file */ + unsigned long **lut_reply_bitmap; + /** target sync count, used for debug & test */ + atomic_t lut_sync_count; + + /** cross MDT locks which should trigger Sync-on-Lock-Cancel */ + spinlock_t lut_slc_locks_guard; + struct list_head lut_slc_locks; + + /* target grants fields */ + struct tg_grants_data lut_tgd; + + /* target tunables */ + const struct attribute **lut_attrs; + + /* FMD (file modification data) values */ + int lut_fmd_max_num; + time64_t lut_fmd_max_age; +}; + +#define LUT_FMD_MAX_NUM_DEFAULT 128 +#define LUT_FMD_MAX_AGE_DEFAULT (obd_timeout + 10) + +/* number of slots in reply bitmap */ +#define LUT_REPLY_SLOTS_PER_CHUNK (1<<20) +#define LUT_REPLY_SLOTS_MAX_CHUNKS 16 + +/** + * Target reply data + */ +struct tg_reply_data { + /** chain of reply data anchored in tg_export_data */ + struct list_head trd_list; + /** copy of on-disk reply data */ + struct lsd_reply_data trd_reply; + /** versions for Version Based Recovery */ + __u64 trd_pre_versions[4]; + /** slot index in reply_data file */ + int trd_index; + /** tag the client used */ + __u16 trd_tag; +}; + +extern struct lu_context_key tgt_session_key; + +struct tgt_session_info { + /* + * The following members will be filled explicitly + * with specific data in tgt_ses_init(). + */ + struct req_capsule *tsi_pill; + + /* + * Lock request for "habeo clavis" operations. + */ + struct ldlm_request *tsi_dlm_req; + + /* although we have export in req, there are cases when it is not + * available, e.g. closing files upon export destroy */ + struct obd_export *tsi_exp; + const struct lu_env *tsi_env; + struct lu_target *tsi_tgt; + + const struct mdt_body *tsi_mdt_body; + struct ost_body *tsi_ost_body; + struct lu_object *tsi_corpus; + + struct lu_fid tsi_fid; + struct ldlm_res_id tsi_resid; + + /* object affected by VBR, for last_rcvd_update */ + struct dt_object *tsi_vbr_obj; + /* opdata for mdt_reint_open(), has the same value as + * ldlm_reply:lock_policy_res1. 
The tgt_update_last_rcvd() stores + * this value onto disk for recovery when tgt_txn_stop_cb() is called. + */ + __u64 tsi_opdata; + + /* + * Additional fail id that can be set by handler. + */ + int tsi_reply_fail_id; + bool tsi_preprocessed; + /* request JobID */ + char *tsi_jobid; + + /* update replay */ + __u64 tsi_xid; + __u32 tsi_result; + __u32 tsi_client_gen; +}; + +static inline struct tgt_session_info *tgt_ses_info(const struct lu_env *env) +{ + struct tgt_session_info *tsi; + + LASSERT(env->le_ses != NULL); + tsi = lu_context_key_get(env->le_ses, &tgt_session_key); + LASSERT(tsi); + return tsi; +} + +static inline void tgt_vbr_obj_set(const struct lu_env *env, + struct dt_object *obj) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_vbr_obj = obj; + } +} + +static inline void tgt_opdata_set(const struct lu_env *env, __u64 flags) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_opdata |= flags; + } +} + +static inline void tgt_opdata_clear(const struct lu_env *env, __u64 flags) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_opdata &= ~flags; + } +} + +/* + * Generic unified target support. + */ +enum tgt_handler_flags { + /* + * struct *_body is passed in the incoming message, and object + * identified by this fid exists on disk. + * * + * "habeo corpus" == "I have a body" + */ + HABEO_CORPUS = (1 << 0), + /* + * struct ldlm_request is passed in the incoming message. + * + * "habeo clavis" == "I have a key" + * */ + HABEO_CLAVIS = (1 << 1), + /* + * this request has fixed reply format, so that reply message can be + * packed by generic code. + * + * "habeo refero" == "I have a reply" + */ + HABEO_REFERO = (1 << 2), + /* + * this request will modify something, so check whether the file system + * is readonly or not, then return -EROFS to client asap if necessary. + * + * "mutabor" == "I shall modify" + */ + MUTABOR = (1 << 3) +}; + +struct tgt_handler { + /* The name of this handler. */ + const char *th_name; + /* Fail id, check at the beginning */ + int th_fail_id; + /* Operation code */ + __u32 th_opc; + /* Flags in enum tgt_handler_flags */ + __u32 th_flags; + /* Request version for this opcode */ + enum lustre_msg_version th_version; + /* Handler function */ + int (*th_act)(struct tgt_session_info *tsi); + /* Handler function for high priority requests */ + void (*th_hp)(struct tgt_session_info *tsi); + /* Request format for this request */ + const struct req_format *th_fmt; +}; + +struct tgt_opc_slice { + __u32 tos_opc_start; /* First op code */ + __u32 tos_opc_end; /* Last op code */ + struct tgt_handler *tos_hs; /* Registered handler */ +}; + +static inline struct ptlrpc_request *tgt_ses_req(struct tgt_session_info *tsi) +{ + return tsi->tsi_pill ? 
tsi->tsi_pill->rc_req : NULL; +} + +static inline __u64 tgt_conn_flags(struct tgt_session_info *tsi) +{ + LASSERT(tsi->tsi_exp); + return exp_connect_flags(tsi->tsi_exp); +} + +static inline int req_is_replay(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + return !!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); +} + +static inline bool tgt_is_multimodrpcs_client(struct obd_export *exp) +{ + return exp_connect_flags(exp) & OBD_CONNECT_MULTIMODRPCS; +} + + +/* target/tgt_handler.c */ +int tgt_request_handle(struct ptlrpc_request *req); +char *tgt_name(struct lu_target *tgt); +void tgt_counter_incr(struct obd_export *exp, int opcode); +int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, + struct obd_export *exp); +int tgt_adapt_sptlrpc_conf(struct lu_target *tgt); +int tgt_connect(struct tgt_session_info *tsi); +int tgt_disconnect(struct tgt_session_info *uti); +int tgt_obd_ping(struct tgt_session_info *tsi); +int tgt_enqueue(struct tgt_session_info *tsi); +int tgt_convert(struct tgt_session_info *tsi); +int tgt_bl_callback(struct tgt_session_info *tsi); +int tgt_cp_callback(struct tgt_session_info *tsi); +int tgt_llog_open(struct tgt_session_info *tsi); +int tgt_llog_read_header(struct tgt_session_info *tsi); +int tgt_llog_next_block(struct tgt_session_info *tsi); +int tgt_llog_prev_block(struct tgt_session_info *tsi); +int tgt_sec_ctx_init(struct tgt_session_info *tsi); +int tgt_sec_ctx_init_cont(struct tgt_session_info *tsi); +int tgt_sec_ctx_fini(struct tgt_session_info *tsi); +int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob); +int tgt_send_buffer(struct tgt_session_info *tsi, struct lu_rdbuf *rdbuf); +int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa); +int tgt_sync(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 start, __u64 end); + +int tgt_io_thread_init(struct ptlrpc_thread *thread); +void tgt_io_thread_done(struct ptlrpc_thread *thread); + +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags); +void tgt_mdt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags); +void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_brw_read(struct tgt_session_info *tsi); +int tgt_brw_write(struct tgt_session_info *tsi); +int tgt_hpreq_handler(struct ptlrpc_request *req); +void tgt_register_lfsck_in_notify_local(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_req_local *, + struct thandle *)); +void tgt_register_lfsck_in_notify(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *)); +void tgt_register_lfsck_query(int (*query)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *, + struct lfsck_reply *, + struct lfsck_query *)); +bool req_can_reconstruct(struct ptlrpc_request *req, struct tg_reply_data *trd); + +extern struct tgt_handler tgt_sec_ctx_handlers[]; +extern struct tgt_handler tgt_lfsck_handlers[]; +extern struct tgt_handler tgt_obd_handlers[]; +extern struct tgt_handler tgt_dlm_handlers[]; +extern struct tgt_handler tgt_llog_handlers[]; +extern struct tgt_handler tgt_out_handlers[]; +extern struct tgt_handler fld_handlers[]; +extern struct tgt_handler seq_handlers[]; + +typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno, + 
void *data, int err); +struct tgt_commit_cb { + tgt_cb_t tgt_cb_func; + void *tgt_cb_data; +}; + +int tgt_hpreq_handler(struct ptlrpc_request *req); + +/* target/tgt_main.c */ +void tgt_boot_epoch_update(struct lu_target *lut); +void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock, + __u64 transno); +void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock); +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt, + struct tgt_opc_slice *slice, + int request_fail_id, int reply_fail_id); +void tgt_fini(const struct lu_env *env, struct lu_target *lut); +int tgt_client_alloc(struct obd_export *exp); +void tgt_client_free(struct obd_export *exp); +int tgt_client_del(const struct lu_env *env, struct obd_export *exp); +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int); +int tgt_client_new(const struct lu_env *env, struct obd_export *exp); +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, + int sync); +int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt); +bool tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd); +int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, + struct tg_export_data *ted, struct tg_reply_data *trd, + struct thandle *th, bool update_lrd_file); +struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, + __u64 xid); +int tgt_tunables_init(struct lu_target *lut); +void tgt_tunables_fini(struct lu_target *lut); + +/* target/tgt_grant.c */ +static inline int exp_grant_param_supp(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_GRANT_PARAM); +} + +/* Blocksize used for client not supporting OBD_CONNECT_GRANT_PARAM. + * That's 4KB=2^12 which is the biggest block size known to work whatever + * the client's page size is. 
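+ * As an illustration only: a 1MiB grant corresponds to
+ * (1048576 >> COMPAT_BSIZE_SHIFT) == 256 of these compatibility-sized
+ * blocks; the actual conversions are performed in target/tgt_grant.c.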
*/ +#define COMPAT_BSIZE_SHIFT 12 + +void tgt_grant_sanity_check(struct obd_device *obd, const char *func); +void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn); +void tgt_grant_discard(struct obd_export *exp); +void tgt_grant_prepare_read(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); +void tgt_grant_prepare_write(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct niobuf_remote *rnb, + int niocount); +void tgt_grant_commit(struct obd_export *exp, unsigned long grant_used, int rc); +int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, + unsigned long grant); +long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, + s64 *nr); +int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, + struct obd_statfs *osfs, time64_t max_age, + int *from_cache); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +#endif +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); + +/* FMD */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +#ifdef DO_FMD_DROP +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid); +#else +#define tgt_fmd_drop(exp, fid) do {} while (0) +#endif + +/* target/update_trans.c */ +int distribute_txn_init(const struct lu_env *env, + struct lu_target *lut, + struct target_distribute_txn_data *tdtd, + __u32 index); +void distribute_txn_fini(const struct lu_env *env, + struct target_distribute_txn_data *tdtd); + +/* target/update_recovery.c */ +int insert_update_records_to_replay_list(struct target_distribute_txn_data *, + struct llog_update_record *, + struct llog_cookie *, __u32); +void dtrq_list_dump(struct target_distribute_txn_data *tdtd, + unsigned int mask); +void dtrq_list_destroy(struct target_distribute_txn_data *tdtd); +int distribute_txn_replay_handle(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq); +__u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd); +struct distribute_txn_replay_req * +distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd); +void dtrq_destroy(struct distribute_txn_replay_req *dtrq); +struct distribute_txn_replay_req_sub * +dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index); +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 transno); +bool is_req_replayed_by_update(struct ptlrpc_request *req); +enum { + ESERIOUS = 0x0001000 +}; + +static inline int err_serious(int rc) +{ + LASSERT(rc < 0); + return -(-rc | ESERIOUS); +} + +static inline int clear_serious(int rc) +{ + if (rc < 0) + rc = -(-rc & ~ESERIOUS); + return rc; +} + +static inline int is_serious(int 
rc) +{ + return (rc < 0 && -rc & ESERIOUS); +} + +/* + * Unified target generic handers macros and generic functions. + */ +#define TGT_RPC_HANDLER_HP(base, flags, opc, fn, hp, fmt, version) \ +[opc - base] = { \ + .th_name = #opc, \ + .th_fail_id = OBD_FAIL_ ## opc ## _NET, \ + .th_opc = opc, \ + .th_flags = flags, \ + .th_act = fn, \ + .th_fmt = fmt, \ + .th_version = version, \ + .th_hp = hp, \ +} +#define TGT_RPC_HANDLER(base, flags, opc, fn, fmt, version) \ + TGT_RPC_HANDLER_HP(base, flags, opc, fn, NULL, fmt, version) + +/* MDT Request with a format known in advance */ +#define TGT_MDT_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(MDS_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) +/* Request with a format we do not yet know */ +#define TGT_MDT_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(MDS_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_MDS_VERSION) + +/* OST Request with a format known in advance */ +#define TGT_OST_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OST_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_OST_VERSION) +#define TGT_OST_HDL_HP(flags, name, fn, hp) \ + TGT_RPC_HANDLER_HP(OST_FIRST_OPC, flags, name, fn, hp, \ + &RQF_ ## name, LUSTRE_OST_VERSION) + +/* MGS request with a format known in advance */ +#define TGT_MGS_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(MGS_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MGS_VERSION) +#define TGT_MGS_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(MGS_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_MGS_VERSION) + +/* + * OBD handler macros and generic functions. + */ +#define TGT_OBD_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OBD_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_OBD_VERSION) +#define TGT_OBD_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(OBD_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_OBD_VERSION) + +/* + * DLM handler macros and generic functions. + */ +#define TGT_DLM_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(LDLM_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_DLM_VERSION) +#define TGT_DLM_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LDLM_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_DLM_VERSION) + +/* + * LLOG handler macros and generic functions. + */ +#define TGT_LLOG_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(LLOG_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_LOG_VERSION) +#define TGT_LLOG_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LLOG_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_LOG_VERSION) + +/* + * Sec context handler macros and generic functions. 
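+ *
+ * The TGT_SEC_HDL_VAR() macro below fills the sec-context slice of a
+ * handler table.  A sketch of such a table (the SEC_CTX_* opcode names
+ * are assumed from the wider Lustre tree, not defined in this header):
+ *
+ *    struct tgt_handler tgt_sec_ctx_handlers[] = {
+ *            TGT_SEC_HDL_VAR(0, SEC_CTX_INIT,      tgt_sec_ctx_init),
+ *            TGT_SEC_HDL_VAR(0, SEC_CTX_INIT_CONT, tgt_sec_ctx_init_cont),
+ *            TGT_SEC_HDL_VAR(0, SEC_CTX_FINI,      tgt_sec_ctx_fini),
+ *    };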
+ */ +#define TGT_SEC_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(SEC_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_OBD_VERSION) + +#define TGT_QUOTA_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(QUOTA_DQACQ, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +/* Sequence service handlers */ +#define TGT_SEQ_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(SEQ_QUERY, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +/* FID Location Database handlers */ +#define TGT_FLD_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(FLD_QUERY, flags, name, fn, NULL, \ + LUSTRE_MDS_VERSION) + +/* LFSCK handlers */ +#define TGT_LFSCK_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LFSCK_FIRST_OPC, flags, name, fn, \ + &RQF_ ## name, LUSTRE_OBD_VERSION) + +/* Request with a format known in advance */ +#define TGT_UPDATE_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OUT_UPDATE, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +#endif /* __LUSTRE_LU_TARGET_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h new file mode 100644 index 0000000000000..9b73278254206 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h @@ -0,0 +1,141 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/libiam.h + * + * iam user level library + * + * Author: Wang Di + * Author: Nikita Danilov + * Author: Fan Yong + */ + +/* + * lustre/libiam.h + */ + +#ifndef __IAM_ULIB_H__ +#define __IAM_ULIB_H__ + +/** \defgroup libiam libiam + * + * @{ + */ + + +#define DX_FMT_NAME_LEN 16 + +enum iam_fmt_t { + FMT_LFIX, + FMT_LVAR +}; + +struct iam_uapi_info { + __u16 iui_keysize; + __u16 iui_recsize; + __u16 iui_ptrsize; + __u16 iui_height; + char iui_fmt_name[DX_FMT_NAME_LEN]; +}; + +/* + * Creat an iam file, but do NOT open it. + * Return 0 if success, else -1. + */ +int iam_creat(char *filename, enum iam_fmt_t fmt, + int blocksize, int keysize, int recsize, int ptrsize); + +/* + * Open an iam file, but do NOT creat it if the file doesn't exist. + * Please use iam_creat for creating the file before use iam_open. + * Return file id (fd) if success, else -1. + */ +int iam_open(char *filename, struct iam_uapi_info *ua); + +/* + * Close file opened by iam_open. + */ +int iam_close(int fd); + +/* + * Please use iam_open before use this function. 
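+ * A typical (purely illustrative) call sequence, with made-up index
+ * geometry and key/record buffers, is:
+ *
+ *    iam_creat("idx", FMT_LFIX, 4096, 8, 8, 4);
+ *    fd = iam_open("idx", &ua);
+ *    iam_insert(fd, &ua, 0, keybuf, 0, recbuf);
+ *    iam_close(fd);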
+ */ +int iam_insert(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_lookup(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_delete(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_it_start(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_next(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_stop(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Change iam file mode. + */ +int iam_polymorph(char *filename, unsigned long mode); + +/** @} libiam */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h new file mode 100644 index 0000000000000..ec64bb610b825 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h @@ -0,0 +1,39 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* + * NOTE: This file is DEPRECATED! Please include lustreapi.h directly + * instead of this file. This file will be removed from a future version + * of lustre! + */ + +#ifndef _LIBLUSTREAPI_H_ +#define _LIBLUSTREAPI_H_ + +#include +#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly." + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h new file mode 100644 index 0000000000000..e5466c7886238 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h @@ -0,0 +1,49 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/ll_fiemap.h + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_fiemap.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ + +#include + +#warning "Including ll_fiemap.h is deprecated. Include linux/lustre/lustre_fiemap.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h new file mode 100644 index 0000000000000..f8489d55a3b44 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h @@ -0,0 +1,40 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre/lustre_barrier_user.h + * + * Lustre write barrier (on MDT) userspace interfaces. + * + * Author: Fan, Yong + */ + +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_barrier_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ + +#include + +#warning "Including lustre_barrier_user.h is deprecated. Include linux/lustre/lustre_barrier_user.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h new file mode 100644 index 0000000000000..7b84426fa2750 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h @@ -0,0 +1,40 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. + * + * Author: Fan, Yong + */ + +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_lfsck_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ + +#include +#warning "Including lustre_lfsck_user.h is deprecated. Include linux/lustre/lustre_lfsck_user.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h new file mode 100644 index 0000000000000..9d8f5ebefa569 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ + +#include + +/* Disable warning until 2.16 or 3.0, until new header is widely available. + * This gives apps time to move to the new header without spurious warnings. +#warning "Including lustre/lustre_user.h is deprecated. Include linux/lustre/lustre_user.h instead." 
+*/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h new file mode 100644 index 0000000000000..6e61cd98ad4ff --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h @@ -0,0 +1,1056 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTREAPI_H_ +#define _LUSTREAPI_H_ + +/** \defgroup llapi llapi + * + * @{ + */ + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) +#endif + +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + +#define lustre_fid struct lu_fid + +/* Currently external applications can access this but in the + * future this will no longer be exposed for the user. Instead + * if you want to know if the library is initialized just call + * llapi_liblustreapi_initialized() which is now available. */ +extern bool liblustreapi_initialized; + +typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, + void *args); + +/* lustreapi message severity level */ +enum llapi_message_level { + LLAPI_MSG_OFF = 0, + LLAPI_MSG_FATAL = 1, + LLAPI_MSG_ERROR = 2, + LLAPI_MSG_WARN = 3, + LLAPI_MSG_NORMAL = 4, + LLAPI_MSG_INFO = 5, + LLAPI_MSG_DEBUG = 6, + LLAPI_MSG_MAX +}; + +typedef void (*llapi_log_callback_t)(enum llapi_message_level level, int err, + const char *fmt, va_list ap); + +static inline bool llapi_liblustreapi_initialized(void) +{ + return liblustreapi_initialized; +} + +/* the bottom three bits reserved for llapi_message_level */ +#define LLAPI_MSG_MASK 0x00000007 +#define LLAPI_MSG_NO_ERRNO 0x00000010 + +static inline const char *llapi_msg_level2str(enum llapi_message_level level) +{ + static const char *levels[LLAPI_MSG_MAX] = {"OFF", "FATAL", "ERROR", + "WARNING", "NORMAL", + "INFO", "DEBUG"}; + + if (level >= LLAPI_MSG_MAX) + return NULL; + + return levels[level]; +} + +void llapi_msg_set_level(int level); +int llapi_msg_get_level(void); +llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); +llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); + +void llapi_error(enum llapi_message_level level, int err, const char *fmt, ...) 
+ __attribute__((__format__(__printf__, 3, 4))); +#define llapi_err_noerrno(level, fmt, a...) \ + llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a) +void llapi_printf(enum llapi_message_level level, const char *fmt, ...) + __attribute__((__format__(__printf__, 2, 3))); + +struct llapi_stripe_param { + unsigned long long lsp_stripe_size; + char *lsp_pool; + int lsp_stripe_offset; + int lsp_stripe_pattern; + /* Number of stripes. Size of lsp_osts[] if lsp_specific is true.*/ + int lsp_stripe_count; + bool lsp_is_specific; + bool lsp_is_create; + __u8 lsp_max_inherit; + __u8 lsp_max_inherit_rr; + __u32 lsp_osts[0]; +}; + +#define lsp_tgts lsp_osts + +int llapi_file_open_param(const char *name, int flags, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_file_create(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, int stripe_pattern); +int llapi_file_open(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +int llapi_file_create_pool(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name); +int llapi_poollist(const char *name); +int llapi_get_poollist(const char *name, char **poollist, int list_size, + char *buffer, int buffer_size); +int llapi_get_poolmembers(const char *poolname, char **members, int list_size, + char *buffer, int buffer_size); +int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); +int llapi_file_lookup(int dirfd, const char *name); +void llapi_set_command_name(const char *cmd); +void llapi_clear_command_name(void); + +enum llapi_layout_verbose { + VERBOSE_STRIPE_COUNT = 0x1, + VERBOSE_STRIPE_SIZE = 0x2, + VERBOSE_STRIPE_OFFSET = 0x4, + VERBOSE_POOL = 0x8, + VERBOSE_DETAIL = 0x10, + VERBOSE_OBJID = 0x20, + VERBOSE_GENERATION = 0x40, + VERBOSE_MDTINDEX = 0x80, + VERBOSE_PATTERN = 0x100, + VERBOSE_COMP_COUNT = 0x200, + VERBOSE_COMP_FLAGS = 0x400, + VERBOSE_COMP_START = 0x800, + VERBOSE_COMP_END = 0x1000, + VERBOSE_COMP_ID = 0x2000, + VERBOSE_DFID = 0x4000, + VERBOSE_HASH_TYPE = 0x8000, + VERBOSE_MIRROR_COUNT = 0x10000, + VERBOSE_MIRROR_ID = 0x20000, + VERBOSE_EXT_SIZE = 0x40000, + VERBOSE_INHERIT = 0x80000, + VERBOSE_INHERIT_RR = 0x100000, + VERBOSE_DEFAULT = VERBOSE_STRIPE_COUNT | VERBOSE_STRIPE_SIZE | + VERBOSE_STRIPE_OFFSET | VERBOSE_POOL | + VERBOSE_OBJID | VERBOSE_GENERATION | + VERBOSE_PATTERN | VERBOSE_HASH_TYPE | + VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | + VERBOSE_COMP_START | VERBOSE_COMP_END | + VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT | + VERBOSE_MIRROR_ID | VERBOSE_EXT_SIZE | + VERBOSE_INHERIT | VERBOSE_INHERIT_RR +}; +/* Compatibility with original names */ +#define VERBOSE_SIZE VERBOSE_STRIPE_SIZE +#define VERBOSE_COUNT VERBOSE_STRIPE_COUNT +#define VERBOSE_OFFSET VERBOSE_STRIPE_OFFSET +#define VERBOSE_LAYOUT VERBOSE_PATTERN + +struct find_param { + unsigned int fp_max_depth; + dev_t fp_dev; + mode_t fp_type; /* S_IFIFO,... */ + uid_t fp_uid; + gid_t fp_gid; + time_t fp_atime; + time_t fp_mtime; + time_t fp_ctime; + /* {a,m,c}sign cannot be bitfields due to using pointers to + * access them during argument parsing. 
*/ + int fp_asign; + int fp_msign; + int fp_csign; + /* these need to be signed values */ + int fp_size_sign:2, + fp_stripe_size_sign:2, + fp_stripe_count_sign:2, + fp_comp_start_sign:2, + fp_comp_end_sign:2, + fp_comp_count_sign:2, + fp_mirror_count_sign:2, + fp_mirror_index_sign:2, + fp_mirror_id_sign:2, + fp_mdt_count_sign:2, + fp_blocks_sign:2; + unsigned long long fp_size; + unsigned long long fp_size_units; + + unsigned long long fp_zero_end:1, + fp_recursive:1, + fp_exclude_pattern:1, + fp_exclude_type:1, + fp_exclude_obd:1, + fp_exclude_mdt:1, + fp_exclude_gid:1, + fp_exclude_uid:1, + fp_check_gid:1, + fp_check_uid:1, + fp_check_pool:1, /* LOV pool name */ + fp_check_size:1, /* file size */ + fp_exclude_pool:1, + fp_exclude_size:1, + fp_exclude_atime:1, + fp_exclude_mtime:1, + fp_exclude_ctime:1, + fp_get_lmv:1, /* get MDT list from LMV */ + fp_raw:1, /* do not fill in defaults */ + fp_check_stripe_size:1, /* LOV stripe size */ + fp_exclude_stripe_size:1, + fp_check_stripe_count:1, /* LOV stripe count */ + fp_exclude_stripe_count:1, + fp_check_layout:1, + fp_exclude_layout:1, + fp_get_default_lmv:1, /* Get default LMV */ + fp_migrate:1, + fp_check_projid:1, + fp_exclude_projid:1, + fp_check_comp_count:1, + fp_exclude_comp_count:1, + fp_check_mirror_count:1, + fp_exclude_mirror_count:1, + fp_check_comp_flags:1, + fp_check_mirror_state:1, + fp_check_comp_start:1, + fp_exclude_comp_start:1, + fp_check_comp_end:1, + fp_exclude_comp_end:1, + fp_check_comp_id:1, + fp_exclude_comp_id:1, + fp_check_mirror_id:1, + fp_exclude_mirror_id:1, + fp_check_mirror_index:1, + fp_exclude_mirror_index:1, + fp_check_mdt_count:1, + fp_exclude_mdt_count:1, + fp_check_hash_type:1, + fp_exclude_hash_type:1, + fp_yaml:1, /* output layout in YAML */ + fp_check_blocks:1, + fp_exclude_blocks:1, + fp_lazy:1; + + enum llapi_layout_verbose fp_verbose; + int fp_quiet; + + /* regular expression */ + char *fp_pattern; + + struct obd_uuid *fp_obd_uuid; + int fp_num_obds; + int fp_num_alloc_obds; + int fp_obd_index; + int *fp_obd_indexes; + + struct obd_uuid *fp_mdt_uuid; + int fp_num_mdts; + int fp_num_alloc_mdts; + int fp_mdt_index; + int *fp_mdt_indexes; + int fp_file_mdt_index; + + size_t fp_lum_size; + struct lov_user_mds_data *fp_lmd; + + char fp_poolname[LOV_MAXPOOLNAME + 1]; + + __u32 fp_lmv_stripe_count; + struct lmv_user_md *fp_lmv_md; + + unsigned long long fp_stripe_size; + unsigned long long fp_stripe_size_units; + unsigned long long fp_stripe_count; + __u32 fp_layout; + + __u32 fp_comp_count; + __u32 fp_mirror_count; + __u32 fp_comp_flags; + __u32 fp_comp_neg_flags; + __u16 fp_mirror_state; + __u16 fp_mirror_neg_state; + __u32 fp_comp_id; + __u16 fp_mirror_id; + __u16 fp_mirror_index; + unsigned long long fp_comp_start; + unsigned long long fp_comp_start_units; + unsigned long long fp_comp_end; + unsigned long long fp_comp_end_units; + unsigned long long fp_mdt_count; + unsigned fp_projid; + unsigned long long fp_blocks; + unsigned long long fp_blocks_units; + + /* In-process parameters. 
*/ + unsigned long fp_got_uuids:1, + fp_obds_printed:1; + unsigned int fp_depth; + unsigned int fp_hash_type; +}; + +int llapi_ostlist(char *path, struct find_param *param); +int llapi_uuid_match(char *real_uuid, char *search_uuid); +int llapi_getstripe(char *path, struct find_param *param); +int llapi_find(char *path, struct find_param *param); + +int llapi_file_fget_mdtidx(int fd, int *mdtidx); +int llapi_dir_set_default_lmv(const char *name, + const struct llapi_stripe_param *param); +int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *pool_name); +int llapi_dir_create(const char *name, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *poolname); +int llapi_direntry_remove(char *dname); + +int llapi_obd_fstatfs(int fd, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +int llapi_ping(char *obd_type, char *obd_name); +int llapi_target_check(int num_types, char **obd_types, char *dir); +int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); +int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +int llapi_is_lustre_mnttype(const char *type); +int llapi_search_tgt(char *fsname, char *poolname, char *tgtname, bool is_mdt); +int llapi_search_ost(char *fsname, char *poolname, char *ostname); +int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +int llapi_parse_size(const char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +int llapi_search_mounts(const char *pathname, int index, char *mntdir, + char *fsname); +int llapi_search_fsname(const char *pathname, char *fsname); +int llapi_getname(const char *path, char *buf, size_t size); +int llapi_search_fileset(const char *pathname, char *fileset); + +int llapi_search_rootpath(char *pathname, const char *fsname); +int llapi_nodemap_exists(const char *name); +int llapi_migrate_mdt(char *path, struct find_param *param); +int llapi_mv(char *path, struct find_param *param); + +struct mntent; + +#define HAVE_LLAPI_IS_LUSTRE_MNT +int llapi_is_lustre_mnt(struct mntent *mnt); +int llapi_quotactl(char *mnt, struct if_quotactl *qctl); +int llapi_target_iterate(int type_num, char **obd_type, void *args, + llapi_cb_t cb); +int llapi_get_connect_flags(const char *mnt, __u64 *flags); +int llapi_cp(int argc, char *argv[]); +int llapi_ls(int argc, char *argv[]); +int llapi_fid2path(const char *device, const char *fidstr, char *path, + int pathlen, long long *recno, int *linkno); +int llapi_path2fid(const char *path, struct lu_fid *fid); +int llapi_get_mdt_index_by_fid(int fd, const struct lu_fid *fid, + int *mdt_index); +int llapi_get_lum_file(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_dir(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_file_fd(int dir_fd, const char *fname, __u64 *valid, + lstatx_t *statx, struct lov_user_md *lum, + size_t lumsize); +int llapi_get_lum_dir_fd(int 
dir_fd, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); + +int llapi_fd2fid(int fd, struct lu_fid *fid); +/* get FID of parent dir + the related name of entry in this parent dir */ +int llapi_path2parent(const char *path, unsigned int linkno, + struct lu_fid *parent_fid, char *name, size_t name_size); +int llapi_fd2parent(int fd, unsigned int linkno, struct lu_fid *parent_fid, + char *name, size_t name_size); +int llapi_rmfid(const char *path, struct fid_array *fa); +int llapi_chomp_string(char *buf); +int llapi_open_by_fid(const char *dir, const struct lu_fid *fid, + int open_flags); +int llapi_get_version_string(char *version, unsigned int version_size); +/* llapi_get_version() is deprecated, use llapi_get_version_string() instead */ +int llapi_get_version(char *buffer, int buffer_size, char **version) + __attribute__((deprecated)); +int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); +int llapi_file_flush(int fd); +extern int llapi_get_ost_layout_version(int fd, __u32 *layout_version); +int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); +int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); +int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_register_event_fifo(const char *path); +int llapi_hsm_unregister_event_fifo(const char *path); +void llapi_hsm_log_error(enum llapi_message_level level, int _rc, + const char *fmt, va_list args); + +int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); +int llapi_create_volatile_idx(const char *directory, int mdt_idx, + int open_flags); +int llapi_create_volatile_param(const char *directory, int mdt_idx, + int open_flags, mode_t mode, + const struct llapi_stripe_param *stripe_param); + +static inline int llapi_create_volatile(char *directory, int open_flags) +{ + return llapi_create_volatile_idx(directory, -1, open_flags); +} + + +int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, + int gid, __u64 flags); +int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags); +int llapi_swap_layouts(const char *path1, const char *path2, __u64 dv1, + __u64 dv2, __u64 flags); + +/* Changelog interface. priv is private state, managed internally by these + * functions */ + +/* Records received are in extended format now, though most of them are still + * written in disk in changelog_rec format (to save space and time), it's + * converted to extended format in the lustre api to ease changelog analysis. */ +#define HAVE_CHANGELOG_EXTEND_REC 1 + +int llapi_changelog_start(void **priv, enum changelog_send_flag flags, + const char *mdtname, long long startrec); +int llapi_changelog_fini(void **priv); +int llapi_changelog_recv(void *priv, struct changelog_rec **rech); +int llapi_changelog_in_buf(void *priv); +int llapi_changelog_free(struct changelog_rec **rech); +int llapi_changelog_get_fd(void *priv); +/* Allow records up to endrec to be destroyed; requires registered id. */ +int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); +extern int llapi_changelog_set_xflags(void *priv, + enum changelog_send_extra_flag extra_flags); + +/* HSM copytool interface. 
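The changelog calls above follow a start/recv/free/fini lifecycle, with priv acting as an opaque reader handle. A minimal consumer sketch built only from the prototypes declared here; the flags value of 0, the starting record of 0, the cr_index/cr_type field names and the "cl1" reader id are illustrative assumptions rather than anything this header specifies.

#include <stdio.h>

/* Usage sketch (illustrative only). */
static int dump_changelog(const char *mdtname)
{
        struct changelog_rec *rec;
        void *priv = NULL;
        long long last = 0;
        int rc;

        rc = llapi_changelog_start(&priv, 0 /* no special send flags */,
                                   mdtname, 0 /* start from record 0 */);
        if (rc < 0)
                return rc;

        while ((rc = llapi_changelog_recv(priv, &rec)) == 0) {
                last = rec->cr_index;   /* assumed changelog_rec fields */
                printf("record %lld, type %u\n", last,
                       (unsigned int)rec->cr_type);
                llapi_changelog_free(&rec);
        }

        llapi_changelog_fini(&priv);

        /* A registered reader (e.g. "cl1") could then release consumed
         * records with llapi_changelog_clear(mdtname, "cl1", last). */
        return rc > 0 ? 0 : rc;     /* positive return assumed to mean EOF */
}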
+ * priv is private state, managed internally by these functions + */ +struct hsm_copytool_private; +struct hsm_copyaction_private; + +int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, + const char *mnt, int archive_count, + int *archives, int rfd_flags); +int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); +int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); +int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, + const struct hsm_copytool_private *ct, + const struct hsm_action_item *hai, + int restore_mdt_index, int restore_open_flags, + bool is_error); +int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, + const struct hsm_extent *he, int hp_flags, int errval); +int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, + const struct hsm_extent *he, __u64 total, + int hp_flags); +int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, + struct lu_fid *fid); +int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); +int llapi_hsm_import(const char *dst, int archive, const struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name, + struct lu_fid *newfid); + +/* HSM user interface */ +struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +int llapi_hsm_request(const char *path, const struct hsm_user_request *request); +int llapi_hsm_current_action(const char *path, struct hsm_current_action *hca); + +/* JSON handling */ +int llapi_json_init_list(struct llapi_json_item_list **item_list); +int llapi_json_destroy_list(struct llapi_json_item_list **item_list); +int llapi_json_add_item(struct llapi_json_item_list **item_list, char *key, + __u32 type, void *val); +int llapi_json_write_list(struct llapi_json_item_list **item_list, FILE *fp); + +/* File lease */ +int llapi_lease_acquire(int fd, enum ll_lease_mode mode); +int llapi_lease_release(int fd); +int llapi_lease_set(int fd, const struct ll_ioc_lease *data); +int llapi_lease_check(int fd); +int llapi_lease_get(int fd, int mode); /* obsoleted */ +int llapi_lease_put(int fd); /* obsoleted */ + +/* Group lock */ +int llapi_group_lock(int fd, int gid); +int llapi_group_unlock(int fd, int gid); + +/* Ladvise */ +int llapi_ladvise(int fd, unsigned long long flags, int num_advise, + struct llapi_lu_ladvise *ladvise); +/** @} llapi */ + +/* llapi_layout user interface */ + +/** + * An array element storing component info to be resynced during mirror + * resynchronization. + */ +struct llapi_resync_comp { + uint64_t lrc_start; + uint64_t lrc_end; + uint32_t lrc_mirror_id; + uint32_t lrc_id; /* component id */ + bool lrc_synced; +}; + +/** Opaque data type abstracting the layout of a Lustre file. 
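The copytool comment above notes that the private state is managed entirely by the library. A skeleton event loop using only the register/recv/unregister prototypes declared here, assuming archive_count == 0 means "serve any archive"; processing of the received hsm_action_list is left as a comment.

/* Usage sketch (illustrative only). */
static int toy_copytool(const char *mnt)
{
        struct hsm_copytool_private *ct = NULL;
        struct hsm_action_list *hal;
        int msgsize;
        int rc;

        /* archive_count 0 with a NULL list is assumed to accept requests
         * for any archive id. */
        rc = llapi_hsm_copytool_register(&ct, mnt, 0, NULL, 0);
        if (rc < 0)
                return rc;

        while ((rc = llapi_hsm_copytool_recv(ct, &hal, &msgsize)) == 0) {
                /* Walk 'hal' here and run llapi_hsm_action_begin(),
                 * llapi_hsm_action_progress() and llapi_hsm_action_end()
                 * for each requested item. */
        }

        llapi_hsm_copytool_unregister(&ct);
        return rc;
}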
*/ +struct llapi_layout; + +int llapi_mirror_truncate(int fd, unsigned int id, off_t length); +ssize_t llapi_mirror_write(int fd, unsigned int id, const void *buf, + size_t count, off_t pos); +uint32_t llapi_mirror_find(struct llapi_layout *layout, + uint64_t file_start, uint64_t file_end, + uint64_t *endp); +int llapi_mirror_find_stale(struct llapi_layout *layout, + struct llapi_resync_comp *comp, size_t comp_size, + __u16 *mirror_ids, int ids_nr); +int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, + struct llapi_resync_comp *comp_array, + int comp_size, uint64_t start, uint64_t end); +/* + * Flags to control how layouts are retrieved. + */ + +/* Replace non-specified values with expected inherited values. */ +#define LAYOUT_GET_EXPECTED 0x1 + +/** + * Return a pointer to a newly-allocated opaque data structure containing + * the layout for the file at \a path. The pointer should be freed with + * llapi_layout_free() when it is no longer needed. Failure is indicated + * by a NULL return value and an appropriate error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_path(const char *path, uint32_t flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file referenced by open file descriptor \a fd. The + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is indicated by a NULL return value and an + * appropriate error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with Lustre file identifier + * \a fid. The string \a path must name a path within the + * filesystem that contains the file being looked up, such as the + * filesystem root. The returned pointer should be freed with + * llapi_layout_free() when it is no longer needed. Failure is + * indicated with a NULL return value and an appropriate error code + * stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_fid(const char *path, + const struct lu_fid *fid, + uint32_t flags); + +enum llapi_layout_xattr_flags { + LLAPI_LXF_CHECK = 0x0001, + LLAPI_LXF_COPY = 0x0002, +}; + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with extended attribute \a lov_xattr. The + * length of the extended attribute is \a lov_xattr_size. The \a lov_xattr + * should be raw xattr without being swapped, since this function will swap it + * properly. Thus, \a lov_xattr will be modified during the process. If the + * \a LLAPI_LXF_CHECK flag of \a flags is set, this function will check whether + * the objects count in lum is consistent with the stripe count in lum. This + * check only apply to regular file, so \a LLAPI_LXF_CHECK flag should be + * cleared if the xattr belongs to a directory. If the \a LLAPI_LXF_COPY flag + * of \a flags is set, this function will use a temporary buffer for byte + * swapping when necessary, leaving \a lov_xattr untouched. Otherwise, the byte + * swapping will be done to the \a lov_xattr buffer directly. The returned + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is * indicated with a NULL return value and an appropriate + * error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_xattr(void *lov_xattr, + ssize_t lov_xattr_size, + uint32_t flags); + +/** + * Allocate a new layout. 
Use this when creating a new file with + * llapi_layout_file_create(). + */ +struct llapi_layout *llapi_layout_alloc(void); + +/** + * Free memory allocated for \a layout. + */ +void llapi_layout_free(struct llapi_layout *layout); + +/** + * llapi_layout_merge() - Merge a composite layout into another one. + * @dst_layout: Destination composite layout. + * @src_layout: Source composite layout. + * + * This function copies all of the components from @src_layout and + * appends them to @dst_layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_merge(struct llapi_layout **dst_layout, + const struct llapi_layout *src_layout); + +/** Not a valid stripe size, offset, or RAID pattern. */ +#define LLAPI_LAYOUT_INVALID 0x1000000000000001ULL + +/** + * When specified or returned as the value for stripe count, + * stripe size, offset, or RAID pattern, the filesystem-wide + * default behavior will apply. + */ +#define LLAPI_LAYOUT_DEFAULT (LLAPI_LAYOUT_INVALID + 1) + +/** + * When specified or returned as the value for stripe count, all + * available OSTs will be used. + */ +#define LLAPI_LAYOUT_WIDE (LLAPI_LAYOUT_INVALID + 2) + +/** + * When specified as the value for layout pattern, file objects will be + * stored using RAID0. That is, data will be split evenly and without + * redundancy across all OSTs in the layout. + */ +#define LLAPI_LAYOUT_RAID0 0ULL +#define LLAPI_LAYOUT_MDT 2ULL + +/** +* The layout includes a specific set of OSTs on which to allocate. +*/ +#define LLAPI_LAYOUT_SPECIFIC 0x2000000000000000ULL + +/** + * A valid ost index should be less than maximum valid OST index (UINT_MAX). + */ +#define LLAPI_LAYOUT_IDX_MAX 0x00000000FFFFFFFFULL + +/** + * Flags to modify how layouts are retrieved. + */ +/******************** Stripe Count ********************/ + +/** + * Store the stripe count of \a layout in \a count. + * + * \retval 0 Success + * \retval -1 Error with status code in errno. + */ +int llapi_layout_stripe_count_get(const struct llapi_layout *layout, + uint64_t *count); + +/** + * Set the stripe count of \a layout to \a count. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_count_set(struct llapi_layout *layout, uint64_t count); + +/******************** Stripe Size ********************/ + +/** + * Store the stripe size of \a layout in \a size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_size_get(const struct llapi_layout *layout, + uint64_t *size); + +/** + * Set the stripe size of \a layout to \a stripe_size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_size_set(struct llapi_layout *layout, uint64_t size); + +/******************** Stripe Pattern ********************/ + +/** + * Store the stripe pattern of \a layout in \a pattern. + * + * \retval 0 Success. + * \retval -1 Error with status code in errno. + */ +int llapi_layout_pattern_get(const struct llapi_layout *layout, + uint64_t *pattern); + +/** + * Set the stripe pattern of \a layout to \a pattern. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pattern_set(struct llapi_layout *layout, uint64_t pattern); + +/******************** OST Index ********************/ + +/** + * Store the index of the OST where stripe number \a stripe_number is stored + * in \a index. 
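As a concrete use of the getters above, a minimal sketch that loads an existing file's layout and prints its stripe count and size; path handling and output formatting are arbitrary.

#include <errno.h>
#include <inttypes.h>
#include <stdio.h>

/* Usage sketch (illustrative only). */
static int print_striping(const char *path)
{
        struct llapi_layout *layout;
        uint64_t count = 0, size = 0;
        int rc = 0;

        layout = llapi_layout_get_by_path(path, 0);
        if (layout == NULL)
                return -errno;

        if (llapi_layout_stripe_count_get(layout, &count) < 0 ||
            llapi_layout_stripe_size_get(layout, &size) < 0) {
                rc = -errno;
        } else {
                /* Either value may also be the special LLAPI_LAYOUT_DEFAULT
                 * or LLAPI_LAYOUT_WIDE value rather than a literal number. */
                printf("%s: stripe_count=%" PRIu64 " stripe_size=%" PRIu64 "\n",
                       path, count, size);
        }

        llapi_layout_free(layout);
        return rc;
}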
+ * + * An error return value will result from a NULL layout, if \a + * stripe_number is out of range, or if \a layout was not initialized + * with llapi_layout_lookup_by{path,fd,fid}(). + * + * \retval 0 Success + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_ost_index_get(const struct llapi_layout *layout, + uint64_t stripe_number, uint64_t *index); + +/** + * Set the OST index associated with stripe number \a stripe_number to + * \a ost_index. + * NB: This is currently supported only for \a stripe_number = 0 and + * other usage will return ENOTSUPP in errno. A NULL \a layout or + * out-of-range \a stripe_number will return EINVAL in errno. + * + * \retval 0 Success. + * \retval -1 Error with errno set to non-zero value. + */ +int llapi_layout_ost_index_set(struct llapi_layout *layout, int stripe_number, + uint64_t index); + +/******************** Pool Name ********************/ + +/** + * Store up to \a pool_name_len characters of the name of the pool of + * OSTs associated with \a layout into the buffer pointed to by + * \a pool_name. + * + * The correct calling form is: + * + * llapi_layout_pool_name_get(layout, pool_name, sizeof(pool_name)); + * + * A pool defines a set of OSTs from which file objects may be + * allocated for a file using \a layout. + * + * On success, the number of bytes stored is returned, excluding the + * terminating '\0' character (zero indicates that \a layout does not + * have an associated OST pool). On error, -1 is returned and errno is + * set appropriately. Possible sources of error include a NULL pointer + * argument or insufficient space in \a dest to store the pool name, + * in which cases errno will be set to EINVAL. + * + * \retval 0+ The number of bytes stored in \a dest. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pool_name_get(const struct llapi_layout *layout, + char *pool_name, size_t pool_name_len); + +/** + * Set the name of the pool of OSTs from which file objects will be + * allocated to \a pool_name. + * + * If the pool name uses "fsname.pool" notation to qualify the pool name + * with a filesystem name, the "fsname." portion will be silently + * discarded before storing the value. No validation that \a pool_name + * is an existing non-empty pool in filesystem \a fsname will be + * performed. Such validation can be performed by the application if + * desired using the llapi_search_ost() function. The maximum length of + * the stored value is defined by the constant LOV_MAXPOOLNAME. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_pool_name_set(struct llapi_layout *layout, + const char *pool_name); + +/******************** File Creation ********************/ + +/** + * Open an existing file at \a path, or create it with the specified + * \a layout and \a mode. + * + * One access mode and zero or more file creation flags and file status + * flags May be bitwise-or'd in \a open_flags (see open(2)). Return an + * open file descriptor for the file. If \a layout is non-NULL and + * \a path is not on a Lustre filesystem this function will fail and set + * errno to ENOTTY. + * + * An already existing file may be opened with this function, but + * \a layout and \a mode will not be applied to it. Callers requiring a + * guarantee that the opened file is created with the specified + * \a layout and \a mode should use llapi_layout_file_create(). 
+ * + * A NULL \a layout may be specified, in which case the standard Lustre + * behavior for assigning layouts to newly-created files will apply. + * + * \retval 0+ An open file descriptor. + * \retval -1 Error with status code in errno. + */ +int llapi_layout_file_open(const char *path, int open_flags, mode_t mode, + const struct llapi_layout *layout); + +/** + * Create a new file at \a path with the specified \a layout and \a mode. + * + * One access mode and zero or more file creation flags and file status + * flags May be bitwise-or'd in \a open_flags (see open(2)). Return an + * open file descriptor for the file. If \a layout is non-NULL and + * \a path is not on a Lustre filesystem this function will fail and set + * errno to ENOTTY. + * + * The function call + * + * llapi_layout_file_create(path, open_flags, mode, layout) + * + * shall be equivalent to: + * + * llapi_layout_file_open(path, open_flags|O_CREAT|O_EXCL, mode, layout) + * + * It is an error if \a path specifies an existing file. + * + * A NULL \a layout may be specified, in which the standard Lustre + * behavior for assigning layouts to newly-created files will apply. + * + * \retval 0+ An open file descriptor. + * \retval -1 Error with status code in errno. + */ +int llapi_layout_file_create(const char *path, int open_flags, int mode, + const struct llapi_layout *layout); + +/** + * Set flags to the header of component layout. + */ +int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags); +int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags); +const char *llapi_layout_flags_string(uint32_t flags); +const __u16 llapi_layout_string_flags(char *string); + +/** + * llapi_layout_mirror_count_get() - Get mirror count from the header of + * a layout. + * @layout: Layout to get mirror count from. + * @count: Returned mirror count value. + * + * This function gets mirror count from the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_get(struct llapi_layout *layout, + uint16_t *count); + +/** + * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout. + * @layout: Layout to set mirror count in. + * @count: Mirror count value to be set. + * + * This function sets mirror count to the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_set(struct llapi_layout *layout, + uint16_t count); + +/** + * Fetch the start and end offset of the current layout component. + */ +int llapi_layout_comp_extent_get(const struct llapi_layout *layout, + uint64_t *start, uint64_t *end); +/** + * Set the extent of current layout component. + */ +int llapi_layout_comp_extent_set(struct llapi_layout *layout, + uint64_t start, uint64_t end); + +/* PFL component flags table */ +static const struct comp_flag_name { + enum lov_comp_md_entry_flags cfn_flag; + const char *cfn_name; +} comp_flags_table[] = { + { LCME_FL_INIT, "init" }, + { LCME_FL_STALE, "stale" }, + { LCME_FL_PREF_RW, "prefer" }, + { LCME_FL_OFFLINE, "offline" }, + { LCME_FL_NOSYNC, "nosync" }, +}; + +/** + * Gets the attribute flags of the current component. + */ +int llapi_layout_comp_flags_get(const struct llapi_layout *layout, + uint32_t *flags); +/** + * Sets the specified flags of the current component leaving other flags as-is. + */ +int llapi_layout_comp_flags_set(struct llapi_layout *layout, uint32_t flags); +/** + * Clears the flags specified in the flags leaving other flags as-is. 
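Combining llapi_layout_alloc(), the stripe setters and llapi_layout_file_create() described above, a minimal creation sketch; the stripe count, stripe size, open flags and mode are arbitrary example values.

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/* Usage sketch (illustrative only). */
static int create_striped_file(const char *path)
{
        struct llapi_layout *layout;
        int fd, rc = 0;

        layout = llapi_layout_alloc();
        if (layout == NULL)
                return -errno;

        if (llapi_layout_stripe_count_set(layout, 4) < 0 ||
            llapi_layout_stripe_size_set(layout, 1048576) < 0) {
                rc = -errno;
                goto out;
        }

        /* Fails if 'path' already exists, per the O_CREAT|O_EXCL
         * equivalence described above. */
        fd = llapi_layout_file_create(path, O_RDWR, 0644, layout);
        if (fd < 0)
                rc = -errno;
        else
                close(fd);
out:
        llapi_layout_free(layout);
        return rc;
}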
+ */ +int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags); +/** + * Fetches the file-unique component ID of the current layout component. + */ +int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id); +/** + * Fetches the mirror ID of the current layout component. + */ +int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id); +/** + * Adds one component to the existing composite or plain layout. + */ +int llapi_layout_comp_add(struct llapi_layout *layout); +/** + * Adds a first component of a mirror to the existing composite layout. + */ +int llapi_layout_add_first_comp(struct llapi_layout *layout); +/** + * Deletes the current layout component from the composite layout. + */ +int llapi_layout_comp_del(struct llapi_layout *layout); + +enum llapi_layout_comp_use { + LLAPI_LAYOUT_COMP_USE_FIRST = 1, + LLAPI_LAYOUT_COMP_USE_LAST = 2, + LLAPI_LAYOUT_COMP_USE_NEXT = 3, + LLAPI_LAYOUT_COMP_USE_PREV = 4, +}; + +/** + * Set the currently active component to the specified component ID. + */ +int llapi_layout_comp_use_id(struct llapi_layout *layout, uint32_t id); +/** + * Select the currently active component at the specified position. + */ +int llapi_layout_comp_use(struct llapi_layout *layout, uint32_t pos); +/** + * Add layout components to an existing file. + */ +int llapi_layout_file_comp_add(const char *path, + const struct llapi_layout *layout); +/** + * Delete component(s) by the specified component id or flags. + */ +int llapi_layout_file_comp_del(const char *path, uint32_t id, uint32_t flags); +/** + * Change flags or other parameters of the component(s) by component ID of an + * existing file. The component to be modified is specified by the + * comp->lcme_id value, which must be an unique component ID. The new + * attributes are passed in by @comp and @valid is used to specify which + * attributes in the component are going to be changed. + */ +int llapi_layout_file_comp_set(const char *path, uint32_t *ids, uint32_t *flags, + size_t count); +/** + * Check if the file layout is composite. + */ +bool llapi_layout_is_composite(struct llapi_layout *layout); + +enum { + LLAPI_LAYOUT_ITER_CONT = 0, + LLAPI_LAYOUT_ITER_STOP = 1, +}; + +/** + * Iteration callback function. 
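The LLAPI_LAYOUT_COMP_USE_* selectors above also allow walking a composite layout without the callback-based iterator. A sketch assuming llapi_layout_comp_use() returns 0 on success and a positive value once no further component exists; that return convention is an assumption, not something spelled out in this header.

#include <inttypes.h>
#include <stdio.h>

/* Usage sketch (illustrative only). */
static int print_components(struct llapi_layout *layout)
{
        uint64_t start, end;
        uint32_t id;
        int rc;

        rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
        while (rc == 0) {
                if (llapi_layout_comp_id_get(layout, &id) < 0 ||
                    llapi_layout_comp_extent_get(layout, &start, &end) < 0)
                        return -1;

                printf("component %" PRIu32 ": [%" PRIu64 ", %" PRIu64 ")\n",
                       id, start, end);

                rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
        }

        return rc < 0 ? -1 : 0;
}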
+ * + * \retval LLAPI_LAYOUT_ITER_CONT Iteration proceeds + * \retval LLAPI_LAYOUT_ITER_STOP Stop iteration + * \retval < 0 error code + */ +typedef int (*llapi_layout_iter_cb)(struct llapi_layout *layout, void *cbdata); + +/** + * Iterate all components in the corresponding layout + */ +int llapi_layout_comp_iterate(struct llapi_layout *layout, + llapi_layout_iter_cb cb, void *cbdata); + +/** + * FLR: mirror operation APIs + */ +int llapi_mirror_set(int fd, unsigned int id); +int llapi_mirror_clear(int fd); +ssize_t llapi_mirror_read(int fd, unsigned int id, + void *buf, size_t count, off_t pos); +ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count); +int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, + off_t pos, size_t count); + +int llapi_param_get_paths(const char *pattern, glob_t *paths); +int llapi_param_get_value(const char *path, char **buf, size_t *buflen); +void llapi_param_paths_free(glob_t *paths); + +/* MDLL */ +int llapi_dir_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name); + +void llapi_hsm_action_begin_restore_dir(struct hsm_copytool_private *ct); + +/** @} llapi */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h new file mode 100644 index 0000000000000..933d09ab4ef1f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h @@ -0,0 +1,52 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_acl.h + */ + +#ifndef _LUSTRE_ACL_H +#define _LUSTRE_ACL_H + +#include +#include +#ifdef CONFIG_FS_POSIX_ACL +# include +# define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 +# define LUSTRE_POSIX_ACL_MAX_SIZE_OLD \ + (sizeof(posix_acl_xattr_header) + \ + LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry)) +#endif /* CONFIG_FS_POSIX_ACL */ + +#ifndef LUSTRE_POSIX_ACL_MAX_SIZE_OLD +# define LUSTRE_POSIX_ACL_MAX_SIZE_OLD 0 +#endif /* LUSTRE_POSIX_ACL_MAX_SIZE */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h new file mode 100644 index 0000000000000..df6f78bb4b29b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre_barrier.h + * + * Lustre write barrier (on MDT) exported functions. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_BARRIER_H +# define _LUSTRE_BARRIER_H + +#include +#include + +bool barrier_entry(struct dt_device *key); +void barrier_exit(struct dt_device *key); +int barrier_handler(struct dt_device *key, struct ptlrpc_request *req); +int barrier_register(struct dt_device *key, struct dt_device *next); +void barrier_deregister(struct dt_device *key); + +#endif /* _LUSTRE_BARRIER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h new file mode 100644 index 0000000000000..6306734c9c575 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -0,0 +1,885 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_COMPAT_H +#define _LUSTRE_COMPAT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef HAVE_FS_STRUCT_RWLOCK +# define LOCK_FS_STRUCT(fs) write_lock(&(fs)->lock) +# define UNLOCK_FS_STRUCT(fs) write_unlock(&(fs)->lock) +#else +# define LOCK_FS_STRUCT(fs) spin_lock(&(fs)->lock) +# define UNLOCK_FS_STRUCT(fs) spin_unlock(&(fs)->lock) +#endif + +#ifdef HAVE_FS_STRUCT_SEQCOUNT +# define WRITE_FS_SEQ_BEGIN(fs) write_seqcount_begin(&(fs)->seq) +# define WRITE_FS_SEQ_END(fs) write_seqcount_end(&(fs)->seq) +#else +# define WRITE_FS_SEQ_BEGIN(fs) +# define WRITE_FS_SEQ_END(fs) +#endif +static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct path path; + struct path old_pwd; + + path.mnt = mnt; + path.dentry = dentry; + path_get(&path); + LOCK_FS_STRUCT(fs); + WRITE_FS_SEQ_BEGIN(fs); + old_pwd = fs->pwd; + fs->pwd = path; + WRITE_FS_SEQ_END(fs); + UNLOCK_FS_STRUCT(fs); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +#define current_ngroups current_cred()->group_info->ngroups +#define current_groups current_cred()->group_info->small_block + +/* + * OBD need working random driver, thus all our + * initialization routines must be called after device + * driver initialization + */ +#ifndef MODULE +#undef module_init +#define module_init(a) late_initcall(a) +#endif + +#ifndef MODULE_ALIAS_FS +#define MODULE_ALIAS_FS(name) +#endif + +#ifdef HAVE_GENERIC_PERMISSION_2ARGS +# define ll_generic_permission(mnt_userns, inode, mask, flags, check_acl) \ + generic_permission(inode, mask) +#elif defined HAVE_GENERIC_PERMISSION_4ARGS +# define ll_generic_permission(mnt_userns, inode, mask, flags, check_acl) \ + generic_permission(inode, mask, flags, check_acl) +#elif defined HAVE_USER_NAMESPACE_ARG +# define ll_generic_permission(mnt_userns, inode, mask, flags, check_acl) \ + generic_permission(mnt_userns, inode, mask) +#else +# define ll_generic_permission(mnt_userns, inode, mask, flags, check_acl) \ + generic_permission(inode, mask, check_acl) +#endif + +#ifdef HAVE_4ARGS_VFS_SYMLINK +#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \ + vfs_symlink(dir, dentry, path, mode) +#else +#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \ + vfs_symlink(dir, dentry, path) +#endif + +#if !defined(HAVE_FILE_LLSEEK_SIZE) || defined(HAVE_FILE_LLSEEK_SIZE_5ARGS) +#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \ + generic_file_llseek_size(file, offset, origin, maxbytes, eof); +#else +#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \ + generic_file_llseek_size(file, offset, origin, maxbytes); +#endif + +#ifdef HAVE_INODE_DIO_WAIT +/* inode_dio_wait(i) use as-is for write lock */ +# define inode_dio_write_done(i) do {} while (0) /* for write unlock */ +#else +# define inode_dio_wait(i) down_write(&(i)->i_alloc_sem) +# define inode_dio_write_done(i) up_write(&(i)->i_alloc_sem) +#endif + +#ifndef FS_HAS_FIEMAP +#define FS_HAS_FIEMAP (0) +#endif + +#ifndef HAVE_SIMPLE_SETATTR +#define simple_setattr(dentry, ops) inode_setattr((dentry)->d_inode, ops) +#endif + +#ifndef HAVE_INIT_LIST_HEAD_RCU +static inline void INIT_LIST_HEAD_RCU(struct list_head *list) +{ + WRITE_ONCE(list->next, list); + WRITE_ONCE(list->prev, list); +} +#endif + +#ifndef HAVE_DQUOT_SUSPEND +# define 
ll_vfs_dq_init vfs_dq_init +# define ll_vfs_dq_drop vfs_dq_drop +# define ll_vfs_dq_transfer vfs_dq_transfer +# define ll_vfs_dq_off(sb, remount) vfs_dq_off(sb, remount) +#else +# define ll_vfs_dq_init dquot_initialize +# define ll_vfs_dq_drop dquot_drop +# define ll_vfs_dq_transfer dquot_transfer +# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1) +#endif + +#ifndef HAVE_BLKDEV_GET_BY_DEV +# define blkdev_get_by_dev(dev, mode, holder) open_by_devnum(dev, mode) +#endif + +#ifdef HAVE_BVEC_ITER +#define bio_idx(bio) (bio->bi_iter.bi_idx) +#define bio_set_sector(bio, sector) (bio->bi_iter.bi_sector = sector) +#define bvl_to_page(bvl) (bvl->bv_page) +#else +#define bio_idx(bio) (bio->bi_idx) +#define bio_set_sector(bio, sector) (bio->bi_sector = sector) +#define bio_sectors(bio) ((bio)->bi_size >> 9) +#ifndef HAVE_BIO_END_SECTOR +#define bio_end_sector(bio) (bio->bi_sector + bio_sectors(bio)) +#endif +#define bvl_to_page(bvl) (bvl->bv_page) +#endif + +#ifdef HAVE_BVEC_ITER +#define bio_start_sector(bio) (bio->bi_iter.bi_sector) +#else +#define bio_start_sector(bio) (bio->bi_sector) +#endif + +#ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS +#define blk_queue_max_segments(rq, seg) \ + do { blk_queue_max_phys_segments(rq, seg); \ + blk_queue_max_hw_segments(rq, seg); } while (0) +#else +#define queue_max_phys_segments(rq) queue_max_segments(rq) +#define queue_max_hw_segments(rq) queue_max_segments(rq) +#endif + +#ifdef HAVE_BLK_PLUG +#define DECLARE_PLUG(plug) struct blk_plug plug +#else /* !HAVE_BLK_PLUG */ +#define DECLARE_PLUG(name) +#define blk_start_plug(plug) do {} while (0) +#define blk_finish_plug(plug) do {} while (0) +#endif + +#ifdef HAVE_KMAP_ATOMIC_HAS_1ARG +#define ll_kmap_atomic(a, b) kmap_atomic(a) +#define ll_kunmap_atomic(a, b) kunmap_atomic(a) +#else +#define ll_kmap_atomic(a, b) kmap_atomic(a, b) +#define ll_kunmap_atomic(a, b) kunmap_atomic(a, b) +#endif + +#ifndef HAVE_CLEAR_INODE +#define clear_inode(i) end_writeback(i) +#endif + +#ifndef HAVE_DENTRY_D_CHILD +#define d_child d_u.d_child +#endif + +#ifdef HAVE_DENTRY_D_U_D_ALIAS +#define d_alias d_u.d_alias +#endif + +#ifndef DATA_FOR_LLITE_IS_LIST +#define ll_d_hlist_node hlist_node +#define ll_d_hlist_empty(list) hlist_empty(list) +#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name) +#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry) +# ifdef HAVE_HLIST_FOR_EACH_3ARG +# define ll_d_hlist_for_each_entry(dentry, p, i_dentry) \ + p = NULL; hlist_for_each_entry(dentry, i_dentry, d_alias) +# else +# define ll_d_hlist_for_each_entry(dentry, p, i_dentry) \ + hlist_for_each_entry(dentry, p, i_dentry, d_alias) +# endif +#define DECLARE_LL_D_HLIST_NODE_PTR(name) struct ll_d_hlist_node *name +#else +#define ll_d_hlist_node list_head +#define ll_d_hlist_empty(list) list_empty(list) +#define ll_d_hlist_entry(ptr, type, name) list_entry(ptr.next, type, name) +#define ll_d_hlist_for_each(tmp, i_dentry) list_for_each(tmp, i_dentry) +#define ll_d_hlist_for_each_entry(dentry, p, i_dentry) \ + list_for_each_entry(dentry, i_dentry, d_alias) +#define DECLARE_LL_D_HLIST_NODE_PTR(name) /* nothing */ +#endif /* !DATA_FOR_LLITE_IS_LIST */ + +#ifndef QUOTA_OK +# define QUOTA_OK 0 +#endif +#ifndef NO_QUOTA +# define NO_QUOTA (-EDQUOT) +#endif + +#ifndef SEEK_DATA +#define SEEK_DATA 3 /* seek to the next data */ +#endif +#ifndef SEEK_HOLE +#define SEEK_HOLE 4 /* seek to the next hole */ +#endif + +#ifndef FMODE_UNSIGNED_OFFSET +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#endif + +#if 
!defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit) +# define ext2_set_bit __test_and_set_bit_le +# define ext2_clear_bit __test_and_clear_bit_le +# define ext2_test_bit test_bit_le +# define ext2_find_first_zero_bit find_first_zero_bit_le +# define ext2_find_next_zero_bit find_next_zero_bit_le +#endif + +#ifdef ATTR_TIMES_SET +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +#else +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET) +#endif + +#ifndef XATTR_NAME_POSIX_ACL_ACCESS +# define XATTR_NAME_POSIX_ACL_ACCESS POSIX_ACL_XATTR_ACCESS +#endif + +#ifndef XATTR_NAME_POSIX_ACL_DEFAULT +# define XATTR_NAME_POSIX_ACL_DEFAULT POSIX_ACL_XATTR_DEFAULT +#endif + +#ifndef HAVE_LM_XXX_LOCK_MANAGER_OPS +# define lm_compare_owner fl_compare_owner +#endif + +/* + * After 3.1, kernel's nameidata.intent.open.flags is different + * with lustre's lookup_intent.it_flags, as lustre's it_flags' + * lower bits equal to FMODE_xxx while kernel doesn't transliterate + * lower bits of nameidata.intent.open.flags to FMODE_xxx. + * */ +#include +static inline int ll_namei_to_lookup_intent_flag(int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0) + flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag); +#endif + return flag; +} + +#include +#ifndef HAVE_PROTECT_I_NLINK +static inline void set_nlink(struct inode *inode, unsigned int nlink) +{ + inode->i_nlink = nlink; +} +#endif + +#if defined(HAVE_INODEOPS_USE_UMODE_T) || defined(HAVE_USER_NAMESPACE_ARG) +# define ll_umode_t umode_t +#else +# define ll_umode_t int +#endif + +#ifndef HAVE_VM_FAULT_T +#define vm_fault_t int +#endif + +#include +#ifndef HAVE_D_MAKE_ROOT +static inline struct dentry *d_make_root(struct inode *root) +{ + struct dentry *res = d_alloc_root(root); + + if (res == NULL && root) + iput(root); + + return res; +} +#endif + +#ifdef HAVE_DIRTY_INODE_HAS_FLAG +# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag) +#else +# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode)) +#endif + +#ifdef HAVE_FILE_F_INODE +# define set_file_inode(file, inode) (file)->f_inode = inode +#else +# define set_file_inode(file, inode) +#endif + +#ifndef HAVE_FILE_INODE +static inline struct inode *file_inode(const struct file *file) +{ + return file->f_path.dentry->d_inode; +} +#endif + +#ifdef HAVE_OLDSIZE_TRUNCATE_PAGECACHE +#define ll_truncate_pagecache(inode, size) truncate_pagecache(inode, 0, size) +#else +#define ll_truncate_pagecache(inode, size) truncate_pagecache(inode, size) +#endif + +#ifdef HAVE_VFS_RENAME_5ARGS +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d, NULL) +#elif defined HAVE_VFS_RENAME_6ARGS +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d, NULL, 0) +#else +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d) +#endif + +#ifdef HAVE_USER_NAMESPACE_ARG +#define vfs_unlink(ns, dir, de) vfs_unlink(ns, dir, de, NULL) +#elif defined HAVE_VFS_UNLINK_3ARGS +#define vfs_unlink(ns, dir, de) vfs_unlink(dir, de, NULL) +#else +#define vfs_unlink(ns, dir, de) vfs_unlink(dir, de) +#endif + +#ifndef HAVE_INODE_LOCK +# define inode_lock(inode) mutex_lock(&(inode)->i_mutex) +# define inode_unlock(inode) mutex_unlock(&(inode)->i_mutex) +# define inode_trylock(inode) mutex_trylock(&(inode)->i_mutex) +#endif + +#ifndef HAVE_RADIX_EXCEPTION_ENTRY +static inline int radix_tree_exceptional_entry(void *arg) +{ + return 0; +} +#endif + +#ifndef HAVE_XA_IS_VALUE +static inline bool xa_is_value(void *entry) +{ + return 
radix_tree_exceptional_entry(entry); +} +#endif + +#ifndef HAVE_TRUNCATE_INODE_PAGES_FINAL +static inline void truncate_inode_pages_final(struct address_space *map) +{ + truncate_inode_pages(map, 0); +} +#endif + +#ifndef HAVE_PTR_ERR_OR_ZERO +static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#ifdef HAVE_SECURITY_IINITSEC_CALLBACK +# define ll_security_inode_init_security(inode, dir, name, value, len, \ + initxattrs, dentry) \ + security_inode_init_security(inode, dir, &((dentry)->d_name), \ + initxattrs, dentry) +#elif defined HAVE_SECURITY_IINITSEC_QSTR +# define ll_security_inode_init_security(inode, dir, name, value, len, \ + initxattrs, dentry) \ + security_inode_init_security(inode, dir, &((dentry)->d_name), \ + name, value, len) +#else /* !HAVE_SECURITY_IINITSEC_CALLBACK && !HAVE_SECURITY_IINITSEC_QSTR */ +# define ll_security_inode_init_security(inode, dir, name, value, len, \ + initxattrs, dentry) \ + security_inode_init_security(inode, dir, name, value, len) +#endif + +#ifndef bio_for_each_segment_all /* since kernel version 3.9 */ +#ifdef HAVE_BVEC_ITER +#define bio_for_each_segment_all(bv, bio, it) \ + for (it = 0, bv = (bio)->bi_io_vec; it < (bio)->bi_vcnt; it++, bv++) +#else +#define bio_for_each_segment_all(bv, bio, it) bio_for_each_segment(bv, bio, it) +#endif +#endif + +#ifdef HAVE_PID_NS_FOR_CHILDREN +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns_for_children) : NULL) +#else +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns) : NULL) +#endif + +#ifdef HAVE_FULL_NAME_HASH_3ARGS +# define ll_full_name_hash(salt, name, len) full_name_hash(salt, name, len) +#else +# define ll_full_name_hash(salt, name, len) full_name_hash(name, len) +#endif + +#ifdef HAVE_STRUCT_POSIX_ACL_XATTR +# define posix_acl_xattr_header struct posix_acl_xattr_header +# define posix_acl_xattr_entry struct posix_acl_xattr_entry +# define GET_POSIX_ACL_XATTR_ENTRY(head) ((void *)((head) + 1)) +#else +# define GET_POSIX_ACL_XATTR_ENTRY(head) ((head)->a_entries) +#endif + +#ifdef HAVE_IOP_XATTR +#ifdef HAVE_XATTR_HANDLER_FLAGS +#define ll_setxattr generic_setxattr +#define ll_getxattr generic_getxattr +#define ll_removexattr generic_removexattr +#else +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ll_getxattr(struct dentry *dentry, const char *name, + void *buf, size_t buf_size); +int ll_removexattr(struct dentry *dentry, const char *name); +#endif /* ! 
HAVE_XATTR_HANDLER_FLAGS */ +#endif /* HAVE_IOP_XATTR */ + +#ifdef HAVE_IOP_SET_ACL +#ifdef CONFIG_FS_POSIX_ACL +#if !defined(HAVE_USER_NAMESPACE_ARG) && !defined(HAVE_POSIX_ACL_UPDATE_MODE) +static inline int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +#endif /* HAVE_POSIX_ACL_UPDATE_MODE */ +#endif +#endif + +#ifndef HAVE_IOV_ITER_TRUNCATE +static inline void iov_iter_truncate(struct iov_iter *i, u64 count) +{ + if (i->count > count) + i->count = count; +} +#endif + +#ifndef HAVE_IS_SXID +static inline bool is_sxid(umode_t mode) +{ + return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); +} +#endif + +#ifndef IS_NOSEC +#define IS_NOSEC(inode) (!is_sxid(inode->i_mode)) +#endif + +/* + * mount MS_* flags split from superblock SB_* flags + * if the SB_* flags are not available use the MS_* flags + */ +#if !defined(SB_RDONLY) && defined(MS_RDONLY) +# define SB_RDONLY MS_RDONLY +#endif +#if !defined(SB_ACTIVE) && defined(MS_ACTIVE) +# define SB_ACTIVE MS_ACTIVE +#endif +#if !defined(SB_NOSEC) && defined(MS_NOSEC) +# define SB_NOSEC MS_NOSEC +#endif +#if !defined(SB_POSIXACL) && defined(MS_POSIXACL) +# define SB_POSIXACL MS_POSIXACL +#endif +#if !defined(SB_NODIRATIME) && defined(MS_NODIRATIME) +# define SB_NODIRATIME MS_NODIRATIME +#endif + +#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) +{ + i->count = count; +} + +static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) +{ + return (struct iovec) { + .iov_base = iter->iov->iov_base + iter->iov_offset, + .iov_len = min(iter->count, + iter->iov->iov_len - iter->iov_offset), + }; +} + +#define iov_for_each(iov, iter, start) \ + for (iter = (start); \ + (iter).count && ((iov = iov_iter_iovec(&(iter))), 1); \ + iov_iter_advance(&(iter), (iov).iov_len)) + +static inline ssize_t +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = generic_file_aio_read(iocb, &iov, 1, iocb->ki_pos); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +} + +static inline ssize_t +__generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + /* Since LLITE updates file size at the end of I/O in + * vvp_io_commit_write(), append write has to be done in atomic when + * there are multiple segments because otherwise each iteration to + * __generic_file_aio_write() will see original file size */ + if (unlikely(iocb->ki_filp->f_flags & O_APPEND && iter->nr_segs > 1)) { + struct iovec *iov_copy; + int count = 0; + + OBD_ALLOC(iov_copy, sizeof(*iov_copy) * iter->nr_segs); + if (!iov_copy) + return -ENOMEM; + + iov_for_each(iov, i, *iter) + iov_copy[count++] = iov; + + bytes = __generic_file_aio_write(iocb, iov_copy, count, + &iocb->ki_pos); + OBD_FREE(iov_copy, sizeof(*iov_copy) * iter->nr_segs); + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; + } + + iov_for_each(iov, i, *iter) { + 
ssize_t res; + + res = __generic_file_aio_write(iocb, &iov, 1, &iocb->ki_pos); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +} +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + +static inline void __user *get_vmf_address(struct vm_fault *vmf) +{ +#ifdef HAVE_VM_FAULT_ADDRESS + return (void __user *)vmf->address; +#else + return vmf->virtual_address; +#endif +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +# define ll_filemap_fault(vma, vmf) filemap_fault(vmf) +#else +# define ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf) +#endif + +#ifndef HAVE_CURRENT_TIME +static inline struct timespec current_time(struct inode *inode) +{ + return CURRENT_TIME; +} +#endif + +#ifndef time_after32 +/** + * time_after32 - compare two 32-bit relative times + * @a: the time which may be after @b + * @b: the time which may be before @a + * + * time_after32(a, b) returns true if the time @a is after time @b. + * time_before32(b, a) returns true if the time @b is before time @a. + * + * Similar to time_after(), compare two 32-bit timestamps for relative + * times. This is useful for comparing 32-bit seconds values that can't + * be converted to 64-bit values (e.g. due to disk format or wire protocol + * issues) when it is known that the times are less than 68 years apart. + */ +#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) +#define time_before32(b, a) time_after32(a, b) + +#endif + +#ifndef __GFP_COLD +#define __GFP_COLD 0 +#endif + +#ifndef alloc_workqueue +#define alloc_workqueue(name, flags, max_active) create_workqueue(name) +#endif + +#ifndef READ_ONCE +#define READ_ONCE ACCESS_ONCE +#endif + +#if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) +static inline unsigned short blk_integrity_interval(struct blk_integrity *bi) +{ +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + return bi->interval_exp ? 
1 << bi->interval_exp : 0; +#elif defined(HAVE_INTERVAL_BLK_INTEGRITY) + return bi->interval; +#else + return bi->sector_size; +#endif /* !HAVE_INTERVAL_EXP_BLK_INTEGRITY */ +} + +static inline const char *blk_integrity_name(struct blk_integrity *bi) +{ +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + return bi->profile->name; +#else + return bi->name; +#endif +} + +static inline unsigned int bip_size(struct bio_integrity_payload *bip) +{ +#ifdef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD + return bip->bip_iter.bi_size; +#else + return bip->bip_size; +#endif +} +#else /* !CONFIG_BLK_DEV_INTEGRITY */ +static inline unsigned short blk_integrity_interval(struct blk_integrity *bi) +{ + return 0; +} +static inline const char *blk_integrity_name(struct blk_integrity *bi) +{ + /* gcc8 dislikes when strcmp() is called against NULL */ + return ""; +} +#endif /* !CONFIG_BLK_DEV_INTEGRITY */ + +#ifndef INTEGRITY_FLAG_READ +#define INTEGRITY_FLAG_READ BLK_INTEGRITY_VERIFY +#endif + +#ifndef INTEGRITY_FLAG_WRITE +#define INTEGRITY_FLAG_WRITE BLK_INTEGRITY_GENERATE +#endif + +static inline bool bdev_integrity_enabled(struct block_device *bdev, int rw) +{ +#if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return false; + +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + if (rw == 0 && bi->profile->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return true; + + if (rw == 1 && bi->profile->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return true; +#else + if (rw == 0 && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return true; + + if (rw == 1 && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return true; +#endif /* !HAVE_INTERVAL_EXP_BLK_INTEGRITY */ +#endif /* !CONFIG_BLK_DEV_INTEGRITY */ + + return false; +} + +#ifdef HAVE_PAGEVEC_INIT_ONE_PARAM +#define ll_pagevec_init(pvec, n) pagevec_init(pvec) +#else +#define ll_pagevec_init(pvec, n) pagevec_init(pvec, n) +#endif + +#ifdef HAVE_I_PAGES +#define page_tree i_pages +#else +#define i_pages tree_lock +#define xa_lock_irq(lockp) spin_lock_irq(lockp) +#define xa_unlock_irq(lockp) spin_unlock_irq(lockp) +#endif + +#ifndef KMEM_CACHE_USERCOPY +#define kmem_cache_create_usercopy(name, size, align, flags, useroffset, \ + usersize, ctor) \ + kmem_cache_create(name, size, align, flags, ctor) +#endif + +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +#define selinux_is_enabled() 1 +#endif + +static inline void ll_security_release_secctx(char *secdata, u32 seclen) +{ +#ifdef HAVE_SEC_RELEASE_SECCTX_1ARG + struct lsmcontext context = { }; + + lsmcontext_init(&context, secdata, seclen, 0); + return security_release_secctx(&context); +#else + return security_release_secctx(secdata, seclen); +#endif +} + +static inline int ll_vfs_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, + void *value, size_t size) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_getxattr(&init_user_ns, dentry, name, value, size); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_getxattr(dentry, inode, name, value, size); +#else + if (unlikely(!inode->i_op->getxattr)) + return -ENODATA; + + return inode->i_op->getxattr(dentry, name, value, size); +#endif +} + +static inline int ll_vfs_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, + const void *value, size_t size, int flags) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_setxattr(&init_user_ns, dentry, name, value, size, flags); +#elif defined(HAVE_VFS_SETXATTR) + return 
__vfs_setxattr(dentry, inode, name, value, size, flags); +#else + if (unlikely(!inode->i_op->setxattr)) + return -EOPNOTSUPP; + + return inode->i_op->setxattr(dentry, name, value, size, flags); +#endif +} + +static inline int ll_vfs_removexattr(struct dentry *dentry, struct inode *inode, + const char *name) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_removexattr(&init_user_ns, dentry, name); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_removexattr(dentry, name); +#else + if (unlikely(!inode->i_op->setxattr)) + return -EOPNOTSUPP; + + return inode->i_op->removexattr(dentry, name); +#endif +} + +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + +#ifndef HAVE_USER_NAMESPACE_ARG +#define posix_acl_update_mode(ns, inode, mode, acl) \ + posix_acl_update_mode(inode, mode, acl) +#define notify_change(ns, de, attr, inode) notify_change(de, attr, inode) +#define inode_owner_or_capable(ns, inode) inode_owner_or_capable(inode) +#define vfs_create(ns, dir, de, mode, ex) vfs_create(dir, de, mode, ex) +#define vfs_mkdir(ns, dir, de, mode) vfs_mkdir(dir, de, mode) +#define ll_set_acl(ns, inode, acl, type) ll_set_acl(inode, acl, type) +#endif + +#ifndef HAVE_IS_ROOT_INODE +static inline bool is_root_inode(struct inode *inode) +{ + return inode == inode->i_sb->s_root->d_inode; +} +#endif + +#endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_debug.h b/drivers/staging/lustrefsx/lustre/include/lustre_debug.h new file mode 100644 index 0000000000000..bf67e6816a77e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_debug.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_DEBUG_H +#define _LUSTRE_DEBUG_H + +/** \defgroup debug debug + * + * @{ + */ + +#include +#include + +#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) 
\ + CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: " \ + fmt, page, page->mapping, page->index, (long)page->flags, \ + page_count(page), page_private(page), ## arg) + +#define ASSERT_MAX_SIZE_MB 60000ULL +#define ASSERT_PAGE_INDEX(index, OP) \ +do { if (index > ASSERT_MAX_SIZE_MB << (20 - PAGE_SHIFT)) { \ + CERROR("bad page index %lu > %llu\n", index, \ + ASSERT_MAX_SIZE_MB << (20 - PAGE_SHIFT)); \ + libcfs_debug = ~0UL; \ + OP; \ +}} while(0) + +#define ASSERT_FILE_OFFSET(offset, OP) \ +do { if (offset > ASSERT_MAX_SIZE_MB << 20) { \ + CERROR("bad file offset %llu > %llu\n", offset, \ + ASSERT_MAX_SIZE_MB << 20); \ + libcfs_debug = ~0UL; \ + OP; \ +}} while(0) + +/* lib/debug.c */ +void dump_lniobuf(struct niobuf_local *lnb); +int dump_req(struct ptlrpc_request *req); +int block_debug_setup(void *addr, int len, __u64 off, __u64 id); +int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id); + +/** @} debug */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h new file mode 100644 index 0000000000000..1529b1ad75d07 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h @@ -0,0 +1,366 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_disk.h + * + * Lustre disk format definitions. 
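For illustration, a minimal sketch of how the LL_CDEBUG_PAGE() macro above is meant to be called; the helper name and the choice of the D_PAGE mask are assumptions made for the example, not code from this patch:

static void demo_dump_page_on_error(struct page *page, int rc)
{
	/* one line per page: pointer, mapping, index, flags, refcount, private */
	if (rc < 0)
		LL_CDEBUG_PAGE(D_PAGE, page, "I/O failed: rc = %d\n", rc);
}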
+ * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_DISK_H +#define _LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include + +#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) +#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) +#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS) +#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \ + LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST)) +#define MT_STR(data) mt_str((data)->ldd_mount_type) + +/****************** mount command *********************/ + +/* The lmd is only used internally by Lustre; mount simply passes + * everything as string options + */ +#define LMD_MAGIC 0xbdacbd03 +#define LMD_PARAMS_MAXLEN 4096 + +/* gleaned from the mount command - no persistent info here */ +struct lustre_mount_data { + u32 lmd_magic; + u32 lmd_flags; /* lustre mount flags */ + int lmd_mgs_failnodes; /* mgs failover node count */ + int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; + char *lmd_dev; /* device name */ + char *lmd_profile; /* client only */ + char *lmd_fileset; /* mount fileset */ + char *lmd_mgssec; /* sptlrpc flavor to mgs */ + char *lmd_opts; /* lustre mount options (as opposed to + * device_ mount options) */ + char *lmd_params; /* lustre params */ + u32 *lmd_exclude; /* array of OSTs to ignore */ + char *lmd_mgs; /* MGS nid */ + char *lmd_osd_type; /* OSD type */ + char *lmd_nidnet; /* network to restrict this client to */ +}; + +#define LMD_FLG_SERVER 0x0001 /* Mounting a server */ +#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */ +#define LMD_FLG_SKIP_LFSCK 0x0004 /* NOT auto resume LFSCK when mount */ +#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ +#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, + no other services */ +#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing + existing MGS services */ +#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */ +#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */ +#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */ +#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */ +#define LMD_FLG_IAM 0x0400 /* IAM dir */ +#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */ +#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */ +#define LMD_FLG_UPDATE 0x2000 /* update parameters */ +#define LMD_FLG_HSM 0x4000 /* Start coordinator */ +#define LMD_FLG_DEV_RDONLY 0x8000 /* discard modification quitely */ + +#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) + +/****************** superblock additional info *********************/ +struct ll_sb_info; +struct kobject; + +struct lustre_sb_info { + int lsi_flags; + struct obd_device *lsi_mgc; /* mgc obd */ + struct lustre_mount_data *lsi_lmd; /* mount command info */ + struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ + struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ + atomic_t lsi_mounts; /* references to the srv_mnt */ + struct kobject *lsi_kobj; + char lsi_svname[MTI_NAME_MAXLEN]; + /* lsi_osd_obdname format = 'lsi->ls_svname'-osd */ + char lsi_osd_obdname[MTI_NAME_MAXLEN + 4]; + /* lsi_osd_uuid format = 'lsi->ls_osd_obdname'_UUID */ + char lsi_osd_uuid[MTI_NAME_MAXLEN + 9]; + struct obd_export *lsi_osd_exp; + char lsi_osd_type[16]; + char lsi_fstype[16]; + struct backing_dev_info lsi_bdi; /* each client mountpoint needs + own backing_dev_info */ + /* protect 
lsi_lwp_list */ + struct mutex lsi_lwp_mutex; + struct list_head lsi_lwp_list; + unsigned long lsi_lwp_started:1; +}; + +#define LSI_UMOUNT_FAILOVER 0x00200000 + +#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +#define s2lsi_nocast(sb) ((sb)->s_fs_info) + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) +#define get_mount_fileset(sb) (s2lsi(sb)->lsi_lmd->lmd_fileset) + +# ifdef HAVE_SERVER_SUPPORT +/* opc for target register */ +#define LDD_F_OPC_REG 0x10000000 +#define LDD_F_OPC_UNREG 0x20000000 +#define LDD_F_OPC_READY 0x40000000 +#define LDD_F_OPC_MASK 0xf0000000 + +#define LDD_F_MASK 0xFFFF + +/* + * This limit is arbitrary (131072 clients on x86), but it is convenient to use + * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. + * If we need more than 131072 clients (order-2 allocation on x86) then this + * should become an array of single-page pointers that are allocated on demand. + */ +#if (128 * 1024UL) > (PAGE_SIZE * 8) +#define LR_MAX_CLIENTS (128 * 1024UL) +#else +#define LR_MAX_CLIENTS (PAGE_SIZE * 8) +#endif + +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 +/** 2.0 server, interop flag to show server version is changed */ +#define OBD_COMPAT_20 0x00000008 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 +/** store OST index in the IDIF */ +#define OBD_ROCOMPAT_IDX_IN_IDIF 0x00000002 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rvcd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** Size-on-MDS is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 +/** filesystem using iam format to store directory entries */ +#define OBD_INCOMPAT_IAM_DIR 0x00000040 +/** LMA attribute contains per-inode incompatible flags */ +#define OBD_INCOMPAT_LMA 0x00000080 +/** lmm_stripe_count has been shrunk from u32 to u16 and the remaining 16 + * bits are now used to store a generation. 
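A minimal sketch, under the assumption that these feature bits are consumed the usual way (refusing a mount when the on-disk last_rcvd carries incompat bits the running code does not understand); the DEMO_* mask and the helper are illustrative only:

#define DEMO_SUPPORTED_INCOMPAT	(OBD_INCOMPAT_COMMON_LR | OBD_INCOMPAT_FID | \
				 OBD_INCOMPAT_LMA)

static int demo_check_incompat_features(const struct lr_server_data *lsd)
{
	__u32 unknown = lsd->lsd_feature_incompat & ~DEMO_SUPPORTED_INCOMPAT;

	/* compat bits can be ignored and rocompat bits tolerated read-only,
	 * but unknown incompat bits must stop the mount */
	if (unknown) {
		CERROR("unsupported incompat features %#x\n", unknown);
		return -EINVAL;
	}
	return 0;
}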
Once we start changing the layout + * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count + * will be confused by interpreting stripe_count | gen << 16 as the actual + * stripe count */ +#define OBD_INCOMPAT_LMM_VER 0x00000100 +/** multiple OI files for MDT */ +#define OBD_INCOMPAT_MULTI_OI 0x00000200 +/** multiple RPCs in flight */ +#define OBD_INCOMPAT_MULTI_RPCS 0x00000400 + +/* last_rcvd handling */ +static inline void lsd_le_to_cpu(struct lr_server_data *buf, + struct lr_server_data *lsd) +{ + int i; + + memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno); + lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14); + lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count); + lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat); + lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat); + lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat); + lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size); + lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start); + lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size); + lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count); + lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid); + lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen); + memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid)); + lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index); + lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1); + lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]); + lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time); + lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals); +} + +static inline void lsd_cpu_to_le(struct lr_server_data *lsd, + struct lr_server_data *buf) +{ + int i; + + memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid)); + buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno); + buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14); + buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count); + buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat); + buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat); + buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat); + buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size); + buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start); + buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size); + buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count); + buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid); + buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen); + memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid)); + buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index); + buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1); + buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]); + buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time); + buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals); +} + +static inline void lcd_le_to_cpu(struct lsd_client_data *buf, + struct lsd_client_data *lcd) +{ + memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid)); + lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno); + lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid); + 
lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result); + lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data); + lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno); + lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid); + lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result); + lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data); + lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]); + lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]); + lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]); + lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]); + lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch); + lcd->lcd_generation = le32_to_cpu(buf->lcd_generation); +} + +static inline void lcd_cpu_to_le(struct lsd_client_data *lcd, + struct lsd_client_data *buf) +{ + memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid)); + buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno); + buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid); + buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result); + buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data); + buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno); + buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid); + buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result); + buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data); + buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]); + buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]); + buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]); + buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]); + buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch); + buf->lcd_generation = cpu_to_le32(lcd->lcd_generation); +} + +static inline u64 lcd_last_transno(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ? + lcd->lcd_last_transno : lcd->lcd_last_close_transno); +} + +static inline u64 lcd_last_xid(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ? 
+ lcd->lcd_last_xid : lcd->lcd_last_close_xid); +} + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct list_head lmi_list_chain; +}; + +/****************** prototypes *********************/ + +/* obd_mount.c */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr); +#endif /* HAVE_SERVER_SUPPORT */ + +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize); +void obdname2fsname(const char *tgt, char *fsname, size_t buflen); + +#ifdef HAVE_SERVER_SUPPORT +int server_name_is_ost(const char *svname); +int target_name2index(const char *svname, u32 *idx, const char **endptr); + +int lustre_put_lsi(struct super_block *sb); +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4); +int lustre_start_mgc(struct super_block *sb); +#endif /* HAVE_SERVER_SUPPORT */ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)); +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)); +int lustre_common_put_super(struct super_block *sb); + +# ifdef HAVE_SERVER_SUPPORT +/* obd_mount_server.c */ +int server_fill_super(struct super_block *sb); +struct lustre_mount_info *server_get_mount(const char *name); +int server_put_mount(const char *name, bool dereg_mnt); +struct mgs_target_info; +int server_mti_print(const char *title, struct mgs_target_info *mti); +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd); +# endif + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type); +int mgc_logname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type); + +/** @} disk */ + +#endif /* _LUSTRE_DISK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h new file mode 100644 index 0000000000000..c6291b62f4259 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h @@ -0,0 +1,1832 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** \defgroup LDLM Lustre Distributed Lock Manager + * + * Lustre DLM is based on VAX DLM. + * Its two main roles are: + * - To provide locking assuring consistency of data on all Lustre nodes. 
+ * - To allow clients to cache state protected by a lock by holding the + * lock until a conflicting lock is requested or it is expired by the LRU. + * + * @{ + */ + +#ifndef _LUSTRE_DLM_H__ +#define _LUSTRE_DLM_H__ + +#include +#include +#include +#include +#include /* for interval_node{}, ldlm_extent */ +#include + +#include "lustre_dlm_flags.h" + +struct obd_ops; +struct obd_device; + +extern struct kset *ldlm_ns_kset; +extern struct kset *ldlm_svc_kset; + +#define OBD_LDLM_DEVICENAME "ldlm" + +#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) +#define LDLM_DEFAULT_MAX_ALIVE 3900 /* 3900 seconds ~65 min */ +#define LDLM_CTIME_AGE_LIMIT (10) +/* if client lock is unused for that time it can be cancelled if any other + * client shows interest in that lock, e.g. glimpse is occured. */ +#define LDLM_DIRTY_AGE_LIMIT (10) +#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 + +/** + * LDLM non-error return states + */ +enum ldlm_error { + ELDLM_OK = 0, + ELDLM_LOCK_MATCHED = 1, + + ELDLM_LOCK_CHANGED = 300, + ELDLM_LOCK_ABORTED = 301, + ELDLM_LOCK_REPLACED = 302, + ELDLM_NO_LOCK_DATA = 303, + ELDLM_LOCK_WOULDBLOCK = 304, + + ELDLM_NAMESPACE_EXISTS = 400, + ELDLM_BAD_NAMESPACE = 401, +}; + +/** + * LDLM namespace type. + * The "client" type is actually an indication that this is a narrow local view + * into complete namespace on the server. Such namespaces cannot make any + * decisions about lack of conflicts or do any autonomous lock granting without + * first speaking to a server. + */ +enum ldlm_side { + LDLM_NAMESPACE_SERVER = 0x01, + LDLM_NAMESPACE_CLIENT = 0x02 +}; + +/** + * The blocking callback is overloaded to perform two functions. These flags + * indicate which operation should be performed. + */ +#define LDLM_CB_BLOCKING 1 +#define LDLM_CB_CANCELING 2 + +/** + * \name Lock Compatibility Matrix. + * + * A lock has both a type (extent, flock, inode bits, or plain) and a mode. + * Lock types are described in their respective implementation files: + * ldlm_{extent,flock,inodebits,plain}.c. + * + * There are six lock modes along with a compatibility matrix to indicate if + * two locks are compatible. + * + * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock + * on the parent. + * - PW: Protective Write (normal write) mode. When a client requests a write + * lock from an OST, a lock with PW mode will be issued. + * - PR: Protective Read (normal read) mode. When a client requests a read from + * an OST, a lock with PR mode will be issued. Also, if the client opens a + * file for execution, it is granted a lock with PR mode. + * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client + * requests a write lock during a file open operation. + * - CR Concurrent Read mode. When a client performs a path lookup, MDS grants + * an inodebit lock with the CR mode on the intermediate path component. + * - NL Null mode. + * + *
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * 
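To make the table concrete, a small sketch using the LCK_COMPAT_* masks and lockmode_compat() defined just below; the helper is illustrative only, and the results follow directly from the matrix rows:

static void demo_compat_check(void)
{
	/* PR row, CR column is 1: a granted PR lock admits a new CR request */
	int pr_cr = lockmode_compat(LCK_PR, LCK_CR);	/* non-zero */
	/* PR row, PW column is 0: a granted PR lock conflicts with a new PW */
	int pr_pw = lockmode_compat(LCK_PR, LCK_PW);	/* zero */

	(void)pr_cr;
	(void)pr_pw;
}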
+ */ +/** @{ */ +#define LCK_COMPAT_EX LCK_NL +#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR) +#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR) +#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW) +#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW) +#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP) +#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) +#define LCK_COMPAT_COS (LCK_COS) +/** @} Lock Compatibility Matrix */ + +extern enum ldlm_mode lck_compat_array[]; + +static inline void lockmode_verify(enum ldlm_mode mode) +{ + LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE); +} + +static inline int lockmode_compat(enum ldlm_mode exist_mode, + enum ldlm_mode new_mode) +{ + return lck_compat_array[exist_mode] & new_mode; +} + +/* + * + * cluster name spaces + * + */ + +#define DLM_OST_NAMESPACE 1 +#define DLM_MDS_NAMESPACE 2 + +/* XXX + - do we just separate this by security domains and use a prefix for + multiple namespaces in the same domain? + - +*/ + +/** + * Locking rules for LDLM: + * + * lr_lock + * + * lr_lock + * waiting_locks_spinlock + * + * lr_lock + * led_lock + * + * lr_lock + * ns_lock + * + * lr_lvb_mutex + * lr_lock + * + */ + +struct ldlm_pool; +struct ldlm_lock; +struct ldlm_resource; +struct ldlm_namespace; + +/** + * Operations on LDLM pools. + * LDLM pool is a pool of locks in the namespace without any implicitly + * specified limits. + * Locks in the pool are organized in LRU. + * Local memory pressure or server instructions (e.g. mempressure on server) + * can trigger freeing of locks from the pool + */ +struct ldlm_pool_ops { + /** Recalculate pool \a pl usage */ + int (*po_recalc)(struct ldlm_pool *pl); + /** Cancel at least \a nr locks from pool \a pl */ + int (*po_shrink)(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); + int (*po_setup)(struct ldlm_pool *pl, int limit); +}; + +/** One second for pools thread check interval. Each pool has own period. */ +#define LDLM_POOLS_THREAD_PERIOD (1) + +/** ~6% margin for modest pools. See ldlm_pool.c for details. */ +#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4) + +/** Default recalc period for server side pools in sec. */ +#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1) + +/** Default recalc period for client side pools in sec. */ +#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10) + +/** + * LDLM pool structure to track granted locks. + * For purposes of determining when to release locks on e.g. memory pressure. + * This feature is commonly referred to as lru_resize. + */ +struct ldlm_pool { + /** Pool debugfs directory. */ + struct dentry *pl_debugfs_entry; + /** Pool name, must be long enough to hold compound proc entry name. */ + char pl_name[100]; + /** Lock for protecting SLV/CLV updates. */ + spinlock_t pl_lock; + /** Number of allowed locks in in pool, both, client and server side. */ + atomic_t pl_limit; + /** Number of granted locks in */ + atomic_t pl_granted; + /** Grant rate per T. */ + atomic_t pl_grant_rate; + /** Cancel rate per T. */ + atomic_t pl_cancel_rate; + /** Server lock volume (SLV). Protected by pl_lock. */ + __u64 pl_server_lock_volume; + /** Current biggest client lock volume. Protected by pl_lock. */ + __u64 pl_client_lock_volume; + /** Lock volume factor. SLV on client is calculated as following: + * server_slv * lock_volume_factor. */ + atomic_t pl_lock_volume_factor; + /** Time when last SLV from server was obtained. */ + time64_t pl_recalc_time; + /** Recalculation period for pool. */ + time64_t pl_recalc_period; + /** Recalculation and shrink operations. 
*/ + struct ldlm_pool_ops *pl_ops; + /** Number of planned locks for next period. */ + int pl_grant_plan; + /** Pool statistics. */ + struct lprocfs_stats *pl_stats; + + /* sysfs object */ + struct kobject pl_kobj; + struct completion pl_kobj_unregister; +}; + +typedef int (*ldlm_res_policy)(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, void *req_cookie, + enum ldlm_mode mode, __u64 flags, void *data); + +typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); + +/** + * LVB operations. + * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could + * be associated with an LDLM lock and transferred from client to server and + * back. + * + * Currently LVBs are used by: + * - OSC-OST code to maintain current object size/times + * - layout lock code to return the layout when the layout lock is granted + * + * To ensure delayed LVB initialization, it is highly recommended to use the set + * of ldlm_[res_]lvbo_[init,update,fill]() functions. + */ +struct ldlm_valblock_ops { + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock, + struct ptlrpc_request *r, int increase); + int (*lvbo_free)(struct ldlm_resource *res); + /* Return size of lvb data appropriate RPC size can be reserved */ + int (*lvbo_size)(struct ldlm_lock *lock); + /* Called to fill in lvb data to RPC buffer @buf */ + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int *buflen); +}; + +/** + * LDLM pools related, type of lock pool in the namespace. + * Greedy means release cached locks aggressively + */ +enum ldlm_appetite { + LDLM_NAMESPACE_GREEDY = 1 << 0, + LDLM_NAMESPACE_MODEST = 1 << 1 +}; + +/** + * Default values for the "max_nolock_size", "contention_time" and + * "contended_locks" namespace tunables. + */ +#define NS_DEFAULT_MAX_NOLOCK_BYTES 0 +#define NS_DEFAULT_CONTENTION_SECONDS 2 +#define NS_DEFAULT_CONTENDED_LOCKS 32 + +struct ldlm_ns_bucket { + /** back pointer to namespace */ + struct ldlm_namespace *nsb_namespace; + /** + * Estimated lock callback time. Used by adaptive timeout code to + * avoid spurious client evictions due to unresponsiveness when in + * fact the network or overall system load is at fault + */ + struct adaptive_timeout nsb_at_estimate; + /** + * Which res in the bucket should we start with the reclaim. + */ + int nsb_reclaim_start; +}; + +enum { + /** LDLM namespace lock stats */ + LDLM_NSS_LOCKS = 0, + LDLM_NSS_LAST +}; + +enum ldlm_ns_type { + LDLM_NS_TYPE_UNKNOWN = 0, /**< invalid type */ + LDLM_NS_TYPE_MDC, /**< MDC namespace */ + LDLM_NS_TYPE_MDT, /**< MDT namespace */ + LDLM_NS_TYPE_OSC, /**< OSC namespace */ + LDLM_NS_TYPE_OST, /**< OST namespace */ + LDLM_NS_TYPE_MGC, /**< MGC namespace */ + LDLM_NS_TYPE_MGT, /**< MGT namespace */ +}; + +enum ldlm_namespace_flags { + /** + * Flag to indicate the LRU cancel is in progress. + * Used to limit the process by 1 thread only. + */ + LDLM_LRU_CANCEL = 0 +}; + +/** + * LDLM Namespace. + * + * Namespace serves to contain locks related to a particular service. + * There are two kinds of namespaces: + * - Server namespace has knowledge of all locks and is therefore authoritative + * to make decisions like what locks could be granted and what conflicts + * exist during new lock enqueue. + * - Client namespace only has limited knowledge about locks in the namespace, + * only seeing locks held by the client. + * + * Every Lustre service has one server namespace present on the server serving + * that service. 
Every client connected to the service has a client namespace + * for it. + * Every lock obtained by client in that namespace is actually represented by + * two in-memory locks. One on the server and one on the client. The locks are + * linked by a special cookie by which one node can tell to the other which lock + * it actually means during communications. Such locks are called remote locks. + * The locks held by server only without any reference to a client are called + * local locks. + */ +struct ldlm_namespace { + /** Backward link to OBD, required for LDLM pool to store new SLV. */ + struct obd_device *ns_obd; + + /** Flag indicating if namespace is on client instead of server */ + enum ldlm_side ns_client; + + /** name of this namespace */ + char *ns_name; + + /** Resource hash table for namespace. */ + struct cfs_hash *ns_rs_hash; + + /** serialize */ + spinlock_t ns_lock; + + /** big refcount (by bucket) */ + atomic_t ns_bref; + + /** + * Namespace connect flags supported by server (may be changed via + * /proc, LRU resize may be disabled/enabled). + */ + __u64 ns_connect_flags; + + /** Client side original connect flags supported by server. */ + __u64 ns_orig_connect_flags; + + /* namespace debugfs dir entry */ + struct dentry *ns_debugfs_entry; + + /** + * Position in global namespace list linking all namespaces on + * the node. + */ + struct list_head ns_list_chain; + + /** + * List of unused locks for this namespace. This list is also called + * LRU lock list. + * Unused locks are locks with zero reader/writer reference counts. + * This list is only used on clients for lock caching purposes. + * When we want to release some locks voluntarily or if server wants + * us to release some locks due to e.g. memory pressure, we take locks + * to release from the head of this list. + * Locks are linked via l_lru field in \see struct ldlm_lock. + */ + struct list_head ns_unused_list; + /** Number of locks in the LRU list above */ + int ns_nr_unused; + struct list_head *ns_last_pos; + + /** + * Maximum number of locks permitted in the LRU. If 0, means locks + * are managed by pools and there is no preset limit, rather it is all + * controlled by available memory on this client and on server. + */ + unsigned int ns_max_unused; + + /** Maximum allowed age (last used time) for locks in the LRU */ + ktime_t ns_max_age; + + /** + * Server only: number of times we evicted clients due to lack of reply + * to ASTs. + */ + unsigned int ns_timeouts; + /** + * Number of seconds since the file change time after which the + * MDT will return an UPDATE lock along with a LOOKUP lock. + * This allows the client to start caching negative dentries + * for a directory and may save an RPC for a later stat. + */ + time64_t ns_ctime_age_limit; + /** + * Number of seconds since the lock was last used. The client may + * cancel the lock limited by this age and flush related data if + * any other client shows interest in it doing glimpse request. + * This allows to cache stat data locally for such files early. + */ + time64_t ns_dirty_age_limit; + /** + * Used to rate-limit ldlm_namespace_dump calls. + * \see ldlm_namespace_dump. Increased by 10 seconds every time + * it is called. + */ + time64_t ns_next_dump; + + /** "policy" function that does actual lock conflict determination */ + ldlm_res_policy ns_policy; + + /** + * LVB operations for this namespace. + * \see struct ldlm_valblock_ops + */ + struct ldlm_valblock_ops *ns_lvbo; + + /** + * Used by filter code to store pointer to OBD of the service. 
+ * Should be dropped in favor of \a ns_obd + */ + void *ns_lvbp; + + /** + * Wait queue used by __ldlm_namespace_free. Gets woken up every time + * a resource is removed. + */ + wait_queue_head_t ns_waitq; + /** LDLM pool structure for this namespace */ + struct ldlm_pool ns_pool; + /** Definition of how eagerly unused locks will be released from LRU */ + enum ldlm_appetite ns_appetite; + + /** + * If more than \a ns_contended_locks are found, the resource is + * considered to be contended. Lock enqueues might specify that no + * contended locks should be granted + */ + unsigned ns_contended_locks; + + /** + * The resources in this namespace remember contended state during + * \a ns_contention_time, in seconds. + */ + time64_t ns_contention_time; + + /** + * Limit size of contended extent locks, in bytes. + * If extended lock is requested for more then this many bytes and + * caller instructs us not to grant contended locks, we would disregard + * such a request. + */ + unsigned ns_max_nolock_size; + + /** Limit of parallel AST RPC count. */ + unsigned ns_max_parallel_ast; + + /** + * Callback to check if a lock is good to be canceled by ELC or + * during recovery. + */ + ldlm_cancel_cbt ns_cancel; + + /** LDLM lock stats */ + struct lprocfs_stats *ns_stats; + + /** + * Flag to indicate namespace is being freed. Used to determine if + * recalculation of LDLM pool statistics should be skipped. + */ + unsigned ns_stopping:1; + + /** + * Which bucket should we start with the lock reclaim. + */ + int ns_reclaim_start; + + struct kobject ns_kobj; /* sysfs object */ + struct completion ns_kobj_unregister; + + /** + * To avoid another ns_lock usage, a separate bitops field. + */ + unsigned long ns_flags; +}; + +/** + * Returns 1 if namespace \a ns is a client namespace. + */ +static inline int ns_is_client(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_CLIENT; +} + +/** + * Returns 1 if namespace \a ns is a server namespace. + */ +static inline int ns_is_server(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_SERVER; +} + +/** + * Returns 1 if namespace \a ns supports early lock cancel (ELC). + */ +static inline int ns_connect_cancelset(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); +} + +/** + * Returns 1 if this namespace supports lru_resize. + */ +static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_cbt arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel = arg; +} + +struct ldlm_lock; + +/** Type for blocking callback function of a lock. */ +typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +/** Type for completion callback function of a lock. */ +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, + void *data); +/** Type for glimpse callback function of a lock. */ +typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); + +/** Type for created callback function of a lock. 
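A minimal client-side sketch wired to the blocking-callback typedef above; the body is an assumption about typical usage (the flag distinguishes the two roles described earlier), not code from this patch:

static int demo_blocking_ast(struct ldlm_lock *lock,
			     struct ldlm_lock_desc *desc,
			     void *data, int flag)
{
	if (flag == LDLM_CB_CANCELING) {
		/* the lock is going away: drop any state cached under it */
		return 0;
	}
	/* LDLM_CB_BLOCKING: a conflicting request was queued; a client
	 * would normally cancel the lock from here */
	return 0;
}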
*/ +typedef void (*ldlm_created_callback)(struct ldlm_lock *lock); + +/** Work list for sending GL ASTs to multiple locks. */ +struct ldlm_glimpse_work { + struct ldlm_lock *gl_lock; /* lock to glimpse */ + struct list_head gl_list; /* linkage to other gl work structs */ + __u32 gl_flags;/* see LDLM_GL_WORK_* below */ + union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in + * glimpse callback request */ + ptlrpc_interpterer_t gl_interpret_reply; + void *gl_interpret_data; +}; + +struct ldlm_bl_desc { + unsigned int bl_same_client:1, + bl_cos_incompat:1; +}; + +struct ldlm_cb_set_arg { + struct ptlrpc_request_set *set; + int type; /* LDLM_{CP,BL,GL}_CALLBACK */ + atomic_t restart; + struct list_head *list; + union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ + ptlrpc_interpterer_t gl_interpret_reply; + void *gl_interpret_data; + struct ldlm_bl_desc *bl_desc; +}; + +struct ldlm_cb_async_args { + struct ldlm_cb_set_arg *ca_set_arg; + struct ldlm_lock *ca_lock; +}; + +/** The ldlm_glimpse_work was slab allocated & must be freed accordingly.*/ +#define LDLM_GL_WORK_SLAB_ALLOCATED 0x1 + +/** Interval node data for each LDLM_EXTENT lock. */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree management */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/** + * Interval tree for extent locks. + * The interval tree must be accessed under the resource lock. + * Interval trees are used for granted extent locks to speed up conflicts + * lookup. See ldlm/interval_tree.c for more details. + */ +struct ldlm_interval_tree { + /** Tree size. */ + int lit_size; + enum ldlm_mode lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actual ldlm_interval */ +}; + +/** + * Lists of waiting locks for each inodebit type. + * A lock can be in several liq_waiting lists and it remains in lr_waiting. + */ +struct ldlm_ibits_queues { + struct list_head liq_waiting[MDS_INODELOCK_NUMBITS]; +}; + +struct ldlm_ibits_node { + struct list_head lin_link[MDS_INODELOCK_NUMBITS]; + struct ldlm_lock *lock; +}; + +/** Whether to track references to exports by LDLM locks. */ +#define LUSTRE_TRACKS_LOCK_EXP_REFS (0) + +/** Cancel flags. */ +enum ldlm_cancel_flags { + LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */ + LCF_LOCAL = 0x2, /* Cancel locks locally, not notifing server */ + LCF_BL_AST = 0x4, /* Cancel LDLM_FL_BL_AST locks in the same RPC */ +}; + +struct ldlm_flock { + __u64 start; + __u64 end; + __u64 owner; + __u64 blocking_owner; + struct obd_export *blocking_export; + atomic_t blocking_refs; + __u32 pid; +}; + +union ldlm_policy_data { + struct ldlm_extent l_extent; + struct ldlm_flock l_flock; + struct ldlm_inodebits l_inodebits; +}; + +void ldlm_convert_policy_to_wire(enum ldlm_type type, + const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type, + const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); + +enum lvb_type { + LVB_T_NONE = 0, + LVB_T_OST = 1, + LVB_T_LQUOTA = 2, + LVB_T_LAYOUT = 3, +}; + +/** + * LDLM_GID_ANY is used to match any group id in ldlm_lock_match(). + */ +#define LDLM_GID_ANY ((__u64)-1) + +/** + * LDLM lock structure + * + * Represents a single LDLM lock and its state in memory. Each lock is + * associated with a single ldlm_resource, the object which is being + * locked. 
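For the per-type policy union above, a small sketch of how a caller might fill it for an extent request versus an inode-bits request; OBD_OBJECT_EOF and the MDS_INODELOCK_* bits are assumed from the wider Lustre headers, and the chosen values are arbitrary:

static void demo_fill_policy(union ldlm_policy_data *policy, bool whole_file)
{
	if (whole_file) {
		/* extent lock covering the entire object */
		policy->l_extent.start = 0;
		policy->l_extent.end = OBD_OBJECT_EOF;
	} else {
		/* inode-bits lock protecting name lookup and attributes */
		policy->l_inodebits.bits = MDS_INODELOCK_LOOKUP |
					   MDS_INODELOCK_UPDATE;
	}
}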
There may be multiple ldlm_locks on a single resource, + * depending on the lock type and whether the locks are conflicting or + * not. + */ +struct ldlm_lock { + /** + * Local lock handle. + * When remote side wants to tell us about a lock, they address + * it by this opaque handle. The handle does not hold a + * reference on the ldlm_lock, so it can be safely passed to + * other threads or nodes. When the lock needs to be accessed + * from the handle, it is looked up again in the lock table, and + * may no longer exist. + * + * Must be first in the structure. + */ + struct portals_handle l_handle; + /** + * Lock reference count. + * This is how many users have pointers to actual structure, so that + * we do not accidentally free lock structure that is in use. + */ + atomic_t l_refc; + /** + * Internal spinlock protects l_resource. We should hold this lock + * first before taking res_lock. + */ + spinlock_t l_lock; + /** + * Pointer to actual resource this lock is in. + * ldlm_lock_change_resource() can change this. + */ + struct ldlm_resource *l_resource; + /** + * List item for client side LRU list. + * Protected by ns_lock in struct ldlm_namespace. + */ + struct list_head l_lru; + /** + * Linkage to resource's lock queues according to current lock state. + * (could be granted or waiting) + * Protected by lr_lock in struct ldlm_resource. + */ + struct list_head l_res_link; + /** + * Internal structures per lock type.. + */ + union { + struct ldlm_interval *l_tree_node; + struct ldlm_ibits_node *l_ibits_node; + }; + /** + * Per export hash of locks. + * Protected by per-bucket exp->exp_lock_hash locks. + */ + struct hlist_node l_exp_hash; + /** + * Per export hash of flock locks. + * Protected by per-bucket exp->exp_flock_hash locks. + */ + struct hlist_node l_exp_flock_hash; + /** + * Requested mode. + * Protected by lr_lock. + */ + enum ldlm_mode l_req_mode; + /** + * Granted mode, also protected by lr_lock. + */ + enum ldlm_mode l_granted_mode; + /** Lock completion handler pointer. Called when lock is granted. */ + ldlm_completion_callback l_completion_ast; + /** + * Lock blocking AST handler pointer. + * It plays two roles: + * - as a notification of an attempt to queue a conflicting lock (once) + * - as a notification when the lock is being cancelled. + * + * As such it's typically called twice: once for the initial conflict + * and then once more when the last user went away and the lock is + * cancelled (could happen recursively). + */ + ldlm_blocking_callback l_blocking_ast; + /** + * Lock glimpse handler. + * Glimpse handler is used to obtain LVB updates from a client by + * server + */ + ldlm_glimpse_callback l_glimpse_ast; + + /** + * Lock export. + * This is a pointer to actual client export for locks that were granted + * to clients. Used server-side. + */ + struct obd_export *l_export; + /** + * Lock connection export. + * Pointer to server export on a client. + */ + struct obd_export *l_conn_export; + + /** + * Remote lock handle. + * If the lock is remote, this is the handle of the other side lock + * (l_handle) + */ + struct lustre_handle l_remote_handle; + + /** + * Representation of private data specific for a lock type. + * Examples are: extent range for extent lock or bitmask for ibits locks + */ + union ldlm_policy_data l_policy_data; + + /** + * Lock state flags. Protected by lr_lock. + * \see lustre_dlm_flags.h where the bits are defined. + */ + __u64 l_flags; + + /** + * Lock r/w usage counters. + * Protected by lr_lock. 
+ */ + __u32 l_readers; + __u32 l_writers; + /** + * If the lock is granted, a process sleeps on this waitq to learn when + * it's no longer in use. If the lock is not granted, a process sleeps + * on this waitq to learn when it becomes granted. + */ + wait_queue_head_t l_waitq; + + /** + * Time, in nanoseconds, last used by e.g. being matched by lock match. + */ + ktime_t l_last_used; + + /** Originally requested extent for the extent lock. */ + struct ldlm_extent l_req_extent; + + /* + * Client-side-only members. + */ + + enum lvb_type l_lvb_type; + + /** + * Temporary storage for a LVB received during an enqueue operation. + * May be vmalloc'd, so needs to be freed with OBD_FREE_LARGE(). + */ + __u32 l_lvb_len; + void *l_lvb_data; + + /** Private storage for lock user. Opaque to LDLM. */ + void *l_ast_data; + + union { + /** + * Seconds. It will be updated if there is any activity related to + * the lock at client, e.g. enqueue the lock. For server it is the + * time when blocking ast was sent. + */ + time64_t l_activity; + time64_t l_blast_sent; + }; + + /* separate ost_lvb used mostly by Data-on-MDT for now. + * It is introduced to don't mix with layout lock data. */ + struct ost_lvb l_ost_lvb; + /* + * Server-side-only members. + */ + + /** + * Connection cookie for the client originating the operation. + * Used by Commit on Share (COS) code. Currently only used for + * inodebits locks on MDS. + */ + __u64 l_client_cookie; + + /** + * List item for locks waiting for cancellation from clients. + * The lists this could be linked into are: + * waiting_locks_list (protected by waiting_locks_spinlock), + * then if the lock timed out, it is moved to + * expired_lock_list for further processing. + */ + struct list_head l_pending_chain; + + /** + * Set when lock is sent a blocking AST. Time in seconds when timeout + * is reached and client holding this lock could be evicted. + * This timeout could be further extended by e.g. certain IO activity + * under this lock. + * \see ost_rw_prolong_locks + */ + time64_t l_callback_timeout; + + /** Local PID of process which created this lock. */ + __u32 l_pid; + + /** + * Number of times blocking AST was sent for this lock. + * This is for debugging. Valid values are 0 and 1, if there is an + * attempt to send blocking AST more than once, an assertion would be + * hit. \see ldlm_work_bl_ast_lock + */ + int l_bl_ast_run; + /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */ + struct list_head l_bl_ast; + /** List item ldlm_add_ast_work_item() for case of completion ASTs. */ + struct list_head l_cp_ast; + /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */ + struct list_head l_rk_ast; + + /** + * Pointer to a conflicting lock that caused blocking AST to be sent + * for this lock + */ + struct ldlm_lock *l_blocking_lock; + + /** + * Protected by lr_lock, linkages to "skip lists". + * For more explanations of skip lists see ldlm/ldlm_inodebits.c + */ + struct list_head l_sl_mode; + struct list_head l_sl_policy; + + /** Reference tracking structure to debug leaked locks. */ + struct lu_ref l_reference; +#if LUSTRE_TRACKS_LOCK_EXP_REFS + /* Debugging stuff for bug 20498, for tracking export references. */ + /** number of export references taken */ + int l_exp_refs_nr; + /** link all locks referencing one export */ + struct list_head l_exp_refs_link; + /** referenced export object */ + struct obd_export *l_exp_refs_target; +#endif + /** + * export blocking dlm lock list, protected by + * l_export->exp_bl_list_lock. 
+ * Lock order of waiting_lists_spinlock, exp_bl_list_lock and res lock + * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock. + */ + struct list_head l_exp_list; +}; + +/** + * Describe the overlap between two locks. itree_overlap_cb data. + */ +struct ldlm_match_data { + struct ldlm_lock *lmd_old; + struct ldlm_lock *lmd_lock; + enum ldlm_mode *lmd_mode; + union ldlm_policy_data *lmd_policy; + __u64 lmd_flags; + __u64 lmd_skip_flags; + int lmd_unref; + bool lmd_has_ast_data; +}; + +/** For uncommitted cross-MDT lock, store transno this lock belongs to */ +#define l_transno l_client_cookie + +/** For uncommitted cross-MDT lock, which is client lock, share with l_rk_ast + * which is for server. */ +#define l_slc_link l_rk_ast + +#define HANDLE_MAP_SIZE ((LMV_MAX_STRIPE_COUNT + 7) >> 3) + +struct lustre_handle_array { + unsigned int ha_count; + /* ha_map is used as bit flag to indicate handle is remote or local */ + char ha_map[HANDLE_MAP_SIZE]; + struct lustre_handle ha_handles[0]; +}; + +/** + * LDLM resource description. + * Basically, resource is a representation for a single object. + * Object has a name which is currently 4 64-bit integers. LDLM user is + * responsible for creation of a mapping between objects it wants to be + * protected and resource names. + * + * A resource can only hold locks of a single lock type, though there may be + * multiple ldlm_locks on a single resource, depending on the lock type and + * whether the locks are conflicting or not. + */ +struct ldlm_resource { + struct ldlm_ns_bucket *lr_ns_bucket; + + /** + * List item for list in namespace hash. + * protected by ns_lock + */ + struct hlist_node lr_hash; + + /** Reference count for this resource */ + atomic_t lr_refcount; + + /** Spinlock to protect locks under this resource. */ + spinlock_t lr_lock; + + /** + * protected by lr_lock + * @{ */ + /** List of locks in granted state */ + struct list_head lr_granted; + /** + * List of locks that could not be granted due to conflicts and + * that are waiting for conflicts to go away */ + struct list_head lr_waiting; + /** @} */ + + /** Resource name */ + struct ldlm_res_id lr_name; + + union { + /** + * Interval trees (only for extent locks) for all modes of + * this resource + */ + struct ldlm_interval_tree *lr_itree; + struct ldlm_ibits_queues *lr_ibits_queues; + }; + + union { + /** + * When the resource was considered as contended, + * used only on server side. + */ + time64_t lr_contention_time; + /** + * Associated inode, used only on client side. + */ + struct inode *lr_lvb_inode; + }; + + /** Type of locks this resource can hold. Only one type per resource. */ + enum ldlm_type lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ + + /** + * Server-side-only lock value block elements. + * To serialize lvbo_init. + */ + int lr_lvb_len; + struct mutex lr_lvb_mutex; + /** protected by lr_lock */ + void *lr_lvb_data; + /** is lvb initialized ? */ + bool lr_lvb_initialized; + + /** List of references to this resource. For debugging. 
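Since ha_map packs one flag bit per handle (hence the (LMV_MAX_STRIPE_COUNT + 7) >> 3 sizing), a pair of illustrative helpers sketches one plausible bit layout; the helper names are made up for the example:

static inline bool demo_handle_is_remote(const struct lustre_handle_array *ha,
					 unsigned int idx)
{
	return ha->ha_map[idx >> 3] & (1 << (idx & 7));
}

static inline void demo_handle_set_remote(struct lustre_handle_array *ha,
					  unsigned int idx)
{
	ha->ha_map[idx >> 3] |= 1 << (idx & 7);
}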
*/ + struct lu_ref lr_reference; +}; + +static inline int ldlm_is_granted(struct ldlm_lock *lock) +{ + return lock->l_req_mode == lock->l_granted_mode; +} + +static inline bool ldlm_has_layout(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; +} + +static inline bool ldlm_has_dom(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM; +} + +static inline char * +ldlm_ns_name(struct ldlm_namespace *ns) +{ + return ns->ns_name; +} + +static inline struct ldlm_namespace * +ldlm_res_to_ns(struct ldlm_resource *res) +{ + return res->lr_ns_bucket->nsb_namespace; +} + +static inline struct ldlm_namespace * +ldlm_lock_to_ns(struct ldlm_lock *lock) +{ + return ldlm_res_to_ns(lock->l_resource); +} + +static inline char * +ldlm_lock_to_ns_name(struct ldlm_lock *lock) +{ + return ldlm_ns_name(ldlm_lock_to_ns(lock)); +} + +static inline struct adaptive_timeout * +ldlm_lock_to_ns_at(struct ldlm_lock *lock) +{ + return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; +} + +static inline int ldlm_lvbo_init(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + int rc = 0; + + if (ns->ns_lvbo == NULL || ns->ns_lvbo->lvbo_init == NULL || + res->lr_lvb_initialized) + return 0; + + mutex_lock(&res->lr_lvb_mutex); + /* Did we lose the race? */ + if (res->lr_lvb_initialized) { + mutex_unlock(&res->lr_lvb_mutex); + return 0; + } + rc = ns->ns_lvbo->lvbo_init(res); + if (rc < 0) { + CDEBUG(D_DLMTRACE, "lvbo_init failed for resource : rc = %d\n", + rc); + if (res->lr_lvb_data != NULL) { + OBD_FREE(res->lr_lvb_data, res->lr_lvb_len); + res->lr_lvb_data = NULL; + } + res->lr_lvb_len = rc; + } else { + res->lr_lvb_initialized = true; + } + mutex_unlock(&res->lr_lvb_mutex); + return rc; +} + +static inline int ldlm_lvbo_size(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL) + return ns->ns_lvbo->lvbo_size(lock); + + return 0; +} + +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int *len) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc; + + if (ns->ns_lvbo != NULL) { + LASSERT(ns->ns_lvbo->lvbo_fill != NULL); + /* init lvb now if not already */ + rc = ldlm_lvbo_init(lock->l_resource); + if (rc < 0) { + CERROR("lock %p: delayed lvb init failed (rc %d)", + lock, rc); + return rc; + } + return ns->ns_lvbo->lvbo_fill(lock, buf, len); + } + return 0; +} + +struct ldlm_ast_work { + struct ldlm_lock *w_lock; + int w_blocking; + struct ldlm_lock_desc w_desc; + struct list_head w_list; + int w_flags; + void *w_data; + int w_datalen; +}; + +/** + * Common ldlm_enqueue parameters + */ +struct ldlm_enqueue_info { + enum ldlm_type ei_type; /** Type of the lock being enqueued. */ + enum ldlm_mode ei_mode; /** Mode of the lock being enqueued. */ + void *ei_cb_bl; /** blocking lock callback */ + void *ei_cb_local_bl; /** blocking local lock callback */ + void *ei_cb_cp; /** lock completion callback */ + void *ei_cb_gl; /** lock glimpse callback */ + ldlm_created_callback ei_cb_created; /** lock created callback */ + void *ei_cbdata; /** Data to be passed into callbacks. 
*/ + void *ei_namespace; /** lock namespace **/ + u64 ei_inodebits; /** lock inode bits **/ + unsigned int ei_enq_slave:1; /** whether enqueue slave stripes */ +}; + +#define ei_res_id ei_cb_gl + +extern struct obd_ops ldlm_obd_ops; + +extern char *ldlm_lockname[]; +extern char *ldlm_typename[]; +extern const char *ldlm_it2str(enum ldlm_intent_flags it); + +/** + * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. + * For the cases where we do not have actual lock to print along + * with a debugging message that is ldlm-related + */ +#define LDLM_DEBUG_NOLOCK(format, a...) \ + CDEBUG(D_DLMTRACE, "### " format "\n" , ##a) + +/** + * Support function for lock information printing into debug logs. + * \see LDLM_DEBUG + */ +#ifdef LIBCFS_DEBUG +#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ +} while(0) + +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *data, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Rate-limited version of lock printing function. + */ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ + static struct cfs_debug_limit_state _ldlm_cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ + ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\ +} while (0) + +#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + +/** Non-rate-limited lock printing function for debugging purposes. */ +#define LDLM_DEBUG(lock, fmt, a...) do { \ + if (likely(lock != NULL)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \ + ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \ + "### " fmt , ##a); \ + } else { \ + LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \ + } \ +} while (0) +#else /* !LIBCFS_DEBUG */ +# define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) ((void)0) +# define LDLM_DEBUG(lock, fmt, a...) ((void)0) +# define LDLM_ERROR(lock, fmt, a...) ((void)0) +#endif + +/* + * Three intentions can be used for the policy functions in + * ldlm_processing_policy. + * + * LDLM_PROCESS_RESCAN: + * + * It's used when policy functions are called from ldlm_reprocess_queue() to + * reprocess the wait list and try to grant locks, blocking ASTs + * have already been sent in this situation, completion ASTs need be sent for + * the locks being granted. + * + * LDLM_PROCESS_ENQUEUE: + * + * It's used when policy functions are called from ldlm_lock_enqueue() to + * process the wait list for handling an enqueue request, blocking + * ASTs have not been sent yet, so list of conflicting locks would be + * collected and ASTs sent. + * + * LDLM_PROCESS_RECOVERY: + * + * It's used when policy functions are called from ldlm_reprocess_queue() to + * reprocess the wait list when recovery done. In case of blocking + * ASTs are lost before recovery, it needs not only to grant locks if + * available, but also send blocking ASTs to the locks doesn't have AST sent + * flag. Completion ASTs need be sent for the locks being granted. 
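+ *
+ * Sketched only (not quoted from any particular policy) to make the
+ * distinction concrete, a processing policy typically dispatches on the
+ * intention roughly as:
+ *
+ *	if (intention == LDLM_PROCESS_RESCAN)
+ *		(grant-only pass; blocking ASTs were already sent)
+ *	else
+ *		(collect conflicting locks on *work_list and send ASTs)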
+ */ +enum ldlm_process_intention { + LDLM_PROCESS_RESCAN = 0, + LDLM_PROCESS_ENQUEUE = 1, + LDLM_PROCESS_RECOVERY = 2, +}; + +typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list); + +typedef int (*ldlm_reprocessing_policy)(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint); + +/** + * Return values for lock iterators. + * Also used during deciding of lock grants and cancellations. + */ +#define LDLM_ITER_CONTINUE 1 /* keep iterating */ +#define LDLM_ITER_STOP 2 /* stop iterating */ + +typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); +typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); + +/** \defgroup ldlm_iterator Lock iterators + * + * LDLM provides for a way to iterate through every lock on a resource or + * namespace or every resource in a namespace. + * @{ */ +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure); +void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, + void *closure); +int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_iterator_t iter, void *data); +/** @} ldlm_iterator */ + +int ldlm_replay_locks(struct obd_import *imp); + +/* ldlm_flock.c */ +int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); + +/* ldlm_extent.c */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); + +struct ldlm_prolong_args { + struct obd_export *lpa_export; + struct ldlm_res_id lpa_resid; + struct ldlm_extent lpa_extent; + enum ldlm_mode lpa_mode; + time64_t lpa_timeout; + int lpa_locks_cnt; + int lpa_blocks_cnt; +}; +void ldlm_lock_prolong_one(struct ldlm_lock *lock, + struct ldlm_prolong_args *arg); +void ldlm_resource_prolong(struct ldlm_prolong_args *arg); + +struct ldlm_callback_suite { + ldlm_completion_callback lcs_completion; + ldlm_blocking_callback lcs_blocking; + ldlm_glimpse_callback lcs_glimpse; +}; + +/* ldlm_lockd.c */ +#ifdef HAVE_SERVER_SUPPORT +/** \defgroup ldlm_srv_ast Server AST handlers + * These are AST handlers used by server code. + * Their property is that they are just preparing RPCs to be sent to clients. 
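+ *
+ * For illustration, a hedged sketch of how a target front-end might wire
+ * these handlers into the callback suite passed to ldlm_handle_enqueue0()
+ * (real MDT/OST code may install its own wrappers instead):
+ *
+ *	struct ldlm_callback_suite cbs = {
+ *		.lcs_completion	= ldlm_server_completion_ast,
+ *		.lcs_blocking	= ldlm_server_blocking_ast,
+ *		.lcs_glimpse	= ldlm_server_glimpse_ast,
+ *	};
+ *
+ *	rc = ldlm_handle_enqueue0(ns, req, dlm_req, &cbs);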
+ * @{ + */ +int ldlm_server_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data); +int ldlm_glimpse_locks(struct ldlm_resource *res, + struct list_head *gl_work_list); +/** @} ldlm_srv_ast */ + +/** \defgroup ldlm_handlers Server LDLM handlers + * These are handler functions that should be called by "frontends" such as + * MDT or OST to pass through LDLM requests to LDLM for handling + * @{ + */ +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_handle_convert0(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req); +int ldlm_handle_cancel(struct ptlrpc_request *req); +int ldlm_request_cancel(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + int first, enum lustre_at_flags flags); +/** @} ldlm_handlers */ + +void ldlm_revoke_export_locks(struct obd_export *exp); +time64_t ldlm_bl_timeout(struct ldlm_lock *lock); +#endif +int ldlm_del_waiting_lock(struct ldlm_lock *lock); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout); +int ldlm_get_ref(void); +void ldlm_put_ref(void); +int ldlm_init_export(struct obd_export *exp); +void ldlm_destroy_export(struct obd_export *exp); +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); + +/* ldlm_lock.c */ +#ifdef HAVE_SERVER_SUPPORT +ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res); +ldlm_reprocessing_policy +ldlm_get_reprocessing_policy(struct ldlm_resource *res); +#endif +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); +void ldlm_lock2handle(const struct ldlm_lock *lock, + struct lustre_handle *lockh); +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); +void ldlm_cancel_callback(struct ldlm_lock *); +int ldlm_lock_remove_from_lru(struct ldlm_lock *); +int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data); + +/** + * Obtain a lock reference by its handle. 
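+ *
+ * Typical (sketched) usage, assuming @lockh is a handle returned by an
+ * earlier enqueue; the reference must be dropped with LDLM_LOCK_PUT()
+ * once the caller is done with the lock:
+ *
+ *	struct ldlm_lock *lock = ldlm_handle2lock(&lockh);
+ *
+ *	if (lock != NULL) {
+ *		(use the lock)
+ *		LDLM_LOCK_PUT(lock);
+ *	}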
+ */ +static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) +{ + return __ldlm_handle2lock(h, 0); +} + +#define LDLM_LOCK_REF_DEL(lock) \ + lu_ref_del(&lock->l_reference, "handle", current) + +static inline struct ldlm_lock * +ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) +{ + struct ldlm_lock *lock; + + lock = __ldlm_handle2lock(h, flags); + if (lock != NULL) + LDLM_LOCK_REF_DEL(lock); + return lock; +} + +/** + * Update Lock Value Block Operations (LVBO) on a resource taking into account + * data from request \a r + */ +static inline int ldlm_lvbo_update(struct ldlm_resource *res, + struct ldlm_lock *lock, + struct ptlrpc_request *req, int increase) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + int rc; + + /* delayed lvb init may be required */ + rc = ldlm_lvbo_init(res); + if (rc < 0) { + CERROR("delayed lvb init failed (rc %d)\n", rc); + return rc; + } + + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update) + return ns->ns_lvbo->lvbo_update(res, lock, req, increase); + + return 0; +} + +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *req, + int increase) +{ + return ldlm_lvbo_update(res, NULL, req, increase); +} + +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock); + +int ldlm_error2errno(enum ldlm_error error); +enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this + * confuses user-space. */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp); +#endif + +/** + * Release a temporary lock reference obtained by ldlm_handle2lock() or + * __ldlm_handle2lock(). + */ +#define LDLM_LOCK_PUT(lock) \ +do { \ + LDLM_LOCK_REF_DEL(lock); \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +/** + * Release a lock reference obtained by some other means (see + * LDLM_LOCK_PUT()). 
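+ *
+ * For example (a sketch): a reference taken with LDLM_LOCK_GET() is
+ * dropped with LDLM_LOCK_RELEASE(), while one obtained through
+ * ldlm_handle2lock() is dropped with LDLM_LOCK_PUT():
+ *
+ *	struct ldlm_lock *extra = LDLM_LOCK_GET(lock);
+ *
+ *	(hand @extra over to another thread or list)
+ *	LDLM_LOCK_RELEASE(extra);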
+ */ +#define LDLM_LOCK_RELEASE(lock) \ +do { \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +#define LDLM_LOCK_GET(lock) \ +({ \ + ldlm_lock_get(lock); \ + /*LDLM_DEBUG((lock), "get");*/ \ + lock; \ +}) + +#define ldlm_lock_list_put(head, member, count) \ +({ \ + struct ldlm_lock *_lock, *_next; \ + int c = count; \ + list_for_each_entry_safe(_lock, _next, head, member) { \ + if (c-- == 0) \ + break; \ + list_del_init(&_lock->member); \ + LDLM_LOCK_RELEASE(_lock); \ + } \ + LASSERT(c <= 0); \ +}) + +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); +void ldlm_lock_put(struct ldlm_lock *lock); +void ldlm_lock_destroy(struct ldlm_lock *lock); +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); +void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode); +int ldlm_lock_addref_try(const struct lustre_handle *lockh, + enum ldlm_mode mode); +void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode); +void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, + enum ldlm_mode mode); +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); +void ldlm_lock_fail_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh, + int unref); +static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, + __u64 flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh, + int unref) +{ + return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy, + mode, lh, unref); +} +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data); +enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, + __u64 *bits); +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); +void ldlm_lock_cancel(struct ldlm_lock *lock); +void ldlm_reprocess_all(struct ldlm_resource *res, struct ldlm_lock *hint); +void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns); +void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh); +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); + +/* resource.c */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + enum ldlm_side client, + enum ldlm_appetite apt, + enum ldlm_ns_type ns_type); +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force); +void ldlm_namespace_free_post(struct ldlm_namespace *ns); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, enum ldlm_side client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, + enum ldlm_side client); +void ldlm_namespace_get(struct ldlm_namespace *ns); +void ldlm_namespace_put(struct ldlm_namespace *ns); + +int ldlm_debugfs_setup(void); +void ldlm_debugfs_cleanup(void); + +static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, + struct lprocfs_stats *srv_stats) +{ + int lock_type = 0, op = 0; + + lock_type = dlm_req->lock_desc.l_resource.lr_type; + + 
switch (lock_type) { + case LDLM_PLAIN: + op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; + break; + case LDLM_EXTENT: + op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; + break; + case LDLM_FLOCK: + op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; + break; + case LDLM_IBITS: + op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; + break; + default: + op = 0; + break; + } + + if (op != 0) + lprocfs_counter_incr(srv_stats, op); + + return; +} + +/* resource.c - internal */ +struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, + struct ldlm_resource *parent, + const struct ldlm_res_id *, + enum ldlm_type type, int create); +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); +int ldlm_resource_putref(struct ldlm_resource *res); +void ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock); +void ldlm_resource_unlink_lock(struct ldlm_lock *lock); +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); +void ldlm_dump_all_namespaces(enum ldlm_side client, int level); +void ldlm_namespace_dump(int level, struct ldlm_namespace *); +void ldlm_resource_dump(int level, struct ldlm_resource *); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + const struct ldlm_res_id *); + +#define LDLM_RESOURCE_ADDREF(res) do { \ + lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +#define LDLM_RESOURCE_DELREF(res) do { \ + lu_ref_del(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +/* ldlm_request.c */ +int ldlm_expired_completion_wait(void *data); +/** \defgroup ldlm_local_ast Default AST handlers for local locks + * These AST handlers are typically used for server-side local locks and are + * also used by client-side lock handlers to perform minimum level base + * processing. + * @{ */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +/** @} ldlm_local_ast */ + +/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. + * These are typically used by client and server (*_local versions) + * to obtain and release locks. 
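+ *
+ * As a hedged sketch of the server-local flavour (the argument values,
+ * and the env/ns/res_id/policy variables, are illustrative assumptions):
+ * take an EX IBITS lock with the default AST handlers declared above and
+ * drop it again through its handle:
+ *
+ *	struct lustre_handle lockh;
+ *	__u64 flags = 0;
+ *	int rc;
+ *
+ *	rc = ldlm_cli_enqueue_local(env, ns, &res_id, LDLM_IBITS, &policy,
+ *				    LCK_EX, &flags, ldlm_blocking_ast,
+ *				    ldlm_completion_ast, NULL, NULL, 0,
+ *				    LVB_T_NONE, NULL, &lockh);
+ *	if (rc == 0)
+ *		ldlm_lock_decref(&lockh, LCK_EX);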
+ * @{ */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + union ldlm_policy_data const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async); +int ldlm_prep_enqueue_req(struct obd_export *exp, + struct ptlrpc_request *req, + struct list_head *cancels, + int count); +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + enum ldlm_type type, __u8 with_policy, + enum ldlm_mode mode, __u64 *flags, void *lvb, + __u32 lvb_len, + const struct lustre_handle *lockh, int rc); +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh); +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits); +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); +int ldlm_cli_update_pool(struct ptlrpc_request *req); +int ldlm_cli_cancel(const struct lustre_handle *lockh, + enum ldlm_cancel_flags cancel_flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, + enum ldlm_cancel_flags flags, void *opaque); +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque); +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head, + int count, enum ldlm_cancel_flags flags); +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 lock_flags, + enum ldlm_cancel_flags cancel_flags, + void *opaque); +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + enum ldlm_cancel_flags flags); +int ldlm_cli_cancel_list(struct list_head *head, int count, + struct ptlrpc_request *req, + enum ldlm_cancel_flags flags); + +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); + +/** @} ldlm_cli_api */ + +extern unsigned int ldlm_enqueue_min; + +/* mds/handler.c */ +/* This has to be here because recursive inclusion sucks. */ +int intent_disposition(struct ldlm_reply *rep, int flag); +void intent_set_disposition(struct ldlm_reply *rep, int flag); + +/** + * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more + * than one lock_res is dead-lock safe. + */ +enum lock_res_type { + LRT_NORMAL, + LRT_NEW +}; + +/** Lock resource. */ +static inline void lock_res(struct ldlm_resource *res) +{ + spin_lock(&res->lr_lock); +} + +/** Lock resource with a way to instruct lockdep code about nestedness-safe. 
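+ *
+ * A sketched example (res1/res2 are illustrative): when two resources
+ * must be held at once, take the first with lock_res() and the second at
+ * the LRT_NEW nesting level so lockdep does not report a false deadlock:
+ *
+ *	lock_res(res1);
+ *	lock_res_nested(res2, LRT_NEW);
+ *	(...)
+ *	unlock_res(res2);
+ *	unlock_res(res1);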
*/ +static inline void lock_res_nested(struct ldlm_resource *res, + enum lock_res_type mode) +{ + spin_lock_nested(&res->lr_lock, mode); +} + +/** Unlock resource. */ +static inline void unlock_res(struct ldlm_resource *res) +{ + spin_unlock(&res->lr_lock); +} + +/** Check if resource is already locked, assert if not. */ +static inline void check_res_locked(struct ldlm_resource *res) +{ + assert_spin_locked(&res->lr_lock); +} + +struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock); +void unlock_res_and_lock(struct ldlm_lock *lock); + +/* ldlm_pool.c */ +/** \defgroup ldlm_pools Various LDLM pool related functions + * There are not used outside of ldlm. + * @{ + */ +int ldlm_pools_init(void); +void ldlm_pools_fini(void); + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client); +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); +void ldlm_pool_fini(struct ldlm_pool *pl); +int ldlm_pool_setup(struct ldlm_pool *pl, int limit); +time64_t ldlm_pool_recalc(struct ldlm_pool *pl); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl); +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl); +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv); +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit); +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); +/** @} */ + +static inline int ldlm_extent_overlap(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start <= ex2->end && ex2->start <= ex1->end; +} + +/* check if @ex1 contains @ex2 */ +static inline int ldlm_extent_contain(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start <= ex2->start && ex1->end >= ex2->end; +} + +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); + +#endif +/** @} LDLM */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h new file mode 100644 index 0000000000000..9fdebcefe66a5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h @@ -0,0 +1,454 @@ +/* -*- buffer-read-only: t -*- vi: set ro: + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + */ +/** + * \file lustre_dlm_flags.h + * The flags and collections of flags (masks) for \see struct ldlm_lock. + * + * \addtogroup LDLM Lustre Distributed Lock Manager + * @{ + * + * \name flags + * The flags and collections of flags (masks) for \see struct ldlm_lock. 
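+ *
+ * Every flag below also generates ldlm_is_<name>(), ldlm_set_<name>() and
+ * ldlm_clear_<name>() helpers, thin wrappers around the LDLM_TEST_FLAG(),
+ * LDLM_SET_FLAG() and LDLM_CLEAR_FLAG() macros at the end of this file.
+ * A purely illustrative sketch, not a real call site:
+ *
+ *	if (!ldlm_is_ast_sent(lock)) {
+ *		ldlm_set_ast_sent(lock);
+ *		(queue the blocking AST for sending)
+ *	}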
+ * @{ + */ +#ifndef LDLM_ALL_FLAGS_MASK + +/** l_flags bits marked as "all_flags" bits */ +#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC28F932FULL + +/** extent, mode, or resource changed */ +#define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0 +#define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 0) +#define ldlm_set_lock_changed(_l) LDLM_SET_FLAG(( _l), 1ULL << 0) +#define ldlm_clear_lock_changed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 0) + +/** + * Server placed lock on granted list, or a recovering client wants the + * lock added to the granted list, no questions asked. */ +#define LDLM_FL_BLOCK_GRANTED 0x0000000000000002ULL // bit 1 +#define ldlm_is_block_granted(_l) LDLM_TEST_FLAG(( _l), 1ULL << 1) +#define ldlm_set_block_granted(_l) LDLM_SET_FLAG(( _l), 1ULL << 1) +#define ldlm_clear_block_granted(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 1) + +/** + * Server placed lock on conv list, or a recovering client wants the lock + * added to the conv list, no questions asked. (obsoleted) */ +#define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2 +#define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2) +#define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2) +#define ldlm_clear_block_conv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 2) + +/** + * Server placed lock on wait list, or a recovering client wants the lock + * added to the wait list, no questions asked. */ +#define LDLM_FL_BLOCK_WAIT 0x0000000000000008ULL // bit 3 +#define ldlm_is_block_wait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 3) +#define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3) +#define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) + +/** + * Lock request is speculative/asynchronous, and cannot wait for any reason. + * Fail the lock request if any blocking locks are encountered. + * */ +#define LDLM_FL_SPECULATIVE 0x0000000000000010ULL /* bit 4 */ +#define ldlm_is_speculative(_l) LDLM_TEST_FLAG((_l), 1ULL << 4) +#define ldlm_set_speculative(_l) LDLM_SET_FLAG((_l), 1ULL << 4) +#define ldlm_clear_specualtive_(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 4) + +/** blocking or cancel packet was queued for sending. */ +#define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5 +#define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5) +#define ldlm_set_ast_sent(_l) LDLM_SET_FLAG(( _l), 1ULL << 5) +#define ldlm_clear_ast_sent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 5) + +/** + * Lock is being replayed. This could probably be implied by the fact that + * one of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. */ +#define LDLM_FL_REPLAY 0x0000000000000100ULL // bit 8 +#define ldlm_is_replay(_l) LDLM_TEST_FLAG(( _l), 1ULL << 8) +#define ldlm_set_replay(_l) LDLM_SET_FLAG(( _l), 1ULL << 8) +#define ldlm_clear_replay(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 8) + +/** Don't grant lock, just do intent. 
 */ +#define LDLM_FL_INTENT_ONLY 0x0000000000000200ULL // bit 9 +#define ldlm_is_intent_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 9) +#define ldlm_set_intent_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 9) +#define ldlm_clear_intent_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 9) + +/** lock request has intent */ +#define LDLM_FL_HAS_INTENT 0x0000000000001000ULL // bit 12 +#define ldlm_is_has_intent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 12) +#define ldlm_set_has_intent(_l) LDLM_SET_FLAG(( _l), 1ULL << 12) +#define ldlm_clear_has_intent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 12) + +/** flock deadlock detected */ +#define LDLM_FL_FLOCK_DEADLOCK 0x0000000000008000ULL // bit 15 +#define ldlm_is_flock_deadlock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 15) +#define ldlm_set_flock_deadlock(_l) LDLM_SET_FLAG(( _l), 1ULL << 15) +#define ldlm_clear_flock_deadlock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 15) + +/** discard (no writeback (PW locks) or page retention (PR locks)) on cancel */ +#define LDLM_FL_DISCARD_DATA 0x0000000000010000ULL // bit 16 +#define ldlm_is_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 16) +#define ldlm_set_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 16) +#define ldlm_clear_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 16) + +/** Blocked by group lock - wait indefinitely */ +#define LDLM_FL_NO_TIMEOUT 0x0000000000020000ULL // bit 17 +#define ldlm_is_no_timeout(_l) LDLM_TEST_FLAG(( _l), 1ULL << 17) +#define ldlm_set_no_timeout(_l) LDLM_SET_FLAG(( _l), 1ULL << 17) +#define ldlm_clear_no_timeout(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 17) + +/** + * Server told not to wait if blocked. For AGL, OST will not send glimpse + * callback. */ +#define LDLM_FL_BLOCK_NOWAIT 0x0000000000040000ULL // bit 18 +#define ldlm_is_block_nowait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 18) +#define ldlm_set_block_nowait(_l) LDLM_SET_FLAG(( _l), 1ULL << 18) +#define ldlm_clear_block_nowait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 18) + +/** return blocking lock */ +#define LDLM_FL_TEST_LOCK 0x0000000000080000ULL // bit 19 +#define ldlm_is_test_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 19) +#define ldlm_set_test_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 19) +#define ldlm_clear_test_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 19) + +/** match lock only */ +#define LDLM_FL_MATCH_LOCK 0x0000000000100000ULL // bit 20 + +/** + * Immediately cancel such locks when they block some other locks. Send + * cancel notification to original lock holder, but expect no reply. This + * is for clients (like liblustre) that cannot be expected to reliably + * respond to a blocking AST. */ +#define LDLM_FL_CANCEL_ON_BLOCK 0x0000000000800000ULL // bit 23 +#define ldlm_is_cancel_on_block(_l) LDLM_TEST_FLAG(( _l), 1ULL << 23) +#define ldlm_set_cancel_on_block(_l) LDLM_SET_FLAG(( _l), 1ULL << 23) +#define ldlm_clear_cancel_on_block(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 23) + +/** Flag whether a lock is enqueued from a distributed transaction, and the + * requesting lock mode is PW/EX, if so, it will check compatibility with COS + * locks, and different from the original COS semantics, transactions from the same + * client are also treated as a lock conflict. */ +#define LDLM_FL_COS_INCOMPAT 0x0000000001000000ULL /* bit 24 */ +#define ldlm_is_cos_incompat(_l) LDLM_TEST_FLAG((_l), 1ULL << 24) +#define ldlm_set_cos_incompat(_l) LDLM_SET_FLAG((_l), 1ULL << 24) +#define ldlm_clear_cos_incompat(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 24) + +/* + * Flag indicates that lock is being converted (downgraded) during the blocking + * AST instead of cancelling. 
Used for IBITS locks now and drops conflicting + * bits only keepeing other. + */ +#define LDLM_FL_CONVERTING 0x0000000002000000ULL /* bit 25 */ +#define ldlm_is_converting(_l) LDLM_TEST_FLAG((_l), 1ULL << 25) +#define ldlm_set_converting(_l) LDLM_SET_FLAG((_l), 1ULL << 25) +#define ldlm_clear_converting(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 25) + +/** + * Part of original lockahead implementation, OBD_CONNECT_LOCKAHEAD_OLD. + * Reserved temporarily to allow those implementations to keep working. + * Will be removed after 2.12 release. + * */ +#define LDLM_FL_LOCKAHEAD_OLD_RESERVED 0x0000000010000000ULL /* bit 28 */ +#define ldlm_is_do_not_expand_io(_l) LDLM_TEST_FLAG((_l), 1ULL << 28) +#define ldlm_set_do_not_expand_io(_l) LDLM_SET_FLAG((_l), 1ULL << 28) +#define ldlm_clear_do_not_expand_io(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 28) + +/** + * Do not expand this lock. Grant it only on the extent requested. + * Used for manually requested locks from the client (LU_LADVISE_LOCKAHEAD). + * */ +#define LDLM_FL_NO_EXPANSION 0x0000000020000000ULL /* bit 29 */ +#define ldlm_is_do_not_expand(_l) LDLM_TEST_FLAG((_l), 1ULL << 29) +#define ldlm_set_do_not_expand(_l) LDLM_SET_FLAG((_l), 1ULL << 29) +#define ldlm_clear_do_not_expand(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 29) + +/** + * measure lock contention and return -EUSERS if locking contention is high */ +#define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30 +#define ldlm_is_deny_on_contention(_l) LDLM_TEST_FLAG(( _l), 1ULL << 30) +#define ldlm_set_deny_on_contention(_l) LDLM_SET_FLAG(( _l), 1ULL << 30) +#define ldlm_clear_deny_on_contention(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 30) + +/** + * These are flags that are mapped into the flags and ASTs of blocking + * locks Add FL_DISCARD to blocking ASTs */ +#define LDLM_FL_AST_DISCARD_DATA 0x0000000080000000ULL // bit 31 +#define ldlm_is_ast_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 31) +#define ldlm_set_ast_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 31) +#define ldlm_clear_ast_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 31) + +/** + * Used for marking lock as a target for -EINTR while cp_ast sleep emulation + * + race with upcoming bl_ast. */ +#define LDLM_FL_FAIL_LOC 0x0000000100000000ULL // bit 32 +#define ldlm_is_fail_loc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 32) +#define ldlm_set_fail_loc(_l) LDLM_SET_FLAG(( _l), 1ULL << 32) +#define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32) + +/** this lock is being destroyed */ +#define LDLM_FL_CBPENDING 0x0000000400000000ULL // bit 34 +#define ldlm_is_cbpending(_l) LDLM_TEST_FLAG(( _l), 1ULL << 34) +#define ldlm_set_cbpending(_l) LDLM_SET_FLAG(( _l), 1ULL << 34) +#define ldlm_clear_cbpending(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 34) + +/** not a real flag, not saved in lock */ +#define LDLM_FL_WAIT_NOREPROC 0x0000000800000000ULL // bit 35 +#define ldlm_is_wait_noreproc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 35) +#define ldlm_set_wait_noreproc(_l) LDLM_SET_FLAG(( _l), 1ULL << 35) +#define ldlm_clear_wait_noreproc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 35) + +/** cancellation callback already run */ +#define LDLM_FL_CANCEL 0x0000001000000000ULL // bit 36 +#define ldlm_is_cancel(_l) LDLM_TEST_FLAG(( _l), 1ULL << 36) +#define ldlm_set_cancel(_l) LDLM_SET_FLAG(( _l), 1ULL << 36) +#define ldlm_clear_cancel(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 36) + +/** whatever it might mean -- never transmitted? 
*/ +#define LDLM_FL_LOCAL_ONLY 0x0000002000000000ULL // bit 37 +#define ldlm_is_local_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 37) +#define ldlm_set_local_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 37) +#define ldlm_clear_local_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 37) + +/** don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_FAILED 0x0000004000000000ULL // bit 38 +#define ldlm_is_failed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 38) +#define ldlm_set_failed(_l) LDLM_SET_FLAG(( _l), 1ULL << 38) +#define ldlm_clear_failed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 38) + +/** lock cancel has already been sent */ +#define LDLM_FL_CANCELING 0x0000008000000000ULL // bit 39 +#define ldlm_is_canceling(_l) LDLM_TEST_FLAG(( _l), 1ULL << 39) +#define ldlm_set_canceling(_l) LDLM_SET_FLAG(( _l), 1ULL << 39) +#define ldlm_clear_canceling(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 39) + +/** local lock (ie, no srv/cli split) */ +#define LDLM_FL_LOCAL 0x0000010000000000ULL // bit 40 +#define ldlm_is_local(_l) LDLM_TEST_FLAG(( _l), 1ULL << 40) +#define ldlm_set_local(_l) LDLM_SET_FLAG(( _l), 1ULL << 40) +#define ldlm_clear_local(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 40) + +/** + * XXX FIXME: This is being added to b_size as a low-risk fix to the + * fact that the LVB filling happens _after_ the lock has been granted, + * so another thread can match it before the LVB has been updated. As a + * dirty hack, we set LDLM_FL_LVB_READY only after we've done the LVB poop. + * this is only needed on LOV/OSC now, where LVB is actually used and + * callers must set it in input flags. + * + * The proper fix is to do the granting inside of the completion AST, + * which can be replaced with a LVB-aware wrapping function for OSC locks. + * That change is pretty high-risk, though, and would need a lot more + * testing. */ +#define LDLM_FL_LVB_READY 0x0000020000000000ULL // bit 41 +#define ldlm_is_lvb_ready(_l) LDLM_TEST_FLAG(( _l), 1ULL << 41) +#define ldlm_set_lvb_ready(_l) LDLM_SET_FLAG(( _l), 1ULL << 41) +#define ldlm_clear_lvb_ready(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 41) + +/** + * A lock contributes to the known minimum size (KMS) calculation until it + * has finished the part of its cancelation that performs write back on its + * dirty pages. It can remain on the granted list during this whole time. + * Threads racing to update the KMS after performing their writeback need + * to know to exclude each other's locks from the calculation as they walk + * the granted list. */ +#define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL // bit 42 +#define ldlm_is_kms_ignore(_l) LDLM_TEST_FLAG(( _l), 1ULL << 42) +#define ldlm_set_kms_ignore(_l) LDLM_SET_FLAG(( _l), 1ULL << 42) +#define ldlm_clear_kms_ignore(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 42) + +/** completion AST to be executed */ +#define LDLM_FL_CP_REQD 0x0000080000000000ULL // bit 43 +#define ldlm_is_cp_reqd(_l) LDLM_TEST_FLAG(( _l), 1ULL << 43) +#define ldlm_set_cp_reqd(_l) LDLM_SET_FLAG(( _l), 1ULL << 43) +#define ldlm_clear_cp_reqd(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 43) + +/** cleanup_resource has already handled the lock */ +#define LDLM_FL_CLEANED 0x0000100000000000ULL // bit 44 +#define ldlm_is_cleaned(_l) LDLM_TEST_FLAG(( _l), 1ULL << 44) +#define ldlm_set_cleaned(_l) LDLM_SET_FLAG(( _l), 1ULL << 44) +#define ldlm_clear_cleaned(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 44) + +/** + * optimization hint: LDLM can run blocking callback from current context + * w/o involving separate thread. 
in order to decrease cs rate */ +#define LDLM_FL_ATOMIC_CB 0x0000200000000000ULL // bit 45 +#define ldlm_is_atomic_cb(_l) LDLM_TEST_FLAG(( _l), 1ULL << 45) +#define ldlm_set_atomic_cb(_l) LDLM_SET_FLAG(( _l), 1ULL << 45) +#define ldlm_clear_atomic_cb(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 45) + +/** + * It may happen that a client initiates two operations, e.g. unlink and + * mkdir, such that the server sends a blocking AST for conflicting locks + * to this client for the first operation, whereas the second operation + * has canceled this lock and is waiting for rpc_lock which is taken by + * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in + * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it. */ +#define LDLM_FL_BL_AST 0x0000400000000000ULL // bit 46 +#define ldlm_is_bl_ast(_l) LDLM_TEST_FLAG(( _l), 1ULL << 46) +#define ldlm_set_bl_ast(_l) LDLM_SET_FLAG(( _l), 1ULL << 46) +#define ldlm_clear_bl_ast(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 46) + +/** + * Set by ldlm_cancel_callback() when lock cache is dropped to let + * ldlm_callback_handler() return EINVAL to the server. It is used when + * ELC RPC is already prepared and is waiting for rpc_lock, too late to + * send a separate CANCEL RPC. */ +#define LDLM_FL_BL_DONE 0x0000800000000000ULL // bit 47 +#define ldlm_is_bl_done(_l) LDLM_TEST_FLAG(( _l), 1ULL << 47) +#define ldlm_set_bl_done(_l) LDLM_SET_FLAG(( _l), 1ULL << 47) +#define ldlm_clear_bl_done(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 47) + +/** + * Don't put lock into the LRU list, so that it is not canceled due + * to aging. Used by MGC locks, they are cancelled only at unmount or + * by callback. */ +#define LDLM_FL_NO_LRU 0x0001000000000000ULL // bit 48 +#define ldlm_is_no_lru(_l) LDLM_TEST_FLAG(( _l), 1ULL << 48) +#define ldlm_set_no_lru(_l) LDLM_SET_FLAG(( _l), 1ULL << 48) +#define ldlm_clear_no_lru(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 48) + +/** + * Set for locks that failed and where the server has been notified. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_FAIL_NOTIFIED 0x0002000000000000ULL // bit 49 +#define ldlm_is_fail_notified(_l) LDLM_TEST_FLAG(( _l), 1ULL << 49) +#define ldlm_set_fail_notified(_l) LDLM_SET_FLAG(( _l), 1ULL << 49) +#define ldlm_clear_fail_notified(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 49) + +/** + * Set for locks that were removed from class hash table and will + * be destroyed when last reference to them is released. Set by + * ldlm_lock_destroy_internal(). + * + * Protected by lock and resource locks. */ +#define LDLM_FL_DESTROYED 0x0004000000000000ULL // bit 50 +#define ldlm_is_destroyed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 50) +#define ldlm_set_destroyed(_l) LDLM_SET_FLAG(( _l), 1ULL << 50) +#define ldlm_clear_destroyed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 50) + +/** flag whether this is a server namespace lock */ +#define LDLM_FL_SERVER_LOCK 0x0008000000000000ULL // bit 51 +#define ldlm_is_server_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 51) +#define ldlm_set_server_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 51) +#define ldlm_clear_server_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 51) + +/** + * It's set in lock_res_and_lock() and unset in unlock_res_and_lock(). + * + * NB: compared with check_res_locked(), checking this bit is cheaper. + * Also, spin_is_locked() is deprecated for kernel code; one reason is + * because it works only for SMP so user needs to add extra macros like + * LASSERT_SPIN_LOCKED for uniprocessor kernels. 
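+ *
+ * For example (sketch only), an assertion in a hot path can test the flag
+ * instead of the spinlock state:
+ *
+ *	LASSERT(ldlm_is_res_locked(lock));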
*/ +#define LDLM_FL_RES_LOCKED 0x0010000000000000ULL // bit 52 +#define ldlm_is_res_locked(_l) LDLM_TEST_FLAG(( _l), 1ULL << 52) +#define ldlm_set_res_locked(_l) LDLM_SET_FLAG(( _l), 1ULL << 52) +#define ldlm_clear_res_locked(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 52) + +/** + * It's set once we call ldlm_add_waiting_lock_res_locked() to start the + * lock-timeout timer and it will never be reset. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_WAITED 0x0020000000000000ULL // bit 53 +#define ldlm_is_waited(_l) LDLM_TEST_FLAG(( _l), 1ULL << 53) +#define ldlm_set_waited(_l) LDLM_SET_FLAG(( _l), 1ULL << 53) +#define ldlm_clear_waited(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 53) + +/** Flag whether this is a server namespace lock. */ +#define LDLM_FL_NS_SRV 0x0040000000000000ULL // bit 54 +#define ldlm_is_ns_srv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 54) +#define ldlm_set_ns_srv(_l) LDLM_SET_FLAG(( _l), 1ULL << 54) +#define ldlm_clear_ns_srv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 54) + +/** Flag whether this lock can be reused. Used by exclusive open. */ +#define LDLM_FL_EXCL 0x0080000000000000ULL // bit 55 +#define ldlm_is_excl(_l) LDLM_TEST_FLAG(( _l), 1ULL << 55) +#define ldlm_set_excl(_l) LDLM_SET_FLAG(( _l), 1ULL << 55) +#define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55) + +/** Flag whether a lock is found on server for re-sent RPC. */ +#define LDLM_FL_RESENT 0x0100000000000000ULL // bit 56 + +/** Flag whether Commit-on-Sharing is enabled, if LDLM_FL_COS_INCOMPAT is set + * this flag may not be set because once the former is set this flag won't be + * checked, and for cross-MDT lock COS_INCOMPAT is always set but ast handle is + * in ldlm context which doesn't know whether COS is enabled or not. */ +#define LDLM_FL_COS_ENABLED 0x0200000000000000ULL /* bit 57 */ +#define ldlm_is_cos_enabled(_l) LDLM_TEST_FLAG((_l), 1ULL << 57) +#define ldlm_set_cos_enabled(_l) LDLM_SET_FLAG((_l), 1ULL << 57) + +/** + * This flags means to use non-delay RPC to send dlm request RPC. + */ +#define LDLM_FL_NDELAY 0x0400000000000000ULL /* bit 58 */ +#define ldlm_is_ndelay(_l) LDLM_TEST_FLAG((_l), 1ULL << 58) +#define ldlm_set_ndelay(_l) LDLM_SET_FLAG((_l), 1ULL << 58) + +/** + * LVB from this lock is cached in osc object + */ +#define LDLM_FL_LVB_CACHED 0x0800000000000000ULL /* bit 59 */ +#define ldlm_is_lvb_cached(_l) LDLM_TEST_FLAG((_l), 1ULL << 59) +#define ldlm_set_lvb_cached(_l) LDLM_SET_FLAG((_l), 1ULL << 59) +#define ldlm_clear_lvb_cached(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 59) + +/** l_flags bits marked as "ast" bits */ +#define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\ + LDLM_FL_DISCARD_DATA) + +/** l_flags bits marked as "blocked" bits */ +#define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\ + LDLM_FL_BLOCK_WAIT) + +/** l_flags bits marked as "gone" bits */ +#define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\ + LDLM_FL_FAILED) + +/** l_flags bits marked as "inherit" bits + * Flags inherited from wire on enqueue/reply between client/server. + * CANCEL_ON_BLOCK so server will not grant if a blocking lock is found + * NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. + * TEST_LOCK flag to not let TEST lock to be granted. 
+ * NO_EXPANSION to tell server not to expand extent of lock request */ +#define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ + LDLM_FL_NO_TIMEOUT |\ + LDLM_FL_TEST_LOCK |\ + LDLM_FL_NO_EXPANSION) + +/** flags returned in @flags parameter on ldlm_lock_enqueue, + * to be re-constructed on re-send */ +#define LDLM_FL_SRV_ENQ_MASK (LDLM_FL_LOCK_CHANGED |\ + LDLM_FL_BLOCKED_MASK |\ + LDLM_FL_NO_TIMEOUT) + +/** test for ldlm_lock flag bit set */ +#define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0) + +/** multi-bit test: are any of mask bits set? */ +#define LDLM_HAVE_MASK(_l, _m) (((_l)->l_flags & LDLM_FL_##_m##_MASK) != 0) + +/** set a ldlm_lock flag bit */ +#define LDLM_SET_FLAG(_l, _b) ((_l)->l_flags |= (_b)) + +/** clear a ldlm_lock flag bit */ +#define LDLM_CLEAR_FLAG(_l, _b) ((_l)->l_flags &= ~(_b)) + +/** @} subgroup */ +/** @} group */ +#endif /* LDLM_ALL_FLAGS_MASK */ + diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h new file mode 100644 index 0000000000000..03b9adc84897c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h @@ -0,0 +1,94 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. 
+ */ + +#ifndef _LUSTRE_EACL_H +#define _LUSTRE_EACL_H + +/** \defgroup eacl eacl + * + * @{ + */ + +#ifdef CONFIG_FS_POSIX_ACL +# include +# include + +typedef struct { + __u16 e_tag; + __u16 e_perm; + __u32 e_id; + __u32 e_stat; +} ext_acl_xattr_entry; + +typedef struct { + __u32 a_count; + ext_acl_xattr_entry a_entries[0]; +} ext_acl_xattr_header; + +#define CFS_ACL_XATTR_SIZE(count, prefix) \ + (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry)) + +#define CFS_ACL_XATTR_COUNT(size, prefix) \ + (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry)) + +#ifdef HAVE_SERVER_SUPPORT +struct lu_ucred; +struct lu_attr; +struct lustre_idmap_table; + +#ifdef HAVE_STRUCT_POSIX_ACL_XATTR +# define posix_acl_xattr_header struct posix_acl_xattr_header +# define posix_acl_xattr_entry struct posix_acl_xattr_entry +#endif + +extern int lustre_posix_acl_permission(struct lu_ucred *mu, + const struct lu_attr *la, int want, + posix_acl_xattr_entry *entry, + int count); +extern int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, + __u32 mode, int count); +extern int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, + __u32 *pmode, int count); +extern int lustre_posix_acl_equiv_mode(posix_acl_xattr_entry *entry, mode_t *mode_p, + int count); +#endif /* HAVE_SERVER_SUPPORT */ +#endif /* CONFIG_FS_POSIX_ACL */ + +/** @} eacl */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_errno.h b/drivers/staging/lustrefsx/lustre/include/lustre_errno.h new file mode 100644 index 0000000000000..fe9ccd2e07a82 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_errno.h @@ -0,0 +1,218 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#ifndef LUSTRE_ERRNO_H +#define LUSTRE_ERRNO_H + +/* + * Only "network" errnos, which are defined below, are allowed on wire (or on + * disk). Generic routines exist to help translate between these and a subset + * of the "host" errnos. Some host errnos (e.g., EDEADLOCK) are intentionally + * left out. See also the comment on lustre_errno_hton_mapping[]. + * + * To maintain compatibility with existing x86 clients and servers, each of + * these network errnos has the same numerical value as its corresponding host + * errno on x86. 
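+ *
+ * As a sketched example of the intended use (wire/host are illustrative
+ * variable names; the helpers are declared at the end of this header),
+ * a sender converts a host errno to its wire value and the receiver
+ * converts it back:
+ *
+ *	wire = lustre_errno_hton(EIO);	(LUSTRE_EIO on the wire)
+ *	host = lustre_errno_ntoh(wire);	(EIO again on the receiver)
+ *
+ * On x86 both helpers reduce to the identity mapping, as noted below.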
+ */ +#define LUSTRE_EPERM 1 /* Operation not permitted */ +#define LUSTRE_ENOENT 2 /* No such file or directory */ +#define LUSTRE_ESRCH 3 /* No such process */ +#define LUSTRE_EINTR 4 /* Interrupted system call */ +#define LUSTRE_EIO 5 /* I/O error */ +#define LUSTRE_ENXIO 6 /* No such device or address */ +#define LUSTRE_E2BIG 7 /* Argument list too long */ +#define LUSTRE_ENOEXEC 8 /* Exec format error */ +#define LUSTRE_EBADF 9 /* Bad file number */ +#define LUSTRE_ECHILD 10 /* No child processes */ +#define LUSTRE_EAGAIN 11 /* Try again */ +#define LUSTRE_ENOMEM 12 /* Out of memory */ +#define LUSTRE_EACCES 13 /* Permission denied */ +#define LUSTRE_EFAULT 14 /* Bad address */ +#define LUSTRE_ENOTBLK 15 /* Block device required */ +#define LUSTRE_EBUSY 16 /* Device or resource busy */ +#define LUSTRE_EEXIST 17 /* File exists */ +#define LUSTRE_EXDEV 18 /* Cross-device link */ +#define LUSTRE_ENODEV 19 /* No such device */ +#define LUSTRE_ENOTDIR 20 /* Not a directory */ +#define LUSTRE_EISDIR 21 /* Is a directory */ +#define LUSTRE_EINVAL 22 /* Invalid argument */ +#define LUSTRE_ENFILE 23 /* File table overflow */ +#define LUSTRE_EMFILE 24 /* Too many open files */ +#define LUSTRE_ENOTTY 25 /* Not a typewriter */ +#define LUSTRE_ETXTBSY 26 /* Text file busy */ +#define LUSTRE_EFBIG 27 /* File too large */ +#define LUSTRE_ENOSPC 28 /* No space left on device */ +#define LUSTRE_ESPIPE 29 /* Illegal seek */ +#define LUSTRE_EROFS 30 /* Read-only file system */ +#define LUSTRE_EMLINK 31 /* Too many links */ +#define LUSTRE_EPIPE 32 /* Broken pipe */ +#define LUSTRE_EDOM 33 /* Math argument out of domain of + func */ +#define LUSTRE_ERANGE 34 /* Math result not representable */ +#define LUSTRE_EDEADLK 35 /* Resource deadlock would occur */ +#define LUSTRE_ENAMETOOLONG 36 /* File name too long */ +#define LUSTRE_ENOLCK 37 /* No record locks available */ +#define LUSTRE_ENOSYS 38 /* Function not implemented */ +#define LUSTRE_ENOTEMPTY 39 /* Directory not empty */ +#define LUSTRE_ELOOP 40 /* Too many symbolic links + encountered */ +#define LUSTRE_ENOMSG 42 /* No message of desired type */ +#define LUSTRE_EIDRM 43 /* Identifier removed */ +#define LUSTRE_ECHRNG 44 /* Channel number out of range */ +#define LUSTRE_EL2NSYNC 45 /* Level 2 not synchronized */ +#define LUSTRE_EL3HLT 46 /* Level 3 halted */ +#define LUSTRE_EL3RST 47 /* Level 3 reset */ +#define LUSTRE_ELNRNG 48 /* Link number out of range */ +#define LUSTRE_EUNATCH 49 /* Protocol driver not attached */ +#define LUSTRE_ENOCSI 50 /* No CSI structure available */ +#define LUSTRE_EL2HLT 51 /* Level 2 halted */ +#define LUSTRE_EBADE 52 /* Invalid exchange */ +#define LUSTRE_EBADR 53 /* Invalid request descriptor */ +#define LUSTRE_EXFULL 54 /* Exchange full */ +#define LUSTRE_ENOANO 55 /* No anode */ +#define LUSTRE_EBADRQC 56 /* Invalid request code */ +#define LUSTRE_EBADSLT 57 /* Invalid slot */ +#define LUSTRE_EBFONT 59 /* Bad font file format */ +#define LUSTRE_ENOSTR 60 /* Device not a stream */ +#define LUSTRE_ENODATA 61 /* No data available */ +#define LUSTRE_ETIME 62 /* Timer expired */ +#define LUSTRE_ENOSR 63 /* Out of streams resources */ +#define LUSTRE_ENONET 64 /* Machine is not on the network */ +#define LUSTRE_ENOPKG 65 /* Package not installed */ +#define LUSTRE_EREMOTE 66 /* Object is remote */ +#define LUSTRE_ENOLINK 67 /* Link has been severed */ +#define LUSTRE_EADV 68 /* Advertise error */ +#define LUSTRE_ESRMNT 69 /* Srmount error */ +#define LUSTRE_ECOMM 70 /* Communication error on send */ +#define 
LUSTRE_EPROTO 71 /* Protocol error */ +#define LUSTRE_EMULTIHOP 72 /* Multihop attempted */ +#define LUSTRE_EDOTDOT 73 /* RFS specific error */ +#define LUSTRE_EBADMSG 74 /* Not a data message */ +#define LUSTRE_EOVERFLOW 75 /* Value too large for defined data + type */ +#define LUSTRE_ENOTUNIQ 76 /* Name not unique on network */ +#define LUSTRE_EBADFD 77 /* File descriptor in bad state */ +#define LUSTRE_EREMCHG 78 /* Remote address changed */ +#define LUSTRE_ELIBACC 79 /* Can not access a needed shared + library */ +#define LUSTRE_ELIBBAD 80 /* Accessing a corrupted shared + library */ +#define LUSTRE_ELIBSCN 81 /* .lib section in a.out corrupted */ +#define LUSTRE_ELIBMAX 82 /* Attempting to link in too many shared + libraries */ +#define LUSTRE_ELIBEXEC 83 /* Cannot exec a shared library + directly */ +#define LUSTRE_EILSEQ 84 /* Illegal byte sequence */ +#define LUSTRE_ERESTART 85 /* Interrupted system call should be + restarted */ +#define LUSTRE_ESTRPIPE 86 /* Streams pipe error */ +#define LUSTRE_EUSERS 87 /* Too many users */ +#define LUSTRE_ENOTSOCK 88 /* Socket operation on non-socket */ +#define LUSTRE_EDESTADDRREQ 89 /* Destination address required */ +#define LUSTRE_EMSGSIZE 90 /* Message too long */ +#define LUSTRE_EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define LUSTRE_ENOPROTOOPT 92 /* Protocol not available */ +#define LUSTRE_EPROTONOSUPPORT 93 /* Protocol not supported */ +#define LUSTRE_ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define LUSTRE_EOPNOTSUPP 95 /* Operation not supported on transport + endpoint */ +#define LUSTRE_EPFNOSUPPORT 96 /* Protocol family not supported */ +#define LUSTRE_EAFNOSUPPORT 97 /* Address family not supported by + protocol */ +#define LUSTRE_EADDRINUSE 98 /* Address already in use */ +#define LUSTRE_EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define LUSTRE_ENETDOWN 100 /* Network is down */ +#define LUSTRE_ENETUNREACH 101 /* Network is unreachable */ +#define LUSTRE_ENETRESET 102 /* Network dropped connection because of + reset */ +#define LUSTRE_ECONNABORTED 103 /* Software caused connection abort */ +#define LUSTRE_ECONNRESET 104 /* Connection reset by peer */ +#define LUSTRE_ENOBUFS 105 /* No buffer space available */ +#define LUSTRE_EISCONN 106 /* Transport endpoint is already + connected */ +#define LUSTRE_ENOTCONN 107 /* Transport endpoint is not + connected */ +#define LUSTRE_ESHUTDOWN 108 /* Cannot send after transport endpoint + shutdown */ +#define LUSTRE_ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define LUSTRE_ETIMEDOUT 110 /* Connection timed out */ +#define LUSTRE_ECONNREFUSED 111 /* Connection refused */ +#define LUSTRE_EHOSTDOWN 112 /* Host is down */ +#define LUSTRE_EHOSTUNREACH 113 /* No route to host */ +#define LUSTRE_EALREADY 114 /* Operation already in progress */ +#define LUSTRE_EINPROGRESS 115 /* Operation now in progress */ +#define LUSTRE_ESTALE 116 /* Stale NFS file handle */ +#define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */ +#define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */ +#define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */ +#define LUSTRE_EISNAM 120 /* Is a named type file */ +#define LUSTRE_EREMOTEIO 121 /* Remote I/O error */ +#define LUSTRE_EDQUOT 122 /* Quota exceeded */ +#define LUSTRE_ENOMEDIUM 123 /* No medium found */ +#define LUSTRE_EMEDIUMTYPE 124 /* Wrong medium type */ +#define LUSTRE_ECANCELED 125 /* Operation Canceled */ +#define LUSTRE_ENOKEY 126 /* Required key not available */ +#define LUSTRE_EKEYEXPIRED 127 /* 
Key has expired */ +#define LUSTRE_EKEYREVOKED 128 /* Key has been revoked */ +#define LUSTRE_EKEYREJECTED 129 /* Key was rejected by service */ +#define LUSTRE_EOWNERDEAD 130 /* Owner died */ +#define LUSTRE_ENOTRECOVERABLE 131 /* State not recoverable */ +#define LUSTRE_ERESTARTSYS 512 +#define LUSTRE_ERESTARTNOINTR 513 +#define LUSTRE_ERESTARTNOHAND 514 /* restart if no handler.. */ +#define LUSTRE_ENOIOCTLCMD 515 /* No ioctl command */ +#define LUSTRE_ERESTART_RESTARTBLOCK 516 /* restart by calling + sys_restart_syscall */ +#define LUSTRE_EBADHANDLE 521 /* Illegal NFS file handle */ +#define LUSTRE_ENOTSYNC 522 /* Update synchronization mismatch */ +#define LUSTRE_EBADCOOKIE 523 /* Cookie is stale */ +#define LUSTRE_ENOTSUPP 524 /* Operation is not supported */ +#define LUSTRE_ETOOSMALL 525 /* Buffer or request is too small */ +#define LUSTRE_ESERVERFAULT 526 /* An untranslatable error occurred */ +#define LUSTRE_EBADTYPE 527 /* Type not supported by server */ +#define LUSTRE_EJUKEBOX 528 /* Request initiated, but will not + complete before timeout */ +#define LUSTRE_EIOCBQUEUED 529 /* iocb queued, will get completion + event */ + +/* + * Translations are optimized away on x86. Host errnos that shouldn't be put + * on wire could leak through as a result. Do not count on this side effect. + */ +#if !defined(__x86_64__) && !defined(__i386__) +#define LUSTRE_TRANSLATE_ERRNOS +#endif + +#ifdef LUSTRE_TRANSLATE_ERRNOS +unsigned int lustre_errno_hton(unsigned int h); +unsigned int lustre_errno_ntoh(unsigned int n); +#else +#define lustre_errno_hton(h) (h) +#define lustre_errno_ntoh(n) (n) +#endif + +#endif /* LUSTRE_ERRNO_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_export.h b/drivers/staging/lustrefsx/lustre/include/lustre_export.h new file mode 100644 index 0000000000000..aa627e60ffd8c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_export.h @@ -0,0 +1,501 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +/** \defgroup obd_export PortalRPC export definitions + * + * @{ + */ + +#ifndef __EXPORT_H +#define __EXPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include + +#include +#include +#include + +struct mds_client_data; +struct mdt_client_data; +struct mds_idmap_table; +struct mdt_idmap_table; + +/** + * Target-specific export data + */ +struct tg_export_data { + /** Protects ted_lcd, ted_reply_* and + * ted_release_* fields below */ + struct mutex ted_lcd_lock; + /** Per-client data for each export */ + struct lsd_client_data *ted_lcd; + /** Offset of record in last_rcvd file */ + loff_t ted_lr_off; + /** Client index in last_rcvd file */ + int ted_lr_idx; + + /** + * ted_nodemap_lock is used to ensure that the nodemap is not destroyed + * between the time that ted_nodemap is checked for NULL, and a + * reference is taken. Modifications to ted_nodemap require that the + * active_config_lock and the nodemap(s)'s nm_member_list_lock be + * taken, as well as ted_nodemap_lock, so the export can be properly + * added to or removed from the nodemap's member list. When an export + * is added to a nodemap, a reference on that nodemap must be taken. + * That reference can be put only after ted_nodemap no longer refers to + * it. + */ + spinlock_t ted_nodemap_lock; + struct lu_nodemap *ted_nodemap; + struct list_head ted_nodemap_member; + + /** last version of nodemap config sent to client */ + __u64 ted_nodemap_version; + + /* Every reply data fields below are + * protected by ted_lcd_lock */ + /** List of reply data */ + struct list_head ted_reply_list; + int ted_reply_cnt; + /** Reply data with highest transno is retained */ + struct tg_reply_data *ted_reply_last; + /* Statistics */ + int ted_reply_max; /* high water mark */ + int ted_release_xid; + int ted_release_tag; + /* grants */ + long ted_dirty; /* in bytes */ + long ted_grant; /* in bytes */ + long ted_pending; /* bytes just being written */ + __u8 ted_pagebits; /* log2 of client page size */ + + /** + * File Modification Data (FMD) tracking + */ + spinlock_t ted_fmd_lock; /* protects ted_fmd_list */ + struct list_head ted_fmd_list; /* FIDs being modified */ + int ted_fmd_count;/* items in ted_fmd_list */ +}; + +/** + * MDT-specific export data + */ +struct mdt_export_data { + struct tg_export_data med_ted; + /** List of all files opened by client on this MDT */ + struct list_head med_open_head; + spinlock_t med_open_lock; /* med_open_head, mfd_list */ +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_locks; +}; + +/* In-memory access to client data from OST struct */ +/** Filter (oss-side) specific import data */ +struct filter_export_data { + struct tg_export_data fed_ted; + __u64 fed_lastid_gen; + /* count of SOFT_SYNC RPCs, which will be reset after + * ofd_soft_sync_limit number of RPCs, and trigger a sync. */ + atomic_t fed_soft_sync_count; + __u32 fed_group; +}; + +struct mgs_export_data { + struct list_head med_clients; /* mgc fs client via this exp */ + spinlock_t med_lock; /* protect med_clients */ +}; + +/** + * per-NID statistics structure. 
+ * It tracks access patterns to this export on a per-client-NID basis + */ +struct nid_stat { + lnet_nid_t nid; + struct hlist_node nid_hash; + struct list_head nid_list; + struct obd_device *nid_obd; + struct proc_dir_entry *nid_proc; + struct lprocfs_stats *nid_stats; + struct lprocfs_stats *nid_ldlm_stats; + atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash + exp_nid_stats */ +}; + +#define nidstat_getref(nidstat) \ +do { \ + atomic_inc(&(nidstat)->nid_exp_ref_count); \ +} while(0) + +#define nidstat_putref(nidstat) \ +do { \ + atomic_dec(&(nidstat)->nid_exp_ref_count); \ + LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \ + "stat %p nid_exp_ref_count < 0\n", nidstat); \ +} while(0) + +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + +/** + * Export structure. Represents target-side of connection in portals. + * Also used in Lustre to connect between layers on the same node when + * there is no network-connection in-between. + * For every connected client there is an export structure on the server + * attached to the same obd device. + */ +struct obd_export { + /** + * Export handle, it's id is provided to client on connect + * Subsequent client RPCs contain this handle id to identify + * what export they are talking to. + */ + struct portals_handle exp_handle; + atomic_t exp_refcount; + /** + * Set of counters below is to track where export references are + * kept. The exp_rpc_count is used for reconnect handling also, + * the cb_count and locks_count are for debug purposes only for now. + * The sum of them should be less than exp_refcount by 3 + */ + atomic_t exp_rpc_count; /* RPC references */ + atomic_t exp_cb_count; /* Commit callback references */ + /** Number of queued replay requests to be processes */ + atomic_t exp_replay_count; + atomic_t exp_locks_count; /** Lock references */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS + struct list_head exp_locks_list; + spinlock_t exp_locks_list_guard; +#endif + /** UUID of client connected to this export */ + struct obd_uuid exp_client_uuid; + /** To link all exports on an obd device */ + struct list_head exp_obd_chain; + /** work_struct for destruction of export */ + struct work_struct exp_zombie_work; + /* Unlinked export list */ + struct list_head exp_stale_list; + struct hlist_node exp_uuid_hash; /** uuid-export hash*/ + struct hlist_node exp_nid_hash; /** nid-export hash */ + struct hlist_node exp_gen_hash; /** last_rcvd clt gen hash */ + /** + * All exports eligible for ping evictor are linked into a list + * through this field in "most time since last request on this export" + * order + * protected by obd_dev_lock + */ + struct list_head exp_obd_chain_timed; + /** Obd device of this export */ + struct obd_device *exp_obd; + /** + * "reverse" import to send requests (e.g. from ldlm) back to client + * exp_lock protect its change + */ + struct obd_import *exp_imp_reverse; + struct nid_stat *exp_nid_stats; + /** Active connetion */ + struct ptlrpc_connection *exp_connection; + /** Connection count value from last successful reconnect rpc */ + __u32 exp_conn_cnt; + /** Hash list of all ldlm locks granted on this export */ + struct cfs_hash *exp_lock_hash; + /** + * Hash list for Posix lock deadlock detection, added with + * ldlm_lock::l_exp_flock_hash. 
+ */ + struct cfs_hash *exp_flock_hash; + struct list_head exp_outstanding_replies; + struct list_head exp_uncommitted_replies; + spinlock_t exp_uncommitted_replies_lock; + /** Last committed transno for this export */ + __u64 exp_last_committed; + /** When was last request received */ + time64_t exp_last_request_time; + /** On replay all requests waiting for replay are linked here */ + struct list_head exp_req_replay_queue; + /** + * protects exp_flags, exp_outstanding_replies and the change + * of exp_imp_reverse + */ + spinlock_t exp_lock; + /** Compatibility flags for this export are embedded into + * exp_connect_data */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. */ + exp_need_mne_swab:1, + /* The export already got final replay ping + * request. */ + exp_replay_done:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + time64_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ + spinlock_t exp_rpc_lock; + struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ + struct list_head exp_reg_rpcs; /* RPC being handled */ + + /** blocking dlm lock list, protected by exp_bl_list_lock */ + struct list_head exp_bl_list; + spinlock_t exp_bl_list_lock; + + /** Target specific data */ + union { + struct tg_export_data eu_target_data; + struct mdt_export_data eu_mdt_data; + struct filter_export_data eu_filter_data; + struct ec_export_data eu_ec_data; + struct mgs_export_data eu_mgs_data; + } u; + + struct adaptive_timeout exp_bl_lock_at; + + /** highest XID received by export client that has no + * unreceived lower-numbered XID + */ + __u64 exp_last_xid; +}; + +#define exp_target_data u.eu_target_data +#define exp_mdt_data u.eu_mdt_data +#define exp_filter_data u.eu_filter_data +#define exp_ec_data u.eu_ec_data + +static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags; +} + +static inline __u64 exp_connect_flags(struct obd_export *exp) +{ + return *exp_connect_flags_ptr(exp); +} + +static inline __u64 *exp_connect_flags2_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags2; +} + +static inline __u64 exp_connect_flags2(struct obd_export *exp) +{ + if (exp_connect_flags(exp) & OBD_CONNECT_FLAGS2) + return *exp_connect_flags2_ptr(exp); + return 0; +} + +static inline int exp_max_brw_size(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) + return exp->exp_connect_data.ocd_brw_size; + + return ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_multibulk(struct obd_export *exp) +{ + return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_cancelset(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); +} + +static inline int exp_connect_lru_resize(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE); +} + +static inline int 
exp_connect_vbr(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT(exp->exp_connection); + return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); +} + +static inline int exp_connect_umask(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); +} + +static inline int imp_connect_lru_resize(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_layout(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); +} + +static inline bool exp_connect_lvb_type(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_lvb_type(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_disp_stripe(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; +} + +static inline bool imp_connect_shortio(struct obd_import *imp) +{ + struct obd_connect_data *ocd = &imp->imp_connect_data; + + return ocd->ocd_connect_flags & OBD_CONNECT_SHORTIO; +} + +static inline __u64 exp_connect_ibits(struct obd_export *exp) +{ + struct obd_connect_data *ocd; + + ocd = &exp->exp_connect_data; + return ocd->ocd_ibits_known; +} + +static inline int exp_connect_large_acl(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LARGE_ACL); +} + +static inline int exp_connect_lockahead_old(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LOCKAHEAD_OLD); +} + +static inline int exp_connect_lockahead(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCKAHEAD); +} + +static inline int exp_connect_flr(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_FLR); +} + +static inline int exp_bypass_mdll(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL_BYPASS); +} + +static inline int exp_mdll(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL); +} + +static inline int exp_connect_lock_convert(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCK_CONVERT); +} + +extern struct obd_export *class_conn2export(struct lustre_handle *conn); + +static inline int exp_connect_archive_id_array(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_ARCHIVE_ID_ARRAY); +} + +static inline int exp_connect_sepol(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_SELINUX_POLICY); +} + +enum { + /* archive_ids in array format */ + KKUC_CT_DATA_ARRAY_MAGIC = 0x092013cea, + /* archive_ids in bitmap format */ + KKUC_CT_DATA_BITMAP_MAGIC = 0x082018cea, +}; + + +struct kkuc_ct_data { + __u32 kcd_magic; + __u32 kcd_nr_archives; + __u32 kcd_archives[0]; +}; + +/** @} export */ + +#endif /* __EXPORT_H */ +/** @} obd_export */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h new file mode 100644 index 0000000000000..ea6d743b1aaae --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h @@ -0,0 
+1,951 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_fid.h + * + * Author: Yury Umanets + */ + +#ifndef __LUSTRE_FID_H +#define __LUSTRE_FID_H + +/** \defgroup fid fid + * + * @{ + * + * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs + * describes the FID namespace and interoperability requirements for FIDs. + * The important parts of that document are included here for reference. + * + * FID + * File IDentifier generated by client from range allocated by the SEQuence + * service and stored in struct lu_fid. The FID is composed of three parts: + * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem + * unique 64-bit integer, and only one client is ever assigned any SEQ value. + * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved + * for system use. The OID component is a 32-bit value generated by the + * client on a per-SEQ basis to allow creating many unique FIDs without + * communication with the server. The VER component is a 32-bit value that + * distinguishes between different FID instantiations, such as snapshots or + * separate subtrees within the filesystem. FIDs with the same VER field + * are considered part of the same namespace. + * + * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and + * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while + * OSTs use 64-bit Lustre object IDs and generation numbers. + * + * NEW filesystems are those formatted since the introduction of FIDs. + * + * IGIF + * Inode and Generation In FID, a surrogate FID used to globally identify + * an existing object on OLD formatted MDT file system. This would only be + * used on MDT0 in a DNE filesystem, because there cannot be more than one + * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] + * range, where inode number is stored in SEQ, and inode generation is in OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. It also assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF + * object ID In FID, a surrogate FID used to globally identify an existing + * OST object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. 
Sequence number is calculated as: + * + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object + * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to + * be identified in the FLD correctly. The OID field is calculated as: + * + * objid & 0xffffffff + * + * that is, it consists of lower 32 bits of object ID. For objects within + * the IDIF range, object ID extraction will be: + * + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + * + * NOTE: This assumes that no more than 2^48-1 objects have ever been created + * on any OST, and that no more than 65535 OSTs are in use. Both are very + * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming + * a maximum creation rate of 1M objects per second for a maximum of 9 years, + * or combinations thereof. + * + * OST_MDT0 + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved SEQuence 0, and is used prior to + * the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG + * For Lustre Log objects the object sequence 1 is used. This is compatible + * with both OLD and NEW namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * ECHO + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW namespaces, as this SEQ number is in + * the ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * OST_MDT1 .. OST_MAX + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total + * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. However, this SEQ range is only for testing prior to any + * production DNE release, as the objects in this range conflict across all + * OSTs, as the OST index is not part of the FID. For production DNE usage, + * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs. + * + * DLM OST objid to IDIF mapping + * For compatibility with existing OLD OST network protocol structures, the + * FID must map onto the o_id and o_seq in a manner that ensures existing + * objects are identified consistently for IO, as well as onto the LDLM + * namespace to ensure IDIFs there is only a single resource name for any + * object in the DLM. The OLD OST object DLM resource mapping is: + * + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * The NEW OST object DLM resource mapping is the same for both MDT and OST: + * + * resource[] = {SEQ, OID, VER, HASH}; + * + * NOTE: for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, + * in all production releases the OLD o_seq field is always zero, and all + * valid FID OID values are non-zero, so the lock resources will not collide. + * Even so, the MDT and OST resources are also in different LDLM namespaces. 
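+ *
+ * Worked example (illustrative values only, not taken from any real
+ * deployment): with ost_index = 1 and objid = 0x123456789A, the IDIF
+ * formulas above give
+ *
+ *  SEQ = 1 << 32 | (1 << 16) | ((0x123456789A >> 32) & 0xffff)
+ *      = 0x100010012
+ *  OID = 0x123456789A & 0xffffffff = 0x3456789A
+ *
+ * which places the FID inside the reserved IDIF sequence range
+ * [2^32, 2^33 - 1] described above.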
+ */ + +#include +#include +#include +#include + +struct lu_env; +struct lu_site; +struct lu_context; +struct obd_device; +struct obd_export; + +/* Whole sequences space range and zero range definitions */ +extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; +extern const struct lu_fid LUSTRE_BFL_FID; +extern const struct lu_fid LU_OBF_FID; +extern const struct lu_fid LU_LPF_FID; +extern const struct lu_fid LU_DOT_LUSTRE_FID; +extern const struct lu_fid LU_BACKEND_LPF_FID; + +enum { + /* + * This is how may metadata FIDs may be allocated in one sequence(128k) + */ + LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL, + + /* + * This is how many data FIDs could be allocated in one sequence(4B - 1) + */ + LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL, + + /* + * How many sequences to allocate to a client at once. + */ + LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL, + + /* + * seq allocation pool size. + */ + LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000, + + /* + * This is how many sequences may be in one super-sequence allocated to + * MDTs. + */ + LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) +}; + +/** special OID for local objects */ +enum local_oid { + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ + MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + /** \see mdt_mod_init */ + LAST_RECV_OID = 11UL, + OSD_FS_ROOT_OID = 13UL, + ACCT_USER_OID = 15UL, + ACCT_GROUP_OID = 16UL, + LFSCK_BOOKMARK_OID = 17UL, + OTABLE_IT_OID = 18UL, + OSD_LPF_OID = 19UL, + REPLY_DATA_OID = 21UL, + ACCT_PROJECT_OID = 22UL, + INDEX_BACKUP_OID = 4116UL, + OFD_LAST_GROUP_OID = 4117UL, + LLOG_CATALOGS_OID = 4118UL, + MGS_CONFIGS_OID = 4119UL, + OFD_HEALTH_CHECK_OID = 4120UL, + MDD_LOV_OBJ_OSEQ = 4121UL, + LFSCK_NAMESPACE_OID = 4122UL, + REMOTE_PARENT_DIR_OID = 4123UL, + /* This definition is obsolete + * SLAVE_LLOG_CATALOGS_OID = 4124UL, + */ + BATCHID_COMMITTED_OID = 4125UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + +static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_NAME; + fid->f_oid = oid; + fid->f_ver = 0; +} + +/* For new FS (>= 2.4), the root FID will be changed to + * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4), + * the root FID will still be IGIF */ +static inline int fid_is_root(const struct lu_fid *fid) +{ + return unlikely((fid_seq(fid) == FID_SEQ_ROOT && + fid_oid(fid) == FID_OID_ROOT)); +} + +static inline int fid_is_dot_lustre(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE); +} + +static inline int fid_is_obf(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); +} + +static inline int fid_is_otable_it(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid(fid) == OTABLE_IT_OID); +} + +static inline int fid_oid_is_quota(const struct lu_fid *fid) +{ + switch (fid_oid(fid)) { + case ACCT_USER_OID: + case ACCT_GROUP_OID: + case ACCT_PROJECT_OID: + return 1; + default: + return 0; + } +} + +static inline int fid_is_acct(const struct lu_fid *fid) +{ 
+ return fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid_is_quota(fid); +} + +static inline int fid_is_quota(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_QUOTA || + fid_seq(fid) == FID_SEQ_QUOTA_GLB; +} + +static inline int fid_is_name_llog(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LLOG_NAME; +} + +static inline int fid_is_namespace_visible(const struct lu_fid *fid) +{ + const __u64 seq = fid_seq(fid); + + /* Here, we cannot distinguish whether the normal FID is for OST + * object or not. It is caller's duty to check more if needed. */ + return (!fid_is_last_id(fid) && + (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) || + fid_is_root(fid) || fid_seq_is_dot(seq); +} + +static inline int fid_seq_in_fldb(__u64 seq) +{ + return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || + fid_seq_is_root(seq) || fid_seq_is_dot(seq); +} + +static inline void ost_layout_cpu_to_le(struct ost_layout *dst, + const struct ost_layout *src) +{ + dst->ol_stripe_size = __cpu_to_le32(src->ol_stripe_size); + dst->ol_stripe_count = __cpu_to_le32(src->ol_stripe_count); + dst->ol_comp_start = __cpu_to_le64(src->ol_comp_start); + dst->ol_comp_end = __cpu_to_le64(src->ol_comp_end); + dst->ol_comp_id = __cpu_to_le32(src->ol_comp_id); +} + +static inline void ost_layout_le_to_cpu(struct ost_layout *dst, + const struct ost_layout *src) +{ + dst->ol_stripe_size = __le32_to_cpu(src->ol_stripe_size); + dst->ol_stripe_count = __le32_to_cpu(src->ol_stripe_count); + dst->ol_comp_start = __le64_to_cpu(src->ol_comp_start); + dst->ol_comp_end = __le64_to_cpu(src->ol_comp_end); + dst->ol_comp_id = __le32_to_cpu(src->ol_comp_id); +} + +static inline void filter_fid_cpu_to_le(struct filter_fid *dst, + const struct filter_fid *src, int size) +{ + fid_cpu_to_le(&dst->ff_parent, &src->ff_parent); + + if (size < sizeof(struct filter_fid)) { + memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); + } else { + ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout); + dst->ff_layout_version = cpu_to_le32(src->ff_layout_version); + dst->ff_range = cpu_to_le32(src->ff_range); + } + + /* XXX: Add more if filter_fid is enlarged in the future. */ +} + +static inline void filter_fid_le_to_cpu(struct filter_fid *dst, + const struct filter_fid *src, int size) +{ + fid_le_to_cpu(&dst->ff_parent, &src->ff_parent); + + if (size < sizeof(struct filter_fid)) { + memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); + } else { + ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout); + dst->ff_layout_version = le32_to_cpu(src->ff_layout_version); + dst->ff_range = le32_to_cpu(src->ff_range); + } + + /* XXX: Add more if filter_fid is enlarged in the future. */ +} + +static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq, __u32 ost_idx) +{ + if (fid_seq_is_mdt0(seq)) { + fid->f_seq = fid_idif_seq(0, ost_idx); + } else { + LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) || + fid_seq_is_idif(seq), "%#llx\n", seq); + fid->f_seq = seq; + } + fid->f_oid = 0; + fid->f_ver = 0; +} + +static inline bool fid_is_md_operative(const struct lu_fid *fid) +{ + return fid_is_mdt0(fid) || fid_is_igif(fid) || + fid_is_norm(fid) || fid_is_root(fid); +} + +/* seq client type */ +enum lu_cli_type { + LUSTRE_SEQ_METADATA = 1, + LUSTRE_SEQ_DATA +}; + +enum lu_mgr_type { + LUSTRE_SEQ_SERVER, + LUSTRE_SEQ_CONTROLLER +}; + +struct lu_server_seq; + +/* Client sequence manager interface. */ +struct lu_client_seq { + /* Sequence-controller export. 
*/ + struct obd_export *lcs_exp; + struct mutex lcs_mutex; + + /* + * Range of sequences allowed for allocation. When using lu_client_seq on + * clients, this contains the meta-sequence range. And for servers this + * contains the super-sequence range. + */ + struct lu_seq_range lcs_space; + + /* Seq related debugfs */ + struct dentry *lcs_debugfs_entry; + + /* This holds last allocated fid in last obtained seq */ + struct lu_fid lcs_fid; + + /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ + enum lu_cli_type lcs_type; + + /* + * Service uuid, passed from MDT + seq name to form a unique seq name + * for use with debugfs. + */ + char lcs_name[80]; + + /* + * Sequence width, that is how many objects may be allocated in one + * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH. + */ + __u64 lcs_width; + + /* Seq-server for direct talking */ + struct lu_server_seq *lcs_srv; + + /* wait queue for fid allocation and update indicator */ + wait_queue_head_t lcs_waitq; + int lcs_update; +}; + +/* server sequence manager interface */ +struct lu_server_seq { + /* Available sequences space */ + struct lu_seq_range lss_space; + + /* keeps highwater in lsr_end for seq allocation algorithm */ + struct lu_seq_range lss_lowater_set; + struct lu_seq_range lss_hiwater_set; + + /* + * Device for server side seq manager needs (saving sequences to backing + * store). + */ + struct dt_device *lss_dev; + + /* /seq file object device */ + struct dt_object *lss_obj; + + /* Seq related debugfs */ + struct dentry *lss_debugfs_entry; + + /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ + enum lu_mgr_type lss_type; + + /* Client interface to request controller */ + struct lu_client_seq *lss_cli; + + /* Mutex for protecting allocation */ + struct mutex lss_mutex; + + /* + * Service uuid, passed from MDT + seq name to form a unique seq name + * for use with debugfs. + */ + char lss_name[80]; + + /* + * Allocation chunks for super and meta sequences. Default values are + * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. + */ + __u64 lss_width; + + /* + * minimum lss_alloc_set size that should be allocated from + * lss_space + */ + __u64 lss_set_width; + + /* sync is needed for update operation */ + __u32 lss_need_sync; + + /** + * Pointer to site object, required to access site fld. + */ + struct seq_server_site *lss_site; +}; + +struct seq_server_site { + struct lu_site *ss_lu; + /** + * mds number of this site.
+ */ + u32 ss_node_id; + /** + * Fid location database + */ + struct lu_server_fld *ss_server_fld; + struct lu_client_fld *ss_client_fld; + + /** + * Server Seq Manager + */ + struct lu_server_seq *ss_server_seq; + + /** + * Controller Seq Manager + */ + struct lu_server_seq *ss_control_seq; + struct obd_export *ss_control_exp; + + /** + * Client Seq Manager + */ + struct lu_client_seq *ss_client_seq; +}; + +/* Server methods */ + +int seq_server_init(const struct lu_env *env, + struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_set_cli(const struct lu_env *env, + struct lu_server_seq *seq, + struct lu_client_seq *cli); + +int seq_server_check_and_alloc_super(const struct lu_env *env, + struct lu_server_seq *seq); +/* Client methods */ +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv); + +void seq_client_fini(struct lu_client_seq *seq); + +void seq_client_flush(struct lu_client_seq *seq); + +int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, + struct lu_fid *fid); +int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq, + u64 *seqnr); +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss); +/* Fids common stuff */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); + +enum lu_cli_type; +int client_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type); +int client_fid_fini(struct obd_device *obd); + +/* fid locking */ + +struct ldlm_namespace; + +/* + * Build (DLM) resource name from FID. + * + * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * renaming name[2,3] fields that need to be used for the quota identifier. + */ +static inline void +fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res) +{ + memset(res, 0, sizeof(*res)); + res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid); + res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid); +} + +/* + * Return true if resource is for object identified by FID. + */ +static inline int fid_res_name_eq(const struct lu_fid *fid, + const struct ldlm_res_id *res) +{ + return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) && + res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid); +} + +/* + * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name(). + */ +static inline void +fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res) +{ + fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; + fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]); + fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + LASSERT(fid_res_name_eq(fid, res)); +} + +/* + * Build (DLM) resource identifier from global quota FID and quota ID. 
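+ *
+ * Informational note (derived from fid_extract_from_res_name() above,
+ * not additional protocol state): each VER_OID slot packs a FID as
+ * ((__u64)fid_ver(fid) << 32) | fid_oid(fid), so the four name[] slots
+ * filled in below carry both the global quota FID and the quota ID
+ * (qid->qid_fid).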
+ */ +static inline void +fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(glb_fid, res); + res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); + res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); +} + +/* + * Extract global FID and quota ID from resource name + */ +static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid, + union lquota_id *qid, + const struct ldlm_res_id *res) +{ + fid_extract_from_res_name(glb_fid, res); + qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF]; + qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF]; + qid->qid_fid.f_ver = + (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); +} + +static inline void +fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(fid, res); + res->name[LUSTRE_RES_ID_HSH_OFF] = hash; +} + +/** + * Build DLM resource name from object id & seq; this will eventually be + * removed when ost_id is replaced with FID in the data stack. + * + * Currently, a resid from an old client, whose res[0] = object_id and + * res[1] = object_seq, is just the opposite of the metadata + * resid, where res[0] = fid->f_seq and res[1] = fid->f_oid. + * To unify the resid identification, we will reverse the data + * resid to keep it the same as the metadata resid, i.e. + * + * For a resid from an old client, + * res[0] = objid, res[1] = 0, still keeping the original order + * for compatibility. + * + * For a new resid, + * res will be built from the normal FID directly, i.e. res[0] = f_seq, + * res[1] = f_oid + f_ver. + */ +static inline void ostid_build_res_name(const struct ost_id *oi, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof *name); + if (fid_seq_is_mdt0(ostid_seq(oi))) { + name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi); + } else { + fid_build_reg_res_name(&oi->oi_fid, name); + } +} + +/** + * Return true if the resource is for the object identified by this id & group. + */ +static inline bool ostid_res_name_eq(const struct ost_id *oi, + const struct ldlm_res_id *name) +{ + /* Note: it is just a trick here to save some effort; probably the + * correct way would be to turn them into FIDs and compare */ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi); + } else { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi); + } +} + +/** + * Note: we need to check oi_seq to decide where to set oi_id, + * so oi_seq should always be set ahead of oi_id.
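+ *
+ * A minimal ordering sketch (illustrative only; 'objid' and 'rc' are
+ * placeholder names, and ostid_set_seq_mdt0() is used the same way by
+ * fid_to_ostid() below):
+ *
+ *   struct ost_id oi;
+ *   int rc;
+ *
+ *   ostid_set_seq_mdt0(&oi);        // set the sequence first
+ *   rc = ostid_set_id(&oi, objid);  // then the id; -E2BIG if too large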
+ */ +static inline int ostid_set_id(struct ost_id *oi, __u64 oid) +{ + if (fid_seq_is_mdt0(oi->oi.oi_seq)) { + if (oid >= IDIF_MAX_OID) + return -E2BIG; + oi->oi.oi_id = oid; + } else if (fid_is_idif(&oi->oi_fid)) { + if (oid >= IDIF_MAX_OID) + return -E2BIG; + oi->oi_fid.f_seq = fid_idif_seq(oid, + fid_idif_ost_idx(&oi->oi_fid)); + oi->oi_fid.f_oid = oid; + oi->oi_fid.f_ver = oid >> 48; + } else { + if (oid >= OBIF_MAX_OID) + return -E2BIG; + oi->oi_fid.f_oid = oid; + } + return 0; +} + +/* pack any OST FID into an ostid (id/seq) for the wire/disk */ +static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) +{ + int rc = 0; + + if (fid_seq_is_igif(fid->f_seq)) + return -EBADF; + + if (fid_is_idif(fid)) { + ostid_set_seq_mdt0(ostid); + rc = ostid_set_id(ostid, fid_idif_id(fid_seq(fid), + fid_oid(fid), fid_ver(fid))); + } else { + ostid->oi_fid = *fid; + } + + return rc; +} + +/* The same as osc_build_res_name() */ +static inline void ost_fid_build_resid(const struct lu_fid *fid, + struct ldlm_res_id *resname) +{ + if (fid_is_mdt0(fid) || fid_is_idif(fid)) { + struct ost_id oi; + oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */ + if (fid_to_ostid(fid, &oi) != 0) + return; + ostid_build_res_name(&oi, resname); + } else { + fid_build_reg_res_name(fid, resname); + } +} + +static inline void ost_fid_from_resid(struct lu_fid *fid, + const struct ldlm_res_id *name, + int ost_idx) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) { + /* old resid */ + struct ost_id oi; + + memset(&oi, 0, sizeof(oi)); + ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + if (ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF])) { + CERROR("Bad %llu to set " DOSTID "\n", + name->name[LUSTRE_RES_ID_SEQ_OFF], POSTID(&oi)); + } + ostid_to_fid(fid, &oi, ost_idx); + } else { + /* new resid */ + fid_extract_from_res_name(fid, name); + } +} + +/** + * Flatten 128-bit FID values into a 64-bit value for use as an inode number. + * For non-IGIF FIDs this starts just over 2^32, and continues without + * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ + * into the range where there may not be many OID values in use, to minimize + * the risk of conflict. + * + * Suppose LUSTRE_SEQ_MAX_WIDTH less than (1 << 24) which is currently true, + * the time between re-used inode numbers is very long - 2^40 SEQ numbers, + * or about 2^40 client mounts, if clients create less than 2^24 files/mount. + */ +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + __u64 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid); + + ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); + + return ino ?: fid_oid(fid); +} + +static inline __u32 fid_hash(const struct lu_fid *f, int bits) +{ + /* all objects with same id and different versions will belong to same + * collisions list. */ + return hash_long(fid_flatten(f), bits); +} + +/** + * map fid to 32 bit value for ino on 32bit systems. */ +static inline __u32 fid_flatten32(const struct lu_fid *fid) +{ + __u32 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid) - FID_SEQ_START; + + /* Map the high bits of the OID into higher bits of the inode number so + * that inodes generated at about the same time have a reduced chance + * of collisions. 
This will give a period of 2^12 = 1024 unique clients + * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects + * (from OID), or up to 128M inodes without collisions for new files. */ + ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + + (seq >> (64 - (40-8)) & 0xffffff00) + + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); + + return ino ?: fid_oid(fid); +} + +static inline int +lu_fid_diff(const struct lu_fid *fid1, const struct lu_fid *fid2) +{ + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", + PFID(fid1), PFID(fid2)); + + if (fid_is_idif(fid1) && fid_is_idif(fid2)) + return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - + fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); + + return fid_oid(fid1) - fid_oid(fid2); +} + +static inline int fid_set_id(struct lu_fid *fid, u64 oid) +{ + if (unlikely(fid_seq_is_igif(fid->f_seq))) { + CERROR("bad IGIF, "DFID"\n", PFID(fid)); + return -EBADF; + } + + if (fid_is_idif(fid)) { + if (oid >= IDIF_MAX_OID) { + CERROR("Too large OID %#llx to set IDIF "DFID"\n", + (unsigned long long)oid, PFID(fid)); + return -EBADF; + } + fid->f_seq = fid_idif_seq(oid, fid_idif_ost_idx(fid)); + fid->f_oid = oid; + fid->f_ver = oid >> 48; + } else { + if (oid > OBIF_MAX_OID) { + CERROR("Too large OID %#llx to set REG "DFID"\n", + (unsigned long long)oid, PFID(fid)); + return -EBADF; + } + fid->f_oid = oid; + } + return 0; +} + +#define LUSTRE_SEQ_SRV_NAME "seq_srv" +#define LUSTRE_SEQ_CTL_NAME "seq_ctl" + +/* Range common stuff */ +static inline void +range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_index = cpu_to_le32(src->lsr_index); + dst->lsr_flags = cpu_to_le32(src->lsr_flags); +} + +static inline void +range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_index = le32_to_cpu(src->lsr_index); + dst->lsr_flags = le32_to_cpu(src->lsr_flags); +} + +static inline void +range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_index = cpu_to_be32(src->lsr_index); + dst->lsr_flags = cpu_to_be32(src->lsr_flags); +} + +static inline void +range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = be64_to_cpu(src->lsr_end); + dst->lsr_index = be32_to_cpu(src->lsr_index); + dst->lsr_flags = be32_to_cpu(src->lsr_flags); +} + +static inline void range_array_cpu_to_le(struct lu_seq_range_array *dst, + const struct lu_seq_range_array *src) +{ + __u32 i; + + for (i = 0; i < src->lsra_count; i++) + range_cpu_to_le(&dst->lsra_lsr[i], &src->lsra_lsr[i]); + + dst->lsra_count = cpu_to_le32(src->lsra_count); +} + +static inline void range_array_le_to_cpu(struct lu_seq_range_array *dst, + const struct lu_seq_range_array *src) +{ + __u32 i; + + dst->lsra_count = le32_to_cpu(src->lsra_count); + for (i = 0; i < dst->lsra_count; i++) + range_le_to_cpu(&dst->lsra_lsr[i], &src->lsra_lsr[i]); +} + +/** @} fid */ + +#endif /* __LUSTRE_FID_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h new file mode 100644 index 0000000000000..102dcfac77480 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h @@ -0,0 +1,199 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_FLD_H +#define __LINUX_FLD_H + +/** \defgroup fld fld + * + * @{ + */ + +#include +#include +#include + +struct lu_env; +struct lu_client_fld; +struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; +struct thandle; +struct dt_device; +struct dt_object; + +/* + * FLD (Fid Location Database) interface. + */ +enum { + LUSTRE_CLI_FLD_HASH_DHT = 0, + LUSTRE_CLI_FLD_HASH_RRB +}; + +struct lu_fld_target { + struct list_head ft_chain; + struct obd_export *ft_exp; + struct lu_server_fld *ft_srv; + __u64 ft_idx; +}; + +struct lu_server_fld { + /** + * Fld dir debugfs entry. + */ + struct dentry *lsf_debugfs_entry; + + /** + * /fld file object device */ + struct dt_object *lsf_obj; + + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ + struct fld_cache *lsf_cache; + + /** + * Protect index modifications */ + struct mutex lsf_lock; + + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" */ + char lsf_name[80]; + + int (*lsf_seq_lookup)(const struct lu_env *env, + struct lu_server_fld *fld, u64 seq, + struct lu_seq_range *range); + + /** + * Just reformatted or upgraded, and this flag is being + * used to check whether the local FLDB is needs to be + * synced with global FLDB(in MDT0), and it is only needed + * if the MDT is upgraded from < 2.6 to 2.6, i.e. when the + * local FLDB is being invited */ + unsigned int lsf_new:1; + +}; + +struct lu_client_fld { + /** + * Client side debugfs entry. + */ + struct dentry *lcf_debugfs_entry; + + /** + * List of exports client FLD knows about. */ + struct list_head lcf_targets; + + /** + * Current hash to be used to chose an export. */ + struct lu_fld_hash *lcf_hash; + + /** + * Exports count. */ + int lcf_count; + + /** + * Lock protecting exports list and fld_hash. */ + spinlock_t lcf_lock; + + /** + * Client FLD cache. */ + struct fld_cache *lcf_cache; + + /** + * Client fld debugfs entry name. 
+ */ + char lcf_name[80]; +}; + +/* Server methods */ +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int type); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range, + struct thandle *th); + +int fld_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *add_range, + struct thandle *th); + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range); + +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_update_from_controller(const struct lu_env *env, + struct lu_server_fld *fld); + +/* Client methods */ +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash); + +void fld_client_fini(struct lu_client_fld *fld); + +void fld_client_flush(struct lu_client_fld *fld); + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env); + +int fld_client_create(struct lu_client_fld *fld, + struct lu_seq_range *range, + const struct lu_env *env); + +int fld_client_delete(struct lu_client_fld *fld, u64 seq, + const struct lu_env *env); + +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar); + +int fld_client_del_target(struct lu_client_fld *fld, + __u64 idx); + +void fld_client_debugfs_fini(struct lu_client_fld *fld); + +/** @} fld */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h new file mode 100644 index 0000000000000..2cb4969b615bf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h @@ -0,0 +1,60 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _LUSTRE_HA_H +#define _LUSTRE_HA_H + +/** \defgroup ha ha + * + * @{ + */ + +struct obd_import; +struct obd_export; +struct obd_device; +struct ptlrpc_request; + + +int ptlrpc_replay(struct obd_import *imp); +int ptlrpc_resend(struct obd_import *imp); +void ptlrpc_free_committed(struct obd_import *imp); +void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full); +void ptlrpc_deactivate_import(struct obd_import *imp); +void ptlrpc_invalidate_import(struct obd_import *imp); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); +void ptlrpc_pinger_force(struct obd_import *imp); +/** @} ha */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_handles.h b/drivers/staging/lustrefsx/lustre/include/lustre_handles.h new file mode 100644 index 0000000000000..16917caccdb7c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_handles.h @@ -0,0 +1,88 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_HANDLES_H_ +#define __LUSTRE_HANDLES_H_ + +/** \defgroup handles handles + * + * @{ + */ + +#include +#include +#include + +struct portals_handle_ops { + void (*hop_addref)(void *object); + void (*hop_free)(void *object, int size); +}; + +/* These handles are most easily used by having them appear at the very top of + * whatever object that you want to make handles for. ie: + * + * struct ldlm_lock { + * struct portals_handle handle; + * ... + * }; + * + * Now you're able to assign the results of cookie2handle directly to an + * ldlm_lock. If it's not at the top, you'll want to use container_of() + * to compute the start of the structure based on the handle field. */ +struct portals_handle { + struct list_head h_link; + __u64 h_cookie; + const void *h_owner; + struct portals_handle_ops *h_ops; + + /* newly added fields to handle the RCU issue. 
-jxiong */ + struct rcu_head h_rcu; + spinlock_t h_lock; + unsigned int h_size:31; + unsigned int h_in:1; +}; + +/* handles.c */ + +/* Add a handle to the hash table */ +void class_handle_hash(struct portals_handle *, + struct portals_handle_ops *ops); +void class_handle_unhash(struct portals_handle *); +void class_handle_hash_back(struct portals_handle *); +void *class_handle2object(__u64 cookie, const void *owner); +void class_handle_free_cb(struct rcu_head *rcu); +int class_handle_init(void); +void class_handle_cleanup(void); + +/** @} handles */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h new file mode 100644 index 0000000000000..a8c5a218b6c7d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h @@ -0,0 +1,71 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_IDMAP_H +#define _LUSTRE_IDMAP_H + +/** \defgroup idmap idmap + * + * @{ + */ + +#include + +#ifdef HAVE_GROUP_INFO_GID + +#define CFS_GROUP_AT(gi, i) ((gi)->gid[(i)]) + +#else /* !HAVE_GROUP_INFO_GID */ + +#define CFS_NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) + +#define CFS_GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK]) + +#endif /* HAVE_GROUP_INFO_GID */ + +#include + +struct lu_ucred; + +extern void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist); +extern void lustre_groups_sort(struct group_info *group_info); +extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp); + +/** @} idmap */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_import.h b/drivers/staging/lustrefsx/lustre/include/lustre_import.h new file mode 100644 index 0000000000000..430fde2e92738 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_import.h @@ -0,0 +1,400 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup obd_import PtlRPC import definitions + * Imports are client-side representation of remote obd target. + * + * @{ + */ + +#ifndef __IMPORT_H +#define __IMPORT_H + +/** \defgroup export export + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Adaptive Timeout stuff + * + * @{ + */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time64_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + unsigned int at_current; /* current timeout value */ + unsigned int at_worst_ever; /* worst-ever timeout value */ + time64_t at_worst_time; /* worst-ever timeout timestamp */ + spinlock_t at_lock; +}; + +enum lustre_at_flags { + LATF_SKIP = 0x0, + LATF_STATS = 0x1, +}; + +struct ptlrpc_at_array { + struct list_head *paa_reqs_array; /** array to hold requests */ + __u32 paa_size; /** the size of array */ + __u32 paa_count; /** the total count of reqs */ + time64_t paa_deadline; /** the earliest deadline of reqs */ + __u32 *paa_reqs_count; /** the count of reqs in each entry */ +}; + +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + + +/** @} */ + +/** Possible import states */ +enum lustre_imp_state { + LUSTRE_IMP_CLOSED = 1, + LUSTRE_IMP_NEW = 2, + LUSTRE_IMP_DISCON = 3, + LUSTRE_IMP_CONNECTING = 4, + LUSTRE_IMP_REPLAY = 5, + LUSTRE_IMP_REPLAY_LOCKS = 6, + LUSTRE_IMP_REPLAY_WAIT = 7, + LUSTRE_IMP_RECOVER = 8, + LUSTRE_IMP_FULL = 9, + LUSTRE_IMP_EVICTED = 10, + LUSTRE_IMP_IDLE = 11, + LUSTRE_IMP_LAST +}; + +/** Returns test string representation of numeric import state \a state */ +static inline char * ptlrpc_import_state_name(enum lustre_imp_state state) +{ + static char *import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", "IDLE", + }; + + LASSERT(state < LUSTRE_IMP_LAST); + return import_state_names[state]; +} + +/** + * List of import event types + */ +enum obd_import_event { + IMP_EVENT_DISCON = 0x808001, + IMP_EVENT_INACTIVE = 0x808002, + IMP_EVENT_INVALIDATE = 0x808003, + IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, + IMP_EVENT_DEACTIVATE = 0x808006, + IMP_EVENT_ACTIVATE = 0x808007, +}; + +/** + * Definition of import connection structure + */ +struct obd_import_conn { + /** Item for linking connections together */ + struct list_head oic_item; + /** Pointer 
to actual PortalRPC connection */ + struct ptlrpc_connection *oic_conn; + /** uuid of remote side */ + struct obd_uuid oic_uuid; + /** + * Time (64 bit seconds) of last connection attempt on this connection + */ + time64_t oic_last_attempt; +}; + +/* state history */ +#define IMP_STATE_HIST_LEN 16 +struct import_state_hist { + enum lustre_imp_state ish_state; + time64_t ish_time; +}; + +/** + * Defintion of PortalRPC import structure. + * Imports are representing client-side view to remote target. + */ +struct obd_import { + /** Reference counter */ + atomic_t imp_refcount; + struct lustre_handle imp_dlm_handle; /* client's ldlm export */ + /** Currently active connection */ + struct ptlrpc_connection *imp_connection; + /** PortalRPC client structure for this import */ + struct ptlrpc_client *imp_client; + /** List element for linking into pinger chain */ + struct list_head imp_pinger_chain; + /** work struct for destruction of import */ + struct work_struct imp_zombie_work; + + /** + * Lists of requests that are retained for replay, waiting for a reply, + * or waiting for recovery to complete, respectively. + * @{ + */ + struct list_head imp_replay_list; + struct list_head imp_sending_list; + struct list_head imp_delayed_list; + /** @} */ + + /** + * List of requests that are retained for committed open replay. Once + * open is committed, open replay request will be moved from the + * imp_replay_list into the imp_committed_list. + * The imp_replay_cursor is for accelerating searching during replay. + * @{ + */ + struct list_head imp_committed_list; + struct list_head *imp_replay_cursor; + /** @} */ + + /** List of not replied requests */ + struct list_head imp_unreplied_list; + /** Known maximal replied XID */ + __u64 imp_known_replied_xid; + + /** obd device for this import */ + struct obd_device *imp_obd; + + /** + * some seciruty-related fields + * @{ + */ + struct ptlrpc_sec *imp_sec; + struct mutex imp_sec_mutex; + time64_t imp_sec_expire; + pid_t imp_sec_refpid; + /** @} */ + + /** Wait queue for those who need to wait for recovery completion */ + wait_queue_head_t imp_recovery_waitq; + + /** Number of requests allocated */ + atomic_t imp_reqs; + /** Number of requests currently in-flight */ + atomic_t imp_inflight; + /** Number of requests currently unregistering */ + atomic_t imp_unregistering; + /** Number of replay requests inflight */ + atomic_t imp_replay_inflight; + /** In-flight replays rate control */ + wait_queue_head_t imp_replay_waitq; + + /** Number of currently happening import invalidations */ + atomic_t imp_inval_count; + /** Numbner of request timeouts */ + atomic_t imp_timeouts; + /** Current import state */ + enum lustre_imp_state imp_state; + /** Last replay state */ + enum lustre_imp_state imp_replay_state; + /** History of import states */ + struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN]; + int imp_state_hist_idx; + /** Current import generation. 
Incremented on every reconnect */ + int imp_generation; + /** Idle connection initiated at this generation */ + int imp_initiated_at; + /** Incremented every time we send reconnection request */ + __u32 imp_conn_cnt; + /** + * \see ptlrpc_free_committed remembers imp_generation value here + * after a check to save on unnecessary replay list iterations + */ + int imp_last_generation_checked; + /** Last tranno we replayed */ + __u64 imp_last_replay_transno; + /** Last transno committed on remote side */ + __u64 imp_peer_committed_transno; + /** + * \see ptlrpc_free_committed remembers last_transno since its last + * check here and if last_transno did not change since last run of + * ptlrpc_free_committed and import generation is the same, we can + * skip looking for requests to remove from replay list as optimisation + */ + __u64 imp_last_transno_checked; + /** + * Remote export handle. This is how remote side knows what export + * we are talking to. Filled from response to connect request + */ + struct lustre_handle imp_remote_handle; + /** When to perform next ping. time in jiffies. */ + time64_t imp_next_ping; + /** When we last successfully connected. time in 64bit jiffies */ + time64_t imp_last_success_conn; + + /** List of all possible connection for import. */ + struct list_head imp_conn_list; + /** + * Current connection. \a imp_connection is imp_conn_current->oic_conn + */ + struct obd_import_conn *imp_conn_current; + + /** Protects flags, level, generation, conn_cnt, *_list */ + spinlock_t imp_lock; + + /* flags */ + unsigned long imp_no_timeout:1, /* timeouts are disabled */ + imp_invalid:1, /* evicted */ + /* administratively disabled */ + imp_deactive:1, + /* try to recover the import */ + imp_replayable:1, + /* don't run recovery (timeout instead) */ + imp_dlm_fake:1, + /* use 1/2 timeout on MDS' OSCs */ + imp_server_timeout:1, + /* VBR: imp in delayed recovery */ + imp_delayed_recovery:1, + /* recovery by versions was failed */ + imp_vbr_failed:1, + /* force an immidiate ping */ + imp_force_verify:1, + /* force a scheduled ping */ + imp_force_next_verify:1, + /* pingable */ + imp_pingable:1, + /* resend for replay */ + imp_resend_replay:1, + /* disable normal recovery, for test only. 
*/ + imp_no_pinger_recover:1, + /* import must be reconnected instead of + * chouse new connection */ + imp_force_reconnect:1, + /* import has tried to connect with server */ + imp_connect_tried:1, + /* connected but not FULL yet */ + imp_connected:1, + /* grant shrink disabled */ + imp_grant_shrink_disabled:1, + /* to supress LCONSOLE() at conn.restore */ + imp_was_idle:1; + u32 imp_connect_op; + u32 imp_idle_timeout; + u32 imp_idle_debug; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + __u64 imp_connect_flags2_orig; + int imp_connect_error; + + enum lustre_msg_magic imp_msg_magic; + /* adjusted based on server capability */ + enum lustre_msghdr imp_msghdr_flags; + + /* adaptive timeout data */ + struct imp_at imp_at; + time64_t imp_last_reply_time; /* for health check */ +}; + +/* import.c */ +static inline unsigned int at_est2timeout(unsigned int val) +{ + /* add an arbitrary minimum: 125% +5 sec */ + return (val + (val >> 2) + 5); +} + +static inline timeout_t at_timeout2est(timeout_t timeout) +{ + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(timeout > 0); + return max((timeout << 2) / 5, 5) - 4; +} + +static inline void at_reset_nolock(struct adaptive_timeout *at, int val) +{ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = ktime_get_real_seconds(); +} + +static inline void at_reset(struct adaptive_timeout *at, int val) +{ + spin_lock(&at->at_lock); + at_reset_nolock(at, val); + spin_unlock(&at->at_lock); +} + +static inline void at_init(struct adaptive_timeout *at, int val, int flags) { + memset(at, 0, sizeof(*at)); + spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, val); +} + +static inline void at_reinit(struct adaptive_timeout *at, int val, int flags) +{ + spin_lock(&at->at_lock); + at->at_binstart = 0; + memset(at->at_hist, 0, sizeof(at->at_hist)); + at->at_flags = flags; + at_reset_nolock(at, val); + spin_unlock(&at->at_lock); +} + +extern unsigned int at_min; +static inline int at_get(struct adaptive_timeout *at) { + return (at->at_current > at_min) ? at->at_current : at_min; +} +int at_measured(struct adaptive_timeout *at, unsigned int val); +int import_at_get_index(struct obd_import *imp, int portal); +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + +/* genops.c */ +struct obd_export; +extern struct obd_import *class_exp2cliimp(struct obd_export *); + +/** @} import */ + +#endif /* __IMPORT_H */ + +/** @} obd_import */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_intent.h b/drivers/staging/lustrefsx/lustre/include/lustre_intent.h new file mode 100644 index 0000000000000..76dcd8878985a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_intent.h @@ -0,0 +1,68 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
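The adaptive-timeout helpers in the import header above (import_at_get_index(), at_get(), at_est2timeout()) are normally combined when deriving an RPC timeout from the per-portal service estimate. A minimal sketch using only names declared in that header; the caller and the portal argument are assumptions, not part of the original code:

static inline unsigned int example_import_timeout(struct obd_import *imp,
						  int portal)
{
	/* index of this portal's slot inside imp_at */
	int idx = import_at_get_index(imp, portal);
	/* current service estimate, clamped to the global at_min floor */
	unsigned int est = at_get(&imp->imp_at.iat_service_estimate[idx]);

	/* pad the estimate by 25% plus 5 seconds, as at_est2timeout() does */
	return at_est2timeout(est);
}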
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LUSTRE_INTENT_H +#define LUSTRE_INTENT_H + +/* intent IT_XXX are defined in lustre/include/obd.h */ + +struct lookup_intent { + int it_op; + int it_create_mode; + __u64 it_flags; + int it_disposition; + int it_status; + __u64 it_lock_handle; + __u64 it_lock_bits; + int it_lock_mode; + int it_remote_lock_mode; + __u64 it_remote_lock_handle; + struct ptlrpc_request *it_request; + unsigned int it_lock_set:1; +}; + +static inline int it_disposition(const struct lookup_intent *it, int flag) +{ + return it->it_disposition & flag; +} + +static inline void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition |= flag; +} + +static inline void it_clear_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition &= ~flag; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h new file mode 100644 index 0000000000000..4af88af0edf87 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. 
+ */ + +#ifndef __LUSTRE_KERNELCOMM_H__ +#define __LUSTRE_KERNELCOMM_H__ + +/* For declarations shared with userspace */ +#include + +/* prototype for callback function on kuc groups */ +typedef int (*libcfs_kkuc_cb_t)(void *data, void *cb_arg); + +/* Kernel methods */ +void libcfs_kkuc_init(void); +int libcfs_kkuc_msg_put(struct file *fp, void *payload); +int libcfs_kkuc_group_put(const struct obd_uuid *uuid, int group, void *data); +int libcfs_kkuc_group_add(struct file *fp, const struct obd_uuid *uuid, int uid, + int group, void *data, size_t data_len); +int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group); +int libcfs_kkuc_group_foreach(const struct obd_uuid *uuid, int group, + libcfs_kkuc_cb_t cb_func, void *cb_arg); + +#endif /* __LUSTRE_KERNELCOMM_H__ */ + diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h new file mode 100644 index 0000000000000..11409b97e66c8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h @@ -0,0 +1,130 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre_lfsck.h + * + * Lustre LFSCK exported functions. 
+ * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_H +# define _LUSTRE_LFSCK_H + +#include +#include +#include +#include + +struct lfsck_start_param { + struct lfsck_start *lsp_start; + __u32 lsp_index; + unsigned int lsp_index_valid:1; +}; + +/* For LE_PAIRS_VERIFY returned status */ +enum lfsck_pv_status { + LPVS_INIT = 0, + LPVS_INCONSISTENT = 1, + LPVS_INCONSISTENT_TOFIX = 2, +}; + +enum lfsck_events_local { + LEL_FID_ACCESSED = 1, + LEL_PAIRS_VERIFY_LOCAL = 2, +}; + +struct lfsck_req_local { + __u32 lrl_event; + __u32 lrl_status; + __u16 lrl_active; + __u16 lrl_padding0; + __u32 lrl_padding1; + struct lu_fid lrl_fid; + struct filter_fid lrl_ff_client; + struct filter_fid lrl_ff_local; +}; + +struct lfsck_layout_dangling_key { + struct lu_fid lldk_fid; + __u32 lldk_comp_id; + __u32 lldk_ea_off; +}; + +typedef int (*lfsck_out_notify)(const struct lu_env *env, void *data, + enum lfsck_events event); + +int lfsck_register_namespace(const struct lu_env *env, struct dt_device *key, + struct ldlm_namespace *ns); +int lfsck_register(const struct lu_env *env, struct dt_device *key, + struct dt_device *next, struct obd_device *obd, + lfsck_out_notify notify, void *notify_data, bool master); +void lfsck_degister(const struct lu_env *env, struct dt_device *key); + +int lfsck_add_target(const struct lu_env *env, struct dt_device *key, + struct dt_device *tgt, struct obd_export *exp, + __u32 index, bool for_ost); +void lfsck_del_target(const struct lu_env *env, struct dt_device *key, + struct dt_device *tgt, __u32 index, bool for_ost); + +int lfsck_start(const struct lu_env *env, struct dt_device *key, + struct lfsck_start_param *lsp); +int lfsck_stop(const struct lu_env *env, struct dt_device *key, + struct lfsck_stop *stop); +int lfsck_in_notify_local(const struct lu_env *env, struct dt_device *key, + struct lfsck_req_local *lrl, struct thandle *th); +int lfsck_in_notify(const struct lu_env *env, struct dt_device *key, + struct lfsck_request *lr); +int lfsck_query(const struct lu_env *env, struct dt_device *key, + struct lfsck_request *req, struct lfsck_reply *rep, + struct lfsck_query *que); + +int lfsck_get_speed(struct seq_file *m, char *buf, struct dt_device *key); +int lfsck_set_speed(struct dt_device *key, __u32 val); +int lfsck_get_windows(char *buf, struct dt_device *key); +int lfsck_set_windows(struct dt_device *key, unsigned int val); + +int lfsck_dump(struct seq_file *m, struct dt_device *key, enum lfsck_type type); + +static inline void lfsck_pack_rfa(struct lfsck_req_local *lrl, + const struct lu_fid *fid, + enum lfsck_events_local event, __u16 com) +{ + memset(lrl, 0, sizeof(*lrl)); + lrl->lrl_fid = *fid; + lrl->lrl_event = event; + lrl->lrl_active = com; +} + +static inline bool lovea_slot_is_dummy(const struct lov_ost_data_v1 *obj) +{ + /* zero area does not care about the bytes-order. */ + if (obj->l_ost_oi.oi.oi_id == 0 && obj->l_ost_oi.oi.oi_seq == 0 && + obj->l_ost_idx == 0 && obj->l_ost_gen == 0) + return true; + + return false; +} +#endif /* _LUSTRE_LFSCK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h new file mode 100644 index 0000000000000..f67791252056d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h @@ -0,0 +1,407 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
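lfsck_pack_rfa() above only fills a struct lfsck_req_local; a caller still hands the result to lfsck_in_notify_local(). A hedged sketch of that pairing, where the env, dt_device, transaction handle, FID and component index are all assumed to come from the caller:

static int example_notify_fid_accessed(const struct lu_env *env,
				       struct dt_device *key,
				       const struct lu_fid *fid,
				       __u16 com, struct thandle *th)
{
	struct lfsck_req_local lrl;

	/* fill the local request: event type, FID and component index */
	lfsck_pack_rfa(&lrl, fid, LEL_FID_ACCESSED, com);
	/* deliver it to the local LFSCK instance attached to 'key' */
	return lfsck_in_notify_local(env, key, &lrl, th);
}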
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LUSTRE_LIB_H +#define _LUSTRE_LIB_H + +/** \defgroup lib lib + * + * @{ + */ + +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif + +#include +#include +#include +#include + +/* target.c */ +struct ptlrpc_request; +struct obd_export; +struct lu_target; +struct l_wait_info; +#include +#include + +#define LI_POISON 0x5a5a5a5a +#if BITS_PER_LONG > 32 +# define LL_POISON 0x5a5a5a5a5a5a5a5aL +#else +# define LL_POISON 0x5a5a5a5aL +#endif +#define LP_POISON ((void *)LL_POISON) + +#ifdef HAVE_SERVER_SUPPORT +int rev_import_init(struct obd_export *exp); +int target_handle_connect(struct ptlrpc_request *req); +int target_handle_disconnect(struct ptlrpc_request *req); +void target_destroy_export(struct obd_export *exp); +void target_committed_to_req(struct ptlrpc_request *req); +void target_cancel_recovery_timer(struct obd_device *obd); +void target_stop_recovery_thread(struct obd_device *obd); +void target_cleanup_recovery(struct obd_device *obd); +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd); +int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, + struct l_wait_info *lwi); +#endif + +int target_pack_pool_reply(struct ptlrpc_request *req); +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + size_t keylen, void *key, + size_t vallen, void *val, + struct ptlrpc_request_set *set); + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); + +/* + * l_wait_event is a flexible sleeping function, permitting simple caller + * configuration of interrupt and timeout sensitivity along with actions to + * be performed in the event of either exception. + * + * The first form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler, + * intr_handler, callback_data); + * rc = l_wait_event(waitq, condition, &lwi); + * + * l_wait_event() makes the current process wait on 'waitq' until 'condition' + * is TRUE or a "killable" signal (SIGTERM, SIKGILL, SIGINT) is pending. It + * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before + * 'condition' becomes true, it optionally calls the specified 'intr_handler' + * if not NULL, and returns -EINTR. + * + * If a non-zero timeout is specified, signals are ignored until the timeout + * has expired. At this time, if 'timeout_handler' is not NULL it is called. 
+ * If it returns FALSE l_wait_event() continues to wait as described above with + * signals enabled. Otherwise it returns -ETIMEDOUT. + * + * LWI_INTR(intr_handler, callback_data) is shorthand for + * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data) + * + * The second form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * + * This form is the same as the first except that it COMPLETELY IGNORES + * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if + * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that + * can unblock the current process is 'condition' becoming TRUE. + * + * Another form of usage is: + * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval, + * timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * This is the same as previous case, but condition is checked once every + * 'interval' jiffies (if non-zero). + * + * Subtle synchronization point: this macro does *not* necessary takes + * wait-queue spin-lock before returning, and, hence, following idiom is safe + * ONLY when caller provides some external locking: + * + * Thread1 Thread2 + * + * l_wait_event(&obj->wq, ....); (1) + * + * wake_up(&obj->wq): (2) + * spin_lock(&q->lock); (2.1) + * __wake_up_common(q, ...); (2.2) + * spin_unlock(&q->lock, flags); (2.3) + * + * OBD_FREE_PTR(obj); (3) + * + * As l_wait_event() may "short-cut" execution and return without taking + * wait-queue spin-lock, some additional synchronization is necessary to + * guarantee that step (3) can begin only after (2.3) finishes. + * + * XXX nikita: some ptlrpc daemon threads have races of that sort. + * + */ +static inline int back_to_sleep(void *arg) +{ + return 0; +} + +#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) + +struct l_wait_info { + long lwi_timeout; + long lwi_interval; + int lwi_allow_intr; + int (*lwi_on_timeout)(void *); + void (*lwi_on_signal)(void *); + void *lwi_cb_data; +}; + +/* NB: LWI_TIMEOUT ignores signals completely */ +#define LWI_TIMEOUT(time, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = interval, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 1 \ +}) + +#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data) + +#define LUSTRE_FATAL_SIGS \ + (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \ + sigmask(SIGQUIT) | sigmask(SIGALRM)) + +/* + * Wait Queue + */ +#if !defined(HAVE___ADD_WAIT_QUEUE_EXCLUSIVE) && !defined(HAVE_WAIT_QUEUE_ENTRY) +static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, + wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); +} +#endif /* HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ + +/** + * wait_queue_t of 
Linux (version < 2.6.34) is a FIFO list for exclusively + * waiting threads, which is not always desirable because all threads will + * be waken up again and again, even user only needs a few of them to be + * active most time. This is not good for performance because cache can + * be polluted by different threads. + * + * LIFO list can resolve this problem because we always wakeup the most + * recent active thread by default. + * + * NB: please don't call non-exclusive & exclusive wait on the same + * waitq if add_wait_queue_exclusive_head is used. + */ +#define add_wait_queue_exclusive_head(waitq, link) \ +{ \ + unsigned long flags; \ + \ + spin_lock_irqsave(&((waitq)->lock), flags); \ + __add_wait_queue_exclusive(waitq, link); \ + spin_unlock_irqrestore(&((waitq)->lock), flags); \ +} + +/* + * wait for @condition to become true, but no longer than timeout, specified + * by @info. + */ +#define __l_wait_event(wq, condition, info, ret, l_add_wait) \ +do { \ + wait_queue_entry_t __wait; \ + long __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ + int __allow_intr = info->lwi_allow_intr; \ + \ + ret = 0; \ + if (condition) \ + break; \ + \ + init_waitqueue_entry(&__wait, current); \ + l_add_wait(&wq, &__wait); \ + \ + /* Block all signals (just the non-fatal ones if no timeout). */ \ + if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr)) \ + __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); \ + else \ + __blocked = cfs_block_sigsinv(0); \ + \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + \ + /* To guarantee that the condition check will be done */ \ + /* after setting the thread state as TASK_INTERRUPTIBLE. */ \ + /* Otherwise, out-of-order execution may cause some race. */ \ + /* Consider the following real execution order: */ \ + \ + /* 1. Thread1 checks condition on CPU1, gets false. */ \ + /* 2. Thread2 sets condition on CPU2. */ \ + /* 3. Thread2 calls wake_up() on CPU2 to wake the threads */ \ + /* with state TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE. */ \ + /* But the Thread1's state is TASK_RUNNING at that time. */ \ + /* 4. Thread1 sets its state as TASK_INTERRUPTIBLE on CPU1, */ \ + /* then schedule. */ \ + \ + /* If the '__timeout' variable is zero, the Thread1 will */ \ + /* have no chance to check the condition again. */ \ + \ + /* Generally, the interval between out-of-ordered step1 and */ \ + /* step4 is very tiny, as to above step2 and step3 cannot */ \ + /* happen. On some degree, it can explain why we seldom hit */ \ + /* related trouble. But such race really exists, especially */ \ + /* consider that the step1 and step4 can be interruptible. */ \ + /* So add barrier to avoid Thread1 out-of-order execution. */ \ + smp_mb(); \ + \ + if (condition) \ + break; \ + \ + if (__timeout == 0) { \ + schedule(); \ + } else { \ + long interval = info->lwi_interval ? \ + min_t(long, info->lwi_interval,\ + __timeout) : __timeout; \ + long remaining = schedule_timeout(interval); \ + \ + __timeout -= interval - remaining; \ + if (__timeout == 0) { \ + if (info->lwi_on_timeout == NULL || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ + break; \ + } \ + /* Take signals after the timeout expires. 
*/ \ + if (info->lwi_on_signal != NULL) \ + (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\ + } \ + } \ + \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + if (info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr)) { \ + if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \ + info->lwi_on_signal(info->lwi_cb_data);\ + ret = -EINTR; \ + break; \ + } \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. */ \ + cfs_clear_sigpending(); \ + } \ + } \ + \ + cfs_restore_sigs(__blocked); \ + \ + set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + + +#define l_wait_event(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue); \ + __ret; \ +}) + +#define l_wait_event_exclusive(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive); \ + __ret; \ +}) + +#define l_wait_event_exclusive_head(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive_head); \ + __ret; \ +}) + +#define l_wait_condition(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive_head(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive_head(wq, condition, &lwi); \ +}) + +/** @} lib */ + +#endif /* _LUSTRE_LIB_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h new file mode 100644 index 0000000000000..3bf6e2b54fd9b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * Use is subject to license terms. + * + * Author: di wang + */ + +/* There are several reasons to restrict the linkEA size: + * + * 1. Under DNE mode, if we do not restrict the linkEA size, and if there + * are too many cross-MDTs hard links to the same object, then it will + * casue the llog overflow. 
+ * + * 2. Some backend has limited size for EA. For example, if without large + * EA enabled, the ldiskfs will make all EAs to share one (4K) EA block. + * + * 3. Too many entries in linkEA will seriously affect linkEA performance + * because we only support to locate linkEA entry consecutively. */ +#define MAX_LINKEA_SIZE 4096 + +struct linkea_data { + /** + * Buffer to keep link EA body. + */ + struct lu_buf *ld_buf; + /** + * The matched header, entry and its lenght in the EA + */ + struct link_ea_header *ld_leh; + struct link_ea_entry *ld_lee; + int ld_reclen; +}; + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf); +int linkea_init(struct linkea_data *ldata); +int linkea_init_with_rec(struct linkea_data *ldata); +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid); +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid); +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname); +int linkea_links_new(struct linkea_data *ldata, struct lu_buf *buf, + const struct lu_name *cname, const struct lu_fid *pfid); +int linkea_overflow_shrink(struct linkea_data *ldata); +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); + +static inline void linkea_first_entry(struct linkea_data *ldata) +{ + LASSERT(ldata != NULL); + LASSERT(ldata->ld_leh != NULL); + + if (ldata->ld_leh->leh_reccount == 0) + ldata->ld_lee = NULL; + else + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); +} + +static inline void linkea_next_entry(struct linkea_data *ldata) +{ + LASSERT(ldata != NULL); + LASSERT(ldata->ld_leh != NULL); + + if (ldata->ld_lee != NULL) { + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + + ldata->ld_leh->leh_len)) + ldata->ld_lee = NULL; + } +} diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h new file mode 100644 index 0000000000000..091b80c59c7d9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h @@ -0,0 +1,239 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2016, Intel Corporation. + */ +/* + * lustre/include/lustre_lmv.h + * + * Lustre LMV structures and functions. 
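The linkea_first_entry()/linkea_next_entry() pair above is how callers walk the packed link entries; each record is decoded with linkea_entry_unpack(), which also refreshes ld_reclen so the next-entry step can advance. A short sketch, assuming ldata was already set up with linkea_init():

static void example_walk_links(struct linkea_data *ldata)
{
	struct lu_name name;
	struct lu_fid pfid;

	for (linkea_first_entry(ldata); ldata->ld_lee != NULL;
	     linkea_next_entry(ldata))
		/* decode one (parent FID, name) pair; this also sets
		 * ld_reclen, which linkea_next_entry() uses to step to
		 * the following record */
		linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen,
				    &name, &pfid);
}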
+ * + * Author: Di Wang + */ + +#ifndef _LUSTRE_LMV_H +#define _LUSTRE_LMV_H +#include + +struct lmv_oinfo { + struct lu_fid lmo_fid; + u32 lmo_mds; + struct inode *lmo_root; +}; + +struct lmv_stripe_md { + __u32 lsm_md_magic; + __u32 lsm_md_stripe_count; + __u32 lsm_md_master_mdt_index; + __u32 lsm_md_hash_type; + __u8 lsm_md_max_inherit; + __u8 lsm_md_max_inherit_rr; + __u32 lsm_md_layout_version; + __u32 lsm_md_migrate_offset; + __u32 lsm_md_migrate_hash; + __u32 lsm_md_default_count; + __u32 lsm_md_default_index; + char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_oinfo lsm_md_oinfo[0]; +}; + +static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_magic == LMV_MAGIC; +} + +static inline bool lmv_dir_migrating(const struct lmv_stripe_md *lsm) +{ + return lmv_dir_striped(lsm) && + lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; +} + +static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm) +{ + if (!lmv_dir_striped(lsm)) + return false; + + if (lmv_dir_migrating(lsm) && + lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset <= 1) + return false; + + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); +} + +static inline bool +lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) +{ + __u32 idx; + + if (lsm1->lsm_md_magic != lsm2->lsm_md_magic || + lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count || + lsm1->lsm_md_master_mdt_index != + lsm2->lsm_md_master_mdt_index || + lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || + lsm1->lsm_md_max_inherit != lsm2->lsm_md_max_inherit || + lsm1->lsm_md_max_inherit_rr != lsm2->lsm_md_max_inherit_rr || + lsm1->lsm_md_layout_version != + lsm2->lsm_md_layout_version || + lsm1->lsm_md_migrate_offset != + lsm2->lsm_md_migrate_offset || + lsm1->lsm_md_migrate_hash != + lsm2->lsm_md_migrate_hash || + strcmp(lsm1->lsm_md_pool_name, + lsm2->lsm_md_pool_name) != 0) + return false; + + if (lmv_dir_striped(lsm1)) { + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, + &lsm2->lsm_md_oinfo[idx].lmo_fid)) + return false; + } + } else if (lsm1->lsm_md_magic == LMV_USER_MAGIC_SPECIFIC) { + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (lsm1->lsm_md_oinfo[idx].lmo_mds != + lsm2->lsm_md_oinfo[idx].lmo_mds) + return false; + } + } + + return true; +} + +static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm) +{ + int i; + + /* If lsm_md_magic == LMV_MAGIC_FOREIGN pool_name may not be a null + * terminated string so only print LOV_MAXPOOLNAME bytes. 
+ */ + CDEBUG(mask, + "magic %#x stripe count %d master mdt %d hash type %#x max-inherit %hhu max-inherit-rr %hhu version %d migrate offset %d migrate hash %#x pool %.*s\n", + lsm->lsm_md_magic, lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, + lsm->lsm_md_hash_type, lsm->lsm_md_max_inherit, + lsm->lsm_md_max_inherit_rr, lsm->lsm_md_layout_version, + lsm->lsm_md_migrate_offset, lsm->lsm_md_migrate_hash, + LOV_MAXPOOLNAME, lsm->lsm_md_pool_name); + + if (!lmv_dir_striped(lsm)) + return; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) + CDEBUG(mask, "stripe[%d] "DFID"\n", + i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); +} + +union lmv_mds_md; + +void lmv_free_memmd(struct lmv_stripe_md *lsm); + +static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, + const struct lmv_mds_md_v1 *lmv_src) +{ + __u32 i; + + lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic); + lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count); + lmv_dst->lmv_master_mdt_index = + le32_to_cpu(lmv_src->lmv_master_mdt_index); + lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type); + lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version); + if (lmv_src->lmv_stripe_count > LMV_MAX_STRIPE_COUNT) + return; + for (i = 0; i < lmv_src->lmv_stripe_count; i++) + fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i], + &lmv_src->lmv_stripe_fids[i]); +} + +static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst, + const union lmv_mds_md *lmv_src) +{ + switch (le32_to_cpu(lmv_src->lmv_magic)) { + case LMV_MAGIC_V1: + lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1); + break; + default: + break; + } +} + +/* This hash is only for testing purpose */ +static inline unsigned int +lmv_hash_all_chars(unsigned int count, const char *name, int namelen) +{ + unsigned int c = 0; + const unsigned char *p = (const unsigned char *)name; + + while (--namelen >= 0) + c += p[namelen]; + + c = c % count; + + return c; +} + +static inline unsigned int +lmv_hash_fnv1a(unsigned int count, const char *name, int namelen) +{ + __u64 hash; + + hash = lustre_hash_fnv_1a_64(name, namelen); + + return do_div(hash, count); +} + +static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, + unsigned int stripe_count, + const char *name, int namelen) +{ + int idx; + + LASSERT(namelen > 0); + + if (stripe_count <= 1) + return 0; + + switch (lmv_hash_type & LMV_HASH_TYPE_MASK) { + case LMV_HASH_TYPE_ALL_CHARS: + idx = lmv_hash_all_chars(stripe_count, name, namelen); + break; + case LMV_HASH_TYPE_FNV_1A_64: + idx = lmv_hash_fnv1a(stripe_count, name, namelen); + break; + default: + idx = -EBADFD; + break; + } + + CDEBUG(D_INFO, "name %.*s hash_type %#x idx %d/%u\n", namelen, name, + lmv_hash_type, idx, stripe_count); + + return idx; +} + +static inline bool lmv_magic_supported(__u32 lum_magic) +{ + return lum_magic == LMV_USER_MAGIC || + lum_magic == LMV_USER_MAGIC_SPECIFIC; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log.h b/drivers/staging/lustrefsx/lustre/include/lustre_log.h new file mode 100644 index 0000000000000..f2522050f7337 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log.h @@ -0,0 +1,557 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
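As a concrete illustration of lmv_name_to_stripe_index() above: with the FNV-1a hash type and a 4-stripe directory, an entry name maps to one of stripes 0..3, while an unknown hash type yields -EBADFD. A minimal sketch; the name used is arbitrary:

static int example_pick_stripe(void)
{
	/* hash the 3-byte name "foo" into one of 4 stripes using FNV-1a */
	return lmv_name_to_stripe_index(LMV_HASH_TYPE_FNV_1A_64, 4,
					"foo", 3);
}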
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. + * These logs are used for: + * + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LUSTRE_LOG_H +#define _LUSTRE_LOG_H + +/** \defgroup log log + * + * @{ + */ + +#include +#include +#include +#include + +#define LOG_NAME_LIMIT(logname, name) \ + snprintf(logname, sizeof(logname), "LOGS/%s", name) +#define LLOG_EEMPTY 4711 + +enum llog_open_param { + LLOG_OPEN_EXISTS = 0x0000, + LLOG_OPEN_NEW = 0x0001, +}; + +struct plain_handle_data { + struct list_head phd_entry; + struct llog_handle *phd_cat_handle; + /* cookie of this log in its cat */ + struct llog_cookie phd_cookie; +}; + +struct cat_handle_data { + struct list_head chd_head; + struct llog_handle *chd_current_log;/* currently open log */ + struct llog_handle *chd_next_log; /* llog to be used next */ +}; + +struct llog_handle; + +/* llog.c - general API */ +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid); +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data); +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata); +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index); +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param); +int llog_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name); +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bak_ctxt, + char *name, char *backup); +int llog_read_header(const struct lu_env *env, struct llog_handle *handle, + const struct obd_uuid *uuid); +__u64 llog_size(const struct lu_env *env, struct llog_handle *llh); + +/* llog_process flags */ +#define LLOG_FLAG_NODEAMON 0x0001 + +/* llog_cat.c - catalog api */ +struct llog_process_data { + /** + * Any useful data needed while processing catalog. 
This is + * passed later to process callback. + */ + void *lpd_data; + /** + * Catalog process callback function, called for each record + * in catalog. + */ + llog_cb_t lpd_cb; + /** + * Start processing the catalog from startcat/startidx + */ + int lpd_startcat; + int lpd_startidx; +}; + +struct llog_process_cat_data { + /** + * Temporary stored first_idx while scanning log. + */ + int lpcd_first_idx; + /** + * Temporary stored last_idx while scanning log. + */ + int lpcd_last_idx; +}; + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + struct thandle *th); +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th); +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie); +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies); +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cat_cb, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork); +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx); +__u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh); +__u32 llog_cat_free_space(struct llog_handle *cat_llh); +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data); +/* llog_obd.c */ +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op); +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); + +/* llog_ioctl.c */ +struct obd_ioctl_data; +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data); +int llog_catalog_list(const struct lu_env *env, struct dt_device *d, + int count, struct obd_ioctl_data *data, + const struct lu_fid *fid); + +/* llog_net.c */ +int llog_initiator_connect(struct llog_ctxt *ctxt); + +struct llog_operations { + int (*lop_declare_destroy)(const struct lu_env *env, + struct llog_handle *handle, struct thandle *th); + int (*lop_destroy)(const struct lu_env *env, + struct llog_handle *handle, struct thandle *th); + int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h, + int *curr_idx, int next_idx, __u64 *offset, + void *buf, int len); + int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h, + int prev_idx, void *buf, int len); + int (*lop_read_header)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_setup)(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd); + int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, + int flags); + int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); + int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, + struct llog_gen *gen, struct obd_uuid *uuid); + /** + * Any llog file must be opened first using llog_open(). 
Llog can be + * opened by name, logid or without both, in last case the new logid + * will be generated. + */ + int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_logid *logid, char *name, + enum llog_open_param); + /** + * Opened llog may not exist and this must be checked where needed using + * the llog_exist() call. + */ + int (*lop_exist)(struct llog_handle *lgh); + /** + * Close llog file and calls llog_free_handle() implicitly. + * Any opened llog must be closed by llog_close() call. + */ + int (*lop_close)(const struct lu_env *env, struct llog_handle *handle); + /** + * Create new llog file. The llog must be opened. + * Must be used only for local llog operations. + */ + int (*lop_declare_create)(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th); + int (*lop_create)(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); + /** + * write new record in llog. It appends records usually but can edit + * existing records too. + */ + int (*lop_declare_write_rec)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, + int idx, struct thandle *th); + int (*lop_write_rec)(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *cookie, + int idx, struct thandle *th); + /** + * Add new record in llog catalog. Does the same as llog_write_rec() + * but using llog catalog. + */ + int (*lop_declare_add)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); + int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *cookie, + struct thandle *th); +}; + +/* In-memory descriptor for a log object or log catalog */ +struct llog_handle { + struct rw_semaphore lgh_lock; + struct mutex lgh_hdr_mutex; /* protect lgh_hdr data */ + struct llog_logid lgh_id; /* id of this log */ + struct llog_log_hdr *lgh_hdr; /* may be vmalloc'd */ + size_t lgh_hdr_size; + struct dt_object *lgh_obj; + /* For a Catalog, is the last/newest used index for a plain slot. + * Used in conjunction with llh_cat_idx to handle Catalog wrap-around + * case, after it will have reached LLOG_HDR_BITMAP_SIZE, llh_cat_idx + * will become its upper limit */ + int lgh_last_idx; + struct rw_semaphore lgh_last_sem; + __u64 lgh_cur_offset; /* used for test only */ + struct llog_ctxt *lgh_ctxt; + union { + struct plain_handle_data phd; + struct cat_handle_data chd; + } u; + char *lgh_name; + void *private_data; + struct llog_operations *lgh_logops; + atomic_t lgh_refcount; + + int lgh_max_size; + bool lgh_destroyed; +}; + +/* llog_osd.c */ +extern struct llog_operations llog_osd_ops; +extern struct llog_operations llog_common_cat_ops; +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid); +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid); + +#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 +#define LLOG_CTXT_FLAG_STOP 0x00000002 + +/* Indicate the llog objects under this context are normal FID objects, + * instead of objects with local FID. 
*/ +#define LLOG_CTXT_FLAG_NORMAL_FID 0x00000004 + +struct llog_ctxt { + int loc_idx; /* my index the obd array of ctxt's */ + struct obd_device *loc_obd; /* points back to the containing obd*/ + struct obd_llog_group *loc_olg; /* group containing that ctxt */ + struct obd_export *loc_exp; /* parent "disk" export (e.g. MDS) */ + struct obd_import *loc_imp; /* to use in RPC's: can be backward + pointing import */ + struct llog_operations *loc_logops; + struct llog_handle *loc_handle; + struct mutex loc_mutex; /* protect loc_imp */ + atomic_t loc_refcount; + long loc_flags; /* flags, see above defines */ + struct dt_object *loc_dir; + struct local_oid_storage *loc_los_nameless; + struct local_oid_storage *loc_los_named; + /* llog chunk size, and llog record size can not be bigger than + * loc_chunk_size */ + __u32 loc_chunk_size; +}; + +#define LLOG_PROC_BREAK 0x0001 +#define LLOG_DEL_RECORD 0x0002 +#define LLOG_DEL_PLAIN 0x0003 + +static inline int llog_obd2ops(struct llog_ctxt *ctxt, + struct llog_operations **lop) +{ + if (ctxt == NULL) + return -ENOTCONN; + + *lop = ctxt->loc_logops; + if (*lop == NULL) + return -EOPNOTSUPP; + + return 0; +} + +static inline int llog_handle2ops(struct llog_handle *loghandle, + struct llog_operations **lop) +{ + if (loghandle == NULL || loghandle->lgh_logops == NULL) + return -EINVAL; + + *lop = loghandle->lgh_logops; + return 0; +} + +static inline int llog_data_len(int len) +{ + return cfs_size_round(len); +} + +static inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} + +static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) +{ + atomic_inc(&ctxt->loc_refcount); + CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount)); + return ctxt; +} + +static inline void llog_ctxt_put(struct llog_ctxt *ctxt) +{ + if (ctxt == NULL) + return; + LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount) - 1); + __llog_ctxt_put(NULL, ctxt); +} + +static inline void llog_group_init(struct obd_llog_group *olg) +{ + init_waitqueue_head(&olg->olg_waitq); + spin_lock_init(&olg->olg_lock); + mutex_init(&olg->olg_cat_processing); +} + +static inline int llog_group_set_ctxt(struct obd_llog_group *olg, + struct llog_ctxt *ctxt, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] != NULL) { + spin_unlock(&olg->olg_lock); + return -EEXIST; + } + olg->olg_ctxts[index] = ctxt; + spin_unlock(&olg->olg_lock); + return 0; +} + +static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, + int index) +{ + struct llog_ctxt *ctxt; + + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] == NULL) + ctxt = NULL; + else + ctxt = llog_ctxt_get(olg->olg_ctxts[index]); + spin_unlock(&olg->olg_lock); + return ctxt; +} + +static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + spin_lock(&olg->olg_lock); + olg->olg_ctxts[index] = NULL; + spin_unlock(&olg->olg_lock); +} + +static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, + int index) +{ + return llog_group_get_ctxt(&obd->obd_olg, index); +} + +static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) +{ + return (olg->olg_ctxts[index] == 
NULL); +} + +static inline int llog_ctxt_null(struct obd_device *obd, int index) +{ + return (llog_group_ctxt_null(&obd->obd_olg, index)); +} + +static inline int llog_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_next_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, + cur_offset, buf, len); + RETURN(rc); +} + +static inline int llog_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_prev_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len); + RETURN(rc); +} + +static inline int llog_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, struct llog_gen *gen, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_obd2ops(ctxt, &lop); + if (rc) + RETURN(rc); + if (lop->lop_connect == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_connect(ctxt, logid, gen, uuid); + RETURN(rc); +} + +static inline int llog_is_full(struct llog_handle *llh) +{ + return llh->lgh_last_idx >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; +} + +struct llog_cfg_rec { + struct llog_rec_hdr lcr_hdr; + struct lustre_cfg lcr_cfg; + struct llog_rec_tail lcr_tail; +}; + +struct llog_cfg_rec *lustre_cfg_rec_new(int cmd, struct lustre_cfg_bufs *bufs); +void lustre_cfg_rec_free(struct llog_cfg_rec *lcr); + +enum { + LLOG_NEXT_IDX = -1, + LLOG_HEADER_IDX = 0, +}; + +/* llog.c */ +int llog_exist(struct llog_handle *loghandle); +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th); +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_destroy(const struct lu_env *env, struct llog_handle *handle); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th); +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int idx, struct thandle *th); +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + struct thandle *th); +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name); +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name); +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, int idx); + +/** @} log */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h new file mode 
100644 index 0000000000000..826eef7bc646f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h @@ -0,0 +1,152 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mdc.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDC_H +#define _LUSTRE_MDC_H + +/** \defgroup mdc mdc + * + * @{ + */ + +#include +#include +#ifdef CONFIG_FS_POSIX_ACL +# include +#endif /* CONFIG_FS_POSIX_ACL */ +#include +#include +#include +#include +#include +#include +#include +#include + +struct ptlrpc_client; +struct obd_export; +struct ptlrpc_request; +struct obd_device; + +static inline void mdc_get_mod_rpc_slot(struct ptlrpc_request *req, + struct lookup_intent *it) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + __u32 opc; + __u16 tag; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + tag = obd_get_mod_rpc_slot(cli, opc, it); + lustre_msg_set_tag(req->rq_reqmsg, tag); +} + +static inline void mdc_put_mod_rpc_slot(struct ptlrpc_request *req, + struct lookup_intent *it) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + __u32 opc; + __u16 tag; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + tag = lustre_msg_get_tag(req->rq_reqmsg); + obd_put_mod_rpc_slot(cli, opc, it, tag); +} + + +/** + * Update the maximum possible easize. + * + * This value is learned from ptlrpc replies sent by the MDT. The + * default easize is initialized to the minimum value but allowed to + * grow up to a single page in size if required to handle the common + * case. 
+ * + * \see client_obd::cl_default_mds_easize + * + * \param[in] exp export for MDC device + * \param[in] body body of ptlrpc reply from MDT + * + */ +static inline void mdc_update_max_ea_from_body(struct obd_export *exp, + struct mdt_body *body) +{ + if (body->mbo_valid & OBD_MD_FLMODEASIZE) { + struct client_obd *cli = &exp->exp_obd->u.cli; + __u32 def_easize; + + if (cli->cl_max_mds_easize < body->mbo_max_mdsize) + cli->cl_max_mds_easize = body->mbo_max_mdsize; + + def_easize = min_t(__u32, body->mbo_max_mdsize, + OBD_MAX_DEFAULT_EA_SIZE); + cli->cl_default_mds_easize = def_easize; + } +} + + +/* mdc/mdc_locks.c */ +int it_open_error(int phase, struct lookup_intent *it); + +static inline bool cl_is_lov_delay_create(unsigned int flags) +{ + return (flags & O_LOV_DELAY_CREATE_1_8) != 0 || + (flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK; +} + +static inline void cl_lov_delay_create_clear(unsigned int *flags) +{ + if ((*flags & O_LOV_DELAY_CREATE_1_8) != 0) + *flags &= ~O_LOV_DELAY_CREATE_1_8; + if ((*flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK) + *flags &= ~O_LOV_DELAY_CREATE_MASK; +} + +static inline bool cl_is_lu_noimport(unsigned int flags) +{ + return (flags & O_LU_NOIMPORT_MASK) == O_LU_NOIMPORT_MASK; +} + +static inline void cl_lu_noimport_clear(unsigned int *flags) +{ + if (cl_is_lu_noimport(*flags)) + *flags &= ~O_LU_NOIMPORT_MASK; +} + +/** @} mdc */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h new file mode 100644 index 0000000000000..cb43281574890 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mds.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. 
+ */ + +#ifndef _LUSTRE_MDS_H +#define _LUSTRE_MDS_H + +/** \defgroup mds mds + * + * @{ + */ + +#include +#include +#include +#include +#include +#include + +struct mds_group_info { + struct obd_uuid *uuid; + int group; +}; + +struct mds_capa_info { + struct obd_uuid *uuid; + struct lustre_capa_key *capa; +}; + +struct md_rejig_data { + struct md_object *mrd_obj; + __u16 mrd_mirror_id; +}; + +#define MDD_OBD_NAME "mdd_obd" +#define MDD_OBD_UUID "mdd_obd_uuid" + +static inline int md_should_create(u64 open_flags) +{ + return !(open_flags & MDS_OPEN_DELAY_CREATE) && + (open_flags & MDS_FMODE_WRITE) && + !(open_flags & MDS_OPEN_LEASE); +} + +/* do NOT or the MAY_*'s, you'll get the weakest */ +static inline int mds_accmode(u64 open_flags) +{ + int res = 0; + + if (open_flags & MDS_FMODE_READ) + res |= MAY_READ; + if (open_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC | MDS_OPEN_APPEND)) + res |= MAY_WRITE; + if (open_flags & MDS_FMODE_EXEC) + res = MAY_EXEC; + + return res; +} + +/** @} mds */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_net.h b/drivers/staging/lustrefsx/lustre/include/lustre_net.h new file mode 100644 index 0000000000000..3a94a921e11de --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_net.h @@ -0,0 +1,2737 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup PtlRPC Portal RPC and networking module. + * + * PortalRPC is the layer used by rest of lustre code to achieve network + * communications: establish connections with corresponding export and import + * states, listen for a service, send and receive RPCs. + * PortalRPC also includes base recovery framework: packet resending and + * replaying, reconnections, pinger. + * + * PortalRPC utilizes LNet as its transport layer. + * + * @{ + */ + + +#ifndef _LUSTRE_NET_H +#define _LUSTRE_NET_H + +/** \defgroup net net + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS 0 + +/** + * log2 max # of bulk operations in one request: 2=4MB/RPC, 5=32MB/RPC, ... + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. 
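As a rough worked example of how the limits defined just below combine (a standalone sketch, not part of the header; it assumes LNET_MTU_BITS = 20, i.e. a 1 MiB LNet MTU, and PAGE_SHIFT = 12, neither of which is defined in this hunk):

#include <stdio.h>

/* standalone illustration of the PTLRPC bulk maxima; the EX_* values
 * mirror PTLRPC_BULK_OPS_BITS below plus the assumed LNet/page sizes */
#define EX_LNET_MTU_BITS        20      /* assumption: 1 MiB LNet MTU */
#define EX_PTLRPC_BULK_OPS_BITS 6       /* as defined below */
#define EX_PAGE_SHIFT           12      /* assumption: 4 KiB pages */

int main(void)
{
        unsigned long ops   = 1UL << EX_PTLRPC_BULK_OPS_BITS;   /* 64 bulk MDs per RPC */
        unsigned long bytes = 1UL << (EX_LNET_MTU_BITS +
                                      EX_PTLRPC_BULK_OPS_BITS); /* 64 MiB per BRW RPC */
        unsigned long pages = bytes >> EX_PAGE_SHIFT;           /* 16384 pages */

        printf("%lu bulk ops, %lu MiB max BRW, %lu pages\n",
               ops, bytes >> 20, pages);
        return 0;
}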
+ * NOTE: This is limited to 16 (=64GB RPCs) by IOOBJ_MAX_BRW_BITS. */ +#define PTLRPC_BULK_OPS_BITS 6 +#if PTLRPC_BULK_OPS_BITS > 16 +#error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." +#endif +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. + */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1U << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_SHIFT) + +#define ONE_MB_BRW_SIZE (1U << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_DEF_BRW_SIZE (4 * ONE_MB_BRW_SIZE) +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_SHIFT) +#define OFD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) + +/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ +#if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +#endif +#if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_SIZE)) +# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_SIZE" +#endif +#if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_SIZE too big" +#endif +#if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_PAGES too big" +#endif + +#define PTLRPC_NTHRS_INIT 2 + +/** + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. + * + * ?_NBUFS # buffers to allocate when growing the pool + * ?_BUFSIZE # bytes in a single request buffer + * ?_MAXREQSIZE # maximum request service will receive + * + * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk + * of ?_NBUFS is added to the pool. + * + * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are + * considered full when less than ?_MAXREQSIZE is left in them. + */ +/** + * Thread Constants + * + * Constants determine how threads are created for ptlrpc service. + * + * ?_NTHRS_INIT # threads to create for each service partition on + * initializing. If it's non-affinity service and + * there is only one partition, it's the overall # + * threads for the service while initializing. + * ?_NTHRS_BASE # threads should be created at least for each + * ptlrpc partition to keep the service healthy. + * It's the low-water mark of threads upper-limit + * for each partition. + * ?_THR_FACTOR # threads can be added on threads upper-limit for + * each CPU core. This factor is only for reference, + * we might decrease value of factor if number of cores + * per CPT is above a limit. 
+ * ?_NTHRS_MAX # overall threads can be created for a service, + * it's a soft limit because if service is running + * on machine with hundreds of cores and tens of + * CPU partitions, we need to guarantee each partition + * has ?_NTHRS_BASE threads, which means total threads + * will be ?_NTHRS_BASE * number_of_cpts which can + * exceed ?_NTHRS_MAX. + * + * Examples + * + * #define MDS_NTHRS_INIT 2 + * #define MDS_NTHRS_BASE 64 + * #define MDS_NTHRS_FACTOR 8 + * #define MDS_NTHRS_MAX 1024 + * + * Example 1): + * --------------------------------------------------------------------- + * Server(A) has 16 cores, user configured it to 4 partitions so each + * partition has 4 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96 + * + * Total number of threads for the service is: + * 96 * partitions(4) = 384 + * + * Example 2): + * --------------------------------------------------------------------- + * Server(B) has 32 cores, user configured it to 4 partitions so each + * partition has 8 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128 + * + * Total number of threads for the service is: + * 128 * partitions(4) = 512 + * + * Example 3): + * --------------------------------------------------------------------- + * Server(B) has 96 cores, user configured it to 8 partitions so each + * partition has 12 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160 + * + * Total number of threads for the service is: + * 160 * partitions(8) = 1280 + * + * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number + * as upper limit of threads number for each partition: + * MDS_NTHRS_MAX(1024) / partitions(8) = 128 + * + * Example 4): + * --------------------------------------------------------------------- + * Server(C) have a thousand of cores and user configured it to 32 partitions + * MDS_NTHRS_BASE(64) * 32 = 2048 + * + * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need + * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads + * to keep service healthy, so total number of threads will just be 2048. + * + * NB: we don't suggest to choose server with that many cores because backend + * filesystem itself, buffer cache, or underlying network stack might + * have some SMP scalability issues at that large scale. + * + * If user already has a fat machine with hundreds or thousands of cores, + * there are two choices for configuration: + * a) create CPU table from subset of all CPUs and run Lustre on + * top of this subset + * b) bind service threads on a few partitions, see modparameters of + * MDS and OSS for details +* + * NB: these calculations (and examples below) are simplified to help + * understanding, the real implementation is a little more complex, + * please see ptlrpc_server_nthreads_check() for details. + * + */ + + /* + * LDLM threads constants: + * + * Given 8 as factor and 24 as base threads number + * + * example 1) + * On 4-core machine we will have 24 + 8 * 4 = 56 threads. + * + * example 2) + * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 + * threads for each partition and total threads number will be 112. 
+ * + * example 3) + * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) + * threads for each partition to keep service healthy, so total threads + * number should be 24 * 8 = 192. + * + * So with these constants, threads number will be at the similar level + * of old versions, unless target machine has over a hundred cores + */ +#define LDLM_THR_FACTOR 8 +#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT +#define LDLM_NTHRS_BASE 24 +#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128) + +#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT +#define LDLM_CLIENT_NBUFS 1 +#define LDLM_SERVER_NBUFS 64 +#define LDLM_BUFSIZE (8 * 1024) +#define LDLM_MAXREQSIZE (5 * 1024) +#define LDLM_MAXREPSIZE (1024) + + /* + * MDS threads constants: + * + * Please see examples in "Thread Constants", MDS threads number will be at + * the comparable level of old versions, unless the server has many cores. + */ +#ifndef MDS_MAX_THREADS +#define MDS_MAX_THREADS 1024 +#define MDS_MAX_OTHR_THREADS 256 + +#else /* MDS_MAX_THREADS */ +#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT +#undef MDS_MAX_THREADS +#define MDS_MAX_THREADS PTLRPC_NTHRS_INIT +#endif +#define MDS_MAX_OTHR_THREADS max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2) +#endif + +/* default service */ +#define MDS_THR_FACTOR 8 +#define MDS_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_NTHRS_MAX MDS_MAX_THREADS +#define MDS_NTHRS_BASE min(64, MDS_NTHRS_MAX) + +/* read-page service */ +#define MDS_RDPG_THR_FACTOR 4 +#define MDS_RDPG_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_RDPG_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_RDPG_NTHRS_BASE min(48, MDS_RDPG_NTHRS_MAX) + +/* these should be removed when we remove setattr service in the future */ +#define MDS_SETA_THR_FACTOR 4 +#define MDS_SETA_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_SETA_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_SETA_NTHRS_BASE min(48, MDS_SETA_NTHRS_MAX) + +/* non-affinity threads */ +#define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS + +#define MDS_NBUFS 64 + +/** + * Assume file name length = FNAME_MAX = 256 (true for ext3). + * path name length = PATH_MAX = 4096 + * LOV MD size max = EA_MAX = 24 * 2000 + * (NB: 24 is size of lov_ost_data) + * LOV LOGCOOKIE size max = 32 * 2000 + * (NB: 32 is size of llog_cookie) + * symlink: FNAME_MAX + PATH_MAX <- largest + * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create) + * rename: FNAME_MAX + FNAME_MAX + * open: FNAME_MAX + EA_MAX + * + * MDS_MAXREQSIZE ~= 4736 bytes = + * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX + * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header + * + * Realistic size is about 512 bytes (20 character name + 128 char symlink), + * except in the open case where there are a large number of OSTs in a LOV. + */ +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ +#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */ + +/** + * MDS incoming request with LOV EA + * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate + */ +#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \ + 362 + LOV_MAX_STRIPE_COUNT * 24) +/** + * MDS outgoing reply with LOV EA + * + * NB: max reply size Lustre 2.4+ client can get from old MDS is: + * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes + * + * but 2.4 or later MDS will never send reply with llog_cookie to any + * version client. This macro is defined for server side reply buffer size. 
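The per-partition thread counts worked through in the thread-constants comments above all follow one simple rule; the sketch below restates it purely for illustration, with the MDS values (base 64, factor 8, max 1024) plugged in for the examples. The real logic lives in ptlrpc_server_nthreads_check() and handles more cases.

/* illustrative restatement of the worked examples above, not part of the
 * header; see ptlrpc_server_nthreads_check() for the real implementation */
static int example_nthrs_per_cpt(int base, int factor, int nthrs_max,
                                 int ncpts, int cores_per_cpt)
{
        int nthrs = base + cores_per_cpt * factor;

        /* respect the soft overall limit ... */
        if (nthrs * ncpts > nthrs_max)
                nthrs = nthrs_max / ncpts;
        /* ... but never drop below the per-partition base */
        if (nthrs < base)
                nthrs = base;
        return nthrs;
}

/*
 * Example 1): example_nthrs_per_cpt(64, 8, 1024,  4,  4) ==  96
 * Example 3): example_nthrs_per_cpt(64, 8, 1024,  8, 12) == 128
 * Example 4): example_nthrs_per_cpt(64, 8, 1024, 32, 31) ==  64
 *             (assuming ~1000 cores / 32 CPTs; 64 * 32 = 2048 total)
 */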
+ */ +#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE + +/** + * This is the size of a maximum REINT_SETXATTR request: + * + * lustre_msg 56 (32 + 4 x 5 + 4) + * ptlrpc_body 184 + * mdt_rec_setxattr 136 + * lustre_capa 120 + * name 256 (XATTR_NAME_MAX) + * value 65536 (XATTR_SIZE_MAX) + */ +#define MDS_EA_MAXREQSIZE 66288 + +/** + * These are the maximum request and reply sizes (rounded up to 1 KB + * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL. + */ +#define MDS_REG_MAXREQSIZE (((max(MDS_EA_MAXREQSIZE, \ + MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10) +#define MDS_REG_MAXREPSIZE MDS_REG_MAXREQSIZE + +/** + * The update request includes all of updates from the create, which might + * include linkea (4K maxim), together with other updates, we set it to 1000K: + * lustre_msg + ptlrpc_body + OUT_UPDATE_BUFFER_SIZE_MAX + */ +#define OUT_MAXREQSIZE (1000 * 1024) +#define OUT_MAXREPSIZE MDS_MAXREPSIZE + +/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */ +#define MDS_BUFSIZE max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 8 * 1024) + +/** + * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD. + * However, we need to allocate a much larger buffer for it because LNet + * requires each MD(rqbd) has at least MDS_REQ_MAXREQSIZE bytes left to avoid + * dropping of maximum-sized incoming request. So if MDS_REG_BUFSIZE is only a + * little larger than MDS_REG_MAXREQSIZE, then it can only fit in one request + * even there are about MDS_REG_MAX_REQSIZE bytes left in a rqbd, and memory + * utilization is very low. + * + * In the meanwhile, size of rqbd can't be too large, because rqbd can't be + * reused until all requests fit in it have been processed and released, + * which means one long blocked request can prevent the rqbd be reused. + * Now we set request buffer size to 160 KB, so even each rqbd is unlinked + * from LNet with unused 65 KB, buffer utilization will be about 59%. + * Please check LU-2432 for details. + */ +#define MDS_REG_BUFSIZE max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 160 * 1024) + +/** + * OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is + * about 10K, for the same reason as MDS_REG_BUFSIZE, we also give some + * extra bytes to each request buffer to improve buffer utilization rate. + */ +#define OUT_BUFSIZE max(OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 24 * 1024) + +/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */ +#define FLD_MAXREQSIZE (160) + +/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */ +#define FLD_MAXREPSIZE (152) +#define FLD_BUFSIZE (1 << 12) + +/** + * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range + + * __u32 padding */ +#define SEQ_MAXREQSIZE (160) + +/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */ +#define SEQ_MAXREPSIZE (152) +#define SEQ_BUFSIZE (1 << 12) + +/** MGS threads must be >= 3, see bug 22458 comment #28 */ +#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define MGS_NTHRS_MAX 32 + +#define MGS_NBUFS 64 +#define MGS_BUFSIZE (8 * 1024) +#define MGS_MAXREQSIZE (7 * 1024) +#define MGS_MAXREPSIZE (9 * 1024) + + /* + * OSS threads constants: + * + * Given 8 as factor and 64 as base threads number + * + * example 1): + * On 8-core server configured to 2 partitions, we will have + * 64 + 8 * 4 = 96 threads for each partition, 192 total threads. 
+ * + * example 2): + * On 32-core machine configured to 4 partitions, we will have + * 64 + 8 * 8 = 112 threads for each partition, so total threads number + * will be 112 * 4 = 448. + * + * example 3): + * On 64-core machine configured to 4 partitions, we will have + * 64 + 16 * 8 = 192 threads for each partition, so total threads number + * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we + * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads + * for each partition. + * + * So we can see that with these constants, threads number wil be at the + * similar level of old versions, unless the server has many cores. + */ + /* depress threads factor for VM with small memory size */ +#define OSS_THR_FACTOR min_t(int, 8, \ + NUM_CACHEPAGES >> (28 - PAGE_SHIFT)) +#define OSS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define OSS_NTHRS_BASE 64 + +/* threads for handling "create" request */ +#define OSS_CR_THR_FACTOR 1 +#define OSS_CR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define OSS_CR_NTHRS_BASE 8 +#define OSS_CR_NTHRS_MAX 64 + +/** + * OST_IO_MAXREQSIZE ~= + * lustre_msg + ptlrpc_body + obdo + obd_ioobj + + * DT_MAX_BRW_PAGES * niobuf_remote + * + * - single object with 16 pages is 512 bytes + * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover + * - Must be a multiple of 1024 + */ +#define _OST_MAXREQSIZE_BASE ((unsigned long)(sizeof(struct lustre_msg) + \ + sizeof(struct ptlrpc_body) + \ + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + \ + sizeof(struct niobuf_remote))) +#define _OST_MAXREQSIZE_SUM ((unsigned long)(_OST_MAXREQSIZE_BASE + \ + sizeof(struct niobuf_remote) * \ + (DT_MAX_BRW_PAGES - 1))) +/** + * FIEMAP request can be 4K+ for now + */ +#define OST_MAXREQSIZE (16UL * 1024UL) +#define OST_IO_MAXREQSIZE max(OST_MAXREQSIZE, \ + ((_OST_MAXREQSIZE_SUM - 1) | \ + (1024UL - 1)) + 1) +/* Safe estimate of free space in standard RPC, provides upper limit for # of + * bytes of i/o to pack in RPC (skipping bulk transfer). */ +#define OST_SHORT_IO_SPACE (OST_IO_MAXREQSIZE - _OST_MAXREQSIZE_BASE) + +/* Actual size used for short i/o buffer. Calculation means this: + * At least one page (for large PAGE_SIZE), or 16 KiB, but not more + * than the available space aligned to a page boundary. */ +#define OBD_MAX_SHORT_IO_BYTES min(max(PAGE_SIZE, 16UL * 1024UL), \ + OST_SHORT_IO_SPACE & PAGE_MASK) + +#define OST_MAXREPSIZE (9 * 1024) +#define OST_IO_MAXREPSIZE OST_MAXREPSIZE + +#define OST_NBUFS 64 +/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */ +#define OST_BUFSIZE max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024) +/** + * OST_IO_MAXREQSIZE is 18K, giving extra 46K can increase buffer utilization + * rate of request buffer, please check comment of MDS_LOV_BUFSIZE for details. + */ +#define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024) + + +/* Macro to hide a typecast. */ +#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) + +struct ptlrpc_replay_async_args { + int praa_old_state; + int praa_old_status; +}; + +/** + * Structure to single define portal connection. 
+ */ +struct ptlrpc_connection { + /** linkage for connections hash table */ + struct hlist_node c_hash; + /** Our own lnet nid for this connection */ + lnet_nid_t c_self; + /** Remote side nid for this connection */ + struct lnet_process_id c_peer; + /** UUID of the other side */ + struct obd_uuid c_remote_uuid; + /** reference counter for this connection */ + atomic_t c_refcount; +}; + +/** Client definition for PortalRPC */ +struct ptlrpc_client { + /** What lnet portal does this client send messages to by default */ + __u32 cli_request_portal; + /** What portal do we expect replies on */ + __u32 cli_reply_portal; + /** Name of the client */ + char *cli_name; +}; + +/** state flags of requests */ +/* XXX only ones left are those used by the bulk descs as well! */ +#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */ +#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */ + +#define REQ_MAX_ACK_LOCKS 8 + +union ptlrpc_async_args { + /** + * Scratchpad for passing args to completion interpreter. Users + * cast to the struct of their choosing, and CLASSERT that this is + * big enough. For _tons_ of context, OBD_ALLOC a struct and store + * a pointer to it here. The pointer_arg ensures this struct is at + * least big enough for that. + */ + void *pointer_arg[11]; + __u64 space[7]; +}; + +struct ptlrpc_request_set; +typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); + +/** + * Definition of request set structure. + * Request set is a list of requests (not necessary to the same target) that + * once populated with RPCs could be sent in parallel. + * There are two kinds of request sets. General purpose and with dedicated + * serving thread. Example of the latter is ptlrpcd set. + * For general purpose sets once request set started sending it is impossible + * to add new requests to such set. + * Provides a way to call "completion callbacks" when all requests in the set + * returned. + */ +struct ptlrpc_request_set { + atomic_t set_refcount; + /** number of in queue requests */ + atomic_t set_new_count; + /** number of uncompleted requests */ + atomic_t set_remaining; + /** wait queue to wait on for request events */ + wait_queue_head_t set_waitq; + /** List of requests in the set */ + struct list_head set_requests; + /** + * Lock for \a set_new_requests manipulations + * locked so that any old caller can communicate requests to + * the set holder who can then fold them into the lock-free set + */ + spinlock_t set_new_req_lock; + /** List of new yet unsent requests. Only used with ptlrpcd now. */ + struct list_head set_new_requests; + + /** rq_status of requests that have been freed already */ + int set_rc; + /** Additional fields used by the flow control extension */ + /** Maximum number of RPCs in flight */ + int set_max_inflight; + /** Callback function used to generate RPCs */ + set_producer_func set_producer; + /** opaq argument passed to the producer callback */ + void *set_producer_arg; + unsigned int set_allow_intr:1; +}; + +struct ptlrpc_bulk_desc; +struct ptlrpc_service_part; +struct ptlrpc_service; + +/** + * ptlrpc callback & work item stuff + */ +struct ptlrpc_cb_id { + void (*cbid_fn)(struct lnet_event *ev); /* specific callback fn */ + void *cbid_arg; /* additional arg */ +}; + +/** Maximum number of locks to fit into reply state */ +#define RS_MAX_LOCKS 8 +#define RS_DEBUG 0 + +/** + * Structure to define reply state on the server + * Reply state holds various reply message information. 
Also for "difficult" + * replies (rep-ack case) we store the state after sending reply and wait + * for the client to acknowledge the reception. In these cases locks could be + * added to the state for replay/failover consistency guarantees. + */ +struct ptlrpc_reply_state { + /** Callback description */ + struct ptlrpc_cb_id rs_cb_id; + /** Linkage for list of all reply states in a system */ + struct list_head rs_list; + /** Linkage for list of all reply states on same export */ + struct list_head rs_exp_list; + /** Linkage for list of all reply states for same obd */ + struct list_head rs_obd_list; +#if RS_DEBUG + struct list_head rs_debug_list; +#endif + /** A spinlock to protect the reply state flags */ + spinlock_t rs_lock; + /** Reply state flags */ + unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for + difficult requests */ + unsigned long rs_scheduled:1; /* being handled? */ + unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ + unsigned long rs_handled:1; /* been handled yet? */ + unsigned long rs_on_net:1; /* reply_out_callback pending? */ + unsigned long rs_prealloc:1; /* rs from prealloc list */ + unsigned long rs_committed:1;/* the transaction was committed + and the rs was dispatched + by ptlrpc_commit_replies */ + unsigned long rs_convert_lock:1; /* need to convert saved + * locks to COS mode */ + atomic_t rs_refcount; /* number of users */ + /** Number of locks awaiting client ACK */ + int rs_nlocks; + + /** Size of the state */ + int rs_size; + /** opcode */ + __u32 rs_opc; + /** Transaction number */ + __u64 rs_transno; + /** xid */ + __u64 rs_xid; + struct obd_export *rs_export; + struct ptlrpc_service_part *rs_svcpt; + /** Lnet metadata handle for the reply */ + struct lnet_handle_md rs_md_h; + + /** Context for the sevice thread */ + struct ptlrpc_svc_ctx *rs_svc_ctx; + /** Reply buffer (actually sent to the client), encoded if needed */ + struct lustre_msg *rs_repbuf; /* wrapper */ + /** Size of the reply buffer */ + int rs_repbuf_len; /* wrapper buf length */ + /** Size of the reply message */ + int rs_repdata_len; /* wrapper msg length */ + /** + * Actual reply message. Its content is encrupted (if needed) to + * produce reply buffer for actual sending. In simple case + * of no network encryption we jus set \a rs_repbuf to \a rs_msg + */ + struct lustre_msg *rs_msg; /* reply message */ + + /** Handles of locks awaiting client reply ACK */ + struct lustre_handle rs_locks[RS_MAX_LOCKS]; + /** Lock modes of locks in \a rs_locks */ + enum ldlm_mode rs_modes[RS_MAX_LOCKS]; +}; + +struct ptlrpc_thread; + +/** RPC stages */ +enum rq_phase { + RQ_PHASE_NEW = 0xebc0de00, + RQ_PHASE_RPC = 0xebc0de01, + RQ_PHASE_BULK = 0xebc0de02, + RQ_PHASE_INTERPRET = 0xebc0de03, + RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_UNREG_RPC = 0xebc0de05, + RQ_PHASE_UNREG_BULK = 0xebc0de06, + RQ_PHASE_UNDEFINED = 0xebc0de07 +}; + +/** Type of request interpreter call-back */ +typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc); +/** Type of request resend call-back */ +typedef void (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req, + void *arg); + +/** + * Definition of request pool structure. + * The pool is used to store empty preallocated requests for the case + * when we would actually need to send something without performing + * any allocations (to avoid e.g. OOM). 
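A minimal sketch of how such a pool is typically consumed (illustrative only, not part of the header; it relies on the standard <linux/list.h> helpers, the prp_* fields of the structure declared just below, and the rq_list member of struct ptlrpc_request):

/* illustrative only: take one preallocated request from a pool like the
 * one declared below; returns NULL when the pool is currently empty */
static inline struct ptlrpc_request *
example_request_from_pool(struct ptlrpc_request_pool *pool)
{
        struct ptlrpc_request *req = NULL;

        spin_lock(&pool->prp_lock);
        if (!list_empty(&pool->prp_req_list)) {
                req = list_entry(pool->prp_req_list.next,
                                 struct ptlrpc_request, rq_list);
                list_del_init(&req->rq_list);
        }
        spin_unlock(&pool->prp_lock);

        /* an empty pool can be refilled via pool->prp_populate(pool, n) */
        return req;
}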
+ */ +struct ptlrpc_request_pool { + /** Locks the list */ + spinlock_t prp_lock; + /** list of ptlrpc_request structs */ + struct list_head prp_req_list; + /** Maximum message size that would fit into a rquest from this pool */ + int prp_rq_size; + /** Function to allocate more requests for this pool */ + int (*prp_populate)(struct ptlrpc_request_pool *, int); +}; + +struct lu_context; +struct lu_env; + +struct ldlm_lock; + +#include + +/** + * Basic request prioritization operations structure. + * The whole idea is centered around locks and RPCs that might affect locks. + * When a lock is contended we try to give priority to RPCs that might lead + * to fastest release of that lock. + * Currently only implemented for OSTs only in a way that makes all + * IO and truncate RPCs that are coming from a locked region where a lock is + * contended a priority over other requests. + */ +struct ptlrpc_hpreq_ops { + /** + * Check if the lock handle of the given lock is the same as + * taken from the request. + */ + int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); + /** + * Check if the request is a high priority one. + */ + int (*hpreq_check)(struct ptlrpc_request *); + /** + * Called after the request has been handled. + */ + void (*hpreq_fini)(struct ptlrpc_request *); +}; + +struct ptlrpc_cli_req { + /** For bulk requests on client only: bulk descriptor */ + struct ptlrpc_bulk_desc *cr_bulk; + /** optional time limit for send attempts */ + time64_t cr_delay_limit; + /** time request was first queued */ + time64_t cr_queued_time; + /** request sent in nanoseconds */ + ktime_t cr_sent_ns; + /** time for request really sent out */ + time64_t cr_sent_out; + /** when req reply unlink must finish. */ + time64_t cr_reply_deadline; + /** when req bulk unlink must finish. */ + time64_t cr_bulk_deadline; + /** when req unlink must finish. */ + time64_t cr_req_deadline; + /** Portal to which this request would be sent */ + short cr_req_ptl; + /** Portal where to wait for reply and where reply would be sent */ + short cr_rep_ptl; + /** request resending number */ + unsigned int cr_resend_nr; + /** What was import generation when this request was sent */ + int cr_imp_gen; + enum lustre_imp_state cr_send_state; + /** Per-request waitq introduced by bug 21938 for recovery waiting */ + wait_queue_head_t cr_set_waitq; + /** Link item for request set lists */ + struct list_head cr_set_chain; + /** link to waited ctx */ + struct list_head cr_ctx_chain; + + /** client's half ctx */ + struct ptlrpc_cli_ctx *cr_cli_ctx; + /** Link back to the request set */ + struct ptlrpc_request_set *cr_set; + /** outgoing request MD handle */ + struct lnet_handle_md cr_req_md_h; + /** request-out callback parameter */ + struct ptlrpc_cb_id cr_req_cbid; + /** incoming reply MD handle */ + struct lnet_handle_md cr_reply_md_h; + wait_queue_head_t cr_reply_waitq; + /** reply callback parameter */ + struct ptlrpc_cb_id cr_reply_cbid; + /** Async completion handler, called when reply is received */ + ptlrpc_interpterer_t cr_reply_interp; + /** Resend handler, called when request is resend to update RPC data */ + ptlrpc_resend_cb_t cr_resend_cb; + /** Async completion context */ + union ptlrpc_async_args cr_async_args; + /** Opaq data for replay and commit callbacks. */ + void *cr_cb_data; + /** Link to the imp->imp_unreplied_list */ + struct list_head cr_unreplied_list; + /** + * Commit callback, called when request is committed and about to be + * freed. 
+ */ + void (*cr_commit_cb)(struct ptlrpc_request *); + /** Replay callback, called after request is replayed at recovery */ + void (*cr_replay_cb)(struct ptlrpc_request *); +}; + +/** client request member alias */ +/* NB: these alias should NOT be used by any new code, instead they should + * be removed step by step to avoid potential abuse */ +#define rq_bulk rq_cli.cr_bulk +#define rq_delay_limit rq_cli.cr_delay_limit +#define rq_queued_time rq_cli.cr_queued_time +#define rq_sent_ns rq_cli.cr_sent_ns +#define rq_real_sent rq_cli.cr_sent_out +#define rq_reply_deadline rq_cli.cr_reply_deadline +#define rq_bulk_deadline rq_cli.cr_bulk_deadline +#define rq_req_deadline rq_cli.cr_req_deadline +#define rq_nr_resend rq_cli.cr_resend_nr +#define rq_request_portal rq_cli.cr_req_ptl +#define rq_reply_portal rq_cli.cr_rep_ptl +#define rq_import_generation rq_cli.cr_imp_gen +#define rq_send_state rq_cli.cr_send_state +#define rq_set_chain rq_cli.cr_set_chain +#define rq_ctx_chain rq_cli.cr_ctx_chain +#define rq_set rq_cli.cr_set +#define rq_set_waitq rq_cli.cr_set_waitq +#define rq_cli_ctx rq_cli.cr_cli_ctx +#define rq_req_md_h rq_cli.cr_req_md_h +#define rq_req_cbid rq_cli.cr_req_cbid +#define rq_reply_md_h rq_cli.cr_reply_md_h +#define rq_reply_waitq rq_cli.cr_reply_waitq +#define rq_reply_cbid rq_cli.cr_reply_cbid +#define rq_interpret_reply rq_cli.cr_reply_interp +#define rq_resend_cb rq_cli.cr_resend_cb +#define rq_async_args rq_cli.cr_async_args +#define rq_cb_data rq_cli.cr_cb_data +#define rq_unreplied_list rq_cli.cr_unreplied_list +#define rq_commit_cb rq_cli.cr_commit_cb +#define rq_replay_cb rq_cli.cr_replay_cb + +struct ptlrpc_srv_req { + /** initial thread servicing this request */ + struct ptlrpc_thread *sr_svc_thread; + /** + * Server side list of incoming unserved requests sorted by arrival + * time. Traversed from time to time to notice about to expire + * requests and sent back "early replies" to clients to let them + * know server is alive and well, just very busy to service their + * requests in time + */ + struct list_head sr_timed_list; + /** server-side per-export list */ + struct list_head sr_exp_list; + /** server-side history, used for debuging purposes. 
*/ + struct list_head sr_hist_list; + /** history sequence # */ + __u64 sr_hist_seq; + /** the index of service's srv_at_array into which request is linked */ + __u32 sr_at_index; + /** authed uid */ + uid_t sr_auth_uid; + /** authed uid mapped to */ + uid_t sr_auth_mapped_uid; + /** RPC is generated from what part of Lustre */ + enum lustre_sec_part sr_sp_from; + /** request session context */ + struct lu_context sr_ses; + /** \addtogroup nrs + * @{ + */ + /** stub for NRS request */ + struct ptlrpc_nrs_request sr_nrq; + /** @} nrs */ + /** request arrival time */ + struct timespec64 sr_arrival_time; + /** server's half ctx */ + struct ptlrpc_svc_ctx *sr_svc_ctx; + /** (server side), pointed directly into req buffer */ + struct ptlrpc_user_desc *sr_user_desc; + /** separated reply state, may be vmalloc'd */ + struct ptlrpc_reply_state *sr_reply_state; + /** server-side hp handlers */ + struct ptlrpc_hpreq_ops *sr_ops; + /** incoming request buffer */ + struct ptlrpc_request_buffer_desc *sr_rqbd; +}; + +/** server request member alias */ +/* NB: these alias should NOT be used by any new code, instead they should + * be removed step by step to avoid potential abuse */ +#define rq_svc_thread rq_srv.sr_svc_thread +#define rq_timed_list rq_srv.sr_timed_list +#define rq_exp_list rq_srv.sr_exp_list +#define rq_history_list rq_srv.sr_hist_list +#define rq_history_seq rq_srv.sr_hist_seq +#define rq_at_index rq_srv.sr_at_index +#define rq_auth_uid rq_srv.sr_auth_uid +#define rq_auth_mapped_uid rq_srv.sr_auth_mapped_uid +#define rq_sp_from rq_srv.sr_sp_from +#define rq_session rq_srv.sr_ses +#define rq_nrq rq_srv.sr_nrq +#define rq_arrival_time rq_srv.sr_arrival_time +#define rq_reply_state rq_srv.sr_reply_state +#define rq_svc_ctx rq_srv.sr_svc_ctx +#define rq_user_desc rq_srv.sr_user_desc +#define rq_ops rq_srv.sr_ops +#define rq_rqbd rq_srv.sr_rqbd + +/** + * Represents remote procedure call. + * + * This is a staple structure used by everybody wanting to send a request + * in Lustre. + */ +struct ptlrpc_request { + /* Request type: one of PTL_RPC_MSG_* */ + int rq_type; + /** Result of request processing */ + int rq_status; + /** + * Linkage item through which this request is included into + * sending/delayed lists on client and into rqbd list on server + */ + struct list_head rq_list; + /** Lock to protect request flags and some other important bits, like + * rq_list + */ + spinlock_t rq_lock; + spinlock_t rq_early_free_lock; + /** client-side flags are serialized by rq_lock @{ */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, + rq_timedout:1, rq_resend:1, rq_restart:1, + /** + * when ->rq_replay is set, request is kept by the client even + * after server commits corresponding transaction. This is + * used for operations that require sequence of multiple + * requests to be replayed. The only example currently is file + * open/close. When last request in such a sequence is + * committed, ->rq_replay is cleared on all requests in the + * sequence. 
+ */ + rq_replay:1, + rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, + rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, + rq_early:1, + rq_req_unlinked:1, /* unlinked request buffer from lnet */ + rq_reply_unlinked:1, /* unlinked reply buffer from lnet */ + rq_memalloc:1, /* req originated from "kswapd" */ + rq_committed:1, + rq_reply_truncated:1, + /** whether the "rq_set" is a valid one */ + rq_invalid_rqset:1, + rq_generation_set:1, + /** do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1, + /* allow the req to be sent if the import is in recovery + * status */ + rq_allow_replay:1, + /* bulk request, sent to server, but uncommitted */ + rq_unstable:1, + rq_early_free_repbuf:1, /* free reply buffer in advance */ + rq_allow_intr:1; + /** @} */ + + /** server-side flags @{ */ + unsigned int + rq_hp:1, /**< high priority RPC */ + rq_at_linked:1, /**< link into service's srv_at_array */ + rq_packed_final:1; /**< packed final reply */ + /** @} */ + + /** one of RQ_PHASE_* */ + enum rq_phase rq_phase; + /** one of RQ_PHASE_* to be used next */ + enum rq_phase rq_next_phase; + /** + * client-side refcount for SENT race, server-side refcounf + * for multiple replies + */ + atomic_t rq_refcount; + /** + * client-side: + * !rq_truncate : # reply bytes actually received, + * rq_truncate : required repbuf_len for resend + */ + int rq_nob_received; + /** Request length */ + int rq_reqlen; + /** Reply length */ + int rq_replen; + /** Pool if request is from preallocated list */ + struct ptlrpc_request_pool *rq_pool; + /** Request message - what client sent */ + struct lustre_msg *rq_reqmsg; + /** Reply message - server response */ + struct lustre_msg *rq_repmsg; + /** Transaction number */ + __u64 rq_transno; + /** xid */ + __u64 rq_xid; + /** bulk match bits */ + __u64 rq_mbits; + /** + * List item to for replay list. Not yet committed requests get linked + * there. + * Also see \a rq_replay comment above. 
+ * It's also link chain on obd_export::exp_req_replay_queue + */ + struct list_head rq_replay_list; + /** non-shared members for client & server request*/ + union { + struct ptlrpc_cli_req rq_cli; + struct ptlrpc_srv_req rq_srv; + }; + /** + * security and encryption data + * @{ */ + /** description of flavors for client & server */ + struct sptlrpc_flavor rq_flvr; + + /** + * SELinux policy info at the time of the request + * sepol string format is: + * ::: + */ + char rq_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; + + /* client/server security flags */ + unsigned int + rq_ctx_init:1, /* context initiation */ + rq_ctx_fini:1, /* context destroy */ + rq_bulk_read:1, /* request bulk read */ + rq_bulk_write:1, /* request bulk write */ + /* server authentication flags */ + rq_auth_gss:1, /* authenticated by gss */ + rq_auth_usr_root:1, /* authed as root */ + rq_auth_usr_mdt:1, /* authed as mdt */ + rq_auth_usr_ost:1, /* authed as ost */ + /* security tfm flags */ + rq_pack_udesc:1, + rq_pack_bulk:1, + /* doesn't expect reply FIXME */ + rq_no_reply:1, + rq_pill_init:1, /* pill initialized */ + rq_srv_req:1; /* server request */ + + + /** various buffer pointers */ + struct lustre_msg *rq_reqbuf; /**< req wrapper, vmalloc*/ + char *rq_repbuf; /**< rep buffer, vmalloc */ + struct lustre_msg *rq_repdata; /**< rep wrapper msg */ + /** only in priv mode */ + struct lustre_msg *rq_clrbuf; + int rq_reqbuf_len; /* req wrapper buf len */ + int rq_reqdata_len; /* req wrapper msg len */ + int rq_repbuf_len; /* rep buffer len */ + int rq_repdata_len; /* rep wrapper msg len */ + int rq_clrbuf_len; /* only in priv mode */ + int rq_clrdata_len; /* only in priv mode */ + + /** early replies go to offset 0, regular replies go after that */ + unsigned int rq_reply_off; + /** @} */ + + /** Fields that help to see if request and reply were swabbed or not */ + __u32 rq_req_swab_mask; + __u32 rq_rep_swab_mask; + + /** how many early replies (for stats) */ + int rq_early_count; + /** Server-side, export on which request was received */ + struct obd_export *rq_export; + /** import where request is being sent */ + struct obd_import *rq_import; + /** our LNet NID */ + lnet_nid_t rq_self; + /** Peer description (the other side) */ + struct lnet_process_id rq_peer; + /** Descriptor for the NID from which the peer sent the request. */ + struct lnet_process_id rq_source; + /** + * service time estimate (secs) + * If the request is not served by this time, it is marked as timed out. + * Do not change to time64_t since this is transmitted over the wire. + * + * The linux kernel handles timestamps with time64_t and timeouts + * are normally done with jiffies. Lustre shares the rq_timeout between + * nodes. Since jiffies can vary from node to node Lustre instead + * will express the timeout value in seconds. To avoid confusion with + * timestamps (time64_t) and jiffy timeouts (long) Lustre timeouts + * are expressed in s32 (timeout_t). Also what is transmitted over + * the wire is 32 bits. + */ + timeout_t rq_timeout; + /** + * when request/reply sent (secs), or time when request should be sent + */ + time64_t rq_sent; + /** when request must finish. */ + time64_t rq_deadline; + /** request format description */ + struct req_capsule rq_pill; +}; + +/** + * Call completion handler for rpc if any, return it's status or original + * rc if there was no handler defined for this request. 
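For illustration, the completion path driven by the helper below is wired up by the request issuer; the callback name here is hypothetical, but the ptlrpc_interpterer_t signature and the rq_interpret_reply / rq_async_args members are the ones declared earlier in this header:

/* hypothetical completion handler matching ptlrpc_interpterer_t; invoked
 * through ptlrpc_req_interpret() below once the reply (or an error) is in */
static int example_reply_interpret(const struct lu_env *env,
                                   struct ptlrpc_request *req,
                                   void *args, int rc)
{
        /* 'args' points at req->rq_async_args (see ptlrpc_req_async_args()) */
        if (rc != 0)
                return rc;              /* becomes req->rq_status */

        /* a real handler would unpack and act on req->rq_repmsg here */
        return 0;
}

/* typical wiring when the request is built:
 *      req->rq_interpret_reply = example_reply_interpret;
 */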
+ */ +static inline int ptlrpc_req_interpret(const struct lu_env *env, + struct ptlrpc_request *req, int rc) +{ + if (req->rq_interpret_reply != NULL) { + req->rq_status = req->rq_interpret_reply(env, req, + &req->rq_async_args, + rc); + return req->rq_status; + } + return rc; +} + +/** \addtogroup nrs + * @{ + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf); +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf); +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req); +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info); + +/* + * Can the request be moved from the regular NRS head to the high-priority NRS + * head (of the same PTLRPC service partition), if any? + * + * For a reliable result, this should be checked under svcpt->scp_req lock. + */ +static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + + /** + * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the + * request has been enqueued first, and ptlrpc_nrs_request::nr_started + * to make sure it has not been scheduled yet (analogous to previous + * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list). + */ + return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; +} +/** @} nrs */ + +/** + * Returns true if request buffer at offset \a index was already swabbed + */ +static inline bool lustre_req_swabbed(struct ptlrpc_request *req, size_t index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1 << index); +} + +/** + * Returns true if request reply buffer at offset \a index was already swabbed + */ +static inline bool lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1 << index); +} + +/** + * Returns true if request needs to be swabbed into local cpu byteorder + */ +static inline bool ptlrpc_req_need_swab(struct ptlrpc_request *req) +{ + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Returns true if request reply needs to be swabbed into local cpu byteorder + */ +static inline bool ptlrpc_rep_need_swab(struct ptlrpc_request *req) +{ + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Mark request buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, + size_t index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + LASSERT((req->rq_req_swab_mask & (1 << index)) == 0); + req->rq_req_swab_mask |= 1 << index; +} + +/** + * Mark request reply buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, + size_t index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0); + req->rq_rep_swab_mask |= 1 << index; +} + +/** + * Convert numerical request phase value \a phase into text string description + */ +static inline const char * +ptlrpc_phase2str(enum rq_phase phase) +{ + switch (phase) { + case RQ_PHASE_NEW: + return "New"; + case RQ_PHASE_RPC: + return "Rpc"; + case RQ_PHASE_BULK: + return "Bulk"; + case RQ_PHASE_INTERPRET: + return "Interpret"; + case RQ_PHASE_COMPLETE: + return "Complete"; + case RQ_PHASE_UNREG_RPC: + return "UnregRPC"; + case RQ_PHASE_UNREG_BULK: + return "UnregBULK"; + default: + return "?Phase?"; + } +} + +/** + * Convert numerical request phase 
of the request \a req into text stringi + * description + */ +static inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + return ptlrpc_phase2str(req->rq_phase); +} + +/** + * Debugging functions and helpers to print request structure into debug log + * @{ + */ +/* Spare the preprocessor, spoil the bugs. */ +#define FLAG(field, str) (field ? str : "") + +/** Convert bit flags into a string */ +#define DEBUG_REQ_FLAGS(req) \ + ptlrpc_rqphase2str(req), \ + FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ + FLAG(req->rq_err, "E"), FLAG(req->rq_net_err, "e"), \ + FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ + FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ + FLAG(req->rq_no_resend, "N"), \ + FLAG(req->rq_waiting, "W"), \ + FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \ + FLAG(req->rq_committed, "M") + +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s%s" + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *data, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Helper that decides if we need to print request accordig to current debug + * level settings + */ +#define debug_req(msgdata, mask, cdls, req, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _debug_req((req), msgdata, fmt, ##a); \ +} while(0) + +/** + * This is the debug print function you need to use to print request sturucture + * content into lustre debug log. + * for most callers (level is a constant) this is resolved at compile time */ +#define DEBUG_REQ(level, req, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING)) { \ + static struct cfs_debug_limit_state cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \ + } \ +} while (0) +/** @} */ + +/** + * Structure that defines a single page of a bulk transfer + */ +struct ptlrpc_bulk_page { + /** Linkage to list of pages in a bulk */ + struct list_head bp_link; + /** + * Number of bytes in a page to transfer starting from \a bp_pageoffset + */ + int bp_buflen; + /** offset within a page */ + int bp_pageoffset; + /** The page itself */ + struct page *bp_page; +}; + +enum ptlrpc_bulk_op_type { + PTLRPC_BULK_OP_ACTIVE = 0x00000001, + PTLRPC_BULK_OP_PASSIVE = 0x00000002, + PTLRPC_BULK_OP_PUT = 0x00000004, + PTLRPC_BULK_OP_GET = 0x00000008, + PTLRPC_BULK_BUF_KVEC = 0x00000010, + PTLRPC_BULK_BUF_KIOV = 0x00000020, + PTLRPC_BULK_GET_SOURCE = PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_GET, + PTLRPC_BULK_PUT_SINK = PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_PUT, + PTLRPC_BULK_GET_SINK = PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_GET, + PTLRPC_BULK_PUT_SOURCE = PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_PUT, +}; + +static inline bool ptlrpc_is_bulk_op_get(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_OP_GET) == PTLRPC_BULK_OP_GET; +} + +static inline bool ptlrpc_is_bulk_get_source(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_GET_SOURCE) == PTLRPC_BULK_GET_SOURCE; +} + +static inline bool ptlrpc_is_bulk_put_sink(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_PUT_SINK) == PTLRPC_BULK_PUT_SINK; +} + +static inline bool ptlrpc_is_bulk_get_sink(enum ptlrpc_bulk_op_type type) +{ + 
return (type & PTLRPC_BULK_GET_SINK) == PTLRPC_BULK_GET_SINK; +} + +static inline bool ptlrpc_is_bulk_put_source(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_PUT_SOURCE) == PTLRPC_BULK_PUT_SOURCE; +} + +static inline bool ptlrpc_is_bulk_desc_kvec(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_BUF_KVEC) | (type & PTLRPC_BULK_BUF_KIOV)) + == PTLRPC_BULK_BUF_KVEC; +} + +static inline bool ptlrpc_is_bulk_desc_kiov(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_BUF_KVEC) | (type & PTLRPC_BULK_BUF_KIOV)) + == PTLRPC_BULK_BUF_KIOV; +} + +static inline bool ptlrpc_is_bulk_op_active(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_OP_ACTIVE) | + (type & PTLRPC_BULK_OP_PASSIVE)) + == PTLRPC_BULK_OP_ACTIVE; +} + +static inline bool ptlrpc_is_bulk_op_passive(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_OP_ACTIVE) | + (type & PTLRPC_BULK_OP_PASSIVE)) + == PTLRPC_BULK_OP_PASSIVE; +} + +struct ptlrpc_bulk_frag_ops { + /** + * Add a page \a page to the bulk descriptor \a desc + * Data to transfer in the page starts at offset \a pageoffset and + * amount of data to transfer from the page is \a len + */ + void (*add_kiov_frag)(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len); + + /* + * Add a \a fragment to the bulk descriptor \a desc. + * Data to transfer in the fragment is pointed to by \a frag + * The size of the fragment is \a len + */ + int (*add_iov_frag)(struct ptlrpc_bulk_desc *desc, void *frag, int len); + + /** + * Uninitialize and free bulk descriptor \a desc. + * Works on bulk descriptors both from server and client side. + */ + void (*release_frags)(struct ptlrpc_bulk_desc *desc); +}; + +extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops; +extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops; +extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops; + +/* + * Definition of bulk descriptor. + * Bulks are special "Two phase" RPCs where initial request message + * is sent first and it is followed bt a transfer (o receiving) of a large + * amount of data to be settled into pages referenced from the bulk descriptors. + * Bulks transfers (the actual data following the small requests) are done + * on separate LNet portals. + * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs. + * Another user is readpage for MDT. 
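To make the intent of the op-type predicates above concrete: each transfer has one active end that drives the LNet GET/PUT and one passive end that merely registers its buffer, and both ends must agree on the GET/PUT direction. A small illustrative check (not part of the header):

/* illustrative only: do the two ends of one bulk transfer pair up? */
static inline bool example_bulk_ends_match(enum ptlrpc_bulk_op_type active,
                                           enum ptlrpc_bulk_op_type passive)
{
        return ptlrpc_is_bulk_op_active(active) &&
               ptlrpc_is_bulk_op_passive(passive) &&
               ptlrpc_is_bulk_op_get(active) == ptlrpc_is_bulk_op_get(passive);
}

/* e.g. a bulk WRITE: the client exposes its pages as PTLRPC_BULK_GET_SOURCE
 * and the server fetches them as PTLRPC_BULK_GET_SINK, so
 * example_bulk_ends_match(PTLRPC_BULK_GET_SINK, PTLRPC_BULK_GET_SOURCE)
 * returns true. */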
+ */ +struct ptlrpc_bulk_desc { + /** number MD's assigned including zero-sends */ + unsigned int bd_refs; + /** completed with failure */ + unsigned long bd_failure:1; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ + spinlock_t bd_lock; + /** Import generation when request for this bulk was sent */ + int bd_import_generation; + /** {put,get}{source,sink}{kvec,kiov} */ + enum ptlrpc_bulk_op_type bd_type; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + struct ptlrpc_bulk_frag_ops *bd_frag_ops; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + unsigned int bd_nob_last; /* # bytes in last MD */ + + __u64 bd_last_mbits; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + /** array of offsets for each MD */ + unsigned int bd_mds_off[PTLRPC_BULK_OPS_COUNT]; + /** array of associated MDs */ + struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT]; + + union { + struct { + /* + * encrypt iov, size is either 0 or bd_iov_count. + */ + lnet_kiov_t *bd_enc_vec; + lnet_kiov_t *bd_vec; + } bd_kiov; + + struct { + struct kvec *bd_enc_kvec; + struct kvec *bd_kvec; + } bd_kvec; + } bd_u; + +}; + +#define GET_KIOV(desc) ((desc)->bd_u.bd_kiov.bd_vec) +#define BD_GET_KIOV(desc, i) ((desc)->bd_u.bd_kiov.bd_vec[i]) +#define GET_ENC_KIOV(desc) ((desc)->bd_u.bd_kiov.bd_enc_vec) +#define BD_GET_ENC_KIOV(desc, i) ((desc)->bd_u.bd_kiov.bd_enc_vec[i]) +#define GET_KVEC(desc) ((desc)->bd_u.bd_kvec.bd_kvec) +#define BD_GET_KVEC(desc, i) ((desc)->bd_u.bd_kvec.bd_kvec[i]) +#define GET_ENC_KVEC(desc) ((desc)->bd_u.bd_kvec.bd_enc_kvec) +#define BD_GET_ENC_KVEC(desc, i) ((desc)->bd_u.bd_kvec.bd_enc_kvec[i]) + +enum { + SVC_INIT = 0, + SVC_STOPPED = 1 << 0, + SVC_STOPPING = 1 << 1, + SVC_STARTING = 1 << 2, + SVC_RUNNING = 1 << 3, + SVC_EVENT = 1 << 4, + SVC_SIGNAL = 1 << 5, +}; + +#define PTLRPC_THR_NAME_LEN 32 +/** + * Definition of server service thread structure + */ +struct ptlrpc_thread { + /** + * List of active threads in svc->srv_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated vmalloc'd memory) + */ + void *t_data; + __u32 t_flags; + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * service thread pid + */ + pid_t t_pid; + /** + * put watchdog in the structure per thread b=14840 + */ + struct lc_watchdog *t_watchdog; + /** + * the svc this thread belonged to b=18582 + */ + struct ptlrpc_service_part *t_svcpt; + wait_queue_head_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; +}; + +static inline int thread_is_init(struct ptlrpc_thread *thread) +{ + return thread->t_flags == 0; +} + +static inline int thread_is_stopped(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPED); +} + +static inline int thread_is_stopping(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPING); +} + +static inline int thread_is_starting(struct ptlrpc_thread *thread) +{ + 
return !!(thread->t_flags & SVC_STARTING);
+}
+
+static inline int thread_is_running(struct ptlrpc_thread *thread)
+{
+	return !!(thread->t_flags & SVC_RUNNING);
+}
+
+static inline int thread_is_event(struct ptlrpc_thread *thread)
+{
+	return !!(thread->t_flags & SVC_EVENT);
+}
+
+static inline int thread_is_signal(struct ptlrpc_thread *thread)
+{
+	return !!(thread->t_flags & SVC_SIGNAL);
+}
+
+static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags &= ~flags;
+}
+
+static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags = flags;
+}
+
+static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags |= flags;
+}
+
+static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread,
+					      __u32 flags)
+{
+	if (thread->t_flags & flags) {
+		thread->t_flags &= ~flags;
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * Request buffer descriptor structure.
+ * This structure contains one posted request buffer for a service.
+ * Once data lands in a buffer, the event callback creates the actual request
+ * and wakes one of the service threads to process the new incoming request.
+ * More than one request can fit into the buffer.
+ */
+struct ptlrpc_request_buffer_desc {
+	/** Link item for rqbds on a service */
+	struct list_head		rqbd_list;
+	/** History of requests for this buffer */
+	struct list_head		rqbd_reqs;
+	/** Back pointer to service for which this buffer is registered */
+	struct ptlrpc_service_part	*rqbd_svcpt;
+	/** LNet descriptor */
+	struct lnet_handle_md		rqbd_md_h;
+	int				rqbd_refcount;
+	/** The buffer itself */
+	char				*rqbd_buffer;
+	struct ptlrpc_cb_id		rqbd_cbid;
+	/**
+	 * This "embedded" request structure is only used for the
+	 * last request to fit into the buffer
+	 */
+	struct ptlrpc_request		rqbd_req;
+};
+
+typedef int (*svc_handler_t)(struct ptlrpc_request *req);
+
+struct ptlrpc_service_ops {
+	/**
+	 * if non-NULL called during thread creation (ptlrpc_start_thread())
+	 * to initialize service specific per-thread state.
+	 */
+	int		(*so_thr_init)(struct ptlrpc_thread *thr);
+	/**
+	 * if non-NULL called during thread shutdown (ptlrpc_main()) to
+	 * destruct state created by ->so_thr_init().
+	 */
+	void		(*so_thr_done)(struct ptlrpc_thread *thr);
+	/**
+	 * Handler function for incoming requests for this service
+	 */
+	int		(*so_req_handler)(struct ptlrpc_request *req);
+	/**
+	 * function to determine the priority of the request; it is called
+	 * on every new request
+	 */
+	int		(*so_hpreq_handler)(struct ptlrpc_request *);
+	/**
+	 * service-specific print fn
+	 */
+	void		(*so_req_printer)(void *, struct ptlrpc_request *);
+};
+
+#ifndef __cfs_cacheline_aligned
+/* NB: put it here to reduce patch dependence */
+# define __cfs_cacheline_aligned
+#endif
+
+/**
+ * How many high priority requests to serve before serving one normal
+ * priority request
+ */
+#define PTLRPC_SVC_HP_RATIO 10
+
+/**
+ * Definition of PortalRPC service.
+ * The service listens on a particular portal (like a TCP port) and performs
+ * actions for a specific server, like IO service for an OST or general
+ * metadata service for an MDS.
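+ *
+ * As a purely illustrative sketch (the field values, portal macros, and
+ * handler name below are hypothetical, not taken from any real service),
+ * a server module would typically describe such a service with the
+ * ptlrpc_service_conf structure declared later in this header and
+ * register it with ptlrpc_register_service():
+ *
+ *	static struct ptlrpc_service_conf conf = {
+ *		.psc_name		= "example_svc",
+ *		.psc_watchdog_factor	= 2,
+ *		.psc_buf = {
+ *			.bc_nbufs		= 64,
+ *			.bc_buf_size		= 4096,
+ *			.bc_req_max_size	= 4096,
+ *			.bc_rep_max_size	= 4096,
+ *			.bc_req_portal		= EXAMPLE_REQUEST_PORTAL,
+ *			.bc_rep_portal		= EXAMPLE_REPLY_PORTAL,
+ *		},
+ *		.psc_thr = {
+ *			.tc_thr_name	= "ll_example",
+ *			.tc_nthrs_init	= 2,
+ *			.tc_nthrs_max	= 8,
+ *		},
+ *		.psc_ops = {
+ *			.so_req_handler	= example_req_handler,
+ *		},
+ *	};
+ *
+ *	svc = ptlrpc_register_service(&conf, parent_kset, debugfs_entry);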
+ */
+struct ptlrpc_service {
+	/** serialize /proc operations */
+	spinlock_t			srv_lock;
+	/** most often accessed fields */
+	/** chain thru all services */
+	struct list_head		srv_list;
+	/** service operations table */
+	struct ptlrpc_service_ops	srv_ops;
+	/** only statically allocated strings here; we don't clean them */
+	char				*srv_name;
+	/** only statically allocated strings here; we don't clean them */
+	char				*srv_thread_name;
+	/** service thread list */
+	struct list_head		srv_threads;
+	/** # of threads to create for each partition at initialization */
+	int				srv_nthrs_cpt_init;
+	/** limit on the number of threads for each partition */
+	int				srv_nthrs_cpt_limit;
+	/** Root of debugfs dir tree for this service */
+	struct dentry			*srv_debugfs_entry;
+	/** Pointer to statistic data for this service */
+	struct lprocfs_stats		*srv_stats;
+	/** # hp per lp reqs to handle */
+	int				srv_hpreq_ratio;
+	/** biggest request to receive */
+	int				srv_max_req_size;
+	/** biggest reply to send */
+	int				srv_max_reply_size;
+	/** size of individual buffers */
+	int				srv_buf_size;
+	/** # buffers to allocate in 1 group */
+	int				srv_nbuf_per_group;
+	/** Local portal on which to receive requests */
+	__u32				srv_req_portal;
+	/** Portal on the client to send replies to */
+	__u32				srv_rep_portal;
+	/**
+	 * Tags for lu_context associated with this thread, see struct
+	 * lu_context.
+	 */
+	__u32				srv_ctx_tags;
+	/** soft watchdog timeout multiplier */
+	int				srv_watchdog_factor;
+	/** under unregister_service */
+	unsigned			srv_is_stopping:1;
+	/** Whether or not to restrict service threads to CPUs in this CPT */
+	unsigned			srv_cpt_bind:1;
+
+	/** max # request buffers */
+	int				srv_nrqbds_max;
+	/** max # request buffers in history per partition */
+	int				srv_hist_nrqbds_cpt_max;
+	/** number of CPTs this service is associated with */
+	int				srv_ncpts;
+	/** array of CPTs this service is associated with */
+	__u32				*srv_cpts;
+	/** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+	int				srv_cpt_bits;
+	/** CPT table this service is running over */
+	struct cfs_cpt_table		*srv_cptable;
+
+	/* sysfs object */
+	struct kobject			srv_kobj;
+	struct completion		srv_kobj_unregister;
+	/**
+	 * partition data for ptlrpc service
+	 */
+	struct ptlrpc_service_part	*srv_parts[0];
+};
+
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service has only one instance of it right now, it will soon
+ * have multiple instances (one per CPT).
+ *
+ * It has four locks:
+ * \a scp_lock
+ *    serialize operations on rqbd and requests waiting for preprocess
+ * \a scp_req_lock
+ *    serialize operations on active requests sent to this portal
+ * \a scp_at_lock
+ *    serialize adaptive timeout stuff
+ * \a scp_rep_lock
+ *    serialize operations on RS list (reply states)
+ *
+ * We don't have any use-case to take two or more locks at the same time
+ * for now, so there is no lock order issue.
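+ *
+ * As a minimal, purely illustrative sketch (not lifted from the actual
+ * service code), \a scp_lock is taken in the usual spinlock style around
+ * updates to the rqbd lists and counters it protects, e.g. when moving a
+ * buffer back to the idle list:
+ *
+ *	spin_lock(&svcpt->scp_lock);
+ *	list_move(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+ *	svcpt->scp_nrqbds_posted--;
+ *	spin_unlock(&svcpt->scp_lock);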
+ */ +struct ptlrpc_service_part { + /** back reference to owner */ + struct ptlrpc_service *scp_service __cfs_cacheline_aligned; + /* CPT id, reserved */ + int scp_cpt; + /** always increasing number */ + int scp_thr_nextid; + /** # of starting threads */ + int scp_nthrs_starting; + /** # of stopping threads, reserved for shrinking threads */ + int scp_nthrs_stopping; + /** # running threads */ + int scp_nthrs_running; + /** service threads list */ + struct list_head scp_threads; + + /** + * serialize the following fields, used for protecting + * rqbd list and incoming requests waiting for preprocess, + * threads starting & stopping are also protected by this lock. + */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** userland serialization */ + struct mutex scp_mutex; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + struct list_head scp_rqbd_idle; + /** req buffers receiving */ + struct list_head scp_rqbd_posted; + /** incoming reqs */ + struct list_head scp_req_incoming; + /** timeout before re-posting reqs, in jiffies */ + long scp_rqbd_timeout; + /** + * all threads sleep on this. This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. + */ + wait_queue_head_t scp_waitq; + + /** request history */ + struct list_head scp_hist_reqs; + /** request buffer history */ + struct list_head scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the NRS heads below */ + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + struct timer_list scp_at_timer; + /** debug */ + ktime_t scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + struct list_head scp_rep_active; + /** List of free reply_states */ + struct list_head scp_rep_idle; + /** waitq to run, when adding stuff to srv_free_rs_list */ + wait_queue_head_t scp_rep_waitq; + /** # 'difficult' replies */ + atomic_t scp_nreps_difficult; +}; + +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + ((part) = (svc)->srv_parts[i]) != NULL; i++) + +/** + * Declaration of ptlrpcd control 
structure + */ +struct ptlrpcd_ctl { + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; + /** + * Thread requests set. + */ + struct ptlrpc_request_set *pc_set; + /** + * Thread name used in kthread_run() + */ + char pc_name[16]; + /** + * CPT the thread is bound on. + */ + int pc_cpt; + /** + * Index of ptlrpcd thread in the array. + */ + int pc_index; + /** + * Pointer to the array of partners' ptlrpcd_ctl structure. + */ + struct ptlrpcd_ctl **pc_partners; + /** + * Number of the ptlrpcd's partners. + */ + int pc_npartners; + /** + * Record the partner index to be processed next. + */ + int pc_cursor; + /** + * Error code if the thread failed to fully start. + */ + int pc_error; +}; + +/* Bits for pc_flags */ +enum ptlrpcd_ctl_flags { + /** + * Ptlrpc thread start flag. + */ + LIOD_START = 1 << 0, + /** + * Ptlrpc thread stop flag. + */ + LIOD_STOP = 1 << 1, + /** + * Ptlrpc thread force flag (only stop force so far). + * This will cause aborting any inflight rpcs handled + * by thread if LIOD_STOP is specified. + */ + LIOD_FORCE = 1 << 2, + /** + * This is a recovery ptlrpc thread. + */ + LIOD_RECOVERY = 1 << 3, +}; + +/** + * \addtogroup nrs + * @{ + * + * Service compatibility function; the policy is compatible with all services. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return true; +} + +/** + * Service compatibility function; the policy is compatible with only a specific + * service which is identified by its human-readable name at + * ptlrpc_service::srv_name. + * + * \param[in] svc The service the policy is attempting to register with. 
+ * \param[in] desc The policy descriptor + * + * \retval false The policy is not compatible with the service + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + LASSERT(desc->pd_compat_svc_name != NULL); + return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; +} + +/** @} nrs */ + +/* ptlrpc/events.c */ +extern struct lnet_handle_eq ptlrpc_eq_h; +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + struct lnet_process_id *peer, lnet_nid_t *self); +/** + * These callbacks are invoked by LNet when something happened to + * underlying buffer + * @{ + */ +extern void request_out_callback(struct lnet_event *ev); +extern void reply_in_callback(struct lnet_event *ev); +extern void client_bulk_callback(struct lnet_event *ev); +extern void request_in_callback(struct lnet_event *ev); +extern void reply_out_callback(struct lnet_event *ev); +#ifdef HAVE_SERVER_SUPPORT +extern void server_bulk_callback(struct lnet_event *ev); +#endif +/** @} */ + +/* ptlrpc/connection.c */ +struct ptlrpc_connection *ptlrpc_connection_get(struct lnet_process_id peer, + lnet_nid_t self, + struct obd_uuid *uuid); +int ptlrpc_connection_put(struct ptlrpc_connection *c); +struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); +int ptlrpc_connection_init(void); +void ptlrpc_connection_fini(void); +extern lnet_pid_t ptl_get_pid(void); + +/* + * Check if the peer connection is on the local node. We need to use GFP_NOFS + * for requests from a local client to avoid recursing into the filesystem + * as we might end up waiting on a page sent in the request we're serving. + * + * Use __GFP_HIGHMEM so that the pages can use all of the available memory + * on 32-bit machines. Use more aggressive GFP_HIGHUSER flags from non-local + * clients to be able to generate more memory pressure on the OSS and allow + * inactive pages to be reclaimed, since it doesn't have any other processes + * or allocations that generate memory reclaim pressure. + * + * See b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details. 
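+ *
+ * For example (an illustrative sketch only, not a quote of the actual
+ * allocation sites), a server-side allocation could pick its GFP mask
+ * based on this helper:
+ *
+ *	gfp_t mask = ptlrpc_connection_is_local(conn) ?
+ *		     (GFP_NOFS | __GFP_HIGHMEM) : GFP_HIGHUSER;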
+ */ +static inline bool ptlrpc_connection_is_local(struct ptlrpc_connection *conn) +{ + if (!conn) + return false; + + if (conn->c_peer.nid == conn->c_self) + return true; + + RETURN(LNetIsPeerLocal(conn->c_peer.nid)); +} + +/* ptlrpc/niobuf.c */ +/** + * Actual interfacing with LNet to put/get/register/unregister stuff + * @{ + */ +#ifdef HAVE_SERVER_SUPPORT +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc); +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); + +static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + LASSERT(desc != NULL); + + spin_lock(&desc->bd_lock); + rc = desc->bd_refs; + spin_unlock(&desc->bd_lock); + return rc; +} +#endif + +int ptlrpc_register_bulk(struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + int rc; + + LASSERT(req != NULL); + desc = req->rq_bulk; + + if (!desc) + return 0; + + if (req->rq_bulk_deadline > ktime_get_real_seconds()) + return 1; + + + spin_lock(&desc->bd_lock); + rc = desc->bd_refs; + spin_unlock(&desc->bd_lock); + return rc; +} + +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); +int ptlrpc_error(struct ptlrpc_request *req); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); +int ptl_send_rpc(struct ptlrpc_request *request, int noreply); +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); +/** @} */ + +/* ptlrpc/client.c */ +/** + * Client-side portals API. Everything to send requests, receive replies, + * request queues, request management, etc. 
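+ *
+ * A minimal, illustrative round-trip (the request format, version and
+ * opcode below are placeholders, not a recommendation) built from the
+ * declarations in this header looks roughly like:
+ *
+ *	req = ptlrpc_request_alloc_pack(imp, &RQF_SOME_FORMAT,
+ *					SOME_VERSION, SOME_OPCODE);
+ *	if (req == NULL)
+ *		return -ENOMEM;
+ *	ptlrpc_request_set_replen(req);
+ *	rc = ptlrpc_queue_wait(req);
+ *	ptlrpc_req_finished(req);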
+ * @{ + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force); + +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *); +void ptlrpc_cleanup_client(struct obd_import *imp); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid, + lnet_nid_t nid4refnet); + +int ptlrpc_queue_wait(struct ptlrpc_request *req); +int ptlrpc_replay_req(struct ptlrpc_request *req); +void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_cleanup_imp(struct obd_import *imp); +void ptlrpc_abort_set(struct ptlrpc_request_set *set); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg); +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *); +void ptlrpc_mark_interrupted(struct ptlrpc_request *req); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); + +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); + +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int, int, + int (*populate_pool)(struct ptlrpc_request_pool *, int)); + +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format); +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *, + const struct req_format *format); +void ptlrpc_request_free(struct ptlrpc_request *request); +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode); +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode); +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx); +void ptlrpc_req_finished(struct ptlrpc_request *request); +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); + +int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, + void *frag, int len); +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, + int pin); +static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); +} + +static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); +} + +void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); + +static inline void ptlrpc_release_bulk_page_pin(struct ptlrpc_bulk_desc *desc) +{ + int i; + + for (i = 0; i < desc->bd_iov_count ; i++) + put_page(BD_GET_KIOV(desc, i).kiov_page); +} + +static inline void ptlrpc_release_bulk_noop(struct ptlrpc_bulk_desc *desc) +{ +} + +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import 
*imp);
+__u64 ptlrpc_next_xid(void);
+__u64 ptlrpc_sample_next_xid(void);
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
+
+/* Set of routines to run a function in ptlrpcd context */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+			 int (*cb)(const struct lu_env *, void *), void *data);
+void ptlrpcd_destroy_work(void *handler);
+int ptlrpcd_queue_work(void *handler);
+
+/** @} */
+struct ptlrpc_service_buf_conf {
+	/* number of buffers to allocate when growing the pool */
+	unsigned int			bc_nbufs;
+	/* buffer size to post */
+	unsigned int			bc_buf_size;
+	/* portal to listen for requests on */
+	unsigned int			bc_req_portal;
+	/* portal to send replies to */
+	unsigned int			bc_rep_portal;
+	/* maximum request size to be accepted for this service */
+	unsigned int			bc_req_max_size;
+	/* maximum reply size this service can ever send */
+	unsigned int			bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+	/* threadname should be 8 characters or less - 6 will be added on */
+	char				*tc_thr_name;
+	/* thread count increase factor for each CPU */
+	unsigned int			tc_thr_factor;
+	/* # of service threads to start on each partition at initialization */
+	unsigned int			tc_nthrs_init;
+	/*
+	 * lower bound on the per-partition thread limit while running;
+	 * service availability may be impacted if the thread count is lower
+	 * than this value. It can be ZERO if the service doesn't require
+	 * CPU affinity or there is only one partition.
+	 */
+	unsigned int			tc_nthrs_base;
+	/* "soft" limit on the total number of threads */
+	unsigned int			tc_nthrs_max;
+	/* user-specified number of threads; it is validated against the
+	 * other members of this structure. */
+	unsigned int			tc_nthrs_user;
+	/* bind service threads to only CPUs in their associated CPT */
+	unsigned int			tc_cpu_bind;
+	/* Tags for lu_context associated with service thread */
+	__u32				tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+	struct cfs_cpt_table		*cc_cptable;
+	/* string pattern to describe CPTs for a service */
+	char				*cc_pattern;
+	/* whether or not to have per-CPT service partitions */
+	bool				cc_affinity;
+};
+
+struct ptlrpc_service_conf {
+	/* service name */
+	char				*psc_name;
+	/* soft watchdog timeout multiplier to print stuck service traces */
+	unsigned int			psc_watchdog_factor;
+	/* buffer information */
+	struct ptlrpc_service_buf_conf	psc_buf;
+	/* thread information */
+	struct ptlrpc_service_thr_conf	psc_thr;
+	/* CPU partition information */
+	struct ptlrpc_service_cpt_conf	psc_cpt;
+	/* function table */
+	struct ptlrpc_service_ops	psc_ops;
+};
+
+/* ptlrpc/service.c */
+/**
+ * Server-side services API.
Register/unregister service, request state + * management, service thread management + * + * @{ + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, + int mode, bool no_ack, bool convert_lock); +void ptlrpc_commit_replies(struct obd_export *exp); +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); +struct ptlrpc_service *ptlrpc_register_service( + struct ptlrpc_service_conf *conf, + struct kset *parent, + struct dentry *debugfs_entry); +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); + +int ptlrpc_start_threads(struct ptlrpc_service *svc); +int ptlrpc_unregister_service(struct ptlrpc_service *service); +int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_server_drop_request(struct ptlrpc_request *req); +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export); +void ptlrpc_update_export_timer(struct obd_export *exp, + time64_t extra_delay); + +int ptlrpc_hr_init(void); +void ptlrpc_hr_fini(void); + +/** @} */ + +/* ptlrpc/import.c */ +/** + * Import API + * @{ + */ +int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_connect_import_locked(struct obd_import *imp); +int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int ptlrpc_disconnect_and_idle_import(struct obd_import *imp); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); +void ptlrpc_import_enter_resend(struct obd_import *imp); +/* ptlrpc/pack_generic.c */ +int ptlrpc_reconnect_import(struct obd_import *imp); +/** @} */ + +/** + * ptlrpc msg buffer and swab interface + * + * @{ + */ +#define PTLRPC_MAX_BUFCOUNT \ + (sizeof(((struct ptlrpc_request *)0)->rq_req_swab_mask) * 8) +#define MD_MAX_BUFLEN (MDS_REG_MAXREQSIZE > OUT_MAXREQSIZE ? \ + MDS_REG_MAXREQSIZE : OUT_MAXREQSIZE) +#define PTLRPC_MAX_BUFLEN (OST_IO_MAXREQSIZE > MD_MAX_BUFLEN ? 
\ + OST_IO_MAXREQSIZE : MD_MAX_BUFLEN) +bool ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index); +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + __u32 index); +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version); +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs); +int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, + __u32 *lens, char **bufs); +int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs); +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs, int flags); +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); +int __lustre_unpack_msg(struct lustre_msg *m, int len); +__u32 lustre_msg_hdr_size(__u32 magic, __u32 count); +__u32 lustre_msg_size(__u32 magic, int count, __u32 *lengths); +__u32 lustre_msg_size_v2(int count, __u32 *lengths); +__u32 lustre_packed_msg_size(struct lustre_msg *msg); +__u32 lustre_msg_early_size(void); +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size); +void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 minlen); +__u32 lustre_msg_buflen(struct lustre_msg *m, __u32 n); +void lustre_msg_set_buflen(struct lustre_msg *m, __u32 n, __u32 len); +__u32 lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg *m, __u32 n, __u32 max_len); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_flags(struct lustre_msg *msg); +void lustre_msg_add_flags(struct lustre_msg *msg, __u32 flags); +void lustre_msg_set_flags(struct lustre_msg *msg, __u32 flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); +void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags); +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); +__u32 lustre_msg_get_type(struct lustre_msg *msg); +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg); +void lustre_msg_add_version(struct lustre_msg *msg, __u32 version); +__u32 lustre_msg_get_opc(struct lustre_msg *msg); +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg); +__u16 lustre_msg_get_tag(struct lustre_msg *msg); +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); +__u64 *lustre_msg_get_versions(struct lustre_msg *msg); +__u64 lustre_msg_get_transno(struct lustre_msg *msg); +__u64 lustre_msg_get_slv(struct lustre_msg *msg); +__u32 lustre_msg_get_limit(struct lustre_msg *msg); +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); +int lustre_msg_get_status(struct lustre_msg *msg); +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +__u32 lustre_msg_get_magic(struct lustre_msg *msg); +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg); +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg); +char *lustre_msg_get_jobid(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +__u64 
lustre_msg_get_mbits(struct lustre_msg *msg); +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); +void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle); +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid); +void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag); +void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed); +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); +void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout); +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout); +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); +void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits); + +static inline void +lustre_shrink_reply(struct ptlrpc_request *req, int segment, + unsigned int newlen, int move_data) +{ + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg); + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, + newlen, move_data); +} + +#ifdef LUSTRE_TRANSLATE_ERRNOS + +static inline int ptlrpc_status_hton(int h) +{ + /* + * Positive errnos must be network errnos, such as LUSTRE_EDEADLK, + * ELDLM_LOCK_ABORTED, etc. + */ + if (h < 0) + return -lustre_errno_hton(-h); + else + return h; +} + +static inline int ptlrpc_status_ntoh(int n) +{ + /* + * See the comment in ptlrpc_status_hton(). 
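+	 *
+	 * Illustrative sketch only (not a quote of the actual call sites):
+	 * a server stores its local errno in the reply with
+	 *
+	 *	lustre_msg_set_status(req->rq_repmsg, ptlrpc_status_hton(rc));
+	 *
+	 * and the client recovers the local value with
+	 *
+	 *	rc = ptlrpc_status_ntoh(lustre_msg_get_status(req->rq_repmsg));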
+ */ + if (n < 0) + return -lustre_errno_ntoh(-n); + else + return n; +} + +#else + +#define ptlrpc_status_hton(h) (h) +#define ptlrpc_status_ntoh(n) (n) + +#endif +/** @} */ + +/** Change request phase of \a req to \a new_phase */ +static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase == new_phase) + return; + + if (new_phase == RQ_PHASE_UNREG_RPC || + new_phase == RQ_PHASE_UNREG_BULK) { + /* No embedded unregistering phases */ + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) + return; + + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +/** + * Returns true if request \a req got early reply and hard deadline is not met + */ +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + return req->rq_early; +} + +/** + * Returns true if we got real reply from server for this request + */ +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (req->rq_reply_deadline > ktime_get_real_seconds()) + return 0; + return req->rq_replied; +} + +/** Returns true if request \a req is in process of receiving server reply */ +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (req->rq_reply_deadline > ktime_get_real_seconds()) + return 1; + return req->rq_receiving_reply; +} + +static inline int +ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) +{ + int rc; + + spin_lock(&req->rq_lock); + if (req->rq_reply_deadline > ktime_get_real_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + if (req->rq_req_deadline > ktime_get_real_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + + rc = !req->rq_req_unlinked || !req->rq_reply_unlinked || + req->rq_receiving_reply; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + smp_mb(); + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; +} + +static inline __u32 lustre_request_magic(struct ptlrpc_request *req) +{ + return lustre_msg_get_magic(req->rq_reqmsg); +} + +static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req->rq_reqmsg->lm_repsize; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EFAULT; + } +} + +static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) +{ + if (req->rq_delay_limit != 0 && + 
req->rq_queued_time + req->rq_delay_limit < ktime_get_seconds()) + return 1; + return 0; +} + +static inline int ptlrpc_no_resend(struct ptlrpc_request *req) +{ + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; +} + +/* ldlm/ldlm_lib.c */ +/** + * Target client logic + * @{ + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); +int client_obd_cleanup(struct obd_device *obddev); +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *, + void *localdata); +int client_disconnect_export(struct obd_export *exp); +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority); +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid); +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); +void client_destroy_import(struct obd_import *imp); +/** @} */ + +#ifdef HAVE_SERVER_SUPPORT +int server_disconnect_export(struct obd_export *exp); +#endif + +/* ptlrpc/pinger.c */ +/** + * Pinger API (client side only) + * @{ + */ +enum timeout_event { + TIMEOUT_GRANT = 1 +}; +struct timeout_item; +typedef int (*timeout_cb_t)(struct timeout_item *, void *); +int ptlrpc_pinger_add_import(struct obd_import *imp); +int ptlrpc_pinger_del_import(struct obd_import *imp); +struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); +int ptlrpc_obd_ping(struct obd_device *obd); +void ping_evictor_start(void); +void ping_evictor_stop(void); +void ptlrpc_pinger_ir_up(void); +void ptlrpc_pinger_ir_down(void); +/** @} */ +int ptlrpc_pinger_suppress_pings(void); + +/* ptlrpc/ptlrpcd.c */ +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); +void ptlrpcd_wake(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); +int ptlrpcd_addref(void); +void ptlrpcd_decref(void); + +/* ptlrpc/lproc_ptlrpc.c */ +/** + * procfs output related functions + * @{ + */ +const char* ll_opcode2str(__u32 opcode); +const int ll_str2opcode(const char *ops); +#ifdef CONFIG_PROC_FS +void ptlrpc_lprocfs_register_obd(struct obd_device *obd); +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); +#else +static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} +#endif +/** @} */ + +/* ptlrpc/llog_server.c */ +int llog_origin_handle_open(struct ptlrpc_request *req); +int llog_origin_handle_prev_block(struct ptlrpc_request *req); +int llog_origin_handle_next_block(struct ptlrpc_request *req); +int llog_origin_handle_read_header(struct ptlrpc_request *req); + +/* 
ptlrpc/llog_client.c */ +extern struct llog_operations llog_client_ops; +/** @} net */ + +#endif +/** @} PtlRPC */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h new file mode 100644 index 0000000000000..9d200bf651b64 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h @@ -0,0 +1,223 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2017, Intel Corporation. + * + * Author: Joshua Walgenbach + */ + +#ifndef _LUSTRE_NODEMAP_H +#define _LUSTRE_NODEMAP_H + +#include + +#define LUSTRE_NODEMAP_NAME "nodemap" + +#define LUSTRE_NODEMAP_DEFAULT_ID 0 + +/** enums containing the types of ids contained in a nodemap + * kept so other modules (mgs, mdt, etc) can define the type + * of search easily + */ + +enum nodemap_id_type { + NODEMAP_UID, + NODEMAP_GID, +}; + +enum nodemap_tree_type { + NODEMAP_FS_TO_CLIENT, + NODEMAP_CLIENT_TO_FS, +}; + +enum nodemap_mapping_modes { + NODEMAP_MAP_BOTH, + NODEMAP_MAP_UID_ONLY, + NODEMAP_MAP_GID_ONLY, +}; + +struct nodemap_pde { + char npe_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + struct proc_dir_entry *npe_proc_entry; + struct list_head npe_list_member; +}; + +/** The nodemap id 0 will be the default nodemap. 
It will have a configuration + * set by the MGS, but no ranges will be allowed as all NIDs that do not map + * will be added to the default nodemap + */ + +struct lu_nodemap { + /* human readable ID */ + char nm_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + /* flags to govern nodemap behavior */ + bool nmf_trust_client_ids:1, + nmf_deny_unknown:1, + nmf_allow_root_access:1, + nmf_map_uid_only:1, + nmf_map_gid_only:1, + nmf_enable_audit:1; + /* unique ID set by MGS */ + unsigned int nm_id; + /* nodemap ref counter */ + atomic_t nm_refcount; + /* UID to squash unmapped UIDs */ + uid_t nm_squash_uid; + /* GID to squash unmapped GIDs */ + gid_t nm_squash_gid; + /* NID range list */ + struct list_head nm_ranges; + /* lock for idmap red/black trees */ + struct rw_semaphore nm_idmap_lock; + /* UID map keyed by local UID */ + struct rb_root nm_fs_to_client_uidmap; + /* UID map keyed by remote UID */ + struct rb_root nm_client_to_fs_uidmap; + /* GID map keyed by local UID */ + struct rb_root nm_fs_to_client_gidmap; + /* GID map keyed by remote UID */ + struct rb_root nm_client_to_fs_gidmap; + /* attached client members of this nodemap */ + struct mutex nm_member_list_lock; + struct list_head nm_member_list; + /* access by nodemap name */ + struct hlist_node nm_hash; + struct nodemap_pde *nm_pde_data; + /* fileset the nodes of this nodemap are restricted to */ + char nm_fileset[PATH_MAX+1]; + /* information about the expected SELinux policy on the nodes */ + char nm_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; + + /* used when loading/unloading nodemaps */ + struct list_head nm_list; +}; + +/* Store handles to local MGC storage to save config locally. In future + * versions of nodemap, mgc will receive the config directly and so this might + * not be needed. + */ +struct nm_config_file { + struct local_oid_storage *ncf_los; + struct dt_object *ncf_obj; + struct list_head ncf_list; +}; + +void nodemap_activate(const bool value); +int nodemap_add(const char *nodemap_name); +int nodemap_del(const char *nodemap_name); +int nodemap_add_member(lnet_nid_t nid, struct obd_export *exp); +void nodemap_del_member(struct obd_export *exp); +int nodemap_parse_range(const char *range_string, lnet_nid_t range[2]); +int nodemap_parse_idmap(char *idmap_string, __u32 idmap[2]); +int nodemap_add_range(const char *name, const lnet_nid_t nid[2]); +int nodemap_del_range(const char *name, const lnet_nid_t nid[2]); +int nodemap_set_allow_root(const char *name, bool allow_root); +int nodemap_set_trust_client_ids(const char *name, bool trust_client_ids); +int nodemap_set_deny_unknown(const char *name, bool deny_unknown); +int nodemap_set_mapping_mode(const char *name, enum nodemap_mapping_modes mode); +int nodemap_set_squash_uid(const char *name, uid_t uid); +int nodemap_set_squash_gid(const char *name, gid_t gid); +int nodemap_set_audit_mode(const char *name, bool enable_audit); +bool nodemap_can_setquota(const struct lu_nodemap *nodemap); +int nodemap_add_idmap(const char *name, enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_del_idmap(const char *name, enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_set_fileset(const char *name, const char *fileset); +char *nodemap_get_fileset(const struct lu_nodemap *nodemap); +int nodemap_set_sepol(const char *name, const char *sepol); +const char *nodemap_get_sepol(const struct lu_nodemap *nodemap); +__u32 nodemap_map_id(struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + enum nodemap_tree_type tree_type, __u32 id); +ssize_t nodemap_map_acl(struct 
lu_nodemap *nodemap, void *buf, size_t size, + enum nodemap_tree_type tree_type); +#ifdef HAVE_SERVER_SUPPORT +void nodemap_test_nid(lnet_nid_t nid, char *name_buf, size_t name_len); +#else +#define nodemap_test_nid(nid, name_buf, name_len) do {} while(0) +#endif +int nodemap_test_id(lnet_nid_t nid, enum nodemap_id_type idtype, + __u32 client_id, __u32 *fs_id); + +struct nm_config_file *nm_config_file_register_mgs(const struct lu_env *env, + struct dt_object *obj, + struct local_oid_storage *los); +struct dt_device; +struct nm_config_file *nm_config_file_register_tgt(const struct lu_env *env, + struct dt_device *dev, + struct local_oid_storage *los); +void nm_config_file_deregister_mgs(const struct lu_env *env, + struct nm_config_file *ncf); +void nm_config_file_deregister_tgt(const struct lu_env *env, + struct nm_config_file *ncf); +struct lu_nodemap *nodemap_get_from_exp(struct obd_export *exp); +void nodemap_putref(struct lu_nodemap *nodemap); + +#ifdef HAVE_SERVER_SUPPORT +struct nodemap_range_tree { + struct interval_node *nmrt_range_interval_root; + unsigned int nmrt_range_highest_id; +}; + +struct nodemap_config { + /* Highest numerical lu_nodemap.nm_id defined */ + unsigned int nmc_nodemap_highest_id; + + /* Simple flag to determine if nodemaps are active */ + bool nmc_nodemap_is_active; + + /* Pointer to default nodemap as it is needed more often */ + struct lu_nodemap *nmc_default_nodemap; + + /** + * Lock required to access the range tree. + */ + struct rw_semaphore nmc_range_tree_lock; + struct nodemap_range_tree nmc_range_tree; + + /** + * Hash keyed on nodemap name containing all + * nodemaps + */ + struct cfs_hash *nmc_nodemap_hash; +}; + +struct nodemap_config *nodemap_config_alloc(void); +void nodemap_config_dealloc(struct nodemap_config *config); +void nodemap_config_set_active_mgc(struct nodemap_config *config); + +int nodemap_process_idx_pages(struct nodemap_config *config, union lu_page *lip, + struct lu_nodemap **recent_nodemap); + +#else /* disable nodemap processing in MGC of non-servers */ +static inline int nodemap_process_idx_pages(void *config, + union lu_page *lip, + struct lu_nodemap **recent_nodemap) +{ return 0; } +#endif /* HAVE_SERVER_SUPPORT */ + +int nodemap_get_config_req(struct obd_device *mgs_obd, + struct ptlrpc_request *req); +#endif /* _LUSTRE_NODEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h new file mode 100644 index 0000000000000..6397cf2f0d377 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h @@ -0,0 +1,738 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. 
+ * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) + * + */ + +#ifndef _LUSTRE_NRS_H +#define _LUSTRE_NRS_H + +/** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Not a valid opcode. + */ + PTLRPC_NRS_CTL_INVALID, + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future. + */ + PTLRPC_NRS_CTL_STOP, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. + */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. + * + * \param[in,out] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in,out] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. + * + * \param[in,out] policy The policy being started + * \param[in,out] arg A generic char buffer + * + * \see nrs_policy_start_locked() + */ + int (*op_policy_start) (struct ptlrpc_nrs_policy *policy, + char *arg); + /** + * Called when deactivating a policy via lprocfs; policies deallocate + * their resources here; this operation is optional + * + * \param[in,out] policy The policy being stopped + * + * \see nrs_policy_stop0() + */ + void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + /** + * Used for policy-specific operations; i.e. not generic ones like + * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous + * to an ioctl; this operation is optional. + * + * \param[in,out] policy The policy carrying out operation \a opc + * \param[in] opc The command operation being carried out + * \param[in,out] arg An generic buffer for communication between the + * user and the control operation + * + * \retval -ve error + * \retval 0 success + * + * \see ptlrpc_nrs_policy_control() + */ + int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); + + /** + * Called when obtaining references to the resources of the resource + * hierarchy for a request that has arrived for handling at the PTLRPC + * service. Policies should return -ve for requests they do not wish + * to handle. This operation is mandatory. + * + * \param[in,out] policy The policy we're getting resources for. + * \param[in,out] nrq The request we are getting resources for. + * \param[in] parent The parent resource of the resource being + * requested; set to NULL if none. + * \param[out] resp The resource is to be returned here; the + * fallback policy in an NRS head should + * \e always return a non-NULL pointer value. + * \param[in] moving_req When set, signifies that this is an attempt + * to obtain resources for a request being moved + * to the high-priority NRS head by + * ldlm_lock_reorder_req(). + * This implies two things: + * 1. 
We are under obd_export::exp_rpc_lock and + * so should not sleep. + * 2. We should not perform non-idempotent or can + * skip performing idempotent operations that + * were carried out when resources were first + * taken for the request when it was initialized + * in ptlrpc_nrs_req_initialize(). + * + * \retval 0, +ve The level of the returned resource in the resource + * hierarchy; currently only 0 (for a non-leaf resource) + * and 1 (for a leaf resource) are supported by the + * framework. + * \retval -ve error + * + * \see ptlrpc_nrs_req_initialize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + int (*op_res_get) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); + /** + * Called when releasing references taken for resources in the resource + * hierarchy for the request; this operation is optional. + * + * \param[in,out] policy The policy the resource belongs to + * \param[in] res The resource to be freed + * + * \see ptlrpc_nrs_req_finalize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + void (*op_res_put) (struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res); + + /** + * Obtains a request for handling from the policy, and optionally + * removes the request from the policy; this operation is mandatory. + * + * \param[in,out] policy The policy to poll + * \param[in] peek When set, signifies that we just want to + * examine the request, and not handle it, so the + * request is not removed from the policy. + * \param[in] force When set, it will force a policy to return a + * request if it has one queued. + * + * \retval NULL No request available for handling + * \retval valid-pointer The request polled for handling + * + * \see ptlrpc_nrs_req_get_nolock() + */ + struct ptlrpc_nrs_request * + (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, + bool force); + /** + * Called when attempting to add a request to a policy for later + * handling; this operation is mandatory. + * + * \param[in,out] policy The policy on which to enqueue \a nrq + * \param[in,out] nrq The request to enqueue + * + * \retval 0 success + * \retval != 0 error + * + * \see ptlrpc_nrs_req_add_nolock() + */ + int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Removes a request from the policy's set of pending requests. Normally + * called after a request has been polled successfully from the policy + * for handling; this operation is mandatory. + * + * \param[in,out] policy The policy the request \a nrq belongs to + * \param[in,out] nrq The request to dequeue + * + * \see ptlrpc_nrs_req_del_nolock() + */ + void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called after the request being carried out. Could be used for + * job/resource control; this operation is optional. + * + * \param[in,out] policy The policy which is stopping to handle request + * \a nrq + * \param[in,out] nrq The request + * + * \pre assert_spin_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_stop_nolock() + */ + void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Registers the policy's lprocfs interface with a PTLRPC service. 
+ * + * \param[in] svc The service + * + * \retval 0 success + * \retval != 0 error + */ + int (*op_lprocfs_init) (struct ptlrpc_service *svc); + /** + * Unegisters the policy's lprocfs interface with a PTLRPC service. + * + * In cases of failed policy registration in + * \e ptlrpc_nrs_policy_register(), this function may be called for a + * service which has not registered the policy successfully, so + * implementations of this method should make sure their operations are + * safe in such cases. + * + * \param[in] svc The service + */ + void (*op_lprocfs_fini) (struct ptlrpc_service *svc); +}; + +/** + * Policy flags + */ +enum nrs_policy_flags { + /** + * Fallback policy, use this flag only on a single supported policy per + * service. The flag cannot be used on policies that use + * \e PTLRPC_NRS_FL_REG_EXTERN + */ + PTLRPC_NRS_FL_FALLBACK = (1 << 0), + /** + * Start policy immediately after registering. + */ + PTLRPC_NRS_FL_REG_START = (1 << 1), + /** + * This is a policy registering from a module different to the one NRS + * core ships in (currently ptlrpc). + */ + PTLRPC_NRS_FL_REG_EXTERN = (1 << 2), +}; + +/** + * NRS queue type. + * + * Denotes whether an NRS instance is for handling normal or high-priority + * RPCs, or whether an operation pertains to one or both of the NRS instances + * in a service. + */ +enum ptlrpc_nrs_queue_type { + PTLRPC_NRS_QUEUE_REG = (1 << 0), + PTLRPC_NRS_QUEUE_HP = (1 << 1), + PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP) +}; + +/** + * NRS head + * + * A PTLRPC service has at least one NRS head instance for handling normal + * priority RPCs, and may optionally have a second NRS head instance for + * handling high-priority RPCs. Each NRS head maintains a list of available + * policies, of which one and only one policy is acting as the fallback policy, + * and optionally a different policy may be acting as the primary policy. For + * all RPCs handled by this NRS head instance, NRS core will first attempt to + * enqueue the RPC using the primary policy (if any). The fallback policy is + * used in the following cases: + * - when there was no primary policy in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request + * was initialized. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, denoted it did not wish, or for some other reason was + * not able to handle the request, by returning a non-valid NRS resource + * reference. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, fails later during the request enqueueing stage. + * + * \see nrs_resource_get_safe() + * \see nrs_request_enqueue() + */ +struct ptlrpc_nrs { + spinlock_t nrs_lock; + /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ + /** + * List of registered policies + */ + struct list_head nrs_policy_list; + /** + * List of policies with queued requests. Policies that have any + * outstanding requests are queued here, and this list is queried + * in a round-robin manner from NRS core when obtaining a request + * for handling. This ensures that requests from policies that at some + * point transition away from the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. 
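+	 *
+	 * Purely as an illustrative sketch (the real NRS core also rotates
+	 * entries to keep the scheduling fair), a consumer of this list
+	 * conceptually does:
+	 *
+	 *	list_for_each_entry(policy, &nrs->nrs_policy_queued,
+	 *			    pol_list_queued) {
+	 *		nrq = policy->pol_desc->pd_ops->op_req_get(policy,
+	 *							   peek, force);
+	 *		if (nrq != NULL)
+	 *			break;
+	 *	}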
+ */ + struct list_head nrs_policy_queued; + /** + * Service partition for this NRS head + */ + struct ptlrpc_service_part *nrs_svcpt; + /** + * Primary policy, which is the preferred policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_primary; + /** + * Fallback policy, which is the backup policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_fallback; + /** + * This NRS head handles either HP or regular requests + */ + enum ptlrpc_nrs_queue_type nrs_queue_type; + /** + * # queued requests from all policies in this NRS head + */ + unsigned long nrs_req_queued; + /** + * # scheduled requests from all policies in this NRS head + */ + unsigned long nrs_req_started; + /** + * # policies on this NRS + */ + unsigned nrs_num_pols; + /** + * This NRS head is in progress of starting a policy + */ + unsigned nrs_policy_starting:1; + /** + * In progress of shutting down the whole NRS head; used during + * unregistration + */ + unsigned nrs_stopping:1; + /** + * NRS policy is throttling reqeust + */ + unsigned nrs_throttling:1; +}; + +#define NRS_POL_NAME_MAX 16 +#define NRS_POL_ARG_MAX 16 + +struct ptlrpc_nrs_pol_desc; + +/** + * Service compatibility predicate; this determines whether a policy is adequate + * for handling RPCs of a particular PTLRPC service. + * + * XXX:This should give the same result during policy registration and + * unregistration, and for all partitions of a service; so the result should not + * depend on temporal service or other properties, that may influence the + * result. + */ +typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc); + +struct ptlrpc_nrs_pol_conf { + /** + * Human-readable policy name + */ + char nc_name[NRS_POL_NAME_MAX]; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *nc_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t nc_compat; + /** + * Set for policies that support a single ptlrpc service, i.e. ones that + * have \a pd_compat set to nrs_policy_compat_one(). The variable value + * depicts the name of the single service that such policies are + * compatible with. + */ + const char *nc_compat_svc_name; + /** + * Owner module for this policy descriptor; policies registering from a + * different module to the one the NRS framework is held within + * (currently ptlrpc), should set this field to THIS_MODULE. + */ + struct module *nc_owner; + /** + * Policy registration flags; a bitmast of \e nrs_policy_flags + */ + unsigned nc_flags; +}; + +/** + * NRS policy registering descriptor + * + * Is used to hold a description of a policy that can be passed to NRS core in + * order to register the policy with NRS heads in different PTLRPC services. + */ +struct ptlrpc_nrs_pol_desc { + /** + * Human-readable policy name + */ + char pd_name[NRS_POL_NAME_MAX]; + /** + * Link into nrs_core::nrs_policies + */ + struct list_head pd_list; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *pd_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t pd_compat; + /** + * Set for policies that are compatible with only one PTLRPC service. + * + * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name + */ + const char *pd_compat_svc_name; + /** + * Owner module for this policy descriptor. + * + * We need to hold a reference to the module whenever we might make use + * of any of the module's contents, i.e. 
+ * - If one or more instances of the policy are at a state where they + * might be handling a request, i.e. + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to + * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference + * is taken on the module when + * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it + * becomes 0, so that we hold only one reference to the module maximum + * at any time. + * + * We do not need to hold a reference to the module, even though we + * might use code and data from the module, in the following cases: + * - During external policy registration, because this should happen in + * the module's init() function, in which case the module is safe from + * removal because a reference is being held on the module by the + * kernel, and iirc kmod (and I guess module-init-tools also) will + * serialize any racing processes properly anyway. + * - During external policy unregistration, because this should happen + * in a module's exit() function, and any attempts to start a policy + * instance would need to take a reference on the module, and this is + * not possible once we have reached the point where the exit() + * handler is called. + * - During service registration and unregistration, as service setup + * and cleanup, and policy registration, unregistration and policy + * instance starting, are serialized by \e nrs_core::nrs_mutex, so + * as long as users adhere to the convention of registering policies + * in init() and unregistering them in module exit() functions, there + * should not be a race between these operations. + * - During any policy-specific lprocfs operations, because a reference + * is held by the kernel on a proc entry that has been entered by a + * syscall, so as long as proc entries are removed during + * unregistration time, then unregistration and lprocfs operations + * will be properly serialized. + */ + struct module *pd_owner; + /** + * Bitmask of \e nrs_policy_flags + */ + unsigned pd_flags; + /** + * # of references on this descriptor + */ + atomic_t pd_refs; +}; + +/** + * NRS policy state + * + * Policies transition from one state to the other during their lifetime + */ +enum ptlrpc_nrs_pol_state { + /** + * Not a valid policy state. + */ + NRS_POL_STATE_INVALID, + /** + * Policies are at this state either at the start of their life, or + * transition here when the user selects a different policy to act + * as the primary one. + */ + NRS_POL_STATE_STOPPED, + /** + * Policy is progress of stopping + */ + NRS_POL_STATE_STOPPING, + /** + * Policy is in progress of starting + */ + NRS_POL_STATE_STARTING, + /** + * A policy is in this state in two cases: + * - it is the fallback policy, which is always in this state. + * - it has been activated by the user; i.e. it is the primary policy, + */ + NRS_POL_STATE_STARTED, +}; + +/** + * NRS policy information + * + * Used for obtaining information for the status of a policy via lprocfs + */ +struct ptlrpc_nrs_pol_info { + /** + * Policy name + */ + char pi_name[NRS_POL_NAME_MAX]; + /** + * Policy argument + */ + char pi_arg[NRS_POL_ARG_MAX]; + /** + * Current policy state + */ + enum ptlrpc_nrs_pol_state pi_state; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pi_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pi_req_started; + /** + * Is this a fallback policy? 
+ */ + unsigned pi_fallback:1; +}; + +/** + * NRS policy + * + * There is one instance of this for each policy in each NRS head of each + * PTLRPC service partition. + */ +struct ptlrpc_nrs_policy { + /** + * Linkage into the NRS head's list of policies, + * ptlrpc_nrs:nrs_policy_list + */ + struct list_head pol_list; + /** + * Linkage into the NRS head's list of policies with enqueued + * requests ptlrpc_nrs:nrs_policy_queued + */ + struct list_head pol_list_queued; + /** + * Current state of this policy + */ + enum ptlrpc_nrs_pol_state pol_state; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pol_flags; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pol_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pol_req_started; + /** + * Usage Reference count taken on the policy instance + */ + long pol_ref; + /** + * Human-readable policy argument + */ + char pol_arg[NRS_POL_ARG_MAX]; + /** + * The NRS head this policy has been created at + */ + struct ptlrpc_nrs *pol_nrs; + /** + * Private policy data; varies by policy type + */ + void *pol_private; + /** + * Policy descriptor for this policy instance. + */ + struct ptlrpc_nrs_pol_desc *pol_desc; +}; + +/** + * NRS resource + * + * Resources are embedded into two types of NRS entities: + * - Inside NRS policies, in the policy's private data in + * ptlrpc_nrs_policy::pol_private + * - In objects that act as prime-level scheduling entities in different NRS + * policies; e.g. on a policy that performs round robin or similar order + * scheduling across client NIDs, there would be one NRS resource per unique + * client NID. On a policy which performs round robin scheduling across + * backend filesystem objects, there would be one resource associated with + * each of the backend filesystem objects partaking in the scheduling + * performed by the policy. + * + * NRS resources share a parent-child relationship, in which resources embedded + * in policy instances are the parent entities, with all scheduling entities + * a policy schedules across being the children, thus forming a simple resource + * hierarchy. This hierarchy may be extended with one or more levels in the + * future if the ability to have more than one primary policy is added. + * + * Upon request initialization, references to the then active NRS policies are + * taken and used to later handle the dispatching of the request with one of + * these policies. + * + * \see nrs_resource_get_safe() + * \see ptlrpc_nrs_req_add() + */ +struct ptlrpc_nrs_resource { + /** + * This NRS resource's parent; is NULL for resources embedded in NRS + * policy instances; i.e. those are top-level ones. + */ + struct ptlrpc_nrs_resource *res_parent; + /** + * The policy associated with this resource. + */ + struct ptlrpc_nrs_policy *res_policy; +}; + +enum { + NRS_RES_FALLBACK, + NRS_RES_PRIMARY, + NRS_RES_MAX +}; + +#include +#include +#include +#include +#include + +/** + * NRS request + * + * Instances of this object exist embedded within ptlrpc_request; the main + * purpose of this object is to hold references to the request's resources + * for the lifetime of the request, and to hold properties that policies use + * use for determining the request's scheduling priority. + * */ +struct ptlrpc_nrs_request { + /** + * The request's resource hierarchy. + */ + struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; + /** + * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the + * policy that was used to enqueue the request. 
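+	 * (Typically NRS_RES_PRIMARY if the primary policy accepted the
+	 * request, or NRS_RES_FALLBACK otherwise.)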
+ * + * \see nrs_request_enqueue() + */ + unsigned nr_res_idx; + unsigned nr_initialized:1; + unsigned nr_enqueued:1; + unsigned nr_started:1; + unsigned nr_finalized:1; + struct cfs_binheap_node nr_node; + + /** + * Policy-specific fields, used for determining a request's scheduling + * priority, and other supporting functionality. + */ + union { + /** + * Fields for the FIFO policy + */ + struct nrs_fifo_req fifo; + /** + * CRR-N request defintion + */ + struct nrs_crrn_req crr; + /** ORR and TRR share the same request definition */ + struct nrs_orr_req orr; + /** + * TBF request definition + */ + struct nrs_tbf_req tbf; + /** + * Fields for the delay policy + */ + struct nrs_delay_req delay; + } nr_u; + /** + * Externally-registering policies may want to use this to allocate + * their own request properties. + */ + void *ext; +}; + +/** @} nrs */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h new file mode 100644 index 0000000000000..f057ec72d9289 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h @@ -0,0 +1,126 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) Client Round Robin over NIDs (CRR-N) policy + * + */ + +#ifndef _LUSTRE_NRS_CRR_H +#define _LUSTRE_NRS_CRR_H + +/** + * \name CRR-N + * + * CRR-N, Client Round Robin over NIDs + * @{ + */ + +/** + * private data structure for CRR-N NRS + */ +struct nrs_crrn_net { + struct ptlrpc_nrs_resource cn_res; + struct cfs_binheap *cn_binheap; + struct cfs_hash *cn_cli_hash; + /** + * Used when a new scheduling round commences, in order to synchronize + * all clients with the new round number. + */ + __u64 cn_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 cn_sequence; + /** + * Round Robin quantum; the maximum number of RPCs that each request + * batch for each client can have in a scheduling round. + */ + __u16 cn_quantum; +}; + +/** + * Object representing a client in CRR-N, as identified by its NID + */ +struct nrs_crrn_client { + struct ptlrpc_nrs_resource cc_res; + struct hlist_node cc_hnode; + lnet_nid_t cc_nid; + /** + * The round number against which this client is currently scheduling + * requests. + */ + __u64 cc_round; + /** + * The sequence number used for requests scheduled by this client during + * the current round number. + */ + __u64 cc_sequence; + atomic_t cc_ref; + /** + * Round Robin quantum; the maximum number of RPCs the client is allowed + * to schedule in a single batch of each round. 
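+	 * e.g. with a quantum of 8, up to 8 of this client's RPCs form one
+	 * batch in a round; further RPCs go into the client's next round.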
+ */ + __u16 cc_quantum; + /** + * # of pending requests for this client, on all existing rounds + */ + __u16 cc_active; +}; + +/** + * CRR-N NRS request definition + */ +struct nrs_crrn_req { + /** + * Round number for this request; shared with all other requests in the + * same batch. + */ + __u64 cr_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 cr_sequence; +}; + +/** + * CRR-N policy operations. + */ +enum nrs_ctl_crr { + /** + * Read the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_WR_QUANTUM, +}; + +/** @} CRR-N */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h new file mode 100644 index 0000000000000..01605a7f4129e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Cray Inc. All Rights Reserved. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * + * Network Request Scheduler (NRS) Delay policy + * + */ + +#ifndef _LUSTRE_NRS_DELAY_H +#define _LUSTRE_NRS_DELAY_H + +/* \name delay + * + * Delay policy + * @{ + */ + +/** + * Private data structure for the delay policy + */ +struct nrs_delay_data { + struct ptlrpc_nrs_resource delay_res; + + /** + * Delayed requests are stored in this binheap until they are + * removed for handling. + */ + struct cfs_binheap *delay_binheap; + + /** + * Minimum service time + */ + __u32 min_delay; + + /** + * Maximum service time + */ + __u32 max_delay; + + /** + * We'll delay this percent of requests + */ + __u32 delay_pct; +}; + +struct nrs_delay_req { + /** + * This is the time at which a request becomes eligible for handling + */ + time64_t req_start_time; +}; + +enum nrs_ctl_delay { + NRS_CTL_DELAY_RD_MIN = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_DELAY_WR_MIN, + NRS_CTL_DELAY_RD_MAX, + NRS_CTL_DELAY_WR_MAX, + NRS_CTL_DELAY_RD_PCT, + NRS_CTL_DELAY_WR_PCT, +}; + +/** @} delay */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h new file mode 100644 index 0000000000000..3b5418eac6c44 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) First-in First-out (FIFO) policy + * + */ + +#ifndef _LUSTRE_NRS_FIFO_H +#define _LUSTRE_NRS_FIFO_H + +/* \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + struct list_head fh_list; + /** + * For debugging purposes. + */ + __u64 fh_sequence; +}; + +struct nrs_fifo_req { + struct list_head fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h new file mode 100644 index 0000000000000..d9789b26286aa --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) Object-based Round Robin and Target-based + * Round Robin (ORR and TRR) policies + * + */ + +#ifndef _LUSTRE_NRS_ORR_H +#define _LUSTRE_NRS_ORR_H + +/** + * ORR policy operations + */ +enum nrs_ctl_orr { + NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_ORR_WR_QUANTUM, + NRS_CTL_ORR_RD_OFF_TYPE, + NRS_CTL_ORR_WR_OFF_TYPE, + NRS_CTL_ORR_RD_SUPP_REQ, + NRS_CTL_ORR_WR_SUPP_REQ, +}; + +/** + * \name ORR/TRR + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * @{ + */ + +/** + * Lower and upper byte offsets of a brw RPC + */ +struct nrs_orr_req_range { + __u64 or_start; + __u64 or_end; +}; + +/** + * RPC types supported by the ORR/TRR policies + */ +enum nrs_orr_supp { + NOS_OST_READ = (1 << 0), + NOS_OST_WRITE = (1 << 1), + NOS_OST_RW = (NOS_OST_READ | NOS_OST_WRITE), + /** + * Default value for policies. 
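+	 * (i.e. unless configured otherwise, ORR/TRR instances only handle
+	 * OST read RPCs.)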
+ */ + NOS_DFLT = NOS_OST_READ +}; + +/** + * As unique keys for grouping RPCs together, we use the object's OST FID for + * the ORR policy, and the OST index for the TRR policy. + * + * XXX: We waste some space for TRR policy instances by using a union, but it + * allows to consolidate some of the code between ORR and TRR, and these + * policies will probably eventually merge into one anyway. + */ +struct nrs_orr_key { + union { + /** object FID for ORR */ + struct lu_fid ok_fid; + /** OST index for TRR */ + __u32 ok_idx; + }; +}; + +/** + * The largest base string for unique hash/slab object names is + * "nrs_orr_reg_", so 13 characters. We add 3 to this to be used for the CPT + * id number, so this _should_ be more than enough for the maximum number of + * CPTs on any system. If it does happen that this statement is incorrect, + * nrs_orr_genobjname() will inevitably yield a non-unique name and cause + * kmem_cache_create() to complain (on Linux), so the erroneous situation + * will hopefully not go unnoticed. + */ +#define NRS_ORR_OBJ_NAME_MAX (sizeof("nrs_orr_reg_") + 3) + +/** + * private data structure for ORR and TRR NRS + */ +struct nrs_orr_data { + struct ptlrpc_nrs_resource od_res; + struct cfs_binheap *od_binheap; + struct cfs_hash *od_obj_hash; + struct kmem_cache *od_cache; + /** + * Used when a new scheduling round commences, in order to synchronize + * all object or OST batches with the new round number. + */ + __u64 od_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 od_sequence; + /** + * RPC types that are currently supported. + */ + enum nrs_orr_supp od_supp; + /** + * Round Robin quantum; the maxium number of RPCs that each request + * batch for each object or OST can have in a scheduling round. + */ + __u16 od_quantum; + /** + * Whether to use physical disk offsets or logical file offsets. + */ + bool od_physical; + /** + * XXX: We need to provide a persistently allocated string to hold + * unique object names for this policy, since in currently supported + * versions of Linux by Lustre, kmem_cache_create() just sets a pointer + * to the name string provided. kstrdup() is used in the version of + * kmeme_cache_create() in current Linux mainline, so we may be able to + * remove this in the future. + */ + char od_objname[NRS_ORR_OBJ_NAME_MAX]; +}; + +/** + * Represents a backend-fs object or OST in the ORR and TRR policies + * respectively + */ +struct nrs_orr_object { + struct ptlrpc_nrs_resource oo_res; + struct hlist_node oo_hnode; + /** + * The round number against which requests are being scheduled for this + * object or OST + */ + __u64 oo_round; + /** + * The sequence number used for requests scheduled for this object or + * OST during the current round number. + */ + __u64 oo_sequence; + /** + * The key of the object or OST for which this structure instance is + * scheduling RPCs + */ + struct nrs_orr_key oo_key; + long oo_ref; + /** + * Round Robin quantum; the maximum number of RPCs that are allowed to + * be scheduled for the object or OST in a single batch of each round. + */ + __u16 oo_quantum; + /** + * # of pending requests for this object or OST, on all existing rounds + */ + __u16 oo_active; +}; + +/** + * ORR/TRR NRS request definition + */ +struct nrs_orr_req { + /** + * The offset range this request covers + */ + struct nrs_orr_req_range or_range; + /** + * Round number for this request; shared with all other requests in the + * same batch. 
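+	 * Requests are ordered primarily by this round number, and batches
+	 * within a round by their sequence number (see od_round and
+	 * od_sequence above).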
+ */ + __u64 or_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 or_sequence; + /** + * For debugging purposes. + */ + struct nrs_orr_key or_key; + /** + * An ORR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_orr_set:1; + /** + * A TRR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_trr_set:1; + /** + * Request offset ranges have been filled in with logical offset + * values. + */ + unsigned int or_logical_set:1; + /** + * Request offset ranges have been filled in with physical offset + * values. + */ + unsigned int or_physical_set:1; +}; + +/** @} ORR/TRR */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h new file mode 100644 index 0000000000000..0a407197c36f6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h @@ -0,0 +1,381 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013 DataDirect Networks, Inc. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * + * Network Request Scheduler (NRS) Token Bucket Filter(TBF) policy + * + */ + +#ifndef _LUSTRE_NRS_TBF_H +#define _LUSTRE_NRS_TBF_H + +/* \name tbf + * + * TBF policy + * + * @{ + */ + +struct nrs_tbf_head; +struct nrs_tbf_cmd; + +#define NRS_TBF_MATCH_FULL 0x0000001 +#define NRS_TBF_MATCH_WILDCARD 0x0000002 + +struct nrs_tbf_jobid { + char *tj_id; + __u32 tj_match_flag; + struct list_head tj_linkage; +}; + +#define MAX_U32_STR_LEN 10 +#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + \ + MAX_U32_STR_LEN + MAX_U32_STR_LEN + 3 + 2) + +enum nrs_tbf_flag { + NRS_TBF_FLAG_INVALID = 0x0000000, + NRS_TBF_FLAG_JOBID = 0x0000001, + NRS_TBF_FLAG_NID = 0x0000002, + NRS_TBF_FLAG_OPCODE = 0x0000004, + NRS_TBF_FLAG_GENERIC = 0x0000008, + NRS_TBF_FLAG_UID = 0x0000010, + NRS_TBF_FLAG_GID = 0x0000020, +}; + +struct tbf_id { + enum nrs_tbf_flag ti_type; + u32 ti_uid; + u32 ti_gid; +}; + +struct nrs_tbf_id { + struct tbf_id nti_id; + struct list_head nti_linkage; +}; + +struct nrs_tbf_client { + /** Resource object for policy instance. */ + struct ptlrpc_nrs_resource tc_res; + /** Node in the hash table. */ + struct hlist_node tc_hnode; + /** NID of the client. */ + lnet_nid_t tc_nid; + /** Jobid of the client. */ + char tc_jobid[LUSTRE_JOBID_SIZE]; + /** opcode of the client. */ + __u32 tc_opcode; + /** gid or uid of the client. */ + struct tbf_id tc_id; + /** Hash key of the client. */ + char tc_key[NRS_TBF_KEY_LEN]; + /** Reference number of the client. */ + atomic_t tc_ref; + /** Lock to protect rule and linkage. 
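 * (i.e. nrs_tbf_client::tc_linkage and nrs_tbf_client::tc_rule below.)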
*/ + spinlock_t tc_rule_lock; + /** Linkage to rule. */ + struct list_head tc_linkage; + /** Pointer to rule. */ + struct nrs_tbf_rule *tc_rule; + /** Generation of the rule matched. */ + __u64 tc_rule_generation; + /** Limit of RPC rate. */ + __u64 tc_rpc_rate; + /** Time to wait for next token. */ + __u64 tc_nsecs; + /** RPC token number. */ + __u64 tc_ntoken; + /** Token bucket depth. */ + __u64 tc_depth; + /** Time check-point. */ + __u64 tc_check_time; + /** Deadline of a class */ + __u64 tc_deadline; + /** + * Time residue: the remainder of elapsed time + * divided by nsecs when dequeue a request. + */ + __u64 tc_nsecs_resid; + /** List of queued requests. */ + struct list_head tc_list; + /** Node in binary heap. */ + struct cfs_binheap_node tc_node; + /** Whether the client is in heap. */ + bool tc_in_heap; + /** Sequence of the newest rule. */ + __u32 tc_rule_sequence; + /** + * Linkage into LRU list. Protected bucket lock of + * nrs_tbf_head::th_cli_hash. + */ + struct list_head tc_lru; +}; + +#define MAX_TBF_NAME (16) + +enum nrs_rule_flags { + NTRS_STOPPING = 0x00000001, + NTRS_DEFAULT = 0x00000002, + NTRS_REALTIME = 0x00000004, +}; + +struct nrs_tbf_rule { + /** Name of the rule. */ + char tr_name[MAX_TBF_NAME]; + /** Head belongs to. */ + struct nrs_tbf_head *tr_head; + /** Likage to head. */ + struct list_head tr_linkage; + /** Nid list of the rule. */ + struct list_head tr_nids; + /** Nid list string of the rule.*/ + char *tr_nids_str; + /** Jobid list of the rule. */ + struct list_head tr_jobids; + /** Jobid list string of the rule.*/ + char *tr_jobids_str; + /** uid/gid list of the rule. */ + struct list_head tr_ids; + /** uid/gid list string of the rule. */ + char *tr_ids_str; + /** Opcode bitmap of the rule. */ + struct cfs_bitmap *tr_opcodes; + /** Opcode list string of the rule.*/ + char *tr_opcodes_str; + /** Condition list of the rule.*/ + struct list_head tr_conds; + /** Generic condition string of the rule. */ + char *tr_conds_str; + /** RPC/s limit. */ + __u64 tr_rpc_rate; + /** Time to wait for next token. */ + __u64 tr_nsecs; + /** Token bucket depth. */ + __u64 tr_depth; + /** Lock to protect the list of clients. */ + spinlock_t tr_rule_lock; + /** List of client. */ + struct list_head tr_cli_list; + /** Flags of the rule. */ + enum nrs_rule_flags tr_flags; + /** Usage Reference count taken on the rule. */ + atomic_t tr_ref; + /** Generation of the rule. 
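 * \see nrs_tbf_client::tc_rule_generation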
*/ + __u64 tr_generation; +}; + +struct nrs_tbf_ops { + char *o_name; + int (*o_startup)(struct ptlrpc_nrs_policy *, struct nrs_tbf_head *); + struct nrs_tbf_client *(*o_cli_find)(struct nrs_tbf_head *, + struct ptlrpc_request *); + struct nrs_tbf_client *(*o_cli_findadd)(struct nrs_tbf_head *, + struct nrs_tbf_client *); + void (*o_cli_put)(struct nrs_tbf_head *, struct nrs_tbf_client *); + void (*o_cli_init)(struct nrs_tbf_client *, struct ptlrpc_request *); + int (*o_rule_init)(struct ptlrpc_nrs_policy *, + struct nrs_tbf_rule *, + struct nrs_tbf_cmd *); + int (*o_rule_dump)(struct nrs_tbf_rule *, struct seq_file *); + int (*o_rule_match)(struct nrs_tbf_rule *, + struct nrs_tbf_client *); + void (*o_rule_fini)(struct nrs_tbf_rule *); +}; + +#define NRS_TBF_TYPE_JOBID "jobid" +#define NRS_TBF_TYPE_NID "nid" +#define NRS_TBF_TYPE_OPCODE "opcode" +#define NRS_TBF_TYPE_GENERIC "generic" +#define NRS_TBF_TYPE_UID "uid" +#define NRS_TBF_TYPE_GID "gid" +#define NRS_TBF_TYPE_MAX_LEN 20 + +struct nrs_tbf_type { + const char *ntt_name; + enum nrs_tbf_flag ntt_flag; + struct nrs_tbf_ops *ntt_ops; +}; + +struct nrs_tbf_bucket { + /** + * LRU list, updated on each access to client. Protected by + * bucket lock of nrs_tbf_head::th_cli_hash. + */ + struct list_head ntb_lru; +}; + +/** + * Private data structure for the TBF policy + */ +struct nrs_tbf_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource th_res; + /** + * List of rules. + */ + struct list_head th_list; + /** + * Lock to protect the list of rules. + */ + spinlock_t th_rule_lock; + /** + * Generation of rules. + */ + atomic_t th_rule_sequence; + /** + * Default rule. + */ + struct nrs_tbf_rule *th_rule; + /** + * Timer for next token. + */ + struct hrtimer th_timer; + /** + * Deadline of the timer. + */ + __u64 th_deadline; + /** + * Sequence of requests. + */ + __u64 th_sequence; + /** + * Heap of queues. + */ + struct cfs_binheap *th_binheap; + /** + * Hash of clients. + */ + struct cfs_hash *th_cli_hash; + /** + * Type of TBF policy. + */ + char th_type[NRS_TBF_TYPE_MAX_LEN + 1]; + /** + * Rule operations. + */ + struct nrs_tbf_ops *th_ops; + /** + * Flag of type. + */ + __u32 th_type_flag; + /** + * Index of bucket on hash table while purging. + */ + int th_purge_start; +}; + +enum nrs_tbf_cmd_type { + NRS_CTL_TBF_START_RULE = 0, + NRS_CTL_TBF_STOP_RULE, + NRS_CTL_TBF_CHANGE_RULE, +}; + +struct nrs_tbf_cmd { + enum nrs_tbf_cmd_type tc_cmd; + char *tc_name; + union { + struct nrs_tbf_cmd_start { + __u64 ts_rpc_rate; + struct list_head ts_nids; + char *ts_nids_str; + struct list_head ts_jobids; + char *ts_jobids_str; + struct list_head ts_ids; + char *ts_ids_str; + struct cfs_bitmap *ts_opcodes; + char *ts_opcodes_str; + struct list_head ts_conds; + char *ts_conds_str; + __u32 ts_valid_type; + enum nrs_rule_flags ts_rule_flags; + char *ts_next_name; + } tc_start; + struct nrs_tbf_cmd_change { + __u64 tc_rpc_rate; + char *tc_next_name; + } tc_change; + } u; +}; + +enum nrs_tbf_field { + NRS_TBF_FIELD_NID, + NRS_TBF_FIELD_JOBID, + NRS_TBF_FIELD_OPCODE, + NRS_TBF_FIELD_UID, + NRS_TBF_FIELD_GID, + NRS_TBF_FIELD_MAX +}; + +struct nrs_tbf_expression { + enum nrs_tbf_field te_field; + struct list_head te_cond; + struct cfs_bitmap *te_opcodes; + struct list_head te_linkage; +}; + +struct nrs_tbf_conjunction { + /** + * link to disjunction. + */ + struct list_head tc_linkage; + /** + * list of logical conjunction + */ + struct list_head tc_expressions; +}; + +struct nrs_tbf_req { + /** + * Linkage to queue. 
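+	 * \see nrs_tbf_client::tc_list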
+ */ + struct list_head tr_list; + /** + * Sequence of the request. + */ + __u64 tr_sequence; +}; + +/** + * TBF policy operations. + */ +enum nrs_ctl_tbf { + /** + * Read the the data of a TBF policy. + */ + NRS_CTL_TBF_RD_RULE = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the the data of a TBF policy. + */ + NRS_CTL_TBF_WR_RULE, + /** + * Read the TBF policy type preset by proc entry "nrs_policies". + */ + NRS_CTL_TBF_RD_TYPE_FLAG, +}; + +/** @} tbf */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h new file mode 100644 index 0000000000000..dd99eee5af714 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Define obdo associated functions + * obdo: OBject Device o... + */ + +#ifndef _LUSTRE_OBDO_H_ +#define _LUSTRE_OBDO_H_ + +#include + +/** + * Create an obdo to send over the wire + */ +void lustre_set_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo); + +/** + * Create a local obdo from a wire based odbo + */ +void lustre_get_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo); +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_osc.h b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h new file mode 100644 index 0000000000000..f865036f897cf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h @@ -0,0 +1,979 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * lustre/include/lustre_osc.h + * + * OSC layer structures and methods common for both OSC and MDC. + * + * This file contains OSC interfaces used by OSC and MDC. Most of them + * were just moved from lustre/osc/osc_cl_internal.h for Data-on-MDT + * purposes. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + * Author: Mikhail Pershin + */ + +#ifndef LUSTRE_OSC_H +#define LUSTRE_OSC_H + +#include +#include +#include + +/** \defgroup osc osc + * @{ + */ + +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + __u32 oqi_id; +}; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + unsigned short oap_interrupted:1; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + loff_t oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) +{ + return container_of(pga, struct osc_async_page, oap_brw_page); +} + +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. */ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + time64_t od_contention_time; + int od_lockless_truncate; +}; + +struct osc_extent; + +/** + * State maintained by osc layer for each IO context. + */ +struct osc_io { + /** super class */ + struct cl_io_slice oi_cl; + /** true if this io is lockless. */ + unsigned int oi_lockless:1, + /** true if this io is counted as active IO */ + oi_is_active:1, + /** true if this io has CAP_SYS_RESOURCE */ + oi_cap_sys_resource:1; + /** how many LRU pages are reserved for this IO */ + unsigned long oi_lru_reserved; + + /** active extents, we know how many bytes is going to be written, + * so having an active extent will prevent it from being fragmented */ + struct osc_extent *oi_active; + /** partially truncated extent, we need to hold this extent to prevent + * page writeback from happening. */ + struct osc_extent *oi_trunc; + /** write osc_lock for this IO, used by osc_extent_find(). */ + struct osc_lock *oi_write_osclock; + struct obdo oi_oa; + struct osc_async_cbargs { + bool opc_rpc_sent; + int opc_rc; + struct completion opc_sync; + } oi_cbarg; +}; + +/** + * State maintained by osc layer for the duration of a system call. 
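+ * Obtained from the environment with osc_env_session(); the embedded
+ * osc_io is what osc_env_io() returns.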
+ */ +struct osc_session { + struct osc_io os_io; +}; + +#define OTI_PVEC_SIZE 256 +struct osc_thread_info { + struct ldlm_res_id oti_resname; + union ldlm_policy_data oti_policy; + struct cl_lock_descr oti_descr; + struct cl_attr oti_attr; + struct lustre_handle oti_handle; + struct cl_page_list oti_plist; + struct cl_io oti_io; + struct pagevec oti_pagevec; + void *oti_pvec[OTI_PVEC_SIZE]; + /** + * Fields used by cl_lock_discard_pages(). + */ + pgoff_t oti_next_index; + pgoff_t oti_fn_index; /* first non-overlapped index */ + struct cl_sync_io oti_anchor; + struct cl_req_attr oti_req_attr; + struct lu_buf oti_ladvise_buf; +}; + +static inline __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags); + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_GLIMPSE) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + if (enqflags & CEF_PEEK) + result |= LDLM_FL_TEST_LOCK; + if (enqflags & CEF_LOCK_MATCH) + result |= LDLM_FL_MATCH_LOCK; + if (enqflags & CEF_LOCK_NO_EXPAND) + result |= LDLM_FL_NO_EXPANSION; + if (enqflags & CEF_SPECULATIVE) + result |= LDLM_FL_SPECULATIVE; + return result; +} + +typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, + int rc); + +struct osc_enqueue_args { + struct obd_export *oa_exp; + enum ldlm_type oa_type; + enum ldlm_mode oa_mode; + __u64 *oa_flags; + osc_enqueue_upcall_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle oa_lockh; + bool oa_speculative; +}; + +/** + * Bit flags for osc_dlm_lock_at_pageoff(). + */ +enum osc_dap_flags { + /** + * Just check if the desired lock exists, it won't hold reference + * count on lock. + */ + OSC_DAP_FL_TEST_LOCK = 1 << 0, + /** + * Return the lock even if it is being canceled. + */ + OSC_DAP_FL_CANCELING = 1 << 1 +}; + +/* + * The set of operations which are different for MDC and OSC objects + */ +struct osc_object_operations { + void (*oto_build_res_name)(struct osc_object *osc, + struct ldlm_res_id *resname); + struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags); +}; + +struct osc_object { + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + ktime_t oo_contention_time; +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + /** + * IO context used for invariant checks in osc_lock_has_pages(). + */ + struct cl_io oo_debug_io; + /** Serialization object for osc_object::oo_debug_io. */ + struct mutex oo_debug_mutex; +#endif + /** + * used by the osc to keep track of what objects to build into rpcs. + * Protected by client_obd->cli_loi_list_lock. + */ + struct list_head oo_ready_item; + struct list_head oo_hp_ready_item; + struct list_head oo_write_item; + struct list_head oo_read_item; + + /** + * extent is a red black tree to manage (async) dirty pages. + */ + struct rb_root oo_root; + /** + * Manage write(dirty) extents. + */ + struct list_head oo_hp_exts; /* list of hp extents */ + struct list_head oo_urgent_exts; /* list of writeback extents */ + struct list_head oo_full_exts; + + struct list_head oo_reading_exts; + + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; + + /** Protect extent tree. Will be used to protect + * oo_{read|write}_pages soon. 
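	 * Taken through the osc_object_lock()/osc_object_trylock()/
	 * osc_object_unlock() helpers below.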
*/ + spinlock_t oo_lock; + + /** + * Radix tree for caching pages + */ + spinlock_t oo_tree_lock; + struct radix_tree_root oo_tree; + unsigned long oo_npages; + + /* Protect osc_lock this osc_object has */ + struct list_head oo_ol_list; + spinlock_t oo_ol_spin; + + /** number of active IOs of this object */ + atomic_t oo_nr_ios; + wait_queue_head_t oo_io_waitq; + + const struct osc_object_operations *oo_obj_ops; + bool oo_initialized; +}; + +static inline void osc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + return osc->oo_obj_ops->oto_build_res_name(osc, resname); +} + +static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags flags) +{ + return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags); +} + +static inline void osc_object_lock(struct osc_object *obj) +{ + spin_lock(&obj->oo_lock); +} + +static inline int osc_object_trylock(struct osc_object *obj) +{ + return spin_trylock(&obj->oo_lock); +} + +static inline void osc_object_unlock(struct osc_object *obj) +{ + spin_unlock(&obj->oo_lock); +} + +static inline int osc_object_is_locked(struct osc_object *obj) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + return spin_is_locked(&obj->oo_lock); +#else + /* + * It is not perfect to return true all the time. + * But since this function is only used for assertion + * and checking, it seems OK. + */ + return 1; +#endif +} + +static inline void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = ktime_get(); + /* mb(); */ + obj->oo_contended = 1; +} + +static inline void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + +/* + * Lock "micro-states" for osc layer. + */ +enum osc_lock_state { + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_CANCELLED +}; + +/** + * osc-private state of cl_lock. + * + * Interaction with DLM. + * + * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in + * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_dlmlock. + * + * This pointer is protected through a reference, acquired by + * osc_lock_upcall0(). Also, an additional reference is acquired by + * ldlm_lock_addref() call protecting the lock from cancellation, until + * osc_lock_unuse() releases it. + * + * Below is a description of how lock references are acquired and released + * inside of DLM. + * + * - When new lock is created and enqueued to the server (ldlm_cli_enqueue()) + * - ldlm_lock_create() + * - ldlm_lock_new(): initializes a lock with 2 references. One for + * the caller (released when reply from the server is received, or on + * error), and another for the hash table. + * - ldlm_lock_addref_internal(): protects the lock from cancellation. + * + * - When reply is received from the server (osc_enqueue_interpret()) + * - ldlm_cli_enqueue_fini() + * - LDLM_LOCK_PUT(): releases caller reference acquired by + * ldlm_lock_new(). + * - if (rc != 0) + * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue(). + * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue(). + * + * - When lock is being cancelled (ldlm_lock_cancel()) + * - ldlm_lock_destroy() + * - LDLM_LOCK_PUT(): releases hash-table reference acquired by + * ldlm_lock_new(). 
+ * + * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called + * either when lock is cancelled (osc_lock_blocking()), or when locks is + * deleted without cancellation (e.g., from cl_locks_prune()). In the latter + * case ldlm lock remains in memory, and can be re-attached to osc_lock in the + * future. + */ +struct osc_lock { + struct cl_lock_slice ols_cl; + /** Internal lock to protect states, etc. */ + spinlock_t ols_lock; + /** Owner sleeps on this channel for state change */ + struct cl_sync_io *ols_owner; + /** waiting list for this lock to be cancelled */ + struct list_head ols_waiting_list; + /** wait entry of ols_waiting_list */ + struct list_head ols_wait_entry; + /** list entry for osc_object::oo_ol_list */ + struct list_head ols_nextlock_oscobj; + + /** underlying DLM lock */ + struct ldlm_lock *ols_dlmlock; + /** DLM flags with which osc_lock::ols_lock was enqueued */ + __u64 ols_flags; + /** osc_lock::ols_lock handle */ + struct lustre_handle ols_handle; + struct ldlm_enqueue_info ols_einfo; + enum osc_lock_state ols_state; + /** lock value block */ + struct ost_lvb ols_lvb; + /** Lockless operations to be used by lockless lock */ + const struct cl_lock_operations *ols_lockless_ops; + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. + * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ reference in released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat + * the EVAVAIL error as torerable, this will make upper logic happy + * to wait all glimpse locks to each OSTs to be completed. + * Glimpse lock converts to normal lock if the server lock is granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1, + /** + * for speculative locks - asynchronous glimpse locks and ladvise + * lockahead manual lock requests + * + * Used to tell osc layer to not wait for the ldlm reply from the + * server, so the osc lock will be short lived - It only exists to + * create the ldlm request and is not updated on request completion. + */ + ols_speculative:1; +}; + +static inline int osc_lock_is_lockless(const struct osc_lock *ols) +{ + return (ols->ols_cl.cls_ops == ols->ols_lockless_ops); +} + +/** + * Page state private for osc layer. + */ +struct osc_page { + struct cl_page_slice ops_cl; + /** + * Page queues used by osc to detect when RPC can be formed. + */ + struct osc_async_page ops_oap; + /** + * An offset within page from which next transfer starts. This is used + * by cl_page_clip() to submit partial page transfers. 
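+	 * For instance, clipping a page to bytes [512, PAGE_SIZE) would
+	 * leave ops_from == 512 and ops_to == PAGE_SIZE, and only that
+	 * range is transferred.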
+ */ + int ops_from; + /** + * An offset within page at which next transfer ends. + * + * \see osc_page::ops_from. + */ + int ops_to; + /** + * Boolean, true iff page is under transfer. Used for sanity checking. + */ + unsigned ops_transfer_pinned:1, + /** + * in LRU? + */ + ops_in_lru:1, + /** + * Set if the page must be transferred with OBD_BRW_SRVLOCK. + */ + ops_srvlock:1, + /** + * If the page is in osc_object::oo_tree. + */ + ops_intree:1; + /** + * lru page list. See osc_lru_{del|use}() in osc_page.c for usage. + */ + struct list_head ops_lru; + /** + * Submit time - the time when the page is starting RPC. For debugging. + */ + ktime_t ops_submit_time; +}; + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + s32 aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; +}; + +extern struct kmem_cache *osc_lock_kmem; +extern struct kmem_cache *osc_object_kmem; +extern struct kmem_cache *osc_thread_kmem; +extern struct kmem_cache *osc_session_kmem; +extern struct kmem_cache *osc_extent_kmem; +extern struct kmem_cache *osc_quota_kmem; +extern struct kmem_cache *osc_obdo_kmem; + +extern struct lu_context_key osc_key; +extern struct lu_context_key osc_session_key; + +#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) + +/* osc_page.c */ +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t ind); +void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end); +void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags); +int lru_queue_work(const struct lu_env *env, void *data); +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force); + +/* osc_cache.c */ +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); +int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, + u32 async_flags); +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset); +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); +int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, + struct osc_page *ops); +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags); +int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, + __u64 size, struct osc_extent **extp); +void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard); +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end); +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async); +static inline void osc_wake_cache_waiters(struct client_obd *cli) +{ + wake_up(&cli->cl_cache_waiters); +} + +static inline int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, + struct 
osc_object *osc) +{ + return osc_io_unplug0(env, cli, osc, 1); +} + +static inline void osc_io_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + (void)osc_io_unplug0(env, cli, osc, 0); +} + +typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, + struct osc_page *, void *); +int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata); +int osc_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata); + +/* osc_dev.c */ +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d); +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d); + +/* osc_object.c */ +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void osc_object_free(const struct lu_env *env, struct lu_object *obj); +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj); +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, + struct ost_lvb *lvb); +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); +int osc_object_is_contended(struct osc_object *obj); +int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj, + ldlm_iterator_t iter, void *data); +int osc_object_prune(const struct lu_env *env, struct cl_object *obj); + +/* osc_request.c */ +void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd); +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg); +int osc_precleanup_common(struct obd_device *obd); +int osc_cleanup_common(struct obd_device *obd); +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set); +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg); +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata); +int osc_disconnect(struct obd_export *exp); +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie); + +/* osc_io.c */ +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue); +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb); +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios); +void osc_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios); +void 
osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice); +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio); +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_read_ahead_release(const struct lu_env *env, void *cbdata); + +/* osc_lock.c */ +void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols, + int force); +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl); +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl); +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl); +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice); +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice); +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice); +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); + +/***************************************************************************** + * + * Accessors and type conversions. + * + */ +static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) +{ + struct osc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct osc_session *osc_env_session(const struct lu_env *env) +{ + struct osc_session *ses; + + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct osc_io *osc_env_io(const struct lu_env *env) +{ + return &osc_env_session(env)->os_io; +} + +static inline struct osc_device *lu2osc_dev(const struct lu_device *d) +{ + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); +} + +static inline struct obd_export *osc_export(const struct osc_object *obj) +{ + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; +} + +static inline struct client_obd *osc_cli(const struct osc_object *obj) +{ + return &osc_export(obj)->exp_obd->u.cli; +} + +static inline struct osc_object *cl2osc(const struct cl_object *obj) +{ + return container_of0(obj, struct osc_object, oo_cl); +} + +static inline struct cl_object *osc2cl(const struct osc_object *obj) +{ + return (struct cl_object *)&obj->oo_cl; +} + +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, + od_cl.cd_lu_dev); +} + +static inline struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + +static inline struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static inline struct osc_object *lu2osc(const struct lu_object *obj) +{ + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +static inline struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + + LINVRNT(oio == osc_env_io(env)); + return oio; +} + +static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode) 
+{ + LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); + if (mode == CLM_READ) + return LCK_PR; + if (mode == CLM_WRITE) + return LCK_PW; + return LCK_GROUP; +} + +static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) +{ + LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP); + if (mode == LCK_PR) + return CLM_READ; + if (mode == LCK_PW) + return CLM_WRITE; + return CLM_GROUP; +} + +static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) +{ + return container_of0(slice, struct osc_page, ops_cl); +} + +static inline struct osc_page *oap2osc(struct osc_async_page *oap) +{ + return container_of0(oap, struct osc_page, ops_oap); +} + +static inline pgoff_t osc_index(struct osc_page *opg) +{ + return opg->ops_cl.cpl_index; +} + +static inline struct cl_page *oap2cl_page(struct osc_async_page *oap) +{ + return oap2osc(oap)->ops_cl.cpl_page; +} + +static inline struct osc_page *oap2osc_page(struct osc_async_page *oap) +{ + return (struct osc_page *)container_of(oap, struct osc_page, ops_oap); +} + +static inline struct osc_page * +osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) +{ + const struct cl_page_slice *slice; + + LASSERT(osc != NULL); + slice = cl_object_page_slice(&osc->oo_cl, page); + return cl2osc_page(slice); +} + +static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) +{ + return container_of0(slice, struct osc_lock, ols_cl); +} + +static inline int osc_io_srvlock(struct osc_io *oio) +{ + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); +} + +enum osc_extent_state { + OES_INV = 0, /** extent is just initialized or destroyed */ + OES_ACTIVE = 1, /** process is using this extent */ + OES_CACHE = 2, /** extent is ready for IO */ + OES_LOCKING = 3, /** locking page to prepare IO */ + OES_LOCK_DONE = 4, /** locking finished, ready to send */ + OES_RPC = 5, /** in RPC */ + OES_TRUNC = 6, /** being truncated */ + OES_STATE_MAX +}; + +/** + * osc_extent data to manage dirty pages. + * osc_extent has the following attributes: + * 1. all pages in the same must be in one RPC in write back; + * 2. # of pages must be less than max_pages_per_rpc - implied by 1; + * 3. must be covered by only 1 osc_lock; + * 4. exclusive. It's impossible to have overlapped osc_extent. + * + * The lifetime of an extent is from when the 1st page is dirtied to when + * all pages inside it are written out. + * + * LOCKING ORDER + * ============= + * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock) + */ +struct osc_extent { + /** red-black tree node */ + struct rb_node oe_node; + /** osc_object of this extent */ + struct osc_object *oe_obj; + /** refcount, removed from red-black tree if reaches zero. */ + atomic_t oe_refc; + /** busy if non-zero */ + atomic_t oe_users; + /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ + struct list_head oe_link; + /** state of this extent */ + enum osc_extent_state oe_state; + /** flags for this extent. */ + unsigned int oe_intree:1, + /** 0 is write, 1 is read */ + oe_rw:1, + /** sync extent, queued by osc_queue_sync_pages() */ + oe_sync:1, + /** set if this extent has partial, sync pages. + * Extents with partial page(s) can't merge with others in RPC */ + oe_no_merge:1, + oe_srvlock:1, + oe_memalloc:1, + /** an ACTIVE extent is going to be truncated, so when this extent + * is released, it will turn into TRUNC state instead of CACHE. 
*/ + oe_trunc_pending:1, + /** this extent should be written asap and someone may wait for the + * write to finish. This bit is usually set along with urgent if + * the extent was CACHE state. + * fsync_wait extent can't be merged because new extent region may + * exceed fsync range. */ + oe_fsync_wait:1, + /** covering lock is being canceled */ + oe_hp:1, + /** this extent should be written back asap. set if one of pages is + * called by page WB daemon, or sync write or reading requests. */ + oe_urgent:1, + /** Non-delay RPC should be used for this extent. */ + oe_ndelay:1; + /** how many grants allocated for this extent. + * Grant allocated for this extent. There is no grant allocated + * for reading extents and sync write extents. */ + unsigned int oe_grants; + /** # of dirty pages in this extent */ + unsigned int oe_nr_pages; + /** list of pending oap pages. Pages in this list are NOT sorted. */ + struct list_head oe_pages; + /** Since an extent has to be written out in atomic, this is used to + * remember the next page need to be locked to write this extent out. + * Not used right now. + */ + struct osc_page *oe_next_page; + /** start and end index of this extent, include start and end + * themselves. Page offset here is the page index of osc_pages. + * oe_start is used as keyword for red-black tree. */ + pgoff_t oe_start; + pgoff_t oe_end; + /** maximum ending index of this extent, this is limited by + * max_pages_per_rpc, lock extent and chunk size. */ + pgoff_t oe_max_end; + /** waitqueue - for those who want to be notified if this extent's + * state has changed. */ + wait_queue_head_t oe_waitq; + /** lock covering this extent */ + struct ldlm_lock *oe_dlmlock; + /** terminator of this extent. Must be true if this extent is in IO. */ + struct task_struct *oe_owner; + /** return value of writeback. If somebody is waiting for this extent, + * this value can be known by outside world. */ + int oe_rc; + /** max pages per rpc when this extent was created */ + unsigned int oe_mppr; + /** FLR: layout version when this osc_extent is publised */ + __u32 oe_layout_version; +}; + +/** @} osc */ + +#endif /* LUSTRE_OSC_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h new file mode 100644 index 0000000000000..b6070871e555c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h @@ -0,0 +1,118 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. 
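A minimal standalone sketch of the release rule documented for oe_trunc_pending above: an ACTIVE extent normally drops back to OES_CACHE when its user releases it, but turns into OES_TRUNC instead when a truncate is pending. The enum mirrors the osc_extent_state values; the helper and main() are invented for illustration and are not Lustre code.

/* Standalone sketch (not part of the patch): the state an ACTIVE extent is
 * expected to take when its user releases it, per the oe_trunc_pending
 * comment above.  The helper name is invented for illustration only. */
#include <assert.h>
#include <stdbool.h>

enum osc_extent_state_demo {
	OES_INV = 0, OES_ACTIVE, OES_CACHE, OES_LOCKING,
	OES_LOCK_DONE, OES_RPC, OES_TRUNC, OES_STATE_MAX
};

static enum osc_extent_state_demo
demo_state_on_release(bool trunc_pending)
{
	/* A released extent normally becomes CACHE (ready for IO); if a
	 * truncate is pending it becomes TRUNC instead. */
	return trunc_pending ? OES_TRUNC : OES_CACHE;
}

int main(void)
{
	assert(demo_state_on_release(false) == OES_CACHE);
	assert(demo_state_on_release(true) == OES_TRUNC);
	return 0;
}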
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LUSTRE_PATCHLESS_COMPAT_H +#define LUSTRE_PATCHLESS_COMPAT_H + +#include +#include +#ifndef HAVE_TRUNCATE_COMPLETE_PAGE +#include +#include + +#ifndef HAVE_DELETE_FROM_PAGE_CACHE /* 2.6.39 */ +#ifndef HAVE_REMOVE_FROM_PAGE_CACHE /* 2.6.35 - 2.6.38 */ + +/* XXX copy & paste from 2.6.15 kernel */ +static inline void ll_remove_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_delete(&mapping->page_tree, page->index); + page->mapping = NULL; + mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + + spin_unlock_irq(&mapping->tree_lock); +} +#else /* HAVE_REMOVE_FROM_PAGE_CACHE */ +#define ll_remove_from_page_cache(page) remove_from_page_cache(page) +#endif /* !HAVE_REMOVE_FROM_PAGE_CACHE */ + +static inline void ll_delete_from_page_cache(struct page *page) +{ + ll_remove_from_page_cache(page); + put_page(page); +} +#else /* HAVE_DELETE_FROM_PAGE_CACHE */ +#define ll_delete_from_page_cache(page) delete_from_page_cache(page) +#endif /* !HAVE_DELETE_FROM_PAGE_CACHE */ + +static inline void +ll_cancel_dirty_page(struct address_space *mapping, struct page *page) +{ +#ifdef HAVE_NEW_CANCEL_DIRTY_PAGE + cancel_dirty_page(page); +#elif defined(HAVE_CANCEL_DIRTY_PAGE) + cancel_dirty_page(page, PAGE_SIZE); +#else + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); +#endif /* HAVE_NEW_CANCEL_DIRTY_PAGE */ +} + +static inline void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) +#ifdef HAVE_INVALIDATE_RANGE + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); +#else + page->mapping->a_ops->invalidatepage(page, 0); +#endif + + ll_cancel_dirty_page(mapping, page); + ClearPageMappedToDisk(page); + ll_delete_from_page_cache(page); +} +#endif /* !HAVE_TRUNCATE_COMPLETE_PAGE */ + +#ifdef HAVE_DCACHE_LOCK +# define dget_dlock(d) dget_locked(d) +# define ll_d_count(d) atomic_read(&(d)->d_count) +#elif defined(HAVE_D_COUNT) +# define ll_d_count(d) d_count(d) +#else +# define ll_d_count(d) ((d)->d_count) +#endif /* HAVE_DCACHE_LOCK */ + +#ifndef HAVE_IN_COMPAT_SYSCALL +#define in_compat_syscall is_compat_task +#endif + +#endif /* LUSTRE_PATCHLESS_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h new file mode 100644 index 0000000000000..17ff2da6240ca --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h @@ -0,0 +1,247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
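The lustre_patchless_compat.h wrappers above (ll_remove_from_page_cache(), ll_delete_from_page_cache(), ll_cancel_dirty_page()) all follow the same pattern: a configure-time HAVE_* macro decides which kernel API a single wrapper name compiles down to. Below is a tiny user-space sketch of that pattern, with a hypothetical HAVE_NEW_GREETING macro standing in for the real feature tests; it is not Lustre or kernel code.

/* Illustrative sketch only: the "feature macro selects the backend" pattern
 * used by the compat wrappers above, in plain user-space C.
 * HAVE_NEW_GREETING is a hypothetical configure-time macro. */
#include <stdio.h>

#ifdef HAVE_NEW_GREETING
static void compat_greet(const char *who)
{
	printf("hello, %s (new API)\n", who);
}
#else
static void compat_greet(const char *who)
{
	printf("hello, %s (fallback)\n", who);
}
#endif

int main(void)
{
	/* Callers always use compat_greet(); which body gets compiled in
	 * depends on what the configure step detected. */
	compat_greet("lustre");
	return 0;
}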
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + * Use is subject to license terms. + */ + +#ifndef _LUSTRE_QUOTA_H +#define _LUSTRE_QUOTA_H + +/** \defgroup quota quota + * + */ + +#include +#include +#include +#include +#include +#include + +#ifndef MAX_IQ_TIME +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +#ifndef MAX_DQ_TIME +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +struct lquota_id_info; +struct lquota_trans; + +/* Gather all quota record type in an union that can be used to read any records + * from disk. All fields of these records must be 64-bit aligned, otherwise the + * OSD layer may swab them incorrectly. */ +union lquota_rec { + struct lquota_glb_rec lqr_glb_rec; + struct lquota_slv_rec lqr_slv_rec; + struct lquota_acct_rec lqr_acct_rec; +}; + +/* flags for inode/block quota accounting */ +enum osd_qid_declare_flags { + OSD_QID_INODE = 1 << 0, + OSD_QID_BLK = 1 << 1, + OSD_QID_FORCE = 1 << 2, +}; + +/* Index features supported by the global index objects + * Only used for migration purpose and should be removed once on-disk migration + * is no longer needed */ +extern struct dt_index_features dt_quota_iusr_features; +extern struct dt_index_features dt_quota_busr_features; +extern struct dt_index_features dt_quota_igrp_features; +extern struct dt_index_features dt_quota_bgrp_features; + +/* Name used in the configuration logs to identify the default metadata pool + * (composed of all the MDTs, with pool ID 0) and the default data pool (all + * the OSTs, with pool ID 0 too). */ +#define QUOTA_METAPOOL_NAME "mdt=" +#define QUOTA_DATAPOOL_NAME "ost=" + +/* + * Quota Master Target support + */ + +/* Request handlers for quota master operations. + * This is used by the MDT to pass quota/lock requests to the quota master + * target. This won't be needed any more once the QMT is a real target and + * does not rely any more on the MDT service threads and namespace. */ +struct qmt_handlers { + /* Handle quotactl request from client. */ + int (*qmth_quotactl)(const struct lu_env *, struct lu_device *, + struct obd_quotactl *); + + /* Handle dqacq/dqrel request from slave. 
*/ + int (*qmth_dqacq)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *); + + /* LDLM intent policy associated with quota locks */ + int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *, struct ldlm_lock **, + int); + + /* Initialize LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *); + + /* Update LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *, + struct ptlrpc_request *, int); + + /* Return size of LVB to be packed in ldlm message */ + int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *); + + /* Fill request buffer with lvb */ + int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *, + int); + + /* Free lvb associated with ldlm resource */ + int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *); +}; + +/* actual handlers are defined in lustre/quota/qmt_handler.c */ +extern struct qmt_handlers qmt_hdls; + +/* + * Quota enforcement support on slaves + */ + +struct qsd_instance; + +/* The quota slave feature is implemented under the form of a library. + * The API is the following: + * + * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd + * instance via qsd_init(). This creates all required structures + * to manage quota enforcement for this target and performs all + * low-level initialization which does not involve any lustre + * object. qsd_init() should typically be called when the OSD + * is being set up. + * + * - qsd_prepare(): This sets up on-disk objects associated with the quota slave + * feature and initiates the quota reintegration procedure if + * needed. qsd_prepare() should typically be called when + * ->ldo_prepare is invoked. + * + * - qsd_start(): a qsd instance should be started once recovery is completed + * (i.e. when ->ldo_recovery_complete is called). This is used + * to notify the qsd layer that quota should now be enforced + * again via the qsd_op_begin/end functions. The last step of the + * reintegration prodecure (namely usage reconciliation) will be + * completed during start. + * + * - qsd_fini(): is used to release a qsd_instance structure allocated with + * qsd_init(). This releases all quota slave objects and frees the + * structures associated with the qsd_instance. + * + * - qsd_op_begin(): is used to enforce quota, it must be called in the + * declaration of each operation. qsd_op_end() should then be + * invoked later once all operations have been completed in + * order to release/adjust the quota space. + * Running qsd_op_begin() before qsd_start() isn't fatal and + * will return success. + * Once qsd_start() has been run, qsd_op_begin() will block + * until the reintegration procedure is completed. + * + * - qsd_op_end(): performs the post operation quota processing. This must be + * called after the operation transaction stopped. + * While qsd_op_begin() must be invoked each time a new + * operation is declared, qsd_op_end() should be called only + * once for the whole transaction. + * + * - qsd_op_adjust(): triggers pre-acquire/release if necessary. + * + * Below are the function prototypes to be used by OSD layer to manage quota + * enforcement. Arguments are documented where each function is defined. 
*/ + +/* flags for quota local enforcement */ +enum osd_quota_local_flags { + QUOTA_FL_OVER_USRQUOTA = 1 << 0, + QUOTA_FL_OVER_GRPQUOTA = 1 << 1, + QUOTA_FL_SYNC = 1 << 2, + QUOTA_FL_OVER_PRJQUOTA = 1 << 3, +}; + +struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, + struct proc_dir_entry *, bool is_md); +int qsd_prepare(const struct lu_env *, struct qsd_instance *); +int qsd_start(const struct lu_env *, struct qsd_instance *); +void qsd_fini(const struct lu_env *, struct qsd_instance *); +int qsd_op_begin(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *, struct lquota_id_info *, + enum osd_quota_local_flags *); +void qsd_op_end(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *); +void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, + union lquota_id *, int); + +/* + * Quota information attached to a transaction + */ + +struct lquota_entry; + +struct lquota_id_info { + /* quota identifier */ + union lquota_id lqi_id; + + /* USRQUOTA or GRPQUOTA for now, could be expanded for + * directory quota or other types later. */ + int lqi_type; + + /* inodes or kbytes to be consumed or released, it could + * be negative when releasing space. */ + long long lqi_space; + + /* quota slave entry structure associated with this ID */ + struct lquota_entry *lqi_qentry; + + /* whether we are reporting blocks or inodes */ + bool lqi_is_blk; +}; + +/* With the DoM, both inode quota in meta pool and block quota in data pool + * will be enforced at MDT, there are at most 4 quota ids being enforced in + * a single transaction for inode and block quota, which is chown transaction: + * original uid and gid, new uid and gid. + * + * This value might need to be revised when directory quota is added. */ +#define QUOTA_MAX_TRANSIDS 8 + +/* all qids involved in a single transaction */ +struct lquota_trans { + unsigned short lqt_id_cnt; + struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; +}; + +#define IS_LQUOTA_RES(res) \ + (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ + res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) + +/* helper function used by MDT & OFD to retrieve quota accounting information + * on slave */ +int lquotactl_slv(const struct lu_env *, struct dt_device *, + struct obd_quotactl *); +/** @} quota */ +#endif /* _LUSTRE_QUOTA_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h new file mode 100644 index 0000000000000..8b2c9240660a5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h @@ -0,0 +1,347 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
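The block comment in lustre_quota.h above spells out the quota-slave library lifecycle. The sketch below only strings together the qsd_* prototypes declared there, in that order: qsd_init() at OSD setup, qsd_prepare() at ->ldo_prepare, qsd_start() once recovery completes, qsd_op_begin()/qsd_op_end() around each operation, qsd_fini() at teardown. It is a call-order illustration, not a compilable OSD integration: error paths are trimmed, the device name is invented, and qsd_init() is assumed to return an ERR_PTR() on failure.

/* Call-order sketch only, based on the lustre_quota.h comment above. */
static int osd_quota_lifecycle_sketch(const struct lu_env *env,
				      struct dt_device *dev,
				      struct proc_dir_entry *proc,
				      struct lquota_trans *trans,
				      struct lquota_id_info *qi)
{
	enum osd_quota_local_flags local_flags = 0;
	char svname[] = "sketch-osd";	/* invented name */
	struct qsd_instance *qsd;
	int rc;

	/* 1. Allocate the slave instance while the OSD is being set up
	 *    (qsd_init() is assumed to return an ERR_PTR() on failure). */
	qsd = qsd_init(env, svname, dev, proc, true /* is_md */);
	if (IS_ERR(qsd))
		return PTR_ERR(qsd);

	/* 2. Create the on-disk objects and kick off reintegration. */
	rc = qsd_prepare(env, qsd);

	/* 3. Once recovery completes, start enforcing quota again. */
	if (rc == 0)
		rc = qsd_start(env, qsd);

	/* 4. Around each operation: declare the IDs in qsd_op_begin(), then
	 *    release/adjust space in qsd_op_end() after the transaction has
	 *    stopped. */
	if (rc == 0) {
		rc = qsd_op_begin(env, qsd, trans, qi, &local_flags);
		/* ... run the declared operation, stop the transaction ... */
		qsd_op_end(env, qsd, trans);
	}

	/* 5. Release the instance and the structures it allocated. */
	qsd_fini(env, qsd);
	return rc;
}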
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_req_layout.h + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ + +#ifndef _LUSTRE_REQ_LAYOUT_H__ +#define _LUSTRE_REQ_LAYOUT_H__ + +#include + +/** \defgroup req_layout req_layout + * + * @{ + */ + +struct req_msg_field; +struct req_format; +struct req_capsule; + +struct ptlrpc_request; + +enum req_location { + RCL_CLIENT, + RCL_SERVER, + RCL_NR +}; + +/* Maximal number of fields (buffers) in a request message. */ +#define REQ_MAX_FIELD_NR 11 + +struct req_capsule { + struct ptlrpc_request *rc_req; + const struct req_format *rc_fmt; + enum req_location rc_loc; + __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR]; +}; + +void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, + enum req_location location); +void req_capsule_fini(struct req_capsule *pill); + +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); +void req_capsule_client_dump(struct req_capsule *pill); +void req_capsule_server_dump(struct req_capsule *pill); +void req_capsule_init_area(struct req_capsule *pill); +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc); +int req_capsule_server_pack(struct req_capsule *pill); + +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len); +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len); +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len, void *swabber); +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field); + +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, __u32 size); +__u32 req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +__u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); +__u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc); +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); + +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen, + enum req_location loc); +int req_capsule_server_grow(struct req_capsule *pill, + 
const struct req_msg_field *field, + __u32 newlen); +int req_layout_init(void); +void req_layout_fini(void); +int req_check_sepol(struct req_capsule *pill); + +extern struct req_format RQF_OBD_PING; +extern struct req_format RQF_OBD_SET_INFO; +extern struct req_format RQF_SEC_CTX; +extern struct req_format RQF_OBD_IDX_READ; +/* MGS req_format */ +extern struct req_format RQF_MGS_TARGET_REG; +extern struct req_format RQF_MGS_SET_INFO; +extern struct req_format RQF_MGS_CONFIG_READ; +/* fid/fld req_format */ +extern struct req_format RQF_SEQ_QUERY; +extern struct req_format RQF_FLD_QUERY; +extern struct req_format RQF_FLD_READ; +/* MDS req_format */ +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_STATFS_NEW; +extern struct req_format RQF_MDS_GET_ROOT; +extern struct req_format RQF_MDS_SYNC; +extern struct req_format RQF_MDS_GETXATTR; +extern struct req_format RQF_MDS_GETATTR; +extern struct req_format RQF_OUT_UPDATE; + +/* + * This is format of direct (non-intent) MDS_GETATTR_NAME request. + */ +extern struct req_format RQF_MDS_GETATTR_NAME; +extern struct req_format RQF_MDS_CLOSE; +extern struct req_format RQF_MDS_CLOSE_INTENT; +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_GET_INFO; +extern struct req_format RQF_MDS_READPAGE; +extern struct req_format RQF_MDS_REINT; +extern struct req_format RQF_MDS_REINT_CREATE; +extern struct req_format RQF_MDS_REINT_CREATE_ACL; +extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; +extern struct req_format RQF_MDS_REINT_CREATE_SYM; +extern struct req_format RQF_MDS_REINT_OPEN; +extern struct req_format RQF_MDS_REINT_UNLINK; +extern struct req_format RQF_MDS_REINT_LINK; +extern struct req_format RQF_MDS_REINT_RENAME; +extern struct req_format RQF_MDS_REINT_SETATTR; +extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_QUOTACTL; +extern struct req_format RQF_QUOTA_DQACQ; +extern struct req_format RQF_MDS_SWAP_LAYOUTS; +extern struct req_format RQF_MDS_REINT_MIGRATE; +extern struct req_format RQF_MDS_REINT_RESYNC; +extern struct req_format RQF_MDS_RMFID; +/* MDS hsm formats */ +extern struct req_format RQF_MDS_HSM_STATE_GET; +extern struct req_format RQF_MDS_HSM_STATE_SET; +extern struct req_format RQF_MDS_HSM_ACTION; +extern struct req_format RQF_MDS_HSM_PROGRESS; +extern struct req_format RQF_MDS_HSM_CT_REGISTER; +extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; +extern struct req_format RQF_MDS_HSM_REQUEST; +/* OST req_format */ +extern struct req_format RQF_OST_CONNECT; +extern struct req_format RQF_OST_DISCONNECT; +extern struct req_format RQF_OST_QUOTACTL; +extern struct req_format RQF_OST_GETATTR; +extern struct req_format RQF_OST_SETATTR; +extern struct req_format RQF_OST_CREATE; +extern struct req_format RQF_OST_PUNCH; +extern struct req_format RQF_OST_SYNC; +extern struct req_format RQF_OST_DESTROY; +extern struct req_format RQF_OST_BRW_READ; +extern struct req_format RQF_OST_BRW_WRITE; +extern struct req_format RQF_OST_STATFS; +extern struct req_format RQF_OST_SET_GRANT_INFO; +extern struct req_format RQF_OST_GET_INFO; +extern struct req_format RQF_OST_GET_INFO_LAST_ID; +extern struct req_format RQF_OST_GET_INFO_LAST_FID; +extern struct req_format RQF_OST_SET_INFO_LAST_FID; +extern struct req_format RQF_OST_GET_INFO_FIEMAP; +extern struct req_format RQF_OST_LADVISE; + +/* LDLM req_format */ +extern struct req_format 
RQF_LDLM_ENQUEUE; +extern struct req_format RQF_LDLM_ENQUEUE_LVB; +extern struct req_format RQF_LDLM_CONVERT; +extern struct req_format RQF_LDLM_INTENT; +extern struct req_format RQF_LDLM_INTENT_BASIC; +extern struct req_format RQF_LDLM_INTENT_LAYOUT; +extern struct req_format RQF_LDLM_INTENT_GETATTR; +extern struct req_format RQF_LDLM_INTENT_OPEN; +extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_GETXATTR; +extern struct req_format RQF_LDLM_INTENT_QUOTA; +extern struct req_format RQF_LDLM_CANCEL; +extern struct req_format RQF_LDLM_CALLBACK; +extern struct req_format RQF_LDLM_CP_CALLBACK; +extern struct req_format RQF_LDLM_BL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK_DESC; +/* LOG req_format */ +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; + +extern struct req_format RQF_CONNECT; + +/* LFSCK req_format */ +extern struct req_format RQF_LFSCK_NOTIFY; +extern struct req_format RQF_LFSCK_QUERY; + +extern struct req_msg_field RMF_GENERIC_DATA; +extern struct req_msg_field RMF_PTLRPC_BODY; +extern struct req_msg_field RMF_MDT_BODY; +extern struct req_msg_field RMF_MDT_EPOCH; +extern struct req_msg_field RMF_OBD_STATFS; +extern struct req_msg_field RMF_NAME; +extern struct req_msg_field RMF_SYMTGT; +extern struct req_msg_field RMF_TGTUUID; +extern struct req_msg_field RMF_CLUUID; +extern struct req_msg_field RMF_SETINFO_VAL; +extern struct req_msg_field RMF_SETINFO_KEY; +extern struct req_msg_field RMF_GETINFO_VAL; +extern struct req_msg_field RMF_GETINFO_VALLEN; +extern struct req_msg_field RMF_GETINFO_KEY; +extern struct req_msg_field RMF_IDX_INFO; +extern struct req_msg_field RMF_CLOSE_DATA; +extern struct req_msg_field RMF_FILE_SECCTX_NAME; +extern struct req_msg_field RMF_FILE_SECCTX; +extern struct req_msg_field RMF_FID_ARRAY; + +/* + * connection handle received in MDS_CONNECT request. 
+ */ +extern struct req_msg_field RMF_CONN; +extern struct req_msg_field RMF_CONNECT_DATA; +extern struct req_msg_field RMF_DLM_REQ; +extern struct req_msg_field RMF_DLM_REP; +extern struct req_msg_field RMF_DLM_LVB; +extern struct req_msg_field RMF_DLM_GL_DESC; +extern struct req_msg_field RMF_LDLM_INTENT; +extern struct req_msg_field RMF_LAYOUT_INTENT; +extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_DEFAULT_MDT_MD; +extern struct req_msg_field RMF_REC_REINT; +extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_EAVALS; +extern struct req_msg_field RMF_EAVALS_LENS; +extern struct req_msg_field RMF_ACL; +extern struct req_msg_field RMF_LOGCOOKIES; +extern struct req_msg_field RMF_CAPA1; +extern struct req_msg_field RMF_CAPA2; +extern struct req_msg_field RMF_OBD_QUOTACHECK; +extern struct req_msg_field RMF_OBD_QUOTACTL; +extern struct req_msg_field RMF_QUOTA_BODY; +extern struct req_msg_field RMF_STRING; +extern struct req_msg_field RMF_SWAP_LAYOUTS; +extern struct req_msg_field RMF_MDS_HSM_PROGRESS; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; +extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; +extern struct req_msg_field RMF_HSM_USER_STATE; +extern struct req_msg_field RMF_HSM_STATE_SET; +extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_SELINUX_POL; + +/* seq-mgr fields */ +extern struct req_msg_field RMF_SEQ_OPC; +extern struct req_msg_field RMF_SEQ_RANGE; +extern struct req_msg_field RMF_FID_SPACE; + +/* FLD fields */ +extern struct req_msg_field RMF_FLD_OPC; +extern struct req_msg_field RMF_FLD_MDFLD; + +extern struct req_msg_field RMF_LLOGD_BODY; +extern struct req_msg_field RMF_LLOG_LOG_HDR; +extern struct req_msg_field RMF_LLOGD_CONN_BODY; + +extern struct req_msg_field RMF_MGS_TARGET_INFO; +extern struct req_msg_field RMF_MGS_SEND_PARAM; + +extern struct req_msg_field RMF_OST_BODY; +extern struct req_msg_field RMF_OBD_IOOBJ; +extern struct req_msg_field RMF_OBD_ID; +extern struct req_msg_field RMF_FID; +extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_NIOBUF_INLINE; +extern struct req_msg_field RMF_RCS; +extern struct req_msg_field RMF_FIEMAP_KEY; +extern struct req_msg_field RMF_FIEMAP_VAL; +extern struct req_msg_field RMF_OST_ID; +extern struct req_msg_field RMF_SHORT_IO; + +/* MGS config read message format */ +extern struct req_msg_field RMF_MGS_CONFIG_BODY; +extern struct req_msg_field RMF_MGS_CONFIG_RES; + +/* generic uint32 */ +extern struct req_msg_field RMF_U32; + +/* OBJ update format */ +extern struct req_msg_field RMF_OUT_UPDATE; +extern struct req_msg_field RMF_OUT_UPDATE_REPLY; +extern struct req_msg_field RMF_OUT_UPDATE_HEADER; +extern struct req_msg_field RMF_OUT_UPDATE_BUF; + +/* LFSCK format */ +extern struct req_msg_field RMF_LFSCK_REQUEST; +extern struct req_msg_field RMF_LFSCK_REPLY; + +extern struct req_msg_field RMF_OST_LADVISE_HDR; +extern struct req_msg_field RMF_OST_LADVISE; +/** @} req_layout */ + +#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h new file mode 100644 index 0000000000000..3eba040fac690 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h @@ -0,0 +1,375 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
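A hedged sketch of how a server-side handler typically drives the req_capsule API declared above: bind the capsule to a request format, read the client fields, pack the reply, then fill the reply fields. RQF_MDS_GETATTR and RMF_MDT_BODY are declared in this header; struct mdt_body is defined elsewhere in the tree, and a real handler would also set size hints, handle swabbing and do actual attribute work.

/* Hedged sketch of typical server-side req_capsule usage, built only from
 * the declarations above; not a complete handler. */
static int mdt_getattr_capsule_sketch(struct req_capsule *pill)
{
	struct mdt_body *reqbody;
	struct mdt_body *repbody;
	int rc;

	/* Bind the capsule to the expected message layout. */
	req_capsule_set(pill, &RQF_MDS_GETATTR);

	/* Fetch a field of the incoming (client) message. */
	reqbody = req_capsule_client_get(pill, &RMF_MDT_BODY);
	if (reqbody == NULL)
		return -EPROTO;

	/* Allocate and pack the reply buffers described by the format... */
	rc = req_capsule_server_pack(pill);
	if (rc != 0)
		return rc;

	/* ...then fill the corresponding reply field. */
	repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
	if (repbody == NULL)
		return -EPROTO;

	/* ... copy the requested attributes into *repbody ... */
	return 0;
}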
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/include/lustre_scrub.h + * + * Shared definitions and declarations for Lustre OI scrub. + * + * Author: Fan Yong + */ + +#ifndef _LUSTRE_SCRUB_H +# define _LUSTRE_SCRUB_H + +#include +#include + +#define OSD_OI_FID_OID_BITS_MAX 10 +#define OSD_OI_FID_NR_MAX (1UL << OSD_OI_FID_OID_BITS_MAX) +#define SCRUB_OI_BITMAP_SIZE (OSD_OI_FID_NR_MAX >> 3) +#define PFID_STRIPE_IDX_BITS 16 +#define PFID_STRIPE_COUNT_MASK ((1 << PFID_STRIPE_IDX_BITS) - 1) + +#define SCRUB_MAGIC_V1 0x4C5FD252 +#define SCRUB_CHECKPOINT_INTERVAL 60 +#define SCRUB_WINDOW_SIZE 1024 + +enum scrub_next_status { + /* exit current loop and process next group */ + SCRUB_NEXT_BREAK = 1, + + /* skip current object and process next bit */ + SCRUB_NEXT_CONTINUE = 2, + + /* exit all the loops */ + SCRUB_NEXT_EXIT = 3, + + /* wait for free cache slot */ + SCRUB_NEXT_WAIT = 4, + + /* simulate system crash during OI scrub */ + SCRUB_NEXT_CRASH = 5, + + /* simulate failure during OI scrub */ + SCRUB_NEXT_FATAL = 6, + + /* new created object, no scrub on it */ + SCRUB_NEXT_NOSCRUB = 7, + + /* the object has no FID-in-LMA */ + SCRUB_NEXT_NOLMA = 8, + + /* for OST-object */ + SCRUB_NEXT_OSTOBJ = 9, + + /* old OST-object, no LMA or no FID-on-OST flags in LMA */ + SCRUB_NEXT_OSTOBJ_OLD = 10, +}; + +enum scrub_local_file_flags { + SLFF_SCAN_SUBITEMS = 0x0001, + SLFF_HIDE_FID = 0x0002, + SLFF_SHOW_NAME = 0x0004, + SLFF_NO_OI = 0x0008, + SLFF_IDX_IN_FID = 0x0010, +}; + +enum scrub_status { + /* The scrub file is new created, for new MDT, upgrading from old disk, + * or re-creating the scrub file manually. */ + SS_INIT = 0, + + /* The scrub is checking/repairing the OI files. */ + SS_SCANNING = 1, + + /* The scrub checked/repaired the OI files successfully. */ + SS_COMPLETED = 2, + + /* The scrub failed to check/repair the OI files. */ + SS_FAILED = 3, + + /* The scrub is stopped manually, the OI files may be inconsistent. */ + SS_STOPPED = 4, + + /* The scrub is paused automatically when umount. */ + SS_PAUSED = 5, + + /* The scrub crashed during the scanning, should be restarted. */ + SS_CRASHED = 6, +}; + +enum scrub_flags { + /* OI files have been recreated, OI mappings should be re-inserted. */ + SF_RECREATED = 0x0000000000000001ULL, + + /* OI files are invalid, should be rebuild ASAP */ + SF_INCONSISTENT = 0x0000000000000002ULL, + + /* OI scrub is triggered automatically. */ + SF_AUTO = 0x0000000000000004ULL, + + /* The device is upgraded from 1.8 format. */ + SF_UPGRADE = 0x0000000000000008ULL, +}; + +enum scrub_param { + /* Exit when fail. */ + SP_FAILOUT = 0x0001, + + /* Check only without repairing. */ + SP_DRYRUN = 0x0002, +}; + +enum scrub_start { + /* Set failout flag. */ + SS_SET_FAILOUT = 0x00000001, + + /* Clear failout flag. 
*/ + SS_CLEAR_FAILOUT = 0x00000002, + + /* Reset scrub start position. */ + SS_RESET = 0x00000004, + + /* Trigger full scrub automatically. */ + SS_AUTO_FULL = 0x00000008, + + /* Trigger partial scrub automatically. */ + SS_AUTO_PARTIAL = 0x00000010, + + /* Set dryrun flag. */ + SS_SET_DRYRUN = 0x00000020, + + /* Clear dryrun flag. */ + SS_CLEAR_DRYRUN = 0x00000040, +}; + +enum osd_lf_flags { + OLF_SCAN_SUBITEMS = 0x0001, + OLF_HIDE_FID = 0x0002, + OLF_SHOW_NAME = 0x0004, + OLF_NO_OI = 0x0008, + OLF_IDX_IN_FID = 0x0010, + OLF_NOT_BACKUP = 0x0020, +}; + +/* There are some overhead to detect OI inconsistency automatically + * during normal RPC handling. We do not want to always auto detect + * OI inconsistency especailly when OI scrub just done recently. + * + * The 'auto_scrub' defines the time (united as second) interval to + * enable auto detect OI inconsistency since last OI scurb done. */ +enum auto_scrub { + /* Disable auto scrub. */ + AS_NEVER = 0, + + /* 1 second is too short interval, it is almost equal to always auto + * detect inconsistent OI, usually used for test. */ + AS_ALWAYS = 1, + + /* Enable auto detect OI inconsistency one month (60 * 60 * 24 * 30) + * after last OI scrub. */ + AS_DEFAULT = 2592000LL, +}; + +struct scrub_file { + /* 128-bit uuid for volume. */ + __u8 sf_uuid[16]; + + /* See 'enum scrub_flags'. */ + __u64 sf_flags; + + /* The scrub magic. */ + __u32 sf_magic; + + /* See 'enum scrub_status'. */ + __u16 sf_status; + + /* See 'enum scrub_param'. */ + __u16 sf_param; + + /* The time for the last OI scrub completed. */ + time64_t sf_time_last_complete; + + /* The ttime for the latest OI scrub ran. */ + time64_t sf_time_latest_start; + + /* The time for the last OI scrub checkpoint. */ + time64_t sf_time_last_checkpoint; + + /* The position for the latest OI scrub started from. */ + __u64 sf_pos_latest_start; + + /* The position for the last OI scrub checkpoint. */ + __u64 sf_pos_last_checkpoint; + + /* The position for the first should be updated object. */ + __u64 sf_pos_first_inconsistent; + + /* How many objects have been checked. */ + __u64 sf_items_checked; + + /* How many objects have been updated. */ + __u64 sf_items_updated; + + /* How many objects failed to be processed. */ + __u64 sf_items_failed; + + /* How many prior objects have been updated during scanning. */ + __u64 sf_items_updated_prior; + + /* How many objects marked as LDISKFS_STATE_LUSTRE_NOSCRUB. */ + __u64 sf_items_noscrub; + + /* How many IGIF objects. */ + __u64 sf_items_igif; + + /* How long the OI scrub has run in seconds. Do NOT change + * to time64_t since this breaks backwards compatibility. + * It shouldn't take more than 136 years to complete :-) + */ + time_t sf_run_time; + + /* How many completed OI scrub ran on the device. */ + __u32 sf_success_count; + + /* How many OI files. */ + __u16 sf_oi_count; + + /* Keep the flags after scrub reset. See 'enum scrub_internal_flags' */ + __u16 sf_internal_flags; + + __u32 sf_reserved_1; + __u64 sf_reserved_2[16]; + + /* Bitmap for OI files recreated case. */ + __u8 sf_oi_bitmap[SCRUB_OI_BITMAP_SIZE]; +}; + +struct lustre_scrub { + /* Object for the scrub file. */ + struct dt_object *os_obj; + + struct ptlrpc_thread os_thread; + struct list_head os_inconsistent_items; + + /* write lock for scrub prep/update/post/checkpoint, + * read lock for scrub dump. */ + struct rw_semaphore os_rwsem; + spinlock_t os_lock; + + /* Scrub file in memory. */ + struct scrub_file os_file; + + /* Buffer for scrub file load/store. 
*/ + struct scrub_file os_file_disk; + + const char *os_name; + + /* The time for last checkpoint, seconds */ + time64_t os_time_last_checkpoint; + + /* The time for next checkpoint, seconds */ + time64_t os_time_next_checkpoint; + + /* How many objects have been checked since last checkpoint. */ + __u64 os_new_checked; + __u64 os_pos_current; + __u32 os_start_flags; + unsigned int os_in_prior:1, /* process inconsistent item + * found by RPC prior */ + os_waiting:1, /* Waiting for scan window. */ + os_full_speed:1, /* run w/o speed limit */ + os_paused:1, /* The scrub is paused. */ + os_convert_igif:1, + os_partial_scan:1, + os_in_join:1, + os_full_scrub:1; +}; + +#define INDEX_BACKUP_MAGIC_V1 0x1E41F208 +#define INDEX_BACKUP_BUFSIZE (4096 * 4) + +enum lustre_index_backup_policy { + /* By default, do not backup the index */ + LIBP_NONE = 0, + + /* Backup the dirty index objects when umount */ + LIBP_AUTO = 1, +}; + +struct lustre_index_backup_header { + __u32 libh_magic; + __u32 libh_count; + __u32 libh_keysize; + __u32 libh_recsize; + struct lu_fid libh_owner; + __u64 libh_pad[60]; /* keep header 512 bytes aligned */ +}; + +struct lustre_index_backup_unit { + struct list_head libu_link; + struct lu_fid libu_fid; + __u32 libu_keysize; + __u32 libu_recsize; +}; + +struct lustre_index_restore_unit { + struct list_head liru_link; + struct lu_fid liru_pfid; + struct lu_fid liru_cfid; + __u64 liru_clid; + int liru_len; + char liru_name[0]; +}; + +void scrub_file_init(struct lustre_scrub *scrub, __u8 *uuid); +void scrub_file_reset(struct lustre_scrub *scrub, __u8 *uuid, __u64 flags); +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags); +void scrub_stop(struct lustre_scrub *scrub); +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize); + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup); +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize); + +static inline void lustre_fid2lbx(char *buf, const struct lu_fid *fid, int len) +{ + snprintf(buf, len, DFID_NOBRACE".lbx", PFID(fid)); +} + +static inline const char *osd_scrub2name(struct lustre_scrub *scrub) +{ + return scrub->os_name; +} +#endif /* _LUSTRE_SCRUB_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h new file mode 100644 index 0000000000000..6a69d01150aa1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h @@ -0,0 +1,1211 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
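scrub_start() above takes a bitmask of 'enum scrub_start' flags. Here is a short sketch of a caller combining them to request a restarted, full scan that stops on the first failure; the thread function and its body are invented placeholders for an OSD's real scrub main loop.

/* Hedged sketch: combining 'enum scrub_start' flags for scrub_start() as
 * declared above.  demo_scrub_main() is an invented placeholder. */
static int demo_scrub_main(void *data)
{
	struct lustre_scrub *scrub = data;

	/* ... iterate the device, calling scrub_checkpoint() roughly every
	 * SCRUB_CHECKPOINT_INTERVAL seconds and scrub_file_store() to
	 * persist progress ... */
	(void)scrub;
	return 0;
}

static int demo_trigger_full_scrub(struct lustre_scrub *scrub)
{
	/* Restart from the beginning, scan everything, fail out on error. */
	__u32 flags = SS_RESET | SS_AUTO_FULL | SS_SET_FAILOUT;

	return scrub_start(demo_scrub_main, scrub, scrub, flags);
}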
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_SEC_H_ +#define _LUSTRE_SEC_H_ + +/** \defgroup sptlrpc sptlrpc + * + * @{ + */ + +/* + * to avoid include + */ +struct obd_import; +struct obd_export; +struct ptlrpc_request; +struct ptlrpc_reply_state; +struct ptlrpc_bulk_desc; +struct brw_page; +struct lu_env; +/* Linux specific */ +struct key; +struct seq_file; +struct lustre_cfg; + +/* + * forward declaration + */ +struct ptlrpc_sec_policy; +struct ptlrpc_sec_cops; +struct ptlrpc_sec_sops; +struct ptlrpc_sec; +struct ptlrpc_svc_ctx; +struct ptlrpc_cli_ctx; +struct ptlrpc_ctx_ops; +struct req_msg_field; + +/** + * \addtogroup flavor flavor + * + * RPC flavor is represented by a 32 bits integer. Currently the high 12 bits + * are unused, must be set to 0 for future expansion. + *
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * 
+ * + * @{ + */ + +/* + * flavor constants + */ +enum sptlrpc_policy { + SPTLRPC_POLICY_NULL = 0, + SPTLRPC_POLICY_PLAIN = 1, + SPTLRPC_POLICY_GSS = 2, + SPTLRPC_POLICY_MAX, +}; + +enum sptlrpc_mech_null { + SPTLRPC_MECH_NULL = 0, + SPTLRPC_MECH_NULL_MAX, +}; + +enum sptlrpc_mech_plain { + SPTLRPC_MECH_PLAIN = 0, + SPTLRPC_MECH_PLAIN_MAX, +}; + +enum sptlrpc_mech_gss { + SPTLRPC_MECH_GSS_NULL = 0, + SPTLRPC_MECH_GSS_KRB5 = 1, + SPTLRPC_MECH_GSS_SK = 2, + SPTLRPC_MECH_GSS_MAX, +}; + +enum sptlrpc_service_type { + SPTLRPC_SVC_NULL = 0, /**< no security */ + SPTLRPC_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_SVC_INTG = 2, /**< integrity */ + SPTLRPC_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_SVC_MAX, +}; + +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /**< hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ + SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ + SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_BULK_SVC_MAX, +}; + +/* + * compose/extract macros + */ +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) + +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) + +/* + * gss subflavors + */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + +#define SPTLRPC_SUBFLVR_GSSNULL \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_NULL, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5N \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5A \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_KRB5I \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_KRB5P \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) +#define SPTLRPC_SUBFLVR_SKN \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_SKA \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_SKI \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_SKPI \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_PRIV) + +/* + * "end user" flavors + */ +#define SPTLRPC_FLVR_NULL \ + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_PLAIN \ + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + 
SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_GSSNULL \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5N \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5A \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5I \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5P \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) +#define SPTLRPC_FLVR_SKN \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_SKA \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_SKI \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_SKPI \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) + +#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL + +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + +/** + * extract the useful part from wire flavor + */ +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) + +/** @} flavor */ + +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); +} + +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; + +/** + * Full description of flavors being used on a ptlrpc connection, include + * both regular RPC and bulk transfer parts. + */ +struct sptlrpc_flavor { + /** + * wire flavor, should be renamed to sf_wire. + */ + __u32 sf_rpc; + /** + * general flags of PTLRPC_SEC_FL_* + */ + __u32 sf_flags; + /** + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /** + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; +}; + +/** + * identify the RPC is generated from what part of Lustre. It's encoded into + * RPC requests and to be checked by ptlrpc service. + */ +enum lustre_sec_part { + LUSTRE_SP_CLI = 0, + LUSTRE_SP_MDT, + LUSTRE_SP_OST, + LUSTRE_SP_MGC, + LUSTRE_SP_MGS, + LUSTRE_SP_ANY = 0xFF +}; + +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + +/** + * A rule specifies a flavor to be used by a ptlrpc connection between + * two Lustre parts. 
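The bit-layout diagram and the MAKE_FLVR()/SPTLRPC_FLVR_*() macros above fully determine how a wire flavor decomposes. The standalone check below copies only those constants and macros from lustre_sec.h (with __u32 spelled as uint32_t) and verifies that the krb5i flavor packs the GSS policy, the krb5 mechanism, integrity RPC service, default bulk type and integrity bulk service into the value 0x20212.

/* Standalone check of the flavor packing above: only the constants and
 * macros are copied from lustre_sec.h; nothing else is assumed. */
#include <assert.h>
#include <stdint.h>

#define FLVR_POLICY_OFFSET	(0)
#define FLVR_MECH_OFFSET	(4)
#define FLVR_SVC_OFFSET		(8)
#define FLVR_BULK_TYPE_OFFSET	(12)
#define FLVR_BULK_SVC_OFFSET	(16)

#define MAKE_FLVR(policy, mech, svc, btype, bsvc)		\
	(((uint32_t)(policy) << FLVR_POLICY_OFFSET) |		\
	 ((uint32_t)(mech)   << FLVR_MECH_OFFSET)   |		\
	 ((uint32_t)(svc)    << FLVR_SVC_OFFSET)    |		\
	 ((uint32_t)(btype)  << FLVR_BULK_TYPE_OFFSET) |	\
	 ((uint32_t)(bsvc)   << FLVR_BULK_SVC_OFFSET))

#define SPTLRPC_FLVR_POLICY(f)	  ((((uint32_t)(f)) >> FLVR_POLICY_OFFSET) & 0xF)
#define SPTLRPC_FLVR_MECH(f)	  ((((uint32_t)(f)) >> FLVR_MECH_OFFSET) & 0xF)
#define SPTLRPC_FLVR_SVC(f)	  ((((uint32_t)(f)) >> FLVR_SVC_OFFSET) & 0xF)
#define SPTLRPC_FLVR_BULK_TYPE(f) ((((uint32_t)(f)) >> FLVR_BULK_TYPE_OFFSET) & 0xF)
#define SPTLRPC_FLVR_BULK_SVC(f)  ((((uint32_t)(f)) >> FLVR_BULK_SVC_OFFSET) & 0xF)

int main(void)
{
	/* SPTLRPC_FLVR_KRB5I: GSS(2) policy, krb5(1) mech, integrity(2)
	 * service, default(0) bulk type, integrity(2) bulk service. */
	uint32_t krb5i = MAKE_FLVR(2, 1, 2, 0, 2);

	assert(krb5i == 0x20212);
	assert(SPTLRPC_FLVR_POLICY(krb5i) == 2);	/* SPTLRPC_POLICY_GSS */
	assert(SPTLRPC_FLVR_MECH(krb5i) == 1);		/* SPTLRPC_MECH_GSS_KRB5 */
	assert(SPTLRPC_FLVR_SVC(krb5i) == 2);		/* SPTLRPC_SVC_INTG */
	assert(SPTLRPC_FLVR_BULK_TYPE(krb5i) == 0);	/* SPTLRPC_BULK_DEFAULT */
	assert(SPTLRPC_FLVR_BULK_SVC(krb5i) == 2);	/* SPTLRPC_BULK_SVC_INTG */
	return 0;
}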
+ */ +struct sptlrpc_rule { + __u32 sr_netid; /* LNET network ID */ + __u8 sr_from; /* sec_part */ + __u8 sr_to; /* sec_part */ + __u16 sr_padding; + struct sptlrpc_flavor sr_flvr; +}; + +/** + * A set of rules in memory. + * + * Rules are generated and stored on MGS, and propagated to MDT, OST, + * and client when needed. + */ +struct sptlrpc_rule_set { + int srs_nslot; + int srs_nrule; + struct sptlrpc_rule *srs_rules; +}; + +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); + +static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, + struct sptlrpc_rule *rule); +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); + +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); + +/* The maximum length of security payload. 1024 is enough for Kerberos 5, + * and should be enough for other future mechanisms but not sure. + * Only used by pre-allocated request/reply pool. + */ +#define SPTLRPC_MAX_PAYLOAD (1024) + + +struct vfs_cred { + uint32_t vc_uid; + uint32_t vc_gid; +}; + +struct ptlrpc_ctx_ops { + /** + * To determine whether it's suitable to use the \a ctx for \a vcred. + */ + int (*match) (struct ptlrpc_cli_ctx *ctx, + struct vfs_cred *vcred); + + /** + * To bring the \a ctx uptodate. + */ + int (*refresh) (struct ptlrpc_cli_ctx *ctx); + + /** + * Validate the \a ctx. + */ + int (*validate) (struct ptlrpc_cli_ctx *ctx); + + /** + * Force the \a ctx to die. + */ + void (*die) (struct ptlrpc_cli_ctx *ctx, + int grace); + int (*display) (struct ptlrpc_cli_ctx *ctx, + char *buf, int bufsize); + + /** + * Sign the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message with signature. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). + */ + int (*sign) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Verify the reply message using \a ctx. + * + * \pre req->rq_repdata point to reply message with signature. + * \pre req->rq_repdata_len is the total reply message length. + * \post req->rq_repmsg point to reply message without signature. + * \post req->rq_replen is the reply message length. + * + * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). + */ + int (*verify) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Encrypt the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message in clear text. 
+ * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see gss_cli_ctx_seal(). + */ + int (*seal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Decrypt the reply message using \a ctx. + * + * \pre req->rq_repdata point to encrypted reply message. + * \pre req->rq_repdata_len is the total cipher text length. + * \post req->rq_repmsg point to reply message in clear text. + * \post req->rq_replen is the reply message length in clear text. + * + * \see gss_cli_ctx_unseal(). + */ + int (*unseal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Wrap bulk request data. This is called before wrapping RPC + * request message. + * + * \pre bulk buffer is descripted by desc->bd_iov and + * desc->bd_iov_count. note for read it's just buffer, no data + * need to be sent; for write it contains data in clear text. + * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared + * (usually inside of RPC request message). + * - encryption: cipher text bulk buffer is descripted by + * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov + * count remains the same). + * - otherwise: bulk buffer is still desc->bd_iov and + * desc->bd_iov_count. + * + * \return 0: success. + * \return -ev: error code. + * + * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap bulk reply data. This is called after wrapping RPC + * reply message. + * + * \pre bulk buffer is descripted by desc->bd_iov/desc->bd_enc_iov and + * desc->bd_iov_count, according to wrap_bulk(). + * \post final bulk data in clear text is placed in buffer described + * by desc->bd_iov and desc->bd_iov_count. + * \return +ve nob of actual bulk data in clear text. + * \return -ve error code. + * + * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ +#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ +#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ +#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ +#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ +#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ + +#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT) +#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT) +#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT) +#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT) +#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT) +#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT) + +#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ + PTLRPC_CTX_UPTODATE | \ + PTLRPC_CTX_DEAD | \ + PTLRPC_CTX_ERROR) + +struct ptlrpc_cli_ctx { + struct hlist_node cc_cache; /* linked into ctx cache */ + atomic_t cc_refcount; + struct ptlrpc_sec *cc_sec; + struct ptlrpc_ctx_ops *cc_ops; + time64_t cc_expire; /* in seconds */ + unsigned int cc_early_expire:1; + unsigned long cc_flags; + struct vfs_cred cc_vcred; + spinlock_t cc_lock; + struct list_head cc_req_list; /* waiting reqs linked here */ + struct list_head cc_gc_chain; /* linked to gc chain */ +}; + +/** + * client side policy operation vector. 
+ */ +struct ptlrpc_sec_cops { + /** + * Given an \a imp, create and initialize a ptlrpc_sec structure. + * \param ctx service context: + * - regular import: \a ctx should be NULL; + * - reverse import: \a ctx is obtained from incoming request. + * \param flavor specify what flavor to use. + * + * When necessary, policy module is responsible for taking reference + * on the import. + * + * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr(). + */ + struct ptlrpc_sec * (*create_sec) (struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flavor); + + /** + * Destructor of ptlrpc_sec. When called, refcount has been dropped + * to 0 and all contexts have been destroyed. + * + * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). + */ + void (*destroy_sec) (struct ptlrpc_sec *sec); + + /** + * Notify that this ptlrpc_sec is going to die. Optionally, the policy + * module is supposed to set sec->ps_dying and take whatever actions + * are necessary. + * + * \see plain_kill_sec(), gss_sec_kill(). + */ + void (*kill_sec) (struct ptlrpc_sec *sec); + + /** + * Given \a vcred, lookup and/or create its context. The policy module + * is supposed to maintain its own context cache. + * XXX currently \a create and \a remove_dead are always 1, perhaps + * should be removed completely. + * + * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). + */ + struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, + int remove_dead); + + /** + * Called when the reference count of \a ctx has dropped to 0. The + * policy module is supposed to destroy this context or whatever else + * according to its cache maintenance mechanism. + * + * \param sync if zero, we shouldn't wait for the context being + * destroyed completely. + * + * \see plain_release_ctx(), gss_sec_release_ctx_kr(). + */ + void (*release_ctx) (struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync); + + /** + * Flush the context cache. + * + * \param uid context of which user, -1 means all contexts. + * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected + * contexts should be cleared immediately. + * \param force if zero, only idle contexts will be flushed. + * + * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr(). + */ + int (*flush_ctx_cache) + (struct ptlrpc_sec *sec, + uid_t uid, + int grace, + int force); + + /** + * Called periodically by garbage collector to remove dead contexts + * from cache. + * + * \see gss_sec_gc_ctx_kr(). + */ + void (*gc_ctx) (struct ptlrpc_sec *sec); + + /** + * Given a context \a ctx, install a corresponding reverse service + * context on client side. + * XXX currently it's only used by GSS module, maybe we should remove + * this from general API. + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + + /** + * To allocate request buffer for \a req. + * + * \pre req->rq_reqmsg == NULL. + * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated, + * we are not supposed to free it. + * \post if success, req->rq_reqmsg point to a buffer with size + * at least \a lustre_msg_size. + * + * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). + */ + int (*alloc_reqbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free request buffer for \a req. + * + * \pre req->rq_reqbuf != NULL. + * + * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf().
+ */ + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To allocate reply buffer for \a req. + * + * \pre req->rq_repbuf == NULL. + * \post if success, req->rq_repbuf point to a buffer with size + * req->rq_repbuf_len, the size should be large enough to receive + * reply which be transformed from \a lustre_msg_size of clear text. + * + * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). + */ + int (*alloc_repbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free reply buffer for \a req. + * + * \pre req->rq_repbuf != NULL. + * \post req->rq_repbuf == NULL. + * \post req->rq_repbuf_len == 0. + * + * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf(). + */ + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To expand the request buffer of \a req, thus the \a segment in + * the request message pointed by req->rq_reqmsg can accommodate + * at least \a newsize of data. + * + * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. + * + * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), + * gss_enlarge_reqbuf(). + */ + int (*enlarge_reqbuf) + (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize); + /* + * misc + */ + int (*display) (struct ptlrpc_sec *sec, + struct seq_file *seq); +}; + +/** + * server side policy operation vector. + */ +struct ptlrpc_sec_sops { + /** + * verify an incoming request. + * + * \pre request message is pointed by req->rq_reqbuf, size is + * req->rq_reqdata_len; and the message has been unpacked to + * host byte order. + * + * \retval SECSVC_OK success, req->rq_reqmsg point to request message + * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set; + * req->rq_sp_from is decoded from request. + * \retval SECSVC_COMPLETE success, the request has been fully + * processed, and reply message has been prepared; req->rq_sp_from is + * decoded from request. + * \retval SECSVC_DROP failed, this request should be dropped. + * + * \see null_accept(), plain_accept(), gss_svc_accept_kr(). + */ + int (*accept) (struct ptlrpc_request *req); + + /** + * Perform security transformation upon reply message. + * + * \pre reply message is pointed by req->rq_reply_state->rs_msg, size + * is req->rq_replen. + * \post req->rs_repdata_len is the final message size. + * \post req->rq_reply_off is set. + * + * \see null_authorize(), plain_authorize(), gss_svc_authorize(). + */ + int (*authorize) (struct ptlrpc_request *req); + + /** + * Invalidate server context \a ctx. + * + * \see gss_svc_invalidate_ctx(). + */ + void (*invalidate_ctx) + (struct ptlrpc_svc_ctx *ctx); + + /** + * Allocate a ptlrpc_reply_state. + * + * \param msgsize size of the reply message in clear text. + * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we + * should simply use it; otherwise we'll responsible for allocating + * a new one. + * \post req->rq_reply_state != NULL; + * \post req->rq_reply_state->rs_msg != NULL; + * + * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). + */ + int (*alloc_rs) (struct ptlrpc_request *req, + int msgsize); + + /** + * Free a ptlrpc_reply_state. + */ + void (*free_rs) (struct ptlrpc_reply_state *rs); + + /** + * Release the server context \a ctx. + * + * \see gss_svc_free_ctx(). + */ + void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); + + /** + * Install a reverse context based on the server context \a ctx. + * + * \see gss_svc_install_rctx_kr(). 
+ */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); + + /** + * Prepare buffer for incoming bulk write. + * + * \pre desc->bd_iov and desc->bd_iov_count describes the buffer + * intended to receive the write. + * + * \see gss_svc_prep_bulk(). + */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap the bulk write data. + * + * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Wrap the bulk read data. + * + * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +struct ptlrpc_sec_policy { + struct module *sp_owner; + char *sp_name; + __u16 sp_policy; /* policy number */ + struct ptlrpc_sec_cops *sp_cops; /* client ops */ + struct ptlrpc_sec_sops *sp_sops; /* server ops */ +}; + +#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ +#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ +#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ +#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ +#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ + +/** + * The ptlrpc_sec represents the client side ptlrpc security facilities, + * each obd_import (both regular and reverse import) must associate with + * a ptlrpc_sec. + * + * \see sptlrpc_import_sec_adapt(). + */ +struct ptlrpc_sec { + struct ptlrpc_sec_policy *ps_policy; + atomic_t ps_refcount; + /** statistic only */ + atomic_t ps_nctx; + /** unique identifier */ + int ps_id; + struct sptlrpc_flavor ps_flvr; + enum lustre_sec_part ps_part; + /** after set, no more new context will be created */ + unsigned int ps_dying:1; + /** owning import */ + struct obd_import *ps_import; + spinlock_t ps_lock; + /** mtime of SELinux policy file */ + ktime_t ps_sepol_mtime; + /** next check time of SELinux policy file */ + ktime_t ps_sepol_checknext; + /** + * SELinux policy info + * sepol string format is: + * ::: + */ + char ps_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + + 1]; + + /* + * garbage collection + */ + struct list_head ps_gc_list; + time64_t ps_gc_interval; /* in seconds */ + time64_t ps_gc_next; /* in seconds */ +}; + +static inline int flvr_is_rootonly(__u32 flavor) +{ + return (SPTLRPC_FLVR_POLICY(flavor) == SPTLRPC_POLICY_GSS && + (SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_NULL || + SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_SK)); +} + +static inline int flvr_allows_user_desc(__u32 flavor) +{ + return (SPTLRPC_FLVR_POLICY(flavor) == SPTLRPC_POLICY_GSS && + (SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_NULL || + SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_SK)); +} + +static inline int sec_is_reverse(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); +} + +static inline int sec_is_rootonly(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); +} + + +struct ptlrpc_svc_ctx { + atomic_t sc_refcount; + struct ptlrpc_sec_policy *sc_policy; +}; + +/* + * user identity descriptor + */ +#define LUSTRE_MAX_GROUPS (128) + +struct ptlrpc_user_desc { + __u32 pud_uid; + __u32 pud_gid; + __u32 pud_fsuid; + __u32 pud_fsgid; + __u32 pud_cap; + __u32 pud_ngroups; + __u32 pud_groups[0]; +}; + +/* + * bulk flavors + */ +enum sptlrpc_bulk_hash_alg { + BULK_HASH_ALG_NULL = 0, + BULK_HASH_ALG_ADLER32, + BULK_HASH_ALG_CRC32, + BULK_HASH_ALG_MD5, + BULK_HASH_ALG_SHA1, + BULK_HASH_ALG_SHA256, + 
BULK_HASH_ALG_SHA384, + BULK_HASH_ALG_SHA512, + BULK_HASH_ALG_MAX +}; + +const char * sptlrpc_get_hash_name(__u8 hash_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); + +enum { + BSD_FL_ERR = 1, +}; + +struct ptlrpc_bulk_sec_desc { + __u8 bsd_version; /* 0 */ + __u8 bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ +}; + + +/* + * lprocfs + */ +struct proc_dir_entry; +extern struct proc_dir_entry *sptlrpc_proc_root; + +/* + * round size up to next power of 2, for slab allocation. + * @size must be sane (can't overflow after round up) + */ +static inline int size_roundup_power2(int size) +{ + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + return size; +} + +/* + * internal support libraries + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize); + +/* + * security policies + */ +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); + +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); + +static inline struct ptlrpc_sec_policy * +sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) +{ + __module_get(policy->sp_owner); + return policy; +} + +static inline void +sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) +{ + module_put(policy->sp_owner); +} + +/* + * client credential + */ +static inline +unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) +{ + return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); +} + +static inline +int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); +} + +static inline +int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) != 0); +} + +static inline +int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); +} + +static inline +int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); +} + +static inline +int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); +} + +static inline +int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); +} + +/* + * sec get/put + */ +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec); +void sptlrpc_sec_put(struct ptlrpc_sec *sec); + +/* + * internal apis which only used by policy impelentation + */ +int sptlrpc_get_next_secid(void); +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec); + +/* + * exported client context api + */ +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx); +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); + +/* + * exported client context wrap/buffers + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request 
*req); +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + const struct req_msg_field *field, + int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret); +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); + +void sptlrpc_request_out_callback(struct ptlrpc_request *req); +int sptlrpc_get_sepol(struct ptlrpc_request *req); + +/* + * exported higher interface of import & request + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flvr); +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); +void sptlrpc_import_sec_put(struct obd_import *imp); + +int sptlrpc_import_check_ctx(struct obd_import *imp); +void sptlrpc_import_flush_root_ctx(struct obd_import *imp); +void sptlrpc_import_flush_my_ctx(struct obd_import *imp); +void sptlrpc_import_flush_all_ctx(struct obd_import *imp); +int sptlrpc_req_get_ctx(struct ptlrpc_request *req); +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_export_update_ctx(struct obd_export *exp); +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); + +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule); + +/* gc */ +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx); + +/* misc */ +const char * sec2target_str(struct ptlrpc_sec *sec); +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev); + +/* + * server side + */ +enum secsvc_accept_res { + SECSVC_OK = 0, + SECSVC_COMPLETE, + SECSVC_DROP, +}; + +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req); + +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req); +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset); + +/* + * reverse context + */ +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx); + +/* bulk security api */ +int sptlrpc_enc_pool_add_user(void); +int sptlrpc_enc_pool_del_user(void); +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc); +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); +int get_free_pages_in_pool(void); +int pool_is_at_full_capacity(void); + +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob); +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +#ifdef HAVE_SERVER_SUPPORT +int sptlrpc_svc_prep_bulk(struct ptlrpc_request 
*req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +#endif + +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); + +/* user descriptor helpers */ +static inline int sptlrpc_user_desc_size(int ngroups) +{ + return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); +} + +int sptlrpc_current_user_desc_size(void); +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); +int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + +/** @} sptlrpc */ + +#endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h new file mode 100644 index 0000000000000..96dcd493f5f33 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h @@ -0,0 +1,138 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2017, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines + * are implemented in ptlrpc/lustre_swab.c. These 'swabbers' convert the + * type from "other" endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. 
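+ *
+ * For illustration, a fixed-size swabber usually just fixes up each field
+ * in place with the kernel's __swab16s()/__swab32s()/__swab64s() helpers.
+ * The 'foo_wire' type below is a made-up example for this sketch, not a
+ * real Lustre wire type:
+ *
+ *	struct foo_wire {
+ *		__u32 fw_count;
+ *		__u64 fw_offset;
+ *	};
+ *
+ *	void lustre_swab_foo_wire(struct foo_wire *fw)
+ *	{
+ *		__swab32s(&fw->fw_count);
+ *		__swab64s(&fw->fw_offset);
+ *	}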
+ */ + +#ifndef _LUSTRE_SWAB_H_ +#define _LUSTRE_SWAB_H_ + +#include + +void lustre_swab_orphan_ent(struct lu_orphan_ent *ent); +void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent); +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent); +void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); +void lustre_swab_connect(struct obd_connect_data *ocd); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_state_set(struct hsm_state_set *hss); +void lustre_swab_obd_statfs(struct obd_statfs *os); +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo); +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr); +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); +void lustre_swab_ost_lvb(struct ost_lvb *lvb); +void lustre_swab_obd_quotactl(struct obd_quotactl *q); +void lustre_swab_quota_body(struct quota_body *b); +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); +void lustre_swab_barrier_lvb(struct barrier_lvb *lvb); +void lustre_swab_generic_32s(__u32 *val); +void lustre_swab_mdt_body(struct mdt_body *b); +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b); +void lustre_swab_mdt_rec_setattr(struct mdt_rec_setattr *sa); +void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); +void lustre_swab_lmv_desc(struct lmv_desc *ld); +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm); +void lustre_swab_lov_desc(struct lov_desc *ld); +void lustre_swab_ldlm_res_id(struct ldlm_res_id *id); +void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d); +void lustre_swab_gl_lquota_desc(struct ldlm_gl_lquota_desc *); +void lustre_swab_gl_barrier_desc(struct ldlm_gl_barrier_desc *); +void lustre_swab_ldlm_intent(struct ldlm_intent *i); +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r); +void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l); +void lustre_swab_ldlm_request(struct ldlm_request *rq); +void lustre_swab_ldlm_reply(struct ldlm_reply *r); +void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); +void lustre_swab_mgs_config_body(struct mgs_config_body *body); +void lustre_swab_mgs_config_res(struct mgs_config_res *body); +void lustre_swab_lfsck_request(struct lfsck_request *lr); +void lustre_swab_lfsck_reply(struct lfsck_reply *lr); +void lustre_swab_obdo(struct obdo *o); +void lustre_swab_ost_body(struct ost_body *b); +void lustre_swab_ost_last_id(__u64 *id); +void lustre_swab_fiemap(struct fiemap *fiemap); +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info); +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum); +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size); +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); +void lustre_swab_idx_info(struct idx_info *ii); +void lustre_swab_lip_header(struct lu_idxpage *lip); +void lustre_swab_lustre_capa(struct lustre_capa *c); +void lustre_swab_lustre_capa_key(struct lustre_capa_key *k); +void lustre_swab_fid2path(struct getinfo_fid2path *gf); +void lustre_swab_layout_intent(struct layout_intent *li); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_current_action(struct hsm_current_action *action); +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); +void lustre_swab_hsm_user_state(struct 
hsm_user_state *hus); +void lustre_swab_hsm_user_item(struct hsm_user_item *hui); +void lustre_swab_hsm_request(struct hsm_request *hr); +void lustre_swab_object_update(struct object_update *ou); +void lustre_swab_object_update_request(struct object_update_request *our); +void lustre_swab_out_update_header(struct out_update_header *ouh); +void lustre_swab_out_update_buffer(struct out_update_buffer *oub); +void lustre_swab_object_update_result(struct object_update_result *our); +void lustre_swab_object_update_reply(struct object_update_reply *our); +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); +void lustre_swab_close_data(struct close_data *data); +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync); +void lustre_swab_lmv_user_md(struct lmv_user_md *lum); +void lustre_swab_ladvise(struct lu_ladvise *ladvise); +void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr); + +/* Functions for dumping PTLRPC fields */ +void dump_rniobuf(struct niobuf_remote *rnb); +void dump_ioo(struct obd_ioobj *nb); +void dump_ost_body(struct ost_body *ob); +void dump_rcs(__u32 *rc); + +void lustre_print_user_md(unsigned int level, struct lov_user_md *lum, + const char *msg); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_update.h b/drivers/staging/lustrefsx/lustre/include/lustre_update.h new file mode 100644 index 0000000000000..78cd3d4bfdd51 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_update.h @@ -0,0 +1,709 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.htm + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. 
+ */ +/* + * lustre/include/lustre_update.h + * + * Author: Di Wang + */ + +#ifndef _LUSTRE_UPDATE_H +#define _LUSTRE_UPDATE_H +#include +#include +#include + +#define OUT_UPDATE_REPLY_SIZE 4096 +#define OUT_BULK_BUFFER_SIZE 4096 + +struct dt_key; +struct dt_rec; +struct object_update_param; +struct llog_update_record; + +static inline size_t update_params_size(const struct update_params *params, + unsigned int param_count) +{ + struct object_update_param *param; + size_t total_size = sizeof(*params); + unsigned int i; + + param = (struct object_update_param *)¶ms->up_params[0]; + for (i = 0; i < param_count; i++) { + size_t size = object_update_param_size(param); + + param = (struct object_update_param *)((char *)param + size); + total_size += size; + } + + return total_size; +} + +static inline struct object_update_param * +update_params_get_param(const struct update_params *params, + unsigned int index, unsigned int param_count) +{ + struct object_update_param *param; + unsigned int i; + + if (index > param_count) + return NULL; + + param = (struct object_update_param *)¶ms->up_params[0]; + for (i = 0; i < index; i++) + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + + return param; +} + +static inline void* +update_params_get_param_buf(const struct update_params *params, __u16 index, + unsigned int param_count, __u16 *size) +{ + struct object_update_param *param; + + param = update_params_get_param(params, (unsigned int)index, + param_count); + if (param == NULL) + return NULL; + + if (size != NULL) + *size = param->oup_len; + + return param->oup_buf; +} + +static inline size_t +update_op_size(unsigned int param_count) +{ + return offsetof(struct update_op, uop_params_off[param_count]); +} + +static inline struct update_op * +update_op_next_op(const struct update_op *uop) +{ + return (struct update_op *)((char *)uop + + update_op_size(uop->uop_param_count)); +} + +static inline size_t update_ops_size(const struct update_ops *ops, + unsigned int update_count) +{ + struct update_op *op; + size_t total_size = sizeof(*ops); + unsigned int i; + + op = (struct update_op *)&ops->uops_op[0]; + for (i = 0; i < update_count; i++, op = update_op_next_op(op)) + total_size += update_op_size(op->uop_param_count); + + return total_size; +} + +static inline struct update_params * +update_records_get_params(const struct update_records *record) +{ + return (struct update_params *)((char *)record + + offsetof(struct update_records, ur_ops) + + update_ops_size(&record->ur_ops, record->ur_update_count)); +} + +static inline struct update_param * +update_param_next_param(const struct update_param *param) +{ + return (struct update_param *)((char *)param + + object_update_param_size( + (struct object_update_param *)param)); +} + +static inline size_t +__update_records_size(size_t raw_size) +{ + return cfs_size_round(offsetof(struct update_records, ur_ops) + + raw_size); +} + +static inline size_t +update_records_size(const struct update_records *record) +{ + size_t op_size = 0; + size_t param_size = 0; + + if (record->ur_update_count > 0) + op_size = update_ops_size(&record->ur_ops, + record->ur_update_count); + if (record->ur_param_count > 0) { + struct update_params *params; + + params = update_records_get_params(record); + param_size = update_params_size(params, record->ur_param_count); + } + + return __update_records_size(op_size + param_size); +} + +static inline size_t +__llog_update_record_size(size_t records_size) +{ + return cfs_size_round(sizeof(struct 
llog_rec_hdr) + records_size + + sizeof(struct llog_rec_tail)); +} + +static inline size_t +llog_update_record_size(const struct llog_update_record *lur) +{ + return __llog_update_record_size( + update_records_size(&lur->lur_update_rec)); +} + +static inline struct update_op * +update_ops_get_op(const struct update_ops *ops, unsigned int index, + unsigned int update_count) +{ + struct update_op *op; + unsigned int i; + + if (index > update_count) + return NULL; + + op = (struct update_op *)&ops->uops_op[0]; + for (i = 0; i < index; i++) + op = update_op_next_op(op); + + return op; +} + +static inline void +*object_update_param_get(const struct object_update *update, size_t index, + size_t *size) +{ + const struct object_update_param *param; + size_t i; + + if (index >= update->ou_params_count) + return ERR_PTR(-EINVAL); + + param = &update->ou_params[0]; + for (i = 0; i < index; i++) + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + + if (size != NULL) + *size = param->oup_len; + + if (param->oup_len == 0) + return ERR_PTR(-ENODATA); + + return (void *)¶m->oup_buf[0]; +} + +static inline unsigned long +object_update_request_size(const struct object_update_request *our) +{ + unsigned long size; + size_t i = 0; + + size = offsetof(struct object_update_request, ourq_updates[0]); + for (i = 0; i < our->ourq_count; i++) { + struct object_update *update; + + update = (struct object_update *)((char *)our + size); + size += object_update_size(update); + } + return size; +} + +static inline void +object_update_result_insert(struct object_update_reply *reply, + void *data, size_t data_len, size_t index, + int rc) +{ + struct object_update_result *update_result; + + update_result = object_update_result_get(reply, index, NULL); + LASSERT(update_result); + + update_result->our_rc = ptlrpc_status_hton(rc); + if (rc >= 0) { + if (data_len > 0 && data) + memcpy(update_result->our_data, data, data_len); + update_result->our_datalen = data_len; + } + + reply->ourp_lens[index] = cfs_size_round(data_len + + sizeof(struct object_update_result)); +} + +static inline int +object_update_result_data_get(const struct object_update_reply *reply, + struct lu_buf *lbuf, size_t index) +{ + struct object_update_result *update_result; + size_t size = 0; + int result; + + LASSERT(lbuf != NULL); + update_result = object_update_result_get(reply, index, &size); + if (update_result == NULL || + size < cfs_size_round(sizeof(struct object_update_reply)) || + update_result->our_datalen > size) + RETURN(-EFAULT); + + result = ptlrpc_status_ntoh(update_result->our_rc); + if (result < 0) + return result; + + lbuf->lb_buf = update_result->our_data; + lbuf->lb_len = update_result->our_datalen; + + return result; +} + +/** + * Attached in the thandle to record the updates for distribute + * distribution. + */ +struct thandle_update_records { + /* All of updates for the cross-MDT operation, vmalloc'd. */ + struct llog_update_record *tur_update_records; + size_t tur_update_records_buf_size; + + /* All of parameters for the cross-MDT operation, vmalloc'd */ + struct update_params *tur_update_params; + unsigned int tur_update_param_count; + size_t tur_update_params_buf_size; +}; + +#define TOP_THANDLE_MAGIC 0x20140917 +struct top_multiple_thandle { + struct dt_device *tmt_master_sub_dt; + atomic_t tmt_refcount; + /* Other sub transactions will be listed here. 
*/ + struct list_head tmt_sub_thandle_list; + spinlock_t tmt_sub_lock; + + struct list_head tmt_commit_list; + /* All of the update records will be packed here */ + struct thandle_update_records *tmt_update_records; + + wait_queue_head_t tmt_stop_waitq; + __u64 tmt_batchid; + int tmt_result; + __u32 tmt_magic; + size_t tmt_record_size; + __u32 tmt_committed:1; +}; + +/* {top,sub}_thandle are used to manage distributed transactions which + * include updates on several nodes. A top_thandle represents the + * whole operation, and sub_thandle represents updates on each node. */ +struct top_thandle { + struct thandle tt_super; + /* The master sub transaction. */ + struct thandle *tt_master_sub_thandle; + + struct top_multiple_thandle *tt_multiple_thandle; +}; + +struct sub_thandle_cookie { + struct llog_cookie stc_cookie; + struct list_head stc_list; +}; + +/* Sub thandle is used to track multiple sub thandles under one parent + * thandle */ +struct sub_thandle { + struct thandle *st_sub_th; + struct dt_device *st_dt; + struct list_head st_cookie_list; + struct dt_txn_commit_cb st_commit_dcb; + struct dt_txn_commit_cb st_stop_dcb; + int st_result; + + /* linked to top_thandle */ + struct list_head st_sub_list; + + /* If this sub thandle is committed */ + bool st_committed:1, + st_stopped:1, + st_started:1; +}; + +struct tx_arg; +typedef int (*tx_exec_func_t)(const struct lu_env *env, struct thandle *th, + struct tx_arg *ta); + +/* Structure for holding one update execution */ +struct tx_arg { + tx_exec_func_t exec_fn; + tx_exec_func_t undo_fn; + struct dt_object *object; + const char *file; + struct object_update_reply *reply; + int line; + int index; + union { + struct { + struct dt_insert_rec rec; + const struct dt_key *key; + } insert; + struct { + } ref; + struct { + struct lu_attr attr; + } attr_set; + struct { + struct lu_buf buf; + const char *name; + int flags; + __u32 csum; + } xattr_set; + struct { + struct lu_attr attr; + struct dt_allocation_hint hint; + struct dt_object_format dof; + struct lu_fid fid; + } create; + struct { + struct lu_buf buf; + loff_t pos; + } write; + struct { + struct ost_body *body; + } destroy; + } u; +}; + +/* Structure for holding all update executions of one transaction */ +struct thandle_exec_args { + struct thandle *ta_handle; + int ta_argno; /* used args */ + int ta_alloc_args; /* allocated args count */ + struct tx_arg **ta_args; +}; + +/* target/out_lib.c */ +int out_update_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, enum update_type op, + const struct lu_fid *fid, unsigned int params_count, + __u16 *param_sizes, const void **param_bufs, + __u32 reply_size); +int out_create_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof); +int out_destroy_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid); +int out_index_delete_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct dt_key *key); +int out_index_insert_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct dt_rec *rec, + const struct dt_key *key); +int out_xattr_set_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_buf
*buf, + const char *name, __u32 flag); +int out_xattr_del_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const char *name); +int out_attr_set_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_attr *attr); +int out_ref_add_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_ref_del_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_write_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_buf *buf, + __u64 pos); +int out_attr_get_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_index_lookup_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, struct dt_rec *rec, + const struct dt_key *key); +int out_xattr_get_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const char *name, + const int bufsize); +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize); +int out_read_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_length, const struct lu_fid *fid, + size_t size, loff_t pos); + +const char *update_op_str(__u16 opcode); + +/* target/update_trans.c */ +struct thandle *thandle_get_sub_by_dt(const struct lu_env *env, + struct thandle *th, + struct dt_device *sub_dt); + +static inline struct thandle * +thandle_get_sub(const struct lu_env *env, struct thandle *th, + const struct dt_object *sub_obj) +{ + return thandle_get_sub_by_dt(env, th, lu2dt_dev(sub_obj->do_lu.lo_dev)); +} + +struct thandle * +top_trans_create(const struct lu_env *env, struct dt_device *master_dev); +int top_trans_start(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th); +int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th); +void top_multiple_thandle_destroy(struct top_multiple_thandle *tmt); + +static inline void top_multiple_thandle_get(struct top_multiple_thandle *tmt) +{ + atomic_inc(&tmt->tmt_refcount); +} + +static inline void top_multiple_thandle_put(struct top_multiple_thandle *tmt) +{ + if (atomic_dec_and_test(&tmt->tmt_refcount)) + top_multiple_thandle_destroy(tmt); +} + +struct sub_thandle *lookup_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev); +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st); + +/* update_records.c */ +size_t update_records_create_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof); +size_t update_records_attr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr); +size_t update_records_ref_add_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_ref_del_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_destroy_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_index_insert_size(const struct 
lu_env *env, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key); +size_t update_records_index_delete_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_key *key); +size_t update_records_xattr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + const char *name, + __u32 flag); +size_t update_records_xattr_del_size(const struct lu_env *env, + const struct lu_fid *fid, + const char *name); +size_t update_records_write_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos); +size_t update_records_punch_size(const struct lu_env *env, + const struct lu_fid *fid, + __u64 start, __u64 end); + +int update_records_create_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof); +int update_records_attr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr); +int update_records_ref_add_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_ref_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_destroy_pack(const struct lu_env *env, + struct update_ops *ops, unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_index_insert_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key); +int update_records_index_delete_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_key *key); +int update_records_xattr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, + __u32 flag); +int update_records_xattr_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const char *name); +int update_records_write_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + 
const struct lu_buf *buf, + __u64 pos); +int update_records_punch_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + __u64 start, __u64 end); +int update_records_noop_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); + +int tur_update_records_extend(struct thandle_update_records *tur, + size_t new_size); +int tur_update_params_extend(struct thandle_update_records *tur, + size_t new_size); +int tur_update_extend(struct thandle_update_records *tur, + size_t new_op_size, size_t new_param_size); + +#define update_record_pack(name, th, ...) \ +({ \ + struct top_thandle *top_th; \ + struct top_multiple_thandle *tmt; \ + struct thandle_update_records *tur; \ + struct llog_update_record *lur; \ + size_t avail_param_size; \ + size_t avail_op_size; \ + int ret; \ + \ + while (1) { \ + top_th = container_of(th, struct top_thandle, tt_super);\ + tmt = top_th->tt_multiple_thandle; \ + tur = tmt->tmt_update_records; \ + lur = tur->tur_update_records; \ + avail_param_size = tur->tur_update_params_buf_size - \ + update_params_size(tur->tur_update_params, \ + tur->tur_update_param_count); \ + avail_op_size = tur->tur_update_records_buf_size - \ + llog_update_record_size(lur); \ + ret = update_records_##name##_pack(env, \ + &lur->lur_update_rec.ur_ops, \ + &lur->lur_update_rec.ur_update_count, \ + &avail_op_size, \ + tur->tur_update_params, \ + &tur->tur_update_param_count, \ + &avail_param_size, __VA_ARGS__); \ + if (ret == -E2BIG) { \ + ret = tur_update_extend(tur, avail_op_size, \ + avail_param_size); \ + if (ret != 0) \ + break; \ + continue; \ + } else { \ + break; \ + } \ + } \ + ret; \ +}) + +#define update_record_size(env, name, th, ...) \ +({ \ + struct top_thandle *top_th; \ + struct top_multiple_thandle *tmt; \ + \ + top_th = container_of(th, struct top_thandle, tt_super); \ + \ + LASSERT(top_th->tt_multiple_thandle != NULL); \ + tmt = top_th->tt_multiple_thandle; \ + tmt->tmt_record_size += \ + update_records_##name##_size(env, __VA_ARGS__); \ +}) +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lvfs.h b/drivers/staging/lustrefsx/lustre/include/lvfs.h new file mode 100644 index 0000000000000..f24aff819f668 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lvfs.h @@ -0,0 +1,101 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lvfs.h + * + * lustre VFS/process permission interface + */ + +#ifndef __LVFS_H__ +#define __LVFS_H__ + +#include +#include +#include +#include +#include +#include + +#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA +#define OBD_CTXT_DEBUG /* development-only debugging */ + +struct dt_device; + +struct lvfs_run_ctxt { + struct vfsmount *pwdmnt; + struct dentry *pwd; + int umask; + struct dt_device *dt; +#ifdef OBD_CTXT_DEBUG + unsigned int magic; +#endif +}; + +static inline void OBD_SET_CTXT_MAGIC(struct lvfs_run_ctxt *ctxt) +{ +#ifdef OBD_CTXT_DEBUG + ctxt->magic = OBD_RUN_CTXT_MAGIC; +#endif +} + +/* ptlrpc_sec_ctx.c */ +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx); +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx); + +/* We need to hold the inode semaphore over the dcache lookup itself, or we + * run the risk of entering the filesystem lookup path concurrently on SMP + * systems, and instantiating two inodes for the same entry. We still + * protect against concurrent addition/removal races with the DLM locking. + */ +static inline struct dentry * +ll_lookup_one_len(const char *fid_name, struct dentry *dparent, + int fid_namelen) +{ + struct dentry *dchild; + + inode_lock(dparent->d_inode); + dchild = lookup_one_len(fid_name, dparent, fid_namelen); + inode_unlock(dparent->d_inode); + + if (IS_ERR(dchild) || dchild->d_inode == NULL) + return dchild; + + if (is_bad_inode(dchild->d_inode)) { + CERROR("bad inode returned %lu/%u\n", + dchild->d_inode->i_ino, dchild->d_inode->i_generation); + dput(dchild); + dchild = ERR_PTR(-ENOENT); + } + + return dchild; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/md_object.h b/drivers/staging/lustrefsx/lustre/include/md_object.h new file mode 100644 index 0000000000000..d84f07e054201 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/md_object.h @@ -0,0 +1,731 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/md_object.h + * + * Extention of lu_object.h for metadata objects + */ + +#ifndef _LUSTRE_MD_OBJECT_H +#define _LUSTRE_MD_OBJECT_H + +#ifndef HAVE_SERVER_SUPPORT +# error "client code should not depend on md_object.h" +#endif /* !HAVE_SERVER_SUPPORT */ + +/** \defgroup md md + * Sub-class of lu_object with methods common for "meta-data" objects in MDT + * stack. + * + * Meta-data objects implement namespace operations: you can link, unlink + * them, and treat them as directories. + * + * Examples: mdt, cmm, and mdt are implementations of md interface. + * @{ + */ + + +/* + * super-class definitions. + */ +#include + +struct md_device; +struct md_device_operations; +struct md_object; +struct obd_export; + +/** metadata attributes */ +enum ma_valid { + MA_INODE = 1 << 0, + MA_LOV = 1 << 1, + MA_FLAGS = 1 << 2, + MA_LMV = 1 << 3, + MA_ACL_DEF = 1 << 4, + MA_LOV_DEF = 1 << 5, + MA_HSM = 1 << 6, + MA_PFID = 1 << 7, + MA_LMV_DEF = 1 << 8, + MA_SOM = 1 << 9, +}; + +typedef enum { + MDL_MINMODE = 0, + MDL_EX = 1, + MDL_PW = 2, + MDL_PR = 4, + MDL_CW = 8, + MDL_CR = 16, + MDL_NL = 32, + MDL_GROUP = 64, + MDL_MAXMODE +} mdl_mode_t; + +typedef enum { + MDT_NUL_LOCK = 0, + MDT_REG_LOCK = (1 << 0), + MDT_PDO_LOCK = (1 << 1) +} mdl_type_t; + +/* lfs rgetfacl permission check */ +#define MAY_RGETFACL (1 << 14) + +/* memory structure for hsm attributes + * for fields description see the on disk structure hsm_attrs + * which is defined in lustre_idl.h + */ +struct md_hsm { + __u32 mh_compat; + __u32 mh_flags; + __u64 mh_arch_id; + __u64 mh_arch_ver; +}; + + +/* memory structure for SOM attributes + * for fields description see the on disk structure som_attrs + * which is defined in lustre_idl.h + */ +struct md_som { + __u16 ms_valid; + __u64 ms_size; + __u64 ms_blocks; +}; + +struct md_attr { + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct md_som ma_som; + struct lov_mds_md *ma_lmm; + union lmv_mds_md *ma_lmv; + struct lmv_user_md *ma_default_lmv; + void *ma_acl; + int ma_lmm_size; + int ma_lmv_size; + int ma_default_lmv_size; + int ma_acl_size; + int ma_enable_chprojid_gid; +}; + +/** Additional parameters for create */ +struct md_op_spec { + union { + /** symlink target */ + const char *sp_symname; + /** eadata for regular files */ + struct md_spec_reg { + void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Open flags from client: such as MDS_OPEN_CREAT, and others. */ + __u64 sp_cr_flags; + + /* File security context for creates. */ + const char *sp_cr_file_secctx_name; /* (security) xattr name */ + void *sp_cr_file_secctx; /* xattr value */ + size_t sp_cr_file_secctx_size; /* xattr value size */ + + /** don't create lov objects or llog cookie - this replay */ + unsigned int no_create:1, + sp_cr_lookup:1, /* do lookup sanity check or not. */ + sp_rm_entry:1, /* only remove name entry */ + sp_permitted:1, /* do not check permission */ + sp_migrate_close:1; /* close the file during migrate */ + /** Current lock mode for parent dir where create is performing. */ + mdl_mode_t sp_cr_mode; + + /** to create directory */ + const struct dt_index_features *sp_feat; +}; + +enum md_layout_opc { + MD_LAYOUT_NOP = 0, + MD_LAYOUT_WRITE, /* FLR: write the file */ + MD_LAYOUT_RESYNC, /* FLR: resync starts */ + MD_LAYOUT_RESYNC_DONE, /* FLR: resync done */ +}; + +/** + * Parameters for layout change API. 
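+ *
+ * For illustration, an FLR "resync done" request might be described
+ * roughly as below; 'nr_mirrors' and 'mirror_ids' are placeholder local
+ * variables, not part of this API:
+ *
+ *	struct md_layout_change mlc = {
+ *		.mlc_opc          = MD_LAYOUT_RESYNC_DONE,
+ *		.mlc_resync_count = nr_mirrors,
+ *		.mlc_resync_ids   = mirror_ids,
+ *	};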
+ */ +struct md_layout_change { + enum md_layout_opc mlc_opc; + __u16 mlc_mirror_id; + struct layout_intent *mlc_intent; + struct lu_buf mlc_buf; + struct lustre_som_attrs mlc_som; + size_t mlc_resync_count; + __u32 *mlc_resync_ids; +}; + +union ldlm_policy_data; +/** + * Operations implemented for each md object (both directory and leaf). + */ +struct md_object_operations { + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, int mask); + + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); + + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); + + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); + + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); + + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); + + /** This method is used to swap the layouts between 2 objects */ + int (*moo_swap_layouts)(const struct lu_env *env, + struct md_object *obj1, struct md_object *obj2, + __u64 flags); + + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_changelog)(const struct lu_env *env, + enum changelog_rec_type type, + enum changelog_rec_flags clf_flags, + struct md_device *m, const struct lu_fid *fid); + + int (*moo_open)(const struct lu_env *env, + struct md_object *obj, u64 open_flags); + + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, u64 open_flags); + + int (*moo_object_sync)(const struct lu_env *, struct md_object *); + + int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + int (*moo_object_unlock)(const struct lu_env *env, + struct md_object *obj, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + int (*moo_invalidate)(const struct lu_env *env, struct md_object *obj); + /** + * Trying to write to un-instantiated layout component. + * + * The caller should have held layout lock. + * + * This API can be extended to support every other layout changing + * operations, such as component {add,del,change}, layout swap, + * layout merge, etc. One of the benefits by doing this is that the MDT + * no longer needs to understand layout. + * + * However, layout creation, removal, and fetch should still use + * xattr_{get,set}() because they don't interpret layout on the + * MDT layer. + * + * \param[in] env execution environment + * \param[in] obj MD object + * \param[in] layout data structure to describe the changes to + * the MD object's layout + * + * \retval 0 success + * \retval -ne error code + */ + int (*moo_layout_change)(const struct lu_env *env, + struct md_object *obj, + struct md_layout_change *layout); +}; + +/** + * Operations implemented for each directory object. 
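+ *
+ * Callers reach these methods through the object's mo_dir_ops pointer;
+ * for instance, a lookup is dispatched roughly as below (sketch only,
+ * the local names are placeholders):
+ *
+ *	rc = pobj->mo_dir_ops->mdo_lookup(env, pobj, lname, &fid, spec);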
+ */ +struct md_dir_operations { + int (*mdo_is_subdir)(const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid); + + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); + + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); + + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); + + /** This method is used for creating data object for this meta object*/ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); + + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); + + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); + + int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, + struct md_object *cobj, const struct lu_name *lname, + struct md_attr *ma, int no_name); + + int (*mdo_migrate)(const struct lu_env *env, struct md_object *pobj, + struct md_object *sobj, const struct lu_name *lname, + struct md_object *tobj, struct md_op_spec *spec, + struct md_attr *ma); +}; + +struct md_device_operations { + /** meta-data device related handlers. */ + int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, + struct lu_fid *f); + + const struct dt_device_param *(*mdo_dtconf_get)(const struct lu_env *e, + struct md_device *m); + + int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, + struct obd_statfs *sfs); + + int (*mdo_llog_ctxt_get)(const struct lu_env *env, + struct md_device *m, int idx, void **h); + + int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m, + unsigned int cmd, int len, void *data); +}; + +struct md_device { + struct lu_device md_lu_dev; + const struct md_device_operations *md_ops; +}; + +struct md_object { + struct lu_object mo_lu; + const struct md_object_operations *mo_ops; + const struct md_dir_operations *mo_dir_ops; +}; + +static inline struct md_device *lu2md_dev(const struct lu_device *d) +{ + LASSERT(IS_ERR(d) || lu_device_is_md(d)); + return container_of0(d, struct md_device, md_lu_dev); +} + +static inline struct lu_device *md2lu_dev(struct md_device *d) +{ + return &d->md_lu_dev; +} + +static inline struct md_object *lu2md(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev)); + return container_of0(o, struct md_object, mo_lu); +} + +static inline int md_device_init(struct md_device *md, struct lu_device_type *t) +{ + return lu_device_init(&md->md_lu_dev, t); +} + +static inline void md_device_fini(struct md_device *md) +{ + lu_device_fini(&md->md_lu_dev); +} + +static inline struct md_object *md_object_find_slice(const struct lu_env *env, + struct md_device *md, + const struct lu_fid *f) +{ + return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL)); +} + + +/** md operations */ +static inline int mo_permission(const struct lu_env *env, struct md_object *p, + struct md_object *c, struct md_attr *at, + int mask) +{ + LASSERT(c->mo_ops->moo_permission); + return c->mo_ops->moo_permission(env, p, c, at, mask); +} + +static inline 
int mo_attr_get(const struct lu_env *env, struct md_object *m, + struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); +} + +static inline int mo_readlink(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_readlink); + return m->mo_ops->moo_readlink(env, m, buf); +} + +static inline int mo_changelog(const struct lu_env *env, + enum changelog_rec_type type, + enum changelog_rec_flags clf_flags, + struct md_device *m, const struct lu_fid *fid) +{ + struct lu_fid rootfid; + struct md_object *root; + int rc; + + rc = m->md_ops->mdo_root_get(env, m, &rootfid); + if (rc) + return rc; + + root = md_object_find_slice(env, m, &rootfid); + if (IS_ERR(root)) + RETURN(PTR_ERR(root)); + + LASSERT(root->mo_ops->moo_changelog); + rc = root->mo_ops->moo_changelog(env, type, clf_flags, m, fid); + + lu_object_put(env, &root->mo_lu); + + return rc; +} + +static inline int mo_attr_set(const struct lu_env *env, + struct md_object *m, + const struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_set); + return m->mo_ops->moo_attr_set(env, m, at); +} + +static inline int mo_xattr_get(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_get); + return m->mo_ops->moo_xattr_get(env, m, buf, name); +} + +static inline int mo_xattr_del(const struct lu_env *env, + struct md_object *m, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_del); + return m->mo_ops->moo_xattr_del(env, m, name); +} + +static inline int mo_xattr_set(const struct lu_env *env, + struct md_object *m, + const struct lu_buf *buf, + const char *name, + int flags) +{ + LASSERT(m->mo_ops->moo_xattr_set); + return m->mo_ops->moo_xattr_set(env, m, buf, name, flags); +} + +static inline int mo_xattr_list(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_xattr_list); + return m->mo_ops->moo_xattr_list(env, m, buf); +} + +static inline int mo_invalidate(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_invalidate); + return m->mo_ops->moo_invalidate(env, m); +} + +static inline int mo_layout_change(const struct lu_env *env, + struct md_object *m, + struct md_layout_change *layout) +{ + /* need instantiate objects which in the access range */ + LASSERT(m->mo_ops->moo_layout_change); + return m->mo_ops->moo_layout_change(env, m, layout); +} + +static inline int mo_swap_layouts(const struct lu_env *env, + struct md_object *o1, + struct md_object *o2, __u64 flags) +{ + LASSERT(o1->mo_ops->moo_swap_layouts); + LASSERT(o2->mo_ops->moo_swap_layouts); + if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts) + return -EPERM; + return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); +} + +static inline int mo_open(const struct lu_env *env, struct md_object *m, + u64 open_flags) +{ + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, open_flags); +} + +static inline int mo_close(const struct lu_env *env, struct md_object *m, + struct md_attr *ma, u64 open_flags) +{ + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, open_flags); +} + +static inline int mo_readpage(const struct lu_env *env, + struct md_object *m, + const struct lu_rdpg *rdpg) +{ + LASSERT(m->mo_ops->moo_readpage); + return m->mo_ops->moo_readpage(env, m, rdpg); +} + +static inline int mo_object_sync(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_object_sync); + return 
m->mo_ops->moo_object_sync(env, m); +} + +static inline int mo_object_lock(const struct lu_env *env, + struct md_object *m, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(m->mo_ops->moo_object_lock); + return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy); +} + +static inline int mo_object_unlock(const struct lu_env *env, + struct md_object *m, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(m->mo_ops->moo_object_unlock); + return m->mo_ops->moo_object_unlock(env, m, einfo, policy); +} + +static inline int mdo_lookup(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + struct lu_fid *f, + struct md_op_spec *spec) +{ + LASSERT(p->mo_dir_ops->mdo_lookup); + return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec); +} + +static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env, + struct md_object *mo, + mdl_mode_t lm) +{ + if (mo->mo_dir_ops->mdo_lock_mode == NULL) + return MDL_MINMODE; + return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm); +} + +static inline int mdo_create(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lchild_name, + struct md_object *c, + struct md_op_spec *spc, + struct md_attr *at) +{ + LASSERT(p->mo_dir_ops->mdo_create); + return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at); +} + +static inline int mdo_create_data(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(c->mo_dir_ops->mdo_create_data); + return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma); +} + +static inline int mdo_rename(const struct lu_env *env, + struct md_object *sp, + struct md_object *tp, + const struct lu_fid *lf, + const struct lu_name *lsname, + struct md_object *t, + const struct lu_name *ltname, + struct md_attr *ma) +{ + LASSERT(tp->mo_dir_ops->mdo_rename); + return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname, + ma); +} + +static inline int mdo_migrate(const struct lu_env *env, + struct md_object *pobj, + struct md_object *sobj, + const struct lu_name *lname, + struct md_object *tobj, + struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(pobj->mo_dir_ops->mdo_migrate); + return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, spec, + ma); +} + +static inline int mdo_is_subdir(const struct lu_env *env, + struct md_object *mo, + const struct lu_fid *fid) +{ + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid); +} + +static inline int mdo_link(const struct lu_env *env, + struct md_object *p, + struct md_object *s, + const struct lu_name *lname, + struct md_attr *ma) +{ + LASSERT(s->mo_dir_ops->mdo_link); + return s->mo_dir_ops->mdo_link(env, p, s, lname, ma); +} + +static inline int mdo_unlink(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct lu_name *lname, + struct md_attr *ma, int no_name) +{ + LASSERT(p->mo_dir_ops->mdo_unlink); + return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); +} + +static inline int mdo_statfs(const struct lu_env *env, + struct md_device *m, + struct obd_statfs *sfs) +{ + LASSERT(m->md_ops->mdo_statfs); + return m->md_ops->mdo_statfs(env, m, sfs); +} + +/** + * Used in MDD/OUT layer for object lock rule + **/ +enum mdd_object_role { + MOR_SRC_PARENT, + MOR_SRC_CHILD, + MOR_TGT_PARENT, + MOR_TGT_CHILD, + MOR_TGT_ORPHAN +}; + +struct dt_device; + +void lustre_som_swab(struct 
lustre_som_attrs *attrs); +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); +void lustre_hsm2buf(void *buf, const struct md_hsm *mh); + +enum { + UCRED_INVALID = -1, + UCRED_INIT = 0, + UCRED_OLD = 1, + UCRED_NEW = 2, +}; + +struct lu_ucred { + __u32 uc_valid; + __u32 uc_o_uid; + __u32 uc_o_gid; + __u32 uc_o_fsuid; + __u32 uc_o_fsgid; + __u32 uc_uid; + __u32 uc_gid; + __u32 uc_fsuid; + __u32 uc_fsgid; + __u32 uc_suppgids[2]; + cfs_cap_t uc_cap; + __u32 uc_umask; + struct group_info *uc_ginfo; + struct md_identity *uc_identity; + char uc_jobid[LUSTRE_JOBID_SIZE]; + lnet_nid_t uc_nid; + bool uc_enable_audit; +}; + +struct lu_ucred *lu_ucred(const struct lu_env *env); + +struct lu_ucred *lu_ucred_check(const struct lu_env *env); + +struct lu_ucred *lu_ucred_assert(const struct lu_env *env); + +int lu_ucred_global_init(void); + +void lu_ucred_global_fini(void); + +#define md_cap_t(x) (x) + +#define MD_CAP_TO_MASK(x) (1 << (x)) + +#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag)) + +/* capable() is copied from linux kernel! */ +static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap) +{ + if (md_cap_raised(uc->uc_cap, cap)) + return 1; + return 0; +} + +/** @} md */ +#endif /* _LINUX_MD_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h new file mode 100644 index 0000000000000..62f751a44d0fb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -0,0 +1,1286 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __OBD_H +#define __OBD_H + +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_OBD_DEVICES 8192 + +struct osc_async_rc { + int ar_rc; + int ar_force_sync; + __u64 ar_min_xid; +}; + +struct lov_oinfo { /* per-stripe data structure */ + struct ost_id loi_oi; /* object ID/Sequence on the target OST */ + int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ + int loi_ost_gen; /* generation of this loi_ost_idx */ + + unsigned long loi_kms_valid:1; + __u64 loi_kms; /* known minimum size */ + struct ost_lvb loi_lvb; + struct osc_async_rc loi_ar; +}; + +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + +struct lov_stripe_md; +struct obd_info; + +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* OBD_STATFS_* flags */ + __u64 oi_flags; + struct obd_device *oi_obd; + struct lu_tgt_desc *oi_tgt; + /* statfs data specific for every OSC, if needed at all. */ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on upper + * level. E.g. it is used for update lsm->lsm_oinfo at every received + * request in osc level for enqueue requests. It is also possible to + * update some caller data from LOV layer if needed. */ + obd_enqueue_update_f oi_cb_up; +}; + +struct obd_type { + struct list_head typ_chain; + struct obd_ops *typ_dt_ops; + struct md_ops *typ_md_ops; + struct proc_dir_entry *typ_procroot; + struct proc_dir_entry *typ_procsym; + struct dentry *typ_debugfs_entry; +#ifdef HAVE_SERVER_SUPPORT + bool typ_sym_filter; +#endif + char *typ_name; + int typ_refcnt; + struct lu_device_type *typ_lu; + spinlock_t obd_type_lock; + struct kobject *typ_kobj; +}; + +struct brw_page { + u64 off; + struct page *pg; + u32 count; + u32 flag; +}; + +struct timeout_item { + enum timeout_event ti_event; + time64_t ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; +}; + +#define OBD_MAX_RIF_DEFAULT 8 +#define OBD_MAX_RIF_MAX 512 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT 2000 /* Arbitrary large value */ +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 + +/* possible values for lut_sync_lock_cancel */ +enum tgt_sync_lock_cancel { + SYNC_LOCK_CANCEL_NEVER = 0, + SYNC_LOCK_CANCEL_BLOCKING = 1, + SYNC_LOCK_CANCEL_ALWAYS = 2, +}; + +/* + * Limit reply buffer size for striping data to one x86_64 page. This + * value is chosen to fit the striping data for common use cases while + * staying well below the limit at which the buffer must be backed by + * vmalloc(). Excessive use of vmalloc() may cause spinlock contention + * on the MDS. + */ +#define OBD_MAX_DEFAULT_EA_SIZE 4096 + +enum obd_cl_sem_lock_class { + OBD_CLI_SEM_NORMAL, + OBD_CLI_SEM_MGC, + OBD_CLI_SEM_MDCOSC, +}; + +struct mdc_rpc_lock; +struct obd_import; +struct client_obd { + struct rw_semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ + size_t cl_conn_count; + + /* Cache maximum and default values for easize. This is + * strictly a performance optimization to minimize calls to + * obd_size_diskmd(). The default values are used to calculate the + * initial size of a request buffer. 
The ptlrpc layer will resize the + * buffer as needed to accommodate a larger reply from the + * server. The default values should be small enough to avoid wasted + * memory and excessive use of vmalloc(), yet large enough to avoid + * reallocating the buffer in the common use case. */ + + /* Default EA size for striping attributes. It is initialized at + * mount-time based on the default stripe width of the filesystem, + * then it tracks the largest observed EA size advertised by + * the MDT, up to a maximum value of OBD_MAX_DEFAULT_EA_SIZE. */ + __u32 cl_default_mds_easize; + + /* Maximum possible EA size computed at mount-time based on + * the number of OSTs in the filesystem. May be increased at + * run-time if a larger observed size is advertised by the MDT. */ + __u32 cl_max_mds_easize; + + /* Data-on-MDT specific value to set larger reply buffer for possible + * data read along with open/stat requests. By default it tries to use + * unused space in reply buffer. + * This value is used to ensure that reply buffer has at least as + * much free space as value indicates. That free space is gained from + * LOV EA buffer which is small for DoM files and on big systems can + * provide up to 32KB of extra space in reply buffer. + * Default value is 8K now. + */ + __u32 cl_dom_min_inline_repsize; + + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ + + /* the grant values are protected by loi_list_lock below */ + unsigned long cl_dirty_pages; /* all _dirty_ in pages */ + unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ + unsigned long cl_avail_grant; /* bytes of credit for ost */ + unsigned long cl_lost_grant; /* lost credits (trunc) */ + /* grant consumed for dirty pages */ + unsigned long cl_dirty_grant; + + /* since we allocate grant by blocks, we don't know how many grant will + * be used to add a page into cache. As a solution, we reserve maximum + * grant before trying to dirty a page and unreserve the rest. + * See osc_{reserve|unreserve}_grant for details. */ + long cl_reserved_grant; + wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */ + time64_t cl_next_shrink_grant; /* seconds */ + struct list_head cl_grant_chain; + time64_t cl_grant_shrink_interval; /* seconds */ + + /* A chunk is an optimal size used by osc_extent to determine + * the extent size. A chunk is max(PAGE_SIZE, OST block size) */ + int cl_chunkbits; + /* extent insertion metadata overhead to be accounted in grant, + * in bytes */ + unsigned int cl_grant_extent_tax; + /* maximum extent size, in number of pages */ + unsigned int cl_max_extent_pages; + + /* keep track of objects that have lois that contain pages which + * have been queued for async brw. this lock also protects the + * lists of osc_client_pages that hang off of the loi */ + /* + * ->cl_loi_list_lock protects consistency of + * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and + * ->ap_completion() call-backs are executed under this lock. As we + * cannot guarantee that these call-backs never block on all platforms + * (as a matter of fact they do block on Mac OS X), type of + * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux + * and blocking mutex on Mac OS X. (Alternative is to make this lock + * blocking everywhere, but we don't want to slow down fast-path of + * our main platform.) + * + * NB by Jinshan: though field names are still _loi_, but actually + * osc_object{}s are in the list. 
+ */ + spinlock_t cl_loi_list_lock; + struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; + struct list_head cl_loi_write_list; + struct list_head cl_loi_read_list; + __u32 cl_r_in_flight; + __u32 cl_w_in_flight; + /* just a sum of the loi/lop pending numbers to be exported by /proc */ + atomic_t cl_pending_w_pages; + atomic_t cl_pending_r_pages; + u32 cl_max_pages_per_rpc; + u32 cl_max_rpcs_in_flight; + u32 cl_max_short_io_bytes; + struct obd_histogram cl_read_rpc_hist; + struct obd_histogram cl_write_rpc_hist; + struct obd_histogram cl_read_page_hist; + struct obd_histogram cl_write_page_hist; + struct obd_histogram cl_read_offset_hist; + struct obd_histogram cl_write_offset_hist; + + /** LRU for osc caching pages */ + struct cl_client_cache *cl_cache; + /** member of cl_cache->ccc_lru */ + struct list_head cl_lru_osc; + /** # of available LRU slots left in the per-OSC cache. + * Available LRU slots are shared by all OSCs of the same file system, + * therefore this is a pointer to cl_client_cache::ccc_lru_left. */ + atomic_long_t *cl_lru_left; + /** # of busy LRU pages. A page is considered busy if it's in writeback + * queue, or in transfer. Busy pages can't be discarded so they are not + * in LRU cache. */ + atomic_long_t cl_lru_busy; + /** # of LRU pages in the cache for this client_obd */ + atomic_long_t cl_lru_in_list; + /** # of threads are shrinking LRU cache. To avoid contention, it's not + * allowed to have multiple threads shrinking LRU cache. */ + atomic_t cl_lru_shrinkers; + /** The time when this LRU cache was last used. */ + time64_t cl_lru_last_used; + /** stats: how many reclaims have happened for this client_obd. + * reclaim and shrink - shrink is async, voluntarily rebalancing; + * reclaim is sync, initiated by IO thread when the LRU slots are + * in shortage. */ + __u64 cl_lru_reclaim; + /** List of LRU pages for this client_obd */ + struct list_head cl_lru_list; + /** Lock for LRU page list */ + spinlock_t cl_lru_list_lock; + /** # of unstable pages in this client_obd. + * An unstable page is a page state that WRITE RPC has finished but + * the transaction has NOT yet committed. 
*/ + atomic_long_t cl_unstable_count; + /** Link to osc_shrinker_list */ + struct list_head cl_shrink_list; + + /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ + atomic_t cl_destroy_in_flight; + wait_queue_head_t cl_destroy_waitq; + + /* modify rpcs in flight + * currently used for metadata only */ + spinlock_t cl_mod_rpcs_lock; + __u16 cl_max_mod_rpcs_in_flight; + __u16 cl_mod_rpcs_in_flight; + __u16 cl_close_rpcs_in_flight; + wait_queue_head_t cl_mod_rpcs_waitq; + unsigned long *cl_mod_tag_bitmap; + struct obd_histogram cl_mod_rpcs_hist; + + /* mgc datastruct */ + struct mutex cl_mgc_mutex; + struct local_oid_storage *cl_mgc_los; + struct dt_object *cl_mgc_configs_dir; + struct obd_export *cl_mgc_mgsexp; + atomic_t cl_mgc_refcount; + /* in-flight control list and total RPCs counter */ + struct list_head cl_flight_waiters; + __u32 cl_rpcs_in_flight; + + /* checksumming for data sent over the network */ + unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */ + cl_checksum_dump:1; /* same */ + /* supported checksum types that are worked out at connect time */ + __u32 cl_supp_cksum_types; + /* checksum algorithm to be used */ + enum cksum_types cl_cksum_type; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct osc_async_rc cl_ar; + + /* sequence manager */ + struct lu_client_seq *cl_seq; + struct rw_semaphore cl_seq_rwsem; + + atomic_t cl_resends; /* resend count */ + + /* ptlrpc work for writeback in ptlrpcd context */ + void *cl_writeback_work; + void *cl_lru_work; + struct mutex cl_quota_mutex; + /* hash tables for osc_quota_info */ + struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS]; + /* the xid of the request updating the hash tables */ + __u64 cl_quota_last_xid; + /* Links to the global list of registered changelog devices */ + struct list_head cl_chg_dev_linkage; +}; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) + +struct obd_id_info { + u32 idx; + u64 *data; +}; + +struct echo_client_obd { + struct obd_export *ec_exp; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + struct list_head ec_locks; + __u64 ec_unique; +}; + +/* allow statfs data caching for 1 second */ +#define OBD_STATFS_CACHE_SECONDS 1 +/* arbitrary maximum. 
larger would be useless, allows catching bogus input */ +#define OBD_STATFS_CACHE_MAX_AGE 3600 /* seconds */ +/* By default, don't do time based negative cache invalidation */ +#define OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS (-1) /* seconds */ + +#define lov_tgt_desc lu_tgt_desc + +struct lov_md_tgt_desc { + struct obd_device *lmtd_mdc; + __u32 lmtd_index; +}; + +struct lov_obd { + struct lov_desc desc; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct lu_tgt_pool lov_packed; /* all OSTs in a packed + array */ + struct mutex lov_lock; + struct obd_connect_data lov_ocd; + struct proc_dir_entry *targets_proc_entry; + atomic_t lov_refcount; + __u32 lov_death_row; /* tgts scheduled to be deleted */ + __u32 lov_tgt_size; /* size of tgts array */ + int lov_connects; + int lov_pool_count; + struct cfs_hash *lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + struct proc_dir_entry *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; + + /* Cached LRU and unstable data from upper layer */ + struct cl_client_cache *lov_cache; + + struct rw_semaphore lov_notify_lock; + /* Data-on-MDT: MDC array */ + struct lov_md_tgt_desc *lov_mdc_tgts; + + struct kobject *lov_tgts_kobj; +}; + +#define lmv_tgt_desc lu_tgt_desc + +struct lmv_obd { + struct lu_client_fld lmv_fld; + spinlock_t lmv_lock; + + int connected; + int max_easize; + int max_def_easize; + u32 lmv_statfs_start; + + struct lu_tgt_descs lmv_mdt_descs; + + struct obd_connect_data conn_data; + struct kobject *lmv_tgts_kobj; + void *lmv_cache; + + __u32 lmv_qos_rr_index; +}; + +#define lmv_mdt_count lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count +#define lmv_qos lmv_mdt_descs.ltd_qos + +/* Minimum sector size is 512 */ +#define MAX_GUARD_NUMBER (PAGE_SIZE / 512) + +struct niobuf_local { + __u64 lnb_file_offset; + __u32 lnb_page_offset; + __u32 lnb_len; + __u32 lnb_flags; + int lnb_rc; + struct page *lnb_page; + void *lnb_data; + __u16 lnb_guards[MAX_GUARD_NUMBER]; + __u16 lnb_guard_rpc:1; + __u16 lnb_guard_disk:1; + /* separate unlock for read path to allow shared access */ + __u16 lnb_locked:1; +}; + +struct tgt_thread_big_cache { + struct niobuf_local local[PTLRPC_MAX_BRW_PAGES]; +}; + +#define LUSTRE_FLD_NAME "fld" +#define LUSTRE_SEQ_NAME "seq" + +#define LUSTRE_MDD_NAME "mdd" +#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" +#define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_VVP_NAME "vvp" +#define LUSTRE_LMV_NAME "lmv" +#define LUSTRE_SLP_NAME "slp" +#define LUSTRE_LOD_NAME "lod" +#define LUSTRE_OSP_NAME "osp" +#define LUSTRE_LWP_NAME "lwp" + +/* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_ECHO_NAME "obdecho" +#define LUSTRE_ECHO_CLIENT_NAME "echo_client" +#define LUSTRE_QMT_NAME "qmt" + +/* Constant obd names (post-rename) */ +#define LUSTRE_MDS_OBDNAME "MDS" +#define LUSTRE_OSS_OBDNAME "OSS" +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" + +static inline int is_lwp_on_mdt(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on MDT 
is fsname-MDTxxxx-lwp-MDTxxxx */ + + if (strncmp(ptr + 1, "MDT", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +static inline int is_lwp_on_ost(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on OST is fsname-MDTxxxx-lwp-OSTxxxx */ + + if (strncmp(ptr + 1, "OST", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +/* + * Events signalled through obd_notify() upcall-chain. + */ +enum obd_notify_event { + /* Device connect start */ + OBD_NOTIFY_CONNECT, + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD, + /* Administratively deactivate/activate event */ + OBD_NOTIFY_DEACTIVATE, + OBD_NOTIFY_ACTIVATE +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * being main example). + */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + +struct target_recovery_data { + svc_handler_t trd_recovery_handler; + pid_t trd_processing_task; + struct completion trd_starting; + struct completion trd_finishing; +}; + +struct obd_llog_group { + struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; + wait_queue_head_t olg_waitq; + spinlock_t olg_lock; + struct mutex olg_cat_processing; +}; + +/* corresponds to one of the obd's */ +#define OBD_DEVICE_MAGIC 0XAB5CD6EF + +struct obd_device { + struct obd_type *obd_type; + __u32 obd_magic; /* OBD_DEVICE_MAGIC */ + int obd_minor; /* device number: lctl dl */ + struct lu_device *obd_lu_dev; + + /* common and UUID name of this device */ + struct obd_uuid obd_uuid; + char obd_name[MAX_OBD_NAME]; + + /* bitfield modification is protected by obd_dev_lock */ + unsigned long + obd_attached:1, /* finished attach */ + obd_set_up:1, /* finished setup */ + obd_recovering:1, /* there are recoverable clients */ + obd_abort_recovery:1, /* recovery expired */ + obd_version_recov:1, /* obd uses version checking */ + obd_replayable:1, /* recovery enabled; inform clients */ + obd_no_transno:1, /* no committed-transno notification */ + obd_no_recov:1, /* fail instead of retry messages */ + obd_stopping:1, /* started cleanup */ + obd_starting:1, /* started setup */ + obd_force:1, /* cleanup with > 0 obd refcount */ + obd_fail:1, /* cleanup with failover */ + obd_no_conn:1, /* deny new connections */ + obd_inactive:1, /* device active/inactive + * (for /proc/status only!!) */ + obd_no_ir:1, /* no imperative recovery. 
*/ + obd_process_conf:1, /* device is processing mgs config */ + obd_checksum_dump:1; /* dump pages upon cksum error */ + + /* use separate field as it is set in interrupt to don't mess with + * protection of other bits using _bh lock */ + unsigned long obd_recovery_expired:1; + /* uuid-export hash body */ + struct cfs_hash *obd_uuid_hash; + /* nid-export hash body */ + struct cfs_hash *obd_nid_hash; + /* nid stats body */ + struct cfs_hash *obd_nid_stats_hash; + /* client_generation-export hash body */ + struct cfs_hash *obd_gen_hash; + struct list_head obd_nid_stats; + struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; + struct list_head obd_lwp_list; + atomic_t obd_refcount; + int obd_num_exports; + spinlock_t obd_nid_lock; + struct ldlm_namespace *obd_namespace; + struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ + /* a spinlock is OK for what we do now, may need a semaphore later */ + spinlock_t obd_dev_lock; /* protect OBD bitfield above */ + spinlock_t obd_osfs_lock; + struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ + time64_t obd_osfs_age; + __u64 obd_last_committed; + struct mutex obd_dev_mutex; + struct lvfs_run_ctxt obd_lvfs_ctxt; + struct obd_llog_group obd_olg; /* default llog group */ + struct obd_device *obd_observer; + struct rw_semaphore obd_observer_link_sem; + struct obd_notify_upcall obd_upcall; + struct obd_export *obd_self_export; + struct obd_export *obd_lwp_export; + /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ + struct list_head obd_exports_timed; + time64_t obd_eviction_timer; /* for ping evictor */ + + atomic_t obd_max_recoverable_clients; + atomic_t obd_connected_clients; + int obd_stale_clients; + /* this lock protects all recovery list_heads, timer and + * obd_next_recovery_transno value */ + spinlock_t obd_recovery_task_lock; + __u64 obd_next_recovery_transno; + int obd_replayed_requests; + int obd_requests_queued_for_recovery; + wait_queue_head_t obd_next_transno_waitq; + /* protected by obd_recovery_task_lock */ + struct hrtimer obd_recovery_timer; + /* seconds */ + time64_t obd_recovery_start; + /* seconds, for lprocfs_status */ + time64_t obd_recovery_end; + time64_t obd_recovery_time_hard; + time64_t obd_recovery_timeout; + int obd_recovery_ir_factor; + + /* new recovery stuff from CMD2 */ + int obd_replayed_locks; + atomic_t obd_req_replay_clients; + atomic_t obd_lock_replay_clients; + struct target_recovery_data obd_recovery_data; + + /* all lists are protected by obd_recovery_task_lock */ + struct list_head obd_req_replay_queue; + struct list_head obd_lock_replay_queue; + struct list_head obd_final_req_queue; + + union { +#ifdef HAVE_SERVER_SUPPORT + struct obd_device_target obt; + struct filter_obd filter; + struct ost_obd ost; + struct echo_obd echo; +#endif + struct client_obd cli; + struct echo_client_obd echo_client; + struct lov_obd lov; + struct lmv_obd lmv; + } u; + + /* Fields used by LProcFS */ + struct lprocfs_stats *obd_stats; + + struct lprocfs_stats *obd_md_stats; + + struct dentry *obd_debugfs_entry; + struct proc_dir_entry *obd_proc_entry; + struct proc_dir_entry *obd_proc_exports_entry; + struct dentry *obd_svc_debugfs_entry; + struct lprocfs_stats *obd_svc_stats; + const struct attribute **obd_attrs; + struct lprocfs_vars *obd_vars; + struct ldebugfs_vars *obd_debugfs_vars; + atomic_t obd_evict_inprogress; + wait_queue_head_t obd_evict_inprogress_waitq; + struct list_head obd_evict_list; /* protected with pet_lock */ + + /** + 
* LDLM pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + __u64 obd_pool_slv; + int obd_pool_limit; + + int obd_conn_inprogress; + + /** + * List of outstanding class_incref()'s fo this OBD. For debugging. */ + struct lu_ref obd_reference; + + struct kset obd_kset; /* sysfs object collection */ + struct kobj_type obd_ktype; + struct completion obd_kobj_unregister; +}; + +/* get/set_info keys */ +#define KEY_ASYNC "async" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_CONN_DATA "conn_data" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_HSM_COPYTOOL_SEND "hsm_send" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INTERMDS "inter_mds" +#define KEY_LAST_ID "last_id" +#define KEY_LAST_FID "last_fid" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_DEFAULT_EASIZE "default_easize" +#define KEY_MGSSEC "mgssec" +#define KEY_READ_ONLY "read-only" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_TGT_COUNT "tgt_count" +/* KEY_SET_INFO in lustre_idl.h */ +#define KEY_SPTLRPC_CONF "sptlrpc_conf" + +#define KEY_CACHE_SET "cache_set" +#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" +#define KEY_OSP_CONNECTED "osp_connected" + +/* Flags for op_xvalid */ +enum op_xvalid { + OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */ + OP_XVALID_BLOCKS = BIT(1), /* 0x0002 */ + OP_XVALID_OWNEROVERRIDE = BIT(2), /* 0x0004 */ + OP_XVALID_FLAGS = BIT(3), /* 0x0008 */ + OP_XVALID_PROJID = BIT(4), /* 0x0010 */ + OP_XVALID_LAZYSIZE = BIT(5), /* 0x0020 */ + OP_XVALID_LAZYBLOCKS = BIT(6), /* 0x0040 */ +}; + +struct lu_context; + +static inline int it_to_lock_mode(struct lookup_intent *it) +{ + /* CREAT needs to be tested before open (both could be set) */ + if (it->it_op & IT_CREAT) + return LCK_CW; + else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP)) + return LCK_CR; + else if (it->it_op & IT_LAYOUT) + return (it->it_flags & FMODE_WRITE) ? LCK_EX : LCK_CR; + else if (it->it_op & IT_READDIR) + return LCK_PR; + else if (it->it_op & IT_GETXATTR) + return LCK_PR; + + LASSERTF(0, "Invalid it_op: %d\n", it->it_op); + return -EINVAL; +} + +enum md_op_flags { + MF_MDC_CANCEL_FID1 = BIT(0), + MF_MDC_CANCEL_FID2 = BIT(1), + MF_MDC_CANCEL_FID3 = BIT(2), + MF_MDC_CANCEL_FID4 = BIT(3), + MF_GET_MDT_IDX = BIT(4), + MF_GETATTR_BY_FID = BIT(5), + MF_QOS_MKDIR = BIT(6), + MF_RR_MKDIR = BIT(7), +}; + +enum md_cli_flags { + CLI_SET_MEA = 1 << 0, + CLI_RM_ENTRY = 1 << 1, + CLI_HASH64 = 1 << 2, + CLI_API32 = 1 << 3, + CLI_MIGRATE = 1 << 4, +}; + +enum md_op_code { + LUSTRE_OPC_MKDIR = 1, + LUSTRE_OPC_SYMLINK, + LUSTRE_OPC_MKNOD, + LUSTRE_OPC_CREATE, + LUSTRE_OPC_ANY, +}; + +/** + * GETXATTR is not included as only a couple of fields in the reply body + * is filled, but not FID which is needed for common intent handling in + * mdc_finish_intent_lock() + */ +static inline bool it_has_reply_body(const struct lookup_intent *it) +{ + return it->it_op & (IT_OPEN | IT_LOOKUP | IT_GETATTR); +} + +struct md_op_data { + struct lu_fid op_fid1; /* operation fid1 (usualy parent) */ + struct lu_fid op_fid2; /* operation fid2 (usualy child) */ + struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ + struct lu_fid op_fid4; /* to the operation locks. 
*/ + u32 op_mds; /* what mds server open will go to */ + __u32 op_mode; + enum md_op_code op_code; + struct lustre_handle op_open_handle; + s64 op_mod_time; + const char *op_name; + size_t op_namelen; + struct rw_semaphore *op_mea1_sem; + struct rw_semaphore *op_mea2_sem; + struct lmv_stripe_md *op_mea1; + struct lmv_stripe_md *op_mea2; + struct lmv_stripe_md *op_default_mea1; /* default LMV */ + __u32 op_suppgids[2]; + __u32 op_fsuid; + __u32 op_fsgid; + cfs_cap_t op_cap; + void *op_data; + size_t op_data_size; + + /* iattr fields and blocks. */ + struct iattr op_attr; + enum op_xvalid op_xvalid; /* eXtra validity flags */ + loff_t op_attr_blocks; + u64 op_valid; /* OBD_MD_* */ + unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ + + enum md_op_flags op_flags; + + /* Various operation flags. */ + enum mds_op_bias op_bias; + + /* used to transfer info between the stacks of MD client + * see enum op_cli_flags */ + enum md_cli_flags op_cli_flags; + + /* File object data version for HSM release, on client */ + __u64 op_data_version; + struct lustre_handle op_lease_handle; + + /* File security context, for creates/metadata ops */ + const char *op_file_secctx_name; + __u32 op_file_secctx_name_size; + void *op_file_secctx; + __u32 op_file_secctx_size; + + __u32 op_projid; + + union { + /* Used by readdir */ + unsigned int op_max_pages; + /* mkdir */ + unsigned short op_dir_depth; + }; + + __u16 op_mirror_id; + + /* + * used to access migrating dir: if it's set, assume migration is + * finished, use the new layout to access dir, otherwise use old layout. + * By default it's not set, because new files are created under new + * layout, if we can't find file with name under both old and new + * layout, we are sure file with name doesn't exist, but in reverse + * order there may be a race with creation by others. + */ + bool op_post_migrate; + /* used to access dir with bash hash */ + __u32 op_stripe_index; +}; + +struct md_callback { + int (*md_blocking_ast)(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag); +}; + +struct md_enqueue_info; +/* metadata stat-ahead */ +typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, + int rc); + +struct md_enqueue_info { + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + struct ldlm_enqueue_info mi_einfo; + md_enqueue_cb_t mi_cb; + void *mi_cbdata; +}; + +struct obd_ops { + struct module *o_owner; + int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg); + int (*o_get_info)(const struct lu_env *env, struct obd_export *, + __u32 keylen, void *key, __u32 *vallen, void *val); + int (*o_set_info_async)(const struct lu_env *, struct obd_export *, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); + int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *dev); + int (*o_cleanup)(struct obd_device *dev); + int (*o_process_config)(struct obd_device *dev, size_t len, void *data); + int (*o_postrecov)(struct obd_device *dev); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. @ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. 
If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); + + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type); + int (*o_fid_fini)(struct obd_device *obd); + + /* Allocate new fid according to passed @hint. */ + int (*o_fid_alloc)(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + + /* + * Object with @fid is getting deleted, we may want to do something + * about this. + */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + time64_t max_age, struct ptlrpc_request_set *set); + int (*o_create)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_setattr)(const struct lu_env *, struct obd_export *exp, + struct obdo *oa); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_preprw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, int objcount, + struct obd_ioobj *obj, struct niobuf_remote *remote, + int *nr_pages, struct niobuf_local *local); + int (*o_commitrw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, int rc); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); + + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum obd_import_event); + + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); + + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); + + /* quota methods */ + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); +}; + +/* lmv structures */ +struct lustre_md { + struct mdt_body *body; + struct lu_buf layout; + union { + struct lmv_stripe_md *lmv; + struct lmv_foreign_md *lfm; + }; + struct lmv_stripe_md *default_lmv; +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *posix_acl; +#endif +}; + +struct md_open_data { + struct obd_client_handle *mod_och; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; + atomic_t mod_refcount; + bool mod_is_create; +}; + +struct obd_client_handle { + struct lustre_handle och_open_handle; + struct lu_fid och_fid; + struct md_open_data *och_mod; + struct lustre_handle och_lease_handle; /* open lock for lease */ + __u32 och_magic; + int och_flags; +}; + +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + +struct lookup_intent; 
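+/*
+ * Illustrative sketch for the o_connect() method declared above: on a
+ * successful connect the target trims ocd_connect_flags down to the
+ * features it actually granted, so a caller can test individual flags
+ * afterwards.  The snippet below is never compiled and uses the
+ * OBD_CONNECT_GRANT feature flag only as an assumed example.
+ */
+#if 0	/* example only */
+static int example_connect_and_check(const struct lu_env *env,
+				     struct obd_device *obd,
+				     struct obd_uuid *cluuid,
+				     struct obd_connect_data *ocd)
+{
+	struct obd_export *exp = NULL;
+	int rc;
+
+	/* dispatch through the per-type operations table */
+	rc = obd->obd_type->typ_dt_ops->o_connect(env, &exp, obd, cluuid,
+						  ocd, NULL);
+	if (rc == 0 && !(ocd->ocd_connect_flags & OBD_CONNECT_GRANT))
+		CDEBUG(D_INFO, "%s: space grants not negotiated\n",
+		       obd->obd_name);
+	return rc;
+}
+#endif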
+struct cl_attr; + +struct md_ops { + int (*m_close)(struct obd_export *, struct md_op_data *, + struct md_open_data *, struct ptlrpc_request **); + + int (*m_create)(struct obd_export *, struct md_op_data *, + const void *, size_t, umode_t, uid_t, gid_t, + cfs_cap_t, __u64, struct ptlrpc_request **); + + int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + const union ldlm_policy_data *, struct md_op_data *, + struct lustre_handle *, __u64); + + int (*m_getattr)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_intent_lock)(struct obd_export *, struct md_op_data *, + struct lookup_intent *, + struct ptlrpc_request **, + ldlm_blocking_callback, __u64); + + int (*m_link)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_rename)(struct obd_export *, struct md_op_data *, + const char *, size_t, const char *, size_t, + struct ptlrpc_request **); + + int (*m_setattr)(struct obd_export *, struct md_op_data *, void *, + size_t , struct ptlrpc_request **); + + int (*m_fsync)(struct obd_export *, const struct lu_fid *, + struct ptlrpc_request **); + + int (*m_read_page)(struct obd_export *, struct md_op_data *, + struct md_callback *cb_op, __u64 hash_offset, + struct page **ppage); + + int (*m_unlink)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_setxattr)(struct obd_export *, const struct lu_fid *, + u64, const char *, const void *, size_t, unsigned int, + u32, struct ptlrpc_request **); + + int (*m_getxattr)(struct obd_export *, const struct lu_fid *, + u64, const char *, size_t, struct ptlrpc_request **); + + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *); + + int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, + struct lu_fid *, __u64 *bits); + + int (*m_file_resync)(struct obd_export *, struct md_op_data *); + + int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *); + int (*m_null_inode)(struct obd_export *, const struct lu_fid *); + + int (*m_getattr_name)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_init_ea_size)(struct obd_export *, __u32, __u32); + + int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *, + struct obd_export *, struct obd_export *, + struct lustre_md *); + + int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *); + + int (*m_merge_attr)(struct obd_export *, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, ldlm_blocking_callback); + + int (*m_set_open_replay_data)(struct obd_export *, + struct obd_client_handle *, + struct lookup_intent *); + + int (*m_clear_open_replay_data)(struct obd_export *, + struct obd_client_handle *); + + int (*m_set_lock_data)(struct obd_export *, + const struct lustre_handle *, void *, __u64 *); + + enum ldlm_mode (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, enum ldlm_type, + union ldlm_policy_data *, enum ldlm_mode, + struct lustre_handle *); + + int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *, + union ldlm_policy_data *, enum ldlm_mode, + enum ldlm_cancel_flags flags, void *opaque); + + int (*m_get_fid_from_lsm)(struct obd_export *, + const struct lmv_stripe_md *, + const char *name, int namelen, + struct lu_fid *fid); + int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmv, size_t lmv_size); + int (*m_rmfid)(struct obd_export *exp, struct fid_array *fa, int *rcs, + struct ptlrpc_request_set *set); +}; + 
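+/*
+ * Illustrative sketch (never compiled): the md_ops table of a device is
+ * reached through its obd_type, so a metadata entry point such as
+ * m_getattr() is dispatched as below.  This assumes the usual
+ * exp->exp_obd back-pointer; the real wrappers are provided by
+ * obd_class.h.
+ */
+#if 0	/* example only */
+static inline int example_md_getattr(struct obd_export *exp,
+				     struct md_op_data *op_data,
+				     struct ptlrpc_request **request)
+{
+	struct md_ops *mops = exp->exp_obd->obd_type->typ_md_ops;
+
+	return mops->m_getattr(exp, op_data, request);
+}
+#endif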
+static inline struct md_open_data *obd_mod_alloc(void) +{ + struct md_open_data *mod; + OBD_ALLOC_PTR(mod); + if (mod == NULL) + return NULL; + atomic_set(&mod->mod_refcount, 1); + return mod; +} + +#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) +#define obd_mod_put(mod) \ +({ \ + if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ + if ((mod)->mod_open_req) \ + ptlrpc_req_finished((mod)->mod_open_req); \ + OBD_FREE_PTR(mod); \ + } \ +}) + +void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid); +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); +void obdo_set_o_projid(struct obdo *dst, u32 projid); + +/* return 1 if client should be resend request */ +static inline int client_should_resend(int resend, struct client_obd *cli) +{ + return atomic_read(&cli->cl_resends) ? + atomic_read(&cli->cl_resends) > resend : 1; +} + +/** + * Return device name for this device + * + * XXX: lu_device is declared before obd_device, while a pointer pointing + * back to obd_device in lu_device, so this helper function defines here + * instead of in lu_object.h + */ +static inline const char *lu_dev_name(const struct lu_device *lu_dev) +{ + return lu_dev->ld_obd->obd_name; +} + +static inline bool filename_is_volatile(const char *name, size_t namelen, + int *idx) +{ + const char *start; + char *end; + + if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0) + return false; + + /* caller does not care of idx */ + if (idx == NULL) + return true; + + /* volatile file, the MDT can be set from name */ + /* name format is LUSTRE_VOLATILE_HDR:[idx]: */ + /* if no MDT is specified, use std way */ + if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2) + goto bad_format; + /* test for no MDT idx case */ + if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') && + (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) { + *idx = -1; + return true; + } + /* we have an idx, read it */ + start = name + LUSTRE_VOLATILE_HDR_LEN + 1; + *idx = simple_strtoul(start, &end, 16); + /* error cases: + * no digit, no trailing :, negative value + */ + if (((*idx == 0) && (end == start)) || + (*end != ':') || (*idx < 0)) + goto bad_format; + + return true; +bad_format: + /* bad format of mdt idx, we cannot return an error + * to caller so we use hash algo */ + CERROR("Bad volatile file name format: %s\n", + name + LUSTRE_VOLATILE_HDR_LEN); + return false; +} + +static inline int cli_brw_size(struct obd_device *obd) +{ + LASSERT(obd != NULL); + return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; +} + +/* + * When RPC size or the max RPCs in flight is increased, the max dirty pages + * of the client should be increased accordingly to avoid sending fragmented + * RPCs over the network when the client runs out of the maximum dirty space + * when so many RPCs are being generated. + */ +static inline void client_adjust_max_dirty(struct client_obd *cli) +{ + /* initializing */ + if (cli->cl_dirty_max_pages <= 0) { + cli->cl_dirty_max_pages = + (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT; + } else { + unsigned long dirty_max = cli->cl_max_rpcs_in_flight * + cli->cl_max_pages_per_rpc; + + if (dirty_max > cli->cl_dirty_max_pages) + cli->cl_dirty_max_pages = dirty_max; + } + + if (cli->cl_dirty_max_pages > cfs_totalram_pages() / 8) + cli->cl_dirty_max_pages = cfs_totalram_pages() / 8; + + /* This value is exported to userspace through the max_dirty_mb + * parameter. So we round up the number of pages to make it a round + * number of MBs. 
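+	 * For example, with 4KiB pages there are 256 pages per MiB, so a
+	 * limit of 1000 pages is rounded up to 1024 pages (4MiB).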
*/ + cli->cl_dirty_max_pages = round_up(cli->cl_dirty_max_pages, + 1 << (20 - PAGE_SHIFT)); +} + +#endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cache.h b/drivers/staging/lustrefsx/lustre/include/obd_cache.h new file mode 100644 index 0000000000000..3378e5fc93375 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_cache.h @@ -0,0 +1,35 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _OBD_CACHE_H__ +#define _OBD_CACHE_H__ + + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h new file mode 100644 index 0000000000000..6e807d762c354 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h @@ -0,0 +1,180 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __OBD_CKSUM +#define __OBD_CKSUM +#include +#include +#include + +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type); + +static inline unsigned char cksum_obd2cfs(enum cksum_types cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_CRC32: + return CFS_HASH_ALG_CRC32; + case OBD_CKSUM_ADLER: + return CFS_HASH_ALG_ADLER32; + case OBD_CKSUM_CRC32C: + return CFS_HASH_ALG_CRC32C; + default: + CERROR("Unknown checksum type (%x)!!!\n", cksum_type); + LBUG(); + } + return 0; +} + +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type); + +static inline enum cksum_types obd_cksum_type_unpack(u32 o_flags) +{ + switch (o_flags & OBD_FL_CKSUM_ALL) { + case OBD_FL_CKSUM_CRC32C: + return OBD_CKSUM_CRC32C; + case OBD_FL_CKSUM_CRC32: + return OBD_CKSUM_CRC32; + case OBD_FL_CKSUM_T10IP512: + return OBD_CKSUM_T10IP512; + case OBD_FL_CKSUM_T10IP4K: + return OBD_CKSUM_T10IP4K; + case OBD_FL_CKSUM_T10CRC512: + return OBD_CKSUM_T10CRC512; + case OBD_FL_CKSUM_T10CRC4K: + return OBD_CKSUM_T10CRC4K; + default: + break; + } + + return OBD_CKSUM_ADLER; +} + +/* Return a bitmask of the checksum types supported on this system. + * 1.8 supported ADLER it is base and not depend on hw + * Client uses all available local algos + */ +static inline enum cksum_types obd_cksum_types_supported_client(void) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) + ret |= OBD_CKSUM_CRC32; + + /* Client support all kinds of T10 checksum */ + ret |= OBD_CKSUM_T10_ALL; + + return ret; +} + +enum cksum_types obd_cksum_types_supported_server(const char *obd_name); + +/* Select the best checksum algorithm among those supplied in the cksum_types + * input. + * + * Currently, calling cksum_type_pack() with a mask will return the fastest + * checksum type due to its benchmarking at libcfs module load. + * Caution is advised, however, since what is fastest on a single client may + * not be the fastest or most efficient algorithm on the server. */ +static inline enum cksum_types +obd_cksum_type_select(const char *obd_name, enum cksum_types cksum_types) +{ + u32 flag = obd_cksum_type_pack(obd_name, cksum_types); + + return obd_cksum_type_unpack(flag); +} + +/* Checksum algorithm names. Must be defined in the same order as the + * OBD_CKSUM_* flags. */ +#define DECLARE_CKSUM_NAME const char *cksum_name[] = {"crc32", "adler", \ + "crc32c", "reserved", "t10ip512", "t10ip4K", "t10crc512", "t10crc4K"} + +typedef __u16 (obd_dif_csum_fn) (void *, unsigned int); + +__u16 obd_dif_crc_fn(void *data, unsigned int len); +__u16 obd_dif_ip_fn(void *data, unsigned int len); +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __u16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn); +/* + * If checksum type is one T10 checksum types, init the csum_fn and sector + * size. Otherwise, init them to NULL/zero. 
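+ * For example, OBD_CKSUM_T10CRC4K maps to obd_dif_crc_fn with a 4096-byte
+ * sector size, while OBD_CKSUM_T10IP512 maps to obd_dif_ip_fn with 512-byte
+ * sectors.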
+ */ +static inline void obd_t10_cksum2dif(enum cksum_types cksum_type, + obd_dif_csum_fn **fn, int *sector_size) +{ + *fn = NULL; + *sector_size = 0; + +#if IS_ENABLED(CONFIG_CRC_T10DIF) + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + *fn = obd_dif_ip_fn; + *sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + *fn = obd_dif_ip_fn; + *sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + *fn = obd_dif_crc_fn; + *sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + *fn = obd_dif_crc_fn; + *sector_size = 4096; + break; + default: + break; + } +#endif /* CONFIG_CRC_T10DIF */ +} + +enum obd_t10_cksum_type { + OBD_T10_CKSUM_UNKNOWN = 0, + OBD_T10_CKSUM_IP512, + OBD_T10_CKSUM_IP4K, + OBD_T10_CKSUM_CRC512, + OBD_T10_CKSUM_CRC4K, + OBD_T10_CKSUM_MAX +}; + +#endif /* __OBD_CKSUM */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h new file mode 100644 index 0000000000000..b579fc995babb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -0,0 +1,1871 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef __CLASS_OBD_H +#define __CLASS_OBD_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define OBD_STATFS_NODELAY 0x0001 /* requests should be sent without delay + * and resent to avoid deadlocks */ +#define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update + * obd_osfs_age */ +#define OBD_STATFS_FOR_MDT0 0x0004 /* The statfs is only for retrieving + * information from MDT0.
*/ +#define OBD_STATFS_SUM 0x0008 /* get aggregated statfs from MDT */ + +extern rwlock_t obd_dev_lock; + +/* OBD Operations Declarations */ +extern struct obd_device *class_exp2obd(struct obd_export *); +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); +int lustre_get_jobid(char *jobid, size_t len); +void lustre_jobid_clear(const char *jobid); +void jobid_cache_fini(void); +int jobid_cache_init(void); + +struct lu_device_type; + +/* genops.c */ +struct obd_export *class_conn2export(struct lustre_handle *); +struct kobject *class_setup_tunables(const char *name); +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, bool enable_proc, + struct ldebugfs_vars *module_vars, + const char *nm, struct lu_device_type *ldt); +int class_unregister_type(const char *nm); + +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid); +int class_register_device(struct obd_device *obd); +void class_unregister_device(struct obd_device *obd); +void class_free_dev(struct obd_device *obd); + +struct obd_device *class_dev_by_str(const char *str); +int class_name2dev(const char *name); +struct obd_device *class_name2obd(const char *name); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); +void class_obd_list(void); +struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, + const char * typ_name, + struct obd_uuid *grp_uuid); +struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, + int *next); +struct obd_device * class_num2obd(int num); +int get_devices_count(void); + +int class_notify_sptlrpc_conf(const char *fsname, int namelen); + +static inline char *obd_export_nid2str(struct obd_export *exp) +{ + return exp->exp_connection == NULL ? + "" : libcfs_nid2str(exp->exp_connection->c_peer.nid); +} + +static inline char *obd_import_nid2str(struct obd_import *imp) +{ + return imp->imp_connection == NULL ? 
+ "" : libcfs_nid2str(imp->imp_connection->c_peer.nid); +} + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); +int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, + const char *sep); + +int obd_zombie_impexp_init(void); +void obd_zombie_impexp_stop(void); +void obd_zombie_impexp_cull(void); +void obd_zombie_barrier(void); +void obd_exports_barrier(struct obd_device *obd); +int kuc_len(int payload_len); +struct kuc_hdr * kuc_ptr(void *p); +void *kuc_alloc(int payload_len, int transport, int type); +void kuc_free(void *p, int payload_len); +int obd_get_request_slot(struct client_obd *cli); +void obd_put_request_slot(struct client_obd *cli); +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max); +__u16 obd_get_max_mod_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max); +int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq); + +__u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc, + struct lookup_intent *it); +void obd_put_mod_rpc_slot(struct client_obd *cli, __u32 opc, + struct lookup_intent *it, __u16 tag); + +struct llog_handle; +struct llog_rec_hdr; +typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, + struct llog_rec_hdr *, void *); + +struct obd_export *obd_stale_export_get(void); +void obd_stale_export_put(struct obd_export *exp); +void obd_stale_export_adjust(struct obd_export *exp); + +/* obd_config.c */ +/* For interoperability */ +struct cfg_interop_param { + char *old_param; + char *new_param; +}; + +char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index); +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name); +void print_lustre_cfg(struct lustre_cfg *lcfg); +int class_process_config(struct lustre_cfg *lcfg); +ssize_t class_set_global(const char *param); +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj); +int class_attach(struct lustre_cfg *lcfg); +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); + +int class_find_param(char *buf, char *key, char **valp); +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr); +int class_get_next_param(char **params, char *copy); +int class_match_param(char *buf, const char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_net(char *buf, u32 *net, char **endh); +int class_match_nid(char *buf, char *key, lnet_nid_t nid); +int class_match_net(char *buf, char *key, u32 net); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source); +void class_decref(struct obd_device *obd, + const char *scope, const void *source); +void dump_exports(struct obd_device *obd, int locks, int debug_level); +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); + +#define CFG_F_START 0x01 /* Set when we start updating from a log */ +#define CFG_F_MARKER 0x02 /* We are within a maker */ +#define 
CFG_F_SKIP 0x04 /* We should ignore this cfg command */ +#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ + +/* Passed as data param to class_config_parse_llog */ +struct config_llog_instance { + unsigned long cfg_instance; + struct super_block *cfg_sb; + struct obd_uuid cfg_uuid; + llog_cb_t cfg_callback; + int cfg_last_idx; /* for partial llog processing */ + int cfg_flags; + __u32 cfg_lwp_idx; + __u32 cfg_sub_clds; +}; +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); + +/** + * Generate a unique configuration instance for this mount + * + * Temporary hack to bypass ASLR in 4.15+ kernels, a better fix soon. + * For now, use the same value as before - the superblock pointer value. + * + * Using the client UUID would be an option, but it needs more testing. + */ +static inline unsigned long ll_get_cfg_instance(struct super_block *sb) +{ + return (unsigned long)sb; +} + +#define CONFIG_SUB_SPTLRPC 0x01 +#define CONFIG_SUB_RECOVER 0x02 +#define CONFIG_SUB_PARAMS 0x04 +#define CONFIG_SUB_NODEMAP 0x08 +#define CONFIG_SUB_BARRIER 0x10 + +/* Sub clds should be attached to the config_llog_data when processing + * config log for client or server target. */ +#define CONFIG_SUB_CLIENT (CONFIG_SUB_SPTLRPC | CONFIG_SUB_RECOVER | \ + CONFIG_SUB_PARAMS) +#define CONFIG_SUB_SERVER (CONFIG_SUB_CLIENT | CONFIG_SUB_NODEMAP | \ + CONFIG_SUB_BARRIER) + +#define PARAMS_FILENAME "params" +#define BARRIER_FILENAME "barrier" +#define LCTL_UPCALL "lctl" + +static inline bool logname_is_barrier(const char *logname) +{ + char *ptr; + + /* logname for barrier is "fsname-barrier" */ + ptr = strstr(logname, BARRIER_FILENAME); + if (ptr && (ptr - logname) >= 2 && + *(ptr - 1) == '-' && *(ptr + 7) == '\0') + return true; + + return false; +} + +/* list of active configuration logs */ +struct config_llog_data { + struct ldlm_res_id cld_resid; + struct config_llog_instance cld_cfg; + struct list_head cld_list_chain;/* on config_llog_list */ + atomic_t cld_refcount; + struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ + struct config_llog_data *cld_params; /* common parameters log */ + struct config_llog_data *cld_recover;/* imperative recover log */ + struct config_llog_data *cld_nodemap;/* nodemap log */ + struct config_llog_data *cld_barrier;/* barrier log (for MDT only) */ + struct obd_export *cld_mgcexp; + struct mutex cld_lock; + enum mgs_cfg_type cld_type; + unsigned int cld_stopping:1, /* we were told to stop + * watching */ + cld_lostlock:1; /* lock not requeued */ + char cld_logname[0]; +}; + +struct lustre_profile { + struct list_head lp_list; + char *lp_profile; + char *lp_dt; + char *lp_md; + int lp_refs; + bool lp_list_deleted; +}; + +struct lustre_profile *class_get_profile(const char * prof); +void class_del_profile(const char *prof); +void class_put_profile(struct lustre_profile *lprof); +void class_del_profiles(void); + + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *); +void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *); +extern void (*class_export_dump_hook)(struct obd_export *); + +#else + +#define __class_export_add_lock_ref(exp, lock) do {} while(0) +#define __class_export_del_lock_ref(exp, lock) do {} while(0) + +#endif + +#define class_export_rpc_inc(exp) \ +({ \ + atomic_inc(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + 
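/*
 * Illustrative sketch (hypothetical caller, not part of this header): the
 * class_export_rpc_inc() helper above is expected to be paired with
 * class_export_rpc_dec() below around request processing, so that
 * exp_rpc_count tracks the number of RPCs currently in flight on the
 * export.  The function name is made up for illustration only.
 */
#if 0	/* example only */
static void example_handle_request(struct obd_export *exp)
{
	class_export_rpc_inc(exp);	/* request enters processing */
	/* ... service the request against this export ... */
	class_export_rpc_dec(exp);	/* request done, drop the count */
}
#endif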
+#define class_export_rpc_dec(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_rpc_count); \ + atomic_dec(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_lock_get(exp, lock) \ +({ \ + atomic_inc(&(exp)->exp_locks_count); \ + __class_export_add_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_get(exp); \ +}) + +#define class_export_lock_put(exp, lock) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_locks_count); \ + atomic_dec(&(exp)->exp_locks_count); \ + __class_export_del_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_put(exp); \ +}) + +#define class_export_cb_get(exp) \ +({ \ + atomic_inc(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_get(exp); \ +}) + +#define class_export_cb_put(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_cb_count); \ + atomic_dec(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_put(exp); \ +}) + +/* genops.c */ +struct obd_export *class_export_get(struct obd_export *exp); +void class_export_put(struct obd_export *exp); +struct obd_export *class_new_export(struct obd_device *obddev, + struct obd_uuid *cluuid); +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(struct obd_device *obd); +void class_destroy_import(struct obd_import *exp); + +struct obd_type *class_search_type(const char *name); +struct obd_type *class_get_type(const char *name); +void class_put_type(struct obd_type *type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct obd_export *exp); +void class_fail_export(struct obd_export *exp); +int class_connected_export(struct obd_export *exp); +void class_disconnect_exports(struct obd_device *obddev); +int class_manual_cleanup(struct obd_device *obd); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *)); + +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? 
OBD_OPT_ABORT_RECOV : 0) | + 0); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline struct lu_target *class_exp2tgt(struct obd_export *exp) +{ + LASSERT(exp->exp_obd); + if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC) + return NULL; + return exp->exp_obd->u.obt.obt_lut; +} + +static inline struct lr_server_data *class_server_data(struct obd_device *obd) +{ + LASSERT(obd->u.obt.obt_lut); + return &obd->u.obt.obt_lut->lut_lsd; +} +#endif + +/* obdo.c */ +struct lu_attr; +struct inode; + +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid); +void la_from_obdo(struct lu_attr *la, const struct obdo *dst, u64 valid); + +void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid); +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj); + +#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op +#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op + +static inline int obd_check_dev(struct obd_device *obd) +{ + if (!obd) { + CERROR("NULL device\n"); + return -ENODEV; + } + return 0; +} + +/* ensure obd_setup and !obd_stopping */ +#define OBD_CHECK_DEV_ACTIVE(obd) \ +do { \ + rc = obd_check_dev(obd); \ + if (rc) \ + return rc; \ + \ + if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ + CERROR("Device %d not setup\n", \ + (obd)->obd_minor); \ + RETURN(-ENODEV); \ + } \ +} while (0) + + +static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) +{ + /* Always add in ldlm_stats */ + tmp->nid_ldlm_stats = + lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC, + LPROCFS_STATS_FLAG_NOPERCPU); + if (tmp->nid_ldlm_stats == NULL) + return -ENOMEM; + + lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats); + + return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats", + tmp->nid_ldlm_stats); +} + +static inline int exp_check_ops(struct obd_export *exp) +{ + if (exp == NULL) { + RETURN(-ENODEV); + } + if (exp->exp_obd == NULL || !exp->exp_obd->obd_type) { + RETURN(-EOPNOTSUPP); + } + RETURN(0); +} + +static inline int class_devno_max(void) +{ + return MAX_OBD_DEVICES; +} + +static inline int obd_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, + __u32 *vallen, void *val) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_get_info) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val); + RETURN(rc); +} + +static inline int obd_set_info_async(const struct lu_env *env, + struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_set_info_async) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, + val, set); + RETURN(rc); +} + +/* + * obd-lu integration. + * + * Functionality is being moved into new lu_device-based layering, but some + * pieces of configuration process are still based on obd devices. + * + * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully + * subsume ->o_setup() methods of obd devices they replace. The same for + * lu_device_operations::ldo_process_config() and ->o_process_config(). As a + * result, obd_setup() and obd_process_config() branch and call one XOR + * another. 
+ * + * Yet neither lu_device_type_operations::ldto_device_fini() nor + * lu_device_type_operations::ldto_device_free() fully implement the + * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, + * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. + */ +static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d; + + ENTRY; + + if (ldt != NULL) { + struct lu_context session_ctx; + struct lu_env env; + lu_context_init(&session_ctx, LCT_SESSION | LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + env.le_ses = &session_ctx; + d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); + lu_env_fini(&env); + if (!IS_ERR(d)) { + obd->obd_lu_dev = d; + d->ld_obd = obd; + rc = 0; + } else + rc = PTR_ERR(d); + } + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + } else { + if (!obd->obd_type->typ_dt_ops->o_setup) { + CERROR("%s: no %s operation\n", obd->obd_name, + __func__); + RETURN(-EOPNOTSUPP); + } + rc = OBP(obd, setup)(obd, cfg); + } + RETURN(rc); +} + +static inline int obd_precleanup(struct obd_device *obd) +{ + int rc; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + + ENTRY; + + if (ldt != NULL && d != NULL) { + struct lu_env *env = lu_env_find(); + struct lu_env _env; + + if (!env) { + env = &_env; + rc = lu_env_init(env, ldt->ldt_ctx_tags); + LASSERT(rc == 0); + lu_env_add(env); + } + ldt->ldt_ops->ldto_device_fini(env, d); + if (env == &_env) { + lu_env_remove(env); + lu_env_fini(env); + } + } + + if (!obd->obd_type->typ_dt_ops->o_precleanup) + RETURN(0); + + rc = OBP(obd, precleanup)(obd); + RETURN(rc); +} + +static inline int obd_cleanup(struct obd_device *obd) +{ + int rc; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + + ENTRY; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_free(&env, d); + lu_env_fini(&env); + obd->obd_lu_dev = NULL; + } + } + if (!obd->obd_type->typ_dt_ops->o_cleanup) + RETURN(0); + + rc = OBP(obd, cleanup)(obd); + RETURN(rc); +} + +static inline void obd_cleanup_client_import(struct obd_device *obd) +{ + ENTRY; + + /* If we set up but never connected, the + client import will not have been cleaned. 
*/ + down_write(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_CONFIG, "%s: client import never connected\n", + obd->obd_name); + ptlrpc_invalidate_import(imp); + client_destroy_import(imp); + obd->u.cli.cl_import = NULL; + } + up_write(&obd->u.cli.cl_sem); + + EXIT; +} + +static inline int obd_process_config(struct obd_device *obd, int datalen, + void *data) +{ + int rc; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + + ENTRY; + + obd->obd_process_conf = 1; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + rc = d->ld_ops->ldo_process_config(&env, d, data); + lu_env_fini(&env); + } + } else { + if (!obd->obd_type->typ_dt_ops->o_process_config) { + CERROR("%s: no %s operation\n", + obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + rc = OBP(obd, process_config)(obd, datalen, data); + } + + obd->obd_process_conf = 0; + + RETURN(rc); +} + +static inline int obd_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_create) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, create)(env, exp, obdo); + RETURN(rc); +} + +static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_destroy) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo); + RETURN(rc); +} + +static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + int rc; + + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_getattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, getattr)(env, exp, oa); + + RETURN(rc); +} + +static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + int rc; + + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_setattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, setattr)(env, exp, oa); + + RETURN(rc); +} + +static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_add_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, add_conn)(imp, uuid, priority); + RETURN(rc); +} + +static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_del_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, del_conn)(imp, uuid); + RETURN(rc); +} + +static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) +{ + struct obd_uuid *uuid; + 
ENTRY; + + if (!exp->exp_obd->obd_type || + !exp->exp_obd->obd_type->typ_dt_ops->o_get_uuid) + RETURN(NULL); + + uuid = OBP(exp->exp_obd, get_uuid)(exp); + RETURN(uuid); +} + +/** Create a new /a exp on device /a obd for the uuid /a cluuid + * @param exp New export handle + * @param d Connect data, supported flags are set, flags also understood + * by obd are returned. + */ +static inline int obd_connect(const struct lu_env *env, + struct obd_export **exp,struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + int rc; + __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition + * check */ + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_connect) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); + /* check that only subset is granted */ + LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) == + data->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_reconnect(const struct lu_env *env, + struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d, + void *localdata) +{ + int rc; + __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition + * check */ + + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_reconnect) + RETURN(0); + + rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_disconnect(struct obd_export *exp) +{ + int rc; + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_disconnect) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, disconnect)(exp); + RETURN(rc); +} + +static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_init) + RETURN(0); + + rc = OBP(obd, fid_init)(obd, exp, type); + RETURN(rc); +} + +static inline int obd_fid_fini(struct obd_device *obd) +{ + int rc; + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_fini) + RETURN(0); + + rc = OBP(obd, fid_fini)(obd); + RETURN(rc); +} + +static inline int obd_fid_alloc(const struct lu_env *env, + struct obd_export *exp, + struct lu_fid *fid, + struct md_op_data *op_data) +{ + int rc; + ENTRY; + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_fid_alloc) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); + RETURN(rc); +} + +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_new) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_new)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_del) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, 
pool_del)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, + char *ostname) +{ + int rc; + ENTRY; + + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_add) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_add)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, + char *ostname) +{ + int rc; + + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_rem) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_init_export(struct obd_export *exp) +{ + int rc = 0; + + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); +} + +static inline int obd_destroy_export(struct obd_export *exp) +{ + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP(exp->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); +} + +/* @max_age is the oldest time in seconds that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. + */ +static inline int obd_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, + time64_t max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd; + int rc = 0; + + ENTRY; + + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); + + obd = exp->exp_obd; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs) { + rc = -EOPNOTSUPP; + CERROR("%s: no statfs operation: rc = %d\n", obd->obd_name, rc); + RETURN(rc); + } + + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + + RETURN(rc); +} + +/* @max_age is the oldest time in seconds that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. 
+ */ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, + __u32 flags) +{ + struct obd_device *obd = exp->exp_obd; + int rc = 0; + + ENTRY; + if (unlikely(obd == NULL)) + RETURN(-EINVAL); + + OBD_CHECK_DEV_ACTIVE(obd); + + if (unlikely(!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs)) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + /* ignore cache if aggregated isn't expected */ + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATE_SUM) && + !(flags & OBD_STATFS_SUM))) { + /* the RPC will block anyway, so avoid sending many at once */ + rc = mutex_lock_interruptible(&obd->obd_dev_mutex); + if (rc) + RETURN(rc); + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATE_SUM) && + !(flags & OBD_STATFS_SUM))) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + } else { + mutex_unlock(&obd->obd_dev_mutex); + GOTO(cached, rc = 0); + } + if (rc == 0) { + CDEBUG(D_SUPER, + "%s: update %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + osfs->os_bavail, osfs->os_blocks, + osfs->os_ffree, osfs->os_files); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); + obd->obd_osfs_age = ktime_get_seconds(); + spin_unlock(&obd->obd_osfs_lock); + } + mutex_unlock(&obd->obd_dev_mutex); + } else { +cached: + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); + spin_unlock(&obd->obd_osfs_lock); + } + RETURN(rc); +} + +static inline int obd_preprw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int *pages, + struct niobuf_local *local) +{ + int rc; + + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_preprw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, + pages, local); + + RETURN(rc); +} + +static inline int obd_commitrw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rnb, int pages, + struct niobuf_local *local, const int orig_rc) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_commitrw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, + rnb, pages, local, orig_rc); + + RETURN(rc); +} + +static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void __user *uarg) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_iocontrol) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); + RETURN(rc); +} + +static inline void obd_import_event(struct obd_device *obd, + 
struct obd_import *imp, + enum obd_import_event event) +{ + ENTRY; + if (!obd) { + CERROR("NULL device\n"); + EXIT; + return; + } + + if (obd->obd_set_up && OBP(obd, import_event)) + OBP(obd, import_event)(obd, imp, event); + + EXIT; +} + +static inline int obd_notify(struct obd_device *obd, + struct obd_device *watched, + enum obd_notify_event ev) +{ + int rc; + ENTRY; + + rc = obd_check_dev(obd); + if (rc) + return rc; + + if (!obd->obd_set_up) { + CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); + RETURN(-EINVAL); + } + + if (!OBP(obd, notify)) { + CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); + RETURN(-ENOSYS); + } + + rc = OBP(obd, notify)(obd, watched, ev); + + RETURN(rc); +} + +static inline int obd_notify_observer(struct obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev) +{ + int rc = 0; + int rc2 = 0; + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc = obd_notify(observer->obd_observer, observed, ev); + + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, onu->onu_owner); + + return rc ? rc : rc2; +} + +static inline int obd_quotactl(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_quotactl) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +static inline int obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only supported reason == 1 right now + * We'll need to define some better reasons + * or flags in the future.
+ * <0 on error + */ + int rc; + + ENTRY; + + /* NULL method is normal here */ + if (obd == NULL || !obd->obd_type) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); +} + +static inline int obd_register_observer(struct obd_device *obd, + struct obd_device *observer) +{ + int rc; + ENTRY; + + rc = obd_check_dev(obd); + if (rc) + return rc; + + down_write(&obd->obd_observer_link_sem); + if (obd->obd_observer && observer) { + up_write(&obd->obd_observer_link_sem); + RETURN(-EALREADY); + } + obd->obd_observer = observer; + up_write(&obd->obd_observer_link_sem); + RETURN(0); +} + +/* metadata helpers */ +enum mps_stat_idx { + LPROC_MD_CLOSE, + LPROC_MD_CREATE, + LPROC_MD_ENQUEUE, + LPROC_MD_GETATTR, + LPROC_MD_INTENT_LOCK, + LPROC_MD_LINK, + LPROC_MD_RENAME, + LPROC_MD_SETATTR, + LPROC_MD_FSYNC, + LPROC_MD_READ_PAGE, + LPROC_MD_UNLINK, + LPROC_MD_SETXATTR, + LPROC_MD_GETXATTR, + LPROC_MD_INTENT_GETATTR_ASYNC, + LPROC_MD_REVALIDATE_LOCK, + LPROC_MD_LAST_OPC, +}; + +static inline int md_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_root)(exp, fileset, fid); +} + +static inline int md_getattr(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETATTR); + + return MDP(exp->exp_obd, getattr)(exp, op_data, request); +} + +static inline int md_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, null_inode)(exp, fid); +} + +static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CLOSE); + + return MDP(exp->exp_obd, close)(exp, op_data, mod, request); +} + +static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t cap_effective, + __u64 rdev, struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + + return MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, + request); +} + +static inline int md_enqueue(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, + __u64 extra_lock_flags) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_ENQUEUE); + + return MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, + extra_lock_flags); +} + +static inline int md_getattr_name(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, getattr_name)(exp, op_data, request); +} + +static inline int md_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, + struct 
lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_LOCK); + + return MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); +} + +static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_LINK); + + return MDP(exp->exp_obd, link)(exp, op_data, request); +} + +static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old_name, size_t oldlen, + const char *new_name, size_t newlen, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_RENAME); + + return MDP(exp->exp_obd, rename)(exp, op_data, old_name, oldlen, + new_name, newlen, request); +} + +static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETATTR); + + return MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); +} + +static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_FSYNC); + + return MDP(exp->exp_obd, fsync)(exp, fid, request); +} + +/* FLR: resync mirrored files. 
*/ +static inline int md_file_resync(struct obd_export *exp, + struct md_op_data *data) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, file_resync)(exp, data); +} + +static inline int md_read_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 hash_offset, + struct page **ppage) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_READ_PAGE); + + return MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, + ppage); +} + +static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_UNLINK); + + return MDP(exp->exp_obd, unlink)(exp, op_data, request); +} + +static inline int md_get_lustre_md(struct obd_export *exp, + struct ptlrpc_request *req, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md); +} + +static inline int md_free_lustre_md(struct obd_export *exp, + struct lustre_md *md) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, free_lustre_md)(exp, md); +} + +static inline int md_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb); +} + +static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETXATTR); + + return MDP(exp->exp_obd, setxattr)(exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, + suppgid, req); +} + +static inline int md_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + size_t buf_size, struct ptlrpc_request **req) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETXATTR); + + return MDP(exp->exp_obd, getxattr)(exp, fid, obd_md_valid, name, + buf_size, req); +} + +static inline int md_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); +} + +static inline int md_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); +} + +static inline int md_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); +} + +static inline +int md_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags 
cancel_flags, void *opaque) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + cancel_flags, opaque); +} + +static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh); +} + +static inline int md_init_ea_size(struct obd_export *exp, __u32 ea_size, + __u32 def_ea_size) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size); +} + +static inline int md_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_GETATTR_ASYNC); + + return MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); +} + +static inline int md_revalidate_lock(struct obd_export *exp, + struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_REVALIDATE_LOCK); + + return MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); +} + +static inline int md_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, + struct lu_fid *fid) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, + fid); +} + +/* Unpack an MD struct from disk to in-memory format. + * Returns +ve size of unpacked MD (0 for free), or -ve error. + * + * If *plsm != NULL and lmm == NULL then *lsm will be freed. + * If *plsm == NULL then it will be allocated. + */ +static inline int md_unpackmd(struct obd_export *exp, + struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmm, size_t lmm_size) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); +} + +static inline int md_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, rmfid)(exp, fa, rcs, set); +} + +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); + +typedef int (*register_lwp_cb)(void *data); + +struct lwp_register_item { + struct obd_export **lri_exp; + register_lwp_cb lri_cb_func; + void *lri_cb_data; + struct list_head lri_list; + atomic_t lri_ref; + char lri_name[MTI_NAME_MAXLEN]; +}; + +/* I'm as embarrassed about this as you are. 
+ * + * // XXX do not look into _superhack with remaining eye + * // XXX if this were any uglier, I'd get my own show on MTV */ +extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); + +/* obd_mount.c */ +#ifdef HAVE_SERVER_SUPPORT +int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp, + register_lwp_cb cb_func, void *cb_data); +void lustre_deregister_lwp_item(struct obd_export **exp); +struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx); +void lustre_notify_lwp_list(struct obd_export *exp); +int tgt_name2lwp_name(const char *tgt_name, char *lwp_name, int len, __u32 idx); +#endif /* HAVE_SERVER_SUPPORT */ +int lustre_register_fs(void); +int lustre_unregister_fs(void); +int lustre_check_exclusion(struct super_block *sb, char *svname); + +typedef __u8 class_uuid_t[16]; +static inline void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + snprintf(out->uuid, sizeof(out->uuid), "%02x%02x%02x%02x-%02x%02x-" + "%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uu[14], uu[15], uu[12], uu[13], uu[10], uu[11], uu[8], uu[9], + uu[6], uu[7], uu[4], uu[5], uu[2], uu[3], uu[0], uu[1]); +} + +/* lustre_peer.c */ +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); +int class_add_uuid(const char *uuid, __u64 nid); +int class_del_uuid (const char *uuid); +int class_check_uuid(struct obd_uuid *uuid, __u64 nid); + +/* class_obd.c */ +extern char obd_jobid_name[]; + +/* prng.c */ +#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) + +/* statfs_pack.c */ +struct kstatfs; +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* root squash info */ +struct rw_semaphore; +struct root_squash_info { + uid_t rsi_uid; + gid_t rsi_gid; + struct list_head rsi_nosquash_nids; + struct rw_semaphore rsi_sem; +}; + +int server_name2index(const char *svname, __u32 *idx, const char **endptr); + +/* linux-module.c */ +extern struct miscdevice obd_psdev; +int obd_ioctl_getdata(char **buf, int *len, void __user *arg); +int class_procfs_init(void); +int class_procfs_clean(void); +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h new file mode 100644 index 0000000000000..356585d91932b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -0,0 +1,990 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _OBD_SUPPORT +#define _OBD_SUPPORT + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* global variables */ +extern struct lprocfs_stats *obd_memory; +enum { + OBD_MEMORY_STAT = 0, + OBD_STATS_NUM, +}; + +extern unsigned int obd_debug_peer_on_timeout; +extern unsigned int obd_dump_on_timeout; +extern unsigned int obd_dump_on_eviction; +extern unsigned int obd_lbug_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ +extern unsigned int obd_timeout; /* seconds */ +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int obd_timeout_set; +extern unsigned int ldlm_timeout_set; +extern unsigned int bulk_timeout; +extern unsigned int at_min; +extern unsigned int at_max; +extern unsigned int at_history; +extern int at_early_margin; +extern int at_extra; +extern unsigned long obd_max_dirty_pages; +extern atomic_long_t obd_dirty_pages; +extern char obd_jobid_var[]; + +/* Some hash init argument constants */ +#define HASH_POOLS_BKT_BITS 3 +#define HASH_POOLS_CUR_BITS 3 +#define HASH_POOLS_MAX_BITS 7 +#define HASH_UUID_BKT_BITS 5 +#define HASH_UUID_CUR_BITS 7 +#define HASH_UUID_MAX_BITS 12 +#define HASH_NID_BKT_BITS 5 +#define HASH_NID_CUR_BITS 7 +#define HASH_NID_MAX_BITS 12 +#define HASH_NID_STATS_BKT_BITS 5 +#define HASH_NID_STATS_CUR_BITS 7 +#define HASH_NID_STATS_MAX_BITS 12 +#define HASH_GEN_BKT_BITS 5 +#define HASH_GEN_CUR_BITS 7 +#define HASH_GEN_MAX_BITS 12 +#define HASH_LQE_BKT_BITS 5 +#define HASH_LQE_CUR_BITS 7 +#define HASH_LQE_MAX_BITS 12 +#define HASH_CONN_BKT_BITS 5 +#define HASH_CONN_CUR_BITS 5 +#define HASH_CONN_MAX_BITS 15 +#define HASH_EXP_LOCK_BKT_BITS 5 +#define HASH_EXP_LOCK_CUR_BITS 7 +#define HASH_EXP_LOCK_MAX_BITS 16 +#define HASH_CL_ENV_BKT_BITS 5 +#define HASH_CL_ENV_BITS 10 +#define HASH_JOB_STATS_BKT_BITS 5 +#define HASH_JOB_STATS_CUR_BITS 7 +#define HASH_JOB_STATS_MAX_BITS 12 + +/* Timeout definitions */ +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* a bit more than maximal journal commit time in seconds */ +#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. 
*/ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) */ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20) +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) +/* The min time a target should wait for clients to reconnect in recovery */ +#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_IR_FACTOR_MIN 1 +#define OBD_IR_FACTOR_MAX 10 +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +/* default timeout for the MGS to become IR_FULL */ +#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +#define LONG_UNLINK 300 /* Unlink should happen before now */ + +/** + * Time interval of shrink, if the client is "idle" more than this interval, + * then the ll_grant thread will return the requested grant space to filter + */ +#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ + +#define OBD_FAIL_MDS 0x100 +#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 +#define OBD_FAIL_MDS_GETATTR_NET 0x102 +#define OBD_FAIL_MDS_GETATTR_PACK 0x103 +#define OBD_FAIL_MDS_READPAGE_NET 0x104 +#define OBD_FAIL_MDS_READPAGE_PACK 0x105 +#define OBD_FAIL_MDS_SENDPAGE 0x106 +#define OBD_FAIL_MDS_REINT_NET 0x107 +#define OBD_FAIL_MDS_REINT_UNPACK 0x108 +#define OBD_FAIL_MDS_REINT_SETATTR 0x109 +#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a +#define OBD_FAIL_MDS_REINT_CREATE 0x10b +#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c +#define OBD_FAIL_MDS_REINT_UNLINK 0x10d +#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e +#define OBD_FAIL_MDS_REINT_LINK 0x10f +#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 +#define OBD_FAIL_MDS_REINT_RENAME 0x111 +#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 +#define OBD_FAIL_MDS_OPEN_NET 0x113 +#define OBD_FAIL_MDS_OPEN_PACK 0x114 +#define OBD_FAIL_MDS_CLOSE_NET 0x115 +#define OBD_FAIL_MDS_CLOSE_PACK 0x116 +#define OBD_FAIL_MDS_CONNECT_NET 0x117 +#define OBD_FAIL_MDS_CONNECT_PACK 0x118 +#define OBD_FAIL_MDS_REINT_NET_REP 0x119 +#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a +#define OBD_FAIL_MDS_GET_ROOT_NET 0x11b +#define OBD_FAIL_MDS_GET_ROOT_PACK 0x11c +#define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_SUM_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_STATFS_SUM_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_PIN_NET 0x120 +#define OBD_FAIL_MDS_UNPIN_NET 0x121 +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 +#define OBD_FAIL_MDS_SYNC_NET 0x124 +#define OBD_FAIL_MDS_SYNC_PACK 0x125 +/* OBD_FAIL_MDS_DONE_WRITING_NET 0x126 obsolete since 2.8.0 */ +/* OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 obsolete since 2.8.0 */ +#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 +#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 +#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b +#define OBD_FAIL_MDS_OST_SETATTR 0x12c 
+/* OBD_FAIL_MDS_QUOTACHECK_NET 0x12d obsolete since 2.4 */ +#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e +#define OBD_FAIL_MDS_CLIENT_ADD 0x12f +#define OBD_FAIL_MDS_GETXATTR_NET 0x130 +#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 +#define OBD_FAIL_MDS_SETXATTR_NET 0x132 +#define OBD_FAIL_MDS_SETXATTR 0x133 +#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 +#define OBD_FAIL_MDS_FS_SETUP 0x135 +#define OBD_FAIL_MDS_RESEND 0x136 +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 +#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e +#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 +#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 +#define OBD_FAIL_MDS_REINT_DELAY 0x142 +#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 +#define OBD_FAIL_MDS_PDO_LOCK 0x145 +#define OBD_FAIL_MDS_PDO_LOCK2 0x146 +#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 +#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 +#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 +#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a +#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b +#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d +#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e +#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f +#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 +#define OBD_FAIL_MDS_RENAME 0x153 +#define OBD_FAIL_MDS_RENAME2 0x154 +#define OBD_FAIL_MDS_RENAME3 0x155 +#define OBD_FAIL_MDS_RENAME4 0x156 +#define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157 +#define OBD_FAIL_MDS_STALE_DIR_LAYOUT 0x158 +#define OBD_FAIL_MDS_REINT_MULTI_NET 0x159 +#define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b +#define OBD_FAIL_MDS_FLD_LOOKUP 0x15c +#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d +#define OBD_FAIL_MDS_INTENT_DELAY 0x160 +#define OBD_FAIL_MDS_XATTR_REP 0x161 +#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 +#define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 +#define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 +#define OBD_FAIL_MDS_ORPHAN_DELETE 0x165 +#define OBD_FAIL_MDS_RMFID_NET 0x166 +#define OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_MDS_REINT_OPEN2 0x16a + +/* layout lock */ +#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 +#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 +#define OBD_FAIL_MDS_LL_BLOCK 0x172 + +/* CMD */ +#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 +#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 +#define OBD_FAIL_MDS_SET_INFO_NET 0x182 +#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 +#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 +#define OBD_FAIL_MDS_GET_INFO_NET 0x186 +#define OBD_FAIL_MDS_DQACQ_NET 0x187 +#define OBD_FAIL_MDS_STRIPE_CREATE 0x188 +#define OBD_FAIL_MDS_STRIPE_FID 0x189 + +/* OI scrub */ +#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 +#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 +#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 +#define OBD_FAIL_OSD_FID_MAPPING 0x193 +#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 +#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 +#define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 +#define OBD_FAIL_OSD_OST_EA_FID_SET 0x197 +#define OBD_FAIL_OSD_NO_OI_ENTRY 0x198 +#define OBD_FAIL_OSD_INDEX_CRASH 0x199 + +#define OBD_FAIL_OSD_TXN_START 0x19a + +#define 
OBD_FAIL_OSD_DUPLICATE_MAP 0x19b + +#define OBD_FAIL_OFD_SET_OID 0x1e0 + +#define OBD_FAIL_OST 0x200 +#define OBD_FAIL_OST_CONNECT_NET 0x201 +#define OBD_FAIL_OST_DISCONNECT_NET 0x202 +#define OBD_FAIL_OST_GET_INFO_NET 0x203 +#define OBD_FAIL_OST_CREATE_NET 0x204 +#define OBD_FAIL_OST_DESTROY_NET 0x205 +#define OBD_FAIL_OST_GETATTR_NET 0x206 +#define OBD_FAIL_OST_SETATTR_NET 0x207 +#define OBD_FAIL_OST_OPEN_NET 0x208 +#define OBD_FAIL_OST_CLOSE_NET 0x209 +#define OBD_FAIL_OST_BRW_NET 0x20a +#define OBD_FAIL_OST_PUNCH_NET 0x20b +#define OBD_FAIL_OST_STATFS_NET 0x20c +#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f +#define OBD_FAIL_OST_SYNC_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 +#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 +#define OBD_FAIL_OST_ENOSPC 0x215 +#define OBD_FAIL_OST_EROFS 0x216 +#define OBD_FAIL_SRV_ENOENT 0x217 +/* OBD_FAIL_OST_QUOTACHECK_NET 0x218 obsolete since 2.4 */ +#define OBD_FAIL_OST_QUOTACTL_NET 0x219 +#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a +#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b +#define OBD_FAIL_OST_BRW_SIZE 0x21c +#define OBD_FAIL_OST_DROP_REQ 0x21d +#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e +#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 +#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 +#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 +#define OBD_FAIL_OST_NOMEM 0x226 +#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 +#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 +#define OBD_FAIL_OST_ENOINO 0x229 +#define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 +#define OBD_FAIL_OST_SET_INFO_NET 0x232 +#define OBD_FAIL_OST_NODESTROY 0x233 +#define OBD_FAIL_OST_READ_SIZE 0x234 +#define OBD_FAIL_OST_LADVISE_NET 0x235 +#define OBD_FAIL_OST_PAUSE_PUNCH 0x236 +#define OBD_FAIL_OST_LADVISE_PAUSE 0x237 +#define OBD_FAIL_OST_FAKE_RW 0x238 +#define OBD_FAIL_OST_LIST_ASSERT 0x239 +#define OBD_FAIL_OST_GL_WORK_ALLOC 0x240 +#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241 +#define OBD_FAIL_OST_STATFS_DELAY 0x242 +#define OBD_FAIL_OST_INTEGRITY_FAULT 0x243 +#define OBD_FAIL_OST_INTEGRITY_CMP 0x244 +#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245 +#define OBD_FAIL_OST_2BIG_NIOBUF 0x248 + +#define OBD_FAIL_LDLM 0x300 +#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 +#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 +#define OBD_FAIL_LDLM_CONVERT_NET 0x303 +#define OBD_FAIL_LDLM_CANCEL_NET 0x304 +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 +#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 +#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 +#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 +#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a +#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b +#define OBD_FAIL_LDLM_REPLY 0x30c +#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e +#define OBD_FAIL_LDLM_GLIMPSE 0x30f +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 +#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 +#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 +#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 +#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 +#define 
OBD_FAIL_LDLM_NEW_LOCK 0x319 +#define OBD_FAIL_LDLM_AGL_DELAY 0x31a +#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b +#define OBD_FAIL_LDLM_OST_LVB 0x31c +#define OBD_FAIL_LDLM_ENQUEUE_HANG 0x31d +#define OBD_FAIL_LDLM_BL_EVICT 0x31e +#define OBD_FAIL_LDLM_PAUSE_CANCEL2 0x31f +#define OBD_FAIL_LDLM_CP_CB_WAIT2 0x320 +#define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 +#define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 +#define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 +#define OBD_FAIL_LDLM_SRV_BL_AST 0x324 +#define OBD_FAIL_LDLM_SRV_CP_AST 0x325 +#define OBD_FAIL_LDLM_SRV_GL_AST 0x326 +#define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 +#define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 + +#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a +#define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b +#define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d + +/* LOCKLESS IO */ +#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 + +#define OBD_FAIL_OSC 0x400 +#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 +#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 +#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 +#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 +#define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 +#define OBD_FAIL_OSC_SHUTDOWN 0x407 +#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 +#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a +/* #define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b Obsolete since 2.9 */ +#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d +#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f +#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 +#define OBD_FAIL_OSC_NO_GRANT 0x411 +#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 +#define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413 +#define OBD_FAIL_OSC_DELAY_IO 0x414 +#define OBD_FAIL_OSC_NO_SIZE_DATA 0x415 + +#define OBD_FAIL_PTLRPC 0x500 +#define OBD_FAIL_PTLRPC_ACK 0x501 +#define OBD_FAIL_PTLRPC_RQBD 0x502 +#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 +#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 +#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 +#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 +#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 +#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 +#define OBD_FAIL_PTLRPC_DROP_BULK 0x51a +#define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b +#define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 +#define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521 +#define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 +#define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 + +#define OBD_FAIL_OBD_PING_NET 0x600 +/* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ +#define OBD_FAIL_OBD_LOGD_NET 0x602 +/* OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 obsolete since 2.4 */ +#define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 +/* OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 obsolete since 1.5 */ +#define OBD_FAIL_OBD_IDX_READ_NET 0x607 +#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 +#define OBD_FAIL_OBD_NO_LRU 0x609 +#define OBD_FAIL_OBDCLASS_MODULE_LOAD 0x60a 
+#define OBD_FAIL_OBD_ZERO_NLINK_RACE 0x60b + +#define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 +#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 +#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 +#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 +#define OBD_FAIL_TGT_FAKE_EXP 0x708 +#define OBD_FAIL_TGT_REPLAY_DELAY 0x709 +/* #define OBD_FAIL_TGT_LAST_REPLAY 0x710 (obsoleted) */ +#define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 +#define OBD_FAIL_TGT_DELAY_CONDITIONAL 0x713 +#define OBD_FAIL_TGT_REPLAY_DELAY2 0x714 +#define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715 +#define OBD_FAIL_TGT_MOUNT_RACE 0x716 +#define OBD_FAIL_TGT_REPLAY_TIMEOUT 0x717 +#define OBD_FAIL_TGT_CLIENT_DEL 0x718 +#define OBD_FAIL_TGT_SLUGGISH_NET 0x719 +#define OBD_FAIL_TGT_RCVD_EIO 0x720 +#define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 +#define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 +#define OBD_FAIL_TGT_NO_GRANT 0x725 + +#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 /* deprecated */ +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 +#define OBD_FAIL_MDC_CLOSE 0x806 +#define OBD_FAIL_MDC_MERGE 0x807 +#define OBD_FAIL_MDC_GLIMPSE_DDOS 0x808 + +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 +#define OBD_FAIL_MGS_CONNECT_NET 0x906 +#define OBD_FAIL_MGS_DISCONNECT_NET 0x907 +#define OBD_FAIL_MGS_SET_INFO_NET 0x908 +#define OBD_FAIL_MGS_EXCEPTION_NET 0x909 +#define OBD_FAIL_MGS_TARGET_REG_NET 0x90a +#define OBD_FAIL_MGS_TARGET_DEL_NET 0x90b +#define OBD_FAIL_MGS_CONFIG_READ_NET 0x90c +#define OBD_FAIL_MGS_LDLM_REPLY_NET 0x90d +#define OBD_FAIL_MGS_WRITE_TARGET_DELAY 0x90e + +#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 +#define OBD_FAIL_QUOTA_EDQUOT 0xA02 +#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 +#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 + +#define OBD_FAIL_LPROC_REMOVE 0xB00 + +#define OBD_FAIL_SEQ 0x1000 +#define OBD_FAIL_SEQ_QUERY_NET 0x1001 +#define OBD_FAIL_SEQ_EXHAUST 0x1002 + +#define OBD_FAIL_FLD 0x1100 +#define OBD_FAIL_FLD_QUERY_NET 0x1101 +#define OBD_FAIL_FLD_READ_NET 0x1102 +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 + +#define OBD_FAIL_SEC_CTX 0x1200 +#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 +#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 +#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + +#define OBD_FAIL_LLOG 0x1300 +/* was OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 until 2.4 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 until 2.11 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 until 2.1 */ +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 until 1.8 */ +/* was OBD_FAIL_LLOG_CATINFO_NET 0x1309 until 2.3 */ +#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 +#define OBD_FAIL_SEQ_ALLOC 0x1311 +#define OBD_FAIL_CAT_RECORDS 0x1312 +#define OBD_FAIL_CAT_FREE_RECORDS 0x1313 +#define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314 +#define 
CFS_FAIL_CHLOG_USER_REG_UNREG_RACE 0x1315 +#define OBD_FAIL_FORCE_GC_THREAD 0x1316 +#define OBD_FAIL_LLOG_PROCESS_TIMEOUT 0x1317 +#define OBD_FAIL_LLOG_PURGE_DELAY 0x1318 +#define OBD_FAIL_CATLIST 0x131b + +#define OBD_FAIL_LLITE 0x1400 +#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 +#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 +#define OBD_FAIL_LOV_INIT 0x1403 +#define OBD_FAIL_GLIMPSE_DELAY 0x1404 +#define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405 +#define OBD_FAIL_MAKE_LOVEA_HOLE 0x1406 +#define OBD_FAIL_LLITE_LOST_LAYOUT 0x1407 +#define OBD_FAIL_LLITE_NO_CHECK_DEAD 0x1408 +#define OBD_FAIL_GETATTR_DELAY 0x1409 +#define OBD_FAIL_LLITE_CREATE_FILE_PAUSE 0x1409 +#define OBD_FAIL_LLITE_NEWNODE_PAUSE 0x140a +#define OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE 0x140b +#define OBD_FAIL_LLITE_CREATE_NODE_PAUSE 0x140c +#define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e +#define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f +#define OBD_FAIL_LLITE_OPEN_BY_NAME 0x1410 +#define OBD_FAIL_LLITE_SHORT_COMMIT 0x1415 + +#define OBD_FAIL_FID_INDIR 0x1501 +#define OBD_FAIL_FID_INLMA 0x1502 +#define OBD_FAIL_FID_IGIF 0x1504 +#define OBD_FAIL_FID_LOOKUP 0x1505 +#define OBD_FAIL_FID_NOLMA 0x1506 + +/* LFSCK */ +#define OBD_FAIL_LFSCK_DELAY1 0x1600 +#define OBD_FAIL_LFSCK_DELAY2 0x1601 +#define OBD_FAIL_LFSCK_DELAY3 0x1602 +#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603 +#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 +#define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605 +#define OBD_FAIL_LFSCK_FATAL1 0x1608 +#define OBD_FAIL_LFSCK_FATAL2 0x1609 +#define OBD_FAIL_LFSCK_CRASH 0x160a +#define OBD_FAIL_LFSCK_NO_AUTO 0x160b +#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c +#define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d +#define OBD_FAIL_LFSCK_DELAY4 0x160e +#define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f +#define OBD_FAIL_LFSCK_DANGLING 0x1610 +#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611 +#define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612 +#define OBD_FAIL_LFSCK_BAD_OWNER 0x1613 +#define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614 +#define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615 +#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616 +#define OBD_FAIL_LFSCK_NOPFID 0x1617 +#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618 +#define OBD_FAIL_LFSCK_INVALID_PFID 0x1619 +#define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a +#define OBD_FAIL_LFSCK_DELAY5 0x161b +#define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c +#define OBD_FAIL_LFSCK_NO_LINKEA 0x161d +#define OBD_FAIL_LFSCK_BAD_PARENT 0x161e +#define OBD_FAIL_LFSCK_DANGLING2 0x1620 +#define OBD_FAIL_LFSCK_DANGLING3 0x1621 +#define OBD_FAIL_LFSCK_MUL_REF 0x1622 +#define OBD_FAIL_LFSCK_BAD_TYPE 0x1623 +#define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624 +#define OBD_FAIL_LFSCK_LESS_NLINK 0x1626 +#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 +#define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629 +#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a +#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b +#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c +#define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d +#define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e +#define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f +#define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630 +#define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631 + +#define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 +#define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 + +/* UPDATE */ +#define OBD_FAIL_OUT_UPDATE_NET 0x1700 +#define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701 +#define OBD_FAIL_SPLIT_UPDATE_REC 0x1702 +#define OBD_FAIL_LARGE_STRIPE 0x1703 +#define OBD_FAIL_OUT_ENOSPC 0x1704 +#define OBD_FAIL_INVALIDATE_UPDATE 0x1705 + +/* MIGRATE */ +#define OBD_FAIL_MIGRATE_ENTRIES 0x1801 + +/* LMV */ +#define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 + +/* FLR */ +#define 
OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00 +#define OBD_FAIL_FLR_LV_DELAY 0x1A01 +#define OBD_FAIL_FLR_LV_INC 0x1A02 +#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03 + +/* DT */ +#define OBD_FAIL_DT_DECLARE_ATTR_GET 0x2000 +#define OBD_FAIL_DT_ATTR_GET 0x2001 +#define OBD_FAIL_DT_DECLARE_ATTR_SET 0x2002 +#define OBD_FAIL_DT_ATTR_SET 0x2003 +#define OBD_FAIL_DT_DECLARE_XATTR_GET 0x2004 +#define OBD_FAIL_DT_XATTR_GET 0x2005 +#define OBD_FAIL_DT_DECLARE_XATTR_SET 0x2006 +#define OBD_FAIL_DT_XATTR_SET 0x2007 +#define OBD_FAIL_DT_DECLARE_XATTR_DEL 0x2008 +#define OBD_FAIL_DT_XATTR_DEL 0x2009 +#define OBD_FAIL_DT_XATTR_LIST 0x200a +#define OBD_FAIL_DT_DECLARE_CREATE 0x200b +#define OBD_FAIL_DT_CREATE 0x200c +#define OBD_FAIL_DT_DECLARE_DESTROY 0x200d +#define OBD_FAIL_DT_DESTROY 0x200e +#define OBD_FAIL_DT_INDEX_TRY 0x200f +#define OBD_FAIL_DT_DECLARE_REF_ADD 0x2010 +#define OBD_FAIL_DT_REF_ADD 0x2011 +#define OBD_FAIL_DT_DECLARE_REF_DEL 0x2012 +#define OBD_FAIL_DT_REF_DEL 0x2013 +#define OBD_FAIL_DT_DECLARE_INSERT 0x2014 +#define OBD_FAIL_DT_INSERT 0x2015 +#define OBD_FAIL_DT_DECLARE_DELETE 0x2016 +#define OBD_FAIL_DT_DELETE 0x2017 +#define OBD_FAIL_DT_LOOKUP 0x2018 +#define OBD_FAIL_DT_TXN_STOP 0x2019 + +#define OBD_FAIL_OSP_CHECK_INVALID_REC 0x2100 +#define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 +#define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 +#define OBD_FAIL_OSP_RPCS_SEM 0x2104 +#define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 +#define OBD_FAIL_OSP_INVALID_LOGID 0x2106 + +/* barrier */ +#define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 +#define OBD_FAIL_MGS_BARRIER_NOTIFY_NET 0x2201 + +#define OBD_FAIL_BARRIER_DELAY 0x2202 +#define OBD_FAIL_BARRIER_FAILURE 0x2203 + +#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301 + +/* Assign references to moved code to reduce code changes */ +#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) +#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) +#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) +#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) +#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) +#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) +#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) +#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms) +#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) +#define OBD_RACE(id) CFS_RACE(id) +#define OBD_FAIL_ONCE CFS_FAIL_ONCE +#define OBD_FAILED CFS_FAILED + +#define LUT_FAIL_CLASS(fail_id) (((fail_id) >> 8) << 16) +#define LUT_FAIL_MGT LUT_FAIL_CLASS(OBD_FAIL_MGS) +#define LUT_FAIL_MDT LUT_FAIL_CLASS(OBD_FAIL_MDS) +#define LUT_FAIL_OST LUT_FAIL_CLASS(OBD_FAIL_OST) + +extern atomic_t libcfs_kmemory; + +#ifdef CONFIG_PROC_FS +#define obd_memory_add(size) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sub(size) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) + +extern void obd_update_maxusage(void); +extern __u64 obd_memory_max(void); + +#else /* CONFIG_PROC_FS */ + +extern __u64 obd_alloc; + +extern __u64 obd_max_alloc; + +static inline void obd_memory_add(long size) +{ + obd_alloc += size; + if (obd_alloc > obd_max_alloc) + obd_max_alloc = obd_alloc; +} + +static inline void obd_memory_sub(long size) +{ + obd_alloc -= size; +} + +#define obd_memory_sum() (obd_alloc) + +#define obd_memory_max() (obd_max_alloc) + +#endif /* !CONFIG_PROC_FS */ + 
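A minimal sketch (illustrative only, not taken from this patch) of how the obd_memory_add()/obd_memory_sub() helpers above are meant to be used: in matched pairs around every tracked allocation, which is what the OBD_ALLOC_POST()/OBD_FREE_PRE() hooks defined next expand to. The helper names below are hypothetical and <linux/slab.h> is assumed to be available.

/* Illustrative sketch, not part of the patch: pair obd_memory_add() with
 * the allocation and obd_memory_sub() with the matching free, so that
 * obd_memory_sum() reflects the bytes currently held by Lustre.
 */
static inline void *example_tracked_alloc(size_t size)
{
	void *ptr = kmalloc(size, GFP_NOFS | __GFP_ZERO);

	if (ptr)
		obd_memory_add(size);	/* counted toward obd_memory_sum() */
	return ptr;
}

static inline void example_tracked_free(void *ptr, size_t size)
{
	obd_memory_sub(size);		/* must mirror the earlier add */
	kfree(ptr);
}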
+#define OBD_DEBUG_MEMUSAGE (1) + +#if OBD_DEBUG_MEMUSAGE +#define OBD_ALLOC_POST(ptr, size, name) \ + obd_memory_add(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr) + +#define OBD_FREE_PRE(ptr, size, name) \ + LASSERT(ptr); \ + obd_memory_sub(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr); \ + POISON(ptr, 0x5a, size) + +#else /* !OBD_DEBUG_MEMUSAGE */ + +#define OBD_ALLOC_POST(ptr, size, name) ((void)0) +#define OBD_FREE_PRE(ptr, size, name) ((void)0) + +#endif /* !OBD_DEBUG_MEMUSAGE */ + +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + if (cptab) \ + ptr = cfs_cpt_malloc((cptab), (cpt), (size), \ + (flags) | __GFP_ZERO | __GFP_NOWARN); \ + if (!(cptab) || unlikely(!(ptr))) /* retry without CPT if failure */ \ + ptr = kmalloc(size, (flags) | __GFP_ZERO); \ + if (likely((ptr) != NULL)) \ + OBD_ALLOC_POST((ptr), (size), "kmalloced"); \ +} while (0) + +#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask) + +#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_NOFS) +#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL) +#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr)) +#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr)) + +#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask) + +#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \ + OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr)) + +/* Direct use of __vmalloc() allows for protection flag specification + * (and particularly to not set __GFP_FS, which is likely to cause some + * deadlock situations in our code). + */ +#define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ +do { \ + (ptr) = cptab == NULL ? 
\ + __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO) : \ + cfs_cpt_vzalloc(cptab, cpt, size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ + (int)(size)); \ + CERROR("%llu total bytes allocated by Lustre, %d by LNET\n", \ + obd_memory_sum(), atomic_read(&libcfs_kmemory)); \ + } else { \ + OBD_ALLOC_POST(ptr, size, "vmalloced"); \ + } \ +} while(0) + +#define OBD_VMALLOC(ptr, size) \ + __OBD_VMALLOC_VERBOSE(ptr, NULL, 0, size) +#define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \ + __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) + +#define OBD_ALLOC_LARGE(ptr, size) \ +do { \ + /* LU-8196 - force large allocations to use vmalloc, not kmalloc */ \ + if ((size) > KMALLOC_MAX_SIZE) \ + ptr = NULL; \ + else \ + OBD_ALLOC_GFP(ptr, size, GFP_NOFS | __GFP_NOWARN); \ + if (ptr == NULL) \ + OBD_VMALLOC(ptr, size); \ +} while (0) + +#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \ +do { \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS | __GFP_NOWARN); \ + if (ptr == NULL) \ + OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \ +} while (0) + +#ifdef CONFIG_DEBUG_SLAB +#define POISON(ptr, c, s) do {} while (0) +#define POISON_PTR(ptr) ((void)0) +#else +#define POISON(ptr, c, s) memset(ptr, c, s) +#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef +#endif + +#ifdef POISON_BULK +#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_SIZE); \ + kunmap(page); } while (0) +#else +#define POISON_PAGE(page, val) do { } while (0) +#endif + +#define OBD_FREE(ptr, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "kfreed"); \ + kfree(ptr); \ + POISON_PTR(ptr); \ +} while (0) + +#define OBD_FREE_LARGE(ptr, size) \ +do { \ + if (is_vmalloc_addr(ptr)) { \ + OBD_FREE_PRE(ptr, size, "vfreed"); \ + libcfs_vfree_atomic(ptr); \ + POISON_PTR(ptr); \ + } else { \ + OBD_FREE(ptr, size); \ + } \ +} while (0) + +#define OBD_FREE_RCU(ptr, size, handle) \ +do { \ + struct portals_handle *__h = (handle); \ + \ + LASSERT(handle != NULL); \ + __h->h_cookie = (unsigned long)(ptr); \ + __h->h_size = (size); \ + call_rcu(&__h->h_rcu, class_handle_free_cb); \ + POISON_PTR(ptr); \ +} while(0) + +/* we memset() the slab object to 0 when allocation succeeds, so DO NOT + * HAVE A CTOR THAT DOES ANYTHING. its work will be cleared here. we'd + * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */ +#define OBD_SLAB_FREE_RTN0(ptr, slab) \ +({ \ + kmem_cache_free((slab), (ptr)); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \ +do { \ + LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \ + (ptr) = (cptab) == NULL ? 
\ + kmem_cache_alloc(slab, (type) | __GFP_ZERO) : \ + cfs_mem_cache_cpt_alloc(slab, cptab, cpt, (type) | __GFP_ZERO); \ + if (likely((ptr))) \ + OBD_ALLOC_POST(ptr, size, "slab-alloced"); \ +} while(0) + +#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags) +#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags) + +#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr)) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "slab-freed"); \ + kmem_cache_free(slab, ptr); \ + POISON_PTR(ptr); \ +} while(0) + +#define OBD_SLAB_ALLOC(ptr, slab, size) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, size, GFP_NOFS) + +#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, GFP_NOFS) + +#define OBD_SLAB_ALLOC_PTR(ptr, slab) \ + OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr)) + +#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \ + OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr)) + +#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags) + +#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags) + +#define OBD_SLAB_FREE_PTR(ptr, slab) \ + OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr)) + +#define KEY_IS(str) \ + (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0) + +/* LUSTRE_LMA_FL_MASKS defines which flags will be stored in LMA */ + +static inline int lma_to_lustre_flags(__u32 lma_flags) +{ + return (lma_flags & LMAI_ORPHAN) ? LUSTRE_ORPHAN_FL : 0; +} + +static inline int lustre_to_lma_flags(__u32 la_flags) +{ + return (la_flags & LUSTRE_ORPHAN_FL) ? LMAI_ORPHAN : 0; +} + +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. + */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | + ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0)); +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/obd_target.h b/drivers/staging/lustrefsx/lustre/include/obd_target.h new file mode 100644 index 0000000000000..79f29dd374d86 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_target.h @@ -0,0 +1,73 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __OBD_TARGET_H +#define __OBD_TARGET_H + +/* server-side individual type definitions */ + +#define OBT_MAGIC 0xBDDECEAE +/* hold common fields for "target" device */ +struct obd_device_target { + __u32 obt_magic; + __u32 obt_instance; + struct lu_target *obt_lut; + __u64 obt_mount_count; + struct obd_job_stats obt_jobstats; + struct nm_config_file *obt_nodemap_config_file; +}; + +#define OBJ_SUBDIR_COUNT 32 /* set to zero for no subdirs */ + +struct filter_obd { + /* NB this field MUST be first */ + struct obd_device_target fo_obt; +}; + +struct echo_obd { + struct obd_device_target eo_obt; + struct obdo eo_oa; + spinlock_t eo_lock; + u64 eo_lastino; + struct lustre_handle eo_nl_lock; + atomic_t eo_prep; +}; + +struct ost_obd { + struct ptlrpc_service *ost_service; + struct ptlrpc_service *ost_create_service; + struct ptlrpc_service *ost_io_service; + struct ptlrpc_service *ost_seq_service; + struct ptlrpc_service *ost_out_service; + struct mutex ost_health_mutex; +}; + +#endif /* __OBD_TARGET_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obj_update.h b/drivers/staging/lustrefsx/lustre/include/obj_update.h new file mode 100644 index 0000000000000..8c88de86005ea --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obj_update.h @@ -0,0 +1,115 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. 
+ * + * Selection of object_update and object_update_param handling functions + */ + +#ifndef _OBJ_UPDATE_H_ +#define _OBJ_UPDATE_H_ + +#include + +static inline size_t +object_update_param_size(const struct object_update_param *param) +{ + return cfs_size_round(sizeof(*param) + param->oup_len); +} + +static inline size_t +object_update_params_size(const struct object_update *update) +{ + const struct object_update_param *param; + size_t total_size = 0; + unsigned int i; + + param = &update->ou_params[0]; + for (i = 0; i < update->ou_params_count; i++) { + size_t size = object_update_param_size(param); + + param = (struct object_update_param *)((char *)param + size); + total_size += size; + } + + return total_size; +} + +static inline size_t +object_update_size(const struct object_update *update) +{ + return offsetof(struct object_update, ou_params[0]) + + object_update_params_size(update); +} + +static inline struct object_update * +object_update_request_get(const struct object_update_request *our, + unsigned int index, size_t *size) +{ + void *ptr; + unsigned int i; + + if (index >= our->ourq_count) + return NULL; + + ptr = (void *)&our->ourq_updates[0]; + for (i = 0; i < index; i++) + ptr += object_update_size(ptr); + + if (size != NULL) + *size = object_update_size(ptr); + + return ptr; +} + + + +static inline struct object_update_result * +object_update_result_get(const struct object_update_reply *reply, + unsigned int index, size_t *size) +{ + __u16 count = reply->ourp_count; + unsigned int i; + void *ptr; + + if (index >= count) + return NULL; + + ptr = (char *)reply + + cfs_size_round(offsetof(struct object_update_reply, + ourp_lens[count])); + for (i = 0; i < index; i++) { + if (reply->ourp_lens[i] == 0) + return NULL; + + ptr += cfs_size_round(reply->ourp_lens[i]); + } + + if (size != NULL) + *size = reply->ourp_lens[index]; + + return ptr; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/seq_range.h b/drivers/staging/lustrefsx/lustre/include/seq_range.h new file mode 100644 index 0000000000000..374d1932f0bdf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/seq_range.h @@ -0,0 +1,192 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. 
+ * + * Define lu_seq_range associated functions + */ + +#ifndef _SEQ_RANGE_H_ +#define _SEQ_RANGE_H_ + +#include + +/** + * computes the sequence range type \a range + */ + +static inline unsigned fld_range_type(const struct lu_seq_range *range) +{ + return range->lsr_flags & LU_SEQ_RANGE_MASK; +} + +/** + * Is this sequence range an OST? \a range + */ + +static inline bool fld_range_is_ost(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_OST; +} + +/** + * Is this sequence range an MDT? \a range + */ + +static inline bool fld_range_is_mdt(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_MDT; +} + +/** + * ANY range is only used when the fld client sends a fld query request, + * but it does not know whether the seq is an MDT or OST, so it will send the + * request with ANY type, which means any seq type from the lookup can be + * expected. /a range + */ +static inline unsigned fld_range_is_any(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_ANY; +} + +/** + * Apply flags to range \a range \a flags + */ + +static inline void fld_range_set_type(struct lu_seq_range *range, + unsigned flags) +{ + range->lsr_flags |= flags; +} + +/** + * Add MDT to range type \a range + */ + +static inline void fld_range_set_mdt(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_MDT); +} + +/** + * Add OST to range type \a range + */ + +static inline void fld_range_set_ost(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_OST); +} + +/** + * Add ANY to range type \a range + */ + +static inline void fld_range_set_any(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_ANY); +} + +/** + * computes width of given sequence range \a range + */ + +static inline __u64 lu_seq_range_space(const struct lu_seq_range *range) +{ + return range->lsr_end - range->lsr_start; +} + +/** + * initialize range to zero \a range + */ + +static inline void lu_seq_range_init(struct lu_seq_range *range) +{ + memset(range, 0, sizeof(*range)); +} + +/** + * check if given seq id \a s is within given range \a range + */ + +static inline bool lu_seq_range_within(const struct lu_seq_range *range, + __u64 seq) +{ + return seq >= range->lsr_start && seq < range->lsr_end; +} + +/** + * Is the range sane? Is the end after the beginning? \a range + */ + +static inline bool lu_seq_range_is_sane(const struct lu_seq_range *range) +{ + return range->lsr_end >= range->lsr_start; +} + +/** + * Is the range 0? \a range + */ + +static inline bool lu_seq_range_is_zero(const struct lu_seq_range *range) +{ + return range->lsr_start == 0 && range->lsr_end == 0; +} + +/** + * Is the range out of space? \a range + */ + +static inline bool lu_seq_range_is_exhausted(const struct lu_seq_range *range) +{ + return lu_seq_range_space(range) == 0; +} + +/** + * return 0 if two ranges have the same location, nonzero if they are + * different \a r1 \a r2 + */ + +static inline int lu_seq_range_compare_loc(const struct lu_seq_range *r1, + const struct lu_seq_range *r2) +{ + return r1->lsr_index != r2->lsr_index || + r1->lsr_flags != r2->lsr_flags; +} + +/** + * printf string and argument list for sequence range + */ +#define DRANGE "[%#16.16llx-%#16.16llx]:%x:%s" + +#define PRANGE(range) \ + (unsigned long long)(range)->lsr_start, \ + (unsigned long long)(range)->lsr_end, \ + (range)->lsr_index, \ + fld_range_is_mdt(range) ? 
"mdt" : "ost" + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h new file mode 100644 index 0000000000000..38084241d8998 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre/lustre_barrier_user.h + * + * Lustre write barrier (on MDT) userspace interfaces. + * + * Author: Fan, Yong + */ +#ifndef _LUSTRE_BARRIER_USER_H +# define _LUSTRE_BARRIER_USER_H + +#include +#include + +#define BARRIER_VERSION_V1 1 +#define BARRIER_TIMEOUT_DEFAULT 30 + +enum barrier_commands { + BC_FREEZE = 1, + BC_THAW = 2, + BC_STAT = 3, + BC_RESCAN = 4, +}; + +enum barrier_status { + BS_INIT = 0, + BS_FREEZING_P1 = 1, + BS_FREEZING_P2 = 2, + BS_FROZEN = 3, + BS_THAWING = 4, + BS_THAWED = 5, + BS_FAILED = 6, + BS_EXPIRED = 7, + BS_RESCAN = 8, +}; + +struct barrier_ctl { + __u32 bc_version; + __u32 bc_cmd; + union { + __s32 bc_timeout; + __u32 bc_total; + }; + union { + __u32 bc_status; + __u32 bc_absence; + }; + char bc_name[12]; + __u32 bc_padding; +}; + +#endif /* _LUSTRE_BARRIER_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h new file mode 100644 index 0000000000000..30d5c7d614892 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h @@ -0,0 +1,343 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _UAPI_LUSTRE_CFG_H +#define _UAPI_LUSTRE_CFG_H + +#include +#include +#include +#include + +/** \defgroup cfg cfg + * + * @{ + */ + +/* + * 1cf6 + * lcfG + */ +#define LUSTRE_CFG_VERSION 0x1cf60001 +#define LUSTRE_CFG_MAX_BUFCOUNT 8 + +#define LCFG_HDR_SIZE(count) \ + __ALIGN_KERNEL(offsetof(struct lustre_cfg, lcfg_buflens[(count)]), 8) + +/** If the LCFG_REQUIRED bit is set in a configuration command, + * then the client is required to understand this parameter + * in order to mount the filesystem. If it does not understand + * a REQUIRED command the client mount will fail. + */ +#define LCFG_REQUIRED 0x0001000 + +enum lcfg_command_type { + LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */ + LCFG_DETACH = 0x00cf002, /**< destroy obd instance */ + LCFG_SETUP = 0x00cf003, /**< call type-specific setup */ + LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup + */ + LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */ + LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from + * a niduuid + */ + LCFG_MOUNTOPT = 0x00cf007, /**< create a profile + * (mdc, osc) + */ + LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */ + LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */ + LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */ + LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to + * an obd + */ + LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */ + LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */ + LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */ + LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */ + LCFG_MARKER = 0x00cf010, /**< metadata about next + * cfg rec + */ + LCFG_LOG_START = 0x00ce011, /**< mgc only, process a + * cfg log + */ + LCFG_LOG_END = 0x00ce012, /**< stop processing updates */ + LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, + * inactive + */ + LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */ + LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */ + LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */ + LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */ + LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */ + LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */ + LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */ + LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */ + LCFG_PRE_CLEANUP = 0x00cf031, /**< call type-specific pre + * cleanup cleanup + */ + LCFG_SET_PARAM = 0x00ce032, /**< use set_param syntax to set + * a proc parameters + */ + LCFG_NODEMAP_ADD = 0x00ce040, /**< create a cluster */ + LCFG_NODEMAP_DEL = 0x00ce041, /**< destroy a cluster */ + LCFG_NODEMAP_ADD_RANGE = 0x00ce042, /**< add a nid range */ + LCFG_NODEMAP_DEL_RANGE = 0x00ce043, /**< delete an nid range */ + LCFG_NODEMAP_ADD_UIDMAP = 0x00ce044, /**< add a uidmap */ + LCFG_NODEMAP_DEL_UIDMAP = 0x00ce045, /**< delete a uidmap */ + LCFG_NODEMAP_ADD_GIDMAP = 0x00ce046, /**< add a gidmap */ + LCFG_NODEMAP_DEL_GIDMAP = 0x00ce047, /**< delete a gidmap */ + LCFG_NODEMAP_ACTIVATE = 0x00ce048, /**< activate cluster + * id mapping + */ + LCFG_NODEMAP_ADMIN = 0x00ce049, /**< allow cluster to use id 0 */ + LCFG_NODEMAP_TRUSTED = 0x00ce050, /**< trust a clusters ids */ + LCFG_NODEMAP_SQUASH_UID = 0x00ce051, /**< default map uid */ + LCFG_NODEMAP_SQUASH_GID = 0x00ce052, /**< default map gid */ + LCFG_NODEMAP_ADD_SHKEY = 0x00ce053, /**< add shared key to cluster */ + LCFG_NODEMAP_DEL_SHKEY 
= 0x00ce054, /**< delete shared key from + * cluster + */ + LCFG_NODEMAP_TEST_NID = 0x00ce055, /**< test for nodemap + * membership + */ + LCFG_NODEMAP_TEST_ID = 0x00ce056, /**< test uid/gid mapping */ + LCFG_NODEMAP_SET_FILESET = 0x00ce057, /**< set fileset */ + LCFG_NODEMAP_DENY_UNKNOWN = 0x00ce058, /**< deny squashed nodemap + * users + */ + LCFG_NODEMAP_MAP_MODE = 0x00ce059, /**< set the mapping mode */ + LCFG_NODEMAP_AUDIT_MODE = 0x00ce05a, /**< set the audit mode */ + LCFG_NODEMAP_SET_SEPOL = 0x00ce05b, /**< set SELinux policy */ +}; + +struct lustre_cfg_bufs { + void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_bufcount; +}; + +struct lustre_cfg { + __u32 lcfg_version; + __u32 lcfg_command; + + __u32 lcfg_num; + __u32 lcfg_flags; + __u64 lcfg_nid; + __u32 lcfg_nal; /* not used any more */ + + __u32 lcfg_bufcount; + __u32 lcfg_buflens[0]; +}; + +struct lcfg_type_data { + __u32 ltd_type; + char *ltd_name; + char *ltd_bufs[4]; +}; + +static struct lcfg_type_data lcfg_data_table[] = { + { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, + { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, + { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, + { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, + { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, + { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, + { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, + { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } }, + { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, + { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, + { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, + { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, + { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, + { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, + { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, + { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, + { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, + { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, + { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, + { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, + { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", + { "parameter", "2", "3", "4" } }, + { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, + { 0, NULL, { NULL, NULL, NULL, NULL } } +}; + +static inline struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) +{ + int i = 0; + + while (lcfg_data_table[i].ltd_type != 0) { + if (lcfg_data_table[i].ltd_type == cmd) + return &lcfg_data_table[i]; + i++; + } + return NULL; +} + +enum cfg_record_type { + PORTALS_CFG_TYPE = 1, + LUSTRE_CFG_TYPE = 123, +}; + +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) ? 
0 : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + __u32 index, void *buf, __u32 buflen) +{ + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + + if (!bufs) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; +} + +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + __u32 index, char *str) +{ + lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0); +} + +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, + char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} + +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) +{ + __u32 i; + __kernel_size_t offset; + __u32 bufcount; + + if (!lcfg) + return NULL; + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += __ALIGN_KERNEL(lcfg->lcfg_buflens[i], 8); + return (char *)lcfg + offset; +} + +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + __u32 i; + + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); + } +} + +static inline __u32 lustre_cfg_len(__u32 bufcount, __u32 *buflens) +{ + __u32 i; + __u32 len; + + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += __ALIGN_KERNEL(buflens[i], 8); + + return __ALIGN_KERNEL(len, 8); +} + +static inline void lustre_cfg_init(struct lustre_cfg *lcfg, int cmd, + struct lustre_cfg_bufs *bufs) +{ + char *ptr; + __u32 i; + + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; + + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + if (bufs->lcfg_buf[i]) { + memcpy(ptr, bufs->lcfg_buf[i], bufs->lcfg_buflen[i]); + ptr += __ALIGN_KERNEL(bufs->lcfg_buflen[i], 8); + } + } +} + +static inline int lustre_cfg_sanity_check(void *buf, __kernel_size_t len) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; + + if (!lcfg) + return -EINVAL; + + /* check that the first bits of the struct are valid */ + if (len < LCFG_HDR_SIZE(0)) + return -EINVAL; + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + return -EINVAL; + + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + return -EINVAL; + + /* check that the buflens are valid */ + if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + return -EINVAL; + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + return -EINVAL; + + return 0; +} + +/** @} cfg */ + +#endif /* _UAPI_LUSTRE_CFG_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h new file mode 100644 index 0000000000000..e9cbf3066738a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h @@ -0,0 +1,205 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Lustre disk format definitions. + * + * Author: Nathan Rutman + */ + +#ifndef _UAPI_LUSTRE_DISK_H +#define _UAPI_LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ +#include + +/****************** on-disk files ********************/ + +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. */ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" +#define REPLY_DATA "reply_data" +#define LOV_OBJID "lov_objid" +#define LOV_OBJSEQ "lov_objseq" +#define HEALTH_CHECK "health_check" +#define CAPA_KEYS "capa_keys" +#define CHANGELOG_USERS "changelog_users" +#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" +#define QMT_DIR "quota_master" +#define QSD_DIR "quota_slave" +#define QSD_DIR_DT "quota_slave_dt" +#define QSD_DIR_MD "quota_slave_md" +#define HSM_ACTIONS "hsm_actions" +#define LFSCK_DIR "LFSCK" +#define LFSCK_BOOKMARK "lfsck_bookmark" +#define LFSCK_LAYOUT "lfsck_layout" +#define LFSCK_NAMESPACE "lfsck_namespace" +#define REMOTE_PARENT_DIR "REMOTE_PARENT_DIR" +#define INDEX_BACKUP_DIR "index_backup" +#define MDT_ORPHAN_DIR "PENDING" + +/****************** persistent mount data *********************/ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST | \ + LDD_F_SV_TYPE_MGS) +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server */ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate config logs for this fs or server */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +/*#define LDD_F_UPGRADE14 0x0200 deprecated since 1.8 */ +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** all nodes are specified as service nodes */ +#define LDD_F_NO_PRIMNODE 0x1000 +/** IR enable flag */ +#define LDD_F_IR_CAPABLE 0x2000 +/** the MGS refused to register the target. 
*/ +#define LDD_F_ERROR 0x4000 +/** process at lctl conf_param */ +#define LDD_F_PARAM2 0x8000 + +#define LDD_MAGIC 0x1dd00001 + +#define XATTR_TARGET_RENAME "trusted.rename_tgt" + +enum ldd_mount_type { + LDD_MT_EXT3 = 0, + LDD_MT_LDISKFS, + LDD_MT_SMFS, + LDD_MT_REISERFS, + LDD_MT_LDISKFS2, + LDD_MT_ZFS, + LDD_MT_LAST +}; + +/****************** last_rcvd file *********************/ + +#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif + +/* + * Data stored per server at the head of the last_rcvd file. In le32 order. + */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_compat14; /* reserved - compat with old last_rcvd */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_osd_index; /* index number of OST in LOV */ + __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */ + __u32 lsd_start_epoch; /* VBR: start epoch from last boot */ + /** transaction values since lsd_trans_table_time */ + __u64 lsd_trans_table[LR_EXPIRE_INTERVALS]; + /** start point of transno table below */ + __u32 lsd_trans_table_time; /* time of first slot in table above */ + __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */ + __u8 lsd_padding[LR_SERVER_SIZE - 288]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for + * open &c.) + */ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + /* VBR: last versions */ + __u64 lcd_pre_versions[4]; + __u32 lcd_last_epoch; + /* generation counter of client slot in last_rcvd */ + __u32 lcd_generation; + __u8 lcd_padding[LR_CLIENT_SIZE - 128]; +}; + +/* Data stored in each slot of the reply_data file. + * + * The lrd_client_gen field is assigned with lcd_generation value + * to allow identify which client the reply data belongs to. 
+ */ +struct lsd_reply_data { + __u64 lrd_transno; /* transaction number */ + __u64 lrd_xid; /* transmission id */ + __u64 lrd_data; /* per-operation data */ + __u32 lrd_result; /* request result */ + __u32 lrd_client_gen; /* client generation */ +}; + +/* Header of the reply_data file */ +#define LRH_MAGIC 0xbdabda01 +struct lsd_reply_header { + __u32 lrh_magic; + __u32 lrh_header_size; + __u32 lrh_reply_size; + __u8 lrh_pad[sizeof(struct lsd_reply_data) - 12]; +}; + +/** @} disk */ + +#endif /* _UAPI_LUSTRE_DISK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h new file mode 100644 index 0000000000000..f11ad3b3b2115 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h @@ -0,0 +1,364 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Copyright 2016 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * all fid manipulation functions go here + * + * FIDS are globally unique within a Lustre filessytem, and are made up + * of three parts: sequence, Object ID, and version. 
+ * + */ +#ifndef _UAPI_LUSTRE_FID_H_ +#define _UAPI_LUSTRE_FID_H_ + +#include +#include + +/** returns fid object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/** returns fid object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/** returns fid object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_zero(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +static inline __u64 fid_ver_oid(const struct lu_fid *fid) +{ + return (__u64)fid_ver(fid) << 32 | fid_oid(fid); +} + +static inline bool fid_seq_is_mdt0(__u64 seq) +{ + return seq == FID_SEQ_OST_MDT0; +} + +static inline bool fid_seq_is_mdt(__u64 seq) +{ + return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; +}; + +static inline bool fid_seq_is_echo(__u64 seq) +{ + return seq == FID_SEQ_ECHO; +} + +static inline bool fid_is_echo(const struct lu_fid *fid) +{ + return fid_seq_is_echo(fid_seq(fid)); +} + +static inline bool fid_seq_is_llog(__u64 seq) +{ + return seq == FID_SEQ_LLOG; +} + +static inline bool fid_is_llog(const struct lu_fid *fid) +{ + /* file with OID == 0 is not llog but contains last oid */ + return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0; +} + +static inline bool fid_seq_is_rsvd(__u64 seq) +{ + return seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD; +}; + +static inline bool fid_seq_is_special(__u64 seq) +{ + return seq == FID_SEQ_SPECIAL; +}; + +static inline bool fid_seq_is_local_file(__u64 seq) +{ + return seq == FID_SEQ_LOCAL_FILE || + seq == FID_SEQ_LOCAL_NAME; +}; + +static inline bool fid_seq_is_root(__u64 seq) +{ + return seq == FID_SEQ_ROOT; +} + +static inline bool fid_seq_is_dot(__u64 seq) +{ + return seq == FID_SEQ_DOT_LUSTRE; +} + +static inline bool fid_seq_is_default(__u64 seq) +{ + return seq == FID_SEQ_LOV_DEFAULT; +} + +static inline bool fid_is_mdt0(const struct lu_fid *fid) +{ + return fid_seq_is_mdt0(fid_seq(fid)); +} + +static inline void lu_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = FID_OID_ROOT; + fid->f_ver = 0; +} + +static inline void lu_echo_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = FID_OID_ECHO_ROOT; + fid->f_ver = 0; +} + +static inline void lu_update_log_fid(struct lu_fid *fid, __u32 index) +{ + fid->f_seq = FID_SEQ_UPDATE_LOG; + fid->f_oid = index; + fid->f_ver = 0; +} + +static inline void lu_update_log_dir_fid(struct lu_fid *fid, __u32 index) +{ + fid->f_seq = FID_SEQ_UPDATE_LOG_DIR; + fid->f_oid = index; + fid->f_ver = 0; +} + +/** + * Check if a fid is igif or not. + * \param fid the fid to be tested. + * \return true if the fid is an igif; otherwise false. + */ +static inline bool fid_seq_is_igif(__u64 seq) +{ + return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; +} + +static inline bool fid_is_igif(const struct lu_fid *fid) +{ + return fid_seq_is_igif(fid_seq(fid)); +} + +/** + * Check if a fid is idif or not. + * \param fid the fid to be tested. + * \return true if the fid is an idif; otherwise false. 
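The accessors above expose the three FID components described at the top of this header (sequence, object id, version). A small sketch of how they are typically combined for display, assuming struct lu_fid and the helpers above are in scope; the bracketed output format is only illustrative.

#include <stdio.h>

/* Print a FID as [seq:oid:ver] (illustrative formatting). */
static void fid_print(const struct lu_fid *fid)
{
	printf("[0x%llx:0x%x:0x%x]\n",
	       (unsigned long long)fid_seq(fid), fid_oid(fid), fid_ver(fid));
}

static void fid_print_root(void)
{
	struct lu_fid fid;

	lu_root_fid(&fid);	/* seq = FID_SEQ_ROOT, oid = FID_OID_ROOT */
	fid_print(&fid);
}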
+ */ +static inline bool fid_seq_is_idif(__u64 seq) +{ + return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; +} + +static inline bool fid_is_idif(const struct lu_fid *fid) +{ + return fid_seq_is_idif(fid_seq(fid)); +} + +static inline bool fid_is_local_file(const struct lu_fid *fid) +{ + return fid_seq_is_local_file(fid_seq(fid)); +} + +static inline bool fid_seq_is_norm(__u64 seq) +{ + return (seq >= FID_SEQ_NORMAL); +} + +static inline bool fid_is_norm(const struct lu_fid *fid) +{ + return fid_seq_is_norm(fid_seq(fid)); +} + +static inline int fid_is_layout_rbtree(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LAYOUT_RBTREE; +} + +static inline bool fid_seq_is_update_log(__u64 seq) +{ + return seq == FID_SEQ_UPDATE_LOG; +} + +static inline bool fid_is_update_log(const struct lu_fid *fid) +{ + return fid_seq_is_update_log(fid_seq(fid)); +} + +static inline bool fid_seq_is_update_log_dir(__u64 seq) +{ + return seq == FID_SEQ_UPDATE_LOG_DIR; +} + +static inline bool fid_is_update_log_dir(const struct lu_fid *fid) +{ + return fid_seq_is_update_log_dir(fid_seq(fid)); +} + +/* convert an OST objid into an IDIF FID SEQ number */ +static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx) +{ + return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff); +} + +/* convert a packed IDIF FID into an OST objid */ +static inline __u64 fid_idif_id(__u64 seq, __u32 oid, __u32 ver) +{ + return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid; +} + +static inline __u32 idif_ost_idx(__u64 seq) +{ + return (seq >> 16) & 0xffff; +} + +/* extract ost index from IDIF FID */ +static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid) +{ + return idif_ost_idx(fid_seq(fid)); +} + +/* Check whether the fid is for LAST_ID */ +static inline bool fid_is_last_id(const struct lu_fid *fid) +{ + if (fid_oid(fid) != 0) + return false; + + if (fid_is_idif(fid) && ((fid_seq(fid) & 0xFFFF) != 0)) + return false; + + if (fid_seq(fid) == FID_SEQ_UPDATE_LOG || + fid_seq(fid) == FID_SEQ_UPDATE_LOG_DIR || + fid_seq_is_igif(fid_seq(fid))) + return false; + + return true; +} + +/** + * Get inode number from an igif. + * \param fid an igif to get inode number from. + * \return inode number for the igif. + */ +static inline __kernel_ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +/** + * Get inode generation from an igif. + * \param fid an igif to get inode generation from. + * \return inode generation for the igif. + */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Build igif from the inode number/generation. + */ +static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) +{ + fid->f_seq = ino; + fid->f_oid = gen; + fid->f_ver = 0; +} + +/* + * Fids are transmitted across network (in the sender byte-ordering), + * and stored on disk in big-endian order. 
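fid_idif_seq() and fid_idif_id() above are inverse mappings between a legacy (OST index, object id) pair and an IDIF FID: the OST index and the top 16 bits of the 48-bit object id go into the SEQ, and the low 32 bits go into the OID. A worked sketch with concrete numbers, assuming the helpers above are in scope.

/*
 * Example: OST index 3, object id 0x123456789aULL.
 *
 *   seq = fid_idif_seq(0x123456789aULL, 3)
 *       = 0x100000000 | (3 << 16) | 0x12   = 0x100030012
 *   oid = 0x123456789a & 0xffffffff        = 0x3456789a
 *
 *   idif_ost_idx(seq)        == 3              (OST index recovered)
 *   fid_idif_id(seq, oid, 0) == 0x123456789a   (object id recovered)
 */
static void idif_roundtrip_example(void)
{
	__u64 objid = 0x123456789aULL;
	__u32 ost_idx = 3;
	struct lu_fid fid;

	fid.f_seq = fid_idif_seq(objid, ost_idx);
	fid.f_oid = (__u32)(objid & 0xffffffff);
	fid.f_ver = 0;

	/* fid_is_idif(&fid) holds and the mapping inverts losslessly */
	(void)fid_idif_id(fid_seq(&fid), fid_oid(&fid), fid_ver(&fid));
}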
+ */ +static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __cpu_to_le64(fid_seq(src)); + dst->f_oid = __cpu_to_le32(fid_oid(src)); + dst->f_ver = __cpu_to_le32(fid_ver(src)); +} + +static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __le64_to_cpu(fid_seq(src)); + dst->f_oid = __le32_to_cpu(fid_oid(src)); + dst->f_ver = __le32_to_cpu(fid_ver(src)); +} + +static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __cpu_to_be64(fid_seq(src)); + dst->f_oid = __cpu_to_be32(fid_oid(src)); + dst->f_ver = __cpu_to_be32(fid_ver(src)); +} + +static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __be64_to_cpu(fid_seq(src)); + dst->f_oid = __be32_to_cpu(fid_oid(src)); + dst->f_ver = __be32_to_cpu(fid_ver(src)); +} + +static inline bool fid_is_sane(const struct lu_fid *fid) +{ + return fid && ((fid_seq(fid) >= FID_SEQ_START && !fid_ver(fid)) || + fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq_is_rsvd(fid_seq(fid))); +} + +static inline bool lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +{ + return !memcmp(f0, f1, sizeof(*f0)); +} + +static inline int lu_fid_cmp(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + if (fid_seq(f0) != fid_seq(f1)) + return fid_seq(f0) > fid_seq(f1) ? 1 : -1; + + if (fid_oid(f0) != fid_oid(f1)) + return fid_oid(f0) > fid_oid(f1) ? 1 : -1; + + if (fid_ver(f0) != fid_ver(f1)) + return fid_ver(f0) > fid_ver(f1) ? 1 : -1; + + return 0; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h new file mode 100644 index 0000000000000..8cdb05dedbd8c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. 
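The byte-order helpers above are what keep stored FIDs architecture independent: as the comment notes, FIDs are kept big-endian on disk, so a writer converts with fid_cpu_to_be() and a reader undoes it with fid_be_to_cpu(). A minimal sketch assuming the helpers above are in scope; the local variable stands in for an on-disk record.

static void fid_disk_roundtrip(const struct lu_fid *cpu_fid)
{
	struct lu_fid disk_fid;	/* stand-in for the on-disk (big-endian) copy */
	struct lu_fid check;

	fid_cpu_to_be(&disk_fid, cpu_fid);	/* host order -> big-endian  */
	fid_be_to_cpu(&check, &disk_fid);	/* big-endian  -> host order */

	/* lossless round trip: lu_fid_eq(cpu_fid, &check) is true */
}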
+ * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + +#include +#include +#include + +/* XXX: We use fiemap_extent::fe_reserved[0] */ +#define fe_device fe_reserved[0] + +static inline __kernel_size_t fiemap_count_to_size(__kernel_size_t extent_count) +{ + return sizeof(struct fiemap) + extent_count * + sizeof(struct fiemap_extent); +} + +static inline unsigned int fiemap_size_to_count(__kernel_size_t array_size) +{ + return (array_size - sizeof(struct fiemap)) / + sizeof(struct fiemap_extent); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h new file mode 100644 index 0000000000000..43e97f14b3e42 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -0,0 +1,3641 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Lustre wire protocol definitions. + */ + +/** \defgroup lustreidl lustreidl + * + * Lustre wire protocol definitions. + * + * ALL structs passing over the wire should be declared here. Structs + * that are used in interfaces with userspace should go in lustre_user.h. + * + * All structs being declared here should be built from simple fixed-size + * types defined in linux/types.h or be built from other types or + * structs also declared in this file. Similarly, all flags and magic + * values in those structs should also be declared here. This ensures + * that the Lustre wire protocol is not influenced by external dependencies. + * + * The only other acceptable items in this file are VERY SIMPLE accessor + * functions to avoid callers grubbing inside the structures. Nothing that + * depends on external functions or definitions should be in here. + * + * Structs must be properly aligned to put 64-bit values on an 8-byte + * boundary. Any structs being added here must also be added to + * utils/wirecheck.c and "make newwiretest" run to regenerate the + * utils/wiretest.c sources. 
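fiemap_count_to_size() and fiemap_size_to_count() above convert between an extent count and the byte size of the corresponding struct fiemap buffer, which is how user space would size an FS_IOC_FIEMAP request. A user-space sketch under that assumption; error handling is omitted and fiemap_alloc() is illustrative only.

#include <stdlib.h>
#include <linux/fiemap.h>

/* Allocate a fiemap request with room for 'count' extents (sketch). */
static struct fiemap *fiemap_alloc(unsigned int count)
{
	struct fiemap *fm = calloc(1, fiemap_count_to_size(count));

	if (fm)
		fm->fm_extent_count = count;
	/* fiemap_size_to_count(fiemap_count_to_size(count)) == count */
	return fm;
}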
This allows us to verify that wire structs + * have the proper alignment/size on all architectures. + * + * DO NOT CHANGE any of the structs, flags, values declared here and used + * in released Lustre versions. Some structs may have padding fields that + * can be used. Some structs might allow addition at the end (verify this + * in the code to ensure that new/old clients that see this larger struct + * do not fail, otherwise you need to implement protocol compatibility). + * + * @{ + */ + +#ifndef _LUSTRE_IDL_H_ +#define _LUSTRE_IDL_H_ + +#include +#include +#include +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * GENERAL STUFF + */ +/* FOO_REQUEST_PORTAL is for incoming requests on the FOO + * FOO_REPLY_PORTAL is for incoming replies on the FOO + * FOO_BULK_PORTAL is for incoming bulk on the FOO + */ + +#define CONNMGR_REQUEST_PORTAL 1 +#define CONNMGR_REPLY_PORTAL 2 +/* #define OSC_REQUEST_PORTAL 3*/ +#define OSC_REPLY_PORTAL 4 +/*#define OSC_BULK_PORTAL 5*/ +#define OST_IO_PORTAL 6 +#define OST_CREATE_PORTAL 7 +#define OST_BULK_PORTAL 8 +/*#define MDC_REQUEST_PORTAL 9*/ +#define MDC_REPLY_PORTAL 10 +/*#define MDC_BULK_PORTAL 11*/ +#define MDS_REQUEST_PORTAL 12 +#define MDS_IO_PORTAL 13 +#define MDS_BULK_PORTAL 14 +#define LDLM_CB_REQUEST_PORTAL 15 +#define LDLM_CB_REPLY_PORTAL 16 +#define LDLM_CANCEL_REQUEST_PORTAL 17 +#define LDLM_CANCEL_REPLY_PORTAL 18 +/*#define PTLBD_REQUEST_PORTAL 19*/ +/*#define PTLBD_REPLY_PORTAL 20*/ +/*#define PTLBD_BULK_PORTAL 21*/ +#define MDS_SETATTR_PORTAL 22 +#define MDS_READPAGE_PORTAL 23 +#define OUT_PORTAL 24 +#define MGC_REPLY_PORTAL 25 +#define MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 +#define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 +#define SEQ_DATA_PORTAL 31 +#define SEQ_CONTROLLER_PORTAL 32 +#define MGS_BULK_PORTAL 33 +/* #define DVS_PORTAL 63 */ +/* reserved for Cray DVS - spitzcor@cray.com, roe@cray.com, n8851@cray.com */ + +/** + * Describes a range of sequence, lsr_start is included but lsr_end is + * not in the range. + * Same structure is used in fld module where lsr_index field holds mdt id + * of the home mdt. + */ +struct lu_seq_range { + __u64 lsr_start; + __u64 lsr_end; + __u32 lsr_index; + __u32 lsr_flags; +}; + +struct lu_seq_range_array { + __u32 lsra_count; + __u32 lsra_padding; + struct lu_seq_range lsra_lsr[0]; +}; + +#define LU_SEQ_RANGE_MDT 0x0 +#define LU_SEQ_RANGE_OST 0x1 +#define LU_SEQ_RANGE_ANY 0x3 + +#define LU_SEQ_RANGE_MASK 0x3 + +/** \defgroup lu_fid lu_fid + * @{ */ + +extern void lustre_lma_swab(struct lustre_mdt_attrs *lma); +extern void lustre_lma_init(struct lustre_mdt_attrs *lma, + const struct lu_fid *fid, + __u32 compat, __u32 incompat); +extern void lustre_loa_swab(struct lustre_ost_attrs *loa, + bool to_cpu); +extern void lustre_loa_init(struct lustre_ost_attrs *loa, + const struct lu_fid *fid, + __u32 compat, __u32 incompat); + +/* copytool can use any nonnegative integer to represent archive-Ids during + * register with MDT thru kuc. + * archive num = 0 => all + * archive num from 1 to MAX_U32 + */ +#define LL_HSM_ORIGIN_MAX_ARCHIVE (sizeof(__u32) * 8) +/* the max count of archive ids that one agent can support */ +#define LL_HSM_MAX_ARCHIVES_PER_AGENT 1024 + +/** + * HSM on-disk attributes stored in a separate xattr. 
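As the comment on struct lu_seq_range says, lsr_start is included in the range and lsr_end is not, so membership and width follow the usual half-open interval arithmetic. A minimal sketch assuming the struct above is in scope; both helper names are illustrative.

/* Half-open interval helpers for struct lu_seq_range (illustrative). */
static inline int lu_seq_range_contains(const struct lu_seq_range *range,
					__u64 seq)
{
	return seq >= range->lsr_start && seq < range->lsr_end;
}

static inline __u64 lu_seq_range_width(const struct lu_seq_range *range)
{
	return range->lsr_end - range->lsr_start;
}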
+ */ +struct hsm_attrs { + /** Bitfield for supported data in this structure. For future use. */ + __u32 hsm_compat; + + /** HSM flags, see hsm_flags enum below */ + __u32 hsm_flags; + /** backend archive id associated with the file */ + __u64 hsm_arch_id; + /** version associated with the last archiving, if any */ + __u64 hsm_arch_ver; +}; +extern void lustre_hsm_swab(struct hsm_attrs *attrs); + +/** + * fid constants + */ +enum { + /** LASTID file has zero OID */ + LUSTRE_FID_LASTID_OID = 0UL, + /** initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +/** + * Different FID Format + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0 + * + * FID: + * File IDentifier generated by client from range allocated by the seq service. + * First 0x400 sequences [2^33, 2^33 + 0x400] are reserved for system use. Note + * that on ldiskfs MDTs that IGIF FIDs can use inode numbers starting at 12, + * but this is in the IGIF SEQ rangeand does not conflict with assigned FIDs. + * + * IGIF: + * Inode and Generation In FID, a surrogate FID used to globally identify an + * existing object on OLD formatted MDT file system. This would only be used on + * MDT0 in a DNE filesystem, because there are not expected to be any OLD + * formatted DNE filesystems. Belongs to a sequence in [12, 2^32 - 1] range, + * where sequence number is inode number, and inode generation is used as OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. NOTE: This assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF: + * Object ID in FID, a surrogate FID used to globally identify an existing + * object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object ID. + * The generation of unique SEQ values per OST allows the IDIF FIDs to be + * identified in the FLD correctly. The OID field is calculated as: + * objid & 0xffffffff + * that is, it consists of lower 32 bits of object ID. NOTE This assumes that + * no more than 2^48-1 objects have ever been created on an OST, and that no + * more than 65535 OSTs are in use. Both are very reasonable assumptions (can + * uniquely map all objects on an OST that created 1M objects per second for 9 + * years, or combinations thereof). + * + * OST_MDT0: + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved sequence 0, and is used internally prior + * to the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG: + * For Lustre Log objects the object sequence 1 is used. This is compatible with + * both OLD and NEW.1 namespaces, as this SEQ number is in the ext3/ldiskfs + * reserved inode range and does not conflict with IGIF sequence numbers. + * + * ECHO: + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW.1 namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF sequence + * numbers. + * + * OST_MDT1 .. 
OST_MAX: + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total of 8 + * MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. However, this SEQ range is only for testing prior to any production + * DNE release, as the objects in this range conflict across all OSTs, as the + * OST index is not part of the FID. + * + * + * For compatibility with existing OLD OST network protocol structures, the FID + * must map onto the o_id and o_gr in a manner that ensures existing objects are + * identified consistently for IO, as well as onto the lock namespace to ensure + * both IDIFs map onto the same objects for IO as well as resources in the DLM. + * + * DLM OLD OBIF/IDIF: + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * DLM NEW.1 FID (this is the same for both the MDT and OST): + * resource[] = {SEQ, OID, VER, HASH}; + * + * Note that for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, in + * all production releases the OLD o_seq field is always zero, and all valid FID + * OID values are non-zero, so the lock resources will not collide. + * + * For objects within the IDIF range, group extraction (non-CMD) will be: + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + */ + +/** + * Note that reserved SEQ numbers below 12 will conflict with ldiskfs + * inodes in the IGIF namespace, so these reserved SEQ numbers can be + * used for other purposes and not risk collisions with existing inodes. + */ +enum fid_seq { + FID_SEQ_OST_MDT0 = 0, + FID_SEQ_LLOG = 1, /* unnamed llogs */ + FID_SEQ_ECHO = 2, + FID_SEQ_UNUSED_START = 3, + FID_SEQ_UNUSED_END = 9, + FID_SEQ_LLOG_NAME = 10, /* named llogs */ + FID_SEQ_RSVD = 11, + FID_SEQ_IGIF = 12, + FID_SEQ_IGIF_MAX = 0x0ffffffffULL, + FID_SEQ_IDIF = 0x100000000ULL, + FID_SEQ_IDIF_MAX = 0x1ffffffffULL, + /* Normal FID sequence starts from this value, i.e. 1<<33 */ + FID_SEQ_START = 0x200000000ULL, + /* sequence for local pre-defined FIDs listed in local_oid */ + FID_SEQ_LOCAL_FILE = 0x200000001ULL, + FID_SEQ_DOT_LUSTRE = 0x200000002ULL, + /* sequence is used for local named objects FIDs generated + * by local_object_storage library */ + FID_SEQ_LOCAL_NAME = 0x200000003ULL, + /* Because current FLD will only cache the fid sequence, instead + * of oid on the client side, if the FID needs to be exposed to + * clients sides, it needs to make sure all of fids under one + * sequence will be located in one MDT. */ + FID_SEQ_SPECIAL = 0x200000004ULL, + FID_SEQ_QUOTA = 0x200000005ULL, + FID_SEQ_QUOTA_GLB = 0x200000006ULL, + FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */ + FID_SEQ_LAYOUT_RBTREE = 0x200000008ULL, + /* sequence is used for update logs of cross-MDT operation */ + FID_SEQ_UPDATE_LOG = 0x200000009ULL, + /* Sequence is used for the directory under which update logs + * are created. 
*/ + FID_SEQ_UPDATE_LOG_DIR = 0x20000000aULL, + FID_SEQ_NORMAL = 0x200000400ULL, + FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL +}; + +#define OBIF_OID_MAX_BITS 32 +#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS) +#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1) +#define IDIF_OID_MAX_BITS 48 +#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS) +#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1) + +/** OID for FID_SEQ_SPECIAL */ +enum special_oid { + /* Big Filesystem Lock to serialize rename operations */ + FID_OID_SPECIAL_BFL = 1UL, +}; + +/** OID for FID_SEQ_DOT_LUSTRE */ +enum dot_lustre_oid { + FID_OID_DOT_LUSTRE = 1UL, + FID_OID_DOT_LUSTRE_OBF = 2UL, + FID_OID_DOT_LUSTRE_LPF = 3UL, +}; + +/** OID for FID_SEQ_ROOT */ +enum root_oid { + FID_OID_ROOT = 1UL, + FID_OID_ECHO_ROOT = 2UL, +}; + +struct lu_orphan_rec { + /* The MDT-object's FID referenced by the orphan OST-object */ + struct lu_fid lor_fid; + __u32 lor_uid; + __u32 lor_gid; +}; + +struct lu_orphan_ent { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec loe_rec; +}; + +struct lu_orphan_rec_v2 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + __u32 lor_padding; +}; + +struct lu_orphan_ent_v2 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v2 loe_rec; +}; + +struct lu_orphan_rec_v3 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + /* The OST-object declared layout version in PFID EA.*/ + __u32 lor_layout_version; + /* The OST-object declared layout range (of version) in PFID EA.*/ + __u32 lor_range; + __u32 lor_padding_1; + __u64 lor_padding_2; +}; + +struct lu_orphan_ent_v3 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v3 loe_rec; +}; + +/** @} lu_fid */ + +/** \defgroup lu_dir lu_dir + * @{ */ + +/** + * Enumeration of possible directory entry attributes. + * + * Attributes follow directory entry header in the order they appear in this + * enumeration. + */ +enum lu_dirent_attrs { + LUDA_FID = 0x0001, + LUDA_TYPE = 0x0002, + LUDA_64BITHASH = 0x0004, + + /* The following attrs are used for MDT internal only, + * not visible to client */ + + /* Something in the record is unknown, to be verified in further. */ + LUDA_UNKNOWN = 0x0400, + /* Ignore this record, go to next directly. */ + LUDA_IGNORE = 0x0800, + /* The system is upgraded, has beed or to be repaired (dryrun). */ + LUDA_UPGRADE = 0x1000, + /* The dirent has been repaired, or to be repaired (dryrun). */ + LUDA_REPAIR = 0x2000, + /* Only check but not repair the dirent inconsistency */ + LUDA_VERIFY_DRYRUN = 0x4000, + /* Verify the dirent consistency */ + LUDA_VERIFY = 0x8000, +}; + +#define LU_DIRENT_ATTRS_MASK 0xff00 + +/** + * Layout of readdir pages, as transmitted on wire. + */ +struct lu_dirent { + /** valid if LUDA_FID is set. */ + struct lu_fid lde_fid; + /** a unique entry identifier: a hash or an offset. */ + __u64 lde_hash; + /** total record length, including all attributes. */ + __u16 lde_reclen; + /** name length */ + __u16 lde_namelen; + /** optional variable size attributes following this entry. + * taken from enum lu_dirent_attrs. + */ + __u32 lde_attrs; + /** name is followed by the attributes indicated in ->ldp_attrs, in + * their natural order. After the last attribute, padding bytes are + * added to make ->lde_reclen a multiple of 8. + */ + char lde_name[0]; +}; + +/* + * Definitions of optional directory entry attributes formats. 
+ * + * Individual attributes do not have their length encoded in a generic way. It + * is assumed that consumer of an attribute knows its format. This means that + * it is impossible to skip over an unknown attribute, except by skipping over all + * remaining attributes (by using ->lde_reclen), which is not too + * constraining, because new server versions will append new attributes at + * the end of an entry. + */ + +/** + * Fid directory attribute: a fid of an object referenced by the entry. This + * will be almost always requested by the client and supplied by the server. + * + * Aligned to 8 bytes. + */ +/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */ + +/** + * File type. + * + * Aligned to 2 bytes. + */ +struct luda_type { + __u16 lt_type; +}; + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u32 ldp_flags; + __u32 ldp_pad0; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + /** + * dirpage contains no entry. + */ + LDF_EMPTY = 1 << 0, + /** + * last entry's lde_hash equals ldp_hash_end. + */ + LDF_COLLIDE = 1 << 1 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (__le32_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (__le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + __le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline __kernel_size_t lu_dirent_calc_size(size_t namelen, __u16 attr) +{ + __kernel_size_t size; + + if (attr & LUDA_TYPE) { + const __kernel_size_t align = sizeof(struct luda_type) - 1; + + size = (sizeof(struct lu_dirent) + namelen + 1 + align) & + ~align; + size += sizeof(struct luda_type); + } else { + size = sizeof(struct lu_dirent) + namelen + 1; + } + + return (size + 7) & ~7; +} + +#define MDS_DIR_END_OFF 0xfffffffffffffffeULL + +/** + * MDS_READPAGE page size + * + * This is the directory page size packed in MDS_READPAGE RPC. + * It's different than PAGE_SIZE because the client needs to + * access the struct lu_dirpage header packed at the beginning of + * the "page" and without this there isn't any way to know find the + * lu_dirpage header is if client and server PAGE_SIZE differ. + */ +#define LU_PAGE_SHIFT 12 +#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) +#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) + +#define LU_PAGE_COUNT (1 << (PAGE_SHIFT - LU_PAGE_SHIFT)) + +/** @} lu_dir */ + +struct lustre_handle { + __u64 cookie; +}; +#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL + +static inline bool lustre_handle_is_used(const struct lustre_handle *lh) +{ + return lh->cookie != 0; +} + +static inline bool lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + const struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + +/* lustre_msg struct magic. DON'T use swabbed values of MAGIC as magic! 
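lu_dirent_start() and lu_dirent_next() above are all that is needed to walk a readdir page; since the page is transmitted little-endian, remaining fields are converted as they are consumed, just as the helpers already do for ldp_flags and lde_reclen. A minimal sketch assuming the structures and helpers above are in scope.

/* Visit every entry packed into one lu_dirpage (sketch). */
static void lu_dirpage_walk(struct lu_dirpage *dp)
{
	struct lu_dirent *ent;

	for (ent = lu_dirent_start(dp); ent != NULL; ent = lu_dirent_next(ent)) {
		__u64 hash = __le64_to_cpu(ent->lde_hash);
		__u16 namelen = __le16_to_cpu(ent->lde_namelen);

		/* ent->lde_name[0..namelen-1] is the entry name; 'hash' is
		 * the readdir cookie used to resume at this entry */
		(void)hash;
		(void)namelen;
	}
}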
*/ +enum lustre_msg_magic { + LUSTRE_MSG_MAGIC_V2 = 0x0BD00BD3, + LUSTRE_MSG_MAGIC_V2_SWABBED = 0xD30BD00B, + LUSTRE_MSG_MAGIC = LUSTRE_MSG_MAGIC_V2 +}; + +/* flags for lm_flags */ +enum lustre_msghdr { + MSGHDR_AT_SUPPORT = 0x1, /* adaptive timeouts, lm_cksum valid + * in early reply messages */ + MSGHDR_CKSUM_INCOMPAT18 = 0x2, /* compat for 1.8, needs to be set well + * beyond 2.8.0 for compatibility */ +}; + +#define lustre_msg lustre_msg_v2 +/* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ +struct lustre_msg_v2 { + __u32 lm_bufcount; /* number of buffers in lm_buflens[] */ + __u32 lm_secflvr; /* 0 = no crypto, or sptlrpc security flavour */ + __u32 lm_magic; /* RPC version magic = LUSTRE_MSG_MAGIC_V2 */ + __u32 lm_repsize; /* size of preallocated reply buffer */ + __u32 lm_cksum; /* CRC32 of ptlrpc_body early reply messages */ + __u32 lm_flags; /* enum lustre_msghdr MSGHDR_* flags */ + __u32 lm_padding_2; /* unused */ + __u32 lm_padding_3; /* unused */ + __u32 lm_buflens[0]; /* length of additional buffers in bytes, + * padded to a multiple of 8 bytes. */ + /* + * message buffers are packed after padded lm_buflens[] array, + * padded to a multiple of 8 bytes each to align contents. + */ +}; + +/* ptlrpc_body packet pb_types */ +#define PTL_RPC_MSG_REQUEST 4711 /* normal RPC request message */ +#define PTL_RPC_MSG_ERR 4712 /* error reply if request unprocessed */ +#define PTL_RPC_MSG_REPLY 4713 /* normal RPC reply message */ + +/* ptlrpc_body pb_version ((target_version << 16) | rpc_version) */ +enum lustre_msg_version { + PTLRPC_MSG_VERSION = 0x00000003, + LUSTRE_VERSION_MASK = 0xffff0000, + LUSTRE_OBD_VERSION = 0x00010000, + LUSTRE_MDS_VERSION = 0x00020000, + LUSTRE_OST_VERSION = 0x00030000, + LUSTRE_DLM_VERSION = 0x00040000, + LUSTRE_LOG_VERSION = 0x00050000, + LUSTRE_MGS_VERSION = 0x00060000, +}; + +/* pb_flags that apply to all request messages */ +/* #define MSG_LAST_REPLAY 0x0001 obsolete 2.0 => {REQ,LOCK}_REPLAY_DONE */ +#define MSG_RESENT 0x0002 /* was previously sent, no reply seen */ +#define MSG_REPLAY 0x0004 /* was processed, got reply, recovery */ +/* #define MSG_AT_SUPPORT 0x0008 obsolete since 1.5, AT always enabled */ +/* #define MSG_DELAY_REPLAY 0x0010 obsolete since 2.0 */ +/* #define MSG_VERSION_REPLAY 0x0020 obsolete since 1.8.2, VBR always on */ +#define MSG_REQ_REPLAY_DONE 0x0040 /* request replay over, locks next */ +#define MSG_LOCK_REPLAY_DONE 0x0080 /* lock replay over, client done */ + +/* pb_op_flags for connect opcodes: MDS_CONNECT, OST_CONNECT, MGS_CONNECT */ +#define MSG_CONNECT_RECOVERING 0x00000001 /* target is in recovery */ +#define MSG_CONNECT_RECONNECT 0x00000002 /* tgt already has client import */ +#define MSG_CONNECT_REPLAYABLE 0x00000004 /* target supports RPC replay */ +/* #define MSG_CONNECT_PEER 0x00000008 obsolete since 1.2, removed in 1.5 */ +#define MSG_CONNECT_LIBCLIENT 0x00000010 /* obsolete since 2.3, removed 2.6 */ +#define MSG_CONNECT_INITIAL 0x00000020 /* first client connection attempt */ +/* #define MSG_CONNECT_ASYNC 0x00000040 obsolete since 1.5 */ +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* client sent transno in replay */ + +/* number of previous object versions in pb_pre_versions[] */ +#define PTLRPC_NUM_VERSIONS 4 +/* without gss, ptlrpc_body is put at the first buffer. 
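The comments inside struct lustre_msg_v2 describe the packing: lm_buflens[] carries lm_bufcount lengths, the header (fixed fields plus that array) is padded to a multiple of 8 bytes, and each message buffer that follows is 8-byte padded as well. A sketch of offset arithmetic consistent with those comments, assuming the struct above (already unpacked to host byte order) is in scope; the macro and helper names are illustrative.

#define LM_ROUND8(len)	(((len) + 7) & ~7ULL)

/* Byte offset of message buffer 'n' within a struct lustre_msg_v2 (sketch). */
static __u32 lustre_msg_v2_buf_offset(const struct lustre_msg_v2 *msg, __u32 n)
{
	/* fixed header fields plus lm_buflens[lm_bufcount], padded to 8 */
	__u32 off = LM_ROUND8(sizeof(struct lustre_msg_v2) +
			      msg->lm_bufcount * sizeof(__u32));
	__u32 i;

	/* each preceding buffer is itself padded out to a multiple of 8 */
	for (i = 0; i < n; i++)
		off += LM_ROUND8(msg->lm_buflens[i]);

	return off;
}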
*/ +struct ptlrpc_body_v3 { + struct lustre_handle pb_handle; + __u32 pb_type; /* request/reply/err type: PTL_RPC_MSG_* */ + __u32 pb_version; /* LUSTRE_*_VERSION | PTLRPC_MSG_VERSION */ + __u32 pb_opc; /* RPC opcodes: MDS_*, OST_*, LDLM_, ... */ + __u32 pb_status; /* negative Linux x86 error number */ + __u64 pb_last_xid; /* highest replied XID w/o lower unreplied XID*/ + __u16 pb_tag; /* multiple modifying RPCs virtual slot index */ + __u16 pb_padding0; + __u32 pb_padding1; + __u64 pb_last_committed;/* rep: highest pb_transno committed to disk */ + __u64 pb_transno; /* server-assigned transno for modifying RPCs */ + __u32 pb_flags; /* req: MSG_* flags */ + __u32 pb_op_flags; /* req: MSG_CONNECT_* flags */ + __u32 pb_conn_cnt; /* connect instance of this client on server */ + __u32 pb_timeout; /* req: max wait time; rep: service estimate */ + __u32 pb_service_time; /* rep: server arrival to reply in seconds */ + __u32 pb_limit; /* rep: dynamic DLM LRU lock count limit */ + __u64 pb_slv; /* rep: dynamic DLM LRU server lock volume */ + /* VBR: rep: previous pb_version(s) of objects modified by this RPC */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + __u64 pb_mbits; /**< match bits for bulk request */ + /* padding for future needs - fix lustre_swab_ptlrpc_body() also */ + __u64 pb_padding64_0; + __u64 pb_padding64_1; + __u64 pb_padding64_2; + char pb_jobid[LUSTRE_JOBID_SIZE]; /* req: ASCII jobid from env + NUL */ +}; +#define ptlrpc_body ptlrpc_body_v3 + +struct ptlrpc_body_v2 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ + __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ + __u16 pb_padding0; + __u32 pb_padding1; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + __u64 pb_mbits; /**< unused in V2 */ + /* padding for future needs */ + __u64 pb_padding64_0; + __u64 pb_padding64_1; + __u64 pb_padding64_2; +}; + +/* message body offset for lustre_msg_v2 */ +/* ptlrpc body offset in all request/reply messages */ +#define MSG_PTLRPC_BODY_OFF 0 + +/* normal request/reply message record offset */ +#define REQ_REC_OFF 1 +#define REPLY_REC_OFF 1 + +/* ldlm request message body offset */ +#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ +#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ + +/* ldlm intent lock message body offset */ +#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */ +#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */ + +/* ldlm reply message body offset */ +#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ +#define DLM_REPLY_REC_OFF 2 /* reply record offset */ + +/** only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + +/* Connect flags */ +#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 
0x40ULL /*Separate non-IO req portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */ +#define OBD_CONNECT_LARGE_ACL 0x200ULL /* more than 32 ACL entries */ +#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /* not checked in 2.11+ */ +#define OBD_CONNECT_BARRIER 0x2000ULL /* write barrier */ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /* Remote client, never used + * in production. Removed in + * 2.9. Keep this flag to + * avoid reusing. + */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /* Remote client by force, + * never used in production. + * Removed in 2.9. Keep this + * flag to avoid reusing. + */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */ +#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. */ +#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x8000000ULL /* obsolete since 2.8 */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ +#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ +#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ +#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ +#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits + * directory hash */ +#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ +#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ +#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ +#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ +#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS + * RPC error properly */ +#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for + * finer space reservation */ +#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 + * policy and 2.x server */ +#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ +#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ +#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ +#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ +#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */ +#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* improved flock deadlock detection */ +#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/* create 
stripe disposition*/ +#define OBD_CONNECT_OPEN_BY_FID 0x20000000000000ULL /* open by fid won't pack + name in request */ +#define OBD_CONNECT_LFSCK 0x40000000000000ULL/* support online LFSCK */ +#define OBD_CONNECT_UNLINK_CLOSE 0x100000000000000ULL/* close file in unlink */ +#define OBD_CONNECT_MULTIMODRPCS 0x200000000000000ULL /* support multiple modify + RPCs in parallel */ +#define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL /* striped DNE dir */ +#define OBD_CONNECT_SUBTREE 0x800000000000000ULL /* fileset mount */ +#define OBD_CONNECT_LOCKAHEAD_OLD 0x1000000000000000ULL /* Old Cray lockahead */ + +/** bulk matchbits is sent within ptlrpc_body */ +#define OBD_CONNECT_BULK_MBITS 0x2000000000000000ULL +#define OBD_CONNECT_OBDOPACK 0x4000000000000000ULL /* compact OUT obdo */ +#define OBD_CONNECT_FLAGS2 0x8000000000000000ULL /* second flags word */ +/* ocd_connect_flags2 flags */ +#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ +#define OBD_CONNECT2_LOCKAHEAD 0x2ULL /* ladvise lockahead v2 */ +#define OBD_CONNECT2_DIR_MIGRATE 0x4ULL /* migrate striped dir */ +#define OBD_CONNECT2_SUM_STATFS 0x8ULL /* MDT return aggregated stats */ +#define OBD_CONNECT2_FLR 0x20ULL /* FLR support */ +#define OBD_CONNECT2_WBC_INTENTS 0x40ULL /* create/unlink/... intents for wbc, also operations under client-held parent locks */ +#define OBD_CONNECT2_LOCK_CONVERT 0x80ULL /* IBITS lock convert support */ +#define OBD_CONNECT2_ARCHIVE_ID_ARRAY 0x100ULL /* store HSM archive_id in array */ +#define OBD_CONNECT2_SELINUX_POLICY 0x400ULL /* has client SELinux policy */ +#define OBD_CONNECT2_LSOM 0x800ULL /* LSOM support */ +#define OBD_CONNECT2_ASYNC_DISCARD 0x4000ULL /* support async DoM data discard */ +#define OBD_CONNECT2_ENCRYPT 0x8000ULL /* client-to-disk encrypt */ +#define OBD_CONNECT2_FIDMAP 0x10000ULL /* FID map */ +#define OBD_CONNECT2_GETATTR_PFID 0x20000ULL /* pack parent FID in getattr */ +/* risk of forwards incompatibility with upstream - use high order bits to mitigate */ +#define OBD_CONNECT2_MDLL_BYPASS 0x800000000000000ULL /* disable metadata lazy load */ +#define OBD_CONNECT2_MDLL 0x1000000000000000ULL /* enable metadata lazy load */ +#define OBD_CONNECT2_MDLL_AUTO_REFRESH 0x2000000000000000ULL /* enable metadata lazy load auto-refresh */ +/* XXX README XXX: + * Please DO NOT add flag values here before first ensuring that this same + * flag value is not in use on some other branch. Please clear any such + * changes with senior engineers before starting to use a new flag. Then, + * submit a small patch against EVERY branch that ONLY adds the new flag, + * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the + * flag to check_obd_connect_data(), and updates wiretests accordingly, so it + * can be approved and landed easily to reserve the flag for future use. */ + +/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS + * connection. It is a temporary bug fix for Imperative Recovery interop + * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for + * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. 
*/ +#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS + +#define OCD_HAS_FLAG(ocd, flg) \ + (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) + + +#ifdef HAVE_LRU_RESIZE_SUPPORT +#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE +#else +#define LRU_RESIZE_CONNECT_FLAG 0 +#endif + +#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ + OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ + OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | \ + OBD_CONNECT_ATTRFID | OBD_CONNECT_CANCELSET | \ + OBD_CONNECT_AT | OBD_CONNECT_BRW_SIZE | \ + OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | \ + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_VBR | \ + OBD_CONNECT_LOV_V3 | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\ + OBD_CONNECT_FLOCK_DEAD | \ + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \ + OBD_CONNECT_OPEN_BY_FID | \ + OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_SRVLOCK | \ + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \ + OBD_CONNECT_MULTIMODRPCS | \ + OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \ + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + +#define MDT_CONNECT_SUPPORTED2 (OBD_CONNECT2_FILE_SECCTX | OBD_CONNECT2_FLR | \ + OBD_CONNECT2_SUM_STATFS | \ + OBD_CONNECT2_LOCK_CONVERT | \ + OBD_CONNECT2_DIR_MIGRATE | \ + OBD_CONNECT2_ARCHIVE_ID_ARRAY | \ + OBD_CONNECT2_SELINUX_POLICY | \ + OBD_CONNECT2_LSOM | \ + OBD_CONNECT2_ASYNC_DISCARD | \ + OBD_CONNECT2_GETATTR_PFID | \ + OBD_CONNECT2_MDLL_BYPASS | \ + OBD_CONNECT2_MDLL | \ + OBD_CONNECT2_MDLL_AUTO_REFRESH) + +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | \ + OBD_CONNECT_AT | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_CKSUM | OBD_CONNECT_VBR | \ + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ + OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 |\ + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | \ + OBD_CONNECT_BULK_MBITS | \ + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + +#define OST_CONNECT_SUPPORTED2 OBD_CONNECT2_LOCKAHEAD + +#define ECHO_CONNECT_SUPPORTED (OBD_CONNECT_FID) +#define ECHO_CONNECT_SUPPORTED2 0 + +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ + OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS |\ + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_BARRIER) + +#define MGS_CONNECT_SUPPORTED2 0 + +/* Features required for this version of the client to work with server */ +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_FID | \ + OBD_CONNECT_ATTRFID | \ + OBD_CONNECT_FULL20) + +/* This structure is used for both request and reply. + * + * If we eventually have separate connect data for different types, which we + * almost certainly will, then perhaps we stick a union in here. 
*/ +struct obd_connect_data { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_grant_blkbits; /* log2 of the backend filesystem blocksize */ + __u8 ocd_grant_inobits; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_tax_kb; /* extent insertion overhead, in 1K blocks */ + __u32 ocd_grant_max_blks;/* maximum number of blocks per extent */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* instance # of this target */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ + /* Fields after ocd_maxbytes are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + __u16 ocd_maxmodrpcs; /* Maximum modify RPCs in parallel */ + __u16 padding0; /* added 2.1.0. also fix lustre_swab_connect */ + __u32 padding1; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 ocd_connect_flags2; + __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ +}; +/* XXX README XXX: + * Please DO NOT use any fields here before first ensuring that this same + * field is not in use on some other branch. Please clear any such changes + * with senior engineers before starting to use a new field. Then, submit + * a small patch against EVERY branch that ONLY adds the new field along with + * the matching OBD_CONNECT flag, so that can be approved and landed easily to + * reserve the flag for future use. */ + +/* + * Supported checksum algorithms. Up to 32 checksum types are supported. + * (32-bit mask stored in obd_connect_data::ocd_cksum_types) + * Please update DECLARE_CKSUM_NAME in obd_cksum.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags, OBD_CKSUM_ALL flag, + * OBD_FL_CKSUM_ALL flag and potentially OBD_CKSUM_T10_ALL flag. 
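The comment inside struct obd_connect_data above is the key rule for this structure: fields after ocd_maxbytes may only be read when the matching bit in ocd_connect_flags was negotiated, and ocd_connect_flags2 is itself only meaningful when OBD_CONNECT_FLAGS2 is set. A minimal sketch of that pattern using the OCD_HAS_FLAG() macro defined earlier; the fallback value is illustrative only.

/* Only trust ocd_maxmodrpcs if OBD_CONNECT_MULTIMODRPCS was negotiated. */
static __u16 ocd_max_mod_rpcs(const struct obd_connect_data *ocd)
{
	if (OCD_HAS_FLAG(ocd, MULTIMODRPCS))
		return ocd->ocd_maxmodrpcs;

	return 1;	/* illustrative fallback: one modify RPC in flight */
}

/* ocd_connect_flags2 is only valid when OBD_CONNECT_FLAGS2 is present. */
static int ocd_has_flag2(const struct obd_connect_data *ocd, __u64 flag2)
{
	return OCD_HAS_FLAG(ocd, FLAGS2) &&
	       (ocd->ocd_connect_flags2 & flag2) != 0;
}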
+ */ +enum cksum_types { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C = 0x00000004, + OBD_CKSUM_RESERVED = 0x00000008, + OBD_CKSUM_T10IP512 = 0x00000010, + OBD_CKSUM_T10IP4K = 0x00000020, + OBD_CKSUM_T10CRC512 = 0x00000040, + OBD_CKSUM_T10CRC4K = 0x00000080, +}; + +#define OBD_CKSUM_T10_ALL (OBD_CKSUM_T10IP512 | OBD_CKSUM_T10IP4K | \ + OBD_CKSUM_T10CRC512 | OBD_CKSUM_T10CRC4K) + +#define OBD_CKSUM_ALL (OBD_CKSUM_CRC32 | OBD_CKSUM_ADLER | OBD_CKSUM_CRC32C | \ + OBD_CKSUM_T10_ALL) + +/* + * The default checksum algorithm used on top of T10PI GRD tags for RPC. + * Considering that the checksum-of-checksums is only computing CRC32 on a + * 4KB chunk of GRD tags for a 1MB RPC for 512B sectors, or 16KB of GRD + * tags for 16MB of 4KB sectors, this is only 1/256 or 1/1024 of the + * total data being checksummed, so the checksum type used here should not + * affect overall system performance noticeably. + */ +#define OBD_CKSUM_T10_TOP OBD_CKSUM_ADLER + +/* + * OST requests: OBDO & OBD request records + */ + +/* opcodes */ +enum ost_cmd { + OST_REPLY = 0, /* reply ? */ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, /* not used since 2.4 */ + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_LADVISE = 21, + OST_LAST_OPC, /* must be < 33 to avoid MDS_GETATTR */ + OST_FALLOCATE = 22, + OST_SEEK = 23, +}; +#define OST_FIRST_OPC OST_REPLY + +enum obdo_flags { + OBD_FL_INLINEDATA = 0x00000001, + OBD_FL_OBDMDEXISTS = 0x00000002, + OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */ + OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */ + OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/ + OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */ + OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */ + OBD_FL_NO_PRJQUOTA = 0x00000080, /* the object's project is over + * quota */ + OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */ + OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ + OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ + OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_T10IP512 = 0x00005000, /* T10PI IP cksum, 512B sector */ + OBD_FL_CKSUM_T10IP4K = 0x00006000, /* T10PI IP cksum, 4KB sector */ + OBD_FL_CKSUM_T10CRC512 = 0x00007000, /* T10PI CRC cksum, 512B sector */ + OBD_FL_CKSUM_T10CRC4K = 0x00008000, /* T10PI CRC cksum, 4KB sector */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. 
+ * XXX: obsoleted - reserved for old + * clients prior than 2.2 */ + OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ + OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */ + OBD_FL_FLUSH = 0x00200000, /* flush pages on the OST */ + OBD_FL_SHORT_IO = 0x00400000, /* short io request */ + /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */ + + /* + * Note that while the original checksum values were separate bits, + * in 2.x we can actually allow all values from 1-31. T10-PI checksum + * types already use values which are not separate bits. + */ + OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | + OBD_FL_CKSUM_CRC32C | OBD_FL_CKSUM_T10IP512 | + OBD_FL_CKSUM_T10IP4K | OBD_FL_CKSUM_T10CRC512 | + OBD_FL_CKSUM_T10CRC4K, + + OBD_FL_NO_QUOTA_ALL = OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA | + OBD_FL_NO_PRJQUOTA, +}; + +/* + * All LOV EA magics should have the same postfix, if some new version + * Lustre instroduces new LOV EA magic, then when down-grade to an old + * Lustre, even though the old version system does not recognizes such + * new magic, it still can distinguish the corrupted cases by checking + * the magic's postfix. + */ +#define LOV_MAGIC_MAGIC 0x0BD0 +#define LOV_MAGIC_MASK 0xFFFF + +#define LOV_MAGIC_V1 (0x0BD10000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_JOIN_V1 (0x0BD20000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_V3 (0x0BD30000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_MIGRATE (0x0BD40000 | LOV_MAGIC_MAGIC) +/* reserved for specifying OSTs */ +#define LOV_MAGIC_SPECIFIC (0x0BD50000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC LOV_MAGIC_V1 +#define LOV_MAGIC_COMP_V1 (0x0BD60000 | LOV_MAGIC_MAGIC) + +/* + * magic for fully defined striping + * the idea is that we should have different magics for striping "hints" + * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct + * lov_mds_md_v[13]). at the moment the magics are used in wire protocol, + * we can't just change it w/o long way preparation, but we still need a + * mechanism to allow LOD to differentiate hint versus ready striping. + * so, at the moment we do a trick: MDT knows what to expect from request + * depending on the case (replay uses ready striping, non-replay req uses + * hints), so MDT replaces magic with appropriate one and now LOD can + * easily understand what's inside -bzzz + * + * those *_DEF magics are only used on server side internally, they + * won't be put on wire or disk. 
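As the note at the end of enum obdo_flags points out, the OBD_FL_CKSUM_* entries are values rather than independent bits once the T10-PI types are included (for instance 0x5000 is the OR of the CRC32 and CRC32C bits), so the checksum type must be masked out and compared as a whole. A minimal sketch assuming the enum above is in scope; the helper name is illustrative.

/* Does o_flags carry exactly this checksum type? (sketch) */
static int obdo_cksum_type_is(__u32 o_flags, __u32 cksum_flag)
{
	/*
	 * OBD_FL_CKSUM_T10IP512 (0x5000) == OBD_FL_CKSUM_CRC32 |
	 * OBD_FL_CKSUM_CRC32C, so a plain bit test would give false
	 * positives; mask with OBD_FL_CKSUM_ALL and compare the value.
	 */
	return (o_flags & OBD_FL_CKSUM_ALL) == cksum_flag;
}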
+ */
+#define LOV_MAGIC_DEFINED 0x10000000
+#define LOV_MAGIC_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V1)
+#define LOV_MAGIC_V3_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V3)
+#define LOV_MAGIC_COMP_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1)
+
+#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK)
+#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK)
+
+#define lov_ost_data lov_ost_data_v1
+struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/
+ struct ost_id l_ost_oi; /* OST object ID */
+ __u32 l_ost_gen; /* generation of this l_ost_idx */
+ __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */
+};
+
+#define lov_mds_md lov_mds_md_v1
+struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */
+ __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ /* lmm_stripe_count used to be __u32 */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ __u16 lmm_layout_gen; /* layout generation number */
+ struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+#define MAX_MD_SIZE_OLD (sizeof(struct lov_mds_md) + \
+ 4 * sizeof(struct lov_ost_data))
+#define MAX_MD_SIZE (sizeof(struct lov_comp_md_v1) + \
+ 4 * (sizeof(struct lov_comp_md_entry_v1) + \
+ MAX_MD_SIZE_OLD))
+#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data))
+
+/* This is the default MDT reply size allocated, should the striping be bigger,
+ * it will be reallocated in mdt_fix_reply.
+ * 100 stripes is a bit less than 2.5k of data */
+#define DEF_REP_MD_SIZE (sizeof(struct lov_mds_md) + \
+ 100 * sizeof(struct lov_ost_data))
+
+#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access"
+#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default"
+#define XATTR_USER_PREFIX "user."
+#define XATTR_TRUSTED_PREFIX "trusted."
+#define XATTR_SECURITY_PREFIX "security."
+
+#define XATTR_NAME_LOV "trusted.lov"
+#define XATTR_NAME_LMA "trusted.lma"
+#define XATTR_NAME_LMV "trusted.lmv"
+#define XATTR_NAME_DEFAULT_LMV "trusted.dmv"
+#define XATTR_NAME_LINK "trusted.link"
+#define XATTR_NAME_FID "trusted.fid"
+#define XATTR_NAME_VERSION "trusted.version"
+#define XATTR_NAME_SOM "trusted.som"
+#define XATTR_NAME_HSM "trusted.hsm"
+#define XATTR_NAME_LFSCK_BITMAP "trusted.lfsck_bitmap"
+#define XATTR_NAME_DUMMY "trusted.dummy"
+
+#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_ns"
+#define XATTR_NAME_MAX_LEN 32 /* increase this, if there is longer name.
*/ + +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); +} + +static inline __u32 +lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) +{ + switch (lmm_magic) { + case LOV_MAGIC_V1: { + struct lov_mds_md_v1 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + case LOV_MAGIC_V3: { + struct lov_mds_md_v3 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + default: + return 0; + } +} + +#define OBD_MD_FLID (0x00000001ULL) /* object ID */ +#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ +#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ +#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ +#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ +#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ +#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ +#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ +#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ +#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ +#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ +#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_DOM_SIZE (0X00001000ULL) /* Data-on-MDT component size */ +#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ +#define OBD_MD_FLPARENT (0x00004000ULL) /* parent FID */ +#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* OST object layout version */ +#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ +#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ +#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ +#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ +#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ +/* OBD_MD_FLQOS (0x00200000ULL) has never been used */ +/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ +#define OBD_MD_FLPRJQUOTA (0x00400000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ +#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ +/* OBD_MD_FLEPOCH (0x04000000ULL) obsolete 2.7.50 */ + /* ->mds if epoch opens or closes */ +#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ +#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ +#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ + +#define OBD_MD_MDS 
(0x0000000100000000ULL) /* where an inode lives on */ +/* OBD_MD_REINT (0x0000000200000000ULL) obsolete 1.8 */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ +#define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ + +#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ +#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ +#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ +#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +#define OBD_MD_FLAGSTATFS (0x0000010000000000ULL) /* aggregated statfs */ +/* OBD_MD_FLMDSCAPA (0x0000020000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLOSSCAPA (0x0000040000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLCKSPLIT (0x0000080000000000ULL) obsolete 2.3.58*/ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes + * under lock; for xattr + * requests means the + * client holds the lock */ +#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ + +#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ +#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent + executed */ + +#define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ +#define OBD_MD_FLOSTLAYOUT (0x0080000000000000ULL) /* contain ost_layout */ +#define OBD_MD_FLPROJID (0x0100000000000000ULL) /* project ID */ +#define OBD_MD_SECCTX (0x0200000000000000ULL) /* embed security xattr */ + +#define OBD_MD_FLLAZYSIZE (0x0400000000000000ULL) /* Lazy size */ +#define OBD_MD_FLLAZYBLOCKS (0x0800000000000000ULL) /* Lazy blocks */ + +#define OBD_MD_FLALLQUOTA (OBD_MD_FLUSRQUOTA | \ + OBD_MD_FLGRPQUOTA | \ + OBD_MD_FLPRJQUOTA) + +#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ + OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ + OBD_MD_FLPARENT | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ + OBD_MD_FLPROJID) + +#define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) + +/* don't forget obdo_fid which is way down at the bottom so it can + * come after the definition of llog_cookie */ + +enum hss_valid { + HSS_SETMASK = 0x01, + HSS_CLEARMASK = 0x02, + HSS_ARCHIVE_ID = 0x04, +}; + +struct hsm_state_set { + __u32 hss_valid; + __u32 hss_archive_id; + __u64 hss_setmask; + __u64 hss_clearmask; +}; + +/* ost_body.data values for OST_BRW */ + +#define OBD_BRW_READ 0x01 +#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_NDELAY 0x04 /* Non-delay RPC should be issued for + * this page. Non-delay RPCs have bit + * rq_no_delay set. */ +#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous + * transfer and is not accounted in + * the grant. 
*/ +#define OBD_BRW_CHECK 0x10 +#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ +#define OBD_BRW_GRANTED 0x40 /* the ost manages this */ +#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ +#define OBD_BRW_NOQUOTA 0x100 +#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ +#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ +#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ +#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */ +#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */ +#define OBD_BRW_SOFT_SYNC 0x4000 /* This flag notifies the server + * that the client is running low on + * space for unstable pages; asking + * it to sync quickly */ +#define OBD_BRW_OVER_PRJQUOTA 0x8000 /* Running out of project quota */ + +#define OBD_BRW_OVER_ALLQUOTA (OBD_BRW_OVER_USRQUOTA | \ + OBD_BRW_OVER_GRPQUOTA | \ + OBD_BRW_OVER_PRJQUOTA) + +#define OBD_BRW_LOCAL1 0x80000000UL /* + * osd-ldiskfs internal, + * page mapped to real block + */ + +#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1) + +#define OBD_MAX_GRANT 0x7fffffffUL /* Max grant allowed to one client: 2 GiB */ + +#define OBD_OBJECT_EOF LUSTRE_EOF + +#define OST_MIN_PRECREATE 32 +#define OST_MAX_PRECREATE 20000 + +struct obd_ioobj { + struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ + __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, + * now (PTLRPC_BULK_OPS_COUNT - 1) in + * high 16 bits in 2.4 and later */ + __u32 ioo_bufcnt; /* number of niobufs for this object */ +}; + +/* NOTE: IOOBJ_MAX_BRW_BITS defines the _offset_ of the max_brw field in + * ioo_max_brw, NOT the maximum number of bits in PTLRPC_BULK_OPS_BITS. + * That said, ioo_max_brw is a 32-bit field so the limit is also 16 bits. */ +#define IOOBJ_MAX_BRW_BITS 16 +#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) +#define ioobj_max_brw_set(ioo, num) \ +do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) + +/* multiple of 8 bytes => can array */ +struct niobuf_remote { + __u64 rnb_offset; + __u32 rnb_len; + __u32 rnb_flags; +}; + +/* lock value block communicated between the filter and llite */ + +/* OST_LVB_ERR_INIT is needed because the return code in rc is + * negative, i.e. because ((MASK + rc) & MASK) != MASK. 
*/ +#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL +#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL +#define OST_LVB_IS_ERR(blocks) \ + ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) +#define OST_LVB_SET_ERR(blocks, rc) \ + do { blocks = OST_LVB_ERR_INIT + rc; } while (0) +#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) + +struct ost_lvb_v1 { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; +}; + +struct ost_lvb { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; + __u32 lvb_mtime_ns; + __u32 lvb_atime_ns; + __u32 lvb_ctime_ns; + __u32 lvb_padding; +}; + +/* + * lquota data structures + */ + +#ifndef QUOTABLOCK_BITS +# define QUOTABLOCK_BITS LUSTRE_QUOTABLOCK_BITS +#endif + +#ifndef QUOTABLOCK_SIZE +# define QUOTABLOCK_SIZE LUSTRE_QUOTABLOCK_SIZE +#endif + +#ifndef toqb +# define toqb lustre_stoqb +#endif + +/* The lquota_id structure is an union of all the possible identifier types that + * can be used with quota, this includes: + * - 64-bit user ID + * - 64-bit group ID + * - a FID which can be used for per-directory quota in the future */ +union lquota_id { + struct lu_fid qid_fid; /* FID for per-directory quota */ + __u64 qid_uid; /* user identifier */ + __u64 qid_gid; /* group identifier */ + __u64 qid_projid; /* project identifier */ +}; + +/* quotactl management */ +struct obd_quotactl { + __u32 qc_cmd; + __u32 qc_type; /* see Q_* flag below */ + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; +}; + +#define Q_COPY(out, in, member) (out)->member = (in)->member + +#define QCTL_COPY(out, in) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ +} while (0) + +/* Body of quota request used for quota acquire/release RPCs between quota + * master (aka QMT) and slaves (ak QSD). */ +struct quota_body { + struct lu_fid qb_fid; /* FID of global index packing the pool ID + * and type (data or metadata) as well as + * the quota type (user or group). */ + union lquota_id qb_id; /* uid or gid or directory FID */ + __u32 qb_flags; /* see below */ + __u32 qb_padding; + __u64 qb_count; /* acquire/release count (kbytes/inodes) */ + __u64 qb_usage; /* current slave usage (kbytes/inodes) */ + __u64 qb_slv_ver; /* slave index file version */ + struct lustre_handle qb_lockh; /* per-ID lock handle */ + struct lustre_handle qb_glb_lockh; /* global lock handle */ + __u64 qb_padding1[4]; +}; + +/* When the quota_body is used in the reply of quota global intent + * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. 
*/ +#define qb_slv_fid qb_fid +/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in + * quota reply */ +#define qb_qunit qb_usage + +#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */ +#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */ +#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */ +#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */ + +/* Quota types currently supported */ +enum { + LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */ + LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */ + LQUOTA_TYPE_PRJ = 0x02, /* maps to PRJQUOTA */ + LQUOTA_TYPE_MAX +}; + +/* There are 2 different resource types on which a quota limit can be enforced: + * - inodes on the MDTs + * - blocks on the OSTs */ +enum { + LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */ + LQUOTA_RES_DT = 0x02, + LQUOTA_LAST_RES, + LQUOTA_FIRST_RES = LQUOTA_RES_MD +}; +#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1) + +/* + * Space accounting support + * Format of an accounting record, providing disk usage information for a given + * user or group + */ +struct lquota_acct_rec { /* 16 bytes */ + __u64 bspace; /* current space in use */ + __u64 ispace; /* current # inodes in use */ +}; + +/* + * Global quota index support + * Format of a global record, providing global quota settings for a given quota + * identifier + */ +struct lquota_glb_rec { /* 32 bytes */ + __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */ + __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */ + __u64 qbr_time; /* grace time, in seconds */ + __u64 qbr_granted; /* how much is granted to slaves, in #inodes or + * kbytes */ +}; + +/* + * Slave index support + * Format of a slave record, recording how much space is granted to a given + * slave + */ +struct lquota_slv_rec { /* 8 bytes */ + __u64 qsr_granted; /* space granted to the slave for the key=ID, + * in #inodes or kbytes */ +}; + +/* Data structures associated with the quota locks */ + +/* Glimpse descriptor used for the index & per-ID quota locks */ +struct ldlm_gl_lquota_desc { + union lquota_id gl_id; /* quota ID subject to the glimpse */ + __u64 gl_flags; /* see LQUOTA_FL* below */ + __u64 gl_ver; /* new index version */ + __u64 gl_hardlimit; /* new hardlimit or qunit value */ + __u64 gl_softlimit; /* new softlimit */ + __u64 gl_time; + __u64 gl_pad2; +}; +#define gl_qunit gl_hardlimit /* current qunit value used when + * glimpsing per-ID quota locks */ + +/* quota glimpse flags */ +#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ + +/* LVB used with quota (global and per-ID) locks */ +struct lquota_lvb { + __u64 lvb_flags; /* see LQUOTA_FL* above */ + __u64 lvb_id_may_rel; /* space that might be released later */ + __u64 lvb_id_rel; /* space released by the slave for this ID */ + __u64 lvb_id_qunit; /* current qunit value */ + __u64 lvb_pad1; +}; + +/* LVB used with global quota lock */ +#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ + +/* op codes */ +enum quota_cmd { + QUOTA_DQACQ = 601, + QUOTA_DQREL = 602, + QUOTA_LAST_OPC +}; +#define QUOTA_FIRST_OPC QUOTA_DQACQ + +/* + * MDS REQ RECORDS + */ + +/* opcodes */ +enum mds_cmd { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GET_ROOT = 40, + MDS_STATFS = 41, + MDS_PIN = 42, /* obsolete, never used in a release */ + MDS_UNPIN = 43, /* obsolete, never used in a release */ + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, /* 
obsolete since 2.8.0 */ + MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, /* not used since 2.4 */ + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ + MDS_WRITEPAGE = 51, + MDS_IS_SUBDIR = 52, /* obsolete, never used in a release */ + MDS_GET_INFO = 53, + MDS_HSM_STATE_GET = 54, + MDS_HSM_STATE_SET = 55, + MDS_HSM_ACTION = 56, + MDS_HSM_PROGRESS = 57, + MDS_HSM_REQUEST = 58, + MDS_HSM_CT_REGISTER = 59, + MDS_HSM_CT_UNREGISTER = 60, + MDS_SWAP_LAYOUTS = 61, + MDS_RMFID = 62, + MDS_LAST_OPC +}; + +#define MDS_FIRST_OPC MDS_GETATTR + + +/* opcodes for object update */ +enum update_cmd { + OUT_UPDATE = 1000, + OUT_UPDATE_LAST_OPC +}; + +#define OUT_UPDATE_FIRST_OPC OUT_UPDATE + +/* + * Do not exceed 63 + */ + +enum mds_reint_op { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + REINT_SETXATTR = 7, + REINT_RMENTRY = 8, + REINT_MIGRATE = 9, + REINT_RESYNC = 10, + REINT_MAX +}; + +/* the disposition of the intent outlines what was executed */ +#define DISP_IT_EXECD 0x00000001 +#define DISP_LOOKUP_EXECD 0x00000002 +#define DISP_LOOKUP_NEG 0x00000004 +#define DISP_LOOKUP_POS 0x00000008 +#define DISP_OPEN_CREATE 0x00000010 +#define DISP_OPEN_OPEN 0x00000020 +#define DISP_ENQ_COMPLETE 0x00400000 /* obsolete and unused */ +#define DISP_ENQ_OPEN_REF 0x00800000 +#define DISP_ENQ_CREATE_REF 0x01000000 +#define DISP_OPEN_LOCK 0x02000000 +#define DISP_OPEN_LEASE 0x04000000 +#define DISP_OPEN_STRIPE 0x08000000 +#define DISP_OPEN_DENY 0x10000000 + +/* INODE LOCK PARTS */ +enum mds_ibits_locks { + MDS_INODELOCK_LOOKUP = 0x000001, /* For namespace, dentry etc. Was + * used to protect permission (mode, + * owner, group, etc) before 2.4. */ + MDS_INODELOCK_UPDATE = 0x000002, /* size, links, timestamps */ + MDS_INODELOCK_OPEN = 0x000004, /* For opened files */ + MDS_INODELOCK_LAYOUT = 0x000008, /* for layout */ + + /* The PERM bit is added in 2.4, and is used to protect permission + * (mode, owner, group, ACL, etc.) separate from LOOKUP lock. + * For remote directories (in DNE) these locks will be granted by + * different MDTs (different LDLM namespace). + * + * For local directory, the MDT always grants UPDATE|PERM together. + * For remote directory, master MDT (where remote directory is) grants + * UPDATE|PERM, and remote MDT (where name entry is) grants LOOKUP_LOCK. + */ + MDS_INODELOCK_PERM = 0x000010, + MDS_INODELOCK_XATTR = 0x000020, /* non-permission extended attrs */ + MDS_INODELOCK_DOM = 0x000040, /* Data for Data-on-MDT files */ + /* Do not forget to increase MDS_INODELOCK_NUMBITS when adding bits */ +}; +#define MDS_INODELOCK_NUMBITS 7 +/* This FULL lock is useful to take on unlink sort of operations */ +#define MDS_INODELOCK_FULL ((1 << MDS_INODELOCK_NUMBITS) - 1) +/* DOM lock shouldn't be canceled early, use this macro for ELC */ +#define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM) + +/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * name[2,3] fields that need to be used for the quota id (also a FID). 
*/ +enum { + LUSTRE_RES_ID_SEQ_OFF = 0, + LUSTRE_RES_ID_VER_OID_OFF = 1, + LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */ + LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2, + LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3, + LUSTRE_RES_ID_HSH_OFF = 3 +}; + +#define MDS_STATUS_CONN 1 +#define MDS_STATUS_LOV 2 + +enum { + /* these should be identical to their EXT4_*_FL counterparts, they are + * redefined here only to avoid dragging in fs/ext4/ext4.h */ + LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ + LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ + LUSTRE_APPEND_FL = 0x00000020, /* file writes may only append */ + LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ + LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ + LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ + LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ + LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ + LUSTRE_DIRECTIO_FL = 0x00100000, /* Use direct i/o */ + LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ + LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ + + /* These flags will not be identical to any EXT4_*_FL counterparts, + * and only reserved for lustre purpose. Note: these flags might + * be conflict with some of EXT4 flags, so + * 1. these conflict flags needs to be removed when the flag is + * wired by la_flags see osd_attr_get(). + * 2. If these flags needs to be stored into inode, they will be + * stored in LMA. see LMAI_XXXX */ + LUSTRE_ORPHAN_FL = 0x00002000, + LUSTRE_SET_SYNC_FL = 0x00040000, /* Synchronous setattr on OSTs */ + + LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, +}; + +#ifndef FS_XFLAG_SYNC +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#endif +#ifndef FS_XFLAG_NOATIME +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#endif +#ifndef FS_XFLAG_IMMUTABLE +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#endif +#ifndef FS_XFLAG_APPEND +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#endif +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#endif + +/* 64 possible states */ +enum md_transient_state { + MS_RESTORE = (1 << 0), /* restore is running */ +}; + +struct mdt_body { + struct lu_fid mbo_fid1; + struct lu_fid mbo_fid2; + struct lustre_handle mbo_open_handle; + __u64 mbo_valid; + __u64 mbo_size; /* Offset, in the case of MDS_READPAGE */ + __s64 mbo_mtime; + __s64 mbo_atime; + __s64 mbo_ctime; + __u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */ + __u64 mbo_version; /* was mbo_ioepoch before 2.11 */ + __u64 mbo_t_state; /* transient file state defined in + * enum md_transient_state + * was "ino" until 2.4.0 */ + __u32 mbo_fsuid; + __u32 mbo_fsgid; + __u32 mbo_capability; + __u32 mbo_mode; + __u32 mbo_uid; + __u32 mbo_gid; + __u32 mbo_flags; /* LUSTRE_*_FL file attributes */ + __u32 mbo_rdev; + __u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 mbo_layout_gen; /* was "generation" until 2.4.0 */ + __u32 mbo_suppgid; + __u32 mbo_eadatasize; + __u32 mbo_aclsize; + __u32 mbo_max_mdsize; + __u32 mbo_unused3; /* was max_cookiesize until 2.8 */ + __u32 mbo_uid_h; /* high 32-bits of uid, for FUID */ + __u32 mbo_gid_h; /* high 32-bits of gid, for FUID */ + __u32 mbo_projid; + __u64 mbo_dom_size; /* size of DOM component */ + __u64 mbo_dom_blocks; /* blocks consumed by DOM component */ + __u64 mbo_padding_8; /* also fix lustre_swab_mdt_body */ + __u64 mbo_padding_9; + __u64 
mbo_padding_10; +}; /* 216 */ + +struct mdt_ioepoch { + struct lustre_handle mio_open_handle; + __u64 mio_unused1; /* was ioepoch */ + __u32 mio_unused2; /* was flags */ + __u32 mio_padding; +}; + +/* permissions for md_perm.mp_perm */ +enum { + CFS_SETUID_PERM = 0x01, + CFS_SETGID_PERM = 0x02, + CFS_SETGRP_PERM = 0x04, +}; + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_cap; + __u32 sa_fsuid; + __u32 sa_fsuid_h; + __u32 sa_fsgid; + __u32 sa_fsgid_h; + __u32 sa_suppgid; + __u32 sa_suppgid_h; + __u32 sa_padding_1; + __u32 sa_padding_1_h; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + __s64 sa_mtime; + __s64 sa_atime; + __s64 sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_bias; /* some operation flags */ + __u32 sa_projid; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +/* + * Attribute flags used in mdt_rec_setattr::sa_valid. + * The kernel's #defines for ATTR_* should not be used over the network + * since the client and MDS may run different kernels (see bug 13828) + * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. + */ +#define MDS_ATTR_MODE 0x1ULL /* = 1 */ +#define MDS_ATTR_UID 0x2ULL /* = 2 */ +#define MDS_ATTR_GID 0x4ULL /* = 4 */ +#define MDS_ATTR_SIZE 0x8ULL /* = 8 */ +#define MDS_ATTR_ATIME 0x10ULL /* = 16 */ +#define MDS_ATTR_MTIME 0x20ULL /* = 32 */ +#define MDS_ATTR_CTIME 0x40ULL /* = 64 */ +#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */ +#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */ +#define MDS_ATTR_FORCE 0x200ULL /* = 512, Not a change, but a change it */ +#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */ +#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */ +#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */ +#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */ +#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ +#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ +#define MDS_ATTR_PROJID 0x10000ULL /* = 65536 */ +#define MDS_ATTR_LSIZE 0x20000ULL /* = 131072 */ +#define MDS_ATTR_LBLOCKS 0x40000ULL /* = 262144 */ +#define MDS_ATTR_OVERRIDE 0x2000000ULL /* = 33554432 */ + +enum mds_op_bias { +/* MDS_CHECK_SPLIT = 1 << 0, obsolete before 2.3.58 */ + /* used for remote object getattr/open by name: in the original + * getattr/open request, MDT found the object against name is on another + * MDT, then packed FID and LOOKUP lock in reply and returned -EREMOTE, + * and client knew it's a remote object, then set this flag in + * getattr/open request and sent to the corresponding MDT to finish + * getattr/open, which fetched attributes and UPDATE lock/opened file. 
+ */ + MDS_CROSS_REF = 1 << 1, +/* MDS_VTX_BYPASS = 1 << 2, obsolete since 2.3.54 */ + MDS_PERM_BYPASS = 1 << 3, +/* MDS_SOM = 1 << 4, obsolete since 2.8.0 */ + MDS_QUOTA_IGNORE = 1 << 5, +/* MDS_CLOSE_CLEANUP = 1 << 6, obsolete since 2.3.51 */ + MDS_KEEP_ORPHAN = 1 << 7, + MDS_RECOV_OPEN = 1 << 8, + MDS_DATA_MODIFIED = 1 << 9, + MDS_CREATE_VOLATILE = 1 << 10, + MDS_OWNEROVERRIDE = 1 << 11, + MDS_HSM_RELEASE = 1 << 12, + MDS_CLOSE_MIGRATE = 1 << 13, + MDS_CLOSE_LAYOUT_SWAP = 1 << 14, + MDS_CLOSE_LAYOUT_MERGE = 1 << 15, + MDS_CLOSE_RESYNC_DONE = 1 << 16, + MDS_CLOSE_LAYOUT_SPLIT = 1 << 17, + MDS_TRUNC_KEEP_LEASE = 1 << 18, + MDS_CLOSE_UPDATE_TIMES = 1 << 20, +}; + +#define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | \ + MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_LAYOUT_SPLIT | \ + MDS_CLOSE_RESYNC_DONE) + +/* instance of mdt_reint_rec */ +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_open_handle_old; /* in case of open replay */ + __s64 cr_time; + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_cap; + __u32 lk_fsuid; + __u32 lk_fsuid_h; + __u32 lk_fsgid; + __u32 lk_fsgid_h; + __u32 lk_suppgid1; + __u32 lk_suppgid1_h; + __u32 lk_suppgid2; + __u32 lk_suppgid2_h; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + __s64 lk_time; + __u64 lk_padding_1; /* rr_atime */ + __u64 lk_padding_2; /* rr_ctime */ + __u64 lk_padding_3; /* rr_size */ + __u64 lk_padding_4; /* rr_blocks */ + __u32 lk_bias; + __u32 lk_padding_5; /* rr_mode */ + __u32 lk_padding_6; /* rr_flags */ + __u32 lk_padding_7; /* rr_padding_2 */ + __u32 lk_padding_8; /* rr_padding_3 */ + __u32 lk_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_cap; + __u32 ul_fsuid; + __u32 ul_fsuid_h; + __u32 ul_fsgid; + __u32 ul_fsgid_h; + __u32 ul_suppgid1; + __u32 ul_suppgid1_h; + __u32 ul_suppgid2; + __u32 ul_suppgid2_h; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + __s64 ul_time; + __u64 ul_padding_2; /* rr_atime */ + __u64 ul_padding_3; /* rr_ctime */ + __u64 ul_padding_4; /* rr_size */ + __u64 ul_padding_5; /* rr_blocks */ + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; /* rr_flags */ + __u32 ul_padding_7; /* rr_padding_2 */ + __u32 ul_padding_8; /* rr_padding_3 */ + __u32 ul_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_cap; + __u32 rn_fsuid; + __u32 rn_fsuid_h; + __u32 rn_fsgid; + __u32 rn_fsgid_h; + __u32 rn_suppgid1; + __u32 rn_suppgid1_h; + __u32 rn_suppgid2; + __u32 rn_suppgid2_h; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + __s64 rn_time; + __u64 rn_padding_1; /* rr_atime */ + __u64 rn_padding_2; /* rr_ctime */ + __u64 rn_padding_3; /* rr_size */ + __u64 rn_padding_4; /* rr_blocks */ + __u32 
rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; /* rr_flags */ + __u32 rn_padding_6; /* rr_padding_2 */ + __u32 rn_padding_7; /* rr_padding_3 */ + __u32 rn_padding_8; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_cap; + __u32 sx_fsuid; + __u32 sx_fsuid_h; + __u32 sx_fsgid; + __u32 sx_fsgid_h; + __u32 sx_suppgid1; + __u32 sx_suppgid1_h; + __u32 sx_suppgid2; + __u32 sx_suppgid2_h; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three are rr_fid2 */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + __s64 sx_time; + __u64 sx_padding_5; /* rr_ctime */ + __u64 sx_padding_6; /* rr_size */ + __u64 sx_padding_7; /* rr_blocks */ + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; /* rr_flags */ + __u32 sx_padding_9; /* rr_padding_2 */ + __u32 sx_padding_10; /* rr_padding_3 */ + __u32 sx_padding_11; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec + * FLR: for file resync MDS_REINT_RESYNC RPC. */ +struct mdt_rec_resync { + __u32 rs_opcode; + __u32 rs_cap; + __u32 rs_fsuid; + __u32 rs_fsuid_h; + __u32 rs_fsgid; + __u32 rs_fsgid_h; + __u32 rs_suppgid1; + __u32 rs_suppgid1_h; + __u32 rs_suppgid2; + __u32 rs_suppgid2_h; + struct lu_fid rs_fid; + __u8 rs_padding0[sizeof(struct lu_fid)]; + struct lustre_handle rs_lease_handle; /* rr_mtime */ + __s64 rs_padding1; /* rr_atime */ + __s64 rs_padding2; /* rr_ctime */ + __u64 rs_padding3; /* rr_size */ + __u64 rs_padding4; /* rr_blocks */ + __u32 rs_bias; + __u32 rs_padding5; /* rr_mode */ + __u32 rs_padding6; /* rr_flags */ + __u32 rs_padding7; /* rr_flags_h */ + __u32 rs_padding8; /* rr_umask */ + __u16 rs_mirror_id; + __u16 rs_padding9; /* rr_padding_4 */ +}; + +/* + * mdt_rec_reint is the template for all mdt_reint_xxx structures. + * Do NOT change the size of various members, otherwise the value + * will be broken in lustre_swab_mdt_rec_reint(). + * + * If you add new members in other mdt_reint_xxx structres and need to use the + * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. 
+ */ +struct mdt_rec_reint { + __u32 rr_opcode; + __u32 rr_cap; + __u32 rr_fsuid; + __u32 rr_fsuid_h; + __u32 rr_fsgid; + __u32 rr_fsgid_h; + __u32 rr_suppgid1; + __u32 rr_suppgid1_h; + __u32 rr_suppgid2; + __u32 rr_suppgid2_h; + struct lu_fid rr_fid1; + struct lu_fid rr_fid2; + __s64 rr_mtime; + __s64 rr_atime; + __s64 rr_ctime; + __u64 rr_size; + __u64 rr_blocks; + __u32 rr_bias; + __u32 rr_mode; + __u32 rr_flags; + __u32 rr_flags_h; + __u32 rr_umask; + __u16 rr_mirror_id; + __u16 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ +}; + +#define LMV_DESC_QOS_MAXAGE_DEFAULT 60 /* Seconds */ + +/* lmv structures */ +struct lmv_desc { + __u32 ld_tgt_count; /* how many MDS's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default hash pattern */ + __u64 ld_default_hash_size; + __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ + struct obd_uuid ld_uuid; +}; + +/* LMV layout EA, and it will be stored both in master and slave object */ +struct lmv_mds_md_v1 { + __u32 lmv_magic; + __u32 lmv_stripe_count; + __u32 lmv_master_mdt_index; /* On master object, it is master + * MDT index, on slave object, it + * is stripe index of the slave obj */ + __u32 lmv_hash_type; /* dir stripe policy, i.e. indicate + * which hash function to be used, + * Note: only lower 16 bits is being + * used for now. Higher 16 bits will + * be used to mark the object status, + * for example migrating or dead. */ + __u32 lmv_layout_version; /* increased each time layout changed, + * by directory migration, restripe + * and LFSCK. */ + __u32 lmv_migrate_offset; /* once this is set, it means this + * directory is been migrated, stripes + * before this offset belong to target, + * from this to source. 
*/ + __u32 lmv_migrate_hash; /* hash type of source stripes of + * migrating directory */ + __u32 lmv_padding2; + __u64 lmv_padding3; + char lmv_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ +}; + +#define LMV_MAGIC_V1 0x0CD20CD0 /* normal stripe lmv magic */ +#define LMV_MAGIC LMV_MAGIC_V1 + +/* #define LMV_USER_MAGIC 0x0CD30CD0 */ +#define LMV_MAGIC_STRIPE 0x0CD40CD0 /* magic for dir sub_stripe */ + +/** + * The FNV-1a hash algorithm is as follows: + * hash = FNV_offset_basis + * for each octet_of_data to be hashed + * hash = hash XOR octet_of_data + * hash = hash × FNV_prime + * return hash + * http://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function#FNV-1a_hash + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-reference-source + * FNV_prime is 2^40 + 2^8 + 0xb3 = 0x100000001b3ULL + **/ +#define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL +#define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL +static inline __u64 lustre_hash_fnv_1a_64(const void *buf, __kernel_size_t size) +{ + __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; + const unsigned char *p = buf; + __kernel_size_t i; + + for (i = 0; i < size; i++) { + hash ^= p[i]; + hash *= LUSTRE_FNV_1A_64_PRIME; + } + + return hash; +} + +union lmv_mds_md { + __u32 lmv_magic; + struct lmv_mds_md_v1 lmv_md_v1; + struct lmv_user_md lmv_user_md; +}; + +static inline __kernel_ssize_t lmv_mds_md_size(int stripe_count, + unsigned int lmm_magic) +{ + __kernel_ssize_t len = -EINVAL; + + switch (lmm_magic) { + case LMV_MAGIC_V1: { + struct lmv_mds_md_v1 *lmm1; + + len = sizeof(*lmm1); + len += stripe_count * sizeof(lmm1->lmv_stripe_fids[0]); + break; } + default: + break; + } + return len; +} + +static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) +{ + switch (__le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return __le32_to_cpu(lmm->lmv_md_v1.lmv_stripe_count); + case LMV_USER_MAGIC: + return __le32_to_cpu(lmm->lmv_user_md.lum_stripe_count); + default: + return -EINVAL; + } +} + +static inline int lmv_mds_md_hash_type_get(const union lmv_mds_md *lmm) +{ + switch (__le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return __le32_to_cpu(lmm->lmv_md_v1.lmv_hash_type); + case LMV_USER_MAGIC: + return __le32_to_cpu(lmm->lmv_user_md.lum_hash_type); + default: + return -EINVAL; + } +} + +enum fld_rpc_opc { + FLD_QUERY = 900, + FLD_READ = 901, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + +enum fld_op { + FLD_CREATE = 0, + FLD_DELETE = 1, + FLD_LOOKUP = 2, +}; + +/* LFSCK opcodes */ +enum lfsck_cmd { + LFSCK_NOTIFY = 1101, + LFSCK_QUERY = 1102, + LFSCK_LAST_OPC, + LFSCK_FIRST_OPC = LFSCK_NOTIFY +}; + +/* + * LOV data structures + */ + +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. 
With the current + * protocol, this will limit the max number of OSTs per LOV */ + +#define LOV_DESC_MAGIC 0xB0CCDE5C +#define LOV_DESC_QOS_MAXAGE_DEFAULT 5 /* Seconds */ +#define LOV_DESC_STRIPE_SIZE_DEFAULT (1 << LNET_MTU_BITS) + +/* LOV settings descriptor (should only contain static info) */ +struct lov_desc { + __u32 ld_tgt_count; /* how many OBD's */ + __u32 ld_active_tgt_count; /* how many active */ + __s32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ + __u64 ld_default_stripe_size; /* in bytes */ + __s64 ld_default_stripe_offset; /* starting OST index */ + __u32 ld_padding_0; /* unused */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ + struct obd_uuid ld_uuid; +}; + +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + +/* + * LDLM requests: + */ +/* opcodes -- MUST be distinct from OST/MDS opcodes */ +enum ldlm_cmd { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_SET_INFO = 107, + LDLM_LAST_OPC +}; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define RES_NAME_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + +#define DLDLMRES "[%#llx:%#llx:%#llx].%#llx" +#define PLDLMRES(res) (unsigned long long)(res)->lr_name.name[0], \ + (unsigned long long)(res)->lr_name.name[1], \ + (unsigned long long)(res)->lr_name.name[2], \ + (unsigned long long)(res)->lr_name.name[3] + +/* lock types */ +enum ldlm_mode { + LCK_MINMODE = 0, + LCK_EX = 1, + LCK_PW = 2, + LCK_PR = 4, + LCK_CW = 8, + LCK_CR = 16, + LCK_NL = 32, + LCK_GROUP = 64, + LCK_COS = 128, + LCK_MAXMODE +}; + +#define LCK_MODE_NUM 8 + +enum ldlm_type { + LDLM_PLAIN = 10, + LDLM_EXTENT = 11, + LDLM_FLOCK = 12, + LDLM_IBITS = 13, + LDLM_MAX_TYPE +}; + +#define LDLM_MIN_TYPE LDLM_PLAIN + +struct ldlm_extent { + __u64 start; + __u64 end; + __u64 gid; +}; + +static inline bool ldlm_extent_equal(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start == ex2->start && ex1->end == ex2->end; +} + +struct ldlm_inodebits { + __u64 bits; + union { + __u64 try_bits; /* optional bits to try */ + __u64 cancel_bits; /* for lock convert */ + }; +}; + +struct ldlm_flock_wire { + __u64 lfw_start; + __u64 lfw_end; + __u64 lfw_owner; + __u32 lfw_padding; + __u32 lfw_pid; +}; + +/* it's important that the fields of the ldlm_extent structure match + * the first fields of the ldlm_flock structure because there is only + * one ldlm_swab routine to process the ldlm_policy_data_t union. if + * this ever changes we will need to swab the union differently based + * on the resource type. */ + +union ldlm_wire_policy_data { + struct ldlm_extent l_extent; + struct ldlm_flock_wire l_flock; + struct ldlm_inodebits l_inodebits; +}; + +struct barrier_lvb { + __u32 lvb_status; + __u32 lvb_index; + __u64 lvb_padding; +}; + +struct ldlm_gl_barrier_desc { + __u32 lgbd_status; + __u32 lgbd_timeout; + __u64 lgbd_padding; +}; + +union ldlm_gl_desc { + struct ldlm_gl_lquota_desc lquota_desc; + struct ldlm_gl_barrier_desc barrier_desc; +}; + +enum ldlm_intent_flags { + IT_OPEN = 0x00000001, + IT_CREAT = 0x00000002, + IT_OPEN_CREAT = IT_OPEN | IT_CREAT, /* To allow case label. */ + IT_READDIR = 0x00000004, /* Used by mdc, not put on the wire. */ + IT_GETATTR = 0x00000008, + IT_LOOKUP = 0x00000010, +/* IT_UNLINK = 0x00000020, Obsolete. 
*/ +/* IT_TRUNC = 0x00000040, Obsolete. */ + IT_GETXATTR = 0x00000080, +/* IT_EXEC = 0x00000100, Obsolete. */ +/* IT_PIN = 0x00000200, Obsolete. */ + IT_LAYOUT = 0x00000400, + IT_QUOTA_DQACQ = 0x00000800, + IT_QUOTA_CONN = 0x00001000, +/* IT_SETXATTR = 0x00002000, Obsolete. */ + IT_GLIMPSE = 0x00004000, + IT_BRW = 0x00008000, +}; + +struct ldlm_intent { + __u64 opc; +}; + +struct ldlm_resource_desc { + enum ldlm_type lr_type; + __u32 lr_pad; /* also fix lustre_swab_ldlm_resource_desc */ + struct ldlm_res_id lr_name; +}; + +struct ldlm_lock_desc { + struct ldlm_resource_desc l_resource; + enum ldlm_mode l_req_mode; + enum ldlm_mode l_granted_mode; + union ldlm_wire_policy_data l_policy_data; +}; + +#define LDLM_LOCKREQ_HANDLES 2 +#define LDLM_ENQUEUE_CANCEL_OFF 1 + +struct ldlm_request { + __u32 lock_flags; /* LDLM_FL_*, see lustre_dlm_flags.h */ + __u32 lock_count; /* number of locks in lock_handle[] */ + struct ldlm_lock_desc lock_desc;/* lock descriptor */ + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; +}; + +struct ldlm_reply { + __u32 lock_flags; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle; + __u64 lock_policy_res1; + __u64 lock_policy_res2; +}; + +#define ldlm_flags_to_wire(flags) ((__u32)(flags)) +#define ldlm_flags_from_wire(flags) ((__u64)(flags)) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +enum mgs_cmd { + MGS_CONNECT = 250, + MGS_DISCONNECT = 251, + MGS_EXCEPTION = 252, /* node died, etc. */ + MGS_TARGET_REG = 253, /* whenever target starts up */ + MGS_TARGET_DEL = 254, + MGS_SET_INFO = 255, + MGS_CONFIG_READ = 256, + MGS_LAST_OPC, + MGS_FIRST_OPC = MGS_CONNECT +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +#define MGS_PARAM_MAXLEN 1024 +#define KEY_SET_INFO "set_info" + +struct mgs_send_param { + char mgs_param[MGS_PARAM_MAXLEN]; +}; +#endif + +/* We pass this info to the MGS so it can write config logs */ +#define MTI_NAME_MAXLEN 64 +#define MTI_PARAM_MAXLEN 4096 +#define MTI_NIDS_MAX 32 +struct mgs_target_info { + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; /* LDD_F_* */ + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t) */ + char mti_params[MTI_PARAM_MAXLEN]; +}; + +struct mgs_nidtbl_entry { + __u64 mne_version; /* table version of this entry */ + __u32 mne_instance; /* target instance # */ + __u32 mne_index; /* target index */ + __u32 mne_length; /* length of this entry - by bytes */ + __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ + __u8 mne_nid_type; /* type of nid(mbz). for ipv6. */ + __u8 mne_nid_size; /* size of each NID, by bytes */ + __u8 mne_nid_count; /* # of NIDs in buffer */ + union { + lnet_nid_t nids[0]; /* variable size buffer for NIDs. 
*/ + } u; +}; + +enum mgs_cfg_type { + MGS_CFG_T_CONFIG = 0, + MGS_CFG_T_SPTLRPC = 1, + MGS_CFG_T_RECOVER = 2, + MGS_CFG_T_PARAMS = 3, + MGS_CFG_T_NODEMAP = 4, + MGS_CFG_T_BARRIER = 5, + MGS_CFG_T_MAX +}; + +struct mgs_config_body { + char mcb_name[MTI_NAME_MAXLEN]; /* logname */ + __u64 mcb_offset; /* next index of config log to request */ + __u16 mcb_type; /* type of log: MGS_CFG_T_[CONFIG|RECOVER] */ + __u8 mcb_nm_cur_pass; + __u8 mcb_bits; /* bits unit size of config log */ + __u32 mcb_units; /* # of units for bulk transfer */ +}; + +struct mgs_config_res { + __u64 mcr_offset; /* index of last config log */ + union { + __u64 mcr_size; /* size of the log */ + __u64 mcr_nm_cur_pass; /* current nodemap config pass */ + }; +}; + +/* Config marker flags (in config log) */ +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_EXCLUDE 0x10 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + __u32 cm_vers; /* lustre release version number */ + __u32 cm_padding; /* 64 bit align */ + __s64 cm_createtime; /*when this record was first created */ + __s64 cm_canceltime; /*when this record is no longer valid*/ + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +/* + * Opcodes for multiple servers. + */ +enum obd_cmd { + OBD_PING = 400, +/* OBD_LOG_CANCEL = 401, obsolete since 1.5 */ +/* OBD_QC_CALLBACK = 402, obsolete since 2.4 */ + OBD_IDX_READ = 403, + OBD_LAST_OPC, + OBD_FIRST_OPC = OBD_PING +}; + +/** + * llog contexts indices. + * + * There is compatibility problem with indexes below, they are not + * continuous and must keep their numbers for compatibility needs. + * See LU-5218 for details. + */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT = 1, + LLOG_MDS_OST_ORIG_CTXT = 2, + LLOG_MDS_OST_REPL_CTXT = 3, /* kept just to avoid re-assignment */ + LLOG_SIZE_ORIG_CTXT = 4, + LLOG_SIZE_REPL_CTXT = 5, + LLOG_TEST_ORIG_CTXT = 8, + LLOG_TEST_REPL_CTXT = 9, /* kept just to avoid re-assignment */ + LLOG_CHANGELOG_ORIG_CTXT = 12, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT = 13, /**< changelog access on clients */ + /* for multiple changelog consumers */ + LLOG_CHANGELOG_USER_ORIG_CTXT = 14, + LLOG_AGENT_ORIG_CTXT = 15, /**< agent requests generation on cdt */ + LLOG_UPDATELOG_ORIG_CTXT = 16, /* update log */ + LLOG_UPDATELOG_REPL_CTXT = 17, /* update log */ + LLOG_MAX_CTXTS +}; + +/** Identifier for a single log object */ +struct llog_logid { + struct ost_id lgl_oi; + __u32 lgl_ogen; +} __attribute__((packed)); + +/** Records written to the CATALOGS list */ +#define CATLIST "CATALOGS" +struct llog_catid { + struct llog_logid lci_logid; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; +} __attribute__((packed)); + +/* Log data record types - there is no specific reason that these need to + * be related to the RPC opcodes, but no reason not to (may be handy later?) 
+ */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +enum llog_op_type { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | + REINT_UNLINK, /* obsolete after 2.5.0 */ + MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_UNLINK, + /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ + MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ + CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, + CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + CHANGELOG_USER_REC2 = LLOG_OP_MAGIC | 0x70002, + HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, + UPDATE_REC = LLOG_OP_MAGIC | 0xa0000, + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +}; + +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) + +/** Log record header - stored in little endian order. + * Each record must start with this struct, end with a llog_rec_tail, + * and be a multiple of 256 bits in size. + */ +struct llog_rec_hdr { + __u32 lrh_len; + __u32 lrh_index; + __u32 lrh_type; + __u32 lrh_id; +} __attribute__((packed)); + +struct llog_rec_tail { + __u32 lrt_len; + __u32 lrt_index; +} __attribute__((packed)); + +/* Where data follow just after header */ +#define REC_DATA(ptr) \ + ((void *)((char *)ptr + sizeof(struct llog_rec_hdr))) + +#define REC_DATA_LEN(rec) \ + (rec->lrh_len - sizeof(struct llog_rec_hdr) - \ + sizeof(struct llog_rec_tail)) + +struct llog_logid_rec { + struct llog_rec_hdr lid_hdr; + struct llog_logid lid_id; + __u32 lid_padding1; + __u64 lid_padding2; + __u64 lid_padding3; + struct llog_rec_tail lid_tail; +} __attribute__((packed)); + +struct llog_unlink_rec { + struct llog_rec_hdr lur_hdr; + __u64 lur_oid; + __u32 lur_oseq; + __u32 lur_count; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_unlink64_rec { + struct llog_rec_hdr lur_hdr; + struct lu_fid lur_fid; + __u32 lur_count; /* to destroy the lost precreated */ + __u32 lur_padding1; + __u64 lur_padding2; + __u64 lur_padding3; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_valid; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +/* Extended to support project quota */ +struct llog_setattr64_rec_v2 { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_valid; + __u32 lsr_projid; + __u32 lsr_layout_version; + __u64 lsr_padding2; + __u64 lsr_padding3; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +struct llog_size_change_rec { + struct llog_rec_hdr lsc_hdr; + struct ll_fid lsc_fid; + __u32 lsc_ioepoch; + __u32 lsc_padding1; + __u64 lsc_padding2; + __u64 lsc_padding3; + struct llog_rec_tail lsc_tail; +} __attribute__((packed)); + +#define CHANGELOG_MAGIC 0xca103000 + +/** \a changelog_rec_type's that can't be masked */ +#define CHANGELOG_MINMASK (1 << CL_MARK) +/** bits covering all \a changelog_rec_type's */ +#define 
CHANGELOG_ALLMASK 0XFFFFFFFF +/** default \a changelog_rec_type mask. Allow all of them, except + * CL_ATIME since it can really be time consuming, and not necessary + * under normal use. + * Remove also CL_OPEN, CL_GETXATTR and CL_DN_OPEN from default list as it can + * be costly and only necessary for audit purpose. + */ +#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & \ + ~(1 << CL_ATIME | 1 << CL_OPEN | 1 << CL_GETXATTR | \ + 1 << CL_DN_OPEN)) + +/* changelog llog name, needed by client replicators */ +#define CHANGELOG_CATALOG "changelog_catalog" + +struct changelog_setinfo { + __u64 cs_recno; + __u32 cs_id; +} __attribute__((packed)); + +/** changelog record */ +struct llog_changelog_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; /**< Variable length field */ + struct llog_rec_tail cr_do_not_use; /**< for_sizeof_only */ +} __attribute__((packed)); + +#define CHANGELOG_USER_PREFIX "cl" + +struct llog_changelog_user_rec { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + /* only intended to be used in relative time comparisons to + * detect idle users */ + __u32 cur_time; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +enum agent_req_status { + ARS_WAITING, + ARS_STARTED, + ARS_FAILED, + ARS_CANCELED, + ARS_SUCCEED, +}; + +static inline const char *agent_req_status2name(enum agent_req_status ars) +{ + switch (ars) { + case ARS_WAITING: + return "WAITING"; + case ARS_STARTED: + return "STARTED"; + case ARS_FAILED: + return "FAILED"; + case ARS_CANCELED: + return "CANCELED"; + case ARS_SUCCEED: + return "SUCCEED"; + default: + return "UNKNOWN"; + } +} + +struct llog_agent_req_rec { + struct llog_rec_hdr arr_hdr; /**< record header */ + __u32 arr_status; /**< status of the request */ + /* must match enum + * agent_req_status */ + __u32 arr_archive_id; /**< backend archive number */ + __u64 arr_flags; /**< req flags */ + __u64 arr_compound_id; /**< compound cookie, ignored */ + __u64 arr_req_create; /**< req. creation time */ + __u64 arr_req_change; /**< req. status change time */ + struct hsm_action_item arr_hai; /**< req. 
to the agent */ + struct llog_rec_tail arr_tail; /**< record tail for_sizezof_only */ +} __attribute__((packed)); + +/* Old llog gen for compatibility */ +struct llog_gen { + __u64 mnt_cnt; + __u64 conn_cnt; +} __attribute__((packed)); + +struct llog_gen_rec { + struct llog_rec_hdr lgr_hdr; + struct llog_gen lgr_gen; + __u64 padding1; + __u64 padding2; + __u64 padding3; + struct llog_rec_tail lgr_tail; +}; + +/* flags for the logs */ +enum llog_flag { + LLOG_F_ZAP_WHEN_EMPTY = 0x1, + LLOG_F_IS_CAT = 0x2, + LLOG_F_IS_PLAIN = 0x4, + LLOG_F_EXT_JOBID = 0x8, + LLOG_F_IS_FIXSIZE = 0x10, + LLOG_F_EXT_EXTRA_FLAGS = 0x20, + LLOG_F_EXT_X_UIDGID = 0x40, + LLOG_F_EXT_X_NID = 0x80, + LLOG_F_EXT_X_OMODE = 0x100, + LLOG_F_EXT_X_XATTR = 0x200, + LLOG_F_RM_ON_ERR = 0x400, + + /* Note: Flags covered by LLOG_F_EXT_MASK will be inherited from + * catlog to plain log, so do not add LLOG_F_IS_FIXSIZE here, + * because the catlog record is usually fixed size, but its plain + * log record can be variable */ + LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID | LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | LLOG_F_EXT_X_XATTR, +}; + +/* means first record of catalog */ +enum { + LLOG_CAT_FIRST = -1, +}; + +/* On-disk header structure of each log object, stored in little endian order */ +#define LLOG_MIN_CHUNK_SIZE 8192 +#define LLOG_HEADER_SIZE (96) /* sizeof (llog_log_hdr) + sizeof(llh_tail) + * - sizeof(llh_bitmap) */ +#define LLOG_BITMAP_BYTES (LLOG_MIN_CHUNK_SIZE - LLOG_HEADER_SIZE) +#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */ + +struct llog_log_hdr { + struct llog_rec_hdr llh_hdr; + __s64 llh_timestamp; + __u32 llh_count; + __u32 llh_bitmap_offset; + __u32 llh_size; + __u32 llh_flags; + /* for a catalog the first/oldest and still in-use plain slot is just + * next to it. It will serve as the upper limit after Catalog has + * wrapped around */ + __u32 llh_cat_idx; + struct obd_uuid llh_tgtuuid; + __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32)-23]; + /* These fields must always be at the end of the llog_log_hdr. + * Note: llh_bitmap size is variable because llog chunk size could be + * bigger than LLOG_MIN_CHUNK_SIZE, i.e. sizeof(llog_log_hdr) > 8192 + * bytes, and the real size is stored in llh_hdr.lrh_len, which means + * llh_tail should only be refered by LLOG_HDR_TAIL(). + * But this structure is also used by client/server llog interface + * (see llog_client.c), it will be kept in its original way to avoid + * compatiblity issue. 
*/ + __u32 llh_bitmap[LLOG_BITMAP_BYTES / sizeof(__u32)]; + struct llog_rec_tail llh_tail; +} __attribute__((packed)); +#undef LLOG_HEADER_SIZE +#undef LLOG_BITMAP_BYTES + +#define LLOG_HDR_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \ + llh->llh_bitmap_offset - \ + sizeof(llh->llh_tail)) * 8) +#define LLOG_HDR_BITMAP(llh) (__u32 *)((char *)(llh) + \ + (llh)->llh_bitmap_offset) +#define LLOG_HDR_TAIL(llh) ((struct llog_rec_tail *)((char *)llh + \ + llh->llh_hdr.lrh_len - \ + sizeof(llh->llh_tail))) + +/** log cookies are used to reference a specific log file and a record therein, + and pass record offset from llog_process_thread to llog_write */ +struct llog_cookie { + union { + struct llog_logid lgc_lgl; + __u64 lgc_offset; + }; + __u32 lgc_subsys; + __u32 lgc_index; + __u32 lgc_padding; +} __attribute__((packed)); + +/** llog protocol */ +enum llogd_rpc_ops { + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, +/* LLOG_ORIGIN_HANDLE_WRITE_REC = 504, Obsolete by 2.1. */ +/* LLOG_ORIGIN_HANDLE_CLOSE = 505, Obsolete by 1.8. */ +/* LLOG_ORIGIN_CONNECT = 506, Obsolete by 2.4. */ +/* LLOG_CATINFO = 507, Obsolete by 2.3. */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* Obsolete by 2.11. */ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE +}; + +struct llogd_body { + struct llog_logid lgd_logid; + __u32 lgd_ctxt_idx; + __u32 lgd_llh_flags; + __u32 lgd_index; + __u32 lgd_saved_index; + __u32 lgd_len; + __u64 lgd_cur_offset; +} __attribute__((packed)); + +struct llogd_conn_body { + struct llog_gen lgdc_gen; + struct llog_logid lgdc_logid; + __u32 lgdc_ctxt_idx; +} __attribute__((packed)); + +/* Note: 64-bit types are 64-bit aligned in structure */ +struct obdo { + __u64 o_valid; /* hot fields in this obdo */ + struct ost_id o_oi; + __u64 o_parent_seq; + __u64 o_size; /* o_size-o_blocks == ost_lvb */ + __s64 o_mtime; + __s64 o_atime; + __s64 o_ctime; + __u64 o_blocks; /* brw: cli sent cached bytes */ + __u64 o_grant; + + /* 32-bit fields start here: keep an even number of them via padding */ + __u32 o_blksize; /* optimal IO blocksize */ + __u32 o_mode; /* brw: cli sent cache remain */ + __u32 o_uid; + __u32 o_gid; + __u32 o_flags; + __u32 o_nlink; /* brw: checksum */ + __u32 o_parent_oid; + __u32 o_misc; /* brw: o_dropped */ + + __u64 o_ioepoch; /* epoch in ost writes */ + __u32 o_stripe_idx; /* holds stripe idx */ + __u32 o_parent_ver; + struct lustre_handle o_handle; /* brw: lock handle to prolong + * locks */ + /* Originally, the field is llog_cookie for destroy with unlink cookie + * from MDS, it is obsolete in 2.8. Then reuse it by client to transfer + * layout and PFL information in IO, setattr RPCs. Since llog_cookie is + * not used on wire any longer, remove it from the obdo, then it can be + * enlarged freely in the further without affect related RPCs. + * + * sizeof(ost_layout) + sieof(__u32) == sizeof(llog_cookie). */ + struct ost_layout o_layout; + __u32 o_layout_version; + __u32 o_uid_h; + __u32 o_gid_h; + + __u64 o_data_version; /* getattr: sum of iversion for + * each stripe. 
+ * brw: grant space consumed on + * the client for the write */ + __u32 o_projid; + __u32 o_padding_4; /* also fix + * lustre_swab_obdo() */ + __u64 o_padding_5; + __u64 o_padding_6; +}; + +#define o_dirty o_blocks +#define o_undirty o_mode +#define o_dropped o_misc +#define o_cksum o_nlink +#define o_grant_used o_data_version + +struct lfsck_request { + __u32 lr_event; + __u32 lr_index; + __u32 lr_flags; + __u32 lr_valid; + union { + __u32 lr_speed; + __u32 lr_status; + }; + __u16 lr_version; + __u16 lr_active; + __u16 lr_param; + __u16 lr_async_windows; + __u32 lr_flags2; + struct lu_fid lr_fid; + struct lu_fid lr_fid2; + __u32 lr_comp_id; + __u32 lr_padding_0; + __u64 lr_padding_1; + __u64 lr_padding_2; + __u64 lr_padding_3; +}; + +struct lfsck_reply { + __u32 lr_status; + __u32 lr_padding_1; + __u64 lr_repaired; +}; + +enum lfsck_events { + LE_LASTID_REBUILDING = 1, + LE_LASTID_REBUILT = 2, + LE_PHASE1_DONE = 3, + LE_PHASE2_DONE = 4, + LE_START = 5, + LE_STOP = 6, + LE_QUERY = 7, + /* LE_FID_ACCESSED = 8, moved to lfsck_events_local */ + LE_PEER_EXIT = 9, + LE_CONDITIONAL_DESTROY = 10, + LE_PAIRS_VERIFY = 11, + LE_SET_LMV_MASTER = 15, + LE_SET_LMV_SLAVE = 16, +}; + +enum lfsck_event_flags { + LEF_TO_OST = 0x00000001, + LEF_FROM_OST = 0x00000002, + LEF_SET_LMV_HASH = 0x00000004, + LEF_SET_LMV_ALL = 0x00000008, + LEF_RECHECK_NAME_HASH = 0x00000010, + LEF_QUERY_ALL = 0x00000020, +}; + +/* request structure for OST's */ +struct ost_body { + struct obdo oa; +}; + +/* Key for FIEMAP to be used in get_info calls */ +struct ll_fiemap_info_key { + char lfik_name[8]; + struct obdo lfik_oa; + struct fiemap lfik_fiemap; +}; + +#define IDX_INFO_MAGIC 0x3D37CC37 + +/* Index file transfer through the network. The server serializes the index into + * a byte stream which is sent to the client via a bulk transfer */ +struct idx_info { + __u32 ii_magic; + + /* reply: see idx_info_flags below */ + __u32 ii_flags; + + /* request & reply: number of lu_idxpage (to be) transferred */ + __u16 ii_count; + __u16 ii_pad0; + + /* request: requested attributes passed down to the iterator API */ + __u32 ii_attrs; + + /* request & reply: index file identifier (FID) */ + struct lu_fid ii_fid; + + /* reply: version of the index file before starting to walk the index. 
+ * Please note that the version can be modified at any time during the + * transfer */ + __u64 ii_version; + + /* request: hash to start with: + * reply: hash of the first entry of the first lu_idxpage and hash + * of the entry to read next if any */ + __u64 ii_hash_start; + __u64 ii_hash_end; + + /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is + * set */ + __u16 ii_keysize; + + /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC + * is set */ + __u16 ii_recsize; + + __u32 ii_pad1; + __u64 ii_pad2; + __u64 ii_pad3; +}; + +#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */ + +/* List of flags used in idx_info::ii_flags */ +enum idx_info_flags { + II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */ + II_FL_VARKEY = 1 << 1, /* keys can be of variable size */ + II_FL_VARREC = 1 << 2, /* records can be of variable size */ + II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */ + II_FL_NOKEY = 1 << 4, /* client doesn't care about key */ +}; + +#define LIP_MAGIC 0x8A6D6B6C + +/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */ +struct lu_idxpage { + /* 16-byte header */ + __u32 lip_magic; + __u16 lip_flags; + __u16 lip_nr; /* number of entries in the container */ + __u64 lip_pad0; /* additional padding for future use */ + + /* key/record pairs are stored in the remaining 4080 bytes. + * depending upon the flags in idx_info::ii_flags, each key/record + * pair might be preceded by: + * - a hash value + * - the key size (II_FL_VARKEY is set) + * - the record size (II_FL_VARREC is set) + * + * For the time being, we only support fixed-size key & record. */ + char lip_entries[0]; +}; + +#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries)) + +/* Gather all possible type associated with a 4KB container */ +union lu_page { + struct lu_dirpage lp_dir; /* for MDS_READPAGE */ + struct lu_idxpage lp_idx; /* for OBD_IDX_READ */ + char lp_array[LU_PAGE_SIZE]; +}; + +/* security opcodes */ +enum sec_cmd { + SEC_CTX_INIT = 801, + SEC_CTX_INIT_CONT = 802, + SEC_CTX_FINI = 803, + SEC_LAST_OPC, + SEC_FIRST_OPC = SEC_CTX_INIT +}; + +/* + * capa related definitions + */ +#define CAPA_HMAC_MAX_LEN 64 +#define CAPA_HMAC_KEY_MAX_LEN 56 + +/* NB take care when changing the sequence of elements this struct, + * because the offset info is used in find_capa() */ +struct lustre_capa { + struct lu_fid lc_fid; /** fid */ + __u64 lc_opc; /** operations allowed */ + __u64 lc_uid; /** file owner */ + __u64 lc_gid; /** file group */ + __u32 lc_flags; /** HMAC algorithm & flags */ + __u32 lc_keyid; /** key# used for the capability */ + __u32 lc_timeout; /** capa timeout value (sec) */ + __u32 lc_expiry; /** expiry time (sec) */ + __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */ +} __attribute__((packed)); + +/** lustre_capa::lc_opc */ +enum { + CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */ + CAPA_OPC_BODY_READ = 1<<1, /**< read object data */ + CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */ + CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */ + CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */ + CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */ + CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */ + CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */ + CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */ + CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */ + CAPA_OPC_META_READ = 1<<10, /**< read object meta data */ +}; + +#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE) 
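/*
 * Editor's illustration (not part of the patch): lustre_capa::lc_opc above is
 * a bitmask of the CAPA_OPC_* values, so a capability permits an operation
 * only when every required bit is present. A minimal check could look like
 * the sketch below; the helper name is hypothetical.
 */
static inline int lustre_capa_opc_allowed(const struct lustre_capa *capa,
					  __u64 required_opc)
{
	return (capa->lc_opc & required_opc) == required_opc;
}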
+#define CAPA_OPC_MDS_ONLY \ + (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \ + CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE) +#define CAPA_OPC_OSS_ONLY \ + (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \ + CAPA_OPC_OSS_DESTROY) +#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY +#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY) + +/* lustre_capa::lc_hmac_alg */ +enum { + CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */ + CAPA_HMAC_ALG_MAX, +}; + +#define CAPA_FL_MASK 0x00ffffff +#define CAPA_HMAC_ALG_MASK 0xff000000 + +struct lustre_capa_key { + __u64 lk_seq; /**< mds# */ + __u32 lk_keyid; /**< key# */ + __u32 lk_padding; + __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */ +} __attribute__((packed)); + +/** The link ea holds 1 \a link_ea_entry for each hardlink */ +#define LINK_EA_MAGIC 0x11EAF1DFUL +struct link_ea_header { + __u32 leh_magic; + __u32 leh_reccount; + __u64 leh_len; /* total size */ + __u32 leh_overflow_time; + __u32 leh_padding; +}; + +/** Hardlink data is name and parent fid. + * Stored in this crazy struct for maximum packing and endian-neutrality + */ +struct link_ea_entry { + /** __u16 stored big-endian, unaligned */ + unsigned char lee_reclen[2]; + unsigned char lee_parent_fid[sizeof(struct lu_fid)]; + char lee_name[0]; +} __attribute__((packed)); + +/** fid2path request/reply structure */ +struct getinfo_fid2path { + struct lu_fid gf_fid; + __u64 gf_recno; + __u32 gf_linkno; + __u32 gf_pathlen; + union { + char gf_path[0]; + struct lu_fid gf_root_fid[0]; + } gf_u; +} __attribute__((packed)); + +/** path2parent request/reply structures */ +struct getparent { + struct lu_fid gp_fid; /**< parent FID */ + __u32 gp_linkno; /**< hardlink number */ + __u32 gp_name_size; /**< size of the name field */ + char gp_name[0]; /**< zero-terminated link name */ +} __attribute__((packed)); + +enum layout_intent_opc { + LAYOUT_INTENT_ACCESS = 0, /** generic access */ + LAYOUT_INTENT_READ = 1, /** not used */ + LAYOUT_INTENT_WRITE = 2, /** write file, for comp layout */ + LAYOUT_INTENT_GLIMPSE = 3, /** not used */ + LAYOUT_INTENT_TRUNC = 4, /** truncate file, for comp layout */ + LAYOUT_INTENT_RELEASE = 5, /** reserved for HSM release */ + LAYOUT_INTENT_RESTORE = 6, /** reserved for HSM restore */ +}; + +/* enqueue layout lock with intent */ +struct layout_intent { + __u32 li_opc; /* intent operation for enqueue, read, write etc */ + __u32 li_flags; + struct lu_extent li_extent; +} __attribute__((packed)); + +/** + * On the wire version of hsm_progress structure. + * + * Contains the userspace hsm_progress and some internal fields. + */ +struct hsm_progress_kernel { + /* Field taken from struct hsm_progress */ + struct lu_fid hpk_fid; + __u64 hpk_cookie; + struct hsm_extent hpk_extent; + __u16 hpk_flags; + __u16 hpk_errval; /* positive val */ + __u32 hpk_padding1; + /* Additional fields */ + __u64 hpk_data_version; + __u64 hpk_padding2; +} __attribute__((packed)); + +/** + * OUT_UPDATE RPC Format + * + * During the cross-ref operation, the Master MDT, which the client send the + * request to, will disassembly the operation into object updates, then OSP + * will send these updates to the remote MDT to be executed. + * + * An UPDATE_OBJ RPC does a list of updates. Each update belongs to an + * operation and does a type of modification to an object. + * + * Request Format + * + * update_buf + * update (1st) + * update (2nd) + * ... + * update (ub_count-th) + * + * ub_count must be less than or equal to UPDATE_PER_RPC_MAX. 
+ * + * Reply Format + * + * update_reply + * rc [+ buffers] (1st) + * rc [+ buffers] (2st) + * ... + * rc [+ buffers] (nr_count-th) + * + * ur_count must be less than or equal to UPDATE_PER_RPC_MAX and should usually + * be equal to ub_count. + */ + +/** + * Type of each update, if adding/deleting update, please also update + * update_opcode in lustre/target/out_lib.c. + */ +enum update_type { + OUT_START = 0, + OUT_CREATE = 1, + OUT_DESTROY = 2, + OUT_REF_ADD = 3, + OUT_REF_DEL = 4, + OUT_ATTR_SET = 5, + OUT_ATTR_GET = 6, + OUT_XATTR_SET = 7, + OUT_XATTR_GET = 8, + OUT_INDEX_LOOKUP = 9, + OUT_INDEX_INSERT = 10, + OUT_INDEX_DELETE = 11, + OUT_WRITE = 12, + OUT_XATTR_DEL = 13, + OUT_PUNCH = 14, + OUT_READ = 15, + OUT_NOOP = 16, + OUT_XATTR_LIST = 17, + OUT_LAST +}; + +enum update_flag { + UPDATE_FL_OST = 0x00000001, /* op from OST (not MDT) */ + UPDATE_FL_SYNC = 0x00000002, /* commit before replying */ + UPDATE_FL_COMMITTED = 0x00000004, /* op committed globally */ + UPDATE_FL_NOLOG = 0x00000008 /* for idempotent updates */ +}; + +struct object_update_param { + __u16 oup_len; /* length of this parameter */ + __u16 oup_padding; + __u32 oup_padding2; + char oup_buf[0]; +}; + +/* object update */ +struct object_update { + __u16 ou_type; /* enum update_type */ + __u16 ou_params_count; /* update parameters count */ + __u32 ou_result_size; /* how many bytes can return */ + __u32 ou_flags; /* enum update_flag */ + __u32 ou_padding1; /* padding 1 */ + __u64 ou_batchid; /* op transno on master */ + struct lu_fid ou_fid; /* object to be updated */ + struct object_update_param ou_params[0]; /* update params */ +}; + +#define UPDATE_REQUEST_MAGIC_V1 0xBDDE0001 +#define UPDATE_REQUEST_MAGIC_V2 0xBDDE0002 +#define UPDATE_REQUEST_MAGIC UPDATE_REQUEST_MAGIC_V2 +/* Hold object_updates sending to the remote OUT in single RPC */ +struct object_update_request { + __u32 ourq_magic; + __u16 ourq_count; /* number of ourq_updates[] */ + __u16 ourq_padding; + struct object_update ourq_updates[0]; +}; + +#define OUT_UPDATE_HEADER_MAGIC 0xBDDF0001 +#define OUT_UPDATE_MAX_INLINE_SIZE 4096 +/* Header for updates request between MDTs */ +struct out_update_header { + __u32 ouh_magic; + __u32 ouh_count; + __u32 ouh_inline_length; + __u32 ouh_reply_size; + __u32 ouh_inline_data[0]; +}; + +struct out_update_buffer { + __u32 oub_size; + __u32 oub_padding; +}; + +/* the result of object update */ +struct object_update_result { + __u32 our_rc; + __u16 our_datalen; + __u16 our_padding; + __u32 our_data[0]; +}; + +#define UPDATE_REPLY_MAGIC_V1 0x00BD0001 +#define UPDATE_REPLY_MAGIC_V2 0x00BD0002 +#define UPDATE_REPLY_MAGIC UPDATE_REPLY_MAGIC_V2 +/* Hold object_update_results being replied from the remote OUT. 
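 * Editor's note (illustrative assumption, not stated by the patch): the
 * ourp_lens[] array below appears to record the length of each packed
 * object_update_result, so the i-th result would be found by skipping this
 * header plus the sum of the preceding lengths.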
*/ +struct object_update_reply { + __u32 ourp_magic; + __u16 ourp_count; + __u16 ourp_padding; + __u16 ourp_lens[0]; +}; + +/* read update result */ +struct out_read_reply { + __u32 orr_size; + __u32 orr_padding; + __u64 orr_offset; + char orr_data[0]; +}; + +/** layout swap request structure + * fid1 and fid2 are in mdt_body + */ +struct mdc_swap_layouts { + __u64 msl_flags; +} __attribute__((packed)); + +#define INLINE_RESYNC_ARRAY_SIZE 15 +struct close_data_resync_done { + __u32 resync_count; + __u32 resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE]; +}; + +struct close_data { + struct lustre_handle cd_handle; + struct lu_fid cd_fid; + __u64 cd_data_version; + union { + __u64 cd_reserved[8]; + struct close_data_resync_done cd_resync; + /* split close */ + __u16 cd_mirror_id; + }; +}; + +/* Update llog format */ +struct update_op { + struct lu_fid uop_fid; + __u16 uop_type; + __u16 uop_param_count; + __u16 uop_params_off[0]; +} __attribute__((packed)); + +struct update_ops { + struct update_op uops_op[0]; +}; + +struct update_params { + struct object_update_param up_params[0]; +}; + +enum update_records_flag { + UPDATE_RECORD_CONTINUE = 1 >> 0, +}; +/* + * This is the update record format used to store the updates in + * disk. All updates of the operation will be stored in ur_ops. + * All of parameters for updates of the operation will be stored + * in ur_params. + * To save the space of the record, parameters in ur_ops will only + * remember their offset in ur_params, so to avoid storing duplicate + * parameters in ur_params, which can help us save a lot space for + * operation like creating striped directory. + */ +struct update_records { + __u64 ur_master_transno; + __u64 ur_batchid; + __u32 ur_flags; + /* If the operation includes multiple updates, then ur_index + * means the index of the update inside the whole updates. 
*/ + __u32 ur_index; + __u32 ur_update_count; + __u32 ur_param_count; + struct update_ops ur_ops; + /* Note ur_ops has a variable size, so comment out + * the following ur_params, in case some use it directly + * update_records->ur_params + * + * struct update_params ur_params; + */ +}; + +struct llog_update_record { + struct llog_rec_hdr lur_hdr; + struct update_records lur_update_rec; + /* Note ur_update_rec has a variable size, so comment out + * the following ur_tail, in case someone use it directly + * + * struct llog_rec_tail lur_tail; + */ +}; + +/* sepol string format is: + * <1-digit for SELinux status>::: + */ +/* Max length of the sepol string + * Should be large enough to contain a sha512sum of the policy + */ +#define SELINUX_MODE_LEN 1 +#define SELINUX_POLICY_VER_LEN 3 /* 3 chars to leave room for the future */ +#define SELINUX_POLICY_HASH_LEN 64 +#define LUSTRE_NODEMAP_SEPOL_LENGTH (SELINUX_MODE_LEN + NAME_MAX + \ + SELINUX_POLICY_VER_LEN + \ + SELINUX_POLICY_HASH_LEN + 3) + +/* nodemap records, uses 32 byte record length */ +#define LUSTRE_NODEMAP_NAME_LENGTH 16 +struct nodemap_cluster_rec { + char ncr_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + __u8 ncr_flags; + __u16 ncr_padding1; + __u32 ncr_padding2; + __u32 ncr_squash_uid; + __u32 ncr_squash_gid; +}; + +/* lnet_nid_t is 8 bytes */ +struct nodemap_range_rec { + lnet_nid_t nrr_start_nid; + lnet_nid_t nrr_end_nid; + __u64 nrr_padding1; + __u64 nrr_padding2; +}; + +struct nodemap_id_rec { + __u32 nir_id_fs; + __u32 nir_padding1; + __u64 nir_padding2; + __u64 nir_padding3; + __u64 nir_padding4; +}; + +struct nodemap_global_rec { + __u8 ngr_is_active; + __u8 ngr_padding1; + __u16 ngr_padding2; + __u32 ngr_padding3; + __u64 ngr_padding4; + __u64 ngr_padding5; + __u64 ngr_padding6; +}; + +union nodemap_rec { + struct nodemap_cluster_rec ncr; + struct nodemap_range_rec nrr; + struct nodemap_id_rec nir; + struct nodemap_global_rec ngr; +}; + +/* This is the lu_ladvise struct which goes out on the wire. + * Corresponds to the userspace arg llapi_lu_ladvise. + * value[1-4] are unspecified fields, used differently by different advices */ +struct lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +/* This is the ladvise_hdr which goes on the wire, corresponds to the userspace + * arg llapi_ladvise_hdr. + * value[1-3] are unspecified fields, used differently by different advices */ +struct ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#if defined(__cplusplus) +} +#endif + +#endif +/** @} lustreidl */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h new file mode 100644 index 0000000000000..d0dc08bda5433 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h @@ -0,0 +1,238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +#ifndef _UAPI_LUSTRE_IOCTL_H +#define _UAPI_LUSTRE_IOCTL_H + +#include +#include +#include +#include + +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user +#endif + +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +#define OBD_DEV_ID 1 +#define OBD_DEV_NAME "obd" +#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME + +#define OBD_IOCTL_VERSION 0x00010004 +#define OBD_DEV_BY_DEVNAME 0xffffd0de + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + __u64 ioc_count; + __u64 ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char __user *ioc_pbuf1; + __u32 ioc_plen2; + char __user *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + __u32 len = __ALIGN_KERNEL(sizeof(*data), 8); + + len += __ALIGN_KERNEL(data->ioc_inllen1, 8); + len += __ALIGN_KERNEL(data->ioc_inllen2, 8); + len += __ALIGN_KERNEL(data->ioc_inllen3, 8); + len += __ALIGN_KERNEL(data->ioc_inllen4, 8); + + return len; +} + +/* + * OBD_IOC_DATA_TYPE is only for compatibility reasons with older + * Linux Lustre user tools. New ioctls should NOT use this macro as + * the ioctl "size". Instead the ioctl should get a "size" argument + * which is the actual data type used by the ioctl, to ensure the + * ioctl interface is versioned correctly. 
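 *
 * For illustration only (the opcode number and name here are hypothetical,
 * not defined by this patch), a new ioctl following that advice would be
 * declared with its real argument type as the size:
 *
 *   #define OBD_IOC_EXAMPLE _IOWR('f', 254, struct obd_ioctl_data)
 *
 * rather than with OBD_IOC_DATA_TYPE.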
+ */ +#define OBD_IOC_DATA_TYPE long + +/* IOC_LDLM_TEST _IOWR('f', 40, long) */ +/* IOC_LDLM_DUMP _IOWR('f', 41, long) */ +/* IOC_LDLM_REGRESS_START _IOWR('f', 42, long) */ +/* IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) */ + +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW('f', 104, OBD_IOC_DATA_TYPE) +/* OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_SETATTR _IOW('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW('f', 114, OBD_IOC_DATA_TYPE) +/* OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) */ + +/* OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME +#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLIENT_RECOVER _IOW('f', 133, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PING_TARGET _IOW('f', 136, OBD_IOC_DATA_TYPE) + +/* OBD_IOC_DEC_FS_USE_COUNT _IO('f', 139) */ +#define OBD_IOC_NO_TRANSNO _IOW('f', 140, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SET_READONLY _IOW('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR('f', 142, OBD_IOC_DATA_TYPE) +/* OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) */ +#define OBD_GET_VERSION _IOWR('f', 144, OBD_IOC_DATA_TYPE) +/* OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLOSE_UUID _IOWR('f', 147, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CHANGELOG_SEND _IOW('f', 148, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_GETDEVICE _IOWR('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR('f', 150, OBD_IOC_DATA_TYPE) +/* lustre/lustre_user.h 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE 154 LL_IOC_LOV_SETSTRIPE */ +/* OBD_IOC_LOV_GETSTRIPE 155 LL_IOC_LOV_GETSTRIPE */ +/* OBD_IOC_LOV_SETEA 156 LL_IOC_LOV_SETEA */ +/* lustre/lustre_user.h 157-159 */ +/* OBD_IOC_QUOTACHECK _IOW('f', 160, int) */ +/* OBD_IOC_POLL_QUOTACHECK _IOR('f', 161, struct if_quotacheck *) */ +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* lustre/lustre_user.h 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW('f', 179, struct obd_ioctl_data) +/* OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +/* OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PARAM _IOW('f', 187, 
OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_NODEMAP _IOWR('f', 197, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLEAR_CONFIGS _IOWR('f', 198, OBD_IOC_DATA_TYPE) + +/* ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_LCFG_FORK _IOWR('f', 208, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LCFG_ERASE _IOWR('f', 209, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* lustre/lustre_user.h 211-220 */ +/* was #define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) until 2.11 */ +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) +#define OBD_IOC_CHLG_POLL _IOR('f', 233, long) +/* lustre/lustre_user.h 240-249 */ +/* LIBCFS_IOC_DEBUG_MASK 250 */ + +#define OBD_IOC_BARRIER _IOWR('f', 261, OBD_IOC_DATA_TYPE) + +#define IOC_OSC_SET_ACTIVE _IOWR('h', 21, void *) + +#endif /* _UAPI_LUSTRE_IOCTL_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h new file mode 100644 index 0000000000000..26819ff7995cf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h @@ -0,0 +1,98 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. + */ + +#ifndef __UAPI_KERNELCOMM_H__ +#define __UAPI_KERNELCOMM_H__ + +#include + +/* KUC message header. + * All current and future KUC messages should use this header. 
+ * To avoid having to include Lustre headers from libcfs, define this here. + */ +struct kuc_hdr { + __u16 kuc_magic; + __u8 kuc_transport; /* Each new Lustre feature should use a different + transport */ + __u8 kuc_flags; + __u16 kuc_msgtype; /* Message type or opcode, transport-specific */ + __u16 kuc_msglen; /* Including header */ +} __attribute__((aligned(sizeof(__u64)))); + + +#define KUC_MAGIC 0x191C /*Lustre9etLinC */ + +/* kuc_msgtype values are defined in each transport */ +enum kuc_transport_type { + KUC_TRANSPORT_GENERIC = 1, + KUC_TRANSPORT_HSM = 2, +}; + +enum kuc_generic_message_type { + KUC_MSG_SHUTDOWN = 1, +}; + +/* KUC Broadcast Groups. This determines which userspace process hears which + * messages. Mutliple transports may be used within a group, or multiple + * groups may use the same transport. Broadcast + * groups need not be used if e.g. a UID is specified instead; + * use group 0 to signify unicast. + */ +#define KUC_GRP_HSM 0x02 +#define KUC_GRP_MAX KUC_GRP_HSM + +enum lk_flags { + LK_FLG_STOP = 0x0001, + LK_FLG_DATANR = 0x0002, +}; +#define LK_NOFD -1U + +/* kernelcomm control structure, passed from userspace to kernel. + * For compatibility with old copytools, users who pass ARCHIVE_IDs + * to kernel using lk_data_count and lk_data should fill lk_flags with + * LK_FLG_DATANR. Otherwise kernel will take lk_data_count as bitmap of + * ARCHIVE IDs. + */ +struct lustre_kernelcomm { + __u32 lk_wfd; + __u32 lk_rfd; + __u32 lk_uid; + __u32 lk_group; + __u32 lk_data_count; + __u32 lk_flags; + __u32 lk_data[0]; +} __attribute__((packed)); + +#endif /* __UAPI_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h new file mode 100644 index 0000000000000..68c8d3a1009c4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h @@ -0,0 +1,238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. 
+ * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H + +#include +#include + +/** + * state machine: + * + * LS_INIT + * | + * (lfsck|start) + * | + * v + * LS_SCANNING_PHASE1 + * | ^ + * | : + * | (lfsck:restart) + * | : + * v : + * ----------------------------------------------------------------- + * | |^ |^ |^ |^ |^ + * | |: |: |: |: |: + * v v: v: v: v: v: + * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL + * (CO_) (CO_) (CO_) + * | ^ ^: ^: ^: ^: ^: + * | : |: |: |: |: |: + * | (lfsck:restart) |: |: |: |: |: + * v : |v |v |v |v |v + * ----------------------------------------------------------------- + * | + * v + * LS_COMPLETED + */ +enum lfsck_status { + /* The lfsck file is new created, for new MDT, upgrading from old disk, + * or re-creating the lfsck file manually. */ + LS_INIT = 0, + + /* The first-step system scanning. The checked items during the phase1 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE1 = 1, + + /* The second-step system scanning. The checked items during the phase2 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE2 = 2, + + /* The LFSCK processing has completed for all objects. */ + LS_COMPLETED = 3, + + /* The LFSCK exited automatically for failure, will not auto restart. */ + LS_FAILED = 4, + + /* The LFSCK is stopped manually, will not auto restart. */ + LS_STOPPED = 5, + + /* LFSCK is paused automatically when umount, + * will be restarted automatically when remount. */ + LS_PAUSED = 6, + + /* System crashed during the LFSCK, + * will be restarted automatically after recovery. */ + LS_CRASHED = 7, + + /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ + LS_PARTIAL = 8, + + /* The LFSCK is failed because its controller is failed. */ + LS_CO_FAILED = 9, + + /* The LFSCK is stopped because its controller is stopped. */ + LS_CO_STOPPED = 10, + + /* The LFSCK is paused because its controller is paused. */ + LS_CO_PAUSED = 11, + + LS_MAX +}; + +static inline const char *lfsck_status2name(int status) +{ + static const char * const lfsck_status_names[] = { + [LS_INIT] = "init", + [LS_SCANNING_PHASE1] = "scanning-phase1", + [LS_SCANNING_PHASE2] = "scanning-phase2", + [LS_COMPLETED] = "completed", + [LS_FAILED] = "failed", + [LS_STOPPED] = "stopped", + [LS_PAUSED] = "paused", + [LS_CRASHED] = "crashed", + [LS_PARTIAL] = "partial", + [LS_CO_FAILED] = "co-failed", + [LS_CO_STOPPED] = "co-stopped", + [LS_CO_PAUSED] = "co-paused" + }; + + if (status < 0 || status >= LS_MAX) + return "unknown"; + + return lfsck_status_names[status]; +} + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, + + /* LFSCK runs on all targets. */ + LPF_ALL_TGT = 0x0008, + + /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ + LPF_BROADCAST = 0x0010, + + /* Handle orphan OST-objects. */ + LPF_OST_ORPHAN = 0x0020, + + /* Create OST-object for dangling LOV EA. */ + LPF_CREATE_OSTOBJ = 0x0040, + + /* Create MDT-object for dangling name entry. */ + LPF_CREATE_MDTOBJ = 0x0080, + + /* Do not return until the LFSCK not running. */ + LPF_WAIT = 0x0100, + + /* Delay to create OST-object for dangling LOV EA. */ + LPF_DELAY_CREATE_OSTOBJ = 0x0200, +}; + +enum lfsck_type { + /* For MDT and OST internal OSD consistency check/repair. 
*/ + LFSCK_TYPE_SCRUB = 0x0000, + + /* For MDT-OST (layout, object) consistency check/repair. */ + LFSCK_TYPE_LAYOUT = 0x0001, + + /* For MDT (FID-in-dirent, linkEA) consistency check/repair. */ + LFSCK_TYPE_NAMESPACE = 0x0004, + LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | + LFSCK_TYPE_NAMESPACE), + LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, + LFSCK_TYPES_ALL = ((__u16)(~0)) +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT +#define LFSCK_ASYNC_WIN_DEFAULT 1024 +#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) +#define LFSCK_TYPE_BITS 16 + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, + LSV_ASYNC_WINDOWS = 0x00000008, + LSV_CREATE_OSTOBJ = 0x00000010, + LSV_CREATE_MDTOBJ = 0x00000020, + LSV_DELAY_CREATE_OSTOBJ = 0x00000040, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* The windows size for async requests pipeline. */ + __u16 ls_async_windows; +}; + +struct lfsck_stop { + __u32 ls_status; + __u16 ls_flags; + __u16 ls_padding_1; /* For 64-bits aligned. */ + __u64 ls_padding_2; +}; + +struct lfsck_query { + __u16 lu_types; + __u16 lu_flags; + __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u64 lu_repaired[LFSCK_TYPE_BITS]; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h new file mode 100644 index 0000000000000..bcf46eb21e6c2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h @@ -0,0 +1,80 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_log_user.h + * + * Userspace-usable portion of Generic infrastructure for managing + * a collection of logs. + * See lustre_log.h for more details. 
+ */ + +#ifndef _LUSTRE_LOG_USER_H +#define _LUSTRE_LOG_USER_H + +#include +#include + +/* Lustre logs use FIDs constructed from oi_id and oi_seq directly, + * without attempting to use the IGIF and IDIF ranges as is done + * elsewhere, because of compatibility concerns (see lu-2888). + */ + +static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid) +{ + /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS) + * logid's by non-zero ogen (inode generation) and convert them + * into IGIF */ + if (id->lgl_ogen == 0) { + fid->f_seq = id->lgl_oi.oi.oi_seq; + fid->f_oid = id->lgl_oi.oi.oi_id; + fid->f_ver = 0; + } else { + lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen); + } +} + +static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id) +{ + id->lgl_oi.oi.oi_seq = fid->f_seq; + id->lgl_oi.oi.oi_id = fid->f_oid; + id->lgl_ogen = 0; +} + +static inline void logid_set_id(struct llog_logid *log_id, __u64 id) +{ + log_id->lgl_oi.oi.oi_id = id; +} + +static inline __u64 logid_id(struct llog_logid *log_id) +{ + return log_id->lgl_oi.oi.oi_id; +} + +#endif /* ifndef _LUSTRE_LOG_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h new file mode 100644 index 0000000000000..90fa213f83e90 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h @@ -0,0 +1,237 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. 
+ * + * Define ost_id associated functions + */ + +#ifndef _UAPI_LUSTRE_OSTID_H_ +#define _UAPI_LUSTRE_OSTID_H_ + +#include +#include +#include + +static inline __u64 lmm_oi_id(const struct ost_id *oi) +{ + return oi->oi.oi_id; +} + +static inline __u64 lmm_oi_seq(const struct ost_id *oi) +{ + return oi->oi.oi_seq; +} + +static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq) +{ + oi->oi.oi_seq = seq; +} + +static inline void lmm_oi_set_id(struct ost_id *oi, __u64 oid) +{ + oi->oi.oi_id = oid; +} + +static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, + const struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq); +} + +static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, + const struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq); +} + +/* extract OST sequence (group) from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_seq(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return FID_SEQ_OST_MDT0; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return FID_SEQ_LOV_DEFAULT; + + if (fid_is_idif(&ostid->oi_fid)) + return FID_SEQ_OST_MDT0; + + return fid_seq(&ostid->oi_fid); +} + +/* extract OST objid from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_id(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return ostid->oi.oi_id & IDIF_OID_MASK; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return ostid->oi.oi_id; + + if (fid_is_idif(&ostid->oi_fid)) + return fid_idif_id(fid_seq(&ostid->oi_fid), + fid_oid(&ostid->oi_fid), 0); + + return fid_oid(&ostid->oi_fid); +} + +static inline void ostid_set_seq(struct ost_id *oi, __u64 seq) +{ + if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) { + oi->oi.oi_seq = seq; + } else { + oi->oi_fid.f_seq = seq; + /* + * Note: if f_oid + f_ver is zero, we need init it + * to be 1, otherwise, ostid_seq will treat this + * as old ostid (oi_seq == 0) + */ + if (!oi->oi_fid.f_oid && !oi->oi_fid.f_ver) + oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID; + } +} + +static inline void ostid_set_seq_mdt0(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_OST_MDT0); +} + +static inline void ostid_set_seq_echo(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_ECHO); +} + +static inline void ostid_set_seq_llog(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_LLOG); +} + +static inline void ostid_cpu_to_le(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) { + dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq); + } else { + fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +static inline void ostid_le_to_cpu(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) { + dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq); + } else { + fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +/** + * Sigh, because pre-2.4 uses + * struct lov_mds_md_v1 { + * ........ + * __u64 lmm_object_id; + * __u64 lmm_object_seq; + * ...... + * } + * to identify the LOV(MDT) object, and lmm_object_seq will + * be normal_fid, which make it hard to combine these conversion + * to ostid_to FID. 
so we will do lmm_oi/fid conversion separately + * + * We can tell the lmm_oi by this way, + * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0 + * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL + * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k}, + * lmm_oi.f_ver = 0 + * + * But currently lmm_oi/lsm_oi does not have any "real" usages, + * except for printing some information, and the user can always + * get the real FID from LMA, besides this multiple case check might + * make swab more complicate. So we will keep using id/seq for lmm_oi. + */ + +static inline void fid_to_lmm_oi(const struct lu_fid *fid, + struct ost_id *oi) +{ + oi->oi.oi_id = fid_oid(fid); + oi->oi.oi_seq = fid_seq(fid); +} + +/** + * Unpack an OST object id/seq (group) into a FID. This is needed for + * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper + * FIDs. Note that if an id/seq is already in FID/IDIF format it will + * be passed through unchanged. Only legacy OST objects in "group 0" + * will be mapped into the IDIF namespace so that they can fit into the + * struct lu_fid fields without loss. + */ +static inline int ostid_to_fid(struct lu_fid *fid, const struct ost_id *ostid, + __u32 ost_idx) +{ + __u64 seq = ostid_seq(ostid); + + if (ost_idx > 0xffff) + return -EBADF; + + if (fid_seq_is_mdt0(seq)) { + __u64 oid = ostid_id(ostid); + + /* This is a "legacy" (old 1.x/2.early) OST object in "group 0" + * that we map into the IDIF namespace. It allows up to 2^48 + * objects per OST, as this is the object namespace that has + * been in production for years. This can handle create rates + * of 1M objects/s/OST for 9 years, or combinations thereof. + */ + if (oid >= IDIF_MAX_OID) + return -EBADF; + + fid->f_seq = fid_idif_seq(oid, ost_idx); + /* truncate to 32 bits by assignment */ + fid->f_oid = oid; + /* in theory, not currently used */ + fid->f_ver = oid >> 48; + } else if (!fid_seq_is_default(seq)) { + /* This is either an IDIF object, which identifies objects + * across all OSTs, or a regular FID. The IDIF namespace + * maps legacy OST objects into the FID namespace. In both + * cases, we just pass the FID through, no conversion needed. + */ + if (ostid->oi_fid.f_ver) + return -EBADF; + + *fid = ostid->oi_fid; + } + + return 0; +} +#endif /* _UAPI_LUSTRE_OSTID_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h new file mode 100644 index 0000000000000..022d253bbc353 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h @@ -0,0 +1,94 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * User-settable parameter keys + * + * Author: Nathan Rutman + */ + +#ifndef _UAPI_LUSTRE_PARAM_H +#define _UAPI_LUSTRE_PARAM_H + +/** \defgroup param param + * + * @{ + */ + +/****************** User-settable parameter keys *********************/ +/* e.g. + * tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda + * lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 + * ... testfs-MDT0000.lov.stripesize=4M + * ... testfs-OST0000.ost.client_cache_seconds=15 + * ... testfs.sys.timeout= + * ... testfs.llite.max_read_ahead_mb=16 + */ + +/* System global or special params not handled in obd's proc + * See mgs_write_log_sys() + */ +#define PARAM_TIMEOUT "timeout=" /* global */ +#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ +#define PARAM_AT_MIN "at_min=" /* global */ +#define PARAM_AT_MAX "at_max=" /* global */ +#define PARAM_AT_EXTRA "at_extra=" /* global */ +#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ +#define PARAM_AT_HISTORY "at_history=" /* global */ +#define PARAM_JOBID_VAR "jobid_var=" /* global */ +#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ +#define PARAM_FAILNODE "failover.node=" /* add failover nid */ +#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ +#define PARAM_ACTIVE "active=" /* activate/deactivate */ +#define PARAM_NETWORK "network=" /* bind on nid */ +#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ + +/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ +#define PARAM_OST "ost." +#define PARAM_OSD "osd." +#define PARAM_OSC "osc." +#define PARAM_MDT "mdt." +#define PARAM_HSM "mdt.hsm." +#define PARAM_MDD "mdd." +#define PARAM_MDC "mdc." +#define PARAM_LLITE "llite." +#define PARAM_LOV "lov." +#define PARAM_LOD "lod." +#define PARAM_OSP "osp." +#define PARAM_SYS "sys." /* global */ +#define PARAM_SRPC "srpc." +#define PARAM_SRPC_FLVR "srpc.flavor." +#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" +#define PARAM_SEC "security." +#define PARAM_QUOTA "quota." /* global */ + +/** @} param */ + +#endif /* _UAPI_LUSTRE_PARAM_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h new file mode 100644 index 0000000000000..d1172a637fcee --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h @@ -0,0 +1,2456 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#ifdef __KERNEL__ +# include +# include +# include +#else /* ! __KERNEL__ */ +# define __USE_ISOC99 1 +# include +# include /* snprintf() */ +# define NEED_QUOTA_DEFS +/* # include - this causes complaints about caddr_t */ +# include + +# define __USE_GNU 1 +# define __USE_XOPEN2K8 1 +# define FILEID_LUSTRE 0x97 /* for name_to_handle_at() (and llapi_fd2fid()) */ +#endif /* !__KERNEL__ */ + +#include +#include +#include +#include + +/* Handle older distros */ +#ifndef __ALIGN_KERNEL +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#endif + +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifdef __STRICT_ANSI__ +#define typeof __typeof__ +#endif + +/* + * This is a temporary solution of adding quota type. + * Should be removed as soon as system header is updated. + */ +#undef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#undef INITQFNAMES +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "project", /* PRJQUOTA */ \ + "undefined", \ +}; +#ifndef USRQUOTA +#define USRQUOTA 0 +#endif +#ifndef GRPQUOTA +#define GRPQUOTA 1 +#endif +#ifndef PRJQUOTA +#define PRJQUOTA 2 +#endif + +/* + * We need to always use 64bit version because the structure + * is shared across entire cluster where 32bit and 64bit machines + * are co-existing. + */ +#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#define fstat_f fstat64 +#define fstatat_f fstatat64 +#else +typedef struct stat lstat_t; +#define lstat_f lstat +#define fstat_f fstat +#define fstatat_f fstatat +#endif + +#ifndef STATX_BASIC_STATS +/* + * Timestamp structure for the timestamps in struct statx. + * + * tv_sec holds the number of seconds before (negative) or after (positive) + * 00:00:00 1st January 1970 UTC. + * + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. + * + * __reserved is held in case we need a yet finer resolution. + */ +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +/* + * Structures for the extended file attribute retrieval system call + * (statx()). + * + * The caller passes a mask of what they're specifically interested in as a + * parameter to statx(). What statx() actually got will be indicated in + * st_mask upon return. + * + * For each bit in the mask argument: + * + * - if the datum is not supported: + * + * - the bit will be cleared, and + * + * - the datum will be set to an appropriate fabricated value if one is + * available (eg. 
CIFS can take a default uid and gid), otherwise + * + * - the field will be cleared; + * + * - otherwise, if explicitly requested: + * + * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is + * set or if the datum is considered out of date, and + * + * - the field will be filled in and the bit will be set; + * + * - otherwise, if not requested, but available in approximate form without any + * effort, it will be filled in anyway, and the bit will be set upon return + * (it might not be up to date, however, and no attempt will be made to + * synchronise the internal state first); + * + * - otherwise the field and the bit will be cleared before returning. + * + * Items in STATX_BASIC_STATS may be marked unavailable on return, but they + * will have values installed for compatibility purposes so that stat() and + * co. can be emulated in userspace. + */ +struct statx { + /* 0x00 */ + __u32 stx_mask; /* What results were written [uncond] */ + __u32 stx_blksize; /* Preferred general I/O size [uncond] */ + __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* 0x10 */ + __u32 stx_nlink; /* Number of hard links */ + __u32 stx_uid; /* User ID of owner */ + __u32 stx_gid; /* Group ID of owner */ + __u16 stx_mode; /* File mode */ + __u16 __spare0[1]; + /* 0x20 */ + __u64 stx_ino; /* Inode number */ + __u64 stx_size; /* File size */ + __u64 stx_blocks; /* Number of 512-byte blocks allocated */ + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* 0x40 */ + struct statx_timestamp stx_atime; /* Last access time */ + struct statx_timestamp stx_btime; /* File creation time */ + struct statx_timestamp stx_ctime; /* Last attribute change time */ + struct statx_timestamp stx_mtime; /* Last data modification time */ + /* 0x80 */ + __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_minor; + __u32 stx_dev_major; /* ID of device containing file [uncond] */ + __u32 stx_dev_minor; + /* 0x90 */ + __u64 __spare2[14]; /* Spare space for future expansion */ + /* 0x100 */ +}; + +/* + * Flags to be stx_mask + * + * Query request/result mask for statx() and struct statx::stx_mask. + * + * These bits should be set in the mask argument of statx() to request + * particular items when calling statx(). + */ +#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ +#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ +#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ +#define STATX_UID 0x00000008U /* Want/got stx_uid */ +#define STATX_GID 0x00000010U /* Want/got stx_gid */ +#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define STATX_INO 0x00000100U /* Want/got stx_ino */ +#define STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_ALL 0x00000fffU /* All currently supported flags */ +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ + +/* + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. + * + * These give information about the features or the state of a file that might + * be of use to ordinary userspace programs such as GUIs or ls rather than + * specialised tools. 
+ * + * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS + * semantically. Where possible, the numerical value is picked to correspond + * also. + */ +#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ +#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ +#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ +#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ +#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ + +#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ + +#endif + +typedef struct statx lstatx_t; + +#define HAVE_LOV_USER_MDS_DATA + +#define LUSTRE_EOF 0xffffffffffffffffULL + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_NOPRECREATE = 0x00000004, /**< no object precreation */ + OS_STATE_UNUSED1 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_UNUSED2 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ + OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ + OS_STATE_SUM = 0x00000100, /**< aggregated for all tagrets */ + OS_STATE_NONROT = 0x00000200, /**< non-rotational device */ +}; + +/** filesystem statistics/attributes for target device */ +struct obd_statfs { + __u64 os_type; /* EXT4_SUPER_MAGIC, UBERBLOCK_MAGIC */ + __u64 os_blocks; /* total size in #os_bsize blocks */ + __u64 os_bfree; /* number of unused blocks */ + __u64 os_bavail; /* blocks available for allocation */ + __u64 os_files; /* total number of objects */ + __u64 os_ffree; /* # objects that could be created */ + __u8 os_fsid[40]; /* identifier for filesystem */ + __u32 os_bsize; /* block size in bytes for os_blocks */ + __u32 os_namelen; /* maximum length of filename in bytes*/ + __u64 os_maxbytes; /* maximum object size in bytes */ + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_granted; /* space granted for MDS */ + __u32 os_spare3; /* Unused padding fields. Remember */ + __u32 os_spare4; /* to fix lustre_swab_obd_statfs() */ + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. 
+ **/ + __u32 f_ver; +} __attribute__((packed)); + +static inline bool fid_is_zero(const struct lu_fid *fid) +{ + return fid->f_seq == 0 && fid->f_oid == 0; +} + +/* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ +#define f_stripe_idx f_ver + +struct ost_layout { + __u32 ol_stripe_size; + __u32 ol_stripe_count; + __u64 ol_comp_start; + __u64 ol_comp_end; + __u32 ol_comp_id; +} __attribute__((packed)); + +/* The filter_fid structure has changed several times over its lifetime. + * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and + * stripe_index and the "self FID" (objid/seq) to be able to recover the + * OST objects in case of corruption. With the move to 2.4 and OSD-API for + * the OST, the "trusted.lma" xattr was added to the OST objects to store + * the "self FID" to be consistent with the MDT on-disk format, and the + * filter_fid only stored the MDT inode parent FID and stripe index. + * + * In 2.10, the addition of PFL composite layouts required more information + * to be stored into the filter_fid in order to be able to identify which + * component the OST object belonged. As well, the stripe size may vary + * between components, so it was no longer safe to assume the stripe size + * or stripe_count of a file. This is also more robust for plain layouts. + * + * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not + * enough space to store both the filter_fid and LMA in the inode, so they + * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid + * an extra seek for every OST object access. + * + * In 2.11, FLR mirror layouts also need to store the layout version and + * range so that writes to old versions of the layout are not allowed. + * That ensures that mirrored objects are not modified by evicted clients, + * and ensures that the components are correctly marked stale on the MDT. + */ +struct filter_fid_18_23 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + __u64 ff_objid; + __u64 ff_seq; +}; + +struct filter_fid_24_29 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ +}; + +struct filter_fid_210 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; + __u32 ff_layout_version; + __u32 ff_range; /* range of layout version that + * write are allowed */ +} __attribute__((packed)); + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. */ +struct lu_fid; + +enum lma_compat { + LMAC_HSM = 0x00000001, +/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ + LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ + LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is + * under /O//d. */ + LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ + LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ + LMAC_IDX_BACKUP = 0x00000040, /* Has index backup. */ +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. 
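As the comment above spells out, an OST object's filter_fid reuses ff_parent.f_ver to carry the object's position in the parent file's layout rather than a real FID version, which is exactly what the f_stripe_idx alias is for. A short illustrative sketch of that access (not code from the patch):

#include <lustre/lustre_user.h>	/* assumed install path */

/* The parent "FID" stored in a filter_fid carries the stripe index in
 * f_ver, exposed through the f_stripe_idx alias defined above. */
static __u32 ff_stripe_index(const struct filter_fid *ff)
{
	return ff->ff_parent.f_stripe_idx;
}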
+ */ +enum lma_incompat { + LMAI_RELEASED = 0x00000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ + LMAI_STRIPED = 0x00000008, /* striped directory inode */ + LMAI_ORPHAN = 0x00000010, /* inode is orphan */ + LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ + LMAI_STRIPED | LMAI_ORPHAN) +}; + + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. + */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +struct lustre_ost_attrs { + /* Use lustre_mdt_attrs directly for now, need a common header + * structure if want to change lustre_mdt_attrs in future. */ + struct lustre_mdt_attrs loa_lma; + + /* Below five elements are for OST-object's PFID EA, the + * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) + * and the stripe_index (low 16 bits), the size should not exceed + * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag + * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size + * are valid; if the flag LMAC_COMP_INFO is set, then the next three + * loa_comp_* elements are valid. */ + struct lu_fid loa_parent_fid; + __u32 loa_stripe_size; + __u32 loa_comp_id; + __u64 loa_comp_start; + __u64 loa_comp_end; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +enum lustre_som_flags { + /* Unknow or no SoM data, must get size from OSTs. */ + SOM_FL_UNKNOWN = 0x0000, + /* Known strictly correct, FLR or DoM file (SoM guaranteed). */ + SOM_FL_STRICT = 0x0001, + /* Known stale - was right at some point in the past, but it is + * known (or likely) to be incorrect now (e.g. opened for write). */ + SOM_FL_STALE = 0x0002, + /* Approximate, may never have been strictly correct, + * need to sync SOM data to achieve eventual consistency. */ + SOM_FL_LAZY = 0x0004, +}; + +struct lustre_som_attrs { + __u16 lsa_valid; + __u16 lsa_reserved[3]; + __u64 lsa_size; + __u64 lsa_blocks; +}; + +/** + * OST object IDentifier. + */ +struct ost_id { + union { + struct { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +} __attribute__((packed)); + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ + ((unsigned long long)ostid_id(oi)) + +struct ll_futimes_3 { + __u64 lfu_atime_sec; + __u64 lfu_atime_nsec; + __u64 lfu_mtime_sec; + __u64 lfu_mtime_nsec; + __u64 lfu_ctime_sec; + __u64 lfu_ctime_nsec; +}; + +/* + * Maximum number of mirrors currently implemented. + */ +#define LUSTRE_MIRROR_COUNT_MAX 16 + +/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. 
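The lma_incompat documentation above implies a simple admission test: an implementation may only handle an object whose incompatible-feature bits are all covered by LMA_INCOMPAT_SUPP. A hedged sketch of that check, for illustration only:

#include <lustre/lustre_user.h>	/* assumed install path */

/* Refuse objects carrying incompat features this build does not know. */
static int lma_is_supported(const struct lustre_mdt_attrs *lma)
{
	return (lma->lma_incompat & ~LMA_INCOMPAT_SUPP) == 0;
}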
*/ +enum ll_lease_mode { + LL_LEASE_RDLCK = 0x01, + LL_LEASE_WRLCK = 0x02, + LL_LEASE_UNLCK = 0x04, +}; + +enum ll_lease_flags { + LL_LEASE_RESYNC = 0x1, + LL_LEASE_RESYNC_DONE = 0x2, + LL_LEASE_LAYOUT_MERGE = 0x4, + LL_LEASE_LAYOUT_SPLIT = 0x8, +}; + +#define IOC_IDS_MAX 4096 +struct ll_ioc_lease { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u32 lil_ids[0]; +}; + +struct ll_ioc_lease_id { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u16 lil_mirror_id; + __u16 lil_padding1; + __u64 lil_padding2; + __u32 lil_ids[0]; +}; + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* lustre_ioctl.h 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +/* LL_IOC_RECREATE_OBJ 157 obsolete */ +/* LL_IOC_RECREATE_FID 157 obsolete */ +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ +/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ +/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +/* IOC_LOV_GETINFO 165 obsolete */ +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +/* LL_IOC_RMTACL 167 obsolete */ +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) +#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) +#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long) +/* lustre_ioctl.h 177-210 */ +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* lustre_ioctl.h 221-232 */ +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_RMFID _IOR('f', 242, struct fid_array) +#define LL_IOC_SET_LEASE _IOWR('f', 243, struct ll_ioc_lease) +#define LL_IOC_SET_LEASE_OLD 
_IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) +#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) +#define LL_IOC_MIGRATE _IOR('f', 247, int) +#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) +#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) +#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) + +#ifndef FS_IOC_FSGETXATTR +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. +*/ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#endif +#define LL_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 +#endif + + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#ifdef HAVE_LOV_USER_MDS_DATA +#define IOC_MDC_GETFILEINFO_OLD _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data_v1 *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data) +#define LL_IOC_MDC_GETINFO_OLD _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data_v1 *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data) +#endif + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files. See LU-4209. */ +/* To be compatible with old statically linked binary we keep the check for + * the older 0100000000 flag. This is already removed upstream. LU-812. 
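The fsxattr fallback above is what allows project-quota handling on systems whose headers predate FS_IOC_FSGETXATTR; LL_IOC_FSGETXATTR is just an alias for it. A minimal sketch of reading a file's project ID, assuming the header installs as <lustre/lustre_user.h>:

#include <sys/ioctl.h>
#include <lustre/lustre_user.h>	/* assumed install path */

/* Fetch the project quota ID of an open file. */
static int get_projid(int fd, __u32 *projid)
{
	struct fsxattr fsx;

	if (ioctl(fd, LL_IOC_FSGETXATTR, &fsx) < 0)
		return -1;

	*projid = fsx.fsx_projid;
	return 0;
}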
*/ +#define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ +#ifndef FASYNC +#define FASYNC 00020000 /* fcntl, for BSD compatibility */ +#endif +#define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) +#define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ + O_LOV_DELAY_CREATE_MASK) + +#define O_LU_NOIMPORT_MASK (O_NOCTTY | O_DSYNC | O_DIRECT) +#define O_LU_NOIMPORT O_LU_NOIMPORT_MASK + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 +/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ +#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ +#define LOV_USER_MAGIC_COMP_V1 0x0BD60BD0 + +#define LMV_USER_MAGIC 0x0CD30CD0 /* default lmv magic */ +#define LMV_USER_MAGIC_V0 0x0CD20CD0 /* old default lmv magic*/ +#define LMV_USER_MAGIC_SPECIFIC 0x0CD40CD0 + +#define LOV_PATTERN_NONE 0x000 +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_MDT 0x100 +#define LOV_PATTERN_CMOBD 0x200 + +#define LOV_PATTERN_F_MASK 0xffff0000 +#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ +#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ +#define LOV_PATTERN_DEFAULT 0xffffffff + +#define LOV_OFFSET_DEFAULT ((__u16)-1) +#define LMV_OFFSET_DEFAULT ((__u32)-1) + +#define LOV_QOS_DEF_THRESHOLD_RR_PCT 17 +#define LMV_QOS_DEF_THRESHOLD_RR_PCT 5 + +#define LOV_QOS_DEF_PRIO_FREE 90 +#define LMV_QOS_DEF_PRIO_FREE 90 + +static inline bool lov_pattern_supported(__u32 pattern) +{ + return (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_RAID0 || + (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_MDT; +} + +#define LOV_MAXPOOLNAME 15 +#define LOV_POOLNAMEF "%.15s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old maximal stripe count. + * XXX: In fact this is too simpified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. + * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define XATTR_LUSTRE_PREFIX "lustre." 
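O_LOV_DELAY_CREATE deliberately reuses open flags that are harmless on regular files, so a tool can create a file without instantiating OST objects and attach a layout afterwards (typically via LL_IOC_LOV_SETSTRIPE_NEW). A sketch of the open step only, as an illustration:

#include <fcntl.h>
#include <lustre/lustre_user.h>	/* assumed install path */

/* Create the file but delay object creation until a layout is set. */
static int open_for_striping(const char *path)
{
	return open(path, O_CREAT | O_EXCL | O_WRONLY | O_LOV_DELAY_CREATE,
		    0644);
}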
+#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" + +/* Please update if XATTR_LUSTRE_LOV".set" groks more flags in the future */ +#define allowed_lustre_lov(att) (strcmp((att), XATTR_LUSTRE_LOV".add") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set.flags") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".del") == 0) + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lu_extent { + __u64 e_start; + __u64 e_end; +} __attribute__((packed)); + +#define DEXT "[%#llx, %#llx)" +#define PEXT(ext) (unsigned long long)(ext)->e_start, (unsigned long long)(ext)->e_end + +static inline bool lu_extent_is_overlapped(struct lu_extent *e1, + struct lu_extent *e2) +{ + return e1->e_start < e2->e_end && e2->e_start < e1->e_end; +} + +static inline bool lu_extent_is_whole(struct lu_extent *e) +{ + return e->e_start == 0 && e->e_end == LUSTRE_EOF; +} + +enum lov_comp_md_entry_flags { + LCME_FL_STALE = 0x00000001, /* FLR: stale data */ + LCME_FL_PREF_RD = 0x00000002, /* FLR: preferred for reading */ + LCME_FL_PREF_WR = 0x00000004, /* FLR: preferred for writing */ + LCME_FL_PREF_RW = LCME_FL_PREF_RD | LCME_FL_PREF_WR, + LCME_FL_OFFLINE = 0x00000008, /* Not used */ + LCME_FL_INIT = 0x00000010, /* instantiated */ + LCME_FL_NOSYNC = 0x00000020, /* FLR: no sync for the mirror */ + LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, + won't be stored on disk */ +}; + +#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT | LCME_FL_STALE | \ + LCME_FL_PREF_RW | LCME_FL_NOSYNC) +/* The flags can be set by users at mirror creation time. */ +#define LCME_USER_FLAGS (LCME_FL_PREF_RW) + +/* The flags are for mirrors */ +#define LCME_MIRROR_FLAGS (LCME_FL_NOSYNC) + +/* These flags have meaning when set in a default layout and will be inherited + * from the default/template layout set on a directory. 
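Because lmm_objects[] is a flexible array, the per-stripe records simply follow the fixed header and lmm_stripe_count says how many of them are valid (for a v3 layout the pool name sits in between). A small sketch of walking a v1 layout that a caller already holds:

#include <stdio.h>
#include <lustre/lustre_user.h>	/* assumed install path */

/* Print which OST each stripe of a plain v1 layout lives on. */
static void print_stripes(const struct lov_user_md_v1 *lum)
{
	int i;

	for (i = 0; i < lum->lmm_stripe_count; i++)
		printf("stripe %d: OST index %u (gen %u)\n", i,
		       lum->lmm_objects[i].l_ost_idx,
		       lum->lmm_objects[i].l_ost_gen);
}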
+ */ +#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC) + +/* the highest bit in obdo::o_layout_version is used to mark if the file is + * being resynced. */ +#define LU_LAYOUT_RESYNC LCME_FL_NEG + +/* lcme_id can be specified as certain flags, and the the first + * bit of lcme_id is used to indicate that the ID is representing + * certain LCME_FL_* but not a real ID. Which implies we can have + * at most 31 flags (see LCME_FL_XXX). */ +enum lcme_id { + LCME_ID_INVAL = 0x0, + LCME_ID_MAX = 0x7FFFFFFF, + LCME_ID_ALL = 0xFFFFFFFF, + LCME_ID_NOT_ID = LCME_FL_NEG +}; + +#define LCME_ID_MASK LCME_ID_MAX + +struct lov_comp_md_entry_v1 { + __u32 lcme_id; /* unique id of component */ + __u32 lcme_flags; /* LCME_FL_XXX */ + struct lu_extent lcme_extent; /* file extent for component */ + __u32 lcme_offset; /* offset of component blob, + start from lov_comp_md_v1 */ + __u32 lcme_size; /* size of component blob */ + __u32 lcme_layout_gen; + __u64 lcme_timestamp; /* snapshot time if applicable*/ + __u32 lcme_padding_1; +} __attribute__((packed)); + +#define SEQ_ID_MAX 0x0000FFFF +#define SEQ_ID_MASK SEQ_ID_MAX +/* bit 30:16 of lcme_id is used to store mirror id */ +#define MIRROR_ID_MASK 0x7FFF0000 +#define MIRROR_ID_NEG 0x8000 +#define MIRROR_ID_SHIFT 16 + +static inline __u32 pflr_id(__u16 mirror_id, __u16 seqid) +{ + return ((mirror_id << MIRROR_ID_SHIFT) & MIRROR_ID_MASK) | seqid; +} + +static inline __u16 mirror_id_of(__u32 id) +{ + return (id & MIRROR_ID_MASK) >> MIRROR_ID_SHIFT; +} + +/** + * on-disk data for lcm_flags. Valid if lcm_magic is LOV_MAGIC_COMP_V1. + */ +enum lov_comp_md_flags { + /* the least 2 bits are used by FLR to record file state */ + LCM_FL_NONE = 0, + LCM_FL_RDONLY = 1, + LCM_FL_WRITE_PENDING = 2, + LCM_FL_SYNC_PENDING = 3, + LCM_FL_FLR_MASK = 0x3, +}; + +struct lov_comp_md_v1 { + __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ + __u32 lcm_size; /* overall size including this struct */ + __u32 lcm_layout_gen; + __u16 lcm_flags; + __u16 lcm_entry_count; + /* lcm_mirror_count stores the number of actual mirrors minus 1, + * so that non-flr files will have value 0 meaning 1 mirror. */ + __u16 lcm_mirror_count; + __u16 lcm_padding1[3]; + __u64 lcm_padding2; + struct lov_comp_md_entry_v1 lcm_entries[0]; +} __attribute__((packed)); + +static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_USER_MAGIC_V1) + return sizeof(struct lov_user_md_v1) + + stripes * sizeof(struct lov_user_ost_data_v1); + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); +} + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . 
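The component-ID helpers above pack a mirror ID into bits 16-30 and a per-mirror sequence ID into the low 16 bits, and lov_user_md_size() gives the byte count a v1/v3 layout of a given stripe count occupies. A small self-check sketch (illustrative, not from the patch):

#include <assert.h>
#include <lustre/lustre_user.h>	/* assumed install path */

static void layout_helpers_demo(void)
{
	__u32 id = pflr_id(3, 7);	/* mirror 3, component 7 within it */

	/* The packing round-trips through the extraction helpers. */
	assert(mirror_id_of(id) == 3);
	assert((id & SEQ_ID_MASK) == 7);

	/* Size of a v1 layout header plus four per-stripe entries. */
	assert(lov_user_md_size(4, LOV_USER_MAGIC_V1) ==
	       sizeof(struct lov_user_md_v1) +
	       4 * sizeof(struct lov_user_ost_data_v1));
}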
*/ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v2 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v2 { + struct lu_fid lmd_fid; /* Lustre FID */ + lstatx_t lmd_stx; /* MDS statx struct */ + __u64 lmd_flags; /* MDS stat flags */ + __u32 lmd_lmmsize; /* LOV EA size */ + __u32 lmd_padding; /* unused */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA user data */ +} __attribute__((packed)); +#endif + +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +} __attribute__((packed, __may_alias__)); + +enum lmv_hash_type { + LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ + LMV_HASH_TYPE_ALL_CHARS = 1, + LMV_HASH_TYPE_FNV_1A_64 = 2, + LMV_HASH_TYPE_MAX, +}; + +#define LMV_HASH_TYPE_DEFAULT LMV_HASH_TYPE_FNV_1A_64 + +#define LMV_HASH_NAME_ALL_CHARS "all_char" +#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" + +/* not real hash type, but exposed to user as "space" hash type */ +#define LMV_HASH_NAME_SPACE "space" + +/* Right now only the lower part(0-16bits) of lmv_hash_type is being used, + * and the higher part will be the flag to indicate the status of object, + * for example the object is being migrated. And the hash function + * might be interpreted differently with different flags. */ +#define LMV_HASH_TYPE_MASK 0x0000ffff + +static inline bool lmv_is_known_hash_type(__u32 type) +{ + return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || + (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; +} + +/* The striped directory has ever lost its master LMV EA, then LFSCK + * re-generated it. This flag is used to indicate such case. It is an + * on-disk flag. */ +#define LMV_HASH_FLAG_LOST_LMV 0x10000000 + +#define LMV_HASH_FLAG_BAD_TYPE 0x20000000 +#define LMV_HASH_FLAG_MIGRATION 0x80000000 + +extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; + +/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, + * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ +#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default */ + __u8 lum_max_inherit; /* inherit depth of default LMV */ + __u8 lum_max_inherit_rr; /* inherit depth of default LMV to round-robin mkdir */ + __u16 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_user_mds_data lum_objects[0]; +} __attribute__((packed)); + +/* + * NB, historically default layout didn't set type, but use XATTR name to differ + * from normal layout, for backward compatibility, define LMV_TYPE_DEFAULT 0x0, + * and still use the same method. + */ +enum lmv_type { + LMV_TYPE_DEFAULT = 0x0000, +}; + +/* lum_max_inherit will be decreased by 1 after each inheritance if it's not + * LMV_INHERIT_UNLIMITED or > LMV_INHERIT_MAX. 
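A striped-directory request is described with the lmv_user_md_v1 above; the caller mostly needs the magic, the stripe count, the starting MDT index and a hash type. A minimal sketch of filling one in (illustrative only):

#include <string.h>
#include <lustre/lustre_user.h>	/* assumed install path */

/* Describe a 4-stripe directory starting on MDT index 0, using the
 * default hash function. */
static void init_dirstripe(struct lmv_user_md_v1 *lum)
{
	memset(lum, 0, sizeof(*lum));
	lum->lum_magic = LMV_USER_MAGIC;
	lum->lum_stripe_count = 4;
	lum->lum_stripe_offset = 0;
	lum->lum_hash_type = LMV_HASH_TYPE_DEFAULT;
}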
+ */ +enum { + /* for historical reason, 0 means unlimited inheritance */ + LMV_INHERIT_UNLIMITED = 0, + /* unlimited lum_max_inherit by default */ + LMV_INHERIT_DEFAULT = 0, + /* not inherit any more */ + LMV_INHERIT_END = 1, + /* max inherit depth */ + LMV_INHERIT_MAX = 250, + /* [251, 254] are reserved */ + /* not set, or when inherit depth goes beyond end, */ + LMV_INHERIT_NONE = 255, +}; + +enum { + /* not set, or when inherit_rr depth goes beyond end, */ + LMV_INHERIT_RR_NONE = 0, + /* disable lum_max_inherit_rr by default */ + LMV_INHERIT_RR_DEFAULT = 0, + /* not inherit any more */ + LMV_INHERIT_RR_END = 1, + /* default inherit_rr of ROOT */ + LMV_INHERIT_RR_ROOT = 3, + /* max inherit depth */ + LMV_INHERIT_RR_MAX = 250, + /* [251, 254] are reserved */ + /* unlimited inheritance */ + LMV_INHERIT_RR_UNLIMITED = 255, +}; + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + int size = sizeof(struct lmv_user_md); + + if (lmm_magic == LMV_USER_MAGIC_SPECIFIC) + size += stripes * sizeof(struct lmv_user_mds_data); + + return size; +} + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline bool obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(const struct obd_uuid *uuid) +{ + if (uuid == NULL) + return NULL; + + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid->uuid)]; + + memcpy(temp, uuid->uuid, sizeof(*uuid->uuid) - 1); + temp[sizeof(*uuid->uuid) - 1] = '\0'; + + return temp; + } + return (char *)(uuid->uuid); +} + +#define LUSTRE_MAXFSNAME 8 + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. */ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p != NULL) + *p = '\0'; +} + +/* printf display format for Lustre FIDs + * usage: printf("file FID is "DFID"\n", PFID(fid)); */ +#define FID_NOBRACE_LEN 40 +#define FID_LEN (FID_NOBRACE_LEN + 2) +#define DFID_NOBRACE "%#llx:0x%x:0x%x" +#define DFID "["DFID_NOBRACE"]" +#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver + +/* scanf input parse format for fids in DFID_NOBRACE format + * Need to strip '[' from DFID format first or use "["SFID"]" at caller. 
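Putting the pieces together, the FID of an already-open file can be fetched with LL_IOC_PATH2FID (one of the LL_* ioctls that act on the open file handle) and printed with the DFID/PFID helpers above. A sketch, assuming the usual install path:

#include <stdio.h>
#include <sys/ioctl.h>
#include <lustre/lustre_user.h>	/* assumed install path */

/* Print the FID of an open file in the canonical bracketed form. */
static int print_fid(int fd)
{
	struct lu_fid fid;

	if (ioctl(fd, LL_IOC_PATH2FID, &fid) < 0)
		return -1;

	printf(DFID"\n", PFID(&fid));
	return 0;
}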
+ * usage: sscanf(fidstr, SFID, RFID(&fid)); */ +#define SFID "0x%llx:0x%x:0x%x" +#define RFID(fid) (unsigned long long *)&((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) + +/********* Quotas **********/ + +#define LUSTRE_QUOTABLOCK_BITS 10 +#define LUSTRE_QUOTABLOCK_SIZE (1 << LUSTRE_QUOTABLOCK_BITS) + +static inline __u64 lustre_stoqb(__kernel_size_t space) +{ + return (space + LUSTRE_QUOTABLOCK_SIZE - 1) >> LUSTRE_QUOTABLOCK_BITS; +} + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ +#define LUSTRE_Q_GETDEFAULT 0x80000d /* get default quota */ +#define LUSTRE_Q_SETDEFAULT 0x80000e /* set default quota */ + +/* In the current Lustre implementation, the grace time is either the time + * or the timestamp to be used after some quota ID exceeds the soft limt, + * 48 bits should be enough, its high 16 bits can be used as quota flags. + * */ +#define LQUOTA_GRACE_BITS 48 +#define LQUOTA_GRACE_MASK ((1ULL << LQUOTA_GRACE_BITS) - 1) +#define LQUOTA_GRACE_MAX LQUOTA_GRACE_MASK +#define LQUOTA_GRACE(t) (t & LQUOTA_GRACE_MASK) +#define LQUOTA_FLAG(t) (t >> LQUOTA_GRACE_BITS) +#define LQUOTA_GRACE_FLAG(t, f) ((__u64)t | (__u64)f << LQUOTA_GRACE_BITS) + +/* different quota flags */ + +/* the default quota flag, the corresponding quota ID will use the default + * quota setting, the hardlimit and softlimit of its quota record in the global + * quota file will be set to 0, the low 48 bits of the grace will be set to 0 + * and high 16 bits will contain this flag (see above comment). 
+ * */ +#define LQUOTA_FLAG_DEFAULT 0x0001 + +#define ALLQUOTA 255 /* set all quota */ +static inline char *qtype_name(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return "usr"; + case GRPQUOTA: + return "grp"; + case PRJQUOTA: + return "prj"; + } + return "unknown"; +} + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 +#define SEPOL_DOWNCALL_MAGIC 0x8b8bb842 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +struct sepol_downcall_data { + __u32 sdd_magic; + __s64 sdd_sepol_mtime; + __u16 sdd_sepol_len; + char sdd_sepol[0]; +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: ".^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 + +enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +}; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +#define SWAP_LAYOUTS_CLOSE (1 << 4) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + +/** Bit-mask of valid attributes */ +/* The LA_* flags are written to disk as part of the ChangeLog records + * so they are part of the on-disk and network protocol, and cannot be changed. + * Only the first 12 bits are currently saved. 
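The grace encoding described above keeps the time in the low 48 bits and the quota flags in the high 16, so the LQUOTA_* macros are enough to build and decode such a value. A tiny illustrative sketch:

#include <stdio.h>
#include <lustre/lustre_user.h>	/* assumed install path */

/* Pack a one-week grace time together with the DEFAULT quota flag using
 * the 48/16 bit split described above, then take it apart again. */
static void grace_packing_demo(void)
{
	__u64 packed = LQUOTA_GRACE_FLAG(604800ULL, LQUOTA_FLAG_DEFAULT);

	printf("grace=%llu flags=%#llx\n",
	       (unsigned long long)LQUOTA_GRACE(packed),
	       (unsigned long long)LQUOTA_FLAG(packed));
}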
+ */ +enum la_valid { + LA_ATIME = 1 << 0, /* 0x00001 */ + LA_MTIME = 1 << 1, /* 0x00002 */ + LA_CTIME = 1 << 2, /* 0x00004 */ + LA_SIZE = 1 << 3, /* 0x00008 */ + LA_MODE = 1 << 4, /* 0x00010 */ + LA_UID = 1 << 5, /* 0x00020 */ + LA_GID = 1 << 6, /* 0x00040 */ + LA_BLOCKS = 1 << 7, /* 0x00080 */ + LA_TYPE = 1 << 8, /* 0x00100 */ + LA_FLAGS = 1 << 9, /* 0x00200 */ + LA_NLINK = 1 << 10, /* 0x00400 */ + LA_RDEV = 1 << 11, /* 0x00800 */ + LA_BLKSIZE = 1 << 12, /* 0x01000 */ + LA_KILL_SUID = 1 << 13, /* 0x02000 */ + LA_KILL_SGID = 1 << 14, /* 0x04000 */ + LA_PROJID = 1 << 15, /* 0x08000 */ + LA_LAYOUT_VERSION = 1 << 16, /* 0x10000 */ + LA_LSIZE = 1 << 17, /* 0x20000 */ + LA_LBLOCKS = 1 << 18, /* 0x40000 */ + /** + * Attributes must be transmitted to OST objects + */ + LA_REMOTE_ATTR_SET = (LA_UID | LA_GID | LA_PROJID | LA_LAYOUT_VERSION) +}; + +#define MDS_FMODE_READ 00000001 +#define MDS_FMODE_WRITE 00000002 + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ +/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */ +/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ + +#define MDS_OPEN_CREATED 00000010 +/* MDS_OPEN_CROSS 00000020 obsolete in 2.12, internal use only */ + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_NOIMPORT 020000000 /* nocache object create */ +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ +#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease + * delegation, succeed if it's not + * being opened with conflict mode. 
+ */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +#define MDS_OPEN_RESYNC 04000000000000ULL /* FLR: file resync */ + +/* lustre internal open flags, which should not be set from user space */ +#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ + MDS_OPEN_RELEASE | MDS_OPEN_RESYNC) + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_NONE = -1, + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_SETXATTR = 15, + CL_XATTR = CL_SETXATTR, /* Deprecated name */ + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_MIGRATE = 20, + CL_FLRW = 21, /* FLR: file was firstly written */ + CL_RESYNC = 22, /* FLR: file was resync-ed */ + CL_GETXATTR = 23, + CL_DN_OPEN = 24, /* denied open */ + CL_LAST +}; + +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT", + "FLRW", "RESYNC","GXATR", "NOPEN", + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* 12 bits of per-record data can be stored in the bottom of the flags */ +#define CLF_FLAGSHIFT 12 +enum changelog_rec_flags { + CLF_VERSION = 0x1000, + CLF_RENAME = 0x2000, + CLF_JOBID = 0x4000, + CLF_EXTRA_FLAGS = 0x8000, + CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID | + CLF_EXTRA_FLAGS, + CLF_FLAGMASK = (1U << CLF_FLAGSHIFT) - 1, + CLF_VERMASK = ~CLF_FLAGMASK, +}; + + +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink + * of target */ +#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target + * has an archive in backend */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. 
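Since the low CLF_FLAGSHIFT bits of a record's flags carry per-record-type data, a changelog consumer can decide, for example, whether an unlink removed the last hard link (and therefore whether HSM cleanup may be needed) from the record's cr_type and cr_flags alone. A sketch:

#include <lustre/lustre_user.h>	/* assumed install path */

/* True when an unlink record dropped the last link of the file. */
static int unlink_was_last_link(__u32 cr_type, __u16 cr_flags)
{
	return cr_type == CL_UNLINK &&
	       (cr_flags & CLF_FLAGMASK & CLF_UNLINK_LAST) != 0;
}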
*/ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, + /* Leaving HE_SPARE2 as is. Its referred in the Lemur code */ + HE_IMPORT = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, + CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(enum changelog_rec_flags *clf_flags, + enum hsm_event he) +{ + *clf_flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(enum changelog_rec_flags clf_flags) +{ + return CLF_GET_BITS(clf_flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(enum changelog_rec_flags *clf_flags, + unsigned int bits) +{ + *clf_flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(enum changelog_rec_flags clf_flags) +{ + return CLF_GET_BITS(clf_flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(enum changelog_rec_flags *clf_flags, + unsigned int error) +{ + *clf_flags |= (error << CLF_HSM_ERR_L); +} + +enum changelog_rec_extra_flags { + CLFE_INVALID = 0, + CLFE_UIDGID = 0x0001, + CLFE_NID = 0x0002, + CLFE_OPEN = 0x0004, + CLFE_XATTR = 0x0008, + CLFE_SUPPORTED = CLFE_UIDGID | CLFE_NID | CLFE_OPEN | CLFE_XATTR +}; + +enum changelog_send_flag { + /* Not yet implemented */ + CHANGELOG_FLAG_FOLLOW = 0x01, + /* Blocking IO makes sense in case of slow user parsing of the records, + * but it also prevents us from cleaning up if the records are not + * consumed. */ + CHANGELOG_FLAG_BLOCK = 0x02, + /* Pack jobid into the changelog records if available. */ + CHANGELOG_FLAG_JOBID = 0x04, + /* Pack additional flag bits into the changelog record */ + CHANGELOG_FLAG_EXTRA_FLAGS = 0x08, +}; + +enum changelog_send_extra_flag { + /* Pack uid/gid into the changelog record */ + CHANGELOG_EXTRA_FLAG_UIDGID = 0x01, + /* Pack nid into the changelog record */ + CHANGELOG_EXTRA_FLAG_NID = 0x02, + /* Pack open mode into the changelog record */ + CHANGELOG_EXTRA_FLAG_OMODE = 0x04, + /* Pack xattr name into the changelog record */ + CHANGELOG_EXTRA_FLAG_XATTR = 0x08, +}; + +#define CR_MAXSIZE __ALIGN_KERNEL(2 * NAME_MAX + 2 + \ + changelog_rec_offset(CLF_SUPPORTED, \ + CLFE_SUPPORTED), 8) + +/* 31 usable bytes string + null terminator. */ +#define LUSTRE_JOBID_SIZE 32 + +/* This is the minimal changelog record. It can contain extensions + * such as rename fields or process jobid. Its exact content is described + * by the cr_flags and cr_extra_flags. + * + * Extensions are packed in the same order as their corresponding flags, + * then in the same order as their corresponding extra flags. + */ +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< \a changelog_rec_flags */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + struct lu_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + struct lu_fid cr_pfid; /**< parent fid */ +} __attribute__ ((packed)); + +/* Changelog extension for RENAME. 
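For CL_HSM records those same 12 flag bits are further split into an error code, an event and HSM flags, which the helpers above extract. A short decoding sketch, for illustration:

#include <stdio.h>
#include <lustre/lustre_user.h>	/* assumed install path */

/* Break the packed HSM sub-fields of a CL_HSM record's flags apart. */
static void print_hsm_flags(__u16 cr_flags)
{
	printf("event=%d error=%d dirty=%d\n",
	       hsm_get_cl_event(cr_flags),
	       hsm_get_cl_error(cr_flags),
	       hsm_get_cl_flags(cr_flags) & CLF_HSM_DIRTY);
}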
*/ +struct changelog_ext_rename { + struct lu_fid cr_sfid; /**< source fid, or zero */ + struct lu_fid cr_spfid; /**< source parent fid, or zero */ +}; + +/* Changelog extension to include JOBID. */ +struct changelog_ext_jobid { + char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. */ +}; + +/* Changelog extension to include additional flags. */ +struct changelog_ext_extra_flags { + __u64 cr_extra_flags; /* Additional CLFE_* flags */ +}; + +/* Changelog extra extension to include UID/GID. */ +struct changelog_ext_uidgid { + __u64 cr_uid; + __u64 cr_gid; +}; + +/* Changelog extra extension to include NID. */ +struct changelog_ext_nid { + /* have __u64 instead of lnet_nid_t type for use by client api */ + __u64 cr_nid; + /* for use when IPv6 support is added */ + __u64 extra; + __u32 padding; +}; + +/* Changelog extra extension to include low 32 bits of MDS_OPEN_* flags. */ +struct changelog_ext_openmode { + __u32 cr_openflags; +}; + +/* Changelog extra extension to include xattr */ +struct changelog_ext_xattr { + char cr_xattr[XATTR_NAME_MAX + 1]; /**< zero-terminated string. */ +}; + +static inline struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec); + +static inline __kernel_size_t changelog_rec_offset(enum changelog_rec_flags crf, + enum changelog_rec_extra_flags cref) +{ + __kernel_size_t size = sizeof(struct changelog_rec); + + if (crf & CLF_RENAME) + size += sizeof(struct changelog_ext_rename); + + if (crf & CLF_JOBID) + size += sizeof(struct changelog_ext_jobid); + + if (crf & CLF_EXTRA_FLAGS) { + size += sizeof(struct changelog_ext_extra_flags); + if (cref & CLFE_UIDGID) + size += sizeof(struct changelog_ext_uidgid); + if (cref & CLFE_NID) + size += sizeof(struct changelog_ext_nid); + if (cref & CLFE_OPEN) + size += sizeof(struct changelog_ext_openmode); + if (cref & CLFE_XATTR) + size += sizeof(struct changelog_ext_xattr); + } + + return size; +} + +static inline __kernel_size_t changelog_rec_size(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + + return changelog_rec_offset(rec->cr_flags, cref); +} + +static inline __kernel_size_t changelog_rec_varsize(const struct changelog_rec *rec) +{ + return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; +} + +static inline +struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; + + return (struct changelog_ext_rename *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The jobid follows the rename extension, if present */ +static inline +struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME); + + return (struct changelog_ext_jobid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The additional flags follow the rename and jobid extensions, if present */ +static inline +struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID); + + return (struct changelog_ext_extra_flags *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The uid/gid is the first extra extension */ +static inline +struct changelog_ext_uidgid *changelog_rec_uidgid( + const struct 
changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + + return (struct changelog_ext_uidgid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The nid is the second extra extension */ +static inline +struct changelog_ext_nid *changelog_rec_nid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_UIDGID; + + return (struct changelog_ext_nid *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The OPEN mode is the third extra extension */ +static inline +struct changelog_ext_openmode *changelog_rec_openmode( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID); + + return (struct changelog_ext_openmode *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The xattr name is the fourth extra extension */ +static inline +struct changelog_ext_xattr *changelog_rec_xattr( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID | CLFE_OPEN); + + return (struct changelog_ext_xattr *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The name follows the rename, jobid and extra flags extns, if present */ +static inline char *changelog_rec_name(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + + return (char *)rec + changelog_rec_offset(rec->cr_flags & CLF_SUPPORTED, + cref & CLFE_SUPPORTED); +} + +static inline char *changelog_rec_sname(const struct changelog_rec *rec) +{ + return strchrnul(changelog_rec_name(rec), '\0') + 1; +} + +static inline __kernel_size_t changelog_rec_snamelen(const struct changelog_rec *rec) +{ + return strlen(changelog_rec_sname(rec)); +} + +/** + * Remap a record to the desired format as specified by the crf flags. + * The record must be big enough to contain the final remapped version. + * Superfluous extension fields are removed and missing ones are added + * and zeroed. The flags of the record are updated accordingly. + * + * The jobid and rename extensions can be added to a record, to match the + * format an application expects, typically. In this case, the newly added + * fields will be zeroed. + * The Jobid field can be removed, to guarantee compatibility with older + * clients that don't expect this field in the records they process. + * + * The following assumptions are being made: + * - CLF_RENAME will not be removed + * - CLF_JOBID will not be added without CLF_RENAME being added too + * - CLF_EXTRA_FLAGS will not be added without CLF_JOBID being added too + * + * @param[in,out] rec The record to remap. + * @param[in] crf_wanted Flags describing the desired extensions. 
+ * @param[in] cref_want Flags describing the desired extra extensions. + */ +static inline void changelog_remap_rec(struct changelog_rec *rec, + enum changelog_rec_flags crf_wanted, + enum changelog_rec_extra_flags cref_want) +{ + char *xattr_mov = NULL; + char *omd_mov = NULL; + char *nid_mov = NULL; + char *uidgid_mov = NULL; + char *ef_mov; + char *jid_mov; + char *rnm_mov; + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + crf_wanted &= CLF_SUPPORTED; + cref_want &= CLFE_SUPPORTED; + + if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) { + if (!(rec->cr_flags & CLF_EXTRA_FLAGS) || + (rec->cr_flags & CLF_EXTRA_FLAGS && + (changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_SUPPORTED) == + cref_want)) + return; + } + + /* First move the variable-length name field */ + memmove((char *)rec + changelog_rec_offset(crf_wanted, cref_want), + changelog_rec_name(rec), rec->cr_namelen); + + /* Locations of extensions in the remapped record */ + if (rec->cr_flags & CLF_EXTRA_FLAGS) { + xattr_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~CLFE_XATTR); + omd_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_OPEN | + CLFE_XATTR)); + nid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_NID | + CLFE_OPEN | + CLFE_XATTR)); + uidgid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_UIDGID | + CLFE_NID | + CLFE_OPEN | + CLFE_XATTR)); + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + } + + ef_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~CLF_EXTRA_FLAGS, + CLFE_INVALID); + jid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & + ~(CLF_EXTRA_FLAGS | CLF_JOBID), + CLFE_INVALID); + rnm_mov = (char *)rec + + changelog_rec_offset(crf_wanted & + ~(CLF_EXTRA_FLAGS | + CLF_JOBID | + CLF_RENAME), + CLFE_INVALID); + + /* Move the extension fields to the desired positions */ + if ((crf_wanted & CLF_EXTRA_FLAGS) && + (rec->cr_flags & CLF_EXTRA_FLAGS)) { + if ((cref_want & CLFE_XATTR) && (cref & CLFE_XATTR)) + memmove(xattr_mov, changelog_rec_xattr(rec), + sizeof(struct changelog_ext_xattr)); + + if ((cref_want & CLFE_OPEN) && (cref & CLFE_OPEN)) + memmove(omd_mov, changelog_rec_openmode(rec), + sizeof(struct changelog_ext_openmode)); + + if ((cref_want & CLFE_NID) && (cref & CLFE_NID)) + memmove(nid_mov, changelog_rec_nid(rec), + sizeof(struct changelog_ext_nid)); + + if ((cref_want & CLFE_UIDGID) && (cref & CLFE_UIDGID)) + memmove(uidgid_mov, changelog_rec_uidgid(rec), + sizeof(struct changelog_ext_uidgid)); + + memmove(ef_mov, changelog_rec_extra_flags(rec), + sizeof(struct changelog_ext_extra_flags)); + } + + if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) + memmove(jid_mov, changelog_rec_jobid(rec), + sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) + memmove(rnm_mov, changelog_rec_rename(rec), + sizeof(struct changelog_ext_rename)); + + /* Clear newly added fields */ + if (xattr_mov && (cref_want & CLFE_XATTR) && + !(cref & CLFE_XATTR)) + memset(xattr_mov, 0, sizeof(struct changelog_ext_xattr)); + + if (omd_mov && (cref_want & CLFE_OPEN) && + !(cref & CLFE_OPEN)) + memset(omd_mov, 0, sizeof(struct changelog_ext_openmode)); + + if (nid_mov && (cref_want & CLFE_NID) && + !(cref & CLFE_NID)) + memset(nid_mov, 0, sizeof(struct changelog_ext_nid)); + + if (uidgid_mov && (cref_want & CLFE_UIDGID) && + !(cref & CLFE_UIDGID)) + memset(uidgid_mov, 0, 
sizeof(struct changelog_ext_uidgid)); + + if ((crf_wanted & CLF_EXTRA_FLAGS) && + !(rec->cr_flags & CLF_EXTRA_FLAGS)) + memset(ef_mov, 0, sizeof(struct changelog_ext_extra_flags)); + + if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) + memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) + memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); + + /* Update the record's flags accordingly */ + rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; + if (rec->cr_flags & CLF_EXTRA_FLAGS) + changelog_rec_extra_flags(rec)->cr_extra_flags = + changelog_rec_extra_flags(rec)->cr_extra_flags | + cref_want; +} + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u32 idv_layout_version; /* FLR: layout version for OST objects */ + __u32 idv_flags; /* enum ioc_data_version_flags */ +}; + +enum ioc_data_version_flags { + LL_DV_RD_FLUSH = (1 << 0), /* Flush dirty pages from clients */ + LL_DV_WR_FLUSH = (1 << 1), /* Flush all caching pages from clients */ +}; + +#ifndef offsetof +#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_NONE = 0x00000000, + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_NONE = 0, + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; + +static inline const char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. */ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. 
+ * it is retuned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14, /* cancel a request */ + HUA_IMPORT = 15, /* add a new file */ +}; + +static inline const char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + case HUA_IMPORT: return "IMPORT"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, cannot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. + * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + struct lu_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** + * Compute the current length of the provided hsm_user_request. This returns -1 + * instead of an errno because __kernel_ssize_t is defined to be only + * [ -1, SSIZE_MAX ] + * + * return -1 on bounds check error. 
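+ *
+ * A minimal userspace sketch of sizing and filling a request (count,
+ * data and data_len are illustrative caller-side names, not part of
+ * this API):
+ *
+ *	len = offsetof(struct hsm_user_request, hur_user_item[0]) +
+ *	      count * sizeof(struct hsm_user_item) + data_len;
+ *	hur = malloc(len);
+ *	hur->hur_request.hr_action = HUA_ARCHIVE;
+ *	hur->hur_request.hr_archive_id = 1;
+ *	hur->hur_request.hr_flags = 0;
+ *	hur->hur_request.hr_itemcount = count;
+ *	hur->hur_request.hr_data_len = data_len;
+ *	memcpy(hur_data(hur), data, data_len);
+ *
+ * with each hur_user_item[i] then filled with the FID and extent to act
+ * on; hur_len(hur) == len holds by construction.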
+ */ +static inline __kernel_size_t hur_len(struct hsm_user_request *hur) +{ + __u64 size; + + /* can't overflow a __u64 since hr_itemcount is only __u32 */ + size = offsetof(struct hsm_user_request, hur_user_item[0]) + + (__u64)hur->hur_request.hr_itemcount * + sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; + + if ((__kernel_ssize_t)size < 0) + return -1; + + return size; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23, + HSMA_IMPORT = 24 +}; + +static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + case HSMA_IMPORT: return "IMPORT"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + struct lu_fid hai_fid; /* Lustre FID to operate on */ + struct lu_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/** + * helper function which print in hexa the first bytes of + * hai opaque field + * + * \param hai [IN] record to print + * \param buffer [IN,OUT] buffer to write the hex string to + * \param len [IN] max buffer length + * + * \retval buffer + */ +static inline char *hai_dump_data_field(const struct hsm_action_item *hai, + char *buffer, __kernel_size_t len) +{ + int i; + int data_len; + char *ptr; + + ptr = buffer; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0; (i < data_len) && (len > 2); i++) { + snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); + ptr += 2; + len -= 2; + } + + *ptr = '\0'; + + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator, ignored */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. 
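+	   A copytool would typically iterate over them with the helpers
+	   defined after this struct; a sketch, where handle_one() stands
+	   in for the copytool's own per-item processing:
+
+		struct hsm_action_item *hai = hai_first(hal);
+		__u32 i;
+
+		for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai))
+			handle_one(hai);
+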
See hai_zero */ +} __attribute__((packed)); + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) +{ + __kernel_size_t offset = __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + + return (struct hsm_action_item *)(hal->hal_fsname + offset); +} + +/* Return pointer to next hai */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + __kernel_size_t offset = __ALIGN_KERNEL(hai->hai_len, 8); + + return (struct hsm_action_item *)((char *)hai + offset); +} + +/* Return size of an hsm_action_list */ +static inline __kernel_size_t hal_size(struct hsm_action_list *hal) +{ + __u32 i; + __kernel_size_t sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + hai = hai_first(hal); + for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) + sz += __ALIGN_KERNEL(hai->hai_len, 8); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + struct lu_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +/* JSON objects */ +enum llapi_json_types { + LLAPI_JSON_INTEGER = 1, + LLAPI_JSON_BIGNUM, + LLAPI_JSON_REAL, + LLAPI_JSON_STRING +}; + +struct llapi_json_item { + char *lji_key; + __u32 lji_type; + union { + int lji_integer; + __u64 lji_u64; + double lji_real; + char *lji_string; + }; + struct llapi_json_item *lji_next; +}; + +struct llapi_json_item_list { + int ljil_item_count; + struct llapi_json_item *ljil_items; +}; + +enum lu_ladvise_type { + LU_LADVISE_INVALID = 0, + LU_LADVISE_WILLREAD = 1, + LU_LADVISE_DONTNEED = 2, + LU_LADVISE_LOCKNOEXPAND = 3, + LU_LADVISE_LOCKAHEAD = 4, + LU_LADVISE_MAX +}; + +#define LU_LADVISE_NAMES { \ + [LU_LADVISE_WILLREAD] = "willread", \ + [LU_LADVISE_DONTNEED] = "dontneed", \ + [LU_LADVISE_LOCKNOEXPAND] = "locknoexpand", \ + [LU_LADVISE_LOCKAHEAD] = "lockahead", \ +} + +/* This is the userspace argument for ladvise. It is currently the same as + * what goes on the wire (struct lu_ladvise), but is defined separately as we + * may need info which is only used locally. 
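+ *
+ * A sketch of how userspace might fill a single-advice request with the
+ * header structure defined further below; the filled buffer would then
+ * be handed to the corresponding ladvise ioctl (not shown here):
+ *
+ *	struct {
+ *		struct llapi_ladvise_hdr hdr;
+ *		struct llapi_lu_ladvise advice;
+ *	} req = { { 0 } };
+ *
+ *	req.hdr.lah_magic = LADVISE_MAGIC;
+ *	req.hdr.lah_count = 1;
+ *	req.advice.lla_advice = LU_LADVISE_WILLREAD;
+ *	req.advice.lla_start = 0;
+ *	req.advice.lla_end = (1 << 20) - 1;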
*/ +struct llapi_lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +enum ladvise_flag { + LF_ASYNC = 0x00000001, + LF_UNSET = 0x00000002, +}; + +#define LADVISE_MAGIC 0x1ADF1CE0 +/* Masks of valid flags for each advice */ +#define LF_LOCKNOEXPAND_MASK LF_UNSET +/* Flags valid for all advices not explicitly specified */ +#define LF_DEFAULT_MASK LF_ASYNC +/* All flags */ +#define LF_MASK (LF_ASYNC | LF_UNSET) + +#define lla_lockahead_mode lla_value1 +#define lla_peradvice_flags lla_value2 +#define lla_lockahead_result lla_value3 + +/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which + * is used on the wire. It is defined separately as we may need info which is + * only used locally. */ +struct llapi_ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#define LAH_COUNT_MAX (1024) + +/* Shared key */ +enum sk_crypt_alg { + SK_CRYPT_INVALID = -1, + SK_CRYPT_EMPTY = 0, + SK_CRYPT_AES256_CTR = 1, +}; + +enum sk_hmac_alg { + SK_HMAC_INVALID = -1, + SK_HMAC_EMPTY = 0, + SK_HMAC_SHA256 = 1, + SK_HMAC_SHA512 = 2, +}; + +struct sk_crypt_type { + const char *sct_name; + int sct_type; +}; + +struct sk_hmac_type { + const char *sht_name; + int sht_type; +}; + +enum lock_mode_user { + MODE_READ_USER = 1, + MODE_WRITE_USER, + MODE_MAX_USER, +}; + +#define LOCK_MODE_NAMES { \ + [MODE_READ_USER] = "READ",\ + [MODE_WRITE_USER] = "WRITE"\ +} + +enum lockahead_results { + LLA_RESULT_SENT = 0, + LLA_RESULT_DIFFERENT, + LLA_RESULT_SAME, +}; + +struct fid_array { + __u32 fa_nr; + /* make header's size equal lu_fid */ + __u32 fa_padding0; + __u64 fa_padding1; + struct lu_fid fa_fids[0]; +}; +#define OBD_MAX_FIDS_IN_ARRAY 4096 + +#if defined(__cplusplus) +} +#endif + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h new file mode 100644 index 0000000000000..90aa25d8aab8a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h @@ -0,0 +1,31 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ + +/* + * LUSTRE_VERSION_STRING + * + * Note that some files may seem to include this header unnecessarily. + * If the file uses LUSTRE_VERSION_STRING, it is likely doing the include + * for compatibility with the Lustre code in the Linux kernel. + * In the Linux kernel, they are likely hard coding LUSTRE_VERSION_STRING + * right here in this file. The out-of-kernel Lustre code generates + * LUSTRE_VERSION_STRING in autoconf with AC_DEFINE. 
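+ *
+ * As a worked example of the encoding implemented by the macros below,
+ * a hypothetical version 2.12.5.1 packs into a single 32-bit value:
+ *
+ *	OBD_OCD_VERSION(2, 12, 5, 1) == (2 << 24) + (12 << 16) + (5 << 8) + 1
+ *	                             == 0x020c0501
+ *	OBD_OCD_VERSION_MINOR(0x020c0501) == 12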
+ */ + +#define OBD_OCD_VERSION(major, minor, patch, fix) \ + (((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix)) + +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version) >> 24) & 255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version) >> 16) & 255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version) >> 8) & 255) +#define OBD_OCD_VERSION_FIX(version) ((int)((version) >> 0) & 255) + +#define LUSTRE_VERSION_CODE \ + OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) + +/* If lustre version of client and servers it connects to differs by more + * than this amount, client would issue a warning. + * (set in lustre/autoconf/lustre-version.ac) */ +#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 50, 0) + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h new file mode 100644 index 0000000000000..1f02294b9660d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h @@ -0,0 +1,154 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _UPCALL_CACHE_H +#define _UPCALL_CACHE_H + +#include +#include + +/** \defgroup ucache ucache + * + * @{ + */ + +#define UC_CACHE_NEW 0x01 +#define UC_CACHE_ACQUIRING 0x02 +#define UC_CACHE_INVALID 0x04 +#define UC_CACHE_EXPIRED 0x08 + +#define UC_CACHE_IS_NEW(i) ((i)->ue_flags & UC_CACHE_NEW) +#define UC_CACHE_IS_INVALID(i) ((i)->ue_flags & UC_CACHE_INVALID) +#define UC_CACHE_IS_ACQUIRING(i) ((i)->ue_flags & UC_CACHE_ACQUIRING) +#define UC_CACHE_IS_EXPIRED(i) ((i)->ue_flags & UC_CACHE_EXPIRED) +#define UC_CACHE_IS_VALID(i) ((i)->ue_flags == 0) + +#define UC_CACHE_SET_NEW(i) ((i)->ue_flags |= UC_CACHE_NEW) +#define UC_CACHE_SET_INVALID(i) ((i)->ue_flags |= UC_CACHE_INVALID) +#define UC_CACHE_SET_ACQUIRING(i) ((i)->ue_flags |= UC_CACHE_ACQUIRING) +#define UC_CACHE_SET_EXPIRED(i) ((i)->ue_flags |= UC_CACHE_EXPIRED) +#define UC_CACHE_SET_VALID(i) ((i)->ue_flags = 0) + +#define UC_CACHE_CLEAR_NEW(i) ((i)->ue_flags &= ~UC_CACHE_NEW) +#define UC_CACHE_CLEAR_ACQUIRING(i) ((i)->ue_flags &= ~UC_CACHE_ACQUIRING) +#define UC_CACHE_CLEAR_INVALID(i) ((i)->ue_flags &= ~UC_CACHE_INVALID) +#define UC_CACHE_CLEAR_EXPIRED(i) ((i)->ue_flags &= ~UC_CACHE_EXPIRED) + +struct upcall_cache_entry; + +struct md_perm { + lnet_nid_t mp_nid; + uint32_t mp_perm; +}; + +struct md_identity { + struct upcall_cache_entry *mi_uc_entry; + uid_t mi_uid; + gid_t mi_gid; + struct group_info *mi_ginfo; + int mi_nperms; + struct md_perm *mi_perms; +}; + +struct upcall_cache_entry { + struct list_head ue_hash; + uint64_t ue_key; + atomic_t ue_refcount; + int ue_flags; + wait_queue_head_t ue_waitq; + time64_t ue_acquire_expire; + time64_t ue_expire; + union { + struct md_identity identity; + } u; +}; + +#define UC_CACHE_HASH_SIZE (128) +#define UC_CACHE_HASH_INDEX(id) ((id) & (UC_CACHE_HASH_SIZE - 1)) +#define UC_CACHE_UPCALL_MAXPATH (1024UL) + +struct upcall_cache; + +struct upcall_cache_ops { + void (*init_entry)(struct upcall_cache_entry *, void *args); + void (*free_entry)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*upcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*downcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*do_upcall)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*parse_downcall)(struct upcall_cache *, + struct upcall_cache_entry *, void *); +}; + +struct upcall_cache { + struct list_head uc_hashtable[UC_CACHE_HASH_SIZE]; + spinlock_t uc_lock; + struct rw_semaphore uc_upcall_rwsem; + + char uc_name[40]; /* for upcall */ + char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; + time64_t uc_acquire_expire; /* seconds */ + time64_t uc_entry_expire; /* seconds */ + struct upcall_cache_ops *uc_ops; +}; + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args); +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry); +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args); +void upcall_cache_flush(struct upcall_cache *cache, int force); + +static inline void upcall_cache_flush_idle(struct upcall_cache *cache) +{ + upcall_cache_flush(cache, 0); +} + +static inline void upcall_cache_flush_all(struct upcall_cache *cache) +{ + upcall_cache_flush(cache, 1); +} + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args); +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops); +void 
upcall_cache_cleanup(struct upcall_cache *cache); + +/** @} ucache */ + +#endif /* _UPCALL_CACHE_H */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c new file mode 100644 index 0000000000000..b39b105a894e6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c @@ -0,0 +1,762 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/interval_tree.c + * + * Interval tree library used by ldlm extent lock code + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#include +#include + +enum { + INTERVAL_RED = 0, + INTERVAL_BLACK = 1 +}; + +static inline int node_is_left_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_left; +} + +static inline int node_is_right_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_right; +} + +static inline int node_is_red(struct interval_node *node) +{ + return node->in_color == INTERVAL_RED; +} + +static inline int node_is_black(struct interval_node *node) +{ + return node->in_color == INTERVAL_BLACK; +} + +static inline int extent_compare(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + int rc; + if (e1->start == e2->start) { + if (e1->end < e2->end) + rc = -1; + else if (e1->end > e2->end) + rc = 1; + else + rc = 0; + } else { + if (e1->start < e2->start) + rc = -1; + else + rc = 1; + } + return rc; +} + +static inline int extent_equal(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start == e2->start) && (e1->end == e2->end); +} + +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_compare(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_compare(&n1->in_extent, &n2->in_extent); +} + +int node_equal(struct interval_node *n1, struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + +static inline __u64 max_u64(__u64 x, __u64 y) +{ + return x > y ? x : y; +} + +static inline __u64 min_u64(__u64 x, __u64 y) +{ + return x < y ? 
x : y; +} + +#define interval_for_each(node, root) \ +for (node = interval_first(root); node != NULL; \ + node = interval_next(node)) + +#define interval_for_each_reverse(node, root) \ +for (node = interval_last(root); node != NULL; \ + node = interval_prev(node)) + +static struct interval_node *interval_first(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_left) + node = node->in_left; + RETURN(node); +} + +static struct interval_node *interval_last(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_right) + node = node->in_right; + RETURN(node); +} + +static struct interval_node *interval_next(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + if (node->in_right) + RETURN(interval_first(node->in_right)); + while (node->in_parent && node_is_right_child(node)) + node = node->in_parent; + RETURN(node->in_parent); +} + +static struct interval_node *interval_prev(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + + if (node->in_left) + RETURN(interval_last(node->in_left)); + + while (node->in_parent && node_is_left_child(node)) + node = node->in_parent; + + RETURN(node->in_parent); +} + +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + ENTRY; + + interval_for_each(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate); + +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + ENTRY; + + interval_for_each_reverse(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate_reverse); + +/* try to find a node with same interval in the tree, + * if found, return the pointer to the node, otherwise return NULL*/ +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex) +{ + struct interval_node *walk = root; + int rc; + ENTRY; + + while (walk) { + rc = extent_compare(ex, &walk->in_extent); + if (rc == 0) + break; + else if (rc < 0) + walk = walk->in_left; + else + walk = walk->in_right; + } + + RETURN(walk); +} +EXPORT_SYMBOL(interval_find); + +static void __rotate_change_maxhigh(struct interval_node *node, + struct interval_node *rotate) +{ + __u64 left_max, right_max; + + rotate->in_max_high = node->in_max_high; + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max,right_max)); +} + +/* The left rotation "pivots" around the link from node to node->right, and + * - node will be linked to node->right's left child, and + * - node->right's left child will be linked to node's right child. 
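+ *
+ * Illustration (N = node, R = node->in_right; the subtrees a, b and c
+ * keep their relative order):
+ *
+ *	      N                     R
+ *	     / \                   / \
+ *	    a   R       ==>       N   c
+ *	       / \               / \
+ *	      b   c             a   b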
*/ +static void __rotate_left(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *right = node->in_right; + struct interval_node *parent = node->in_parent; + + node->in_right = right->in_left; + if (node->in_right) + right->in_left->in_parent = node; + + right->in_left = node; + right->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = right; + else + parent->in_right = right; + } else { + *root = right; + } + node->in_parent = right; + + /* update max_high for node and right */ + __rotate_change_maxhigh(node, right); +} + +/* The right rotation "pivots" around the link from node to node->left, and + * - node will be linked to node->left's right child, and + * - node->left's right child will be linked to node's left child. */ +static void __rotate_right(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *left = node->in_left; + struct interval_node *parent = node->in_parent; + + node->in_left = left->in_right; + if (node->in_left) + left->in_right->in_parent = node; + left->in_right = node; + + left->in_parent = parent; + if (parent) { + if (node_is_right_child(node)) + parent->in_right = left; + else + parent->in_left = left; + } else { + *root = left; + } + node->in_parent = left; + + /* update max_high for node and left */ + __rotate_change_maxhigh(node, left); +} + +#define interval_swap(a, b) do { \ + struct interval_node *c = a; a = b; b = c; \ +} while (0) + +/* + * Operations INSERT and DELETE, when run on a tree with n keys, + * take O(logN) time.Because they modify the tree, the result + * may violate the red-black properties.To restore these properties, + * we must change the colors of some of the nodes in the tree + * and also change the pointer structure. 
+ */ +static void interval_insert_color(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *parent, *gparent; + ENTRY; + + while ((parent = node->in_parent) && node_is_red(parent)) { + gparent = parent->in_parent; + /* Parent is RED, so gparent must not be NULL */ + if (node_is_left_child(parent)) { + struct interval_node *uncle; + uncle = gparent->in_right; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (parent->in_right == node) { + __rotate_left(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_right(gparent, root); + } else { + struct interval_node *uncle; + uncle = gparent->in_left; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (node_is_left_child(node)) { + __rotate_right(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_left(gparent, root); + } + } + + (*root)->in_color = INTERVAL_BLACK; + EXIT; +} + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root) + +{ + struct interval_node **p, *parent = NULL; + ENTRY; + + LASSERT(!interval_is_intree(node)); + p = root; + while (*p) { + parent = *p; + if (node_equal(parent, node)) + RETURN(parent); + + /* max_high field must be updated after each iteration */ + if (parent->in_max_high < interval_high(node)) + parent->in_max_high = interval_high(node); + + if (node_compare(node, parent) < 0) + p = &parent->in_left; + else + p = &parent->in_right; + } + + /* link node into the tree */ + node->in_parent = parent; + node->in_color = INTERVAL_RED; + node->in_left = node->in_right = NULL; + *p = node; + + interval_insert_color(node, root); + node->in_intree = 1; + + RETURN(NULL); +} +EXPORT_SYMBOL(interval_insert); + +static inline int node_is_black_or_0(struct interval_node *node) +{ + return !node || node_is_black(node); +} + +static void interval_erase_color(struct interval_node *node, + struct interval_node *parent, + struct interval_node **root) +{ + struct interval_node *tmp; + ENTRY; + + while (node_is_black_or_0(node) && node != *root) { + if (parent->in_left == node) { + tmp = parent->in_right; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_left(parent, root); + tmp = parent->in_right; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_right)) { + struct interval_node *o_left; + if ((o_left = tmp->in_left)) + o_left->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_right(tmp, root); + tmp = parent->in_right; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_right) + tmp->in_right->in_color = INTERVAL_BLACK; + __rotate_left(parent, root); + node = *root; + break; + } + } else { + tmp = parent->in_left; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_right(parent, root); + tmp = parent->in_left; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + 
tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_left)) { + struct interval_node *o_right; + if ((o_right = tmp->in_right)) + o_right->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_left(tmp, root); + tmp = parent->in_left; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_left) + tmp->in_left->in_color = INTERVAL_BLACK; + __rotate_right(parent, root); + node = *root; + break; + } + } + } + if (node) + node->in_color = INTERVAL_BLACK; + EXIT; +} + +/* + * if the @max_high value of @node is changed, this function traverse a path + * from node up to the root to update max_high for the whole tree. + */ +static void update_maxhigh(struct interval_node *node, + __u64 old_maxhigh) +{ + __u64 left_max, right_max; + ENTRY; + + while (node) { + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max, right_max)); + + if (node->in_max_high >= old_maxhigh) + break; + node = node->in_parent; + } + EXIT; +} + +void interval_erase(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *child, *parent; + int color; + ENTRY; + + LASSERT(interval_is_intree(node)); + node->in_intree = 0; + if (!node->in_left) { + child = node->in_right; + } else if (!node->in_right) { + child = node->in_left; + } else { /* Both left and right child are not NULL */ + struct interval_node *old = node; + + node = interval_next(node); + child = node->in_right; + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent == old) + parent->in_right = child; + else + parent->in_left = child; + + node->in_color = old->in_color; + node->in_right = old->in_right; + node->in_left = old->in_left; + node->in_parent = old->in_parent; + + if (old->in_parent) { + if (node_is_left_child(old)) + old->in_parent->in_left = node; + else + old->in_parent->in_right = node; + } else { + *root = node; + } + + old->in_left->in_parent = node; + if (old->in_right) + old->in_right->in_parent = node; + update_maxhigh(child ? : parent, node->in_max_high); + update_maxhigh(node, old->in_max_high); + if (parent == old) + parent = node; + goto color; + } + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = child; + else + parent->in_right = child; + } else { + *root = child; + } + + update_maxhigh(child ? : parent, node->in_max_high); + +color: + if (color == INTERVAL_BLACK) + interval_erase_color(child, parent, root); + EXIT; +} +EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. 
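+ *
+ * The iterative walk below is equivalent to the following recursive
+ * sketch: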
+ * + * { + * if (node == NULL) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + struct interval_node *parent; + enum interval_iter rc = INTERVAL_ITER_CONT; + + ENTRY; + + LASSERT(ext != NULL); + LASSERT(func != NULL); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* If we ever got the left, it means that the + * parent met ext->endin_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (parent == NULL || !interval_may_overlap(parent, ext)) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_search); + +static enum interval_iter interval_overlap_cb(struct interval_node *n, + void *args) +{ + *(int *)args = 1; + return INTERVAL_ITER_STOP; +} + +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ext) +{ + int has = 0; + (void)interval_search(root, ext, interval_overlap_cb, &has); + return has; +} +EXPORT_SYMBOL(interval_is_overlapped); + +/* Don't expand to low. Expanding downwards is expensive, and meaningless to + * some extents, because programs seldom do IO backward. + * + * The recursive algorithm of expanding low: + * expand_low { + * struct interval_node *tmp; + * static __u64 res = 0; + * + * if (root == NULL) + * return res; + * if (root->in_max_high < low) { + * res = max_u64(root->in_max_high + 1, res); + * return res; + * } else if (low < interval_low(root)) { + * interval_expand_low(root->in_left, low); + * return res; + * } + * + * if (interval_high(root) < low) + * res = max_u64(interval_high(root) + 1, res); + * interval_expand_low(root->in_left, low); + * interval_expand_low(root->in_right, low); + * + * return res; + * } + * + * It's much easy to eliminate the recursion, see interval_search for + * an example. -jay + */ +static inline __u64 interval_expand_low(struct interval_node *root, __u64 low) +{ + /* we only concern the empty tree right now. */ + if (root == NULL) + return 0; + return low; +} + +static inline __u64 interval_expand_high(struct interval_node *node, __u64 high) +{ + __u64 result = ~0; + + while (node != NULL) { + if (node->in_max_high < high) + break; + + if (interval_low(node) > high) { + result = interval_low(node) - 1; + node = node->in_left; + } else { + node = node->in_right; + } + } + + return result; +} + +/* expanding the extent based on @ext. */ +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter) +{ + /* The assertion of interval_is_overlapped is expensive because we may + * travel many nodes to find the overlapped node. 
*/ + LASSERT(interval_is_overlapped(root, ext) == 0); + if (!limiter || limiter->start < ext->start) + ext->start = interval_expand_low(root, ext->start); + if (!limiter || limiter->end > ext->end) + ext->end = interval_expand_high(root, ext->end); + LASSERT(interval_is_overlapped(root, ext) == 0); +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c new file mode 100644 index 0000000000000..a4f7c85a42efb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include + +#include +#include + +/** + * Lock a lock and its resource. + * + * LDLM locking uses resource to serialize access to locks + * but there is a case when we change resource of lock upon + * enqueue reply. We rely on lock->l_resource = new_res + * being an atomic operation. + */ +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + if (!ldlm_is_ns_srv(lock)) + spin_lock(&lock->l_lock); + + lock_res(lock->l_resource); + + ldlm_set_res_locked(lock); + return lock->l_resource; +} +EXPORT_SYMBOL(lock_res_and_lock); + +/** + * Unlock a lock and its resource previously locked with lock_res_and_lock + */ +void unlock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + ldlm_clear_res_locked(lock); + + unlock_res(lock->l_resource); + if (!ldlm_is_ns_srv(lock)) + spin_unlock(&lock->l_lock); +} +EXPORT_SYMBOL(unlock_res_and_lock); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c new file mode 100644 index 0000000000000..59d1302a36516 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c @@ -0,0 +1,1138 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_extent.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of EXTENT lock type + * + * EXTENT lock type is for locking a contiguous range of values, represented + * by 64-bit starting and ending offsets (inclusive). There are several extent + * lock modes, some of which may be mutually incompatible. Extent locks are + * considered incompatible if their modes are incompatible and their extents + * intersect. See the lock mode compatibility matrix in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +# define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1) + +/** + * Fix up the ldlm_extent after expanding it. + * + * After expansion has been done, we might still want to do certain adjusting + * based on overall contention of the resource and the like to avoid granting + * overly wide locks. + */ +static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req, + struct ldlm_extent *new_ex, + int conflicting) +{ + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + __u64 req_align, mask; + + if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) { + if (req_end < req_start + LDLM_MAX_GROWN_EXTENT) + new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT, + new_ex->end); + } + + if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) { + EXIT; + return; + } + + /* we need to ensure that the lock extent is properly aligned to what + * the client requested. Also we need to make sure it's also server + * page size aligned otherwise a server page can be covered by two + * write locks. */ + mask = PAGE_SIZE; + req_align = (req_end + 1) | req_start; + if (req_align != 0 && (req_align & (mask - 1)) == 0) { + while ((req_align & mask) == 0) + mask <<= 1; + } + mask -= 1; + /* We can only shrink the lock, not grow it. + * This should never cause lock to be smaller than requested, + * since requested lock was already aligned on these boundaries. */ + new_ex->start = ((new_ex->start - 1) | mask) + 1; + new_ex->end = ((new_ex->end + 1) & ~mask) - 1; + LASSERTF(new_ex->start <= req_start, + "mask %#llx grant start %llu req start %llu\n", + mask, new_ex->start, req_start); + LASSERTF(new_ex->end >= req_end, + "mask %#llx grant end %llu req end %llu\n", + mask, new_ex->end, req_end); +} + +/** + * Return the maximum extent that: + * - contains the requested extent + * - does not overlap existing conflicting extents outside the requested one + * + * This allows clients to request a small required extent range, but if there + * is no contention on the lock the full lock can be granted to the client. + * This avoids the need for many smaller lock requests to be granted in the + * common (uncontended) case. 
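+ *
+ * Worked example (hypothetical numbers, 4KiB pages): a client requests
+ * [4096, 8191].  With no conflicting granted locks the candidate grant
+ * stays [0, OBD_OBJECT_EOF] and the whole file is locked.  If instead
+ * the expansion below stops at, say, [1000, 20000], the alignment fixup
+ * in ldlm_extent_internal_policy_fixup() shrinks it to the page-aligned
+ * [4096, 16383], which still contains the requested range.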
+ * + * Use interval tree to expand the lock extent for granted lock. + */ +static void ldlm_extent_internal_policy_granted(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_interval_tree *tree; + struct interval_node_extent limiter = { + .start = new_ex->start, + .end = new_ex->end, + }; + int conflicting = 0; + int idx; + ENTRY; + + lockmode_verify(req_mode); + + /* Using interval tree to handle the LDLM extent granted locks. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct interval_node_extent ext = { + .start = req_start, + .end = req_end, + }; + + tree = &res->lr_itree[idx]; + if (lockmode_compat(tree->lit_mode, req_mode)) + continue; + + conflicting += tree->lit_size; + if (conflicting > 4) + limiter.start = req_start; + + if (interval_is_overlapped(tree->lit_root, &ext)) + CDEBUG(D_INFO, + "req_mode = %d, tree->lit_mode = %d, " + "tree->lit_size = %d\n", + req_mode, tree->lit_mode, tree->lit_size); + interval_expand(tree->lit_root, &ext, &limiter); + limiter.start = max(limiter.start, ext.start); + limiter.end = min(limiter.end, ext.end); + if (limiter.start == req_start && limiter.end == req_end) + break; + } + + new_ex->start = limiter.start; + new_ex->end = limiter.end; + LASSERT(new_ex->start <= req_start); + LASSERT(new_ex->end >= req_end); + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + +/* The purpose of this function is to return: + * - the maximum extent + * - containing the requested extent + * - and not overlapping existing conflicting extents outside the requested one + */ +static void +ldlm_extent_internal_policy_waiting(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_lock *lock; + int conflicting = 0; + ENTRY; + + lockmode_verify(req_mode); + + /* for waiting locks */ + list_for_each_entry(lock, &res->lr_waiting, l_res_link) { + struct ldlm_extent *l_extent = &lock->l_policy_data.l_extent; + + /* We already hit the minimum requested size, search no more */ + if (new_ex->start == req_start && new_ex->end == req_end) { + EXIT; + return; + } + + /* Don't conflict with ourselves */ + if (req == lock) + continue; + + /* Locks are compatible, overlap doesn't matter */ + /* Until bug 20 is fixed, try to avoid granting overlapping + * locks on one client (they take a long time to cancel) */ + if (lockmode_compat(lock->l_req_mode, req_mode) && + lock->l_export != req->l_export) + continue; + + /* If this is a high-traffic lock, don't grow downwards at all + * or grow upwards too much */ + ++conflicting; + if (conflicting > 4) + new_ex->start = req_start; + + /* If lock doesn't overlap new_ex, skip it. */ + if (!ldlm_extent_overlap(l_extent, new_ex)) + continue; + + /* Locks conflicting in requested extents and we can't satisfy + * both locks, so ignore it. Either we will ping-pong this + * extent (we would regardless of what extent we granted) or + * lock is unused and it shouldn't limit our extent growth. 
*/ + if (ldlm_extent_overlap(&lock->l_req_extent,&req->l_req_extent)) + continue; + + /* We grow extents downwards only as far as they don't overlap + * with already-granted locks, on the assumption that clients + * will be writing beyond the initial requested end and would + * then need to enqueue a new lock beyond previous request. + * l_req_extent->end strictly < req_start, checked above. */ + if (l_extent->start < req_start && new_ex->start != req_start) { + if (l_extent->end >= req_start) + new_ex->start = req_start; + else + new_ex->start = min(l_extent->end+1, req_start); + } + + /* If we need to cancel this lock anyways because our request + * overlaps the granted lock, we grow up to its requested + * extent start instead of limiting this extent, assuming that + * clients are writing forwards and the lock had over grown + * its extent downwards before we enqueued our request. */ + if (l_extent->end > req_end) { + if (l_extent->start <= req_end) + new_ex->end = max(lock->l_req_extent.start - 1, + req_end); + else + new_ex->end = max(l_extent->start - 1, req_end); + } + } + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + + +/* In order to determine the largest possible extent we can grant, we need + * to scan all of the queues. */ +static void ldlm_extent_policy(struct ldlm_resource *res, + struct ldlm_lock *lock, __u64 *flags) +{ + struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; + + if (lock->l_export == NULL) + /* + * this is a local lock taken by server (e.g., as a part of + * OST-side locking, or unlink handling). Expansion doesn't + * make a lot of sense for local locks, because they are + * dropped immediately on operation completion and would only + * conflict with other threads. + */ + return; + + if (lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + /* fast-path whole file locks */ + return; + + /* Because reprocess_queue zeroes flags and uses it to return + * LDLM_FL_LOCK_CHANGED, we must check for the NO_EXPANSION flag + * in the lock flags rather than the 'flags' argument */ + if (likely(!(lock->l_flags & LDLM_FL_NO_EXPANSION))) { + ldlm_extent_internal_policy_granted(lock, &new_ex); + ldlm_extent_internal_policy_waiting(lock, &new_ex); + } else { + LDLM_DEBUG(lock, "Not expanding manually requested lock.\n"); + new_ex.start = lock->l_policy_data.l_extent.start; + new_ex.end = lock->l_policy_data.l_extent.end; + /* In case the request is not on correct boundaries, we call + * fixup. 
(normally called in ldlm_extent_internal_policy_*) */ + ldlm_extent_internal_policy_fixup(lock, &new_ex, 0); + } + + if (!ldlm_extent_equal(&new_ex, &lock->l_policy_data.l_extent)) { + *flags |= LDLM_FL_LOCK_CHANGED; + lock->l_policy_data.l_extent.start = new_ex.start; + lock->l_policy_data.l_extent.end = new_ex.end; + } +} + +static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) +{ + struct ldlm_resource *res = lock->l_resource; + time64_t now = ktime_get_seconds(); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_SET_CONTENTION)) + return 1; + + CDEBUG(D_DLMTRACE, "contended locks = %d\n", contended_locks); + if (contended_locks > ldlm_res_to_ns(res)->ns_contended_locks) + res->lr_contention_time = now; + + return now < res->lr_contention_time + + ldlm_res_to_ns(res)->ns_contention_time; +} + +struct ldlm_extent_compat_args { + struct list_head *work_list; + struct ldlm_lock *lock; + enum ldlm_mode mode; + int *locks; + int *compat; +}; + +static enum interval_iter ldlm_extent_compat_cb(struct interval_node *n, + void *data) +{ + struct ldlm_extent_compat_args *priv = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_extent *extent; + struct list_head *work_list = priv->work_list; + struct ldlm_lock *lock, *enq = priv->lock; + enum ldlm_mode mode = priv->mode; + int count = 0; + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + /* interval tree is for granted lock */ + LASSERTF(mode == lock->l_granted_mode, + "mode = %s, lock->l_granted_mode = %s\n", + ldlm_lockname[mode], + ldlm_lockname[lock->l_granted_mode]); + count++; + if (lock->l_blocking_ast && + lock->l_granted_mode != LCK_GROUP) + ldlm_add_ast_work_item(lock, enq, work_list); + } + + /* don't count conflicting glimpse locks */ + extent = ldlm_interval_extent(node); + if (!(mode == LCK_PR && + extent->start == 0 && extent->end == OBD_OBJECT_EOF)) + *priv->locks += count; + + if (priv->compat) + *priv->compat = 0; + + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. 
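+ *
+ * Illustrative scan-only use, not a verbatim call site ('res' is the
+ * resource and 'req' the lock being checked):
+ *
+ *	__u64 flags = 0;
+ *	enum ldlm_error err = ELDLM_OK;
+ *	int contended_locks = 0;
+ *	int rc;
+ *
+ *	rc = ldlm_extent_compat_queue(&res->lr_granted, req, &flags, &err,
+ *				      NULL, &contended_locks);
+ *
+ * with rc interpreted as described by the return values below.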
+ * + * \retval 0 if the lock is not compatible + * \retval 1 if the lock is compatible + * \retval 2 if \a req is a group lock and it is compatible and requires + * no further checking + * \retval negative error, such as EWOULDBLOCK for group locks + */ +static int +ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, + __u64 *flags, enum ldlm_error *err, + struct list_head *work_list, int *contended_locks) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_lock *lock; + int check_contention; + int compat = 1; + int scan = 0; + ENTRY; + + lockmode_verify(req_mode); + + /* Using interval tree for granted lock */ + if (queue == &res->lr_granted) { + struct ldlm_interval_tree *tree; + struct ldlm_extent_compat_args data = {.work_list = work_list, + .lock = req, + .locks = contended_locks, + .compat = &compat }; + struct interval_node_extent ex = { .start = req_start, + .end = req_end }; + int idx, rc; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + data.mode = tree->lit_mode; + if (lockmode_compat(req_mode, tree->lit_mode)) { + struct ldlm_interval *node; + struct ldlm_extent *extent; + + if (req_mode != LCK_GROUP) + continue; + + /* group lock, grant it immediately if + * compatible */ + node = to_ldlm_interval(tree->lit_root); + extent = ldlm_interval_extent(node); + if (req->l_policy_data.l_extent.gid == + extent->gid) + RETURN(2); + } + + if (tree->lit_mode == LCK_GROUP) { + if (*flags & (LDLM_FL_BLOCK_NOWAIT | + LDLM_FL_SPECULATIVE)) { + compat = -EWOULDBLOCK; + goto destroylock; + } + + *flags |= LDLM_FL_NO_TIMEOUT; + if (!work_list) + RETURN(0); + + /* if work list is not NULL,add all + locks in the tree to work list */ + compat = 0; + interval_iterate(tree->lit_root, + ldlm_extent_compat_cb, &data); + continue; + } + + /* We've found a potentially blocking lock, check + * compatibility. This handles locks other than GROUP + * locks, which are handled separately above. + * + * Locks with FL_SPECULATIVE are asynchronous requests + * which must never wait behind another lock, so they + * fail if any conflicting lock is found. */ + if (!work_list || (*flags & LDLM_FL_SPECULATIVE)) { + rc = interval_is_overlapped(tree->lit_root, + &ex); + if (rc) { + if (!work_list) { + RETURN(0); + } else { + compat = -EWOULDBLOCK; + goto destroylock; + } + } + } else { + interval_search(tree->lit_root, &ex, + ldlm_extent_compat_cb, &data); + if (!list_empty(work_list) && compat) + compat = 0; + } + } + } else { /* for waiting queue */ + list_for_each_entry(lock, queue, l_res_link) { + check_contention = 1; + + /* We stop walking the queue if we hit ourselves so + * we don't take conflicting locks enqueued after us + * into account, or we'd wait forever. */ + if (req == lock) + break; + + if (unlikely(scan)) { + /* We only get here if we are queuing GROUP lock + and met some incompatible one. 
The main idea of this + code is to insert GROUP lock past compatible GROUP + lock in the waiting queue or if there is not any, + then in front of first non-GROUP lock */ + if (lock->l_req_mode != LCK_GROUP) { + /* Ok, we hit non-GROUP lock, there should + * be no more GROUP locks later on, queue in + * front of first non-GROUP lock */ + + ldlm_resource_insert_lock_after(lock, req); + list_del_init(&lock->l_res_link); + ldlm_resource_insert_lock_after(req, lock); + compat = 0; + break; + } + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* found it */ + ldlm_resource_insert_lock_after(lock, req); + compat = 0; + break; + } + continue; + } + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_req_mode, req_mode)) { + if (req_mode == LCK_PR && + ((lock->l_policy_data.l_extent.start <= + req->l_policy_data.l_extent.start) && + (lock->l_policy_data.l_extent.end >= + req->l_policy_data.l_extent.end))) { + /* If we met a PR lock just like us or + wider, and nobody down the list + conflicted with it, that means we + can skip processing of the rest of + the list and safely place ourselves + at the end of the list, or grant + (dependent if we met an conflicting + locks before in the list). In case + of 1st enqueue only we continue + traversing if there is something + conflicting down the list because + we need to make sure that something + is marked as AST_SENT as well, in + cse of empy worklist we would exit + on first conflict met. */ + /* There IS a case where such flag is + not set for a lock, yet it blocks + something. Luckily for us this is + only during destroy, so lock is + exclusive. So here we are safe */ + if (!ldlm_is_ast_sent(lock)) + RETURN(compat); + } + + /* non-group locks are compatible, overlap doesn't + matter */ + if (likely(req_mode != LCK_GROUP)) + continue; + + /* If we are trying to get a GROUP lock and there is + another one of this kind, we need to compare gid */ + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* If existing lock with matched gid is granted, + we grant new one too. */ + if (ldlm_is_granted(lock)) + RETURN(2); + + /* Otherwise we are scanning queue of waiting + * locks and it means current request would + * block along with existing lock (that is + * already blocked. + * If we are in nonblocking mode - return + * immediately */ + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { + compat = -EWOULDBLOCK; + goto destroylock; + } + /* If this group lock is compatible with another + * group lock on the waiting list, they must be + * together in the list, so they can be granted + * at the same time. Otherwise the later lock + * can get stuck behind another, incompatible, + * lock. */ + ldlm_resource_insert_lock_after(lock, req); + /* Because 'lock' is not granted, we can stop + * processing this queue and return immediately. + * There is no need to check the rest of the + * list. 
*/ + RETURN(0); + } + } + + if (unlikely(req_mode == LCK_GROUP && + !ldlm_is_granted(lock))) { + scan = 1; + compat = 0; + if (lock->l_req_mode != LCK_GROUP) { + /* Ok, we hit non-GROUP lock, there should be no + more GROUP locks later on, queue in front of + first non-GROUP lock */ + + ldlm_resource_insert_lock_after(lock, req); + list_del_init(&lock->l_res_link); + ldlm_resource_insert_lock_after(req, lock); + break; + } + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* found it */ + ldlm_resource_insert_lock_after(lock, req); + break; + } + continue; + } + + if (unlikely(lock->l_req_mode == LCK_GROUP)) { + /* If compared lock is GROUP, then requested is + * PR/PW so this is not compatible; extent + * range does not matter */ + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { + compat = -EWOULDBLOCK; + goto destroylock; + } else { + *flags |= LDLM_FL_NO_TIMEOUT; + } + } else if (lock->l_policy_data.l_extent.end < req_start || + lock->l_policy_data.l_extent.start > req_end) { + /* if a non group lock doesn't overlap skip it */ + continue; + } else if (lock->l_req_extent.end < req_start || + lock->l_req_extent.start > req_end) { + /* false contention, the requests doesn't really overlap */ + check_contention = 0; + } + + if (!work_list) + RETURN(0); + + if (*flags & LDLM_FL_SPECULATIVE) { + compat = -EWOULDBLOCK; + goto destroylock; + } + + /* don't count conflicting glimpse locks */ + if (lock->l_req_mode == LCK_PR && + lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + check_contention = 0; + + *contended_locks += check_contention; + + compat = 0; + if (lock->l_blocking_ast && + lock->l_req_mode != LCK_GROUP) + ldlm_add_ast_work_item(lock, req, work_list); + } + } + + if (ldlm_check_contention(req, *contended_locks) && + compat == 0 && + (*flags & LDLM_FL_DENY_ON_CONTENTION) && + req->l_req_mode != LCK_GROUP && + req_end - req_start <= + ldlm_res_to_ns(req->l_resource)->ns_max_nolock_size) + GOTO(destroylock, compat = -EUSERS); + + RETURN(compat); +destroylock: + list_del_init(&req->l_res_link); + ldlm_lock_destroy_nolock(req); + *err = compat; + RETURN(compat); +} + +/** + * This function refresh eviction timer for cancelled lock. + * \param[in] lock ldlm lock for refresh + * \param[in] arg ldlm prolong arguments, timeout, export, extent + * and counter are used + */ +void ldlm_lock_prolong_one(struct ldlm_lock *lock, + struct ldlm_prolong_args *arg) +{ + time64_t timeout; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PROLONG_PAUSE, 3); + + if (arg->lpa_export != lock->l_export || + lock->l_flags & LDLM_FL_DESTROYED) + /* ignore unrelated locks */ + return; + + arg->lpa_locks_cnt++; + + if (!(lock->l_flags & LDLM_FL_AST_SENT)) + /* ignore locks not being cancelled */ + return; + + /* We are in the middle of the process - BL AST is sent, CANCEL + * is ahead. Take half of BL AT + IO AT process time. + */ + timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1); + + LDLM_DEBUG(lock, "refreshed to %llds.\n", timeout); + + arg->lpa_blocks_cnt++; + + /* OK. this is a possible lock the user holds doing I/O + * let's refresh eviction timer for it. 
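+	 * The refreshed timeout is the caller-supplied lpa_timeout plus half
+	 * of the blocking-AST timeout computed above.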
+ */ + ldlm_refresh_waiting_lock(lock, timeout); +} +EXPORT_SYMBOL(ldlm_lock_prolong_one); + +static enum interval_iter ldlm_resource_prolong_cb(struct interval_node *n, + void *data) +{ + struct ldlm_prolong_args *arg = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_lock *lock; + + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + ldlm_lock_prolong_one(lock, arg); + } + + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Walk through granted tree and prolong locks if they overlaps extent. + * + * \param[in] arg prolong args + */ +void ldlm_resource_prolong(struct ldlm_prolong_args *arg) +{ + struct ldlm_interval_tree *tree; + struct ldlm_resource *res; + struct interval_node_extent ex = { .start = arg->lpa_extent.start, + .end = arg->lpa_extent.end }; + int idx; + + ENTRY; + + res = ldlm_resource_get(arg->lpa_export->exp_obd->obd_namespace, NULL, + &arg->lpa_resid, LDLM_EXTENT, 0); + if (IS_ERR(res)) { + CDEBUG(D_DLMTRACE, "Failed to get resource for resid %llu/%llu\n", + arg->lpa_resid.name[0], arg->lpa_resid.name[1]); + RETURN_EXIT; + } + + lock_res(res); + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + /* There is no possibility to check for the groupID + * so all the group locks are considered as valid + * here, especially because the client is supposed + * to check it has such a lock before sending an RPC. + */ + if (!(tree->lit_mode & arg->lpa_mode)) + continue; + + interval_search(tree->lit_root, &ex, + ldlm_resource_prolong_cb, arg); + } + + unlock_res(res); + ldlm_resource_putref(res); + + EXIT; +} +EXPORT_SYMBOL(ldlm_resource_prolong); + +/** + * Process a granting attempt for extent lock. + * Must be called with ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + int rc, rc2; + int contended_locks = 0; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; + ENTRY; + + LASSERT(!ldlm_is_granted(lock)); + LASSERT(!(*flags & LDLM_FL_DENY_ON_CONTENTION) || + !ldlm_is_ast_discard_data(lock)); + check_res_locked(res); + *err = ELDLM_OK; + + if (intention == LDLM_PROCESS_RESCAN) { + /* Careful observers will note that we don't handle -EWOULDBLOCK + * here, but it's ok for a non-obvious reason -- compat_queue + * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT | + * SPECULATIVE). flags should always be zero here, and if that + * ever stops being true, we want to find out. 
*/ + LASSERT(*flags == 0); + rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, + err, NULL, &contended_locks); + if (rc == 1) { + rc = ldlm_extent_compat_queue(&res->lr_waiting, lock, + flags, err, NULL, + &contended_locks); + } + if (rc == 0) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE)) + ldlm_extent_policy(res, lock, flags); + ldlm_grant_lock(lock, grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + + contended_locks = 0; + rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, + work_list, &contended_locks); + if (rc < 0) + GOTO(out_rpc_list, rc); + + rc2 = 0; + if (rc != 2) { + rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, + flags, err, work_list, + &contended_locks); + if (rc2 < 0) + GOTO(out_rpc_list, rc = rc2); + } + + if (rc + rc2 == 2) { + ldlm_extent_policy(res, lock, flags); + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + } else { + /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to + * force client to wait for the lock endlessly once + * the lock is enqueued -bzzz */ + *flags |= LDLM_FL_NO_TIMEOUT; + } + rc = LDLM_ITER_CONTINUE; + +out_rpc_list: + RETURN(rc); +} +#endif /* HAVE_SERVER_SUPPORT */ + +struct ldlm_kms_shift_args { + __u64 old_kms; + __u64 kms; + bool complete; +}; + +/* Callback for interval_iterate functions, used by ldlm_extent_shift_Kms */ +static enum interval_iter ldlm_kms_shift_cb(struct interval_node *n, + void *args) +{ + struct ldlm_kms_shift_args *arg = args; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_lock *tmplock; + struct ldlm_lock *lock = NULL; + + ENTRY; + + /* Since all locks in an interval have the same extent, we can just + * use the first lock without kms_ignore set. */ + list_for_each_entry(tmplock, &node->li_group, l_sl_policy) { + if (ldlm_is_kms_ignore(tmplock)) + continue; + + lock = tmplock; + + break; + } + + /* No locks in this interval without kms_ignore set */ + if (!lock) + RETURN(INTERVAL_ITER_CONT); + + /* If we find a lock with a greater or equal kms, we are not the + * highest lock (or we share that distinction with another lock), and + * don't need to update KMS. Return old_kms and stop looking. */ + if (lock->l_policy_data.l_extent.end >= arg->old_kms) { + arg->kms = arg->old_kms; + arg->complete = true; + RETURN(INTERVAL_ITER_STOP); + } + + if (lock->l_policy_data.l_extent.end + 1 > arg->kms) + arg->kms = lock->l_policy_data.l_extent.end + 1; + + /* Since interval_iterate_reverse starts with the highest lock and + * works down, for PW locks, we only need to check if we should update + * the kms, then stop walking the tree. PR locks are not exclusive, so + * the highest start does not imply the highest end and we must + * continue. (Only one group lock is allowed per resource, so this is + * irrelevant for group locks.)*/ + if (lock->l_granted_mode == LCK_PW) + RETURN(INTERVAL_ITER_STOP); + else + RETURN(INTERVAL_ITER_CONT); +} + +/* When a lock is cancelled by a client, the KMS may undergo change if this + * is the "highest lock". This function returns the new KMS value, updating + * it only if we were the highest lock. + * + * Caller must hold lr_lock already. + * + * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! 
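+ *
+ * For example (illustrative): if the cancelled lock was the highest one and
+ * the next-highest remaining granted lock ends at byte 4095, the new KMS is
+ * 4096; with no remaining locks it drops to 0.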
*/ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval_tree *tree; + struct ldlm_kms_shift_args args; + int idx = 0; + + ENTRY; + + args.old_kms = old_kms; + args.kms = 0; + args.complete = false; + + /* don't let another thread in ldlm_extent_shift_kms race in + * just after we finish and take our lock into account in its + * calculation of the kms */ + ldlm_set_kms_ignore(lock); + + /* We iterate over the lock trees, looking for the largest kms smaller + * than the current one. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + + /* If our already known kms is >= than the highest 'end' in + * this tree, we don't need to check this tree, because + * the kms from a tree can be lower than in_max_high (due to + * kms_ignore), but it can never be higher. */ + if (!tree->lit_root || args.kms >= tree->lit_root->in_max_high) + continue; + + interval_iterate_reverse(tree->lit_root, ldlm_kms_shift_cb, + &args); + + /* this tells us we're not the highest lock, so we don't need + * to check the remaining trees */ + if (args.complete) + break; + } + + LASSERTF(args.kms <= args.old_kms, "kms %llu old_kms %llu\n", args.kms, + args.old_kms); + + RETURN(args.kms); +} +EXPORT_SYMBOL(ldlm_extent_shift_kms); + +struct kmem_cache *ldlm_interval_slab; +static struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +{ + struct ldlm_interval *node; + ENTRY; + + LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) + RETURN(NULL); + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + RETURN(node); +} + +void ldlm_interval_free(struct ldlm_interval *node) +{ + if (node) { + LASSERT(list_empty(&node->li_group)); + LASSERT(!interval_is_intree(&node->li_node)); + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + } +} + +/* interval tree, for LDLM_EXTENT. */ +void ldlm_interval_attach(struct ldlm_interval *n, + struct ldlm_lock *l) +{ + LASSERT(l->l_tree_node == NULL); + LASSERT(l->l_resource->lr_type == LDLM_EXTENT); + + list_add_tail(&l->l_sl_policy, &n->li_group); + l->l_tree_node = n; +} + +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) +{ + struct ldlm_interval *n = l->l_tree_node; + + if (n == NULL) + return NULL; + + LASSERT(!list_empty(&n->li_group)); + l->l_tree_node = NULL; + list_del_init(&l->l_sl_policy); + + return list_empty(&n->li_group) ? n : NULL; +} + +static inline int ldlm_mode_to_index(enum ldlm_mode mode) +{ + int index; + + LASSERT(mode != 0); + LASSERT(is_power_of_2(mode)); + for (index = -1; mode != 0; index++, mode >>= 1) + /* do nothing */; + LASSERT(index < LCK_MODE_NUM); + return index; +} + +int ldlm_extent_alloc_lock(struct ldlm_lock *lock) +{ + lock->l_tree_node = NULL; + if (ldlm_interval_alloc(lock) == NULL) + return -ENOMEM; + return 0; +} + +/** Add newly granted lock into interval tree for the resource. 
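+ *
+ * Locks covering the same extent share a single interval node: if
+ * interval_insert() finds an existing node for this extent, the lock's own
+ * node is freed and the lock is attached to the found node's policy group.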
*/ +void ldlm_extent_add_lock(struct ldlm_resource *res, + struct ldlm_lock *lock) +{ + struct interval_node *found, **root; + struct ldlm_interval *node; + struct ldlm_extent *extent; + int idx, rc; + + LASSERT(ldlm_is_granted(lock)); + + node = lock->l_tree_node; + LASSERT(node != NULL); + LASSERT(!interval_is_intree(&node->li_node)); + + idx = ldlm_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); + + /* node extent initialize */ + extent = &lock->l_policy_data.l_extent; + + rc = interval_set(&node->li_node, extent->start, extent->end); + LASSERT(!rc); + + root = &res->lr_itree[idx].lit_root; + found = interval_insert(&node->li_node, root); + if (found) { /* The policy group found. */ + struct ldlm_interval *tmp = ldlm_interval_detach(lock); + LASSERT(tmp != NULL); + ldlm_interval_free(tmp); + ldlm_interval_attach(to_ldlm_interval(found), lock); + } + res->lr_itree[idx].lit_size++; + + /* even though we use interval tree to manage the extent lock, we also + * add the locks into grant list, for debug purpose, .. */ + ldlm_resource_add_lock(res, &res->lr_granted, lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GRANT_CHECK)) { + struct ldlm_lock *lck; + + list_for_each_entry_reverse(lck, &res->lr_granted, + l_res_link) { + if (lck == lock) + continue; + if (lockmode_compat(lck->l_granted_mode, + lock->l_granted_mode)) + continue; + if (ldlm_extent_overlap(&lck->l_req_extent, + &lock->l_req_extent)) { + CDEBUG(D_ERROR, "granting conflicting lock %p " + "%p\n", lck, lock); + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + } + } +} + +/** Remove cancelled lock from resource interval tree. */ +void ldlm_extent_unlink_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval *node = lock->l_tree_node; + struct ldlm_interval_tree *tree; + int idx; + + if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ + return; + + idx = ldlm_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + tree = &res->lr_itree[idx]; + + LASSERT(tree->lit_root != NULL); /* assure the tree is not null */ + + tree->lit_size--; + node = ldlm_interval_detach(lock); + if (node) { + interval_erase(&node->li_node, &tree->lit_root); + ldlm_interval_free(node); + } +} + +void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_extent.start = wpolicy->l_extent.start; + lpolicy->l_extent.end = wpolicy->l_extent.end; + lpolicy->l_extent.gid = wpolicy->l_extent.gid; +} + +void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_extent.start = lpolicy->l_extent.start; + wpolicy->l_extent.end = lpolicy->l_extent.end; + wpolicy->l_extent.gid = lpolicy->l_extent.gid; +} + diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c new file mode 100644 index 0000000000000..be849938cc6c6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c @@ -0,0 +1,951 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003 Hewlett-Packard Development Company LP. + * Developed under the sponsorship of the US Government under + * Subcontract No. B514193 + * + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file implements POSIX lock type for Lustre. + * Its policy properties are start and end of extent and PID. + * + * These locks are only done through MDS due to POSIX semantics requiring + * e.g. that locks could be only partially released and as such split into + * two parts, and also that two adjacent locks from the same process may be + * merged into a single wider lock. + * + * Lock modes are mapped like this: + * PR and PW for READ and WRITE locks + * NL to request a releasing of a portion of the lock + * + * These flock locks never timeout. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); + +/** + * list_for_remaining_safe - iterate over the remaining entries in a list + * and safeguard against removal of a list entry. + * \param pos the &struct list_head to use as a loop counter. pos MUST + * have been initialized prior to using it in this macro. + * \param n another &struct list_head to use as temporary storage + * \param head the head for your list. 
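+ *
+ * Illustrative usage (mirrors ldlm_process_flock_lock() below), resuming a
+ * walk from a saved cursor while entries may be removed:
+ *
+ *	list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) {
+ *		lock = list_entry(ownlocks, struct ldlm_lock, l_res_link);
+ *		...
+ *	}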
+ */ +#define list_for_remaining_safe(pos, n, head) \ + for (n = pos->next; pos != (head); pos = n, n = pos->next) + +static inline int +ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.owner == + lock->l_policy_data.l_flock.owner) && + (new->l_export == lock->l_export)); +} + +static inline int +ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.end) && + (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.start)); +} + +static inline void ldlm_flock_blocking_link(struct ldlm_lock *req, + struct ldlm_lock *lock) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + LASSERT(hlist_unhashed(&req->l_exp_flock_hash)); + + req->l_policy_data.l_flock.blocking_owner = + lock->l_policy_data.l_flock.owner; + req->l_policy_data.l_flock.blocking_export = + lock->l_export; + atomic_set(&req->l_policy_data.l_flock.blocking_refs, 0); + + cfs_hash_add(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + check_res_locked(req->l_resource); + if (req->l_export->exp_flock_hash != NULL && + !hlist_unhashed(&req->l_exp_flock_hash)) + cfs_hash_del(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void +ldlm_flock_destroy(struct ldlm_lock *lock, enum ldlm_mode mode, __u64 flags) +{ + ENTRY; + + LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: %#llx)", + mode, flags); + + /* Safe to not lock here, since it should be empty anyway */ + LASSERT(hlist_unhashed(&lock->l_exp_flock_hash)); + + list_del_init(&lock->l_res_link); + if (flags == LDLM_FL_WAIT_NOREPROC) { + /* client side - set a flag to prevent sending a CANCEL */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; + + /* when reaching here, it is under lock_res_and_lock(). Thus, + need call the nolock version of ldlm_lock_decref_internal*/ + ldlm_lock_decref_internal_nolock(lock, mode); + } + + ldlm_lock_destroy_nolock(lock); + EXIT; +} + +/** + * POSIX locks deadlock detection code. + * + * Given a new lock \a req and an existing lock \a bl_lock it conflicts + * with, we need to iterate through all blocked POSIX locks for this + * export and see if there is a deadlock condition arising. (i.e. when + * one client holds a lock on something and want a lock on something + * else and at the same time another client has the opposite situation). + */ + +struct ldlm_flock_lookup_cb_data { + __u64 *bl_owner; + struct ldlm_lock *lock; + struct obd_export *exp; +}; + +static int ldlm_flock_lookup_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct ldlm_flock_lookup_cb_data *cb_data = data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct ldlm_lock *lock; + + lock = cfs_hash_lookup(exp->exp_flock_hash, cb_data->bl_owner); + if (lock == NULL) + return 0; + + /* Stop on first found lock. 
Same process can't sleep twice */ + cb_data->lock = lock; + cb_data->exp = class_export_get(exp); + + return 1; +} + +static int +ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock) +{ + struct obd_export *req_exp = req->l_export; + struct obd_export *bl_exp = bl_lock->l_export; + __u64 req_owner = req->l_policy_data.l_flock.owner; + __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner; + + /* For server only */ + if (req_exp == NULL) + return 0; + + class_export_get(bl_exp); + while (1) { + struct ldlm_flock_lookup_cb_data cb_data = { + .bl_owner = &bl_owner, + .lock = NULL, + .exp = NULL }; + struct obd_export *bl_exp_new; + struct ldlm_lock *lock = NULL; + struct ldlm_flock *flock; + + if (bl_exp->exp_flock_hash != NULL) { + cfs_hash_for_each_key(bl_exp->exp_obd->obd_nid_hash, + &bl_exp->exp_connection->c_peer.nid, + ldlm_flock_lookup_cb, &cb_data); + lock = cb_data.lock; + } + if (lock == NULL) + break; + + class_export_put(bl_exp); + bl_exp = cb_data.exp; + + LASSERT(req != lock); + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->owner == bl_owner); + bl_owner = flock->blocking_owner; + bl_exp_new = class_export_get(flock->blocking_export); + class_export_put(bl_exp); + + cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash); + bl_exp = bl_exp_new; + + if (bl_exp->exp_failed) + break; + + if (bl_owner == req_owner && + (bl_exp->exp_connection->c_peer.nid == + req_exp->exp_connection->c_peer.nid)) { + class_export_put(bl_exp); + return 1; + } + } + class_export_put(bl_exp); + + return 0; +} + +static void ldlm_flock_cancel_on_deadlock(struct ldlm_lock *lock, + struct list_head *work_list) +{ + CDEBUG(D_INFO, "reprocess deadlock req=%p\n", lock); + + if ((exp_connect_flags(lock->l_export) & + OBD_CONNECT_FLOCK_DEAD) == 0) { + CERROR("deadlock found, but client doesn't " + "support flock canceliation\n"); + } else { + LASSERT(lock->l_completion_ast); + LASSERT(!ldlm_is_ast_sent(lock)); + lock->l_flags |= LDLM_FL_AST_SENT | LDLM_FL_CANCEL_ON_BLOCK | + LDLM_FL_FLOCK_DEADLOCK; + ldlm_flock_blocking_unlink(lock); + ldlm_resource_unlink_lock(lock); + ldlm_add_ast_work_item(lock, NULL, work_list); + } +} + +/** + * Process a granting attempt for flock lock. + * Must be called under ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int +ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = req->l_resource; + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct list_head *tmp; + struct list_head *ownlocks = NULL; + struct ldlm_lock *lock = NULL; + struct ldlm_lock *new = req; + struct ldlm_lock *new2 = NULL; + enum ldlm_mode mode = req->l_req_mode; + int local = ns_is_client(ns); + int added = (mode == LCK_NL); + int overlaps = 0; + int splitted = 0; + const struct ldlm_callback_suite null_cbs = { NULL }; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? 
+ NULL : work_list; + ENTRY; + + CDEBUG(D_DLMTRACE, "flags %#llx owner %llu pid %u mode %u start " + "%llu end %llu\n", *flags, + new->l_policy_data.l_flock.owner, + new->l_policy_data.l_flock.pid, mode, + req->l_policy_data.l_flock.start, + req->l_policy_data.l_flock.end); + + *err = ELDLM_OK; + + if (local) { + /* No blocking ASTs are sent to the clients for + * Posix file & record locks */ + req->l_blocking_ast = NULL; + } else { + /* Called on the server for lock cancels. */ + req->l_blocking_ast = ldlm_flock_blocking_ast; + } + +reprocess: + if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) { + /* This loop determines where this processes locks start + * in the resource lr_granted list. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_same_flock_owner(lock, req)) { + ownlocks = tmp; + break; + } + } + } else { + int reprocess_failed = 0; + lockmode_verify(mode); + + /* This loop determines if there are existing locks + * that conflict with the new lock request. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + + if (ldlm_same_flock_owner(lock, req)) { + if (!ownlocks) + ownlocks = tmp; + continue; + } + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + if (!ldlm_flocks_overlap(lock, req)) + continue; + + if (intention != LDLM_PROCESS_ENQUEUE) { + reprocess_failed = 1; + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_cancel_on_deadlock(req, + grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + continue; + } + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + ldlm_flock_destroy(req, mode, *flags); + *err = -EAGAIN; + RETURN(LDLM_ITER_STOP); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = lock->l_granted_mode; + req->l_policy_data.l_flock.pid = + lock->l_policy_data.l_flock.pid; + req->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + req->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* add lock to blocking list before deadlock + * check to prevent race */ + ldlm_flock_blocking_link(req, lock); + + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_blocking_unlink(req); + ldlm_flock_destroy(req, mode, *flags); + *err = -EDEADLK; + RETURN(LDLM_ITER_STOP); + } + + ldlm_resource_add_lock(res, &res->lr_waiting, req); + *flags |= LDLM_FL_BLOCK_GRANTED; + RETURN(LDLM_ITER_STOP); + } + if (reprocess_failed) + RETURN(LDLM_ITER_CONTINUE); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = LCK_NL; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* In case we had slept on this lock request take it off of the + * deadlock detection hash list. */ + ldlm_flock_blocking_unlink(req); + + /* Scan the locks owned by this process that overlap this request. + * We may have to merge or split existing locks. */ + + if (!ownlocks) + ownlocks = &res->lr_granted; + + list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) { + lock = list_entry(ownlocks, struct ldlm_lock, l_res_link); + + if (!ldlm_same_flock_owner(lock, new)) + break; + + if (lock->l_granted_mode == mode) { + /* If the modes are the same then we need to process + * locks that overlap OR adjoin the new lock. The extra + * logic condition is necessary to deal with arithmetic + * overflow and underflow. 
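+			 * (end + 1 would wrap around for a lock ending at
+			 * OBD_OBJECT_EOF, and start - 1 would wrap for a lock
+			 * starting at 0, hence the extra checks below.)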
*/ + if ((new->l_policy_data.l_flock.start > + (lock->l_policy_data.l_flock.end + 1)) + && (lock->l_policy_data.l_flock.end != + OBD_OBJECT_EOF)) + continue; + + if ((new->l_policy_data.l_flock.end < + (lock->l_policy_data.l_flock.start - 1)) + && (lock->l_policy_data.l_flock.start != 0)) + break; + + if (new->l_policy_data.l_flock.start < + lock->l_policy_data.l_flock.start) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.start; + } else { + new->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + } + + if (new->l_policy_data.l_flock.end > + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.end; + } else { + new->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + } + + if (added) { + ldlm_flock_destroy(lock, mode, *flags); + } else { + new = lock; + added = 1; + } + continue; + } + + if (new->l_policy_data.l_flock.start > + lock->l_policy_data.l_flock.end) + continue; + + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.start) + break; + + ++overlaps; + + if (new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.start) { + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + break; + } + ldlm_flock_destroy(lock, lock->l_req_mode, *flags); + continue; + } + if (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + continue; + } + + /* split the existing lock into two locks */ + + /* if this is an F_UNLCK operation then we could avoid + * allocating a new lock and use the req lock passed in + * with the request but this would complicate the reply + * processing since updates to req get reflected in the + * reply. The client side replays the lock request so + * it must see the original lock data in the reply. */ + + /* XXX - if ldlm_lock_new() can sleep we should + * release the lr_lock, allocate the new lock, + * and restart processing this lock. 
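+		 * (That is what the block below does: it drops the resource
+		 * lock, calls ldlm_lock_create(), retakes the lock and jumps
+		 * back to the reprocess label.)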
*/ + if (new2 == NULL) { + unlock_res_and_lock(req); + new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK, + lock->l_granted_mode, &null_cbs, + NULL, 0, LVB_T_NONE); + lock_res_and_lock(req); + if (IS_ERR(new2)) { + ldlm_flock_destroy(req, lock->l_granted_mode, + *flags); + *err = PTR_ERR(new2); + RETURN(LDLM_ITER_STOP); + } + goto reprocess; + } + + splitted = 1; + + new2->l_granted_mode = lock->l_granted_mode; + new2->l_policy_data.l_flock.pid = + new->l_policy_data.l_flock.pid; + new2->l_policy_data.l_flock.owner = + new->l_policy_data.l_flock.owner; + new2->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + new2->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + new2->l_conn_export = lock->l_conn_export; + if (lock->l_export != NULL) { + new2->l_export = class_export_lock_get(lock->l_export, new2); + if (new2->l_export->exp_lock_hash && + hlist_unhashed(&new2->l_exp_hash)) + cfs_hash_add(new2->l_export->exp_lock_hash, + &new2->l_remote_handle, + &new2->l_exp_hash); + } + if (*flags == LDLM_FL_WAIT_NOREPROC) + ldlm_lock_addref_internal_nolock(new2, + lock->l_granted_mode); + + /* insert new2 at lock */ + ldlm_resource_add_lock(res, ownlocks, new2); + LDLM_LOCK_RELEASE(new2); + break; + } + + /* if new2 is created but never used, destroy it*/ + if (splitted == 0 && new2 != NULL) + ldlm_lock_destroy_nolock(new2); + + /* At this point we're granting the lock request. */ + req->l_granted_mode = req->l_req_mode; + + /* Add req to the granted queue before calling ldlm_reprocess_all(). */ + if (!added) { + list_del_init(&req->l_res_link); + /* insert new lock before ownlocks in list. */ + ldlm_resource_add_lock(res, ownlocks, req); + } + + if (*flags != LDLM_FL_WAIT_NOREPROC) { +#ifdef HAVE_SERVER_SUPPORT + if (intention == LDLM_PROCESS_ENQUEUE) { + /* If this is an unlock, reprocess the waitq and + * send completions ASTs for locks that can now be + * granted. The only problem with doing this + * reprocessing here is that the completion ASTs for + * newly granted locks will be sent before the unlock + * completion is sent. It shouldn't be an issue. Also + * note that ldlm_process_flock_lock() will recurse, + * but only once because 'intention' won't be + * LDLM_PROCESS_ENQUEUE from ldlm_reprocess_queue. */ + if ((mode == LCK_NL) && overlaps) { + struct list_head rpc_list; + int rc; + + INIT_LIST_HEAD(&rpc_list); +restart: + ldlm_reprocess_queue(res, &res->lr_waiting, + &rpc_list, + LDLM_PROCESS_RESCAN, NULL); + + unlock_res_and_lock(req); + rc = ldlm_run_ast_work(ns, &rpc_list, + LDLM_WORK_CP_AST); + lock_res_and_lock(req); + if (rc == -ERESTART) + GOTO(restart, rc); + } + } else { + LASSERT(req->l_completion_ast); + ldlm_add_ast_work_item(req, NULL, grant_work); + } +#else /* !HAVE_SERVER_SUPPORT */ + /* The only one possible case for client-side calls flock + * policy function is ldlm_flock_completion_ast inside which + * carries LDLM_FL_WAIT_NOREPROC flag. */ + CERROR("Illegal parameter for client-side-only module.\n"); + LBUG(); +#endif /* HAVE_SERVER_SUPPORT */ + } + + /* In case we're reprocessing the requested lock we can't destroy + * it until after calling ldlm_add_ast_work_item() above so that laawi() + * can bump the reference count on \a req. Otherwise \a req + * could be freed before the completion AST can be sent. 
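+	 * (req is only destroyed below when 'added' is set, i.e. it was an
+	 * unlock request or was merged into an existing lock.)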
*/ + if (added) + ldlm_flock_destroy(req, mode, *flags); + + ldlm_resource_dump(D_INFO, res); + RETURN(LDLM_ITER_CONTINUE); +} + +struct ldlm_flock_wait_data { + struct ldlm_lock *fwd_lock; + int fwd_generation; +}; + +static void +ldlm_flock_interrupted_wait(void *data) +{ + struct ldlm_lock *lock; + ENTRY; + + lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock; + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + + /* client side - set flag to prevent lock from being put on LRU list */ + ldlm_set_cbpending(lock); + unlock_res_and_lock(lock); + + EXIT; +} + +/** + * Flock completion callback function. + * + * \param lock [in,out]: A lock to be handled + * \param flags [in]: flags + * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg + * + * \retval 0 : success + * \retval <0 : failure + */ +int +ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct file_lock *getlk = lock->l_ast_data; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct ldlm_flock_wait_data fwd; + struct l_wait_info lwi; + enum ldlm_error err; + int rc = 0; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT2, 4); + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT3)) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_FAIL_LOC; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT3, 4); + } + CDEBUG(D_DLMTRACE, "flags: %#llx data: %p getlk: %p\n", + flags, data, getlk); + + LASSERT(flags != LDLM_FL_WAIT_NOREPROC); + + if (flags & LDLM_FL_FAILED) + goto granted; + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + if (NULL == data) + /* mds granted the lock in the reply */ + goto granted; + /* CP AST RPC: lock get granted, wake it up */ + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "sleeping"); + fwd.fwd_lock = lock; + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, there is no import */ + if (NULL != obd) + imp = obd->u.cli.cl_import; + + if (NULL != imp) { + spin_lock(&imp->imp_lock); + fwd.fwd_generation = imp->imp_generation; + spin_unlock(&imp->imp_lock); + } + + lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd); + + /* Go to sleep until the lock is granted. */ + rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi); + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + +granted: + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT4)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT4, 4); + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT5)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= LDLM_FL_FAIL_LOC | + LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT5, 4); + } + + lock_res_and_lock(lock); + + + /* Protect against race where lock could have been just destroyed + * due to overlap in ldlm_process_flock_lock(). + */ + if (ldlm_is_destroyed(lock)) { + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); + + /* An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. 
*/ + RETURN(-EIO); + } + + /* ldlm_lock_enqueue() has already placed lock on the granted list. */ + ldlm_resource_unlink_lock(lock); + + /* Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. */ + /* Do the same for DEADLOCK'ed locks. */ + if (ldlm_is_failed(lock) || ldlm_is_flock_deadlock(lock)) { + int mode; + + if (flags & LDLM_FL_TEST_LOCK) + LASSERT(ldlm_is_test_lock(lock)); + + if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) + mode = getlk->fl_type; + else + mode = lock->l_granted_mode; + + if (ldlm_is_flock_deadlock(lock)) { + LDLM_DEBUG(lock, "client-side enqueue deadlock " + "received"); + rc = -EDEADLK; + } + ldlm_flock_destroy(lock, mode, LDLM_FL_WAIT_NOREPROC); + unlock_res_and_lock(lock); + + /* Need to wake up the waiter if we were evicted */ + wake_up(&lock->l_waitq); + + /* An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. */ + RETURN(rc ? : -EIO); + } + + LDLM_DEBUG(lock, "client-side enqueue granted"); + + if (flags & LDLM_FL_TEST_LOCK) { + /* + * fcntl(F_GETLK) request + * The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount. + */ + LASSERT(ldlm_is_test_lock(lock)); + ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); + switch (lock->l_granted_mode) { + case LCK_PR: + getlk->fl_type = F_RDLCK; + break; + case LCK_PW: + getlk->fl_type = F_WRLCK; + break; + default: + getlk->fl_type = F_UNLCK; + } + getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; + getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; + getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; + } else { + __u64 noreproc = LDLM_FL_WAIT_NOREPROC; + + /* We need to reprocess the lock to do merges or splits + * with existing locks owned by this process. */ + ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL); + } + unlock_res_and_lock(lock); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_flock_completion_ast); + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + LASSERT(lock); + LASSERT(flag == LDLM_CB_CANCELING); + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + unlock_res_and_lock(lock); + RETURN(0); +} + +void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner; +} + +void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_flock.lfw_start = lpolicy->l_flock.start; + wpolicy->l_flock.lfw_end = lpolicy->l_flock.end; + wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid; + wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner; +} + +/* + * Export handle<->flock hash operations. 
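+ *
+ * The hash is keyed by the flock owner, so deadlock detection can look up
+ * the blocked lock (if any) that a given owner holds on this export.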
+ */ +static unsigned +ldlm_export_flock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(*(__u64 *)key, mask); +} + +static void * +ldlm_export_flock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + return &lock->l_policy_data.l_flock.owner; +} + +static int +ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode) +{ + return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64)); +} + +static void * +ldlm_export_flock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); +} + +static void +ldlm_export_flock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_GET(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_get(flock->blocking_export); + atomic_inc(&flock->blocking_refs); +} + +static void +ldlm_export_flock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_put(flock->blocking_export); + if (atomic_dec_and_test(&flock->blocking_refs)) { + flock->blocking_owner = 0; + flock->blocking_export = NULL; + } + LDLM_LOCK_RELEASE(lock); +} + +static struct cfs_hash_ops ldlm_export_flock_ops = { + .hs_hash = ldlm_export_flock_hash, + .hs_key = ldlm_export_flock_key, + .hs_keycmp = ldlm_export_flock_keycmp, + .hs_object = ldlm_export_flock_object, + .hs_get = ldlm_export_flock_get, + .hs_put = ldlm_export_flock_put, + .hs_put_locked = ldlm_export_flock_put, +}; + +int ldlm_init_flock_export(struct obd_export *exp) +{ + if( strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0) + RETURN(0); + + exp->exp_flock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_flock_ops, + CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE); + if (!exp->exp_flock_hash) + RETURN(-ENOMEM); + + RETURN(0); +} + +void ldlm_destroy_flock_export(struct obd_export *exp) +{ + ENTRY; + if (exp->exp_flock_hash) { + cfs_hash_putref(exp->exp_flock_hash); + exp->exp_flock_hash = NULL; + } + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c new file mode 100644 index 0000000000000..c407cf676fba8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c @@ -0,0 +1,577 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_inodebits.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of IBITS lock type + * + * IBITS lock type contains a bit mask determining various properties of an + * object. The meanings of specific bits are specific to the caller and are + * opaque to LDLM code. + * + * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW) + * are considered conflicting. See the lock mode compatibility matrix + * in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT + +/** + * It should iterate through all waiting locks on a given resource queue and + * attempt to grant them. An optimization is to check only heads waitintg + * locks for each inodebit type. + * + * Must be called with resource lock held. + */ +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint) +{ + __u64 flags; + int rc = LDLM_ITER_CONTINUE; + enum ldlm_error err; + struct list_head bl_ast_list = LIST_HEAD_INIT(bl_ast_list); + struct ldlm_ibits_queues *queues = res->lr_ibits_queues; + int i; + + ENTRY; + + check_res_locked(res); + + LASSERT(res->lr_type == LDLM_IBITS); + LASSERT(intention == LDLM_PROCESS_RESCAN || + intention == LDLM_PROCESS_RECOVERY); + + if (intention == LDLM_PROCESS_RECOVERY) + return ldlm_reprocess_queue(res, queue, work_list, intention, + NULL); + +restart: + CDEBUG(D_DLMTRACE, "--- Reprocess resource "DLDLMRES" (%p)\n", + PLDLMRES(res), res); + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + struct list_head *head = &queues->liq_waiting[i]; + struct ldlm_lock *pending; + struct ldlm_ibits_node *node; + + if (list_empty(head)) + continue; + if (hint && !(hint->l_policy_data.l_inodebits.bits & (1 << i))) + continue; + + node = list_entry(head->next, struct ldlm_ibits_node, + lin_link[i]); + + pending = node->lock; + LDLM_DEBUG(pending, "Reprocessing lock from queue %d", i); + + flags = 0; + rc = ldlm_process_inodebits_lock(pending, &flags, intention, + &err, &rpc_list); + if (ldlm_is_granted(pending)) { + list_splice(&rpc_list, work_list); + /* Try to grant more locks from current queue */ + i--; + } else { + list_splice(&rpc_list, &bl_ast_list); + } + } + + if (!list_empty(&bl_ast_list)) { + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, + LDLM_WORK_BL_AST); + + lock_res(res); + if (rc == -ERESTART) + GOTO(restart, rc); + } + + if (!list_empty(&bl_ast_list)) + ldlm_discard_bl_list(&bl_ast_list); + + RETURN(rc); +} + +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. 
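+ *
+ * As a side effect, \a req's try_bits are filtered against the bits and
+ * try_bits of every lock walked here, and conflicting try_bits of waiting
+ * locks are dropped in favour of \a req's bits.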
+ * + * \retval 0 if there are conflicting locks in the \a queue + * \retval 1 if the lock is compatible to all locks in \a queue + * + * IBITS locks in granted queue are organized in bunches of + * same-mode/same-bits locks called "skip lists". The First lock in the + * bunch contains a pointer to the end of the bunch. This allows us to + * skip an entire bunch when iterating the list in search for conflicting + * locks if first lock of the bunch is not conflicting with us. + */ +static int +ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, + struct list_head *work_list) +{ + struct list_head *tmp; + struct ldlm_lock *lock; + __u64 req_bits = req->l_policy_data.l_inodebits.bits; + __u64 *try_bits = &req->l_policy_data.l_inodebits.try_bits; + int compat = 1; + + ENTRY; + + /* There is no sense in lock with no bits set. Also such a lock + * would be compatible with any other bit lock. + * Meanwhile that can be true if there were just try_bits and all + * are failed, so just exit gracefully and let the caller to care. + */ + if ((req_bits | *try_bits) == 0) + RETURN(0); + + list_for_each(tmp, queue) { + struct list_head *mode_tail; + + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + /* We stop walking the queue if we hit ourselves so we don't + * take conflicting locks enqueued after us into account, + * or we'd wait forever. */ + if (req == lock) + RETURN(compat); + + /* last lock in mode group */ + LASSERT(lock->l_sl_mode.prev != NULL); + mode_tail = &list_entry(lock->l_sl_mode.prev, struct ldlm_lock, + l_sl_mode)->l_res_link; + + /* if request lock is not COS_INCOMPAT and COS is disabled, + * they are compatible, IOW this request is from a local + * transaction on a DNE system. */ + if (lock->l_req_mode == LCK_COS && !ldlm_is_cos_incompat(req) && + !ldlm_is_cos_enabled(req)) { + /* jump to last lock in mode group */ + tmp = mode_tail; + continue; + } + + /* locks' mode are compatible, bits don't matter */ + if (lockmode_compat(lock->l_req_mode, req->l_req_mode)) { + /* jump to last lock in mode group */ + tmp = mode_tail; + continue; + } + + for (;;) { + struct list_head *head; + + /* Advance loop cursor to last lock in policy group. */ + tmp = &list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy)->l_res_link; + + /* New lock's try_bits are filtered out by ibits + * of all locks in both granted and waiting queues. + */ + *try_bits &= ~(lock->l_policy_data.l_inodebits.bits | + lock->l_policy_data.l_inodebits.try_bits); + + if ((req_bits | *try_bits) == 0) + RETURN(0); + + /* The new lock ibits is more preferable than try_bits + * of waiting locks so drop conflicting try_bits in + * the waiting queue. + * Notice that try_bits of granted locks must be zero. + */ + lock->l_policy_data.l_inodebits.try_bits &= ~req_bits; + + /* Locks with overlapping bits conflict. */ + if (lock->l_policy_data.l_inodebits.bits & req_bits) { + /* COS lock mode has a special compatibility + * requirement: it is only compatible with + * locks from the same client. */ + if (lock->l_req_mode == LCK_COS && + !ldlm_is_cos_incompat(req) && + ldlm_is_cos_enabled(req) && + lock->l_client_cookie == req->l_client_cookie) + goto not_conflicting; + + /* Found a conflicting policy group. 
*/ + if (!work_list) + RETURN(0); + + compat = 0; + + /* Add locks of the policy group to @work_list + * as blocking locks for @req */ + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + head = &lock->l_sl_policy; + list_for_each_entry(lock, head, l_sl_policy) + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, + req, work_list); + } +not_conflicting: + if (tmp == mode_tail) + break; + + tmp = tmp->next; + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + } /* Loop over policy groups within one mode group. */ + } /* Loop over mode groups within @queue. */ + + RETURN(compat); +} + +/** + * Process a granting attempt for IBITS lock. + * Must be called with ns lock held + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; + int rc; + + ENTRY; + + LASSERT(!ldlm_is_granted(lock)); + check_res_locked(res); + + if (intention == LDLM_PROCESS_RESCAN) { + struct list_head *bl_list; + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + bl_list = NULL; + *err = ELDLM_LOCK_WOULDBLOCK; + } else { + bl_list = work_list; + *err = ELDLM_LOCK_ABORTED; + } + + LASSERT(lock->l_policy_data.l_inodebits.bits != 0); + + /* It is possible that some of granted locks was not canceled + * but converted and is kept in granted queue. So there is + * a window where lock with 'ast_sent' might become granted + * again. Meanwhile a new lock may appear in that window and + * conflicts with the converted lock so the following scenario + * is possible: + * + * 1) lock1 conflicts with lock2 + * 2) bl_ast was sent for lock2 + * 3) lock3 comes and conflicts with lock2 too + * 4) no bl_ast sent because lock2->l_bl_ast_sent is 1 + * 5) lock2 was converted for lock1 but not for lock3 + * 6) lock1 granted, lock3 still is waiting for lock2, but + * there will never be another bl_ast for that + * + * To avoid this scenario the work_list is used below to collect + * any blocked locks from granted queue during every reprocess + * and bl_ast will be sent if needed. 
+ */ + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, + bl_list); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + /* grant also try_bits if any */ + if (lock->l_policy_data.l_inodebits.try_bits != 0) { + lock->l_policy_data.l_inodebits.bits |= + lock->l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.try_bits = 0; + *flags |= LDLM_FL_LOCK_CHANGED; + } + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + + *err = ELDLM_OK; + RETURN(LDLM_ITER_CONTINUE); + } + + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, work_list); + rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, work_list); + + if (rc != 2) { + /* if there were only bits to try and all are conflicting */ + if ((lock->l_policy_data.l_inodebits.bits | + lock->l_policy_data.l_inodebits.try_bits) == 0) { + *err = ELDLM_LOCK_WOULDBLOCK; + } else { + *err = ELDLM_OK; + } + } else { + /* grant also all remaining try_bits */ + if (lock->l_policy_data.l_inodebits.try_bits != 0) { + lock->l_policy_data.l_inodebits.bits |= + lock->l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.try_bits = 0; + *flags |= LDLM_FL_LOCK_CHANGED; + } + LASSERT(lock->l_policy_data.l_inodebits.bits); + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + *err = ELDLM_OK; + } + + RETURN(LDLM_ITER_CONTINUE); +} +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; + /** + * try_bits are to be handled outside of generic write_to_local due + * to different behavior on a server and client. + */ +} + +void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; + wpolicy->l_inodebits.try_bits = lpolicy->l_inodebits.try_bits; +} + +/** + * Attempt to convert already granted IBITS lock with several bits set to + * a lock with less bits (downgrade). + * + * Such lock conversion is used to keep lock with non-blocking bits instead of + * cancelling it, introduced for better support of DoM files. 
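+ *
+ * The caller must already hold the resource lock; the lock is re-inserted
+ * into the granted skip list according to its remaining bits.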
+ */ +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop) +{ + ENTRY; + + check_res_locked(lock->l_resource); + + /* Just return if there are no conflicting bits */ + if ((lock->l_policy_data.l_inodebits.bits & to_drop) == 0) { + LDLM_WARN(lock, "try to drop unset bits %#llx/%#llx", + lock->l_policy_data.l_inodebits.bits, to_drop); + /* nothing to do */ + RETURN(0); + } + + /* remove lock from a skiplist and put in the new place + * according with new inodebits */ + ldlm_resource_unlink_lock(lock); + lock->l_policy_data.l_inodebits.bits &= ~to_drop; + ldlm_grant_lock_with_skiplist(lock); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_inodebits_drop); + +/* convert single lock */ +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + struct ldlm_lock_desc ld = { { 0 } }; + __u64 drop_bits, new_bits; + __u32 flags = 0; + int rc; + + ENTRY; + + check_res_locked(lock->l_resource); + + /* Lock is being converted already */ + if (ldlm_is_converting(lock)) { + if (!(cancel_flags & LCF_ASYNC)) { + struct l_wait_info lwi = { 0 }; + + unlock_res_and_lock(lock); + l_wait_event(lock->l_waitq, + is_lock_converted(lock), &lwi); + lock_res_and_lock(lock); + } + RETURN(0); + } + + /* lru_cancel may happen in parallel and call ldlm_cli_cancel_list() + * independently. + */ + if (ldlm_is_canceling(lock)) + RETURN(-EINVAL); + + /* no need in only local convert */ + if (lock->l_flags & (LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK)) + RETURN(-EINVAL); + + drop_bits = lock->l_policy_data.l_inodebits.cancel_bits; + /* no cancel bits - means that caller needs full cancel */ + if (drop_bits == 0) + RETURN(-EINVAL); + + new_bits = lock->l_policy_data.l_inodebits.bits & ~drop_bits; + /* check if all lock bits are dropped, proceed with cancel */ + if (!new_bits) + RETURN(-EINVAL); + + /* check if no dropped bits, consider this as successful convert */ + if (lock->l_policy_data.l_inodebits.bits == new_bits) + RETURN(0); + + ldlm_set_converting(lock); + /* Finally call cancel callback for remaining bits only. + * It is important to have converting flag during that + * so blocking_ast callback can distinguish convert from + * cancels. + */ + ld.l_policy_data.l_inodebits.cancel_bits = drop_bits; + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, &ld, lock->l_ast_data, LDLM_CB_CANCELING); + /* now notify server about convert */ + rc = ldlm_cli_convert_req(lock, &flags, new_bits); + lock_res_and_lock(lock); + if (rc) + GOTO(full_cancel, rc); + + /* Finally clear these bits in lock ibits */ + ldlm_inodebits_drop(lock, drop_bits); + + /* Being locked again check if lock was canceled, it is important + * to do and don't drop cbpending below + */ + if (ldlm_is_canceling(lock)) + GOTO(full_cancel, rc = -EINVAL); + + /* also check again if more bits to be cancelled appeared */ + if (drop_bits != lock->l_policy_data.l_inodebits.cancel_bits) + GOTO(clear_converting, rc = -EAGAIN); + + /* clear cbpending flag early, it is safe to match lock right after + * client convert because it is downgrade always. + */ + ldlm_clear_cbpending(lock); + ldlm_clear_bl_ast(lock); + spin_lock(&ns->ns_lock); + if (list_empty(&lock->l_lru)) + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + + /* the job is done, zero the cancel_bits. If more conflicts appear, + * it will result in another cycle of ldlm_cli_inodebits_convert(). 
+ */ +full_cancel: + lock->l_policy_data.l_inodebits.cancel_bits = 0; +clear_converting: + ldlm_clear_converting(lock); + RETURN(rc); +} + +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock) +{ + if (ldlm_is_ns_srv(lock)) { + int i; + + OBD_SLAB_ALLOC_PTR(lock->l_ibits_node, ldlm_inodebits_slab); + if (lock->l_ibits_node == NULL) + return -ENOMEM; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&lock->l_ibits_node->lin_link[i]); + lock->l_ibits_node->lock = lock; + } else { + lock->l_ibits_node = NULL; + } + return 0; +} + +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + int i; + + if (!ldlm_is_ns_srv(lock)) + return; + + if (head == &res->lr_waiting) { + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + if (lock->l_policy_data.l_inodebits.bits & (1 << i)) + list_add_tail(&lock->l_ibits_node->lin_link[i], + &res->lr_ibits_queues->liq_waiting[i]); + } + } else if (head == &res->lr_granted && lock->l_ibits_node != NULL) { + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + LASSERT(list_empty(&lock->l_ibits_node->lin_link[i])); + OBD_SLAB_FREE_PTR(lock->l_ibits_node, ldlm_inodebits_slab); + lock->l_ibits_node = NULL; + } +} + +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock) +{ + int i; + + ldlm_unlink_lock_skiplist(lock); + if (!ldlm_is_ns_srv(lock)) + return; + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + list_del_init(&lock->l_ibits_node->lin_link[i]); +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h new file mode 100644 index 0000000000000..733773c50ed0c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h @@ -0,0 +1,438 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define MAX_STRING_SIZE 128 + +extern int ldlm_srv_namespace_nr; +extern int ldlm_cli_namespace_nr; +extern struct mutex ldlm_srv_namespace_lock; +extern struct list_head ldlm_srv_namespace_list; +extern struct mutex ldlm_cli_namespace_lock; +extern struct list_head ldlm_cli_active_namespace_list; +extern struct list_head ldlm_cli_inactive_namespace_list; +extern unsigned int ldlm_cancel_unused_locks_before_replay; +extern struct kmem_cache *ldlm_glimpse_work_kmem; + +static inline int ldlm_namespace_nr_read(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? 
+ ldlm_srv_namespace_nr : ldlm_cli_namespace_nr; +} + +static inline void ldlm_namespace_nr_inc(enum ldlm_side client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr++; + else + ldlm_cli_namespace_nr++; +} + +static inline void ldlm_namespace_nr_dec(enum ldlm_side client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr--; + else + ldlm_cli_namespace_nr--; +} + +static inline struct list_head *ldlm_namespace_list(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_active_namespace_list; +} + +static inline +struct list_head *ldlm_namespace_inactive_list(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_inactive_namespace_list; +} + +static inline struct mutex *ldlm_namespace_lock(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; +} + +/* ns_bref is the number of resources in this namespace */ +static inline int ldlm_ns_empty(struct ldlm_namespace *ns) +{ + return atomic_read(&ns->ns_bref) == 0; +} + +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *, + enum ldlm_side); +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *, + enum ldlm_side); +struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side); + +/* ldlm_request.c */ +/* Cancel lru flag, it indicates we cancel aged locks. */ +enum ldlm_lru_flags { + LDLM_LRU_FLAG_NO_WAIT = 0x1, /* Cancel locks w/o blocking (neither + * sending nor waiting for any RPCs) */ + LDLM_LRU_FLAG_CLEANUP = 0x2, /* Used when clearing lru, tells + * prepare_lru_list to set discard flag + * on PR extent locks so we don't waste + * time saving pages that will be + * discarded momentarily */ +}; + +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags); +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, + struct list_head *cancels, int min, int max, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags); +extern unsigned int ldlm_enqueue_min; +/* ldlm_resource.c */ +extern struct kmem_cache *ldlm_resource_slab; +extern struct kmem_cache *ldlm_lock_slab; +extern struct kmem_cache *ldlm_inodebits_slab; +extern struct kmem_cache *ldlm_interval_tree_slab; + +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new); + +/* ldlm_lock.c */ + +typedef enum { + LDLM_WORK_BL_AST, + LDLM_WORK_CP_AST, + LDLM_WORK_REVOKE_AST, + LDLM_WORK_GL_AST +} ldlm_desc_ast_t; + +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size); +struct ldlm_lock * +ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, + enum ldlm_type type, enum ldlm_mode mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, enum lvb_type lvb_type); +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, + void *cookie, __u64 *flags); +void ldlm_lock_addref_internal(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_decref_internal(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); +void 
ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list); +#ifdef HAVE_SERVER_SUPPORT +int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint); +int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, + struct list_head *rpc_list); +void ldlm_discard_bl_list(struct list_head *bl_list); +void ldlm_clear_blocking_lock(struct ldlm_lock *lock); +void ldlm_clear_blocking_data(struct ldlm_lock *lock); +#endif +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type); +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); +int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use); +#define ldlm_lock_remove_from_lru(lock) \ + ldlm_lock_remove_from_lru_check(lock, ktime_set(0, 0)) +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru(struct ldlm_lock *lock); +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock); +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); + +int ldlm_export_cancel_blocked_locks(struct obd_export *exp); +int ldlm_export_cancel_locks(struct obd_export *exp); +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); + +/* ldlm_lockd.c */ +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock); +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags); +int ldlm_bl_thread_wakeup(void); + +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock); +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock); + +#ifdef HAVE_SERVER_SUPPORT +/* ldlm_plain.c */ +int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); + +/* ldlm_inodebits.c */ +int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list); +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint); +/* ldlm_extent.c */ +int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); +#endif +int ldlm_extent_alloc_lock(struct ldlm_lock *lock); +void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); +void ldlm_extent_unlink_lock(struct ldlm_lock *lock); + +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock); +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock); +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock); + +/* ldlm_flock.c */ +int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); +int ldlm_init_flock_export(struct obd_export *exp); +void ldlm_destroy_flock_export(struct obd_export *exp); + +/* l_lock.c */ +void l_check_ns_lock(struct ldlm_namespace *ns); +void l_check_no_ns_lock(struct ldlm_namespace *ns); + +extern 
struct dentry *ldlm_svc_debugfs_dir; + +struct ldlm_state { + struct ptlrpc_service *ldlm_cb_service; + struct ptlrpc_service *ldlm_cancel_service; + struct ptlrpc_client *ldlm_client; + struct ptlrpc_connection *ldlm_server_conn; + struct ldlm_bl_pool *ldlm_bl_pool; +}; + +/* interval tree, for LDLM_EXTENT. */ +extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ +extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +extern void ldlm_interval_free(struct ldlm_interval *node); +/* this function must be called with res lock held */ +static inline struct ldlm_extent * +ldlm_interval_extent(struct ldlm_interval *node) +{ + struct ldlm_lock *lock; + LASSERT(!list_empty(&node->li_group)); + + lock = list_entry(node->li_group.next, struct ldlm_lock, + l_sl_policy); + return &lock->l_policy_data.l_extent; +} + +int ldlm_init(void); +void ldlm_exit(void); + +enum ldlm_policy_res { + LDLM_POLICY_CANCEL_LOCK, + LDLM_POLICY_KEEP_LOCK, + LDLM_POLICY_SKIP_LOCK +}; + +#define LDLM_POOL_SYSFS_PRINT_int(v) sprintf(buf, "%d\n", v) +#define LDLM_POOL_SYSFS_SET_int(a, b) { a = b; } +#define LDLM_POOL_SYSFS_PRINT_u64(v) sprintf(buf, "%lld\n", v) +#define LDLM_POOL_SYSFS_SET_u64(a, b) { a = b; } +#define LDLM_POOL_SYSFS_PRINT_atomic(v) sprintf(buf, "%d\n", atomic_read(&v)) +#define LDLM_POOL_SYSFS_SET_atomic(a, b) atomic_set(&a, b) + +#define LDLM_POOL_SYSFS_READER_SHOW(var, type) \ + static ssize_t var##_show(struct kobject *kobj, \ + struct attribute *attr, \ + char *buf) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + type tmp; \ + \ + spin_lock(&pl->pl_lock); \ + tmp = pl->pl_##var; \ + spin_unlock(&pl->pl_lock); \ + \ + return LDLM_POOL_SYSFS_PRINT_##type(tmp); \ + } \ + struct __##var##__dummy_read {;} /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_WRITER_STORE(var, type) \ + static ssize_t var##_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buffer, \ + size_t count) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + unsigned long tmp; \ + int rc; \ + \ + rc = kstrtoul(buffer, 10, &tmp); \ + if (rc < 0) { \ + return rc; \ + } \ + \ + spin_lock(&pl->pl_lock); \ + LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ + spin_unlock(&pl->pl_lock); \ + \ + return count; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(var, type) \ + static ssize_t var##_show(struct kobject *kobj, \ + struct attribute *attr, \ + char *buf) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + \ + return LDLM_POOL_SYSFS_PRINT_##type(pl->pl_##var); \ + } \ + struct __##var##__dummy_read {; } /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(var, type) \ + static ssize_t var##_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buffer, \ + size_t count) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + unsigned long tmp; \ + int rc; \ + \ + rc = kstrtoul(buffer, 10, &tmp); \ + if (rc < 0) { \ + return rc; \ + } \ + \ + LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ + \ + return count; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +static inline void +ldlm_add_var(struct ldebugfs_vars *vars, struct dentry *debugfs_entry, + const char *name, void *data, const struct file_operations *ops) +{ + snprintf((char 
*)vars->name, MAX_STRING_SIZE, "%s", name); + vars->data = data; + vars->fops = ops; + ldebugfs_add_vars(debugfs_entry, vars, NULL); +} + +static inline int is_granted_or_cancelled(struct ldlm_lock *lock) +{ + int ret = 0; + + lock_res_and_lock(lock); + ret = is_granted_or_cancelled_nolock(lock); + unlock_res_and_lock(lock); + + return ret; +} + +static inline bool is_bl_done(struct ldlm_lock *lock) +{ + bool bl_done = true; + + if (!ldlm_is_bl_done(lock)) { + lock_res_and_lock(lock); + bl_done = ldlm_is_bl_done(lock); + unlock_res_and_lock(lock); + } + + return bl_done; +} + +static inline bool is_lock_converted(struct ldlm_lock *lock) +{ + bool ret = 0; + + lock_res_and_lock(lock); + ret = (lock->l_policy_data.l_inodebits.cancel_bits == 0); + unlock_res_and_lock(lock); + + return ret; +} + +typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *, + union ldlm_policy_data *); +typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *, + union ldlm_wire_policy_data *); +void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); + +/* ldlm_reclaim.c */ +#ifdef HAVE_SERVER_SUPPORT +extern __u64 ldlm_reclaim_threshold; +extern __u64 ldlm_lock_limit; +extern __u64 ldlm_reclaim_threshold_mb; +extern __u64 ldlm_lock_limit_mb; +extern struct percpu_counter ldlm_granted_total; +#endif +int ldlm_reclaim_setup(void); +void ldlm_reclaim_cleanup(void); +void ldlm_reclaim_add(struct ldlm_lock *lock); +void ldlm_reclaim_del(struct ldlm_lock *lock); +bool ldlm_reclaim_full(void); + +static inline bool ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) +{ + return memcmp(res0, res1, sizeof(*res0)) == 0; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c new file mode 100644 index 0000000000000..41e655b6fc353 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -0,0 +1,3392 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file deals with various client/target related logic including recovery. + * + * TODO: This code more logically belongs in the ptlrpc module than in ldlm and + * should be moved. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +/* @priority: If non-zero, move the selected connection to the list head. + * @create: If zero, only search in existing connections. + */ +static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority, int create) +{ + struct ptlrpc_connection *ptlrpc_conn; + struct obd_import_conn *imp_conn = NULL, *item; + lnet_nid_t nid4refnet = LNET_NID_ANY; + int rc = 0; + ENTRY; + + if (!create && !priority) { + CDEBUG(D_HA, "Nothing to do\n"); + RETURN(-EINVAL); + } + + if (imp->imp_connection && + imp->imp_connection->c_remote_uuid.uuid[0] == 0) + /* nid4refnet is used to restrict network connections */ + nid4refnet = imp->imp_connection->c_self; + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, nid4refnet); + if (!ptlrpc_conn) { + CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); + RETURN(-ENOENT); + } + + if (create) { + OBD_ALLOC(imp_conn, sizeof(*imp_conn)); + if (!imp_conn) + GOTO(out_put, rc = -ENOMEM); + } + + spin_lock(&imp->imp_lock); + list_for_each_entry(item, &imp->imp_conn_list, oic_item) { + if (obd_uuid_equals(uuid, &item->oic_uuid)) { + if (priority) { + list_del(&item->oic_item); + list_add(&item->oic_item, + &imp->imp_conn_list); + item->oic_last_attempt = 0; + } + CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? ", moved to head" : "")); + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = 0); + } + } + /* No existing import connection found for \a uuid. */ + if (create) { + imp_conn->oic_conn = ptlrpc_conn; + imp_conn->oic_uuid = *uuid; + imp_conn->oic_last_attempt = 0; + if (priority) + list_add(&imp_conn->oic_item, &imp->imp_conn_list); + else + list_add_tail(&imp_conn->oic_item, + &imp->imp_conn_list); + CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? 
"head" : "tail")); + } else { + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = -ENOENT); + } + + spin_unlock(&imp->imp_lock); + RETURN(0); +out_free: + if (imp_conn) + OBD_FREE(imp_conn, sizeof(*imp_conn)); +out_put: + ptlrpc_connection_put(ptlrpc_conn); + RETURN(rc); +} + +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) +{ + return import_set_conn(imp, uuid, 1, 0); +} + +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_add_conn); + +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_import_conn *imp_conn; + struct obd_export *dlmexp; + int rc = -ENOENT; + ENTRY; + + spin_lock(&imp->imp_lock); + if (list_empty(&imp->imp_conn_list)) { + LASSERT(!imp->imp_connection); + GOTO(out, rc); + } + + list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) { + if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid)) + continue; + LASSERT(imp_conn->oic_conn); + + if (imp_conn == imp->imp_conn_current) { + LASSERT(imp_conn->oic_conn == imp->imp_connection); + + if (imp->imp_state != LUSTRE_IMP_CLOSED && + imp->imp_state != LUSTRE_IMP_DISCON) { + CERROR("can't remove current connection\n"); + GOTO(out, rc = -EBUSY); + } + + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = NULL; + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp && dlmexp->exp_connection) { + LASSERT(dlmexp->exp_connection == + imp_conn->oic_conn); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = NULL; + } + + if (dlmexp != NULL) + class_export_put(dlmexp); + } + + list_del(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + CDEBUG(D_HA, "imp %p@%s: remove connection %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid); + rc = 0; + break; + } +out: + spin_unlock(&imp->imp_lock); + if (rc == -ENOENT) + CERROR("connection %s not found\n", uuid->uuid); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_del_conn); + +/** + * Find conn UUID by peer NID. \a peer is a server NID. This function is used + * to find a conn uuid of \a imp which can reach \a peer. + */ +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + ENTRY; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + /* Check if conn UUID does have this peer NID. */ + if (class_check_uuid(&conn->oic_uuid, peer)) { + *uuid = conn->oic_uuid; + rc = 0; + break; + } + } + spin_unlock(&imp->imp_lock); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_find_conn); + +void client_destroy_import(struct obd_import *imp) +{ + /* Drop security policy instance after all RPCs have finished/aborted + * to let all busy contexts be released. */ + class_import_get(imp); + class_destroy_import(imp); + sptlrpc_import_sec_put(imp); + class_import_put(imp); +} +EXPORT_SYMBOL(client_destroy_import); + +/** + * Check whether or not the OSC is on MDT. + * In the config log, + * osc on MDT + * setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID + * osc on client + * setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID + * + **/ +static int osc_on_mdt(char *obdname) +{ + char *ptr; + + ptr = strrchr(obdname, '-'); + if (ptr == NULL) + return 0; + + if (strncmp(ptr + 1, "MDT", 3) == 0) + return 1; + + return 0; +} + +/* Configure an RPC client OBD device. 
+ * + * lcfg parameters: + * 1 - client UUID + * 2 - server UUID + * 3 - inactive-on-startup + * 4 - restrictive net + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obddev->u.cli; + struct obd_import *imp; + struct obd_uuid server_uuid; + int rq_portal, rp_portal, connect_op; + char *name = obddev->obd_type->typ_name; + enum ldlm_ns_type ns_type = LDLM_NS_TYPE_UNKNOWN; + char *cli_name = lustre_cfg_buf(lcfg, 0); + struct ptlrpc_connection fake_conn = { .c_self = 0, + .c_remote_uuid.uuid[0] = 0 }; + int rc; + ENTRY; + + /* In a more perfect world, we would hang a ptlrpc_client off of + * obd_type and just use the values from there. */ + if (!strcmp(name, LUSTRE_OSC_NAME)) { + rq_portal = OST_REQUEST_PORTAL; + rp_portal = OSC_REPLY_PORTAL; + connect_op = OST_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + } else if (!strcmp(name, LUSTRE_MDC_NAME) || + !strcmp(name, LUSTRE_LWP_NAME)) { + rq_portal = MDS_REQUEST_PORTAL; + rp_portal = MDC_REPLY_PORTAL; + connect_op = MDS_CONNECT; + if (is_lwp_on_ost(cli_name)) + cli->cl_sp_me = LUSTRE_SP_OST; + else if (is_lwp_on_mdt(cli_name)) + cli->cl_sp_me = LUSTRE_SP_MDT; + else + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + } else if (!strcmp(name, LUSTRE_OSP_NAME)) { + if (strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL) { + /* OSP_on_MDT for other MDTs */ + connect_op = MDS_CONNECT; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + rq_portal = OUT_PORTAL; + } else { + /* OSP on MDT for OST */ + connect_op = OST_CONNECT; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + rq_portal = OST_REQUEST_PORTAL; + } + rp_portal = OSC_REPLY_PORTAL; + cli->cl_sp_me = LUSTRE_SP_MDT; + } else if (!strcmp(name, LUSTRE_MGC_NAME)) { + rq_portal = MGS_REQUEST_PORTAL; + rp_portal = MGC_REPLY_PORTAL; + connect_op = MGS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_MGC; + cli->cl_sp_to = LUSTRE_SP_MGS; + cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; + ns_type = LDLM_NS_TYPE_MGC; + } else { + CERROR("unknown client OBD type \"%s\", can't setup\n", + name); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { + CERROR("client UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { + CERROR("setup requires a SERVER UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { + CERROR("target UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + init_rwsem(&cli->cl_sem); + mutex_init(&cli->cl_mgc_mutex); + cli->cl_seq = NULL; + init_rwsem(&cli->cl_seq_rwsem); + cli->cl_conn_count = 0; + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); + + cli->cl_dirty_pages = 0; + cli->cl_dirty_max_pages = 0; + cli->cl_avail_grant = 0; + /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ + /* cl_dirty_max_pages may be changed at connect time in + * ptlrpc_connect_interpret(). 
*/ + client_adjust_max_dirty(cli); + init_waitqueue_head(&cli->cl_cache_waiters); + INIT_LIST_HEAD(&cli->cl_loi_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_write_list); + INIT_LIST_HEAD(&cli->cl_loi_read_list); + spin_lock_init(&cli->cl_loi_list_lock); + atomic_set(&cli->cl_pending_w_pages, 0); + atomic_set(&cli->cl_pending_r_pages, 0); + cli->cl_r_in_flight = 0; + cli->cl_w_in_flight = 0; + + spin_lock_init(&cli->cl_read_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_read_page_hist.oh_lock); + spin_lock_init(&cli->cl_write_page_hist.oh_lock); + spin_lock_init(&cli->cl_read_offset_hist.oh_lock); + spin_lock_init(&cli->cl_write_offset_hist.oh_lock); + + /* lru for osc. */ + INIT_LIST_HEAD(&cli->cl_lru_osc); + atomic_set(&cli->cl_lru_shrinkers, 0); + atomic_long_set(&cli->cl_lru_busy, 0); + atomic_long_set(&cli->cl_lru_in_list, 0); + INIT_LIST_HEAD(&cli->cl_lru_list); + spin_lock_init(&cli->cl_lru_list_lock); + atomic_long_set(&cli->cl_unstable_count, 0); + INIT_LIST_HEAD(&cli->cl_shrink_list); + INIT_LIST_HEAD(&cli->cl_grant_chain); + + INIT_LIST_HEAD(&cli->cl_flight_waiters); + cli->cl_rpcs_in_flight = 0; + + init_waitqueue_head(&cli->cl_destroy_waitq); + atomic_set(&cli->cl_destroy_in_flight, 0); + + cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; +#ifdef ENABLE_CHECKSUM + /* Turn on checksumming by default. */ + cli->cl_checksum = 1; + /* + * The supported checksum types will be worked out at connect time + * Set cl_chksum* to CRC32 for now to avoid returning screwed info + * through procfs. + */ + cli->cl_cksum_type = cli->cl_supp_cksum_types; +#endif + atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); + + /* Set it to possible maximum size. It may be reduced by ocd_brw_size + * from OFD after connecting. */ + cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; + + cli->cl_max_short_io_bytes = OBD_MAX_SHORT_IO_BYTES; + + /* set cl_chunkbits default value to PAGE_SHIFT, + * it will be updated at OSC connection time. 
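+ * (cl_chunkbits is the log2 of the chunk size, so the default chunk is
+ * a single page, 1 << PAGE_SHIFT bytes, until the connect-time value
+ * replaces it)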
*/ + cli->cl_chunkbits = PAGE_SHIFT; + + if (!strcmp(name, LUSTRE_MDC_NAME)) { + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 128 /* MB */) { + cli->cl_max_rpcs_in_flight = 2; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 256 /* MB */) { + cli->cl_max_rpcs_in_flight = 3; + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 512 /* MB */) { + cli->cl_max_rpcs_in_flight = 4; + } else { + if (osc_on_mdt(obddev->obd_name)) + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX; + else + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; + } + + spin_lock_init(&cli->cl_mod_rpcs_lock); + spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); + cli->cl_max_mod_rpcs_in_flight = 0; + cli->cl_mod_rpcs_in_flight = 0; + cli->cl_close_rpcs_in_flight = 0; + init_waitqueue_head(&cli->cl_mod_rpcs_waitq); + cli->cl_mod_tag_bitmap = NULL; + + INIT_LIST_HEAD(&cli->cl_chg_dev_linkage); + + if (connect_op == MDS_CONNECT) { + cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1; + OBD_ALLOC(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + if (cli->cl_mod_tag_bitmap == NULL) + GOTO(err, rc = -ENOMEM); + } + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + GOTO(err, rc); + } + + ptlrpc_init_client(rq_portal, rp_portal, name, + &obddev->obd_ldlm_client); + + imp = class_new_import(obddev); + if (imp == NULL) + GOTO(err_ldlm, rc = -ENOENT); + imp->imp_client = &obddev->obd_ldlm_client; + imp->imp_connect_op = connect_op; + memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 1)); + class_import_put(imp); + + if (lustre_cfg_buf(lcfg, 4)) { + __u32 refnet = libcfs_str2net(lustre_cfg_string(lcfg, 4)); + + if (refnet == LNET_NIDNET(LNET_NID_ANY)) { + rc = -EINVAL; + CERROR("%s: bad mount option 'network=%s': rc = %d\n", + obddev->obd_name, lustre_cfg_string(lcfg, 4), + rc); + GOTO(err_import, rc); + } + fake_conn.c_self = LNET_MKNID(refnet, 0); + imp->imp_connection = &fake_conn; + } + + rc = client_import_add_conn(imp, &server_uuid, 1); + if (rc) { + CERROR("can't add initial connection\n"); + GOTO(err_import, rc); + } + imp->imp_connection = NULL; + + cli->cl_import = imp; + /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */ + cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); + + if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { + CDEBUG(D_HA, "marking %s %s->%s as inactive\n", + name, obddev->obd_name, + cli->cl_target_uuid.uuid); + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + } + } + + obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name, + LDLM_NAMESPACE_CLIENT, + LDLM_NAMESPACE_GREEDY, + ns_type); + if (obddev->obd_namespace == NULL) { + CERROR("Unable to create client namespace - %s\n", + obddev->obd_name); + GOTO(err_import, rc = -ENOMEM); + } + + RETURN(rc); + +err_import: + class_destroy_import(imp); +err_ldlm: + ldlm_put_ref(); +err: + if (cli->cl_mod_tag_bitmap != NULL) + OBD_FREE(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + cli->cl_mod_tag_bitmap = NULL; + RETURN(rc); + +} +EXPORT_SYMBOL(client_obd_setup); + +int client_obd_cleanup(struct obd_device *obddev) +{ + struct client_obd *cli = &obddev->u.cli; + ENTRY; + + ldlm_namespace_free_post(obddev->obd_namespace); + obddev->obd_namespace = NULL; + + obd_cleanup_client_import(obddev); + LASSERT(obddev->u.cli.cl_import == NULL); + + ldlm_put_ref(); + + if 
(cli->cl_mod_tag_bitmap != NULL) + OBD_FREE(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + cli->cl_mod_tag_bitmap = NULL; + + RETURN(0); +} +EXPORT_SYMBOL(client_obd_cleanup); + +/* ->o_connect() method for client side (OSC and MDC and MGC) */ +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct obd_connect_data *ocd; + struct lustre_handle conn = { 0 }; + int rc; + ENTRY; + + *exp = NULL; + down_write(&cli->cl_sem); + if (cli->cl_conn_count > 0) + GOTO(out_sem, rc = -EALREADY); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + GOTO(out_sem, rc); + + cli->cl_conn_count++; + *exp = class_conn2export(&conn); + + LASSERT(obd->obd_namespace); + + imp->imp_dlm_handle = conn; + rc = ptlrpc_init_import(imp); + if (rc != 0) + GOTO(out_ldlm, rc); + + ocd = &imp->imp_connect_data; + if (data) { + *ocd = *data; + imp->imp_connect_flags_orig = data->ocd_connect_flags; + imp->imp_connect_flags2_orig = data->ocd_connect_flags2; + } + + rc = ptlrpc_connect_import(imp); + if (rc != 0) { + LASSERT(imp->imp_state == LUSTRE_IMP_DISCON); + GOTO(out_ldlm, rc); + } + LASSERT(*exp != NULL && (*exp)->exp_connection); + + if (data) { + LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == + ocd->ocd_connect_flags, "old %#llx, new %#llx\n", + data->ocd_connect_flags, ocd->ocd_connect_flags); + data->ocd_connect_flags = ocd->ocd_connect_flags; + data->ocd_connect_flags2 = ocd->ocd_connect_flags2; + } + + ptlrpc_pinger_add_import(imp); + + EXIT; + + if (rc) { +out_ldlm: + cli->cl_conn_count--; + class_disconnect(*exp); + *exp = NULL; + } +out_sem: + up_write(&cli->cl_sem); + + return rc; +} +EXPORT_SYMBOL(client_connect_import); + +int client_disconnect_export(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct client_obd *cli; + struct obd_import *imp; + int rc = 0, err; + ENTRY; + + if (!obd) { + CERROR("invalid export for disconnect: exp %p cookie %#llx\n", + exp, exp ? exp->exp_handle.h_cookie : -1); + RETURN(-EINVAL); + } + + cli = &obd->u.cli; + imp = cli->cl_import; + + down_write(&cli->cl_sem); + CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name, + cli->cl_conn_count); + + if (cli->cl_conn_count == 0) { + CERROR("disconnecting disconnected device (%s)\n", + obd->obd_name); + GOTO(out_disconnect, rc = -EINVAL); + } + + cli->cl_conn_count--; + if (cli->cl_conn_count != 0) + GOTO(out_disconnect, rc = 0); + + /* Mark import deactivated now, so we don't try to reconnect if any + * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't + * fully deactivate the import, or that would drop all requests. */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + /* Some non-replayable imports (MDS's OSCs) are pinged, so just + * delete it regardless. (It's safe to delete an import that was + * never added.) */ + (void)ptlrpc_pinger_del_import(imp); + + if (obd->obd_namespace != NULL) { + /* obd_force == local only */ + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, + obd->obd_force ? LCF_LOCAL : 0, NULL); + ldlm_namespace_free_prior(obd->obd_namespace, imp, obd->obd_force); + } + + /* There's no need to hold sem while disconnecting an import, + * and it may actually cause deadlock in GSS. 
*/ + up_write(&cli->cl_sem); + rc = ptlrpc_disconnect_import(imp, 0); + down_write(&cli->cl_sem); + + ptlrpc_invalidate_import(imp); + + EXIT; + +out_disconnect: + /* Use server style - class_disconnect should be always called for + * o_disconnect. */ + err = class_disconnect(exp); + if (!rc && err) + rc = err; + + up_write(&cli->cl_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(client_disconnect_export); + +#ifdef HAVE_SERVER_SUPPORT +int server_disconnect_export(struct obd_export *exp) +{ + int rc; + ENTRY; + + /* Disconnect early so that clients can't keep using export. */ + rc = class_disconnect(exp); + /* Close import to avoid sending any requests. */ + if (exp->exp_imp_reverse) + ptlrpc_cleanup_imp(exp->exp_imp_reverse); + + ldlm_bl_thread_wakeup(); + + /* complete all outstanding replies */ + spin_lock(&exp->exp_lock); + while (!list_empty(&exp->exp_outstanding_replies)) { + struct ptlrpc_reply_state *rs = + list_entry(exp->exp_outstanding_replies.next, + struct ptlrpc_reply_state, rs_exp_list); + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + + list_del_init(&rs->rs_exp_list); + + spin_lock(&rs->rs_lock); + /* clear rs_convert_lock to make sure rs is handled and put */ + rs->rs_convert_lock = 0; + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + + spin_unlock(&svcpt->scp_rep_lock); + } + spin_unlock(&exp->exp_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(server_disconnect_export); + +static inline int target_check_recovery_timer(struct obd_device *target) +{ + ktime_t remaining; + s64 timeout; + + if (!target->obd_recovering || target->obd_recovery_start == 0) + return 0; + + remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > -30) + return 0; + + /* the recovery timer should expire, but it isn't triggered, + * it's better to abort the recovery of this target to speed up + * the recovery of the whole cluster. */ + spin_lock(&target->obd_dev_lock); + if (target->obd_recovering) { + CERROR("%s: Aborting recovery\n", target->obd_name); + target->obd_abort_recovery = 1; + wake_up(&target->obd_next_transno_waitq); + } + spin_unlock(&target->obd_dev_lock); + return 0; +} + +/* -------------------------------------------------------------------------- + * from old lib/target.c + * -------------------------------------------------------------------------- */ + +static int target_handle_reconnect(struct lustre_handle *conn, + struct obd_export *exp, + struct obd_uuid *cluuid) +{ + struct obd_device *target; + struct lustre_handle *hdl; + ktime_t remaining; + s64 timeout; + int rc = 0; + + ENTRY; + hdl = &exp->exp_imp_reverse->imp_remote_handle; + if (!exp->exp_connection || !lustre_handle_is_used(hdl)) { + conn->cookie = exp->exp_handle.h_cookie; + CDEBUG(D_HA, "connect export for UUID '%s' at %p," + " cookie %#llx\n", cluuid->uuid, exp, conn->cookie); + RETURN(0); + } + + target = exp->exp_obd; + + /* Might be a re-connect after a partition. */ + if (memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { + LCONSOLE_WARN("%s: already connected client %s (at %s) " + "with handle %#llx. Rejecting client " + "with the same UUID trying to reconnect " + "with handle %#llx\n", target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + hdl->cookie, conn->cookie); + memset(conn, 0, sizeof *conn); + /* target_handle_connect() treats EALREADY and + * -EALREADY differently. -EALREADY is an error + * (same UUID, different handle). 
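+ * A positive EALREADY (returned at the end of this function via the
+ * out_already path) instead means a valid reconnect from the same
+ * client, as noted below.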
*/ + RETURN(-EALREADY); + } + + if (!target->obd_recovering) { + LCONSOLE_WARN("%s: Client %s (at %s) reconnecting\n", + target->obd_name, obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp)); + GOTO(out_already, rc); + } + + remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > 0) { + LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + atomic_read(&target->obd_max_recoverable_clients), + timeout / 60, timeout % 60); + } else { + struct target_distribute_txn_data *tdtd; + int size = 0; + int count = 0; + char *buf = NULL; + + target_check_recovery_timer(target); + + tdtd = class_exp2tgt(exp)->lut_tdtd; + if (tdtd && tdtd->tdtd_show_update_logs_retrievers) + buf = tdtd->tdtd_show_update_logs_retrievers( + tdtd->tdtd_show_retrievers_cbdata, + &size, &count); + + if (count > 0) + LCONSOLE_WARN("%s: Client %s (at %s) reconnecting, waiting for %d MDTs (%s) in recovery for %lld:%.02lld. Please wait until all MDTs recovered or you may force MDT evicition via 'lctl --device %s abort_recovery.\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), count, + buf ? buf : "unknown (not enough RAM)", + (abs(timeout) + target->obd_recovery_timeout) / 60, + (abs(timeout) + target->obd_recovery_timeout) % 60, + target->obd_name); + else + LCONSOLE_WARN("%s: Recovery already passed deadline %lld:%.02lld. If you do not want to wait more, you may force taget eviction via 'lctl --device %s abort_recovery.\n", + target->obd_name, abs(timeout) / 60, + abs(timeout) % 60, target->obd_name); + + if (buf != NULL) + OBD_FREE(buf, size); + } + +out_already: + conn->cookie = exp->exp_handle.h_cookie; + /* target_handle_connect() treats EALREADY and + * -EALREADY differently. EALREADY means we are + * doing a valid reconnect from the same client. */ + RETURN(EALREADY); +} + +static void +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, int new_client); + +/** + * update flags for import during reconnect process + */ +static int rev_import_flags_update(struct obd_import *revimp, + struct ptlrpc_request *req) +{ + int rc; + struct obd_connect_data *data; + + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + + if (data->ocd_connect_flags & OBD_CONNECT_AT) + revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + + rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr); + if (rc) { + CERROR("%s: cannot get reverse import %s security: rc = %d\n", + revimp->imp_client->cli_name, + libcfs_id2str(req->rq_peer), rc); + return rc; + } + + return 0; +} + +/** + * Allocate a new reverse import for an export. 
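+ *
+ * The reverse import is used by the server side to send its own RPCs,
+ * such as lock blocking/completion callbacks, back to the connected
+ * client.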
+ * + * \retval -errno in case error hit + * \retval 0 if reverse import correctly init + **/ +int rev_import_init(struct obd_export *export) +{ + struct obd_device *obd = export->exp_obd; + struct obd_import *revimp; + + LASSERT(export->exp_imp_reverse == NULL); + + revimp = class_new_import(obd); + if (revimp == NULL) + return -ENOMEM; + + revimp->imp_remote_handle.cookie = 0ULL; + revimp->imp_client = &obd->obd_ldlm_client; + revimp->imp_dlm_fake = 1; + + /* it is safe to connect import in new state as no sends possible */ + spin_lock(&export->exp_lock); + export->exp_imp_reverse = revimp; + spin_unlock(&export->exp_lock); + class_import_put(revimp); + + return 0; +} +EXPORT_SYMBOL(rev_import_init); + +/** + * Handle reconnect for an export. + * + * \param exp export to handle reconnect process + * \param req client reconnect request + * + * \retval -rc in case securitfy flavor can't be changed + * \retval 0 in case none problems + */ +static int rev_import_reconnect(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct obd_import *revimp = exp->exp_imp_reverse; + struct lustre_handle *lh; + int rc; + + /* avoid sending a request until import flags are changed */ + ptlrpc_import_enter_resend(revimp); + + if (revimp->imp_connection != NULL) + ptlrpc_connection_put(revimp->imp_connection); + + /* + * client from recovery don't have a handle so we need to take from + * request. it may produce situation when wrong client connected + * to recovery as we trust a client uuid + */ + lh = req_capsule_client_get(&req->rq_pill, &RMF_CONN); + revimp->imp_remote_handle = *lh; + + /* unknown versions will be caught in + * ptlrpc_handle_server_req_in->lustre_unpack_msg() */ + revimp->imp_msg_magic = req->rq_reqmsg->lm_magic; + + revimp->imp_connection = ptlrpc_connection_addref(exp->exp_connection); + + rc = rev_import_flags_update(revimp, req); + if (rc != 0) { + /* it is safe to still be in RECOVERY phase as we are not able + * to setup correct security flavor so requests are not able to + * be delivered correctly */ + return rc; + } + + /* resend all rpc's via new connection */ + return ptlrpc_import_recovery_state_machine(revimp); +} + +int target_handle_connect(struct ptlrpc_request *req) +{ + struct obd_device *target = NULL; + struct obd_export *export = NULL; + /* connect handle - filled from target_handle_reconnect in + * reconnect case */ + struct lustre_handle conn; + struct lustre_handle *tmp; + struct obd_uuid cluuid; + char *str; + int rc = 0; + char *target_start; + int target_len; + bool mds_conn = false, lw_client = false, initial_conn = false; + bool mds_mds_conn = false; + bool new_mds_mds_conn = false; + struct obd_connect_data *data, *tmpdata; + int size, tmpsize; + lnet_nid_t *client_nid = NULL; + ENTRY; + + OBD_RACE(OBD_FAIL_TGT_CONN_RACE); + + str = req_capsule_client_get(&req->rq_pill, &RMF_TGTUUID); + if (str == NULL) { + DEBUG_REQ(D_ERROR, req, "bad target UUID for connect"); + GOTO(out, rc = -EINVAL); + } + + target = class_dev_by_str(str); + if (!target) { + deuuidify(str, NULL, &target_start, &target_len); + LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect " + "from %s (no target). 
If you are running " + "an HA pair check that the target is " + "mounted on the other server.\n", str, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -ENODEV); + } + + spin_lock(&target->obd_dev_lock); + + target->obd_conn_inprogress++; + + if (target->obd_stopping || !target->obd_set_up) { + spin_unlock(&target->obd_dev_lock); + + deuuidify(str, NULL, &target_start, &target_len); + LCONSOLE_INFO("%.*s: Not available for connect from %s (%s)\n", + target_len, target_start, + libcfs_nid2str(req->rq_peer.nid), + (target->obd_stopping ? + "stopping" : "not set up")); + GOTO(out, rc = -ENODEV); + } + + if (target->obd_no_conn) { + spin_unlock(&target->obd_dev_lock); + + CDEBUG(D_INFO, "%s: Temporarily refusing client connection " + "from %s\n", target->obd_name, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EAGAIN); + } + + spin_unlock(&target->obd_dev_lock); + + str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); + if (str == NULL) { + DEBUG_REQ(D_ERROR, req, "bad client UUID for connect"); + GOTO(out, rc = -EINVAL); + } + + obd_str2uuid(&cluuid, str); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN); + if (tmp == NULL) + GOTO(out, rc = -EPROTO); + + conn = *tmp; + + size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_CLIENT); + if (size < 0 || size > 8 * sizeof(struct obd_connect_data)) + GOTO(out, rc = -EPROTO); + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + if (!data) + GOTO(out, rc = -EPROTO); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* Don't allow clients to connect that are using old 1.8 format + * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18, + * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc). The + * FULL20 flag should be set on all connections since 2.0, but no + * longer affects behaviour. + * + * Later this check will be disabled and the flag can be retired + * completely once interop with 3.0 is no longer needed. + */ + if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20)) + GOTO(out, rc = -EPROTO); + + /* Don't allow liblustre clients to connect. + * - testing was disabled in v2_2_50_0-61-g6a75d65 + * - building was disabled in v2_5_58_0-28-g7277179 + * - client code was deleted in v2_6_50_0-101-gcdfbc72, + * - clients were refused connect for version difference > 0.0.1.32 */ + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { + DEBUG_REQ(D_WARNING, req, "Refusing libclient connection"); + GOTO(out, rc = -EPROTO); + } +#endif + + /* Note: lw_client is needed in MDS-MDS failover during update log + * processing, so we needs to allow lw_client to be connected at + * anytime, instead of only the initial connection + */ + lw_client = OCD_HAS_FLAG(data, LIGHTWEIGHT); + + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) { + initial_conn = true; + mds_conn = OCD_HAS_FLAG(data, MDS); + mds_mds_conn = OCD_HAS_FLAG(data, MDS_MDS); + + /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS + * for Imperative Recovery connection from MGC to MGS. + * + * Via check OBD_CONNECT_FID, we can distinguish whether + * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from + * MGC or MDT, since MGC does not use OBD_CONNECT_FID. 
+ */ + if (!lw_client && + (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && + (data->ocd_connect_flags & OBD_CONNECT_FID) && + (data->ocd_connect_flags & OBD_CONNECT_VERSION)) { + __u32 major = OBD_OCD_VERSION_MAJOR(data->ocd_version); + __u32 minor = OBD_OCD_VERSION_MINOR(data->ocd_version); + __u32 patch = OBD_OCD_VERSION_PATCH(data->ocd_version); + + /* We do not support the MDT-MDT interoperations with + * different version MDT because of protocol changes. */ + if (unlikely(major != LUSTRE_MAJOR || + minor != LUSTRE_MINOR || + abs(patch - LUSTRE_PATCH) > 3)) { + LCONSOLE_WARN("%s (%u.%u.%u.%u) refused the " + "connection from different version MDT " + "(%d.%d.%d.%d) %s %s\n", + target->obd_name, LUSTRE_MAJOR, + LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX, + major, minor, patch, + OBD_OCD_VERSION_FIX(data->ocd_version), + libcfs_nid2str(req->rq_peer.nid), str); + + GOTO(out, rc = -EPROTO); + } + } + } + + /* lctl gets a backstage, all-access pass. */ + if (obd_uuid_equals(&cluuid, &target->obd_uuid)) + goto dont_check_exports; + + export = cfs_hash_lookup(target->obd_uuid_hash, &cluuid); + if (!export) + goto no_export; + + /* We've found an export in the hash. */ + + spin_lock(&export->exp_lock); + + if (export->exp_connecting) { /* bug 9635, et. al. */ + spin_unlock(&export->exp_lock); + LCONSOLE_WARN("%s: Export %p already connecting from %s\n", + export->exp_obd->obd_name, export, + libcfs_nid2str(req->rq_peer.nid)); + class_export_put(export); + export = NULL; + rc = -EALREADY; + } else if ((mds_conn || (lw_client && initial_conn) || + OCD_HAS_FLAG(data, MDS_MDS)) && export->exp_connection) { + spin_unlock(&export->exp_lock); + if (req->rq_peer.nid != export->exp_connection->c_peer.nid) { + /* MDS or LWP reconnected after failover. */ + LCONSOLE_WARN("%s: Received %s connection from %s, removing former export from %s\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str(export->exp_connection->c_peer.nid)); + } else { + /* New connection from the same NID. */ + LCONSOLE_WARN("%s: Received new %s connection from %s, %s former export from same NID\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + OCD_HAS_FLAG(data, MDS_MDS) ? + "keep" : "remove"); + } + + if (req->rq_peer.nid == export->exp_connection->c_peer.nid && + OCD_HAS_FLAG(data, MDS_MDS)) { + /* + * Because exports between MDTs will always be + * kept, let's do not fail such export if they + * come from the same NID, otherwise it might + * cause eviction between MDTs, which might + * cause namespace inconsistency */ + spin_lock(&export->exp_lock); + export->exp_connecting = 1; + export->exp_conn_cnt = 0; + spin_unlock(&export->exp_lock); + conn.cookie = export->exp_handle.h_cookie; + rc = EALREADY; + } else { + class_fail_export(export); + class_export_put(export); + export = NULL; + rc = 0; + } + } else if (export->exp_connection != NULL && initial_conn && + req->rq_peer.nid != export->exp_connection->c_peer.nid) { + spin_unlock(&export->exp_lock); + /* In MDS failover we have static UUID but NID can change. 
*/ + LCONSOLE_WARN("%s: Client %s seen on new nid %s when " + "existing nid %s is already connected\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str( + export->exp_connection->c_peer.nid)); + rc = -EALREADY; + class_export_put(export); + export = NULL; + } else { + export->exp_connecting = 1; + spin_unlock(&export->exp_lock); + LASSERT(export->exp_obd == target); + + rc = target_handle_reconnect(&conn, export, &cluuid); + } + + /* If we found an export, we already unlocked. */ + if (!export) { +no_export: + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout); + } else if (req->rq_export == NULL && + atomic_read(&export->exp_rpc_count) > 0) { + LCONSOLE_WARN("%s: Client %s (at %s) refused connection, " + "still busy with %d references\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + atomic_read(&export->exp_refcount)); + GOTO(out, rc = -EBUSY); + } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 && + rc != EALREADY) { + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the " + "known client %s (at %s) because it " + "is indicating it is a new client", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EALREADY); + } else { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); + } + + if (rc < 0) { + GOTO(out, rc); + } + + CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n", + target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + target->obd_recovering ? "recovering/" : "", data->ocd_transno, + export, ktime_get_seconds(), + export ? export->exp_last_request_time : 0); + + /* If this is the first time a client connects, reset the recovery + * timer. Discard lightweight connections which might be local. */ + if (!lw_client && rc == 0 && target->obd_recovering) + check_and_start_recovery_timer(target, req, export == NULL); + + /* We want to handle EALREADY but *not* -EALREADY from + * target_handle_reconnect(), return reconnection state in a flag. */ + if (rc == EALREADY) { + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + rc = 0; + } else { + LASSERT(rc == 0); + } + + /* Tell the client if we support replayable requests. 
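+ * (The client looks at MSG_CONNECT_REPLAYABLE in the reply op flags to
+ * decide whether to keep requests around for replay across a server
+ * restart.)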
*/ + if (target->obd_replayable) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE); + client_nid = &req->rq_peer.nid; + + if (export == NULL) { + /* allow lightweight connections during recovery */ + /* allow "new" MDT to be connected during recovery, since we + * need retrieve recovery update records from it */ + if (target->obd_recovering && !lw_client && !mds_mds_conn) { + struct hrtimer *timer = &target->obd_recovery_timer; + ktime_t remaining; + s64 timeout, left; + int in_progress; + int connected; + int known; + int stale; + char *msg; + + connected = atomic_read(&target->obd_connected_clients); + in_progress = atomic_read(&target->obd_lock_replay_clients); + known = + atomic_read(&target->obd_max_recoverable_clients); + stale = target->obd_stale_clients; + remaining = hrtimer_expires_remaining(timer); + left = ktime_divns(remaining, NSEC_PER_SEC); + if (ktime_to_ns(remaining) > 0) { + msg = "to recover in"; + timeout = left; + } else { + msg = "already passed deadline"; + timeout = -left; + + target_check_recovery_timer(target); + } + + LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), known, + connected - in_progress, in_progress, + stale, msg, timeout / 60, timeout % 60); + rc = -EBUSY; + } else { +dont_check_exports: + rc = obd_connect(req->rq_svc_thread->t_env, + &export, target, &cluuid, data, + client_nid); + if (mds_conn && OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG)) + lustre_msg_add_op_flags(req->rq_repmsg, + MSG_CONNECT_RECOVERING); + if (rc == 0) { + conn.cookie = export->exp_handle.h_cookie; + rc = rev_import_init(export); + } + + if (mds_mds_conn) + new_mds_mds_conn = true; + } + } else { + rc = obd_reconnect(req->rq_svc_thread->t_env, + export, target, &cluuid, data, client_nid); + } + if (rc) + GOTO(out, rc); + + LASSERT(target->u.obt.obt_magic == OBT_MAGIC); + data->ocd_instance = target->u.obt.obt_instance; + + /* Return only the parts of obd_connect_data that we understand, so the + * client knows that we don't understand the rest. */ + if (data) { + tmpsize = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + tmpdata = req_capsule_server_get(&req->rq_pill, + &RMF_CONNECT_DATA); + /* Don't use struct assignment here, because the client reply + * buffer may be smaller/larger than the local struct + * obd_connect_data. */ + memcpy(tmpdata, data, min(tmpsize, size)); + } + + /* If the client and the server are the same node, we will already + * have an export that really points to the client's DLM export, + * because we have a shared handles table. 
+ * + * XXX this will go away when shaver stops sending the "connect" handle + * in the real "remote handle" field of the request --phik 24 Apr 2003 + */ + ptlrpc_request_change_export(req, export); + + spin_lock(&export->exp_lock); + if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + spin_unlock(&export->exp_lock); + CDEBUG(D_RPCTRACE, "%s: %s already connected at greater " + "or equal conn_cnt: %d >= %d\n", + cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + export->exp_conn_cnt, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + GOTO(out, rc = -EALREADY); + } + LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); + export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + spin_unlock(&export->exp_lock); + + if (export->exp_connection != NULL) { + /* Check to see if connection came from another NID. */ + if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && + !hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + ptlrpc_connection_put(export->exp_connection); + } + + export->exp_connection = ptlrpc_connection_get(req->rq_peer, + req->rq_self, + &cluuid); + if (hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_add(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + lustre_msg_set_handle(req->rq_repmsg, &conn); + + rc = rev_import_reconnect(export, req); + if (rc != 0) + GOTO(out, rc); + + if (target->obd_recovering && !export->exp_in_recovery && !lw_client) { + int has_transno; + __u64 transno = data->ocd_transno; + + spin_lock(&export->exp_lock); + /* possible race with class_disconnect_stale_exports, + * export may be already in the eviction process */ + if (export->exp_failed) { + spin_unlock(&export->exp_lock); + GOTO(out, rc = -ENODEV); + } + export->exp_in_recovery = 1; + export->exp_req_replay_needed = 1; + export->exp_lock_replay_needed = 1; + spin_unlock(&export->exp_lock); + + has_transno = !!(lustre_msg_get_op_flags(req->rq_reqmsg) & + MSG_CONNECT_TRANSNO); + if (has_transno && transno == 0) + CWARN("Connect with zero transno!\n"); + + if (has_transno && transno > 0 && + transno < target->obd_next_recovery_transno && + transno > target->obd_last_committed) { + /* Another way is to use cmpxchg() to be lock-free. */ + spin_lock(&target->obd_recovery_task_lock); + if (transno < target->obd_next_recovery_transno) + target->obd_next_recovery_transno = transno; + spin_unlock(&target->obd_recovery_task_lock); + } + + atomic_inc(&target->obd_req_replay_clients); + atomic_inc(&target->obd_lock_replay_clients); + /* Note: MDS-MDS connection is allowed to be connected during + * recovery, no matter if the exports needs to be recoveried. + * Because we need retrieve updates logs from all other MDTs. + * So if the MDS-MDS export is new, obd_max_recoverable_clients + * also needs to be increased to match other recovery checking + * condition. */ + if (new_mds_mds_conn) + atomic_inc(&target->obd_max_recoverable_clients); + + if (atomic_inc_return(&target->obd_connected_clients) == + atomic_read(&target->obd_max_recoverable_clients)) + wake_up(&target->obd_next_transno_waitq); + } + + /* Tell the client we're in recovery, when client is involved in it. 
*/ + if (target->obd_recovering && !lw_client) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); + +out: + if (export) { + spin_lock(&export->exp_lock); + export->exp_connecting = 0; + spin_unlock(&export->exp_lock); + + class_export_put(export); + } + if (target != NULL) { + spin_lock(&target->obd_dev_lock); + target->obd_conn_inprogress--; + spin_unlock(&target->obd_dev_lock); + class_decref(target, "find", current); + } + req->rq_status = rc; + RETURN(rc); +} + +int target_handle_disconnect(struct ptlrpc_request *req) +{ + int rc; + ENTRY; + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + /* In case of target disconnect, updating sec ctx immediately is + * required in order to record latest sequence number used. + * Sequence is normally updated on export destroy, but this event + * can occur too late, ie after a new target connect request has + * been processed. + * Maintaining correct sequence when client connection becomes idle + * ensures that GSS does not erroneously consider requests as replays. + */ + rc = sptlrpc_export_update_ctx(req->rq_export); + if (rc) + RETURN(rc); + + /* Keep the rq_export around so we can send the reply. */ + req->rq_status = obd_disconnect(class_export_get(req->rq_export)); + + RETURN(0); +} + +void target_destroy_export(struct obd_export *exp) +{ + struct obd_import *imp = NULL; + /* exports created from last_rcvd data, and "fake" + exports created by lctl don't have an import */ + spin_lock(&exp->exp_lock); + if (exp->exp_imp_reverse != NULL) { + imp = exp->exp_imp_reverse; + exp->exp_imp_reverse = NULL; + } + spin_unlock(&exp->exp_lock); + if (imp != NULL) + client_destroy_import(imp); + + LASSERT_ATOMIC_ZERO(&exp->exp_locks_count); + LASSERT_ATOMIC_ZERO(&exp->exp_rpc_count); + LASSERT_ATOMIC_ZERO(&exp->exp_cb_count); + LASSERT_ATOMIC_ZERO(&exp->exp_replay_count); +} +EXPORT_SYMBOL(target_destroy_export); + +/* + * Recovery functions + */ +static void target_request_copy_get(struct ptlrpc_request *req) +{ + class_export_rpc_inc(req->rq_export); + LASSERT(list_empty(&req->rq_list)); + INIT_LIST_HEAD(&req->rq_replay_list); + + /* Increase refcount to keep request in queue. */ + atomic_inc(&req->rq_refcount); + /* Let export know it has replays to be handled. */ + atomic_inc(&req->rq_export->exp_replay_count); +} + +static void target_request_copy_put(struct ptlrpc_request *req) +{ + LASSERT(list_empty(&req->rq_replay_list)); + LASSERT_ATOMIC_POS(&req->rq_export->exp_replay_count); + + atomic_dec(&req->rq_export->exp_replay_count); + class_export_rpc_dec(req->rq_export); + ptlrpc_server_drop_request(req); +} + +static int target_exp_enqueue_req_replay(struct ptlrpc_request *req) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct obd_export *exp = req->rq_export; + struct ptlrpc_request *reqiter; + struct ptlrpc_request *dup_req = NULL; + int dup = 0; + + LASSERT(exp); + + spin_lock(&exp->exp_lock); + list_for_each_entry(reqiter, &exp->exp_req_replay_queue, + rq_replay_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) { + dup_req = reqiter; + dup = 1; + break; + } + } + + if (dup) { + /* We expect it with RESENT and REPLAY flags. 
*/ + if ((lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY)) + CERROR("invalid flags %x of resent replay\n", + lustre_msg_get_flags(req->rq_reqmsg)); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u32 new_conn; + + new_conn = lustre_msg_get_conn_cnt(req->rq_reqmsg); + if (new_conn > + lustre_msg_get_conn_cnt(dup_req->rq_reqmsg)) + lustre_msg_set_conn_cnt(dup_req->rq_reqmsg, + new_conn); + } + } else { + list_add_tail(&req->rq_replay_list, + &exp->exp_req_replay_queue); + } + + spin_unlock(&exp->exp_lock); + return dup; +} + +static void target_exp_dequeue_req_replay(struct ptlrpc_request *req) +{ + LASSERT(!list_empty(&req->rq_replay_list)); + LASSERT(req->rq_export); + + spin_lock(&req->rq_export->exp_lock); + list_del_init(&req->rq_replay_list); + spin_unlock(&req->rq_export->exp_lock); +} + +static void target_finish_recovery(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + ENTRY; + + /* Only log a recovery message when recovery has occurred. */ + if (obd->obd_recovery_start) { + time64_t now = ktime_get_seconds(); + time64_t elapsed_time; + + elapsed_time = max_t(time64_t, now - obd->obd_recovery_start, 1); + LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients " + "%d recovered and %d %s evicted.\n", obd->obd_name, + (s64)elapsed_time / 60, (s64)elapsed_time % 60, + atomic_read(&obd->obd_max_recoverable_clients), + atomic_read(&obd->obd_connected_clients), + obd->obd_stale_clients, + obd->obd_stale_clients == 1 ? "was" : "were"); + } + + ldlm_reprocess_recovery_done(obd->obd_namespace); + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_req_replay_queue) || + !list_empty(&obd->obd_lock_replay_queue) || + !list_empty(&obd->obd_final_req_queue)) { + CERROR("%s: Recovery queues ( %s%s%s) are not empty\n", + obd->obd_name, + list_empty(&obd->obd_req_replay_queue) ? "" : "req ", + list_empty(&obd->obd_lock_replay_queue) ? \ + "" : "lock ", + list_empty(&obd->obd_final_req_queue) ? \ + "" : "final "); + spin_unlock(&obd->obd_recovery_task_lock); + LBUG(); + } + spin_unlock(&obd->obd_recovery_task_lock); + + obd->obd_recovery_end = ktime_get_seconds(); + + /* When recovery finished, cleanup orphans on MDS and OST. 
*/ + if (obd->obd_type && OBP(obd, postrecov)) { + int rc = OBP(obd, postrecov)(obd); + + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); + } + EXIT; +} + +static void abort_req_replay_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + struct list_head abort_list; + + INIT_LIST_HEAD(&abort_list); + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_req_replay_queue, &abort_list); + spin_unlock(&obd->obd_recovery_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { + DEBUG_REQ(D_WARNING, req, "aborted:"); + req->rq_status = -ENOTCONN; + if (ptlrpc_error(req)) { + DEBUG_REQ(D_ERROR, req, + "failed abort_req_reply; skipping"); + } + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + } +} + +static void abort_lock_replay_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + struct list_head abort_list; + + INIT_LIST_HEAD(&abort_list); + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &abort_list); + spin_unlock(&obd->obd_recovery_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { + DEBUG_REQ(D_ERROR, req, "aborted:"); + req->rq_status = -ENOTCONN; + if (ptlrpc_error(req)) { + DEBUG_REQ(D_ERROR, req, + "failed abort_lock_reply; skipping"); + } + target_request_copy_put(req); + } +} + +/* Called from a cleanup function if the device is being cleaned up + forcefully. The exports should all have been disconnected already, + the only thing left to do is + - clear the recovery flags + - cancel the timer + - free queued requests and replies, but don't send replies + Because the obd_stopping flag is set, no new requests should be received. + +*/ +void target_cleanup_recovery(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + struct list_head clean_list; + + INIT_LIST_HEAD(&clean_list); + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering) { + spin_unlock(&obd->obd_dev_lock); + EXIT; + return; + } + obd->obd_recovering = obd->obd_abort_recovery = 0; + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&obd->obd_recovery_task_lock); + target_cancel_recovery_timer(obd); + list_splice_init(&obd->obd_req_replay_queue, &clean_list); + spin_unlock(&obd->obd_recovery_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == NULL); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + } + + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &clean_list); + list_splice_init(&obd->obd_final_req_queue, &clean_list); + spin_unlock(&obd->obd_recovery_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == NULL); + target_request_copy_put(req); + } + + EXIT; +} +EXPORT_SYMBOL(target_cleanup_recovery); + +/* obd_recovery_task_lock should be held */ +void target_cancel_recovery_timer(struct obd_device *obd) +{ + CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); + hrtimer_cancel(&obd->obd_recovery_timer); +} + +static void target_start_recovery_timer(struct obd_device *obd) +{ + ktime_t delay; + + if (obd->obd_recovery_start != 0) + return; + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery) { + spin_unlock(&obd->obd_dev_lock); + return; + } + + LASSERT(obd->obd_recovery_timeout != 0); + + if (obd->obd_recovery_start != 0) { + spin_unlock(&obd->obd_dev_lock); + return; + } + + obd->obd_recovery_start = 
ktime_get_seconds(); + delay = ktime_set(obd->obd_recovery_start + + obd->obd_recovery_timeout, 0); + hrtimer_start(&obd->obd_recovery_timer, delay, HRTIMER_MODE_ABS); + spin_unlock(&obd->obd_dev_lock); + + LCONSOLE_WARN("%s: Will be in recovery for at least %lu:%02lu, or until %d client%s reconnect%s\n", + obd->obd_name, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, + atomic_read(&obd->obd_max_recoverable_clients), + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "" : "s", + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "s" : ""); +} + +/** + * extend recovery window. + * + * if @extend is true, extend recovery window to have @dr_timeout remaining + * at least; otherwise, make sure the recovery timeout value is not less + * than @dr_timeout. + */ +static void extend_recovery_timer(struct obd_device *obd, time_t dr_timeout, + bool extend) +{ + ktime_t left_ns; + time_t timeout; + time_t left; + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery) { + spin_unlock(&obd->obd_dev_lock); + return; + } + LASSERT(obd->obd_recovery_start != 0); + + left_ns = hrtimer_expires_remaining(&obd->obd_recovery_timer); + left = ktime_divns(left_ns, NSEC_PER_SEC); + + if (extend) { + timeout = obd->obd_recovery_timeout; + /* dr_timeout will happen after the hrtimer has expired. + * Add the excess time to the soft recovery timeout without + * exceeding the hard recovery timeout. + */ + if (dr_timeout > left) { + timeout += dr_timeout - left; + timeout = min_t(time_t, obd->obd_recovery_time_hard, + timeout); + } + } else { + timeout = clamp_t(time_t, dr_timeout, obd->obd_recovery_timeout, + obd->obd_recovery_time_hard); + } + + if (timeout == obd->obd_recovery_time_hard) + CWARN("%s: extended recovery timer reached hard limit: %ld, extend: %d\n", + obd->obd_name, timeout, extend); + + if (obd->obd_recovery_timeout < timeout) { + ktime_t end, now; + + obd->obd_recovery_timeout = timeout; + end = ktime_set(obd->obd_recovery_start + timeout, 0); + now = ktime_set(ktime_get_seconds(), 0); + left_ns = ktime_sub(end, now); + hrtimer_start(&obd->obd_recovery_timer, end, HRTIMER_MODE_ABS); + left = ktime_divns(left_ns, NSEC_PER_SEC); + } + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_HA, "%s: recovery timer will expire in %ld seconds\n", + obd->obd_name, left); +} + +/* Reset the timer with each new client connection */ +/* + * This timer is actually reconnect_timer, which is for making sure + * the total recovery window is at least as big as my reconnect + * attempt timing. So the initial recovery time_out will be set to + * OBD_RECOVERY_FACTOR * obd_timeout. If the timeout coming + * from client is bigger than this, then the recovery time_out will + * be extended to make sure the client could be reconnected, in the + * process, the timeout from the new client should be ignored. + */ +static void +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, + int new_client) +{ + timeout_t service_timeout = lustre_msg_get_service_timeout(req->rq_reqmsg); + struct obd_device_target *obt = &obd->u.obt; + + if (!new_client && service_timeout) + /* + * Teach server about old server's estimates, as first guess + * at how long new requests will take. + */ + at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate, + service_timeout); + + target_start_recovery_timer(obd); + + /* + * Convert the service time to RPC timeout, + * and reuse service_timeout to limit stack usage. 
+ */ + service_timeout = at_est2timeout(service_timeout); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && + service_timeout < at_extra) + service_timeout = at_extra; + + /* + * We expect other clients to timeout within service_timeout, then try + * to reconnect, then try the failover server. The max delay between + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. + */ + service_timeout += 2 * INITIAL_CONNECT_TIMEOUT; + + LASSERT(obt->obt_magic == OBT_MAGIC); + service_timeout += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); + if (service_timeout > obd->obd_recovery_timeout && !new_client) + extend_recovery_timer(obd, service_timeout, false); +} + +/** Health checking routines */ +static inline int exp_connect_healthy(struct obd_export *exp) +{ + return (exp->exp_in_recovery); +} + +/** if export done req_replay or has replay in queue */ +static inline int exp_req_replay_healthy(struct obd_export *exp) +{ + return (!exp->exp_req_replay_needed || + atomic_read(&exp->exp_replay_count) > 0); +} + + +static inline int exp_req_replay_healthy_or_from_mdt(struct obd_export *exp) +{ + return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) || + exp_req_replay_healthy(exp); +} + +/** if export done lock_replay or has replay in queue */ +static inline int exp_lock_replay_healthy(struct obd_export *exp) +{ + return (!exp->exp_lock_replay_needed || + atomic_read(&exp->exp_replay_count) > 0); +} + +static inline int exp_vbr_healthy(struct obd_export *exp) +{ + return (!exp->exp_vbr_failed); +} + +static inline int exp_finished(struct obd_export *exp) +{ + return (exp->exp_in_recovery && !exp->exp_lock_replay_needed); +} + +static inline int exp_finished_or_from_mdt(struct obd_export *exp) +{ + return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) || + exp_finished(exp); +} + +static int check_for_next_transno(struct lu_target *lut) +{ + struct ptlrpc_request *req = NULL; + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + int wake_up = 0, connected, completed, queue_len; + __u64 req_transno = 0; + __u64 update_transno = 0; + __u64 next_transno = 0; + ENTRY; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_req_replay_queue)) { + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + req_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + if (tdtd != NULL) + update_transno = distribute_txn_get_next_transno(tdtd); + + connected = atomic_read(&obd->obd_connected_clients); + completed = connected - atomic_read(&obd->obd_req_replay_clients); + queue_len = obd->obd_requests_queued_for_recovery; + next_transno = obd->obd_next_recovery_transno; + + CDEBUG(D_HA, + "max: %d, connected: %d, completed: %d, queue_len: %d, req_transno: %llu, next_transno: %llu\n", + atomic_read(&obd->obd_max_recoverable_clients), + connected, completed, + queue_len, req_transno, next_transno); + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + wake_up = 1; + } else if (obd->obd_recovery_expired) { + CDEBUG(D_HA, "waking for expired recovery\n"); + wake_up = 1; + } else if (tdtd != NULL && req != NULL && + is_req_replayed_by_update(req)) { + LASSERTF(req_transno < next_transno, "req_transno %llu" + "next_transno%llu\n", req_transno, next_transno); + CDEBUG(D_HA, "waking for duplicate req (%llu)\n", + req_transno); + wake_up = 1; + } else if (req_transno == next_transno || + (update_transno != 0 && update_transno <= next_transno)) { + CDEBUG(D_HA, "waking for next (%lld)\n", 
next_transno); + wake_up = 1; + } else if (queue_len > 0 && + queue_len == atomic_read(&obd->obd_req_replay_clients)) { + /** handle gaps occured due to lost reply or VBR */ + LASSERTF(req_transno >= next_transno, + "req_transno: %llu, next_transno: %llu\n", + req_transno, next_transno); + CDEBUG(D_HA, + "%s: waking for gap in transno, VBR is %s (skip: " + "%lld, ql: %d, comp: %d, conn: %d, next: %lld" + ", next_update %lld last_committed: %lld)\n", + obd->obd_name, obd->obd_version_recov ? "ON" : "OFF", + next_transno, queue_len, completed, connected, + req_transno, update_transno, obd->obd_last_committed); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; + } else if (atomic_read(&obd->obd_req_replay_clients) == 0) { + CDEBUG(D_HA, "waking for completed recovery\n"); + wake_up = 1; + } else if (OBD_FAIL_CHECK(OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS)) { + CDEBUG(D_HA, "accepting transno gaps is explicitly allowed" + " by fail_lock, waking up (%lld)\n", next_transno); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; + } + spin_unlock(&obd->obd_recovery_task_lock); + return wake_up; +} + +static int check_for_next_lock(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + int wake_up = 0; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_lock_replay_queue)) { + CDEBUG(D_HA, "waking for next lock\n"); + wake_up = 1; + } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) { + CDEBUG(D_HA, "waking for completed lock replay\n"); + wake_up = 1; + } else if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + wake_up = 1; + } else if (obd->obd_recovery_expired) { + CDEBUG(D_HA, "waking for expired recovery\n"); + wake_up = 1; + } + spin_unlock(&obd->obd_recovery_task_lock); + + return wake_up; +} + +static int check_update_llog(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + return 1; + } + + if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) { + CDEBUG(D_HA, "waking for completion of reading update log\n"); + return 1; + } + + return 0; +} + +/** + * wait for recovery events, + * check its status with help of check_routine + * evict dead clients via health_check + */ +static int target_recovery_overseer(struct lu_target *lut, + int (*check_routine)(struct lu_target *), + int (*health_check)(struct obd_export *)) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd; + time64_t last = 0; + time64_t now; +repeat: + if (obd->obd_recovering && obd->obd_recovery_start == 0) { + now = ktime_get_seconds(); + if (now - last > 600) { + LCONSOLE_INFO("%s: in recovery but waiting for " + "the first client to connect\n", + obd->obd_name); + last = now; + } + } + if (obd->obd_recovery_start != 0 && ktime_get_seconds() >= + (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { + __u64 next_update_transno = 0; + + /* Only abort the recovery if there are no update recovery + * left in the queue */ + spin_lock(&obd->obd_recovery_task_lock); + if (lut->lut_tdtd != NULL) { + next_update_transno = + distribute_txn_get_next_transno(lut->lut_tdtd); + + tdtd = lut->lut_tdtd; + /* If next_update_transno == 0, it probably because + * updatelog retrieve threads did not get any records + * yet, let's wait those threads stopped */ + if (next_update_transno == 0) { + spin_unlock(&obd->obd_recovery_task_lock); + + while 
(wait_event_timeout( + tdtd->tdtd_recovery_threads_waitq, + check_update_llog(lut), + cfs_time_seconds(60)) == 0); + + spin_lock(&obd->obd_recovery_task_lock); + next_update_transno = + distribute_txn_get_next_transno(tdtd); + } + } + + if (next_update_transno != 0 && !obd->obd_abort_recovery) { + obd->obd_next_recovery_transno = next_update_transno; + spin_unlock(&obd->obd_recovery_task_lock); + /* Disconnect unfinished exports from clients, and + * keep the connection from the MDT to make sure the update + * recovery will still keep trying until someone + * manually aborts the recovery */ + class_disconnect_stale_exports(obd, + exp_finished_or_from_mdt); + /* Abort all replay and replay lock reqs from + * clients */ + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + CDEBUG(D_HA, "%s: there are still update replay (%#llx" + ") in the queue.\n", obd->obd_name, + next_update_transno); + } else { + obd->obd_abort_recovery = 1; + spin_unlock(&obd->obd_recovery_task_lock); + CWARN("%s recovery is aborted by hard timeout\n", + obd->obd_name); + } + } + + while (wait_event_timeout(obd->obd_next_transno_waitq, + check_routine(lut), + msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0) + /* wait indefinitely for event, but don't trigger watchdog */; + + if (obd->obd_abort_recovery) { + CWARN("recovery is aborted, evict exports in recovery\n"); + if (lut->lut_tdtd != NULL) { + struct l_wait_info lwi = { 0 }; + + tdtd = lut->lut_tdtd; + /* Wait until all of the update log recovery threads + * have finished */ + l_wait_event(tdtd->tdtd_recovery_threads_waitq, + atomic_read(&tdtd->tdtd_recovery_threads_count) == 0, + &lwi); + /* Then abort the update recovery list */ + dtrq_list_destroy(lut->lut_tdtd); + } + + /** evict exports which didn't finish recovery yet */ + class_disconnect_stale_exports(obd, exp_finished); + return 1; + } else if (obd->obd_recovery_expired) { + obd->obd_recovery_expired = 0; + + /** If some clients died while being recovered, evict them */ + LCONSOLE_WARN("%s: recovery is timed out, " + "evict stale exports\n", obd->obd_name); + /** evict exports with no replay in queue, they are stalled */ + class_disconnect_stale_exports(obd, health_check); + + /** continue with VBR */ + spin_lock(&obd->obd_dev_lock); + obd->obd_version_recov = 1; + spin_unlock(&obd->obd_dev_lock); + /** + * reset timer, recovery will proceed with versions now, + * timeout is set just to handle reconnection delays + */ + extend_recovery_timer(obd, RECONNECT_DELAY_MAX, true); + /** Wait for recovery events again, after evicting bad clients */ + goto repeat; + } + return 0; +} + +static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req = NULL; + + CDEBUG(D_HA, "Waiting for lock\n"); + if (target_recovery_overseer(lut, check_for_next_lock, + exp_lock_replay_healthy)) + abort_lock_replay_queue(obd); + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_lock_replay_queue)) { + req = list_entry(obd->obd_lock_replay_queue.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + spin_unlock(&obd->obd_recovery_task_lock); + } else { + spin_unlock(&obd->obd_recovery_task_lock); + LASSERT(list_empty(&obd->obd_lock_replay_queue)); + LASSERT(atomic_read(&obd->obd_lock_replay_clients) == 0); + /** evict exports failed VBR */ + class_disconnect_stale_exports(obd, exp_vbr_healthy); + } + return req; +} + +static struct ptlrpc_request *target_next_final_ping(struct obd_device *obd) +{ + struct ptlrpc_request 
*req = NULL; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_final_req_queue)) { + req = list_entry(obd->obd_final_req_queue.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + spin_unlock(&obd->obd_recovery_task_lock); + if (req->rq_export->exp_in_recovery) { + spin_lock(&req->rq_export->exp_lock); + req->rq_export->exp_in_recovery = 0; + spin_unlock(&req->rq_export->exp_lock); + } + } else { + spin_unlock(&obd->obd_recovery_task_lock); + } + return req; +} + +static void handle_recovery_req(struct ptlrpc_thread *thread, + struct ptlrpc_request *req, + svc_handler_t handler) +{ + ENTRY; + + /** + * export can be evicted during recovery, no need to handle replays for + * it after that, discard such request silently + */ + if (req->rq_export->exp_disconnected) + RETURN_EXIT; + + req->rq_session.lc_thread = thread; + req->rq_svc_thread = thread; + req->rq_svc_thread->t_env->le_ses = &req->rq_session; + + /* thread context */ + lu_context_enter(&thread->t_env->le_ctx); + (void)handler(req); + lu_context_exit(&thread->t_env->le_ctx); + + req->rq_svc_thread->t_env->le_ses = NULL; + + /* don't reset timer for final stage */ + if (!exp_finished(req->rq_export)) { + timeout_t timeout = obd_timeout; + + /** + * Add request @timeout to the recovery time so next request from + * this client may come in recovery time + */ + if (!AT_OFF) { + struct ptlrpc_service_part *svcpt; + timeout_t est_timeout; + + svcpt = req->rq_rqbd->rqbd_svcpt; + /* If the server sent early reply for this request, + * the client will recalculate the timeout according to + * current server estimate service time, so we will + * use the maxium timeout here for waiting the client + * sending the next req + */ + est_timeout = at_get(&svcpt->scp_at_estimate); + timeout = max_t(timeout_t, at_est2timeout(est_timeout), + lustre_msg_get_timeout(req->rq_reqmsg)); + /* + * Add 2 net_latency, one for balance rq_deadline + * (see ptl_send_rpc), one for resend the req to server, + * Note: client will pack net_latency in replay req + * (see ptlrpc_replay_req) + */ + timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg); + } + extend_recovery_timer(class_exp2obd(req->rq_export), timeout, + true); + } + EXIT; +} + +/** Checking routines for recovery */ +static int check_for_recovery_ready(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + unsigned int clnts = atomic_read(&obd->obd_connected_clients); + + CDEBUG(D_HA, + "connected %d stale %d max_recoverable_clients %d abort %d expired %d\n", + clnts, obd->obd_stale_clients, + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_abort_recovery, obd->obd_recovery_expired); + + if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) { + LASSERT(clnts <= + atomic_read(&obd->obd_max_recoverable_clients)); + if (clnts + obd->obd_stale_clients < + atomic_read(&obd->obd_max_recoverable_clients)) + return 0; + } + + if (lut->lut_tdtd != NULL) { + if (!lut->lut_tdtd->tdtd_replay_ready && + !obd->obd_abort_recovery) { + /* Let's extend recovery timer, in case the recovery + * timer expired, and some clients got evicted */ + extend_recovery_timer(obd, obd->obd_recovery_timeout, + true); + CDEBUG(D_HA, + "%s update recovery is not ready, extend recovery %lu\n", + obd->obd_name, obd->obd_recovery_timeout); + return 0; + } + } + + return 1; +} + +enum { + REQUEST_RECOVERY = 1, + UPDATE_RECOVERY = 2, +}; + +static __u64 get_next_replay_req_transno(struct obd_device *obd) +{ + __u64 transno = 0; + + if 
(!list_empty(&obd->obd_req_replay_queue)) { + struct ptlrpc_request *req; + + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + return transno; +} + +static __u64 get_next_transno(struct lu_target *lut, int *type) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + __u64 transno = 0; + __u64 update_transno; + ENTRY; + + transno = get_next_replay_req_transno(obd); + if (type != NULL) + *type = REQUEST_RECOVERY; + + if (tdtd == NULL) + RETURN(transno); + + update_transno = distribute_txn_get_next_transno(tdtd); + if (transno == 0 || (transno >= update_transno && + update_transno != 0)) { + transno = update_transno; + if (type != NULL) + *type = UPDATE_RECOVERY; + } + + RETURN(transno); +} + +/** + * drop duplicate replay request + * + * Because the operation has been replayed by update recovery, the request + * with the same transno will be dropped and also notify the client to send + * next replay request. + * + * \param[in] env execution environment + * \param[in] obd failover obd device + * \param[in] req request to be dropped + */ +static void drop_duplicate_replay_req(struct lu_env *env, + struct obd_device *obd, + struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "remove t%lld from %s because of duplicate" + " update records are found.\n", + lustre_msg_get_transno(req->rq_reqmsg), + libcfs_nid2str(req->rq_peer.nid)); + + /* Right now, only for MDS reint operation update replay and + * normal request replay can have the same transno */ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT) { + req_capsule_set(&req->rq_pill, &RQF_MDS_REINT); + req->rq_status = req_capsule_server_pack(&req->rq_pill); + if (likely(req->rq_export)) + target_committed_to_req(req); + lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); + target_send_reply(req, req->rq_status, 0); + } else { + DEBUG_REQ(D_ERROR, req, "wrong opc" "from %s\n", + libcfs_nid2str(req->rq_peer.nid)); + } + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + obd->obd_replayed_requests++; +} + +#define WATCHDOG_TIMEOUT (obd_timeout * 10) + +static void replay_request_or_update(struct lu_env *env, + struct lu_target *lut, + struct target_recovery_data *trd, + struct ptlrpc_thread *thread) +{ + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req = NULL; + int type; + __u64 transno; + ENTRY; + + CDEBUG(D_HA, "Waiting for transno %lld\n", + obd->obd_next_recovery_transno); + + /* Replay all of request and update by transno */ + do { + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val); + + /** It is needed to extend recovery window above + * recovery_time_soft. Extending is possible only in the + * end of recovery window (see more details in + * handle_recovery_req()). 
+ */ + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300); + + if (target_recovery_overseer(lut, check_for_next_transno, + exp_req_replay_healthy_or_from_mdt)) { + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + goto abort; + } + + spin_lock(&obd->obd_recovery_task_lock); + transno = get_next_transno(lut, &type); + if (type == REQUEST_RECOVERY && transno != 0) { + /* Drop replay request from client side, if the + * replay has been executed by update with the + * same transno */ + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + + list_del_init(&req->rq_list); + obd->obd_requests_queued_for_recovery--; + spin_unlock(&obd->obd_recovery_task_lock); + + /* Let's check if the request has been redone by + * update replay */ + if (is_req_replayed_by_update(req)) { + struct distribute_txn_replay_req *dtrq; + + dtrq = distribute_txn_lookup_finish_list(tdtd, + req->rq_xid); + LASSERT(dtrq != NULL); + spin_lock(&tdtd->tdtd_replay_list_lock); + list_del_init(&dtrq->dtrq_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + + drop_duplicate_replay_req(env, obd, req); + + continue; + } + + LASSERT(trd->trd_processing_task == current_pid()); + DEBUG_REQ(D_HA, req, "processing t%lld from %s", + lustre_msg_get_transno(req->rq_reqmsg), + libcfs_nid2str(req->rq_peer.nid)); + + thread->t_watchdog = lc_watchdog_add(WATCHDOG_TIMEOUT, + NULL, NULL); + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; + + /** + * bz18031: increase next_recovery_transno before + * target_request_copy_put() will drop exp_rpc reference + */ + spin_lock(&obd->obd_recovery_task_lock); + obd->obd_next_recovery_transno++; + spin_unlock(&obd->obd_recovery_task_lock); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + obd->obd_replayed_requests++; + } else if (type == UPDATE_RECOVERY && transno != 0) { + struct distribute_txn_replay_req *dtrq; + int rc; + + spin_unlock(&obd->obd_recovery_task_lock); + + LASSERT(tdtd != NULL); + dtrq = distribute_txn_get_next_req(tdtd); + lu_context_enter(&thread->t_env->le_ctx); + thread->t_watchdog = lc_watchdog_add(WATCHDOG_TIMEOUT, + NULL, NULL); + rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq); + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; + lu_context_exit(&thread->t_env->le_ctx); + extend_recovery_timer(obd, obd_timeout, true); + + if (rc == 0 && dtrq->dtrq_xid != 0) { + CDEBUG(D_HA, "Move x%llu t%llu" + " to finish list\n", dtrq->dtrq_xid, + dtrq->dtrq_master_transno); + + /* Add it to the replay finish list */ + spin_lock(&tdtd->tdtd_replay_list_lock); + list_add(&dtrq->dtrq_list, + &tdtd->tdtd_replay_finish_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + + spin_lock(&obd->obd_recovery_task_lock); + if (transno == obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno++; + else if (transno > + obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno = + transno + 1; + spin_unlock(&obd->obd_recovery_task_lock); + } else { + dtrq_destroy(dtrq); + } + } else { + spin_unlock(&obd->obd_recovery_task_lock); +abort: + LASSERT(list_empty(&obd->obd_req_replay_queue)); + LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0); + /** evict exports failed VBR */ + class_disconnect_stale_exports(obd, exp_vbr_healthy); + break; + } + } while (1); +} + +static int target_recovery_thread(void *arg) +{ + struct lu_target *lut = arg; + struct obd_device *obd = lut->lut_obd; + struct 
ptlrpc_request *req; + struct target_recovery_data *trd = &obd->obd_recovery_data; + unsigned long delta; + struct lu_env *env; + struct ptlrpc_thread *thread = NULL; + int rc = 0; + ENTRY; + + unshare_fs_struct(); + OBD_ALLOC_PTR(thread); + if (thread == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_thread, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); + + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc) + GOTO(out_env_remove, rc); + + thread->t_env = env; + thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ + env->le_ctx.lc_thread = thread; + tgt_io_thread_init(thread); /* init thread_big_cache for IO requests */ + thread->t_watchdog = NULL; + + CDEBUG(D_HA, "%s: started recovery thread pid %d\n", obd->obd_name, + current_pid()); + trd->trd_processing_task = current_pid(); + + spin_lock(&obd->obd_dev_lock); + obd->obd_recovering = 1; + spin_unlock(&obd->obd_dev_lock); + complete(&trd->trd_starting); + + /* first of all, we have to know the first transno to replay */ + if (target_recovery_overseer(lut, check_for_recovery_ready, + exp_connect_healthy)) { + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + if (lut->lut_tdtd != NULL) + dtrq_list_destroy(lut->lut_tdtd); + } + + /* next stage: replay requests or update */ + delta = jiffies; + CDEBUG(D_INFO, "1: request replay stage - %d clients from t%llu\n", + atomic_read(&obd->obd_req_replay_clients), + obd->obd_next_recovery_transno); + replay_request_or_update(env, lut, trd, thread); + + /** + * The second stage: replay locks + */ + CDEBUG(D_INFO, "2: lock replay stage - %d clients\n", + atomic_read(&obd->obd_lock_replay_clients)); + while ((req = target_next_replay_lock(lut))) { + LASSERT(trd->trd_processing_task == current_pid()); + DEBUG_REQ(D_HA, req, "processing lock from %s: ", + libcfs_nid2str(req->rq_peer.nid)); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) { + req->rq_status = -ENODEV; + target_request_copy_put(req); + continue; + } + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + target_request_copy_put(req); + obd->obd_replayed_locks++; + } + + /** + * The third stage: reply on final pings, at this moment all clients + * must have request in final queue + */ + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_RECONNECT, cfs_fail_val); + CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n"); + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + /* We drop recoverying flag to forward all new requests + * to regular mds_handle() since now */ + spin_lock(&obd->obd_dev_lock); + obd->obd_recovering = obd->obd_abort_recovery = 0; + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd->obd_recovery_task_lock); + target_cancel_recovery_timer(obd); + spin_unlock(&obd->obd_recovery_task_lock); + while ((req = target_next_final_ping(obd))) { + LASSERT(trd->trd_processing_task == current_pid()); + DEBUG_REQ(D_HA, req, "processing final ping from %s: ", + libcfs_nid2str(req->rq_peer.nid)); + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + /* Because the waiting client can not send ping to server, + * so we need refresh the last_request_time, to avoid the + * export is being evicted */ + ptlrpc_update_export_timer(req->rq_export, 0); + target_request_copy_put(req); + } + + delta = jiffies_to_msecs(jiffies - delta) / MSEC_PER_SEC; + CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n", + delta, obd->obd_replayed_requests, obd->obd_replayed_locks); + if 
(delta > OBD_RECOVERY_TIME_SOFT) { + CWARN("too long recovery - read logs\n"); + libcfs_debug_dumplog(); + } + + target_finish_recovery(lut); + + lu_context_fini(&env->le_ctx); + trd->trd_processing_task = 0; + complete(&trd->trd_finishing); + + tgt_io_thread_done(thread); +out_env_remove: + lu_env_remove(env); +out_env: + OBD_FREE_PTR(env); +out_thread: + OBD_FREE_PTR(thread); + RETURN(rc); +} + +static int target_start_recovery_thread(struct lu_target *lut, + svc_handler_t handler) +{ + struct obd_device *obd = lut->lut_obd; + int rc = 0; + struct target_recovery_data *trd = &obd->obd_recovery_data; + int index; + + memset(trd, 0, sizeof(*trd)); + init_completion(&trd->trd_starting); + init_completion(&trd->trd_finishing); + trd->trd_recovery_handler = handler; + + rc = server_name2index(obd->obd_name, &index, NULL); + if (rc < 0) + return rc; + + if (!IS_ERR(kthread_run(target_recovery_thread, + lut, "tgt_recover_%d", index))) { + wait_for_completion(&trd->trd_starting); + LASSERT(obd->obd_recovering != 0); + } else { + rc = -ECHILD; + } + + return rc; +} + +void target_stop_recovery_thread(struct obd_device *obd) +{ + if (obd->obd_recovery_data.trd_processing_task > 0) { + struct target_recovery_data *trd = &obd->obd_recovery_data; + /** recovery can be done but postrecovery is not yet */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_recovering) { + CERROR("%s: Aborting recovery\n", obd->obd_name); + obd->obd_abort_recovery = 1; + wake_up(&obd->obd_next_transno_waitq); + } + spin_unlock(&obd->obd_dev_lock); + wait_for_completion(&trd->trd_finishing); + } +} +EXPORT_SYMBOL(target_stop_recovery_thread); + +void target_recovery_fini(struct obd_device *obd) +{ + class_disconnect_exports(obd); + target_stop_recovery_thread(obd); + target_cleanup_recovery(obd); +} +EXPORT_SYMBOL(target_recovery_fini); + +static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer) +{ + struct obd_device *obd = container_of(timer, struct obd_device, + obd_recovery_timer); + + CDEBUG(D_HA, + "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n", + obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), + ktime_get_real_seconds() - obd->obd_recovery_start, + atomic_read(&obd->obd_connected_clients)); + + obd->obd_recovery_expired = 1; + wake_up(&obd->obd_next_transno_waitq); + return HRTIMER_NORESTART; +} + +void target_recovery_init(struct lu_target *lut, svc_handler_t handler) +{ + struct obd_device *obd = lut->lut_obd; + + if (lut->lut_bottom->dd_rdonly) + return; + + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + return; + } + + CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " + "last_transno %llu\n", obd->obd_name, + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_last_committed); + LASSERT(obd->obd_stopping == 0); + obd->obd_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; + + hrtimer_init(&obd->obd_recovery_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + obd->obd_recovery_timer.function = &target_recovery_expired; + target_start_recovery_thread(lut, handler); +} +EXPORT_SYMBOL(target_recovery_init); + +static int target_process_req_flags(struct obd_device *obd, + struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + LASSERT(exp != NULL); + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) { + /* client declares he's ready to replay 
locks */ + spin_lock(&exp->exp_lock); + if (exp->exp_req_replay_needed) { + exp->exp_req_replay_needed = 0; + spin_unlock(&exp->exp_lock); + + LASSERT_ATOMIC_POS(&obd->obd_req_replay_clients); + atomic_dec(&obd->obd_req_replay_clients); + } else { + spin_unlock(&exp->exp_lock); + } + } + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + /* client declares he's ready to complete recovery + * so, we put the request on th final queue */ + spin_lock(&exp->exp_lock); + if (exp->exp_lock_replay_needed) { + exp->exp_lock_replay_needed = 0; + spin_unlock(&exp->exp_lock); + + LASSERT_ATOMIC_POS(&obd->obd_lock_replay_clients); + atomic_dec(&obd->obd_lock_replay_clients); + } else { + spin_unlock(&exp->exp_lock); + } + } + return 0; +} + +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct ptlrpc_request *reqiter; + int inserted = 0; + ENTRY; + + if (obd->obd_recovery_data.trd_processing_task == current_pid()) { + /* Processing the queue right now, don't re-add. */ + RETURN(1); + } + + target_process_req_flags(obd, req); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 1) { + cfs_race_state = 1; + cfs_fail_val = 0; + wake_up(&cfs_race_waitq); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + } + + /* client declares he's ready to complete recovery + * so, we put the request on th final queue */ + target_request_copy_get(req); + DEBUG_REQ(D_HA, req, "queue final req"); + wake_up(&obd->obd_next_transno_waitq); + spin_lock(&obd->obd_recovery_task_lock); + if (obd->obd_recovering) { + struct ptlrpc_request *tmp; + struct ptlrpc_request *duplicate = NULL; + + if (likely(!req->rq_export->exp_replay_done)) { + req->rq_export->exp_replay_done = 1; + list_add_tail(&req->rq_list, + &obd->obd_final_req_queue); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(0); + } + + /* XXX O(n), but only happens if final ping is + * timed out, probably reorganize the list as + * a hash list later */ + list_for_each_entry_safe(reqiter, tmp, + &obd->obd_final_req_queue, + rq_list) { + if (reqiter->rq_export == req->rq_export) { + list_del_init(&reqiter->rq_list); + duplicate = reqiter; + break; + } + } + + list_add_tail(&req->rq_list, + &obd->obd_final_req_queue); + req->rq_export->exp_replay_done = 1; + spin_unlock(&obd->obd_recovery_task_lock); + + if (duplicate != NULL) { + DEBUG_REQ(D_HA, duplicate, + "put prev final req\n"); + target_request_copy_put(duplicate); + } + RETURN(0); + } else { + spin_unlock(&obd->obd_recovery_task_lock); + target_request_copy_put(req); + RETURN(obd->obd_stopping ? 
-ENOTCONN : 1); + } + } + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) { + /* client declares he's ready to replay locks */ + target_request_copy_get(req); + DEBUG_REQ(D_HA, req, "queue lock replay req"); + wake_up(&obd->obd_next_transno_waitq); + spin_lock(&obd->obd_recovery_task_lock); + LASSERT(obd->obd_recovering); + /* usually due to recovery abort */ + if (!req->rq_export->exp_in_recovery) { + spin_unlock(&obd->obd_recovery_task_lock); + target_request_copy_put(req); + RETURN(-ENOTCONN); + } + LASSERT(req->rq_export->exp_lock_replay_needed); + list_add_tail(&req->rq_list, &obd->obd_lock_replay_queue); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(0); + } + + /* CAVEAT EMPTOR: The incoming request message has been swabbed + * (i.e. buflens etc are in my own byte order), but type-dependent + * buffers (eg mdt_body, ost_body etc) have NOT been swabbed. */ + + if (!transno) { + INIT_LIST_HEAD(&req->rq_list); + DEBUG_REQ(D_HA, req, "not queueing"); + RETURN(1); + } + + /* If we're processing the queue, we don't want to queue this + * message. + * + * Also, if this request has a transno less than the one we're waiting + * for, we should process it now. It could (and currently always will) + * be an open request for a descriptor that was opened some time ago. + * + * Also, a resent, replayed request that has already been + * handled will pass through here and be processed immediately. + */ + CDEBUG(D_HA, "Next recovery transno: %llu" + ", current: %llu, replaying\n", + obd->obd_next_recovery_transno, transno); + + /* If the request has been replayed by update replay, then send this + * request to the recovery thread (replay_request_or_update()), where + * it will be handled */ + spin_lock(&obd->obd_recovery_task_lock); + if (transno < obd->obd_next_recovery_transno && + !is_req_replayed_by_update(req)) { + /* Processing the queue right now, don't re-add. 
*/ + LASSERT(list_empty(&req->rq_list)); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(1); + } + spin_unlock(&obd->obd_recovery_task_lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP)) + RETURN(0); + + target_request_copy_get(req); + if (!req->rq_export->exp_in_recovery) { + target_request_copy_put(req); + RETURN(-ENOTCONN); + } + LASSERT(req->rq_export->exp_req_replay_needed); + + if (target_exp_enqueue_req_replay(req)) { + DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + target_request_copy_put(req); + RETURN(0); + } + + /* XXX O(n^2) */ + spin_lock(&obd->obd_recovery_task_lock); + LASSERT(obd->obd_recovering); + list_for_each_entry(reqiter, &obd->obd_req_replay_queue, rq_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) > transno) { + list_add_tail(&req->rq_list, &reqiter->rq_list); + inserted = 1; + goto added; + } + + if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) == + transno)) { + DEBUG_REQ(D_ERROR, req, "dropping replay: transno " + "has been claimed by another client"); + spin_unlock(&obd->obd_recovery_task_lock); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + RETURN(0); + } + } +added: + if (!inserted) + list_add_tail(&req->rq_list, &obd->obd_req_replay_queue); + + obd->obd_requests_queued_for_recovery++; + spin_unlock(&obd->obd_recovery_task_lock); + wake_up(&obd->obd_next_transno_waitq); + RETURN(0); +} + +void target_committed_to_req(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + + if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL) + lustre_msg_set_last_committed(req->rq_repmsg, + exp->exp_last_committed); + else + DEBUG_REQ(D_IOCTL, req, "not sending last_committed update (%d/" + "%d)", exp->exp_obd->obd_no_transno, + req->rq_repmsg == NULL); + + CDEBUG(D_INFO, "last_committed %llu, transno %llu, xid %llu\n", + exp->exp_last_committed, req->rq_transno, req->rq_xid); +} + +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Packs current SLV and Limit into \a req. + */ +int target_pack_pool_reply(struct ptlrpc_request *req) +{ + struct obd_device *obd; + ENTRY; + + /* Check that we still have all structures alive as this may + * be some late RPC at shutdown time. */ + if (unlikely(!req->rq_export || !req->rq_export->exp_obd || + !exp_connect_lru_resize(req->rq_export))) { + lustre_msg_set_slv(req->rq_repmsg, 0); + lustre_msg_set_limit(req->rq_repmsg, 0); + RETURN(0); + } + + /* OBD is alive here as export is alive, which we checked above. 
*/ + obd = req->rq_export->exp_obd; + + read_lock(&obd->obd_pool_lock); + lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); + lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); + + RETURN(0); +} + +static int target_send_reply_msg(struct ptlrpc_request *req, + int rc, int fail_id) +{ + if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { + DEBUG_REQ(D_ERROR, req, "dropping reply"); + return -ECOMM; + } + /* We can have a null rq_reqmsg in the event of bad signature or + * no context when unwrapping */ + if (req->rq_reqmsg && + unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT && + OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET_REP))) + return -ECOMM; + + if (unlikely(rc)) { + DEBUG_REQ(D_NET, req, "processing error (%d)", rc); + req->rq_status = rc; + return ptlrpc_send_error(req, 1); + } else { + DEBUG_REQ(D_NET, req, "sending reply"); + } + + return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT); +} + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + struct ptlrpc_service_part *svcpt; + int netrc; + struct ptlrpc_reply_state *rs; + struct obd_export *exp; + ENTRY; + + if (req->rq_no_reply) { + EXIT; + return; + } + + svcpt = req->rq_rqbd->rqbd_svcpt; + rs = req->rq_reply_state; + if (rs == NULL || !rs->rs_difficult) { + /* no notifiers */ + target_send_reply_msg (req, rc, fail_id); + EXIT; + return; + } + + /* must be an export if locks saved */ + LASSERT(req->rq_export != NULL); + /* req/reply consistent */ + LASSERT(rs->rs_svcpt == svcpt); + + /* "fresh" reply */ + LASSERT(!rs->rs_scheduled); + LASSERT(!rs->rs_scheduled_ever); + LASSERT(!rs->rs_handled); + LASSERT(!rs->rs_on_net); + LASSERT(rs->rs_export == NULL); + LASSERT(list_empty(&rs->rs_obd_list)); + LASSERT(list_empty(&rs->rs_exp_list)); + + exp = class_export_get(req->rq_export); + + /* disable reply scheduling while I'm setting up */ + rs->rs_scheduled = 1; + rs->rs_on_net = 1; + rs->rs_xid = req->rq_xid; + rs->rs_transno = req->rq_transno; + rs->rs_export = exp; + rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&exp->exp_uncommitted_replies_lock); + CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n", + rs->rs_transno, exp->exp_last_committed); + if (rs->rs_transno > exp->exp_last_committed) { + /* not committed already */ + list_add_tail(&rs->rs_obd_list, + &exp->exp_uncommitted_replies); + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + + spin_lock(&exp->exp_lock); + list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies); + spin_unlock(&exp->exp_lock); + + netrc = target_send_reply_msg(req, rc, fail_id); + + spin_lock(&svcpt->scp_rep_lock); + + atomic_inc(&svcpt->scp_nreps_difficult); + + if (netrc != 0) { + /* error sending: reply is off the net. 
Also we need +1 + * reply ref until ptlrpc_handle_rs() is done + * with the reply state (if the send was successful, there + * would have been +1 ref for the net, which + * reply_out_callback leaves alone) */ + rs->rs_on_net = 0; + ptlrpc_rs_addref(rs); + } + + spin_lock(&rs->rs_lock); + if (rs->rs_transno <= exp->exp_last_committed || + (!rs->rs_on_net && !rs->rs_no_ack) || + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + CDEBUG(D_HA, "Schedule reply immediately\n"); + ptlrpc_dispatch_difficult_reply(rs); + } else { + list_add(&rs->rs_list, &svcpt->scp_rep_active); + rs->rs_scheduled = 0; /* allow notifier to schedule */ + } + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + EXIT; +} + +enum ldlm_mode lck_compat_array[] = { + [LCK_EX] = LCK_COMPAT_EX, + [LCK_PW] = LCK_COMPAT_PW, + [LCK_PR] = LCK_COMPAT_PR, + [LCK_CW] = LCK_COMPAT_CW, + [LCK_CR] = LCK_COMPAT_CR, + [LCK_NL] = LCK_COMPAT_NL, + [LCK_GROUP] = LCK_COMPAT_GROUP, + [LCK_COS] = LCK_COMPAT_COS, +}; + +/** + * Rather arbitrary mapping from LDLM error codes to errno values. This should + * not escape to the user level. + */ +int ldlm_error2errno(enum ldlm_error error) +{ + int result; + + switch (error) { + case ELDLM_OK: + case ELDLM_LOCK_MATCHED: + result = 0; + break; + case ELDLM_LOCK_CHANGED: + result = -ESTALE; + break; + case ELDLM_LOCK_ABORTED: + result = -ENAVAIL; + break; + case ELDLM_LOCK_REPLACED: + result = -ESRCH; + break; + case ELDLM_NO_LOCK_DATA: + result = -ENOENT; + break; + case ELDLM_NAMESPACE_EXISTS: + result = -EEXIST; + break; + case ELDLM_BAD_NAMESPACE: + result = -EBADF; + break; + default: + if (((int)error) < 0) { /* cast to signed type */ + result = error; /* as ldlm_error can be unsigned */ + } else { + CERROR("Invalid DLM result code: %d\n", error); + result = -EPROTO; + } + } + return result; +} +EXPORT_SYMBOL(ldlm_error2errno); + +/** + * Dual to ldlm_error2errno(): maps errno values back to enum ldlm_error. + */ +enum ldlm_error ldlm_errno2error(int err_no) +{ + int error; + + switch (err_no) { + case 0: + error = ELDLM_OK; + break; + case -ESTALE: + error = ELDLM_LOCK_CHANGED; + break; + case -ENAVAIL: + error = ELDLM_LOCK_ABORTED; + break; + case -ESRCH: + error = ELDLM_LOCK_REPLACED; + break; + case -ENOENT: + error = ELDLM_NO_LOCK_DATA; + break; + case -EEXIST: + error = ELDLM_NAMESPACE_EXISTS; + break; + case -EBADF: + error = ELDLM_BAD_NAMESPACE; + break; + default: + error = err_no; + } + return error; +} + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp) +{ + spin_lock(&exp->exp_locks_list_guard); + if (!list_empty(&exp->exp_locks_list)) { + struct ldlm_lock *lock; + + CERROR("dumping locks for export %p," + "ignore if the unmount doesn't hang\n", exp); + list_for_each_entry(lock, &exp->exp_locks_list, + l_exp_refs_link) + LDLM_ERROR(lock, "lock:"); + } + spin_unlock(&exp->exp_locks_list_guard); +} +#endif + +#ifdef HAVE_SERVER_SUPPORT +static int target_bulk_timeout(void *data) +{ + ENTRY; + /* We don't fail the connection here, because having the export + * killed makes the (vital) call to commitrw very sad. 
+ */ + RETURN(1); +} + +static inline const char *bulk2type(struct ptlrpc_request *req) +{ + if (req->rq_bulk_read) + return "READ"; + if (req->rq_bulk_write) + return "WRITE"; + return "UNKNOWN"; +} + +int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, + struct l_wait_info *lwi) +{ + struct ptlrpc_request *req = desc->bd_req; + time64_t start = ktime_get_seconds(); + time64_t deadline; + int rc = 0; + + ENTRY; + + /* If there is eviction in progress, wait for it to finish. */ + if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) { + *lwi = LWI_INTR(NULL, NULL); + rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq, + !atomic_read(&exp->exp_obd-> + obd_evict_inprogress), + lwi); + } + + /* Check if client was evicted or reconnected already. */ + if (exp->exp_failed || + exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + rc = -ENOTCONN; + } else { + if (req->rq_bulk_read) + rc = sptlrpc_svc_wrap_bulk(req, desc); + + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) + req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + else /* old version, bulk matchbits is rq_xid */ + req->rq_mbits = req->rq_xid; + + if (rc == 0) + rc = ptlrpc_start_bulk_transfer(desc); + } + + if (rc < 0) { + DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc %d", + bulk2type(req), rc); + RETURN(rc); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) { + ptlrpc_abort_bulk(desc); + RETURN(0); + } + + /* limit actual bulk transfer to bulk_timeout seconds */ + deadline = start + bulk_timeout; + if (deadline > req->rq_deadline) + deadline = req->rq_deadline; + + do { + time64_t timeoutl = deadline - ktime_get_seconds(); + long timeout_jiffies = timeoutl <= 0 ? + 1 : cfs_time_seconds(timeoutl); + time64_t rq_deadline; + + *lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, + cfs_time_seconds(1), + target_bulk_timeout, desc); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_server_bulk_active(desc) || + exp->exp_failed || + exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg), + lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + /* Wait again if we changed rq_deadline. */ + rq_deadline = READ_ONCE(req->rq_deadline); + deadline = start + bulk_timeout; + if (deadline > rq_deadline) + deadline = rq_deadline; + } while (rc == -ETIMEDOUT && + deadline > ktime_get_seconds()); + + if (rc == -ETIMEDOUT) { + DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds", + bulk2type(req), deadline - start, + ktime_get_real_seconds() - deadline); + ptlrpc_abort_bulk(desc); + } else if (exp->exp_failed) { + DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s", + bulk2type(req)); + rc = -ENOTCONN; + ptlrpc_abort_bulk(desc); + } else if (exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s", + bulk2type(req)); + /* We don't reply anyway. */ + rc = -ETIMEDOUT; + ptlrpc_abort_bulk(desc); + } else if (desc->bd_failure) { + DEBUG_REQ(D_ERROR, req, "network error on bulk %s", + bulk2type(req)); + /* XXX should this be a different errno? */ + rc = -ETIMEDOUT; + } else { + if (req->rq_bulk_write) + rc = sptlrpc_svc_unwrap_bulk(req, desc); + if (rc == 0 && desc->bd_nob_transferred != desc->bd_nob) { + DEBUG_REQ(D_ERROR, req, "truncated bulk %s %d(%d)", + bulk2type(req), desc->bd_nob_transferred, + desc->bd_nob); + /* XXX should this be a different errno? 
*/ + rc = -ETIMEDOUT; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(target_bulk_io); + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c new file mode 100644 index 0000000000000..42eccaf9cf861 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c @@ -0,0 +1,2867 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_lock.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include + +#include +#include + +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_glimpse_work_kmem; +EXPORT_SYMBOL(ldlm_glimpse_work_kmem); + +/* lock types */ +char *ldlm_lockname[] = { + [0] = "--", + [LCK_EX] = "EX", + [LCK_PW] = "PW", + [LCK_PR] = "PR", + [LCK_CW] = "CW", + [LCK_CR] = "CR", + [LCK_NL] = "NL", + [LCK_GROUP] = "GROUP", + [LCK_COS] = "COS" +}; +EXPORT_SYMBOL(ldlm_lockname); + +char *ldlm_typename[] = { + [LDLM_PLAIN] = "PLN", + [LDLM_EXTENT] = "EXT", + [LDLM_FLOCK] = "FLK", + [LDLM_IBITS] = "IBT", +}; + +static ldlm_policy_wire_to_local_t ldlm_policy_wire_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_local_to_wire, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_local_to_wire, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_local_to_wire, +}; + +/** + * Converts lock policy from local format to on the wire lock_desc format + */ +void ldlm_convert_policy_to_wire(enum ldlm_type type, + const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + ldlm_policy_local_to_wire_t convert; + + convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE]; + + convert(lpolicy, wpolicy); +} + +/** + * Converts lock policy from on the wire lock_desc format to local format + */ +void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type, + const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + ldlm_policy_wire_to_local_t convert; + + convert = ldlm_policy_wire_to_local[type - LDLM_MIN_TYPE]; + + 
convert(wpolicy, lpolicy); +} + +const char *ldlm_it2str(enum ldlm_intent_flags it) +{ + switch (it) { + case IT_OPEN: + return "open"; + case IT_CREAT: + return "creat"; + case (IT_OPEN | IT_CREAT): + return "open|creat"; + case IT_READDIR: + return "readdir"; + case IT_GETATTR: + return "getattr"; + case IT_LOOKUP: + return "lookup"; + case IT_GETXATTR: + return "getxattr"; + case IT_LAYOUT: + return "layout"; + default: + CERROR("Unknown intent 0x%08x\n", it); + return "UNKNOWN"; + } +} +EXPORT_SYMBOL(ldlm_it2str); + +extern struct kmem_cache *ldlm_lock_slab; + +#ifdef HAVE_SERVER_SUPPORT +static ldlm_processing_policy ldlm_processing_policy_table[] = { + [LDLM_PLAIN] = ldlm_process_plain_lock, + [LDLM_EXTENT] = ldlm_process_extent_lock, + [LDLM_FLOCK] = ldlm_process_flock_lock, + [LDLM_IBITS] = ldlm_process_inodebits_lock, +}; + +ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res) +{ + return ldlm_processing_policy_table[res->lr_type]; +} +EXPORT_SYMBOL(ldlm_get_processing_policy); + +static ldlm_reprocessing_policy ldlm_reprocessing_policy_table[] = { + [LDLM_PLAIN] = ldlm_reprocess_queue, + [LDLM_EXTENT] = ldlm_reprocess_queue, + [LDLM_FLOCK] = ldlm_reprocess_queue, + [LDLM_IBITS] = ldlm_reprocess_inodebits_queue, +}; + +ldlm_reprocessing_policy ldlm_get_reprocessing_policy(struct ldlm_resource *res) +{ + return ldlm_reprocessing_policy_table[res->lr_type]; +} + +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) +{ + ns->ns_policy = arg; +} +EXPORT_SYMBOL(ldlm_register_intent); + +/* + * REFCOUNTED LOCK OBJECTS + */ + + +/** + * Get a reference on a lock. + * + * Lock refcounts, during creation: + * - one special one for allocation, dec'd only once in destroy + * - one for being a lock that's in-use + * - one for the addref associated with a new lock + */ +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock) +{ + atomic_inc(&lock->l_refc); + return lock; +} +EXPORT_SYMBOL(ldlm_lock_get); + +/** + * Release lock reference. + * + * Also frees the lock if it was last reference. + */ +void ldlm_lock_put(struct ldlm_lock *lock) +{ + ENTRY; + + LASSERT(lock->l_resource != LP_POISON); + LASSERT(atomic_read(&lock->l_refc) > 0); + if (atomic_dec_and_test(&lock->l_refc)) { + struct ldlm_resource *res; + + LDLM_DEBUG(lock, + "final lock_put on destroyed lock, freeing it."); + + res = lock->l_resource; + LASSERT(ldlm_is_destroyed(lock)); + LASSERT(list_empty(&lock->l_exp_list)); + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_pending_chain)); + + lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, + LDLM_NSS_LOCKS); + lu_ref_del(&res->lr_reference, "lock", lock); + if (lock->l_export) { + class_export_lock_put(lock->l_export, lock); + lock->l_export = NULL; + } + + if (lock->l_lvb_data != NULL) + OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); + + if (res->lr_type == LDLM_EXTENT) { + ldlm_interval_free(ldlm_interval_detach(lock)); + } else if (res->lr_type == LDLM_IBITS) { + if (lock->l_ibits_node != NULL) + OBD_SLAB_FREE_PTR(lock->l_ibits_node, + ldlm_inodebits_slab); + } + ldlm_resource_putref(res); + lock->l_resource = NULL; + lu_ref_fini(&lock->l_reference); + OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); + } + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_put); + +/** + * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked. 
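+ *
+ * \retval 1 the lock was on the LRU list and has been removed;
+ * \retval 0 the lock was not on the LRU list.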
+ */ +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) +{ + int rc = 0; + if (!list_empty(&lock->l_lru)) { + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + if (ns->ns_last_pos == &lock->l_lru) + ns->ns_last_pos = lock->l_lru.prev; + list_del_init(&lock->l_lru); + LASSERT(ns->ns_nr_unused > 0); + ns->ns_nr_unused--; + rc = 1; + } + return rc; +} + +/** + * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first. + * + * If \a last_use is non-zero, it will remove the lock from LRU only if + * it matches lock's l_last_used. + * + * \retval 0 if \a last_use is set, the lock is not in LRU list or \a last_use + * doesn't match lock's l_last_used; + * otherwise, the lock hasn't been in the LRU list. + * \retval 1 the lock was in LRU list and removed. + */ +int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc = 0; + + ENTRY; + if (ldlm_is_ns_srv(lock)) { + LASSERT(list_empty(&lock->l_lru)); + RETURN(0); + } + + spin_lock(&ns->ns_lock); + if (!ktime_compare(last_use, ktime_set(0, 0)) || + !ktime_compare(last_use, lock->l_last_used)) + rc = ldlm_lock_remove_from_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + + RETURN(rc); +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked. + */ +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + lock->l_last_used = ktime_get(); + LASSERT(list_empty(&lock->l_lru)); + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_add_tail(&lock->l_lru, &ns->ns_unused_list); + LASSERT(ns->ns_nr_unused >= 0); + ns->ns_nr_unused++; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks + * first. + */ +void ldlm_lock_add_to_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + spin_lock(&ns->ns_lock); + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Moves LDLM lock \a lock that is already in namespace LRU to the tail of + * the LRU. Performs necessary LRU locking + */ +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + if (ldlm_is_ns_srv(lock)) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru)) { + ldlm_lock_remove_from_lru_nolock(lock); + ldlm_lock_add_to_lru_nolock(lock); + } + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Helper to destroy a locked lock. + * + * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock + * Must be called with l_lock and lr_lock held. + * + * Does not actually free the lock data, but rather marks the lock as + * destroyed by setting l_destroyed field in the lock to 1. Destroys a + * handle->lock association too, so that the lock can no longer be found + * and removes the lock from LRU list. Actual lock freeing occurs when + * last lock reference goes away. + * + * Original comment (of some historical value): + * This used to have a 'strict' flag, which recovery would use to mark an + * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I + * shall explain why it's gone: with the new hash table scheme, once you call + * ldlm_lock_destroy, you can never drop your final references on this lock. + * Because it's not in the hash table anymore. 
-phil + */ +static int ldlm_lock_destroy_internal(struct ldlm_lock *lock) +{ + ENTRY; + + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (!list_empty(&lock->l_res_link)) { + LDLM_ERROR(lock, "lock still on resource"); + LBUG(); + } + + if (ldlm_is_destroyed(lock)) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return 0; + } + ldlm_set_destroyed(lock); + + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. */ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + ldlm_lock_remove_from_lru(lock); + class_handle_unhash(&lock->l_handle); + + EXIT; + return 1; +} + +/** + * Destroys a LDLM lock \a lock. Performs necessary locking first. + */ +void ldlm_lock_destroy(struct ldlm_lock *lock) +{ + int first; + ENTRY; + lock_res_and_lock(lock); + first = ldlm_lock_destroy_internal(lock); + unlock_res_and_lock(lock); + + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/** + * Destroys a LDLM lock \a lock that is already locked. + */ +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock) +{ + int first; + ENTRY; + first = ldlm_lock_destroy_internal(lock); + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/* this is called by portals_handle2object with the handle lock taken */ +static void lock_handle_addref(void *lock) +{ + LDLM_LOCK_GET((struct ldlm_lock *)lock); +} + +static void lock_handle_free(void *lock, int size) +{ + LASSERT(size == sizeof(struct ldlm_lock)); + OBD_SLAB_FREE(lock, ldlm_lock_slab, size); +} + +static struct portals_handle_ops lock_handle_ops = { + .hop_addref = lock_handle_addref, + .hop_free = lock_handle_free, +}; + +/** + * + * Allocate and initialize new lock structure. + * + * usage: pass in a resource on which you have done ldlm_resource_get + * new lock will take over the refcount. 
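+ * (that resource reference is dropped again in ldlm_lock_put() when the
+ * lock is finally freed)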
+ * returns: lock with refcount 2 - one for current caller and one for remote + */ +static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) +{ + struct ldlm_lock *lock; + ENTRY; + + if (resource == NULL) + LBUG(); + + OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, GFP_NOFS); + if (lock == NULL) + RETURN(NULL); + + spin_lock_init(&lock->l_lock); + lock->l_resource = resource; + lu_ref_add(&resource->lr_reference, "lock", lock); + + atomic_set(&lock->l_refc, 2); + INIT_LIST_HEAD(&lock->l_res_link); + INIT_LIST_HEAD(&lock->l_lru); + INIT_LIST_HEAD(&lock->l_pending_chain); + INIT_LIST_HEAD(&lock->l_bl_ast); + INIT_LIST_HEAD(&lock->l_cp_ast); + INIT_LIST_HEAD(&lock->l_rk_ast); + init_waitqueue_head(&lock->l_waitq); + lock->l_blocking_lock = NULL; + INIT_LIST_HEAD(&lock->l_sl_mode); + INIT_LIST_HEAD(&lock->l_sl_policy); + INIT_HLIST_NODE(&lock->l_exp_hash); + INIT_HLIST_NODE(&lock->l_exp_flock_hash); + + lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, + LDLM_NSS_LOCKS); + INIT_LIST_HEAD_RCU(&lock->l_handle.h_link); + class_handle_hash(&lock->l_handle, &lock_handle_ops); + + lu_ref_init(&lock->l_reference); + lu_ref_add(&lock->l_reference, "hash", lock); + lock->l_callback_timeout = 0; + lock->l_activity = 0; + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&lock->l_exp_refs_link); + lock->l_exp_refs_nr = 0; + lock->l_exp_refs_target = NULL; +#endif + INIT_LIST_HEAD(&lock->l_exp_list); + + RETURN(lock); +} + +/** + * Moves LDLM lock \a lock to another resource. + * This is used on client when server returns some other lock than requested + * (typically as a result of intent operation) + */ +int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, + const struct ldlm_res_id *new_resid) +{ + struct ldlm_resource *oldres = lock->l_resource; + struct ldlm_resource *newres; + int type; + ENTRY; + + LASSERT(ns_is_client(ns)); + + lock_res_and_lock(lock); + if (memcmp(new_resid, &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) == 0) { + /* Nothing to do */ + unlock_res_and_lock(lock); + RETURN(0); + } + + LASSERT(new_resid->name[0] != 0); + + /* This function assumes that the lock isn't on any lists */ + LASSERT(list_empty(&lock->l_res_link)); + + type = oldres->lr_type; + unlock_res_and_lock(lock); + + newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); + if (IS_ERR(newres)) + RETURN(PTR_ERR(newres)); + + lu_ref_add(&newres->lr_reference, "lock", lock); + /* + * To flip the lock from the old to the new resource, lock, oldres and + * newres have to be locked. Resource spin-locks are nested within + * lock->l_lock, and are taken in the memory address order to avoid + * dead-locks. + */ + spin_lock(&lock->l_lock); + oldres = lock->l_resource; + if (oldres < newres) { + lock_res(oldres); + lock_res_nested(newres, LRT_NEW); + } else { + lock_res(newres); + lock_res_nested(oldres, LRT_NEW); + } + LASSERT(memcmp(new_resid, &oldres->lr_name, + sizeof oldres->lr_name) != 0); + lock->l_resource = newres; + unlock_res(oldres); + unlock_res_and_lock(lock); + + /* ...and the flowers are still standing! */ + lu_ref_del(&oldres->lr_reference, "lock", lock); + ldlm_resource_putref(oldres); + + RETURN(0); +} + +/** \defgroup ldlm_handles LDLM HANDLES + * Ways to get hold of locks without any addresses. + * @{ + */ + +/** + * Fills in handle for LDLM lock \a lock into supplied \a lockh + * Does not take any references. 
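+ *
+ * The lock can be looked up again from the handle with
+ * ldlm_handle2lock() / __ldlm_handle2lock().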
+ */ +void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh) +{ + lockh->cookie = lock->l_handle.h_cookie; +} +EXPORT_SYMBOL(ldlm_lock2handle); + +/** + * Obtain a lock reference by handle. + * + * if \a flags: atomically get the lock and set the flags. + * Return NULL if flag already set + */ +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, + __u64 flags) +{ + struct ldlm_lock *lock; + ENTRY; + + LASSERT(handle); + + lock = class_handle2object(handle->cookie, NULL); + if (lock == NULL) + RETURN(NULL); + + if (lock->l_export != NULL && lock->l_export->exp_failed) { + CDEBUG(D_INFO, "lock export failed: lock %p, exp %p\n", + lock, lock->l_export); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + /* It's unlikely but possible that someone marked the lock as + * destroyed after we did handle2object on it */ + if ((flags == 0) && !ldlm_is_destroyed(lock)) { + lu_ref_add(&lock->l_reference, "handle", current); + RETURN(lock); + } + + lock_res_and_lock(lock); + + LASSERT(lock->l_resource != NULL); + + lu_ref_add_atomic(&lock->l_reference, "handle", current); + if (unlikely(ldlm_is_destroyed(lock))) { + unlock_res_and_lock(lock); + CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + /* If we're setting flags, make sure none of them are already set. */ + if (flags != 0) { + if ((lock->l_flags & flags) != 0) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + lock->l_flags |= flags; + } + + unlock_res_and_lock(lock); + RETURN(lock); +} +EXPORT_SYMBOL(__ldlm_handle2lock); +/** @} ldlm_handles */ + +/** + * Fill in "on the wire" representation for given LDLM lock into supplied + * lock descriptor \a desc structure. + */ +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) +{ + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_req_mode = lock->l_req_mode; + desc->l_granted_mode = lock->l_granted_mode; + ldlm_convert_policy_to_wire(lock->l_resource->lr_type, + &lock->l_policy_data, + &desc->l_policy_data); +} + +/** + * Add a lock to list of conflicting locks to send AST to. + * + * Only add if we have not sent a blocking AST to the lock yet. + */ +static void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + if (!ldlm_is_ast_sent(lock)) { + LDLM_DEBUG(lock, "lock incompatible; sending blocking AST."); + ldlm_set_ast_sent(lock); + /* If the enqueuing client said so, tell the AST recipient to + * discard dirty data, rather than writing back. */ + if (ldlm_is_ast_discard_data(new)) + ldlm_set_discard_data(lock); + + /* Lock can be converted from a blocking state back to granted + * after lock convert or COS downgrade but still be in an + * older bl_list because it is controlled only by + * ldlm_work_bl_ast_lock(), let it be processed there. + */ + if (list_empty(&lock->l_bl_ast)) { + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + } + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } +} + +/** + * Add a lock to list of just granted locks to send completion AST to. 
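+ *
+ * A reference is taken on the lock when it is added to \a work_list;
+ * it is dropped again once the completion AST has been handled
+ * (see ldlm_work_cp_ast_lock()).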
+ */ +static void ldlm_add_cp_work_item(struct ldlm_lock *lock, + struct list_head *work_list) +{ + if (!ldlm_is_cp_reqd(lock)) { + ldlm_set_cp_reqd(lock); + LDLM_DEBUG(lock, "lock granted; sending completion AST."); + LASSERT(list_empty(&lock->l_cp_ast)); + list_add(&lock->l_cp_ast, work_list); + LDLM_LOCK_GET(lock); + } +} + +/** + * Aggregator function to add AST work items into a list. Determines + * what sort of an AST work needs to be done and calls the proper + * adding function. + * Must be called with lr_lock held. + */ +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + ENTRY; + check_res_locked(lock->l_resource); + if (new) + ldlm_add_bl_work_item(lock, new, work_list); + else + ldlm_add_cp_work_item(lock, work_list); + EXIT; +} + +/** + * Add specified reader/writer reference to LDLM lock with handle \a lockh. + * r/w reference type is determined by \a mode + * Calls ldlm_lock_addref_internal. + */ +void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_addref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_addref); + +/** + * Helper function. + * Add specified reader/writer reference to LDLM lock \a lock. + * r/w reference type is determined by \a mode + * Removes lock from LRU if it is there. + * Assumes the LDLM lock is already locked. + */ +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, + enum ldlm_mode mode) +{ + ldlm_lock_remove_from_lru(lock); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + lock->l_readers++; + lu_ref_add_atomic(&lock->l_reference, "reader", lock); + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + lock->l_writers++; + lu_ref_add_atomic(&lock->l_reference, "writer", lock); + } + LDLM_LOCK_GET(lock); + lu_ref_add_atomic(&lock->l_reference, "user", lock); + LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); +} + +/** + * Attempts to add reader/writer reference to a lock with handle \a lockh, and + * fails if lock is already LDLM_FL_CBPENDING or destroyed. + * + * \retval 0 success, lock was addref-ed + * + * \retval -EAGAIN lock is being canceled. + */ +int ldlm_lock_addref_try(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock; + int result; + + result = -EAGAIN; + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_readers != 0 || lock->l_writers != 0 || + !ldlm_is_cbpending(lock)) { + ldlm_lock_addref_internal_nolock(lock, mode); + result = 0; + } + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return result; +} +EXPORT_SYMBOL(ldlm_lock_addref_try); + +/** + * Add specified reader/writer reference to LDLM lock \a lock. + * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work. + * Only called for local locks. + */ +void ldlm_lock_addref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) +{ + lock_res_and_lock(lock); + ldlm_lock_addref_internal_nolock(lock, mode); + unlock_res_and_lock(lock); +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Assumes LDLM lock is already locked. + * only called in ldlm_flock_destroy and for local locks. + * Does NOT add lock to LRU if no r/w references left to accomodate flock locks + * that cannot be placed in LRU. 
+ */ +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, + enum ldlm_mode mode) +{ + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + LASSERT(lock->l_readers > 0); + lu_ref_del(&lock->l_reference, "reader", lock); + lock->l_readers--; + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + LASSERT(lock->l_writers > 0); + lu_ref_del(&lock->l_reference, "writer", lock); + lock->l_writers--; + } + + lu_ref_del(&lock->l_reference, "user", lock); + LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */ +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Locks LDLM lock first. + * If the lock is determined to be client lock on a client and r/w refcount + * drops to zero and the lock is not blocked, the lock is added to LRU lock + * on the namespace. + * For blocked LDLM locks if r/w count drops to zero, blocking_ast is called. + */ +void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) +{ + struct ldlm_namespace *ns; + ENTRY; + + lock_res_and_lock(lock); + + ns = ldlm_lock_to_ns(lock); + + ldlm_lock_decref_internal_nolock(lock, mode); + + if ((ldlm_is_local(lock) || lock->l_req_mode == LCK_GROUP) && + !lock->l_readers && !lock->l_writers) { + /* If this is a local lock on a server namespace and this was + * the last reference, cancel the lock. + * + * Group locks are special: + * They must not go in LRU, but they are not called back + * like non-group locks, instead they are manually released. + * They have an l_writers reference which they keep until + * they are manually released, so we remove them when they have + * no more reader or writer references. - LU-6368 */ + ldlm_set_cbpending(lock); + } + + if (!lock->l_readers && !lock->l_writers && ldlm_is_cbpending(lock)) { + /* If we received a blocked AST and this was the last reference, + * run the callback. */ + if (ldlm_is_ns_srv(lock) && lock->l_export) + CERROR("FL_CBPENDING set on non-local lock--just a " + "warning\n"); + + LDLM_DEBUG(lock, "final decref done on cbpending lock"); + + LDLM_LOCK_GET(lock); /* dropped by bl thread */ + ldlm_lock_remove_from_lru(lock); + unlock_res_and_lock(lock); + + if (ldlm_is_fail_loc(lock)) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + if (ldlm_is_atomic_cb(lock) || + ldlm_bl_to_thread_lock(ns, NULL, lock) != 0) + ldlm_handle_bl_callback(ns, NULL, lock); + } else if (ns_is_client(ns) && + !lock->l_readers && !lock->l_writers && + !ldlm_is_no_lru(lock) && + !ldlm_is_bl_ast(lock) && + !ldlm_is_converting(lock)) { + + LDLM_DEBUG(lock, "add lock into lru list"); + + /* If this is a client-side namespace and this was the last + * reference, put it on the LRU. */ + ldlm_lock_add_to_lru(lock); + unlock_res_and_lock(lock); + + if (ldlm_is_fail_loc(lock)) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE + * are not supported by the server, otherwise, it is done on + * enqueue. 
*/ + if (!exp_connect_cancelset(lock->l_conn_export) && + !ns_connect_lru_resize(ns)) + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); + } else { + LDLM_DEBUG(lock, "do not add lock into lru list"); + unlock_res_and_lock(lock); + } + + EXIT; +} + +/** + * Decrease reader/writer refcount for LDLM lock with handle \a lockh + */ +void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref); + +/** + * Decrease reader/writer refcount for LDLM lock with handle + * \a lockh and mark it for subsequent cancellation once r/w refcount + * drops to zero instead of putting into LRU. + * + */ +void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, + enum ldlm_mode mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + ENTRY; + + LASSERT(lock != NULL); + + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + unlock_res_and_lock(lock); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); + +struct sl_insert_point { + struct list_head *res_link; + struct list_head *mode_link; + struct list_head *policy_link; +}; + +/** + * Finds a position to insert the new lock into granted lock list. + * + * Used for locks eligible for skiplist optimization. + * + * Parameters: + * queue [input]: the granted list where search acts on; + * req [input]: the lock whose position to be located; + * prev [output]: positions within 3 lists to insert @req to + * Return Value: + * filled @prev + * NOTE: called by + * - ldlm_grant_lock_with_skiplist + */ +static void search_granted_lock(struct list_head *queue, + struct ldlm_lock *req, + struct sl_insert_point *prev) +{ + struct list_head *tmp; + struct ldlm_lock *lock, *mode_end, *policy_end; + ENTRY; + + list_for_each(tmp, queue) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + mode_end = list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, l_sl_mode); + + if (lock->l_req_mode != req->l_req_mode) { + /* jump to last lock of mode group */ + tmp = &mode_end->l_res_link; + continue; + } + + /* suitable mode group is found */ + if (lock->l_resource->lr_type == LDLM_PLAIN) { + /* insert point is last lock of the mode group */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else if (lock->l_resource->lr_type == LDLM_IBITS) { + for (;;) { + policy_end = + list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy); + + if (lock->l_policy_data.l_inodebits.bits == + req->l_policy_data.l_inodebits.bits) { + /* insert point is last lock of + * the policy group */ + prev->res_link = + &policy_end->l_res_link; + prev->mode_link = + &policy_end->l_sl_mode; + prev->policy_link = + &policy_end->l_sl_policy; + EXIT; + return; + } + + if (policy_end == mode_end) + /* done with mode group */ + break; + + /* go to next policy group within mode group */ + tmp = policy_end->l_res_link.next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + } /* loop over policy groups within the mode group */ + + /* insert point is last lock of the mode group, + * new policy group is started */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = 
&req->l_sl_policy; + EXIT; + return; + } else { + LDLM_ERROR(lock,"is not LDLM_PLAIN or LDLM_IBITS lock"); + LBUG(); + } + } + + /* insert point is last lock on the queue, + * new mode group and new policy group are started */ + prev->res_link = queue->prev; + prev->mode_link = &req->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; +} + +/** + * Add a lock into resource granted list after a position described by + * \a prev. + */ +static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, + struct sl_insert_point *prev) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(lock, "About to add lock:"); + + if (ldlm_is_destroyed(lock)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_sl_mode)); + LASSERT(list_empty(&lock->l_sl_policy)); + + /* + * lock->link == prev->link means lock is first starting the group. + * Don't re-add to itself to suppress kernel warnings. + */ + if (&lock->l_res_link != prev->res_link) + list_add(&lock->l_res_link, prev->res_link); + if (&lock->l_sl_mode != prev->mode_link) + list_add(&lock->l_sl_mode, prev->mode_link); + if (&lock->l_sl_policy != prev->policy_link) + list_add(&lock->l_sl_policy, prev->policy_link); + + EXIT; +} + +/** + * Add a lock to granted list on a resource maintaining skiplist + * correctness. + */ +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +{ + struct sl_insert_point prev; + + LASSERT(ldlm_is_granted(lock)); + + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); +} + +/** + * Perform lock granting bookkeeping. + * + * Includes putting the lock into granted list and updating lock mode. + * NOTE: called by + * - ldlm_lock_enqueue + * - ldlm_reprocess_queue + * + * must be called with lr_lock held + */ +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + lock->l_granted_mode = lock->l_req_mode; + + if (work_list && lock->l_completion_ast != NULL) + ldlm_add_ast_work_item(lock, NULL, work_list); + + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) + ldlm_grant_lock_with_skiplist(lock); + else if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else if (res->lr_type == LDLM_FLOCK) { + /* We should not add locks to granted list in the following + * cases: + * - this is an UNLOCK but not a real lock; + * - this is a TEST lock; + * - this is a F_CANCELLK lock (async flock has req_mode == 0) + * - this is a deadlock (flock cannot be granted) */ + if (lock->l_req_mode == 0 || + lock->l_req_mode == LCK_NL || + ldlm_is_test_lock(lock) || + ldlm_is_flock_deadlock(lock)) + RETURN_EXIT; + ldlm_resource_add_lock(res, &res->lr_granted, lock); + } else { + LBUG(); + } + + ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); + EXIT; +} + +/** + * Check if the given @lock meets the criteria for a match. + * A reference on the lock is taken if matched. + * + * \param lock test-against this lock + * \param data parameters + */ +static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) +{ + union ldlm_policy_data *lpol = &lock->l_policy_data; + enum ldlm_mode match = LCK_MINMODE; + + if (lock == data->lmd_old) + return INTERVAL_ITER_STOP; + + /* Check if this lock can be matched. 
+ * Used by LU-2919(exclusive open) for open lease lock */ + if (ldlm_is_excl(lock)) + return INTERVAL_ITER_CONT; + + /* llite sometimes wants to match locks that will be + * canceled when their users drop, but we allow it to match + * if it passes in CBPENDING and the lock still has users. + * this is generally only going to be used by children + * whose parents already hold a lock so forward progress + * can still happen. */ + if (ldlm_is_cbpending(lock) && + !(data->lmd_flags & LDLM_FL_CBPENDING)) + return INTERVAL_ITER_CONT; + if (!data->lmd_unref && ldlm_is_cbpending(lock) && + lock->l_readers == 0 && lock->l_writers == 0) + return INTERVAL_ITER_CONT; + + if (!(lock->l_req_mode & *data->lmd_mode)) + return INTERVAL_ITER_CONT; + + /* When we search for ast_data, we are not doing a traditional match, + * so we don't worry about IBITS or extent matching. + */ + if (data->lmd_has_ast_data) { + if (!lock->l_ast_data) + return INTERVAL_ITER_CONT; + + goto matched; + } + + match = lock->l_req_mode; + + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + if (lpol->l_extent.start > data->lmd_policy->l_extent.start || + lpol->l_extent.end < data->lmd_policy->l_extent.end) + return INTERVAL_ITER_CONT; + + if (unlikely(match == LCK_GROUP) && + data->lmd_policy->l_extent.gid != LDLM_GID_ANY && + lpol->l_extent.gid != data->lmd_policy->l_extent.gid) + return INTERVAL_ITER_CONT; + break; + case LDLM_IBITS: + /* We match if we have existing lock with same or wider set + of bits. */ + if ((lpol->l_inodebits.bits & + data->lmd_policy->l_inodebits.bits) != + data->lmd_policy->l_inodebits.bits) + return INTERVAL_ITER_CONT; + break; + default: + ; + } + + /* We match if we have existing lock with same or wider set + of bits. */ + if (!data->lmd_unref && LDLM_HAVE_MASK(lock, GONE)) + return INTERVAL_ITER_CONT; + + if (!equi(data->lmd_flags & LDLM_FL_LOCAL_ONLY, ldlm_is_local(lock))) + return INTERVAL_ITER_CONT; + + /* Filter locks by skipping flags */ + if (data->lmd_skip_flags & lock->l_flags) + return INTERVAL_ITER_CONT; + +matched: + if (data->lmd_flags & LDLM_FL_TEST_LOCK) { + LDLM_LOCK_GET(lock); + ldlm_lock_touch_in_lru(lock); + } else { + ldlm_lock_addref_internal_nolock(lock, match); + } + + *data->lmd_mode = match; + data->lmd_lock = lock; + + return INTERVAL_ITER_STOP; +} + +static unsigned int itree_overlap_cb(struct interval_node *in, void *args) +{ + struct ldlm_interval *node = to_ldlm_interval(in); + struct ldlm_match_data *data = args; + struct ldlm_lock *lock; + int rc; + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + rc = lock_matches(lock, data); + if (rc == INTERVAL_ITER_STOP) + return INTERVAL_ITER_STOP; + } + return INTERVAL_ITER_CONT; +} + +/** + * Search for a lock with given parameters in interval trees. + * + * \param res search for a lock in this resource + * \param data parameters + * + * \retval a referenced lock or NULL. 
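+ *
+ * Only interval trees whose lock mode overlaps \a data->lmd_mode are
+ * searched, using the extent taken from \a data->lmd_policy.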
+ */ +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data) +{ + struct interval_node_extent ext = { + .start = data->lmd_policy->l_extent.start, + .end = data->lmd_policy->l_extent.end + }; + int idx; + + data->lmd_lock = NULL; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct ldlm_interval_tree *tree = &res->lr_itree[idx]; + + if (tree->lit_root == NULL) + continue; + + if (!(tree->lit_mode & *data->lmd_mode)) + continue; + + interval_search(tree->lit_root, &ext, + itree_overlap_cb, data); + if (data->lmd_lock) + return data->lmd_lock; + } + + return NULL; +} +EXPORT_SYMBOL(search_itree); + + +/** + * Search for a lock with given properties in a queue. + * + * \param queue search for a lock in this queue + * \param data parameters + * + * \retval a referenced lock or NULL. + */ +static struct ldlm_lock *search_queue(struct list_head *queue, + struct ldlm_match_data *data) +{ + struct ldlm_lock *lock; + int rc; + + data->lmd_lock = NULL; + + list_for_each_entry(lock, queue, l_res_link) { + rc = lock_matches(lock, data); + if (rc == INTERVAL_ITER_STOP) + return data->lmd_lock; + } + + return NULL; +} + +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock) +{ + if ((lock->l_flags & LDLM_FL_FAIL_NOTIFIED) == 0) { + lock->l_flags |= LDLM_FL_FAIL_NOTIFIED; + wake_up_all(&lock->l_waitq); + } +} +EXPORT_SYMBOL(ldlm_lock_fail_match_locked); + +void ldlm_lock_fail_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_fail_match_locked(lock); + unlock_res_and_lock(lock); +} + +/** + * Mark lock as "matchable" by OST. + * + * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB + * is not yet valid. + * Assumes LDLM lock is already locked. + */ +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock) +{ + ldlm_set_lvb_ready(lock); + wake_up_all(&lock->l_waitq); +} +EXPORT_SYMBOL(ldlm_lock_allow_match_locked); + +/** + * Mark lock as "matchable" by OST. + * Locks the lock and then \see ldlm_lock_allow_match_locked + */ +void ldlm_lock_allow_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_allow_match_locked(lock); + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_allow_match); + +/** + * Attempt to find a lock with specified properties. + * + * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is + * set in \a flags + * + * Can be called in two ways: + * + * If 'ns' is NULL, then lockh describes an existing lock that we want to look + * for a duplicate of. + * + * Otherwise, all of the fields must be filled in, to match against. + * + * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the + * server (ie, connh is NULL) + * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted + * list will be considered + * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked + * to be canceled can still be matched as long as they still have reader + * or writer refernces + * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock, + * just tell us if we would have matched. + * + * \retval 1 if it finds an already-existing lock that is compatible; in this + * case, lockh is filled in with a addref()ed lock + * + * We also check security context, and if that fails we simply return 0 (to + * keep caller code unchanged), the context failure will be discovered by + * caller sometime later. 
+ */ +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh, int unref) +{ + struct ldlm_match_data data = { + .lmd_old = NULL, + .lmd_lock = NULL, + .lmd_mode = &mode, + .lmd_policy = policy, + .lmd_flags = flags, + .lmd_skip_flags = skip_flags, + .lmd_unref = unref, + .lmd_has_ast_data = false, + }; + struct ldlm_resource *res; + struct ldlm_lock *lock; + int matched; + + ENTRY; + + if (ns == NULL) { + data.lmd_old = ldlm_handle2lock(lockh); + LASSERT(data.lmd_old != NULL); + + ns = ldlm_lock_to_ns(data.lmd_old); + res_id = &data.lmd_old->l_resource->lr_name; + type = data.lmd_old->l_resource->lr_type; + *data.lmd_mode = data.lmd_old->l_req_mode; + } + + res = ldlm_resource_get(ns, NULL, res_id, type, 0); + if (IS_ERR(res)) { + LASSERT(data.lmd_old == NULL); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + lock_res(res); + if (res->lr_type == LDLM_EXTENT) + lock = search_itree(res, &data); + else + lock = search_queue(&res->lr_granted, &data); + if (!lock && !(flags & LDLM_FL_BLOCK_GRANTED)) + lock = search_queue(&res->lr_waiting, &data); + matched = lock ? mode : 0; + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && + (!ldlm_is_lvb_ready(lock))) { + __u64 wait_flags = LDLM_FL_LVB_READY | + LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; + struct l_wait_info lwi; + + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) + GOTO(out_fail_match, matched = 0); + } + + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + NULL, LWI_ON_SIGNAL_NOOP, NULL); + + /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ + l_wait_event(lock->l_waitq, lock->l_flags & wait_flags, + &lwi); + if (!ldlm_is_lvb_ready(lock)) + GOTO(out_fail_match, matched = 0); + } + + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) + GOTO(out_fail_match, matched = 0); + + LDLM_DEBUG(lock, "matched (%llu %llu)", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + +out_fail_match: + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else if (!matched) + ldlm_lock_decref_internal(lock, mode); + } + + /* less verbose for test-only */ + if (!matched && !(flags & LDLM_FL_TEST_LOCK)) { + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " + "%llu/%llu (%llu %llu)", ns, + type, mode, res_id->name[0], res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? 
+ res_id->name[3] : policy->l_extent.end); + } + if (data.lmd_old != NULL) + LDLM_LOCK_PUT(data.lmd_old); + + return matched; +} +EXPORT_SYMBOL(ldlm_lock_match_with_skip); + +enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, + __u64 *bits) +{ + struct ldlm_lock *lock; + enum ldlm_mode mode = 0; + ENTRY; + + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (LDLM_HAVE_MASK(lock, GONE)) + GOTO(out, mode); + + if (ldlm_is_cbpending(lock) && + lock->l_readers == 0 && lock->l_writers == 0) + GOTO(out, mode); + + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + mode = lock->l_granted_mode; + ldlm_lock_addref_internal_nolock(lock, mode); + } + + EXIT; + +out: + if (lock != NULL) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return mode; +} +EXPORT_SYMBOL(ldlm_revalidate_lock_handle); + +/** The caller must guarantee that the buffer is large enough. */ +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size) +{ + void *lvb; + ENTRY; + + LASSERT(data != NULL); + LASSERT(size >= 0); + + switch (lock->l_lvb_type) { + case LVB_T_OST: + if (size == sizeof(struct ost_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else if (size == sizeof(struct ost_lvb_v1)) { + struct ost_lvb *olvb = data; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb_v1); + else + lvb = req_capsule_server_sized_swab_get(pill, + &RMF_DLM_LVB, size, + lustre_swab_ost_lvb_v1); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + olvb->lvb_mtime_ns = 0; + olvb->lvb_atime_ns = 0; + olvb->lvb_ctime_ns = 0; + } else { + LDLM_ERROR(lock, "Replied unexpected ost LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LQUOTA: + if (size == sizeof(struct lquota_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else { + LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LAYOUT: + if (size == 0) + break; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_get(pill, &RMF_DLM_LVB); + else + lvb = req_capsule_server_get(pill, &RMF_DLM_LVB); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + break; + default: + LDLM_ERROR(lock, "Unknown LVB type: %d", lock->l_lvb_type); + libcfs_debug_dumpstack(NULL); + RETURN(-EINVAL); + } + + RETURN(0); +} + +/** + * Create and fill in new LDLM lock with specified properties. 
+ * Returns a referenced lock + */ +struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + enum ldlm_mode mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, + enum lvb_type lvb_type) +{ + struct ldlm_lock *lock; + struct ldlm_resource *res; + int rc; + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, type, 1); + if (IS_ERR(res)) + RETURN(ERR_CAST(res)); + + lock = ldlm_lock_new(res); + if (lock == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lock->l_req_mode = mode; + lock->l_ast_data = data; + lock->l_pid = current_pid(); + if (ns_is_server(ns)) + ldlm_set_ns_srv(lock); + if (cbs) { + lock->l_blocking_ast = cbs->lcs_blocking; + lock->l_completion_ast = cbs->lcs_completion; + lock->l_glimpse_ast = cbs->lcs_glimpse; + } + + switch (type) { + case LDLM_EXTENT: + rc = ldlm_extent_alloc_lock(lock); + break; + case LDLM_IBITS: + rc = ldlm_inodebits_alloc_lock(lock); + break; + default: + rc = 0; + } + if (rc) + GOTO(out, rc); + + if (lvb_len) { + lock->l_lvb_len = lvb_len; + OBD_ALLOC_LARGE(lock->l_lvb_data, lvb_len); + if (lock->l_lvb_data == NULL) + GOTO(out, rc = -ENOMEM); + } + + lock->l_lvb_type = lvb_type; + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) + GOTO(out, rc = -ENOENT); + + RETURN(lock); + +out: + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(ERR_PTR(rc)); +} + +#ifdef HAVE_SERVER_SUPPORT +static enum ldlm_error ldlm_lock_enqueue_helper(struct ldlm_lock *lock, + __u64 *flags) +{ + struct ldlm_resource *res = lock->l_resource; + enum ldlm_error rc = ELDLM_OK; + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + ldlm_processing_policy policy; + + ENTRY; + + policy = ldlm_get_processing_policy(res); + policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, &rpc_list); + if (rc == ELDLM_OK && lock->l_granted_mode != lock->l_req_mode && + res->lr_type != LDLM_FLOCK) + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list); + + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); +} +#endif + +/** + * Enqueue (request) a lock. + * + * Does not block. As a result of enqueue the lock would be put + * into granted or waiting list. + * + * If namespace has intent policy sent and the lock has LDLM_FL_HAS_INTENT flag + * set, skip all the enqueueing and delegate lock processing to intent policy + * function. + */ +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *ns, + struct ldlm_lock **lockp, + void *cookie, __u64 *flags) +{ + struct ldlm_lock *lock = *lockp; + struct ldlm_resource *res = lock->l_resource; + int local = ns_is_client(ldlm_res_to_ns(res)); + enum ldlm_error rc = ELDLM_OK; + struct ldlm_interval *node = NULL; + ENTRY; + + /* policies are not executed on the client or during replay */ + if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT + && !local && ns->ns_policy) { + rc = ns->ns_policy(env, ns, lockp, cookie, lock->l_req_mode, + *flags, NULL); + if (rc == ELDLM_LOCK_REPLACED) { + /* The lock that was returned has already been granted, + * and placed into lockp. If it's not the same as the + * one we passed in, then destroy the old one and our + * work here is done. 
*/ + if (lock != *lockp) { + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + } + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(0); + } else if (rc != ELDLM_OK && + ldlm_is_granted(lock)) { + LASSERT(*flags & LDLM_FL_RESENT); + /* It may happen that ns_policy returns an error in + * resend case, object may be unlinked or just some + * error occurs. It is unclear if lock reached the + * client in the original reply, just leave the lock on + * server, not returning it again to client. Due to + * LU-6529, the server will not OOM. */ + RETURN(rc); + } else if (rc != ELDLM_OK || + (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) { + ldlm_lock_destroy(lock); + RETURN(rc); + } + } + + if (*flags & LDLM_FL_RESENT) { + /* Reconstruct LDLM_FL_SRV_ENQ_MASK @flags for reply. + * Set LOCK_CHANGED always. + * Check if the lock is granted for BLOCK_GRANTED. + * Take NO_TIMEOUT from the lock as it is inherited through + * LDLM_FL_INHERIT_MASK */ + *flags |= LDLM_FL_LOCK_CHANGED; + if (!ldlm_is_granted(lock)) + *flags |= LDLM_FL_BLOCK_GRANTED; + *flags |= lock->l_flags & LDLM_FL_NO_TIMEOUT; + RETURN(ELDLM_OK); + } + + /* For a replaying lock, it might be already in granted list. So + * unlinking the lock will cause the interval node to be freed, we + * have to allocate the interval node early otherwise we can't regrant + * this lock in the future. - jay */ + if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + + lock_res_and_lock(lock); + if (local && ldlm_is_granted(lock)) { + /* The server returned a blocked lock, but it was granted + * before we got a chance to actually enqueue it. We don't + * need to do anything else. */ + *flags &= ~LDLM_FL_BLOCKED_MASK; + GOTO(out, rc = ELDLM_OK); + } + + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) { + if (node == NULL) { + ldlm_lock_destroy_nolock(lock); + GOTO(out, rc = -ENOMEM); + } + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + + /* Some flags from the enqueue want to make it into the AST, via the + * lock's l_flags. */ + if (*flags & LDLM_FL_AST_DISCARD_DATA) + ldlm_set_ast_discard_data(lock); + if (*flags & LDLM_FL_TEST_LOCK) + ldlm_set_test_lock(lock); + if (*flags & LDLM_FL_COS_INCOMPAT) + ldlm_set_cos_incompat(lock); + if (*flags & LDLM_FL_COS_ENABLED) + ldlm_set_cos_enabled(lock); + + /* This distinction between local lock trees is very important; a client + * namespace only has information about locks taken by that client, and + * thus doesn't have enough information to decide for itself if it can + * be granted (below). In this case, we do exactly what the server + * tells us to do, as dictated by the 'flags'. + * + * We do exactly the same thing during recovery, when the server is + * more or less trusting the clients not to lie. + * + * FIXME (bug 268): Detect obvious lies by checking compatibility in + * granted queue. */ + if (local) { + if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); + GOTO(out, rc = ELDLM_OK); +#ifdef HAVE_SERVER_SUPPORT + } else if (*flags & LDLM_FL_REPLAY) { + if (*flags & LDLM_FL_BLOCK_WAIT) { + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + GOTO(out, rc = ELDLM_OK); + } else if (*flags & LDLM_FL_BLOCK_GRANTED) { + ldlm_grant_lock(lock, NULL); + GOTO(out, rc = ELDLM_OK); + } + /* If no flags, fall through to normal enqueue path. 
*/ + } + + rc = ldlm_lock_enqueue_helper(lock, flags); + GOTO(out, rc); +#else + } else { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + +out: + unlock_res_and_lock(lock); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + return rc; +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Iterate through all waiting locks on a given resource queue and attempt to + * grant them. + * + * Must be called with resource lock held. + */ +int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint) +{ + struct list_head *tmp, *pos; + ldlm_processing_policy policy; + __u64 flags; + int rc = LDLM_ITER_CONTINUE; + enum ldlm_error err; + struct list_head bl_ast_list = LIST_HEAD_INIT(bl_ast_list); + + ENTRY; + + check_res_locked(res); + + policy = ldlm_get_processing_policy(res); + LASSERT(policy); + LASSERT(intention == LDLM_PROCESS_RESCAN || + intention == LDLM_PROCESS_RECOVERY); + +restart: + list_for_each_safe(tmp, pos, queue) { + struct ldlm_lock *pending; + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + + pending = list_entry(tmp, struct ldlm_lock, l_res_link); + + CDEBUG(D_INFO, "Reprocessing lock %p\n", pending); + + flags = 0; + rc = policy(pending, &flags, intention, &err, &rpc_list); + if (pending->l_granted_mode == pending->l_req_mode || + res->lr_type == LDLM_FLOCK) { + list_splice(&rpc_list, work_list); + } else { + list_splice(&rpc_list, &bl_ast_list); + } + /* + * When this is called from recovery done, we always want + * to scan the whole list no matter what 'rc' is returned. + */ + if (rc != LDLM_ITER_CONTINUE && + intention == LDLM_PROCESS_RESCAN) + break; + } + + if (!list_empty(&bl_ast_list)) { + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, + LDLM_WORK_BL_AST); + + lock_res(res); + if (rc == -ERESTART) + GOTO(restart, rc); + } + + if (!list_empty(&bl_ast_list)) + ldlm_discard_bl_list(&bl_ast_list); + + RETURN(intention == LDLM_PROCESS_RESCAN ? rc : LDLM_ITER_CONTINUE); +} + +/** + * Conflicting locks are detected for a lock to be enqueued, add the lock + * into waiting list and send blocking ASTs to the conflicting locks. + * + * \param[in] lock The lock to be enqueued. + * \param[out] flags Lock flags for the lock to be enqueued. + * \param[in] rpc_list Conflicting locks list. + * + * \retval -ERESTART: Some lock was instantly canceled while sending + * blocking ASTs, caller needs to re-check conflicting + * locks. + * \retval -EAGAIN: Lock was destroyed, caller should return error. + * \reval 0: Lock is successfully added in waiting list. + */ +int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, + struct list_head *rpc_list) +{ + struct ldlm_resource *res = lock->l_resource; + int rc; + ENTRY; + + check_res_locked(res); + + /* If either of the compat_queue()s returned failure, then we + * have ASTs to send and must go onto the waiting list. + * + * bug 2322: we used to unlink and re-add here, which was a + * terrible folly -- if we goto restart, we could get + * re-ordered! Causes deadlock, because ASTs aren't sent! 
*/ + if (list_empty(&lock->l_res_link)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), rpc_list, + LDLM_WORK_BL_AST); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_OST_FAIL_RACE) && + !ns_is_client(ldlm_res_to_ns(res))) + class_fail_export(lock->l_export); + + if (rc == -ERESTART) + ldlm_reprocess_all(res, NULL); + + lock_res(res); + if (rc == -ERESTART) { + /* 15715: The lock was granted and destroyed after + * resource lock was dropped. Interval node was freed + * in ldlm_lock_destroy. Anyway, this always happens + * when a client is being evicted. So it would be + * ok to return an error. -jay */ + if (ldlm_is_destroyed(lock)) + RETURN(-EAGAIN); + + /* lock was granted while resource was unlocked. */ + if (ldlm_is_granted(lock)) { + /* bug 11300: if the lock has been granted, + * break earlier because otherwise, we will go + * to restart and ldlm_resource_unlink will be + * called and it causes the interval node to be + * freed. Then we will fail at + * ldlm_extent_add_lock() */ + *flags &= ~LDLM_FL_BLOCKED_MASK; + } + + } + *flags |= LDLM_FL_BLOCK_GRANTED; + + RETURN(0); +} + +/** + * Discard all AST work items from list. + * + * If for whatever reason we do not want to send ASTs to conflicting locks + * anymore, disassemble the list with this function. + */ +void ldlm_discard_bl_list(struct list_head *bl_list) +{ + struct ldlm_lock *lock, *tmp; + + ENTRY; + + list_for_each_entry_safe(lock, tmp, bl_list, l_bl_ast) { + LASSERT(!list_empty(&lock->l_bl_ast)); + list_del_init(&lock->l_bl_ast); + ldlm_clear_ast_sent(lock); + LASSERT(lock->l_bl_ast_run == 0); + ldlm_clear_blocking_lock(lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/** + * Process a call to blocking AST callback for a lock in ast_work list + */ +static int +ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock *lock; + struct ldlm_lock_desc d; + struct ldlm_bl_desc bld; + int rc; + + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); + + /* nobody should touch l_bl_ast but some locks in the list may become + * granted after lock convert or COS downgrade, these locks should be + * just skipped here and removed from the list. + */ + lock_res_and_lock(lock); + list_del_init(&lock->l_bl_ast); + + /* lock is not blocking lock anymore, but was kept in the list because + * it can managed only here. + */ + if (!ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + LASSERT(lock->l_blocking_lock); + ldlm_lock2desc(lock->l_blocking_lock, &d); + /* copy blocking lock ibits in cancel_bits as well, + * new client may use them for lock convert and it is + * important to use new field to convert locks from + * new servers only + */ + d.l_policy_data.l_inodebits.cancel_bits = + lock->l_blocking_lock->l_policy_data.l_inodebits.bits; + + /* Blocking lock is being destroyed here but some information about it + * may be needed inside l_blocking_ast() function below, + * e.g. in mdt_blocking_ast(). So save needed data in bl_desc. 
+ */ + bld.bl_same_client = lock->l_client_cookie == + lock->l_blocking_lock->l_client_cookie; + bld.bl_cos_incompat = ldlm_is_cos_incompat(lock->l_blocking_lock); + arg->bl_desc = &bld; + + LASSERT(ldlm_is_ast_sent(lock)); + LASSERT(lock->l_bl_ast_run == 0); + lock->l_bl_ast_run++; + ldlm_clear_blocking_lock(lock); + unlock_res_and_lock(lock); + + rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to revocation AST callback for a lock in ast_work list + */ +static int +ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc desc; + int rc; + struct ldlm_lock *lock; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast); + list_del_init(&lock->l_rk_ast); + + /* the desc just pretend to exclusive */ + ldlm_lock2desc(lock, &desc); + desc.l_req_mode = LCK_EX; + desc.l_granted_mode = 0; + + rc = lock->l_blocking_ast(lock, &desc, (void*)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to glimpse AST callback for a lock in ast_work list + */ +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_glimpse_work *gl_work; + struct ldlm_lock *lock; + int rc = 0; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work, + gl_list); + list_del_init(&gl_work->gl_list); + + lock = gl_work->gl_lock; + + /* transfer the glimpse descriptor to ldlm_cb_set_arg */ + arg->gl_desc = gl_work->gl_desc; + arg->gl_interpret_reply = gl_work->gl_interpret_reply; + arg->gl_interpret_data = gl_work->gl_interpret_data; + + /* invoke the actual glimpse callback */ + if (lock->l_glimpse_ast(lock, (void*)arg) == 0) + rc = 1; + + LDLM_LOCK_RELEASE(lock); + if (gl_work->gl_flags & LDLM_GL_WORK_SLAB_ALLOCATED) + OBD_SLAB_FREE_PTR(gl_work, ldlm_glimpse_work_kmem); + else + OBD_FREE_PTR(gl_work); + + RETURN(rc); +} +#endif + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + int rc = 0; + + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. 
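+ *
+ * Note the completion callback pointer is sampled under the resource
+ * lock below and only invoked after that lock has been dropped.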
*/ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(ldlm_is_cp_reqd(lock)); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + ldlm_clear_cp_reqd(lock); + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process list of locks in need of ASTs being sent. + * + * Used on server to send multiple ASTs together instead of sending one by + * one. + */ +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type) +{ + struct ldlm_cb_set_arg *arg; + set_producer_func work_ast_lock; + int rc; + + if (list_empty(rpc_list)) + RETURN(0); + + OBD_ALLOC_PTR(arg); + if (arg == NULL) + RETURN(-ENOMEM); + + atomic_set(&arg->restart, 0); + arg->list = rpc_list; + + switch (ast_type) { + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; +#ifdef HAVE_SERVER_SUPPORT + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; +#endif + default: + LBUG(); + } + + /* We create a ptlrpc request set with flow control extension. + * This request set will use the work_ast_lock function to produce new + * requests and will send a new request each time one completes in order + * to keep the number of requests in flight to ns_max_parallel_ast */ + arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX, + work_ast_lock, arg); + if (arg->set == NULL) + GOTO(out, rc = -ENOMEM); + + ptlrpc_set_wait(NULL, arg->set); + ptlrpc_set_destroy(arg->set); + + rc = atomic_read(&arg->restart) ? -ERESTART : 0; + GOTO(out, rc); +out: + OBD_FREE_PTR(arg); + return rc; +} + +/** + * Try to grant all waiting locks on a resource. + * + * Calls ldlm_reprocess_queue on waiting queue. + * + * Typically called after some resource locks are cancelled to see + * if anything could be granted as a result of the cancellation. + */ +static void __ldlm_reprocess_all(struct ldlm_resource *res, + enum ldlm_process_intention intention, + struct ldlm_lock *hint) +{ + struct list_head rpc_list; +#ifdef HAVE_SERVER_SUPPORT + ldlm_reprocessing_policy reprocess; + struct obd_device *obd; + int rc; + + ENTRY; + + INIT_LIST_HEAD(&rpc_list); + /* Local lock trees don't get reprocessed. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + EXIT; + return; + } + + /* Disable reprocess during lock replay stage but allow during + * request replay stage. 
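+ * In other words: when obd_recovering is set and obd_req_replay_clients
+ * has dropped to zero, recovery has moved past the request replay
+ * stage, so the check below bails out instead of granting waiting
+ * locks in the middle of lock replay.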
+ */ + obd = ldlm_res_to_ns(res)->ns_obd; + if (obd->obd_recovering && + atomic_read(&obd->obd_req_replay_clients) == 0) + RETURN_EXIT; +restart: + lock_res(res); + reprocess = ldlm_get_reprocessing_policy(res); + reprocess(res, &res->lr_waiting, &rpc_list, intention, hint); + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, + LDLM_WORK_CP_AST); + if (rc == -ERESTART) { + LASSERT(list_empty(&rpc_list)); + goto restart; + } +#else + ENTRY; + + INIT_LIST_HEAD(&rpc_list); + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + EXIT; +} + +void ldlm_reprocess_all(struct ldlm_resource *res, struct ldlm_lock *hint) +{ + __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN, hint); +} +EXPORT_SYMBOL(ldlm_reprocess_all); + +static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + /* This is only called once after recovery done. LU-8306. */ + __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY, NULL); + return 0; +} + +/** + * Iterate through all resources on a namespace attempting to grant waiting + * locks. + */ +void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns) +{ + ENTRY; + + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL, 0); + } + EXIT; +} + +/** + * Helper function to call blocking AST for LDLM lock \a lock in a + * "cancelling" mode. + */ +void ldlm_cancel_callback(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + if (!ldlm_is_cancel(lock)) { + ldlm_set_cancel(lock); + if (lock->l_blocking_ast) { + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, NULL, lock->l_ast_data, + LDLM_CB_CANCELING); + lock_res_and_lock(lock); + } else { + LDLM_DEBUG(lock, "no blocking ast"); + } + + /* only canceller can set bl_done bit */ + ldlm_set_bl_done(lock); + wake_up_all(&lock->l_waitq); + } else if (!ldlm_is_bl_done(lock)) { + struct l_wait_info lwi = { 0 }; + + /* The lock is guaranteed to have been canceled once + * returning from this function. */ + unlock_res_and_lock(lock); + l_wait_event(lock->l_waitq, is_bl_done(lock), &lwi); + lock_res_and_lock(lock); + } +} + +/** + * Remove skiplist-enabled LDLM lock \a req from granted list + */ +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) +{ + if (req->l_resource->lr_type != LDLM_PLAIN && + req->l_resource->lr_type != LDLM_IBITS) + return; + + list_del_init(&req->l_sl_policy); + list_del_init(&req->l_sl_mode); +} + +/** + * Attempts to cancel LDLM lock \a lock that has no reader/writer references. + */ +void ldlm_lock_cancel(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + struct ldlm_namespace *ns; + ENTRY; + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + /* Please do not, no matter how tempting, remove this LBUG without + * talking to me first. -phik */ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + unlock_res_and_lock(lock); + LBUG(); + } + + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + /* Releases cancel callback. 
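+ * I.e. ldlm_cancel_callback() either runs the blocking AST in
+ * LDLM_CB_CANCELING mode (for the first canceller) or waits on
+ * l_waitq until the thread already doing the cancel sets the
+ * bl_done bit.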
*/ + ldlm_cancel_callback(lock); + + /* Yes, second time, just in case it was added again while we were + * running with no res lock in ldlm_cancel_callback */ + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + + if (ldlm_is_granted(lock)) + ldlm_pool_del(&ns->ns_pool, lock); + + /* Make sure we will not be called again for same lock what is possible + * if not to zero out lock->l_granted_mode */ + lock->l_granted_mode = LCK_MINMODE; + unlock_res_and_lock(lock); + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_cancel); + +/** + * Set opaque data into the lock that only makes sense to upper layer. + */ +int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int rc = -EINVAL; + ENTRY; + + if (lock) { + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + rc = 0; + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_lock_set_data); + +struct export_cl_data { + const struct lu_env *ecl_env; + struct obd_export *ecl_exp; + int ecl_loop; +}; + +static void ldlm_cancel_lock_for_export(struct obd_export *exp, + struct ldlm_lock *lock, + struct export_cl_data *ecl) +{ + struct ldlm_resource *res; + + res = ldlm_resource_getref(lock->l_resource); + + ldlm_lvbo_update(res, lock, NULL, 1); + ldlm_lock_cancel(lock); + if (!exp->exp_obd->obd_stopping) + ldlm_reprocess_all(res, lock); + ldlm_resource_putref(res); + + ecl->ecl_loop++; + if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) { + CDEBUG(D_INFO, "Export %p, %d locks cancelled.\n", + exp, ecl->ecl_loop); + } +} + +/** + * Iterator function for ldlm_export_cancel_locks. + * Cancels passed locks. + */ +static int +ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct export_cl_data *ecl = (struct export_cl_data *)data; + struct obd_export *exp = ecl->ecl_exp; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + + LDLM_LOCK_GET(lock); + ldlm_cancel_lock_for_export(exp, lock, ecl); + LDLM_LOCK_RELEASE(lock); + + return 0; +} + +/** + * Cancel all blocked locks for given export. + * + * Typically called on client disconnection/eviction + */ +int ldlm_export_cancel_blocked_locks(struct obd_export *exp) +{ + struct lu_env env; + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; + + while (!list_empty(&exp->exp_bl_list)) { + struct ldlm_lock *lock; + + spin_lock_bh(&exp->exp_bl_list_lock); + if (!list_empty(&exp->exp_bl_list)) { + lock = list_entry(exp->exp_bl_list.next, + struct ldlm_lock, l_exp_list); + LDLM_LOCK_GET(lock); + list_del_init(&lock->l_exp_list); + } else { + lock = NULL; + } + spin_unlock_bh(&exp->exp_bl_list_lock); + + if (lock == NULL) + break; + + ldlm_cancel_lock_for_export(exp, lock, &ecl); + LDLM_LOCK_RELEASE(lock); + } + + lu_env_fini(&env); + + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " + "left on hash table %d.\n", exp, ecl.ecl_loop, + atomic_read(&exp->exp_lock_hash->hs_count)); + + return ecl.ecl_loop; +} + +/** + * Cancel all locks for given export. 
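+ * Unlike ldlm_export_cancel_blocked_locks() above, which only walks
+ * the export's exp_bl_list, this iterates the whole exp_lock_hash
+ * via cfs_hash_for_each_empty().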
+ * + * Typically called after client disconnection/eviction + */ +int ldlm_export_cancel_locks(struct obd_export *exp) +{ + struct export_cl_data ecl; + struct lu_env env; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; + ecl.ecl_exp = exp; + ecl.ecl_loop = 0; + + cfs_hash_for_each_empty(exp->exp_lock_hash, + ldlm_cancel_locks_for_export_cb, &ecl); + + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " + "left on hash table %d.\n", exp, ecl.ecl_loop, + atomic_read(&exp->exp_lock_hash->hs_count)); + + if (ecl.ecl_loop > 0 && + atomic_read(&exp->exp_lock_hash->hs_count) == 0 && + exp->exp_obd->obd_stopping) + ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace); + + lu_env_fini(&env); + + return ecl.ecl_loop; +} + +/** + * Downgrade an PW/EX lock to COS | CR mode. + * + * A lock mode convertion from PW/EX mode to less conflict mode. The + * convertion may fail if lock was canceled before downgrade, but it doesn't + * indicate any problem, because such lock has no reader or writer, and will + * be released soon. + * + * Used by Commit on Sharing (COS) code to force object changes commit in case + * of conflict. Converted lock is considered as new lock and all blocking AST + * things are cleared, so any pending or new blocked lock on that lock will + * cause new call to blocking_ast and force resource object commit. + * + * Also used by layout_change to replace EX lock to CR lock. + * + * \param lock A lock to convert + * \param new_mode new lock mode + */ +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) +{ +#ifdef HAVE_SERVER_SUPPORT + ENTRY; + + LASSERT(new_mode == LCK_COS || new_mode == LCK_CR); + + lock_res_and_lock(lock); + + if (!(lock->l_granted_mode & (LCK_PW | LCK_EX))) { + unlock_res_and_lock(lock); + + LASSERT(lock->l_granted_mode == LCK_MINMODE); + LDLM_DEBUG(lock, "lock was canceled before downgrade"); + RETURN_EXIT; + } + + ldlm_resource_unlink_lock(lock); + /* + * Remove the lock from pool as it will be added again in + * ldlm_grant_lock() called below. + */ + ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); + + /* Consider downgraded lock as a new lock and clear all states + * related to a previous blocking AST processing. + */ + ldlm_clear_blocking_data(lock); + + lock->l_req_mode = new_mode; + ldlm_grant_lock(lock, NULL); + unlock_res_and_lock(lock); + + ldlm_reprocess_all(lock->l_resource, lock); + + EXIT; +#endif +} +EXPORT_SYMBOL(ldlm_lock_mode_downgrade); + +/** + * Print lock with lock handle \a lockh description into debug log. + * + * Used when printing all locks on a resource for debug purposes. + */ +void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + return; + + LDLM_DEBUG_LIMIT(level, lock, "###"); + + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_dump_handle); + +/** + * Print lock information with custom message into debug log. + * Helper function. + */ +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) 
+{ + va_list args; + struct obd_export *exp = lock->l_export; + struct ldlm_resource *resource = NULL; + char *nid = "local"; + + /* on server-side resource of lock doesn't change */ + if ((lock->l_flags & LDLM_FL_NS_SRV) != 0) { + if (lock->l_resource != NULL) + resource = ldlm_resource_getref(lock->l_resource); + } else if (spin_trylock(&lock->l_lock)) { + if (lock->l_resource != NULL) + resource = ldlm_resource_getref(lock->l_resource); + spin_unlock(&lock->l_lock); + } + + va_start(args, fmt); + + if (exp && exp->exp_connection) { + nid = obd_export_nid2str(exp); + } else if (exp && exp->exp_obd != NULL) { + struct obd_import *imp = exp->exp_obd->u.cli.cl_import; + nid = obd_import_nid2str(imp); + } + + if (resource == NULL) { + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s " + "remote: %#llx expref: %d pid: %u timeout: %lld " + "lvb_type: %d\n", + lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type); + va_end(args); + return; + } + + switch (resource->lr_type) { + case LDLM_EXTENT: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: "DLDLMRES" rrc: %d type: %s [%llu->%llu] " + "(req %llu->%llu) flags: %#llx nid: %s remote: " + "%#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, + lock->l_req_extent.start, lock->l_req_extent.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + + case LDLM_FLOCK: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: "DLDLMRES" rrc: %d type: %s pid: %d " + "[%llu->%llu] flags: %#llx nid: %s " + "remote: %#llx expref: %d pid: %u timeout: %lld\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_flock.pid, + lock->l_policy_data.l_flock.start, + lock->l_policy_data.l_flock.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? 
atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout); + break; + + case LDLM_IBITS: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: "DLDLMRES" bits %#llx/%#llx rrc: %d type: %s " + "flags: %#llx nid: %s remote: %#llx expref: %d " + "pid: %u timeout: %lld lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + lock->l_policy_data.l_inodebits.bits, + lock->l_policy_data.l_inodebits.try_bits, + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + + default: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: "DLDLMRES" rrc: %d type: %s flags: %#llx " + "nid: %s remote: %#llx expref: %d pid: %u " + "timeout: %lld lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + } + va_end(args); + ldlm_resource_putref(resource); +} +EXPORT_SYMBOL(_ldlm_lock_debug); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c new file mode 100644 index 0000000000000..ac7a9910e4d45 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c @@ -0,0 +1,3342 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ldlm/ldlm_lockd.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +static int ldlm_num_threads; +module_param(ldlm_num_threads, int, 0444); +MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); + +static unsigned int ldlm_cpu_bind = 1; +module_param(ldlm_cpu_bind, uint, 0444); +MODULE_PARM_DESC(ldlm_cpu_bind, + "bind DLM service threads to particular CPU partitions"); + +static char *ldlm_cpts; +module_param(ldlm_cpts, charp, 0444); +MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); + +static DEFINE_MUTEX(ldlm_ref_mutex); +static int ldlm_refcount; + +struct kobject *ldlm_kobj; +struct kset *ldlm_ns_kset; +struct kset *ldlm_svc_kset; + +/* LDLM state */ + +static struct ldlm_state *ldlm_state; + +/* timeout for initial callback (AST) reply (bz10399) + * Due to having to send a 32 bit time value over the + * wire return it as timeout_t instead of time64_t + */ +static inline timeout_t ldlm_get_rq_timeout(void) +{ + /* Non-AT value */ + timeout_t timeout = min(ldlm_timeout, obd_timeout / 3); + + return timeout < 1 ? 1 : timeout; +} + +struct ldlm_bl_pool { + spinlock_t blp_lock; + + /* + * blp_prio_list is used for callbacks that should be handled + * as a priority. It is used for LDLM_FL_DISCARD_DATA requests. + * see bug 13843 + */ + struct list_head blp_prio_list; + + /* + * blp_list is used for all other callbacks which are likely + * to take longer to process. + */ + struct list_head blp_list; + + wait_queue_head_t blp_waitq; + struct completion blp_comp; + atomic_t blp_num_threads; + atomic_t blp_busy_threads; + int blp_min_threads; + int blp_max_threads; +}; + +struct ldlm_bl_work_item { + struct list_head blwi_entry; + struct ldlm_namespace *blwi_ns; + struct ldlm_lock_desc blwi_ld; + struct ldlm_lock *blwi_lock; + struct list_head blwi_head; + int blwi_count; + struct completion blwi_comp; + enum ldlm_cancel_flags blwi_flags; + int blwi_mem_pressure; +}; + +#ifdef HAVE_SERVER_SUPPORT + +/** + * Protects both waiting_locks_list and expired_lock_thread. + */ +static DEFINE_SPINLOCK(waiting_locks_spinlock); /* BH lock (timer) */ + +/** + * List for contended locks. + * + * As soon as a lock is contended, it gets placed on this list and + * expected time to get a response is filled in the lock. A special + * thread walks the list looking for locks that should be released and + * schedules client evictions for those that have not been released in + * time. + * + * All access to it should be under waiting_locks_spinlock. 
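+ *
+ * The usual life cycle of an entry is: ldlm_add_waiting_lock() puts a
+ * lock here and arms waiting_locks_timer for its l_callback_timeout,
+ * waiting_locks_callback() moves timed-out entries over to
+ * expired_lock_list, and expired_lock_main() then either prolongs
+ * still-busy locks or evicts the client that failed to cancel in time.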
+ */ +static LIST_HEAD(waiting_locks_list); +static void waiting_locks_callback(TIMER_DATA_TYPE unused); +static CFS_DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); + +enum elt_state { + ELT_STOPPED, + ELT_READY, + ELT_TERMINATE, +}; + +static DECLARE_WAIT_QUEUE_HEAD(expired_lock_wait_queue); +static enum elt_state expired_lock_thread_state = ELT_STOPPED; +static int expired_lock_dump; +static LIST_HEAD(expired_lock_list); + +static int ldlm_lock_busy(struct ldlm_lock *lock); +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout); +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds); + +static inline int have_expired_locks(void) +{ + int need_to_run; + + ENTRY; + spin_lock_bh(&waiting_locks_spinlock); + need_to_run = !list_empty(&expired_lock_list); + spin_unlock_bh(&waiting_locks_spinlock); + + RETURN(need_to_run); +} + +/** + * Check expired lock list for expired locks and time them out. + */ +static int expired_lock_main(void *arg) +{ + struct list_head *expired = &expired_lock_list; + struct l_wait_info lwi = { 0 }; + int do_dump; + + ENTRY; + + expired_lock_thread_state = ELT_READY; + wake_up(&expired_lock_wait_queue); + + while (1) { + l_wait_event(expired_lock_wait_queue, + have_expired_locks() || + expired_lock_thread_state == ELT_TERMINATE, + &lwi); + + spin_lock_bh(&waiting_locks_spinlock); + if (expired_lock_dump) { + spin_unlock_bh(&waiting_locks_spinlock); + + /* from waiting_locks_callback, but not in timer */ + libcfs_debug_dumplog(); + + spin_lock_bh(&waiting_locks_spinlock); + expired_lock_dump = 0; + } + + do_dump = 0; + + while (!list_empty(expired)) { + struct obd_export *export; + struct ldlm_lock *lock; + + lock = list_entry(expired->next, struct ldlm_lock, + l_pending_chain); + if ((void *)lock < LP_POISON + PAGE_SIZE && + (void *)lock >= LP_POISON) { + spin_unlock_bh(&waiting_locks_spinlock); + CERROR("free lock on elt list %p\n", lock); + LBUG(); + } + list_del_init(&lock->l_pending_chain); + if ((void *)lock->l_export < + LP_POISON + PAGE_SIZE && + (void *)lock->l_export >= LP_POISON) { + CERROR("lock with free export on elt list %p\n", + lock->l_export); + lock->l_export = NULL; + LDLM_ERROR(lock, "free export"); + /* release extra ref grabbed by + * ldlm_add_waiting_lock() or + * ldlm_failed_ast() */ + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (ldlm_is_destroyed(lock)) { + /* release the lock refcount where + * waiting_locks_callback() founds */ + LDLM_LOCK_RELEASE(lock); + continue; + } + export = class_export_lock_get(lock->l_export, lock); + spin_unlock_bh(&waiting_locks_spinlock); + + /* Check if we need to prolong timeout */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && + lock->l_callback_timeout != 0 && /* not AST error */ + ldlm_lock_busy(lock)) { + LDLM_DEBUG(lock, "prolong the busy lock"); + lock_res_and_lock(lock); + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock) >> 1); + unlock_res_and_lock(lock); + } else { + spin_lock_bh(&export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&export->exp_bl_list_lock); + + LDLM_ERROR(lock, + "lock callback timer expired after %llds: evicting client at %s ", + ktime_get_real_seconds() - + lock->l_blast_sent, + obd_export_nid2str(export)); + ldlm_lock_to_ns(lock)->ns_timeouts++; + do_dump++; + class_fail_export(export); + } + class_export_lock_put(export, lock); + /* release extra ref grabbed by ldlm_add_waiting_lock() + * or ldlm_failed_ast() */ + LDLM_LOCK_RELEASE(lock); + + 
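+ /* re-take the BH-safe spinlock before looking at
+  * the next entry on the expired list */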
spin_lock_bh(&waiting_locks_spinlock); + } + spin_unlock_bh(&waiting_locks_spinlock); + + if (do_dump && obd_dump_on_eviction) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + if (expired_lock_thread_state == ELT_TERMINATE) + break; + } + + expired_lock_thread_state = ELT_STOPPED; + wake_up(&expired_lock_wait_queue); + RETURN(0); +} + +/** + * Check if there is a request in the export request list + * which prevents the lock canceling. + */ +static int ldlm_lock_busy(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + int match = 0; + ENTRY; + + if (lock->l_export == NULL) + return 0; + + spin_lock(&lock->l_export->exp_rpc_lock); + list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, + rq_exp_list) { + if (req->rq_ops->hpreq_lock_match) { + match = req->rq_ops->hpreq_lock_match(req, lock); + if (match) + break; + } + } + spin_unlock(&lock->l_export->exp_rpc_lock); + RETURN(match); +} + +/* This is called from within a timer interrupt and cannot schedule */ +static void waiting_locks_callback(TIMER_DATA_TYPE unused) +{ + struct ldlm_lock *lock; + int need_dump = 0; + + spin_lock_bh(&waiting_locks_spinlock); + while (!list_empty(&waiting_locks_list)) { + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + if (lock->l_callback_timeout > ktime_get_seconds() || + lock->l_req_mode == LCK_GROUP) + break; + + /* no needs to take an extra ref on the lock since it was in + * the waiting_locks_list and ldlm_add_waiting_lock() + * already grabbed a ref */ + list_del(&lock->l_pending_chain); + list_add(&lock->l_pending_chain, &expired_lock_list); + need_dump = 1; + } + + if (!list_empty(&expired_lock_list)) { + if (obd_dump_on_timeout && need_dump) + expired_lock_dump = __LINE__; + + wake_up(&expired_lock_wait_queue); + } + + /* + * Make sure the timer will fire again if we have any locks + * left. + */ + if (!list_empty(&waiting_locks_list)) { + unsigned long timeout_jiffies; + + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout); + mod_timer(&waiting_locks_timer, timeout_jiffies); + } + spin_unlock_bh(&waiting_locks_spinlock); +} + +/** + * Add lock to the list of contended locks. + * + * Indicate that we're waiting for a client to call us back cancelling a given + * lock. We add it to the pending-callback chain, and schedule the lock-timeout + * timer to fire appropriately. (We round up to the next second, to avoid + * floods of timer firings during periods of high lock contention and traffic). + * As done by ldlm_add_waiting_lock(), the caller must grab a lock reference + * if it has been added to the waiting list (1 is returned). + * + * Called with the namespace lock held. 
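+ *
+ * The caller-side pattern is essentially:
+ *
+ *     spin_lock_bh(&waiting_locks_spinlock);
+ *     ret = __ldlm_add_waiting_lock(lock, timeout);
+ *     if (ret)
+ *             LDLM_LOCK_GET(lock);
+ *     spin_unlock_bh(&waiting_locks_spinlock);
+ *
+ * as done by ldlm_add_waiting_lock() below.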
+ */ +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds) +{ + unsigned long timeout_jiffies; + time64_t timeout; + + if (!list_empty(&lock->l_pending_chain)) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT) || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + seconds = 1; + + timeout = ktime_get_seconds() + seconds; + if (likely(timeout > lock->l_callback_timeout)) + lock->l_callback_timeout = timeout; + + timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout); + + if (time_before(timeout_jiffies, waiting_locks_timer.expires) || + !timer_pending(&waiting_locks_timer)) + mod_timer(&waiting_locks_timer, timeout_jiffies); + + /* if the new lock has a shorter timeout than something earlier on + * the list, we'll wait the longer amount of time; no big deal. + */ + /* FIFO */ + list_add_tail(&lock->l_pending_chain, &waiting_locks_list); + return 1; +} + +static void ldlm_add_blocked_lock(struct ldlm_lock *lock) +{ + spin_lock_bh(&lock->l_export->exp_bl_list_lock); + if (list_empty(&lock->l_exp_list)) { + if (!ldlm_is_granted(lock)) + list_add_tail(&lock->l_exp_list, + &lock->l_export->exp_bl_list); + else + list_add(&lock->l_exp_list, + &lock->l_export->exp_bl_list); + } + spin_unlock_bh(&lock->l_export->exp_bl_list_lock); + + /* A blocked lock is added. Adjust the position in + * the stale list if the export is in the list. + * If export is stale and not in the list - it is being + * processed and will be placed on the right position + * on obd_stale_export_put(). */ + if (!list_empty(&lock->l_export->exp_stale_list)) + obd_stale_export_adjust(lock->l_export); +} + +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout) +{ + int ret; + + /* NB: must be called with hold of lock_res_and_lock() */ + LASSERT(ldlm_is_res_locked(lock)); + LASSERT(!ldlm_is_cancel_on_block(lock)); + + /* Do not put cross-MDT lock in the waiting list, since we + * will not evict it due to timeout for now */ + if (lock->l_export != NULL && + (exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS)) + return 0; + + spin_lock_bh(&waiting_locks_spinlock); + if (ldlm_is_cancel(lock)) { + spin_unlock_bh(&waiting_locks_spinlock); + return 0; + } + + if (ldlm_is_destroyed(lock)) { + static time64_t next; + + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)"); + if (ktime_get_seconds() > next) { + next = ktime_get_seconds() + 14400; + libcfs_debug_dumpstack(NULL); + } + return 0; + } + + ldlm_set_waited(lock); + lock->l_blast_sent = ktime_get_real_seconds(); + ret = __ldlm_add_waiting_lock(lock, timeout); + if (ret) { + /* grab ref on the lock if it has been added to the + * waiting list */ + LDLM_LOCK_GET(lock); + } + spin_unlock_bh(&waiting_locks_spinlock); + + if (ret) + ldlm_add_blocked_lock(lock); + + LDLM_DEBUG(lock, "%sadding to wait list(timeout: %lld, AT: %s)", + ret == 0 ? "not re-" : "", timeout, + AT_OFF ? "off" : "on"); + return ret; +} + +/** + * Remove a lock from the pending list, likely because it had its cancellation + * callback arrive without incident. This adjusts the lock-timeout timer if + * needed. Returns 0 if the lock wasn't pending after all, 1 if it was. + * As done by ldlm_del_waiting_lock(), the caller must release the lock + * reference when the lock is removed from any list (1 is returned). + * + * Called with namespace lock held. 
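+ * When the head of the list is removed, the timer is either cancelled
+ * (nothing left to wait for) or re-armed for the next entry's
+ * l_callback_timeout, see the del_timer()/mod_timer() calls below.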
+ */ +static int __ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + struct list_head *list_next; + + if (list_empty(&lock->l_pending_chain)) + return 0; + + list_next = lock->l_pending_chain.next; + if (lock->l_pending_chain.prev == &waiting_locks_list) { + /* Removing the head of the list, adjust timer. */ + if (list_next == &waiting_locks_list) { + /* No more, just cancel. */ + del_timer(&waiting_locks_timer); + } else { + struct ldlm_lock *next; + + next = list_entry(list_next, struct ldlm_lock, + l_pending_chain); + mod_timer(&waiting_locks_timer, + cfs_time_seconds(next->l_callback_timeout)); + } + } + list_del_init(&lock->l_pending_chain); + + return 1; +} + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + int ret; + + if (lock->l_export == NULL) { + /* We don't have a "waiting locks list" on clients. */ + CDEBUG(D_DLMTRACE, "Client lock %p : no-op\n", lock); + return 0; + } + + spin_lock_bh(&waiting_locks_spinlock); + ret = __ldlm_del_waiting_lock(lock); + ldlm_clear_waited(lock); + spin_unlock_bh(&waiting_locks_spinlock); + + /* remove the lock out of export blocking list */ + spin_lock_bh(&lock->l_export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&lock->l_export->exp_bl_list_lock); + + if (ret) { + /* release lock ref if it has indeed been removed + * from a list */ + LDLM_LOCK_RELEASE(lock); + } + + LDLM_DEBUG(lock, "%s", ret == 0 ? "wasn't waiting" : "removed"); + return ret; +} + +/** + * Prolong the contended lock waiting time. + * + * Called with namespace lock held. + */ +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) +{ + if (lock->l_export == NULL) { + /* We don't have a "waiting locks list" on clients. */ + LDLM_DEBUG(lock, "client lock: no-op"); + return 0; + } + + if (exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) { + /* We don't have a "waiting locks list" on OSP. */ + LDLM_DEBUG(lock, "MDS-MDS lock: no-op"); + return 0; + } + + spin_lock_bh(&waiting_locks_spinlock); + + if (list_empty(&lock->l_pending_chain)) { + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_DEBUG(lock, "wasn't waiting"); + return 0; + } + + /* we remove/add the lock to the waiting list, so no needs to + * release/take a lock reference */ + __ldlm_del_waiting_lock(lock); + __ldlm_add_waiting_lock(lock, timeout); + spin_unlock_bh(&waiting_locks_spinlock); + + LDLM_DEBUG(lock, "refreshed"); + return 1; +} +EXPORT_SYMBOL(ldlm_refresh_waiting_lock); + +#else /* HAVE_SERVER_SUPPORT */ + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + RETURN(0); +} + +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) +{ + RETURN(0); +} + +#endif /* !HAVE_SERVER_SUPPORT */ + +#ifdef HAVE_SERVER_SUPPORT + +/** + * Calculate the per-export Blocking timeout (covering BL AST, data flush, + * lock cancel, and their replies). Used for lock callback timeout and AST + * re-send period. + * + * \param[in] lock lock which is getting the blocking callback + * + * \retval timeout in seconds to wait for the client reply + */ +time64_t ldlm_bl_timeout(struct ldlm_lock *lock) +{ + time64_t timeout; + + if (AT_OFF) + return obd_timeout / 2; + + /* Since these are non-updating timeouts, we should be conservative. + * Take more than usually, 150% + * It would be nice to have some kind of "early reply" mechanism for + * lock callbacks too... 
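+ *
+ * For example, with an adaptive estimate of 20 seconds in
+ * exp_bl_lock_at this returns max(20 + 10, ldlm_enqueue_min),
+ * i.e. 30 seconds unless ldlm_enqueue_min is larger.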
*/ + timeout = at_get(&lock->l_export->exp_bl_lock_at); + return max(timeout + (timeout >> 1), (time64_t)ldlm_enqueue_min); +} +EXPORT_SYMBOL(ldlm_bl_timeout); + +/** + * Perform lock cleanup if AST sending failed. + */ +static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, + const char *ast_type) +{ + LCONSOLE_ERROR_MSG(0x138, "%s: A client on nid %s was evicted due " + "to a lock %s callback time out: rc %d\n", + lock->l_export->exp_obd->obd_name, + obd_export_nid2str(lock->l_export), ast_type, rc); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + spin_lock_bh(&waiting_locks_spinlock); + if (__ldlm_del_waiting_lock(lock) == 0) + /* the lock was not in any list, grab an extra ref before adding + * the lock to the expired list */ + LDLM_LOCK_GET(lock); + lock->l_callback_timeout = 0; /* differentiate it from expired locks */ + list_add(&lock->l_pending_chain, &expired_lock_list); + wake_up(&expired_lock_wait_queue); + spin_unlock_bh(&waiting_locks_spinlock); +} + +/** + * Perform lock cleanup if AST reply came with error. + */ +static int ldlm_handle_ast_error(struct ldlm_lock *lock, + struct ptlrpc_request *req, int rc, + const char *ast_type) +{ + struct lnet_process_id peer = req->rq_import->imp_connection->c_peer; + + if (!req->rq_replied || (rc && rc != -EINVAL)) { + if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, + "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)", + ast_type, req, req->rq_xid, + libcfs_nid2str(peer.nid)); + ldlm_lock_cancel(lock); + rc = -ERESTART; + } else if (rc == -ENODEV || rc == -ESHUTDOWN || + (rc == -EIO && + req->rq_import->imp_state == LUSTRE_IMP_CLOSED)) { + /* Upon umount process the AST fails because cannot be + * sent. This shouldn't lead to the client eviction. + * -ENODEV error is returned by ptl_send_rpc() for + * new request in such import. + * -SHUTDOWN is returned by ptlrpc_import_delay_req() + * if imp_invalid is set or obd_no_recov. + * Meanwhile there is also check for LUSTRE_IMP_CLOSED + * in ptlrpc_import_delay_req() as well with -EIO code. + * In all such cases errors are ignored. + */ + LDLM_DEBUG(lock, "%s AST can't be sent due to a server" + " %s failure or umount process: rc = %d\n", + ast_type, + req->rq_import->imp_obd->obd_name, rc); + } else { + LDLM_ERROR(lock, + "client (nid %s) %s %s AST (req@%p x%llu status %d rc %d), evict it", + libcfs_nid2str(peer.nid), + req->rq_replied ? "returned error from" : + "failed to reply to", + ast_type, req, req->rq_xid, + (req->rq_repmsg != NULL) ? + lustre_msg_get_status(req->rq_repmsg) : 0, + rc); + ldlm_failed_ast(lock, rc, ast_type); + } + return rc; + } + + if (rc == -EINVAL) { + struct ldlm_resource *res = lock->l_resource; + + LDLM_DEBUG(lock, + "client (nid %s) returned %d from %s AST (req@%p x%llu) - normal race", + libcfs_nid2str(peer.nid), + req->rq_repmsg ? + lustre_msg_get_status(req->rq_repmsg) : -1, + ast_type, req, req->rq_xid); + if (res) { + /* update lvbo to return proper attributes. 
+ * see bug 23174 */ + ldlm_resource_getref(res); + ldlm_lvbo_update(res, lock, NULL, 1); + ldlm_resource_putref(res); + } + ldlm_lock_cancel(lock); + rc = -ERESTART; + } + + return rc; +} + +static int ldlm_cb_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct ldlm_cb_async_args *ca = args; + struct ldlm_lock *lock = ca->ca_lock; + struct ldlm_cb_set_arg *arg = ca->ca_set_arg; + ENTRY; + + LASSERT(lock != NULL); + + switch (arg->type) { + case LDLM_GL_CALLBACK: + /* Update the LVB from disk if the AST failed + * (this is a legal race) + * + * - Glimpse callback of local lock just returns + * -ELDLM_NO_LOCK_DATA. + * - Glimpse callback of remote lock might return + * -ELDLM_NO_LOCK_DATA when inode is cleared. LU-274 + */ + if (unlikely(arg->gl_interpret_reply)) { + rc = arg->gl_interpret_reply(NULL, req, args, rc); + } else if (rc == -ELDLM_NO_LOCK_DATA) { + LDLM_DEBUG(lock, + "lost race - client has a lock but no inode"); + ldlm_lvbo_update(lock->l_resource, lock, NULL, 1); + } else if (rc != 0) { + rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); + } else { + rc = ldlm_lvbo_update(lock->l_resource, + lock, req, 1); + } + break; + case LDLM_BL_CALLBACK: + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "blocking"); + break; + case LDLM_CP_CALLBACK: + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "completion"); + break; + default: + LDLM_ERROR(lock, "invalid opcode for lock callback %d", + arg->type); + LBUG(); + } + + /* release extra reference taken in ldlm_ast_fini() */ + LDLM_LOCK_RELEASE(lock); + + if (rc == -ERESTART) + atomic_inc(&arg->restart); + + RETURN(0); +} + +static void ldlm_update_resend(struct ptlrpc_request *req, void *data) +{ + struct ldlm_cb_async_args *ca = data; + struct ldlm_lock *lock = ca->ca_lock; + + ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock)); +} + +static inline int ldlm_ast_fini(struct ptlrpc_request *req, + struct ldlm_cb_set_arg *arg, + struct ldlm_lock *lock, + int instant_cancel) +{ + int rc = 0; + ENTRY; + + if (unlikely(instant_cancel)) { + rc = ptl_send_rpc(req, 1); + ptlrpc_req_finished(req); + if (rc == 0) + atomic_inc(&arg->restart); + } else { + LDLM_LOCK_GET(lock); + ptlrpc_set_add_req(arg->set, req); + } + + RETURN(rc); +} + +/** + * Check if there are requests in the export request list which prevent + * the lock canceling and make these requests high priority ones. + */ +static void ldlm_lock_reorder_req(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + ENTRY; + + if (lock->l_export == NULL) { + LDLM_DEBUG(lock, "client lock: no-op"); + RETURN_EXIT; + } + + spin_lock(&lock->l_export->exp_rpc_lock); + list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, + rq_exp_list) { + /* Do not process requests that were not yet added to there + * incoming queue or were already removed from there for + * processing. We evaluate ptlrpc_nrs_req_can_move() without + * holding svcpt->scp_req_lock, and then redo the check with + * the lock held once we need to obtain a reliable result. + */ + if (ptlrpc_nrs_req_can_move(req) && + req->rq_ops->hpreq_lock_match && + req->rq_ops->hpreq_lock_match(req, lock)) + ptlrpc_nrs_req_hp_move(req); + } + spin_unlock(&lock->l_export->exp_rpc_lock); + EXIT; +} + +/** + * ->l_blocking_ast() method for server-side locks. This is invoked when newly + * enqueued server lock conflicts with given one. + * + * Sends blocking AST RPC to the client owning that lock; arms timeout timer + * to wait for client response. 
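+ *
+ * Two cases are handled below: a LDLM_FL_CANCEL_ON_BLOCK lock is
+ * cancelled right away and its AST is sent with no resend allowed
+ * (instant cancel), otherwise the lock goes onto the waiting list via
+ * ldlm_add_waiting_lock() and the request gets a resend callback
+ * bounded by ldlm_bl_timeout().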
+ */ +int ldlm_server_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct ldlm_cb_async_args *ca; + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + int instant_cancel = 0; + int rc = 0; + ENTRY; + + if (flag == LDLM_CB_CANCELING) + /* Don't need to do anything here. */ + RETURN(0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_SRV_BL_AST)) { + LDLM_DEBUG(lock, "dropping BL AST"); + RETURN(0); + } + + LASSERT(lock); + LASSERT(data != NULL); + if (lock->l_export->exp_obd->obd_recovering != 0) + LDLM_ERROR(lock, "BUG 6063: lock collide during recovery"); + + ldlm_lock_reorder_req(lock); + + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + &RQF_LDLM_BL_CALLBACK, + LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); + + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + req->rq_interpret_reply = ldlm_cb_interpret; + + lock_res_and_lock(lock); + if (ldlm_is_destroyed(lock)) { + /* What's the point? */ + unlock_res_and_lock(lock); + ptlrpc_req_finished(req); + RETURN(0); + } + + if (!ldlm_is_granted(lock)) { + /* this blocking AST will be communicated as part of the + * completion AST instead */ + ldlm_add_blocked_lock(lock); + ldlm_set_waited(lock); + unlock_res_and_lock(lock); + + ptlrpc_req_finished(req); + LDLM_DEBUG(lock, "lock not granted, not sending blocking AST"); + RETURN(0); + } + + if (ldlm_is_cancel_on_block(lock)) + instant_cancel = 1; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + body->lock_desc = *desc; + body->lock_flags |= ldlm_flags_to_wire(lock->l_flags & LDLM_FL_AST_MASK); + + LDLM_DEBUG(lock, "server preparing blocking AST"); + + ptlrpc_request_set_replen(req); + ldlm_set_cbpending(lock); + if (instant_cancel) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + + req->rq_no_resend = 1; + } else { + LASSERT(ldlm_is_granted(lock)); + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); + unlock_res_and_lock(lock); + + /* Do not resend after lock callback timeout */ + req->rq_delay_limit = ldlm_bl_timeout(lock); + req->rq_resend_cb = ldlm_update_resend; + } + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_alloc_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, instant_cancel); + + RETURN(rc); +} + +/** + * ->l_completion_ast callback for a remote lock in server namespace. + * + * Sends AST to the client notifying it of lock granting. 
If initial + * lock response was not sent yet, instead of sending another RPC, just + * mark the lock as granted and client will understand + */ +int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + struct ldlm_cb_async_args *ca; + int instant_cancel = 0; + int rc = 0; + int lvb_len; + ENTRY; + + LASSERT(lock != NULL); + LASSERT(data != NULL); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_SRV_CP_AST)) { + LDLM_DEBUG(lock, "dropping CP AST"); + RETURN(0); + } + + req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse, + &RQF_LDLM_CP_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); + + /* server namespace, doesn't need lock */ + lvb_len = ldlm_lvbo_size(lock); + /* LU-3124 & LU-2187: to not return layout in completion AST because + * it may deadlock for LU-2187, or client may not have enough space + * for large layout. The layout will be returned to client with an + * extra RPC to fetch xattr.lov */ + if (ldlm_has_layout(lock)) + lvb_len = 0; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len); + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + req->rq_interpret_reply = ldlm_cb_interpret; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + + body->lock_handle[0] = lock->l_remote_handle; + body->lock_flags = ldlm_flags_to_wire(flags); + ldlm_lock2desc(lock, &body->lock_desc); + if (lvb_len > 0) { + void *lvb = req_capsule_client_get(&req->rq_pill, &RMF_DLM_LVB); + lvb_len = ldlm_lvbo_fill(lock, lvb, &lvb_len); + if (lvb_len < 0) { + /* We still need to send the RPC to wake up the blocked + * enqueue thread on the client. + * + * Consider old client, there is no better way to notify + * the failure, just zero-sized the LVB, then the client + * will fail out as "-EPROTO". */ + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, 0, + RCL_CLIENT); + instant_cancel = 1; + } else { + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, lvb_len, + RCL_CLIENT); + } + } + + LDLM_DEBUG(lock, "server preparing completion AST"); + + ptlrpc_request_set_replen(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + /* We only send real blocking ASTs after the lock is granted */ + lock_res_and_lock(lock); + if (ldlm_is_ast_sent(lock)) { + body->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); + /* Copy AST flags like LDLM_FL_DISCARD_DATA. */ + body->lock_flags |= ldlm_flags_to_wire(lock->l_flags & + LDLM_FL_AST_MASK); + + /* We might get here prior to ldlm_handle_enqueue setting + * LDLM_FL_CANCEL_ON_BLOCK flag. 
Then we will put this lock + * into waiting list, but this is safe and similar code in + * ldlm_handle_enqueue will call ldlm_lock_cancel() still, + * that would not only cancel the lock, but will also remove + * it from waiting list */ + if (ldlm_is_cancel_on_block(lock)) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + + instant_cancel = 1; + req->rq_no_resend = 1; + + lock_res_and_lock(lock); + } else { + /* start the lock-timeout clock */ + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); + /* Do not resend after lock callback timeout */ + req->rq_delay_limit = ldlm_bl_timeout(lock); + req->rq_resend_cb = ldlm_update_resend; + } + } + unlock_res_and_lock(lock); + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, instant_cancel); + + RETURN(lvb_len < 0 ? lvb_len : rc); +} + +/** + * Server side ->l_glimpse_ast handler for client locks. + * + * Sends glimpse AST to the client and waits for reply. Then updates + * lvbo with the result. + */ +int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) +{ + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + struct ldlm_cb_async_args *ca; + int rc; + struct req_format *req_fmt; + ENTRY; + + LASSERT(lock != NULL); + + if (arg->gl_desc != NULL) + /* There is a glimpse descriptor to pack */ + req_fmt = &RQF_LDLM_GL_CALLBACK_DESC; + else + req_fmt = &RQF_LDLM_GL_CALLBACK; + + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + req_fmt, LUSTRE_DLM_VERSION, + LDLM_GL_CALLBACK); + + if (req == NULL) + RETURN(-ENOMEM); + + if (arg->gl_desc != NULL) { + /* copy the GL descriptor */ + union ldlm_gl_desc *desc; + desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); + *desc = *arg->gl_desc; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + ldlm_lock2desc(lock, &body->lock_desc); + + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + /* server namespace, doesn't need lock */ + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + ldlm_lvbo_size(lock)); + ptlrpc_request_set_replen(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_alloc_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + req->rq_interpret_reply = ldlm_cb_interpret; + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, 0); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_server_glimpse_ast); + +int ldlm_glimpse_locks(struct ldlm_resource *res, + struct list_head *gl_work_list) +{ + int rc; + ENTRY; + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), gl_work_list, + LDLM_WORK_GL_AST); + if (rc == -ERESTART) + ldlm_reprocess_all(res, NULL); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_glimpse_locks); + +/* return LDLM lock associated with a lock callback request */ +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req) +{ + struct ldlm_cb_async_args *ca; + struct ldlm_lock *lock; + ENTRY; + + ca = ptlrpc_req_async_args(req); + lock = ca->ca_lock; + if (lock == NULL) + RETURN(ERR_PTR(-EFAULT)); + + 
RETURN(lock); +} +EXPORT_SYMBOL(ldlm_request_lock); + +/** + * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc + * service threads to carry out client lock enqueueing requests. + */ +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, + struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs) +{ + struct ldlm_reply *dlm_rep; + __u64 flags; + enum ldlm_error err = ELDLM_OK; + struct ldlm_lock *lock = NULL; + void *cookie = NULL; + int rc = 0; + struct ldlm_resource *res = NULL; + const struct lu_env *env = req->rq_svc_thread->t_env; + ENTRY; + + LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); + + ldlm_request_cancel(req, dlm_req, LDLM_ENQUEUE_CANCEL_OFF, LATF_SKIP); + flags = ldlm_flags_from_wire(dlm_req->lock_flags); + + LASSERT(req->rq_export); + + /* for intent enqueue the stat will be updated inside intent policy */ + if (ptlrpc_req2svc(req)->srv_stats != NULL && + !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT)) + ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats); + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC); + + if (unlikely(dlm_req->lock_desc.l_resource.lr_type < LDLM_MIN_TYPE || + dlm_req->lock_desc.l_resource.lr_type >= LDLM_MAX_TYPE)) { + DEBUG_REQ(D_ERROR, req, "invalid lock request type %d", + dlm_req->lock_desc.l_resource.lr_type); + GOTO(out, rc = -EFAULT); + } + + if (unlikely(dlm_req->lock_desc.l_req_mode <= LCK_MINMODE || + dlm_req->lock_desc.l_req_mode >= LCK_MAXMODE || + dlm_req->lock_desc.l_req_mode & + (dlm_req->lock_desc.l_req_mode-1))) { + DEBUG_REQ(D_ERROR, req, "invalid lock request mode %d", + dlm_req->lock_desc.l_req_mode); + GOTO(out, rc = -EFAULT); + } + + if (unlikely((flags & LDLM_FL_REPLAY) || + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) { + /* Find an existing lock in the per-export lock hash */ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + lock = cfs_hash_lookup(req->rq_export->exp_lock_hash, + (void *)&dlm_req->lock_handle[0]); + if (lock != NULL) { + DEBUG_REQ(D_DLMTRACE, req, "found existing lock cookie %#llx", + lock->l_handle.h_cookie); + flags |= LDLM_FL_RESENT; + GOTO(existing_lock, rc = 0); + } + } else { + if (ldlm_reclaim_full()) { + DEBUG_REQ(D_DLMTRACE, req, "Too many granted locks, " + "reject current enqueue request and let the " + "client retry later.\n"); + GOTO(out, rc = -EINPROGRESS); + } + } + + /* The lock's callback data might be set in the policy function */ + lock = ldlm_lock_create(ns, &dlm_req->lock_desc.l_resource.lr_name, + dlm_req->lock_desc.l_resource.lr_type, + dlm_req->lock_desc.l_req_mode, + cbs, NULL, 0, LVB_T_NONE); + if (IS_ERR(lock)) { + rc = PTR_ERR(lock); + lock = NULL; + GOTO(out, rc); + } + + lock->l_remote_handle = dlm_req->lock_handle[0]; + LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); + + /* Initialize resource lvb but not for a lock being replayed since + * Client already got lvb sent in this case. + * This must occur early since some policy methods assume resource + * lvb is available (lr_lvb_data != NULL). 
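+ * (For the replay case ldlm_lvbo_init() is skipped below and the
+ * reply LVB is later shrunk to zero size, see the LDLM_FL_REPLAY
+ * handling at the end of this handler.)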
+ */ + res = lock->l_resource; + if (!(flags & LDLM_FL_REPLAY)) { + /* non-replayed lock, delayed lvb init may need to be done */ + rc = ldlm_lvbo_init(res); + if (rc < 0) { + LDLM_DEBUG(lock, "delayed lvb init failed (rc %d)", rc); + GOTO(out, rc); + } + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_BLOCKED, obd_timeout * 2); + /* Don't enqueue a lock onto the export if it is been disonnected + * due to eviction (bug 3822) or server umount (bug 24324). + * Cancel it now instead. */ + if (req->rq_export->exp_disconnected) { + LDLM_ERROR(lock, "lock on disconnected export %p", + req->rq_export); + GOTO(out, rc = -ENOTCONN); + } + + lock->l_export = class_export_lock_get(req->rq_export, lock); + if (lock->l_export->exp_lock_hash) + cfs_hash_add(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, + &lock->l_exp_hash); + + /* Inherit the enqueue flags before the operation, because we do not + * keep the res lock on return and next operations (BL AST) may proceed + * without them. */ + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_FL_INHERIT_MASK); + + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) + lock->l_req_extent = lock->l_policy_data.l_extent; + else if (dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) + lock->l_policy_data.l_inodebits.try_bits = + dlm_req->lock_desc.l_policy_data.l_inodebits.try_bits; + +existing_lock: + if (flags & LDLM_FL_HAS_INTENT) { + /* In this case, the reply buffer is allocated deep in + * local_lock_enqueue by the policy function. */ + cookie = req; + } else { + /* based on the assumption that lvb size never changes during + * resource life time otherwise it need resource->lr_lock's + * protection */ + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER, ldlm_lvbo_size(lock)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) + GOTO(out, rc = -ENOMEM); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + } + + err = ldlm_lock_enqueue(env, ns, &lock, cookie, &flags); + if (err) { + if ((int)err < 0) + rc = (int)err; + GOTO(out, err); + } + + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + + ldlm_lock2desc(lock, &dlm_rep->lock_desc); + ldlm_lock2handle(lock, &dlm_rep->lock_handle); + + if (lock && lock->l_resource->lr_type == LDLM_EXTENT) + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 6); + + /* We never send a blocking AST until the lock is granted, but + * we can tell it right now */ + lock_res_and_lock(lock); + + /* Now take into account flags to be inherited from original lock + request both in reply to client and in our own lock flags. */ + dlm_rep->lock_flags = ldlm_flags_to_wire(flags); + lock->l_flags |= flags & LDLM_FL_INHERIT_MASK; + + /* Don't move a pending lock onto the export if it has already been + * disconnected due to eviction (bug 5683) or server umount (bug 24324). + * Cancel it now instead. 
*/ + if (unlikely(req->rq_export->exp_disconnected || + OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT))) { + LDLM_ERROR(lock, "lock on destroyed export %p", req->rq_export); + rc = -ENOTCONN; + } else if (ldlm_is_ast_sent(lock)) { + /* fill lock desc for possible lock convert */ + if (lock->l_blocking_lock && + lock->l_resource->lr_type == LDLM_IBITS) { + struct ldlm_lock *bl_lock = lock->l_blocking_lock; + struct ldlm_lock_desc *rep_desc = &dlm_rep->lock_desc; + + LDLM_DEBUG(lock, + "save blocking bits %llx in granted lock", + bl_lock->l_policy_data.l_inodebits.bits); + /* If lock is blocked then save blocking ibits + * in returned lock policy for the possible lock + * convert on a client. + */ + rep_desc->l_policy_data.l_inodebits.cancel_bits = + bl_lock->l_policy_data.l_inodebits.bits; + } + dlm_rep->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); + if (ldlm_is_granted(lock)) { + /* + * Only cancel lock if it was granted, because it would + * be destroyed immediately and would never be granted + * in the future, causing timeouts on client. Not + * granted lock will be cancelled immediately after + * sending completion AST. + */ + if (dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + lock_res_and_lock(lock); + } else { + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock)); + } + } + } + unlock_res_and_lock(lock); + + EXIT; + out: + req->rq_status = rc ?: err; /* return either error - bug 11190 */ + if (!req->rq_packed_final) { + err = lustre_pack_reply(req, 1, NULL, NULL); + if (rc == 0) + rc = err; + } + + /* The LOCK_CHANGED code in ldlm_lock_enqueue depends on this + * ldlm_reprocess_all. If this moves, revisit that code. -phil */ + if (lock != NULL) { + LDLM_DEBUG(lock, "server-side enqueue handler, sending reply" + "(err=%d, rc=%d)", err, rc); + + if (rc == 0 && + req_capsule_has_field(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER) && + ldlm_lvbo_size(lock) > 0) { + void *buf; + int buflen; + +retry: + buf = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_LVB); + LASSERTF(buf != NULL, "req %p, lock %p\n", req, lock); + buflen = req_capsule_get_size(&req->rq_pill, + &RMF_DLM_LVB, RCL_SERVER); + /* non-replayed lock, delayed lvb init may + * need to be occur now + */ + if ((buflen > 0) && !(flags & LDLM_FL_REPLAY)) { + int rc2; + + rc2 = ldlm_lvbo_fill(lock, buf, &buflen); + if (rc2 >= 0) { + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + rc2, RCL_SERVER); + } else if (rc2 == -ERANGE) { + rc2 = req_capsule_server_grow( + &req->rq_pill, + &RMF_DLM_LVB, buflen); + if (!rc2) { + goto retry; + } else { + /* if we can't grow the buffer, + * it's ok to return empty lvb + * to client. 
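+ * (The LVB field is simply shrunk to zero length below, so the client
+ * receives no LVB data in that case.)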
+ */ + req_capsule_shrink( + &req->rq_pill, + &RMF_DLM_LVB, 0, + RCL_SERVER); + } + } else { + rc = rc2; + } + } else if (flags & LDLM_FL_REPLAY) { + /* no LVB resend upon replay */ + if (buflen > 0) + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + 0, RCL_SERVER); + else + rc = buflen; + } else { + rc = buflen; + } + } + + if (rc != 0 && !(flags & LDLM_FL_RESENT)) { + if (lock->l_export) { + ldlm_lock_cancel(lock); + } else { + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + unlock_res_and_lock(lock); + + } + } + + if (!err && !ldlm_is_cbpending(lock) && + dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) + ldlm_reprocess_all(lock->l_resource, lock); + + LDLM_LOCK_RELEASE(lock); + } + + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", + lock, rc); + + return rc; +} + +/* Clear the blocking lock, the race is possible between ldlm_handle_convert0() + * and ldlm_work_bl_ast_lock(), so this is done under lock with check for NULL. + */ +void ldlm_clear_blocking_lock(struct ldlm_lock *lock) +{ + if (lock->l_blocking_lock) { + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + } +} + +/* A lock can be converted to new ibits or mode and should be considered + * as new lock. Clear all states related to a previous blocking AST + * processing so new conflicts will cause new blocking ASTs. + * + * This is used during lock convert below and lock downgrade to COS mode in + * ldlm_lock_mode_downgrade(). + */ +void ldlm_clear_blocking_data(struct ldlm_lock *lock) +{ + ldlm_clear_ast_sent(lock); + lock->l_bl_ast_run = 0; + ldlm_clear_blocking_lock(lock); +} + +/** + * Main LDLM entry point for server code to process lock conversion requests. + */ +int ldlm_handle_convert0(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req) +{ + struct obd_export *exp = req->rq_export; + struct ldlm_reply *dlm_rep; + struct ldlm_lock *lock; + __u64 bits; + __u64 new_bits; + int rc; + + ENTRY; + + if (exp && exp->exp_nid_stats && exp->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(exp->exp_nid_stats->nid_ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep->lock_flags = dlm_req->lock_flags; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); + if (!lock) { + LDLM_DEBUG_NOLOCK("server lock is canceled already"); + req->rq_status = ELDLM_NO_LOCK_DATA; + RETURN(0); + } + + LDLM_DEBUG(lock, "server-side convert handler START"); + + lock_res_and_lock(lock); + bits = lock->l_policy_data.l_inodebits.bits; + new_bits = dlm_req->lock_desc.l_policy_data.l_inodebits.bits; + + if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, "convert on canceled lock!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = ELDLM_NO_LOCK_DATA); + } + + if (dlm_req->lock_desc.l_req_mode != lock->l_granted_mode) { + LDLM_ERROR(lock, "lock mode differs!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = -EPROTO); + } + + if (bits == new_bits) { + /* + * This can be valid situation if CONVERT RPCs are + * re-ordered. 
Just finish silently + */ + LDLM_DEBUG(lock, "lock is converted already!"); + unlock_res_and_lock(lock); + } else { + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_clear_cbpending(lock); + lock->l_policy_data.l_inodebits.cancel_bits = 0; + ldlm_inodebits_drop(lock, bits & ~new_bits); + + ldlm_clear_blocking_data(lock); + unlock_res_and_lock(lock); + + ldlm_reprocess_all(lock->l_resource, NULL); + } + + dlm_rep->lock_handle = lock->l_remote_handle; + ldlm_ibits_policy_local_to_wire(&lock->l_policy_data, + &dlm_rep->lock_desc.l_policy_data); + rc = ELDLM_OK; + EXIT; +out_put: + LDLM_DEBUG(lock, "server-side convert handler END, rc = %d", rc); + LDLM_LOCK_PUT(lock); + req->rq_status = rc; + return 0; +} + +/** + * Cancel all the locks whose handles are packed into ldlm_request + * + * Called by server code expecting such combined cancel activity + * requests. + */ +int ldlm_request_cancel(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + int first, enum lustre_at_flags flags) +{ + struct ldlm_resource *res, *pres = NULL; + struct ldlm_lock *lock; + int i, count, done = 0; + unsigned int size; + + ENTRY; + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(0); + + count = dlm_req->lock_count ? dlm_req->lock_count : 1; + if (first >= count) + RETURN(0); + + if (count == 1 && dlm_req->lock_handle[0].cookie == 0) + RETURN(0); + + /* There is no lock on the server at the replay time, + * skip lock cancelling to make replay tests to pass. */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); + + LDLM_DEBUG_NOLOCK("server-side cancel handler START: %d locks, " + "starting at %d", count, first); + + for (i = first; i < count; i++) { + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (!lock) { + LDLM_DEBUG_NOLOCK("server-side cancel handler stale " + "lock (cookie %llu)", + dlm_req->lock_handle[i].cookie); + continue; + } + + res = lock->l_resource; + done++; + + /* This code is an optimization to only attempt lock + * granting on the resource (that could be CPU-expensive) + * after we are done cancelling lock in that resource. + */ + if (res != pres) { + if (pres != NULL) { + ldlm_reprocess_all(pres, NULL); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + if (res != NULL) { + ldlm_resource_getref(res); + LDLM_RESOURCE_ADDREF(res); + + if (!ldlm_is_discard_data(lock)) + ldlm_lvbo_update(res, lock, + NULL, 1); + } + pres = res; + } + + if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock) && + lock->l_blast_sent != 0) { + time64_t delay = ktime_get_real_seconds() - + lock->l_blast_sent; + LDLM_DEBUG(lock, "server cancels blocked lock after %llds", + (s64)delay); + at_measured(&lock->l_export->exp_bl_lock_at, delay); + } + ldlm_lock_cancel(lock); + LDLM_LOCK_PUT(lock); + } + if (pres != NULL) { + ldlm_reprocess_all(pres, NULL); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + LDLM_DEBUG_NOLOCK("server-side cancel handler END"); + RETURN(done); +} +EXPORT_SYMBOL(ldlm_request_cancel); + +/** + * Main LDLM entry point for server code to cancel locks. + * + * Typically gets called from service handler on LDLM_CANCEL opc. 
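+ * If none of the handles in the request matches a live lock, the reply
+ * status is set to LUSTRE_ESTALE.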
+ */ +int ldlm_handle_cancel(struct ptlrpc_request *req) +{ + struct ldlm_request *dlm_req; + int rc; + ENTRY; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + RETURN(-EFAULT); + } + + if (req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) < + offsetof(struct ldlm_request, lock_handle[1])) + RETURN(-EPROTO); + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + if (!ldlm_request_cancel(req, dlm_req, 0, LATF_STATS)) + req->rq_status = LUSTRE_ESTALE; + + RETURN(ptlrpc_reply(req)); +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Server may pass additional information about blocking lock. + * For IBITS locks it is conflicting bits which can be used for + * lock convert instead of cancel. + */ +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + check_res_locked(lock->l_resource); + if (ns_is_client(ns) && ld && + (lock->l_resource->lr_type == LDLM_IBITS)) { + /* + * Lock description contains policy of blocking lock, + * and its cancel_bits is used to pass conflicting bits. + * NOTE: ld can be NULL or can be not NULL but zeroed if + * passed from ldlm_bl_thread_blwi(), check below used bits + * in ld to make sure it is valid description. + */ + if (ld->l_policy_data.l_inodebits.cancel_bits && + ldlm_res_eq(&ld->l_resource.lr_name, + &lock->l_resource->lr_name) && + !(ldlm_is_cbpending(lock) && + lock->l_policy_data.l_inodebits.cancel_bits == 0)) { + /* always combine conflicting ibits */ + lock->l_policy_data.l_inodebits.cancel_bits |= + ld->l_policy_data.l_inodebits.cancel_bits; + } else { + /* If cancel_bits are not obtained or + * if the lock is already CBPENDING and + * has no cancel_bits set + * - the full lock is to be cancelled + */ + lock->l_policy_data.l_inodebits.cancel_bits = 0; + } + } +} + +/** + * Callback handler for receiving incoming blocking ASTs. + * + * This can only happen on client side. + */ +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + int do_ast; + + ENTRY; + + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + + /* get extra information from desc if any */ + ldlm_bl_desc2lock(ld, lock); + ldlm_set_cbpending(lock); + + if (ldlm_is_cancel_on_block(lock)) + ldlm_set_cancel(lock); + + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + CDEBUG(D_DLMTRACE, "Lock %p already unused, calling callback (%p)\n", + lock, lock->l_blocking_ast); + if (lock->l_blocking_ast != NULL) + lock->l_blocking_ast(lock, ld, lock->l_ast_data, + LDLM_CB_BLOCKING); + } else { + CDEBUG(D_DLMTRACE, "Lock %p is referenced, will be cancelled later\n", + lock); + } + + LDLM_DEBUG(lock, "client blocking callback handler END"); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + +/** + * Callback handler for receiving incoming completion ASTs. 
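+ * The handler updates the local lock from the description sent by the
+ * server (mode, resource, policy data, LVB) and then grants it, running
+ * any resulting completion AST work.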
+ * + * This only can happen on client side. + */ +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + struct list_head ast_list; + int lvb_len; + int rc = 0; + ENTRY; + + LDLM_DEBUG(lock, "client completion callback handler START"); + + INIT_LIST_HEAD(&ast_list); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { + long to = cfs_time_seconds(1); + + ldlm_callback_reply(req, 0); + + while (to > 0) { + set_current_state(TASK_INTERRUPTIBLE); + to = schedule_timeout(to); + if (ldlm_is_granted(lock) || + ldlm_is_destroyed(lock)) + break; + } + } + + lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT); + if (lvb_len < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len); + GOTO(out, rc = lvb_len); + } else if (lvb_len > 0) { + if (lock->l_lvb_len > 0) { + /* for extent lock, lvb contains ost_lvb{}. */ + LASSERT(lock->l_lvb_data != NULL); + + if (unlikely(lock->l_lvb_len < lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than " + "expectation, expected = %d, " + "replied = %d", + lock->l_lvb_len, lvb_len); + GOTO(out, rc = -EINVAL); + } + } + } + + lock_res_and_lock(lock); + + if (!ldlm_res_eq(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + ldlm_resource_unlink_lock(lock); + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + GOTO(out, rc); + } + LDLM_DEBUG(lock, "completion AST, new resource"); + lock_res_and_lock(lock); + } + + if (ldlm_is_failed(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(-EINVAL); + } + + if (ldlm_is_destroyed(lock) || + ldlm_is_granted(lock)) { + /* bug 11300: the lock has already been granted */ + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "Double grant race happened"); + GOTO(out, rc = 0); + } + + /* If we receive the completion AST before the actual enqueue returned, + * then we might need to switch lock modes, resources, or extents. */ + if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) { + lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; + LDLM_DEBUG(lock, "completion AST, new lock mode"); + } + + if (lock->l_resource->lr_type != LDLM_PLAIN) { + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + LDLM_DEBUG(lock, "completion AST, new policy data"); + } + + ldlm_resource_unlink_lock(lock); + + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* + * BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. 
+ */ + ldlm_lock_remove_from_lru(lock); + ldlm_bl_desc2lock(&dlm_req->lock_desc, lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + LDLM_DEBUG(lock, "completion AST includes blocking AST"); + } + + if (lock->l_lvb_len > 0) { + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT, + lock->l_lvb_data, lvb_len); + if (rc < 0) { + unlock_res_and_lock(lock); + GOTO(out, rc); + } + } + + ldlm_grant_lock(lock, &ast_list); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); + + /* Let Enqueue to call osc_lock_upcall() and initialize + * l_ast_data */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2); + + ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST); + + LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)", + lock); + GOTO(out, rc); + +out: + if (rc < 0) { + lock_res_and_lock(lock); + ldlm_set_failed(lock); + unlock_res_and_lock(lock); + wake_up(&lock->l_waitq); + } + LDLM_LOCK_RELEASE(lock); + + return 0; +} + +/** + * Callback handler for receiving incoming glimpse ASTs. + * + * This only can happen on client side. After handling the glimpse AST + * we also consider dropping the lock here if it is unused locally for a + * long time. + */ +static void ldlm_handle_gl_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + struct ldlm_lock_desc *ld = &dlm_req->lock_desc; + int rc = -ENOSYS; + + ENTRY; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); + + if (lock->l_glimpse_ast != NULL) + rc = lock->l_glimpse_ast(lock, req); + + if (req->rq_repmsg != NULL) { + ptlrpc_reply(req); + } else { + req->rq_status = rc; + ptlrpc_error(req); + } + + lock_res_and_lock(lock); + if (lock->l_granted_mode == LCK_PW && + !lock->l_readers && !lock->l_writers && + ktime_after(ktime_get(), + ktime_add(lock->l_last_used, + ktime_set(ns->ns_dirty_age_limit, 0)))) { + unlock_res_and_lock(lock); + + /* For MDS glimpse it is always DOM lock, set corresponding + * cancel_bits to perform lock convert if needed + */ + if (lock->l_resource->lr_type == LDLM_IBITS) + ld->l_policy_data.l_inodebits.cancel_bits = + MDS_INODELOCK_DOM; + if (ldlm_bl_to_thread_lock(ns, ld, lock)) + ldlm_handle_bl_callback(ns, ld, lock); + + EXIT; + return; + } + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, + enum ldlm_cancel_flags cancel_flags) +{ + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + ENTRY; + + spin_lock(&blp->blp_lock); + if (blwi->blwi_lock && + ldlm_is_discard_data(blwi->blwi_lock)) { + /* add LDLM_FL_DISCARD_DATA requests to the priority list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); + } else { + /* other blocking callbacks are added to the regular list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_list); + } + spin_unlock(&blp->blp_lock); + + wake_up(&blp->blp_waitq); + + /* can not check blwi->blwi_flags as blwi could be already freed in + LCF_ASYNC mode */ + if (!(cancel_flags & LCF_ASYNC)) + wait_for_completion(&blwi->blwi_comp); + + RETURN(0); +} + +static inline void init_blwi(struct ldlm_bl_work_item *blwi, + struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + init_completion(&blwi->blwi_comp); + INIT_LIST_HEAD(&blwi->blwi_head); + + if (memory_pressure_get()) + blwi->blwi_mem_pressure = 1; + + blwi->blwi_ns = ns; + blwi->blwi_flags = 
cancel_flags; + if (ld != NULL) + blwi->blwi_ld = *ld; + if (count) { + list_add(&blwi->blwi_head, cancels); + list_del_init(cancels); + blwi->blwi_count = count; + } else { + blwi->blwi_lock = lock; + } +} + +/** + * Queues a list of locks \a cancels containing \a count locks + * for later processing by a blocking thread. If \a count is zero, + * then the lock referenced as \a lock is queued instead. + * + * The blocking thread would then call ->l_blocking_ast callback in the lock. + * If list addition fails an error is returned and caller is supposed to + * call ->l_blocking_ast itself. + */ +static int ldlm_bl_to_thread(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct ldlm_lock *lock, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + ENTRY; + + if (cancels && count == 0) + RETURN(0); + + if (cancel_flags & LCF_ASYNC) { + struct ldlm_bl_work_item *blwi; + + OBD_ALLOC(blwi, sizeof(*blwi)); + if (blwi == NULL) + RETURN(-ENOMEM); + init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags); + + RETURN(__ldlm_bl_to_thread(blwi, cancel_flags)); + } else { + /* if it is synchronous call do minimum mem alloc, as it could + * be triggered from kernel shrinker + */ + struct ldlm_bl_work_item blwi; + + memset(&blwi, 0, sizeof(blwi)); + init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags); + RETURN(__ldlm_bl_to_thread(&blwi, cancel_flags)); + } +} + + +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock) +{ + return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags); +} + +int ldlm_bl_thread_wakeup(void) +{ + wake_up(&ldlm_state->ldlm_bl_pool->blp_waitq); + return 0; +} + +/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */ +static int ldlm_handle_setinfo(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + char *key; + void *val; + int keylen, vallen; + int rc = -ENOSYS; + ENTRY; + + DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name); + + req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO); + + key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + if (key == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info key"); + RETURN(-EFAULT); + } + keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT); + val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + if (val == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info val"); + RETURN(-EFAULT); + } + vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT); + + /* We are responsible for swabbing contents of val */ + + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) + /* Pass it on to mdc (the "export" in this case) */ + rc = obd_set_info_async(req->rq_svc_thread->t_env, + req->rq_export, + sizeof(KEY_HSM_COPYTOOL_SEND), + KEY_HSM_COPYTOOL_SEND, + vallen, val, NULL); + else + DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key); + + return rc; +} + +static inline void ldlm_callback_errmsg(struct ptlrpc_request *req, + const char *msg, int rc, + const struct lustre_handle *handle) +{ + DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req, + "%s: [nid %s] [rc %d] [lock %#llx]", + msg, libcfs_id2str(req->rq_peer), rc, + handle ? 
handle->cookie : 0); + if (req->rq_no_reply) + CWARN("No reply was sent, maybe cause bug 21636.\n"); + else if (rc) + CWARN("Send reply failed, maybe cause bug 21636.\n"); +} + +/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ +static int ldlm_callback_handler(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct ldlm_request *dlm_req; + struct ldlm_lock *lock; + int rc; + ENTRY; + + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. */ + + /* do nothing for sec context finalize */ + if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + rc = ldlm_callback_reply(req, -ENOTCONN); + ldlm_callback_errmsg(req, "Operate on unconnected server", + rc, NULL); + RETURN(0); + } + + LASSERT(req->rq_export != NULL); + LASSERT(req->rq_export->exp_obd != NULL); + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) { + if (cfs_fail_err) + ldlm_callback_reply(req, -(int)cfs_fail_err); + RETURN(0); + } + break; + case LDLM_CP_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_GL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_SET_INFO: + rc = ldlm_handle_setinfo(req); + ldlm_callback_reply(req, rc); + RETURN(0); + default: + CERROR("unknown opcode %u\n", + lustre_msg_get_opc(req->rq_reqmsg)); + ldlm_callback_reply(req, -EPROTO); + RETURN(0); + } + + ns = req->rq_export->exp_obd->obd_namespace; + LASSERT(ns != NULL); + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + rc = ldlm_callback_reply(req, -EPROTO); + ldlm_callback_errmsg(req, "Operate without parameter", rc, + NULL); + RETURN(0); + } + + /* Force a known safe race, send a cancel to the server for a lock + * which the server has already started a blocking callback on. */ + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } + + lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0); + if (!lock) { + CDEBUG(D_DLMTRACE, "callback on lock %#llx - lock " + "disappeared\n", dlm_req->lock_handle[0].cookie); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate with invalid parameter", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + + if (ldlm_is_fail_loc(lock) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */ + lock_res_and_lock(lock); + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_FL_AST_MASK); + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + /* If somebody cancels lock and cache is already dropped, + * or lock is failed before cp_ast received on client, + * we can tell the server we have no lock. Otherwise, we + * should send cancel after dropping the cache. 
*/ + if ((ldlm_is_canceling(lock) && ldlm_is_bl_done(lock)) || + ldlm_is_failed(lock)) { + LDLM_DEBUG(lock, "callback on lock %llx - lock disappeared", + dlm_req->lock_handle[0].cookie); + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate on stale lock", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); + ldlm_set_bl_ast(lock); + } + unlock_res_and_lock(lock); + + /* We want the ost thread to get this reply so that it can respond + * to ost requests (write cache writeback) that might be triggered + * in the callback. + * + * But we'd also like to be able to indicate in the reply that we're + * cancelling right now, because it's unused, or have an intent result + * in the reply, so we might have to push the responsibility for sending + * the reply down into the AST handlers, alas. */ + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + CDEBUG(D_INODE, "blocking ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); + if (!ldlm_is_cancel_on_block(lock)) { + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) + ldlm_callback_reply(req, rc); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } + + RETURN(0); +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Main handler for canceld thread. + * + * Separated into its own thread to avoid deadlocks. + */ +static int ldlm_cancel_handler(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. 
*/ + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + struct ldlm_request *dlm_req; + + CERROR("%s from %s arrived at %llu with bad export cookie %llu\n", + ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), + libcfs_nid2str(req->rq_peer.nid), + (unsigned long long)req->rq_arrival_time.tv_sec, + lustre_msg_get_handle(req->rq_reqmsg)->cookie); + + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + dlm_req = req_capsule_client_get(&req->rq_pill, + &RMF_DLM_REQ); + if (dlm_req != NULL) + ldlm_lock_dump_handle(D_ERROR, + &dlm_req->lock_handle[0]); + } + ldlm_callback_reply(req, -ENOTCONN); + RETURN(0); + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + /* XXX FIXME move this back to mds/handler.c, bug 249 */ + case LDLM_CANCEL: + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + CDEBUG(D_INODE, "cancel\n"); + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET) || + CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND) || + CFS_FAIL_CHECK(OBD_FAIL_LDLM_BL_EVICT)) + RETURN(0); + rc = ldlm_handle_cancel(req); + break; + case LDLM_CONVERT: + { + struct ldlm_request *dlm_req; + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + CDEBUG(D_INODE, "convert\n"); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + rc = ldlm_callback_reply(req, -EPROTO); + } else { + req->rq_status = ldlm_handle_convert0(req, dlm_req); + rc = ptlrpc_reply(req); + } + break; + } + default: + CERROR("invalid opcode %d\n", + lustre_msg_get_opc(req->rq_reqmsg)); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + rc = ldlm_callback_reply(req, -EINVAL); + } + + RETURN(rc); +} + +static int ldlm_cancel_hpreq_lock_match(struct ptlrpc_request *req, + struct ldlm_lock *lock) +{ + struct ldlm_request *dlm_req; + struct lustre_handle lockh; + int rc = 0; + int i; + + ENTRY; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(0); + + ldlm_lock2handle(lock, &lockh); + for (i = 0; i < dlm_req->lock_count; i++) { + if (lustre_handle_equal(&dlm_req->lock_handle[i], + &lockh)) { + DEBUG_REQ(D_RPCTRACE, req, + "Prio raised by lock %#llx.", lockh.cookie); + rc = 1; + break; + } + } + + RETURN(rc); +} + +static int ldlm_cancel_hpreq_check(struct ptlrpc_request *req) +{ + struct ldlm_request *dlm_req; + int rc = 0; + int i; + unsigned int size; + + ENTRY; + + /* no prolong in recovery */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(-EFAULT); + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(-EPROTO); + + for (i = 0; i < dlm_req->lock_count; i++) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (lock == NULL) + continue; + + rc = ldlm_is_ast_sent(lock) ? 
1 : 0; + if (rc) + LDLM_DEBUG(lock, "hpreq cancel/convert lock"); + LDLM_LOCK_PUT(lock); + + if (rc) + break; + } + + RETURN(rc); +} + +static struct ptlrpc_hpreq_ops ldlm_cancel_hpreq_ops = { + .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, + .hpreq_check = ldlm_cancel_hpreq_check, + .hpreq_fini = NULL, +}; + +static int ldlm_hpreq_handler(struct ptlrpc_request *req) +{ + ENTRY; + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) + RETURN(0); + + if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } else if (LDLM_CONVERT == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } + RETURN(0); +} + +static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct list_head *rpc_list = data; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + + lock_res_and_lock(lock); + + if (!ldlm_is_granted(lock)) { + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(lock->l_resource); + if (lock->l_resource->lr_type != LDLM_IBITS && + lock->l_resource->lr_type != LDLM_PLAIN) { + unlock_res_and_lock(lock); + return 0; + } + + if (ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(lock->l_blocking_ast); + LASSERT(!lock->l_blocking_lock); + + ldlm_set_ast_sent(lock); + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. */ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + list_add_tail(&lock->l_rk_ast, rpc_list); + LDLM_LOCK_GET(lock); + + unlock_res_and_lock(lock); + return 0; +} + +void ldlm_revoke_export_locks(struct obd_export *exp) +{ + struct list_head rpc_list; + ENTRY; + + INIT_LIST_HEAD(&rpc_list); + cfs_hash_for_each_nolock(exp->exp_lock_hash, + ldlm_revoke_lock_cb, &rpc_list, 0); + ldlm_run_ast_work(exp->exp_obd->obd_namespace, &rpc_list, + LDLM_WORK_REVOKE_AST); + + EXIT; +} +EXPORT_SYMBOL(ldlm_revoke_export_locks); +#endif /* HAVE_SERVER_SUPPORT */ + +static int ldlm_bl_get_work(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item **p_blwi, + struct obd_export **p_exp) +{ + struct ldlm_bl_work_item *blwi = NULL; + static unsigned int num_bl = 0; + static unsigned int num_stale; + int num_th = atomic_read(&blp->blp_num_threads); + + *p_exp = obd_stale_export_get(); + + spin_lock(&blp->blp_lock); + if (*p_exp != NULL) { + if (num_th == 1 || ++num_stale < num_th) { + spin_unlock(&blp->blp_lock); + return 1; + } else { + num_stale = 0; + } + } + + /* process a request from the blp_list at least every blp_num_threads */ + if (!list_empty(&blp->blp_list) && + (list_empty(&blp->blp_prio_list) || num_bl == 0)) + blwi = list_entry(blp->blp_list.next, + struct ldlm_bl_work_item, blwi_entry); + else + if (!list_empty(&blp->blp_prio_list)) + blwi = list_entry(blp->blp_prio_list.next, + struct ldlm_bl_work_item, + blwi_entry); + + if (blwi) { + if (++num_bl >= num_th) + num_bl = 0; + list_del(&blwi->blwi_entry); + } + spin_unlock(&blp->blp_lock); + *p_blwi = blwi; + + if (*p_exp != NULL && *p_blwi != NULL) { + obd_stale_export_put(*p_exp); + *p_exp = NULL; + } + + return (*p_blwi != NULL || *p_exp != NULL) ? 
1 : 0; +} + +/* This only contains temporary data until the thread starts */ +struct ldlm_bl_thread_data { + struct ldlm_bl_pool *bltd_blp; + struct completion bltd_comp; + int bltd_num; +}; + +static int ldlm_bl_thread_main(void *arg); + +static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp, bool check_busy) +{ + struct ldlm_bl_thread_data bltd = { .bltd_blp = blp }; + struct task_struct *task; + + init_completion(&bltd.bltd_comp); + + bltd.bltd_num = atomic_inc_return(&blp->blp_num_threads); + if (bltd.bltd_num >= blp->blp_max_threads) { + atomic_dec(&blp->blp_num_threads); + return 0; + } + + LASSERTF(bltd.bltd_num > 0, "thread num:%d\n", bltd.bltd_num); + if (check_busy && + atomic_read(&blp->blp_busy_threads) < (bltd.bltd_num - 1)) { + atomic_dec(&blp->blp_num_threads); + return 0; + } + + task = kthread_run(ldlm_bl_thread_main, &bltd, "ldlm_bl_%02d", + bltd.bltd_num); + if (IS_ERR(task)) { + CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", + bltd.bltd_num, PTR_ERR(task)); + atomic_dec(&blp->blp_num_threads); + return PTR_ERR(task); + } + wait_for_completion(&bltd.bltd_comp); + + return 0; +} + +/* Not fatal if racy and have a few too many threads */ +static int ldlm_bl_thread_need_create(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item *blwi) +{ + if (atomic_read(&blp->blp_num_threads) >= blp->blp_max_threads) + return 0; + + if (atomic_read(&blp->blp_busy_threads) < + atomic_read(&blp->blp_num_threads)) + return 0; + + if (blwi != NULL && (blwi->blwi_ns == NULL || + blwi->blwi_mem_pressure)) + return 0; + + return 1; +} + +static int ldlm_bl_thread_blwi(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item *blwi) +{ + ENTRY; + + if (blwi->blwi_ns == NULL) + /* added by ldlm_cleanup() */ + RETURN(LDLM_ITER_STOP); + + if (blwi->blwi_mem_pressure) + memory_pressure_set(); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL2, 4); + + if (blwi->blwi_count) { + int count; + /* The special case when we cancel locks in lru + * asynchronously, we pass the list of locks here. + * Thus locks are marked LDLM_FL_CANCELING, but NOT + * canceled locally yet. */ + count = ldlm_cli_cancel_list_local(&blwi->blwi_head, + blwi->blwi_count, + LCF_BL_AST); + ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL, + blwi->blwi_flags); + } else { + ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld, + blwi->blwi_lock); + } + if (blwi->blwi_mem_pressure) + memory_pressure_clr(); + + if (blwi->blwi_flags & LCF_ASYNC) + OBD_FREE(blwi, sizeof(*blwi)); + else + complete(&blwi->blwi_comp); + + RETURN(0); +} + +/** + * Cancel stale locks on export. Cancel blocked locks first. + * If the given export has blocked locks, the next in the list may have + * them too, thus cancel not blocked locks only if the current export has + * no blocked locks. + **/ +static int ldlm_bl_thread_exports(struct ldlm_bl_pool *blp, + struct obd_export *exp) +{ + int num; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 4); + + num = ldlm_export_cancel_blocked_locks(exp); + if (num == 0) + ldlm_export_cancel_locks(exp); + + obd_stale_export_put(exp); + + RETURN(0); +} + + +/** + * Main blocking requests processing thread. + * + * Callers put locks into its queue by calling ldlm_bl_to_thread. + * This thread in the end ends up doing actual call to ->l_blocking_ast + * for queued locks. 
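+ *
+ * Work items for locks flagged LDLM_FL_DISCARD_DATA are queued on a
+ * separate priority list and are generally served ahead of regular
+ * blocking callbacks (see __ldlm_bl_to_thread() and ldlm_bl_get_work()).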
+ */ +static int ldlm_bl_thread_main(void *arg) +{ + struct lu_env *env; + struct ldlm_bl_pool *blp; + struct ldlm_bl_thread_data *bltd = arg; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(env); + if (!env) + RETURN(-ENOMEM); + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) + GOTO(out_env, rc); + rc = lu_env_add(env); + if (rc) + GOTO(out_env_fini, rc); + + blp = bltd->bltd_blp; + + complete(&bltd->bltd_comp); + /* cannot use bltd after this, it is only on caller's stack */ + + while (1) { + struct l_wait_info lwi = { 0 }; + struct ldlm_bl_work_item *blwi = NULL; + struct obd_export *exp = NULL; + int rc; + + rc = ldlm_bl_get_work(blp, &blwi, &exp); + + if (rc == 0) + l_wait_event_exclusive(blp->blp_waitq, + ldlm_bl_get_work(blp, &blwi, + &exp), + &lwi); + atomic_inc(&blp->blp_busy_threads); + + if (ldlm_bl_thread_need_create(blp, blwi)) + /* discard the return value, we tried */ + ldlm_bl_thread_start(blp, true); + + if (exp) + rc = ldlm_bl_thread_exports(blp, exp); + else if (blwi) + rc = ldlm_bl_thread_blwi(blp, blwi); + + atomic_dec(&blp->blp_busy_threads); + + if (rc == LDLM_ITER_STOP) + break; + + /* If there are many namespaces, we will not sleep waiting for + * work, and must do a cond_resched to avoid holding the CPU + * for too long */ + cond_resched(); + } + + atomic_dec(&blp->blp_num_threads); + complete(&blp->blp_comp); + + lu_env_remove(env); +out_env_fini: + lu_env_fini(env); +out_env: + OBD_FREE_PTR(env); + RETURN(rc); +} + + +static int ldlm_setup(void); +static int ldlm_cleanup(void); + +int ldlm_get_ref(void) +{ + int rc = 0; + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (++ldlm_refcount == 1) { + rc = ldlm_setup(); + if (rc) + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + RETURN(rc); +} + +void ldlm_put_ref(void) +{ + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (ldlm_refcount == 1) { + int rc = ldlm_cleanup(); + if (rc) + CERROR("ldlm_cleanup failed: %d\n", rc); + else + ldlm_refcount--; + } else { + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + EXIT; +} + +/* + * Export handle<->lock hash operations. 
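+ * The hash maps a client's remote lock handle (cookie) to the
+ * corresponding server-side ldlm_lock, which lets resent and replayed
+ * enqueues find the lock they already own (see ldlm_handle_enqueue0()).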
+ */ +static unsigned +ldlm_export_lock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask); +} + +static void * +ldlm_export_lock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + return &lock->l_remote_handle; +} + +static void +ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + lock->l_remote_handle = *(struct lustre_handle *)key; +} + +static int +ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode) +{ + return lustre_handle_equal(ldlm_export_lock_key(hnode), key); +} + +static void * +ldlm_export_lock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_hash); +} + +static void +ldlm_export_lock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_GET(lock); +} + +static void +ldlm_export_lock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_RELEASE(lock); +} + +static struct cfs_hash_ops ldlm_export_lock_ops = { + .hs_hash = ldlm_export_lock_hash, + .hs_key = ldlm_export_lock_key, + .hs_keycmp = ldlm_export_lock_keycmp, + .hs_keycpy = ldlm_export_lock_keycpy, + .hs_object = ldlm_export_lock_object, + .hs_get = ldlm_export_lock_get, + .hs_put = ldlm_export_lock_put, + .hs_put_locked = ldlm_export_lock_put, +}; + +int ldlm_init_export(struct obd_export *exp) +{ + int rc; + ENTRY; + + exp->exp_lock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_lock_ops, + CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY | + CFS_HASH_NBLK_CHANGE); + + if (!exp->exp_lock_hash) + RETURN(-ENOMEM); + + rc = ldlm_init_flock_export(exp); + if (rc) + GOTO(err, rc); + + RETURN(0); +err: + ldlm_destroy_export(exp); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_init_export); + +void ldlm_destroy_export(struct obd_export *exp) +{ + ENTRY; + cfs_hash_putref(exp->exp_lock_hash); + exp->exp_lock_hash = NULL; + + ldlm_destroy_flock_export(exp); + EXIT; +} +EXPORT_SYMBOL(ldlm_destroy_export); + +static ssize_t cancel_unused_locks_before_replay_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", ldlm_cancel_unused_locks_before_replay); +} + +static ssize_t cancel_unused_locks_before_replay_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + int rc; + unsigned long val; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + ldlm_cancel_unused_locks_before_replay = val; + + return count; +} +LUSTRE_RW_ATTR(cancel_unused_locks_before_replay); + +static struct attribute *ldlm_attrs[] = { + &lustre_attr_cancel_unused_locks_before_replay.attr, + NULL, +}; + +static struct attribute_group ldlm_attr_group = { + .attrs = ldlm_attrs, +}; + +static int ldlm_setup(void) +{ + static struct ptlrpc_service_conf conf; + struct ldlm_bl_pool *blp = NULL; +#ifdef HAVE_SERVER_SUPPORT + struct task_struct *task; +#endif /* HAVE_SERVER_SUPPORT */ + int i; + int rc = 0; + + ENTRY; + + if (ldlm_state != NULL) + RETURN(-EALREADY); + + OBD_ALLOC(ldlm_state, sizeof(*ldlm_state)); + if (ldlm_state == NULL) + 
RETURN(-ENOMEM); + + ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj); + if (!ldlm_kobj) + GOTO(out, -ENOMEM); + + rc = sysfs_create_group(ldlm_kobj, &ldlm_attr_group); + if (rc) + GOTO(out, rc); + + ldlm_ns_kset = kset_create_and_add("namespaces", NULL, ldlm_kobj); + if (!ldlm_ns_kset) + GOTO(out, -ENOMEM); + + ldlm_svc_kset = kset_create_and_add("services", NULL, ldlm_kobj); + if (!ldlm_svc_kset) + GOTO(out, -ENOMEM); + + rc = ldlm_debugfs_setup(); + if (rc != 0) + GOTO(out, rc); + + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_cbd", + .psc_watchdog_factor = 2, + .psc_buf = { + .bc_nbufs = LDLM_CLIENT_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CB_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CB_REPLY_PORTAL, + }, + .psc_thr = { + .tc_thr_name = "ldlm_cb", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_bind = ldlm_cpu_bind, + .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + .cc_affinity = true, + }, + .psc_ops = { + .so_req_handler = ldlm_callback_handler, + }, + }; + ldlm_state->ldlm_cb_service = \ + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); + if (IS_ERR(ldlm_state->ldlm_cb_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cb_service); + ldlm_state->ldlm_cb_service = NULL; + GOTO(out, rc); + } + +#ifdef HAVE_SERVER_SUPPORT + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_canceld", + .psc_watchdog_factor = 6, + .psc_buf = { + .bc_nbufs = LDLM_SERVER_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CANCEL_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CANCEL_REPLY_PORTAL, + + }, + .psc_thr = { + .tc_thr_name = "ldlm_cn", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_bind = ldlm_cpu_bind, + .tc_ctx_tags = LCT_MD_THREAD | \ + LCT_DT_THREAD | \ + LCT_CL_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + .cc_affinity = true, + }, + .psc_ops = { + .so_req_handler = ldlm_cancel_handler, + .so_hpreq_handler = ldlm_hpreq_handler, + }, + }; + ldlm_state->ldlm_cancel_service = \ + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); + if (IS_ERR(ldlm_state->ldlm_cancel_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cancel_service); + ldlm_state->ldlm_cancel_service = NULL; + GOTO(out, rc); + } +#endif /* HAVE_SERVER_SUPPORT */ + + OBD_ALLOC(blp, sizeof(*blp)); + if (blp == NULL) + GOTO(out, rc = -ENOMEM); + ldlm_state->ldlm_bl_pool = blp; + + spin_lock_init(&blp->blp_lock); + INIT_LIST_HEAD(&blp->blp_list); + INIT_LIST_HEAD(&blp->blp_prio_list); + init_waitqueue_head(&blp->blp_waitq); + atomic_set(&blp->blp_num_threads, 0); + atomic_set(&blp->blp_busy_threads, 0); + + if (ldlm_num_threads == 0) { + blp->blp_min_threads = LDLM_NTHRS_INIT; + blp->blp_max_threads = LDLM_NTHRS_MAX; + } else { + blp->blp_min_threads = blp->blp_max_threads = \ + min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT, + ldlm_num_threads)); + } + + for (i = 0; i < blp->blp_min_threads; i++) { + rc = ldlm_bl_thread_start(blp, false); + 
if (rc < 0) + GOTO(out, rc); + } + +#ifdef HAVE_SERVER_SUPPORT + task = kthread_run(expired_lock_main, NULL, "ldlm_elt"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Cannot start ldlm expired-lock thread: %d\n", rc); + GOTO(out, rc); + } + + wait_event(expired_lock_wait_queue, + expired_lock_thread_state == ELT_READY); +#endif /* HAVE_SERVER_SUPPORT */ + + rc = ldlm_pools_init(); + if (rc) { + CERROR("Failed to initialize LDLM pools: %d\n", rc); + GOTO(out, rc); + } + + rc = ldlm_reclaim_setup(); + if (rc) { + CERROR("Failed to setup reclaim thread: rc = %d\n", rc); + GOTO(out, rc); + } + RETURN(0); + + out: + ldlm_cleanup(); + RETURN(rc); +} + +static int ldlm_cleanup(void) +{ + ENTRY; + + if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || + !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) { + CERROR("ldlm still has namespaces; clean these up first.\n"); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(-EBUSY); + } + + ldlm_reclaim_cleanup(); + ldlm_pools_fini(); + + if (ldlm_state->ldlm_bl_pool != NULL) { + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + while (atomic_read(&blp->blp_num_threads) > 0) { + struct ldlm_bl_work_item blwi = { .blwi_ns = NULL }; + + init_completion(&blp->blp_comp); + + spin_lock(&blp->blp_lock); + list_add_tail(&blwi.blwi_entry, &blp->blp_list); + wake_up(&blp->blp_waitq); + spin_unlock(&blp->blp_lock); + + wait_for_completion(&blp->blp_comp); + } + + OBD_FREE(blp, sizeof(*blp)); + } + + if (ldlm_state->ldlm_cb_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cb_service); +#ifdef HAVE_SERVER_SUPPORT + if (ldlm_state->ldlm_cancel_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cancel_service); +#endif + + if (ldlm_ns_kset) + kset_unregister(ldlm_ns_kset); + if (ldlm_svc_kset) + kset_unregister(ldlm_svc_kset); + if (ldlm_kobj) { + sysfs_remove_group(ldlm_kobj, &ldlm_attr_group); + kobject_put(ldlm_kobj); + } + + ldlm_debugfs_cleanup(); + +#ifdef HAVE_SERVER_SUPPORT + if (expired_lock_thread_state != ELT_STOPPED) { + expired_lock_thread_state = ELT_TERMINATE; + wake_up(&expired_lock_wait_queue); + wait_event(expired_lock_wait_queue, + expired_lock_thread_state == ELT_STOPPED); + } +#endif + + OBD_FREE(ldlm_state, sizeof(*ldlm_state)); + ldlm_state = NULL; + + RETURN(0); +} + +int ldlm_init(void) +{ + ldlm_resource_slab = kmem_cache_create("ldlm_resources", + sizeof(struct ldlm_resource), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_resource_slab == NULL) + return -ENOMEM; + + ldlm_lock_slab = kmem_cache_create("ldlm_locks", + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_lock_slab == NULL) + goto out_resource; + + ldlm_interval_slab = kmem_cache_create("interval_node", + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_slab == NULL) + goto out_lock; + + ldlm_interval_tree_slab = kmem_cache_create("interval_tree", + sizeof(struct ldlm_interval_tree) * LCK_MODE_NUM, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_tree_slab == NULL) + goto out_interval; + +#ifdef HAVE_SERVER_SUPPORT + ldlm_inodebits_slab = kmem_cache_create("ldlm_ibits_node", + sizeof(struct ldlm_ibits_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_inodebits_slab == NULL) + goto out_interval_tree; + + ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem", + sizeof(struct ldlm_glimpse_work), + 0, 0, NULL); + if (ldlm_glimpse_work_kmem == NULL) + goto out_inodebits; +#endif + 
+#if LUSTRE_TRACKS_LOCK_EXP_REFS + class_export_dump_hook = ldlm_dump_export_locks; +#endif + return 0; +#ifdef HAVE_SERVER_SUPPORT +out_inodebits: + kmem_cache_destroy(ldlm_inodebits_slab); +out_interval_tree: + kmem_cache_destroy(ldlm_interval_tree_slab); +#endif +out_interval: + kmem_cache_destroy(ldlm_interval_slab); +out_lock: + kmem_cache_destroy(ldlm_lock_slab); +out_resource: + kmem_cache_destroy(ldlm_resource_slab); + + return -ENOMEM; +} + +void ldlm_exit(void) +{ + if (ldlm_refcount) + CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); + kmem_cache_destroy(ldlm_resource_slab); + /* + * ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * rcu_barrier() to wait all outstanding RCU callbacks to complete, + * so that ldlm_lock_free() get a chance to be called. + */ + rcu_barrier(); + kmem_cache_destroy(ldlm_lock_slab); + kmem_cache_destroy(ldlm_interval_slab); + kmem_cache_destroy(ldlm_interval_tree_slab); +#ifdef HAVE_SERVER_SUPPORT + kmem_cache_destroy(ldlm_inodebits_slab); + kmem_cache_destroy(ldlm_glimpse_work_kmem); +#endif +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c new file mode 100644 index 0000000000000..6407fd20884f8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c @@ -0,0 +1,177 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_plain.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of PLAIN lock type. + * + * PLAIN locks are the simplest form of LDLM locking, and are used when + * there only needs to be a single lock on a resource. This avoids some + * of the complexity of EXTENT and IBITS lock types, but doesn't allow + * different "parts" of a resource to be locked concurrently. Example + * use cases for PLAIN locks include locking of MGS configuration logs + * and (as of Lustre 2.4) quota records. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. 
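+ *
+ * The queue is walked one mode group at a time: the loop cursor skips the
+ * remaining locks of the current mode group, so same-mode locks are either
+ * all treated as compatible or all linked to \a work_list together.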
+ * + * \retval 0 if there are conflicting locks in the \a queue + * \retval 1 if the lock is compatible to all locks in \a queue + */ +static inline int +ldlm_plain_compat_queue(struct list_head *queue, struct ldlm_lock *req, + struct list_head *work_list) +{ + enum ldlm_mode req_mode = req->l_req_mode; + struct ldlm_lock *lock, *next_lock; + int compat = 1; + ENTRY; + + lockmode_verify(req_mode); + + list_for_each_entry_safe(lock, next_lock, queue, l_res_link) { + + /* We stop walking the queue if we hit ourselves so we don't + * take conflicting locks enqueued after us into account, + * or we'd wait forever. */ + if (req == lock) + RETURN(compat); + + /* Advance loop cursor to last lock of mode group. */ + next_lock = list_entry(list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, + l_sl_mode)->l_res_link.next, + struct ldlm_lock, l_res_link); + + if (lockmode_compat(lock->l_req_mode, req_mode)) + continue; + + if (!work_list) + RETURN(0); + + compat = 0; + + /* Add locks of the mode group to \a work_list as + * blocking locks for \a req. */ + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, work_list); + + { + struct list_head *head; + + head = &lock->l_sl_mode; + list_for_each_entry(lock, head, l_sl_mode) + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + } + } + + RETURN(compat); +} + +/** + * Process a granting attempt for plain lock. + * Must be called with ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; + int rc; + ENTRY; + + LASSERT(!ldlm_is_granted(lock)); + check_res_locked(res); + *err = ELDLM_OK; + + if (intention == LDLM_PROCESS_RESCAN) { + LASSERT(work_list != NULL); + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_plain_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + RETURN(LDLM_ITER_CONTINUE); + } + + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, work_list); + rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, work_list); + + if (rc == 2) { + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); + } + + RETURN(LDLM_ITER_CONTINUE); +} +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + /* No policy for plain locks */ +} + +void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + /* No policy for plain locks */ +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c new file mode 100644 index 0000000000000..0a423d5615b5b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c @@ -0,0 +1,1511 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_pool.c + * + * Author: Yury Umanets + */ + +/* + * Idea of this code is rather simple. Each second, for each server namespace + * we have SLV - server lock volume which is calculated on current number of + * granted locks, grant speed for past period, etc - that is, locking load. + * This SLV number may be thought as a flow definition for simplicity. It is + * sent to clients with each occasion to let them know what is current load + * situation on the server. By default, at the beginning, SLV on server is + * set max value which is calculated as the following: allow to one client + * have all locks of limit ->pl_limit for 10h. + * + * Next, on clients, number of cached locks is not limited artificially in any + * way as it was before. Instead, client calculates CLV, that is, client lock + * volume for each lock and compares it with last SLV from the server. CLV is + * calculated as the number of locks in LRU * lock live time in seconds. If + * CLV > SLV - lock is canceled. + * + * Client has LVF, that is, lock volume factor which regulates how much sensitive + * client should be about last SLV from server. The higher LVF is the more locks + * will be canceled on client. Default value for it is 1. Setting LVF to 2 means + * that client will cancel locks 2 times faster. + * + * Locks on a client will be canceled more intensively in these cases: + * (1) if SLV is smaller, that is, load is higher on the server; + * (2) client has a lot of locks (the more locks are held by client, the bigger + * chances that some of them should be canceled); + * (3) client has old locks (taken some time ago); + * + * Thus, according to flow paradigm that we use for better understanding SLV, + * CLV is the volume of particle in flow described by SLV. According to this, + * if flow is getting thinner, more and more particles become outside of it and + * as particles are locks, they should be canceled. + * + * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas + * Dilger (adilger@clusterfs.com) proposed few nice ideas like using LVF and many + * cleanups. Flow definition to allow more easy understanding of the logic belongs + * to Nikita Danilov (nikita@clusterfs.com) as well as many cleanups and fixes. + * And design and implementation are done by Yury Umanets (umka@clusterfs.com). + * + * Glossary for terms used: + * + * pl_limit - Number of allowed locks in pool. 
Applies to server and client + * side (tunable); + * + * pl_granted - Number of granted locks (calculated); + * pl_grant_rate - Number of granted locks for last T (calculated); + * pl_cancel_rate - Number of canceled locks for last T (calculated); + * pl_grant_speed - Grant speed (GR - CR) for last T (calculated); + * pl_grant_plan - Planned number of granted locks for next T (calculated); + * pl_server_lock_volume - Current server lock volume (calculated); + * + * As it may be seen from list above, we have few possible tunables which may + * affect behavior much. They all may be modified via sysfs. However, they also + * give a possibility for constructing few pre-defined behavior policies. If + * none of predefines is suitable for a working pattern being used, new one may + * be "constructed" via sysfs tunables. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +#ifdef HAVE_LRU_RESIZE_SUPPORT + +/* + * 50 ldlm locks for 1MB of RAM. + */ +#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_SHIFT)) * 50) + +/* + * Maximal possible grant step plan in %. + */ +#define LDLM_POOL_MAX_GSP (30) + +/* + * Minimal possible grant step plan in %. + */ +#define LDLM_POOL_MIN_GSP (1) + +/* + * This controls the speed of reaching LDLM_POOL_MAX_GSP + * with increasing thread period. + */ +#define LDLM_POOL_GSP_STEP_SHIFT (2) + +/* + * LDLM_POOL_GSP% of all locks is default GP. + */ +#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) + +/* + * Max age for locks on clients. + */ +#define LDLM_POOL_MAX_AGE (36000) + +/* + * The granularity of SLV calculation. + */ +#define LDLM_POOL_SLV_SHIFT (10) + +extern struct proc_dir_entry *ldlm_ns_proc_dir; + +static inline __u64 dru(__u64 val, __u32 shift, int round_up) +{ + return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift; +} + +static inline __u64 ldlm_pool_slv_max(__u32 L) +{ + /* + * Allow to have all locks for 1 client for 10 hrs. + * Formula is the following: limit * 10h / 1 client. + */ + __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1; + return lim; +} + +static inline __u64 ldlm_pool_slv_min(__u32 L) +{ + return 1; +} + +enum { + LDLM_POOL_FIRST_STAT = 0, + LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, + LDLM_POOL_GRANT_STAT, + LDLM_POOL_CANCEL_STAT, + LDLM_POOL_GRANT_RATE_STAT, + LDLM_POOL_CANCEL_RATE_STAT, + LDLM_POOL_GRANT_PLAN_STAT, + LDLM_POOL_SLV_STAT, + LDLM_POOL_SHRINK_REQTD_STAT, + LDLM_POOL_SHRINK_FREED_STAT, + LDLM_POOL_RECALC_STAT, + LDLM_POOL_TIMING_STAT, + LDLM_POOL_LAST_STAT +}; + +static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) +{ + return container_of(pl, struct ldlm_namespace, ns_pool); +} + +/** + * Calculates suggested grant_step in % of available locks for passed + * \a period. This is later used in grant_plan calculations. + */ +static inline int ldlm_pool_t2gsp(unsigned int t) +{ + /* + * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP + * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. + * + * How this will affect execution is the following: + * + * - for thread period 1s we will have grant_step 1% which good from + * pov of taking some load off from server and push it out to clients. + * This is like that because 1% for grant_step means that server will + * not allow clients to get lots of locks in short period of time and + * keep all old locks in their caches. 
Clients will always have to + * get some locks back if they want to take some new; + * + * - for thread period 10s (which is default) we will have 23% which + * means that clients will have enough of room to take some new locks + * without getting some back. All locks from this 23% which were not + * taken by clients in current period will contribute in SLV growing. + * SLV growing means more locks cached on clients until limit or grant + * plan is reached. + */ + return LDLM_POOL_MAX_GSP - + ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >> + (t >> LDLM_POOL_GSP_STEP_SHIFT)); +} + +static inline int ldlm_pool_granted(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_granted); +} + +/** + * Recalculates next grant limit on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) +{ + int granted, grant_step, limit; + + limit = ldlm_pool_get_limit(pl); + granted = ldlm_pool_granted(pl); + + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + grant_step = ((limit - granted) * grant_step) / 100; + pl->pl_grant_plan = granted + grant_step; + limit = (limit * 5) >> 2; + if (pl->pl_grant_plan > limit) + pl->pl_grant_plan = limit; +} + +/** + * Recalculates next SLV on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_slv(struct ldlm_pool *pl) +{ + int granted; + int grant_plan; + int round_up; + __u64 slv; + __u64 slv_factor; + __u64 grant_usage; + __u32 limit; + + slv = pl->pl_server_lock_volume; + grant_plan = pl->pl_grant_plan; + limit = ldlm_pool_get_limit(pl); + granted = ldlm_pool_granted(pl); + round_up = granted < limit; + + grant_usage = max_t(int, limit - (granted - grant_plan), 1); + + /* + * Find out SLV change factor which is the ratio of grant usage + * from limit. SLV changes as fast as the ratio of grant plan + * consumption. The more locks from grant plan are not consumed + * by clients in last interval (idle time), the faster grows + * SLV. And the opposite, the more grant plan is over-consumed + * (load time) the faster drops SLV. + */ + slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT); + do_div(slv_factor, limit); + slv = slv * slv_factor; + slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up); + + if (slv > ldlm_pool_slv_max(limit)) { + slv = ldlm_pool_slv_max(limit); + } else if (slv < ldlm_pool_slv_min(limit)) { + slv = ldlm_pool_slv_min(limit); + } + + pl->pl_server_lock_volume = slv; +} + +/** + * Recalculates next stats on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_stats(struct ldlm_pool *pl) +{ + int grant_plan = pl->pl_grant_plan; + __u64 slv = pl->pl_server_lock_volume; + int granted = ldlm_pool_granted(pl); + int grant_rate = atomic_read(&pl->pl_grant_rate); + int cancel_rate = atomic_read(&pl->pl_cancel_rate); + + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, + slv); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + granted); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + grant_rate); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + grant_plan); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + cancel_rate); +} + +/** + * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd. + */ +static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Set new SLV in obd field for using it later without accessing the + * pool. 
This is required to avoid race between sending reply to client + * with new SLV and cleanup server stack in which we can't guarantee + * that namespace is still alive. We know only that obd is alive as + * long as valid export is alive. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = pl->pl_server_lock_volume; + write_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates all pool fields on passed \a pl. + * + * \pre ->pl_lock is not locked. + */ +static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) +{ + time64_t recalc_interval_sec; + ENTRY; + + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + /* + * Recalc SLV after last period. This should be done + * _before_ recalculating new grant plan. + */ + ldlm_pool_recalc_slv(pl); + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + + /* + * Update grant_plan for new period. + */ + ldlm_pool_recalc_grant_plan(pl); + + pl->pl_recalc_time = ktime_get_real_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(0); +} + +/** + * This function is used on server side as main entry point for memory + * pressure handling. It decreases SLV on \a pl according to passed + * \a nr and \a gfp_mask. + * + * Our goal here is to decrease SLV such a way that clients hold \a nr + * locks smaller in next 10h. + */ +static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + __u32 limit; + + /* + * VM is asking how many entries may be potentially freed. + */ + if (nr == 0) + return ldlm_pool_granted(pl); + + /* + * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. + */ + if (ldlm_pool_granted(pl) == 0) + RETURN(0); + + spin_lock(&pl->pl_lock); + + /* + * We want shrinker to possibly cause cancellation of @nr locks from + * clients or grant approximately @nr locks smaller next intervals. + * + * This is why we decreased SLV by @nr. This effect will only be as + * long as one re-calc interval (1s these days) and this should be + * enough to pass this decreased SLV to all clients. On next recalc + * interval pool will either increase SLV if locks load is not high + * or will keep on same level or even decrease again, thus, shrinker + * decreased SLV will affect next recalc intervals and this way will + * make locking load lower. + */ + if (nr < pl->pl_server_lock_volume) { + pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; + } else { + limit = ldlm_pool_get_limit(pl); + pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); + } + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * We did not really free any memory here so far, it only will be + * freed later may be, so that we return 0 to not confuse VM. + */ + return 0; +} + +/** + * Setup server side pool \a pl with passed \a limit. 
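+ *
+ * Editorial usage sketch: this is the ->po_setup() method reached through
+ * the generic wrapper, e.g.
+ *
+ *   ldlm_pool_setup(&ns->ns_pool, new_limit);
+ *
+ * where "new_limit" is a placeholder.  It stores the limit both in
+ * obd->obd_pool_limit (under obd_pool_lock) and in the pool itself via
+ * ldlm_pool_set_limit().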
+ */ +static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) +{ + struct obd_device *obd; + + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL && obd != LP_POISON); + LASSERT(obd->obd_type != LP_POISON); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_limit = limit; + write_unlock(&obd->obd_pool_lock); + + ldlm_pool_set_limit(pl, limit); + return 0; +} + +/** + * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd tp passed \a pl. + */ +static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + read_lock(&obd->obd_pool_lock); + pl->pl_server_lock_volume = obd->obd_pool_slv; + ldlm_pool_set_limit(pl, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates client size pool \a pl according to current SLV and Limit. + */ +static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) +{ + time64_t recalc_interval_sec; + int ret; + ENTRY; + + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + /* + * Check if we need to recalc lists now. + */ + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * In the time of canceling locks on client we do not need to maintain + * sharp timing, we only want to cancel locks asap according to new SLV. + * It may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. + */ + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, 0); + + spin_lock(&pl->pl_lock); + /* + * Time of LRU resizing might be longer than period, + * so update after LRU resizing rather than before it. + */ + pl->pl_recalc_time = ktime_get_real_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(ret); +} + +/** + * This function is main entry point for memory pressure handling on client + * side. Main goal of this function is to cancel some number of locks on + * passed \a pl according to \a nr and \a gfp_mask. + */ +static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + struct ldlm_namespace *ns; + int unused; + + ns = ldlm_pl2ns(pl); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ns)) + RETURN(0); + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + spin_unlock(&ns->ns_lock); + + if (nr == 0) + return (unused / 100) * sysctl_vfs_cache_pressure; + else + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, 0); +} + +static struct ldlm_pool_ops ldlm_srv_pool_ops = { + .po_recalc = ldlm_srv_pool_recalc, + .po_shrink = ldlm_srv_pool_shrink, + .po_setup = ldlm_srv_pool_setup +}; + +static struct ldlm_pool_ops ldlm_cli_pool_ops = { + .po_recalc = ldlm_cli_pool_recalc, + .po_shrink = ldlm_cli_pool_shrink +}; + +/** + * Pool recalc wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. 
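+ *
+ * Editorial worked example for the server path (ldlm_pool_recalc_slv()
+ * above), using made-up numbers: with limit == 1000, if clients consumed
+ * 200 locks more than the grant plan, grant_usage is 800 and the new SLV
+ * is about 80% of the old one; if they consumed 100 fewer than planned,
+ * grant_usage is 1100 and the SLV grows by roughly 10%.  The value
+ * returned here is the number of seconds until the next recalc is due
+ * (at least 1), consumed e.g. as
+ *
+ *   delay = min(delay, ldlm_pool_recalc(&ns->ns_pool));
+ *
+ * in ldlm_pools_recalc_delay() below.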
+ */ +time64_t ldlm_pool_recalc(struct ldlm_pool *pl) +{ + time64_t recalc_interval_sec; + int count; + + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec > 0) { + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + + if (recalc_interval_sec > 0) { + /* + * Update pool statistics every 1s. + */ + ldlm_pool_recalc_stats(pl); + + /* + * Zero out all rates and speed for the last period. + */ + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + } + spin_unlock(&pl->pl_lock); + } + + if (pl->pl_ops->po_recalc != NULL) { + count = pl->pl_ops->po_recalc(pl); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + count); + } + + recalc_interval_sec = pl->pl_recalc_time - ktime_get_real_seconds() + + pl->pl_recalc_period; + if (recalc_interval_sec <= 0) { + /* DEBUG: should be re-removed after LU-4536 is fixed */ + CDEBUG(D_DLMTRACE, "%s: Negative interval(%lld), too short period(%lld)\n", + pl->pl_name, recalc_interval_sec, + (s64)pl->pl_recalc_period); + + /* Prevent too frequent recalculation. */ + recalc_interval_sec = 1; + } + + return recalc_interval_sec; +} + +/** + * Pool shrink wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask) +{ + int cancel = 0; + + if (pl->pl_ops->po_shrink != NULL) { + cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); + if (nr > 0) { + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_REQTD_STAT, + nr); + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_FREED_STAT, + cancel); + CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, " + "shrunk %d\n", pl->pl_name, nr, cancel); + } + } + return cancel; +} + +/** + * Pool setup wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + * + * Sets passed \a limit into pool \a pl. 
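+ *
+ * Editorial sketch of how the server-side limit is chosen (see
+ * ldlm_pools_recalc_task() below): each "modest" namespace is set to its
+ * average granted lock count plus a ~6% margin, the remainder of
+ * LDLM_POOL_HOST_L is split equally among the "greedy" namespaces, and if
+ * the modest pools would eat 2/3 or more of the host limit, every pool
+ * falls back to an equal share.  With made-up numbers: for
+ * LDLM_POOL_HOST_L == 100000, four modest pools whose limits sum to 10000
+ * and six greedy pools, each greedy pool is set up with
+ * (100000 - 10000) / 6 == 15000 locks.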
+ */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + if (pl->pl_ops->po_setup != NULL) + return(pl->pl_ops->po_setup(pl, limit)); + return 0; +} + +static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) +{ + int granted, grant_rate, cancel_rate, grant_step; + int grant_speed, grant_plan, lvf; + struct ldlm_pool *pl = m->private; + __u64 slv, clv; + __u32 limit; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + clv = pl->pl_client_lock_volume; + limit = ldlm_pool_get_limit(pl); + grant_plan = pl->pl_grant_plan; + granted = ldlm_pool_granted(pl); + grant_rate = atomic_read(&pl->pl_grant_rate); + cancel_rate = atomic_read(&pl->pl_cancel_rate); + grant_speed = grant_rate - cancel_rate; + lvf = atomic_read(&pl->pl_lock_volume_factor); + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + spin_unlock(&pl->pl_lock); + + seq_printf(m, "LDLM pool state (%s):\n" + " SLV: %llu\n" + " CLV: %llu\n" + " LVF: %d\n", + pl->pl_name, slv, clv, lvf); + + if (ns_is_server(ldlm_pl2ns(pl))) { + seq_printf(m, " GSP: %d%%\n", grant_step); + seq_printf(m, " GP: %d\n", grant_plan); + } + + seq_printf(m, " GR: %d\n CR: %d\n GS: %d\n G: %d\n L: %d\n", + grant_rate, cancel_rate, grant_speed, + granted, limit); + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(lprocfs_pool_state); + +static ssize_t grant_speed_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, + pl_kobj); + int grant_speed; + + spin_lock(&pl->pl_lock); + /* serialize with ldlm_pool_recalc */ + grant_speed = atomic_read(&pl->pl_grant_rate) - + atomic_read(&pl->pl_cancel_rate); + spin_unlock(&pl->pl_lock); + return sprintf(buf, "%d\n", grant_speed); +} +LUSTRE_RO_ATTR(grant_speed); + +LDLM_POOL_SYSFS_READER_SHOW(grant_plan, int); +LUSTRE_RO_ATTR(grant_plan); + +LDLM_POOL_SYSFS_READER_SHOW(recalc_period, int); +LDLM_POOL_SYSFS_WRITER_STORE(recalc_period, int); +LUSTRE_RW_ATTR(recalc_period); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(server_lock_volume, u64); +LUSTRE_RO_ATTR(server_lock_volume); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(limit, atomic); +LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(limit, atomic); +LUSTRE_RW_ATTR(limit); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(granted, atomic); +LUSTRE_RO_ATTR(granted); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(cancel_rate, atomic); +LUSTRE_RO_ATTR(cancel_rate); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(grant_rate, atomic); +LUSTRE_RO_ATTR(grant_rate); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(lock_volume_factor, atomic); +LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(lock_volume_factor, atomic); +LUSTRE_RW_ATTR(lock_volume_factor); + +/* These are for pools in /sys/fs/lustre/ldlm/namespaces/.../pool */ +static struct attribute *ldlm_pl_attrs[] = { + &lustre_attr_grant_speed.attr, + &lustre_attr_grant_plan.attr, + &lustre_attr_recalc_period.attr, + &lustre_attr_server_lock_volume.attr, + &lustre_attr_limit.attr, + &lustre_attr_granted.attr, + &lustre_attr_cancel_rate.attr, + &lustre_attr_grant_rate.attr, + &lustre_attr_lock_volume_factor.attr, + NULL, +}; + +static void ldlm_pl_release(struct kobject *kobj) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, + pl_kobj); + complete(&pl->pl_kobj_unregister); +} + +static struct kobj_type ldlm_pl_ktype = { + .default_attrs = ldlm_pl_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = ldlm_pl_release, +}; + +static int ldlm_pool_sysfs_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + int err; + + init_completion(&pl->pl_kobj_unregister); + err = 
kobject_init_and_add(&pl->pl_kobj, &ldlm_pl_ktype, &ns->ns_kobj, + "pool"); + + return err; +} + +static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + struct dentry *debugfs_ns_parent; + struct ldebugfs_vars pool_vars[2]; + char *var_name = NULL; + int rc = 0; + ENTRY; + + OBD_ALLOC(var_name, MAX_STRING_SIZE + 1); + if (!var_name) + RETURN(-ENOMEM); + + debugfs_ns_parent = ns->ns_debugfs_entry; + if (IS_ERR_OR_NULL(debugfs_ns_parent)) { + CERROR("%s: debugfs entry is not initialized\n", + ldlm_ns_name(ns)); + GOTO(out_free_name, rc = -EINVAL); + } + pl->pl_debugfs_entry = ldebugfs_register("pool", debugfs_ns_parent, + NULL, NULL); + if (IS_ERR(pl->pl_debugfs_entry)) { + rc = PTR_ERR(pl->pl_debugfs_entry); + pl->pl_debugfs_entry = NULL; + CERROR("%s: cannot create 'pool' debugfs entry: rc = %d\n", + ldlm_ns_name(ns), rc); + GOTO(out_free_name, rc); + } + + var_name[MAX_STRING_SIZE] = '\0'; + memset(pool_vars, 0, sizeof(pool_vars)); + pool_vars[0].name = var_name; + + ldlm_add_var(&pool_vars[0], pl->pl_debugfs_entry, "state", pl, + &lprocfs_pool_state_fops); + + pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - + LDLM_POOL_FIRST_STAT, 0); + if (!pl->pl_stats) + GOTO(out_free_name, rc = -ENOMEM); + + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "granted", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_plan", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "slv", "slv"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_request", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_timing", "sec"); + rc = ldebugfs_register_stats(pl->pl_debugfs_entry, "stats", + pl->pl_stats); + + EXIT; +out_free_name: + OBD_FREE(var_name, MAX_STRING_SIZE + 1); + return rc; +} + +static void ldlm_pool_sysfs_fini(struct ldlm_pool *pl) +{ + kobject_put(&pl->pl_kobj); + wait_for_completion(&pl->pl_kobj_unregister); +} + +static void ldlm_pool_debugfs_fini(struct ldlm_pool *pl) +{ + if (pl->pl_stats != NULL) { + lprocfs_free_stats(&pl->pl_stats); + pl->pl_stats = NULL; + } + if (pl->pl_debugfs_entry != NULL) { + ldebugfs_remove(&pl->pl_debugfs_entry); + pl->pl_debugfs_entry = NULL; + } +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client) +{ + int rc; + ENTRY; + + spin_lock_init(&pl->pl_lock); + atomic_set(&pl->pl_granted, 0); + 
pl->pl_recalc_time = ktime_get_real_seconds(); + atomic_set(&pl->pl_lock_volume_factor, 1); + + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L); + + snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d", + ldlm_ns_name(ns), idx); + + if (client == LDLM_NAMESPACE_SERVER) { + pl->pl_ops = &ldlm_srv_pool_ops; + ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); + pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD; + pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L); + } else { + ldlm_pool_set_limit(pl, 1); + pl->pl_server_lock_volume = 0; + pl->pl_ops = &ldlm_cli_pool_ops; + pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; + } + pl->pl_client_lock_volume = 0; + rc = ldlm_pool_debugfs_init(pl); + if (rc) + RETURN(rc); + + rc = ldlm_pool_sysfs_init(pl); + if (rc) + RETURN(rc); + + CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name); + + RETURN(rc); +} + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + ENTRY; + ldlm_pool_sysfs_fini(pl); + ldlm_pool_debugfs_fini(pl); + + /* + * Pool should not be used after this point. We can't free it here as + * it lives in struct ldlm_namespace, but still interested in catching + * any abnormal using cases. + */ + POISON(pl, 0x5a, sizeof(*pl)); + EXIT; +} + +/** + * Add new taken ldlm lock \a lock into pool \a pl accounting. + */ +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * FLOCK locks are special in a sense that they are almost never + * cancelled, instead special kind of lock is used to drop them. + * also there is no LRU for flock locks, so no point in tracking + * them anyway. + * + * PLAIN locks are used by config and quota, the quantity is small + * and usually they are not in LRU. + */ + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) + return; + + ldlm_reclaim_add(lock); + + atomic_inc(&pl->pl_granted); + atomic_inc(&pl->pl_grant_rate); + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); + /* + * Do not do pool recalc for client side as all locks which + * potentially may be canceled has already been packed into + * enqueue/cancel rpc. Also we do not want to run out of stack + * with too long call paths. + */ + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} + +/** + * Remove ldlm lock \a lock from pool \a pl accounting. + */ +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * Filter out FLOCK & PLAIN locks. Read above comment in + * ldlm_pool_add(). + */ + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) + return; + + ldlm_reclaim_del(lock); + + LASSERT(atomic_read(&pl->pl_granted) > 0); + atomic_dec(&pl->pl_granted); + atomic_inc(&pl->pl_cancel_rate); + + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); + + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} + +/** + * Returns current \a pl SLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + __u64 slv; + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} + +/** + * Sets passed \a slv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + spin_lock(&pl->pl_lock); + pl->pl_server_lock_volume = slv; + spin_unlock(&pl->pl_lock); +} + +/** + * Returns current \a pl CLV. + * + * \pre ->pl_lock is not locked. 
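+ *
+ * Editorial note: per the file-header comment, a client cancels a cached
+ * lock once its lock volume ("locks in LRU" * "lock live time in seconds")
+ * exceeds the last SLV received from the server, with the lock volume
+ * factor making cancellation more or less aggressive.  Purely illustrative
+ * numbers: 1000 cached locks with an average age of 600s put the client
+ * volume around 600000, so cancellation starts once the advertised SLV
+ * drops below that.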
+ */ +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + __u64 slv; + spin_lock(&pl->pl_lock); + slv = pl->pl_client_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} + +/** + * Sets passed \a clv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + spin_lock(&pl->pl_lock); + pl->pl_client_lock_volume = clv; + spin_unlock(&pl->pl_lock); +} + +/** + * Returns current \a pl limit. + */ +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_limit); +} + +/** + * Sets passed \a limit to \a pl. + */ +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + atomic_set(&pl->pl_limit, limit); +} + +/** + * Returns current LVF from \a pl. + */ +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_lock_volume_factor); +} + +static struct shrinker *ldlm_pools_srv_shrinker; +static struct shrinker *ldlm_pools_cli_shrinker; + +/* +* count locks from all namespaces (if possible). Returns number of +* cached locks. +*/ +static unsigned long ldlm_pools_count(enum ldlm_side client, gfp_t gfp_mask) +{ + unsigned long total = 0; + int nr_ns; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; /* loop detection */ + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return 0; + + CDEBUG(D_DLMTRACE, "Request to count %s locks from all pools\n", + client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); + + /* + * Find out how many resources we may release. + */ + for (nr_ns = ldlm_namespace_nr_read(client); + nr_ns > 0; nr_ns--) { + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + return 0; + } + ns = ldlm_namespace_first_locked(client); + + if (ns == ns_old) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); + ldlm_namespace_put(ns); + } + + return total; +} + +static unsigned long ldlm_pools_scan(enum ldlm_side client, int nr, + gfp_t gfp_mask) +{ + unsigned long freed = 0; + int tmp, nr_ns; + struct ldlm_namespace *ns; + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return -1; + + /* + * Shrink at least ldlm_namespace_nr_read(client) namespaces. + */ + for (tmp = nr_ns = ldlm_namespace_nr_read(client); + tmp > 0; tmp--) { + int cancel, nr_locks; + + /* + * Do not call shrink under ldlm_namespace_lock(client) + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + nr_locks = ldlm_pool_granted(&ns->ns_pool); + /* + * We use to shrink propotionally but with new shrinker API, + * we lost the total number of freeable locks. + */ + cancel = 1 + min_t(int, nr_locks, nr / nr_ns); + freed += ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask); + ldlm_namespace_put(ns); + } + /* + * we only decrease the SLV in server pools shrinker, return + * SHRINK_STOP to kernel to avoid needless loop. 
LU-1128 + */ + return (client == LDLM_NAMESPACE_SERVER) ? SHRINK_STOP : freed; +} + +#ifdef HAVE_SHRINKER_COUNT +static unsigned long ldlm_pools_srv_count(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_SERVER, sc->gfp_mask); +} + +static unsigned long ldlm_pools_srv_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_SERVER, sc->nr_to_scan, + sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_count(struct shrinker *s, struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_CLIENT, sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_CLIENT, sc->nr_to_scan, + sc->gfp_mask); +} + +#else +/* + * Cancel \a nr locks from all namespaces (if possible). Returns number of + * cached locks after shrink is finished. All namespaces are asked to + * cancel approximately equal amount of locks to keep balancing. + */ +static int ldlm_pools_shrink(enum ldlm_side client, int nr, gfp_t gfp_mask) +{ + unsigned long total = 0; + + if (client == LDLM_NAMESPACE_CLIENT && nr != 0 && + !(gfp_mask & __GFP_FS)) + return -1; + + CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n", + nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); + + total = ldlm_pools_count(client, gfp_mask); + + if (nr == 0 || total == 0) + return total; + + return ldlm_pools_scan(client, nr, gfp_mask); +} + +static int ldlm_pools_srv_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, + shrink_param(sc, nr_to_scan), + shrink_param(sc, gfp_mask)); +} + +static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, + shrink_param(sc, nr_to_scan), + shrink_param(sc, gfp_mask)); +} + +#endif /* HAVE_SHRINKER_COUNT */ + +static time64_t ldlm_pools_recalc_delay(enum ldlm_side side) +{ + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; + /* seconds of sleep if no active namespaces */ + time64_t delay = side == LDLM_NAMESPACE_SERVER ? + LDLM_POOL_SRV_DEF_RECALC_PERIOD : + LDLM_POOL_CLI_DEF_RECALC_PERIOD; + int nr; + + /* Recalc at least ldlm_namespace_nr(side) namespaces. */ + for (nr = ldlm_namespace_nr_read(side); nr > 0; nr--) { + int skip; + /* + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock, which is really good as we + * get rid of potential deadlock on side nodes when canceling + * locks synchronously. + */ + mutex_lock(ldlm_namespace_lock(side)); + if (list_empty(ldlm_namespace_list(side))) { + mutex_unlock(ldlm_namespace_lock(side)); + break; + } + ns = ldlm_namespace_first_locked(side); + + if (ns_old == ns) { /* Full pass complete */ + mutex_unlock(ldlm_namespace_lock(side)); + break; + } + + /* We got an empty namespace, need to move it back to inactive + * list. + * The race with parallel resource creation is fine: + * - If they do namespace_get before our check, we fail the + * check and they move this item to the end of the list anyway + * - If we do the check and then they do namespace_get, then + * we move the namespace to inactive and they will move + * it back to active (synchronised by the lock, so no clash + * there). 
+ */ + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + spin_lock(&ns->ns_lock); + /* + * skip ns which is being freed, and we don't want to increase + * its refcount again, not even temporarily. bz21519 & LU-499. + */ + if (ns->ns_stopping) { + skip = 1; + } else { + skip = 0; + ldlm_namespace_get(ns); + } + spin_unlock(&ns->ns_lock); + + ldlm_namespace_move_to_active_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); + + /* + * After setup is done - recalc the pool. + */ + if (!skip) { + delay = min(delay, ldlm_pool_recalc(&ns->ns_pool)); + ldlm_namespace_put(ns); + } + } + + return delay; +} + +static void ldlm_pools_recalc_task(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ldlm_pools_recalc_work, ldlm_pools_recalc_task); + +static void ldlm_pools_recalc_task(struct work_struct *ws) +{ + /* seconds of sleep if no active namespaces */ + time64_t delay; +#ifdef HAVE_SERVER_SUPPORT + struct ldlm_namespace *ns; + unsigned long nr_l = 0, nr_p = 0, l; + int equal = 0; + + /* Check all modest namespaces first. */ + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; + + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; + + /* + * Set the modest pools limit equal to their avg granted + * locks + ~6%. + */ + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } + + /* + * Make sure than modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("'Modest' pools eat out 2/3 of server locks " + "limit (%lu of %lu). This means that you have too " + "many clients for this amount of server RAM. " + "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); + equal = 1; + } + + /* The rest is given to greedy namespaces. */ + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; + + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); + } + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + + delay = min(ldlm_pools_recalc_delay(LDLM_NAMESPACE_SERVER), + ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT)); +#else /* !HAVE_SERVER_SUPPORT */ + delay = ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT); +#endif /* HAVE_SERVER_SUPPORT */ + + /* Wake up the blocking threads from time to time. 
*/ + ldlm_bl_thread_wakeup(); + + schedule_delayed_work(&ldlm_pools_recalc_work, cfs_time_seconds(delay)); +} + +int ldlm_pools_init(void) +{ + DEF_SHRINKER_VAR(shsvar, ldlm_pools_srv_shrink, + ldlm_pools_srv_count, ldlm_pools_srv_scan); + DEF_SHRINKER_VAR(shcvar, ldlm_pools_cli_shrink, + ldlm_pools_cli_count, ldlm_pools_cli_scan); + + schedule_delayed_work(&ldlm_pools_recalc_work, + LDLM_POOL_CLI_DEF_RECALC_PERIOD); + ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS, &shsvar); + ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS, &shcvar); + + return 0; +} + +void ldlm_pools_fini(void) +{ + if (ldlm_pools_srv_shrinker != NULL) { + remove_shrinker(ldlm_pools_srv_shrinker); + ldlm_pools_srv_shrinker = NULL; + } + if (ldlm_pools_cli_shrinker != NULL) { + remove_shrinker(ldlm_pools_cli_shrinker); + ldlm_pools_cli_shrinker = NULL; + } + cancel_delayed_work_sync(&ldlm_pools_recalc_work); +} + +#else /* !HAVE_LRU_RESIZE_SUPPORT */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + return 0; +} + +time64_t ldlm_pool_recalc(struct ldlm_pool *pl) +{ + return 0; +} + +int ldlm_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + return 0; +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client) +{ + return 0; +} + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + return; +} + +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + return; +} + +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + return; +} + +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + return 1; +} + +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + return; +} + +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + return 1; +} + +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + return; +} + +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return 0; +} + +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + return; +} + +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return 0; +} + +int ldlm_pools_init(void) +{ + return 0; +} + +void ldlm_pools_fini(void) +{ + return; +} + +#endif /* HAVE_LRU_RESIZE_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c new file mode 100644 index 0000000000000..cf4c87f9e2312 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c @@ -0,0 +1,411 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Intel Corporation. + * Use is subject to license terms. 
+ * + * Author: Niu Yawei + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include "ldlm_internal.h" + +/* + * To avoid ldlm lock exhausting server memory, two global parameters: + * ldlm_reclaim_threshold & ldlm_lock_limit are used for reclaiming + * granted locks and rejecting incoming enqueue requests defensively. + * + * ldlm_reclaim_threshold: When the amount of granted locks reaching this + * threshold, server start to revoke locks gradually. + * + * ldlm_lock_limit: When the amount of granted locks reaching this + * threshold, server will return -EINPROGRESS to any incoming enqueue + * request until the lock count is shrunk below the threshold again. + * + * ldlm_reclaim_threshold & ldlm_lock_limit is set to 20% & 30% of the + * total memory by default. It is tunable via proc entry, when it's set + * to 0, the feature is disabled. + */ + +#ifdef HAVE_SERVER_SUPPORT + +/* Lock count is stored in ldlm_reclaim_threshold & ldlm_lock_limit */ +__u64 ldlm_reclaim_threshold; +__u64 ldlm_lock_limit; + +/* Represents ldlm_reclaim_threshold & ldlm_lock_limit in MB, used for + * proc interface. */ +__u64 ldlm_reclaim_threshold_mb; +__u64 ldlm_lock_limit_mb; + +struct percpu_counter ldlm_granted_total; +static atomic_t ldlm_nr_reclaimer; +static s64 ldlm_last_reclaim_age_ns; +static ktime_t ldlm_last_reclaim_time; + +struct ldlm_reclaim_cb_data { + struct list_head rcd_rpc_list; + int rcd_added; + int rcd_total; + int rcd_cursor; + int rcd_start; + bool rcd_skip; + s64 rcd_age_ns; + struct cfs_hash_bd *rcd_prev_bd; +}; + +static inline bool ldlm_lock_reclaimable(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + /* FLOCK & PLAIN lock are not reclaimable. FLOCK is + * explicitly controlled by application, PLAIN lock + * is used by quota global lock and config lock. + */ + if (ns->ns_client == LDLM_NAMESPACE_SERVER && + (lock->l_resource->lr_type == LDLM_IBITS || + lock->l_resource->lr_type == LDLM_EXTENT)) + return true; + return false; +} + +/** + * Callback function for revoking locks from certain resource. 
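+ *
+ * Editorial summary (sketch): granted locks that are reclaimable (IBITS or
+ * EXTENT on a server namespace) and older than rcd_age_ns are marked with
+ * ldlm_set_ast_sent(), collected on rcd_rpc_list for a later
+ * LDLM_WORK_REVOKE_AST run, and the hash iteration stops (return 1) once
+ * rcd_added reaches rcd_total.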
+ * + * \param [in] hs ns_rs_hash + * \param [in] bd current bucket of ns_rsh_hash + * \param [in] hnode hnode of the resource + * \param [in] arg opaque data + * + * \retval 0 continue the scan + * \retval 1 stop the iteration + */ +static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res; + struct ldlm_reclaim_cb_data *data; + struct ldlm_lock *lock; + struct ldlm_ns_bucket *nsb; + int rc = 0; + + data = (struct ldlm_reclaim_cb_data *)arg; + + LASSERTF(data->rcd_added < data->rcd_total, "added:%d >= total:%d\n", + data->rcd_added, data->rcd_total); + + nsb = cfs_hash_bd_extra_get(hs, bd); + res = cfs_hash_object(hs, hnode); + + if (data->rcd_prev_bd != bd) { + if (data->rcd_prev_bd != NULL) + ldlm_res_to_ns(res)->ns_reclaim_start++; + data->rcd_prev_bd = bd; + data->rcd_cursor = 0; + data->rcd_start = nsb->nsb_reclaim_start % + cfs_hash_bd_count_get(bd); + } + + if (data->rcd_skip && data->rcd_cursor < data->rcd_start) { + data->rcd_cursor++; + return 0; + } + + nsb->nsb_reclaim_start++; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (!ldlm_lock_reclaimable(lock)) + continue; + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW) && + ktime_before(ktime_get(), + ktime_add_ns(lock->l_last_used, + data->rcd_age_ns))) + continue; + + if (!ldlm_is_ast_sent(lock)) { + ldlm_set_ast_sent(lock); + LASSERT(list_empty(&lock->l_rk_ast)); + list_add(&lock->l_rk_ast, &data->rcd_rpc_list); + LDLM_LOCK_GET(lock); + if (++data->rcd_added == data->rcd_total) { + rc = 1; /* stop the iteration */ + break; + } + } + } + unlock_res(res); + + return rc; +} + +/** + * Revoke locks from the resources of a namespace in a roundrobin + * manner. + * + * \param[in] ns namespace to do the lock revoke on + * \param[in] count count of lock to be revoked + * \param[in] age only revoke locks older than the 'age' + * \param[in] skip scan from the first lock on resource if the + * 'skip' is false, otherwise, continue scan + * from the last scanned position + * \param[out] count count of lock still to be revoked + */ +static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count, + s64 age_ns, bool skip) +{ + struct ldlm_reclaim_cb_data data; + int idx, type, start; + ENTRY; + + LASSERT(*count != 0); + + if (ns->ns_obd) { + type = server_name2index(ns->ns_obd->obd_name, &idx, NULL); + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + EXIT; + return; + } + } + + if (atomic_read(&ns->ns_bref) == 0) { + EXIT; + return; + } + + INIT_LIST_HEAD(&data.rcd_rpc_list); + data.rcd_added = 0; + data.rcd_total = *count; + data.rcd_age_ns = age_ns; + data.rcd_skip = skip; + data.rcd_prev_bd = NULL; + start = ns->ns_reclaim_start % CFS_HASH_NBKT(ns->ns_rs_hash); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_reclaim_lock_cb, &data, + start); + + CDEBUG(D_DLMTRACE, "NS(%s): %d locks to be reclaimed, found %d/%d " + "locks.\n", ldlm_ns_name(ns), *count, data.rcd_added, + data.rcd_total); + + LASSERTF(*count >= data.rcd_added, "count:%d, added:%d\n", *count, + data.rcd_added); + + ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST); + *count -= data.rcd_added; + EXIT; +} + +#define LDLM_RECLAIM_BATCH 512 +#define LDLM_RECLAIM_AGE_MIN (300 * NSEC_PER_SEC) +#define LDLM_RECLAIM_AGE_MAX (LDLM_DEFAULT_MAX_ALIVE * NSEC_PER_SEC * 3 / 4) + +static inline s64 ldlm_reclaim_age(void) +{ + s64 age_ns = ldlm_last_reclaim_age_ns; + ktime_t now = ktime_get(); + ktime_t diff; + + diff = 
ktime_sub(now, ldlm_last_reclaim_time); + age_ns += ktime_to_ns(diff); + if (age_ns > LDLM_RECLAIM_AGE_MAX) + age_ns = LDLM_RECLAIM_AGE_MAX; + else if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2)) + age_ns = LDLM_RECLAIM_AGE_MIN; + return age_ns; +} + +/** + * Revoke certain amount of locks from all the server namespaces + * in a roundrobin manner. Lock age is used to avoid reclaim on + * the non-aged locks. + */ +static void ldlm_reclaim_ns(void) +{ + struct ldlm_namespace *ns; + int count = LDLM_RECLAIM_BATCH; + int ns_nr, nr_processed; + enum ldlm_side ns_cli = LDLM_NAMESPACE_SERVER; + s64 age_ns; + bool skip = true; + ENTRY; + + if (!atomic_add_unless(&ldlm_nr_reclaimer, 1, 1)) { + EXIT; + return; + } + + age_ns = ldlm_reclaim_age(); +again: + nr_processed = 0; + ns_nr = ldlm_namespace_nr_read(ns_cli); + while (count > 0 && nr_processed < ns_nr) { + mutex_lock(ldlm_namespace_lock(ns_cli)); + + if (list_empty(ldlm_namespace_list(ns_cli))) { + mutex_unlock(ldlm_namespace_lock(ns_cli)); + goto out; + } + + ns = ldlm_namespace_first_locked(ns_cli); + ldlm_namespace_move_to_active_locked(ns, ns_cli); + mutex_unlock(ldlm_namespace_lock(ns_cli)); + + ldlm_reclaim_res(ns, &count, age_ns, skip); + ldlm_namespace_put(ns); + nr_processed++; + } + + if (count > 0 && age_ns > LDLM_RECLAIM_AGE_MIN) { + age_ns >>= 1; + if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2)) + age_ns = LDLM_RECLAIM_AGE_MIN; + skip = false; + goto again; + } + + ldlm_last_reclaim_age_ns = age_ns; + ldlm_last_reclaim_time = ktime_get(); +out: + atomic_add_unless(&ldlm_nr_reclaimer, -1, 0); + EXIT; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_add(&ldlm_granted_total, 1); + lock->l_last_used = ktime_get(); +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_sub(&ldlm_granted_total, 1); +} + +/** + * Check on the total granted locks: return true if it reaches the + * high watermark (ldlm_lock_limit), otherwise return false; It also + * triggers lock reclaim if the low watermark (ldlm_reclaim_threshold) + * is reached. + * + * \retval true high watermark reached. + * \retval false high watermark not reached. 
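+ *
+ * Editorial usage sketch (the caller of this check lives outside this
+ * file):
+ *
+ *   if (ldlm_reclaim_full())
+ *           rc = -EINPROGRESS;    (reject the enqueue, per the comment
+ *                                  at the top of this file)
+ *
+ * Illustrative arithmetic for the defaults, assuming 16 GiB of RAM and a
+ * 512-byte struct ldlm_lock purely for the sake of the example: the 20%
+ * low watermark works out to roughly 6.7 million locks and the 30% high
+ * watermark to about 10 million.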
+ */ +bool ldlm_reclaim_full(void) +{ + __u64 high = ldlm_lock_limit; + __u64 low = ldlm_reclaim_threshold; + + if (low != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) + low = cfs_fail_val; + + if (low != 0 && + percpu_counter_sum_positive(&ldlm_granted_total) > low) + ldlm_reclaim_ns(); + + if (high != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) + high = cfs_fail_val; + + if (high != 0 && + percpu_counter_sum_positive(&ldlm_granted_total) > high) + return true; + + return false; +} + +static inline __u64 ldlm_ratio2locknr(int ratio) +{ + __u64 locknr; + + locknr = ((__u64)NUM_CACHEPAGES << PAGE_SHIFT) * ratio; + do_div(locknr, 100 * sizeof(struct ldlm_lock)); + + return locknr; +} + +static inline __u64 ldlm_locknr2mb(__u64 locknr) +{ + return (locknr * sizeof(struct ldlm_lock) + 512 * 1024) >> 20; +} + +#define LDLM_WM_RATIO_LOW_DEFAULT 20 +#define LDLM_WM_RATIO_HIGH_DEFAULT 30 + +int ldlm_reclaim_setup(void) +{ + atomic_set(&ldlm_nr_reclaimer, 0); + + ldlm_reclaim_threshold = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT); + ldlm_reclaim_threshold_mb = ldlm_locknr2mb(ldlm_reclaim_threshold); + ldlm_lock_limit = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT); + ldlm_lock_limit_mb = ldlm_locknr2mb(ldlm_lock_limit); + + ldlm_last_reclaim_age_ns = LDLM_RECLAIM_AGE_MAX; + ldlm_last_reclaim_time = ktime_get(); + +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + return percpu_counter_init(&ldlm_granted_total, 0, GFP_KERNEL); +#else + return percpu_counter_init(&ldlm_granted_total, 0); +#endif +} + +void ldlm_reclaim_cleanup(void) +{ + percpu_counter_destroy(&ldlm_granted_total); +} + +#else /* HAVE_SERVER_SUPPORT */ + +bool ldlm_reclaim_full(void) +{ + return false; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ +} + +int ldlm_reclaim_setup(void) +{ + return 0; +} + +void ldlm_reclaim_cleanup(void) +{ +} + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c new file mode 100644 index 0000000000000..f16aaa954a54c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -0,0 +1,2471 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** + * This file contains Asynchronous System Trap (AST) handlers and related + * LDLM request-processing routines. + * + * An AST is a callback issued on a lock when its state is changed. 
There are + * several different types of ASTs (callbacks) registered for each lock: + * + * - completion AST: when a lock is enqueued by some process, but cannot be + * granted immediately due to other conflicting locks on the same resource, + * the completion AST is sent to notify the caller when the lock is + * eventually granted + * + * - blocking AST: when a lock is granted to some process, if another process + * enqueues a conflicting (blocking) lock on a resource, a blocking AST is + * sent to notify the holder(s) of the lock(s) of the conflicting lock + * request. The lock holder(s) must release their lock(s) on that resource in + * a timely manner or be evicted by the server. + * + * - glimpse AST: this is used when a process wants information about a lock + * (i.e. the lock value block (LVB)) but does not necessarily require holding + * the lock. If the resource is locked, the lock holder(s) are sent glimpse + * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL + * their lock(s) if they are idle. If the resource is not locked, the server + * may grant the lock. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include + +#include "ldlm_internal.h" + +unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +module_param(ldlm_enqueue_min, uint, 0644); +MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); +EXPORT_SYMBOL(ldlm_enqueue_min); + +/* in client side, whether the cached locks will be canceled before replay */ +unsigned int ldlm_cancel_unused_locks_before_replay = 1; + +static void interrupted_completion_wait(void *data) +{ +} + +struct lock_wait_data { + struct ldlm_lock *lwd_lock; + __u32 lwd_conn_cnt; +}; + +struct ldlm_async_args { + struct lustre_handle lock_handle; +}; + +/** + * ldlm_request_bufsize + * + * If opcode=LDLM_ENQUEUE, 1 slot is already occupied, + * LDLM_LOCKREQ_HANDLE -1 slots are available. + * Otherwise, LDLM_LOCKREQ_HANDLE slots are available. 
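+ *
+ * Editorial example with illustrative values LDLM_LOCKREQ_HANDLES == 2
+ * and LDLM_ENQUEUE_CANCEL_OFF == 1: an LDLM_ENQUEUE request carrying 4
+ * cancel handles needs
+ *
+ *   sizeof(struct ldlm_request) + (4 - 1) * sizeof(struct lustre_handle)
+ *
+ * bytes, while a single handle fits into the base request with no extra
+ * space.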
+ * + * \param[in] count + * \param[in] type + * + * \retval size of the request buffer + */ + +int ldlm_request_bufsize(int count, int type) +{ + int avail = LDLM_LOCKREQ_HANDLES; + if (type == LDLM_ENQUEUE) + avail -= LDLM_ENQUEUE_CANCEL_OFF; + + if (count > avail) + avail = (count - avail) * sizeof(struct lustre_handle); + else + avail = 0; + + return sizeof(struct ldlm_request) + avail; +} + +int ldlm_expired_completion_wait(void *data) +{ + struct lock_wait_data *lwd = data; + struct ldlm_lock *lock = lwd->lwd_lock; + struct obd_import *imp; + struct obd_device *obd; + + ENTRY; + if (lock->l_conn_export == NULL) { + static time64_t next_dump, last_dump; + + LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); " + "not entering recovery in server code, just going back to sleep", + (s64)lock->l_activity, + (s64)(ktime_get_real_seconds() - + lock->l_activity)); + if (ktime_get_seconds() > next_dump) { + last_dump = next_dump; + next_dump = ktime_get_seconds() + 300; + ldlm_namespace_dump(D_DLMTRACE, + ldlm_lock_to_ns(lock)); + if (last_dump == 0) + libcfs_debug_dumplog(); + } + RETURN(0); + } + + obd = lock->l_conn_export->exp_obd; + imp = obd->u.cli.cl_import; + ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); + LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago), entering recovery for %s@%s", + (s64)lock->l_activity, + (s64)(ktime_get_real_seconds() - lock->l_activity), + obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); + + RETURN(0); +} + +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock) +{ + int ret = 0; + + check_res_locked(lock->l_resource); + if (ldlm_is_granted(lock) && !ldlm_is_cp_reqd(lock)) + ret = 1; + else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) + ret = 1; + return ret; +} +EXPORT_SYMBOL(is_granted_or_cancelled_nolock); + +/** + * Calculate the Completion timeout (covering enqueue, BL AST, data flush, + * lock cancel, and their replies). Used for lock completion timeout on the + * client side. + * + * \param[in] lock lock which is waiting the completion callback + * + * \retval timeout in seconds to wait for the server reply + */ + +/* We use the same basis for both server side and client side functions + from a single node. */ +static time64_t ldlm_cp_timeout(struct ldlm_lock *lock) +{ + time64_t timeout; + + if (AT_OFF) + return obd_timeout; + + /* Wait a long time for enqueue - server may have to callback a + * lock from another client. Server will evict the other client if it + * doesn't respond reasonably, and then give us the lock. */ + timeout = at_get(ldlm_lock_to_ns_at(lock)); + return max(3 * timeout, (time64_t) ldlm_enqueue_min); +} + +/** + * Helper function for ldlm_completion_ast(), updating timings when lock is + * actually granted. + */ +static int ldlm_completion_tail(struct ldlm_lock *lock, void *data) +{ + time64_t delay; + int result = 0; + + if (ldlm_is_destroyed(lock) || ldlm_is_failed(lock)) { + LDLM_DEBUG(lock, "client-side enqueue: destroyed"); + result = -EIO; + } else if (data == NULL) { + LDLM_DEBUG(lock, "client-side enqueue: granted"); + } else { + /* Take into AT only CP RPC, not immediately granted locks */ + delay = ktime_get_real_seconds() - lock->l_activity; + LDLM_DEBUG(lock, "client-side enqueue: granted after %llds", + (s64)delay); + + /* Update our time estimate */ + at_measured(ldlm_lock_to_ns_at(lock), delay); + } + return result; +} + +/** + * Implementation of ->l_completion_ast() for a client, that doesn't wait + * until lock is granted. 
Suitable for locks enqueued through ptlrpcd, of + * other threads that cannot block for long. + */ +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) +{ + ENTRY; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + RETURN(0); + } + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + wake_up(&lock->l_waitq); + RETURN(ldlm_completion_tail(lock, data)); + } + + LDLM_DEBUG(lock, + "client-side enqueue returned a blocked lock, going forward"); + ldlm_reprocess_all(lock->l_resource, NULL); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_completion_ast_async); + +/** + * Generic LDLM "completion" AST. This is called in several cases: + * + * - when a reply to an ENQUEUE RPC is received from the server + * (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at + * this point (determined by flags); + * + * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has + * been granted; + * + * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock + * gets correct lvb; + * + * - to force all locks when resource is destroyed (cleanup_resource()); + * + * If lock is not granted in the first case, this function waits until second + * or penultimate cases happen in some other thread. + * + */ +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + /* XXX ALLOCATE - 160 bytes */ + struct lock_wait_data lwd; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct l_wait_info lwi; + time64_t timeout; + int rc = 0; + ENTRY; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + goto noreproc; + } + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "sleeping"); + +noreproc: + + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, then there is no import */ + if (obd != NULL) { + imp = obd->u.cli.cl_import; + } + + timeout = ldlm_cp_timeout(lock); + + lwd.lwd_lock = lock; + lock->l_activity = ktime_get_real_seconds(); + + if (ldlm_is_no_timeout(lock)) { + LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); + lwi = LWI_INTR(interrupted_completion_wait, &lwd); + } else { + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + ldlm_expired_completion_wait, + interrupted_completion_wait, &lwd); + } + + if (imp != NULL) { + spin_lock(&imp->imp_lock); + lwd.lwd_conn_cnt = imp->imp_conn_cnt; + spin_unlock(&imp->imp_lock); + } + + if (ns_is_client(ldlm_lock_to_ns(lock)) && + OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST, + OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) { + ldlm_set_fail_loc(lock); + rc = -EINTR; + } else { + /* Go to sleep until the lock is granted or cancelled. */ + rc = l_wait_event(lock->l_waitq, + is_granted_or_cancelled(lock), &lwi); + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + + RETURN(ldlm_completion_tail(lock, data)); +} +EXPORT_SYMBOL(ldlm_completion_ast); + +/** + * A helper to build a blocking AST function + * + * Perform a common operation for blocking ASTs: + * defferred lock cancellation. 
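+ *
+ * Caller-side sketch (modelled on ldlm_blocking_ast() below;
+ * still_my_ast() is a hypothetical check, not part of this patch):
+ *
+ *	lock_res_and_lock(lock);
+ *	if (!still_my_ast(lock)) {
+ *		unlock_res_and_lock(lock);
+ *		return 0;
+ *	}
+ *	return ldlm_blocking_ast_nocheck(lock);
+ *
+ * Note that ldlm_blocking_ast_nocheck() drops the resource lock itself.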
+ * + * \param lock the lock blocking or canceling AST was called on + * \retval 0 + * \see mdt_blocking_ast + * \see ldlm_blocking_ast + */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock) +{ + int do_ast; + ENTRY; + + ldlm_set_cbpending(lock); + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + struct lustre_handle lockh; + int rc; + + LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + LDLM_DEBUG(lock, "Lock still has references, will be " + "cancelled later"); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_blocking_ast_nocheck); + +/** + * Server blocking AST + * + * ->l_blocking_ast() callback for LDLM locks acquired by server-side + * OBDs. + * + * \param lock the lock which blocks a request or cancelling lock + * \param desc unused + * \param data unused + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck + */ +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + RETURN(0); + } + + lock_res_and_lock(lock); + /* Get this: if ldlm_blocking_ast is racing with intent_policy, such + * that ldlm_blocking_ast is called just before intent_policy method + * takes the lr_lock, then by the time we get the lock, we might not + * be the correct blocking function anymore. So check, and return + * early, if so. */ + if (lock->l_blocking_ast != ldlm_blocking_ast) { + unlock_res_and_lock(lock); + RETURN(0); + } + RETURN(ldlm_blocking_ast_nocheck(lock)); +} +EXPORT_SYMBOL(ldlm_blocking_ast); + +/** + * Implements ldlm_lock::l_glimpse_ast for extent locks acquired on the server. + * + * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for that is + * rather subtle: with OST-side locking, it may so happen that _all_ extent + * locks are held by the OST. If client wants to obtain the current file size + * it calls ll_glimpse_size(), and (as all locks are held only on the server), + * this dummy glimpse callback fires and does nothing. The client still + * receives the correct file size due to the following fragment of code in + * ldlm_cb_interpret(): + * + * if (rc == -ELDLM_NO_LOCK_DATA) { + * LDLM_DEBUG(lock, "lost race - client has a lock but no" + * "inode"); + * ldlm_res_lvbo_update(lock->l_resource, NULL, 1); + * } + * + * That is, after the glimpse returns this error, ofd_lvbo_update() is called + * and returns the updated file attributes from the inode to the client. + * + * See also comment in ofd_intent_policy() on why servers must set a non-NULL + * l_glimpse_ast when grabbing DLM locks. Otherwise, the server will assume + * that the object is in the process of being destroyed. + * + * \param[in] lock DLM lock being glimpsed, unused + * \param[in] reqp pointer to ptlrpc_request, unused + * + * \retval -ELDLM_NO_LOCK_DATA to get attributes from disk object + */ +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) +{ + return -ELDLM_NO_LOCK_DATA; +} + +/** + * Enqueue a local lock (typically on a server). 
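+ *
+ * Hypothetical usage sketch (my_blocking_ast is a placeholder and the mode
+ * and LVB type are only illustrative; the rest follows the function below):
+ *
+ *	__u64 flags = 0;
+ *	struct lustre_handle lockh;
+ *
+ *	rc = ldlm_cli_enqueue_local(env, ns, &res_id, LDLM_IBITS, &policy,
+ *				    LCK_EX, &flags, my_blocking_ast,
+ *				    ldlm_completion_ast, NULL, NULL, 0,
+ *				    LVB_T_NONE, NULL, &lockh);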
+ */ +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + int err; + const struct ldlm_callback_suite cbs = { .lcs_completion = completion, + .lcs_blocking = blocking, + .lcs_glimpse = glimpse, + }; + ENTRY; + + LASSERT(!(*flags & LDLM_FL_REPLAY)); + if (unlikely(ns_is_client(ns))) { + CERROR("Trying to enqueue local lock in a shadow namespace\n"); + LBUG(); + } + + lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len, + lvb_type); + if (IS_ERR(lock)) + GOTO(out_nolock, err = PTR_ERR(lock)); + + err = ldlm_lvbo_init(lock->l_resource); + if (err < 0) { + LDLM_ERROR(lock, "delayed lvb init failed (rc %d)", err); + ldlm_lock_destroy_nolock(lock); + GOTO(out, err); + } + + ldlm_lock2handle(lock, lockh); + + /* NB: we don't have any lock now (lock_res_and_lock) + * because it's a new lock */ + ldlm_lock_addref_internal_nolock(lock, mode); + ldlm_set_local(lock); + if (*flags & LDLM_FL_ATOMIC_CB) + ldlm_set_atomic_cb(lock); + + if (policy != NULL) + lock->l_policy_data = *policy; + if (client_cookie != NULL) + lock->l_client_cookie = *client_cookie; + if (type == LDLM_EXTENT) { + /* extent lock without policy is a bug */ + if (policy == NULL) + LBUG(); + + lock->l_req_extent = policy->l_extent; + } + + err = ldlm_lock_enqueue(env, ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + GOTO(out, err); + + if (policy != NULL) + *policy = lock->l_policy_data; + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); + + LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); + EXIT; + out: + LDLM_LOCK_RELEASE(lock); + out_nolock: + return err; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_local); + +static void failed_lock_cleanup(struct ldlm_namespace *ns, + struct ldlm_lock *lock, int mode) +{ + int need_cancel = 0; + + /* Set a flag to prevent us from sending a CANCEL (bug 407) */ + lock_res_and_lock(lock); + /* Check that lock is not granted or failed, we might race. */ + if (!ldlm_is_granted(lock) && !ldlm_is_failed(lock)) { + /* Make sure that this lock will not be found by raced + * bl_ast and -EINVAL reply is sent to server anyways. + * b=17645*/ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED | + LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING; + need_cancel = 1; + } + unlock_res_and_lock(lock); + + if (need_cancel) + LDLM_DEBUG(lock, + "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | " + "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING"); + else + LDLM_DEBUG(lock, "lock was granted or failed in race"); + + /* XXX - HACK because we shouldn't call ldlm_lock_destroy() + * from llite/file.c/ll_file_flock(). */ + /* This code makes for the fact that we do not have blocking handler on + * a client for flock locks. As such this is the place where we must + * completely kill failed locks. (interrupted and those that + * were waiting to be granted when server evicted us. 
*/ + if (lock->l_resource->lr_type == LDLM_FLOCK) { + lock_res_and_lock(lock); + if (!ldlm_is_destroyed(lock)) { + ldlm_resource_unlink_lock(lock); + ldlm_lock_decref_internal_nolock(lock, mode); + ldlm_lock_destroy_nolock(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref_internal(lock, mode); + } +} + +/** + * Finishing portion of client lock enqueue code. + * + * Called after receiving reply from server. + */ +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + enum ldlm_type type, __u8 with_policy, + enum ldlm_mode mode, __u64 *flags, void *lvb, + __u32 lvb_len, const struct lustre_handle *lockh, + int rc) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + const struct lu_env *env = NULL; + int is_replay = *flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + ENTRY; + + if (req && req->rq_svc_thread) + env = req->rq_svc_thread->t_env; + + lock = ldlm_handle2lock(lockh); + /* ldlm_cli_enqueue is holding a reference on this lock. */ + if (!lock) { + LASSERT(type == LDLM_FLOCK); + RETURN(-ENOLCK); + } + + LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len), + "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len); + + if (rc != ELDLM_OK) { + LASSERT(!is_replay); + LDLM_DEBUG(lock, "client-side enqueue END (%s)", + rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED"); + + if (rc != ELDLM_LOCK_ABORTED) + GOTO(cleanup, rc); + } + + /* Before we return, swab the reply */ + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(cleanup, rc = -EPROTO); + + if (lvb_len > 0) { + int size = 0; + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER); + if (size < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size); + GOTO(cleanup, rc = size); + } else if (unlikely(size > lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than " + "expectation, expected = %d, replied = %d", + lvb_len, size); + GOTO(cleanup, rc = -EINVAL); + } + lvb_len = size; + } + + if (rc == ELDLM_LOCK_ABORTED) { + if (lvb_len > 0 && lvb != NULL) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lvb, lvb_len); + GOTO(cleanup, rc = rc ? : ELDLM_LOCK_ABORTED); + } + + /* lock enqueued on the server */ + cleanup_phase = 0; + + lock_res_and_lock(lock); + /* Key change rehash lock in per-export hash with new key */ + if (exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + *flags = ldlm_flags_from_wire(reply->lock_flags); + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_FL_INHERIT_MASK); + unlock_res_and_lock(lock); + + CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: %#llx\n", + lock, reply->lock_handle.cookie, *flags); + + /* If enqueue returned a blocked lock but the completion handler has + * already run, then it fixed up the resource and we don't need to do it + * again. 
*/ + if ((*flags) & LDLM_FL_LOCK_CHANGED) { + int newmode = reply->lock_desc.l_req_mode; + LASSERT(!is_replay); + if (newmode && newmode != lock->l_req_mode) { + LDLM_DEBUG(lock, "server returned different mode %s", + ldlm_lockname[newmode]); + lock->l_req_mode = newmode; + } + + if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + CDEBUG(D_INFO, "remote intent success, locking "DLDLMRES + " instead of "DLDLMRES"\n", + PLDLMRES(&reply->lock_desc.l_resource), + PLDLMRES(lock->l_resource)); + + rc = ldlm_lock_change_resource(ns, lock, + &reply->lock_desc.l_resource.lr_name); + if (rc || lock->l_resource == NULL) + GOTO(cleanup, rc = -ENOMEM); + LDLM_DEBUG(lock, "client-side enqueue, new resource"); + } + + if (with_policy) { + /* We assume lock type cannot change on server*/ + ldlm_convert_policy_to_local(exp, + lock->l_resource->lr_type, + &reply->lock_desc.l_policy_data, + &lock->l_policy_data); + } + + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock,"client-side enqueue, new policy data"); + } + + if ((*flags) & LDLM_FL_AST_SENT) { + lock_res_and_lock(lock); + ldlm_bl_desc2lock(&reply->lock_desc, lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } + + /* If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. */ + if (lvb_len > 0) { + /* We must lock or a racing completion might update lvb without + * letting us know and we'll clobber the correct value. + * Cannot unlock after the check either, a that still leaves + * a tiny window for completion to get in */ + lock_res_and_lock(lock); + if (!ldlm_is_granted(lock)) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lock->l_lvb_data, lvb_len); + unlock_res_and_lock(lock); + if (rc < 0) { + cleanup_phase = 1; + GOTO(cleanup, rc); + } + } + + if (!is_replay) { + rc = ldlm_lock_enqueue(env, ns, &lock, NULL, flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *flags, NULL); + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } + + if (lvb_len > 0 && lvb != NULL) { + /* Copy the LVB here, and not earlier, because the completion + * AST (if any) can override what we got in the reply */ + memcpy(lvb, lock->l_lvb_data, lvb_len); + } + + LDLM_DEBUG(lock, "client-side enqueue END"); + EXIT; +cleanup: + if (cleanup_phase == 1 && rc) + failed_lock_cleanup(ns, lock, mode); + /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ + LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_fini); + +/** + * Estimate number of lock handles that would fit into request of given + * size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into + * a single page on the send/receive side. XXX: 512 should be changed to + * more adequate value. 
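+ *
+ * Rough illustration (assuming 4KiB pages and that LDLM_MAXREQSIZE is not
+ * the smaller bound): with req_size = 1024 and off = 0, about
+ * (4096 - 512 - 1024) / sizeof(struct lustre_handle) extra handles fit,
+ * on top of the LDLM_LOCKREQ_HANDLES slots of the fixed request body.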
+ */ +static inline int ldlm_req_handles_avail(int req_size, int off) +{ + int avail; + + avail = min_t(int, LDLM_MAXREQSIZE, PAGE_SIZE - 512) - req_size; + if (likely(avail >= 0)) + avail /= (int)sizeof(struct lustre_handle); + else + avail = 0; + avail += LDLM_LOCKREQ_HANDLES - off; + + return avail; +} + +static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, + enum req_location loc, + int off) +{ + __u32 size = req_capsule_msg_size(pill, loc); + return ldlm_req_handles_avail(size, off); +} + +static inline int ldlm_format_handles_avail(struct obd_import *imp, + const struct req_format *fmt, + enum req_location loc, int off) +{ + __u32 size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); + return ldlm_req_handles_avail(size, off); +} + +/** + * Cancel LRU locks and pack them into the enqueue request. Pack there the given + * \a count locks in \a cancels. + * + * This is to be called by functions preparing their own requests that + * might contain lists of locks to cancel in addition to actual operation + * that needs to be performed. + */ +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count) + { + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *dlm = NULL; + struct list_head head = LIST_HEAD_INIT(head); + int avail, to_free = 0, pack = 0; + int rc; + ENTRY; + + if (cancels == NULL) + cancels = &head; + if (ns_connect_cancelset(ns)) { + /* Estimate the amount of available space in the request. */ + req_capsule_filled_sizes(pill, RCL_CLIENT); + avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); + + /* If we have reached the limit, free +1 slot for the new one */ + if (!ns_connect_lru_resize(ns) && opc == LDLM_ENQUEUE && + ns->ns_nr_unused >= ns->ns_max_unused) + to_free = 1; + + /* Cancel LRU locks here _only_ if the server supports + * EARLY_CANCEL. Otherwise we have to send extra CANCEL + * RPC, which will make us slower. */ + if (avail > count) + count += ldlm_cancel_lru_local(ns, cancels, to_free, + avail - count, 0, + LDLM_LRU_FLAG_NO_WAIT); + if (avail > count) + pack = count; + else + pack = avail; + req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(pack, opc)); + } + + rc = ptlrpc_request_pack(req, version, opc); + if (rc) { + ldlm_lock_list_put(cancels, l_bl_ast, count); + RETURN(rc); + } + + if (ns_connect_cancelset(ns)) { + if (canceloff) { + dlm = req_capsule_client_get(pill, &RMF_DLM_REQ); + LASSERT(dlm); + /* Skip first lock handler in ldlm_request_pack(), + * this method will increment @lock_count according + * to the lock handle amount actually written to + * the buffer. */ + dlm->lock_count = canceloff; + } + /* Pack into the request @pack lock handles. */ + ldlm_cli_cancel_list(cancels, pack, req, 0); + /* Prepare and send separate cancel RPC for others. 
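+ * (i.e. the cancels gathered above that did not fit into the @pack
+ * slots reserved in this request's ELC buffer).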
*/ + ldlm_cli_cancel_list(cancels, count - pack, NULL, 0); + } else { + ldlm_lock_list_put(cancels, l_bl_ast, count); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_prep_elc_req); + +int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + LDLM_ENQUEUE_CANCEL_OFF, cancels, count); +} +EXPORT_SYMBOL(ldlm_prep_enqueue_req); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} +EXPORT_SYMBOL(ldlm_enqueue_pack); + +/** + * Client-side lock enqueue. + * + * If a request has some specific initialisation it is passed in \a reqp, + * otherwise it is created in ldlm_cli_enqueue. + * + * Supports sync and async requests, pass \a async flag accordingly. If a + * request was created in ldlm_cli_enqueue and it is the async request, + * pass it to the caller in \a reqp. + */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + union ldlm_policy_data const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async) +{ + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct ldlm_request *body; + int is_replay = *flags & LDLM_FL_REPLAY; + int req_passed_in = 1; + int rc, err; + struct ptlrpc_request *req; + ENTRY; + + LASSERT(exp != NULL); + + ns = exp->exp_obd->obd_namespace; + + /* If we're replaying this lock, just check some invariants. + * If we're creating a new lock, get everything all setup nice. 
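+ * (A replayed lock already exists and keeps its export and handle,
+ * while a fresh one is created via ldlm_lock_create() in the else branch.)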
*/ + if (is_replay) { + lock = ldlm_handle2lock_long(lockh, 0); + LASSERT(lock != NULL); + LDLM_DEBUG(lock, "client-side enqueue START"); + LASSERT(exp == lock->l_conn_export); + } else { + const struct ldlm_callback_suite cbs = { + .lcs_completion = einfo->ei_cb_cp, + .lcs_blocking = einfo->ei_cb_bl, + .lcs_glimpse = einfo->ei_cb_gl + }; + lock = ldlm_lock_create(ns, res_id, einfo->ei_type, + einfo->ei_mode, &cbs, einfo->ei_cbdata, + lvb_len, lvb_type); + if (IS_ERR(lock)) + RETURN(PTR_ERR(lock)); + + if (einfo->ei_cb_created) + einfo->ei_cb_created(lock); + + /* for the local lock, add the reference */ + ldlm_lock_addref_internal(lock, einfo->ei_mode); + ldlm_lock2handle(lock, lockh); + if (policy != NULL) + lock->l_policy_data = *policy; + + if (einfo->ei_type == LDLM_EXTENT) { + /* extent lock without policy is a bug */ + if (policy == NULL) + LBUG(); + + lock->l_req_extent = policy->l_extent; + } + LDLM_DEBUG(lock, "client-side enqueue START, flags %#llx", + *flags); + } + + lock->l_conn_export = exp; + lock->l_export = NULL; + lock->l_blocking_ast = einfo->ei_cb_bl; + lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); + lock->l_activity = ktime_get_real_seconds(); + + /* lock not sent to server yet */ + if (reqp == NULL || *reqp == NULL) { + req = ldlm_enqueue_pack(exp, lvb_len); + if (IS_ERR(req)) { + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + RETURN(PTR_ERR(req)); + } + + req_passed_in = 0; + if (reqp) + *reqp = req; + } else { + int len; + + req = *reqp; + len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, + RCL_CLIENT); + LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n", + DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); + } + + if (*flags & LDLM_FL_NDELAY) { + DEBUG_REQ(D_DLMTRACE, req, "enque lock with no delay\n"); + req->rq_no_resend = req->rq_no_delay = 1; + /* probably set a shorter timeout value and handle ETIMEDOUT + * in osc_lock_upcall() correctly */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } + + /* Dump lock data into the request buffer */ + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_handle[0] = *lockh; + + /* extended LDLM opcodes in client stats */ + if (exp->exp_obd->obd_svc_stats != NULL) { + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + + /* OST glimpse has no intent buffer */ + if (req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT, + RCL_CLIENT)) { + struct ldlm_intent *it; + + it = req_capsule_client_get(&req->rq_pill, + &RMF_LDLM_INTENT); + glimpse = (it && (it->opc == IT_GLIMPSE)); + } + + if (!glimpse) + ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats); + else + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + PTLRPC_LAST_CNTR + + LDLM_GLIMPSE_ENQUEUE); + } + + if (async) { + LASSERT(reqp != NULL); + RETURN(0); + } + + LDLM_DEBUG(lock, "sending request"); + + rc = ptlrpc_queue_wait(req); + + err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0, + einfo->ei_mode, flags, lvb, lvb_len, + lockh, rc); + + /* If ldlm_cli_enqueue_fini did not find the lock, we need to free + * one reference that we took */ + if (err == -ENOLCK) + LDLM_LOCK_RELEASE(lock); + else + rc = err; + + if (!req_passed_in && req != NULL) { + ptlrpc_req_finished(req); + if (reqp) + *reqp = NULL; + } + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_enqueue); + +/** + * Client-side IBITS lock convert. + * + * Inform server that lock has been converted instead of canceling. 
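+ * For example, a client can retain only the inodebits it still needs and
+ * pass the reduced mask as \a new_bits rather than cancelling the whole
+ * lock.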
+ * Server finishes convert on own side and does reprocess to grant + * all related waiting locks. + * + * Since convert means only ibits downgrading, client doesn't need to + * wait for server reply to finish local converting process so this request + * is made asynchronous. + * + */ +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits) +{ + struct ldlm_request *body; + struct ptlrpc_request *req; + struct obd_export *exp = lock->l_conn_export; + + ENTRY; + + LASSERT(exp != NULL); + + /* this is better to check earlier and it is done so already, + * but this check is kept too as final one to issue an error + * if any new code will miss such check. + */ + if (!exp_connect_lock_convert(exp)) { + LDLM_ERROR(lock, "server doesn't support lock convert\n"); + RETURN(-EPROTO); + } + + if (lock->l_resource->lr_type != LDLM_IBITS) { + LDLM_ERROR(lock, "convert works with IBITS locks only."); + RETURN(-EINVAL); + } + + LDLM_DEBUG(lock, "client-side convert"); + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) + RETURN(-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = lock->l_req_mode; + body->lock_desc.l_granted_mode = lock->l_granted_mode; + + body->lock_desc.l_policy_data.l_inodebits.bits = new_bits; + body->lock_desc.l_policy_data.l_inodebits.cancel_bits = 0; + + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_count = 1; + + ptlrpc_request_set_replen(req); + + /* + * Use cancel portals for convert as well as high-priority handling. + */ + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + + ptlrpc_at_set_req_timeout(req); + + if (exp->exp_obd->obd_svc_stats != NULL) + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * Cancel locks locally. + * Returns: + * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server + * \retval LDLM_FL_CANCELING otherwise; + * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC. + */ +static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) +{ + __u64 rc = LDLM_FL_LOCAL_ONLY; + ENTRY; + + if (lock->l_conn_export) { + bool local_only; + + LDLM_DEBUG(lock, "client-side cancel"); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, + cfs_fail_val); + + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + local_only = !!(lock->l_flags & + (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); + ldlm_cancel_callback(lock); + rc = (ldlm_is_bl_ast(lock)) ? + LDLM_FL_BL_AST : LDLM_FL_CANCELING; + unlock_res_and_lock(lock); + + if (local_only) { + CDEBUG(D_DLMTRACE, + "not sending request (at caller's instruction)\n"); + rc = LDLM_FL_LOCAL_ONLY; + } + ldlm_lock_cancel(lock); + } else { + if (ns_is_client(ldlm_lock_to_ns(lock))) { + LDLM_ERROR(lock, "Trying to cancel local lock"); + LBUG(); + } + LDLM_DEBUG(lock, "server-side local cancel"); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(lock->l_resource, lock); + } + + RETURN(rc); +} + +/** + * Pack \a count locks in \a head into ldlm_request buffer of request \a req. 
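+ *
+ * The caller is expected to have sized the RMF_DLM_REQ buffer beforehand
+ * (e.g. via ldlm_request_bufsize()); the LASSERT on the computed 'max'
+ * below only re-checks that \a count handles actually fit.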
+ */ +static void ldlm_cancel_pack(struct ptlrpc_request *req, + struct list_head *head, int count) +{ + struct ldlm_request *dlm; + struct ldlm_lock *lock; + int max, packed = 0; + ENTRY; + + dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + LASSERT(dlm != NULL); + + /* Check the room in the request buffer. */ + max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) - + sizeof(struct ldlm_request); + max /= sizeof(struct lustre_handle); + max += LDLM_LOCKREQ_HANDLES; + LASSERT(max >= dlm->lock_count + count); + + /* XXX: it would be better to pack lock handles grouped by resource. + * so that the server cancel would call filter_lvbo_update() less + * frequently. */ + list_for_each_entry(lock, head, l_bl_ast) { + if (!count--) + break; + LASSERT(lock->l_conn_export); + /* Pack the lock handle to the given request buffer. */ + LDLM_DEBUG(lock, "packing"); + dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle; + packed++; + } + CDEBUG(D_DLMTRACE, "%d locks packed\n", packed); + EXIT; +} + +/** + * Prepare and send a batched cancel RPC. It will include \a count lock + * handles of locks given in \a cancels list. */ +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels, + int count, enum ldlm_cancel_flags flags) +{ + struct ptlrpc_request *req = NULL; + struct obd_import *imp; + int free, sent = 0; + int rc = 0; + ENTRY; + + LASSERT(exp != NULL); + LASSERT(count > 0); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val); + + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE)) + RETURN(count); + + free = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, RCL_CLIENT, 0); + if (count > free) + count = free; + + while (1) { + imp = class_exp2cliimp(exp); + if (imp == NULL || imp->imp_invalid) { + CDEBUG(D_DLMTRACE, + "skipping cancel on invalid import %p\n", imp); + RETURN(count); + } + + req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(count, LDLM_CANCEL)); + + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL); + if (rc) { + ptlrpc_request_free(req); + GOTO(out, rc); + } + + /* If OSP want cancel cross-MDT lock, let's not block it in + * in recovery, otherwise the lock will not released, if + * the remote target is also in recovery, and it also need + * this lock, it might cause deadlock. 
*/ + if (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS && + exp->exp_obd->obd_lu_dev != NULL && + exp->exp_obd->obd_lu_dev->ld_site != NULL) { + struct lu_device *top_dev; + + top_dev = exp->exp_obd->obd_lu_dev->ld_site->ls_top_dev; + if (top_dev != NULL && + top_dev->ld_obd->obd_recovering) + req->rq_allow_replay = 1; + } + + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + ldlm_cancel_pack(req, cancels, count); + + ptlrpc_request_set_replen(req); + if (flags & LCF_ASYNC) { + ptlrpcd_add_req(req); + sent = count; + GOTO(out, 0); + } + + rc = ptlrpc_queue_wait(req); + if (rc == LUSTRE_ESTALE) { + CDEBUG(D_DLMTRACE, "client/server (nid %s) " + "out of sync -- not fatal\n", + libcfs_nid2str(req->rq_import-> + imp_connection->c_peer.nid)); + rc = 0; + } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/ + req->rq_import_generation == imp->imp_generation) { + ptlrpc_req_finished(req); + continue; + } else if (rc != ELDLM_OK) { + /* -ESHUTDOWN is common on umount */ + CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "Got rc %d from cancel RPC: " + "canceling anyway\n", rc); + break; + } + sent = count; + break; + } + + ptlrpc_req_finished(req); + EXIT; +out: + return sent ? sent : rc; +} + +static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) +{ + LASSERT(imp != NULL); + return &imp->imp_obd->obd_namespace->ns_pool; +} + +/** + * Update client's OBD pool related fields with new SLV and Limit from \a req. + */ +int ldlm_cli_update_pool(struct ptlrpc_request *req) +{ + struct obd_device *obd; + __u64 new_slv; + __u32 new_limit; + ENTRY; + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + !imp_connect_lru_resize(req->rq_import))) + { + /* + * Do nothing for corner cases. + */ + RETURN(0); + } + + /* In some cases RPC may contain SLV and limit zeroed out. This + * is the case when server does not support LRU resize feature. + * This is also possible in some recovery cases when server-side + * reqs have no reference to the OBD export and thus access to + * server-side namespace is not possible. */ + if (lustre_msg_get_slv(req->rq_repmsg) == 0 || + lustre_msg_get_limit(req->rq_repmsg) == 0) { + DEBUG_REQ(D_HA, req, "Zero SLV or Limit found " + "(SLV: %llu, Limit: %u)", + lustre_msg_get_slv(req->rq_repmsg), + lustre_msg_get_limit(req->rq_repmsg)); + RETURN(0); + } + + new_limit = lustre_msg_get_limit(req->rq_repmsg); + new_slv = lustre_msg_get_slv(req->rq_repmsg); + obd = req->rq_import->imp_obd; + + /* Set new SLV and limit in OBD fields to make them accessible + * to the pool thread. We do not access obd_namespace and pool + * directly here as there is no reliable way to make sure that + * they are still alive at cleanup time. Evil races are possible + * which may cause Oops at that time. 
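+ * The pool code presumably reads obd_pool_slv and obd_pool_limit back
+ * under the same obd_pool_lock (not visible in this hunk), which is why
+ * updating the plain OBD fields below is sufficient.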
*/ + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = new_slv; + obd->obd_pool_limit = new_limit; + write_unlock(&obd->obd_pool_lock); + + RETURN(0); +} + +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + int rc = -EINVAL; + + LASSERT(!lock->l_readers && !lock->l_writers); + LDLM_DEBUG(lock, "client lock convert START"); + + if (lock->l_resource->lr_type == LDLM_IBITS) { + lock_res_and_lock(lock); + do { + rc = ldlm_cli_inodebits_convert(lock, cancel_flags); + } while (rc == -EAGAIN); + unlock_res_and_lock(lock); + } + + LDLM_DEBUG(lock, "client lock convert END"); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_convert); + +/** + * Client side lock cancel. + * + * Lock must not have any readers or writers by this time. + */ +int ldlm_cli_cancel(const struct lustre_handle *lockh, + enum ldlm_cancel_flags cancel_flags) +{ + struct obd_export *exp; + int avail, count = 1; + __u64 rc = 0; + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct list_head cancels = LIST_HEAD_INIT(cancels); + + ENTRY; + + lock = ldlm_handle2lock_long(lockh, 0); + if (lock == NULL) { + LDLM_DEBUG_NOLOCK("lock is already being destroyed"); + RETURN(0); + } + + lock_res_and_lock(lock); + LASSERT(!ldlm_is_converting(lock)); + + /* Lock is being canceled and the caller doesn't want to wait */ + if (ldlm_is_canceling(lock)) { + if (cancel_flags & LCF_ASYNC) { + unlock_res_and_lock(lock); + } else { + struct l_wait_info lwi = { 0 }; + + unlock_res_and_lock(lock); + l_wait_event(lock->l_waitq, is_bl_done(lock), &lwi); + } + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + ldlm_set_canceling(lock); + unlock_res_and_lock(lock); + + rc = ldlm_cli_cancel_local(lock); + if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) { + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL + * RPC which goes to canceld portal, so we can cancel other LRU locks + * here and send them all as one LDLM_CANCEL RPC. */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, &cancels); + + exp = lock->l_conn_export; + if (exp_connect_cancelset(exp)) { + avail = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, + RCL_CLIENT, 0); + LASSERT(avail > 0); + + ns = ldlm_lock_to_ns(lock); + count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, + LCF_BL_AST, 0); + } + ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel); + +/** + * Locally cancel up to \a count locks in list \a cancels. + * Return the number of cancelled locks. + */ +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + struct list_head head = LIST_HEAD_INIT(head); + struct ldlm_lock *lock, *next; + int left = 0, bl_ast = 0; + __u64 rc; + + left = count; + list_for_each_entry_safe(lock, next, cancels, l_bl_ast) { + if (left-- == 0) + break; + + if (cancel_flags & LCF_LOCAL) { + rc = LDLM_FL_LOCAL_ONLY; + ldlm_lock_cancel(lock); + } else { + rc = ldlm_cli_cancel_local(lock); + } + /* Until we have compound requests and can send LDLM_CANCEL + * requests batched with generic RPCs, we need to send cancels + * with the LDLM_FL_BL_AST flag in a separate RPC from + * the one being generated now. 
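+ * Such locks are therefore moved to a private list and shipped via
+ * ldlm_cli_cancel_list() once the scan below has finished.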
*/ + if (!(cancel_flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) { + LDLM_DEBUG(lock, "Cancel lock separately"); + list_del_init(&lock->l_bl_ast); + list_add(&lock->l_bl_ast, &head); + bl_ast++; + continue; + } + if (rc == LDLM_FL_LOCAL_ONLY) { + /* CANCEL RPC should not be sent to server. */ + list_del_init(&lock->l_bl_ast); + LDLM_LOCK_RELEASE(lock); + count--; + } + } + if (bl_ast > 0) { + count -= bl_ast; + ldlm_cli_cancel_list(&head, bl_ast, NULL, 0); + } + + RETURN(count); +} + +/** + * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back + * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g. + * readahead requests, ...) + */ +static enum ldlm_policy_res +ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, + int added, int min) +{ + enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK; + + /* don't check @added & @min since we want to process all locks + * from unused list. + * It's fine to not take lock to access lock->l_resource since + * the lock has already been granted so it won't change. */ + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + case LDLM_IBITS: + if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0) + break; + fallthrough; + default: + result = LDLM_POLICY_SKIP_LOCK; + break; + } + + RETURN(result); +} + +/** + * Callback function for LRU-resize policy. Decides whether to keep + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + ktime_t cur = ktime_get(); + struct ldlm_pool *pl = &ns->ns_pool; + u64 slv, lvf, lv; + s64 la; + + if (added < min) + return LDLM_POLICY_CANCEL_LOCK; + + /* Despite of the LV, It doesn't make sense to keep the lock which + * is unused for ns_max_age time. + */ + if (ktime_after(cur, ktime_add(lock->l_last_used, ns->ns_max_age))) + return LDLM_POLICY_CANCEL_LOCK; + + slv = ldlm_pool_get_slv(pl); + lvf = ldlm_pool_get_lvf(pl); + la = div_u64(ktime_to_ns(ktime_sub(cur, lock->l_last_used)), + NSEC_PER_SEC); + lv = lvf * la * ns->ns_nr_unused; + + /* Inform pool about current CLV to see it via debugfs. */ + ldlm_pool_set_clv(pl, lv); + + /* Stop when SLV is not yet come from server or lv is smaller than + * it is. */ + if (slv == 0 || lv < slv) + return LDLM_POLICY_KEEP_LOCK; + + return LDLM_POLICY_CANCEL_LOCK; +} + +static enum ldlm_policy_res +ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + enum ldlm_policy_res result; + + result = ldlm_cancel_lrur_policy(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) + return result; + + return ldlm_cancel_no_wait_policy(ns, lock, added, min); +} + +/** + * Callback function for aged policy. Decides whether to keep + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. 
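+ * For example, once \a added reaches \a min, a lock idle for longer than
+ * ns_max_age is still cancelled, while a younger lock is kept and the LRU
+ * scan stops.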
+ * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + if ((added >= min) && + ktime_before(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_max_age))) + return LDLM_POLICY_KEEP_LOCK; + + return LDLM_POLICY_CANCEL_LOCK; +} + +static enum ldlm_policy_res +ldlm_cancel_aged_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int added, int min) +{ + enum ldlm_policy_res result; + + result = ldlm_cancel_aged_policy(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) + return result; + + return ldlm_cancel_no_wait_policy(ns, lock, added, min); +} + +typedef enum ldlm_policy_res +(*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *ns, struct ldlm_lock *lock, + int added, int min); + +static ldlm_cancel_lru_policy_t +ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) +{ + if (ns_connect_lru_resize(ns)) { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_lrur_no_wait_policy; + else + return ldlm_cancel_lrur_policy; + } else { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_aged_no_wait_policy; + else + return ldlm_cancel_aged_policy; + } +} + +/** + * - Free space in LRU for \a min new locks, + * redundant unused locks are canceled locally; + * - also cancel locally unused aged locks; + * - do not cancel more than \a max locks; + * - if some locks are cancelled, try to cancel at least \a batch locks + * - GET the found locks and add them into the \a cancels list. + * + * A client lock can be added to the l_bl_ast list only when it is + * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing + * CANCEL. There are the following use cases: + * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and + * ldlm_cli_cancel(), which check and set this flag properly. As any + * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed + * later without any special locking. + * + * Locks are cancelled according to the LRU resize policy (SLV from server) + * if LRU resize is enabled; otherwise, the "aged policy" is used; + * + * LRU flags: + * ---------------------------------------- + * + * flags & LDLM_LRU_FLAG_NO_WAIT - cancel locks w/o sending any RPCs or waiting + * for any outstanding RPC to complete. + * + * flags & LDLM_CANCEL_CLEANUP - when cancelling read locks, do not check for + * other read locks covering the same pages, just + * discard those pages. + */ +static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, + struct list_head *cancels, + int min, int max, int batch, + enum ldlm_lru_flags lru_flags) +{ + ldlm_cancel_lru_policy_t pf; + int added = 0; + int no_wait = lru_flags & LDLM_LRU_FLAG_NO_WAIT; + + ENTRY; + + /* + * Let only 1 thread to proceed. However, not for those which have the + * @max limit given (ELC), as LRU may be left not cleaned up in full. 
+ */ + if (max == 0) { + if (test_and_set_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + } else if (test_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + + LASSERT(ergo(max, min <= max)); + /* No sense to give @batch for ELC */ + LASSERT(ergo(max, batch == 0)); + + if (!ns_connect_lru_resize(ns)) + min = max_t(int, min, ns->ns_nr_unused - ns->ns_max_unused); + + /* If at least 1 lock is to be cancelled, cancel at least @batch locks */ + if (min && min < batch) + min = batch; + + pf = ldlm_cancel_lru_policy(ns, lru_flags); + LASSERT(pf != NULL); + + /* For any flags, stop scanning if @max is reached. */ + while (!list_empty(&ns->ns_unused_list) && (max == 0 || added < max)) { + struct ldlm_lock *lock; + struct list_head *item, *next; + enum ldlm_policy_res result; + ktime_t last_use = ktime_set(0, 0); + + spin_lock(&ns->ns_lock); + item = no_wait ? ns->ns_last_pos : &ns->ns_unused_list; + for (item = item->next, next = item->next; + item != &ns->ns_unused_list; + item = next, next = item->next) { + lock = list_entry(item, struct ldlm_lock, l_lru); + + /* No locks which got blocking requests. */ + LASSERT(!ldlm_is_bl_ast(lock)); + + if (!ldlm_is_canceling(lock)) + break; + + /* Somebody is already doing CANCEL. No need for this + * lock in LRU, do not traverse it again. */ + ldlm_lock_remove_from_lru_nolock(lock); + } + if (item == &ns->ns_unused_list) { + spin_unlock(&ns->ns_lock); + break; + } + + last_use = lock->l_last_used; + + LDLM_LOCK_GET(lock); + spin_unlock(&ns->ns_lock); + lu_ref_add(&lock->l_reference, __FUNCTION__, current); + + /* Pass the lock through the policy filter and see if it + * should stay in LRU. + * + * Even for shrinker policy we stop scanning if + * we find a lock that should stay in the cache. + * We should take into account lock age anyway + * as a new lock is a valuable resource even if + * it has a low weight. + * + * That is, for shrinker policy we drop only + * old locks, but additionally choose them by + * their weight. Big extent locks will stay in + * the cache. */ + result = pf(ns, lock, added, min); + if (result == LDLM_POLICY_KEEP_LOCK) { + lu_ref_del(&lock->l_reference, __func__, current); + LDLM_LOCK_RELEASE(lock); + break; + } + + if (result == LDLM_POLICY_SKIP_LOCK) { + lu_ref_del(&lock->l_reference, __func__, current); + if (no_wait) { + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru) && + lock->l_lru.prev == ns->ns_last_pos) + ns->ns_last_pos = &lock->l_lru; + spin_unlock(&ns->ns_lock); + } + + LDLM_LOCK_RELEASE(lock); + continue; + } + + lock_res_and_lock(lock); + /* Check flags again under the lock. */ + if (ldlm_is_canceling(lock) || + ldlm_lock_remove_from_lru_check(lock, last_use) == 0) { + /* Another thread is removing lock from LRU, or + * somebody is already doing CANCEL, or there + * is a blocking request which will send cancel + * by itself, or the lock is no longer unused or + * the lock has been used since the pf() call and + * pages could be put under it. */ + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __FUNCTION__, current); + LDLM_LOCK_RELEASE(lock); + continue; + } + LASSERT(!lock->l_readers && !lock->l_writers); + + /* If we have chosen to cancel this lock voluntarily, we + * better send cancel notification to server, so that it + * frees appropriate state. This might lead to a race + * where while we are doing cancel here, server is also + * silently cancelling this lock. 
*/ + ldlm_clear_cancel_on_block(lock); + + /* Setting the CBPENDING flag is a little misleading, + * but prevents an important race; namely, once + * CBPENDING is set, the lock can accumulate no more + * readers/writers. Since readers and writers are + * already zero here, ldlm_lock_decref() won't see + * this flag and call l_blocking_ast */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; + + if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) && + (lock->l_resource->lr_type == LDLM_EXTENT || + ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR) + ldlm_set_discard_data(lock); + + /* We can't re-add to l_lru as it confuses the + * refcounting in ldlm_lock_remove_from_lru() if an AST + * arrives after we drop lr_lock below. We use l_bl_ast + * and can't use l_pending_chain as it is used both on + * server and client nevertheless bug 5666 says it is + * used only on server */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __FUNCTION__, current); + added++; + /* Once a lock added, batch the requested amount */ + if (min == 0) + min = batch; + } + + if (max == 0) + clear_bit(LDLM_LRU_CANCEL, &ns->ns_flags); + + RETURN(added); +} + +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, + int min, int max, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags) +{ + int added; + + added = ldlm_prepare_lru_list(ns, cancels, min, max, 0, lru_flags); + if (added <= 0) + return added; + + return ldlm_cli_cancel_list_local(cancels, added, cancel_flags); +} + +/** + * Cancel at least \a min locks from given namespace LRU. + * + * When called with LCF_ASYNC the blocking callback will be handled + * in a thread and this function will return after the thread has been + * asked to call the callback. When called with LCF_ASYNC the blocking + * callback will be performed in this function. + */ +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + int count, rc; + ENTRY; + + /* Just prepare the list of locks, do not actually cancel them yet. + * Locks are cancelled later in a separate thread. */ + count = ldlm_prepare_lru_list(ns, &cancels, min, 0, 0, lru_flags); + rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); + if (rc == 0) + RETURN(count); + + RETURN(0); +} + +/** + * Find and cancel locally unused locks found on resource, matched to the + * given policy, mode. GET the found locks and add them into the \a cancels + * list. + */ +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 lock_flags, + enum ldlm_cancel_flags cancel_flags, + void *opaque) +{ + struct ldlm_lock *lock; + int count = 0; + + ENTRY; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + continue; + } + + if (lock->l_readers || lock->l_writers) + continue; + + /* + * If somebody is already doing CANCEL, or blocking AST came + * then skip this lock. + */ + if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock)) + continue; + + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + /* If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. 
+ * Skip locks with DoM bit always to don't flush data. + */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && + (!(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits) || ldlm_has_dom(lock))) + continue; + + /* See CBPENDING comment in ldlm_cancel_lru */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | + lock_flags; + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); + + RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); +} +EXPORT_SYMBOL(ldlm_cancel_resource_local); + +/** + * Cancel client-side locks from a list and send/prepare cancel RPCs to the + * server. + * If \a req is NULL, send CANCEL request to server with handles of locks + * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests + * separately per lock. + * If \a req is not NULL, put handles of locks in \a cancels into the request + * buffer at the offset \a off. + * Destroy \a cancels at the end. + */ +int ldlm_cli_cancel_list(struct list_head *cancels, int count, + struct ptlrpc_request *req, + enum ldlm_cancel_flags flags) +{ + struct ldlm_lock *lock; + int res = 0; + ENTRY; + + if (list_empty(cancels) || count == 0) + RETURN(0); + + /* XXX: requests (both batched and not) could be sent in parallel. + * Usually it is enough to have just 1 RPC, but it is possible that + * there are too many locks to be cancelled in LRU or on a resource. + * It would also speed up the case when the server does not support + * the feature. */ + while (count > 0) { + LASSERT(!list_empty(cancels)); + lock = list_entry(cancels->next, struct ldlm_lock, + l_bl_ast); + LASSERT(lock->l_conn_export); + + if (exp_connect_cancelset(lock->l_conn_export)) { + res = count; + if (req) + ldlm_cancel_pack(req, cancels, count); + else + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, count, + flags); + } else { + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, 1, flags); + } + + if (res < 0) { + CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "ldlm_cli_cancel_list: %d\n", res); + res = count; + } + + count -= res; + ldlm_lock_list_put(cancels, l_bl_ast, res); + } + LASSERT(count == 0); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_list); + +/** + * Cancel all locks on a resource that have 0 readers/writers. + * + * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct ldlm_resource *res; + struct list_head cancels = LIST_HEAD_INIT(cancels); + int count; + int rc; + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (IS_ERR(res)) { + /* This is not a problem. 
*/ + CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, &cancels, policy, mode, + 0, flags | LCF_BL_AST, opaque); + rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags); + if (rc != ELDLM_OK) + CERROR("canceling unused lock "DLDLMRES": rc = %d\n", + PLDLMRES(res), rc); + + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource); + +struct ldlm_cli_cancel_arg { + int lc_flags; + void *lc_opaque; +}; + +static int +ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_cli_cancel_arg *lc = arg; + + ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name, + NULL, LCK_MINMODE, lc->lc_flags, + lc->lc_opaque); + /* must return 0 for hash iteration */ + return 0; +} + +/** + * Cancel all locks on a namespace (or a specific resource, if given) + * that have 0 readers/writers. + * + * If flags & LCF_LOCAL, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct ldlm_cli_cancel_arg arg = { + .lc_flags = flags, + .lc_opaque = opaque, + }; + + ENTRY; + + if (ns == NULL) + RETURN(ELDLM_OK); + + if (res_id != NULL) { + RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL, + LCK_MINMODE, flags, + opaque)); + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg, 0); + RETURN(ELDLM_OK); + } +} + +/* Lock iterators. */ + +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure) +{ + struct list_head *tmp, *next; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; + + ENTRY; + + if (!res) + RETURN(LDLM_ITER_CONTINUE); + + lock_res(res); + list_for_each_safe(tmp, next, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } + + list_for_each_safe(tmp, next, &res->lr_waiting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } +out: + unlock_res(res); + RETURN(rc); +} + +struct iter_helper_data { + ldlm_iterator_t iter; + void *closure; +}; + +static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure) +{ + struct iter_helper_data *helper = closure; + return helper->iter(lock, helper->closure); +} + +static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + return ldlm_resource_foreach(res, ldlm_iter_helper, arg) == + LDLM_ITER_STOP; +} + +void ldlm_namespace_foreach(struct ldlm_namespace *ns, + ldlm_iterator_t iter, void *closure) + +{ + struct iter_helper_data helper = { .iter = iter, .closure = closure }; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_iter_helper, &helper, 0); + +} + +/* non-blocking function to manipulate a lock whose cb_data is being put away. + * return 0: find no resource + * > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE. 
+ * < 0: errors + */ +int ldlm_resource_iterate(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_iterator_t iter, void *data) +{ + struct ldlm_resource *res; + int rc; + ENTRY; + + LASSERTF(ns != NULL, "must pass in namespace\n"); + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + LDLM_RESOURCE_ADDREF(res); + rc = ldlm_resource_foreach(res, iter, data); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_resource_iterate); + +/* Lock replay */ + +static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) +{ + struct list_head *list = closure; + + /* we use l_pending_chain here, because it's unused on clients. */ + LASSERTF(list_empty(&lock->l_pending_chain), + "lock %p next %p prev %p\n", + lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev); + /* bug 9573: don't replay locks left after eviction, or + * bug 17614: locks being actively cancelled. Get a reference + * on a lock so that it does not disapear under us (e.g. due to cancel) + */ + if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_BL_DONE))) { + list_add(&lock->l_pending_chain, list); + LDLM_LOCK_GET(lock); + } + + return LDLM_ITER_CONTINUE; +} + +static int replay_lock_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct ldlm_async_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct ldlm_reply *reply; + struct obd_export *exp; + + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + wake_up(&req->rq_import->imp_replay_waitq); + + if (rc != ELDLM_OK) + GOTO(out, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + lock = ldlm_handle2lock(&aa->lock_handle); + if (!lock) { + CERROR("received replay ack for unknown local cookie %#llx" + " remote cookie %#llx from server %s id %s\n", + aa->lock_handle.cookie, reply->lock_handle.cookie, + req->rq_export->exp_client_uuid.uuid, + libcfs_id2str(req->rq_peer)); + GOTO(out, rc = -ESTALE); + } + + /* Key change rehash lock in per-export hash with new key */ + exp = req->rq_export; + if (exp && exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + LDLM_DEBUG(lock, "replayed lock:"); + ptlrpc_import_recovery_state_machine(req->rq_import); + LDLM_LOCK_PUT(lock); +out: + if (rc != ELDLM_OK) + ptlrpc_connect_import(req->rq_import); + + RETURN(rc); +} + +static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + struct ldlm_async_args *aa; + struct ldlm_request *body; + int flags; + ENTRY; + + + /* Bug 11974: Do not replay a lock which is actively being canceled */ + if (ldlm_is_bl_done(lock)) { + LDLM_DEBUG(lock, "Not replaying canceled lock:"); + RETURN(0); + } + + /* If this is reply-less callback lock, we cannot replay it, since + * server might have long dropped it, but notification of that event was + * lost by network. (and server granted conflicting lock already) */ + if (ldlm_is_cancel_on_block(lock)) { + LDLM_DEBUG(lock, "Not replaying reply-less lock:"); + ldlm_lock_cancel(lock); + RETURN(0); + } + + /* + * If granted mode matches the requested mode, this lock is granted. 
+ * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (ldlm_is_granted(lock)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + else if (!list_empty(&lock->l_res_link)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + /* We're part of recovery, so don't wait for it. */ + req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; + /* If the state changed while we were prepared, don't wait */ + req->rq_no_delay = 1; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(flags); + + ldlm_lock2handle(lock, &body->lock_handle[0]); + if (lock->l_lvb_len > 0) + req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lock->l_lvb_len); + ptlrpc_request_set_replen(req); + /* notify the server we've replayed all requests. + * also, we mark the request to be put on a dedicated + * queue to be processed after all request replays. + * bug 6063 */ + lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE); + + LDLM_DEBUG(lock, "replaying lock:"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->lock_handle = body->lock_handle[0]; + req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} + +/** + * Cancel as many unused locks as possible before replay. Since we are + * in recovery, we cannot wait for any outstanding RPCs or send any RPC + * to the server. + * + * Called only in recovery before replaying locks. There is no need to + * replay locks that are unused. Since the clients may hold thousands of + * cached unused locks, dropping the unused locks can greatly reduce the + * load on the servers at recovery time.
+ */ +static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) +{ + int canceled; + struct list_head cancels = LIST_HEAD_INIT(cancels); + + CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before " + "replay for namespace %s (%d)\n", + ldlm_ns_name(ns), ns->ns_nr_unused); + + /* We don't need to care whether or not LRU resize is enabled + * because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the + * count parameter */ + canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0, + LCF_LOCAL, LDLM_LRU_FLAG_NO_WAIT); + + CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n", + canceled, ldlm_ns_name(ns)); +} + +static int lock_can_replay(struct obd_import *imp) +{ + struct client_obd *cli = &imp->imp_obd->u.cli; + + CDEBUG(D_HA, "check lock replay limit, inflights = %u(%u)\n", + atomic_read(&imp->imp_replay_inflight) - 1, + cli->cl_max_rpcs_in_flight); + + /* +1 due to ldlm_lock_replay() increment */ + return atomic_read(&imp->imp_replay_inflight) < + 1 + min_t(u32, cli->cl_max_rpcs_in_flight, 8); +} + +int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + struct list_head list = LIST_HEAD_INIT(list); + struct ldlm_lock *lock, *next; + int rc = 0; + + ENTRY; + + LASSERT(atomic_read(&imp->imp_replay_inflight) == 1); + + /* don't replay locks if import failed recovery */ + if (imp->imp_vbr_failed) + RETURN(0); + + if (ldlm_cancel_unused_locks_before_replay) + ldlm_cancel_unused_locks_for_replay(ns); + + ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list); + + list_for_each_entry_safe(lock, next, &list, l_pending_chain) { + list_del_init(&lock->l_pending_chain); + /* If we disconnected in the middle - cleanup and let + * reconnection happen again.
LU-14027 */ + if (rc || (imp->imp_state != LUSTRE_IMP_REPLAY_LOCKS)) { + LDLM_LOCK_RELEASE(lock); + continue; + } + rc = replay_one_lock(imp, lock); + LDLM_LOCK_RELEASE(lock); + + if (rate_limit) + wait_event_idle_exclusive(imp->imp_replay_waitq, + lock_can_replay(imp)); + } + + RETURN(rc); +} + +/** + * Lock replay uses rate control and can sleep waiting so + * must be in separate thread from ptlrpcd itself + */ +static int ldlm_lock_replay_thread(void *data) +{ + struct obd_import *imp = data; + + unshare_fs_struct(); + + CDEBUG(D_HA, "lock replay thread %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + __ldlm_replay_locks(imp, true); + atomic_dec(&imp->imp_replay_inflight); + ptlrpc_import_recovery_state_machine(imp); + class_import_put(imp); + + return 0; +} + +int ldlm_replay_locks(struct obd_import *imp) +{ + struct task_struct *task; + int rc = 0; + + class_import_get(imp); + /* ensure this doesn't fall to 0 before all have been queued */ + atomic_inc(&imp->imp_replay_inflight); + + task = kthread_run(ldlm_lock_replay_thread, imp, "ldlm_lock_replay"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CDEBUG(D_HA, "can't start lock replay thread: rc = %d\n", rc); + + /* run lock replay without rate control */ + rc = __ldlm_replay_locks(imp, false); + atomic_dec(&imp->imp_replay_inflight); + class_import_put(imp); + } + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c new file mode 100644 index 0000000000000..8b36f70af7f56 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -0,0 +1,1787 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_resource.c + * + * Author: Phil Schwan + * Author: Peter Braam + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include +#include +#include +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; +struct kmem_cache *ldlm_interval_tree_slab; +struct kmem_cache *ldlm_inodebits_slab; + +int ldlm_srv_namespace_nr = 0; +int ldlm_cli_namespace_nr = 0; + +DEFINE_MUTEX(ldlm_srv_namespace_lock); +LIST_HEAD(ldlm_srv_namespace_list); + +DEFINE_MUTEX(ldlm_cli_namespace_lock); +/* Client Namespaces that have active resources in them. 
+ * Once all resources go away, ldlm_poold moves such namespaces to the + * inactive list */ +LIST_HEAD(ldlm_cli_active_namespace_list); +/* Client namespaces that don't have any locks in them */ +LIST_HEAD(ldlm_cli_inactive_namespace_list); + +static struct dentry *ldlm_debugfs_dir; +static struct dentry *ldlm_ns_debugfs_dir; +struct dentry *ldlm_svc_debugfs_dir; + +/* during debug dump certain amount of granted locks for one resource to avoid + * DDOS. */ +static unsigned int ldlm_dump_granted_max = 256; + +static ssize_t ldebugfs_dump_ns_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(count); +} + +LDEBUGFS_FOPS_WR_ONLY(ldlm, dump_ns); + +static int ldlm_rw_uint_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", *(unsigned int *)m->private); + return 0; +} + +static ssize_t +ldlm_rw_uint_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + + if (!count) + return 0; + + return kstrtouint_from_user(buffer, count, 0, + (unsigned int *)seq->private); +} + +LDEBUGFS_SEQ_FOPS(ldlm_rw_uint); + +#ifdef HAVE_SERVER_SUPPORT + +static int seq_watermark_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", *(__u64 *)m->private); + return 0; +} + +static ssize_t seq_watermark_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + __s64 value; + __u64 watermark; + __u64 *data = ((struct seq_file *)file->private_data)->private; + bool wm_low = (data == &ldlm_reclaim_threshold_mb) ? true : false; + int rc; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &value, 'M'); + if (rc) { + CERROR("Failed to set %s, rc = %d.\n", + wm_low ? "lock_reclaim_threshold_mb" : "lock_limit_mb", + rc); + return rc; + } else if (value != 0 && value < (1 << 20)) { + CERROR("%s should be greater than 1MB.\n", + wm_low ? 
"lock_reclaim_threshold_mb" : "lock_limit_mb"); + return -EINVAL; + } + watermark = value >> 20; + + if (wm_low) { + if (ldlm_lock_limit_mb != 0 && watermark > ldlm_lock_limit_mb) { + CERROR("lock_reclaim_threshold_mb must be smaller than " + "lock_limit_mb.\n"); + return -EINVAL; + } + + *data = watermark; + if (watermark != 0) { + watermark <<= 20; + do_div(watermark, sizeof(struct ldlm_lock)); + } + ldlm_reclaim_threshold = watermark; + } else { + if (ldlm_reclaim_threshold_mb != 0 && + watermark < ldlm_reclaim_threshold_mb) { + CERROR("lock_limit_mb must be greater than " + "lock_reclaim_threshold_mb.\n"); + return -EINVAL; + } + + *data = watermark; + if (watermark != 0) { + watermark <<= 20; + do_div(watermark, sizeof(struct ldlm_lock)); + } + ldlm_lock_limit = watermark; + } + + return count; +} + +static int seq_watermark_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_watermark_show, inode->i_private); +} + +static const struct file_operations ldlm_watermark_fops = { + .owner = THIS_MODULE, + .open = seq_watermark_open, + .read = seq_read, + .write = seq_watermark_write, + .llseek = seq_lseek, + .release = lprocfs_single_release, +}; + +static int seq_granted_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", percpu_counter_sum_positive( + (struct percpu_counter *)m->private)); + return 0; +} + +static int seq_granted_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_granted_show, inode->i_private); +} + +static const struct file_operations ldlm_granted_fops = { + .owner = THIS_MODULE, + .open = seq_granted_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* HAVE_SERVER_SUPPORT */ + +static struct ldebugfs_vars ldlm_debugfs_list[] = { + { .name = "dump_namespaces", + .fops = &ldlm_dump_ns_fops, + .proc_mode = 0222 }, + { .name = "dump_granted_max", + .fops = &ldlm_rw_uint_fops, + .data = &ldlm_dump_granted_max }, +#ifdef HAVE_SERVER_SUPPORT + { .name = "lock_reclaim_threshold_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_reclaim_threshold_mb }, + { .name = "lock_limit_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_lock_limit_mb }, + { .name = "lock_granted_count", + .fops = &ldlm_granted_fops, + .data = &ldlm_granted_total }, +#endif + { NULL } +}; + +int ldlm_debugfs_setup(void) +{ + int rc; + + ENTRY; + ldlm_debugfs_dir = ldebugfs_register(OBD_LDLM_DEVICENAME, + debugfs_lustre_root, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_debugfs_dir)) { + CERROR("LDebugFS failed in ldlm-init\n"); + rc = ldlm_debugfs_dir ? PTR_ERR(ldlm_debugfs_dir) : -ENOMEM; + GOTO(err, rc); + } + + ldlm_ns_debugfs_dir = ldebugfs_register("namespaces", + ldlm_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_ns_debugfs_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = ldlm_ns_debugfs_dir ? PTR_ERR(ldlm_ns_debugfs_dir) + : -ENOMEM; + GOTO(err_type, rc); + } + + ldlm_svc_debugfs_dir = ldebugfs_register("services", + ldlm_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_svc_debugfs_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = ldlm_svc_debugfs_dir ? 
PTR_ERR(ldlm_svc_debugfs_dir) + : -ENOMEM; + GOTO(err_ns, rc); + } + + rc = ldebugfs_add_vars(ldlm_debugfs_dir, ldlm_debugfs_list, NULL); + if (rc != 0) { + CERROR("LProcFS failed in ldlm-init\n"); + GOTO(err_svc, rc); + } + + RETURN(0); + +err_svc: + ldebugfs_remove(&ldlm_svc_debugfs_dir); +err_ns: + ldebugfs_remove(&ldlm_ns_debugfs_dir); +err_type: + ldebugfs_remove(&ldlm_debugfs_dir); +err: + ldlm_svc_debugfs_dir = NULL; + ldlm_ns_debugfs_dir = NULL; + ldlm_debugfs_dir = NULL; + RETURN(rc); +} + +void ldlm_debugfs_cleanup(void) +{ + if (!IS_ERR_OR_NULL(ldlm_svc_debugfs_dir)) + ldebugfs_remove(&ldlm_svc_debugfs_dir); + + if (!IS_ERR_OR_NULL(ldlm_ns_debugfs_dir)) + ldebugfs_remove(&ldlm_ns_debugfs_dir); + + if (!IS_ERR_OR_NULL(ldlm_debugfs_dir)) + ldebugfs_remove(&ldlm_debugfs_dir); + + ldlm_svc_debugfs_dir = NULL; + ldlm_ns_debugfs_dir = NULL; + ldlm_debugfs_dir = NULL; +} + +static ssize_t resource_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u64 res = 0; + struct cfs_hash_bd bd; + int i; + + /* result is not strictly consistant */ + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i) + res += cfs_hash_bd_count_get(&bd); + return sprintf(buf, "%lld\n", res); +} +LUSTRE_RO_ATTR(resource_count); + +static ssize_t lock_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u64 locks; + + locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_FIELDS_FLAGS_SUM); + return sprintf(buf, "%lld\n", locks); +} +LUSTRE_RO_ATTR(lock_count); + +static ssize_t lock_unused_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns->ns_nr_unused); +} +LUSTRE_RO_ATTR(lock_unused_count); + +static ssize_t lru_size_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + __u32 *nr = &ns->ns_max_unused; + + if (ns_connect_lru_resize(ns)) + nr = &ns->ns_nr_unused; + return sprintf(buf, "%u\n", *nr); +} + +static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int lru_resize; + int err; + + if (strncmp(buffer, "clear", 5) == 0) { + CDEBUG(D_DLMTRACE, + "dropping all unused locks from namespace %s\n", + ldlm_ns_name(ns)); + /* Try to cancel all @ns_nr_unused locks. 
*/ + ldlm_cancel_lru(ns, INT_MAX, 0, LDLM_LRU_FLAG_CLEANUP); + return count; + } + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) { + CERROR("lru_size: invalid value written\n"); + return -EINVAL; + } + lru_resize = (tmp == 0); + + if (ns_connect_lru_resize(ns)) { + if (!lru_resize) + ns->ns_max_unused = (unsigned int)tmp; + + if (tmp > ns->ns_nr_unused) + tmp = ns->ns_nr_unused; + tmp = ns->ns_nr_unused - tmp; + + CDEBUG(D_DLMTRACE, + "changing namespace %s unused locks from %u to %u\n", + ldlm_ns_name(ns), ns->ns_nr_unused, + (unsigned int)tmp); + + if (!lru_resize) { + CDEBUG(D_DLMTRACE, + "disable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; + } + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, 0); + } else { + CDEBUG(D_DLMTRACE, + "changing namespace %s max_unused from %u to %u\n", + ldlm_ns_name(ns), ns->ns_max_unused, + (unsigned int)tmp); + + /* Make sure that LRU resize was originally supported before + * turning it on here. + */ + if (lru_resize && + (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { + CDEBUG(D_DLMTRACE, + "enable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; + } + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); + } + + return count; +} +LUSTRE_RW_ATTR(lru_size); + +static ssize_t lru_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%lld\n", ktime_to_ms(ns->ns_max_age)); +} + +static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + int scale = NSEC_PER_MSEC; + unsigned long long tmp; + char *buf; + + /* Did the user ask in seconds or milliseconds. 
Default is in ms */ + buf = strstr(buffer, "ms"); + if (!buf) { + buf = strchr(buffer, 's'); + if (buf) + scale = NSEC_PER_SEC; + } + + if (buf) + *buf = '\0'; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_max_age = ktime_set(0, tmp * scale); + + return count; +} +LUSTRE_RW_ATTR(lru_max_age); + +static ssize_t early_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns_connect_cancelset(ns)); +} + +static ssize_t early_lock_cancel_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long supp = -1; + int rc; + + rc = kstrtoul(buffer, 10, &supp); + if (rc < 0) + return rc; + + if (supp == 0) + ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET; + else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET) + ns->ns_connect_flags |= OBD_CONNECT_CANCELSET; + return count; +} +LUSTRE_RW_ATTR(early_lock_cancel); + +static ssize_t dirty_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%llu\n", ns->ns_dirty_age_limit); +} + +static ssize_t dirty_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long long tmp; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_dirty_age_limit = tmp; + + return count; +} +LUSTRE_RW_ATTR(dirty_age_limit); + +#ifdef HAVE_SERVER_SUPPORT +static ssize_t ctime_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%llu\n", ns->ns_ctime_age_limit); +} + +static ssize_t ctime_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long long tmp; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_ctime_age_limit = tmp; + + return count; +} +LUSTRE_RW_ATTR(ctime_age_limit); + +static ssize_t lock_timeouts_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns->ns_timeouts); +} +LUSTRE_RO_ATTR(lock_timeouts); + +static ssize_t max_nolock_bytes_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_max_nolock_size); +} + +static ssize_t max_nolock_bytes_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_nolock_size = tmp; + + return count; +} +LUSTRE_RW_ATTR(max_nolock_bytes); + +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%llu\n", 
ns->ns_contention_time); +} + +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long long tmp; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_contention_time = tmp; + + return count; +} +LUSTRE_RW_ATTR(contention_seconds); + +static ssize_t contended_locks_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_contended_locks); +} + +static ssize_t contended_locks_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_contended_locks = tmp; + + return count; +} +LUSTRE_RW_ATTR(contended_locks); + +static ssize_t max_parallel_ast_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_max_parallel_ast); +} + +static ssize_t max_parallel_ast_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_parallel_ast = tmp; + + return count; +} +LUSTRE_RW_ATTR(max_parallel_ast); + +#endif /* HAVE_SERVER_SUPPORT */ + +/* These are for namespaces in /sys/fs/lustre/ldlm/namespaces/ */ +static struct attribute *ldlm_ns_attrs[] = { + &lustre_attr_resource_count.attr, + &lustre_attr_lock_count.attr, + &lustre_attr_lock_unused_count.attr, + &lustre_attr_lru_size.attr, + &lustre_attr_lru_max_age.attr, + &lustre_attr_early_lock_cancel.attr, + &lustre_attr_dirty_age_limit.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_attr_ctime_age_limit.attr, + &lustre_attr_lock_timeouts.attr, + &lustre_attr_max_nolock_bytes.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_contended_locks.attr, + &lustre_attr_max_parallel_ast.attr, +#endif + NULL, +}; + +static void ldlm_ns_release(struct kobject *kobj) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + complete(&ns->ns_kobj_unregister); +} + +static struct kobj_type ldlm_ns_ktype = { + .default_attrs = ldlm_ns_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = ldlm_ns_release, +}; + +static void ldlm_namespace_debugfs_unregister(struct ldlm_namespace *ns) +{ + if (IS_ERR_OR_NULL(ns->ns_debugfs_entry)) + CERROR("dlm namespace %s has no procfs dir?\n", + ldlm_ns_name(ns)); + else + ldebugfs_remove(&ns->ns_debugfs_entry); + + if (ns->ns_stats != NULL) + lprocfs_free_stats(&ns->ns_stats); +} + +void ldlm_namespace_sysfs_unregister(struct ldlm_namespace *ns) +{ + kobject_put(&ns->ns_kobj); + wait_for_completion(&ns->ns_kobj_unregister); +} + +int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) +{ + int err; + + ns->ns_kobj.kset = ldlm_ns_kset; + init_completion(&ns->ns_kobj_unregister); + err = kobject_init_and_add(&ns->ns_kobj, &ldlm_ns_ktype, NULL, + "%s", ldlm_ns_name(ns)); + + ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0); + if (!ns->ns_stats) { + kobject_put(&ns->ns_kobj); + return 
-ENOMEM; + } + + lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); + + return err; +} + +static int ldlm_namespace_debugfs_register(struct ldlm_namespace *ns) +{ + struct dentry *ns_entry; + + if (!IS_ERR_OR_NULL(ns->ns_debugfs_entry)) { + ns_entry = ns->ns_debugfs_entry; + } else { + ns_entry = debugfs_create_dir(ldlm_ns_name(ns), + ldlm_ns_debugfs_dir); + if (!ns_entry) + return -ENOMEM; + ns->ns_debugfs_entry = ns_entry; + } + + return 0; +} +#undef MAX_STRING_SIZE + +static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + const struct ldlm_res_id *id = key; + unsigned val = 0; + unsigned i; + + for (i = 0; i < RES_NAME_SIZE; i++) + val += id->name[i]; + return val & mask; +} + +static unsigned ldlm_res_hop_fid_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + const struct ldlm_res_id *id = key; + struct lu_fid fid; + __u32 hash; + __u32 val; + + fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF]; + fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF]; + fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + hash = fid_flatten32(&fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) { + val = id->name[LUSTRE_RES_ID_HSH_OFF]; + hash += (val >> 5) + (val << 11); + } else { + val = fid_oid(&fid); + } + hash = hash_long(hash, hs->hs_bkt_bits); + /* give me another random factor */ + hash -= hash_long((unsigned long)hs, val % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *ldlm_res_hop_key(struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return &res->lr_name; +} + +static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return ldlm_res_eq((const struct ldlm_res_id *)key, + (const struct ldlm_res_id *)&res->lr_name); +} + +static void *ldlm_res_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_resource, lr_hash); +} + +static void +ldlm_res_hop_get_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_getref(res); +} + +static void ldlm_res_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_putref(res); +} + +static struct cfs_hash_ops ldlm_ns_hash_ops = { + .hs_hash = ldlm_res_hop_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put = ldlm_res_hop_put +}; + +static struct cfs_hash_ops ldlm_ns_fid_hash_ops = { + .hs_hash = ldlm_res_hop_fid_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put = ldlm_res_hop_put +}; + +typedef struct ldlm_ns_hash_def { + enum ldlm_ns_type nsd_type; + /** hash bucket bits */ + unsigned nsd_bkt_bits; + /** hash bits */ + unsigned nsd_all_bits; + /** hash operations */ + struct cfs_hash_ops *nsd_hops; +} ldlm_ns_hash_def_t; + +static struct ldlm_ns_hash_def ldlm_ns_hash_defs[] = +{ + { + .nsd_type = LDLM_NS_TYPE_MDC, + 
.nsd_bkt_bits = 11, + .nsd_all_bits = 16, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MDT, + .nsd_bkt_bits = 14, + .nsd_all_bits = 21, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OSC, + .nsd_bkt_bits = 8, + .nsd_all_bits = 12, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OST, + .nsd_bkt_bits = 11, + .nsd_all_bits = 17, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGC, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGT, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_UNKNOWN, + }, +}; + +/** + * Create and initialize new empty namespace. + */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + enum ldlm_side client, + enum ldlm_appetite apt, + enum ldlm_ns_type ns_type) +{ + struct ldlm_namespace *ns = NULL; + struct ldlm_ns_bucket *nsb; + struct ldlm_ns_hash_def *nsd; + struct cfs_hash_bd bd; + int idx; + int rc; + ENTRY; + + LASSERT(obd != NULL); + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + RETURN(NULL); + } + + for (idx = 0;;idx++) { + nsd = &ldlm_ns_hash_defs[idx]; + if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) { + CERROR("Unknown type %d for ns %s\n", ns_type, name); + GOTO(out_ref, NULL); + } + + if (nsd->nsd_type == ns_type) + break; + } + + OBD_ALLOC_PTR(ns); + if (!ns) + GOTO(out_ref, NULL); + + ns->ns_rs_hash = cfs_hash_create(name, + nsd->nsd_all_bits, nsd->nsd_all_bits, + nsd->nsd_bkt_bits, sizeof(*nsb), + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + nsd->nsd_hops, + CFS_HASH_DEPTH | + CFS_HASH_BIGNAME | + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF); + if (ns->ns_rs_hash == NULL) + GOTO(out_ns, NULL); + + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) { + nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); + nsb->nsb_namespace = ns; + nsb->nsb_reclaim_start = 0; + } + + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + ns->ns_name = kstrdup(name, GFP_KERNEL); + if (!ns->ns_name) + goto out_hash; + + INIT_LIST_HEAD(&ns->ns_list_chain); + INIT_LIST_HEAD(&ns->ns_unused_list); + spin_lock_init(&ns->ns_lock); + atomic_set(&ns->ns_bref, 0); + init_waitqueue_head(&ns->ns_waitq); + + ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; + ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; + ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; + + ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; + ns->ns_nr_unused = 0; + ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; + ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); + ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_dirty_age_limit = LDLM_DIRTY_AGE_LIMIT; + ns->ns_timeouts = 0; + ns->ns_orig_connect_flags = 0; + ns->ns_connect_flags = 0; + ns->ns_stopping = 0; + ns->ns_reclaim_start = 0; + ns->ns_last_pos = &ns->ns_unused_list; + ns->ns_flags = 0; + + rc = ldlm_namespace_sysfs_register(ns); + if (rc) { + CERROR("Can't initialize ns sysfs, rc %d\n", rc); + GOTO(out_hash, rc); + } + + rc = ldlm_namespace_debugfs_register(ns); + if (rc) { + CERROR("Can't initialize ns proc, rc %d\n", rc); + GOTO(out_sysfs, rc); + } + + idx = ldlm_namespace_nr_read(client); + rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); + if (rc) { + CERROR("Can't initialize lock pool, rc %d\n", rc); + GOTO(out_proc, rc); + } + + ldlm_namespace_register(ns, client); + 
RETURN(ns); +out_proc: + ldlm_namespace_debugfs_unregister(ns); +out_sysfs: + ldlm_namespace_sysfs_unregister(ns); + ldlm_namespace_cleanup(ns, 0); +out_hash: + kfree(ns->ns_name); + cfs_hash_putref(ns->ns_rs_hash); +out_ns: + OBD_FREE_PTR(ns); +out_ref: + ldlm_put_ref(); + RETURN(NULL); +} +EXPORT_SYMBOL(ldlm_namespace_new); + +extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); + +/** + * Cancel and destroy all locks on a resource. + * + * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just + * clean up. This is currently only used for recovery, and we make + * certain assumptions as a result--notably, that we shouldn't cancel + * locks with refs. + */ +static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, + __u64 flags) +{ + struct list_head *tmp; + int rc = 0, client = ns_is_client(ldlm_res_to_ns(res)); + bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY); + + do { + struct ldlm_lock *lock = NULL; + + /* First, we look for non-cleaned-yet lock + * all cleaned locks are marked by CLEANED flag. */ + lock_res(res); + list_for_each(tmp, q) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_is_cleaned(lock)) { + lock = NULL; + continue; + } + LDLM_LOCK_GET(lock); + ldlm_set_cleaned(lock); + break; + } + + if (lock == NULL) { + unlock_res(res); + break; + } + + /* Set CBPENDING so nothing in the cancellation path + * can match this lock. */ + ldlm_set_cbpending(lock); + ldlm_set_failed(lock); + lock->l_flags |= flags; + + /* ... without sending a CANCEL message for local_only. */ + if (local_only) + ldlm_set_local_only(lock); + + if (local_only && (lock->l_readers || lock->l_writers)) { + /* This is a little bit gross, but much better than the + * alternative: pretend that we got a blocking AST from + * the server, so that when the lock is decref'd, it + * will go away ... */ + unlock_res(res); + LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); + if (lock->l_flags & LDLM_FL_FAIL_LOC) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(4)); + set_current_state(TASK_RUNNING); + } + if (lock->l_completion_ast) + lock->l_completion_ast(lock, + LDLM_FL_FAILED, NULL); + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (client) { + struct lustre_handle lockh; + + unlock_res(res); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_LOCAL); + if (rc) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + unlock_res(res); + LDLM_DEBUG(lock, "Freeing a lock still held by a " + "client node"); + ldlm_lock_cancel(lock); + } + LDLM_LOCK_RELEASE(lock); + } while (1); +} + +static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + __u64 flags = *(__u64 *)arg; + + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_waiting, flags); + + return 0; +} + +static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + lock_res(res); + CERROR("%s: namespace resource "DLDLMRES" (%p) refcount nonzero " + "(%d) after lock cleanup; forcing cleanup.\n", + ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, + atomic_read(&res->lr_refcount) - 1); + + /* Use D_NETERROR since it is in the default mask */ + ldlm_resource_dump(D_NETERROR, res); + unlock_res(res); + return 0; +} + +/** + * Cancel and destroy all locks in the namespace. 
+ * + * Typically used during evictions when the server notified the client that it was + * evicted and all of its state needs to be destroyed. + * Also used during shutdown. + */ +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) +{ + if (ns == NULL) { + CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); + return ELDLM_OK; + } + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, + &flags, 0); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, + NULL, 0); + return ELDLM_OK; +} +EXPORT_SYMBOL(ldlm_namespace_cleanup); + +/** + * Attempts to free namespace. + * + * Only used when namespace goes away, like during an unmount. + */ +static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) +{ + ENTRY; + + /* At shutdown time, don't call the cancellation callback */ + ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0); + + if (atomic_read(&ns->ns_bref) > 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc; + CDEBUG(D_DLMTRACE, + "dlm namespace %s free waiting on refcount %d\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); +force_wait: + if (force) + lwi = LWI_TIMEOUT(msecs_to_jiffies(obd_timeout * + MSEC_PER_SEC) / 4, NULL, NULL); + + rc = l_wait_event(ns->ns_waitq, + atomic_read(&ns->ns_bref) == 0, &lwi); + + /* Forced cleanups should be able to reclaim all references, + * so it's safe to wait forever... we can't leak locks... */ + if (force && rc == -ETIMEDOUT) { + LCONSOLE_ERROR("Forced cleanup waiting for %s " + "namespace with %d resources in use, " + "(rc=%d)\n", ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + GOTO(force_wait, rc); + } + + if (atomic_read(&ns->ns_bref)) { + LCONSOLE_ERROR("Cleanup waiting for %s namespace " + "with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + RETURN(ELDLM_NAMESPACE_EXISTS); + } + CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n", + ldlm_ns_name(ns)); + } + + RETURN(ELDLM_OK); +} + +/** + * Performs various cleanups for passed \a ns to make it drop refc and be + * ready for freeing. Waits for refc == 0. + * + * The following is done: + * (0) Unregister \a ns from its list to make it inaccessible for potential + * users like the pools thread and others; + * (1) Clear all locks in \a ns. + */ +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + int rc; + ENTRY; + if (!ns) { + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + ns->ns_stopping = 1; + spin_unlock(&ns->ns_lock); + + /* + * Can fail with -EINTR when force == 0 in which case try harder. + */ + rc = __ldlm_namespace_free(ns, force); + if (rc != ELDLM_OK) { + if (imp) { + ptlrpc_disconnect_import(imp, 0); + ptlrpc_invalidate_import(imp); + } + + /* + * With all requests dropped and the import inactive + * we are guaranteed all references will be dropped. + */ + rc = __ldlm_namespace_free(ns, 1); + LASSERT(rc == 0); + } + EXIT; +} +EXPORT_SYMBOL(ldlm_namespace_free_prior); + +/** + * Frees memory structures related to \a ns. This is only done + * when ldlm_namespace_free_prior() successfully removed all resources + * referencing \a ns and its refc == 0. + */ +void ldlm_namespace_free_post(struct ldlm_namespace *ns) +{ + ENTRY; + if (!ns) { + EXIT; + return; + } + + /* Make sure that nobody can find this ns in its list. */ + ldlm_namespace_unregister(ns, ns->ns_client); + /* Fini pool _before_ parent proc dir is removed. This is important as + * ldlm_pool_fini() removes its own proc dir, which is a child of @dir.
+ * Removing it after @dir may cause oops. */ + ldlm_pool_fini(&ns->ns_pool); + + ldlm_namespace_debugfs_unregister(ns); + ldlm_namespace_sysfs_unregister(ns); + cfs_hash_putref(ns->ns_rs_hash); + kfree(ns->ns_name); + /* Namespace \a ns should be not on list at this time, otherwise + * this will cause issues related to using freed \a ns in poold + * thread. + */ + LASSERT(list_empty(&ns->ns_list_chain)); + OBD_FREE_PTR(ns); + ldlm_put_ref(); + EXIT; +} +EXPORT_SYMBOL(ldlm_namespace_free_post); + +/** + * Cleanup the resource, and free namespace. + * bug 12864: + * Deadlock issue: + * proc1: destroy import + * class_disconnect_export(grab cl_sem) -> + * -> ldlm_namespace_free -> + * -> lprocfs_remove(grab _lprocfs_lock). + * proc2: read proc info + * lprocfs_fops_read(grab _lprocfs_lock) -> + * -> osc_rd_active, etc(grab cl_sem). + * + * So that I have to split the ldlm_namespace_free into two parts - the first + * part ldlm_namespace_free_prior is used to cleanup the resource which is + * being used; the 2nd part ldlm_namespace_free_post is used to unregister the + * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem + * held. + */ +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + ldlm_namespace_free_prior(ns, imp, force); + ldlm_namespace_free_post(ns); +} +EXPORT_SYMBOL(ldlm_namespace_free); + +void ldlm_namespace_get(struct ldlm_namespace *ns) +{ + atomic_inc(&ns->ns_bref); +} + +/* This is only for callers that care about refcount */ +static int ldlm_namespace_get_return(struct ldlm_namespace *ns) +{ + return atomic_inc_return(&ns->ns_bref); +} + +void ldlm_namespace_put(struct ldlm_namespace *ns) +{ + if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) { + wake_up(&ns->ns_waitq); + spin_unlock(&ns->ns_lock); + } +} + +/** Register \a ns in the list of namespaces */ +void ldlm_namespace_register(struct ldlm_namespace *ns, enum ldlm_side client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(list_empty(&ns->ns_list_chain)); + list_add(&ns->ns_list_chain, ldlm_namespace_inactive_list(client)); + ldlm_namespace_nr_inc(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Unregister \a ns from the list of namespaces. */ +void ldlm_namespace_unregister(struct ldlm_namespace *ns, enum ldlm_side client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(!list_empty(&ns->ns_list_chain)); + /* Some asserts and possibly other parts of the code are still + * using list_empty(&ns->ns_list_chain). This is why it is + * important to use list_del_init() here. */ + list_del_init(&ns->ns_list_chain); + ldlm_namespace_nr_dec(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns, + enum ldlm_side client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns, + enum ldlm_side client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, + ldlm_namespace_inactive_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. 
*/ +struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client) +{ + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + LASSERT(!list_empty(ldlm_namespace_list(client))); + return container_of(ldlm_namespace_list(client)->next, + struct ldlm_namespace, ns_list_chain); +} + +static bool ldlm_resource_extent_new(struct ldlm_resource *res) +{ + int idx; + + OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + if (res->lr_itree == NULL) + return false; + /* Initialize interval trees for each lock mode. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } + return true; +} + +static bool ldlm_resource_inodebits_new(struct ldlm_resource *res) +{ + int i; + + OBD_ALLOC_PTR(res->lr_ibits_queues); + if (res->lr_ibits_queues == NULL) + return false; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&res->lr_ibits_queues->liq_waiting[i]); + return true; +} + +/** Create and initialize new resource. */ +static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) +{ + struct ldlm_resource *res; + bool rc; + + OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); + if (res == NULL) + return NULL; + + switch (ldlm_type) { + case LDLM_EXTENT: + rc = ldlm_resource_extent_new(res); + break; + case LDLM_IBITS: + rc = ldlm_resource_inodebits_new(res); + break; + default: + rc = true; + break; + } + if (!rc) { + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); + return NULL; + } + + INIT_LIST_HEAD(&res->lr_granted); + INIT_LIST_HEAD(&res->lr_waiting); + + atomic_set(&res->lr_refcount, 1); + spin_lock_init(&res->lr_lock); + lu_ref_init(&res->lr_reference); + + /* Since LVB init can be delayed now, there is no longer need to + * immediatelly acquire mutex here. */ + mutex_init(&res->lr_lvb_mutex); + res->lr_lvb_initialized = false; + + return res; +} + +static void ldlm_resource_free(struct ldlm_resource *res) +{ + if (res->lr_type == LDLM_EXTENT) { + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + } else if (res->lr_type == LDLM_IBITS) { + if (res->lr_ibits_queues != NULL) + OBD_FREE_PTR(res->lr_ibits_queues); + } + + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); +} + +/** + * Return a reference to resource with given name, creating it if necessary. 
+ * Args: namespace with ns_lock unlocked + * Locks: takes and releases NS hash-lock and res->lr_lock + * Returns: referenced, unlocked ldlm_resource or NULL + */ +struct ldlm_resource * +ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, + const struct ldlm_res_id *name, enum ldlm_type type, + int create) +{ + struct hlist_node *hnode; + struct ldlm_resource *res = NULL; + struct cfs_hash_bd bd; + __u64 version; + int ns_refcount = 0; + + LASSERT(ns != NULL); + LASSERT(parent == NULL); + LASSERT(ns->ns_rs_hash != NULL); + LASSERT(name->name[0] != 0); + + cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0); + hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + if (hnode != NULL) { + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + GOTO(found, res); + } + + version = cfs_hash_bd_version_get(&bd); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + + if (create == 0) + return ERR_PTR(-ENOENT); + + LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, + "type: %d\n", type); + res = ldlm_resource_new(type); + if (res == NULL) + return ERR_PTR(-ENOMEM); + + res->lr_ns_bucket = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + res->lr_name = *name; + res->lr_type = type; + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL : + cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + + if (hnode != NULL) { + /* Someone won the race and already added the resource. */ + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* Clean lu_ref for failed resource. */ + lu_ref_fini(&res->lr_reference); + ldlm_resource_free(res); +found: + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return res; + } + /* We won! Let's add the resource. */ + cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash); + if (cfs_hash_bd_count_get(&bd) == 1) + ns_refcount = ldlm_namespace_get_return(ns); + + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); + + /* Let's see if we happened to be the very first resource in this + * namespace. If so, and this is a client namespace, we need to move + * the namespace into the active namespaces list to be patrolled by + * the ldlm_poold. */ + if (ns_is_client(ns) && ns_refcount == 1) { + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_namespace_move_to_active_locked(ns, LDLM_NAMESPACE_CLIENT); + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + } + + return res; +} +EXPORT_SYMBOL(ldlm_resource_get); + +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) +{ + LASSERT(res != NULL); + LASSERT(res != LP_POISON); + atomic_inc(&res->lr_refcount); + CDEBUG(D_INFO, "getref res: %p count: %d\n", res, + atomic_read(&res->lr_refcount)); + return res; +} + +static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, + struct ldlm_resource *res) +{ + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + + if (!list_empty(&res->lr_granted)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_waiting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (cfs_hash_bd_count_get(bd) == 0) + ldlm_namespace_put(nsb->nsb_namespace); +} + +/* Returns 1 if the resource was freed, 0 if it remains. 
*/ +int ldlm_resource_putref(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct cfs_hash_bd bd; + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd); + if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) { + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + ldlm_resource_free(res); + return 1; + } + return 0; +} +EXPORT_SYMBOL(ldlm_resource_putref); + +/** + * Add a lock into a given resource into specified lock list. + */ +void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + check_res_locked(res); + + LDLM_DEBUG(lock, "About to add this lock"); + + if (ldlm_is_destroyed(lock)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + + list_add_tail(&lock->l_res_link, head); + + if (res->lr_type == LDLM_IBITS) + ldlm_inodebits_add_lock(res, head, lock); +} + +/** + * Insert a lock into resource after specified lock. + * + * Obtain resource description from the lock we are inserting after. + */ +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + struct ldlm_resource *res = original->l_resource; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(new, "About to insert this lock after %p: ", original); + + if (ldlm_is_destroyed(new)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + goto out; + } + + LASSERT(list_empty(&new->l_res_link)); + + list_add(&new->l_res_link, &original->l_res_link); + out:; +} + +void ldlm_resource_unlink_lock(struct ldlm_lock *lock) +{ + int type = lock->l_resource->lr_type; + + check_res_locked(lock->l_resource); + switch (type) { + case LDLM_PLAIN: + ldlm_unlink_lock_skiplist(lock); + break; + case LDLM_EXTENT: + ldlm_extent_unlink_lock(lock); + break; + case LDLM_IBITS: + ldlm_inodebits_unlink_lock(lock); + break; + } + list_del_init(&lock->l_res_link); +} +EXPORT_SYMBOL(ldlm_resource_unlink_lock); + +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) +{ + desc->lr_type = res->lr_type; + desc->lr_name = res->lr_name; +} + +/** + * Print information about all locks in all namespaces on this node to debug + * log. + */ +void ldlm_dump_all_namespaces(enum ldlm_side client, int level) +{ + struct list_head *tmp; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + mutex_lock(ldlm_namespace_lock(client)); + + list_for_each(tmp, ldlm_namespace_list(client)) { + struct ldlm_namespace *ns; + + ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain); + ldlm_namespace_dump(level, ns); + } + + mutex_unlock(ldlm_namespace_lock(client)); +} + +static int ldlm_res_hash_dump(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int level = (int)(unsigned long)arg; + + lock_res(res); + ldlm_resource_dump(level, res); + unlock_res(res); + + return 0; +} + +/** + * Print information about all locks in this namespace on this node to debug + * log. 
+ */ +void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) +{ + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref), + ns_is_client(ns) ? "client" : "server"); + + if (ktime_get_seconds() < ns->ns_next_dump) + return; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_hash_dump, + (void *)(unsigned long)level, 0); + spin_lock(&ns->ns_lock); + ns->ns_next_dump = ktime_get_seconds() + 10; + spin_unlock(&ns->ns_lock); +} + +/** + * Print information about all locks in this resource to debug log. + */ +void ldlm_resource_dump(int level, struct ldlm_resource *res) +{ + struct ldlm_lock *lock; + unsigned int granted = 0; + + CLASSERT(RES_NAME_SIZE == 4); + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Resource: "DLDLMRES" (%p) refcount = %d\n", + PLDLMRES(res), res, atomic_read(&res->lr_refcount)); + + if (!list_empty(&res->lr_granted)) { + CDEBUG(level, "Granted locks (in reverse order):\n"); + list_for_each_entry_reverse(lock, &res->lr_granted, + l_res_link) { + LDLM_DEBUG_LIMIT(level, lock, "###"); + if (!(level & D_CANTMASK) && + ++granted > ldlm_dump_granted_max) { + CDEBUG(level, "only dump %d granted locks to " + "avoid DDOS.\n", granted); + break; + } + } + } + + if (!list_empty(&res->lr_waiting)) { + CDEBUG(level, "Waiting locks:\n"); + list_for_each_entry(lock, &res->lr_waiting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } +} +EXPORT_SYMBOL(ldlm_resource_dump); diff --git a/drivers/staging/lustrefsx/lustre/llite/Makefile b/drivers/staging/lustrefsx/lustre/llite/Makefile new file mode 100644 index 0000000000000..19f415face716 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lustre.o + +lustre-y := dcache.o dir.o file.o llite_lib.o llite_nfs.o +lustre-y += rw.o lproc_llite.o namei.o symlink.o llite_mmap.o +lustre-y += xattr.o xattr_cache.o +lustre-y += rw26.o super25.o statahead.o xattr_security.o +lustre-y += glimpse.o +lustre-y += lcommon_cl.o +lustre-y += lcommon_misc.o +lustre-y += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o +lustre-y += range_lock.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/llite/dcache.c b/drivers/staging/lustrefsx/lustre/llite/dcache.c new file mode 100644 index 0000000000000..801cfc988b273 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/dcache.c @@ -0,0 +1,421 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include + +#include "llite_internal.h" + +static void free_dentry_data(struct rcu_head *head) +{ + struct ll_dentry_data *lld; + + lld = container_of(head, struct ll_dentry_data, lld_rcu_head); + OBD_FREE_PTR(lld); +} + +/* should NOT be called with the dcache lock, see fs/dcache.c */ +static void ll_release(struct dentry *de) +{ + struct ll_dentry_data *lld; + ENTRY; + LASSERT(de != NULL); + lld = ll_d2d(de); + if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */ + RETURN_EXIT; + + if (lld->lld_it) { + ll_intent_release(lld->lld_it); + OBD_FREE(lld->lld_it, sizeof(*lld->lld_it)); + } + + de->d_fsdata = NULL; + call_rcu(&lld->lld_rcu_head, free_dentry_data); + + EXIT; +} + +/* Compare if two dentries are the same. Don't match if the existing dentry + * is marked invalid. Returns 1 if different, 0 if the same. + * + * This avoids a race where ll_lookup_it() instantiates a dentry, but we get + * an AST before calling d_revalidate_it(). The dentry still exists (marked + * INVALID) so d_lookup() matches it, but we have no lock on it (so + * lock_match() fails) and we spin around real_lookup(). */ +#ifdef HAVE_D_COMPARE_7ARGS +static int ll_dcompare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, + const struct qstr *name) +#elif defined(HAVE_D_COMPARE_5ARGS) +static int ll_dcompare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, + const struct qstr *name) +#elif defined(HAVE_D_COMPARE_4ARGS) +static int ll_dcompare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +#else +static int ll_dcompare(struct dentry *parent, struct qstr *d_name, + struct qstr *name) +#endif +{ +#if !defined(HAVE_D_COMPARE_7ARGS) && !defined(HAVE_D_COMPARE_5ARGS) && !defined(HAVE_D_COMPARE_4ARGS) + /* XXX: (ugh !) d_name must be in-dentry structure */ + struct dentry *dentry = container_of(d_name, struct dentry, d_name); + unsigned int len = d_name->len; + const char *str = d_name->name; +#endif + ENTRY; + + if (len != name->len) + RETURN(1); + + if (memcmp(str, name->name, len)) + RETURN(1); + + CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", + name->len, name->name, dentry, dentry->d_flags, + ll_d_count(dentry)); + + /* mountpoint is always valid */ + if (d_mountpoint((struct dentry *)dentry)) + RETURN(0); + + if (d_lustre_invalid(dentry)) + RETURN(1); + + RETURN(0); +} + +/** + * Called when last reference to a dentry is dropped and dcache wants to know + * whether or not it should cache it: + * - return 1 to delete the dentry immediately + * - return 0 to cache the dentry + * Should NOT be called with the dcache lock, see fs/dcache.c + */ +static int ll_ddelete(HAVE_D_DELETE_CONST struct dentry *de) +{ + ENTRY; + LASSERT(de); + + CDEBUG(D_DENTRY, "%s dentry %.*s (%p, parent %p, inode %p) %s%s\n", + d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping", + de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode, + d_unhashed((struct dentry *)de) ? "" : "hashed,", + list_empty(&de->d_subdirs) ? "" : "subdirs"); + +#ifdef HAVE_DCACHE_LOCK + LASSERT(ll_d_count(de) == 0); +#else + /* kernel >= 2.6.38 last refcount is decreased after this function. 
*/ + LASSERT(ll_d_count(de) == 1); +#endif + + if (d_lustre_invalid((struct dentry *)de)) + RETURN(1); + RETURN(0); +} + +int ll_d_init(struct dentry *de) +{ + ENTRY; + LASSERT(de != NULL); + + CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n", + de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode, + ll_d_count(de)); + + if (de->d_fsdata == NULL) { + struct ll_dentry_data *lld; + + OBD_ALLOC_PTR(lld); + if (likely(lld != NULL)) { + spin_lock(&de->d_lock); + if (likely(de->d_fsdata == NULL)) { +#ifdef HAVE_DCACHE_LOCK + /* kernel >= 2.6.38 d_op is set in d_alloc() */ + de->d_op = &ll_d_ops; + smp_mb(); +#endif + de->d_fsdata = lld; + __d_lustre_invalidate(de); + } else { + OBD_FREE_PTR(lld); + } + spin_unlock(&de->d_lock); + } else { + RETURN(-ENOMEM); + } + } + LASSERT(de->d_op == &ll_d_ops); + + RETURN(0); +} + +void ll_intent_drop_lock(struct lookup_intent *it) +{ + if (it->it_op && it->it_lock_mode) { + struct lustre_handle handle; + + handle.cookie = it->it_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing lock with cookie %#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, it->it_lock_mode); + + /* bug 494: intent_release may be called multiple times, from + * this thread and we don't want to double-decref this lock */ + it->it_lock_mode = 0; + if (it->it_remote_lock_mode != 0) { + handle.cookie = it->it_remote_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing remote lock with cookie" + "%#llx from it %p\n", handle.cookie, it); + ldlm_lock_decref(&handle, + it->it_remote_lock_mode); + it->it_remote_lock_mode = 0; + } + } +} + +void ll_intent_release(struct lookup_intent *it) +{ + ENTRY; + + CDEBUG(D_INFO, "intent %p released\n", it); + ll_intent_drop_lock(it); + /* We are still holding extra reference on a request, need to free it */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished(it->it_request); /* ll_file_open */ + + if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ + ptlrpc_req_finished(it->it_request); + + it->it_disposition = 0; + it->it_request = NULL; + EXIT; +} + +void ll_invalidate_aliases(struct inode *inode) +{ + struct dentry *dentry; + DECLARE_LL_D_HLIST_NODE_PTR(p); + ENTRY; + + LASSERT(inode != NULL); + + CDEBUG(D_INODE, "marking dentries for inode "DFID"(%p) invalid\n", + PFID(ll_inode2fid(inode)), inode); + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry) { + CDEBUG(D_DENTRY, "dentry in drop %.*s (%p) parent %p " + "inode %p flags %d\n", dentry->d_name.len, + dentry->d_name.name, dentry, dentry->d_parent, + dentry->d_inode, dentry->d_flags); + + d_lustre_invalidate(dentry, 0); + } + ll_unlock_dcache(inode); + + EXIT; +} + +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct dentry *de) +{ + struct inode *inode = de->d_inode; + __u64 bits = 0; + int rc = 0; + + ENTRY; + + if (!request) + RETURN(0); + + if (it_disposition(it, DISP_LOOKUP_NEG)) + RETURN(-ENOENT); + + rc = ll_prep_inode(&de->d_inode, request, NULL, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(inode)->ll_md_exp, inode, it, + &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + ll_update_dir_depth(de->d_parent->d_inode, inode); + rc = ll_d_init(de); + if (rc < 0) + RETURN(rc); + d_lustre_revalidate(de); + } + + RETURN(rc); +} + +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry) +{ + LASSERT(it != NULL); + LASSERT(dentry != NULL); + + if (it->it_lock_mode && dentry->d_inode != NULL) { + struct inode *inode = 
dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); + + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + } + + /* drop lookup or getattr locks immediately */ + if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) { + /* on 2.6 there are situation when several lookups and + * revalidations may be requested during single operation. + * therefore, we don't release intent here -bzzz */ + ll_intent_drop_lock(it); + } +} + +static int ll_revalidate_dentry(struct dentry *dentry, + unsigned int lookup_flags) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct ll_dentry_data *lld = dentry->d_fsdata; + struct ll_sb_info *sbi; + + /* If this is intermediate component path lookup and we were able to get + * to this dentry, then its lock has not been revoked and the + * path component is valid. */ + if (lookup_flags & (LOOKUP_CONTINUE | LOOKUP_PARENT)) + return 1; + + /* Symlink - always valid as long as the dentry was found */ +#ifdef HAVE_IOP_GET_LINK + if (dentry->d_inode && dentry->d_inode->i_op->get_link) +#else + if (dentry->d_inode && dentry->d_inode->i_op->follow_link) +#endif + return 1; + + /* + * VFS warns us that this is the second go around and previous + * operation failed (most likely open|creat), so this time + * we better talk to the server via the lookup path by name, + * not by fid. + */ + if (lookup_flags & LOOKUP_REVAL) + return 0; + +#ifndef HAVE_DCACHE_LOCK + if (lookup_flags & LOOKUP_RCU) + return -ECHILD; +#endif + + /* + * To support metadata lazy load, we want to bypass negative lookup cache + * on the client. A negative dentry cache is a dentry node that does not + * have an inode associated with it. In these cases, return 0 here + * to force a lookup call to the server. + */ + sbi = ll_s2sbi(dentry->d_sb); + if (d_is_negative(dentry) && + sbi->ll_neg_dentry_timeout != OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) { + LASSERT(lld != NULL); + if (!lld->lld_neg_cache_timeout) + lld->lld_neg_cache_timeout = jiffies + sbi->ll_neg_dentry_timeout * HZ; + + if (time_after(jiffies, lld->lld_neg_cache_timeout)) { + CDEBUG(D_VFSTRACE, + "negative dentry past timeout - flags: %u\n", lookup_flags); + return 0; + } + CDEBUG(D_VFSTRACE, + "negative dentry within timeout - flags: %u\n", lookup_flags); + } + + if (dentry_may_statahead(dir, dentry)) + ll_statahead(dir, &dentry, dentry->d_inode == NULL); + + return 1; +} + +/* + * Always trust cached dentries. Update statahead window if necessary. + */ +#ifdef HAVE_IOP_ATOMIC_OPEN +static int ll_revalidate_nd(struct dentry *dentry, unsigned int flags) +{ + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s, flags=%u\n", + dentry->d_name.name, flags); + + rc = ll_revalidate_dentry(dentry, flags); + RETURN(rc); +} +#else +static int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd) +{ + int rc; + ENTRY; + + /* + * this is normally called from NFS export, and we don't know whether + * this is the last component. 
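The negative-dentry handling in ll_revalidate_dentry() above arms a per-dentry deadline (jiffies + ll_neg_dentry_timeout * HZ) and forces a server lookup once time_after() says the deadline has passed. A self-contained sketch of that wrap-safe deadline check, with stand-in names rather than the kernel's jiffies/time_after:

/* Illustration only: the comparison uses signed subtraction so it still
 * works when the tick counter wraps around, like the kernel's time_after(). */
#include <stdio.h>

typedef unsigned long tick_t;

#define tick_after(a, b)  ((long)((b) - (a)) < 0)

struct neg_dentry {
        tick_t deadline;        /* 0 = not armed yet */
};

/* returns 1 if the cached negative entry may still be trusted,
 * 0 if it has expired and a fresh lookup should be sent to the server */
static int neg_entry_valid(struct neg_dentry *d, tick_t now, tick_t timeout)
{
        if (!d->deadline)
                d->deadline = now + timeout;    /* arm on first use */

        return !tick_after(now, d->deadline);
}

int main(void)
{
        struct neg_dentry d = { 0 };

        printf("%d\n", neg_entry_valid(&d, 100, 50));   /* 1: within timeout */
        printf("%d\n", neg_entry_valid(&d, 200, 50));   /* 0: expired */
        return 0;
}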
+ */ + if (nd == NULL) + RETURN(1); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s, flags=%u\n", + dentry->d_name.name, nd->flags); + + rc = ll_revalidate_dentry(dentry, nd->flags); + RETURN(rc); +} +#endif + +const struct dentry_operations ll_d_ops = { + .d_revalidate = ll_revalidate_nd, + .d_release = ll_release, + .d_delete = ll_ddelete, + .d_compare = ll_dcompare, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/dir.c b/drivers/staging/lustrefsx/lustre/llite/dir.c new file mode 100644 index 0000000000000..ce74bfb18dc57 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/dir.c @@ -0,0 +1,2410 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/dir.c + * + * Directory code for lustre client. + */ + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include // for wait_on_buffer +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llite_internal.h" + +static int ll_check_and_trigger_restore(struct inode *dir) +{ + struct ll_sb_info *sbi = ll_i2sbi(dir); + const int max_retry = atomic_read(&sbi->ll_dir_restore_max_retry_count); + int retry_count = 0; + u32 hus_states; + __u32 gen = 0; + int rc; + + /* Skip restore if server does not support or if disabled */ + if (!exp_mdll(sbi->ll_md_exp) || exp_bypass_mdll(sbi->ll_md_exp)) + return 0; + + /* + * TODO-MDLL: + * use API that does a cached read instead of + * going to the mdt for getting the hsm state. + * Tracked with Simba-21644 + */ +try_again: + rc = ll_get_hsm_state(dir, &hus_states); + if (rc == 0 && (hus_states & HS_RELEASED)) { + CDEBUG(D_HSM, + "MDLL Calling ll_layout_restore for dir "DFID" retry: %d" + "\n", PFID(ll_inode2fid(dir)), retry_count); + rc = ll_layout_restore(dir, 0, OBD_OBJECT_EOF); + if (rc) { + CERROR("MDLL ll_layout_restore ("DFID") error rc: %d\n", + PFID(ll_inode2fid(dir)), rc); + rc = -EAGAIN; + if (max_retry == 0) + goto out_exit; + } else { + CDEBUG(D_HSM, "MDLL Restore triggered for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + ll_layout_refresh(dir, &gen); + CDEBUG(D_HSM, "MDLL Restore done for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + } + /* If the max_retry is set to 0, then the behavior would be + * without a retry. There wont be any check for the hsm state + * after the completed restore. This case would be similar to + * the behaviour without this retry changes. 
The default + * value of the max_retry would be 1. + * A value of -1 would retry indefinitely. + */ + /* In case of an mdt restart, the ll_layout_refresh would + * return back only after the mdt has restarted and the + * existing network connection gets a reset. When the retry + * happens, the mdt would be up and running. + * Ideally the directory restore would be done with a single + * retry if the mdt does not crash/restart again. + */ + if ((max_retry < 0) || + (max_retry >= 0 && retry_count < max_retry)) { + retry_count++; + goto try_again; + } else if (max_retry > 0 && retry_count >= max_retry) { + rc = ll_get_hsm_state(dir, &hus_states); + if (rc == 0 && (hus_states & HS_RELEASED)) { + CDEBUG(D_HSM, + "MDLL reached max retry %d for ("DFID")" + "hsm_state: %d\n", + retry_count, PFID(ll_inode2fid(dir)), + hus_states); + rc = -EAGAIN; + goto out_exit; + } + } + } + if (rc != 0) { + CDEBUG(D_HSM, + "MDLL error calling ll_get_hsm_state for dir "DFID" rc: " + "%d\n", PFID(ll_inode2fid(dir)), rc); + rc = -EAGAIN; + } + +out_exit: + return rc; +} + +/* + * (new) readdir implementation overview. + * + * Original lustre readdir implementation cached exact copy of raw directory + * pages on the client. These pages were indexed in client page cache by + * logical offset in the directory file. This design, while very simple and + * intuitive had some inherent problems: + * + * . it implies that byte offset to the directory entry serves as a + * telldir(3)/seekdir(3) cookie, but that offset is not stable: in + * ext3/htree directory entries may move due to splits, and more + * importantly, + * + * . it is incompatible with the design of split directories for cmd3, + * that assumes that names are distributed across nodes based on their + * hash, and so readdir should be done in hash order. + * + * New readdir implementation does readdir in hash order, and uses hash of a + * file name as a telldir/seekdir cookie. This led to number of complications: + * + * . hash is not unique, so it cannot be used to index cached directory + * pages on the client (note, that it requires a whole pageful of hash + * collided entries to cause two pages to have identical hashes); + * + * . hash is not unique, so it cannot, strictly speaking, be used as an + * entry cookie. ext3/htree has the same problem and lustre implementation + * mimics their solution: seekdir(hash) positions directory at the first + * entry with the given hash. + * + * Client side. + * + * 0. caching + * + * Client caches directory pages using hash of the first entry as an index. As + * noted above hash is not unique, so this solution doesn't work as is: + * special processing is needed for "page hash chains" (i.e., sequences of + * pages filled with entries all having the same hash value). + * + * First, such chains have to be detected. To this end, server returns to the + * client the hash of the first entry on the page next to one returned. When + * client detects that this hash is the same as hash of the first entry on the + * returned page, page hash collision has to be handled. Pages in the + * hash chain, except first one, are termed "overflow pages". + * + * Solution to index uniqueness problem is to not cache overflow + * pages. Instead, when page hash collision is detected, all overflow pages + * from emerging chain are immediately requested from the server and placed in + * a special data structure (struct ll_dir_chain). This data structure is used + * by ll_readdir() to process entries from overflow pages. 
When readdir + * invocation finishes, overflow pages are discarded. If page hash collision + * chain weren't completely processed, next call to readdir will again detect + * page hash collision, again read overflow pages in, process next portion of + * entries and again discard the pages. This is not as wasteful as it looks, + * because, given reasonable hash, page hash collisions are extremely rare. + * + * 1. directory positioning + * + * When seekdir(hash) is called, original + * + * + * + * + * + * + * + * + * Server. + * + * identification of and access to overflow pages + * + * page format + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. See + * mdc_adjust_dirpages(). + * + */ +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset, struct ll_dir_chain *chain) +{ + struct md_callback cb_op; + struct page *page; + int rc; + + rc = ll_check_and_trigger_restore(dir); + if (rc != 0) + return ERR_PTR(rc); + + cb_op.md_blocking_ast = ll_md_blocking_ast; + rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page); + if (rc != 0) + return ERR_PTR(rc); + + return page; +} + +void ll_release_page(struct inode *inode, struct page *page, + bool remove) +{ + kunmap(page); + + /* Always remove the page for striped dir, because the page is + * built from temporarily in LMV layer */ + if (inode && ll_dir_striped(inode)) { + __free_page(page); + return; + } + + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + truncate_complete_page(page->mapping, page); + unlock_page(page); + } + put_page(page); +} + +/** + * return IF_* type for given lu_dirent entry. + * IF_* flag shld be converted to particular OS file type in + * platform llite module. 
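The readdir overview above makes the name hash the telldir/seekdir cookie: seekdir(hash) repositions at the first entry with that hash, and hash-colliding entries spill into overflow pages that are re-read rather than cached. A toy illustration of what those cookie semantics mean for a reader (plain C, illustrative data, not a Lustre API):

/* Positions are entry-name hashes, so resuming at a saved cookie lands on
 * the *first* entry with that hash and may replay entries that share it. */
#include <stdio.h>

struct dent { const char *name; unsigned long hash; };

/* toy directory, already in hash order as the MDS returns it */
static const struct dent dir[] = {
        { "a", 0x100 },
        { "b", 0x2a0 },
        { "c", 0x2a0 },         /* hash collision with "b" */
        { "d", 0x530 },
};

/* resume reading at a saved cookie: first entry whose hash >= pos,
 * same skip rule as the "if (hash < pos) continue" in ll_dir_read() */
static void list_from(unsigned long pos)
{
        for (size_t i = 0; i < sizeof(dir) / sizeof(dir[0]); i++)
                if (dir[i].hash >= pos)
                        printf("%s (cookie %#lx)\n", dir[i].name, dir[i].hash);
}

int main(void)
{
        list_from(0);           /* full listing: a b c d */
        list_from(0x2a0);       /* resume at saved cookie: b c d,
                                 * both colliding entries come back again */
        return 0;
}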
+ */ +static u16 ll_dirent_type_get(struct lu_dirent *ent) +{ + u16 type = 0; + struct luda_type *lt; + int len = 0; + + if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + + len = le16_to_cpu(ent->lde_namelen); + len = (len + align) & ~align; + lt = (void *)ent->lde_name + len; + type = IFTODT(le16_to_cpu(lt->lt_type)); + } + + return type; +} + +#ifdef HAVE_DIR_CONTEXT +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + struct dir_context *ctx) +{ +#else +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + void *cookie, filldir_t filldir) +{ +#endif + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = *ppos; + bool is_api32 = ll_need_32bit_api(sbi); + bool is_hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + struct page *page; + struct ll_dir_chain chain; + bool done = false; + int rc = 0; + ENTRY; + + ll_dir_chain_init(&chain); + + page = ll_get_dir_page(inode, op_data, pos, &chain); + + while (rc == 0 && !done) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + __u64 hash; + __u64 next; + + if (IS_ERR(page)) { + rc = PTR_ERR(page); + break; + } + + hash = MDS_DIR_END_OFF; + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL && !done; + ent = lu_dirent_next(ent)) { + __u16 type; + int namelen; + struct lu_fid fid; + __u64 lhash; + __u64 ino; + + hash = le64_to_cpu(ent->lde_hash); + if (hash < pos) /* Skip until we find target hash */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (namelen == 0) /* Skip dummy record */ + continue; + + if (is_api32 && is_hash64) + lhash = hash >> 32; + else + lhash = hash; + fid_le_to_cpu(&fid, &ent->lde_fid); + ino = cl_fid_build_ino(&fid, is_api32); + type = ll_dirent_type_get(ent); + /* For ll_nfs_get_name_filldir(), it will try to access + * 'ent' through 'lde_name', so the parameter 'name' + * for 'filldir()' must be part of the 'ent'. */ +#ifdef HAVE_DIR_CONTEXT + ctx->pos = lhash; + done = !dir_emit(ctx, ent->lde_name, namelen, ino, + type); +#else + done = filldir(cookie, ent->lde_name, namelen, lhash, + ino, type); +#endif + } + + if (done) { + pos = hash; + ll_release_page(inode, page, false); + break; + } + + next = le64_to_cpu(dp->ldp_hash_end); + pos = next; + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + done = 1; + ll_release_page(inode, page, false); + } else { + /* + * Normal case: continue to the next + * page. + */ + ll_release_page(inode, page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + next = pos; + page = ll_get_dir_page(inode, op_data, pos, + &chain); + } + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; +#else + *ppos = pos; +#endif + ll_dir_chain_fini(&chain); + RETURN(rc); +} + +#ifdef HAVE_DIR_CONTEXT +static int ll_iterate(struct file *filp, struct dir_context *ctx) +#else +static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) +#endif +{ + struct inode *inode = file_inode(filp); + struct ll_file_data *lfd = LUSTRE_FPRIVATE(filp); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + int api32 = ll_need_32bit_api(sbi); + struct md_op_data *op_data; + struct lu_fid pfid = { 0 }; + __u64 pos; + int rc; + ENTRY; + + if (lfd != NULL) + pos = lfd->lfd_pos; + else + pos = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) pos/size" + "%lu/%llu 32bit_api %d\n", PFID(ll_inode2fid(inode)), + inode, (unsigned long)pos, i_size_read(inode), api32); + + if (pos == MDS_DIR_END_OFF) + /* + * end-of-file. 
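ll_dir_read()/ll_readdir() above fold the 64-bit hash cookie down for callers of the 32-bit API (lhash = hash >> 32) and substitute a 32-bit end-of-directory sentinel. A small sketch of that folding; the constants are placeholders, not the tree's real LL_DIR_END_OFF values:

#include <stdint.h>
#include <stdio.h>

#define TOY_END_OFF        UINT64_C(0x7fffffffffffffff)  /* placeholder */
#define TOY_END_OFF_32BIT  UINT32_C(0x7fffffff)          /* placeholder */

static uint64_t pos_for_api(uint64_t hash, int api32, int hash64)
{
        if (hash == TOY_END_OFF)
                return api32 ? TOY_END_OFF_32BIT : TOY_END_OFF;
        if (api32 && hash64)
                return hash >> 32;      /* keep the high half as the cookie */
        return hash;
}

int main(void)
{
        printf("%#llx\n",
               (unsigned long long)pos_for_api(0x1234567890abcdefULL, 1, 1));
        printf("%#llx\n",
               (unsigned long long)pos_for_api(TOY_END_OFF, 1, 1));
        return 0;
}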
+ */ + GOTO(out, rc = 0); + + if (unlikely(ll_dir_striped(inode))) { + /* + * This is only needed for striped dir to fill .., + * see lmv_read_page() + */ + if (file_dentry(filp)->d_parent != NULL && + file_dentry(filp)->d_parent->d_inode != NULL) { + __u64 ibits = MDS_INODELOCK_LOOKUP; + struct inode *parent = + file_dentry(filp)->d_parent->d_inode; + + if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) + pfid = *ll_inode2fid(parent); + } + + /* If it can not find in cache, do lookup .. on the master + * object */ + if (fid_is_zero(&pfid)) { + rc = ll_dir_get_parent_fid(inode, &pfid); + if (rc != 0) + RETURN(rc); + } + } + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, inode); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + op_data->op_fid3 = pfid; + +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; + rc = ll_dir_read(inode, &pos, op_data, ctx); + pos = ctx->pos; +#else + rc = ll_dir_read(inode, &pos, op_data, cookie, filldir); +#endif + if (lfd != NULL) + lfd->lfd_pos = pos; + + if (pos == MDS_DIR_END_OFF) { + if (api32) + pos = LL_DIR_END_OFF_32BIT; + else + pos = LL_DIR_END_OFF; + } else { + if (api32 && hash64) + pos = pos >> 32; + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; +#else + filp->f_pos = pos; +#endif + ll_finish_md_op_data(op_data); + +out: + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1); + + RETURN(rc); +} + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +static int ll_send_mgc_param(struct obd_export *mgc, char *string) +{ + struct mgs_send_param *msp; + int rc = 0; + + OBD_ALLOC_PTR(msp); + if (!msp) + return -ENOMEM; + + strlcpy(msp->mgs_param, string, sizeof(msp->mgs_param)); + rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO, + sizeof(struct mgs_send_param), msp, NULL); + if (rc) + CERROR("Failed to set parameter: %d\n", rc); + OBD_FREE_PTR(msp); + + return rc; +} +#endif + +/** + * Create striped directory with specified stripe(@lump) + * + * \param[in] dparent the parent of the directory. + * \param[in] lump the specified stripes. + * \param[in] dirname the name of the directory. + * \param[in] mode the specified mode of the directory. + * + * \retval =0 if striped directory is being created successfully. + * <0 if the creation is failed. 
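ll_dir_setdirstripe() below builds a striped (DNE) directory from a struct lmv_user_md naming the stripe count, starting MDT and hash type, and refuses lum_stripe_count > 1 unless the MDS connection advertises OBD_CONNECT_DIR_STRIPE. A hedged stand-in for that descriptor, limited to the fields this code actually reads; the real layout and magic values live in the Lustre user headers and are not reproduced here:

#include <stdint.h>
#include <stdio.h>

#define TOY_LMV_USER_MAGIC 0x4C4D5655u  /* placeholder, not the real magic */

struct toy_lmv_user_md {
        uint32_t lum_magic;             /* LMV_USER_MAGIC / _SPECIFIC */
        uint32_t lum_stripe_count;      /* how many MDTs to stripe over */
        uint32_t lum_stripe_offset;     /* starting MDT index, (u32)-1 = any */
        uint32_t lum_hash_type;         /* name-hash function for placement */
};

int main(void)
{
        struct toy_lmv_user_md lum = {
                .lum_magic         = TOY_LMV_USER_MAGIC,
                .lum_stripe_count  = 4,
                .lum_stripe_offset = (uint32_t)-1,
                .lum_hash_type     = 0,
        };

        printf("stripe a new dir over %u MDTs, starting MDT %d\n",
               lum.lum_stripe_count, (int)lum.lum_stripe_offset);
        return 0;
}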
+ */ +static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, + size_t len, const char *dirname, umode_t mode) +{ + struct inode *parent = dparent->d_inode; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct ll_sb_info *sbi = ll_i2sbi(parent); + struct inode *inode = NULL; + struct dentry dentry = { + .d_parent = dparent, + .d_name = { + .name = dirname, + .len = strlen(dirname), + .hash = ll_full_name_hash(dparent, dirname, + strlen(dirname)), + }, + }; + int err; + ENTRY; + + if (unlikely(lump->lum_magic != LMV_USER_MAGIC && + lump->lum_magic != LMV_USER_MAGIC_SPECIFIC)) + RETURN(-EINVAL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) name %s " + "stripe_offset %d, stripe_count: %u\n", + PFID(ll_inode2fid(parent)), parent, dirname, + (int)lump->lum_stripe_offset, lump->lum_stripe_count); + + if (lump->lum_stripe_count > 1 && + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_DIR_STRIPE)) + RETURN(-EINVAL); + + if (IS_DEADDIR(parent) && + !OBD_FAIL_CHECK(OBD_FAIL_LLITE_NO_CHECK_DEAD)) + RETURN(-ENOENT); + + if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && + lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) + lustre_swab_lmv_user_md(lump); + + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + mode &= ~current_umask(); + mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; + op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname, + strlen(dirname), mode, LUSTRE_OPC_MKDIR, + lump); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_dir_depth = ll_i2info(parent)->lli_dir_depth; + + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { + /* selinux_dentry_init_security() uses dentry->d_parent and name + * to determine the security context for the file. So our fake + * dentry should be real enough for this purpose. */ + err = ll_dentry_init_security(&dentry, mode, &dentry.d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size); + if (err < 0) + GOTO(out_op_data, err); + } + + op_data->op_cli_flags |= CLI_SET_MEA; + err = md_create(sbi->ll_md_exp, op_data, lump, len, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + cfs_curproc_cap_pack(), 0, &request); + if (err) + GOTO(out_request, err); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE, cfs_fail_val); + + err = ll_prep_inode(&inode, request, parent->i_sb, NULL); + if (err) + GOTO(out_inode, err); + + dentry.d_inode = inode; + + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. 
Taking it would lead to a client deadlock + * LU-13617 + */ + err = security_inode_notifysecctx(inode, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + } else { + err = ll_inode_init_security(&dentry, inode, parent); + } + if (err) + GOTO(out_inode, err); + +out_inode: + if (inode != NULL) + iput(inode); +out_request: + ptlrpc_req_finished(request); +out_op_data: + ll_finish_md_op_data(op_data); + + return err; +} + +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct obd_device *mgc = lsi->lsi_mgc; +#endif + int lum_size; + ENTRY; + + if (lump != NULL) { + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: + lum_size = sizeof(struct lov_user_md_v1); + break; + case LOV_USER_MAGIC_V3: + lum_size = sizeof(struct lov_user_md_v3); + break; + case LOV_USER_MAGIC_COMP_V1: + lum_size = ((struct lov_comp_md_v1 *)lump)->lcm_size; + break; + case LMV_USER_MAGIC: + if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md( + (struct lmv_user_md *)lump); + lum_size = sizeof(struct lmv_user_md); + break; + case LOV_USER_MAGIC_SPECIFIC: { + struct lov_user_md_v3 *v3 = + (struct lov_user_md_v3 *)lump; + if (v3->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + RETURN(-EINVAL); + lum_size = lov_user_md_size(v3->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + break; + } + default: + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + RETURN(-EINVAL); + } + + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. + */ + if ((__swab32(lump->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md(lump, 0); + } else { + lum_size = sizeof(struct lov_user_md_v1); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* swabbing is done in lov_setstripe() on server side */ + rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) + /* + * 2.9 server has stored filesystem default stripe in ROOT xattr, + * and it's stored into system config for backward compatibility. + * + * In the following we use the fact that LOV_USER_MAGIC_V1 and + * LOV_USER_MAGIC_V3 have the same initial fields so we do not + * need the make the distiction between the 2 versions + */ + if (set_default && mgc->u.cli.cl_mgc_mgsexp && + (lump == NULL || + le32_to_cpu(lump->lmm_magic) == LOV_USER_MAGIC_V1 || + le32_to_cpu(lump->lmm_magic) == LOV_USER_MAGIC_V3)) { + char *param = NULL; + char *buf; + + OBD_ALLOC(param, MGS_PARAM_MAXLEN); + if (param == NULL) + GOTO(end, rc = -ENOMEM); + + buf = param; + /* Get fsname and assume devname to be -MDT0000. */ + ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN); + strcat(buf, "-MDT0000.lov"); + buf += strlen(buf); + + /* Set root stripesize */ + snprintf(buf, MGS_PARAM_MAXLEN, ".stripesize=%u", + lump ? 
le32_to_cpu(lump->lmm_stripe_size) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + GOTO(end, rc); + + /* Set root stripecount */ + snprintf(buf, MGS_PARAM_MAXLEN, ".stripecount=%hd", + lump ? le16_to_cpu(lump->lmm_stripe_count) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + GOTO(end, rc); + + /* Set root stripeoffset */ + snprintf(buf, MGS_PARAM_MAXLEN, ".stripeoffset=%hd", + lump ? le16_to_cpu(lump->lmm_stripe_offset) : + (typeof(lump->lmm_stripe_offset))(-1)); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + +end: + if (param != NULL) + OBD_FREE(param, MGS_PARAM_MAXLEN); + } +#endif + RETURN(rc); +} + +int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + int lmm_size = OBD_MAX_DEFAULT_EA_SIZE; + struct md_op_data *op_data; + struct lu_fid fid; + int rc; + + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, lmm_size, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = valid | OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + + if (type == GET_DEFAULT_LAYOUT_ROOT) { + lu_root_fid(&op_data->op_fid1); + fid = op_data->op_fid1; + } else { + fid = *ll_inode2fid(inode); + } + + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr failed on inode "DFID": rc %d\n", + PFID(&fid), rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + lmm_size = body->mbo_eadatasize; + + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmm_size == 0) { + GOTO(out, rc = -ENODATA); + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, + &RMF_MDT_MD, lmm_size); + LASSERT(lmm != NULL); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + /* We don't swab objects for directories */ + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + case LOV_MAGIC_V3: + case LOV_MAGIC_COMP_V1: + case LOV_USER_MAGIC_SPECIFIC: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + break; + case LMV_MAGIC_V1: + if (LMV_MAGIC != cpu_to_le32(LMV_MAGIC)) + lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm); + break; + case LMV_USER_MAGIC: + if (LMV_USER_MAGIC != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md((struct lmv_user_md *)lmm); + break; + default: + CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); + rc = -EPROTO; + } +out: + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + return rc; +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve. + * If the directory does not have its own default layout, then the + * function will request the default layout from root FID. + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. 
+ * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe_default(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, + struct ptlrpc_request **root_request, + u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + if (rc == -ENODATA && !fid_is_root(ll_inode2fid(inode)) && + !(valid & OBD_MD_MEA) && root_request != NULL) { + int rc2 = ll_dir_get_default_layout(inode, (void **)&lmm, + &lmm_size, &root_req, valid, + GET_DEFAULT_LAYOUT_ROOT); + if (rc2 == 0) + rc = 0; + } + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + if (root_request != NULL) + *root_request = root_req; + + RETURN(rc); +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + + RETURN(rc); +} + +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) +{ + struct md_op_data *op_data; + int rc; + int mdt_index; + ENTRY; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + RETURN(-ENOMEM); + + op_data->op_flags |= MF_GET_MDT_IDX; + op_data->op_fid1 = *fid; + rc = md_getattr(sbi->ll_md_exp, op_data, NULL); + mdt_index = op_data->op_mds; + OBD_FREE_PTR(op_data); + if (rc < 0) + RETURN(rc); + + RETURN(mdt_index); +} + +/* + * Get MDT index for the inode. + */ +int ll_get_mdt_idx(struct inode *inode) +{ + return ll_get_mdt_idx_by_fid(ll_i2sbi(inode), ll_inode2fid(inode)); +} + +/** + * Generic handler to do any pre-copy work. + * + * It sends a first hsm_progress (with extent length == 0) to coordinator as a + * first information for it that real work has started. + * + * Moreover, for a ARCHIVE request, it will sample the file data version and + * store it in \a copy. + * + * \return 0 on success. + */ +static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc = 0; + int rc2; + ENTRY; + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; + hpk.hpk_extent.length = 0; + hpk.hpk_flags = 0; + hpk.hpk_errval = 0; + hpk.hpk_data_version = 0; + + + /* For archive request, we need to read the current file version. 
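ll_ioc_copy_start() below samples the file data version before an ARCHIVE copy, and ll_ioc_copy_end() further down re-samples it and fails the request with -EBUSY when the two differ, so a file modified mid-archive is never recorded as archived. A compact sketch of that start/end handshake with toy names and version numbers (illustrative, not the driver's API):

#include <stdint.h>
#include <stdio.h>

struct toy_copy {
        uint64_t start_version;         /* sampled at copy start */
};

static void copy_start(struct toy_copy *c, uint64_t current_version)
{
        c->start_version = current_version;     /* remember pre-copy state */
}

/* returns 0 if the archive may be committed, -1 ("EBUSY") otherwise */
static int copy_end(const struct toy_copy *c, uint64_t current_version)
{
        if (current_version != c->start_version)
                return -1;              /* file changed during archiving */
        return 0;
}

int main(void)
{
        struct toy_copy c;

        copy_start(&c, 42);
        printf("unchanged file: %d\n", copy_end(&c, 42));       /* 0  */
        printf("modified file:  %d\n", copy_end(&c, 43));       /* -1 */
        return 0;
}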
*/ + if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { + struct inode *inode; + __u64 data_version = 0; + + /* Get inode for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval is >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + /* Read current file data version */ + rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); + iput(inode); + if (rc != 0) { + CDEBUG(D_HSM, "Could not read file data version of " + DFID" (rc = %d). Archive request (" + "%#llx) could not be done.\n", + PFID(©->hc_hai.hai_fid), rc, + copy->hc_hai.hai_cookie); + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store in the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + copy->hc_data_version = data_version; + + } else if (copy->hc_hai.hai_action == HSMA_IMPORT) { + + /* IMPORT sends its progress using alloc fid when possible */ + hpk.hpk_fid = copy->hc_hai.hai_dfid; + } + +progress: + /* On error, the request should be considered as completed */ + if (hpk.hpk_errval > 0) + hpk.hpk_flags |= HP_FLAG_COMPLETED; + + rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + /* Return first error */ + RETURN(rc != 0 ? rc : rc2); +} + +/** + * Generic handler to do any post-copy work. + * + * It will send the last hsm_progress update to coordinator to inform it + * that copy is finished and whether it was successful or not. + * + * Moreover, + * - for ARCHIVE request, it will sample the file data version and compare it + * with the version saved in ll_ioc_copy_start(). If they do not match, copy + * will be considered as failed. + * - for RESTORE request, it will sample the file data version and send it to + * coordinator which is useful if the file was imported as 'released'. + * + * \return 0 on success. + */ +static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc = 0; + int rc2; + ENTRY; + + /* If you modify the logic here, also check llapi_hsm_copy_end(). */ + /* Take care: copy->hc_hai.hai_action, len, gid and data are not + * initialized if copy_end was called with copy == NULL. + */ + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent = copy->hc_hai.hai_extent; + hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; + hpk.hpk_errval = copy->hc_errval; + hpk.hpk_data_version = 0; + + /* For archive request, we need to check the file data was not changed. + * + * For restore request, we need to send the file data version, this is + * useful when the file was created using hsm_import. + */ + if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || + (copy->hc_hai.hai_action == HSMA_RESTORE)) && + (copy->hc_errval == 0)) { + struct inode *inode; + __u64 data_version = 0; + + /* Get lsm for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); + iput(inode); + if (rc) { + CDEBUG(D_HSM, "Could not read file data version. 
" + "Request could not be confirmed.\n"); + if (hpk.hpk_errval == 0) + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store in the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + hpk.hpk_data_version = data_version; + + /* File could have been stripped during archiving, so we need + * to check anyway. */ + if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && + (copy->hc_data_version != data_version)) { + CDEBUG(D_HSM, "File data version mismatched. " + "File content was changed during archiving. " + DFID", start:%#llx current:%#llx\n", + PFID(©->hc_hai.hai_fid), + copy->hc_data_version, data_version); + /* File was changed, send error to cdt. Do not ask for + * retry because if a file is modified frequently, + * the cdt will loop on retried archive requests. + * The policy engine will ask for a new archive later + * when the file will not be modified for some tunable + * time */ + hpk.hpk_flags &= ~HP_FLAG_RETRY; + rc = -EBUSY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + } else if (copy->hc_hai.hai_action == HSMA_IMPORT) { + + /* IMPORT sends its progress using alloc fid when possible */ + hpk.hpk_fid = copy->hc_hai.hai_dfid; + } + +progress: + rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + /* Return first error */ + RETURN(rc != 0 ? rc : rc2); +} + + +static int copy_and_ct_start(int cmd, struct obd_export *exp, + const struct lustre_kernelcomm __user *data) +{ + struct lustre_kernelcomm *lk; + struct lustre_kernelcomm *tmp; + size_t size = sizeof(*lk); + size_t new_size; + int i; + int rc; + + /* copy data from userspace to get numbers of archive_id */ + OBD_ALLOC(lk, size); + if (lk == NULL) + return -ENOMEM; + + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + + if (lk->lk_flags & LK_FLG_STOP) + goto do_ioctl; + + if (!(lk->lk_flags & LK_FLG_DATANR)) { + __u32 archive_mask = lk->lk_data_count; + int count; + + /* old hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) + goto do_ioctl; + + /* old hsm agent to new MDS */ + lk->lk_flags |= LK_FLG_DATANR; + + if (archive_mask == 0) + goto do_ioctl; + + count = hweight32(archive_mask); + new_size = offsetof(struct lustre_kernelcomm, lk_data[count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + memcpy(tmp, lk, size); + tmp->lk_data_count = count; + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + count = 0; + for (i = 0; i < sizeof(archive_mask) * 8; i++) { + if ((1 << i) & archive_mask) { + lk->lk_data[count] = i + 1; + count++; + } + } + goto do_ioctl; + } + + /* new hsm agent to new mds */ + if (lk->lk_data_count > 0) { + new_size = offsetof(struct lustre_kernelcomm, + lk_data[lk->lk_data_count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + } + + /* new hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) { + __u32 archives = 0; + + if (lk->lk_data_count > LL_HSM_ORIGIN_MAX_ARCHIVE) + GOTO(out_lk, rc = -EINVAL); + + for (i = 0; i < lk->lk_data_count; i++) { + if (lk->lk_data[i] > LL_HSM_ORIGIN_MAX_ARCHIVE) { + rc = -EINVAL; + CERROR("%s: archive id %d requested but only " + "[0 - %zu] supported: rc = %d\n", + exp->exp_obd->obd_name, lk->lk_data[i], + LL_HSM_ORIGIN_MAX_ARCHIVE, rc); + GOTO(out_lk, rc); + } + + if (lk->lk_data[i] == 0) { + archives = 0; + break; + } + + archives |= (1 
<< (lk->lk_data[i] - 1)); + } + lk->lk_flags &= ~LK_FLG_DATANR; + lk->lk_data_count = archives; + } +do_ioctl: + rc = obd_iocontrol(cmd, exp, size, lk, NULL); +out_lk: + OBD_FREE(lk, size); + return rc; +} + +static int check_owner(int type, int id) +{ + switch (type) { + case USRQUOTA: + if (!uid_eq(current_euid(), make_kuid(&init_user_ns, id))) + return -EPERM; + break; + case GRPQUOTA: + if (!in_egroup_p(make_kgid(&init_user_ns, id))) + return -EPERM; + break; + case PRJQUOTA: + break; + } + return 0; +} + +static int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + ENTRY; + + switch (cmd) { + case Q_SETQUOTA: + case Q_SETINFO: + case LUSTRE_Q_SETDEFAULT: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (sb->s_flags & SB_RDONLY) + RETURN(-EROFS); + break; + case Q_GETQUOTA: + case LUSTRE_Q_GETDEFAULT: + if (check_owner(type, id) && + (!cfs_capable(CFS_CAP_SYS_ADMIN))) + RETURN(-EPERM); + break; + case Q_GETINFO: + break; + default: + CERROR("unsupported quotactl op: %#x\n", cmd); + RETURN(-ENOTSUPP); + } + + if (valid != QC_GENERAL) { + if (cmd == Q_GETINFO) + qctl->qc_cmd = Q_GETOINFO; + else if (cmd == Q_GETQUOTA) + qctl->qc_cmd = Q_GETOQUOTA; + else + RETURN(-EINVAL); + + switch (valid) { + case QC_MDTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_OSTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_UUID: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + if (rc == -EAGAIN) + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + default: + rc = -EINVAL; + break; + } + + if (rc) + RETURN(rc); + + qctl->qc_cmd = cmd; + } else { + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) { + OBD_FREE_PTR(oqctl); + RETURN(rc); + } + /* If QIF_SPACE is not set, client should collect the + * space usage from OSSs by itself */ + if (cmd == Q_GETQUOTA && + !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && + !oqctl->qc_dqblk.dqb_curspace) { + struct obd_quotactl *oqctl_tmp; + + OBD_ALLOC_PTR(oqctl_tmp); + if (oqctl_tmp == NULL) + GOTO(out, rc = -ENOMEM); + + oqctl_tmp->qc_cmd = Q_GETOQUOTA; + oqctl_tmp->qc_id = oqctl->qc_id; + oqctl_tmp->qc_type = oqctl->qc_type; + + /* collect space usage from OSTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace = + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; + } + + /* collect space & inode usage from MDTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + oqctl_tmp->qc_dqblk.dqb_curinodes = 0; + rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace += + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_curinodes = + oqctl_tmp->qc_dqblk.dqb_curinodes; + oqctl->qc_dqblk.dqb_valid |= QIF_INODES; + } else { + oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; + } + + OBD_FREE_PTR(oqctl_tmp); + } +out: + QCTL_COPY(qctl, oqctl); + OBD_FREE_PTR(oqctl); + } + + RETURN(rc); +} + +int ll_rmfid(struct file *file, void __user *arg) +{ + const struct fid_array __user *ufa = arg; + struct 
fid_array *lfa = NULL; + size_t size; + unsigned nr; + int i, rc, *rcs = NULL; + ENTRY; + + if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(file_inode(file))->ll_flags & LL_SBI_USER_FID2PATH)) + RETURN(-EPERM); + /* Only need to get the buflen */ + if (get_user(nr, &ufa->fa_nr)) + RETURN(-EFAULT); + /* DoS protection */ + if (nr > OBD_MAX_FIDS_IN_ARRAY) + RETURN(-E2BIG); + + size = offsetof(struct fid_array, fa_fids[nr]); + OBD_ALLOC(lfa, size); + if (!lfa) + RETURN(-ENOMEM); + OBD_ALLOC(rcs, sizeof(int) * nr); + if (!rcs) + GOTO(free_lfa, rc = -ENOMEM); + + if (copy_from_user(lfa, arg, size)) + GOTO(free_rcs, rc = -EFAULT); + + /* Call mdc_iocontrol */ + rc = md_rmfid(ll_i2mdexp(file_inode(file)), lfa, rcs, NULL); + if (!rc) { + for (i = 0; i < nr; i++) + if (rcs[i]) + lfa->fa_fids[i].f_ver = rcs[i]; + if (copy_to_user(arg, lfa, size)) + rc = -EFAULT; + } + +free_rcs: + OBD_FREE(rcs, sizeof(int) * nr); +free_lfa: + OBD_FREE(lfa, size); + + RETURN(rc); +} + +/* This function tries to get a single name component, + * to send to the server. No actual path traversal involved, + * so we limit to NAME_MAX */ +static char *ll_getname(const char __user *filename) +{ + int ret = 0, len; + char *tmp; + + OBD_ALLOC(tmp, NAME_MAX + 1); + + if (!tmp) + return ERR_PTR(-ENOMEM); + + len = strncpy_from_user(tmp, filename, NAME_MAX + 1); + if (len < 0) + ret = -ENOENT; + else if (len > NAME_MAX) + ret = -ENAMETOOLONG; + + if (ret) { + OBD_FREE(tmp, NAME_MAX + 1); + tmp = ERR_PTR(ret); + } + return tmp; +} + +#define ll_putname(filename) OBD_FREE(filename, NAME_MAX + 1); + +static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%#x\n", + PFID(ll_inode2fid(inode)), inode, cmd); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch (cmd) { + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
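ll_rmfid() above takes a struct fid_array from user space (fa_nr followed by fa_fids[]), caps it at OBD_MAX_FIDS_IN_ARRAY, and hands per-FID results back in each entry's f_ver field. A userspace-flavoured sketch of preparing such a request; the struct layouts are simplified stand-ins inferred from the code above, and the real definitions plus the LL_IOC_RMFID ioctl number come from the Lustre user headers:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_fid {
        uint64_t f_seq;
        uint32_t f_oid;
        uint32_t f_ver;         /* reused as the per-FID result on return */
};

struct toy_fid_array {
        uint32_t fa_nr;
        struct toy_fid fa_fids[];
};

int main(void)
{
        uint32_t nr = 2;
        size_t size = sizeof(struct toy_fid_array) +
                      nr * sizeof(struct toy_fid);
        struct toy_fid_array *fa = calloc(1, size);

        if (!fa)
                return 1;
        fa->fa_nr = nr;
        /* fa->fa_fids[0..1] would be filled with the FIDs to unlink, then
         * the whole buffer passed to ioctl(dirfd, LL_IOC_RMFID, fa); on
         * return each fa_fids[i].f_ver holds that FID's status. */
        printf("request buffer: %zu bytes for %u fids\n", size, fa->fa_nr);
        free(fa);
        return 0;
}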
*/ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int __user *)arg)) + RETURN(-EFAULT); + + return 0; + } + case IOC_MDC_LOOKUP: { + int namelen, len = 0; + char *buf = NULL; + char *filename; + + rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); + if (rc != 0) + RETURN(rc); + data = (void *)buf; + + filename = data->ioc_inlbuf1; + namelen = strlen(filename); + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(out_free, rc = -EINVAL); + } + + rc = ll_get_fid_by_name(inode, filename, namelen, NULL, NULL); + if (rc < 0) { + CERROR("%s: lookup %.*s failed: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), namelen, + filename, rc); + GOTO(out_free, rc); + } +out_free: + OBD_FREE_LARGE(buf, len); + return rc; + } + case LL_IOC_LMV_SETSTRIPE: { + struct lmv_user_md *lum; + char *buf = NULL; + char *filename; + int namelen = 0; + int lumlen = 0; + umode_t mode; + int len; + int rc; + + rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); + if (rc) + RETURN(rc); + + data = (void *)buf; + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) + GOTO(lmv_out_free, rc = -EINVAL); + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(lmv_out_free, rc = -EINVAL); + } + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + lumlen = data->ioc_inllen2; + + if ((lum->lum_magic != LMV_USER_MAGIC && + lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) || + lumlen < sizeof(*lum)) { + CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", + filename, lum->lum_magic, lumlen, -EFAULT); + GOTO(lmv_out_free, rc = -EINVAL); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 50, 0) + mode = data->ioc_type != 0 ? 
data->ioc_type : S_IRWXUGO; +#else + mode = data->ioc_type; +#endif + rc = ll_dir_setdirstripe(dentry, lum, lumlen, filename, mode); +lmv_out_free: + OBD_FREE_LARGE(buf, len); + RETURN(rc); + + } + case LL_IOC_LMV_SET_DEFAULT_STRIPE: { + struct lmv_user_md lum; + struct lmv_user_md __user *ulump = + (struct lmv_user_md __user *)arg; + int rc; + + if (copy_from_user(&lum, ulump, sizeof(lum))) + RETURN(-EFAULT); + + if (lum.lum_magic != LMV_USER_MAGIC) + RETURN(-EINVAL); + + rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0); + + RETURN(rc); + } + case LL_IOC_LOV_SETSTRIPE_NEW: + case LL_IOC_LOV_SETSTRIPE: { + struct lov_user_md_v3 *lumv3 = NULL; + struct lov_user_md_v1 lumv1; + struct lov_user_md_v1 *lumv1_ptr = &lumv1; + struct lov_user_md_v1 __user *lumv1p = + (struct lov_user_md_v1 __user *)arg; + struct lov_user_md_v3 __user *lumv3p = + (struct lov_user_md_v3 __user *)arg; + int lum_size = 0; + + int set_default = 0; + + CLASSERT(sizeof(struct lov_user_md_v3) > + sizeof(struct lov_comp_md_v1)); + CLASSERT(sizeof(*lumv3) == sizeof(*lumv3p)); + /* first try with v1 which is smaller than v3 */ + if (copy_from_user(&lumv1, lumv1p, sizeof(lumv1))) + RETURN(-EFAULT); + + if (is_root_inode(inode)) + set_default = 1; + + switch (lumv1.lmm_magic) { + case LOV_USER_MAGIC_V3: + case LOV_USER_MAGIC_SPECIFIC: + lum_size = ll_lov_user_md_size(&lumv1); + if (lum_size < 0) + RETURN(lum_size); + OBD_ALLOC(lumv3, lum_size); + if (!lumv3) + RETURN(-ENOMEM); + if (copy_from_user(lumv3, lumv3p, lum_size)) + GOTO(out, rc = -EFAULT); + lumv1_ptr = (struct lov_user_md_v1 *)lumv3; + break; + case LOV_USER_MAGIC_V1: + break; + default: + GOTO(out, rc = -ENOTSUPP); + } + + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1_ptr, set_default); +out: + if (lumv3) + OBD_FREE(lumv3, lum_size); + RETURN(rc); + } + case LL_IOC_LMV_GETSTRIPE: { + struct lmv_user_md __user *ulmv = + (struct lmv_user_md __user *)arg; + struct lmv_user_md lum; + struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; + union lmv_mds_md *lmm = NULL; + int lmmsize; + u64 valid = 0; + struct lmv_user_md *tmp = NULL; + int mdt_index; + int lum_size; + int stripe_count; + int max_stripe_count; + int i; + int rc; + + if (copy_from_user(&lum, ulmv, sizeof(*ulmv))) + RETURN(-EFAULT); + + max_stripe_count = lum.lum_stripe_count; + /* lum_magic will indicate which stripe the ioctl will like + * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC + * is for default LMV stripe */ + if (lum.lum_magic == LMV_MAGIC_V1) + valid |= OBD_MD_MEA; + else if (lum.lum_magic == LMV_USER_MAGIC) + valid |= OBD_MD_DEFAULT_MEA; + else + RETURN(-EINVAL); + + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmmsize, + &request, &root_request, valid); + if (rc != 0) + GOTO(finish_req, rc); + + /* Get default LMV EA */ + if (lum.lum_magic == LMV_USER_MAGIC) { + if (lmmsize > sizeof(*ulmv)) + GOTO(finish_req, rc = -EINVAL); + + if (root_request != NULL) { + struct lmv_user_md *lum; + struct ll_inode_info *lli; + + lum = (struct lmv_user_md *)lmm; + lli = ll_i2info(inode); + if (lum->lum_max_inherit != + LMV_INHERIT_UNLIMITED) { + if (lum->lum_max_inherit == + LMV_INHERIT_NONE || + lum->lum_max_inherit < + LMV_INHERIT_END || + lum->lum_max_inherit > + LMV_INHERIT_MAX || + lum->lum_max_inherit < + lli->lli_dir_depth) + GOTO(finish_req, rc = -ENODATA); + + if (lum->lum_max_inherit == + lli->lli_dir_depth) { + lum->lum_max_inherit = + LMV_INHERIT_NONE; + lum->lum_max_inherit_rr = + 
LMV_INHERIT_RR_NONE; + goto out_copy; + } + + lum->lum_max_inherit -= + lli->lli_dir_depth; + } + + if (lum->lum_max_inherit_rr != + LMV_INHERIT_RR_UNLIMITED) { + if (lum->lum_max_inherit_rr == + LMV_INHERIT_NONE || + lum->lum_max_inherit_rr < + LMV_INHERIT_RR_END || + lum->lum_max_inherit_rr > + LMV_INHERIT_RR_MAX || + lum->lum_max_inherit_rr <= + lli->lli_dir_depth) { + lum->lum_max_inherit_rr = + LMV_INHERIT_RR_NONE; + goto out_copy; + } + + if (lum->lum_max_inherit_rr > + lli->lli_dir_depth) + lum->lum_max_inherit_rr -= + lli->lli_dir_depth; + } + } +out_copy: + if (copy_to_user(ulmv, lmm, lmmsize)) + GOTO(finish_req, rc = -EFAULT); + + GOTO(finish_req, rc); + } + + stripe_count = lmv_mds_md_stripe_count_get(lmm); + if (max_stripe_count < stripe_count) { + lum.lum_stripe_count = stripe_count; + if (copy_to_user(ulmv, &lum, sizeof(lum))) + GOTO(finish_req, rc = -EFAULT); + GOTO(finish_req, rc = -E2BIG); + } + + lum_size = lmv_user_md_size(stripe_count, + LMV_USER_MAGIC_SPECIFIC); + OBD_ALLOC(tmp, lum_size); + if (tmp == NULL) + GOTO(finish_req, rc = -ENOMEM); + + mdt_index = ll_get_mdt_idx(inode); + if (mdt_index < 0) + GOTO(out_tmp, rc = -ENOMEM); + + tmp->lum_magic = LMV_MAGIC_V1; + tmp->lum_stripe_count = 0; + tmp->lum_stripe_offset = mdt_index; + tmp->lum_hash_type = lmv_mds_md_hash_type_get(lmm); + for (i = 0; i < stripe_count; i++) { + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); + if (fid_is_sane(&fid)) { + mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); + if (mdt_index < 0) + GOTO(out_tmp, rc = mdt_index); + + tmp->lum_objects[i].lum_mds = mdt_index; + tmp->lum_objects[i].lum_fid = fid; + } + + tmp->lum_stripe_count++; + } + + if (copy_to_user(ulmv, tmp, lum_size)) + GOTO(out_tmp, rc = -EFAULT); +out_tmp: + OBD_FREE(tmp, lum_size); +finish_req: + ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); + return rc; + } + + case LL_IOC_REMOVE_ENTRY: { + char *filename = NULL; + int namelen = 0; + int rc; + + /* Here is a little hack to avoid sending REINT_RMENTRY to + * unsupported server, which might crash the server(LU-2730), + * Because both LVB_TYPE and REINT_RMENTRY will be supported + * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the + * server will support REINT_RMENTRY XXX*/ + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE)) + RETURN(-ENOTSUPP); + + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + namelen = strlen(filename); + if (namelen < 1) + GOTO(out_rmdir, rc = -EINVAL); + + rc = ll_rmdir_entry(inode, filename, namelen); +out_rmdir: + if (filename) + ll_putname(filename); + RETURN(rc); + } + case LL_IOC_RMFID: + RETURN(ll_rmfid(file, (void __user *)arg)); + case LL_IOC_LOV_SWAP_LAYOUTS: + RETURN(-EPERM); + case IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void __user *)arg)); + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_LOV_GETSTRIPE_NEW: + case LL_IOC_MDC_GETINFO: + case LL_IOC_MDC_GETINFO_OLD: + case IOC_MDC_GETFILEINFO: + case IOC_MDC_GETFILEINFO_OLD: + case IOC_MDC_GETFILESTRIPE: { + struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; + struct lov_user_md __user *lump; + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + lstat_t __user *statp = NULL; + lstatx_t __user *stxp = NULL; + __u64 __user *flagsp = NULL; + __u32 __user *lmmsizep = NULL; + struct lu_fid __user *fidp = NULL; + int lmmsize; + + if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == IOC_MDC_GETFILEINFO || + cmd 
== IOC_MDC_GETFILESTRIPE) { + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); + } else { + rc = ll_dir_getstripe_default(inode, (void **)&lmm, + &lmmsize, &request, + &root_request, 0); + } + + if (request) { + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + GOTO(out_req, rc); + } + + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO || + cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD)) { + lmmsize = 0; + rc = 0; + } + + if (rc < 0) + GOTO(out_req, rc); + + if (cmd == IOC_MDC_GETFILESTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE_NEW) { + lump = (struct lov_user_md __user *)arg; + } else if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD){ + struct lov_user_mds_data_v1 __user *lmdp; + + lmdp = (struct lov_user_mds_data_v1 __user *)arg; + statp = &lmdp->lmd_st; + lump = &lmdp->lmd_lmm; + } else { + struct lov_user_mds_data __user *lmdp; + + lmdp = (struct lov_user_mds_data __user *)arg; + fidp = &lmdp->lmd_fid; + stxp = &lmdp->lmd_stx; + flagsp = &lmdp->lmd_flags; + lmmsizep = &lmdp->lmd_lmmsize; + lump = &lmdp->lmd_lmm; + } + + if (lmmsize == 0) { + /* If the file has no striping then zero out *lump so + * that the caller isn't confused by garbage. */ + if (clear_user(lump, sizeof(*lump))) + GOTO(out_req, rc = -EFAULT); + } else if (copy_to_user(lump, lmm, lmmsize)) { + if (copy_to_user(lump, lmm, sizeof(*lump))) + GOTO(out_req, rc = -EFAULT); + rc = -EOVERFLOW; + } + + if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD) { + lstat_t st = { 0 }; + + st.st_dev = inode->i_sb->s_dev; + st.st_mode = body->mbo_mode; + st.st_nlink = body->mbo_nlink; + st.st_uid = body->mbo_uid; + st.st_gid = body->mbo_gid; + st.st_rdev = body->mbo_rdev; + st.st_size = body->mbo_size; + st.st_blksize = PAGE_SIZE; + st.st_blocks = body->mbo_blocks; + st.st_atime = body->mbo_atime; + st.st_mtime = body->mbo_mtime; + st.st_ctime = body->mbo_ctime; + st.st_ino = cl_fid_build_ino(&body->mbo_fid1, + sbi->ll_flags & + LL_SBI_32BIT_API); + + if (copy_to_user(statp, &st, sizeof(st))) + GOTO(out_req, rc = -EFAULT); + } else if (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO) { + lstatx_t stx = { 0 }; + __u64 valid = body->mbo_valid; + + stx.stx_blksize = PAGE_SIZE; + stx.stx_nlink = body->mbo_nlink; + stx.stx_uid = body->mbo_uid; + stx.stx_gid = body->mbo_gid; + stx.stx_mode = body->mbo_mode; + stx.stx_ino = cl_fid_build_ino(&body->mbo_fid1, + sbi->ll_flags & + LL_SBI_32BIT_API); + stx.stx_size = body->mbo_size; + stx.stx_blocks = body->mbo_blocks; + stx.stx_atime.tv_sec = body->mbo_atime; + stx.stx_ctime.tv_sec = body->mbo_ctime; + stx.stx_mtime.tv_sec = body->mbo_mtime; + stx.stx_rdev_major = MAJOR(body->mbo_rdev); + stx.stx_rdev_minor = MINOR(body->mbo_rdev); + stx.stx_dev_major = MAJOR(inode->i_sb->s_dev); + stx.stx_dev_minor = MINOR(inode->i_sb->s_dev); + stx.stx_mask |= STATX_BASIC_STATS; + + /* + * For a striped directory, the size and blocks returned + * from MDT is not correct. + * The size and blocks are aggregated by client across + * all stripes. + * Thus for a striped directory, do not return the valid + * FLSIZE and FLBLOCKS flags to the caller. + * However, this whould be better decided by the MDS + * instead of the client. 
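For striped directories the GETINFO path below drops OBD_MD_FLSIZE/OBD_MD_FLBLOCKS and clears STATX_SIZE/STATX_BLOCKS from stx_mask, so callers must consult the mask before trusting those fields. A minimal consumer-side sketch; the mask bits follow statx(2), while the lstatx_t wrapper used by the ioctl itself is Lustre-specific:

#include <stdint.h>
#include <stdio.h>

#define TOY_STATX_SIZE   0x00000200u    /* statx(2) STATX_SIZE bit */
#define TOY_STATX_BLOCKS 0x00000400u    /* statx(2) STATX_BLOCKS bit */

struct toy_stx {
        uint32_t stx_mask;
        uint64_t stx_size;
        uint64_t stx_blocks;
};

static void report(const struct toy_stx *stx)
{
        if (stx->stx_mask & TOY_STATX_SIZE)
                printf("size:   %llu\n", (unsigned long long)stx->stx_size);
        else
                printf("size:   not provided (striped dir?)\n");

        if (stx->stx_mask & TOY_STATX_BLOCKS)
                printf("blocks: %llu\n", (unsigned long long)stx->stx_blocks);
        else
                printf("blocks: not provided (striped dir?)\n");
}

int main(void)
{
        struct toy_stx stx = { .stx_mask = TOY_STATX_SIZE, .stx_size = 4096 };

        report(&stx);
        return 0;
}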
+ */ + if (cmd == LL_IOC_MDC_GETINFO && + ll_i2info(inode)->lli_lsm_md != NULL) + valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + + if (flagsp && copy_to_user(flagsp, &valid, + sizeof(*flagsp))) + GOTO(out_req, rc = -EFAULT); + + if (fidp && copy_to_user(fidp, &body->mbo_fid1, + sizeof(*fidp))) + GOTO(out_req, rc = -EFAULT); + + if (!(valid & OBD_MD_FLSIZE)) + stx.stx_mask &= ~STATX_SIZE; + if (!(valid & OBD_MD_FLBLOCKS)) + stx.stx_mask &= ~STATX_BLOCKS; + + if (stxp && copy_to_user(stxp, &stx, sizeof(stx))) + GOTO(out_req, rc = -EFAULT); + + if (lmmsizep && copy_to_user(lmmsizep, &lmmsize, + sizeof(*lmmsizep))) + GOTO(out_req, rc = -EFAULT); + } + + EXIT; +out_req: + ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); + if (filename) + ll_putname(filename); + return rc; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl; + + OBD_ALLOC_PTR(qctl); + if (!qctl) + RETURN(-ENOMEM); + + if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) + GOTO(out_quotactl, rc = -EFAULT); + + rc = quotactl_ioctl(inode->i_sb, qctl); + + if (rc == 0 && + copy_to_user((void __user *)arg, qctl, sizeof(*qctl))) + rc = -EFAULT; + + out_quotactl: + OBD_FREE_PTR(qctl); + RETURN(rc); + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + RETURN(rc); + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + OBD_ALLOC_PTR(hss); + if (hss == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + RETURN(-EFAULT); + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + RETURN(rc); + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + OBD_ALLOC_PTR(hca); + if (hca == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); + case LL_IOC_GETOBDCOUNT: { + u32 count, vallen; + struct obd_export *exp; + + if (copy_from_user(&count, (int __user *)arg, sizeof(int))) + RETURN(-EFAULT); + + /* get ost count when count is zero, get mdt count otherwise */ + exp = count ? 
sbi->ll_md_exp : sbi->ll_dt_exp; + vallen = sizeof(count); + rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), + KEY_TGT_COUNT, &vallen, &count); + if (rc) { + CERROR("get target count failed: %d\n", rc); + RETURN(rc); + } + + if (copy_to_user((int __user *)arg, &count, sizeof(int))) + RETURN(-EFAULT); + + RETURN(0); + } + case LL_IOC_PATH2FID: + if (copy_to_user((void __user *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + RETURN(0); + case LL_IOC_GET_CONNECT_FLAGS: { + RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, + (void __user *)arg)); + } + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void __user *)arg)); + case LL_IOC_GETPARENT: + RETURN(ll_getparent(file, (void __user *)arg)); + case LL_IOC_FID2MDTIDX: { + struct obd_export *exp = ll_i2mdexp(inode); + struct lu_fid fid; + __u32 index; + + if (copy_from_user(&fid, (const struct lu_fid __user *)arg, + sizeof(fid))) + RETURN(-EFAULT); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid, + (__u32 __user *)&index); + if (rc != 0) + RETURN(rc); + + RETURN(index); + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur; + ssize_t totalsize; + + OBD_ALLOC_PTR(hur); + if (hur == NULL) + RETURN(-ENOMEM); + + /* We don't know the true size yet; copy the fixed-size part */ + if (copy_from_user(hur, (void __user *)arg, sizeof(*hur))) { + OBD_FREE_PTR(hur); + RETURN(-EFAULT); + } + + /* Compute the whole struct size */ + totalsize = hur_len(hur); + OBD_FREE_PTR(hur); + if (totalsize < 0) + RETURN(-E2BIG); + + /* Final size will be more than double totalsize */ + if (totalsize >= MDS_MAXREQSIZE / 3) + RETURN(-E2BIG); + + OBD_ALLOC_LARGE(hur, totalsize); + if (hur == NULL) + RETURN(-ENOMEM); + + /* Copy the whole struct */ + if (copy_from_user(hur, (void __user *)arg, totalsize)) + GOTO(out_hur, rc = -EFAULT); + + if (hur->hur_request.hr_action == HUA_RELEASE) { + const struct lu_fid *fid; + struct inode *f; + int i; + + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + fid = &hur->hur_user_item[i].hui_fid; + f = search_inode_for_lustre(inode->i_sb, fid); + if (IS_ERR(f)) { + rc = PTR_ERR(f); + break; + } + + rc = ll_hsm_release(f); + iput(f); + if (rc != 0) + break; + } + } else { + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, + hur, NULL); + } + +out_hur: + OBD_FREE_LARGE(hur, totalsize); + + RETURN(rc); + } + case LL_IOC_HSM_PROGRESS: { + struct hsm_progress_kernel hpk; + struct hsm_progress hp; + + if (copy_from_user(&hp, (void __user *)arg, sizeof(hp))) + RETURN(-EFAULT); + + hpk.hpk_fid = hp.hp_fid; + hpk.hpk_cookie = hp.hp_cookie; + hpk.hpk_extent = hp.hp_extent; + hpk.hpk_flags = hp.hp_flags; + hpk.hpk_errval = hp.hp_errval; + hpk.hpk_data_version = 0; + + /* File may not exist in Lustre; all progress + * reported to Lustre root */ + rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, + NULL); + RETURN(rc); + } + case LL_IOC_HSM_CT_START: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + rc = copy_and_ct_start(cmd, sbi->ll_md_exp, + (struct lustre_kernelcomm __user *)arg); + RETURN(rc); + + case LL_IOC_HSM_COPY_START: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char __user *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_start(inode->i_sb, copy); + if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + case LL_IOC_HSM_IMPORT: { + 
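+ /* Copy the hsm_user_import descriptor from userspace and let
+ * ll_hsm_import() update the inode; typically used to register a
+ * file whose data already exists in the HSM archive. */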
struct hsm_user_import *hui; + + OBD_ALLOC_PTR(hui); + if (hui == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + RETURN(-EFAULT); + } + + rc = ll_hsm_import(inode, file, hui); + + CDEBUG(D_HSM, "MDLL hsm_state import: %d\n", rc); + OBD_FREE_PTR(hui); + RETURN(rc); + } + case LL_IOC_HSM_COPY_END: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char __user *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_end(inode->i_sb, copy); + CDEBUG(D_HSM, "MDLL hsm_copy_end: %d\n", rc); + + if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + case LL_IOC_MIGRATE: { + struct lmv_user_md *lum; + char *buf = NULL; + int len; + char *filename; + int namelen = 0; + int rc; + + rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); + if (rc) + RETURN(rc); + + data = (struct obd_ioctl_data *)buf; + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) + GOTO(migrate_free, rc = -EINVAL); + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1 || namelen != strlen(filename) + 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(migrate_free, rc = -EINVAL); + } + + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + if (lum->lum_magic != LMV_USER_MAGIC && + lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) { + rc = -EINVAL; + CERROR("%s: wrong lum magic %x: rc = %d\n", + filename, lum->lum_magic, rc); + GOTO(migrate_free, rc); + } + + rc = ll_migrate(inode, file, lum, filename); +migrate_free: + OBD_FREE_LARGE(buf, len); + + RETURN(rc); + } + case LL_IOC_FSGETXATTR: + RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); + case LL_IOC_FSSETXATTR: + RETURN(ll_ioctl_fssetxattr(inode, cmd, arg)); + default: + RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, + (void __user *)arg)); + } +} + +static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int api32 = ll_need_32bit_api(sbi); + loff_t ret = -EINVAL; + ENTRY; + + inode_lock(inode); + switch (origin) { + case SEEK_SET: + break; + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_END: + if (offset > 0) + GOTO(out, ret); + if (api32) + offset += LL_DIR_END_OFF_32BIT; + else + offset += LL_DIR_END_OFF; + break; + default: + GOTO(out, ret); + } + + if (offset >= 0 && + ((api32 && offset <= LL_DIR_END_OFF_32BIT) || + (!api32 && offset <= LL_DIR_END_OFF))) { + if (offset != file->f_pos) { + if ((api32 && offset == LL_DIR_END_OFF_32BIT) || + (!api32 && offset == LL_DIR_END_OFF)) + fd->lfd_pos = MDS_DIR_END_OFF; + else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH) + fd->lfd_pos = offset << 32; + else + fd->lfd_pos = offset; + file->f_pos = offset; + file->f_version = 0; + } + ret = offset; + } + GOTO(out, ret); + +out: + inode_unlock(inode); + return ret; +} + +static int ll_dir_open(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_open(inode, file)); +} + +static int ll_dir_release(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_release(inode, file)); +} + +const struct file_operations ll_dir_operations = { + .llseek = ll_dir_seek, + .open = ll_dir_open, + .release = ll_dir_release, + .read = generic_read_dir, +#ifdef 
HAVE_DIR_CONTEXT + .iterate = ll_iterate, +#else + .readdir = ll_readdir, +#endif + .unlocked_ioctl = ll_dir_ioctl, + .fsync = ll_fsync, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c new file mode 100644 index 0000000000000..7c1a3f0741cc7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -0,0 +1,5353 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/file.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif + +#include +#include + +#include "cl_object.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +struct split_param { + struct inode *sp_inode; + __u16 sp_mirror_id; +}; + +static int +ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); + +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken); + +static struct ll_file_data *ll_file_data_get(void) +{ + struct ll_file_data *fd; + + OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS); + if (fd == NULL) + return NULL; + + fd->fd_write_failed = false; + + return fd; +} + +static void ll_file_data_put(struct ll_file_data *fd) +{ + if (fd != NULL) + OBD_SLAB_FREE_PTR(fd, ll_file_data_slab); +} + +/** + * Packs all the attributes into @op_data for the CLOSE rpc. 
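+ * This covers the current mode, atime/mtime/ctime, size, blocks and
+ * flags together with the open handle being closed; if the data was
+ * modified while open for write, MDS_DATA_MODIFIED is also set so the
+ * MDT can mark the HSM archive copy dirty.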
+ */ +static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle *och) +{ + ENTRY; + + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + + op_data->op_attr.ia_mode = inode->i_mode; + op_data->op_attr.ia_atime = inode->i_atime; + op_data->op_attr.ia_mtime = inode->i_mtime; + op_data->op_attr.ia_ctime = inode->i_ctime; + op_data->op_attr.ia_size = i_size_read(inode); + op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME); + op_data->op_xvalid |= OP_XVALID_CTIME_SET; + op_data->op_attr_blocks = inode->i_blocks; + op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_open_handle = och->och_open_handle; + + if (och->och_flags & FMODE_WRITE && + ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED)) + /* For HSM: if inode data has been modified, pack it so that + * MDT can set data dirty flag in the archive. */ + op_data->op_bias |= MDS_DATA_MODIFIED; + + EXIT; +} + +/** + * Perform a close, possibly with a bias. + * The meaning of "data" depends on the value of "bias". + * + * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version. + * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to + * swap layouts with. + */ +static int ll_close_inode_openhandle(struct inode *inode, + struct obd_client_handle *och, + enum mds_op_bias bias, void *data) +{ + struct obd_export *md_exp = ll_i2mdexp(inode); + const struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + if (class_exp2obd(md_exp) == NULL) { + CERROR("%s: invalid MDC connection handle closing "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + GOTO(out, rc = 0); + } + + OBD_ALLOC_PTR(op_data); + /* We leak openhandle and request here on error, but not much to be + * done in OOM case since app won't retry close on error either. 
*/ + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + ll_prepare_close(inode, op_data, och); + switch (bias) { + case MDS_CLOSE_LAYOUT_MERGE: + /* merge blocks from the victim inode */ + op_data->op_attr_blocks += ((struct inode *)data)->i_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + fallthrough; + case MDS_CLOSE_LAYOUT_SPLIT: + case MDS_CLOSE_LAYOUT_SWAP: { + struct split_param *sp = data; + + LASSERT(data != NULL); + op_data->op_bias |= bias; + op_data->op_data_version = 0; + op_data->op_lease_handle = och->och_lease_handle; + if (bias == MDS_CLOSE_LAYOUT_SPLIT) { + op_data->op_fid2 = *ll_inode2fid(sp->sp_inode); + op_data->op_mirror_id = sp->sp_mirror_id; + } else { + op_data->op_fid2 = *ll_inode2fid(data); + } + break; + } + + case MDS_CLOSE_RESYNC_DONE: { + struct ll_ioc_lease *ioc = data; + + LASSERT(data != NULL); + op_data->op_attr_blocks += + ioc->lil_count * op_data->op_attr_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + op_data->op_bias |= MDS_CLOSE_RESYNC_DONE; + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_data = &ioc->lil_ids[0]; + op_data->op_data_size = + ioc->lil_count * sizeof(ioc->lil_ids[0]); + break; + } + + case MDS_HSM_RELEASE: + LASSERT(data != NULL); + op_data->op_bias |= MDS_HSM_RELEASE; + op_data->op_data_version = *(__u64 *)data; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + break; + + default: + LASSERT(data == NULL); + break; + } + + if (!(op_data->op_attr.ia_valid & ATTR_SIZE)) + op_data->op_xvalid |= OP_XVALID_LAZYSIZE; + if (!(op_data->op_xvalid & OP_XVALID_BLOCKS)) + op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS; + + rc = md_close(md_exp, op_data, och->och_mod, &req); + if (rc != 0 && rc != -EINTR) + CERROR("%s: inode "DFID" mdc close failed: rc = %d\n", + md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); + + if (rc == 0 && op_data->op_bias & bias) { + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED)) + rc = -EBUSY; + } + + ll_finish_md_op_data(op_data); + EXIT; +out: + + md_clear_open_replay_data(md_exp, och); + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + + ptlrpc_req_finished(req); /* This is close request */ + return rc; +} + +int ll_md_real_close(struct inode *inode, fmode_t fmode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle **och_p; + struct obd_client_handle *och; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + if (fmode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (fmode & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + LASSERT(fmode & FMODE_READ); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_usecount > 0) { + /* There are still users of this handle, so skip + * freeing it. */ + mutex_unlock(&lli->lli_och_mutex); + RETURN(0); + } + + och = *och_p; + *och_p = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (och != NULL) { + /* There might be a race and this handle may already + * be closed. 
*/ + rc = ll_close_inode_openhandle(inode, och, 0, NULL); + } + + RETURN(rc); +} + +static int ll_md_close(struct inode *inode, struct file *file) +{ + union ldlm_policy_data policy = { + .l_inodebits = { MDS_INODELOCK_OPEN }, + }; + __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_handle lockh; + enum ldlm_mode lockmode; + int rc = 0; + ENTRY; + + /* clear group lock, if present */ + if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) + ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid); + + if (fd->fd_lease_och != NULL) { + bool lease_broken; + + /* Usually the lease is not released when the + * application crashed, we need to release here. */ + rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken); + CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n", + PFID(&lli->lli_fid), rc, lease_broken); + + fd->fd_lease_och = NULL; + } + + if (fd->fd_och != NULL) { + rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL); + fd->fd_och = NULL; + GOTO(out, rc); + } + + /* Let's see if we have good enough OPEN lock on the file and if + we can skip talking to MDS */ + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_omode & FMODE_WRITE) { + lockmode = LCK_CW; + LASSERT(lli->lli_open_fd_write_count); + lli->lli_open_fd_write_count--; + } else if (fd->fd_omode & FMODE_EXEC) { + lockmode = LCK_PR; + LASSERT(lli->lli_open_fd_exec_count); + lli->lli_open_fd_exec_count--; + } else { + lockmode = LCK_CR; + LASSERT(lli->lli_open_fd_read_count); + lli->lli_open_fd_read_count--; + } + mutex_unlock(&lli->lli_och_mutex); + + /* LU-4398: do not cache write open lock if the file has exec bit */ + if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) || + !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), + LDLM_IBITS, &policy, lockmode, &lockh)) + rc = ll_md_real_close(inode, fd->fd_omode); + +out: + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + + RETURN(rc); +} + +/* While this returns an error code, fput() the caller does not, so we need + * to make every effort to clean up all of our state here. Also, applications + * rarely check close errors and even if an error is returned they will not + * re-try the close call. + */ +int ll_file_release(struct inode *inode, struct file *file) +{ + struct ll_file_data *fd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + if (inode->i_sb->s_root != file_dentry(file)) + ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1); + fd = LUSTRE_FPRIVATE(file); + LASSERT(fd != NULL); + + /* The last ref on @file, maybe not the the owner pid of statahead, + * because parent and child process can share the same file handle. 
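+ * Only the ll_file_data that registered itself as lli_opendir_key
+ * deauthorizes statahead below; a release through a handle shared
+ * with another process leaves the statahead state untouched.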
*/ + if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + + if (is_root_inode(inode)) { + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + RETURN(0); + } + + if (!S_ISDIR(inode->i_mode)) { + if (lli->lli_clob != NULL) + lov_read_and_clear_async_rc(lli->lli_clob); + lli->lli_async_rc = 0; + } + + rc = ll_md_close(inode, file); + + if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) + libcfs_debug_dumplog(); + + RETURN(rc); +} + +static inline int ll_dom_readpage(void *data, struct page *page) +{ + struct niobuf_local *lnb = data; + void *kaddr; + + kaddr = ll_kmap_atomic(page, KM_USER0); + memcpy(kaddr, lnb->lnb_data, lnb->lnb_len); + if (lnb->lnb_len < PAGE_SIZE) + memset(kaddr + lnb->lnb_len, 0, + PAGE_SIZE - lnb->lnb_len); + flush_dcache_page(page); + SetPageUptodate(page); + ll_kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + + return 0; +} + +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct address_space *mapping = inode->i_mapping; + struct page *vmpage; + struct niobuf_remote *rnb; + struct mdt_body *body; + char *data; + unsigned long index, start; + struct niobuf_local lnb; + __u16 refcheck; + int rc; + + ENTRY; + + if (obj == NULL) + RETURN_EXIT; + + if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER)) + RETURN_EXIT; + + rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE); + if (rnb == NULL || rnb->rnb_len == 0) + RETURN_EXIT; + + /* LU-11595: Server may return whole file and that is OK always or + * it may return just file tail and its offset must be aligned with + * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is + * smaller then offset may be not aligned and that data is just ignored. + */ + if (rnb->rnb_offset % PAGE_SIZE) + RETURN_EXIT; + + /* Server returns whole file or just file tail if it fills in reply + * buffer, in both cases total size should be equal to the file size. 
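+ * For example, a 10000-byte DoM file may arrive whole (offset 0,
+ * length 10000) or as a page-aligned tail (offset 8192, length 1808
+ * on a 4KiB-page client); if offset + length does not match
+ * mbo_dom_size the reply buffer is ignored.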
+ */ + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) { + CERROR("%s: server returns off/len %llu/%u but size %llu\n", + ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset, + rnb->rnb_len, body->mbo_dom_size); + RETURN_EXIT; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN_EXIT; + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc) + GOTO(out_io, rc); + + CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n", + rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size); + + data = (char *)rnb + sizeof(*rnb); + + lnb.lnb_file_offset = rnb->rnb_offset; + start = lnb.lnb_file_offset / PAGE_SIZE; + index = 0; + LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0); + lnb.lnb_page_offset = 0; + do { + struct cl_page *page; + + lnb.lnb_data = data + (index << PAGE_SHIFT); + lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT); + if (lnb.lnb_len > PAGE_SIZE) + lnb.lnb_len = PAGE_SIZE; + + vmpage = read_cache_page(mapping, index + start, + ll_dom_readpage, &lnb); + if (IS_ERR(vmpage)) { + CWARN("%s: cannot fill page %lu for "DFID + " with data: rc = %li\n", + ll_get_fsname(inode->i_sb, NULL, 0), + index + start, PFID(lu_object_fid(&obj->co_lu)), + PTR_ERR(vmpage)); + break; + } + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + put_page(vmpage); + /* page was truncated */ + break; + } + /* attach VM page to CL page cache */ + page = cl_page_find(env, obj, vmpage->index, vmpage, + CPT_CACHEABLE); + if (IS_ERR(page)) { + ClearPageUptodate(vmpage); + unlock_page(vmpage); + put_page(vmpage); + break; + } + cl_page_export(env, page, 1); + cl_page_put(env, page); + unlock_page(vmpage); + put_page(vmpage); + index++; + } while (rnb->rnb_len > (index << PAGE_SHIFT)); + +out_io: + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + EXIT; +} + +static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, + struct lookup_intent *itp) +{ + struct ll_sb_info *sbi = ll_i2sbi(de->d_inode); + struct dentry *parent = de->d_parent; + char *name = NULL; + int len = 0; + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + LASSERT(parent != NULL); + LASSERT(itp->it_flags & MDS_OPEN_BY_FID); + + /* if server supports open-by-fid, or file name is invalid, don't pack + * name in open request */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) || + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) { +retry: + len = de->d_name.len; + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + RETURN(-ENOMEM); + + /* race here */ + spin_lock(&de->d_lock); + if (len != de->d_name.len) { + spin_unlock(&de->d_lock); + kfree(name); + goto retry; + } + memcpy(name, de->d_name.name, len); + name[len] = '\0'; + spin_unlock(&de->d_lock); + + if (!lu_name_is_valid_2(name, len)) { + kfree(name); + RETURN(-ESTALE); + } + } + + op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode, + name, len, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + kfree(name); + RETURN(PTR_ERR(op_data)); + } + op_data->op_data = lmm; + op_data->op_data_size = lmmsize; + + rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, + &ll_md_blocking_ast, 0); + kfree(name); + ll_finish_md_op_data(op_data); + if (rc == -ESTALE) { + /* reason for keep own exit path - don`t flood log + * with messages with -ESTALE errors. 
+ */ + if (!it_disposition(itp, DISP_OPEN_OPEN) || + it_open_error(DISP_OPEN_OPEN, itp)) + GOTO(out, rc); + ll_release_openhandle(de, itp); + GOTO(out, rc); + } + + if (it_disposition(itp, DISP_LOOKUP_NEG)) + GOTO(out, rc = -ENOENT); + + if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { + rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp); + CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); + GOTO(out, rc); + } + + rc = ll_prep_inode(&de->d_inode, req, NULL, itp); + + if (!rc && itp->it_lock_mode) { + __u64 bits = 0; + + /* If we got a lock back and it has a LOOKUP bit set, + * make sure the dentry is marked as valid so we can find it. + * We don't need to care about actual hashing since other bits + * of kernel will deal with that later. + */ + ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + d_lustre_revalidate(de); + ll_update_dir_depth(parent->d_inode, de->d_inode); + } + + /* if DoM bit returned along with LAYOUT bit then there + * can be read-on-open data returned. + */ + if (bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(de->d_inode, req); + } + +out: + ptlrpc_req_finished(req); + ll_intent_drop_lock(itp); + + /* We did open by fid, but by the time we got to the server, + * the object disappeared. If this is a create, we cannot really + * tell the userspace that the file it was trying to create + * does not exist. Instead let's return -ESTALE, and the VFS will + * retry the create with LOOKUP_REVAL that we are going to catch + * in ll_revalidate_dentry() and use lookup then. + */ + if (rc == -ENOENT && itp->it_op & IT_CREAT) + rc = -ESTALE; + + RETURN(rc); +} + +static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, + struct obd_client_handle *och) +{ + struct mdt_body *body; + + body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); + och->och_open_handle = body->mbo_open_handle; + och->och_fid = body->mbo_fid1; + och->och_lease_handle.cookie = it->it_lock_handle; + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + och->och_flags = it->it_flags; + + return md_set_open_replay_data(md_exp, och, it); +} + +static int ll_local_open(struct file *file, struct lookup_intent *it, + struct ll_file_data *fd, struct obd_client_handle *och) +{ + struct inode *inode = file_inode(file); + ENTRY; + + LASSERT(!LUSTRE_FPRIVATE(file)); + + LASSERT(fd != NULL); + + if (och) { + int rc; + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc != 0) + RETURN(rc); + } + + LUSTRE_FPRIVATE(file) = fd; + ll_readahead_init(inode, &fd->fd_ras); + fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + + /* ll_cl_context initialize */ + rwlock_init(&fd->fd_lock); + INIT_LIST_HEAD(&fd->fd_lccs); + + RETURN(0); +} + +/* Open a file, and (for the very first open) create objects on the OSTs at + * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object + * creation or open until ll_lov_setstripe() ioctl is called. + * + * If we already have the stripe MD locally then we don't request it in + * md_open(), by passing a lmm_size = 0. + * + * It is up to the application to ensure no other processes open this file + * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be + * used. We might be able to avoid races of that sort by getting lli_open_sem + * before returning in the O_LOV_DELAY_CREATE case and dropping it here + * or in ll_file_release(), but I'm not sure that is desirable/necessary. 
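+ * For example, an application that wants a custom layout typically
+ * calls open(path, O_CREAT | O_RDWR | O_LOV_DELAY_CREATE, 0644) and
+ * then ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum) before the first write,
+ * so the objects are created with the requested striping instead of
+ * the directory default.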
+ */ +int ll_file_open(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lookup_intent *it, oit = { .it_op = IT_OPEN, + .it_flags = file->f_flags }; + struct obd_client_handle **och_p = NULL; + __u64 *och_usecount = NULL; + struct ll_file_data *fd; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n", + PFID(ll_inode2fid(inode)), inode, file->f_flags); + + it = file->private_data; /* XXX: compat macro */ + file->private_data = NULL; /* prevent ll_local_open assertion */ + + fd = ll_file_data_get(); + if (fd == NULL) + GOTO(out_nofiledata, rc = -ENOMEM); + + fd->fd_file = file; + if (S_ISDIR(inode->i_mode)) + ll_authorize_statahead(inode, fd); + + if (is_root_inode(inode)) { + LUSTRE_FPRIVATE(file) = fd; + RETURN(0); + } + + if (!it || !it->it_disposition) { + CDEBUG(D_HSM, "MDLL file->f_flags=0x%x/0%o\n", + file->f_flags, file->f_flags); + /* Convert f_flags into access mode. We cannot use file->f_mode, + * because everything but O_ACCMODE mask was stripped from + * there */ + if ((oit.it_flags + 1) & O_ACCMODE) + oit.it_flags++; + if (file->f_flags & O_TRUNC) + oit.it_flags |= FMODE_WRITE; + + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. + */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + + /* We do not want O_EXCL here, presumably we opened the file + * already? XXX - NFS implications? */ + oit.it_flags &= ~O_EXCL; + + /* bug20584, if "it_flags" contains O_CREAT, the file will be + * created if necessary, then "IT_CREAT" should be set to keep + * consistent with it */ + if (oit.it_flags & O_CREAT) + oit.it_op |= IT_CREAT; + + it = &oit; + } + +restart: + /* Let's see if we have file open on MDS already. */ + if (it->it_flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (it->it_flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_p) { /* Open handle is present */ + if (it_disposition(it, DISP_OPEN_OPEN)) { + /* Well, there's extra open request that we do not need, + let's close it somehow. This will decref request. */ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) { + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + + ll_release_openhandle(file_dentry(file), it); + } + (*och_usecount)++; + + rc = ll_local_open(file, it, fd, NULL); + if (rc) { + (*och_usecount)--; + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + } else { + LASSERT(*och_usecount == 0); + if (!it->it_disposition) { + struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry); + /* We cannot just request lock handle now, new ELC code + means that one of other OPEN locks for this file + could be cancelled, and since blocking ast handler + would attempt to grab och_mutex as well, that would + result in a deadlock */ + mutex_unlock(&lli->lli_och_mutex); + /* + * Normally called under two situations: + * 1. NFS export. + * 2. A race/condition on MDS resulting in no open + * handle to be returned from LOOKUP|OPEN request, + * for example if the target entry was a symlink. 
+ * + * Only fetch MDS_OPEN_LOCK if this is in NFS path, + * marked by a bit set in ll_iget_for_nfs. Clear the + * bit so that it's not confusing later callers. + * + * NB; when ldd is NULL, it must have come via normal + * lookup path only, since ll_iget_for_nfs always calls + * ll_d_init(). + */ + if (ldd && ldd->lld_nfs_dentry) { + ldd->lld_nfs_dentry = 0; + it->it_flags |= MDS_OPEN_LOCK; + } + + /* + * Always specify MDS_OPEN_BY_FID because we don't want + * to get file with different fid. + */ + it->it_flags |= MDS_OPEN_BY_FID; + rc = ll_intent_file_open(file_dentry(file), NULL, 0, + it); + if (rc) + GOTO(out_openerr, rc); + + goto restart; + } + OBD_ALLOC(*och_p, sizeof (struct obd_client_handle)); + if (!*och_p) + GOTO(out_och_free, rc = -ENOMEM); + + (*och_usecount)++; + + /* md_intent_lock() didn't get a request ref if there was an + * open error, so don't do cleanup on the request here + * (bug 3430) */ + /* XXX (green): Should not we bail out on any error here, not + * just open error? */ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc != 0) + GOTO(out_och_free, rc); + + LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF), + "inode %p: disposition %x, status %d\n", inode, + it_disposition(it, ~0), it->it_status); + + rc = ll_local_open(file, it, fd, *och_p); + if (rc) + GOTO(out_och_free, rc); + } + mutex_unlock(&lli->lli_och_mutex); + fd = NULL; + + /* Must do this outside lli_och_mutex lock to prevent deadlock where + different kind of OPEN lock for this same inode gets cancelled + by ldlm_cancel_lru */ + if (!S_ISREG(inode->i_mode)) + GOTO(out_och_free, rc); + + cl_lov_delay_create_clear(&file->f_flags); + cl_lu_noimport_clear(&file->f_flags); + GOTO(out_och_free, rc); + +out_och_free: + if (rc) { + if (och_p && *och_p) { + OBD_FREE(*och_p, sizeof (struct obd_client_handle)); + *och_p = NULL; /* OBD_FREE writes some magic there */ + (*och_usecount)--; + } + mutex_unlock(&lli->lli_och_mutex); + +out_openerr: + if (lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + if (fd != NULL) + ll_file_data_put(fd); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); + } + +out_nofiledata: + if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->it_request); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + + return rc; +} + +static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, void *data, int flag) +{ + int rc; + struct lustre_handle lockh; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); + RETURN(rc); + } + break; + case LDLM_CB_CANCELING: + /* do nothing */ + break; + } + RETURN(0); +} + +/** + * When setting a lease on a file, we take ownership of the lli_mds_*_och + * and save it as fd->fd_och so as to force client to reopen the file even + * if it has an open lock in cache already. 
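+ * The handle is moved from lli_mds_*_och to fd->fd_och and the
+ * usecount is reset to zero, so a later open of the same file must go
+ * back to the MDS rather than reuse the cached handle. Returns -EBUSY
+ * if this fd already holds a lease or the handle is shared.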
+ */ +static int ll_lease_och_acquire(struct inode *inode, struct file *file, + struct lustre_handle *old_open_handle) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle **och_p; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + /* Get the openhandle of the file */ + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) + GOTO(out_unlock, rc = -EBUSY); + + if (fd->fd_och == NULL) { + if (file->f_mode & FMODE_WRITE) { + LASSERT(lli->lli_mds_write_och != NULL); + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + LASSERT(lli->lli_mds_read_och != NULL); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + if (*och_usecount > 1) + GOTO(out_unlock, rc = -EBUSY); + + fd->fd_och = *och_p; + *och_usecount = 0; + *och_p = NULL; + } + + *old_open_handle = fd->fd_och->och_open_handle; + + EXIT; +out_unlock: + mutex_unlock(&lli->lli_och_mutex); + return rc; +} + +/** + * Release ownership on lli_mds_*_och when putting back a file lease. + */ +static int ll_lease_och_release(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle **och_p; + struct obd_client_handle *old_och = NULL; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (file->f_mode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + /* The file may have been open by another process (broken lease) so + * *och_p is not NULL. In this case we should simply increase usecount + * and close fd_och. + */ + if (*och_p != NULL) { + old_och = fd->fd_och; + (*och_usecount)++; + } else { + *och_p = fd->fd_och; + *och_usecount = 1; + } + fd->fd_och = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (old_och != NULL) + rc = ll_close_inode_openhandle(inode, old_och, 0, NULL); + + RETURN(rc); +} + +/** + * Acquire a lease and open the file. 
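+ * \a fmode must be exactly FMODE_READ or FMODE_WRITE. Returns the
+ * obd_client_handle carrying the lease lock on success, or an
+ * ERR_PTR() value on failure (-EOPNOTSUPP if the server did not
+ * grant a lease).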
+ */ +static struct obd_client_handle * +ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, + __u64 open_flags) +{ + struct lookup_intent it = { .it_op = IT_OPEN }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct lustre_handle old_open_handle = { 0 }; + struct obd_client_handle *och = NULL; + int rc; + int rc2; + ENTRY; + + if (fmode != FMODE_WRITE && fmode != FMODE_READ) + RETURN(ERR_PTR(-EINVAL)); + + if (file != NULL) { + if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) + RETURN(ERR_PTR(-EPERM)); + + rc = ll_lease_och_acquire(inode, file, &old_open_handle); + if (rc) + RETURN(ERR_PTR(rc)); + } + + OBD_ALLOC_PTR(och); + if (och == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + /* To tell the MDT this openhandle is from the same owner */ + op_data->op_open_handle = old_open_handle; + + it.it_flags = fmode | open_flags; + it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_lease_ast, + /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise + * it can be cancelled which may mislead applications that the lease is + * broken; + * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal + * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast + * doesn't deal with openhandle, so normal openhandle will be leaked. */ + LDLM_FL_NO_LRU | LDLM_FL_EXCL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc < 0) + GOTO(out_release_it, rc); + + if (it_disposition(&it, DISP_LOOKUP_NEG)) + GOTO(out_release_it, rc = -ENOENT); + + rc = it_open_error(DISP_OPEN_OPEN, &it); + if (rc) + GOTO(out_release_it, rc); + + LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); + rc = ll_och_fill(sbi->ll_md_exp, &it, och); + if (rc) + GOTO(out_release_it, rc); + + if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ + GOTO(out_close, rc = -EOPNOTSUPP); + + /* already get lease, handle lease lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + if (it.it_lock_mode == 0 || + it.it_lock_bits != MDS_INODELOCK_OPEN) { + /* open lock must return for lease */ + CERROR(DFID "lease granted but no open lock, %d/%llu.\n", + PFID(ll_inode2fid(inode)), it.it_lock_mode, + it.it_lock_bits); + GOTO(out_close, rc = -EPROTO); + } + + ll_intent_release(&it); + RETURN(och); + +out_close: + /* Cancel open lock */ + if (it.it_lock_mode != 0) { + ldlm_lock_decref_and_cancel(&och->och_lease_handle, + it.it_lock_mode); + it.it_lock_mode = 0; + och->och_lease_handle.cookie = 0ULL; + } + rc2 = ll_close_inode_openhandle(inode, och, 0, NULL); + if (rc2 < 0) + CERROR("%s: error closing file "DFID": %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&ll_i2info(inode)->lli_fid), rc2); + och = NULL; /* och has been freed in ll_close_inode_openhandle() */ +out_release_it: + ll_intent_release(&it); +out: + if (och != NULL) + OBD_FREE_PTR(och); + RETURN(ERR_PTR(rc)); +} + +/** + * Check whether a layout swap can be done between two inodes. 
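+ * Both inodes must be regular files, writable by the caller, and
+ * reside on the same filesystem.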
+ * + * \param[in] inode1 First inode to check + * \param[in] inode2 Second inode to check + * + * \retval 0 on success, layout swap can be performed between both inodes + * \retval negative error code if requirements are not met + */ +static int ll_check_swap_layouts_validity(struct inode *inode1, + struct inode *inode2) +{ + if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) + return -EINVAL; + + if (inode_permission(&init_user_ns, inode1, MAY_WRITE) || + inode_permission(&init_user_ns, inode2, MAY_WRITE)) + return -EPERM; + + if (inode1->i_sb != inode2->i_sb) + return -EXDEV; + + return 0; +} + +static int ll_swap_layouts_close(struct obd_client_handle *och, + struct inode *inode, struct inode *inode2) +{ + const struct lu_fid *fid1 = ll_inode2fid(inode); + const struct lu_fid *fid2; + int rc; + ENTRY; + + CDEBUG(D_INODE, "%s: biased close of file "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1)); + + rc = ll_check_swap_layouts_validity(inode, inode2); + if (rc < 0) + GOTO(out_free_och, rc); + + /* We now know that inode2 is a lustre inode */ + fid2 = ll_inode2fid(inode2); + + rc = lu_fid_cmp(fid1, fid2); + if (rc == 0) + GOTO(out_free_och, rc = -EINVAL); + + /* Close the file and {swap,merge} layouts between inode & inode2. + * NB: lease lock handle is released in mdc_close_layout_swap_pack() + * because we still need it to pack l_remote_handle to MDT. */ + rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, + inode2); + + och = NULL; /* freed in ll_close_inode_openhandle() */ + +out_free_och: + if (och != NULL) + OBD_FREE_PTR(och); + + RETURN(rc); +} + +/** + * Release lease and close the file. + * It will check if the lease has ever broken. + */ +static int ll_lease_close_intent(struct obd_client_handle *och, + struct inode *inode, + bool *lease_broken, enum mds_op_bias bias, + void *data) +{ + struct ldlm_lock *lock; + bool cancelled = true; + int rc; + ENTRY; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + cancelled = ldlm_is_cancel(lock); + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + + CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n", + PFID(&ll_i2info(inode)->lli_fid), cancelled, bias); + + if (lease_broken != NULL) + *lease_broken = cancelled; + + if (!cancelled && !bias) + ldlm_cli_cancel(&och->och_lease_handle, 0); + + if (cancelled) { /* no need to excute intent */ + bias = 0; + data = NULL; + } + + rc = ll_close_inode_openhandle(inode, och, bias, data); + RETURN(rc); +} + +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken) +{ + return ll_lease_close_intent(och, inode, lease_broken, 0, NULL); +} + +/** + * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT + */ +static int ll_lease_file_resync(struct obd_client_handle *och, + struct inode *inode, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ll_ioc_lease_id ioc; + __u64 data_version_unused; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg, + sizeof(ioc))) + RETURN(-EFAULT); + + /* before starting file resync, it's necessary to clean up page cache + * in client memory, otherwise once the layout version is increased, + * writing back cached data will be denied the OSTs. 
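+ * ll_data_version() with LL_DV_WR_FLUSH below forces any dirty pages
+ * out under the current layout version before the resync RPC is sent.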
*/ + rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH); + if (rc) + GOTO(out, rc); + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_mirror_id = ioc.lil_mirror_id; + rc = md_file_resync(sbi->ll_md_exp, op_data); + if (rc) + GOTO(out, rc); + + EXIT; +out: + ll_finish_md_op_data(op_data); + return rc; +} + +int ll_merge_attr(const struct lu_env *env, struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct cl_attr *attr = vvp_env_thread_attr(env); + s64 atime; + s64 mtime; + s64 ctime; + int rc = 0; + + ENTRY; + + ll_inode_size_lock(inode); + + /* Merge timestamps the most recently obtained from MDS with + * timestamps obtained from OSTs. + * + * Do not overwrite atime of inode because it may be refreshed + * by file_accessed() function. If the read was served by cache + * data, there is no RPC to be sent so that atime may not be + * transferred to OSTs at all. MDT only updates atime at close time + * if it's at least 'mdd.*.atime_diff' older. + * All in all, the atime in Lustre does not strictly comply with + * POSIX. Solving this problem needs to send an RPC to MDT for each + * read, this will hurt performance. + */ + if (inode->i_atime.tv_sec < lli->lli_atime || + lli->lli_update_atime) { + inode->i_atime.tv_sec = lli->lli_atime; + lli->lli_update_atime = 0; + } + inode->i_mtime.tv_sec = lli->lli_mtime; + inode->i_ctime.tv_sec = lli->lli_ctime; + + mtime = inode->i_mtime.tv_sec; + atime = inode->i_atime.tv_sec; + ctime = inode->i_ctime.tv_sec; + + cl_object_attr_lock(obj); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE)) + rc = -EINVAL; + else + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + if (rc != 0) + GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc)); + + if (atime < attr->cat_atime) + atime = attr->cat_atime; + + if (ctime < attr->cat_ctime) + ctime = attr->cat_ctime; + + if (mtime < attr->cat_mtime) + mtime = attr->cat_mtime; + + CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n", + PFID(&lli->lli_fid), attr->cat_size); + + i_size_write(inode, attr->cat_size); + inode->i_blocks = attr->cat_blocks; + + inode->i_mtime.tv_sec = mtime; + inode->i_atime.tv_sec = atime; + inode->i_ctime.tv_sec = ctime; + +out_size_unlock: + ll_inode_size_unlock(inode); + + RETURN(rc); +} + +/** + * Set designated mirror for I/O. + * + * So far only read, write, and truncated can support to issue I/O to + * designated mirror. + */ +void ll_io_set_mirror(struct cl_io *io, const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + /* clear layout version for generic(non-resync) I/O in case it carries + * stale layout version due to I/O restart */ + io->ci_layout_version = 0; + + /* FLR: disable non-delay for designated mirror I/O because obviously + * only one mirror is available */ + if (fd->fd_designated_mirror > 0) { + io->ci_ndelay = 0; + io->ci_designated_mirror = fd->fd_designated_mirror; + io->ci_layout_version = fd->fd_layout_version; + } + + CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n", + file->f_path.dentry->d_name.name, io->ci_designated_mirror); +} + +/* + * This is relatime_need_update() from Linux 5.17, which is not exported. + */ +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec64 now) +{ + + if (!(mnt->mnt_flags & MNT_RELATIME)) + return 1; + /* + * Is mtime younger than atime? 
If yes, update atime: + */ + if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) + return 1; + /* + * Is ctime younger than atime? If yes, update atime: + */ + if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) + return 1; + + /* + * Is the previous atime value older than a day? If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + +/* + * Very similar to kernel function: !__atime_needs_update() + */ +static bool file_is_noatime(const struct file *file) +{ + struct vfsmount *mnt = file->f_path.mnt; + struct inode *inode = file_inode((struct file *)file); + struct timespec64 now; + + if (file->f_flags & O_NOATIME) + return true; + + if (inode->i_flags & S_NOATIME) + return true; + + if (IS_NOATIME(inode)) + return true; + + if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)) + return true; + + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + now = current_time(inode); + + if (!relatime_need_update(mnt, inode, now)) + return true; + + return false; +} + +static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + io->ci_lock_no_expand = fd->ll_lock_no_expand; + + if (iot == CIT_WRITE) { + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC || + file->f_flags & O_DIRECT || + IS_SYNC(inode)); + } + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_lockreq = CILR_MAYBE; + if (ll_file_nolock(file)) { + io->ci_lockreq = CILR_NEVER; + io->ci_no_srvlock = 1; + } else if (file->f_flags & O_APPEND) { + io->ci_lockreq = CILR_MANDATORY; + } + io->ci_noatime = file_is_noatime(file); + + /* FLR: only use non-delay I/O for read as there is only one + * avaliable mirror for write. */ + io->ci_ndelay = !(iot == CIT_WRITE); + + ll_io_set_mirror(io, file); +} + +static ssize_t +ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, + struct file *file, enum cl_io_type iot, + loff_t *ppos, size_t count) +{ + struct vvp_io *vio = vvp_env_io(env); + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct range_lock range; + struct cl_io *io; + ssize_t result = 0; + int rc = 0; + unsigned retried = 0; + bool restarted = false; + + ENTRY; + + CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", *ppos, count); + +restart: + io = vvp_env_thread_io(env); + ll_io_init(io, file, iot); + io->ci_ndelay_tried = retried; + + if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { + bool range_locked = false; + + if (file->f_flags & O_APPEND) + range_lock_init(&range, 0, LUSTRE_EOF); + else + range_lock_init(&range, *ppos, *ppos + count - 1); + + vio->vui_fd = LUSTRE_FPRIVATE(file); + vio->vui_io_subtype = args->via_io_subtype; + + switch (vio->vui_io_subtype) { + case IO_NORMAL: + vio->vui_iter = args->u.normal.via_iter; + vio->vui_iocb = args->u.normal.via_iocb; + /* Direct IO reads must also take range lock, + * or multiple reads will try to work on the same pages + * See LU-6227 for details. 
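+ * The check below therefore takes the range lock for every write and
+ * for O_DIRECT reads, but skips it when the file is group locked.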
*/ + if (((iot == CIT_WRITE) || + (iot == CIT_READ && (file->f_flags & O_DIRECT))) && + !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n", + RL_PARA(&range)); + rc = range_lock(&lli->lli_write_tree, &range); + if (rc < 0) + GOTO(out, rc); + + range_locked = true; + } + break; + case IO_SPLICE: + vio->u.splice.vui_pipe = args->u.splice.via_pipe; + vio->u.splice.vui_flags = args->u.splice.via_flags; + break; + default: + CERROR("unknown IO subtype %u\n", vio->vui_io_subtype); + LBUG(); + } + + ll_cl_add(file, env, io, LCC_RW); + rc = cl_io_loop(env, io); + ll_cl_remove(file, env); + + if (range_locked) { + CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n", + RL_PARA(&range)); + range_unlock(&lli->lli_write_tree, &range); + } + } else { + /* cl_io_rw_init() handled IO */ + rc = io->ci_result; + } + + if (io->ci_nob > 0) { + result += io->ci_nob; + count -= io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */ + + /* prepare IO restart */ + if (count > 0 && args->via_io_subtype == IO_NORMAL) + args->u.normal.via_iter = vio->vui_iter; + } +out: + cl_io_fini(env, io); + + CDEBUG(D_VFSTRACE, + "%s: %d io complete with rc: %d, result: %zd, restart: %d\n", + file->f_path.dentry->d_name.name, + iot, rc, result, io->ci_need_restart); + + if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", + *ppos, count, result, rc); + /* preserve the tried count for FLR */ + retried = io->ci_ndelay_tried; + restarted = true; + goto restart; + } + + if (iot == CIT_READ) { + if (result > 0) + ll_stats_ops_tally(ll_i2sbi(inode), + LPROC_LL_READ_BYTES, result); + } else if (iot == CIT_WRITE) { + if (result > 0) { + ll_stats_ops_tally(ll_i2sbi(inode), + LPROC_LL_WRITE_BYTES, result); + fd->fd_write_failed = false; + } else if (result == 0 && rc == 0) { + rc = io->ci_result; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } else if (rc != -ERESTARTSYS) { + fd->fd_write_failed = true; + } + } + + CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result); + + RETURN(result > 0 ? result : rc); +} + +/** + * The purpose of fast read is to overcome per I/O overhead and improve IOPS + * especially for small I/O. + * + * To serve a read request, CLIO has to create and initialize a cl_io and + * then request DLM lock. This has turned out to have siginificant overhead + * and affects the performance of small I/O dramatically. + * + * It's not necessary to create a cl_io for each I/O. Under the help of read + * ahead, most of the pages being read are already in memory cache and we can + * read those pages directly because if the pages exist, the corresponding DLM + * lock must exist so that page content must be valid. + * + * In fast read implementation, the llite speculatively finds and reads pages + * in memory cache. There are three scenarios for fast read: + * - If the page exists and is uptodate, kernel VM will provide the data and + * CLIO won't be intervened; + * - If the page was brought into memory by read ahead, it will be exported + * and read ahead parameters will be updated; + * - Otherwise the page is not in memory, we can't do fast read. Therefore, + * it will go back and invoke normal read, i.e., a cl_io will be created + * and DLM lock will be requested. + * + * POSIX compliance: posix standard states that read is intended to be atomic. 
+ * Lustre read implementation is in line with Linux kernel read implementation + * and neither of them complies with POSIX standard in this matter. Fast read + * doesn't make the situation worse on single node but it may interleave write + * results from multiple nodes due to short read handling in ll_file_aio_read(). + * + * \param env - lu_env + * \param iocb - kiocb from kernel + * \param iter - user space buffers where the data will be copied + * + * \retval - number of bytes have been read, or error code if error occurred. + */ +static ssize_t +ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t result; + + if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp)))) + return 0; + + /* NB: we can't do direct IO for fast read because it will need a lock + * to make IO engine happy. */ + if (iocb->ki_filp->f_flags & O_DIRECT) + return 0; + + result = generic_file_read_iter(iocb, iter); + + /* If the first page is not in cache, generic_file_aio_read() will be + * returned with -ENODATA. + * See corresponding code in ll_readpage(). */ + if (result == -ENODATA) + result = 0; + + if (result > 0) + ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)), + LPROC_LL_READ_BYTES, result); + + return result; +} + +/* + * Read from a file (through the page cache). + */ +static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct lu_env *env; + struct vvp_io_args *args; + struct file *file = iocb->ki_filp; + ssize_t result; + ssize_t rc2; + __u16 refcheck; + + if (!iov_iter_count(to)) + return 0; + + result = ll_do_fast_read(iocb, to); + if (result < 0 || iov_iter_count(to) == 0) + GOTO(out, result); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = ll_env_args(env, IO_NORMAL); + args->u.normal.via_iter = to; + args->u.normal.via_iocb = iocb; + + rc2 = ll_file_io_generic(env, args, file, CIT_READ, + &iocb->ki_pos, iov_iter_count(to)); + if (rc2 > 0) + result += rc2; + else if (result == 0) + result = rc2; + + cl_env_put(env, &refcheck); +out: + if (result > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + LUSTRE_FPRIVATE(file), iocb->ki_pos, result, + READ); + + return result; +} + +/** + * Similar trick to ll_do_fast_read, this improves write speed for tiny writes. + * If a page is already in the page cache and dirty (and some other things - + * See ll_tiny_write_begin for the instantiation of these rules), then we can + * write to it without doing a full I/O, because Lustre already knows about it + * and will write it out. This saves a lot of processing time. + * + * All writes here are within one page, so exclusion is handled by the page + * lock on the vm page. We do not do tiny writes for writes which touch + * multiple pages because it's very unlikely multiple sequential pages are + * are already dirty. + * + * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common + * and are unlikely to be to already dirty pages. + * + * Attribute updates are important here, we do them in ll_tiny_write_end. + */ +static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t count = iov_iter_count(iter); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + bool lock_inode = !IS_NOSEC(inode); + ssize_t result = 0; + + ENTRY; + + /* Restrict writes to single page and < PAGE_SIZE. See comment at top + * of function for why. 
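+ * For example, with 4KiB pages a 10-byte write at offset 4090 is
+ * rejected here ((4090 & 4095) + 10 = 4100 > 4096) since it would
+ * touch two pages, and the caller falls back to the normal write path.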
+ */ + if (count >= PAGE_SIZE || + (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE) + RETURN(0); + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(iocb, iter); + + if (unlikely(lock_inode)) + inode_unlock(inode); + + /* If the page is not already dirty, ll_tiny_write_begin returns + * -ENODATA. We continue on to normal write. + */ + if (result == -ENODATA) + result = 0; + + if (result > 0) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES, + result); + ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); + } + + CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count); + + RETURN(result); +} + +/* + * Write to a file (through the page cache). + */ +static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct vvp_io_args *args; + struct lu_env *env; + ssize_t rc_tiny = 0, rc_normal; + struct file *file = iocb->ki_filp; + __u16 refcheck; + + ENTRY; + + if (!iov_iter_count(from)) + GOTO(out, rc_normal = 0); + + /* NB: we can't do direct IO for tiny writes because they use the page + * cache, we can't do sync writes because tiny writes can't flush + * pages, and we can't do append writes because we can't guarantee the + * required DLM locks are held to protect file size. + */ + if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) && + !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND))) + rc_tiny = ll_do_tiny_write(iocb, from); + + /* In case of error, go on and try normal write - Only stop if tiny + * write completed I/O. + */ + if (iov_iter_count(from) == 0) + GOTO(out, rc_normal = rc_tiny); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = ll_env_args(env, IO_NORMAL); + args->u.normal.via_iter = from; + args->u.normal.via_iocb = iocb; + + rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); + + /* On success, combine bytes written. */ + if (rc_tiny >= 0 && rc_normal > 0) + rc_normal += rc_tiny; + /* On error, only return error from normal write if tiny write did not + * write any bytes. Otherwise return bytes written by tiny write. + */ + else if (rc_tiny > 0) + rc_normal = rc_tiny; + + cl_env_put(env, &refcheck); +out: + if (rc_normal > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + LUSTRE_FPRIVATE(file), iocb->ki_pos, + rc_normal, WRITE); + RETURN(rc_normal); +} + +#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +/* + * XXX: exact copy from kernel code (__generic_file_aio_write_nolock) + */ +static int ll_file_get_iov_count(const struct iovec *iov, + unsigned long *nr_segs, size_t *count) +{ + size_t cnt = 0; + unsigned long seg; + + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
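+ * The single test on (cnt|iv->iov_len) below works because OR-ing the
+ * running total with the current segment length has the sign bit set,
+ * when viewed as ssize_t, whenever either value has gone negative, so
+ * one compare covers both the bad-segment and the wrapped-sum cases.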
+ */ + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ + break; + } + *count = cnt; + return 0; +} + +static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct iov_iter to; + size_t iov_count; + ssize_t result; + ENTRY; + + result = ll_file_get_iov_count(iov, &nr_segs, &iov_count); + if (result) + RETURN(result); + + if (!iov_count) + RETURN(0); + +# ifdef HAVE_IOV_ITER_INIT_DIRECTION + iov_iter_init(&to, READ, iov, nr_segs, iov_count); +# else /* !HAVE_IOV_ITER_INIT_DIRECTION */ + iov_iter_init(&to, iov, nr_segs, iov_count, 0); +# endif /* HAVE_IOV_ITER_INIT_DIRECTION */ + + result = ll_file_read_iter(iocb, &to); + + RETURN(result); +} + +static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = count }; + struct kiocb kiocb; + ssize_t result; + + ENTRY; + + if (!count) + RETURN(0); + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; +#ifdef HAVE_KIOCB_KI_LEFT + kiocb.ki_left = count; +#elif defined(HAVE_KI_NBYTES) + kiocb.i_nbytes = count; +#endif + + result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; + + RETURN(result); +} + +/* + * Write to a file (through the page cache). + * AIO stuff + */ +static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct iov_iter from; + size_t iov_count; + ssize_t result; + ENTRY; + + result = ll_file_get_iov_count(iov, &nr_segs, &iov_count); + if (result) + RETURN(result); + + if (!iov_count) + RETURN(0); + +# ifdef HAVE_IOV_ITER_INIT_DIRECTION + iov_iter_init(&from, WRITE, iov, nr_segs, iov_count); +# else /* !HAVE_IOV_ITER_INIT_DIRECTION */ + iov_iter_init(&from, iov, nr_segs, iov_count, 0); +# endif /* HAVE_IOV_ITER_INIT_DIRECTION */ + + result = ll_file_write_iter(iocb, &from); + + RETURN(result); +} + +static ssize_t ll_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + struct kiocb kiocb; + ssize_t result; + + ENTRY; + + if (!count) + RETURN(0); + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; +#ifdef HAVE_KIOCB_KI_LEFT + kiocb.ki_left = count; +#elif defined(HAVE_KI_NBYTES) + kiocb.ki_nbytes = count; +#endif + + result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; + + RETURN(result); +} +#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + +/* + * Send file content (through pagecache) somewhere with helper + */ +static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + args = ll_env_args(env, IO_SPLICE); + args->u.splice.via_pipe = pipe; + args->u.splice.via_flags = flags; + + result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); + cl_env_put(env, &refcheck); + + if (result > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid, + LUSTRE_FPRIVATE(in_file), *ppos, result, + READ); + RETURN(result); +} + +int ll_lov_setstripe_ea_info(struct 
inode *inode, struct dentry *dentry, + __u64 flags, struct lov_user_md *lum, int lum_size) +{ + struct lookup_intent oit = { + .it_op = IT_OPEN, + .it_flags = flags | MDS_OPEN_BY_FID, + }; + int rc; + ENTRY; + + if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) { + /* this code will only exist for big-endian systems */ + lustre_swab_lov_user_md(lum, 0); + } + + ll_inode_size_lock(inode); + rc = ll_intent_file_open(dentry, lum, lum_size, &oit); + if (rc < 0) + GOTO(out_unlock, rc); + + ll_release_openhandle(dentry, &oit); + +out_unlock: + ll_inode_size_unlock(inode); + ll_intent_release(&oit); + + RETURN(rc); +} + +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmmp, int *lmm_size, + struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + int rc, lmmsize; + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, + strlen(filename), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name failed " + "on %s: rc %d\n", filename, rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); /* checked by mdc_getattr_name */ + + lmmsize = body->mbo_eadatasize; + + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) { + GOTO(out, rc = -ENODATA); + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1)) + GOTO(out, rc = -EPROTO); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. 
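+ * In practice the check below only fires on big-endian hosts: there the
+ * little-endian magic read from the wire appears byte-swapped and
+ * matches __swab32(LOV_MAGIC_MAGIC), which signals that the descriptor
+ * (and, for regular files, its lmm_objects) still needs swabbing.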
+ */ + if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) == + __swab32(LOV_MAGIC_MAGIC)) { + int stripe_count = 0; + + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || + lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & + LOV_PATTERN_F_RELEASED) + stripe_count = 0; + } + + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + else if (lmm->lmm_magic == LOV_MAGIC_V3 && + S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); + } + +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + return rc; +} + +static int ll_lov_setea(struct inode *inode, struct file *file, + void __user *arg) +{ + __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE; + struct lov_user_md *lump; + int lum_size = sizeof(struct lov_user_md) + + sizeof(struct lov_user_ost_data); + int rc; + ENTRY; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + OBD_ALLOC_LARGE(lump, lum_size); + if (lump == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(lump, arg, lum_size)) + GOTO(out_lump, rc = -EFAULT); + + rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump, + lum_size); + cl_lov_delay_create_clear(&file->f_flags); + +out_lump: + OBD_FREE_LARGE(lump, lum_size); + RETURN(rc); +} + +static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size) +{ + struct lu_env *env; + __u16 refcheck; + int rc; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static int ll_lov_setstripe(struct inode *inode, struct file *file, + void __user *arg) +{ + struct lov_user_md __user *lum = (struct lov_user_md __user *)arg; + struct lov_user_md *klum; + int lum_size, rc; + __u64 flags = FMODE_WRITE; + ENTRY; + + rc = ll_copy_user_md(lum, &klum); + if (rc < 0) + RETURN(rc); + + lum_size = rc; + rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum, + lum_size); + if (!rc) { + __u32 gen; + + rc = put_user(0, &lum->lmm_stripe_count); + if (rc) + GOTO(out, rc); + + rc = ll_layout_refresh(inode, &gen); + if (rc) + GOTO(out, rc); + + rc = ll_file_getstripe(inode, arg, lum_size); + } + cl_lov_delay_create_clear(&file->f_flags); + +out: + OBD_FREE_LARGE(klum, lum_size); + RETURN(rc); +} + +static int +ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_grouplock grouplock; + int rc; + ENTRY; + + if (arg == 0) { + CWARN("group id for group lock must not be 0\n"); + RETURN(-EINVAL); + } + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + CWARN("group lock already existed with gid %lu\n", + fd->fd_grouplock.lg_gid); + spin_unlock(&lli->lli_lock); + RETURN(-EINVAL); + } + LASSERT(fd->fd_grouplock.lg_lock == NULL); + spin_unlock(&lli->lli_lock); + + /** + * XXX: group lock needs to protect all OST objects while PFL + * can add new OST objects during the IO, so we'd instantiate + * 
all OST objects before getting its group lock. + */ + if (obj) { + struct lu_env *env; + __u16 refcheck; + struct cl_layout cl = { + .cl_is_composite = false, + }; + struct lu_extent ext = { + .e_start = 0, + .e_end = OBD_OBJECT_EOF, + }; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (!rc && cl.cl_is_composite) + rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, + &ext); + + cl_env_put(env, &refcheck); + if (rc) + RETURN(rc); + } + + rc = cl_get_grouplock(ll_i2info(inode)->lli_clob, + arg, (file->f_flags & O_NONBLOCK), &grouplock); + if (rc) + RETURN(rc); + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + spin_unlock(&lli->lli_lock); + CERROR("another thread just won the race\n"); + cl_put_grouplock(&grouplock); + RETURN(-EINVAL); + } + + fd->fd_flags |= LL_FILE_GROUP_LOCKED; + fd->fd_grouplock = grouplock; + spin_unlock(&lli->lli_lock); + + CDEBUG(D_INFO, "group lock %lu obtained\n", arg); + RETURN(0); +} + +static int ll_put_grouplock(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_grouplock grouplock; + ENTRY; + + spin_lock(&lli->lli_lock); + if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + spin_unlock(&lli->lli_lock); + CWARN("no group lock held\n"); + RETURN(-EINVAL); + } + + LASSERT(fd->fd_grouplock.lg_lock != NULL); + + if (fd->fd_grouplock.lg_gid != arg) { + CWARN("group lock %lu doesn't match current id %lu\n", + arg, fd->fd_grouplock.lg_gid); + spin_unlock(&lli->lli_lock); + RETURN(-EINVAL); + } + + grouplock = fd->fd_grouplock; + memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); + fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; + spin_unlock(&lli->lli_lock); + + cl_put_grouplock(&grouplock); + CDEBUG(D_INFO, "group lock %lu released\n", arg); + RETURN(0); +} + +/** + * Close inode open handle + * + * \param dentry [in] dentry which contains the inode + * \param it [in,out] intent which contains open info and result + * + * \retval 0 success + * \retval <0 failure + */ +int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) +{ + struct inode *inode = dentry->d_inode; + struct obd_client_handle *och; + int rc; + ENTRY; + + LASSERT(inode); + + /* Root ? Do nothing. */ + if (is_root_inode(inode)) + RETURN(0); + + /* No open handle to close? Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + RETURN(0); + + LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); + + OBD_ALLOC(och, sizeof(*och)); + if (!och) + GOTO(out, rc = -ENOMEM); + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc) + GOTO(out, rc); + + rc = ll_close_inode_openhandle(inode, och, 0, NULL); +out: + /* this one is in place of ll_file_open */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->it_request); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + RETURN(rc); +} + +/** + * Get size for inode for which FIEMAP mapping is requested. + * Make the FIEMAP get_info call and returns the result. 
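+ * If fm_flags has FIEMAP_FLAG_SYNC set, dirty pages are pushed out with
+ * filemap_fdatawrite() first so the mapping reflects recently written
+ * data.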
+ * \param fiemap kernel buffer to hold extens + * \param num_bytes kernel buffer size + */ +static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap, + size_t num_bytes) +{ + struct lu_env *env; + __u16 refcheck; + int rc = 0; + struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, }; + ENTRY; + + /* Checks for fiemap flags */ + if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) { + fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT; + return -EBADR; + } + + /* Check for FIEMAP_FLAG_SYNC */ + if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) { + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return rc; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (i_size_read(inode) == 0) { + rc = ll_glimpse_size(inode); + if (rc) + GOTO(out, rc); + } + + fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE); + obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid); + + /* If filesize is 0, then there would be no objects for mapping */ + if (fmkey.lfik_oa.o_size == 0) { + fiemap->fm_mapped_extents = 0; + GOTO(out, rc = 0); + } + + fmkey.lfik_fiemap = *fiemap; + + rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob, + &fmkey, fiemap, &num_bytes); +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +int ll_fid2path(struct inode *inode, void __user *arg) +{ + struct obd_export *exp = ll_i2mdexp(inode); + const struct getinfo_fid2path __user *gfin = arg; + __u32 pathlen; + struct getinfo_fid2path *gfout; + size_t outsize; + int rc; + + ENTRY; + + if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) + RETURN(-EPERM); + + /* Only need to get the buflen */ + if (get_user(pathlen, &gfin->gf_pathlen)) + RETURN(-EFAULT); + + if (pathlen > PATH_MAX) + RETURN(-EINVAL); + + outsize = sizeof(*gfout) + pathlen; + OBD_ALLOC(gfout, outsize); + if (gfout == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(gfout, arg, sizeof(*gfout))) + GOTO(gf_free, rc = -EFAULT); + /* append root FID after gfout to let MDT know the root FID so that it + * can lookup the correct path, this is mainly for fileset. + * old server without fileset mount support will ignore this. */ + *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL); + if (rc != 0) + GOTO(gf_free, rc); + + if (copy_to_user(arg, gfout, outsize)) + rc = -EFAULT; + +gf_free: + OBD_FREE(gfout, outsize); + RETURN(rc); +} + +static int +ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) +{ + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct lu_env *env; + struct cl_io *io; + __u16 refcheck; + int result; + + ENTRY; + + ioc->idv_version = 0; + ioc->idv_layout_version = UINT_MAX; + + /* If no file object initialized, we consider its version is 0. 
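+ * (e.g. the file has no layout yet, so there are no OST objects whose
+ * version could be queried).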
*/ + if (obj == NULL) + RETURN(0); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->u.ci_data_version.dv_data_version = 0; + io->u.ci_data_version.dv_layout_version = UINT_MAX; + io->u.ci_data_version.dv_flags = ioc->idv_flags; + +restart: + if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + + ioc->idv_version = io->u.ci_data_version.dv_data_version; + ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version; + + cl_io_fini(env, io); + + if (unlikely(io->ci_need_restart)) + goto restart; + + cl_env_put(env, &refcheck); + + RETURN(result); +} + +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param flags if do sync on the OST side; + * 0: no sync + * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs + * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs + */ +int ll_data_version(struct inode *inode, __u64 *data_version, int flags) +{ + struct ioc_data_version ioc = { .idv_flags = flags }; + int rc; + + rc = ll_ioc_data_version(inode, &ioc); + if (!rc) + *data_version = ioc.idv_version; + + return rc; +} + +/* + * Trigger a HSM release request for the provided inode. + */ +int ll_hsm_release(struct inode *inode) +{ + struct lu_env *env; + struct obd_client_handle *och = NULL; + __u64 data_version = 0; + int rc; + __u16 refcheck; + ENTRY; + + CDEBUG(D_INODE, "%s: Releasing file "DFID".\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&ll_i2info(inode)->lli_fid)); + + /* + * For directory, this is not the right + * way to do the release. Ideally this should clean + * up the directory without triggering update to the backend. + * Right now, this just sets the RELEASED bit for the + * directory. This is left as is so as to have a way to set + * the RELEASED bit as a deug/recovery method + * instead of doing a rm on the directory. + * TODO-MDLL: Tracking SIM - Simba-21969 + */ + if (S_ISDIR(inode->i_mode)) + och = ll_lease_open(inode, NULL, FMODE_READ, MDS_OPEN_RELEASE); + else + och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); + if (IS_ERR(och)) + GOTO(out, rc = PTR_ERR(och)); + + /* Grab latest data_version and [am]time values */ + rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH); + if (rc != 0) + GOTO(out, rc); + + /* Don't need to merge these attrs for directories */ + if (!S_ISDIR(inode->i_mode)) { + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + + rc = ll_merge_attr(env, inode); + cl_env_put(env, &refcheck); + + /* If error happen, we have the wrong size for a file. + * Don't release it. + */ + if (rc != 0) + GOTO(out, rc); + } + + /* Release the file. + * NB: lease lock handle is released in mdc_hsm_release_pack() because + * we still need it to pack l_remote_handle to MDT. 
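+ * och is set to NULL right after the close below so that the error path
+ * at "out:" does not try to close the already-consumed lease handle a
+ * second time.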
*/ + rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE, + &data_version); + och = NULL; + + EXIT; +out: + if (och != NULL && !IS_ERR(och)) /* close the file */ + ll_lease_close(och, inode, NULL); + + return rc; +} + +struct ll_swap_stack { + __u64 dv1; + __u64 dv2; + struct inode *inode1; + struct inode *inode2; + bool check_dv1; + bool check_dv2; +}; + +static int ll_swap_layouts(struct file *file1, struct file *file2, + struct lustre_swap_layouts *lsl) +{ + struct mdc_swap_layouts msl; + struct md_op_data *op_data; + __u32 gid; + __u64 dv; + struct ll_swap_stack *llss = NULL; + int rc; + + OBD_ALLOC_PTR(llss); + if (llss == NULL) + RETURN(-ENOMEM); + + llss->inode1 = file_inode(file1); + llss->inode2 = file_inode(file2); + + rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2); + if (rc < 0) + GOTO(free, rc); + + /* we use 2 bool because it is easier to swap than 2 bits */ + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) + llss->check_dv1 = true; + + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) + llss->check_dv2 = true; + + /* we cannot use lsl->sl_dvX directly because we may swap them */ + llss->dv1 = lsl->sl_dv1; + llss->dv2 = lsl->sl_dv2; + + rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); + if (rc == 0) /* same file, done! */ + GOTO(free, rc); + + if (rc < 0) { /* sequentialize it */ + swap(llss->inode1, llss->inode2); + swap(file1, file2); + swap(llss->dv1, llss->dv2); + swap(llss->check_dv1, llss->check_dv2); + } + + gid = lsl->sl_gid; + if (gid != 0) { /* application asks to flush dirty cache */ + rc = ll_get_grouplock(llss->inode1, file1, gid); + if (rc < 0) + GOTO(free, rc); + + rc = ll_get_grouplock(llss->inode2, file2, gid); + if (rc < 0) { + ll_put_grouplock(llss->inode1, file1, gid); + GOTO(free, rc); + } + } + + /* ultimate check, before swaping the layouts we check if + * dataversion has changed (if requested) */ + if (llss->check_dv1) { + rc = ll_data_version(llss->inode1, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv1) + GOTO(putgl, rc = -EAGAIN); + } + + if (llss->check_dv2) { + rc = ll_data_version(llss->inode2, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv2) + GOTO(putgl, rc = -EAGAIN); + } + + /* struct md_op_data is used to send the swap args to the mdt + * only flags is missing, so we use struct mdc_swap_layouts + * through the md_op_data->op_data */ + /* flags from user space have to be converted before they are send to + * server, no flag is sent today, they are only used on the client */ + msl.msl_flags = 0; + rc = -ENOMEM; + op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, + 0, LUSTRE_OPC_ANY, &msl); + if (IS_ERR(op_data)) + GOTO(free, rc = PTR_ERR(op_data)); + + rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1), + sizeof(*op_data), op_data, NULL); + ll_finish_md_op_data(op_data); + + if (rc < 0) + GOTO(putgl, rc); + +putgl: + if (gid != 0) { + ll_put_grouplock(llss->inode2, file2, gid); + ll_put_grouplock(llss->inode1, file1, gid); + } + +free: + if (llss != NULL) + OBD_FREE_PTR(llss); + + RETURN(rc); +} + +int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) +{ + struct obd_export *exp = ll_i2mdexp(inode); + struct md_op_data *op_data; + int rc; + ENTRY; + + /* Detect out-of range masks */ + if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK) + RETURN(-EINVAL); + + /* Non-root users are forbidden to set or clear flags which are + * NOT defined in HSM_USER_MASK. 
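+ * (e.g. user-settable bits such as HS_DIRTY or HS_NOARCHIVE pass this
+ * check, while administrative state bits such as HS_ARCHIVED or
+ * HS_RELEASED require CAP_SYS_ADMIN).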
*/ + if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) && + !cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (!exp_connect_archive_id_array(exp)) { + /* Detect out-of range archive id */ + if ((hss->hss_valid & HSS_ARCHIVE_ID) && + (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE)) + RETURN(-EINVAL); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hss); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data), + op_data, NULL); + + ll_finish_md_op_data(op_data); + + RETURN(rc); +} + +int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui) +{ + struct hsm_state_set *hss = NULL; + struct iattr *attr = NULL; + int rc; + ENTRY; + + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EINVAL); + + /* set HSM flags */ + OBD_ALLOC_PTR(hss); + if (hss == NULL) + GOTO(out, rc = -ENOMEM); + + hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID; + hss->hss_archive_id = hui->hui_archive_id; + hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED; + rc = ll_hsm_state_set(inode, hss); + if (rc != 0) + GOTO(out, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out, rc = -ENOMEM); + + attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO); + + if (S_ISDIR(inode->i_mode)) + attr->ia_mode |= S_IFDIR; + else + attr->ia_mode |= S_IFREG; + + attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); + attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); + attr->ia_size = hui->hui_size; + attr->ia_mtime.tv_sec = hui->hui_mtime; + attr->ia_mtime.tv_nsec = hui->hui_mtime_ns; + attr->ia_atime.tv_sec = hui->hui_atime; + attr->ia_atime.tv_nsec = hui->hui_atime_ns; + + attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE | + ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_ATIME | ATTR_ATIME_SET; + + /* + * TODO-MDLL check if this needs to be done here + * or in ll_setattr_raw(). The ll_setattr_raw does a + * unlock() before it calls the ll_md_setattr() for + * regular files using S_ISREG(). Calling this for + * inodes other than files might result in a deadlock. + * Tracked with Simba-20393. + */ + if (S_ISREG(inode->i_mode)) + inode_lock(inode); + + rc = ll_setattr_raw(file_dentry(file), attr, 0, true); + if (rc == -ENODATA) + rc = 0; + + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); + +out: + if (hss != NULL) + OBD_FREE_PTR(hss); + + if (attr != NULL) + OBD_FREE_PTR(attr); + + RETURN(rc); +} + +static inline long ll_lease_type_from_fmode(fmode_t fmode) +{ + return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) | + ((fmode & FMODE_WRITE) ? 
LL_LEASE_WRLCK : 0); +} + +static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) +{ + struct inode *inode = file_inode(file); + struct iattr ia = { + .ia_valid = ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME, + .ia_atime = { + .tv_sec = lfu->lfu_atime_sec, + .tv_nsec = lfu->lfu_atime_nsec, + }, + .ia_mtime = { + .tv_sec = lfu->lfu_mtime_sec, + .tv_nsec = lfu->lfu_mtime_nsec, + }, + .ia_ctime = { + .tv_sec = lfu->lfu_ctime_sec, + .tv_nsec = lfu->lfu_ctime_nsec, + }, + }; + int rc; + ENTRY; + + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (!S_ISREG(inode->i_mode)) + RETURN(-EINVAL); + + inode_lock(inode); + rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET, + false); + inode_unlock(inode); + + RETURN(rc); +} + +static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode) +{ + switch (mode) { + case MODE_READ_USER: + return CLM_READ; + case MODE_WRITE_USER: + return CLM_WRITE; + default: + return -EINVAL; + } +} + +static const char *const user_lockname[] = LOCK_MODE_NAMES; + +/* Used to allow the upper layers of the client to request an LDLM lock + * without doing an actual read or write. + * + * Used for ladvise lockahead to manually request specific locks. + * + * \param[in] file file this ladvise lock request is on + * \param[in] ladvise ladvise struct describing this lock request + * + * \retval 0 success, no detailed result available (sync requests + * and requests sent to the server [not handled locally] + * cannot return detailed results) + * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request, + * see definitions for details. + * \retval negative negative errno on error + */ +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_lock *lock = NULL; + struct cl_lock_descr *descr = NULL; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + enum cl_lock_mode cl_mode; + off_t start = ladvise->lla_start; + off_t end = ladvise->lla_end; + int result; + __u16 refcheck; + + ENTRY; + + CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s " + "start=%llu, end=%llu\n", dentry->d_name.len, + dentry->d_name.name, dentry->d_inode, + user_lockname[ladvise->lla_lockahead_mode], (__u64) start, + (__u64) end); + + cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode); + if (cl_mode < 0) + GOTO(out, result = cl_mode); + + /* Get IO environment */ + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + GOTO(out, result); + + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. This currently happens when + * stripe sub-object's are not yet created. 
+ */ + result = io->ci_result; + } else if (result == 0) { + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + + descr->cld_obj = io->ci_obj; + /* Convert byte offsets to pages */ + descr->cld_start = cl_index(io->ci_obj, start); + descr->cld_end = cl_index(io->ci_obj, end); + descr->cld_mode = cl_mode; + /* CEF_MUST is used because we do not want to convert a + * lockahead request to a lockless lock */ + descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND | + CEF_NONBLOCK; + + if (ladvise->lla_peradvice_flags & LF_ASYNC) + descr->cld_enq_flags |= CEF_SPECULATIVE; + + result = cl_lock_request(env, io, lock); + + /* On success, we need to release the lock */ + if (result >= 0) + cl_lock_release(env, lock); + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + /* -ECANCELED indicates a matching lock with a different extent + * was already present, and -EEXIST indicates a matching lock + * on exactly the same extent was already present. + * We convert them to positive values for userspace to make + * recognizing true errors easier. + * Note we can only return these detailed results on async requests, + * as sync requests look the same as i/o requests for locking. */ + if (result == -ECANCELED) + result = LLA_RESULT_DIFFERENT; + else if (result == -EEXIST) + result = LLA_RESULT_SAME; + +out: + RETURN(result); +} +static const char *const ladvise_names[] = LU_LADVISE_NAMES; + +static int ll_ladvise_sanity(struct inode *inode, + struct llapi_lu_ladvise *ladvise) +{ + enum lu_ladvise_type advice = ladvise->lla_advice; + /* Note the peradvice flags is a 32 bit field, so per advice flags must + * be in the first 32 bits of enum ladvise_flags */ + __u32 flags = ladvise->lla_peradvice_flags; + /* 3 lines at 80 characters per line, should be plenty */ + int rc = 0; + + if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized," + "last supported advice is %s (value '%d'): rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), advice, + ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc); + GOTO(out, rc); + } + + /* Per-advice checks */ + switch (advice) { + case LU_LADVISE_LOCKNOEXPAND: + if (flags & ~LF_LOCKNOEXPAND_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + case LU_LADVISE_LOCKAHEAD: + /* Currently only READ and WRITE modes can be requested */ + if (ladvise->lla_lockahead_mode >= MODE_MAX_USER || + ladvise->lla_lockahead_mode == 0) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + ladvise->lla_lockahead_mode, + ladvise_names[advice], rc); + GOTO(out, rc); + } + fallthrough; + case LU_LADVISE_WILLREAD: + case LU_LADVISE_DONTNEED: + default: + /* Note fall through above - These checks apply to all advices + * except LOCKNOEXPAND */ + if (flags & ~LF_DEFAULT_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + if (ladvise->lla_start >= ladvise->lla_end) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) " + "for %s: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + ladvise->lla_start, ladvise->lla_end, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + } + +out: + return rc; +} +#undef ERRSIZE + +/* + 
* Give file access advices + * + * The ladvise interface is similar to Linux fadvise() system call, except it + * forwards the advices directly from Lustre client to server. The server side + * codes will apply appropriate read-ahead and caching techniques for the + * corresponding files. + * + * A typical workload for ladvise is e.g. a bunch of different clients are + * doing small random reads of a file, so prefetching pages into OSS cache + * with big linear reads before the random IO is a net benefit. Fetching + * all that data into each client cache with fadvise() may not be, due to + * much more data being sent to the client. + */ +static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags, + struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_ladvise_io *lio; + int rc; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + + /* initialize parameters for ladvise */ + lio = &io->u.ci_ladvise; + lio->li_start = ladvise->lla_start; + lio->li_end = ladvise->lla_end; + lio->li_fid = ll_inode2fid(inode); + lio->li_advice = ladvise->lla_advice; + lio->li_flags = flags; + + if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0) + rc = cl_io_loop(env, io); + else + rc = io->ci_result; + + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static int ll_lock_noexpand(struct file *file, int flags) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + fd->ll_lock_no_expand = !(flags & LF_UNSET); + + return 0; +} + +int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + struct fsxattr fsxattr; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags); + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) + fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT; + fsxattr.fsx_projid = ll_i2info(inode)->lli_projid; + if (copy_to_user((struct fsxattr __user *)arg, + &fsxattr, sizeof(fsxattr))) + RETURN(-EFAULT); + + RETURN(0); +} + +int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. 
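+ * Inside the init namespace the request is allowed as-is; in any other
+ * user namespace the request must keep the current project ID and the
+ * PROJINHERIT flag unchanged, otherwise -EINVAL is returned below.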
+ */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (ll_i2info(inode)->lli_projid != fa->fsx_projid) + return -EINVAL; + + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + struct fsxattr fsxattr; + struct cl_object *obj; + struct iattr *attr; + int flags; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + rc = ll_ioctl_check_project(inode, &fsxattr); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags); + op_data->op_attr_flags = ll_inode_to_ext_flags(flags); + if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_projid = fsxattr.fsx_projid; + op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS; + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, + 0, &req); + ptlrpc_req_finished(req); + if (rc) + GOTO(out_fsxattr, rc); + ll_update_inode_flags(inode, op_data->op_attr_flags); + obj = ll_i2info(inode)->lli_clob; + if (obj == NULL) + GOTO(out_fsxattr, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out_fsxattr, rc = -ENOMEM); + + rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, + fsxattr.fsx_xflags); + OBD_FREE_PTR(attr); +out_fsxattr: + ll_finish_md_op_data(op_data); + RETURN(rc); +} + +static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + struct split_param sp; + bool lease_broken; + fmode_t fmode = 0; + enum mds_op_bias bias = 0; + struct file *layout_file = NULL; + void *data = NULL; + size_t data_size = 0; + long rc; + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och == NULL) + GOTO(out, rc = -ENOLCK); + + fmode = och->och_flags; + + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (ioc->lil_count > IOC_IDS_MAX) + GOTO(out, rc = -EINVAL); + + data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]); + OBD_ALLOC(data, data_size); + if (!data) + GOTO(out, rc = -ENOMEM); + + if (copy_from_user(data, (void __user *)arg, data_size)) + GOTO(out, rc = -EFAULT); + + bias = MDS_CLOSE_RESYNC_DONE; + break; + case LL_LEASE_LAYOUT_MERGE: { + int fd; + + if (ioc->lil_count != 1) + GOTO(out, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + layout_file = fget(fd); + if (!layout_file) + GOTO(out, rc = -EBADF); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY || + (layout_file->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out, rc = -EPERM); + + data = file_inode(layout_file); + bias = MDS_CLOSE_LAYOUT_MERGE; + break; + } + case LL_LEASE_LAYOUT_SPLIT: { + int fdv; + int mirror_id; + + if (ioc->lil_count != 2) + GOTO(out, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fdv, 
(void __user *)arg, sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + arg += sizeof(__u32); + if (copy_from_user(&mirror_id, (void __user *)arg, + sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + layout_file = fget(fdv); + if (!layout_file) + GOTO(out, rc = -EBADF); + + sp.sp_inode = file_inode(layout_file); + sp.sp_mirror_id = (__u16)mirror_id; + data = &sp; + bias = MDS_CLOSE_LAYOUT_SPLIT; + break; + } + default: + /* without close intent */ + break; + } + + rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data); + if (rc < 0) + GOTO(out, rc); + + rc = ll_lease_och_release(inode, file); + if (rc < 0) + GOTO(out, rc); + + if (lease_broken) + fmode = 0; + EXIT; + +out: + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (data) + OBD_FREE(data, data_size); + break; + case LL_LEASE_LAYOUT_MERGE: + case LL_LEASE_LAYOUT_SPLIT: + if (layout_file) + fput(layout_file); + break; + } + + if (!rc) + rc = ll_lease_type_from_fmode(fmode); + RETURN(rc); +} + +static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle *och = NULL; + __u64 open_flags = 0; + bool lease_broken; + fmode_t fmode; + long rc; + ENTRY; + + switch (ioc->lil_mode) { + case LL_LEASE_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + RETURN(-EPERM); + fmode = FMODE_WRITE; + break; + case LL_LEASE_RDLCK: + if (!(file->f_mode & FMODE_READ)) + RETURN(-EPERM); + fmode = FMODE_READ; + break; + case LL_LEASE_UNLCK: + RETURN(ll_file_unlock_lease(file, ioc, arg)); + default: + RETURN(-EINVAL); + } + + CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); + + /* apply for lease */ + if (ioc->lil_flags & LL_LEASE_RESYNC) + open_flags = MDS_OPEN_RESYNC; + och = ll_lease_open(inode, file, fmode, open_flags); + if (IS_ERR(och)) + RETURN(PTR_ERR(och)); + + if (ioc->lil_flags & LL_LEASE_RESYNC) { + rc = ll_lease_file_resync(och, inode, arg); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + rc = ll_layout_refresh(inode, &fd->fd_layout_version); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + } + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; + } + RETURN(rc); +} + +static long +ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int flags, rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n", + PFID(ll_inode2fid(inode)), inode, cmd); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + RETURN(-ENOTTY); + + switch (cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ + return put_user(fd->fd_flags, (int __user *)arg); + case LL_IOC_SETFLAGS: + case LL_IOC_CLRFLAGS: + /* Set or clear specific file flags */ + /* XXX This probably needs checks to ensure the flags are + * not abused, and to handle any flag side effects. 
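+ * One check that is enforced below: LL_FILE_IGNORE_LOCK may only be set
+ * on files opened with O_DIRECT, while clearing flags is unconditional.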
+ */ + if (get_user(flags, (int __user *) arg)) + RETURN(-EFAULT); + + if (cmd == LL_IOC_SETFLAGS) { + if ((flags & LL_FILE_IGNORE_LOCK) && + !(file->f_flags & O_DIRECT)) { + CERROR("%s: unable to disable locking on " + "non-O_DIRECT file\n", current->comm); + RETURN(-EINVAL); + } + + fd->fd_flags |= flags; + } else { + fd->fd_flags &= ~flags; + } + RETURN(0); + case LL_IOC_LOV_SETSTRIPE: + case LL_IOC_LOV_SETSTRIPE_NEW: + RETURN(ll_lov_setstripe(inode, file, (void __user *)arg)); + case LL_IOC_LOV_SETEA: + RETURN(ll_lov_setea(inode, file, (void __user *)arg)); + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct file *file2; + struct lustre_swap_layouts lsl; + + if (copy_from_user(&lsl, (char __user *)arg, + sizeof(struct lustre_swap_layouts))) + RETURN(-EFAULT); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY) + RETURN(-EPERM); + + file2 = fget(lsl.sl_fd); + if (file2 == NULL) + RETURN(-EBADF); + + /* O_WRONLY or O_RDWR */ + if ((file2->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out, rc = -EPERM); + + if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) { + struct inode *inode2; + struct ll_inode_info *lli; + struct obd_client_handle *och = NULL; + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och == NULL) + GOTO(out, rc = -ENOLCK); + inode2 = file_inode(file2); + rc = ll_swap_layouts_close(och, inode, inode2); + } else { + rc = ll_swap_layouts(file, file2, &lsl); + } +out: + fput(file2); + RETURN(rc); + } + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_LOV_GETSTRIPE_NEW: + RETURN(ll_file_getstripe(inode, (void __user *)arg, 0)); + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
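+ * Anything not special-cased here ends up in the default branch at the
+ * bottom of the switch, which forwards the ioctl to the data export via
+ * obd_iocontrol().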
*/ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + + case LL_IOC_GROUP_LOCK: + RETURN(ll_get_grouplock(inode, file, arg)); + case LL_IOC_GROUP_UNLOCK: + RETURN(ll_put_grouplock(inode, file, arg)); + case IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void __user *)arg)); + + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); + case LL_IOC_PATH2FID: { + if (copy_to_user((void __user *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + + RETURN(0); + } + case LL_IOC_GETPARENT: + RETURN(ll_getparent(file, (struct getparent __user *)arg)); + + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void __user *)arg)); + case LL_IOC_DATA_VERSION: { + struct ioc_data_version idv; + int rc; + + if (copy_from_user(&idv, (char __user *)arg, sizeof(idv))) + RETURN(-EFAULT); + + idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH; + rc = ll_ioc_data_version(inode, &idv); + + if (rc == 0 && + copy_to_user((char __user *)arg, &idv, sizeof(idv))) + RETURN(-EFAULT); + + RETURN(rc); + } + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int __user *)arg)) + RETURN(-EFAULT); + + RETURN(0); + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + RETURN(rc); + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + OBD_ALLOC_PTR(hss); + if (hss == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + RETURN(-EFAULT); + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + RETURN(rc); + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + OBD_ALLOC_PTR(hca); + if (hca == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } + case LL_IOC_SET_LEASE_OLD: { + struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg }; + + RETURN(ll_file_set_lease(file, &ioc, 0)); + } + case LL_IOC_SET_LEASE: { + struct ll_ioc_lease ioc; + + if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc))) + RETURN(-EFAULT); + + RETURN(ll_file_set_lease(file, &ioc, arg)); + } + case LL_IOC_GET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_lock *lock = NULL; + fmode_t fmode = 0; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + struct obd_client_handle *och = fd->fd_lease_och; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + if (!ldlm_is_cancel(lock)) + fmode = och->och_flags; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + } + 
mutex_unlock(&lli->lli_och_mutex); + + RETURN(ll_lease_type_from_fmode(fmode)); + } + case LL_IOC_HSM_IMPORT: { + struct hsm_user_import *hui; + + OBD_ALLOC_PTR(hui); + if (hui == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + RETURN(-EFAULT); + } + + rc = ll_hsm_import(inode, file, hui); + + OBD_FREE_PTR(hui); + RETURN(rc); + } + case LL_IOC_FUTIMES_3: { + struct ll_futimes_3 lfu; + + if (copy_from_user(&lfu, + (const struct ll_futimes_3 __user *)arg, + sizeof(lfu))) + RETURN(-EFAULT); + + RETURN(ll_file_futimes_3(file, &lfu)); + } + case LL_IOC_LADVISE: { + struct llapi_ladvise_hdr *k_ladvise_hdr; + struct llapi_ladvise_hdr __user *u_ladvise_hdr; + int i; + int num_advise; + int alloc_size = sizeof(*k_ladvise_hdr); + + rc = 0; + u_ladvise_hdr = (void __user *)arg; + OBD_ALLOC_PTR(k_ladvise_hdr); + if (k_ladvise_hdr == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) + GOTO(out_ladvise, rc = -EFAULT); + + if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC || + k_ladvise_hdr->lah_count < 1) + GOTO(out_ladvise, rc = -EINVAL); + + num_advise = k_ladvise_hdr->lah_count; + if (num_advise >= LAH_COUNT_MAX) + GOTO(out_ladvise, rc = -EFBIG); + + OBD_FREE_PTR(k_ladvise_hdr); + alloc_size = offsetof(typeof(*k_ladvise_hdr), + lah_advise[num_advise]); + OBD_ALLOC(k_ladvise_hdr, alloc_size); + if (k_ladvise_hdr == NULL) + RETURN(-ENOMEM); + + /* + * TODO: submit multiple advices to one server in a single RPC + */ + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) + GOTO(out_ladvise, rc = -EFAULT); + + for (i = 0; i < num_advise; i++) { + struct llapi_lu_ladvise *k_ladvise = + &k_ladvise_hdr->lah_advise[i]; + struct llapi_lu_ladvise __user *u_ladvise = + &u_ladvise_hdr->lah_advise[i]; + + rc = ll_ladvise_sanity(inode, k_ladvise); + if (rc) + GOTO(out_ladvise, rc); + + switch (k_ladvise->lla_advice) { + case LU_LADVISE_LOCKNOEXPAND: + rc = ll_lock_noexpand(file, + k_ladvise->lla_peradvice_flags); + GOTO(out_ladvise, rc); + case LU_LADVISE_LOCKAHEAD: + + rc = ll_file_lock_ahead(file, k_ladvise); + + if (rc < 0) + GOTO(out_ladvise, rc); + + if (put_user(rc, + &u_ladvise->lla_lockahead_result)) + GOTO(out_ladvise, rc = -EFAULT); + break; + default: + rc = ll_ladvise(inode, file, + k_ladvise_hdr->lah_flags, + k_ladvise); + if (rc) + GOTO(out_ladvise, rc); + break; + } + + } + +out_ladvise: + OBD_FREE(k_ladvise_hdr, alloc_size); + RETURN(rc); + } + case LL_IOC_FLR_SET_MIRROR: { + /* mirror I/O must be direct to avoid polluting page cache + * by stale data. 
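+ * The mirror id saved in fd_designated_mirror below is what the later
+ * read/write paths use to select the mirror for I/O on this file
+ * descriptor.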
*/ + if (!(file->f_flags & O_DIRECT)) + RETURN(-EINVAL); + + fd->fd_designated_mirror = (__u32)arg; + RETURN(0); + } + case LL_IOC_FSGETXATTR: + RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); + case LL_IOC_FSSETXATTR: + RETURN(ll_ioctl_fssetxattr(inode, cmd, arg)); + case BLKSSZGET: + RETURN(put_user(PAGE_SIZE, (int __user *)arg)); + default: + RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, + (void __user *)arg)); + } +} + +#ifndef HAVE_FILE_LLSEEK_SIZE +static inline loff_t +llseek_execute(struct file *file, loff_t offset, loff_t maxsize) +{ + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (offset > maxsize) + return -EINVAL; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + return offset; +} + +static loff_t +generic_file_llseek_size(struct file *file, loff_t offset, int origin, + loff_t maxsize, loff_t eof) +{ + struct inode *inode = file_inode(file); + + switch (origin) { + case SEEK_END: + offset += eof; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) + return file->f_pos; + /* + * f_lock protects against read/modify/write race with other + * SEEK_CURs. Note that parallel writes and reads behave + * like SEEK_SET. + */ + inode_lock(inode); + offset = llseek_execute(file, file->f_pos + offset, maxsize); + inode_unlock(inode); + return offset; + case SEEK_DATA: + /* + * In the generic case the entire file is data, so as long as + * offset isn't at the end of the file then the offset is data. + */ + if (offset >= eof) + return -ENXIO; + break; + case SEEK_HOLE: + /* + * There is a virtual hole at the end of the file, so as long as + * offset isn't i_size or larger, return i_size. + */ + if (offset >= eof) + return -ENXIO; + offset = eof; + break; + } + + return llseek_execute(file, offset, maxsize); +} +#endif + +static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file_inode(file); + loff_t retval, eof = 0; + + ENTRY; + retval = offset + ((origin == SEEK_END) ? i_size_read(inode) : + (origin == SEEK_CUR) ? file->f_pos : 0); + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n", + PFID(ll_inode2fid(inode)), inode, retval, retval, + origin); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1); + + if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) { + retval = ll_glimpse_size(inode); + if (retval != 0) + RETURN(retval); + eof = i_size_read(inode); + } + + retval = ll_generic_file_llseek_size(file, offset, origin, + ll_file_maxbytes(inode), eof); + RETURN(retval); +} + +static int ll_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int rc, err; + + LASSERT(!S_ISDIR(inode->i_mode)); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + rc = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (lli->lli_clob != NULL) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + + /* The application has been told write failure already. + * Do not report failure again. */ + if (fd->fd_write_failed) + return 0; + return rc ? 
-EIO : 0; +} + +/** + * Called to make sure a portion of file has been written out. + * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST. + * + * Return how many pages have been written. + */ +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_fsync_io *fio; + int result; + __u16 refcheck; + ENTRY; + + if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && + mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) + RETURN(-EINVAL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_ignore_layout = ignore_layout; + + /* initialize parameters for sync */ + fio = &io->u.ci_fsync; + fio->fi_start = start; + fio->fi_end = end; + fio->fi_fid = ll_inode2fid(inode); + fio->fi_mode = mode; + fio->fi_nr_written = 0; + + if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + if (result == 0) + result = fio->fi_nr_written; + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + RETURN(result); +} + +/* + * When dentry is provided (the 'else' case), file_dentry() may be + * null and dentry must be used directly rather than pulled from + * file_dentry() as is done otherwise. + */ + +#ifdef HAVE_FILE_FSYNC_4ARGS +int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct dentry *dentry = file_dentry(file); +#elif defined(HAVE_FILE_FSYNC_2ARGS) +int ll_fsync(struct file *file, int datasync) +{ + struct dentry *dentry = file_dentry(file); + loff_t start = 0; + loff_t end = LLONG_MAX; +#else +int ll_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + loff_t start = 0; + loff_t end = LLONG_MAX; +#endif + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *req; + int rc, err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); + +#ifdef HAVE_FILE_FSYNC_4ARGS + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + inode_lock(inode); +#else + /* fsync's caller has already called _fdata{sync,write}, we want + * that IO to finish before calling the osc and mdc sync methods */ + rc = filemap_fdatawait(inode->i_mapping); +#endif + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. 
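+ * lli_async_rc is a sticky per-inode error recorded when background
+ * writeback fails; it is picked up (and cleared) here so that a later
+ * fsync() does not report success after writes have already been lost.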
*/ + if (!S_ISDIR(inode->i_mode)) { + err = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (rc == 0) + rc = err; + if (lli->lli_clob != NULL) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + } + + err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req); + if (!rc) + rc = err; + if (!err) + ptlrpc_req_finished(req); + + if (S_ISREG(inode->i_mode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0); + if (rc == 0 && err < 0) + rc = err; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } + +#ifdef HAVE_FILE_FSYNC_4ARGS + inode_unlock(inode); +#endif + RETURN(rc); +} + +static int +ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) +{ + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_FLOCK, + .ei_cb_cp = ldlm_flock_completion_ast, + .ei_cbdata = file_lock, + }; + struct md_op_data *op_data; + struct lustre_handle lockh = { 0 }; + union ldlm_policy_data flock = { { 0 } }; + int fl_type = file_lock->fl_type; + __u64 flags = 0; + int rc; + int rc2 = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n", + PFID(ll_inode2fid(inode)), file_lock); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1); + + if (file_lock->fl_flags & FL_FLOCK) { + LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); + /* flocks are whole-file locks */ + flock.l_flock.end = OFFSET_MAX; + /* For flocks owner is determined by the local file desctiptor*/ + flock.l_flock.owner = (unsigned long)file_lock->fl_file; + } else if (file_lock->fl_flags & FL_POSIX) { + flock.l_flock.owner = (unsigned long)file_lock->fl_owner; + flock.l_flock.start = file_lock->fl_start; + flock.l_flock.end = file_lock->fl_end; + } else { + RETURN(-EINVAL); + } + flock.l_flock.pid = file_lock->fl_pid; + +#if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner) + /* Somewhat ugly workaround for svc lockd. + * lockd installs custom fl_lmops->lm_compare_owner that checks + * for the fl_owner to be the same (which it always is on local node + * I guess between lockd processes) and then compares pid. + * As such we assign pid to the owner field to make it all work, + * conflict with normal locks is unlikely since pid space and + * pointer space for current->files are not intersecting */ + if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) + flock.l_flock.owner = (unsigned long)file_lock->fl_pid; +#endif + + switch (fl_type) { + case F_RDLCK: + einfo.ei_mode = LCK_PR; + break; + case F_UNLCK: + /* An unlock request may or may not have any relation to + * existing locks so we may not be able to pass a lock handle + * via a normal ldlm_lock_cancel() request. The request may even + * unlock a byte range in the middle of an existing lock. In + * order to process an unlock request we need all of the same + * information that is given with a normal read or write record + * lock request. To avoid creating another ldlm unlock (cancel) + * message we'll treat a LCK_NL flock request as an unlock. 
*/ + einfo.ei_mode = LCK_NL; + break; + case F_WRLCK: + einfo.ei_mode = LCK_PW; + break; + default: + CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type); + RETURN (-ENOTSUPP); + } + + switch (cmd) { + case F_SETLKW: +#ifdef F_SETLKW64 + case F_SETLKW64: +#endif + flags = 0; + break; + case F_SETLK: +#ifdef F_SETLK64 + case F_SETLK64: +#endif + flags = LDLM_FL_BLOCK_NOWAIT; + break; + case F_GETLK: +#ifdef F_GETLK64 + case F_GETLK64: +#endif + flags = LDLM_FL_TEST_LOCK; + break; + default: + CERROR("unknown fcntl lock command: %d\n", cmd); + RETURN (-EINVAL); + } + + /* Save the old mode so that if the mode in the lock changes we + * can decrement the appropriate reader or writer refcount. */ + file_lock->fl_type = einfo.ei_mode; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, " + "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)), + flock.l_flock.pid, flags, einfo.ei_mode, + flock.l_flock.start, flock.l_flock.end); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh, + flags); + + /* Restore the file lock type if not TEST lock. */ + if (!(flags & LDLM_FL_TEST_LOCK)) + file_lock->fl_type = fl_type; + +#ifdef HAVE_LOCKS_LOCK_FILE_WAIT + if ((rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = locks_lock_file_wait(file, file_lock); +#else + if ((file_lock->fl_flags & FL_FLOCK) && + (rc == 0 || file_lock->fl_type == F_UNLCK)) + rc2 = flock_lock_file_wait(file, file_lock); + if ((file_lock->fl_flags & FL_POSIX) && + (rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = posix_lock_file_wait(file, file_lock); +#endif /* HAVE_LOCKS_LOCK_FILE_WAIT */ + + if (rc2 && file_lock->fl_type != F_UNLCK) { + einfo.ei_mode = LCK_NL; + md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, + &lockh, flags); + rc = rc2; + } + + ll_finish_md_op_data(op_data); + + RETURN(rc); +} + +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid, + struct inode **inode) +{ + struct md_op_data *op_data = NULL; + struct mdt_body *body; + struct ptlrpc_request *req; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE; + rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) + RETURN(rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out_req, rc = -EFAULT); + if (fid != NULL) + *fid = body->mbo_fid1; + + if (inode != NULL) + rc = ll_prep_inode(inode, req, parent->i_sb, NULL); +out_req: + ptlrpc_req_finished(req); + RETURN(rc); +} + +int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, + const char *name) +{ + struct dentry *dchild = NULL; + struct inode *child_inode = NULL; + struct md_op_data *op_data; + struct ptlrpc_request *request = NULL; + struct obd_client_handle *och = NULL; + struct qstr qstr; + struct mdt_body *body; + __u64 data_version = 0; + size_t namelen = strlen(name); + int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n", + PFID(ll_inode2fid(parent)), name, + lum->lum_stripe_offset, lum->lum_stripe_count); + + if (lum->lum_magic 
!= cpu_to_le32(LMV_USER_MAGIC) && + lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) + lustre_swab_lmv_user_md(lum); + + /* Get child FID first */ + qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen); + qstr.name = name; + qstr.len = namelen; + dchild = d_lookup(file_dentry(file), &qstr); + if (dchild) { + if (dchild->d_inode) + child_inode = igrab(dchild->d_inode); + dput(dchild); + } + + if (!child_inode) { + rc = ll_get_fid_by_name(parent, name, namelen, NULL, + &child_inode); + if (rc) + RETURN(rc); + } + + if (!child_inode) + RETURN(-ENOENT); + + if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) & + OBD_CONNECT2_DIR_MIGRATE)) { + if (le32_to_cpu(lum->lum_stripe_count) > 1 || + ll_dir_striped(child_inode)) { + CERROR("%s: MDT doesn't support stripe directory " + "migration!\n", + ll_get_fsname(parent->i_sb, NULL, 0)); + GOTO(out_iput, rc = -EOPNOTSUPP); + } + } + + /* + * lfs migrate command needs to be blocked on the client + * by checking the migrate FID against the FID of the + * filesystem root. + */ + if (is_root_inode(child_inode)) + GOTO(out_iput, rc = -EINVAL); + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, + child_inode->i_mode, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out_iput, rc = PTR_ERR(op_data)); + + inode_lock(child_inode); + op_data->op_fid3 = *ll_inode2fid(child_inode); + if (!fid_is_sane(&op_data->op_fid3)) { + CERROR("%s: migrate %s, but FID "DFID" is insane\n", + ll_get_fsname(parent->i_sb, NULL, 0), name, + PFID(&op_data->op_fid3)); + GOTO(out_unlock, rc = -EINVAL); + } + + op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA; + op_data->op_data = lum; + op_data->op_data_size = lumlen; + +again: + if (S_ISREG(child_inode->i_mode)) { + och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); + if (IS_ERR(och)) { + rc = PTR_ERR(och); + och = NULL; + GOTO(out_unlock, rc); + } + + rc = ll_data_version(child_inode, &data_version, + LL_DV_WR_FLUSH); + if (rc != 0) + GOTO(out_close, rc); + + op_data->op_open_handle = och->och_open_handle; + op_data->op_data_version = data_version; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_bias |= MDS_CLOSE_MIGRATE; + + spin_lock(&och->och_mod->mod_open_req->rq_lock); + och->och_mod->mod_open_req->rq_replay = 0; + spin_unlock(&och->och_mod->mod_open_req->rq_lock); + } + + rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen, + name, namelen, &request); + if (rc == 0) { + LASSERT(request != NULL); + ll_update_times(request, parent); + + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + /* If the server does release layout lock, then we cleanup + * the client och here, otherwise release it in out_close: */ + if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { + obd_mod_put(och->och_mod); + md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, + och); + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + och = NULL; + } + } + + if (request != NULL) { + ptlrpc_req_finished(request); + request = NULL; + } + + /* Try again if the file layout has changed. 
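+	 * Jumping back to the 'again' label re-opens the write lease and
+	 * re-fetches the data version before retrying the migrate rename.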
*/ + if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) + goto again; + +out_close: + if (och) + ll_lease_close(och, child_inode, NULL); + if (!rc) + clear_nlink(child_inode); +out_unlock: + inode_unlock(child_inode); + ll_finish_md_op_data(op_data); +out_iput: + iput(child_inode); + RETURN(rc); +} + +static int +ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) +{ + ENTRY; + + RETURN(-ENOSYS); +} + +/** + * test if some locks matching bits and l_req_mode are acquired + * - bits can be in different locks + * - if found clear the common lock bits in *bits + * - the bits not found, are kept in *bits + * \param inode [IN] + * \param bits [IN] searched lock bits [IN] + * \param l_req_mode [IN] searched lock mode + * \retval boolean, true iff all bits are found + */ +int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode) +{ + struct lustre_handle lockh; + union ldlm_policy_data policy; + enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ? + (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode; + struct lu_fid *fid; + __u64 flags; + int i; + ENTRY; + + if (!inode) + RETURN(0); + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid), + ldlm_lockname[mode]); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; + for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) { + policy.l_inodebits.bits = *bits & (1 << i); + if (policy.l_inodebits.bits == 0) + continue; + + if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, + &policy, mode, &lockh)) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&lockh); + if (lock) { + *bits &= + ~(lock->l_policy_data.l_inodebits.bits); + LDLM_LOCK_PUT(lock); + } else { + *bits &= ~policy.l_inodebits.bits; + } + } + } + RETURN(*bits == 0); +} + +enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + enum ldlm_mode mode) +{ + union ldlm_policy_data policy = { .l_inodebits = { bits } }; + struct lu_fid *fid; + enum ldlm_mode rc; + ENTRY; + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid)); + + rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags, + fid, LDLM_IBITS, &policy, mode, lockh); + + RETURN(rc); +} + +static int ll_inode_revalidate_fini(struct inode *inode, int rc) +{ + /* Already unlinked. Just update nlink and return success */ + if (rc == -ENOENT) { + clear_nlink(inode); + /* If it is striped directory, and there is bad stripe + * Let's revalidate the dentry again, instead of returning + * error */ + if (ll_dir_striped(inode)) + return 0; + + /* This path cannot be hit for regular files unless in + * case of obscure races, so no need to to validate + * size. */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return 0; + } else if (rc != 0) { + CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? 
D_INFO : D_ERROR, + "%s: revalidate FID "DFID" error: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), rc); + } + + return rc; +} + +static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op) +{ + struct inode *parent; + struct inode *inode = dentry->d_inode; + struct obd_export *exp = ll_i2mdexp(inode); + struct lookup_intent oit = { + .it_op = op, + }; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + const char *name = NULL; + size_t namelen = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n", + PFID(ll_inode2fid(inode)), inode, dentry->d_name.name); + + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) { + parent = dentry->d_parent->d_inode; + name = dentry->d_name.name; + namelen = dentry->d_name.len; + } else { + parent = inode; + } + + op_data = ll_prep_md_op_data(NULL, parent, inode, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* Call getattr by fid */ + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) + op_data->op_flags = MF_GETATTR_BY_FID; + rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + GOTO(out, rc); + } + + rc = ll_revalidate_it_finish(req, &oit, dentry); + if (rc != 0) { + ll_intent_release(&oit); + GOTO(out, rc); + } + + /* Unlinked? Unhash dentry, so it is not picked up later by + * do_lookup() -> ll_revalidate_it(). We cannot use d_drop + * here to preserve get_cwd functionality on 2.6. + * Bug 10503 */ + if (!dentry->d_inode->i_nlink) { + ll_lock_dcache(inode); + d_lustre_invalidate(dentry, 0); + ll_unlock_dcache(inode); + } + + ll_lookup_finish_locks(&oit, dentry); +out: + ptlrpc_req_finished(req); + + return rc; +} + +static int ll_merge_md_attr(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_attr attr = { 0 }; + int rc; + + LASSERT(lli->lli_lsm_md != NULL); + + if (!lmv_dir_striped(lli->lli_lsm_md)) + RETURN(0); + + down_read(&lli->lli_lsm_sem); + rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, + &attr, ll_md_blocking_ast); + up_read(&lli->lli_lsm_sem); + if (rc != 0) + RETURN(rc); + + set_nlink(inode, attr.cat_nlink); + inode->i_blocks = attr.cat_blocks; + i_size_write(inode, attr.cat_size); + + ll_i2info(inode)->lli_atime = attr.cat_atime; + ll_i2info(inode)->lli_mtime = attr.cat_mtime; + ll_i2info(inode)->lli_ctime = attr.cat_ctime; + + RETURN(0); +} + +static inline dev_t ll_compat_encode_dev(dev_t dev) +{ + /* The compat_sys_*stat*() syscalls will fail unless the + * device majors and minors are both less than 256. Note that + * the value returned here will be passed through + * old_encode_dev() in cp_compat_stat(). And so we are not + * trying to return a valid compat (u16) device number, just + * one that will pass the old_valid_dev() check. 
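+	 * For example, a device numbered (259, 300) is masked down to
+	 * (3, 44) here, purely so the 32-bit compat stat path does not fail.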
*/ + + return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff); +} + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) +{ + struct dentry *de = path->dentry; +#else +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +{ +#endif + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); + + rc = ll_inode_revalidate(de, IT_GETATTR); + if (rc < 0) + RETURN(rc); + + if (S_ISREG(inode->i_mode)) { + /* In case of restore, the MDT has the right size and has + * already send it back without granting the layout lock, + * inode is up-to-date so glimpse is useless. + * Also to glimpse we need the layout, in case of a running + * restore the MDT holds the layout lock so the glimpse will + * block up to the end of restore (getattr will block) + */ + if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) { + rc = ll_glimpse_size(inode); + if (rc < 0) + RETURN(rc); + } + } else { + /* If object isn't regular a file then don't validate size. */ + if (ll_dir_striped(inode)) { + rc = ll_merge_md_attr(inode); + if (rc < 0) + RETURN(rc); + } + + inode->i_atime.tv_sec = lli->lli_atime; + inode->i_mtime.tv_sec = lli->lli_mtime; + inode->i_ctime.tv_sec = lli->lli_ctime; + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30); + + if (ll_need_32bit_api(sbi)) { + stat->ino = cl_fid_build_ino(&lli->lli_fid, 1); + stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev); + stat->rdev = ll_compat_encode_dev(inode->i_rdev); + } else { + stat->ino = inode->i_ino; + stat->dev = inode->i_sb->s_dev; + stat->rdev = inode->i_rdev; + } + + stat->mode = inode->i_mode; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits; + + stat->nlink = inode->i_nlink; + stat->size = i_size_read(inode); + stat->blocks = inode->i_blocks; + + return 0; +} + +static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + int rc; + size_t num_bytes; + struct fiemap *fiemap; + unsigned int extent_count = fieinfo->fi_extents_max; + + num_bytes = sizeof(*fiemap) + (extent_count * + sizeof(struct fiemap_extent)); + OBD_ALLOC_LARGE(fiemap, num_bytes); + + if (fiemap == NULL) + RETURN(-ENOMEM); + + fiemap->fm_flags = fieinfo->fi_flags; + fiemap->fm_extent_count = fieinfo->fi_extents_max; + fiemap->fm_start = start; + fiemap->fm_length = len; + if (extent_count > 0 && + copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start, + sizeof(struct fiemap_extent)) != 0) + GOTO(out, rc = -EFAULT); + + rc = ll_do_fiemap(inode, fiemap, num_bytes); + + fieinfo->fi_flags = fiemap->fm_flags; + fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents; + if (extent_count > 0 && + copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0], + fiemap->fm_mapped_extents * + sizeof(struct fiemap_extent)) != 0) + GOTO(out, rc = -EFAULT); +out: + OBD_FREE_LARGE(fiemap, num_bytes); + return rc; +} + +struct posix_acl *ll_get_acl(struct inode *inode, int type +#ifdef HAVE_GET_ACL_RCU_ARG + , bool rcu +#endif /* HAVE_GET_ACL_RCU_ARG */ + ) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl = NULL; + ENTRY; + +#ifdef HAVE_GET_ACL_RCU_ARG 
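+	/* RCU path walk must not sleep; -ECHILD makes the VFS retry the
+	 * ACL lookup in ref-walk mode where blocking is allowed. */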
+ if (rcu) + return ERR_PTR(-ECHILD); +#endif + + spin_lock(&lli->lli_lock); + /* VFS' acl_permission_check->check_acl will release the refcount */ + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + RETURN(acl); +} + +#ifdef HAVE_IOP_SET_ACL +#ifdef CONFIG_FS_POSIX_ACL +int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *name = NULL; + char *value = NULL; + size_t value_size = 0; + int rc = 0; + ENTRY; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + if (acl) + rc = posix_acl_update_mode(mnt_userns, inode, + &inode->i_mode, &acl); + break; + + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + rc = acl ? -EACCES : 0; + break; + + default: + rc = -EINVAL; + break; + } + if (rc) + return rc; + + if (acl) { + value_size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(value_size, GFP_NOFS); + if (value == NULL) + GOTO(out, rc = -ENOMEM); + + rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size); + if (rc < 0) + GOTO(out_value, rc); + } + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), + value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM, + name, value, value_size, 0, 0, &req); + + ptlrpc_req_finished(req); +out_value: + kfree(value); +out: + if (rc) + forget_cached_acl(inode, type); + else + set_cached_acl(inode, type, acl); + RETURN(rc); +} +#endif /* CONFIG_FS_POSIX_ACL */ +#endif /* HAVE_IOP_SET_ACL */ + +#ifndef HAVE_USER_NAMESPACE_ARG +#ifndef HAVE_GENERIC_PERMISSION_2ARGS +static int +# ifdef HAVE_GENERIC_PERMISSION_4ARGS +ll_check_acl(struct inode *inode, int mask, unsigned int flags) +# else +ll_check_acl(struct inode *inode, int mask) +# endif +{ +# ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *acl; + int rc; + ENTRY; + +# ifdef HAVE_GENERIC_PERMISSION_4ARGS + if (flags & IPERM_FLAG_RCU) + return -ECHILD; +# endif + acl = ll_get_acl(inode, ACL_TYPE_ACCESS); + + if (!acl) + RETURN(-EAGAIN); + + rc = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + + RETURN(rc); +# else /* !CONFIG_FS_POSIX_ACL */ + return -EAGAIN; +# endif /* CONFIG_FS_POSIX_ACL */ +} +#endif /* HAVE_GENERIC_PERMISSION_2ARGS */ +#endif /* HAVE_USER_NAMESPACE_ARG */ + +#ifdef HAVE_GENERIC_PERMISSION_4ARGS +int ll_inode_permission(struct inode *inode, int mask, unsigned int flags) +#else +# ifdef HAVE_INODE_PERMISION_2ARGS +int ll_inode_permission(struct inode *inode, int mask) +# else +# ifdef HAVE_USER_NAMESPACE_ARG +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) +# else +int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) +# endif +# endif +#endif +{ + int rc = 0; + struct ll_sb_info *sbi; + struct root_squash_info *squash; + struct cred *cred = NULL; + const struct cred *old_cred = NULL; + cfs_cap_t cap; + bool squash_id = false; + ENTRY; + +#ifdef MAY_NOT_BLOCK + if (mask & MAY_NOT_BLOCK) + return -ECHILD; +#elif defined(HAVE_GENERIC_PERMISSION_4ARGS) + if (flags & IPERM_FLAG_RCU) + return -ECHILD; +#endif + + /* as root inode are NOT getting validated in lookup operation, + * need to do it before permission check. 
*/ + + if (is_root_inode(inode)) { + rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP); + if (rc) + RETURN(rc); + } + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n", + PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask); + + /* squash fsuid/fsgid if needed */ + sbi = ll_i2sbi(inode); + squash = &sbi->ll_squash; + if (unlikely(squash->rsi_uid != 0 && + uid_eq(current_fsuid(), GLOBAL_ROOT_UID) && + !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) { + squash_id = true; + } + if (squash_id) { + CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n", + __kuid_val(current_fsuid()), __kgid_val(current_fsgid()), + squash->rsi_uid, squash->rsi_gid); + + /* update current process's credentials + * and FS capability */ + cred = prepare_creds(); + if (cred == NULL) + RETURN(-ENOMEM); + + cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid); + cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid); + for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) { + if ((1 << cap) & CFS_CAP_FS_MASK) + cap_lower(cred->cap_effective, cap); + } + old_cred = override_creds(cred); + } + + ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1); + rc = ll_generic_permission(mnt_userns, inode, mask, flags, ll_check_acl); + /* restore current process's credentials and FS capability */ + if (squash_id) { + revert_creds(old_cred); + put_cred(cred); + } + + RETURN(rc); +} + +/* -o localflock - only provides locally consistent flock locks */ +struct file_operations ll_file_operations = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush +}; + +struct file_operations ll_file_operations_flock = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif /* HAVE_SYNC_READ_WRITE */ + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_flock, + .lock = ll_file_flock +}; + +/* These are for -o noflock - to return ENOSYS on flock calls */ +struct file_operations ll_file_operations_noflock = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif /* HAVE_SYNC_READ_WRITE */ + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + 
.release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_noflock, + .lock = ll_file_noflock +}; + +struct inode_operations ll_file_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, + .fiemap = ll_fiemap, +#ifdef HAVE_IOP_GET_ACL + .get_acl = ll_get_acl, +#endif +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_env *env; + int rc; + __u16 refcheck; + ENTRY; + + if (obj == NULL) + RETURN(0); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_conf_set(env, lli->lli_clob, conf); + if (rc < 0) + GOTO(out, rc); + + if (conf->coc_opc == OBJECT_CONF_SET) { + struct ldlm_lock *lock = conf->coc_lock; + struct cl_layout cl = { + .cl_layout_gen = 0, + }; + + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + + /* it can only be allowed to match after layout is + * applied to inode otherwise false layout would be + * seen. Applying layout shoud happen before dropping + * the intent lock. */ + ldlm_lock_allow_match(lock); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out, rc); + + CDEBUG(D_VFSTRACE, + DFID": layout version change: %u -> %u\n", + PFID(&lli->lli_fid), ll_layout_version_get(lli), + cl.cl_layout_gen); + ll_layout_version_set(lli, cl.cl_layout_gen); + } + +out: + cl_env_put(env, &refcheck); + + RETURN(rc); +} + +/* Fetch layout from MDT with getxattr request, if it's not ready yet */ +static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) + +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req; + void *lvbdata; + void *lmm; + int lmmsize; + int rc; + ENTRY; + + CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n", + PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock), + lock->l_lvb_data, lock->l_lvb_len); + + if (lock->l_lvb_data != NULL) + RETURN(0); + + /* if layout lock was granted right away, the layout is returned + * within DLM_LVB of dlm reply; otherwise if the lock was ever + * blocked and then granted via completion ast, we have to fetch + * layout here. 
Please note that we can't use the LVB buffer in + * completion AST because it doesn't have a large enough buffer */ + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc < 0) + RETURN(rc); + + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR, + XATTR_NAME_LOV, lmmsize, &req); + if (rc < 0) { + if (rc == -ENODATA) + GOTO(out, rc = 0); /* empty layout */ + else + RETURN(rc); + } + + lmmsize = rc; + rc = 0; + if (lmmsize == 0) /* empty layout */ + GOTO(out, rc = 0); + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); + if (lmm == NULL) + GOTO(out, rc = -EFAULT); + + OBD_ALLOC_LARGE(lvbdata, lmmsize); + if (lvbdata == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(lvbdata, lmm, lmmsize); + lock_res_and_lock(lock); + if (unlikely(lock->l_lvb_data == NULL)) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lvbdata; + lock->l_lvb_len = lmmsize; + lvbdata = NULL; + } + unlock_res_and_lock(lock); + + if (lvbdata) + OBD_FREE_LARGE(lvbdata, lmmsize); + + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +/** + * Apply the layout to the inode. Layout lock is held and will be released + * in this function. + */ +static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_lock *lock; + struct cl_object_conf conf; + int rc = 0; + bool lvb_ready; + bool wait_layout = false; + ENTRY; + + LASSERT(lustre_handle_is_used(lockh)); + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + + LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured", + PFID(&lli->lli_fid), inode); + + /* in case this is a caching lock and reinstate with new inode */ + md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL); + + lock_res_and_lock(lock); + lvb_ready = ldlm_is_lvb_ready(lock); + unlock_res_and_lock(lock); + + /* checking lvb_ready is racy but this is okay. The worst case is + * that multi processes may configure the file on the same time. */ + if (lvb_ready) + GOTO(out, rc = 0); + + rc = ll_layout_fetch(inode, lock); + if (rc < 0) + GOTO(out, rc); + + /* for layout lock, lmm is stored in lock's lvb. + * lvb_data is immutable if the lock is held so it's safe to access it + * without res lock. + * + * set layout to file. Unlikely this will fail as old layout was + * surely eliminated */ + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = inode; + conf.coc_lock = lock; + conf.u.coc_layout.lb_buf = lock->l_lvb_data; + conf.u.coc_layout.lb_len = lock->l_lvb_len; + rc = ll_layout_conf(inode, &conf); + + /* refresh layout failed, need to wait */ + wait_layout = rc == -EBUSY; + EXIT; +out: + LDLM_LOCK_PUT(lock); + ldlm_lock_decref(lockh, mode); + + /* wait for IO to complete if it's still being used. */ + if (wait_layout) { + CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), inode); + + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_WAIT; + conf.coc_inode = inode; + rc = ll_layout_conf(inode, &conf); + if (rc == 0) + rc = -EAGAIN; + + CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), rc); + } + RETURN(rc); +} + +/** + * Issue layout intent RPC to MDS. 
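+ *
+ * The intent is packed into op_data and sent via md_intent_lock(); any
+ * DLM lock granted along with the reply is dropped here, and the caller
+ * matches it again from the LDLM cache (see ll_layout_refresh()).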
+ * \param inode [in] file inode + * \param intent [in] layout intent + * + * \retval 0 on success + * \retval < 0 error code + */ +static int ll_layout_intent(struct inode *inode, struct layout_intent *intent) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct lookup_intent it; + struct ptlrpc_request *req; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_data = intent; + op_data->op_data_size = sizeof(*intent); + + memset(&it, 0, sizeof(it)); + it.it_op = IT_LAYOUT; + if (intent->li_opc == LAYOUT_INTENT_WRITE || + intent->li_opc == LAYOUT_INTENT_TRUNC) + it.it_flags = FMODE_WRITE; + + LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), inode); + + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_ast, 0); + if (it.it_request != NULL) + ptlrpc_req_finished(it.it_request); + it.it_request = NULL; + + ll_finish_md_op_data(op_data); + + /* set lock data in case this is a new lock */ + if (!rc) + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + + ll_intent_drop_lock(&it); + + RETURN(rc); +} + +/** + * This function checks if there exists a LAYOUT lock on the client side, + * or enqueues it if it doesn't have one in cache. + * + * This function will not hold layout lock so it may be revoked any time after + * this function returns. Any operations depend on layout should be redone + * in that case. + * + * This function should be called before lov_io_init() to get an uptodate + * layout version, the caller should save the version number and after IO + * is finished, this function should be called again to verify that layout + * is not changed during IO time. + */ +int ll_layout_refresh(struct inode *inode, __u32 *gen) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lustre_handle lockh; + struct layout_intent intent = { + .li_opc = LAYOUT_INTENT_ACCESS, + }; + enum ldlm_mode mode; + int rc; + ENTRY; + + *gen = ll_layout_version_get(lli); + if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE) + RETURN(0); + + /* sanity checks */ + LASSERT(fid_is_sane(ll_inode2fid(inode))); + + /* take layout lock mutex to enqueue layout lock exclusively. */ + mutex_lock(&lli->lli_layout_mutex); + + while (1) { + /* mostly layout lock is caching on the local side, so try to + * match it before grabbing layout lock mutex. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0, + LCK_CR | LCK_CW | LCK_PR | LCK_PW); + if (mode != 0) { /* hit cached lock */ + rc = ll_layout_lock_set(&lockh, mode, inode); + if (rc == -EAGAIN) + continue; + break; + } + + rc = ll_layout_intent(inode, &intent); + if (rc != 0) + break; + } + + if (rc == 0) + *gen = ll_layout_version_get(lli); + mutex_unlock(&lli->lli_layout_mutex); + + RETURN(rc); +} + +/** + * Issue layout intent RPC indicating where in a file an IO is about to write. + * + * \param[in] inode file inode. + * \param[in] ext write range with start offset of fille in bytes where + * an IO is about to write, and exclusive end offset in + * bytes. 
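+ * \param[in] opc layout intent operation being declared, e.g.
+ * LAYOUT_INTENT_WRITE or LAYOUT_INTENT_TRUNC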
+ * + * \retval 0 on success + * \retval < 0 error code + */ +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext) +{ + struct layout_intent intent = { + .li_opc = opc, + .li_extent.e_start = ext->e_start, + .li_extent.e_end = ext->e_end, + }; + int rc; + ENTRY; + + rc = ll_layout_intent(inode, &intent); + + RETURN(rc); +} + +/** + * This function send a restore request to the MDT + */ +int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length) +{ + struct hsm_user_request *hur; + int len, rc; + ENTRY; + + len = sizeof(struct hsm_user_request) + + sizeof(struct hsm_user_item); + OBD_ALLOC(hur, len); + if (hur == NULL) + RETURN(-ENOMEM); + + hur->hur_request.hr_action = HUA_RESTORE; + hur->hur_request.hr_archive_id = 0; + hur->hur_request.hr_flags = 0; + memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid, + sizeof(hur->hur_user_item[0].hui_fid)); + hur->hur_user_item[0].hui_extent.offset = offset; + hur->hur_user_item[0].hui_extent.length = length; + hur->hur_request.hr_itemcount = 1; + rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp, + len, hur, NULL); + OBD_FREE(hur, len); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c new file mode 100644 index 0000000000000..ddbaa142514de --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * glimpse code used by vvp (and other Lustre clients in the future). + * + * Author: Nikita Danilov + * Author: Oleg Drokin + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cl_object.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +static const struct cl_lock_descr whole_file = { + .cld_start = 0, + .cld_end = CL_PAGE_EOF, + .cld_mode = CLM_READ +}; + +/* + * Check whether file has possible unwritten pages. + * + * \retval 1 file is mmap-ed or has dirty pages + * 0 otherwise + */ +blkcnt_t dirty_cnt(struct inode *inode) +{ + blkcnt_t cnt = 0; + struct vvp_object *vob = cl_inode2vvp(inode); + void *results[1]; + + if (inode->i_mapping != NULL) + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + results, 0, 1, + PAGECACHE_TAG_DIRTY); + if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) + cnt = 1; + + return (cnt > 0) ? 
1 : 0; +} + +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl) +{ + const struct lu_fid *fid = lu_object_fid(&clob->co_lu); + struct cl_lock *lock = vvp_env_lock(env); + struct cl_lock_descr *descr = &lock->cll_descr; + int result; + + ENTRY; + result = 0; + + CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); + + /* NOTE: this looks like DLM lock request, but it may + * not be one. Due to CEF_GLIMPSE flag (translated + * to LDLM_FL_HAS_INTENT by osc), this is + * glimpse request, that won't revoke any + * conflicting DLM locks held. Instead, + * ll_glimpse_callback() will be called on each + * client holding a DLM lock against this file, + * and resulting size will be returned for each + * stripe. DLM lock on [0, EOF] is acquired only + * if there were no conflicting locks. If there + * were conflicting locks, enqueuing or waiting + * fails with -ENAVAIL, but valid inode + * attributes are returned anyway. */ + *descr = whole_file; + descr->cld_obj = clob; + descr->cld_mode = CLM_READ; + descr->cld_enq_flags = CEF_GLIMPSE | CEF_MUST; + if (agl) + descr->cld_enq_flags |= CEF_SPECULATIVE | CEF_NONBLOCK; + /* + * CEF_MUST protects glimpse lock from conversion into + * a lockless mode. + */ + result = cl_lock_request(env, io, lock); + if (result < 0) + RETURN(result); + + if (!agl) { + ll_merge_attr(env, inode); + if (i_size_read(inode) > 0 && inode->i_blocks == 0) { + /* + * LU-417: Add dirty pages block count + * lest i_blocks reports 0, some "cp" or + * "tar" may think it's a completely + * sparse file and skip it. + */ + inode->i_blocks = dirty_cnt(inode); + } + } + + cl_lock_release(env, lock); + + RETURN(result); +} + +/** + * Get an IO environment for special operations such as glimpse locks and + * manually requested locks (ladvise lockahead) + * + * \param[in] inode inode the operation is being performed on + * \param[out] envout thread specific execution environment + * \param[out] ioout client io description + * \param[out] refcheck reference check + * + * \retval 1 on success + * \retval 0 not a regular file, cannot get environment + * \retval negative negative errno on error + */ +int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, __u16 *refcheck) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int result; + + if (S_ISREG(inode->i_mode)) { + env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + io = vvp_env_thread_io(env); + io->ci_obj = clob; + *envout = env; + *ioout = io; + result = +1; + } else + result = PTR_ERR(env); + } else + result = 0; + return result; +} + +int cl_glimpse_size0(struct inode *inode, int agl) +{ + /* + * We don't need ast_flags argument to cl_glimpse_size(), because + * osc_lock_enqueue() takes care of the possible deadlock that said + * argument was introduced to avoid. + */ + /* + * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to + * cl_glimpse_size(), which doesn't make sense: glimpse locks are not + * blocking anyway. + */ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + __u16 refcheck; + int retried = 0; + int result; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + RETURN(result); + + do { + io->ci_ndelay_tried = retried++; + io->ci_ndelay = io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. 
This currently happens + * when stripe sub-object's are not yet created. + */ + result = io->ci_result; + } else if (result == 0) { + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + if (!agl && result == -EWOULDBLOCK) + io->ci_need_restart = 1; + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + + cl_env_put(env, &refcheck); + RETURN(result); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c new file mode 100644 index 0000000000000..21a10ec551e44 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -0,0 +1,298 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" + +/** + * An `emergency' environment used by cl_inode_fini() when cl_env_get() + * fails. Access to this environment is serialized by cl_inode_fini_guard + * mutex. + */ +struct lu_env *cl_inode_fini_env; +__u16 cl_inode_fini_refcheck; + +/** + * A mutex serializing calls to slp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. 
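+ * cl_inode_fini() takes this mutex before falling back to the shared
+ * cl_inode_fini_env when cl_env_get() fails.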
+ */ +static DEFINE_MUTEX(cl_inode_fini_guard); + +int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, + enum op_xvalid xvalid, unsigned int attr_flags) +{ + struct lu_env *env; + struct cl_io *io; + int result; + __u16 refcheck; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_verify_layout = 1; + + io->u.ci_setattr.sa_attr.lvb_atime = attr->ia_atime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_mtime = attr->ia_mtime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_ctime = attr->ia_ctime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; + io->u.ci_setattr.sa_attr_flags = attr_flags; + io->u.ci_setattr.sa_avalid = attr->ia_valid; + io->u.ci_setattr.sa_xvalid = xvalid; + io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); + +again: + if (attr->ia_valid & ATTR_FILE) + ll_io_set_mirror(io, attr->ia_file); + + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { + struct vvp_io *vio = vvp_env_io(env); + + if (attr->ia_valid & ATTR_FILE) + /* populate the file descriptor for ftruncate to honor + * group lock - see LU-787 */ + vio->vui_fd = LUSTRE_FPRIVATE(attr->ia_file); + + result = cl_io_loop(env, io); + } else { + result = io->ci_result; + } + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + + cl_env_put(env, &refcheck); + RETURN(result); +} + +/** + * Initialize or update CLIO structures for regular files when new + * meta-data arrives from the server. + * + * \param inode regular file inode + * \param md new file metadata from MDS + * - allocates cl_object if necessary, + * - updated layout, if object was already here. + */ +int cl_file_inode_init(struct inode *inode, struct lustre_md *md) +{ + struct lu_env *env; + struct ll_inode_info *lli; + struct cl_object *clob; + struct lu_site *site; + struct lu_fid *fid; + struct cl_object_conf conf = { + .coc_inode = inode, + .u = { + .coc_layout = md->layout, + } + }; + int result = 0; + __u16 refcheck; + + LASSERT(md->body->mbo_valid & OBD_MD_FLID); + LASSERT(S_ISREG(inode->i_mode)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + site = ll_i2sbi(inode)->ll_site; + lli = ll_i2info(inode); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + /* clob is slave of inode, empty lli_clob means for new inode, + * there is no clob in cache with the given fid, so it is + * unnecessary to perform lookup-alloc-lookup-insert, just + * alloc and insert directly. + */ + if (!(inode->i_state & I_NEW)) { + result = -EIO; + CERROR("%s: unexpected not-NEW inode "DFID": rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid), + result); + goto out; + } + + conf.coc_lu.loc_flags = LOC_F_NEW; + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as new inode is + * locked by I_NEW bit. 
+ */ + lli->lli_clob = clob; + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else { + result = PTR_ERR(clob); + } + } else { + result = cl_conf_set(env, lli->lli_clob, &conf); + if (result == -EBUSY) { + /* ignore the error since I/O will handle it later */ + result = 0; + } + } + + if (result != 0) + CERROR("%s: failed to initialize cl_object "DFID": rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid), result); + +out: + cl_env_put(env, &refcheck); + + return result; +} + +/** + * Wait for others drop their references of the object at first, then we drop + * the last one, which will lead to the object be destroyed immediately. + * Must be called after cl_object_kill() against this object. + * + * The reason we want to do this is: destroying top object will wait for sub + * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs) + * to initiate top object destroying which may deadlock. See bz22520. + */ +static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *header = obj->co_lu.lo_header; + wait_queue_entry_t waiter; + + if (unlikely(atomic_read(&header->loh_ref) != 1)) { + struct lu_site *site = obj->co_lu.lo_dev->ld_site; + wait_queue_head_t *wq; + + wq = lu_site_wq_from_fid(site, &header->loh_fid); + + init_waitqueue_entry(&waiter, current); + add_wait_queue(wq, &waiter); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&header->loh_ref) == 1) + break; + schedule(); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(wq, &waiter); + } + + cl_object_put(env, obj); +} + +void cl_inode_fini(struct inode *inode) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + __u16 refcheck; + int emergency; + + if (clob != NULL) { + env = cl_env_get(&refcheck); + emergency = IS_ERR(env); + if (emergency) { + mutex_lock(&cl_inode_fini_guard); + LASSERT(cl_inode_fini_env != NULL); + env = cl_inode_fini_env; + } + + /* + * cl_object cache is a slave to inode cache (which, in turn + * is a slave to dentry cache), don't keep cl_object in memory + * when its master is evicted. + */ + cl_object_kill(env, clob); + lu_object_ref_del(&clob->co_lu, "inode", inode); + cl_object_put_last(env, clob); + lli->lli_clob = NULL; + if (emergency) + mutex_unlock(&cl_inode_fini_guard); + else + cl_env_put(env, &refcheck); + } +} + +/** + * build inode number from passed @fid. + * + * For 32-bit systems or syscalls limit the inode number to a 32-bit value + * to avoid EOVERFLOW errors. This will inevitably result in inode number + * collisions, but fid_flatten32() tries hard to avoid this if possible. + */ +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) +{ + if (BITS_PER_LONG == 32 || api32) + RETURN(fid_flatten32(fid)); + + RETURN(fid_flatten(fid)); +} + +/** + * build inode generation from passed @fid. If our FID overflows the 32-bit + * inode number then return a non-zero generation to distinguish them. + */ +__u32 cl_fid_build_gen(const struct lu_fid *fid) +{ + if (fid_is_igif(fid)) + RETURN(lu_igif_gen(fid)); + + RETURN(fid_flatten(fid) >> 32); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c new file mode 100644 index 0000000000000..5869d949ff97b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c @@ -0,0 +1,184 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code used by vvp (and other Lustre clients in the future). + * + */ +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include + +#include "llite_internal.h" + +/* Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold the + * maximum-sized (= maximum striped) EA and cookie without having to + * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */ +static int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) +{ + u32 val_size; + u32 max_easize; + u32 def_easize; + int rc; + ENTRY; + + val_size = sizeof(max_easize); + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_MAX_EASIZE), KEY_MAX_EASIZE, + &val_size, &max_easize); + if (rc != 0) + RETURN(rc); + + val_size = sizeof(def_easize); + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &val_size, &def_easize); + if (rc != 0) + RETURN(rc); + + /* default cookiesize is 0 because from 2.4 server doesn't send + * llog cookies to client. */ + CDEBUG(D_HA, "updating def/max_easize: %d/%d\n", + def_easize, max_easize); + + rc = md_init_ea_size(md_exp, max_easize, def_easize); + RETURN(rc); +} + +/** + * This function is used as an upcall-callback hooked llite clients + * into obd_notify() listeners chain to handle notifications about + * change of import connect_flags. See lustre_common_fill_super(). 
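+ *
+ * For each OSC setup event the aggregate flags in struct lustre_client_ocd
+ * are ANDed with the import's connect flags, and the default/maximum LOV
+ * EA sizes are refreshed via cl_init_ea_size().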
+ */ +int cl_ocd_update(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner) +{ + struct lustre_client_ocd *lco; + struct client_obd *cli; + __u64 flags; + int result; + + ENTRY; + if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) && + watched->obd_set_up && !watched->obd_stopping) { + cli = &watched->u.cli; + lco = owner; + flags = cli->cl_import->imp_connect_data.ocd_connect_flags; + CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n", + lco->lco_flags, flags); + mutex_lock(&lco->lco_lock); + lco->lco_flags &= flags; + /* for each osc event update ea size */ + if (lco->lco_dt_exp) + cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); + + mutex_unlock(&lco->lco_lock); + result = 0; + } else { + CERROR("unexpected notification from %s %s" + "(setup:%d,stopping:%d)!\n", + watched->obd_type->typ_name, + watched->obd_name, watched->obd_set_up, + watched->obd_stopping); + result = -EINVAL; + } + RETURN(result); +} + +#define GROUPLOCK_SCOPE "grouplock" + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ll_grouplock *lg) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_lock *lock; + struct cl_lock_descr *descr; + __u32 enqflags; + __u16 refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc != 0) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + /* Does not make sense to take GL for released layout */ + if (rc > 0) + rc = -ENOTSUPP; + return rc; + } + + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + descr->cld_obj = obj; + descr->cld_start = 0; + descr->cld_end = CL_PAGE_EOF; + descr->cld_gid = gid; + descr->cld_mode = CLM_GROUP; + + enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0); + descr->cld_enq_flags = enqflags; + + rc = cl_lock_request(env, io, lock); + if (rc < 0) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return rc; + } + + lg->lg_env = env; + lg->lg_io = io; + lg->lg_lock = lock; + lg->lg_gid = gid; + + return 0; +} + +void cl_put_grouplock(struct ll_grouplock *lg) +{ + struct lu_env *env = lg->lg_env; + struct cl_io *io = lg->lg_io; + struct cl_lock *lock = lg->lg_lock; + + LASSERT(lg->lg_env != NULL); + LASSERT(lg->lg_gid != 0); + + cl_lock_release(env, lock); + cl_io_fini(env, io); + cl_env_put(env, NULL); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h new file mode 100644 index 0000000000000..ab9c99eb6139e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -0,0 +1,1568 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LLITE_INTERNAL_H +#define LLITE_INTERNAL_H +#include +#include /* for s2sbi */ +#include +#include + +/* for struct cl_lock_descr and struct cl_io */ +#include +#include +#include +#include +#include +#include +#include + +#include "vvp_internal.h" +#include "range_lock.h" + +#ifndef FMODE_EXEC +#define FMODE_EXEC 0 +#endif + +#ifndef HAVE_VM_FAULT_RETRY +#define VM_FAULT_RETRY 0 +#endif + +/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it. + * seem kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */ +#ifndef LOOKUP_CONTINUE +#define LOOKUP_CONTINUE LOOKUP_PARENT +#endif + +/** Only used on client-side for indicating the tail of dir hash/offset. */ +#define LL_DIR_END_OFF 0x7fffffffffffffffULL +#define LL_DIR_END_OFF_32BIT 0x7fffffffUL + +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS 22 + +#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") +#define LUSTRE_FPRIVATE(file) ((file)->private_data) + +struct ll_dentry_data { + struct lookup_intent *lld_it; + unsigned int lld_sa_generation; + unsigned int lld_invalid:1; + unsigned int lld_nfs_dentry:1; + struct rcu_head lld_rcu_head; + unsigned long lld_neg_cache_timeout; +}; + +#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata)) + +#define LLI_INODE_MAGIC 0x111d0de5 +#define LLI_INODE_DEAD 0xdeadd00d + +struct ll_getname_data { +#ifdef HAVE_DIR_CONTEXT + struct dir_context ctx; +#endif + char *lgd_name; /* points to a buffer with NAME_MAX+1 size */ + struct lu_fid lgd_fid; /* target fid we are looking for */ + int lgd_found; /* inode matched? */ +}; + +struct ll_grouplock { + struct lu_env *lg_env; + struct cl_io *lg_io; + struct cl_lock *lg_lock; + unsigned long lg_gid; +}; + +struct ll_inode_info { + __u32 lli_inode_magic; + spinlock_t lli_lock; + + volatile unsigned long lli_flags; + struct posix_acl *lli_posix_acl; + + /* identifying fields for both metadata and data stacks. */ + struct lu_fid lli_fid; + /* master inode fid for stripe directory */ + struct lu_fid lli_pfid; + + /* We need all three because every inode may be opened in different + * modes */ + struct obd_client_handle *lli_mds_read_och; + struct obd_client_handle *lli_mds_write_och; + struct obd_client_handle *lli_mds_exec_och; + __u64 lli_open_fd_read_count; + __u64 lli_open_fd_write_count; + __u64 lli_open_fd_exec_count; + /* Protects access to och pointers and their usage counters */ + struct mutex lli_och_mutex; + + struct inode lli_vfs_inode; + + /* the most recent timestamps obtained from mds */ + s64 lli_atime; + s64 lli_mtime; + s64 lli_ctime; + spinlock_t lli_agl_lock; + + /* update atime from MDS no matter if it's older than + * local inode atime. */ + unsigned int lli_update_atime:1; + + /* Try to make the d::member and f::member are aligned. Before using + * these members, make clear whether it is directory or not. */ + union { + /* for directory */ + struct { + /* serialize normal readdir and statahead-readdir. 
*/ + struct mutex lli_readdir_mutex; + + /* metadata statahead */ + /* since parent-child threads can share the same @file + * struct, "opendir_key" is the token when dir close for + * case of parent exit before child -- it is me should + * cleanup the dir readahead. */ + void *lli_opendir_key; + struct ll_statahead_info *lli_sai; + /* protect statahead stuff. */ + spinlock_t lli_sa_lock; + /* "opendir_pid" is the token when lookup/revalid + * -- I am the owner of dir statahead. */ + pid_t lli_opendir_pid; + /* directory depth to ROOT */ + unsigned short lli_dir_depth; + /* stat will try to access statahead entries or start + * statahead if this flag is set, and this flag will be + * set upon dir open, and cleared when dir is closed, + * statahead hit ratio is too low, or start statahead + * thread failed. */ + unsigned short lli_sa_enabled:1; + /* generation for statahead */ + unsigned int lli_sa_generation; + /* rw lock protects lli_lsm_md */ + struct rw_semaphore lli_lsm_sem; + /* directory stripe information */ + struct lmv_stripe_md *lli_lsm_md; + /* directory default LMV */ + struct lmv_stripe_md *lli_default_lsm_md; + }; + + /* for non-directory */ + struct { + struct mutex lli_size_mutex; + char *lli_symlink_name; + /* + * struct rw_semaphore { + * signed long count; // align d.d_def_acl + * spinlock_t wait_lock; // align d.d_sa_lock + * struct list_head wait_list; + * } + */ + struct rw_semaphore lli_trunc_sem; + struct range_lock_tree lli_write_tree; + + struct rw_semaphore lli_glimpse_sem; + ktime_t lli_glimpse_time; + struct list_head lli_agl_list; + __u64 lli_agl_index; + + /* for writepage() only to communicate to fsync */ + int lli_async_rc; + + /* + * Whenever a process try to read/write the file, the + * jobid of the process will be saved here, and it'll + * be packed into the write PRC when flush later. + * + * So the read/write statistics for jobid will not be + * accurate if the file is shared by different jobs. + */ + char lli_jobid[LUSTRE_JOBID_SIZE]; + }; + }; + + /* XXX: For following frequent used members, although they maybe special + * used for non-directory object, it is some time-wasting to check + * whether the object is directory or not before using them. On the + * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce + * the "ll_inode_info" size even if moving those members into u.f. + * So keep them out side. + * + * In the future, if more members are added only for directory, + * some of the following members can be moved into u.f. + */ + struct cl_object *lli_clob; + + /* mutex to request for layout lock exclusively. 
*/ + struct mutex lli_layout_mutex; + /* Layout version, protected by lli_layout_lock */ + __u32 lli_layout_gen; + spinlock_t lli_layout_lock; + + __u32 lli_projid; /* project id */ + + struct rw_semaphore lli_xattrs_list_rwsem; + struct mutex lli_xattrs_enq_lock; + struct list_head lli_xattrs; /* ll_xattr_entry->xe_list */ +}; + +#ifndef HAVE_USER_NAMESPACE_ARG +#define inode_permission(ns, inode, mask) inode_permission(inode, mask) +#define simple_setattr(ns, de, iattr) simple_setattr(de, iattr) +#ifdef HAVE_INODEOPS_ENHANCED_GETATTR +#define ll_getattr(ns, path, stat, mask, fl) ll_getattr(path, stat, mask, fl) +#endif /* HAVE_INODEOPS_ENHANCED_GETATTR */ +#define ll_setattr(ns, de, attr) ll_setattr(de, attr) +#endif + +static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) +{ + __u32 gen; + + spin_lock(&lli->lli_layout_lock); + gen = lli->lli_layout_gen; + spin_unlock(&lli->lli_layout_lock); + + return gen; +} + +static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen) +{ + spin_lock(&lli->lli_layout_lock); + lli->lli_layout_gen = gen; + spin_unlock(&lli->lli_layout_lock); +} + +enum ll_file_flags { + /* File data is modified. */ + LLIF_DATA_MODIFIED = 0, + /* File is being restored */ + LLIF_FILE_RESTORING = 1, + /* Xattr cache is attached to the file */ + LLIF_XATTR_CACHE = 2, + /* Project inherit */ + LLIF_PROJECT_INHERIT = 3, +}; + +static inline void ll_file_set_flag(struct ll_inode_info *lli, + enum ll_file_flags flag) +{ + set_bit(flag, &lli->lli_flags); +} + +static inline void ll_file_clear_flag(struct ll_inode_info *lli, + enum ll_file_flags flag) +{ + clear_bit(flag, &lli->lli_flags); +} + +static inline bool ll_file_test_flag(struct ll_inode_info *lli, + enum ll_file_flags flag) +{ + return test_bit(flag, &lli->lli_flags); +} + +static inline bool ll_file_test_and_clear_flag(struct ll_inode_info *lli, + enum ll_file_flags flag) +{ + return test_and_clear_bit(flag, &lli->lli_flags); +} + +int ll_xattr_cache_destroy(struct inode *inode); + +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid); + +static inline bool obd_connect_has_secctx(struct obd_connect_data *data) +{ +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; +#else + return false; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ +} + +static inline void obd_connect_set_secctx(struct obd_connect_data *data) +{ +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ +} + +int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, + const char **secctx_name, void **secctx, + __u32 *secctx_size); +int ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir); + +int ll_listsecurity(struct inode *inode, char *secctx_name, + size_t secctx_name_size); + +/* + * Locking to guarantee consistency of non-atomic updates to long long i_size, + * consistency between file size and KMS. + * + * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order. 
+ */ + +void ll_inode_size_lock(struct inode *inode); +void ll_inode_size_unlock(struct inode *inode); + +static inline struct ll_inode_info *ll_i2info(struct inode *inode) +{ + return container_of(inode, struct ll_inode_info, lli_vfs_inode); +} + +/* default to use at least 16M for fast read if possible */ +#define RA_REMAIN_WINDOW_MIN MiB_TO_PAGES(16UL) + +/* default to about 64M of readahead on a given system. */ +#define SBI_DEFAULT_READAHEAD_MAX MiB_TO_PAGES(64UL) + +/* default to read-ahead full files smaller than 2MB on the second read */ +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX MiB_TO_PAGES(2UL) + +enum ra_stat { + RA_STAT_HIT = 0, + RA_STAT_MISS, + RA_STAT_DISTANT_READPAGE, + RA_STAT_MISS_IN_WINDOW, + RA_STAT_FAILED_GRAB_PAGE, + RA_STAT_FAILED_MATCH, + RA_STAT_DISCARDED, + RA_STAT_ZERO_LEN, + RA_STAT_ZERO_WINDOW, + RA_STAT_EOF, + RA_STAT_MAX_IN_FLIGHT, + RA_STAT_WRONG_GRAB_PAGE, + RA_STAT_FAILED_REACH_END, + _NR_RA_STAT, +}; + +struct ll_ra_info { + atomic_t ra_cur_pages; + unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; + unsigned long ra_max_read_ahead_whole_pages; +}; + +/* ra_io_arg will be filled in the beginning of ll_readahead with + * ras_lock, then the following ll_read_ahead_pages will read RA + * pages according to this arg, all the items in this structure are + * counted by page index. + */ +struct ra_io_arg { + unsigned long ria_start; /* start offset of read-ahead*/ + unsigned long ria_end; /* end offset of read-ahead*/ + unsigned long ria_reserved; /* reserved pages for read-ahead */ + unsigned long ria_end_min; /* minimum end to cover current read */ + bool ria_eof; /* reach end of file */ + /* If stride read pattern is detected, ria_stoff means where + * stride read is started. Note: for normal read-ahead, the + * value here is meaningless, and also it will not be accessed*/ + pgoff_t ria_stoff; + /* ria_length and ria_pages are the length and pages length in the + * stride I/O mode. 
And they will also be used to check whether + * it is stride I/O read-ahead in the read-ahead pages*/ + unsigned long ria_length; + unsigned long ria_pages; +}; + +/* LL_HIST_MAX=32 causes an overflow */ +#define LL_HIST_MAX 28 +#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ +#define LL_PROCESS_HIST_MAX 10 +struct per_process_info { + pid_t pid; + struct obd_histogram pp_r_hist; + struct obd_histogram pp_w_hist; +}; + +/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ +struct ll_rw_extents_info { + struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; +}; + +#define LL_OFFSET_HIST_MAX 100 +struct ll_rw_process_info { + pid_t rw_pid; + int rw_op; + loff_t rw_range_start; + loff_t rw_range_end; + loff_t rw_last_file_pos; + loff_t rw_offset; + size_t rw_smallest_extent; + size_t rw_largest_extent; + struct ll_file_data *rw_last_file; +}; + +enum stats_track_type { + STATS_TRACK_ALL = 0, /* track all processes */ + STATS_TRACK_PID, /* track process with this pid */ + STATS_TRACK_PPID, /* track processes with this ppid */ + STATS_TRACK_GID, /* track processes with this gid */ + STATS_TRACK_LAST, +}; + +/* flags for sbi->ll_flags */ +#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */ +#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */ +#define LL_SBI_FLOCK 0x04 +#define LL_SBI_USER_XATTR 0x08 /* support user xattr */ +#define LL_SBI_ACL 0x10 /* support ACL */ +/* LL_SBI_RMT_CLIENT 0x40 remote client */ +#define LL_SBI_MDS_CAPA 0x80 /* support mds capa, obsolete */ +#define LL_SBI_OSS_CAPA 0x100 /* support oss capa, obsolete */ +#define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */ +#define LL_SBI_LRU_RESIZE 0x400 /* lru resize support */ +#define LL_SBI_LAZYSTATFS 0x800 /* lazystatfs mount option */ +/* LL_SBI_SOM_PREVIEW 0x1000 SOM preview mount option, obsolete */ +#define LL_SBI_32BIT_API 0x2000 /* generate 32 bit inodes. 
*/ +#define LL_SBI_64BIT_HASH 0x4000 /* support 64-bits dir hash/offset */ +#define LL_SBI_AGL_ENABLED 0x8000 /* enable agl */ +#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */ +#define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */ +#define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */ +#define LL_SBI_XATTR_CACHE 0x80000 /* support for xattr cache */ +#define LL_SBI_NOROOTSQUASH 0x100000 /* do not apply root squash */ +#define LL_SBI_ALWAYS_PING 0x200000 /* always ping even if server + * suppress_pings */ +#define LL_SBI_FAST_READ 0x400000 /* fast read support */ +#define LL_SBI_FILE_SECCTX 0x800000 /* set file security context at create */ +/* LL_SBI_PIO 0x1000000 parallel IO support, introduced in + 2.10, abandoned */ +#define LL_SBI_TINY_WRITE 0x2000000 /* tiny write support */ +#define LL_SBI_MDLL_AUTO_REFRESH 0x10000000 /* enable metadata lazy load */ +#define LL_SBI_MDLL 0x20000000 /* enable metadata lazy load auto-refresh */ +#define LL_SBI_MDLL_BYPASS 0x40000000 /* disable metadata lazy load auto-refresh */ + +#define LL_SBI_FLAGS { \ + "nolck", \ + "checksum", \ + "flock", \ + "user_xattr", \ + "acl", \ + "???", \ + "???", \ + "mds_capa", \ + "oss_capa", \ + "flock", \ + "lru_resize", \ + "lazy_statfs", \ + "som", \ + "32bit_api", \ + "64bit_hash", \ + "agl", \ + "verbose", \ + "layout", \ + "user_fid2path",\ + "xattr_cache", \ + "norootsquash", \ + "always_ping", \ + "fast_read", \ + "file_secctx", \ + "pio", \ + "tiny_write", \ +} + +/* This is embedded into llite super-blocks to keep track of connect + * flags (capabilities) supported by all imports given mount is + * connected to. */ +struct lustre_client_ocd { + /* This is conjunction of connect_flags across all imports + * (LOVs) this mount is connected to. This field is updated by + * cl_ocd_update() under ->lco_lock. */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + +struct ll_sb_info { + /* this protects pglist and ra_info. It isn't safe to + * grab from interrupt contexts */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct obd_device *ll_md_obd; + struct obd_device *ll_dt_obd; + struct dentry *ll_debugfs_entry; + struct lu_fid ll_root_fid; /* root object fid */ + + int ll_flags; + unsigned int ll_umounting:1, + ll_xattr_cache_enabled:1, + ll_xattr_cache_set:1, /* already set to 0/1 */ + ll_client_common_fill_super_succeeded:1, + ll_checksum_set:1, + ll_inode_cache_enabled:1; + + struct lustre_client_ocd ll_lco; + + struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + + /* Used to track "unstable" pages on a client, and maintain a + * LRU list of clean pages. An "unstable" page is defined as + * any page which is sent to a server as part of a bulk request, + * but is uncommitted to stable storage. 
*/ + struct cl_client_cache *ll_cache; + + struct lprocfs_stats *ll_ra_stats; + + struct ll_ra_info ll_ra_info; + unsigned int ll_namelen; + struct file_operations *ll_fop; + + struct lu_site *ll_site; + struct cl_device *ll_cl; + /* Statistics */ + struct ll_rw_extents_info ll_rw_extents_info; + int ll_extent_process_count; + struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX]; + unsigned int ll_offset_process_count; + struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX]; + unsigned int ll_rw_offset_entry_count; + int ll_stats_track_id; + enum stats_track_type ll_stats_track_type; + int ll_rw_stats_on; + + /* metadata stat-ahead */ + unsigned int ll_sa_running_max;/* max concurrent + * statahead instances */ + unsigned int ll_sa_max; /* max statahead RPCs */ + atomic_t ll_sa_total; /* statahead thread started + * count */ + atomic_t ll_sa_wrong; /* statahead thread stopped for + * low hit ratio */ + atomic_t ll_sa_running; /* running statahead thread + * count */ + atomic_t ll_agl_total; /* AGL thread started count */ + + dev_t ll_sdev_orig; /* save s_dev before assign for + * clustred nfs */ + /* root squash */ + struct root_squash_info ll_squash; + struct path ll_mnt; + + /* st_blksize returned by stat(2), when non-zero */ + unsigned int ll_stat_blksize; + + /* maximum relative age of cached statfs results */ + unsigned int ll_statfs_max_age; + + /* + * seconds after which negative dentries should be invalidated. + * -1 disables invalidation of negative entries based on timeout + * 0 always triggers serverside validation + */ + int ll_neg_dentry_timeout; + + /* + * MDLL directory restore retry count + * This would determine the number of times the restore would be + * retried before returning error to the client. The retry would + * be based on the released bit of the directory. + * A value of -1 would retry indefinitely. + */ +#define LL_MDLL_DIR_RESTORE_DEF_RETRY_COUNT 1 + atomic_t ll_dir_restore_max_retry_count; + + struct kset ll_kset; /* sysfs object */ + struct completion ll_kobj_unregister; +}; + +/* + * per file-descriptor read-ahead data. + */ +struct ll_readahead_state { + spinlock_t ras_lock; + /* + * index of the last page that read(2) needed and that wasn't in the + * cache. Used by ras_update() to detect seeks. + * + * XXX nikita: if access seeks into cached region, Lustre doesn't see + * this. + */ + unsigned long ras_last_readpage; + /* + * number of pages read after last read-ahead window reset. As window + * is reset on each seek, this is effectively a number of consecutive + * accesses. Maybe ->ras_accessed_in_window is better name. + * + * XXX nikita: window is also reset (by ras_update()) when Lustre + * believes that memory pressure evicts read-ahead pages. In that + * case, it probably doesn't make sense to expand window to + * PTLRPC_MAX_BRW_PAGES on the third access. + */ + unsigned long ras_consecutive_pages; + /* + * number of read requests after the last read-ahead window reset + * As window is reset on each seek, this is effectively the number + * on consecutive read request and is used to trigger read-ahead. + */ + unsigned long ras_consecutive_requests; + /* + * Parameters of current read-ahead window. Handled by + * ras_update(). On the initial access to the file or after a seek, + * window is reset to 0. After 3 consecutive accesses, window is + * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by + * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. 
+ */ + unsigned long ras_window_start, ras_window_len; + /* + * Optimal RPC size. It decides how many pages will be sent + * for each read-ahead. + */ + unsigned long ras_rpc_size; + /* + * Where next read-ahead should start at. This lies within read-ahead + * window. Read-ahead window is read in pieces rather than at once + * because: 1. lustre limits total number of pages under read-ahead by + * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages + * not covered by DLM lock. + */ + unsigned long ras_next_readahead; + /* + * Total number of ll_file_read requests issued, reads originating + * due to mmap are not counted in this total. This value is used to + * trigger full file read-ahead after multiple reads to a small file. + */ + unsigned long ras_requests; + /* + * Page index with respect to the current request, these value + * will not be accurate when dealing with reads issued via mmap. + */ + unsigned long ras_request_index; + /* + * The following 3 items are used for detecting the stride I/O + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_pages-|-stride_gap-| + * ras_stride_offset = offset; + * ras_stride_length = stride_pages + stride_gap; + * ras_stride_pages = stride_pages; + * Note: all these three items are counted by pages. + */ + unsigned long ras_stride_length; + unsigned long ras_stride_pages; + pgoff_t ras_stride_offset; + /* + * number of consecutive stride request count, and it is similar as + * ras_consecutive_requests, but used for stride I/O mode. + * Note: only more than 2 consecutive stride request are detected, + * stride read-ahead will be enable + */ + unsigned long ras_consecutive_stride_requests; +}; + +extern struct kmem_cache *ll_file_data_slab; +struct lustre_handle; +struct ll_file_data { + struct ll_readahead_state fd_ras; + struct ll_grouplock fd_grouplock; + __u64 lfd_pos; + __u32 fd_flags; + fmode_t fd_omode; + /* openhandle if lease exists for this file. + * Borrow lli->lli_och_mutex to protect assignment */ + struct obd_client_handle *fd_lease_och; + struct obd_client_handle *fd_och; + struct file *fd_file; + /* Indicate whether need to report failure when close. + * true: failure is known, not report again. + * false: unknown failure, should report. */ + bool fd_write_failed; + bool ll_lock_no_expand; + rwlock_t fd_lock; /* protect lcc list */ + struct list_head fd_lccs; /* list of ll_cl_context */ + /* Used by mirrored file to lead IOs to a specific mirror, usually + * for mirror resync. 0 means default. */ + __u32 fd_designated_mirror; + /* The layout version when resync starts. Resync I/O should carry this + * layout version for verification to OST objects */ + __u32 fd_layout_version; +}; + +void llite_tunables_unregister(void); +int llite_tunables_register(void); + +static inline struct inode *ll_info2i(struct ll_inode_info *lli) +{ + return &lli->lli_vfs_inode; +} + +__u32 ll_i2suppgid(struct inode *i); +void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2); + +static inline int ll_need_32bit_api(struct ll_sb_info *sbi) +{ +#if BITS_PER_LONG == 32 + return 1; +#elif defined(CONFIG_COMPAT) + if (unlikely(sbi->ll_flags & LL_SBI_32BIT_API)) + return true; + +# ifdef CONFIG_X86_X32 + /* in_compat_syscall() returns true when called from a kthread + * and CONFIG_X86_X32 is enabled, which is wrong. So check + * whether the caller comes from a syscall (ie. not a kthread) + * before calling in_compat_syscall(). 
*/ + if (current->flags & PF_KTHREAD) + return false; +# endif + + return unlikely(in_compat_syscall()); +#else + return unlikely(sbi->ll_flags & LL_SBI_32BIT_API); +#endif +} + +static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi) +{ + return !!(sbi->ll_flags & LL_SBI_FAST_READ); +} + +static inline bool ll_sbi_has_tiny_write(struct ll_sb_info *sbi) +{ + return !!(sbi->ll_flags & LL_SBI_TINY_WRITE); +} + +void ll_ras_enter(struct file *f); + +/* llite/lcommon_misc.c */ +int cl_ocd_update(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ll_grouplock *lg); +void cl_put_grouplock(struct ll_grouplock *lg); + +/* llite/lproc_llite.c */ +int ll_debugfs_register_super(struct super_block *sb, const char *name); +void ll_debugfs_unregister_super(struct super_block *sb); +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); + +enum { + LPROC_LL_DIRTY_HITS, + LPROC_LL_DIRTY_MISSES, + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_BRW_READ, + LPROC_LL_BRW_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MAP, + LPROC_LL_FAULT, + LPROC_LL_MKWRITE, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STAFS, + LPROC_LL_ALLOC_INODE, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_GETXATTR_HITS, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FILE_OPCODES +}; + +/* llite/dir.c */ +enum get_default_layout_type { + GET_DEFAULT_LAYOUT_ROOT = 1, +}; + +struct ll_dir_chain { +}; + +static inline void ll_dir_chain_init(struct ll_dir_chain *chain) +{ +} + +static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) +{ +} + +extern const struct file_operations ll_dir_operations; +extern const struct inode_operations ll_dir_inode_operations; +#ifdef HAVE_DIR_CONTEXT +int ll_dir_read(struct inode *inode, __u64 *pos, struct md_op_data *op_data, + struct dir_context *ctx); +#else +int ll_dir_read(struct inode *inode, __u64 *pos, struct md_op_data *op_data, + void *cookie, filldir_t filldir); +#endif +int ll_get_mdt_idx(struct inode *inode); +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid); +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset, struct ll_dir_chain *chain); +void ll_release_page(struct inode *inode, struct page *page, bool remove); + +/* llite/namei.c */ +extern const struct inode_operations ll_special_inode_operations; + +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *lic); +int ll_test_inode_by_fid(struct inode *inode, void *opaque); +int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); +int ll_rmdir_entry(struct inode *dir, char *name, int namelen); +void ll_update_times(struct ptlrpc_request *request, struct inode *inode); + +/* llite/rw.c */ +int ll_writepage(struct page *page, struct writeback_control *wbc); +int ll_writepages(struct address_space *, struct writeback_control *wbc); +int ll_readpage(struct file *file, struct page *page); +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, 
struct file *file); +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); + +enum lcc_type; +void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io, + enum lcc_type type); +void ll_cl_remove(struct file *file, const struct lu_env *env); +struct ll_cl_context *ll_cl_find(struct file *file); + +extern const struct address_space_operations ll_aops; + +/* llite/file.c */ +extern struct file_operations ll_file_operations; +extern struct file_operations ll_file_operations_flock; +extern struct file_operations ll_file_operations_noflock; +extern struct inode_operations ll_file_inode_operations; +extern int ll_have_md_lock(struct inode *inode, __u64 *bits, + enum ldlm_mode l_req_mode); +extern enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + enum ldlm_mode mode); + +int ll_file_open(struct inode *inode, struct file *file); +int ll_file_release(struct inode *inode, struct file *file); +int ll_release_openhandle(struct dentry *, struct lookup_intent *); +int ll_md_real_close(struct inode *inode, fmode_t fmode); +extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw); +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +#else +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); +#endif /* HAVE_USER_NAMESPACE_ARG */ +struct posix_acl *ll_get_acl(struct inode *inode, int type +#ifdef HAVE_GET_ACL_RCU_ARG + , bool rcu +#endif /* HAVE_GET_ACL_RCU_ARG */ + ); +#ifdef HAVE_IOP_SET_ACL +#ifdef CONFIG_FS_POSIX_ACL +int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); +#else /* !CONFIG_FS_POSIX_ACL */ +#define ll_set_acl NULL +#endif /* CONFIG_FS_POSIX_ACL */ + +#endif + +static inline int ll_xflags_to_inode_flags(int xflags) +{ + return ((xflags & FS_XFLAG_SYNC) ? S_SYNC : 0) | + ((xflags & FS_XFLAG_NOATIME) ? S_NOATIME : 0) | + ((xflags & FS_XFLAG_APPEND) ? S_APPEND : 0) | + ((xflags & FS_XFLAG_IMMUTABLE) ? S_IMMUTABLE : 0); +} + +static inline int ll_inode_flags_to_xflags(int flags) +{ + return ((flags & S_SYNC) ? FS_XFLAG_SYNC : 0) | + ((flags & S_NOATIME) ? FS_XFLAG_NOATIME : 0) | + ((flags & S_APPEND) ? FS_XFLAG_APPEND : 0) | + ((flags & S_IMMUTABLE) ? 
FS_XFLAG_IMMUTABLE : 0); +} + +int ll_migrate(struct inode *parent, struct file *file, + struct lmv_user_md *lum, const char *name); +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid, struct inode **inode); +#ifdef HAVE_GENERIC_PERMISSION_4ARGS +int ll_inode_permission(struct inode *inode, int mask, unsigned int flags); +#else +# ifndef HAVE_INODE_PERMISION_2ARGS +# ifdef HAVE_USER_NAMESPACE_ARG +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); +# else +int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd); +# endif +# else +int ll_inode_permission(struct inode *inode, int mask); +# endif +#endif +int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa); +int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg); +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg); + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + __u64 flags, struct lov_user_md *lum, + int lum_size); +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmm, int *lmm_size, + struct ptlrpc_request **request); +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default); +int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type); +int ll_dir_getstripe_default(struct inode *inode, void **lmmp, + int *lmm_size, struct ptlrpc_request **request, + struct ptlrpc_request **root_request, u64 valid); +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid); +#ifdef HAVE_FILE_FSYNC_4ARGS +int ll_fsync(struct file *file, loff_t start, loff_t end, int data); +#elif defined(HAVE_FILE_FSYNC_2ARGS) +int ll_fsync(struct file *file, int data); +#else +int ll_fsync(struct file *file, struct dentry *dentry, int data); +#endif +int ll_merge_attr(const struct lu_env *env, struct inode *inode); +int ll_fid2path(struct inode *inode, void __user *arg); +int ll_data_version(struct inode *inode, __u64 *data_version, int flags); +int ll_hsm_release(struct inode *inode); +int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); +void ll_io_set_mirror(struct cl_io *io, const struct file *file); +int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui); + +/* llite/dcache.c */ + +int ll_d_init(struct dentry *de); +extern const struct dentry_operations ll_d_ops; +void ll_intent_drop_lock(struct lookup_intent *); +void ll_intent_release(struct lookup_intent *); +void ll_invalidate_aliases(struct inode *); +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry); +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, struct dentry *de); + +/* llite/llite_lib.c */ +extern struct super_operations lustre_super_operations; + +void ll_lli_init(struct ll_inode_info *lli); +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt); +void ll_put_super(struct super_block *sb); +void ll_kill_super(struct super_block *sb); +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); +void ll_dir_clear_lsm_md(struct inode *inode); +void ll_clear_inode(struct inode *inode); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import); +int ll_setattr(struct user_namespace 
*mnt_userns, struct dentry *de, + struct iattr *attr); +int ll_statfs(struct dentry *de, struct kstatfs *sfs); +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags); +int ll_update_inode(struct inode *inode, struct lustre_md *md); +void ll_update_inode_flags(struct inode *inode, int ext_flags); +void ll_update_dir_depth(struct inode *dir, struct inode *inode); +int ll_read_inode2(struct inode *inode, void *opaque); +void ll_delete_inode(struct inode *inode); +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +int ll_flush_ctx(struct inode *inode); +void ll_umount_begin(struct super_block *sb); +int ll_remount_fs(struct super_block *sb, int *flags, char *data); +#ifdef HAVE_SUPEROPS_USE_DENTRY +int ll_show_options(struct seq_file *seq, struct dentry *dentry); +#else +int ll_show_options(struct seq_file *seq, struct vfsmount *vfs); +#endif +void ll_dirty_page_discard_warn(struct page *page, int ioret); +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *, struct lookup_intent *); +int ll_obd_statfs(struct inode *inode, void __user *arg); +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); +int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); + +void ll_unlock_md_op_lsm(struct md_op_data *op_data); +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, size_t namelen, + __u32 mode, enum md_op_code opc, + void *data); +void ll_finish_md_op_data(struct md_op_data *op_data); +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); +void ll_compute_rootsquash_state(struct ll_sb_info *sbi); +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf); +void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req); + +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req); + +/* Compute expected user md size when passing in a md from user space */ +static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) +{ + switch (lum->lmm_magic) { + case LOV_USER_MAGIC_V1: + return sizeof(struct lov_user_md_v1); + case LOV_USER_MAGIC_V3: + return sizeof(struct lov_user_md_v3); + case LOV_USER_MAGIC_SPECIFIC: + if (lum->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + return -EINVAL; + + return lov_user_md_size(lum->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + case LOV_USER_MAGIC_COMP_V1: + return ((struct lov_comp_md_v1 *)lum)->lcm_size; + } + + return -EINVAL; +} + +/* llite/llite_nfs.c */ +extern struct export_operations lustre_export_operations; +__u32 get_uuid2int(const char *name, int len); +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid); +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid); + +/* llite/symlink.c */ +extern struct inode_operations ll_fast_symlink_inode_operations; + +/** + * IO arguments for various VFS I/O interfaces. 
+ */ +struct vvp_io_args { + /** normal/sendfile/splice */ + enum vvp_io_subtype via_io_subtype; + + union { + struct { + struct kiocb *via_iocb; + struct iov_iter *via_iter; + } normal; + struct { + struct pipe_inode_info *via_pipe; + unsigned int via_flags; + } splice; + } u; +}; + +enum lcc_type { + LCC_RW = 1, + LCC_MMAP +}; + +struct ll_cl_context { + struct list_head lcc_list; + void *lcc_cookie; + const struct lu_env *lcc_env; + struct cl_io *lcc_io; + struct cl_page *lcc_page; + enum lcc_type lcc_type; +}; + +struct ll_thread_info { + struct iov_iter lti_iter; + struct vvp_io_args lti_args; + struct ra_io_arg lti_ria; + struct ll_cl_context lti_io_ctx; +}; + +extern struct lu_context_key ll_thread_key; + +static inline struct ll_thread_info *ll_env_info(const struct lu_env *env) +{ + struct ll_thread_info *lti; + + lti = lu_context_key_get(&env->le_ctx, &ll_thread_key); + LASSERT(lti != NULL); + + return lti; +} + +static inline struct vvp_io_args *ll_env_args(const struct lu_env *env, + enum vvp_io_subtype type) +{ + struct vvp_io_args *via = &ll_env_info(env)->lti_args; + + via->via_io_subtype = type; + + return via; +} + +/* llite/llite_mmap.c */ + +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); +int ll_file_mmap(struct file * file, struct vm_area_struct * vma); +void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, + unsigned long addr, size_t count); +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count); + +static inline void ll_invalidate_page(struct page *vmpage) +{ + struct address_space *mapping = vmpage->mapping; + loff_t offset = vmpage->index << PAGE_SHIFT; + + LASSERT(PageLocked(vmpage)); + if (mapping == NULL) + return; + + /* + * truncate_complete_page() calls + * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete(). 
+ */ + ll_teardown_mmaps(mapping, offset, offset + PAGE_SIZE); + truncate_complete_page(mapping, vmpage); +} + +#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2dtexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_dt_exp; +} + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2mdexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_md_exp; +} + +static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) +{ + struct obd_device *obd = sbi->ll_md_exp->exp_obd; + if (obd == NULL) + LBUG(); + return &obd->u.cli; +} + +// FIXME: replace the name of this with LL_SB to conform to kernel stuff +static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) +{ + return ll_s2sbi(inode->i_sb); +} + +static inline struct obd_export *ll_i2dtexp(struct inode *inode) +{ + return ll_s2dtexp(inode->i_sb); +} + +static inline struct obd_export *ll_i2mdexp(struct inode *inode) +{ + return ll_s2mdexp(inode->i_sb); +} + +static inline struct lu_fid *ll_inode2fid(struct inode *inode) +{ + struct lu_fid *fid; + + LASSERT(inode != NULL); + fid = &ll_i2info(inode)->lli_fid; + + return fid; +} + +static inline bool ll_dir_striped(struct inode *inode) +{ + LASSERT(inode); + return S_ISDIR(inode->i_mode) && + lmv_dir_striped(ll_i2info(inode)->lli_lsm_md); +} + +static inline loff_t ll_file_maxbytes(struct inode *inode) +{ + struct cl_object *obj = ll_i2info(inode)->lli_clob; + + if (obj == NULL) + return MAX_LFS_FILESIZE; + + return min_t(loff_t, cl_object_maxbytes(obj), MAX_LFS_FILESIZE); +} + +/* llite/xattr.c */ +extern const struct xattr_handler *ll_xattr_handlers[]; + +#define XATTR_USER_T 1 +#define XATTR_TRUSTED_T 2 +#define XATTR_SECURITY_T 3 +#define XATTR_ACL_ACCESS_T 4 +#define XATTR_ACL_DEFAULT_T 5 +#define XATTR_LUSTRE_T 6 +#define XATTR_OTHER_T 7 + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ll_xattr_list(struct inode *inode, const char *name, int type, + void *buffer, size_t size, u64 valid); +const struct xattr_handler *get_xattr_type(const char *name); + +/** + * Common IO arguments for various VFS I/O interfaces. + */ +int cl_sb_init(struct super_block *sb); +int cl_sb_fini(struct super_block *sb); + +enum ras_update_flags { + LL_RAS_HIT = 0x1, + LL_RAS_MMAP = 0x2 +}; +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); +void ll_ra_stats_inc(struct inode *inode, enum ra_stat which); + +/* statahead.c */ + +#define LL_SA_RPC_MIN 2 +#define LL_SA_RPC_DEF 32 +#define LL_SA_RPC_MAX 512 + +/* XXX: If want to support more concurrent statahead instances, + * please consider to decentralize the RPC lists attached + * on related import, such as imp_{sending,delayed}_list. 
+ * LU-11079 */ +#define LL_SA_RUNNING_MAX 256 +#define LL_SA_RUNNING_DEF 16 + +#define LL_SA_CACHE_BIT 5 +#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) +#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) + +/* per inode struct, for dir only */ +struct ll_statahead_info { + struct dentry *sai_dentry; + atomic_t sai_refcount; /* when access this struct, hold + * refcount */ + unsigned int sai_max; /* max ahead of lookup */ + __u64 sai_sent; /* stat requests sent count */ + __u64 sai_replied; /* stat requests which received + * reply */ + __u64 sai_index; /* index of statahead entry */ + __u64 sai_index_wait; /* index of entry which is the + * caller is waiting for */ + __u64 sai_hit; /* hit count */ + __u64 sai_miss; /* miss count: + * for "ls -al" case, includes + * hidden dentry miss; + * for "ls -l" case, it does not + * include hidden dentry miss. + * "sai_miss_hidden" is used for + * the later case. + */ + unsigned int sai_consecutive_miss; /* consecutive miss */ + unsigned int sai_miss_hidden;/* "ls -al", but first dentry + * is not a hidden one */ + unsigned int sai_skip_hidden;/* skipped hidden dentry count + */ + unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for + * hidden entries */ + sai_agl_valid:1,/* AGL is valid for the dir */ + sai_in_readpage:1;/* statahead is in readdir()*/ + wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ + struct ptlrpc_thread sai_thread; /* stat-ahead thread */ + struct ptlrpc_thread sai_agl_thread; /* AGL thread */ + struct list_head sai_interim_entries; /* entries which got async + * stat reply, but not + * instantiated */ + struct list_head sai_entries; /* completed entries */ + struct list_head sai_agls; /* AGLs to be sent */ + struct list_head sai_cache[LL_SA_CACHE_SIZE]; + spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; + atomic_t sai_cache_count; /* entry count in cache */ +}; + +int ll_statahead(struct inode *dir, struct dentry **dentry, bool unplug); +void ll_authorize_statahead(struct inode *dir, void *key); +void ll_deauthorize_statahead(struct inode *dir, void *key); + +/* glimpse.c */ +blkcnt_t dirty_cnt(struct inode *inode); + +int cl_glimpse_size0(struct inode *inode, int agl); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl); + +static inline int cl_glimpse_size(struct inode *inode) +{ + return cl_glimpse_size0(inode, 0); +} + +/* AGL is 'asychronous glimpse lock', which is a speculative lock taken as + * part of statahead */ +static inline int cl_agl(struct inode *inode) +{ + return cl_glimpse_size0(inode, 1); +} + +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise); + +int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, __u16 *refcheck); + +static inline int ll_glimpse_size(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + down_read(&lli->lli_glimpse_sem); + rc = cl_glimpse_size(inode); + lli->lli_glimpse_time = ktime_get(); + up_read(&lli->lli_glimpse_sem); + return rc; +} + +/* dentry may statahead when statahead is enabled and current process has opened + * parent directory, and this dentry hasn't accessed statahead cache before */ +static inline bool +dentry_may_statahead(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli; + struct ll_dentry_data *ldd; + + if (ll_i2sbi(dir)->ll_sa_max == 0) + return false; + + lli = ll_i2info(dir); + + /* statahead is not allowed for this dir, there may be three causes: + * 1. dir is not opened. + * 2. 
statahead hit ratio is too low. + * 3. previous stat started statahead thread failed. */ + if (!lli->lli_sa_enabled) + return false; + + /* not the same process, don't statahead */ + if (lli->lli_opendir_pid != current_pid()) + return false; + + /* + * When stating a dentry, kernel may trigger 'revalidate' or 'lookup' + * multiple times, eg. for 'getattr', 'getxattr' and etc. + * For patchless client, lookup intent is not accurate, which may + * misguide statahead. For example: + * The 'revalidate' call for 'getattr' and 'getxattr' of a dentry will + * have the same intent -- IT_GETATTR, while one dentry should access + * statahead cache once, otherwise statahead windows is messed up. + * The solution is as following: + * Assign 'lld_sa_generation' with 'lli_sa_generation' when a dentry + * IT_GETATTR for the first time, and subsequent IT_GETATTR will + * bypass interacting with statahead cache by checking + * 'lld_sa_generation == lli->lli_sa_generation'. + */ + ldd = ll_d2d(dentry); + if (ldd != NULL && ldd->lld_sa_generation == lli->lli_sa_generation) + return false; + + return true; +} + +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout); + +static inline int ll_file_nolock(const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct inode *inode = file_inode((struct file *)file); + + LASSERT(fd != NULL); + return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) || + (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK)); +} + +static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, + struct lookup_intent *it, __u64 *bits) +{ + if (!it->it_lock_set) { + struct lustre_handle handle; + + /* If this inode is a remote object, it will get two + * separate locks in different namespaces, Master MDT, + * where the name entry is, will grant LOOKUP lock, + * remote MDT, where the object is, will grant + * UPDATE|PERM lock. 
The inode will be attched to both + * LOOKUP and PERM locks, so revoking either locks will + * case the dcache being cleared */ + if (it->it_remote_lock_mode) { + handle.cookie = it->it_remote_lock_handle; + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID + "(%p) for remote lock %#llx\n", + PFID(ll_inode2fid(inode)), inode, + handle.cookie); + md_set_lock_data(exp, &handle, inode, NULL); + } + + handle.cookie = it->it_lock_handle; + + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)" + " for lock %#llx\n", + PFID(ll_inode2fid(inode)), inode, handle.cookie); + + md_set_lock_data(exp, &handle, inode, &it->it_lock_bits); + it->it_lock_set = 1; + } + + if (bits != NULL) + *bits = it->it_lock_bits; +} + +static inline void ll_lock_dcache(struct inode *inode) +{ +#ifdef HAVE_DCACHE_LOCK + spin_lock(&dcache_lock); +#else + spin_lock(&inode->i_lock); +#endif +} + +static inline void ll_unlock_dcache(struct inode *inode) +{ +#ifdef HAVE_DCACHE_LOCK + spin_unlock(&dcache_lock); +#else + spin_unlock(&inode->i_lock); +#endif +} + +static inline int d_lustre_invalid(const struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + return (lld == NULL) || lld->lld_invalid; +} + +static inline void __d_lustre_invalidate(struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb); + + if (lld == NULL) + return; + + if (sbi->ll_neg_dentry_timeout != OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) + lld->lld_neg_cache_timeout = + jiffies + sbi->ll_neg_dentry_timeout * HZ; + lld->lld_invalid = 1; +} + +/* + * Mark dentry INVALID, if dentry refcount is zero (this is normally case for + * ll_md_blocking_ast), unhash this dentry, and let dcache to reclaim it later; + * else dput() of the last refcount will unhash this dentry and kill it. + */ +static inline void d_lustre_invalidate(struct dentry *dentry, int nested) +{ + CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p " + "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry, + dentry->d_parent, dentry->d_inode, ll_d_count(dentry)); + + spin_lock_nested(&dentry->d_lock, + nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL); + __d_lustre_invalidate(dentry); + /* + * We should be careful about dentries created by d_obtain_alias(). + * These dentries are not put in the dentry tree, instead they are + * linked to sb->s_anon through dentry->d_hash. + * shrink_dcache_for_umount() shrinks the tree and sb->s_anon list. + * If we unhashed such a dentry, unmount would not be able to find + * it and busy inodes would be reported. 
+ */ + if (ll_d_count(dentry) == 0 && !(dentry->d_flags & DCACHE_DISCONNECTED)) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void d_lustre_revalidate(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + LASSERT(ll_d2d(dentry) != NULL); + ll_d2d(dentry)->lld_invalid = 0; + spin_unlock(&dentry->d_lock); +} + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); +int ll_layout_refresh(struct inode *inode, __u32 *gen); +int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext); + +int ll_xattr_init(void); +void ll_xattr_fini(void); + +int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, enum cl_req_type crt); + +int ll_getparent(struct file *file, struct getparent __user *arg); + +/* lcommon_cl.c */ +int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, + enum op_xvalid xvalid, unsigned int attr_flags); + +extern struct lu_env *cl_inode_fini_env; +extern __u16 cl_inode_fini_refcheck; + +int cl_file_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); + +u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); +u32 cl_fid_build_gen(const struct lu_fid *fid); + +int ll_get_hsm_state(struct inode *inode, u32 *hus_states); + +#endif /* LLITE_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c new file mode 100644 index 0000000000000..d030796c71a0f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -0,0 +1,3134 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/llite/llite_lib.c + * + * Lustre Light Super operations + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include + +#include +#ifdef HAVE_UAPI_LINUX_MOUNT_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +struct kmem_cache *ll_file_data_slab; + +#ifndef log2 +#define log2(n) ffz(~(n)) +#endif + +static struct ll_sb_info *ll_init_sbi(void) +{ + struct ll_sb_info *sbi = NULL; + unsigned long pages; + unsigned long lru_page_max; + struct sysinfo si; + class_uuid_t uuid; + int i; + ENTRY; + + OBD_ALLOC_PTR(sbi); + if (sbi == NULL) + RETURN(NULL); + + spin_lock_init(&sbi->ll_lock); + mutex_init(&sbi->ll_lco.lco_lock); + spin_lock_init(&sbi->ll_pp_extent_lock); + spin_lock_init(&sbi->ll_process_lock); + sbi->ll_rw_stats_on = 0; + sbi->ll_statfs_max_age = OBD_STATFS_CACHE_SECONDS; + sbi->ll_neg_dentry_timeout = OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS; + + si_meminfo(&si); + pages = si.totalram - si.totalhigh; + lru_page_max = pages / 2; + + /* initialize ll_cache data */ + sbi->ll_cache = cl_cache_init(lru_page_max); + if (sbi->ll_cache == NULL) { + OBD_FREE(sbi, sizeof(*sbi)); + RETURN(NULL); + } + + sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, + SBI_DEFAULT_READAHEAD_MAX); + sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1; + + ll_generate_random_uuid(uuid); + class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); + + sbi->ll_flags |= LL_SBI_VERBOSE; +#ifdef ENABLE_CHECKSUM + sbi->ll_flags |= LL_SBI_CHECKSUM; +#endif +#ifdef ENABLE_FLOCK + sbi->ll_flags |= LL_SBI_FLOCK; +#endif + +#ifdef HAVE_LRU_RESIZE_SUPPORT + sbi->ll_flags |= LL_SBI_LRU_RESIZE; +#endif + sbi->ll_flags |= LL_SBI_LAZYSTATFS; + + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. + pp_r_hist.oh_lock); + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. 
+ pp_w_hist.oh_lock); + } + + /* metadata statahead is enabled by default */ + sbi->ll_sa_running_max = LL_SA_RUNNING_DEF; + sbi->ll_sa_max = LL_SA_RPC_DEF; + atomic_set(&sbi->ll_sa_total, 0); + atomic_set(&sbi->ll_sa_wrong, 0); + atomic_set(&sbi->ll_sa_running, 0); + atomic_set(&sbi->ll_agl_total, 0); + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + sbi->ll_flags |= LL_SBI_FAST_READ; + sbi->ll_flags |= LL_SBI_TINY_WRITE; + + /* root squash */ + sbi->ll_squash.rsi_uid = 0; + sbi->ll_squash.rsi_gid = 0; + INIT_LIST_HEAD(&sbi->ll_squash.rsi_nosquash_nids); + init_rwsem(&sbi->ll_squash.rsi_sem); + + atomic_set(&sbi->ll_dir_restore_max_retry_count, + LL_MDLL_DIR_RESTORE_DEF_RETRY_COUNT); + + RETURN(sbi); +} + +static void ll_free_sbi(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + if (sbi != NULL) { + if (!list_empty(&sbi->ll_squash.rsi_nosquash_nids)) + cfs_free_nidlist(&sbi->ll_squash.rsi_nosquash_nids); + if (sbi->ll_cache != NULL) { + cl_cache_decref(sbi->ll_cache); + sbi->ll_cache = NULL; + } + OBD_FREE(sbi, sizeof(*sbi)); + } + EXIT; +} + +static int client_common_fill_super(struct super_block *sb, char *md, char *dt, + struct vfsmount *mnt) +{ + struct inode *root = NULL; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; + u64 valid; + int size, err, checksum; + + ENTRY; + sbi->ll_md_obd = class_name2obd(md); + if (!sbi->ll_md_obd) { + CERROR("MD %s: not setup or attached\n", md); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(data); + if (data == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(osfs); + if (osfs == NULL) { + OBD_FREE_PTR(data); + RETURN(-ENOMEM); + } + + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + /* indicate MDT features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS| + OBD_CONNECT_MAX_EASIZE | + OBD_CONNECT_FLOCK_DEAD | + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | + OBD_CONNECT_OPEN_BY_FID | + OBD_CONNECT_DIR_STRIPE | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | + OBD_CONNECT_SUBTREE | + OBD_CONNECT_MULTIMODRPCS | + OBD_CONNECT_GRANT_PARAM | + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2; + + data->ocd_connect_flags2 = OBD_CONNECT2_FLR | + OBD_CONNECT2_LOCK_CONVERT | + OBD_CONNECT2_DIR_MIGRATE | + OBD_CONNECT2_SUM_STATFS | + OBD_CONNECT2_ARCHIVE_ID_ARRAY | + OBD_CONNECT2_LSOM | + OBD_CONNECT2_ASYNC_DISCARD | + OBD_CONNECT2_GETATTR_PFID | + OBD_CONNECT2_MDLL; + + if (sbi->ll_flags & LL_SBI_MDLL) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL; + + if (sbi->ll_flags & LL_SBI_MDLL_BYPASS) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_BYPASS; + + if (sbi->ll_flags & LL_SBI_MDLL_AUTO_REFRESH) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_AUTO_REFRESH; + +#ifdef HAVE_LRU_RESIZE_SUPPORT + if (sbi->ll_flags & LL_SBI_LRU_RESIZE) + 
data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#endif +#ifdef CONFIG_FS_POSIX_ACL + data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK | + OBD_CONNECT_LARGE_ACL; +#endif + + data->ocd_cksum_types = obd_cksum_types_supported_client(); + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) + /* flag mdc connection as lightweight, only used for test + * purpose, use with care */ + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + + data->ocd_ibits_known = MDS_INODELOCK_FULL; + data->ocd_version = LUSTRE_VERSION_CODE; + + if (sb->s_flags & SB_RDONLY) + data->ocd_connect_flags |= OBD_CONNECT_RDONLY; + if (sbi->ll_flags & LL_SBI_USER_XATTR) + data->ocd_connect_flags |= OBD_CONNECT_XATTR; + +#ifdef SB_NOSEC + /* Setting this indicates we correctly support S_NOSEC (See kernel + * commit 9e1f1de02c2275d7172e18dc4e7c2065777611bf) + */ + sb->s_flags |= SB_NOSEC; +#endif + + if (sbi->ll_flags & LL_SBI_FLOCK) + sbi->ll_fop = &ll_file_operations_flock; + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + sbi->ll_fop = &ll_file_operations; + else + sbi->ll_fop = &ll_file_operations_noflock; + + /* always ping even if server suppress_pings */ + if (sbi->ll_flags & LL_SBI_ALWAYS_PING) + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + + obd_connect_set_secctx(data); + +#if defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_SELINUX_POLICY; +#endif + + data->ocd_brw_size = MD_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_md_exp, sbi->ll_md_obd, + &sbi->ll_sb_uuid, data, NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to complete," + " abort, or time out.\n", md); + GOTO(out, err); + } else if (err) { + CERROR("cannot connect to %s: rc = %d\n", md, err); + GOTO(out, err); + } + + sbi->ll_md_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init metadata layer FID infrastructure, " + "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md, err); + } + + /* For mount, we only need fs info from MDT0, and also in DNE, it + * can make sure the client can be mounted as long as MDT0 is + * avaible */ + err = obd_statfs(NULL, sbi->ll_md_exp, osfs, + ktime_get_seconds() - sbi->ll_statfs_max_age, + OBD_STATFS_FOR_MDT0); + if (err) + GOTO(out_md_fid, err); + + /* This needs to be after statfs to ensure connect has finished. + * Note that "data" does NOT contain the valid connect reply. + * If connecting to a 1.8 server there will be no LMV device, so + * we can access the MDC export directly and exp_connect_flags will + * be non-zero, but if accessing an upgraded 2.1 server it will + * have the correct flags filled in. + * XXX: fill in the LMV exp_connect_flags from MDC(s). */ + valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; + if (exp_connect_flags(sbi->ll_md_exp) != 0 && + valid != CLIENT_CONNECT_MDT_REQD) { + char *buf; + + OBD_ALLOC_WAIT(buf, PAGE_SIZE); + obd_connect_flags2str(buf, PAGE_SIZE, + valid ^ CLIENT_CONNECT_MDT_REQD, 0, ","); + LCONSOLE_ERROR_MSG(0x170, "Server %s does not support " + "feature(s) needed for correct operation " + "of this client (%s). 
Please upgrade " + "server or downgrade client.\n", + sbi->ll_md_exp->exp_obd->obd_name, buf); + OBD_FREE(buf, PAGE_SIZE); + GOTO(out_md_fid, err = -EPROTO); + } + + size = sizeof(*data); + err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), + KEY_CONN_DATA, &size, data); + if (err) { + CERROR("%s: Get connect data failed: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md_fid, err); + } + + LASSERT(osfs->os_bsize); + sb->s_blocksize = osfs->os_bsize; + sb->s_blocksize_bits = log2(osfs->os_bsize); + sb->s_magic = LL_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sbi->ll_inode_cache_enabled = 1; + sbi->ll_namelen = osfs->os_namelen; + sbi->ll_mnt.mnt = current->fs->root.mnt; + + if ((sbi->ll_flags & LL_SBI_USER_XATTR) && + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + + if (data->ocd_connect_flags & OBD_CONNECT_ACL) { +#ifdef SB_POSIXACL + sb->s_flags |= SB_POSIXACL; +#endif + sbi->ll_flags |= LL_SBI_ACL; + } else { + LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); +#ifdef SB_POSIXACL + sb->s_flags &= ~SB_POSIXACL; +#endif + sbi->ll_flags &= ~LL_SBI_ACL; + } + + if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) + sbi->ll_flags |= LL_SBI_64BIT_HASH; + + if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) + sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; + + if (obd_connect_has_secctx(data)) + sbi->ll_flags |= LL_SBI_FILE_SECCTX; + + if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) { + if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) { + LCONSOLE_INFO("%s: disabling xattr cache due to " + "unknown maximum xattr size.\n", dt); + } else if (!sbi->ll_xattr_cache_set) { + /* If xattr_cache is already set (no matter 0 or 1) + * during processing llog, it won't be enabled here. */ + sbi->ll_flags |= LL_SBI_XATTR_CACHE; + sbi->ll_xattr_cache_enabled = 1; + } + } + + sbi->ll_dt_obd = class_name2obd(dt); + if (!sbi->ll_dt_obd) { + CERROR("DT %s: not setup or attached\n", dt); + GOTO(out_md_fid, err = -ENODEV); + } + + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + /* indicate OST features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_AT | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | + OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO | + OBD_CONNECT_FLAGS2 | OBD_CONNECT_GRANT_SHRINK; + +/* The client currently advertises support for OBD_CONNECT_LOCKAHEAD_OLD so it + * can interoperate with an older version of lockahead which was released prior + * to landing in master. This support will be dropped when 2.13 development + * starts. At the point, we should not just drop the connect flag (below), we + * should also remove the support in the code. + * + * Removing it means a few things: + * 1. Remove this section here + * 2. Remove CEF_NONBLOCK in ll_file_lockahead() + * 3. Remove function exp_connect_lockahead_old + * 4. 
Remove LDLM_FL_LOCKAHEAD_OLD_RESERVED in lustre_dlm_flags.h + * */ +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 12, 50, 0) + data->ocd_connect_flags |= OBD_CONNECT_LOCKAHEAD_OLD; +#endif + + data->ocd_connect_flags2 = OBD_CONNECT2_LOCKAHEAD; + + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_GRANT_PARAM)) + data->ocd_connect_flags |= OBD_CONNECT_GRANT_PARAM; + + /* OBD_CONNECT_CKSUM should always be set, even if checksums are + * disabled by default, because it can still be enabled on the + * fly via /sys. As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time + */ + data->ocd_connect_flags |= OBD_CONNECT_CKSUM; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) + data->ocd_cksum_types = OBD_CKSUM_ADLER; + else + data->ocd_cksum_types = obd_cksum_types_supported_client(); + +#ifdef HAVE_LRU_RESIZE_SUPPORT + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#endif + /* always ping even if server suppress_pings */ + if (sbi->ll_flags & LL_SBI_ALWAYS_PING) + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d " + "ocd_grant: %d\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + + sbi->ll_dt_obd->obd_upcall.onu_owner = &sbi->ll_lco; + sbi->ll_dt_obd->obd_upcall.onu_upcall = cl_ocd_update; + + data->ocd_brw_size = DT_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_dt_exp, sbi->ll_dt_obd, + &sbi->ll_sb_uuid, data, NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to " + "complete, abort, or time out.\n", dt); + GOTO(out_md, err); + } else if (err) { + CERROR("%s: Cannot connect to %s: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, dt, err); + GOTO(out_md, err); + } + + sbi->ll_dt_exp->exp_connect_data = *data; + + /* Don't change value if it was specified in the config log */ + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) { + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + max_t(unsigned long, SBI_DEFAULT_READAHEAD_WHOLE_MAX, + (data->ocd_brw_size >> PAGE_SHIFT)); + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages > + sbi->ll_ra_info.ra_max_pages_per_file) + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + sbi->ll_ra_info.ra_max_pages_per_file; + } + + err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init data layer FID infrastructure, " + "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_dt, err); + } + + mutex_lock(&sbi->ll_lco.lco_lock); + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; + sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; + mutex_unlock(&sbi->ll_lco.lco_lock); + + fid_zero(&sbi->ll_root_fid); + err = md_get_root(sbi->ll_md_exp, get_mount_fileset(sb), + &sbi->ll_root_fid); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + GOTO(out_lock_cn_cb, err); + } + if (!fid_is_sane(&sbi->ll_root_fid)) { + CERROR("%s: Invalid root fid "DFID" during mount\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(&sbi->ll_root_fid)); + GOTO(out_lock_cn_cb, err = -EINVAL); + } + CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); + + sb->s_op = &lustre_super_operations; +#ifdef HAVE_XATTR_HANDLER_FLAGS + sb->s_xattr = ll_xattr_handlers; +#endif +#if THREAD_SIZE >= 8192 /*b=17630*/ + sb->s_export_op = &lustre_export_operations; +#endif + + /* make root inode + * XXX: move this to after cbd setup? 
*/ + valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMODEASIZE; + if (sbi->ll_flags & LL_SBI_ACL) + valid |= OBD_MD_FLACL; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out_lock_cn_cb, err = -ENOMEM); + + op_data->op_fid1 = sbi->ll_root_fid; + op_data->op_mode = 0; + op_data->op_valid = valid; + + err = md_getattr(sbi->ll_md_exp, op_data, &request); + + OBD_FREE_PTR(op_data); + if (err) { + CERROR("%s: md_getattr failed for root: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_lock_cn_cb, err); + } + + err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &lmd); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n", err); + ptlrpc_req_finished(request); + GOTO(out_lock_cn_cb, err); + } + + LASSERT(fid_is_sane(&sbi->ll_root_fid)); + root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, + sbi->ll_flags & LL_SBI_32BIT_API), + &lmd); + md_free_lustre_md(sbi->ll_md_exp, &lmd); + ptlrpc_req_finished(request); + + if (IS_ERR(root)) { +#ifdef CONFIG_FS_POSIX_ACL + if (lmd.posix_acl) { + posix_acl_release(lmd.posix_acl); + lmd.posix_acl = NULL; + } +#endif + err = IS_ERR(root) ? PTR_ERR(root) : -EBADF; + root = NULL; + CERROR("lustre_lite: bad iget4 for root\n"); + GOTO(out_root, err); + } + + checksum = sbi->ll_flags & LL_SBI_CHECKSUM; + if (sbi->ll_checksum_set) { + err = obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_CHECKSUM), KEY_CHECKSUM, + sizeof(checksum), &checksum, NULL); + if (err) { + CERROR("%s: Set checksum failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); + } + } + cl_sb_init(sb); + + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET), + KEY_CACHE_SET, sizeof(*sbi->ll_cache), + sbi->ll_cache, NULL); + if (err) { + CERROR("%s: Set cache_set failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); + } + + sb->s_root = d_make_root(root); + if (sb->s_root == NULL) { + CERROR("%s: can't make root dentry\n", + ll_get_fsname(sb, NULL, 0)); + GOTO(out_root, err = -ENOMEM); + } +#ifdef HAVE_DCACHE_LOCK + sb->s_root->d_op = &ll_d_ops; +#endif + + sbi->ll_sdev_orig = sb->s_dev; + + /* We set sb->s_dev equal on all lustre clients in order to support + * NFS export clustering. NFSD requires that the FSID be the same + * on all clients. */ + /* s_dev is also used in lt_compare() to compare two fs, but that is + * only a node-local comparison. 
*/ + uuid = obd_get_uuid(sbi->ll_md_exp); + if (uuid != NULL) + sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); + + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + + if (sbi->ll_dt_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_dt_obd->obd_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); + if (err < 0) { + CERROR("%s: could not register %s in llite: rc = %d\n", + dt, ll_get_fsname(sb, NULL, 0), err); + err = 0; + } + } + + if (sbi->ll_md_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_md_obd->obd_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); + if (err < 0) { + CERROR("%s: could not register %s in llite: rc = %d\n", + md, ll_get_fsname(sb, NULL, 0), err); + err = 0; + } + } + + RETURN(err); +out_root: + if (root) + iput(root); +out_lock_cn_cb: + obd_fid_fini(sbi->ll_dt_exp->exp_obd); +out_dt: + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + sbi->ll_dt_obd = NULL; +out_md_fid: + obd_fid_fini(sbi->ll_md_exp->exp_obd); +out_md: + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; + sbi->ll_md_obd = NULL; +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + return err; +} + +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(*lmmsize); + rc = obd_get_info(NULL, sbi->ll_dt_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); + if (rc != 0) { + CERROR("%s: cannot get max LOV EA size: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, rc); + RETURN(rc); + } + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); + if (rc) + CERROR("Get max mdsize error rc %d\n", rc); + + RETURN(rc); +} + +/** + * Get the value of the default_easize parameter. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[out] lmmsize pointer to storage location for value + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &size, lmmsize); + if (rc) + CERROR("Get default mdsize error rc %d\n", rc); + + RETURN(rc); +} + +/** + * Set the default_easize parameter to the given value. 
+ * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[in] lmmsize the size to set + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_set_default_mdsize(struct ll_sb_info *sbi, int lmmsize) +{ + int rc; + + if (lmmsize < sizeof(struct lov_mds_md) || + lmmsize > OBD_MAX_DEFAULT_EA_SIZE) + return -EINVAL; + + rc = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_DEFAULT_EASIZE), KEY_DEFAULT_EASIZE, + sizeof(int), &lmmsize, NULL); + + RETURN(rc); +} + +static void client_common_put_super(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + cl_sb_fini(sb); + + obd_fid_fini(sbi->ll_dt_exp->exp_obd); + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + + ll_debugfs_unregister_super(sb); + + obd_fid_fini(sbi->ll_md_exp->exp_obd); + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; + + EXIT; +} + +void ll_kill_super(struct super_block *sb) +{ + struct ll_sb_info *sbi; + ENTRY; + + /* not init sb ?*/ + if (!(sb->s_flags & SB_ACTIVE)) + return; + + sbi = ll_s2sbi(sb); + /* we need restore s_dev from changed for clustred NFS before put_super + * because new kernels have cached s_dev and change sb->s_dev in + * put_super not affected real removing devices */ + if (sbi) { + sb->s_dev = sbi->ll_sdev_orig; + sbi->ll_umounting = 1; + + /* wait running statahead threads to quit */ + while (atomic_read(&sbi->ll_sa_running) > 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC >> 3)); + } + } + + EXIT; +} + +static inline int ll_set_opt(const char *opt, char *data, int fl) +{ + if (strncmp(opt, data, strlen(opt)) != 0) + return 0; + else + return fl; +} + +/* non-client-specific mount options are parsed in lmd_parse */ +static int ll_options(char *options, struct ll_sb_info *sbi) +{ + int tmp; + char *s1 = options, *s2; + int *flags = &sbi->ll_flags; + ENTRY; + + if (!options) + RETURN(0); + + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); + if (tmp) { + *flags = (*flags & ~LL_SBI_LOCALFLOCK) | tmp; + goto next; + } + tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); + if (tmp) { + *flags = (*flags & ~LL_SBI_FLOCK) | tmp; + goto next; + } + tmp = ll_set_opt("mdll_auto_refresh", s1, LL_SBI_MDLL_AUTO_REFRESH); + if (tmp) { + *flags = (*flags & ~LL_SBI_MDLL_AUTO_REFRESH) | tmp; + goto next; + } + tmp = ll_set_opt("mdll_bypass", s1, LL_SBI_MDLL_BYPASS); + if (tmp) { + *flags = (*flags & ~LL_SBI_MDLL_BYPASS) | tmp; + goto next; + } + tmp = ll_set_opt("mdll", s1, LL_SBI_MDLL); + if (tmp) { + *flags = (*flags & ~LL_SBI_MDLL) | tmp; + goto next; + } + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("context", s1, 1); + if (tmp) + goto next; + tmp = ll_set_opt("fscontext", s1, 1); + if (tmp) + goto next; + tmp = ll_set_opt("defcontext", s1, 1); + if (tmp) + goto next; + tmp = ll_set_opt("rootcontext", s1, 1); + if (tmp) + goto next; + tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags |= tmp; + goto next; 
+ } + tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags &= ~tmp; + goto next; + } + + tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags |= tmp; + sbi->ll_checksum_set = 1; + goto next; + } + tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags &= ~tmp; + sbi->ll_checksum_set = 1; + goto next; + } + tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("always_ping", s1, LL_SBI_ALWAYS_PING); + if (tmp) { + *flags |= tmp; + goto next; + } + LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n", + s1); + RETURN(-EINVAL); + +next: + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) + break; + s1 = s2 + 1; + } + RETURN(0); +} + +void ll_lli_init(struct ll_inode_info *lli) +{ + lli->lli_inode_magic = LLI_INODE_MAGIC; + lli->lli_flags = 0; + spin_lock_init(&lli->lli_lock); + lli->lli_posix_acl = NULL; + /* Do not set lli_fid, it has been initialized already. */ + fid_zero(&lli->lli_pfid); + lli->lli_mds_read_och = NULL; + lli->lli_mds_write_och = NULL; + lli->lli_mds_exec_och = NULL; + lli->lli_open_fd_read_count = 0; + lli->lli_open_fd_write_count = 0; + lli->lli_open_fd_exec_count = 0; + mutex_init(&lli->lli_och_mutex); + spin_lock_init(&lli->lli_agl_lock); + spin_lock_init(&lli->lli_layout_lock); + ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); + lli->lli_clob = NULL; + + init_rwsem(&lli->lli_xattrs_list_rwsem); + mutex_init(&lli->lli_xattrs_enq_lock); + + LASSERT(lli->lli_vfs_inode.i_mode != 0); + if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { + mutex_init(&lli->lli_readdir_mutex); + lli->lli_opendir_key = NULL; + lli->lli_sai = NULL; + spin_lock_init(&lli->lli_sa_lock); + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + init_rwsem(&lli->lli_lsm_sem); + } else { + mutex_init(&lli->lli_size_mutex); + lli->lli_symlink_name = NULL; + init_rwsem(&lli->lli_trunc_sem); + range_lock_tree_init(&lli->lli_write_tree); + init_rwsem(&lli->lli_glimpse_sem); + lli->lli_glimpse_time = ktime_set(0, 0); + INIT_LIST_HEAD(&lli->lli_agl_list); + lli->lli_agl_index = 0; + lli->lli_async_rc = 0; + } + mutex_init(&lli->lli_layout_mutex); + memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid)); +} + +#define MAX_STRING_SIZE 128 + +#ifndef HAVE_SUPER_SETUP_BDI_NAME + +#define LSI_BDI_INITIALIZED 0x00400000 + +#ifndef HAVE_BDI_CAP_MAP_COPY +# define BDI_CAP_MAP_COPY 0 +#endif + +static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
+{ + struct lustre_sb_info *lsi = s2lsi(sb); + char buf[MAX_STRING_SIZE]; + va_list args; + int err; + + err = bdi_init(&lsi->lsi_bdi); + if (err) + return err; + + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + lsi->lsi_bdi.name = "lustre"; + va_start(args, fmt); + vsnprintf(buf, MAX_STRING_SIZE, fmt, args); + va_end(args); + err = bdi_register(&lsi->lsi_bdi, NULL, "%s", buf); + va_end(args); + if (!err) + sb->s_bdi = &lsi->lsi_bdi; + + return err; +} +#endif /* !HAVE_SUPER_SETUP_BDI_NAME */ + +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) +{ + struct lustre_profile *lprof = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = NULL; + char *dt = NULL, *md = NULL; + char *profilenm = get_profile_name(sb); + struct config_llog_instance *cfg; + /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ + const int instlen = 16 + 2; + unsigned long cfg_instance = ll_get_cfg_instance(sb); + char name[MAX_STRING_SIZE]; + int md_len = 0; + int dt_len = 0; + char *ptr; + int len; + int err; + + ENTRY; + /* for ASLR, to map between cfg_instance and hashed ptr */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); + + try_module_get(THIS_MODULE); + + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out_free_cfg, err = -ENOMEM); + + /* client additional sb info */ + lsi->lsi_llsbi = sbi = ll_init_sbi(); + if (!sbi) + GOTO(out_free_cfg, err = -ENOMEM); + + err = ll_options(lsi->lsi_lmd->lmd_opts, sbi); + if (err) + GOTO(out_free_cfg, err); + + err = super_setup_bdi_name(sb, "lustre-%016lx", cfg_instance); + if (err) + GOTO(out_free_cfg, err); + +#ifndef HAVE_DCACHE_LOCK + /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ + sb->s_d_op = &ll_d_ops; +#endif + /* Get fsname */ + len = strlen(profilenm); + ptr = strrchr(profilenm, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "%.*s-%016lx", len, + profilenm, cfg_instance); + + /* Call ll_debugfs_register_super() before lustre_process_log() + * so that "llite.*.*" params can be processed correctly. + */ + err = ll_debugfs_register_super(sb, name); + if (err < 0) { + CERROR("%s: could not register mountpoint in llite: rc = %d\n", + ll_get_fsname(sb, NULL, 0), err); + err = 0; + } + + /* The cfg_instance is a value unique to this super, in case some + * joker tries to mount the same fs at two mount points. + */ + cfg->cfg_instance = cfg_instance; + cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; + cfg->cfg_callback = class_config_llog_handler; + cfg->cfg_sub_clds = CONFIG_SUB_CLIENT; + /* set up client obds */ + err = lustre_process_log(sb, profilenm, cfg); + if (err < 0) + GOTO(out_debugfs, err); + + /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ + lprof = class_get_profile(profilenm); + if (lprof == NULL) { + LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be" + " read from the MGS. 
Does that filesystem " + "exist?\n", profilenm); + GOTO(out_debugfs, err = -EINVAL); + } + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, + lprof->lp_md, lprof->lp_dt); + + dt_len = strlen(lprof->lp_dt) + instlen + 2; + OBD_ALLOC(dt, dt_len); + if (!dt) + GOTO(out_profile, err = -ENOMEM); + snprintf(dt, dt_len - 1, "%s-%016lx", lprof->lp_dt, cfg_instance); + + md_len = strlen(lprof->lp_md) + instlen + 2; + OBD_ALLOC(md, md_len); + if (!md) + GOTO(out_free_dt, err = -ENOMEM); + snprintf(md, md_len - 1, "%s-%016lx", lprof->lp_md, cfg_instance); + + /* connections, registrations, sb setup */ + err = client_common_fill_super(sb, md, dt, mnt); + if (err < 0) + GOTO(out_free_md, err); + + sbi->ll_client_common_fill_super_succeeded = 1; + +out_free_md: + if (md) + OBD_FREE(md, md_len); +out_free_dt: + if (dt) + OBD_FREE(dt, dt_len); +out_profile: + if (lprof) + class_put_profile(lprof); +out_debugfs: + if (err < 0) + ll_debugfs_unregister_super(sb); +out_free_cfg: + if (cfg) + OBD_FREE_PTR(cfg); + + if (err) + ll_put_super(sb); + else if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Mounted %s\n", profilenm); + RETURN(err); +} /* ll_fill_super */ + +void ll_put_super(struct super_block *sb) +{ + struct config_llog_instance cfg, params_cfg; + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + unsigned long cfg_instance = ll_get_cfg_instance(sb); + long ccc_count; + int next, force = 1, rc = 0; + ENTRY; + + if (!sbi) + GOTO(out_no_sbi, 0); + + /* Should replace instance_id with something better for ASLR */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); + + cfg.cfg_instance = cfg_instance; + lustre_end_log(sb, profilenm, &cfg); + + params_cfg.cfg_instance = cfg_instance; + lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); + + if (sbi->ll_md_exp) { + obd = class_exp2obd(sbi->ll_md_exp); + if (obd) + force = obd->obd_force; + } + + /* Wait for unstable pages to be committed to stable storage */ + if (force == 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sbi->ll_cache->ccc_unstable_waitq, + atomic_long_read(&sbi->ll_cache->ccc_unstable_nr) == 0, + &lwi); + } + + ccc_count = atomic_long_read(&sbi->ll_cache->ccc_unstable_nr); + if (force == 0 && rc != -EINTR) + LASSERTF(ccc_count == 0, "count: %li\n", ccc_count); + + /* We need to set force before the lov_disconnect in + lustre_common_put_super, since l_d cleans up osc's as well. */ + if (force) { + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, + &next)) != NULL) { + obd->obd_force = force; + } + } + + if (sbi->ll_client_common_fill_super_succeeded) { + /* Only if client_common_fill_super succeeded */ + client_common_put_super(sb); + } + + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) { + class_manual_cleanup(obd); + } + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Unmounted %s\n", profilenm ? 
profilenm : ""); + + if (profilenm) + class_del_profile(profilenm); + +#ifndef HAVE_SUPER_SETUP_BDI_NAME + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } +#endif + + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; +out_no_sbi: + lustre_common_put_super(sb); + + cl_env_cache_purge(~0); + + module_put(THIS_MODULE); + + EXIT; +} /* client_put_super */ + +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode) { + struct ll_inode_info * lli; + lli = ll_i2info(lock->l_resource->lr_lvb_inode); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_resource->lr_lvb_inode); + } else { + inode = lock->l_resource->lr_lvb_inode; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : + D_WARNING, lock, "lr_lvb_inode %p is " + "bogus: magic %08x", + lock->l_resource->lr_lvb_inode, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + +void ll_dir_clear_lsm_md(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + LASSERT(S_ISDIR(inode->i_mode)); + + if (lli->lli_lsm_md) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } + + if (lli->lli_default_lsm_md) { + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + } +} + +static struct inode *ll_iget_anon_dir(struct super_block *sb, + const struct lu_fid *fid, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct mdt_body *body = md->body; + struct inode *inode; + ino_t ino; + ENTRY; + + ino = cl_fid_build_ino(fid, sbi->ll_flags & LL_SBI_32BIT_API); + inode = iget_locked(sb, ino); + if (inode == NULL) { + CERROR("%s: failed get simple inode "DFID": rc = -ENOENT\n", + ll_get_fsname(sb, NULL, 0), PFID(fid)); + RETURN(ERR_PTR(-ENOENT)); + } + + if (inode->i_state & I_NEW) { + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_stripe_md *lsm = md->lmv; + + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); + LASSERTF(S_ISDIR(inode->i_mode), "Not slave inode "DFID"\n", + PFID(fid)); + + inode->i_mtime.tv_sec = 0; + inode->i_atime.tv_sec = 0; + inode->i_ctime.tv_sec = 0; + inode->i_rdev = 0; + +#ifdef HAVE_BACKING_DEV_INFO + /* initializing backing dev info. */ + inode->i_mapping->backing_dev_info = + &s2lsi(inode->i_sb)->lsi_bdi; +#endif + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + lli->lli_fid = *fid; + ll_lli_init(lli); + + LASSERT(lsm != NULL); + /* master object FID */ + lli->lli_pfid = body->mbo_fid1; + CDEBUG(D_INODE, "lli %p slave "DFID" master "DFID"\n", + lli, PFID(fid), PFID(&lli->lli_pfid)); + unlock_new_inode(inode); + } + + RETURN(inode); +} + +static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct lu_fid *fid; + struct lmv_stripe_md *lsm = md->lmv; + struct ll_inode_info *lli = ll_i2info(inode); + int i; + + LASSERT(lsm != NULL); + + CDEBUG(D_INODE, "%s: "DFID" set dir layout:\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + lsm_md_dump(D_INODE, lsm); + + if (!lmv_dir_striped(lsm)) + goto out; + + /* XXX sigh, this lsm_root initialization should be in + * LMV layer, but it needs ll_iget right now, so we + * put this here right now. 
*/ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + fid = &lsm->lsm_md_oinfo[i].lmo_fid; + LASSERT(lsm->lsm_md_oinfo[i].lmo_root == NULL); + + if (!fid_is_sane(fid)) + continue; + + /* Unfortunately ll_iget will call ll_update_inode, + * where the initialization of slave inode is slightly + * different, so it resets lsm_md to NULL to avoid + * initializing lsm for slave inode. */ + lsm->lsm_md_oinfo[i].lmo_root = + ll_iget_anon_dir(inode->i_sb, fid, md); + if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { + int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); + + lsm->lsm_md_oinfo[i].lmo_root = NULL; + while (i-- > 0) { + iput(lsm->lsm_md_oinfo[i].lmo_root); + lsm->lsm_md_oinfo[i].lmo_root = NULL; + } + return rc; + } + } +out: + lli->lli_lsm_md = lsm; + + return 0; +} + +static void ll_update_default_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + ENTRY; + + if (!md->default_lmv) { + /* clear default lsm */ + if (lli->lli_default_lsm_md) { + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) { + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + } + up_write(&lli->lli_lsm_sem); + } + RETURN_EXIT; + } + + if (lli->lli_default_lsm_md) { + /* do nothing if default lsm isn't changed */ + down_read(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md && + lsm_md_eq(lli->lli_default_lsm_md, md->default_lmv)) { + up_read(&lli->lli_lsm_sem); + RETURN_EXIT; + } + up_read(&lli->lli_lsm_sem); + } + + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = md->default_lmv; + lsm_md_dump(D_INODE, md->default_lmv); + md->default_lmv = NULL; + up_write(&lli->lli_lsm_sem); + RETURN_EXIT; +} + +static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_stripe_md *lsm = md->lmv; + int rc = 0; + + ENTRY; + + LASSERT(S_ISDIR(inode->i_mode)); + CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md, + PFID(ll_inode2fid(inode))); + + /* update default LMV */ + if (md->default_lmv) + ll_update_default_lsm_md(inode, md); + + /* + * no striped information from request, lustre_md from req does not + * include stripeEA, see ll_md_setattr() + */ + if (!lsm) + RETURN(0); + + /* + * normally dir layout doesn't change, only take read lock to check + * that to avoid blocking other MD operations. + */ + down_read(&lli->lli_lsm_sem); + + /* some concurrent lookup initialized lsm, and unchanged */ + if (lli->lli_lsm_md && lsm_md_eq(lli->lli_lsm_md, lsm)) + GOTO(unlock, rc = 0); + + /* if dir layout doesn't match, check whether version is increased, + * which means layout is changed, this happens in dir split/merge and + * lfsck.
+ */ + if (lli->lli_lsm_md && !lsm_md_eq(lli->lli_lsm_md, lsm)) { + if (lmv_dir_striped(lli->lli_lsm_md) && + lsm->lsm_md_layout_version <= + lli->lli_lsm_md->lsm_md_layout_version) { + CERROR("%s: "DFID" dir layout mismatch:\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + lsm_md_dump(D_ERROR, lli->lli_lsm_md); + lsm_md_dump(D_ERROR, lsm); + GOTO(unlock, rc = -EINVAL); + } + + /* layout changed, switch to write lock */ + up_read(&lli->lli_lsm_sem); + down_write(&lli->lli_lsm_sem); + ll_dir_clear_lsm_md(inode); + } + + /* set directory layout */ + if (!lli->lli_lsm_md) { + struct cl_attr *attr; + + rc = ll_init_lsm_md(inode, md); + up_write(&lli->lli_lsm_sem); + if (rc != 0) + RETURN(rc); + + /* set md->lmv to NULL, so the following free lustre_md + * will not free this lsm */ + md->lmv = NULL; + + /* + * md_merge_attr() may take long, since lsm is already set, + * switch to read lock. + */ + down_read(&lli->lli_lsm_sem); + + if (!lmv_dir_striped(lli->lli_lsm_md)) + GOTO(unlock, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(unlock, rc = -ENOMEM); + + /* validate the lsm */ + rc = md_merge_attr(ll_i2mdexp(inode), lsm, attr, + ll_md_blocking_ast); + if (rc != 0) { + OBD_FREE_PTR(attr); + GOTO(unlock, rc); + } + + if (md->body->mbo_valid & OBD_MD_FLNLINK) + md->body->mbo_nlink = attr->cat_nlink; + if (md->body->mbo_valid & OBD_MD_FLSIZE) + md->body->mbo_size = attr->cat_size; + if (md->body->mbo_valid & OBD_MD_FLATIME) + md->body->mbo_atime = attr->cat_atime; + if (md->body->mbo_valid & OBD_MD_FLCTIME) + md->body->mbo_ctime = attr->cat_ctime; + if (md->body->mbo_valid & OBD_MD_FLMTIME) + md->body->mbo_mtime = attr->cat_mtime; + + OBD_FREE_PTR(attr); + } + +unlock: + up_read(&lli->lli_lsm_sem); + + return rc; +} + +void ll_clear_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + if (S_ISDIR(inode->i_mode)) { + /* these should have been cleared in ll_file_release */ + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(lli->lli_sai == NULL); + LASSERT(lli->lli_opendir_pid == 0); + } + + md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); + + LASSERT(!lli->lli_open_fd_write_count); + LASSERT(!lli->lli_open_fd_read_count); + LASSERT(!lli->lli_open_fd_exec_count); + + if (lli->lli_mds_write_och) + ll_md_real_close(inode, FMODE_WRITE); + if (lli->lli_mds_exec_och) + ll_md_real_close(inode, FMODE_EXEC); + if (lli->lli_mds_read_och) + ll_md_real_close(inode, FMODE_READ); + + if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + ll_xattr_cache_destroy(inode); + +#ifdef CONFIG_FS_POSIX_ACL + forget_all_cached_acls(inode); + if (lli->lli_posix_acl) { + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = NULL; + } +#endif + lli->lli_inode_magic = LLI_INODE_DEAD; + + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + else if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) + LASSERT(list_empty(&lli->lli_agl_list)); + + /* + * XXX This has to be done before lsm is freed below, because + * cl_object still uses inode lsm. 
+ */ + cl_inode_fini(inode); + + EXIT; +} + +static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) +{ + struct lustre_md md; + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *request = NULL; + int rc, ia_valid; + ENTRY; + + op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &request); + if (rc) { + ptlrpc_req_finished(request); + if (rc == -ENOENT) { + clear_nlink(inode); + /* Unlinked special device node? Or just a race? + * Pretend we have done everything. */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) { + ia_valid = op_data->op_attr.ia_valid; + op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; + rc = simple_setattr(&init_user_ns, dentry, + &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + } + } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { + CERROR("md_setattr fails: rc = %d\n", rc); + } + RETURN(rc); + } + + rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) { + ptlrpc_req_finished(request); + RETURN(rc); + } + + ia_valid = op_data->op_attr.ia_valid; + /* inode size will be set in ll_setattr_ost, can't do it now since dirty + * cache is not cleared yet. */ + op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); + if (S_ISREG(inode->i_mode)) + inode_lock(inode); + rc = simple_setattr(&init_user_ns, dentry, &op_data->op_attr); + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); + op_data->op_attr.ia_valid = ia_valid; + + rc = ll_update_inode(inode, &md); + ptlrpc_req_finished(request); + + RETURN(rc); +} + +/* If this inode has objects allocated to it (lsm != NULL), then the OST + * object(s) determine the file size and mtime. Otherwise, the MDS will + * keep these values until such a time that objects are allocated for it. + * We do the MDS operations first, as it is checking permissions for us. + * We don't do the MDS RPC if there is nothing that we want to store there, + * otherwise there is no harm in updating mtime/atime on the MDS if we are + * going to do an RPC anyways. + * + * If we are doing a truncate, we will send the mtime and ctime updates + * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. + * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE + * at the same time. + * + * In case of HSM import, we only set attr on MDS. + */ +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import) +{ + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data = NULL; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, " + "valid %x, hsm_import %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), + inode, i_size_read(inode), attr->ia_size, attr->ia_valid, + hsm_import); + + if (attr->ia_valid & ATTR_SIZE) { + /* Check new size against VFS/VM file size limit and rlimit */ + rc = inode_newsize_ok(inode, attr->ia_size); + if (rc) + RETURN(rc); + + /* The maximum Lustre file size is variable, based on the + * OST maximum object size and number of stripes. This + * needs another check in addition to the VFS check above.
*/ + if (attr->ia_size > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE,"file "DFID" too large %llu > %llu\n", + PFID(&lli->lli_fid), attr->ia_size, + ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */ + if (attr->ia_valid & TIMES_SET_FLAGS) { + if ((!uid_eq(current_fsuid(), inode->i_uid)) && + !cfs_capable(CFS_CAP_FOWNER)) + RETURN(-EPERM); + } + + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (!(xvalid & OP_XVALID_CTIME_SET) && + (attr->ia_valid & ATTR_CTIME)) { + attr->ia_ctime = current_time(inode); + xvalid |= OP_XVALID_CTIME_SET; + } + if (!(attr->ia_valid & ATTR_ATIME_SET) && + (attr->ia_valid & ATTR_ATIME)) { + attr->ia_atime = current_time(inode); + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(attr->ia_valid & ATTR_MTIME_SET) && + (attr->ia_valid & ATTR_MTIME)) { + attr->ia_mtime = current_time(inode); + attr->ia_valid |= ATTR_MTIME_SET; + } + + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lld, ctime %lld, now = %lld\n", + (s64)attr->ia_mtime.tv_sec, (s64)attr->ia_ctime.tv_sec, + ktime_get_real_seconds()); + + if (S_ISREG(inode->i_mode)) { + if (attr->ia_valid & ATTR_SIZE) + inode_dio_write_done(inode); + inode_unlock(inode); + } + + /* We always do an MDS RPC, even if we're only changing the size; + * only the MDS knows whether truncate() should fail with -ETXTBUSY */ + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + if (!hsm_import && attr->ia_valid & ATTR_SIZE) { + /* If we are changing file size, file content is + * modified, flag it. + */ + xvalid |= OP_XVALID_OWNEROVERRIDE; + op_data->op_bias |= MDS_DATA_MODIFIED; + ll_file_clear_flag(lli, LLIF_DATA_MODIFIED); + } + + if (attr->ia_valid & ATTR_FILE) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(attr->ia_file); + + if (fd->fd_lease_och) + op_data->op_bias |= MDS_TRUNC_KEEP_LEASE; + } + + op_data->op_attr = *attr; + op_data->op_xvalid = xvalid; + + rc = ll_md_setattr(dentry, op_data); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(inode->i_mode) || hsm_import) + GOTO(out, rc = 0); + + if (attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME) || + xvalid & OP_XVALID_CTIME_SET) { + /* For truncate and utimes sending attributes to OSTs, setting + * mtime/atime to the past will be performed under PW [0:EOF] + * extent lock (new_size:EOF for truncate). It may seem + * excessive to send mtime/atime updates to OSTs when not + * setting times to past, but it is necessary due to possible + * time de-synchronization between MDT inode and OST objects + */ + rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, 0); + } + + /* If the file was restored, it needs to set dirty flag. + * + * We've already sent MDS_DATA_MODIFIED flag in + * ll_md_setattr() for truncate. However, the MDT refuses to + * set the HS_DIRTY flag on released files, so we have to set + * it again if the file has been restored. Please check how + * LLIF_DATA_MODIFIED is set in vvp_io_setattr_fini(). + * + * Please notice that if the file is not released, the previous + * MDS_DATA_MODIFIED has taken effect and usually + * LLIF_DATA_MODIFIED is not set(see vvp_io_setattr_fini()). + * This way we can save an RPC for common open + trunc + * operation. 
*/ + if (ll_file_test_and_clear_flag(lli, LLIF_DATA_MODIFIED)) { + struct hsm_state_set hss = { + .hss_valid = HSS_SETMASK, + .hss_setmask = HS_DIRTY, + }; + int rc2; + + rc2 = ll_hsm_state_set(inode, &hss); + /* truncate and write can happen at the same time, so that + * the file can be set modified even though the file is not + * restored from released state, and ll_hsm_state_set() is + * not applicable for the file, and rc2 < 0 is normal in this + * case. */ + if (rc2 < 0) + CDEBUG(D_INFO, DFID "HSM set dirty failed: rc2 = %d\n", + PFID(ll_inode2fid(inode)), rc2); + } + + EXIT; +out: + if (op_data != NULL) + ll_finish_md_op_data(op_data); + + if (S_ISREG(inode->i_mode)) { + inode_lock(inode); + if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) + inode_dio_wait(inode); + /* Once we've got the i_mutex, it's safe to set the S_NOSEC + * flag. ll_update_inode (called from ll_md_setattr), clears + * inode flags, so there is a gap where S_NOSEC is not set. + * This can cause a writer to take the i_mutex unnecessarily, + * but this is safe to do and should be rare. */ + inode_has_no_xattr(inode); + } + + ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ? + LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); + + return rc; +} + +int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *attr) +{ + int mode = de->d_inode->i_mode; + enum op_xvalid xvalid = 0; + + if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == + (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) + xvalid |= OP_XVALID_OWNEROVERRIDE; + + if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == + (ATTR_SIZE|ATTR_MODE)) && + (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || + (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID)))) + attr->ia_valid |= ATTR_FORCE; + + if ((attr->ia_valid & ATTR_MODE) && + (mode & S_ISUID) && + !(attr->ia_mode & S_ISUID) && + !(attr->ia_valid & ATTR_KILL_SUID)) + attr->ia_valid |= ATTR_KILL_SUID; + + if ((attr->ia_valid & ATTR_MODE) && + ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID) && + !(attr->ia_valid & ATTR_KILL_SGID)) + attr->ia_valid |= ATTR_KILL_SGID; + + return ll_setattr_raw(de, attr, xvalid, false); +} + +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags) +{ + struct obd_statfs obd_osfs = { 0 }; + time64_t max_age; + int rc; + + ENTRY; + max_age = ktime_get_seconds() - sbi->ll_statfs_max_age; + + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) + RETURN(rc); + + osfs->os_type = LL_SUPER_MAGIC; + + CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, osfs->os_files); + + if (osfs->os_state & OS_STATE_SUM) + GOTO(out, rc); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs(NULL, sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) /* Possibly a filesystem with no OSTs. Report MDT totals. */ + GOTO(out, rc = 0); + + CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we have _some_ OSTs, but don't have as many free objects on the + * OSTs as inodes on the MDTs, reduce the reported number of inodes + * to compensate, so that the "inodes in use" number is correct. 
+ * This should be kept in sync with lod_statfs() behaviour. + */ + if (obd_osfs.os_files && obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } + +out: + RETURN(rc); +} +int ll_statfs(struct dentry *de, struct kstatfs *sfs) +{ + struct super_block *sb = de->d_sb; + struct obd_statfs osfs; + __u64 fsid = huge_encode_dev(sb->s_dev); + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); + + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(ll_s2sbi(sb), &osfs, OBD_STATFS_SUM); + if (rc) + return rc; + + statfs_unpack(sfs, &osfs); + + /* We need to downshift for all 32-bit kernels, because we can't + * tell if the kernel is being called via sys_statfs64() or not. + * Stop before overflowing f_bsize - in which case it is better + * to just risk EOVERFLOW if caller is using old sys_statfs(). */ + if (sizeof(long) < 8) { + while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + sfs->f_fsid.val[0] = (__u32)fsid; + sfs->f_fsid.val[1] = (__u32)(fsid >> 32); + return 0; +} + +void ll_inode_size_lock(struct inode *inode) +{ + struct ll_inode_info *lli; + + LASSERT(!S_ISDIR(inode->i_mode)); + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_size_mutex); +} + +void ll_inode_size_unlock(struct inode *inode) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(inode); + mutex_unlock(&lli->lli_size_mutex); +} + +void ll_update_inode_flags(struct inode *inode, int ext_flags) +{ + inode->i_flags = ll_ext_to_inode_flags(ext_flags); + if (ext_flags & LUSTRE_PROJINHERIT_FL) + ll_file_set_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT); + else + ll_file_clear_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT); +} + +int ll_update_inode(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = md->body; + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc = 0; + + if (body->mbo_valid & OBD_MD_FLEASIZE) { + rc = cl_file_inode_init(inode, md); + if (rc) + return rc; + } + + if (S_ISDIR(inode->i_mode)) { + rc = ll_update_lsm_md(inode, md); + if (rc != 0) + return rc; + } + +#ifdef CONFIG_FS_POSIX_ACL + if (body->mbo_valid & OBD_MD_FLACL) { + spin_lock(&lli->lli_lock); + if (lli->lli_posix_acl) + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = md->posix_acl; + spin_unlock(&lli->lli_lock); + } +#endif + inode->i_ino = cl_fid_build_ino(&body->mbo_fid1, + sbi->ll_flags & LL_SBI_32BIT_API); + inode->i_generation = cl_fid_build_gen(&body->mbo_fid1); + + if (body->mbo_valid & OBD_MD_FLATIME) { + if (body->mbo_atime > inode->i_atime.tv_sec) + inode->i_atime.tv_sec = body->mbo_atime; + lli->lli_atime = body->mbo_atime; + } + + if (body->mbo_valid & OBD_MD_FLMTIME) { + if (body->mbo_mtime > inode->i_mtime.tv_sec) { + CDEBUG(D_INODE, + "setting ino %lu mtime from %lld to %llu\n", + inode->i_ino, (s64)inode->i_mtime.tv_sec, + body->mbo_mtime); + inode->i_mtime.tv_sec = body->mbo_mtime; + } + lli->lli_mtime = body->mbo_mtime; + } + + if (body->mbo_valid & OBD_MD_FLCTIME) { + if (body->mbo_ctime > inode->i_ctime.tv_sec) + inode->i_ctime.tv_sec = body->mbo_ctime; + lli->lli_ctime = body->mbo_ctime; + } + + /* Clear i_flags to remove S_NOSEC 
before permissions are updated */ + if (body->mbo_valid & OBD_MD_FLFLAGS) + ll_update_inode_flags(inode, body->mbo_flags); + if (body->mbo_valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT) | + (body->mbo_mode & ~S_IFMT); + + if (body->mbo_valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); + + LASSERT(inode->i_mode != 0); + if (S_ISREG(inode->i_mode)) + inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, + LL_MAX_BLKSIZE_BITS); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + + if (body->mbo_valid & OBD_MD_FLUID) + inode->i_uid = make_kuid(&init_user_ns, body->mbo_uid); + if (body->mbo_valid & OBD_MD_FLGID) + inode->i_gid = make_kgid(&init_user_ns, body->mbo_gid); + if (body->mbo_valid & OBD_MD_FLPROJID) + lli->lli_projid = body->mbo_projid; + if (body->mbo_valid & OBD_MD_FLNLINK) + set_nlink(inode, body->mbo_nlink); + if (body->mbo_valid & OBD_MD_FLRDEV) + inode->i_rdev = old_decode_dev(body->mbo_rdev); + + if (body->mbo_valid & OBD_MD_FLID) { + /* FID shouldn't be changed! */ + if (fid_is_sane(&lli->lli_fid)) { + LASSERTF(lu_fid_eq(&lli->lli_fid, &body->mbo_fid1), + "Trying to change FID "DFID + " to the "DFID", inode "DFID"(%p)\n", + PFID(&lli->lli_fid), PFID(&body->mbo_fid1), + PFID(ll_inode2fid(inode)), inode); + } else { + lli->lli_fid = body->mbo_fid1; + } + } + + LASSERT(fid_seq(&lli->lli_fid) != 0); + + if (body->mbo_valid & OBD_MD_FLSIZE) { + i_size_write(inode, body->mbo_size); + + CDEBUG(D_VFSTRACE, "inode="DFID", updating i_size %llu\n", + PFID(ll_inode2fid(inode)), + (unsigned long long)body->mbo_size); + + if (body->mbo_valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->mbo_blocks; + } + + if (body->mbo_valid & OBD_MD_TSTATE) { + /* Set LLIF_FILE_RESTORING if restore ongoing and + * clear it when done to ensure to start again + * glimpsing updated attrs + */ + if (body->mbo_t_state & MS_RESTORE) + ll_file_set_flag(lli, LLIF_FILE_RESTORING); + else + ll_file_clear_flag(lli, LLIF_FILE_RESTORING); + } + + return 0; +} + +/* update directory depth to ROOT, called after LOOKUP lock is fetched. */ +void ll_update_dir_depth(struct inode *dir, struct inode *inode) +{ + struct ll_inode_info *lli; + + if (!S_ISDIR(inode->i_mode)) + return; + + if (inode == dir) + return; + + lli = ll_i2info(inode); + lli->lli_dir_depth = ll_i2info(dir)->lli_dir_depth + 1; + CDEBUG(D_INODE, DFID" depth %hu\n", + PFID(&lli->lli_fid), lli->lli_dir_depth); +} + +int ll_read_inode2(struct inode *inode, void *opaque) +{ + struct lustre_md *md = opaque; + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(&lli->lli_fid), inode); + + /* Core attributes from the MDS first. This is a new inode, and + * the VFS doesn't zero times in the core inode so we have to do + * it ourselves. They will be overwritten by either MDS or OST + * attributes - we just need to make sure they aren't newer. + */ + inode->i_mtime.tv_sec = 0; + inode->i_atime.tv_sec = 0; + inode->i_ctime.tv_sec = 0; + inode->i_rdev = 0; + rc = ll_update_inode(inode, md); + if (rc != 0) + RETURN(rc); + + /* OIDEBUG(inode); */ + +#ifdef HAVE_BACKING_DEV_INFO + /* initializing backing dev info. 
*/ + inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi; +#endif + if (S_ISREG(inode->i_mode)) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + inode->i_op = &ll_file_inode_operations; + inode->i_fop = sbi->ll_fop; + inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; + EXIT; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + EXIT; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ll_fast_symlink_inode_operations; + EXIT; + } else { + inode->i_op = &ll_special_inode_operations; + + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + + EXIT; + } + + return 0; +} + +void ll_delete_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct address_space *mapping = &inode->i_data; + unsigned long nrpages; + ENTRY; + + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) { + /* It is last chance to write out dirty pages, + * otherwise we may lose data while umount. + * + * If i_nlink is 0 then just discard data. This is safe because + * local inode gets i_nlink 0 from server only for the last + * unlink, so that file is not opened somewhere else + */ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, inode->i_nlink ? + CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1); + } + truncate_inode_pages_final(mapping); + + /* Workaround for LU-118: Note nrpages may not be totally updated when + * truncate_inode_pages() returns, as there can be a page in the process + * of deletion (inside __delete_from_page_cache()) in the specified + * range. Thus mapping->nrpages can be non-zero when this function + * returns even after truncation of the whole mapping. Only do this if + * npages isn't already zero. + */ + nrpages = mapping->nrpages; + if (nrpages) { + xa_lock_irq(&mapping->i_pages); + nrpages = mapping->nrpages; + xa_unlock_irq(&mapping->i_pages); + } /* Workaround end */ + + LASSERTF(nrpages == 0, "%s: inode="DFID"(%p) nrpages=%lu, " + "see https://jira.whamcloud.com/browse/LU-118\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), inode, nrpages); + +#ifdef HAVE_SBOPS_EVICT_INODE + ll_clear_inode(inode); +#endif + clear_inode(inode); + + EXIT; +} + +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + ENTRY; + + switch (cmd) { + case FS_IOC_GETFLAGS: { + struct mdt_body *body; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLFLAGS; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("%s: failure inode "DFID": rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(ll_inode2fid(inode)), rc); + RETURN(-abs(rc)); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + flags = body->mbo_flags; + + ptlrpc_req_finished(req); + + RETURN(put_user(flags, (int __user *)arg)); + } + case FS_IOC_SETFLAGS: { + struct iattr *attr; + struct md_op_data *op_data; + struct cl_object *obj; + struct fsxattr fa = { 0 }; + + if (get_user(flags, (int __user *)arg)) + RETURN(-EFAULT); + + fa.fsx_projid = ll_i2info(inode)->lli_projid; + if (flags & LUSTRE_PROJINHERIT_FL) + fa.fsx_xflags = FS_XFLAG_PROJINHERIT; + + rc = ll_ioctl_check_project(inode, &fa); + if (rc) + RETURN(rc); + + op_data = 
ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_attr_flags = flags; + op_data->op_xvalid |= OP_XVALID_FLAGS; + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + + ll_update_inode_flags(inode, flags); + + obj = ll_i2info(inode)->lli_clob; + if (obj == NULL) + RETURN(0); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + RETURN(-ENOMEM); + + rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, flags); + + OBD_FREE_PTR(attr); + RETURN(rc); + } + default: + RETURN(-ENOSYS); + } + + RETURN(0); +} + +int ll_flush_ctx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_SEC, "flush context for user %d\n", + from_kuid(&init_user_ns, current_uid())); + + obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + return 0; +} + +/* umount -f client means force down, don't save state */ +void ll_umount_begin(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_ioctl_data *ioc_data; + struct l_wait_info lwi; + wait_queue_head_t waitq; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, + sb->s_count, atomic_read(&sb->s_active)); + + obd = class_exp2obd(sbi->ll_md_exp); + if (obd == NULL) { + CERROR("Invalid MDC connection handle %#llx\n", + sbi->ll_md_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + obd = class_exp2obd(sbi->ll_dt_exp); + if (obd == NULL) { + CERROR("Invalid LOV connection handle %#llx\n", + sbi->ll_dt_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + OBD_ALLOC_PTR(ioc_data); + if (ioc_data) { + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, + sizeof *ioc_data, ioc_data, NULL); + + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, + sizeof *ioc_data, ioc_data, NULL); + + OBD_FREE_PTR(ioc_data); + } + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just periodically checking for vfs + * to decrement mnt_cnt and hope to finish it within 10sec. + */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(10), + cfs_time_seconds(1), NULL, NULL); + l_wait_event(waitq, may_umount(sbi->ll_mnt.mnt), &lwi); + + EXIT; +} + +int ll_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int err; + __u32 read_only; + + if ((*flags & MS_RDONLY) != (sb->s_flags & SB_RDONLY)) { + read_only = *flags & MS_RDONLY; + err = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_READ_ONLY), + KEY_READ_ONLY, sizeof(read_only), + &read_only, NULL); + if (err) { + LCONSOLE_WARN("Failed to remount %s %s (%d)\n", + profilenm, read_only ? + "read-only" : "read-write", err); + return err; + } + + if (read_only) + sb->s_flags |= SB_RDONLY; + else + sb->s_flags &= ~SB_RDONLY; + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Remounted %s %s\n", profilenm, + read_only ? "read-only" : "read-write"); + } + return 0; +} + +/** + * Cleanup the open handle that is cached on MDT-side. + * + * For open case, the client side open handling thread may hit error + * after the MDT grant the open. 
Under such case, the client should + * send close RPC to the MDT as cleanup; otherwise, the open handle + * on the MDT will be leaked there until the client umount or evicted. + * + * In further, if someone unlinked the file, because the open handle + * holds the reference on such file/object, then it will block the + * subsequent threads that want to locate such object via FID. + * + * \param[in] sb super block for this file-system + * \param[in] open_req pointer to the original open request + */ +void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req) +{ + struct mdt_body *body; + struct md_op_data *op_data; + struct ptlrpc_request *close_req = NULL; + struct obd_export *exp = ll_s2sbi(sb)->ll_md_exp; + ENTRY; + + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) { + CWARN("%s: cannot allocate op_data to release open handle for " + DFID"\n", + ll_get_fsname(sb, NULL, 0), PFID(&body->mbo_fid1)); + + RETURN_EXIT; + } + + op_data->op_fid1 = body->mbo_fid1; + op_data->op_open_handle = body->mbo_open_handle; + op_data->op_mod_time = ktime_get_real_seconds(); + md_close(exp, op_data, NULL, &close_req); + ptlrpc_req_finished(close_req); + ll_finish_md_op_data(op_data); + + EXIT; +} + +/* set filesystem-wide default LMV for subdir mount if it's enabled on ROOT. */ +static int ll_fileset_default_lmv_fixup(struct inode *inode, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + union lmv_mds_md *lmm = NULL; + int size = 0; + int rc; + + LASSERT(is_root_inode(inode)); + LASSERT(!fid_is_root(&sbi->ll_root_fid)); + LASSERT(!md->default_lmv); + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &size, &req, + OBD_MD_DEFAULT_MEA, + GET_DEFAULT_LAYOUT_ROOT); + if (rc && rc != -ENODATA) + GOTO(out, rc); + + rc = 0; + if (lmm && size) { + rc = md_unpackmd(sbi->ll_md_exp, &md->default_lmv, lmm, size); + if (rc < 0) + GOTO(out, rc); + + rc = 0; + } + EXIT; +out: + if (req) + ptlrpc_req_finished(req); + return rc; +} + +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *sb, struct lookup_intent *it) +{ + struct ll_sb_info *sbi = NULL; + struct lustre_md md = { NULL }; + bool default_lmv_deleted = false; + int rc; + + ENTRY; + + LASSERT(*inode || sb); + sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode); + rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc != 0) + GOTO(cleanup, rc); + + /* + * clear default_lmv only if intent_getattr reply doesn't contain it. + * but it needs to be done after iget, check this early because + * ll_update_lsm_md() may change md. + */ + if (it && (it->it_op & (IT_LOOKUP | IT_GETATTR)) && + S_ISDIR(md.body->mbo_mode) && !md.default_lmv) { + if (unlikely(*inode && is_root_inode(*inode) && + !fid_is_root(&sbi->ll_root_fid))) { + rc = ll_fileset_default_lmv_fixup(*inode, &md); + if (rc) + GOTO(out, rc); + } + + if (!md.default_lmv) + default_lmv_deleted = true; + } + + if (*inode) { + rc = ll_update_inode(*inode, &md); + if (rc != 0) + GOTO(out, rc); + } else { + LASSERT(sb != NULL); + + /* + * At this point server returns to client's same fid as client + * generated for creating. So using ->fid1 is okay here. 
+ */ + if (!fid_is_sane(&md.body->mbo_fid1)) { + CERROR("%s: Fid is insane "DFID"\n", + ll_get_fsname(sb, NULL, 0), + PFID(&md.body->mbo_fid1)); + GOTO(out, rc = -EINVAL); + } + + *inode = ll_iget(sb, cl_fid_build_ino(&md.body->mbo_fid1, + sbi->ll_flags & LL_SBI_32BIT_API), + &md); + if (IS_ERR(*inode)) { +#ifdef CONFIG_FS_POSIX_ACL + if (md.posix_acl) { + posix_acl_release(md.posix_acl); + md.posix_acl = NULL; + } +#endif + rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM; + *inode = NULL; + CERROR("new_inode -fatal: rc %d\n", rc); + GOTO(out, rc); + } + } + + /* Handling piggyback layout lock. + * Layout lock can be piggybacked by getattr and open request. + * The lsm can be applied to inode only if it comes with a layout lock + * otherwise correct layout may be overwritten, for example: + * 1. proc1: mdt returns a lsm but not granting layout + * 2. layout was changed by another client + * 3. proc2: refresh layout and layout lock granted + * 4. proc1: to apply a stale layout */ + if (it != NULL && it->it_lock_mode != 0) { + struct lustre_handle lockh; + struct ldlm_lock *lock; + + lockh.cookie = it->it_lock_handle; + lock = ldlm_handle2lock(&lockh); + LASSERT(lock != NULL); + if (ldlm_has_layout(lock)) { + struct cl_object_conf conf; + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = *inode; + conf.coc_lock = lock; + conf.u.coc_layout = md.layout; + (void)ll_layout_conf(*inode, &conf); + } + LDLM_LOCK_PUT(lock); + } + + if (default_lmv_deleted) + ll_update_default_lsm_md(*inode, &md); + + GOTO(out, rc = 0); + +out: + md_free_lustre_md(sbi->ll_md_exp, &md); + +cleanup: + if (rc != 0 && it != NULL && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); + ll_open_cleanup(sb != NULL ? sb : (*inode)->i_sb, req); + } + + return rc; +} + +int ll_obd_statfs(struct inode *inode, void __user *arg) +{ + struct ll_sb_info *sbi = NULL; + struct obd_export *exp; + char *buf = NULL; + struct obd_ioctl_data *data = NULL; + __u32 type; + int len = 0, rc; + + if (!inode || !(sbi = ll_i2sbi(inode))) + GOTO(out_statfs, rc = -EINVAL); + + rc = obd_ioctl_getdata(&buf, &len, arg); + if (rc) + GOTO(out_statfs, rc); + + data = (void*)buf; + if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || + !data->ioc_pbuf1 || !data->ioc_pbuf2) + GOTO(out_statfs, rc = -EINVAL); + + if (data->ioc_inllen1 != sizeof(__u32) || + data->ioc_inllen2 != sizeof(__u32) || + data->ioc_plen1 != sizeof(struct obd_statfs) || + data->ioc_plen2 != sizeof(struct obd_uuid)) + GOTO(out_statfs, rc = -EINVAL); + + memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); + if (type & LL_STATFS_LMV) + exp = sbi->ll_md_exp; + else if (type & LL_STATFS_LOV) + exp = sbi->ll_dt_exp; + else + GOTO(out_statfs, rc = -ENODEV); + + rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, NULL); + if (rc) + GOTO(out_statfs, rc); +out_statfs: + OBD_FREE_LARGE(buf, len); + return rc; +} + +/* + * this is normally called in ll_fini_md_op_data(), but sometimes it needs to + * be called early to avoid deadlock. + */ +void ll_unlock_md_op_lsm(struct md_op_data *op_data) +{ + if (op_data->op_mea2_sem) { + up_read(op_data->op_mea2_sem); + op_data->op_mea2_sem = NULL; + } + + if (op_data->op_mea1_sem) { + up_read(op_data->op_mea1_sem); + op_data->op_mea1_sem = NULL; + } +} + +/* this function prepares md_op_data hint for passing it down to MD stack. 
*/ +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, size_t namelen, + __u32 mode, enum md_op_code opc, + void *data) +{ + LASSERT(i1 != NULL); + + if (name == NULL) { + /* Do not reuse namelen for something else. */ + if (namelen != 0) + return ERR_PTR(-EINVAL); + } else { + if (namelen > ll_i2sbi(i1)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + /* "/" is not valid name, but it's allowed */ + if (!lu_name_is_valid_2(name, namelen) && + strncmp("/", name, namelen) != 0) + return ERR_PTR(-EINVAL); + } + + if (op_data == NULL) + OBD_ALLOC_PTR(op_data); + + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + ll_i2gids(op_data->op_suppgids, i1, i2); + op_data->op_fid1 = *ll_inode2fid(i1); + op_data->op_code = opc; + + if (S_ISDIR(i1->i_mode)) { + down_read(&ll_i2info(i1)->lli_lsm_sem); + op_data->op_mea1_sem = &ll_i2info(i1)->lli_lsm_sem; + op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; + op_data->op_default_mea1 = ll_i2info(i1)->lli_default_lsm_md; + } + + if (i2) { + op_data->op_fid2 = *ll_inode2fid(i2); + if (S_ISDIR(i2->i_mode)) { + if (i2 != i1) { + down_read(&ll_i2info(i2)->lli_lsm_sem); + op_data->op_mea2_sem = + &ll_i2info(i2)->lli_lsm_sem; + } + op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; + } + } else { + fid_zero(&op_data->op_fid2); + } + + if (ll_i2sbi(i1)->ll_flags & LL_SBI_64BIT_HASH) + op_data->op_cli_flags |= CLI_HASH64; + + if (ll_need_32bit_api(ll_i2sbi(i1))) + op_data->op_cli_flags |= CLI_API32; + + op_data->op_name = name; + op_data->op_namelen = namelen; + op_data->op_mode = mode; + op_data->op_mod_time = ktime_get_real_seconds(); + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + op_data->op_mds = 0; + if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && + filename_is_volatile(name, namelen, &op_data->op_mds)) { + op_data->op_bias |= MDS_CREATE_VOLATILE; + } + op_data->op_data = data; + + return op_data; +} + +void ll_finish_md_op_data(struct md_op_data *op_data) +{ + ll_unlock_md_op_lsm(op_data); + ll_security_release_secctx(op_data->op_file_secctx, + op_data->op_file_secctx_size); + OBD_FREE_PTR(op_data); +} + +#ifdef HAVE_SUPEROPS_USE_DENTRY +int ll_show_options(struct seq_file *seq, struct dentry *dentry) +#else +int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) +#endif +{ + struct ll_sb_info *sbi; + +#ifdef HAVE_SUPEROPS_USE_DENTRY + LASSERT((seq != NULL) && (dentry != NULL)); + sbi = ll_s2sbi(dentry->d_sb); +#else + LASSERT((seq != NULL) && (vfs != NULL)); + sbi = ll_s2sbi(vfs->mnt_sb); +#endif + + if (sbi->ll_flags & LL_SBI_NOLCK) + seq_puts(seq, ",nolock"); + + /* "flock" is the default since 2.13, but it wasn't for many years, + * so it is still useful to print this to show it is enabled. + * Start to print "noflock" so it is now clear when flock is disabled. 
+ */ + if (sbi->ll_flags & LL_SBI_FLOCK) + seq_puts(seq, ",flock"); + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + seq_puts(seq, ",localflock"); + else + seq_puts(seq, ",noflock"); + + if (sbi->ll_flags & LL_SBI_USER_XATTR) + seq_puts(seq, ",user_xattr"); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + seq_puts(seq, ",lazystatfs"); + + if (sbi->ll_flags & LL_SBI_USER_FID2PATH) + seq_puts(seq, ",user_fid2path"); + + if (sbi->ll_flags & LL_SBI_ALWAYS_PING) + seq_puts(seq, ",always_ping"); + + if (sbi->ll_flags & LL_SBI_MDLL) + seq_puts(seq, ",mdll"); + + if (sbi->ll_flags & LL_SBI_MDLL_BYPASS) + seq_puts(seq, ",mdll_bypass"); + + if (sbi->ll_flags & LL_SBI_MDLL_AUTO_REFRESH) + seq_puts(seq, ",mdll_auto_refresh"); + + RETURN(0); +} + +/** + * Get obd name by cmd, and copy out to user space + */ +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_device *obd; + ENTRY; + + if (cmd == OBD_IOC_GETDTNAME) + obd = class_exp2obd(sbi->ll_dt_exp); + else if (cmd == OBD_IOC_GETMDNAME) + obd = class_exp2obd(sbi->ll_md_exp); + else + RETURN(-EINVAL); + + if (!obd) + RETURN(-ENOENT); + + if (copy_to_user((void __user *)arg, obd->obd_name, + strlen(obd->obd_name) + 1)) + RETURN(-EFAULT); + + RETURN(0); +} + +/** + * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the + * fsname will be returned in this buffer; otherwise, a static buffer will be + * used to store the fsname and returned to caller. + */ +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen) +{ + static char fsname_static[MTI_NAME_MAXLEN]; + struct lustre_sb_info *lsi = s2lsi(sb); + char *ptr; + int len; + + if (buf == NULL) { + /* this means the caller wants to use static buffer + * and it doesn't care about race. Usually this is + * in error reporting path */ + buf = fsname_static; + buflen = sizeof(fsname_static); + } + + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + if (unlikely(len >= buflen)) + len = buflen - 1; + strncpy(buf, lsi->lsi_lmd->lmd_profile, len); + buf[len] = '\0'; + + return buf; +} + +static char* ll_d_path(struct dentry *dentry, char *buf, int bufsize) +{ + char *path = NULL; + + struct path p; + + p.dentry = dentry; + p.mnt = current->fs->root.mnt; + path_get(&p); + path = d_path(&p, buf, bufsize); + path_put(&p); + return path; +} + +void ll_dirty_page_discard_warn(struct page *page, int ioret) +{ + char *buf, *path = NULL; + struct dentry *dentry = NULL; + struct inode *inode = page->mapping->host; + + /* this can be called inside spin lock so use GFP_ATOMIC. */ + buf = (char *)__get_free_page(GFP_ATOMIC); + if (buf != NULL) { + dentry = d_find_alias(page->mapping->host); + if (dentry != NULL) + path = ll_d_path(dentry, buf, PAGE_SIZE); + } + + CDEBUG(D_WARNING, + "%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted " + "(rc %d)\n", ll_get_fsname(page->mapping->host->i_sb, NULL, 0), + s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev, + PFID(ll_inode2fid(inode)), + (path && !IS_ERR(path)) ? 
path : "", ioret); + + if (dentry != NULL) + dput(dentry); + + if (buf != NULL) + free_page((unsigned long)buf); +} + +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf) +{ + struct lov_user_md lum; + ssize_t lum_size; + ENTRY; + + if (copy_from_user(&lum, md, sizeof(lum))) + RETURN(-EFAULT); + + lum_size = ll_lov_user_md_size(&lum); + if (lum_size < 0) + RETURN(lum_size); + + OBD_ALLOC_LARGE(*kbuf, lum_size); + if (*kbuf == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(*kbuf, md, lum_size) != 0) { + OBD_FREE_LARGE(*kbuf, lum_size); + RETURN(-EFAULT); + } + + RETURN(lum_size); +} + +/* + * Compute llite root squash state after a change of root squash + * configuration setting or add/remove of a lnet nid + */ +void ll_compute_rootsquash_state(struct ll_sb_info *sbi) +{ + struct root_squash_info *squash = &sbi->ll_squash; + int i; + bool matched; + struct lnet_process_id id; + + /* Update norootsquash flag */ + down_write(&squash->rsi_sem); + if (list_empty(&squash->rsi_nosquash_nids)) + sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; + else { + /* Do not apply root squash as soon as one of our NIDs is + * in the nosquash_nids list */ + matched = false; + i = 0; + while (LNetGetId(i++, &id) != -ENOENT) { + if (id.nid == LNET_NID_LO_0) + continue; + if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) { + matched = true; + break; + } + } + if (matched) + sbi->ll_flags |= LL_SBI_NOROOTSQUASH; + else + sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; + } + up_write(&squash->rsi_sem); +} + +/** + * Parse linkea content to extract information about a given hardlink + * + * \param[in] ldata - Initialized linkea data + * \param[in] linkno - Link identifier + * \param[out] parent_fid - The entry's parent FID + * \param[out] ln - Entry name destination buffer + * + * \retval 0 on success + * \retval Appropriate negative error code on failure + */ +static int ll_linkea_decode(struct linkea_data *ldata, unsigned int linkno, + struct lu_fid *parent_fid, struct lu_name *ln) +{ + unsigned int idx; + int rc; + ENTRY; + + rc = linkea_init_with_rec(ldata); + if (rc < 0) + RETURN(rc); + + if (linkno >= ldata->ld_leh->leh_reccount) + /* beyond last link */ + RETURN(-ENODATA); + + linkea_first_entry(ldata); + for (idx = 0; ldata->ld_lee != NULL; idx++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, ln, + parent_fid); + if (idx == linkno) + break; + + linkea_next_entry(ldata); + } + + if (idx < linkno) + RETURN(-ENODATA); + + RETURN(0); +} + +/** + * Get parent FID and name of an identified link. Operation is performed for + * a given link number, letting the caller iterate over linkno to list one or + * all links of an entry. + * + * \param[in] file - File descriptor against which to perform the operation + * \param[in,out] arg - User-filled structure containing the linkno to operate + * on and the available size. 
It is eventually filled with + * the requested information or left untouched on error + * + * \retval - 0 on success + * \retval - Appropriate negative error code on failure + */ +int ll_getparent(struct file *file, struct getparent __user *arg) +{ + struct inode *inode = file_inode(file); + struct linkea_data *ldata; + struct lu_buf buf = LU_BUF_NULL; + struct lu_name ln; + struct lu_fid parent_fid; + __u32 linkno; + __u32 name_size; + int rc; + + ENTRY; + + if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) + RETURN(-EPERM); + + if (get_user(name_size, &arg->gp_name_size)) + RETURN(-EFAULT); + + if (get_user(linkno, &arg->gp_linkno)) + RETURN(-EFAULT); + + if (name_size > PATH_MAX) + RETURN(-EINVAL); + + OBD_ALLOC(ldata, sizeof(*ldata)); + if (ldata == NULL) + RETURN(-ENOMEM); + + rc = linkea_data_new(ldata, &buf); + if (rc < 0) + GOTO(ldata_free, rc); + +#ifdef HAVE_XATTR_HANDLER_FLAGS + rc = ll_xattr_list(inode, XATTR_NAME_LINK, XATTR_TRUSTED_T, buf.lb_buf, + buf.lb_len, OBD_MD_FLXATTR); +#else + rc = ll_getxattr(file_dentry(file), XATTR_NAME_LINK, buf.lb_buf, + buf.lb_len); +#endif /* HAVE_XATTR_HANDLER_FLAGS */ + if (rc < 0) + GOTO(lb_free, rc); + + rc = ll_linkea_decode(ldata, linkno, &parent_fid, &ln); + if (rc < 0) + GOTO(lb_free, rc); + + if (ln.ln_namelen >= name_size) + GOTO(lb_free, rc = -EOVERFLOW); + + if (copy_to_user(&arg->gp_fid, &parent_fid, sizeof(arg->gp_fid))) + GOTO(lb_free, rc = -EFAULT); + + if (copy_to_user(&arg->gp_name, ln.ln_name, ln.ln_namelen)) + GOTO(lb_free, rc = -EFAULT); + + if (put_user('\0', arg->gp_name + ln.ln_namelen)) + GOTO(lb_free, rc = -EFAULT); + +lb_free: + lu_buf_free(&buf); +ldata_free: + OBD_FREE(ldata, sizeof(*ldata)); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c new file mode 100644 index 0000000000000..9be9bd690ee6d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -0,0 +1,538 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" +#include + +static const struct vm_operations_struct ll_file_vm_ops; + +void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, + unsigned long addr, size_t count) +{ + policy->l_extent.start = ((addr - vma->vm_start) & PAGE_MASK) + + (vma->vm_pgoff << PAGE_SHIFT); + policy->l_extent.end = (policy->l_extent.start + count - 1) | + ~PAGE_MASK; +} + +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count) +{ + struct vm_area_struct *vma, *ret = NULL; + ENTRY; + + /* mmap_lock must have been held by caller. */ + LASSERT(!mmap_write_trylock(mm)); + + for (vma = find_vma(mm, addr); + vma != NULL && vma->vm_start < (addr + count); + vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + RETURN(ret); +} + +/** + * API independent part for page fault initialization. + * \param env - corespondent lu_env to processing + * \param vma - virtual memory area addressed to page fault + * \param index - page index corespondent to fault. + * \parm ra_flags - vma readahead flags. + * + * \return error codes from cl_io_init. + */ +static struct cl_io * +ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma, + pgoff_t index, unsigned long *ra_flags) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct cl_io *io; + struct cl_fault_io *fio; + int rc; + ENTRY; + + if (ll_file_nolock(file)) + RETURN(ERR_PTR(-EOPNOTSUPP)); + +restart: + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + LASSERT(io->ci_obj != NULL); + + fio = &io->u.ci_fault; + fio->ft_index = index; + fio->ft_executable = vma->vm_flags&VM_EXEC; + + /* + * disable VM_SEQ_READ and use VM_RAND_READ to make sure that + * the kernel will not read other pages not covered by ldlm in + * filemap_nopage. we do our readahead in ll_readpage. + */ + if (ra_flags != NULL) + *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); + vma->vm_flags &= ~VM_SEQ_READ; + vma->vm_flags |= VM_RAND_READ; + + CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags, + fio->ft_index, fio->ft_executable); + + rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj); + if (rc == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(vio->vui_cl.cis_io == io); + + /* mmap lock must be MANDATORY it has to cache + * pages. 
*/ + io->ci_lockreq = CILR_MANDATORY; + vio->vui_fd = fd; + } else { + LASSERT(rc < 0); + cl_io_fini(env, io); + if (io->ci_need_restart) + goto restart; + + io = ERR_PTR(rc); + } + + RETURN(io); +} + +/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ +static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, + bool *retry) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + int result; + __u16 refcheck; + sigset_t set; + struct inode *inode = NULL; + struct ll_inode_info *lli; + ENTRY; + + LASSERT(vmpage != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = ll_fault_io_init(env, vma, vmpage->index, NULL); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result < 0) + GOTO(out_io, result); + + io->u.ci_fault.ft_mkwrite = 1; + io->u.ci_fault.ft_writable = 1; + + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = vmpage; + + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + + inode = vvp_object_inode(io->ci_obj); + lli = ll_i2info(inode); + + result = cl_io_loop(env, io); + + cfs_restore_sigs(set); + + if (result == 0) { + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + + /* page was truncated and lock was cancelled, return + * ENODATA so that VM_FAULT_NOPAGE will be returned + * to handle_mm_fault(). */ + if (result == 0) + result = -ENODATA; + } else if (!PageDirty(vmpage)) { + /* race, the page has been cleaned by ptlrpcd after + * it was unlocked, it has to be added into dirty + * cache again otherwise this soon-to-dirty page won't + * consume any grants, even worse if this page is being + * transferred because it will break RPC checksum. + */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has " + "been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + + if (result == 0) + ll_file_set_flag(lli, LLIF_DATA_MODIFIED); + } + EXIT; + +out_io: + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); + LASSERT(ergo(result == 0, PageLocked(vmpage))); + + /* if page has been unmapped, presumably due to lock reclaim for + * concurrent usage, add some delay before retrying to prevent + * entering live-lock situation with competitors + */ + if (result == -ENODATA && inode != NULL) { + CDEBUG(D_MMAP, "delaying new page-fault for inode %p to " + "prevent live-lock\n", inode); + msleep(10); + } + + return result; +} + +static inline int to_fault_error(int result) +{ + switch(result) { + case 0: + result = VM_FAULT_LOCKED; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + return result; +} + +/** + * Lustre implementation of a vm_operations_struct::fault() method, called by + * VM to server page fault (both in kernel and user space). 
+ * + * \param vma - is virtiual area struct related to page fault + * \param vmf - structure which describe type and address where hit fault + * + * \return allocated and filled _locked_ page for address + * \retval VM_FAULT_ERROR on general error + * \retval NOPAGE_OOM not have memory for allocate new page + */ +static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio = NULL; + struct page *vmpage; + unsigned long ra_flags; + int result = 0; + int fault_ret = 0; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (ll_sbi_has_fast_read(ll_i2sbi(file_inode(vma->vm_file)))) { + /* do fast fault */ + bool has_retry = vmf->flags & FAULT_FLAG_RETRY_NOWAIT; + + /* To avoid loops, instruct downstream to not drop mmap_sem */ + vmf->flags |= FAULT_FLAG_RETRY_NOWAIT; + ll_cl_add(vma->vm_file, env, NULL, LCC_MMAP); + fault_ret = ll_filemap_fault(vma, vmf); + ll_cl_remove(vma->vm_file, env); + if (!has_retry) + vmf->flags &= ~FAULT_FLAG_RETRY_NOWAIT; + + /* - If there is no error, then the page was found in cache and + * uptodate; + * - If VM_FAULT_RETRY is set, the page existed but failed to + * lock. We will try slow path to avoid loops. + * - Otherwise, it should try normal fault under DLM lock. */ + if (!(fault_ret & VM_FAULT_RETRY) && + !(fault_ret & VM_FAULT_ERROR)) + GOTO(out, result = 0); + + fault_ret = 0; + } + + io = ll_fault_io_init(env, vma, vmf->pgoff, &ra_flags); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result == 0) { + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = NULL; + vio->u.fault.ft_vmf = vmf; + vio->u.fault.ft_flags = 0; + vio->u.fault.ft_flags_valid = 0; + + /* May call ll_readpage() */ + ll_cl_add(vma->vm_file, env, io, LCC_MMAP); + + result = cl_io_loop(env, io); + + ll_cl_remove(vma->vm_file, env); + + /* ft_flags are only valid if we reached + * the call to filemap_fault */ + if (vio->u.fault.ft_flags_valid) + fault_ret = vio->u.fault.ft_flags; + + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + put_page(vmpage); + vmf->page = NULL; + } + } + cl_io_fini(env, io); + + vma->vm_flags |= ra_flags; + +out: + cl_env_put(env, &refcheck); + if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) + fault_ret |= to_fault_error(result); + + CDEBUG(D_MMAP, "%s fault %d/%d\n", current->comm, fault_ret, result); + RETURN(fault_ret); +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +static vm_fault_t ll_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#endif + int count = 0; + bool printed = false; + vm_fault_t result; + sigset_t set; + + /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite + * so that it can be killed by admin but not cause segfault by + * other signals. 
*/ + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_FAULT, 1); + +restart: + result = ll_fault0(vma, vmf); + if (vmf->page && + !(result & (VM_FAULT_RETRY | VM_FAULT_ERROR | VM_FAULT_LOCKED))) { + struct page *vmpage = vmf->page; + + /* check if this page has been truncated */ + lock_page(vmpage); + if (unlikely(vmpage->mapping == NULL)) { /* unlucky */ + unlock_page(vmpage); + put_page(vmpage); + vmf->page = NULL; + + if (!printed && ++count > 16) { + CWARN("the page is under heavy contention," + "maybe your app(%s) needs revising :-)\n", + current->comm); + printed = true; + } + + goto restart; + } + + result |= VM_FAULT_LOCKED; + } + cfs_restore_sigs(set); + + if (vmf->page && result == VM_FAULT_LOCKED) + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, LUSTRE_FPRIVATE(vma->vm_file), + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + READ); + return result; +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +static vm_fault_t ll_page_mkwrite(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static vm_fault_t ll_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ +#endif + int count = 0; + bool printed = false; + bool retry; + vm_fault_t result; + + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_MKWRITE, 1); + + file_update_time(vma->vm_file); + do { + retry = false; + result = ll_page_mkwrite0(vma, vmf->page, &retry); + + if (!printed && ++count > 16) { + const struct dentry *de = file_dentry(vma->vm_file); + + CWARN("app(%s): the page %lu of file "DFID" is under" + " heavy contention\n", + current->comm, vmf->pgoff, + PFID(ll_inode2fid(de->d_inode))); + printed = true; + } + } while (retry); + + switch(result) { + case 0: + LASSERT(PageLocked(vmf->page)); + result = VM_FAULT_LOCKED; + break; + case -ENODATA: + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + case -EAGAIN: + result = VM_FAULT_RETRY; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + + if (result == VM_FAULT_LOCKED) + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, LUSTRE_FPRIVATE(vma->vm_file), + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + WRITE); + return result; +} + +/** + * To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count in vvp_object::vob_mmap_cnt. + */ +static void ll_vm_open(struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct vvp_object *vob = cl_inode2vvp(inode); + + ENTRY; + LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + atomic_inc(&vob->vob_mmap_cnt); + EXIT; +} + +/** + * Dual to ll_vm_open(). + */ +static void ll_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct vvp_object *vob = cl_inode2vvp(inode); + + ENTRY; + atomic_dec(&vob->vob_mmap_cnt); + LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + EXIT; +} + +/* XXX put nice comment here. 
talk about __free_pte -> dirty pages and + * nopage's reference passing to the pte */ +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) +{ + int rc = -ENOENT; + ENTRY; + + LASSERTF(last > first, "last %llu first %llu\n", last, first); + if (mapping_mapped(mapping)) { + rc = 0; + unmap_mapping_range(mapping, first + PAGE_SIZE - 1, + last - first + 1, 0); + } + + RETURN(rc); +} + +static const struct vm_operations_struct ll_file_vm_ops = { + .fault = ll_fault, + .page_mkwrite = ll_page_mkwrite, + .open = ll_vm_open, + .close = ll_vm_close, +}; + +int ll_file_mmap(struct file *file, struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(file); + int rc; + ENTRY; + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1); + rc = generic_file_mmap(file, vma); + if (rc == 0) { + vma->vm_ops = &ll_file_vm_ops; + vma->vm_ops->open(vma); + /* update the inode's size and mtime */ + rc = ll_glimpse_size(inode); + } + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c new file mode 100644 index 0000000000000..2e207361dd908 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c @@ -0,0 +1,377 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lustre/llite/llite_nfs.c + * + * NFS export of Lustre Light File System + * + * Author: Yury Umanets + * Author: Huang Hua + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include "llite_internal.h" +#include + +__u32 get_uuid2int(const char *name, int len) +{ + __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + while (len--) { + __u32 key = key1 + (key0 ^ (*name++ * 7152373)); + if (key & 0x80000000) key -= 0x7fffffff; + key1 = key0; + key0 = key; + } + return (key0 << 1); +} + +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ptlrpc_request *req = NULL; + struct inode *inode = NULL; + int eadatalen = 0; + unsigned long hash = cl_fid_build_ino(fid, + ll_need_32bit_api(sbi)); + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid)); + + inode = ilookup5(sb, hash, ll_test_inode_by_fid, (void *)fid); + if (inode) + RETURN(inode); + + rc = ll_get_default_mdsize(sbi, &eadatalen); + if (rc) + RETURN(ERR_PTR(rc)); + + /* Because inode is NULL, ll_prep_md_op_data can not + * be used here. So we allocate op_data ourselves */ + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + op_data->op_fid1 = *fid; + op_data->op_mode = eadatalen; + op_data->op_valid = OBD_MD_FLEASIZE; + + /* mds_fid2dentry ignores f_type */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + OBD_FREE_PTR(op_data); + if (rc) { + /* Suppress erroneous/confusing messages when NFS + * is out of sync and requests old data. */ + CDEBUG(D_INFO, "can't get object attrs, fid "DFID", rc %d\n", + PFID(fid), rc); + RETURN(ERR_PTR(rc)); + } + rc = ll_prep_inode(&inode, req, sb, NULL); + ptlrpc_req_finished(req); + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(inode); +} + +struct lustre_nfs_fid { + struct lu_fid lnf_child; + struct lu_fid lnf_parent; +}; + +static struct dentry * +ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent) +{ + struct inode *inode; + struct dentry *result; + ENTRY; + + if (!fid_is_sane(fid)) + RETURN(ERR_PTR(-ESTALE)); + + CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid)); + + inode = search_inode_for_lustre(sb, fid); + if (IS_ERR(inode)) + RETURN(ERR_PTR(PTR_ERR(inode))); + + if (is_bad_inode(inode)) { + /* we didn't find the right inode.. */ + iput(inode); + RETURN(ERR_PTR(-ESTALE)); + } + + /* N.B. d_obtain_alias() drops inode ref on error */ + result = d_obtain_alias(inode); + if (!IS_ERR(result)) { + int rc; + + rc = ll_d_init(result); + if (rc < 0) { + dput(result); + result = ERR_PTR(rc); + } else { + struct ll_dentry_data *ldd = ll_d2d(result); + + /* + * Need to signal to the ll_file_open that + * we came from NFS and so opencache needs to be + * enabled for this one + */ + spin_lock(&result->d_lock); + ldd->lld_nfs_dentry = 1; + spin_unlock(&result->d_lock); + } + } + + RETURN(result); +} + +#ifndef FILEID_INVALID +#define FILEID_INVALID 0xff +#endif +#ifndef FILEID_LUSTRE +#define FILEID_LUSTRE 0x97 +#endif + +/** + * \a connectable - is nfsd will connect himself or this should be done + * at lustre + * + * The return value is file handle type: + * 1 -- contains child file handle; + * 2 -- contains child file handle and parent file handle; + * 255 -- error. 
+ */ +#ifndef HAVE_ENCODE_FH_PARENT +static int ll_encode_fh(struct dentry *de, __u32 *fh, int *plen, + int connectable) +{ + struct inode *inode = de->d_inode; + struct inode *parent = de->d_parent->d_inode; +#else +static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen, + struct inode *parent) +{ +#endif + int fileid_len = sizeof(struct lustre_nfs_fid) / 4; + struct lustre_nfs_fid *nfs_fid = (void *)fh; + ENTRY; + + CDEBUG(D_INFO, "%s: encoding for ("DFID") maxlen=%d minlen=%d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), *plen, fileid_len); + + if (*plen < fileid_len) { + *plen = fileid_len; + RETURN(FILEID_INVALID); + } + + nfs_fid->lnf_child = *ll_inode2fid(inode); + if (parent != NULL) + nfs_fid->lnf_parent = *ll_inode2fid(parent); + else + fid_zero(&nfs_fid->lnf_parent); + *plen = fileid_len; + + RETURN(FILEID_LUSTRE); +} + +static int +#ifndef HAVE_FILLDIR_USE_CTX +ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen, + loff_t hash, u64 ino, unsigned type) +{ + struct ll_getname_data *lgd = cookie; +#else +ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, int namelen, + loff_t hash, u64 ino, unsigned type) +{ + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); +#endif /* HAVE_FILLDIR_USE_CTX */ + /* It is hack to access lde_fid for comparison with lgd_fid. + * So the input 'name' must be part of the 'lu_dirent'. */ + struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lde->lde_fid); + if (lu_fid_eq(&fid, &lgd->lgd_fid)) { + memcpy(lgd->lgd_name, name, namelen); + lgd->lgd_name[namelen] = 0; + lgd->lgd_found = 1; + } + return lgd->lgd_found; +} + +static int ll_get_name(struct dentry *dentry, char *name, + struct dentry *child) +{ + struct inode *dir = dentry->d_inode; + struct ll_getname_data lgd = { + .lgd_name = name, + .lgd_fid = ll_i2info(child->d_inode)->lli_fid, +#ifdef HAVE_DIR_CONTEXT + .ctx.actor = ll_nfs_get_name_filldir, +#endif + .lgd_found = 0, + }; + struct md_op_data *op_data; + __u64 pos = 0; + int rc; + ENTRY; + + if (!dir || !S_ISDIR(dir->i_mode)) + GOTO(out, rc = -ENOTDIR); + + if (!dir->i_fop) + GOTO(out, rc = -EINVAL); + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + inode_lock(dir); +#ifdef HAVE_DIR_CONTEXT + rc = ll_dir_read(dir, &pos, op_data, &lgd.ctx); +#else + rc = ll_dir_read(dir, &pos, op_data, &lgd, ll_nfs_get_name_filldir); +#endif + inode_unlock(dir); + ll_finish_md_op_data(op_data); + if (!rc && !lgd.lgd_found) + rc = -ENOENT; + EXIT; +out: + return rc; +} + +static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != FILEID_LUSTRE) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent)); +} + +static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != FILEID_LUSTRE) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL)); +} + +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid) +{ + struct ptlrpc_request *req = NULL; + struct ll_sb_info *sbi; + struct mdt_body *body; + static const char dotdot[] = ".."; + struct md_op_data 
*op_data; + int rc; + int lmmsize; + ENTRY; + + LASSERT(dir && S_ISDIR(dir->i_mode)); + + sbi = ll_s2sbi(dir->i_sb); + + CDEBUG(D_INFO, "%s: getting parent for ("DFID")\n", + ll_get_fsname(dir->i_sb, NULL, 0), + PFID(ll_inode2fid(dir))); + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc != 0) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, + strlen(dotdot), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc != 0) { + CERROR("%s: failure inode "DFID" get parent: rc = %d\n", + ll_get_fsname(dir->i_sb, NULL, 0), + PFID(ll_inode2fid(dir)), rc); + RETURN(rc); + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + /* + * LU-3952: MDT may lost the FID of its parent, we should not crash + * the NFS server, ll_iget_for_nfs() will handle the error. + */ + if (body->mbo_valid & OBD_MD_FLID) { + CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n", + PFID(ll_inode2fid(dir)), PFID(&body->mbo_fid1)); + *parent_fid = body->mbo_fid1; + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +static struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct lu_fid parent_fid = { 0 }; + int rc; + struct dentry *dentry; + ENTRY; + + rc = ll_dir_get_parent_fid(dchild->d_inode, &parent_fid); + if (rc != 0) + RETURN(ERR_PTR(rc)); + + dentry = ll_iget_for_nfs(dchild->d_inode->i_sb, &parent_fid, NULL); + + RETURN(dentry); +} + +struct export_operations lustre_export_operations = { + .get_parent = ll_get_parent, + .encode_fh = ll_encode_fh, + .get_name = ll_get_name, + .fh_to_dentry = ll_fh_to_dentry, + .fh_to_parent = ll_fh_to_parent, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c new file mode 100644 index 0000000000000..b2c0e28dd658a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -0,0 +1,1977 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" + +static struct kobject *llite_kobj; +static struct dentry *llite_root; + +int llite_tunables_register(void) +{ + int rc = 0; + + llite_kobj = class_setup_tunables("llite"); + if (IS_ERR(llite_kobj)) + return PTR_ERR(llite_kobj); + + llite_root = debugfs_create_dir("llite", debugfs_lustre_root); + if (IS_ERR_OR_NULL(llite_root)) { + rc = llite_root ? PTR_ERR(llite_root) : -ENOMEM; + llite_root = NULL; + kobject_put(llite_kobj); + llite_kobj = NULL; + } + + return rc; +} + +void llite_tunables_unregister(void) +{ + if (llite_kobj) { + kobject_put(llite_kobj); + llite_kobj = NULL; + } + + if (!IS_ERR_OR_NULL(llite_root)) { + debugfs_remove(llite_root); + llite_root = NULL; + } +} + +/* /lustre/llite mount point registration */ +static const struct file_operations ll_rw_extents_stats_fops; +static const struct file_operations ll_rw_extents_stats_pp_fops; +static const struct file_operations ll_rw_offset_stats_fops; + +/** + * ll_stats_pid_write() - Determine if stats collection should be enabled + * @buf: Buffer containing the data written + * @len: Number of bytes in the buffer + * + * Several proc files begin collecting stats when a value is written, and stop + * collecting when either '0' or 'disable' is written. This function checks the + * written value to see if collection should be enabled or disabled. + * + * Return: If '0' or 'disable' is provided, 0 is returned. If the text + * equivalent of a number is written, that number is returned. Otherwise, + * 1 is returned. Non-zero return values indicate collection should be enabled. + */ +static s64 ll_stats_pid_write(const char __user *buf, size_t len) +{ + unsigned long long value = 1; + char kernbuf[16]; + int rc; + + rc = kstrtoull_from_user(buf, len, 0, &value); + if (rc < 0 && len < sizeof(kernbuf)) { + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strncasecmp(kernbuf, "disable", 7) == 0) + value = 0; + } + + return value; +} + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%u\n", osfs.os_bsize); +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t stat_blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_stat_blksize); +} + +static ssize_t stat_blocksize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val != 0 && (val < PAGE_SIZE || (val & (val - 1))) != 0) + return -ERANGE; + + sbi->ll_stat_blksize = val; + + return count; +} +LUSTRE_RW_ATTR(stat_blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = 
ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_ffree); +} +LUSTRE_RO_ATTR(filesfree); + +static ssize_t client_type_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "local client\n"); +} +LUSTRE_RO_ATTR(client_type); + +static ssize_t fstype_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "lustre\n"); +} +LUSTRE_RO_ATTR(fstype); + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%s\n", sbi->ll_sb_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static int ll_site_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + /* + * See description of statistical counters in struct cl_site, and + * struct lu_site. 
+ */ + return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); +} + +LDEBUGFS_SEQ_FOPS_RO(ll_site_stats); + +static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned long ra_max_mb; + + spin_lock(&sbi->ll_lock); + ra_max_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages); + spin_unlock(&sbi->ll_lock); + + seq_printf(m, "%lu\n", ra_max_mb); + return 0; +} + +static ssize_t +ll_max_readahead_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + s64 ra_max_mb, pages_number; + int rc; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_mb, 'M'); + if (rc) + return rc; + + pages_number = round_up(ra_max_mb, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number < 0 || pages_number > cfs_totalram_pages() / 2) { + /* 1/2 of RAM */ + CERROR("%s: can't set max_readahead_mb=%llu > %luMB\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(cfs_totalram_pages())); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_max_readahead_mb); + +static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned long ra_max_file_mb; + + spin_lock(&sbi->ll_lock); + ra_max_file_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file); + spin_unlock(&sbi->ll_lock); + + seq_printf(m, "%lu\n", ra_max_file_mb); + return 0; +} + +static ssize_t +ll_max_readahead_per_file_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + s64 ra_max_file_mb, pages_number; + int rc; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_file_mb, + 'M'); + if (rc) + return rc; + + pages_number = round_up(ra_max_file_mb, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("%s: can't set max_readahead_per_file_mb=%llu > max_read_ahead_mb=%lu\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_max_readahead_per_file_mb); + +static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned long ra_max_whole_mb; + + spin_lock(&sbi->ll_lock); + ra_max_whole_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_read_ahead_whole_pages); + spin_unlock(&sbi->ll_lock); + + seq_printf(m, "%lu\n", ra_max_whole_mb); + return 0; +} + +static ssize_t +ll_max_read_ahead_whole_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + s64 ra_max_whole_mb, pages_number; + int rc; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_whole_mb, + 'M'); + if (rc) + return rc; + + pages_number = round_up(ra_max_whole_mb, 1024 
* 1024) >> PAGE_SHIFT; + /* Cap this at the current max readahead window size, the readahead + * algorithm does this anyway so it's pointless to set it larger. + */ + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { + CERROR("%s: can't set max_read_ahead_whole_mb=%llu > max_read_ahead_per_file_mb=%lu\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_max_read_ahead_whole_mb); + +static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + long max_cached_mb; + long unused_mb; + + max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max); + unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left)); + seq_printf(m, "users: %d\n" + "max_cached_mb: %ld\n" + "used_mb: %ld\n" + "unused_mb: %ld\n" + "reclaim_count: %u\n", + atomic_read(&cache->ccc_users), + max_cached_mb, + max_cached_mb - unused_mb, + unused_mb, + cache->ccc_lru_shrinkers); + return 0; +} + +static ssize_t ll_max_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + struct lu_env *env; + long diff = 0; + long nrpages = 0; + __u16 refcheck; + __s64 pages_number; + int rc; + char kernbuf[128]; + + ENTRY; + if (count >= sizeof(kernbuf)) + RETURN(-EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - + kernbuf; + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + RETURN(rc); + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0 || pages_number > cfs_totalram_pages()) { + CERROR("%s: can't set max cache more than %lu MB\n", + ll_get_fsname(sb, NULL, 0), + PAGES_TO_MiB(cfs_totalram_pages())); + RETURN(-ERANGE); + } + /* Allow enough cache so clients can make well-formed RPCs */ + pages_number = max_t(long, pages_number, PTLRPC_MAX_BRW_PAGES); + + spin_lock(&sbi->ll_lock); + diff = pages_number - cache->ccc_lru_max; + spin_unlock(&sbi->ll_lock); + + /* easy - add more LRU slots. */ + if (diff >= 0) { + atomic_long_add(diff, &cache->ccc_lru_left); + GOTO(out, rc = 0); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + diff = -diff; + while (diff > 0) { + long tmp; + + /* reduce LRU budget from free slots. */ + do { + long ov, nv; + + ov = atomic_long_read(&cache->ccc_lru_left); + if (ov == 0) + break; + + nv = ov > diff ? ov - diff : 0; + rc = atomic_long_cmpxchg(&cache->ccc_lru_left, ov, nv); + if (likely(ov == rc)) { + diff -= ov - nv; + nrpages += ov - nv; + break; + } + } while (1); + + if (diff <= 0) + break; + + if (sbi->ll_dt_exp == NULL) { /* being initialized */ + rc = -ENODEV; + break; + } + + /* difficult - have to ask OSCs to drop LRU slots. 
*/ + tmp = diff << 1; + rc = obd_set_info_async(env, sbi->ll_dt_exp, + sizeof(KEY_CACHE_LRU_SHRINK), + KEY_CACHE_LRU_SHRINK, + sizeof(tmp), &tmp, NULL); + if (rc < 0) + break; + } + cl_env_put(env, &refcheck); + +out: + if (rc >= 0) { + spin_lock(&sbi->ll_lock); + cache->ccc_lru_max = pages_number; + spin_unlock(&sbi->ll_lock); + rc = count; + } else { + atomic_long_add(nrpages, &cache->ccc_lru_left); + } + return rc; +} + +LDEBUGFS_SEQ_FOPS(ll_max_cached_mb); + +static ssize_t checksums_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); +} + +static ssize_t checksums_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int tmp; + int rc; + + if (!sbi->ll_dt_exp) + /* Not set up yet */ + return -EAGAIN; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + if (val) + sbi->ll_flags |= LL_SBI_CHECKSUM; + else + sbi->ll_flags &= ~LL_SBI_CHECKSUM; + tmp = val; + + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(tmp), &tmp, NULL); + if (rc) + CWARN("Failed to set OSC checksum flags: %d\n", rc); + + return count; +} +LUSTRE_RW_ATTR(checksums); + +LUSTRE_ATTR(checksum_pages, 0644, checksums_show, checksums_store); + +static ssize_t ll_rd_track_id(struct kobject *kobj, char *buf, + enum stats_track_type type) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + if (sbi->ll_stats_track_type == type) + return sprintf(buf, "%d\n", sbi->ll_stats_track_id); + else if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + return sprintf(buf, "0 (all)\n"); + + return sprintf(buf, "untracked\n"); +} + +static ssize_t ll_wr_track_id(struct kobject *kobj, const char *buffer, + size_t count, enum stats_track_type type) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long pid; + int rc; + + rc = kstrtoul(buffer, 10, &pid); + if (rc) + return rc; + + sbi->ll_stats_track_id = pid; + if (pid == 0) + sbi->ll_stats_track_type = STATS_TRACK_ALL; + else + sbi->ll_stats_track_type = type; + lprocfs_clear_stats(sbi->ll_stats); + return count; +} + +static ssize_t stats_track_pid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_PID); +} + +static ssize_t stats_track_pid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PID); +} +LUSTRE_RW_ATTR(stats_track_pid); + +static ssize_t stats_track_ppid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_PPID); +} + +static ssize_t stats_track_ppid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PPID); +} +LUSTRE_RW_ATTR(stats_track_ppid); + +static ssize_t stats_track_gid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return ll_rd_track_id(kobj, buf, STATS_TRACK_GID); +} + +static ssize_t stats_track_gid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_GID); +} +LUSTRE_RW_ATTR(stats_track_gid); 
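
The stats_track_pid/ppid/gid attributes defined above act as a filter for the per-mount "stats" counters: writing 0 selects "track all", any other value restricts accounting to that id, and every write clears the existing counters (see ll_wr_track_id()). As a rough illustration only, the user-space sketch below drives the stats_track_pid knob through sysfs; the mount directory name under /sys/fs/lustre/llite/ is an assumption and depends on the fsname and mount instance. Writing a GID to stats_track_gid or a parent PID to stats_track_ppid works the same way through ll_wr_track_id().

/*
 * Illustrative user-space sketch (not part of the patch): select a single
 * PID for llite stats accounting and read the setting back.  The sysfs
 * directory below is an assumption; it depends on fsname and mount instance.
 */
#include <stdio.h>
#include <stdlib.h>

#define LLITE_DIR "/sys/fs/lustre/llite/lustre-ffff8800b52c0800"	/* assumed */

int main(int argc, char **argv)
{
	const char *pid = argc > 1 ? argv[1] : "0";	/* "0" means track all */
	char path[128], line[64];
	FILE *f;

	snprintf(path, sizeof(path), "%s/stats_track_pid", LLITE_DIR);

	/* the store handler parses the id and clears the current counters */
	f = fopen(path, "w");
	if (!f || fprintf(f, "%s\n", pid) < 0 || fclose(f)) {
		perror(path);
		return EXIT_FAILURE;
	}

	/* the show handler prints the tracked id, "0 (all)", or "untracked" */
	f = fopen(path, "r");
	if (!f || !fgets(line, sizeof(line), f)) {
		perror(path);
		return EXIT_FAILURE;
	}
	fclose(f);
	printf("stats_track_pid: %s", line);
	return EXIT_SUCCESS;
}
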
+ +static ssize_t statahead_running_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, 16, "%u\n", sbi->ll_sa_running_max); +} + +static ssize_t statahead_running_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 0, &val); + if (rc) + return rc; + + if (val <= LL_SA_RUNNING_MAX) { + sbi->ll_sa_running_max = val; + return count; + } + + CERROR("Bad statahead_running_max value %lu. Valid values " + "are in the range [0, %d]\n", val, LL_SA_RUNNING_MAX); + + return -ERANGE; +} +LUSTRE_RW_ATTR(statahead_running_max); + +static ssize_t statahead_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_sa_max); +} + +static ssize_t statahead_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 0, &val); + if (rc) + return rc; + + if (val <= LL_SA_RPC_MAX) + sbi->ll_sa_max = val; + else + CERROR("Bad statahead_max value %lu. Valid values are in the range [0, %d]\n", + val, LL_SA_RPC_MAX); + + return count; +} +LUSTRE_RW_ATTR(statahead_max); + +static ssize_t statahead_agl_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); +} + +static ssize_t statahead_agl_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + else + sbi->ll_flags &= ~LL_SBI_AGL_ENABLED; + + return count; +} +LUSTRE_RW_ATTR(statahead_agl); + +static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "statahead total: %u\n" + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(ll_statahead_stats); + +static ssize_t lazystatfs_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 
1 : 0); +} + +static ssize_t lazystatfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_LAZYSTATFS; + else + sbi->ll_flags &= ~LL_SBI_LAZYSTATFS; + + return count; +} +LUSTRE_RW_ATTR(lazystatfs); + +static ssize_t statfs_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_statfs_max_age); +} + +static ssize_t statfs_max_age_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + if (val > OBD_STATFS_CACHE_MAX_AGE) + return -EINVAL; + + sbi->ll_statfs_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(statfs_max_age); + +static ssize_t neg_dentry_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", sbi->ll_neg_dentry_timeout); +} + +static ssize_t neg_dentry_timeout_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + if (val < OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) + return -EINVAL; + + sbi->ll_neg_dentry_timeout = val; + + return count; +} +LUSTRE_RW_ATTR(neg_dentry_timeout); + +static ssize_t max_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int ealen; + int rc; + + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + return rc; + + return sprintf(buf, "%u\n", ealen); +} +LUSTRE_RO_ATTR(max_easize); + +/** + * Get default_easize. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] m seq_file handle + * \param[in] v unused for single entry + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static ssize_t default_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int ealen; + int rc; + + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + return rc; + + return sprintf(buf, "%u\n", ealen); +} + +/** + * Set default_easize. + * + * Range checking on the passed value is handled by + * ll_set_default_mdsize(). 
+ * + * \see client_obd::cl_default_mds_easize + * + * \param[in] file proc file + * \param[in] buffer string passed from user space + * \param[in] count \a buffer length + * \param[in] off unused for single entry + * + * \retval positive \a count on success + * \retval negative negated errno on failure + */ +static ssize_t default_easize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + if (count == 0) + return 0; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + rc = ll_set_default_mdsize(sbi, val); + if (rc) + return rc; + + return count; +} +LUSTRE_RW_ATTR(default_easize); + +static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) +{ + const char *str[] = LL_SBI_FLAGS; + struct super_block *sb = m->private; + int flags = ll_s2sbi(sb)->ll_flags; + int i = 0; + + while (flags != 0) { + if (ARRAY_SIZE(str) <= i) { + CERROR("%s: Revise array LL_SBI_FLAGS to match sbi " + "flags please.\n", ll_get_fsname(sb, NULL, 0)); + return -EINVAL; + } + + if (flags & 0x1) + seq_printf(m, "%s ", str[i]); + flags >>= 1; + ++i; + } + seq_printf(m, "\b\n"); + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(ll_sbi_flags); + +static ssize_t xattr_cache_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_xattr_cache_enabled); +} + +static ssize_t xattr_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + sbi->ll_xattr_cache_set = 1; + + return count; +} +LUSTRE_RW_ATTR(xattr_cache); + +static ssize_t tiny_write_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_TINY_WRITE)); +} + +static ssize_t tiny_write_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + sbi->ll_flags |= LL_SBI_TINY_WRITE; + else + sbi->ll_flags &= ~LL_SBI_TINY_WRITE; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(tiny_write); + +static ssize_t fast_read_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ)); +} + +static ssize_t fast_read_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + sbi->ll_flags |= LL_SBI_FAST_READ; + else + sbi->ll_flags &= ~LL_SBI_FAST_READ; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(fast_read); + +static ssize_t inode_cache_show(struct 
kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_inode_cache_enabled); +} + +static ssize_t inode_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + sbi->ll_inode_cache_enabled = val; + + return count; +} +LUSTRE_RW_ATTR(inode_cache); + +static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + long pages; + int mb; + + pages = atomic_long_read(&cache->ccc_unstable_nr); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_check: %8d\n" + "unstable_pages: %12ld\n" + "unstable_mb: %8d\n", + cache->ccc_unstable_check, pages, mb); + return 0; +} + +static ssize_t ll_unstable_stats_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *unused) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)seq->private); + char kernbuf[128]; + bool val; + int rc; + + if (count == 0) + return 0; + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - + kernbuf; + rc = kstrtobool_from_user(buffer, count, &val); + if (rc < 0) + return rc; + + /* borrow lru lock to set the value */ + spin_lock(&sbi->ll_cache->ccc_lru_lock); + sbi->ll_cache->ccc_unstable_check = val; + spin_unlock(&sbi->ll_cache->ccc_lru_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_unstable_stats); + +static int ll_root_squash_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + + seq_printf(m, "%u:%u\n", squash->rsi_uid, squash->rsi_gid); + return 0; +} + +static ssize_t ll_root_squash_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + + return lprocfs_wr_root_squash(buffer, count, squash, + ll_get_fsname(sb, NULL, 0)); +} + +LDEBUGFS_SEQ_FOPS(ll_root_squash); + +static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int len; + + down_read(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) { + len = cfs_print_nidlist(m->buf + m->count, m->size - m->count, + &squash->rsi_nosquash_nids); + m->count += len; + seq_putc(m, '\n'); + } else { + seq_puts(m, "NONE\n"); + } + up_read(&squash->rsi_sem); + + return 0; +} + +static ssize_t ll_nosquash_nids_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int rc; + + rc = lprocfs_wr_nosquash_nids(buffer, count, squash, + ll_get_fsname(sb, NULL, 0)); + if 
(rc < 0) + return rc; + + ll_compute_rootsquash_state(sbi); + + return rc; +} + +LDEBUGFS_SEQ_FOPS(ll_nosquash_nids); + +static int ll_mdll_dir_restore_max_retry_count_seq_show(struct seq_file *m, + void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%d\n", + atomic_read(&sbi->ll_dir_restore_max_retry_count)); + + return 0; +} + +static ssize_t +ll_mdll_dir_restore_max_retry_count_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + /* + * Right now there is no limitation set on the retry count. + * This is done as we dont know what the right max limit + * would be. The max value would depend on the number of + * files in the directory that is being restored and as well + * if the mdt keeps restarting. The client calls are + * interruptible and can be used to break from long retries. + */ + if (val < -1) + return -EINVAL; + + atomic_set(&sbi->ll_dir_restore_max_retry_count, val); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_mdll_dir_restore_max_retry_count); + +struct ldebugfs_vars lprocfs_llite_obd_vars[] = { + { .name = "site", + .fops = &ll_site_stats_fops }, + { .name = "max_read_ahead_mb", + .fops = &ll_max_readahead_mb_fops }, + { .name = "max_read_ahead_per_file_mb", + .fops = &ll_max_readahead_per_file_mb_fops }, + { .name = "max_read_ahead_whole_mb", + .fops = &ll_max_read_ahead_whole_mb_fops }, + { .name = "max_cached_mb", + .fops = &ll_max_cached_mb_fops }, + { .name = "statahead_stats", + .fops = &ll_statahead_stats_fops }, + { .name = "unstable_stats", + .fops = &ll_unstable_stats_fops }, + { .name = "sbi_flags", + .fops = &ll_sbi_flags_fops }, + { .name = "root_squash", + .fops = &ll_root_squash_fops }, + { .name = "nosquash_nids", + .fops = &ll_nosquash_nids_fops }, + { .name = "mdll_dir_restore_max_retry_count", + .fops = &ll_mdll_dir_restore_max_retry_count_fops }, + { NULL } +}; + +#define MAX_STRING_SIZE 128 + +static struct attribute *llite_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_stat_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_client_type.attr, + &lustre_attr_fstype.attr, + &lustre_attr_uuid.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_pages.attr, + &lustre_attr_stats_track_pid.attr, + &lustre_attr_stats_track_ppid.attr, + &lustre_attr_stats_track_gid.attr, + &lustre_attr_statahead_running_max.attr, + &lustre_attr_statahead_max.attr, + &lustre_attr_statahead_agl.attr, + &lustre_attr_lazystatfs.attr, + &lustre_attr_statfs_max_age.attr, + &lustre_attr_max_easize.attr, + &lustre_attr_default_easize.attr, + &lustre_attr_xattr_cache.attr, + &lustre_attr_fast_read.attr, + &lustre_attr_tiny_write.attr, + &lustre_attr_neg_dentry_timeout.attr, + &lustre_attr_inode_cache.attr, + NULL, +}; + +static void llite_kobj_release(struct kobject *kobj) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + complete(&sbi->ll_kobj_unregister); +} + +static struct kobj_type llite_ktype = { + .default_attrs = llite_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = llite_kobj_release, +}; + +static const struct llite_file_opcode { + __u32 opcode; + __u32 type; + const 
char *opname; +} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { + /* file operation */ + { LPROC_LL_DIRTY_HITS, LPROCFS_TYPE_REGS, "dirty_pages_hits" }, + { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" }, + { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "read_bytes" }, + { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "write_bytes" }, + { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_read" }, + { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_write" }, + { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" }, + { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" }, + { LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" }, + { LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" }, + { LPROC_LL_FAULT, LPROCFS_TYPE_REGS, "page_fault" }, + { LPROC_LL_MKWRITE, LPROCFS_TYPE_REGS, "page_mkwrite" }, + { LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" }, + { LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" }, + { LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" }, + /* inode operation */ + { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" }, + { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" }, + { LPROC_LL_FLOCK, LPROCFS_TYPE_REGS, "flock" }, + { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" }, + /* dir inode operation */ + { LPROC_LL_CREATE, LPROCFS_TYPE_REGS, "create" }, + { LPROC_LL_LINK, LPROCFS_TYPE_REGS, "link" }, + { LPROC_LL_UNLINK, LPROCFS_TYPE_REGS, "unlink" }, + { LPROC_LL_SYMLINK, LPROCFS_TYPE_REGS, "symlink" }, + { LPROC_LL_MKDIR, LPROCFS_TYPE_REGS, "mkdir" }, + { LPROC_LL_RMDIR, LPROCFS_TYPE_REGS, "rmdir" }, + { LPROC_LL_MKNOD, LPROCFS_TYPE_REGS, "mknod" }, + { LPROC_LL_RENAME, LPROCFS_TYPE_REGS, "rename" }, + /* special inode operation */ + { LPROC_LL_STAFS, LPROCFS_TYPE_REGS, "statfs" }, + { LPROC_LL_ALLOC_INODE, LPROCFS_TYPE_REGS, "alloc_inode" }, + { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" }, + { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" }, + { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REGS, "getxattr_hits" }, + { LPROC_LL_LISTXATTR, LPROCFS_TYPE_REGS, "listxattr" }, + { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_REGS, "removexattr" }, + { LPROC_LL_INODE_PERM, LPROCFS_TYPE_REGS, "inode_permission" }, +}; + +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) +{ + if (!sbi->ll_stats) + return; + if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PID && + sbi->ll_stats_track_id == current->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && + sbi->ll_stats_track_id == current->parent->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_GID && + sbi->ll_stats_track_id == + from_kgid(&init_user_ns, current_gid())) + lprocfs_counter_add(sbi->ll_stats, op, count); +} +EXPORT_SYMBOL(ll_stats_ops_tally); + +static const char *ra_stat_string[] = { + [RA_STAT_HIT] = "hits", + [RA_STAT_MISS] = "misses", + [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", + [RA_STAT_MISS_IN_WINDOW] = "miss inside window", + [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", + [RA_STAT_FAILED_MATCH] = "failed lock match", + [RA_STAT_DISCARDED] = "read but discarded", + [RA_STAT_ZERO_LEN] = "zero length file", + [RA_STAT_ZERO_WINDOW] = "zero size window", + [RA_STAT_EOF] = "read-ahead to EOF", + [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", + 
[RA_STAT_FAILED_REACH_END] = "failed to reach end" +}; + +int ll_debugfs_register_super(struct super_block *sb, const char *name) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + int err, id, rc; + + ENTRY; + LASSERT(sbi); + + if (IS_ERR_OR_NULL(llite_root)) + goto out_ll_kset; + + sbi->ll_debugfs_entry = ldebugfs_register(name, llite_root, + lprocfs_llite_obd_vars, sb); + if (IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) { + err = sbi->ll_debugfs_entry ? PTR_ERR(sbi->ll_debugfs_entry) : + -ENOMEM; + sbi->ll_debugfs_entry = NULL; + RETURN(err); + } + + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "dump_page_cache",0444, + &vvp_dump_pgcache_file_ops, sbi); + if (rc) + CWARN("Error adding the dump_page_cache file\n"); + + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "extents_stats", 0644, + &ll_rw_extents_stats_fops, sbi); + if (rc) + CWARN("Error adding the extent_stats file\n"); + + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, + "extents_stats_per_process", 0644, + &ll_rw_extents_stats_pp_fops, sbi); + if (rc) + CWARN("Error adding the extents_stats_per_process file\n"); + + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "offset_stats", 0644, + &ll_rw_offset_stats_fops, sbi); + if (rc) + CWARN("Error adding the offset_stats file\n"); + + /* File operations stats */ + sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_stats == NULL) + GOTO(out_debugfs, err = -ENOMEM); + + /* do counter init */ + for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { + u32 type = llite_opcode_table[id].type; + void *ptr = NULL; + + if (type & LPROCFS_TYPE_REGS) + ptr = "regs"; + else if (type & LPROCFS_TYPE_BYTES) + ptr = "bytes"; + else if (type & LPROCFS_TYPE_PAGES) + ptr = "pages"; + lprocfs_counter_init(sbi->ll_stats, + llite_opcode_table[id].opcode, + (type & LPROCFS_CNTR_AVGMINMAX), + llite_opcode_table[id].opname, ptr); + } + + err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "stats", + sbi->ll_stats); + if (err) + GOTO(out_stats, err); + + sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_ra_stats == NULL) + GOTO(out_stats, err = -ENOMEM); + + for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stat_string[id], "pages"); + + err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "read_ahead_stats", + sbi->ll_ra_stats); + if (err) + GOTO(out_ra_stats, err); + +out_ll_kset: + /* Yes we also register sysfs mount kset here as well */ + sbi->ll_kset.kobj.parent = llite_kobj; + sbi->ll_kset.kobj.ktype = &llite_ktype; + init_completion(&sbi->ll_kobj_unregister); + err = kobject_set_name(&sbi->ll_kset.kobj, "%s", name); + if (err) + GOTO(out_ra_stats, err); + + err = kset_register(&sbi->ll_kset); + if (err) + GOTO(out_ra_stats, err); + + lsi->lsi_kobj = kobject_get(&sbi->ll_kset.kobj); + + RETURN(0); +out_ra_stats: + lprocfs_free_stats(&sbi->ll_ra_stats); +out_stats: + lprocfs_free_stats(&sbi->ll_stats); +out_debugfs: + ldebugfs_remove(&sbi->ll_debugfs_entry); + + RETURN(err); +} + +void ll_debugfs_unregister_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + + if (!IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) + ldebugfs_remove(&sbi->ll_debugfs_entry); + + if (sbi->ll_dt_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); + + if (sbi->ll_md_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + 
sbi->ll_md_obd->obd_type->typ_name); + + kobject_put(lsi->lsi_kobj); + + kset_unregister(&sbi->ll_kset); + wait_for_completion(&sbi->ll_kobj_unregister); + + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); +} +#undef MAX_STRING_SIZE + +static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, + struct seq_file *seq, int which) +{ + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + unsigned long start, end, r, w; + char *unitp = "KMGTPEZY"; + int i, units = 10; + struct per_process_info *pp_info = &io_extents->pp_extents[which]; + + read_cum = 0; + write_cum = 0; + start = 0; + + for(i = 0; i < LL_HIST_MAX; i++) { + read_tot += pp_info->pp_r_hist.oh_buckets[i]; + write_tot += pp_info->pp_w_hist.oh_buckets[i]; + } + + for(i = 0; i < LL_HIST_MAX; i++) { + r = pp_info->pp_r_hist.oh_buckets[i]; + w = pp_info->pp_w_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + end = BIT(i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4u %4u | " + "%14lu %4u %4u\n", start, *unitp, end, *unitp, + (i == LL_HIST_MAX - 1) ? '+' : ' ', + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + start = end; + if (start == BIT(10)) { + start = 1; + units += 10; + unitp++; + } + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int k; + + ktime_get_real_ts64(&now); + + if (!sbi->ll_rw_stats_on) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %llu.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_pp_extent_lock); + for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { + if (io_extents->pp_extents[k].pid != 0) { + seq_printf(seq, "\nPID: %d\n", + io_extents->pp_extents[k].pid); + ll_display_extents_info(io_extents, seq, k); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + return 0; +} + +static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, + const char __user *buf, + size_t len, + loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats_pp); + +static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + ktime_get_real_ts64(&now); + + if (!sbi->ll_rw_stats_on) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to 
deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %llu.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_lock); + ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX); + spin_unlock(&sbi->ll_lock); + + return 0; +} + +static ssize_t ll_rw_extents_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats); + +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) +{ + int i, cur = -1; + struct ll_rw_process_info *process; + struct ll_rw_process_info *offset; + int *off_count = &sbi->ll_rw_offset_entry_count; + int *process_count = &sbi->ll_offset_process_count; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + if(!sbi->ll_rw_stats_on) + return; + process = sbi->ll_rw_process_info; + offset = sbi->ll_rw_offset_info; + + spin_lock(&sbi->ll_pp_extent_lock); + /* Extent statistics */ + for(i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if(io_extents->pp_extents[i].pid == pid) { + cur = i; + break; + } + } + + if (cur == -1) { + /* new process */ + sbi->ll_extent_process_count = + (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; + cur = sbi->ll_extent_process_count; + io_extents->pp_extents[cur].pid = pid; + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); + } + + for (i = 0; (count >= BIT(LL_HIST_START + i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } + spin_unlock(&sbi->ll_pp_extent_lock); + + spin_lock(&sbi->ll_process_lock); + /* Offset statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid == pid) { + if (process[i].rw_last_file != file) { + process[i].rw_range_start = pos; + process[i].rw_last_file_pos = pos + count; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = 0; + process[i].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); + return; + } + if (process[i].rw_last_file_pos != pos) { + *off_count = + (*off_count + 1) % LL_OFFSET_HIST_MAX; + offset[*off_count].rw_op = process[i].rw_op; + offset[*off_count].rw_pid = pid; + offset[*off_count].rw_range_start = + process[i].rw_range_start; + offset[*off_count].rw_range_end = + process[i].rw_last_file_pos; + offset[*off_count].rw_smallest_extent = + process[i].rw_smallest_extent; + 
offset[*off_count].rw_largest_extent = + process[i].rw_largest_extent; + offset[*off_count].rw_offset = + process[i].rw_offset; + process[i].rw_op = rw; + process[i].rw_range_start = pos; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = pos - + process[i].rw_last_file_pos; + } + if(process[i].rw_smallest_extent > count) + process[i].rw_smallest_extent = count; + if(process[i].rw_largest_extent < count) + process[i].rw_largest_extent = count; + process[i].rw_last_file_pos = pos + count; + spin_unlock(&sbi->ll_process_lock); + return; + } + } + *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX; + process[*process_count].rw_pid = pid; + process[*process_count].rw_op = rw; + process[*process_count].rw_range_start = pos; + process[*process_count].rw_last_file_pos = pos + count; + process[*process_count].rw_smallest_extent = count; + process[*process_count].rw_largest_extent = count; + process[*process_count].rw_offset = 0; + process[*process_count].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); +} + +static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *offset = sbi->ll_rw_offset_info; + struct ll_rw_process_info *process = sbi->ll_rw_process_info; + int i; + + ktime_get_real_ts64(&now); + + if (!sbi->ll_rw_stats_on) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + spin_lock(&sbi->ll_process_lock); + + seq_printf(seq, "snapshot_time: %llu.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", + "R/W", "PID", "RANGE START", "RANGE END", + "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); + + /* We stored the discontiguous offsets here; print them first */ + for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { + if (offset[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", + offset[i].rw_op == READ ? 'R' : 'W', + offset[i].rw_pid, + offset[i].rw_range_start, + offset[i].rw_range_end, + (unsigned long)offset[i].rw_smallest_extent, + (unsigned long)offset[i].rw_largest_extent, + offset[i].rw_offset); + } + + /* Then print the current offsets for each process */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", + process[i].rw_op == READ ? 
'R' : 'W', + process[i].rw_pid, + process[i].rw_range_start, + process[i].rw_last_file_pos, + (unsigned long)process[i].rw_smallest_extent, + (unsigned long)process[i].rw_largest_extent, + process[i].rw_offset); + } + spin_unlock(&sbi->ll_process_lock); + + return 0; +} + +static ssize_t ll_rw_offset_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *process_info = sbi->ll_rw_process_info; + struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_process_lock); + sbi->ll_offset_process_count = 0; + sbi->ll_rw_offset_entry_count = 0; + memset(process_info, 0, sizeof(struct ll_rw_process_info) * + LL_PROCESS_HIST_MAX); + memset(offset_info, 0, sizeof(struct ll_rw_process_info) * + LL_OFFSET_HIST_MAX); + spin_unlock(&sbi->ll_process_lock); + + return len; +} + +LDEBUGFS_SEQ_FOPS(ll_rw_offset_stats); diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c new file mode 100644 index 0000000000000..3856943101ab4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -0,0 +1,1891 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#ifdef HAVE_UIDGID_HEADER
+# include <linux/uidgid.h>
+#endif
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_dlm.h>
+#include "llite_internal.h"
+
+#ifndef HAVE_USER_NAMESPACE_ARG
+#define ll_create_nd(ns, dir, de, mode, ex) ll_create_nd(dir, de, mode, ex)
+#define ll_mkdir(ns, dir, dch, mode) ll_mkdir(dir, dch, mode)
+#define ll_mknod(ns, dir, dch, mode, rd) ll_mknod(dir, dch, mode, rd)
+#ifdef HAVE_IOPS_RENAME_WITH_FLAGS
+#define ll_rename(ns, src, sdc, tgt, tdc, fl) ll_rename(src, sdc, tgt, tdc, fl)
+#else
+#define ll_rename(ns, src, sdc, tgt, tdc) ll_rename(src, sdc, tgt, tdc)
+#endif /* HAVE_IOPS_RENAME_WITH_FLAGS */
+#define ll_symlink(nd, dir, dch, old) ll_symlink(dir, dch, old)
+#endif
+
+static int ll_create_it(struct inode *dir, struct dentry *dentry,
+			struct lookup_intent *it,
+			void *secctx, __u32 secctxlen);
+
+/* called from iget5_locked->find_inode() under inode_lock spinlock */
+static int ll_test_inode(struct inode *inode, void *opaque)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct lustre_md *md = opaque;
+
+	if (unlikely(!(md->body->mbo_valid & OBD_MD_FLID))) {
+		CERROR("MDS body missing FID\n");
+		return 0;
+	}
+
+	if (!lu_fid_eq(&lli->lli_fid, &md->body->mbo_fid1))
+		return 0;
+
+	return 1;
+}
+
+static int ll_set_inode(struct inode *inode, void *opaque)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct mdt_body *body = ((struct lustre_md *)opaque)->body;
+
+	if (unlikely(!(body->mbo_valid & OBD_MD_FLID))) {
+		CERROR("MDS body missing FID\n");
+		return -EINVAL;
+	}
+
+	lli->lli_fid = body->mbo_fid1;
+	if (unlikely(!(body->mbo_valid & OBD_MD_FLTYPE))) {
+		CERROR("Can not initialize inode "DFID" without object type: "
+		       "valid = %#llx\n",
+		       PFID(&lli->lli_fid), body->mbo_valid);
+		return -EINVAL;
+	}
+
+	inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mbo_mode & S_IFMT);
+	if (unlikely(inode->i_mode == 0)) {
+		CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid));
+		return -EINVAL;
+	}
+
+	ll_lli_init(lli);
+
+	return 0;
+}
+
+
+/**
+ * Get an inode by inode number (@hash), which is already instantiated by
+ * the intent lookup.
+ */ +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *md) +{ + struct inode *inode; + int rc = 0; + + ENTRY; + + LASSERT(hash != 0); + inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); + if (inode == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + if (inode->i_state & I_NEW) { + rc = ll_read_inode2(inode, md); + if (rc == 0 && S_ISREG(inode->i_mode) && + ll_i2info(inode)->lli_clob == NULL) + rc = cl_file_inode_init(inode, md); + + if (rc != 0) { + /* Let's clear directory lsm here, otherwise + * make_bad_inode() will reset the inode mode + * to regular, then ll_clear_inode will not + * be able to clear lsm_md */ + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else { + inode_has_no_xattr(inode); + unlock_new_inode(inode); + } + } else if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { + rc = ll_update_inode(inode, md); + CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n", + PFID(&md->body->mbo_fid1), inode, rc); + if (rc != 0) { + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + iput(inode); + inode = ERR_PTR(rc); + } + } + + RETURN(inode); +} + +static void ll_invalidate_negative_children(struct inode *dir) +{ + struct dentry *dentry, *tmp_subdir; + DECLARE_LL_D_HLIST_NODE_PTR(p); + + ll_lock_dcache(dir); + ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry) { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + struct dentry *child; + + list_for_each_entry_safe(child, tmp_subdir, + &dentry->d_subdirs, + d_child) { + if (child->d_inode == NULL) + d_lustre_invalidate(child, 1); + } + } + spin_unlock(&dentry->d_lock); + } + ll_unlock_dcache(dir); +} + +int ll_test_inode_by_fid(struct inode *inode, void *opaque) +{ + return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); +} + +static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + __u16 refcheck; + int rc; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* reach MDC layer to flush data under the DoM ldlm lock */ + rc = cl_object_flush(env, lli->lli_clob, lock); + if (rc == -ENODATA) { + CDEBUG(D_INODE, "inode "DFID" layout has no DoM stripe\n", + PFID(ll_inode2fid(inode))); + /* most likely result of layout change, do nothing */ + rc = 0; + } + + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) +{ + struct inode *inode = ll_inode_from_resource_lock(lock); + struct ll_inode_info *lli; + __u64 bits = to_cancel; + int rc; + + ENTRY; + + if (!inode) { + /* That means the inode is evicted most likely and may cause + * the skipping of lock cleanups below, so print the message + * about that in log. + */ + if (lock->l_resource->lr_lvb_inode) + LDLM_DEBUG(lock, + "can't take inode for the lock (%sevicted)\n", + lock->l_resource->lr_lvb_inode->i_state & + I_FREEING ? 
"" : "not "); + RETURN_EXIT; + } + + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); + LBUG(); + } + + if (bits & MDS_INODELOCK_XATTR) { + ll_xattr_cache_destroy(inode); + bits &= ~MDS_INODELOCK_XATTR; + } + + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); + LBUG(); + } + + ll_md_real_close(inode, fmode); + + bits &= ~MDS_INODELOCK_OPEN; + } + + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM | + MDS_INODELOCK_DOM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); + + if (bits & MDS_INODELOCK_DOM) { + rc = ll_dom_lock_cancel(inode, lock); + if (rc < 0) + CDEBUG(D_INODE, "cannot flush DoM data " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; + + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + lli = ll_i2info(inode); + + if (bits & MDS_INODELOCK_UPDATE) + lli->lli_update_atime = 1; + + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " + "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), + lli, PFID(&lli->lli_pfid)); + truncate_inode_pages(inode->i_mapping, 0); + + if (unlikely(!fid_is_zero(&lli->lli_pfid))) { + struct inode *master_inode = NULL; + unsigned long hash; + + /* This is slave inode, since all of the child dentry + * is connected on the master inode, so we have to + * invalidate the negative children on master inode */ + CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", + PFID(ll_inode2fid(inode)), PFID(&lli->lli_pfid)); + + hash = cl_fid_build_ino(&lli->lli_pfid, + ll_need_32bit_api(ll_i2sbi(inode))); + + /* Do not lookup the inode with ilookup5, otherwise + * it will cause dead lock, + * 1. Client1 send chmod req to the MDT0, then on MDT0, + * it enqueues master and all of its slaves lock, + * (mdt_attr_set() -> mdt_lock_slaves()), after gets + * master and stripe0 lock, it will send the enqueue + * req (for stripe1) to MDT1, then MDT1 finds the lock + * has been granted to client2. Then MDT1 sends blocking + * ast to client2. + * 2. At the same time, client2 tries to unlink + * the striped dir (rm -rf striped_dir), and during + * lookup, it will hold the master inode of the striped + * directory, whose inode state is NEW, then tries to + * revalidate all of its slaves, (ll_prep_inode()-> + * ll_iget()->ll_read_inode2()-> ll_update_inode().). + * And it will be blocked on the server side because + * of 1. + * 3. Then the client get the blocking_ast req, cancel + * the lock, but being blocked if using ->ilookup5()), + * because master inode state is NEW. 
*/ + master_inode = ilookup5_nowait(inode->i_sb, hash, + ll_test_inode_by_fid, + (void *)&lli->lli_pfid); + if (master_inode) { + ll_invalidate_negative_children(master_inode); + iput(master_inode); + } + } else { + ll_invalidate_negative_children(inode); + } + } + + if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && + inode->i_sb->s_root != NULL && + !is_root_inode(inode)) + ll_invalidate_aliases(inode); + + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) + forget_all_cached_acls(inode); + + iput(inode); + RETURN_EXIT; +} + +/* Check if the given lock may be downgraded instead of canceling and + * that convert is really needed. */ +int ll_md_need_convert(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + struct inode *inode; + __u64 wanted = lock->l_policy_data.l_inodebits.cancel_bits; + __u64 bits = lock->l_policy_data.l_inodebits.bits & ~wanted; + enum ldlm_mode mode = LCK_MINMODE; + + if (!lock->l_conn_export || + !exp_connect_lock_convert(lock->l_conn_export)) + return 0; + + if (!wanted || !bits || ldlm_is_cancel(lock)) + return 0; + + /* do not convert locks other than DOM for now */ + if (!((bits | wanted) & MDS_INODELOCK_DOM)) + return 0; + + /* We may have already remaining bits in some other lock so + * lock convert will leave us just extra lock for the same bit. + * Check if client has other lock with the same bits and the same + * or lower mode and don't convert if any. + */ + switch (lock->l_req_mode) { + case LCK_PR: + mode = LCK_PR; + fallthrough; + case LCK_PW: + mode |= LCK_CR; + break; + case LCK_CW: + mode = LCK_CW; + fallthrough; + case LCK_CR: + mode |= LCK_CR; + break; + default: + /* do not convert other modes */ + return 0; + } + + /* is lock is too old to be converted? */ + lock_res_and_lock(lock); + if (ktime_after(ktime_get(), + ktime_add(lock->l_last_used, + ktime_set(ns->ns_dirty_age_limit, 0)))) { + unlock_res_and_lock(lock); + return 0; + } + unlock_res_and_lock(lock); + + inode = ll_inode_from_resource_lock(lock); + ll_have_md_lock(inode, &bits, mode); + iput(inode); + return !!(bits); +} + +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *ld, + void *data, int flag) +{ + struct lustre_handle lockh; + int rc; + + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + { + __u64 cancel_flags = LCF_ASYNC; + + /* if lock convert is not needed then still have to + * pass lock via ldlm_cli_convert() to keep all states + * correct, set cancel_bits to full lock bits to cause + * full cancel to happen. + */ + if (!ll_md_need_convert(lock)) { + lock_res_and_lock(lock); + lock->l_policy_data.l_inodebits.cancel_bits = + lock->l_policy_data.l_inodebits.bits; + unlock_res_and_lock(lock); + } + rc = ldlm_cli_convert(lock, cancel_flags); + if (!rc) + RETURN(0); + /* continue with cancel otherwise */ + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, cancel_flags); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); + RETURN(rc); + } + break; + } + case LDLM_CB_CANCELING: + { + __u64 to_cancel = lock->l_policy_data.l_inodebits.bits; + + /* Nothing to do for non-granted locks */ + if (!ldlm_is_granted(lock)) + break; + + /* If 'ld' is supplied then bits to be cancelled are passed + * implicitly by lock converting and cancel_bits from 'ld' + * should be used. Otherwise full cancel is being performed + * and lock inodebits are used. 
+ * + * Note: we cannot rely on cancel_bits in lock itself at this + * moment because they can be changed by concurrent thread, + * so ldlm_cli_inodebits_convert() pass cancel bits implicitly + * in 'ld' parameter. + */ + if (ld) { + /* partial bits cancel allowed only during convert */ + LASSERT(ldlm_is_converting(lock)); + /* mask cancel bits by lock bits so only no any unused + * bits are passed to ll_lock_cancel_bits() + */ + to_cancel &= ld->l_policy_data.l_inodebits.cancel_bits; + } + ll_lock_cancel_bits(lock, to_cancel); + break; + } + default: + LBUG(); + } + + RETURN(0); +} + +__u32 ll_i2suppgid(struct inode *i) +{ + if (in_group_p(i->i_gid)) + return (__u32)from_kgid(&init_user_ns, i->i_gid); + else + return (__u32) __kgid_val(INVALID_GID); +} + +/* Pack the required supplementary groups into the supplied groups array. + * If we don't need to use the groups from the target inode(s) then we + * instead pack one or more groups from the user's supplementary group + * array in case it might be useful. Not needed if doing an MDS-side upcall. */ +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) +{ + LASSERT(i1 != NULL); + LASSERT(suppgids != NULL); + + suppgids[0] = ll_i2suppgid(i1); + + if (i2) + suppgids[1] = ll_i2suppgid(i2); + else + suppgids[1] = -1; +} + +/* + * try to reuse three types of dentry: + * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid + * by concurrent .revalidate). + * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may + * be cleared by others calling d_lustre_revalidate). + * 3. DISCONNECTED alias. + */ +static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *alias, *discon_alias, *invalid_alias; + DECLARE_LL_D_HLIST_NODE_PTR(p); + + if (ll_d_hlist_empty(&inode->i_dentry)) + return NULL; + + discon_alias = invalid_alias = NULL; + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry) { + LASSERT(alias != dentry); + + spin_lock(&alias->d_lock); + if ((alias->d_flags & DCACHE_DISCONNECTED) && + S_ISDIR(inode->i_mode)) + /* LASSERT(last_discon == NULL); LU-405, bz 20055 */ + discon_alias = alias; + else if (alias->d_parent == dentry->d_parent && + alias->d_name.hash == dentry->d_name.hash && + alias->d_name.len == dentry->d_name.len && + memcmp(alias->d_name.name, dentry->d_name.name, + dentry->d_name.len) == 0) + invalid_alias = alias; + spin_unlock(&alias->d_lock); + + if (invalid_alias) + break; + } + alias = invalid_alias ?: discon_alias ?: NULL; + if (alias) { + spin_lock(&alias->d_lock); + dget_dlock(alias); + spin_unlock(&alias->d_lock); + } + ll_unlock_dcache(inode); + + return alias; +} + +/* + * Similar to d_splice_alias(), but lustre treats invalid alias + * similar to DCACHE_DISCONNECTED, and tries to use it anyway. 
+ */ +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) +{ + struct dentry *new; + int rc; + + if (inode) { + new = ll_find_alias(inode, de); + if (new) { + rc = ll_d_init(new); + if (rc < 0) { + dput(new); + return ERR_PTR(rc); + } + d_move(new, de); + iput(inode); + CDEBUG(D_DENTRY, + "Reuse dentry %p inode %p refc %d flags %#x\n", + new, new->d_inode, ll_d_count(new), new->d_flags); + return new; + } + } + rc = ll_d_init(de); + if (rc < 0) + return ERR_PTR(rc); + d_add(de, inode); + CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", + de, de->d_inode, ll_d_count(de), de->d_flags); + return de; +} + +static int ll_lookup_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct inode *parent, struct dentry **de, + void *secctx, __u32 secctxlen, ktime_t kstart) +{ + struct inode *inode = NULL; + __u64 bits = 0; + int rc; + struct dentry *alias; + ENTRY; + + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ + CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, + it->it_disposition); + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + struct req_capsule *pill = &request->rq_pill; + struct mdt_body *body = req_capsule_server_get(pill, + &RMF_MDT_BODY); + + rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + /* OPEN can return data if lock has DoM+LAYOUT bits set */ + if (it->it_op & IT_OPEN && + bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(inode, request); + + /* We used to query real size from OSTs here, but actually + * this is not needed. For stat() calls size would be updated + * from subsequent do_revalidate()->ll_inode_revalidate_it() in + * 2.4 and + * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + * Everybody else who needs correct file size would call + * ll_glimpse_size or some equivalent themselves anyway. + * Also see bug 7198. + */ + + /* If security context was returned by MDT, put it in + * inode now to save an extra getxattr from security hooks, + * and avoid deadlock. + */ + if (body->mbo_valid & OBD_MD_SECCTX) { + secctx = req_capsule_server_get(pill, &RMF_FILE_SECCTX); + secctxlen = req_capsule_get_size(pill, + &RMF_FILE_SECCTX, + RCL_SERVER); + + if (secctxlen) + CDEBUG(D_SEC, "server returned security context" + " for "DFID"\n", + PFID(ll_inode2fid(inode))); + } + + if (secctx && secctxlen) { + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. Taking it would lead to a client deadlock + * LU-13617 + */ + rc = security_inode_notifysecctx(inode, secctx, + secctxlen); + if (rc) + CWARN("cannot set security context for " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + } + + /* Only hash *de if it is unhashed (new dentry). + * Atoimc_open may passin hashed dentries for open. + */ + alias = ll_splice_alias(inode, *de); + if (IS_ERR(alias)) + GOTO(out, rc = PTR_ERR(alias)); + + *de = alias; + + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + /* we have lookup look - unhide dentry */ + if (bits & MDS_INODELOCK_LOOKUP) { + d_lustre_revalidate(*de); + ll_update_dir_depth(parent, (*de)->d_inode); + } + } else if (!it_disposition(it, DISP_OPEN_CREATE)) { + /* + * If file was created on the server, the dentry is revalidated + * in ll_create_it if the lock allows for it. + */ + /* Check that parent has UPDATE lock. 
*/ + struct lookup_intent parent_it = { + .it_op = IT_GETATTR, + .it_lock_handle = 0 }; + struct lu_fid fid = ll_i2info(parent)->lli_fid; + + /* If it is striped directory, get the real stripe parent */ + if (unlikely(ll_dir_striped(parent))) { + rc = md_get_fid_from_lsm(ll_i2mdexp(parent), + ll_i2info(parent)->lli_lsm_md, + (*de)->d_name.name, + (*de)->d_name.len, &fid); + if (rc != 0) + GOTO(out, rc); + } + + if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, &fid, + NULL)) { + d_lustre_revalidate(*de); + ll_intent_release(&parent_it); + } + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + ll_stats_ops_tally(ll_i2sbi(parent), LPROC_LL_MKNOD, + ktime_us_delta(ktime_get(), kstart)); + } + + GOTO(out, rc = 0); + +out: + if (rc != 0 && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); + ll_open_cleanup((*de)->d_sb, request); + } + + return rc; +} + +static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, + struct lookup_intent *it, + void **secctx, __u32 *secctxlen) +{ + ktime_t kstart = ktime_get(); + struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; + struct dentry *save = dentry, *retval; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data = NULL; + __u32 opc; + int rc; + char secctx_name[XATTR_NAME_MAX + 1]; + + ENTRY; + + if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) + RETURN(ERR_PTR(-ENAMETOOLONG)); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), intent=%s\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(parent)), parent, LL_IT2STR(it)); + + if (d_mountpoint(dentry)) + CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); + + if (it == NULL || it->it_op == IT_GETXATTR) + it = &lookup_it; + + if (it->it_op == IT_GETATTR && dentry_may_statahead(parent, dentry)) { + rc = ll_statahead(parent, &dentry, 0); + if (rc == 1) + RETURN(dentry == save ? 
NULL : dentry); + } + + if (it->it_op & IT_OPEN && it->it_flags & FMODE_WRITE && + dentry->d_sb->s_flags & SB_RDONLY) + RETURN(ERR_PTR(-EROFS)); + + if (it->it_op & IT_CREAT) + opc = LUSTRE_OPC_CREATE; + else + opc = LUSTRE_OPC_ANY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name, + dentry->d_name.len, 0, opc, NULL); + if (IS_ERR(op_data)) + GOTO(out, retval = ERR_CAST(op_data)); + + /* enforce umask if acl disabled or MDS doesn't support umask */ + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + it->it_create_mode &= ~current_umask(); + + if (it->it_op & IT_CREAT && + ll_i2sbi(parent)->ll_flags & LL_SBI_FILE_SECCTX) { + rc = ll_dentry_init_security(dentry, it->it_create_mode, + &dentry->d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size); + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + if (secctx) + *secctx = op_data->op_file_secctx; + if (secctxlen) + *secctxlen = op_data->op_file_secctx_size; + } else { + if (secctx) + *secctx = NULL; + if (secctxlen) + *secctxlen = 0; + } + + /* ask for security context upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN)) { + /* get name of security xattr to request to server */ + rc = ll_listsecurity(parent, secctx_name, + sizeof(secctx_name)); + if (rc < 0) { + CDEBUG(D_SEC, "cannot get security xattr name for " + DFID": rc = %d\n", + PFID(ll_inode2fid(parent)), rc); + } else if (rc > 0) { + op_data->op_file_secctx_name = secctx_name; + op_data->op_file_secctx_name_size = rc; + CDEBUG(D_SEC, "'%.*s' is security xattr for "DFID"\n", + rc, secctx_name, PFID(ll_inode2fid(parent))); + } + } + + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + /* If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the + * client does not know which suppgid should be sent to the MDS, or + * some other(s) changed the target file's GID after this RPC sent + * to the MDS with the suppgid as the original GID, then we should + * try again with right suppgid. */ + if (rc == -EACCES && it->it_op & IT_OPEN && + it_disposition(it, DISP_OPEN_DENY)) { + struct mdt_body *body; + + LASSERT(req != NULL); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (op_data->op_suppgids[0] == body->mbo_gid || + op_data->op_suppgids[1] == body->mbo_gid || + !in_group_p(make_kgid(&init_user_ns, body->mbo_gid))) + GOTO(out, retval = ERR_PTR(-EACCES)); + + fid_zero(&op_data->op_fid2); + op_data->op_suppgids[1] = body->mbo_gid; + ptlrpc_req_finished(req); + req = NULL; + ll_intent_release(it); + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + } + + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + + /* dir layout may change */ + ll_unlock_md_op_lsm(op_data); + rc = ll_lookup_it_finish(req, it, parent, &dentry, + secctx ? *secctx : NULL, + secctxlen ? *secctxlen : 0, kstart); + if (rc != 0) { + ll_intent_release(it); + GOTO(out, retval = ERR_PTR(rc)); + } + + if ((it->it_op & IT_OPEN) && dentry->d_inode && + !S_ISREG(dentry->d_inode->i_mode) && + !S_ISDIR(dentry->d_inode->i_mode)) { + ll_release_openhandle(dentry, it); + } + ll_lookup_finish_locks(it, dentry); + + GOTO(out, retval = (dentry == save) ? 
NULL : dentry); + +out: + if (op_data != NULL && !IS_ERR(op_data)) { + if (secctx && secctxlen) { + /* caller needs sec ctx info, so reset it in op_data to + * prevent it from being freed */ + op_data->op_file_secctx = NULL; + op_data->op_file_secctx_size = 0; + } + ll_finish_md_op_data(op_data); + } + + ptlrpc_req_finished(req); + return retval; +} + +#ifdef HAVE_IOP_ATOMIC_OPEN +static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, + unsigned int flags) +{ + struct lookup_intent *itp, it = { .it_op = IT_GETATTR }; + struct dentry *de; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), flags=%u\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(parent)), parent, flags); + + /* + * Optimize away (CREATE && !OPEN). Let .create handle the race. + * but only if we have write permissions there, otherwise we need + * to proceed with lookup. LU-4185 + */ + if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) && + (inode_permission(&init_user_ns, + parent, MAY_WRITE | MAY_EXEC) == 0)) + return NULL; + + if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) + itp = NULL; + else + itp = ⁢ + de = ll_lookup_it(parent, dentry, itp, NULL, NULL); + + if (itp != NULL) + ll_intent_release(itp); + + return de; +} + +#ifdef FMODE_CREATED /* added in Linux v4.18-rc1-20-g73a09dd */ +# define ll_is_opened(o, f) ((f)->f_mode & FMODE_OPENED) +# define ll_finish_open(f, d, o) finish_open((f), (d), NULL) +# define ll_last_arg +# define ll_set_created(o, f) \ +do { \ + (f)->f_mode |= FMODE_CREATED; \ +} while (0) + +#else +# define ll_is_opened(o, f) (*(o)) +# define ll_finish_open(f, d, o) finish_open((f), (d), NULL, (o)) +# define ll_last_arg , int *opened +# define ll_set_created(o, f) \ +do { \ + *(o) |= FILE_CREATED; \ +} while (0) + +#endif + +/* + * For cached negative dentry and new dentry, handle lookup/create/open + * together. + */ +static int ll_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode ll_last_arg) +{ + struct lookup_intent *it; + struct dentry *de; + long long lookup_flags = LOOKUP_OPEN; + void *secctx = NULL; + __u32 secctxlen = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), file %p," + "open_flags %x, mode %x opened %d\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(dir)), dir, file, open_flags, mode, + ll_is_opened(opened, file)); + + /* Only negative dentries enter here */ + LASSERT(dentry->d_inode == NULL); + + if (!d_unhashed(dentry)) { + /* A valid negative dentry that just passed revalidation, + * there's little point to try and open it server-side, + * even though there's a minuscule chance it might succeed. + * Either way it's a valid race to just return -ENOENT here. 
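As a rough sketch of the decision this comment describes (the helper below is hypothetical, not part of the patch): without O_CREAT a cached negative dentry can simply report -ENOENT, while with O_CREAT the dentry is dropped so lookup/create can proceed.

/* Userspace sketch of the hashed-negative-dentry policy in the atomic
 * open path: report -ENOENT unless the open may create the file. */
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>

/* returns -ENOENT, or 0 meaning "drop the dentry and continue with create" */
static int negative_hashed_dentry_policy(bool hashed, int open_flags)
{
        if (!hashed)
                return 0;               /* unhashed: nothing special to do */
        if (!(open_flags & O_CREAT))
                return -ENOENT;         /* valid race: report missing file */
        return 0;                       /* caller would d_drop() and re-lookup */
}

int main(void)
{
        printf("%d\n", negative_hashed_dentry_policy(true, O_RDONLY)); /* -ENOENT */
        printf("%d\n", negative_hashed_dentry_policy(true, O_CREAT));  /* 0 */
        return 0;
}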
+ */ + if (!(open_flags & O_CREAT)) + return -ENOENT; + + /* Otherwise we just unhash it to be rehashed afresh via + * lookup if necessary + */ + d_drop(dentry); + } + + OBD_ALLOC(it, sizeof(*it)); + if (!it) + RETURN(-ENOMEM); + + it->it_op = IT_OPEN; + if (open_flags & O_CREAT) { + it->it_op |= IT_CREAT; + lookup_flags |= LOOKUP_CREATE; + } + it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; + it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); + it->it_flags &= ~MDS_OPEN_FL_INTERNAL; + + /* Dentry added to dcache tree in ll_lookup_it */ + de = ll_lookup_it(dir, dentry, it, &secctx, &secctxlen); + if (IS_ERR(de)) + rc = PTR_ERR(de); + else if (de != NULL) + dentry = de; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + if (!rc) { + if (it_disposition(it, DISP_OPEN_CREATE)) { + /* Dentry instantiated in ll_create_it. */ + rc = ll_create_it(dir, dentry, it, secctx, secctxlen); + ll_security_release_secctx(secctx, secctxlen); + if (rc) { + /* We dget in ll_splice_alias. */ + if (de != NULL) + dput(de); + goto out_release; + } + ll_set_created(opened, file); + } + if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) { + /* Open dentry. */ + if (S_ISFIFO(dentry->d_inode->i_mode)) { + /* We cannot call open here as it might + * deadlock. This case is unreachable in + * practice because of OBD_CONNECT_NODEVOH. */ + rc = finish_no_open(file, de); + } else { + file->private_data = it; + rc = ll_finish_open(file, dentry, opened); + /* We dget in ll_splice_alias. finish_open takes + * care of dget for fd open. + */ + if (de != NULL) + dput(de); + } + } else { + rc = finish_no_open(file, de); + } + } + +out_release: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + RETURN(rc); +} + +#else /* !HAVE_IOP_ATOMIC_OPEN */ +static struct lookup_intent * +ll_convert_intent(struct open_intent *oit, int lookup_flags, bool is_readonly) +{ + struct lookup_intent *it; + + OBD_ALLOC_PTR(it); + if (!it) + return ERR_PTR(-ENOMEM); + + if (lookup_flags & LOOKUP_OPEN) { + it->it_op = IT_OPEN; + /* Avoid file creation for ro bind mount point(is_readonly) */ + if ((lookup_flags & LOOKUP_CREATE) && !is_readonly) + it->it_op |= IT_CREAT; + it->it_create_mode = (oit->create_mode & S_IALLUGO) | S_IFREG; + it->it_flags = ll_namei_to_lookup_intent_flag(oit->flags & + ~(is_readonly ? O_CREAT : 0)); + it->it_flags &= ~MDS_OPEN_FL_INTERNAL; + } else { + it->it_op = IT_GETATTR; + } + + return it; +} + +static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *de; + ENTRY; + + if (nd && !(nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))) { + struct lookup_intent *it; + + if (ll_d2d(dentry) && ll_d2d(dentry)->lld_it) { + it = ll_d2d(dentry)->lld_it; + ll_d2d(dentry)->lld_it = NULL; + } else { + /* + * Optimize away (CREATE && !OPEN). Let .create handle + * the race. But only if we have write permissions + * there, otherwise we need to proceed with lookup. 
+ * LU-4185 + */ + if ((nd->flags & LOOKUP_CREATE) && + !(nd->flags & LOOKUP_OPEN) && + (inode_permission(parent, + MAY_WRITE | MAY_EXEC) == 0)) + RETURN(NULL); + + it = ll_convert_intent(&nd->intent.open, nd->flags, + (nd->path.mnt->mnt_flags & MNT_READONLY) || + (nd->path.mnt->mnt_sb->s_flags & SB_RDONLY)); + if (IS_ERR(it)) + RETURN((struct dentry *)it); + } + + de = ll_lookup_it(parent, dentry, it, NULL, NULL); + if (de) + dentry = de; + if ((nd->flags & LOOKUP_OPEN) && !IS_ERR(dentry)) { /* Open */ + if (dentry->d_inode && + it_disposition(it, DISP_OPEN_OPEN)) { /* nocreate */ + if (S_ISFIFO(dentry->d_inode->i_mode)) { + /* We cannot call open here as it might + * deadlock. This case is unreachable in + * practice because of + * OBD_CONNECT_NODEVOH. */ + } else { + struct file *filp; + + nd->intent.open.file->private_data = it; + filp = lookup_instantiate_filp(nd, + dentry, + NULL); + if (IS_ERR(filp)) { + if (de) + dput(de); + de = (struct dentry *)filp; + } + } + } else if (it_disposition(it, DISP_OPEN_CREATE)) { + /* XXX This can only reliably work on assumption + * that there are NO hashed negative dentries.*/ + ll_d2d(dentry)->lld_it = it; + it = NULL; /* Will be freed in ll_create_nd */ + /* We absolutely depend on ll_create_nd to be + * called to not leak this intent and possible + * data attached to it */ + } + } + + if (it) { + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + } + } else { + de = ll_lookup_it(parent, dentry, NULL, NULL, NULL); + } + + RETURN(de); +} +#endif /* HAVE_IOP_ATOMIC_OPEN */ + +/* We depend on "mode" being set with the proper file type/umask by now */ +static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) +{ + struct inode *inode = NULL; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int rc; + ENTRY; + + LASSERT(it && it->it_disposition); + + LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); + request = it->it_request; + it_clear_disposition(it, DISP_ENQ_CREATE_REF); + rc = ll_prep_inode(&inode, request, dir->i_sb, it); + if (rc) + GOTO(out, inode = ERR_PTR(rc)); + + /* Pause to allow for a race with concurrent access by fid */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_NODE_PAUSE, cfs_fail_val); + + /* We asked for a lock on the directory, but were granted a + * lock on the inode. Since we finally have an inode pointer, + * stuff it in the lock. */ + CDEBUG(D_DLMTRACE, "setting l_ast_data to inode "DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + EXIT; + out: + ptlrpc_req_finished(request); + return inode; +} + +/* + * By the time this is called, we already have created the directory cache + * entry for the new file, but it is so far negative - it has no inode. + * + * We defer creating the OBD object(s) until open, to keep the intent and + * non-intent code paths similar, and also because we do not have the MDS + * inode number before calling ll_create_node() (which is needed for LOV), + * so we would need to do yet another RPC to the MDS to store the LOV EA + * data on the MDS. If needed, we would pass the PACKED lmm as data and + * lmm_size in datalen (the MDS still has code which will handle that). + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +static int ll_create_it(struct inode *dir, struct dentry *dentry, + struct lookup_intent *it, + void *secctx, __u32 secctxlen) +{ + struct inode *inode; + __u64 bits = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), intent=%s\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(dir)), dir, LL_IT2STR(it)); + + rc = it_open_error(DISP_OPEN_CREATE, it); + if (rc) + RETURN(rc); + + inode = ll_create_node(dir, it); + if (IS_ERR(inode)) + RETURN(PTR_ERR(inode)); + + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX) && secctx) { + /* must be done before d_instantiate, because it calls + * security_d_instantiate, which means a getxattr if security + * context is not set yet */ + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. Taking it would lead to a client deadlock + * LU-13617 + */ + rc = security_inode_notifysecctx(inode, secctx, secctxlen); + if (rc) + RETURN(rc); + } + + d_instantiate(dentry, inode); + + if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX)) { + rc = ll_inode_init_security(dentry, inode, dir); + if (rc) + RETURN(rc); + } + + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + d_lustre_revalidate(dentry); + ll_update_dir_depth(dir, inode); + } + + RETURN(0); +} + +void ll_update_times(struct ptlrpc_request *request, struct inode *inode) +{ + struct mdt_body *body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + + LASSERT(body); + if (body->mbo_valid & OBD_MD_FLMTIME && + body->mbo_mtime > inode->i_mtime.tv_sec) { + CDEBUG(D_INODE, + "setting fid " DFID " mtime from %lld to %llu\n", + PFID(ll_inode2fid(inode)), + (s64)inode->i_mtime.tv_sec, body->mbo_mtime); + inode->i_mtime.tv_sec = body->mbo_mtime; + } + + if (body->mbo_valid & OBD_MD_FLCTIME && + body->mbo_ctime > inode->i_ctime.tv_sec) + inode->i_ctime.tv_sec = body->mbo_ctime; +} + +/* once default LMV (space balanced) is set on ROOT, it should take effect if + * default LMV is not set on parent directory. 
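The inheritance rule stated in this comment, and implemented by ll_qos_mkdir_prep() just below, can be illustrated with a small userspace model; the INHERIT_* sentinels are illustrative stand-ins for the LMV_INHERIT_* macros and do not claim their real values.

/* Userspace model of the ROOT default-LMV inheritance check: QoS
 * (space-balanced) mkdir is requested when the ROOT default layout's
 * max_inherit still covers the parent directory's depth. */
#include <stdbool.h>
#include <stdio.h>

#define INHERIT_NONE            0       /* models LMV_INHERIT_NONE */
#define INHERIT_UNLIMITED       255     /* models LMV_INHERIT_UNLIMITED */

static bool qos_mkdir_applies(unsigned int max_inherit, unsigned int dir_depth)
{
        return max_inherit != INHERIT_NONE &&
               (max_inherit == INHERIT_UNLIMITED || max_inherit >= dir_depth);
}

int main(void)
{
        /* parent at depth 3, ROOT allows inheritance down 2 levels: no QoS */
        printf("%d\n", qos_mkdir_applies(2, 3));        /* 0 */
        /* parent at depth 3, ROOT allows inheritance down 3 levels: QoS */
        printf("%d\n", qos_mkdir_applies(3, 3));        /* 1 */
        return 0;
}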
+ */ +static void ll_qos_mkdir_prep(struct md_op_data *op_data, struct inode *dir) +{ + struct inode *root = dir->i_sb->s_root->d_inode; + struct ll_inode_info *rlli = ll_i2info(root); + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_stripe_md *lsm; + + op_data->op_dir_depth = lli->lli_dir_depth; + + /* parent directory is striped */ + if (unlikely(lli->lli_lsm_md)) + return; + + /* default LMV set on parent directory */ + if (unlikely(lli->lli_default_lsm_md)) + return; + + /* parent is ROOT */ + if (unlikely(dir == root)) + return; + + /* default LMV not set on ROOT */ + if (!rlli->lli_default_lsm_md) + return; + + down_read(&rlli->lli_lsm_sem); + lsm = rlli->lli_default_lsm_md; + if (!lsm) + goto unlock; + + /* not space balanced */ + if (lsm->lsm_md_master_mdt_index != LMV_OFFSET_DEFAULT) + goto unlock; + + if (lsm->lsm_md_max_inherit != LMV_INHERIT_NONE && + (lsm->lsm_md_max_inherit == LMV_INHERIT_UNLIMITED || + lsm->lsm_md_max_inherit >= lli->lli_dir_depth)) { + op_data->op_flags |= MF_QOS_MKDIR; + if (lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE && + (lsm->lsm_md_max_inherit_rr == LMV_INHERIT_RR_UNLIMITED || + lsm->lsm_md_max_inherit_rr >= lli->lli_dir_depth)) + op_data->op_flags |= MF_RR_MKDIR; + CDEBUG(D_INODE, DFID" requests qos mkdir %#x\n", + PFID(&lli->lli_fid), op_data->op_flags); + } +unlock: + up_read(&rlli->lli_lsm_sem); +} + +static int ll_new_node(struct inode *dir, struct dentry *dchild, + const char *tgt, umode_t mode, int rdev, __u32 opc) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct inode *inode = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int tgt_len = 0; + int err; + + ENTRY; + if (unlikely(tgt != NULL)) + tgt_len = strlen(tgt) + 1; + +again: + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, + name->len, 0, opc, NULL); + if (IS_ERR(op_data)) + GOTO(err_exit, err = PTR_ERR(op_data)); + + if (S_ISDIR(mode)) + ll_qos_mkdir_prep(op_data, dir); + + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { + err = ll_dentry_init_security(dchild, mode, &dchild->d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size); + if (err < 0) + GOTO(err_exit, err); + } + + err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + cfs_curproc_cap_pack(), rdev, &request); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 14, 58, 0) + /* + * server < 2.12.58 doesn't pack default LMV in intent_getattr reply, + * fetch default LMV here. 
+ */ + if (unlikely(err == -EREMOTE)) { + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_user_md *lum; + int lumsize; + int err2; + + ptlrpc_req_finished(request); + request = NULL; + ll_finish_md_op_data(op_data); + op_data = NULL; + + err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request, + OBD_MD_DEFAULT_MEA); + if (err2 == 0) { + struct lustre_md md = { NULL }; + + md.body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + if (!md.body) + GOTO(err_exit, err = -EPROTO); + + OBD_ALLOC_PTR(md.default_lmv); + if (!md.default_lmv) + GOTO(err_exit, err = -ENOMEM); + + md.default_lmv->lsm_md_magic = lum->lum_magic; + md.default_lmv->lsm_md_stripe_count = + lum->lum_stripe_count; + md.default_lmv->lsm_md_master_mdt_index = + lum->lum_stripe_offset; + md.default_lmv->lsm_md_hash_type = lum->lum_hash_type; + md.default_lmv->lsm_md_max_inherit = + lum->lum_max_inherit; + md.default_lmv->lsm_md_max_inherit_rr = + lum->lum_max_inherit_rr; + + err = ll_update_inode(dir, &md); + md_free_lustre_md(sbi->ll_md_exp, &md); + if (err) + GOTO(err_exit, err); + } else if (err2 == -ENODATA && lli->lli_default_lsm_md) { + /* + * If there are no default stripe EA on the MDT, but the + * client has default stripe, then it probably means + * default stripe EA has just been deleted. + */ + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) + OBD_FREE_PTR(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + up_write(&lli->lli_lsm_sem); + } else { + GOTO(err_exit, err); + } + + ptlrpc_req_finished(request); + request = NULL; + goto again; + } +#endif + + if (err < 0) + GOTO(err_exit, err); + + ll_update_times(request, dir); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_NEWNODE_PAUSE, cfs_fail_val); + + err = ll_prep_inode(&inode, request, dchild->d_sb, NULL); + if (err) + GOTO(err_exit, err); + + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { + /* must be done before d_instantiate, because it calls + * security_d_instantiate, which means a getxattr if security + * context is not set yet */ + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. Taking it would lead to a client deadlock + * LU-13617 + */ + err = security_inode_notifysecctx(inode, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + if (err) + GOTO(err_exit, err); + } + + d_instantiate(dchild, inode); + + if (!(sbi->ll_flags & LL_SBI_FILE_SECCTX)) { + err = ll_inode_init_security(dchild, inode, dir); + if (err) + GOTO(err_exit, err); + } + + EXIT; +err_exit: + if (request != NULL) + ptlrpc_req_finished(request); + + if (!IS_ERR_OR_NULL(op_data)) + ll_finish_md_op_data(op_data); + + return err; +} + +static int ll_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, ll_umode_t mode, dev_t rdev) +{ + struct qstr *name = &dchild->d_name; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p) mode %o dev %x\n", + name->len, name->name, PFID(ll_inode2fid(dir)), dir, + mode, rdev); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; + fallthrough; + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ll_new_node(dir, dchild, NULL, mode, old_encode_dev(rdev), + LUSTRE_OPC_MKNOD); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); + + RETURN(err); +} + +#ifdef HAVE_IOP_ATOMIC_OPEN +/* + * Plain create. 
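ll_mknod() above normalizes the requested mode before issuing the create RPC: the umask is applied client-side only when the server will not apply it, an empty file type defaults to a regular file, and directories are rejected. A compact userspace model of that normalization (the helper name is illustrative):

/* Userspace model of the mode handling in ll_mknod(): apply the umask
 * locally only when the server side will not, default a missing type to
 * S_IFREG, and refuse directories (mkdir(2) must be used instead). */
#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>

static int normalize_mknod_mode(mode_t *mode, mode_t umask_bits,
                                int server_applies_umask)
{
        if (!server_applies_umask)
                *mode &= ~umask_bits;

        switch (*mode & S_IFMT) {
        case 0:
                *mode |= S_IFREG;       /* no type given: regular file */
                /* fall through */
        case S_IFREG:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFIFO:
        case S_IFSOCK:
                return 0;
        case S_IFDIR:
                return -EPERM;          /* directories go through mkdir */
        default:
                return -EINVAL;
        }
}

int main(void)
{
        mode_t mode = 0666;

        if (normalize_mknod_mode(&mode, 022, 0) == 0)
                printf("mode %o\n", (unsigned int)mode);        /* 100644 */
        return 0;
}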
Intent create is handled in atomic_open. + */ +static int ll_create_nd(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) +{ + int rc; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), " + "flags=%u, excl=%d\n", dentry->d_name.len, + dentry->d_name.name, PFID(ll_inode2fid(dir)), + dir, mode, want_excl); + + /* Using mknod(2) to create a regular file is designed to not recognize + * volatile file name, so we use ll_mknod() here. */ + rc = ll_mknod(mnt_userns, dir, dentry, mode, 0); + + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, unhashed %d\n", + dentry->d_name.len, dentry->d_name.name, d_unhashed(dentry)); + + return rc; +} +#else /* !HAVE_IOP_ATOMIC_OPEN */ +static int ll_create_nd(struct inode *dir, struct dentry *dentry, + ll_umode_t mode, struct nameidata *nd) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + struct lookup_intent *it = NULL; + int rc; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + if (lld != NULL) + it = lld->lld_it; + + if (!it) { + /* LU-8559: use LUSTRE_OPC_CREATE for non atomic open case + * so that volatile file name is recoginized. + * Mknod(2), however, is designed to not recognize volatile + * file name to avoid inode leak under orphan directory until + * MDT reboot */ + return ll_new_node(dir, dentry, NULL, mode, 0, + LUSTRE_OPC_CREATE); + } + + lld->lld_it = NULL; + + /* Was there an error? Propagate it! */ + if (it->it_status) { + rc = it->it_status; + goto out; + } + + rc = ll_create_it(dir, dentry, it, NULL, 0); + if (nd && (nd->flags & LOOKUP_OPEN) && dentry->d_inode) { /* Open */ + struct file *filp; + + nd->intent.open.file->private_data = it; + filp = lookup_instantiate_filp(nd, dentry, NULL); + if (IS_ERR(filp)) + rc = PTR_ERR(filp); + } + +out: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); + + return rc; +} +#endif /* HAVE_IOP_ATOMIC_OPEN */ + +static int ll_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, const char *oldpath) +{ + struct qstr *name = &dchild->d_name; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), target=%.*s\n", + name->len, name->name, PFID(ll_inode2fid(dir)), + dir, 3000, oldpath); + + err = ll_new_node(dir, dchild, oldpath, S_IFLNK | S_IRWXUGO, 0, + LUSTRE_OPC_SYMLINK); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1); + + RETURN(err); +} + +static int ll_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct inode *src = old_dentry->d_inode; + struct qstr *name = &new_dentry->d_name; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int err; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op: inode="DFID"(%p), dir="DFID"(%p), " + "target=%.*s\n", PFID(ll_inode2fid(src)), src, + PFID(ll_inode2fid(dir)), dir, name->len, name->name); + + op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + err = md_link(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (err) + GOTO(out, err); + + ll_update_times(request, dir); + ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1); + EXIT; +out: + ptlrpc_req_finished(request); + RETURN(err); +} + +static int 
ll_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, ll_umode_t mode) +{ + struct qstr *name = &dchild->d_name; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + name->len, name->name, PFID(ll_inode2fid(dir)), dir); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR; + + err = ll_new_node(dir, dchild, NULL, mode, 0, LUSTRE_OPC_MKDIR); + if (err == 0) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1); + + RETURN(err); +} + +static int ll_rmdir(struct inode *dir, struct dentry *dchild) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + name->len, name->name, PFID(ll_inode2fid(dir)), dir); + + if (unlikely(d_mountpoint(dchild))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (dchild->d_inode != NULL) + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + RETURN(rc); +} + +/** + * Remove dir entry + **/ +int ll_rmdir_entry(struct inode *dir, char *name, int namelen) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + namelen, name, PFID(ll_inode2fid(dir)), dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name), + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + op_data->op_cli_flags |= CLI_RM_ENTRY; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + RETURN(rc); +} + +static int ll_unlink(struct inode *dir, struct dentry *dchild) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct mdt_body *body; + int rc; + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + name->len, name->name, PFID(ll_inode2fid(dir)), dir); + + /* + * XXX: unlink bind mountpoint maybe call to here, + * just check it as vfs_unlink does. + */ + if (unlikely(d_mountpoint(dchild))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc) + GOTO(out, rc); + + /* + * The server puts attributes in on the last unlink, use them to update + * the link count so the inode can be freed immediately. 
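The code following this comment applies the returned link count to the cached inode; a small userspace sketch of the idea, with illustrative field and flag names rather than the real Lustre wire format:

/* Userspace sketch: the unlink reply carries the post-unlink link count,
 * and the client applies it to its cached inode so a now-unreferenced
 * inode can be freed without another RPC. */
#include <stdio.h>

#define REPLY_HAS_NLINK 0x1     /* stands in for OBD_MD_FLNLINK */

struct cached_inode { unsigned int nlink; };
struct unlink_reply { unsigned int valid; unsigned int nlink; };

static void apply_unlink_reply(struct cached_inode *inode,
                               const struct unlink_reply *reply)
{
        if (reply->valid & REPLY_HAS_NLINK)
                inode->nlink = reply->nlink;    /* e.g. 0 for the last link */
}

int main(void)
{
        struct cached_inode ino = { .nlink = 1 };
        struct unlink_reply rep = { .valid = REPLY_HAS_NLINK, .nlink = 0 };

        apply_unlink_reply(&ino, &rep);
        printf("nlink now %u, inode can be freed\n", ino.nlink);
        return 0;
}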
+ */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (body->mbo_valid & OBD_MD_FLNLINK) + set_nlink(dchild->d_inode, body->mbo_nlink); + + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1); + +out: + ptlrpc_req_finished(request); + RETURN(rc); +} + +static int ll_rename(struct user_namespace *mnt_userns, + struct inode *src, struct dentry *src_dchild, + struct inode *tgt, struct dentry *tgt_dchild +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) + , unsigned int flags +#endif + ) +{ + struct qstr *src_name = &src_dchild->d_name; + struct qstr *tgt_name = &tgt_dchild->d_name; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(src); + struct md_op_data *op_data; + int err; + ENTRY; + +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) + if (flags) + return -EINVAL; +#endif + + CDEBUG(D_VFSTRACE, "VFS Op:oldname=%.*s, src_dir="DFID + "(%p), newname=%.*s, tgt_dir="DFID"(%p)\n", + src_name->len, src_name->name, + PFID(ll_inode2fid(src)), src, tgt_name->len, + tgt_name->name, PFID(ll_inode2fid(tgt)), tgt); + + if (unlikely(d_mountpoint(src_dchild) || d_mountpoint(tgt_dchild))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (src_dchild->d_inode != NULL) + op_data->op_fid3 = *ll_inode2fid(src_dchild->d_inode); + + if (tgt_dchild->d_inode != NULL) + op_data->op_fid4 = *ll_inode2fid(tgt_dchild->d_inode); + + err = md_rename(sbi->ll_md_exp, op_data, + src_name->name, src_name->len, + tgt_name->name, tgt_name->len, &request); + ll_finish_md_op_data(op_data); + if (!err) { + ll_update_times(request, src); + ll_update_times(request, tgt); + ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1); + } + + ptlrpc_req_finished(request); + + if (err == 0) + d_move(src_dchild, tgt_dchild); + + RETURN(err); +} + +const struct inode_operations ll_dir_inode_operations = { + .mknod = ll_mknod, +#ifdef HAVE_IOP_ATOMIC_OPEN + .atomic_open = ll_atomic_open, +#endif + .lookup = ll_lookup_nd, + .create = ll_create_nd, + /* We need all these non-raw things for NFSD, to not patch it. */ + .unlink = ll_unlink, + .mkdir = ll_mkdir, + .rmdir = ll_rmdir, + .symlink = ll_symlink, + .link = ll_link, + .rename = ll_rename, + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +#ifdef HAVE_IOP_GET_ACL + .get_acl = ll_get_acl, +#endif +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; + +const struct inode_operations ll_special_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +#ifdef HAVE_IOP_GET_ACL + .get_acl = ll_get_acl, +#endif +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/range_lock.c b/drivers/staging/lustrefsx/lustre/llite/range_lock.c new file mode 100644 index 0000000000000..7a4c9c4cb766a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/range_lock.c @@ -0,0 +1,247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Range lock is used to allow multiple threads writing a single shared + * file given each thread is writing to a non-overlapping portion of the + * file. + * + * Refer to the possible upstream kernel version of range lock by + * Jan Kara : https://lkml.org/lkml/2013/1/31/480 + * + * This file could later replaced by the upstream kernel version. + */ +/* + * Author: Prakash Surya + * Author: Bobi Jam + */ +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include "range_lock.h" +#include + +/** + * Initialize a range lock tree + * + * \param tree [in] an empty range lock tree + * + * Pre: Caller should have allocated the range lock tree. + * Post: The range lock tree is ready to function. + */ +void range_lock_tree_init(struct range_lock_tree *tree) +{ + tree->rlt_root = NULL; + tree->rlt_sequence = 0; + spin_lock_init(&tree->rlt_lock); +} + +/** + * Intialize a range lock node + * + * \param lock [in] an empty range lock node + * \param start [in] start of the covering region + * \param end [in] end of the covering region + * + * Pre: Caller should have allocated the range lock node. + * Post: The range lock node is meant to cover [start, end] region + */ +int range_lock_init(struct range_lock *lock, __u64 start, __u64 end) +{ + int rc; + + interval_init(&lock->rl_node); + if (end != LUSTRE_EOF) + end >>= PAGE_SHIFT; + rc = interval_set(&lock->rl_node, start >> PAGE_SHIFT, end); + if (rc) + return rc; + + INIT_LIST_HEAD(&lock->rl_next_lock); + lock->rl_task = NULL; + lock->rl_lock_count = 0; + lock->rl_blocking_ranges = 0; + lock->rl_sequence = 0; + return rc; +} + +static inline struct range_lock *next_lock(struct range_lock *lock) +{ + return list_entry(lock->rl_next_lock.next, typeof(*lock), rl_next_lock); +} + +/** + * Helper function of range_unlock() + * + * \param node [in] a range lock found overlapped during interval node + * search + * \param arg [in] the range lock to be tested + * + * \retval INTERVAL_ITER_CONT indicate to continue the search for next + * overlapping range node + * \retval INTERVAL_ITER_STOP indicate to stop the search + */ +static enum interval_iter range_unlock_cb(struct interval_node *node, void *arg) +{ + struct range_lock *lock = arg; + struct range_lock *overlap = node2rangelock(node); + struct range_lock *iter; + ENTRY; + + list_for_each_entry(iter, &overlap->rl_next_lock, rl_next_lock) { + if (iter->rl_sequence > lock->rl_sequence) { + --iter->rl_blocking_ranges; + LASSERT(iter->rl_blocking_ranges > 0); + } + } + if (overlap->rl_sequence > lock->rl_sequence) { + --overlap->rl_blocking_ranges; + if (overlap->rl_blocking_ranges == 0) + wake_up_process(overlap->rl_task); + } + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Unlock a range lock, wake up locks blocked by this lock. 
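Note that range_lock_init() above stores the byte range in page units before it reaches the tree; a minimal userspace model of that conversion, where EOF_SENTINEL and the page shift are assumptions standing in for LUSTRE_EOF and PAGE_SHIFT:

/* Userspace model of the byte-range to page-index conversion in
 * range_lock_init(): both ends are shifted down by the page shift,
 * except an "EOF" end, which stays as-is so it keeps covering the
 * whole file. */
#include <inttypes.h>
#include <stdio.h>

#define MODEL_PAGE_SHIFT 12             /* 4 KiB pages assumed */
#define EOF_SENTINEL     (~0ULL)

static void byte_range_to_pages(uint64_t start, uint64_t end,
                                uint64_t *first_page, uint64_t *last_page)
{
        *first_page = start >> MODEL_PAGE_SHIFT;
        *last_page = (end == EOF_SENTINEL) ? end : end >> MODEL_PAGE_SHIFT;
}

int main(void)
{
        uint64_t first, last;

        byte_range_to_pages(4096, 12287, &first, &last);
        printf("pages [%" PRIu64 ", %" PRIu64 "]\n", first, last); /* [1, 2] */
        return 0;
}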
+ * + * \param tree [in] range lock tree + * \param lock [in] range lock to be deleted + * + * If this lock has been granted, relase it; if not, just delete it from + * the tree or the same region lock list. Wake up those locks only blocked + * by this lock through range_unlock_cb(). + */ +void range_unlock(struct range_lock_tree *tree, struct range_lock *lock) +{ + ENTRY; + + spin_lock(&tree->rlt_lock); + if (!list_empty(&lock->rl_next_lock)) { + struct range_lock *next; + + if (interval_is_intree(&lock->rl_node)) { /* first lock */ + /* Insert the next same range lock into the tree */ + next = next_lock(lock); + next->rl_lock_count = lock->rl_lock_count - 1; + interval_erase(&lock->rl_node, &tree->rlt_root); + interval_insert(&next->rl_node, &tree->rlt_root); + } else { + /* find the first lock in tree */ + list_for_each_entry(next, &lock->rl_next_lock, + rl_next_lock) { + if (!interval_is_intree(&next->rl_node)) + continue; + + LASSERT(next->rl_lock_count > 0); + next->rl_lock_count--; + break; + } + } + list_del_init(&lock->rl_next_lock); + } else { + LASSERT(interval_is_intree(&lock->rl_node)); + interval_erase(&lock->rl_node, &tree->rlt_root); + } + + interval_search(tree->rlt_root, &lock->rl_node.in_extent, + range_unlock_cb, lock); + spin_unlock(&tree->rlt_lock); + + EXIT; +} + +/** + * Helper function of range_lock() + * + * \param node [in] a range lock found overlapped during interval node + * search + * \param arg [in] the range lock to be tested + * + * \retval INTERVAL_ITER_CONT indicate to continue the search for next + * overlapping range node + * \retval INTERVAL_ITER_STOP indicate to stop the search + */ +static enum interval_iter range_lock_cb(struct interval_node *node, void *arg) +{ + struct range_lock *lock = (struct range_lock *)arg; + struct range_lock *overlap = node2rangelock(node); + + lock->rl_blocking_ranges += overlap->rl_lock_count + 1; + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Lock a region + * + * \param tree [in] range lock tree + * \param lock [in] range lock node containing the region span + * + * \retval 0 get the range lock + * \retval <0 error code while not getting the range lock + * + * If there exists overlapping range lock, the new lock will wait and + * retry, if later it find that it is not the chosen one to wake up, + * it wait again. + */ +int range_lock(struct range_lock_tree *tree, struct range_lock *lock) +{ + struct interval_node *node; + int rc = 0; + ENTRY; + + spin_lock(&tree->rlt_lock); + /* + * We need to check for all conflicting intervals + * already in the tree. + */ + interval_search(tree->rlt_root, &lock->rl_node.in_extent, + range_lock_cb, lock); + /* + * Insert to the tree if I am unique, otherwise I've been linked to + * the rl_next_lock of another lock which has the same range as mine + * in range_lock_cb(). 
+ */ + node = interval_insert(&lock->rl_node, &tree->rlt_root); + if (node != NULL) { + struct range_lock *tmp = node2rangelock(node); + + list_add_tail(&lock->rl_next_lock, &tmp->rl_next_lock); + tmp->rl_lock_count++; + } + lock->rl_sequence = ++tree->rlt_sequence; + + while (lock->rl_blocking_ranges > 0) { + lock->rl_task = current; + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&tree->rlt_lock); + schedule(); + + if (signal_pending(current)) { + range_unlock(tree, lock); + GOTO(out, rc = -ERESTARTSYS); + } + spin_lock(&tree->rlt_lock); + } + spin_unlock(&tree->rlt_lock); +out: + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/range_lock.h b/drivers/staging/lustrefsx/lustre/llite/range_lock.h new file mode 100644 index 0000000000000..5266db71bb676 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/range_lock.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Range lock is used to allow multiple threads writing a single shared + * file given each thread is writing to a non-overlapping portion of the + * file. + * + * Refer to the possible upstream kernel version of range lock by + * Jan Kara : https://lkml.org/lkml/2013/1/31/480 + * + * This file could later replaced by the upstream kernel version. + */ +/* + * Author: Prakash Surya + * Author: Bobi Jam + */ +#ifndef _RANGE_LOCK_H +#define _RANGE_LOCK_H + +#include +#include + +#define RL_FMT "[%llu, %llu]" +#define RL_PARA(range) \ + (range)->rl_node.in_extent.start, \ + (range)->rl_node.in_extent.end + +struct range_lock { + struct interval_node rl_node; + /** + * Process to enqueue this lock. + */ + struct task_struct *rl_task; + /** + * List of locks with the same range. + */ + struct list_head rl_next_lock; + /** + * Number of locks in the list rl_next_lock + */ + unsigned int rl_lock_count; + /** + * Number of ranges which are blocking acquisition of the lock + */ + unsigned int rl_blocking_ranges; + /** + * Sequence number of range lock. This number is used to get to know + * the order the locks are queued; this is required for range_cancel(). 
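The rl_sequence and rl_blocking_ranges fields described in this header drive the grant order: a waiter counts earlier-queued overlapping locks and is woken once that count reaches zero. A single-threaded userspace simulation of that bookkeeping follows; trees, shared-range lists and sleeping are deliberately elided, so this is only a sketch of the accounting, not the real implementation.

/* Single-threaded model of the range-lock accounting: enqueue counts the
 * overlapping locks already queued; unlock decrements only later-queued
 * overlapping waiters, which become grantable at zero. */
#include <stdbool.h>
#include <stdio.h>

struct model_lock {
        unsigned long long start, end;  /* inclusive page range */
        unsigned long long sequence;    /* queue order */
        unsigned int blocking;          /* like rl_blocking_ranges */
        bool held;
};

static bool overlaps(const struct model_lock *a, const struct model_lock *b)
{
        return a->start <= b->end && b->start <= a->end;
}

static void model_lock_enqueue(struct model_lock *locks, int n, int idx,
                               unsigned long long *seq)
{
        locks[idx].sequence = ++(*seq);
        locks[idx].held = true;
        for (int i = 0; i < n; i++)
                if (i != idx && locks[i].held &&
                    overlaps(&locks[i], &locks[idx]))
                        locks[idx].blocking++;
}

static void model_unlock(struct model_lock *locks, int n, int idx)
{
        locks[idx].held = false;
        for (int i = 0; i < n; i++)
                if (locks[i].held && overlaps(&locks[i], &locks[idx]) &&
                    locks[i].sequence > locks[idx].sequence)
                        locks[i].blocking--;    /* may become grantable */
}

int main(void)
{
        struct model_lock locks[2] = {
                { .start = 0, .end = 10 },      /* writer A */
                { .start = 5, .end = 15 },      /* writer B, overlaps A */
        };
        unsigned long long seq = 0;

        model_lock_enqueue(locks, 2, 0, &seq);
        model_lock_enqueue(locks, 2, 1, &seq);
        printf("B blocked by %u range(s)\n", locks[1].blocking);        /* 1 */
        model_unlock(locks, 2, 0);
        printf("B blocked by %u range(s)\n", locks[1].blocking);        /* 0: granted */
        return 0;
}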
+ */ + __u64 rl_sequence; +}; + +static inline struct range_lock *node2rangelock(const struct interval_node *n) +{ + return container_of(n, struct range_lock, rl_node); +} + +struct range_lock_tree { + struct interval_node *rlt_root; + spinlock_t rlt_lock; + __u64 rlt_sequence; +}; + +void range_lock_tree_init(struct range_lock_tree *tree); +int range_lock_init(struct range_lock *lock, __u64 start, __u64 end); +int range_lock(struct range_lock_tree *tree, struct range_lock *lock); +void range_unlock(struct range_lock_tree *tree, struct range_lock *lock); +#endif diff --git a/drivers/staging/lustrefsx/lustre/llite/rw.c b/drivers/staging/lustrefsx/lustre/llite/rw.c new file mode 100644 index 0000000000000..a5f3f9c187d57 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/rw.c @@ -0,0 +1,1267 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/rw.c + * + * Lustre Lite I/O page cache routines shared by different kernel revs + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +/* current_is_kswapd() */ +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); + +/** + * Get readahead pages from the filesystem readahead pool of the client for a + * thread. + * + * /param sbi superblock for filesystem readahead state ll_ra_info + * /param ria per-thread readahead state + * /param pages number of pages requested for readahead for the thread. + * + * WARNING: This algorithm is used to reduce contention on sbi->ll_lock. + * It should work well if the ra_max_pages is much greater than the single + * file's read-ahead window, and not too many threads contending for + * these readahead pages. + * + * TODO: There may be a 'global sync problem' if many threads are trying + * to get an ra budget that is larger than the remaining readahead pages + * and reach here at exactly the same time. They will compute /a ret to + * consume the remaining pages, but will fail at atomic_add_return() and + * get a zero ra window, although there is still ra space remaining. 
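A runnable userspace model of the reservation logic this comment describes (ll_ra_count_get() follows right after): RPC_PAGES stands in for PTLRPC_MAX_BRW_PAGES, and a plain counter replaces the atomic, so this is a sketch of the accounting rather than the real concurrency behaviour.

/* Model of the read-ahead budget: reserve up to "pages" from a shared
 * pool, refuse tiny reservations that would only produce small read
 * RPCs, back off if the pool was overrun, and allow a minimum override
 * so the current read always fits. */
#include <stdio.h>

#define RPC_PAGES 256   /* illustrative 1 MiB RPC at 4 KiB pages */

static long ra_pool_used, ra_pool_max = 1024;

static unsigned long reserve_ra_pages(unsigned long pages, unsigned long min)
{
        long ret = ra_pool_max - ra_pool_used;

        if (ret > (long)pages)
                ret = pages;
        /* refuse reservations that would only produce small read RPCs */
        if (ret < 0 || ret < (long)(pages < RPC_PAGES ? pages : RPC_PAGES))
                ret = 0;
        ra_pool_used += ret;
        if (ra_pool_used > ra_pool_max) {       /* lost a race: back off */
                ra_pool_used -= ret;
                ret = 0;
        }
        if ((unsigned long)ret < min) {         /* override for the current read */
                ra_pool_used += min - ret;
                ret = min;
        }
        return ret;
}

int main(void)
{
        printf("%lu\n", reserve_ra_pages(512, 0));      /* 512 */
        printf("%lu\n", reserve_ra_pages(1024, 0));     /* 512: rest of the pool */
        printf("%lu\n", reserve_ra_pages(512, 64));     /* pool empty: min override, 64 */
        return 0;
}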
- Jay */ + +static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, + struct ra_io_arg *ria, + unsigned long pages, unsigned long min) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + long ret; + ENTRY; + + /* If read-ahead pages left are less than 1M, do not do read-ahead, + * otherwise it will form small read RPC(< 1M), which hurt server + * performance a lot. */ + ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), + pages); + if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) + GOTO(out, ret = 0); + + if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { + atomic_sub(ret, &ra->ra_cur_pages); + ret = 0; + } + +out: + if (ret < min) { + /* override ra limit for maximum performance */ + atomic_add(min - ret, &ra->ra_cur_pages); + ret = min; + } + RETURN(ret); +} + +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + atomic_sub(len, &ra->ra_cur_pages); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) +{ + LASSERTF(which < _NR_RA_STAT, "which: %u\n", which); + lprocfs_counter_incr(sbi->ll_ra_stats, which); +} + +void ll_ra_stats_inc(struct inode *inode, enum ra_stat which) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + ll_ra_stats_inc_sbi(sbi, which); +} + +#define RAS_CDEBUG(ras) \ + CDEBUG(D_READA, \ + "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu rpc %lu " \ + "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n", \ + ras->ras_last_readpage, ras->ras_consecutive_requests, \ + ras->ras_consecutive_pages, ras->ras_window_start, \ + ras->ras_window_len, ras->ras_next_readahead, \ + ras->ras_rpc_size, \ + ras->ras_requests, ras->ras_request_index, \ + ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ + ras->ras_stride_pages, ras->ras_stride_length) + +static int index_in_window(unsigned long index, unsigned long point, + unsigned long before, unsigned long after) +{ + unsigned long start = point - before, end = point + after; + + if (start > point) + start = 0; + if (end < point) + end = ~0; + + return start <= index && index <= end; +} + +void ll_ras_enter(struct file *f) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(f); + struct ll_readahead_state *ras = &fd->fd_ras; + + spin_lock(&ras->ras_lock); + ras->ras_requests++; + ras->ras_request_index = 0; + ras->ras_consecutive_requests++; + spin_unlock(&ras->ras_lock); +} + +/** + * Initiates read-ahead of a page with given index. + * + * \retval +ve: page was already uptodate so it will be skipped + * from being added; + * \retval -ve: page wasn't added to \a queue for error; + * \retval 0: page was added into \a queue for read ahead. 
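index_in_window() above clamps the read-ahead window at both ends so that unsigned wrap-around cannot invert it; a tiny runnable model of that clamping:

/* Model of the window clamping: [point - before, point + after] is
 * clamped to [0, ~0UL] when the arithmetic wraps. */
#include <stdio.h>

static int in_window(unsigned long index, unsigned long point,
                     unsigned long before, unsigned long after)
{
        unsigned long start = point - before, end = point + after;

        if (start > point)      /* underflow: clamp to the first page */
                start = 0;
        if (end < point)        /* overflow: clamp to the last possible page */
                end = ~0UL;

        return start <= index && index <= end;
}

int main(void)
{
        /* point 2 with "before 8" underflows; the window still starts at 0 */
        printf("%d\n", in_window(0, 2, 8, 8));  /* 1 */
        printf("%d\n", in_window(11, 2, 8, 8)); /* 0 */
        return 0;
}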
+ */ +static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, pgoff_t index) +{ + struct cl_object *clob = io->ci_obj; + struct inode *inode = vvp_object_inode(clob); + struct page *vmpage; + struct cl_page *page; + struct vvp_page *vpg; + enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ + int rc = 0; + const char *msg = NULL; + ENTRY; + + vmpage = grab_cache_page_nowait(inode->i_mapping, index); + if (vmpage == NULL) { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "g_c_p_n failed"; + GOTO(out, rc = -EBUSY); + } + + /* Check if vmpage was truncated or reclaimed */ + if (vmpage->mapping != inode->i_mapping) { + which = RA_STAT_WRONG_GRAB_PAGE; + msg = "g_c_p_n returned invalid page"; + GOTO(out, rc = -EBUSY); + } + + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "cl_page_find failed"; + GOTO(out, rc = PTR_ERR(page)); + } + + lu_ref_add(&page->cp_reference, "ra", current); + cl_page_assume(env, io, page); + vpg = cl2vvp_page(cl_object_page_slice(clob, page)); + if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) { + vpg->vpg_defer_uptodate = 1; + vpg->vpg_ra_used = 0; + cl_page_list_add(queue, page); + } else { + /* skip completed pages */ + cl_page_unassume(env, io, page); + /* This page is already uptodate, returning a positive number + * to tell the callers about this */ + rc = 1; + } + + lu_ref_del(&page->cp_reference, "ra", current); + cl_page_put(env, page); + +out: + if (vmpage != NULL) { + if (rc != 0) + unlock_page(vmpage); + put_page(vmpage); + } + if (msg != NULL) { + ll_ra_stats_inc(inode, which); + CDEBUG(D_READA, "%s\n", msg); + + } + + RETURN(rc); +} + +#define RIA_DEBUG(ria) \ + CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ + ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ + ria->ria_pages) + +static inline int stride_io_mode(struct ll_readahead_state *ras) +{ + return ras->ras_consecutive_stride_requests > 1; +} + +/* The function calculates how much pages will be read in + * [off, off + length], in such stride IO area, + * stride_offset = st_off, stride_lengh = st_len, + * stride_pages = st_pgs + * + * |------------------|*****|------------------|*****|------------|*****|.... + * st_off + * |--- st_pgs ---| + * |----- st_len -----| + * + * How many pages it should read in such pattern + * |-------------------------------------------------------------| + * off + * |<------ length ------->| + * + * = |<----->| + |-------------------------------------| + |---| + * start_left st_pgs * i end_left + */ +static unsigned long +stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, + unsigned long off, unsigned long length) +{ + __u64 start = off > st_off ? off - st_off : 0; + __u64 end = off + length > st_off ? 
off + length - st_off : 0; + unsigned long start_left = 0; + unsigned long end_left = 0; + unsigned long pg_count; + + if (st_len == 0 || length == 0 || end == 0) + return length; + + start_left = do_div(start, st_len); + if (start_left < st_pgs) + start_left = st_pgs - start_left; + else + start_left = 0; + + end_left = do_div(end, st_len); + if (end_left > st_pgs) + end_left = st_pgs; + + CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu\n", + start, end, start_left, end_left); + + if (start == end) + pg_count = end_left - (st_pgs - start_left); + else + pg_count = start_left + st_pgs * (end - start - 1) + end_left; + + CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu" + "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count); + + return pg_count; +} + +static int ria_page_count(struct ra_io_arg *ria) +{ + __u64 length = ria->ria_end >= ria->ria_start ? + ria->ria_end - ria->ria_start + 1 : 0; + + return stride_pg_count(ria->ria_stoff, ria->ria_length, + ria->ria_pages, ria->ria_start, + length); +} + +static unsigned long ras_align(struct ll_readahead_state *ras, + unsigned long index, + unsigned long *remainder) +{ + unsigned long rem = index % ras->ras_rpc_size; + if (remainder != NULL) + *remainder = rem; + return index - rem; +} + +/*Check whether the index is in the defined ra-window */ +static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) +{ + /* If ria_length == ria_pages, it means non-stride I/O mode, + * idx should always inside read-ahead window in this case + * For stride I/O mode, just check whether the idx is inside + * the ria_pages. */ + return ria->ria_length == 0 || ria->ria_length == ria->ria_pages || + (idx >= ria->ria_stoff && (idx - ria->ria_stoff) % + ria->ria_length < ria->ria_pages); +} + +static unsigned long +ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct ll_readahead_state *ras, + struct ra_io_arg *ria, pgoff_t *ra_end) +{ + struct cl_read_ahead ra = { 0 }; + int rc = 0, count = 0; + bool stride_ria; + pgoff_t page_idx; + + LASSERT(ria != NULL); + RIA_DEBUG(ria); + + stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0; + for (page_idx = ria->ria_start; + page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) { + if (ras_inside_ra_window(page_idx, ria)) { + if (ra.cra_end == 0 || ra.cra_end < page_idx) { + unsigned long end; + + cl_read_ahead_release(env, &ra); + + rc = cl_io_read_ahead(env, io, page_idx, &ra); + if (rc < 0) + break; + + CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n", + page_idx, ra.cra_end, ra.cra_rpc_size); + LASSERTF(ra.cra_end >= page_idx, + "object: %p, indcies %lu / %lu\n", + io->ci_obj, ra.cra_end, page_idx); + /* update read ahead RPC size. + * NB: it's racy but doesn't matter */ + if (ras->ras_rpc_size != ra.cra_rpc_size && + ra.cra_rpc_size > 0) + ras->ras_rpc_size = ra.cra_rpc_size; + /* trim it to align with optimal RPC size */ + end = ras_align(ras, ria->ria_end + 1, NULL); + if (end > 0 && !ria->ria_eof) + ria->ria_end = end - 1; + if (ria->ria_end < ria->ria_end_min) + ria->ria_end = ria->ria_end_min; + if (ria->ria_end > ra.cra_end) + ria->ria_end = ra.cra_end; + } + if (page_idx > ria->ria_end) + break; + + /* If the page is inside the read-ahead window */ + rc = ll_read_ahead_page(env, io, queue, page_idx); + if (rc < 0) + break; + + *ra_end = page_idx; + /* Only subtract from reserve & count the page if we + * really did readahead on that page. 
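The stride accounting above (stride_pg_count() and the diagram that precedes it) is easiest to see with concrete numbers; the following standalone version reproduces the same arithmetic and works one example: 4 dense pages every 16 pages, reading pages 2 through 41.

/* Runnable userspace version of the stride page count: given a pattern
 * that repeats st_pgs data pages every st_len pages starting at st_off,
 * count how many data pages fall inside [off, off + length).  The worked
 * example yields 2 trailing pages of the first chunk, one full chunk in
 * the middle and one full chunk at the end. */
#include <stdio.h>

static unsigned long stride_pages(unsigned long st_off, unsigned long st_len,
                                  unsigned long st_pgs, unsigned long off,
                                  unsigned long length)
{
        unsigned long start = off > st_off ? off - st_off : 0;
        unsigned long end = off + length > st_off ? off + length - st_off : 0;
        unsigned long start_left, end_left;

        if (st_len == 0 || length == 0 || end == 0)
                return length;

        start_left = start % st_len;
        start_left = start_left < st_pgs ? st_pgs - start_left : 0;
        start /= st_len;

        end_left = end % st_len;
        if (end_left > st_pgs)
                end_left = st_pgs;
        end /= st_len;

        if (start == end)
                return end_left - (st_pgs - start_left);
        return start_left + st_pgs * (end - start - 1) + end_left;
}

int main(void)
{
        /* 4 data pages every 16 pages, reading pages [2, 41]: 2 + 4 + 4 */
        printf("%lu\n", stride_pages(0, 16, 4, 2, 40)); /* 10 */
        return 0;
}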
*/ + if (rc == 0) { + ria->ria_reserved--; + count++; + } + } else if (stride_ria) { + /* If it is not in the read-ahead window, and it is + * read-ahead mode, then check whether it should skip + * the stride gap */ + pgoff_t offset; + /* FIXME: This assertion only is valid when it is for + * forward read-ahead, it will be fixed when backward + * read-ahead is implemented */ + LASSERTF(page_idx >= ria->ria_stoff, + "Invalid page_idx %lu rs %lu re %lu ro %lu " + "rl %lu rp %lu\n", page_idx, + ria->ria_start, ria->ria_end, ria->ria_stoff, + ria->ria_length, ria->ria_pages); + offset = page_idx - ria->ria_stoff; + offset = offset % (ria->ria_length); + if (offset > ria->ria_pages) { + page_idx += ria->ria_length - offset; + CDEBUG(D_READA, "i %lu skip %lu \n", page_idx, + ria->ria_length - offset); + continue; + } + } + } + + cl_read_ahead_release(env, &ra); + + return count; +} + +static int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, + struct ll_readahead_state *ras, bool hit) +{ + struct vvp_io *vio = vvp_env_io(env); + struct ll_thread_info *lti = ll_env_info(env); + struct cl_attr *attr = vvp_env_thread_attr(env); + unsigned long len, mlen = 0; + pgoff_t ra_end = 0, start = 0, end = 0; + struct inode *inode; + struct ra_io_arg *ria = <i->lti_ria; + struct cl_object *clob; + int ret = 0; + __u64 kms; + ENTRY; + + clob = io->ci_obj; + inode = vvp_object_inode(clob); + + memset(ria, 0, sizeof *ria); + + cl_object_attr_lock(clob); + ret = cl_object_attr_get(env, clob, attr); + cl_object_attr_unlock(clob); + + if (ret != 0) + RETURN(ret); + kms = attr->cat_kms; + if (kms == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN); + RETURN(0); + } + + spin_lock(&ras->ras_lock); + + /** + * Note: other thread might rollback the ras_next_readahead, + * if it can not get the full size of prepared pages, see the + * end of this function. For stride read ahead, it needs to + * make sure the offset is no less than ras_stride_offset, + * so that stride read ahead can work correctly. + */ + if (stride_io_mode(ras)) + start = max(ras->ras_next_readahead, ras->ras_stride_offset); + else + start = ras->ras_next_readahead; + + if (ras->ras_window_len > 0) + end = ras->ras_window_start + ras->ras_window_len - 1; + + /* Enlarge the RA window to encompass the full read */ + if (vio->vui_ra_valid && + end < vio->vui_ra_start + vio->vui_ra_count - 1) + end = vio->vui_ra_start + vio->vui_ra_count - 1; + + if (end != 0) { + unsigned long end_index; + + /* Truncate RA window to end of file */ + end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT); + if (end_index <= end) { + end = end_index; + ria->ria_eof = true; + } + } + ria->ria_start = start; + ria->ria_end = end; + /* If stride I/O mode is detected, get stride window*/ + if (stride_io_mode(ras)) { + ria->ria_stoff = ras->ras_stride_offset; + ria->ria_length = ras->ras_stride_length; + ria->ria_pages = ras->ras_stride_pages; + } + spin_unlock(&ras->ras_lock); + + if (end == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); + RETURN(0); + } + len = ria_page_count(ria); + if (len == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); + RETURN(0); + } + + RAS_CDEBUG(ras); + CDEBUG(D_READA, DFID": ria: %lu/%lu, bead: %lu/%lu, hit: %d\n", + PFID(lu_object_fid(&clob->co_lu)), + ria->ria_start, ria->ria_end, + vio->vui_ra_valid ? vio->vui_ra_start : 0, + vio->vui_ra_valid ? 
vio->vui_ra_count : 0, + hit); + + /* at least to extend the readahead window to cover current read */ + if (!hit && vio->vui_ra_valid && + vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) { + unsigned long remainder; + + /* to the end of current read window. */ + mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start; + /* trim to RPC boundary */ + ras_align(ras, ria->ria_start, &remainder); + mlen = min(mlen, ras->ras_rpc_size - remainder); + ria->ria_end_min = ria->ria_start + mlen; + } + + ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen); + if (ria->ria_reserved < len) + ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT); + + CDEBUG(D_READA, "reserved pages: %lu/%lu/%lu, ra_cur %d, ra_max %lu\n", + ria->ria_reserved, len, mlen, + atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), + ll_i2sbi(inode)->ll_ra_info.ra_max_pages); + + ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end); + + if (ria->ria_reserved != 0) + ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved); + + if (ra_end == end && ra_end == (kms >> PAGE_SHIFT)) + ll_ra_stats_inc(inode, RA_STAT_EOF); + + CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n", + ra_end, end, ria->ria_end, ret); + + if (ra_end != end) + ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END); + if (ra_end > 0) { + /* update the ras so that the next read-ahead tries from + * where we left off. */ + spin_lock(&ras->ras_lock); + ras->ras_next_readahead = ra_end + 1; + spin_unlock(&ras->ras_lock); + RAS_CDEBUG(ras); + } + + RETURN(ret); +} + +static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_window_start = ras_align(ras, index, NULL); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_reset(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_last_readpage = index; + ras->ras_consecutive_requests = 0; + ras->ras_consecutive_pages = 0; + ras->ras_window_len = 0; + ras_set_start(inode, ras, index); + ras->ras_next_readahead = max(ras->ras_window_start, index + 1); + + RAS_CDEBUG(ras); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_stride_reset(struct ll_readahead_state *ras) +{ + ras->ras_consecutive_stride_requests = 0; + ras->ras_stride_length = 0; + ras->ras_stride_pages = 0; + RAS_CDEBUG(ras); +} + +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) +{ + spin_lock_init(&ras->ras_lock); + ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES; + ras_reset(inode, ras, 0); + ras->ras_requests = 0; +} + +/* + * Check whether the read request is in the stride window. + * If it is in the stride window, return 1, otherwise return 0. 
+ */ +static int index_in_stride_window(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap; + + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 || + ras->ras_stride_pages == ras->ras_stride_length) + return 0; + + stride_gap = index - ras->ras_last_readpage - 1; + + /* If it is contiguous read */ + if (stride_gap == 0) + return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; + + /* Otherwise check the stride by itself */ + return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && + ras->ras_consecutive_pages == ras->ras_stride_pages; +} + +static void ras_update_stride_detector(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap = index - ras->ras_last_readpage - 1; + + if (!stride_io_mode(ras) && (stride_gap != 0 || + ras->ras_consecutive_stride_requests == 0)) { + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + } + LASSERT(ras->ras_request_index == 0); + LASSERT(ras->ras_consecutive_stride_requests == 0); + + if (index <= ras->ras_last_readpage) { + /*Reset stride window for forward read*/ + ras_stride_reset(ras); + return; + } + + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + + RAS_CDEBUG(ras); + return; +} + +static unsigned long +stride_page_count(struct ll_readahead_state *ras, unsigned long len) +{ + return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length, + ras->ras_stride_pages, ras->ras_stride_offset, + len); +} + +/* Stride Read-ahead window will be increased inc_len according to + * stride I/O pattern */ +static void ras_stride_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, + unsigned long inc_len) +{ + unsigned long left, step, window_len; + unsigned long stride_len; + + LASSERT(ras->ras_stride_length > 0); + LASSERTF(ras->ras_window_start + ras->ras_window_len + >= ras->ras_stride_offset, "window_start %lu, window_len %lu" + " stride_offset %lu\n", ras->ras_window_start, + ras->ras_window_len, ras->ras_stride_offset); + + stride_len = ras->ras_window_start + ras->ras_window_len - + ras->ras_stride_offset; + + left = stride_len % ras->ras_stride_length; + window_len = ras->ras_window_len - left; + + if (left < ras->ras_stride_pages) + left += inc_len; + else + left = ras->ras_stride_pages + inc_len; + + LASSERT(ras->ras_stride_pages != 0); + + step = left / ras->ras_stride_pages; + left %= ras->ras_stride_pages; + + window_len += step * ras->ras_stride_length + left; + + if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file) + ras->ras_window_len = window_len; + + RAS_CDEBUG(ras); +} + +static void ras_increase_window(struct inode *inode, + struct ll_readahead_state *ras, + struct ll_ra_info *ra) +{ + /* The stretch of ra-window should be aligned with max rpc_size + * but current clio architecture does not support retrieve such + * information from lower layer. 
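The detector earlier in this hunk (ras_update_stride_detector() together with index_in_stride_window()) infers the stride geometry from the gap between the end of one contiguous run of reads and the first page of the next. A minimal stand-alone sketch of that inference, illustrative only and not taken from this patch:

/* Illustrative sketch only: infer a stride pattern from page indices.
 * A run of `run_pages` contiguous pages followed by a gap implies
 * stride_pages = run_pages and stride_length = run_pages + gap, which
 * is what ras_update_stride_detector() records. */
#include <stdio.h>

struct stride {
        unsigned long length;   /* pages between the starts of two runs */
        unsigned long pages;    /* contiguous pages per run */
};

static struct stride detect_stride(unsigned long run_pages,
                                   unsigned long next_index,
                                   unsigned long last_index)
{
        struct stride s;
        unsigned long gap = next_index - last_index - 1;

        s.pages = run_pages;
        s.length = gap + run_pages;
        return s;
}

int main(void)
{
        /* read pages 0-3, then jump to page 16: 4 pages every 16 pages */
        struct stride s = detect_stride(4, 16, 3);

        printf("stride_pages=%lu stride_length=%lu\n", s.pages, s.length);
        return 0;
}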
FIXME later + */ + if (stride_io_mode(ras)) { + ras_stride_increase_window(ras, ra, ras->ras_rpc_size); + } else { + unsigned long wlen; + + wlen = min(ras->ras_window_len + ras->ras_rpc_size, + ra->ra_max_pages_per_file); + if (wlen < ras->ras_rpc_size) + ras->ras_window_len = wlen; + else + ras->ras_window_len = ras_align(ras, wlen, NULL); + } +} + +static void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + enum ras_update_flags flags) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + bool hit = flags & LL_RAS_HIT; + int zero = 0, stride_detect = 0, ra_miss = 0; + ENTRY; + + spin_lock(&ras->ras_lock); + + if (!hit) + CDEBUG(D_READA, DFID " pages at %lu miss.\n", + PFID(ll_inode2fid(inode)), index); + ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS); + + /* reset the read-ahead window in two cases. First when the app seeks + * or reads to some other part of the file. Secondly if we get a + * read-ahead miss that we think we've previously issued. This can + * be a symptom of there being so many read-ahead pages that the VM is + * reclaiming it before we get to it. */ + if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) { + zero = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); + } else if (!hit && ras->ras_window_len && + index < ras->ras_next_readahead && + index_in_window(index, ras->ras_window_start, 0, + ras->ras_window_len)) { + ra_miss = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); + } + + /* On the second access to a file smaller than the tunable + * ra_max_read_ahead_whole_pages trigger RA on all pages in the + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted + * to for subsequent IO. The mmap case does not increment + * ras_requests and thus can never trigger this behavior. */ + if (ras->ras_requests >= 2 && !ras->ras_request_index) { + __u64 kms_pages; + + kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + + CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, + ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file); + + if (kms_pages && + kms_pages <= ra->ra_max_read_ahead_whole_pages) { + ras->ras_window_start = 0; + ras->ras_next_readahead = index + 1; + ras->ras_window_len = min(ra->ra_max_pages_per_file, + ra->ra_max_read_ahead_whole_pages); + GOTO(out_unlock, 0); + } + } + if (zero) { + /* check whether it is in stride I/O mode*/ + if (!index_in_stride_window(ras, index)) { + if (ras->ras_consecutive_stride_requests == 0 && + ras->ras_request_index == 0) { + ras_update_stride_detector(ras, index); + ras->ras_consecutive_stride_requests++; + } else { + ras_stride_reset(ras); + } + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + GOTO(out_unlock, 0); + } else { + ras->ras_consecutive_pages = 0; + ras->ras_consecutive_requests = 0; + if (++ras->ras_consecutive_stride_requests > 1) + stride_detect = 1; + RAS_CDEBUG(ras); + } + } else { + if (ra_miss) { + if (index_in_stride_window(ras, index) && + stride_io_mode(ras)) { + if (index != ras->ras_last_readpage + 1) + ras->ras_consecutive_pages = 0; + ras_reset(inode, ras, index); + + /* If stride-RA hit cache miss, the stride + * detector will not be reset to avoid the + * overhead of redetecting read-ahead mode, + * but on the condition that the stride window + * is still intersect with normal sequential + * read-ahead window. 
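The non-stride branch of ras_increase_window() at the top of this hunk grows the window by one RPC worth of pages, caps it at ra_max_pages_per_file, and keeps it rounded down to an RPC multiple once it is at least one RPC large. A hedged stand-alone sketch of that growth rule (plain C, not the patch's code):

/* Illustrative sketch only: grow a read-ahead window by one RPC worth
 * of pages, cap it at the per-file maximum, and keep it aligned to the
 * RPC size once it is at least that large (mirrors the non-stride
 * branch of ras_increase_window()). */
#include <stdio.h>

static unsigned long grow_window(unsigned long window, unsigned long rpc,
                                 unsigned long max_per_file)
{
        unsigned long wlen = window + rpc;

        if (wlen > max_per_file)
                wlen = max_per_file;
        if (wlen < rpc)
                return wlen;            /* too small to align yet */
        return wlen - (wlen % rpc);     /* round down to an RPC multiple */
}

int main(void)
{
        unsigned long w = 0;
        int i;

        for (i = 0; i < 4; i++) {
                w = grow_window(w, 256, 1024);
                printf("window=%lu\n", w);      /* 256, 512, 768, 1024 */
        }
        return 0;
}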
*/ + if (ras->ras_window_start < + ras->ras_stride_offset) + ras_stride_reset(ras); + RAS_CDEBUG(ras); + } else { + /* Reset both stride window and normal RA + * window */ + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + ras_stride_reset(ras); + GOTO(out_unlock, 0); + } + } else if (stride_io_mode(ras)) { + /* If this is contiguous read but in stride I/O mode + * currently, check whether stride step still is valid, + * if invalid, it will reset the stride ra window*/ + if (!index_in_stride_window(ras, index)) { + /* Shrink stride read-ahead window to be zero */ + ras_stride_reset(ras); + ras->ras_window_len = 0; + ras->ras_next_readahead = index; + } + } + } + ras->ras_consecutive_pages++; + ras->ras_last_readpage = index; + ras_set_start(inode, ras, index); + + if (stride_io_mode(ras)) { + /* Since stride readahead is sentivite to the offset + * of read-ahead, so we use original offset here, + * instead of ras_window_start, which is RPC aligned */ + ras->ras_next_readahead = max(index + 1, + ras->ras_next_readahead); + ras->ras_window_start = max(ras->ras_stride_offset, + ras->ras_window_start); + } else { + if (ras->ras_next_readahead < ras->ras_window_start) + ras->ras_next_readahead = ras->ras_window_start; + if (!hit) + ras->ras_next_readahead = index + 1; + } + RAS_CDEBUG(ras); + + /* Trigger RA in the mmap case where ras_consecutive_requests + * is not incremented and thus can't be used to trigger RA */ + if (ras->ras_consecutive_pages >= 4 && flags & LL_RAS_MMAP) { + ras_increase_window(inode, ras, ra); + /* reset consecutive pages so that the readahead window can + * grow gradually. */ + ras->ras_consecutive_pages = 0; + GOTO(out_unlock, 0); + } + + /* Initially reset the stride window offset to next_readahead*/ + if (ras->ras_consecutive_stride_requests == 2 && stride_detect) { + /** + * Once stride IO mode is detected, next_readahead should be + * reset to make sure next_readahead > stride offset + */ + ras->ras_next_readahead = max(index, ras->ras_next_readahead); + ras->ras_stride_offset = index; + ras->ras_window_start = max(index, ras->ras_window_start); + } + + /* The initial ras_window_len is set to the request size. To avoid + * uselessly reading and discarding pages for random IO the window is + * only increased once per consecutive request received. 
*/ + if ((ras->ras_consecutive_requests > 1 || stride_detect) && + !ras->ras_request_index) + ras_increase_window(inode, ras, ra); + EXIT; +out_unlock: + RAS_CDEBUG(ras); + ras->ras_request_index++; + spin_unlock(&ras->ras_lock); + return; +} + +int ll_writepage(struct page *vmpage, struct writeback_control *wbc) +{ + struct inode *inode = vmpage->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + struct cl_object *clob; + bool redirtied = false; + bool unlocked = false; + int result; + __u16 refcheck; + ENTRY; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + LASSERT(ll_i2dtexp(inode) != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, result = PTR_ERR(env)); + + clob = ll_i2info(inode)->lli_clob; + LASSERT(clob != NULL); + + io = vvp_env_thread_io(env); + io->ci_obj = clob; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result == 0) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + lu_ref_add(&page->cp_reference, "writepage", + current); + cl_page_assume(env, io, page); + result = cl_page_flush(env, io, page); + if (result != 0) { + /* + * Re-dirty page on error so it retries write, + * but not in case when IO has actually + * occurred and completed with an error. + */ + if (!PageError(vmpage)) { + redirty_page_for_writepage(wbc, vmpage); + result = 0; + redirtied = true; + } + } + cl_page_disown(env, io, page); + unlocked = true; + lu_ref_del(&page->cp_reference, + "writepage", current); + cl_page_put(env, page); + } else { + result = PTR_ERR(page); + } + } + cl_io_fini(env, io); + + if (redirtied && wbc->sync_mode == WB_SYNC_ALL) { + loff_t offset = cl_offset(clob, vmpage->index); + + /* Flush page failed because the extent is being written out. + * Wait for the write of extent to be finished to avoid + * breaking kernel which assumes ->writepage should mark + * PageWriteback or clean the page. */ + result = cl_sync_file_range(inode, offset, + offset + PAGE_SIZE - 1, + CL_FSYNC_LOCAL, 1); + if (result > 0) { + /* actually we may have written more than one page. + * decreasing this page because the caller will count + * it. */ + wbc->nr_to_write -= result - 1; + result = 0; + } + } + + cl_env_put(env, &refcheck); + GOTO(out, result); + +out: + if (result < 0) { + if (!lli->lli_async_rc) + lli->lli_async_rc = result; + SetPageError(vmpage); + if (!unlocked) + unlock_page(vmpage); + } + return result; +} + +int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + loff_t start; + loff_t end; + enum cl_fsync_mode mode; + int range_whole = 0; + int result; + ENTRY; + + if (wbc->range_cyclic) { + start = mapping->writeback_index << PAGE_SHIFT; + end = OBD_OBJECT_EOF; + } else { + start = wbc->range_start; + end = wbc->range_end; + if (end == LLONG_MAX) { + end = OBD_OBJECT_EOF; + range_whole = start == 0; + } + } + + mode = CL_FSYNC_NONE; + if (wbc->sync_mode == WB_SYNC_ALL) + mode = CL_FSYNC_LOCAL; + + if (ll_i2info(inode)->lli_clob == NULL) + RETURN(0); + + /* for directio, it would call writepages() to evict cached pages + * inside the IO context of write, which will cause deadlock at + * layout_conf since it waits for active IOs to complete. 
*/ + result = cl_sync_file_range(inode, start, end, mode, 1); + if (result > 0) { + wbc->nr_to_write -= result; + result = 0; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { + if (end == OBD_OBJECT_EOF) + mapping->writeback_index = 0; + else + mapping->writeback_index = (end >> PAGE_SHIFT) + 1; + } + RETURN(result); +} + +struct ll_cl_context *ll_cl_find(struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_cl_context *lcc; + struct ll_cl_context *found = NULL; + + read_lock(&fd->fd_lock); + list_for_each_entry(lcc, &fd->fd_lccs, lcc_list) { + if (lcc->lcc_cookie == current) { + found = lcc; + break; + } + } + read_unlock(&fd->fd_lock); + + return found; +} + +void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io, + enum lcc_type type) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; + + memset(lcc, 0, sizeof(*lcc)); + INIT_LIST_HEAD(&lcc->lcc_list); + lcc->lcc_cookie = current; + lcc->lcc_env = env; + lcc->lcc_io = io; + lcc->lcc_type = type; + + write_lock(&fd->fd_lock); + list_add(&lcc->lcc_list, &fd->fd_lccs); + write_unlock(&fd->fd_lock); +} + +void ll_cl_remove(struct file *file, const struct lu_env *env) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; + + write_lock(&fd->fd_lock); + list_del_init(&lcc->lcc_list); + write_unlock(&fd->fd_lock); +} + +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct file *file) +{ + struct inode *inode = vvp_object_inode(page->cp_obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_readahead_state *ras = &fd->fd_ras; + struct cl_2queue *queue = &io->ci_queue; + struct cl_sync_io *anchor = NULL; + struct vvp_page *vpg; + int rc = 0; + bool uptodate; + ENTRY; + + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); + uptodate = vpg->vpg_defer_uptodate; + + if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && + sbi->ll_ra_info.ra_max_pages > 0 && + !vpg->vpg_ra_updated) { + struct vvp_io *vio = vvp_env_io(env); + enum ras_update_flags flags = 0; + + if (uptodate) + flags |= LL_RAS_HIT; + if (!vio->vui_ra_valid) + flags |= LL_RAS_MMAP; + ras_update(sbi, inode, ras, vvp_index(vpg), flags); + } + + cl_2queue_init(queue); + if (uptodate) { + vpg->vpg_ra_used = 1; + cl_page_export(env, page, 1); + cl_page_disown(env, io, page); + } else { + anchor = &vvp_env_info(env)->vti_anchor; + cl_sync_io_init(anchor, 1, &cl_sync_io_end); + page->cp_sync_io = anchor; + + cl_2queue_add(queue, page); + } + + if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && + sbi->ll_ra_info.ra_max_pages > 0) { + int rc2; + + rc2 = ll_readahead(env, io, &queue->c2_qin, ras, + uptodate); + CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n", + PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg)); + } + + if (queue->c2_qin.pl_nr > 0) { + int count = queue->c2_qin.pl_nr; + rc = cl_io_submit_rw(env, io, CRT_READ, queue); + if (rc == 0) + task_io_account_read(PAGE_SIZE * count); + } + + + if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */ + rc = cl_sync_io_wait(env, anchor, 0); + + cl_page_assume(env, io, page); + cl_page_list_del(env, &queue->c2_qout, page); + + if (!PageUptodate(cl_page_vmpage(page))) { + /* Failed to read a mirror, discard this page so that + * new page can be created with new mirror. 
+ * + * TODO: this is not needed after page reinit + * route is implemented */ + cl_page_discard(env, io, page); + } + cl_page_disown(env, io, page); + } + + /* TODO: discard all pages until page reinit route is implemented */ + cl_page_list_discard(env, io, &queue->c2_qin); + + /* Unlock unsent read pages in case of error. */ + cl_page_list_disown(env, io, &queue->c2_qin); + + cl_2queue_fini(env, queue); + + RETURN(rc); +} + +int ll_readpage(struct file *file, struct page *vmpage) +{ + struct inode *inode = file_inode(file); + struct cl_object *clob = ll_i2info(inode)->lli_clob; + struct ll_cl_context *lcc; + const struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_page *page; + int result; + ENTRY; + + lcc = ll_cl_find(file); + if (lcc != NULL) { + env = lcc->lcc_env; + io = lcc->lcc_io; + } + + if (io == NULL) { /* fast read */ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_readahead_state *ras = &fd->fd_ras; + struct lu_env *local_env = NULL; + unsigned long fast_read_pages = + max(RA_REMAIN_WINDOW_MIN, ras->ras_rpc_size); + struct vvp_page *vpg; + + result = -ENODATA; + + /* TODO: need to verify the layout version to make sure + * the page is not invalid due to layout change. */ + page = cl_vmpage_page(vmpage, clob); + if (page == NULL) { + unlock_page(vmpage); + RETURN(result); + } + + if (!env) { + local_env = cl_env_percpu_get(); + env = local_env; + } + + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); + if (vpg->vpg_defer_uptodate) { + enum ras_update_flags flags = LL_RAS_HIT; + + if (lcc && lcc->lcc_type == LCC_MMAP) + flags |= LL_RAS_MMAP; + + /* For fast read, it updates read ahead state only + * if the page is hit in cache because non cache page + * case will be handled by slow read later. */ + ras_update(ll_i2sbi(inode), inode, ras, vvp_index(vpg), + flags); + /* avoid duplicate ras_update() call */ + vpg->vpg_ra_updated = 1; + + /* Check if we can issue a readahead RPC, if that is + * the case, we can't do fast IO because we will need + * a cl_io to issue the RPC. */ + if (ras->ras_window_start + ras->ras_window_len < + ras->ras_next_readahead + fast_read_pages) { + /* export the page and skip io stack */ + vpg->vpg_ra_used = 1; + cl_page_export(env, page, 1); + result = 0; + } + } + + /* release page refcount before unlocking the page to ensure + * the object won't be destroyed in the calling path of + * cl_page_put(). Please see comment in ll_releasepage(). */ + cl_page_put(env, page); + unlock_page(vmpage); + if (local_env) + cl_env_percpu_put(local_env); + + RETURN(result); + } + + LASSERT(io->ci_state == CIS_IO_GOING); + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + LASSERT(page->cp_type == CPT_CACHEABLE); + if (likely(!PageUptodate(vmpage))) { + cl_page_assume(env, io, page); + + result = ll_io_read_page(env, io, page, file); + } else { + /* Page from a non-object file. */ + unlock_page(vmpage); + result = 0; + } + cl_page_put(env, page); + } else { + unlock_page(vmpage); + result = PTR_ERR(page); + } + RETURN(result); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/rw26.c b/drivers/staging/lustrefsx/lustre/llite/rw26.c new file mode 100644 index 0000000000000..9a1f0b6021baf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/rw26.c @@ -0,0 +1,916 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
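Earlier in this hunk, the fast-read path in ll_readpage() refuses to bypass the cl_io stack when the remaining read-ahead window is smaller than one RPC worth of pages, because a new read-ahead RPC would then be needed and issuing it requires a cl_io. A small stand-alone sketch of that gate (illustrative only, not the patch's code):

/* Illustrative sketch only: fast read is allowed while the read-ahead
 * window still extends at least `fast_read_pages` past the next
 * read-ahead position, i.e. no new read-ahead RPC is imminent. */
#include <stdbool.h>
#include <stdio.h>

static bool fast_read_ok(unsigned long window_start,
                         unsigned long window_len,
                         unsigned long next_readahead,
                         unsigned long fast_read_pages)
{
        /* mirrors: window_start + window_len <
         *          next_readahead + fast_read_pages  -> take slow path */
        return window_start + window_len >=
               next_readahead + fast_read_pages;
}

int main(void)
{
        /* window covers pages [0, 1024), next read-ahead from 512,
         * one RPC is 256 pages: plenty of window left, fast read OK */
        printf("%d\n", fast_read_ok(0, 1024, 512, 256));        /* 1 */
        /* only 128 pages of window left: fall back to the slow path */
        printf("%d\n", fast_read_ok(0, 640, 512, 256));         /* 0 */
        return 0;
}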
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/llite/rw26.c + * + * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_MIGRATE_H +#include +#elif defined(HAVE_MIGRATE_MODE_H) +#include +#endif + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" +#include + +/** + * Implements Linux VM address_space::invalidatepage() method. This method is + * called when the page is truncate from a file, either as a result of + * explicit truncate, or when inode is removed from memory (as a result of + * final iput(), umount, or memory pressure induced icache shrinking). + * + * [0, offset] bytes of the page remain valid (this is for a case of not-page + * aligned truncate). Lustre leaves partially truncated page in the cache, + * relying on struct inode::i_size to limit further accesses. 
+ */ +static void ll_invalidatepage(struct page *vmpage, +#ifdef HAVE_INVALIDATE_RANGE + unsigned int offset, unsigned int length +#else + unsigned long offset +#endif + ) +{ + struct inode *inode; + struct lu_env *env; + struct cl_page *page; + struct cl_object *obj; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + /* + * It is safe to not check anything in invalidatepage/releasepage + * below because they are run with page locked and all our io is + * happening with locked page too + */ +#ifdef HAVE_INVALIDATE_RANGE + if (offset == 0 && length == PAGE_SIZE) { +#else + if (offset == 0) { +#endif + /* See the comment in ll_releasepage() */ + env = cl_env_percpu_get(); + LASSERT(!IS_ERR(env)); + + inode = vmpage->mapping->host; + obj = ll_i2info(inode)->lli_clob; + if (obj != NULL) { + page = cl_vmpage_page(vmpage, obj); + if (page != NULL) { + cl_page_delete(env, page); + cl_page_put(env, page); + } + } else + LASSERT(vmpage->private == 0); + + cl_env_percpu_put(env); + } +} + +#ifdef HAVE_RELEASEPAGE_WITH_INT +#define RELEASEPAGE_ARG_TYPE int +#else +#define RELEASEPAGE_ARG_TYPE gfp_t +#endif +static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask) +{ + struct lu_env *env; + struct cl_object *obj; + struct cl_page *page; + struct address_space *mapping; + int result = 0; + + LASSERT(PageLocked(vmpage)); + if (PageWriteback(vmpage) || PageDirty(vmpage)) + return 0; + + mapping = vmpage->mapping; + if (mapping == NULL) + return 1; + + obj = ll_i2info(mapping->host)->lli_clob; + if (obj == NULL) + return 1; + + /* 1 for caller, 1 for cl_page and 1 for page cache */ + if (page_count(vmpage) > 3) + return 0; + + page = cl_vmpage_page(vmpage, obj); + if (page == NULL) + return 1; + + env = cl_env_percpu_get(); + LASSERT(!IS_ERR(env)); + + if (!cl_page_in_use(page)) { + result = 1; + cl_page_delete(env, page); + } + + /* To use percpu env array, the call path can not be rescheduled; + * otherwise percpu array will be messed if ll_releaspage() called + * again on the same CPU. + * + * If this page holds the last refc of cl_object, the following + * call path may cause reschedule: + * cl_page_put -> cl_page_free -> cl_object_put -> + * lu_object_put -> lu_object_free -> lov_delete_raid0. + * + * However, the kernel can't get rid of this inode until all pages have + * been cleaned up. Now that we hold page lock here, it's pretty safe + * that we won't get into object delete path. 
+ */ + LASSERT(cl_object_refc(obj) > 1); + cl_page_put(env, page); + + cl_env_percpu_put(env); + return result; +} + +#define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL + +static ssize_t +ll_direct_IO_seg(const struct lu_env *env, struct cl_io *io, int rw, + struct inode *inode, size_t size, loff_t file_offset, + struct page **pages, int page_count) +{ + struct cl_page *clp; + struct cl_2queue *queue; + struct cl_object *obj = io->ci_obj; + int i; + ssize_t rc = 0; + size_t page_size = cl_page_size(obj); + size_t orig_size = size; + bool do_io; + int io_pages = 0; + + ENTRY; + queue = &io->ci_queue; + cl_2queue_init(queue); + for (i = 0; i < page_count; i++) { + LASSERT(!(file_offset & (page_size - 1))); + clp = cl_page_find(env, obj, cl_index(obj, file_offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + do_io = true; + + /* check the page type: if the page is a host page, then do + * write directly + */ + if (clp->cp_type == CPT_CACHEABLE) { + struct page *vmpage = cl_page_vmpage(clp); + struct page *src_page; + struct page *dst_page; + void *src; + void *dst; + + src_page = (rw == WRITE) ? pages[i] : vmpage; + dst_page = (rw == WRITE) ? vmpage : pages[i]; + + src = ll_kmap_atomic(src_page, KM_USER0); + dst = ll_kmap_atomic(dst_page, KM_USER1); + memcpy(dst, src, min(page_size, size)); + ll_kunmap_atomic(dst, KM_USER1); + ll_kunmap_atomic(src, KM_USER0); + + /* make sure page will be added to the transfer by + * cl_io_submit()->...->vvp_page_prep_write(). + */ + if (rw == WRITE) + set_page_dirty(vmpage); + + if (rw == READ) { + /* do not issue the page for read, since it + * may reread a ra page which has NOT uptodate + * bit set. + */ + cl_page_disown(env, io, clp); + do_io = false; + } + } + + if (likely(do_io)) { + cl_2queue_add(queue, clp); + + /* + * Set page clip to tell transfer formation engine + * that page has to be sent even if it is beyond KMS. + */ + cl_page_clip(env, clp, 0, min(size, page_size)); + + ++io_pages; + } + + /* drop the reference count for cl_page_find */ + cl_page_put(env, clp); + size -= page_size; + file_offset += page_size; + } + + if (rc == 0 && io_pages) { + rc = cl_io_submit_sync(env, io, + rw == READ ? CRT_READ : CRT_WRITE, + queue, 0); + } + if (rc == 0) + rc = orig_size; + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + RETURN(rc); +} + +/* ll_free_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer */ +static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) +{ + int i; + + for (i = 0; i < npages; i++) { + if (pages[i] == NULL) + break; + if (do_dirty) + set_page_dirty_lock(pages[i]); + put_page(pages[i]); + } + +#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) + kvfree(pages); +#else + OBD_FREE_LARGE(pages, npages * sizeof(*pages)); +#endif +} + +#ifdef KMALLOC_MAX_SIZE +#define MAX_MALLOC KMALLOC_MAX_SIZE +#else +#define MAX_MALLOC (128 * 1024) +#endif + +/* This is the maximum size of a single O_DIRECT request, based on the + * kmalloc limit. We need to fit all of the brw_page structs, each one + * representing PAGE_SIZE worth of user data, into a single buffer, and + * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is + * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. 
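A quick arithmetic check of the figures in the comment above, against the MAX_DIO_SIZE definition that follows. The size of struct brw_page is not given in this file; roughly 24 bytes is assumed here purely because it reproduces the quoted numbers (682MB exactly, and approximately the 22MB figure), and the DT_MAX_BRW_SIZE truncation is ignored. Stand-alone, illustrative C only:

/* Illustrative arithmetic check only -- not part of the patch.
 * Assumes sizeof(struct brw_page) ~= 24 bytes; that value is an
 * assumption chosen to match the figures quoted in the comment. */
#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096;
        unsigned long brw_page_size = 24;       /* assumed */
        unsigned long kmalloc_limits[] = { 128 * 1024, 4UL * 1024 * 1024 };
        int i;

        for (i = 0; i < 2; i++) {
                unsigned long max = kmalloc_limits[i] / brw_page_size *
                                    page_size;

                /* prints ~21 MiB (the "22MB" figure) and ~682 MiB */
                printf("kmalloc %7lu B -> max DIO %lu bytes (~%lu MiB)\n",
                       kmalloc_limits[i], max, max >> 20);
        }
        return 0;
}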
*/ +#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_SIZE) & \ + ~(DT_MAX_BRW_SIZE - 1)) + +#ifndef HAVE_IOV_ITER_RW +# define iov_iter_rw(iter) rw +#endif + +#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) +static ssize_t +ll_direct_IO( +# ifndef HAVE_IOV_ITER_RW + int rw, +# endif + struct kiocb *iocb, struct iov_iter *iter +# ifndef HAVE_DIRECTIO_2ARGS + , loff_t file_offset +# endif + ) +{ +#ifdef HAVE_DIRECTIO_2ARGS + loff_t file_offset = iocb->ki_pos; +#endif + struct ll_cl_context *lcc; + const struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t count = iov_iter_count(iter); + ssize_t tot_bytes = 0, result = 0; + size_t size = MAX_DIO_SIZE; + + /* Check EOF by ourselves */ + if (iov_iter_rw(iter) == READ && file_offset >= i_size_read(inode)) + return 0; + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ + if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK)) + return -EINVAL; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), " + "offset=%lld=%llx, pages %zd (max %lu)\n", + PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_SHIFT, + MAX_DIO_SIZE >> PAGE_SHIFT); + + /* Check that all user buffers are aligned as well */ + if (iov_iter_alignment(iter) & ~PAGE_MASK) + return -EINVAL; + + lcc = ll_cl_find(file); + if (lcc == NULL) + RETURN(-EIO); + + env = lcc->lcc_env; + LASSERT(!IS_ERR(env)); + io = lcc->lcc_io; + LASSERT(io != NULL); + + /* 0. Need locking between buffered and direct access. and race with + * size changing by concurrent truncates and writes. + * 1. Need inode mutex to operate transient pages. + */ + if (iov_iter_rw(iter) == READ) + inode_lock(inode); + + while (iov_iter_count(iter)) { + struct page **pages; + size_t offs; + + count = min_t(size_t, iov_iter_count(iter), size); + if (iov_iter_rw(iter) == READ) { + if (file_offset >= i_size_read(inode)) + break; + + if (file_offset + count > i_size_read(inode)) + count = i_size_read(inode) - file_offset; + } + + result = iov_iter_get_pages_alloc(iter, &pages, count, &offs); + if (likely(result > 0)) { + int n = DIV_ROUND_UP(result + offs, PAGE_SIZE); + + result = ll_direct_IO_seg(env, io, iov_iter_rw(iter), + inode, result, file_offset, + pages, n); + ll_free_user_pages(pages, n, + iov_iter_rw(iter) == READ); + + } + if (unlikely(result <= 0)) { + /* If we can't allocate a large enough buffer + * for the request, shrink it to a smaller + * PAGE_SIZE multiple and try again. + * We should always be able to kmalloc for a + * page worth of page pointers = 4MB on i386. */ + if (result == -ENOMEM && + size > (PAGE_SIZE / sizeof(*pages)) * + PAGE_SIZE) { + size = ((((size / 2) - 1) | + ~PAGE_MASK) + 1) & PAGE_MASK; + CDEBUG(D_VFSTRACE, "DIO size now %zu\n", + size); + continue; + } + + GOTO(out, result); + } + + iov_iter_advance(iter, result); + tot_bytes += result; + file_offset += result; + } +out: + if (iov_iter_rw(iter) == READ) + inode_unlock(inode); + + if (tot_bytes > 0) { + struct vvp_io *vio = vvp_env_io(env); + + /* no commit async for direct IO */ + vio->u.write.vui_written += tot_bytes; + } + + return tot_bytes ? 
: result; +} +#else /* !HAVE_DIRECTIO_ITER && !HAVE_IOV_ITER_RW */ + +static inline int ll_get_user_pages(int rw, unsigned long user_addr, + size_t size, struct page ***pages, + int *max_pages) +{ + int result = -ENOMEM; + + /* set an arbitrary limit to prevent arithmetic overflow */ + if (size > MAX_DIRECTIO_SIZE) { + *pages = NULL; + return -EFBIG; + } + + *max_pages = (user_addr + size + PAGE_SIZE - 1) >> + PAGE_SHIFT; + *max_pages -= user_addr >> PAGE_SHIFT; + + OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages)); + if (*pages) { + mmap_read_lock(current->mm); + result = get_user_pages(current, current->mm, user_addr, + *max_pages, (rw == READ), 0, *pages, + NULL); + mmap_read_unlock(current->mm); + if (unlikely(result <= 0)) + OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages)); + } + + return result; +} + +static ssize_t +ll_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t file_offset, unsigned long nr_segs) +{ + struct ll_cl_context *lcc; + const struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t count = iov_length(iov, nr_segs); + ssize_t tot_bytes = 0, result = 0; + unsigned long seg = 0; + size_t size = MAX_DIO_SIZE; + ENTRY; + + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ + if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK)) + RETURN(-EINVAL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), " + "offset=%lld=%llx, pages %zd (max %lu)\n", + PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_SHIFT, + MAX_DIO_SIZE >> PAGE_SHIFT); + + /* Check that all user buffers are aligned as well */ + for (seg = 0; seg < nr_segs; seg++) { + if (((unsigned long)iov[seg].iov_base & ~PAGE_MASK) || + (iov[seg].iov_len & ~PAGE_MASK)) + RETURN(-EINVAL); + } + + lcc = ll_cl_find(file); + if (lcc == NULL) + RETURN(-EIO); + + env = lcc->lcc_env; + LASSERT(!IS_ERR(env)); + io = lcc->lcc_io; + LASSERT(io != NULL); + + for (seg = 0; seg < nr_segs; seg++) { + size_t iov_left = iov[seg].iov_len; + unsigned long user_addr = (unsigned long)iov[seg].iov_base; + + if (rw == READ) { + if (file_offset >= i_size_read(inode)) + break; + if (file_offset + iov_left > i_size_read(inode)) + iov_left = i_size_read(inode) - file_offset; + } + + while (iov_left > 0) { + struct page **pages; + int page_count, max_pages = 0; + size_t bytes; + + bytes = min(size, iov_left); + page_count = ll_get_user_pages(rw, user_addr, bytes, + &pages, &max_pages); + if (likely(page_count > 0)) { + if (unlikely(page_count < max_pages)) + bytes = page_count << PAGE_SHIFT; + result = ll_direct_IO_seg(env, io, rw, inode, + bytes, file_offset, + pages, page_count); + ll_free_user_pages(pages, max_pages, rw==READ); + } else if (page_count == 0) { + GOTO(out, result = -EFAULT); + } else { + result = page_count; + } + if (unlikely(result <= 0)) { + /* If we can't allocate a large enough buffer + * for the request, shrink it to a smaller + * PAGE_SIZE multiple and try again. + * We should always be able to kmalloc for a + * page worth of page pointers = 4MB on i386. 
*/ + if (result == -ENOMEM && + size > (PAGE_SIZE / sizeof(*pages)) * + PAGE_SIZE) { + size = ((((size / 2) - 1) | + ~PAGE_MASK) + 1) & + PAGE_MASK; + CDEBUG(D_VFSTRACE, "DIO size now %zu\n", + size); + continue; + } + + GOTO(out, result); + } + + tot_bytes += result; + file_offset += result; + iov_left -= result; + user_addr += result; + } + } +out: + if (tot_bytes > 0) { + struct vvp_io *vio = vvp_env_io(env); + + /* no commit async for direct IO */ + vio->u.write.vui_written += tot_bytes; + } + + RETURN(tot_bytes ? tot_bytes : result); +} +#endif /* HAVE_DIRECTIO_ITER || HAVE_IOV_ITER_RW */ + +/** + * Prepare partially written-to page for a write. + * @pg is owned when passed in and disowned when it returns non-zero result to + * the caller. + */ +static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, struct file *file) +{ + struct cl_attr *attr = vvp_env_thread_attr(env); + struct cl_object *obj = io->ci_obj; + struct vvp_page *vpg = cl_object_page_slice(obj, pg); + loff_t offset = cl_offset(obj, vvp_index(vpg)); + int result; + ENTRY; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result) { + cl_page_disown(env, io, pg); + GOTO(out, result); + } + + /* + * If are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0); + + memset(kaddr, 0, cl_page_size(obj)); + ll_kunmap_atomic(kaddr, KM_USER0); + GOTO(out, result = 0); + } + + if (vpg->vpg_defer_uptodate) { + vpg->vpg_ra_used = 1; + GOTO(out, result = 0); + } + + result = ll_io_read_page(env, io, pg, file); + if (result) + GOTO(out, result); + + /* ll_io_read_page() disowns the page */ + result = cl_page_own(env, io, pg); + if (!result) { + if (!PageUptodate(cl_page_vmpage(pg))) { + cl_page_disown(env, io, pg); + result = -EIO; + } + } else if (result == -ENOENT) { + /* page was truncated */ + result = -EAGAIN; + } + EXIT; + +out: + return result; +} + +static int ll_tiny_write_begin(struct page *vmpage) +{ + /* Page must be present, up to date, dirty, and not in writeback. */ + if (!vmpage || !PageUptodate(vmpage) || !PageDirty(vmpage) || + PageWriteback(vmpage)) + return -ENODATA; + + return 0; +} + +static int ll_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct ll_cl_context *lcc = NULL; + const struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_page *page = NULL; + + struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; + pgoff_t index = pos >> PAGE_SHIFT; + struct page *vmpage = NULL; + unsigned from = pos & (PAGE_SIZE - 1); + unsigned to = from + len; + int result = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); + + lcc = ll_cl_find(file); + if (lcc == NULL) { + vmpage = grab_cache_page_nowait(mapping, index); + result = ll_tiny_write_begin(vmpage); + GOTO(out, result); + } + + env = lcc->lcc_env; + io = lcc->lcc_io; + + if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) { + /* direct IO failed because it couldn't clean up cached pages, + * this causes a problem for mirror write because the cached + * page may belong to another mirror, which will result in + * problem submitting the I/O. 
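The -ENOMEM retry path earlier in this hunk (and its twin in the iov_iter variant) halves the request and rounds the result up to a page multiple with the bit expression ((((size / 2) - 1) | ~PAGE_MASK) + 1) & PAGE_MASK. A stand-alone sketch, assuming the usual PAGE_MASK == ~(PAGE_SIZE - 1), comparing that expression against the straightforward round-up (illustrative only):

/* Illustrative sketch only: the DIO retry path shrinks the request to
 * half its size rounded up to a page multiple; the bit expression and
 * the obvious round-up agree for any size of at least two pages. */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static unsigned long shrink(unsigned long size)
{
        return ((((size / 2) - 1) | ~PAGE_MASK) + 1) & PAGE_MASK;
}

static unsigned long round_up_half(unsigned long size)
{
        unsigned long half = size / 2;

        return (half + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
}

int main(void)
{
        unsigned long size;

        for (size = 2 * PAGE_SIZE; size <= 64 * PAGE_SIZE; size += 1234)
                assert(shrink(size) == round_up_half(size));

        printf("shrink(22MiB) = %lu bytes\n", shrink(22 * 1024 * 1024));
        return 0;
}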
*/ + GOTO(out, result = -EBUSY); + } + +again: + /* To avoid deadlock, try to lock page first. */ + vmpage = grab_cache_page_nowait(mapping, index); + + if (unlikely(vmpage == NULL || + PageDirty(vmpage) || PageWriteback(vmpage))) { + struct vvp_io *vio = vvp_env_io(env); + struct cl_page_list *plist = &vio->u.write.vui_queue; + + /* if the page is already in dirty cache, we have to commit + * the pages right now; otherwise, it may cause deadlock + * because it holds page lock of a dirty page and request for + * more grants. It's okay for the dirty page to be the first + * one in commit page list, though. */ + if (vmpage != NULL && plist->pl_nr > 0) { + unlock_page(vmpage); + put_page(vmpage); + vmpage = NULL; + } + + /* commit pages and then wait for page lock */ + result = vvp_io_write_commit(env, io); + if (result < 0) + GOTO(out, result); + + if (vmpage == NULL) { + vmpage = grab_cache_page_write_begin(mapping, index, + flags); + if (vmpage == NULL) + GOTO(out, result = -ENOMEM); + } + } + + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) + GOTO(out, result = PTR_ERR(page)); + + lcc->lcc_page = page; + lu_ref_add(&page->cp_reference, "cl_io", io); + + cl_page_assume(env, io, page); + if (!PageUptodate(vmpage)) { + /* + * We're completely overwriting an existing page, + * so _don't_ set it up to date until commit_write + */ + if (from == 0 && to == PAGE_SIZE) { + CL_PAGE_HEADER(D_PAGE, env, page, "full page write\n"); + POISON_PAGE(vmpage, 0x11); + } else { + /* TODO: can be optimized at OSC layer to check if it + * is a lockless IO. In that case, it's not necessary + * to read the data. */ + result = ll_prepare_partial_page(env, io, page, file); + if (result) { + /* vmpage should have been unlocked */ + put_page(vmpage); + vmpage = NULL; + + if (result == -EAGAIN) + goto again; + GOTO(out, result); + } + } + } + EXIT; +out: + if (result < 0) { + if (vmpage != NULL) { + unlock_page(vmpage); + put_page(vmpage); + } + /* On tiny_write failure, page and io are always null. */ + if (!IS_ERR_OR_NULL(page)) { + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + if (io) + io->ci_result = result; + } else { + *pagep = vmpage; + *fsdata = lcc; + } + RETURN(result); +} + +static int ll_tiny_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *vmpage) +{ + struct cl_page *clpage = (struct cl_page *) vmpage->private; + loff_t kms = pos+copied; + loff_t to = kms & (PAGE_SIZE-1) ? kms & (PAGE_SIZE-1) : PAGE_SIZE; + __u16 refcheck; + struct lu_env *env = cl_env_get(&refcheck); + int rc = 0; + + ENTRY; + + if (IS_ERR(env)) { + rc = PTR_ERR(env); + goto out; + } + + /* This page is dirty in cache, so it should have a cl_page pointer + * set in vmpage->private. + */ + LASSERT(clpage != NULL); + + if (copied == 0) + goto out_env; + + /* Update the underlying size information in the OSC/LOV objects this + * page is part of. + */ + cl_page_touch(env, clpage, to); + +out_env: + cl_env_put(env, &refcheck); + +out: + /* Must return page unlocked. 
*/ + unlock_page(vmpage); + + RETURN(rc); +} + +static int ll_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *vmpage, void *fsdata) +{ + struct ll_cl_context *lcc = fsdata; + const struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + struct cl_page *page; + unsigned from = pos & (PAGE_SIZE - 1); + bool unplug = false; + int result = 0; + ENTRY; + + put_page(vmpage); + + CDEBUG(D_VFSTRACE, "pos %llu, len %u, copied %u\n", pos, len, copied); + + if (lcc == NULL) { + result = ll_tiny_write_end(file, mapping, pos, len, copied, + vmpage); + GOTO(out, result); + } + + LASSERT(lcc != NULL); + env = lcc->lcc_env; + page = lcc->lcc_page; + io = lcc->lcc_io; + vio = vvp_env_io(env); + + LASSERT(cl_page_is_owned(page, io)); + if (copied > 0) { + struct cl_page_list *plist = &vio->u.write.vui_queue; + + lcc->lcc_page = NULL; /* page will be queued */ + + /* Add it into write queue */ + cl_page_list_add(plist, page); + if (plist->pl_nr == 1) /* first page */ + vio->u.write.vui_from = from; + else + LASSERT(from == 0); + vio->u.write.vui_to = from + copied; + + /* To address the deadlock in balance_dirty_pages() where + * this dirty page may be written back in the same thread. */ + if (PageDirty(vmpage)) + unplug = true; + + /* We may have one full RPC, commit it soon */ + if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) + unplug = true; + + CL_PAGE_DEBUG(D_VFSTRACE, env, page, + "queued page: %d.\n", plist->pl_nr); + } else { + cl_page_disown(env, io, page); + + lcc->lcc_page = NULL; + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + + /* page list is not contiguous now, commit it now */ + unplug = true; + } + if (unplug || io->u.ci_wr.wr_sync) + result = vvp_io_write_commit(env, io); + + if (result < 0) + io->ci_result = result; + + +out: + RETURN(result >= 0 ? copied : result); +} + +#ifdef CONFIG_MIGRATION +static int ll_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page +#ifdef HAVE_MIGRATEPAGE_4ARGS + , enum migrate_mode mode +#endif + ) +{ + /* Always fail page migration until we have a proper implementation */ + return -EIO; +} +#endif + +const struct address_space_operations ll_aops = { + .readpage = ll_readpage, + .direct_IO = ll_direct_IO, + .writepage = ll_writepage, + .writepages = ll_writepages, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = ll_write_begin, + .write_end = ll_write_end, + .invalidatepage = ll_invalidatepage, + .releasepage = (void *)ll_releasepage, +#ifdef CONFIG_MIGRATION + .migratepage = ll_migratepage, +#endif +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/statahead.c b/drivers/staging/lustrefsx/lustre/llite/statahead.c new file mode 100644 index 0000000000000..e4886ca12f025 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/statahead.c @@ -0,0 +1,1678 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include "llite_internal.h" + +#define SA_OMITTED_ENTRY_MAX 8ULL + +typedef enum { + /** negative values are for error cases */ + SA_ENTRY_INIT = 0, /** init entry */ + SA_ENTRY_SUCC = 1, /** stat succeed */ + SA_ENTRY_INVA = 2, /** invalid entry */ +} se_state_t; + +/* sa_entry is not refcounted: statahead thread allocates it and do async stat, + * and in async stat callback ll_statahead_interpret() will add it into + * sai_interim_entries, later statahead thread will call sa_handle_callback() to + * instantiate entry and move it into sai_entries, and then only scanner process + * can access and free it. */ +struct sa_entry { + /* link into sai_interim_entries or sai_entries */ + struct list_head se_list; + /* link into sai hash table locally */ + struct list_head se_hash; + /* entry index in the sai */ + __u64 se_index; + /* low layer ldlm lock handle */ + __u64 se_handle; + /* entry status */ + se_state_t se_state; + /* entry size, contains name */ + int se_size; + /* pointer to async getattr enqueue info */ + struct md_enqueue_info *se_minfo; + /* pointer to the async getattr request */ + struct ptlrpc_request *se_req; + /* pointer to the target inode */ + struct inode *se_inode; + /* entry name */ + struct qstr se_qstr; + /* entry fid */ + struct lu_fid se_fid; +}; + +static unsigned int sai_generation = 0; +static DEFINE_SPINLOCK(sai_generation_lock); + +static inline int sa_unhashed(struct sa_entry *entry) +{ + return list_empty(&entry->se_hash); +} + +/* sa_entry is ready to use */ +static inline int sa_ready(struct sa_entry *entry) +{ + smp_rmb(); + return (entry->se_state != SA_ENTRY_INIT); +} + +/* hash value to put in sai_cache */ +static inline int sa_hash(int val) +{ + return val & LL_SA_CACHE_MASK; +} + +/* hash entry into sai_cache */ +static inline void +sa_rehash(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + int i = sa_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_add_tail(&entry->se_hash, &sai->sai_cache[i]); + spin_unlock(&sai->sai_cache_lock[i]); +} + +/* unhash entry from sai_cache */ +static inline void +sa_unhash(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + int i = sa_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_del_init(&entry->se_hash); + spin_unlock(&sai->sai_cache_lock[i]); +} + +static inline int agl_should_run(struct ll_statahead_info *sai, + struct inode *inode) +{ + return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid); +} + +static inline struct ll_inode_info * +agl_first_entry(struct ll_statahead_info *sai) +{ + return list_entry(sai->sai_agls.next, struct ll_inode_info, + lli_agl_list); +} + +/* statahead window is full */ +static inline int sa_sent_full(struct ll_statahead_info *sai) +{ + return atomic_read(&sai->sai_cache_count) >= sai->sai_max; +} + +/* got async stat replies */ +static inline int sa_has_callback(struct ll_statahead_info *sai) +{ + 
return !list_empty(&sai->sai_interim_entries); +} + +static inline int agl_list_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_agls); +} + +/** + * (1) hit ratio less than 80% + * or + * (2) consecutive miss more than 8 + * then means low hit. + */ +static inline int sa_low_hit(struct ll_statahead_info *sai) +{ + return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || + (sai->sai_consecutive_miss > 8)); +} + +/* + * if the given index is behind of statahead window more than + * SA_OMITTED_ENTRY_MAX, then it is old. + */ +static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) +{ + return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < + sai->sai_index); +} + +/* allocate sa_entry and hash it to allow scanner process to find it */ +static struct sa_entry * +sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index, + const char *name, int len, const struct lu_fid *fid) +{ + struct ll_inode_info *lli; + struct sa_entry *entry; + int entry_size; + char *dname; + ENTRY; + + entry_size = sizeof(struct sa_entry) + (len & ~3) + 4; + OBD_ALLOC(entry, entry_size); + if (unlikely(entry == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", + len, name, entry, index); + + entry->se_index = index; + + entry->se_state = SA_ENTRY_INIT; + entry->se_size = entry_size; + dname = (char *)entry + sizeof(struct sa_entry); + memcpy(dname, name, len); + dname[len] = 0; + entry->se_qstr.hash = ll_full_name_hash(parent, name, len); + entry->se_qstr.len = len; + entry->se_qstr.name = dname; + entry->se_fid = *fid; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + spin_lock(&lli->lli_sa_lock); + INIT_LIST_HEAD(&entry->se_list); + sa_rehash(sai, entry); + spin_unlock(&lli->lli_sa_lock); + + atomic_inc(&sai->sai_cache_count); + + RETURN(entry); +} + +/* free sa_entry, which should have been unhashed and not in any list */ +static void sa_free(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", + entry->se_qstr.len, entry->se_qstr.name, entry, + entry->se_index); + + LASSERT(list_empty(&entry->se_list)); + LASSERT(sa_unhashed(entry)); + + OBD_FREE(entry, entry->se_size); + atomic_dec(&sai->sai_cache_count); +} + +/* + * find sa_entry by name, used by directory scanner, lock is not needed because + * only scanner can remove the entry from cache. 
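The sa_low_hit() heuristic earlier in this hunk encodes "hit ratio below 80%" as the integer test hit < 4 * miss. A stand-alone check of that equivalence (illustrative only, not the patch's code):

/* Illustrative check only: "hit < 4 * miss" is the integer form of
 * "hit ratio below 80%", since
 *   hit / (hit + miss) < 4/5
 *   <=> 5*hit < 4*hit + 4*miss
 *   <=> hit < 4*miss. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long hit, miss;

        for (hit = 8; hit < 200; hit++)
                for (miss = 1; miss < 200; miss++)
                        assert((hit < 4 * miss) ==
                               (5 * hit < 4 * (hit + miss)));

        printf("hit < 4*miss is exactly 'hit ratio < 80%%'\n");
        return 0;
}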
+ */ +static struct sa_entry * +sa_get(struct ll_statahead_info *sai, const struct qstr *qstr) +{ + struct sa_entry *entry; + int i = sa_hash(qstr->hash); + + list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { + if (entry->se_qstr.hash == qstr->hash && + entry->se_qstr.len == qstr->len && + memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) + return entry; + } + return NULL; +} + +/* unhash and unlink sa_entry, and then free it */ +static inline void +sa_kill(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + + LASSERT(!sa_unhashed(entry)); + LASSERT(!list_empty(&entry->se_list)); + LASSERT(sa_ready(entry)); + + sa_unhash(sai, entry); + + spin_lock(&lli->lli_sa_lock); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + if (entry->se_inode != NULL) + iput(entry->se_inode); + + sa_free(sai, entry); +} + +/* called by scanner after use, sa_entry will be killed */ +static void +sa_put(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + struct sa_entry *tmp, *next; + + if (entry != NULL && entry->se_state == SA_ENTRY_SUCC) { + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); + + sai->sai_hit++; + sai->sai_consecutive_miss = 0; + sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); + } else { + sai->sai_miss++; + sai->sai_consecutive_miss++; + } + + if (entry != NULL) + sa_kill(sai, entry); + + /* kill old completed entries, only scanner process does this, no need + * to lock */ + list_for_each_entry_safe(tmp, next, &sai->sai_entries, se_list) { + if (!is_omitted_entry(sai, tmp->se_index)) + break; + sa_kill(sai, tmp); + } + + wake_up(&sai->sai_thread.t_ctl_waitq); +} + +/* update state and sort add entry to sai_entries by index, return true if + * scanner is waiting on this entry. */ +static bool +__sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) +{ + struct sa_entry *se; + struct list_head *pos = &sai->sai_entries; + __u64 index = entry->se_index; + + LASSERT(!sa_ready(entry)); + LASSERT(list_empty(&entry->se_list)); + + list_for_each_entry_reverse(se, &sai->sai_entries, se_list) { + if (se->se_index < entry->se_index) { + pos = &se->se_list; + break; + } + } + list_add(&entry->se_list, pos); + entry->se_state = ret < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC; + + return (index == sai->sai_index_wait); +} + +/* finish async stat RPC arguments */ +static void sa_fini_data(struct md_enqueue_info *minfo) +{ + ll_unlock_md_op_lsm(&minfo->mi_data); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); +} + +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc); + +/* + * prepare arguments for async stat RPC. 
+ */ +static struct md_enqueue_info * +sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + OBD_ALLOC_PTR(minfo); + if (minfo == NULL) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, + entry->se_qstr.name, entry->se_qstr.len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(minfo); + return (struct md_enqueue_info *)op_data; + } + + if (child == NULL) + op_data->op_fid2 = entry->se_fid; + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_cbdata = entry; + + einfo = &minfo->mi_einfo; + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + + return minfo; +} + +/* + * release resources used in async stat RPC, update entry state and wakeup if + * scanner process it waiting on this entry. + */ +static void +sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + struct md_enqueue_info *minfo = entry->se_minfo; + struct ptlrpc_request *req = entry->se_req; + bool wakeup; + + /* release resources used in RPC */ + if (minfo) { + entry->se_minfo = NULL; + ll_intent_release(&minfo->mi_it); + sa_fini_data(minfo); + } + + if (req) { + entry->se_req = NULL; + ptlrpc_req_finished(req); + } + + spin_lock(&lli->lli_sa_lock); + wakeup = __sa_make_ready(sai, entry, ret); + spin_unlock(&lli->lli_sa_lock); + + if (wakeup) + wake_up(&sai->sai_waitq); +} + +/* insert inode into the list of sai_agls */ +static void ll_agl_add(struct ll_statahead_info *sai, + struct inode *inode, int index) +{ + struct ll_inode_info *child = ll_i2info(inode); + struct ll_inode_info *parent = ll_i2info(sai->sai_dentry->d_inode); + int added = 0; + + spin_lock(&child->lli_agl_lock); + if (child->lli_agl_index == 0) { + child->lli_agl_index = index; + spin_unlock(&child->lli_agl_lock); + + LASSERT(list_empty(&child->lli_agl_list)); + + igrab(inode); + spin_lock(&parent->lli_agl_lock); + if (agl_list_empty(sai)) + added = 1; + list_add_tail(&child->lli_agl_list, &sai->sai_agls); + spin_unlock(&parent->lli_agl_lock); + } else { + spin_unlock(&child->lli_agl_lock); + } + + if (added > 0) + wake_up(&sai->sai_agl_thread.t_ctl_waitq); +} + +/* allocate sai */ +static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry) +{ + struct ll_statahead_info *sai; + struct ll_inode_info *lli = ll_i2info(dentry->d_inode); + int i; + ENTRY; + + OBD_ALLOC_PTR(sai); + if (!sai) + RETURN(NULL); + + sai->sai_dentry = dget(dentry); + atomic_set(&sai->sai_refcount, 1); + sai->sai_max = LL_SA_RPC_MIN; + sai->sai_index = 1; + init_waitqueue_head(&sai->sai_waitq); + init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); + init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); + + INIT_LIST_HEAD(&sai->sai_interim_entries); + INIT_LIST_HEAD(&sai->sai_entries); + INIT_LIST_HEAD(&sai->sai_agls); + + for (i = 0; i < LL_SA_CACHE_SIZE; i++) { + INIT_LIST_HEAD(&sai->sai_cache[i]); + spin_lock_init(&sai->sai_cache_lock[i]); + } + atomic_set(&sai->sai_cache_count, 0); + + spin_lock(&sai_generation_lock); + lli->lli_sa_generation = ++sai_generation; + if (unlikely(sai_generation == 0)) + lli->lli_sa_generation = ++sai_generation; + 
spin_unlock(&sai_generation_lock); + + RETURN(sai); +} + +/* free sai */ +static inline void ll_sai_free(struct ll_statahead_info *sai) +{ + LASSERT(sai->sai_dentry != NULL); + dput(sai->sai_dentry); + OBD_FREE_PTR(sai); +} + +/* + * take refcount of sai if sai for @dir exists, which means statahead is on for + * this directory. + */ +static inline struct ll_statahead_info *ll_sai_get(struct inode *dir) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + + spin_lock(&lli->lli_sa_lock); + sai = lli->lli_sai; + if (sai != NULL) + atomic_inc(&sai->sai_refcount); + spin_unlock(&lli->lli_sa_lock); + + return sai; +} + +/* + * put sai refcount after use, if refcount reaches zero, free sai and sa_entries + * attached to it. + */ +static void ll_sai_put(struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + + if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { + struct sa_entry *entry, *next; + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); + + lli->lli_sai = NULL; + spin_unlock(&lli->lli_sa_lock); + + LASSERT(thread_is_stopped(&sai->sai_thread)); + LASSERT(thread_is_stopped(&sai->sai_agl_thread)); + LASSERT(sai->sai_sent == sai->sai_replied); + LASSERT(!sa_has_callback(sai)); + + list_for_each_entry_safe(entry, next, &sai->sai_entries, + se_list) + sa_kill(sai, entry); + + LASSERT(atomic_read(&sai->sai_cache_count) == 0); + LASSERT(agl_list_empty(sai)); + + ll_sai_free(sai); + atomic_dec(&sbi->ll_sa_running); + } +} + +/* Do NOT forget to drop inode refcount when into sai_agls. */ +static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(inode); + u64 index = lli->lli_agl_index; + ktime_t expire; + int rc; + + ENTRY; + LASSERT(list_empty(&lli->lli_agl_list)); + + /* AGL maybe fall behind statahead with one entry */ + if (is_omitted_entry(sai, index + 1)) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* In case of restore, the MDT has the right size and has already + * sent it back without granting the layout lock, inode is up-to-date. + * Then AGL (async glimpse lock) is useless. + * Also to glimpse we need the layout, in case of a runninh restore + * the MDT holds the layout lock so the glimpse will block up to the + * end of restore (statahead/agl will block) */ + if (ll_file_test_flag(lli, LLIF_FILE_RESTORING)) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* Someone is in glimpse (sync or async), do nothing. */ + rc = down_write_trylock(&lli->lli_glimpse_sem); + if (rc == 0) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* + * Someone triggered glimpse within 1 sec before. + * 1) The former glimpse succeeded with glimpse lock granted by OST, and + * if the lock is still cached on client, AGL needs to do nothing. If + * it is cancelled by other client, AGL maybe cannot obtaion new lock + * for no glimpse callback triggered by AGL. + * 2) The former glimpse succeeded, but OST did not grant glimpse lock. + * Under such case, it is quite possible that the OST will not grant + * glimpse lock for AGL also. + * 3) The former glimpse failed, compared with other two cases, it is + * relative rare. AGL can ignore such case, and it will not muchly + * affect the performance. 
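+ * In all three cases the check below simply skips AGL when the previous
+ * glimpse finished less than one second ago (lli_glimpse_time).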
+ */ + expire = ktime_sub_ns(ktime_get(), NSEC_PER_SEC); + if (ktime_to_ns(lli->lli_glimpse_time) && + ktime_before(expire, lli->lli_glimpse_time)) { + up_write(&lli->lli_glimpse_sem); + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + CDEBUG(D_READA, "Handling (init) async glimpse: inode = " + DFID", idx = %llu\n", PFID(&lli->lli_fid), index); + + cl_agl(inode); + lli->lli_agl_index = 0; + lli->lli_glimpse_time = ktime_get(); + up_write(&lli->lli_glimpse_sem); + + CDEBUG(D_READA, "Handled (init) async glimpse: inode= " + DFID", idx = %llu, rc = %d\n", + PFID(&lli->lli_fid), index, rc); + + iput(inode); + + EXIT; +} + +/* + * prepare inode for sa entry, add it into agl list, now sa_entry is ready + * to be used by scanner process. + */ +static void sa_instantiate(struct ll_statahead_info *sai, + struct sa_entry *entry) +{ + struct inode *dir = sai->sai_dentry->d_inode; + struct inode *child; + struct md_enqueue_info *minfo; + struct lookup_intent *it; + struct ptlrpc_request *req; + struct mdt_body *body; + int rc = 0; + ENTRY; + + LASSERT(entry->se_handle != 0); + + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + child = entry->se_inode; + if (child != NULL) { + /* revalidate; unlinked and re-created with the same name */ + if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, + &body->mbo_fid1))) { + entry->se_inode = NULL; + iput(child); + child = NULL; + } + } + + it->it_lock_handle = entry->se_handle; + rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); + if (rc != 1) + GOTO(out, rc = -EAGAIN); + + rc = ll_prep_inode(&child, req, dir->i_sb, it); + if (rc) + GOTO(out, rc); + + CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", + ll_get_fsname(child->i_sb, NULL, 0), + entry->se_qstr.len, entry->se_qstr.name, + PFID(ll_inode2fid(child)), child); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + + entry->se_inode = child; + + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); + + EXIT; + +out: + /* sa_make_ready() will drop ldlm ibits lock refcount by calling + * ll_intent_drop_lock() in spite of failures. Do not worry about + * calling ll_intent_drop_lock() more than once. */ + sa_make_ready(sai, entry, rc); +} + +/* once there are async stat replies, instantiate sa_entry from replies */ +static void sa_handle_callback(struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + while (sa_has_callback(sai)) { + struct sa_entry *entry; + + spin_lock(&lli->lli_sa_lock); + if (unlikely(!sa_has_callback(sai))) { + spin_unlock(&lli->lli_sa_lock); + break; + } + entry = list_entry(sai->sai_interim_entries.next, + struct sa_entry, se_list); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + sa_instantiate(sai, entry); + } +} + +/* + * callback for async stat RPC, because this is called in ptlrpcd context, we + * only put sa_entry in sai_interim_entries, and wake up statahead thread to + * really prepare inode and instantiate sa_entry later. 
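+ * The statahead thread then drains sai_interim_entries through
+ * sa_handle_callback(), which does the heavy work in sa_instantiate().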
+ */ +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc) +{ + struct lookup_intent *it = &minfo->mi_it; + struct inode *dir = minfo->mi_dir; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct sa_entry *entry = (struct sa_entry *)minfo->mi_cbdata; + __u64 handle = 0; + wait_queue_head_t *waitq = NULL; + ENTRY; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + rc = -ENOENT; + + /* because statahead thread will wait for all inflight RPC to finish, + * sai should be always valid, no need to refcount */ + LASSERT(sai != NULL); + LASSERT(!thread_is_stopped(&sai->sai_thread)); + LASSERT(entry != NULL); + + CDEBUG(D_READA, "sa_entry %.*s rc %d\n", + entry->se_qstr.len, entry->se_qstr.name, rc); + + if (rc != 0) { + ll_intent_release(it); + sa_fini_data(minfo); + } else { + /* release ibits lock ASAP to avoid deadlock when statahead + * thread enqueues lock on parent in readdir and another + * process enqueues lock on child with parent lock held, eg. + * unlink. */ + handle = it->it_lock_handle; + ll_intent_drop_lock(it); + ll_unlock_md_op_lsm(&minfo->mi_data); + } + + spin_lock(&lli->lli_sa_lock); + if (rc != 0) { + if (__sa_make_ready(sai, entry, rc)) + waitq = &sai->sai_waitq; + } else { + entry->se_minfo = minfo; + entry->se_req = ptlrpc_request_addref(req); + /* Release the async ibits lock ASAP to avoid deadlock + * when statahead thread tries to enqueue lock on parent + * for readpage and other tries to enqueue lock on child + * with parent's lock held, for example: unlink. */ + entry->se_handle = handle; + if (!sa_has_callback(sai)) + waitq = &sai->sai_thread.t_ctl_waitq; + + list_add_tail(&entry->se_list, &sai->sai_interim_entries); + } + sai->sai_replied++; + + smp_mb(); + if (waitq != NULL) + wake_up(waitq); + spin_unlock(&lli->lli_sa_lock); + + RETURN(rc); +} + +/* async stat for file not found in dcache */ +static int sa_lookup(struct inode *dir, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + int rc; + ENTRY; + + minfo = sa_prep_data(dir, NULL, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); + if (rc < 0) + sa_fini_data(minfo); + + RETURN(rc); +} + +/** + * async stat for file found in dcache, similar to .revalidate + * + * \retval 1 dentry valid, no RPC sent + * \retval 0 dentry invalid, will send async stat RPC + * \retval negative number upon error + */ +static int sa_revalidate(struct inode *dir, struct sa_entry *entry, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = 0 }; + struct md_enqueue_info *minfo; + int rc; + ENTRY; + + if (unlikely(inode == NULL)) + RETURN(1); + + if (d_mountpoint(dentry)) + RETURN(1); + + minfo = sa_prep_data(dir, inode, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + + entry->se_inode = igrab(inode); + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), + NULL); + if (rc == 1) { + entry->se_handle = it.it_lock_handle; + ll_intent_release(&it); + sa_fini_data(minfo); + RETURN(1); + } + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); + if (rc < 0) { + entry->se_inode = NULL; + iput(inode); + sa_fini_data(minfo); + } + + RETURN(rc); +} + +/* async stat for file with @name */ +static void sa_statahead(struct dentry *parent, const char *name, int len, + const struct lu_fid *fid) +{ + struct inode *dir = parent->d_inode; + struct ll_inode_info *lli = 
ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *dentry = NULL; + struct sa_entry *entry; + int rc; + ENTRY; + + entry = sa_alloc(parent, sai, sai->sai_index, name, len, fid); + if (IS_ERR(entry)) + RETURN_EXIT; + + dentry = d_lookup(parent, &entry->se_qstr); + if (!dentry) { + rc = sa_lookup(dir, entry); + } else { + rc = sa_revalidate(dir, entry, dentry); + if (rc == 1 && agl_should_run(sai, dentry->d_inode)) + ll_agl_add(sai, dentry->d_inode, entry->se_index); + } + + if (dentry != NULL) + dput(dentry); + + if (rc != 0) + sa_make_ready(sai, entry, rc); + else + sai->sai_sent++; + + sai->sai_index++; + + EXIT; +} + +/* async glimpse (agl) thread main function */ +static int ll_agl_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai; + struct ptlrpc_thread *thread; + struct l_wait_info lwi = { 0 }; + ENTRY; + + + sai = ll_sai_get(dir); + thread = &sai->sai_agl_thread; + thread->t_pid = current_pid(); + CDEBUG(D_READA, "agl thread started: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + + atomic_inc(&sbi->ll_agl_total); + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 1; + if (thread_is_init(thread)) + /* If someone else has changed the thread state + * (e.g. already changed to SVC_STOPPING), we can't just + * blindly overwrite that setting. */ + thread_set_flags(thread, SVC_RUNNING); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + + while (1) { + l_wait_event(thread->t_ctl_waitq, + !agl_list_empty(sai) || + !thread_is_running(thread), + &lwi); + + if (!thread_is_running(thread)) + break; + + spin_lock(&plli->lli_agl_lock); + /* The statahead thread maybe help to process AGL entries, + * so check whether list empty again. 
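+ * The list may already have been drained between the wakeup and taking
+ * lli_agl_lock here.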
*/ + if (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + cond_resched(); + } else { + spin_unlock(&plli->lli_agl_lock); + } + } + + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 0; + while (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + clli->lli_agl_index = 0; + iput(&clli->lli_vfs_inode); + spin_lock(&plli->lli_agl_lock); + } + thread_set_flags(thread, SVC_STOPPED); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + ll_sai_put(sai); + CDEBUG(D_READA, "agl thread stopped: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + RETURN(0); +} + +/* start agl thread */ +static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) +{ + struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct l_wait_info lwi = { 0 }; + struct ll_inode_info *plli; + struct task_struct *task; + ENTRY; + + CDEBUG(D_READA, "start agl thread: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + + plli = ll_i2info(parent->d_inode); + task = kthread_run(ll_agl_thread, parent, + "ll_agl_%u", plli->lli_opendir_pid); + if (IS_ERR(task)) { + CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); + thread_set_flags(thread, SVC_STOPPED); + RETURN_EXIT; + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + EXIT; +} + +/* statahead thread main function */ +static int ll_statahead_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai; + struct ptlrpc_thread *sa_thread; + struct ptlrpc_thread *agl_thread; + int first = 0; + struct md_op_data *op_data; + struct ll_dir_chain chain; + struct l_wait_info lwi = { 0 }; + struct page *page = NULL; + __u64 pos = 0; + int rc = 0; + ENTRY; + + sai = ll_sai_get(dir); + sa_thread = &sai->sai_thread; + agl_thread = &sai->sai_agl_thread; + sa_thread->t_pid = current_pid(); + CDEBUG(D_READA, "statahead thread starting: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + + OBD_ALLOC_PTR(op_data); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + if (sbi->ll_flags & LL_SBI_AGL_ENABLED) + ll_start_agl(parent, sai); + + atomic_inc(&sbi->ll_sa_total); + spin_lock(&lli->lli_sa_lock); + if (thread_is_init(sa_thread)) + /* If someone else has changed the thread state + * (e.g. already changed to SVC_STOPPING), we can't just + * blindly overwrite that setting. 
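+ * Only promote SVC_INIT to SVC_RUNNING; any other state is left as is.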
*/ + thread_set_flags(sa_thread, SVC_RUNNING); + spin_unlock(&lli->lli_sa_lock); + wake_up(&sa_thread->t_ctl_waitq); + + ll_dir_chain_init(&chain); + while (pos != MDS_DIR_END_OFF && thread_is_running(sa_thread)) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + op_data = ll_prep_md_op_data(op_data, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + break; + } + + sai->sai_in_readpage = 1; + page = ll_get_dir_page(dir, op_data, pos, &chain); + ll_unlock_md_op_lsm(op_data); + sai->sai_in_readpage = 0; + if (IS_ERR(page)) { + rc = PTR_ERR(page); + CDEBUG(D_READA, "error reading dir "DFID" at %llu" + "/%llu opendir_pid = %u: rc = %d\n", + PFID(ll_inode2fid(dir)), pos, sai->sai_index, + lli->lli_opendir_pid, rc); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); + ent != NULL && thread_is_running(sa_thread) && + !sa_low_hit(sai); + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + struct lu_fid fid; + + hash = le64_to_cpu(ent->lde_hash); + if (unlikely(hash < pos)) + /* + * Skip until we find target hash value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * Skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) { + /* + * skip "." + */ + continue; + } else if (name[1] == '.' && namelen == 2) { + /* + * skip ".." + */ + continue; + } else if (!sai->sai_ls_all) { + /* + * skip hidden files. + */ + sai->sai_skip_hidden++; + continue; + } + } + + /* + * don't stat-ahead first entry. + */ + if (unlikely(++first == 1)) + continue; + + fid_le_to_cpu(&fid, &ent->lde_fid); + + /* wait for spare statahead window */ + do { + l_wait_event(sa_thread->t_ctl_waitq, + !sa_sent_full(sai) || + sa_has_callback(sai) || + !agl_list_empty(sai) || + !thread_is_running(sa_thread), + &lwi); + + sa_handle_callback(sai); + + spin_lock(&lli->lli_agl_lock); + while (sa_sent_full(sai) && + !agl_list_empty(sai)) { + struct ll_inode_info *clli; + + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&lli->lli_agl_lock); + + ll_agl_trigger(&clli->lli_vfs_inode, + sai); + cond_resched(); + spin_lock(&lli->lli_agl_lock); + } + spin_unlock(&lli->lli_agl_lock); + } while (sa_sent_full(sai) && + thread_is_running(sa_thread)); + + sa_statahead(parent, name, namelen, &fid); + } + + pos = le64_to_cpu(dp->ldp_hash_end); + ll_release_page(dir, page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + + if (sa_low_hit(sai)) { + rc = -EFAULT; + atomic_inc(&sbi->ll_sa_wrong); + CDEBUG(D_READA, "Statahead for dir "DFID" hit " + "ratio too low: hit/miss %llu/%llu" + ", sent/replied %llu/%llu, stopping " + "statahead thread: pid %d\n", + PFID(&lli->lli_fid), sai->sai_hit, + sai->sai_miss, sai->sai_sent, + sai->sai_replied, current_pid()); + break; + } + } + ll_dir_chain_fini(&chain); + ll_finish_md_op_data(op_data); + + if (rc < 0) { + spin_lock(&lli->lli_sa_lock); + thread_set_flags(sa_thread, SVC_STOPPING); + lli->lli_sa_enabled = 0; + spin_unlock(&lli->lli_sa_lock); + } + + /* statahead is finished, but statahead entries need to be cached, wait + * for file release to stop me. 
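+	 * ll_deauthorize_statahead() sets SVC_STOPPING and wakes this
+	 * waitqueue when the directory handle is closed.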
*/ + while (thread_is_running(sa_thread)) { + l_wait_event(sa_thread->t_ctl_waitq, + sa_has_callback(sai) || + !thread_is_running(sa_thread), + &lwi); + + sa_handle_callback(sai); + } + + EXIT; +out: + if (sai->sai_agl_valid) { + spin_lock(&lli->lli_agl_lock); + thread_set_flags(agl_thread, SVC_STOPPING); + spin_unlock(&lli->lli_agl_lock); + wake_up(&agl_thread->t_ctl_waitq); + + CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", + sai, (unsigned int)agl_thread->t_pid); + l_wait_event(agl_thread->t_ctl_waitq, + thread_is_stopped(agl_thread), + &lwi); + } else { + /* Set agl_thread flags anyway. */ + thread_set_flags(agl_thread, SVC_STOPPED); + } + + /* wait for inflight statahead RPCs to finish, and then we can free sai + * safely because statahead RPC will access sai data */ + while (sai->sai_sent != sai->sai_replied) { + /* in case we're not woken up, timeout wait */ + lwi = LWI_TIMEOUT(msecs_to_jiffies(MSEC_PER_SEC >> 3), + NULL, NULL); + l_wait_event(sa_thread->t_ctl_waitq, + sai->sai_sent == sai->sai_replied, &lwi); + } + + /* release resources held by statahead RPCs */ + sa_handle_callback(sai); + + spin_lock(&lli->lli_sa_lock); + thread_set_flags(sa_thread, SVC_STOPPED); + spin_unlock(&lli->lli_sa_lock); + + CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + + wake_up(&sai->sai_waitq); + wake_up(&sa_thread->t_ctl_waitq); + ll_sai_put(sai); + + return rc; +} + +/* authorize opened dir handle @key to statahead */ +void ll_authorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL) { + /* + * if lli_sai is not NULL, it means previous statahead is not + * finished yet, we'd better not start a new statahead for now. + */ + LASSERT(lli->lli_opendir_pid == 0); + lli->lli_opendir_key = key; + lli->lli_opendir_pid = current_pid(); + lli->lli_sa_enabled = 1; + } + spin_unlock(&lli->lli_sa_lock); +} + +/* + * deauthorize opened dir handle @key to statahead, and notify statahead thread + * to quit if it's running. + */ +void ll_deauthorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai; + + LASSERT(lli->lli_opendir_key == key); + LASSERT(lli->lli_opendir_pid != 0); + + CDEBUG(D_READA, "deauthorize statahead for "DFID"\n", + PFID(&lli->lli_fid)); + + spin_lock(&lli->lli_sa_lock); + lli->lli_opendir_key = NULL; + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + sai = lli->lli_sai; + if (sai != NULL && thread_is_running(&sai->sai_thread)) { + /* + * statahead thread may not quit yet because it needs to cache + * entries, now it's time to tell it to quit. + * + * In case sai is released, wake_up() is called inside spinlock, + * so we have to call smp_mb() explicitely to serialize ops. + */ + thread_set_flags(&sai->sai_thread, SVC_STOPPING); + smp_mb(); + wake_up(&sai->sai_thread.t_ctl_waitq); + } + spin_unlock(&lli->lli_sa_lock); +} + +enum { + /** + * not first dirent, or is "." + */ + LS_NOT_FIRST_DE = 0, + /** + * the first non-hidden dirent + */ + LS_FIRST_DE, + /** + * the first hidden dirent, that is "." 
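+	 * (i.e. the listing includes hidden entries, as with "ls -a"); this
+	 * sets sai_ls_all in start_statahead_thread()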
+ */ + LS_FIRST_DOT_DE +}; + +/* file is first dirent under @dir */ +static int is_first_dirent(struct inode *dir, struct dentry *dentry) +{ + struct ll_dir_chain chain; + struct qstr *target = &dentry->d_name; + struct md_op_data *op_data; + int dot_de; + struct page *page = NULL; + int rc = LS_NOT_FIRST_DE; + __u64 pos = 0; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + /** + *FIXME choose the start offset of the readdir + */ + + ll_dir_chain_init(&chain); + page = ll_get_dir_page(dir, op_data, 0, &chain); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + struct ll_inode_info *lli = ll_i2info(dir); + + rc = PTR_ERR(page); + CERROR("%s: reading dir "DFID" at %llu" + "opendir_pid = %u : rc = %d\n", + ll_get_fsname(dir->i_sb, NULL, 0), + PFID(ll_inode2fid(dir)), pos, + lli->lli_opendir_pid, rc); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + /* The ll_get_dir_page() can return any page containing + * the given hash which may be not the start hash. */ + if (unlikely(hash < pos)) + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) + /* + * skip "." + */ + continue; + else if (name[1] == '.' && namelen == 2) + /* + * skip ".." + */ + continue; + else + dot_de = 1; + } else { + dot_de = 0; + } + + if (dot_de && target->name[0] != '.') { + CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", + target->len, target->name, + namelen, name); + continue; + } + + if (target->len != namelen || + memcmp(target->name, name, namelen) != 0) + rc = LS_NOT_FIRST_DE; + else if (!dot_de) + rc = LS_FIRST_DE; + else + rc = LS_FIRST_DOT_DE; + + ll_release_page(dir, page, false); + GOTO(out, rc); + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + ll_release_page(dir, page, false); + GOTO(out, rc); + } else { + /* + * chain is exhausted + * Normal case: continue to the next page. + */ + ll_release_page(dir, page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, op_data, pos, &chain); + } + } + EXIT; +out: + ll_dir_chain_fini(&chain); + ll_finish_md_op_data(op_data); + return rc; +} + +/** + * revalidate @dentryp from statahead cache + * + * \param[in] dir parent directory + * \param[in] sai sai structure + * \param[out] dentryp pointer to dentry which will be revalidated + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success, dentry is saved in @dentryp + * \retval 0 if revalidation failed (no proper lock on client) + * \retval negative number upon error + */ +static int revalidate_statahead_dentry(struct inode *dir, + struct ll_statahead_info *sai, + struct dentry **dentryp, + bool unplug) +{ + struct sa_entry *entry = NULL; + struct l_wait_info lwi = { 0 }; + struct ll_dentry_data *ldd; + struct ll_inode_info *lli = ll_i2info(dir); + int rc = 0; + ENTRY; + + if ((*dentryp)->d_name.name[0] == '.') { + if (sai->sai_ls_all || + sai->sai_miss_hidden >= sai->sai_skip_hidden) { + /* + * Hidden dentry is the first one, or statahead + * thread does not skip so many hidden dentries + * before "sai_ls_all" enabled as below. 
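+			 * In this case fall through and check the statahead
+			 * cache as usual.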
+ */ + } else { + if (!sai->sai_ls_all) + /* + * It maybe because hidden dentry is not + * the first one, "sai_ls_all" was not + * set, then "ls -al" missed. Enable + * "sai_ls_all" for such case. + */ + sai->sai_ls_all = 1; + + /* + * Such "getattr" has been skipped before + * "sai_ls_all" enabled as above. + */ + sai->sai_miss_hidden++; + RETURN(-EAGAIN); + } + } + + if (unplug) + GOTO(out, rc = 1); + + entry = sa_get(sai, &(*dentryp)->d_name); + if (entry == NULL) + GOTO(out, rc = -EAGAIN); + + /* if statahead is busy in readdir, help it do post-work */ + if (!sa_ready(entry) && sai->sai_in_readpage) + sa_handle_callback(sai); + + if (!sa_ready(entry)) { + spin_lock(&lli->lli_sa_lock); + sai->sai_index_wait = entry->se_index; + spin_unlock(&lli->lli_sa_lock); + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sai->sai_waitq, sa_ready(entry), &lwi); + if (rc < 0) { + /* + * entry may not be ready, so it may be used by inflight + * statahead RPC, don't free it. + */ + entry = NULL; + GOTO(out, rc = -EAGAIN); + } + } + + if (entry->se_state == SA_ENTRY_SUCC && entry->se_inode != NULL) { + struct inode *inode = entry->se_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = + entry->se_handle }; + __u64 bits; + + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, + ll_inode2fid(inode), &bits); + if (rc == 1) { + if ((*dentryp)->d_inode == NULL) { + struct dentry *alias; + + alias = ll_splice_alias(inode, *dentryp); + if (IS_ERR(alias)) { + ll_intent_release(&it); + GOTO(out, rc = PTR_ERR(alias)); + } + *dentryp = alias; + /* statahead prepared this inode, transfer inode + * refcount from sa_entry to dentry */ + entry->se_inode = NULL; + } else if ((*dentryp)->d_inode != inode) { + /* revalidate, but inode is recreated */ + CDEBUG(D_READA, + "%s: stale dentry %.*s inode " + DFID", statahead inode "DFID + "\n", + ll_get_fsname((*dentryp)->d_inode->i_sb, + NULL, 0), + (*dentryp)->d_name.len, + (*dentryp)->d_name.name, + PFID(ll_inode2fid((*dentryp)->d_inode)), + PFID(ll_inode2fid(inode))); + ll_intent_release(&it); + GOTO(out, rc = -ESTALE); + } + + if ((bits & MDS_INODELOCK_LOOKUP) && + d_lustre_invalid(*dentryp)) { + d_lustre_revalidate(*dentryp); + ll_update_dir_depth(dir, (*dentryp)->d_inode); + } + + ll_intent_release(&it); + } + } +out: + /* + * statahead cached sa_entry can be used only once, and will be killed + * right after use, so if lookup/revalidate accessed statahead cache, + * set dentry ldd_sa_generation to parent lli_sa_generation, later if we + * stat this file again, we know we've done statahead before, see + * dentry_may_statahead(). + */ + ldd = ll_d2d(*dentryp); + /* ldd can be NULL if llite lookup failed. */ + if (ldd != NULL) + ldd->lld_sa_generation = lli->lli_sa_generation; + sa_put(sai, entry); + + RETURN(rc); +} + +/** + * start statahead thread + * + * \param[in] dir parent directory + * \param[in] dentry dentry that triggers statahead, normally the first + * dirent under @dir + * \retval -EAGAIN on success, because when this function is + * called, it's already in lookup call, so client should + * do it itself instead of waiting for statahead thread + * to do it asynchronously. 
+ * \retval negative number upon error + */ +static int start_statahead_thread(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + struct dentry *parent = dentry->d_parent; + struct ptlrpc_thread *thread; + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + struct ll_sb_info *sbi = ll_i2sbi(parent->d_inode); + int first = LS_FIRST_DE; + int rc = 0; + ENTRY; + + /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ + first = is_first_dirent(dir, dentry); + if (first == LS_NOT_FIRST_DE) + /* It is not "ls -{a}l" operation, no need statahead for it. */ + GOTO(out, rc = -EFAULT); + + if (unlikely(atomic_inc_return(&sbi->ll_sa_running) > + sbi->ll_sa_running_max)) { + CDEBUG(D_READA, + "Too many concurrent statahead instances, " + "avoid new statahead instance temporarily.\n"); + GOTO(out, rc = -EMFILE); + } + + sai = ll_sai_alloc(parent); + if (sai == NULL) + GOTO(out, rc = -ENOMEM); + + sai->sai_ls_all = (first == LS_FIRST_DOT_DE); + + /* if current lli_opendir_key was deauthorized, or dir re-opened by + * another process, don't start statahead, otherwise the newly spawned + * statahead thread won't be notified to quit. */ + spin_lock(&lli->lli_sa_lock); + if (unlikely(lli->lli_sai != NULL || + lli->lli_opendir_key == NULL || + lli->lli_opendir_pid != current->pid)) { + spin_unlock(&lli->lli_sa_lock); + GOTO(out, rc = -EPERM); + } + lli->lli_sai = sai; + spin_unlock(&lli->lli_sa_lock); + + CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %.*s]\n", + current_pid(), parent->d_name.len, parent->d_name.name); + + task = kthread_run(ll_statahead_thread, parent, "ll_sa_%u", + lli->lli_opendir_pid); + thread = &sai->sai_thread; + if (IS_ERR(task)) { + spin_lock(&lli->lli_sa_lock); + lli->lli_sai = NULL; + spin_unlock(&lli->lli_sa_lock); + rc = PTR_ERR(task); + CERROR("can't start ll_sa thread, rc: %d\n", rc); + GOTO(out, rc); + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + ll_sai_put(sai); + + /* + * We don't stat-ahead for the first dirent since we are already in + * lookup. + */ + RETURN(-EAGAIN); + +out: + /* once we start statahead thread failed, disable statahead so that + * subsequent stat won't waste time to try it. */ + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_pid == current->pid) + lli->lli_sa_enabled = 0; + spin_unlock(&lli->lli_sa_lock); + + if (sai != NULL) + ll_sai_free(sai); + if (first != LS_NOT_FIRST_DE) + atomic_dec(&sbi->ll_sa_running); + + RETURN(rc); +} + +/** + * statahead entry function, this is called when client getattr on a file, it + * will start statahead thread if this is the first dir entry, else revalidate + * dentry from statahead cache. 
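+ *
+ * A minimal sketch of the expected call pattern from a lookup path (the
+ * surrounding caller code is illustrative only):
+ *
+ *	if (dentry_may_statahead(dir, dentry))
+ *		rc = ll_statahead(dir, &dentry, unplug);
+ *
+ * The sai is looked up from the parent directory with ll_sai_get(); if none
+ * is attached yet, start_statahead_thread() is called instead.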
+ * + * \param[in] dir parent directory + * \param[out] dentryp dentry to getattr + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success + * \retval 0 revalidation from statahead cache failed, caller needs + * to getattr from server directly + * \retval negative number on error, caller often ignores this and + * then getattr from server + */ +int ll_statahead(struct inode *dir, struct dentry **dentryp, bool unplug) +{ + struct ll_statahead_info *sai; + + sai = ll_sai_get(dir); + if (sai != NULL) { + int rc; + + rc = revalidate_statahead_dentry(dir, sai, dentryp, unplug); + CDEBUG(D_READA, "revalidate statahead %.*s: %d.\n", + (*dentryp)->d_name.len, (*dentryp)->d_name.name, rc); + ll_sai_put(sai); + return rc; + } + return start_statahead_thread(dir, *dentryp); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/super25.c b/drivers/staging/lustrefsx/lustre/llite/super25.c new file mode 100644 index 0000000000000..8fbbea24c9ce2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/super25.c @@ -0,0 +1,221 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +static struct kmem_cache *ll_inode_cachep; + +static struct inode *ll_alloc_inode(struct super_block *sb) +{ + struct ll_inode_info *lli; + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1); + OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, GFP_NOFS); + if (lli == NULL) + return NULL; + + inode_init_once(&lli->lli_vfs_inode); + return &lli->lli_vfs_inode; +} + +#ifdef HAVE_INODE_I_RCU +static void ll_inode_destroy_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ll_inode_info *ptr = ll_i2info(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} + +static void ll_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ll_inode_destroy_callback); +} +#else +static void ll_destroy_inode(struct inode *inode) +{ + struct ll_inode_info *ptr = ll_i2info(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} +#endif + +static int ll_drop_inode(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + if (!sbi->ll_inode_cache_enabled) + return 1; + + return generic_drop_inode(inode); +} + +/* exported operations */ +struct super_operations lustre_super_operations = +{ + .alloc_inode = ll_alloc_inode, + .destroy_inode = ll_destroy_inode, + .drop_inode = ll_drop_inode, +#ifdef HAVE_SBOPS_EVICT_INODE + .evict_inode = ll_delete_inode, +#else + .clear_inode = ll_clear_inode, + .delete_inode = ll_delete_inode, +#endif + .put_super = ll_put_super, + .statfs = ll_statfs, + .umount_begin = ll_umount_begin, + .remount_fs = ll_remount_fs, + .show_options = ll_show_options, +}; + +static int __init lustre_init(void) +{ + struct lnet_process_id lnet_id; + struct timespec64 ts; + int i, rc, seed[2]; + unsigned long lustre_inode_cache_flags; + + CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1); + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre client module (%p).\n", + &lustre_super_operations); + + lustre_inode_cache_flags = SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD; +#ifdef SLAB_ACCOUNT + lustre_inode_cache_flags |= SLAB_ACCOUNT; +#endif + + ll_inode_cachep = kmem_cache_create("lustre_inode_cache", + sizeof(struct ll_inode_info), + 0, lustre_inode_cache_flags, NULL); + if (ll_inode_cachep == NULL) + GOTO(out_cache, rc = -ENOMEM); + + ll_file_data_slab = kmem_cache_create("ll_file_data", + sizeof(struct ll_file_data), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ll_file_data_slab == NULL) + GOTO(out_cache, rc = -ENOMEM); + + rc = llite_tunables_register(); + if (rc) + GOTO(out_cache, rc); + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy. The NID for this + * node gives the most entropy in the low bits. 
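+	 * Each local non-loopback NID address is XORed into seed[0] below,
+	 * then both seeds are mixed with the current time for cfs_srand().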
*/ + for (i = 0;; i++) { + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (lnet_id.nid != LNET_NID_LO_0) + seed[0] ^= LNET_NIDADDR(lnet_id.nid); + } + + ktime_get_ts64(&ts); + cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); + + rc = vvp_global_init(); + if (rc != 0) + GOTO(out_tunables, rc); + + cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, + LCT_REMEMBER | LCT_NOREF); + if (IS_ERR(cl_inode_fini_env)) + GOTO(out_vvp, rc = PTR_ERR(cl_inode_fini_env)); + + cl_inode_fini_env->le_ctx.lc_cookie = 0x4; + + rc = ll_xattr_init(); + if (rc != 0) + GOTO(out_inode_fini_env, rc); + + lustre_register_client_fill_super(ll_fill_super); + lustre_register_kill_super_cb(ll_kill_super); + + RETURN(0); + +out_inode_fini_env: + cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); +out_vvp: + vvp_global_fini(); +out_tunables: + llite_tunables_unregister(); +out_cache: + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); + return rc; +} + +static void __exit lustre_exit(void) +{ + lustre_register_client_fill_super(NULL); + lustre_register_kill_super_cb(NULL); + + llite_tunables_unregister(); + + ll_xattr_fini(); + cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); + vvp_global_fini(); + +#ifdef HAVE_INODE_I_RCU + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); +#endif + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Client File System"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lustre_init); +module_exit(lustre_exit); diff --git a/drivers/staging/lustrefsx/lustre/llite/symlink.c b/drivers/staging/lustrefsx/lustre/llite/symlink.c new file mode 100644 index 0000000000000..8e12995873cb8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/symlink.c @@ -0,0 +1,242 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#include +#include +#include +#include +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" + +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, char **symname) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc, symlen = i_size_read(inode) + 1; + struct mdt_body *body; + struct md_op_data *op_data; + ENTRY; + + *request = NULL; + + if (lli->lli_symlink_name) { + int print_limit = min_t(int, PAGE_SIZE - 128, symlen); + + *symname = lli->lli_symlink_name; + /* If the total CDEBUG() size is larger than a page, it + * will print a warning to the console, avoid this by + * printing just the last part of the symlink. */ + CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", + print_limit < symlen ? "..." : "", print_limit, + (*symname) + symlen - print_limit, symlen); + RETURN(0); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_LINKNAME; + rc = md_getattr(sbi->ll_md_exp, op_data, request); + ll_finish_md_op_data(op_data); + if (rc) { + if (rc != -ENOENT) + CERROR("%s: inode "DFID": rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), rc); + GOTO (failed, rc); + } + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + if ((body->mbo_valid & OBD_MD_LINKNAME) == 0) { + CERROR("OBD_MD_LINKNAME not set on reply\n"); + GOTO(failed, rc = -EPROTO); + } + + LASSERT(symlen != 0); + if (body->mbo_eadatasize != symlen) { + CERROR("%s: inode "DFID": symlink length %d not expected %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), body->mbo_eadatasize - 1, + symlen - 1); + GOTO(failed, rc = -EPROTO); + } + + *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); + if (*symname == NULL || + strnlen(*symname, symlen) != symlen - 1) { + /* not full/NULL terminated */ + CERROR("%s: inode "DFID": symlink not NULL terminated string" + "of length %d\n", ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), symlen - 1); + GOTO(failed, rc = -EPROTO); + } + + OBD_ALLOC(lli->lli_symlink_name, symlen); + /* do not return an error if we cannot cache the symlink locally */ + if (lli->lli_symlink_name) { + memcpy(lli->lli_symlink_name, *symname, symlen); + *symname = lli->lli_symlink_name; + } + RETURN(0); + +failed: + RETURN (rc); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void ll_put_link(struct dentry *dentry, + struct nameidata *nd, void *cookie) +#else +# ifdef HAVE_IOP_GET_LINK +static void ll_put_link(void *cookie) +# else +static void ll_put_link(struct inode *unused, void *cookie) +# endif +#endif +{ + ptlrpc_req_finished(cookie); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *request = NULL; + int rc; + char *symname = NULL; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + /* Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. 
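+	 * Hence the checks of current->link_count against 6 and 8 below.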
*/ + if (THREAD_SIZE < 8192 && current->link_count >= 6) { + rc = -ELOOP; + } else if (THREAD_SIZE == 8192 && current->link_count >= 8) { + rc = -ELOOP; + } else { + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + } + if (rc) { + ptlrpc_req_finished(request); + request = NULL; + symname = ERR_PTR(rc); + } + + nd_set_link(nd, symname); + /* symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + RETURN(request); +} +#else +# ifdef HAVE_IOP_GET_LINK +static const char *ll_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct ptlrpc_request *request; + char *symname = NULL; + int rc; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op\n"); + if (!dentry) + RETURN(ERR_PTR(-ECHILD)); + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + if (rc < 0) { + ptlrpc_req_finished(request); + return ERR_PTR(rc); + } + + /* symname may contain a pointer to the request message buffer, + * we delay request releasing then. + */ + set_delayed_call(done, ll_put_link, request); + RETURN(symname); +} +# else +static const char *ll_follow_link(struct dentry *dentry, void **cookie) +{ + struct inode *inode = d_inode(dentry); + struct ptlrpc_request *request; + char *symname = NULL; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + if (rc < 0) { + ptlrpc_req_finished(request); + return ERR_PTR(rc); + } + + /* symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + *cookie = request; + RETURN(symname); +} +# endif /* HAVE_IOP_GET_LINK */ +#endif /* HAVE_SYMLINK_OPS_USE_NAMEIDATA */ + +struct inode_operations ll_fast_symlink_inode_operations = { +#ifdef HAVE_IOP_GENERIC_READLINK + .readlink = generic_readlink, +#endif + .setattr = ll_setattr, +#ifdef HAVE_IOP_GET_LINK + .get_link = ll_get_link, +#else + .follow_link = ll_follow_link, + .put_link = ll_put_link, +#endif + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c new file mode 100644 index 0000000000000..d36aed3919268 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c @@ -0,0 +1,620 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_device and cl_device_type implementation for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +/* + * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical + * "llite_" (var. "ll_") prefix. + */ + +static struct kmem_cache *ll_thread_kmem; +struct kmem_cache *vvp_object_kmem; +static struct kmem_cache *vvp_session_kmem; +static struct kmem_cache *vvp_thread_kmem; + +static struct lu_kmem_descr vvp_caches[] = { + { + .ckd_cache = &ll_thread_kmem, + .ckd_name = "ll_thread_kmem", + .ckd_size = sizeof(struct ll_thread_info), + }, + { + .ckd_cache = &vvp_object_kmem, + .ckd_name = "vvp_object_kmem", + .ckd_size = sizeof(struct vvp_object), + }, + { + .ckd_cache = &vvp_session_kmem, + .ckd_name = "vvp_session_kmem", + .ckd_size = sizeof (struct vvp_session) + }, + { + .ckd_cache = &vvp_thread_kmem, + .ckd_name = "vvp_thread_kmem", + .ckd_size = sizeof(struct vvp_thread_info), + }, + { + .ckd_cache = NULL + } +}; + +static void *ll_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ll_thread_info *lti; + + OBD_SLAB_ALLOC_PTR_GFP(lti, ll_thread_kmem, GFP_NOFS); + if (lti == NULL) + lti = ERR_PTR(-ENOMEM); + + return lti; +} + +static void ll_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ll_thread_info *lti = data; + + OBD_SLAB_FREE_PTR(lti, ll_thread_kmem); +} + +struct lu_context_key ll_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = ll_thread_key_init, + .lct_fini = ll_thread_key_fini, +}; + +static void *vvp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void vvp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_session *session = data; + OBD_SLAB_FREE_PTR(session, vvp_session_kmem); +} + +struct lu_context_key vvp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = vvp_session_key_init, + .lct_fini = vvp_session_key_fini +}; + +static void *vvp_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_thread_info *vti; + + OBD_SLAB_ALLOC_PTR_GFP(vti, vvp_thread_kmem, GFP_NOFS); + if (vti == NULL) + vti = ERR_PTR(-ENOMEM); + return vti; +} + +static void vvp_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_thread_info *vti = data; + OBD_SLAB_FREE_PTR(vti, vvp_thread_kmem); +} + +struct lu_context_key vvp_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = vvp_thread_key_init, + .lct_fini = vvp_thread_key_fini, +}; + +/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). 
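+ * They are generated by the LU_TYPE_INIT_FINI() macro below from the three
+ * context keys.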
*/ +LU_TYPE_INIT_FINI(vvp, &ll_thread_key, &vvp_session_key, &vvp_thread_key); + +static const struct lu_device_operations vvp_lu_ops = { + .ldo_object_alloc = vvp_object_alloc +}; + +static struct lu_device *vvp_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct vvp_device *vdv = lu2vvp_dev(d); + struct cl_site *site = lu2cl_site(d->ld_site); + struct lu_device *next = cl2lu_dev(vdv->vdv_next); + + if (d->ld_site != NULL) { + cl_site_fini(site); + OBD_FREE_PTR(site); + } + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(vdv); + return next; +} + +static struct lu_device *vvp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct vvp_device *vdv; + struct lu_device *lud; + struct cl_site *site; + int rc; + ENTRY; + + OBD_ALLOC_PTR(vdv); + if (vdv == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lud = &vdv->vdv_cl.cd_lu_dev; + cl_device_init(&vdv->vdv_cl, t); + vvp2lu_dev(vdv)->ld_ops = &vvp_lu_ops; + + OBD_ALLOC_PTR(site); + if (site != NULL) { + rc = cl_site_init(site, &vdv->vdv_cl); + if (rc == 0) + rc = lu_site_init_finish(&site->cs_lu); + else { + LASSERT(lud->ld_site == NULL); + CERROR("Cannot init lu_site, rc %d.\n", rc); + OBD_FREE_PTR(site); + } + } else + rc = -ENOMEM; + if (rc != 0) { + vvp_device_free(env, lud); + lud = ERR_PTR(rc); + } + RETURN(lud); +} + +static int vvp_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct vvp_device *vdv; + int rc; + ENTRY; + + vdv = lu2vvp_dev(d); + vdv->vdv_next = lu2cl_dev(next); + + LASSERT(d->ld_site != NULL && next->ld_type != NULL); + next->ld_site = d->ld_site; + rc = next->ld_type->ldt_ops->ldto_device_init( + env, next, next->ld_type->ldt_name, NULL); + if (rc == 0) { + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + } + RETURN(rc); +} + +static struct lu_device *vvp_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return cl2lu_dev(lu2vvp_dev(d)->vdv_next); +} + +static const struct lu_device_type_operations vvp_device_type_ops = { + .ldto_init = vvp_type_init, + .ldto_fini = vvp_type_fini, + + .ldto_start = vvp_type_start, + .ldto_stop = vvp_type_stop, + + .ldto_device_alloc = vvp_device_alloc, + .ldto_device_free = vvp_device_free, + .ldto_device_init = vvp_device_init, + .ldto_device_fini = vvp_device_fini, +}; + +struct lu_device_type vvp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_VVP_NAME, + .ldt_ops = &vvp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** + * A mutex serializing calls to vvp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +int vvp_global_init(void) +{ + int rc; + + rc = lu_kmem_init(vvp_caches); + if (rc != 0) + return rc; + + rc = lu_device_type_init(&vvp_device_type); + if (rc != 0) + goto out_kmem; + + return 0; + +out_kmem: + lu_kmem_fini(vvp_caches); + + return rc; +} + +void vvp_global_fini(void) +{ + lu_device_type_fini(&vvp_device_type); + lu_kmem_fini(vvp_caches); +} + +/***************************************************************************** + * + * mirror obd-devices into cl devices. 
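+ * cl_sb_init()/cl_sb_fini() below set up and tear down the client cl-stack
+ * for a mounted super block.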
+ * + */ + +int cl_sb_init(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct cl_device *cl; + struct lu_env *env; + int rc = 0; + __u16 refcheck; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cl = cl_type_setup(env, NULL, &vvp_device_type, + sbi->ll_dt_exp->exp_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + sbi->ll_cl = cl; + sbi->ll_site = cl2lu_dev(cl)->ld_site; + } + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + RETURN(rc); +} + +int cl_sb_fini(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + struct cl_device *cld; + __u16 refcheck; + int result; + + ENTRY; + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cld = sbi->ll_cl; + + if (cld != NULL) { + cl_stack_fini(env, cld); + sbi->ll_cl = NULL; + sbi->ll_site = NULL; + } + cl_env_put(env, &refcheck); + result = 0; + } else { + CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); + result = PTR_ERR(env); + } + + RETURN(result); +} + +/**************************************************************************** + * + * debugfs/lustre/llite/$MNT/dump_page_cache + * + ****************************************************************************/ + +struct vvp_pgcache_id { + unsigned vpi_bucket; + unsigned vpi_depth; + uint32_t vpi_index; + + unsigned vpi_curdep; + struct lu_object_header *vpi_obj; +}; + +struct vvp_seq_private { + struct ll_sb_info *vsp_sbi; + struct lu_env *vsp_env; + u16 vsp_refcheck; + struct cl_object *vsp_clob; + struct vvp_pgcache_id vvp_id; + /* + * prev_pos is the 'pos' of the last object returned + * by ->start of ->next. + */ + loff_t vvp_prev_pos; +}; + +static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct vvp_pgcache_id *id = data; + struct lu_object_header *hdr = cfs_hash_object(hs, hnode); + + if (lu_object_is_dying(hdr)) + return 0; + + if (id->vpi_curdep-- > 0) + return 0; /* continue */ + + cfs_hash_get(hs, hnode); + id->vpi_obj = hdr; + return 1; +} + +static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, + struct lu_device *dev, + struct vvp_pgcache_id *id) +{ + LASSERT(lu_device_is_cl(dev)); + + id->vpi_obj = NULL; + id->vpi_curdep = id->vpi_depth; + + cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, + vvp_pgcache_obj_get, id); + if (id->vpi_obj != NULL) { + struct lu_object *lu_obj; + + lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type); + if (lu_obj != NULL) { + lu_object_ref_add(lu_obj, "dump", current); + return lu2cl(lu_obj); + } + lu_object_put(env, lu_object_top(id->vpi_obj)); + } + return NULL; +} + +static struct page *vvp_pgcache_current(struct vvp_seq_private *priv) +{ + struct lu_device *dev = &priv->vsp_sbi->ll_cl->cd_lu_dev; + + while (1) { + struct inode *inode; + struct page *vmpage; + int nr; + + if (!priv->vsp_clob) { + struct cl_object *clob; + + while ((clob = vvp_pgcache_obj(priv->vsp_env, dev, &priv->vvp_id)) == NULL && + ++(priv->vvp_id.vpi_bucket) < CFS_HASH_NHLIST(dev->ld_site->ls_obj_hash)) + priv->vvp_id.vpi_depth = 0; + if (!clob) + return NULL; + priv->vsp_clob = clob; + priv->vvp_id.vpi_index = 0; + } + + inode = vvp_object_inode(priv->vsp_clob); + nr = find_get_pages_contig(inode->i_mapping, priv->vvp_id.vpi_index, 1, &vmpage); + if (nr > 0) { + priv->vvp_id.vpi_index = vmpage->index; + return vmpage; + } + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + priv->vsp_clob = NULL; + 
priv->vvp_id.vpi_index = 0; + priv->vvp_id.vpi_depth++; + } +} + +#define seq_page_flag(seq, page, flag, has_flags) do { \ + if (test_bit(PG_##flag, &(page)->flags)) { \ + seq_printf(seq, "%s"#flag, has_flags ? "|" : ""); \ + has_flags = 1; \ + } \ +} while(0) + +static void vvp_pgcache_page_show(const struct lu_env *env, + struct seq_file *seq, struct cl_page *page) +{ + struct vvp_page *vpg; + struct page *vmpage; + int has_flags; + + vpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); + vmpage = vpg->vpg_page; + seq_printf(seq, " %5i | %p %p %s %s %s | %p "DFID"(%p) %lu %u [", + 0 /* gen */, + vpg, page, + "none", + vpg->vpg_defer_uptodate ? "du" : "- ", + PageWriteback(vmpage) ? "wb" : "-", + vmpage, + PFID(ll_inode2fid(vmpage->mapping->host)), + vmpage->mapping->host, vmpage->index, + page_count(vmpage)); + has_flags = 0; + seq_page_flag(seq, vmpage, locked, has_flags); + seq_page_flag(seq, vmpage, error, has_flags); + seq_page_flag(seq, vmpage, referenced, has_flags); + seq_page_flag(seq, vmpage, uptodate, has_flags); + seq_page_flag(seq, vmpage, dirty, has_flags); + seq_page_flag(seq, vmpage, writeback, has_flags); + seq_printf(seq, "%s]\n", has_flags ? "" : "-"); +} + +static int vvp_pgcache_show(struct seq_file *f, void *v) +{ + struct vvp_seq_private *priv = f->private; + struct page *vmpage = v; + struct cl_page *page; + + seq_printf(f, "%8lx@" DFID ": ", vmpage->index, + PFID(lu_object_fid(&priv->vsp_clob->co_lu))); + lock_page(vmpage); + page = cl_vmpage_page(vmpage, priv->vsp_clob); + unlock_page(vmpage); + put_page(vmpage); + + if (page) { + vvp_pgcache_page_show(priv->vsp_env, f, page); + cl_page_put(priv->vsp_env, page); + } else { + seq_puts(f, "missing\n"); + } + + return 0; +} + +static void vvp_pgcache_rewind(struct vvp_seq_private *priv) +{ + if (priv->vvp_prev_pos) { + memset(&priv->vvp_id, 0, sizeof(priv->vvp_id)); + priv->vvp_prev_pos = 0; + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", + current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + priv->vsp_clob = NULL; + } +} + +static struct page *vvp_pgcache_next_page(struct vvp_seq_private *priv) +{ + priv->vvp_id.vpi_index += 1; + return vvp_pgcache_current(priv); +} + +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct vvp_seq_private *priv = f->private; + + if (*pos == 0) { + vvp_pgcache_rewind(priv); + } else if (*pos == priv->vvp_prev_pos) { + /* Return the current item */; + } else { + WARN_ON(*pos != priv->vvp_prev_pos + 1); + priv->vvp_id.vpi_index += 1; + } + + priv->vvp_prev_pos = *pos; + return vvp_pgcache_current(priv); +} + +static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct vvp_seq_private *priv = f->private; + + WARN_ON(*pos != priv->vvp_prev_pos); + *pos += 1; + priv->vvp_prev_pos = *pos; + return vvp_pgcache_next_page(priv); +} + +static void vvp_pgcache_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations vvp_pgcache_ops = { + .start = vvp_pgcache_start, + .next = vvp_pgcache_next, + .stop = vvp_pgcache_stop, + .show = vvp_pgcache_show +}; + +static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) +{ + struct vvp_seq_private *priv; + + priv = __seq_open_private(filp, &vvp_pgcache_ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->vsp_sbi = inode->i_private; + priv->vsp_env = cl_env_get(&priv->vsp_refcheck); + priv->vsp_clob = NULL; + memset(&priv->vvp_id, 0, sizeof(priv->vvp_id)); + if (IS_ERR(priv->vsp_env)) { + int err = 
PTR_ERR(priv->vsp_env); + + seq_release_private(inode, filp); + return err; + } + + return 0; +} + +static int vvp_dump_pgcache_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct vvp_seq_private *priv = seq->private; + + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + + cl_env_put(priv->vsp_env, &priv->vsp_refcheck); + return seq_release_private(inode, file); +} + +const struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = vvp_dump_pgcache_seq_release, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h new file mode 100644 index 0000000000000..0fb9b51a8f618 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h @@ -0,0 +1,329 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal definitions for VVP layer. + * + * Author: Nikita Danilov + */ + +#ifndef VVP_INTERNAL_H +#define VVP_INTERNAL_H + +#include + +enum obd_notify_event; +struct inode; +struct lustre_md; +struct obd_device; +struct obd_export; +struct page; + +enum vvp_io_subtype { + /** normal IO */ + IO_NORMAL, + /** io started from splice_{read|write} */ + IO_SPLICE, +}; + +/** + * IO state private to VVP layer. + */ +struct vvp_io { + /** super class */ + struct cl_io_slice vui_cl; + struct cl_io_lock_link vui_link; + /** + * I/O vector information to or from which read/write is going. + */ + struct iov_iter *vui_iter; + /** + * Total size for the left IO. + */ + size_t vui_tot_count; + + union { + struct vvp_fault_io { + /** + * Inode modification time that is checked across DLM + * lock request. + */ + time64_t ft_mtime; + struct vm_area_struct *ft_vma; + /** + * locked page returned from vvp_io + */ + struct page *ft_vmpage; + /** + * kernel fault info + */ + struct vm_fault *ft_vmf; + /** + * fault API used bitflags for return code. 
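+ * (VM_FAULT_LOCKED, VM_FAULT_SIGBUS, VM_FAULT_RETRY etc., as returned by ll_filemap_fault()).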
+ */ + unsigned int ft_flags; + /** + * check that flags are from filemap_fault + */ + bool ft_flags_valid; + struct cl_page_list ft_queue; + } fault; + struct { + struct pipe_inode_info *vui_pipe; + unsigned int vui_flags; + } splice; + struct { + struct cl_page_list vui_queue; + unsigned long vui_written; + int vui_from; + int vui_to; + } write; + } u; + + enum vvp_io_subtype vui_io_subtype; + + /** + * Layout version when this IO is initialized + */ + __u32 vui_layout_gen; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *vui_fd; + struct kiocb *vui_iocb; + + /* Readahead state. */ + pgoff_t vui_ra_start; + pgoff_t vui_ra_count; + /* Set when vui_ra_{start,count} have been initialized. */ + bool vui_ra_valid; +}; + +extern struct lu_device_type vvp_device_type; + +extern struct lu_context_key vvp_session_key; +extern struct lu_context_key vvp_thread_key; + +extern struct kmem_cache *vvp_object_kmem; + +struct vvp_thread_info { + struct cl_lock vti_lock; + struct cl_lock_descr vti_descr; + struct cl_io vti_io; + struct cl_attr vti_attr; + struct cl_sync_io vti_anchor; +}; + +static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) +{ + struct vvp_thread_info *vti; + + vti = lu_context_key_get(&env->le_ctx, &vvp_thread_key); + LASSERT(vti != NULL); + + return vti; +} + +static inline struct cl_lock *vvp_env_lock(const struct lu_env *env) +{ + struct cl_lock *lock = &vvp_env_info(env)->vti_lock; + + memset(lock, 0, sizeof(*lock)); + + return lock; +} + +static inline struct cl_attr *vvp_env_thread_attr(const struct lu_env *env) +{ + struct cl_attr *attr = &vvp_env_info(env)->vti_attr; + + memset(attr, 0, sizeof(*attr)); + + return attr; +} + +static inline struct cl_io *vvp_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &vvp_env_info(env)->vti_io; + + memset(io, 0, sizeof(*io)); + + return io; +} + +struct vvp_session { + struct vvp_io vs_ios; +}; + +static inline struct vvp_session *vvp_env_session(const struct lu_env *env) +{ + struct vvp_session *ses; + + ses = lu_context_key_get(env->le_ses, &vvp_session_key); + LASSERT(ses != NULL); + + return ses; +} + +static inline struct vvp_io *vvp_env_io(const struct lu_env *env) +{ + return &vvp_env_session(env)->vs_ios; +} + +/** + * VPP-private object state. + */ +struct vvp_object { + struct cl_object_header vob_header; + struct cl_object vob_cl; + struct inode *vob_inode; + + /** + * Number of transient pages. This is no longer protected by i_sem, + * and needs to be atomic. This is not actually used for anything, + * and can probably be removed. + */ + atomic_t vob_transient_pages; + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). + */ + atomic_t vob_mmap_cnt; + + /** + * various flags + * vob_discard_page_warned + * if pages belonging to this object are discarded when a client + * is evicted, some debug info will be printed, this flag will be set + * during processing the first discarded page, then avoid flooding + * debug message for lots of discarded pages. + * + * \see ll_dirty_page_discard_warn. + */ + unsigned int vob_discard_page_warned:1; +}; + +/** + * VVP-private page state. 
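+ * Ties the cl_page slice to its backing VM page and keeps per-page read-ahead and deferred-uptodate hints.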
+ */ +struct vvp_page { + struct cl_page_slice vpg_cl; + unsigned vpg_defer_uptodate:1, + vpg_ra_updated:1, + vpg_ra_used:1; + /** VM page */ + struct page *vpg_page; +}; + +static inline struct vvp_page *cl2vvp_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct vvp_page, vpg_cl); +} + +static inline pgoff_t vvp_index(struct vvp_page *vpg) +{ + return vpg->vpg_cl.cpl_index; +} + +struct vvp_device { + struct cl_device vdv_cl; + struct cl_device *vdv_next; +}; + +static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) +{ + return &vdv->vdv_cl.cd_lu_dev; +} + +static inline struct vvp_device *lu2vvp_dev(const struct lu_device *d) +{ + return container_of0(d, struct vvp_device, vdv_cl.cd_lu_dev); +} + +static inline struct vvp_device *cl2vvp_dev(const struct cl_device *d) +{ + return container_of0(d, struct vvp_device, vdv_cl); +} + +static inline struct vvp_object *cl2vvp(const struct cl_object *obj) +{ + return container_of0(obj, struct vvp_object, vob_cl); +} + +static inline struct vvp_object *lu2vvp(const struct lu_object *obj) +{ + return container_of0(obj, struct vvp_object, vob_cl.co_lu); +} + +static inline struct inode *vvp_object_inode(const struct cl_object *obj) +{ + return cl2vvp(obj)->vob_inode; +} + +int vvp_object_invariant(const struct cl_object *obj); +struct vvp_object *cl_inode2vvp(struct inode *inode); + +static inline struct page *cl2vm_page(const struct cl_page_slice *slice) +{ + return cl2vvp_page(slice)->vpg_page; +} + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +# define CLOBINVRNT(env, clob, expr) \ + do { \ + if (unlikely(!(expr))) { \ + LU_OBJECT_DEBUG(D_ERROR, (env), &(clob)->co_lu, \ + #expr); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr)) +#endif /* CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +int lov_read_and_clear_async_rc(struct cl_object *clob); + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +int vvp_global_init(void); +void vvp_global_fini(void); + +extern const struct file_operations vvp_dump_pgcache_file_ops; + +#endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c new file mode 100644 index 0000000000000..6d8070c5b8bfd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -0,0 +1,1567 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include +#include "llite_internal.h" +#include "vvp_internal.h" +#include + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct vvp_io *vio; + + vio = container_of(slice, struct vvp_io, vui_cl); + LASSERT(vio == vvp_env_io(env)); + + return vio; +} + +/** + * True, if \a io is a normal io, False for splice_{read,write} + */ +static int cl_is_normalio(const struct lu_env *env, const struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + return vio->vui_io_subtype == IO_NORMAL; +} + +/** + * For swapping layout. The file's layout may have changed. + * To avoid populating pages to a wrong stripe, we have to verify the + * correctness of layout. It works because swapping layout processes + * have to acquire group lock. + */ +static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct vvp_io *vio = vvp_env_io(env); + bool rc = true; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* don't need lock here to check lli_layout_gen as we have held + * extent lock and GROUP lock has to hold to swap layout */ + if (ll_layout_version_get(lli) != vio->vui_layout_gen || + OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_LOST_LAYOUT, 0)) { + io->ci_need_restart = 1; + /* this will cause a short read/write */ + io->ci_continue = 0; + rc = false; + } + case CIT_FAULT: + /* fault is okay because we've already had a page. */ + default: + break; + } + + return rc; +} + +static void vvp_object_size_lock(struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + + ll_inode_size_lock(inode); + cl_object_attr_lock(obj); +} + +static void vvp_object_size_unlock(struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + + cl_object_attr_unlock(obj); + ll_inode_size_unlock(inode); +} + +/** + * Helper function that if necessary adjusts file size (inode->i_size), when + * position at the offset \a pos is accessed. File size can be arbitrary stale + * on a Lustre client, but client at least knows KMS. If accessed area is + * inside [0, KMS], set file size to KMS, otherwise glimpse file size. + * + * Locking: i_size_lock is used to serialize changes to inode size and to + * protect consistency between inode size and cl_object + * attributes. cl_object_size_lock() protects consistency between cl_attr's of + * top-object and sub-objects. 
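+ * If the accessed range begins past end-of-file, *exceed is set so the caller can bail out with a short read.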
+ */ +static int vvp_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, + int *exceed) +{ + struct cl_attr *attr = vvp_env_thread_attr(env); + struct inode *inode = vvp_object_inode(obj); + loff_t pos = start + count - 1; + loff_t kms; + int result; + + /* + * Consistency guarantees: following possibilities exist for the + * relation between region being accessed and real file size at this + * moment: + * + * (A): the region is completely inside of the file; + * + * (B-x): x bytes of region are inside of the file, the rest is + * outside; + * + * (C): the region is completely outside of the file. + * + * This classification is stable under DLM lock already acquired by + * the caller, because to change the class, other client has to take + * DLM lock conflicting with our lock. Also, any updates to ->i_size + * by other threads on this client are serialized by + * ll_inode_size_lock(). This guarantees that short reads are handled + * correctly in the face of concurrent writes and truncates. + */ + vvp_object_size_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + kms = attr->cat_kms; + if (pos > kms) { + /* + * A glimpse is necessary to determine whether we + * return a short read (B) or some zeroes at the end + * of the buffer (C) + */ + vvp_object_size_unlock(obj); + result = cl_glimpse_lock(env, io, inode, obj, 0); + if (result == 0 && exceed != NULL) { + /* If objective page index exceed end-of-file + * page index, return directly. Do not expect + * kernel will check such case correctly. + * linux-2.6.18-128.1.1 miss to do that. + * --bug 17336 */ + loff_t size = i_size_read(inode); + unsigned long cur_index = start >> + PAGE_SHIFT; + + if ((size == 0 && cur_index != 0) || + (((size - 1) >> PAGE_SHIFT) < + cur_index)) + *exceed = 1; + } + + return result; + } else { + /* + * region is within kms and, hence, within real file + * size (A). We need to increase i_size to cover the + * read region so that generic_file_read() will do its + * job, but that doesn't mean the kms size is + * _correct_, it is only the _minimum_ size. If + * someone does a stat they will get the correct size + * which will always be >= the kms value here. + * b=11081 + */ + if (i_size_read(inode) < kms) { + i_size_write(inode, kms); + CDEBUG(D_VFSTRACE, + DFID" updating i_size %llu\n", + PFID(lu_object_fid(&obj->co_lu)), + (__u64)i_size_read(inode)); + } + } + } + + vvp_object_size_unlock(obj); + + return result; +} + +/***************************************************************************** + * + * io operations. 
+ * + */ + +static int vvp_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end) +{ + struct vvp_io *vio = vvp_env_io(env); + struct cl_lock_descr *descr = &vio->vui_link.cill_descr; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); + + memset(&vio->vui_link, 0, sizeof vio->vui_link); + + if (vio->vui_fd && (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + descr->cld_mode = CLM_GROUP; + descr->cld_gid = vio->vui_fd->fd_grouplock.lg_gid; + enqflags |= CEF_LOCK_MATCH; + } else { + descr->cld_mode = mode; + } + + descr->cld_obj = obj; + descr->cld_start = start; + descr->cld_end = end; + descr->cld_enq_flags = enqflags; + + cl_io_lock_add(env, io, &vio->vui_link); + + RETURN(0); +} + +static int vvp_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end) +{ + struct cl_object *obj = io->ci_obj; + + return vvp_io_one_lock_index(env, io, enqflags, mode, + cl_index(obj, start), cl_index(obj, end)); +} + +static int vvp_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + cl_page_list_init(&vio->u.write.vui_queue); + vio->u.write.vui_written = 0; + vio->u.write.vui_from = 0; + vio->u.write.vui_to = PAGE_SIZE; + + return 0; +} + +static void vvp_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + LASSERT(vio->u.write.vui_queue.pl_nr == 0); +} + +static int vvp_io_fault_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(ios->cis_obj); + + LASSERT(inode == file_inode(vio->vui_fd->fd_file)); + vio->u.fault.ft_mtime = inode->i_mtime.tv_sec; + + return 0; +} + +static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(obj); + __u32 gen = 0; + int rc; + ENTRY; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " + "need write layout %d, restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + vio->vui_layout_gen, io->ci_need_write_intent, + io->ci_restore_needed); + + if (io->ci_restore_needed) { + /* file was detected release, we need to restore it + * before finishing the io + */ + rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF); + /* if restore registration failed, no restart, + * we will return -ENODATA */ + /* The layout will change after restore, so we need to + * block on layout lock held by the MDT + * as MDT will not send new layout in lvb (see LU-3124) + * we have to explicitly fetch it, all this will be done + * by ll_layout_refresh(). + * Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. Therefore it sets + * ci_verify_layout so that it will check layout at the end + * of this function. 
+ */ + if (rc) { + io->ci_restore_needed = 1; + io->ci_need_restart = 0; + io->ci_verify_layout = 0; + io->ci_result = rc; + GOTO(out, rc); + } + + io->ci_restore_needed = 0; + + /* Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. Therefore it should verify + * if there was layout change and restart I/O correspondingly. + */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)), + LLIF_FILE_RESTORING); + } + GOTO(out, 0); + } + + /** + * dynamic layout change needed, send layout intent + * RPC. + */ + if (io->ci_need_write_intent) { + enum layout_intent_opc opc = LAYOUT_INTENT_WRITE; + + io->ci_need_write_intent = 0; + + LASSERT(io->ci_type == CIT_WRITE || + cl_io_is_trunc(io) || cl_io_is_mkwrite(io)); + + CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n", + PFID(lu_object_fid(&obj->co_lu)), io->ci_type, + PEXT(&io->ci_write_intent)); + + if (cl_io_is_trunc(io)) + opc = LAYOUT_INTENT_TRUNC; + + rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent); + io->ci_result = rc; + if (!rc) + io->ci_need_restart = 1; + GOTO(out, rc); + } + + if (!io->ci_need_restart && + !io->ci_ignore_layout && io->ci_verify_layout) { + /* check layout version */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + } + GOTO(out, 0); + } +out: + EXIT; +} + +static void vvp_io_fault_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_page *page = io->u.ci_fault.ft_page; + + CLOBINVRNT(env, io->ci_obj, vvp_object_invariant(io->ci_obj)); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "fault", io); + cl_page_put(env, page); + io->u.ci_fault.ft_page = NULL; + } + vvp_io_fini(env, ios); +} + +static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) +{ + /* + * we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return CLM_WRITE; + return CLM_READ; +} + +static int vvp_mmap_locks(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + struct vvp_thread_info *vti = vvp_env_info(env); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct cl_lock_descr *descr = &vti->vti_descr; + union ldlm_policy_data policy; + struct iovec iov; + struct iov_iter i; + unsigned long addr; + ssize_t count; + int result = 0; + ENTRY; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + if (!cl_is_normalio(env, io)) + RETURN(0); + + /* nfs or loop back device write */ + if (vio->vui_iter == NULL) + RETURN(0); + + /* No MM (e.g. NFS)? No vmas too. 
*/ + if (mm == NULL) + RETURN(0); + + if (!iter_is_iovec(vio->vui_iter) && !iov_iter_is_kvec(vio->vui_iter)) + RETURN(0); + + for (i = *vio->vui_iter; + iov_iter_count(&i); + iov_iter_advance(&i, iov.iov_len)) { + iov = iov_iter_iovec(&i); + addr = (unsigned long)iov.iov_base; + count = iov.iov_len; + + if (count == 0) + continue; + + count += addr & ~PAGE_MASK; + addr &= PAGE_MASK; + + mmap_read_lock(mm); + while ((vma = our_vma(mm, addr, count)) != NULL) { + struct dentry *de = file_dentry(vma->vm_file); + struct inode *inode = de->d_inode; + int flags = CEF_MUST; + + if (ll_file_nolock(vma->vm_file)) { + /* + * For no lock case is not allowed for mmap + */ + result = -EINVAL; + break; + } + + /* + * XXX: Required lock mode can be weakened: CIT_WRITE + * io only ever reads user level buffer, and CIT_READ + * only writes on it. + */ + policy_from_vma(&policy, vma, addr, count); + descr->cld_mode = vvp_mode_from_vma(vma); + descr->cld_obj = ll_i2info(inode)->lli_clob; + descr->cld_start = cl_index(descr->cld_obj, + policy.l_extent.start); + descr->cld_end = cl_index(descr->cld_obj, + policy.l_extent.end); + descr->cld_enq_flags = flags; + result = cl_io_lock_alloc_add(env, io, descr); + + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + descr->cld_mode, descr->cld_start, + descr->cld_end); + + if (result < 0) + break; + + if (vma->vm_end - addr >= count) + break; + + count -= vma->vm_end - addr; + addr = vma->vm_end; + } + mmap_read_unlock(mm); + if (result < 0) + break; + } + RETURN(result); +} + +static void vvp_io_advance(const struct lu_env *env, + const struct cl_io_slice *ios, + size_t nob) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = ios->cis_io->ci_obj; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + if (!cl_is_normalio(env, io)) + return; + + vio->vui_tot_count -= nob; + iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); +} + +static void vvp_io_update_iov(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + size_t size = io->u.ci_rw.crw_count; + + if (!cl_is_normalio(env, io) || vio->vui_iter == NULL) + return; + + iov_iter_truncate(vio->vui_iter, size); +} + +static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + struct vvp_io *vio = vvp_env_io(env); + int result; + int ast_flags = 0; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + vvp_io_update_iov(env, vio, io); + + if (io->u.ci_rw.crw_nonblock) + ast_flags |= CEF_NONBLOCK; + if (io->ci_lock_no_expand) + ast_flags |= CEF_LOCK_NO_EXPAND; + + result = vvp_mmap_locks(env, vio, io); + if (result == 0) + result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); + + RETURN(result); +} + +static int vvp_io_read_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_io_rw_common *rd = &io->u.ci_rd.rd; + int result; + + ENTRY; + result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, + rd->crw_pos + rd->crw_count - 1); + RETURN(result); +} + +static int vvp_io_fault_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct vvp_io *vio = cl2vvp_io(env, ios); + /* + * XXX LDLM_FL_CBPENDING + */ + return vvp_io_one_lock_index(env, + io, 0, + vvp_mode_from_vma(vio->u.fault.ft_vma), + io->u.ci_fault.ft_index, + io->u.ci_fault.ft_index); +} + +static int vvp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + 
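/* Append writes lock the whole file [0, OBD_OBJECT_EOF]; other writes lock only the extent being written. */ +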
struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + + if (io->u.ci_wr.wr_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; + } + + RETURN(vvp_io_rw_lock(env, io, CLM_WRITE, start, end)); +} + +static int vvp_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) + +{ + return 0; +} + +/** + * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io. + * + * Handles "lockless io" mode when extent locking is done by server. + */ +static int vvp_io_setattr_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + __u64 new_size; + __u32 enqflags = 0; + + if (cl_io_is_trunc(io)) { + new_size = io->u.ci_setattr.sa_attr.lvb_size; + if (new_size == 0) + enqflags = CEF_DISCARD_DATA; + } else { + unsigned int valid = io->u.ci_setattr.sa_avalid; + + if (!(valid & TIMES_SET_FLAGS)) + return 0; + + if ((!(valid & ATTR_MTIME) || + io->u.ci_setattr.sa_attr.lvb_mtime >= + io->u.ci_setattr.sa_attr.lvb_ctime) && + (!(valid & ATTR_ATIME) || + io->u.ci_setattr.sa_attr.lvb_atime >= + io->u.ci_setattr.sa_attr.lvb_ctime)) + return 0; + + new_size = 0; + } + + return vvp_io_one_lock(env, io, enqflags, CLM_WRITE, + new_size, OBD_OBJECT_EOF); +} + +static int vvp_do_vmtruncate(struct inode *inode, size_t size) +{ + int result; + + /* + * Only ll_inode_size_lock is taken at this level. + */ + ll_inode_size_lock(inode); + result = inode_newsize_ok(inode, size); + if (result < 0) { + ll_inode_size_unlock(inode); + return result; + } + i_size_write(inode, size); + + ll_truncate_pagecache(inode, size); + ll_inode_size_unlock(inode); + return result; +} + +static int vvp_io_setattr_time(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct cl_attr *attr = vvp_env_thread_attr(env); + int result; + unsigned valid = CAT_CTIME; + + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_avalid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } + result = cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + return result; +} + +static int vvp_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + if (cl_io_is_trunc(io)) { + down_write(&lli->lli_trunc_sem); + inode_lock(inode); + inode_dio_wait(inode); + } else { + inode_lock(inode); + } + + if (io->u.ci_setattr.sa_avalid & TIMES_SET_FLAGS) + return vvp_io_setattr_time(env, ios); + + return 0; +} + +static void vvp_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + if (cl_io_is_trunc(io)) { + /* Truncate in memory pages - they must be clean pages + * because osc has already notified to destroy osc_extents. 
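+ * vvp_do_vmtruncate() below updates i_size under ll_inode_size_lock() and then drops the page cache past the new size.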
*/ + vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); + inode_dio_write_done(inode); + inode_unlock(inode); + up_write(&lli->lli_trunc_sem); + } else { + inode_unlock(inode); + } +} + +static void vvp_io_setattr_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + bool restore_needed = ios->cis_io->ci_restore_needed; + struct inode *inode = vvp_object_inode(ios->cis_obj); + + vvp_io_fini(env, ios); + + if (restore_needed && !ios->cis_io->ci_restore_needed) { + /* restore finished, set data modified flag for HSM */ + ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); + } +} + +static int vvp_io_read_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct file *file = vio->vui_fd->fd_file; + loff_t pos = io->u.ci_rd.rd.crw_pos; + long cnt = io->u.ci_rd.rd.crw_count; + long tot = vio->vui_tot_count; + int exceed = 0; + int result; + ENTRY; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n", + file_dentry(file)->d_name.name, + pos, pos + cnt); + + if (vio->vui_io_subtype == IO_NORMAL) + down_read(&lli->lli_trunc_sem); + + if (!can_populate_pages(env, io, inode)) + RETURN(0); + + /* Unless this is reading a sparse file, otherwise the lock has already + * been acquired so vvp_prep_size() is an empty op. */ + result = vvp_prep_size(env, obj, io, pos, cnt, &exceed); + if (result != 0) + RETURN(result); + else if (exceed != 0) + GOTO(out, result); + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, + "Read ino %lu, %lu bytes, offset %lld, size %llu\n", + inode->i_ino, cnt, pos, i_size_read(inode)); + + /* turn off the kernel's read-ahead */ + vio->vui_fd->fd_file->f_ra.ra_pages = 0; + + /* initialize read-ahead window once per syscall */ + if (!vio->vui_ra_valid) { + vio->vui_ra_valid = true; + vio->vui_ra_start = cl_index(obj, pos); + vio->vui_ra_count = cl_index(obj, tot + PAGE_SIZE - 1); + ll_ras_enter(file); + } + + /* BUG: 5972 */ + file_accessed(file); + switch (vio->vui_io_subtype) { + case IO_NORMAL: + LASSERT(vio->vui_iocb->ki_pos == pos); + result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); + break; + case IO_SPLICE: + result = generic_file_splice_read(file, &pos, + vio->u.splice.vui_pipe, cnt, + vio->u.splice.vui_flags); + /* LU-1109: do splice read stripe by stripe otherwise if it + * may make nfsd stuck if this read occupied all internal pipe + * buffers. 
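+ * ci_continue is cleared below so each cl_io handles a single stripe and the caller re-issues the remainder.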
*/ + io->ci_continue = 0; + break; + default: + CERROR("Wrong IO type %u\n", vio->vui_io_subtype); + LBUG(); + } + GOTO(out, result); + +out: + if (result >= 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + result = 0; + } + + return result; +} + +static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist, int from, int to) +{ + struct cl_2queue *queue = &io->ci_queue; + struct cl_page *page; + unsigned int bytes = 0; + int rc = 0; + ENTRY; + + if (plist->pl_nr == 0) + RETURN(0); + + if (from > 0 || to != PAGE_SIZE) { + page = cl_page_list_first(plist); + if (plist->pl_nr == 1) { + cl_page_clip(env, page, from, to); + } else { + if (from > 0) + cl_page_clip(env, page, from, PAGE_SIZE); + if (to != PAGE_SIZE) { + page = cl_page_list_last(plist); + cl_page_clip(env, page, 0, to); + } + } + } + + cl_2queue_init(queue); + cl_page_list_splice(plist, &queue->c2_qin); + rc = cl_io_submit_sync(env, io, CRT_WRITE, queue, 0); + + /* plist is not sorted any more */ + cl_page_list_splice(&queue->c2_qin, plist); + cl_page_list_splice(&queue->c2_qout, plist); + cl_2queue_fini(env, queue); + + if (rc == 0) { + /* calculate bytes */ + bytes = plist->pl_nr << PAGE_SHIFT; + bytes -= from + PAGE_SIZE - to; + + while (plist->pl_nr > 0) { + page = cl_page_list_first(plist); + cl_page_list_del(env, plist, page); + + cl_page_clip(env, page, 0, PAGE_SIZE); + + SetPageUptodate(cl_page_vmpage(page)); + cl_page_disown(env, io, page); + + /* held in ll_cl_init() */ + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + } + + RETURN(bytes > 0 ? bytes : rc); +} + +static void write_commit_callback(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + struct page *vmpage = page->cp_vmpage; + + SetPageUptodate(vmpage); + set_page_dirty(vmpage); + + cl_page_disown(env, io, page); + + /* held in ll_cl_init() */ + lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); + cl_page_put(env, page); +} + +/* make sure the page list is contiguous */ +static bool page_list_sanity_check(struct cl_object *obj, + struct cl_page_list *plist) +{ + struct cl_page *page; + pgoff_t index = CL_PAGE_EOF; + + cl_page_list_for_each(page, plist) { + struct vvp_page *vpg = cl_object_page_slice(obj, page); + + if (index == CL_PAGE_EOF) { + index = vvp_index(vpg); + continue; + } + + ++index; + if (index == vvp_index(vpg)) + continue; + + return false; + } + return true; +} + +/* Return how many bytes have queued or written */ +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) +{ + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct vvp_io *vio = vvp_env_io(env); + struct cl_page_list *queue = &vio->u.write.vui_queue; + struct cl_page *page; + int rc = 0; + int bytes = 0; + unsigned int npages = vio->u.write.vui_queue.pl_nr; + ENTRY; + + if (npages == 0) + RETURN(0); + + CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n", + npages, vio->u.write.vui_from, vio->u.write.vui_to); + + LASSERT(page_list_sanity_check(obj, queue)); + + /* submit IO with async write */ + rc = cl_io_commit_async(env, io, queue, + vio->u.write.vui_from, vio->u.write.vui_to, + write_commit_callback); + npages -= queue->pl_nr; /* already committed pages */ + if (npages > 0) { + /* calculate how many bytes were written */ + bytes = npages << PAGE_SHIFT; + + /* first page */ + bytes -= vio->u.write.vui_from; + if (queue->pl_nr == 0) /* last page */ + bytes -= PAGE_SIZE - vio->u.write.vui_to; + 
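/* "bytes" now holds only the data actually committed: the unwritten head of the first page, and the unwritten tail of the last page if it was committed too, have been subtracted. */ +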
LASSERTF(bytes > 0, "bytes = %d, pages = %d\n", bytes, npages); + + vio->u.write.vui_written += bytes; + + CDEBUG(D_VFSTRACE, "Committed %d pages %d bytes, tot: %ld\n", + npages, bytes, vio->u.write.vui_written); + + /* the first page must have been written. */ + vio->u.write.vui_from = 0; + } + LASSERT(page_list_sanity_check(obj, queue)); + LASSERT(ergo(rc == 0, queue->pl_nr == 0)); + + /* out of quota, try sync write */ + if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) { + rc = vvp_io_commit_sync(env, io, queue, + vio->u.write.vui_from, + vio->u.write.vui_to); + if (rc > 0) { + vio->u.write.vui_written += rc; + rc = 0; + } + } + + /* update inode size */ + ll_merge_attr(env, inode); + + /* Now the pages in queue were failed to commit, discard them + * unless they were dirtied before. */ + while (queue->pl_nr > 0) { + page = cl_page_list_first(queue); + cl_page_list_del(env, queue, page); + + if (!PageDirty(cl_page_vmpage(page))) + cl_page_discard(env, io, page); + + cl_page_disown(env, io, page); + + /* held in ll_cl_init() */ + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + cl_page_list_fini(env, queue); + + RETURN(rc); +} + +static int vvp_io_write_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct file *file = vio->vui_fd->fd_file; + ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + bool lock_inode = !IS_NOSEC(inode); + size_t nob = io->ci_nob; + struct iov_iter iter; + size_t written = 0; + + ENTRY; + + if (vio->vui_io_subtype == IO_NORMAL) + down_read(&lli->lli_trunc_sem); + + if (!can_populate_pages(env, io, inode)) + RETURN(0); + + if (cl_io_is_append(io)) { + /* + * PARALLEL IO This has to be changed for parallel IO doing + * out-of-order writes. + */ + ll_merge_attr(env, inode); + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + vio->vui_iocb->ki_pos = pos; + } else { + LASSERTF(vio->vui_iocb->ki_pos == pos, + "ki_pos %lld [%lld, %lld)\n", + vio->vui_iocb->ki_pos, + pos, pos + cnt); + } + + CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n", + file_dentry(file)->d_name.name, + pos, pos + cnt); + + /* The maximum Lustre file size is variable, based on the OST maximum + * object size and number of stripes. This needs another check in + * addition to the VFS checks earlier. */ + if (pos + cnt > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE, + "%s: file %s ("DFID") offset %llu > maxbytes %llu\n", + ll_get_fsname(inode->i_sb, NULL, 0), + file_dentry(file)->d_name.name, + PFID(ll_inode2fid(inode)), pos + cnt, + ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + /* Tests to verify we take the i_mutex correctly */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_SEC) && !lock_inode) + RETURN(-EINVAL); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode) + RETURN(-EINVAL); + + if (vio->vui_iter == NULL) { + /* from a temp io in ll_cl_init(). */ + result = 0; + } else { + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * trucates, etc. is handled in the higher layers of lustre. 
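+ * Only !IS_NOSEC inodes still take inode_lock() below.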
+ */ + lock_inode = !IS_NOSEC(inode); + iter = *vio->vui_iter; + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(vio->vui_iocb, + vio->vui_iter); + if (unlikely(lock_inode)) + inode_unlock(inode); + + written = result; + if (result > 0 || result == -EIOCBQUEUED) +#ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS + result = generic_write_sync(vio->vui_iocb, result); +#else + { + ssize_t err; + + err = generic_write_sync(vio->vui_iocb->ki_filp, pos, + result); + if (err < 0 && result > 0) + result = err; + } +#endif + } + + if (result > 0) { + result = vvp_io_write_commit(env, io); + /* Simulate short commit */ + if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) { + vio->u.write.vui_written >>= 1; + if (vio->u.write.vui_written > 0) + io->ci_need_restart = 1; + } + if (vio->u.write.vui_written > 0) { + result = vio->u.write.vui_written; + CDEBUG(D_VFSTRACE, "%s: write nob %zd, result: %zd\n", + file_dentry(file)->d_name.name, + io->ci_nob, result); + io->ci_nob += result; + } else { + io->ci_continue = 0; + } + } + if (vio->vui_iocb->ki_pos != (pos + io->ci_nob - nob)) { + CDEBUG(D_VFSTRACE, "%s: write position mismatch: " + "ki_pos %lld vs. pos %lld, written %ld, commit %ld " + "rc %ld\n", + file_dentry(file)->d_name.name, + vio->vui_iocb->ki_pos, pos + io->ci_nob - nob, + written, io->ci_nob - nob, result); + /* + * Rewind ki_pos and vui_iter to where it has + * successfully committed. + */ + vio->vui_iocb->ki_pos = pos + io->ci_nob - nob; + iov_iter_advance(&iter, io->ci_nob - nob); + vio->vui_iter->iov = iter.iov; + vio->vui_iter->nr_segs = iter.nr_segs; + vio->vui_iter->iov_offset = iter.iov_offset; + vio->vui_iter->count = iter.count; + } + if (result > 0) { + ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); + + if (result < cnt) + io->ci_continue = 0; + result = 0; + } + + RETURN(result); +} + +static void vvp_io_rw_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(ios->cis_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + if (vio->vui_io_subtype == IO_NORMAL) + up_read(&lli->lli_trunc_sem); +} + +static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) +{ + struct vm_fault *vmf = cfio->ft_vmf; + + cfio->ft_flags = ll_filemap_fault(cfio->ft_vma, vmf); + cfio->ft_flags_valid = 1; + + if (vmf->page) { + LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n", + get_vmf_address(vmf)); + if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { + lock_page(vmf->page); + cfio->ft_flags |= VM_FAULT_LOCKED; + } + + cfio->ft_vmpage = vmf->page; + + return 0; + } + + if (cfio->ft_flags & VM_FAULT_SIGBUS) { + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", get_vmf_address(vmf)); + return -EFAULT; + } + + if (cfio->ft_flags & VM_FAULT_OOM) { + CDEBUG(D_PAGE, "got addr %p - OOM\n", get_vmf_address(vmf)); + return -ENOMEM; + } + + if (cfio->ft_flags & VM_FAULT_RETRY) + return -EAGAIN; + + CERROR("unknown error in page fault %d\n", cfio->ft_flags); + + return -EINVAL; +} + +static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + set_page_dirty(page->cp_vmpage); +} + +static int vvp_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_fault_io *fio = &io->u.ci_fault; + 
struct vvp_fault_io *cfio = &vio->u.fault; + loff_t offset; + int result = 0; + struct page *vmpage = NULL; + struct cl_page *page; + loff_t size; + pgoff_t last_index; + ENTRY; + + down_read(&lli->lli_trunc_sem); + + /* offset of the last byte on the page */ + offset = cl_offset(obj, fio->ft_index + 1) - 1; + LASSERT(cl_index(obj, offset) == fio->ft_index); + result = vvp_prep_size(env, obj, io, 0, offset + 1, NULL); + if (result != 0) + RETURN(result); + + /* must return locked page */ + if (fio->ft_mkwrite) { + LASSERT(cfio->ft_vmpage != NULL); + lock_page(cfio->ft_vmpage); + } else { + result = vvp_io_kernel_fault(cfio); + if (result != 0) + RETURN(result); + } + + vmpage = cfio->ft_vmpage; + LASSERT(PageLocked(vmpage)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) + ll_invalidate_page(vmpage); + + size = i_size_read(inode); + /* Though we have already held a cl_lock upon this page, but + * it still can be truncated locally. */ + if (unlikely((vmpage->mapping != inode->i_mapping) || + (page_offset(vmpage) > size))) { + CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); + + /* return +1 to stop cl_io_loop() and ll_fault() will catch + * and retry. */ + GOTO(out, result = +1); + } + + last_index = cl_index(obj, size - 1); + + if (fio->ft_mkwrite ) { + /* + * Capture the size while holding the lli_trunc_sem from above + * we want to make sure that we complete the mkwrite action + * while holding this lock. We need to make sure that we are + * not past the end of the file. + */ + if (last_index < fio->ft_index) { + CDEBUG(D_PAGE, + "llite: mkwrite and truncate race happened: " + "%p: 0x%lx 0x%lx\n", + vmpage->mapping,fio->ft_index,last_index); + /* + * We need to return if we are + * passed the end of the file. This will propagate + * up the call stack to ll_page_mkwrite where + * we will return VM_FAULT_NOPAGE. Any non-negative + * value returned here will be silently + * converted to 0. If the vmpage->mapping is null + * the error code would be converted back to ENODATA + * in ll_page_mkwrite0. Thus we return -ENODATA + * to handle both cases + */ + GOTO(out, result = -ENODATA); + } + } + + page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) + GOTO(out, result = PTR_ERR(page)); + + /* if page is going to be written, we should add this page into cache + * earlier. */ + if (fio->ft_mkwrite) { + wait_on_page_writeback(vmpage); + if (!PageDirty(vmpage)) { + struct cl_page_list *plist = &vio->u.fault.ft_queue; + struct vvp_page *vpg = cl_object_page_slice(obj, page); + int to = PAGE_SIZE; + + /* vvp_page_assume() calls wait_on_page_writeback(). */ + cl_page_assume(env, io, page); + + cl_page_list_init(plist); + cl_page_list_add(plist, page); + + /* size fixup */ + if (last_index == vvp_index(vpg)) + to = ((size - 1) & ~PAGE_MASK) + 1; + + /* Do not set Dirty bit here so that in case IO is + * started before the page is really made dirty, we + * still have chance to detect it. 
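+ * The dirty bit is set later by mkwrite_commit_callback() once the page has been committed.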
*/ + result = cl_io_commit_async(env, io, plist, 0, to, + mkwrite_commit_callback); + /* Have overquota flag, trying sync write to check + * whether indeed out of quota */ + if (result == -EDQUOT) { + cl_page_get(page); + result = vvp_io_commit_sync(env, io, + plist, 0, to); + if (result >= 0) { + io->ci_noquota = 1; + cl_page_own(env, io, page); + cl_page_list_add(plist, page); + lu_ref_add(&page->cp_reference, + "cl_io", io); + result = cl_io_commit_async(env, io, + plist, 0, to, + mkwrite_commit_callback); + io->ci_noquota = 0; + } else { + cl_page_put(env, page); + } + } + + LASSERT(cl_page_is_owned(page, io)); + cl_page_list_fini(env, plist); + + vmpage = NULL; + if (result < 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + + cl_page_put(env, page); + + /* we're in big trouble, what can we do now? */ + if (result == -EDQUOT) + result = -ENOSPC; + GOTO(out, result); + } else { + cl_page_disown(env, io, page); + } + } + } + + /* + * The ft_index is only used in the case of + * a mkwrite action. We need to check + * our assertions are correct, since + * we should have caught this above + */ + LASSERT(!fio->ft_mkwrite || fio->ft_index <= last_index); + if (fio->ft_index == last_index) + /* + * Last page is mapped partially. + */ + fio->ft_nob = size - cl_offset(obj, fio->ft_index); + else + fio->ft_nob = cl_page_size(obj); + + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + EXIT; + +out: + /* return unlocked vmpage to avoid deadlocking */ + if (vmpage != NULL) + unlock_page(vmpage); + + cfio->ft_flags &= ~VM_FAULT_LOCKED; + + return result; +} + +static void vvp_io_fault_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct inode *inode = vvp_object_inode(ios->cis_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + CLOBINVRNT(env, ios->cis_io->ci_obj, + vvp_object_invariant(ios->cis_io->ci_obj)); + up_read(&lli->lli_trunc_sem); +} + +static int vvp_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + /* we should mark TOWRITE bit to each dirty page in radix tree to + * verify pages have been written, but this is difficult because of + * race. 
*/ + return 0; +} + +static int vvp_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + int result = 0; + ENTRY; + + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_FAULT) { + struct vvp_io *vio = cl2vvp_io(env, ios); + + if (unlikely(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + ra->cra_end = CL_PAGE_EOF; + result = +1; /* no need to call down */ + } + } + + RETURN(result); +} + +static const struct cl_io_operations vvp_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_read_lock, + .cio_start = vvp_io_read_start, + .cio_end = vvp_io_rw_end, + .cio_advance = vvp_io_advance, + }, + [CIT_WRITE] = { + .cio_fini = vvp_io_fini, + .cio_iter_init = vvp_io_write_iter_init, + .cio_iter_fini = vvp_io_write_iter_fini, + .cio_lock = vvp_io_write_lock, + .cio_start = vvp_io_write_start, + .cio_end = vvp_io_rw_end, + .cio_advance = vvp_io_advance, + }, + [CIT_SETATTR] = { + .cio_fini = vvp_io_setattr_fini, + .cio_iter_init = vvp_io_setattr_iter_init, + .cio_lock = vvp_io_setattr_lock, + .cio_start = vvp_io_setattr_start, + .cio_end = vvp_io_setattr_end + }, + [CIT_FAULT] = { + .cio_fini = vvp_io_fault_fini, + .cio_iter_init = vvp_io_fault_iter_init, + .cio_lock = vvp_io_fault_lock, + .cio_start = vvp_io_fault_start, + .cio_end = vvp_io_fault_end, + }, + [CIT_FSYNC] = { + .cio_start = vvp_io_fsync_start, + .cio_fini = vvp_io_fini + }, + [CIT_GLIMPSE] = { + .cio_fini = vvp_io_fini + }, + [CIT_MISC] = { + .cio_fini = vvp_io_fini + }, + [CIT_LADVISE] = { + .cio_fini = vvp_io_fini + }, + }, + .cio_read_ahead = vvp_io_read_ahead +}; + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + struct inode *inode = vvp_object_inode(obj); + int result; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " + "restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + vio->vui_layout_gen, io->ci_restore_needed); + + CL_IO_SLICE_CLEAN(vio, vui_cl); + cl_io_slice_add(io, &vio->vui_cl, obj, &vvp_io_ops); + vio->vui_ra_valid = false; + result = 0; + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; + struct ll_inode_info *lli = ll_i2info(inode); + + count = io->u.ci_rw.crw_count; + /* "If nbyte is 0, read() will return 0 and have no other + * results." -- Single Unix Spec */ + if (count == 0) + result = 1; + else + vio->vui_tot_count = count; + + /* for read/write, we store the jobid in the inode, and + * it'll be fetched by osc when building RPC. + * + * it's not accurate if the file is shared by different + * jobs. + */ + lustre_get_jobid(lli->lli_jobid, sizeof(lli->lli_jobid)); + } else if (io->ci_type == CIT_SETATTR) { + if (!cl_io_is_trunc(io)) + io->ci_lockreq = CILR_MANDATORY; + } + + /* Enqueue layout lock and get layout version. We need to do this + * even for operations requiring to open file, such as read and write, + * because it might not grant layout lock in IT_OPEN. */ + if (result == 0 && !io->ci_ignore_layout) { + result = ll_layout_refresh(inode, &vio->vui_layout_gen); + if (result == -ENOENT) + /* If the inode on MDS has been removed, but the objects + * on OSTs haven't been destroyed (async unlink), layout + * fetch will return -ENOENT, we'd ingore this error + * and continue with dirty flush. LU-3230. 
*/ + result = 0; + if (result < 0) + CERROR("%s: refresh file layout " DFID " error %d.\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(lu_object_fid(&obj->co_lu)), result); + } + + io->ci_result = result < 0 ? result : 0; + RETURN(result); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c new file mode 100644 index 0000000000000..c3bf715667577 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c @@ -0,0 +1,321 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_object implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Object operations. + * + */ + +int vvp_object_invariant(const struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + + return (S_ISREG(inode->i_mode) || inode->i_mode == 0) && + lli->lli_clob == obj; +} + +static int vvp_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct vvp_object *obj = lu2vvp(o); + struct inode *inode = obj->vob_inode; + struct ll_inode_info *lli; + + (*p)(env, cookie, "(%d %d) inode: %p ", + atomic_read(&obj->vob_transient_pages), + atomic_read(&obj->vob_mmap_cnt), + inode); + if (inode) { + lli = ll_i2info(inode); + (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID, + inode->i_ino, inode->i_generation, inode->i_mode, + inode->i_nlink, atomic_read(&inode->i_count), + lli->lli_clob, PFID(&lli->lli_fid)); + } + return 0; +} + +static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct inode *inode = vvp_object_inode(obj); + + /* + * lov overwrites most of these fields in + * lov_attr_get()->...lov_merge_lvb_kms(), except when inode + * attributes are newer. 
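+ * This layer fills in only what the VFS inode knows (size, times, owner, blocks, projid); KMS and the rest come from the layers below.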
+ */ + + attr->cat_size = i_size_read(inode); + attr->cat_mtime = inode->i_mtime.tv_sec; + attr->cat_atime = inode->i_atime.tv_sec; + attr->cat_ctime = inode->i_ctime.tv_sec; + attr->cat_blocks = inode->i_blocks; + attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid); + attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid); + attr->cat_projid = ll_i2info(inode)->lli_projid; + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +static int vvp_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct inode *inode = vvp_object_inode(obj); + + if (valid & CAT_UID) + inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid); + if (valid & CAT_GID) + inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid); + if (valid & CAT_ATIME) + inode->i_atime.tv_sec = attr->cat_atime; + if (valid & CAT_MTIME) + inode->i_mtime.tv_sec = attr->cat_mtime; + if (valid & CAT_CTIME) + inode->i_ctime.tv_sec = attr->cat_ctime; + if (0 && valid & CAT_SIZE) + i_size_write(inode, attr->cat_size); + if (valid & CAT_PROJID) + ll_i2info(inode)->lli_projid = attr->cat_projid; + /* not currently necessary */ + if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE|CAT_PROJID)) + mark_inode_dirty(inode); + return 0; +} + +static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(conf->coc_inode); + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n", + PFID(&lli->lli_fid)); + + ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); + + /* Clean up page mmap for this inode. + * The reason for us to do this is that if the page has + * already been installed into memory space, the process + * can access it without interacting with lustre, so this + * page may be stale due to layout change, and the process + * will never be notified. + * This operation is expensive but mmap processes have to pay + * a price themselves. */ + unmap_mapping_range(conf->coc_inode->i_mapping, + 0, OBD_OBJECT_EOF, 0); + } + return 0; +} + +static int vvp_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + int rc; + ENTRY; + + rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); + if (rc < 0) { + CDEBUG(D_VFSTRACE, DFID ": writeback failed: %d\n", + PFID(lu_object_fid(&obj->co_lu)), rc); + RETURN(rc); + } + + truncate_inode_pages(inode->i_mapping, 0); + if (inode->i_mapping->nrpages) { + CDEBUG(D_VFSTRACE, DFID ": still has %lu pages remaining\n", + PFID(lu_object_fid(&obj->co_lu)), + inode->i_mapping->nrpages); + RETURN(-EIO); + } + + RETURN(0); +} + +static int vvp_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct inode *inode = vvp_object_inode(obj); + + ENTRY; + lvb->lvb_mtime = inode->i_mtime.tv_sec; + lvb->lvb_atime = inode->i_atime.tv_sec; + lvb->lvb_ctime = inode->i_ctime.tv_sec; + + /* + * LU-417: Add dirty pages block count lest i_blocks reports 0, some + * "cp" or "tar" on remote node may think it's a completely sparse file + * and skip it. 
+ */ + if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) + lvb->lvb_blocks = dirty_cnt(inode); + + RETURN(0); +} + +static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct inode *inode; + struct obdo *oa; + u64 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLUID | OBD_MD_FLGID; + + oa = attr->cra_oa; + inode = vvp_object_inode(obj); + + if (attr->cra_type == CRT_WRITE) { + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME; + obdo_set_o_projid(oa, ll_i2info(inode)->lli_projid); + } + obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); + obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) + oa->o_parent_oid++; + memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, + sizeof(attr->cra_jobid)); +} + +static const struct cl_object_operations vvp_ops = { + .coo_page_init = vvp_page_init, + .coo_io_init = vvp_io_init, + .coo_attr_get = vvp_attr_get, + .coo_attr_update = vvp_attr_update, + .coo_conf_set = vvp_conf_set, + .coo_prune = vvp_prune, + .coo_glimpse = vvp_object_glimpse, + .coo_req_attr_set = vvp_req_attr_set +}; + +static int vvp_object_init0(const struct lu_env *env, + struct vvp_object *vob, + const struct cl_object_conf *conf) +{ + vob->vob_inode = conf->coc_inode; + atomic_set(&vob->vob_transient_pages, 0); + cl_object_page_init(&vob->vob_cl, sizeof(struct vvp_page)); + return 0; +} + +static int vvp_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct vvp_device *dev = lu2vvp_dev(obj->lo_dev); + struct vvp_object *vob = lu2vvp(obj); + struct lu_object *below; + struct lu_device *under; + int result; + + under = &dev->vdv_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + const struct cl_object_conf *cconf; + + cconf = lu2cl_conf(conf); + lu_object_add(obj, below); + result = vvp_object_init0(env, vob, cconf); + } else + result = -ENOMEM; + + return result; +} + +static void vvp_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct vvp_object *vob = lu2vvp(obj); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + OBD_SLAB_FREE_PTR(vob, vvp_object_kmem); +} + +static const struct lu_object_operations vvp_lu_obj_ops = { + .loo_object_init = vvp_object_init, + .loo_object_free = vvp_object_free, + .loo_object_print = vvp_object_print, +}; + +struct vvp_object *cl_inode2vvp(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_object *lu; + + LASSERT(obj != NULL); + lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); + LASSERT(lu != NULL); + + return lu2vvp(lu); +} + +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct vvp_object *vob; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(vob, vvp_object_kmem, GFP_NOFS); + if (vob != NULL) { + struct cl_object_header *hdr; + + obj = &vob->vob_cl.co_lu; + hdr = &vob->vob_header; + cl_object_header_init(hdr); + hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); + + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + vob->vob_cl.co_ops = &vvp_ops; + obj->lo_ops = &vvp_lu_obj_ops; + } else + obj = NULL; + return obj; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c new file mode 100644 index 
0000000000000..0f4e2a9e83dac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c @@ -0,0 +1,555 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Page operations. + * + */ + +static void vvp_page_fini_common(struct vvp_page *vpg, struct pagevec *pvec) +{ + struct page *vmpage = vpg->vpg_page; + + LASSERT(vmpage != NULL); + if (pvec) { + if (!pagevec_add(pvec, vmpage)) + pagevec_release(pvec); + } else { + put_page(vmpage); + } +} + +static void vvp_page_fini(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + /* + * vmpage->private was already cleared when page was moved into + * VPG_FREEING state. 
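// vvp_page_fini_common() above batches page releases through a pagevec
// instead of dropping each reference one by one.  The standalone sketch below
// shows the same "collect into a small fixed batch, flush when full, flush
// the remainder at the end" pattern with plain heap pointers; the names and
// the batch size are illustrative only and do not mirror the pagevec API
// exactly.
#include <stdlib.h>

#define BATCH_SIZE 15   // roughly the size of a kernel pagevec

struct release_batch {
        unsigned int nr;
        void *slot[BATCH_SIZE];
};

static void batch_flush(struct release_batch *b)
{
        unsigned int i;

        // One "expensive" release pass per flush instead of per item.
        for (i = 0; i < b->nr; i++)
                free(b->slot[i]);
        b->nr = 0;
}

static void batch_add(struct release_batch *b, void *item)
{
        b->slot[b->nr++] = item;
        if (b->nr == BATCH_SIZE)
                batch_flush(b);
}

int main(void)
{
        struct release_batch b = { .nr = 0 };
        int i;

        for (i = 0; i < 40; i++)
                batch_add(&b, malloc(16));
        batch_flush(&b);        // drop whatever is still queued
        return 0;
}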
+ */ + LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); + vvp_page_fini_common(vpg, pvec); +} + +static int vvp_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + LASSERT(vmpage != NULL); + if (nonblock) { + if (!trylock_page(vmpage)) + return -EAGAIN; + + if (unlikely(PageWriteback(vmpage))) { + unlock_page(vmpage); + return -EAGAIN; + } + + return 0; + } + + lock_page(vmpage); + wait_on_page_writeback(vmpage); + + return 0; +} + +static void vvp_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); +} + +static void vvp_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + unlock_page(cl2vm_page(slice)); +} + +static void vvp_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct vvp_page *vpg = cl2vvp_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used && vmpage->mapping) + ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); + + ll_invalidate_page(vmpage); +} + +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *page = slice->cpl_page; + int refc; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == page); + + + /* Drop the reference count held in vvp_page_init */ + refc = atomic_dec_return(&page->cp_ref); + LASSERTF(refc >= 1, "page = %p, refc = %d\n", page, refc); + + ClearPagePrivate(vmpage); + vmpage->private = 0; + /* + * Reference from vmpage to cl_page is removed, but the reference back + * is still here. It is removed later in vvp_page_fini(). + */ +} + +static void vvp_page_export(const struct lu_env *env, + const struct cl_page_slice *slice, + int uptodate) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + if (uptodate) + SetPageUptodate(vmpage); + else + ClearPageUptodate(vmpage); +} + +static int vvp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; +} + +static int vvp_page_prep_read(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ENTRY; + /* Skip the page already marked as PG_uptodate. */ + RETURN(PageUptodate(cl2vm_page(slice)) ? 
			 -EALREADY : 0);
+}
+
+static int vvp_page_prep_write(const struct lu_env *env,
+			       const struct cl_page_slice *slice,
+			       struct cl_io *unused)
+{
+	struct page *vmpage = cl2vm_page(slice);
+	struct cl_page *pg = slice->cpl_page;
+
+	LASSERT(PageLocked(vmpage));
+	LASSERT(!PageDirty(vmpage));
+
+	/* The ll_writepage path is not a sync write, so we need to set
+	 * the page writeback flag. */
+	if (pg->cp_sync_io == NULL)
+		set_page_writeback(vmpage);
+
+	return 0;
+}
+
+/**
+ * Handles page transfer errors at VM level.
+ *
+ * This takes inode as a separate argument, because inode on which error is to
+ * be set can be different from \a vmpage inode in case of direct-io.
+ */
+static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret)
+{
+	struct vvp_object *obj = cl_inode2vvp(inode);
+
+	if (ioret == 0) {
+		ClearPageError(vmpage);
+		obj->vob_discard_page_warned = 0;
+	} else {
+		SetPageError(vmpage);
+		if (ioret == -ENOSPC)
+			set_bit(AS_ENOSPC, &inode->i_mapping->flags);
+		else
+			set_bit(AS_EIO, &inode->i_mapping->flags);
+
+		if ((ioret == -ESHUTDOWN || ioret == -EINTR ||
+		     ioret == -EIO) && obj->vob_discard_page_warned == 0) {
+			obj->vob_discard_page_warned = 1;
+			ll_dirty_page_discard_warn(vmpage, ioret);
+		}
+	}
+}
+
+static void vvp_page_completion_read(const struct lu_env *env,
+				     const struct cl_page_slice *slice,
+				     int ioret)
+{
+	struct vvp_page *vpg = cl2vvp_page(slice);
+	struct page *vmpage = vpg->vpg_page;
+	struct cl_page *page = slice->cpl_page;
+	struct inode *inode = vvp_object_inode(page->cp_obj);
+	ENTRY;
+
+	LASSERT(PageLocked(vmpage));
+	CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret);
+
+	if (vpg->vpg_defer_uptodate)
+		ll_ra_count_put(ll_i2sbi(inode), 1);
+
+	if (ioret == 0) {
+		if (!vpg->vpg_defer_uptodate)
+			cl_page_export(env, page, 1);
+	} else if (vpg->vpg_defer_uptodate) {
+		vpg->vpg_defer_uptodate = 0;
+		if (ioret == -EWOULDBLOCK) {
+			/* The mirror read failed, so destroy the page because
+			 * the sub-page would come from the wrong OSC when
+			 * retrying the read from a new mirror. */
+			ll_invalidate_page(vmpage);
+		}
+	}
+
+	if (page->cp_sync_io == NULL)
+		unlock_page(vmpage);
+
+	EXIT;
+}
+
+static void vvp_page_completion_write(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      int ioret)
+{
+	struct vvp_page *vpg = cl2vvp_page(slice);
+	struct cl_page *pg = slice->cpl_page;
+	struct page *vmpage = vpg->vpg_page;
+	ENTRY;
+
+	CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
+
+	if (pg->cp_sync_io != NULL) {
+		LASSERT(PageLocked(vmpage));
+		LASSERT(!PageWriteback(vmpage));
+	} else {
+		LASSERT(PageWriteback(vmpage));
+		/*
+		 * Only mark the page in error when it's an async write,
+		 * because applications won't wait for the IO to finish.
+		 */
+		vvp_vmpage_error(vvp_object_inode(pg->cp_obj), vmpage, ioret);
+
+		end_page_writeback(vmpage);
+	}
+	EXIT;
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank a page from the transfer cache and to send it out as
+ * a part of transfer. This function try-locks the page. If the try-lock fails,
+ * the page is owned by some concurrent IO and should be skipped (this is bad,
+ * but hopefully a rare situation, as it usually results in the transfer being
+ * shorter than possible).
+ *
+ * \retval 0		success, page can be placed into transfer
+ *
+ * \retval -EAGAIN	page is either used by concurrent IO or has been
+ *			truncated. Skip it.
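// vvp_page_completion_write() above records asynchronous write errors in the
// address_space flags (AS_EIO / AS_ENOSPC) rather than returning them to the
// writer directly.  The userspace sketch below shows where such an error is
// usually observed: a buffered write() can succeed and the failure is only
// reported by a later fsync() (or close()).  This is generic POSIX behaviour,
// not a Lustre-specific interface; the file path is an arbitrary example.
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char buf[] = "buffered data";
        int fd = open("/tmp/async-write-demo", O_CREAT | O_WRONLY | O_TRUNC, 0644);

        if (fd < 0)
                return 1;

        // A buffered write typically succeeds even if the later flush fails.
        if (write(fd, buf, sizeof(buf)) < 0)
                perror("write");

        // Writeback errors such as EIO or ENOSPC are commonly reported here.
        if (fsync(fd) < 0)
                fprintf(stderr, "fsync: %s\n", strerror(errno));

        close(fd);
        return 0;
}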
+ */ +static int vvp_page_make_ready(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + int result = 0; + + lock_page(vmpage); + if (clear_page_dirty_for_io(vmpage)) { + LASSERT(pg->cp_state == CPS_CACHED); + /* This actually clears the dirty bit in the radix + * tree. */ + set_page_writeback(vmpage); + CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n"); + } else if (pg->cp_state == CPS_PAGEOUT) { + /* is it possible for osc_flush_async_page() to already + * make it ready? */ + result = -EALREADY; + } else { + CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n", + pg->cp_state); + LBUG(); + } + unlock_page(vmpage); + RETURN(result); +} + +static int vvp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + (*printer)(env, cookie, LUSTRE_VVP_NAME"-page@%p(%d:%d) " + "vm@%p ", + vpg, vpg->vpg_defer_uptodate, vpg->vpg_ra_used, vmpage); + + if (vmpage != NULL) { + (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru", + (long)vmpage->flags, page_count(vmpage), + page_mapcount(vmpage), vmpage->private, + page_index(vmpage), + list_empty(&vmpage->lru) ? "not-" : ""); + } + + (*printer)(env, cookie, "\n"); + + return 0; +} + +static int vvp_page_fail(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + /* + * Cached read? + */ + LBUG(); + + return 0; +} + +static const struct cl_page_operations vvp_page_ops = { + .cpo_own = vvp_page_own, + .cpo_assume = vvp_page_assume, + .cpo_unassume = vvp_page_unassume, + .cpo_disown = vvp_page_disown, + .cpo_discard = vvp_page_discard, + .cpo_delete = vvp_page_delete, + .cpo_export = vvp_page_export, + .cpo_is_vmlocked = vvp_page_is_vmlocked, + .cpo_fini = vvp_page_fini, + .cpo_print = vvp_page_print, + .io = { + [CRT_READ] = { + .cpo_prep = vvp_page_prep_read, + .cpo_completion = vvp_page_completion_read, + .cpo_make_ready = vvp_page_fail, + }, + [CRT_WRITE] = { + .cpo_prep = vvp_page_prep_write, + .cpo_completion = vvp_page_completion_write, + .cpo_make_ready = vvp_page_make_ready, + }, + }, +}; + +static int vvp_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ENTRY; + /* transient page should always be sent. */ + RETURN(0); +} + +static void vvp_transient_page_verify(const struct cl_page *page) +{ +} + +static int vvp_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused, int nonblock) +{ + vvp_transient_page_verify(slice->cpl_page); + return 0; +} + +static void vvp_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_page *page = slice->cpl_page; + + vvp_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. 
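// Both vvp_page_ops and vvp_transient_page_ops above share the same shape: a
// table of function pointers with the transfer-direction specific hooks kept
// in a small io[] sub-array indexed by read/write.  The standalone sketch
// below reproduces that dispatch pattern with made-up names (demo_page,
// demo_page_ops); it is not the cl_page_operations definition itself.
#include <stdio.h>

enum xfer_type { XFER_READ, XFER_WRITE, XFER_NR };

struct demo_page;

struct demo_page_ops {
        void (*op_discard)(struct demo_page *pg);
        struct {
                int  (*op_prep)(struct demo_page *pg);
                void (*op_completion)(struct demo_page *pg, int ioret);
        } io[XFER_NR];
};

struct demo_page {
        const struct demo_page_ops *pg_ops;
        int pg_index;
};

static void demo_discard(struct demo_page *pg)
{
        printf("discard page %d\n", pg->pg_index);
}

static int demo_prep_read(struct demo_page *pg)
{
        printf("prep read of page %d\n", pg->pg_index);
        return 0;
}

static void demo_completion_read(struct demo_page *pg, int ioret)
{
        printf("read of page %d completed: %d\n", pg->pg_index, ioret);
}

static const struct demo_page_ops demo_ops = {
        .op_discard = demo_discard,
        .io = {
                [XFER_READ] = {
                        .op_prep       = demo_prep_read,
                        .op_completion = demo_completion_read,
                },
                // XFER_WRITE hooks left NULL; callers must check before use.
        },
};

int main(void)
{
        struct demo_page pg = { .pg_ops = &demo_ops, .pg_index = 7 };

        if (pg.pg_ops->io[XFER_READ].op_prep(&pg) == 0)
                pg.pg_ops->io[XFER_READ].op_completion(&pg, 0);
        pg.pg_ops->op_discard(&pg);
        return 0;
}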
+ */ + cl_page_delete(env, page); +} + +static int vvp_transient_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct inode *inode = vvp_object_inode(slice->cpl_obj); + int locked; + + locked = !inode_trylock(inode); + if (!locked) + inode_unlock(inode); + return locked ? -EBUSY : -ENODATA; +} + +static void +vvp_transient_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_fini(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct cl_page *clp = slice->cpl_page; + struct vvp_object *clobj = cl2vvp(clp->cp_obj); + + vvp_page_fini_common(vpg, pvec); + atomic_dec(&clobj->vob_transient_pages); +} + +static const struct cl_page_operations vvp_transient_page_ops = { + .cpo_own = vvp_transient_page_own, + .cpo_assume = vvp_transient_page_assume, + .cpo_unassume = vvp_transient_page_unassume, + .cpo_disown = vvp_transient_page_disown, + .cpo_discard = vvp_transient_page_discard, + .cpo_fini = vvp_transient_page_fini, + .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, + .cpo_print = vvp_page_print, + .io = { + [CRT_READ] = { + .cpo_prep = vvp_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = vvp_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + } + } +}; + +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct vvp_page *vpg = cl_object_page_slice(obj, page); + struct page *vmpage = page->cp_vmpage; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + vpg->vpg_page = vmpage; + get_page(vmpage); + + if (page->cp_type == CPT_CACHEABLE) { + /* in cache, decref in vvp_page_delete */ + atomic_inc(&page->cp_ref); + SetPagePrivate(vmpage); + vmpage->private = (unsigned long)page; + cl_page_slice_add(page, &vpg->vpg_cl, obj, index, + &vvp_page_ops); + } else { + struct vvp_object *clobj = cl2vvp(obj); + + cl_page_slice_add(page, &vpg->vpg_cl, obj, index, + &vvp_transient_page_ops); + atomic_inc(&clobj->vob_transient_pages); + } + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c new file mode 100644 index 0000000000000..35da3f779e02a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -0,0 +1,887 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "llite_internal.h" + +#ifndef HAVE_XATTR_HANDLER_NAME +static inline const char *xattr_prefix(const struct xattr_handler *handler) +{ + return handler->prefix; +} +#endif + +const struct xattr_handler *get_xattr_type(const char *name) +{ + int i; + + for (i = 0; ll_xattr_handlers[i]; i++) { + const char *prefix = xattr_prefix(ll_xattr_handlers[i]); + size_t prefix_len = strlen(prefix); + + if (!strncmp(prefix, name, prefix_len)) + return ll_xattr_handlers[i]; + } + + return NULL; +} + +static int xattr_type_filter(struct ll_sb_info *sbi, + const struct xattr_handler *handler) +{ + /* No handler means XATTR_OTHER_T */ + if (!handler) + return -EOPNOTSUPP; + + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && + !(sbi->ll_flags & LL_SBI_ACL)) + return -EOPNOTSUPP; + + if (handler->flags == XATTR_USER_T && + !(sbi->ll_flags & LL_SBI_USER_XATTR)) + return -EOPNOTSUPP; + + if (handler->flags == XATTR_TRUSTED_T && + !capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_xattr_set_common(hd, ns, de, inode, name, value, size, flags) \ + ll_xattr_set_common(hd, de, inode, name, value, size, flags) +#endif + +static int ll_xattr_set_common(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *pv = value; + char *fullname; + u64 valid; + int rc; + ENTRY; + + /* When setxattr() is called with a size of 0 the value is + * unconditionally replaced by "". When removexattr() is + * called we get a NULL value and XATTR_REPLACE for flags. */ + if (!value && flags == XATTR_REPLACE) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); + valid = OBD_MD_FLXATTRRM; + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); + valid = OBD_MD_FLXATTR; + } + + /* FIXME: enable IMA when the conditions are ready */ + if (handler->flags == XATTR_SECURITY_T && + (!strcmp(name, "ima") || !strcmp(name, "evm"))) + RETURN(-EOPNOTSUPP); + + rc = xattr_type_filter(sbi, handler); + if (rc) + RETURN(rc); + + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && +/* Test for older kernels that was cleaned up in LU-12477 and LU-10092 */ +#if defined(HAVE_INODE_OWNER_OR_CAPABLE) || defined(HAVE_USER_NAMESPACE_ARG) + !inode_owner_or_capable(mnt_userns, inode)) +#else + !is_owner_or_cap(inode)) +#endif + RETURN(-EPERM); + + /* b10667: ignore lustre special xattr for now */ + if (!strcmp(name, "hsm") || + ((handler->flags == XATTR_TRUSTED_T && !strcmp(name, "lov")) || + (handler->flags == XATTR_LUSTRE_T && !strcmp(name, "lov")))) + RETURN(0); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "selinux") == 0) + RETURN(-EOPNOTSUPP); + + /* + * In user.* namespace, only regular files and directories can have + * extended attributes. 
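// ll_xattr_set_common() above distinguishes a setxattr() with size 0 (the
// value becomes "") from a removexattr() (NULL value plus XATTR_REPLACE).
// The userspace sketch below exercises both paths through the standard Linux
// xattr syscall wrappers; "user.demo" and the default file path are arbitrary
// example names, the file must already exist, and the target filesystem must
// support user xattrs.
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "./xattr-demo-file";
        const char value[] = "hello";

        // Create or replace user.demo with a real value.
        if (setxattr(path, "user.demo", value, strlen(value), 0) != 0)
                perror("setxattr");

        // Size 0: the attribute now exists with an empty ("") value.
        if (setxattr(path, "user.demo", "", 0, XATTR_REPLACE) != 0)
                perror("setxattr (empty value)");

        // Remove it again; in the kernel this arrives as value == NULL.
        if (removexattr(path, "user.demo") != 0)
                perror("removexattr");

        return 0;
}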
+ */ + if (handler->flags == XATTR_USER_T) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EPERM); + } + + fullname = kasprintf(GFP_KERNEL, "%s%s", xattr_prefix(handler), name); + if (!fullname) + RETURN(-ENOMEM); + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, fullname, + pv, size, flags, ll_i2suppgid(inode), &req); + kfree(fullname); + if (rc) { + if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + RETURN(rc); + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +int ll_get_hsm_state(struct inode *inode, u32 *hus_states) +{ + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (!hus) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (!IS_ERR(op_data)) { + rc = obd_iocontrol(LL_IOC_HSM_STATE_GET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + if (!rc) + *hus_states = hus->hus_states; + else + CDEBUG(D_VFSTRACE, "obd_iocontrol failed. rc = %d\n", + rc); + + ll_finish_md_op_data(op_data); + } else { + rc = PTR_ERR(op_data); + CDEBUG(D_VFSTRACE, "Could not prepare the opdata. rc = %d\n", + rc); + } + OBD_FREE_PTR(hus); + return rc; +} + +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump, size_t size) +{ + struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; + struct lov_user_md *v1 = lump; + bool need_clear_release = false; + bool release_checked = false; + bool is_composite = false; + u16 entry_count = 1; + int rc = 0; + int i; + + if (!lump) + return 0; + + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + if (size < sizeof(*comp_v1)) + return -ERANGE; + + entry_count = comp_v1->lcm_entry_count; + if (size < offsetof(typeof(*comp_v1), lcm_entries[entry_count])) + return -ERANGE; + is_composite = true; + } + + for (i = 0; i < entry_count; i++) { + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + void *ptr = comp_v1; + + if (comp_v1->lcm_entries[i].lcme_offset + sizeof(*v1) > + size) + return -ERANGE; + + ptr += comp_v1->lcm_entries[i].lcme_offset; + v1 = (struct lov_user_md *)ptr; + } + + /* + * Attributes that are saved via getxattr will always + * have the stripe_offset as 0. Instead, the MDS + * should be allowed to pick the starting OST index. + * b=17846 + */ + if (!is_composite && v1->lmm_stripe_offset == 0) + v1->lmm_stripe_offset = -1; + + /* Avoid anyone directly setting the RELEASED flag. */ + if (v1->lmm_pattern & LOV_PATTERN_F_RELEASED) { + if (!release_checked) { + u32 state = HS_NONE; + + rc = ll_get_hsm_state(inode, &state); + if (rc) + return rc; + + if (!(state & HS_ARCHIVED)) + need_clear_release = true; + release_checked = true; + } + if (need_clear_release) + v1->lmm_pattern ^= LOV_PATTERN_F_RELEASED; + } + } + + return rc; +} + +static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, + size_t size) +{ + struct inode *inode = dentry->d_inode; + int rc = 0; + + /* + * It is possible to set an xattr to a "" value of zero size. + * For this case we are going to treat it as a removal. + */ + if (!size && lump) + lump = NULL; + + if (size && size < sizeof(*lump)) { + /* ll_adjust_lum() or ll_lov_user_md_size() might access + * before size - just give up now. 
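// ll_adjust_lum() above never dereferences a composite layout entry before
// checking that both the entry table and each lcme_offset fit inside the
// caller-supplied size.  The standalone sketch below applies the same bounds
// discipline to a made-up blob format; demo_hdr and demo_entry are
// hypothetical types used for illustration, not Lustre structures.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_entry {
        uint32_t de_offset;     // byte offset of the payload in the blob
        uint32_t de_len;        // payload length
};

struct demo_hdr {
        uint16_t dh_count;      // number of entries that follow
        struct demo_entry dh_entries[];
};

static int demo_validate(const void *buf, size_t size)
{
        const struct demo_hdr *hdr = buf;
        size_t need;
        uint16_t i;

        if (size < sizeof(*hdr))
                return -1;

        // The whole entry table must fit before it is walked (the kernel code
        // expresses the same check with offsetof()).
        need = sizeof(*hdr) + (size_t)hdr->dh_count * sizeof(struct demo_entry);
        if (size < need)
                return -1;

        // Each payload must lie entirely inside the buffer.
        for (i = 0; i < hdr->dh_count; i++) {
                const struct demo_entry *de = &hdr->dh_entries[i];

                if (de->de_offset > size || de->de_len > size - de->de_offset)
                        return -1;
        }
        return 0;
}

int main(void)
{
        _Alignas(struct demo_entry) unsigned char blob[64];
        struct demo_hdr hdr = { .dh_count = 1 };
        struct demo_entry ent = { .de_offset = sizeof(hdr) + sizeof(ent),
                                  .de_len = 8 };

        memset(blob, 0, sizeof(blob));
        memcpy(blob, &hdr, sizeof(hdr));
        memcpy(blob + sizeof(hdr), &ent, sizeof(ent));

        printf("blob is %s\n",
               demo_validate(blob, sizeof(blob)) ? "invalid" : "valid");
        return 0;
}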
+ */ + return -ERANGE; + } + rc = ll_adjust_lum(inode, lump, size); + if (rc) + return rc; + + if (lump && S_ISREG(inode->i_mode)) { + u64 it_flags = FMODE_WRITE; + ssize_t lum_size; + + lum_size = ll_lov_user_md_size(lump); + if (lum_size < 0 || size < lum_size) + return -ERANGE; + + rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, lump, + lum_size); + /** + * b=10667: ignore -EEXIST. + * Silently eat error on setting trusted.lov/lustre.lov + * attribute for platforms that added the default option + * to copy all attributes in 'cp' command. Both rsync and + * tar --xattrs also will try to set LOVEA for existing + * files. + */ + if (rc == -EEXIST) + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + if (size != 0 && size < sizeof(struct lov_user_md)) + return -EINVAL; + + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_xattr_set(hd, ns, de, inode, name, value, size, flags) \ + ll_xattr_set(hd, de, inode, name, value, size, flags) +#endif + +static int ll_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + /* lustre/trusted.lov.xxx would be passed through xattr API */ + if (!strcmp(name, "lov")) { + int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : + LPROC_LL_SETXATTR; + + ll_stats_ops_tally(ll_i2sbi(inode), op_type, 1); + + return ll_setstripe_ea(dentry, (struct lov_user_md *)value, + size); + } else if (!strcmp(name, "lma") || !strcmp(name, "link")) { + int op_type = flags == XATTR_REPLACE ? 
LPROC_LL_REMOVEXATTR : + LPROC_LL_SETXATTR; + + ll_stats_ops_tally(ll_i2sbi(inode), op_type, 1); + return 0; + } + + if (strncmp(name, "lov.", 4) == 0 && + (__swab32(((struct lov_user_md *)value)->lmm_magic) & + le32_to_cpu(LOV_MAGIC_MASK)) == le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)value, 0); + + return ll_xattr_set_common(handler, mnt_userns, dentry, inode, name, + value, size, flags); +} + +int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, + size_t size, u64 valid) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + void *xdata; + int rc; + ENTRY; + + if (sbi->ll_xattr_cache_enabled && type != XATTR_ACL_ACCESS_T && + (type != XATTR_SECURITY_T || strcmp(name, "security.selinux"))) { + rc = ll_xattr_cache_get(inode, name, buffer, size, valid); + if (rc == -EAGAIN) + goto getxattr_nocache; + if (rc < 0) + GOTO(out_xattr, rc); + + /* Add "system.posix_acl_access" to the list */ + if (lli->lli_posix_acl && valid & OBD_MD_FLXATTRLS) { + if (size == 0) { + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { + memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, + sizeof(XATTR_NAME_ACL_ACCESS)); + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else { + GOTO(out_xattr, rc = -ERANGE); + } + } + } else { +getxattr_nocache: + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, + name, size, &req); + if (rc < 0) + GOTO(out_xattr, rc); + + /* only detect the xattr size */ + if (size == 0) + GOTO(out, rc); + + if (size < rc) + GOTO(out, rc = -ERANGE); + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + rc); + if (!xdata) + GOTO(out, rc = -EPROTO); + + memcpy(buffer, xdata, rc); + } + + EXIT; + +out_xattr: + if (rc == -EOPNOTSUPP && type == XATTR_USER_T) { + LCONSOLE_INFO("%s: disabling user_xattr feature because " + "it is not supported on the server: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), rc); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } +out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int ll_xattr_get_common(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + char *fullname; + int rc; + + ENTRY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + rc = xattr_type_filter(sbi, handler); + if (rc) + RETURN(rc); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && + !strcmp(name, "selinux")) + RETURN(-EOPNOTSUPP); + +#ifdef CONFIG_FS_POSIX_ACL + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. 
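// ll_xattr_list() above supports the usual two-step protocol: the caller
// first passes size == 0 to learn how large the attribute list is, then calls
// again with a buffer of that size.  The userspace sketch below drives the
// same protocol through listxattr(2) and walks the returned NUL-separated
// names; the default path argument is arbitrary.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : ".";
        ssize_t len, used;
        char *buf, *name;

        // Step 1: probe the required buffer size.
        len = listxattr(path, NULL, 0);
        if (len < 0) {
                perror("listxattr (probe)");
                return 1;
        }
        if (len == 0) {
                printf("%s: no xattrs\n", path);
                return 0;
        }

        buf = malloc(len);
        if (!buf)
                return 1;

        // Step 2: fetch the real list; it may have changed in the meantime.
        used = listxattr(path, buf, len);
        if (used < 0) {
                perror("listxattr");
                free(buf);
                return 1;
        }

        // The buffer holds NUL-terminated names stored back to back.
        for (name = buf; name < buf + used; name += strlen(name) + 1)
                printf("%s\n", name);

        free(buf);
        return 0;
}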
+ */ + if (handler->flags == XATTR_ACL_ACCESS_T) { + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + RETURN(-ENODATA); + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + RETURN(rc); + } + if (handler->flags == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + RETURN(-ENODATA); +#endif + + fullname = kasprintf(GFP_KERNEL, "%s%s", xattr_prefix(handler), name); + if (!fullname) + RETURN(-ENOMEM); + + rc = ll_xattr_list(inode, fullname, handler->flags, buffer, size, + OBD_MD_FLXATTR); + kfree(fullname); + RETURN(rc); +} + +static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) +{ + ssize_t rc; + + if (S_ISREG(inode->i_mode)) { + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct cl_layout cl = { + .cl_buf.lb_buf = buf, + .cl_buf.lb_len = buf_size, + }; + struct lu_env *env; + u16 refcheck; + + if (!obj) + RETURN(-ENODATA); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out_env, rc); + + if (!cl.cl_size) + GOTO(out_env, rc = -ENODATA); + + rc = cl.cl_size; + + if (!buf_size) + GOTO(out_env, rc); + + LASSERT(buf && rc <= buf_size); + + /* + * Do not return layout gen for getxattr() since + * otherwise it would confuse tar --xattr by + * recognizing layout gen as stripe offset when the + * file is restored. See LU-2809. + */ + if ((((struct lov_mds_md *)buf)->lmm_magic & + __swab32(LOV_MAGIC_MAGIC)) == __swab32(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)buf, + cl.cl_size); + + switch (((struct lov_mds_md *)buf)->lmm_magic) { + case LOV_MAGIC_V1: + case LOV_MAGIC_V3: + case LOV_MAGIC_SPECIFIC: + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; + break; + case LOV_MAGIC_COMP_V1: + goto out_env; + default: + CERROR("Invalid LOV magic %08x\n", + ((struct lov_mds_md *)buf)->lmm_magic); + GOTO(out_env, rc = -EINVAL); + } + +out_env: + cl_env_put(env, &refcheck); + + RETURN(rc); + } else if (S_ISDIR(inode->i_mode)) { + struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmm_size, + &req, &root_req, 0); + if (rc < 0) + GOTO(out_req, rc); + + if (!buf_size) + GOTO(out_req, rc = lmm_size); + + if (buf_size < lmm_size) + GOTO(out_req, rc = -ERANGE); + + memcpy(buf, lmm, lmm_size); + GOTO(out_req, rc = lmm_size); +out_req: + if (req) + ptlrpc_req_finished(req); + if (root_req) + ptlrpc_req_finished(root_req); + + RETURN(rc); + } else { + RETURN(-ENODATA); + } +} + +static int ll_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + if (!strcmp(name, "lov")) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + return ll_getxattr_lov(inode, buffer, size); + } + + return ll_xattr_get_common(handler, dentry, inode, name, buffer, size); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + char *xattr_name; + ssize_t rc, rc2; + size_t len, rem; + + LASSERT(inode); + + CDEBUG(D_VFSTRACE, "VFS 
Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); + + rc = ll_xattr_list(inode, NULL, XATTR_OTHER_T, buffer, size, + OBD_MD_FLXATTRLS); + if (rc < 0) + RETURN(rc); + + /* + * If we're being called to get the size of the xattr list + * (size == 0) then just assume that a lustre.lov xattr + * exists. + */ + if (!size) + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); + + xattr_name = buffer; + rem = rc; + + while (rem > 0) { + len = strnlen(xattr_name, rem - 1) + 1; + rem -= len; + if (!xattr_type_filter(sbi, get_xattr_type(xattr_name))) { + /* Skip OK xattr type, leave it in buffer. */ + xattr_name += len; + continue; + } + + /* + * Move up remaining xattrs in buffer + * removing the xattr that is not OK. + */ + memmove(xattr_name, xattr_name + len, rem); + rc -= len; + } + + rc2 = ll_getxattr_lov(inode, NULL, 0); + if (rc2 == -ENODATA) + RETURN(rc); + + if (rc2 < 0) + RETURN(rc2); + + if (size < rc + sizeof(XATTR_LUSTRE_LOV)) + RETURN(-ERANGE); + + memcpy(buffer + rc, XATTR_LUSTRE_LOV, sizeof(XATTR_LUSTRE_LOV)); + + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); +} + +#ifdef HAVE_XATTR_HANDLER_SIMPLIFIED +static int ll_xattr_get_common_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return ll_xattr_get_common(handler, dentry, dentry->d_inode, name, + buffer, size); +} + +static int ll_xattr_get_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return ll_xattr_get(handler, dentry, dentry->d_inode, name, buffer, + size); +} + +static int ll_xattr_set_common_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return ll_xattr_set_common(handler, dentry, dentry->d_inode, name, + value, size, flags); +} + +static int ll_xattr_set_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return ll_xattr_set(handler, dentry, dentry->d_inode, name, value, + size, flags); +} + +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) +const struct xattr_handler *get_xattr_handler(int handler_flag) +{ + int i = 0; + + while (ll_xattr_handlers[i]) { + if (ll_xattr_handlers[i]->flags == handler_flag) + return ll_xattr_handlers[i]; + i++; + } + return NULL; +} + +static int ll_xattr_get_common_3_11(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_get_common(handler, dentry, dentry->d_inode, name, + buffer, size); +} + +static int ll_xattr_get_3_11(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_get(handler, dentry, dentry->d_inode, name, buffer, + size); +} + +static int ll_xattr_set_common_3_11(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_set_common(handler, NULL, dentry, dentry->d_inode, name, + value, size, flags); +} + +static int ll_xattr_set_3_11(struct dentry *dentry, const char *name, + 
const void *value, size_t size, int flags, + int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_set(handler, NULL, dentry, dentry->d_inode, name, value, + size, flags); +} +#endif + +static const struct xattr_handler ll_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = XATTR_USER_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = XATTR_TRUSTED_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_4_3, + .set = ll_xattr_set_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_3_11, + .set = ll_xattr_set_3_11, +#else + .get = ll_xattr_get, + .set = ll_xattr_set, +#endif +}; + +static const struct xattr_handler ll_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = XATTR_SECURITY_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_acl_access_xattr_handler = { +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_ACCESS, +#else + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, +#endif + .flags = XATTR_ACL_ACCESS_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_acl_default_xattr_handler = { +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_DEFAULT, +#else + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, +#endif + .flags = XATTR_ACL_DEFAULT_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_lustre_xattr_handler = { + .prefix = XATTR_LUSTRE_PREFIX, + .flags = XATTR_LUSTRE_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_4_3, + .set = ll_xattr_set_4_3, +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_3_11, + .set = ll_xattr_set_3_11, +#else + .get = ll_xattr_get, + .set = ll_xattr_set, +#endif +}; + +const struct xattr_handler *ll_xattr_handlers[] = { + &ll_user_xattr_handler, + &ll_trusted_xattr_handler, + &ll_security_xattr_handler, +#ifdef CONFIG_FS_POSIX_ACL + &ll_acl_access_xattr_handler, + &ll_acl_default_xattr_handler, +#endif + &ll_lustre_xattr_handler, + NULL, +}; diff --git 
a/drivers/staging/lustrefsx/lustre/llite/xattr26.c b/drivers/staging/lustrefsx/lustre/llite/xattr26.c new file mode 100644 index 0000000000000..28772dd5a74a1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr26.c @@ -0,0 +1,591 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "llite_internal.h" + +/* xattr related to IMA(Integrity Measurement Architecture) */ +#ifndef XATTR_NAME_IMA +#define XATTR_NAME_IMA "security.ima" +#endif +#ifndef XATTR_NAME_EVM +#define XATTR_NAME_EVM "security.evm" +#endif + +static +int get_xattr26_type(const char *name) +{ + if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS)) + return XATTR_ACL_ACCESS_T; + + if (!strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) + return XATTR_ACL_DEFAULT_T; + + if (!strncmp(name, XATTR_USER_PREFIX, + sizeof(XATTR_USER_PREFIX) - 1)) + return XATTR_USER_T; + + if (!strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1)) + return XATTR_TRUSTED_T; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1)) + return XATTR_SECURITY_T; + + if (!strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1)) + return XATTR_LUSTRE_T; + + return XATTR_OTHER_T; +} + +static +int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type) +{ + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && + !(sbi->ll_flags & LL_SBI_ACL)) + return -EOPNOTSUPP; + + if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR)) + return -EOPNOTSUPP; + if (xattr_type == XATTR_TRUSTED_T && !cfs_capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + if (xattr_type == XATTR_OTHER_T) + return -EOPNOTSUPP; + + return 0; +} + +static +int ll_setxattr_common(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int xattr_type, rc; + const char *pv = value; + ENTRY; + + /*FIXME: enable IMA when the conditions are ready */ + if (strncmp(name, XATTR_NAME_IMA, + sizeof(XATTR_NAME_IMA)) == 0 || + strncmp(name, XATTR_NAME_EVM, + sizeof(XATTR_NAME_EVM)) == 0) + return -EOPNOTSUPP; + + xattr_type = get_xattr26_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + RETURN(rc); + + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == 
XATTR_ACL_DEFAULT_T) && +#ifdef HAVE_INODE_OWNER_OR_CAPABLE + !inode_owner_or_capable(inode)) +#else + !is_owner_or_cap(inode)) +#endif + return -EPERM; + + /* b10667: ignore lustre special xattr for now */ + if (strcmp(name, XATTR_NAME_HSM) == 0 || + (xattr_type == XATTR_TRUSTED_T && + strcmp(name, XATTR_NAME_LOV) == 0) || + (xattr_type == XATTR_LUSTRE_T && + strcmp(name, "lustre.lov") == 0)) + RETURN(0); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + RETURN(-EOPNOTSUPP); + + /* In user.* namespace, only regular files and directories can have + * extended attributes. */ + if (xattr_type == XATTR_USER_T) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EPERM); + } + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, name, pv, + size, flags, ll_i2suppgid(inode), &req); + if (rc) { + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + RETURN(rc); + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +static int get_hsm_state(struct inode *inode, __u32 *hus_states) +{ + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (!IS_ERR(op_data)) { + rc = obd_iocontrol(LL_IOC_HSM_STATE_GET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + if (rc == 0) + *hus_states = hus->hus_states; + else + CDEBUG(D_VFSTRACE, "obd_iocontrol failed. rc = %d\n", + rc); + + ll_finish_md_op_data(op_data); + } else { + rc = PTR_ERR(op_data); + CDEBUG(D_VFSTRACE, "Could not prepare the opdata. rc = %d\n", + rc); + } + OBD_FREE_PTR(hus); + return rc; +} + +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) +{ + struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; + struct lov_user_md *v1 = lump; + bool release_checked = false; + bool need_clear_release = false; + __u16 entry_count = 1; + bool is_composite = false; + int rc = 0; + int i; + + if (lump == NULL) + return 0; + + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + entry_count = comp_v1->lcm_entry_count; + is_composite = true; + } + + for (i = 0; i < entry_count; i++) { + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) + v1 = (struct lov_user_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + + /* Attributes that are saved via getxattr will always + * have the stripe_offset as 0. Instead, the MDS + * should be allowed to pick the starting OST index. + * b=17846 */ + if (!is_composite && v1->lmm_stripe_offset == 0) + v1->lmm_stripe_offset = -1; + + /* Avoid anyone directly setting the RELEASED flag. 
*/ + if (v1->lmm_pattern & LOV_PATTERN_F_RELEASED) { + if (!release_checked) { + __u32 state = HS_NONE; + rc = get_hsm_state(inode, &state); + if (rc) + return rc; + if (!(state & HS_ARCHIVED)) + need_clear_release = true; + release_checked = true; + } + if (need_clear_release) + v1->lmm_pattern ^= LOV_PATTERN_F_RELEASED; + } + } + + return rc; +} + +static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, + size_t size) +{ + struct inode *inode = dentry->d_inode; + int rc = 0; + + rc = ll_adjust_lum(inode, lump); + if (rc) + return rc; + + if (lump != NULL && S_ISREG(inode->i_mode)) { + u64 it_flags = FMODE_WRITE; + int lum_size; + + lum_size = ll_lov_user_md_size(lump); + if (lum_size < 0 || size < lum_size) + return -ERANGE; + + rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, lump, + lum_size); + /** + * b=10667: ignore -EEXIST. + * Silently eat error on setting trusted.lov/lustre.lov + * attribute for SuSE 9, it added default option to copy + * all attributes in 'cp' command. rsync, tar --xattrs + * also will try to set LOVEA for existing files. + */ + if (rc == -EEXIST) + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; +} + +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); + + /* lustre/trusted.lov.xxx would be passed through xattr API */ + if (strcmp(name, XATTR_NAME_LOV) == 0 || + strcmp(name, XATTR_LUSTRE_LOV) == 0) + return ll_setstripe_ea(dentry, (struct lov_user_md *)value, + size); + else if (strcmp(name, XATTR_NAME_LMA) == 0 || + strcmp(name, XATTR_NAME_LINK) == 0) + return 0; + + return ll_setxattr_common(inode, name, value, size, flags, + OBD_MD_FLXATTR); +} + +int ll_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); + return ll_setxattr_common(inode, name, NULL, 0, 0, + OBD_MD_FLXATTRRM); +} + +int ll_getxattr_common(struct inode *inode, const char *name, + void *buffer, size_t size, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int xattr_type, rc; + void *xdata; + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + /* listxattr have slightly different behavior from of ext3: + * without 'user_xattr' ext3 will list all xattr names but + * filtered out "^user..*"; we list them all for simplicity. + */ + if (!name) { + xattr_type = XATTR_OTHER_T; + goto do_getxattr; + } + + xattr_type = get_xattr26_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + RETURN(rc); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + RETURN(-EOPNOTSUPP); + +#ifdef CONFIG_FS_POSIX_ACL + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. 
+ */ + if (xattr_type == XATTR_ACL_ACCESS_T) { + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + RETURN(-ENODATA); + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + RETURN(rc); + } + if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + RETURN(-ENODATA); +#endif + +do_getxattr: + if (sbi->ll_xattr_cache_enabled && + xattr_type != XATTR_ACL_ACCESS_T && + (xattr_type != XATTR_SECURITY_T || + strcmp(name, "security.selinux") != 0)) { + rc = ll_xattr_cache_get(inode, name, buffer, size, valid); + if (rc == -EAGAIN) + goto getxattr_nocache; + if (rc < 0) + GOTO(out_xattr, rc); + + /* Add "system.posix_acl_access" to the list */ + if (lli->lli_posix_acl != NULL && valid & OBD_MD_FLXATTRLS) { + if (size == 0) { + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { + memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, + sizeof(XATTR_NAME_ACL_ACCESS)); + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else { + GOTO(out_xattr, rc = -ERANGE); + } + } + } else { +getxattr_nocache: + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, + name, size, &req); + if (rc < 0) + GOTO(out_xattr, rc); + + /* only detect the xattr size */ + if (size == 0) + GOTO(out, rc); + + if (size < rc) + GOTO(out, rc = -ERANGE); + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + rc); + if (!xdata) + GOTO(out, rc = -EPROTO); + + memcpy(buffer, xdata, rc); + } + + EXIT; + +out_xattr: + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO("%s: disabling user_xattr feature because " + "it is not supported on the server: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), rc); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } +out: + ptlrpc_req_finished(req); + return rc; +} + +static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) +{ + ssize_t rc; + + if (S_ISREG(inode->i_mode)) { + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct lu_env *env; + struct cl_layout cl = { + .cl_buf.lb_buf = buf, + .cl_buf.lb_len = buf_size, + }; + __u16 refcheck; + + if (obj == NULL) + RETURN(-ENODATA); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out_env, rc); + + if (cl.cl_size == 0) + GOTO(out_env, rc = -ENODATA); + + rc = cl.cl_size; + + if (buf_size == 0) + GOTO(out_env, rc); + + LASSERT(buf != NULL && rc <= buf_size); + + /* Do not return layout gen for getxattr() since + * otherwise it would confuse tar --xattr by + * recognizing layout gen as stripe offset when the + * file is restored. See LU-2809. 
*/ + if (((struct lov_mds_md *)buf)->lmm_magic == LOV_MAGIC_COMP_V1) + goto out_env; + + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; +out_env: + cl_env_put(env, &refcheck); + + RETURN(rc); + } else if (S_ISDIR(inode->i_mode)) { + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + struct ptlrpc_request *req = NULL; + + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmm_size, + &req, 0); + if (rc < 0) + GOTO(out_req, rc); + + if (buf_size == 0) + GOTO(out_req, rc = lmm_size); + + if (buf_size < lmm_size) + GOTO(out_req, rc = -ERANGE); + + memcpy(buf, lmm, lmm_size); + GOTO(out_req, rc = lmm_size); +out_req: + if (req != NULL) + ptlrpc_req_finished(req); + + return rc; + } else { + RETURN(-ENODATA); + } +} + +ssize_t ll_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t buf_size) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + if (strcmp(name, XATTR_LUSTRE_LOV) == 0 || + strcmp(name, XATTR_NAME_LOV) == 0) + return ll_getxattr_lov(inode, buf, buf_size); + else + return ll_getxattr_common(inode, name, buf, buf_size, + OBD_MD_FLXATTR); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buf, size_t buf_size) +{ + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + char *xattr_name; + ssize_t rc, rc2; + size_t len, rem; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); + + rc = ll_getxattr_common(inode, NULL, buf, buf_size, OBD_MD_FLXATTRLS); + if (rc < 0) + RETURN(rc); + + /* If we're being called to get the size of the xattr list + * (buf_size == 0) then just assume that a lustre.lov xattr + * exists. */ + if (buf_size == 0) + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); + + xattr_name = buf; + rem = rc; + + while (rem > 0) { + len = strnlen(xattr_name, rem - 1) + 1; + rem -= len; + if (xattr_type_filter(sbi, get_xattr26_type(xattr_name)) == 0) { + /* Skip OK xattr type, leave it in buffer. */ + xattr_name += len; + continue; + } + + /* Move up remaining xattrs in buffer removing the + * xattr that is not OK. */ + memmove(xattr_name, xattr_name + len, rem); + rc -= len; + } + + rc2 = ll_getxattr_lov(inode, NULL, 0); + if (rc2 == -ENODATA) + RETURN(rc); + + if (rc2 < 0) + RETURN(rc2); + + if (buf_size < rc + sizeof(XATTR_LUSTRE_LOV)) + RETURN(-ERANGE); + + memcpy(buf + rc, XATTR_LUSTRE_LOV, sizeof(XATTR_LUSTRE_LOV)); + + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c new file mode 100644 index 0000000000000..f1022b0296f47 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c @@ -0,0 +1,552 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2013, 2017, Intel Corporation. + * + * Author: Andrew Perepechko + * + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include "llite_internal.h" + +/* If we ever have hundreds of extended attributes, we might want to consider + * using a hash or a tree structure instead of list for faster lookups. + */ +struct ll_xattr_entry { + struct list_head xe_list; /* protected with + * lli_xattrs_list_rwsem */ + char *xe_name; /* xattr name, \0-terminated */ + char *xe_value; /* xattr value */ + unsigned xe_namelen; /* strlen(xe_name) + 1 */ + unsigned xe_vallen; /* xattr value length */ +}; + +static struct kmem_cache *xattr_kmem; +static struct lu_kmem_descr xattr_caches[] = { + { + .ckd_cache = &xattr_kmem, + .ckd_name = "xattr_kmem", + .ckd_size = sizeof(struct ll_xattr_entry) + }, + { + .ckd_cache = NULL + } +}; + +int ll_xattr_init(void) +{ + return lu_kmem_init(xattr_caches); +} + +void ll_xattr_fini(void) +{ + lu_kmem_fini(xattr_caches); +} + +/** + * Initializes xattr cache for an inode. + * + * This initializes the xattr list and marks cache presence. + */ +static void ll_xattr_cache_init(struct ll_inode_info *lli) +{ + ENTRY; + + LASSERT(lli != NULL); + + INIT_LIST_HEAD(&lli->lli_xattrs); + ll_file_set_flag(lli, LLIF_XATTR_CACHE); +} + +/** + * This looks for a specific extended attribute. + * + * Find in @cache and return @xattr_name attribute in @xattr, + * for the NULL @xattr_name return the first cached @xattr. + * + * \retval 0 success + * \retval -ENODATA if not found + */ +static int ll_xattr_cache_find(struct list_head *cache, + const char *xattr_name, + struct ll_xattr_entry **xattr) +{ + struct ll_xattr_entry *entry; + + ENTRY; + + list_for_each_entry(entry, cache, xe_list) { + /* xattr_name == NULL means look for any entry */ + if (xattr_name == NULL || + strcmp(xattr_name, entry->xe_name) == 0) { + *xattr = entry; + CDEBUG(D_CACHE, "find: [%s]=%.*s\n", + entry->xe_name, entry->xe_vallen, + entry->xe_value); + RETURN(0); + } + } + + RETURN(-ENODATA); +} + +/** + * This adds an xattr. 
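+ * Both the name and the value are copied into cache-owned allocations,
+ * so the caller keeps ownership of the buffers it passed in.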
+ * + * Add @xattr_name attr with @xattr_val value and @xattr_val_len length, + * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for the cached attr + * \retval -EPROTO if duplicate xattr is being added + */ +static int ll_xattr_cache_add(struct list_head *cache, + const char *xattr_name, + const char *xattr_val, + unsigned xattr_val_len) +{ + struct ll_xattr_entry *xattr; + + ENTRY; + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name); + RETURN(-EPROTO); + } + + OBD_SLAB_ALLOC_PTR_GFP(xattr, xattr_kmem, GFP_NOFS); + if (xattr == NULL) { + CDEBUG(D_CACHE, "failed to allocate xattr\n"); + RETURN(-ENOMEM); + } + + xattr->xe_namelen = strlen(xattr_name) + 1; + + OBD_ALLOC(xattr->xe_name, xattr->xe_namelen); + if (!xattr->xe_name) { + CDEBUG(D_CACHE, "failed to alloc xattr name %u\n", + xattr->xe_namelen); + goto err_name; + } + OBD_ALLOC(xattr->xe_value, xattr_val_len); + if (!xattr->xe_value) { + CDEBUG(D_CACHE, "failed to alloc xattr value %d\n", + xattr_val_len); + goto err_value; + } + + memcpy(xattr->xe_name, xattr_name, xattr->xe_namelen); + memcpy(xattr->xe_value, xattr_val, xattr_val_len); + xattr->xe_vallen = xattr_val_len; + list_add(&xattr->xe_list, cache); + + CDEBUG(D_CACHE, "set: [%s]=%.*s\n", xattr_name, + xattr_val_len, xattr_val); + + RETURN(0); +err_value: + OBD_FREE(xattr->xe_name, xattr->xe_namelen); +err_name: + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + RETURN(-ENOMEM); +} + +/** + * This removes an extended attribute from cache. + * + * Remove @xattr_name attribute from @cache. + * + * \retval 0 success + * \retval -ENODATA if @xattr_name is not cached + */ +static int ll_xattr_cache_del(struct list_head *cache, + const char *xattr_name) +{ + struct ll_xattr_entry *xattr; + + ENTRY; + + CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name); + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + list_del(&xattr->xe_list); + OBD_FREE(xattr->xe_name, xattr->xe_namelen); + OBD_FREE(xattr->xe_value, xattr->xe_vallen); + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + RETURN(0); + } + + RETURN(-ENODATA); +} + +/** + * This iterates cached extended attributes. + * + * Walk over cached attributes in @cache and + * fill in @xld_buffer or only calculate buffer + * size if @xld_buffer is NULL. + * + * \retval >= 0 buffer list size + * \retval -ENODATA if the list cannot fit @xld_size buffer + */ +static int ll_xattr_cache_list(struct list_head *cache, + char *xld_buffer, + int xld_size) +{ + struct ll_xattr_entry *xattr, *tmp; + int xld_tail = 0; + + ENTRY; + + list_for_each_entry_safe(xattr, tmp, cache, xe_list) { + CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n", + xld_buffer, xld_tail, xattr->xe_name); + + if (xld_buffer) { + xld_size -= xattr->xe_namelen; + if (xld_size < 0) + break; + memcpy(&xld_buffer[xld_tail], + xattr->xe_name, xattr->xe_namelen); + } + xld_tail += xattr->xe_namelen; + } + + if (xld_size < 0) + RETURN(-ERANGE); + + RETURN(xld_tail); +} + +/** + * Check if the xattr cache is initialized (filled). + * + * \retval 0 @cache is not initialized + * \retval 1 @cache is initialized + */ +static int ll_xattr_cache_valid(struct ll_inode_info *lli) +{ + return ll_file_test_flag(lli, LLIF_XATTR_CACHE); +} + +/** + * This finalizes the xattr cache. + * + * Free all xattr memory. @lli is the inode info pointer. 
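+ * Callers are expected to hold lli_xattrs_list_rwsem for writing.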
+ * + * \retval 0 no error occured + */ +static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli) +{ + ENTRY; + + if (!ll_xattr_cache_valid(lli)) + RETURN(0); + + while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0) + /* empty loop */ ; + + ll_file_clear_flag(lli, LLIF_XATTR_CACHE); + + RETURN(0); +} + +int ll_xattr_cache_destroy(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ENTRY; + + down_write(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_destroy_locked(lli); + up_write(&lli->lli_xattrs_list_rwsem); + + RETURN(rc); +} + +/** + * Match or enqueue a PR lock. + * + * Find or request an LDLM lock with xattr data. + * Since LDLM does not provide API for atomic match_or_enqueue, + * the function handles it with a separate enq lock. + * If successful, the function exits with the list lock held. + * + * \retval 0 no error occured + * \retval -ENOMEM not enough memory + */ +static int ll_xattr_find_get_lock(struct inode *inode, + struct lookup_intent *oit, + struct ptlrpc_request **req) +{ + enum ldlm_mode mode; + struct lustre_handle lockh = { 0 }; + struct md_op_data *op_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_export *exp = sbi->ll_md_exp; + int rc; + + ENTRY; + + mutex_lock(&lli->lli_xattrs_enq_lock); + /* inode may have been shrunk and recreated, so data is gone, match lock + * only when data exists. */ + if (ll_xattr_cache_valid(lli)) { + /* Try matching first. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0, + LCK_PR); + if (mode != 0) { + /* fake oit in mdc_revalidate_lock() manner */ + oit->it_lock_handle = lockh.cookie; + oit->it_lock_mode = mode; + goto out; + } + } + + /* Enqueue if the lock isn't cached locally. */ + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + mutex_unlock(&lli->lli_xattrs_enq_lock); + RETURN(PTR_ERR(op_data)); + } + + op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS; + + rc = md_intent_lock(exp, op_data, oit, req, &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + *req = oit->it_request; + + if (rc < 0) { + CDEBUG(D_CACHE, "md_intent_lock failed with %d for fid "DFID"\n", + rc, PFID(ll_inode2fid(inode))); + mutex_unlock(&lli->lli_xattrs_enq_lock); + RETURN(rc); + } + +out: + down_write(&lli->lli_xattrs_list_rwsem); + mutex_unlock(&lli->lli_xattrs_enq_lock); + + RETURN(0); +} + +/** + * Refill the xattr cache. + * + * Fetch and cache the whole of xattrs for @inode, acquiring a read lock. + * + * \retval 0 no error occured + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + */ +static int ll_xattr_cache_refill(struct inode *inode) +{ + struct lookup_intent oit = { .it_op = IT_GETXATTR }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *xdata, *xval, *xtail, *xvtail; + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body; + __u32 *xsizes; + int rc = 0, i; + + ENTRY; + + rc = ll_xattr_find_get_lock(inode, &oit, &req); + if (rc) + GOTO(err_req, rc); + + /* Do we have the data at this point? */ + if (ll_xattr_cache_valid(lli)) { + ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1); + ll_intent_drop_lock(&oit); + GOTO(err_req, rc = 0); + } + + /* Matched but no cache? Cancelled on error by a parallel refill. 
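+	 * There is no request to parse in that case, so return -EAGAIN and
+	 * let the caller fall back to the uncached getxattr path.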
*/ + if (unlikely(req == NULL)) { + CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n"); + ll_intent_drop_lock(&oit); + GOTO(err_unlock, rc = -EAGAIN); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("no MDT BODY in the refill xattr reply\n"); + GOTO(err_cancel, rc = -EPROTO); + } + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->mbo_eadatasize); + xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, + body->mbo_aclsize); + xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, + body->mbo_max_mdsize * + sizeof(__u32)); + if (xdata == NULL || xval == NULL || xsizes == NULL) { + CERROR("wrong setxattr reply\n"); + GOTO(err_cancel, rc = -EPROTO); + } + + xtail = xdata + body->mbo_eadatasize; + xvtail = xval + body->mbo_aclsize; + + CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); + + ll_xattr_cache_init(lli); + + for (i = 0; i < body->mbo_max_mdsize; i++) { + CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); + /* Perform consistency checks: attr names and vals in pill */ + if (memchr(xdata, 0, xtail - xdata) == NULL) { + CERROR("xattr protocol violation (names are broken)\n"); + rc = -EPROTO; + } else if (xval + *xsizes > xvtail) { + CERROR("xattr protocol violation (vals are broken)\n"); + rc = -EPROTO; + } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) { + rc = -ENOMEM; + } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) { + /* Filter out ACL ACCESS since it's cached separately */ + CDEBUG(D_CACHE, "not caching %s\n", + XATTR_NAME_ACL_ACCESS); + rc = 0; + } else if (!strcmp(xdata, "security.selinux")) { + /* Filter out security.selinux, it is cached in slab */ + CDEBUG(D_CACHE, "not caching security.selinux\n"); + rc = 0; + } else { + rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval, + *xsizes); + } + if (rc < 0) { + ll_xattr_cache_destroy_locked(lli); + GOTO(err_cancel, rc); + } + xdata += strlen(xdata) + 1; + xval += *xsizes; + xsizes++; + } + + if (xdata != xtail || xval != xvtail) + CERROR("a hole in xattr data\n"); + + ll_set_lock_data(sbi->ll_md_exp, inode, &oit, NULL); + ll_intent_drop_lock(&oit); + + ptlrpc_req_finished(req); + RETURN(0); + +err_cancel: + ldlm_lock_decref_and_cancel((struct lustre_handle *) + &oit.it_lock_handle, + oit.it_lock_mode); +err_unlock: + up_write(&lli->lli_xattrs_list_rwsem); +err_req: + if (rc == -ERANGE) + rc = -EAGAIN; + + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Get an xattr value or list xattrs using the write-through cache. + * + * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or + * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode. + * The resulting value/list is stored in @buffer if the former + * is not larger than @size. 
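+ * Passing @size == 0 only reports the required buffer size, e.g.
+ * ll_xattr_cache_get(inode, name, NULL, 0, OBD_MD_FLXATTR).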
+ * + * \retval 0 no error occured + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + * \retval -ERANGE the buffer is not large enough + * \retval -ENODATA no such attr or the list is empty + */ +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + ENTRY; + + LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS)); + + down_read(&lli->lli_xattrs_list_rwsem); + if (!ll_xattr_cache_valid(lli)) { + up_read(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_refill(inode); + if (rc) + RETURN(rc); + downgrade_write(&lli->lli_xattrs_list_rwsem); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1); + } + + if (valid & OBD_MD_FLXATTR) { + struct ll_xattr_entry *xattr; + + rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr); + if (rc == 0) { + rc = xattr->xe_vallen; + /* zero size means we are only requested size in rc */ + if (size != 0) { + if (size >= xattr->xe_vallen) + memcpy(buffer, xattr->xe_value, + xattr->xe_vallen); + else + rc = -ERANGE; + } + } + } else if (valid & OBD_MD_FLXATTRLS) { + rc = ll_xattr_cache_list(&lli->lli_xattrs, + size ? buffer : NULL, size); + } + + GOTO(out, rc); +out: + up_read(&lli->lli_xattrs_list_rwsem); + + RETURN(rc); +} + diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c new file mode 100644 index 0000000000000..83dccf8a52e3e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -0,0 +1,240 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ + +/* + * Copyright (c) 2014 Bull SAS + * + * Copyright (c) 2015, 2016, Intel Corporation. + * Author: Sebastien Buisson sebastien.buisson@bull.net + */ + +/* + * lustre/llite/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif +#include +#include "llite_internal.h" + +#ifndef XATTR_SELINUX_SUFFIX +# define XATTR_SELINUX_SUFFIX "selinux" +#endif + +#ifndef XATTR_NAME_SELINUX +# define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX +#endif + +/* + * Check for LL_SBI_FILE_SECCTX before calling. + */ +int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, + const char **secctx_name, void **secctx, + __u32 *secctx_size) +{ +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURITY + int rc; + + /* + * security_dentry_init_security() is strange. 
Like + * security_inode_init_security() it may return a context (provided a + * Linux security module is enabled) but unlike + * security_inode_init_security() it does not return to us the name of + * the extended attribute to store the context under (for example + * "security.selinux"). So we only call it when we think we know what + * the name of the extended attribute will be. This is OK-ish since + * SELinux is the only module that implements + * security_dentry_init_security(). Note that the NFS client code just + * calls it and assumes that if anything is returned then it must come + * from SELinux. + */ + + if (!selinux_is_enabled()) + return 0; + + rc = security_dentry_init_security(dentry, mode, name, secctx, + secctx_size); + /* Usually, security_dentry_init_security() returns -EOPNOTSUPP when + * SELinux is disabled. + * But on some kernels (e.g. rhel 8.5) it returns 0 when SELinux is + * disabled, and in this case the security context is empty. + */ + if (rc == -EOPNOTSUPP || (rc == 0 && *secctx_size == 0)) + /* do nothing */ + return 0; + if (rc < 0) + return rc; + + *secctx_name = XATTR_NAME_SELINUX; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ + + return 0; +} + +#ifdef HAVE_SECURITY_IINITSEC_CALLBACK +/** + * A helper function for ll_security_inode_init_security() + * that takes care of setting xattrs + * + * Get security context of @inode from @xattr_array, + * and put it in 'security.xxx' xattr of dentry + * stored in @fs_info. + * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to set xattr + */ +static int +ll_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + struct dentry *dentry = fs_info; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name; xattr++) { + char *full_name; + + full_name = kasprintf(GFP_KERNEL, "%s%s", + XATTR_SECURITY_PREFIX, xattr->name); + if (!full_name) { + err = -ENOMEM; + break; + } + + err = ll_vfs_setxattr(dentry, inode, full_name, xattr->value, + xattr->value_len, XATTR_CREATE); + kfree(full_name); + if (err < 0) + break; + } + return err; +} + +/** + * Initializes security context + * + * Get security context of @inode in @dir, + * and put it in 'security.xxx' xattr of @dentry. + * + * \retval 0 success, or SELinux is disabled + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to get security context or set xattr + */ +int +ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + int rc; + + if (!selinux_is_enabled()) + return 0; + + rc = ll_security_inode_init_security(inode, dir, NULL, NULL, 0, + &ll_initxattrs, dentry); + if (rc == -EOPNOTSUPP) + return 0; + + return rc; +} +#else /* !HAVE_SECURITY_IINITSEC_CALLBACK */ +/** + * Initializes security context + * + * Get security context of @inode in @dir, + * and put it in 'security.xxx' xattr of @dentry. 
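+ * In this variant the name/value pair returned by
+ * ll_security_inode_init_security() is stored directly with
+ * __vfs_setxattr().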
+ * + * \retval 0 success, or SELinux is disabled + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to get security context or set xattr + */ +int +ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + char *full_name; + void *value; + char *name; + size_t len; + int err; + + if (!selinux_is_enabled()) + return 0; + + err = ll_security_inode_init_security(inode, dir, &name, &value, &len, + NULL, dentry); + if (err != 0) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + full_name = kasprintf(GFP_KERNEL, "%s%s", XATTR_SECURITY_PREFIX, name); + if (!full_name) + GOTO(out_free, err = -ENOMEM); + + err = __vfs_setxattr(dentry, inode, full_name, value, len, + XATTR_CREATE); + kfree(full_name); +out_free: + kfree(name); + kfree(value); + + return err; +} +#endif /* HAVE_SECURITY_IINITSEC_CALLBACK */ + +/** + * Get security context xattr name used by policy. + * + * \retval >= 0 length of xattr name + * \retval < 0 failure to get security context xattr name + */ +int +ll_listsecurity(struct inode *inode, char *secctx_name, size_t secctx_name_size) +{ + int rc; + + if (!selinux_is_enabled()) + return 0; + +#ifdef HAVE_SECURITY_INODE_LISTSECURITY + rc = security_inode_listsecurity(inode, secctx_name, secctx_name_size); + if (rc >= secctx_name_size) + rc = -ERANGE; + else if (rc >= 0) + secctx_name[rc] = '\0'; + return rc; +#else /* !HAVE_SECURITY_INODE_LISTSECURITY */ + rc = sizeof(XATTR_NAME_SELINUX); + if (secctx_name && rc < secctx_name_size) { + memcpy(secctx_name, XATTR_NAME_SELINUX, rc); + secctx_name[rc] = '\0'; + } else { + rc = -ERANGE; + } + return rc; +#endif /* HAVE_SECURITY_INODE_LISTSECURITY */ +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/Makefile b/drivers/staging/lustrefsx/lustre/lmv/Makefile new file mode 100644 index 0000000000000..40626f49283fb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lmv.o + +lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c new file mode 100644 index 0000000000000..e95930edf1251 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c @@ -0,0 +1,83 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds) +{ + struct obd_device *obd = lmv2obd_dev(lmv); + int rc; + ENTRY; + + /* FIXME: Currently ZFS still use local seq for ROOT unfortunately, and + * this fid_is_local check should be removed once LU-2240 is fixed */ + if (!fid_is_sane(fid) || !(fid_seq_in_fldb(fid_seq(fid)) || + fid_seq_is_local_file(fid_seq(fid)))) { + CERROR("%s: invalid FID "DFID"\n", obd->obd_name, PFID(fid)); + RETURN(-EINVAL); + } + + rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, + LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("Error while looking for mds number. Seq %#llx" + ", err = %d\n", fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + *mds, PFID(fid)); + + if (*mds >= lmv->lmv_mdt_descs.ltd_tgts_size) { + rc = -EINVAL; + CERROR("%s: FLD lookup got invalid mds #%x (max: %x) for fid="DFID": rc = %d\n", + obd->obd_name, *mds, lmv->lmv_mdt_descs.ltd_tgts_size, + PFID(fid), rc); + } + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c new file mode 100644 index 0000000000000..fade4c9e9c31a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -0,0 +1,581 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, + const struct lu_fid *parent_fid, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags, + const char *secctx_name, __u32 secctx_name_size) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct lustre_handle plock; + struct md_op_data *op_data; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int pmode; + int rc = 0; + ENTRY; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + LASSERT((body->mbo_valid & OBD_MD_MDS)); + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->it_lock_mode; + if (pmode) { + plock.cookie = it->it_lock_handle; + it->it_lock_mode = 0; + it->it_request = NULL; + } + + LASSERT(fid_is_sane(&body->mbo_fid1)); + + tgt = lmv_fid2tgt(lmv, &body->mbo_fid1); + if (IS_ERR(tgt)) + GOTO(out, rc = PTR_ERR(tgt)); + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + op_data->op_fid1 = body->mbo_fid1; + /* Sent the parent FID to the remote MDT */ + if (parent_fid != NULL) { + /* The parent fid is only for remote open to + * check whether the open is from OBF, + * see mdt_cross_open */ + LASSERT(it->it_op & IT_OPEN); + op_data->op_fid2 = *parent_fid; + } + + op_data->op_bias = MDS_CROSS_REF; + CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n", + PFID(&body->mbo_fid1), tgt->ltd_index); + + /* ask for security context upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) && + secctx_name_size != 0 && secctx_name != NULL) { + op_data->op_file_secctx_name = secctx_name; + op_data->op_file_secctx_name_size = secctx_name_size; + CDEBUG(D_SEC, "'%.*s' is security xattr to fetch for " + DFID"\n", + secctx_name_size, secctx_name, PFID(&body->mbo_fid1)); + } + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking, + extra_lock_flags); + if (rc) + GOTO(out_free_op_data, rc); + + /* + * LLite needs LOOKUP lock to track dentry revocation in order to + * maintain dcache consistency. Thus drop UPDATE|PERM lock here + * and put LOOKUP in request. + */ + if (it->it_lock_mode != 0) { + it->it_remote_lock_handle = + it->it_lock_handle; + it->it_remote_lock_mode = it->it_lock_mode; + } + + if (pmode) { + it->it_lock_handle = plock.cookie; + it->it_lock_mode = pmode; + } + + EXIT; +out_free_op_data: + OBD_FREE_PTR(op_data); +out: + if (rc && pmode) + ldlm_lock_decref(&plock, pmode); + + ptlrpc_req_finished(*reqp); + *reqp = req; + return rc; +} + +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + struct md_op_data *op_data; + int i; + int valid_stripe_count = 0; + int rc = 0; + + ENTRY; + + /** + * revalidate slaves has some problems, temporarily return, + * we may not need that + */ + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + RETURN(-ENOMEM); + + /** + * Loop over the stripe information, check validity and update them + * from MDS if needed. 
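+	 * Missing stripes are skipped; if no stripe turns out to be
+	 * valid, -ENOENT is returned.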
+ */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct lu_fid fid; + struct lookup_intent it = { .it_op = IT_GETATTR }; + struct lustre_handle *lockh = NULL; + struct lmv_tgt_desc *tgt = NULL; + struct inode *inode; + + fid = lsm->lsm_md_oinfo[i].lmo_fid; + inode = lsm->lsm_md_oinfo[i].lmo_root; + + if (!inode) + continue; + + /* + * Prepare op_data for revalidating. Note that @fid2 shluld be + * defined otherwise it will go to server and take new lock + * which is not needed here. + */ + memset(op_data, 0, sizeof(*op_data)); + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + /* shard revalidate only needs to fetch attributes and UPDATE + * lock, which is similar to the bottom half of remote object + * getattr, set this flag so that MDT skips checking whether + * it's remote object. + */ + op_data->op_bias = MDS_CROSS_REF; + + tgt = lmv_tgt(lmv, lsm->lsm_md_oinfo[i].lmo_mds); + if (!tgt) + GOTO(cleanup, rc = -ENODEV); + + CDEBUG(D_INODE, "Revalidate slave "DFID" -> mds #%u\n", + PFID(&fid), tgt->ltd_index); + + if (req != NULL) { + ptlrpc_req_finished(req); + req = NULL; + } + + rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req, + cb_blocking, extra_lock_flags); + if (rc == -ENOENT) { + /* skip stripe is not exists */ + rc = 0; + continue; + } + + if (rc < 0) + GOTO(cleanup, rc); + + lockh = (struct lustre_handle *)&it.it_lock_handle; + if (rc > 0 && req == NULL) { + /* slave inode is still valid */ + CDEBUG(D_INODE, "slave "DFID" is still valid.\n", + PFID(&fid)); + rc = 0; + } else { + /* refresh slave from server */ + body = req_capsule_server_get(&req->rq_pill, + &RMF_MDT_BODY); + if (body == NULL) { + if (it.it_lock_mode && lockh) { + ldlm_lock_decref(lockh, + it.it_lock_mode); + it.it_lock_mode = 0; + } + GOTO(cleanup, rc = -ENOENT); + } + + i_size_write(inode, body->mbo_size); + inode->i_blocks = body->mbo_blocks; + set_nlink(inode, body->mbo_nlink); + inode->i_atime.tv_sec = body->mbo_atime; + inode->i_ctime.tv_sec = body->mbo_ctime; + inode->i_mtime.tv_sec = body->mbo_mtime; + } + + md_set_lock_data(tgt->ltd_exp, lockh, inode, NULL); + if (it.it_lock_mode != 0 && lockh != NULL) { + ldlm_lock_decref(lockh, it.it_lock_mode); + it.it_lock_mode = 0; + } + + valid_stripe_count++; + } + +cleanup: + if (req != NULL) + ptlrpc_req_finished(req); + + /* if all stripes are invalid, return -ENOENT to notify user */ + if (!rc && !valid_stripe_count) + rc = -ENOENT; + + OBD_FREE_PTR(op_data); + RETURN(rc); +} + +/* + * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) + * may be split dir. + */ +static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + __u64 flags = it->it_flags; + int rc; + + ENTRY; + + if ((it->it_op & IT_CREAT) && !(flags & MDS_OPEN_BY_FID)) { + /* don't allow create under dir with bad hash */ + if (lmv_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_dir_migrating(op_data->op_mea1)) { + if (flags & O_EXCL) { + /* + * open(O_CREAT | O_EXCL) needs to check + * existing name, which should be done on both + * old and new layout, check old layout on + * client side. 
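+				 * Unless the check below reports -ENOENT,
+				 * its result is returned as is; otherwise
+				 * the create proceeds against the new
+				 * (post-migrate) layout.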
+ */ + rc = lmv_migrate_existence_check(lmv, op_data); + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_post_migrate = true; + } else { + /* + * open(O_CREAT) will be sent to MDT in old + * layout first, to avoid creating new file + * under old layout, clear O_CREAT. + */ + it->it_flags &= ~O_CREAT; + } + } + } + +retry: + if (it->it_flags & MDS_OPEN_BY_FID) { + LASSERT(fid_is_sane(&op_data->op_fid2)); + + /* for striped directory, we can't know parent stripe fid + * without name, but we can set it to child fid, and MDT + * will obtain it from linkea in open in such case. */ + if (lmv_dir_striped(op_data->op_mea1)) + op_data->op_fid1 = op_data->op_fid2; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds = tgt->ltd_index; + } else { + LASSERT(fid_is_sane(&op_data->op_fid1)); + LASSERT(fid_is_zero(&op_data->op_fid2)); + LASSERT(op_data->op_name != NULL); + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + /* If it is ready to open the file by FID, do not need + * allocate FID at all, otherwise it will confuse MDT */ + if ((it->it_op & IT_CREAT) && !(it->it_flags & MDS_OPEN_BY_FID)) { + /* + * For lookup(IT_CREATE) cases allocate new fid and setup FLD + * for it. + */ + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc != 0) + RETURN(rc); + } + + CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID"," + " name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_index); + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + /* + * Nothing is found, do not access body->fid1 as it is zero and thus + * pointless. + */ + if ((it->it_disposition & DISP_LOOKUP_NEG) && + !(it->it_disposition & DISP_OPEN_CREATE) && + !(it->it_disposition & DISP_OPEN_OPEN)) { + if (!(it->it_flags & MDS_OPEN_BY_FID) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; + + it->it_flags = flags; + fid_zero(&op_data->op_fid2); + goto retry; + } + + RETURN(rc); + } + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, + cb_blocking, extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + if (rc != 0) + RETURN(rc); + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + } + + RETURN(rc); +} + +/* + * Handler for: getattr, lookup and revalidate cases. 
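+ * The request is sent to the MDT owning the object, resolved either
+ * directly from the FID or through lmv_locate_tgt() for lookups by
+ * name; negative lookups may be retried against another layout or
+ * stripe, see lmv_dir_retry_check_update().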
+ */ +static int +lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc; + ENTRY; + +retry: + if (op_data->op_flags & MF_GETATTR_BY_FID) { + /* getattr by FID, replace fid1 with stripe FID, + * NB, don't replace if name is "/", because it may be a subtree + * mount, and if it's a striped directory, fid1 will be replaced + * to stripe FID by hash, while fid2 is master object FID, which + * will be treated as a remote object if the two FIDs are + * located on different MDTs, and LOOKUP lock can't be fetched. + */ + LASSERT(op_data->op_name); + if (op_data->op_namelen != 1 || + strncmp(op_data->op_name, "/", 1) != 0) { + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + /* name is used to locate stripe target, clear it here + * to avoid packing name in request, so that MDS knows + * it's getattr by FID. + */ + op_data->op_name = NULL; + op_data->op_namelen = 0; + + /* getattr request is sent to MDT where fid2 inode is */ + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + } else if (op_data->op_name) { + /* getattr by name */ + tgt = lmv_locate_tgt(lmv, op_data); + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + } else { + /* old way to getattr by FID, parent FID not packed */ + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + } + + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID + ", name='%s' -> mds #%u\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + op_data->op_name ? op_data->op_name : "", + tgt->ltd_index); + + op_data->op_bias &= ~MDS_CROSS_REF; + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc < 0) + RETURN(rc); + + if (*reqp == NULL) { + /* If RPC happens, lsm information will be revalidated + * during update_inode process (see ll_update_lsm_md) */ + if (lmv_dir_striped(op_data->op_mea2)) { + rc = lmv_revalidate_slaves(exp, op_data->op_mea2, + cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + } + RETURN(rc); + } else if (it_disposition(it, DISP_LOOKUP_NEG) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; + + goto retry; + } + + if (!it_has_reply_body(it)) + RETURN(0); + + /* + * MDS has returned success. Probably name has been resolved in + * remote inode. Let's check this. + */ + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. 
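+	 * Otherwise follow the remote reference via lmv_intent_remote()
+	 * below.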
*/ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, + extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + if (rc != 0) + RETURN(rc); + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + } + + RETURN(rc); +} + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + ENTRY; + + LASSERT(it != NULL); + LASSERT(fid_is_sane(&op_data->op_fid1)); + + CDEBUG(D_INODE, "INTENT LOCK '%s' for "DFID" '%.*s' on "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid2), + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1)); + + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT | IT_GETXATTR)) + rc = lmv_intent_lookup(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + else if (it->it_op & IT_OPEN) + rc = lmv_intent_open(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + else + LBUG(); + + if (rc < 0) { + struct lustre_handle lock_handle; + + if (it->it_lock_mode != 0) { + lock_handle.cookie = it->it_lock_handle; + ldlm_lock_decref_and_cancel(&lock_handle, + it->it_lock_mode); + } + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + + if (it->it_remote_lock_mode != 0) { + lock_handle.cookie = it->it_remote_lock_handle; + ldlm_lock_decref_and_cancel(&lock_handle, + it->it_remote_lock_mode); + } + + it->it_remote_lock_handle = 0; + it->it_remote_lock_mode = 0; + } + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h new file mode 100644 index 0000000000000..84a6d98f44c46 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LMV_INTERNAL_H_ +#define _LMV_INTERNAL_H_ + +#include +#include + +#define LMV_MAX_TGT_COUNT 128 + +#define LL_IT2STR(it) \ + ((it) ? 
ldlm_it2str((it)->it_op) : "0") + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *, int); +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags); + +int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **preq); +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate); + +int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt); + +static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) +{ + return container_of0(lmv, struct obd_device, u.lmv); +} + +static inline struct lu_tgt_desc * +lmv_tgt(struct lmv_obd *lmv, __u32 index) +{ + return index < lmv->lmv_mdt_descs.ltd_tgt_bitmap->size ? + LTD_TGT(&lmv->lmv_mdt_descs, index) : NULL; +} + +static inline bool +lmv_mdt0_inited(struct lmv_obd *lmv) +{ + return lmv->lmv_mdt_descs.ltd_tgt_bitmap->size > 0 && + cfs_bitmap_check(lmv->lmv_mdt_descs.ltd_tgt_bitmap, 0); +} + +#define lmv_foreach_tgt(lmv, tgt) ltd_foreach_tgt(&(lmv)->lmv_mdt_descs, tgt) + +#define lmv_foreach_tgt_safe(lmv, tgt, tmp) \ + ltd_foreach_tgt_safe(&(lmv)->lmv_mdt_descs, tgt, tmp) + +static inline +struct lu_tgt_desc *lmv_first_connected_tgt(struct lmv_obd *lmv) +{ + struct lu_tgt_desc *tgt; + + tgt = ltd_first_tgt(&lmv->lmv_mdt_descs); + while (tgt && !tgt->ltd_exp) + tgt = ltd_next_tgt(&lmv->lmv_mdt_descs, tgt); + + return tgt; +} + +static inline +struct lu_tgt_desc *lmv_next_connected_tgt(struct lmv_obd *lmv, + struct lu_tgt_desc *tgt) +{ + do { + tgt = ltd_next_tgt(&lmv->lmv_mdt_descs, tgt); + } while (tgt && !tgt->ltd_exp); + + return tgt; +} + +#define lmv_foreach_connected_tgt(lmv, tgt) \ + for (tgt = lmv_first_connected_tgt(lmv); tgt; \ + tgt = lmv_next_connected_tgt(lmv, tgt)) + +static inline int +lmv_fid2tgt_index(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + u32 mdt_idx; + int rc; + + if (lmv->lmv_mdt_count < 2) + return 0; + + rc = lmv_fld_lookup(lmv, fid, &mdt_idx); + if (rc < 0) + return rc; + + return mdt_idx; +} + +static inline struct lmv_tgt_desc * +lmv_fid2tgt(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + struct lu_tgt_desc *tgt; + int index; + + index = lmv_fid2tgt_index(lmv, fid); + if (index < 0) + return ERR_PTR(index); + + tgt = lmv_tgt(lmv, index); + + return tgt ? 
tgt : ERR_PTR(-ENODEV); +} + +static inline int lmv_stripe_md_size(int stripe_count) +{ + struct lmv_stripe_md *lsm; + + return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); +} + +/* for file under migrating directory, return the target stripe info */ +static inline const struct lmv_oinfo * +lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, + int namelen, bool post_migrate) +{ + __u32 hash_type = lsm->lsm_md_hash_type; + __u32 stripe_count = lsm->lsm_md_stripe_count; + int stripe_index; + + LASSERT(lmv_dir_striped(lsm)); + + if (hash_type & LMV_HASH_FLAG_MIGRATION) { + if (post_migrate) { + hash_type &= ~LMV_HASH_FLAG_MIGRATION; + stripe_count = lsm->lsm_md_migrate_offset; + } else { + hash_type = lsm->lsm_md_migrate_hash; + stripe_count -= lsm->lsm_md_migrate_offset; + } + } + + stripe_index = lmv_name_to_stripe_index(hash_type, stripe_count, + name, namelen); + if (stripe_index < 0) + return ERR_PTR(stripe_index); + + if ((lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) && !post_migrate) + stripe_index += lsm->lsm_md_migrate_offset; + + if (stripe_index >= lsm->lsm_md_stripe_count) { + CERROR("stripe_index %d stripe_count %d hash_type %#x " + "migrate_offset %d migrate_hash %#x name %.*s\n", + stripe_index, lsm->lsm_md_stripe_count, + lsm->lsm_md_hash_type, lsm->lsm_md_migrate_offset, + lsm->lsm_md_migrate_hash, namelen, name); + return ERR_PTR(-EBADF); + } + + return &lsm->lsm_md_oinfo[stripe_index]; +} + +static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_mea1; + + if (!lsm) + return false; + + if (lmv_dir_migrating(lsm) && !op_data->op_post_migrate) { + op_data->op_post_migrate = true; + return true; + } + + if (lmv_dir_bad_hash(lsm) && + op_data->op_stripe_index < lsm->lsm_md_stripe_count - 1) { + op_data->op_stripe_index++; + return true; + } + + return false; +} + +struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, + struct md_op_data *op_data); +int lmv_migrate_existence_check(struct lmv_obd *lmv, + struct md_op_data *op_data); + +/* lproc_lmv.c */ +int lmv_tunables_init(struct obd_device *obd); +#endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c new file mode 100644 index 0000000000000..dce03d45f43e1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -0,0 +1,3695 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LMV + +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_check_connect(struct obd_device *obd); + +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate) +{ + if (tgt->ltd_active == activate) + return; + + tgt->ltd_active = activate; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count += + (activate ? 1 : -1); + + tgt->ltd_exp->exp_obd->obd_inactive = !activate; +} + +/** + * Error codes: + * + * -EINVAL : UUID can't be found in the LMV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD of the wrong type (!) + */ +static int lmv_set_mdc_active(struct lmv_obd *lmv, + const struct obd_uuid *uuid, + int activate) +{ + struct lu_tgt_desc *tgt = NULL; + struct obd_device *obd; + int rc = 0; + + ENTRY; + + CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", + lmv, uuid->uuid, activate); + + spin_lock(&lmv->lmv_lock); + lmv_foreach_connected_tgt(lmv, tgt) { + CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", + tgt->ltd_index, tgt->ltd_uuid.uuid, + tgt->ltd_exp->exp_handle.h_cookie); + + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (!tgt) + GOTO(out_lmv_lock, rc = -EINVAL); + + obd = class_exp2obd(tgt->ltd_exp); + if (obd == NULL) + GOTO(out_lmv_lock, rc = -ENOTCONN); + + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, tgt->ltd_index); + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); + + if (tgt->ltd_active == activate) { + CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + activate ? "" : "in"); + GOTO(out_lmv_lock, rc); + } + + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + activate ? "" : "in"); + lmv_activate_target(lmv, tgt, activate); + EXIT; + + out_lmv_lock: + spin_unlock(&lmv->lmv_lock); + return rc; +} + +struct obd_uuid *lmv_get_uuid(struct obd_export *exp) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + return (tgt == NULL) ? NULL : obd_get_uuid(tgt->ltd_exp); +} + +static int lmv_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) +{ + struct obd_connect_data *conn_data; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *uuid; + int rc = 0; + ENTRY; + + if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + RETURN(-EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + /* + * Set MDC as active before notifying the observer, so the + * observer can use the MDC normally. + */ + rc = lmv_set_mdc_active(lmv, uuid, + ev == OBD_NOTIFY_ACTIVE); + if (rc) { + CERROR("%sactivation of %s failed: %d\n", + ev == OBD_NOTIFY_ACTIVE ? "" : "de", + uuid->uuid, rc); + RETURN(rc); + } + } else if (ev == OBD_NOTIFY_OCD) { + conn_data = &watched->u.cli.cl_import->imp_connect_data; + /* + * XXX: Make sure that ocd_connect_flags from all targets are + * the same. Otherwise one of MDTs runs wrong version or + * something like this. 
--umka + */ + obd->obd_self_export->exp_connect_data = *conn_data; + } + + /* + * Pass the notification up the chain. + */ + if (obd->obd_observer) + rc = obd_notify(obd->obd_observer, watched, ev); + + RETURN(rc); +} + +static int lmv_connect(const struct lu_env *env, + struct obd_export **pexp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle conn = { 0 }; + struct obd_export *exp; + int rc; + ENTRY; + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("class_connection() returned %d\n", rc); + RETURN(rc); + } + + exp = class_conn2export(&conn); + + lmv->connected = 0; + lmv->conn_data = *data; + + lmv->lmv_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); + if (!lmv->lmv_tgts_kobj) { + CERROR("%s: cannot create /sys/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, obd->obd_name); + } + + rc = lmv_check_connect(obd); + if (rc != 0) + GOTO(out_sysfs, rc); + + *pexp = exp; + + RETURN(rc); + +out_sysfs: + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); + + class_disconnect(exp); + + return rc; +} + +static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, + __u32 def_easize) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int change = 0; + int rc = 0; + + ENTRY; + + if (lmv->max_easize < easize) { + lmv->max_easize = easize; + change = 1; + } + if (lmv->max_def_easize < def_easize) { + lmv->max_def_easize = def_easize; + change = 1; + } + + if (change == 0) + RETURN(0); + + if (lmv->connected == 0) + RETURN(0); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_active) + continue; + + rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); + if (rc) { + CERROR("%s: obd_init_ea_size() failed on MDT target %d:" + " rc = %d\n", obd->obd_name, tgt->ltd_index, rc); + break; + } + } + RETURN(rc); +} + +#define MAX_STRING_SIZE 128 + +int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + struct obd_export *mdc_exp; + struct lu_fld_target target; + int rc; + ENTRY; + + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + tgt->ltd_uuid.uuid, obd->obd_uuid.uuid); + + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + rc = obd_connect(NULL, &mdc_exp, mdc_obd, &obd->obd_uuid, + &lmv->conn_data, NULL); + if (rc) { + CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + /* + * Init fid sequence client for this mdc and add new fld target. + */ + rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA); + if (rc) + RETURN(rc); + + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_index; + + fld_client_add_target(&lmv->lmv_fld, &target); + + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + if (obd->obd_observer) { + /* + * Tell the observer about the new target. 
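+		 * If the notification fails, the just-established
+		 * connection is dropped again.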
+ */ + rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, + OBD_NOTIFY_ACTIVE); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + } + + tgt->ltd_active = 1; + tgt->ltd_exp = mdc_exp; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count++; + + md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize); + + rc = lu_qos_add_tgt(&lmv->lmv_qos, tgt); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + + CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + lmv_statfs_check_update(obd, tgt); + + if (lmv->lmv_tgts_kobj) + /* Even if we failed to create the link, that's fine */ + rc = sysfs_create_link(lmv->lmv_tgts_kobj, + &mdc_obd->obd_kset.kobj, + mdc_obd->obd_name); + RETURN(0); +} + +static void lmv_del_target(struct lmv_obd *lmv, struct lu_tgt_desc *tgt) +{ + LASSERT(tgt); + ltd_del_tgt(&lmv->lmv_mdt_descs, tgt); + OBD_FREE_PTR(tgt); +} + +static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen) +{ + struct obd_device *mdc_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lu_tgt_descs *ltd = &lmv->lmv_mdt_descs; + int rc = 0; + + ENTRY; + + CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); + mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("%s: Target %s not attached: rc = %d\n", + obd->obd_name, uuidp->uuid, -EINVAL); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) + RETURN(-ENOMEM); + + mutex_init(&tgt->ltd_fid_mutex); + tgt->ltd_index = index; + tgt->ltd_uuid = *uuidp; + tgt->ltd_active = 0; + + mutex_lock(<d->ltd_mutex); + rc = ltd_add_tgt(ltd, tgt); + mutex_unlock(<d->ltd_mutex); + + if (rc) + GOTO(out_tgt, rc); + + if (!lmv->connected) + /* lmv_check_connect() will connect this target. 
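+		 * The MDC connection is established later, when the LMV
+		 * itself gets connected.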
*/ + RETURN(0); + + rc = lmv_connect_mdc(obd, tgt); + if (!rc) { + int easize = sizeof(struct lmv_stripe_md) + + lmv->lmv_mdt_count * sizeof(struct lu_fid); + + lmv_init_ea_size(obd->obd_self_export, easize, 0); + } + + RETURN(rc); + +out_tgt: + OBD_FREE_PTR(tgt); + return rc; +} + +static int lmv_check_connect(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int easize; + int rc; + + ENTRY; + + if (lmv->connected) + RETURN(0); + + mutex_lock(&lmv->lmv_mdt_descs.ltd_mutex); + if (lmv->connected) + GOTO(unlock, rc = 0); + + if (!lmv->lmv_mdt_count) { + CERROR("%s: no targets configured: rc = -EINVAL\n", + obd->obd_name); + GOTO(unlock, rc = -EINVAL); + } + + if (!lmv_mdt0_inited(lmv)) { + CERROR("%s: no target configured for index 0: rc = -EINVAL.\n", + obd->obd_name); + GOTO(unlock, rc = -EINVAL); + } + + CDEBUG(D_CONFIG, "Time to connect %s to %s\n", + obd->obd_uuid.uuid, obd->obd_name); + + lmv_foreach_tgt(lmv, tgt) { + rc = lmv_connect_mdc(obd, tgt); + if (rc) + GOTO(out_disc, rc); + } + + lmv->connected = 1; + easize = lmv_mds_md_size(lmv->lmv_mdt_count, LMV_MAGIC); + lmv_init_ea_size(obd->obd_self_export, easize, 0); + EXIT; +unlock: + mutex_unlock(&lmv->lmv_mdt_descs.ltd_mutex); + + return rc; + +out_disc: + lmv_foreach_tgt(lmv, tgt) { + tgt->ltd_active = 0; + if (!tgt->ltd_exp) + continue; + + --lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count; + obd_disconnect(tgt->ltd_exp); + } + + goto unlock; +} + +static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + int rc; + ENTRY; + + LASSERT(tgt != NULL); + LASSERT(obd != NULL); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + + if (mdc_obd) { + mdc_obd->obd_force = obd->obd_force; + mdc_obd->obd_fail = obd->obd_fail; + mdc_obd->obd_no_recov = obd->obd_no_recov; + + if (lmv->lmv_tgts_kobj) + sysfs_remove_link(lmv->lmv_tgts_kobj, + mdc_obd->obd_name); + } + + rc = obd_fid_fini(tgt->ltd_exp->exp_obd); + if (rc) + CERROR("Can't finanize fids factory\n"); + + CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n", + tgt->ltd_exp->exp_obd->obd_name, + tgt->ltd_exp->exp_obd->obd_uuid.uuid); + + obd_register_observer(tgt->ltd_exp->exp_obd, NULL); + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + if (tgt->ltd_active) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + } + } + + lmv_activate_target(lmv, tgt, 0); + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lmv_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + lmv_foreach_connected_tgt(lmv, tgt) + lmv_disconnect_mdc(obd, tgt); + + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); + + if (!lmv->connected) + class_export_put(exp); + rc = class_disconnect(exp); + lmv->connected = 0; + + RETURN(rc); +} + +static int lmv_fid2path(struct obd_export *exp, int len, void *karg, + void __user *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + struct getinfo_fid2path *gf; + struct lmv_tgt_desc *tgt; + struct getinfo_fid2path *remote_gf = NULL; + struct lu_fid root_fid; + int remote_gf_size = 0; + int rc; + + gf = karg; + tgt = lmv_fid2tgt(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + root_fid = *gf->gf_u.gf_root_fid; + LASSERT(fid_is_sane(&root_fid)); + +repeat_fid2path: + rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, 
uarg); + if (rc != 0 && rc != -EREMOTE) + GOTO(out_fid2path, rc); + + /* If remote_gf != NULL, it means just building the + * path on the remote MDT, copy this path segement to gf */ + if (remote_gf != NULL) { + struct getinfo_fid2path *ori_gf; + char *ptr; + int len; + + ori_gf = (struct getinfo_fid2path *)karg; + if (strlen(ori_gf->gf_u.gf_path) + 1 + + strlen(gf->gf_u.gf_path) + 1 > ori_gf->gf_pathlen) + GOTO(out_fid2path, rc = -EOVERFLOW); + + ptr = ori_gf->gf_u.gf_path; + + len = strlen(gf->gf_u.gf_path); + /* move the current path to the right to release space + * for closer-to-root part */ + memmove(ptr + len + 1, ptr, strlen(ori_gf->gf_u.gf_path)); + memcpy(ptr, gf->gf_u.gf_path, len); + ptr[len] = '/'; + } + + CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n", + tgt->ltd_exp->exp_obd->obd_name, + gf->gf_u.gf_path, PFID(&gf->gf_fid), gf->gf_recno, + gf->gf_linkno); + + if (rc == 0) + GOTO(out_fid2path, rc); + + /* sigh, has to go to another MDT to do path building further */ + if (remote_gf == NULL) { + remote_gf_size = sizeof(*remote_gf) + PATH_MAX; + OBD_ALLOC(remote_gf, remote_gf_size); + if (remote_gf == NULL) + GOTO(out_fid2path, rc = -ENOMEM); + remote_gf->gf_pathlen = PATH_MAX; + } + + if (!fid_is_sane(&gf->gf_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, + PFID(&gf->gf_fid), -EINVAL); + GOTO(out_fid2path, rc = -EINVAL); + } + + tgt = lmv_fid2tgt(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + GOTO(out_fid2path, rc = -EINVAL); + + remote_gf->gf_fid = gf->gf_fid; + remote_gf->gf_recno = -1; + remote_gf->gf_linkno = -1; + memset(remote_gf->gf_u.gf_path, 0, remote_gf->gf_pathlen); + *remote_gf->gf_u.gf_root_fid = root_fid; + gf = remote_gf; + goto repeat_fid2path; + +out_fid2path: + if (remote_gf != NULL) + OBD_FREE(remote_gf, remote_gf_size); + RETURN(rc); +} + +static int lmv_hsm_req_count(struct lmv_obd *lmv, + const struct hsm_user_request *hur, + const struct lmv_tgt_desc *tgt_mds) +{ + struct lmv_tgt_desc *curr_tgt; + __u32 i; + int nr = 0; + + /* count how many requests must be sent to the given target */ + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_fid2tgt(lmv, &hur->hur_user_item[i].hui_fid); + if (IS_ERR(curr_tgt)) + RETURN(PTR_ERR(curr_tgt)); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) + nr++; + } + return nr; +} + +static int lmv_hsm_req_build(struct lmv_obd *lmv, + struct hsm_user_request *hur_in, + const struct lmv_tgt_desc *tgt_mds, + struct hsm_user_request *hur_out) +{ + __u32 i, nr_out; + struct lmv_tgt_desc *curr_tgt; + + /* build the hsm_user_request for the given target */ + hur_out->hur_request = hur_in->hur_request; + nr_out = 0; + for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_fid2tgt(lmv, &hur_in->hur_user_item[i].hui_fid); + if (IS_ERR(curr_tgt)) + RETURN(PTR_ERR(curr_tgt)); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) { + hur_out->hur_user_item[nr_out] = + hur_in->hur_user_item[i]; + nr_out++; + } + } + hur_out->hur_request.hr_itemcount = nr_out; + memcpy(hur_data(hur_out), hur_data(hur_in), + hur_in->hur_request.hr_data_len); + + RETURN(0); +} + +static int lmv_hsm_ct_unregister(struct obd_device *obd, unsigned int cmd, + int len, struct lustre_kernelcomm *lk, + void __user *uarg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + int rc; + + ENTRY; + + /* unregister request (call from llapi_hsm_copytool_fini) */ + lmv_foreach_connected_tgt(lmv, tgt) + /* best effort: try to clean as much as 
possible + * (continue on error) */ + obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); + + /* Whatever the result, remove copytool from kuc groups. + * Unreached coordinators will get EPIPE on next requests + * and will unregister automatically. + */ + rc = libcfs_kkuc_group_rem(&obd->obd_uuid, lk->lk_uid, lk->lk_group); + + RETURN(rc); +} + +static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, + int len, struct lustre_kernelcomm *lk, + void __user *uarg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct file *filp; + bool any_set = false; + struct kkuc_ct_data *kcd; + size_t kcd_size; + struct lu_tgt_desc *tgt; + __u32 i; + int err; + int rc = 0; + + ENTRY; + + filp = fget(lk->lk_wfd); + if (!filp) + RETURN(-EBADF); + + if (lk->lk_flags & LK_FLG_DATANR) + kcd_size = offsetof(struct kkuc_ct_data, + kcd_archives[lk->lk_data_count]); + else + kcd_size = sizeof(*kcd); + + OBD_ALLOC(kcd, kcd_size); + if (kcd == NULL) + GOTO(err_fput, rc = -ENOMEM); + + kcd->kcd_nr_archives = lk->lk_data_count; + if (lk->lk_flags & LK_FLG_DATANR) { + kcd->kcd_magic = KKUC_CT_DATA_ARRAY_MAGIC; + if (lk->lk_data_count > 0) + memcpy(kcd->kcd_archives, lk->lk_data, + sizeof(*kcd->kcd_archives) * lk->lk_data_count); + } else { + kcd->kcd_magic = KKUC_CT_DATA_BITMAP_MAGIC; + } + + rc = libcfs_kkuc_group_add(filp, &obd->obd_uuid, lk->lk_uid, + lk->lk_group, kcd, kcd_size); + OBD_FREE(kcd, kcd_size); + if (rc) + GOTO(err_fput, rc); + + /* All or nothing: try to register to all MDS. + * In case of failure, unregister from previous MDS, + * except if it because of inactive target. */ + lmv_foreach_connected_tgt(lmv, tgt) { + err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); + if (err) { + if (tgt->ltd_active) { + /* permanent error */ + CERROR("%s: iocontrol MDC %s on MDT" + " idx %d cmd %x: err = %d\n", + lmv2obd_dev(lmv)->obd_name, + tgt->ltd_uuid.uuid, tgt->ltd_index, cmd, + err); + rc = err; + lk->lk_flags |= LK_FLG_STOP; + i = tgt->ltd_index; + /* unregister from previous MDS */ + lmv_foreach_connected_tgt(lmv, tgt) { + if (tgt->ltd_index >= i) + break; + + obd_iocontrol(cmd, tgt->ltd_exp, len, + lk, uarg); + } + GOTO(err_kkuc_rem, rc); + } + /* else: transient error. 
+ * kuc will register to the missing MDT + * when it is back */ + } else { + any_set = true; + } + } + + if (!any_set) + /* no registration done: return error */ + GOTO(err_kkuc_rem, rc = -ENOTCONN); + + RETURN(0); + +err_kkuc_rem: + libcfs_kkuc_group_rem(&obd->obd_uuid, lk->lk_uid, lk->lk_group); + +err_fput: + fput(filp); + return rc; +} + +static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void __user *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + struct lu_tgt_desc *tgt = NULL; + int set = 0; + __u32 count = lmv->lmv_mdt_count; + int rc = 0; + + ENTRY; + + if (count == 0) + RETURN(-ENOTTY); + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *mdc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + + if (index >= lmv->lmv_mdt_descs.ltd_tgts_size) + RETURN(-ENODEV); + + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_active) + RETURN(-ENODATA); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + if (!mdc_obd) + RETURN(-EINVAL); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + 0); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_MDTIDX) { + tgt = lmv_tgt(lmv, qctl->qc_idx); + } else if (qctl->qc_valid == QC_UUID) { + lmv_foreach_tgt(lmv, tgt) { + if (!obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (!tgt->ltd_exp) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + case LL_IOC_GET_CONNECT_FLAGS: { + tgt = lmv_tgt(lmv, 0); + rc = -ENODATA; + if (tgt && tgt->ltd_exp) + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_FID2MDTIDX: { + struct lu_fid *fid = karg; + int mdt_index; + + rc = lmv_fld_lookup(lmv, fid, &mdt_index); + if (rc != 0) + RETURN(rc); + + /* Note: this is from llite(see ll_dir_ioctl()), @uarg does not + * point to user space memory for FID2MDTIDX. 
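+		 * The MDT index is therefore written straight into @uarg below
+		 * rather than being copied out with copy_to_user().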
*/ + *(__u32 *)uarg = mdt_index; + break; + } + case OBD_IOC_FID2PATH: { + rc = lmv_fid2path(exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_STATE_GET: + case LL_IOC_HSM_STATE_SET: + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data = karg; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_PROGRESS: { + const struct hsm_progress_kernel *hpk = karg; + + tgt = lmv_fid2tgt(lmv, &hpk->hpk_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur = karg; + unsigned int reqcount = hur->hur_request.hr_itemcount; + + if (reqcount == 0) + RETURN(0); + + /* if the request is about a single fid + * or if there is a single MDS, no need to split + * the request. */ + if (reqcount == 1 || count == 1) { + tgt = lmv_fid2tgt(lmv, &hur->hur_user_item[0].hui_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + } else { + /* split fid list to their respective MDS */ + lmv_foreach_connected_tgt(lmv, tgt) { + int nr, rc1; + size_t reqlen; + struct hsm_user_request *req; + + nr = lmv_hsm_req_count(lmv, hur, tgt); + if (nr < 0) + RETURN(nr); + if (nr == 0) /* nothing for this MDS */ + continue; + + /* build a request with fids for this MDS */ + reqlen = offsetof(typeof(*hur), + hur_user_item[nr]) + + hur->hur_request.hr_data_len; + OBD_ALLOC_LARGE(req, reqlen); + if (req == NULL) + RETURN(-ENOMEM); + rc1 = lmv_hsm_req_build(lmv, hur, tgt, req); + if (rc1 < 0) + GOTO(hsm_req_err, rc1); + rc1 = obd_iocontrol(cmd, tgt->ltd_exp, reqlen, + req, uarg); +hsm_req_err: + if (rc1 != 0 && rc == 0) + rc = rc1; + OBD_FREE_LARGE(req, reqlen); + } + } + break; + } + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt1, *tgt2; + + tgt1 = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt1)) + RETURN(PTR_ERR(tgt1)); + + tgt2 = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt2)) + RETURN(PTR_ERR(tgt2)); + + if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL)) + RETURN(-EINVAL); + + /* only files on same MDT can have their layouts swapped */ + if (tgt1->ltd_index != tgt2->ltd_index) + RETURN(-EPERM); + + rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_CT_START: { + struct lustre_kernelcomm *lk = karg; + if (lk->lk_flags & LK_FLG_STOP) + rc = lmv_hsm_ct_unregister(obddev, cmd, len, lk, uarg); + else + rc = lmv_hsm_ct_register(obddev, cmd, len, lk, uarg); + break; + } + default: + lmv_foreach_connected_tgt(lmv, tgt) { + struct obd_device *mdc_obd; + int err; + + /* ll_umount_begin() sets force flag but for lmv, not + * mdc. 
Let's pass it through */ + mdc_obd = class_exp2obd(tgt->ltd_exp); + mdc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + if (err) { + if (tgt->ltd_active) { + CERROR("error: iocontrol MDC %s on MDT" + " idx %d cmd %x: err = %d\n", + tgt->ltd_uuid.uuid, + tgt->ltd_index, cmd, err); + if (!rc) + rc = err; + } + } else + set = 1; + } + if (!set && !rc) + rc = -EIO; + } + RETURN(rc); +} + +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + LASSERT(op_data); + LASSERT(fid); + + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + + if (!tgt->ltd_active || !tgt->ltd_exp) + RETURN(-ENODEV); + + /* + * New seq alloc and FLD setup should be atomic. Otherwise we may find + * on server that seq in new allocated fid is not yet known. + */ + mutex_lock(&tgt->ltd_fid_mutex); + rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL); + mutex_unlock(&tgt->ltd_fid_mutex); + if (rc > 0) { + LASSERT(fid_is_sane(fid)); + rc = 0; + } + + RETURN(rc); +} + +static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_desc *desc; + struct lnet_process_id lnet_id; + int i = 0; + int rc; + + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LMV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("Lmv descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + obd_str2uuid(&lmv->lmv_mdt_descs.ltd_lmv_desc.ld_uuid, + desc->ld_uuid.uuid); + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = + LMV_DESC_QOS_MAXAGE_DEFAULT; + lmv->max_def_easize = 0; + lmv->max_easize = 0; + + spin_lock_init(&lmv->lmv_lock); + + /* + * initialize rr_index to lower 32bit of netid, so that client + * can distribute subdirs evenly from the beginning. 
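+	 * As an illustration, two clients whose first non-loopback NIDs differ
+	 * in the low 32 bits will typically start round-robin allocation at
+	 * different MDT indexes instead of all starting at MDT0000.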
+ */ + while (LNetGetId(i++, &lnet_id) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) { + lmv->lmv_qos_rr_index = (u32)lnet_id.nid; + break; + } + } + + rc = lmv_tunables_init(obd); + if (rc) + CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n", + obd->obd_name, rc); + + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, + LUSTRE_CLI_FLD_HASH_DHT); + if (rc) + CERROR("Can't init FLD, err %d\n", rc); + + rc = lu_tgt_descs_init(&lmv->lmv_mdt_descs, true); + if (rc) + CWARN("%s: error initialize target table: rc = %d\n", + obd->obd_name, rc); + + RETURN(rc); +} + +static int lmv_cleanup(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + struct lu_tgt_desc *tmp; + + ENTRY; + + fld_client_fini(&lmv->lmv_fld); + lmv_foreach_tgt_safe(lmv, tgt, tmp) + lmv_del_target(lmv, tgt); + lu_tgt_descs_fini(&lmv->lmv_mdt_descs); + + RETURN(0); +} + +static int lmv_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct obd_uuid obd_uuid; + int gen; + __u32 index; + int rc; + ENTRY; + + switch (lcfg->lcfg_command) { + case LCFG_ADD_MDC: + /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) + GOTO(out, rc = -EINVAL); + rc = lmv_add_target(obd, &obd_uuid, index, gen); + GOTO(out, rc); + default: + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + } +out: + RETURN(rc); +} + +static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) +{ + int i; + + if (flags & OBD_STATFS_FOR_MDT0) + return 0; + + if (lmv->lmv_statfs_start || lmv->lmv_mdt_count == 1) + return lmv->lmv_statfs_start; + + /* choose initial MDT for this client */ + for (i = 0;; i++) { + struct lnet_process_id lnet_id; + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (lnet_id.nid != LNET_NID_LO_0) { + /* We dont need a full 64-bit modulus, just enough + * to distribute the requests across MDTs evenly. + */ + lmv->lmv_statfs_start = (u32)lnet_id.nid % + lmv->lmv_mdt_count; + break; + } + } + + return lmv->lmv_statfs_start; +} + +static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_statfs *temp; + struct lu_tgt_desc *tgt; + __u32 i; + __u32 idx; + int rc = 0; + + ENTRY; + + OBD_ALLOC(temp, sizeof(*temp)); + if (temp == NULL) + RETURN(-ENOMEM); + + /* distribute statfs among MDTs */ + idx = lmv_select_statfs_mdt(lmv, flags); + + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++, idx++) { + idx = idx % lmv->lmv_mdt_descs.ltd_tgts_size; + tgt = lmv_tgt(lmv, idx); + if (!tgt || !tgt->ltd_exp) + continue; + + rc = obd_statfs(env, tgt->ltd_exp, temp, max_age, flags); + if (rc) { + CERROR("%s: can't stat MDS #%d: rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, i, rc); + GOTO(out_free_temp, rc); + } + + if (temp->os_state & OS_STATE_SUM || + flags == OBD_STATFS_FOR_MDT0) { + /* reset to the last aggregated values + * and don't sum with non-aggrated data */ + /* If the statfs is from mount, it needs to retrieve + * necessary information from MDT0. i.e. 
mount does + * not need the merged osfs from all of MDT. Also + * clients can be mounted as long as MDT0 is in + * service */ + *osfs = *temp; + break; + } + + if (i == 0) { + *osfs = *temp; + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + osfs->os_granted += temp->os_granted; + } + } + + EXIT; +out_free_temp: + OBD_FREE(temp, sizeof(*temp)); + return rc; +} + +static int lmv_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct obd_device *obd = oinfo->oi_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = oinfo->oi_tgt; + struct obd_statfs *osfs = oinfo->oi_osfs; + + /* + * NB: don't deactivate TGT upon error, because we may not trigger async + * statfs any longer, then there is no chance to activate TGT. + */ + if (!rc) { + spin_lock(&lmv->lmv_lock); + tgt->ltd_statfs = *osfs; + tgt->ltd_statfs_age = ktime_get_seconds(); + spin_unlock(&lmv->lmv_lock); + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + } + + return rc; +} + +/* update tgt statfs async if it's ld_qos_maxage old */ +int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct obd_info oinfo = { + .oi_obd = obd, + .oi_tgt = tgt, + .oi_cb_up = lmv_statfs_update, + }; + int rc; + + if (ktime_get_seconds() - tgt->ltd_statfs_age < + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage) + return 0; + + rc = obd_statfs_async(tgt->ltd_exp, &oinfo, 0, NULL); + + return rc; +} + +static int lmv_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt = lmv_tgt(lmv, 0); + int rc; + + ENTRY; + + if (!tgt) + RETURN(-ENODEV); + + rc = md_get_root(tgt->ltd_exp, fileset, fid); + RETURN(rc); +} + +static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, req); + + RETURN(rc); +} + +static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, suppgid, req); + + RETURN(rc); +} + +static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = tgt->ltd_index; + RETURN(0); + } + + rc = md_getattr(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv 
= &obd->u.lmv;
+	struct lu_tgt_desc *tgt;
+
+	ENTRY;
+
+	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+	/*
+	 * With DNE every object can have two locks in different namespaces:
+	 * lookup lock in space of MDT storing direntry and update/open lock in
+	 * space of MDT storing inode.
+	 */
+	lmv_foreach_connected_tgt(lmv, tgt)
+		md_null_inode(tgt->ltd_exp, fid);
+
+	RETURN(0);
+}
+
+static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
+		     struct md_open_data *mod, struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	ENTRY;
+
+	tgt = lmv_fid2tgt(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
+	rc = md_close(tgt->ltd_exp, op_data, mod, request);
+	RETURN(rc);
+}
+
+static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt,
+					      unsigned short dir_depth)
+{
+	struct lu_tgt_desc *tgt, *cur = NULL;
+	__u64 total_avail = 0;
+	__u64 total_weight = 0;
+	__u64 cur_weight = 0;
+	int total_usable = 0;
+	__u64 rand;
+	int rc;
+
+	ENTRY;
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		RETURN(ERR_PTR(-EAGAIN));
+
+	down_write(&lmv->lmv_qos.lq_rw_sem);
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+
+	rc = ltd_qos_penalties_calc(&lmv->lmv_mdt_descs);
+	if (rc)
+		GOTO(unlock, tgt = ERR_PTR(rc));
+
+	lmv_foreach_tgt(lmv, tgt) {
+		if (!tgt->ltd_exp || !tgt->ltd_active) {
+			tgt->ltd_qos.ltq_usable = 0;
+			continue;
+		}
+
+		tgt->ltd_qos.ltq_usable = 1;
+		lu_tgt_qos_weight_calc(tgt);
+		if (tgt->ltd_index == *mdt)
+			cur = tgt;
+		total_avail += tgt->ltd_qos.ltq_avail;
+		total_weight += tgt->ltd_qos.ltq_weight;
+		total_usable++;
+	}
+
+	/* If the current MDT has above-average space, within range of the QOS
+	 * threshold, stay on the same MDT to avoid creating needless remote
+	 * MDT directories. The factor "16 / (dir_depth + 10)" makes staying
+	 * more unlikely for top level directories and more likely for lower
+	 * (deeper) levels.
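+	 *
+	 * For example (illustrative numbers): at dir_depth = 0 the current MDT
+	 * must hold at least 16/10 = 1.6x the average available space to be
+	 * kept, at dir_depth = 6 exactly the average is enough, and for deeper
+	 * directories even less than the average suffices.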
+ */ + rand = total_avail * 16 / (total_usable * (dir_depth + 10)); + if (cur && cur->ltd_qos.ltq_avail >= rand) { + tgt = cur; + GOTO(unlock, rc = 0); + } + + rand = lu_prandom_u64_max(total_weight); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_qos.ltq_usable) + continue; + + cur_weight += tgt->ltd_qos.ltq_weight; + if (cur_weight < rand) + continue; + + *mdt = tgt->ltd_index; + ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight); + GOTO(unlock, rc = 0); + } + + /* no proper target found */ + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); +unlock: + up_write(&lmv->lmv_qos.lq_rw_sem); + + return tgt; +} + +static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt) +{ + struct lu_tgt_desc *tgt; + int i; + int index; + + ENTRY; + + spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc); + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++) { + index = (i + lmv->lmv_qos_rr_index) % + lmv->lmv_mdt_descs.ltd_tgts_size; + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) + continue; + + *mdt = tgt->ltd_index; + lmv->lmv_qos_rr_index = (*mdt + 1) % + lmv->lmv_mdt_descs.ltd_tgts_size; + spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); + + RETURN(tgt); + } + spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); + + RETURN(ERR_PTR(-ENODEV)); +} + +static struct lmv_tgt_desc * +lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, + __u32 *mds, bool post_migrate) +{ + struct lmv_tgt_desc *tgt; + const struct lmv_oinfo *oinfo; + + if (!lmv_dir_striped(lsm) || !namelen) { + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + *mds = tgt->ltd_index; + return tgt; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { + if (cfs_fail_val >= lsm->lsm_md_stripe_count) + return ERR_PTR(-EBADF); + oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; + } else { + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, + post_migrate); + if (IS_ERR(oinfo)) + return ERR_CAST(oinfo); + } + + /* check stripe FID is sane */ + if (!fid_is_sane(&oinfo->lmo_fid)) + return ERR_PTR(-ENODEV); + *fid = oinfo->lmo_fid; + *mds = oinfo->lmo_mds; + + tgt = lmv_tgt(lmv, oinfo->lmo_mds); + + CDEBUG(D_INODE, "locate MDT %u parent "DFID"\n", *mds, PFID(fid)); + + return tgt ? tgt : ERR_PTR(-ENODEV); +} + +/** + * Locate MDT of op_data->op_fid1 + * + * For striped directory, it will locate the stripe by name hash, if hash_type + * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' + * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' + * indicates whether old or new layout is used to locate. + * + * For plain direcotry, it just locate the MDT of op_data->op_fid1. + * + * \param[in] lmv LMV device + * \param[in] op_data client MD stack parameters, name, namelen + * mds_num etc. + * + * retval pointer to the lmv_tgt_desc if succeed. + * ERR_PTR(errno) if failed. + */ +struct lmv_tgt_desc * +lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_oinfo *oinfo; + struct lmv_tgt_desc *tgt; + + /* During creating VOLATILE file, it should honor the mdt + * index if the file under striped dir is being restored, see + * ct_restore(). 
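+	 * In that case op_mds has already been chosen by the restore path, and
+	 * for a striped parent op_fid1 is refilled below with the stripe FID
+	 * that lives on that MDT.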
*/ + if (op_data->op_bias & MDS_CREATE_VOLATILE && + op_data->op_mds != LMV_OFFSET_DEFAULT) { + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + return ERR_PTR(-ENODEV); + + if (lmv_dir_striped(lsm)) { + int i; + + /* refill the right parent fid */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + oinfo = &lsm->lsm_md_oinfo[i]; + if (oinfo->lmo_mds == op_data->op_mds) { + op_data->op_fid1 = oinfo->lmo_fid; + break; + } + } + + if (i == lsm->lsm_md_stripe_count) + op_data->op_fid1 = lsm->lsm_md_oinfo[0].lmo_fid; + } + } else if (lmv_dir_bad_hash(lsm)) { + LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); + oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + tgt = lmv_tgt(lmv, oinfo->lmo_mds); + if (!tgt) + return ERR_PTR(-ENODEV); + } else { + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid1, &op_data->op_mds, + op_data->op_post_migrate); + } + + return tgt; +} + +/* Locate MDT of op_data->op_fid2 for link/rename */ +static struct lmv_tgt_desc * +lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lmv_tgt_desc *tgt; + int rc; + + LASSERT(op_data->op_name); + if (lmv_dir_migrating(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; + struct ptlrpc_request *request = NULL; + + /* + * avoid creating new file under old layout of migrating + * directory, check it here. + */ + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea2, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid2, &op_data->op_mds, false); + if (IS_ERR(tgt)) + RETURN(tgt); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + if (!rc) { + ptlrpc_req_finished(request); + RETURN(ERR_PTR(-EEXIST)); + } + + if (rc != -ENOENT) + RETURN(ERR_PTR(rc)); + } + + return lmv_locate_tgt_by_name(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, true); +} + +int lmv_migrate_existence_check(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lu_tgt_desc *tgt; + struct ptlrpc_request *request; + int rc; + + LASSERT(lmv_dir_migrating(op_data->op_mea1)); + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + if (!rc) { + ptlrpc_req_finished(request); + return -EEXIST; + } + + return rc; +} + +static inline bool lmv_op_user_qos_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + return (op_data->op_cli_flags & CLI_SET_MEA) && lum && + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC && + le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT; +} + +static inline bool lmv_op_default_qos_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_default_mea1; + + return (op_data->op_flags & MF_QOS_MKDIR) || + (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT); +} + +/* mkdir by QoS in three cases: + * 1. ROOT default LMV is space balanced. + * 2. 'lfs mkdir -i -1' + * 3. parent default LMV master_mdt_index is -1 + * + * NB, mkdir by QoS only if parent is not striped, this is to avoid remote + * directories under striped directory. 
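+ * For example, "lfs mkdir -i -1 <dir>" (case 2 above) ends up here via
+ * lmv_op_user_qos_mkdir(), which checks for
+ * lum_stripe_offset == LMV_OFFSET_DEFAULT.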
+ */ +static inline bool lmv_op_qos_mkdir(const struct md_op_data *op_data) +{ + if (op_data->op_code != LUSTRE_OPC_MKDIR) + return false; + + if (lmv_dir_striped(op_data->op_mea1)) + return false; + + if (lmv_op_user_qos_mkdir(op_data)) + return true; + + if (lmv_op_default_qos_mkdir(op_data)) + return true; + + return false; +} + +/* if parent default LMV is space balanced, and + * 1. max_inherit_rr is set + * 2. or parent is ROOT + * mkdir roundrobin. Or if parent doesn't have default LMV, while ROOT default + * LMV requests roundrobin mkdir, do the same. + * NB, this needs to check server is balanced, which is done by caller. + */ +static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_default_mea1; + + if (!lmv_op_default_qos_mkdir(op_data)) + return false; + + return (op_data->op_flags & MF_RR_MKDIR) || + (lsm && lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE) || + fid_is_root(&op_data->op_fid1); +} + +/* 'lfs mkdir -i ' */ +static inline bool lmv_op_user_specific_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + return op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_cli_flags & CLI_SET_MEA && lum && + (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC || + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) && + le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + +/* parent default LMV master_mdt_index is not -1. */ +static inline bool +lmv_op_default_specific_mkdir(const struct md_op_data *op_data) +{ + return op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_default_mea1 && + op_data->op_default_mea1->lsm_md_master_mdt_index != + LMV_OFFSET_DEFAULT; +} + +int lmv_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, uid_t uid, + gid_t gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + if (!lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count) + RETURN(-EIO); + + if (lmv_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_dir_migrating(op_data->op_mea1)) { + /* + * if parent is migrating, create() needs to lookup existing + * name in both old and new layout, check old layout on client. + */ + rc = lmv_migrate_existence_check(lmv, op_data); + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_post_migrate = true; + } + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (lmv_op_user_specific_mkdir(op_data)) { + struct lmv_user_md *lum = op_data->op_data; + + op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + } else if (lmv_op_default_specific_mkdir(op_data)) { + op_data->op_mds = + op_data->op_default_mea1->lsm_md_master_mdt_index; + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + } else if (lmv_op_qos_mkdir(op_data)) { + struct lmv_tgt_desc *tmp = tgt; + + tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds, + op_data->op_dir_depth); + if (tgt == ERR_PTR(-EAGAIN)) { + if (ltd_qos_is_balanced(&lmv->lmv_mdt_descs) && + !lmv_op_default_rr_mkdir(op_data) && + !lmv_op_user_qos_mkdir(op_data)) + /* if it's not necessary, don't create remote + * directory. 
+ */ + tgt = tmp; + else + tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds); + } + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* + * only update statfs after QoS mkdir, this means the cached + * statfs may be stale, and current mkdir may not follow QoS + * accurately, but it's not serious, and avoids periodic statfs + * when client doesn't mkdir by QoS. + */ + lmv_statfs_check_update(obd, tgt); + } + + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "CREATE name '%.*s' "DFID" on "DFID" -> mds #%x\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid2), PFID(&op_data->op_fid1), + op_data->op_mds); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, + cap_effective, rdev, request); + if (rc == 0) { + if (*request == NULL) + RETURN(rc); + CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2)); + } + RETURN(rc); +} + +static int +lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + CDEBUG(D_INODE, "ENQUEUE on "DFID"\n", PFID(&op_data->op_fid1)); + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "ENQUEUE on "DFID" -> mds #%u\n", + PFID(&op_data->op_fid1), tgt->ltd_index); + + rc = md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh, + extra_lock_flags); + + RETURN(rc); +} + +int +lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, + struct ptlrpc_request **preq) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + ENTRY; + +retry: + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), tgt->ltd_index); + + rc = md_getattr_name(tgt->ltd_exp, op_data, preq); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*preq); + *preq = NULL; + goto retry; + } + + if (rc) + RETURN(rc); + + body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (body->mbo_valid & OBD_MD_MDS) { + op_data->op_fid1 = body->mbo_fid1; + op_data->op_valid |= OBD_MD_FLCROSSREF; + op_data->op_namelen = 0; + op_data->op_name = NULL; + + ptlrpc_req_finished(*preq); + *preq = NULL; + + goto retry; + } + + RETURN(rc); +} + +#define md_op_data_fid(op_data, fl) \ + (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \ + fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \ + fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \ + fl == MF_MDC_CANCEL_FID4 ? 
&op_data->op_fid4 : \ + NULL) + +static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt, + struct md_op_data *op_data, __u32 op_tgt, + enum ldlm_mode mode, int bits, int flag) +{ + struct lu_fid *fid = md_op_data_fid(op_data, flag); + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + union ldlm_policy_data policy = { { 0 } }; + int rc = 0; + ENTRY; + + if (!fid_is_sane(fid)) + RETURN(0); + + if (tgt == NULL) { + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + if (tgt->ltd_index != op_tgt) { + CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); + policy.l_inodebits.bits = bits; + rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, + mode, LCF_ASYNC, NULL); + } else { + CDEBUG(D_INODE, + "EARLY_CANCEL skip operation target %d on "DFID"\n", + op_tgt, PFID(fid)); + op_data->op_flags |= flag; + rc = 0; + } + + RETURN(rc); +} + +/* + * llite passes fid of an target inode in op_data->op_fid1 and id of directory in + * op_data->op_fid2 + */ +static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + LASSERT(op_data->op_namelen != 0); + + CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n", + PFID(&op_data->op_fid2), (int)op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid1)); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + + tgt = lmv_locate_tgt2(lmv, op_data); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* + * Cancel UPDATE lock on child (fid1). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID2; + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + rc = md_link(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, + const char *name, size_t namelen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *parent_tgt; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *child_tgt; + struct lmv_tgt_desc *tgt; + struct lu_fid target_fid; + int rc; + + ENTRY; + + LASSERT(op_data->op_cli_flags & CLI_MIGRATE); + + CDEBUG(D_INODE, "MIGRATE "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)namelen, name); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + + parent_tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (lmv_dir_striped(lsm)) { + __u32 hash_type = lsm->lsm_md_hash_type; + __u32 stripe_count = lsm->lsm_md_stripe_count; + + /* + * old stripes are appended after new stripes for migrating + * directory. 
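+		 * As an illustration, when a 2-stripe directory is being
+		 * migrated to 4 stripes, lsm_md_stripe_count is 6 and
+		 * lsm_md_migrate_offset is 4: stripes [0-3] form the new
+		 * layout, while stripes [4-5] carry the old entries and are
+		 * hashed with lsm_md_migrate_hash.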
+ */ + if (lmv_dir_migrating(lsm)) { + hash_type = lsm->lsm_md_migrate_hash; + stripe_count -= lsm->lsm_md_migrate_offset; + } + + rc = lmv_name_to_stripe_index(hash_type, stripe_count, name, + namelen); + if (rc < 0) + RETURN(rc); + + if (lmv_dir_migrating(lsm)) + rc += lsm->lsm_md_migrate_offset; + + /* save it in fid4 temporarily for early cancel */ + op_data->op_fid4 = lsm->lsm_md_oinfo[rc].lmo_fid; + sp_tgt = lmv_tgt(lmv, lsm->lsm_md_oinfo[rc].lmo_mds); + if (!sp_tgt) + RETURN(-ENODEV); + + /* + * if parent is being migrated too, fill op_fid2 with target + * stripe fid, otherwise the target stripe is not created yet. + */ + if (lmv_dir_migrating(lsm)) { + hash_type = lsm->lsm_md_hash_type & + ~LMV_HASH_FLAG_MIGRATION; + stripe_count = lsm->lsm_md_migrate_offset; + + rc = lmv_name_to_stripe_index(hash_type, stripe_count, + name, namelen); + if (rc < 0) + RETURN(rc); + + op_data->op_fid2 = lsm->lsm_md_oinfo[rc].lmo_fid; + tp_tgt = lmv_tgt(lmv, lsm->lsm_md_oinfo[rc].lmo_mds); + if (!tp_tgt) + RETURN(-ENODEV); + } + } else { + sp_tgt = parent_tgt; + } + + child_tgt = lmv_fid2tgt(lmv, &op_data->op_fid3); + if (IS_ERR(child_tgt)) + RETURN(PTR_ERR(child_tgt)); + + /* for directory, migrate to MDT specified by lum_stripe_offset; + * otherwise migrate to the target stripe of parent, but parent + * directory may have finished migration (normally current file too), + * allocate FID on MDT lum_stripe_offset, and server will check + * whether file was migrated already. + */ + if (S_ISDIR(op_data->op_mode) || !tp_tgt) { + struct lmv_user_md *lum = op_data->op_data; + + op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); + } else { + op_data->op_mds = tp_tgt->ltd_index; + } + rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); + if (rc) + RETURN(rc); + + /* + * for directory, send migrate request to the MDT where the object will + * be migrated to, because we can't create a striped directory remotely. + * + * otherwise, send to the MDT where source is located because regular + * file may open lease. + * + * NB. if MDT doesn't support DIR_MIGRATE, send to source MDT too for + * backward compatibility. + */ + if (S_ISDIR(op_data->op_mode) && + (exp_connect_flags2(exp) & OBD_CONNECT2_DIR_MIGRATE)) { + tgt = lmv_fid2tgt(lmv, &target_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = child_tgt; + } + + /* cancel UPDATE lock of parent master object */ + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc) + RETURN(rc); + + /* cancel UPDATE lock of source parent */ + if (sp_tgt != parent_tgt) { + /* + * migrate RPC packs master object FID, because we can only pack + * two FIDs in reint RPC, but MDS needs to know both source + * parent and target parent, and it will obtain them from master + * FID and LMV, the other FID in RPC is kept for target. + * + * since this FID is not passed to MDC, cancel it anyway. 
+ */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, -1, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID4); + if (rc) + RETURN(rc); + + op_data->op_flags &= ~MF_MDC_CANCEL_FID4; + } + op_data->op_fid4 = target_fid; + + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc) + RETURN(rc); + + /* cancel LOOKUP lock of source if source is remote object */ + if (child_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + } + + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + + rc = md_rename(tgt->ltd_exp, op_data, name, namelen, NULL, 0, request); + + RETURN(rc); +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *src_tgt = NULL; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + ENTRY; + + LASSERT(oldlen != 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) { + rc = lmv_migrate(exp, op_data, old, oldlen, request); + RETURN(rc); + } + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + + op_data->op_name = new; + op_data->op_namelen = newlen; + + tp_tgt = lmv_locate_tgt2(lmv, op_data); + if (IS_ERR(tp_tgt)) + RETURN(PTR_ERR(tp_tgt)); + + /* Since the target child might be destroyed, and it might become + * orphan, and we can only check orphan on the local MDT right now, so + * we send rename request to the MDT where target child is located. 
If + * target child does not exist, then it will send the request to the + * target parent */ + if (fid_is_sane(&op_data->op_fid4)) { + tgt = lmv_fid2tgt(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = tp_tgt; + } + + op_data->op_flags |= MF_MDC_CANCEL_FID4; + + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } + } + + if (fid_is_sane(&op_data->op_fid3)) { + src_tgt = lmv_fid2tgt(lmv, &op_data->op_fid3); + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); + + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_ELC, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + + op_data->op_name = old; + op_data->op_namelen = oldlen; +retry: + sp_tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(sp_tgt)) + RETURN(PTR_ERR(sp_tgt)); + + /* cancel UPDATE locks of source parent */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid3)) { + /* cancel LOOKUP lock of source on source parent */ + if (src_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + } + +rename: + CDEBUG(D_INODE, "RENAME "DFID"/%.*s to "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)oldlen, old, + PFID(&op_data->op_fid2), (int)newlen, new); + + rc = md_rename(tgt->ltd_exp, op_data, old, oldlen, new, newlen, + request); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*request); + *request = NULL; + goto retry; + } + + if (rc && rc != -EXDEV) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. 
*/ + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + op_data->op_fid4 = body->mbo_fid1; + + ptlrpc_req_finished(*request); + *request = NULL; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_index, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } + } + + goto rename; +} + +static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + + ENTRY; + + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x/0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid, + op_data->op_xvalid); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, request); + + RETURN(rc); +} + +static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_fsync(tgt->ltd_exp, fid, request); + RETURN(rc); +} + +struct stripe_dirent { + struct page *sd_page; + struct lu_dirpage *sd_dp; + struct lu_dirent *sd_ent; + bool sd_eof; +}; + +struct lmv_dir_ctxt { + struct lmv_obd *ldc_lmv; + struct md_op_data *ldc_op_data; + struct md_callback *ldc_cb_op; + __u64 ldc_hash; + int ldc_count; + struct stripe_dirent ldc_stripes[0]; +}; + +static inline void stripe_dirent_unload(struct stripe_dirent *stripe) +{ + if (stripe->sd_page) { + kunmap(stripe->sd_page); + put_page(stripe->sd_page); + stripe->sd_page = NULL; + stripe->sd_ent = NULL; + } +} + +static inline void put_lmv_dir_ctxt(struct lmv_dir_ctxt *ctxt) +{ + int i; + + for (i = 0; i < ctxt->ldc_count; i++) + stripe_dirent_unload(&ctxt->ldc_stripes[i]); +} + +/* if @ent is dummy, or . .., get next */ +static struct lu_dirent *stripe_dirent_get(struct lmv_dir_ctxt *ctxt, + struct lu_dirent *ent, + int stripe_index) +{ + for (; ent; ent = lu_dirent_next(ent)) { + /* Skip dummy entry */ + if (le16_to_cpu(ent->lde_namelen) == 0) + continue; + + /* skip . and .. 
for other stripes */ + if (stripe_index && + (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 || + strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0)) + continue; + + if (le64_to_cpu(ent->lde_hash) >= ctxt->ldc_hash) + break; + } + + return ent; +} + +static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, + struct stripe_dirent *stripe, + int stripe_index) +{ + struct md_op_data *op_data = ctxt->ldc_op_data; + struct lmv_oinfo *oinfo; + struct lu_fid fid = op_data->op_fid1; + struct inode *inode = op_data->op_data; + struct lmv_tgt_desc *tgt; + struct lu_dirent *ent = stripe->sd_ent; + __u64 hash = ctxt->ldc_hash; + int rc = 0; + + ENTRY; + + LASSERT(stripe == &ctxt->ldc_stripes[stripe_index]); + LASSERT(!ent); + + do { + if (stripe->sd_page) { + __u64 end = le64_to_cpu(stripe->sd_dp->ldp_hash_end); + + /* @hash should be the last dirent hash */ + LASSERTF(hash <= end, + "ctxt@%p stripe@%p hash %llx end %llx\n", + ctxt, stripe, hash, end); + /* unload last page */ + stripe_dirent_unload(stripe); + /* eof */ + if (end == MDS_DIR_END_OFF) { + stripe->sd_eof = true; + break; + } + hash = end; + } + + oinfo = &op_data->op_mea1->lsm_md_oinfo[stripe_index]; + if (!oinfo->lmo_root) { + rc = -ENOENT; + break; + } + + tgt = lmv_tgt(ctxt->ldc_lmv, oinfo->lmo_mds); + if (!tgt) { + rc = -ENODEV; + break; + } + + /* op_data is shared by stripes, reset after use */ + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_fid2 = oinfo->lmo_fid; + op_data->op_data = oinfo->lmo_root; + + rc = md_read_page(tgt->ltd_exp, op_data, ctxt->ldc_cb_op, hash, + &stripe->sd_page); + + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + op_data->op_data = inode; + + if (rc) + break; + + stripe->sd_dp = page_address(stripe->sd_page); + ent = stripe_dirent_get(ctxt, lu_dirent_start(stripe->sd_dp), + stripe_index); + /* in case a page filled with ., .. and dummy, read next */ + } while (!ent); + + stripe->sd_ent = ent; + if (rc) { + LASSERT(!ent); + /* treat error as eof, so dir can be partially accessed */ + stripe->sd_eof = true; + LCONSOLE_WARN("dir "DFID" stripe %d readdir failed: %d, " + "directory is partially accessed!\n", + PFID(&ctxt->ldc_op_data->op_fid1), stripe_index, + rc); + } + + RETURN(ent); +} + +static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + rc = lmv_check_connect(obd); + if (rc != 0) + RETURN(rc); + + tgt = lmv_fid2tgt(lmv, &data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_file_resync(tgt->ltd_exp, data); + RETURN(rc); +} + +/** + * Get dirent with the closest hash for striped directory + * + * This function will search the dir entry, whose hash value is the + * closest(>=) to hash from all of sub-stripes, and it is only being called + * for striped directory. + * + * \param[in] ctxt dir read context + * + * \retval dirent get the entry successfully + * NULL does not get the entry, normally it means + * it reaches the end of the directory, while read + * stripe dirent error is ignored to allow partial + * access. 
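+ * As an illustrative example, if the heads of three stripes hash to 0x30,
+ * 0x10 and 0x20, the 0x10 entry is returned and only that stripe advances
+ * to its next dirent.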
+ */ +static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt) +{ + struct stripe_dirent *stripe; + struct lu_dirent *ent = NULL; + int i; + int min = -1; + + /* TODO: optimize with k-way merge sort */ + for (i = 0; i < ctxt->ldc_count; i++) { + stripe = &ctxt->ldc_stripes[i]; + if (stripe->sd_eof) + continue; + + if (!stripe->sd_ent) { + stripe_dirent_load(ctxt, stripe, i); + if (!stripe->sd_ent) { + LASSERT(stripe->sd_eof); + continue; + } + } + + if (min == -1 || + le64_to_cpu(ctxt->ldc_stripes[min].sd_ent->lde_hash) > + le64_to_cpu(stripe->sd_ent->lde_hash)) { + min = i; + if (le64_to_cpu(stripe->sd_ent->lde_hash) == + ctxt->ldc_hash) + break; + } + } + + if (min != -1) { + stripe = &ctxt->ldc_stripes[min]; + ent = stripe->sd_ent; + /* pop found dirent */ + stripe->sd_ent = stripe_dirent_get(ctxt, lu_dirent_next(ent), + min); + } + + return ent; +} + +/** + * Build dir entry page for striped directory + * + * This function gets one entry by @offset from a striped directory. It will + * read entries from all of stripes, and choose one closest to the required + * offset(&offset). A few notes + * 1. skip . and .. for non-zero stripes, because there can only have one . + * and .. in a directory. + * 2. op_data will be shared by all of stripes, instead of allocating new + * one, so need to restore before reusing. + * + * \param[in] exp obd export refer to LMV + * \param[in] op_data hold those MD parameters of read_entry + * \param[in] cb_op ldlm callback being used in enqueue in mdc_read_entry + * \param[in] offset starting hash offset + * \param[out] ppage the page holding the entry. Note: because the entry + * will be accessed in upper layer, so we need hold the + * page until the usages of entry is finished, see + * ll_dir_entry_next. + * + * retval =0 if get entry successfully + * <0 cannot get entry + */ +static int lmv_striped_read_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 offset, struct page **ppage) +{ + struct page *page = NULL; + struct lu_dirpage *dp; + void *start; + struct lu_dirent *ent; + struct lu_dirent *last_ent; + int stripe_count; + struct lmv_dir_ctxt *ctxt; + struct lu_dirent *next = NULL; + __u16 ent_size; + size_t left_bytes; + int rc = 0; + ENTRY; + + /* Allocate a page and read entries from all of stripes and fill + * the page by hash order */ + page = alloc_page(GFP_KERNEL); + if (!page) + RETURN(-ENOMEM); + + /* Initialize the entry page */ + dp = kmap(page); + memset(dp, 0, sizeof(*dp)); + dp->ldp_hash_start = cpu_to_le64(offset); + + start = dp + 1; + left_bytes = PAGE_SIZE - sizeof(*dp); + ent = start; + last_ent = ent; + + /* initalize dir read context */ + stripe_count = op_data->op_mea1->lsm_md_stripe_count; + OBD_ALLOC(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); + if (!ctxt) + GOTO(free_page, rc = -ENOMEM); + ctxt->ldc_lmv = &exp->exp_obd->u.lmv; + ctxt->ldc_op_data = op_data; + ctxt->ldc_cb_op = cb_op; + ctxt->ldc_hash = offset; + ctxt->ldc_count = stripe_count; + + while (1) { + next = lmv_dirent_next(ctxt); + + /* end of directory */ + if (!next) { + ctxt->ldc_hash = MDS_DIR_END_OFF; + break; + } + ctxt->ldc_hash = le64_to_cpu(next->lde_hash); + + ent_size = le16_to_cpu(next->lde_reclen); + + /* the last entry lde_reclen is 0, but it might not be the last + * one of this temporay dir page */ + if (!ent_size) + ent_size = lu_dirent_calc_size( + le16_to_cpu(next->lde_namelen), + le32_to_cpu(next->lde_attrs)); + /* page full */ + if (ent_size > left_bytes) + break; + + 
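+		/* Append the chosen dirent to the merged page; "." and ".."
+		 * are then remapped below to the master object FID and its
+		 * parent FID so that stripe FIDs are never exposed to
+		 * userspace.
+		 */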
memcpy(ent, next, ent_size); + + /* Replace . with master FID and Replace .. with the parent FID + * of master object */ + if (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 && + le16_to_cpu(ent->lde_namelen) == 1) + fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid1); + else if (strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0 && + le16_to_cpu(ent->lde_namelen) == 2) + fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3); + + CDEBUG(D_INODE, "entry %.*s hash %#llx\n", + le16_to_cpu(ent->lde_namelen), ent->lde_name, + le64_to_cpu(ent->lde_hash)); + + left_bytes -= ent_size; + ent->lde_reclen = cpu_to_le16(ent_size); + last_ent = ent; + ent = (void *)ent + ent_size; + }; + + last_ent->lde_reclen = 0; + + if (ent == start) + dp->ldp_flags |= LDF_EMPTY; + else if (ctxt->ldc_hash == le64_to_cpu(last_ent->lde_hash)) + dp->ldp_flags |= LDF_COLLIDE; + dp->ldp_flags = cpu_to_le32(dp->ldp_flags); + dp->ldp_hash_end = cpu_to_le64(ctxt->ldc_hash); + + put_lmv_dir_ctxt(ctxt); + OBD_FREE(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); + + *ppage = page; + + RETURN(0); + +free_page: + kunmap(page); + __free_page(page); + + return rc; +} + +int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, + struct md_callback *cb_op, __u64 offset, + struct page **ppage) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + if (unlikely(lmv_dir_striped(op_data->op_mea1))) { + rc = lmv_striped_read_page(exp, op_data, cb_op, offset, ppage); + RETURN(rc); + } + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_read_page(tgt->ltd_exp, op_data, cb_op, offset, ppage); + + RETURN(rc); +} + +/** + * Unlink a file/directory + * + * Unlink a file or directory under the parent dir. The unlink request + * usually will be sent to the MDT where the child is located, but if + * the client does not have the child FID then request will be sent to the + * MDT where the parent is located. + * + * If the parent is a striped directory then it also needs to locate which + * stripe the name of the child is located, and replace the parent FID + * (@op->op_fid1) with the stripe FID. Note: if the stripe is unknown, + * it will walk through all of sub-stripes until the child is being + * unlinked finally. + * + * \param[in] exp export refer to LMV + * \param[in] op_data different parameters transferred beween client + * MD stacks, name, namelen, FIDs etc. + * op_fid1 is the parent FID, op_fid2 is the child + * FID. + * \param[out] request point to the request of unlink. + * + * retval 0 if succeed + * negative errno if failed. 
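+ *
+ * For example, if the name entry is on one MDT but the child is a remote
+ * object on another, the first unlink RPC fails with -EREMOTE and returns
+ * the FID of the remote object in the reply, and the request is then
+ * resent to the MDT that actually holds the object.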
+ */ +static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lmv_tgt_desc *parent_tgt; + struct mdt_body *body; + int rc; + + ENTRY; + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + +retry: + parent_tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (likely(!fid_is_zero(&op_data->op_fid2))) { + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = parent_tgt; + } + + /* + * If child's fid is given, cancel unused locks for it if it is from + * another export than parent. + * + * LOOKUP lock for child (fid3) should also be cancelled on parent + * tgt_tgt in mdc_unlink(). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + if (parent_tgt != tgt) + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + tgt->ltd_index); + + rc = md_unlink(tgt->ltd_exp, op_data, request); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*request); + *request = NULL; + goto retry; + } + + if (rc != -EREMOTE) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + /* This is a remote object, try remote MDT. */ + op_data->op_fid2 = body->mbo_fid1; + ptlrpc_req_finished(*request); + *request = NULL; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + goto retry; +} + +static int lmv_precleanup(struct obd_device *obd) +{ + ENTRY; + libcfs_kkuc_group_rem(&obd->obd_uuid, 0, KUC_GRP_HSM); + fld_client_debugfs_fini(&obd->u.lmv.lmv_fld); + lprocfs_obd_cleanup(obd); + lprocfs_free_md_stats(obd); + RETURN(0); +} + +/** + * Get by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. 
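+ * For example, the 'remote_flag' key is tried on each connected + * target until one answers, while EA-size and connect-data keys are + * forwarded to the first MDT.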
+ * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to get value for + * \param[in] vallen size of \a val + * \param[out] val pointer to storage location for value + * \param[in] lsm optional striping metadata of object + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + struct obd_device *obd; + struct lmv_obd *lmv; + struct lu_tgt_desc *tgt; + int rc = 0; + + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + lmv = &obd->u.lmv; + if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { + LASSERT(*vallen == sizeof(__u32)); + lmv_foreach_connected_tgt(lmv, tgt) { + if (!obd_get_info(env, tgt->ltd_exp, keylen, key, + vallen, val)) + RETURN(0); + } + RETURN(-EINVAL); + } else if (KEY_IS(KEY_MAX_EASIZE) || + KEY_IS(KEY_DEFAULT_EASIZE) || + KEY_IS(KEY_CONN_DATA)) { + /* + * Forwarding this request to first MDS, it should know LOV + * desc. + */ + tgt = lmv_tgt(lmv, 0); + if (!tgt) + RETURN(-ENODEV); + + rc = obd_get_info(env, tgt->ltd_exp, keylen, key, vallen, val); + if (!rc && KEY_IS(KEY_CONN_DATA)) + exp->exp_connect_data = *(struct obd_connect_data *)val; + RETURN(rc); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lmv->lmv_mdt_descs.ltd_tgts_size; + RETURN(0); + } + + CDEBUG(D_IOCTL, "Invalid key\n"); + RETURN(-EINVAL); +} + +static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, + int *__rcs, struct ptlrpc_request_set *_set) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct ptlrpc_request_set *set = _set; + struct lmv_obd *lmv = &obddev->u.lmv; + int tgt_count = lmv->lmv_mdt_count; + struct lu_tgt_desc *tgt; + struct fid_array *fat, **fas = NULL; + int i, rc, **rcs = NULL; + + if (!set) { + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + /* split FIDs by targets */ + OBD_ALLOC(fas, sizeof(fas) * tgt_count); + if (fas == NULL) + GOTO(out, rc = -ENOMEM); + OBD_ALLOC(rcs, sizeof(int *) * tgt_count); + if (rcs == NULL) + GOTO(out_fas, rc = -ENOMEM); + + for (i = 0; i < fa->fa_nr; i++) { + unsigned int idx; + + rc = lmv_fld_lookup(lmv, &fa->fa_fids[i], &idx); + if (rc) { + CDEBUG(D_OTHER, "can't lookup "DFID": rc = %d\n", + PFID(&fa->fa_fids[i]), rc); + continue; + } + LASSERT(idx < tgt_count); + if (!fas[idx]) + OBD_ALLOC(fas[idx], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (!fas[idx]) + GOTO(out, rc = -ENOMEM); + if (!rcs[idx]) + OBD_ALLOC(rcs[idx], sizeof(int) * fa->fa_nr); + if (!rcs[idx]) + GOTO(out, rc = -ENOMEM); + + fat = fas[idx]; + fat->fa_fids[fat->fa_nr++] = fa->fa_fids[i]; + } + + lmv_foreach_connected_tgt(lmv, tgt) { + fat = fas[tgt->ltd_index]; + if (!fat || fat->fa_nr == 0) + continue; + rc = md_rmfid(tgt->ltd_exp, fat, rcs[tgt->ltd_index], set); + } + + rc = ptlrpc_set_wait(NULL, set); + if (rc == 0) { + int j = 0; + for (i = 0; i < tgt_count; i++) { + fat = fas[i]; + if (!fat || fat->fa_nr == 0) + continue; + /* copy FIDs back */ + memcpy(fa->fa_fids + j, fat->fa_fids, + fat->fa_nr * sizeof(struct lu_fid)); + /* copy rcs back */ + memcpy(__rcs + j, rcs[i], fat->fa_nr * sizeof(**rcs)); + j += fat->fa_nr; + } + } + if (set != _set) + ptlrpc_set_destroy(set); + +out: + for (i = 0; i < tgt_count; 
i++) { + if (fas && fas[i]) + OBD_FREE(fas[i], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (rcs && rcs[i]) + OBD_FREE(rcs[i], sizeof(int) * fa->fa_nr); + } + if (rcs) + OBD_FREE(rcs, sizeof(int *) * tgt_count); +out_fas: + if (fas) + OBD_FREE(fas, sizeof(fas) * tgt_count); + + RETURN(rc); +} + +/** + * Asynchronously set by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. + * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to store value for + * \param[in] vallen size of value to store + * \param[in] val pointer to data to be stored + * \param[in] set optional list of related ptlrpc requests + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct lmv_tgt_desc *tgt = NULL; + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + lmv = &obd->u.lmv; + + if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX) || + KEY_IS(KEY_DEFAULT_EASIZE)) { + int err = 0; + + lmv_foreach_connected_tgt(lmv, tgt) { + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + if (err && rc == 0) + rc = err; + } + + RETURN(rc); + } + + RETURN(-EINVAL); +} + +static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, + const struct lmv_mds_md_v1 *lmm1) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + int stripe_count; + int cplen; + int i; + int rc = 0; + ENTRY; + + lsm->lsm_md_magic = le32_to_cpu(lmm1->lmv_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmm1->lmv_master_mdt_index); + if (OBD_FAIL_CHECK(OBD_FAIL_UNKNOWN_LMV_STRIPE)) + lsm->lsm_md_hash_type = LMV_HASH_TYPE_UNKNOWN; + else + lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); + lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); + lsm->lsm_md_migrate_offset = le32_to_cpu(lmm1->lmv_migrate_offset); + lsm->lsm_md_migrate_hash = le32_to_cpu(lmm1->lmv_migrate_hash); + cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, + sizeof(lsm->lsm_md_pool_name)); + + if (cplen >= sizeof(lsm->lsm_md_pool_name)) + RETURN(-E2BIG); + + CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %#x " + "layout_version %d\n", lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, + lsm->lsm_md_layout_version); + + stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + for (i = 0; i < stripe_count; i++) { + fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, + &lmm1->lmv_stripe_fids[i]); + /* + * set default value -1, so lmv_locate_tgt() knows this stripe + * target is not initialized. 
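+		 * An FLD lookup that fails with -ENOENT is tolerated below so + * the remaining stripes can still be unpacked.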
+ */ + lsm->lsm_md_oinfo[i].lmo_mds = LMV_OFFSET_DEFAULT; + if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) + continue; + + rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, + &lsm->lsm_md_oinfo[i].lmo_mds); + if (rc == -ENOENT) + continue; + + if (rc) + RETURN(rc); + + CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, + PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); + } + + RETURN(rc); +} + +static inline int lmv_unpack_user_md(struct obd_export *exp, + struct lmv_stripe_md *lsm, + const struct lmv_user_md *lmu) +{ + lsm->lsm_md_magic = le32_to_cpu(lmu->lum_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmu->lum_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmu->lum_stripe_offset); + lsm->lsm_md_hash_type = le32_to_cpu(lmu->lum_hash_type); + lsm->lsm_md_max_inherit = lmu->lum_max_inherit; + lsm->lsm_md_max_inherit_rr = lmu->lum_max_inherit_rr; + lsm->lsm_md_pool_name[LOV_MAXPOOLNAME] = 0; + + return 0; +} + +static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, + const union lmv_mds_md *lmm, size_t lmm_size) +{ + struct lmv_stripe_md *lsm; + int lsm_size; + int rc; + bool allocated = false; + ENTRY; + + LASSERT(lsmp != NULL); + + lsm = *lsmp; + /* Free memmd */ + if (lsm != NULL && lmm == NULL) { + int i; + + if (lmv_dir_striped(lsm)) { + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + if (lsm->lsm_md_oinfo[i].lmo_root) + iput(lsm->lsm_md_oinfo[i].lmo_root); + } + lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); + } else { + lsm_size = lmv_stripe_md_size(0); + } + OBD_FREE(lsm, lsm_size); + *lsmp = NULL; + RETURN(0); + } + + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_STRIPE) + RETURN(-EPERM); + + /* Unpack memmd */ + if (le32_to_cpu(lmm->lmv_magic) != LMV_MAGIC_V1 && + le32_to_cpu(lmm->lmv_magic) != LMV_USER_MAGIC) { + CERROR("%s: invalid lmv magic %x: rc = %d\n", + exp->exp_obd->obd_name, le32_to_cpu(lmm->lmv_magic), + -EIO); + RETURN(-EIO); + } + + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_V1) + lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); + else + /** + * Unpack default dirstripe(lmv_user_md) to lmv_stripe_md, + * stripecount should be 0 then. 
+ */ + lsm_size = lmv_stripe_md_size(0); + + if (lsm == NULL) { + OBD_ALLOC(lsm, lsm_size); + if (lsm == NULL) + RETURN(-ENOMEM); + allocated = true; + *lsmp = lsm; + } + + switch (le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + rc = lmv_unpack_md_v1(exp, lsm, &lmm->lmv_md_v1); + break; + case LMV_USER_MAGIC: + rc = lmv_unpack_user_md(exp, lsm, &lmm->lmv_user_md); + break; + default: + CERROR("%s: unrecognized magic %x\n", exp->exp_obd->obd_name, + le32_to_cpu(lmm->lmv_magic)); + rc = -EINVAL; + break; + } + + if (rc != 0 && allocated) { + OBD_FREE(lsm, lsm_size); + *lsmp = NULL; + lsm_size = rc; + } + RETURN(lsm_size); +} + +void lmv_free_memmd(struct lmv_stripe_md *lsm) +{ + lmv_unpackmd(NULL, &lsm, NULL, 0); +} +EXPORT_SYMBOL(lmv_free_memmd); + +static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, + enum ldlm_mode mode, enum ldlm_cancel_flags flags, + void *opaque) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lu_tgt_desc *tgt; + int err; + int rc = 0; + + ENTRY; + + LASSERT(fid != NULL); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_active) + continue; + + err = md_cancel_unused(tgt->ltd_exp, fid, policy, mode, flags, + opaque); + if (!rc) + rc = err; + } + RETURN(rc); +} + +static int lmv_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + int rc; + + ENTRY; + + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-EINVAL); + rc = md_set_lock_data(tgt->ltd_exp, lockh, data, bits); + RETURN(rc); +} + +enum ldlm_mode lmv_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + enum ldlm_mode rc; + struct lu_tgt_desc *tgt; + int i; + int index; + + ENTRY; + + CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. Try the MDT that the FID maps to first, + * since this can be easily found, and only try others if that fails. 
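+ * The loop below starts at the MDT the FID maps to and wraps around + * every configured target until a matching lock is found.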
+ */ + for (i = 0, index = lmv_fid2tgt_index(lmv, fid); + i < lmv->lmv_mdt_descs.ltd_tgts_size; + i++, index = (index + 1) % lmv->lmv_mdt_descs.ltd_tgts_size) { + if (index < 0) { + CDEBUG(D_HA, "%s: "DFID" is inaccessible: rc = %d\n", + obd->obd_name, PFID(fid), index); + index = 0; + } + + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) + continue; + + rc = md_lock_match(tgt->ltd_exp, flags, fid, type, policy, mode, + lockh); + if (rc) + RETURN(rc); + } + + RETURN(0); +} + +int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + if (!tgt || !tgt->ltd_exp) + return -EINVAL; + + return md_get_lustre_md(tgt->ltd_exp, req, dt_exp, md_exp, md); +} + +int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + + ENTRY; + + if (md->default_lmv) { + lmv_free_memmd(md->default_lmv); + md->default_lmv = NULL; + } + if (md->lmv != NULL) { + lmv_free_memmd(md->lmv); + md->lmv = NULL; + } + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + RETURN(md_free_lustre_md(tgt->ltd_exp, md)); +} + +int lmv_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_set_open_replay_data(tgt->ltd_exp, och, it)); +} + +int lmv_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_clear_open_replay_data(tgt->ltd_exp, och)); +} + +int lmv_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *ptgt; + struct lmv_tgt_desc *ctgt; + int rc; + + ENTRY; + + if (!fid_is_sane(&op_data->op_fid2)) + RETURN(-EINVAL); + + ptgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(ptgt)) + RETURN(PTR_ERR(ptgt)); + + ctgt = lmv_fid2tgt(lmv, &op_data->op_fid2); + if (IS_ERR(ctgt)) + RETURN(PTR_ERR(ctgt)); + + /* remote object needs two RPCs to lookup and getattr, considering the + * complexity, don't support statahead for now. 
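+	 * -EREMOTE is returned below when the parent and the child are + * served by different MDTs.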
+ */ + if (ptgt != ctgt) + RETURN(-EREMOTE); + + rc = md_intent_getattr_async(ptgt->ltd_exp, minfo); + + RETURN(rc); +} + +int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); + RETURN(rc); +} + +int lmv_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid) +{ + const struct lmv_oinfo *oinfo; + + LASSERT(lmv_dir_striped(lsm)); + + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); + if (IS_ERR(oinfo)) + return PTR_ERR(oinfo); + + *fid = oinfo->lmo_fid; + + RETURN(0); +} + +/** + * For lmv, only need to send request to master MDT, and the master MDT will + * process with other slave MDTs. The only exception is Q_GETOQUOTA for which + * we directly fetch data from the slave MDTs. + */ +int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + __u64 curspace, curinodes; + int rc = 0; + + ENTRY; + + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) { + CERROR("master lmv inactive\n"); + RETURN(-EIO); + } + + if (oqctl->qc_cmd != Q_GETOQUOTA) { + rc = obd_quotactl(tgt->ltd_exp, oqctl); + RETURN(rc); + } + + curspace = curinodes = 0; + lmv_foreach_connected_tgt(lmv, tgt) { + int err; + + if (!tgt->ltd_active) + continue; + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + CERROR("getquota on mdt %d failed. %d\n", + tgt->ltd_index, err); + if (!rc) + rc = err; + } else { + curspace += oqctl->qc_dqblk.dqb_curspace; + curinodes += oqctl->qc_dqblk.dqb_curinodes; + } + } + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_curinodes = curinodes; + + RETURN(rc); +} + +static int lmv_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb_blocking) +{ + int rc; + int i; + + if (!lmv_dir_striped(lsm)) + return 0; + + rc = lmv_revalidate_slaves(exp, lsm, cb_blocking, 0); + if (rc < 0) + return rc; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; + + if (!inode) + continue; + + CDEBUG(D_INFO, + "" DFID " size %llu, blocks %llu nlink %u, atime %lld ctime %lld, mtime %lld.\n", + PFID(&lsm->lsm_md_oinfo[i].lmo_fid), + i_size_read(inode), (unsigned long long)inode->i_blocks, + inode->i_nlink, (s64)inode->i_atime.tv_sec, + (s64)inode->i_ctime.tv_sec, (s64)inode->i_mtime.tv_sec); + + /* for slave stripe, it needs to subtract nlink for . and .. 
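+		 * since every stripe carries its own '.' and '..' entries, + * which must not be counted again in the merged link count.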
*/ + if (i != 0) + attr->cat_nlink += inode->i_nlink - 2; + else + attr->cat_nlink = inode->i_nlink; + + attr->cat_size += i_size_read(inode); + attr->cat_blocks += inode->i_blocks; + + if (attr->cat_atime < inode->i_atime.tv_sec) + attr->cat_atime = inode->i_atime.tv_sec; + + if (attr->cat_ctime < inode->i_ctime.tv_sec) + attr->cat_ctime = inode->i_ctime.tv_sec; + + if (attr->cat_mtime < inode->i_mtime.tv_sec) + attr->cat_mtime = inode->i_mtime.tv_sec; + } + return 0; +} + +struct obd_ops lmv_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lmv_setup, + .o_cleanup = lmv_cleanup, + .o_precleanup = lmv_precleanup, + .o_process_config = lmv_process_config, + .o_connect = lmv_connect, + .o_disconnect = lmv_disconnect, + .o_statfs = lmv_statfs, + .o_get_info = lmv_get_info, + .o_set_info_async = lmv_set_info_async, + .o_notify = lmv_notify, + .o_get_uuid = lmv_get_uuid, + .o_iocontrol = lmv_iocontrol, + .o_quotactl = lmv_quotactl +}; + +struct md_ops lmv_md_ops = { + .m_get_root = lmv_get_root, + .m_null_inode = lmv_null_inode, + .m_close = lmv_close, + .m_create = lmv_create, + .m_enqueue = lmv_enqueue, + .m_getattr = lmv_getattr, + .m_getxattr = lmv_getxattr, + .m_getattr_name = lmv_getattr_name, + .m_intent_lock = lmv_intent_lock, + .m_link = lmv_link, + .m_rename = lmv_rename, + .m_setattr = lmv_setattr, + .m_setxattr = lmv_setxattr, + .m_fsync = lmv_fsync, + .m_file_resync = lmv_file_resync, + .m_read_page = lmv_read_page, + .m_unlink = lmv_unlink, + .m_init_ea_size = lmv_init_ea_size, + .m_cancel_unused = lmv_cancel_unused, + .m_set_lock_data = lmv_set_lock_data, + .m_lock_match = lmv_lock_match, + .m_get_lustre_md = lmv_get_lustre_md, + .m_free_lustre_md = lmv_free_lustre_md, + .m_merge_attr = lmv_merge_attr, + .m_set_open_replay_data = lmv_set_open_replay_data, + .m_clear_open_replay_data = lmv_clear_open_replay_data, + .m_intent_getattr_async = lmv_intent_getattr_async, + .m_revalidate_lock = lmv_revalidate_lock, + .m_get_fid_from_lsm = lmv_get_fid_from_lsm, + .m_unpackmd = lmv_unpackmd, + .m_rmfid = lmv_rmfid, +}; + +static int __init lmv_init(void) +{ + return class_register_type(&lmv_obd_ops, &lmv_md_ops, true, NULL, + LUSTRE_LMV_NAME, NULL); +} + +static void __exit lmv_exit(void) +{ + class_unregister_type(LUSTRE_LMV_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Metadata Volume"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lmv_init); +module_exit(lmv_exit); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c new file mode 100644 index 0000000000000..aed88d0f74157 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -0,0 +1,320 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include "lmv_internal.h" + +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", dev->u.lmv.lmv_mdt_count); +} +LUSTRE_RO_ATTR(numobd); + +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count); +} +LUSTRE_RO_ATTR(activeobd); + +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%s\n", + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_uuid.uuid); +} +LUSTRE_RO_ATTR(desc_uuid); + +static ssize_t qos_maxage_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage); +} + +static ssize_t qos_maxage_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = val; + + return count; +} +LUSTRE_RW_ATTR(qos_maxage); + +static ssize_t qos_prio_free_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u%%\n", + (dev->u.lmv.lmv_qos.lq_prio_free * 100 + 255) >> 8); +} + +static ssize_t qos_prio_free_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_obd *lmv = &dev->u.lmv; + char buf[6], *tmp; + unsigned int val; + int rc; + + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + if (val > 100) + return -EINVAL; + + lmv->lmv_qos.lq_prio_free = (val << 8) / 100; + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + set_bit(LQ_RESET, &lmv->lmv_qos.lq_flags); + + return count; +} +LUSTRE_RW_ATTR(qos_prio_free); + +static ssize_t qos_threshold_rr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u%%\n", + (dev->u.lmv.lmv_qos.lq_threshold_rr * 100 + 255) >> 8); +} + +static ssize_t qos_threshold_rr_store(struct 
kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_obd *lmv = &dev->u.lmv; + char buf[6], *tmp; + unsigned int val; + int rc; + + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + if (val > 100) + return -EINVAL; + + lmv->lmv_qos.lq_threshold_rr = (val << 8) / 100; + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + + return count; +} +LUSTRE_RW_ATTR(qos_threshold_rr); + +#ifdef CONFIG_PROC_FS +static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + struct lu_tgt_desc *tgt; + + while (*pos < lmv->lmv_mdt_descs.ltd_tgts_size) { + tgt = lmv_tgt(lmv, (__u32)*pos); + if (tgt) + return tgt; + + ++*pos; + } + + return NULL; +} + +static void lmv_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + struct lu_tgt_desc *tgt; + + ++*pos; + while (*pos < lmv->lmv_mdt_descs.ltd_tgts_size) { + tgt = lmv_tgt(lmv, (__u32)*pos); + if (tgt) + return tgt; + + ++*pos; + } + + return NULL; +} + +static int lmv_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lmv_tgt_desc *tgt = v; + + if (!tgt) + return 0; + + seq_printf(p, "%u: %s %sACTIVE\n", + tgt->ltd_index, tgt->ltd_uuid.uuid, + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static const struct seq_operations lmv_tgt_sops = { + .start = lmv_tgt_seq_start, + .stop = lmv_tgt_seq_stop, + .next = lmv_tgt_seq_next, + .show = lmv_tgt_seq_show, +}; + +static int lmv_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lmv_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +static const struct proc_ops lmv_proc_target_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lmv_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static struct attribute *lmv_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_numobd.attr, + &lustre_attr_qos_maxage.attr, + &lustre_attr_qos_prio_free.attr, + &lustre_attr_qos_threshold_rr.attr, + NULL, +}; + +int lmv_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = lmv_attrs; + rc = lprocfs_obd_setup(obd, true); + if (rc) + goto out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } + + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) { + lprocfs_free_md_stats(obd); + lprocfs_obd_cleanup(obd); + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); + rc = 0; + } +#endif /* CONFIG_PROC_FS */ +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/Makefile b/drivers/staging/lustrefsx/lustre/lov/Makefile new file mode 100644 index 0000000000000..dae11b1647cbe --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lov.o + +lov-y := lov_dev.o 
lov_ea.o lov_io.o lov_lock.o lov_merge.o lov_obd.o +lov-y += lov_object.o lov_offset.o lov_pack.o lov_page.o lov_pool.o +lov-y += lov_request.o lovsub_dev.o lovsub_object.o +lov-y += lproc_lov.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h new file mode 100644 index 0000000000000..62ee46daed68f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h @@ -0,0 +1,813 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef LOV_CL_INTERNAL_H +#define LOV_CL_INTERNAL_H + +#include +#include +#include +#include "lov_internal.h" + +/** \defgroup lov lov + * Logical object volume layer. This layer implements data striping (raid0). + * + * At the lov layer top-entity (object, page, lock, io) is connected to one or + * more sub-entities: top-object, representing a file is connected to a set of + * sub-objects, each representing a stripe, file-level top-lock is connected + * to a set of per-stripe sub-locks, top-page is connected to a (single) + * sub-page, and a top-level IO is connected to a set of (potentially + * concurrent) sub-IO's. + * + * Sub-object, sub-page, and sub-io have well-defined top-object and top-page + * respectively, while a single sub-lock can be part of multiple top-locks. + * + * Reference counting models are different for different types of entities: + * + * - top-object keeps a reference to its sub-objects, and destroys them + * when it is destroyed. + * + * - top-page keeps a reference to its sub-page, and destroys it when it + * is destroyed. + * + * - IO's are not reference counted. + * + * To implement a connection between top and sub entities, lov layer is split + * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both + * implementing full set of cl-interfaces. For example, top-object has vvp and + * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is + * used to track child-parent relationship. + * + * @{ + */ + +struct lovsub_device; +struct lovsub_object; + +enum lov_device_flags { + LOV_DEV_INITIALIZED = 1 << 0 +}; + +/* + * Upper half. 
+ */ + +/* Data-on-MDT array item in lov_device::ld_md_tgts[] */ +struct lovdom_device { + struct cl_device *ldm_mdc; + int ldm_idx; +}; + +struct lov_device { + /* + * XXX Locking of lov-private data is missing. + */ + struct cl_device ld_cl; + struct lov_obd *ld_lov; + /** size of lov_device::ld_target[] array */ + __u32 ld_target_nr; + struct lovsub_device **ld_target; + __u32 ld_flags; + + /* Data-on-MDT devices */ + __u32 ld_md_tgts_nr; + struct lovdom_device *ld_md_tgts; + struct obd_device *ld_lmv; + /* LU site for subdevices */ + struct lu_site ld_site; +}; + +/** + * Layout type. + */ +enum lov_layout_type { + LLT_EMPTY, /** empty file without body (mknod + truncate) */ + LLT_RELEASED, /** file with no objects (data in HSM) */ + LLT_COMP, /** support composite layout */ + LLT_NR +}; + +static inline char *llt2str(enum lov_layout_type llt) +{ + switch (llt) { + case LLT_EMPTY: + return "EMPTY"; + case LLT_RELEASED: + return "RELEASED"; + case LLT_COMP: + return "COMPOSITE"; + case LLT_NR: + LBUG(); + } + LBUG(); + return ""; +} + +/** + * Return lov_layout_entry_type associated with a given composite layout + * entry. + */ +static inline __u32 lov_entry_type(struct lov_stripe_md_entry *lsme) +{ + if ((lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_RAID0) || + (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT)) + return lov_pattern(lsme->lsme_pattern); + return 0; +} + +struct lov_layout_entry; +struct lov_object; +struct lov_lock_sub; + +struct lov_comp_layout_entry_ops { + int (*lco_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle); + void (*lco_fini)(const struct lu_env *env, + struct lov_layout_entry *lle); + int (*lco_getattr)(const struct lu_env *env, struct lov_object *obj, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **attr); +}; + +struct lov_layout_raid0 { + unsigned lo_nr; + /** + * record the stripe no before the truncate size, used for setting OST + * object size for truncate. LU-14128. + */ + int lo_trunc_stripeno; + /** + * When this is true, lov_object::lo_attr contains + * valid up to date attributes for a top-level + * object. This field is reset to 0 when attributes of + * any sub-object change. + */ + bool lo_attr_valid; + /** + * Array of sub-objects. Allocated when top-object is + * created (lov_init_raid0()). + * + * Top-object is a strict master of its sub-objects: + * it is created before them, and outlives its + * children (this later is necessary so that basic + * functions like cl_object_top() always + * work). Top-object keeps a reference on every + * sub-object. + * + * When top-object is destroyed (lov_delete_raid0()) + * it releases its reference to a sub-object and waits + * until the latter is finally destroyed. + */ + struct lovsub_object **lo_sub; + /** + * protect lo_sub + */ + spinlock_t lo_sub_lock; + /** + * Cached object attribute, built from sub-object + * attributes. + */ + struct cl_attr lo_attr; +}; + +struct lov_layout_dom { + /* keep this always at first place so DOM layout entry + * can be addressed also as RAID0 after initialization. 
+ */ + struct lov_layout_raid0 lo_dom_r0; + struct lovsub_object *lo_dom; + struct lov_oinfo *lo_loi; +}; + +struct lov_layout_entry { + __u32 lle_type; + unsigned int lle_valid:1; + struct lu_extent *lle_extent; + struct lov_stripe_md_entry *lle_lsme; + struct lov_comp_layout_entry_ops *lle_comp_ops; + union { + struct lov_layout_raid0 lle_raid0; + struct lov_layout_dom lle_dom; + }; +}; + +struct lov_mirror_entry { + unsigned short lre_mirror_id; + unsigned short lre_preferred:1, + lre_stale:1, /* set if any components is stale */ + lre_valid:1; /* set if at least one of components + * in this mirror is valid */ + unsigned short lre_start; /* index to lo_entries, start index of + * this mirror */ + unsigned short lre_end; /* end index of this mirror */ +}; + +/** + * lov-specific file state. + * + * lov object has particular layout type, determining how top-object is built + * on top of sub-objects. Layout type can change dynamically. When this + * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, + * all state pertaining to the old layout type is destroyed, and new state is + * constructed. All object methods take said semaphore in the shared mode, + * providing serialization against transition between layout types. + * + * To avoid multiple `if' or `switch' statements, selecting behavior for the + * current layout type, object methods perform double-dispatch, invoking + * function corresponding to the current layout type. + */ +struct lov_object { + struct cl_object lo_cl; + /** + * Serializes object operations with transitions between layout types. + * + * This semaphore is taken in shared mode by all object methods, and + * is taken in exclusive mode when object type is changed. + * + * \see lov_object::lo_type + */ + struct rw_semaphore lo_type_guard; + /** + * Type of an object. Protected by lov_object::lo_type_guard. + */ + enum lov_layout_type lo_type; + /** + * True if layout is invalid. This bit is cleared when layout lock + * is lost. + */ + bool lo_layout_invalid; + /** + * How many IOs are on going on this object. Layout can be changed + * only if there is no active IO. + */ + atomic_t lo_active_ios; + /** + * Waitq - wait for no one else is using lo_lsm + */ + wait_queue_head_t lo_waitq; + /** + * Layout metadata. NULL if empty layout. + */ + struct lov_stripe_md *lo_lsm; + + union lov_layout_state { + struct lov_layout_state_empty { + } empty; + struct lov_layout_state_released { + } released; + struct lov_layout_composite { + /** + * flags of lov_comp_md_v1::lcm_flags. Mainly used + * by FLR. + */ + uint32_t lo_flags; + /** + * For FLR: index of preferred mirror to read. + * Preferred mirror is initialized by the preferred + * bit of lsme. It can be changed when the preferred + * is inaccessible. + * In order to make lov_lsm_entry() return the same + * mirror in the same IO context, it's only possible + * to change the preferred mirror when the + * lo_active_ios reaches zero. + */ + int lo_preferred_mirror; + /** + * For FLR: the lock to protect access to + * lo_preferred_mirror. + */ + spinlock_t lo_write_lock; + /** + * For FLR: Number of (valid) mirrors. + */ + unsigned lo_mirror_count; + struct lov_mirror_entry *lo_mirrors; + /** + * Current entry count of lo_entries, include + * invalid entries. + */ + unsigned int lo_entry_count; + struct lov_layout_entry *lo_entries; + } composite; + } u; + /** + * Thread that acquired lov_object::lo_type_guard in an exclusive + * mode. 
+ */ + struct task_struct *lo_owner; +}; + +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i].lle_raid0; +} + +static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_lsm != NULL); + LASSERT(i < lov->lo_lsm->lsm_entry_count); + + return lov->lo_lsm->lsm_entries[i]; +} + +static inline unsigned lov_flr_state(const struct lov_object *lov) +{ + if (lov->lo_type != LLT_COMP) + return LCM_FL_NONE; + + return lov->u.composite.lo_flags & LCM_FL_FLR_MASK; +} + +static inline bool lov_is_flr(const struct lov_object *lov) +{ + return lov_flr_state(lov) != LCM_FL_NONE; +} + +static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i]; +} + +#define lov_for_layout_entry(lov, entry, start, end) \ + for (entry = lov_entry(lov, start); \ + entry <= lov_entry(lov, end); entry++) + +#define lov_foreach_layout_entry(lov, entry) \ + lov_for_layout_entry(lov, entry, 0, \ + (lov)->u.composite.lo_entry_count - 1) + +#define lov_foreach_mirror_layout_entry(lov, entry, lre) \ + lov_for_layout_entry(lov, entry, (lre)->lre_start, (lre)->lre_end) + +static inline struct lov_mirror_entry * +lov_mirror_entry(struct lov_object *lov, int i) +{ + LASSERT(i < lov->u.composite.lo_mirror_count); + return &lov->u.composite.lo_mirrors[i]; +} + +#define lov_foreach_mirror_entry(lov, lre) \ + for (lre = lov_mirror_entry(lov, 0); \ + lre <= lov_mirror_entry(lov, \ + lov->u.composite.lo_mirror_count - 1); \ + lre++) + +static inline unsigned +lov_layout_entry_index(struct lov_object *lov, struct lov_layout_entry *entry) +{ + struct lov_layout_entry *first = &lov->u.composite.lo_entries[0]; + unsigned index = (unsigned)(entry - first); + + LASSERT(entry >= first); + LASSERT(index < lov->u.composite.lo_entry_count); + + return index; +} + +/** + * State lov_lock keeps for each sub-lock. + */ +struct lov_lock_sub { + /** sub-lock itself */ + struct cl_lock sub_lock; + /** Set if the sublock has ever been enqueued, meaning it may + * hold resources of underlying layers */ + unsigned int sub_is_enqueued:1, + sub_initialized:1; + int sub_index; +}; + +/** + * lov-specific lock state. + */ +struct lov_lock { + struct cl_lock_slice lls_cl; + /** Number of sub-locks in this lock */ + int lls_nr; + /** sublock array */ + struct lov_lock_sub lls_sub[0]; +}; + +struct lov_page { + struct cl_page_slice lps_cl; + /** layout_entry + stripe index, composed using lov_comp_index() */ + unsigned int lps_index; + /* the layout gen when this page was created */ + __u32 lps_layout_gen; +}; + +/* + * Bottom half. + */ + +struct lovsub_device { + struct cl_device acid_cl; + struct cl_device *acid_next; +}; + +struct lovsub_object { + struct cl_object_header lso_header; + struct cl_object lso_cl; + struct lov_object *lso_super; + int lso_index; +}; + +/** + * Describe the environment settings for sublocks. 
+ */ +struct lov_sublock_env { + const struct lu_env *lse_env; + struct cl_io *lse_io; +}; + +struct lov_thread_info { + struct cl_object_conf lti_stripe_conf; + struct lu_fid lti_fid; + struct ost_lvb lti_lvb; + struct cl_2queue lti_cl2q; + struct cl_page_list lti_plist; + wait_queue_entry_t lti_waiter; +}; + +/** + * State that lov_io maintains for every sub-io. + */ +struct lov_io_sub { + /** + * Linkage into a list (hanging off lov_io::lis_subios) + */ + struct list_head sub_list; + /** + * Linkage into a list (hanging off lov_io::lis_active) of all + * sub-io's active for the current IO iteration. + */ + struct list_head sub_linkage; + unsigned int sub_subio_index; + /** + * sub-io for a stripe. Ideally sub-io's can be stopped and resumed + * independently, with lov acting as a scheduler to maximize overall + * throughput. + */ + struct cl_io sub_io; + /** + * environment, in which sub-io executes. + */ + struct lu_env *sub_env; + /** + * environment's refcheck. + * + * \see cl_env_get() + */ + __u16 sub_refcheck; + __u16 sub_reenter; +}; + +/** + * IO state private for LOV. + */ +struct lov_io { + /** super-class */ + struct cl_io_slice lis_cl; + + /** + * FLR: index to lo_mirrors. Valid only if lov_is_flr() returns true. + * + * The mirror index of this io. Preserved over cl_io_init() + * if io->ci_ndelay_tried is greater than zero. + */ + int lis_mirror_index; + /** + * FLR: the layout gen when lis_mirror_index was cached. The + * mirror index makes sense only when the layout gen doesn't + * change. + */ + int lis_mirror_layout_gen; + + /** + * fields below this will be initialized in lov_io_init(). + */ + unsigned lis_preserved; + + /** + * Pointer to the object slice. This is a duplicate of + * lov_io::lis_cl::cis_object. + */ + struct lov_object *lis_object; + /** + * Original end-of-io position for this IO, set by the upper layer as + * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, + * changes pos and count to fit IO into a single stripe and uses saved + * value to determine when IO iterations have to stop. + * + * This is used only for CIT_READ and CIT_WRITE io's. + */ + loff_t lis_io_endpos; + + /** + * starting position within a file, for the current io loop iteration + * (stripe), used by ci_io_loop(). + */ + loff_t lis_pos; + /** + * end position with in a file, for the current stripe io. This is + * exclusive (i.e., next offset after last byte affected by io). + */ + loff_t lis_endpos; + int lis_nr_subios; + + /** + * the index of ls_single_subio in ls_subios array + */ + int lis_single_subio_index; + struct lov_io_sub lis_single_subio; + + /** + * List of active sub-io's. Active sub-io's are under the range + * of [lis_pos, lis_endpos). + */ + struct list_head lis_active; + /** + * All sub-io's created in this lov_io. 
+ */ + struct list_head lis_subios; + +}; + +struct lov_session { + struct lov_io ls_io; + struct lov_sublock_env ls_subenv; +}; + +extern struct lu_device_type lov_device_type; +extern struct lu_device_type lovsub_device_type; + +extern struct lu_context_key lov_key; +extern struct lu_context_key lov_session_key; + +extern struct kmem_cache *lov_lock_kmem; +extern struct kmem_cache *lov_object_kmem; +extern struct kmem_cache *lov_thread_kmem; +extern struct kmem_cache *lov_session_kmem; + +extern struct kmem_cache *lovsub_object_kmem; + +int lov_object_init (const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lovsub_object_init (const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lov_lock_init (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); + +int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, + int stripe); + +int lov_page_init (const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, pgoff_t index); +int lov_page_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +struct lu_object *lov_object_alloc (const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); +int lov_page_stripe(const struct cl_page *page); +bool lov_page_is_empty(const struct cl_page *page); +int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset); +int lov_io_layout_at(struct lov_io *lio, __u64 offset); + +#define lov_foreach_target(lov, var) \ + for (var = 0; var < lov_targets_nr(lov); ++var) + +static inline struct lu_extent *lov_io_extent(struct lov_io *io, int i) +{ + return &lov_lse(io->lis_object, i)->lsme_extent; +} + +/** + * For layout entries within @ext. + */ +#define lov_foreach_io_layout(ind, lio, ext) \ + for (ind = lov_io_layout_at(lio, (ext)->e_start); \ + ind >= 0 && \ + lu_extent_is_overlapped(lov_io_extent(lio, ind), ext); \ + ind = lov_io_layout_at(lio, lov_io_extent(lio, ind)->e_end)) + +/***************************************************************************** + * + * Type conversions. + * + * Accessors. 
+ * + */ + +static inline struct lov_session *lov_env_session(const struct lu_env *env) +{ + struct lov_session *ses; + + ses = lu_context_key_get(env->le_ses, &lov_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct lov_io *lov_env_io(const struct lu_env *env) +{ + return &lov_env_session(env)->ls_io; +} + +static inline int lov_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lov_device_type; +} + +static inline int lovsub_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lovsub_device_type; +} + +static inline struct lu_device *lov2lu_dev(struct lov_device *lov) +{ + return &lov->ld_cl.cd_lu_dev; +} + +static inline struct lov_device *lu2lov_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lov_device_type); + return container_of0(d, struct lov_device, ld_cl.cd_lu_dev); +} + +static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) +{ + return &lovsub->acid_cl; +} + +static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) +{ + return &lovsub2cl_dev(lovsub)->cd_lu_dev; +} + +static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev); +} + +static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) +{ + LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl); +} + +static inline struct lu_object *lov2lu(struct lov_object *lov) +{ + return &lov->lo_cl.co_lu; +} + +static inline struct cl_object *lov2cl(struct lov_object *lov) +{ + return &lov->lo_cl; +} + +static inline struct lov_object *lu2lov(const struct lu_object *obj) +{ + LINVRNT(lov_is_object(obj)); + return container_of0(obj, struct lov_object, lo_cl.co_lu); +} + +static inline struct lov_object *cl2lov(const struct cl_object *obj) +{ + LINVRNT(lov_is_object(&obj->co_lu)); + return container_of0(obj, struct lov_object, lo_cl); +} + +static inline struct lu_object *lovsub2lu(struct lovsub_object *los) +{ + return &los->lso_cl.co_lu; +} + +static inline struct cl_object *lovsub2cl(struct lovsub_object *los) +{ + return &los->lso_cl; +} + +static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) +{ + LINVRNT(lovsub_is_object(&obj->co_lu)); + return container_of0(obj, struct lovsub_object, lso_cl); +} + +static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) +{ + LINVRNT(lovsub_is_object(obj)); + return container_of0(obj, struct lovsub_object, lso_cl.co_lu); +} + +static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lov_lock, lls_cl); +} + +static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lov_page, lps_cl); +} + +static inline struct lov_io *cl2lov_io(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio; + + lio = container_of(ios, struct lov_io, lis_cl); + LASSERT(lio == lov_env_io(env)); + return lio; +} + +static inline int lov_targets_nr(const struct lov_device *lov) +{ + return lov->ld_lov->desc.ld_tgt_count; +} + +static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) +{ + struct lov_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &lov_key); + LASSERT(info != NULL); + 
return info; +} + +/* lov_pack.c */ +int lov_getstripe(const struct lu_env *env, struct lov_object *obj, + struct lov_stripe_md *lsm, struct lov_user_md __user *lump, + size_t size); + +/** @} lov */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c new file mode 100644 index 0000000000000..1faef7ad76afa --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c @@ -0,0 +1,597 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +/* class_name2obd() */ +#include + +#include "lov_cl_internal.h" + +struct kmem_cache *lov_lock_kmem; +struct kmem_cache *lov_object_kmem; +struct kmem_cache *lov_thread_kmem; +struct kmem_cache *lov_session_kmem; + +struct kmem_cache *lovsub_object_kmem; + +struct lu_kmem_descr lov_caches[] = { + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof(struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof(struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof(struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof(struct lov_session) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof(struct lovsub_object) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Lov device and device type functions. 
+ * + */ + +static void *lov_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); + if (!info) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, lov_thread_kmem); +} + +struct lu_context_key lov_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini +}; + +static void *lov_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); + if (!info) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_session *info = data; + + OBD_SLAB_FREE_PTR(info, lov_session_kmem); +} + +struct lu_context_key lov_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini +}; + +/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ +LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + + +static int lov_mdc_dev_init(const struct lu_env *env, struct lov_device *ld, + struct lu_device *mdc_dev, __u32 idx, __u32 nr) +{ + struct cl_device *cl; + + ENTRY; + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + mdc_dev); + if (IS_ERR(cl)) + RETURN(PTR_ERR(cl)); + + ld->ld_md_tgts[nr].ldm_mdc = cl; + ld->ld_md_tgts[nr].ldm_idx = idx; + RETURN(0); +} + +static struct lu_device *lov_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + + LASSERT(ld->ld_lov != NULL); + + if (ld->ld_lmv) { + class_decref(ld->ld_lmv, "lov", d); + ld->ld_lmv = NULL; + } + + if (ld->ld_md_tgts) { + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (!ld->ld_md_tgts[i].ldm_mdc) + continue; + + cl_stack_fini(env, ld->ld_md_tgts[i].ldm_mdc); + ld->ld_md_tgts[i].ldm_mdc = NULL; + ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc = NULL; + } + } + + if (ld->ld_target) { + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + } + RETURN(NULL); +} + +static int lov_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + /* check all added already MDC subdevices and initialize them */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + struct obd_device *mdc; + __u32 idx; + + mdc = ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc; + idx = ld->ld_lov->lov_mdc_tgts[i].lmtd_index; + + if (!mdc) + continue; + + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, i); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + d->ld_obd->obd_name, + obd_uuid2str(&mdc->obd_uuid), rc); + GOTO(out_err, rc); + } + } + + if (!ld->ld_target) + RETURN(0); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (!desc) + continue; + + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) + GOTO(out_err, rc = PTR_ERR(cl)); + + lsd = cl2lovsub_dev(cl); + ld->ld_target[i] = lsd; + } + ld->ld_flags |= LOV_DEV_INITIALIZED; + RETURN(0); + 
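+	/* error path: finalize the device to undo the partial setup above */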
+out_err: + lu_device_fini(d); + RETURN(rc); +} + +/* Free the lov specific data created for the back end lu_device. */ +static struct lu_device *lov_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + const int nr = ld->ld_target_nr; + + lu_site_fini(&ld->ld_site); + + cl_device_fini(lu2cl_dev(d)); + if (ld->ld_target) { + OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]); + ld->ld_target = NULL; + } + if (ld->ld_md_tgts) { + OBD_FREE(ld->ld_md_tgts, + sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; + } + /* free array of MDCs */ + if (ld->ld_lov->lov_mdc_tgts) { + OBD_FREE(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; + } + + OBD_FREE_PTR(ld); + return NULL; +} + +static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct lov_device *ld = lu2lov_dev(dev); + + ENTRY; + + if (ld->ld_target[index]) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; +} + +static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) +{ + int result; + __u32 tgt_size; + __u32 sub_size; + + ENTRY; + result = 0; + tgt_size = dev->ld_lov->lov_tgt_size; + sub_size = dev->ld_target_nr; + if (sub_size < tgt_size) { + struct lovsub_device **newd; + const size_t sz = sizeof(newd[0]); + + OBD_ALLOC(newd, tgt_size * sz); + if (newd) { + if (sub_size > 0) { + memcpy(newd, dev->ld_target, sub_size * sz); + OBD_FREE(dev->ld_target, sub_size * sz); + } + + dev->ld_target = newd; + dev->ld_target_nr = tgt_size; + } else { + result = -ENOMEM; + } + } + + RETURN(result); +} + +static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + + ENTRY; + + lov_tgts_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); + + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + RETURN(-EINVAL); + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + tgt->ltd_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + lsd = cl2lovsub_dev(cl); + ld->ld_target[index] = lsd; + } else { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_cl_del_target(env, dev, index); + rc = PTR_ERR(cl); + } + } + + lov_tgts_putref(obd); + + RETURN(rc); +} + +/** + * Add new MDC target device in LOV. + * + * This function is part of the configuration log processing. It adds new MDC + * device to the MDC device array indexed by their indexes. 
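+ * If the LOV device has already been initialized, the new MDC is also set
+ * up as a cl sub-device right away; otherwise it is set up later from
+ * lov_device_init().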
+ * + * \param[in] env execution environment + * \param[in] d LU device of LOV device + * \param[in] mdc MDC device to add + * \param[in] idx MDC device index + * + * \retval 0 if successful + * \retval negative value on error + */ +static int lov_add_mdc_target(const struct lu_env *env, struct lu_device *d, + struct obd_device *mdc, __u32 idx) +{ + struct lov_device *ld = lu2lov_dev(d); + struct obd_device *lov_obd = d->ld_obd; + struct obd_device *lmv_obd; + int next; + int rc = 0; + + ENTRY; + + LASSERT(mdc != NULL); + if (ld->ld_md_tgts_nr == LOV_MDC_TGT_MAX) { + /* + * If the maximum value of LOV_MDC_TGT_MAX will become too + * small then all MD target handling must be rewritten in LOD + * manner, check lod_add_device() and related functionality. + */ + CERROR("%s: cannot serve more than %d MDC devices\n", + lov_obd->obd_name, LOV_MDC_TGT_MAX); + RETURN(-ERANGE); + } + + /* + * grab FLD from lmv, do that here, when first MDC is added + * to be sure LMV is set up and can be found + */ + if (!ld->ld_lmv) { + next = 0; + while ((lmv_obd = class_devices_in_group(&lov_obd->obd_uuid, + &next)) != NULL) { + if ((strncmp(lmv_obd->obd_type->typ_name, + LUSTRE_LMV_NAME, + strlen(LUSTRE_LMV_NAME)) == 0)) + break; + } + if (!lmv_obd) { + CERROR("%s: cannot find LMV OBD by UUID (%s)\n", + lov_obd->obd_name, + obd_uuid2str(&lmv_obd->obd_uuid)); + RETURN(-ENODEV); + } + spin_lock(&lmv_obd->obd_dev_lock); + class_incref(lmv_obd, "lov", ld); + spin_unlock(&lmv_obd->obd_dev_lock); + ld->ld_lmv = lmv_obd; + } + + LASSERT(lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc == + NULL); + + if (ld->ld_flags & LOV_DEV_INITIALIZED) { + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, + ld->ld_md_tgts_nr); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + lov_obd->obd_name, obd_uuid2str(&mdc->obd_uuid), + rc); + RETURN(rc); + } + } + + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc = mdc; + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_index = idx; + ld->ld_md_tgts_nr++; + + RETURN(rc); +} + +static int lov_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + u32 index; + + lov_tgts_getref(obd); + + cmd = cfg->lcfg_command; + + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc < 0) + GOTO(out, rc); + + switch (cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + case LCFG_ADD_MDC: + { + struct obd_device *mdc; + struct obd_uuid tgt_uuid; + + /* + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID + */ + if (LUSTRE_CFG_BUFLEN(cfg, 1) > sizeof(tgt_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&tgt_uuid, lustre_cfg_buf(cfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(cfg, 2), 10, &index); + if (rc) + GOTO(out, rc); + + mdc = class_find_client_obd(&tgt_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc) + GOTO(out, rc = -ENODEV); + rc = lov_add_mdc_target(env, d, mdc, index); + break; + } + } +out: + lov_tgts_putref(obd); + RETURN(rc); +} + +static const struct lu_device_operations lov_lu_ops = { + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, +}; + +static struct lu_device *lov_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) 
+{ + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(ld); + if (!ld) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&ld->ld_cl, t); + d = lov2lu_dev(ld); + d->ld_ops = &lov_lu_ops; + + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) + GOTO(out, rc); + + /* Alloc MDC devices array */ + /* XXX: need dynamic allocation at some moment */ + OBD_ALLOC(ld->ld_md_tgts, sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + if (!ld->ld_md_tgts) + GOTO(out, rc = -ENOMEM); + + ld->ld_md_tgts_nr = 0; + + ld->ld_lov = &obd->u.lov; + OBD_ALLOC(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + if (!ld->ld_lov->lov_mdc_tgts) + GOTO(out_md_tgts, rc = -ENOMEM); + + rc = lu_site_init(&ld->ld_site, d); + if (rc != 0) + GOTO(out_mdc_tgts, rc); + + rc = lu_site_init_finish(&ld->ld_site); + if (rc != 0) + GOTO(out_site, rc); + + RETURN(d); +out_site: + lu_site_fini(&ld->ld_site); +out_mdc_tgts: + OBD_FREE(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; +out_md_tgts: + OBD_FREE(ld->ld_md_tgts, sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; +out: + OBD_FREE_PTR(ld); + + return ERR_PTR(rc); +} + +static const struct lu_device_type_operations lov_device_type_ops = { + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, + + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, + + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, + + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini +}; + +struct lu_device_type lov_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c new file mode 100644 index 0000000000000..1d388637d0235 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c @@ -0,0 +1,570 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_ea.c + * + * Author: Wang Di + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include + +#include +#include "lov_internal.h" + +static inline void +lu_extent_le_to_cpu(struct lu_extent *dst, const struct lu_extent *src) +{ + dst->e_start = le64_to_cpu(src->e_start); + dst->e_end = le64_to_cpu(src->e_end); +} + +/* + * Find minimum stripe maxbytes value. For inactive or + * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. + */ +static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) +{ + struct obd_import *imp; + loff_t maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + + if (!tgt->ltd_active) + return maxbytes; + + imp = tgt->ltd_obd->u.cli.cl_import; + if (!imp) + return maxbytes; + + spin_lock(&imp->imp_lock); + if ((imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE) && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && + imp->imp_connect_data.ocd_maxbytes > 0) + maxbytes = imp->imp_connect_data.ocd_maxbytes; + + spin_unlock(&imp->imp_lock); + + return maxbytes; +} + +static int lsm_lmm_verify_v1v3(struct lov_mds_md *lmm, size_t lmm_size, + u16 stripe_count) +{ + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CERROR("bad stripe count %d\n", stripe_count); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm_oi_id(&lmm->lmm_oi) == 0) { + CERROR("zero object id\n"); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT && + lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { + CERROR("bad striping pattern\n"); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm->lmm_stripe_size == 0 || + (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) { + CERROR("bad stripe size %u\n", + le32_to_cpu(lmm->lmm_stripe_size)); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + return 0; +} + +static void lsme_free(struct lov_stripe_md_entry *lsme) +{ + unsigned int stripe_count = lsme->lsme_stripe_count; + unsigned int i; + size_t lsme_size; + + if (!lsme_inited(lsme) || + lsme->lsme_pattern & LOV_PATTERN_F_RELEASED) + stripe_count = 0; + for (i = 0; i < stripe_count; i++) + OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); + + lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); + OBD_FREE_LARGE(lsme, lsme_size); +} + +void lsm_free(struct lov_stripe_md *lsm) +{ + unsigned int entry_count = lsm->lsm_entry_count; + unsigned int i; + size_t lsm_size; + + for (i = 0; i < entry_count; i++) + lsme_free(lsm->lsm_entries[i]); + + lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); + OBD_FREE(lsm, lsm_size); +} + +/** + * Unpack a struct lov_mds_md into a struct lov_stripe_md_entry. + * + * The caller should set id and extent. 
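+ *
+ * Returns the new lov_stripe_md_entry on success, or an ERR_PTR on failure.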
+ */ +static struct lov_stripe_md_entry * +lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, + const char *pool_name, bool inited, struct lov_ost_data_v1 *objects, + loff_t *maxbytes) +{ + struct lov_stripe_md_entry *lsme; + size_t lsme_size; + loff_t min_stripe_maxbytes = 0; + loff_t lov_bytes; + u32 magic; + u32 pattern; + unsigned int stripe_count; + unsigned int i; + int rc; + + magic = le32_to_cpu(lmm->lmm_magic); + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) + RETURN(ERR_PTR(-EINVAL)); + + pattern = le32_to_cpu(lmm->lmm_pattern); + if (pattern & LOV_PATTERN_F_RELEASED || !inited) + stripe_count = 0; + else + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + + if (buf_size < (magic == LOV_MAGIC_V1 ? sizeof(struct lov_mds_md_v1) : + sizeof(struct lov_mds_md_v3))) { + CERROR("LOV EA %s too small: %zu, need %u\n", + magic == LOV_MAGIC_V1 ? "V1" : "V3", buf_size, + lov_mds_md_size(stripe_count, magic == LOV_MAGIC_V1 ? + LOV_MAGIC_V1 : LOV_MAGIC_V3)); + lov_dump_lmm_common(D_WARNING, lmm); + return ERR_PTR(-EINVAL); + } + + rc = lsm_lmm_verify_v1v3(lmm, buf_size, stripe_count); + if (rc < 0) + return ERR_PTR(rc); + + lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); + OBD_ALLOC_LARGE(lsme, lsme_size); + if (!lsme) + RETURN(ERR_PTR(-ENOMEM)); + + lsme->lsme_magic = magic; + lsme->lsme_pattern = pattern; + lsme->lsme_flags = 0; + lsme->lsme_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + /* preserve the possible -1 stripe count for uninstantiated component */ + lsme->lsme_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + lsme->lsme_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + + if (pool_name) { + size_t pool_name_len; + + pool_name_len = strlcpy(lsme->lsme_pool_name, pool_name, + sizeof(lsme->lsme_pool_name)); + if (pool_name_len >= sizeof(lsme->lsme_pool_name)) + GOTO(out_lsme, rc = -E2BIG); + } + + /* with Data-on-MDT set maxbytes to stripe size */ + if (lsme_is_dom(lsme)) { + if (maxbytes) { + lov_bytes = lsme->lsme_stripe_size; + goto out_dom1; + } else { + goto out_dom2; + } + } + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_tgt_desc *ltd; + + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (!loi) + GOTO(out_lsme, rc = -ENOMEM); + + lsme->lsme_oinfo[i] = loi; + + ostid_le_to_cpu(&objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(objects[i].l_ost_gen); + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count && + !lov2obd(lov)->obd_process_conf) { + CERROR("%s: OST index %d more than OST count %d\n", + (char*)lov->desc.ld_uuid.uuid, + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v1(D_WARNING, lmm); + GOTO(out_lsme, rc = -EINVAL); + } + + ltd = lov->lov_tgts[loi->loi_ost_idx]; + if (!ltd) { + CERROR("%s: OST index %d missing\n", + (char*)lov->desc.ld_uuid.uuid, loi->loi_ost_idx); + lov_dump_lmm_v1(D_WARNING, lmm); + continue; + } + + lov_bytes = lov_tgt_maxbytes(ltd); + if (min_stripe_maxbytes == 0 || lov_bytes < min_stripe_maxbytes) + min_stripe_maxbytes = lov_bytes; + } + + if (maxbytes) { + if (min_stripe_maxbytes == 0) + min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + + if (stripe_count == 0) + stripe_count = lov->desc.ld_tgt_count; + + if (min_stripe_maxbytes <= LLONG_MAX / stripe_count) + lov_bytes = min_stripe_maxbytes * stripe_count; + else + lov_bytes = MAX_LFS_FILESIZE; +out_dom1: + *maxbytes = min_t(loff_t, lov_bytes, MAX_LFS_FILESIZE); + } 
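+	/* DoM entries with no maxbytes request jump straight here */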
+out_dom2: + + return lsme; + +out_lsme: + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsme->lsme_oinfo[i]; + + if (loi) + OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); + } + OBD_FREE_LARGE(lsme, lsme_size); + + return ERR_PTR(rc); +} + +static struct +lov_stripe_md *lsm_unpackmd_v1v3(struct lov_obd *lov, struct lov_mds_md *lmm, + size_t buf_size, const char *pool_name, + struct lov_ost_data_v1 *objects) +{ + struct lov_stripe_md *lsm; + struct lov_stripe_md_entry *lsme; + size_t lsm_size; + loff_t maxbytes; + u32 pattern; + int rc; + + pattern = le32_to_cpu(lmm->lmm_pattern); + + lsme = lsme_unpack(lov, lmm, buf_size, pool_name, true, objects, + &maxbytes); + if (IS_ERR(lsme)) + RETURN(ERR_CAST(lsme)); + + lsme->lsme_flags = LCME_FL_INIT; + lsme->lsme_extent.e_start = 0; + lsme->lsme_extent.e_end = LUSTRE_EOF; + + lsm_size = offsetof(typeof(*lsm), lsm_entries[1]); + OBD_ALLOC(lsm, lsm_size); + if (!lsm) + GOTO(out_lsme, rc = -ENOMEM); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_maxbytes = maxbytes; + lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi); + lsm->lsm_magic = le32_to_cpu(lmm->lmm_magic); + lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + lsm->lsm_entry_count = 1; + lsm->lsm_is_released = pattern & LOV_PATTERN_F_RELEASED; + lsm->lsm_entries[0] = lsme; + + return lsm; + +out_lsme: + lsme_free(lsme); + + return ERR_PTR(rc); +} + +static inline struct lov_stripe_md * +lsm_unpackmd_v1(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_mds_md_v1 *lmm = buf; + + return lsm_unpackmd_v1v3(lov, buf, buf_size, NULL, lmm->lmm_objects); +} + +const struct lsm_operations lsm_v1_ops = { + .lsm_unpackmd = lsm_unpackmd_v1, +}; + +static inline +struct lov_stripe_md *lsm_unpackmd_v3(struct lov_obd *lov, void *buf, + size_t buf_size) +{ + struct lov_mds_md_v3 *lmm = buf; + + return lsm_unpackmd_v1v3(lov, buf, buf_size, lmm->lmm_pool_name, + lmm->lmm_objects); +} + +const struct lsm_operations lsm_v3_ops = { + .lsm_unpackmd = lsm_unpackmd_v3, +}; + +static int lsm_verify_comp_md_v1(struct lov_comp_md_v1 *lcm, + size_t lcm_buf_size) +{ + unsigned int entry_count; + unsigned int i; + size_t lcm_size; + + lcm_size = le32_to_cpu(lcm->lcm_size); + if (lcm_buf_size < lcm_size) { + CERROR("bad LCM buffer size %zu, expected %zu\n", + lcm_buf_size, lcm_size); + RETURN(-EINVAL); + } + + entry_count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < entry_count; i++) { + struct lov_comp_md_entry_v1 *lcme = &lcm->lcm_entries[i]; + size_t blob_offset; + size_t blob_size; + + blob_offset = le32_to_cpu(lcme->lcme_offset); + blob_size = le32_to_cpu(lcme->lcme_size); + + if (lcm_size < blob_offset || lcm_size < blob_size || + lcm_size < blob_offset + blob_size) { + CERROR("LCM entry %u has invalid blob: " + "LCM size = %zu, offset = %zu, size = %zu\n", + le32_to_cpu(lcme->lcme_id), + lcm_size, blob_offset, blob_size); + RETURN(-EINVAL); + } + } + + return 0; +} + +static struct lov_stripe_md_entry * +lsme_unpack_comp(struct lov_obd *lov, struct lov_mds_md *lmm, + size_t lmm_buf_size, bool inited, loff_t *maxbytes) +{ + unsigned int magic; + unsigned int stripe_count; + + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (stripe_count == 0 && + lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT) + RETURN(ERR_PTR(-EINVAL)); + /* un-instantiated lmm contains no ost id info, i.e. 
lov_ost_data_v1 */ + if (!inited) + stripe_count = 0; + + magic = le32_to_cpu(lmm->lmm_magic); + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) + RETURN(ERR_PTR(-EINVAL)); + + if (lmm_buf_size < lov_mds_md_size(stripe_count, magic)) + RETURN(ERR_PTR(-EINVAL)); + + if (magic == LOV_MAGIC_V1) { + return lsme_unpack(lov, lmm, lmm_buf_size, NULL, + inited, lmm->lmm_objects, maxbytes); + } else { + struct lov_mds_md_v3 *lmm3 = (struct lov_mds_md_v3 *)lmm; + + return lsme_unpack(lov, lmm, lmm_buf_size, lmm3->lmm_pool_name, + inited, lmm3->lmm_objects, maxbytes); + } +} + +static struct lov_stripe_md * +lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_comp_md_v1 *lcm = buf; + struct lov_stripe_md *lsm; + size_t lsm_size; + unsigned int entry_count = 0; + unsigned int i; + loff_t maxbytes; + int rc; + + rc = lsm_verify_comp_md_v1(buf, buf_size); + if (rc < 0) + return ERR_PTR(rc); + + entry_count = le16_to_cpu(lcm->lcm_entry_count); + + lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); + OBD_ALLOC(lsm, lsm_size); + if (!lsm) + return ERR_PTR(-ENOMEM); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic); + lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen); + lsm->lsm_entry_count = entry_count; + lsm->lsm_mirror_count = le16_to_cpu(lcm->lcm_mirror_count); + lsm->lsm_flags = le16_to_cpu(lcm->lcm_flags); + lsm->lsm_is_released = true; + lsm->lsm_maxbytes = LLONG_MIN; + + for (i = 0; i < entry_count; i++) { + struct lov_comp_md_entry_v1 *lcme = &lcm->lcm_entries[i]; + struct lov_stripe_md_entry *lsme; + size_t blob_offset; + size_t blob_size; + void *blob; + + blob_offset = le32_to_cpu(lcme->lcme_offset); + blob_size = le32_to_cpu(lcme->lcme_size); + blob = (char *)lcm + blob_offset; + + lsme = lsme_unpack_comp(lov, blob, blob_size, + le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT, + (i == entry_count - 1) ? &maxbytes : + NULL); + if (IS_ERR(lsme)) + GOTO(out_lsm, rc = PTR_ERR(lsme)); + + if (!(lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)) + lsm->lsm_is_released = false; + + lsm->lsm_entries[i] = lsme; + lsme->lsme_id = le32_to_cpu(lcme->lcme_id); + lsme->lsme_flags = le32_to_cpu(lcme->lcme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lsme->lsme_timestamp = + le64_to_cpu(lcme->lcme_timestamp); + lu_extent_le_to_cpu(&lsme->lsme_extent, &lcme->lcme_extent); + + if (i == entry_count - 1) { + lsm->lsm_maxbytes = (loff_t)lsme->lsme_extent.e_start + + maxbytes; + /* + * the last component hasn't been defined, or + * lsm_maxbytes overflowed. 
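+ * In either case fall back to MAX_LFS_FILESIZE.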
+ */ + if (!lsme_is_dom(lsme) && + (lsme->lsme_extent.e_end != LUSTRE_EOF || + lsm->lsm_maxbytes < + (loff_t)lsme->lsme_extent.e_start)) + lsm->lsm_maxbytes = MAX_LFS_FILESIZE; + } + } + + RETURN(lsm); + +out_lsm: + for (i = 0; i < entry_count; i++) + if (lsm->lsm_entries[i]) + lsme_free(lsm->lsm_entries[i]); + + OBD_FREE(lsm, lsm_size); + + RETURN(ERR_PTR(rc)); +} + +const struct lsm_operations lsm_comp_md_v1_ops = { + .lsm_unpackmd = lsm_unpackmd_comp_md_v1, +}; + +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm) +{ + int i, j; + + CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes %#llx, magic 0x%08X, " + "refc: %d, entry: %u, layout_gen %u\n", + lsm, POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, + atomic_read(&lsm->lsm_refc), lsm->lsm_entry_count, + lsm->lsm_layout_gen); + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + + CDEBUG(level, DEXT ": id: %u, flags: %x, " + "magic 0x%08X, layout_gen %u, " + "stripe count %u, sstripe size %u, " + "pool: ["LOV_POOLNAMEF"]\n", + PEXT(&lse->lsme_extent), lse->lsme_id, lse->lsme_flags, + lse->lsme_magic, lse->lsme_layout_gen, + lse->lsme_stripe_count, lse->lsme_stripe_size, + lse->lsme_pool_name); + if (!lsme_inited(lse) || + lse->lsme_pattern & LOV_PATTERN_F_RELEASED) + continue; + for (j = 0; j < lse->lsme_stripe_count; j++) { + CDEBUG(level, " oinfo:%p: ostid: "DOSTID + " ost idx: %d gen: %d\n", + lse->lsme_oinfo[j], + POSTID(&lse->lsme_oinfo[j]->loi_oi), + lse->lsme_oinfo[j]->loi_ost_idx, + lse->lsme_oinfo[j]->loi_ost_gen); + } + } +} + +int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset) +{ + int i; + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + + if ((offset >= lse->lsme_extent.e_start && + offset < lse->lsme_extent.e_end) || + (offset == OBD_OBJECT_EOF && + lse->lsme_extent.e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h new file mode 100644 index 0000000000000..c4ea3804db4ba --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -0,0 +1,377 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef LOV_INTERNAL_H +#define LOV_INTERNAL_H + +#include +#include + +/* If we are unable to get the maximum object size from the OST in + * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using + * the old maximum object size from ext3. */ +#define LUSTRE_EXT3_STRIPE_MAXBYTES 0x1fffffff000ULL + +struct lov_stripe_md_entry { + struct lu_extent lsme_extent; + u32 lsme_id; + u32 lsme_magic; + u32 lsme_flags; + u32 lsme_pattern; + u64 lsme_timestamp; + u32 lsme_stripe_size; + u16 lsme_stripe_count; + u16 lsme_layout_gen; + char lsme_pool_name[LOV_MAXPOOLNAME + 1]; + struct lov_oinfo *lsme_oinfo[]; +}; + +static inline bool lsme_is_dom(struct lov_stripe_md_entry *lsme) +{ + return (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT); +} + +static inline void copy_lsm_entry(struct lov_stripe_md_entry *dst, + struct lov_stripe_md_entry *src) +{ + unsigned i; + + for (i = 0; i < src->lsme_stripe_count; i++) + *dst->lsme_oinfo[i] = *src->lsme_oinfo[i]; + memcpy(dst, src, offsetof(typeof(*src), lsme_oinfo)); +} + +struct lov_stripe_md { + atomic_t lsm_refc; + spinlock_t lsm_lock; + pid_t lsm_lock_owner; /* debugging */ + + /* maximum possible file size, might change as OSTs status changes, + * e.g. disconnected, deactivated */ + loff_t lsm_maxbytes; + struct ost_id lsm_oi; + u32 lsm_magic; + u32 lsm_layout_gen; + u16 lsm_flags; + bool lsm_is_released; + u16 lsm_mirror_count; + u16 lsm_entry_count; + struct lov_stripe_md_entry *lsm_entries[]; +}; + +static inline bool lsme_inited(const struct lov_stripe_md_entry *lsme) +{ + return lsme->lsme_flags & LCME_FL_INIT; +} + +static inline bool lsm_entry_inited(const struct lov_stripe_md *lsm, int index) +{ + return lsme_inited(lsm->lsm_entries[index]); +} + +static inline bool lsm_is_composite(__u32 magic) +{ + return magic == LOV_MAGIC_COMP_V1; +} + +static inline size_t lov_comp_md_size(const struct lov_stripe_md *lsm) +{ + struct lov_stripe_md_entry *lsme; + size_t size; + int entry; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) + return lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, + lsm->lsm_entries[0]->lsme_magic); + + LASSERT(lsm->lsm_magic == LOV_MAGIC_COMP_V1); + + size = sizeof(struct lov_comp_md_v1); + for (entry = 0; entry < lsm->lsm_entry_count; entry++) { + u16 stripe_count; + + lsme = lsm->lsm_entries[entry]; + + if (lsme_inited(lsme)) + stripe_count = lsme->lsme_stripe_count; + else + stripe_count = 0; + + size += sizeof(*lsme); + size += lov_mds_md_size(stripe_count, + lsme->lsme_magic); + } + + return size; +} + +static inline bool lsm_has_objects(struct lov_stripe_md *lsm) +{ + return lsm != NULL && !lsm->lsm_is_released; +} + +static inline unsigned int lov_comp_index(int entry, int stripe) +{ + LASSERT(entry >= 0 && entry <= SHRT_MAX); + LASSERT(stripe >= 0 && stripe < USHRT_MAX); + + return entry << 16 | stripe; +} + +static inline int lov_comp_stripe(int index) +{ + return index & 0xffff; +} + +static inline int lov_comp_entry(int index) +{ + return index >> 16; +} + +struct lsm_operations { + struct lov_stripe_md *(*lsm_unpackmd)(struct lov_obd *, void *, size_t); +}; + +extern const struct lsm_operations lsm_v1_ops; +extern const struct lsm_operations lsm_v3_ops; +extern const struct lsm_operations lsm_comp_md_v1_ops; +static inline const struct lsm_operations *lsm_op_find(int magic) +{ + switch (magic) { + case LOV_MAGIC_V1: + return &lsm_v1_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; + case LOV_MAGIC_COMP_V1: + return &lsm_comp_md_v1_ops; + default: + 
CERROR("unrecognized lsm_magic %08x\n", magic); + return NULL; + } +} + +void lsm_free(struct lov_stripe_md *lsm); + +/* lov_do_div64(a, b) returns a % b, and a = a / b. + * The 32-bit code is LOV-specific due to knowing about stripe limits in + * order to reduce the divisor to a 32-bit number. If the divisor is + * already a 32-bit value the compiler handles this directly. */ +#if BITS_PER_LONG == 64 +# define lov_do_div64(n, base) ({ \ + uint64_t __base = (base); \ + uint64_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ +}) +#elif BITS_PER_LONG == 32 +# define lov_do_div64(n, base) ({ \ + uint64_t __num = (n); \ + uint64_t __rem; \ + if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), \ + "64 bit lov division %llu / %llu\n", \ + __num, (uint64_t)(base)); \ + __remainder = __num & (LOV_MIN_STRIPE_SIZE - 1); \ + __num >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(__num, (base) >> LOV_MIN_STRIPE_BITS); \ + __rem <<= LOV_MIN_STRIPE_BITS; \ + __rem += __remainder; \ + } else { \ + __rem = do_div(__num, base); \ + } \ + (n) = __num; \ + __rem; \ +}) +#endif + +#define pool_tgt_count(p) ((p)->pool_obds.op_count) +#define pool_tgt_array(p) ((p)->pool_obds.op_array) +#define pool_tgt_rw_sem(p) ((p)->pool_obds.op_rw_sem) + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; + struct lu_tgt_pool pool_obds; + atomic_t pool_refcount; + struct hlist_node pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + struct proc_dir_entry *pool_proc_entry; + struct obd_device *pool_lobd; /* owner */ +}; + +struct lov_request { + struct obd_info rq_oi; + struct lov_request_set *rq_rqset; + struct list_head rq_link; + int rq_idx; /* index in lov->tgts array */ +}; + +struct lov_request_set { + struct obd_info *set_oi; + struct obd_device *set_obd; + int set_count; + atomic_t set_completes; + atomic_t set_success; + struct list_head set_list; +}; + +extern struct kmem_cache *lov_oinfo_slab; + +extern struct lu_kmem_descr lov_caches[]; + +#define lov_uuid2str(lv, index) \ + (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid) + +/* lov_merge.c */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, + struct ost_lvb *lvb, __u64 *kms_place); + +/* lov_offset.c */ +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index); +u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, + u64 ost_size, int stripeno); +int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, + int stripeno, loff_t *obd_off); +loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, + int stripeno); +int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, + struct lu_extent *ext, u64 *obd_start, u64 *obd_end); +int lov_stripe_number(struct lov_stripe_md *lsm, int index, loff_t lov_off); +pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, + pgoff_t stripe_index, int stripe); + +/* lov_request.c */ +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset); +int lov_fini_statfs_set(struct lov_request_set *set); + +/* lov_obd.c */ +void lov_tgts_getref(struct obd_device *obd); +void lov_tgts_putref(struct obd_device *obd); +void lov_stripe_lock(struct lov_stripe_md *md); +void lov_stripe_unlock(struct lov_stripe_md *md); +void lov_fix_desc(struct lov_desc *desc); +void lov_fix_desc_stripe_size(__u64 *val); +void 
lov_fix_desc_stripe_count(__u32 *val); +void lov_fix_desc_pattern(__u32 *val); +void lov_fix_desc_qos_maxage(__u32 *val); +__u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, + __u16 stripe_count); +int lov_connect_obd(struct obd_device *obd, u32 index, int activate, + struct obd_connect_data *data); +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, u32 index, + struct obd_uuid *uuidp, int gen); + +/* lov_pack.c */ +ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size); +struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, + size_t buf_size); +int lov_free_memmd(struct lov_stripe_md **lsmp); + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm); +void lov_dump_lmm_common(int level, void *lmmp); +void lov_dump_lmm(int level, void *lmm); + +/* lov_ea.c */ +void lsm_free_plain(struct lov_stripe_md *lsm); +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); + +/* lproc_lov.c */ +int lov_tunables_init(struct obd_device *obd); + +/* lov_cl.c */ +extern struct lu_device_type lov_device_type; + +#define LOV_MDC_TGT_MAX 256 + +/* pools */ +extern struct cfs_hash_ops pool_hash_operations; +/* lu_tgt_pool methods */ +int lov_ost_pool_init(struct lu_tgt_pool *op, unsigned int count); +int lov_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); +int lov_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lov_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx); +int lov_ost_pool_free(struct lu_tgt_pool *op); + +/* high level pool methods */ +int lov_pool_new(struct obd_device *obd, char *poolname); +int lov_pool_del(struct obd_device *obd, char *poolname); +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); +void lov_dump_pool(int level, struct pool_desc *pool); + +static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm) +{ + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + atomic_inc(&lsm->lsm_refc); + return lsm; +} + +static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) +{ + if (unlikely(loi->loi_oi.oi.oi_id == 0 && + loi->loi_oi.oi.oi_seq == 0 && + loi->loi_ost_idx == 0 && + loi->loi_ost_gen == 0)) + return true; + + return false; +} + +static inline struct obd_device *lov2obd(const struct lov_obd *lov) +{ + return container_of0(lov, struct obd_device, u.lov); +} + +static inline void lov_lsm2layout(struct lov_stripe_md *lsm, + struct lov_stripe_md_entry *lsme, + struct ost_layout *ol) +{ + ol->ol_stripe_size = lsme->lsme_stripe_size; + ol->ol_stripe_count = lsme->lsme_stripe_count; + if (lsm->lsm_magic == LOV_MAGIC_COMP_V1) { + ol->ol_comp_start = lsme->lsme_extent.e_start; + ol->ol_comp_end = lsme->lsme_extent.e_end; + ol->ol_comp_id = lsme->lsme_id; + } else { + ol->ol_comp_start = 0; + ol->ol_comp_end = 0; + ol->ol_comp_id = 0; + } +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_io.c b/drivers/staging/lustrefsx/lustre/lov/lov_io.c new file mode 100644 index 0000000000000..c6eb7121b5db9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_io.c @@ -0,0 +1,1688 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static inline struct lov_io_sub *lov_sub_alloc(struct lov_io *lio, int index) +{ + struct lov_io_sub *sub; + + if (lio->lis_nr_subios == 0) { + LASSERT(lio->lis_single_subio_index == -1); + sub = &lio->lis_single_subio; + lio->lis_single_subio_index = index; + memset(sub, 0, sizeof(*sub)); + } else { + OBD_ALLOC_PTR(sub); + } + + if (sub) { + INIT_LIST_HEAD(&sub->sub_list); + INIT_LIST_HEAD(&sub->sub_linkage); + sub->sub_subio_index = index; + } + + return sub; +} + +static inline void lov_sub_free(struct lov_io *lio, struct lov_io_sub *sub) +{ + if (sub->sub_subio_index == lio->lis_single_subio_index) { + LASSERT(sub == &lio->lis_single_subio); + lio->lis_single_subio_index = -1; + } else { + OBD_FREE_PTR(sub); + } +} + +static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + ENTRY; + + cl_io_fini(sub->sub_env, &sub->sub_io); + + if (sub->sub_env && !IS_ERR(sub->sub_env)) { + cl_env_put(sub->sub_env, &sub->sub_refcheck); + sub->sub_env = NULL; + } + EXIT; +} + +static inline bool +is_index_within_mirror(struct lov_object *lov, int index, int mirror_index) +{ + struct lov_layout_composite *comp = &lov->u.composite; + struct lov_mirror_entry *lre = &comp->lo_mirrors[mirror_index]; + + return (index >= lre->lre_start && index <= lre->lre_end); +} + +static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + struct lov_object *lov = lio->lis_object; + struct cl_io *sub_io; + struct cl_object *sub_obj; + struct cl_io *io = lio->lis_cl.cis_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + int result = 0; + LASSERT(sub->sub_env == NULL); + ENTRY; + + if (unlikely(!lov_r0(lov, index)->lo_sub || + !lov_r0(lov, index)->lo_sub[stripe])) + RETURN(-EIO); + + LASSERTF(is_index_within_mirror(lov, index, lio->lis_mirror_index), + DFID "iot = %d, index = %d, mirror = %d\n", + PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index, + lio->lis_mirror_index); + + /* obtain new environment */ + sub->sub_env = cl_env_get(&sub->sub_refcheck); + if (IS_ERR(sub->sub_env)) { + result = PTR_ERR(sub->sub_env); + RETURN(result); + } + + sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]); + sub_io = &sub->sub_io; + + sub_io->ci_obj = sub_obj; + 
sub_io->ci_result = 0; + + sub_io->ci_parent = io; + sub_io->ci_lockreq = io->ci_lockreq; + sub_io->ci_type = io->ci_type; + sub_io->ci_no_srvlock = io->ci_no_srvlock; + sub_io->ci_noatime = io->ci_noatime; + sub_io->ci_lock_no_expand = io->ci_lock_no_expand; + sub_io->ci_ndelay = io->ci_ndelay; + sub_io->ci_layout_version = io->ci_layout_version; + sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors; + + result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); + + if (result < 0) + lov_io_sub_fini(env, lio, sub); + + RETURN(result); +} + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, + struct lov_io *lio, int index) +{ + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + + list_for_each_entry(sub, &lio->lis_subios, sub_list) { + if (sub->sub_subio_index == index) { + rc = 1; + break; + } + } + + if (rc == 0) { + sub = lov_sub_alloc(lio, index); + if (!sub) + GOTO(out, rc = -ENOMEM); + + rc = lov_io_sub_init(env, lio, sub); + if (rc < 0) { + lov_sub_free(lio, sub); + GOTO(out, rc); + } + + list_add_tail(&sub->sub_list, &lio->lis_subios); + lio->lis_nr_subios++; + } +out: + if (rc < 0) + sub = ERR_PTR(rc); + else + sub->sub_io.ci_noquota = lio->lis_cl.cis_io->ci_noquota; + RETURN(sub); +} + +/***************************************************************************** + * + * Lov io operations. + * + */ + +int lov_page_index(const struct cl_page *page) +{ + const struct cl_page_slice *slice; + ENTRY; + + slice = cl_page_at(page, &lov_device_type); + LASSERT(slice != NULL); + LASSERT(slice->cpl_obj != NULL); + + RETURN(cl2lov_page(slice)->lps_index); +} + +static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, + struct cl_io *io) +{ + ENTRY; + + LASSERT(lio->lis_object != NULL); + + INIT_LIST_HEAD(&lio->lis_subios); + lio->lis_single_subio_index = -1; + lio->lis_nr_subios = 0; + + RETURN(0); +} + +/** + * Decide if it will need write intent RPC + */ +static int lov_io_mirror_write_intent(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + struct lu_extent *ext = &io->ci_write_intent; + struct lov_mirror_entry *lre; + struct lov_mirror_entry *primary; + struct lov_layout_entry *lle; + size_t count = 0; + ENTRY; + + *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos }; + io->ci_need_write_intent = 0; + + if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || + cl_io_is_mkwrite(io))) + RETURN(0); + + /* + * FLR: check if it needs to send a write intent RPC to server. + * Writing to sync_pending file needs write intent RPC to change + * the file state back to write_pending, so that the layout version + * can be increased when the state changes to sync_pending at a later + * time. Otherwise there exists a chance that an evicted client may + * dirty the file data while resync client is working on it. + * Designated I/O is allowed for resync workload. 
+ */ + if (lov_flr_state(obj) == LCM_FL_RDONLY || + (lov_flr_state(obj) == LCM_FL_SYNC_PENDING && + io->ci_designated_mirror == 0)) { + io->ci_need_write_intent = 1; + RETURN(0); + } + + LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING)); + LASSERT(comp->lo_preferred_mirror >= 0); + + /* + * need to iterate all components to see if there are + * multiple components covering the writing component + */ + primary = &comp->lo_mirrors[comp->lo_preferred_mirror]; + LASSERT(!primary->lre_stale); + lov_foreach_mirror_layout_entry(obj, lle, primary) { + LASSERT(lle->lle_valid); + if (!lu_extent_is_overlapped(ext, lle->lle_extent)) + continue; + + ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start); + ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end); + ++count; + } + if (count == 0) { + CERROR(DFID ": cannot find any valid components covering " + "file extent "DEXT", mirror: %d\n", + PFID(lu_object_fid(lov2lu(obj))), PEXT(ext), + primary->lre_mirror_id); + RETURN(-EIO); + } + + count = 0; + lov_foreach_mirror_entry(obj, lre) { + if (lre == primary) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(ext, lle->lle_extent)) { + ++count; + break; + } + } + } + + CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to " + "modify file extent "DEXT", iot: %d\n", + PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type); + + io->ci_need_write_intent = count > 0; + + RETURN(0); +} + +static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, + struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + int index; + int i; + int result; + ENTRY; + + if (!lov_is_flr(obj)) { + /* only locks/pages are manipulated for CIT_MISC op, no + * cl_io_loop() will be called, don't check/set mirror info. + */ + if (io->ci_type != CIT_MISC) { + LASSERT(comp->lo_preferred_mirror == 0); + lio->lis_mirror_index = comp->lo_preferred_mirror; + } + io->ci_ndelay = 0; + RETURN(0); + } + + /* transfer the layout version for verification */ + if (io->ci_layout_version == 0) + io->ci_layout_version = obj->lo_lsm->lsm_layout_gen; + + /* find the corresponding mirror for designated mirror IO */ + if (io->ci_designated_mirror > 0) { + struct lov_mirror_entry *entry; + + LASSERT(!io->ci_ndelay); + + CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n", + lov_flr_state(obj)); + + if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) && + (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) { + /* + * For resync I/O, the ci_layout_version was the layout + * version when resync starts. If it doesn't match the + * current object layout version, it means the layout + * has been changed + */ + RETURN(-ESTALE); + } + + io->ci_layout_version |= LU_LAYOUT_RESYNC; + + index = 0; + lio->lis_mirror_index = -1; + lov_foreach_mirror_entry(obj, entry) { + if (entry->lre_mirror_id == + io->ci_designated_mirror) { + lio->lis_mirror_index = index; + break; + } + + index++; + } + + RETURN(lio->lis_mirror_index < 0 ? 
-EINVAL : 0); + } + + result = lov_io_mirror_write_intent(lio, obj, io); + if (result) + RETURN(result); + + if (io->ci_need_write_intent) { + CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n", + PFID(lu_object_fid(lov2lu(obj))), + lio->lis_pos, lio->lis_endpos); + + if (cl_io_is_trunc(io)) { + /** + * for truncate, we uses [size, EOF) to judge whether + * a write intent needs to be send, but we need to + * restore the write extent to [0, size], in truncate, + * the byte in the size position is accessed. + */ + io->ci_write_intent.e_start = 0; + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size + 1; + } + /* stop cl_io_init() loop */ + RETURN(1); + } + + if (io->ci_ndelay_tried == 0 || /* first time to try */ + /* reset the mirror index if layout has changed */ + lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) { + lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen; + index = lio->lis_mirror_index = comp->lo_preferred_mirror; + } else { + index = lio->lis_mirror_index; + LASSERT(index >= 0); + + /* move mirror index to the next one */ + index = (index + 1) % comp->lo_mirror_count; + } + + for (i = 0; i < comp->lo_mirror_count; i++) { + struct lu_extent ext = { .e_start = lio->lis_pos, + .e_end = lio->lis_pos + 1 }; + struct lov_mirror_entry *lre; + struct lov_layout_entry *lle; + bool found = false; + + lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count]; + if (!lre->lre_valid) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(&ext, lle->lle_extent)) { + found = true; + break; + } + } /* each component of the mirror */ + if (found) { + index = (index + i) % comp->lo_mirror_count; + break; + } + } /* each mirror */ + + if (i == comp->lo_mirror_count) { + CERROR(DFID": failed to find a component covering " + "I/O region at %llu\n", + PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos); + + dump_lsm(D_ERROR, obj->lo_lsm); + + RETURN(-EIO); + } + + CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, " + "have retried: %d, mirror count: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj), + lio->lis_mirror_index, index, io->ci_ndelay_tried, + comp->lo_mirror_count); + + lio->lis_mirror_index = index; + + /* + * FLR: if all mirrors have been tried once, most likely the network + * of this client has been partitioned. We should relinquish CPU for + * a while before trying again. + */ + if (io->ci_ndelay && io->ci_ndelay_tried > 0 && + (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* 10ms */ + if (signal_pending(current)) + RETURN(-EINTR); + + /** + * we'd set ci_tried_all_mirrors to turn off fast mirror + * switching for read after we've tried all mirrors several + * rounds. + */ + io->ci_tried_all_mirrors = io->ci_ndelay_tried % + (comp->lo_mirror_count * 4) == 0; + } + ++io->ci_ndelay_tried; + + CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n", + io->ci_ndelay ? 
"non-" : ""); + + RETURN(0); +} + +static int lov_io_slice_init(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + int index; + int result = 0; + ENTRY; + + io->ci_result = 0; + lio->lis_object = obj; + + LASSERT(obj->lo_lsm != NULL); + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + lio->lis_io_endpos = lio->lis_endpos; + if (cl_io_is_append(io)) { + LASSERT(io->ci_type == CIT_WRITE); + + /* + * If there is LOV EA hole, then we may cannot locate + * the current file-tail exactly. + */ + if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern & + LOV_PATTERN_F_HOLE)) + GOTO(out, result = -EIO); + + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + } + break; + + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_DATA_VERSION: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } + + case CIT_FSYNC: { + lio->lis_pos = io->u.ci_fsync.fi_start; + lio->lis_endpos = io->u.ci_fsync.fi_end; + break; + } + + case CIT_LADVISE: { + lio->lis_pos = io->u.ci_ladvise.li_start; + lio->lis_endpos = io->u.ci_ladvise.li_end; + break; + } + + case CIT_GLIMPSE: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + + if (lov_flr_state(obj) == LCM_FL_RDONLY && + !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE)) + /* SoM is accurate, no need glimpse */ + GOTO(out, result = 1); + break; + + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } + + result = lov_io_mirror_init(lio, obj, io); + if (result) + GOTO(out, result); + + /* check if it needs to instantiate layout */ + if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || + (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0))) + GOTO(out, result = 0); + + /* + * for truncate, it only needs to instantiate the components + * before the truncated size. + */ + if (cl_io_is_trunc(io)) { + io->ci_write_intent.e_start = 0; + /* for writes, e_end is endpos, the location of the file + * pointer after the write is completed, so it is not accessed. + * For truncate, 'end' is the size, and *is* acccessed. + * In other words, writes are [start, end), but truncate is + * [start, size], where both are included. So add 1 to the + * size when creating the write intent to account for this. + */ + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size + 1; + } else { + io->ci_write_intent.e_start = lio->lis_pos; + io->ci_write_intent.e_end = lio->lis_endpos; + } + + index = 0; + lov_foreach_io_layout(index, lio, &io->ci_write_intent) { + if (!lsm_entry_inited(obj->lo_lsm, index)) { + io->ci_need_write_intent = 1; + break; + } + } + + if (io->ci_need_write_intent && io->ci_designated_mirror > 0) { + /* + * REINT_SYNC RPC has already tried to instantiate all of the + * components involved, obviously it didn't succeed. Skip this + * mirror for now. 
The server won't be able to figure out + * which mirror it should instantiate components + */ + CERROR(DFID": trying to instantiate components for designated " + "I/O, file state: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj)); + + io->ci_need_write_intent = 0; + GOTO(out, result = -EIO); + } + + if (io->ci_need_write_intent) + GOTO(out, result = 1); + + EXIT; + +out: + return result; +} + +static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *lov = cl2lov(ios->cis_obj); + + ENTRY; + + LASSERT(list_empty(&lio->lis_active)); + + while (!list_empty(&lio->lis_subios)) { + struct lov_io_sub *sub = list_entry(lio->lis_subios.next, + struct lov_io_sub, + sub_list); + + list_del_init(&sub->sub_list); + lio->lis_nr_subios--; + + lov_io_sub_fini(env, lio, sub); + lov_sub_free(lio, sub); + } + LASSERT(lio->lis_nr_subios == 0); + + LASSERT(atomic_read(&lov->lo_active_ios) > 0); + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); + EXIT; +} + +static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, + loff_t start, loff_t end) +{ + struct cl_io *io = &sub->sub_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *parent = lio->lis_cl.cis_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + + switch (io->ci_type) { + case CIT_SETATTR: { + io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; + io->u.ci_setattr.sa_attr_flags = + parent->u.ci_setattr.sa_attr_flags; + io->u.ci_setattr.sa_avalid = parent->u.ci_setattr.sa_avalid; + io->u.ci_setattr.sa_xvalid = parent->u.ci_setattr.sa_xvalid; + io->u.ci_setattr.sa_stripe_index = stripe; + io->u.ci_setattr.sa_parent_fid = + parent->u.ci_setattr.sa_parent_fid; + if (cl_io_is_trunc(io)) { + loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; + + new_size = lov_size_to_stripe(lsm, index, new_size, + stripe); + io->u.ci_setattr.sa_attr.lvb_size = new_size; + } + lov_lsm2layout(lsm, lsm->lsm_entries[index], + &io->u.ci_setattr.sa_layout); + break; + } + case CIT_DATA_VERSION: { + io->u.ci_data_version.dv_data_version = 0; + io->u.ci_data_version.dv_flags = + parent->u.ci_data_version.dv_flags; + break; + } + case CIT_FAULT: { + struct cl_object *obj = parent->ci_obj; + loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index); + + io->u.ci_fault = parent->u.ci_fault; + off = lov_size_to_stripe(lsm, index, off, stripe); + io->u.ci_fault.ft_index = cl_index(obj, off); + break; + } + case CIT_FSYNC: { + io->u.ci_fsync.fi_start = start; + io->u.ci_fsync.fi_end = end; + io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid; + io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode; + break; + } + case CIT_READ: + case CIT_WRITE: { + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors; + if (cl_io_is_append(parent)) { + io->u.ci_wr.wr_append = 1; + } else { + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; + } + break; + } + case CIT_LADVISE: { + io->u.ci_ladvise.li_start = start; + io->u.ci_ladvise.li_end = end; + io->u.ci_ladvise.li_fid = parent->u.ci_ladvise.li_fid; + io->u.ci_ladvise.li_advice = parent->u.ci_ladvise.li_advice; + io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags; + break; + } + case CIT_GLIMPSE: + case CIT_MISC: + default: + break; + } +} + +static loff_t lov_offset_mod(loff_t val, int delta) +{ + if (val != OBD_OBJECT_EOF) + val += 
delta; + return val; +} + +static int lov_io_add_sub(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub, u64 start, u64 end) +{ + int rc; + + end = lov_offset_mod(end, 1); + lov_io_sub_inherit(sub, lio, start, end); + rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); + if (rc != 0) { + cl_io_iter_fini(sub->sub_env, &sub->sub_io); + return rc; + } + + list_add_tail(&sub->sub_linkage, &lio->lis_active); + + return rc; +} +static int lov_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + struct lu_extent ext; + int index; + int rc = 0; + + ENTRY; + + ext.e_start = lio->lis_pos; + ext.e_end = lio->lis_endpos; + + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); + struct lov_layout_raid0 *r0 = &le->lle_raid0; + u64 start; + u64 end; + int stripe; + bool tested_trunc_stripe = false; + + r0->lo_trunc_stripeno = -1; + + CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", + index, lsm->lsm_entries[index]->lsme_flags); + if (!lsm_entry_inited(lsm, index)) { + /* + * Read from uninitialized components should return + * zero filled pages. + */ + continue; + } + + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR("I/O to invalid component: %d, mirror: %d\n", + index, lio->lis_mirror_index); + RETURN(-EIO); + } + + for (stripe = 0; stripe < r0->lo_nr; stripe++) { + if (!lov_stripe_intersects(lsm, index, stripe, + &ext, &start, &end)) + continue; + + if (unlikely(!r0->lo_sub[stripe])) { + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_WRITE || + ios->cis_io->ci_type == CIT_FAULT) + RETURN(-EIO); + + continue; + } + + if (cl_io_is_trunc(ios->cis_io) && + !tested_trunc_stripe) { + int prev; + u64 tr_start; + + prev = (stripe == 0) ? r0->lo_nr - 1 : + stripe - 1; + /** + * Only involving previous stripe if the + * truncate in this component is at the + * beginning of this stripe. + */ + tested_trunc_stripe = true; + if (ext.e_start < lsm->lsm_entries[index]-> + lsme_extent.e_start) { + /* need previous stripe involvement */ + r0->lo_trunc_stripeno = prev; + } else { + tr_start = ext.e_start; + tr_start = lov_do_div64(tr_start, + stripe_width(lsm, index)); + /* tr_start %= stripe_swidth */ + if (tr_start == stripe * lsm-> + lsm_entries[index]-> + lsme_stripe_size) + r0->lo_trunc_stripeno = prev; + } + } + + /* if the last stripe is the trunc stripeno */ + if (r0->lo_trunc_stripeno == stripe) + r0->lo_trunc_stripeno = -1; + + sub = lov_sub_get(env, lio, + lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + return PTR_ERR(sub); + + rc = lov_io_add_sub(env, lio, sub, start, end); + if (rc != 0) + break; + } + if (rc != 0) + break; + + if (r0->lo_trunc_stripeno != -1) { + stripe = r0->lo_trunc_stripeno; + if (unlikely(!r0->lo_sub[stripe])) { + r0->lo_trunc_stripeno = -1; + continue; + } + sub = lov_sub_get(env, lio, + lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + return PTR_ERR(sub); + + /** + * the prev sub could be used by another truncate, we'd + * skip it. LU-14128 happends when expand truncate + + * read get wrong kms. 
+ */ + if (!list_empty(&sub->sub_linkage)) { + r0->lo_trunc_stripeno = -1; + continue; + } + + (void)lov_stripe_intersects(lsm, index, stripe, &ext, + &start, &end); + rc = lov_io_add_sub(env, lio, sub, start, end); + if (rc != 0) + break; + + } + } + RETURN(rc); +} + +static int lov_io_rw_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + struct lov_stripe_md_entry *lse; + loff_t start = io->u.ci_rw.crw_pos; + loff_t next; + int index; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + if (cl_io_is_append(io)) + RETURN(lov_io_iter_init(env, ios)); + + index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos); + if (index < 0) { /* non-existing layout component */ + if (io->ci_type == CIT_READ) { + /* + * TODO: it needs to detect the next component and + * then set the next pos + */ + io->ci_continue = 0; + + RETURN(lov_io_iter_init(env, ios)); + } + + RETURN(-ENODATA); + } + + if (!lov_entry(lio->lis_object, index)->lle_valid && + !io->ci_designated_mirror) + RETURN(io->ci_type == CIT_READ ? -EAGAIN : -EIO); + + lse = lov_lse(lio->lis_object, index); + + next = MAX_LFS_FILESIZE; + if (lse->lsme_stripe_count > 1) { + unsigned long ssize = lse->lsme_stripe_size; + + lov_do_div64(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = MAX_LFS_FILESIZE; + } + + LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos, + lse->lsme_extent.e_start, lse->lsme_extent.e_end); + next = min_t(__u64, next, lse->lsme_extent.e_end); + next = min_t(loff_t, next, lio->lis_io_endpos); + + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + CDEBUG(D_VFSTRACE, + "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count); + + /* + * XXX The following call should be optimized: we know, that + * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. 
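+ * For example (illustrative numbers only): on a component with more than one + * stripe and a 1 MiB stripe size, a write starting at file offset 2.5 MiB is + * clipped above to the chunk [2.5 MiB, 3 MiB) (assuming the component and the + * I/O extend past 3 MiB), so crw_count becomes 512 KiB and ci_continue is set + * when the request extends past 'next'.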
+ */ + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + int index; + ENTRY; + + if (cl_io_is_trunc(io) && lio->lis_pos > 0) { + index = lov_io_layout_at(lio, lio->lis_pos - 1); + /* no entry found for such offset */ + if (index < 0) + RETURN(io->ci_result = -ENODATA); + } + + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_call(const struct lu_env *env, struct lov_io *lio, + int (*iofunc)(const struct lu_env *, struct cl_io *)) +{ + struct cl_io *parent = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + rc = iofunc(sub->sub_env, &sub->sub_io); + if (rc) + break; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io.ci_result; + } + RETURN(rc); +} + +static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); +} + +static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); +} + +static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) +{ + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); +} + +static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_iter_fini(env, io); + RETURN(0); +} + +static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_unlock(env, io); + RETURN(0); +} + +static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + int rc; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); +} + +static void +lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *parent = lio->lis_cl.cis_io; + struct cl_data_version_io *pdv = &parent->u.ci_data_version; + struct lov_io_sub *sub; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version; + + lov_io_end_wrapper(sub->sub_env, &sub->sub_io); + + pdv->dv_data_version += sdv->dv_data_version; + if (pdv->dv_layout_version > sdv->dv_layout_version) + pdv->dv_layout_version = sdv->dv_layout_version; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io.ci_result; + } + + EXIT; +} + +static void lov_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int rc; + + ENTRY; + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); + while (!list_empty(&lio->lis_active)) + list_del_init(lio->lis_active.next); + EXIT; +} + +static void lov_io_unlock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + int rc; + + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; +} + +static int lov_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *loo = lio->lis_object; + struct 
cl_object *obj = lov2cl(loo); + struct lov_layout_raid0 *r0; + struct lov_io_sub *sub; + loff_t offset; + loff_t suboff; + pgoff_t ra_end; + unsigned int pps; /* pages per stripe */ + int stripe; + int index; + int rc; + ENTRY; + + offset = cl_offset(obj, start); + index = lov_io_layout_at(lio, offset); + if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index)) + RETURN(-ENODATA); + + /* avoid readahead to expand to stale components */ + if (!lov_entry(loo, index)->lle_valid) + RETURN(-EIO); + + stripe = lov_stripe_number(loo->lo_lsm, index, offset); + + r0 = lov_r0(loo, index); + if (unlikely(!r0->lo_sub[stripe])) + RETURN(-EIO); + + sub = lov_sub_get(env, lio, lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + RETURN(PTR_ERR(sub)); + + lov_stripe_offset(loo->lo_lsm, index, offset, stripe, &suboff); + rc = cl_io_read_ahead(sub->sub_env, &sub->sub_io, + cl_index(lovsub2cl(r0->lo_sub[stripe]), suboff), + ra); + + CDEBUG(D_READA, DFID " cra_end = %lu, stripes = %d, rc = %d\n", + PFID(lu_object_fid(lov2lu(loo))), ra->cra_end, r0->lo_nr, rc); + if (rc != 0) + RETURN(rc); + + /** + * Adjust the stripe index by layout of comp. ra->cra_end is the + * maximum page index covered by an underlying DLM lock. + * This function converts cra_end from stripe level to file level, and + * make sure it's not beyond stripe and component boundary. + */ + + /* cra_end is stripe level, convert it into file level */ + ra_end = ra->cra_end; + if (ra_end != CL_PAGE_EOF) + ra->cra_end = lov_stripe_pgoff(loo->lo_lsm, index, + ra_end, stripe); + + /* boundary of current component */ + ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end); + if (ra_end != CL_PAGE_EOF && ra->cra_end >= ra_end) + ra->cra_end = ra_end - 1; + + if (r0->lo_nr == 1) /* single stripe file */ + RETURN(0); + + pps = lov_lse(loo, index)->lsme_stripe_size >> PAGE_SHIFT; + + CDEBUG(D_READA, DFID " max_index = %lu, pps = %u, index = %u, " + "stripe_size = %u, stripe no = %u, start index = %lu\n", + PFID(lu_object_fid(lov2lu(loo))), ra->cra_end, pps, index, + lov_lse(loo, index)->lsme_stripe_size, stripe, start); + + /* never exceed the end of the stripe */ + ra->cra_end = min_t(pgoff_t, + ra->cra_end, start + pps - start % pps - 1); + RETURN(0); +} + +/** + * lov implementation of cl_operations::cio_submit() method. It takes a list + * of pages in \a queue, splits it into per-stripe sub-lists, invokes + * cl_io_submit() on underlying devices to submit sub-lists, and then splices + * everything back. + * + * Major complication of this function is a need to handle memory cleansing: + * cl_io_submit() is called to write out pages as a part of VM memory + * reclamation, and hence it may not fail due to memory shortages (system + * dead-locks otherwise). To deal with this, some resources (sub-lists, + * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a + * not-memory cleansing context), and in case of memory shortage, these + * pre-allocated resources are used by lov_io_submit() under + * lov_device::ld_mutex mutex. 
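+ * For example, if the incoming queue holds pages on stripes 0, 1 and 0 again, + * the loop below peels off the per-stripe queue {0, 0}, submits it through + * cl_io_submit_rw() of the matching sub-io, then does the same for {1}; pages + * that were submitted end up in c2_qout and anything left over is spliced back + * into the original queue.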
+ */ +static int lov_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page_list *qin = &queue->c2_qin; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + struct cl_page_list *plist = &lov_env_info(env)->lti_plist; + struct cl_page *page; + struct cl_page *tmp; + int index; + int rc = 0; + ENTRY; + + cl_page_list_init(plist); + while (qin->pl_nr > 0) { + struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; + + page = cl_page_list_first(qin); + if (lov_page_is_empty(page)) { + cl_page_list_move(&queue->c2_qout, qin, page); + + /* + * it could only be mirror read to get here therefore + * the pages will be transient. We don't care about + * the return code of cl_page_prep() at all. + */ + (void) cl_page_prep(env, ios->cis_io, page, crt); + cl_page_completion(env, page, crt, 0); + continue; + } + + cl_2queue_init(cl2q); + cl_page_list_move(&cl2q->c2_qin, qin, page); + + index = lov_page_index(page); + cl_page_list_for_each_safe(page, tmp, qin) { + /* this page is not on this stripe */ + if (index != lov_page_index(page)) + continue; + + cl_page_list_move(&cl2q->c2_qin, qin, page); + } + + sub = lov_sub_get(env, lio, index); + if (!IS_ERR(sub)) { + rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, + crt, cl2q); + } else { + rc = PTR_ERR(sub); + } + + cl_page_list_splice(&cl2q->c2_qin, plist); + cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); + cl_2queue_fini(env, cl2q); + + if (rc != 0) + break; + } + + cl_page_list_splice(plist, qin); + cl_page_list_fini(env, plist); + + RETURN(rc); +} + +static int lov_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) +{ + struct cl_page_list *plist = &lov_env_info(env)->lti_plist; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + struct cl_page *page; + int rc = 0; + ENTRY; + + if (lio->lis_nr_subios == 1) { + int idx = lio->lis_single_subio_index; + + LASSERT(!lov_page_is_empty(cl_page_list_first(queue))); + + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub == &lio->lis_single_subio); + rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, queue, + from, to, cb); + RETURN(rc); + } + + cl_page_list_init(plist); + while (queue->pl_nr > 0) { + int stripe_to = to; + int index; + + LASSERT(plist->pl_nr == 0); + page = cl_page_list_first(queue); + LASSERT(!lov_page_is_empty(page)); + + cl_page_list_move(plist, queue, page); + + index = lov_page_index(page); + while (queue->pl_nr > 0) { + page = cl_page_list_first(queue); + if (index != lov_page_index(page)) + break; + + cl_page_list_move(plist, queue, page); + } + + if (queue->pl_nr > 0) /* still has more pages */ + stripe_to = PAGE_SIZE; + + sub = lov_sub_get(env, lio, index); + if (!IS_ERR(sub)) { + rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, + plist, from, stripe_to, cb); + } else { + rc = PTR_ERR(sub); + break; + } + + if (plist->pl_nr > 0) /* short write */ + break; + + from = 0; + } + + /* for error case, add the page back into the qin list */ + LASSERT(ergo(rc == 0, plist->pl_nr == 0)); + while (plist->pl_nr > 0) { + /* error occurred, add the uncommitted pages back into queue */ + page = cl_page_list_last(plist); + cl_page_list_move_head(queue, plist, page); + } + + RETURN(rc); +} + +static int lov_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_fault_io *fio; + struct lov_io *lio; + struct lov_io_sub *sub; + + 
ENTRY; + + fio = &ios->cis_io->u.ci_fault; + lio = cl2lov_io(env, ios); + sub = lov_sub_get(env, lio, lov_page_index(fio->ft_page)); + sub->sub_io.u.ci_fault.ft_nob = fio->ft_nob; + + RETURN(lov_io_start(env, ios)); +} + +static void lov_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written; + ENTRY; + + *written = 0; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = &sub->sub_io; + + lov_io_end_wrapper(sub->sub_env, subio); + + if (subio->ci_result == 0) + *written += subio->u.ci_fsync.fi_nr_written; + } + RETURN_EXIT; +} + +static const struct cl_io_operations lov_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_SETATTR] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_setattr_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_DATA_VERSION] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, + [CIT_FSYNC] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_fsync_end + }, + [CIT_LADVISE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_GLIMPSE] = { + .cio_fini = lov_io_fini, + }, + [CIT_MISC] = { + .cio_fini = lov_io_fini + } + }, + .cio_read_ahead = lov_io_read_ahead, + .cio_submit = lov_io_submit, + .cio_commit_async = lov_io_commit_async, +}; + +/***************************************************************************** + * + * Empty lov io operations. + * + */ + +static void lov_empty_io_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_object *lov = cl2lov(ios->cis_obj); + ENTRY; + + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); + EXIT; +} + +static int lov_empty_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + return -EBADF; +} + +static void lov_empty_impossible(const struct lu_env *env, + struct cl_io_slice *ios) +{ + LBUG(); +} + +#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) + +/** + * An io operation vector for files without stripes. 
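+ * Only cio_fini (plus a stub cio_submit) is provided; the LOV_EMPTY_IMPOSSIBLE + * entries below mark callbacks that must never be reached for a file without + * stripes and trip an LBUG() if they are.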
+ */ +static const struct cl_io_operations lov_empty_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, +#if 0 + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE +#endif + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FSYNC] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_LADVISE] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_GLIMPSE] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_MISC] = { + .cio_fini = lov_empty_io_fini + } + }, + .cio_submit = lov_empty_io_submit, + .cio_commit_async = LOV_EMPTY_IMPOSSIBLE +}; + +int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + int result; + + ENTRY; + + INIT_LIST_HEAD(&lio->lis_active); + result = lov_io_slice_init(lio, lov, io); + if (result) + GOTO(out, result); + + result = lov_io_subio_init(env, lio, io); + if (!result) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); + } + EXIT; +out: + io->ci_result = result < 0 ? result : 0; + return result; +} + +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + lio->lis_object = lov; + switch (io->ci_type) { + default: + LBUG(); + case CIT_MISC: + case CIT_GLIMPSE: + case CIT_READ: + result = 0; + break; + case CIT_FSYNC: + case CIT_LADVISE: + case CIT_SETATTR: + case CIT_DATA_VERSION: + result = +1; + break; + case CIT_WRITE: + result = -EBADF; + break; + case CIT_FAULT: + result = -EFAULT; + CERROR("Page fault on a file without stripes: "DFID"\n", + PFID(lu_object_fid(&obj->co_lu))); + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? 
result : 0; + RETURN(result); +} + +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + LASSERT(lov->lo_lsm != NULL); + lio->lis_object = lov; + + switch (io->ci_type) { + default: + LASSERTF(0, "invalid type %d\n", io->ci_type); + result = -EOPNOTSUPP; + break; + case CIT_GLIMPSE: + case CIT_MISC: + case CIT_FSYNC: + case CIT_LADVISE: + case CIT_DATA_VERSION: + result = 1; + break; + case CIT_SETATTR: + /* + * the truncate to 0 is managed by MDT: + * - in open, for open O_TRUNC + * - in setattr, for truncate + */ + /* the truncate is for size > 0 so triggers a restore */ + if (cl_io_is_trunc(io)) { + io->ci_restore_needed = 1; + result = -ENODATA; + } else + result = 1; + break; + case CIT_READ: + case CIT_WRITE: + case CIT_FAULT: + io->ci_restore_needed = 1; + result = -ENODATA; + break; + } + + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? result : 0; + RETURN(result); +} + +/** + * Return the index in composite:lo_entries by the file offset + */ +int lov_io_layout_at(struct lov_io *lio, __u64 offset) +{ + struct lov_object *lov = lio->lis_object; + struct lov_layout_composite *comp = &lov->u.composite; + int start_index = 0; + int end_index = comp->lo_entry_count - 1; + int i; + + LASSERT(lov->lo_type == LLT_COMP); + + /* This is actual file offset so nothing can cover eof. */ + if (offset == LUSTRE_EOF) + return -1; + + if (lov_is_flr(lov)) { + struct lov_mirror_entry *lre; + + LASSERT(lio->lis_mirror_index >= 0); + + lre = &comp->lo_mirrors[lio->lis_mirror_index]; + start_index = lre->lre_start; + end_index = lre->lre_end; + } + + for (i = start_index; i <= end_index; i++) { + struct lov_layout_entry *lle = lov_entry(lov, i); + + if ((offset >= lle->lle_extent->e_start && + offset < lle->lle_extent->e_end) || + (offset == OBD_OBJECT_EOF && + lle->lle_extent->e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c new file mode 100644 index 0000000000000..1b4a95876cc75 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c @@ -0,0 +1,383 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOV layer. 
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov lock operations. + * + */ + +static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, + const struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because the osc lock sometimes stores some control + * variables in the thread's IO information (now only lockless information). + * However, if the lock's host (object) is different from the object + * for the current IO, we have no way to get the subenv and subio because + * they are not initialized at all. As a temporary fix, in this case, + * we still borrow the parent's env to call sublock operations. + */ + if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { + subenv->lse_env = env; + subenv->lse_io = io; + } else { + sub = lov_sub_get(env, lio, lls->sub_index); + if (!IS_ERR(sub)) { + subenv->lse_env = sub->sub_env; + subenv->lse_io = &sub->sub_io; + } else { + subenv = (void *)sub; + } + } + return subenv; +} + +static int lov_sublock_init(const struct lu_env *env, + const struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + int result; + + ENTRY; + + subenv = lov_sublock_env_get(env, parent, lls); + if (!IS_ERR(subenv)) { + result = cl_lock_init(subenv->lse_env, &lls->sub_lock, + subenv->lse_io); + } else { + /* an error occurred */ + result = PTR_ERR(subenv); + } + RETURN(result); +} + +/** + * Creates sub-locks for a given lov_lock for the first time. + * + * Goes through all sub-objects of the top-object, and creates sub-locks on every + * sub-object intersecting with the top-lock extent. This is complicated by the + * fact that the top-lock (that is being created) can be accessed concurrently + * through already created sub-locks (possibly shared with other top-locks). + */ +static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, + const struct cl_io *io, + const struct cl_object *obj, + struct cl_lock *lock) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_lock *lovlck; + struct lu_extent ext; + loff_t start; + loff_t end; + int result = 0; + int i; + int index; + int nr; + + ENTRY; + + ext.e_start = cl_offset(obj, lock->cll_descr.cld_start); + if (lock->cll_descr.cld_end == CL_PAGE_EOF) + ext.e_end = OBD_OBJECT_EOF; + else + ext.e_end = cl_offset(obj, lock->cll_descr.cld_end + 1); + + nr = 0; + lov_foreach_io_layout(index, lov_env_io(env), &ext) { + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + for (i = 0; i < r0->lo_nr; i++) { + if (likely(r0->lo_sub[i])) {/* spare layout */ + if (lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end)) + nr++; + else if (cl_io_is_trunc(io) && + r0->lo_trunc_stripeno == i) + nr++; + } + } + } + /** + * An aggressive lock request (from cl_setattr_ost) which asks for + * an [eof, -1) lock could come across an uninstantiated layout extent, + * hence a 0 nr is possible.
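+ * In that case an empty lov_lock with lls_nr == 0 is still allocated below + * and the enqueue/cancel paths simply have no sub-locks to walk.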
+ */ + + OBD_ALLOC_LARGE(lovlck, offsetof(struct lov_lock, lls_sub[nr])); + if (!lovlck) + RETURN(ERR_PTR(-ENOMEM)); + + lovlck->lls_nr = nr; + nr = 0; + lov_foreach_io_layout(index, lov_env_io(env), &ext) { + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + for (i = 0; i < r0->lo_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[nr]; + struct cl_lock_descr *descr = &lls->sub_lock.cll_descr; + bool intersect = false; + + if (unlikely(!r0->lo_sub[i])) + continue; + + intersect = lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end); + if (intersect) + goto init_sublock; + + if (cl_io_is_trunc(io) && i == r0->lo_trunc_stripeno) + goto init_sublock; + + continue; + +init_sublock: + LASSERT(descr->cld_obj == NULL); + descr->cld_obj = lovsub2cl(r0->lo_sub[i]); + descr->cld_start = cl_index(descr->cld_obj, start); + descr->cld_end = cl_index(descr->cld_obj, end); + descr->cld_mode = lock->cll_descr.cld_mode; + descr->cld_gid = lock->cll_descr.cld_gid; + descr->cld_enq_flags = lock->cll_descr.cld_enq_flags; + + lls->sub_index = lov_comp_index(index, i); + + /* initialize sub lock */ + result = lov_sublock_init(env, lock, lls); + if (result < 0) + break; + + lls->sub_initialized = 1; + nr++; + } + } + LASSERT(ergo(result == 0, nr == lovlck->lls_nr)); + + if (result != 0) { + for (i = 0; i < nr; ++i) { + if (!lovlck->lls_sub[i].sub_initialized) + break; + + cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); + } + + OBD_FREE_LARGE(lovlck, + offsetof(struct lov_lock, lls_sub[nr])); + lovlck = ERR_PTR(result); + } + + RETURN(lovlck); +} + +static void lov_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lovlck; + int i; + + ENTRY; + lovlck = cl2lov_lock(slice); + for (i = 0; i < lovlck->lls_nr; ++i) { + LASSERT(!lovlck->lls_sub[i].sub_is_enqueued); + if (lovlck->lls_sub[i].sub_initialized) + cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); + } + OBD_FREE_LARGE(lovlck, + offsetof(struct lov_lock, lls_sub[lovlck->lls_nr])); + EXIT; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This + * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock + * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock + * state machines in the face of sub-locks sharing (by multiple top-locks), + * and concurrent sub-lock cancellations. 
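+ * If a sub-lock fails to enqueue, the loop below stops early; sub-locks that + * were already enqueued keep sub_is_enqueued set and are presumably cleaned up + * by lov_lock_cancel() when the caller cancels the top-lock.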
+ */ +static int lov_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, struct cl_sync_io *anchor) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + int rc = 0; + + ENTRY; + + for (i = 0; i < lovlck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct lov_sublock_env *subenv; + + subenv = lov_sublock_env_get(env, lock, lls); + if (IS_ERR(subenv)) { + rc = PTR_ERR(subenv); + break; + } + + rc = cl_lock_enqueue(subenv->lse_env, subenv->lse_io, + &lls->sub_lock, anchor); + if (rc != 0) + break; + + lls->sub_is_enqueued = 1; + } + RETURN(rc); +} + +static void lov_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + + ENTRY; + + for (i = 0; i < lovlck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct cl_lock *sublock = &lls->sub_lock; + struct lov_sublock_env *subenv; + + if (!lls->sub_is_enqueued) + continue; + + lls->sub_is_enqueued = 0; + subenv = lov_sublock_env_get(env, lock, lls); + if (!IS_ERR(subenv)) { + cl_lock_cancel(subenv->lse_env, sublock); + } else { + CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock, + "lov_lock_cancel fails with %ld.\n", + PTR_ERR(subenv)); + } + } +} + +static int lov_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + int i; + + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; + + sub = &lck->lls_sub[i]; + (*p)(env, cookie, " %d %x: ", i, sub->sub_is_enqueued); + cl_lock_print(env, cookie, p, &sub->sub_lock); + } + return 0; +} + +static const struct cl_lock_operations lov_lock_ops = { + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_cancel = lov_lock_cancel, + .clo_print = lov_lock_print +}; + +int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = 0; + + ENTRY; + lck = lov_lock_sub_init(env, io, obj, lock); + if (!IS_ERR(lck)) + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); + else + result = PTR_ERR(lck); + RETURN(result); +} + +static void lov_empty_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_empty_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + (*p)(env, cookie, "empty\n"); + return 0; +} + +/* XXX: more methods will be added later. 
*/ +static const struct cl_lock_operations lov_empty_lock_ops = { + .clo_fini = lov_empty_lock_fini, + .clo_print = lov_empty_lock_print +}; + +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = -ENOMEM; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); + if (lck) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); + result = 0; + } + RETURN(result); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c new file mode 100644 index 0000000000000..8a6ced24ff522 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c @@ -0,0 +1,109 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include "lov_internal.h" + +/** Merge the lock value block(&lvb) attributes and KMS from each of the + * stripes in a file into a single lvb. It is expected that the caller + * initializes the current atime, mtime, ctime to avoid regressing a more + * uptodate time on the local client. 
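+ * Note that kms and lvb_size are merged differently: *kms_place receives the + * maximum of each stripe's loi_kms converted to a file-level size, while + * lvb_size takes the larger of loi_kms and loi_lvb.lvb_size per stripe before + * the same conversion; blocks are summed and the m/a/ctime fields keep the + * newest value seen.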
+ */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, + struct ost_lvb *lvb, __u64 *kms_place) +{ + struct lov_stripe_md_entry *lse = lsm->lsm_entries[index]; + u64 size = 0; + u64 kms = 0; + u64 blocks = 0; + s64 current_mtime = lvb->lvb_mtime; + s64 current_atime = lvb->lvb_atime; + s64 current_ctime = lvb->lvb_ctime; + int i; + int rc = 0; + + assert_spin_locked(&lsm->lsm_lock); + LASSERT(lsm->lsm_lock_owner == current_pid()); + + CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s=%llu m=%llu" + " a=%llu c=%llu b=%llu\n", POSTID(&lsm->lsm_oi), + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, + lvb->lvb_blocks); + for (i = 0; i < lse->lsme_stripe_count; i++) { + struct lov_oinfo *loi = lse->lsme_oinfo[i]; + u64 lov_size; + u64 tmpsize; + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) { + rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + continue; + } + + tmpsize = loi->loi_kms; + lov_size = lov_stripe_size(lsm, index, tmpsize, i); + if (lov_size > kms) + kms = lov_size; + + if (loi->loi_lvb.lvb_size > tmpsize) + tmpsize = loi->loi_lvb.lvb_size; + + lov_size = lov_stripe_size(lsm, index, tmpsize, i); + if (lov_size > size) + size = lov_size; + /* merge blocks, mtime, atime */ + blocks += loi->loi_lvb.lvb_blocks; + if (loi->loi_lvb.lvb_mtime > current_mtime) + current_mtime = loi->loi_lvb.lvb_mtime; + if (loi->loi_lvb.lvb_atime > current_atime) + current_atime = loi->loi_lvb.lvb_atime; + if (loi->loi_lvb.lvb_ctime > current_ctime) + current_ctime = loi->loi_lvb.lvb_ctime; + + CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s=%llu m=%llu" + " a=%llu c=%llu b=%llu\n", POSTID(&lsm->lsm_oi), + loi->loi_ost_idx, loi->loi_lvb.lvb_size, + loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime, + loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks); + } + + *kms_place = kms; + lvb->lvb_size = size; + lvb->lvb_blocks = blocks; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c new file mode 100644 index 0000000000000..b9c42313fe3ae --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c @@ -0,0 +1,1419 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_obd.c + * + * Author: Phil Schwan + * Author: Peter Braam + * Author: Mike Shaver + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lov_internal.h" + +/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. + Any function that expects lov_tgts to remain stationary must take a ref. */ +void lov_tgts_getref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + /* nobody gets through here until lov_putref is done */ + mutex_lock(&lov->lov_lock); + atomic_inc(&lov->lov_refcount); + mutex_unlock(&lov->lov_lock); + return; +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); + +void lov_tgts_putref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + mutex_lock(&lov->lov_lock); + /* ok to dec to 0 more than once -- ltd_exp's will be null */ + if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { + struct list_head kill = LIST_HEAD_INIT(kill); + struct lov_tgt_desc *tgt, *n; + int i; + + CDEBUG(D_CONFIG, "destroying %d lov targets\n", + lov->lov_death_row); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + if (!tgt || !tgt->ltd_reap) + continue; + list_add(&tgt->ltd_kill, &kill); + /* XXX - right now there is a dependency on ld_tgt_count + * being the maximum tgt index for computing the + * mds_max_easize. So we can't shrink it. */ + lov_ost_pool_remove(&lov->lov_packed, i); + lov->lov_tgts[i] = NULL; + lov->lov_death_row--; + } + mutex_unlock(&lov->lov_lock); + + list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { + list_del(&tgt->ltd_kill); + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { + mutex_unlock(&lov->lov_lock); + } +} + +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev); +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); + +int lov_connect_obd(struct obd_device *obd, u32 index, int activate, + struct obd_connect_data *data) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_uuid *tgt_uuid; + struct obd_device *tgt_obd; + static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; + struct obd_import *imp; + int rc; + ENTRY; + + if (lov->lov_tgts[index] == NULL) + RETURN(-EINVAL); + + tgt_uuid = &lov->lov_tgts[index]->ltd_uuid; + tgt_obd = lov->lov_tgts[index]->ltd_obd; + + if (!tgt_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); + RETURN(-EINVAL); + } + + /* override the sp_me from lov */ + tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me; + + if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX)) + data->ocd_index = index; + + /* + * Divine LOV knows that OBDs under it are OSCs. + */ + imp = tgt_obd->u.cli.cl_import; + + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. 
*/ + ptlrpc_activate_import(imp, false); + } + + rc = obd_register_observer(tgt_obd, obd); + if (rc) { + CERROR("Target %s register_observer error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(rc); + } + + + if (imp->imp_invalid) { + CDEBUG(D_CONFIG, "not connecting OSC %s; administratively " + "disabled\n", obd_uuid2str(tgt_uuid)); + RETURN(0); + } + + rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, + &lov_osc_uuid, data, NULL); + if (rc || !lov->lov_tgts[index]->ltd_exp) { + CERROR("Target %s connect error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(-ENODEV); + } + + lov->lov_tgts[index]->ltd_reap = 0; + + CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, + obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in"); + + if (lov->lov_tgts_kobj) { + /* Even if we failed, that's ok */ + rc = sysfs_create_link(lov->lov_tgts_kobj, + &tgt_obd->obd_kset.kobj, + tgt_obd->obd_name); + if (rc) { + CERROR("%s: can't register LOV target /sys/fs/lustre/%s/%s/target_obds/%s : rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, + obd->obd_name, + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_name, + rc); + } + } + RETURN(0); +} + +static int lov_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct lustre_handle conn; + int i, rc; + ENTRY; + + CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + RETURN(rc); + + *exp = class_conn2export(&conn); + + /* Why should there ever be more than 1 connect? */ + lov->lov_connects++; + LASSERT(lov->lov_connects == 1); + + memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd)); + if (data) + lov->lov_ocd = *data; + + lov_tgts_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) + continue; + /* Flags will be lowest common denominator */ + rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd); + if (rc) { + CERROR("%s: lov connect tgt %d failed: %d\n", + obd->obd_name, i, rc); + continue; + } + /* connect to administrative disabled ost */ + if (!lov->lov_tgts[i]->ltd_exp) + continue; + + rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, + OBD_NOTIFY_CONNECT); + if (rc) { + CERROR("%s error sending notify %d\n", + obd->obd_name, rc); + } + } + + lov_tgts_putref(obd); + + RETURN(0); +} + +static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_device *osc_obd; + int rc; + ENTRY; + + osc_obd = class_exp2obd(tgt->ltd_exp); + CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", + obd->obd_name, osc_obd->obd_name); + + if (tgt->ltd_active) { + tgt->ltd_active = 0; + lov->desc.ld_active_tgt_count--; + tgt->ltd_exp->exp_obd->obd_inactive = 1; + } + + if (osc_obd) { + if (lov->lov_tgts_kobj) + sysfs_remove_link(lov->lov_tgts_kobj, + osc_obd->obd_name); + + /* Pass it on to our clients. + * XXX This should be an argument to disconnect, + * XXX not a back-door flag on the OBD. Ah well. 
+ */ + osc_obd->obd_force = obd->obd_force; + osc_obd->obd_fail = obd->obd_fail; + osc_obd->obd_no_recov = obd->obd_no_recov; + + if (lov->targets_proc_entry != NULL) + lprocfs_remove_proc_entry(osc_obd->obd_name, + lov->targets_proc_entry); + } + + obd_register_observer(osc_obd, NULL); + + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + rc = 0; + } + + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lov_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + u32 index; + int rc; + + ENTRY; + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? */ + CWARN("%s: unexpected disconnect #%d\n", + obd->obd_name, lov->lov_connects); + goto out; + } + + /* hold another ref so lov_del_obd() doesn't spin in putref each time */ + lov_tgts_getref(obd); + + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + if (lov->lov_tgts[index] && lov->lov_tgts[index]->ltd_exp) { + /* Disconnection is the last we know about an OBD */ + lov_del_target(obd, index, NULL, + lov->lov_tgts[index]->ltd_gen); + } + } + lov_tgts_putref(obd); + +out: + rc = class_disconnect(exp); /* bz 9811 */ + RETURN(rc); +} + +/* Error codes: + * + * -EINVAL : UUID can't be found in the LOV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD is the wrong type (!) + * any >= 0 : is log target index + */ +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + int index, activate, active; + ENTRY; + + CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", + lov, uuid->uuid, ev); + + lov_tgts_getref(obd); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + tgt = lov->lov_tgts[index]; + if (!tgt) + continue; + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (index == lov->desc.ld_tgt_count) + GOTO(out, index = -EINVAL); + + if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { + activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0; + + /* + * LU-642, initially inactive OSC could miss the obd_connect, + * we make up for it here. + */ + if (activate && !tgt->ltd_exp) { + int rc; + struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; + + rc = obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, + &lov_osc_uuid, &lov->lov_ocd, NULL); + if (rc || tgt->ltd_exp == NULL) + GOTO(out, index = rc); + rc = obd_set_info_async(NULL, tgt->ltd_exp, + sizeof(KEY_CACHE_SET), + KEY_CACHE_SET, + sizeof(struct cl_client_cache), + lov->lov_cache, NULL); + if (rc < 0) + GOTO(out, index = rc); + } + + if (lov->lov_tgts[index]->ltd_activate == activate) { + CDEBUG(D_INFO, "OSC %s already %sactivate!\n", + uuid->uuid, activate ? "" : "de"); + } else { + lov->lov_tgts[index]->ltd_activate = activate; + CDEBUG(D_CONFIG, "%sactivate OSC %s\n", + activate ? "" : "de", obd_uuid2str(uuid)); + } + + } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) { + active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_active == active) { + CDEBUG(D_INFO, "OSC %s already %sactive!\n", + uuid->uuid, active ? "" : "in"); + GOTO(out, index); + } else { + CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", + obd_uuid2str(uuid), active ? 
"" : "in"); + } + + lov->lov_tgts[index]->ltd_active = active; + if (active) { + lov->desc.ld_active_tgt_count++; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0; + } else { + lov->desc.ld_active_tgt_count--; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; + } + } else { + CERROR("%s: unknown event %d for uuid %s\n", obd->obd_name, + ev, uuid->uuid); + } + + if (tgt->ltd_exp) + CDEBUG(D_INFO, "%s: lov idx %d conn %llx\n", obd_uuid2str(uuid), + index, tgt->ltd_exp->exp_handle.h_cookie); + + out: + lov_tgts_putref(obd); + RETURN(index); +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) +{ + int rc = 0; + struct lov_obd *lov = &obd->u.lov; + ENTRY; + + down_read(&lov->lov_notify_lock); + if (!lov->lov_connects) + GOTO(out_notify_lock, rc = 0); + + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE || + ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("unexpected notification of %s %s\n", + watched->obd_type->typ_name, watched->obd_name); + GOTO(out_notify_lock, rc = -EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. + */ + rc = lov_set_osc_active(obd, uuid, ev); + if (rc < 0) { + CERROR("%s: event %d failed: rc = %d\n", obd->obd_name, + ev, rc); + GOTO(out_notify_lock, rc); + } + } + + /* Pass the notification up the chain. */ + rc = obd_notify_observer(obd, watched, ev); + +out_notify_lock: + up_read(&lov->lov_notify_lock); + + RETURN(rc); +} + +static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + u32 index, int gen, int active) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; + + ENTRY; + CDEBUG(D_CONFIG, "uuid:%s idx:%u gen:%d active:%d\n", + uuidp->uuid, index, gen, active); + + if (gen <= 0) { + CERROR("%s: request to add '%s' with invalid generation: %d\n", + obd->obd_name, uuidp->uuid, gen); + RETURN(-EINVAL); + } + + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, &obd->obd_uuid); + if (tgt_obd == NULL) + RETURN(-EINVAL); + + mutex_lock(&lov->lov_lock); + + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + rc = -EEXIST; + CERROR("%s: UUID %s already assigned at index %d: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, rc); + mutex_unlock(&lov->lov_lock); + RETURN(rc); + } + + if (index >= lov->lov_tgt_size) { + /* We need to reallocate the lov target array. 
*/ + struct lov_tgt_desc **newtgts, **old = NULL; + __u32 newsize, oldsize = 0; + + newsize = max(lov->lov_tgt_size, (__u32)2); + while (newsize < index + 1) + newsize = newsize << 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + if (lov->lov_tgt_size) { + memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) * + lov->lov_tgt_size); + old = lov->lov_tgts; + oldsize = lov->lov_tgt_size; + } + + lov->lov_tgts = newtgts; + lov->lov_tgt_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", + lov->lov_tgts, lov->lov_tgt_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); + if (rc) { + mutex_unlock(&lov->lov_lock); + OBD_FREE_PTR(tgt); + RETURN(rc); + } + + tgt->ltd_uuid = *uuidp; + tgt->ltd_obd = tgt_obd; + /* XXX - add a sanity check on the generation number. */ + tgt->ltd_gen = gen; + tgt->ltd_index = index; + tgt->ltd_activate = active; + lov->lov_tgts[index] = tgt; + if (index >= lov->desc.ld_tgt_count) + lov->desc.ld_tgt_count = index + 1; + + mutex_unlock(&lov->lov_lock); + + CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", + index, tgt->ltd_gen, lov->desc.ld_tgt_count); + + if (lov->lov_connects == 0) { + /* lov_connect hasn't been called yet. We'll do the + lov_connect_obd on this target when that fn first runs, + because we don't know the connect flags yet. */ + RETURN(0); + } + + lov_tgts_getref(obd); + + rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); + if (rc) + GOTO(out, rc); + + /* connect to administrative disabled ost */ + if (!tgt->ltd_exp) + GOTO(out, rc = 0); + + if (lov->lov_cache != NULL) { + rc = obd_set_info_async(NULL, tgt->ltd_exp, + sizeof(KEY_CACHE_SET), KEY_CACHE_SET, + sizeof(struct cl_client_cache), lov->lov_cache, + NULL); + if (rc < 0) + GOTO(out, rc); + } + + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, + active ? 
OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE); + +out: + if (rc) { + CERROR("%s: add failed, deleting %s: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), rc); + lov_del_target(obd, index, NULL, 0); + } + lov_tgts_putref(obd); + RETURN(rc); +} + +/* Schedule a target for deletion */ +int lov_del_target(struct obd_device *obd, u32 index, + struct obd_uuid *uuidp, int gen) +{ + struct lov_obd *lov = &obd->u.lov; + int count = lov->desc.ld_tgt_count; + int rc = 0; + ENTRY; + + if (index >= count) { + CERROR("LOV target index %d >= number of LOV OBDs %d.\n", + index, count); + RETURN(-EINVAL); + } + + /* to make sure there's no ongoing lov_notify() now */ + down_write(&lov->lov_notify_lock); + lov_tgts_getref(obd); + + if (!lov->lov_tgts[index]) { + CERROR("LOV target at index %d is not setup.\n", index); + GOTO(out, rc = -EINVAL); + } + + if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) { + CERROR("LOV target UUID %s at index %d doesn't match %s.\n", + lov_uuid2str(lov, index), index, + obd_uuid2str(uuidp)); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", + lov_uuid2str(lov, index), index, + lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp, + lov->lov_tgts[index]->ltd_active); + + lov->lov_tgts[index]->ltd_reap = 1; + lov->lov_death_row++; + /* we really delete it from lov_tgts_putref() */ +out: + lov_tgts_putref(obd); + up_write(&lov->lov_notify_lock); + + RETURN(rc); +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct obd_device *osc_obd; + + LASSERT(tgt); + LASSERT(tgt->ltd_reap); + + osc_obd = class_exp2obd(tgt->ltd_exp); + + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", + tgt->ltd_uuid.uuid, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_exp) + lov_disconnect_obd(obd, tgt); + + OBD_FREE_PTR(tgt); + + /* Manual cleanup - no cleanup logs to clean up the osc's. We must + do it ourselves. And we can't do it from lov_cleanup, + because we just lost our only reference to it. 
*/ + if (osc_obd) + class_manual_cleanup(osc_obd); +} + +void lov_fix_desc_stripe_size(__u64 *val) +{ + if (*val < LOV_MIN_STRIPE_SIZE) { + if (*val != 0) + LCONSOLE_INFO("Increasing default stripe size to " + "minimum %u\n", + LOV_DESC_STRIPE_SIZE_DEFAULT); + *val = LOV_DESC_STRIPE_SIZE_DEFAULT; + } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { + *val &= ~(LOV_MIN_STRIPE_SIZE - 1); + LCONSOLE_WARN("Changing default stripe size to %llu (a " + "multiple of %u)\n", + *val, LOV_MIN_STRIPE_SIZE); + } +} + +void lov_fix_desc_stripe_count(__u32 *val) +{ + if (*val == 0) + *val = 1; +} + +void lov_fix_desc_pattern(__u32 *val) +{ + /* from lov_setstripe */ + if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) { + LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lov_fix_desc_qos_maxage(__u32 *val) +{ + if (*val == 0) + *val = LOV_DESC_QOS_MAXAGE_DEFAULT; +} + +void lov_fix_desc(struct lov_desc *desc) +{ + lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); + lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lov_fix_desc_pattern(&desc->ld_pattern); + lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lov_desc *desc; + struct lov_obd *lov = &obd->u.lov; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LOV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); + + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + if (desc->ld_magic != LOV_DESC_MAGIC) { + if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { + CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", + obd->obd_name, desc); + lustre_swab_lov_desc(desc); + } else { + CERROR("%s: Bad lov desc magic: %#x\n", + obd->obd_name, desc->ld_magic); + RETURN(-EINVAL); + } + } + + lov_fix_desc(desc); + + desc->ld_active_tgt_count = 0; + lov->desc = *desc; + lov->lov_tgt_size = 0; + + mutex_init(&lov->lov_lock); + atomic_set(&lov->lov_refcount, 0); + lov->lov_sp_me = LUSTRE_SP_CLI; + + init_rwsem(&lov->lov_notify_lock); + + lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS, + HASH_POOLS_MAX_BITS, + HASH_POOLS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &pool_hash_operations, + CFS_HASH_DEFAULT); + INIT_LIST_HEAD(&lov->lov_pool_list); + lov->lov_pool_count = 0; + rc = lov_ost_pool_init(&lov->lov_packed, 0); + if (rc) + GOTO(out, rc); + + rc = lov_tunables_init(obd); + if (rc) + GOTO(out, rc); + + lov->lov_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); + +out: + return rc; +} + +static int lov_cleanup(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + struct list_head *pos, *tmp; + struct pool_desc *pool; + ENTRY; + + if (lov->lov_tgts_kobj) { + kobject_put(lov->lov_tgts_kobj); + lov->lov_tgts_kobj = NULL; + } + + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { + pool = list_entry(pos, struct pool_desc, pool_list); + /* free pool structs */ + CDEBUG(D_INFO, "delete pool %p\n", pool); + /* In the function below, .hs_keycmp resolves to + * pool_hashkey_keycmp() */ + /* coverity[overrun-buffer-val] */ + lov_pool_del(obd, pool->pool_name); + } + cfs_hash_putref(lov->lov_pools_hash_body); + lov_ost_pool_free(&lov->lov_packed); + + lprocfs_obd_cleanup(obd); + if (lov->lov_tgts) { + int i; + lov_tgts_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if 
(!lov->lov_tgts[i]) + continue; + + /* Inactive targets may never have connected */ + if (lov->lov_tgts[i]->ltd_active) + /* We should never get here - these + * should have been removed in the + * disconnect. */ + CERROR("%s: lov tgt %d not cleaned! " + "deathrow=%d, lovrc=%d\n", + obd->obd_name, i, lov->lov_death_row, + atomic_read(&lov->lov_refcount)); + lov_del_target(obd, i, NULL, 0); + } + lov_tgts_putref(obd); + OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * + lov->lov_tgt_size); + lov->lov_tgt_size = 0; + } + + if (lov->lov_cache != NULL) { + cl_cache_decref(lov->lov_cache); + lov->lov_cache = NULL; + } + + RETURN(0); +} + +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + u32 *indexp, int *genp) +{ + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; + + ENTRY; + switch (cmd = lcfg->lcfg_command) { + case LCFG_ADD_MDC: + case LCFG_DEL_MDC: + break; + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + u32 index; + int gen; + + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(lcfg, 2), 10, indexp); + if (rc) + GOTO(out, rc); + rc = kstrtoint(lustre_cfg_buf(lcfg, 3), 10, genp); + if (rc) + GOTO(out, rc); + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + + GOTO(out, rc); + } + case LCFG_PARAM: { + struct lov_desc *desc = &(obd->u.lov.desc); + ssize_t count; + + if (!desc) + GOTO(out, rc = -EINVAL); + + count = class_modify_config(lcfg, PARAM_LOV, + &obd->obd_kset.kobj); + GOTO(out, rc = count < 0 ? 
count : 0); + } + case LCFG_POOL_NEW: + case LCFG_POOL_ADD: + case LCFG_POOL_DEL: + case LCFG_POOL_REM: + GOTO(out, rc); + + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static int lov_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct obd_info oinfo = { + .oi_osfs = osfs, + .oi_flags = flags, + }; + struct ptlrpc_request_set *rqset; + struct lov_request_set *set = NULL; + struct lov_request *req; + int rc = 0; + int rc2; + + ENTRY; + + rqset = ptlrpc_prep_set(); + if (rqset == NULL) + RETURN(-ENOMEM); + + rc = lov_prep_statfs_set(obd, &oinfo, &set); + if (rc < 0) + GOTO(out_rqset, rc); + + list_for_each_entry(req, &set->set_list, rq_link) { + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc < 0) + GOTO(out_set, rc); + } + + rc = ptlrpc_set_wait(env, rqset); + +out_set: + if (rc < 0) + atomic_set(&set->set_completes, 0); + + rc2 = lov_fini_statfs_set(set); + if (rc == 0) + rc = rc2; + +out_rqset: + ptlrpc_set_destroy(rqset); + + RETURN(rc); +} + +static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; + + ENTRY; + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + struct obd_import *imp; + __u32 index; + __u32 flags; + + memcpy(&index, data->ioc_inlbuf2, sizeof(index)); + if (index >= count) + RETURN(-ENODEV); + + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + RETURN(-EAGAIN); + + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + RETURN(-EINVAL); + + imp = osc_obd->u.cli.cl_import; + if (!lov->lov_tgts[index]->ltd_active && + imp->imp_state != LUSTRE_IMP_IDLE) + RETURN(-ENODATA); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), + min_t(unsigned long, data->ioc_plen2, + sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + memcpy(&flags, data->ioc_inlbuf1, sizeof(flags)); + flags = flags & LL_STATFS_NODELAY ? 
OBD_STATFS_NODELAY : 0; + + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + flags); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min_t(unsigned long, data->ioc_plen1, + sizeof(struct obd_statfs)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_LOV_GET_CONFIG: { + struct obd_ioctl_data *data; + struct lov_desc *desc; + char *buf = NULL; + __u32 *genp; + + len = 0; + if (obd_ioctl_getdata(&buf, &len, uarg)) + RETURN(-EINVAL); + + data = (struct obd_ioctl_data *)buf; + + if (sizeof(*desc) > data->ioc_inllen1) { + OBD_FREE_LARGE(buf, len); + RETURN(-EINVAL); + } + + if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) { + OBD_FREE_LARGE(buf, len); + RETURN(-EINVAL); + } + + if (sizeof(__u32) * count > data->ioc_inllen3) { + OBD_FREE_LARGE(buf, len); + RETURN(-EINVAL); + } + + desc = (struct lov_desc *)data->ioc_inlbuf1; + memcpy(desc, &(lov->desc), sizeof(*desc)); + + uuidp = (struct obd_uuid *)data->ioc_inlbuf2; + genp = (__u32 *)data->ioc_inlbuf3; + /* the uuid will be empty for deleted OSTs */ + for (i = 0; i < count; i++, uuidp++, genp++) { + if (!lov->lov_tgts[i]) + continue; + *uuidp = lov->lov_tgts[i]->ltd_uuid; + *genp = lov->lov_tgts[i]->ltd_gen; + } + + if (copy_to_user(uarg, buf, len)) + rc = -EFAULT; + OBD_FREE_LARGE(buf, len); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lov_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_OSTIDX) { + if (count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = lov->lov_tgts[qctl->qc_idx]; + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || + !obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_OSTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + default: { + int set = 0; + + if (count == 0) + RETURN(-ENOTTY); + + for (i = 0; i < count; i++) { + int err; + struct obd_device *osc_obd; + + /* OST was disconnected */ + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) + continue; + + /* ll_umount_begin() sets force on lov, pass to osc */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + osc_obd->obd_force = obd->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); + if (err) { + if (lov->lov_tgts[i]->ltd_active) { + CDEBUG(err == -ENOTTY ? 
+ D_IOCTL : D_WARNING, + "iocontrol OSC %s on OST " + "idx %d cmd %x: err = %d\n", + lov_uuid2str(lov, i), + i, cmd, err); + if (!rc) + rc = err; + } + } else { + set = 1; + } + } + if (!set && !rc) + rc = -EIO; + } + } + + RETURN(rc); +} + +static int lov_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + struct lov_desc *ld = &lov->desc; + int rc = 0; + ENTRY; + + if (vallen == NULL || val == NULL) + RETURN(-EFAULT); + + lov_tgts_getref(obddev); + + if (KEY_IS(KEY_MAX_EASIZE)) { + u32 max_stripe_count = min_t(u32, ld->ld_active_tgt_count, + LOV_MAX_STRIPE_COUNT); + + *((u32 *)val) = lov_mds_md_size(max_stripe_count, LOV_MAGIC_V3); + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + u32 def_stripe_count = min_t(u32, ld->ld_default_stripe_count, + LOV_MAX_STRIPE_COUNT); + + *((u32 *)val) = lov_mds_md_size(def_stripe_count, LOV_MAGIC_V3); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lov->desc.ld_tgt_count; + } else { + rc = -EINVAL; + } + + lov_tgts_putref(obddev); + + RETURN(rc); +} + +static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + struct lov_tgt_desc *tgt; + bool do_inactive = false, no_set = false; + u32 i; + int rc = 0; + int err; + + ENTRY; + + if (set == NULL) { + no_set = true; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + lov_tgts_getref(obddev); + + if (KEY_IS(KEY_CHECKSUM)) { + do_inactive = true; + } else if (KEY_IS(KEY_CACHE_SET)) { + LASSERT(lov->lov_cache == NULL); + lov->lov_cache = val; + do_inactive = true; + cl_cache_incref(lov->lov_cache); + } + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + /* OST was disconnected */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, + vallen, val, set); + + if (rc == 0) + rc = err; + } + + /* cycle through MDC target for Data-on-MDT */ + for (i = 0; i < LOV_MDC_TGT_MAX; i++) { + struct obd_device *mdc; + + mdc = lov->lov_mdc_tgts[i].lmtd_mdc; + if (mdc == NULL) + continue; + + err = obd_set_info_async(env, mdc->obd_self_export, + keylen, key, vallen, val, set); + if (rc == 0) + rc = err; + } + + lov_tgts_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(env, set); + if (rc == 0) + rc = err; + ptlrpc_set_destroy(set); + } + RETURN(rc); +} + +void lov_stripe_lock(struct lov_stripe_md *md) +__acquires(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner != current_pid()); + spin_lock(&md->lsm_lock); + LASSERT(md->lsm_lock_owner == 0); + md->lsm_lock_owner = current_pid(); +} + +void lov_stripe_unlock(struct lov_stripe_md *md) +__releases(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner == current_pid()); + md->lsm_lock_owner = 0; + spin_unlock(&md->lsm_lock); +} + +static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + __u64 curspace = 0; + __u64 bhardlimit = 0; + int i, rc = 0; + ENTRY; + + if (oqctl->qc_cmd != Q_GETOQUOTA && + oqctl->qc_cmd != LUSTRE_Q_SETQUOTA) { + CERROR("%s: bad quota opc %x for lov obd\n", + obd->obd_name, 
oqctl->qc_cmd); + RETURN(-EFAULT); + } + + /* for lov tgt */ + lov_tgts_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + tgt = lov->lov_tgts[i]; + + if (!tgt) + continue; + + if (!tgt->ltd_active || tgt->ltd_reap) { + if (oqctl->qc_cmd == Q_GETOQUOTA && + lov->lov_tgts[i]->ltd_activate) { + rc = -ENETDOWN; + CERROR("ost %d is inactive\n", i); + } else { + CDEBUG(D_HA, "ost %d is inactive\n", i); + } + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + if (tgt->ltd_active && !rc) + rc = err; + continue; + } + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + curspace += oqctl->qc_dqblk.dqb_curspace; + bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; + } + } + lov_tgts_putref(obd); + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit; + } + RETURN(rc); +} + +static struct obd_ops lov_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lov_setup, + .o_cleanup = lov_cleanup, + .o_connect = lov_connect, + .o_disconnect = lov_disconnect, + .o_statfs = lov_statfs, + .o_iocontrol = lov_iocontrol, + .o_get_info = lov_get_info, + .o_set_info_async = lov_set_info_async, + .o_notify = lov_notify, + .o_pool_new = lov_pool_new, + .o_pool_rem = lov_pool_remove, + .o_pool_add = lov_pool_add, + .o_pool_del = lov_pool_del, + .o_quotactl = lov_quotactl, +}; + +struct kmem_cache *lov_oinfo_slab; + +static int __init lov_init(void) +{ + bool enable_proc = true; + struct obd_type *type; + int rc; + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches); + + rc = lu_kmem_init(lov_caches); + if (rc) + return rc; + + lov_oinfo_slab = kmem_cache_create("lov_oinfo", + sizeof(struct lov_oinfo), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (lov_oinfo_slab == NULL) { + lu_kmem_fini(lov_caches); + return -ENOMEM; + } + + type = class_search_type(LUSTRE_LOD_NAME); + if (type != NULL && type->typ_procsym != NULL) + enable_proc = false; + + rc = class_register_type(&lov_obd_ops, NULL, enable_proc, NULL, + LUSTRE_LOV_NAME, &lov_device_type); + + if (rc) { + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); + } + + RETURN(rc); +} + +static void __exit lov_exit(void) +{ + class_unregister_type(LUSTRE_LOV_NAME); + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Object Volume"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lov_init); +module_exit(lov_exit); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_object.c b/drivers/staging/lustrefsx/lustre/lov/lov_object.c new file mode 100644 index 0000000000000..590a2009a87ef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_object.c @@ -0,0 +1,2180 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include "lov_cl_internal.h" + +static inline struct lov_device *lov_object_dev(struct lov_object *obj) +{ + return lu2lov_dev(obj->lo_cl.co_lu.lo_dev); +} + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Layout operations. + * + */ + +struct lov_layout_operations { + int (*llo_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state); + int (*llo_delete)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_fini)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + int (*llo_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); + int (*llo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + int (*llo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + int (*llo_flush)(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); +}; + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); + +static void lov_lsm_put(struct lov_stripe_md *lsm) +{ + if (lsm != NULL) + lov_free_memmd(&lsm); +} + +/***************************************************************************** + * + * Lov object layout operations. 
+ * + */ + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) +{ + struct lu_object *o; + + ENTRY; + + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + RETURN(lu2cl(o)); +} + +static int lov_page_slice_fixup(struct lov_object *lov, + struct cl_object *stripe) +{ + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + struct cl_object *o; + + if (stripe == NULL) + return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - + cfs_size_round(sizeof(struct lov_page)); + + cl_object_for_each(o, stripe) + o->co_slice_off += hdr->coh_page_bufsize; + + return cl_object_header(stripe)->coh_page_bufsize; +} + +static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, + struct cl_object *subobj, struct lov_oinfo *oinfo, + int idx) +{ + struct cl_object_header *hdr; + struct cl_object_header *subhdr; + struct cl_object_header *parent; + int entry = lov_comp_entry(idx); + int stripe = lov_comp_stripe(idx); + int result; + + if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) { + /* For sanity:test_206. + * Do not leave the object in cache to avoid accessing + * freed memory. This is because osc_object is referring to + * lov_oinfo of lsm_stripe_data which will be freed due to + * this failure. */ + cl_object_kill(env, subobj); + cl_object_put(env, subobj); + return -EIO; + } + + hdr = cl_object_header(lov2cl(lov)); + subhdr = cl_object_header(subobj); + + CDEBUG(D_INODE, DFID"@%p[%d:%d] -> "DFID"@%p: ostid: "DOSTID + " ost idx: %d gen: %d\n", + PFID(lu_object_fid(&subobj->co_lu)), subhdr, entry, stripe, + PFID(lu_object_fid(lov2lu(lov))), hdr, POSTID(&oinfo->loi_oi), + oinfo->loi_ost_idx, oinfo->loi_ost_gen); + + /* reuse ->coh_attr_guard to protect coh_parent change */ + spin_lock(&subhdr->coh_attr_guard); + parent = subhdr->coh_parent; + if (parent == NULL) { + struct lovsub_object *lso = cl2lovsub(subobj); + + subhdr->coh_parent = hdr; + spin_unlock(&subhdr->coh_attr_guard); + subhdr->coh_nesting = hdr->coh_nesting + 1; + lu_object_ref_add(&subobj->co_lu, "lov-parent", lov); + lso->lso_super = lov; + lso->lso_index = idx; + result = 0; + } else { + struct lu_object *old_obj; + struct lov_object *old_lov; + unsigned int mask = D_INODE; + + spin_unlock(&subhdr->coh_attr_guard); + old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type); + LASSERT(old_obj != NULL); + old_lov = cl2lov(lu2cl(old_obj)); + if (old_lov->lo_layout_invalid) { + /* the object's layout has already changed but isn't + * refreshed */ + lu_object_unhash(env, &subobj->co_lu); + result = -EAGAIN; + } else { + mask = D_ERROR; + result = -EIO; + } + + LU_OBJECT_DEBUG(mask, env, &subobj->co_lu, + "stripe %d is already owned.", idx); + LU_OBJECT_DEBUG(mask, env, old_obj, "owned."); + LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n"); + cl_object_put(env, subobj); + } + return result; +} + +static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lu_fid *ofid = <i->lti_fid; + struct cl_object *stripe; + struct lov_stripe_md_entry *lse = lov_lse(lov, index); + int result; + int psz, sz; + int i; + + ENTRY; + + 
spin_lock_init(&r0->lo_sub_lock); + r0->lo_nr = lse->lsme_stripe_count; + r0->lo_trunc_stripeno = -1; + + OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0])); + if (r0->lo_sub == NULL) + GOTO(out, result = -ENOMEM); + + psz = 0; + result = 0; + memset(subconf, 0, sizeof(*subconf)); + + /* + * Create stripe cl_objects. + */ + for (i = 0; i < r0->lo_nr; ++i) { + struct cl_device *subdev; + struct lov_oinfo *oinfo = lse->lsme_oinfo[i]; + int ost_idx = oinfo->loi_ost_idx; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + result = ostid_to_fid(ofid, &oinfo->loi_oi, oinfo->loi_ost_idx); + if (result != 0) + GOTO(out, result); + + if (dev->ld_target[ost_idx] == NULL) { + CERROR("%s: OST %04x is not initialized\n", + lov2obd(dev->ld_lov)->obd_name, ost_idx); + GOTO(out, result = -EIO); + } + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + subconf->u.coc_oinfo = oinfo; + LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx); + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + stripe = lov_sub_find(env, subdev, ofid, subconf); + if (IS_ERR(stripe)) + GOTO(out, result = PTR_ERR(stripe)); + + result = lov_init_sub(env, lov, stripe, oinfo, + lov_comp_index(index, i)); + if (result == -EAGAIN) { /* try again */ + --i; + result = 0; + continue; + } + + if (result == 0) { + r0->lo_sub[i] = cl2lovsub(stripe); + + sz = lov_page_slice_fixup(lov, stripe); + LASSERT(ergo(psz > 0, psz == sz)); + psz = sz; + } + } + if (result == 0) + result = psz; +out: + RETURN(result); +} + +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lu_site *site; + wait_queue_head_t *wq; + wait_queue_entry_t *waiter; + + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + wq = lu_site_wq_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_free() */ + if (r0->lo_sub[idx] == los) { + waiter = &lov_env_info(env)->lti_waiter; + init_waitqueue_entry(waiter, current); + add_wait_queue(wq, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) { + /* this wait-queue is signaled at the end of + * lu_object_free(). */ + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock(&r0->lo_sub_lock); + if (r0->lo_sub[idx] == los) { + spin_unlock(&r0->lo_sub_lock); + schedule(); + } else { + spin_unlock(&r0->lo_sub_lock); + set_current_state(TASK_RUNNING); + break; + } + } + remove_wait_queue(wq, waiter); + } + LASSERT(r0->lo_sub[idx] == NULL); +} + +static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + ENTRY; + + if (r0->lo_sub != NULL) { + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_object_prune(env, &los->lso_cl); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. 
+ */ + lov_subobject_kill(env, lov, r0, los, i); + } + } + } + + EXIT; +} + +static void lov_fini_raid0(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + if (r0->lo_sub != NULL) { + OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + r0->lo_sub = NULL; + } +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lov_layout_entry *lle) +{ + const struct lov_layout_raid0 *r0 = &lle->lle_raid0; + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + struct cl_attr *attr = &r0->lo_attr; + __u64 kms = 0; + int result = 0; + + if (r0->lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + memset(lvb, 0, sizeof(*lvb)); + + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, index, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(attr, lvb); + attr->cat_kms = kms; + r0->lo_attr_valid = 1; + *lov_attr = attr; + } + + return result; +} + +static struct lov_comp_layout_entry_ops raid0_ops = { + .lco_init = lov_init_raid0, + .lco_fini = lov_fini_raid0, + .lco_getattr = lov_attr_get_raid0, +}; + +static int lov_attr_get_dom(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_dom *dom = &lle->lle_dom; + struct lov_oinfo *loi = dom->lo_loi; + struct cl_attr *attr = &dom->lo_dom_r0.lo_attr; + + if (dom->lo_dom_r0.lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + + cl_lvb2attr(attr, &loi->loi_lvb); + + /* DoM component size can be bigger than stripe size after + * client's setattr RPC, so do not count anything beyond + * component end. Alternatively, check that limit on server + * and do not allow size overflow there. */ + if (attr->cat_size > lle->lle_extent->e_end) + attr->cat_size = lle->lle_extent->e_end; + + attr->cat_kms = attr->cat_size; + + dom->lo_dom_r0.lo_attr_valid = 1; + *lov_attr = attr; + + return 0; +} + +/** + * Lookup FLD to get MDS index of the given DOM object FID. 
+ * + * \param[in] ld LOV device + * \param[in] fid FID to lookup + * \param[out] nr index in MDC array to return back + * + * \retval 0 and \a mds filled with MDS index if successful + * \retval negative value on error + */ +static int lov_fld_lookup(struct lov_device *ld, const struct lu_fid *fid, + __u32 *nr) +{ + __u32 mds_idx; + int i, rc; + + ENTRY; + + rc = fld_client_lookup(&ld->ld_lmv->u.lmv.lmv_fld, fid_seq(fid), + &mds_idx, LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("%s: error while looking for mds number. Seq %#llx" + ", err = %d\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), + fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + mds_idx, PFID(fid)); + + /* find proper MDC device in the array */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (ld->ld_md_tgts[i].ldm_mdc != NULL && + ld->ld_md_tgts[i].ldm_idx == mds_idx) + break; + } + + if (i == ld->ld_md_tgts_nr) { + CERROR("%s: cannot find corresponding MDC device for mds #%x " + "for fid="DFID"\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), + mds_idx, PFID(fid)); + rc = -EINVAL; + } else { + *nr = i; + } + RETURN(rc); +} + +/** + * Implementation of lov_comp_layout_entry_ops::lco_init for DOM object. + * + * Init the DOM object for the first time. It prepares also RAID0 entry + * for it to use in common methods with ordinary RAID0 layout entries. + * + * \param[in] env execution environment + * \param[in] dev LOV device + * \param[in] lov LOV object + * \param[in] index Composite layout entry index in LSM + * \param[in] lle Composite LOV layout entry + */ +static int lov_init_dom(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle) +{ + struct lov_thread_info *lti = lov_env_info(env); + struct lov_stripe_md_entry *lsme = lov_lse(lov, index); + struct cl_object *clo; + struct lu_object *o = lov2lu(lov); + const struct lu_fid *fid = lu_object_fid(o); + struct cl_device *mdcdev; + struct lov_oinfo *loi = NULL; + struct cl_object_conf *sconf = <i->lti_stripe_conf; + + int rc; + __u32 idx = 0; + + ENTRY; + + LASSERT(index == 0); + + /* find proper MDS device */ + rc = lov_fld_lookup(dev, fid, &idx); + if (rc) + RETURN(rc); + + LASSERTF(dev->ld_md_tgts[idx].ldm_mdc != NULL, + "LOV md target[%u] is NULL\n", idx); + + /* check lsm is DOM, more checks are needed */ + LASSERT(lsme->lsme_stripe_count == 0); + + /* + * Create lower cl_objects. + */ + mdcdev = dev->ld_md_tgts[idx].ldm_mdc; + + LASSERTF(mdcdev != NULL, "non-initialized mdc subdev\n"); + + /* DoM object has no oinfo in LSM entry, create it exclusively */ + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (loi == NULL) + RETURN(-ENOMEM); + + fid_to_ostid(lu_object_fid(lov2lu(lov)), &loi->loi_oi); + + sconf->u.coc_oinfo = loi; +again: + clo = lov_sub_find(env, mdcdev, fid, sconf); + if (IS_ERR(clo)) + GOTO(out, rc = PTR_ERR(clo)); + + rc = lov_init_sub(env, lov, clo, loi, lov_comp_index(index, 0)); + if (rc == -EAGAIN) /* try again */ + goto again; + else if (rc != 0) + GOTO(out, rc); + + lle->lle_dom.lo_dom = cl2lovsub(clo); + spin_lock_init(&lle->lle_dom.lo_dom_r0.lo_sub_lock); + lle->lle_dom.lo_dom_r0.lo_nr = 1; + lle->lle_dom.lo_dom_r0.lo_sub = &lle->lle_dom.lo_dom; + lle->lle_dom.lo_loi = loi; + + rc = lov_page_slice_fixup(lov, clo); + RETURN(rc); + +out: + if (loi != NULL) + OBD_SLAB_FREE_PTR(loi, lov_oinfo_slab); + return rc; +} + +/** + * Implementation of lov_layout_operations::llo_fini for DOM object. 
+ * + * Finish the DOM object and free related memory. + * + * \param[in] env execution environment + * \param[in] lov LOV object + * \param[in] state LOV layout state + */ +static void lov_fini_dom(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + if (lle->lle_dom.lo_dom != NULL) + lle->lle_dom.lo_dom = NULL; + if (lle->lle_dom.lo_loi != NULL) + OBD_SLAB_FREE_PTR(lle->lle_dom.lo_loi, lov_oinfo_slab); +} + +static struct lov_comp_layout_entry_ops dom_ops = { + .lco_init = lov_init_dom, + .lco_fini = lov_fini_dom, + .lco_getattr = lov_attr_get_dom, +}; + +static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + struct lov_layout_composite *comp = &state->composite; + struct lov_layout_entry *lle; + struct lov_mirror_entry *lre; + unsigned int entry_count; + unsigned int psz = 0; + unsigned int mirror_count; + int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK; + int result = 0; + unsigned int seq; + int i, j; + + ENTRY; + + LASSERT(lsm->lsm_entry_count > 0); + LASSERT(lov->lo_lsm == NULL); + lov->lo_lsm = lsm_addref(lsm); + lov->lo_layout_invalid = true; + + dump_lsm(D_INODE, lsm); + + entry_count = lsm->lsm_entry_count; + + spin_lock_init(&comp->lo_write_lock); + comp->lo_flags = lsm->lsm_flags; + comp->lo_mirror_count = lsm->lsm_mirror_count + 1; + comp->lo_entry_count = lsm->lsm_entry_count; + comp->lo_preferred_mirror = -1; + + if (equi(flr_state == LCM_FL_NONE, comp->lo_mirror_count > 1)) + RETURN(-EINVAL); + + OBD_ALLOC(comp->lo_mirrors, + comp->lo_mirror_count * sizeof(*comp->lo_mirrors)); + if (comp->lo_mirrors == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(comp->lo_entries, entry_count * sizeof(*comp->lo_entries)); + if (comp->lo_entries == NULL) + RETURN(-ENOMEM); + + /* Initiate all entry types and extents data at first */ + for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) { + int mirror_id = 0; + + lle = &comp->lo_entries[i]; + + lle->lle_lsme = lsm->lsm_entries[i]; + lle->lle_type = lov_entry_type(lle->lle_lsme); + switch (lle->lle_type) { + case LOV_PATTERN_RAID0: + lle->lle_comp_ops = &raid0_ops; + break; + case LOV_PATTERN_MDT: + lle->lle_comp_ops = &dom_ops; + break; + default: + CERROR("%s: unknown composite layout entry type %i\n", + lov2obd(dev->ld_lov)->obd_name, + lsm->lsm_entries[i]->lsme_pattern); + dump_lsm(D_ERROR, lsm); + RETURN(-EIO); + } + + lle->lle_extent = &lle->lle_lsme->lsme_extent; + lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE); + + if (flr_state != LCM_FL_NONE) + mirror_id = mirror_id_of(lle->lle_lsme->lsme_id); + + lre = &comp->lo_mirrors[j]; + if (i > 0) { + if (mirror_id == lre->lre_mirror_id) { + lre->lre_valid |= lle->lle_valid; + lre->lre_stale |= !lle->lle_valid; + lre->lre_end = i; + continue; + } + + /* new mirror detected, assume that the mirrors + * are shorted in layout */ + ++mirror_count; + ++j; + if (j >= comp->lo_mirror_count) + break; + + lre = &comp->lo_mirrors[j]; + } + + /* entries must be sorted by mirrors */ + lre->lre_mirror_id = mirror_id; + lre->lre_start = lre->lre_end = i; + lre->lre_preferred = !!(lle->lle_lsme->lsme_flags & + LCME_FL_PREF_RD); + lre->lre_valid = lle->lle_valid; + lre->lre_stale = !lle->lle_valid; + } + + /* sanity check for FLR */ + if (mirror_count != comp->lo_mirror_count) { + CDEBUG(D_INODE, DFID + " doesn't have the # of mirrors it claims, %u/%u\n", + PFID(lu_object_fid(lov2lu(lov))), mirror_count, + comp->lo_mirror_count + 
1); + + GOTO(out, result = -EINVAL); + } + + lov_foreach_layout_entry(lov, lle) { + int index = lov_layout_entry_index(lov, lle); + + /** + * If the component has not been init-ed on MDS side, for + * PFL layout, we'd know that the components beyond this one + * will be dynamically init-ed later on file write/trunc ops. + */ + if (!lsme_inited(lle->lle_lsme)) + continue; + + result = lle->lle_comp_ops->lco_init(env, dev, lov, index, + conf, lle); + if (result < 0) + break; + + LASSERT(ergo(psz > 0, psz == result)); + psz = result; + } + + if (psz > 0) + cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz; + + /* decide the preferred mirror. It uses the hash value of lov_object + * so that different clients would use different mirrors for read. */ + mirror_count = 0; + seq = hash_long((unsigned long)lov, 8); + for (i = 0; i < comp->lo_mirror_count; i++) { + unsigned int idx = (i + seq) % comp->lo_mirror_count; + + lre = lov_mirror_entry(lov, idx); + if (lre->lre_stale) + continue; + + mirror_count++; /* valid mirror */ + + if (lre->lre_preferred || comp->lo_preferred_mirror < 0) + comp->lo_preferred_mirror = idx; + } + if (!mirror_count) { + CDEBUG(D_INODE, DFID + " doesn't have any valid mirrors\n", + PFID(lu_object_fid(lov2lu(lov)))); + + comp->lo_preferred_mirror = 0; + } + + LASSERT(comp->lo_preferred_mirror >= 0); + + EXIT; +out: + return result > 0 ? 0 : result; +} + +static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + +static int lov_init_released(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + LASSERT(lsm != NULL); + LASSERT(lsm->lsm_is_released); + LASSERT(lov->lo_lsm == NULL); + + lov->lo_lsm = lsm_addref(lsm); + return 0; +} + +static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); + + lov_layout_wait(env, lov); + return 0; +} + +static int lov_delete_composite(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_entry *entry; + struct lov_layout_composite *comp = &state->composite; + + ENTRY; + + dump_lsm(D_INODE, lov->lo_lsm); + + lov_layout_wait(env, lov); + if (comp->lo_entries) + lov_foreach_layout_entry(lov, entry) + lov_delete_raid0(env, lov, entry); + + RETURN(0); +} + +static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); +} + +static void lov_fini_composite(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_composite *comp = &state->composite; + ENTRY; + + if (comp->lo_entries != NULL) { + struct lov_layout_entry *entry; + + lov_foreach_layout_entry(lov, entry) + entry->lle_comp_ops->lco_fini(env, entry); + + OBD_FREE(comp->lo_entries, + comp->lo_entry_count * sizeof(*comp->lo_entries)); + comp->lo_entries = NULL; + } + + if (comp->lo_mirrors != NULL) { + OBD_FREE(comp->lo_mirrors, + comp->lo_mirror_count * sizeof(*comp->lo_mirrors)); + comp->lo_mirrors = NULL; + } + + memset(comp, 0, sizeof(*comp)); + + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + + EXIT; +} + +static void 
lov_fini_released(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + ENTRY; + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + EXIT; +} + +static int lov_print_empty(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid); + return 0; +} + +static int lov_print_composite(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + (*p)(env, cookie, "entries: %d, %s, lsm{%p 0x%08X %d %u}:\n", + lsm->lsm_entry_count, + lov->lo_layout_invalid ? "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + struct lov_layout_entry *lle = lov_entry(lov, i); + + (*p)(env, cookie, + DEXT ": { 0x%08X, %u, %#x, %u, %#x, %u, %u }\n", + PEXT(&lse->lsme_extent), lse->lsme_magic, + lse->lsme_id, lse->lsme_pattern, lse->lsme_layout_gen, + lse->lsme_flags, lse->lsme_stripe_count, + lse->lsme_stripe_size); + lov_print_raid0(env, cookie, p, lle); + } + + return 0; +} + +static int lov_print_released(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + + (*p)(env, cookie, + "released: %s, lsm{%p 0x%08X %d %u}:\n", + lov->lo_layout_invalid ? "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + return 0; +} + +/** + * Implements cl_object_operations::coo_attr_get() method for an object + * without stripes (LLT_EMPTY layout type). + * + * The only attributes this layer is authoritative in this case is + * cl_attr::cat_blocks---it's 0. + */ +static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + attr->cat_blocks = 0; + return 0; +} + +static int lov_attr_get_composite(const struct lu_env *env, + struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *entry; + int result = 0; + + ENTRY; + + attr->cat_size = 0; + attr->cat_blocks = 0; + lov_foreach_layout_entry(lov, entry) { + struct cl_attr *lov_attr = NULL; + int index = lov_layout_entry_index(lov, entry); + + if (!entry->lle_valid) + continue; + + /* PFL: This component has not been init-ed. 
*/ + if (!lsm_entry_inited(lov->lo_lsm, index)) + continue; + + result = entry->lle_comp_ops->lco_getattr(env, lov, index, + entry, &lov_attr); + if (result < 0) + RETURN(result); + + if (lov_attr == NULL) + continue; + + CDEBUG(D_INODE, "COMP ID #%i: s=%llu m=%llu a=%llu c=%llu " + "b=%llu\n", index - 1, lov_attr->cat_size, + lov_attr->cat_mtime, lov_attr->cat_atime, + lov_attr->cat_ctime, lov_attr->cat_blocks); + + /* merge results */ + attr->cat_blocks += lov_attr->cat_blocks; + if (attr->cat_size < lov_attr->cat_size) + attr->cat_size = lov_attr->cat_size; + if (attr->cat_kms < lov_attr->cat_kms) + attr->cat_kms = lov_attr->cat_kms; + if (attr->cat_atime < lov_attr->cat_atime) + attr->cat_atime = lov_attr->cat_atime; + if (attr->cat_ctime < lov_attr->cat_ctime) + attr->cat_ctime = lov_attr->cat_ctime; + if (attr->cat_mtime < lov_attr->cat_mtime) + attr->cat_mtime = lov_attr->cat_mtime; + } + + RETURN(0); +} + +static int lov_flush_composite(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *lle; + int rc = -ENODATA; + + ENTRY; + + lov_foreach_layout_entry(lov, lle) { + if (!lsme_is_dom(lle->lle_lsme)) + continue; + rc = cl_object_flush(env, lovsub2cl(lle->lle_dom.lo_dom), lock); + break; + } + + RETURN(rc); +} + +static int lov_flush_empty(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return 0; +} + +const static struct lov_layout_operations lov_dispatch[] = { + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, + }, + [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, + .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, + }, + [LLT_COMP] = { + .llo_init = lov_init_composite, + .llo_delete = lov_delete_composite, + .llo_fini = lov_fini_composite, + .llo_print = lov_print_composite, + .llo_page_init = lov_page_init_composite, + .llo_lock_init = lov_lock_init_composite, + .llo_io_init = lov_io_init_composite, + .llo_getattr = lov_attr_get_composite, + .llo_flush = lov_flush_composite, + }, +}; + +/** + * Performs a double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH_NOLOCK(obj, op, ...) 
\ +({ \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + __llt = __obj->lo_type; \ + LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ +}) + +/** + * Return lov_layout_type associated with a given lsm + */ +static enum lov_layout_type lov_type(struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return LLT_EMPTY; + + if (lsm->lsm_is_released) + return LLT_RELEASED; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || + lsm->lsm_magic == LOV_MAGIC_V3 || + lsm->lsm_magic == LOV_MAGIC_COMP_V1) + return LLT_COMP; + + return LLT_EMPTY; +} + +static inline void lov_conf_freeze(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To take share lov(%p) owner %p/%p\n", + lov, lov->lo_owner, current); + if (lov->lo_owner != current) + down_read(&lov->lo_type_guard); +} + +static inline void lov_conf_thaw(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To release share lov(%p) owner %p/%p\n", + lov, lov->lo_owner, current); + if (lov->lo_owner != current) + up_read(&lov->lo_type_guard); +} + +#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + int __lock = !!(lock); \ + typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ + \ + if (__lock) \ + lov_conf_freeze(__obj); \ + __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ + if (__lock) \ + lov_conf_thaw(__obj); \ + __result; \ +}) + +/** + * Performs a locked double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH(obj, op, ...) \ + LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) + +#define LOV_2DISPATCH_VOID(obj, op, ...) \ +do { \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + lov_conf_freeze(__obj); \ + __llt = __obj->lo_type; \ + LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ + lov_conf_thaw(__obj); \ +} while (0) + +static void lov_conf_lock(struct lov_object *lov) +{ + LASSERT(lov->lo_owner != current); + down_write(&lov->lo_type_guard); + LASSERT(lov->lo_owner == NULL); + lov->lo_owner = current; + CDEBUG(D_INODE, "Took exclusive lov(%p) owner %p\n", + lov, lov->lo_owner); +} + +static void lov_conf_unlock(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To release exclusive lov(%p) owner %p\n", + lov, lov->lo_owner); + lov->lo_owner = NULL; + up_write(&lov->lo_type_guard); +} + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov) +{ + struct l_wait_info lwi = { 0 }; + ENTRY; + + while (atomic_read(&lov->lo_active_ios) > 0) { + CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n", + PFID(lu_object_fid(lov2lu(lov))), + atomic_read(&lov->lo_active_ios)); + + l_wait_event(lov->lo_waitq, + atomic_read(&lov->lo_active_ios) == 0, &lwi); + } + RETURN(0); +} + +static int lov_layout_change(const struct lu_env *unused, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf) +{ + enum lov_layout_type llt = lov_type(lsm); + union lov_layout_state *state = &lov->u; + const struct lov_layout_operations *old_ops; + const struct lov_layout_operations *new_ops; + struct lov_device *lov_dev = lov_object_dev(lov); + struct lu_env *env; + __u16 refcheck; + int rc; + ENTRY; + + LASSERT(lov->lo_type < ARRAY_SIZE(lov_dispatch)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + LASSERT(llt < ARRAY_SIZE(lov_dispatch)); + + CDEBUG(D_INODE, DFID" from %s to %s\n", + PFID(lu_object_fid(lov2lu(lov))), + llt2str(lov->lo_type), llt2str(llt)); + + old_ops = &lov_dispatch[lov->lo_type]; + new_ops = 
&lov_dispatch[llt]; + + rc = cl_object_prune(env, &lov->lo_cl); + if (rc != 0) + GOTO(out, rc); + + rc = old_ops->llo_delete(env, lov, &lov->u); + if (rc != 0) + GOTO(out, rc); + + old_ops->llo_fini(env, lov, &lov->u); + + LASSERT(atomic_read(&lov->lo_active_ios) == 0); + + CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n", + PFID(lu_object_fid(lov2lu(lov))), lov, llt); + + /* page bufsize fixup */ + cl_object_header(&lov->lo_cl)->coh_page_bufsize -= + lov_page_slice_fixup(lov, NULL); + + lov->lo_type = llt; + rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state); + if (rc != 0) { + struct obd_device *obd = lov2obd(lov_dev->ld_lov); + + CERROR("%s: cannot apply new layout on "DFID" : rc = %d\n", + obd->obd_name, PFID(lu_object_fid(lov2lu(lov))), rc); + new_ops->llo_delete(env, lov, state); + new_ops->llo_fini(env, lov, state); + /* this file becomes an EMPTY file. */ + lov->lo_type = LLT_EMPTY; + GOTO(out, rc); + } + +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/***************************************************************************** + * + * Lov object operations. + * + */ +int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lov_object *lov = lu2lov(obj); + struct lov_device *dev = lov_object_dev(lov); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + union lov_layout_state *set = &lov->u; + const struct lov_layout_operations *ops; + struct lov_stripe_md *lsm = NULL; + int rc; + ENTRY; + + init_rwsem(&lov->lo_type_guard); + atomic_set(&lov->lo_active_ios, 0); + init_waitqueue_head(&lov->lo_waitq); + cl_object_page_init(lu2cl(obj), sizeof(struct lov_page)); + + lov->lo_type = LLT_EMPTY; + if (cconf->u.coc_layout.lb_buf != NULL) { + lsm = lov_unpackmd(dev->ld_lov, + cconf->u.coc_layout.lb_buf, + cconf->u.coc_layout.lb_len); + if (IS_ERR(lsm)) + RETURN(PTR_ERR(lsm)); + + dump_lsm(D_INODE, lsm); + } + + /* no locking is necessary, as object is being created */ + lov->lo_type = lov_type(lsm); + ops = &lov_dispatch[lov->lo_type]; + rc = ops->llo_init(env, dev, lov, lsm, cconf, set); + if (rc != 0) + GOTO(out_lsm, rc); + +out_lsm: + lov_lsm_put(lsm); + + RETURN(rc); +} + +static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lov_stripe_md *lsm = NULL; + struct lov_object *lov = cl2lov(obj); + int result = 0; + ENTRY; + + if (conf->coc_opc == OBJECT_CONF_SET && + conf->u.coc_layout.lb_buf != NULL) { + lsm = lov_unpackmd(lov_object_dev(lov)->ld_lov, + conf->u.coc_layout.lb_buf, + conf->u.coc_layout.lb_len); + if (IS_ERR(lsm)) + RETURN(PTR_ERR(lsm)); + dump_lsm(D_INODE, lsm); + } + + lov_conf_lock(lov); + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + lov->lo_layout_invalid = true; + GOTO(out, result = 0); + } + + if (conf->coc_opc == OBJECT_CONF_WAIT) { + if (lov->lo_layout_invalid && + atomic_read(&lov->lo_active_ios) > 0) { + lov_conf_unlock(lov); + result = lov_layout_wait(env, lov); + lov_conf_lock(lov); + } + GOTO(out, result); + } + + LASSERT(conf->coc_opc == OBJECT_CONF_SET); + + if ((lsm == NULL && lov->lo_lsm == NULL) || + ((lsm != NULL && lov->lo_lsm != NULL) && + (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && + (lov->lo_lsm->lsm_entries[0]->lsme_pattern == + lsm->lsm_entries[0]->lsme_pattern))) { + /* same version of layout */ + lov->lo_layout_invalid = false; + GOTO(out, result = 0); + } + + /* will change layout - check if there still exists active IO. 
*/ + if (atomic_read(&lov->lo_active_ios) > 0) { + lov->lo_layout_invalid = true; + GOTO(out, result = -EBUSY); + } + + result = lov_layout_change(env, lov, lsm, conf); + lov->lo_layout_invalid = result != 0; + EXIT; + +out: + lov_conf_unlock(lov); + lov_lsm_put(lsm); + CDEBUG(D_INODE, DFID" lo_layout_invalid=%d\n", + PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid); + RETURN(result); +} + +static void lov_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u); + EXIT; +} + +static void lov_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u); + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(lov, lov_object_kmem); + EXIT; +} + +static int lov_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o); +} + +int lov_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_page_init, env, obj, page, + index); +} + +/** + * Implements cl_object_operations::clo_io_init() method for lov + * layer. Dispatches to the appropriate layout io initialization method. + */ +int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved); + + CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n", + PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type, + io->ci_ignore_layout, io->ci_verify_layout); + + /* IO type CIT_MISC with ci_ignore_layout set are usually invoked from + * the OSC layer. It shouldn't take lov layout conf lock in that case, + * because as long as the OSC object exists, the layout can't be + * reconfigured. */ + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init, + !(io->ci_ignore_layout && io->ci_type == CIT_MISC), + env, obj, io); +} + +/** + * An implementation of cl_object_operations::clo_attr_get() method for lov + * layer. For raid0 layout this collects and merges attributes of all + * sub-objects. + */ +static int lov_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + /* do not take lock, as this function is called under a + * spin-lock. Layout is protected from changing by ongoing IO. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr); +} + +static int lov_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + /* + * No dispatch is required here, as no layout implements this. + */ + return 0; +} + +int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + /* No need to lock because we've taken one refcount of layout. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock, + io); +} + +/** + * We calculate on which OST the mapping will end. If the length of mapping + * is greater than (stripe_size * stripe_count) then the last_stripe will + * will be one just before start_stripe. Else we check if the mapping + * intersects each OST and find last_stripe. 
+ * This function returns the last_stripe and also sets the stripe_count + * over which the mapping is spread + * + * \param lsm [in] striping information for the file + * \param index [in] stripe component index + * \param ext [in] logical extent of mapping + * \param start_stripe [in] starting stripe of the mapping + * \param stripe_count [out] the number of stripes across which to map is + * returned + * + * \retval last_stripe return the last stripe of the mapping + */ +static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, int index, + struct lu_extent *ext, + int start_stripe, int *stripe_count) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + int last_stripe; + u64 obd_start; + u64 obd_end; + int i, j; + + if (ext->e_end - ext->e_start > + lsme->lsme_stripe_size * lsme->lsme_stripe_count) { + last_stripe = (start_stripe < 1 ? lsme->lsme_stripe_count - 1 : + start_stripe - 1); + *stripe_count = lsme->lsme_stripe_count; + } else { + for (j = 0, i = start_stripe; j < lsme->lsme_stripe_count; + i = (i + 1) % lsme->lsme_stripe_count, j++) { + if ((lov_stripe_intersects(lsm, index, i, ext, + &obd_start, &obd_end)) == 0) + break; + } + *stripe_count = j; + last_stripe = (start_stripe + j - 1) % lsme->lsme_stripe_count; + } + + return last_stripe; +} + +/** + * Set fe_device and copy extents from local buffer into main return buffer. + * + * \param fiemap [out] fiemap to hold all extents + * \param lcl_fm_ext [in] array of fiemap extents get from OSC layer + * \param ost_index [in] OST index to be written into the fm_device + * field for each extent + * \param ext_count [in] number of extents to be copied + * \param current_extent [in] where to start copying in the extent array + */ +static void fiemap_prepare_and_copy_exts(struct fiemap *fiemap, + struct fiemap_extent *lcl_fm_ext, + int ost_index, unsigned int ext_count, + int current_extent) +{ + char *to; + unsigned int ext; + + for (ext = 0; ext < ext_count; ext++) { + lcl_fm_ext[ext].fe_device = ost_index; + lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET; + } + + /* Copy fm_extent's from fm_local to return buffer */ + to = (char *)fiemap + fiemap_count_to_size(current_extent); + memcpy(to, lcl_fm_ext, ext_count * sizeof(struct fiemap_extent)); +} + +#define FIEMAP_BUFFER_SIZE 4096 + +/** + * Non-zero fe_logical indicates that this is a continuation FIEMAP + * call. The local end offset and the device are sent in the first + * fm_extent. This function calculates the stripe number from the index. + * This function returns a stripe_no on which mapping is to be restarted. + * + * This function returns fm_end_offset which is the in-OST offset at which + * mapping should be restarted. If fm_end_offset=0 is returned then caller + * will re-calculate proper offset in next stripe. + * Note that the first extent is passed to lov_get_info via the value field. 
+ * + * \param fiemap [in] fiemap request header + * \param lsm [in] striping information for the file + * \param index [in] stripe component index + * \param ext [in] logical extent of mapping + * \param start_stripe [out] starting stripe will be returned in this + */ +static u64 fiemap_calc_fm_end_offset(struct fiemap *fiemap, + struct lov_stripe_md *lsm, + int index, struct lu_extent *ext, + int *start_stripe) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + u64 local_end = fiemap->fm_extents[0].fe_logical; + u64 lun_start; + u64 lun_end; + u64 fm_end_offset; + int stripe_no = -1; + int i; + + if (fiemap->fm_extent_count == 0 || + fiemap->fm_extents[0].fe_logical == 0) + return 0; + + /* Find out stripe_no from ost_index saved in the fe_device */ + for (i = 0; i < lsme->lsme_stripe_count; i++) { + struct lov_oinfo *oinfo = lsme->lsme_oinfo[i]; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + if (oinfo->loi_ost_idx == fiemap->fm_extents[0].fe_device) { + stripe_no = i; + break; + } + } + + if (stripe_no == -1) + return -EINVAL; + + /* If we have finished mapping on previous device, shift logical + * offset to start of next device */ + if (lov_stripe_intersects(lsm, index, stripe_no, ext, + &lun_start, &lun_end) != 0 && + local_end < lun_end) { + fm_end_offset = local_end; + *start_stripe = stripe_no; + } else { + /* This is a special value to indicate that caller should + * calculate offset in next stripe. */ + fm_end_offset = 0; + *start_stripe = (stripe_no + 1) % lsme->lsme_stripe_count; + } + + return fm_end_offset; +} + +struct fiemap_state { + struct fiemap *fs_fm; + struct lu_extent fs_ext; + u64 fs_length; + u64 fs_end_offset; + int fs_cur_extent; + int fs_cnt_need; + int fs_start_stripe; + int fs_last_stripe; + bool fs_device_done; + bool fs_finish_stripe; + bool fs_enough; +}; + +static struct cl_object *lov_find_subobj(const struct lu_env *env, + struct lov_object *lov, + struct lov_stripe_md *lsm, + int index) +{ + struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); + struct lov_thread_info *lti = lov_env_info(env); + struct lu_fid *ofid = <i->lti_fid; + struct lov_oinfo *oinfo; + struct cl_device *subdev; + int entry = lov_comp_entry(index); + int stripe = lov_comp_stripe(index); + int ost_idx; + int rc; + struct cl_object *result; + + if (lov->lo_type != LLT_COMP) + GOTO(out, result = NULL); + + if (entry >= lsm->lsm_entry_count || + stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) + GOTO(out, result = NULL); + + oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; + ost_idx = oinfo->loi_ost_idx; + rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); + if (rc != 0) + GOTO(out, result = NULL); + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + result = lov_sub_find(env, subdev, ofid, NULL); +out: + if (result == NULL) + result = ERR_PTR(-EINVAL); + return result; +} + +int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, + struct lov_stripe_md *lsm, struct fiemap *fiemap, + size_t *buflen, struct ll_fiemap_info_key *fmkey, + int index, int stripeno, struct fiemap_state *fs) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + struct cl_object *subobj; + struct lov_obd *lov = lu2lov_dev(obj->co_lu.lo_dev)->ld_lov; + struct fiemap_extent *fm_ext = &fs->fs_fm->fm_extents[0]; + u64 req_fm_len; /* Stores length of required mapping */ + u64 len_mapped_single_call; + u64 lun_start; + u64 lun_end; + u64 obd_object_end; + unsigned int ext_count; + /* EOF for object */ + bool ost_eof = false; + /* done with required 
mapping for this OST? */ + bool ost_done = false; + int ost_index; + int rc = 0; + + fs->fs_device_done = false; + /* Find out range of mapping on this stripe */ + if ((lov_stripe_intersects(lsm, index, stripeno, &fs->fs_ext, + &lun_start, &obd_object_end)) == 0) + return 0; + + if (lov_oinfo_is_dummy(lsme->lsme_oinfo[stripeno])) + return -EIO; + + /* If this is a continuation FIEMAP call and we are on + * starting stripe then lun_start needs to be set to + * end_offset */ + if (fs->fs_end_offset != 0 && stripeno == fs->fs_start_stripe) + lun_start = fs->fs_end_offset; + lun_end = lov_size_to_stripe(lsm, index, fs->fs_ext.e_end, stripeno); + if (lun_start == lun_end) + return 0; + + req_fm_len = obd_object_end - lun_start + 1; + fs->fs_fm->fm_length = 0; + len_mapped_single_call = 0; + + /* find lobsub object */ + subobj = lov_find_subobj(env, cl2lov(obj), lsm, + lov_comp_index(index, stripeno)); + if (IS_ERR(subobj)) + return PTR_ERR(subobj); + /* If the output buffer is very large and the objects have many + * extents we may need to loop on a single OST repeatedly */ + do { + if (fiemap->fm_extent_count > 0) { + /* Don't get too many extents. */ + if (fs->fs_cur_extent + fs->fs_cnt_need > + fiemap->fm_extent_count) + fs->fs_cnt_need = fiemap->fm_extent_count - + fs->fs_cur_extent; + } + + lun_start += len_mapped_single_call; + fs->fs_fm->fm_length = req_fm_len - len_mapped_single_call; + req_fm_len = fs->fs_fm->fm_length; + /** + * If we've collected enough extent map, we'd request 1 more, + * to see whether we coincidentally finished all available + * extent map, so that FIEMAP_EXTENT_LAST would be set. + */ + fs->fs_fm->fm_extent_count = fs->fs_enough ? + 1 : fs->fs_cnt_need; + fs->fs_fm->fm_mapped_extents = 0; + fs->fs_fm->fm_flags = fiemap->fm_flags; + + ost_index = lsme->lsme_oinfo[stripeno]->loi_ost_idx; + + if (ost_index < 0 || ost_index >= lov->desc.ld_tgt_count) + GOTO(obj_put, rc = -EINVAL); + /* If OST is inactive, return extent with UNKNOWN flag. */ + if (!lov->lov_tgts[ost_index]->ltd_active) { + fs->fs_fm->fm_flags |= FIEMAP_EXTENT_LAST; + fs->fs_fm->fm_mapped_extents = 1; + + fm_ext[0].fe_logical = lun_start; + fm_ext[0].fe_length = obd_object_end - lun_start + 1; + fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; + + goto inactive_tgt; + } + + fs->fs_fm->fm_start = lun_start; + fs->fs_fm->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER; + memcpy(&fmkey->lfik_fiemap, fs->fs_fm, sizeof(*fs->fs_fm)); + *buflen = fiemap_count_to_size(fs->fs_fm->fm_extent_count); + + rc = cl_object_fiemap(env, subobj, fmkey, fs->fs_fm, buflen); + if (rc != 0) + GOTO(obj_put, rc); +inactive_tgt: + ext_count = fs->fs_fm->fm_mapped_extents; + if (ext_count == 0) { + ost_done = true; + fs->fs_device_done = true; + /* If last stripe has hold at the end, + * we need to return */ + if (stripeno == fs->fs_last_stripe) { + fiemap->fm_mapped_extents = 0; + fs->fs_finish_stripe = true; + GOTO(obj_put, rc); + } + break; + } else if (fs->fs_enough) { + /* + * We've collected enough extents and there are + * more extents after it. + */ + GOTO(obj_put, rc); + } + + /* If we just need num of extents, got to next device */ + if (fiemap->fm_extent_count == 0) { + fs->fs_cur_extent += ext_count; + break; + } + + /* prepare to copy retrived map extents */ + len_mapped_single_call = fm_ext[ext_count - 1].fe_logical + + fm_ext[ext_count - 1].fe_length - + lun_start; + + /* Have we finished mapping on this device? 
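+ * (i.e. has len_mapped_single_call, the OST-local byte count computed
+ * above from the end of the last extent returned, reached the req_fm_len
+ * that was requested for this stripe?  If so, ost_done is set and the
+ * surrounding do/while loop stops once this pass has copied its extents.)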
*/ + if (req_fm_len <= len_mapped_single_call) { + ost_done = true; + fs->fs_device_done = true; + } + + /* Clear the EXTENT_LAST flag which can be present on + * the last extent */ + if (fm_ext[ext_count - 1].fe_flags & FIEMAP_EXTENT_LAST) + fm_ext[ext_count - 1].fe_flags &= ~FIEMAP_EXTENT_LAST; + if (lov_stripe_size(lsm, index, + fm_ext[ext_count - 1].fe_logical + + fm_ext[ext_count - 1].fe_length, + stripeno) >= fmkey->lfik_oa.o_size) { + ost_eof = true; + fs->fs_device_done = true; + } + + fiemap_prepare_and_copy_exts(fiemap, fm_ext, ost_index, + ext_count, fs->fs_cur_extent); + fs->fs_cur_extent += ext_count; + + /* Ran out of available extents? */ + if (fs->fs_cur_extent >= fiemap->fm_extent_count) + fs->fs_enough = true; + } while (!ost_done && !ost_eof); + + if (stripeno == fs->fs_last_stripe) + fs->fs_finish_stripe = true; +obj_put: + cl_object_put(env, subobj); + + return rc; +} + +/** + * Break down the FIEMAP request and send appropriate calls to individual OSTs. + * This also handles the restarting of FIEMAP calls in case mapping overflows + * the available number of extents in single call. + * + * \param env [in] lustre environment + * \param obj [in] file object + * \param fmkey [in] fiemap request header and other info + * \param fiemap [out] fiemap buffer holding retrived map extents + * \param buflen [in/out] max buffer length of @fiemap, when iterate + * each OST, it is used to limit max map needed + * \retval 0 success + * \retval < 0 error + */ +static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen) +{ + struct lov_stripe_md_entry *lsme; + struct lov_stripe_md *lsm; + struct fiemap *fm_local = NULL; + loff_t whole_start; + loff_t whole_end; + int entry; + int start_entry; + int end_entry; + int cur_stripe = 0; + int stripe_count; + unsigned int buffer_size = FIEMAP_BUFFER_SIZE; + int rc = 0; + struct fiemap_state fs = { 0 }; + ENTRY; + + lsm = lov_lsm_addref(cl2lov(obj)); + if (lsm == NULL) { + /* no extent: there is no object for mapping */ + fiemap->fm_mapped_extents = 0; + return 0; + } + + if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { + /** + * If the entry count > 1 or stripe_count > 1 and the + * application does not understand DEVICE_ORDER flag, + * it cannot interpret the extents correctly. + */ + if (lsm->lsm_entry_count > 1 || + (lsm->lsm_entry_count == 1 && + lsm->lsm_entries[0]->lsme_stripe_count > 1)) + GOTO(out_lsm, rc = -ENOTSUPP); + } + + /* No support for DOM layout yet. */ + if (lsme_is_dom(lsm->lsm_entries[0])) + GOTO(out_lsm, rc = -ENOTSUPP); + + if (lsm->lsm_is_released) { + if (fiemap->fm_start < fmkey->lfik_oa.o_size) { + /** + * released file, return a minimal FIEMAP if + * request fits in file-size. + */ + fiemap->fm_mapped_extents = 1; + fiemap->fm_extents[0].fe_logical = fiemap->fm_start; + if (fiemap->fm_start + fiemap->fm_length < + fmkey->lfik_oa.o_size) + fiemap->fm_extents[0].fe_length = + fiemap->fm_length; + else + fiemap->fm_extents[0].fe_length = + fmkey->lfik_oa.o_size - + fiemap->fm_start; + fiemap->fm_extents[0].fe_flags |= + FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_LAST; + } + GOTO(out_lsm, rc = 0); + } + + /* buffer_size is small to hold fm_extent_count of extents. 
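+ * In other words, shrink the local buffer when the caller asked for fewer
+ * extents than the default 4 KiB buffer holds.  As a rough, assumed
+ * illustration: with a 32-byte struct fiemap header and 56-byte struct
+ * fiemap_extent entries, fiemap_count_to_size(10) is well under
+ * FIEMAP_BUFFER_SIZE, so only that smaller size is allocated below.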
*/ + if (fiemap_count_to_size(fiemap->fm_extent_count) < buffer_size) + buffer_size = fiemap_count_to_size(fiemap->fm_extent_count); + + OBD_ALLOC_LARGE(fm_local, buffer_size); + if (fm_local == NULL) + GOTO(out_lsm, rc = -ENOMEM); + + /** + * Requested extent count exceeds the fiemap buffer size, shrink our + * ambition. + */ + if (fiemap_count_to_size(fiemap->fm_extent_count) > *buflen) + fiemap->fm_extent_count = fiemap_size_to_count(*buflen); + if (fiemap->fm_extent_count == 0) + fs.fs_cnt_need = 0; + + fs.fs_enough = false; + fs.fs_cur_extent = 0; + fs.fs_fm = fm_local; + fs.fs_cnt_need = fiemap_size_to_count(buffer_size); + + whole_start = fiemap->fm_start; + /* whole_start is beyond the end of the file */ + if (whole_start > fmkey->lfik_oa.o_size) + GOTO(out_fm_local, rc = -EINVAL); + whole_end = (fiemap->fm_length == OBD_OBJECT_EOF) ? + fmkey->lfik_oa.o_size : + whole_start + fiemap->fm_length - 1; + /** + * If fiemap->fm_length != OBD_OBJECT_EOF but whole_end exceeds file + * size + */ + if (whole_end > fmkey->lfik_oa.o_size) + whole_end = fmkey->lfik_oa.o_size; + + start_entry = lov_lsm_entry(lsm, whole_start); + end_entry = lov_lsm_entry(lsm, whole_end); + if (end_entry == -1) + end_entry = lsm->lsm_entry_count - 1; + + if (start_entry == -1 || end_entry == -1) + GOTO(out_fm_local, rc = -EINVAL); + + /* TODO: rewrite it with lov_foreach_io_layout() */ + for (entry = start_entry; entry <= end_entry; entry++) { + lsme = lsm->lsm_entries[entry]; + + if (!lsme_inited(lsme)) + break; + + if (entry == start_entry) + fs.fs_ext.e_start = whole_start; + else + fs.fs_ext.e_start = lsme->lsme_extent.e_start; + if (entry == end_entry) + fs.fs_ext.e_end = whole_end; + else + fs.fs_ext.e_end = lsme->lsme_extent.e_end - 1; + fs.fs_length = fs.fs_ext.e_end - fs.fs_ext.e_start + 1; + + /* Calculate start stripe, last stripe and length of mapping */ + fs.fs_start_stripe = lov_stripe_number(lsm, entry, + fs.fs_ext.e_start); + fs.fs_last_stripe = fiemap_calc_last_stripe(lsm, entry, + &fs.fs_ext, fs.fs_start_stripe, + &stripe_count); + fs.fs_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, entry, + &fs.fs_ext, &fs.fs_start_stripe); + /* Check each stripe */ + for (cur_stripe = fs.fs_start_stripe; stripe_count > 0; + --stripe_count, + cur_stripe = (cur_stripe + 1) % lsme->lsme_stripe_count) { + rc = fiemap_for_stripe(env, obj, lsm, fiemap, buflen, + fmkey, entry, cur_stripe, &fs); + if (rc < 0) + GOTO(out_fm_local, rc); + if (fs.fs_enough) + GOTO(finish, rc); + if (fs.fs_finish_stripe) + break; + } /* for each stripe */ + } /* for covering layout component */ + /* + * We've traversed all components, set @entry to the last component + * entry, it's for the last stripe check. + */ + entry--; +finish: + /* Indicate that we are returning device offsets unless file just has + * single stripe */ + if (lsm->lsm_entry_count > 1 || + (lsm->lsm_entry_count == 1 && + lsm->lsm_entries[0]->lsme_stripe_count > 1)) + fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER; + + if (fiemap->fm_extent_count == 0) + goto skip_last_device_calc; + + /* Check if we have reached the last stripe and whether mapping for that + * stripe is done. 
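+ * If both hold, the last extent already copied into the output buffer is
+ * tagged with FIEMAP_EXTENT_LAST so the application knows nothing is left
+ * to map and no further continuation call is needed.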
*/ + if ((cur_stripe == fs.fs_last_stripe) && fs.fs_device_done) + fiemap->fm_extents[fs.fs_cur_extent - 1].fe_flags |= + FIEMAP_EXTENT_LAST; +skip_last_device_calc: + fiemap->fm_mapped_extents = fs.fs_cur_extent; +out_fm_local: + OBD_FREE_LARGE(fm_local, buffer_size); + +out_lsm: + lov_lsm_put(lsm); + return rc; +} + +static int lov_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm; + int rc = 0; + ENTRY; + + lsm = lov_lsm_addref(lov); + if (lsm == NULL) + RETURN(-ENODATA); + + rc = lov_getstripe(env, cl2lov(obj), lsm, lum, size); + lov_lsm_put(lsm); + RETURN(rc); +} + +static int lov_object_layout_get(const struct lu_env *env, + struct cl_object *obj, + struct cl_layout *cl) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm = lov_lsm_addref(lov); + struct lu_buf *buf = &cl->cl_buf; + ssize_t rc; + ENTRY; + + if (lsm == NULL) { + cl->cl_size = 0; + cl->cl_layout_gen = CL_LAYOUT_GEN_EMPTY; + + RETURN(0); + } + + cl->cl_size = lov_comp_md_size(lsm); + cl->cl_layout_gen = lsm->lsm_layout_gen; + cl->cl_is_composite = lsm_is_composite(lsm->lsm_magic); + + rc = lov_lsm_pack(lsm, buf->lb_buf, buf->lb_len); + lov_lsm_put(lsm); + + RETURN(rc < 0 ? rc : 0); +} + +static loff_t lov_object_maxbytes(struct cl_object *obj) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm = lov_lsm_addref(lov); + loff_t maxbytes; + + if (lsm == NULL) + return LLONG_MAX; + + maxbytes = lsm->lsm_maxbytes; + + lov_lsm_put(lsm); + + return maxbytes; +} + +static int lov_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_flush, true, env, obj, + lock); +} + +static const struct cl_object_operations lov_ops = { + .coo_page_init = lov_page_init, + .coo_lock_init = lov_lock_init, + .coo_io_init = lov_io_init, + .coo_attr_get = lov_attr_get, + .coo_attr_update = lov_attr_update, + .coo_conf_set = lov_conf_set, + .coo_getstripe = lov_object_getstripe, + .coo_layout_get = lov_object_layout_get, + .coo_maxbytes = lov_object_maxbytes, + .coo_fiemap = lov_object_fiemap, + .coo_object_flush = lov_object_flush +}; + +static const struct lu_object_operations lov_lu_obj_ops = { + .loo_object_init = lov_object_init, + .loo_object_delete = lov_object_delete, + .loo_object_release = NULL, + .loo_object_free = lov_object_free, + .loo_object_print = lov_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lov_object *lov; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, GFP_NOFS); + if (lov != NULL) { + obj = lov2lu(lov); + lu_object_init(obj, NULL, dev); + lov->lo_cl.co_ops = &lov_ops; + lov->lo_type = -1; /* invalid, to catch uninitialized type */ + /* + * object io operation vector (cl_object::co_iop) is installed + * later in lov_object_init(), as different vectors are used + * for object with different layouts. 
+ */ + obj->lo_ops = &lov_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) +{ + struct lov_stripe_md *lsm = NULL; + + lov_conf_freeze(lov); + if (lov->lo_lsm != NULL) { + lsm = lsm_addref(lov->lo_lsm); + CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), + lov->lo_layout_invalid, current); + } + lov_conf_thaw(lov); + return lsm; +} + +int lov_read_and_clear_async_rc(struct cl_object *clob) +{ + struct lu_object *luobj; + int rc = 0; + ENTRY; + + luobj = lu_object_locate(&cl_object_header(clob)->coh_lu, + &lov_device_type); + if (luobj != NULL) { + struct lov_object *lov = lu2lov(luobj); + + lov_conf_freeze(lov); + switch (lov->lo_type) { + case LLT_COMP: { + struct lov_stripe_md *lsm; + int i; + + lsm = lov->lo_lsm; + LASSERT(lsm != NULL); + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = + lsm->lsm_entries[i]; + int j; + + if (!lsme_inited(lse)) + break; + + for (j = 0; j < lse->lsme_stripe_count; j++) { + struct lov_oinfo *loi = + lse->lsme_oinfo[j]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ar.ar_rc && !rc) + rc = loi->loi_ar.ar_rc; + loi->loi_ar.ar_rc = 0; + } + } + } + fallthrough; + case LLT_RELEASED: + case LLT_EMPTY: + break; + default: + LBUG(); + } + lov_conf_thaw(lov); + } + RETURN(rc); +} +EXPORT_SYMBOL(lov_read_and_clear_async_rc); + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c new file mode 100644 index 0000000000000..de2e6c47da8ee --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c @@ -0,0 +1,303 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include + +#include "lov_internal.h" + +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) +{ + struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; + + LASSERT(index < lsm->lsm_entry_count); + + if (lsme_is_dom(entry)) + return (loff_t)entry->lsme_stripe_size; + + return (loff_t)entry->lsme_stripe_size * entry->lsme_stripe_count; +} + +/* compute object size given "stripeno" and the ost size */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + unsigned long stripe_size; + loff_t swidth; + loff_t lov_size; + + ENTRY; + + if (ost_size == 0) + RETURN(0); + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_size = lov_do_div64(ost_size, ssize); + if (stripe_size) + lov_size = ost_size * swidth + stripeno * ssize + stripe_size; + else + lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; + + RETURN(lov_size); +} + +/** + * Compute file level page index by stripe level page offset + */ +pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, + pgoff_t stripe_index, int stripe) +{ + loff_t offset; + + offset = lov_stripe_size(lsm, index, + (stripe_index << PAGE_SHIFT) + 1, + stripe); + return offset >> PAGE_SHIFT; +} + +/* + * we have an offset in file backed by an lov and want to find out where + * that offset lands in our given stripe of the file. for the easy + * case where the offset is within the stripe, we just have to scale the + * offset down to make it relative to the stripe instead of the lov. + * + * the harder case is what to do when the offset doesn't intersect the + * stripe. callers will want start offsets clamped ahead to the start + * of the nearest stripe in the file. end offsets similarly clamped to the + * nearest ending byte of a stripe in the file: + * + * all this function does is move offsets to the nearest region of the + * stripe, and it does its work "mod" the full length of all the stripes. + * consider a file with 3 stripes: + * + * S E + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * to find stripe 1's offsets for S and E, it divides by the full stripe + * width and does its math in the context of a single set of stripes: + * + * S E + * ----------------------------------- + * | 0 | 1 | 2 | + * ----------------------------------- + * + * it'll notice that E is outside stripe 1 and clamp it to the end of the + * stripe, then multiply it back out by lov_off to give the real offsets in + * the stripe: + * + * S E + * --------------------------------------------------------------------- + * | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------------------------------------------- + * + * it would have done similarly and pulled S forward to the start of a 1 + * stripe if, say, S had landed in a 0 stripe. + * + * this rounding isn't always correct. consider an E lov offset that lands + * on a 0 stripe, the "mod stripe width" math will pull it forward to the + * start of a 1 stripe, when in fact it wanted to be rounded back to the end + * of a previous 1 stripe. 
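+ *
+ * to make the arithmetic concrete, here is a worked example (the stripe
+ * geometry is assumed purely for illustration): take 3 stripes of 1 MiB
+ * each, so the stripe width is 3 MiB, and a file offset of 4.5 MiB
+ * (0x480000).  that offset sits in the second round of stripes, halfway
+ * through stripe 1's chunk, so for stripe 1 the object offset 0x180000 is
+ * stored and 0 is returned.  for stripe 0 the offset lies past that
+ * stripe's chunk of the round, so it is pulled back to the end of the
+ * chunk (object offset 0x200000) and 1 is returned; for stripe 2 it lies
+ * before the chunk, so it is pushed forward to the chunk's start (object
+ * offset 0x100000) and -1 is returned.
+ *
+ * as noted above, the clamping of an end offset can land on the wrong side
+ * of a chunk;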
this logic is handled by callers and this is why: + * + * this function returns < 0 when the offset was "before" the stripe and + * was moved forward to the start of the stripe in question; 0 when it + * falls in the stripe and no shifting was done; > 0 when the offset + * was outside the stripe and was pulled back to its final byte. + */ +int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, + int stripeno, loff_t *obdoff) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t this_stripe; + loff_t swidth; + int ret = 0; + + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(lov_off, swidth); + + this_stripe = (loff_t)stripeno * ssize; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; +} + +/* + * Given a whole-file size and a stripe number, give the file size which + * corresponds to the individual object of that stripe. + * + * This behaves basically in the same was as lov_stripe_offset, except that + * file sizes falling before the beginning of a stripe are clamped to the end + * of the previous stripe, not the beginning of the next: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * if clamped to stripe 2 becomes: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + */ +loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t this_stripe; + loff_t swidth; + + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(file_size, swidth); + + this_stripe = (loff_t)stripeno * ssize; + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); +} + +/* + * given an extent in an lov and a stripe, calculate the extent of the stripe + * that is contained within the lov extent. this returns true if the given + * stripe does intersect with the lov extent. + * + * Closed interval [@obd_start, @obd_end] will be returned. 
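+ *
+ * Worked example (the stripe geometry is assumed for illustration only):
+ * with 3 stripes of 1 MiB each and a component covering the whole file,
+ * the file extent [4 MiB, 5 MiB) lies entirely in stripe 1's second chunk.
+ * For stripe 1 this returns 1 with the closed object interval
+ * [0x100000, 0x1fffff]; for stripes 0 and 2 both ends clamp to the same
+ * object offset, so 0 is returned and callers skip those stripes.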
+ */ +int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, + struct lu_extent *ext, u64 *obd_start, u64 *obd_end) +{ + struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; + u64 start, end; + int start_side, end_side; + + if (!lu_extent_is_overlapped(ext, &entry->lsme_extent)) + return 0; + + start = max_t(__u64, ext->e_start, entry->lsme_extent.e_start); + end = min_t(__u64, ext->e_end, entry->lsme_extent.e_end); + if (end != OBD_OBJECT_EOF) + end--; + + start_side = lov_stripe_offset(lsm, index, start, stripeno, obd_start); + end_side = lov_stripe_offset(lsm, index, end, stripeno, obd_end); + + CDEBUG(D_INODE, "[%lld->%lld] -> [(%d) %lld->%lld (%d)]\n", + start, end, start_side, *obd_start, *obd_end, end_side); + + /* + * this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. + */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* + * as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) + */ + if (end_side != 0) + (*obd_end)--; + + return 1; +} + +/* compute which stripe number "lov_off" will be written into */ +int lov_stripe_number(struct lov_stripe_md *lsm, int index, loff_t lov_off) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t swidth; + + swidth = stripe_width(lsm, index); + + stripe_off = lov_do_div64(lov_off, swidth); + + /* Puts stripe_off/ssize result into stripe_off */ + lov_do_div64(stripe_off, ssize); + + return stripe_off; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c new file mode 100644 index 0000000000000..6fe3c2ff5bd5b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c @@ -0,0 +1,470 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include +#include +#include + +#include "lov_cl_internal.h" +#include "lov_internal.h" + +void lov_dump_lmm_common(int level, void *lmmp) +{ + struct lov_mds_md *lmm = lmmp; + struct ost_id oi; + + lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); + CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); +} + +static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, + int stripe_count) +{ + int i; + + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + return; + } + + for (i = 0; i < stripe_count; ++i, ++lod) { + struct ost_id oi; + + ostid_le_to_cpu(&lod->l_ost_oi, &oi); + CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + } +} + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) +{ + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) +{ + lov_dump_lmm_common(level, lmm); + CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm(int level, void *lmm) +{ + int magic; + + magic = le32_to_cpu(((struct lov_mds_md *)lmm)->lmm_magic); + switch (magic) { + case LOV_MAGIC_V1: + lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm); + break; + default: + CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n", + magic, LOV_MAGIC_V1); + lov_dump_lmm_common(level, lmm); + break; + } +} + +/** + * Pack LOV striping metadata for disk storage format (in little + * endian byte order). + * + * This follows the getxattr() conventions. If \a buf_size is zero + * then return the size needed. If \a buf_size is too small then + * return -ERANGE. Otherwise return the size of the result. 
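+ *
+ * A sketch of the intended calling pattern (illustrative only, not code
+ * taken from this file; error handling omitted):
+ *
+ *	lmm_size = lov_lsm_pack_v1v3(lsm, NULL, 0);
+ *	OBD_ALLOC_LARGE(lmm, lmm_size);
+ *	lov_lsm_pack_v1v3(lsm, lmm, lmm_size);
+ *
+ * lov_getstripe() below sizes the buffer with lov_comp_md_size() instead
+ * and then packs through lov_lsm_pack() in a single call.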
+ */ +ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_mds_md_v1 *lmmv1 = buf; + struct lov_mds_md_v3 *lmmv3 = buf; + struct lov_ost_data_v1 *lmm_objects; + size_t lmm_size; + unsigned int i; + + ENTRY; + + lmm_size = lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, + lsm->lsm_magic); + if (buf_size == 0) + RETURN(lmm_size); + + if (buf_size < lmm_size) + RETURN(-ERANGE); + + /* + * lmmv1 and lmmv3 point to the same struct and have the + * same first fields + */ + lmmv1->lmm_magic = cpu_to_le32(lsm->lsm_magic); + lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi); + lmmv1->lmm_stripe_size = cpu_to_le32( + lsm->lsm_entries[0]->lsme_stripe_size); + lmmv1->lmm_stripe_count = cpu_to_le16( + lsm->lsm_entries[0]->lsme_stripe_count); + lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_entries[0]->lsme_pattern); + lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen); + + if (lsm->lsm_magic == LOV_MAGIC_V3) { + CLASSERT(sizeof(lsm->lsm_entries[0]->lsme_pool_name) == + sizeof(lmmv3->lmm_pool_name)); + strlcpy(lmmv3->lmm_pool_name, + lsm->lsm_entries[0]->lsme_pool_name, + sizeof(lmmv3->lmm_pool_name)); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = lmmv1->lmm_objects; + } + + if (lsm->lsm_is_released) + RETURN(lmm_size); + + for (i = 0; i < lsm->lsm_entries[0]->lsme_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_entries[0]->lsme_oinfo[i]; + + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); + } + + RETURN(lmm_size); +} + +ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_comp_md_v1 *lcmv1 = buf; + struct lov_comp_md_entry_v1 *lcme; + struct lov_ost_data_v1 *lmm_objects; + size_t lmm_size; + unsigned int entry; + unsigned int offset; + unsigned int size; + unsigned int i; + + ENTRY; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) + return lov_lsm_pack_v1v3(lsm, buf, buf_size); + + lmm_size = lov_comp_md_size(lsm); + if (buf_size == 0) + RETURN(lmm_size); + + if (buf_size < lmm_size) + RETURN(-ERANGE); + + lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic); + lcmv1->lcm_size = cpu_to_le32(lmm_size); + lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen); + lcmv1->lcm_flags = cpu_to_le16(lsm->lsm_flags); + lcmv1->lcm_mirror_count = cpu_to_le16(lsm->lsm_mirror_count); + lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count); + + offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count; + + for (entry = 0; entry < lsm->lsm_entry_count; entry++) { + struct lov_stripe_md_entry *lsme; + struct lov_mds_md *lmm; + __u16 stripe_count; + + lsme = lsm->lsm_entries[entry]; + lcme = &lcmv1->lcm_entries[entry]; + + lcme->lcme_id = cpu_to_le32(lsme->lsme_id); + lcme->lcme_flags = cpu_to_le32(lsme->lsme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lcme->lcme_timestamp = + cpu_to_le64(lsme->lsme_timestamp); + lcme->lcme_extent.e_start = + cpu_to_le64(lsme->lsme_extent.e_start); + lcme->lcme_extent.e_end = + cpu_to_le64(lsme->lsme_extent.e_end); + lcme->lcme_offset = cpu_to_le32(offset); + + lmm = (struct lov_mds_md *)((char *)lcmv1 + offset); + lmm->lmm_magic = cpu_to_le32(lsme->lsme_magic); + /* lmm->lmm_oi not set */ + lmm->lmm_pattern = cpu_to_le32(lsme->lsme_pattern); + lmm->lmm_stripe_size = cpu_to_le32(lsme->lsme_stripe_size); + lmm->lmm_stripe_count = cpu_to_le16(lsme->lsme_stripe_count); + lmm->lmm_layout_gen = 
cpu_to_le16(lsme->lsme_layout_gen); + + if (lsme->lsme_magic == LOV_MAGIC_V3) { + struct lov_mds_md_v3 *lmmv3 = + (struct lov_mds_md_v3 *)lmm; + + strlcpy(lmmv3->lmm_pool_name, lsme->lsme_pool_name, + sizeof(lmmv3->lmm_pool_name)); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = + ((struct lov_mds_md_v1 *)lmm)->lmm_objects; + } + + if (lsme_inited(lsme) && + !(lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)) + stripe_count = lsme->lsme_stripe_count; + else + stripe_count = 0; + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsme->lsme_oinfo[i]; + + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = + cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = + cpu_to_le32(loi->loi_ost_idx); + } + + size = lov_mds_md_size(stripe_count, lsme->lsme_magic); + lcme->lcme_size = cpu_to_le32(size); + offset += size; + } /* for each layout component */ + + RETURN(lmm_size); +} + +/* Find the max stripecount we should use */ +__u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count) +{ + __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; + + if (!stripe_count) + stripe_count = lov->desc.ld_default_stripe_count; + if (stripe_count > lov->desc.ld_active_tgt_count) + stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; + + /* + * stripe count is based on whether ldiskfs can handle + * larger EA sizes + */ + if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && + lov->lov_ocd.ocd_max_easize) + max_stripes = lov_mds_md_max_stripe_count( + lov->lov_ocd.ocd_max_easize, magic); + + if (stripe_count > max_stripes) + stripe_count = max_stripes; + + return stripe_count; +} + +int lov_free_memmd(struct lov_stripe_md **lsmp) +{ + struct lov_stripe_md *lsm = *lsmp; + int refc; + + *lsmp = NULL; + refc = atomic_dec_return(&lsm->lsm_refc); + LASSERT(refc >= 0); + if (refc == 0) + lsm_free(lsm); + + return refc; +} + +/* + * Unpack LOV object metadata from disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + */ +struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, + size_t buf_size) +{ + const struct lsm_operations *op; + struct lov_stripe_md *lsm; + u32 magic; + + ENTRY; + + if (buf_size < sizeof(magic)) + RETURN(ERR_PTR(-EINVAL)); + + magic = le32_to_cpu(*(u32 *)buf); + op = lsm_op_find(magic); + if (!op) + RETURN(ERR_PTR(-EINVAL)); + + lsm = op->lsm_unpackmd(lov, buf, buf_size); + + RETURN(lsm); +} + +/* + * Retrieve object striping information. + * + * @lump is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_USER_MAGIC. + * + * If @size > 0, User specified limited buffer size, usually the buffer is from + * ll_lov_setstripe(), and the buffer can only hold basic layout template info. 
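+ *
+ * For a composite (LOV_MAGIC_COMP_V1) layout that has to be squeezed into
+ * such a V1/V3 buffer, the code below describes a single component: the
+ * last instantiated component when the file size is non-zero, otherwise
+ * the last component, so legacy tools still see a plausible stripe pattern.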
+ */ +int lov_getstripe(const struct lu_env *env, struct lov_object *obj, + struct lov_stripe_md *lsm, struct lov_user_md __user *lump, + size_t size) +{ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_mds_md *lmmk, *lmm; + struct lov_user_md_v1 lum; + size_t lmmk_size, lum_size = 0; + ssize_t lmm_size; + int rc = 0; + + ENTRY; + + if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3 && + lsm->lsm_magic != LOV_MAGIC_COMP_V1) { + CERROR("bad LSM MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", + lsm->lsm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); + GOTO(out, rc = -EIO); + } + + lmmk_size = lov_comp_md_size(lsm); + + OBD_ALLOC_LARGE(lmmk, lmmk_size); + if (!lmmk) + GOTO(out, rc = -ENOMEM); + + lmm_size = lov_lsm_pack(lsm, lmmk, lmmk_size); + if (lmm_size < 0) + GOTO(out_free, rc = lmm_size); + + if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) { + if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || + lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_mds_md(lmmk); + lustre_swab_lov_user_md_objects( + (struct lov_user_ost_data *)lmmk->lmm_objects, + lmmk->lmm_stripe_count); + } else if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1)) { + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmmk); + } + } + + /* + * Legacy appication passes limited buffer, we need to figure out + * the user buffer size by the passed in lmm_stripe_count. + */ + if (copy_from_user(&lum, lump, sizeof(struct lov_user_md_v1))) + GOTO(out_free, rc = -EFAULT); + + if (lum.lmm_magic == LOV_USER_MAGIC_V1 || + lum.lmm_magic == LOV_USER_MAGIC_V3) + lum_size = lov_user_md_size(lum.lmm_stripe_count, + lum.lmm_magic); + + if (lum_size != 0) { + struct lov_mds_md *comp_md = lmmk; + + /* + * Legacy app (ADIO for instance) treats the layout as V1/V3 + * blindly, we'd return a reasonable V1/V3 for them. + */ + if (lmmk->lmm_magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *comp_v1; + struct cl_object *cl_obj; + struct cl_attr attr; + int i; + + attr.cat_size = 0; + cl_obj = cl_object_top(&obj->lo_cl); + cl_object_attr_lock(cl_obj); + cl_object_attr_get(env, cl_obj, &attr); + cl_object_attr_unlock(cl_obj); + + /* + * return the last instantiated component if file size + * is non-zero, otherwise, return the last component. + */ + comp_v1 = (struct lov_comp_md_v1 *)lmmk; + i = attr.cat_size == 0 ? comp_v1->lcm_entry_count : 0; + for (; i < comp_v1->lcm_entry_count; i++) { + if (!(comp_v1->lcm_entries[i].lcme_flags & + LCME_FL_INIT)) + break; + } + if (i > 0) + i--; + comp_md = (struct lov_mds_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + lum_size = comp_v1->lcm_entries[i].lcme_size; + } + + lmm = comp_md; + lmm_size = min(lum_size, lmmk_size); + } else { + lmm = lmmk; + lmm_size = lmmk_size; + } + /** + * User specified limited buffer size, usually the buffer is + * from ll_lov_setstripe(), and the buffer can only hold basic + * layout template info. + */ + if (size == 0 || size > lmm_size) + size = lmm_size; + if (copy_to_user(lump, lmm, size)) + GOTO(out_free, rc = -EFAULT); + +out_free: + OBD_FREE_LARGE(lmmk, lmmk_size); +out: + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_page.c b/drivers/staging/lustrefsx/lustre/lov/lov_page.c new file mode 100644 index 0000000000000..34fbc66e47172 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_page.c @@ -0,0 +1,161 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov page operations. + * + */ + +static int lov_comp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, + LUSTRE_LOV_NAME"-page@%p, comp index: %x, gen: %u\n", + lp, lp->lps_index, lp->lps_layout_gen); +} + +static const struct cl_page_operations lov_comp_page_ops = { + .cpo_print = lov_comp_page_print +}; + +int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lov_object *loo = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + struct cl_object *subobj; + struct cl_object *o; + struct lov_io_sub *sub; + struct lov_page *lpg = cl_object_page_slice(obj, page); + struct lov_layout_raid0 *r0; + loff_t offset; + loff_t suboff; + int entry; + int stripe; + int rc; + + ENTRY; + + offset = cl_offset(obj, index); + entry = lov_io_layout_at(lio, offset); + if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) { + /* non-existing layout component */ + lov_page_init_empty(env, obj, page, index); + RETURN(0); + } + + r0 = lov_r0(loo, entry); + stripe = lov_stripe_number(loo->lo_lsm, entry, offset); + LASSERT(stripe < r0->lo_nr); + rc = lov_stripe_offset(loo->lo_lsm, entry, offset, stripe, &suboff); + LASSERT(rc == 0); + + lpg->lps_index = lov_comp_index(entry, stripe); + lpg->lps_layout_gen = loo->lo_lsm->lsm_layout_gen; + cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_comp_page_ops); + + sub = lov_sub_get(env, lio, lpg->lps_index); + if (IS_ERR(sub)) + RETURN(PTR_ERR(sub)); + + subobj = lovsub2cl(r0->lo_sub[stripe]); + list_for_each_entry(o, &subobj->co_lu.lo_header->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init) { + rc = o->co_ops->coo_page_init(sub->sub_env, o, page, + cl_index(subobj, suboff)); + if (rc != 0) + break; + } + } + + RETURN(rc); +} + +static int lov_empty_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); +} + +static const struct cl_page_operations 
lov_empty_page_ops = { + .cpo_print = lov_empty_page_print +}; + +int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lov_page *lpg = cl_object_page_slice(obj, page); + void *addr; + + ENTRY; + + lpg->lps_index = ~0; + cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_empty_page_ops); + addr = kmap(page->cp_vmpage); + memset(addr, 0, cl_page_size(obj)); + kunmap(page->cp_vmpage); + cl_page_export(env, page, 1); + RETURN(0); +} + +bool lov_page_is_empty(const struct cl_page *page) +{ + const struct cl_page_slice *slice = cl_page_at(page, &lov_device_type); + + LASSERT(slice != NULL); + return slice->cpl_ops == &lov_empty_page_ops; +} + + +/** @} lov */ + diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c new file mode 100644 index 0000000000000..225ba9391cf19 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -0,0 +1,618 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_pool.c + * + * OST pool methods + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include + +#include +#include "lov_internal.h" + +#define pool_tgt(_p, _i) \ + _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]] + +static void lov_pool_getref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + atomic_inc(&pool->pool_refcount); +} + +static void lov_pool_putref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + if (atomic_dec_and_test(&pool->pool_refcount)) { + LASSERT(hlist_unhashed(&pool->pool_hash)); + LASSERT(list_empty(&pool->pool_list)); + LASSERT(pool->pool_proc_entry == NULL); + lov_ost_pool_free(&(pool->pool_obds)); + OBD_FREE_PTR(pool); + EXIT; + } +} + +static void lov_pool_putref_locked(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + LASSERT(atomic_read(&pool->pool_refcount) > 1); + + atomic_dec(&pool->pool_refcount); +} + +/* + * hash function using a Rotating Hash algorithm + * Knuth, D. The Art of Computer Programming, + * Volume 3: Sorting and Searching, + * Chapter 6.4. 
+ * Addison Wesley, 1973 + */ +static __u32 pool_hashfn(struct cfs_hash *hash_body, const void *key, + unsigned mask) +{ + int i; + __u32 result; + char *poolname; + + result = 0; + poolname = (char *)key; + for (i = 0; i < LOV_MAXPOOLNAME; i++) { + if (poolname[i] == '\0') + break; + result = (result << 4)^(result >> 28) ^ poolname[i]; + } + return (result % mask); +} + +static void *pool_key(struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + return (pool->pool_name); +} + +static int +pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode) +{ + char *pool_name; + struct pool_desc *pool; + + pool_name = (char *)key; + pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash); + return !strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME); +} + +static void *pool_hashobject(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct pool_desc, pool_hash); +} + +static void pool_hashrefcount_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_getref(pool); +} + +static void pool_hashrefcount_put_locked(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_putref_locked(pool); +} + +struct cfs_hash_ops pool_hash_operations = { + .hs_hash = pool_hashfn, + .hs_key = pool_key, + .hs_keycmp = pool_hashkey_keycmp, + .hs_object = pool_hashobject, + .hs_get = pool_hashrefcount_get, + .hs_put_locked = pool_hashrefcount_put_locked, + +}; + +#ifdef CONFIG_PROC_FS +/* + * pool /proc seq_file methods + */ +/* + * iterator is used to go through the target pool entries + * index is the current entry index in the lp_array[] array + * index >= pos returned to the seq_file interface + * pos is from 0 to (pool->pool_obds.op_count - 1) + */ +#define POOL_IT_MAGIC 0xB001CEA0 +struct pool_iterator { + int magic; + struct pool_desc *pool; + int idx; /* from 0 to pool_tgt_size - 1 */ +}; + +static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + int prev_idx; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); + + /* test if end of file */ + if (*pos >= pool_tgt_count(iter->pool)) + return NULL; + + /* iterate to find a non empty entry */ + prev_idx = iter->idx; + iter->idx++; + if (iter->idx >= pool_tgt_count(iter->pool)) { + iter->idx = prev_idx; /* we stay on the last entry */ + return NULL; + } + (*pos)++; + /* return != NULL to continue */ + return iter; +} + +static void *pool_proc_start(struct seq_file *s, loff_t *pos) +{ + struct pool_desc *pool = (struct pool_desc *)s->private; + struct pool_iterator *iter; + + lov_pool_getref(pool); + if ((pool_tgt_count(pool) == 0) || + (*pos >= pool_tgt_count(pool))) { + /* iter is not created, so stop() has no way to + * find pool to dec ref */ + lov_pool_putref(pool); + return NULL; + } + + OBD_ALLOC_PTR(iter); + if (!iter) + return ERR_PTR(-ENOMEM); + iter->magic = POOL_IT_MAGIC; + iter->pool = pool; + iter->idx = 0; + + /* we use seq_file private field to memorized iterator so + * we can free it at stop() */ + /* /!\ do not forget to restore it to pool before freeing it */ + s->private = iter; + down_read(&pool_tgt_rw_sem(pool)); + if (*pos > 0) { + loff_t i; + void *ptr; + + i = 0; + do { + ptr = pool_proc_next(s, &iter, &i); + } while ((i < *pos) && (ptr != NULL)); + return ptr; + } 
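+	/* *pos == 0: hand back the freshly initialised iterator itself so
+	 * pool_proc_show() prints the first pool member (idx 0). */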
+ return iter; +} + +static void pool_proc_stop(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + + /* in some cases stop() method is called 2 times, without + * calling start() method (see seq_read() from fs/seq_file.c) + * we have to free only if s->private is an iterator */ + if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + up_read(&pool_tgt_rw_sem(iter->pool)); + /* we restore s->private so next call to pool_proc_start() + * will work */ + s->private = iter->pool; + lov_pool_putref(iter->pool); + OBD_FREE_PTR(iter); + } + return; +} + +static int pool_proc_show(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)v; + struct lov_tgt_desc *tgt; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); + LASSERT(iter->pool != NULL); + LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + + tgt = pool_tgt(iter->pool, iter->idx); + if (tgt) + seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + + return 0; +} + +static struct seq_operations pool_proc_ops = { + .start = pool_proc_start, + .next = pool_proc_next, + .stop = pool_proc_stop, + .show = pool_proc_show, +}; + +static int pool_proc_open(struct inode *inode, struct file *file) +{ + int rc; + + rc = seq_open(file, &pool_proc_ops); + if (!rc) { + struct seq_file *s = file->private_data; + s->private = PDE_DATA(inode); + } + return rc; +} + +const static struct proc_ops pool_proc_operations = { + .proc_open = pool_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +void lov_dump_pool(int level, struct pool_desc *pool) +{ + int i; + + lov_pool_getref(pool); + + CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n", + pool->pool_name, pool->pool_obds.op_count); + down_read(&pool_tgt_rw_sem(pool)); + + for (i = 0; i < pool_tgt_count(pool) ; i++) { + if (!pool_tgt(pool, i) || !(pool_tgt(pool, i))->ltd_exp) + continue; + CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n", + pool->pool_name, i, + obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid))); + } + + up_read(&pool_tgt_rw_sem(pool)); + lov_pool_putref(pool); +} + +#define LOV_POOL_INIT_COUNT 2 +int lov_ost_pool_init(struct lu_tgt_pool *op, unsigned int count) +{ + ENTRY; + + if (count == 0) + count = LOV_POOL_INIT_COUNT; + op->op_array = NULL; + op->op_count = 0; + init_rwsem(&op->op_rw_sem); + op->op_size = count * sizeof(op->op_array[0]); + OBD_ALLOC(op->op_array, op->op_size); + if (op->op_array == NULL) { + op->op_size = 0; + RETURN(-ENOMEM); + } + EXIT; + return 0; +} + +/* Caller must hold write op_rwlock */ +int lov_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) +{ + __u32 *new; + __u32 new_size; + + LASSERT(min_count != 0); + + if (op->op_count * sizeof(op->op_array[0]) < op->op_size) + return 0; + + new_size = max_t(__u32, min_count * sizeof(op->op_array[0]), + 2 * op->op_size); + OBD_ALLOC(new, new_size); + if (new == NULL) + return -ENOMEM; + + /* copy old array to new one */ + memcpy(new, op->op_array, op->op_size); + OBD_FREE(op->op_array, op->op_size); + op->op_array = new; + op->op_size = new_size; + return 0; +} + +int lov_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) +{ + int rc = 0, i; + ENTRY; + + down_write(&op->op_rw_sem); + + rc = lov_ost_pool_extend(op, min_count); + if (rc) + GOTO(out, rc); + + /* search ost in pool array */ + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) + GOTO(out, rc = -EEXIST); + } + /* ost not found we add it */ + 
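+	/* op_rw_sem is held for write and lov_ost_pool_extend() above made
+	 * sure there is room for at least one more entry, so a plain append
+	 * is safe here. */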
op->op_array[op->op_count] = idx; + op->op_count++; + EXIT; +out: + up_write(&op->op_rw_sem); + return rc; +} + +int lov_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx) +{ + int i; + ENTRY; + + down_write(&op->op_rw_sem); + + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + memmove(&op->op_array[i], &op->op_array[i + 1], + (op->op_count - i - 1) * sizeof(op->op_array[0])); + op->op_count--; + up_write(&op->op_rw_sem); + EXIT; + return 0; + } + } + + up_write(&op->op_rw_sem); + RETURN(-EINVAL); +} + +int lov_ost_pool_free(struct lu_tgt_pool *op) +{ + ENTRY; + + if (op->op_size == 0) + RETURN(0); + + down_write(&op->op_rw_sem); + + OBD_FREE(op->op_array, op->op_size); + op->op_array = NULL; + op->op_count = 0; + op->op_size = 0; + + up_write(&op->op_rw_sem); + RETURN(0); +} + + +int lov_pool_new(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *new_pool; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + if (strlen(poolname) > LOV_MAXPOOLNAME) + RETURN(-ENAMETOOLONG); + + OBD_ALLOC_PTR(new_pool); + if (new_pool == NULL) + RETURN(-ENOMEM); + + strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name)); + new_pool->pool_lobd = obd; + /* ref count init to 1 because when created a pool is always used + * up to deletion + */ + atomic_set(&new_pool->pool_refcount, 1); + rc = lov_ost_pool_init(&new_pool->pool_obds, 0); + if (rc) + GOTO(out_err, rc); + + INIT_HLIST_NODE(&new_pool->pool_hash); + +#ifdef CONFIG_PROC_FS + /* get ref for /proc file */ + lov_pool_getref(new_pool); + new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, + poolname, new_pool, + &pool_proc_operations); + if (IS_ERR(new_pool->pool_proc_entry)) { + CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname); + new_pool->pool_proc_entry = NULL; + lov_pool_putref(new_pool); + } + CDEBUG(D_INFO, "pool %p - proc %p\n", + new_pool, new_pool->pool_proc_entry); +#endif + + spin_lock(&obd->obd_dev_lock); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + /* add to find only when it fully ready */ + rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname, + &new_pool->pool_hash); + if (rc) + GOTO(out_err, rc = -EEXIST); + + CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); + + RETURN(0); + +out_err: + spin_lock(&obd->obd_dev_lock); + list_del_init(&new_pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + lprocfs_remove(&new_pool->pool_proc_entry); + lov_ost_pool_free(&new_pool->pool_obds); + OBD_FREE_PTR(new_pool); + + return rc; +} + +int lov_pool_del(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *pool; + ENTRY; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + if (pool->pool_proc_entry != NULL) { + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry); + lprocfs_remove(&pool->pool_proc_entry); + lov_pool_putref(pool); + } + + spin_lock(&obd->obd_dev_lock); + list_del_init(&pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + /* release last reference */ + lov_pool_putref(pool); + + RETURN(0); +} + + +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + 
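+	/* the hash lookup takes a reference on the pool (via
+	 * pool_hashrefcount_get()); it is dropped at "out" below by
+	 * lov_pool_putref(). */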
pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + + /* search ost in lov array */ + lov_tgts_getref(obd); + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); + if (rc) + GOTO(out, rc); + + CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n", + ostname, poolname, pool_tgt_count(pool)); + + EXIT; +out: + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; +} + +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc = 0; + ENTRY; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + lov_tgts_getref(obd); + /* search ost in lov array, to get index */ + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + lov_ost_pool_remove(&pool->pool_obds, lov_idx); + + CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname, + poolname); + + EXIT; +out: + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_request.c b/drivers/staging/lustrefsx/lustre/lov/lov_request.c new file mode 100644 index 0000000000000..75e5c901fd91e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_request.c @@ -0,0 +1,392 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include "lov_internal.h" + +static void lov_init_set(struct lov_request_set *set) +{ + set->set_count = 0; + atomic_set(&set->set_completes, 0); + atomic_set(&set->set_success, 0); + INIT_LIST_HEAD(&set->set_list); +} + +static void lov_finish_set(struct lov_request_set *set) +{ + struct list_head *pos, *n; + struct lov_request *req; + + ENTRY; + + LASSERT(set != NULL); + list_for_each_safe(pos, n, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + list_del_init(&req->rq_link); + + if (req->rq_oi.oi_osfs) + OBD_FREE_PTR(req->rq_oi.oi_osfs); + + OBD_FREE_PTR(req); + } + + OBD_FREE_PTR(set); + EXIT; +} + +static void +lov_update_set(struct lov_request_set *set, struct lov_request *req, int rc) +{ + atomic_inc(&set->set_completes); + if (rc == 0) + atomic_inc(&set->set_success); +} + +static void +lov_set_add_req(struct lov_request *req, struct lov_request_set *set) +{ + list_add_tail(&req->rq_link, &set->set_list); + set->set_count++; + req->rq_rqset = set; +} + +static int lov_check_set(struct lov_obd *lov, int idx) +{ + int rc = 0; + + mutex_lock(&lov->lov_lock); + + if (!lov->lov_tgts[idx] || lov->lov_tgts[idx]->ltd_active || + (lov->lov_tgts[idx]->ltd_exp && + class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried)) + rc = 1; + + mutex_unlock(&lov->lov_lock); + return rc; +} + +/* + * Check if the OSC connection exists and is active. + * If the OSC has not yet had a chance to connect to the OST the first time, + * wait once for it to connect instead of returning an error. + */ +static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + struct lov_tgt_desc *tgt; + struct obd_import *imp = NULL; + int rc = 0; + + mutex_lock(&lov->lov_lock); + + tgt = lov->lov_tgts[ost_idx]; + + if (unlikely(!tgt)) + GOTO(out, rc = 0); + + if (likely(tgt->ltd_active)) + GOTO(out, rc = 1); + + if (tgt->ltd_exp) + imp = class_exp2cliimp(tgt->ltd_exp); + if (imp && imp->imp_connect_tried) + GOTO(out, rc = 0); + if (imp && imp->imp_state == LUSTRE_IMP_IDLE) + GOTO(out, rc = 0); + + mutex_unlock(&lov->lov_lock); + + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout), + cfs_time_seconds(1), NULL, NULL); + + rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi); + if (tgt->ltd_active) + return 1; + + return 0; + +out: + mutex_unlock(&lov->lov_lock); + return rc; +} + +#define LOV_U64_MAX ((__u64)~0ULL) +#define LOV_SUM_MAX(tot, add) \ + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while (0) + +static int +lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) +{ + ENTRY; + + if (success) { + __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, + LOV_MAGIC, 0); + if (osfs->os_files != LOV_U64_MAX) + lov_do_div64(osfs->os_files, expected_stripes); + if (osfs->os_ffree != LOV_U64_MAX) + lov_do_div64(osfs->os_ffree, expected_stripes); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); + obd->obd_osfs_age = ktime_get_seconds(); + spin_unlock(&obd->obd_osfs_lock); + RETURN(0); + } + + RETURN(-EIO); +} + +int lov_fini_statfs_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (!set) + RETURN(0); + + if (atomic_read(&set->set_completes)) { + rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs, + atomic_read(&set->set_success)); + } + + lov_finish_set(set); + + RETURN(rc); +} + +static 
void +lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, + int success) +{ + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } +#ifdef MIN_DF + /* + * Sandia requested that df (and so, statfs) only + * returned minimal available space on + * a single OST, so people would be able to + * write this much data guaranteed. + */ + if (osfs->os_bavail > lov_sfs->os_bavail) { + /* + * Presumably if new bavail is smaller, + * new bfree is bigger as well + */ + osfs->os_bfree = lov_sfs->os_bfree; + osfs->os_bavail = lov_sfs->os_bavail; + } +#else + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; +#endif + osfs->os_blocks += lov_sfs->os_blocks; + /* + * XXX not sure about this one - depends on policy. + * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). + */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } +} + +/* + * The callback for osc_statfs_async that finilizes a request info when a + * response is received. + */ +static int cb_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + + ENTRY; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + set = lovreq->rq_rqset; + lovobd = set->set_obd; + lov = &lovobd->u.lov; + osfs = set->set_oi->oi_osfs; + lov_sfs = oinfo->oi_osfs; + success = atomic_read(&set->set_success); + /* + * XXX: the same is done in lov_update_common_set, however + * lovset->set_exp is not initialized. 
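+ *
+ * Illustrative arithmetic for the aggregation done by lov_update_statfs()
+ * below: both replies are first normalized to the larger os_bsize. With
+ * osfs->os_bsize = 1024 and lov_sfs->os_bsize = 4096 the bit-difference
+ * loop yields shift = 2, so the 1024-byte block counters are shifted down
+ * by 2 (divided by 4) before combining. os_blocks is then summed, while
+ * bfree/bavail are either summed or, when MIN_DF is defined, taken from
+ * the OST reporting the least available space. The file counters are
+ * added with LOV_SUM_MAX(), which saturates at LOV_U64_MAX instead of
+ * wrapping on overflow.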
+ */ + lov_update_set(set, lovreq, rc); + if (rc) + GOTO(out, rc); + + lov_tgts_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + GOTO(out_update, rc); + + tgtobd = class_exp2obd(tgt->ltd_exp); + spin_lock(&tgtobd->obd_osfs_lock); + memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); + if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) + tgtobd->obd_osfs_age = ktime_get_seconds(); + spin_unlock(&tgtobd->obd_osfs_lock); + +out_update: + lov_update_statfs(osfs, lov_sfs, success); + lov_tgts_putref(lovobd); +out: + RETURN(0); +} + +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (!set) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_obd = obd; + set->set_oi = oinfo; + + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_tgt_desc *ltd = lov->lov_tgts[i]; + struct lov_request *req; + + if (!ltd) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + /* + * skip targets that have been explicitely disabled by the + * administrator + */ + if (!ltd->ltd_exp) { + CDEBUG(D_HA, "lov idx %d administratively disabled\n", + i); + continue; + } + + if (oinfo->oi_flags & OBD_STATFS_NODELAY && + class_exp2cliimp(ltd->ltd_exp)->imp_state != + LUSTRE_IMP_IDLE && !ltd->ltd_active) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + if (!ltd->ltd_active) + lov_check_and_wait_active(lov, i); + + OBD_ALLOC(req, sizeof(*req)); + if (!req) + GOTO(out_set, rc = -ENOMEM); + + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (!req->rq_oi.oi_osfs) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_statfs_set(set); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c new file mode 100644 index 0000000000000..90a11e75393b9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c @@ -0,0 +1,149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * Implementation of cl_device and cl_device_type for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov-sub device and device type functions. + * + */ + +static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); +} + +static struct lu_device *lovsub_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lu_device *next; + struct lovsub_device *lsd; + + ENTRY; + lsd = lu2lovsub_dev(d); + next = cl2lu_dev(lsd->acid_next); + lsd->acid_next = NULL; + RETURN(next); +} + +static struct lu_device *lovsub_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); + + if (atomic_read(&d->ld_ref) && d->ld_site) { + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); + lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer); + } + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(lsd); + return next; +} + +static const struct lu_device_operations lovsub_lu_ops = { + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL +}; + +static struct lu_device *lovsub_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; +} + +static const struct lu_device_type_operations lovsub_device_type_ops = { + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, + + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini +}; + +#define LUSTRE_LOVSUB_NAME "lovsub" + +struct lu_device_type lovsub_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + + +/** @} lov */ + diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c new file mode 100644 index 0000000000000..d219356cb3ad3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c @@ -0,0 +1,195 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub object operations. + * + */ + +int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), 0); + result = 0; + } else + result = -ENOMEM; + RETURN(result); + +} + +static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + + ENTRY; + + /* + * We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. + */ + if (lov) { + int index = lov_comp_entry(los->lso_index); + int stripe = lov_comp_stripe(los->lso_index); + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + LASSERT(lov->lo_type == LLT_COMP); + LASSERT(r0->lo_sub[stripe] == los); + spin_lock(&r0->lo_sub_lock); + r0->lo_sub[stripe] = NULL; + spin_unlock(&r0->lo_sub_lock); + } + + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); + EXIT; +} + +static int lovsub_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + + return (*p)(env, cookie, "[%d]", los->lso_index); +} + +static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lovsub_object *los = cl2lovsub(obj); + struct lov_object *lov = cl2lovsub(obj)->lso_super; + + ENTRY; + lov_r0(lov, lov_comp_entry(los->lso_index))->lo_attr_valid = 0; + RETURN(0); +} + +static int lovsub_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lovsub_object *los = cl2lovsub(obj); + + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); +} + +/** + * Implementation of struct cl_object_operations::coo_req_attr_set() for lovsub + * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx + * field, which is filled there. 
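+ * Besides o_stripe_idx, the code below also packs the component layout
+ * into obdo::o_layout with lov_lsm2layout() and sets OBD_MD_FLOSTLAYOUT
+ * in o_valid, after first delegating to the parent lov object through
+ * cl_req_attr_set().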
+ */ +static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct lovsub_object *subobj = cl2lovsub(obj); + struct lov_stripe_md *lsm = subobj->lso_super->lo_lsm; + + ENTRY; + cl_req_attr_set(env, &subobj->lso_super->lo_cl, attr); + + /* + * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it + * unconditionally. It never changes anyway. + */ + attr->cra_oa->o_stripe_idx = lov_comp_stripe(subobj->lso_index); + lov_lsm2layout(lsm, lsm->lsm_entries[lov_comp_entry(subobj->lso_index)], + &attr->cra_oa->o_layout); + attr->cra_oa->o_valid |= OBD_MD_FLOSTLAYOUT; + EXIT; +} + +static const struct cl_object_operations lovsub_ops = { + .coo_attr_update = lovsub_attr_update, + .coo_glimpse = lovsub_object_glimpse, + .coo_req_attr_set = lovsub_req_attr_set +}; + +static const struct lu_object_operations lovsub_lu_obj_ops = { + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lovsub_object *los; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); + if (los) { + struct cl_object_header *hdr; + + obj = lovsub2lu(los); + hdr = &los->lso_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + los->lso_cl.co_ops = &lovsub_ops; + obj->lo_ops = &lovsub_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c new file mode 100644 index 0000000000000..f6eeebed9e2b0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -0,0 +1,344 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include "lov_internal.h" + +static int lov_stripesize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + + seq_printf(m, "%llu\n", desc->ld_default_stripe_size); + return 0; +} + +static ssize_t lov_stripesize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + s64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + if (rc) + return rc; + if (val < 0) + return -ERANGE; + + lov_fix_desc_stripe_size(&val); + desc->ld_default_stripe_size = val; + + return count; +} +LPROC_SEQ_FOPS(lov_stripesize); + +static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + + return sprintf(buf, "%lld\n", desc->ld_default_stripe_offset); +} + +static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + long val; + int rc; + + rc = kstrtol(buf, 0, &val); + if (rc) + return rc; + if (val < -1 || val > LOV_MAX_STRIPE_COUNT) + return -ERANGE; + + desc->ld_default_stripe_offset = val; + + return count; +} +LUSTRE_RW_ATTR(stripeoffset); + +static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + + return sprintf(buf, "%u\n", desc->ld_pattern); +} + +static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + u32 pattern; + int rc; + + rc = kstrtouint(buffer, 0, &pattern); + if (rc) + return rc; + + lov_fix_desc_pattern(&pattern); + desc->ld_pattern = pattern; + + return count; +} +LUSTRE_RW_ATTR(stripetype); + +static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + + return sprintf(buf, "%d\n", + (__s16)(desc->ld_default_stripe_count + 1) - 1); +} + +static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + int stripe_count; + int rc; + + rc = kstrtoint(buffer, 0, &stripe_count); + if (rc) + return rc; + + if (stripe_count < -1) + return -ERANGE; + + lov_fix_desc_stripe_count(&stripe_count); + desc->ld_default_stripe_count = stripe_count; + + return count; +} +LUSTRE_RW_ATTR(stripecount); + +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + + return sprintf(buf, "%u\n", 
desc->ld_tgt_count); +} +LUSTRE_RO_ATTR(numobd); + +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + + return sprintf(buf, "%u\n", desc->ld_active_tgt_count); +} +LUSTRE_RO_ATTR(activeobd); + +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + + return sprintf(buf, "%s\n", desc->ld_uuid.uuid); +} +LUSTRE_RO_ATTR(desc_uuid); + +#ifdef CONFIG_PROC_FS +static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + ++*pos; + } + return NULL; +} + +static void lov_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (++*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + } + return NULL; +} + +static int lov_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lov_tgt_desc *tgt = v; + + seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index, + obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static const struct seq_operations lov_tgt_sops = { + .start = lov_tgt_seq_start, + .stop = lov_tgt_seq_stop, + .next = lov_tgt_seq_next, + .show = lov_tgt_seq_show, +}; + +static int lov_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lov_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +struct lprocfs_vars lprocfs_lov_obd_vars[] = { + { .name = "stripesize", + .fops = &lov_stripesize_fops }, + { NULL } +}; + +static const struct proc_ops lov_proc_target_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lov_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static struct attribute *lov_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_numobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_stripeoffset.attr, + &lustre_attr_stripetype.attr, + &lustre_attr_stripecount.attr, + NULL, +}; + +int lov_tunables_init(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + struct obd_type *type; +#endif + int rc; + + obd->obd_vars = lprocfs_lov_obd_vars; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + /* If this is true then both client (lov) and server + * (lod) are on the same node. The lod layer if loaded + * first will register the lov proc directory. In that + * case obd->obd_type->typ_procroot will be not set. + * Instead we use type->typ_procsym as the parent. 
+ */ + type = class_search_type(LUSTRE_LOD_NAME); + if (type && type->typ_procsym) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } +#endif + obd->obd_ktype.default_attrs = lov_attrs; + rc = lprocfs_obd_setup(obd, false); + if (rc) + GOTO(out, rc); + +#ifdef CONFIG_PROC_FS + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", 0444, + &lov_proc_target_fops, obd); + if (rc) + CWARN("%s: Error adding the target_obd file : rc %d\n", + obd->obd_name, rc); + + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lov->lov_pool_proc_entry)) { + rc = PTR_ERR(lov->lov_pool_proc_entry); + CERROR("%s: error setting up debugfs for pools : rc %d\n", + obd->obd_name, rc); + lov->lov_pool_proc_entry = NULL; + } +#endif /* CONFIG_FS_PROC */ +out: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/Makefile b/drivers/staging/lustrefsx/lustre/mdc/Makefile new file mode 100644 index 0000000000000..7c9329681bdf2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_FS) += mdc.o + +mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o +mdc-y += mdc_changelog.o mdc_dev.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c new file mode 100644 index 0000000000000..0c2e79a2a336d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c @@ -0,0 +1,562 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include "mdc_internal.h" + +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t len; + + LPROCFS_CLIMP_CHECK(dev); + len = sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); + LPROCFS_CLIMP_EXIT(dev); + return len; +} + +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + /* opposite senses */ + if (dev->u.cli.cl_import->imp_deactive == val) + rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); + else + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", + val); + + return count; +} +LUSTRE_RW_ATTR(active); + +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t len; + u32 max; + + max = obd_get_max_rpcs_in_flight(&dev->u.cli); + len = sprintf(buf, "%u\n", max); + + return len; +} + +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + rc = obd_set_max_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; + + return count; +} +LUSTRE_RW_ATTR(max_rpcs_in_flight); + +static ssize_t max_mod_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + u16 max; + + max = obd_get_max_mod_rpcs_in_flight(&dev->u.cli); + return sprintf(buf, "%hu\n", max); +} + +static ssize_t max_mod_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + u16 val; + int rc; + + rc = kstrtou16(buffer, 10, &val); + if (rc) + return rc; + + rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; + + return count; +} +LUSTRE_RW_ATTR(max_mod_rpcs_in_flight); + +static int mdc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + unsigned long val; + + spin_lock(&cli->cl_loi_list_lock); + val = PAGES_TO_MiB(cli->cl_dirty_max_pages); + spin_unlock(&cli->cl_loi_list_lock); + + seq_printf(m, "%lu\n", val); + return 0; +} + +static ssize_t mdc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *sfl = file->private_data; + struct obd_device *dev = sfl->private; + struct client_obd *cli = &dev->u.cli; + s64 pages_number; + int rc; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + /* MB -> pages */ + pages_number = round_up(pages_number, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number <= 0 || + pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ + return -ERANGE; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + 
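+ /*
+ * The waiters woken below are presumably threads throttled on the
+ * previous dirty limit; waking them lets them re-check against the new
+ * cl_dirty_max_pages value while cl_loi_list_lock is still held.
+ */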
osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(mdc_max_dirty_mb); + +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + + return sprintf(buf, "%lld\n", od->od_contention_time); +} + +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + time64_t val; + int rc; + + rc = kstrtoll(buffer, 0, &val); + if (rc) + return rc; + + od->od_contention_time = val; + + return count; +} +LUSTRE_RW_ATTR(contention_seconds); + +LUSTRE_ATTR(mds_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static int mdc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t +mdc_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *sfl = file->private_data; + struct obd_device *dev = sfl->private; + struct client_obd *cli = &dev->u.cli; + __s64 pages_number; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - + kernbuf; + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0) + return -ERANGE; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} +LPROC_SEQ_FOPS(mdc_cached_mb); + +static int mdc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(mdc_unstable_stats); + +static ssize_t mdc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + + lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + + return len; +} + +static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = 
seq->private; + struct client_obd *cli = &dev->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); + + spin_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "\nread RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + i, r, pct(r, read_tot), pct(read_cum, read_tot), w, + pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 
0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +LPROC_SEQ_FOPS(mdc_rpc_stats); + +static int mdc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + ktime_get_real_ts64(&now); + + seq_printf(seq, "snapshot_time: %lld.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t%llu\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t mdc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} +LPROC_SEQ_FOPS(mdc_stats); + +static int mdc_dom_min_repsize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + seq_printf(m, "%u\n", dev->u.cli.cl_dom_min_inline_repsize); + + return 0; +} + +static ssize_t mdc_dom_min_repsize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev; + unsigned int val; + int rc; + + dev = ((struct seq_file *)file->private_data)->private; + rc = kstrtouint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + if (val > MDC_DOM_MAX_INLINE_REPSIZE) + return -ERANGE; + + dev->u.cli.cl_dom_min_inline_repsize = val; + return count; +} +LPROC_SEQ_FOPS(mdc_dom_min_repsize); + +LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(mdc, state); +LPROC_SEQ_FOPS_RW_TYPE(mdc, obd_max_pages_per_rpc); +LPROC_SEQ_FOPS_RW_TYPE(mdc, import); +LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); + +struct lprocfs_vars lprocfs_mdc_obd_vars[] = { + { .name = "connect_flags", + .fops = &mdc_connect_flags_fops }, + { .name = "mds_server_uuid", + .fops = &mdc_server_uuid_fops }, + { .name = "max_pages_per_rpc", + .fops = &mdc_obd_max_pages_per_rpc_fops }, + { .name = "max_dirty_mb", + .fops = &mdc_max_dirty_mb_fops }, + { .name = "mdc_cached_mb", + .fops = &mdc_cached_mb_fops }, + { .name = "timeouts", + .fops = &mdc_timeouts_fops }, + { .name = "import", + .fops = &mdc_import_fops }, + { .name = "state", + .fops = &mdc_state_fops }, + { .name = "pinger_recov", + .fops = &mdc_pinger_recov_fops }, + { .name = "rpc_stats", + .fops = &mdc_rpc_stats_fops }, + { .name = "unstable_stats", + .fops = &mdc_unstable_stats_fops }, + { .name = "mdc_stats", + .fops = &mdc_stats_fops }, + { .name = "mdc_dom_min_repsize", + .fops = &mdc_dom_min_repsize_fops }, + { NULL } +}; + +static struct attribute *mdc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_max_mod_rpcs_in_flight.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_mds_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + NULL, +}; + +int mdc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = mdc_attrs; + obd->obd_vars = lprocfs_mdc_obd_vars; + + rc = lprocfs_obd_setup(obd, false); + if (rc) + goto 
out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } +#endif + rc = sptlrpc_lprocfs_cliobd_attach(obd); + if (rc) { +#ifdef CONFIG_PROC_FS + lprocfs_free_md_stats(obd); +#endif + lprocfs_obd_cleanup(obd); + goto out_failed; + } + ptlrpc_lprocfs_register_obd(obd); + +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c new file mode 100644 index 0000000000000..1c8eb65110500 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c @@ -0,0 +1,864 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies + * Alternatives. + * + * Copyright (c) 2017, Intel Corporation. + * + * Author: Henri Doreau + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include +#include + +#include +#include + +#include "mdc_internal.h" + + +/* + * -- Changelog delivery through character device -- + */ + +/** + * Mutex to protect chlg_registered_devices below + */ +static DEFINE_MUTEX(chlg_registered_dev_lock); + +/** + * Global linked list of all registered devices (one per MDT). 
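+ *
+ * Each entry is a chlg_registered_dev (defined below): it owns the
+ * character device for one MDT and is shared by every mount point of
+ * that MDT through its ced_obds list, with its lifetime tracked by the
+ * ced_refs kref. The list and the per-entry ced_obds links are
+ * manipulated under chlg_registered_dev_lock.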
+ */ +static LIST_HEAD(chlg_registered_devices); + + +struct chlg_registered_dev { + /* Device name of the form "changelog-{MDTNAME}" */ + char ced_name[32]; + /* changelog char device */ + struct cdev ced_cdev; + struct device *ced_device; + /* OBDs referencing this device (multiple mount point) */ + struct list_head ced_obds; + /* Reference counter for proper deregistration */ + struct kref ced_refs; + /* Link within the global chlg_registered_devices */ + struct list_head ced_link; +}; + +struct chlg_reader_state { + /* Shortcut to the corresponding OBD device */ + struct obd_device *crs_obd; + /* the corresponding chlg_registered_dev */ + struct chlg_registered_dev *crs_ced; + /* Producer thread (if any) */ + struct task_struct *crs_prod_task; + /* An error occurred that prevents from reading further */ + int crs_err; + /* EOF, no more records available */ + bool crs_eof; + /* Desired start position */ + __u64 crs_start_offset; + /* Wait queue for the catalog processing thread */ + wait_queue_head_t crs_waitq_prod; + /* Wait queue for the record copy threads */ + wait_queue_head_t crs_waitq_cons; + /* Mutex protecting crs_rec_count and crs_rec_queue */ + struct mutex crs_lock; + /* Number of item in the list */ + __u64 crs_rec_count; + /* List of prefetched enqueued_record::enq_linkage_items */ + struct list_head crs_rec_queue; + unsigned int crs_last_catidx; + unsigned int crs_last_idx; + bool crs_poll; +}; + +struct chlg_rec_entry { + /* Link within the chlg_reader_state::crs_rec_queue list */ + struct list_head enq_linkage; + /* Data (enq_record) field length */ + __u64 enq_length; + /* Copy of a changelog record (see struct llog_changelog_rec) */ + struct changelog_rec enq_record[]; +}; + +enum { + /* Number of records to prefetch locally. */ + CDEV_CHLG_MAX_PREFETCH = 1024, +}; + +static DEFINE_IDR(chlg_minor_idr); +static DEFINE_SPINLOCK(chlg_minor_lock); + +static int chlg_minor_alloc(int *pminor) +{ + void *minor_allocated = (void *)-1; + int minor; + + idr_preload(GFP_KERNEL); + spin_lock(&chlg_minor_lock); + minor = idr_alloc(&chlg_minor_idr, minor_allocated, 0, + MDC_CHANGELOG_DEV_COUNT, GFP_NOWAIT); + spin_unlock(&chlg_minor_lock); + idr_preload_end(); + + if (minor < 0) + return minor; + + *pminor = minor; + return 0; +} + +static void chlg_minor_free(int minor) +{ + spin_lock(&chlg_minor_lock); + idr_remove(&chlg_minor_idr, minor); + spin_unlock(&chlg_minor_lock); +} + +static void chlg_device_release(struct device *dev) +{ + struct chlg_registered_dev *entry = dev_get_drvdata(dev); + + chlg_minor_free(MINOR(entry->ced_cdev.dev)); + OBD_FREE_PTR(entry); +} + +/** + * Deregister a changelog character device whose refcount has reached zero. 
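+ *
+ * This is the kref release callback for ced_refs: it unlinks the entry
+ * from chlg_registered_devices, deletes the cdev and destroys the device
+ * node. The memory itself is freed later, from chlg_device_release(),
+ * once the underlying struct device is released.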
+ */ +static void chlg_dev_clear(struct kref *kref) +{ + struct chlg_registered_dev *entry; + + ENTRY; + entry = container_of(kref, struct chlg_registered_dev, + ced_refs); + + list_del(&entry->ced_link); + cdev_del(&entry->ced_cdev); + device_destroy(mdc_changelog_class, entry->ced_cdev.dev); + EXIT; +} + +static inline struct obd_device* chlg_obd_get(struct chlg_registered_dev *dev) +{ + struct obd_device *obd; + + mutex_lock(&chlg_registered_dev_lock); + if (list_empty(&dev->ced_obds)) + return NULL; + + obd = list_first_entry(&dev->ced_obds, struct obd_device, + u.cli.cl_chg_dev_linkage); + class_incref(obd, "changelog", dev); + mutex_unlock(&chlg_registered_dev_lock); + return obd; +} + +static inline void chlg_obd_put(struct chlg_registered_dev *dev, + struct obd_device *obd) +{ + class_decref(obd, "changelog", dev); +} + +/** + * ChangeLog catalog processing callback invoked on each record. + * If the current record is eligible to userland delivery, push + * it into the crs_rec_queue where the consumer code will fetch it. + * + * @param[in] env (unused) + * @param[in] llh Client-side handle used to identify the llog + * @param[in] hdr Header of the current llog record + * @param[in,out] data chlg_reader_state passed from caller + * + * @return 0 or LLOG_PROC_* control code on success, negated error on failure. + */ +static int chlg_read_cat_process_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *hdr, void *data) +{ + struct llog_changelog_rec *rec; + struct chlg_reader_state *crs = data; + struct chlg_rec_entry *enq; + size_t len; + int rc; + ENTRY; + + LASSERT(crs != NULL); + LASSERT(hdr != NULL); + + rec = container_of(hdr, struct llog_changelog_rec, cr_hdr); + + crs->crs_last_catidx = llh->lgh_hdr->llh_cat_idx; + crs->crs_last_idx = hdr->lrh_index; + + if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { + rc = -EINVAL; + CERROR("%s: not a changelog rec %x/%d in llog "DFID" rc = %d\n", + crs->crs_obd->obd_name, rec->cr_hdr.lrh_type, + rec->cr.cr_type, + PFID(lu_object_fid(&llh->lgh_obj->do_lu)), rc); + RETURN(rc); + } + + /* Skip undesired records */ + if (rec->cr.cr_index < crs->crs_start_offset) + RETURN(0); + + CDEBUG(D_HSM, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID" %.*s\n", + rec->cr.cr_index, rec->cr.cr_type, + changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, + rec->cr.cr_flags & CLF_FLAGMASK, + PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), + rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); + + wait_event_interruptible(crs->crs_waitq_prod, + crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || + kthread_should_stop()); + + if (kthread_should_stop()) + RETURN(LLOG_PROC_BREAK); + + len = changelog_rec_size(&rec->cr) + rec->cr.cr_namelen; + OBD_ALLOC(enq, sizeof(*enq) + len); + if (enq == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&enq->enq_linkage); + enq->enq_length = len; + memcpy(enq->enq_record, &rec->cr, len); + + mutex_lock(&crs->crs_lock); + list_add_tail(&enq->enq_linkage, &crs->crs_rec_queue); + crs->crs_rec_count++; + mutex_unlock(&crs->crs_lock); + + wake_up_all(&crs->crs_waitq_cons); + + RETURN(0); +} + +/** + * Remove record from the list it is attached to and free it. + */ +static void enq_record_delete(struct chlg_rec_entry *rec) +{ + list_del(&rec->enq_linkage); + OBD_FREE(rec, sizeof(*rec) + rec->enq_length); +} + +/** + * Record prefetch thread entry point. Opens the changelog catalog and starts + * reading records. + * + * @param[in,out] args chlg_reader_state passed from caller. 
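+ *
+ * This is the producer side of the device: it opens CHANGELOG_CATALOG in
+ * the LLOG_CHANGELOG_REPL_CTXT context and walks it with
+ * chlg_read_cat_process_cb(), which queues up to CDEV_CHLG_MAX_PREFETCH
+ * records on crs_rec_queue and then blocks on crs_waitq_prod until
+ * readers drain the queue. In polling mode (crs_poll) the catalog is
+ * reopened and rescanned after a short sleep instead of reporting EOF.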
+ * @return 0 on success, negated error code on failure. + */ +static int chlg_load(void *args) +{ + struct chlg_reader_state *crs = args; + struct chlg_registered_dev *ced = crs->crs_ced; + struct obd_device *obd = NULL; + struct llog_ctxt *ctx = NULL; + struct llog_handle *llh = NULL; + int rc; + ENTRY; + + crs->crs_last_catidx = -1; + crs->crs_last_idx = 0; + +again: + obd = chlg_obd_get(ced); + if (obd == NULL) + RETURN(-ENODEV); + + crs->crs_obd = obd; + + ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctx == NULL) + GOTO(err_out, rc = -ENOENT); + + rc = llog_open(NULL, ctx, &llh, NULL, CHANGELOG_CATALOG, + LLOG_OPEN_EXISTS); + if (rc) { + CERROR("%s: fail to open changelog catalog: rc = %d\n", + obd->obd_name, rc); + GOTO(err_out, rc); + } + + + rc = llog_init_handle(NULL, llh, + LLOG_F_IS_CAT | + LLOG_F_EXT_JOBID | + LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | + LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | + LLOG_F_EXT_X_XATTR, + NULL); + if (rc) { + CERROR("%s: fail to init llog handle: rc = %d\n", + obd->obd_name, rc); + GOTO(err_out, rc); + } + + rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, + crs->crs_last_catidx, crs->crs_last_idx); + if (rc < 0) { + CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc); + GOTO(err_out, rc); + } + if (!kthread_should_stop() && crs->crs_poll) { + llog_cat_close(NULL, llh); + llog_ctxt_put(ctx); + class_decref(obd, "changelog", crs); + schedule_timeout_interruptible(HZ); + goto again; + } + + crs->crs_eof = true; + +err_out: + if (rc < 0) + crs->crs_err = rc; + + wake_up_all(&crs->crs_waitq_cons); + + if (llh != NULL) + llog_cat_close(NULL, llh); + + if (ctx != NULL) + llog_ctxt_put(ctx); + + crs->crs_obd = NULL; + chlg_obd_put(ced, obd); + wait_event_interruptible(crs->crs_waitq_prod, kthread_should_stop()); + + RETURN(rc); +} + +/** + * Read handler, dequeues records from the chlg_reader_state if any. + * No partial records are copied to userland so this function can return less + * data than required (short read). + * + * @param[in] file File pointer to the character device. + * @param[out] buff Userland buffer where to copy the records. + * @param[in] count Userland buffer size. + * @param[out] ppos File position, updated with the index number of the next + * record to read. + * @return number of copied bytes on success, negated error code on failure. 
+ */ +static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, + loff_t *ppos) +{ + struct chlg_reader_state *crs = file->private_data; + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + size_t written_total = 0; + ssize_t rc; + LIST_HEAD(consumed); + ENTRY; + + if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) { + if (crs->crs_err < 0) + RETURN(crs->crs_err); + else if (crs->crs_eof) + RETURN(0); + else + RETURN(-EAGAIN); + } + + rc = wait_event_interruptible(crs->crs_waitq_cons, + crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err); + + mutex_lock(&crs->crs_lock); + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { + if (written_total + rec->enq_length > count) + break; + + if (copy_to_user(buff, rec->enq_record, rec->enq_length)) { + rc = -EFAULT; + break; + } + + buff += rec->enq_length; + written_total += rec->enq_length; + + crs->crs_rec_count--; + list_move_tail(&rec->enq_linkage, &consumed); + + crs->crs_start_offset = rec->enq_record->cr_index + 1; + } + mutex_unlock(&crs->crs_lock); + + if (written_total > 0) { + rc = written_total; + wake_up_all(&crs->crs_waitq_prod); + } else if (rc == 0) { + rc = crs->crs_err; + } + + list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage) + enq_record_delete(rec); + + *ppos = crs->crs_start_offset; + + RETURN(rc); +} + +/** + * Jump to a given record index. Helper for chlg_llseek(). + * + * @param[in,out] crs Internal reader state. + * @param[in] offset Desired offset (index record). + * @return 0 on success, negated error code on failure. + */ +static int chlg_set_start_offset(struct chlg_reader_state *crs, __u64 offset) +{ + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + + mutex_lock(&crs->crs_lock); + if (offset < crs->crs_start_offset) { + mutex_unlock(&crs->crs_lock); + return -ERANGE; + } + + crs->crs_start_offset = offset; + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { + struct changelog_rec *cr = rec->enq_record; + + if (cr->cr_index >= crs->crs_start_offset) + break; + + crs->crs_rec_count--; + enq_record_delete(rec); + } + + mutex_unlock(&crs->crs_lock); + wake_up_all(&crs->crs_waitq_prod); + return 0; +} + +/** + * Move read pointer to a certain record index, encoded as an offset. + * + * @param[in,out] file File pointer to the changelog character device + * @param[in] off Offset to skip, actually a record index, not byte count + * @param[in] whence Relative/Absolute interpretation of the offset + * @return the resulting position on success or negated error code on failure. + */ +static loff_t chlg_llseek(struct file *file, loff_t off, int whence) +{ + struct chlg_reader_state *crs = file->private_data; + loff_t pos; + int rc; + + switch (whence) { + case SEEK_SET: + pos = off; + break; + case SEEK_CUR: + pos = file->f_pos + off; + break; + case SEEK_END: + default: + return -EINVAL; + } + + /* We cannot go backward */ + if (pos < file->f_pos) + return -EINVAL; + + rc = chlg_set_start_offset(crs, pos); + if (rc != 0) + return rc; + + file->f_pos = pos; + return pos; +} + +/** + * Clear record range for a given changelog reader. + * + * @param[in] crs Current internal state. + * @param[in] reader Changelog reader ID (cl1, cl2...) + * @param[in] record Record index up which to clear + * @return 0 on success, negated error code on failure. 
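+ *
+ * The clear request is forwarded to the MDT as a KEY_CHANGELOG_CLEAR
+ * obd_set_info_async() call. From userspace it is reached through the
+ * write() handler below: for example, writing the string "clear:cl1:12345"
+ * to the changelog character device clears records up to index 12345 for
+ * reader cl1.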
+ */ +static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record) +{ + struct obd_device *obd = NULL; + struct changelog_setinfo cs = { + .cs_recno = record, + .cs_id = reader + }; + int rc; + + obd = chlg_obd_get(crs->crs_ced); + if (obd == NULL) + return -ENODEV; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); + + chlg_obd_put(crs->crs_ced, obd); + return rc; +} + +/** Maximum changelog control command size */ +#define CHLG_CONTROL_CMD_MAX 64 + +/** + * Handle writes() into the changelog character device. Write() can be used + * to request special control operations. + * + * @param[in] file File pointer to the changelog character device + * @param[in] buff User supplied data (written data) + * @param[in] count Number of written bytes + * @param[in] off (unused) + * @return number of written bytes on success, negated error code on failure. + */ +static ssize_t chlg_write(struct file *file, const char __user *buff, + size_t count, loff_t *off) +{ + struct chlg_reader_state *crs = file->private_data; + char *kbuf; + __u64 record; + __u32 reader; + int rc = 0; + ENTRY; + + if (count > CHLG_CONTROL_CMD_MAX) + RETURN(-EINVAL); + + OBD_ALLOC(kbuf, CHLG_CONTROL_CMD_MAX); + if (kbuf == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(kbuf, buff, count)) + GOTO(out_kbuf, rc = -EFAULT); + + kbuf[CHLG_CONTROL_CMD_MAX - 1] = '\0'; + + if (sscanf(kbuf, "clear:cl%u:%llu", &reader, &record) == 2) + rc = chlg_clear(crs, reader, record); + else + rc = -EINVAL; + + EXIT; +out_kbuf: + OBD_FREE(kbuf, CHLG_CONTROL_CMD_MAX); + return rc < 0 ? rc : count; +} + +/** + * Open handler, initialize internal CRS state and spawn prefetch thread if + * needed. + * @param[in] inode Inode struct for the open character device. + * @param[in] file Corresponding file pointer. + * @return 0 on success, negated error code on failure. + */ +static int chlg_open(struct inode *inode, struct file *file) +{ + struct chlg_reader_state *crs; + struct chlg_registered_dev *dev; + struct task_struct *task; + int rc; + ENTRY; + + dev = container_of(inode->i_cdev, struct chlg_registered_dev, ced_cdev); + + OBD_ALLOC_PTR(crs); + if (!crs) + RETURN(-ENOMEM); + + kref_get(&dev->ced_refs); + crs->crs_ced = dev; + crs->crs_err = false; + crs->crs_eof = false; + + mutex_init(&crs->crs_lock); + INIT_LIST_HEAD(&crs->crs_rec_queue); + init_waitqueue_head(&crs->crs_waitq_prod); + init_waitqueue_head(&crs->crs_waitq_cons); + + if (file->f_mode & FMODE_READ) { + task = kthread_run(chlg_load, crs, "chlg_load_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start changelog thread: rc = %d\n", + dev->ced_name, rc); + GOTO(err_crs, rc); + } + crs->crs_prod_task = task; + } + + file->private_data = crs; + RETURN(0); + +err_crs: + kref_put(&dev->ced_refs, chlg_dev_clear); + OBD_FREE_PTR(crs); + return rc; +} + +/** + * Close handler, release resources. + * + * @param[in] inode Inode struct for the open character device. + * @param[in] file Corresponding file pointer. + * @return 0 on success, negated error code on failure. 
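+ *
+ * Stops the producer thread if one was started (its exit code becomes the
+ * return value), frees any records still sitting on crs_rec_queue and
+ * drops this reader's reference on the registered device.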
+ */ +static int chlg_release(struct inode *inode, struct file *file) +{ + struct chlg_reader_state *crs = file->private_data; + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + int rc = 0; + + if (crs->crs_prod_task) + rc = kthread_stop(crs->crs_prod_task); + + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) + enq_record_delete(rec); + + kref_put(&crs->crs_ced->ced_refs, chlg_dev_clear); + OBD_FREE_PTR(crs); + + return rc; +} + +/** + * Poll handler, indicates whether the device is readable (new records) and + * writable (always). + * + * @param[in] file Device file pointer. + * @param[in] wait (opaque) + * @return combination of the poll status flags. + */ +static unsigned int chlg_poll(struct file *file, poll_table *wait) +{ + struct chlg_reader_state *crs = file->private_data; + unsigned int mask = 0; + + mutex_lock(&crs->crs_lock); + poll_wait(file, &crs->crs_waitq_cons, wait); + if (crs->crs_rec_count > 0) + mask |= POLLIN | POLLRDNORM; + if (crs->crs_err) + mask |= POLLERR; + if (crs->crs_eof) + mask |= POLLHUP; + mutex_unlock(&crs->crs_lock); + return mask; +} + +static long chlg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc; + + struct chlg_reader_state *crs = file->private_data; + switch (cmd) { + case OBD_IOC_CHLG_POLL: + crs->crs_poll = !!arg; + rc = 0; + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + +static const struct file_operations chlg_fops = { + .owner = THIS_MODULE, + .llseek = chlg_llseek, + .read = chlg_read, + .write = chlg_write, + .open = chlg_open, + .release = chlg_release, + .poll = chlg_poll, + .unlocked_ioctl = chlg_ioctl, +}; + +/** + * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600" + * and returns a name of the form: "changelog-testfs-MDT0000". + */ +static void get_target_name(char *name, size_t name_len, struct obd_device *obd) +{ + int i; + + snprintf(name, name_len, "%s", obd->obd_name); + + /* Find the 2nd '-' from the end and truncate on it */ + for (i = 0; i < 2; i++) { + char *p = strrchr(name, '-'); + + if (p == NULL) + return; + *p = '\0'; + } +} + +/** + * Find a changelog character device by name. + * All devices registered during MDC setup are listed in a global list with + * their names attached. + */ +static struct chlg_registered_dev * +chlg_registered_dev_find_by_name(const char *name) +{ + struct chlg_registered_dev *dit; + + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); + list_for_each_entry(dit, &chlg_registered_devices, ced_link) + if (strcmp(name, dit->ced_name) == 0) + return dit; + return NULL; +} + +/** + * Find chlg_registered_dev structure for a given OBD device. + * This is bad O(n^2) but for each filesystem: + * - N is # of MDTs times # of mount points + * - this only runs at shutdown + */ +static struct chlg_registered_dev * +chlg_registered_dev_find_by_obd(const struct obd_device *obd) +{ + struct chlg_registered_dev *dit; + struct obd_device *oit; + + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); + list_for_each_entry(dit, &chlg_registered_devices, ced_link) + list_for_each_entry(oit, &dit->ced_obds, + u.cli.cl_chg_dev_linkage) + if (oit == obd) + return dit; + return NULL; +} + +/** + * Changelog character device initialization. + * Register a misc character device with a dynamic minor number, under a name + * of the form: 'changelog-fsname-MDTxxxx'. Reference this OBD device with it. + * + * @param[in] obd This MDC obd_device. + * @return 0 on success, negated error code on failure. 
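The poll() handler above reports POLLIN|POLLRDNORM once records are queued, POLLERR if the producer thread hit an error, and POLLHUP once it reached the end of the changelog; the OBD_IOC_CHLG_POLL ioctl sets the crs_poll flag, which presumably keeps the producer looking for new records instead of reporting EOF. Below is a small user-space sketch of an event loop built on these semantics; the device path is hypothetical, and the ioctl call is omitted because its request code is defined in the Lustre UAPI headers.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64 * 1024];
	struct pollfd pfd;

	pfd.fd = open("/dev/changelog-testfs-MDT0000", O_RDONLY | O_NONBLOCK);
	if (pfd.fd < 0) {
		perror("open");
		return 1;
	}
	pfd.events = POLLIN;

	for (;;) {
		ssize_t nread;

		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			break;
		}
		if (pfd.revents & (POLLERR | POLLHUP))
			break;	/* producer error or end of the changelog */
		if (!(pfd.revents & POLLIN))
			continue;

		nread = read(pfd.fd, buf, sizeof(buf));
		if (nread <= 0)
			break;
		printf("read %zd bytes\n", nread);
	}

	close(pfd.fd);
	return 0;
}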
+ */ +int mdc_changelog_cdev_init(struct obd_device *obd) +{ + struct chlg_registered_dev *exist; + struct chlg_registered_dev *entry; + struct device *device; + dev_t dev; + int minor, rc; + ENTRY; + + OBD_ALLOC_PTR(entry); + if (entry == NULL) + RETURN(-ENOMEM); + + get_target_name(entry->ced_name, sizeof(entry->ced_name), obd); + + kref_init(&entry->ced_refs); + INIT_LIST_HEAD(&entry->ced_obds); + INIT_LIST_HEAD(&entry->ced_link); + + mutex_lock(&chlg_registered_dev_lock); + exist = chlg_registered_dev_find_by_name(entry->ced_name); + if (exist != NULL) { + kref_get(&exist->ced_refs); + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &exist->ced_obds); + GOTO(out_unlock, rc = 0); + } + + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); + list_add_tail(&entry->ced_link, &chlg_registered_devices); + + /* Register new character device */ + cdev_init(&entry->ced_cdev, &chlg_fops); + entry->ced_cdev.owner = THIS_MODULE; + + rc = chlg_minor_alloc(&minor); + if (rc) + GOTO(out_unlock, rc); + + dev = MKDEV(MAJOR(mdc_changelog_dev), minor); + rc = cdev_add(&entry->ced_cdev, dev, 1); + if (rc) + GOTO(out_minor, rc); + + device = device_create(mdc_changelog_class, NULL, dev, entry, "%s-%s", + MDC_CHANGELOG_DEV_NAME, entry->ced_name); + if (IS_ERR(device)) + GOTO(out_cdev, rc = PTR_ERR(device)); + + device->release = chlg_device_release; + entry->ced_device = device; + + entry = NULL; /* prevent it from being freed below */ + GOTO(out_unlock, rc = 0); + +out_cdev: + cdev_del(&entry->ced_cdev); + +out_minor: + chlg_minor_free(minor); + + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + list_del(&entry->ced_link); + +out_unlock: + mutex_unlock(&chlg_registered_dev_lock); + if (entry) + OBD_FREE_PTR(entry); + RETURN(rc); +} + +/** + * Release OBD, decrease reference count of the corresponding changelog device. + */ +void mdc_changelog_cdev_finish(struct obd_device *obd) +{ + struct chlg_registered_dev *dev; + + ENTRY; + mutex_lock(&chlg_registered_dev_lock); + dev = chlg_registered_dev_find_by_obd(obd); + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + kref_put(&dev->ced_refs, chlg_dev_clear); + mutex_unlock(&chlg_registered_dev_lock); + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c new file mode 100644 index 0000000000000..3606778434879 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c @@ -0,0 +1,1564 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device, cl_req for MDC layer. 
+ * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include + +#include "mdc_internal.h" + +static void mdc_lock_build_policy(const struct lu_env *env, + union ldlm_policy_data *policy) +{ + memset(policy, 0, sizeof *policy); + policy->l_inodebits.bits = MDS_INODELOCK_DOM; +} + +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + return osc_ldlm_glimpse_ast(dlmlock, data); +} + +static void mdc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *lock, + struct osc_object *osc, + struct ldlm_enqueue_info *einfo) +{ + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); + einfo->ei_cb_bl = mdc_ldlm_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ +} + +static void mdc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); + +static int mdc_set_dom_lock_data(struct ldlm_lock *lock, void *data) +{ + int set = 0; + + LASSERT(lock != NULL); + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + lock_res_and_lock(lock); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + unlock_res_and_lock(lock); + + return set; +} + +int mdc_dom_lock_match(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, int unref) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + + ENTRY; + + rc = ldlm_lock_match(obd->obd_namespace, lflags, + res_id, type, policy, mode, lockh, unref); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (obj != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (mdc_set_dom_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + mdc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +/** + * Finds an existing lock covering a page with given index. + * Copy of osc_obj_dlmlock_at_pgoff() but for DoM IBITS lock. + */ +struct ldlm_lock *mdc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + + ENTRY; + + fid_build_reg_res_name(lu_object_fid(osc2lu(obj)), resname); + mdc_lock_build_policy(env, policy); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + +again: + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. 
*/ + mode = mdc_dom_lock_match(env, osc_export(obj), resname, LDLM_IBITS, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, + dap_flags & OSC_DAP_FL_CANCELING); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int mdc_check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + pgoff_t index; + + index = osc_index(ops); + if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* refresh non-overlapped index */ + tmp = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK); + if (tmp != NULL) { + info->oti_fn_index = CL_PAGE_EOF; + LDLM_LOCK_PUT(tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +static int mdc_lock_discard_pages(const struct lu_env *env, + struct osc_object *osc, + pgoff_t start, pgoff_t end, + bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = &info->oti_io; + osc_page_gang_cbt cb; + int res; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? osc_discard_cb : mdc_check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + do { + res = osc_page_gang_lookup(env, io, osc, info->oti_next_index, + end, cb, (void *)osc); + if (info->oti_next_index > end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + RETURN(result); +} + +static int mdc_lock_flush(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, enum cl_lock_mode mode, + bool discard) +{ + int result = 0; + int rc; + + ENTRY; + + if (mode == CLM_WRITE) { + result = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, result, + discard ? 
"discarded" : "written back"); + if (result > 0) + result = 0; + } + + rc = mdc_lock_discard_pages(env, obj, start, end, discard); + if (result == 0 && rc < 0) + result = rc; + + RETURN(result); +} + +void mdc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; + int rc; + + LASSERT(ols->ols_dlmlock == NULL); + rc = mdc_lock_flush(env, osc, descr->cld_start, descr->cld_end, + descr->cld_mode, 0); + if (rc != 0) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, rc); + + osc_lock_wake_waiters(env, osc, ols); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int mdc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + int flag) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + + ENTRY; + + LASSERT(flag == LDLM_CB_CANCELING); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + dlmlock->l_ast_data = NULL; + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + /* Destroy pages covered by the extent of the DLM lock */ + result = mdc_lock_flush(env, cl2osc(obj), cl_index(obj, 0), + CL_PAGE_EOF, mode, discard); + /* Losing a lock, set KMS to 0. + * NB: assumed that DOM lock covers whole data on MDT. + */ + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + cl_object_attr_lock(obj); + attr->cat_kms = 0; + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + cl_object_put(env, obj); + } + RETURN(result); +} + +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int flag) +{ + int rc = 0; + + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc == -ENODATA) + rc = 0; + break; + } + case LDLM_CB_CANCELING: { + struct lu_env *env; + __u16 refcheck; + + /* + * This can be called in the context of outer IO, e.g., + * + * osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer + * context. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + break; + } + + rc = mdc_dlm_blocking_ast0(env, dlmlock, flag); + cl_env_put(env, &refcheck); + break; + } + default: + LBUG(); + } + RETURN(rc); +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. 
+ */ +void mdc_lock_lvb_update(const struct lu_env *env, struct osc_object *osc, + struct ldlm_lock *dlmlock, struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | + CAT_SIZE; + unsigned int setkms = 0; + + ENTRY; + + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = &dlmlock->l_ost_lvb; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + size = lvb->lvb_size; + + if (size >= oinfo->loi_kms) { + valid |= CAT_KMS; + attr->cat_kms = size; + setkms = 1; + } + } + + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + EXIT; +} + +static void mdc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh) +{ + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, 0); + descr->cld_end = CL_PAGE_EOF; + + /* no lvb update for matched lock */ + if (!ldlm_is_lvb_cached(dlmlock)) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + LASSERT(osc == dlmlock->l_ast_data); + mdc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); + } + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; + EXIT; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. + */ +static int mdc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). 
*/ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + CDEBUG(D_INODE, "rc %d, err %d\n", rc, errcode); + if (rc == 0) + mdc_lock_granted(env, oscl, lockh); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops != oscl->ols_lockless_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(env, oscl, 1); + oscl->ols_state = OLS_GRANTED; + rc = 0; + } else if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + mdc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. */ + rc = 0; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +int mdc_fill_lvb(struct ptlrpc_request *req, struct ost_lvb *lvb) +{ + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!body) + RETURN(-EPROTO); + + lvb->lvb_mtime = body->mbo_mtime; + lvb->lvb_atime = body->mbo_atime; + lvb->lvb_ctime = body->mbo_ctime; + lvb->lvb_blocks = body->mbo_dom_blocks; + lvb->lvb_size = body->mbo_dom_size; + + RETURN(0); +} + +int mdc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, + void *cookie, struct lustre_handle *lockh, + enum ldlm_mode mode, __u64 *flags, int errcode) +{ + struct osc_lock *ols = cookie; + struct ldlm_lock *lock; + int rc = 0; + + ENTRY; + + /* The request was created before ldlm_cli_enqueue call. */ + if (errcode == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(rep != NULL); + + rep->lock_policy_res2 = + ptlrpc_status_ntoh(rep->lock_policy_res2); + if (rep->lock_policy_res2) + errcode = rep->lock_policy_res2; + + rc = mdc_fill_lvb(req, &ols->ols_lvb); + *flags |= LDLM_FL_LVB_READY; + } else if (errcode == ELDLM_OK) { + /* Callers have references, should be valid always */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock); + + rc = mdc_fill_lvb(req, &lock->l_ost_lvb); + LDLM_LOCK_PUT(lock); + *flags |= LDLM_FL_LVB_READY; + } + + /* Call the update callback. */ + rc = (*upcall)(cookie, lockh, rc < 0 ? rc : errcode); + + /* release the reference taken in ldlm_cli_enqueue() */ + if (errcode == ELDLM_LOCK_MATCHED) + errcode = ELDLM_OK; + if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) + ldlm_lock_decref(lockh, mode); + + RETURN(rc); +} + +int mdc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct lustre_handle *lockh = &aa->oa_lockh; + enum ldlm_mode mode = aa->oa_mode; + + ENTRY; + + LASSERT(!aa->oa_speculative); + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, + "lockh %#llx, req %p, aa %p - client evicted?\n", + lockh->cookie, req, aa); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). 
*/ + ldlm_lock_addref(lockh, mode); + + /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1, + aa->oa_mode, aa->oa_flags, NULL, 0, + lockh, rc); + /* Complete mdc stuff. */ + rc = mdc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, + aa->oa_flags, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is excluded from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int mdc_enqueue_send(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, __u64 *flags, + union ldlm_policy_data *policy, + struct ost_lvb *lvb, int kms_valid, + osc_enqueue_upcall_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, int async) +{ + struct obd_device *obd = exp->exp_obd; + struct lustre_handle lockh = { 0 }; + struct ptlrpc_request *req = NULL; + struct ldlm_intent *lit; + enum ldlm_mode mode; + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + __u64 match_flags = *flags; + int rc; + + ENTRY; + + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + + if (glimpse) + match_flags |= LDLM_FL_BLOCK_GRANTED; + /* DOM locking uses LDLM_FL_KMS_IGNORE to mark locks wich have no valid + * LVB information, e.g. canceled locks or locks of just pruned object, + * such locks should be skipped. + */ + mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, + einfo->ei_type, policy, mode, &lockh, 0); + if (mode) { + struct ldlm_lock *matched; + + if (*flags & LDLM_FL_TEST_LOCK) + RETURN(ELDLM_OK); + + matched = ldlm_handle2lock(&lockh); + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GLIMPSE_DDOS)) + ldlm_set_kms_ignore(matched); + + if (mdc_set_dom_lock_data(matched, einfo->ei_cbdata)) { + *flags |= LDLM_FL_LVB_READY; + + /* We already have a lock, and it's referenced. */ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_INTENT); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = glimpse ? 
IT_GLIMPSE : IT_BRW; + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* users of mdc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + /* All MDC IO locks are intents */ + *flags |= LDLM_FL_HAS_INTENT; + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, NULL, + 0, LVB_T_NONE, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = false; + aa->oa_flags = flags; + aa->oa_lvb = lvb; + + req->rq_interpret_reply = + (ptlrpc_interpterer_t)mdc_enqueue_interpret; + ptlrpcd_add_req(req); + } else { + ptlrpc_req_finished(req); + } + RETURN(rc); + } + + rc = mdc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, + flags, rc); + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. + */ +static int mdc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = mdc_lock_upcall; + void *cookie = (void *)oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + /* Lockahead is not supported on MDT yet */ + if (oscl->ols_flags & LDLM_FL_NO_EXPANSION) { + result = -EOPNOTSUPP; + RETURN(result); + } + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + if (oscl->ols_glimpse) { + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. */ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. 
+ */ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + mdc_lock_build_policy(env, policy); + LASSERT(!oscl->ols_speculative); + result = mdc_enqueue_send(env, osc_export(osc), resname, + &oscl->ols_flags, policy, + &oscl->ols_lvb, osc->oo_oinfo->loi_kms_valid, + upcall, cookie, &oscl->ols_einfo, async); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +static const struct cl_lock_operations mdc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = mdc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +static const struct cl_lock_operations mdc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = osc_lock_cancel, + .clo_print = osc_lock_print, +}; + +int mdc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct osc_lock *ols; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + __u64 flags = osc_enq2ldlm_flags(enqflags); + + ENTRY; + + /* Ignore AGL for Data-on-MDT, stat returns size data */ + if ((enqflags & CEF_SPECULATIVE) != 0) + RETURN(0); + + OBD_SLAB_ALLOC_PTR_GFP(ols, osc_lock_kmem, GFP_NOFS); + if (unlikely(ols == NULL)) + RETURN(-ENOMEM); + + ols->ols_state = OLS_NEW; + spin_lock_init(&ols->ols_lock); + INIT_LIST_HEAD(&ols->ols_waiting_list); + INIT_LIST_HEAD(&ols->ols_wait_entry); + INIT_LIST_HEAD(&ols->ols_nextlock_oscobj); + ols->ols_lockless_ops = &mdc_lock_lockless_ops; + + ols->ols_flags = flags; + ols->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + + if (ols->ols_flags & LDLM_FL_HAS_INTENT) { + ols->ols_flags |= LDLM_FL_BLOCK_GRANTED; + ols->ols_glimpse = 1; + } + mdc_lock_build_einfo(env, lock, cl2osc(obj), &ols->ols_einfo); + + cl_lock_slice_add(lock, &ols->ols_cl, obj, &mdc_lock_ops); + + if (!(enqflags & CEF_MUST)) + osc_lock_to_lockless(env, ols, (enqflags & CEF_NEVER)); + if (ols->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) + ols->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, ols); + + LDLM_DEBUG_NOLOCK("lock %p, mdc lock %p, flags %llx\n", + lock, ols, ols->ols_flags); + RETURN(0); +} + +/** + * IO operations. + * + * An implementation of cl_io_operations specific methods for MDC layer. 
+ * + */ +static int mdc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc, + pgoff_t index, struct lustre_handle *lh) +{ + struct ldlm_lock *lock; + + /* find DOM lock protecting object */ + lock = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | + OSC_DAP_FL_CANCELING); + if (lock == NULL) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + resname = &osc_env_info(env)->oti_resname; + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + res = ldlm_resource_get(osc_export(osc)->exp_obd->obd_namespace, + NULL, resname, LDLM_IBITS, 0); + ldlm_resource_dump(D_ERROR, res); + libcfs_debug_dumpstack(NULL); + return -ENOENT; + } else { + *lh = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + return 0; +} + +static int mdc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int rc; + + /* silently ignore non-truncate setattr for Data-on-MDT object */ + if (cl_io_is_trunc(io)) { + /* truncate cache dirty pages first */ + rc = osc_cache_truncate_start(env, cl2osc(obj), size, + &oio->oi_trunc); + if (rc < 0) + return rc; + } + + if (oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + if (rc == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_avalid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_xvalid & OP_XVALID_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + rc = cl_object_attr_update(env, obj, attr, cl_valid); + } + cl_object_attr_unlock(obj); + if (rc < 0) + return rc; + } + + if (!(ia_avalid & ATTR_SIZE)) + return 0; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME | OBD_MD_FLSIZE | + OBD_MD_FLBLOCKS; + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } else { + rc = mdc_get_lock_handle(env, cl2osc(obj), CL_PAGE_EOF, + &oa->o_handle); + if (!rc) + oa->o_valid |= OBD_MD_FLHANDLE; + } + + init_completion(&cbargs->opc_sync); + + rc = osc_punch_send(osc_export(cl2osc(obj)), oa, + mdc_async_upcall, cbargs); + cbargs->opc_rpc_sent = rc == 0; + return rc; +} + +static int mdc_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = 
mdc_dlmlock_at_pgoff(env, osc, start, 0); + if (dlmlock == NULL) + RETURN(-ENODATA); + + if (dlmlock->l_req_mode != LCK_PR) { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, dlmlock->l_req_mode); + } + + ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc; + ra->cra_end = CL_PAGE_EOF; + ra->cra_release = osc_read_ahead_release; + ra->cra_cbdata = dlmlock; + + RETURN(0); +} + +int mdc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + int result = 0; + + ENTRY; + + /* a MDC lock always covers whole object, do sync for whole + * possible range despite of supplied start/end values. + */ + result = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + rc = osc_cache_wait_range(env, osc, 0, CL_PAGE_EOF); + if (result == 0) + result = rc; + /* Use OSC sync code because it is asynchronous. + * It is to be added into MDC and avoid the using of + * OST_SYNC at both MDC and MDT. + */ + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +struct mdc_data_version_args { + struct osc_io *dva_oio; +}; + +static int +mdc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *arg, int rc) +{ + struct mdc_data_version_args *dva = arg; + struct osc_io *oio = dva->dva_oio; + const struct mdt_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* Prepare OBDO from mdt_body for CLIO */ + oio->oi_oa.o_valid = body->mbo_valid; + oio->oi_oa.o_flags = body->mbo_flags; + oio->oi_oa.o_data_version = body->mbo_version; + oio->oi_oa.o_layout_version = body->mbo_layout_gen; + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + return 0; +} + +static int mdc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct mdt_body *body; + struct mdc_data_version_args *dva; + int rc; + + ENTRY; + + memset(&oio->oi_oa, 0, sizeof(oio->oi_oa)); + oio->oi_oa.o_oi.oi_fid = *lu_object_fid(osc2lu(obj)); + oio->oi_oa.o_valid = OBD_MD_FLID; + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + body->mbo_fid1 = *lu_object_fid(osc2lu(obj)); + body->mbo_valid = OBD_MD_FLID; + /* Indicate that data version is needed */ + body->mbo_valid |= OBD_MD_FLDATAVERSION; + body->mbo_flags = 0; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + body->mbo_valid |= OBD_MD_FLFLAGS; + body->mbo_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + body->mbo_flags |= 
OBD_FL_FLUSH; + } + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = mdc_data_version_interpret; + CLASSERT(sizeof(*dva) <= sizeof(req->rq_async_args)); + dva = ptlrpc_req_async_args(req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void mdc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else { + slice->cis_io->ci_result = 0; + if (!(oio->oi_oa.o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oio->oi_oa.o_layout_version; + if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oio->oi_oa.o_data_version; + } + + EXIT; +} + +static struct cl_io_operations mdc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_read_start, + }, + [CIT_WRITE] = { + .cio_iter_init = osc_io_write_iter_init, + .cio_iter_fini = osc_io_write_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = mdc_io_setattr_start, + .cio_end = osc_io_setattr_end, + }, + [CIT_DATA_VERSION] = { + .cio_start = mdc_io_data_version_start, + .cio_end = mdc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + }, + [CIT_FSYNC] = { + .cio_start = mdc_io_fsync_start, + .cio_end = osc_io_fsync_end, + }, + }, + .cio_read_ahead = mdc_io_read_ahead, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async, +}; + +int mdc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &mdc_io_ops); + return 0; +} + +static void mdc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for MDC + * layer. MDC is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. 
+ */ +static void mdc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + u64 flags = attr->cra_flags; + + /* Copy object FID to cl_attr */ + attr->cra_oa->o_oi.oi_fid = *lu_object_fid(&obj->co_lu); + + if (flags & OBD_MD_FLGROUP) + attr->cra_oa->o_valid |= OBD_MD_FLGROUP; + + if (flags & OBD_MD_FLID) + attr->cra_oa->o_valid |= OBD_MD_FLID; + + if (flags & OBD_MD_FLHANDLE) { + struct osc_page *opg; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); + if (!opg->ops_srvlock) { + int rc; + + rc = mdc_get_lock_handle(env, cl2osc(obj), + osc_index(opg), + &attr->cra_oa->o_handle); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + LBUG(); + } else { + attr->cra_oa->o_valid |= OBD_MD_FLHANDLE; + } + } + } +} + +static int mdc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + if (OST_LVB_IS_ERR(oinfo->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(oinfo->loi_lvb.lvb_blocks); + + return osc_attr_get(env, obj, attr); +} + +static int mdc_object_ast_clear(struct ldlm_lock *lock, void *data) +{ + struct osc_object *osc = (struct osc_object *)data; + struct ost_lvb *lvb = &lock->l_ost_lvb; + struct lov_oinfo *oinfo; + ENTRY; + + if (lock->l_ast_data == data) { + lock->l_ast_data = NULL; + + LASSERT(osc != NULL); + LASSERT(osc->oo_oinfo != NULL); + LASSERT(lvb != NULL); + + /* Updates lvb in lock by the cached oinfo */ + oinfo = osc->oo_oinfo; + + LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " + "%llu %llu %llu by oinfo size %llu blocks %llu " + "[cma]time %llu %llu %llu", lvb->lvb_size, + lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, + lvb->lvb_atime, oinfo->loi_lvb.lvb_size, + oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, + oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); + LASSERT(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms); + + cl_object_attr_lock(&osc->oo_cl); + memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); + cl_object_attr_unlock(&osc->oo_cl); + ldlm_clear_lvb_cached(lock); + } + RETURN(LDLM_ITER_CONTINUE); +} + +int mdc_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + + /* DLM locks don't hold a reference of osc_object so we have to + * clear it before the object is being destroyed. */ + osc_build_res_name(osc, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + mdc_object_ast_clear, osc); + return 0; +} + +static int mdc_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + /* if lock cancel is initiated from llite then it is combined + * lock with DOM bit and it may have no l_ast_data initialized yet, + * so init it here with given osc_object. 
+ */ + mdc_set_dom_lock_data(lock, cl2osc(obj)); + RETURN(mdc_dlm_blocking_ast0(env, lock, LDLM_CB_CANCELING)); +} + +static const struct cl_object_operations mdc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = mdc_lock_init, + .coo_io_init = mdc_io_init, + .coo_attr_get = mdc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_req_attr_set = mdc_req_attr_set, + .coo_prune = mdc_object_prune, + .coo_object_flush = mdc_object_flush +}; + +static const struct osc_object_operations mdc_object_ops = { + .oto_build_res_name = mdc_build_res_name, + .oto_dlmlock_at_pgoff = mdc_dlmlock_at_pgoff, +}; + +static int mdc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + + if (osc->oo_initialized) + return 0; + + osc->oo_initialized = true; + + return osc_object_init(env, obj, conf); +} + +static void mdc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + osc_object_free(env, obj); +} + +static const struct lu_object_operations mdc_lu_obj_ops = { + .loo_object_init = mdc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = mdc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *mdc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &mdc_ops; + obj->lo_ops = &mdc_lu_obj_ops; + osc->oo_obj_ops = &mdc_object_ops; + osc->oo_initialized = false; + } else { + obj = NULL; + } + return obj; +} + +static int mdc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + return mdc_process_config(d->ld_obd, 0, cfg); +} + +const struct lu_device_operations mdc_lu_ops = { + .ldo_object_alloc = mdc_object_alloc, + .ldo_process_config = mdc_cl_process_config, + .ldo_recovery_complete = NULL, +}; + +static struct lu_device *mdc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &mdc_lu_ops; + + /* Setup MDC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + if (obd == NULL) + RETURN(ERR_PTR(-ENODEV)); + + rc = mdc_setup(obd, cfg); + if (rc < 0) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + od->od_exp = obd->obd_self_export; + RETURN(d); +} + +static const struct lu_device_type_operations mdc_device_type_ops = { + .ldto_device_alloc = mdc_device_alloc, + .ldto_device_free = osc_device_free, + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type mdc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_MDC_NAME, + .ldt_ops = &mdc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h new file mode 100644 index 0000000000000..c0df4152bf80f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h @@ -0,0 +1,186 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT 
NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _MDC_INTERNAL_H +#define _MDC_INTERNAL_H + +#include + +int mdc_tunables_init(struct obd_device *obd); + +void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, + u64 valid, size_t ea_size, u32 suppgid, u32 flags); +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data); +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, + const struct lu_fid *fid); +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, + struct md_op_data *data, size_t ea_size); +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, size_t ealen); +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t capability, __u64 rdev); +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + umode_t mode, __u64 rdev, __u64 flags, + const void *data, size_t datalen); +void mdc_file_secctx_pack(struct ptlrpc_request *req, + const char *secctx_name, + const void *secctx, size_t secctx_size); +void mdc_file_sepol_pack(struct ptlrpc_request *req); + +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_getxattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen); +void mdc_migrate_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *name, size_t namelen); +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); + +/* mdc/mdc_locks.c */ +int mdc_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits); + +int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid); + +int mdc_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags); + +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, enum 
ldlm_mode mode, + __u64 bits); +int mdc_save_lovea(struct ptlrpc_request *req, + const struct req_msg_field *field, + void *data, u32 size); +/* mdc/mdc_request.c */ +int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); +int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg); +int mdc_process_config(struct obd_device *obd, size_t len, void *buf); + +struct obd_client_handle; + +int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *lmv_exp, + struct lustre_md *md); + +int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md); + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it); + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och); +void mdc_commit_open(struct ptlrpc_request *req); +void mdc_replay_open(struct ptlrpc_request *req); + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, + umode_t mode, uid_t uid, gid_t gid, + cfs_cap_t capability, __u64 rdev, + struct ptlrpc_request **request); +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, size_t newlen, + struct ptlrpc_request **request); +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request); +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_file_resync(struct obd_export *exp, struct md_op_data *data); +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque); + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits); + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo); + +enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh); + + +#define MDC_CHANGELOG_DEV_COUNT LMV_MAX_STRIPE_COUNT +#define MDC_CHANGELOG_DEV_NAME "changelog" +extern struct class *mdc_changelog_class; +extern dev_t mdc_changelog_dev; + +int mdc_changelog_cdev_init(struct obd_device *obd); + +void mdc_changelog_cdev_finish(struct obd_device *obd); + +static inline int mdc_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, int opc, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels, + count); +} + +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + /* save hash 0 with hash 1 */ + return ~0UL - (hash + !hash); +} + +/* mdc_dev.c */ +extern struct lu_device_type mdc_device_type; +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int flag); +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +int mdc_fill_lvb(struct ptlrpc_request *req, struct ost_lvb *lvb); + +/* the minimum inline repsize should be PAGE_SIZE at least */ +#define MDC_DOM_DEF_INLINE_REPSIZE max(8192UL, PAGE_SIZE) +#define MDC_DOM_MAX_INLINE_REPSIZE 
XATTR_SIZE_MAX + +#endif diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c new file mode 100644 index 0000000000000..dcc42508aca98 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c @@ -0,0 +1,630 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include +#include +#include "mdc_internal.h" + +static void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags) +{ + mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll); + mrc->cr_flags_h = (__u32)(flags >> 32); +} + +static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid) +{ + LASSERT (b != NULL); + + b->mbo_suppgid = suppgid; + b->mbo_uid = from_kuid(&init_user_ns, current_uid()); + b->mbo_gid = from_kgid(&init_user_ns, current_gid()); + b->mbo_fsuid = from_kuid(&init_user_ns, current_fsuid()); + b->mbo_fsgid = from_kgid(&init_user_ns, current_fsgid()); + b->mbo_capability = cfs_curproc_cap_pack(); +} + +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + __mdc_pack_body(b, op_data->op_suppgids[0]); + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; +} + +void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, + u64 valid, size_t ea_size, u32 suppgid, u32 flags) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + b->mbo_valid = valid; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; + __mdc_pack_body(b, suppgid); + if (fid) { + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; + } +} + +/** + * Pack a name (path component) into a request + * + * \param[in] req request + * \param[in] field request field (usually RMF_NAME) + * \param[in] name path component + * \param[in] name_len length of path component + * + * \a field must be present in \a req and of size \a name_len + 1. + * + * \a name must be '\0' terminated of length \a name_len and represent + * a single path component (not contain '/'). 
+ */ +static void mdc_pack_name(struct ptlrpc_request *req, + const struct req_msg_field *field, + const char *name, size_t name_len) +{ + char *buf; + size_t buf_size; + size_t cpy_len; + + buf = req_capsule_client_get(&req->rq_pill, field); + buf_size = req_capsule_get_size(&req->rq_pill, field, RCL_CLIENT); + + LASSERT(name != NULL && name_len != 0 && + buf != NULL && buf_size == name_len + 1); + + cpy_len = strlcpy(buf, name, buf_size); + + LASSERT(lu_name_is_valid_2(buf, cpy_len)); + if (cpy_len != name_len) + CDEBUG(D_DENTRY, "%s: %s len %zd != %zd, concurrent rename?\n", + req->rq_export->exp_obd->obd_name, buf, name_len, + cpy_len); +} + +void mdc_file_secctx_pack(struct ptlrpc_request *req, const char *secctx_name, + const void *secctx, size_t secctx_size) +{ + void *buf; + size_t buf_size; + + if (secctx_name == NULL) + return; + + buf = req_capsule_client_get(&req->rq_pill, &RMF_FILE_SECCTX_NAME); + buf_size = req_capsule_get_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT); + + LASSERT(buf_size == strlen(secctx_name) + 1); + memcpy(buf, secctx_name, buf_size); + + buf = req_capsule_client_get(&req->rq_pill, &RMF_FILE_SECCTX); + buf_size = req_capsule_get_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_CLIENT); + + LASSERT(buf_size == secctx_size); + memcpy(buf, secctx, buf_size); +} + +void mdc_file_sepol_pack(struct ptlrpc_request *req) +{ + void *buf; + size_t buf_size; + + if (strlen(req->rq_sepol) == 0) + return; + + buf = req_capsule_client_get(&req->rq_pill, &RMF_SELINUX_POL); + buf_size = req_capsule_get_size(&req->rq_pill, &RMF_SELINUX_POL, + RCL_CLIENT); + + LASSERT(buf_size == strlen(req->rq_sepol) + 1); + snprintf(buf, strlen(req->rq_sepol) + 1, "%s", req->rq_sepol); +} + +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, + const struct lu_fid *fid) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; + b->mbo_size = pgoff; /* !! */ + b->mbo_nlink = size; /* !! 
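+ * (fields are overloaded for readdir: mbo_size carries the pgoff cookie
+ * and mbo_nlink the requested read size)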
*/ + __mdc_pack_body(b, -1); + b->mbo_mode = LUDA_FID | LUDA_TYPE; +} + +/* packing of MDS records */ +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective; + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + flags = 0; + if (op_data->op_bias & MDS_CREATE_VOLATILE) + flags |= MDS_OPEN_VOLATILE; + set_mrc_cr_flags(rec, flags); + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + if (data) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, data, datalen); + } + + mdc_file_secctx_pack(req, op_data->op_file_secctx_name, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); +} + +static inline __u64 mds_pack_open_flags(__u64 flags) +{ + __u64 cr_flags = (flags & MDS_OPEN_FL_INTERNAL); + + if (flags & FMODE_READ) + cr_flags |= MDS_FMODE_READ; + if (flags & FMODE_WRITE) + cr_flags |= MDS_FMODE_WRITE; + if (flags & O_CREAT) + cr_flags |= MDS_OPEN_CREAT; + if (flags & O_EXCL) + cr_flags |= MDS_OPEN_EXCL; + if (flags & O_TRUNC) + cr_flags |= MDS_OPEN_TRUNC; + if (flags & O_APPEND) + cr_flags |= MDS_OPEN_APPEND; + if (flags & O_SYNC) + cr_flags |= MDS_OPEN_SYNC; + if (flags & O_DIRECTORY) + cr_flags |= MDS_OPEN_DIRECTORY; +#ifdef FMODE_EXEC + if (flags & FMODE_EXEC) + cr_flags |= MDS_FMODE_EXEC; +#endif + if (cl_is_lov_delay_create(flags)) + cr_flags |= MDS_OPEN_DELAY_CREATE; + + if (flags & O_NONBLOCK) + cr_flags |= MDS_OPEN_NORESTORE; + + return cr_flags; +} + +/* packing of MDS records */ +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + umode_t mode, __u64 rdev, __u64 flags, const void *lmm, + size_t lmmlen) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 cr_flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->cr_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->cr_cap = cfs_curproc_cap_pack(); + rec->cr_mode = mode; + cr_flags = mds_pack_open_flags(flags); + rec->cr_rdev = rdev; + rec->cr_umask = current_umask(); + if (op_data != NULL) { + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + rec->cr_bias = op_data->op_bias; + rec->cr_open_handle_old = op_data->op_open_handle; + + if (op_data->op_name) { + mdc_pack_name(req, &RMF_NAME, op_data->op_name, + op_data->op_namelen); + + if (op_data->op_bias & MDS_CREATE_VOLATILE) + cr_flags |= MDS_OPEN_VOLATILE; + } + + mdc_file_secctx_pack(req, op_data->op_file_secctx_name, + op_data->op_file_secctx, + 
op_data->op_file_secctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); + } + + if (lmm) { + cr_flags |= MDS_OPEN_HAS_EA; + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, lmm, lmmlen); + } + set_mrc_cr_flags(rec, cr_flags); +} + +static inline u64 attr_pack(unsigned int ia_valid, enum op_xvalid ia_xvalid) +{ + u64 sa_valid = 0; + + if (ia_valid & ATTR_MODE) + sa_valid |= MDS_ATTR_MODE; + if (ia_valid & ATTR_UID) + sa_valid |= MDS_ATTR_UID; + if (ia_valid & ATTR_GID) + sa_valid |= MDS_ATTR_GID; + if (ia_valid & ATTR_SIZE) + sa_valid |= MDS_ATTR_SIZE; + if (ia_valid & ATTR_ATIME) + sa_valid |= MDS_ATTR_ATIME; + if (ia_valid & ATTR_MTIME) + sa_valid |= MDS_ATTR_MTIME; + if (ia_valid & ATTR_CTIME) + sa_valid |= MDS_ATTR_CTIME; + if (ia_valid & ATTR_ATIME_SET) + sa_valid |= MDS_ATTR_ATIME_SET; + if (ia_valid & ATTR_MTIME_SET) + sa_valid |= MDS_ATTR_MTIME_SET; + if (ia_valid & ATTR_FORCE) + sa_valid |= MDS_ATTR_FORCE; + if (ia_xvalid & OP_XVALID_FLAGS) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_xvalid & OP_XVALID_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_xvalid & OP_XVALID_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_xvalid & OP_XVALID_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + if (ia_xvalid & OP_XVALID_PROJID) + sa_valid |= MDS_ATTR_PROJID; + if (ia_xvalid & OP_XVALID_LAZYSIZE) + sa_valid |= MDS_ATTR_LSIZE; + if (ia_xvalid & OP_XVALID_LAZYBLOCKS) + sa_valid |= MDS_ATTR_LBLOCKS; + return sa_valid; +} + +static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, + struct md_op_data *op_data) +{ + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sa_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sa_cap = cfs_curproc_cap_pack(); + rec->sa_suppgid = -1; + + rec->sa_fid = op_data->op_fid1; + rec->sa_valid = attr_pack(op_data->op_attr.ia_valid, + op_data->op_xvalid); + rec->sa_mode = op_data->op_attr.ia_mode; + rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); + rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + rec->sa_projid = op_data->op_projid; + rec->sa_size = op_data->op_attr.ia_size; + rec->sa_blocks = op_data->op_attr_blocks; + rec->sa_atime = op_data->op_attr.ia_atime.tv_sec; + rec->sa_mtime = op_data->op_attr.ia_mtime.tv_sec; + rec->sa_ctime = op_data->op_attr.ia_ctime.tv_sec; + rec->sa_attr_flags = op_data->op_attr_flags; + if ((op_data->op_attr.ia_valid & ATTR_GID) && + in_group_p(op_data->op_attr.ia_gid)) + rec->sa_suppgid = + from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + else + rec->sa_suppgid = op_data->op_suppgids[0]; + + rec->sa_bias = op_data->op_bias; +} + +static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, + struct md_op_data *op_data) +{ + epoch->mio_open_handle = op_data->op_open_handle; + epoch->mio_unused1 = 0; + epoch->mio_unused2 = 0; + epoch->mio_padding = 0; +} + +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, size_t ealen) +{ + struct mdt_rec_setattr *rec; + struct lov_user_md *lum = NULL; + + CLASSERT(sizeof(struct mdt_rec_reint) == + sizeof(struct mdt_rec_setattr)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + mdc_setattr_pack_rec(rec, op_data); + + if (ealen == 0) + return; + + lum = 
req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + if (ea == NULL) { /* Remove LOV EA */ + lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1); + lum->lmm_stripe_size = 0; + lum->lmm_stripe_count = 0; + lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1); + } else { + memcpy(lum, ea, ealen); + } +} + +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_unlink *rec; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ? + REINT_RMENTRY : REINT_UNLINK; + rec->ul_fsuid = op_data->op_fsuid; + rec->ul_fsgid = op_data->op_fsgid; + rec->ul_cap = op_data->op_cap; + rec->ul_mode = op_data->op_mode; + rec->ul_suppgid1= op_data->op_suppgids[0]; + rec->ul_suppgid2= -1; + rec->ul_fid1 = op_data->op_fid1; + rec->ul_fid2 = op_data->op_fid2; + rec->ul_time = op_data->op_mod_time; + rec->ul_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); +} + +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_link *rec; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT (rec != NULL); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = op_data->op_fsuid;//current->fsuid; + rec->lk_fsgid = op_data->op_fsgid;//current->fsgid; + rec->lk_cap = op_data->op_cap;//current->cap_effective; + rec->lk_suppgid1 = op_data->op_suppgids[0]; + rec->lk_suppgid2 = op_data->op_suppgids[1]; + rec->lk_fid1 = op_data->op_fid1; + rec->lk_fid2 = op_data->op_fid2; + rec->lk_time = op_data->op_mod_time; + rec->lk_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); +} + +static void mdc_close_intent_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct close_data *data; + struct ldlm_lock *lock; + enum mds_op_bias bias = op_data->op_bias; + + if (!(bias & (MDS_CLOSE_INTENT | MDS_CLOSE_MIGRATE))) + return; + + data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA); + LASSERT(data != NULL); + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + data->cd_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL); + + data->cd_data_version = op_data->op_data_version; + data->cd_fid = op_data->op_fid2; + + if (bias & MDS_CLOSE_LAYOUT_SPLIT) { + data->cd_mirror_id = op_data->op_mirror_id; + } else if (bias & MDS_CLOSE_RESYNC_DONE) { + struct close_data_resync_done *sync = &data->cd_resync; + + CLASSERT(sizeof(data->cd_resync) <= sizeof(data->cd_reserved)); + sync->resync_count = op_data->op_data_size / sizeof(__u32); + if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + memcpy(sync->resync_ids_inline, op_data->op_data, + op_data->op_data_size); + } else { + size_t count = sync->resync_count; + + memcpy(req_capsule_client_get(&req->rq_pill, &RMF_U32), + op_data->op_data, count * sizeof(__u32)); + } + } +} + +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen) +{ + struct mdt_rec_rename *rec; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct 
mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, old, oldlen); + + if (new != NULL) + mdc_pack_name(req, &RMF_SYMTGT, new, newlen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); +} + +void mdc_migrate_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *name, size_t namelen) +{ + struct mdt_rec_rename *rec; + char *ea; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + rec->rn_opcode = REINT_MIGRATE; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid4; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, name, namelen); + + if (op_data->op_bias & MDS_CLOSE_MIGRATE) { + struct mdt_ioepoch *epoch; + + mdc_close_intent_pack(req, op_data); + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + mdc_ioepoch_pack(epoch, op_data); + } + + ea = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(ea, op_data->op_data, op_data->op_data_size); +} + +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, + struct md_op_data *op_data, size_t ea_size) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + b->mbo_valid = valid; + if (op_data->op_bias & MDS_CROSS_REF) + b->mbo_valid |= OBD_MD_FLCROSSREF; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; + __mdc_pack_body(b, op_data->op_suppgids[0]); + + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; + + if (op_data->op_name != NULL) + mdc_pack_name(req, &RMF_NAME, op_data->op_name, + op_data->op_namelen); +} + +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_ioepoch *epoch; + struct mdt_rec_setattr *rec; + + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + mdc_setattr_pack_rec(rec, op_data); + /* + * The client will zero out local timestamps when losing the IBITS lock + * so any new RPC timestamps will update the client inode's timestamps. + * There was a defect on the server side which allowed the atime to be + * overwritten by a zeroed-out atime packed into the close RPC. + * + * Proactively clear the MDS_ATTR_ATIME flag in the RPC in this case + * to avoid zeroing the atime on old unpatched servers. See LU-8041. 
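+ *
+ * That is, when the close request would carry sa_atime == 0, the
+ * MDS_ATTR_ATIME bit is dropped below so that such servers keep their
+ * stored atime instead of overwriting it with zero.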
+ */ + if (rec->sa_atime == 0) + rec->sa_valid &= ~MDS_ATTR_ATIME; + + mdc_ioepoch_pack(epoch, op_data); + mdc_close_intent_pack(req, op_data); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c new file mode 100644 index 0000000000000..11021e8d89dc6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -0,0 +1,1428 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdc_internal.h" + +struct mdc_getattr_args { + struct obd_export *ga_exp; + struct md_enqueue_info *ga_minfo; +}; + +int it_open_error(int phase, struct lookup_intent *it) +{ + if (it_disposition(it, DISP_OPEN_LEASE)) { + if (phase >= DISP_OPEN_LEASE) + return it->it_status; + else + return 0; + } + if (it_disposition(it, DISP_OPEN_OPEN)) { + if (phase >= DISP_OPEN_OPEN) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + if (phase >= DISP_OPEN_CREATE) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_LOOKUP_EXECD)) { + if (phase >= DISP_LOOKUP_EXECD) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_IT_EXECD)) { + if (phase >= DISP_IT_EXECD) + return it->it_status; + else + return 0; + } + + CERROR("it disp: %X, status: %d\n", it->it_disposition, it->it_status); + LBUG(); + + return 0; +} +EXPORT_SYMBOL(it_open_error); + +/* this must be called on a lockh that is known to have a referenced lock */ +int mdc_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + struct ldlm_lock *lock; + struct inode *new_inode = data; + ENTRY; + + if(bits) + *bits = 0; + + if (!lustre_handle_is_used(lockh)) + RETURN(0); + + lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode && + lock->l_resource->lr_lvb_inode != data) { + struct inode *old_inode = lock->l_resource->lr_lvb_inode; + LASSERTF(old_inode->i_state & I_FREEING, + "Found existing inode %p/%lu/%u state %lu in lock: " + "setting data to %p/%lu/%u\n", old_inode, + old_inode->i_ino, old_inode->i_generation, + old_inode->i_state, + new_inode, new_inode->i_ino, new_inode->i_generation); + } + lock->l_resource->lr_lvb_inode = new_inode; + if (bits) + *bits = 
lock->l_policy_data.l_inodebits.bits; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + + RETURN(0); +} + +enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id; + enum ldlm_mode rc; + ENTRY; + + fid_build_reg_res_name(fid, &res_id); + /* LU-4405: Clear bits not supported by server */ + policy->l_inodebits.bits &= exp_connect_ibits(exp); + rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags, + &res_id, type, policy, mode, lockh, 0); + RETURN(rc); +} + +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ldlm_res_id res_id; + int rc; + + ENTRY; + + fid_build_reg_res_name(fid, &res_id); + rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id, + policy, mode, flags, opaque); + RETURN(rc); +} + +int mdc_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct ldlm_res_id res_id; + struct ldlm_resource *res; + struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace; + ENTRY; + + LASSERTF(ns != NULL, "no namespace passed\n"); + + fid_build_reg_res_name(fid, &res_id); + + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + lock_res(res); + res->lr_lvb_inode = NULL; + unlock_res(res); + + ldlm_resource_putref(res); + RETURN(0); +} + +static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) +{ + /* Don't hold error requests for replay. */ + if (req->rq_replay) { + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + } + if (rc && req->rq_transno != 0) { + DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc); + LBUG(); + } +} + +/* Save a large LOV EA into the request buffer so that it is available + * for replay. We don't do this in the initial request because the + * original request doesn't need this buffer (at most it sends just the + * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty + * buffer and may also be difficult to allocate and save a very large + * request buffer for each open. (bug 5707) + * + * OOM here may cause recovery failure if lmm is needed (only for the + * original open if the MDS crashed just when this client also OOM'd) + * but this is incredibly unlikely, and questionable whether the client + * could do MDS recovery under OOM anyways... 
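+ *
+ * mdc_save_lovea() below grows the request field via
+ * sptlrpc_cli_enlarge_reqbuf() when the replied EA is larger than the
+ * space originally packed, or shrinks the field otherwise, and then
+ * copies the EA in.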
*/ +int mdc_save_lovea(struct ptlrpc_request *req, + const struct req_msg_field *field, + void *data, u32 size) +{ + struct req_capsule *pill = &req->rq_pill; + void *lmm; + int rc = 0; + + if (req_capsule_get_size(pill, field, RCL_CLIENT) < size) { + rc = sptlrpc_cli_enlarge_reqbuf(req, field, size); + if (rc) { + CERROR("%s: Can't enlarge ea size to %d: rc = %d\n", + req->rq_export->exp_obd->obd_name, + size, rc); + return rc; + } + } else { + req_capsule_shrink(pill, field, size, RCL_CLIENT); + } + + req_capsule_set_size(pill, field, RCL_CLIENT, size); + lmm = req_capsule_client_get(pill, field); + if (lmm) + memcpy(lmm, data, size); + + return rc; +} + +static struct ptlrpc_request * +mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data, __u32 acl_bufsize) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + const void *lmm = op_data->op_data; + __u32 lmmsize = op_data->op_data_size; + struct list_head cancels = LIST_HEAD_INIT(cancels); + int count = 0; + enum ldlm_mode mode; + int rc; + int repsize, repsize_estimate; + + ENTRY; + + it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; + + /* XXX: openlock is not cancelled for cross-refs. */ + /* If inode is known, cancel conflicting OPEN locks. */ + if (fid_is_sane(&op_data->op_fid2)) { + if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ + if (it->it_flags & MDS_FMODE_WRITE) + mode = LCK_EX; + else + mode = LCK_PR; + } else { + if (it->it_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC)) + mode = LCK_CW; +#ifdef FMODE_EXEC + else if (it->it_flags & FMODE_EXEC) + mode = LCK_PR; +#endif + else + mode = LCK_CR; + } + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, mode, + MDS_INODELOCK_OPEN); + } + + /* If CREATE, cancel parent's UPDATE lock. */ + if (it->it_op & IT_CREAT) + mode = LCK_EX; + else + mode = LCK_CR; + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, mode, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_OPEN); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(ERR_PTR(-ENOMEM)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + if (cl_is_lov_delay_create(it->it_flags)) { + /* open(O_LOV_DELAY_CREATE) won't pack lmm */ + LASSERT(lmmsize == 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + max(lmmsize, obddev->u.cli.cl_default_mds_easize)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, op_data->op_file_secctx_name != NULL ? + op_data->op_file_secctx_name_size : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, + op_data->op_file_secctx_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + spin_lock(&req->rq_lock); + req->rq_replay = req->rq_import->imp_replayable; + spin_unlock(&req->rq_lock); + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm, + lmmsize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_max_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + + if (!(it->it_op & IT_CREAT) && it->it_op & IT_OPEN && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, + obddev->u.cli.cl_max_mds_easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + + /** + * Inline buffer for possible data from Data-on-MDT files. + */ + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, RCL_SERVER, + sizeof(struct niobuf_remote)); + ptlrpc_request_set_replen(req); + + /* Get real repbuf allocated size as rounded up power of 2 */ + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size()); + /* Estimate free space for DoM files in repbuf */ + repsize_estimate = repsize - (req->rq_replen - + obddev->u.cli.cl_max_mds_easize + + sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) + + lov_mds_md_size(0, LOV_MAGIC_V3)); + + if (repsize_estimate < obddev->u.cli.cl_dom_min_inline_repsize) { + repsize = obddev->u.cli.cl_dom_min_inline_repsize - + repsize_estimate + sizeof(struct niobuf_remote); + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER, + sizeof(struct niobuf_remote) + repsize); + ptlrpc_request_set_replen(req); + CDEBUG(D_INFO, "Increase repbuf by %d bytes, total: %d\n", + repsize, req->rq_replen); + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size()); + } + /* The only way to report real allocated repbuf size to the server + * is the lm_repsize but it must be set prior buffer allocation itself + * due to security reasons - it is part of buffer used in signature + * calculation (see LU-11414). Therefore the saved size is predicted + * value as rq_replen rounded to the next higher power of 2. + * Such estimation is safe. Though the final allocated buffer might + * be even larger, it is not possible to know that at this point. 
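+ *
+ * For example, if rq_replen plus the early reply overhead comes to
+ * roughly 5000 bytes, size_roundup_power2() reports 8192 here, and the
+ * reply buffer allocated later is expected to be at least that large.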
+ */ + req->rq_reqmsg->lm_repsize = repsize; + return req; +} + +#define GA_DEFAULT_EA_NAME_LEN 20 +#define GA_DEFAULT_EA_VAL_LEN 250 +#define GA_DEFAULT_EA_NUM 10 + +static struct ptlrpc_request * +mdc_intent_getxattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct ldlm_intent *lit; + int rc, count = 0; + struct list_head cancels = LIST_HEAD_INIT(cancels); + u32 ea_vals_buf_size = GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETXATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = IT_GETXATTR; + CDEBUG(D_INFO, "%s: get xattrs for "DFID"\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1)); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* If the supplied buffer is too small then the server will + * return -ERANGE and llite will fallback to using non cached + * xattr operations. On servers before 2.10.1 a (non-cached) + * listxattr RPC for an orphan or dead file causes an oops. So + * let's try to avoid sending too small a buffer to too old a + * server. This is effectively undoing the memory conservation + * of LU-9417 when it would be *more* likely to crash the + * server. See LU-9856. 
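+ *
+ * With the GA_DEFAULT_* values above this bumps the default
+ * 250 * 10 = 2500 byte value buffer up to the server's ocd_max_easize
+ * (if that is larger) when the peer is older than 2.10.1.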
*/ + if (exp->exp_connect_data.ocd_version < OBD_OCD_VERSION(2, 10, 1, 0)) + ea_vals_buf_size = max_t(u32, ea_vals_buf_size, + exp->exp_connect_data.ocd_max_easize); +#endif + + /* pack the intended request */ + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + ea_vals_buf_size, -1, 0); + + /* get SELinux policy info if any */ + mdc_file_sepol_pack(req); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, RCL_SERVER, + ea_vals_buf_size); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, RCL_SERVER, + sizeof(u32) * GA_DEFAULT_EA_NUM); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +static struct ptlrpc_request * +mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data, __u32 acl_bufsize) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE | + OBD_MD_FLDIREA | OBD_MD_MEA | OBD_MD_FLACL | + OBD_MD_DEFAULT_MEA; + struct ldlm_intent *lit; + __u32 easize; + bool have_secctx = false; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* send name of security xattr to get upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR) && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + have_secctx = true; + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, + op_data->op_file_secctx_name_size); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + if (obddev->u.cli.cl_default_mds_easize > 0) + easize = obddev->u.cli.cl_default_mds_easize; + else + easize = obddev->u.cli.cl_max_mds_easize; + + /* pack the intended request */ + mdc_getattr_pack(req, valid, it->it_flags, op_data, easize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_DEFAULT_MDT_MD, RCL_SERVER, + sizeof(struct lmv_user_md)); + + if (have_secctx) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct ldlm_intent *lit; + struct layout_intent *layout; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + 
&RQF_LDLM_INTENT_LAYOUT); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the layout intent request */ + layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT); + LASSERT(op_data->op_data != NULL); + LASSERT(op_data->op_data_size == sizeof(*layout)); + memcpy(layout, op_data->op_data, sizeof(*layout)); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request * +mdc_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static int mdc_finish_enqueue(struct obd_export *exp, + struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct lustre_handle *lockh, + int rc) +{ + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *lockreq; + struct ldlm_reply *lockrep; + struct ldlm_lock *lock; + struct mdt_body *body = NULL; + void *lvb_data = NULL; + __u32 lvb_len = 0; + + ENTRY; + + LASSERT(rc >= 0); + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. */ + if (req->rq_transno || req->rq_replay) { + lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ); + lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY); + } + + if (rc == ELDLM_LOCK_ABORTED) { + einfo->ei_mode = 0; + memset(lockh, 0, sizeof(*lockh)); + rc = 0; + } else { /* rc = 0 */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + /* If the server gave us back a different lock mode, we should + * fix up our variables. */ + if (lock->l_req_mode != einfo->ei_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, einfo->ei_mode); + einfo->ei_mode = lock->l_req_mode; + } + LDLM_LOCK_PUT(lock); + } + + lockrep = req_capsule_server_get(pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ + + it->it_disposition = (int)lockrep->lock_policy_res1; + it->it_status = (int)lockrep->lock_policy_res2; + it->it_lock_mode = einfo->ei_mode; + it->it_lock_handle = lockh->cookie; + it->it_request = req; + + /* Technically speaking rq_transno must already be zero if + * it_status is in error, so the check is a bit redundant */ + if ((!req->rq_transno || it->it_status < 0) && req->rq_replay) + mdc_clear_replay_flag(req, it->it_status); + + /* If we're doing an IT_OPEN which did not result in an actual + * successful open, then we need to remove the bit which saves + * this request for unconditional replay. + * + * It's important that we do this first! 
Otherwise we might exit the + * function without doing so, and try to replay a failed create + * (bug 3440) */ + if (it->it_op & IT_OPEN && req->rq_replay && + (!it_disposition(it, DISP_OPEN_OPEN) || it->it_status != 0)) + mdc_clear_replay_flag(req, it->it_status); + + DEBUG_REQ(D_RPCTRACE, req, "op: %x disposition: %x, status: %d", + it->it_op, it->it_disposition, it->it_status); + + /* We know what to expect, so we do any byte flipping required here */ + if (it_has_reply_body(it)) { + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR ("Can't swab mdt_body\n"); + RETURN (-EPROTO); + } + + if (it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + /* + * If this is a successful OPEN request, we need to set + * replay handler and data early, so that if replay + * happens immediately after swabbing below, new reply + * is swabbed by that handler correctly. + */ + mdc_set_open_replay_data(NULL, NULL, it); + } + + if (it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + } + + if (body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) { + void *eadata; + + mdc_update_max_ea_from_body(exp, body); + + /* + * The eadata is opaque; just check that it is there. + * Eventually, obd_unpackmd() will check the contents. + */ + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + + /* save lvb data and length in case this is for layout + * lock */ + lvb_data = eadata; + lvb_len = body->mbo_eadatasize; + + /* + * We save the reply LOV EA in case we have to replay a + * create for recovery. If we didn't allocate a large + * enough request buffer above we need to reallocate it + * here to hold the actual LOV EA. + * + * To not save LOV EA if request is not going to replay + * (for example error one). + */ + if ((it->it_op & IT_OPEN) && req->rq_replay) { + rc = mdc_save_lovea(req, &RMF_EADATA, eadata, + body->mbo_eadatasize); + if (rc) { + body->mbo_valid &= ~OBD_MD_FLEASIZE; + body->mbo_eadatasize = 0; + rc = 0; + } + } + } + } else if (it->it_op & IT_LAYOUT) { + /* maybe the lock was granted right away and layout + * is packed into RMF_DLM_LVB of req */ + lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); + CDEBUG(D_INFO, "%s: layout return lvb %d transno %lld\n", + class_exp2obd(exp)->obd_name, lvb_len, req->rq_transno); + if (lvb_len > 0) { + lvb_data = req_capsule_server_sized_get(pill, + &RMF_DLM_LVB, lvb_len); + if (lvb_data == NULL) + RETURN(-EPROTO); + + /** + * save replied layout data to the request buffer for + * recovery consideration (lest MDS reinitialize + * another set of OST objects). + */ + if (req->rq_transno) + (void)mdc_save_lovea(req, &RMF_EADATA, lvb_data, + lvb_len); + } + } + + /* fill in stripe data for layout lock. + * LU-6581: trust layout data only if layout lock is granted. The MDT + * has stopped sending layout unless the layout lock is granted. The + * client still does this checking in case it's talking with an old + * server. 
- Jinshan */ + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + RETURN(rc); + + if (ldlm_has_layout(lock) && lvb_data != NULL && + !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) { + void *lmm; + + LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d", + ldlm_it2str(it->it_op), lvb_len); + + OBD_ALLOC_LARGE(lmm, lvb_len); + if (lmm == NULL) + GOTO(out_lock, rc = -ENOMEM); + + memcpy(lmm, lvb_data, lvb_len); + + /* install lvb_data */ + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lmm; + lock->l_lvb_len = lvb_len; + lmm = NULL; + } + unlock_res_and_lock(lock); + if (lmm != NULL) + OBD_FREE_LARGE(lmm, lvb_len); + } + + if (ldlm_has_dom(lock)) { + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) { + LDLM_ERROR(lock, "%s: DoM lock without size.", + exp->exp_obd->obd_name); + GOTO(out_lock, rc = -EPROTO); + } + + LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu", + ldlm_it2str(it->it_op), body->mbo_dom_size); + + rc = mdc_fill_lvb(req, &lock->l_ost_lvb); + } +out_lock: + LDLM_LOCK_PUT(lock); + + RETURN(rc); +} + +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. */ +static int mdc_enqueue_base(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct lookup_intent *it, + struct md_op_data *op_data, + struct lustre_handle *lockh, + __u64 extra_lock_flags) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct ptlrpc_request *req = NULL; + __u64 flags, saved_flags = extra_lock_flags; + struct ldlm_res_id res_id; + static const union ldlm_policy_data lookup_policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP } }; + static const union ldlm_policy_data update_policy = { + .l_inodebits = { MDS_INODELOCK_UPDATE } }; + static const union ldlm_policy_data layout_policy = { + .l_inodebits = { MDS_INODELOCK_LAYOUT } }; + static const union ldlm_policy_data getxattr_policy = { + .l_inodebits = { MDS_INODELOCK_XATTR } }; + int generation, resends = 0; + struct ldlm_reply *lockrep; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize; + enum lvb_type lvb_type = 0; + int rc; + ENTRY; + + LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", + einfo->ei_type); + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + + if (it != NULL) { + LASSERT(policy == NULL); + + saved_flags |= LDLM_FL_HAS_INTENT; + if (it->it_op & (IT_GETATTR | IT_READDIR)) + policy = &update_policy; + else if (it->it_op & IT_LAYOUT) + policy = &layout_policy; + else if (it->it_op & IT_GETXATTR) + policy = &getxattr_policy; + else + policy = &lookup_policy; + } + + generation = obddev->u.cli.cl_import->imp_generation; + if (!it || (it->it_op & (IT_OPEN | IT_CREAT))) + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + else + acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + +resend: + flags = saved_flags; + if (it == NULL) { + /* The only way right now is FLOCK. 
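+ * The resource name is tagged with LDLM_FLOCK below so that flock
+ * enqueues use their own resource rather than the IBITS resource
+ * built from the same FID.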
*/ + LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", + einfo->ei_type); + res_id.name[3] = LDLM_FLOCK; + } else if (it->it_op & IT_OPEN) { + req = mdc_intent_open_pack(exp, it, op_data, acl_bufsize); + } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { + req = mdc_intent_getattr_pack(exp, it, op_data, acl_bufsize); + } else if (it->it_op & IT_READDIR) { + req = mdc_enqueue_pack(exp, 0); + } else if (it->it_op & IT_LAYOUT) { + if (!imp_connect_lvb_type(imp)) + RETURN(-EOPNOTSUPP); + req = mdc_intent_layout_pack(exp, it, op_data); + lvb_type = LVB_T_LAYOUT; + } else if (it->it_op & IT_GETXATTR) { + req = mdc_intent_getxattr_pack(exp, it, op_data); + } else { + LBUG(); + RETURN(-EINVAL); + } + + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = ktime_get_real_seconds() + resends; + } + + /* It is important to obtain modify RPC slot first (if applicable), so + * that threads that are waiting for a modify RPC slot are not polluting + * our rpcs in flight counter. + * We do not do flock request limiting, though */ + if (it) { + mdc_get_mod_rpc_slot(req, it); + rc = obd_get_request_slot(&obddev->u.cli); + if (rc != 0) { + mdc_put_mod_rpc_slot(req, it); + mdc_clear_replay_flag(req, 0); + ptlrpc_req_finished(req); + RETURN(rc); + } + } + + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing.*/ + if (einfo->ei_cb_gl == NULL) + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + 0, lvb_type, lockh, 0); + if (!it) { + /* For flock requests we immediatelly return without further + delay and let caller deal with the rest, since rest of + this function metadata processing makes no sense for flock + requests anyway. But in case of problem during comms with + Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we + can not rely on caller and this mainly for F_UNLCKs + (explicits or automatically generated by Kernel to clean + current FLocks upon exit) that can't be trashed */ + if (((rc == -EINTR) || (rc == -ETIMEDOUT)) && + (einfo->ei_type == LDLM_FLOCK) && + (einfo->ei_mode == LCK_NL)) + goto resend; + RETURN(rc); + } + + obd_put_request_slot(&obddev->u.cli); + mdc_put_mod_rpc_slot(req, it); + + if (rc < 0) { + CDEBUG(D_INFO, + "%s: ldlm_cli_enqueue "DFID":"DFID"=%s failed: rc = %d\n", + obddev->obd_name, PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name ?: "", rc); + + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + RETURN(rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + /* Retry infinitely when the server returns -EINPROGRESS for the + * intent operation, when server returns -EINPROGRESS for acquiring + * intent lock, we'll retry in after_reply(). 
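+ *
+ * Each retry bumps 'resends', which also delays the resent request by
+ * that many seconds via rq_sent; the loop stops only on a pending
+ * signal or when the import generation changes (eviction).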
*/ + if (it && (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + if (generation == obddev->u.cli.cl_import->imp_generation) { + if (signal_pending(current)) + RETURN(-EINTR); + + resends++; + CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n", + obddev->obd_name, resends, it->it_op, + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2)); + goto resend; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + if ((int)lockrep->lock_policy_res2 == -ERANGE && + it->it_op & (IT_OPEN | IT_GETATTR | IT_LOOKUP) && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + mdc_clear_replay_flag(req, -ERANGE); + ptlrpc_req_finished(req); + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + goto resend; + } + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc < 0) { + if (lustre_handle_is_used(lockh)) { + ldlm_lock_decref(lockh, einfo->ei_mode); + memset(lockh, 0, sizeof(*lockh)); + } + ptlrpc_req_finished(req); + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + it->it_request = NULL; + } + + RETURN(rc); +} + +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags) +{ + return mdc_enqueue_base(exp, einfo, policy, NULL, + op_data, lockh, extra_lock_flags); +} + +static int mdc_finish_intent_lock(struct obd_export *exp, + struct ptlrpc_request *request, + struct md_op_data *op_data, + struct lookup_intent *it, + struct lustre_handle *lockh) +{ + struct lustre_handle old_lock; + struct ldlm_lock *lock; + int rc = 0; + ENTRY; + + LASSERT(request != NULL); + LASSERT(request != LP_POISON); + LASSERT(request->rq_repmsg != LP_POISON); + + if (it->it_op & IT_READDIR) + RETURN(0); + + if (it->it_op & (IT_GETXATTR | IT_LAYOUT)) { + if (it->it_status != 0) + GOTO(out, rc = it->it_status); + } else { + if (!it_disposition(it, DISP_IT_EXECD)) { + /* The server failed before it even started executing + * the intent, i.e. because it couldn't unpack the + * request. + */ + LASSERT(it->it_status != 0); + GOTO(out, rc = it->it_status); + } + rc = it_open_error(DISP_IT_EXECD, it); + if (rc) + GOTO(out, rc); + + rc = it_open_error(DISP_LOOKUP_EXECD, it); + if (rc) + GOTO(out, rc); + + /* keep requests around for the multiple phases of the call + * this shows the DISP_XX must guarantee we make it into the + * call + */ + if (!it_disposition(it, DISP_ENQ_CREATE_REF) && + it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + it_set_disposition(it, DISP_ENQ_CREATE_REF); + /* balanced in ll_create_node */ + ptlrpc_request_addref(request); + } + if (!it_disposition(it, DISP_ENQ_OPEN_REF) && + it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + it_set_disposition(it, DISP_ENQ_OPEN_REF); + /* balanced in ll_file_open */ + ptlrpc_request_addref(request); + /* BUG 11546 - eviction in the middle of open rpc + * processing + */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, + obd_timeout); + } + + if (it->it_op & IT_CREAT) { + /* XXX this belongs in ll_create_it */ + } else if (it->it_op == IT_OPEN) { + LASSERT(!it_disposition(it, DISP_OPEN_CREATE)); + } else { + LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP)); + } + } + + /* If we already have a matching lock, then cancel the new + * one. 
We have to set the data here instead of in + * mdc_enqueue, because we need to use the child's inode as + * the l_ast_data to match, and that's not available until + * intent_finish has performed the iget().) */ + lock = ldlm_handle2lock(lockh); + if (lock) { + union ldlm_policy_data policy = lock->l_policy_data; + LDLM_DEBUG(lock, "matching against this"); + + if (it_has_reply_body(it)) { + struct mdt_body *body; + + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + /* mdc_enqueue checked */ + LASSERT(body != NULL); + LASSERTF(fid_res_name_eq(&body->mbo_fid1, + &lock->l_resource->lr_name), + "Lock res_id: "DLDLMRES", fid: "DFID"\n", + PLDLMRES(lock->l_resource), + PFID(&body->mbo_fid1)); + } + LDLM_LOCK_PUT(lock); + + memcpy(&old_lock, lockh, sizeof(*lockh)); + if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, + LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) { + ldlm_lock_decref_and_cancel(lockh, it->it_lock_mode); + memcpy(lockh, &old_lock, sizeof(old_lock)); + it->it_lock_handle = lockh->cookie; + } + } + + EXIT; +out: + CDEBUG(D_DENTRY,"D_IT dentry %.*s intent: %s status %d disp %x rc %d\n", + (int)op_data->op_namelen, op_data->op_name, + ldlm_it2str(it->it_op), it->it_status, + it->it_disposition, rc); + return rc; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + struct ldlm_res_id res_id; + struct lustre_handle lockh; + union ldlm_policy_data policy; + enum ldlm_mode mode; + ENTRY; + + if (it->it_lock_handle) { + lockh.cookie = it->it_lock_handle; + mode = ldlm_revalidate_lock_handle(&lockh, bits); + } else { + fid_build_reg_res_name(fid, &res_id); + switch (it->it_op) { + case IT_GETATTR: + /* File attributes are held under multiple bits: + * nlink is under lookup lock, size and times are + * under UPDATE lock and recently we've also got + * a separate permissions lock for owner/group/acl that + * were protected by lookup lock before. + * Getattr must provide all of that information, + * so we need to ensure we have all of those locks. + * Unfortunately, if the bits are split across multiple + * locks, there's no easy way to match all of them here, + * so an extra RPC would be performed to fetch all + * of those bits at once for now. */ + /* For new MDTs(> 2.4), UPDATE|PERM should be enough, + * but for old MDTs (< 2.4), permission is covered + * by LOOKUP lock, so it needs to match all bits here.*/ + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | + MDS_INODELOCK_PERM; + break; + case IT_READDIR: + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; + break; + case IT_LAYOUT: + policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; + break; + default: + policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP; + break; + } + + mode = mdc_lock_match(exp, LDLM_FL_BLOCK_GRANTED, fid, + LDLM_IBITS, &policy, + LCK_CR | LCK_CW | LCK_PR | LCK_PW, + &lockh); + } + + if (mode) { + it->it_lock_handle = lockh.cookie; + it->it_lock_mode = mode; + } else { + it->it_lock_handle = 0; + it->it_lock_mode = 0; + } + + RETURN(!!mode); +} + +/* + * This long block is all about fixing up the lock and request state + * so that it is correct as of the moment _before_ the operation was + * applied; that way, the VFS will think that everything is normal and + * call Lustre's regular VFS methods. 
+ * + * If we're performing a creation, that means that unless the creation + * failed with EEXIST, we should fake up a negative dentry. + * + * For everything else, we want to lookup to succeed. + * + * One additional note: if CREATE or OPEN succeeded, we add an extra + * reference to the request because we need to keep it around until + * ll_create/ll_open gets called. + * + * The server will return to us, in it_disposition, an indication of + * exactly what it_status refers to. + * + * If DISP_OPEN_OPEN is set, then it_status refers to the open() call, + * otherwise if DISP_OPEN_CREATE is set, then it status is the + * creation failure mode. In either case, one of DISP_LOOKUP_NEG or + * DISP_LOOKUP_POS will be set, indicating whether the child lookup + * was successful. + * + * Else, if DISP_LOOKUP_EXECD then it_status is the rc of the + * child lookup. + */ +int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) +{ + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(it), + .ei_cb_bl = cb_blocking, + .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = mdc_ldlm_glimpse_ast, + }; + struct lustre_handle lockh; + int rc = 0; + ENTRY; + LASSERT(it); + + CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID + ", intent: %s flags %#llo\n", (int)op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), + it->it_flags); + + lockh.cookie = 0; + if (fid_is_sane(&op_data->op_fid2) && + (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_READDIR))) { + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + it->it_lock_handle = 0; + rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL); + /* Only return failure if it was not GETATTR by cfid + (from inode_revalidate) */ + if (rc || op_data->op_namelen != 0) + RETURN(rc); + } + + /* For case if upper layer did not alloc fid, do it now. 
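+ * Only a negative return from mdc_fid_alloc() is treated as an error
+ * here; a positive return (switch to a new sequence) is ignored on
+ * the intent path.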
*/ + if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("Can't alloc new fid, rc %d\n", rc); + RETURN(rc); + } + } + + rc = mdc_enqueue_base(exp, &einfo, NULL, it, op_data, &lockh, + extra_lock_flags); + if (rc < 0) + RETURN(rc); + + *reqp = it->it_request; + rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh); + RETURN(rc); +} + +static int mdc_intent_getattr_async_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_getattr_args *ga = args; + struct obd_export *exp = ga->ga_exp; + struct md_enqueue_info *minfo = ga->ga_minfo; + struct ldlm_enqueue_info *einfo = &minfo->mi_einfo; + struct lookup_intent *it; + struct lustre_handle *lockh; + struct obd_device *obddev; + struct ldlm_reply *lockrep; + __u64 flags = LDLM_FL_HAS_INTENT; + ENTRY; + + it = &minfo->mi_it; + lockh = &minfo->mi_lockh; + + obddev = class_exp2obd(exp); + + obd_put_request_slot(&obddev->u.cli); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) + rc = -ETIMEDOUT; + + rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode, + &flags, NULL, 0, lockh, rc); + if (rc < 0) { + CERROR("ldlm_cli_enqueue_fini: %d\n", rc); + mdc_clear_replay_flag(req, rc); + GOTO(out, rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc) + GOTO(out, rc); + + rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh); + EXIT; + +out: + minfo->mi_cb(req, minfo, rc); + return 0; +} + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct lookup_intent *it = &minfo->mi_it; + struct ptlrpc_request *req; + struct mdc_getattr_args *ga; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_res_id res_id; + union ldlm_policy_data policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_UPDATE } }; + int rc = 0; + __u64 flags = LDLM_FL_HAS_INTENT; + ENTRY; + + CDEBUG(D_DLMTRACE, "name: %.*s in inode "DFID", intent: %s flags %#llo\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + /* If the MDT return -ERANGE because of large ACL, then the sponsor + * of the async getattr RPC will handle that by itself. */ + req = mdc_intent_getattr_pack(exp, it, op_data, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + rc = obd_get_request_slot(&obddev->u.cli); + if (rc != 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. 
It is safe to have glimpse handler + * for non-DOM locks and costs nothing.*/ + if (minfo->mi_einfo.ei_cb_gl == NULL) + minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast; + + rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy, + &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); + if (rc < 0) { + obd_put_request_slot(&obddev->u.cli); + ptlrpc_req_finished(req); + RETURN(rc); + } + + CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args)); + ga = ptlrpc_req_async_args(req); + ga->ga_exp = exp; + ga->ga_minfo = minfo; + + req->rq_interpret_reply = mdc_intent_getattr_async_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c new file mode 100644 index 0000000000000..096b20fd4847a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -0,0 +1,521 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include + +#include +#include "mdc_internal.h" +#include + +/* mdc_setattr does its own semaphore handling */ +static int mdc_reint(struct ptlrpc_request *request, int level) +{ + int rc; + + request->rq_send_state = level; + + mdc_get_mod_rpc_slot(request, NULL); + rc = ptlrpc_queue_wait(request); + mdc_put_mod_rpc_slot(request, NULL); + if (rc) + CDEBUG(D_INFO, "error in handling %d\n", rc); + else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY)) { + rc = -EPROTO; + } + return rc; +} + +/* Find and cancel locally locks matched by inode @bits & @mode in the resource + * found by @fid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, enum ldlm_mode mode, + __u64 bits) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + union ldlm_policy_data policy = { {0} }; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. 
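+ *
+ * Illustrative sketch (not a fixed contract) of how the reint paths below,
+ * e.g. mdc_setattr(), use this helper together with mdc_prep_elc_req();
+ * here 'fmt' stands for the request format of the particular operation:
+ *
+ *     struct list_head cancels = LIST_HEAD_INIT(cancels);
+ *     int count;
+ *
+ *     count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels,
+ *                                     LCK_EX, MDS_INODELOCK_UPDATE);
+ *     req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt);
+ *     if (req == NULL) {
+ *             ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ *             RETURN(-ENOMEM);
+ *     }
+ *     rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);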
*/ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + fid_build_reg_res_name(fid, &res_id); + res = ldlm_resource_get(exp->exp_obd->obd_namespace, + NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + LDLM_RESOURCE_ADDREF(res); + /* Initialize ibits lock policy. */ + policy.l_inodebits.bits = bits; + count = ldlm_cancel_resource_local(res, cancels, &policy, + mode, 0, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + int count = 0, rc; + __u64 bits; + ENTRY; + + LASSERT(op_data != NULL); + + bits = MDS_INODELOCK_UPDATE; + if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + bits |= MDS_INODELOCK_LOOKUP; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, bits); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_SETATTR); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT, 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT, 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lld, ctime %lld\n", + (s64)op_data->op_attr.ia_mtime.tv_sec, + (s64)op_data->op_attr.ia_ctime.tv_sec); + mdc_setattr_pack(req, op_data, ea, ealen); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + + *request = req; + + RETURN(rc); +} + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, + umode_t mode, uid_t uid, gid_t gid, + cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int level, rc; + int count, resends = 0; + struct obd_import *import = exp->exp_obd->u.cli.cl_import; + int generation = import->imp_generation; + struct list_head cancels = LIST_HEAD_INIT(cancels); + ENTRY; + + /* For case if upper layer did not alloc fid, do it now. */ + if (!fid_is_sane(&op_data->op_fid2)) { + /* + * mdc_fid_alloc() may return errno 1 in case of switch to new + * sequence, handle this. + */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) + RETURN(rc); + } + +rebuild: + count = 0; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_CREATE_ACL); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + data && datalen ? 
datalen : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, op_data->op_file_secctx_name != NULL ? + strlen(op_data->op_file_secctx_name) + 1 : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, + op_data->op_file_secctx_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* + * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with + * tgt, for symlinks or lov MD data. + */ + mdc_create_pack(req, op_data, data, datalen, mode, uid, + gid, cap_effective, rdev); + + ptlrpc_request_set_replen(req); + + /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry + * logic here */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = ktime_get_real_seconds() + resends; + } + level = LUSTRE_IMP_FULL; + resend: + rc = mdc_reint(req, level); + + /* Resend if we were told to. */ + if (rc == -ERESTARTSYS) { + level = LUSTRE_IMP_RECOVER; + goto resend; + } else if (rc == -EINPROGRESS) { + /* Retry create infinitely until succeed or get other + * error code or interrupted. */ + ptlrpc_req_finished(req); + if (generation == import->imp_generation) { + if (signal_pending(current)) + RETURN(-EINTR); + + resends++; + CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n", + exp->exp_obd->obd_name, resends, + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2)); + goto rebuild; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + *request = req; + RETURN(rc); +} + +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req = *request; + int count = 0, rc; + ENTRY; + + LASSERT(req == NULL); + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + /* don't cancel DoM lock which may cause data flush */ + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_ELC); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_UNLINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + *request = req; + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + RETURN(rc); +} + +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + int count = 0, rc; + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_link_pack(req, op_data); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} + +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_ELC); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + op_data->op_cli_flags & CLI_MIGRATE ? 
+ &RQF_MDS_REINT_MIGRATE : &RQF_MDS_REINT_RENAME); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + if (op_data->op_cli_flags & CLI_MIGRATE) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + op_data->op_data_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) + mdc_migrate_pack(req, op_data, old, oldlen); + else + mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} + +int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + struct ldlm_lock *lock; + struct mdt_rec_resync *rec; + int count = 0, rc; + ENTRY; + + if (op_data->op_flags & MF_MDC_CANCEL_FID1 && + fid_is_sane(&op_data->op_fid1)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_LAYOUT); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_RESYNC); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + CLASSERT(sizeof(*rec) == sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->rs_opcode = REINT_RESYNC; + rec->rs_fsuid = op_data->op_fsuid; + rec->rs_fsgid = op_data->op_fsgid; + rec->rs_cap = op_data->op_cap; + rec->rs_fid = op_data->op_fid1; + rec->rs_bias = op_data->op_bias; + rec->rs_mirror_id = op_data->op_mirror_id; + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + rec->rs_lease_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + + ptlrpc_req_finished(req); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c new file mode 100644 index 0000000000000..c91b65eddf39b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -0,0 +1,3020 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdc_internal.h" + +#define REQUEST_MINOR 244 + +static int mdc_cleanup(struct obd_device *obd); + +static inline int mdc_queue_wait(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int rc; + + /* obd_get_request_slot() ensures that this client has no more + * than cl_max_rpcs_in_flight RPCs simultaneously inf light + * against an MDT. */ + rc = obd_get_request_slot(cli); + if (rc != 0) + return rc; + + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(cli); + + return rc; +} + +/* + * Send MDS_GET_ROOT RPC to fetch root FID. + * + * If \a fileset is not NULL it should contain a subdirectory off + * the ROOT/ directory to be mounted on the client. Return the FID + * of the subdirectory to the client to mount onto its mountpoint. + * + * \param[in] imp MDC import + * \param[in] fileset fileset name, which could be NULL + * \param[out] rootfid root FID of this mountpoint + * \param[out] pc root capa will be unpacked and saved in this pointer + * + * \retval 0 on success, negative errno on failure + */ +static int mdc_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *rootfid) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + + ENTRY; + + if (fileset && !(exp_connect_flags(exp) & OBD_CONNECT_SUBTREE)) + RETURN(-ENOTSUPP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GET_ROOT); + if (req == NULL) + RETURN(-ENOMEM); + + if (fileset != NULL) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(fileset) + 1); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_ROOT); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + mdc_pack_body(req, NULL, 0, 0, -1, 0); + if (fileset != NULL) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + + memcpy(name, fileset, strlen(fileset)); + } + lustre_msg_add_flags(req->rq_reqmsg, LUSTRE_IMP_FULL); + req->rq_send_state = LUSTRE_IMP_FULL; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + *rootfid = body->mbo_fid1; + CDEBUG(D_NET, "root fid="DFID", last_committed=%llu\n", + PFID(rootfid), lustre_msg_get_last_committed(req->rq_repmsg)); + EXIT; +out: + ptlrpc_req_finished(req); + + return rc; +} + +/* + * This function now is known to always saying that it will receive 4 buffers + * from server. Even for cases when acl_size and md_size is zero, RPC header + * will contain 4 fields and RPC itself will contain zero size fields. 
This is + * because mdt_getattr*() _always_ returns 4 fields, but if acl is not needed + * and thus zero, it shrinks it, making zero size. The same story about + * md_size. And this is course of problem when client waits for smaller number + * of fields. This issue will be fixed later when client gets aware of RPC + * layouts. --umka + */ +static int mdc_getattr_common(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body; + void *eadata; + int rc; + ENTRY; + + /* Request message already built. */ + rc = ptlrpc_queue_wait(req); + if (rc != 0) + RETURN(rc); + + /* sanity check for the reply */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + CDEBUG(D_NET, "mode: %o\n", body->mbo_mode); + + mdc_update_max_ea_from_body(exp, body); + if (body->mbo_eadatasize != 0) { + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + } + + RETURN(0); +} + +static void mdc_reset_acl_req(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_early_free_lock); + sptlrpc_cli_free_repbuf(req); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_reqdata_len = 0; + spin_unlock(&req->rq_early_free_lock); +} + +static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; + + /* Single MDS without an LMV case */ + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = 0; + RETURN(0); + } + + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + +again: + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, -1, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) { + if (rc == -ERANGE && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + mdc_reset_acl_req(req); + goto again; + } + + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); +} + +static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR_NAME); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (op_data->op_name) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == + op_data->op_namelen); + memcpy(name, op_data->op_name, op_data->op_namelen); + } + +again: + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, op_data->op_suppgids[0], 0); + 
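+ /*
+  * Reply buffer sizing: start with a small ACL buffer
+  * (LUSTRE_POSIX_ACL_MAX_SIZE_OLD) and, if the MDT answers -ERANGE because
+  * the ACL does not fit, drop the reply buffer with mdc_reset_acl_req()
+  * and resend from the "again:" label with ocd_max_easize.
+  */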
req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) { + if (rc == -ERANGE && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + mdc_reset_acl_req(req); + goto again; + } + + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); +} + +static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, + const struct lu_fid *fid, int opcode, u64 valid, + const char *xattr_name, const char *input, + int input_size, int output_size, int flags, + __u32 suppgid, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int xattr_namelen = 0; + char *tmp; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt); + if (req == NULL) + RETURN(-ENOMEM); + + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + + /* Flush local XATTR locks to get rid of a possible cancel RPC */ + if (opcode == MDS_REINT && fid_is_sane(fid) && + exp->exp_connect_data.ocd_ibits_known & MDS_INODELOCK_XATTR) { + struct list_head cancels = LIST_HEAD_INIT(cancels); + int count; + + /* Without that packing would fail */ + if (input_size == 0) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_CLIENT, 0); + + count = mdc_resource_get_unused(exp, fid, + &cancels, LCK_EX, + MDS_INODELOCK_XATTR); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + } else { + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + } + + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; + + CLASSERT(sizeof(struct mdt_rec_setxattr) == + sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->sx_opcode = REINT_SETXATTR; + rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sx_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sx_cap = cfs_curproc_cap_pack(); + rec->sx_suppgid1 = suppgid; + rec->sx_suppgid2 = -1; + rec->sx_fid = *fid; + rec->sx_valid = valid | OBD_MD_FLCTIME; + rec->sx_time = ktime_get_real_seconds(); + rec->sx_size = output_size; + rec->sx_flags = flags; + } else { + mdc_pack_body(req, fid, valid, output_size, suppgid, flags); + } + + if (xattr_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + memcpy(tmp, xattr_name, xattr_namelen); + } + if (input_size) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, input, input_size); + } + + mdc_file_sepol_pack(req); + + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, output_size); + ptlrpc_request_set_replen(req); + + /* make rpc */ + if (opcode == MDS_REINT) + mdc_get_mod_rpc_slot(req, NULL); + + rc = ptlrpc_queue_wait(req); + + if (opcode == MDS_REINT) + 
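+ /* balance the mdc_get_mod_rpc_slot() taken before ptlrpc_queue_wait() */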
mdc_put_mod_rpc_slot(req, NULL); + + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) +{ + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRRM); + + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, + fid, MDS_REINT, obd_md_valid, name, + value, value_size, 0, xattr_flags, suppgid, + req); +} + +static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) +{ + struct mdt_body *body; + int rc; + + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRLS); + + CDEBUG(D_INFO, "%s: get xattr '%s' for "DFID"\n", + exp->exp_obd->obd_name, name, PFID(fid)); + rc = mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, MDS_GETXATTR, + obd_md_valid, name, NULL, 0, buf_size, 0, -1, + req); + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&(*req)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* only detect the xattr size */ + if (buf_size == 0) { + /* LU-11109: Older MDTs do not distinguish + * between nonexistent xattrs and zero length + * values in this case. Newer MDTs will return + * -ENODATA or set OBD_MD_FLXATTR. */ + GOTO(out, rc = body->mbo_eadatasize); + } + + if (body->mbo_eadatasize == 0) { + /* LU-11109: Newer MDTs set OBD_MD_FLXATTR on + * success so that we can distinguish between + * zero length value and nonexistent xattr. + * + * If OBD_MD_FLXATTR is not set then we keep + * the old behavior and return -ENODATA for + * getxattr() when mbo_eadatasize is 0. But + * -ENODATA only makes sense for getxattr() + * and not for listxattr(). 
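+ *
+ * Illustrative caller pattern (a sketch only; real callers go through
+ * the md_ops methods): probe the value size with buf_size == 0, then
+ * fetch with a reply buffer of that size:
+ *
+ *     rc = mdc_getxattr(exp, fid, OBD_MD_FLXATTR, name, 0, &req);
+ *     if (rc >= 0)
+ *             ptlrpc_req_finished(req);
+ *     if (rc > 0)
+ *             rc = mdc_getxattr(exp, fid, OBD_MD_FLXATTR, name, rc, &req);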
*/ + if (body->mbo_valid & OBD_MD_FLXATTR) + GOTO(out, rc = 0); + else if (obd_md_valid == OBD_MD_FLXATTR) + GOTO(out, rc = -ENODATA); + else + GOTO(out, rc = 0); + } + + GOTO(out, rc = body->mbo_eadatasize); +out: + if (rc < 0) { + ptlrpc_req_finished(*req); + *req = NULL; + } + + return rc; +} + +#ifdef CONFIG_FS_POSIX_ACL +static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body = md->body; + struct posix_acl *acl; + void *buf; + int rc; + ENTRY; + + if (!body->mbo_aclsize) + RETURN(0); + + buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->mbo_aclsize); + + if (!buf) + RETURN(-EPROTO); + + acl = posix_acl_from_xattr(&init_user_ns, buf, body->mbo_aclsize); + if (acl == NULL) + RETURN(0); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + CERROR("convert xattr to acl: %d\n", rc); + RETURN(rc); + } + + rc = posix_acl_valid(&init_user_ns, acl); + if (rc) { + CERROR("validate acl: %d\n", rc); + posix_acl_release(acl); + RETURN(rc); + } + + md->posix_acl = acl; + RETURN(0); +} +#else +#define mdc_unpack_acl(req, md) 0 +#endif + +int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + int rc; + ENTRY; + + LASSERT(md); + memset(md, 0, sizeof(*md)); + + md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); + LASSERT(md->body != NULL); + + if (md->body->mbo_valid & OBD_MD_FLEASIZE) { + if (!S_ISREG(md->body->mbo_mode)) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a " + "regular file, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md->body->mbo_eadatasize == 0) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, " + "but eadatasize 0\n"); + GOTO(out, rc = -EPROTO); + } + + md->layout.lb_len = md->body->mbo_eadatasize; + md->layout.lb_buf = req_capsule_server_sized_get(pill, + &RMF_MDT_MD, + md->layout.lb_len); + if (md->layout.lb_buf == NULL) + GOTO(out, rc = -EPROTO); + } else if (md->body->mbo_valid & OBD_MD_FLDIREA) { + const union lmv_mds_md *lmv; + size_t lmv_size; + + if (!S_ISDIR(md->body->mbo_mode)) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a " + "directory, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md_exp->exp_obd->obd_type->typ_lu == &mdc_device_type) { + CERROR("%s: no LMV, upgrading from old version?\n", + md_exp->exp_obd->obd_name); + + GOTO(out_acl, rc = 0); + } + + if (md->body->mbo_valid & OBD_MD_MEA) { + lmv_size = md->body->mbo_eadatasize; + if (lmv_size == 0) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, " + "but eadatasize 0\n"); + RETURN(-EPROTO); + } + + lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + lmv_size); + if (lmv == NULL) + GOTO(out, rc = -EPROTO); + + rc = md_unpackmd(md_exp, &md->lmv, lmv, lmv_size); + if (rc < 0) + GOTO(out, rc); + } + + /* since 2.12.58 intent_getattr fetches default LMV */ + if (md->body->mbo_valid & OBD_MD_DEFAULT_MEA) { + lmv_size = sizeof(struct lmv_user_md); + lmv = req_capsule_server_sized_get(pill, + &RMF_DEFAULT_MDT_MD, + lmv_size); + if (!lmv) + GOTO(out, rc = -EPROTO); + + rc = md_unpackmd(md_exp, &md->default_lmv, lmv, + lmv_size); + if (rc < 0) + GOTO(out, rc); + + if (rc < (int)sizeof(*md->default_lmv)) { + CDEBUG(D_INFO, + "default lmv size too small: %d < %d\n", + rc, (int)sizeof(*md->default_lmv)); + GOTO(out, rc = -EPROTO); + } + } + } + rc = 0; + +out_acl: + if (md->body->mbo_valid & OBD_MD_FLACL) { + /* for ACL, it's possible that FLACL is set but aclsize is zero. 
+ * only when aclsize != 0 there's an actual segment for ACL + * in reply buffer. + */ + if (md->body->mbo_aclsize) { + rc = mdc_unpack_acl(req, md); + if (rc) + GOTO(out, rc); +#ifdef CONFIG_FS_POSIX_ACL + } else { + md->posix_acl = NULL; +#endif + } + } + + EXIT; +out: + if (rc) { +#ifdef CONFIG_FS_POSIX_ACL + posix_acl_release(md->posix_acl); +#endif + } + return rc; +} + +int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + ENTRY; + RETURN(0); +} + +void mdc_replay_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old_open_handle = { }; + struct mdt_body *body; + ENTRY; + + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "Can't properly replay without open data."); + EXIT; + return; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + spin_lock(&req->rq_lock); + och = mod->mod_och; + if (och && och->och_open_handle.cookie) + req->rq_early_free_repbuf = 1; + else + req->rq_early_free_repbuf = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_early_free_repbuf) { + struct lustre_handle *file_open_handle; + + LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); + + file_open_handle = &och->och_open_handle; + CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", + file_open_handle->cookie, body->mbo_open_handle.cookie); + old_open_handle = *file_open_handle; + *file_open_handle = body->mbo_open_handle; + } + + close_req = mod->mod_close_req; + if (close_req) { + __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg); + struct mdt_ioepoch *epoch; + + LASSERT(opc == MDS_CLOSE); + epoch = req_capsule_client_get(&close_req->rq_pill, + &RMF_MDT_EPOCH); + LASSERT(epoch); + + if (req->rq_early_free_repbuf) + LASSERT(old_open_handle.cookie == + epoch->mio_open_handle.cookie); + + DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); + epoch->mio_open_handle = body->mbo_open_handle; + } + EXIT; +} + +void mdc_commit_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + if (mod == NULL) + return; + + /** + * No need to touch md_open_data::mod_och, it holds a reference on + * \var mod and will zero references to each other, \var mod will be + * freed after that when md_open_data::mod_och will put the reference. + */ + + /** + * Do not let open request to disappear as it still may be needed + * for close rpc to happen (it may happen on evict only, otherwise + * ptlrpc_request::rq_replay does not let mdc_commit_open() to be + * called), just mark this rpc as committed to distinguish these 2 + * cases, see mdc_close() for details. The open request reference will + * be put along with freeing \var mod. + */ + ptlrpc_request_addref(req); + spin_lock(&req->rq_lock); + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + req->rq_cb_data = NULL; + obd_mod_put(mod); +} + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct md_open_data *mod; + struct mdt_rec_create *rec; + struct mdt_body *body; + struct ptlrpc_request *open_req = it->it_request; + struct obd_import *imp = open_req->rq_import; + ENTRY; + + if (!open_req->rq_replay) + RETURN(0); + + rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT); + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + LASSERT(rec != NULL); + /* Incoming message in my byte order (it's been swabbed). */ + /* Outgoing messages always in my byte order. 
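+ * The reply's FID and open handle are copied into the request's create
+ * record below (cr_fid2, cr_open_handle_old) and mdc_replay_open() is
+ * installed as the replay callback, so a replayed open refers to the
+ * object the MDT actually created and the open handle can be fixed up
+ * after recovery.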
*/ + LASSERT(body != NULL); + + /* Only if the import is replayable, we set replay_open data */ + if (och && imp->imp_replayable) { + mod = obd_mod_alloc(); + if (mod == NULL) { + DEBUG_REQ(D_ERROR, open_req, + "Can't allocate md_open_data"); + RETURN(0); + } + + /** + * Take a reference on \var mod, to be freed on mdc_close(). + * It protects \var mod from being freed on eviction (commit + * callback is called despite rq_replay flag). + * Another reference for \var och. + */ + obd_mod_get(mod); + obd_mod_get(mod); + + spin_lock(&open_req->rq_lock); + och->och_mod = mod; + mod->mod_och = och; + mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) || + it_disposition(it, DISP_OPEN_STRIPE); + mod->mod_open_req = open_req; + open_req->rq_cb_data = mod; + open_req->rq_commit_cb = mdc_commit_open; + open_req->rq_early_free_repbuf = 1; + spin_unlock(&open_req->rq_lock); + } + + rec->cr_fid2 = body->mbo_fid1; + rec->cr_open_handle_old = body->mbo_open_handle; + open_req->rq_replay_cb = mdc_replay_open; + if (!fid_is_sane(&body->mbo_fid1)) { + DEBUG_REQ(D_ERROR, open_req, + "saving replay request with insane FID " DFID, + PFID(&body->mbo_fid1)); + LBUG(); + } + + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + RETURN(0); +} + +static void mdc_free_open(struct md_open_data *mod) +{ + int committed = 0; + + if (mod->mod_is_create == 0 && + imp_connect_disp_stripe(mod->mod_open_req->rq_import)) + committed = 1; + + /** + * No reason to asssert here if the open request has + * rq_replay == 1. It means that mdc_close failed, and + * close request wasn`t sent. It is not fatal to client. + * The worst thing is eviction if the client gets open lock + **/ + + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request rq_replay" + "= %d\n", mod->mod_open_req->rq_replay); + + ptlrpc_request_committed(mod->mod_open_req, committed); + if (mod->mod_close_req) + ptlrpc_request_committed(mod->mod_close_req, committed); +} + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct md_open_data *mod = och->och_mod; + ENTRY; + + /** + * It is possible to not have \var mod in a case of eviction between + * lookup and ll_file_open(). 
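+ *
+ * Rough lifetime sketch: mdc_set_open_replay_data() takes one reference on
+ * \var mod for \var och and another to keep it alive until mdc_close();
+ * the \var och reference is dropped below, and mdc_close() drops its own
+ * once the close request no longer needs the open data.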
+ **/ + if (mod == NULL) + RETURN(0); + + LASSERT(mod != LP_POISON); + LASSERT(mod->mod_open_req != NULL); + + spin_lock(&mod->mod_open_req->rq_lock); + if (mod->mod_och) + mod->mod_och->och_open_handle.cookie = 0; + mod->mod_open_req->rq_early_free_repbuf = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + mdc_free_open(mod); + + mod->mod_och = NULL; + och->och_mod = NULL; + obd_mod_put(mod); + + RETURN(0); +} + +static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct req_format *req_fmt; + size_t u32_count = 0; + int rc; + int saved_rc = 0; + ENTRY; + + CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + op_data->op_bias); + + if (op_data->op_bias & MDS_CLOSE_INTENT) { + req_fmt = &RQF_MDS_CLOSE_INTENT; + if (op_data->op_bias & MDS_HSM_RELEASE) { + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, + op_data); + if (rc < 0) { + CERROR("%s: "DFID" allocating FID: rc = %d\n", + obd->obd_name, PFID(&op_data->op_fid1), + rc); + /* save the errcode and proceed to close */ + saved_rc = rc; + } + } + if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) { + size_t count = op_data->op_data_size / sizeof(__u32); + + if (count > INLINE_RESYNC_ARRAY_SIZE) + u32_count = count; + } + } else { + req_fmt = &RQF_MDS_CLOSE; + } + + *request = NULL; + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_CLOSE)) + req = NULL; + else + req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); + + /* Ensure that this close's handle is fixed up during replay. */ + if (likely(mod != NULL)) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + + DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. 
b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } else { + CDEBUG(D_HA, "couldn't find open req; expecting close error\n"); + } + if (req == NULL) { + /** + * TODO: repeat close after errors + */ + CWARN("%s: close of FID "DFID" failed, file reference will be " + "dropped when this client unmounts or is evicted\n", + obd->obd_name, PFID(&op_data->op_fid1)); + GOTO(out, rc = -ENOMEM); + } + + if (u32_count > 0) + req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT, + u32_count * sizeof(__u32)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + + /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a + * portal whose threads are not taking any DLM locks and are therefore + * always progressing */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (!(exp_connect_flags2(exp) & OBD_CONNECT2_LSOM)) + op_data->op_xvalid &= ~(OP_XVALID_LAZYSIZE | + OP_XVALID_LAZYBLOCKS); + + mdc_close_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + + ptlrpc_request_set_replen(req); + + mdc_get_mod_rpc_slot(req, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_mod_rpc_slot(req, NULL); + + if (req->rq_repmsg == NULL) { + CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req, + req->rq_status); + if (rc == 0) + rc = req->rq_status ?: -EIO; + } else if (rc == 0 || rc == -EAGAIN) { + struct mdt_body *body; + + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err " + "= %d", rc); + if (rc > 0) + rc = -rc; + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + rc = -EPROTO; + } else if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open was committed and + * server failed before close was sent. Let's check if mod + * exists and return no error in that case + */ + if (mod) { + DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc); + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + +out: + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + /* Since now, mod is accessed through open_req only, + * thus close req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + *request = req; + + RETURN(rc < 0 ? 
rc : saved_rc); +} + +static int mdc_getpage(struct obd_export *exp, const struct lu_fid *fid, + u64 offset, struct page **pages, int npages, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + int i; + wait_queue_head_t waitq; + int resends = 0; + struct l_wait_info lwi; + int rc; + ENTRY; + + *request = NULL; + init_waitqueue_head(&waitq); + +restart_bulk: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + desc = ptlrpc_prep_bulk_imp(req, npages, 1, + PTLRPC_BULK_PUT_SINK | PTLRPC_BULK_BUF_KIOV, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + /* NB req now owns desc and will free it when it gets freed */ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, + PAGE_SIZE); + + mdc_readdir_pack(req, offset, PAGE_SIZE * npages, fid); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) { + ptlrpc_req_finished(req); + if (rc != -ETIMEDOUT) + RETURN(rc); + + resends++; + if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("%s: too many resend retries: rc = %d\n", + exp->exp_obd->obd_name, -EIO); + RETURN(-EIO); + } + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, + NULL); + l_wait_event(waitq, 0, &lwi); + + goto restart_bulk; + } + + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, + req->rq_bulk->bd_nob_transferred); + if (rc < 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { + CERROR("%s: unexpected bytes transferred: %d (%ld expected)\n", + exp->exp_obd->obd_name, req->rq_bulk->bd_nob_transferred, + PAGE_SIZE * npages); + ptlrpc_req_finished(req); + RETURN(-EPROTO); + } + + *request = req; + RETURN(0); +} + +static void mdc_release_page(struct page *page, int remove) +{ + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + truncate_complete_page(page->mapping, page); + unlock_page(page); + } + put_page(page); +} + +static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, + __u64 *start, __u64 *end, int hash64) +{ + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. + */ + unsigned long offset = hash_x_index(*hash, hash64); + struct page *page; + int found; + + xa_lock_irq(&mapping->i_pages); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0 && !xa_is_value(page)) { + struct lu_dirpage *dp; + + get_page(page); + xa_unlock_irq(&mapping->i_pages); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * mdc_read_page_remote does synchronous io. 
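+ *
+ * Because the page index is the complement of the hash (see the comment
+ * at the top of this function), the single-entry gang lookup above returns
+ * the cached page with the largest ldp_hash_start that is still <= *hash;
+ * the checks against ldp_hash_end below decide whether that page really
+ * covers the hash being looked up.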
+ */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; + } else { + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + if (unlikely(*start == 1 && *hash == 0)) + *hash = *start; + else + LASSERTF(*start <= *hash, "start = %#llx" + ",end = %#llx,hash = %#llx\n", + *start, *end, *hash); + CDEBUG(D_VFSTRACE, "offset %lx [%#llx %#llx]," + " hash %#llx\n", offset, *start, *end, *hash); + if (*hash > *end) { + kunmap(page); + mdc_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * mdc_read_page_remote() will issue RPC to + * fetch the page we want. + */ + kunmap(page); + mdc_release_page(page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + page = NULL; + } + } else { + put_page(page); + page = ERR_PTR(-EIO); + } + } else { + xa_unlock_irq(&mapping->i_pages); + page = NULL; + } + return page; +} + +/* + * Adjust a set of pages, each page containing an array of lu_dirpages, + * so that each page can be used as a single logical lu_dirpage. + * + * A lu_dirpage is laid out as follows, where s = ldp_hash_start, + * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a + * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end + * value is used as a cookie to request the next lu_dirpage in a + * directory listing that spans multiple pages (two in this example): + * ________ + * | | + * .|--------v------- -----. + * |s|e|f|p|ent|ent| ... |ent| + * '--|-------------- -----' Each PAGE contains a single + * '------. lu_dirpage. + * .---------v------- -----. + * |s|e|f|p|ent| 0 | ... | 0 | + * '----------------- -----' + * + * However, on hosts where the native VM page size (PAGE_SIZE) is + * larger than LU_PAGE_SIZE, a single host page may contain multiple + * lu_dirpages. After reading the lu_dirpages from the MDS, the + * ldp_hash_end of the first lu_dirpage refers to the one immediately + * after it in the same PAGE (arrows simplified for brevity, but + * in general e0==s1, e1==s2, etc.): + * + * .-------------------- -----. + * |s0|e0|f0|p|ent|ent| ... |ent| + * |---v---------------- -----| + * |s1|e1|f1|p|ent|ent| ... |ent| + * |---v---------------- -----| Here, each PAGE contains + * ... multiple lu_dirpages. + * |---v---------------- -----| + * |s'|e'|f'|p|ent|ent| ... |ent| + * '---|---------------- -----' + * v + * .----------------------------. + * | next PAGE | + * + * This structure is transformed into a single logical lu_dirpage as follows: + * + * - Replace e0 with e' so the request for the next lu_dirpage gets the page + * labeled 'next PAGE'. + * + * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether + * a hash collision with the next page exists. + * + * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span + * to the first entry of the next lu_dirpage. 
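+ *
+ * For example, assuming the usual 4KiB LU_PAGE_SIZE, a host with 64KiB
+ * PAGE_SIZE receives 16 lu_dirpages per host page from the MDS, and
+ * mdc_adjust_dirpages() below folds them into a single logical lu_dirpage
+ * whose ldp_hash_end is taken from the last lu_dirpage of the host page.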
+ */ +#if PAGE_SIZE > LU_PAGE_SIZE +static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) +{ + int i; + + for (i = 0; i < cfs_pgs; i++) { + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = dp->ldp_hash_end; + __u32 flags = dp->ldp_flags; + + while (--lu_pgs > 0) { + ent = lu_dirent_start(dp); + for (end_dirent = ent; ent != NULL; + end_dirent = ent, ent = lu_dirent_next(ent)); + + /* Advance dp to next lu_dirpage. */ + dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); + + /* Check if we've reached the end of the PAGE. */ + if (!((unsigned long)dp & ~PAGE_MASK)) + break; + + /* Save the hash and flags of this lu_dirpage. */ + hash_end = dp->ldp_hash_end; + flags = dp->ldp_flags; + + /* Check if lu_dirpage contains no entries. */ + if (end_dirent == NULL) + break; + + /* Enlarge the end entry lde_reclen from 0 to + * first entry of next lu_dirpage. */ + LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0); + end_dirent->lde_reclen = + cpu_to_le16((char *)(dp->ldp_entries) - + (char *)end_dirent); + } + + first->ldp_hash_end = hash_end; + first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); + first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); + + kunmap(pages[i]); + } + LASSERTF(lu_pgs == 0, "left = %d\n", lu_pgs); +} +#else +#define mdc_adjust_dirpages(pages, cfs_pgs, lu_pgs) do {} while (0) +#endif /* PAGE_SIZE > LU_PAGE_SIZE */ + +/* parameters for readdir page */ +struct readpage_param { + struct md_op_data *rp_mod; + __u64 rp_off; + int rp_hash64; + struct obd_export *rp_exp; + struct md_callback *rp_cb; +}; + +#ifndef HAVE_DELETE_FROM_PAGE_CACHE +static inline void delete_from_page_cache(struct page *page) +{ + remove_from_page_cache(page); + put_page(page); +} +#endif + +/** + * Read pages from server. + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. 
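+ *
+ * Call path, roughly: mdc_read_page() hands this function to
+ * read_cache_page() as the filler; it allocates up to cl_max_pages_per_rpc
+ * pages, fills them with a single MDS_READPAGE bulk RPC via mdc_getpage(),
+ * fixes the contained lu_dirpages up with mdc_adjust_dirpages() and inserts
+ * the extra pages into the directory's page cache.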
+ **/ +static int mdc_read_page_remote(void *data, struct page *page0) +{ + struct readpage_param *rp = data; + struct page **page_pool; + struct page *page; + struct lu_dirpage *dp; + struct md_op_data *op_data = rp->rp_mod; + struct ptlrpc_request *req; + int max_pages; + struct inode *inode; + struct lu_fid *fid; + int rd_pgs = 0; /* number of pages actually read */ + int npages; + int i; + int rc; + ENTRY; + + max_pages = rp->rp_exp->exp_obd->u.cli.cl_max_pages_per_rpc; + inode = op_data->op_data; + fid = &op_data->op_fid1; + LASSERT(inode != NULL); + + OBD_ALLOC(page_pool, sizeof(page_pool[0]) * max_pages); + if (page_pool != NULL) { + page_pool[0] = page0; + } else { + page_pool = &page0; + max_pages = 1; + } + + for (npages = 1; npages < max_pages; npages++) { + page = __page_cache_alloc(mapping_gfp_mask(inode->i_mapping) + | __GFP_COLD); + if (page == NULL) + break; + page_pool[npages] = page; + } + + rc = mdc_getpage(rp->rp_exp, fid, rp->rp_off, page_pool, npages, &req); + if (rc < 0) { + /* page0 is special, which was added into page cache early */ + delete_from_page_cache(page0); + } else { + int lu_pgs; + + rd_pgs = (req->rq_bulk->bd_nob_transferred + PAGE_SIZE - 1) >> + PAGE_SHIFT; + lu_pgs = req->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT; + LASSERT(!(req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); + + CDEBUG(D_INODE, "read %d(%d) pages\n", rd_pgs, lu_pgs); + + mdc_adjust_dirpages(page_pool, rd_pgs, lu_pgs); + + SetPageUptodate(page0); + } + unlock_page(page0); + + ptlrpc_req_finished(req); + CDEBUG(D_CACHE, "read %d/%d pages\n", rd_pgs, npages); + for (i = 1; i < npages; i++) { + unsigned long offset; + __u64 hash; + int ret; + + page = page_pool[i]; + + if (rc < 0 || i >= rd_pgs) { + put_page(page); + continue; + } + + SetPageUptodate(page); + + dp = kmap(page); + hash = le64_to_cpu(dp->ldp_hash_start); + kunmap(page); + + offset = hash_x_index(hash, rp->rp_hash64); + + prefetchw(&page->flags); + ret = add_to_page_cache_lru(page, inode->i_mapping, offset, + GFP_KERNEL); + if (ret == 0) + unlock_page(page); + else + CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:" + " rc = %d\n", offset, ret); + put_page(page); + } + + if (page_pool != &page0) + OBD_FREE(page_pool, sizeof(page_pool[0]) * max_pages); + + RETURN(rc); +} + +/** + * Read dir page from cache first, if it can not find it, read it from + * server and add into the cache. + * + * \param[in] exp MDC export + * \param[in] op_data client MD stack parameters, transfering parameters + * between different layers on client MD stack. 
+ * \param[in] cb_op callback required for ldlm lock enqueue during + * read page + * \param[in] hash_offset the hash offset of the page to be read + * \param[in] ppage the page to be read + * + * retval = 0 get the page successfully + * errno(<0) get the page failed + */ +static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, + struct md_callback *cb_op, __u64 hash_offset, + struct page **ppage) +{ + struct lookup_intent it = { .it_op = IT_READDIR }; + struct page *page; + struct inode *dir = op_data->op_data; + struct address_space *mapping; + struct lu_dirpage *dp; + __u64 start = 0; + __u64 end = 0; + struct lustre_handle lockh; + struct ptlrpc_request *enq_req = NULL; + struct readpage_param rp_param; + int rc; + + ENTRY; + + *ppage = NULL; + + LASSERT(dir != NULL); + mapping = dir->i_mapping; + + rc = mdc_intent_lock(exp, op_data, &it, &enq_req, + cb_op->md_blocking_ast, 0); + if (enq_req != NULL) + ptlrpc_req_finished(enq_req); + + if (rc < 0) { + CERROR("%s: "DFID" lock enqueue fails: rc = %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), rc); + RETURN(rc); + } + + rc = 0; + lockh.cookie = it.it_lock_handle; + mdc_set_lock_data(exp, &lockh, dir, NULL); + + rp_param.rp_off = hash_offset; + rp_param.rp_hash64 = op_data->op_cli_flags & CLI_HASH64; + page = mdc_page_locate(mapping, &rp_param.rp_off, &start, &end, + rp_param.rp_hash64); + if (IS_ERR(page)) { + CERROR("%s: dir page locate: "DFID" at %llu: rc %ld\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, PTR_ERR(page)); + GOTO(out_unlock, rc = PTR_ERR(page)); + } else if (page != NULL) { + /* + * XXX nikita: not entirely correct handling of a corner case: + * suppose hash chain of entries with hash value HASH crosses + * border between pages P0 and P1. First both P0 and P1 are + * cached, seekdir() is called for some entry from the P0 part + * of the chain. Later P0 goes out of cache. telldir(HASH) + * happens and finds P1, as it starts with matching hash + * value. Remaining entries from P0 part of the chain are + * skipped. (Is that really a bug?) + * + * Possible solutions: 0. don't cache P1 is such case, handle + * it as an "overflow" page. 1. invalidate all pages at + * once. 2. use HASH|1 as an index for P1. 
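+ *
+ * (As implemented below, a page found in the cache is simply handed back
+ * via the hash_collision: label; only the page-wide collision case, where
+ * ldp_hash_start == ldp_hash_end, currently gives up with -EIO instead of
+ * fetching the overflow chain.)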
+ */ + GOTO(hash_collision, page); + } + + rp_param.rp_exp = exp; + rp_param.rp_mod = op_data; + page = read_cache_page(mapping, + hash_x_index(rp_param.rp_off, + rp_param.rp_hash64), + mdc_read_page_remote, &rp_param); + if (IS_ERR(page)) { + CDEBUG(D_INFO, "%s: read cache page: "DFID" at %llu: %ld\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, PTR_ERR(page)); + GOTO(out_unlock, rc = PTR_ERR(page)); + } + + wait_on_page_locked(page); + (void)kmap(page); + if (!PageUptodate(page)) { + CERROR("%s: page not updated: "DFID" at %llu: rc %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, -5); + goto fail; + } + if (!PageChecked(page)) + SetPageChecked(page); + if (PageError(page)) { + CERROR("%s: page error: "DFID" at %llu: rc %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, -5); + goto fail; + } + +hash_collision: + dp = page_address(page); + if (BITS_PER_LONG == 32 && rp_param.rp_hash64) { + start = le64_to_cpu(dp->ldp_hash_start) >> 32; + end = le64_to_cpu(dp->ldp_hash_end) >> 32; + rp_param.rp_off = hash_offset >> 32; + } else { + start = le64_to_cpu(dp->ldp_hash_start); + end = le64_to_cpu(dp->ldp_hash_end); + rp_param.rp_off = hash_offset; + } + if (end == start) { + LASSERT(start == rp_param.rp_off); + CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end); +#if BITS_PER_LONG == 32 + CWARN("Real page-wide hash collision at [%llu %llu] with " + "hash %llu\n", le64_to_cpu(dp->ldp_hash_start), + le64_to_cpu(dp->ldp_hash_end), hash_offset); +#endif + + /* + * Fetch whole overflow chain... + * + * XXX not yet. + */ + goto fail; + } + *ppage = page; +out_unlock: + ldlm_lock_decref(&lockh, it.it_lock_mode); + return rc; +fail: + kunmap(page); + mdc_release_page(page, 1); + rc = -EIO; + goto out_unlock; +} + +static int mdc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct obd_info *oinfo = args; + struct obd_statfs *osfs; + + if (!rc) { + osfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (!osfs) + return -EPROTO; + + oinfo->oi_osfs = osfs; + + CDEBUG(D_CACHE, "blocks=%llu free=%llu avail=%llu " + "objects=%llu free=%llu state=%x\n", + osfs->os_blocks, osfs->os_bfree, osfs->os_bavail, + osfs->os_files, osfs->os_ffree, osfs->os_state); + } + + oinfo->oi_cb_up(oinfo, rc); + + return rc; +} + +static int mdc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, time64_t max_age, + struct ptlrpc_request_set *unused) +{ + struct ptlrpc_request *req; + struct obd_info *aa; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_STATFS, + LUSTRE_MDS_VERSION, MDS_STATFS); + if (req == NULL) + return -ENOMEM; + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = mdc_statfs_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + *aa = *oinfo; + + ptlrpcd_add_req(req); + + return 0; +} + +static int mdc_statfs(const struct lu_env *env, + struct obd_export *exp, struct obd_statfs *osfs, + time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct req_format *fmt; + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp = NULL; + int rc; + ENTRY; + + /* + * Since the request might also come from lprocfs, so we need + * sync this with client_disconnect_export Bug15684 + */ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if 
(!imp) + RETURN(-ENODEV); + + fmt = &RQF_MDS_STATFS; + if ((exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS) && + (flags & OBD_STATFS_SUM)) + fmt = &RQF_MDS_STATFS_NEW; + req = ptlrpc_request_alloc_pack(imp, fmt, LUSTRE_MDS_VERSION, + MDS_STATFS); + if (req == NULL) + GOTO(output, rc = -ENOMEM); + + if ((flags & OBD_STATFS_SUM) && + (exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS)) { + /* request aggregated states */ + struct mdt_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + body->mbo_valid = OBD_MD_FLAGSTATFS; + } + + ptlrpc_request_set_replen(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stay in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) { + /* check connection error first */ + if (imp->imp_connect_error) + rc = imp->imp_connect_error; + GOTO(out, rc); + } + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); + + *osfs = *msfs; + EXIT; +out: + ptlrpc_req_finished(req); +output: + class_import_put(imp); + return rc; +} + +static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf) +{ + __u32 keylen, vallen; + void *key; + int rc; + + if (gf->gf_pathlen > PATH_MAX) + RETURN(-ENAMETOOLONG); + if (gf->gf_pathlen < 2) + RETURN(-EOVERFLOW); + + /* Key is KEY_FID2PATH + getinfo_fid2path description */ + keylen = cfs_size_round(sizeof(KEY_FID2PATH) + sizeof(*gf) + + sizeof(struct lu_fid)); + OBD_ALLOC(key, keylen); + if (key == NULL) + RETURN(-ENOMEM); + memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf), + gf->gf_u.gf_root_fid, sizeof(struct lu_fid)); + CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno); + + if (!fid_is_sane(&gf->gf_fid)) + GOTO(out, rc = -EINVAL); + + /* Val is struct getinfo_fid2path result plus path */ + vallen = sizeof(*gf) + gf->gf_pathlen; + + rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf); + if (rc != 0 && rc != -EREMOTE) + GOTO(out, rc); + + if (vallen <= sizeof(*gf)) + GOTO(out, rc = -EPROTO); + if (vallen > sizeof(*gf) + gf->gf_pathlen) + GOTO(out, rc = -EOVERFLOW); + + CDEBUG(D_IOCTL, "path got "DFID" from %llu #%d: %s\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, + gf->gf_pathlen < 512 ? 
gf->gf_u.gf_path : + /* only log the last 512 characters of the path */ + gf->gf_u.gf_path + gf->gf_pathlen - 512); + +out: + OBD_FREE(key, keylen); + return rc; +} + +static int mdc_ioc_hsm_progress(struct obd_export *exp, + struct hsm_progress_kernel *hpk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct hsm_progress_kernel *req_hpk; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS, + LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + + /* Copy hsm_progress struct */ + req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS); + if (req_hpk == NULL) + GOTO(out, rc = -EPROTO); + + *req_hpk = *hpk; + req_hpk->hpk_errval = lustre_errno_hton(hpk->hpk_errval); + + ptlrpc_request_set_replen(req); + + mdc_get_mod_rpc_slot(req, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_mod_rpc_slot(req, NULL); + + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} +/** + * Send hsm_ct_register to MDS + * + * \param[in] imp import + * \param[in] archive_count if in bitmap format, it is the bitmap, + * else it is the count of archive_ids + * \param[in] archives if in bitmap format, it is NULL, + * else it is archive_id lists + */ +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archive_count, + __u32 *archives) +{ + struct ptlrpc_request *req; + __u32 *archive_array; + size_t archives_size; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_CT_REGISTER); + if (req == NULL) + RETURN(-ENOMEM); + + if (archives != NULL) + archives_size = sizeof(*archive_array) * archive_count; + else + archives_size = sizeof(archive_count); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_ARCHIVE, + RCL_CLIENT, archives_size); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_CT_REGISTER); + if (rc) { + ptlrpc_request_free(req); + RETURN(-ENOMEM); + } + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + + archive_array = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_array == NULL) + GOTO(out, rc = -EPROTO); + + if (archives != NULL) + memcpy(archive_array, archives, archives_size); + else + *archive_array = archive_count; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_current_action(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_current_action *hca = op_data->op_data; + struct hsm_current_action *req_hca; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_ACTION); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hca = req_capsule_server_get(&req->rq_pill, + &RMF_MDS_HSM_CURRENT_ACTION); + if (req_hca == NULL) + GOTO(out, rc = -EPROTO); + + *hca = *req_hca; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_UNREGISTER); + if (req == NULL) + GOTO(out, rc = 
-ENOMEM); + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_get(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_user_state *hus = op_data->op_data; + struct hsm_user_state *req_hus; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_GET); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE); + if (req_hus == NULL) + GOTO(out, rc = -EPROTO); + + *hus = *req_hus; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_set(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_state_set *hss = op_data->op_data; + struct hsm_state_set *req_hss; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_SET); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + /* Copy states */ + req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET); + if (req_hss == NULL) + GOTO(out, rc = -EPROTO); + *req_hss = *hss; + + ptlrpc_request_set_replen(req); + + mdc_get_mod_rpc_slot(req, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_mod_rpc_slot(req, NULL); + + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_request(struct obd_export *exp, + struct hsm_user_request *hur) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + struct hsm_request *req_hr; + struct hsm_user_item *req_hui; + char *req_opaque; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT, + hur->hur_request.hr_itemcount + * sizeof(struct hsm_user_item)); + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT, + hur->hur_request.hr_data_len); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + + /* Copy hsm_request struct */ + req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST); + if (req_hr == NULL) + GOTO(out, rc = -EPROTO); + *req_hr = hur->hur_request; + + /* Copy hsm_user_item structs */ + req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM); + if (req_hui == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_hui, hur->hur_user_item, + hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item)); + + /* Copy opaque field */ + req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA); + if (req_opaque == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len); + + ptlrpc_request_set_replen(req); + + mdc_get_mod_rpc_slot(req, NULL); + rc = ptlrpc_queue_wait(req); + 
mdc_put_mod_rpc_slot(req, NULL); + + GOTO(out, rc); + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk); + +static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION, + MDS_QUOTACTL); + if (req == NULL) + RETURN(-ENOMEM); + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg && + (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + *oqctl = *oqc; + } else if (!rc) { + CERROR ("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mdc_ioc_swap_layouts(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + int rc, count; + struct mdc_swap_layouts *msl, *payload; + ENTRY; + + msl = op_data->op_data; + + /* When the MDT will get the MDS_SWAP_LAYOUTS RPC the + * first thing it will do is to cancel the 2 layout + * locks held by this client. + * So the client must cancel its layout locks on the 2 fids + * with the request RPC to avoid extra RPC round trips. + */ + count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, + LCK_EX, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); + count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, + LCK_EX, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_SWAP_LAYOUTS); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_swap_layouts_pack(req, op_data); + + payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS); + LASSERT(payload); + + *payload = *msl; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + struct obd_import *imp = obd->u.cli.cl_import; + int rc; + ENTRY; + + if (!try_module_get(THIS_MODULE)) { + CERROR("%s: cannot get module '%s'\n", obd->obd_name, + module_name(THIS_MODULE)); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_FID2PATH: + rc = mdc_ioc_fid2path(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_CT_START: + rc = mdc_ioc_hsm_ct_start(exp, karg); + /* ignore if it was already registered on this MDS. 
*/ + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + case LL_IOC_HSM_PROGRESS: + rc = mdc_ioc_hsm_progress(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_GET: + rc = mdc_ioc_hsm_state_get(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_SET: + rc = mdc_ioc_hsm_state_set(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_ACTION: + rc = mdc_ioc_hsm_current_action(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_REQUEST: + rc = mdc_ioc_hsm_request(exp, karg); + GOTO(out, rc); + case OBD_IOC_CLIENT_RECOVER: + rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0); + if (rc < 0) + GOTO(out, rc); + GOTO(out, rc = 0); + case IOC_OSC_SET_ACTIVE: + rc = ptlrpc_set_import_active(imp, data->ioc_offset); + GOTO(out, rc); + case OBD_IOC_PING_TARGET: + rc = ptlrpc_obd_ping(obd); + GOTO(out, rc); + /* + * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by + * LMV instead of MDC. But when the cluster is upgraded from 1.8, + * there'd be no LMV layer thus we might be called here. Eventually + * this code should be removed. + * bz20731, LU-592. + */ + case IOC_OBD_STATFS: { + struct obd_statfs stat_buf = {0}; + + if (*((__u32 *) data->ioc_inlbuf2) != 0) + GOTO(out, rc = -ENODEV); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd), + min((int)data->ioc_plen2, + (int)sizeof(struct obd_uuid)))) + GOTO(out, rc = -EFAULT); + + rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + 0); + if (rc != 0) + GOTO(out, rc); + + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) + GOTO(out, rc = -ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = obd->u.cli.cl_target_uuid; + } + + OBD_FREE_PTR(oqctl); + GOTO(out, rc); + } + case LL_IOC_GET_CONNECT_FLAGS: + if (copy_to_user(uarg, exp_connect_flags_ptr(exp), + sizeof(*exp_connect_flags_ptr(exp)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + case LL_IOC_LOV_SWAP_LAYOUTS: + rc = mdc_ioc_swap_layouts(exp, karg); + GOTO(out, rc); + default: + CERROR("unrecognised ioctl: cmd = %#x\n", cmd); + GOTO(out, rc = -ENOTTY); + } +out: + module_put(THIS_MODULE); + + return rc; +} + +static int mdc_get_info_rpc(struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + char *tmp; + int rc = -EINVAL; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN, + RCL_CLIENT, sizeof(vallen)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN); + memcpy(tmp, &vallen, sizeof(vallen)); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL, + RCL_SERVER, vallen); + ptlrpc_request_set_replen(req); + + /* if server failed to resolve FID, and OI scrub not able to fix it, it + * will return -EINPROGRESS, ptlrpc_queue_wait() will keep 
retrying, + * set request interruptible to avoid deadlock. + */ + if (KEY_IS(KEY_FID2PATH)) + req->rq_allow_intr = 1; + + rc = ptlrpc_queue_wait(req); + /* -EREMOTE means the get_info result is partial, and it needs to + * continue on another MDT, see fid2path part in lmv_iocontrol */ + if (rc == 0 || rc == -EREMOTE) { + tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL); + memcpy(val, tmp, vallen); + if (ptlrpc_rep_need_swab(req)) { + if (KEY_IS(KEY_FID2PATH)) + lustre_swab_fid2path(val); + } + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static void lustre_swab_hai(struct hsm_action_item *h) +{ + __swab32s(&h->hai_len); + __swab32s(&h->hai_action); + lustre_swab_lu_fid(&h->hai_fid); + lustre_swab_lu_fid(&h->hai_dfid); + __swab64s(&h->hai_cookie); + __swab64s(&h->hai_extent.offset); + __swab64s(&h->hai_extent.length); + __swab64s(&h->hai_gid); +} + +static void lustre_swab_hal(struct hsm_action_list *h) +{ + struct hsm_action_item *hai; + __u32 i; + + __swab32s(&h->hal_version); + __swab32s(&h->hal_count); + __swab32s(&h->hal_archive_id); + __swab64s(&h->hal_flags); + hai = hai_first(h); + for (i = 0; i < h->hal_count; i++, hai = hai_next(hai)) + lustre_swab_hai(hai); +} + +static void lustre_swab_kuch(struct kuc_hdr *l) +{ + __swab16s(&l->kuc_magic); + /* __u8 l->kuc_transport */ + __swab16s(&l->kuc_msgtype); + __swab16s(&l->kuc_msglen); +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc = 0; + + if (lk->lk_group != KUC_GRP_HSM) { + CERROR("Bad copytool group %d\n", lk->lk_group); + return -EINVAL; + } + + CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd, + lk->lk_uid, lk->lk_group, lk->lk_flags); + + if (lk->lk_flags & LK_FLG_STOP) { + /* Unregister with the coordinator */ + rc = mdc_ioc_hsm_ct_unregister(imp); + } else { + __u32 *archives = NULL; + + if ((lk->lk_flags & LK_FLG_DATANR) && lk->lk_data_count > 0) + archives = lk->lk_data; + + rc = mdc_ioc_hsm_ct_register(imp, lk->lk_data_count, archives); + } + + return rc; +} + +/** + * Send a message to any listening copytools + * @param val KUC message (kuc_hdr + hsm_action_list) + * @param len total length of message + */ +static int mdc_hsm_copytool_send(const struct obd_uuid *uuid, + size_t len, void *val) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)val; + struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); + int rc; + ENTRY; + + if (len < sizeof(*lh) + sizeof(*hal)) { + CERROR("Short HSM message %zu < %zu\n", len, + sizeof(*lh) + sizeof(*hal)); + RETURN(-EPROTO); + } + if (lh->kuc_magic == __swab16(KUC_MAGIC)) { + lustre_swab_kuch(lh); + lustre_swab_hal(hal); + } else if (lh->kuc_magic != KUC_MAGIC) { + CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC); + RETURN(-EPROTO); + } + + CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d " + "on %s\n", + lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype, + lh->kuc_msglen, hal->hal_count, hal->hal_fsname); + + /* Broadcast to HSM listeners */ + rc = libcfs_kkuc_group_put(uuid, KUC_GRP_HSM, lh); + + RETURN(rc); +} + +/** + * callback function passed to kuc for re-registering each HSM copytool + * running on MDC, after MDT shutdown/recovery. 
+ * @param data copytool registration data + * @param cb_arg callback argument (obd_import) + */ +static int mdc_hsm_ct_reregister(void *data, void *cb_arg) +{ + struct obd_import *imp = (struct obd_import *)cb_arg; + struct kkuc_ct_data *kcd = data; + __u32 *archives = NULL; + int rc; + + if (kcd == NULL || + (kcd->kcd_magic != KKUC_CT_DATA_ARRAY_MAGIC && + kcd->kcd_magic != KKUC_CT_DATA_BITMAP_MAGIC)) + return -EPROTO; + + if (kcd->kcd_magic == KKUC_CT_DATA_BITMAP_MAGIC) { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive=%#x)\n", imp->imp_obd->obd_name, + kcd->kcd_nr_archives); + } else { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive nr = %u)\n", + imp->imp_obd->obd_name, kcd->kcd_nr_archives); + if (kcd->kcd_nr_archives != 0) + archives = kcd->kcd_archives; + } + + rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_nr_archives, archives); + /* ignore error if the copytool is already registered */ + return (rc == -EEXIST) ? 0 : rc; +} + +/** + * Re-establish all kuc contexts with MDT + * after MDT shutdown/recovery. + */ +static int mdc_kuc_reregister(struct obd_import *imp) +{ + /* re-register HSM agents */ + return libcfs_kkuc_group_foreach(&imp->imp_obd->obd_uuid, KUC_GRP_HSM, + mdc_hsm_ct_reregister, imp); +} + +static int mdc_set_info_async(const struct lu_env *env, + struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc; + ENTRY; + + if (KEY_IS(KEY_READ_ONLY)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + + spin_lock(&imp->imp_lock); + if (*((int *)val)) { + imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags |= + OBD_CONNECT_RDONLY; + } else { + imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags &= + ~OBD_CONNECT_RDONLY; + } + spin_unlock(&imp->imp_lock); + + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_CHANGELOG_CLEAR)) { + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) { + rc = mdc_hsm_copytool_send(&imp->imp_obd->obd_uuid, vallen, + val); + RETURN(rc); + } + + if (KEY_IS(KEY_DEFAULT_EASIZE)) { + __u32 *default_easize = val; + + exp->exp_obd->u.cli.cl_default_mds_easize = *default_easize; + RETURN(0); + } + + rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set); + RETURN(rc); +} + +static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_MAX_EASIZE)) { + __u32 mdsize, *max_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + mdsize = *(__u32 *)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) + exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; + max_easize = val; + *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + __u32 *default_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + default_easize = val; + *default_easize = exp->exp_obd->u.cli.cl_default_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen != sizeof(*data)) + RETURN(-EINVAL); + + *data = imp->imp_connect_data; + RETURN(0); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((__u32 
*)val) = 1; + RETURN(0); + } + + rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val); + + RETURN(rc); +} + +static int mdc_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, fid, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +struct mdc_rmfid_args { + int *mra_rcs; + int mra_nr; +}; + +int mdc_rmfid_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_rmfid_args *aa; + int *rcs, size; + ENTRY; + + if (!rc) { + aa = ptlrpc_req_async_args(req); + + size = req_capsule_get_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER); + LASSERT(size == sizeof(int) * aa->mra_nr); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + LASSERT(rcs); + LASSERT(aa->mra_rcs); + LASSERT(aa->mra_nr); + memcpy(aa->mra_rcs, rcs, size); + } + + RETURN(rc); +} + +static int mdc_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct mdc_rmfid_args *aa; + struct mdt_body *b; + struct lu_fid *tmp; + int rc, flen; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_RMFID); + if (req == NULL) + RETURN(-ENOMEM); + + flen = fa->fa_nr * sizeof(struct lu_fid); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_CLIENT, flen); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_SERVER, flen); + req_capsule_set_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER, fa->fa_nr * sizeof(__u32)); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_RMFID); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FID_ARRAY); + memcpy(tmp, fa->fa_fids, flen); + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + b->mbo_ctime = ktime_get_real_seconds(); + + ptlrpc_request_set_replen(req); + + LASSERT(rcs); + aa = ptlrpc_req_async_args(req); + aa->mra_rcs = rcs; + aa->mra_nr = fa->fa_nr; + req->rq_interpret_reply = mdc_rmfid_interpret; + + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + + RETURN(rc); +} + +static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli = &obd->u.cli; + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + break; + case IMP_EVENT_INACTIVE: + /* + * Flush current sequence to make client obtain new one + * from server in case of disconnect/reconnect. + */ + down_read(&cli->cl_seq_rwsem); + if (cli->cl_seq) + seq_client_flush(cli->cl_seq); + up_read(&cli->cl_seq_rwsem); + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + __u16 refcheck; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants. 
All pages go to failing rpcs due to + * the invalid import. + */ + osc_io_unplug(env, cli, NULL); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + osc_ldlm_resource_invalidate, + env, 0); + cl_env_put(env, &refcheck); + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + } else { + rc = PTR_ERR(env); + } + break; + } + case IMP_EVENT_ACTIVE: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + /* redo the kuc registration after reconnecting */ + if (rc == 0) + rc = mdc_kuc_reregister(imp); + break; + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (OCD_HAS_FLAG(ocd, GRANT)) + osc_init_grant(cli, ocd); + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); + break; + } + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %x\n", event); + LBUG(); + } + RETURN(rc); +} + +int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc = -EIO; + + ENTRY; + + down_read(&cli->cl_seq_rwsem); + if (cli->cl_seq) + rc = seq_client_alloc_fid(env, cli->cl_seq, fid); + up_read(&cli->cl_seq_rwsem); + + RETURN(rc); +} + +static struct obd_uuid *mdc_get_uuid(struct obd_export *exp) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + return &cli->cl_target_uuid; +} + +/** + * Determine whether the lock can be canceled before replaying it during + * recovery, non zero value will be return if the lock can be canceled, + * or zero returned for not + */ +static int mdc_cancel_weight(struct ldlm_lock *lock) +{ + if (lock->l_resource->lr_type != LDLM_IBITS) + RETURN(0); + + /* FIXME: if we ever get into a situation where there are too many + * opened files with open locks on a single node, then we really + * should replay these open locks to reget it */ + if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) + RETURN(0); + + /* Special case for DoM locks, cancel only unused and granted locks */ + if (ldlm_has_dom(lock) && + (lock->l_granted_mode != lock->l_req_mode || + osc_ldlm_weigh_ast(lock) != 0)) + RETURN(0); + + RETURN(1); +} + +static int mdc_resource_inode_free(struct ldlm_resource *res) +{ + if (res->lr_lvb_inode) + res->lr_lvb_inode = NULL; + + return 0; +} + +static struct ldlm_valblock_ops inode_lvbo = { + .lvbo_free = mdc_resource_inode_free +}; + +static int mdc_llog_init(struct obd_device *obd) +{ + struct obd_llog_group *olg = &obd->obd_olg; + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, obd, + &llog_client_ops); + if (rc < 0) + RETURN(rc); + + ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT); + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static void mdc_llog_finish(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt != NULL) + llog_cleanup(NULL, ctxt); + + EXIT; +} + +int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + + ENTRY; + + rc = osc_setup_common(obd, cfg); + if (rc < 0) + RETURN(rc); + + rc = mdc_tunables_init(obd); + if (rc) + GOTO(err_osc_cleanup, rc); + + obd->u.cli.cl_dom_min_inline_repsize = MDC_DOM_DEF_INLINE_REPSIZE; + + ns_register_cancel(obd->obd_namespace, mdc_cancel_weight); + + obd->obd_namespace->ns_lvbo = &inode_lvbo; + + rc = mdc_llog_init(obd); + if (rc) { + CERROR("%s: failed to setup llogging subsystems: rc = %d\n", + obd->obd_name, rc); + 
GOTO(err_llog_cleanup, rc); + } + + rc = mdc_changelog_cdev_init(obd); + if (rc) { + CERROR("%s: failed to setup changelog char device: rc = %d\n", + obd->obd_name, rc); + GOTO(err_changelog_cleanup, rc); + } + + RETURN(rc); + +err_changelog_cleanup: + mdc_llog_finish(obd); +err_llog_cleanup: + lprocfs_free_md_stats(obd); + ptlrpc_lprocfs_unregister_obd(obd); +err_osc_cleanup: + osc_cleanup_common(obd); + return rc; +} + +/* Initialize the default and maximum LOV EA sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold a default + * sized EA without having to calculate this (via a call into the + * LOV + OSCs) each time we make an RPC. The maximum size is also tracked + * but not used to avoid wastefully vmalloc()'ing large reply buffers when + * a large number of stripes is possible. If a larger reply buffer is + * required it will be reallocated in the ptlrpc layer due to overflow. + */ +static int mdc_init_ea_size(struct obd_export *exp, __u32 easize, + __u32 def_easize) +{ + struct obd_device *obd = exp->exp_obd; + struct client_obd *cli = &obd->u.cli; + ENTRY; + + if (cli->cl_max_mds_easize < easize) + cli->cl_max_mds_easize = easize; + + if (cli->cl_default_mds_easize < def_easize) + cli->cl_default_mds_easize = def_easize; + + RETURN(0); +} + +static int mdc_precleanup(struct obd_device *obd) +{ + ENTRY; + + osc_precleanup_common(obd); + mdc_changelog_cdev_finish(obd); + + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_free_md_stats(obd); + mdc_llog_finish(obd); + RETURN(0); +} + +static int mdc_cleanup(struct obd_device *obd) +{ + return osc_cleanup_common(obd); +} + +int mdc_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + size_t count = class_modify_config(lcfg, PARAM_MDC, + &obd->obd_kset.kobj); + + return count > 0 ? 
0 : count; +} + +static struct obd_ops mdc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mdc_setup, + .o_precleanup = mdc_precleanup, + .o_cleanup = mdc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_iocontrol = mdc_iocontrol, + .o_set_info_async = mdc_set_info_async, + .o_statfs = mdc_statfs, + .o_statfs_async = mdc_statfs_async, + .o_fid_init = client_fid_init, + .o_fid_fini = client_fid_fini, + .o_fid_alloc = mdc_fid_alloc, + .o_import_event = mdc_import_event, + .o_get_info = mdc_get_info, + .o_get_uuid = mdc_get_uuid, + .o_quotactl = mdc_quotactl, +}; + +static struct md_ops mdc_md_ops = { + .m_get_root = mdc_get_root, + .m_null_inode = mdc_null_inode, + .m_close = mdc_close, + .m_create = mdc_create, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, + .m_fsync = mdc_fsync, + .m_file_resync = mdc_file_resync, + .m_read_page = mdc_read_page, + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + .m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock, + .m_rmfid = mdc_rmfid, +}; + +dev_t mdc_changelog_dev; +struct class *mdc_changelog_class; +static int __init mdc_init(void) +{ + int rc = 0; + rc = alloc_chrdev_region(&mdc_changelog_dev, 0, + MDC_CHANGELOG_DEV_COUNT, + MDC_CHANGELOG_DEV_NAME); + if (rc) + return rc; + + mdc_changelog_class = class_create(THIS_MODULE, MDC_CHANGELOG_DEV_NAME); + if (IS_ERR(mdc_changelog_class)) { + rc = PTR_ERR(mdc_changelog_class); + goto out_dev; + } + + rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL, + LUSTRE_MDC_NAME, &mdc_device_type); + if (rc) + goto out_dev; + + return 0; + +out_dev: + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + return rc; +} + +static void __exit mdc_exit(void) +{ + class_destroy(mdc_changelog_class); + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + class_unregister_type(LUSTRE_MDC_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Metadata Client"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(mdc_init); +module_exit(mdc_exit); diff --git a/drivers/staging/lustrefsx/lustre/mgc/Makefile b/drivers/staging/lustrefsx/lustre/mgc/Makefile new file mode 100644 index 0000000000000..7353c95e42cca --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += mgc.o + +mgc-y := mgc_request.o lproc_mgc.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c new file mode 100644 index 0000000000000..f277d3e489e70 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include "mgc_internal.h" + +#ifdef CONFIG_PROC_FS + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, connect_flags); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, server_uuid); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, import); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, state); + +static int mgc_ir_state_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_mgc_rd_ir_state(m, m->private); +} + +LDEBUGFS_SEQ_FOPS_RO(mgc_ir_state); + +struct ldebugfs_vars ldebugfs_mgc_obd_vars[] = { + { .name = "connect_flags", + .fops = &mgc_connect_flags_fops }, + { .name = "mgs_server_uuid", + .fops = &mgc_server_uuid_fops }, + { .name = "import", + .fops = &mgc_import_fops }, + { .name = "state", + .fops = &mgc_state_fops }, + { .name = "ir_state", + .fops = &mgc_ir_state_fops }, + { NULL } +}; +#endif /* CONFIG_PROC_FS */ + +LUSTRE_ATTR(mgs_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static struct attribute *mgc_attrs[] = { + &lustre_attr_mgs_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + NULL, +}; + +int mgc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = mgc_attrs; + obd->obd_debugfs_vars = ldebugfs_mgc_obd_vars; + rc = lprocfs_obd_setup(obd, true); + if (rc) + return rc; + + return sptlrpc_lprocfs_cliobd_attach(obd); +} diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h new file mode 100644 index 0000000000000..cd49fa2e47ffe --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _MGC_INTERNAL_H +#define _MGC_INTERNAL_H + +#include +#include +#include +#include +#include + +int mgc_tunables_init(struct obd_device *obd); +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); + +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); + +static inline bool cld_is_sptlrpc(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_SPTLRPC; +} + +static inline bool cld_is_recover(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_RECOVER; +} + +static inline bool cld_is_nodemap(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_NODEMAP; +} + +static inline bool cld_is_barrier(struct config_llog_data *cld) +{ + return cld->cld_type == MGS_CFG_T_BARRIER; +} + +#endif /* _MGC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c new file mode 100644 index 0000000000000..ab588e1d100af --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -0,0 +1,2301 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/mgc/mgc_request.c + * + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_MGC +#define D_MGC D_CONFIG /*|D_WARNING*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mgc_internal.h" + +static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, + enum mgs_cfg_type type) +{ + __u64 resname = 0; + + if (len > sizeof(resname)) { + CERROR("name too long: %s\n", name); + return -EINVAL; + } + if (len <= 0) { + CERROR("missing name: %s\n", name); + return -EINVAL; + } + memcpy(&resname, name, len); + + /* Always use the same endianness for the resid */ + memset(res_id, 0, sizeof(*res_id)); + res_id->name[0] = cpu_to_le64(resname); + /* XXX: unfortunately, sptlprc and config llog share one lock */ + switch(type) { + case MGS_CFG_T_CONFIG: + case MGS_CFG_T_SPTLRPC: + resname = 0; + break; + case MGS_CFG_T_RECOVER: + case MGS_CFG_T_PARAMS: + case MGS_CFG_T_NODEMAP: + case MGS_CFG_T_BARRIER: + resname = type; + break; + default: + LBUG(); + } + res_id->name[1] = cpu_to_le64(resname); + CDEBUG(D_MGC, "log %s to resid %#llx/%#llx (%.8s)\n", name, + res_id->name[0], res_id->name[1], (char *)&res_id->name[0]); + return 0; +} + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type) +{ + /* fsname is at most 8 chars long, maybe contain "-". + * e.g. "lustre", "SUN-000" */ + return mgc_name2resid(fsname, strlen(fsname), res_id, type); +} +EXPORT_SYMBOL(mgc_fsname2resid); + +int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type) +{ + char *name_end; + int len; + + /* logname consists of "fsname-nodetype". + * e.g. "lustre-MDT0001", "SUN-000-client" + * there is an exception: llog "params" */ + name_end = strrchr(logname, '-'); + if (!name_end) + len = strlen(logname); + else + len = name_end - logname; + return mgc_name2resid(logname, len, res_id, type); +} +EXPORT_SYMBOL(mgc_logname2resid); + +/********************** config llog list **********************/ +static struct list_head config_llog_list = LIST_HEAD_INIT(config_llog_list); +static DEFINE_SPINLOCK(config_list_lock); /* protects config_llog_list */ + +/* Take a reference to a config log */ +static int config_log_get(struct config_llog_data *cld) +{ + ENTRY; + atomic_inc(&cld->cld_refcount); + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + RETURN(0); +} + +/* Drop a reference to a config log. 
When no longer referenced, + we can free the config log data */ +static void config_log_put(struct config_llog_data *cld) +{ + ENTRY; + + if (unlikely(!cld)) + RETURN_EXIT; + + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* spinlock to make sure no item with 0 refcount in the list */ + if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) { + list_del(&cld->cld_list_chain); + spin_unlock(&config_list_lock); + + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + + config_log_put(cld->cld_barrier); + config_log_put(cld->cld_recover); + config_log_put(cld->cld_params); + config_log_put(cld->cld_nodemap); + config_log_put(cld->cld_sptlrpc); + if (cld_is_sptlrpc(cld)) + sptlrpc_conf_log_stop(cld->cld_logname); + + class_export_put(cld->cld_mgcexp); + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + } + + EXIT; +} + +/* Find a config log by name */ +static +struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + unsigned long cfg_instance; + + ENTRY; + LASSERT(logname != NULL); + + cfg_instance = cfg ? cfg->cfg_instance : 0; + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + /* check if cfg_instance is the one we want */ + if (cfg_instance != cld->cld_cfg.cfg_instance) + continue; + + /* instance may be NULL, should check name */ + if (strcmp(logname, cld->cld_logname) == 0) { + found = cld; + config_log_get(found); + break; + } + } + spin_unlock(&config_list_lock); + RETURN(found); +} + +static +struct config_llog_data *do_config_log_add(struct obd_device *obd, + char *logname, + enum mgs_cfg_type type, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld; + int rc; + + ENTRY; + + CDEBUG(D_MGC, "do adding config log %s-%016lx\n", logname, + cfg ? 
cfg->cfg_instance : 0); + + OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); + if (!cld) + RETURN(ERR_PTR(-ENOMEM)); + + rc = mgc_logname2resid(logname, &cld->cld_resid, type); + if (rc) { + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + RETURN(ERR_PTR(rc)); + } + + strcpy(cld->cld_logname, logname); + if (cfg) + cld->cld_cfg = *cfg; + else + cld->cld_cfg.cfg_callback = class_config_llog_handler; + mutex_init(&cld->cld_lock); + cld->cld_cfg.cfg_last_idx = 0; + cld->cld_cfg.cfg_flags = 0; + cld->cld_cfg.cfg_sb = sb; + cld->cld_type = type; + atomic_set(&cld->cld_refcount, 1); + + /* Keep the mgc around until we are done */ + cld->cld_mgcexp = class_export_get(obd->obd_self_export); + + if (cld_is_sptlrpc(cld)) + sptlrpc_conf_log_start(logname); + + spin_lock(&config_list_lock); + list_add(&cld->cld_list_chain, &config_llog_list); + spin_unlock(&config_list_lock); + + if (cld_is_sptlrpc(cld) || cld_is_nodemap(cld) || cld_is_barrier(cld)) { + rc = mgc_process_log(obd, cld); + if (rc && rc != -ENOENT) + CERROR("%s: failed processing log, type %d: rc = %d\n", + obd->obd_name, type, rc); + } + + RETURN(cld); +} + +static struct config_llog_data *config_recover_log_add(struct obd_device *obd, + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; + + if (IS_OST(lsi)) + return NULL; + + /* for osp-on-ost, see lustre_start_osp() */ + if (IS_MDT(lsi) && lcfg.cfg_instance) + return NULL; + + /* We have to use different llog for clients and MDTs for DNE, + * where only clients are notified if one of DNE server restarts. + */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strncpy(logname, fsname, sizeof(logname)); + if (IS_SERVER(lsi)) { /* mdt */ + LASSERT(lcfg.cfg_instance == 0); + lcfg.cfg_instance = ll_get_cfg_instance(sb); + strncat(logname, "-mdtir", sizeof(logname)); + } else { + LASSERT(lcfg.cfg_instance != 0); + strncat(logname, "-cliir", sizeof(logname)); + } + + cld = do_config_log_add(obd, logname, MGS_CFG_T_RECOVER, &lcfg, sb); + return cld; +} + +static struct config_llog_data * +config_log_find_or_add(struct obd_device *obd, char *logname, + struct super_block *sb, enum mgs_cfg_type type, + struct config_llog_instance *cfg) +{ + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; + + /* Note class_config_llog_handler() depends on getting "obd" back */ + lcfg.cfg_instance = sb ? ll_get_cfg_instance(sb) : (unsigned long)obd; + + cld = config_log_find(logname, &lcfg); + if (unlikely(cld != NULL)) + return cld; + + return do_config_log_add(obd, logname, type, &lcfg, sb); +} + +/** Add this log to the list of active logs watched by an MGC. + * Active means we're watching for updates. + * We have one active log per "mount" - client instance or servername. + * Each instance may be at a different point in the log. 
+ */ +static struct config_llog_data * +config_log_add(struct obd_device *obd, char *logname, + struct config_llog_instance *cfg, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld = NULL; + struct config_llog_data *sptlrpc_cld = NULL; + struct config_llog_data *params_cld = NULL; + struct config_llog_data *nodemap_cld = NULL; + struct config_llog_data *barrier_cld = NULL; + char seclogname[32]; + char *ptr; + int rc; + bool locked = false; + ENTRY; + + CDEBUG(D_MGC, "add config log %s-%016lx\n", logname, + cfg->cfg_instance); + + /* + * for each regular log, the depended sptlrpc log name is + * -sptlrpc. multiple regular logs may share one sptlrpc log. + */ + ptr = strrchr(logname, '-'); + if (ptr == NULL || ptr - logname > 8) { + CERROR("logname %s is too long\n", logname); + RETURN(ERR_PTR(-EINVAL)); + } + + memcpy(seclogname, logname, ptr - logname); + strcpy(seclogname + (ptr - logname), "-sptlrpc"); + + if (cfg->cfg_sub_clds & CONFIG_SUB_SPTLRPC) { + sptlrpc_cld = config_log_find_or_add(obd, seclogname, NULL, + MGS_CFG_T_SPTLRPC, cfg); + if (IS_ERR(sptlrpc_cld)) { + CERROR("%s: can't create sptlrpc log %s: rc = %ld\n", + obd->obd_name, seclogname, PTR_ERR(sptlrpc_cld)); + RETURN(sptlrpc_cld); + } + } + + if (!IS_MGS(lsi) && cfg->cfg_sub_clds & CONFIG_SUB_NODEMAP) { + nodemap_cld = config_log_find_or_add(obd, LUSTRE_NODEMAP_NAME, + NULL, MGS_CFG_T_NODEMAP, + cfg); + if (IS_ERR(nodemap_cld)) { + rc = PTR_ERR(nodemap_cld); + CERROR("%s: cannot create nodemap log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_sptlrpc, rc); + } + } + + if (cfg->cfg_sub_clds & CONFIG_SUB_PARAMS) { + params_cld = config_log_find_or_add(obd, PARAMS_FILENAME, sb, + MGS_CFG_T_PARAMS, cfg); + if (IS_ERR(params_cld)) { + rc = PTR_ERR(params_cld); + CERROR("%s: can't create params log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_nodemap, rc); + } + } + + if (IS_MDT(s2lsi(sb)) && cfg->cfg_sub_clds & CONFIG_SUB_BARRIER) { + snprintf(seclogname + (ptr - logname), sizeof(seclogname) - 1, + "-%s", BARRIER_FILENAME); + barrier_cld = config_log_find_or_add(obd, seclogname, sb, + MGS_CFG_T_BARRIER, cfg); + if (IS_ERR(barrier_cld)) { + rc = PTR_ERR(barrier_cld); + CERROR("%s: can't create barrier log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_params, rc); + } + } + + cld = do_config_log_add(obd, logname, MGS_CFG_T_CONFIG, cfg, sb); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + CERROR("%s: can't create log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_barrier, rc = PTR_ERR(cld)); + } + + LASSERT(lsi->lsi_lmd); + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) && + cfg->cfg_sub_clds & CONFIG_SUB_RECOVER) { + struct config_llog_data *recover_cld; + + ptr = strrchr(seclogname, '-'); + if (ptr != NULL) { + *ptr = 0; + } else { + CERROR("%s: sptlrpc log name not correct, %s: " + "rc = %d\n", obd->obd_name, seclogname, -EINVAL); + GOTO(out_cld, rc = -EINVAL); + } + + recover_cld = config_recover_log_add(obd, seclogname, cfg, sb); + if (IS_ERR(recover_cld)) { + rc = PTR_ERR(recover_cld); + CERROR("%s: can't create recover log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_cld, rc); + } + + mutex_lock(&cld->cld_lock); + locked = true; + cld->cld_recover = recover_cld; + } + + if (!locked) + mutex_lock(&cld->cld_lock); + cld->cld_params = params_cld; + cld->cld_barrier = barrier_cld; + cld->cld_nodemap = nodemap_cld; + cld->cld_sptlrpc = sptlrpc_cld; + mutex_unlock(&cld->cld_lock); + + RETURN(cld); + +out_cld: + config_log_put(cld); +out_barrier: + config_log_put(barrier_cld); 
+out_params: + config_log_put(params_cld); +out_nodemap: + config_log_put(nodemap_cld); +out_sptlrpc: + config_log_put(sptlrpc_cld); + + return ERR_PTR(rc); +} + +DEFINE_MUTEX(llog_process_lock); + +static inline void config_mark_cld_stop(struct config_llog_data *cld) +{ + if (cld) { + mutex_lock(&cld->cld_lock); + spin_lock(&config_list_lock); + cld->cld_stopping = 1; + spin_unlock(&config_list_lock); + mutex_unlock(&cld->cld_lock); + } +} + +/** Stop watching for updates on this log. + */ +static int config_log_end(char *logname, struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *cld_sptlrpc = NULL; + struct config_llog_data *cld_params = NULL; + struct config_llog_data *cld_recover = NULL; + struct config_llog_data *cld_nodemap = NULL; + struct config_llog_data *cld_barrier = NULL; + int rc = 0; + + ENTRY; + + cld = config_log_find(logname, cfg); + if (cld == NULL) + RETURN(-ENOENT); + + mutex_lock(&cld->cld_lock); + /* + * if cld_stopping is set, it means we didn't start the log thus + * not owning the start ref. this can happen after previous umount: + * the cld still hanging there waiting for lock cancel, and we + * remount again but failed in the middle and call log_end without + * calling start_log. + */ + if (unlikely(cld->cld_stopping)) { + mutex_unlock(&cld->cld_lock); + /* drop the ref from the find */ + config_log_put(cld); + RETURN(rc); + } + + spin_lock(&config_list_lock); + cld->cld_stopping = 1; + spin_unlock(&config_list_lock); + + cld_recover = cld->cld_recover; + cld->cld_recover = NULL; + cld_params = cld->cld_params; + cld->cld_params = NULL; + cld_nodemap = cld->cld_nodemap; + cld->cld_nodemap = NULL; + cld_barrier = cld->cld_barrier; + cld->cld_barrier = NULL; + cld_sptlrpc = cld->cld_sptlrpc; + cld->cld_sptlrpc = NULL; + mutex_unlock(&cld->cld_lock); + + config_mark_cld_stop(cld_recover); + config_log_put(cld_recover); + + config_mark_cld_stop(cld_params); + config_log_put(cld_params); + + /* don't set cld_stopping on nm lock as other targets may be active */ + config_log_put(cld_nodemap); + + if (cld_barrier) { + mutex_lock(&cld_barrier->cld_lock); + cld_barrier->cld_stopping = 1; + mutex_unlock(&cld_barrier->cld_lock); + config_log_put(cld_barrier); + } + + config_log_put(cld_sptlrpc); + + /* drop the ref from the find */ + config_log_put(cld); + /* drop the start ref */ + config_log_put(cld); + + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", + rc); + RETURN(rc); +} + +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + struct obd_connect_data *ocd; + struct config_llog_data *cld; + + ENTRY; + LASSERT(obd); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + ocd = &imp->imp_connect_data; + + seq_printf(m, "imperative_recovery: %s\n", + OCD_HAS_FLAG(ocd, IMP_RECOV) ? 
"ENABLED" : "DISABLED"); + seq_printf(m, "client_state:\n"); + + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + if (cld->cld_recover == NULL) + continue; + seq_printf(m, " - { client: %s, nidtbl_version: %u }\n", + cld->cld_logname, + cld->cld_recover->cld_cfg.cfg_last_idx); + } + spin_unlock(&config_list_lock); + + LPROCFS_CLIMP_EXIT(obd); + RETURN(0); +} + +/* reenqueue any lost locks */ +#define RQ_RUNNING 0x1 +#define RQ_NOW 0x2 +#define RQ_LATER 0x4 +#define RQ_STOP 0x8 +#define RQ_PRECLEANUP 0x10 +static int rq_state = 0; +static wait_queue_head_t rq_waitq; +static DECLARE_COMPLETION(rq_exit); +static DECLARE_COMPLETION(rq_start); + +static void do_requeue(struct config_llog_data *cld) +{ + int rc = 0; + ENTRY; + + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* + * Do not run mgc_process_log on a disconnected export or an + * export which is being disconnected. Take the client + * semaphore to make the check non-racy. + */ + down_read_nested(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem, + OBD_CLI_SEM_MGC); + if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { + CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); + rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld); + if (rc && rc != -ENOENT) + CERROR("failed processing log: %d\n", rc); + } else { + CDEBUG(D_MGC, "disconnecting, won't update log %s\n", + cld->cld_logname); + } + up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); + + EXIT; +} + +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. + */ +#define MGC_TIMEOUT_MIN_SECONDS 5 +#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */ + +static int mgc_requeue_thread(void *data) +{ + int rc = 0; + bool first = true; + ENTRY; + + CDEBUG(D_MGC, "Starting requeue thread\n"); + + /* Keep trying failed locks periodically */ + spin_lock(&config_list_lock); + rq_state |= RQ_RUNNING; + while (!(rq_state & RQ_STOP)) { + struct l_wait_info lwi; + struct config_llog_data *cld, *cld_prev; + int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC; + int to; + + /* Any new or requeued lostlocks will change the state */ + rq_state &= ~(RQ_NOW | RQ_LATER); + spin_unlock(&config_list_lock); + + if (first) { + first = false; + complete(&rq_start); + } + + /* Always wait a few seconds to allow the server who + caused the lock revocation to finish its setup, plus some + random so everyone doesn't try to reconnect at once. */ + to = msecs_to_jiffies(MGC_TIMEOUT_MIN_SECONDS * MSEC_PER_SEC); + /* rand is centi-seconds */ + to += msecs_to_jiffies(rand * MSEC_PER_SEC / 100); + lwi = LWI_TIMEOUT(to, NULL, NULL); + l_wait_event(rq_waitq, rq_state & (RQ_STOP | RQ_PRECLEANUP), + &lwi); + + /* + * iterate & processing through the list. for each cld, process + * its depending sptlrpc cld firstly (if any) and then itself. + * + * it's guaranteed any item in the list must have + * reference > 0; and if cld_lostlock is set, at + * least one reference is taken by the previous enqueue. + */ + cld_prev = NULL; + + spin_lock(&config_list_lock); + rq_state &= ~RQ_PRECLEANUP; + list_for_each_entry(cld, &config_llog_list, + cld_list_chain) { + if (!cld->cld_lostlock || cld->cld_stopping) + continue; + + /* hold reference to avoid being freed during + * subsequent processing. 
*/ + config_log_get(cld); + cld->cld_lostlock = 0; + spin_unlock(&config_list_lock); + + config_log_put(cld_prev); + cld_prev = cld; + + if (likely(!(rq_state & RQ_STOP))) { + do_requeue(cld); + spin_lock(&config_list_lock); + } else { + spin_lock(&config_list_lock); + break; + } + } + spin_unlock(&config_list_lock); + config_log_put(cld_prev); + + /* Wait a bit to see if anyone else needs a requeue */ + lwi = (struct l_wait_info) { 0 }; + l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP), + &lwi); + spin_lock(&config_list_lock); + } + + /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */ + rq_state &= ~RQ_RUNNING; + spin_unlock(&config_list_lock); + + complete(&rq_exit); + + CDEBUG(D_MGC, "Ending requeue thread\n"); + RETURN(rc); +} + +/* Add a cld to the list to requeue. Start the requeue thread if needed. + We are responsible for dropping the config log reference from here on out. */ +static void mgc_requeue_add(struct config_llog_data *cld) +{ + bool wakeup = false; + ENTRY; + + CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n", + cld->cld_logname, atomic_read(&cld->cld_refcount), + cld->cld_stopping, rq_state); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + mutex_lock(&cld->cld_lock); + spin_lock(&config_list_lock); + if (!(rq_state & RQ_STOP) && !cld->cld_stopping && !cld->cld_lostlock) { + cld->cld_lostlock = 1; + rq_state |= RQ_NOW; + wakeup = true; + } + spin_unlock(&config_list_lock); + mutex_unlock(&cld->cld_lock); + if (wakeup) + wake_up(&rq_waitq); + + EXIT; +} + +/********************** class fns **********************/ +static int mgc_local_llog_init(const struct lu_env *env, + struct obd_device *obd, + struct obd_device *disk) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_ORIG_CTXT, disk, + &llog_osd_ops); + if (rc) + RETURN(rc); + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = obd->u.cli.cl_mgc_configs_dir; + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mgc_local_llog_fini(const struct lu_env *env, + struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + llog_cleanup(env, ctxt); + + RETURN(0); +} + +static int mgc_fs_setup(const struct lu_env *env, struct obd_device *obd, + struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct client_obd *cli = &obd->u.cli; + struct lu_fid rfid, fid; + struct dt_object *root, *dto; + int rc = 0; + + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_dt_dev); + + /* The mgc fs exclusion mutex. Only one fs can be setup at a time. 
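+	 * On success the mutex stays held until mgc_fs_cleanup() drops it,
+	 * so local config llog access is serialized on a single fs.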
*/ + mutex_lock(&cli->cl_mgc_mutex); + + /* Setup the configs dir */ + fid.f_seq = FID_SEQ_LOCAL_NAME; + fid.f_oid = 1; + fid.f_ver = 0; + rc = local_oid_storage_init(env, lsi->lsi_dt_dev, &fid, + &cli->cl_mgc_los); + if (rc) + RETURN(rc); + + rc = dt_root_get(env, lsi->lsi_dt_dev, &rfid); + if (rc) + GOTO(out_los, rc); + + root = dt_locate_at(env, lsi->lsi_dt_dev, &rfid, + &cli->cl_mgc_los->los_dev->dd_lu_dev, NULL); + if (unlikely(IS_ERR(root))) + GOTO(out_los, rc = PTR_ERR(root)); + + dto = local_file_find_or_create(env, cli->cl_mgc_los, root, + MOUNT_CONFIGS_DIR, + S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + dt_object_put_nocache(env, root); + if (IS_ERR(dto)) + GOTO(out_los, rc = PTR_ERR(dto)); + + cli->cl_mgc_configs_dir = dto; + + LASSERT(lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt); + rc = mgc_local_llog_init(env, obd, lsi->lsi_osd_exp->exp_obd); + if (rc) + GOTO(out_llog, rc); + + /* We take an obd ref to insure that we can't get to mgc_cleanup + * without calling mgc_fs_cleanup first. */ + class_incref(obd, "mgc_fs", obd); + + /* We keep the cl_mgc_sem until mgc_fs_cleanup */ + EXIT; +out_llog: + if (rc) { + dt_object_put(env, cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + } +out_los: + if (rc < 0) { + local_oid_storage_fini(env, cli->cl_mgc_los); + cli->cl_mgc_los = NULL; + mutex_unlock(&cli->cl_mgc_mutex); + } + return rc; +} + +static int mgc_fs_cleanup(const struct lu_env *env, struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + LASSERT(cli->cl_mgc_los != NULL); + + mgc_local_llog_fini(env, obd); + + dt_object_put_nocache(env, cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + + local_oid_storage_fini(env, cli->cl_mgc_los); + cli->cl_mgc_los = NULL; + + class_decref(obd, "mgc_fs", obd); + mutex_unlock(&cli->cl_mgc_mutex); + + RETURN(0); +} + +static int mgc_llog_init(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + /* setup only remote ctxt, the local disk context is switched per each + * filesystem during mgc_fs_setup() */ + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_REPL_CTXT, obd, + &llog_client_ops); + if (rc) + RETURN(rc); + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mgc_llog_fini(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + if (ctxt) + llog_cleanup(env, ctxt); + + RETURN(0); +} + + +static atomic_t mgc_count = ATOMIC_INIT(0); +static int mgc_precleanup(struct obd_device *obd) +{ + int rc = 0; + int temp; + ENTRY; + + if (atomic_dec_and_test(&mgc_count)) { + LASSERT(rq_state & RQ_RUNNING); + /* stop requeue thread */ + temp = RQ_STOP; + } else { + /* wakeup requeue thread to clean our cld */ + temp = RQ_NOW | RQ_PRECLEANUP; + } + + spin_lock(&config_list_lock); + rq_state |= temp; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + if (temp & RQ_STOP) + wait_for_completion(&rq_exit); + obd_cleanup_client_import(obd); + + rc = mgc_llog_fini(NULL, obd); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + + RETURN(rc); +} + +static int mgc_cleanup(struct obd_device *obd) +{ + int rc; + ENTRY; + + /* COMPAT_146 - old config logs may have added profiles we don't + know about */ + if (obd->obd_type->typ_refcnt <= 1) + /* Only for the last mgc */ + class_del_profiles(); + + lprocfs_obd_cleanup(obd); + ptlrpcd_decref(); 
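+	/* The remaining teardown mirrors mgc_setup() in reverse: the sysfs
+	 * entries and the ptlrpcd reference above, the generic client obd
+	 * state below; the llog contexts were already torn down in
+	 * mgc_precleanup(). */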
+ + rc = client_obd_cleanup(obd); + RETURN(rc); +} + +static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct task_struct *task; + int rc; + ENTRY; + + rc = ptlrpcd_addref(); + if (rc < 0) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(err_decref, rc); + + rc = mgc_llog_init(NULL, obd); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + GOTO(err_cleanup, rc); + } + + rc = mgc_tunables_init(obd); + if (rc) + GOTO(err_sysfs, rc); + + if (atomic_inc_return(&mgc_count) == 1) { + rq_state = 0; + init_waitqueue_head(&rq_waitq); + + /* start requeue thread */ + task = kthread_run(mgc_requeue_thread, NULL, "ll_cfg_requeue"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start requeue thread: rc = %d; " + "no more log updates\n", + obd->obd_name, rc); + GOTO(err_sysfs, rc); + } + /* rc is the task_struct pointer of mgc_requeue_thread. */ + rc = 0; + wait_for_completion(&rq_start); + } + + RETURN(rc); + +err_sysfs: + lprocfs_obd_cleanup(obd); +err_cleanup: + client_obd_cleanup(obd); +err_decref: + ptlrpcd_decref(); + RETURN(rc); +} + +/* based on ll_mdc_blocking_ast */ +static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + struct config_llog_data *cld = (struct config_llog_data *)data; + int rc = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + /* mgs wants the lock, give it up... */ + LDLM_DEBUG(lock, "MGC blocking CB"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + break; + case LDLM_CB_CANCELING: + /* We've given up the lock, prepare ourselves to update. */ + LDLM_DEBUG(lock, "MGC cancel CB"); + + CDEBUG(D_MGC, "Lock res "DLDLMRES" (%.8s)\n", + PLDLMRES(lock->l_resource), + (char *)&lock->l_resource->lr_name.name[0]); + + if (!cld) { + CDEBUG(D_INFO, "missing data, won't requeue\n"); + break; + } + + /* held at mgc_process_log(). */ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + lock->l_ast_data = NULL; + /* Are we done with this log? */ + if (cld->cld_stopping) { + CDEBUG(D_MGC, "log %s: stopping, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + /* Make sure not to re-enqueue when the mgc is stopping + (we get called from client_disconnect_export) */ + if (lock->l_conn_export == NULL || + lock->l_conn_export->exp_obd->u.cli.cl_conn_count == 0) { + CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + + /* Re-enqueue now */ + mgc_requeue_add(cld); + config_log_put(cld); + break; + default: + LBUG(); + } + + RETURN(rc); +} + +/* Not sure where this should go... */ +/* This is the timeout value for MGS_CONNECT request plus a ping interval, such + * that we can have a chance to try the secondary MGS if any. */ +#define MGC_ENQUEUE_LIMIT (INITIAL_CONNECT_TIMEOUT + (AT_OFF ? 
0 : at_min) \ + + PING_INTERVAL) +#define MGC_TARGET_REG_LIMIT 10 +#define MGC_TARGET_REG_LIMIT_MAX RECONNECT_DELAY_MAX +#define MGC_SEND_PARAM_LIMIT 10 + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +/* Send parameter to MGS*/ +static int mgc_set_mgs_param(struct obd_export *exp, + struct mgs_send_param *msp) +{ + struct ptlrpc_request *req; + struct mgs_send_param *req_msp, *rep_msp; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION, + MGS_SET_INFO); + if (!req) + RETURN(-ENOMEM); + + req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + if (!req_msp) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + memcpy(req_msp, msp, sizeof(*req_msp)); + ptlrpc_request_set_replen(req); + + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_SEND_PARAM_LIMIT; + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_msp = req_capsule_server_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + memcpy(msp, rep_msp, sizeof(*rep_msp)); + } + + ptlrpc_req_finished(req); + + RETURN(rc); +} +#endif + +/* Take a config lock so we can get cancel notifications */ +static int mgc_enqueue(struct obd_export *exp, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, ldlm_glimpse_callback glimpse_callback, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) +{ + struct config_llog_data *cld = (struct config_llog_data *)data; + struct ldlm_enqueue_info einfo = { + .ei_type = type, + .ei_mode = mode, + .ei_cb_bl = mgc_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = glimpse_callback, + }; + struct ptlrpc_request *req; + int short_limit = cld_is_sptlrpc(cld); + int rc; + ENTRY; + + CDEBUG(D_MGC, "Enqueue for %s (res %#llx)\n", cld->cld_logname, + cld->cld_resid.name[0]); + + /* We need a callback for every lockholder, so don't try to + ldlm_lock_match (see rev 1.1.2.11.2.47) */ + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* check if this is server or client */ + if (cld->cld_cfg.cfg_sb) { + struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb); + if (lsi && IS_SERVER(lsi)) + short_limit = 1; + } + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT; + rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags, + NULL, 0, LVB_T_NONE, lockh, 0); + /* A failed enqueue should still call the mgc_blocking_ast, + where it will be requeued if needed ("grant failed"). */ + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int mgc_cancel(struct obd_export *exp, enum ldlm_mode mode, + struct lustre_handle *lockh) +{ + ENTRY; + + ldlm_lock_decref(lockh, mode); + + RETURN(0); +} + +static void mgc_notify_active(struct obd_device *unused) +{ + /* wakeup mgc_requeue_thread to requeue mgc lock */ + spin_lock(&config_list_lock); + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + /* TODO: Help the MGS rebuild nidtbl. 
-jay */ +} + +/* Send target_reg message to MGS */ +static int mgc_target_register(struct obd_export *exp, + struct mgs_target_info *mti) +{ + struct ptlrpc_request *req; + struct mgs_target_info *req_mti, *rep_mti; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION, + MGS_TARGET_REG); + if (req == NULL) + RETURN(-ENOMEM); + + req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO); + if (!req_mti) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + memcpy(req_mti, mti, sizeof(*req_mti)); + ptlrpc_request_set_replen(req); + CDEBUG(D_MGC, "register %s\n", mti->mti_svname); + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_TARGET_REG_LIMIT; + + /* if the target needs to regenerate the config log in MGS, it's better + * to use some longer limit to let MGC have time to change connection to + * another MGS (or try again with the same MGS) for the target (server) + * will fail and exit if the request expired due to delay limit. */ + if (mti->mti_flags & (LDD_F_UPDATE | LDD_F_NEED_INDEX)) + req->rq_delay_limit = MGC_TARGET_REG_LIMIT_MAX; + + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_mti = req_capsule_server_get(&req->rq_pill, + &RMF_MGS_TARGET_INFO); + memcpy(mti, rep_mti, sizeof(*rep_mti)); + CDEBUG(D_MGC, "register %s got index = %d\n", + mti->mti_svname, mti->mti_stripe_index); + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc = -EINVAL; + ENTRY; + + /* Turn off initial_recov after we try all backup servers once */ + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { + struct obd_import *imp = class_exp2cliimp(exp); + int value; + if (vallen != sizeof(int)) + RETURN(-EINVAL); + value = *(int *)val; + CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n", + imp->imp_obd->obd_name, value, + imp->imp_deactive, imp->imp_invalid, + imp->imp_replayable, imp->imp_obd->obd_replayable, + ptlrpc_import_state_name(imp->imp_state)); + /* Resurrect the import immediately if + * 1. we previously got disconnected, + * 2. value > 1 (at the same node with MGS) + * */ + if (imp->imp_state == LUSTRE_IMP_DISCON || value > 1) + ptlrpc_reconnect_import(imp); + + RETURN(0); + } + + /* FIXME move this to mgc_process_config */ + if (KEY_IS(KEY_REGISTER_TARGET)) { + struct mgs_target_info *mti; + if (vallen != sizeof(struct mgs_target_info)) + RETURN(-EINVAL); + mti = (struct mgs_target_info *)val; + CDEBUG(D_MGC, "register_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(exp, mti); + RETURN(rc); + } + if (KEY_IS(KEY_SET_FS)) { + struct super_block *sb = (struct super_block *)val; + + if (vallen != sizeof(struct super_block)) + RETURN(-EINVAL); + + rc = mgc_fs_setup(env, exp->exp_obd, sb); + RETURN(rc); + } + if (KEY_IS(KEY_CLEAR_FS)) { + if (vallen != 0) + RETURN(-EINVAL); + rc = mgc_fs_cleanup(env, exp->exp_obd); + RETURN(rc); + } +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) + if (KEY_IS(KEY_SET_INFO)) { + struct mgs_send_param *msp; + + msp = (struct mgs_send_param *)val; + rc = mgc_set_mgs_param(exp, msp); + RETURN(rc); + } +#endif + if (KEY_IS(KEY_MGSSEC)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + struct sptlrpc_flavor flvr; + + /* + * empty string means using current flavor, if which haven't + * been set yet, set it as null. 
+ * + * if flavor has been set previously, check the asking flavor + * must match the existing one. + */ + if (vallen == 0) { + if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) + RETURN(0); + val = "null"; + vallen = 4; + } + + rc = sptlrpc_parse_flavor(val, &flvr); + if (rc) { + CERROR("invalid sptlrpc flavor %s to MGS\n", + (char *) val); + RETURN(rc); + } + + /* + * caller already hold a mutex + */ + if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { + cli->cl_flvr_mgc = flvr; + } else if (memcmp(&cli->cl_flvr_mgc, &flvr, + sizeof(flvr)) != 0) { + char str[20]; + + sptlrpc_flavor2name(&cli->cl_flvr_mgc, + str, sizeof(str)); + LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but " + "currently %s is in use\n", + (char *) val, str); + rc = -EPERM; + } + RETURN(rc); + } + + RETURN(rc); +} + +static int mgc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen == sizeof(*data)) { + *data = imp->imp_connect_data; + rc = 0; + } + } + + return rc; +} + +static int mgc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + CDEBUG(D_MGC, "import event %#x\n", event); + + switch (event) { + case IMP_EVENT_DISCON: + /* MGC imports should not wait for recovery */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_down(); + break; + case IMP_EVENT_INACTIVE: + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + break; + } + case IMP_EVENT_ACTIVE: + CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name); + /* Clearing obd_no_recov allows us to continue pinging */ + obd->obd_no_recov = 0; + mgc_notify_active(obd); + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_up(); + break; + case IMP_EVENT_OCD: + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %#x\n", event); + LBUG(); + } + RETURN(rc); +} + +enum { + CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_SHIFT), + CONFIG_READ_NRPAGES = 4 +}; + +static int mgc_apply_recover_logs(struct obd_device *mgc, + struct config_llog_data *cld, + __u64 max_version, + void *data, int datalen, bool mne_swab) +{ + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char *inst; + char *buf; + int bufsz; + int pos; + int rc = 0; + int off = 0; + + ENTRY; + LASSERT(cfg->cfg_instance != 0); + LASSERT(ll_get_cfg_instance(cfg->cfg_sb) == cfg->cfg_instance); + + OBD_ALLOC(inst, PAGE_SIZE); + if (inst == NULL) + RETURN(-ENOMEM); + + if (!IS_SERVER(lsi)) { + pos = snprintf(inst, PAGE_SIZE, "%016lx", cfg->cfg_instance); + if (pos >= PAGE_SIZE) { + OBD_FREE(inst, PAGE_SIZE); + return -E2BIG; + } + } else { + LASSERT(IS_MDT(lsi)); + rc = server_name2svname(lsi->lsi_svname, inst, NULL, + PAGE_SIZE); + if (rc) { + OBD_FREE(inst, PAGE_SIZE); + RETURN(-EINVAL); + } + pos = strlen(inst); + } + + ++pos; + buf = inst + pos; + bufsz = PAGE_SIZE - pos; + + while (datalen > 0) { + int entry_len = sizeof(*entry); + int is_ost, i; + struct obd_device *obd; + char *obdname; + char *cname; + char *params; + char *uuid; + 
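+		/* Each mgs_nidtbl_entry is a fixed header followed by
+		 * mne_nid_count NIDs of mne_nid_size bytes, packed back to
+		 * back in the bulk buffer and sorted by mne_version; the
+		 * checks below reject anything that violates that layout
+		 * before the entry is applied. */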
+ rc = -EINVAL; + if (datalen < sizeof(*entry)) + break; + + entry = (typeof(entry))(data + off); + + /* sanity check */ + if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */ + break; + if (entry->mne_nid_count == 0) /* at least one nid entry */ + break; + if (entry->mne_nid_size != sizeof(lnet_nid_t)) + break; + + entry_len += entry->mne_nid_count * entry->mne_nid_size; + if (datalen < entry_len) /* must have entry_len at least */ + break; + + /* Keep this swab for normal mixed endian handling. LU-1644 */ + if (mne_swab) + lustre_swab_mgs_nidtbl_entry(entry); + if (entry->mne_length > PAGE_SIZE) { + CERROR("MNE too large (%u)\n", entry->mne_length); + break; + } + + if (entry->mne_length < entry_len) + break; + + off += entry->mne_length; + datalen -= entry->mne_length; + if (datalen < 0) + break; + + if (entry->mne_version > max_version) { + CERROR("entry index(%lld) is over max_index(%lld)\n", + entry->mne_version, max_version); + break; + } + + if (prev_version >= entry->mne_version) { + CERROR("index unsorted, prev %lld, now %lld\n", + prev_version, entry->mne_version); + break; + } + prev_version = entry->mne_version; + + /* + * Write a string with format "nid::instance" to + * lustre//--/import. + */ + + is_ost = entry->mne_type == LDD_F_SV_TYPE_OST; + memset(buf, 0, bufsz); + obdname = buf; + pos = 0; + + /* lustre-OST0001-osc- */ + strcpy(obdname, cld->cld_logname); + cname = strrchr(obdname, '-'); + if (cname == NULL) { + CERROR("mgc %s: invalid logname %s\n", + mgc->obd_name, obdname); + break; + } + + pos = cname - obdname; + obdname[pos] = 0; + pos += sprintf(obdname + pos, "-%s%04x", + is_ost ? "OST" : "MDT", entry->mne_index); + + cname = is_ost ? "osc" : "mdc", + pos += sprintf(obdname + pos, "-%s-%s", cname, inst); + lustre_cfg_bufs_reset(&bufs, obdname); + + /* find the obd by obdname */ + obd = class_name2obd(obdname); + if (obd == NULL) { + CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n", + mgc->obd_name, obdname); + rc = 0; + /* this is a safe race, when the ost is starting up...*/ + continue; + } + + /* osc.import = "connection=::" */ + ++pos; + params = buf + pos; + pos += sprintf(params, "%s.import=%s", cname, "connection="); + uuid = buf + pos; + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + /* client does not connect to the OST yet */ + up_read(&obd->u.cli.cl_sem); + rc = 0; + continue; + } + + /* iterate all nids to find one */ + /* find uuid by nid */ + rc = -ENOENT; + for (i = 0; i < entry->mne_nid_count; i++) { + rc = client_import_find_conn(obd->u.cli.cl_import, + entry->u.nids[i], + (struct obd_uuid *)uuid); + if (rc == 0) + break; + } + + up_read(&obd->u.cli.cl_sem); + if (rc < 0) { + CERROR("mgc: cannot find uuid by nid %s\n", + libcfs_nid2str(entry->u.nids[0])); + break; + } + + CDEBUG(D_INFO, "Find uuid %s by nid %s\n", + uuid, libcfs_nid2str(entry->u.nids[0])); + + pos += strlen(uuid); + pos += sprintf(buf + pos, "::%u", entry->mne_instance); + LASSERT(pos < bufsz); + + lustre_cfg_bufs_set_string(&bufs, 1, params); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, + bufs.lcfg_buflen)); + if (!lcfg) { + rc = -ENOMEM; + break; + } + lustre_cfg_init(lcfg, LCFG_PARAM, &bufs); + + CDEBUG(D_INFO, "ir apply logs %lld/%lld for %s -> %s\n", + prev_version, max_version, obdname, params); + + rc = class_process_config(lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); + if (rc) + CDEBUG(D_INFO, "process config for %s error %d\n", + obdname, rc); + + /* continue, even one with error */ 
+ } + + OBD_FREE(inst, PAGE_SIZE); + RETURN(rc); +} + +/** + * This function is called if this client was notified for target restarting + * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery or + * nodemap logs. + */ +static int mgc_process_recover_nodemap_log(struct obd_device *obd, + struct config_llog_data *cld) +{ + struct ptlrpc_connection *mgc_conn; + struct ptlrpc_request *req = NULL; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct mgs_config_body *body; + struct mgs_config_res *res; + struct nodemap_config *new_config = NULL; + struct lu_nodemap *recent_nodemap = NULL; + struct ptlrpc_bulk_desc *desc; + struct page **pages = NULL; + __u64 config_read_offset = 0; + __u8 nodemap_cur_pass = 0; + int nrpages = 0; + bool eof = true; + bool mne_swab = false; + int i; + int ealen; + int rc; + ENTRY; + + mgc_conn = class_exp2cliimp(cld->cld_mgcexp)->imp_connection; + + /* don't need to get local config */ + if (cld_is_nodemap(cld) && LNetIsPeerLocal(mgc_conn->c_peer.nid)) + GOTO(out, rc = 0); + + /* allocate buffer for bulk transfer. + * if this is the first time for this mgs to read logs, + * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs + * once; otherwise, it only reads increment of logs, this should be + * small and CONFIG_READ_NRPAGES will be used. + */ + nrpages = CONFIG_READ_NRPAGES; + if (cfg->cfg_last_idx == 0 || cld_is_nodemap(cld)) + nrpages = CONFIG_READ_NRPAGES_INIT; + + OBD_ALLOC(pages, sizeof(*pages) * nrpages); + if (pages == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + +again: +#ifdef HAVE_SERVER_SUPPORT + if (cld_is_nodemap(cld) && config_read_offset == 0) { + new_config = nodemap_config_alloc(); + if (IS_ERR(new_config)) { + rc = PTR_ERR(new_config); + new_config = NULL; + GOTO(out, rc); + } + } +#endif + LASSERT(cld_is_recover(cld) || cld_is_nodemap(cld)); + LASSERT(mutex_is_locked(&cld->cld_lock)); + req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), + &RQF_MGS_CONFIG_READ); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ); + if (rc) + GOTO(out, rc); + + /* pack request */ + body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY); + LASSERT(body != NULL); + LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname)); + if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name)) + >= sizeof(body->mcb_name)) + GOTO(out, rc = -E2BIG); + if (cld_is_nodemap(cld)) + body->mcb_offset = config_read_offset; + else + body->mcb_offset = cfg->cfg_last_idx + 1; + body->mcb_type = cld->cld_type; + body->mcb_bits = PAGE_SHIFT; + body->mcb_units = nrpages; + body->mcb_nm_cur_pass = nodemap_cur_pass; + + /* allocate bulk transfer descriptor */ + desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, + PTLRPC_BULK_PUT_SINK | PTLRPC_BULK_BUF_KIOV, + MGS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, + PAGE_SIZE); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); + if (!res) + GOTO(out, rc = -EPROTO); + + if (cld_is_nodemap(cld)) { + config_read_offset = res->mcr_offset; + eof = config_read_offset == II_END_OFF; + nodemap_cur_pass = res->mcr_nm_cur_pass; + } else { + if (res->mcr_size < 
res->mcr_offset) + GOTO(out, rc = -EINVAL); + + /* always update the index even though it might have errors with + * handling the recover logs + */ + cfg->cfg_last_idx = res->mcr_offset; + eof = res->mcr_offset == res->mcr_size; + + CDEBUG(D_INFO, "Latest version %lld, more %d.\n", + res->mcr_offset, eof == false); + } + + ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0); + if (ealen < 0) + GOTO(out, rc = ealen); + + if (ealen > nrpages << PAGE_SHIFT) + GOTO(out, rc = -EINVAL); + + if (ealen == 0) { /* no logs transferred */ +#ifdef HAVE_SERVER_SUPPORT + /* config changed since first read RPC */ + if (cld_is_nodemap(cld) && config_read_offset == 0) { + CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n"); + GOTO(out, rc = -EAGAIN); + } +#endif + if (!eof) + rc = -EINVAL; + GOTO(out, rc); + } + + mne_swab = ptlrpc_rep_need_swab(req); + + /* When a nodemap config is received, we build a new nodemap config, + * with new nodemap structs. We keep track of the most recently added + * nodemap since the config is read ordered by nodemap_id, and so it + * is likely that the next record will be related. Because access to + * the nodemaps is single threaded until the nodemap_config is active, + * we don't need to reference count with recent_nodemap, though + * recent_nodemap should be set to NULL when the nodemap_config + * is either destroyed or set active. + */ + for (i = 0; i < nrpages && ealen > 0; i++) { + int rc2; + union lu_page *ptr; + + ptr = kmap(pages[i]); + if (cld_is_nodemap(cld)) + rc2 = nodemap_process_idx_pages(new_config, ptr, + &recent_nodemap); + else + rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, + ptr, + min_t(int, ealen, + PAGE_SIZE), + mne_swab); + kunmap(pages[i]); + if (rc2 < 0) { + CWARN("%s: error processing %s log %s: rc = %d\n", + obd->obd_name, + cld_is_nodemap(cld) ? 
"nodemap" : "recovery", + cld->cld_logname, + rc2); + GOTO(out, rc = rc2); + } + + ealen -= PAGE_SIZE; + } + +out: + if (req) { + ptlrpc_req_finished(req); + req = NULL; + } + + if (rc == 0 && !eof) + goto again; + +#ifdef HAVE_SERVER_SUPPORT + if (new_config != NULL) { + /* recent_nodemap cannot be used after set_active/dealloc */ + if (rc == 0) + nodemap_config_set_active_mgc(new_config); + else + nodemap_config_dealloc(new_config); + } +#endif + + if (pages) { + for (i = 0; i < nrpages; i++) { + if (pages[i] == NULL) + break; + __free_page(pages[i]); + } + OBD_FREE(pages, sizeof(*pages) * nrpages); + } + return rc; +} + +static int mgc_barrier_glimpse_ast(struct ldlm_lock *lock, void *data) +{ + struct config_llog_data *cld = lock->l_ast_data; + int rc; + ENTRY; + + if (cld->cld_stopping) + RETURN(-ENODEV); + + rc = barrier_handler(s2lsi(cld->cld_cfg.cfg_sb)->lsi_dt_dev, + (struct ptlrpc_request *)data); + + RETURN(rc); +} + +/* Copy a remote log locally */ +static int mgc_llog_local_copy(const struct lu_env *env, + struct obd_device *obd, + struct llog_ctxt *rctxt, + struct llog_ctxt *lctxt, char *logname) +{ + char *temp_log; + int rc; + + ENTRY; + + /* + * - copy it to backup using llog_backup() + * - copy remote llog to logname using llog_backup() + * - if failed then move bakup to logname again + */ + + OBD_ALLOC(temp_log, strlen(logname) + 2); + if (!temp_log) + RETURN(-ENOMEM); + sprintf(temp_log, "%sT", logname); + + /* make a copy of local llog at first */ + rc = llog_backup(env, obd, lctxt, lctxt, logname, temp_log); + if (rc < 0 && rc != -ENOENT) + GOTO(out, rc); + /* copy remote llog to the local copy */ + rc = llog_backup(env, obd, rctxt, lctxt, logname, logname); + if (rc == -ENOENT) { + /* no remote llog, delete local one too */ + llog_erase(env, lctxt, NULL, logname); + } else if (rc < 0) { + /* error during backup, get local one back from the copy */ + llog_backup(env, obd, lctxt, lctxt, temp_log, logname); +out: + CERROR("%s: failed to copy remote log %s: rc = %d\n", + obd->obd_name, logname, rc); + } + llog_erase(env, lctxt, NULL, temp_log); + OBD_FREE(temp_log, strlen(logname) + 2); + return rc; +} + +/* local_only means it cannot get remote llogs */ +static int mgc_process_cfg_log(struct obd_device *mgc, + struct config_llog_data *cld, int local_only) +{ + struct llog_ctxt *ctxt, *lctxt = NULL; + struct client_obd *cli = &mgc->u.cli; + struct lustre_sb_info *lsi = NULL; + int rc = 0; + struct lu_env *env; + + ENTRY; + + LASSERT(cld); + LASSERT(mutex_is_locked(&cld->cld_lock)); + + if (cld->cld_cfg.cfg_sb) + lsi = s2lsi(cld->cld_cfg.cfg_sb); + + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = lu_env_init(env, LCT_MG_THREAD); + if (rc) + GOTO(out_free, rc); + + ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT); + + /* Copy the setup log locally if we can. Don't mess around if we're + * running an MGS though (logs are already local). */ + if (lctxt && lsi && IS_SERVER(lsi) && !IS_MGS(lsi) && + cli->cl_mgc_configs_dir != NULL && + lu2dt_dev(cli->cl_mgc_configs_dir->do_lu.lo_dev) == + lsi->lsi_dt_dev) { + if (!local_only && !lsi->lsi_dt_dev->dd_rdonly) + /* Only try to copy log if we have the lock. */ + rc = mgc_llog_local_copy(env, mgc, ctxt, lctxt, + cld->cld_logname); + if (local_only || rc) { + if (strcmp(cld->cld_logname, PARAMS_FILENAME) != 0 && + llog_is_empty(env, lctxt, cld->cld_logname)) { + LCONSOLE_ERROR_MSG(0x13a, "Failed to get MGS " + "log %s and no local copy." 
+ "\n", cld->cld_logname); + GOTO(out_pop, rc = -ENOENT); + } + CDEBUG(D_MGC, "Failed to get MGS log %s, using local " + "copy for now, will try to update later.\n", + cld->cld_logname); + rc = 0; + } + /* Now, whether we copied or not, start using the local llog. + * If we failed to copy, we'll start using whatever the old + * log has. */ + llog_ctxt_put(ctxt); + ctxt = lctxt; + lctxt = NULL; + } else { + if (local_only) /* no local log at client side */ + GOTO(out_pop, rc = -EIO); + } + + rc = -EAGAIN; + if (lsi && IS_SERVER(lsi) && !IS_MGS(lsi) && + lsi->lsi_dt_dev->dd_rdonly) { + struct llog_ctxt *rctxt; + + /* Under readonly mode, we may have no local copy or local + * copy is incomplete, so try to use remote llog firstly. */ + rctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(rctxt); + + rc = class_config_parse_llog(env, rctxt, cld->cld_logname, + &cld->cld_cfg); + llog_ctxt_put(rctxt); + } + + if (rc && rc != -ENOENT) + rc = class_config_parse_llog(env, ctxt, cld->cld_logname, + &cld->cld_cfg); + + /* + * update settings on existing OBDs. doing it inside + * of llog_process_lock so no device is attaching/detaching + * in parallel. + * the logname must be -sptlrpc + */ + if (rc == 0 && cld_is_sptlrpc(cld)) + class_notify_sptlrpc_conf(cld->cld_logname, + strlen(cld->cld_logname) - + strlen("-sptlrpc")); + EXIT; + +out_pop: + __llog_ctxt_put(env, ctxt); + if (lctxt) + __llog_ctxt_put(env, lctxt); + + lu_env_fini(env); +out_free: + OBD_FREE_PTR(env); + return rc; +} + +static bool mgc_import_in_recovery(struct obd_import *imp) +{ + bool in_recovery = true; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED) + in_recovery = false; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} + +/** + * Get a configuration log from the MGS and process it. + * + * This function is called for both clients and servers to process the + * configuration log from the MGS. The MGC enqueues a DLM lock on the + * log from the MGS, and if the lock gets revoked the MGC will be notified + * by the lock cancellation callback that the config log has changed, + * and will enqueue another MGS lock on it, and then continue processing + * the new additions to the end of the log. + * + * Since the MGC import is not replayable, if the import is being evicted + * (rcl == -ESHUTDOWN, \see ptlrpc_import_delay_req()), retry to process + * the log until recovery is finished or the import is closed. + * + * Make a local copy of the log before parsing it if appropriate (non-MGS + * server) so that the server can start even when the MGS is down. + * + * There shouldn't be multiple processes running process_log at once -- + * sounds like badness. It actually might be fine, as long as they're not + * trying to update from the same log simultaneously, in which case we + * should use a per-log semaphore instead of cld_lock. + * + * \param[in] mgc MGC device by which to fetch the configuration log + * \param[in] cld log processing state (stored in lock callback data) + * + * \retval 0 on success + * \retval negative errno on failure + */ +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) +{ + struct lustre_handle lockh = { 0 }; + __u64 flags = LDLM_FL_NO_LRU; + int rc = 0, rcl; + bool retry = false; + ENTRY; + + LASSERT(cld != NULL); + + /* I don't want multiple processes running process_log at once -- + sounds like badness. 
It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) */ +restart: + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + + CDEBUG(D_MGC, "Process log %s-%016lx from %d\n", cld->cld_logname, + cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); + + /* Get the cfg lock on the llog */ + rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, LDLM_PLAIN, NULL, + LCK_CR, &flags, + cld_is_barrier(cld) ? mgc_barrier_glimpse_ast : NULL, + cld, 0, NULL, &lockh); + if (rcl == 0) { + /* Get the cld, it will be released in mgc_blocking_ast. */ + config_log_get(cld); + rc = ldlm_lock_set_data(&lockh, (void *)cld); + LASSERT(rc == 0); + } else { + CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl); + + if (rcl == -ESHUTDOWN && + atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) { + struct obd_import *imp; + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + + mutex_unlock(&cld->cld_lock); + imp = class_exp2cliimp(mgc->u.cli.cl_mgc_mgsexp); + + /* Let's force the pinger, and wait the import to be + * connected, note: since mgc import is non-replayable, + * and even the import state is disconnected, it does + * not mean the "recovery" is stopped, so we will keep + * waitting until timeout or the import state is + * FULL or closed */ + ptlrpc_pinger_force(imp); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + l_wait_event(imp->imp_recovery_waitq, + !mgc_import_in_recovery(imp), &lwi); + + if (imp->imp_state == LUSTRE_IMP_FULL) { + retry = true; + goto restart; + } else { + mutex_lock(&cld->cld_lock); + /* unlock/lock mutex, so check stopping again */ + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + } else { + /* mark cld_lostlock so that it will requeue + * after MGC becomes available. */ + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + } + + if (cld_is_recover(cld) || cld_is_nodemap(cld)) { + if (!rcl) + rc = mgc_process_recover_nodemap_log(mgc, cld); + else if (cld_is_nodemap(cld)) + rc = rcl; + + if (cld_is_recover(cld) && rc) { + if (!rcl) { + CERROR("%s: recover log %s failed, not fatal: rc = %d\n", + mgc->obd_name, cld->cld_logname, rc); + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + rc = 0; /* this is not a fatal error for recover log */ + } + } else if (!cld_is_barrier(cld)) { + rc = mgc_process_cfg_log(mgc, cld, rcl != 0); + } + + CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n", + mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc); + + mutex_unlock(&cld->cld_lock); + + /* Now drop the lock so MGS can revoke it */ + if (!rcl) { + rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, LCK_CR, &lockh); + if (rcl) + CERROR("Can't drop cfg lock: %d\n", rcl); + } + + /* requeue nodemap lock immediately if transfer was interrupted */ + if (cld_is_nodemap(cld) && rc == -EAGAIN) { + mgc_requeue_add(cld); + rc = 0; + } + + RETURN(rc); +} + + +/** Called from lustre_process_log. + * LCFG_LOG_START gets the config log from the MGS, processes it to start + * any services, and adds it to the list logs to watch (follow). 
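+ * LCFG_LOG_END stops watching a log again via config_log_end(), and
+ * LCFG_LOV_ADD_OBD is overloaded here to register a new target with the
+ * MGS through mgc_target_register().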
+ */ +static int mgc_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct config_llog_instance *cfg = NULL; + char *logname; + int rc = 0; + ENTRY; + + switch(lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: { + /* Overloading this cfg command: register a new target */ + struct mgs_target_info *mti; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) != + sizeof(struct mgs_target_info)) + GOTO(out, rc = -EINVAL); + + mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); + CDEBUG(D_MGC, "add_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); + break; + } + case LCFG_LOV_DEL_OBD: + /* Unregister has no meaning at the moment. */ + CERROR("lov_del_obd unimplemented\n"); + rc = -ENOSYS; + break; + case LCFG_SPTLRPC_CONF: { + rc = sptlrpc_process_config(lcfg); + break; + } + case LCFG_LOG_START: { + struct config_llog_data *cld; + struct super_block *sb; + + logname = lustre_cfg_string(lcfg, 1); + cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); + sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); + + CDEBUG(D_MGC, "parse_log %s from %d\n", logname, + cfg->cfg_last_idx); + + /* We're only called through here on the initial mount */ + cld = config_log_add(obd, logname, cfg, sb); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + break; + } + + rc = mgc_process_log(obd, cld); + if (rc == 0 && cld->cld_recover != NULL) { + if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> + imp_connect_data, IMP_RECOV)) { + rc = mgc_process_log(obd, cld->cld_recover); + } else { + struct config_llog_data *cir; + + mutex_lock(&cld->cld_lock); + cir = cld->cld_recover; + cld->cld_recover = NULL; + mutex_unlock(&cld->cld_lock); + config_log_put(cir); + } + + if (rc) + CERROR("Cannot process recover llog %d\n", rc); + } + + if (rc == 0 && cld->cld_params != NULL) { + rc = mgc_process_log(obd, cld->cld_params); + if (rc == -ENOENT) { + CDEBUG(D_MGC, "There is no params " + "config file yet\n"); + rc = 0; + } + /* params log is optional */ + if (rc) + CERROR("%s: can't process params llog: rc = %d\n", + obd->obd_name, rc); + } + + break; + } + case LCFG_LOG_END: { + logname = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_bufcount >= 2) + cfg = (struct config_llog_instance *)lustre_cfg_buf( + lcfg, 2); + rc = config_log_end(logname, cfg); + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + .o_set_info_async = mgc_set_info_async, + .o_get_info = mgc_get_info, + .o_import_event = mgc_import_event, + .o_process_config = mgc_process_config, +}; + +static int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, false, NULL, + LUSTRE_MGC_NAME, NULL); +} + +static void __exit mgc_exit(void) +{ + class_unregister_type(LUSTRE_MGC_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre Management Client"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(mgc_init); +module_exit(mgc_exit); diff --git a/drivers/staging/lustrefsx/lustre/nodist b/drivers/staging/lustrefsx/lustre/nodist new file mode 100644 index 0000000000000..24f55bb96b97d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/nodist @@ -0,0 +1,9 @@ +obd-*/obd-* +CVS +*~ +make.rules +config.* +*.o +*.orig +*.backup +.depfiles diff --git a/drivers/staging/lustrefsx/lustre/obdclass/Makefile b/drivers/staging/lustrefsx/lustre/obdclass/Makefile new file mode 100644 index 0000000000000..b2db59390dd4b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/Makefile @@ -0,0 +1,14 @@ +obj-$(CONFIG_LUSTREFSX_FS) += obdclass.o + +obdclass-y := llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o +obdclass-y += class_obd.o debug.o genops.o llog_ioctl.o +obdclass-y += lprocfs_status.o lprocfs_counters.o +obdclass-y += lustre_handles.o lustre_peer.o local_storage.o +obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o obd_sysfs.o +obdclass-y += lu_object.o dt_object.o +obdclass-y += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o +obdclass-y += linkea.o kernelcomm.o jobid.o +obdclass-y += integrity.o obd_cksum.o +obdclass-y += lu_tgt_descs.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdclass/acl.c b/drivers/staging/lustrefsx/lustre/obdclass/acl.c new file mode 100644 index 0000000000000..599946f846ec3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/acl.c @@ -0,0 +1,283 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/acl.c + * + * Lustre Access Control List. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +#endif /* HAVE_SERVER_SUPPORT */ + +#ifdef CONFIG_FS_POSIX_ACL + +static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); +} + +#if 0 +static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +} +#endif + +/* + * Check permission based on POSIX ACL. 
+ */ +int lustre_posix_acl_permission(struct lu_ucred *mu, const struct lu_attr *la, + int want, posix_acl_xattr_entry *entry, + int count) +{ + posix_acl_xattr_entry *pa, *pe, *mask_obj; + posix_acl_xattr_entry ae, me; + int found = 0; + + if (count <= 0) + return -EACCES; + + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + /* (May have been checked already) */ + if (la->la_uid == mu->uc_fsuid) + goto check_perm; + break; + case ACL_USER: + if (ae.e_id == mu->uc_fsuid) + goto mask; + break; + case ACL_GROUP_OBJ: + if (lustre_in_group_p(mu, la->la_gid)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_GROUP: + if (lustre_in_group_p(mu, ae.e_id)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (found) + return -EACCES; + goto check_perm; + default: + return -EIO; +} + } + return -EIO; + +mask: + for (mask_obj = pa + 1; mask_obj <= pe; mask_obj++) { + lustre_posix_acl_le_to_cpu(&me, mask_obj); + if (me.e_tag == ACL_MASK) { + if ((ae.e_perm & me.e_perm & want) == want) + return 0; + + return -EACCES; + } + } + +check_perm: + if ((ae.e_perm & want) == want) + return 0; + + return -EACCES; +} +EXPORT_SYMBOL(lustre_posix_acl_permission); + +/* + * Modify the ACL for the chmod. + */ +int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, u32 mode, + int count) +{ + posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; + + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + switch (le16_to_cpu(pa->e_tag)) { + case ACL_USER_OBJ: + pa->e_perm = cpu_to_le16((mode & S_IRWXU) >> 6); + break; + case ACL_USER: + case ACL_GROUP: + break; + case ACL_GROUP_OBJ: + group_obj = pa; + break; + case ACL_MASK: + mask_obj = pa; + break; + case ACL_OTHER: + pa->e_perm = cpu_to_le16(mode & S_IRWXO); + break; + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm = cpu_to_le16((mode & S_IRWXG) >> 3); + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm = cpu_to_le16((mode & S_IRWXG) >> 3); + } + + return 0; +} +EXPORT_SYMBOL(lustre_posix_acl_chmod_masq); + +/* + * Returns 0 if the acl can be exactly represented in the traditional + * file mode permission bits, or else 1. Returns -E... on error. + */ +int +lustre_posix_acl_equiv_mode(posix_acl_xattr_entry *entry, mode_t *mode_p, + int count) +{ + posix_acl_xattr_entry *pa, *pe; + mode_t mode = 0; + int not_equiv = 0; + + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + __u16 perm = le16_to_cpu(pa->e_perm); + switch (le16_to_cpu(pa->e_tag)) { + case ACL_USER_OBJ: + mode |= (perm & S_IRWXO) << 6; + break; + case ACL_GROUP_OBJ: + mode |= (perm & S_IRWXO) << 3; + break; + case ACL_OTHER: + mode |= perm & S_IRWXO; + break; + case ACL_MASK: + mode = (mode & ~S_IRWXG) | + ((perm & S_IRWXO) << 3); + not_equiv = 1; + break; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + default: + return -EINVAL; + } + } + if (mode_p) + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} +EXPORT_SYMBOL(lustre_posix_acl_equiv_mode); + +/* + * Modify acl when creating a new object. 
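+ *
+ * The inherited entries are clamped by the create mode: ACL_USER_OBJ by
+ * the owner bits, ACL_MASK (or ACL_GROUP_OBJ when no mask exists) by the
+ * group bits and ACL_OTHER by the other bits, while *pmode is narrowed
+ * to what the ACL actually grants.  A non-zero return means the ACL has
+ * entries the mode bits alone cannot express and must be kept.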
+ */ +int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, u32 *pmode, + int count) +{ + posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; + posix_acl_xattr_entry ae; + u32 mode = *pmode; + int not_equiv = 0; + + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + ae.e_perm &= (mode >> 6) | ~(0007); + pa->e_perm = cpu_to_le16(ae.e_perm); + mode &= (ae.e_perm << 6) | ~S_IRWXU; + break; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + case ACL_GROUP_OBJ: + group_obj = pa; + break; + case ACL_OTHER: + ae.e_perm &= mode | ~(0007); + pa->e_perm = cpu_to_le16(ae.e_perm); + mode &= ae.e_perm | ~(0007); + break; + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + default: + return -EIO; + } + } + + if (mask_obj) { + ae.e_perm = le16_to_cpu(mask_obj->e_perm) & + ((mode >> 3) | ~(0007)); + mode &= (ae.e_perm << 3) | ~S_IRWXG; + mask_obj->e_perm = cpu_to_le16(ae.e_perm); + } else { + if (!group_obj) + return -EIO; + ae.e_perm = le16_to_cpu(group_obj->e_perm) & + ((mode >> 3) | ~(0007)); + mode &= (ae.e_perm << 3) | ~S_IRWXG; + group_obj->e_perm = cpu_to_le16(ae.e_perm); + } + + *pmode = (*pmode & ~S_IRWXUGO) | mode; + return not_equiv; +} +EXPORT_SYMBOL(lustre_posix_acl_create_masq); +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h new file mode 100644 index 0000000000000..0c1276deb37bc --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal cl interfaces. + * + * Author: Nikita Danilov + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /** + * Used for submitting a sync I/O. 
+ */ + struct cl_sync_io clt_anchor; +}; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +#endif /* _CL_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c new file mode 100644 index 0000000000000..181ef89299b2d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c @@ -0,0 +1,1237 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client IO. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. + */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. 
+ */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + + /* sanity check for layout change */ + switch(io->ci_type) { + case CIT_READ: + case CIT_WRITE: + case CIT_DATA_VERSION: + case CIT_FAULT: + break; + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + case CIT_GLIMPSE: + break; + case CIT_LADVISE: + break; + default: + LBUG(); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + RETURN(result); +} + +/** + * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + LASSERT(obj != cl_object_top(obj)); + + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + LASSERT(obj == cl_object_top(obj)); + + /* clear I/O restart from previous instance */ + io->ci_need_restart = 0; + + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. + * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + ENTRY; + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io range: %u [%llu, %llu) %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + RETURN(cl_io_init(env, io, iot, io->ci_obj)); +} +EXPORT_SYMBOL(cl_io_rw_init); + +static int cl_lock_descr_sort(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu), + lu_object_fid(&d1->cld_obj->co_lu)); +} + +/* + * Sort locks in lexicographical order of their (fid, start-offset) pairs. + */ +static void cl_io_locks_sort(struct cl_io *io) +{ + int done = 0; + + ENTRY; + /* hidden treasure: bubble sort for now. 
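+	 * Lock sets are expected to stay small, so a quadratic pass is
+	 * acceptable; what matters is the canonical (fid, start-offset)
+	 * order, which gives every thread the same acquisition order and
+	 * so avoids deadlocks (see cl_io_lock()).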
*/ + do { + struct cl_io_lock_link *curr; + struct cl_io_lock_link *prev; + struct cl_io_lock_link *temp; + + done = 1; + prev = NULL; + + list_for_each_entry_safe(curr, temp, &io->ci_lockset.cls_todo, + cill_linkage) { + if (prev != NULL) { + switch (cl_lock_descr_sort(&prev->cill_descr, + &curr->cill_descr)) { + case 0: + /* + * IMPOSSIBLE: Identical locks are + * already removed at + * this point. + */ + default: + LBUG(); + case +1: + list_move_tail(&curr->cill_linkage, + &prev->cill_linkage); + done = 0; + continue; /* don't change prev: it's + * still "previous" */ + case -1: /* already in order */ + break; + } + } + prev = curr; + } + } while (!done); + EXIT; +} + +static void cl_lock_descr_merge(struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + d0->cld_start = min(d0->cld_start, d1->cld_start); + d0->cld_end = max(d0->cld_end, d1->cld_end); + + if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE) + d0->cld_mode = CLM_WRITE; + + if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP) + d0->cld_mode = CLM_GROUP; +} + +static int cl_lockset_merge(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, &set->cls_todo, cill_linkage) { + if (!cl_object_same(scan->cill_descr.cld_obj, need->cld_obj)) + continue; + + /* Merge locks for the same object because ldlm lock server + * may expand the lock extent, otherwise there is a deadlock + * case if two conflicted locks are queueud for the same object + * and lock server expands one lock to overlap the another. + * The side effect is that it can generate a multi-stripe lock + * that may cause casacading problem */ + cl_lock_descr_merge(&scan->cill_descr, need); + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + scan->cill_descr.cld_mode, scan->cill_descr.cld_start, + scan->cill_descr.cld_end); + RETURN(+1); + } + RETURN(0); +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + int result; + + ENTRY; + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + result = cl_lock_request(env, io, &link->cill_lock); + if (result < 0) + break; + + list_move(&link->cill_linkage, &set->cls_done); + } + RETURN(result); +} + +/** + * Takes locks necessary for the current iteration of io. + * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sort locks (to avoid dead-locks), + * and acquire them. + */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + cl_io_locks_sort(io); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Release locks takes by io. 
+ */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + list_del_init(&link->cill_linkage); + if (link->cill_fini != NULL) + link->cill_fini(env, link); + } + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + list_del_init(&link->cill_linkage); + cl_lock_release(env, &link->cill_lock); + if (link->cill_fini != NULL) + link->cill_fini(env, link); + } + + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + EXIT; +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. + * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + result = 0; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; + EXIT; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. + */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + ENTRY; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. */ + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; +} + +/** + * Adds a lock to a lockset. 
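+ *
+ * For illustration only (hypothetical descriptor values, not an existing
+ * call site in this patch), a layer's cio_lock() method typically fills a
+ * cl_lock_descr and hands it over via cl_io_lock_alloc_add():
+ *
+ *	struct cl_lock_descr descr = {
+ *		.cld_obj   = io->ci_obj,
+ *		.cld_mode  = CLM_READ,
+ *		.cld_start = start,
+ *		.cld_end   = end,
+ *	};
+ *
+ *	return cl_io_lock_alloc_add(env, io, &descr);
+ *
+ * If cl_lockset_merge() coalesces the descriptor with one already queued
+ * for the same object, +1 is returned and no new link is added.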
+ */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + ENTRY; + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + ENTRY; + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_state = CIS_IO_GOING; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. + */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } + io->ci_state = CIS_IO_FINISHED; + EXIT; +} +EXPORT_SYMBOL(cl_io_end); + +/** + * Called by read io, to decide the readahead extent + * + * \see cl_io_operations::cio_read_ahead() + */ +int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io, + pgoff_t start, struct cl_read_ahead *ra) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_read_ahead == NULL) + continue; + + result = scan->cis_iop->cio_read_ahead(env, scan, start, ra); + if (result != 0) + break; + } + RETURN(result > 0 ? 0 : result); +} +EXPORT_SYMBOL(cl_io_read_ahead); + +/** + * Commit a list of contiguous pages into writeback cache. + * + * \returns 0 if all pages committed, or errcode if error occurred. 
+ * \see cl_io_operations::cio_commit_async() + */ +int cl_io_commit_async(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) +{ + const struct cl_io_slice *scan; + int result = 0; + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_commit_async == NULL) + continue; + result = scan->cis_iop->cio_commit_async(env, scan, queue, + from, to, cb); + if (result != 0) + break; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_commit_async); + +/** + * Submits a list of pages for immediate io. + * + * After the function gets returned, The submitted pages are moved to + * queue->c2_qout queue, and queue->c2_qin contain both the pages don't need + * to be submitted, and the pages are errant to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_submit == NULL) + continue; + result = scan->cis_iop->cio_submit(env, scan, crt, queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. + */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Submit a sync_io and wait for the IO to be finished, or error happens. + * If \a timeout is zero, it means to wait for the IO unconditionally. + */ +int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout) +{ + struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; + struct cl_page *pg; + int rc; + ENTRY; + + cl_page_list_for_each(pg, &queue->c2_qin) { + LASSERT(pg->cp_sync_io == NULL); + pg->cp_sync_io = anchor; + } + + cl_sync_io_init(anchor, queue->c2_qin.pl_nr, &cl_sync_io_end); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * read found up-to-date pages in the cache, or write found + * clean pages), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(pg, &queue->c2_qin) { + pg->cp_sync_io = NULL; + cl_sync_io_note(env, anchor, 1); + } + + /* wait for the IO to be finished. */ + rc = cl_sync_io_wait(env, anchor, timeout); + cl_page_list_assume(env, io, &queue->c2_qout); + } else { + LASSERT(list_empty(&queue->c2_qout.pl_pages)); + cl_page_list_for_each(pg, &queue->c2_qin) + pg->cp_sync_io = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_io_submit_sync); + +/** + * Cancel an IO which has been submitted by cl_io_submit_rw. + */ +int cl_io_cancel(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue) +{ + struct cl_page *page; + int result = 0; + + CERROR("Canceling ongoing page trasmission\n"); + cl_page_list_for_each(page, queue) { + int rc; + + rc = cl_page_cancel(env, page); + result = result ?: rc; + } + return result; +} + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. 
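+ *
+ * For illustration only (a hypothetical top-level caller, not a call site
+ * from this patch), the expected sequence around this loop is roughly:
+ *
+ *	rc = cl_io_init(env, io, CIT_READ, obj);
+ *	if (rc == 0)
+ *		rc = cl_io_loop(env, io);
+ *	cl_io_fini(env, io);
+ *
+ * where obj is the top cl_object. cl_io_fini() has to be called even when
+ * cl_io_init() fails, see the comment above cl_io_init().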
+ */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + ENTRY; + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + ** - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + } while (result == 0 && io->ci_continue); + + if (result == -EWOULDBLOCK && io->ci_ndelay) { + io->ci_need_restart = 1; + result = 0; + } + + if (result == 0) + result = io->ci_result; + RETURN(result < 0 ? result : 0); +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. + * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + ENTRY; + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. + */ +void cl_page_list_init(struct cl_page_list *plist) +{ + ENTRY; + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + plist->pl_owner = current; + EXIT; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +{ + ENTRY; + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + LINVRNT(plist->pl_owner == current); + + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_get(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. + */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LASSERT(cl_page_is_vmlocked(env, page)); + LINVRNT(plist->pl_owner == current); + + ENTRY; + list_del_init(&page->cp_batch); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. 
+ */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + ENTRY; + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * Moves a page from one page list to the head of another list. + */ +void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + ENTRY; + list_move(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move_head); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +{ + struct cl_page *page; + struct cl_page *tmp; + + LINVRNT(list->pl_owner == current); + LINVRNT(head->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, tmp, list) + cl_page_list_move(head, list, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_splice); + +/** + * Disowns pages in a queue. + */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", + plist); + cl_page_put(env, page); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. + */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Assumes all pages in a queue. + */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + ENTRY; + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. 
+ */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_page_list_add(&queue->c2_qin, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. + */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. + */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top() + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + ENTRY; + while (io->ci_parent != NULL) + io = io->ci_parent; + RETURN(io); +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. + */ +void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct cl_object *scan; + ENTRY; + + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_req_attr_set != NULL) + scan->co_ops->coo_req_attr_set(env, scan, attr); + } + EXIT; +} +EXPORT_SYMBOL(cl_req_attr_set); + +/* cl_sync_io_callback assumes the caller must call cl_sync_io_wait() to + * wait for the IO to finish. */ +void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor) +{ + wake_up_all(&anchor->csi_waitq); + + /* it's safe to nuke or reuse anchor now */ + atomic_set(&anchor->csi_barrier, 0); +} +EXPORT_SYMBOL(cl_sync_io_end); + +/** + * Initialize synchronous io wait anchor + */ +void cl_sync_io_init(struct cl_sync_io *anchor, int nr, + void (*end)(const struct lu_env *, struct cl_sync_io *)) +{ + ENTRY; + memset(anchor, 0, sizeof(*anchor)); + init_waitqueue_head(&anchor->csi_waitq); + atomic_set(&anchor->csi_sync_nr, nr); + atomic_set(&anchor->csi_barrier, nr > 0); + anchor->csi_sync_rc = 0; + anchor->csi_end_io = end; + LASSERT(end != NULL); + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_init); + +/** + * Wait until all IO completes. Transfer completion routine has to call + * cl_sync_io_note() for every entity. 
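+ *
+ * Typical usage (illustrative sketch; cl_io_submit_sync() and
+ * cl_lock_request() in this file are the concrete callers):
+ *
+ *	cl_sync_io_init(anchor, nr, cl_sync_io_end);
+ *	... submit nr transfers, each completion ending up in
+ *	    cl_sync_io_note(env, anchor, ioret) ...
+ *	rc = cl_sync_io_wait(env, anchor, timeout);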
+ */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout) +{ + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + NULL, NULL, NULL); + int rc; + ENTRY; + + LASSERT(timeout >= 0); + + rc = l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + if (rc < 0) { + CERROR("IO failed: %d, still wait for %d remaining entries\n", + rc, atomic_read(&anchor->csi_sync_nr)); + + lwi = (struct l_wait_info) { 0 }; + (void)l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + } else { + rc = anchor->csi_sync_rc; + } + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + + /* wait until cl_sync_io_note() has done wakeup */ + while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) { + cpu_relax(); + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_sync_io_wait); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, + int ioret) +{ + ENTRY; + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. + */ + LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); + if (atomic_dec_and_test(&anchor->csi_sync_nr)) { + LASSERT(anchor->csi_end_io != NULL); + anchor->csi_end_io(env, anchor); + /* Can't access anchor any more */ + } + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_note); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c new file mode 100644 index 0000000000000..30c7186651dba --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c @@ -0,0 +1,291 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Extent Lock. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "cl_internal.h" + +static void cl_lock_trace0(int level, const struct lu_env *env, + const char *prefix, const struct cl_lock *lock, + const char *func, const int line) +{ + struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); + CDEBUG(level, "%s: %p (%p/%d) at %s():%d\n", + prefix, lock, env, h->coh_nesting, func, line); +} +#define cl_lock_trace(level, env, prefix, lock) \ + cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__) + +/** + * Adds lock slice to the compound lock. + * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + ENTRY; + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; + EXIT; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "destroy lock", lock); + + while (!list_empty(&lock->cll_layers)) { + struct cl_lock_slice *slice; + + slice = list_entry(lock->cll_layers.next, + struct cl_lock_slice, cls_linkage); + list_del_init(lock->cll_layers.next); + slice->cls_ops->clo_fini(env, slice); + } + POISON(lock, 0x5a, sizeof(*lock)); + EXIT; +} +EXPORT_SYMBOL(cl_lock_fini); + +int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, + const struct cl_io *io) +{ + struct cl_object *obj = lock->cll_descr.cld_obj; + struct cl_object *scan; + int result = 0; + ENTRY; + + /* Make sure cl_lock::cll_descr is initialized. */ + LASSERT(obj != NULL); + + INIT_LIST_HEAD(&lock->cll_layers); + list_for_each_entry(scan, &obj->co_lu.lo_header->loh_layers, + co_lu.lo_linkage) { + if (scan->co_ops->coo_lock_init != NULL) + result = scan->co_ops->coo_lock_init(env, scan, lock, + io); + + if (result != 0) { + cl_lock_fini(env, lock); + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_init); + +/** + * Returns a slice with a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} +EXPORT_SYMBOL(cl_lock_at); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + + EXIT; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Enqueue a lock. + * \param anchor: if we need to wait for resources before getting the lock, + * use @anchor for the purpose. 
+ * \retval 0 enqueue successfully + * \retval <0 error code + */ +int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock, struct cl_sync_io *anchor) +{ + const struct cl_lock_slice *slice; + int rc = 0; + + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_enqueue == NULL) + continue; + + rc = slice->cls_ops->clo_enqueue(env, slice, io, anchor); + if (rc != 0) + break; + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_enqueue); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. + */ +int cl_lock_request(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock) +{ + struct cl_sync_io *anchor = NULL; + __u32 enq_flags = lock->cll_descr.cld_enq_flags; + int rc; + ENTRY; + + rc = cl_lock_init(env, lock, io); + if (rc < 0) + RETURN(rc); + + if ((enq_flags & CEF_GLIMPSE) && !(enq_flags & CEF_SPECULATIVE)) { + anchor = &cl_env_info(env)->clt_anchor; + cl_sync_io_init(anchor, 1, cl_sync_io_end); + } + + rc = cl_lock_enqueue(env, io, lock, anchor); + + if (anchor != NULL) { + int rc2; + + /* drop the reference count held at initialization time */ + cl_sync_io_note(env, anchor, 0); + rc2 = cl_sync_io_wait(env, anchor, 0); + if (rc2 < 0 && rc == 0) + rc = rc2; + } + + if (rc < 0) + cl_lock_release(env, lock); + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_cancel(env, lock); + cl_lock_fini(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_release); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char * const names[] = { + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + CLASSERT(CLM_MAX == ARRAY_SIZE(names)); + return names[mode]; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. + */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + (*printer)(env, cookie, "lock@%p", lock); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c new file mode 100644 index 0000000000000..5aa59de91b53e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c @@ -0,0 +1,1108 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Object. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +/* + * Locking. + * + * i_mutex + * PG_locked + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +/* class_put_type() */ +#include +#include +#include +#include /* for cfs_hash stuff */ +#include +#include +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; + +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + ENTRY; + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_page_bufsize = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + lu_object_header_fini(&h->coh_lu); +} + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. 
+ * + * \see cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. + * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). + */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_update(). + */ +void cl_object_attr_lock(struct cl_object *o) +__acquires(cl_object_attr_guard(o)) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) +__releases(cl_object_attr_guard(o)) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lu_object_header *top; + int result; + + assert_spin_locked(cl_object_attr_guard(obj)); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_upd_attr() on every layer, bottom + * to top. + */ +int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned v) +{ + struct lu_object_header *top; + int result; + + assert_spin_locked(cl_object_attr_guard(obj)); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_update != NULL) { + result = obj->co_ops->coo_attr_update(env, obj, attr, + v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_update); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. 
+ * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top), + "size: %llu mtime: %llu atime: %llu " + "ctime: %llu blocks: %llu\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(result); +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. + */ +int cl_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_conf_set != NULL) { + result = obj->co_ops->coo_conf_set(env, obj, conf); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_conf_set); + +/** + * Prunes caches of pages and locks for this object. + */ +int cl_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *top; + struct cl_object *o; + int result; + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(o, &top->loh_layers, co_lu.lo_linkage) { + if (o->co_ops->coo_prune != NULL) { + result = o->co_ops->coo_prune(env, o); + if (result != 0) + break; + } + } + + RETURN(result); +} +EXPORT_SYMBOL(cl_object_prune); + +/** + * Get stripe information of this object. + */ +int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *uarg, size_t size) +{ + struct lu_object_header *top; + int result = 0; + ENTRY; + + top = obj->co_lu.lo_header; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_getstripe != NULL) { + result = obj->co_ops->coo_getstripe(env, obj, uarg, + size); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_getstripe); + +/** + * Get fiemap extents from file object. 
+ * + * \param env [in] lustre environment + * \param obj [in] file object + * \param key [in] fiemap request argument + * \param fiemap [out] fiemap extents mapping retrived + * \param buflen [in] max buffer length of @fiemap + * + * \retval 0 success + * \retval < 0 error + */ +int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *key, + struct fiemap *fiemap, size_t *buflen) +{ + struct lu_object_header *top; + int result = 0; + ENTRY; + + top = obj->co_lu.lo_header; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_fiemap != NULL) { + result = obj->co_ops->coo_fiemap(env, obj, key, fiemap, + buflen); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_fiemap); + +int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *cl) +{ + struct lu_object_header *top = obj->co_lu.lo_header; + ENTRY; + + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_layout_get != NULL) + return obj->co_ops->coo_layout_get(env, obj, cl); + } + + RETURN(-EOPNOTSUPP); +} +EXPORT_SYMBOL(cl_object_layout_get); + +loff_t cl_object_maxbytes(struct cl_object *obj) +{ + struct lu_object_header *top = obj->co_lu.lo_header; + loff_t maxbytes = LLONG_MAX; + ENTRY; + + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_maxbytes != NULL) + maxbytes = min_t(loff_t, obj->co_ops->coo_maxbytes(obj), + maxbytes); + } + + RETURN(maxbytes); +} +EXPORT_SYMBOL(cl_object_maxbytes); + +int cl_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + struct lu_object_header *top = obj->co_lu.lo_header; + int rc = 0; + ENTRY; + + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_object_flush) { + rc = obj->co_ops->coo_object_flush(env, obj, lock); + if (rc) + break; + } + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_object_flush); + +/** + * Helper function removing all object locks, and marking object for + * deletion. All object pages must have been deleted at this point. + * + * This is called by cl_inode_fini() and lov_object_delete() to destroy top- + * and sub- objects respectively. + */ +void cl_object_kill(const struct lu_env *env, struct cl_object *obj) +{ + struct cl_object_header *hdr = cl_object_header(obj); + + set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); +} +EXPORT_SYMBOL(cl_object_kill); + +void cache_stats_init(struct cache_stats *cs, const char *name) +{ + int i; + + cs->cs_name = name; + for (i = 0; i < CS_NR; i++) + atomic_set(&cs->cs_stats[i], 0); +} + +static int cache_stats_print(const struct cache_stats *cs, + struct seq_file *m, int h) +{ + int i; + + /* + * lookup hit total cached create + * env: ...... ...... ...... ...... ...... + */ + if (h) { + const char *names[CS_NR] = CS_NAMES; + + seq_printf(m, "%6s", " "); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8s", names[i]); + seq_printf(m, "\n"); + } + + seq_printf(m, "%5.5s:", cs->cs_name); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i])); + return 0; +} + +static void cl_env_percpu_refill(void); + +/** + * Initialize client site. + * + * Perform common initialization (lu_site_init()), and initialize statistical + * counters. Also perform global initializations on the first call. 
+ */ +int cl_site_init(struct cl_site *s, struct cl_device *d) +{ + size_t i; + int result; + + result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); + if (result == 0) { + cache_stats_init(&s->cs_pages, "pages"); + for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) + atomic_set(&s->cs_pages_state[0], 0); + cl_env_percpu_refill(); + } + return result; +} +EXPORT_SYMBOL(cl_site_init); + +/** + * Finalize client site. Dual to cl_site_init(). + */ +void cl_site_fini(struct cl_site *s) +{ + lu_site_fini(&s->cs_lu); +} +EXPORT_SYMBOL(cl_site_fini); + +static struct cache_stats cl_env_stats = { + .cs_name = "envs", + .cs_stats = { ATOMIC_INIT(0), } +}; + +/** + * Outputs client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) +{ + static const char *pstate[] = { + [CPS_CACHED] = "c", + [CPS_OWNED] = "o", + [CPS_PAGEOUT] = "w", + [CPS_PAGEIN] = "r", + [CPS_FREEING] = "f" + }; + size_t i; + +/* + lookup hit total busy create +pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] +locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] + env: ...... ...... ...... ...... ...... + */ + lu_site_stats_seq_print(&site->cs_lu, m); + cache_stats_print(&site->cs_pages, m, 1); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + seq_printf(m, "%s: %u ", pstate[i], + atomic_read(&site->cs_pages_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&cl_env_stats, m, 0); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/** + * The most efficient way is to store cl_env pointer in task specific + * structures. On Linux, it isn't easy to use task_struct->journal_info + * because Lustre code may call into other fs during memory reclaim, which + * has certain assumptions about journal_info. There are not currently any + * fields in task_struct that can be used for this purpose. + * \note As long as we use task_struct to store cl_env, we assume that once + * called into Lustre, we'll never call into the other part of the kernel + * which will use those fields in task_struct without explicitly exiting + * Lustre. + * + * Since there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +static unsigned cl_envs_cached_max = 32; /* XXX: prototype: arbitrary limit + * for now. */ +static struct cl_env_cache { + rwlock_t cec_guard; + unsigned cec_count; + struct list_head cec_envs; +} *cl_envs = NULL; + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. 
+ */ + void *ce_debug; +}; + +static void cl_env_inc(enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_env_stats.cs_stats[item]); +#endif +} + +static void cl_env_dec(enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + LASSERT(atomic_read(&cl_env_stats.cs_stats[item]) > 0); + atomic_dec(&cl_env_stats.cs_stats[item]); +#endif +} + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + cl_env_inc(CS_busy); +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, GFP_NOFS); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + cl_env_inc(CS_create); + cl_env_inc(CS_total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + cl_env_dec(CS_total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static struct lu_env *cl_env_obtain(void *debug) +{ + struct cl_env *cle; + struct lu_env *env; + int cpu = get_cpu(); + + ENTRY; + + read_lock(&cl_envs[cpu].cec_guard); + LASSERT(equi(cl_envs[cpu].cec_count == 0, + list_empty(&cl_envs[cpu].cec_envs))); + if (cl_envs[cpu].cec_count > 0) { + int rc; + + cle = container_of(cl_envs[cpu].cec_envs.next, struct cl_env, + ce_linkage); + list_del_init(&cle->ce_linkage); + cl_envs[cpu].cec_count--; + read_unlock(&cl_envs[cpu].cec_guard); + put_cpu(); + + env = &cle->ce_lu; + rc = lu_env_refill(env); + if (rc == 0) { + cl_env_init0(cle, debug); + lu_context_enter(&env->le_ctx); + lu_context_enter(&cle->ce_ses); + } else { + cl_env_fini(cle); + env = ERR_PTR(rc); + } + } else { + read_unlock(&cl_envs[cpu].cec_guard); + put_cpu(); + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, debug); + } + RETURN(env); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * Allocations are amortized through the global cache of environments. + * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(__u16 *refcheck) +{ + struct lu_env *env; + + env = cl_env_obtain(__builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. 
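+ *
+ * As with cl_env_get(), the environment must be released with cl_env_put()
+ * passing the same refcheck variable. An illustrative pattern (hypothetical
+ * caller):
+ *
+ *	__u16 refcheck;
+ *	struct lu_env *env;
+ *
+ *	env = cl_env_get(&refcheck);
+ *	if (IS_ERR(env))
+ *		return PTR_ERR(env);
+ *	... use env ...
+ *	cl_env_put(env, &refcheck);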
+ * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(__u16 *refcheck, __u32 tags) +{ + struct lu_env *env; + + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Finalizes and frees a given number of cached environments. This is done to + * (1) free some memory (not currently hooked into VM), or (2) release + * references to modules. + */ +unsigned cl_env_cache_purge(unsigned nr) +{ + struct cl_env *cle; + unsigned i; + + ENTRY; + for_each_possible_cpu(i) { + write_lock(&cl_envs[i].cec_guard); + for (; !list_empty(&cl_envs[i].cec_envs) && nr > 0; --nr) { + cle = container_of(cl_envs[i].cec_envs.next, + struct cl_env, ce_linkage); + list_del_init(&cle->ce_linkage); + LASSERT(cl_envs[i].cec_count > 0); + cl_envs[i].cec_count--; + write_unlock(&cl_envs[i].cec_guard); + + cl_env_fini(cle); + write_lock(&cl_envs[i].cec_guard); + } + LASSERT(equi(cl_envs[i].cec_count == 0, + list_empty(&cl_envs[i].cec_envs))); + write_unlock(&cl_envs[i].cec_guard); + } + RETURN(nr); +} +EXPORT_SYMBOL(cl_env_cache_purge); + +/** + * Release an environment. + * + * Decrement \a env reference counter. When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, __u16 *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + int cpu = get_cpu(); + + cl_env_dec(CS_busy); + cle->ce_debug = NULL; + cl_env_exit(cle); + /* + * Don't bother to take a lock here. + * + * Return environment to the cache only when it was allocated + * with the standard tags. + */ + if (cl_envs[cpu].cec_count < cl_envs_cached_max && + (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == LCT_CL_THREAD && + (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == LCT_SESSION) { + read_lock(&cl_envs[cpu].cec_guard); + list_add(&cle->ce_linkage, &cl_envs[cpu].cec_envs); + cl_envs[cpu].cec_count++; + read_unlock(&cl_envs[cpu].cec_guard); + } else + cl_env_fini(cle); + put_cpu(); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Converts struct cl_attr to struct ost_lvb. + * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; +} + +/** + * Converts struct ost_lvb to struct cl_attr. 
+ * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; +} +EXPORT_SYMBOL(cl_lvb2attr); + +static struct cl_env cl_env_percpu[NR_CPUS]; + +static int cl_env_percpu_init(void) +{ + struct cl_env *cle; + int tags = LCT_REMEMBER | LCT_NOREF; + int i, j; + int rc = 0; + + for_each_possible_cpu(i) { + struct lu_env *env; + + rwlock_init(&cl_envs[i].cec_guard); + INIT_LIST_HEAD(&cl_envs[i].cec_envs); + cl_envs[i].cec_count = 0; + + cle = &cl_env_percpu[i]; + env = &cle->ce_lu; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + rc = lu_env_init(env, LCT_CL_THREAD | tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, LCT_SESSION | tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + } else { + lu_env_fini(env); + } + } + if (rc != 0) + break; + } + if (rc != 0) { + /* Indices 0 to i (excluding i) were correctly initialized, + * thus we must uninitialize up to i, the rest are undefined. */ + for (j = 0; j < i; j++) { + cle = &cl_env_percpu[j]; + lu_context_exit(&cle->ce_ses); + lu_context_fini(&cle->ce_ses); + lu_env_fini(&cle->ce_lu); + } + } + + return rc; +} + +static void cl_env_percpu_fini(void) +{ + int i; + + for_each_possible_cpu(i) { + struct cl_env *cle = &cl_env_percpu[i]; + + lu_context_exit(&cle->ce_ses); + lu_context_fini(&cle->ce_ses); + lu_env_fini(&cle->ce_lu); + } +} + +static void cl_env_percpu_refill(void) +{ + int i; + + for_each_possible_cpu(i) + lu_env_refill(&cl_env_percpu[i].ce_lu); +} + +void cl_env_percpu_put(struct lu_env *env) +{ + struct cl_env *cle; + int cpu; + + cpu = smp_processor_id(); + cle = cl_env_container(env); + LASSERT(cle == &cl_env_percpu[cpu]); + + cle->ce_ref--; + LASSERT(cle->ce_ref == 0); + + cl_env_dec(CS_busy); + cle->ce_debug = NULL; + + put_cpu(); +} +EXPORT_SYMBOL(cl_env_percpu_put); + +struct lu_env *cl_env_percpu_get() +{ + struct cl_env *cle; + + cle = &cl_env_percpu[get_cpu()]; + cl_env_init0(cle, __builtin_return_address(0)); + + return &cle->ce_lu; +} +EXPORT_SYMBOL(cl_env_percpu_get); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. + * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). 
+ */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl, struct cl_thread_info); + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof (struct cl_env) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. + * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + OBD_ALLOC(cl_envs, sizeof(*cl_envs) * num_possible_cpus()); + if (cl_envs == NULL) + GOTO(out, result = -ENOMEM); + + result = lu_kmem_init(cl_object_caches); + if (result) + GOTO(out_envs, result); + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + GOTO(out_kmem, result); + + result = cl_env_percpu_init(); + if (result) /* no cl_env_percpu_fini on error */ + GOTO(out_keys, result); + + return 0; + +out_keys: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_envs: + OBD_FREE(cl_envs, sizeof(*cl_envs) * num_possible_cpus()); +out: + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). + */ +void cl_global_fini(void) +{ + cl_env_percpu_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + OBD_FREE(cl_envs, sizeof(*cl_envs) * num_possible_cpus()); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c new file mode 100644 index 0000000000000..a1b1e130f31c6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c @@ -0,0 +1,1187 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Page. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg); + +#ifdef LIBCFS_DEBUG +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) +#else /* !LIBCFS_DEBUG */ +# define PASSERT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) +#endif /* !LIBCFS_DEBUG */ + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +# define PINVRNT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) +#endif /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +/* Disable page statistic by default due to huge performance penalty. */ +static void cs_page_inc(const struct cl_object *obj, + enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_object_site(obj)->cs_pages.cs_stats[item]); +#endif +} + +static void cs_page_dec(const struct cl_object *obj, + enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_dec(&cl_object_site(obj)->cs_pages.cs_stats[item]); +#endif +} + +static void cs_pagestate_inc(const struct cl_object *obj, + enum cl_page_state state) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_object_site(obj)->cs_pages_state[state]); +#endif +} + +static void cs_pagestate_dec(const struct cl_object *obj, + enum cl_page_state state) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_dec(&cl_object_site(obj)->cs_pages_state[state]); +#endif +} + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. It can be called only if concurrent page + * reclamation is somehow prevented, e.g., by keeping a lock on a VM page, + * associated with \a page. + * + * Use with care! Not exported. + */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +/** + * Returns a slice within a page, corresponding to the given layer in the + * device stack. 
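+ *
+ * Callers normally go through the exported wrapper cl_page_at(); a minimal
+ * sketch, where 'my_device_type' is a hypothetical layer type:
+ *
+ *   const struct cl_page_slice *slice;
+ *
+ *   slice = cl_page_at(page, &my_device_type);
+ *   // slice is NULL when the page carries no state for that layer;
+ *   // otherwise the layer-private page state hangs off the slice.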
+ * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + ENTRY; + + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} + +static void cl_page_free(const struct lu_env *env, struct cl_page *page, + struct pagevec *pvec) +{ + struct cl_object *obj = page->cp_obj; + int pagesize = cl_object_header(obj)->coh_page_bufsize; + + PASSERT(env, page, list_empty(&page->cp_batch)); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, page->cp_state == CPS_FREEING); + + ENTRY; + while (!list_empty(&page->cp_layers)) { + struct cl_page_slice *slice; + + slice = list_entry(page->cp_layers.next, + struct cl_page_slice, cpl_linkage); + list_del_init(page->cp_layers.next); + if (unlikely(slice->cpl_ops->cpo_fini != NULL)) + slice->cpl_ops->cpo_fini(env, slice, pvec); + } + cs_page_dec(obj, CS_total); + cs_pagestate_dec(obj, page->cp_state); + lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page); + cl_object_put(env, obj); + lu_ref_fini(&page->cp_reference); + OBD_FREE(page, pagesize); + EXIT; +} + +/** + * Helper function updating page state. This is the only place in the code + * where cl_page::cp_state field is mutated. + */ +static inline void cl_page_state_set_trust(struct cl_page *page, + enum cl_page_state state) +{ + /* bypass const. */ + *(enum cl_page_state *)&page->cp_state = state; +} + +struct cl_page *cl_page_alloc(const struct lu_env *env, + struct cl_object *o, pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page; + struct lu_object_header *head; + + ENTRY; + OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize, + GFP_NOFS); + if (page != NULL) { + int result = 0; + atomic_set(&page->cp_ref, 1); + page->cp_obj = o; + cl_object_get(o); + lu_object_ref_add_at(&o->co_lu, &page->cp_obj_ref, "cl_page", + page); + page->cp_vmpage = vmpage; + cl_page_state_set_trust(page, CPS_CACHED); + page->cp_type = type; + INIT_LIST_HEAD(&page->cp_layers); + INIT_LIST_HEAD(&page->cp_batch); + lu_ref_init(&page->cp_reference); + head = o->co_lu.lo_header; + list_for_each_entry(o, &head->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, page, + ind); + if (result != 0) { + cl_page_delete0(env, page); + cl_page_free(env, page, NULL); + page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + cs_page_inc(o, CS_total); + cs_page_inc(o, CS_create); + cs_pagestate_dec(o, CPS_CACHED); + } + } else { + page = ERR_PTR(-ENOMEM); + } + RETURN(page); +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. 
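+ *
+ * A minimal lookup-or-create sketch, assuming the caller holds the locked
+ * VM page and a valid environment; error handling is elided:
+ *
+ *   struct cl_page *page;
+ *
+ *   page = cl_page_find(env, obj, idx, vmpage, CPT_CACHEABLE);
+ *   if (!IS_ERR(page)) {
+ *           // ... use the page under the chosen IO ...
+ *           cl_page_put(env, page);  // drop the reference taken above
+ *   }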
+ * + * \see cl_object_find(), cl_lock_find() + */ +struct cl_page *cl_page_find(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page = NULL; + struct cl_object_header *hdr; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + ENTRY; + + hdr = cl_object_header(o); + cs_page_inc(o, CS_lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. */ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + KLASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + if (page != NULL) { + cs_page_inc(o, CS_hit); + RETURN(page); + } + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + RETURN(page); +} +EXPORT_SYMBOL(cl_page_find); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + return cl_page_in_use_noref(pg); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. + */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + ENTRY; + old = page->cp_state; + PASSERT(env, page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state); + PASSERT(env, page, page->cp_state == old); + PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner != NULL)); + + cs_pagestate_dec(page->cp_obj, page->cp_state); + cs_pagestate_inc(page->cp_obj, state); + cl_page_state_set_trust(page, state); + EXIT; +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + ENTRY; + cl_page_get_trust(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page, use the pagevec to release the pages + * in batch if provided. + * + * Users need to do a final pagevec_release() to release any trailing pages. 
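+ *
+ * An illustrative batched-release sketch; note that the pagevec_init()
+ * arity differs between kernel versions, so the call below is an
+ * assumption rather than a fixed prototype:
+ *
+ *   struct pagevec pvec;
+ *
+ *   pagevec_init(&pvec);
+ *   cl_pagevec_put(env, page0, &pvec);
+ *   cl_pagevec_put(env, page1, &pvec);
+ *   pagevec_release(&pvec);  // final release of any trailing VM pages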
+ */ +void cl_pagevec_put(const struct lu_env *env, struct cl_page *page, + struct pagevec *pvec) +{ + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. + */ + cl_page_free(env, page, pvec); + } + + EXIT; +} +EXPORT_SYMBOL(cl_pagevec_put); + +/** + * Releases a reference to a page, wrapper to cl_pagevec_put + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + cl_pagevec_put(env, page, NULL); +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. + */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *page; + + ENTRY; + KLASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + page = (struct cl_page *)vmpage->private; + if (page != NULL) { + cl_page_get_trust(page); + LASSERT(page->cp_type == CPT_CACHEABLE); + } + RETURN(page); +} +EXPORT_SYMBOL(cl_vmpage_page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +static void cl_page_owner_clear(struct cl_page *page) +{ + ENTRY; + if (page->cp_owner != NULL) { + LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + } + EXIT; +} + +static void cl_page_owner_set(struct cl_page *page) +{ + ENTRY; + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + EXIT; +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + enum cl_page_state state; + + ENTRY; + state = pg->cp_state; + PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); + PINVRNT(env, pg, cl_page_invariant(pg) || state == CPS_FREEING); + cl_page_owner_clear(pg); + + if (state == CPS_OWNED) + cl_page_state_set(env, pg, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + list_for_each_entry_reverse(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_disown != NULL) + (*slice->cpl_ops->cpo_disown)(env, slice, io); + } + + EXIT; +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + struct cl_io *top = cl_io_top((struct cl_io *)io); + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + ENTRY; + RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == top); +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. 
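+ *
+ * An illustrative ownership round trip, assuming the surrounding IO is
+ * already set up; error handling is elided:
+ *
+ *   if (cl_page_own(env, io, pg) == 0) {    // may block until CPS_CACHED
+ *           // page is now CPS_OWNED and private to this IO
+ *           cl_page_disown(env, io, pg);    // back to CPS_CACHED
+ *   }
+ *
+ * cl_page_own_try() follows the same pattern but fails instead of blocking
+ * when the page is owned elsewhere.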
+ * + * \pre !cl_page_is_owned(pg, io) + * \post result == 0 iff cl_page_is_owned(pg, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. + * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, int nonblock) +{ + int result = 0; + const struct cl_page_slice *slice; + + PINVRNT(env, pg, !cl_page_is_owned(pg, io)); + + ENTRY; + io = cl_io_top(io); + + if (pg->cp_state == CPS_FREEING) { + result = -ENOENT; + goto out; + } + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_own) + result = (*slice->cpl_ops->cpo_own)(env, slice, + io, nonblock); + + if (result != 0) + break; + + } + if (result > 0) + result = 0; + + if (result == 0) { + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = cl_io_top(io); + cl_page_owner_set(pg); + if (pg->cp_state != CPS_FREEING) { + cl_page_state_set(env, pg, CPS_OWNED); + } else { + cl_page_disown0(env, io, pg); + result = -ENOENT; + } + } + +out: + PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg))); + RETURN(result); +} + +/** + * Own a page, might be blocked. + * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. + * + * \pre !cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj)); + + ENTRY; + io = cl_io_top(io); + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_assume != NULL) + (*slice->cpl_ops->cpo_assume)(env, slice, io); + } + + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = cl_io_top(io); + cl_page_owner_set(pg); + cl_page_state_set(env, pg, CPS_OWNED); + EXIT; +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the + * underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + io = cl_io_top(io); + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, CPS_CACHED); + + list_for_each_entry_reverse(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_unassume != NULL) + (*slice->cpl_ops->cpo_unassume)(env, slice, io); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. 
+ * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_own() + * \see cl_page_operations::cpo_disown() + */ +void cl_page_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io) || + pg->cp_state == CPS_FREEING); + + ENTRY; + io = cl_io_top(io); + cl_page_disown0(env, io, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_disown); + +/** + * Called when page is to be removed from the object, e.g., as a result of + * truncate. + * + * Calls cl_page_operations::cpo_discard() top-to-bottom. + * + * \pre cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_discard() + */ +void cl_page_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_discard != NULL) + (*slice->cpl_ops->cpo_discard)(env, slice, io); + } +} +EXPORT_SYMBOL(cl_page_discard); + +/** + * Version of cl_page_delete() that can be called for not fully constructed + * pages, e.g. in an error handling cl_page_find()->cl_page_delete0() + * path. Doesn't check page invariant. + */ +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + + ENTRY; + + PASSERT(env, pg, pg->cp_state != CPS_FREEING); + + /* + * Severe all ways to obtain new pointers to @pg. + */ + cl_page_owner_clear(pg); + cl_page_state_set0(env, pg, CPS_FREEING); + + list_for_each_entry_reverse(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_delete != NULL) + (*slice->cpl_ops->cpo_delete)(env, slice); + } + + EXIT; +} + +/** + * Called when a decision is made to throw page out of memory. + * + * Notifies all layers about page destruction by calling + * cl_page_operations::cpo_delete() method top-to-bottom. + * + * Moves page into cl_page_state::CPS_FREEING state (this is the only place + * where transition to this state happens). + * + * Eliminates all venues through which new references to the page can be + * obtained: + * + * - removes page from the radix trees, + * + * - breaks linkage from VM page to cl_page. + * + * Once page reaches cl_page_state::CPS_FREEING, all remaining references will + * drain after some time, at which point page will be recycled. + * + * \pre VM page is locked + * \post pg->cp_state == CPS_FREEING + * + * \see cl_page_operations::cpo_delete() + */ +void cl_page_delete(const struct lu_env *env, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + ENTRY; + cl_page_delete0(env, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_delete); + +/** + * Marks page up-to-date. + * + * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The + * layer responsible for VM interaction has to mark/clear page as up-to-date + * by the \a uptodate argument. + * + * \see cl_page_operations::cpo_export() + */ +void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_page_invariant(pg)); + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_export != NULL) + (*slice->cpl_ops->cpo_export)(env, slice, uptodate); + } +} +EXPORT_SYMBOL(cl_page_export); + +/** + * Returns true, iff \a pg is VM locked in a suitable sense by the calling + * thread. 
+ */ +int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) +{ + const struct cl_page_slice *slice; + int result; + + ENTRY; + slice = container_of(pg->cp_layers.next, + const struct cl_page_slice, cpl_linkage); + PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL); + /* + * Call ->cpo_is_vmlocked() directly instead of going through + * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by + * cl_page_invariant(). + */ + result = slice->cpl_ops->cpo_is_vmlocked(env, slice); + PASSERT(env, pg, result == -EBUSY || result == -ENODATA); + RETURN(result == -EBUSY); +} +EXPORT_SYMBOL(cl_page_is_vmlocked); + +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to) +{ + const struct cl_page_slice *slice; + + ENTRY; + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_page_touch != NULL) + (*slice->cpl_ops->cpo_page_touch)(env, slice, to); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_touch); + +static enum cl_page_state cl_req_type_state(enum cl_req_type crt) +{ + ENTRY; + RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN); +} + +static void cl_page_io_start(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt) +{ + /* + * Page is queued for IO, change its state. + */ + ENTRY; + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, cl_req_type_state(crt)); + EXIT; +} + +/** + * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is + * called top-to-bottom. Every layer either agrees to submit this page (by + * returning 0), or requests to omit this page (by returning -EALREADY). Layer + * handling interactions with the VM also has to inform VM that page is under + * transfer now. + */ +int cl_page_prep(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + const struct cl_page_slice *slice; + int result = 0; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + PINVRNT(env, pg, crt < CRT_NR); + + /* + * XXX this has to be called bottom-to-top, so that llite can set up + * PG_writeback without risking other layers deciding to skip this + * page. + */ + if (crt >= CRT_NR) + return -EINVAL; + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_own) + result = (*slice->cpl_ops->io[crt].cpo_prep)(env, + slice, + io); + + if (result != 0) + break; + + } + + if (result >= 0) { + result = 0; + cl_page_io_start(env, pg, crt); + } + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_prep); + +/** + * Notify layers about transfer completion. + * + * Invoked by transfer sub-system (which is a part of osc) to notify layers + * that a transfer, of which this page is a part of has completed. + * + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for the VFS/VM interaction runs last + * and can release locks safely. 
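+ *
+ * An illustrative write-out sequence; the actual RPC submission performed
+ * by the transfer engine is abstracted away and error handling is elided:
+ *
+ *   cl_page_own(env, io, pg);                       // CPS_CACHED  -> CPS_OWNED
+ *   cl_page_prep(env, io, pg, CRT_WRITE);           // CPS_OWNED   -> CPS_PAGEOUT
+ *   // ... transfer engine sends the data ...
+ *   cl_page_completion(env, pg, CRT_WRITE, ioret);  // CPS_PAGEOUT -> CPS_CACHED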
+ * + * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * \post pg->cp_state == CPS_CACHED + * + * \see cl_page_operations::cpo_completion() + */ +void cl_page_completion(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret) +{ + const struct cl_page_slice *slice; + struct cl_sync_io *anchor = pg->cp_sync_io; + + PASSERT(env, pg, crt < CRT_NR); + PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt)); + + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret); + cl_page_state_set(env, pg, CPS_CACHED); + if (crt >= CRT_NR) + return; + + list_for_each_entry_reverse(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->io[crt].cpo_completion != NULL) + (*slice->cpl_ops->io[crt].cpo_completion)(env, slice, + ioret); + } + + if (anchor != NULL) { + LASSERT(pg->cp_sync_io == anchor); + pg->cp_sync_io = NULL; + cl_sync_io_note(env, anchor, ioret); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_completion); + +/** + * Notify layers that transfer formation engine decided to yank this page from + * the cache and to make it a part of a transfer. + * + * \pre pg->cp_state == CPS_CACHED + * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * + * \see cl_page_operations::cpo_make_ready() + */ +int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt) +{ + const struct cl_page_slice *sli; + int result = 0; + + PINVRNT(env, pg, crt < CRT_NR); + + ENTRY; + if (crt >= CRT_NR) + RETURN(-EINVAL); + + list_for_each_entry(sli, &pg->cp_layers, cpl_linkage) { + if (sli->cpl_ops->io[crt].cpo_make_ready != NULL) + result = (*sli->cpl_ops->io[crt].cpo_make_ready)(env, + sli); + if (result != 0) + break; + } + + if (result >= 0) { + result = 0; + PASSERT(env, pg, pg->cp_state == CPS_CACHED); + cl_page_io_start(env, pg, crt); + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_make_ready); + +/** + * Called if a pge is being written back by kernel's intention. + * + * \pre cl_page_is_owned(pg, io) + * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT) + * + * \see cl_page_operations::cpo_flush() + */ +int cl_page_flush(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + const struct cl_page_slice *slice; + int result = 0; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_flush != NULL) + result = (*slice->cpl_ops->cpo_flush)(env, slice, io); + if (result != 0) + break; + } + if (result > 0) + result = 0; + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_flush); + +/** + * Tells transfer engine that only part of a page is to be transmitted. + * + * \see cl_page_operations::cpo_clip() + */ +void cl_page_clip(const struct lu_env *env, struct cl_page *pg, + int from, int to) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_page_invariant(pg)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to); + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_clip != NULL) + (*slice->cpl_ops->cpo_clip)(env, slice, from, to); + } +} +EXPORT_SYMBOL(cl_page_clip); + +/** + * Prints human readable representation of \a pg to the \a f. 
+ */ +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + (*printer)(env, cookie, + "page@%p[%d %p %d %d %p]\n", + pg, atomic_read(&pg->cp_ref), pg->cp_obj, + pg->cp_state, pg->cp_type, + pg->cp_owner); +} +EXPORT_SYMBOL(cl_page_header_print); + +/** + * Prints human readable representation of \a pg to the \a f. + */ +void cl_page_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + const struct cl_page_slice *slice; + int result = 0; + + cl_page_header_print(env, cookie, printer, pg); + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_print != NULL) + result = (*slice->cpl_ops->cpo_print)(env, slice, + cookie, printer); + if (result != 0) + break; + } + (*printer)(env, cookie, "end page@%p\n", pg); +} +EXPORT_SYMBOL(cl_page_print); + +/** + * Cancel a page which is still in a transfer. + */ +int cl_page_cancel(const struct lu_env *env, struct cl_page *page) +{ + const struct cl_page_slice *slice; + int result = 0; + + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_cancel != NULL) + result = (*slice->cpl_ops->cpo_cancel)(env, slice); + if (result != 0) + break; + } + if (result > 0) + result = 0; + + return result; +} + +/** + * Converts a byte offset within object \a obj into a page index. + */ +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx) +{ + return (loff_t)idx << PAGE_SHIFT; +} +EXPORT_SYMBOL(cl_offset); + +/** + * Converts a page index into a byte offset within object \a obj. + */ +pgoff_t cl_index(const struct cl_object *obj, loff_t offset) +{ + return offset >> PAGE_SHIFT; +} +EXPORT_SYMBOL(cl_index); + +size_t cl_page_size(const struct cl_object *obj) +{ + return 1UL << PAGE_SHIFT; +} +EXPORT_SYMBOL(cl_page_size); + +/** + * Adds page slice to the compound page. + * + * This is called by cl_object_operations::coo_page_init() methods to add a + * per-layer state to the page. New state is added at the end of + * cl_page::cp_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() + */ +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, pgoff_t index, + const struct cl_page_operations *ops) +{ + ENTRY; + list_add_tail(&slice->cpl_linkage, &page->cp_layers); + slice->cpl_obj = obj; + slice->cpl_index = index; + slice->cpl_ops = ops; + slice->cpl_page = page; + EXIT; +} +EXPORT_SYMBOL(cl_page_slice_add); + +/** + * Allocate and initialize cl_cache, called by ll_init_sbi(). 
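+ *
+ * An illustrative lifecycle sketch; 'lru_pages' is an assumed value and
+ * error handling is elided:
+ *
+ *   struct cl_client_cache *cache;
+ *
+ *   cache = cl_cache_init(lru_pages);  // reference count starts at 1
+ *   cl_cache_incref(cache);            // another user (e.g. osc) takes a ref
+ *   ...
+ *   cl_cache_decref(cache);
+ *   cl_cache_decref(cache);            // last reference frees the cache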
+ */ +struct cl_client_cache *cl_cache_init(unsigned long lru_page_max) +{ + struct cl_client_cache *cache = NULL; + + ENTRY; + OBD_ALLOC(cache, sizeof(*cache)); + if (cache == NULL) + RETURN(NULL); + + /* Initialize cache data */ + atomic_set(&cache->ccc_users, 1); + cache->ccc_lru_max = lru_page_max; + atomic_long_set(&cache->ccc_lru_left, lru_page_max); + spin_lock_init(&cache->ccc_lru_lock); + INIT_LIST_HEAD(&cache->ccc_lru); + + /* turn unstable check off by default as it impacts performance */ + cache->ccc_unstable_check = 0; + atomic_long_set(&cache->ccc_unstable_nr, 0); + init_waitqueue_head(&cache->ccc_unstable_waitq); + + RETURN(cache); +} +EXPORT_SYMBOL(cl_cache_init); + +/** + * Increase cl_cache refcount + */ +void cl_cache_incref(struct cl_client_cache *cache) +{ + atomic_inc(&cache->ccc_users); +} +EXPORT_SYMBOL(cl_cache_incref); + +/** + * Decrease cl_cache refcount and free the cache if refcount=0. + * Since llite, lov and osc all hold cl_cache refcount, + * the free will not cause race. (LU-6173) + */ +void cl_cache_decref(struct cl_client_cache *cache) +{ + if (atomic_dec_and_test(&cache->ccc_users)) + OBD_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(cl_cache_decref); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c new file mode 100644 index 0000000000000..3cf9b86b2835a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -0,0 +1,830 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +#endif /* HAVE_SERVER_SUPPORT */ +#include +#include "llog_internal.h" + +#ifdef CONFIG_PROC_FS +static __u64 obd_max_alloc; +#else +__u64 obd_max_alloc; +#endif + +static DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. 
*/ +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_lbug_on_eviction; +EXPORT_SYMBOL(obd_lbug_on_eviction); +unsigned long obd_max_dirty_pages; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_long_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* bulk transfer timeout, give up after 100s by default */ +unsigned int bulk_timeout = 100; /* seconds */ +EXPORT_SYMBOL(bulk_timeout); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +#ifdef CONFIG_PROC_FS +struct lprocfs_stats *obd_memory = NULL; +EXPORT_SYMBOL(obd_memory); +#endif + +static int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + ENTRY; + if (!len || !name) { + CERROR("No name passed,!\n"); + GOTO(out, rc = -EINVAL); + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + RETURN(rc); +} + +#define OBD_MAX_IOCTL_BUFFER 8192 + +static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > BIT(30)) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen1 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen2 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen3 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen4 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + + if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + + if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + + 
if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + + return 0; +} + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void __user *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int offset = 0; + + ENTRY; + if (copy_from_user(&hdr, arg, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. LU-66 + */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (!*buf) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + if (copy_from_user(*buf, arg, hdr.ioc_len)) { + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EFAULT); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + + RETURN(0); +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + char *buf = NULL; + struct obd_ioctl_data *data; + struct libcfs_debug_ioctl_data *debug_data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + ENTRY; + + /* only for debugging */ + if (cmd == LIBCFS_IOC_DEBUG_MASK) { + debug_data = (struct libcfs_debug_ioctl_data*)arg; + libcfs_subsystem_debug = debug_data->subs; + libcfs_debug = debug_data->debug; + return 0; + } + + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&buf, &len, (void __user *)arg)) { + CERROR("OBD ioctl: data error\n"); + RETURN(-EINVAL); + } + data = (struct obd_ioctl_data *)buf; + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + GOTO(out, err = -EINVAL); + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) + GOTO(out, err = -ENOMEM); + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + GOTO(out, err); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + case OBD_GET_VERSION: { + static bool warned; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + + if 
(strlen(LUSTRE_VERSION_STRING) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + if (!warned) { + warned = true; + CWARN("%s: ioctl(OBD_GET_VERSION) is deprecated, " + "use llapi_get_version_string() and/or relink\n", + current->comm); + } + memcpy(data->ioc_bulk, LUSTRE_VERSION_STRING, + strlen(LUSTRE_VERSION_STRING) + 1); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + GOTO(out, err); + } +#endif + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. + */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) + GOTO(out, err = -EINVAL); + + if (copy_to_user((void __user *)arg, data, sizeof(*data))) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. + */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + if (copy_to_user((void __user *)arg, data, sizeof(*data))) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + obd = class_num2obd(index); + if (!obd) + GOTO(out, err = -ENOENT); + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + + GOTO(out, err); + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) + GOTO(out, err = -EINVAL); + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) + GOTO(out, err = -EINVAL); + obd = class_name2obd(data->ioc_inlbuf4); + } else if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + GOTO(out, err = -EINVAL); + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + + switch(cmd) { + case OBD_IOC_NO_TRANSNO: { + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + GOTO(out, err = -ENODEV); + } + CDEBUG(D_HA, "%s: disabling committed-transno notification\n", + obd->obd_name); + 
obd->obd_no_transno = 1; + GOTO(out, err = 0); + } + + default: { + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + GOTO(out, err); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + GOTO(out, err); + } + } + +out: + OBD_FREE_LARGE(buf, len); + RETURN(err); +} /* class_handle_ioctl */ + +/* opening /dev/obd */ +static int obd_class_open(struct inode * inode, struct file * file) +{ + ENTRY; + try_module_get(THIS_MODULE); + RETURN(0); +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + ENTRY; + + module_put(THIS_MODULE); + RETURN(0); +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + ENTRY; + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + RETURN(err = -EACCES); + + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +struct miscdevice obd_psdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + +static int obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CWARN("u64 hex wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + return -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val & 255); + return -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div(%#llx,256) %llu != %llu\n", + u64val, div64val, u64val >> 8); + return -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CWARN("u64 hex wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%llu", u64val); + if (len != 20) { + CWARN("u64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%lld", u64val); + if (len != 2) { + CWARN("s64 wrong length! 
strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~PAGE_MASK) >= PAGE_SIZE) { + CWARN("mask failed: u64val %llu >= %llu\n", u64val, + (__u64)PAGE_SIZE); + ret = -EINVAL; + } + + return ret; +} + +static int __init obdclass_init(void) +{ + int err; + + LCONSOLE_INFO("Lustre: Build Version: "LUSTRE_VERSION_STRING"\n"); + + libcfs_kkuc_init(); + + err = obd_init_checks(); + if (err == -EOVERFLOW) + return err; + +#ifdef CONFIG_PROC_FS + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + return -ENOMEM; + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); +#endif + err = obd_zombie_impexp_init(); + if (err) + goto cleanup_obd_memory; + + err = class_handle_init(); + if (err) + goto cleanup_zombie_impexp; + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register OBD miscdevice: err = %d\n", err); + goto cleanup_class_handle; + } + + /* Default the dirty page cache cap to 1/2 of system memory. + * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). */ + if (cfs_totalram_pages() <= 512 << (20 - PAGE_SHIFT)) + obd_max_dirty_pages = cfs_totalram_pages() / 4; + else + obd_max_dirty_pages = cfs_totalram_pages() / 2; + + err = obd_init_caches(); + if (err) + goto cleanup_deregister; + + err = class_procfs_init(); + if (err) + goto cleanup_caches; + + err = lu_global_init(); + if (err) + goto cleanup_class_procfs; + + err = cl_global_init(); + if (err != 0) + goto cleanup_lu_global; + +#ifdef HAVE_SERVER_SUPPORT + err = dt_global_init(); + if (err != 0) + goto cleanup_cl_global; + + err = lu_ucred_global_init(); + if (err != 0) + goto cleanup_dt_global; +#endif /* HAVE_SERVER_SUPPORT */ + + err = llog_info_init(); + if (err) +#ifdef HAVE_SERVER_SUPPORT + goto cleanup_lu_ucred_global; +#else /* !HAVE_SERVER_SUPPORT */ + goto cleanup_cl_global; +#endif /* HAVE_SERVER_SUPPORT */ + + err = lustre_register_fs(); + + /* simulate a late OOM situation now to require all + * alloc'ed/initialized resources to be freed */ + if (OBD_FAIL_CHECK(OBD_FAIL_OBDCLASS_MODULE_LOAD)) { + /* fake error but filesystem has been registered */ + lustre_unregister_fs(); + /* force error to ensure module will be unloaded/cleaned */ + err = -ENOMEM; + } + + if (err) + goto cleanup_llog_info; + + return 0; + +cleanup_llog_info: + llog_info_fini(); + +#ifdef HAVE_SERVER_SUPPORT +cleanup_lu_ucred_global: + lu_ucred_global_fini(); + +cleanup_dt_global: + dt_global_fini(); +#endif /* HAVE_SERVER_SUPPORT */ + +cleanup_cl_global: + cl_global_fini(); + +cleanup_lu_global: + lu_global_fini(); + +cleanup_class_procfs: + class_procfs_clean(); + +cleanup_caches: + obd_cleanup_caches(); + +cleanup_deregister: + misc_deregister(&obd_psdev); + +cleanup_class_handle: + class_handle_cleanup(); + +cleanup_zombie_impexp: + obd_zombie_impexp_stop(); + +cleanup_obd_memory: +#ifdef CONFIG_PROC_FS + lprocfs_free_stats(&obd_memory); +#endif + + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max; + + max = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max > obd_max_alloc) + obd_max_alloc = max; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#ifdef CONFIG_PROC_FS +__u64 obd_memory_max(void) +{ + __u64 ret; + + obd_update_maxusage(); + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + 
return ret; +} +#endif /* CONFIG_PROC_FS */ + +static void __exit obdclass_exit(void) +{ +#ifdef CONFIG_PROC_FS + __u64 memory_leaked; + __u64 memory_max; +#endif /* CONFIG_PROC_FS */ + ENTRY; + + lustre_unregister_fs(); + + misc_deregister(&obd_psdev); + llog_info_fini(); +#ifdef HAVE_SERVER_SUPPORT + lu_ucred_global_fini(); + dt_global_fini(); +#endif /* HAVE_SERVER_SUPPORT */ + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_del_uuid(NULL); /* Delete all UUIDs. */ + obd_zombie_impexp_stop(); + +#ifdef CONFIG_PROC_FS + memory_leaked = obd_memory_sum(); + memory_max = obd_memory_max(); + + lprocfs_free_stats(&obd_memory); + CDEBUG((memory_leaked) ? D_ERROR : D_INFO, + "obd_memory max: %llu, leaked: %llu\n", + memory_max, memory_leaked); +#endif /* CONFIG_PROC_FS */ + + EXIT; +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Class Driver"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(obdclass_init); +module_exit(obdclass_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/debug.c b/drivers/staging/lustrefsx/lustre/obdclass/debug.c new file mode 100644 index 0000000000000..bfa1bad3dcb4a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/debug.c @@ -0,0 +1,106 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. + */ + +#define DEBUG_SUBSYSTEM D_OTHER + + +#include +#include +#include + +void dump_lniobuf(struct niobuf_local *nb) +{ + CDEBUG(D_RPCTRACE, + "niobuf_local: file_offset=%lld, len=%d, page=%p, rc=%d\n", + nb->lnb_file_offset, nb->lnb_len, nb->lnb_page, nb->lnb_rc); + CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n", + nb->lnb_page ? 
page_index(nb->lnb_page) : -1); +} + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + off = cpu_to_le64 (off); + id = cpu_to_le64 (id); + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + addr += len - LPDS - LPDS; + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu (off); + id = le64_to_cpu (id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu off: %#llx != " + "%#llx\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu end off: %#llx != " + "%#llx\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu end id: %#llx != " + "%#llx\n", who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c new file mode 100644 index 0000000000000..68952df7e1242 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c @@ -0,0 +1,1304 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/dt_object.c + * + * Dt Object. 
+ * Generic functions from dt_object.h + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +/* fid_be_to_cpu() */ +#include +#include +#include +#include + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; + +/* + * no lock is necessary to protect the list, because call-backs + * are added during system startup. Please refer to "struct dt_device". + */ +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); +} +EXPORT_SYMBOL(dt_txn_callback_add); + +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_del_init(&cb->dtc_linkage); +} +EXPORT_SYMBOL(dt_txn_callback_del); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *th) +{ + int rc = 0; + struct dt_txn_callback *cb; + + if (th->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + struct thandle *dtc_th = th; + + if (cb->dtc_txn_start == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + + /* + * Usually dt_txn_hook_start is called from bottom device, + * and if the thandle has th_top, then we need use top + * thandle for the callback in the top thandle layer + */ + if (th->th_top != NULL) + dtc_th = th->th_top; + + rc = cb->dtc_txn_start(env, dtc_th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_start); + +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) +{ + struct dt_device *dev = th->th_dev; + struct dt_txn_callback *cb; + int rc = 0; + + if (th->th_local) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_DT_TXN_STOP)) + return -EIO; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + struct thandle *dtc_th = th; + + if (cb->dtc_txn_stop == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + + /* + * Usually dt_txn_hook_stop is called from bottom device, + * and if the thandle has th_top, then we need use top + * thandle for the callback in the top thandle layer + */ + if (th->th_top != NULL) + dtc_th = th->th_top; + + rc = cb->dtc_txn_stop(env, dtc_th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_stop); + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t) +{ + INIT_LIST_HEAD(&dev->dd_txn_callbacks); + return lu_device_init(&dev->dd_lu_dev, t); +} +EXPORT_SYMBOL(dt_device_init); + +void dt_device_fini(struct dt_device *dev) +{ + lu_device_fini(&dev->dd_lu_dev); +} +EXPORT_SYMBOL(dt_device_fini); + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ + lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + 
result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LASSERTF(0, "invalid mode %o\n", mode); + result = 0; /* Just for satisfying compiler. */ + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. + */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* + * this differs from dt_locate by top_dev as parameter + * but not one from lu_site + */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev, + const struct lu_object_conf *conf) +{ + struct lu_object *lo; + struct lu_object *n; + + lo = lu_object_find_at(env, top_dev, fid, conf); + if (IS_ERR(lo)) + return ERR_PTR(PTR_ERR(lo)); + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of0(n, struct dt_object, do_lu); + } + + lu_object_put(env, lo); + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find an object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, + void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int rc; + + rc = dt_lookup_dir(env, obj, entry, fid); + dt_object_put(env, obj); + if (rc == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + rc = PTR_ERR(obj); + } + dfh->dfh_o = obj; + + return rc; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. + */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strlcpy(info->dti_buf, path, sizeof(info->dti_buf)); + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, info->dti_buf, + dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0) + o = dt_locate(env, dt, fid); + else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. 
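+ *
+ * Illustrative use only (the names below are hypothetical); the object
+ * returned on success holds a reference that the caller drops with
+ * dt_object_put():
+ *
+ *	struct lu_fid fid;
+ *	struct dt_object *o;
+ *
+ *	o = dt_store_open(env, dt, "", "CATALOGS", &fid);
+ *	if (!IS_ERR(o))
+ *		dt_object_put(env, o);
+ *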
+ * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, struct dt_device *dt, + const char *dirname, const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, filename, fid); + dt_object_put(env, dir); + } else { + file = dir; + } + + return file; +} + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + dt_object_put(env, dto); + dto = ERR_PTR(rc); + } + + RETURN(dto); +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + int result; + + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + ssize_t size; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + size = dt->do_body_ops->dbo_read(env, dt, buf, pos); + if (size < 0) + return size; + return (size == (ssize_t)buf->lb_len) ? 
0 : -EFAULT; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + ssize_t size; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + + size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th); + if (size < 0) + return size; + return (size == (ssize_t)buf->lb_len) ? 0 : -EFAULT; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck layout orphan */ +const struct dt_index_features dt_lfsck_layout_orphan_features = { + .dif_flags = 0, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(struct lu_orphan_rec_v3), + .dif_recsize_max = sizeof(struct lu_orphan_rec_v3), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_layout_orphan_features); + +/* lfsck layout dangling */ +const struct dt_index_features dt_lfsck_layout_dangling_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lfsck_layout_dangling_key), + .dif_keysize_max = sizeof(struct lfsck_layout_dangling_key), + .dif_recsize_min = sizeof(struct lu_fid), + .dif_recsize_max = sizeof(struct lu_fid), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_layout_dangling_features); + +/* lfsck namespace */ +const struct dt_index_features dt_lfsck_namespace_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_namespace_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + 
.dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_glb_features); + +/* slave quota files */ +const struct dt_index_features dt_quota_slv_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_slv_features); + +/* nodemap files, nodemap_rec size asserted in nodemap_storage.c */ +const struct dt_index_features dt_nodemap_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit nodemap/record id */ + .dif_keysize_max = sizeof(__u64), /* 64-bit nodemap/record id */ + .dif_recsize_min = sizeof(union nodemap_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(union nodemap_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_nodemap_features); + +/* + * helper function returning what dt_index_features structure should be used + * based on the FID sequence. This is used by OBD_IDX_READ RPC + */ +static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, + __u32 mode) +{ + if (seq == FID_SEQ_QUOTA_GLB) { + /* global quota index */ + if (!S_ISREG(mode)) + /* global quota index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_glb_features; + } else if (seq == FID_SEQ_QUOTA) { + /* quota slave index */ + if (!S_ISREG(mode)) + /* slave index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_slv_features; + } else if (seq == FID_SEQ_LAYOUT_RBTREE){ + return &dt_lfsck_layout_orphan_features; + } else if (seq >= FID_SEQ_NORMAL) { + /* object is part of the namespace, verify that it is a + * directory */ + if (!S_ISDIR(mode)) + /* sorry, we can only deal with directory */ + return ERR_PTR(-ENOTDIR); + return &dt_directory_features; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ + * RPC + * + * \param env - is the environment passed by the caller + * \param lp - is a pointer to the lu_page to fill + * \param nob - is the maximum number of bytes that should be copied + * \param iops - is the index operation vector associated with the index object + * \param it - is a pointer to the current iterator + * \param attr - is the index attribute to pass to iops->rec() + * \param arg - is a pointer to the idx_info structure + */ +static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, + size_t nob, const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg) +{ + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + __u64 hash; + __u16 hashsize = 0; + __u16 keysize = 0; + __u16 recsize; + int rc; + + ENTRY; + + if (nob < LIP_HDR_SIZE) + return -EINVAL; + + /* initialize the header of the new container */ + memset(lip, 0, 
LIP_HDR_SIZE); + lip->lip_magic = LIP_MAGIC; + nob -= LIP_HDR_SIZE; + + /* client wants to the 64-bit hash value associated with each record */ + if (!(ii->ii_flags & II_FL_NOHASH)) + hashsize = sizeof(hash); + + entry = lip->lip_entries; + do { + /* fetch 64-bit hash value */ + hash = iops->store(env, it); + ii->ii_hash_end = hash; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) { + if (lip->lip_nr != 0) + GOTO(out, rc = 0); + } + + if (!(ii->ii_flags & II_FL_NOKEY)) { + keysize = iops->key_size(env, it); + if (!(ii->ii_flags & II_FL_VARKEY) && + keysize != ii->ii_keysize) { + CERROR("keysize mismatch %hu != %hu.\n", + keysize, ii->ii_keysize); + GOTO(out, rc = -EINVAL); + } + } + + /* and finally the record */ + if (ii->ii_flags & II_FL_VARREC) + recsize = iops->rec_size(env, it, attr); + else + recsize = ii->ii_recsize; + + if (nob < hashsize + keysize + recsize) { + if (lip->lip_nr == 0) + GOTO(out, rc = -E2BIG); + GOTO(out, rc = 0); + } + + rc = iops->rec(env, it, + (struct dt_rec *)(entry + hashsize + keysize), + attr); + if (!rc) { + if (hashsize) + memcpy(entry, &hash, hashsize); + if (keysize) { + struct dt_key *key; + + key = iops->key(env, it); + memcpy(entry + hashsize, key, keysize); + } + /* hash/key/record successfully copied! */ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + entry += hashsize + keysize + recsize; + nob -= hashsize + keysize + recsize; + } else if (rc != -ESTALE) { + GOTO(out, rc); + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + } while (rc == 0); + + GOTO(out, rc); +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller. + * If NULL, uses dt_index_page_build + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + size_t pageidx, nob, nlupgs = 0; + int rc; + ENTRY; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + if (filler == NULL) + filler = dt_index_page_build; + + nob = rdpg->rp_count; + if (nob == 0) + RETURN(-EFAULT); + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. 
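+		 *
+		 * Either way, the single iops->next() call below leaves the
+		 * iterator on the first record whose key is greater than the
+		 * requested one, or returns > 0 if no such record exists.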
+ */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } else { + if (rc == -ENODATA) + rc = 0; + GOTO(out, rc); + } + + /* + * Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. + */ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(size_t, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + +out: + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(size_t, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + RETURN(rc); +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. + */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + ENTRY; + + /* + * rp_count shouldn't be null and should be a multiple of the container + * size + */ + if (rdpg->rp_count == 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + RETURN(-EFAULT); + + if (!fid_is_quota(&ii->ii_fid) && !fid_is_layout_rbtree(&ii->ii_fid) && + !fid_is_norm(&ii->ii_fid)) + RETURN(-EOPNOTSUPP); + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + if (dt_object_exists(obj) == 0) + GOTO(out, rc = -ENOENT); + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) + GOTO(out, rc = PTR_ERR(feat)); + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + GOTO(out, rc); + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= (II_FL_NOHASH | II_FL_NOKEY | II_FL_VARKEY | + II_FL_VARREC); + + if (!(feat->dif_flags & DT_IND_VARKEY)) + ii->ii_keysize = feat->dif_keysize_max; + + if (!(feat->dif_flags & DT_IND_VARREC)) + ii->ii_recsize = feat->dif_recsize_max; + + if (feat->dif_flags & DT_IND_NONUNQ) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + if (!fid_is_layout_rbtree(&ii->ii_fid)) { + dt_read_lock(env, obj, 0); + /* fetch object version before walking the index */ + ii->ii_version = dt_version_get(env, obj); + } + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii); + if (!fid_is_layout_rbtree(&ii->ii_fid)) + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = 
II_END_OFF; + } + + GOTO(out, rc); +out: + dt_object_put(env, obj); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#ifdef CONFIG_PROC_FS +int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%u\n", (unsigned) osfs.os_bsize); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_blksize_seq_show); + +int lprocfs_dt_kbytestotal_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytestotal_seq_show); + +int lprocfs_dt_kbytesfree_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytesfree_seq_show); + +int lprocfs_dt_kbytesavail_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytesavail_seq_show); + +int lprocfs_dt_filestotal_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%llu\n", osfs.os_files); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_filestotal_seq_show); + +int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%llu\n", osfs.os_ffree); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_filesfree_seq_show); + +#endif /* CONFIG_PROC_FS */ + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lu_device *lu = dt2lu_dev(dt); + + if (!lu->ld_obd) + return -ENODEV; + + return sprintf(buf, "%s\n", lu->ld_obd->obd_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%u\n", (unsigned) osfs.os_bsize); +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + 
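+	/* Same conversion as in kbytestotal_show() above: the shift loop
+	 * multiplies the free-block count by (os_bsize >> 10) without a
+	 * 64-bit multiply, yielding the value in KiB for power-of-two
+	 * block sizes. */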
struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_ffree); +} +LUSTRE_RO_ATTR(filesfree); + +static const struct attribute *dt_def_attrs[] = { + &lustre_attr_uuid.attr, + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + NULL, +}; + +static void dt_sysfs_release(struct kobject *kobj) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + + complete(&dt->dd_kobj_unregister); +} + +int dt_tunables_fini(struct dt_device *dt) +{ + if (!dt) + return -EINVAL; + + if (!IS_ERR_OR_NULL(dt->dd_debugfs_entry)) + ldebugfs_remove(&dt->dd_debugfs_entry); + + if (dt->dd_def_attrs) + sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); + + kobject_put(&dt->dd_kobj); + wait_for_completion(&dt->dd_kobj_unregister); + + return 0; +} +EXPORT_SYMBOL(dt_tunables_fini); + +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list) +{ + int rc; + + dt->dd_ktype.sysfs_ops = &lustre_sysfs_ops; + dt->dd_ktype.release = dt_sysfs_release; + + init_completion(&dt->dd_kobj_unregister); + rc = kobject_init_and_add(&dt->dd_kobj, &dt->dd_ktype, type->typ_kobj, + "%s", name); + if (rc) + return rc; + + dt->dd_def_attrs = dt_def_attrs; + + rc = sysfs_create_files(&dt->dd_kobj, dt->dd_def_attrs); + if (rc) { + kobject_put(&dt->dd_kobj); + return rc; + } + + /* + * No need to register debugfs if no enteries. This allows us to + * choose between using dt_device or obd_device for debugfs. + */ + if (!list) + return rc; + + dt->dd_debugfs_entry = ldebugfs_register(name, + type->typ_debugfs_entry, + list, dt); + if (IS_ERR_OR_NULL(dt->dd_debugfs_entry)) { + rc = dt->dd_debugfs_entry ? 
PTR_ERR(dt->dd_debugfs_entry) + : -ENOMEM; + CERROR("%s: error %d setting up debugfs\n", + name, rc); + dt->dd_debugfs_entry = NULL; + sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); + kobject_put(&dt->dd_kobj); + return rc; + } + + return rc; +} +EXPORT_SYMBOL(dt_tunables_init); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c new file mode 100644 index 0000000000000..bd9330daafd8a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -0,0 +1,2415 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/genops.c + * + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(obd_types_lock); +static LIST_HEAD(obd_types); +DEFINE_RWLOCK(obd_dev_lock); +static struct obd_device *obd_devs[MAX_OBD_DEVICES]; + +static struct kmem_cache *obd_device_cachep; + +static struct workqueue_struct *zombie_wq; + +static void obd_zombie_export_add(struct obd_export *exp); +static void obd_zombie_import_add(struct obd_import *imp); +static void print_export_data(struct obd_export *exp, + const char *status, int locks, int debug_level); + +static LIST_HEAD(obd_stale_exports); +static DEFINE_SPINLOCK(obd_stale_export_lock); +static atomic_t obd_stale_export_num = ATOMIC_INIT(0); + +int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +EXPORT_SYMBOL(ptlrpc_put_connection_superhack); + +/* + * support functions: we could use inter-module communication, but this + * is more portable to other OS's + */ +static struct obd_device *obd_device_alloc(void) +{ + struct obd_device *obd; + + OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, GFP_NOFS); + if (obd != NULL) { + obd->obd_magic = OBD_DEVICE_MAGIC; + } + return obd; +} + +static void obd_device_free(struct obd_device *obd) +{ + LASSERT(obd != NULL); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } + lu_ref_fini(&obd->obd_reference); + OBD_SLAB_FREE_PTR(obd, obd_device_cachep); +} + +struct obd_type *class_search_type(const char *name) +{ + 
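+	/* Note: the lookup below returns the obd_type without taking a
+	 * reference; callers that need to pin the type and its module use
+	 * class_get_type(), which bumps typ_refcnt under obd_type_lock. */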
struct list_head *tmp; + struct obd_type *type; + + spin_lock(&obd_types_lock); + list_for_each(tmp, &obd_types) { + type = list_entry(tmp, struct obd_type, typ_chain); + if (strcmp(type->typ_name, name) == 0) { + spin_unlock(&obd_types_lock); + return type; + } + } + spin_unlock(&obd_types_lock); + return NULL; +} +EXPORT_SYMBOL(class_search_type); + +struct obd_type *class_get_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + +#ifdef HAVE_MODULE_LOADING_SUPPORT + if (!type) { + const char *modname = name; + + if (strcmp(modname, "obdfilter") == 0) + modname = "ofd"; + + if (strcmp(modname, LUSTRE_LWP_NAME) == 0) + modname = LUSTRE_OSP_NAME; + + if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME))) + modname = LUSTRE_MDT_NAME; + + if (!request_module("%s", modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); + type = class_search_type(name); + } else { + LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", + modname); + } + } +#endif + if (type) { + spin_lock(&type->obd_type_lock); + type->typ_refcnt++; + try_module_get(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); + } + return type; +} + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + spin_lock(&type->obd_type_lock); + type->typ_refcnt--; + module_put(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); +} + +static void class_sysfs_release(struct kobject *kobj) +{ + OBD_FREE(kobj, sizeof(*kobj)); +} + +static struct kobj_type class_ktype = { + .sysfs_ops = &lustre_sysfs_ops, + .release = class_sysfs_release, +}; + +struct kobject *class_setup_tunables(const char *name) +{ + struct kobject *kobj; + int rc; + +#ifdef HAVE_SERVER_SUPPORT + kobj = kset_find_obj(lustre_kset, name); + if (kobj) + return kobj; +#endif + OBD_ALLOC(kobj, sizeof(*kobj)); + if (!kobj) + return ERR_PTR(-ENOMEM); + + kobj->kset = lustre_kset; + kobject_init(kobj, &class_ktype); + rc = kobject_add(kobj, &lustre_kset->kobj, "%s", name); + if (rc) { + kobject_put(kobj); + return ERR_PTR(rc); + } + return kobj; +} +EXPORT_SYMBOL(class_setup_tunables); + +#define CLASS_MAX_NAME 1024 + +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, + bool enable_proc, struct ldebugfs_vars *vars, + const char *name, struct lu_device_type *ldt) +{ + struct obd_type *type; +#ifdef HAVE_SERVER_SUPPORT + struct qstr dname; +#endif /* HAVE_SERVER_SUPPORT */ + int rc = 0; + + ENTRY; + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + if (class_search_type(name)) { + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + RETURN(-EEXIST); + } + + rc = -ENOMEM; + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + RETURN(rc); + + OBD_ALLOC_PTR(type->typ_dt_ops); + OBD_ALLOC_PTR(type->typ_md_ops); + OBD_ALLOC(type->typ_name, strlen(name) + 1); + + if (type->typ_dt_ops == NULL || + type->typ_md_ops == NULL || + type->typ_name == NULL) + GOTO (failed, rc); + + *(type->typ_dt_ops) = *dt_ops; + /* md_ops is optional */ + if (md_ops) + *(type->typ_md_ops) = *md_ops; + strcpy(type->typ_name, name); + spin_lock_init(&type->obd_type_lock); + +#ifdef CONFIG_PROC_FS + if (enable_proc) { + type->typ_procroot = lprocfs_register(type->typ_name, + proc_lustre_root, + NULL, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + GOTO(failed, rc); + } + } +#endif +#ifdef HAVE_SERVER_SUPPORT + dname.name = name; + dname.len = strlen(dname.name); + dname.hash = 
ll_full_name_hash(debugfs_lustre_root, dname.name, + dname.len); + type->typ_debugfs_entry = d_lookup(debugfs_lustre_root, &dname); + if (type->typ_debugfs_entry) { + dput(type->typ_debugfs_entry); + type->typ_sym_filter = true; + goto dir_exist; + } +#endif /* HAVE_SERVER_SUPPORT */ + + type->typ_debugfs_entry = ldebugfs_register(type->typ_name, + debugfs_lustre_root, + vars, type); + if (IS_ERR_OR_NULL(type->typ_debugfs_entry)) { + rc = type->typ_debugfs_entry ? PTR_ERR(type->typ_debugfs_entry) + : -ENOMEM; + type->typ_debugfs_entry = NULL; + GOTO(failed, rc); + } +#ifdef HAVE_SERVER_SUPPORT +dir_exist: +#endif + type->typ_kobj = class_setup_tunables(type->typ_name); + if (IS_ERR(type->typ_kobj)) + GOTO(failed, rc = PTR_ERR(type->typ_kobj)); + + if (ldt) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc) { + kobject_put(type->typ_kobj); + GOTO(failed, rc); + } + } + + spin_lock(&obd_types_lock); + list_add(&type->typ_chain, &obd_types); + spin_unlock(&obd_types_lock); + + RETURN(0); + +failed: +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) + type->typ_debugfs_entry = NULL; +#endif + if (!IS_ERR_OR_NULL(type->typ_debugfs_entry)) + ldebugfs_remove(&type->typ_debugfs_entry); + if (type->typ_name != NULL) { +#ifdef CONFIG_PROC_FS + if (type->typ_procroot != NULL) + remove_proc_subtree(type->typ_name, proc_lustre_root); +#endif + OBD_FREE(type->typ_name, strlen(name) + 1); + } + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(rc); +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + ENTRY; + + if (!type) { + CERROR("unknown obd type\n"); + RETURN(-EINVAL); + } + + if (type->typ_refcnt) { + CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the name for debugging */ + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE_PTR(type->typ_md_ops); + RETURN(-EBUSY); + } + + kobject_put(type->typ_kobj); + + /* we do not use type->typ_procroot as for compatibility purposes + * other modules can share names (i.e. lod can use lov entry). so + * we can't reference pointer as it can get invalided when another + * module removes the entry */ +#ifdef CONFIG_PROC_FS + if (type->typ_procroot != NULL) + remove_proc_subtree(type->typ_name, proc_lustre_root); + if (type->typ_procsym != NULL) + lprocfs_remove(&type->typ_procsym); +#endif +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) + type->typ_debugfs_entry = NULL; +#endif + if (!IS_ERR_OR_NULL(type->typ_debugfs_entry)) + ldebugfs_remove(&type->typ_debugfs_entry); + + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + + spin_lock(&obd_types_lock); + list_del(&type->typ_chain); + spin_unlock(&obd_types_lock); + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(0); +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Allocate the new obd_device and initialize it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. 
+ * \param[in] uuid obd device UUID + * + * \retval newdev pointer to created obd_device + * \retval ERR_PTR(errno) on error + */ +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid) +{ + struct obd_device *newdev; + struct obd_type *type = NULL; + ENTRY; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + RETURN(ERR_PTR(-EINVAL)); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + RETURN(ERR_PTR(-ENODEV)); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) { + class_put_type(type); + RETURN(ERR_PTR(-ENOMEM)); + } + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1); + newdev->obd_type = type; + newdev->obd_minor = -1; + + rwlock_init(&newdev->obd_pool_lock); + newdev->obd_pool_limit = 0; + newdev->obd_pool_slv = 0; + + INIT_LIST_HEAD(&newdev->obd_exports); + INIT_LIST_HEAD(&newdev->obd_unlinked_exports); + INIT_LIST_HEAD(&newdev->obd_delayed_exports); + INIT_LIST_HEAD(&newdev->obd_exports_timed); + INIT_LIST_HEAD(&newdev->obd_nid_stats); + spin_lock_init(&newdev->obd_nid_lock); + spin_lock_init(&newdev->obd_dev_lock); + mutex_init(&newdev->obd_dev_mutex); + spin_lock_init(&newdev->obd_osfs_lock); + /* newdev->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + newdev->obd_osfs_age = ktime_get_seconds() - 1000; + + /* XXX belongs in setup not attach */ + init_rwsem(&newdev->obd_observer_link_sem); + /* recovery data */ + spin_lock_init(&newdev->obd_recovery_task_lock); + init_waitqueue_head(&newdev->obd_next_transno_waitq); + init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&newdev->obd_req_replay_queue); + INIT_LIST_HEAD(&newdev->obd_lock_replay_queue); + INIT_LIST_HEAD(&newdev->obd_final_req_queue); + INIT_LIST_HEAD(&newdev->obd_evict_list); + INIT_LIST_HEAD(&newdev->obd_lwp_list); + + llog_group_init(&newdev->obd_olg); + /* Detach drops this */ + atomic_set(&newdev->obd_refcount, 1); + lu_ref_init(&newdev->obd_reference); + lu_ref_add(&newdev->obd_reference, "newdev", newdev); + + newdev->obd_conn_inprogress = 0; + + strncpy(newdev->obd_uuid.uuid, uuid, UUID_MAX); + + CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", + newdev->obd_name, newdev); + + return newdev; +} + +/** + * Free obd device. + * + * \param[in] obd obd_device to be freed + * + * \retval none + */ +void class_free_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x " + "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd, + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERTF(atomic_read(&obd->obd_refcount) == 0, + "obd_refcount should be 0, not %d\n", + atomic_read(&obd->obd_refcount)); + LASSERT(obd_type != NULL); + + CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n", + obd->obd_name, obd->obd_type->typ_name); + + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + if (obd->obd_stopping) { + int err; + + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } + + obd_device_free(obd); + + class_put_type(obd_type); +} + +/** + * Unregister obd device. 
+ * + * Free slot in obd_dev[] used by \a obd. + * + * \param[in] new_obd obd_device to be unregistered + * + * \retval none + */ +void class_unregister_device(struct obd_device *obd) +{ + write_lock(&obd_dev_lock); + if (obd->obd_minor >= 0) { + LASSERT(obd_devs[obd->obd_minor] == obd); + obd_devs[obd->obd_minor] = NULL; + obd->obd_minor = -1; + } + write_unlock(&obd_dev_lock); +} + +/** + * Register obd device. + * + * Find free slot in obd_devs[], fills it with \a new_obd. + * + * \param[in] new_obd obd_device to be registered + * + * \retval 0 success + * \retval -EEXIST device with this name is registered + * \retval -EOVERFLOW obd_devs[] is full + */ +int class_register_device(struct obd_device *new_obd) +{ + int ret = 0; + int i; + int new_obd_minor = 0; + bool minor_assign = false; + bool retried = false; + +again: + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd != NULL && + (strcmp(new_obd->obd_name, obd->obd_name) == 0)) { + + if (!retried) { + write_unlock(&obd_dev_lock); + + /* the obd_device could be waited to be + * destroyed by the "obd_zombie_impexp_thread". + */ + obd_zombie_barrier(); + retried = true; + goto again; + } + + CERROR("%s: already exists, won't add\n", + obd->obd_name); + /* in case we found a free slot before duplicate */ + minor_assign = false; + ret = -EEXIST; + break; + } + if (!minor_assign && obd == NULL) { + new_obd_minor = i; + minor_assign = true; + } + } + + if (minor_assign) { + new_obd->obd_minor = new_obd_minor; + LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] " + "%p\n", new_obd_minor, obd_devs[new_obd_minor]); + obd_devs[new_obd_minor] = new_obd; + } else { + if (ret == 0) { + ret = -EOVERFLOW; + CERROR("%s: all %u/%u devices used, increase " + "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name, + i, class_devno_max(), ret); + } + } + write_unlock(&obd_dev_lock); + + RETURN(ret); +} + +static int class_name2dev_nolock(const char *name) +{ + int i; + + if (!name) + return -1; + + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + return i; + } + break; + } + } + + return -1; +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + i = class_name2dev_nolock(name); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev_nolock(struct obd_uuid *uuid) +{ + int i; + + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + return i; + } + } + + return -1; +} + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + i = class_uuid2dev_nolock(uuid); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_uuid2dev); + +struct obd_device *class_uuid2obd(struct obd_uuid *uuid) +{ + int dev = class_uuid2dev(uuid); + if (dev < 0) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_uuid2obd); + +/** + * Get obd device from 
::obd_devs[] + * + * \param num [in] array index + * + * \retval NULL if ::obd_devs[\a num] does not contains an obd device + * otherwise return the obd device there. + */ +struct obd_device *class_num2obd(int num) +{ + struct obd_device *obd = NULL; + + if (num < class_devno_max()) { + obd = obd_devs[num]; + if (obd == NULL) + return NULL; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == num, + "%p obd_minor %0d != %0d\n", + obd, obd->obd_minor, num); + } + + return obd; +} + +/** + * Find obd in obd_dev[] by name or uuid. + * + * Increment obd's refcount if found. + * + * \param[in] str obd name or uuid + * + * \retval NULL if not found + * \retval target pointer to found obd_device + */ +struct obd_device *class_dev_by_str(const char *str) +{ + struct obd_device *target = NULL; + struct obd_uuid tgtuuid; + int rc; + + obd_str2uuid(&tgtuuid, str); + + read_lock(&obd_dev_lock); + rc = class_uuid2dev_nolock(&tgtuuid); + if (rc < 0) + rc = class_name2dev_nolock(str); + + if (rc >= 0) + target = class_num2obd(rc); + + if (target != NULL) + class_incref(target, "find", current); + read_unlock(&obd_dev_lock); + + RETURN(target); +} +EXPORT_SYMBOL(class_dev_by_str); + +/** + * Get obd devices count. Device in any + * state are counted + * \retval obd device count + */ +int get_devices_count(void) +{ + int index, max_index = class_devno_max(), dev_count = 0; + + read_lock(&obd_dev_lock); + for (index = 0; index <= max_index; index++) { + struct obd_device *obd = class_num2obd(index); + if (obd != NULL) + dev_count++; + } + read_unlock(&obd_dev_lock); + + return dev_count; +} +EXPORT_SYMBOL(get_devices_count); + +void class_obd_list(void) +{ + char *status; + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n", + i, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + } + read_unlock(&obd_dev_lock); + return; +} + +/* Search for a client OBD connected to tgt_uuid. If grp_uuid is + specified, then only the client with that uuid is returned, + otherwise any client connected to the tgt is returned. */ +struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, + const char * typ_name, + struct obd_uuid *grp_uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if ((strncmp(obd->obd_type->typ_name, typ_name, + strlen(typ_name)) == 0)) { + if (obd_uuid_equals(tgt_uuid, + &obd->u.cli.cl_target_uuid) && + ((grp_uuid)? obd_uuid_equals(grp_uuid, + &obd->obd_uuid) : 1)) { + read_unlock(&obd_dev_lock); + return obd; + } + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_find_client_obd); + +/* Iterate the obd_device list looking devices have grp_uuid. Start + searching at *next, and if a device is found, the next index to look + at is saved in *next. If next is NULL, then the first matching device + will always be returned. 
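+
+   A typical scan over every device in the group (illustrative only;
+   the caller is responsible for any locking or refcounting it needs):
+
+	int next = 0;
+	struct obd_device *obd;
+
+	while ((obd = class_devices_in_group(&grp_uuid, &next)) != NULL)
+		visit(obd);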
*/ +struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. + */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, osp, lwp, mdt, ost + * because only these have a -sptlrpc llog */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_OSP_NAME) != 0 && + strcmp(type, LUSTRE_LWP_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __FUNCTION__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? rc : rc2; + class_decref(obd, __FUNCTION__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + ENTRY; + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + + EXIT; +} + +int obd_init_caches(void) +{ + int rc; + ENTRY; + + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create_usercopy("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, 0, sizeof(struct obd_device), NULL); + if (!obd_device_cachep) + GOTO(out, rc = -ENOMEM); + + RETURN(0); +out: + obd_cleanup_caches(); + RETURN(rc); +} + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + ENTRY; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + RETURN(NULL); + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + RETURN(NULL); + } + + CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie); + export = class_handle2object(conn->cookie, NULL); + RETURN(export); +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) + return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +/* Export management functions */ +static void class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + ENTRY; + + LASSERT_ATOMIC_ZERO(&exp->exp_refcount); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", 
exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */ + if (exp->exp_connection) + ptlrpc_put_connection_superhack(exp->exp_connection); + + LASSERT(list_empty(&exp->exp_outstanding_replies)); + LASSERT(list_empty(&exp->exp_uncommitted_replies)); + LASSERT(list_empty(&exp->exp_req_replay_queue)); + LASSERT(list_empty(&exp->exp_hp_rpcs)); + obd_destroy_export(exp); + /* self export doesn't hold a reference to an obd, although it + * exists until freeing of the obd */ + if (exp != obd->obd_self_export) + class_decref(obd, "export", exp); + + OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); + EXIT; +} + +static void export_handle_addref(void *export) +{ + class_export_get(export); +} + +static struct portals_handle_ops export_handle_ops = { + .hop_addref = export_handle_addref, + .hop_free = NULL, +}; + +struct obd_export *class_export_get(struct obd_export *exp) +{ + atomic_inc(&exp->exp_refcount); + CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount)); + return exp; +} +EXPORT_SYMBOL(class_export_get); + +void class_export_put(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount) - 1); + + if (atomic_dec_and_test(&exp->exp_refcount)) { + struct obd_device *obd = exp->exp_obd; + + CDEBUG(D_IOCTL, "final put %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + /* release nid stat refererence */ + lprocfs_exp_cleanup(exp); + + if (exp == obd->obd_self_export) { + /* self export should be destroyed without + * zombie thread as it doesn't hold a + * reference to obd and doesn't hold any + * resources */ + class_export_destroy(exp); + /* self export is destroyed, no class + * references exist and it is safe to free + * obd */ + class_free_dev(obd); + } else { + LASSERT(!list_empty(&exp->exp_obd_chain)); + obd_zombie_export_add(exp); + } + + } +} +EXPORT_SYMBOL(class_export_put); + +static void obd_zombie_exp_cull(struct work_struct *ws) +{ + struct obd_export *export; + + export = container_of(ws, struct obd_export, exp_zombie_work); + class_export_destroy(export); +} + +/* Creates a new export, adds it to the hash table, and returns a + * pointer to it. The refcount is 2: one for the hash reference, and + * one for the pointer returned by this function. 
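+ * The reference returned to the caller is expected to be dropped with
+ * class_export_put() once the caller is done with the export.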
*/ +struct obd_export *__class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid, bool is_self) +{ + struct obd_export *export; + struct cfs_hash *hash = NULL; + int rc = 0; + ENTRY; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + /* 2 = class_handle_hash + last */ + atomic_set(&export->exp_refcount, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_LIST_HEAD_RCU(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + INIT_LIST_HEAD(&export->exp_reg_rpcs); + class_handle_hash(&export->exp_handle, &export_handle_ops); + export->exp_last_request_time = ktime_get_real_seconds(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_uuid_hash); + INIT_HLIST_NODE(&export->exp_nid_hash); + INIT_HLIST_NODE(&export->exp_gen_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + INIT_LIST_HEAD(&export->exp_stale_list); + INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); + + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); + + rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", + obd->obd_name, cluuid->uuid, rc); + GOTO(exit_err, rc = -EALREADY); + } + } + + at_init(&export->exp_bl_lock_at, obd_timeout, 0); + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + if (hash) + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + GOTO(exit_unlock, rc = -ESHUTDOWN); + } + + if (!is_self) { + class_incref(obd, "export", export); + list_add_tail(&export->exp_obd_chain_timed, + &obd->obd_exports_timed); + list_add(&export->exp_obd_chain, &obd->obd_exports); + obd->obd_num_exports++; + } else { + INIT_LIST_HEAD(&export->exp_obd_chain_timed); + INIT_LIST_HEAD(&export->exp_obd_chain); + } + spin_unlock(&obd->obd_dev_lock); + if (hash) + cfs_hash_putref(hash); + RETURN(export); + +exit_unlock: + spin_unlock(&obd->obd_dev_lock); +exit_err: + if (hash) + cfs_hash_putref(hash); + class_handle_unhash(&export->exp_handle); + LASSERT(hlist_unhashed(&export->exp_uuid_hash)); + obd_destroy_export(export); + OBD_FREE_PTR(export); + return ERR_PTR(rc); +} + +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, false); +} +EXPORT_SYMBOL(class_new_export); + +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, true); +} + +void 
class_unlink_export(struct obd_export *exp) +{ + class_handle_unhash(&exp->exp_handle); + + if (exp->exp_obd->obd_self_export == exp) { + class_export_put(exp); + return; + } + + spin_lock(&exp->exp_obd->obd_dev_lock); + /* delete an uuid-export hashitem from hashtables */ + if (!hlist_unhashed(&exp->exp_uuid_hash)) + cfs_hash_del(exp->exp_obd->obd_uuid_hash, + &exp->exp_client_uuid, + &exp->exp_uuid_hash); + +#ifdef HAVE_SERVER_SUPPORT + if (!hlist_unhashed(&exp->exp_gen_hash)) { + struct tg_export_data *ted = &exp->exp_target_data; + struct cfs_hash *hash; + + /* Because obd_gen_hash will not be released until + * class_cleanup(), so hash should never be NULL here */ + hash = cfs_hash_getref(exp->exp_obd->obd_gen_hash); + LASSERT(hash != NULL); + cfs_hash_del(hash, &ted->ted_lcd->lcd_generation, + &exp->exp_gen_hash); + cfs_hash_putref(hash); + } +#endif /* HAVE_SERVER_SUPPORT */ + + list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports); + list_del_init(&exp->exp_obd_chain_timed); + exp->exp_obd->obd_num_exports--; + spin_unlock(&exp->exp_obd->obd_dev_lock); + atomic_inc(&obd_stale_export_num); + + /* A reference is kept by obd_stale_exports list */ + obd_stale_export_put(exp); +} +EXPORT_SYMBOL(class_unlink_export); + +/* Import management functions */ +static void obd_zombie_import_free(struct obd_import *imp) +{ + ENTRY; + + CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp, + imp->imp_obd->obd_name); + + LASSERT_ATOMIC_ZERO(&imp->imp_refcount); + + ptlrpc_put_connection_superhack(imp->imp_connection); + + while (!list_empty(&imp->imp_conn_list)) { + struct obd_import_conn *imp_conn; + + imp_conn = list_entry(imp->imp_conn_list.next, + struct obd_import_conn, oic_item); + list_del_init(&imp_conn->oic_item); + ptlrpc_put_connection_superhack(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + } + + LASSERT(imp->imp_sec == NULL); + LASSERTF(atomic_read(&imp->imp_reqs) == 0, "%s: imp_reqs = %d\n", + imp->imp_obd->obd_name, atomic_read(&imp->imp_reqs)); + class_decref(imp->imp_obd, "import", imp); + OBD_FREE_PTR(imp); + EXIT; +} + +struct obd_import *class_import_get(struct obd_import *import) +{ + atomic_inc(&import->imp_refcount); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + atomic_read(&import->imp_refcount), + import->imp_obd->obd_name); + return import; +} +EXPORT_SYMBOL(class_import_get); + +void class_import_put(struct obd_import *imp) +{ + ENTRY; + + LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); + + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, + atomic_read(&imp->imp_refcount) - 1, + imp->imp_obd->obd_name); + + if (atomic_dec_and_test(&imp->imp_refcount)) { + CDEBUG(D_INFO, "final put import %p\n", imp); + obd_zombie_import_add(imp); + } + + EXIT; +} +EXPORT_SYMBOL(class_import_put); + +static void init_imp_at(struct imp_at *at) { + int i; + at_init(&at->iat_net_latency, 0, 0); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. 
(But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +static void obd_zombie_imp_cull(struct work_struct *ws) +{ + struct obd_import *import; + + import = container_of(ws, struct obd_import, imp_zombie_work); + obd_zombie_import_free(import); +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + struct pid_namespace *curr_pid_ns = ll_task_pid_ns(current); + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + INIT_LIST_HEAD(&imp->imp_committed_list); + INIT_LIST_HEAD(&imp->imp_unreplied_list); + imp->imp_known_replied_xid = 0; + imp->imp_replay_cursor = &imp->imp_committed_list; + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + mutex_init(&imp->imp_sec_mutex); + init_waitqueue_head(&imp->imp_recovery_waitq); + INIT_WORK(&imp->imp_zombie_work, obd_zombie_imp_cull); + + if (curr_pid_ns && curr_pid_ns->child_reaper) + imp->imp_sec_refpid = curr_pid_ns->child_reaper->pid; + else + imp->imp_sec_refpid = 1; + + atomic_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_reqs, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + init_waitqueue_head(&imp->imp_replay_waitq); + atomic_set(&imp->imp_inval_count, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. 
*/ + imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; + + return imp; +} +EXPORT_SYMBOL(class_new_import); + +void class_destroy_import(struct obd_import *import) +{ + LASSERT(import != NULL); + LASSERT(import != LP_POISON); + + spin_lock(&import->imp_lock); + import->imp_generation++; + spin_unlock(&import->imp_lock); + class_import_put(import); +} +EXPORT_SYMBOL(class_destroy_import); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + + LASSERT(lock->l_exp_refs_nr >= 0); + + if (lock->l_exp_refs_target != NULL && + lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n", + exp, lock, lock->l_exp_refs_target); + } + if ((lock->l_exp_refs_nr ++) == 0) { + list_add(&lock->l_exp_refs_link, &exp->exp_locks_list); + lock->l_exp_refs_target = exp; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_add_lock_ref); + +void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + LASSERT(lock->l_exp_refs_nr > 0); + if (lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("lock %p, " + "mismatching export pointers: %p, %p\n", + lock, lock->l_exp_refs_target, exp); + } + if (-- lock->l_exp_refs_nr == 0) { + list_del_init(&lock->l_exp_refs_link); + lock->l_exp_refs_target = NULL; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_del_lock_ref); +#endif + +/* A connection defines an export context in which preallocation can + be managed. This releases the export pointer reference, and returns + the export handle, so the export refcount is 1 when this function + returns. 
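+
+   Minimal usage sketch (illustrative only; 'obd' and 'cluuid' are assumed
+   caller state, not defined by this patch):
+
+	struct lustre_handle conn;
+
+	rc = class_connect(&conn, obd, &cluuid);
+	if (rc == 0) {
+		exp = class_conn2export(&conn);	<- takes a fresh reference
+		... use exp ...
+		class_export_put(exp);
+	}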
*/ +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + LASSERT(conn != NULL); + LASSERT(obd != NULL); + LASSERT(cluuid != NULL); + ENTRY; + + export = class_new_export(obd, cluuid); + if (IS_ERR(export)) + RETURN(PTR_ERR(export)); + + conn->cookie = export->exp_handle.h_cookie; + class_export_put(export); + + CDEBUG(D_IOCTL, "connect: client %s, cookie %#llx\n", + cluuid->uuid, conn->cookie); + RETURN(0); +} +EXPORT_SYMBOL(class_connect); + +/* if export is involved in recovery then clean up related things */ +static void class_export_recovery_cleanup(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + spin_lock(&obd->obd_recovery_task_lock); + if (obd->obd_recovering) { + if (exp->exp_in_recovery) { + spin_lock(&exp->exp_lock); + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + LASSERT_ATOMIC_POS(&obd->obd_connected_clients); + atomic_dec(&obd->obd_connected_clients); + } + + /* if called during recovery then should update + * obd_stale_clients counter, + * lightweight exports are not counted */ + if ((exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0) + exp->exp_obd->obd_stale_clients++; + } + spin_unlock(&obd->obd_recovery_task_lock); + + spin_lock(&exp->exp_lock); + /** Cleanup req replay fields */ + if (exp->exp_req_replay_needed) { + exp->exp_req_replay_needed = 0; + + LASSERT(atomic_read(&obd->obd_req_replay_clients)); + atomic_dec(&obd->obd_req_replay_clients); + } + + /** Cleanup lock replay data */ + if (exp->exp_lock_replay_needed) { + exp->exp_lock_replay_needed = 0; + + LASSERT(atomic_read(&obd->obd_lock_replay_clients)); + atomic_dec(&obd->obd_lock_replay_clients); + } + spin_unlock(&exp->exp_lock); +} + +/* This function removes 1-3 references from the export: + * 1 - for export pointer passed + * and if disconnect really need + * 2 - removing from hash + * 3 - in client_unlink_export + * The export pointer passed to this function can destroyed */ +int class_disconnect(struct obd_export *export) +{ + int already_disconnected; + ENTRY; + + if (export == NULL) { + CWARN("attempting to free NULL export %p\n", export); + RETURN(-EINVAL); + } + + spin_lock(&export->exp_lock); + already_disconnected = export->exp_disconnected; + export->exp_disconnected = 1; + /* We hold references of export for uuid hash + * and nid_hash and export link at least. So + * it is safe to call cfs_hash_del in there. */ + if (!hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + spin_unlock(&export->exp_lock); + + /* class_cleanup(), abort_recovery(), and class_fail_export() + * all end up in here, and if any of them race we shouldn't + * call extra class_export_puts(). 
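+	 * Only the first caller to flip exp_disconnected performs the
+	 * recovery cleanup and unlink below; later racers fall through
+	 * to no_disconn and merely drop their own reference.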
*/ + if (already_disconnected) { + LASSERT(hlist_unhashed(&export->exp_nid_hash)); + GOTO(no_disconn, already_disconnected); + } + + CDEBUG(D_IOCTL, "disconnect: cookie %#llx\n", + export->exp_handle.h_cookie); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + RETURN(0); +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + int connected = 0; + + if (exp) { + spin_lock(&exp->exp_lock); + connected = (exp->exp_conn_cnt > 0) && !exp->exp_failed; + spin_unlock(&exp->exp_lock); + } + return connected; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + ENTRY; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. */ + while (!list_empty(list)) { + exp = list_entry(list->next, struct obd_export, + exp_obd_chain); + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " + "last request at %lld\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } + EXIT; +} + +void class_disconnect_exports(struct obd_device *obd) +{ + struct list_head work_list; + ENTRY; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. + */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + struct list_head work_list; + struct obd_export *exp, *n; + int evicted = 0; + ENTRY; + + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. 
lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp)); + print_export_data(exp, "EVICTING", 0, D_HA); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + struct cfs_hash *nid_hash; + struct obd_export *doomed_exp = NULL; + int exports_evicted = 0; + + lnet_nid_t nid_key = libcfs_str2nid((char *)nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + nid_hash = obd->obd_nid_hash; + cfs_hash_getref(nid_hash); + spin_unlock(&obd->obd_dev_lock); + + do { + doomed_exp = cfs_hash_lookup(nid_hash, &nid_key); + if (doomed_exp == NULL) + break; + + LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key, + "nid %s found, wanted nid %s, requested nid %s\n", + obd_export_nid2str(doomed_exp), + libcfs_nid2str(nid_key), nid); + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + exports_evicted++; + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative " + "request\n", obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + } while (1); + + cfs_hash_putref(nid_hash); + + if (!exports_evicted) + CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) +{ + struct cfs_hash *uuid_hash; + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + 
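+	/* grab our own reference on obd_uuid_hash while still holding
+	 * obd_dev_lock, so the hash cannot go away once the lock is
+	 * dropped below */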
uuid_hash = obd->obd_uuid_hash; + cfs_hash_getref(uuid_hash); + spin_unlock(&obd->obd_dev_lock); + + obd_str2uuid(&doomed_uuid, uuid); + if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) { + CERROR("%s: can't evict myself\n", obd->obd_name); + cfs_hash_putref(uuid_hash); + return exports_evicted; + } + + doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid); + + if (doomed_exp == NULL) { + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); + } else { + CWARN("%s: evicting %s at adminstrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + } + cfs_hash_putref(uuid_hash); + + return exports_evicted; +} + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); +#endif + +static void print_export_data(struct obd_export *exp, const char *status, + int locks, int debug_level) +{ + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + spin_lock(&exp->exp_lock); + list_for_each_entry(rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + spin_unlock(&exp->exp_lock); + + CDEBUG(debug_level, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: " + "%p %s %llu stale:%d\n", + exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp), atomic_read(&exp->exp_refcount), + atomic_read(&exp->exp_rpc_count), + atomic_read(&exp->exp_cb_count), + atomic_read(&exp->exp_locks_count), + exp->exp_disconnected, exp->exp_delayed, exp->exp_failed, + nreplies, first_reply, nreplies > 3 ? "..." : "", + exp->exp_last_committed, !list_empty(&exp->exp_stale_list)); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + if (locks && class_export_dump_hook != NULL) + class_export_dump_hook(exp); +#endif +} + +void dump_exports(struct obd_device *obd, int locks, int debug_level) +{ + struct obd_export *exp; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) + print_export_data(exp, "ACTIVE", locks, debug_level); + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) + print_export_data(exp, "UNLINKED", locks, debug_level); + list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) + print_export_data(exp, "DELAYED", locks, debug_level); + spin_unlock(&obd->obd_dev_lock); +} + +void obd_exports_barrier(struct obd_device *obd) +{ + int waited = 2; + LASSERT(list_empty(&obd->obd_exports)); + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_unlinked_exports)) { + spin_unlock(&obd->obd_dev_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(waited)); + if (waited > 5 && is_power_of_2(waited)) { + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports " + "more than %d seconds. " + "The obd refcount = %d. Is it stuck?\n", + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); + dump_exports(obd, 1, D_CONSOLE | D_WARNING); + } + waited *= 2; + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(obd_exports_barrier); + +/** + * Add export to the obd_zombe thread and notify it. 
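+ * The queued work item is obd_zombie_exp_cull(), which ends up calling
+ * class_export_destroy() from the zombie workqueue context.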
+ */ +static void obd_zombie_export_add(struct obd_export *exp) { + atomic_dec(&obd_stale_export_num); + spin_lock(&exp->exp_obd->obd_dev_lock); + LASSERT(!list_empty(&exp->exp_obd_chain)); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + + queue_work(zombie_wq, &exp->exp_zombie_work); +} + +/** + * Add import to the obd_zombe thread and notify it. + */ +static void obd_zombie_import_add(struct obd_import *imp) { + LASSERT(imp->imp_sec == NULL); + + queue_work(zombie_wq, &imp->imp_zombie_work); +} + +/** + * wait when obd_zombie import/export queues become empty + */ +void obd_zombie_barrier(void) +{ + flush_workqueue(zombie_wq); +} +EXPORT_SYMBOL(obd_zombie_barrier); + + +struct obd_export *obd_stale_export_get(void) +{ + struct obd_export *exp = NULL; + ENTRY; + + spin_lock(&obd_stale_export_lock); + if (!list_empty(&obd_stale_exports)) { + exp = list_entry(obd_stale_exports.next, + struct obd_export, exp_stale_list); + list_del_init(&exp->exp_stale_list); + } + spin_unlock(&obd_stale_export_lock); + + if (exp) { + CDEBUG(D_DLMTRACE, "Get export %p: total %d\n", exp, + atomic_read(&obd_stale_export_num)); + } + RETURN(exp); +} +EXPORT_SYMBOL(obd_stale_export_get); + +void obd_stale_export_put(struct obd_export *exp) +{ + ENTRY; + + LASSERT(list_empty(&exp->exp_stale_list)); + if (exp->exp_lock_hash && + atomic_read(&exp->exp_lock_hash->hs_count)) { + CDEBUG(D_DLMTRACE, "Put export %p: total %d\n", exp, + atomic_read(&obd_stale_export_num)); + + spin_lock_bh(&exp->exp_bl_list_lock); + spin_lock(&obd_stale_export_lock); + /* Add to the tail if there is no blocked locks, + * to the head otherwise. */ + if (list_empty(&exp->exp_bl_list)) + list_add_tail(&exp->exp_stale_list, + &obd_stale_exports); + else + list_add(&exp->exp_stale_list, + &obd_stale_exports); + + spin_unlock(&obd_stale_export_lock); + spin_unlock_bh(&exp->exp_bl_list_lock); + } else { + class_export_put(exp); + } + EXIT; +} +EXPORT_SYMBOL(obd_stale_export_put); + +/** + * Adjust the position of the export in the stale list, + * i.e. move to the head of the list if is needed. 
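+ * That is, once the export has entries on its exp_bl_list it is moved to
+ * the head of obd_stale_exports, so obd_stale_export_get() will pick it
+ * up first.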
+ **/ +void obd_stale_export_adjust(struct obd_export *exp) +{ + LASSERT(exp != NULL); + spin_lock_bh(&exp->exp_bl_list_lock); + spin_lock(&obd_stale_export_lock); + + if (!list_empty(&exp->exp_stale_list) && + !list_empty(&exp->exp_bl_list)) + list_move(&exp->exp_stale_list, &obd_stale_exports); + + spin_unlock(&obd_stale_export_lock); + spin_unlock_bh(&exp->exp_bl_list_lock); +} +EXPORT_SYMBOL(obd_stale_export_adjust); + +/** + * start destroy zombie import/export thread + */ +int obd_zombie_impexp_init(void) +{ + zombie_wq = alloc_workqueue("obd_zombid", 0, 0); + if (!zombie_wq) + return -ENOMEM; + + return 0; +} + +/** + * stop destroy zombie import/export thread + */ +void obd_zombie_impexp_stop(void) +{ + destroy_workqueue(zombie_wq); + LASSERT(list_empty(&obd_stale_exports)); +} + +/***** Kernel-userspace comm helpers *******/ + +/* Get length of entire message, including header */ +int kuc_len(int payload_len) +{ + return sizeof(struct kuc_hdr) + payload_len; +} +EXPORT_SYMBOL(kuc_len); + +/* Get a pointer to kuc header, given a ptr to the payload + * @param p Pointer to payload area + * @returns Pointer to kuc header + */ +struct kuc_hdr * kuc_ptr(void *p) +{ + struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1; + LASSERT(lh->kuc_magic == KUC_MAGIC); + return lh; +} +EXPORT_SYMBOL(kuc_ptr); + +/* Alloc space for a message, and fill in header + * @return Pointer to payload area + */ +void *kuc_alloc(int payload_len, int transport, int type) +{ + struct kuc_hdr *lh; + int len = kuc_len(payload_len); + + OBD_ALLOC(lh, len); + if (lh == NULL) + return ERR_PTR(-ENOMEM); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = transport; + lh->kuc_msgtype = type; + lh->kuc_msglen = len; + + return (void *)(lh + 1); +} +EXPORT_SYMBOL(kuc_alloc); + +/* Takes pointer to payload area */ +void kuc_free(void *p, int payload_len) +{ + struct kuc_hdr *lh = kuc_ptr(p); + OBD_FREE(lh, kuc_len(payload_len)); +} +EXPORT_SYMBOL(kuc_free); + +struct obd_request_slot_waiter { + struct list_head orsw_entry; + wait_queue_head_t orsw_waitq; + bool orsw_signaled; +}; + +static bool obd_request_slot_avail(struct client_obd *cli, + struct obd_request_slot_waiter *orsw) +{ + bool avail; + + spin_lock(&cli->cl_loi_list_lock); + avail = !!list_empty(&orsw->orsw_entry); + spin_unlock(&cli->cl_loi_list_lock); + + return avail; +}; + +/* + * For network flow control, the RPC sponsor needs to acquire a credit + * before sending the RPC. The credits count for a connection is defined + * by the "cl_max_rpcs_in_flight". If all the credits are occpuied, then + * the subsequent RPC sponsors need to wait until others released their + * credits, or the administrator increased the "cl_max_rpcs_in_flight". + */ +int obd_get_request_slot(struct client_obd *cli) +{ + struct obd_request_slot_waiter orsw; + struct l_wait_info lwi; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) { + cli->cl_rpcs_in_flight++; + spin_unlock(&cli->cl_loi_list_lock); + return 0; + } + + init_waitqueue_head(&orsw.orsw_waitq); + list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters); + orsw.orsw_signaled = false; + spin_unlock(&cli->cl_loi_list_lock); + + lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(orsw.orsw_waitq, + obd_request_slot_avail(cli, &orsw) || + orsw.orsw_signaled, + &lwi); + + /* Here, we must take the lock to avoid the on-stack 'orsw' to be + * freed but other (such as obd_put_request_slot) is using it. 
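+	 * For instance, obd_put_request_slot() on another CPU may have
+	 * just removed orsw from cl_flight_waiters and be about to wake
+	 * this waiter.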
*/ + spin_lock(&cli->cl_loi_list_lock); + if (rc != 0) { + if (!orsw.orsw_signaled) { + if (list_empty(&orsw.orsw_entry)) + cli->cl_rpcs_in_flight--; + else + list_del(&orsw.orsw_entry); + } + } + + if (orsw.orsw_signaled) { + LASSERT(list_empty(&orsw.orsw_entry)); + + rc = -EINTR; + } + spin_unlock(&cli->cl_loi_list_lock); + + return rc; +} +EXPORT_SYMBOL(obd_get_request_slot); + +void obd_put_request_slot(struct client_obd *cli) +{ + struct obd_request_slot_waiter *orsw; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_rpcs_in_flight--; + + /* If there is free slot, wakeup the first waiter. */ + if (!list_empty(&cli->cl_flight_waiters) && + likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) { + orsw = list_entry(cli->cl_flight_waiters.next, + struct obd_request_slot_waiter, orsw_entry); + list_del_init(&orsw->orsw_entry); + cli->cl_rpcs_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); +} +EXPORT_SYMBOL(obd_put_request_slot); + +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_rpcs_in_flight); + +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) +{ + struct obd_request_slot_waiter *orsw; + __u32 old; + int diff; + int i; + int rc; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + CDEBUG(D_INFO, "%s: max = %hu max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, + cli->cl_max_mod_rpcs_in_flight, cli->cl_max_rpcs_in_flight); + + if (strcmp(cli->cl_import->imp_obd->obd_type->typ_name, + LUSTRE_MDC_NAME) == 0) { + /* adjust max_mod_rpcs_in_flight to ensure it is always + * strictly lower that max_rpcs_in_flight */ + if (max < 2) { + CERROR("%s: cannot set mdc.*.max_rpcs_in_flight=1\n", + cli->cl_import->imp_obd->obd_name); + return -ERANGE; + } + if (max <= cli->cl_max_mod_rpcs_in_flight) { + rc = obd_set_max_mod_rpcs_in_flight(cli, max - 1); + if (rc != 0) + return rc; + } + } + + spin_lock(&cli->cl_loi_list_lock); + old = cli->cl_max_rpcs_in_flight; + cli->cl_max_rpcs_in_flight = max; + client_adjust_max_dirty(cli); + + diff = max - old; + + /* We increase the max_rpcs_in_flight, then wakeup some waiters. */ + for (i = 0; i < diff; i++) { + if (list_empty(&cli->cl_flight_waiters)) + break; + + orsw = list_entry(cli->cl_flight_waiters.next, + struct obd_request_slot_waiter, orsw_entry); + list_del_init(&orsw->orsw_entry); + cli->cl_rpcs_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_rpcs_in_flight); + +__u16 obd_get_max_mod_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_mod_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_mod_rpcs_in_flight); + +int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) +{ + struct obd_connect_data *ocd; + __u16 maxmodrpcs; + __u16 prev; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + ocd = &cli->cl_import->imp_connect_data; + CDEBUG(D_INFO, "%s: max = %hu flags = %llx, max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, ocd->ocd_connect_flags, + ocd->ocd_maxmodrpcs, cli->cl_max_rpcs_in_flight); + + if (max == OBD_MAX_RIF_MAX) + max = OBD_MAX_RIF_MAX - 1; + + /* Cannot exceed or equal max_rpcs_in_flight. If we are asked to + * increase this value, also bump up max_rpcs_in_flight to match. 
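+	 * Worked example (illustrative values): with max_rpcs_in_flight
+	 * of 8 the modify limit may be at most 7; a request to set it to
+	 * 8 makes the branch below raise max_rpcs_in_flight to 9 first.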
+ */ + if (max >= cli->cl_max_rpcs_in_flight) { + CDEBUG(D_INFO, + "%s: increasing max_rpcs_in_flight=%hu to allow larger max_mod_rpcs_in_flight=%u\n", + cli->cl_import->imp_obd->obd_name, max + 1, max); + obd_set_max_rpcs_in_flight(cli, max + 1); + } + + /* cannot exceed max modify RPCs in flight supported by the server, + * but verify ocd_connect_flags is at least initialized first. If + * not, allow it and fix value later in ptlrpc_connect_set_flags(). + */ + if (!ocd->ocd_connect_flags) { + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + } else if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) { + maxmodrpcs = ocd->ocd_maxmodrpcs; + if (maxmodrpcs == 0) { /* connection not finished yet */ + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + CDEBUG(D_INFO, + "%s: partial connect, assume maxmodrpcs=%hu\n", + cli->cl_import->imp_obd->obd_name, maxmodrpcs); + } + } else { + maxmodrpcs = 1; + } + if (max > maxmodrpcs) { + CERROR("%s: can't set max_mod_rpcs_in_flight=%hu higher than ocd_maxmodrpcs=%hu returned by the server at connection\n", + cli->cl_import->imp_obd->obd_name, + max, maxmodrpcs); + return -ERANGE; + } + + spin_lock(&cli->cl_mod_rpcs_lock); + + prev = cli->cl_max_mod_rpcs_in_flight; + cli->cl_max_mod_rpcs_in_flight = max; + + /* wakeup waiters if limit has been increased */ + if (cli->cl_max_mod_rpcs_in_flight > prev) + wake_up(&cli->cl_mod_rpcs_waitq); + + spin_unlock(&cli->cl_mod_rpcs_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_mod_rpcs_in_flight); + +int obd_mod_rpc_stats_seq_show(struct client_obd *cli, + struct seq_file *seq) +{ + unsigned long mod_tot = 0, mod_cum; + struct timespec64 now; + int i; + + ktime_get_real_ts64(&now); + + spin_lock(&cli->cl_mod_rpcs_lock); + + seq_printf(seq, "snapshot_time: %llu.%9lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "modify_RPCs_in_flight: %hu\n", + cli->cl_mod_rpcs_in_flight); + + seq_printf(seq, "\n\t\t\tmodify\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %%\n"); + + mod_tot = lprocfs_oh_sum(&cli->cl_mod_rpcs_hist); + + mod_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long mod = cli->cl_mod_rpcs_hist.oh_buckets[i]; + mod_cum += mod; + seq_printf(seq, "%d:\t\t%10lu %3u %3u\n", + i, mod, pct(mod, mod_tot), + pct(mod_cum, mod_tot)); + if (mod_cum == mod_tot) + break; + } + + spin_unlock(&cli->cl_mod_rpcs_lock); + + return 0; +} +EXPORT_SYMBOL(obd_mod_rpc_stats_seq_show); + +/* The number of modify RPCs sent in parallel is limited + * because the server has a finite number of slots per client to + * store request result and ensure reply reconstruction when needed. + * On the client, this limit is stored in cl_max_mod_rpcs_in_flight + * that takes into account server limit and cl_max_rpcs_in_flight + * value. + * On the MDC client, to avoid a potential deadlock (see Bugzilla 3462), + * one close request is allowed above the maximum. 
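+ *
+ * Illustrative example (values assumed): with cl_max_mod_rpcs_in_flight
+ * of 7 and seven modify RPCs already in flight, a regular modify request
+ * must wait for a free slot, while one MDS_CLOSE may still proceed as
+ * long as no other close request is in flight.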
+ */ +static inline bool obd_mod_rpc_slot_avail_locked(struct client_obd *cli, + bool close_req) +{ + bool avail; + + /* A slot is available if + * - number of modify RPCs in flight is less than the max + * - it's a close RPC and no other close request is in flight + */ + avail = cli->cl_mod_rpcs_in_flight < cli->cl_max_mod_rpcs_in_flight || + (close_req && cli->cl_close_rpcs_in_flight == 0); + + return avail; +} + +static inline bool obd_mod_rpc_slot_avail(struct client_obd *cli, + bool close_req) +{ + bool avail; + + spin_lock(&cli->cl_mod_rpcs_lock); + avail = obd_mod_rpc_slot_avail_locked(cli, close_req); + spin_unlock(&cli->cl_mod_rpcs_lock); + return avail; +} + +static inline bool obd_skip_mod_rpc_slot(const struct lookup_intent *it) +{ + if (it != NULL && + (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_READDIR || + (it->it_op == IT_LAYOUT && !(it->it_flags & MDS_FMODE_WRITE)))) + return true; + return false; +} + +/* Get a modify RPC slot from the obd client @cli according + * to the kind of operation @opc that is going to be sent + * and the intent @it of the operation if it applies. + * If the maximum number of modify RPCs in flight is reached + * the thread is put to sleep. + * Returns the tag to be set in the request message. Tag 0 + * is reserved for non-modifying requests. + */ +__u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc, + struct lookup_intent *it) +{ + struct l_wait_info lwi = LWI_INTR(NULL, NULL); + bool close_req = false; + __u16 i, max; + + /* read-only metadata RPCs don't consume a slot on MDT + * for reply reconstruction + */ + if (obd_skip_mod_rpc_slot(it)) + return 0; + + if (opc == MDS_CLOSE) + close_req = true; + + do { + spin_lock(&cli->cl_mod_rpcs_lock); + max = cli->cl_max_mod_rpcs_in_flight; + if (obd_mod_rpc_slot_avail_locked(cli, close_req)) { + /* there is a slot available */ + cli->cl_mod_rpcs_in_flight++; + if (close_req) + cli->cl_close_rpcs_in_flight++; + lprocfs_oh_tally(&cli->cl_mod_rpcs_hist, + cli->cl_mod_rpcs_in_flight); + /* find a free tag */ + i = find_first_zero_bit(cli->cl_mod_tag_bitmap, + max + 1); + LASSERT(i < OBD_MAX_RIF_MAX); + LASSERT(!test_and_set_bit(i, cli->cl_mod_tag_bitmap)); + spin_unlock(&cli->cl_mod_rpcs_lock); + /* tag 0 is reserved for non-modify RPCs */ + return i + 1; + } + spin_unlock(&cli->cl_mod_rpcs_lock); + + CDEBUG(D_RPCTRACE, "%s: sleeping for a modify RPC slot " + "opc %u, max %hu\n", + cli->cl_import->imp_obd->obd_name, opc, max); + + l_wait_event_exclusive(cli->cl_mod_rpcs_waitq, + obd_mod_rpc_slot_avail(cli, close_req), + &lwi); + } while (true); +} +EXPORT_SYMBOL(obd_get_mod_rpc_slot); + +/* Put a modify RPC slot from the obd client @cli according + * to the kind of operation @opc that has been sent and the + * intent @it of the operation if it applies. 
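+ *
+ * Hedged caller sketch (assumed code, not part of this patch; the tag is
+ * normally stored in the request so the server can match its reply slot):
+ *
+ *	tag = obd_get_mod_rpc_slot(cli, opc, it);	<- may sleep
+ *	... store tag in the request, send it, wait for the reply ...
+ *	obd_put_mod_rpc_slot(cli, opc, it, tag);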
+ */ +void obd_put_mod_rpc_slot(struct client_obd *cli, __u32 opc, + struct lookup_intent *it, __u16 tag) +{ + bool close_req = false; + + if (obd_skip_mod_rpc_slot(it)) + return; + + if (opc == MDS_CLOSE) + close_req = true; + + spin_lock(&cli->cl_mod_rpcs_lock); + cli->cl_mod_rpcs_in_flight--; + if (close_req) + cli->cl_close_rpcs_in_flight--; + /* release the tag in the bitmap */ + LASSERT(tag - 1 < OBD_MAX_RIF_MAX); + LASSERT(test_and_clear_bit(tag - 1, cli->cl_mod_tag_bitmap) != 0); + spin_unlock(&cli->cl_mod_rpcs_lock); + wake_up(&cli->cl_mod_rpcs_waitq); +} +EXPORT_SYMBOL(obd_put_mod_rpc_slot); + diff --git a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c new file mode 100644 index 0000000000000..1fcbb2a839f9d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c @@ -0,0 +1,163 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/idmap.c + * + * Lustre user identity mapping. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include +#include + +/* + * groups_search() is copied from linux kernel! + * A simple bsearch. + */ +static int lustre_groups_search(struct group_info *group_info, + gid_t grp) +{ + int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + int mid = (left + right) / 2; + int cmp = grp - + from_kgid(&init_user_ns, CFS_GROUP_AT(group_info, mid)); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist) +{ +#ifdef HAVE_GROUP_INFO_GID + memcpy(ginfo->gid, glist, ginfo->ngroups * sizeof(__u32)); +#else + int i; + int count = ginfo->ngroups; + + /* fill group_info from gid array */ + for (i = 0; i < ginfo->nblocks && count > 0; i++) { + int cp_count = min(CFS_NGROUPS_PER_BLOCK, count); + int off = i * CFS_NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*glist); + + memcpy(ginfo->blocks[i], glist + off, len); + count -= cp_count; + } +#endif +} +EXPORT_SYMBOL(lustre_groups_from_list); + +/* groups_sort() is copied from linux kernel! 
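+ *
+ * Illustrative stride sequence (assuming ngroups == 20): the gap grows
+ * 1 -> 4 -> 13 -> 40, is cut back to 13, and the passes then run with
+ * gaps 13, 4 and 1, the last pass being a plain insertion sort.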
*/ +/* a simple shell-metzner sort */ +void lustre_groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = from_kgid(&init_user_ns, + CFS_GROUP_AT(group_info, right)); + + while (left >= 0 && + tmp < from_kgid(&init_user_ns, + CFS_GROUP_AT(group_info, left))) { + CFS_GROUP_AT(group_info, right) = + CFS_GROUP_AT(group_info, left); + right = left; + left -= stride; + } + CFS_GROUP_AT(group_info, right) = + make_kgid(&init_user_ns, tmp); + } + stride /= 3; + } +} +EXPORT_SYMBOL(lustre_groups_sort); + +int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) +{ + int rc = 1; + + if (grp != mu->uc_fsgid) { + struct group_info *group_info = NULL; + + if (mu->uc_ginfo || !mu->uc_identity || + mu->uc_valid == UCRED_OLD) + if (grp == mu->uc_suppgids[0] || + grp == mu->uc_suppgids[1]) + return 1; + + if (mu->uc_ginfo) + group_info = mu->uc_ginfo; + else if (mu->uc_identity) + group_info = mu->uc_identity->mi_ginfo; + + if (!group_info) + return 0; + + atomic_inc(&group_info->usage); + rc = lustre_groups_search(group_info, grp); + if (atomic_dec_and_test(&group_info->usage)) + groups_free(group_info); + } + return rc; +} +EXPORT_SYMBOL(lustre_in_group_p); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/integrity.c b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c new file mode 100644 index 0000000000000..4a6d27aa6ae36 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c @@ -0,0 +1,277 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. 
+ * + * General data integrity functions + */ +#include +#include +#include +#include +#include + +#if IS_ENABLED(CONFIG_CRC_T10DIF) +__u16 obd_dif_crc_fn(void *data, unsigned int len) +{ + return cpu_to_be16(crc_t10dif(data, len)); +} +EXPORT_SYMBOL(obd_dif_crc_fn); + +__u16 obd_dif_ip_fn(void *data, unsigned int len) +{ + return ip_compute_csum(data, len); +} +EXPORT_SYMBOL(obd_dif_ip_fn); + +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __u16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn) +{ + unsigned int i = offset; + unsigned int end = offset + length; + char *data_buf; + __u16 *guard_buf = guard_start; + unsigned int data_size; + int used = 0; + + data_buf = kmap(page) + offset; + while (i < end) { + if (used >= guard_number) { + CERROR("%s: unexpected used guard number of DIF %u/%u, " + "data length %u, sector size %u: rc = %d\n", + obd_name, used, guard_number, length, + sector_size, -E2BIG); + return -E2BIG; + } + data_size = min(round_up(i + 1, sector_size), end) - i; + *guard_buf = fn(data_buf, data_size); + guard_buf++; + data_buf += data_size; + i += data_size; + used++; + } + kunmap(page); + *used_number = used; + + return 0; +} +EXPORT_SYMBOL(obd_page_dif_generate_buffer); + +static int __obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type, + struct page *data_page, + int repeat_number) +{ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct ahash_request *req; + obd_dif_csum_fn *fn = NULL; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __u16 *guard_start; + int guard_number; + int used_number = 0; + int sector_size = 0; + __u32 cksum; + int rc = 0; + int rc2; + int used; + int i; + + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + if (!fn) + return -EINVAL; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + for (i = 0; i < repeat_number; i++) { + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, data_page, 0, + PAGE_SIZE, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, fn); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + } + kunmap(__page); + if (rc) + GOTO(out_final, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); +out_final: + rc2 = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + rc = rc ? 
rc : rc2; +out: + __free_page(__page); + + return rc; +} + +/** + * Array of T10PI checksum algorithm speed in MByte per second + */ +static int obd_t10_cksum_speeds[OBD_T10_CKSUM_MAX]; + +static enum obd_t10_cksum_type +obd_t10_cksum2type(enum cksum_types cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + return OBD_T10_CKSUM_IP512; + case OBD_CKSUM_T10IP4K: + return OBD_T10_CKSUM_IP4K; + case OBD_CKSUM_T10CRC512: + return OBD_T10_CKSUM_CRC512; + case OBD_CKSUM_T10CRC4K: + return OBD_T10_CKSUM_CRC4K; + default: + return OBD_T10_CKSUM_UNKNOWN; + } +} + +static const char *obd_t10_cksum_name(enum obd_t10_cksum_type index) +{ + DECLARE_CKSUM_NAME; + + /* Need to skip "crc32", "adler", "crc32c", "reserved" */ + return cksum_name[3 + index]; +} + +/** + * Compute the speed of specified T10PI checksum type + * + * Run a speed test on the given T10PI checksum on buffer using a 1MB buffer + * size. This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the obd_t10_cksum_speeds[] array, and + * is available through the obd_t10_cksum_speed() function. + * + * This function needs to stay the same as cfs_crypto_performance_test() so + * that the speeds are comparable. And this function should reflect the real + * cost of the checksum calculation. + * + * \param[in] obd_name name of the OBD device + * \param[in] cksum_type checksum type (OBD_CKSUM_T10*) + */ +static void obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type) +{ + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + const int buf_len = max(PAGE_SIZE, 1048576UL); + unsigned long bcount; + unsigned long start; + unsigned long end; + struct page *page; + int rc = 0; + void *buf; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + rc = -ENOMEM; + goto out; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), + bcount = 0; time_before(jiffies, end) && rc == 0; bcount++) { + rc = __obd_t10_performance_test(obd_name, cksum_type, page, + buf_len / PAGE_SIZE); + if (rc) + break; + } + end = jiffies; + __free_page(page); +out: + if (rc) { + obd_t10_cksum_speeds[index] = rc; + CDEBUG(D_INFO, "%s: T10 checksum algorithm %s test error: " + "rc = %d\n", obd_name, obd_t10_cksum_name(index), rc); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + obd_t10_cksum_speeds[index] = (int)tmp; + CDEBUG(D_CONFIG, "%s: T10 checksum algorithm %s speed = %d " + "MB/s\n", obd_name, obd_t10_cksum_name(index), + obd_t10_cksum_speeds[index]); + } +} +#endif /* CONFIG_CRC_T10DIF */ + +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type) +{ +#if IS_ENABLED(CONFIG_CRC_T10DIF) + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + + if (unlikely(obd_t10_cksum_speeds[index] == 0)) { + static DEFINE_MUTEX(obd_t10_cksum_speed_mutex); + + mutex_lock(&obd_t10_cksum_speed_mutex); + if (obd_t10_cksum_speeds[index] == 0) + obd_t10_performance_test(obd_name, cksum_type); + mutex_unlock(&obd_t10_cksum_speed_mutex); + } + + return obd_t10_cksum_speeds[index]; +#else /* !CONFIG_CRC_T10DIF */ + return 0; +#endif /* !CONFIG_CRC_T10DIF */ +} +EXPORT_SYMBOL(obd_t10_cksum_speed); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/jobid.c b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c new file mode 100644 index 
0000000000000..b7a08d495b2ce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c @@ -0,0 +1,575 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2017 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Store PID->JobID mappings + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#ifdef HAVE_UIDGID_HEADER +#include +#endif +#include + +#include +#include +#include +#include + +static struct cfs_hash *jobid_hash; +static struct cfs_hash_ops jobid_hash_ops; +spinlock_t jobid_hash_lock; + +#define RESCAN_INTERVAL 30 +#define DELETE_INTERVAL 300 + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u"; + +/** + * Structure to store a single PID->JobID mapping + */ +struct jobid_pid_map { + struct hlist_node jp_hash; + time64_t jp_time; + spinlock_t jp_lock; /* protects jp_jobid */ + char jp_jobid[LUSTRE_JOBID_SIZE]; + unsigned int jp_joblen; + atomic_t jp_refcount; + pid_t jp_pid; +}; + +/* + * Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/API. Then, the jobid must be cached. + */ +int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len) +{ + int rc; + + rc = cfs_get_environ(jobid_var, jobid, jobid_len); + if (!rc) + goto out; + + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static ktime_t printed; + + if (unlikely(ktime_to_ns(printed) == 0 || + ktime_after(ktime_get(), + ktime_add_ns(printed, + 3600*24*NSEC_PER_SEC)))) { + LCONSOLE_WARN("jobid: '%s' value too large (%d)\n", + obd_jobid_var, *jobid_len); + printed = ktime_get(); + } + + rc = 0; + } else { + CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? 
D_INFO : D_ERROR, + "jobid: get '%s' failed: rc = %d\n", + obd_jobid_var, rc); + } + +out: + return rc; +} + +/* + * jobid_should_free_item + * + * Each item is checked to see if it should be released + * Removed from hash table by caller + * Actually freed in jobid_put_locked + * + * Returns 1 if item is to be freed, 0 if it is to be kept + */ + +static int jobid_should_free_item(void *obj, void *data) +{ + char *jobid = data; + struct jobid_pid_map *pidmap = obj; + int rc = 0; + + if (obj == NULL) + return 0; + + if (jobid == NULL) { + WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1); + return 1; + } + + spin_lock(&pidmap->jp_lock); + /* prevent newly inserted items from deleting */ + if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1) + rc = 1; + else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL) + rc = 1; + else if (strcmp(pidmap->jp_jobid, jobid) == 0) + rc = 1; + spin_unlock(&pidmap->jp_lock); + + return rc; +} + +/* + * jobid_name_is_valid + * + * Checks if the jobid is a Lustre process + * + * Returns true if jobid is valid + * Returns false if jobid looks like it's a Lustre process + */ +static bool jobid_name_is_valid(char *jobid) +{ + const char *const lustre_reserved[] = { "ll_ping", "ptlrpc", + "ldlm", "ll_sa", NULL }; + int i; + + if (jobid[0] == '\0') + return false; + + for (i = 0; lustre_reserved[i] != NULL; i++) { + if (strncmp(jobid, lustre_reserved[i], + strlen(lustre_reserved[i])) == 0) + return false; + } + return true; +} + +/* + * jobid_get_from_cache() + * + * Returns contents of jobid_var from process environment for current PID. + * This will be cached for some time to avoid overhead scanning environment. + * + * Return: -ENOMEM if allocating a new pidmap fails + * -ENOENT if no entry could be found + * +ve string length for success (something was returned in jobid) + */ +static int jobid_get_from_cache(char *jobid, size_t joblen) +{ + static time64_t last_expire; + bool expire_cache = false; + pid_t pid = current_pid(); + struct jobid_pid_map *pidmap = NULL; + time64_t now = ktime_get_real_seconds(); + int rc = 0; + ENTRY; + + LASSERT(jobid_hash != NULL); + + /* scan hash periodically to remove old PID entries from cache */ + spin_lock(&jobid_hash_lock); + if (unlikely(last_expire + DELETE_INTERVAL <= now)) { + expire_cache = true; + last_expire = now; + } + spin_unlock(&jobid_hash_lock); + + if (expire_cache) + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, + "intentionally_bad_jobid"); + + /* first try to find PID in the hash and use that value */ + pidmap = cfs_hash_lookup(jobid_hash, &pid); + if (pidmap == NULL) { + struct jobid_pid_map *pidmap2; + + OBD_ALLOC_PTR(pidmap); + if (pidmap == NULL) + GOTO(out, rc = -ENOMEM); + + pidmap->jp_pid = pid; + pidmap->jp_time = 0; + pidmap->jp_jobid[0] = '\0'; + spin_lock_init(&pidmap->jp_lock); + INIT_HLIST_NODE(&pidmap->jp_hash); + /* + * @pidmap might be reclaimed just after it is added into + * hash list, init @jp_refcount as 1 to make sure memory + * could be not freed during access. + */ + atomic_set(&pidmap->jp_refcount, 1); + + /* + * Add the newly created map to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * map. The object which exists in the hash will be returned. 
+ */ + pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid, + &pidmap->jp_hash); + if (unlikely(pidmap != pidmap2)) { + CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n", + pid); + OBD_FREE_PTR(pidmap); + pidmap = pidmap2; + } + } + + /* + * If pidmap is old (this is always true for new entries) refresh it. + * If obd_jobid_var is not found, cache empty entry and try again + * later, to avoid repeat lookups for PID if obd_jobid_var missing. + */ + spin_lock(&pidmap->jp_lock); + if (pidmap->jp_time + RESCAN_INTERVAL <= now) { + char env_jobid[LUSTRE_JOBID_SIZE] = ""; + int env_len = sizeof(env_jobid); + + pidmap->jp_time = now; + + spin_unlock(&pidmap->jp_lock); + rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len); + + CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n", + pidmap->jp_pid, env_jobid); + spin_lock(&pidmap->jp_lock); + if (!rc) { + pidmap->jp_joblen = env_len; + strlcpy(pidmap->jp_jobid, env_jobid, + sizeof(pidmap->jp_jobid)); + rc = 0; + } else if (rc == -ENOENT) { + /* It might have been deleted, clear out old entry */ + pidmap->jp_joblen = 0; + pidmap->jp_jobid[0] = '\0'; + } + } + + /* + * Regardless of how pidmap was found, if it contains a valid entry + * use that for now. If there was a technical error (e.g. -ENOMEM) + * use the old cached value until it can be looked up again properly. + * If a cached missing entry was found, return -ENOENT. + */ + if (pidmap->jp_joblen) { + strlcpy(jobid, pidmap->jp_jobid, joblen); + joblen = pidmap->jp_joblen; + rc = 0; + } else if (!rc) { + rc = -ENOENT; + } + spin_unlock(&pidmap->jp_lock); + + cfs_hash_put(jobid_hash, &pidmap->jp_hash); + + EXIT; +out: + return rc < 0 ? rc : joblen; +} + +/* + * jobid_interpret_string() + * + * Interpret the jobfmt string to expand specified fields, like coredumps do: + * %e = executable + * %g = gid + * %h = hostname + * %j = jobid from environment + * %p = pid + * %u = uid + * + * Unknown escape strings are dropped. Other characters are copied through, + * excluding whitespace (to avoid making jobid parsing difficult). + * + * Return: -EOVERFLOW if the expanded string does not fit within @joblen + * 0 for success + */ +static int jobid_interpret_string(const char *jobfmt, char *jobid, + ssize_t joblen) +{ + char c; + + while ((c = *jobfmt++) && joblen > 1) { + char f; + int l; + + if (isspace(c)) /* Don't allow embedded spaces */ + continue; + + if (c != '%') { + *jobid = c; + joblen--; + jobid++; + continue; + } + + switch ((f = *jobfmt++)) { + case 'e': /* executable name */ + l = snprintf(jobid, joblen, "%s", current_comm()); + break; + case 'g': /* group ID */ + l = snprintf(jobid, joblen, "%u", + from_kgid(&init_user_ns, current_fsgid())); + break; + case 'h': /* hostname */ + l = snprintf(jobid, joblen, "%s", + init_utsname()->nodename); + break; + case 'j': /* jobid stored in process environment */ + l = jobid_get_from_cache(jobid, joblen); + if (l < 0) + l = 0; + break; + case 'p': /* process ID */ + l = snprintf(jobid, joblen, "%u", current_pid()); + break; + case 'u': /* user ID */ + l = snprintf(jobid, joblen, "%u", + from_kuid(&init_user_ns, current_fsuid())); + break; + case '\0': /* '%' at end of format string */ + l = 0; + goto out; + default: /* drop unknown %x format strings */ + l = 0; + break; + } + jobid += l; + joblen -= l; + } + /* + * This points at the end of the buffer, so long as jobid is always + * incremented the same amount as joblen is decremented. + */ +out: + jobid[joblen - 1] = '\0'; + + return joblen < 0 ? 
-EOVERFLOW : 0; +} + +/* + * Hash initialization, copied from server-side job stats bucket sizes + */ +#define HASH_JOBID_BKT_BITS 5 +#define HASH_JOBID_CUR_BITS 7 +#define HASH_JOBID_MAX_BITS 12 + +int jobid_cache_init(void) +{ + int rc = 0; + ENTRY; + + if (jobid_hash) + return 0; + + spin_lock_init(&jobid_hash_lock); + jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS, + HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS, + 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &jobid_hash_ops, CFS_HASH_DEFAULT); + if (!jobid_hash) + rc = -ENOMEM; + + RETURN(rc); +} +EXPORT_SYMBOL(jobid_cache_init); + +void jobid_cache_fini(void) +{ + struct cfs_hash *tmp_hash; + ENTRY; + + spin_lock(&jobid_hash_lock); + tmp_hash = jobid_hash; + jobid_hash = NULL; + spin_unlock(&jobid_hash_lock); + + if (tmp_hash != NULL) { + cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL); + cfs_hash_putref(tmp_hash); + } + + EXIT; +} +EXPORT_SYMBOL(jobid_cache_fini); + +/* + * Hash operations for pid<->jobid + */ +static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(pid_t), mask); +} + +static void *jobid_key(struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + return &pidmap->jp_pid; +} + +static int jobid_keycmp(const void *key, struct hlist_node *hnode) +{ + const pid_t *pid_key1; + const pid_t *pid_key2; + + LASSERT(key != NULL); + pid_key1 = (pid_t *)key; + pid_key2 = (pid_t *)jobid_key(hnode); + + return *pid_key1 == *pid_key2; +} + +static void *jobid_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct jobid_pid_map, jp_hash); +} + +static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + + atomic_inc(&pidmap->jp_refcount); +} + +static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + if (hnode == NULL) + return; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + LASSERT(atomic_read(&pidmap->jp_refcount) > 0); + if (atomic_dec_and_test(&pidmap->jp_refcount)) { + CDEBUG(D_INFO, "Freeing: %d->%s\n", + pidmap->jp_pid, pidmap->jp_jobid); + + OBD_FREE_PTR(pidmap); + } +} + +static struct cfs_hash_ops jobid_hash_ops = { + .hs_hash = jobid_hashfn, + .hs_keycmp = jobid_keycmp, + .hs_key = jobid_key, + .hs_object = jobid_object, + .hs_get = jobid_get, + .hs_put = jobid_put_locked, + .hs_put_locked = jobid_put_locked, +}; + +/** + * Generate the job identifier string for this process for tracking purposes. + * + * Fill in @jobid string based on the value of obd_jobid_var: + * JOBSTATS_DISABLE: none + * JOBSTATS_NODELOCAL: content of obd_jobid_node (jobid_interpret_string()) + * JOBSTATS_PROCNAME_UID: process name/UID + * anything else: look up obd_jobid_var in the processes environment + * + * Return -ve error number, 0 on success. 
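+ *
+ * Minimal usage sketch (the call site below is hypothetical):
+ *
+ *   char jobid[LUSTRE_JOBID_SIZE];
+ *   int rc = lustre_get_jobid(jobid, sizeof(jobid));
+ *
+ *   if (rc == 0)
+ *           CDEBUG(D_INFO, "jobid: %s\n", jobid);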
+ */ +int lustre_get_jobid(char *jobid, size_t joblen) +{ + int rc = 0; + ENTRY; + + if (unlikely(joblen < 2)) { + if (joblen == 1) + jobid[0] = '\0'; + RETURN(-EINVAL); + } + + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) { + /* Jobstats isn't enabled */ + memset(jobid, 0, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + /* Whole node dedicated to single job */ + rc = jobid_interpret_string(obd_jobid_name, jobid, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + rc = jobid_interpret_string("%e.%u", jobid, joblen); + } else if (jobid_name_is_valid(current_comm())) { + /* + * obd_jobid_var holds the jobid environment variable name. + * Skip initial check if obd_jobid_name already uses "%j", + * otherwise try just "%j" first, then fall back to whatever + * is in obd_jobid_name if obd_jobid_var is not found. + */ + rc = -EAGAIN; + if (!strnstr(obd_jobid_name, "%j", joblen)) + rc = jobid_get_from_cache(jobid, joblen); + + /* fall back to jobid_node if jobid_var not in environment */ + if (rc < 0) { + int rc2 = jobid_interpret_string(obd_jobid_name, + jobid, joblen); + if (!rc2) + rc = 0; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +/* + * lustre_jobid_clear + * + * Search cache for JobID given by @find_jobid. + * If any entries in the hash table match the value, they are removed + */ +void lustre_jobid_clear(const char *find_jobid) +{ + char jobid[LUSTRE_JOBID_SIZE]; + char *end; + + if (jobid_hash == NULL) + return; + + strlcpy(jobid, find_jobid, sizeof(jobid)); + /* trim \n off the end of the incoming jobid */ + end = strchr(jobid, '\n'); + if (end && *end == '\n') + *end = '\0'; + + CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid); + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid); + + CDEBUG(D_INFO, "%d items remain in jobID table\n", + atomic_read(&jobid_hash->hs_count)); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c new file mode 100644 index 0000000000000..7afb9484a8a69 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c @@ -0,0 +1,262 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * Using pipes for all arches. 
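+ *
+ * Typical flow for a broadcast group (sketch only; 'filp', 'uuid',
+ * 'uid', 'grp', 'data'/'data_len' and 'hdr' are hypothetical caller
+ * variables, and 'hdr' must begin with a struct kuc_hdr):
+ *
+ *   libcfs_kkuc_group_add(filp, &uuid, uid, grp, data, data_len);
+ *   ...
+ *   libcfs_kkuc_group_put(&uuid, grp, &hdr);
+ *   ...
+ *   libcfs_kkuc_group_rem(&uuid, uid, grp);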
+ */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include + +#include +#include + +/** + * libcfs_kkuc_msg_put - send an message from kernel to userspace + * @param fp to send the message to + * @param payload Payload data. First field of payload is always + * struct kuc_hdr + */ +int libcfs_kkuc_msg_put(struct file *filp, void *payload) +{ + struct kuc_hdr *kuch = (struct kuc_hdr *)payload; + ssize_t count = kuch->kuc_msglen; + loff_t offset = 0; + int rc = 0; + + if (IS_ERR_OR_NULL(filp)) + return -EBADF; + + if (kuch->kuc_magic != KUC_MAGIC) { + CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic); + return -ENOSYS; + } + + while (count > 0) { + rc = cfs_kernel_write(filp, payload, count, &offset); + if (rc < 0) + break; + count -= rc; + payload += rc; + rc = 0; + } + + if (rc < 0) + CWARN("message send failed (%d)\n", rc); + else + CDEBUG(D_HSM, "Sent message rc=%d, fp=%p\n", rc, filp); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_msg_put); + +/* Broadcast groups are global across all mounted filesystems; + * i.e. registering for a group on 1 fs will get messages for that + * group from any fs */ +/** A single group registration has a uid and a file pointer */ +struct kkuc_reg { + struct list_head kr_chain; + struct obd_uuid kr_uuid; + int kr_uid; + struct file *kr_fp; + char kr_data[0]; +}; + +static struct list_head kkuc_groups[KUC_GRP_MAX + 1]; +/* Protect message sending against remove and adds */ +static DECLARE_RWSEM(kg_sem); + +static inline bool libcfs_kkuc_group_is_valid(int group) +{ + return 0 <= group && group < ARRAY_SIZE(kkuc_groups); +} + +void libcfs_kkuc_init(void) +{ + int group; + + for (group = 0; group < ARRAY_SIZE(kkuc_groups); group++) + INIT_LIST_HEAD(&kkuc_groups[group]); +} + +/** Add a receiver to a broadcast group + * @param filp pipe to write into + * @param uid identifier for this receiver + * @param group group number + * @param data user data + */ +int libcfs_kkuc_group_add(struct file *filp, const struct obd_uuid *uuid, + int uid, int group, void *data, size_t data_len) +{ + struct kkuc_reg *reg; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* fput in group_rem */ + if (filp == NULL) + return -EBADF; + + /* freed in group_rem */ + reg = kzalloc(sizeof(*reg) + data_len, 0); + if (reg == NULL) + return -ENOMEM; + + reg->kr_uuid = *uuid; + reg->kr_fp = filp; + reg->kr_uid = uid; + memcpy(reg->kr_data, data, data_len); + + down_write(&kg_sem); + list_add(®->kr_chain, &kkuc_groups[group]); + up_write(&kg_sem); + + CDEBUG(D_HSM, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_add); + +int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group) +{ + struct kkuc_reg *reg, *next; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + if (uid == 0) { + /* Broadcast a shutdown message */ + struct kuc_hdr lh; + + lh.kuc_magic = KUC_MAGIC; + lh.kuc_transport = KUC_TRANSPORT_GENERIC; + lh.kuc_msgtype = KUC_MSG_SHUTDOWN; + lh.kuc_msglen = sizeof(lh); + libcfs_kkuc_group_put(uuid, group, &lh); + } + + down_write(&kg_sem); + list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && + (uid == 0 || uid == reg->kr_uid)) { + list_del(®->kr_chain); + CDEBUG(D_HSM, "Removed uid=%d fp=%p from group %d\n", + reg->kr_uid, reg->kr_fp, group); + if (reg->kr_fp != NULL) + fput(reg->kr_fp); + 
kfree(reg); + } + } + up_write(&kg_sem); + + RETURN(0); +} +EXPORT_SYMBOL(libcfs_kkuc_group_rem); + +int libcfs_kkuc_group_put(const struct obd_uuid *uuid, int group, void *payload) +{ + struct kkuc_reg *reg; + int rc = 0; + int one_success = 0; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + down_write(&kg_sem); + + if (unlikely(list_empty(&kkuc_groups[group])) || + unlikely(OBD_FAIL_CHECK(OBD_FAIL_MDS_HSM_CT_REGISTER_NET))) { + /* no agent have fully registered, CDT will retry */ + up_write(&kg_sem); + RETURN(-EAGAIN); + } + + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && + reg->kr_fp != NULL) { + rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); + if (rc == 0) + one_success = 1; + else if (rc == -EPIPE) { + fput(reg->kr_fp); + reg->kr_fp = NULL; + } + } + } + up_write(&kg_sem); + + /* don't return an error if the message has been delivered + * at least to one agent */ + if (one_success) + rc = 0; + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_put); + +/** + * Calls a callback function for each link of the given kuc group. + * @param group the group to call the function on. + * @param cb_func the function to be called. + * @param cb_arg extra argument to be passed to the callback function. + */ +int libcfs_kkuc_group_foreach(const struct obd_uuid *uuid, int group, + libcfs_kkuc_cb_t cb_func, void *cb_arg) +{ + struct kkuc_reg *reg; + int rc = 0; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + RETURN(-EINVAL); + } + + down_read(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && reg->kr_fp != NULL) + rc = cb_func(reg->kr_data, cb_arg); + } + up_read(&kg_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_foreach); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c new file mode 100644 index 0000000000000..cf17a50999f8d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c @@ -0,0 +1,307 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * Use is subject to license terms. 
+ * + * Author: Di Wang + */ + +#include +#include + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) +{ + ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_SIZE); + if (ldata->ld_buf->lb_buf == NULL) + return -ENOMEM; + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_leh->leh_magic = LINK_EA_MAGIC; + ldata->ld_leh->leh_reccount = 0; + ldata->ld_leh->leh_len = sizeof(struct link_ea_header); + ldata->ld_leh->leh_overflow_time = 0; + ldata->ld_leh->leh_padding = 0; + return 0; +} +EXPORT_SYMBOL(linkea_data_new); + +int linkea_init(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + + LASSERT(ldata->ld_buf != NULL); + leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + leh->leh_overflow_time = __swab32(leh->leh_overflow_time); + leh->leh_padding = __swab32(leh->leh_padding); + /* individual entries are swabbed by linkea_entry_unpack() */ + } + + if (leh->leh_magic != LINK_EA_MAGIC) + return -EINVAL; + + if (leh->leh_reccount == 0 && leh->leh_overflow_time == 0) + return -ENODATA; + + ldata->ld_leh = leh; + return 0; +} +EXPORT_SYMBOL(linkea_init); + +int linkea_init_with_rec(struct linkea_data *ldata) +{ + int rc; + + rc = linkea_init(ldata); + if (!rc && ldata->ld_leh->leh_reccount == 0) + rc = -ENODATA; + + return rc; +} +EXPORT_SYMBOL(linkea_init_with_rec); + +/** + * Pack a link_ea_entry. + * All elements are stored as chars to avoid alignment issues. + * Numbers are always big-endian + * \retval record length + */ +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_fid tmpfid; + int reclen; + + tmpfid = *pfid; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_MUL_REF)) + tmpfid.f_oid--; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) + tmpfid.f_ver = ~0; + fid_cpu_to_be(&tmpfid, &tmpfid); + memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); + memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); + reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; + + lee->lee_reclen[0] = (reclen >> 8) & 0xff; + lee->lee_reclen[1] = reclen & 0xff; + return reclen; +} +EXPORT_SYMBOL(linkea_entry_pack); + +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid) +{ + LASSERT(lee != NULL); + + *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; + memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); + fid_be_to_cpu(pfid, pfid); + if (lname != NULL) { + lname->ln_name = lee->lee_name; + lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); + } +} +EXPORT_SYMBOL(linkea_entry_unpack); + +/** + * Add a record to the end of link ea buf + **/ +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct link_ea_header *leh = ldata->ld_leh; + int reclen; + + LASSERT(leh != NULL); + + if (lname == NULL || pfid == NULL) + return -EINVAL; + + reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) { + /* Use 32-bits to save the overflow time, although it will + * shrink the ktime_get_real_seconds() returned 64-bits value + * to 32-bits value, it is still quite large and can be used + * for about 140 years. That is enough. 
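+		 *
+		 * Note that in this overflow case the new entry is not
+		 * stored at all: only leh_overflow_time is updated and 0
+		 * is returned, so a full linkEA is not treated as a fatal
+		 * error by the caller.  The entry would have needed
+		 * reclen == sizeof(struct link_ea_entry) + ln_namelen
+		 * bytes, with the length stored big-endian in lee_reclen[]
+		 * by linkea_entry_pack() above (e.g. reclen 0x0123 is
+		 * stored as lee_reclen[0] == 0x01, lee_reclen[1] == 0x23).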
+ */ + leh->leh_overflow_time = ktime_get_real_seconds(); + if (unlikely(leh->leh_overflow_time == 0)) + leh->leh_overflow_time++; + + CDEBUG(D_INODE, "No enough space to hold linkea entry '" + DFID": %.*s' at %u\n", PFID(pfid), lname->ln_namelen, + lname->ln_name, leh->leh_overflow_time); + return 0; + } + + if (leh->leh_len + reclen > ldata->ld_buf->lb_len) { + if (lu_buf_check_and_grow(ldata->ld_buf, + leh->leh_len + reclen) < 0) + return -ENOMEM; + + leh = ldata->ld_leh = ldata->ld_buf->lb_buf; + } + + ldata->ld_lee = ldata->ld_buf->lb_buf + leh->leh_len; + ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid); + leh->leh_len += ldata->ld_reclen; + leh->leh_reccount++; + CDEBUG(D_INODE, "New link_ea name '"DFID":%.*s' is added\n", + PFID(pfid), lname->ln_namelen, lname->ln_name); + return 0; +} +EXPORT_SYMBOL(linkea_add_buf); + +/** Del the current record from the link ea buf */ +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname) +{ + LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL); + LASSERT(ldata->ld_leh->leh_reccount > 0); + + ldata->ld_leh->leh_reccount--; + ldata->ld_leh->leh_len -= ldata->ld_reclen; + memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen, + (char *)ldata->ld_leh + ldata->ld_leh->leh_len - + (char *)ldata->ld_lee); + CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n", + lname->ln_namelen, lname->ln_name); + + if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + + ldata->ld_leh->leh_len)) + ldata->ld_lee = NULL; +} +EXPORT_SYMBOL(linkea_del_buf); + +int linkea_links_new(struct linkea_data *ldata, struct lu_buf *buf, + const struct lu_name *cname, const struct lu_fid *pfid) +{ + int rc; + + rc = linkea_data_new(ldata, buf); + if (!rc) + rc = linkea_add_buf(ldata, cname, pfid); + + return rc; +} +EXPORT_SYMBOL(linkea_links_new); + +/** + * Mark the linkEA as overflow with current timestamp, + * and remove the last linkEA entry. + * + * Return the new linkEA size. + */ +int linkea_overflow_shrink(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + struct lu_name tname; + struct lu_fid tfid; + int count; + + leh = ldata->ld_leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_overflow_time = __swab32(leh->leh_overflow_time); + leh->leh_padding = __swab32(leh->leh_padding); + } + + LASSERT(leh->leh_reccount > 0); + + leh->leh_len = sizeof(struct link_ea_header); + leh->leh_reccount--; + if (unlikely(leh->leh_reccount == 0)) + return 0; + + leh->leh_overflow_time = ktime_get_real_seconds(); + if (unlikely(leh->leh_overflow_time == 0)) + leh->leh_overflow_time++; + ldata->ld_reclen = 0; + ldata->ld_lee = (struct link_ea_entry *)(leh + 1); + for (count = 0; count < leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tname, &tfid); + leh->leh_len += ldata->ld_reclen; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, &tname, &tfid); + CDEBUG(D_INODE, "No enough space to hold the last linkea entry '" + DFID": %.*s', shrink it, left %d linkea entries, size %llu\n", + PFID(&tfid), tname.ln_namelen, tname.ln_name, + leh->leh_reccount, leh->leh_len); + + return leh->leh_len; +} +EXPORT_SYMBOL(linkea_overflow_shrink); + +/** + * Check if such a link exists in linkEA. 
+ * + * \param ldata link data the search to be done on + * \param lname name in the parent's directory entry pointing to this object + * \param pfid parent fid the link to be found for + * + * \retval 0 success + * \retval -ENOENT link does not exist + * \retval -ve on error + */ +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_name tmpname; + struct lu_fid tmpfid; + int count; + + LASSERT(ldata->ld_leh != NULL); + + /* link #0, if leh_reccount == 0 we skip the loop and return -ENOENT */ + if (likely(ldata->ld_leh->leh_reccount > 0)) + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); + + for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tmpname, &tmpfid); + if (tmpname.ln_namelen == lname->ln_namelen && + lu_fid_eq(&tmpfid, pfid) && + (strncmp(tmpname.ln_name, lname->ln_name, + tmpname.ln_namelen) == 0)) + break; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + if (count == ldata->ld_leh->leh_reccount) { + CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", + lname->ln_namelen, lname->ln_name); + ldata->ld_lee = NULL; + ldata->ld_reclen = 0; + return -ENOENT; + } + return 0; +} +EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog.c b/drivers/staging/lustrefsx/lustre/obdclass/llog.c new file mode 100644 index 0000000000000..e9228b33339f3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog.c @@ -0,0 +1,1486 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alex Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include +#include +#include +#include "llog_internal.h" + +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). 
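+ *
+ * The handle is returned with lgh_refcount set to 1; additional users
+ * take references with llog_handle_get() and drop them with
+ * llog_handle_put(), and the final put (normally via llog_close())
+ * invokes lop_close() before freeing the handle.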
+ */ +static struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return NULL; + + init_rwsem(&loghandle->lgh_lock); + mutex_init(&loghandle->lgh_hdr_mutex); + init_rwsem(&loghandle->lgh_last_sem); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + atomic_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. Used in llog_close() only + */ +static void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (loghandle->lgh_hdr == NULL) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + OBD_FREE_LARGE(loghandle->lgh_hdr, loghandle->lgh_hdr_size); +out: + OBD_FREE_PTR(loghandle); +} + +struct llog_handle *llog_handle_get(struct llog_handle *loghandle) +{ + if (atomic_inc_not_zero(&loghandle->lgh_refcount)) + return loghandle; + return NULL; +} + +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle) +{ + int rc = 0; + + if (atomic_dec_and_test(&loghandle->lgh_refcount)) { + struct llog_operations *lop; + + rc = llog_handle2ops(loghandle, &lop); + if (!rc) { + if (lop->lop_close) + rc = lop->lop_close(env, loghandle); + else + rc = -EOPNOTSUPP; + } + llog_free_handle(loghandle); + } + return rc; +} + +static int llog_declare_destroy(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_destroy == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_declare_destroy(env, handle, th); + + RETURN(rc); +} + +int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int rc; + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc < 0) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + LASSERT(handle->lgh_obj != NULL); + if (!llog_exist(handle)) + RETURN(0); + + rc = lop->lop_destroy(env, handle, th); + + RETURN(rc); +} + +int llog_destroy(const struct lu_env *env, struct llog_handle *handle) +{ + struct llog_operations *lop; + struct dt_device *dt; + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc < 0) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + if (handle->lgh_obj == NULL) { + /* if lgh_obj == NULL, then it is from client side destroy */ + rc = lop->lop_destroy(env, handle, NULL); + RETURN(rc); + } + + if (!llog_exist(handle)) + RETURN(0); + + dt = lu2dt_dev(handle->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_destroy(env, handle, th); + if (rc != 0) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(out_trans, rc); + + rc = lop->lop_destroy(env, handle, th); + +out_trans: + dt_trans_stop(env, dt, th); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_destroy); + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_device *dt; + struct 
llog_log_hdr *llh; + struct thandle *th; + __u32 tmp_lgc_index; + int rc; + int rc1; + bool subtract_count = false; + + ENTRY; + + LASSERT(loghandle != NULL); + LASSERT(loghandle->lgh_ctxt != NULL); + LASSERT(loghandle->lgh_obj != NULL); + + llh = loghandle->lgh_hdr; + + CDEBUG(D_RPCTRACE, "Canceling %d in log "DFID"\n", index, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); + + if (index == 0) { + CERROR("Can't cancel index 0 which is header\n"); + RETURN(-EINVAL); + } + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(0); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, &llh->llh_hdr, index, th); + if (rc < 0) + GOTO(out_trans, rc); + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY)) { + rc = llog_declare_destroy(env, loghandle, th); + if (rc < 0) + GOTO(out_trans, rc); + } + + th->th_wait_submit = 1; + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(out_trans, rc); + + down_write(&loghandle->lgh_lock); + /* clear bitmap */ + mutex_lock(&loghandle->lgh_hdr_mutex); + if (!ext2_clear_bit(index, LLOG_HDR_BITMAP(llh))) { + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index); + GOTO(out_unlock, rc); + } + + loghandle->lgh_hdr->llh_count--; + subtract_count = true; + + /* Since llog_process_thread use lgi_cookie, it`s better to save them + * and restore after using + */ + tmp_lgc_index = lgi->lgi_cookie.lgc_index; + /* Pass this index to llog_osd_write_rec(), which will use the index + * to only update the necesary bitmap. */ + lgi->lgi_cookie.lgc_index = index; + /* update header */ + rc = llog_write_rec(env, loghandle, &llh->llh_hdr, &lgi->lgi_cookie, + LLOG_HEADER_IDX, th); + lgi->lgi_cookie.lgc_index = tmp_lgc_index; + + if (rc != 0) + GOTO(out_unlock, rc); + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1) && + ((loghandle->lgh_last_idx == LLOG_HDR_BITMAP_SIZE(llh) - 1) || + (loghandle->u.phd.phd_cat_handle != NULL && + loghandle->u.phd.phd_cat_handle->u.chd.chd_current_log != + loghandle))) { + /* never try to destroy it again */ + llh->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; + rc = llog_trans_destroy(env, loghandle, th); + if (rc < 0) { + /* Sigh, can not destroy the final plain llog, but + * the bitmap has been clearly, so the record can not + * be accessed anymore, let's return 0 for now, and + * the orphan will be handled by LFSCK. 
*/ + CERROR("%s: can't destroy empty llog "DFID": rc = %d\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rc); + GOTO(out_unlock, rc = 0); + } + rc = LLOG_DEL_PLAIN; + } + +out_unlock: + mutex_unlock(&loghandle->lgh_hdr_mutex); + up_write(&loghandle->lgh_lock); +out_trans: + rc1 = dt_trans_stop(env, dt, th); + if (rc == 0) + rc = rc1; + if (rc < 0 && subtract_count) { + mutex_lock(&loghandle->lgh_hdr_mutex); + loghandle->lgh_hdr->llh_count++; + ext2_set_bit(index, LLOG_HDR_BITMAP(llh)); + mutex_unlock(&loghandle->lgh_hdr_mutex); + } + RETURN(rc); +} + +int llog_read_header(const struct lu_env *env, struct llog_handle *handle, + const struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_read_header == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + /* lrh_len should be initialized in llog_init_handle */ + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + LASSERT(handle->lgh_ctxt->loc_chunk_size >= + LLOG_MIN_CHUNK_SIZE); + llh->llh_hdr.lrh_len = handle->lgh_ctxt->loc_chunk_size; + llh->llh_hdr.lrh_index = 0; + llh->llh_timestamp = ktime_get_real_seconds(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + /* Since update llog header might also call this function, + * let's reset the bitmap to 0 here */ + memset(LLOG_HDR_BITMAP(llh), 0, llh->llh_hdr.lrh_len - + llh->llh_bitmap_offset - + sizeof(llh->llh_tail)); + ext2_set_bit(0, LLOG_HDR_BITMAP(llh)); + LLOG_HDR_TAIL(llh)->lrt_len = llh->llh_hdr.lrh_len; + LLOG_HDR_TAIL(llh)->lrt_index = llh->llh_hdr.lrh_index; + rc = 0; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_read_header); + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid) +{ + struct llog_log_hdr *llh; + enum llog_flag fmt = flags & LLOG_F_EXT_MASK; + int rc; + int chunk_size = handle->lgh_ctxt->loc_chunk_size; + ENTRY; + + LASSERT(handle->lgh_hdr == NULL); + + LASSERT(chunk_size >= LLOG_MIN_CHUNK_SIZE); + OBD_ALLOC_LARGE(llh, chunk_size); + if (llh == NULL) + RETURN(-ENOMEM); + + handle->lgh_hdr = llh; + handle->lgh_hdr_size = chunk_size; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + (llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + loghandle2name(handle), + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? 
"catalog" : "plain"); + GOTO(out, rc = -EINVAL); + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + GOTO(out, rc = -EINVAL); + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + loghandle2name(handle), + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + GOTO(out, rc = -EEXIST); + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + llh->llh_flags |= LLOG_F_IS_FIXSIZE; + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + loghandle2name(handle), flags, LLOG_F_IS_CAT, + LLOG_F_IS_PLAIN); + rc = -EINVAL; + } + llh->llh_flags |= fmt; +out: + if (rc) { + OBD_FREE_LARGE(llh, chunk_size); + handle->lgh_hdr = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_init_handle); + +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec) +{ + int chunk_size = llh->lgh_hdr->llh_hdr.lrh_len; + + if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { + CERROR("%s: record is too large: %d > %d\n", + loghandle2name(llh), rec->lrh_len, chunk_size); + return -EINVAL; + } + if (rec->lrh_index >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + CERROR("%s: index is too high: %d\n", + loghandle2name(llh), rec->lrh_index); + return -EINVAL; + } + if ((rec->lrh_type & LLOG_OP_MASK) != LLOG_OP_MAGIC) { + CERROR("%s: magic %x is bad\n", + loghandle2name(llh), rec->lrh_type); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(llog_verify_record); + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + struct llog_thread_info *lti; + char *buf; + size_t chunk_size; + __u64 cur_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + bool repeated = false; + bool refresh_idx = false; + + ENTRY; + + if (llh == NULL) + RETURN(-EINVAL); + + lti = lpi->lpi_env == NULL ? 
NULL : llog_info(lpi->lpi_env); + + cur_offset = chunk_size = llh->llh_hdr.lrh_len; + /* expect chunk_size to be power of two */ + LASSERT(is_power_of_2(chunk_size)); + + OBD_ALLOC_LARGE(buf, chunk_size); + if (buf == NULL) { + lpi->lpi_rc = -ENOMEM; + RETURN(0); + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd != NULL && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else + last_index = LLOG_HDR_BITMAP_SIZE(llh) - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + off_t chunk_offset = 0; + unsigned int buf_offset = 0; + bool partial_chunk; + int lh_last_idx; + int synced_idx = 0; + + /* skip records not set in bitmap */ + while (index <= last_index && + !ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) + ++index; + + /* There are no indices prior the last_index */ + if (index > last_index) + break; + + CDEBUG(D_OTHER, "index: %d last_index %d\n", index, + last_index); + +repeat: + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, chunk_size); + /* the record index for outdated chunk data */ + /* it is safe to process buffer until saved lgh_last_idx */ + lh_last_idx = LLOG_HDR_TAIL(llh)->lrt_index; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, chunk_size); + if (repeated && rc) + CDEBUG(D_OTHER, "cur_offset %llu, chunk_offset %llu," + " buf_offset %u, rc = %d\n", cur_offset, + (__u64)chunk_offset, buf_offset, rc); + /* we`ve tried to reread the chunk, but there is no + * new records */ + if (rc == -EIO && repeated && (chunk_offset + buf_offset) == + cur_offset) + GOTO(out, rc = 0); + if (rc != 0) + GOTO(out, rc); + + /* NB: after llog_next_block() call the cur_offset is the + * offset of the next block after read one. + * The absolute offset of the current chunk is calculated + * from cur_offset value and stored in chunk_offset variable. + */ + if ((cur_offset & (chunk_size - 1)) != 0) { + partial_chunk = true; + chunk_offset = cur_offset & ~(chunk_size - 1); + } else { + partial_chunk = false; + chunk_offset = cur_offset - chunk_size; + } + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. */ + for (rec = (struct llog_rec_hdr *)(buf + buf_offset); + (char *)rec < buf + chunk_size; + rec = llog_rec_hdr_next(rec)) { + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + if (index == (synced_idx + 1) && + synced_idx == LLOG_HDR_TAIL(llh)->lrt_index) + GOTO(out, rc = 0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int) + (loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + } + + /* the bitmap could be changed during processing + * records from the chunk. For wrapped catalog + * it means we can read deleted record and try to + * process it. Check this case and reread the chunk. + * It is safe to process to lh_last_idx, including + * lh_last_idx if it was synced. We can not do <= + * comparison, cause for wrapped catalog lgh_last_idx + * could be less than index. So we detect last index + * for processing as index == lh_last_idx+1. 
But when + * catalog is wrapped and full lgh_last_idx=llh_cat_idx, + * the first processing index is llh_cat_idx+1.The + * exception is !(lgh_last_idx == llh_cat_idx && + * index == llh_cat_idx + 1), and after simplification + * it turns to + * lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index + * This exception is working for catalog only. + */ + + if ((index == lh_last_idx && synced_idx != index) || + (index == (lh_last_idx + 1) && + lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index) || + (rec->lrh_index == 0 && !repeated)) { + + /* save offset inside buffer for the re-read */ + buf_offset = (char *)rec - (char *)buf; + cur_offset = chunk_offset; + repeated = true; + /* We need to be sure lgh_last_idx + * record was saved to disk + */ + down_read(&loghandle->lgh_last_sem); + synced_idx = LLOG_HDR_TAIL(llh)->lrt_index; + up_read(&loghandle->lgh_last_sem); + CDEBUG(D_OTHER, "synced_idx: %d\n", synced_idx); + goto repeat; + + } + + repeated = false; + + rc = llog_verify_record(loghandle, rec); + if (rc) { + CERROR("%s: invalid record in llog "DFID + " record for index %d/%d: rc = %d\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index, rc); + /* + * the block seem to be corrupted, let's try + * with the next one. reset rc to go to the + * next chunk. + */ + refresh_idx = true; + index = 0; + GOTO(repeat, rc = 0); + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + if (rec->lrh_index != index) { + /* + * the last time we couldn't parse the block due + * to corruption, thus has no idea about the + * next index, take it from the block, once. + */ + if (refresh_idx) { + refresh_idx = false; + index = rec->lrh_index; + } else { + CERROR("%s: "DFID" Invalid record: index" + " %u but expected %u\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + GOTO(out, rc = -ERANGE); + } + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + chunk_size - (char *)rec)); + + /* lgh_cur_offset is used only at llog_test_3 */ + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + chunk_offset; + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) { + struct llog_cookie *lgc; + __u64 tmp_off; + int tmp_idx; + + CDEBUG((llh->llh_flags & LLOG_F_IS_CAT ? + D_HA : D_OTHER), + "index: %d, lh_last_idx: %d " + "synced_idx: %d lgh_last_idx: %d\n", + index, lh_last_idx, synced_idx, + loghandle->lgh_last_idx); + + if (lti != NULL) { + lgc = <i->lgi_cookie; + /* store lu_env for recursive calls */ + tmp_off = lgc->lgc_offset; + tmp_idx = lgc->lgc_index; + + lgc->lgc_offset = (char *)rec - + (char *)buf + chunk_offset; + lgc->lgc_index = rec->lrh_index; + } + /* using lu_env for passing record offset to + * llog_write through various callbacks */ + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + + if (lti != NULL) { + lgc->lgc_offset = tmp_off; + lgc->lgc_index = tmp_idx; + } + + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + } + if (rc) + GOTO(out, rc); + /* some stupid callbacks directly cancel records + * and delete llog. Check it and stop + * processing. 
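+				 * A llh_count of 1 means only the header
+				 * record remains, i.e. the callback emptied
+				 * (or destroyed) the plain llog underneath
+				 * us, so there is nothing left to process.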
*/ + if (loghandle->lgh_hdr == NULL || + loghandle->lgh_hdr->llh_count == 1) + GOTO(out, rc = 0); + } + /* exit if the last index is reached */ + if (index >= last_index) + GOTO(out, rc = 0); + ++index; + } + } + +out: + CDEBUG(D_HA, "stop processing %s "DOSTID":%x index %d count %d\n", + ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : "plain"), + POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, + index, llh->llh_count); + + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + if (unlikely(rc == -EIO && loghandle->lgh_obj != NULL)) { + if (dt_object_remote(loghandle->lgh_obj)) { + /* If it is remote object, then -EIO might means + * disconnection or eviction, let's return -EAGAIN, + * so for update recovery log processing, it will + * retry until the umount or abort recovery, see + * lod_sub_recovery_thread() */ + CERROR("%s retry remote llog process\n", + loghandle2name(loghandle)); + rc = -EAGAIN; + } else { + /* something bad happened to the processing of a local + * llog file, probably I/O error or the log got + * corrupted to be able to finally release the log we + * discard any remaining bits in the header */ + CERROR("%s: Local llog found corrupted #"DOSTID":%x" + " %s index %d count %d\n", + loghandle2name(loghandle), + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : + "plain"), index, llh->llh_count); + + while (index <= last_index) { + if (ext2_test_bit(index, + LLOG_HDR_BITMAP(llh)) != 0) + llog_cancel_rec(lpi->lpi_env, loghandle, + index); + index++; + } + rc = 0; + } + } + + OBD_FREE_LARGE(buf, chunk_size); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + struct nsproxy *new_ns, *curr_ns = current->nsproxy; + + task_lock(lpi->lpi_reftask); + new_ns = lpi->lpi_reftask->nsproxy; + if (curr_ns != new_ns) { + get_nsproxy(new_ns); + + current->nsproxy = new_ns; + /* XXX: we should call put_nsproxy() instead of + * atomic_dec(&ns->count) directly. But put_nsproxy() cannot be + * used outside of the kernel itself, because it calls + * free_nsproxy() which is not exported by the kernel + * (defined in kernel/nsproxy.c) */ + if (curr_ns) + atomic_dec(&curr_ns->count); + } + task_unlock(lpi->lpi_reftask); + + unshare_fs_struct(); + + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + RETURN(-ENOMEM); + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + if (fork) { + struct task_struct *task; + + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. 
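+		 * A lu_env cannot be shared with the new thread, so the
+		 * forked kthread sets up its own environment with
+		 * lu_env_init() and tears it down with lu_env_fini() once
+		 * llog_process_thread() returns.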
*/ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + /* take reference to current, so that + * llog_process_thread_daemonize() can use it to switch to + * namespace associated with current */ + lpi->lpi_reftask = current; + task = kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start thread: rc = %d\n", + loghandle2name(loghandle), rc); + GOTO(out_lpi, rc); + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + +out_lpi: + OBD_FREE_PTR(lpi); + RETURN(rc); +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + int rc; + rc = llog_process_or_fork(env, loghandle, cb, data, catdata, true); + return rc == LLOG_DEL_PLAIN ? 0 : rc; +} +EXPORT_SYMBOL(llog_process); + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + __u32 chunk_size = llh->llh_hdr.lrh_len; + ENTRY; + + OBD_ALLOC_LARGE(buf, chunk_size); + if (buf == NULL) + RETURN(-ENOMEM); + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_HDR_BITMAP_SIZE(llh) - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + !ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, chunk_size); + rc = llog_prev_block(env, loghandle, index, buf, chunk_size); + if (rc) + GOTO(out, rc); + + rec = buf; + idx = rec->lrh_index; + CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) + GOTO(out, rc = 0); /* no more records */ + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(env, loghandle, + tail->lrt_index); + } + if (rc) + GOTO(out, rc); + } + + /* previous record, still in buffer? 
*/ + --index; + if (index < first_index) + GOTO(out, rc = 0); + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf != NULL) + OBD_FREE_LARGE(buf, chunk_size); + RETURN(rc); +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_exist == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_exist(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_create(env, loghandle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_create(env, handle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int idx, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc, buflen; + + ENTRY; + + /* API sanity checks */ + if (handle == NULL) { + CERROR("loghandle is missed\n"); + RETURN(-EPROTO); + } else if (handle->lgh_obj == NULL) { + CERROR("loghandle %p with NULL object\n", + handle); + RETURN(-EPROTO); + } else if (th == NULL) { + CERROR("%s: missed transaction handle\n", + loghandle2name(handle)); + RETURN(-EPROTO); + } else if (handle->lgh_hdr == NULL) { + CERROR("%s: loghandle %p with no header\n", + loghandle2name(handle), handle); + RETURN(-EPROTO); + } + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + 
buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_write_rec(env, handle, rec, logcookies, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} + +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_add); + +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_declare_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_add); + +/** + * Helper function to open llog or create it if doesn't exist. + * It hides all transaction handling from caller. + */ +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name) +{ + struct dt_device *d; + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW); + if (rc) + RETURN(rc); + + if (llog_exist(*res)) + RETURN(0); + + LASSERT((*res)->lgh_obj != NULL); + + d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(d->dd_rdonly))) + RETURN(-EROFS); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + /* Create update llog object synchronously, which + * happens during inialization process see + * lod_sub_prep_llog(), to make sure the update + * llog object is created before corss-MDT writing + * updates into the llog object */ + if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) + th->th_sync = 1; + + th->th_wait_submit = 1; + rc = llog_declare_create(env, *res, th); + if (rc == 0) { + rc = dt_trans_start_local(env, d, th); + if (rc == 0) + rc = llog_create(env, *res, th); + } + dt_trans_stop(env, d, th); +out: + if (rc) + llog_close(env, *res); + RETURN(rc); +} +EXPORT_SYMBOL(llog_open_create); + +/** + * Helper function to delete existent llog. + */ +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name) +{ + struct llog_handle *handle; + int rc = 0, rc2; + + ENTRY; + + /* nothing to erase */ + if (name == NULL && logid == NULL) + RETURN(0); + + rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS); + if (rc < 0) + RETURN(rc); + + rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) + rc = llog_destroy(env, handle); + + rc2 = llog_close(env, handle); + if (rc == 0) + rc = rc2; + RETURN(rc); +} +EXPORT_SYMBOL(llog_erase); + +/* + * Helper function for write record in llog. + * It hides all transaction handling from caller. + * Valid only with local llog. 
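+ *
+ * Typical append usage (sketch; see llog_copy_handler() further down):
+ *
+ *   rc = llog_write(env, loghandle, rec, LLOG_NEXT_IDX);
+ *
+ * Passing a real record index instead updates the existing record at
+ * that index, using the cookie kept in the per-env llog_thread_info.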
+ */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, int idx) +{ + struct dt_device *dt; + struct thandle *th; + bool need_cookie; + int rc; + + ENTRY; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + LASSERT(loghandle->lgh_obj != NULL); + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + GOTO(out_trans, rc); + + th->th_wait_submit = 1; + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + + need_cookie = !(idx == LLOG_HEADER_IDX || idx == LLOG_NEXT_IDX); + + down_write(&loghandle->lgh_lock); + if (need_cookie) { + struct llog_thread_info *lti = llog_info(env); + + /* cookie comes from llog_process_thread */ + rc = llog_write_rec(env, loghandle, rec, <i->lgi_cookie, + rec->lrh_index, th); + /* upper layer didn`t pass cookie so change rc */ + rc = (rc == 1 ? 0 : rc); + } else { + rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); + } + + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + int raised; + int rc; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + RETURN(-EOPNOTSUPP); + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + RETURN(-ENOMEM); + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + return llog_handle_put(env, loghandle); +} +EXPORT_SYMBOL(llog_close); + +/** + * Helper function to get the llog size in records. It is used by MGS + * mostly to check that config llog exists and contains data. 
+ * + * \param[in] env execution environment + * \param[in] ctxt llog context + * \param[in] name llog name + * + * \retval true if there are records in llog besides a header + * \retval false on error or llog without records + */ +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name) +{ + struct llog_handle *llh; + int rc = 0; + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + if (likely(rc == -ENOENT)) + rc = 0; + GOTO(out, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + rc = llog_get_size(llh); + +out_close: + llog_close(env, llh); +out: + /* The header is record 1, the llog is still considered as empty + * if there is only header */ + return (rc <= 1); +} +EXPORT_SYMBOL(llog_is_empty); + +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_handle *copy_llh = data; + + /* Append all records */ + return llog_write(env, copy_llh, rec, LLOG_NEXT_IDX); +} + +/* backup plain llog */ +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bctxt, + char *name, char *backup) +{ + struct llog_handle *llh, *bllh; + int rc; + + ENTRY; + + /* open original log */ + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + /* the -ENOENT case is also reported to the caller + * but silently so it should handle that if needed. + */ + if (rc != -ENOENT) + CERROR("%s: failed to open log %s: rc = %d\n", + obd->obd_name, name, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + + /* Make sure there's no old backup log */ + rc = llog_erase(env, bctxt, NULL, backup); + if (rc < 0 && rc != -ENOENT) + GOTO(out_close, rc); + + /* open backup log */ + rc = llog_open_create(env, bctxt, &bllh, NULL, backup); + if (rc) { + CERROR("%s: failed to open backup logfile %s: rc = %d\n", + obd->obd_name, backup, rc); + GOTO(out_close, rc); + } + + /* check that backup llog is not the same object as original one */ + if (llh->lgh_obj == bllh->lgh_obj) { + CERROR("%s: backup llog %s to itself (%s), objects %p/%p\n", + obd->obd_name, name, backup, llh->lgh_obj, + bllh->lgh_obj); + GOTO(out_backup, rc = -EEXIST); + } + + rc = llog_init_handle(env, bllh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_backup, rc); + + /* Copy log record by record */ + rc = llog_process_or_fork(env, llh, llog_copy_handler, (void *)bllh, + NULL, false); + if (rc) + CERROR("%s: failed to backup log %s: rc = %d\n", + obd->obd_name, name, rc); +out_backup: + llog_close(env, bllh); +out_close: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(llog_backup); + +/* Get size of llog */ +__u64 llog_size(const struct lu_env *env, struct llog_handle *llh) +{ + int rc; + struct lu_attr la; + + rc = llh->lgh_obj->do_ops->do_attr_get(env, llh->lgh_obj, &la); + if (rc) { + CERROR("%s: attr_get failed for "DFID": rc = %d\n", + loghandle2name(llh), PFID(&llh->lgh_id.lgl_oi.oi_fid), + rc); + return 0; + } + + return la.la_size; +} +EXPORT_SYMBOL(llog_size); + diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c new file mode 100644 index 0000000000000..91f029052585e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c @@ -0,0 +1,1169 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include + +#include "llog_internal.h" + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *rec = &lgi->lgi_logid; + struct thandle *handle = NULL; + struct dt_device *dt = NULL; + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int rc, index; + + ENTRY; + + index = (cathandle->lgh_last_idx + 1) % + (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) ? (cfs_fail_val + 1) : + LLOG_HDR_BITMAP_SIZE(llh)); + + /* check that new llog index will not overlap with the first one. + * - llh_cat_idx is the index just before the first/oldest still in-use + * index in catalog + * - lgh_last_idx is the last/newest used index in catalog + * + * When catalog is not wrapped yet then lgh_last_idx is always larger + * than llh_cat_idx. 
After the wrap around lgh_last_idx re-starts + * from 0 and llh_cat_idx becomes the upper limit for it + * + * Check if catalog has already wrapped around or not by comparing + * last_idx and cat_idx */ + if ((index == llh->llh_cat_idx + 1 && llh->llh_count > 1) || + (index == 0 && llh->llh_cat_idx == 0)) { + if (cathandle->lgh_name == NULL) { + CWARN("%s: there are no more free slots in catalog " + DFID":%x\n", + loghandle2name(loghandle), + PFID(&cathandle->lgh_id.lgl_oi.oi_fid), + cathandle->lgh_id.lgl_ogen); + } else { + CWARN("%s: there are no more free slots in " + "catalog %s\n", loghandle2name(loghandle), + cathandle->lgh_name); + } + RETURN(-ENOSPC); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + RETURN(-ENOSPC); + + if (loghandle->lgh_hdr != NULL) { + /* If llog object is remote and creation is failed, lgh_hdr + * might be left over here, free it first */ + LASSERT(!llog_exist(loghandle)); + OBD_FREE_LARGE(loghandle->lgh_hdr, loghandle->lgh_hdr_size); + loghandle->lgh_hdr = NULL; + } + + if (th == NULL) { + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + + handle = dt_trans_create(env, dt); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); + + /* Create update llog object synchronously, which + * happens during inialization process see + * lod_sub_prep_llog(), to make sure the update + * llog object is created before corss-MDT writing + * updates into the llog object */ + if (cathandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) + handle->th_sync = 1; + + handle->th_wait_submit = 1; + + rc = llog_declare_create(env, loghandle, handle); + if (rc != 0) + GOTO(out, rc); + + rec->lid_hdr.lrh_len = sizeof(*rec); + rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec->lid_id = loghandle->lgh_id; + rc = llog_declare_write_rec(env, cathandle, &rec->lid_hdr, -1, + handle); + if (rc != 0) + GOTO(out, rc); + + rc = dt_trans_start_local(env, dt, handle); + if (rc != 0) + GOTO(out, rc); + + th = handle; + } + + rc = llog_create(env, loghandle, th); + /* if llog is already created, no need to initialize it */ + if (rc == -EEXIST) { + GOTO(out, rc = 0); + } else if (rc != 0) { + CERROR("%s: can't create new plain llog in catalog: rc = %d\n", + loghandle2name(loghandle), rc); + GOTO(out, rc); + } + + rc = llog_init_handle(env, loghandle, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &cathandle->lgh_hdr->llh_tgtuuid); + if (rc < 0) + GOTO(out, rc); + + /* build the record for this log in the catalog */ + rec->lid_hdr.lrh_len = sizeof(*rec); + rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec->lid_id = loghandle->lgh_id; + + /* append the new record into catalog. 
The new index will be + * assigned to the record and updated in rec header */ + rc = llog_write_rec(env, cathandle, &rec->lid_hdr, + &loghandle->u.phd.phd_cookie, LLOG_NEXT_IDX, th); + if (rc < 0) + GOTO(out_destroy, rc); + + CDEBUG(D_OTHER, "new plain log "DFID".%u of catalog "DFID"\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rec->lid_hdr.lrh_index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + + loghandle->lgh_hdr->llh_cat_idx = rec->lid_hdr.lrh_index; + + /* limit max size of plain llog so that space can be + * released sooner, especially on small filesystems */ + /* 2MB for the cases when free space hasn't been learned yet */ + loghandle->lgh_max_size = 2 << 20; + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + rc = dt_statfs(env, dt, &lgi->lgi_statfs); + if (rc == 0 && lgi->lgi_statfs.os_bfree > 0) { + __u64 freespace = (lgi->lgi_statfs.os_bfree * + lgi->lgi_statfs.os_bsize) >> 6; + if (freespace < loghandle->lgh_max_size) + loghandle->lgh_max_size = freespace; + /* shouldn't be > 128MB in any case? + * it's 256K records of 512 bytes each */ + if (freespace > (128 << 20)) + loghandle->lgh_max_size = 128 << 20; + } + rc = 0; + +out: + if (handle != NULL) { + handle->th_result = rc >= 0 ? 0 : rc; + dt_trans_stop(env, dt, handle); + } + RETURN(rc); + +out_destroy: + /* to signal llog_cat_close() it shouldn't try to destroy the llog, + * we want to destroy it in this transaction, otherwise the object + * becomes an orphan */ + loghandle->lgh_hdr->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; + /* this is to mimic full log, so another llog_cat_current_log() + * can skip it and ask for another one */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(loghandle->lgh_hdr) + 1; + llog_trans_destroy(env, loghandle, th); + if (handle != NULL) + dt_trans_stop(env, dt, handle); + RETURN(rc); +} + +static int llog_cat_refresh(const struct lu_env *env, + struct llog_handle *cathandle) +{ + struct llog_handle *loghandle; + int rc; + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + if (!llog_exist(loghandle)) + continue; + + rc = llog_read_header(env, loghandle, NULL); + if (rc) + goto unlock; + } + + rc = llog_read_header(env, cathandle, NULL); +unlock: + up_write(&cathandle->lgh_lock); + + return rc; +} + +/* + * prepare current/next log for catalog. + * + * if \a *ploghandle is NULL, open it, and declare create, NB, if \a + * *ploghandle is remote, create it synchronously here, see comments + * below. + * + * \a cathandle->lgh_lock is down_read-ed, it gets down_write-ed if \a + * *ploghandle has to be opened.
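+ *
+ * Note (matching the code below): whenever the read lock had to be
+ * upgraded to open \a *ploghandle, it is downgraded back to a read
+ * lock at the end and the whole check sequence restarts from the top.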
+ */ +static int llog_cat_prep_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle **ploghandle, + struct thandle *th) +{ + int rc; + int sem_upgraded; + +start: + rc = 0; + sem_upgraded = 0; + if (IS_ERR_OR_NULL(*ploghandle)) { + up_read(&cathandle->lgh_lock); + down_write(&cathandle->lgh_lock); + sem_upgraded = 1; + if (IS_ERR_OR_NULL(*ploghandle)) { + struct llog_handle *loghandle; + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (!rc) { + *ploghandle = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + if (rc) + GOTO(out, rc); + } + + rc = llog_exist(*ploghandle); + if (rc < 0) + GOTO(out, rc); + if (rc) + GOTO(out, rc = 0); + + if (dt_object_remote(cathandle->lgh_obj)) { + down_write_nested(&(*ploghandle)->lgh_lock, LLOGH_LOG); + if (!llog_exist(*ploghandle)) { + /* For remote operation, if we put the llog object + * creation in the current transaction, then the + * llog object will not be created on the remote + * target until the transaction stop, if other + * operations start before the transaction stop, + * and use the same llog object, will be dependent + * on the success of this transaction. So let's + * create the llog object synchronously here to + * remove the dependency. */ + rc = llog_cat_new_log(env, cathandle, *ploghandle, + NULL); + if (rc == -ESTALE) { + up_write(&(*ploghandle)->lgh_lock); + if (sem_upgraded) + up_write(&cathandle->lgh_lock); + else + up_read(&cathandle->lgh_lock); + + rc = llog_cat_refresh(env, cathandle); + down_read_nested(&cathandle->lgh_lock, + LLOGH_CAT); + if (rc) + return rc; + /* *ploghandle might become NULL, restart */ + goto start; + } + } + up_write(&(*ploghandle)->lgh_lock); + } else { + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *lirec = &lgi->lgi_logid; + + rc = llog_declare_create(env, *ploghandle, th); + if (rc) + GOTO(out, rc); + + lirec->lid_hdr.lrh_len = sizeof(*lirec); + rc = llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, + th); + } + +out: + if (sem_upgraded) { + up_write(&cathandle->lgh_lock); + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + if (rc == 0) + goto start; + } + return rc; +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. 
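+ *
+ * A hypothetical caller pairs it with llog_handle_put(); 'index' below
+ * stands for a record index known to that caller:
+ *
+ *	rc = llog_cat_id2handle(env, cathandle, &loghandle, &logid);
+ *	if (rc == 0) {
+ *		rc = llog_cancel_rec(env, loghandle, index);
+ *		llog_handle_put(env, loghandle);
+ *	}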
+ * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + enum llog_flag fmt; + int rc = 0; + + ENTRY; + + if (cathandle == NULL) + RETURN(-EBADF); + + fmt = cathandle->lgh_hdr->llh_flags & LLOG_F_EXT_MASK; + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CWARN("%s: log "DFID" generation %x != %x\n", + loghandle2name(loghandle), + PFID(&logid->lgl_oi.oi_fid), + cgl->lgl_ogen, logid->lgl_ogen); + continue; + } + *res = llog_handle_get(loghandle); + if (!*res) { + CERROR("%s: log "DFID" refcount is zero!\n", + loghandle2name(loghandle), + PFID(&logid->lgl_oi.oi_fid)); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + RETURN(rc); + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DFID":%x: rc = %d\n", + loghandle2name(cathandle), PFID(&logid->lgl_oi.oi_fid), + logid->lgl_ogen, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN | fmt, NULL); + if (rc < 0) { + llog_close(env, loghandle); + *res = NULL; + RETURN(rc); + } + + *res = llog_handle_get(loghandle); + LASSERT(*res); + down_write(&cathandle->lgh_lock); + list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; + RETURN(0); +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + ENTRY; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during " + "cleanup: rc = %d\n", + loghandle2name(loghandle), rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_close); + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG +}; + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. 
+ * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + ENTRY; + + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2)) { + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + GOTO(next, loghandle); + } + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || !llog_is_full(loghandle)) { + up_read(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || !llog_is_full(loghandle)) + GOTO(out_unlock, loghandle); + else + up_write(&loghandle->lgh_lock); + } + +next: + /* Sigh, the chd_next_log and chd_current_log is initialized + * in declare phase, and we do not serialize the catlog + * accessing, so it might be possible the llog creation + * thread (see llog_cat_declare_add_rec()) did not create + * llog successfully, then the following thread might + * meet this situation. */ + if (IS_ERR_OR_NULL(cathandle->u.chd.chd_next_log)) { + CERROR("%s: next log does not exist!\n", + loghandle2name(cathandle)); + loghandle = ERR_PTR(-EIO); + if (cathandle->u.chd.chd_next_log == NULL) { + /* Store the error in chd_next_log, so + * the following process can get correct + * failure value */ + cathandle->u.chd.chd_next_log = loghandle; + } + GOTO(out_unlock, loghandle); + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + +out_unlock: + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + RETURN(loghandle); +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. 
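+ *
+ * If the selected plain llog turns out to be full, the write's -ENOSPC
+ * is remapped to -ENOBUFS and one retry is made with a freshly
+ * selected plain llog before the error is returned to the caller.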
+ */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + struct thandle *th) +{ + struct llog_handle *loghandle; + int rc, retried = 0; + ENTRY; + + LASSERT(rec->lrh_len <= cathandle->lgh_ctxt->loc_chunk_size); + +retry: + loghandle = llog_cat_current_log(cathandle, th); + if (IS_ERR(loghandle)) + RETURN(PTR_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + /* nobody should be trying to use this llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + up_write(&cathandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, LLOG_NEXT_IDX, th); + if (rc < 0) { + CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR, + "llog_write_rec %d: lh=%p\n", rc, loghandle); + /* -ENOSPC is returned if no empty records left + * and when it's lack of space on the stogage. + * there is no point to try again if it's the second + * case. many callers (like llog test) expect ENOSPC, + * so we preserve this error code, but look for the + * actual cause here */ + if (rc == -ENOSPC && llog_is_full(loghandle)) + rc = -ENOBUFS; + } + up_write(&loghandle->lgh_lock); + + if (rc == -ENOBUFS) { + if (retried++ == 0) + GOTO(retry, rc); + CERROR("%s: error on 2nd llog: rc = %d\n", + loghandle2name(cathandle), rc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add_rec); + +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int rc; + + ENTRY; + +start: + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + rc = llog_cat_prep_log(env, cathandle, + &cathandle->u.chd.chd_current_log, th); + if (rc) + GOTO(unlock, rc); + + rc = llog_cat_prep_log(env, cathandle, &cathandle->u.chd.chd_next_log, + th); + if (rc) + GOTO(unlock, rc); + + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, + rec, -1, th); + if (rc == -ESTALE && dt_object_remote(cathandle->lgh_obj)) { + up_read(&cathandle->lgh_lock); + rc = llog_cat_refresh(env, cathandle); + if (rc) + RETURN(rc); + goto start; + } + +#if 0 + /* + * XXX: we hope for declarations made for existing llog this might be + * not correct with some backends where declarations are expected + * against specific object like ZFS with full debugging enabled. 
+ */ + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_next_log, rec, -1, + th); +#endif +unlock: + up_read(&cathandle->lgh_lock); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_declare_add_rec); + +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt; + struct thandle *th = NULL; + int rc; + + ctxt = cathandle->lgh_ctxt; + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + + LASSERT(cathandle->lgh_obj != NULL); + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, th); +out_trans: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add); + +/* For each cookie in the cookie array, we clear the log in-use bit and either: + * - the log is empty, so mark it free in the catalog header and delete it + * - the log is not empty, just write out the log header + * + * The cookies may be in different log files, so we need to get new logs + * each time. + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies) +{ + int i, index, rc = 0, failed = 0; + + ENTRY; + + for (i = 0; i < count; i++, cookies++) { + struct llog_handle *loghandle; + struct llog_logid *lgl = &cookies->lgc_lgl; + int lrc; + + rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); + if (rc) { + CDEBUG(D_HA, "%s: cannot find llog for handle "DFID":%x" + ": rc = %d\n", loghandle2name(cathandle), + PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc); + failed++; + continue; + } + + if ((cathandle->lgh_ctxt->loc_flags & + LLOG_CTXT_FLAG_NORMAL_FID) && !llog_exist(loghandle)) { + /* For update log, some of loghandles of cathandle + * might not exist because remote llog creation might + * be failed, so let's skip the record cancellation + * for these non-exist llogs. 
+ */ + lrc = -ENOENT; + CDEBUG(D_HA, "%s: llog "DFID":%x does not exist" + ": rc = %d\n", loghandle2name(cathandle), + PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, lrc); + failed++; + if (rc == 0) + rc = lrc; + continue; + } + + lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index); + if (lrc == LLOG_DEL_PLAIN) { /* log has been destroyed */ + index = loghandle->u.phd.phd_cookie.lgc_index; + lrc = llog_cat_cleanup(env, cathandle, loghandle, + index); + if (rc == 0) + rc = lrc; + } else if (lrc == -ENOENT) { + if (rc == 0) /* ENOENT shouldn't rewrite any error */ + rc = lrc; + } else if (lrc < 0) { + failed++; + if (rc == 0) + rc = lrc; + } + llog_handle_put(env, loghandle); + } + if (rc) + CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", + loghandle2name(cathandle), failed, count, rc); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_cancel_records); + +static int llog_cat_process_common(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, + struct llog_handle **llhp) +{ + struct llog_logid_rec *lir = container_of(rec, typeof(*lir), lid_hdr); + struct llog_log_hdr *hdr; + int rc; + + ENTRY; + if (rec->lrh_type != le32_to_cpu(LLOG_LOGID_MAGIC)) { + rc = -EINVAL; + CWARN("%s: invalid record in catalog "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid), + cat_llh->lgh_id.lgl_ogen, rc); + RETURN(rc); + } + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog "DFID"\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + rc = llog_cat_id2handle(env, cat_llh, llhp, &lir->lid_id); + if (rc) { + /* After a server crash, a stub of index record in catlog could + * be kept, because plain log destroy + catlog index record + * deletion are not atomic. So we end up with an index but no + * actual record. Destroy the index and move on. 
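+		 * (LLOG_DEL_RECORD asks the caller to clear just the stale
+		 * catalog entry; LLOG_DEL_PLAIN further below additionally
+		 * means the empty plain llog itself was destroyed.)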
*/ + if (rc == -ENOENT || rc == -ESTALE) + rc = LLOG_DEL_RECORD; + else if (rc) + CWARN("%s: can't find llog handle "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen, rc); + + RETURN(rc); + } + + /* clean old empty llogs, do not consider current llog in use */ + /* ignore remote (lgh_obj == NULL) llogs */ + hdr = (*llhp)->lgh_hdr; + if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + hdr->llh_count == 1 && cat_llh->lgh_obj != NULL && + *llhp != cat_llh->u.chd.chd_current_log) { + rc = llog_destroy(env, *llhp); + if (rc) + CWARN("%s: can't destroy empty log "DFID": rc = %d\n", + loghandle2name((*llhp)), + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + rc = LLOG_DEL_PLAIN; + } + + RETURN(rc); +} + +static int llog_cat_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_handle *llh = NULL; + int rc; + + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); + if (rc) + GOTO(out, rc); + + if (rec->lrh_index < d->lpd_startcat) { + /* Skip processing of the logs until startcat */ + rc = 0; + } else if (d->lpd_startidx > 0) { + struct llog_process_cat_data cd; + + cd.lpcd_first_idx = d->lpd_startidx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + &cd, false); + /* Continue processing the next log from idx 0 */ + d->lpd_startidx = 0; + } else { + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + NULL, false); + } + if (rc == -ENOENT && (cat_llh->lgh_hdr->llh_flags & LLOG_F_RM_ON_ERR)) { + /* + * plain llog is reported corrupted, so better to just remove + * it if the caller is fine with that. + */ + CERROR("%s: remove corrupted/missing llog "DFID"\n", + loghandle2name(cat_llh), + PFID(&llh->lgh_id.lgl_oi.oi_fid)); + rc = LLOG_DEL_PLAIN; + } + +out: + /* The empty plain log was destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) { + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } + + if (llh) + llog_handle_put(env, llh); + + RETURN(rc); +} + +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cat_cb, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork) +{ + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = (startcat == LLOG_CAT_FIRST ? 0 : startcat); + d.lpd_startidx = startidx; + + if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && + llh->llh_count > 1) { + struct llog_process_cat_data cd; + + CWARN("%s: catlog "DFID" crosses index zero\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + /*startcat = 0 is default value for general processing */ + if ((startcat != LLOG_CAT_FIRST && + startcat >= llh->llh_cat_idx) || !startcat) { + /* processing the catalog part at the end */ + cd.lpcd_first_idx = (startcat ? startcat : + llh->llh_cat_idx); + if (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS)) + cd.lpcd_last_idx = cfs_fail_val; + else + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + /* Reset the startcat becasue it has already reached + * catalog bottom. 
+ */ + startcat = 0; + if (rc != 0) + RETURN(rc); + } + /* processing the catalog part at the begining */ + cd.lpcd_first_idx = (startcat == LLOG_CAT_FIRST) ? 0 : startcat; + /* Note, the processing will stop at the lgh_last_idx value, + * and it could be increased during processing. So records + * between current lgh_last_idx and lgh_last_idx in future + * would left unprocessed. + */ + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + } else { + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, NULL, fork); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_process_or_fork); + +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx) +{ + return llog_cat_process_or_fork(env, cat_llh, llog_cat_process_cb, + cb, data, startcat, startidx, false); +} +EXPORT_SYMBOL(llog_cat_process); + +static int llog_cat_size_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_handle *llh = NULL; + __u64 *cum_size = d->lpd_data; + __u64 size; + int rc; + + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); + + if (rc == LLOG_DEL_PLAIN) { + /* empty log was deleted, don't count it */ + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } else { + size = llog_size(env, llh); + *cum_size += size; + + CDEBUG(D_INFO, "Add llog entry "DFID" size=%llu, tot=%llu\n", + PFID(&llh->lgh_id.lgl_oi.oi_fid), size, *cum_size); + } + + if (llh != NULL) + llog_handle_put(env, llh); + + RETURN(0); +} + +__u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh) +{ + __u64 size = llog_size(env, cat_llh); + + llog_cat_process_or_fork(env, cat_llh, llog_cat_size_cb, + NULL, &size, 0, 0, false); + + return size; +} +EXPORT_SYMBOL(llog_cat_size); + +/* currently returns the number of "free" entries in catalog, + * ie the available entries for a new plain LLOG file creation, + * even if catalog has wrapped + */ +__u32 llog_cat_free_space(struct llog_handle *cat_llh) +{ + /* simulate almost full Catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_CAT_FREE_RECORDS)) + return cfs_fail_val; + + if (cat_llh->lgh_hdr->llh_count == 1) + return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1; + + if (cat_llh->lgh_last_idx > cat_llh->lgh_hdr->llh_cat_idx) + return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1 + + cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; + + /* catalog is presently wrapped */ + return cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; +} +EXPORT_SYMBOL(llog_cat_free_space); + +static int llog_cat_reverse_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_handle *llh; + int rc; + + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); + + /* The empty plain log was destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) { + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } + if (rc) + RETURN(rc); + + rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); + + /* The empty plain was destroyed while processing */ 
+ if (rc == LLOG_DEL_PLAIN) + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + + llog_handle_put(env, llh); + RETURN(rc); +} + +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data) +{ + struct llog_process_data d; + struct llog_process_cat_data cd; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + + if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && + llh->llh_count > 1) { + CWARN("%s: catalog "DFID" crosses index zero\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx); + cd.lpcd_last_idx = 0; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + } else { + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, NULL); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_reverse_process); + +static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx) +{ + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int bitmap_size; + + ENTRY; + + bitmap_size = LLOG_HDR_BITMAP_SIZE(llh); + /* + * The llh_cat_idx equals to the first used index minus 1 + * so if we canceled the first index then llh_cat_idx + * must be renewed. + */ + if (llh->llh_cat_idx == (idx - 1)) { + llh->llh_cat_idx = idx; + + while (idx != cathandle->lgh_last_idx) { + idx = (idx + 1) % bitmap_size; + if (!ext2_test_bit(idx, LLOG_HDR_BITMAP(llh))) { + /* update llh_cat_idx for each unset bit, + * expecting the next one is set */ + llh->llh_cat_idx = idx; + } else if (idx == 0) { + /* skip header bit */ + llh->llh_cat_idx = 0; + continue; + } else { + /* the first index is found */ + break; + } + } + + CDEBUG(D_HA, "catlog "DFID" first idx %u, last_idx %u\n", + PFID(&cathandle->lgh_id.lgl_oi.oi_fid), + llh->llh_cat_idx, cathandle->lgh_last_idx); + } + + RETURN(0); +} + +/* Cleanup deleted plain llog traces from catalog */ +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index) +{ + int rc; + struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0}; + + LASSERT(index); + if (loghandle != NULL) { + /* remove destroyed llog from catalog list and + * chd_current_log variable */ + fid = loghandle->lgh_id.lgl_oi.oi_fid; + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + list_del_init(&loghandle->u.phd.phd_entry); + up_write(&cathandle->lgh_lock); + LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index); + /* llog was opened and keep in a list, close it now */ + llog_close(env, loghandle); + } + + /* do not attempt to cleanup on-disk llog if on client side */ + if (cathandle->lgh_obj == NULL) + return 0; + + /* remove plain llog entry from catalog by index */ + llog_cat_set_first_idx(cathandle, index); + rc = llog_cancel_rec(env, cathandle, index); + if (rc == 0) + CDEBUG(D_HA, + "cancel plain log "DFID" at index %u of catalog "DFID"\n", + PFID(&fid), index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h new file mode 100644 index 
0000000000000..c42f13ea6824f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h @@ -0,0 +1,100 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LLOG_INTERNAL_H__ +#define __LLOG_INTERNAL_H__ + +#include + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + +struct llog_thread_info { + struct lu_attr lgi_attr; + struct lu_fid lgi_fid; + struct dt_object_format lgi_dof; + struct lu_buf lgi_buf; + loff_t lgi_off; + struct llog_logid_rec lgi_logid; + struct dt_insert_rec lgi_dt_rec; + struct lu_seq_range lgi_range; + struct llog_cookie lgi_cookie; + struct obd_statfs lgi_statfs; + char lgi_name[32]; +}; + +extern struct lu_context_key llog_thread_key; + +static inline struct llog_thread_info *llog_info(const struct lu_env *env) +{ + struct llog_thread_info *lgi; + + lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key); + LASSERT(lgi); + return lgi; +} + +int llog_info_init(void); +void llog_info_fini(void); + +struct llog_handle *llog_handle_get(struct llog_handle *loghandle); +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle); +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid); +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index); + +static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) +{ + return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); +} +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec); +static inline char *loghandle2name(const struct llog_handle *lgh) +{ + return lgh->lgh_ctxt->loc_obd->obd_name; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c new file mode 100644 index 0000000000000..276ffa8280c84 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c @@ -0,0 +1,526 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include "llog_internal.h" + +static int str2logid(struct llog_logid *logid, char *str, int len) +{ + unsigned long long id, seq; + char *start, *end; + u32 ogen; + int rc; + + ENTRY; + start = str; + if (start[0] == '[') { + struct lu_fid *fid = &logid->lgl_oi.oi_fid; + int num; + + fid_zero(fid); + logid->lgl_ogen = 0; + num = sscanf(start + 1, SFID, RFID(fid)); + CDEBUG(D_INFO, DFID":%x\n", PFID(fid), logid->lgl_ogen); + RETURN(num == 3 && fid_is_sane(fid) ? 0 : -EINVAL); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 1, 53, 0) + /* + * logids used to be input in the form "#id#seq:ogen" before they + * were changed over to accept the FID [seq:oid:ver] format. + * This is accepted for compatibility reasons, though I doubt + * anyone is actually using this for anything. 
+ */ + if (start[0] != '#') + RETURN(-EINVAL); + + start++; + if (start - str >= len - 1) + RETURN(-EINVAL); + end = strchr(start, '#'); + if (end == NULL || end == start) + RETURN(-EINVAL); + + *end = '\0'; + rc = kstrtoull(start, 0, &id); + if (rc) + RETURN(rc); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + + end = strchr(start, '#'); + if (!end || end == start) + RETURN(-EINVAL); + + *end = '\0'; + rc = kstrtoull(start, 0, &seq); + if (rc) + RETURN(rc); + + ostid_set_seq(&logid->lgl_oi, seq); + if (ostid_set_id(&logid->lgl_oi, id)) + RETURN(-EINVAL); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + + rc = kstrtouint(start, 16, &ogen); + if (rc) + RETURN(-EINVAL); + logid->lgl_ogen = ogen; + + RETURN(0); +#else + RETURN(-EINVAL); +#endif +} + +static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct obd_ioctl_data *ioc_data = data; + static int l, remains; + static long from, to; + static char *out; + int cur_index; + int rc = 0; + + ENTRY; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + ioc_data->ioc_inllen1 = 0; + out = ioc_data->ioc_bulk; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d failed\n", + cur_index, rec->lrh_type, + rec->lrh_len); + } + if (handle->lgh_ctxt == NULL) + RETURN(-EOPNOTSUPP); + rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen); + RETURN(rc); + } + rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL); + llog_handle_put(env, loghandle); + } else { + bool ok; + + switch (rec->lrh_type) { + case OST_SZ_REC: + case MDS_UNLINK_REC: + case MDS_UNLINK64_REC: + case MDS_SETATTR64_REC: + case OBD_CFG_REC: + case LLOG_GEN_REC: + case LLOG_HDR_MAGIC: + ok = true; + break; + default: + ok = false; + } + + l = snprintf(out, remains, "[index]: %05d [type]: " + "%02x [len]: %04d %s\n", + cur_index, rec->lrh_type, rec->lrh_len, + ok ? 
"ok" : "failed"); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: no space to print log records\n", + handle->lgh_ctxt->loc_obd->obd_name); + RETURN(-LLOG_EEMPTY); + } + } + RETURN(rc); +} + +static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct obd_ioctl_data *ioc_data = data; + static int l, remains; + static long from, to; + static char *out; + int cur_index; + int rc; + + ENTRY; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + out = ioc_data->ioc_bulk; + ioc_data->ioc_inllen1 = 0; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + l = snprintf(out, remains, + "[index]: %05d [logid]: "DFID":%x\n", + cur_index, PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen); + } else if (rec->lrh_type == OBD_CFG_REC) { + int rc; + + rc = class_config_yaml_output(rec, out, remains); + if (rc < 0) + RETURN(rc); + l = rc; + } else { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d\n", + cur_index, rec->lrh_type, rec->lrh_len); + } + out += l; + remains -= l; + if (remains <= 0) { + CERROR("not enough space for print log records\n"); + RETURN(-LLOG_EEMPTY); + } + + RETURN(0); +} +static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, + struct llog_logid *logid) +{ + struct llog_handle *log; + int rc; + + ENTRY; + + rc = llog_cat_id2handle(env, cat, &log, logid); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen); + RETURN(-ENOENT); + } + + rc = llog_destroy(env, log); + if (rc) { + CDEBUG(D_IOCTL, "cannot destroy log "DFID":%x\n", + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen); + GOTO(out, rc); + } + llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index); +out: + llog_handle_put(env, log); + RETURN(rc); + +} + +static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) + RETURN(-EINVAL); + rc = llog_remove_log(env, handle, &lir->lid_id); + + RETURN(rc); +} + + +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data) +{ + struct llog_logid logid; + int rc = 0; + struct llog_handle *handle = NULL; + char *logname, start; + + ENTRY; + + logname = data->ioc_inlbuf1; + start = logname[0]; + if (start == '#' || start == '[') { + rc = str2logid(&logid, logname, data->ioc_inllen1); + if (rc) + RETURN(rc); + rc = llog_open(env, ctxt, &handle, &logid, NULL, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else if (start == '$' || isalpha(start) || isdigit(start)) { + if (start == '$') + logname++; + + rc = llog_open(env, ctxt, &handle, NULL, logname, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else { + rc = -EINVAL; + CDEBUG(D_INFO, "%s: invalid log name '%s': rc 
= %d\n", + ctxt->loc_obd->obd_name, logname, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, handle, 0, NULL); + if (rc) + GOTO(out_close, rc = -ENOENT); + + switch (cmd) { + case OBD_IOC_LLOG_INFO: { + int l; + int remains = data->ioc_inllen2 + + cfs_size_round(data->ioc_inllen1); + char *out = data->ioc_bulk; + + l = snprintf(out, remains, + "logid: "DFID":%x\n" + "flags: %x (%s)\n" + "records_count: %d\n" + "last_index: %d\n", + PFID(&handle->lgh_id.lgl_oi.oi_fid), + handle->lgh_id.lgl_ogen, + handle->lgh_hdr->llh_flags, + handle->lgh_hdr->llh_flags & + LLOG_F_IS_CAT ? "cat" : "plain", + handle->lgh_hdr->llh_count, + handle->lgh_last_idx); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: not enough space for log header info\n", + ctxt->loc_obd->obd_name); + rc = -ENOSPC; + } + break; + } + case OBD_IOC_LLOG_CHECK: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_check_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_PRINT: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_print_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_CANCEL: { + struct llog_cookie cookie; + struct llog_logid plain; + u32 lgc_index; + + rc = kstrtouint(data->ioc_inlbuf3, 0, &lgc_index); + if (rc) + GOTO(out_close, rc); + cookie.lgc_index = lgc_index; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_cancel_rec(env, handle, cookie.lgc_index); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */ + GOTO(out_close, rc = -ENOTTY); + + rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + cookie.lgc_lgl = plain; + rc = llog_cat_cancel_records(env, handle, 1, &cookie); + if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_REMOVE: { + struct llog_logid plain; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_destroy(env, handle); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 > 0) { + /* remove indicate log from the catalog */ + rc = str2logid(&plain, data->ioc_inlbuf2, + data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + rc = llog_remove_log(env, handle, &plain); + } else { + /* remove all the log of the catalog */ + rc = llog_process(env, handle, llog_delete_cb, NULL, + NULL); + if (rc) + GOTO(out_close, rc); + } + break; + } + default: + CERROR("%s: Unknown ioctl cmd %#x\n", + ctxt->loc_obd->obd_name, cmd); + GOTO(out_close, rc = -ENOTTY); + } + +out_close: + if (handle->lgh_hdr && + handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + llog_cat_close(env, handle); + else + llog_close(env, handle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_ioctl); + +int llog_catalog_list(const struct lu_env *env, struct dt_device *d, + int count, struct obd_ioctl_data *data, + const struct lu_fid *fid) +{ + int size, i; + struct llog_catid *idarray; + struct llog_logid *id; + char *out; + int l, remains, rc = 0; + + ENTRY; + + if (count == 0) { /* get total number of logs */ + rc = llog_osd_get_cat_list(env, d, 0, 0, NULL, fid); + if (rc < 0) + RETURN(rc); + count = rc; + } + + size = sizeof(*idarray) * count; + + OBD_ALLOC_LARGE(idarray, size); + if (!idarray) + RETURN(-ENOMEM); + + rc = llog_osd_get_cat_list(env, d, 0, 
count, idarray, fid); + if (rc) + GOTO(out, rc); + + out = data->ioc_bulk; + remains = data->ioc_inllen1; + /* OBD_FAIL: fetch the catalog records from the specified one */ + if (OBD_FAIL_CHECK(OBD_FAIL_CATLIST)) + data->ioc_count = cfs_fail_val - 1; + for (i = data->ioc_count; i < count; i++) { + id = &idarray[i].lci_logid; + l = snprintf(out, remains, "catalog_log: "DFID":%x\n", + PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); + out += l; + remains -= l; + if (remains <= 0) { + if (remains < 0) { + /* the print is not complete */ + remains += l; + data->ioc_bulk[out - data->ioc_bulk - l] = '\0'; + data->ioc_count = i; + } else { + data->ioc_count = i++; + } + goto out; + } + } + data->ioc_count = 0; +out: + OBD_FREE_LARGE(idarray, size); + RETURN(rc); +} +EXPORT_SYMBOL(llog_catalog_list); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c new file mode 100644 index 0000000000000..1d1f953992301 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c @@ -0,0 +1,250 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include +#include "llog_internal.h" + +/* helper functions for calling the llog obd methods */ +static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; + + ctxt->loc_obd = obd; + atomic_set(&ctxt->loc_refcount, 1); + + return ctxt; +} + +static void llog_ctxt_destroy(struct llog_ctxt *ctxt) +{ + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); +} + +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; + + spin_lock(&olg->olg_lock); + if (!atomic_dec_and_test(&ctxt->loc_refcount)) { + spin_unlock(&olg->olg_lock); + return rc; + } + olg->olg_ctxts[ctxt->loc_idx] = NULL; + spin_unlock(&olg->olg_lock); + + obd = ctxt->loc_obd; + spin_lock(&obd->obd_dev_lock); + /* sync with llog ctxt user thread */ + spin_unlock(&obd->obd_dev_lock); + + /* + * obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. 
+ */ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); + + /* cleanup the llog ctxt here */ + if (ctxt->loc_logops->lop_cleanup) + rc = ctxt->loc_logops->lop_cleanup(env, ctxt); + + llog_ctxt_destroy(ctxt); + wake_up(&olg->olg_waitq); + return rc; +} +EXPORT_SYMBOL(__llog_ctxt_put); + +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + + ENTRY; + + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); + + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); + + idx = ctxt->loc_idx; + + /* + * Banlance the ctxt get when calling llog_cleanup() + */ + LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); + LASSERT(atomic_read(&ctxt->loc_refcount) > 1); + llog_ctxt_put(ctxt); + + /* + * Try to free the ctxt. + */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + + ENTRY; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + if (disk_obd != NULL) + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + else + ctxt->loc_exp = class_export_get(obd->obd_self_export); + + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + ctxt->loc_chunk_size = LLOG_MIN_CHUNK_SIZE; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + CDEBUG(D_CONFIG, "%s: ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + if (disk_obd != NULL) + LASSERT(ctxt->loc_exp == + disk_obd->obd_self_export); + else + LASSERT(ctxt->loc_exp == + obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + RETURN(rc); + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + + ENTRY; + if (ctxt && ctxt->loc_logops->lop_sync) + rc = ctxt->loc_logops->lop_sync(ctxt, exp, flags); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_sync); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); 
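+/*
+ * Illustrative sketch only -- not part of the original patch. It shows the
+ * expected pairing of llog_setup() with llog_get_context()/llog_cleanup()
+ * defined above: llog_cleanup() assumes the caller holds a context
+ * reference (see the "Balance the ctxt get" comment in llog_cleanup()).
+ * The helpers my_llog_init()/my_llog_fini() are hypothetical; the context
+ * index LLOG_CONFIG_ORIG_CTXT, the obd->obd_olg default group and
+ * llog_get_context() are assumed from general Lustre conventions, and
+ * llog_osd_ops comes from llog_osd.c in this series.
+ */
+#if 0	/* example only, not compiled */
+static int my_llog_init(const struct lu_env *env, struct obd_device *obd,
+			struct obd_device *disk_obd)
+{
+	/* register a context in the device's default llog group */
+	return llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_ORIG_CTXT,
+			  disk_obd, &llog_osd_ops);
+}
+
+static void my_llog_fini(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_ctxt *ctxt;
+
+	/* llog_get_context() takes the reference llog_cleanup() expects */
+	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+	if (ctxt)
+		llog_cleanup(env, ctxt);
+}
+#endif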
+LU_KEY_INIT_GENERIC(llog); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c new file mode 100644 index 0000000000000..55088d417146d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c @@ -0,0 +1,2193 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * lustre/obdclass/llog_osd.c + * + * Low level llog routines on top of OSD API + * + * This file provides set of methods for llog operations on top of + * dt_device. It contains all supported llog_operations interfaces and + * supplimental functions. + * + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include + +#include +#include +#include +#include +#include + +#include "llog_internal.h" +#include "local_storage.h" + +/** + * Implementation of the llog_operations::lop_declare_create + * + * This function is a wrapper over local_storage API function + * local_object_declare_create(). + * + * \param[in] env execution environment + * \param[in] los local_storage for bottom storage device + * \param[in] o dt_object to create + * \param[in] th current transaction handle + * + * \retval 0 on successful declaration of the new object + * \retval negative error if declaration was failed + */ +static int llog_osd_declare_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_declare_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +/** + * Implementation of the llog_operations::lop_create + * + * This function is a wrapper over local_storage API function + * local_object_create(). 
+ * + * \param[in] env execution environment + * \param[in] los local_storage for bottom storage device + * \param[in] o dt_object to create + * \param[in] th current transaction handle + * + * \retval 0 on successful creation of the new object + * \retval negative error if creation was failed + */ +static int llog_osd_create_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +/** + * Implementation of the llog_operations::lop_exist + * + * This function checks that llog exists on storage. + * + * \param[in] handle llog handle of the current llog + * + * \retval true if llog object exists and is not just destroyed + * \retval false if llog doesn't exist or just destroyed + */ +static int llog_osd_exist(struct llog_handle *handle) +{ + LASSERT(handle->lgh_obj); + return dt_object_exists(handle->lgh_obj) && !handle->lgh_destroyed; +} + +static void *rec_tail(struct llog_rec_hdr *rec) +{ + return (void *)((char *)rec + rec->lrh_len - + sizeof(struct llog_rec_tail)); +} + +/** + * Write a padding record to the llog + * + * This function writes a padding record to the end of llog. That may + * be needed if llog contains records of variable size, e.g. config logs + * or changelogs. + * The padding record just aligns llog to the llog chunk_size boundary if + * the current record doesn't fit in the remaining space. + * + * It allocates full length to avoid two separate writes for header and tail. + * Such 2-steps scheme needs extra protection and complex error handling. + * + * \param[in] env execution environment + * \param[in] o dt_object to create + * \param[in,out] off pointer to the padding start offset + * \param[in] len padding length + * \param[in] index index of the padding record in a llog + * \param[in] th current transaction handle + * + * \retval 0 on successful padding write + * \retval negative error if write failed + */ +static int llog_osd_pad(const struct lu_env *env, struct dt_object *o, + loff_t *off, int len, int index, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + int rc; + + ENTRY; + + LASSERT(th); + LASSERT(off); + LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0); + + OBD_ALLOC(rec, len); + if (rec == NULL) + RETURN(-ENOMEM); + + rec->lrh_len = len; + rec->lrh_index = index; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = len; + tail->lrt_index = index; + + lgi->lgi_buf.lb_buf = rec; + lgi->lgi_buf.lb_len = len; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + + OBD_FREE(rec, len); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_read_header + * + * This function reads the current llog header from the bottom storage + * device. 
+ * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * + * \retval 0 on successful header read + * \retval negative error if read failed + */ +static int llog_osd_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_rec_hdr *llh_hdr; + struct dt_object *o; + struct llog_thread_info *lgi; + enum llog_flag flags; + int rc; + + ENTRY; + + o = handle->lgh_obj; + LASSERT(o); + + lgi = llog_info(env); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + RETURN(rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + + if (lgi->lgi_attr.la_size == 0) { + CDEBUG(D_HA, "not reading header from 0-byte log\n"); + RETURN(LLOG_EEMPTY); + } + + flags = handle->lgh_hdr->llh_flags; + + lgi->lgi_off = 0; + lgi->lgi_buf.lb_buf = handle->lgh_hdr; + lgi->lgi_buf.lb_len = handle->lgh_hdr_size; + rc = dt_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (rc < sizeof(*llh_hdr) || rc < llh_hdr->lrh_len) { + CERROR("%s: error reading "DFID" log header size %d: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), rc < 0 ? 0 : rc, + -EFAULT); + + if (rc >= 0) + rc = -EFAULT; + + RETURN(rc); + } + + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("%s: bad log %s "DFID" header magic: %#x " + "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + RETURN(-EIO); + } else if (llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { + CERROR("%s: incorrectly sized log %s "DFID" header: " + "%#x (expected at least %#x)\n" + "you may need to re-run lconf --write_conf.\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_len, LLOG_MIN_CHUNK_SIZE); + RETURN(-EIO); + } else if (LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index > + LLOG_HDR_BITMAP_SIZE(handle->lgh_hdr) || + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len != + llh_hdr->lrh_len) { + CERROR("%s: incorrectly sized log %s "DFID" tailer: " + "%#x : rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len, -EIO); + RETURN(-EIO); + } + + handle->lgh_hdr->llh_flags |= (flags & LLOG_F_EXT_MASK); + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + + RETURN(0); +} + +/** + * Implementation of the llog_operations::lop_declare_write + * + * This function declares the new record write. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] rec llog record header. This is a real header of the full + * llog record to write. This is the beginning of buffer + * to write, the length of buffer is stored in + * \a rec::lrh_len + * \param[in] idx index of the llog record. 
If \a idx == -1 then this is + * append case, otherwise \a idx is the index of record + * to modify + * \param[in] th current transaction handle + * + * \retval 0 on successful declaration + * \retval negative error if declaration failed + */ +static int llog_osd_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + __u32 chunk_size; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(th); + LASSERT(loghandle); + LASSERT(rec); + LASSERT(rec->lrh_len <= loghandle->lgh_ctxt->loc_chunk_size); + + o = loghandle->lgh_obj; + LASSERT(o); + + chunk_size = loghandle->lgh_ctxt->loc_chunk_size; + lgi->lgi_buf.lb_len = chunk_size; + lgi->lgi_buf.lb_buf = NULL; + /* each time we update header */ + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, + th); + if (rc || idx == 0) /* if error or just header */ + RETURN(rc); + + /** + * the pad record can be inserted so take into account double + * record size + */ + lgi->lgi_buf.lb_len = chunk_size * 2; + lgi->lgi_buf.lb_buf = NULL; + /* XXX: implement declared window or multi-chunks approach */ + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th); + + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_write + * + * This function writes the new record in the llog or modify the existed one. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] rec llog record header. This is a real header of + * the full llog record to write. This is + * the beginning of buffer to write, the length + * of buffer is stored in \a rec::lrh_len + * \param[in,out] reccookie pointer to the cookie to return back if needed. + * It is used for further cancel of this llog + * record. + * \param[in] idx index of the llog record. If \a idx == -1 then + * this is append case, otherwise \a idx is + * the index of record to modify + * \param[in] th current transaction handle + * + * \retval 0 on successful write && \a reccookie == NULL + * 1 on successful write && \a reccookie != NULL + * \retval negative error if write failed + */ +static int llog_osd_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_log_hdr *llh; + int reclen = rec->lrh_len; + int index, rc; + struct llog_rec_tail *lrt; + struct dt_object *o; + __u32 chunk_size; + size_t left; + __u32 orig_last_idx; + ENTRY; + + llh = loghandle->lgh_hdr; + o = loghandle->lgh_obj; + + chunk_size = llh->llh_hdr.lrh_len; + CDEBUG(D_OTHER, "new record %x to "DFID"\n", + rec->lrh_type, PFID(lu_object_fid(&o->do_lu))); + + if (!llog_osd_exist(loghandle)) + RETURN(-ENOENT); + + /* record length should not bigger than */ + if (reclen > loghandle->lgh_hdr->llh_hdr.lrh_len) + RETURN(-E2BIG); + + /* sanity check for fixed-records llog */ + if (idx != LLOG_HEADER_IDX && (llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + LASSERT(llh->llh_size != 0); + LASSERT(llh->llh_size == reclen); + } + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + RETURN(rc); + + /** + * The modification case. + * If idx set then the record with that index must be modified. + * There are three cases possible: + * 1) the common case is the llog header update (idx == 0) + * 2) the llog record modification during llog process. 
+ * This is indicated by the \a loghandle::lgh_cur_idx > 0. + * In that case the \a loghandle::lgh_cur_offset + * 3) otherwise this is assumed that llog consist of records of + * fixed size, i.e. catalog. The llog header must has llh_size + * field equal to record size. The record offset is calculated + * just by /a idx value + * + * During modification we don't need extra header update because + * the bitmap and record count are not changed. The record header + * and tail remains the same too. + */ + if (idx != LLOG_NEXT_IDX) { + /* llog can be empty only when first record is being written */ + LASSERT(ergo(idx > 0, lgi->lgi_attr.la_size > 0)); + + if (!ext2_test_bit(idx, LLOG_HDR_BITMAP(llh))) { + CERROR("%s: modify unset record %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx); + RETURN(-ENOENT); + } + + if (idx != rec->lrh_index) { + CERROR("%s: modify index mismatch %d %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + rec->lrh_index); + RETURN(-EFAULT); + } + + if (idx == LLOG_HEADER_IDX) { + /* llog header update */ + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + + lgi->lgi_off = 0; + + /* If it does not indicate the bitmap index + * (reccookie == NULL), then it means update + * the whole update header. Otherwise only + * update header and bits needs to be updated, + * and in DNE cases, it will signaficantly + * shrink the RPC size. + * see distribute_txn_cancel_records()*/ + if (reccookie == NULL) { + lgi->lgi_buf.lb_len = reclen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + RETURN(rc); + } + + /* update the header */ + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = llh; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + if (rc != 0) + RETURN(rc); + + /* update the bitmap */ + index = reccookie->lgc_index; + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * + sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = + &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + + RETURN(rc); + } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + + (idx - 1) * reclen; + } else if (reccookie != NULL && reccookie->lgc_index > 0) { + /** + * The lgc_offset can be used only if index is + * the same. + */ + if (idx != reccookie->lgc_index) { + CERROR("%s: modify index mismatch %d %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + reccookie->lgc_index); + RETURN(-EFAULT); + } + + lgi->lgi_off = reccookie->lgc_offset; + CDEBUG(D_OTHER, "modify record "DFID": idx:%u, " + "len:%u offset %llu\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), idx, + rec->lrh_len, (long long)lgi->lgi_off); + } else { + /* This can be result of lgh_cur_idx is not set during + * llog processing or llh_size is not set to proper + * record size for fixed records llog. Therefore it is + * impossible to get record offset. */ + CERROR("%s: can't get record offset, idx:%d, " + "len:%u.\n", o->do_lu.lo_dev->ld_obd->obd_name, + idx, rec->lrh_len); + RETURN(-EFAULT); + } + + /* update only data, header and tail remain the same */ + lgi->lgi_off += sizeof(struct llog_rec_hdr); + lgi->lgi_buf.lb_len = REC_DATA_LEN(rec); + lgi->lgi_buf.lb_buf = REC_DATA(rec); + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /** + * The append case. 
+ * The most common case of using llog. The new index is assigned to + * the new record, new bit is set in llog bitmap and llog count is + * incremented. + * + * Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + */ + + + /* simulate ENOSPC when new plain llog is being added to the + * catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2) && + llh->llh_flags & LLOG_F_IS_CAT) + RETURN(-ENOSPC); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + orig_last_idx = loghandle->lgh_last_idx; + lgi->lgi_off = lgi->lgi_attr.la_size; + + if (loghandle->lgh_max_size > 0 && + lgi->lgi_off >= loghandle->lgh_max_size) { + CDEBUG(D_OTHER, "llog is getting too large (%u > %u) at %u " + DFID"\n", (unsigned)lgi->lgi_off, + loghandle->lgh_max_size, (int)loghandle->lgh_last_idx, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); + /* this is to signal that this llog is full */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + RETURN(-ENOSPC); + } + + left = chunk_size - (lgi->lgi_off & (chunk_size - 1)); + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th); + if (rc) + RETURN(rc); + + loghandle->lgh_last_idx++; /* for pad rec */ + } + /* if it's the last idx in log file, then return -ENOSPC + * or wrap around if a catalog */ + if (llog_is_full(loghandle) || + unlikely(llh->llh_flags & LLOG_F_IS_CAT && + OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) && + loghandle->lgh_last_idx >= cfs_fail_val)) { + if (llh->llh_flags & LLOG_F_IS_CAT) + loghandle->lgh_last_idx = 0; + else + RETURN(-ENOSPC); + } + + down_write(&loghandle->lgh_last_sem); + /* increment the last_idx along with llh_tail index, they should + * be equal for a llog lifetime */ + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LLOG_HDR_TAIL(llh)->lrt_index = index; + /** + * NB: the caller should make sure only 1 process access + * the lgh_last_idx, e.g. append should be exclusive. + * Otherwise it might hit the assert. + */ + LASSERT(index < LLOG_HDR_BITMAP_SIZE(llh)); + rec->lrh_index = index; + lrt = rec_tail(rec); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + + /* the lgh_hdr_mutex protects llog header data from concurrent + * update/cancel, the llh_count and llh_bitmap are protected */ + mutex_lock(&loghandle->lgh_hdr_mutex); + if (ext2_set_bit(index, LLOG_HDR_BITMAP(llh))) { + CERROR("%s: index %u already set in log bitmap\n", + o->do_lu.lo_dev->ld_obd->obd_name, index); + mutex_unlock(&loghandle->lgh_hdr_mutex); + LBUG(); /* should never happen */ + } + llh->llh_count++; + + if (!(llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + /* Update the minimum size of the llog record */ + if (llh->llh_size == 0) + llh->llh_size = reclen; + else if (reclen < llh->llh_size) + llh->llh_size = reclen; + } + + if (lgi->lgi_attr.la_size == 0) { + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_hdr.lrh_len; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + } else { + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + + /* Note: If this is not initialization (size == 0), then do not + * write the whole header (8k bytes), only update header/tail + * and bits needs to be updated. 
Because this update might be + * part of cross-MDT operation, which needs to write these + * updates into the update log(32KB limit) and also pack inside + * the RPC (1MB limit), if we write 8K for each operation, which + * will cost a lot space, and keep us adding more updates to one + * update log.*/ + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + + lgi->lgi_off = (unsigned long)LLOG_HDR_TAIL(llh) - + (unsigned long)llh; + lgi->lgi_buf.lb_len = sizeof(llh->llh_tail); + lgi->lgi_buf.lb_buf = LLOG_HDR_TAIL(llh); + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + } + +out_unlock: + /* unlock here for remote object */ + mutex_unlock(&loghandle->lgh_hdr_mutex); + if (rc) + GOTO(out, rc); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int)(loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + msleep(1 * MSEC_PER_SEC); + } + /* computed index can be used to determine offset for fixed-size + * records. This also allows to handle Catalog wrap around case */ + if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + (index - 1) * reclen; + } else { + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = max_t(__u64, lgi->lgi_attr.la_size, + lgi->lgi_off); + } + + lgi->lgi_buf.lb_len = reclen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc < 0) + GOTO(out, rc); + + up_write(&loghandle->lgh_last_sem); + + CDEBUG(D_HA, "added record "DFID".%u, %u off%llu\n", + PFID(lu_object_fid(&o->do_lu)), index, rec->lrh_len, + lgi->lgi_off); + if (reccookie != NULL) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + RETURN(rc); +out: + /* cleanup llog for error case */ + mutex_lock(&loghandle->lgh_hdr_mutex); + ext2_clear_bit(index, LLOG_HDR_BITMAP(llh)); + llh->llh_count--; + mutex_unlock(&loghandle->lgh_hdr_mutex); + + /* restore llog last_idx */ + if (dt_object_remote(o)) { + loghandle->lgh_last_idx = orig_last_idx; + } else if (--loghandle->lgh_last_idx == 0 && + (llh->llh_flags & LLOG_F_IS_CAT) && llh->llh_cat_idx != 0) { + /* catalog had just wrap-around case */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + } + + LLOG_HDR_TAIL(llh)->lrt_index = loghandle->lgh_last_idx; + up_write(&loghandle->lgh_last_sem); + + RETURN(rc); +} + +/** + * We can skip reading at least as many log blocks as the number of + * minimum sized log records we are skipping. If it turns out + * that we are not far enough along the log (because the + * actual records are larger than minimum size) we just skip + * some more records. 
+ * + * Note: in llog_process_thread, it will use bitmap offset as + * the index to locate the record, which also includs some pad + * records, whose record size is very small, and it also does not + * consider pad record when recording minimum record size (otherwise + * min_record size might be too small), so in some rare cases, + * it might skip too much record for @goal, see llog_osd_next_block(). + * + * When force_mini_rec is true, it means we have to use LLOG_MIN_REC_SIZE + * as the min record size to skip over, usually because in the previous + * try, it skip too much record, see loog_osd_next(prev)_block(). + */ +static inline void llog_skip_over(struct llog_handle *lgh, __u64 *off, + int curr, int goal, __u32 chunk_size, + bool force_mini_rec) +{ + struct llog_log_hdr *llh = lgh->lgh_hdr; + + /* Goal should not bigger than the record count */ + if (goal > lgh->lgh_last_idx) + goal = lgh->lgh_last_idx; + + if (goal > curr) { + if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + *off = chunk_size + (goal - 1) * llh->llh_size; + } else { + __u64 min_rec_size = LLOG_MIN_REC_SIZE; + + if (llh->llh_size > 0 && !force_mini_rec) + min_rec_size = llh->llh_size; + + *off = *off + (goal - curr - 1) * min_rec_size; + } + } + /* always align with lower chunk boundary*/ + *off &= ~(chunk_size - 1); +} + +/** + * Remove optional fields that the client doesn't expect. + * This is typically in order to ensure compatibility with older clients. + * It is assumed that since we exclusively remove fields, the block will be + * big enough to handle the remapped records. It is also assumed that records + * of a block have the same format (i.e.: the same features enabled). + * + * \param[in,out] hdr Header of the block of records to remap. + * \param[in,out] last_hdr Last header, don't read past this point. + * \param[in] flags Flags describing the fields to keep. + * \param[in] extra_flags Flags describing the extra fields to keep. + */ +static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, + struct llog_rec_hdr *last_hdr, + struct llog_handle *loghandle) +{ + enum changelog_rec_flags flags = CLF_SUPPORTED; + enum changelog_rec_extra_flags extra_flags = CLFE_SUPPORTED; + + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_XATTR)) + extra_flags &= ~CLFE_XATTR; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_OMODE)) + extra_flags &= ~CLFE_OPEN; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_NID)) + extra_flags &= ~CLFE_NID; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_UIDGID)) + extra_flags &= ~CLFE_UIDGID; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_EXTRA_FLAGS)) + flags &= ~CLF_EXTRA_FLAGS; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) + flags &= ~CLF_JOBID; + + if (flags == CLF_SUPPORTED && extra_flags == CLFE_SUPPORTED) + return; + + if (hdr->lrh_type != CHANGELOG_REC) + return; + + do { + struct changelog_rec *rec = (struct changelog_rec *)(hdr + 1); + enum changelog_rec_extra_flags xflag = CLFE_INVALID; + + if (flags & CLF_EXTRA_FLAGS && + rec->cr_flags & CLF_EXTRA_FLAGS) { + xflag = changelog_rec_extra_flags(rec)->cr_extra_flags & + extra_flags; + } + + if (unlikely(hdr->lrh_len == 0)) { + /* It is corruption case, we cannot know the next rec, + * jump to the last one directly to avoid dead loop. 
*/ + LCONSOLE(D_WARNING, "Hit invalid llog record: " + "idx %u, type %u, id %u\n", + hdr->lrh_index, hdr->lrh_type, hdr->lrh_id); + hdr = llog_rec_hdr_next(last_hdr); + if (unlikely(hdr == last_hdr)) + LCONSOLE(D_WARNING, "The last record crashed: " + "idx %u, type %u, id %u\n", + hdr->lrh_index, hdr->lrh_type, + hdr->lrh_id); + break; + } + + changelog_remap_rec(rec, rec->cr_flags & flags, xflag); + hdr = llog_rec_hdr_next(hdr); + /* Yield CPU to avoid soft-lockup if there are too many records + * to be handled. */ + cond_resched(); + } while ((char *)hdr <= (char *)last_hdr); +} + +/** + * Implementation of the llog_operations::lop_next_block + * + * This function finds the the next llog block to return which contains + * record with required index. It is main part of llog processing. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in,out] cur_idx index preceeding cur_offset + * \param[in] next_idx target index to find + * \param[in,out] cur_offset furtherst point read in the file + * \param[in] buf pointer to data buffer to fill + * \param[in] len required len to read, it is + * usually llog chunk_size. + * + * \retval 0 on successful buffer read + * \retval negative value on error + */ +static int llog_osd_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + int rc; + __u32 chunk_size; + int last_idx = *cur_idx; + __u64 last_offset = *cur_offset; + bool force_mini_rec = false; + + ENTRY; + + LASSERT(env); + LASSERT(lgi); + + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) + RETURN(-EINVAL); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(llog_osd_exist(loghandle)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off" + "%llu), size %llu\n", next_idx, *cur_idx, + *cur_offset, lgi->lgi_attr.la_size); + + while (*cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + llog_skip_over(loghandle, cur_offset, *cur_idx, + next_idx, chunk_size, force_mini_rec); + + /* read up to next llog chunk_size block */ + lgi->lgi_buf.lb_len = chunk_size - + (*cur_offset & (chunk_size - 1)); + lgi->lgi_buf.lb_buf = buf; + + rc = dt_read(env, o, &lgi->lgi_buf, cur_offset); + if (rc < 0) { + if (rc == -EBADR && !force_mini_rec) + goto retry; + + CERROR("%s: can't read llog block from log "DFID + " offset %llu: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), *cur_offset, + rc); + GOTO(out, rc); + } + + if (rc < len) { + /* signal the end of the valid buffer to + * llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) { /* end of file, nothing to do */ + if (!force_mini_rec) + goto retry; + GOTO(out, rc); + } + + if (rc < sizeof(*tail)) { + if (!force_mini_rec) + goto retry; + + CERROR("%s: invalid llog block at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + tail = (struct llog_rec_tail *)((char 
*)buf + rc - + sizeof(struct llog_rec_tail)); + + if (llog_verify_record(loghandle, rec)) { + /* + * the block seems corrupted. make a pad record so the + * caller can skip the block and try with the next one + */ + rec->lrh_len = rc; + rec->lrh_index = next_idx; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = rc; + tail->lrt_index = next_idx; + + GOTO(out, rc = 0); + } + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + tail->lrt_len); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + + if (last_rec->lrh_index != tail->lrt_index) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu last_rec idx %u tail idx %u" + "lrt len %u read_size %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset, + last_rec->lrh_index, tail->lrt_index, + tail->lrt_len, rc); + GOTO(out, rc = -EINVAL); + } + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu bytes %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset, rc); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < next_idx) { + last_idx = *cur_idx; + last_offset = *cur_offset; + continue; + } + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (next_idx && rec->lrh_index > next_idx) { + if (!force_mini_rec && next_idx > last_idx) + goto retry; + + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, next_idx); + GOTO(out, rc = -ENOENT); + } + + /* Trim unsupported extensions for compat w/ older clients */ + changelog_block_trim_ext(rec, last_rec, loghandle); + + GOTO(out, rc = 0); + +retry: + /* Note: because there are some pad records in the + * llog, so llog_skip_over() might skip too much + * records, let's try skip again with minimum record */ + force_mini_rec = true; + *cur_offset = last_offset; + *cur_idx = last_idx; + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +/** + * Implementation of the llog_operations::lop_prev_block + * + * This function finds the llog block to return which contains + * record with required index but in reverse order - from end of llog + * to the beginning. + * It is main part of reverse llog processing. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] prev_idx target index to find + * \param[in] buf pointer to data buffer to fill + * \param[in] len required len to read, it is llog_chunk_size usually. 
+ * + * \retval 0 on successful buffer read + * \retval negative value on error + */ +static int llog_osd_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + loff_t cur_offset; + __u32 chunk_size; + int rc; + + ENTRY; + + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(llog_osd_exist(loghandle)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + /* Let's only use mini record size for previous block read + * for now XXX */ + cur_offset = chunk_size; + llog_skip_over(loghandle, &cur_offset, 0, prev_idx, + chunk_size, true); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + while (cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + lgi->lgi_buf.lb_len = len; + lgi->lgi_buf.lb_buf = buf; + rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset %llu: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), cur_offset, rc); + GOTO(out, rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, prev_idx); + GOTO(out, rc = -ENOENT); + } + + /* Trim unsupported extensions for compat w/ older clients */ + changelog_block_trim_ext(rec, last_rec, loghandle); + + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +/** + * This is helper function to get llog directory object. It is used by named + * llog operations to find/insert/delete llog entry from llog directory. 
+ * + * \param[in] env execution environment + * \param[in] ctxt llog context + * + * \retval dt_object of llog directory + * \retval ERR_PTR of negative value on error + */ +static struct dt_object *llog_osd_dir_get(const struct lu_env *env, + struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dir; + int rc; + + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + if (ctxt->loc_dir == NULL) { + rc = dt_root_get(env, dt, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + dir = dt_locate(env, dt, &dti->dti_fid); + + if (!IS_ERR(dir) && !dt_try_as_dir(env, dir)) { + dt_object_put(env, dir); + return ERR_PTR(-ENOTDIR); + } + } else { + lu_object_get(&ctxt->loc_dir->do_lu); + dir = ctxt->loc_dir; + } + + return dir; +} + +/** + * Implementation of the llog_operations::lop_open + * + * This function opens the llog by its logid or by name, it may open also + * non existent llog and assing then new id to it. + * The llog_open/llog_close pair works similar to lu_object_find/put, + * the object may not exist prior open. The result of open is just dt_object + * in the llog header. + * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * \param[in] logid logid of llog to open (nameless llog) + * \param[in] name name of llog to open (named llog) + * \param[in] open_param + * LLOG_OPEN_NEW - new llog, may not exist + * LLOG_OPEN_EXIST - old llog, must exist + * + * \retval 0 on successful open, llog_handle::lgh_obj + * contains the dt_object of the llog. + * \retval negative value on error + */ +static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct dt_object *o; + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los = NULL; + int rc = 0; + bool new_id = false; + + ENTRY; + + LASSERT(env); + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct lu_object_conf conf = { 0 }; + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else { + /* If logid == NULL, then it means the caller needs + * to allocate new FID (llog_cat_declare_add_rec()). */ + rc = obd_fid_alloc(env, ctxt->loc_exp, + &lgi->lgi_fid, NULL); + if (rc < 0) + RETURN(rc); + rc = 0; + conf.loc_flags = LOC_F_NEW; + } + + o = dt_locate_at(env, dt, &lgi->lgi_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + goto after_open; + } + + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, name != NULL ? 
FID_SEQ_LLOG_NAME : FID_SEQ_LLOG); + mutex_unlock(&ls->ls_los_mutex); + LASSERT(los); + ls_device_put(env, ls); + + LASSERT(handle); + + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else if (name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out, rc = PTR_ERR(llog_dir)); + dt_read_lock(env, llog_dir, 0); + rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid); + dt_read_unlock(env, llog_dir); + dt_object_put(env, llog_dir); + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, + &lgi->lgi_fid); + new_id = true; + } + if (rc < 0) + GOTO(out, rc); + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + } else { + LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param); + /* generate fid for new llog */ +generate: + rc = local_object_fid_generate(env, los, &lgi->lgi_fid); + if (rc < 0) + GOTO(out, rc); + new_id = true; + } + + o = ls_locate(env, ls, &lgi->lgi_fid, NULL); + if (IS_ERR(o)) + GOTO(out_name, rc = PTR_ERR(o)); + + if (dt_object_exists(o) && new_id) { + /* llog exists with just generated ID, e.g. some old llog file + * still is in use or is orphan, drop a warn and skip it. */ + CDEBUG(D_INFO, "%s: llog exists with the same FID: "DFID + ", skipping\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu))); + dt_object_put(env, o); + /* just skip this llog ID, we shouldn't delete it because we + * don't know exactly what is its purpose and state. */ + goto generate; + } + +after_open: + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o)) { + CDEBUG(D_INFO, "%s: llog FID: "DFID" obj %p doesn`t exist\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), o); + GOTO(out_put, rc = -ENOENT); + } + fid_to_logid(&lgi->lgi_fid, &handle->lgh_id); + handle->lgh_obj = o; + handle->private_data = los; + LASSERT(handle->lgh_ctxt); + + RETURN(rc); + +out_put: + dt_object_put(env, o); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + if (los != NULL) + dt_los_put(los); + RETURN(rc); +} + +/** + * Get dir for regular fid log object + * + * Get directory for regular fid log object, and these regular fid log + * object will be inserted under this directory, to satisfy the FS + * consistency check, e2fsck etc. + * + * \param [in] env execution environment + * \param [in] dto llog object + * + * \retval pointer to the directory if it is found. + * \retval ERR_PTR(negative errno) if it fails. 
+ */ +struct dt_object *llog_osd_get_regular_fid_dir(const struct lu_env *env, + struct dt_object *dto) +{ + struct llog_thread_info *lgi = llog_info(env); + struct seq_server_site *ss = dto->do_lu.lo_dev->ld_site->ld_seq_site; + struct lu_seq_range *range = &lgi->lgi_range; + struct lu_fid *dir_fid = &lgi->lgi_fid; + struct dt_object *dir; + int rc; + ENTRY; + + fld_range_set_any(range); + LASSERT(ss != NULL); + rc = ss->ss_server_fld->lsf_seq_lookup(env, ss->ss_server_fld, + fid_seq(lu_object_fid(&dto->do_lu)), range); + if (rc < 0) + RETURN(ERR_PTR(rc)); + + lu_update_log_dir_fid(dir_fid, range->lsr_index); + dir = dt_locate(env, lu2dt_dev(dto->do_lu.lo_dev), dir_fid); + if (IS_ERR(dir)) + RETURN(dir); + + if (!dt_try_as_dir(env, dir)) { + dt_object_put(env, dir); + RETURN(ERR_PTR(-ENOTDIR)); + } + + RETURN(dir); +} + +/** + * Add llog object with regular FID to name entry + * + * Add llog object with regular FID to name space, and each llog + * object on each MDT will be /update_log_dir/[seq:oid:ver], + * so to satisfy the namespace consistency check, e2fsck etc. + * + * \param [in] env execution environment + * \param [in] dto llog object + * \param [in] th thandle + * \param [in] declare if it is declare or execution + * + * \retval 0 if insertion succeeds. + * \retval negative errno if insertion fails. + */ +static int +llog_osd_regular_fid_add_name_entry(const struct lu_env *env, + struct dt_object *dto, + struct thandle *th, bool declare) +{ + struct llog_thread_info *lgi = llog_info(env); + const struct lu_fid *fid = lu_object_fid(&dto->do_lu); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct dt_object *dir; + char *name = lgi->lgi_name; + int rc; + ENTRY; + + if (!fid_is_norm(fid)) + RETURN(0); + + dir = llog_osd_get_regular_fid_dir(env, dto); + if (IS_ERR(dir)) + RETURN(PTR_ERR(dir)); + + rec->rec_fid = fid; + rec->rec_type = S_IFREG; + snprintf(name, sizeof(lgi->lgi_name), DFID, PFID(fid)); + dt_write_lock(env, dir, 0); + if (declare) { + rc = dt_declare_insert(env, dir, (struct dt_rec *)rec, + (struct dt_key *)name, th); + } else { + rc = dt_insert(env, dir, (struct dt_rec *)rec, + (struct dt_key *)name, th); + } + dt_write_unlock(env, dir); + + dt_object_put(env, dir); + RETURN(rc); +} + + +/** + * Implementation of the llog_operations::lop_declare_create + * + * This function declares the llog create. It declares also name insert + * into llog directory in case of named llog. 
+ * + * \param[in] env execution environment + * \param[in] res llog handle of the current llog + * \param[in] th current transaction handle + * + * \retval 0 on successful create declaration + * \retval negative value on error + */ +static int llog_osd_declare_create(const struct lu_env *env, + struct llog_handle *res, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct local_oid_storage *los; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(res->lgh_obj); + LASSERT(th); + + /* object can be created by another thread */ + o = res->lgh_obj; + if (dt_object_exists(o)) + RETURN(0); + + if (res->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE | LA_SIZE; + lgi->lgi_attr.la_size = 0; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc < 0) + RETURN(rc); + + + rc = llog_osd_regular_fid_add_name_entry(env, o, th, true); + + RETURN(rc); + } + los = res->private_data; + LASSERT(los); + + rc = llog_osd_declare_new_object(env, los, o, th); + if (rc) + RETURN(rc); + + /* do not declare header initialization here as it's declared + * in llog_osd_declare_write_rec() which is always called */ + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rec->rec_fid = &lgi->lgi_fid; + rec->rec_type = S_IFREG; + rc = dt_declare_insert(env, llog_dir, + (struct dt_rec *)rec, + (struct dt_key *)res->lgh_name, th); + dt_object_put(env, llog_dir); + if (rc) + CERROR("%s: can't declare named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_create + * + * This function creates the llog according with llog_handle::lgh_obj + * and llog_handle::lgh_name. 
+ * + * \param[in] env execution environment + * \param[in] res llog handle of the current llog + * \param[in] th current transaction handle + * + * \retval 0 on successful create + * \retval negative value on error + */ +static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct local_oid_storage *los; + struct dt_object *o; + int rc = 0; + + ENTRY; + + LASSERT(env); + o = res->lgh_obj; + LASSERT(o); + + /* llog can be already created */ + if (dt_object_exists(o)) + RETURN(-EEXIST); + + if (res->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE | LA_SIZE | LA_TYPE; + lgi->lgi_attr.la_size = 0; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + dt_write_lock(env, o, 0); + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); + if (rc < 0) + RETURN(rc); + + rc = llog_osd_regular_fid_add_name_entry(env, o, th, false); + + RETURN(rc); + } + + los = res->private_data; + LASSERT(los); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = llog_osd_create_new_object(env, los, o, th); + else + rc = -EEXIST; + + dt_write_unlock(env, o); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rec->rec_fid = &lgi->lgi_fid; + rec->rec_type = S_IFREG; + dt_read_lock(env, llog_dir, 0); + rc = dt_insert(env, llog_dir, (struct dt_rec *)rec, + (struct dt_key *)res->lgh_name, th); + dt_read_unlock(env, llog_dir); + dt_object_put(env, llog_dir); + if (rc) + CERROR("%s: can't create named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_close + * + * This function closes the llog. It just put llog object and referenced + * local storage. + * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * + * \retval 0 on successful llog close + * \retval negative value on error + */ +static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle) +{ + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(handle->lgh_obj); + + if (handle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + /* Remove the object from the cache, otherwise it may + * hold LOD being released during cleanup process */ + dt_object_put_nocache(env, handle->lgh_obj); + LASSERT(handle->private_data == NULL); + RETURN(rc); + } else { + dt_object_put(env, handle->lgh_obj); + } + los = handle->private_data; + LASSERT(los); + dt_los_put(los); + + if (handle->lgh_name) + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + + RETURN(rc); +} + +/** + * delete llog object name entry + * + * Delete llog object (with regular FID) from name space (under + * update_log_dir). + * + * \param [in] env execution environment + * \param [in] dto llog object + * \param [in] th thandle + * \param [in] declare if it is declare or execution + * + * \retval 0 if deletion succeeds. + * \retval negative errno if deletion fails. 
+ */ +static int +llog_osd_regular_fid_del_name_entry(const struct lu_env *env, + struct dt_object *dto, + struct thandle *th, bool declare) +{ + struct llog_thread_info *lgi = llog_info(env); + const struct lu_fid *fid = lu_object_fid(&dto->do_lu); + struct dt_object *dir; + char *name = lgi->lgi_name; + int rc; + ENTRY; + + if (!fid_is_norm(fid)) + RETURN(0); + + dir = llog_osd_get_regular_fid_dir(env, dto); + if (IS_ERR(dir)) + RETURN(PTR_ERR(dir)); + + snprintf(name, sizeof(lgi->lgi_name), DFID, PFID(fid)); + dt_write_lock(env, dir, 0); + if (declare) { + rc = dt_declare_delete(env, dir, (struct dt_key *)name, + th); + } else { + rc = dt_delete(env, dir, (struct dt_key *)name, th); + } + dt_write_unlock(env, dir); + + dt_object_put(env, dir); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_declare_destroy + * + * This function declare destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_declare_destroy(const struct lu_env *env, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + rc = dt_declare_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + if (rc < 0) + GOTO(out_put, rc); + } + + rc = dt_declare_ref_del(env, o, th); + if (rc < 0) + GOTO(out_put, rc); + + rc = dt_declare_destroy(env, o, th); + if (rc < 0) + GOTO(out_put, rc); + + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, true); + if (rc < 0) + GOTO(out_put, rc); + } + +out_put: + if (!(IS_ERR_OR_NULL(llog_dir))) + dt_object_put(env, llog_dir); + + RETURN(rc); +} + + +/** + * Implementation of the llog_operations::lop_destroy + * + * This function destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * Destroy method is not part of external transaction and does everything + * inside. 
+ * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_destroy(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt != NULL); + + o = loghandle->lgh_obj; + LASSERT(o != NULL); + + dt_write_lock(env, o, 0); + if (!llog_osd_exist(loghandle)) + GOTO(out_unlock, rc = 0); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out_unlock, rc = PTR_ERR(llog_dir)); + + dt_read_lock(env, llog_dir, 0); + rc = dt_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + dt_read_unlock(env, llog_dir); + if (rc) { + CERROR("%s: can't remove llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + loghandle->lgh_name, rc); + GOTO(out_unlock, rc); + } + } + + dt_ref_del(env, o, th); + rc = dt_destroy(env, o, th); + if (rc < 0) + GOTO(out_unlock, rc); + + loghandle->lgh_destroyed = true; + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, false); + if (rc < 0) + GOTO(out_unlock, rc); + } + +out_unlock: + dt_write_unlock(env, o); + if (!(IS_ERR_OR_NULL(llog_dir))) + dt_object_put(env, llog_dir); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_setup + * + * This function setup the llog on local storage. + * + * \param[in] env execution environment + * \param[in] obd obd device the llog belongs to + * \param[in] olg the llog group, it is always zero group now. + * \param[in] ctxt_idx the llog index, it defines the purpose of this llog. + * Every new llog type have to use own index. + * \param[in] disk_obd the storage obd, where llog is stored. + * + * \retval 0 on successful llog setup + * \retval negative value on error + */ +static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; + + LASSERT(obd); + LASSERT(olg->olg_ctxts[ctxt_idx]); + + ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]); + LASSERT(ctxt); + + if (disk_obd == NULL) + GOTO(out, rc = 0); + + /* initialize data allowing to generate new fids, + * literally we need a sequece */ + lgi->lgi_fid.f_seq = FID_SEQ_LLOG; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, + &ctxt->loc_los_nameless); + if (rc != 0) + GOTO(out, rc); + + lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, + &ctxt->loc_los_named); + if (rc != 0) { + local_oid_storage_fini(env, ctxt->loc_los_nameless); + ctxt->loc_los_nameless = NULL; + } + + GOTO(out, rc); + +out: + llog_ctxt_put(ctxt); + return rc; +} + +/** + * Implementation of the llog_operations::lop_cleanup + * + * This function cleanups the llog on local storage. 
+ * + * \param[in] env execution environment + * \param[in] ctxt the llog context + * + * \retval 0 + */ +static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + if (ctxt->loc_los_nameless != NULL) { + local_oid_storage_fini(env, ctxt->loc_los_nameless); + ctxt->loc_los_nameless = NULL; + } + + if (ctxt->loc_los_named != NULL) { + local_oid_storage_fini(env, ctxt->loc_los_named); + ctxt->loc_los_named = NULL; + } + + return 0; +} + +struct llog_operations llog_osd_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, +}; +EXPORT_SYMBOL(llog_osd_ops); + +struct llog_operations llog_common_cat_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, + .lop_add = llog_cat_add_rec, + .lop_declare_add = llog_cat_declare_add_rec, +}; +EXPORT_SYMBOL(llog_common_cat_ops); + +/** + * Read the special file which contains the list of llog catalogs IDs + * + * This function reads the CATALOGS file which contains the array of llog + * catalogs IDs. The main purpose of this file is to store OSP llogs indexed + * by OST/MDT number. + * + * \param[in] env execution environment + * \param[in] d corresponding storage device + * \param[in] idx position to start from, usually OST/MDT index + * \param[in] count how many catalog IDs to read + * \param[out] idarray the buffer for the data. If it is NULL then + * function returns just number of catalog IDs + * in the file. 
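+ * In that case \a count is ignored.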
+ * \param[in] fid LLOG_CATALOGS_OID for CATALOG object + * + * \retval 0 on successful read of catalog IDs + * \retval negative value on error + * \retval positive value which is number of records in + * the file if \a idarray is NULL + */ +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + ENTRY; + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lgi->lgi_fid = *fid; + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) { + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + th->th_wait_submit = 1; + /* Make the llog object creation synchronization, so + * it will be reliable to the reference, especially + * for remote reference */ + th->th_sync = 1; + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, d, th); + if (rc) + GOTO(out, rc); + } + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n", + (int)lgi->lgi_attr.la_size, size); + + /* return just number of llogs */ + if (idarray == NULL) { + rc = lgi->lgi_attr.la_size / sizeof(*idarray); + GOTO(out, rc); + } + + /* read for new ost index or for empty file */ + memset(idarray, 0, size); + if (lgi->lgi_attr.la_size <= lgi->lgi_off) + GOTO(out, rc = 0); + if (lgi->lgi_attr.la_size < lgi->lgi_off + size) + size = lgi->lgi_attr.la_size - lgi->lgi_off; + + lgi->lgi_buf.lb_buf = idarray; + lgi->lgi_buf.lb_len = size; + rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + /* -EFAULT means the llog is a sparse file. This is not an error + * after arbitrary OST index is supported. */ + if (rc < 0 && rc != -EFAULT) { + CERROR("%s: error reading CATALOGS: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + EXIT; +out: + dt_object_put(env, o); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_get_cat_list); + +/** + * Write the special file which contains the list of llog catalogs IDs + * + * This function writes the CATALOG file which contains the array of llog + * catalogs IDs. It is used mostly to store OSP llogs indexed by OST/MDT + * number. + * + * \param[in] env execution environment + * \param[in] d corresponding storage device + * \param[in] idx position to start from, usually OST/MDT index + * \param[in] count how many catalog IDs to write + * \param[out] idarray the buffer with the data to write. 
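+ * A zero \a count is a no-op and makes the function return 0 immediately.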
+ * \param[in] fid LLOG_CATALOGS_OID for CATALOG object + * + * \retval 0 on successful write of catalog IDs + * \retval negative value on error + */ +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + if (count == 0) + RETURN(0); + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + lgi->lgi_fid = *fid; + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) + GOTO(out, rc = -ENOENT); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + lgi->lgi_buf.lb_len = size; + lgi->lgi_buf.lb_buf = idarray; + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, lgi->lgi_off, th); + if (rc) + GOTO(out_trans, rc); + + /* For update log, this happens during initialization, + * see lod_sub_prep_llog(), and we need make sure catlog + * file ID is written to catlist file(committed) before + * cross-MDT operation write update records to catlog FILE, + * otherwise, during failover these update records might + * missing */ + if (fid_is_update_log(fid)) + th->th_sync = 1; + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + th->th_wait_submit = 1; + + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc) + CDEBUG(D_INODE, "can't write CATALOGS at index %d: rc = %d\n", + idx, rc); +out_trans: + dt_trans_stop(env, d, th); +out: + dt_object_put(env, o); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_put_cat_list); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c new file mode 100644 index 0000000000000..c644efb64ac1f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c @@ -0,0 +1,484 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_swab.c + * + * Swabbing of llog datatypes (from disk or over the wire). 
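+ * The helpers here byte-swap llog record headers, record bodies, llog
+ * log headers and lustre_cfg/cfg_marker structures so that logs written
+ * on a host with different endianness can be interpreted correctly.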
+ * + * Author: jacob berkman + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi.oi_fid: "DFID"\n", + PFID(&d->lgd_logid.lgl_oi.oi_fid)); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: %#llx\n", d->lgd_cur_offset); +} + +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s(&fid->f_seq); + __swab32s(&fid->f_oid); + __swab32s(&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + +void lustre_swab_ost_id(struct ost_id *oid) +{ + if (fid_seq_is_mdt0(oid->oi.oi_seq) || + fid_seq_is_default(oid->oi.oi_seq)) { + __swab64s(&oid->oi.oi_id); + __swab64s(&oid->oi.oi_seq); + } else { + lustre_swab_lu_fid(&oid->oi_fid); + } +} +EXPORT_SYMBOL(lustre_swab_ost_id); + +void lustre_swab_llog_id(struct llog_logid *log_id) +{ + __swab64s(&log_id->lgl_oi.oi.oi_id); + __swab64s(&log_id->lgl_oi.oi.oi_seq); + __swab32s(&log_id->lgl_ogen); +} + +void lustre_swab_llogd_body (struct llogd_body *d) +{ + ENTRY; + print_llogd_body(d); + lustre_swab_llog_id(&d->lgd_logid); + __swab32s(&d->lgd_ctxt_idx); + __swab32s(&d->lgd_llh_flags); + __swab32s(&d->lgd_index); + __swab32s(&d->lgd_saved_index); + __swab32s(&d->lgd_len); + __swab64s(&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) +{ + __swab64s(&d->lgdc_gen.mnt_cnt); + __swab64s(&d->lgdc_gen.conn_cnt); + lustre_swab_llog_id(&d->lgdc_logid); + __swab32s(&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid(struct ll_fid *fid) +{ + __swab64s(&fid->id); + __swab32s(&fid->generation); + __swab32s(&fid->f_type); +} + +void lustre_swab_lu_seq_range(struct lu_seq_range *range) +{ + __swab64s(&range->lsr_start); + __swab64s(&range->lsr_end); + __swab32s(&range->lsr_index); + __swab32s(&range->lsr_flags); +} +EXPORT_SYMBOL(lustre_swab_lu_seq_range); + +void lustre_swab_update_ops(struct update_ops *uops, unsigned int op_count) +{ + unsigned int i; + unsigned int j; + + for (i = 0; i < op_count; i++) { + lustre_swab_lu_fid(&uops->uops_op[i].uop_fid); + __swab16s(&uops->uops_op[i].uop_type); + __swab16s(&uops->uops_op[i].uop_param_count); + for (j = 0; j < uops->uops_op[i].uop_param_count; j++) + __swab16s(&uops->uops_op[i].uop_params_off[j]); + } +} +EXPORT_SYMBOL(lustre_swab_update_ops); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec) +{ + struct llog_rec_tail *tail = NULL; + + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_id); + + switch (rec->lrh_type) { + case OST_SZ_REC: + { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); + tail = &lsc->lsc_tail; + break; + } + case MDS_UNLINK_REC: + { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case MDS_UNLINK64_REC: + { + struct llog_unlink64_rec *lur = + (struct 
llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = + (struct llog_changelog_rec *)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (cr->cr.cr_flags & CLF_RENAME) { + struct changelog_ext_rename *rnm = + changelog_rec_rename(&cr->cr); + + lustre_swab_lu_fid(&rnm->cr_sfid); + lustre_swab_lu_fid(&rnm->cr_spfid); + } + /* + * Because the tail follows a variable-length structure we need + * to compute its location at runtime + */ + tail = (struct llog_rec_tail *)((char *)&cr->cr + + changelog_rec_size(&cr->cr) + + cr->cr.cr_namelen); + break; + } + + case CHANGELOG_USER_REC: + { + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec *)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + __swab32s(&cur->cur_time); + tail = &cur->cur_tail; + break; + } + + case HSM_AGENT_REC: { + struct llog_agent_req_rec *arr = + (struct llog_agent_req_rec *)rec; + + __swab32s(&arr->arr_hai.hai_len); + __swab32s(&arr->arr_hai.hai_action); + lustre_swab_lu_fid(&arr->arr_hai.hai_fid); + lustre_swab_lu_fid(&arr->arr_hai.hai_dfid); + __swab64s(&arr->arr_hai.hai_cookie); + __swab64s(&arr->arr_hai.hai_extent.offset); + __swab64s(&arr->arr_hai.hai_extent.length); + __swab64s(&arr->arr_hai.hai_gid); + /* + * no swabing for opaque data + * hai_data[0]; + */ + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + __swab64s(&lsr->lsr_valid); + + if (rec->lrh_len > sizeof(struct llog_setattr64_rec)) { + struct llog_setattr64_rec_v2 *lsr2 = + (struct llog_setattr64_rec_v2 *)rec; + + __swab32s(&lsr2->lsr_projid); + __swab32s(&lsr2->lsr_layout_version); + tail = &lsr2->lsr_tail; + } else { + tail = &lsr->lsr_tail; + } + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = LLOG_HDR_TAIL(llh); + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_llog_id(&lid->lid_id); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + case UPDATE_REC: + { + struct llog_update_record *lur = + (struct llog_update_record *)rec; + struct update_records *record = &lur->lur_update_rec; + + __swab32s(&record->ur_flags); + __swab64s(&record->ur_batchid); + __swab64s(&record->ur_master_transno); + __swab32s(&record->ur_param_count); + __swab32s(&record->ur_update_count); + lustre_swab_update_ops(&record->ur_ops, + record->ur_update_count); + + /* Compute tail location. 
*/ + tail = (struct llog_rec_tail *)((char *)record + + update_records_size(record)); + break; + } + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: %#llx\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", + LLOG_HDR_TAIL(h)->lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", + LLOG_HDR_TAIL(h)->lrt_len); +} + +void lustre_swab_llog_hdr (struct llog_log_hdr *h) +{ + ENTRY; + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + ENTRY; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", + libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d %s\n", + i, lcfg->lcfg_buflens[i], + lustre_cfg_string(lcfg, i)); + } + + EXIT; +} +EXPORT_SYMBOL(print_lustre_cfg); + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; +} + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker; + + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* + * There was a problem 
with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 + */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, + "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; + return; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c new file mode 100644 index 0000000000000..f1517ceef7198 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c @@ -0,0 +1,2287 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_test.c + * + * Author: Phil Schwan + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include +#include +#include + +/* This is slightly more than the number of records that can fit into a + * single llog file, because the llog_log_header takes up some of the + * space in the first block that cannot be used for the bitmap. 
*/ +static int llog_test_recnum = (LLOG_MIN_CHUNK_SIZE * 8); +static int llog_test_rand; +static struct obd_uuid uuid = { .uuid = "test_uuid" }; +static struct llog_logid cat_logid; + +struct llog_mini_rec { + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; +} __attribute__((packed)); + +static int verify_handle(char *test, struct llog_handle *llh, int num_recs) +{ + int i; + int last_idx = 0; + int active_recs = 0; + + for (i = 0; i < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr); i++) { + if (ext2_test_bit(i, LLOG_HDR_BITMAP(llh->lgh_hdr))) { + last_idx = i; + active_recs++; + } + } + + /* check the llog is sane at first, llh_count and lgh_last_idx*/ + if (llh->lgh_hdr->llh_count != active_recs) { + CERROR("%s: handle->count is %d, but there are %d recs found\n", + test, llh->lgh_hdr->llh_count, active_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_last_idx != LLOG_HDR_TAIL(llh->lgh_hdr)->lrt_index || + (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) && + llh->lgh_last_idx < last_idx)) { + CERROR("%s: lgh_last_idx is %d (%d in the header), last found %d\n", + test, llh->lgh_last_idx, + LLOG_HDR_TAIL(llh->lgh_hdr)->lrt_index, last_idx); + RETURN(-ERANGE); + } + + /* finally checks against expected value from the caller */ + if (active_recs != num_recs) { + CERROR("%s: expected %d active recs after write, found %d\n", + test, num_recs, active_recs); + RETURN(-ERANGE); + } + + RETURN(0); +} + +/* Test named-log create/open, close */ +static int llog_test_1(const struct lu_env *env, + struct obd_device *obd, char *name) +{ + struct llog_handle *llh; + struct llog_ctxt *ctxt; + int rc; + int rc2; + + ENTRY; + + CWARN("1a: create a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open_create(env, ctxt, &llh, NULL, name); + if (rc) { + CERROR("1a: llog_create with name %s failed: %d\n", name, rc); + GOTO(out, rc); + } + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("1a: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + rc = verify_handle("1", llh, 1); + + CWARN("1b: close newly-created log\n"); +out_close: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("1b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int test_2_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + return LLOG_DEL_RECORD; +} + +/* Test named-log reopen; returns opened log on success */ +static int llog_test_2(const struct lu_env *env, struct obd_device *obd, + char *name, struct llog_handle **llh) +{ + struct llog_ctxt *ctxt; + struct llog_handle *lgh; + struct llog_logid logid; + int rc; + struct llog_mini_rec lmr; + + ENTRY; + + CWARN("2a: re-open a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2a: re-open log with name %s failed: %d\n", name, rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2a: can't init llog handle: %d\n", rc); + GOTO(out_close_llh, rc); + } + + rc = verify_handle("2", *llh, 1); + if (rc) + GOTO(out_close_llh, rc); + + CWARN("2b: create a log without specified NAME & LOGID\n"); + rc = llog_open_create(env, ctxt, &lgh, NULL, NULL); + if (rc) { + CERROR("2b: create log failed\n"); + GOTO(out_close_llh, rc); + } + rc = llog_init_handle(env, lgh, 
LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2b: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid = lgh->lgh_id; + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + /* Check llog header values are correct after record add/cancel */ + CWARN("2b: write 1 llog records, check llh_count\n"); + rc = llog_write(env, lgh, &lmr.lmr_hdr, LLOG_NEXT_IDX); + if (rc < 0) + GOTO(out_close, rc); + + /* in-memory values after record addition */ + rc = verify_handle("2b", lgh, 2); + if (rc < 0) + GOTO(out_close, rc); + + /* re-open llog to read on-disk values */ + llog_close(env, lgh); + + CWARN("2c: re-open the log by LOGID and verify llh_count\n"); + rc = llog_open(env, ctxt, &lgh, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, &uuid); + if (rc < 0) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + /* check values just read from disk */ + rc = verify_handle("2c", lgh, 2); + if (rc < 0) + GOTO(out_close, rc); + + rc = llog_process(env, lgh, test_2_cancel_cb, NULL, NULL); + if (rc < 0) + GOTO(out_close, rc); + + /* in-memory values */ + rc = verify_handle("2c", lgh, 1); + if (rc < 0) + GOTO(out_close, rc); + + /* re-open llog to get on-disk values */ + llog_close(env, lgh); + + rc = llog_open(env, ctxt, &lgh, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + /* on-disk values after llog re-open */ + rc = verify_handle("2c", lgh, 1); + if (rc < 0) + GOTO(out_close, rc); + + CWARN("2d: destroy this log\n"); + rc = llog_destroy(env, lgh); + if (rc) + CERROR("2d: destroy log failed\n"); +out_close: + llog_close(env, lgh); +out_close_llh: + if (rc) + llog_close(env, *llh); +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +static int test_3_rec_num; +static off_t test_3_rec_off; +static int test_3_paddings; +static int test_3_start_idx; + +/* + * Test 3 callback. + * - check lgh_cur_offset correctness + * - check record index consistency + * - modify each record in-place + * - add new record during *last_idx processing + */ +static int test3_check_n_add_cb(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + int *last_rec = data; + unsigned cur_idx = test_3_start_idx + test_3_rec_num; + int rc; + + if (lgh->lgh_hdr->llh_flags & LLOG_F_IS_FIXSIZE) { + LASSERT(lgh->lgh_hdr->llh_size > 0); + if (lgh->lgh_cur_offset != lgh->lgh_hdr->llh_hdr.lrh_len + + (cur_idx - 1) * lgh->lgh_hdr->llh_size) + CERROR("Wrong record offset in cur_off: %llu, should be %u\n", + lgh->lgh_cur_offset, + lgh->lgh_hdr->llh_hdr.lrh_len + + (cur_idx - 1) * lgh->lgh_hdr->llh_size); + } else { + size_t chunk_size = lgh->lgh_hdr->llh_hdr.lrh_len; + + /* + * For variable size records the start offset is unknown, trust + * the first value and check others are consistent with it. 
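+ * A padding record at a chunk boundary is tolerated and counted in
+ * test_3_paddings so that the expected record index can be adjusted.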
+ */ + if (test_3_rec_off == 0) + test_3_rec_off = lgh->lgh_cur_offset; + + if (lgh->lgh_cur_offset != test_3_rec_off) { + __u64 tmp = lgh->lgh_cur_offset; + + /* there can be padding record */ + if ((do_div(tmp, chunk_size) == 0) && + (lgh->lgh_cur_offset - test_3_rec_off < + rec->lrh_len + LLOG_MIN_REC_SIZE)) { + test_3_rec_off = lgh->lgh_cur_offset; + test_3_paddings++; + } else { + CERROR("Wrong record offset in cur_off: %llu" + ", should be %lld (rec len %u)\n", + lgh->lgh_cur_offset, + (long long)test_3_rec_off, + rec->lrh_len); + } + } + test_3_rec_off += rec->lrh_len; + } + + cur_idx += test_3_paddings; + if (cur_idx != rec->lrh_index) + CERROR("Record with wrong index was read: %u, expected %u\n", + rec->lrh_index, cur_idx); + + /* modify all records in place */ + lgr->lgr_gen.conn_cnt = rec->lrh_index; + rc = llog_write(env, lgh, rec, rec->lrh_index); + if (rc < 0) + CERROR("cb_test_3: cannot modify record while processing\n"); + + /* + * Add new record to the llog at *last_rec position one by one to + * check that last block is re-read during processing + */ + if (cur_idx == *last_rec || cur_idx == (*last_rec + 1)) { + rc = llog_write(env, lgh, rec, LLOG_NEXT_IDX); + if (rc < 0) + CERROR("cb_test_3: cannot add new record while " + "processing\n"); + } + test_3_rec_num++; + + return rc; +} + +/* Check in-place modifications were done for all records*/ +static int test3_check_cb(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + if (lgr->lgr_gen.conn_cnt != rec->lrh_index) { + CERROR("cb_test_3: record %u is not modified\n", + rec->lrh_index); + return -EINVAL; + } + test_3_rec_num++; + return 0; +} + +static int llog_test3_process(const struct lu_env *env, + struct llog_handle *lgh, + llog_cb_t cb, int start) +{ + struct llog_process_cat_data cd; + int last_idx; /* new record will be injected here */ + int rc = 0; + + CWARN("test3: processing records from index %d to the end\n", + start); + cd.lpcd_first_idx = start - 1; + cd.lpcd_last_idx = 0; + test_3_rec_num = test_3_paddings = 0; + last_idx = lgh->lgh_last_idx; + rc = llog_process(env, lgh, cb, &last_idx, &cd); + if (rc < 0) + return rc; + CWARN("test3: total %u records processed with %u paddings\n", + test_3_rec_num, test_3_paddings); + return test_3_rec_num; +} + +/* Test plain llog functionality */ +static int llog_test_3(const struct lu_env *env, struct obd_device *obd, + struct llog_handle *llh) +{ + char buf[128]; + struct llog_rec_hdr *hdr = (void *)buf; + int rc, i; + int num_recs = 1; /* 1 for the header */ + int expected; + + ENTRY; + + hdr->lrh_len = sizeof(struct llog_gen_rec); + hdr->lrh_type = LLOG_GEN_REC; + llh->lgh_hdr->llh_size = sizeof(struct llog_gen_rec); + llh->lgh_hdr->llh_flags |= LLOG_F_IS_FIXSIZE; + + /* + * Fill the llog with 64-bytes records, use 1023 records, + * so last chunk will be partially full. Don't change this + * value until record size is changed. 
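+ * With 64-byte records the last chunk ends up exactly one record short
+ * of full, which the NB note below relies on when test3_check_n_add_cb
+ * appends two extra records.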
+ */ + CWARN("3a: write 1023 fixed-size llog records\n"); + for (i = 0; i < 1023; i++) { + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("3a: write 1023 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3a", llh, num_recs); + if (rc) + RETURN(rc); + + /* + * Test fixed-size records processing: + * - search the needed index + * - go through all records from that index + * - check all indices are growing monotonically and exist + * - modify each record + * + * NB: test3_check_n_add adds two new records while processing + * after last record. There were 1023 records created so the last chunk + * misses exactly one record. Therefore one of new records will be + * the last in the current chunk and second causes the new chunk to be + * created. + */ + test_3_rec_off = 0; + test_3_start_idx = 501; + expected = 525; + rc = llog_test3_process(env, llh, test3_check_n_add_cb, + test_3_start_idx); + if (rc < 0) + RETURN(rc); + + /* extra record is created during llog_process() */ + if (rc != expected) { + CERROR("3a: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + num_recs += 2; + + /* test modification in place */ + rc = llog_test3_process(env, llh, test3_check_cb, test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3a: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + CWARN("3b: write 566 variable size llog records\n"); + + /* + * Drop llh_size to 0 to mark llog as variable-size and write + * header to make this change permanent. + */ + llh->lgh_hdr->llh_flags &= ~LLOG_F_IS_FIXSIZE; + llog_write(env, llh, &llh->lgh_hdr->llh_hdr, LLOG_HEADER_IDX); + + hdr->lrh_type = OBD_CFG_REC; + + /* + * there are 1025 64-bytes records in llog already, + * the last chunk contains single record, i.e. 64 bytes. + * Each pair of variable size records is 200 bytes, so + * we will have the following distribution per chunks: + * block 1: 64 + 80(80/120) + 80 + 48(pad) = 81 iterations + * block 2: 80(120/80) + 120 + 72(pad) = 81 itereations + * block 3: 80(80/120) + 80 + 112(pad) = 81 iterations + * -- the same as block 2 again and so on. + * block 7: 80(80/120) = 80 iterations and 192 bytes remain + * Total 6 * 81 + 80 = 566 itereations. + * Callback will add another 120 bytes in the end of the last chunk + * and another 120 bytes will cause padding (72 bytes) plus 120 + * bytes in the new block. 
+ */ + for (i = 0; i < 566; i++) { + if ((i % 2) == 0) + hdr->lrh_len = 80; + else + hdr->lrh_len = 120; + + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("3b: write 566 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3b", llh, num_recs); + if (rc) + RETURN(rc); + + test_3_start_idx = 1026; + expected = 568; + rc = llog_test3_process(env, llh, test3_check_n_add_cb, + test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3b: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + num_recs += 2; + + /* test modification in place */ + rc = llog_test3_process(env, llh, test3_check_cb, test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3b: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + CWARN("3c: write records with variable size until BITMAP_SIZE, " + "return -ENOSPC\n"); + while (num_recs < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + if ((num_recs % 2) == 0) + hdr->lrh_len = 80; + else + hdr->lrh_len = 128; + + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("3c: write recs failed at #%d: %d\n", + num_recs, rc); + RETURN(rc); + } + num_recs++; + } + + if (rc != -ENOSPC) { + CWARN("3c: write record more than BITMAP size!\n"); + RETURN(-EINVAL); + } + CWARN("3c: wrote %d more records before end of llog is reached\n", + num_recs); + + rc = verify_handle("3c", llh, num_recs); + + RETURN(rc); +} + +/* Test catalogue additions */ +static int llog_test_4(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath, *llh; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr *rec; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + sprintf(name, "%x", llog_test_rand + 1); + CWARN("4a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("4a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("4a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + num_recs++; + cat_logid = cath->lgh_id; + + CWARN("4b: write 1 record into the catalog\n"); + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie); + if (rc != 1) { + CERROR("4b: write 1 catalog record failed at: %d\n", rc); + GOTO(out, rc); + } + num_recs++; + rc = verify_handle("4b", cath, 2); + if (rc) + GOTO(out, rc); + + rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + /* estimate the max number of record for the plain llog + * cause it depends on disk size + */ + llh = cath->u.chd.chd_current_log; + if (llh->lgh_max_size != 0) { + llog_test_recnum = (llh->lgh_max_size - + sizeof(struct llog_log_hdr)) / LLOG_MIN_REC_SIZE; + } + + if (llog_test_recnum >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) + llog_test_recnum = LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; + + CWARN("4c: cancel 1 log record\n"); + rc = llog_cat_cancel_records(env, cath, 1, &cookie); + if (rc) { + CERROR("4c: cancel 1 catalog based record failed: %d\n", rc); + GOTO(out, rc); + } + num_recs--; + + rc = 
verify_handle("4c", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4d: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("4d: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + num_recs++; + } + + /* make sure new plain llog appears */ + rc = verify_handle("4d", cath, 3); + if (rc) + GOTO(out, rc); + + CWARN("4e: add 5 large records, one record per block\n"); + buflen = LLOG_MIN_CHUNK_SIZE; + OBD_ALLOC(buf, buflen); + if (buf == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < 5; i++) { + rec = (void *)buf; + rec->lrh_len = buflen; + rec->lrh_type = OBD_CFG_REC; + rc = llog_cat_add(env, cath, rec, NULL); + if (rc) { + CERROR("4e: write 5 records failed at #%d: %d\n", + i + 1, rc); + GOTO(out_free, rc); + } + num_recs++; + } +out_free: + OBD_FREE(buf, buflen); +out: + CWARN("4f: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("4: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int cat_counter; + +static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + cat_counter++; + + RETURN(0); +} + +static int plain_counter; + +static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n", + rec->lrh_index, PFID(&fid)); + + plain_counter++; + + RETURN(0); +} + +static int cancel_count; + +static int llog_cancel_rec_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_cookie cookie; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + cookie.lgc_lgl = llh->lgh_id; + cookie.lgc_index = rec->lrh_index; + + llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); + cancel_count++; + if (cancel_count == llog_test_recnum) + RETURN(-LLOG_EEMPTY); + RETURN(0); +} + +/* Test log and catalogue processing */ +static int llog_test_5(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + CWARN("5a: re-open catalog by id\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("5a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("5a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5b: 
print the catalog entries.. we expect 2\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5b: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("5b: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5c: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("5c: process with llog_cancel_rec_cb failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5c: print the catalog entries.. we expect 1\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5c: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("5c: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5d: add 1 record to the log with many canceled empty pages\n"); + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("5d: add record to the log with many canceled empty " + "pages failed\n"); + GOTO(out, rc); + } + + CWARN("5e: print plain log entries.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0); + if (rc) { + CERROR("5e: process with plain_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5e: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5f: print plain log entries reversely.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar"); + if (rc) { + CERROR("5f: reversely process with plain_print_cb failed: " + "%d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5f: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + +out: + CWARN("5g: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("5g: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test client api; open log by name and process */ +static int llog_test_6(const struct lu_env *env, struct obd_device *obd, + char *name) +{ + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid; + + CWARN("6a: re-open log %s using client API\n", name); + mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL); + if (mgc_obd == NULL) { + CERROR("6a: no MGC devices connected to %s found.\n", + mgs_uuid->uuid); + GOTO(ctxt_release, rc = -ENOENT); + } + + rc = obd_connect(NULL, &exp, mgc_obd, &uuid, + NULL /* obd_connect_data */, NULL); + if (rc != -EALREADY) { + CERROR("6a: connect on connected MGC (%s) failed to return" + " -EALREADY\n", mgc_obd->obd_name); + if (rc == 0) + obd_disconnect(exp); + GOTO(ctxt_release, rc = -EINVAL); + } + + nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("6a: llog_open failed %d\n", rc); + GOTO(nctxt_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) { + CERROR("6a: llog_init_handle failed %d\n", rc); + 
GOTO(parse_out, rc); + } + + plain_counter = 1; /* llog header is first record */ + CWARN("6b: process log %s using client API\n", name); + rc = llog_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6b: llog_process failed %d\n", rc); + CWARN("6b: processed %d records\n", plain_counter); + + rc = verify_handle("6b", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + + plain_counter = 1; /* llog header is first record */ + CWARN("6c: process log %s reversely using client API\n", name); + rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6c: llog_reverse_process failed %d\n", rc); + CWARN("6c: processed %d records\n", plain_counter); + + rc = verify_handle("6c", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + +parse_out: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("6: llog_close failed: rc = %d\n", rc2); + if (rc == 0) + rc = rc2; + } +nctxt_put: + llog_ctxt_put(nctxt); +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static union { + struct llog_rec_hdr lrh; /* common header */ + struct llog_logid_rec llr; /* LLOG_LOGID_MAGIC */ + struct llog_unlink64_rec lur; /* MDS_UNLINK64_REC */ + struct llog_setattr64_rec lsr64; /* MDS_SETATTR64_REC */ + struct llog_setattr64_rec_v2 lsr64_v2; /* MDS_SETATTR64_REC */ + struct llog_size_change_rec lscr; /* OST_SZ_REC */ + struct llog_changelog_rec lcr; /* CHANGELOG_REC */ + struct llog_changelog_user_rec lcur; /* CHANGELOG_USER_REC */ + struct llog_gen_rec lgr; /* LLOG_GEN_REC */ +} llog_records; + +static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n", + rec->lrh_type, rec->lrh_index, PFID(&fid)); + + plain_counter++; + return 0; +} + +static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + /* test LLOG_DEL_RECORD is working */ + return LLOG_DEL_RECORD; +} + +static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc) { + CERROR("7_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc) { + CERROR("7_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + for (i = 0; i < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr); i++) { + rc = llog_write(env, llh, &llog_records.lrh, LLOG_NEXT_IDX); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("7_sub: write recs failed at #%d: %d\n", + i + 1, rc); + GOTO(out_close, rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("7_sub: write record more than BITMAP size!\n"); + GOTO(out_close, rc = -EINVAL); + } + + rc = verify_handle("7_sub", llh, num_recs + 1); + if (rc) { + CERROR("7_sub: verify handle failed: %d\n", rc); + GOTO(out_close, rc); + } + if (num_recs < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1) + CWARN("7_sub: records are not aligned, written %d from %u\n", + num_recs, LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1); + + plain_counter = 0; + rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + process_count = plain_counter; + if (process_count != num_recs) { + CERROR("7_sub: 
processed %d records from %d total\n", + process_count, num_recs); + GOTO(out_close, rc = -EINVAL); + } + + plain_counter = 0; + rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL); + if (rc && rc != LLOG_DEL_PLAIN) { + CERROR("7_sub: reverse llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + if (process_count != plain_counter) { + CERROR("7_sub: Reverse/direct processing found different" + "number of records: %d/%d\n", + plain_counter, process_count); + GOTO(out_close, rc = -EINVAL); + } + if (llog_exist(llh)) { + CERROR("7_sub: llog exists but should be zapped\n"); + GOTO(out_close, rc = -EEXIST); + } + + rc = verify_handle("7_sub", llh, 1); +out_close: + if (rc) + llog_destroy(env, llh); + llog_close(env, llh); + RETURN(rc); +} + +/* Test all llog records writing and processing */ +static int llog_test_7(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("7a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7b: test llog_unlink64_rec\n"); + llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur); + llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur); + llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7b: llog_unlink_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7c: test llog_setattr64_rec\n"); + llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7c: llog_setattr64_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7d: test llog_size_change_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7d: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7e: test llog_changelog_rec\n"); + /* Direct access to cr_do_not_use: peculiar case for this test */ + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_do_not_use.lrt_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7e: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7f: test llog_changelog_user_rec\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7f: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7g: test llog_gen_rec\n"); + llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7g: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7h: test 
llog_setattr64_rec_v2\n"); + llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64_v2); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64_v2); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7h: llog_setattr64_rec_v2 test failed\n"); + GOTO(out, rc); + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + return 0; +} + +static int llog_test_8(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2, i; + int orig_counter; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct dt_object *obj = NULL; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + CWARN("8a: fill the first plain llog\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + plain_counter = 0; + rc = llog_cat_process(env, llh, test_8_cb, "foobar", 0, 0); + if (rc != 0) { + CERROR("5a: process with test_8_cb failed: %d\n", rc); + GOTO(out, rc); + } + orig_counter = plain_counter; + + for (i = 0; i < 100; i++) { + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("5a: add record failed\n"); + GOTO(out, rc); + } + } + + /* grab the current plain llog, we'll corrupt it later */ + obj = llh->u.chd.chd_current_log->lgh_obj; + LASSERT(obj); + lu_object_get(&obj->do_lu); + CWARN("8a: pin llog "DFID"\n", PFID(lu_object_fid(&obj->do_lu))); + + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8a: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + GOTO(out_put, rc); + } + + CWARN("8b: fill the second plain llog\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8b: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8b: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + for (i = 0; i < 100; i++) { + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("8b: add record failed\n"); + GOTO(out, rc); + } + } + CWARN("8b: second llog "DFID"\n", + PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); + + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + GOTO(out_put, rc); + } + + /* Here was 8c: drop two records from the first plain llog + * llog_truncate was bad idea cause it creates a wrong state, + * lgh_last_idx is wrong and two records belongs to zeroed buffer + */ + + CWARN("8d: count survived records\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8d: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8d: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + plain_counter = 0; + rc = llog_cat_process(env, llh, test_8_cb, "foobar", 0, 0); + if (rc != 0) { + CERROR("8d: process with test_8_cb 
failed: %d\n", rc); + GOTO(out, rc); + } + + if (orig_counter + 200 != plain_counter) { + CERROR("found %d records (expected %d)\n", plain_counter, + orig_counter + 200); + rc = -EIO; + } + +out: + CWARN("8d: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8d: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + if (obj != NULL) + dt_object_put(env, obj); + + RETURN(rc); +} + +static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc != 0) { + CERROR("9_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc != 0) { + CERROR("9_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid_to_fid(&llh->lgh_id, &fid); + fid_to_logid(&fid, &llog_records.llr.lid_id); + rc = llog_write(env, llh, &llog_records.lrh, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("9_sub: write recs failed at #1: %d\n", rc); + GOTO(out_close, rc); + } + CWARN("9_sub: record type %x in log "DFID_NOBRACE"\n", + llog_records.lrh.lrh_type, PFID(&fid)); +out_close: + llog_close(env, llh); + RETURN(rc); +} + +/* Prepare different types of llog records for llog_reader test*/ +static int llog_test_9(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("9a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9b: test llog_obd_cfg_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OBD_CFG_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9b: llog_obd_cfg_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9c: test llog_changelog_rec\n"); + /* Direct access to cr_do_not_use: peculiar case for this test */ + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_do_not_use.lrt_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9c: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9d: test llog_changelog_user_rec\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9d: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + + +static int llog_test_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + int rc; + + rc = llog_cat_process_or_fork(NULL, lpi->lpi_loghandle, lpi->lpi_cb, + NULL, 
lpi->lpi_cbdata, 1, 0, true); + + complete(&lpi->lpi_completion); + + lpi->lpi_rc = rc; + if (rc) + CWARN("10h: Error during catalog processing %d\n", rc); + return rc; +} + +static int cat_check_old_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + struct lu_fid *prev_fid = data; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + if (prev_fid->f_oid > fid.f_oid) { + CWARN("processing old record, fail\n"); + prev_fid->f_oid = 0xbad; + RETURN(-LLOG_EEMPTY); + } + + if (prev_fid->f_oid == 0) { + cfs_fail_loc = OBD_FAIL_ONCE | OBD_FAIL_LLOG_PROCESS_TIMEOUT; + cfs_fail_val = (unsigned int) (llh->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF); + msleep(1 * MSEC_PER_SEC); + } + *prev_fid = fid; + + RETURN(0); +} + +/* test catalog wrap around */ +static int llog_test_10(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, enospc, eok; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct lu_attr la; + __u64 cat_max_size; + struct dt_device *dt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + + snprintf(name, sizeof(name), "%x", llog_test_rand + 2); + CWARN("10a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("10a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("10a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + cat_logid = cath->lgh_id; + dt = lu2dt_dev(cath->lgh_obj->do_lu.lo_dev); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10c: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* force catalog wrap for 5th plain LLOG */ + cfs_fail_loc = CFS_FAIL_SKIP|OBD_FAIL_CAT_RECORDS; + cfs_fail_val = 4; + + CWARN("10b: write %d log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("10b: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + } + + /* make sure 2 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10b", cath, 3); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10b: sync failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("10c: write %d more log records\n", 2 * llog_test_recnum); + for (i = 0; i < 2 * llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("10c: write %d records failed at #%d: %d\n", + 2*llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + } + + /* make sure 2 new plain llog appears in catalog (+1 with 
hdr) */ + rc = verify_handle("10c", cath, 5); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10c: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* + * fill last allocated plain LLOG and reach -ENOSPC condition + * because no slot available in Catalog + */ + enospc = 0; + eok = 0; + CWARN("10c: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10c: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10c: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10c: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + /* make sure no new record in Catalog */ + rc = verify_handle("10c", cath, 5); + if (rc) + GOTO(out, rc); + + /* Catalog should have reached its max size for test */ + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10c: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + cat_max_size = la.la_size; + + /* + * cancel all 1st plain llog records to empty it, this will also cause + * its catalog entry to be freed for next forced wrap in 10e + */ + CWARN("10d: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10d: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10d: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10d: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10d: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10d", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10d: sync failed: %d\n", rc); + GOTO(out, rc); + } + + enospc = 0; + eok = 0; + CWARN("10e: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10e: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10e: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10e: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + CWARN("10e: print the catalog entries.. we expect 4\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10e: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 4) { + CERROR("10e: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* make sure 1 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10e", cath, 5); + if (rc) + GOTO(out, rc); + + /* verify catalog has wrap around */ + if (cath->lgh_last_idx > cath->lgh_hdr->llh_cat_idx) { + CERROR("10e: catalog failed to wrap around\n"); + GOTO(out, rc = -EINVAL); + } + + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10e: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + + if (la.la_size != cat_max_size) { + CERROR("10e: catalog size has changed after it has wrap around," + " current size = %llu, expected size = %llu\n", + la.la_size, cat_max_size); + GOTO(out, rc = -EINVAL); + } + CWARN("10e: catalog successfully wrap around, last_idx %d, first %d\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10e: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* + * cancel more records to free one more slot in Catalog + * see if it is re-allocated when adding more records + */ + CWARN("10f: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10f: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10f: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10f: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10f: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10f", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10f: sync failed: %d\n", rc); + GOTO(out, rc); + } + + enospc = 0; + eok = 0; + CWARN("10f: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10f: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10f: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10f: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + /* make sure 1 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10f", cath, 5); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = llh_cat_idx = 2 now */ + if (cath->lgh_last_idx != cath->lgh_hdr->llh_cat_idx || + cath->lgh_last_idx != 2) { + CERROR("10f: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 2\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10f: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + + if (la.la_size != cat_max_size) { + CERROR("10f: catalog size has changed after it has wrap around," + " current size = %llu, expected size = %llu\n", + la.la_size, cat_max_size); + GOTO(out, rc = -EINVAL); + } + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10f: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* will llh_cat_idx also successfully wrap ? */ + + /* + * cancel all records in the plain LLOGs referenced by 2 last indexes in + * Catalog + */ + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* need to indicate error if for any reason llog_test_recnum is + * not reached */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 4); + if (rc) + GOTO(out, rc); + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10g: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. we expect 2\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 3); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = 2 and llh_cat_idx = 0 now */ + if (cath->lgh_hdr->llh_cat_idx != 0 || + cath->lgh_last_idx != 2) { + CERROR("10g: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 0\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + /* + * sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs + */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10g: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. 
we expect 1\n"); + cat_counter = 0; + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 2); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = 2 and llh_cat_idx = 1 now */ + if (cath->lgh_hdr->llh_cat_idx != 1 || + cath->lgh_last_idx != 2) { + CERROR("10g: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 1\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + CWARN("10g: llh_cat_idx has also successfully wrapped!\n"); + + /* + * catalog has only one valid entry other slots has outdated + * records. Trying to race the llog_thread_process with llog_add + * llog_thread_process read buffer and loop record on it. + * llog_add adds a record and mark a record in bitmap. + * llog_thread_process process record with old data. + */ + { + struct llog_process_info lpi; + struct lu_fid test_fid = {0}; + + lpi.lpi_loghandle = cath; + lpi.lpi_cb = cat_check_old_cb; + lpi.lpi_catdata = NULL; + lpi.lpi_cbdata = &test_fid; + init_completion(&lpi.lpi_completion); + + kthread_run(llog_test_process_thread, &lpi, "llog_test_process_thread"); + + msleep(1 * MSEC_PER_SEC / 2); + enospc = 0; + eok = 0; + CWARN("10h: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10h: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10h: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10h: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + wait_for_completion(&lpi.lpi_completion); + + if (lpi.lpi_rc != 0) { + CERROR("10h: race happened, old record was processed\n"); + GOTO(out, rc = -EINVAL); + } + } +out: + cfs_fail_loc = 0; + cfs_fail_val = 0; + + CWARN("10: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("10: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* + * ------------------------------------------------------------------------- + * Tests above, boring obd functions below + * ------------------------------------------------------------------------- + */ +static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; + + ENTRY; + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + sprintf(name, "%x", llog_test_rand); + + rc = llog_test_1(env, obd, name); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_2(env, obd, name, &llh); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_3(env, obd, llh); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_4(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_5(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_6(env, obd, name); + if (rc) + GOTO(cleanup, 
rc); + + rc = llog_test_7(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_8(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_9(env, obd); + if (rc != 0) + GOTO(cleanup, rc); + + rc = llog_test_10(env, obd); + if (rc) + GOTO(cleanup, rc); + +cleanup: + err = llog_destroy(env, llh); + if (err) + CERROR("cleanup: llog_destroy failed: %d\n", err); + llog_close(env, llh); + if (rc == 0) + rc = err; +cleanup_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +static int llog_test_cleanup(struct obd_device *obd) +{ + struct obd_device *tgt; + struct lu_env env; + int rc; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; + rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT)); + if (rc) + CERROR("failed to llog_test_llog_finish: %d\n", rc); + lu_env_fini(&env); + RETURN(rc); +} + +static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_device *tgt; + struct llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; + + ENTRY; + + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + rc = lu_context_init(&test_session, LCT_SERVER_SESSION); + if (rc) + GOTO(cleanup_env, rc); + test_session.lc_thread = (struct ptlrpc_thread *)current; + lu_context_enter(&test_session); + env.le_ses = &test_session; + + CWARN("Setup llog-test device over %s device\n", + lustre_cfg_string(lcfg, 1)); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); + + rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt, + &llog_osd_ops); + if (rc) + GOTO(cleanup_session, rc); + + /* use MGS llog dir for tests */ + ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + o = ctxt->loc_dir; + llog_ctxt_put(ctxt); + + ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = o; + llog_ctxt_put(ctxt); + + llog_test_rand = cfs_rand(); + + rc = llog_run_tests(&env, tgt); + if (rc) + llog_test_cleanup(obd); +cleanup_session: + lu_context_exit(&test_session); + lu_context_fini(&test_session); +cleanup_env: + lu_env_fini(&env); + RETURN(rc); +} + +static struct obd_ops llog_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, +}; + +static int __init llog_test_init(void) +{ + return class_register_type(&llog_obd_ops, NULL, false, NULL, + "llog_test", NULL); +} + +static void __exit llog_test_exit(void) +{ + class_unregister_type("llog_test"); +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre Log test module"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(llog_test_init); +module_exit(llog_test_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c new file mode 100644 index 0000000000000..04c25ebd88274 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c @@ -0,0 +1,973 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "local_storage.h" + +/* all initialized local storages on this node are linked on this */ +static LIST_HEAD(ls_list_head); +static DEFINE_MUTEX(ls_list_mutex); + +static int ls_object_init(const struct lu_env *env, struct lu_object *o, + const struct lu_object_conf *unused) +{ + struct ls_device *ls; + struct lu_object *below; + struct lu_device *under; + + ENTRY; + + ls = container_of0(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev); + under = &ls->ls_osd->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + if (below == NULL) + RETURN(-ENOMEM); + + lu_object_add(o, below); + + RETURN(0); +} + +static void ls_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct ls_object *obj = lu2ls_obj(o); + struct lu_object_header *h = o->lo_header; + + dt_object_fini(&obj->ls_obj); + lu_object_header_fini(h); + OBD_FREE_PTR(obj); +} + +static struct lu_object_operations ls_lu_obj_ops = { + .loo_object_init = ls_object_init, + .loo_object_free = ls_object_free, +}; + +static struct lu_object *ls_object_alloc(const struct lu_env *env, + const struct lu_object_header *_h, + struct lu_device *d) +{ + struct lu_object_header *h; + struct ls_object *o; + struct lu_object *l; + + LASSERT(_h == NULL); + + OBD_ALLOC_PTR(o); + if (o != NULL) { + l = &o->ls_obj.do_lu; + h = &o->ls_header; + + lu_object_header_init(h); + dt_object_init(&o->ls_obj, h, d); + lu_object_add_top(h, l); + + l->lo_ops = &ls_lu_obj_ops; + + return l; + } else { + return NULL; + } +} + +static struct lu_device_operations ls_lu_dev_ops = { + .ldo_object_alloc = ls_object_alloc +}; + +static struct ls_device *__ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls, *ret = NULL; + + list_for_each_entry(ls, &ls_list_head, ls_linkage) { + if (ls->ls_osd == dev) { + atomic_inc(&ls->ls_refcount); + ret = ls; + break; + } + } + return ret; +} + +struct ls_device *ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls; + + 
mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + mutex_unlock(&ls_list_mutex); + + return ls; +} + +static struct lu_device_type_operations ls_device_type_ops = { + .ldto_start = NULL, + .ldto_stop = NULL, +}; + +static struct lu_device_type ls_lu_type = { + .ldt_name = "local_storage", + .ldt_ops = &ls_device_type_ops, +}; + +struct ls_device *ls_device_get(struct dt_device *dev) +{ + struct ls_device *ls; + + ENTRY; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + if (ls) + GOTO(out_ls, ls); + + /* not found, then create */ + OBD_ALLOC_PTR(ls); + if (ls == NULL) + GOTO(out_ls, ls = ERR_PTR(-ENOMEM)); + + atomic_set(&ls->ls_refcount, 1); + INIT_LIST_HEAD(&ls->ls_los_list); + mutex_init(&ls->ls_los_mutex); + + ls->ls_osd = dev; + + LASSERT(dev->dd_lu_dev.ld_site); + lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type); + ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops; + ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site; + + /* finally add ls to the list */ + list_add(&ls->ls_linkage, &ls_list_head); +out_ls: + mutex_unlock(&ls_list_mutex); + RETURN(ls); +} + +void ls_device_put(const struct lu_env *env, struct ls_device *ls) +{ + LASSERT(env); + if (!atomic_dec_and_test(&ls->ls_refcount)) + return; + + mutex_lock(&ls_list_mutex); + if (atomic_read(&ls->ls_refcount) == 0) { + LASSERT(list_empty(&ls->ls_los_list)); + list_del(&ls->ls_linkage); + lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0); + lu_device_fini(&ls->ls_top_dev.dd_lu_dev); + OBD_FREE_PTR(ls); + } + mutex_unlock(&ls_list_mutex); +} + +/** + * local file fid generation + */ +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid) +{ + LASSERT(los->los_dev); + LASSERT(los->los_obj); + + /* take next OID */ + + /* to make it unique after reboot we store + * the latest generated fid atomically with + * object creation see local_object_create() */ + + mutex_lock(&los->los_id_lock); + fid->f_seq = los->los_seq; + fid->f_oid = ++los->los_last_oid; + fid->f_ver = 0; + mutex_unlock(&los->los_id_lock); + + return 0; +} + +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + int rc; + + ENTRY; + + /* update fid generation file */ + if (los != NULL) { + LASSERT(dt_object_exists(los->los_obj)); + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(struct los_ondisk); + rc = dt_declare_record_write(env, los->los_obj, + &dti->dti_lb, 0, th); + if (rc) + RETURN(rc); + } + + rc = dt_declare_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(dti->dti_lma); + rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th); + + RETURN(rc); +} + +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + u64 lastid; + int rc; + + ENTRY; + + rc = dt_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + if (los == NULL) + RETURN(rc); + + LASSERT(los->los_obj); + LASSERT(dt_object_exists(los->los_obj)); + + /* many threads can be updated this, serialize + * them here to avoid the race where one thread + * takes the value first, but writes it last */ + mutex_lock(&los->los_id_lock); + + /* update local oid number 
on disk so that + * we know the last one used after reboot */ + lastid = cpu_to_le64(los->los_last_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off, + th); + mutex_unlock(&los->los_id_lock); + + RETURN(rc); +} + +/* + * Create local named object (file, directory or index) in parent directory. + */ +static struct dt_object *__local_file_create(const struct lu_env *env, + const struct lu_fid *fid, + struct local_oid_storage *los, + struct ls_device *ls, + struct dt_object *parent, + const char *name, + struct lu_attr *attr, + struct dt_object_format *dof) +{ + struct dt_thread_info *dti = dt_info(env); + struct lu_object_conf *conf = &dti->dti_conf; + struct dt_insert_rec *rec = &dti->dti_dt_rec; + struct dt_object *dto; + struct thandle *th; + int rc; + + /* We know that the target object does not exist, to be created, + * then give some hints - LOC_F_NEW to help low layer to handle + * that efficiently and properly. */ + memset(conf, 0, sizeof(*conf)); + conf->loc_flags = LOC_F_NEW; + dto = ls_locate(env, ls, fid, conf); + if (unlikely(IS_ERR(dto))) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + GOTO(out, rc = -EEXIST); + + th = dt_trans_create(env, ls->ls_osd); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + rc = dt_declare_ref_add(env, dto, th); + if (rc < 0) + GOTO(trans_stop, rc); + + rc = dt_declare_ref_add(env, parent, th); + if (rc < 0) + GOTO(trans_stop, rc); + } + + rec->rec_fid = fid; + rec->rec_type = attr->la_mode & S_IFMT; + rc = dt_declare_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + if (!dt_try_as_dir(env, dto)) + GOTO(trans_stop, rc = -ENOTDIR); + + rec->rec_type = S_IFDIR; + rec->rec_fid = fid; + rc = dt_declare_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)".", th); + if (rc != 0) + GOTO(trans_stop, rc); + + rec->rec_fid = lu_object_fid(&parent->do_lu); + rc = dt_declare_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)"..", th); + if (rc != 0) + GOTO(trans_stop, rc); + + rc = dt_declare_ref_add(env, dto, th); + if (rc != 0) + GOTO(trans_stop, rc); + } + + rc = dt_trans_start_local(env, ls->ls_osd, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, LOS_CHILD); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", + PFID(lu_object_fid(&dto->do_lu))); + rc = local_object_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); + + if (dti->dti_dof.dof_type == DFT_DIR) { + + rec->rec_type = S_IFDIR; + rec->rec_fid = fid; + /* Add "." and ".." 
for newly created dir */ + rc = dt_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)".", th); + if (rc != 0) + GOTO(destroy, rc); + + dt_ref_add(env, dto, th); + rec->rec_fid = lu_object_fid(&parent->do_lu); + rc = dt_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)"..", th); + if (rc != 0) + GOTO(destroy, rc); + } + + rec->rec_fid = fid; + rec->rec_type = dto->do_lu.lo_header->loh_attr; + dt_write_lock(env, parent, LOS_PARENT); + rc = dt_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + if (dti->dti_dof.dof_type == DFT_DIR) + dt_ref_add(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(destroy, rc); +destroy: + if (rc) + dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, ls->ls_osd, th); +out: + if (rc) { + dt_object_put_nocache(env, dto); + dto = ERR_PTR(rc); + } + RETURN(dto); +} + +struct dt_object *local_file_find(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (!rc) + dto = ls_locate(env, dt2ls_dev(los->los_dev), + &dti->dti_fid, NULL); + else + dto = ERR_PTR(rc); + + return dto; +} +EXPORT_SYMBOL(local_file_find); + +/* + * Look up and create (if it does not exist) a local named file or directory in + * parent directory. + */ +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + dto = local_file_find(env, los, parent, name); + if (!IS_ERR(dto) || PTR_ERR(dto) != -ENOENT) + return dto; + + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), parent, name, + &dti->dti_attr, &dti->dti_dof); + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create); + +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + dto = dt_locate(env, dt, &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + dt_object_put_nocache(env, dto); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create_with_fid); + +/* + * Look up and create (if it 
does not exist) a local named index file in parent + * directory. + */ +struct dt_object *local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), + &dti->dti_fid, NULL); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; + +} +EXPORT_SYMBOL(local_index_find_or_create); + +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + if (!lu_fid_eq(fid, &dti->dti_fid)) + dto = ERR_PTR(-EINVAL); + else + dto = dt_locate(env, dt, fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + dt_object_put_nocache(env, dto); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_index_find_or_create_with_fid); + +static int local_object_declare_unlink(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + struct dt_object *c, const char *name, + struct thandle *th) +{ + int rc; + + rc = dt_declare_delete(env, p, (const struct dt_key *)name, th); + if (rc < 0) + return rc; + + rc = dt_declare_ref_del(env, c, th); + if (rc < 0) + return rc; + + return dt_declare_destroy(env, c, th); +} + +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == -ENOENT) + RETURN(0); + else if (rc < 0) + RETURN(rc); + + dto = dt_locate(env, dt, &dti->dti_fid); + if (unlikely(IS_ERR(dto))) + RETURN(PTR_ERR(dto)); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_unlink(env, dt, parent, dto, name, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dt, th); 
+ if (rc < 0) + GOTO(stop, rc); + + dt_write_lock(env, dto, 0); + rc = dt_delete(env, parent, (struct dt_key *)name, th); + if (rc < 0) + GOTO(unlock, rc); + + rc = dt_ref_del(env, dto, th); + if (rc < 0) { + struct dt_insert_rec *rec = &dti->dti_dt_rec; + + rec->rec_fid = &dti->dti_fid; + rec->rec_type = dto->do_lu.lo_header->loh_attr; + rc = dt_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + GOTO(unlock, rc); + } + + rc = dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +stop: + dt_trans_stop(env, dt, th); +out: + dt_object_put_nocache(env, dto); + return rc; +} +EXPORT_SYMBOL(local_object_unlink); + +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq) +{ + struct local_oid_storage *los, *ret = NULL; + + list_for_each_entry(los, &ls->ls_los_list, los_list) { + if (los->los_seq == seq) { + atomic_inc(&los->los_refcount); + ret = los; + break; + } + } + return ret; +} + +void dt_los_put(struct local_oid_storage *los) +{ + if (atomic_dec_and_test(&los->los_refcount)) + /* should never happen, only local_oid_storage_fini should + * drop refcount to zero */ + LBUG(); + return; +} + +/* after Lustre 2.3 release there may be old file to store last generated FID + * If such file exists then we have to read its content + */ +static int lastid_compat_check(const struct lu_env *env, struct dt_device *dev, + __u64 lastid_seq, __u32 *first_oid, + struct ls_device *ls) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *root = NULL; + struct los_ondisk losd; + struct dt_object *o = NULL; + int rc = 0; + + rc = dt_root_get(env, dev, &dti->dti_fid); + if (rc) + return rc; + + root = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(root)) + return PTR_ERR(root); + + /* find old last_id file */ + snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-%#llx-lastid", + lastid_seq); + rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid); + dt_object_put_nocache(env, root); + if (rc == -ENOENT) { + /* old llog lastid accessed by FID only */ + if (lastid_seq != FID_SEQ_LLOG) + return 0; + dti->dti_fid.f_seq = FID_SEQ_LLOG; + dti->dti_fid.f_oid = 1; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + return PTR_ERR(o); + + if (!dt_object_exists(o)) { + dt_object_put_nocache(env, o); + return 0; + } + CDEBUG(D_INFO, "Found old llog lastid file\n"); + } else if (rc < 0) { + return rc; + } else { + CDEBUG(D_INFO, "Found old lastid file for sequence %#llx\n", + lastid_seq); + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + return PTR_ERR(o); + } + /* let's read seq-NNNNNN-lastid file value */ + LASSERT(dt_object_exists(o)); + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) { + CERROR("%s: wrong content of seq-%#llx-lastid file, magic %x\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, + le32_to_cpu(losd.lso_magic)); + rc = -EINVAL; + } else if (rc < 0) { + CERROR("%s: failed to read seq-%#llx-lastid: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, rc); + } + dt_object_put_nocache(env, o); + if (rc == 0) + *first_oid = le32_to_cpu(losd.lso_next_oid); + return rc; +} + +/** + * Initialize local OID storage for required sequence. + * That may be needed for services that uses local files and requires + * dynamic OID allocation for them. 
+ * + * Per each sequence we have an object with 'first_fid' identificator + * containing the counter for OIDs of locally created files with that + * sequence. + * + * It is used now by llog subsystem and MGS for NID tables + * + * Function gets first_fid to create counter object. + * All dynamic fids will be generated with the same sequence and incremented + * OIDs + * + * Returned local_oid_storage is in-memory representaion of OID storage + */ +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los) +{ + struct dt_thread_info *dti = dt_info(env); + struct ls_device *ls; + u64 lastid; + struct dt_object *o = NULL; + struct thandle *th; + __u32 first_oid = fid_oid(first_fid); + int rc = 0; + + ENTRY; + + ls = ls_device_get(dev); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + *los = dt_los_find(ls, fid_seq(first_fid)); + if (*los != NULL) + GOTO(out, rc = 0); + + /* not found, then create */ + OBD_ALLOC_PTR(*los); + if (*los == NULL) + GOTO(out, rc = -ENOMEM); + + atomic_set(&(*los)->los_refcount, 1); + mutex_init(&(*los)->los_id_lock); + (*los)->los_dev = &ls->ls_top_dev; + atomic_inc(&ls->ls_refcount); + list_add(&(*los)->los_list, &ls->ls_los_list); + + /* Use {seq, 0, 0} to create the LAST_ID file for every + * sequence. OIDs start at LUSTRE_FID_INIT_OID. + */ + dti->dti_fid.f_seq = fid_seq(first_fid); + dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + GOTO(out_los, rc = PTR_ERR(o)); + + if (!dt_object_exists(o)) { + rc = lastid_compat_check(env, dev, fid_seq(first_fid), + &first_oid, ls); + if (rc < 0) + GOTO(out_los, rc); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out_los, rc = PTR_ERR(th)); + + dti->dti_attr.la_valid = LA_MODE | LA_TYPE; + dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &dti->dti_attr, NULL, + &dti->dti_dof, th); + if (rc) + GOTO(out_trans, rc); + + lastid = cpu_to_le64(first_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_declare_record_write(env, o, &dti->dti_lb, dti->dti_off, + th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) + GOTO(out_lock, rc = 0); + + rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof, + th); + if (rc) + GOTO(out_lock, rc); + + rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th); + if (rc) + GOTO(out_lock, rc); +out_lock: + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, dev, th); + } else { + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) { + CERROR("%s: bad oid %llu is read from LAST_ID\n", + o->do_lu.lo_dev->ld_obd->obd_name, + le64_to_cpu(lastid)); + rc = -EINVAL; + } + } +out_los: + if (rc != 0) { + list_del(&(*los)->los_list); + atomic_dec(&ls->ls_refcount); + OBD_FREE_PTR(*los); + *los = NULL; + if (o != NULL && !IS_ERR(o)) + dt_object_put_nocache(env, o); + } else { + (*los)->los_seq = fid_seq(first_fid); + (*los)->los_last_oid = le64_to_cpu(lastid); + (*los)->los_obj = o; + /* Read value should not be less than 
initial one + * but possible after upgrade from older fs. + * In this case just switch to the first_oid in memory and + * it will be updated on disk with first object generated */ + if ((*los)->los_last_oid < first_oid) + (*los)->los_last_oid = first_oid; + } +out: + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); + return rc; +} +EXPORT_SYMBOL(local_oid_storage_init); + +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los) +{ + struct ls_device *ls; + + LASSERT(env); + LASSERT(los->los_dev); + ls = dt2ls_dev(los->los_dev); + + /* Take the mutex before decreasing the reference to avoid race + * conditions as described in LU-4721. */ + mutex_lock(&ls->ls_los_mutex); + if (!atomic_dec_and_test(&los->los_refcount)) { + mutex_unlock(&ls->ls_los_mutex); + return; + } + + if (los->los_obj) + dt_object_put_nocache(env, los->los_obj); + list_del(&los->los_list); + OBD_FREE_PTR(los); + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); +} +EXPORT_SYMBOL(local_oid_storage_fini); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h new file mode 100644 index 0000000000000..caf26bfec6e28 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h @@ -0,0 +1,102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. 
+ * + * Author: Mikhail Pershin + */ +#ifndef __LOCAL_STORAGE_H +#define __LOCAL_STORAGE_H + +#include +#include +#include +#include + +struct ls_device { + struct dt_device ls_top_dev; + /* all initialized ls_devices on this node linked by this */ + struct list_head ls_linkage; + /* how many handle's reference this local storage */ + atomic_t ls_refcount; + /* underlaying OSD device */ + struct dt_device *ls_osd; + /* list of all local OID storages */ + struct list_head ls_los_list; + struct mutex ls_los_mutex; +}; + +static inline struct ls_device *dt2ls_dev(struct dt_device *d) +{ + return container_of0(d, struct ls_device, ls_top_dev); +} + +struct ls_object { + struct lu_object_header ls_header; + struct dt_object ls_obj; +}; + +static inline struct ls_object *lu2ls_obj(struct lu_object *o) +{ + return container_of0(o, struct ls_object, ls_obj.do_lu); +} + +static inline struct dt_object *ls_locate(const struct lu_env *env, + struct ls_device *ls, + const struct lu_fid *fid, + const struct lu_object_conf *conf) +{ + return dt_locate_at(env, ls->ls_osd, fid, + &ls->ls_top_dev.dd_lu_dev, conf); +} + +struct ls_device *ls_device_get(struct dt_device *dev); +void ls_device_put(const struct lu_env *env, struct ls_device *ls); +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq); +void dt_los_put(struct local_oid_storage *los); + +/* Lustre 2.3 on-disk structure describing local object OIDs storage + * the structure to be used with any sequence managed by + * local object library. + * Obsoleted since 2.4 but is kept for compatibility reasons, + * see lastid_compat_check() in obdclass/local_storage.c */ +struct los_ondisk { + __u32 lso_magic; + __u32 lso_next_oid; +}; + +#define LOS_MAGIC 0xdecafbee + +/** + * Used in __local_file_create() for object lock role + **/ +enum los_object_role { + LOS_PARENT, + LOS_CHILD, +}; + +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c new file mode 100644 index 0000000000000..bbdcfd47ebad9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c @@ -0,0 +1,137 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lprocfs_counters.c + * + * Lustre lprocfs counter routines + * + * Author: Andreas Dilger + */ +#include +#include +#include + +#ifdef CONFIG_PROC_FS +void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + percpu_cntr->lc_count++; + + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * lprocfs_counter_add() can be called in interrupt context, + * as memory allocation could trigger memory shrinker call + * ldlm_pool_shrink(), which calls lprocfs_counter_add(). + * LU-1727. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq += amount; + else + percpu_cntr->lc_sum += amount; + + if (header->lc_config & LPROCFS_CNTR_STDDEV) + percpu_cntr->lc_sumsquare += (__s64)amount * amount; + if (amount < percpu_cntr->lc_min) + percpu_cntr->lc_min = amount; + if (amount > percpu_cntr->lc_max) + percpu_cntr->lc_max = amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_add); + +void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * Sometimes we use RCU callbacks to free memory which calls + * lprocfs_counter_sub(), and RCU callbacks may execute in + * softirq context - right now that's the only case we're in + * softirq context here, use separate counter for that. + * bz20650. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq -= amount; + else + percpu_cntr->lc_sum -= amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_sub); +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c new file mode 100644 index 0000000000000..37d749d199275 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -0,0 +1,661 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + * Use is subject to license terms. + * + * Author: Niu Yawei + */ +/* + * lustre/obdclass/lprocfs_jobstats.c + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#ifdef CONFIG_PROC_FS + +/* + * JobID formats & JobID environment variable names for supported + * job schedulers: + * + * SLURM: + * JobID format: 32 bit integer. + * JobID env var: SLURM_JOB_ID. + * SGE: + * JobID format: Decimal integer range to 99999. + * JobID env var: JOB_ID. + * LSF: + * JobID format: 6 digit integer by default (up to 999999), can be + * increased to 10 digit (up to 2147483646). + * JobID env var: LSB_JOBID. + * Loadleveler: + * JobID format: String of machine_name.cluster_id.process_id, for + * example: fr2n02.32.0 + * JobID env var: LOADL_STEP_ID. + * PBS: + * JobID format: String of sequence_number[.server_name][@server]. + * JobID env var: PBS_JOBID. + * Maui/MOAB: + * JobID format: Same as PBS. + * JobID env var: Same as PBS. + */ + +struct job_stat { + struct hlist_node js_hash; /* hash struct for this jobid */ + struct list_head js_list; /* on ojs_list, with ojs_lock */ + atomic_t js_refcount; /* num users of this struct */ + char js_jobid[LUSTRE_JOBID_SIZE]; /* job name + NUL*/ + time64_t js_timestamp; /* seconds of most recent stat*/ + struct lprocfs_stats *js_stats; /* per-job statistics */ + struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */ +}; + +static unsigned +job_stat_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static void *job_stat_key(struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return job->js_jobid; +} + +static int job_stat_keycmp(const void *key, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return (strlen(job->js_jobid) == strlen(key)) && + !strncmp(job->js_jobid, key, strlen(key)); +} + +static void *job_stat_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct job_stat, js_hash); +} + +static void job_stat_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + atomic_inc(&job->js_refcount); +} + +static void job_free(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) == 0); + LASSERT(job->js_jobstats != NULL); + + write_lock(&job->js_jobstats->ojs_lock); + list_del_init(&job->js_list); + write_unlock(&job->js_jobstats->ojs_lock); + + lprocfs_free_stats(&job->js_stats); + OBD_FREE_PTR(job); +} + +static void job_putref(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) > 0); + if (atomic_dec_and_test(&job->js_refcount)) + job_free(job); +} + +static void 
job_stat_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + job_putref(job); +} + +static void job_stat_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + CERROR("should not have any items\n"); +} + +static struct cfs_hash_ops job_stats_hash_ops = { + .hs_hash = job_stat_hash, + .hs_key = job_stat_key, + .hs_keycmp = job_stat_keycmp, + .hs_object = job_stat_object, + .hs_get = job_stat_get, + .hs_put_locked = job_stat_put_locked, + .hs_exit = job_stat_exit, +}; + +/** + * Jobstats expiry iterator to clean up old jobids + * + * Called for each job_stat structure on this device, it should delete stats + * older than the specified \a oldest_time in seconds. If \a oldest_time is + * in the future then this will delete all statistics (e.g. during shutdown). + * + * \param[in] hs hash of all jobids on this device + * \param[in] bd hash bucket containing this jobid + * \param[in] hnode hash structure for this jobid + * \param[in] data pointer to stats expiry time in seconds + */ +static int job_cleanup_iter_callback(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + time64_t oldest_time = *((time64_t *)data); + struct job_stat *job; + + job = hlist_entry(hnode, struct job_stat, js_hash); + if (job->js_timestamp < oldest_time) + cfs_hash_bd_del_locked(hs, bd, hnode); + + return 0; +} + +/** + * Clean up jobstats that were updated more than \a before seconds ago. + * + * Since this function may be called frequently, do not scan all of the + * jobstats on each call, only twice per cleanup interval. That means stats + * may be around on average cleanup_interval / 4 longer than necessary, + * but that is not considered harmful. + * + * If \a before is negative then this will force clean up all jobstats due + * to the expiry time being in the future (e.g. at shutdown). + * + * If there is already another thread doing jobstats cleanup, don't try to + * do this again in the current thread unless this is a force cleanup. + * + * \param[in] stats stucture tracking all job stats for this device + * \param[in] before expire jobstats updated more than this many seconds ago + */ +static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before) +{ + time64_t now = ktime_get_real_seconds(); + time64_t oldest; + + if (likely(before >= 0)) { + unsigned int cleanup_interval = stats->ojs_cleanup_interval; + + if (cleanup_interval == 0 || before == 0) + return; + + if (now < stats->ojs_last_cleanup + cleanup_interval / 2) + return; + + if (stats->ojs_cleaning) + return; + } + + write_lock(&stats->ojs_lock); + if (before >= 0 && stats->ojs_cleaning) { + write_unlock(&stats->ojs_lock); + return; + } + + stats->ojs_cleaning = true; + write_unlock(&stats->ojs_lock); + + /* Can't hold ojs_lock over hash iteration, since it is grabbed by + * job_cleanup_iter_callback() + * ->cfs_hash_bd_del_locked() + * ->job_putref() + * ->job_free() + * + * Holding ojs_lock isn't necessary for safety of the hash iteration, + * since locking of the hash is handled internally, but there isn't + * any benefit to having multiple threads doing cleanup at one time. 
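+ *
+ * For reference, the callers in this file use two patterns:
+ * lprocfs_job_stats_fini() and the proc "clear" write handler pass a
+ * negative value (-99) to force a full purge, while
+ * lprocfs_job_stats_log() and the seq_file release handler pass
+ * ojs_cleanup_interval so that only aged entries are expired.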
+ */ + oldest = now - before; + cfs_hash_for_each_safe(stats->ojs_hash, job_cleanup_iter_callback, + &oldest); + + write_lock(&stats->ojs_lock); + stats->ojs_cleaning = false; + stats->ojs_last_cleanup = ktime_get_real_seconds(); + write_unlock(&stats->ojs_lock); +} + +static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) +{ + struct job_stat *job; + + OBD_ALLOC_PTR(job); + if (job == NULL) + return NULL; + + job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0); + if (job->js_stats == NULL) { + OBD_FREE_PTR(job); + return NULL; + } + + jobs->ojs_cntr_init_fn(job->js_stats); + + memcpy(job->js_jobid, jobid, sizeof(job->js_jobid)); + job->js_timestamp = ktime_get_real_seconds(); + job->js_jobstats = jobs; + INIT_HLIST_NODE(&job->js_hash); + INIT_LIST_HEAD(&job->js_list); + atomic_set(&job->js_refcount, 1); + + return job; +} + +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + struct job_stat *job, *job2; + ENTRY; + + LASSERT(stats != NULL); + LASSERT(stats->ojs_hash != NULL); + + if (event >= stats->ojs_cntr_num) + RETURN(-EINVAL); + + if (jobid == NULL || strlen(jobid) == 0) + RETURN(-EINVAL); + + if (strlen(jobid) >= LUSTRE_JOBID_SIZE) { + CERROR("Invalid jobid size (%lu), expect(%d)\n", + (unsigned long)strlen(jobid) + 1, LUSTRE_JOBID_SIZE); + RETURN(-EINVAL); + } + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (job) + goto found; + + lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); + + job = job_alloc(jobid, stats); + if (job == NULL) + RETURN(-ENOMEM); + + job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid, + &job->js_hash); + if (job2 != job) { + job_putref(job); + job = job2; + /* We cannot LASSERT(!list_empty(&job->js_list)) here, + * since we just lost the race for inserting "job" into the + * ojs_list, and some other thread is doing it _right_now_. + * Instead, be content the other thread is doing this, since + * "job2" was initialized in job_alloc() already. 
LU-2163 */ + } else { + LASSERT(list_empty(&job->js_list)); + write_lock(&stats->ojs_lock); + list_add_tail(&job->js_list, &stats->ojs_list); + write_unlock(&stats->ojs_lock); + } + +found: + LASSERT(stats == job->js_jobstats); + job->js_timestamp = ktime_get_real_seconds(); + lprocfs_counter_add(job->js_stats, event, amount); + + job_putref(job); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_log); + +void lprocfs_job_stats_fini(struct obd_device *obd) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + + if (stats->ojs_hash == NULL) + return; + + lprocfs_job_cleanup(stats, -99); + cfs_hash_putref(stats->ojs_hash); + stats->ojs_hash = NULL; + LASSERT(list_empty(&stats->ojs_list)); +} +EXPORT_SYMBOL(lprocfs_job_stats_fini); + +static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + loff_t off = *pos; + struct job_stat *job; + + read_lock(&stats->ojs_lock); + if (off == 0) + return SEQ_START_TOKEN; + off--; + list_for_each_entry(job, &stats->ojs_list, js_list) { + if (!off--) + return job; + } + return NULL; +} + +static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v) +{ + struct obd_job_stats *stats = p->private; + + read_unlock(&stats->ojs_lock); +} + +static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + struct job_stat *job; + struct list_head *next; + + ++*pos; + if (v == SEQ_START_TOKEN) { + next = stats->ojs_list.next; + } else { + job = (struct job_stat *)v; + next = job->js_list.next; + } + + return next == &stats->ojs_list ? NULL : + list_entry(next, struct job_stat, js_list); +} + +/* + * Example of output on MDT: + * + * job_stats: + * - job_id: dd.4854 + * snapshot_time: 1322494486 + * open: { samples: 1, unit: reqs } + * close: { samples: 1, unit: reqs } + * mknod: { samples: 0, unit: reqs } + * link: { samples: 0, unit: reqs } + * unlink: { samples: 0, unit: reqs } + * mkdir: { samples: 0, unit: reqs } + * rmdir: { samples: 0, unit: reqs } + * rename: { samples: 0, unit: reqs } + * getattr: { samples: 1, unit: reqs } + * setattr: { samples: 0, unit: reqs } + * getxattr: { samples: 0, unit: reqs } + * setxattr: { samples: 0, unit: reqs } + * statfs: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + * + * Example of output on OST: + * + * job_stats: + * - job_id dd.4854 + * snapshot_time: 1322494602 + * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 } + * write: { samples: 1, unit: bytes, min: 4096, max: 4096, sum: 4096 } + * setattr: { samples: 0, unit: reqs } + * punch: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + */ + +static const char spaces[] = " "; + +static int inline width(const char *str, int len) +{ + return len - min((int)strlen(str), 15); +} + +static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) +{ + struct job_stat *job = v; + struct lprocfs_stats *s; + struct lprocfs_counter ret; + struct lprocfs_counter_header *cntr_header; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(p, "job_stats:\n"); + return 0; + } + + /* Replace the non-printable character in jobid with '?', so + * that the output of jobid will be confined in single line. 
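+ * (e.g. a jobid containing an embedded '\n' or other control byte is
+ * shown with '?' in its place, so each field of the YAML-like record
+ * below stays on a single line)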
*/ + seq_printf(p, "- %-16s ", "job_id:"); + for (i = 0; i < strlen(job->js_jobid); i++) { + if (isprint(job->js_jobid[i]) != 0) + seq_putc(p, job->js_jobid[i]); + else + seq_putc(p, '?'); + } + seq_putc(p, '\n'); + + seq_printf(p, " %-16s %lld\n", "snapshot_time:", job->js_timestamp); + + s = job->js_stats; + for (i = 0; i < s->ls_num; i++) { + cntr_header = &s->ls_cnt_header[i]; + lprocfs_stats_collect(s, i, &ret); + + seq_printf(p, " %s:%.*s { samples: %11llu", + cntr_header->lc_name, + width(cntr_header->lc_name, 15), spaces, + ret.lc_count); + if (cntr_header->lc_units[0] != '\0') + seq_printf(p, ", unit: %5s", cntr_header->lc_units); + + if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + seq_printf(p, ", min:%8llu, max:%8llu," + " sum:%16llu", + ret.lc_count ? ret.lc_min : 0, + ret.lc_count ? ret.lc_max : 0, + ret.lc_count ? ret.lc_sum : 0); + } + if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) { + seq_printf(p, ", sumsq: %18llu", + ret.lc_count ? ret.lc_sumsquare : 0); + } + + seq_printf(p, " }\n"); + + } + return 0; +} + +static const struct seq_operations lprocfs_jobstats_seq_sops = { + .start = lprocfs_jobstats_seq_start, + .stop = lprocfs_jobstats_seq_stop, + .next = lprocfs_jobstats_seq_next, + .show = lprocfs_jobstats_seq_show, +}; + +static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &lprocfs_jobstats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +static ssize_t lprocfs_jobstats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + char jobid[LUSTRE_JOBID_SIZE]; + struct job_stat *job; + + if (len == 0 || len >= LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (stats->ojs_hash == NULL) + return -ENODEV; + + if (copy_from_user(jobid, buf, len)) + return -EFAULT; + jobid[len] = 0; + + /* Trim '\n' if any */ + if (jobid[len - 1] == '\n') + jobid[len - 1] = 0; + + if (strcmp(jobid, "clear") == 0) { + lprocfs_job_cleanup(stats, -99); + + return len; + } + + if (strlen(jobid) == 0) + return -EINVAL; + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (!job) + return -EINVAL; + + cfs_hash_del_key(stats->ojs_hash, jobid); + + job_putref(job); + return len; +} + +/** + * Clean up the seq file state when the /proc file is closed. + * + * This also expires old job stats from the cache after they have been + * printed in case the system is idle and not generating new jobstats. 
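+ * (For reference, the write handler above accepts either the keyword
+ * "clear", which forces a full purge of the jobid hash, or a single
+ * jobid, which evicts just that entry.)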
+ * + * \param[in] inode struct inode for seq file being closed + * \param[in] file struct file for seq file being closed + * + * \retval 0 on success + * \retval negative errno on failure + */ +static int lprocfs_jobstats_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + + lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); + + return lprocfs_seq_release(inode, file); +} + +static const struct proc_ops lprocfs_jobstats_seq_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lprocfs_jobstats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_jobstats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_jobstats_seq_release, +}; + +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback init_fn) +{ + struct proc_dir_entry *entry; + struct obd_job_stats *stats; + ENTRY; + + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_type->typ_name); + + if (cntr_num <= 0) + RETURN(-EINVAL); + + if (init_fn == NULL) + RETURN(-EINVAL); + + /* Currently needs to be a target due to the use of obt_jobstats. */ + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0 && + strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) != 0) { + CERROR("%s: invalid device type %s for job stats: rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, -EINVAL); + RETURN(-EINVAL); + } + stats = &obd->u.obt.obt_jobstats; + + LASSERT(stats->ojs_hash == NULL); + stats->ojs_hash = cfs_hash_create("JOB_STATS", + HASH_JOB_STATS_CUR_BITS, + HASH_JOB_STATS_MAX_BITS, + HASH_JOB_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &job_stats_hash_ops, + CFS_HASH_DEFAULT); + if (stats->ojs_hash == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&stats->ojs_list); + rwlock_init(&stats->ojs_lock); + stats->ojs_cntr_num = cntr_num; + stats->ojs_cntr_init_fn = init_fn; + stats->ojs_cleanup_interval = 600; /* 10 mins by default */ + stats->ojs_last_cleanup = ktime_get_real_seconds(); + + entry = lprocfs_add_simple(obd->obd_proc_entry, "job_stats", stats, + &lprocfs_jobstats_seq_fops); + if (IS_ERR(entry)) { + lprocfs_job_stats_fini(obd); + RETURN(-ENOMEM); + } + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_init); +#endif /* CONFIG_PROC_FS*/ + +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_job_stats *stats; + + stats = &obd->u.obt.obt_jobstats; + return scnprintf(buf, PAGE_SIZE, "%d\n", stats->ojs_cleanup_interval); +} +EXPORT_SYMBOL(job_cleanup_interval_show); + +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_job_stats *stats; + unsigned int val; + int rc; + + stats = &obd->u.obt.obt_jobstats; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + stats->ojs_cleanup_interval = val; + lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); + return count; +} +EXPORT_SYMBOL(job_cleanup_interval_store); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c new file mode 100644 index 0000000000000..6cad0a93e7d11 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,2470 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES 
OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lprocfs_status.c + * + * Author: Hariharan Thantry + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#ifdef CONFIG_PROC_FS + +static int lprocfs_no_percpu_stats = 0; +module_param(lprocfs_no_percpu_stats, int, 0644); +MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs stats"); + +#define MAX_STRING_SIZE 128 + +int lprocfs_single_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_single_release); + +int lprocfs_seq_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_seq_release); + +static umode_t default_mode(const struct proc_ops *ops) +{ + umode_t mode = 0; + + if (ops->proc_read) + mode = 0444; + if (ops->proc_write) + mode |= 0200; + + return mode; +} + +struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct proc_ops *fops) +{ + struct proc_dir_entry *proc; + umode_t mode; + + if (!root || !name || !fops) + return ERR_PTR(-EINVAL); + + mode = default_mode(fops); + proc = proc_create_data(name, mode, root, fops, data); + if (!proc) { + CERROR("LprocFS: No memory to create /proc entry %s\n", + name); + return ERR_PTR(-ENOMEM); + } + return proc; +} +EXPORT_SYMBOL(lprocfs_add_simple); + +struct proc_dir_entry *lprocfs_add_symlink(const char *name, + struct proc_dir_entry *parent, + const char *format, ...) 
+{ + struct proc_dir_entry *entry; + char *dest; + va_list ap; + + if (!parent || !format) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + return NULL; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = proc_symlink(name, parent, dest); + if (!entry) + CERROR("LprocFS: Could not create symbolic link from " + "%s to %s\n", name, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; +} +EXPORT_SYMBOL(lprocfs_add_symlink); + +static const struct file_operations ldebugfs_empty_ops = { }; + +int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *list, + void *data) +{ + if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list)) + return -EINVAL; + + while (list->name) { + struct dentry *entry; + umode_t mode = 0; + + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else if (list->fops) { + if (list->fops->read) + mode = 0444; + if (list->fops->write) + mode |= 0200; + } + entry = debugfs_create_file(list->name, mode, parent, + list->data ? : data, + list->fops ? : &ldebugfs_empty_ops); + if (IS_ERR_OR_NULL(entry)) + return entry ? PTR_ERR(entry) : -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL_GPL(ldebugfs_add_vars); + +static const struct proc_ops lprocfs_empty_ops = { }; + +/** + * Add /proc entries. + * + * \param root [in] The parent proc entry on which new entry will be added. + * \param list [in] Array of proc entries to be added. + * \param data [in] The argument to be passed when entries read/write routines + * are called through /proc file. + * + * \retval 0 on success + * < 0 on error + */ +int +lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if (!root || !list) + return -EINVAL; + + while (list->name) { + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (list->proc_mode) + mode = list->proc_mode; + else if (list->fops) + mode = default_mode(list->fops); + proc = proc_create_data(list->name, mode, root, + list->fops ?: &lprocfs_empty_ops, + list->data ?: data); + if (!proc) + return -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void ldebugfs_remove(struct dentry **entryp) +{ + debugfs_remove_recursive(*entryp); + *entryp = NULL; +} +EXPORT_SYMBOL_GPL(ldebugfs_remove); + +#ifndef HAVE_REMOVE_PROC_SUBTREE +/* for b=10866, global variable */ +DECLARE_RWSEM(_lprocfs_lock); +EXPORT_SYMBOL(_lprocfs_lock); + +static void lprocfs_remove_nolock(struct proc_dir_entry **proot) +{ + struct proc_dir_entry *root = *proot; + struct proc_dir_entry *temp = root; + struct proc_dir_entry *rm_entry; + struct proc_dir_entry *parent; + + *proot = NULL; + if (!root || IS_ERR(root)) + return; + + parent = root->parent; + LASSERT(parent != NULL); + + while (1) { + while (temp->subdir) + temp = temp->subdir; + + rm_entry = temp; + temp = temp->parent; + + /* + * Memory corruption once caused this to fail, and + * without this LASSERT we would loop here forever. 
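+ *
+ * (For clarity: each pass of the outer loop walks down to the deepest
+ * remaining child and removes entries bottom-up; it terminates once
+ * the entry just removed was 'root' itself, i.e. when 'temp' has
+ * reached 'parent'.)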
+ */ + LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, + "0x%p %s/%s len %d\n", rm_entry, temp->name, + rm_entry->name, (int)strlen(rm_entry->name)); + + remove_proc_entry(rm_entry->name, temp); + if (temp == parent) + break; + } +} + +int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry *t = NULL; + struct proc_dir_entry **p; + int len, busy = 0; + + LASSERT(parent != NULL); + len = strlen(name); + + down_write(&_lprocfs_lock); + /* lookup target name */ + for (p = &parent->subdir; *p; p = &(*p)->next) { + if ((*p)->namelen != len) + continue; + if (memcmp(name, (*p)->name, len)) + continue; + t = *p; + break; + } + + if (t) { + /* verify it's empty: do not count "num_refs" */ + for (p = &t->subdir; *p; p = &(*p)->next) { + if ((*p)->namelen != strlen("num_refs")) { + busy = 1; + break; + } + if (memcmp("num_refs", (*p)->name, + strlen("num_refs"))) { + busy = 1; + break; + } + } + } + + if (busy == 0) + lprocfs_remove_nolock(&t); + + up_write(&_lprocfs_lock); + return 0; +} +#endif /* !HAVE_REMOVE_PROC_SUBTREE */ + +#ifndef HAVE_PROC_REMOVE +void proc_remove(struct proc_dir_entry *de) +{ +#ifndef HAVE_REMOVE_PROC_SUBTREE + down_write(&_lprocfs_lock); /* search vs remove race */ + lprocfs_remove_nolock(&de); + up_write(&_lprocfs_lock); +#else + if (de) + remove_proc_subtree(de->name, de->parent); +#endif +} +#endif + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + proc_remove(*rooth); + *rooth = NULL; +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +struct dentry *ldebugfs_register(const char *name, struct dentry *parent, + struct ldebugfs_vars *list, void *data) +{ + struct dentry *entry; + + entry = debugfs_create_dir(name, parent); + if (IS_ERR_OR_NULL(entry)) { + entry = entry ?: ERR_PTR(-ENOMEM); + goto out; + } + + if (!IS_ERR_OR_NULL(list)) { + int rc; + + rc = ldebugfs_add_vars(entry, list, data); + if (rc) { + debugfs_remove(entry); + entry = ERR_PTR(rc); + } + } +out: + return entry; +} +EXPORT_SYMBOL_GPL(ldebugfs_register); + +struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ + struct proc_dir_entry *newchild; + + newchild = proc_mkdir(name, parent); + if (!newchild) + return ERR_PTR(-ENOMEM); + + if (list) { + int rc = lprocfs_add_vars(newchild, list, data); + if (rc) { + lprocfs_remove(&newchild); + return ERR_PTR(rc); + } + } + return newchild; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%s\n", obd->obd_uuid.uuid); + return 0; +} +EXPORT_SYMBOL(lprocfs_uuid_seq_show); + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%s\n", obd->obd_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) + return sprintf(buf, "%u\n", 
osfs.os_bsize); + + return rc; +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) { + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); + } + + return rc; +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) { + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); + } + + return rc; +} +LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) { + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); + } + + return rc; +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) + return sprintf(buf, "%llu\n", osfs.os_files); + + return rc; +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); + if (!rc) + return sprintf(buf, "%llu\n", osfs.os_ffree); + + return rc; +} +LUSTRE_RO_ATTR(filesfree); + +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct ptlrpc_connection *conn; + ssize_t count; + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + count = sprintf(buf, "%s\n", conn->c_remote_uuid.uuid); + else + count = sprintf(buf, "%s\n", ""); + + LPROCFS_CLIMP_EXIT(obd); + return count; +} +EXPORT_SYMBOL(conn_uuid_show); + +int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + char *imp_state_name = NULL; + int rc = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + imp_state_name = ptlrpc_import_state_name(imp->imp_state); + seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), imp_state_name, + 
imp->imp_deactive ? "\tDEACTIVATED" : ""); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_server_uuid_seq_show); + +/** add up per-cpu counters */ + +/** + * Lock statistics structure for access, possibly only on this CPU. + * + * The statistics struct may be allocated with per-CPU structures for + * efficient concurrent update (usually only on server-wide stats), or + * as a single global struct (e.g. for per-client or per-job statistics), + * so the required locking depends on the type of structure allocated. + * + * For per-CPU statistics, pin the thread to the current cpuid so that + * will only access the statistics for that CPU. If the stats structure + * for the current CPU has not been allocated (or previously freed), + * allocate it now. The per-CPU statistics do not need locking since + * the thread is pinned to the CPU during update. + * + * For global statistics, lock the stats structure to prevent concurrent update. + * + * \param[in] stats statistics structure to lock + * \param[in] opc type of operation: + * LPROCFS_GET_SMP_ID: "lock" and return current CPU index + * for incrementing statistics for that CPU + * LPROCFS_GET_NUM_CPU: "lock" and return number of used + * CPU indices to iterate over all indices + * \param[out] flags CPU interrupt saved state for IRQ-safe locking + * + * \retval cpuid of current thread or number of allocated structs + * \retval negative on error (only for opc LPROCFS_GET_SMP_ID + per-CPU stats) + */ +int lprocfs_stats_lock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags) +{ + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return opc == LPROCFS_GET_NUM_CPU ? 1 : 0; + } + + switch (opc) { + case LPROCFS_GET_SMP_ID: { + unsigned int cpuid = get_cpu(); + + if (unlikely(!stats->ls_percpu[cpuid])) { + int rc = lprocfs_stats_alloc_one(stats, cpuid); + + if (rc < 0) { + put_cpu(); + return rc; + } + } + return cpuid; + } + case LPROCFS_GET_NUM_CPU: + return stats->ls_biggest_alloc_num; + default: + LBUG(); + } +} + +/** + * Unlock statistics structure after access. + * + * Unlock the lock acquired via lprocfs_stats_lock() for global statistics, + * or unpin this thread from the current cpuid for per-CPU statistics. + * + * This function must be called using the same arguments as used when calling + * lprocfs_stats_lock() so that the correct operation can be performed. 
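+ *
+ * A minimal pairing sketch (illustrative only; 'i' stands in for a
+ * counter index and mirrors the structure of lprocfs_counter_add()):
+ *
+ *	unsigned long flags = 0;
+ *	int cpuid = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+ *
+ *	if (cpuid >= 0) {
+ *		lprocfs_stats_counter_get(stats, cpuid, i)->lc_count++;
+ *		lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+ *	}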
+ * + * \param[in] stats statistics structure to unlock + * \param[in] opc type of operation (current cpuid or number of structs) + * \param[in] flags CPU interrupt saved state for IRQ-safe locking + */ +void lprocfs_stats_unlock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags) +{ + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_unlock_irqrestore(&stats->ls_lock, *flags); + else + spin_unlock(&stats->ls_lock); + } else if (opc == LPROCFS_GET_SMP_ID) { + put_cpu(); + } +} + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (!stats) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (!stats->ls_percpu[i]) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} + +static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) +{ + bool first = true; + + if (imp->imp_obd->obd_no_recov) { + seq_printf(m, "no_recov"); + first = false; + } + + flag2str(imp, invalid); + flag2str(imp, deactive); + flag2str(imp, replayable); + flag2str(imp, delayed_recovery); + flag2str(imp, vbr_failed); + flag2str(imp, pingable); + flag2str(imp, resend_replay); + flag2str(imp, no_pinger_recover); + flag2str(imp, connect_tried); +} + +static const char *obd_connect_names[] = { + /* flags names */ + "read_only", + "lov_index", + "connect_from_mds", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "barrier", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + "layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "flock_deadlock", + "disp_stripe", + "open_by_fid", + "lfsck", + "unknown", + "unlink_close", + "multi_mod_rpcs", + "dir_stripe", + "subtree", + "lockahead", + "bulk_mbits", + "compact_obdo", + "second_flags", + /* flags2 names */ + "file_secctx", /* 0x01 */ + "lockaheadv2", /* 0x02 */ + "dir_migrate", /* 0x04 */ + "sum_statfs", /* 0x08 */ + "overstriping", /* 0x10 */ + "flr", /* 0x20 */ + "wbc", /* 0x40 */ + "lock_convert", /* 0x80 */ + "archive_id_array", /* 0x100 */ + "increasing_xid", /* 0x200 */ + "selinux_policy", /* 0x400 */ + 
"lsom", /* 0x800 */ + "pcc", /* 0x1000 */ + "unknown", /* 0x2000 */ + "async_discard", /* 0x4000 */ + "client_encryption", /* 0x8000 */ + "fidmap", /* 0x10000 */ + "getattr_pfid", /* 0x20000 */ + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", + "mdll_bypass", /* 0x800000000000000 */ + "mdll", /* 0x1000000000000000 */ + "mdll_auto_refresh", /* 0x2000000000000000 */ + "", "", + NULL +}; + +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep) +{ + bool first = true; + __u64 mask; + int i; + + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (flags & mask) { + seq_printf(m, "%s%s", + first ? "" : sep, obd_connect_names[i]); + first = false; + } + } + + if (flags & ~(mask - 1)) { + seq_printf(m, "%sunknown_%#llx", + first ? "" : sep, flags & ~(mask - 1)); + first = false; + } + + if (!(flags & OBD_CONNECT_FLAGS2) || flags2 == 0) + return; + + for (i = 64, mask = 1; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags2 & mask) { + seq_printf(m, "%s%s", + first ? "" : sep, obd_connect_names[i]); + first = false; + } + } + + if (flags2 & ~(mask - 1)) { + seq_printf(m, "%sunknown2_%#llx", + first ? "" : sep, flags2 & ~(mask - 1)); + first = false; + } +} +EXPORT_SYMBOL(obd_connect_seq_flags2str); + +int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, + const char *sep) +{ + __u64 mask; + int i, ret = 0; + + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown_%#llx", + ret ? sep : "", flags & ~(mask - 1)); + + if (!(flags & OBD_CONNECT_FLAGS2) || flags2 == 0) + return ret; + + for (i = 64, mask = 1; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags2 & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + + if (flags2 & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown2_%#llx", + ret ? 
sep : "", flags2 & ~(mask - 1)); + + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +void +obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd) +{ + __u64 flags; + + LASSERT(ocd != NULL); + flags = ocd->ocd_connect_flags; + + seq_printf(m, " connect_data:\n" + " flags: %#llx\n" + " instance: %u\n", + ocd->ocd_connect_flags, + ocd->ocd_instance); + if (flags & OBD_CONNECT_VERSION) + seq_printf(m, " target_version: %u.%u.%u.%u\n", + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version)); + if (flags & OBD_CONNECT_MDS) + seq_printf(m, " mdt_index: %d\n", ocd->ocd_group); + if (flags & OBD_CONNECT_GRANT) + seq_printf(m, " initial_grant: %d\n", ocd->ocd_grant); + if (flags & OBD_CONNECT_INDEX) + seq_printf(m, " target_index: %u\n", ocd->ocd_index); + if (flags & OBD_CONNECT_BRW_SIZE) + seq_printf(m, " max_brw_size: %d\n", ocd->ocd_brw_size); + if (flags & OBD_CONNECT_IBITS) + seq_printf(m, " ibits_known: %#llx\n", + ocd->ocd_ibits_known); + if (flags & OBD_CONNECT_GRANT_PARAM) + seq_printf(m, " grant_block_size: %d\n" + " grant_inode_size: %d\n" + " grant_max_extent_size: %d\n" + " grant_extent_tax: %d\n", + 1 << ocd->ocd_grant_blkbits, + 1 << ocd->ocd_grant_inobits, + ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits, + ocd->ocd_grant_tax_kb << 10); + if (flags & OBD_CONNECT_TRANSNO) + seq_printf(m, " first_transno: %#llx\n", + ocd->ocd_transno); + if (flags & OBD_CONNECT_CKSUM) + seq_printf(m, " cksum_types: %#x\n", + ocd->ocd_cksum_types); + if (flags & OBD_CONNECT_MAX_EASIZE) + seq_printf(m, " max_easize: %d\n", ocd->ocd_max_easize); + if (flags & OBD_CONNECT_MAXBYTES) + seq_printf(m, " max_object_bytes: %llu\n", + ocd->ocd_maxbytes); + if (flags & OBD_CONNECT_MULTIMODRPCS) + seq_printf(m, " max_mod_rpcs: %hu\n", + ocd->ocd_maxmodrpcs); +} + +int lprocfs_import_seq_show(struct seq_file *m, void *data) +{ + char nidstr[LNET_NIDSTR_SIZE]; + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + struct obd_connect_data *ocd; + int j; + int k; + int rw = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + ocd = &imp->imp_connect_data; + + seq_printf(m, "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " connect_flags: [ ", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state)); + obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, + imp->imp_connect_data.ocd_connect_flags2, + ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " import_flags: [ "); + obd_import_flags2str(imp, m); + + seq_printf(m, " ]\n" + " connection:\n" + " failover_nids: [ "); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + libcfs_nid2str_r(conn->oic_conn->c_peer.nid, + nidstr, sizeof(nidstr)); + seq_printf(m, "%s%s", j ? 
", " : "", nidstr); + j++; + } + if (imp->imp_connection) + libcfs_nid2str_r(imp->imp_connection->c_peer.nid, + nidstr, sizeof(nidstr)); + else + strncpy(nidstr, "", sizeof(nidstr)); + seq_printf(m, " ]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n" + " idle: %lld sec\n", + nidstr, + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count), + ktime_get_real_seconds() - imp->imp_last_reply_time); + spin_unlock(&imp->imp_lock); + + if (!obd->obd_svc_stats) + goto out_climp; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + } else + ret.lc_sum = 0; + seq_printf(m, " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: %llu %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for(j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + seq_printf(m, " service_estimates:\n" + " services: %u sec\n" + " network: %u sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + seq_printf(m, " transactions:\n" + " last_replay: %llu\n" + " peer_committed: %llu\n" + " last_checked: %llu\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, " %s_data_averages:\n" + " bytes_per_rpc: %llu\n", + rw ? 
"write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, " %s_per_rpc: %llu\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + seq_printf(m, " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } + +out_climp: + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_import_seq_show); + +int lprocfs_state_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int j, k; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + seq_printf(m, "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + seq_printf(m, " - [ %lld, %s ]\n", (s64)ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_state_seq_show); + +int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + seq_printf(m, "%3u ", at->at_hist[i]); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_timeouts_show_seq */ +int lprocfs_timeouts_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time64_t now, worstt; + int i; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + now = ktime_get_real_seconds(); + + /* Some network health info for kicks */ + seq_printf(m, "%-10s : %lld, %llds ago\n", + "last reply", (s64)imp->imp_last_reply_time, + (s64)(now - imp->imp_last_reply_time)); + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + seq_printf(m, "%-10s : cur %3u worst %3u (at %lld, %llds ago) ", + "network", cur, worst, (s64)worstt, (s64)(now - worstt)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + seq_printf(m, "portal %-2d : cur %3u worst %3u (at %lld, %llds ago) ", + imp->imp_at.iat_portal[i], cur, worst, (s64)worstt, + (s64)(now - worstt)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_timeouts_seq_show); + +int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + __u64 flags2; + + LPROCFS_CLIMP_CHECK(obd); + flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; + flags2 = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags2; + seq_printf(m, "flags=%#llx\n", flags); + seq_printf(m, "flags2=%#llx\n", flags2); + 
obd_connect_seq_flags2str(m, flags, flags2, "\n"); + seq_printf(m, "\n"); + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_connect_flags_seq_show); + +static const struct attribute *obd_def_uuid_attrs[] = { + &lustre_attr_uuid.attr, + NULL, +}; + +static const struct attribute *obd_def_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_uuid.attr, + NULL, +}; + +static void obd_sysfs_release(struct kobject *kobj) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + complete(&obd->obd_kobj_unregister); +} + +int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) +{ + struct ldebugfs_vars *debugfs_vars = NULL; + int rc; + + if (!obd || obd->obd_magic != OBD_DEVICE_MAGIC) + return -ENODEV; + + rc = kobject_set_name(&obd->obd_kset.kobj, "%s", obd->obd_name); + if (rc) + return rc; + + obd->obd_ktype.sysfs_ops = &lustre_sysfs_ops; + obd->obd_ktype.release = obd_sysfs_release; + + obd->obd_kset.kobj.parent = obd->obd_type->typ_kobj; + obd->obd_kset.kobj.ktype = &obd->obd_ktype; + init_completion(&obd->obd_kobj_unregister); + rc = kset_register(&obd->obd_kset); + if (rc) + return rc; + + if (uuid_only) + obd->obd_attrs = obd_def_uuid_attrs; + else + obd->obd_attrs = obd_def_attrs; + + rc = sysfs_create_files(&obd->obd_kset.kobj, obd->obd_attrs); + if (rc) { + kset_unregister(&obd->obd_kset); + return rc; + } + + if (!obd->obd_type->typ_procroot) + debugfs_vars = obd->obd_debugfs_vars; + obd->obd_debugfs_entry = ldebugfs_register(obd->obd_name, + obd->obd_type->typ_debugfs_entry, + debugfs_vars, obd); + if (IS_ERR_OR_NULL(obd->obd_debugfs_entry)) { + rc = obd->obd_debugfs_entry ? 
PTR_ERR(obd->obd_debugfs_entry) + : -ENOMEM; + CERROR("error %d setting up debugfs for %s\n", + rc, obd->obd_name); + obd->obd_debugfs_entry = NULL; + + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + kset_unregister(&obd->obd_kset); + return rc; + } + + if (obd->obd_proc_entry || !obd->obd_type->typ_procroot) + GOTO(already_registered, rc); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); + obd->obd_proc_entry = NULL; + + ldebugfs_remove(&obd->obd_debugfs_entry); + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + kset_unregister(&obd->obd_kset); + return rc; + } +already_registered: + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + + if (!IS_ERR_OR_NULL(obd->obd_debugfs_entry)) + ldebugfs_remove(&obd->obd_debugfs_entry); + + /* obd device never allocated a kset */ + if (!obd->obd_kset.kobj.state_initialized) + return 0; + + if (obd->obd_attrs) { + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + } + + kset_unregister(&obd->obd_kset); + wait_for_completion(&obd->obd_kobj_unregister); + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) +{ + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; + + LASSERT(stats->ls_percpu[cpuid] == NULL); + LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); + + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); + if (stats->ls_percpu[cpuid]) { + rc = 0; + if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, flags); + else + spin_lock(&stats->ls_lock); + if (stats->ls_biggest_alloc_num <= cpuid) + stats->ls_biggest_alloc_num = cpuid + 1; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, flags); + } else { + spin_unlock(&stats->ls_lock); + } + } + /* initialize the ls_percpu[cpuid] non-zero counter */ + for (i = 0; i < stats->ls_num; ++i) { + cntr = lprocfs_stats_counter_get(stats, cpuid, i); + cntr->lc_min = LC_MIN_INIT; + } + } + return rc; +} + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (!stats) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + LIBCFS_ALLOC(stats->ls_cnt_header, + 
stats->ls_num * sizeof(struct lprocfs_counter_header)); + if (!stats->ls_cnt_header) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (!stats->ls_percpu[0]) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (!stats || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i]) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header) + LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * + sizeof(struct lprocfs_counter_header)); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ + unsigned long flags = 0; + unsigned int num_cpu; + unsigned int i; + u64 ret = 0; + + LASSERT(stats); + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; i++) { + struct lprocfs_counter *cntr; + + if (!stats->ls_percpu[i]) + continue; + + cntr = lprocfs_stats_counter_get(stats, i, idx); + ret += lprocfs_read_helper(cntr, &stats->ls_cnt_header[idx], + stats->ls_flags, field); + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); + return ret; +} +EXPORT_SYMBOL(lprocfs_stats_collector); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (!stats->ls_percpu[i]) + continue; + for (j = 0; j < stats->ls_num; j++) { + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + + return (*pos < stats->ls_num) ? 
pos : NULL; +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + + return lprocfs_stats_seq_start(p, pos); +} + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; + + if (idx == 0) { + struct timespec64 now; + + ktime_get_real_ts64(&now); + seq_printf(p, "%-25s %llu.%09lu secs.nsecs\n", + "snapshot_time", (s64)now.tv_sec, now.tv_nsec); + } + + hdr = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ctr); + + if (ctr.lc_count == 0) + return 0; + + seq_printf(p, "%-25s %lld samples [%s]", hdr->lc_name, + ctr.lc_count, hdr->lc_units); + + if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && ctr.lc_count > 0) { + seq_printf(p, " %lld %lld %lld", + ctr.lc_min, ctr.lc_max, ctr.lc_sum); + if (hdr->lc_config & LPROCFS_CNTR_STDDEV) + seq_printf(p, " %llu", ctr.lc_sumsquare); + } + seq_putc(p, '\n'); + return 0; +} + +static const struct seq_operations lprocfs_stats_seq_sops = { + .start = lprocfs_stats_seq_start, + .stop = lprocfs_stats_seq_stop, + .next = lprocfs_stats_seq_next, + .show = lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = inode->i_private ? inode->i_private : PDE_DATA(inode); + return 0; +} + +const struct file_operations ldebugfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; +EXPORT_SYMBOL(ldebugfs_stats_seq_fops); + +static const struct proc_ops lprocfs_stats_seq_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lprocfs_stats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_stats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, +}; + +int ldebugfs_register_stats(struct dentry *parent, const char *name, + struct lprocfs_stats *stats) +{ + struct dentry *entry; + + LASSERT(!IS_ERR_OR_NULL(parent)); + + entry = debugfs_create_file(name, 0644, parent, stats, + &ldebugfs_stats_seq_fops); + if (IS_ERR_OR_NULL(entry)) + return entry ? 
PTR_ERR(entry) : -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(ldebugfs_register_stats); + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = proc_create_data(name, 0644, root, + &lprocfs_stats_seq_fops, stats); + if (!entry) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (!stats->ls_percpu[i]) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +static const char * const mps_stats[] = { + [LPROC_MD_CLOSE] = "close", + [LPROC_MD_CREATE] = "create", + [LPROC_MD_ENQUEUE] = "enqueue", + [LPROC_MD_GETATTR] = "getattr", + [LPROC_MD_INTENT_LOCK] = "intent_lock", + [LPROC_MD_LINK] = "link", + [LPROC_MD_RENAME] = "rename", + [LPROC_MD_SETATTR] = "setattr", + [LPROC_MD_FSYNC] = "fsync", + [LPROC_MD_READ_PAGE] = "read_page", + [LPROC_MD_UNLINK] = "unlink", + [LPROC_MD_SETXATTR] = "setxattr", + [LPROC_MD_GETXATTR] = "getxattr", + [LPROC_MD_INTENT_GETATTR_ASYNC] = "intent_getattr_async", + [LPROC_MD_REVALIDATE_LOCK] = "revalidate_lock", +}; + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned int num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + /* + * TODO Ensure that this function is only used where + * appropriate by adding an assertion to the effect that + * obd->obd_type->typ_md_ops is not NULL. We can't do this now + * because mdt_procfs_init() uses this function to allocate + * the stats backing /proc/fs/lustre/mdt/.../md_stats but the + * mdt layer does not use the md_ops interface. This is + * confusing and a waste of memory. See LU-2484. + */ + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_md_stats == NULL); + + num_stats = ARRAY_SIZE(mps_stats) + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (!stats) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(mps_stats); i++) { + lprocfs_counter_init(stats, i, 0, mps_stats[i], "reqs"); + if (!stats->ls_cnt_header[i].lc_name) { + CERROR("Missing md_stat initializer md_op operation at offset %d. 
Aborting.\n", + i); + LBUG(); + } + } + + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_md_stats = stats; + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->obd_md_stats; + + if (stats) { + obd->obd_md_stats = NULL; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +__s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field) +{ + __s64 ret = 0; + + if (!lc || !header) + RETURN(0); + + switch (field) { + case LPROCFS_FIELDS_FLAGS_CONFIG: + ret = header->lc_config; + break; + case LPROCFS_FIELDS_FLAGS_SUM: + ret = lc->lc_sum; + if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + ret += lc->lc_sum_irq; + break; + case LPROCFS_FIELDS_FLAGS_MIN: + ret = lc->lc_min; + break; + case LPROCFS_FIELDS_FLAGS_MAX: + ret = lc->lc_max; + break; + case LPROCFS_FIELDS_FLAGS_AVG: + ret = (lc->lc_max - lc->lc_min) / 2; + break; + case LPROCFS_FIELDS_FLAGS_SUMSQUARE: + ret = lc->lc_sumsquare; + break; + case LPROCFS_FIELDS_FLAGS_COUNT: + ret = lc->lc_count; + break; + default: + break; + }; + RETURN(ret); +} +EXPORT_SYMBOL(lprocfs_read_helper); + +/* Obtains the conversion factor for the unit specified */ +static int get_mult(char unit, __u64 *mult) +{ + __u64 units = 1; + + switch (unit) { + /* peta, tera, giga, mega, and kilo */ + case 'p': + case 'P': + units <<= 10; + fallthrough; + case 't': + case 'T': + units <<= 10; + fallthrough; + case 'g': + case 'G': + units <<= 10; + fallthrough; + case 'm': + case 'M': + units <<= 10; + fallthrough; + case 'k': + case 'K': + units <<= 10; + break; + /* some tests expect % to be accepted */ + case '%': + units = 1; + break; + default: + return -EINVAL; + } + + *mult = units; + + return 0; +} + +/* + * Ensures the numeric string is valid. The function provides the final + * multiplier in the case a unit exists at the end of the string. It also + * locates the start of the whole and fractional parts (if any). This + * function modifies the string so kstrtoull can be used to parse both + * the whole and fraction portions. This function also figures out + * the base of the number. 
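[Editorial note, not part of the patch] The cascading fallthrough in get_mult() above gives each recognised suffix one more <<10 shift than the next smaller one, so 'k'/'K' is 2^10 and 'p'/'P' is 2^50, while '%' is accepted with a multiplier of 1. A minimal userspace sketch of the same cascade (demo_get_mult is a hypothetical name used only for illustration):

#include <stdint.h>
#include <stdio.h>

/* Same cascade as get_mult(): each case falls through and adds one <<10. */
static int demo_get_mult(char unit, uint64_t *mult)
{
	uint64_t units = 1;

	switch (unit) {
	case 'p': case 'P': units <<= 10; /* fall through */
	case 't': case 'T': units <<= 10; /* fall through */
	case 'g': case 'G': units <<= 10; /* fall through */
	case 'm': case 'M': units <<= 10; /* fall through */
	case 'k': case 'K': units <<= 10; break;
	case '%': units = 1; break;       /* some tests expect '%' */
	default: return -1;
	}
	*mult = units;
	return 0;
}

int main(void)
{
	const char suffixes[] = { 'k', 'm', 'g', 't', 'p', '%' };
	uint64_t m;

	for (unsigned int i = 0; i < sizeof(suffixes); i++)
		if (!demo_get_mult(suffixes[i], &m))
			printf("%c -> %llu\n", suffixes[i],
			       (unsigned long long)m);
	return 0;	/* prints 1024, 1048576, ..., 1125899906842624, 1 */
}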
+ */ +static int preprocess_numeric_str(char *buffer, __u64 *mult, __u64 def_mult, + bool allow_units, char **whole, char **frac, + unsigned int *base) +{ + bool hit_decimal = false; + bool hit_unit = false; + int rc = 0; + char *start; + *mult = def_mult; + *whole = NULL; + *frac = NULL; + *base = 10; + + /* a hex string if it starts with "0x" */ + if (buffer[0] == '0' && tolower(buffer[1]) == 'x') { + *base = 16; + buffer += 2; + } + + start = buffer; + + while (*buffer) { + /* allow for a single new line before the null terminator */ + if (*buffer == '\n') { + *buffer = '\0'; + buffer++; + + if (*buffer) + return -EINVAL; + + break; + } + + /* any chars after our unit indicates a malformed string */ + if (hit_unit) + return -EINVAL; + + /* ensure we only hit one decimal */ + if (*buffer == '.') { + if (hit_decimal) + return -EINVAL; + + /* if past start, there's a whole part */ + if (start != buffer) + *whole = start; + + *buffer = '\0'; + start = buffer + 1; + hit_decimal = true; + } else if (!isdigit(*buffer) && + !(*base == 16 && isxdigit(*buffer))) { + if (allow_units) { + /* if we allow units, attempt to get mult */ + hit_unit = true; + rc = get_mult(*buffer, mult); + if (rc) + return rc; + + /* string stops here, but keep processing */ + *buffer = '\0'; + } else { + /* bad string */ + return -EINVAL; + } + } + + buffer++; + } + + if (hit_decimal) { + /* hit a decimal, make sure there's a fractional part */ + if (!*start) + return -EINVAL; + + *frac = start; + } else { + /* didn't hit a decimal, but may have a whole part */ + if (start != buffer && *start) + *whole = start; + } + + /* malformed string if we didn't get anything */ + if (!*frac && !*whole) + return -EINVAL; + + return 0; +} + +/* + * Parses a numeric string which can contain a whole and fraction portion + * into a __u64. Accepts a multiplier to apply to the value parsed. Also + * allows the string to have a unit at the end. The function handles + * wrapping of the final unsigned value. + */ +static int str_to_u64_parse(char *buffer, unsigned long count, + __u64 *val, __u64 def_mult, bool allow_units) +{ + __u64 whole = 0; + __u64 frac = 0; + unsigned int frac_d = 1; + __u64 wrap_indicator = ULLONG_MAX; + int rc = 0; + __u64 mult; + char *strwhole; + char *strfrac; + unsigned int base = 10; + + rc = preprocess_numeric_str(buffer, &mult, def_mult, allow_units, + &strwhole, &strfrac, &base); + + if (rc) + return rc; + + if (mult == 0) { + *val = 0; + return 0; + } + + /* the multiplier limits how large the value can be */ + wrap_indicator = div64_u64(wrap_indicator, mult); + + if (strwhole) { + rc = kstrtoull(strwhole, base, &whole); + if (rc) + return rc; + + if (whole > wrap_indicator) + return -ERANGE; + + whole *= mult; + } + + if (strfrac) { + if (strlen(strfrac) > 10) + strfrac[10] = '\0'; + + rc = kstrtoull(strfrac, base, &frac); + if (rc) + return rc; + + /* determine power of fractional portion */ + while (*strfrac) { + frac_d *= base; + strfrac++; + } + + /* fractional portion is too large to perform calculation */ + if (frac > wrap_indicator) + return -ERANGE; + + frac *= mult; + do_div(frac, frac_d); + } + + /* check that the sum of whole and fraction fits in u64 */ + if (whole > (ULLONG_MAX - frac)) + return -ERANGE; + + *val = whole + frac; + + return 0; +} + +/* + * This function parses numeric/hex strings into __s64. It accepts a multiplier + * which will apply to the value parsed. It also can allow the string to + * have a unit as the last character. 
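[Editorial note, not part of the patch] To make the whole/fraction arithmetic above concrete: for an input such as "2.5k" the whole part is scaled by the multiplier, the fractional digits are scaled and then divided by 10^(number of fractional digits) with truncation (do_div), overflow is rejected beforehand by comparing against ULLONG_MAX / mult, and the fractional string is capped at 10 digits. A small worked example under those assumptions (plain userspace C, hypothetical values):

#include <stdint.h>
#include <stdio.h>

/*
 * Worked example of the whole+fraction arithmetic in str_to_u64_parse()
 * for the input "2.5k": whole = 2, frac = 5, frac_d = 10, mult = 1024.
 */
int main(void)
{
	uint64_t mult = 1024;            /* from the 'k' suffix */
	uint64_t whole = 2 * mult;       /* 2048 */
	uint64_t frac = 5 * mult / 10;   /* 5120 / 10 = 512, truncating like do_div() */

	printf("2.5k -> %llu\n", (unsigned long long)(whole + frac)); /* 2560 */
	return 0;
}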
The function handles overflow/underflow + * of the signed integer. + */ +int lu_str_to_s64(char *buffer, unsigned long count, __s64 *val, char defunit) +{ + __u64 mult = 1; + __u64 tmp; + unsigned int offset = 0; + int signed sign = 1; + __u64 max = LLONG_MAX; + int rc = 0; + + if (defunit != '1') { + rc = get_mult(defunit, &mult); + if (rc) + return rc; + } + + /* keep track of our sign */ + if (*buffer == '-') { + sign = -1; + offset++; + /* equivalent to max = -LLONG_MIN, avoids overflow */ + max++; + } + + rc = str_to_u64_parse(buffer + offset, count - offset, + &tmp, mult, true); + if (rc) + return rc; + + /* check for overflow/underflow */ + if (max < tmp) + return -ERANGE; + + *val = (__s64)tmp * sign; + + return 0; +} +EXPORT_SYMBOL(lu_str_to_s64); + +/** + * Convert a user string into a signed 64 bit number. This function produces + * an error when the value parsed from the string times multiplier underflows or + * overflows. This function only accepts strings that contains digits, an + * optional decimal, and a char representing a unit at the end. If a unit is + * specified in the string, the multiplier provided by the caller is ignored. + * This function can also accept hexadecimal strings which are prefixed with + * "0x". + * + * \param[in] buffer string consisting of numbers, a decimal, and a unit + * \param[in] count buffer length + * \param[in] val if successful, the value represented by the string + * \param[in] defunit default unit if string doesn't contain one + * + * \retval 0 on success + * \retval negative number on error + */ +int lprocfs_str_with_units_to_s64(const char __user *buffer, + unsigned long count, __s64 *val, char defunit) +{ + char kernbuf[22]; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + return lu_str_to_s64(kernbuf, count, val, defunit); +} +EXPORT_SYMBOL(lprocfs_str_with_units_to_s64); + +char *lprocfs_strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (len >= l2) { + len--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} +EXPORT_SYMBOL(lprocfs_strnstr); + +/** + * Find the string \a name in the input \a buffer, and return a pointer to the + * value immediately following \a name, reducing \a count appropriately. + * If \a name is not found the original \a buffer is returned. + */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (!val) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int ldebugfs_seq_create(struct dentry *parent, const char *name, umode_t mode, + const struct file_operations *seq_fops, void *data) +{ + struct dentry *entry; + + /* Disallow secretly (un)writable entries. */ + LASSERT((!seq_fops->write) == (!(mode & 0222))); + + entry = debugfs_create_file(name, mode, parent, data, seq_fops); + if (IS_ERR_OR_NULL(entry)) + return entry ? 
PTR_ERR(entry) : -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(ldebugfs_seq_create); + +int lprocfs_seq_create(struct proc_dir_entry *parent, + const char *name, + mode_t mode, + const struct proc_ops *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + ENTRY; + + /* Disallow secretly (un)writable entries. */ + LASSERT(!seq_fops->proc_write == !(mode & 0222)); + + entry = proc_create_data(name, mode, parent, seq_fops, data); + + if (!entry) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *dev, + const char *name, + mode_t mode, + const struct proc_ops *seq_fops, + void *data) +{ + return (lprocfs_seq_create(dev->obd_proc_entry, name, + mode, seq_fops, data)); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val = 0; + + if (likely(value != 0)) + val = min(fls(value - 1), OBD_HIST_MAX); + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +ssize_t lustre_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); + + return a->show ? a->show(kobj, attr, buf) : 0; +} +EXPORT_SYMBOL_GPL(lustre_attr_show); + +ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); + + return a->store ? 
a->store(kobj, attr, buf, len) : len; +} +EXPORT_SYMBOL_GPL(lustre_attr_store); + +const struct sysfs_ops lustre_sysfs_ops = { + .show = lustre_attr_show, + .store = lustre_attr_store, +}; +EXPORT_SYMBOL_GPL(lustre_sysfs_ops); + +int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_show); + +ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = + ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; + int chunk_mask, rc; + s64 val; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + if (rc) + return rc; + if (val < 0) + return -ERANGE; + + /* if the max_pages is specified in bytes, convert to pages */ + if (val >= ONE_MB_BRW_SIZE) + val >>= PAGE_SHIFT; + + LPROCFS_CLIMP_CHECK(dev); + + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); + /* max_pages_per_rpc must be chunk aligned */ + val = (val + ~chunk_mask) & chunk_mask; + if (val == 0 || (ocd->ocd_brw_size != 0 && + val > ocd->ocd_brw_size >> PAGE_SHIFT)) { + LPROCFS_CLIMP_EXIT(dev); + return -ERANGE; + } + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_pages_per_rpc = val; + client_adjust_max_dirty(cli); + spin_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} +EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_write); + +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + rc = sprintf(buf, "%d\n", cli->cl_max_short_io_bytes); + spin_unlock(&cli->cl_loi_list_lock); + return rc; +} +EXPORT_SYMBOL(short_io_bytes_show); + +/* Used to catch people who think they're specifying pages. 
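[Editorial note, not part of the patch] The max_pages_per_rpc write handler above first converts a byte value (anything >= ONE_MB_BRW_SIZE) to pages with >> PAGE_SHIFT, then rounds the result up to the RPC chunk size with (val + ~chunk_mask) & chunk_mask. A small sketch of that rounding, assuming a hypothetical chunk of 2^(cl_chunkbits - PAGE_SHIFT) = 256 pages:

#include <stdio.h>

/* Round a page count up to the next multiple of the RPC chunk size. */
int main(void)
{
	unsigned int chunk_pages = 256;               /* 1 << (cl_chunkbits - PAGE_SHIFT), assumed */
	unsigned int chunk_mask = ~(chunk_pages - 1); /* ...1111100000000 */
	unsigned int val = 300;

	val = (val + ~chunk_mask) & chunk_mask;       /* (300 + 255) & ~255 = 512 */
	printf("rounded to %u pages\n", val);
	return 0;
}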
*/ +#define MIN_SHORT_IO_BYTES 64U + +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + u32 val; + int rc; + + LPROCFS_CLIMP_CHECK(dev); + + rc = kstrtouint(buffer, 0, &val); + if (rc) + GOTO(out, rc); + + if (val && (val < MIN_SHORT_IO_BYTES || val > OBD_MAX_SHORT_IO_BYTES)) + GOTO(out, rc = -ERANGE); + + rc = count; + + spin_lock(&cli->cl_loi_list_lock); + if (val > (cli->cl_max_pages_per_rpc << PAGE_SHIFT)) + rc = -ERANGE; + else + cli->cl_max_short_io_bytes = val; + spin_unlock(&cli->cl_loi_list_lock); + +out: + LPROCFS_CLIMP_EXIT(dev); + return rc; +} +EXPORT_SYMBOL(short_io_bytes_store); + +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + int rc; + char kernbuf[64], *tmp, *errmsg; + unsigned long uid, gid; + ENTRY; + + if (count >= sizeof(kernbuf)) { + errmsg = "string too long"; + GOTO(failed_noprint, rc = -EINVAL); + } + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + GOTO(failed_noprint, rc = -EFAULT); + } + kernbuf[count] = '\0'; + + /* look for uid gid separator */ + tmp = strchr(kernbuf, ':'); + if (!tmp) { + errmsg = "needs uid:gid format"; + GOTO(failed, rc = -EINVAL); + } + *tmp = '\0'; + tmp++; + + /* parse uid */ + if (kstrtoul(kernbuf, 0, &uid) != 0) { + errmsg = "bad uid"; + GOTO(failed, rc = -EINVAL); + } + + /* parse gid */ + if (kstrtoul(tmp, 0, &gid) != 0) { + errmsg = "bad gid"; + GOTO(failed, rc = -EINVAL); + } + + squash->rsi_uid = uid; + squash->rsi_gid = gid; + + LCONSOLE_INFO("%s: root_squash is set to %u:%u\n", + name, squash->rsi_uid, squash->rsi_gid); + RETURN(count); + +failed: + if (tmp) { + tmp--; + *tmp = ':'; + } + CWARN("%s: failed to set root_squash to \"%s\", %s, rc = %d\n", + name, kernbuf, errmsg, rc); + RETURN(rc); +failed_noprint: + CWARN("%s: failed to set root_squash due to %s, rc = %d\n", + name, errmsg, rc); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_root_squash); + + +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + int rc; + char *kernbuf = NULL; + char *errmsg; + struct list_head tmp; + int len = count; + ENTRY; + + if (count > 4096) { + errmsg = "string too long"; + GOTO(failed, rc = -EINVAL); + } + + OBD_ALLOC(kernbuf, count + 1); + if (!kernbuf) { + errmsg = "no memory"; + GOTO(failed, rc = -ENOMEM); + } + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + GOTO(failed, rc = -EFAULT); + } + kernbuf[count] = '\0'; + + if (count > 0 && kernbuf[count - 1] == '\n') + len = count - 1; + + if ((len == 4 && strncmp(kernbuf, "NONE", len) == 0) || + (len == 5 && strncmp(kernbuf, "clear", len) == 0)) { + /* empty string is special case */ + down_write(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + up_write(&squash->rsi_sem); + LCONSOLE_INFO("%s: nosquash_nids is cleared\n", name); + OBD_FREE(kernbuf, count + 1); + RETURN(count); + } + + INIT_LIST_HEAD(&tmp); + if (cfs_parse_nidlist(kernbuf, count, &tmp) <= 0) { + errmsg = "can't parse"; + GOTO(failed, rc = -EINVAL); + } + LCONSOLE_INFO("%s: nosquash_nids set to %s\n", + name, kernbuf); + OBD_FREE(kernbuf, count + 1); + kernbuf = NULL; + + down_write(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) + 
cfs_free_nidlist(&squash->rsi_nosquash_nids); + list_splice(&tmp, &squash->rsi_nosquash_nids); + up_write(&squash->rsi_sem); + + RETURN(count); + +failed: + if (kernbuf) { + CWARN("%s: failed to set nosquash_nids to \"%s\", %s rc = %d\n", + name, kernbuf, errmsg, rc); + OBD_FREE(kernbuf, count + 1); + } else { + CWARN("%s: failed to set nosquash_nids due to %s rc = %d\n", + name, errmsg, rc); + } + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_nosquash_nids); + +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c new file mode 100644 index 0000000000000..4df66a941e535 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c @@ -0,0 +1,908 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lprocfs_status_server.c + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include +#include + +#define MAX_STRING_SIZE 128 + +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...) 
+{ + struct dentry *entry = NULL; + struct dentry *parent; + struct qstr dname; + va_list ap; + char *dest; + + if (!target || !format) + return NULL; + + dname.name = target; + dname.len = strlen(dname.name); + dname.hash = ll_full_name_hash(debugfs_lustre_root, + dname.name, dname.len); + parent = d_lookup(debugfs_lustre_root, &dname); + if (!parent) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + goto no_entry; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = debugfs_create_symlink(name, parent, dest); + if (IS_ERR_OR_NULL(entry)) { + CERROR("LdebugFS: Could not create symbolic link from %s to %s\n", + name, dest); + entry = NULL; + } + + OBD_FREE(dest, MAX_STRING_SIZE + 1); +no_entry: + dput(parent); + return entry; +} +EXPORT_SYMBOL(ldebugfs_add_symlink); + +#ifdef CONFIG_PROC_FS + +int lprocfs_evict_client_open(struct inode *inode, struct file *f) +{ + struct obd_device *obd = PDE_DATA(file_inode(f)); + + atomic_inc(&obd->obd_evict_inprogress); + return 0; +} + +int lprocfs_evict_client_release(struct inode *inode, struct file *f) +{ + struct obd_device *obd = PDE_DATA(file_inode(f)); + + atomic_dec(&obd->obd_evict_inprogress); + wake_up(&obd->obd_evict_inprogress_waitq); + + return 0; +} + +#define BUFLEN (UUID_MAX + 5) + +ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + char *tmpbuf, *kbuf; + + OBD_ALLOC(kbuf, BUFLEN); + if (kbuf == NULL) + return -ENOMEM; + + /* + * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1 + * bytes into kbuf, to ensure that the string is NUL-terminated. + * UUID_MAX should include a trailing NUL already. 
+ */ + if (copy_from_user(kbuf, buffer, + min_t(unsigned long, BUFLEN - 1, count))) { + count = -EFAULT; + goto out; + } + tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count)); + class_incref(obd, __func__, current); + + if (strncmp(tmpbuf, "nid:", 4) == 0) + obd_export_evict_by_nid(obd, tmpbuf + 4); + else if (strncmp(tmpbuf, "uuid:", 5) == 0) + obd_export_evict_by_uuid(obd, tmpbuf + 5); + else + obd_export_evict_by_uuid(obd, tmpbuf); + + class_decref(obd, __func__, current); + +out: + OBD_FREE(kbuf, BUFLEN); + return count; +} +EXPORT_SYMBOL(lprocfs_evict_client_seq_write); + +#undef BUFLEN + +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->obd_num_exports); +} +EXPORT_SYMBOL(num_exports_show); + +static int obd_export_flags2str(struct obd_export *exp, struct seq_file *m) +{ + bool first = true; + + flag2str(exp, failed); + flag2str(exp, in_recovery); + flag2str(exp, disconnected); + flag2str(exp, connecting); + + return 0; +} + +static int +lprocfs_exp_print_export_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct obd_device *obd; + struct obd_connect_data *ocd; + + LASSERT(exp != NULL); + if (exp->exp_nid_stats == NULL) + goto out; + obd = exp->exp_obd; + ocd = &exp->exp_connect_data; + + seq_printf(m, "%s:\n" + " name: %s\n" + " client: %s\n" + " connect_flags: [ ", + obd_uuid2str(&exp->exp_client_uuid), + obd->obd_name, + obd_export_nid2str(exp)); + obd_connect_seq_flags2str(m, ocd->ocd_connect_flags, + ocd->ocd_connect_flags2, ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " export_flags: [ "); + obd_export_flags2str(exp, m); + seq_printf(m, " ]\n"); + + if (obd->obd_type && + strcmp(obd->obd_type->typ_name, "obdfilter") == 0) { + struct filter_export_data *fed = &exp->exp_filter_data; + + seq_printf(m, " grant:\n"); + seq_printf(m, " granted: %ld\n", + fed->fed_ted.ted_grant); + seq_printf(m, " dirty: %ld\n", + fed->fed_ted.ted_dirty); + seq_printf(m, " pending: %ld\n", + fed->fed_ted.ted_pending); + } + +out: + return 0; +} + +/** + * RPC connections are composed of an import and an export. Using the + * lctl utility we can extract important information about the state. + * The lprocfs_exp_export_seq_show routine displays the state information + * for the export. + * + * \param[in] m seq file + * \param[in] data unused + * + * \retval 0 on success + * + * The format of the export state information is like: + * a793e354-49c0-aa11-8c4f-a4f2b1a1a92b: + * name: MGS + * client: 10.211.55.10@tcp + * connect_flags: [ version, barrier, adaptive_timeouts, ... ] + * connect_data: + * flags: 0x2000011005002020 + * instance: 0 + * target_version: 2.10.51.0 + * export_flags: [ ... 
] + * + */ +static int lprocfs_exp_export_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_export_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_export); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nid2str(client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); + return; +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + struct cfs_hash *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + ENTRY; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } + EXIT; +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +static int +lprocfs_exp_print_uuid_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_nid_stats != NULL) + seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid)); + return 0; +} + +static int +lprocfs_exp_print_nodemap_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct lu_nodemap *nodemap = exp->exp_target_data.ted_nodemap; + + if (nodemap != NULL) + seq_printf(m, "%s\n", nodemap->nm_name); + return 0; +} + +static int +lprocfs_exp_nodemap_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_nodemap_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_nodemap); + +static int lprocfs_exp_uuid_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_uuid_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_uuid); + +static int +lprocfs_exp_print_hash_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_lock_hash != NULL) { + cfs_hash_debug_header(m); + cfs_hash_debug_str(hs, m); + } + return 0; +} + +static int lprocfs_exp_hash_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_hash_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_hash); + +int lprocfs_exp_print_replydata_seq(struct cfs_hash *hs, struct 
cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "reply_cnt: %d\n" + "reply_max: %d\n" + "reply_released_by_xid: %d\n" + "reply_released_by_tag: %d\n\n", + ted->ted_reply_cnt, + ted->ted_reply_max, + ted->ted_release_xid, + ted->ted_release_tag); + return 0; +} + +int lprocfs_exp_replydata_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_replydata_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_replydata); + +int lprocfs_exp_print_fmd_count_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "%d\n", ted->ted_fmd_count); + + return 0; +} + +int lprocfs_exp_fmd_count_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_fmd_count_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_fmd_count); + +int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) +{ + seq_puts(m, "Write into this file to clear all nid stats and stale nid entries\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_seq_show); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + ENTRY; + + CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. */ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + RETURN(1); + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + RETURN(0); +} + +ssize_t +lprocfs_nid_stats_clear_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct nid_stat *client_stat; + struct list_head free_list; + + INIT_LIST_HEAD(&free_list); + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_seq_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + struct proc_dir_entry *entry; + char nidstr[LNET_NIDSTR_SIZE]; + int rc = 0; + ENTRY; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + RETURN(-EINVAL); + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. 
+ * Anything else is nonsense.*/ + if (nid == NULL || *nid == LNET_NID_ANY) + RETURN(-EALREADY); + + libcfs_nid2str_r(*nid, nidstr, sizeof(nidstr)); + + spin_lock(&exp->exp_lock); + if (exp->exp_nid_stats != NULL) { + spin_unlock(&exp->exp_lock); + RETURN(-EALREADY); + } + spin_unlock(&exp->exp_lock); + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + RETURN(-ENOMEM); + + new_stat->nid = *nid; + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + nid, &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, nidstr, atomic_read(&old_stat->nid_exp_ref_count)); + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + spin_lock(&exp->exp_lock); + if (exp->exp_nid_stats) { + LASSERT(exp->exp_nid_stats == old_stat); + nidstat_putref(exp->exp_nid_stats); + } + exp->exp_nid_stats = old_stat; + spin_unlock(&exp->exp_lock); + GOTO(destroy_new, rc = -EALREADY); + } + /* not found - create */ + new_stat->nid_proc = lprocfs_register(nidstr, + obd->obd_proc_exports_entry, + NULL, NULL); + + if (IS_ERR(new_stat->nid_proc)) { + rc = PTR_ERR(new_stat->nid_proc); + new_stat->nid_proc = NULL; + CERROR("%s: cannot create proc entry for export %s: rc = %d\n", + obd->obd_name, nidstr, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "nodemap", new_stat, + &lprocfs_exp_nodemap_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the nodemap file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", new_stat, + &lprocfs_exp_uuid_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the NID stats file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", new_stat, + &lprocfs_exp_hash_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the hash file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "export", + new_stat, &lprocfs_exp_export_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the export file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "reply_data", new_stat, + &lprocfs_exp_replydata_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the reply_data file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "fmd_count", new_stat, + &lprocfs_exp_fmd_count_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the fmd_count file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + spin_lock(&exp->exp_lock); + exp->exp_nid_stats = new_stat; + spin_unlock(&exp->exp_lock); + + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + RETURN(0); + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + 
cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if (!stat || !exp->exp_obd) + RETURN(0); + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned int num_stats) +{ + struct lprocfs_stats *stats; + int rc; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) + lprocfs_free_stats(&stats); + else + obd->obd_stats = stats; + + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +int lprocfs_hash_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + + cfs_hash_debug_header(m); + cfs_hash_debug_str(obd->obd_uuid_hash, m); + cfs_hash_debug_str(obd->obd_nid_hash, m); + cfs_hash_debug_str(obd->obd_nid_stats_hash, m); + return 0; +} +EXPORT_SYMBOL(lprocfs_hash_seq_show); + +int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct target_distribute_txn_data *tdtd; + + LASSERT(obd != NULL); + + seq_printf(m, "status: "); + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { + seq_printf(m, "INACTIVE\n"); + goto out; + } + + /* sampled unlocked, but really... */ + if (obd->obd_recovering == 0) { + seq_printf(m, "COMPLETE\n"); + seq_printf(m, "recovery_start: %lld\n", + (s64)obd->obd_recovery_start); + seq_printf(m, "recovery_duration: %lld\n", + obd->obd_recovery_end ? + obd->obd_recovery_end - obd->obd_recovery_start : + ktime_get_real_seconds() - obd->obd_recovery_start); + /* Number of clients that have completed recovery */ + seq_printf(m, "completed_clients: %d/%d\n", + atomic_read(&obd->obd_max_recoverable_clients) - + obd->obd_stale_clients, + atomic_read(&obd->obd_max_recoverable_clients)); + seq_printf(m, "replayed_requests: %d\n", + obd->obd_replayed_requests); + seq_printf(m, "last_transno: %lld\n", + obd->obd_next_recovery_transno - 1); + seq_printf(m, "VBR: %s\n", obd->obd_version_recov ? + "ENABLED" : "DISABLED"); + seq_printf(m, "IR: %s\n", obd->obd_no_ir ? + "DISABLED" : "ENABLED"); + goto out; + } + + tdtd = obd->u.obt.obt_lut->lut_tdtd; + if (tdtd && tdtd->tdtd_show_update_logs_retrievers) { + char *buf; + int size = 0; + int count = 0; + + buf = tdtd->tdtd_show_update_logs_retrievers( + tdtd->tdtd_show_retrievers_cbdata, + &size, &count); + if (count > 0) { + seq_printf(m, "WAITING\n"); + seq_printf(m, "non-ready MDTs: %s\n", + buf ? 
buf : "unknown (not enough RAM)"); + seq_printf(m, "recovery_start: %lld\n", + (s64)obd->obd_recovery_start); + seq_printf(m, "time_waited: %lld\n", + (s64)(ktime_get_real_seconds() - + obd->obd_recovery_start)); + } + + if (buf != NULL) + OBD_FREE(buf, size); + + if (likely(count > 0)) + goto out; + } + + /* recovery won't start until the clients connect */ + if (obd->obd_recovery_start == 0) { + seq_printf(m, "WAITING_FOR_CLIENTS\n"); + goto out; + } + + seq_printf(m, "RECOVERING\n"); + seq_printf(m, "recovery_start: %lld\n", (s64)obd->obd_recovery_start); + seq_printf(m, "time_remaining: %lld\n", + ktime_get_real_seconds() >= + obd->obd_recovery_start + + obd->obd_recovery_timeout ? 0 : + (s64)(obd->obd_recovery_start + + obd->obd_recovery_timeout - + ktime_get_real_seconds())); + seq_printf(m, "connected_clients: %d/%d\n", + atomic_read(&obd->obd_connected_clients), + atomic_read(&obd->obd_max_recoverable_clients)); + /* Number of clients that have completed recovery */ + seq_printf(m, "req_replay_clients: %d\n", + atomic_read(&obd->obd_req_replay_clients)); + seq_printf(m, "lock_repay_clients: %d\n", + atomic_read(&obd->obd_lock_replay_clients)); + seq_printf(m, "completed_clients: %d\n", + atomic_read(&obd->obd_connected_clients) - + atomic_read(&obd->obd_lock_replay_clients)); + seq_printf(m, "evicted_clients: %d\n", obd->obd_stale_clients); + seq_printf(m, "replayed_requests: %d\n", obd->obd_replayed_requests); + seq_printf(m, "queued_requests: %d\n", + obd->obd_requests_queued_for_recovery); + seq_printf(m, "next_transno: %lld\n", + obd->obd_next_recovery_transno); +out: + return 0; +} +EXPORT_SYMBOL(lprocfs_recovery_status_seq_show); + +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_ir_factor); +} +EXPORT_SYMBOL(ir_factor_show); + +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + + if (val < OBD_IR_FACTOR_MIN || val > OBD_IR_FACTOR_MAX) + return -EINVAL; + + obd->obd_recovery_ir_factor = val; + return count; +} +EXPORT_SYMBOL(ir_factor_store); + +int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + LASSERT(obd != NULL); + seq_printf(m, "%d\n", obd->obd_checksum_dump); + return 0; +} +EXPORT_SYMBOL(lprocfs_checksum_dump_seq_show); + +ssize_t +lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + bool val; + int rc; + + LASSERT(obd != NULL); + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + obd->obd_checksum_dump = val; + return count; +} +EXPORT_SYMBOL(lprocfs_checksum_dump_seq_write); + +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%ld\n", obd->obd_recovery_timeout); +} +EXPORT_SYMBOL(recovery_time_soft_show); + +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct 
obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + obd->obd_recovery_timeout = val; + return count; +} +EXPORT_SYMBOL(recovery_time_soft_store); + +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%ld\n", obd->obd_recovery_time_hard); +} +EXPORT_SYMBOL(recovery_time_hard_show); + +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + obd->obd_recovery_time_hard = val; + return count; +} +EXPORT_SYMBOL(recovery_time_hard_store); + +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_device_target *target = &obd->u.obt; + + LASSERT(target->obt_magic == OBT_MAGIC); + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.obt.obt_instance); +} +EXPORT_SYMBOL(instance_show); + +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c new file mode 100644 index 0000000000000..42e880e8a3948 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -0,0 +1,2586 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include /* hash_long() */ +#include +#include +#include +#include +#include +#include +#include +#include + +struct lu_site_bkt_data { + /** + * LRU list, updated on each access to object. Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). 
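[Editorial note, not part of the patch] The LRU convention described above (cold end at ls_lru.next, recently used objects appended at ls_lru.prev via list_add_tail()) can be seen with a toy circular list. The stand-in list helpers below are simplified versions of the kernel ones and are only illustrative:

#include <stdio.h>

/*
 * Minimal stand-in for the kernel list to show the LRU convention used by
 * lu_site_bkt_data: new/accessed objects go to the tail (lru.prev, "hot"),
 * purging walks from the head (lru.next, "cold").
 */
struct node { struct node *next, *prev; int id; };

static void list_init(struct node *h) { h->next = h->prev = h; }

static void list_add_tail(struct node *n, struct node *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

int main(void)
{
	struct node lru, a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct node *p;

	list_init(&lru);
	list_add_tail(&a, &lru);	/* accessed first: coldest */
	list_add_tail(&b, &lru);
	list_add_tail(&c, &lru);	/* accessed last: hottest */

	/* Purge order walks from the cold end, i.e. lru.next first. */
	for (p = lru.next; p != &lru; p = p->next)
		printf("purge candidate %d\n", p->id);	/* 1, 2, 3 */
	return 0;
}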
+ */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()) or initialized (lu_object_start()). + * It is used by lu_object_find() to wait before re-trying when + * object in the process of destruction is found in the hash table; + * or wait object to be initialized by the allocator. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_waitq; +}; + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +#define LU_CACHE_NR_MAX_ADJUST 512 +#define LU_CACHE_NR_UNLIMITED -1 +#define LU_CACHE_NR_DEFAULT LU_CACHE_NR_UNLIMITED +#define LU_CACHE_NR_LDISKFS_LIMIT LU_CACHE_NR_UNLIMITED +/** This is set to roughly (20 * OSS_NTHRS_MAX) to prevent thrashing */ +#define LU_CACHE_NR_ZFS_LIMIT 10240 + +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +#define LU_SITE_BITS_MAX_CL 19 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +module_param(lu_cache_percent, int, 0644); +MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); + +static long lu_cache_nr = LU_CACHE_NR_DEFAULT; +module_param(lu_cache_nr, long, 0644); +MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); + +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct cfs_hash_bd bd; + struct lu_site_bkt_data *bkt; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + return &bkt->lsb_waitq; +} +EXPORT_SYMBOL(lu_site_wq_from_fid); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top = o->lo_header; + struct lu_site *site = o->lo_dev->ld_site; + struct lu_object *orig = o; + struct cfs_hash_bd bd; + const struct lu_fid *fid = lu_object_fid(o); + bool is_dying; + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. + */ + if (fid_is_zero(fid)) { + LASSERT(top->loh_hash.next == NULL + && top->loh_hash.pprev == NULL); + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + + is_dying = lu_object_is_dying(top); + if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { + /* at this point the object reference is dropped and lock is + * not taken, so lu_object should not be touched because it + * can be freed by concurrent thread. Use local variable for + * check. + */ + if (is_dying) { + /* + * somebody may be waiting for this, currently only + * used for cl_object, see cl_object_put_last(). 
+ */ + wake_up_all(&bkt->lsb_waitq); + } + return; + } + + /* + * When last reference is released, iterate over object + * layers, and notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + /* don't use local 'is_dying' here because if was taken without lock + * but here we need the latest actual value of it so check lu_object + * directly here. + */ + if (!lu_object_is_dying(top) && + (lu_object_exists(orig) || lu_object_is_cl(orig))) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + percpu_counter_inc(&site->ls_lru_len_counter); + CDEBUG(D_INODE, "Add %p/%p to site lru. hash: %p, bkt: %p\n", + orig, top, site->ls_obj_hash, bkt); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + return; + } + + /* + * If object is dying (will not be cached) then remove it + * from hash table and LRU. + * + * This is done with hash table and LRU lists locked. As the only + * way to acquire first reference to previously unreferenced + * object is through hash-table lookup (lu_object_find()), + * or LRU scanning (lu_site_purge()), that are done under hash-table + * and LRU lock, no race with concurrent object lookup is possible + * and we can safely destroy object below. + */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + /* + * Object was already removed from hash and lru above, can + * kill it. + */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + struct lu_site *site = o->lo_dev->ld_site; + struct cfs_hash *obj_hash = site->ls_obj_hash; + struct cfs_hash_bd bd; + + cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1); + if (!list_empty(&top->loh_lru)) { + struct lu_site_bkt_data *bkt; + + list_del_init(&top->loh_lru); + bkt = cfs_hash_bd_extra_get(obj_hash, &bd); + percpu_counter_dec(&site->ls_lru_len_counter); + } + cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(obj_hash, &bd, 1); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f) +{ + struct lu_object *top; + + /* + * Create top-level object slice. This will also create + * lu_object_header. + */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + return ERR_PTR(-ENOMEM); + if (IS_ERR(top)) + return top; + /* + * This is the only place where object fid is assigned. 
It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + + return top; +} + +/** + * Initialize object. + * + * This is called after object hash insertion to avoid returning an object with + * stale attributes. + */ +static int lu_object_start(const struct lu_env *env, struct lu_device *dev, + struct lu_object *top, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + + layers = &top->lo_header->loh_layers; + + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. + */ + clean = 1; + init_flag = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (init_mask & init_flag) + goto next; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result) + return result; + + init_mask |= init_flag; +next: + init_flag <<= 1; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result) + return result; + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + + set_bit(LU_OBJECT_INITED, &top->lo_header->loh_flags); + + return 0; +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + wait_queue_head_t *wq; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + struct list_head splice; + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + wq = lu_site_wq_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. + */ + INIT_LIST_HEAD(&splice); + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of0(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(wq)) + wake_up_all(wq); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + * if canblock is 0, then don't block awaiting for another + * instance of lu_site_purge() to complete + */ +int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, + int nr, int canblock) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + struct cfs_hash_bd bd2; + struct list_head dispose; + int did_sth; + unsigned int start = 0; + int count; + int bnr; + unsigned int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + RETURN(0); + + INIT_LIST_HEAD(&dispose); + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. + */ + if (nr != ~0) + start = s->ls_purge_start; + bnr = (nr == ~0) ? 
-1 : nr / (int)CFS_HASH_NBKT(s->ls_obj_hash) + 1; + again: + /* + * It doesn't make any sense to make purge threads parallel, that can + * only bring troubles to us. See LU-5331. + */ + if (canblock != 0) + mutex_lock(&s->ls_purge_mutex); + else if (mutex_trylock(&s->ls_purge_mutex) == 0) + goto out; + + did_sth = 0; + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + if (i < start) + continue; + count = bnr; + cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2); + LASSERT(bd.bd_bucket == bd2.bd_bucket); + + cfs_hash_bd_del_locked(s->ls_obj_hash, + &bd2, &h->loh_hash); + list_move(&h->loh_lru, &dispose); + percpu_counter_dec(&s->ls_lru_len_counter); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while (!list_empty(&dispose)) { + h = container_of0(dispose.next, + struct lu_object_header, loh_lru); + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + mutex_unlock(&s->ls_purge_mutex); + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash); + +out: + return nr; +} +EXPORT_SYMBOL(lu_site_purge_objects); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. + * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. + */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +static struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD | LCT_LOCAL, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) 
+{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. + */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s\n", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. + */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + hlist_unhashed(&hdr->loh_hash) ? "" : " hash", + list_empty((struct list_head *)&hdr->loh_lru) ? \ + "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist" : ""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth = 4; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{\n"); + + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + + if (o->lo_ops->loo_object_print != NULL) + (*o->lo_ops->loo_object_print)(env, cookie, printer, o); + + (*printer)(env, cookie, "\n"); + } + + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} + +static struct lu_object *htable_lookup(struct lu_site *s, + struct cfs_hash_bd *bd, + const struct lu_fid *f, + __u64 *version) +{ + struct lu_object_header *h; + struct hlist_node *hnode; + __u64 ver = cfs_hash_bd_version_get(bd); + + if (*version == ver) + return ERR_PTR(-ENOENT); + + *version = ver; + /* cfs_hash_bd_peek_locked is a somehow "internal" function + * of cfs_hash, it doesn't add refcount on object. */ + hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); + if (!hnode) { + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return ERR_PTR(-ENOENT); + } + + h = container_of0(hnode, struct lu_object_header, loh_hash); + cfs_hash_get(s->ls_obj_hash, hnode); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + if (!list_empty(&h->loh_lru)) { + list_del_init(&h->loh_lru); + percpu_counter_dec(&s->ls_lru_len_counter); + } + return lu_object_top(h); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. 
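For illustration, a minimal caller sketch of the lookup interface documented here (not part of the patch; example_lookup is a hypothetical name): find the object by FID with no special configuration, then drop the reference the lookup acquired.

static int example_lookup(const struct lu_env *env, struct lu_device *dev,
                          const struct lu_fid *fid)
{
        struct lu_object *o;

        o = lu_object_find(env, dev, fid, NULL);
        if (IS_ERR(o))
                return PTR_ERR(o);

        // ... operate on the object through its slices ...

        lu_object_put(env, o);  // release the reference taken by the lookup
        return 0;
}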
+ */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +/* + * Limit the lu_object cache to a maximum of lu_cache_nr objects. Because + * the calculation for the number of objects to reclaim is not covered by + * a lock the maximum number of objects is capped by LU_CACHE_MAX_ADJUST. + * This ensures that many concurrent threads will not accidentally purge + * the entire cache. + */ +static void lu_object_limit(const struct lu_env *env, + struct lu_device *dev) +{ + __u64 size, nr; + + if (lu_cache_nr == LU_CACHE_NR_UNLIMITED) + return; + + size = cfs_hash_size_get(dev->ld_site->ls_obj_hash); + nr = (__u64)lu_cache_nr; + if (size <= nr) + return; + + lu_site_purge_objects(env, dev->ld_site, + MIN(size - nr, LU_CACHE_NR_MAX_ADJUST), 0); +} + +/** + * Core logic of lu_object_find*() functions. + * + * Much like lu_object_find(), but top level device of object is specifically + * \a dev rather than top level device of the site. This interface allows + * objects of different "stacking" to be created within the same site. + */ +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + struct lu_object *shadow; + struct lu_site *s; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + struct lu_site_bkt_data *bkt; + struct l_wait_info lwi = { 0 }; + __u64 version = 0; + int rc; + + ENTRY; + + /* FID is from disk or network, zero FID is meaningless, return error + * early to avoid assertion in lu_object_put. If a zero FID is wanted, + * it should be allocated via lu_object_anon(). + */ + if (fid_is_zero(f)) + RETURN(ERR_PTR(-EINVAL)); + + /* + * This uses standard index maintenance protocol: + * + * - search index under lock, and return object if found; + * - otherwise, unlock index, allocate new object; + * - lock index and search again; + * - if nothing is found (usual case), insert newly created + * object into index; + * - otherwise (race: other thread inserted object), free + * object just allocated. + * - unlock index; + * - return object. + * + * For "LOC_F_NEW" case, we are sure the object is new established. + * It is unnecessary to perform lookup-alloc-lookup-insert, instead, + * just alloc and insert directly. + * + */ + s = dev->ld_site; + hs = s->ls_obj_hash; + + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_OBD_ZERO_NLINK_RACE))) + lu_site_purge(env, s, -1); + + cfs_hash_bd_get(hs, f, &bd); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + if (!(conf && conf->loc_flags & LOC_F_NEW)) { + cfs_hash_bd_lock(hs, &bd, 1); + o = htable_lookup(s, &bd, f, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + + if (!IS_ERR(o)) { + if (likely(lu_object_is_inited(o->lo_header))) + RETURN(o); + + l_wait_event(bkt->lsb_waitq, + lu_object_is_inited(o->lo_header) || + lu_object_is_dying(o->lo_header), &lwi); + + if (lu_object_is_dying(o->lo_header)) { + lu_object_put(env, o); + + RETURN(ERR_PTR(-ENOENT)); + } + + RETURN(o); + } + + if (PTR_ERR(o) != -ENOENT) + RETURN(o); + } + + /* + * Allocate new object, NB, object is unitialized in case object + * is changed between allocation and hash insertion, thus the object + * with stale attributes is returned. 
+ */ + o = lu_object_alloc(env, dev, f); + if (IS_ERR(o)) + RETURN(o); + + LASSERT(lu_fid_eq(lu_object_fid(o), f)); + + CFS_RACE_WAIT(OBD_FAIL_OBD_ZERO_NLINK_RACE); + + cfs_hash_bd_lock(hs, &bd, 1); + + if (conf && conf->loc_flags & LOC_F_NEW) + shadow = ERR_PTR(-ENOENT); + else + shadow = htable_lookup(s, &bd, f, &version); + if (likely(PTR_ERR(shadow) == -ENOENT)) { + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + cfs_hash_bd_unlock(hs, &bd, 1); + + /* + * This may result in rather complicated operations, including + * fld queries, inode loading, etc. + */ + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_put_nocache(env, o); + RETURN(ERR_PTR(rc)); + } + + wake_up_all(&bkt->lsb_waitq); + + lu_object_limit(env, dev); + + RETURN(o); + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + cfs_hash_bd_unlock(hs, &bd, 1); + lu_object_free(env, o); + + if (!(conf && conf->loc_flags & LOC_F_NEW) && + !lu_object_is_inited(shadow->lo_header)) { + l_wait_event(bkt->lsb_waitq, + lu_object_is_inited(shadow->lo_header) || + lu_object_is_dying(shadow->lo_header), &lwi); + + if (lu_object_is_dying(shadow->lo_header)) { + lu_object_put(env, shadow); + + RETURN(ERR_PTR(-ENOENT)); + } + } + + RETURN(shadow); +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. + */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (IS_ERR(top)) + return top; + + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (unlikely(obj == NULL)) { + lu_object_put(env, top); + obj = ERR_PTR(-ENOENT); + } + + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + atomic_set(&ldt->ldt_device_nr, 0); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DECLARE_RWSEM(lu_sites_guard); + +/** + * Global environment used by site shrinker. + */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static int +lu_site_obj_print(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data; + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } + return 0; +} + +/** + * Print all objects in \a s. 
+ */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_cookie = cookie, + .lsp_printer = printer, + }; + + cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg); +} +EXPORT_SYMBOL(lu_site_print); + +/** + * Return desired hash table order. + */ +static unsigned long lu_htable_order(struct lu_device *top) +{ + unsigned long cache_size; + unsigned long bits; + unsigned long bits_max = LU_SITE_BITS_MAX; + + /* + * For ZFS based OSDs the cache should be disabled by default. This + * allows the ZFS ARC maximum flexibility in determining what buffers + * to cache. If Lustre has objects or buffer which it wants to ensure + * always stay cached it must maintain a hold on them. + */ + if (strcmp(top->ld_type->ldt_name, LUSTRE_OSD_ZFS_NAME) == 0) { + lu_cache_percent = 1; + lu_cache_nr = LU_CACHE_NR_ZFS_LIMIT; + return LU_SITE_BITS_MIN; + } + + if (strcmp(top->ld_type->ldt_name, LUSTRE_VVP_NAME) == 0) + bits_max = LU_SITE_BITS_MAX_CL; + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = cfs_totalram_pages(); + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_SHIFT)) + cache_size = 1 << (30 - PAGE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. */ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in" + " the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_SIZE / 1024); + + for (bits = 1; (1 << bits) < cache_size; ++bits) { + ; + } + + return clamp_t(typeof(bits), bits, LU_SITE_BITS_MIN, bits_max); +} + +static unsigned lu_obj_hop_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + struct lu_fid *fid = (struct lu_fid *)key; + __u32 hash; + + hash = fid_flatten32(fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + hash = hash_long(hash, hs->hs_bkt_bits); + + /* give me another random factor */ + hash -= hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *lu_obj_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct lu_object_header, loh_hash); +} + +static void *lu_obj_hop_key(struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return &h->loh_fid; +} + +static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key); +} + +static void lu_obj_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + atomic_inc(&h->loh_ref); +} + +static void lu_obj_hop_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + LBUG(); /* we should never called it */ +} + +static struct cfs_hash_ops 
lu_site_hash_ops = { + .hs_hash = lu_obj_hop_hash, + .hs_key = lu_obj_hop_key, + .hs_keycmp = lu_obj_hop_keycmp, + .hs_object = lu_obj_hop_object, + .hs_get = lu_obj_hop_get, + .hs_put_locked = lu_obj_hop_put_locked, +}; + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. + */ +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + char name[16]; + unsigned long bits; + unsigned int i; + int rc; + ENTRY; + + memset(s, 0, sizeof *s); + mutex_init(&s->ls_purge_mutex); + +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + rc = percpu_counter_init(&s->ls_lru_len_counter, 0, GFP_NOFS); +#else + rc = percpu_counter_init(&s->ls_lru_len_counter, 0); +#endif + if (rc) + return -ENOMEM; + + snprintf(name, sizeof(name), "lu_site_%s", top->ld_type->ldt_name); + for (bits = lu_htable_order(top); + bits >= LU_SITE_BITS_MIN; bits--) { + s->ls_obj_hash = cfs_hash_create(name, bits, bits, + bits - LU_SITE_BKT_BITS, + sizeof(*bkt), 0, 0, + &lu_site_hash_ops, + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF | + CFS_HASH_DEPTH | + CFS_HASH_ASSERT_EMPTY | + CFS_HASH_COUNTER); + if (s->ls_obj_hash != NULL) + break; + } + + if (s->ls_obj_hash == NULL) { + CERROR("failed to create lu_site hash with bits: %lu\n", bits); + return -ENOMEM; + } + + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + INIT_LIST_HEAD(&bkt->lsb_lru); + init_waitqueue_head(&bkt->lsb_waitq); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + RETURN(0); +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. 
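As a usage illustration for the site initialization and finalization routines in this file, a hypothetical sketch of the expected call sequence around a top-level device; example_site_setup is not part of the patch.

static int example_site_setup(struct lu_site *s, struct lu_device *top)
{
        int rc;

        rc = lu_site_init(s, top);      // hash table, LRU buckets, stats
        if (rc)
                return rc;

        rc = lu_site_init_finish(s);    // publish the site on the global list
        if (rc)
                lu_site_fini(s);

        return rc;
}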
+ */ +void lu_site_fini(struct lu_site *s) +{ + down_write(&lu_sites_guard); + list_del_init(&s->ls_linkage); + up_write(&lu_sites_guard); + + percpu_counter_destroy(&s->ls_lru_len_counter); + + if (s->ls_obj_hash != NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. + */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + down_write(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + up_write(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. + */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +/** + * Initialize device \a d of type \a t. + */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (atomic_inc_return(&t->ldt_device_nr) == 1 && + t->ldt_ops->ldto_start != NULL) + t->ldt_ops->ldto_start(t); + + memset(d, 0, sizeof *d); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t = d->ld_type; + + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(atomic_read(&t->ldt_device_nr) > 0); + + if (atomic_dec_and_test(&t->ldt_device_nr) && + t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. + */ +int lu_object_init(struct lu_object *o, struct lu_object_header *h, + struct lu_device *d) +{ + memset(o, 0, sizeof(*o)); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + lu_ref_add_at(&d->ld_reference, &o->lo_dev_ref, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. + */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, &o->lo_dev_ref, + "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. 
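To illustrate how the layering helpers in this file are meant to be combined, a hypothetical sketch that builds a single-slice compound object. struct example_object and example_build_compound are illustrative only, and the real ->ldo_object_alloc() method additionally receives the object header from its caller, so this is a simplification rather than the actual allocation path.

struct example_object {
        struct lu_object_header eo_header;
        struct lu_object        eo_obj;
};

static struct lu_object *example_build_compound(struct lu_device *dev)
{
        struct example_object *eo;

        OBD_ALLOC_PTR(eo);
        if (eo == NULL)
                return NULL;

        lu_object_header_init(&eo->eo_header);          // refcount 1, empty layer list
        lu_object_init(&eo->eo_obj, &eo->eo_header, dev);
        lu_object_add_top(&eo->eo_header, &eo->eo_obj); // first layer of the stack
        // the slice's lu_object_operations would be assigned to eo_obj.lo_ops here

        return &eo->eo_obj;
}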
+ */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof *h); + atomic_set(&h->loh_ref, 1); + INIT_HLIST_NODE(&h->loh_hash); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. + */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + LASSERT(hlist_unhashed(&h->loh_hash)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + +/** + * Finalize and free devices in the device stack. + * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + struct obd_type *type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + type = ldt->ldt_obd_type; + if (type != NULL) { + type->typ_refcnt--; + class_put_type(type); + } + } +} + +enum { + /** + * Maximal number of tld slots. + */ + LU_CONTEXT_KEY_NR = 40 +}; + +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; + +DEFINE_RWLOCK(lu_keys_guard); +static atomic_t lu_key_initing_cnt = ATOMIC_INIT(0); + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static unsigned key_set_version = 0; + +/** + * Register new key. 
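A hedged sketch of how a module would typically define and register its own context key, mirroring lu_global_key earlier in this file; struct example_thread_info and example_thread_key are hypothetical names.

struct example_thread_info {
        char eti_scratch[64];
};

// generates example_key_init()/example_key_fini(), which allocate and free
// the per-context value
LU_KEY_INIT_FINI(example, struct example_thread_info);

static struct lu_context_key example_thread_key = {
        .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD,
        .lct_init = example_key_init,
        .lct_fini = example_key_fini,
};

static int example_key_setup(void)
{
        LU_CONTEXT_KEY_INIT(&example_thread_key);       // fills in lct_owner
        return lu_context_key_register(&example_thread_key);
}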
+ */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + unsigned int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + LASSERT(key->lct_owner != NULL); + + result = -ENFILE; + write_lock(&lu_keys_guard); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i] == NULL) { + key->lct_index = i; + atomic_set(&key->lct_used, 1); + lu_keys[i] = key; + lu_ref_init(&key->lct_reference); + result = 0; + ++key_set_version; + break; + } + } + write_unlock(&lu_keys_guard); + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 1); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + if (atomic_dec_and_test(&key->lct_used)) + wake_up_var(&key->lct_used); + + LASSERT(key->lct_owner != NULL); + if ((ctx->lc_tags & LCT_NOREF) == 0) { + LINVRNT(module_refcount(key->lct_owner) > 0); + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. + */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(key); + + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + + /** + * Wait until all transient contexts referencing this key have + * run lu_context_key::lct_fini() method. + */ + atomic_dec(&key->lct_used); + wait_var_event(&key->lct_used, atomic_read(&key->lct_used) == 0); + + write_lock(&lu_keys_guard); + if (lu_keys[key->lct_index]) { + lu_keys[key->lct_index] = NULL; + lu_ref_fini(&key->lct_reference); + } + write_unlock(&lu_keys_guard); + + LASSERTF(atomic_read(&key->lct_used) == 0, + "key has instances: %d\n", + atomic_read(&key->lct_used)); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) +{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. 
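For the vararg helpers in this group, a short hypothetical sketch; example_key_a and example_key_b stand for keys set up like example_thread_key in the previous sketch, and the argument list must be NULL-terminated.

// assumed to be defined and initialized like example_thread_key above
static struct lu_context_key example_key_a, example_key_b;

static int example_keys_register(void)
{
        // on failure the keys already registered are degistered again
        return lu_context_key_register_many(&example_key_a, &example_key_b,
                                            NULL);
}

static void example_keys_degister(void)
{
        lu_context_key_degister_many(&example_key_a, &example_key_b, NULL);
}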
+ */ +void lu_context_key_quiesce_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. + */ +void lu_context_key_quiesce(struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (!(key->lct_tags & LCT_QUIESCENT)) { + /* + * XXX memory barrier has to go here. + */ + write_lock(&lu_keys_guard); + key->lct_tags |= LCT_QUIESCENT; + + /** + * Wait until all lu_context_key::lct_init() methods + * have completed. + */ + while (atomic_read(&lu_key_initing_cnt) > 0) { + write_unlock(&lu_keys_guard); + CDEBUG(D_INFO, "lu_context_key_quiesce: \"%s\"" + " %p, %d (%d)\n", + key->lct_owner ? key->lct_owner->name : "", + key, atomic_read(&key->lct_used), + atomic_read(&lu_key_initing_cnt)); + schedule(); + write_lock(&lu_keys_guard); + } + + list_for_each_entry(ctx, &lu_context_remembered, + lc_remember) + key_fini(ctx, key->lct_index); + + ++key_set_version; + write_unlock(&lu_keys_guard); + } +} + +void lu_context_key_revive(struct lu_context_key *key) +{ + write_lock(&lu_keys_guard); + key->lct_tags &= ~LCT_QUIESCENT; + ++key_set_version; + write_unlock(&lu_keys_guard); +} + +static void keys_fini(struct lu_context *ctx) +{ + unsigned int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + unsigned int i; + unsigned pre_version; + + /* + * A serialisation with lu_context_key_quiesce() is needed, but some + * "key->lct_init()" are calling kernel memory allocation routine and + * can't be called while holding a spin_lock. + * "lu_keys_guard" is held while incrementing "lu_key_initing_cnt" + * to ensure the start of the serialisation. + * An atomic_t variable is still used, in order not to reacquire the + * lock when decrementing the counter. + */ + read_lock(&lu_keys_guard); + atomic_inc(&lu_key_initing_cnt); + pre_version = key_set_version; + read_unlock(&lu_keys_guard); + +refill: + LINVRNT(ctx->lc_value != NULL); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] == NULL && key != NULL && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. 
+ */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + LASSERT(key->lct_owner != NULL); + if (!(ctx->lc_tags & LCT_NOREF) && + try_module_get(key->lct_owner) == 0) { + /* module is unloading, skip this key */ + continue; + } + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) { + atomic_dec(&lu_key_initing_cnt); + return PTR_ERR(value); + } + + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + } + + read_lock(&lu_keys_guard); + if (pre_version != key_set_version) { + pre_version = key_set_version; + read_unlock(&lu_keys_guard); + goto refill; + } + + ctx->lc_version = key_set_version; + + atomic_dec(&lu_key_initing_cnt); + read_unlock(&lu_keys_guard); + return 0; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. Create values for all keys. + */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof *ctx); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + write_lock(&lu_keys_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + write_unlock(&lu_keys_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + keys_fini(ctx); + + } else { /* could race with key degister */ + write_lock(&lu_keys_guard); + keys_fini(ctx); + list_del_init(&ctx->lc_remember); + write_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + unsigned int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + ctx->lc_state = LCS_LEFT; + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) { + /* could race with key quiescency */ + if (ctx->lc_tags & LCT_REMEMBER) + read_lock(&lu_keys_guard); + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (ctx->lc_value[i] != NULL) { + struct lu_context_key *key; + + key = lu_keys[i]; + LASSERT(key != NULL); + if (key->lct_exit != NULL) + key->lct_exit(ctx, + key, ctx->lc_value[i]); + } + } + + if (ctx->lc_tags & LCT_REMEMBER) + read_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. 
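A minimal sketch of the lu_context life cycle around these primitives; example_use_context is hypothetical and assumes at least one key, such as the example key sketched earlier, has been registered.

static int example_use_context(void)
{
        struct lu_context ctx;
        int rc;

        rc = lu_context_init(&ctx, LCT_MD_THREAD);
        if (rc)
                return rc;

        lu_context_enter(&ctx);
        // per-key values are reachable here via lu_context_key_get(&ctx, key)
        lu_context_exit(&ctx);

        // pick up keys that were registered after lu_context_init()
        rc = lu_context_refill(&ctx);

        lu_context_fini(&ctx);
        return rc;
}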
+ */ +int lu_context_refill(struct lu_context *ctx) +{ + read_lock(&lu_keys_guard); + if (likely(ctx->lc_version == key_set_version)) { + read_unlock(&lu_keys_guard); + return 0; + } + + read_unlock(&lu_keys_guard); + return keys_fill(ctx); +} + +/** + * lu_ctx_tags/lu_ses_tags will be updated if there are new types of + * obd being added. Currently, this is only used on client side, specifically + * for echo device client, for other stack (like ptlrpc threads), context are + * predefined when the lu_device type are registered, during the module probe + * phase. + */ +__u32 lu_context_tags_default = 0; +__u32 lu_session_tags_default = 0; + +void lu_context_tags_update(__u32 tags) +{ + write_lock(&lu_keys_guard); + lu_context_tags_default |= tags; + key_set_version++; + write_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_update); + +void lu_context_tags_clear(__u32 tags) +{ + write_lock(&lu_keys_guard); + lu_context_tags_default &= ~tags; + key_set_version++; + write_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_clear); + +void lu_session_tags_update(__u32 tags) +{ + write_lock(&lu_keys_guard); + lu_session_tags_default |= tags; + key_set_version++; + write_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_update); + +void lu_session_tags_clear(__u32 tags) +{ + write_lock(&lu_keys_guard); + lu_session_tags_default &= ~tags; + key_set_version++; + write_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_clear); + +int lu_env_init(struct lu_env *env, __u32 tags) +{ + int result; + + env->le_ses = NULL; + result = lu_context_init(&env->le_ctx, tags); + if (likely(result == 0)) + lu_context_enter(&env->le_ctx); + return result; +} +EXPORT_SYMBOL(lu_env_init); + +void lu_env_fini(struct lu_env *env) +{ + lu_context_exit(&env->le_ctx); + lu_context_fini(&env->le_ctx); + env->le_ses = NULL; +} +EXPORT_SYMBOL(lu_env_fini); + +int lu_env_refill(struct lu_env *env) +{ + int result; + + result = lu_context_refill(&env->le_ctx); + if (result == 0 && env->le_ses != NULL) + result = lu_context_refill(env->le_ses); + return result; +} +EXPORT_SYMBOL(lu_env_refill); + +/** + * Currently, this API will only be used by echo client. + * Because echo client and normal lustre client will share + * same cl_env cache. So echo client needs to refresh + * the env context after it get one from the cache, especially + * when normal client and echo client co-exist in the same client. 
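A hypothetical echo-client style use of the tag-based refill described here; example_refresh_env and the choice of LCT_DT_THREAD are illustrative, and passing 0 requests no additional session tags.

static int example_refresh_env(struct lu_env *env)
{
        // make sure the cached environment carries at least the DT-thread keys
        return lu_env_refill_by_tags(env, LCT_DT_THREAD, 0);
}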
+ */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + int result; + + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + result = lu_env_refill(env); + + return result; +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + +#ifdef HAVE_SERVER_SUPPORT +struct lu_env_item { + struct task_struct *lei_task; /* rhashtable key */ + struct rhash_head lei_linkage; + struct lu_env *lei_env; + struct rcu_head lei_rcu_head; +}; + +static const struct rhashtable_params lu_env_rhash_params = { + .key_len = sizeof(struct task_struct *), + .key_offset = offsetof(struct lu_env_item, lei_task), + .head_offset = offsetof(struct lu_env_item, lei_linkage), +}; + +struct rhashtable lu_env_rhash; + +struct lu_env_percpu { + struct task_struct *lep_task; + struct lu_env *lep_env ____cacheline_aligned_in_smp; +}; + +static struct lu_env_percpu lu_env_percpu[NR_CPUS]; + +int lu_env_add(struct lu_env *env) +{ + struct lu_env_item *lei, *old; + + LASSERT(env); + + OBD_ALLOC_PTR(lei); + if (!lei) + return -ENOMEM; + + lei->lei_task = current; + lei->lei_env = env; + + old = rhashtable_lookup_get_insert_fast(&lu_env_rhash, + &lei->lei_linkage, + lu_env_rhash_params); + LASSERT(!old); + + return 0; +} +EXPORT_SYMBOL(lu_env_add); + +static void lu_env_item_free(struct rcu_head *head) +{ + struct lu_env_item *lei; + + lei = container_of(head, struct lu_env_item, lei_rcu_head); + OBD_FREE_PTR(lei); +} + +void lu_env_remove(struct lu_env *env) +{ + struct lu_env_item *lei; + const void *task = current; + int i; + + for_each_possible_cpu(i) { + if (lu_env_percpu[i].lep_env == env) { + LASSERT(lu_env_percpu[i].lep_task == task); + lu_env_percpu[i].lep_task = NULL; + lu_env_percpu[i].lep_env = NULL; + } + } + + /* The rcu_lock is not taking in this case since the key + * used is the actual task_struct. This implies that each + * object is only removed by the owning thread, so there + * can never be a race on a particular object. 
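A hedged sketch (server builds only, i.e. under HAVE_SERVER_SUPPORT) of how the per-task environment cache around this code is meant to be used; example_with_published_env is a hypothetical caller.

static int example_with_published_env(struct lu_env *env)
{
        int rc;

        rc = lu_env_add(env);           // keyed by the current task_struct
        if (rc)
                return rc;

        // ... somewhere far down the call chain, without env being passed ...
        LASSERT(lu_env_find() == env);

        lu_env_remove(env);             // must run in the same thread
        return 0;
}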
+ */ + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei && rhashtable_remove_fast(&lu_env_rhash, &lei->lei_linkage, + lu_env_rhash_params) == 0) + call_rcu(&lei->lei_rcu_head, lu_env_item_free); +} +EXPORT_SYMBOL(lu_env_remove); + +struct lu_env *lu_env_find(void) +{ + struct lu_env *env = NULL; + struct lu_env_item *lei; + const void *task = current; + int i = get_cpu(); + + if (lu_env_percpu[i].lep_task == current) { + env = lu_env_percpu[i].lep_env; + put_cpu(); + LASSERT(env); + return env; + } + + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei) { + env = lei->lei_env; + lu_env_percpu[i].lep_task = current; + lu_env_percpu[i].lep_env = env; + } + put_cpu(); + + return env; +} +EXPORT_SYMBOL(lu_env_find); +#define lu_env_rhash_init(rhash, params) rhashtable_init(rhash, params) +#define lu_env_rhash_destroy(rhash) rhashtable_destroy(rhash) +#else +#define lu_env_rhash_init(rhash, params) 0 +#define lu_env_rhash_destroy(rhash) do {} while (0) +#endif /* HAVE_SERVER_SUPPORT */ + +static struct shrinker *lu_site_shrinker; + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(const struct lu_site *s, + lu_site_stats_t *stats, int populated) +{ + struct cfs_hash *hs = s->ls_obj_hash; + struct cfs_hash_bd bd; + unsigned int i; + /* + * percpu_counter_sum_positive() won't accept a const pointer + * as it does modify the struct by taking a spinlock + */ + struct lu_site *s2 = (struct lu_site *)s; + + stats->lss_busy += cfs_hash_size_get(hs) - + percpu_counter_sum_positive(&s2->ls_lru_len_counter); + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 1); + stats->lss_total += cfs_hash_bd_count_get(&bd); + stats->lss_max_search = max((int)stats->lss_max_search, + cfs_hash_bd_depmax_get(&bd)); + if (!populated) { + cfs_hash_bd_unlock(hs, &bd, 1); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + if (!hlist_empty(hhead)) + stats->lss_populated++; + } + cfs_hash_bd_unlock(hs, &bd, 1); + } +} + + +/* + * lu_cache_shrink_count() returns an approximate number of cached objects + * that can be freed by shrink_slab(). A counter, which tracks the + * number of items in the site's lru, is maintained in a percpu_counter + * for each site. The percpu values are incremented and decremented as + * objects are added or removed from the lru. The percpu values are summed + * and saved whenever a percpu value exceeds a threshold. Thus the saved, + * summed value at any given time may not accurately reflect the current + * lru length. But this value is sufficiently accurate for the needs of + * a shrinker. + * + * Using a per cpu counter is a compromise solution to concurrent access: + * lu_object_put() can update the counter without locking the site and + * lu_cache_shrink_count can sum the counters without locking each + * ls_obj_hash bucket. 
+ */ +static unsigned long lu_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long cached = 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return 0; + + down_read(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) + cached += percpu_counter_read_positive(&s->ls_lru_len_counter); + up_read(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n", + cached, sysctl_vfs_cache_pressure); + + return cached; +} + +static unsigned long lu_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long remain = sc->nr_to_scan; + LIST_HEAD(splice); + + if (!(sc->gfp_mask & __GFP_FS)) + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. + */ + return SHRINK_STOP; + + down_write(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + remain = lu_site_purge(&lu_shrink_env, s, remain); + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. + */ + list_move_tail(&s->ls_linkage, &splice); + } + list_splice(&splice, lu_sites.prev); + up_write(&lu_sites_guard); + + return sc->nr_to_scan - remain; +} + +#ifndef HAVE_SHRINKER_COUNT +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. + */ +static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + int cached = 0; + struct shrink_control scv = { + .nr_to_scan = shrink_param(sc, nr_to_scan), + .gfp_mask = shrink_param(sc, gfp_mask) + }; +#if !defined(HAVE_SHRINKER_WANT_SHRINK_PTR) && !defined(HAVE_SHRINK_CONTROL) + struct shrinker* shrinker = NULL; +#endif + + + CDEBUG(D_INODE, "Shrink %lu objects\n", scv.nr_to_scan); + + if (scv.nr_to_scan != 0) + lu_cache_shrink_scan(shrinker, &scv); + + cached = lu_cache_shrink_count(shrinker, &scv); + return cached; +} + +#endif /* HAVE_SHRINKER_COUNT */ + + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +static struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +int lu_debugging_setup(void) +{ + return lu_env_init(&lu_debugging_env, ~0); +} + +void lu_context_keys_dump(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (key != NULL) { + CERROR("[%d]: %p %x (%p,%p,%p) %d %d \"%s\"@%p\n", + i, key, key->lct_tags, + key->lct_init, key->lct_fini, key->lct_exit, + key->lct_index, atomic_read(&key->lct_used), + key->lct_owner ? key->lct_owner->name : "", + key->lct_owner); + lu_ref_print(&key->lct_reference); + } + } +} + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + DEF_SHRINKER_VAR(shvar, lu_cache_shrink, + lu_cache_shrink_count, lu_cache_shrink_scan); + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result != 0) + return result; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. + */ + down_write(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + up_write(&lu_sites_guard); + if (result != 0) + return result; + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, &shvar); + if (lu_site_shrinker == NULL) + return -ENOMEM; + + result = lu_env_rhash_init(&lu_env_rhash, &lu_env_rhash_params); + + return result; +} + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void) +{ + if (lu_site_shrinker != NULL) { + remove_shrinker(lu_site_shrinker); + lu_site_shrinker = NULL; + } + + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + down_write(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + up_write(&lu_sites_guard); + + lu_env_rhash_destroy(&lu_env_rhash); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#ifdef CONFIG_PROC_FS + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. + */ +int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m) +{ + lu_site_stats_t stats; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s, &stats, 1); + + seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + CFS_HASH_NHLIST(s->ls_obj_hash), + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); + return 0; +} +EXPORT_SYMBOL(lu_site_stats_seq_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. 
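As an illustration of the descriptor table format consumed by the helper documented here, a hypothetical sketch; struct example_item, example_item_kmem and example_caches are illustrative names.

struct example_item {
        struct list_head ei_linkage;
};

static struct kmem_cache *example_item_kmem;

static struct lu_kmem_descr example_caches[] = {
        {
                .ckd_cache = &example_item_kmem,
                .ckd_name  = "example_item_kmem",
                .ckd_size  = sizeof(struct example_item)
        },
        {
                .ckd_cache = NULL       // terminates the descriptor array
        }
};

// module init:  rc = lu_kmem_init(example_caches);
// module exit:  lu_kmem_fini(example_caches);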
+ */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). + */ +void lu_kmem_fini(struct lu_kmem_descr *caches) +{ + for (; caches->ckd_cache != NULL; ++caches) { + if (*caches->ckd_cache != NULL) { + kmem_cache_destroy(*caches->ckd_cache); + *caches->ckd_cache = NULL; + } + } +} +EXPORT_SYMBOL(lu_kmem_fini); + +/** + * Temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid) +{ + struct lu_site *s = o->lo_dev->ld_site; + struct lu_fid *old = &o->lo_header->loh_fid; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + + LASSERT(fid_is_zero(old)); + + /* supposed to be unique */ + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1); +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + { + __u64 version = 0; + struct lu_object *shadow; + + shadow = htable_lookup(s, &bd, fid, &version); + /* supposed to be unique */ + LASSERT(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT); + } +#endif + *old = *fid; + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + cfs_hash_bd_unlock(hs, &bd, 1); +} +EXPORT_SYMBOL(lu_object_assign_fid); + +/** + * allocates object with 0 (non-assiged) fid + * XXX: temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf) +{ + struct lu_fid fid; + struct lu_object *o; + int rc; + + fid_zero(&fid); + o = lu_object_alloc(env, dev, &fid); + if (!IS_ERR(o)) { + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_free(env, o); + return ERR_PTR(rc); + } + } + + return o; +} +EXPORT_SYMBOL(lu_object_anon); + +struct lu_buf LU_BUF_NULL = { + .lb_buf = NULL, + .lb_len = 0 +}; +EXPORT_SYMBOL(LU_BUF_NULL); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, size_t size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, size_t size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len) +{ + if (buf->lb_buf == NULL && buf->lb_len == 0) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && (buf->lb_buf != NULL)) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. 
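A minimal sketch of typical lu_buf usage built from the helpers above, assuming the buffer starts out zeroed (for instance initialized from LU_BUF_NULL); example_fill_buf is a hypothetical caller.

static int example_fill_buf(struct lu_buf *buf, size_t need)
{
        // allocate on first use; reallocate (dropping old contents) if the
        // existing buffer is smaller than needed
        if (lu_buf_check_and_alloc(buf, need)->lb_buf == NULL)
                return -ENOMEM;

        // ... write up to 'need' bytes into buf->lb_buf ...

        // grow while preserving what was already written
        return lu_buf_check_and_grow(buf, 2 * need);
}

// the caller eventually releases the memory with lu_buf_free(buf)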
+ * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c new file mode 100644 index 0000000000000..e0a75791f1e6e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -0,0 +1,446 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#ifdef USE_LU_REF + +/** + * Asserts a condition for a given lu_ref. Must be called with + * lu_ref::lf_guard held. + */ +#define REFASSERT(ref, expr) do { \ + struct lu_ref *__tmp = (ref); \ + \ + if (unlikely(!(expr))) { \ + lu_ref_print(__tmp); \ + spin_unlock(&__tmp->lf_guard); \ + lu_ref_print_all(); \ + LASSERT(0); \ + spin_lock(&__tmp->lf_guard); \ + } \ +} while (0) + +static struct kmem_cache *lu_ref_link_kmem; + +static struct lu_kmem_descr lu_ref_caches[] = { + { + .ckd_cache = &lu_ref_link_kmem, + .ckd_name = "lu_ref_link_kmem", + .ckd_size = sizeof(struct lu_ref_link) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global list of active (initialized, but not finalized) lu_ref's. + * + * Protected by lu_ref_refs_guard. 
+ */ +static LIST_HEAD(lu_ref_refs); +static DEFINE_SPINLOCK(lu_ref_refs_guard); +static struct lu_ref lu_ref_marker = { + .lf_guard = __SPIN_LOCK_UNLOCKED(lu_ref_marker.lf_guard), + .lf_list = LIST_HEAD_INIT(lu_ref_marker.lf_list), + .lf_linkage = LIST_HEAD_INIT(lu_ref_marker.lf_linkage) +}; + +void lu_ref_print(const struct lu_ref *ref) +{ + struct lu_ref_link *link; + + CERROR("lu_ref: %p %d %d %s:%d\n", + ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); + list_for_each_entry(link, &ref->lf_list, ll_linkage) { + CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); + } +} + +static int lu_ref_is_marker(const struct lu_ref *ref) +{ + return ref == &lu_ref_marker; +} + +void lu_ref_print_all(void) +{ + struct lu_ref *ref; + + spin_lock(&lu_ref_refs_guard); + list_for_each_entry(ref, &lu_ref_refs, lf_linkage) { + if (lu_ref_is_marker(ref)) + continue; + + spin_lock(&ref->lf_guard); + lu_ref_print(ref); + spin_unlock(&ref->lf_guard); + } + spin_unlock(&lu_ref_refs_guard); +} + +void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line) +{ + ref->lf_refs = 0; + ref->lf_func = func; + ref->lf_line = line; + spin_lock_init(&ref->lf_guard); + INIT_LIST_HEAD(&ref->lf_list); + spin_lock(&lu_ref_refs_guard); + list_add(&ref->lf_linkage, &lu_ref_refs); + spin_unlock(&lu_ref_refs_guard); +} +EXPORT_SYMBOL(lu_ref_init_loc); + +void lu_ref_fini(struct lu_ref *ref) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, list_empty(&ref->lf_list)); + REFASSERT(ref, ref->lf_refs == 0); + spin_unlock(&ref->lf_guard); + spin_lock(&lu_ref_refs_guard); + list_del_init(&ref->lf_linkage); + spin_unlock(&lu_ref_refs_guard); +} +EXPORT_SYMBOL(lu_ref_fini); + +static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref, + int flags, + const char *scope, + const void *source) +{ + struct lu_ref_link *link; + + link = NULL; + if (lu_ref_link_kmem != NULL) { + OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); + if (link != NULL) { + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; + spin_lock(&ref->lf_guard); + list_add_tail(&link->ll_linkage, &ref->lf_list); + ref->lf_refs++; + spin_unlock(&ref->lf_guard); + } + } + + if (link == NULL) { + spin_lock(&ref->lf_guard); + ref->lf_failed++; + spin_unlock(&ref->lf_guard); + link = ERR_PTR(-ENOMEM); + } + + return link; +} + +void lu_ref_add(struct lu_ref *ref, const char *scope, const void *source) +{ + might_sleep(); + lu_ref_add_context(ref, GFP_NOFS, scope, source); +} +EXPORT_SYMBOL(lu_ref_add); + +void lu_ref_add_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; + spin_lock(&ref->lf_guard); + list_add_tail(&link->ll_linkage, &ref->lf_list); + ref->lf_refs++; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_add_at); + +/** + * Version of lu_ref_add() to be used in non-blockable contexts. + */ +void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, + const void *source) +{ + lu_ref_add_context(ref, GFP_ATOMIC, scope, source); +} +EXPORT_SYMBOL(lu_ref_add_atomic); + +static inline int lu_ref_link_eq(const struct lu_ref_link *link, + const char *scope, + const void *source) +{ + return link->ll_source == source && !strcmp(link->ll_scope, scope); +} + +/** + * Maximal chain length seen so far. + */ +static unsigned lu_ref_chain_max_length = 127; + +/** + * Searches for a lu_ref_link with given [scope, source] within given lu_ref. 
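A hedged usage sketch of the reference-tracking API in this file (effective only when USE_LU_REF is defined); struct example_tracked and the "example" scope string are illustrative. Each holder records itself with a scope string and a source pointer, which makes leaked references reportable via lu_ref_print().

struct example_tracked {
        struct lu_ref et_reference;     // set up with lu_ref_init(), torn down with lu_ref_fini()
};

static void example_take_and_drop(struct example_tracked *et, void *user)
{
        lu_ref_add(&et->et_reference, "example", user);
        // ... the reference is held and used here ...
        lu_ref_del(&et->et_reference, "example", user);
}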
+ */ +static struct lu_ref_link *lu_ref_find(struct lu_ref *ref, const char *scope, + const void *source) +{ + struct lu_ref_link *link; + unsigned int iterations; + + iterations = 0; + list_for_each_entry(link, &ref->lf_list, ll_linkage) { + ++iterations; + if (lu_ref_link_eq(link, scope, source)) { + if (iterations > lu_ref_chain_max_length) { + CWARN("Long lu_ref chain %d \"%s\":%p\n", + iterations, scope, source); + lu_ref_chain_max_length = iterations * 3 / 2; + } + return link; + } + } + return NULL; +} + +void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source) +{ + struct lu_ref_link *link; + + spin_lock(&ref->lf_guard); + link = lu_ref_find(ref, scope, source); + if (link != NULL) { + list_del(&link->ll_linkage); + ref->lf_refs--; + spin_unlock(&ref->lf_guard); + OBD_SLAB_FREE(link, lu_ref_link_kmem, sizeof(*link)); + } else { + REFASSERT(ref, ref->lf_failed > 0); + ref->lf_failed--; + spin_unlock(&ref->lf_guard); + } +} +EXPORT_SYMBOL(lu_ref_del); + +void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, + const void *source0, const void *source1) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, link != NULL && !IS_ERR(link)); + REFASSERT(ref, link->ll_ref == ref); + REFASSERT(ref, lu_ref_link_eq(link, scope, source0)); + link->ll_source = source1; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_set_at); + +void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, link != NULL && !IS_ERR(link)); + REFASSERT(ref, link->ll_ref == ref); + REFASSERT(ref, lu_ref_link_eq(link, scope, source)); + list_del(&link->ll_linkage); + ref->lf_refs--; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_del_at); + +#ifdef CONFIG_PROC_FS + +static void *lu_ref_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct lu_ref *ref = seq->private; + + spin_lock(&lu_ref_refs_guard); + if (list_empty(&ref->lf_linkage)) + ref = NULL; + spin_unlock(&lu_ref_refs_guard); + + return ref; +} + +static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) +{ + struct lu_ref *ref = p; + struct lu_ref *next; + + LASSERT(seq->private == p); + LASSERT(!list_empty(&ref->lf_linkage)); + + spin_lock(&lu_ref_refs_guard); + next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage); + if (&next->lf_linkage == &lu_ref_refs) { + p = NULL; + } else { + (*pos)++; + list_move(&ref->lf_linkage, &next->lf_linkage); + } + spin_unlock(&lu_ref_refs_guard); + return p; +} + +static void lu_ref_seq_stop(struct seq_file *seq, void *p) +{ + /* Nothing to do */ +} + + +static int lu_ref_seq_show(struct seq_file *seq, void *p) +{ + struct lu_ref *ref = p; + struct lu_ref *next; + + spin_lock(&lu_ref_refs_guard); + next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage); + if ((&next->lf_linkage == &lu_ref_refs) || lu_ref_is_marker(next)) { + spin_unlock(&lu_ref_refs_guard); + return 0; + } + + /* print the entry */ + spin_lock(&next->lf_guard); + seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", + next, next->lf_refs, next->lf_failed, + next->lf_func, next->lf_line); + if (next->lf_refs > 64) { + seq_puts(seq, " too many references, skip\n"); + } else { + struct lu_ref_link *link; + int i = 0; + + list_for_each_entry(link, &next->lf_list, ll_linkage) + seq_printf(seq, " #%d link: %s %p\n", + i++, link->ll_scope, link->ll_source); + } + spin_unlock(&next->lf_guard); + spin_unlock(&lu_ref_refs_guard); + + return 0; +} + +static struct seq_operations 
lu_ref_seq_ops = { + .start = lu_ref_seq_start, + .stop = lu_ref_seq_stop, + .next = lu_ref_seq_next, + .show = lu_ref_seq_show +}; + +static int lu_ref_seq_open(struct inode *inode, struct file *file) +{ + struct lu_ref *marker = &lu_ref_marker; + int result = 0; + + result = seq_open(file, &lu_ref_seq_ops); + if (result == 0) { + spin_lock(&lu_ref_refs_guard); + if (!list_empty(&marker->lf_linkage)) + result = -EAGAIN; + else + list_add(&marker->lf_linkage, &lu_ref_refs); + spin_unlock(&lu_ref_refs_guard); + + if (result == 0) { + struct seq_file *f = file->private_data; + + f->private = marker; + } else { + seq_release(inode, file); + } + } + + return result; +} + +static int lu_ref_seq_release(struct inode *inode, struct file *file) +{ + struct lu_ref *ref = ((struct seq_file *)file->private_data)->private; + + spin_lock(&lu_ref_refs_guard); + list_del_init(&ref->lf_linkage); + spin_unlock(&lu_ref_refs_guard); + + return seq_release(inode, file); +} + +static struct file_operations lu_ref_dump_fops = { + .owner = THIS_MODULE, + .open = lu_ref_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lu_ref_seq_release +}; + +#endif /* CONFIG_PROC_FS */ + +int lu_ref_global_init(void) +{ + int result; + + CDEBUG(D_CONSOLE, + "lu_ref tracking is enabled. Performance isn't.\n"); + + result = lu_kmem_init(lu_ref_caches); + +#ifdef CONFIG_PROC_FS + if (result == 0) { + result = lprocfs_seq_create(proc_lustre_root, "lu_refs", + 0444, &lu_ref_dump_fops, NULL); + if (result) + lu_kmem_fini(lu_ref_caches); + } +#endif /* CONFIG_PROC_FS */ + + return result; +} + +void lu_ref_global_fini(void) +{ +#ifdef CONFIG_PROC_FS + lprocfs_remove_proc_entry("lu_refs", proc_lustre_root); +#endif /* CONFIG_PROC_FS */ + lu_kmem_fini(lu_ref_caches); +} + +#endif /* USE_LU_REF */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c new file mode 100644 index 0000000000000..893a971e486c5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c @@ -0,0 +1,682 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_tgt_descs.c + * + * Lustre target descriptions + * These are the only exported functions, they provide some generic + * infrastructure for target description management used by LOD/LMV + * + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include /* hash_long() */ +#include +#include +#include +#include +#include +#include + +/** + * lu_prandom_u64_max - returns a pseudo-random u64 number in interval + * [0, ep_ro) + * + * \param[in] ep_ro right open interval endpoint + * + * \retval a pseudo-random 64-bit number that is in interval [0, ep_ro). + */ +u64 lu_prandom_u64_max(u64 ep_ro) +{ + u64 rand = 0; + + if (ep_ro) { +#if BITS_PER_LONG == 32 + /* + * If ep_ro > 32-bit, first generate the high + * 32 bits of the random number, then add in the low + * 32 bits (truncated to the upper limit, if needed) + */ + if (ep_ro > 0xffffffffULL) + rand = prandom_u32_max((u32)(ep_ro >> 32)) << 32; + + if (rand == (ep_ro & 0xffffffff00000000ULL)) + rand |= prandom_u32_max((u32)ep_ro); + else + rand |= prandom_u32(); +#else + rand = ((u64)prandom_u32() << 32 | prandom_u32()) % ep_ro; +#endif + } + + return rand; +} +EXPORT_SYMBOL(lu_prandom_u64_max); + +void lu_qos_rr_init(struct lu_qos_rr *lqr) +{ + spin_lock_init(&lqr->lqr_alloc); + set_bit(LQ_DIRTY, &lqr->lqr_flags); +} +EXPORT_SYMBOL(lu_qos_rr_init); + +/** + * Add a new target to Quality of Service (QoS) target table. + * + * Add a new MDT/OST target to the structure representing an OSS. Resort the + * list of known MDSs/OSSs by the number of MDTs/OSTs attached to each MDS/OSS. + * The MDS/OSS list is protected internally and no external locking is required. + * + * \param[in] qos lu_qos data + * \param[in] tgt target description + * + * \retval 0 on success + * \retval -ENOMEM on error + */ +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt) +{ + struct lu_svr_qos *svr = NULL; + struct lu_svr_qos *tempsvr; + struct obd_export *exp = tgt->ltd_exp; + int found = 0; + __u32 id = 0; + int rc = 0; + + ENTRY; + + down_write(&qos->lq_rw_sem); + /* + * a bit hacky approach to learn NID of corresponding connection + * but there is no official API to access information like this + * with OSD API. + */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (obd_uuid_equals(&svr->lsq_uuid, + &exp->exp_connection->c_remote_uuid)) { + found++; + break; + } + if (svr->lsq_id > id) + id = svr->lsq_id; + } + + if (!found) { + OBD_ALLOC_PTR(svr); + if (!svr) + GOTO(out, rc = -ENOMEM); + memcpy(&svr->lsq_uuid, &exp->exp_connection->c_remote_uuid, + sizeof(svr->lsq_uuid)); + ++id; + svr->lsq_id = id; + } else { + /* Assume we have to move this one */ + list_del(&svr->lsq_svr_list); + } + + svr->lsq_tgt_count++; + tgt->ltd_qos.ltq_svr = svr; + + CDEBUG(D_OTHER, "add tgt %s to server %s (%d targets)\n", + obd_uuid2str(&tgt->ltd_uuid), obd_uuid2str(&svr->lsq_uuid), + svr->lsq_tgt_count); + + /* + * Add sorted by # of tgts. Find the first entry that we're + * bigger than... + */ + list_for_each_entry(tempsvr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_tgt_count > tempsvr->lsq_tgt_count) + break; + } + /* + * ...and add before it. If we're the first or smallest, tempsvr + * points to the list head, and we add to the end. 
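+	 * The list thus stays sorted in descending order of lsq_tgt_count,
+	 * with the most heavily populated servers at the head.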
+ */ + list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list); + + set_bit(LQ_DIRTY, &qos->lq_flags); + set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags); +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} +EXPORT_SYMBOL(lu_qos_add_tgt); + +/** + * Remove MDT/OST target from QoS table. + * + * Removes given MDT/OST target from QoS table and releases related + * MDS/OSS structure if no target remain on the MDS/OSS. + * + * \param[in] qos lu_qos data + * \param[in] ltd target description + * + * \retval 0 on success + * \retval -ENOENT if no server was found + */ +static int lu_qos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd) +{ + struct lu_svr_qos *svr; + int rc = 0; + + ENTRY; + + down_write(&qos->lq_rw_sem); + svr = ltd->ltd_qos.ltq_svr; + if (!svr) + GOTO(out, rc = -ENOENT); + + svr->lsq_tgt_count--; + if (svr->lsq_tgt_count == 0) { + CDEBUG(D_OTHER, "removing server %s\n", + obd_uuid2str(&svr->lsq_uuid)); + list_del(&svr->lsq_svr_list); + ltd->ltd_qos.ltq_svr = NULL; + OBD_FREE_PTR(svr); + } + + set_bit(LQ_DIRTY, &qos->lq_flags); + set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags); +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} + +static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt) +{ + struct obd_statfs *statfs = &tgt->ltd_statfs; + + return statfs->os_bavail * statfs->os_bsize; +} + +static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt) +{ + return tgt->ltd_statfs.os_ffree; +} + +/** + * Calculate weight for a given tgt. + * + * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server + * penalties. See ltd_qos_penalties_calc() for how penalties are calculated. + * + * \param[in] tgt target descriptor + */ +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt) +{ + struct lu_tgt_qos *ltq = &tgt->ltd_qos; + __u64 penalty; + + ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) * + (tgt_statfs_iavail(tgt) >> 8); + penalty = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty; + if (ltq->ltq_avail < penalty) + ltq->ltq_weight = 0; + else + ltq->ltq_weight = ltq->ltq_avail - penalty; +} +EXPORT_SYMBOL(lu_tgt_qos_weight_calc); + +/** + * Allocate and initialize target table. + * + * A helper function to initialize the target table and allocate + * a bitmap of the available targets. + * + * \param[in] ltd target's table to initialize + * \param[in] is_mdt target table for MDTs + * + * \retval 0 on success + * \retval negative negated errno on error + **/ +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt) +{ + mutex_init(<d->ltd_mutex); + init_rwsem(<d->ltd_rw_sem); + + /* + * the tgt array and bitmap are allocated/grown dynamically as tgts are + * added to the LOD/LMV, see lu_tgt_descs_add() + */ + ltd->ltd_tgt_bitmap = CFS_ALLOCATE_BITMAP(BITS_PER_LONG); + if (!ltd->ltd_tgt_bitmap) + return -ENOMEM; + + ltd->ltd_tgts_size = BITS_PER_LONG; + ltd->ltd_death_row = 0; + ltd->ltd_refcount = 0; + + /* Set up allocation policy (QoS and RR) */ + INIT_LIST_HEAD(<d->ltd_qos.lq_svr_list); + init_rwsem(<d->ltd_qos.lq_rw_sem); + set_bit(LQ_DIRTY, <d->ltd_qos.lq_flags); + set_bit(LQ_RESET, <d->ltd_qos.lq_flags); + ltd->ltd_is_mdt = is_mdt; + + /* MDT imbalance threshold is low to balance across MDTs + * relatively quickly, because each directory may result + * in a large number of files/subdirs created therein. 
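+	 * Note: lq_prio_free and lq_threshold_rr below are kept on a 0-256
+	 * fixed-point scale, so the percentage defaults are converted with
+	 * "value * 256 / 100" and later QoS code can use shifts by 8 instead
+	 * of divisions by 100.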
+ */ + if (is_mdt) { + ltd->ltd_lmv_desc.ld_pattern = LMV_HASH_TYPE_DEFAULT; + ltd->ltd_qos.lq_prio_free = LMV_QOS_DEF_PRIO_FREE * 256 / 100; + ltd->ltd_qos.lq_threshold_rr = + LMV_QOS_DEF_THRESHOLD_RR_PCT * 256 / 100; + } else { + ltd->ltd_qos.lq_prio_free = LOV_QOS_DEF_PRIO_FREE * 256 / 100; + ltd->ltd_qos.lq_threshold_rr = + LOV_QOS_DEF_THRESHOLD_RR_PCT * 256 / 100; + } + + return 0; +} +EXPORT_SYMBOL(lu_tgt_descs_init); + +/** + * Free bitmap and target table pages. + * + * \param[in] ltd target table + */ +void lu_tgt_descs_fini(struct lu_tgt_descs *ltd) +{ + int i; + + CFS_FREE_BITMAP(ltd->ltd_tgt_bitmap); + for (i = 0; i < TGT_PTRS; i++) { + if (ltd->ltd_tgt_idx[i]) + OBD_FREE_PTR(ltd->ltd_tgt_idx[i]); + } + ltd->ltd_tgts_size = 0; +} +EXPORT_SYMBOL(lu_tgt_descs_fini); + +/** + * Expand size of target table. + * + * When the target table is full, we have to extend the table. To do so, + * we allocate new memory with some reserve, move data from the old table + * to the new one and release memory consumed by the old table. + * + * \param[in] ltd target table + * \param[in] newsize new size of the table + * + * \retval 0 on success + * \retval -ENOMEM if reallocation failed + */ +static int lu_tgt_descs_resize(struct lu_tgt_descs *ltd, __u32 newsize) +{ + struct cfs_bitmap *new_bitmap, *old_bitmap = NULL; + + /* someone else has already resize the array */ + if (newsize <= ltd->ltd_tgts_size) + return 0; + + new_bitmap = CFS_ALLOCATE_BITMAP(newsize); + if (!new_bitmap) + return -ENOMEM; + + if (ltd->ltd_tgts_size > 0) { + /* the bitmap already exists, copy data from old one */ + cfs_bitmap_copy(new_bitmap, ltd->ltd_tgt_bitmap); + old_bitmap = ltd->ltd_tgt_bitmap; + } + + ltd->ltd_tgts_size = newsize; + ltd->ltd_tgt_bitmap = new_bitmap; + + if (old_bitmap) + CFS_FREE_BITMAP(old_bitmap); + + CDEBUG(D_CONFIG, "tgt size: %d\n", ltd->ltd_tgts_size); + + return 0; +} + +/** + * Add new target to target table. + * + * Extend target table if it's full, update target table and bitmap. + * Notice we need to take ltd_rw_sem exclusively before entry to ensure + * atomic switch. 
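+ *
+ * If the index does not fit in the current table, the table and its bitmap
+ * are first grown to the next power of two covering the index, and the
+ * per-block pointer array entry is allocated on demand before the target
+ * is published and its bitmap bit set.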
+ * + * \param[in] ltd target table + * \param[in] tgt new target desc + * + * \retval 0 on success + * \retval -ENOMEM if reallocation failed + * -EEXIST if target existed + */ +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) +{ + __u32 index = tgt->ltd_index; + int rc; + + ENTRY; + + if (index >= ltd->ltd_tgts_size) { + __u32 newsize = 1; + + while (newsize < index + 1) + newsize = newsize << 1; + + rc = lu_tgt_descs_resize(ltd, newsize); + if (rc) + RETURN(rc); + } else if (cfs_bitmap_check(ltd->ltd_tgt_bitmap, index)) { + RETURN(-EEXIST); + } + + if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) { + OBD_ALLOC_PTR(ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK]); + if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) + RETURN(-ENOMEM); + } + + LTD_TGT(ltd, tgt->ltd_index) = tgt; + cfs_bitmap_set(ltd->ltd_tgt_bitmap, tgt->ltd_index); + + ltd->ltd_lov_desc.ld_tgt_count++; + if (tgt->ltd_active) + ltd->ltd_lov_desc.ld_active_tgt_count++; + + RETURN(0); +} +EXPORT_SYMBOL(ltd_add_tgt); + +/** + * Delete target from target table + */ +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) +{ + lu_qos_del_tgt(<d->ltd_qos, tgt); + LTD_TGT(ltd, tgt->ltd_index) = NULL; + cfs_bitmap_clear(ltd->ltd_tgt_bitmap, tgt->ltd_index); + ltd->ltd_lov_desc.ld_tgt_count--; + if (tgt->ltd_active) + ltd->ltd_lov_desc.ld_active_tgt_count--; +} +EXPORT_SYMBOL(ltd_del_tgt); + +/** + * Calculate penalties per-tgt and per-server + * + * Re-calculate penalties when the configuration changes, active targets + * change and after statfs refresh (all these are reflected by lq_dirty flag). + * On every tgt and server: decay the penalty by half for every 8x the update + * interval that the device has been idle. That gives lots of time for the + * statfs information to be updated (which the penalty is only a proxy for), + * and avoids penalizing server/tgt under light load. + * See lu_qos_tgt_weight_calc() for how penalties are factored into the weight. + * + * \param[in] ltd lu_tgt_descs + * + * \retval 0 on success + * \retval -EAGAIN the number of tgt isn't enough or all tgt spaces are + * almost the same + */ +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd) +{ + struct lu_qos *qos = <d->ltd_qos; + struct lov_desc *desc = <d->ltd_lov_desc; + struct lu_tgt_desc *tgt; + struct lu_svr_qos *svr; + __u64 ba_max, ba_min, ba; + __u64 ia_max, ia_min, ia = 1; + __u32 num_active; + int prio_wide; + time64_t now, age; + int rc; + + ENTRY; + + if (!test_bit(LQ_DIRTY, &qos->lq_flags)) + GOTO(out, rc = 0); + + num_active = desc->ld_active_tgt_count - 1; + if (num_active < 1) + GOTO(out, rc = -EAGAIN); + + /* find bavail on each server */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + svr->lsq_bavail = 0; + /* if inode is not counted, set to 1 to ignore */ + svr->lsq_iavail = ltd->ltd_is_mdt ? 0 : 1; + } + qos->lq_active_svr_count = 0; + + /* + * How badly user wants to select targets "widely" (not recently chosen + * and not on recent MDS's). As opposed to "freely" (free space avail.) 
+ * 0-256 + */ + prio_wide = 256 - qos->lq_prio_free; + + ba_min = (__u64)(-1); + ba_max = 0; + ia_min = (__u64)(-1); + ia_max = 0; + now = ktime_get_real_seconds(); + + /* Calculate server penalty per object */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + /* when inode is counted, bavail >> 16 to avoid overflow */ + ba = tgt_statfs_bavail(tgt); + if (ltd->ltd_is_mdt) + ba >>= 16; + else + ba >>= 8; + if (!ba) + continue; + + ba_min = min(ba, ba_min); + ba_max = max(ba, ba_max); + + /* Count the number of usable servers */ + if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0) + qos->lq_active_svr_count++; + tgt->ltd_qos.ltq_svr->lsq_bavail += ba; + + if (ltd->ltd_is_mdt) { + /* iavail >> 8 to avoid overflow */ + ia = tgt_statfs_iavail(tgt) >> 8; + if (!ia) + continue; + + ia_min = min(ia, ia_min); + ia_max = max(ia, ia_max); + + tgt->ltd_qos.ltq_svr->lsq_iavail += ia; + } + + /* + * per-tgt penalty is + * prio * bavail * iavail / (num_tgt - 1) / 2 + */ + tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8; + do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active); + tgt->ltd_qos.ltq_penalty_per_obj >>= 1; + + age = (now - tgt->ltd_qos.ltq_used) >> 3; + if (test_bit(LQ_RESET, &qos->lq_flags) || + age > 32 * desc->ld_qos_maxage) + tgt->ltd_qos.ltq_penalty = 0; + else if (age > desc->ld_qos_maxage) + /* Decay tgt penalty. */ + tgt->ltd_qos.ltq_penalty >>= age / desc->ld_qos_maxage; + } + + num_active = qos->lq_active_svr_count - 1; + if (num_active < 1) { + /* + * If there's only 1 server, we can't penalize it, so instead + * we have to double the tgt penalty + */ + num_active = 1; + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + tgt->ltd_qos.ltq_penalty_per_obj <<= 1; + } + } + + /* + * Per-server penalty is + * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2 + */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + ba = svr->lsq_bavail; + ia = svr->lsq_iavail; + svr->lsq_penalty_per_obj = prio_wide * ba * ia >> 8; + do_div(svr->lsq_penalty_per_obj, + svr->lsq_tgt_count * num_active); + svr->lsq_penalty_per_obj >>= 1; + + age = (now - svr->lsq_used) >> 3; + if (test_bit(LQ_RESET, &qos->lq_flags) || + age > 32 * desc->ld_qos_maxage) + svr->lsq_penalty = 0; + else if (age > desc->ld_qos_maxage) + /* Decay server penalty. */ + svr->lsq_penalty >>= age / desc->ld_qos_maxage; + } + + clear_bit(LQ_DIRTY, &qos->lq_flags); + clear_bit(LQ_RESET, &qos->lq_flags); + + /* + * If each tgt has almost same free space, do rr allocation for better + * creation performance + */ + clear_bit(LQ_SAME_SPACE, &qos->lq_flags); + if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min && + (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) { + set_bit(LQ_SAME_SPACE, &qos->lq_flags); + /* Reset weights for the next time we enter qos mode */ + set_bit(LQ_RESET, &qos->lq_flags); + } + rc = 0; + +out: + if (!rc && test_bit(LQ_SAME_SPACE, &qos->lq_flags)) + RETURN(-EAGAIN); + + RETURN(rc); +} +EXPORT_SYMBOL(ltd_qos_penalties_calc); + +/** + * Re-calculate penalties and weights of all tgts. + * + * The function is called when some target was used for a new object. In + * this case we should re-calculate all the weights to keep new allocations + * balanced well. 
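+ *
+ * In outline: the just-used tgt and its server have their accumulated
+ * penalties halved and then charged the full per-object penalty times the
+ * active tgt/server count, every server and tgt (including the one just
+ * used) then has one penalty_per_obj subtracted, and
+ * lu_tgt_qos_weight_calc() is re-run for each active tgt so \a total_wt
+ * reflects the new usable weights.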
+ * + * \param[in] ltd lu_tgt_descs + * \param[in] tgt recently used tgt + * \param[out] total_wt new total weight for the pool + * + * \retval 0 + */ +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt) +{ + struct lu_qos *qos = <d->ltd_qos; + struct lu_tgt_qos *ltq; + struct lu_svr_qos *svr; + + ENTRY; + + ltq = &tgt->ltd_qos; + LASSERT(ltq); + + /* Don't allocate on this device anymore, until the next alloc_qos */ + ltq->ltq_usable = 0; + + svr = ltq->ltq_svr; + + /* + * Decay old penalty by half (we're adding max penalty, and don't + * want it to run away.) + */ + ltq->ltq_penalty >>= 1; + svr->lsq_penalty >>= 1; + + /* mark the server and tgt as recently used */ + ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds(); + + /* Set max penalties for this tgt and server */ + ltq->ltq_penalty += ltq->ltq_penalty_per_obj * + ltd->ltd_lov_desc.ld_active_tgt_count; + svr->lsq_penalty += svr->lsq_penalty_per_obj * + qos->lq_active_svr_count; + + /* Decrease all MDS penalties */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_penalty < svr->lsq_penalty_per_obj) + svr->lsq_penalty = 0; + else + svr->lsq_penalty -= svr->lsq_penalty_per_obj; + } + + *total_wt = 0; + /* Decrease all tgt penalties */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + ltq = &tgt->ltd_qos; + if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj) + ltq->ltq_penalty = 0; + else + ltq->ltq_penalty -= ltq->ltq_penalty_per_obj; + + lu_tgt_qos_weight_calc(tgt); + + /* Recalc the total weight of usable osts */ + if (ltq->ltq_usable) + *total_wt += ltq->ltq_weight; + + CDEBUG(D_OTHER, "recalc tgt %d usable=%d bavail=%llu ffree=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n", + tgt->ltd_index, ltq->ltq_usable, + tgt_statfs_bavail(tgt) >> 16, + tgt_statfs_iavail(tgt) >> 8, + ltq->ltq_penalty_per_obj >> 10, + ltq->ltq_penalty >> 10, + ltq->ltq_svr->lsq_penalty_per_obj >> 10, + ltq->ltq_svr->lsq_penalty >> 10, + ltq->ltq_weight >> 10); + } + + RETURN(0); +} +EXPORT_SYMBOL(ltd_qos_update); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c new file mode 100644 index 0000000000000..44a69f730e1cb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ucred.c + * + * Lustre user credentials context infrastructure. 
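+ *
+ * The credentials are kept in a per-session lu_context_key (lu_ucred_key,
+ * tagged LCT_SERVER_SESSION): lu_ucred() is a lu_context_key_get() on
+ * env->le_ses, lu_ucred_check() additionally requires uc_valid to be
+ * UCRED_OLD or UCRED_NEW, and lu_ucred_assert() LASSERTs that a valid
+ * ucred is present.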
+ * + * Author: Nikita Danilov + * Author: Fan Yong + * Author: Vitaly Fertman + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */ +LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred); + +static struct lu_context_key lu_ucred_key = { + .lct_tags = LCT_SERVER_SESSION, + .lct_init = lu_ucred_key_init, + .lct_fini = lu_ucred_key_fini +}; + +/** + * Get ucred key if session exists and ucred key is allocated on it. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred(const struct lu_env *env) +{ + if (!env->le_ses) + return NULL; + return lu_context_key_get(env->le_ses, &lu_ucred_key); +} +EXPORT_SYMBOL(lu_ucred); + +/** + * Get ucred key and check if it is properly initialized. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred_check(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred(env); + if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW) + return NULL; + return uc; +} +EXPORT_SYMBOL(lu_ucred_check); + +/** + * Get ucred key, which must exist and must be properly initialized. + * Assert otherwise. + */ +struct lu_ucred *lu_ucred_assert(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred_check(env); + LASSERT(uc != NULL); + return uc; +} +EXPORT_SYMBOL(lu_ucred_assert); + +int lu_ucred_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&lu_ucred_key); + return lu_context_key_register(&lu_ucred_key); +} + +void lu_ucred_global_fini(void) +{ + lu_context_key_degister(&lu_ucred_key); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c new file mode 100644 index 0000000000000..4161b2dabfd72 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c @@ -0,0 +1,266 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static DEFINE_SPINLOCK(handle_base_lock); + +static struct handle_bucket { + spinlock_t lock; + struct list_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. 
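+ *
+ * Cookies are handed out by bumping the global handle_base by HANDLE_INCR
+ * under handle_base_lock (handle_base itself is seeded with random bytes
+ * in class_handle_init(), and a zero cookie is skipped since 0 means
+ * "unassigned" elsewhere).  The bucket is picked with
+ * (h_cookie & HANDLE_HASH_MASK) and the handle is added with
+ * list_add_rcu() so class_handle2object() can look it up under
+ * rcu_read_lock().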
+ */ +void class_handle_hash(struct portals_handle *h, + struct portals_handle_ops *ops) +{ + struct handle_bucket *bucket; + + ENTRY; + + LASSERT(h != NULL); + LASSERT(list_empty(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. + */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_ops = ops; + spin_lock_init(&h->h_lock); + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle %#llx to hash\n", + h, h->h_cookie); + EXIT; +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (list_empty(&h->h_link)) { + CERROR("removing an already-removed handle (%#llx)\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", + h, h->h_cookie); + + spin_lock(&h->h_lock); + if (h->h_in == 0) { + spin_unlock(&h->h_lock); + return; + } + h->h_in = 0; + spin_unlock(&h->h_lock); + list_del_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void class_handle_hash_back(struct portals_handle *h) +{ + struct handle_bucket *bucket; + ENTRY; + + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + EXIT; +} +EXPORT_SYMBOL(class_handle_hash_back); + +void *class_handle2object(__u64 cookie, const void *owner) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + + ENTRY; + + LASSERT(handle_hash != NULL); + + /* + * Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong + */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie || h->h_owner != owner) + continue; + + spin_lock(&h->h_lock); + if (likely(h->h_in != 0)) { + h->h_ops->hop_addref(h); + retval = h; + } + spin_unlock(&h->h_lock); + break; + } + rcu_read_unlock(); + + RETURN(retval); +} +EXPORT_SYMBOL(class_handle2object); + +void class_handle_free_cb(struct rcu_head *rcu) +{ + struct portals_handle *h; + void *ptr; + + h = container_of(rcu, struct portals_handle, h_rcu); + ptr = (void *)(unsigned long)h->h_cookie; + + if (h->h_ops->hop_free != NULL) + h->h_ops->hop_free(ptr, h->h_size); + else + OBD_FREE(ptr, h->h_size); +} +EXPORT_SYMBOL(class_handle_free_cb); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + struct timespec64 ts; + int seed[2]; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_LIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + /** bug 21430: add randomness to the initial base */ + cfs_get_random_bytes(seed, sizeof(seed)); + ktime_get_ts64(&ts); + cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); + + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) { + CERROR("force clean handle %#llx addr %p ops %p\n", + h->h_cookie, h, h->h_ops); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c new file mode 100644 index 0000000000000..535d78eac5578 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c @@ -0,0 +1,203 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include + +#define NIDS_MAX 32 + +struct uuid_nid_data { + struct list_head un_list; + struct obd_uuid un_uuid; + int un_nid_count; + lnet_nid_t un_nids[NIDS_MAX]; +}; + +/* FIXME: This should probably become more elegant than a global linked list */ +static LIST_HEAD(g_uuid_list); +static DEFINE_SPINLOCK(g_uuid_lock); + +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index) +{ + struct uuid_nid_data *data; + struct obd_uuid tmp; + int rc = -ENOENT; + + obd_str2uuid(&tmp, uuid); + spin_lock(&g_uuid_lock); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + if (index >= data->un_nid_count) + break; + + rc = 0; + *peer_nid = data->un_nids[index]; + break; + } + } + spin_unlock(&g_uuid_lock); + return rc; +} +EXPORT_SYMBOL(lustre_uuid_to_peer); + +/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; + LNET will choose the best one. */ +int class_add_uuid(const char *uuid, __u64 nid) +{ + struct uuid_nid_data *data, *entry; + int found = 0; + + LASSERT(nid != 0); /* valid newconfig NID is never zero */ + + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; + + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; + + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) + list_add(&data->un_list, &g_uuid_list); + spin_unlock(&g_uuid_lock); + + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + } + return 0; +} + +/* Delete the nids for one uuid if specified, otherwise delete all */ +int class_del_uuid(const char *uuid) +{ + struct uuid_nid_data *data; + struct list_head deathrow; + + INIT_LIST_HEAD(&deathrow); + + spin_lock(&g_uuid_lock); + if (uuid != NULL) { + struct obd_uuid tmp; + + obd_str2uuid(&tmp, uuid); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + list_move(&data->un_list, &deathrow); + break; + } + } + } else + list_splice_init(&g_uuid_list, &deathrow); + spin_unlock(&g_uuid_lock); + + if (uuid != NULL && list_empty(&deathrow)) { + CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid); + return -EINVAL; + } + + while (!list_empty(&deathrow)) { + data = list_entry(deathrow.next, struct uuid_nid_data, + un_list); + list_del(&data->un_list); + + CDEBUG(D_INFO, "del uuid %s %s/%d\n", + obd_uuid2str(&data->un_uuid), + libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + return 0; +} + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 0; + + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + 
spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + RETURN(found); +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c new file mode 100644 index 0000000000000..d0ca4f17b1cb3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c @@ -0,0 +1,199 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + * Use is subject to license terms. + * + * Author: Johann Lombardi + */ +#include +#include +#include +#include + +/** + * Initialize new \a lma. Only fid is stored. + * + * \param lma - is the new LMA structure to be initialized + * \param fid - is the FID of the object this LMA belongs to + * \param incompat - features that MDS must understand to access object + */ +void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid, + __u32 compat, __u32 incompat) +{ + lma->lma_compat = compat; + lma->lma_incompat = incompat; + lma->lma_self_fid = *fid; + + /* If a field is added in struct lustre_mdt_attrs, zero it explicitly + * and change the test below. */ + CLASSERT(sizeof(*lma) == + (offsetof(struct lustre_mdt_attrs, lma_self_fid) + + sizeof(lma->lma_self_fid))); +} +EXPORT_SYMBOL(lustre_lma_init); + +/** + * Swab, if needed, LMA structure which is stored on-disk in little-endian order. + * + * \param lma - is a pointer to the LMA structure to be swabbed. + */ +void lustre_lma_swab(struct lustre_mdt_attrs *lma) +{ +#ifdef __BIG_ENDIAN + __swab32s(&lma->lma_compat); + __swab32s(&lma->lma_incompat); + lustre_swab_lu_fid(&lma->lma_self_fid); +#endif +} +EXPORT_SYMBOL(lustre_lma_swab); + +void lustre_loa_init(struct lustre_ost_attrs *loa, const struct lu_fid *fid, + __u32 compat, __u32 incompat) +{ + CLASSERT(sizeof(*loa) == LMA_OLD_SIZE); + + memset(&loa->loa_parent_fid, 0, + sizeof(*loa) - offsetof(typeof(*loa), loa_parent_fid)); + lustre_lma_init(&loa->loa_lma, fid, compat, incompat); +} +EXPORT_SYMBOL(lustre_loa_init); + +/** + * Swab, if needed, LOA (for OST-object only) structure with LMA EA and PFID EA + * combined together are stored on-disk in little-endian order. + * + * \param[in] loa - the pointer to the LOA structure to be swabbed. + * \param[in] to_cpu - to indicate swab for CPU order or not. 
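+ *
+ * Note: on big-endian hosts the LMA part is swabbed first and lma_compat
+ * must be read in CPU byte order to decide whether the LMAC_STRIPE_INFO /
+ * LMAC_COMP_INFO fields are present; that is why the compat value is
+ * captured before the swab when converting to disk order and re-read
+ * after it when \a to_cpu is set.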
+ */ +void lustre_loa_swab(struct lustre_ost_attrs *loa, bool to_cpu) +{ + struct lustre_mdt_attrs *lma = &loa->loa_lma; +#ifdef __BIG_ENDIAN + __u32 compat = lma->lma_compat; +#endif + + lustre_lma_swab(lma); +#ifdef __BIG_ENDIAN + if (to_cpu) + compat = lma->lma_compat; + + if (compat & LMAC_STRIPE_INFO) { + lustre_swab_lu_fid(&loa->loa_parent_fid); + __swab32s(&loa->loa_stripe_size); + } + if (compat & LMAC_COMP_INFO) { + __swab32s(&loa->loa_comp_id); + __swab64s(&loa->loa_comp_start); + __swab64s(&loa->loa_comp_end); + } +#endif +} +EXPORT_SYMBOL(lustre_loa_swab); + +/** + * Swab, if needed, SOM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the SOM structure to be swabbed. + */ +void lustre_som_swab(struct lustre_som_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab16s(&attrs->lsa_valid); + __swab64s(&attrs->lsa_size); + __swab64s(&attrs->lsa_blocks); +#endif +} +EXPORT_SYMBOL(lustre_som_swab); + +/** + * Swab, if needed, HSM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the HSM structure to be swabbed. + */ +void lustre_hsm_swab(struct hsm_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab32s(&attrs->hsm_compat); + __swab32s(&attrs->hsm_flags); + __swab64s(&attrs->hsm_arch_id); + __swab64s(&attrs->hsm_arch_ver); +#endif +} + +/* + * Swab and extract HSM attributes from on-disk xattr. + * + * \param buf - is a buffer containing the on-disk HSM extended attribute. + * \param rc - is the HSM xattr stored in \a buf + * \param mh - is the md_hsm structure where to extract HSM attributes. + */ +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no HSM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* unpack HSM attributes */ + lustre_hsm_swab(attrs); + + /* fill md_hsm structure */ + mh->mh_compat = attrs->hsm_compat; + mh->mh_flags = attrs->hsm_flags; + mh->mh_arch_id = attrs->hsm_arch_id; + mh->mh_arch_ver = attrs->hsm_arch_ver; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2hsm); + +/* + * Pack HSM attributes. + * + * \param buf - is the output buffer where to pack the on-disk HSM xattr. + * \param mh - is the md_hsm structure to pack. + */ +void lustre_hsm2buf(void *buf, const struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + /* copy HSM attributes */ + attrs->hsm_compat = mh->mh_compat; + attrs->hsm_flags = mh->mh_flags; + attrs->hsm_arch_id = mh->mh_arch_id; + attrs->hsm_arch_ver = mh->mh_arch_ver; + + /* pack xattr */ + lustre_hsm_swab(attrs); +} +EXPORT_SYMBOL(lustre_hsm2buf); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c new file mode 100644 index 0000000000000..16e6f12f8a05c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c @@ -0,0 +1,149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. + * + * Checksum functions + */ +#include +#include + +/* Server uses algos that perform at 50% or better of the Adler */ +enum cksum_types obd_cksum_types_supported_server(const char *obd_name) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + int base_speed; + + CDEBUG(D_INFO, "%s: checksum speed: crc %d, crc32c %d, adler %d, " + "t10ip512 %d, t10ip4k %d, t10crc512 %d, t10crc4k %d\n", + obd_name, + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K)); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512) >= base_speed) + ret |= OBD_CKSUM_T10IP512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K) >= base_speed) + ret |= OBD_CKSUM_T10IP4K; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512) >= base_speed) + ret |= OBD_CKSUM_T10CRC512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K) >= base_speed) + ret |= OBD_CKSUM_T10CRC4K; + + return ret; +} +EXPORT_SYMBOL(obd_cksum_types_supported_server); + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. 
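+ *
+ * "Best" here means the algorithm with the highest measured speed
+ * (cfs_crypto_hash_speed() / obd_t10_cksum_speed()) among the types set in
+ * cksum_type, with ADLER as the starting fallback.  For example, if ADLER
+ * and CRC32C are both offered and CRC32C benchmarks faster, the packed
+ * flag is OBD_FL_CKSUM_CRC32C.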
*/ +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + + if (cksum_type & OBD_CKSUM_T10IP512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP512; + } + } + + if (cksum_type & OBD_CKSUM_T10IP4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP4K; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC512; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC4K; + } + } + + if (unlikely(cksum_type && !(cksum_type & OBD_CKSUM_ALL))) + CWARN("%s: unknown cksum type %x\n", obd_name, cksum_type); + + return flag; +} +EXPORT_SYMBOL(obd_cksum_type_pack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c new file mode 100644 index 0000000000000..a5b5dcfe572fe --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -0,0 +1,2295 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "llog_internal.h" + +static struct cfs_hash_ops uuid_hash_ops; +static struct cfs_hash_ops nid_hash_ops; +static struct cfs_hash_ops nid_stat_hash_ops; +static struct cfs_hash_ops gen_hash_ops; + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + if ((ptr = strstr(buf, key)) == NULL) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. + * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (param == NULL || ptr == NULL) + RETURN(NULL); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + while (ptr->old_param != NULL) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + RETURN(ptr); + ptr++; + } + + RETURN(NULL); +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. + * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. + * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (q1 == NULL) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (q2 == NULL) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* returns 0 if this is the first key in the buffer, else 1. + valp points to first char after key. 
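+
+   A hypothetical example: with buf = "param=foo", class_match_param(buf,
+   "param=", &val) returns 0 and sets val to point at "foo", whereas
+   class_find_param() above would match the key anywhere in the buffer.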
*/ +int class_match_param(char *buf, const char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* 0 is good nid, + 1 not found + < 0 error + endh is set to next separator */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* nid separators or end of nids */ + endp = strpbrk(buf, ",: /"); + if (endp == NULL) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0); +} + +/* 1 param contains key and match + * 0 param contains key and not match + * -1 param does not contain key + */ +int class_match_nid(char *buf, char *key, lnet_nid_t nid) +{ + lnet_nid_t tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified nids */ + while (class_parse_nid(buf, &tmp, &buf) == 0) { + if (tmp == nid) + return 1; + } + rc = 0; + } + return rc; +} + +int class_match_net(char *buf, char *key, __u32 net) +{ + __u32 tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified networks */ + while (class_parse_net(buf, &tmp, &buf) == 0) { + if (tmp == net) + return 1; + } + rc = 0; + } + return rc; +} + +char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index) +{ + char *s; + + if (!lcfg->lcfg_buflens[index]) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (!s) + return NULL; + + /* + * make sure it's NULL terminated, even if this kills a char + * of data. Try to use the padding first though. 
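+	 * (Buffer lengths in a packed lustre_cfg are rounded up to 8 bytes,
+	 * so the terminating NUL can normally be placed in the padding
+	 * without losing data; only when the stored string completely fills
+	 * its aligned slot is its last character overwritten, and the CWARN
+	 * below reports any non-NUL character that was clobbered.)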
+ */ + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + size_t last = ALIGN(lcfg->lcfg_buflens[index], 8) - 1; + char lost; + + /* Use the smaller value */ + if (last > lcfg->lcfg_buflens[index]) + last = lcfg->lcfg_buflens[index]; + + lost = s[last]; + s[last] = '\0'; + if (lost != '\0') { + CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", + index, s, lost); + } + } + return s; +} +EXPORT_SYMBOL(lustre_cfg_string); + +/********************** class fns **********************/ + +/** + * Create a new obd device and set the type, name and uuid. If successful, + * the new device can be accessed by either name or uuid. + */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_export *exp; + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + ENTRY; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + RETURN(-EINVAL); + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + RETURN(-EINVAL); + } + name = lustre_cfg_string(lcfg, 0); + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + RETURN(-EINVAL); + } + + uuid = lustre_cfg_string(lcfg, 2); + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("%s: uuid must be < %d bytes long\n", + name, (int)sizeof(obd->obd_uuid)); + RETURN(-EINVAL); + } + + obd = class_newdev(typename, name, uuid); + if (IS_ERR(obd)) { /* Already exists or out of obds */ + rc = PTR_ERR(obd); + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + RETURN(rc); + } + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + exp = class_new_export_self(obd, &obd->obd_uuid); + if (IS_ERR(exp)) { + rc = PTR_ERR(exp); + class_free_dev(obd); + RETURN(rc); + } + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + rc = class_register_device(obd); + if (rc != 0) { + class_decref(obd, "newdev", obd); + RETURN(rc); + } + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + + RETURN(0); +} +EXPORT_SYMBOL(class_attach); + +/** Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. + */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + ENTRY; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + /* just leave this on forever. 
I can't use obd_set_up here because + other fns check that status, and we're not actually set up yet. */ + obd->obd_starting = 1; + obd->obd_uuid_hash = NULL; + obd->obd_nid_hash = NULL; + obd->obd_nid_stats_hash = NULL; + obd->obd_gen_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + obd->obd_uuid_hash = cfs_hash_create("UUID_HASH", + HASH_UUID_CUR_BITS, + HASH_UUID_MAX_BITS, + HASH_UUID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &uuid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_uuid_hash) + GOTO(err_exit, err = -ENOMEM); + + /* create a nid-export lustre hash */ + obd->obd_nid_hash = cfs_hash_create("NID_HASH", + HASH_NID_CUR_BITS, + HASH_NID_MAX_BITS, + HASH_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_hash) + GOTO(err_exit, err = -ENOMEM); + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) + GOTO(err_exit, err = -ENOMEM); + + /* create a client_generation-export lustre hash */ + obd->obd_gen_hash = cfs_hash_create("UUID_HASH", + HASH_GEN_CUR_BITS, + HASH_GEN_MAX_BITS, + HASH_GEN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &gen_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_gen_hash) + GOTO(err_exit, err = -ENOMEM); + + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exit, err); + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + RETURN(0); +err_exit: + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + if (obd->obd_gen_hash) { + cfs_hash_putref(obd->obd_gen_hash); + obd->obd_gen_hash = NULL; + } + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** We have finished using this obd and are ready to destroy it. + * There can be no more references to this obd. + */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + ENTRY; + + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + RETURN(-EBUSY); + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + /* cleanup in progress. we don't like to find this device after now */ + class_unregister_device(obd); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "newdev", obd); + + RETURN(0); +} +EXPORT_SYMBOL(class_detach); + +/** Start shutting down the obd. There may be in-progess ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). 
+ */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + ENTRY; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + RETURN(-ENODEV); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + RETURN(-ENODEV); + } + /* Leave this on forever */ + obd->obd_stopping = 1; + /* function can't return error after that point, so clear setup flag + * as early as possible to avoid finding via obd_devs / hash */ + obd->obd_set_up = 0; + spin_unlock(&obd->obd_dev_lock); + + /* wait for already-arrived-connections to finish. */ + while (obd->obd_conn_inprogress > 0) + yield(); + smp_rmb(); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + obd->obd_fail = 1; + obd->obd_no_transno = 1; + obd->obd_no_recov = 1; + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n", + obd->obd_name, obd->obd_num_exports, + atomic_read(&obd->obd_refcount) - 2); + dump_exports(obd, 0, D_HA); + class_disconnect_exports(obd); + + /* Precleanup, we must make sure all exports get destroyed. */ + err = obd_precleanup(obd); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + + /* destroy a nid-export hash body */ + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + /* destroy a client_generation-export hash body */ + if (obd->obd_gen_hash) { + cfs_hash_putref(obd->obd_gen_hash); + obd->obd_gen_hash = NULL; + } + + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + RETURN(0); +} + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount)); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int last; + + CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); + + LASSERT(obd->obd_num_exports >= 0); + last = atomic_dec_and_test(&obd->obd_refcount); + lu_ref_del(&obd->obd_reference, scope, source); + + if (last) { + struct obd_export *exp; + + LASSERT(!obd->obd_attached); + /* All exports have been destroyed; there should + * be no more in-progress ops by this point.*/ + exp = obd->obd_self_export; + + if (exp) { + exp->exp_flags |= exp_flags_from_obd(obd); + class_unlink_export(exp); + } + } +} +EXPORT_SYMBOL(class_decref); + +/** Add a failover nid location. + * Client obd types contact server obd types using this nid list. 
+ */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + RETURN(rc); +} + +/** Remove a failover nid location. + */ +static int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + RETURN(rc); +} + +static LIST_HEAD(lustre_profile_list); +static DEFINE_SPINLOCK(lustre_profile_list_lock); + +struct lustre_profile *class_get_profile(const char *prof) +{ + struct lustre_profile *lprof; + + ENTRY; + spin_lock(&lustre_profile_list_lock); + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (!strcmp(lprof->lp_profile, prof)) { + lprof->lp_refs++; + spin_unlock(&lustre_profile_list_lock); + RETURN(lprof); + } + } + spin_unlock(&lustre_profile_list_lock); + RETURN(NULL); +} +EXPORT_SYMBOL(class_get_profile); + +/** Create a named "profile". + * This defines the mdc and osc names to use for a client. + * This also is used to define the lov to be used by a mdt. 
+ */ +static int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + ENTRY; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (lprof == NULL) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (lprof->lp_profile == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (lprof->lp_dt == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (lprof->lp_md == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_md, mdc, mdclen); + } + + spin_lock(&lustre_profile_list_lock); + lprof->lp_refs = 1; + lprof->lp_list_deleted = false; + + list_add(&lprof->lp_list, &lustre_profile_list); + spin_unlock(&lustre_profile_list_lock); + RETURN(err); + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + RETURN(err); +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + ENTRY; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + lprof = class_get_profile(prof); + if (lprof) { + spin_lock(&lustre_profile_list_lock); + /* because get profile increments the ref counter */ + lprof->lp_refs--; + list_del(&lprof->lp_list); + lprof->lp_list_deleted = true; + spin_unlock(&lustre_profile_list_lock); + + class_put_profile(lprof); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profile); + +void class_put_profile(struct lustre_profile *lprof) +{ + spin_lock(&lustre_profile_list_lock); + if ((--lprof->lp_refs) > 0) { + LASSERT(lprof->lp_refs > 0); + spin_unlock(&lustre_profile_list_lock); + return; + } + spin_unlock(&lustre_profile_list_lock); + + /* confirm not a negative number */ + LASSERT(lprof->lp_refs == 0); + + /* At least one class_del_profile/profiles must be called + * on the target profile or lustre_profile_list will corrupt */ + LASSERT(lprof->lp_list_deleted); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md != NULL) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof(*lprof)); +} +EXPORT_SYMBOL(class_put_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + ENTRY; + + spin_lock(&lustre_profile_list_lock); + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + lprof->lp_list_deleted = true; + spin_unlock(&lustre_profile_list_lock); + + class_put_profile(lprof); + + spin_lock(&lustre_profile_list_lock); + } + spin_unlock(&lustre_profile_list_lock); + EXIT; +} +EXPORT_SYMBOL(class_del_profiles); + +/* We can't call lquota_process_config directly because + * it lives in a module that must be loaded after this one. + */ +#ifdef HAVE_SERVER_SUPPORT +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. 
+ * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + ENTRY; + + if (!cfg || !new_name) + GOTO(out_nocfg, new_cfg = ERR_PTR(-EINVAL)); + + param = lustre_cfg_string(cfg, 1); + if (!param) + GOTO(out_nocfg, new_cfg = ERR_PTR(-EINVAL)); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (!new_param) + GOTO(out_nocfg, new_cfg = ERR_PTR(-ENOMEM)); + + strcpy(new_param, new_name); + if (value != NULL) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (!bufs) + GOTO(out_free_param, new_cfg = ERR_PTR(-ENOMEM)); + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + OBD_ALLOC(new_cfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!new_cfg) + GOTO(out_free_buf, new_cfg = ERR_PTR(-ENOMEM)); + + lustre_cfg_init(new_cfg, cfg->lcfg_command, bufs); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; +out_free_buf: + OBD_FREE_PTR(bufs); +out_free_param: + OBD_FREE(new_param, new_len); +out_nocfg: + RETURN(new_cfg); +} +EXPORT_SYMBOL(lustre_cfg_rename); + +static ssize_t process_param2_config(struct lustre_cfg *lcfg) +{ + char *param = lustre_cfg_string(lcfg, 1); + char *upcall = lustre_cfg_string(lcfg, 2); + struct kobject *kobj = NULL; + const char *subsys = param; + char *argv[] = { + [0] = "/usr/sbin/lctl", + [1] = "set_param", + [2] = param, + [3] = NULL + }; + ktime_t start; + ktime_t end; + size_t len; + int rc; + + ENTRY; + print_lustre_cfg(lcfg); + + len = strcspn(param, ".="); + if (!len) + return -EINVAL; + + /* If we find '=' then its the top level sysfs directory */ + if (param[len] == '=') + return class_set_global(param); + + subsys = kstrndup(param, len, GFP_KERNEL); + if (!subsys) + return -ENOMEM; + + kobj = kset_find_obj(lustre_kset, subsys); + kfree(subsys); + if (kobj) { + char *value = param; + char *envp[3]; + int i; + + param = strsep(&value, "="); + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s", param); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = NULL; + + rc = kobject_uevent_env(kobj, KOBJ_CHANGE, envp); + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); + + kobject_put(kobj); + + RETURN(rc); + } + + /* Add upcall processing here. 
Now only lctl is supported */ + if (strcmp(upcall, LCTL_UPCALL) != 0) { + CERROR("Unsupported upcall %s\n", upcall); + RETURN(-EINVAL); + } + + start = ktime_get(); + rc = call_usermodehelper(argv[0], argv, NULL, UMH_WAIT_PROC); + end = ktime_get(); + + if (rc < 0) { + CERROR("lctl: error invoking upcall %s %s %s: rc = %d; " + "time %ldus\n", argv[0], argv[1], argv[2], rc, + (long)ktime_us_delta(end, start)); + } else { + CDEBUG(D_HA, "lctl: invoked upcall %s %s %s, time %ldus\n", + argv[0], argv[1], argv[2], + (long)ktime_us_delta(end, start)); + rc = 0; + } + + RETURN(rc); +} + +#ifdef HAVE_SERVER_SUPPORT +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} +EXPORT_SYMBOL(lustre_register_quota_process_config); +#endif /* HAVE_SERVER_SUPPORT */ + +/** Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. + */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch(lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + GOTO(out, err); + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid %#llx" + " (%s)\n", lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid); + GOTO(out, err); + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) + ? "" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + GOTO(out, err); + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* set these mount options somewhere, so ll_fill_super + * can find them. 
*/ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err); + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + GOTO(out, err = 0); + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + GOTO(out, err = 0); + } + case LCFG_MARKER: { + struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, marker->cm_comment); + GOTO(out, err = 0); + } + case LCFG_PARAM: { + char *tmp; + + /* llite has no obd */ + if (class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) { + struct lustre_sb_info *lsi; + unsigned long addr; + ssize_t count; + + /* The instance name contains the sb: + * lustre-client-aacfe000 + */ + tmp = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!tmp || !*(++tmp)) + GOTO(out, err = -EINVAL); + + if (sscanf(tmp, "%lx", &addr) != 1) + GOTO(out, err = -EINVAL); + + lsi = s2lsi((struct super_block *)addr); + /* This better be a real Lustre superblock! */ + LASSERT(lsi->lsi_lmd->lmd_magic == LMD_MAGIC); + + count = class_modify_config(lcfg, PARAM_LLITE, + lsi->lsi_kobj); + err = count < 0 ? count : 0; + GOTO(out, err); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. 
+ */ + if (err < 0) + CWARN("Ignoring unknown param %s\n", tmp); + + GOTO(out, err = 0); +#ifdef HAVE_SERVER_SUPPORT + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + GOTO(out, err); +#endif /* HAVE_SERVER_SUPPORT */ + } + + break; + } + case LCFG_SET_PARAM: { + err = process_param2_config(lcfg); + GOTO(out, err = 0); + } + } + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (obd == NULL) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + GOTO(out, err = -EINVAL); + } + switch(lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + GOTO(out, err); + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + } + /* Process config log ADD_MDC record twice to add MDC also to LOV + * for Data-on-MDT: + * + * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1 + * 4:lustre-MDT0000-mdc_UUID + */ + case LCFG_ADD_MDC: { + struct obd_device *lov_obd; + char *clilmv; + + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + if (err) + GOTO(out, err); + + /* make sure this is client LMV log entry */ + clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv"); + if (!clilmv) + GOTO(out, err); + + /* replace 'lmv' with 'lov' name to address LOV device and + * process llog record to add MDC there. */ + clilmv[4] = 'o'; + lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (lov_obd == NULL) { + err = -ENOENT; + CERROR("%s: Cannot find LOV by %s name, rc = %d\n", + obd->obd_name, lustre_cfg_string(lcfg, 0), err); + } else { + err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg); + } + /* restore 'lmv' name */ + clilmv[4] = 'm'; + GOTO(out, err); + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); + + } + } + EXIT; +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj) +{ + struct kobj_type *typ; + ssize_t count = 0; + int i; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + return -EINVAL; + } + + typ = get_ktype(kobj); + if (!typ || !typ->default_attrs) + return -ENODEV; + + print_lustre_cfg(lcfg); + + /* + * e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + * or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + * or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 + */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + struct attribute *attr; + size_t keylen; + char *value; + char *key; + int j; + + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + if (class_match_param(key, prefix, &key)) + /* If the prefix doesn't match, return error so we + * can pass it down the stack + */ + return -EINVAL; + + value = strchr(key, '='); + if (!value || *(value + 1) == 0) { + CERROR("%s: can't parse param '%s' (missing '=')\n", + lustre_cfg_string(lcfg, 0), + lustre_cfg_string(lcfg, i)); + /* continue parsing other params */ + continue; + } + keylen = value - key; + value++; + + attr = NULL; + for (j = 0; typ->default_attrs[j]; j++) { + if (!strncmp(typ->default_attrs[j]->name, key, + keylen)) { + attr = typ->default_attrs[j]; + break; + } + } + + if (!attr) { + char *envp[3]; + + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s.%s.%.*s", + kobject_name(kobj->parent), + kobject_name(kobj), + (int) keylen, key); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = NULL; + + if (kobject_uevent_env(kobj, KOBJ_CHANGE, envp)) { + CERROR("%s: failed to send uevent %s\n", + kobject_name(kobj), key); + } + + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); + } else { + count += lustre_attr_store(kobj, attr, value, + strlen(value)); + } + } + return count; +} +EXPORT_SYMBOL(class_modify_config); + +/* + * Supplemental functions for config logs, it allocates lustre_cfg + * buffers plus initialized llog record header at the beginning. + */ +struct llog_cfg_rec *lustre_cfg_rec_new(int cmd, struct lustre_cfg_bufs *bufs) +{ + struct llog_cfg_rec *lcr; + int reclen; + + ENTRY; + + reclen = lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen); + reclen = llog_data_len(reclen) + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + + OBD_ALLOC(lcr, reclen); + if (lcr == NULL) + RETURN(NULL); + + lustre_cfg_init(&lcr->lcr_cfg, cmd, bufs); + + lcr->lcr_hdr.lrh_len = reclen; + lcr->lcr_hdr.lrh_type = OBD_CFG_REC; + + RETURN(lcr); +} +EXPORT_SYMBOL(lustre_cfg_rec_new); + +void lustre_cfg_rec_free(struct llog_cfg_rec *lcr) +{ + ENTRY; + OBD_FREE(lcr, lcr->lcr_hdr.lrh_len); + EXIT; +} +EXPORT_SYMBOL(lustre_cfg_rec_free); + +/** Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. 
+ */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *cfg = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + int rc = 0; + ENTRY; + + /* class_config_dump_handler(handle, rec, data); */ + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + cfg->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + cfg->cfg_flags = CFG_F_MARKER; + server_name2index(marker->cm_tgtname, + &cfg->cfg_lwp_idx, NULL); + if (marker->cm_flags & CM_SKIP) { + cfg->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (cfg->cfg_sb && + lustre_check_exclusion(cfg->cfg_sb, + marker->cm_tgtname))) { + cfg->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + cfg->cfg_flags = 0; + } + } + /* A config command without a start marker before it is + * illegal + */ + if (!(cfg->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Skip config outside markers, (inst: %016lx, uuid: %s, flags: %#x)\n", + cfg->cfg_instance, + cfg->cfg_uuid.uuid, cfg->cfg_flags); + cfg->cfg_flags |= CFG_F_SKIP; + } + if (cfg->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + cfg->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" obd device type to "mdt". 
+ */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd " + "type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, " + "set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + +#ifdef HAVE_SERVER_SUPPORT + /* newer MDS replaces LOV/OSC with LOD/OSP */ + { + char *typename = lustre_cfg_string(lcfg, 1); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, LUSTRE_LOV_NAME) == 0) && + cfg->cfg_sb && IS_MDT(s2lsi(cfg->cfg_sb))) { + CDEBUG(D_CONFIG, + "For 2.x interoperability, rename obd " + "type from lov to lod (%s)\n", + s2lsi(cfg->cfg_sb)->lsi_svname); + strcpy(typename, LUSTRE_LOD_NAME); + } + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, LUSTRE_OSC_NAME) == 0) && + cfg->cfg_sb && IS_MDT(s2lsi(cfg->cfg_sb))) { + CDEBUG(D_CONFIG, + "For 2.x interoperability, rename obd " + "type from osc to osp (%s)\n", + s2lsi(cfg->cfg_sb)->lsi_svname); + strcpy(typename, LUSTRE_OSP_NAME); + } + } +#endif /* HAVE_SERVER_SUPPORT */ + + if (cfg->cfg_flags & CFG_F_EXCLUDE) { + CDEBUG(D_CONFIG, "cmd: %x marked EXCLUDED\n", + lcfg->lcfg_command); + if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + } + + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_init(&bufs, lcfg); + + if (cfg->cfg_instance && + lcfg->lcfg_command != LCFG_SPTLRPC_CONF && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + 16 + 4; + OBD_ALLOC(inst_name, inst_len); + if (inst_name == NULL) + GOTO(out, rc = -ENOMEM); + snprintf(inst_name, inst_len, "%s-%016lx", + lustre_cfg_string(lcfg, 0), + cfg->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* override llog UUID for clients, to insure they are unique */ + if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_ATTACH) + lustre_cfg_bufs_set_string(&bufs, 2, + cfg->cfg_uuid.uuid); + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (cfg->cfg_instance && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + /* After ASLR changes cfg_instance this needs fixing */ + /* "obd" is set in config_log_find_or_add() */ + struct obd_device *obd = (void *)cfg->cfg_instance; + + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + obd->obd_name); + } + + /* Add net info to setup command + * if given on command line. 
+ * So config log will be: + * [0]: client name + * [1]: client UUID + * [2]: server UUID + * [3]: inactive-on-startup + * [4]: restrictive net + */ + if (cfg && cfg->cfg_sb && s2lsi(cfg->cfg_sb) && + !IS_SERVER(s2lsi(cfg->cfg_sb))) { + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + char *nidnet = lsi->lsi_lmd->lmd_nidnet; + + if (lcfg->lcfg_command == LCFG_SETUP && + lcfg->lcfg_bufcount != 2 && nidnet) { + CDEBUG(D_CONFIG, "Adding net %s info to setup " + "command for client %s\n", nidnet, + lustre_cfg_string(lcfg, 0)); + lustre_cfg_bufs_set_string(&bufs, 4, nidnet); + } + } + + /* Skip add_conn command if uuid is + * not on restricted net */ + if (cfg && cfg->cfg_sb && s2lsi(cfg->cfg_sb) && + !IS_SERVER(s2lsi(cfg->cfg_sb))) { + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + char *uuid_str = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_command == LCFG_ADD_CONN && + lsi->lsi_lmd->lmd_nidnet && + LNET_NIDNET(libcfs_str2nid(uuid_str)) != + libcfs_str2net(lsi->lsi_lmd->lmd_nidnet)) { + CDEBUG(D_CONFIG, "skipping add_conn for %s\n", + uuid_str); + rc = 0; + /* No processing! */ + break; + } + } + + OBD_ALLOC(lcfg_new, lustre_cfg_len(bufs.lcfg_bufcount, + bufs.lcfg_buflen)); + if (!lcfg_new) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_init(lcfg_new, lcfg->lcfg_command, &bufs); + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* XXX Hack to try to remain binary compatible with + * pre-newconfig logs */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? */ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + OBD_FREE(lcfg_new, lustre_cfg_len(lcfg_new->lcfg_bufcount, + lcfg_new->lcfg_buflens)); + if (inst_name) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + RETURN(rc); +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = { + .lpcd_first_idx = 0, + }; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + ENTRY; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_parse_llog); + +/** + * Parse config record and 
output dump in supplied buffer. + * + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + * + * Sample Output: + * - { index: 4, event: attach, device: lustrewt-clilov, type: lov, + * UUID: lustrewt-clilov_UUID } + */ +int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0, i; + struct lcfg_type_data *ldata; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + return rc; + + ldata = lcfg_cmd2data(lcfg->lcfg_command); + if (ldata == NULL) + return -ENOTTY; + + if (lcfg->lcfg_command == LCFG_MARKER) + return 0; + + /* form YAML entity */ + ptr += snprintf(ptr, end - ptr, "- { index: %u, event: %s", + rec->lrh_index, ldata->ltd_name); + if (end - ptr <= 0) + goto out_overflow; + + if (lcfg->lcfg_flags) { + ptr += snprintf(ptr, end - ptr, ", flags: %#08x", + lcfg->lcfg_flags); + if (end - ptr <= 0) + goto out_overflow; + } + if (lcfg->lcfg_num) { + ptr += snprintf(ptr, end - ptr, ", num: %#08x", + lcfg->lcfg_num); + if (end - ptr <= 0) + goto out_overflow; + } + if (lcfg->lcfg_nid) { + char nidstr[LNET_NIDSTR_SIZE]; + + libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); + ptr += snprintf(ptr, end - ptr, ", nid: %s(%#llx)", + nidstr, lcfg->lcfg_nid); + if (end - ptr <= 0) + goto out_overflow; + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + ptr += snprintf(ptr, end - ptr, ", device: %s", + lustre_cfg_string(lcfg, 0)); + if (end - ptr <= 0) + goto out_overflow; + } + + if (lcfg->lcfg_command == LCFG_SET_PARAM) { + /* + * set_param -P parameters have param=val here, separate + * them through pointer magic and print them out in + * native yamlese + */ + char *cfg_str = lustre_cfg_string(lcfg, 1); + char *tmp = strchr(cfg_str, '='); + size_t len; + + if (tmp == NULL) + goto out_done; + + ptr += snprintf(ptr, end - ptr, ", %s: ", ldata->ltd_bufs[0]); + len = tmp - cfg_str + 1; + snprintf(ptr, len, "%s", cfg_str); + ptr += len - 1; + + ptr += snprintf(ptr, end - ptr, ", %s: ", ldata->ltd_bufs[1]); + ptr += snprintf(ptr, end - ptr, "%s", tmp + 1); + + goto out_done; + } + + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0) { + ptr += snprintf(ptr, end - ptr, ", %s: %s", + ldata->ltd_bufs[i - 1], + lustre_cfg_string(lcfg, i)); + if (end - ptr <= 0) + goto out_overflow; + } + } + +out_done: + ptr += snprintf(ptr, end - ptr, " }\n"); +out_overflow: + /* Return consumed bytes. If the buffer overflowed, zero last byte */ + rc = ptr - buf; + if (rc > size) { + rc = -EOVERFLOW; + *(end - 1) = '\0'; + } + + return rc; +} + +/** + * parse config record and output dump in supplied buffer. 
+ * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +static int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + ENTRY; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + RETURN(rc); + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) { + char nidstr[LNET_NIDSTR_SIZE]; + + libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); + ptr += snprintf(ptr, end-ptr, "nid=%s(%#llx)\n ", + nidstr, lcfg->lcfg_nid); + } + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + lustre_cfg_string(lcfg, i)); + } + } + ptr += snprintf(ptr, end - ptr, "\n"); + /* return consumed bytes */ + rc = ptr - buf; + RETURN(rc); +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + ENTRY; + + OBD_ALLOC(outstr, 256); + if (outstr == NULL) + RETURN(-ENOMEM); + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + RETURN(rc); +} + +/** Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. 
+ */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + ENTRY; + + if (!obd) { + CERROR("empty cleanup\n"); + RETURN(-EALREADY); + } + + if (obd->obd_force) + strcat(flags, "F"); + if (obd->obd_fail) + strcat(flags, "A"); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + RETURN(-ENOMEM); + lustre_cfg_init(lcfg, LCFG_CLEANUP, &bufs); + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + GOTO(out, rc); + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + RETURN(rc); +} +EXPORT_SYMBOL(class_manual_cleanup); + +/* + * uuid<->export lustre hash operations + */ + +static unsigned +uuid_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid, + sizeof(((struct obd_uuid *)key)->uuid), mask); +} + +static void * +uuid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return &exp->exp_client_uuid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return obd_uuid_equals(key, &exp->exp_client_uuid) && + !exp->exp_failed; +} + +static void * +uuid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_uuid_hash); +} + +static void +uuid_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_get(exp); +} + +static void +uuid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_put(exp); +} + +static struct cfs_hash_ops uuid_hash_ops = { + .hs_hash = uuid_hash, + .hs_key = uuid_key, + .hs_keycmp = uuid_keycmp, + .hs_object = uuid_export_object, + .hs_get = uuid_export_get, + .hs_put_locked = uuid_export_put_locked, +}; + + +/* + * nid<->export hash operations + */ + +static unsigned +nid_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static void * +nid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(&exp->exp_connection->c_peer.nid); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key && + !exp->exp_failed); +} + +static void * +nid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_nid_hash); +} + 
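+/*
+ * Editor's sketch (not part of the original patch, never compiled): the
+ * uuid/nid/nid_stats/gen tables in this file all follow the same
+ * cfs_hash_ops pattern that class_setup() wires up via cfs_hash_create().
+ * The hypothetical helper below only illustrates how an export would be
+ * indexed and found through obd_nid_hash using those callbacks; it assumes
+ * the libcfs cfs_hash_add()/cfs_hash_lookup() helpers and omits all error
+ * handling and locking.
+ */
+#if 0	/* illustration only */
+static void example_index_export(struct obd_device *obd,
+				 struct obd_export *exp)
+{
+	lnet_nid_t nid = exp->exp_connection->c_peer.nid;
+
+	/* insertion grabs an export reference through .hs_get */
+	cfs_hash_add(obd->obd_nid_hash, &nid, &exp->exp_nid_hash);
+
+	/* lookup matches through .hs_keycmp, which skips failed exports */
+	exp = cfs_hash_lookup(obd->obd_nid_hash, &nid);
+}
+#endif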
+static void +nid_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_get(exp); +} + +static void +nid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_put(exp); +} + +static struct cfs_hash_ops nid_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nid_key, + .hs_keycmp = nid_kepcmp, + .hs_object = nid_export_object, + .hs_get = nid_export_get, + .hs_put_locked = nid_export_put_locked, +}; + + +/* + * nid<->nidstats hash operations + */ + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key; +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static struct cfs_hash_ops nid_stat_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; + + +/* + * client_generation<->export hash operations + */ + +static unsigned +gen_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(__u32), mask); +} + +static void * +gen_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + + RETURN(&exp->exp_target_data.ted_lcd->lcd_generation); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +gen_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + + RETURN(exp->exp_target_data.ted_lcd->lcd_generation == *(__u32 *)key && + !exp->exp_failed); +} + +static void * +gen_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_gen_hash); +} + +static void +gen_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + class_export_get(exp); +} + +static void +gen_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + class_export_put(exp); +} + +static struct cfs_hash_ops gen_hash_ops = { + .hs_hash = gen_hash, + .hs_key = gen_key, + .hs_keycmp = gen_kepcmp, + .hs_object = gen_export_object, + .hs_get = gen_export_get, + .hs_put_locked = gen_export_put_locked, +}; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c new file mode 100644 index 0000000000000..3c7a51ffd38a1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c @@ -0,0 +1,1695 @@ +/* + * GPL HEADER 
START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include +#include +#include +#include +#include +#include + +static int (*client_fill_super)(struct super_block *sb, + struct vfsmount *mnt); + +static void (*kill_super_cb)(struct super_block *sb); + +/**************** config llog ********************/ + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same mgc may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. + */ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs *bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + LASSERT(mgc); + LASSERT(cfg); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + RETURN(-ENOMEM); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, logname); + lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); + if (!lcfg) + GOTO(out, rc = -ENOMEM); + lustre_cfg_init(lcfg, LCFG_LOG_START, bufs); + + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); +out: + OBD_FREE_PTR(bufs); + + if (rc == -EINVAL) + LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'" + "failed from the MGS (%d). Make sure this " + "client and the MGS are running compatible " + "versions of Lustre.\n", + mgc->obd_name, logname, rc); + else if (rc != 0) + LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' " + "failed (%d). 
This may be the result of " + "communication errors between this node and " + "the MGS, a bad configuration, or other " + "errors. See the syslog for more " + "information.\n", mgc->obd_name, logname, + rc); + + /* class_obd_list(); */ + RETURN(rc); +} +EXPORT_SYMBOL(lustre_process_log); + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + if (!mgc) + RETURN(-ENOENT); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + RETURN(-ENOMEM); + lustre_cfg_init(lcfg, LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_end_log); + +/**************** obd start *******************/ + +/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from + * lctl (and do for echo cli/srv. + */ +static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + return -ENOMEM; + lustre_cfg_init(lcfg, cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + return rc; +} + +/** Call class_attach and class_setup. These methods in turn call + * obd type-specific methods. 
+ */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL); + } + return rc; +} + +static DEFINE_MUTEX(mgc_start_lock); + +/** Set up a mgc obd to process startup logs + * + * \param sb [in] super block of the mgc obd + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + class_uuid_t uuidc; + lnet_nid_t nid; + char nidstr[LNET_NIDSTR_SIZE]; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int rc = 0, i = 0, j; + size_t len; + ENTRY; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS nid for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + struct lnet_process_id id; + + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (id.nid == LNET_NID_LO_0) + continue; + nid = id.nid; + i++; + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + RETURN(-EINVAL); + } + + mutex_lock(&mgc_start_lock); + + libcfs_nid2str_r(nid, nidstr, sizeof(nidstr)); + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(nidstr) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (mgcname == NULL || niduuid == NULL) + GOTO(out_free, rc = -ENOMEM); + snprintf(mgcname, len, "%s%s", LUSTRE_MGC_OBDNAME, nidstr); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out_free, rc = -ENOMEM); + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + int recov_bk; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Trying to mount a client with IR setting " + "not compatible with current mgc. " + "Force to use current mgc setting that is " + "IR %s.\n", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* If we are restarting the MGS, don't try to keep the MGC's + old connection, or registration will fail. 
*/ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* Try all connections, but only once (again). + We don't want to block another target from starting + (using its local copy of the log), but we do want to connect + if at all possible. */ + recov_bk++; + CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + GOTO(out, rc = 0); + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary nids for the MGS */ + i = 0; + snprintf(niduuid, len + 2, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + CDEBUG(D_MOUNT, "mgs nids %s.\n", ptr); + if (IS_MGS(lsi)) { + /* Use local nids (including LO) */ + struct lnet_process_id id; + + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, id.nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS nids given.\n"); + GOTO(out_free, rc = -EINVAL); + } + /* + * Add primary MGS nid(s). + * Multiple nids on one MGS node are separated + * by commas. + */ + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + GOTO(out_free, rc = -EINVAL); + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out_free, rc = -ENOMEM); + + ll_generate_random_uuid(uuidc); + class_uuid_unparse(uuidc, uuid); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, NULL, NULL); + if (rc) + GOTO(out_free, rc); + + /* Add any failover MGS nids */ + i = 1; + while (ptr && ((*ptr == ':' || + class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++j; + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + GOTO(out_free, rc = -ENOTCONN); + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Keep a refcount of servers/clients who started with "mount", + so we know when we can get rid of the mgc. 
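/*
 * Illustrative sketch, not part of this patch: how a client mount target of
 * the form "nid,nid:nid:/fsname" (as in the "uml1,1@elan:uml2,2@elan:/lustre"
 * comments above) breaks down.  Commas separate NIDs of the same MGS node,
 * ':' introduces the next failover node, and ":/" starts the filesystem name.
 * The kernel walks this with class_parse_nid(); a plain strcspn() walk is
 * used here only to show the grouping.  The NID strings are hypothetical.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char dev[] = "192.168.1.21@tcp,10.0.0.21@o2ib:192.168.1.22@tcp:/lustre";
	char *fs = strstr(dev, ":/");
	char *p = dev;
	int node = 0;

	if (!fs)
		return 1;
	*fs = '\0';				/* cut off the ":/fsname" part */
	printf("fsname part: %s\n", fs + 1);	/* "/lustre" */

	printf("failover node %d:\n", node);
	while (*p) {
		size_t n = strcspn(p, ",:");	/* length of one NID token */

		printf("  nid %.*s\n", (int)n, p);
		if (p[n] == ':')		/* ':' starts the next node */
			printf("failover node %d:\n", ++node);
		p += n;
		if (*p)
			p++;			/* skip the ',' or ':' */
	}
	return 0;
}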
*/ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_BARRIER; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; +#endif + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, uuid, data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + GOTO(out, rc); + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* Keep the mgc info in the sb. Note that many lsi's can point + to the same mgc.*/ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (uuid) + OBD_FREE_PTR(uuid); + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + RETURN(rc); +} + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *niduuid = NULL, *ptr = NULL; + int i, rc = 0, len = 0; + ENTRY; + + if (!lsi) + RETURN(-ENOENT); + obd = lsi->lsi_mgc; + if (!obd) + RETURN(-ENOENT); + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* This is not fatal, every client that stops + will call in here. */ + CDEBUG(D_MOUNT, "mgc still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + GOTO(out, rc = -EBUSY); + } + + /* The MGC has no recoverable data in any case. 
+ * force shotdown set in umount_begin */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) { + /* An error is not fatal, if we are unable to send the + disconnect mgs ping evictor cleans up the export */ + rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + if (rc) + CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); + } + + /* Save the obdname for cleaning the nid uuids, which are + obdname_XX */ + len = strlen(obd->obd_name) + 6; + OBD_ALLOC(niduuid, len); + if (niduuid) { + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + } + + rc = class_manual_cleanup(obd); + if (rc) + GOTO(out, rc); + + /* Clean the nid uuids */ + if (!niduuid) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(ptr, "_%x", i); + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, + niduuid, NULL, NULL, NULL); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } +out: + if (niduuid) + OBD_FREE(niduuid, len); + + /* class_import_put will get rid of the additional connections */ + mutex_unlock(&mgc_start_lock); + RETURN(rc); +} + +/***************** lustre superblock **************/ + +static struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi; + ENTRY; + + OBD_ALLOC_PTR(lsi); + if (!lsi) + RETURN(NULL); + OBD_ALLOC_PTR(lsi->lsi_lmd); + if (!lsi->lsi_lmd) { + OBD_FREE_PTR(lsi); + RETURN(NULL); + } + + lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + INIT_LIST_HEAD(&lsi->lsi_lwp_list); + mutex_init(&lsi->lsi_lwp_mutex); + + RETURN(lsi); +} + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); + + /* someone didn't call server_put_mount. */ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_fileset != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_fileset, + strlen(lsi->lsi_lmd->lmd_fileset) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + if (lsi->lsi_lmd->lmd_mgs != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgs, + strlen(lsi->lsi_lmd->lmd_mgs) + 1); + if (lsi->lsi_lmd->lmd_osd_type != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_osd_type, + strlen(lsi->lsi_lmd->lmd_osd_type) + 1); + if (lsi->lsi_lmd->lmd_params != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_params, 4096); + if (lsi->lsi_lmd->lmd_nidnet != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_nidnet, + strlen(lsi->lsi_lmd->lmd_nidnet) + 1); + + OBD_FREE_PTR(lsi->lsi_lmd); + } + + LASSERT(lsi->lsi_llsbi == NULL); + OBD_FREE_PTR(lsi); + s2lsi_nocast(sb) = NULL; + + RETURN(0); +} + +/* The lsi has one reference for every server that is using the disk - + e.g. 
MDT, MGS, and potentially MGC */ +int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + if (IS_SERVER(lsi) && lsi->lsi_osd_exp) { + lu_device_put(&lsi->lsi_dt_dev->dd_lu_dev); + lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt = NULL; + lsi->lsi_dt_dev = NULL; + obd_disconnect(lsi->lsi_osd_exp); + /* wait till OSD is gone */ + obd_zombie_barrier(); + } + lustre_free_lsi(sb); + RETURN(1); + } + RETURN(0); +} + +/* + * The goal of this function is to extract the file system name + * from the obd name. This can come in two flavors. One is + * fsname-MDTXXXX or fsname-XXXXXXX were X is a hexadecimal + * number. In both cases we should return fsname. If it is + * not a valid obd name it is assumed to be the file system + * name itself. + */ +void obdname2fsname(const char *tgt, char *fsname, size_t buflen) +{ + const char *ptr; + const char *tmp; + size_t len = 0; + + /* First we have to see if the @tgt has '-' at all. It is + * valid for the user to request something like + * lctl set_param -P llite.lustre*.xattr_cache=0 + */ + ptr = strrchr(tgt, '-'); + if (!ptr) { + /* No '-' means it could end in '*' */ + ptr = strchr(tgt, '*'); + if (!ptr) { + /* No '*' either. Assume tgt = fsname */ + len = strlen(tgt); + goto valid_obd_name; + } + len = ptr - tgt; + goto valid_obd_name; + } + + /* tgt format fsname-MDT0000-* */ + if ((!strncmp(ptr, "-MDT", 4) || + !strncmp(ptr, "-OST", 4)) && + (isxdigit(ptr[4]) && isxdigit(ptr[5]) && + isxdigit(ptr[6]) && isxdigit(ptr[7]))) { + len = ptr - tgt; + goto valid_obd_name; + } + + /* tgt_format fsname-cli'dev'-'uuid' except for the llite case + * which are named fsname-'uuid'. Examples: + * + * lustre-clilov-ffff88104db5b800 + * lustre-ffff88104db5b800 (for llite device) + * + * The length of the obd uuid can vary on different platforms. + * This test if any invalid characters are in string. Allow + * wildcards with '*' character. + */ + ptr++; + if (!strspn(ptr, "0123456789abcdefABCDEF*")) { + len = 0; + goto no_fsname; + } + + /* Now that we validated the device name lets extract the + * file system name. Most of the names in this class will + * have '-cli' in its name which needs to be dropped. If + * it doesn't have '-cli' then its a llite device which + * ptr already points to the start of the uuid string. + */ + tmp = strstr(tgt, "-cli"); + if (tmp) + ptr = tmp; + else + ptr--; + len = ptr - tgt; +valid_obd_name: + len = min_t(size_t, len, LUSTRE_MAXFSNAME); + snprintf(fsname, buflen, "%.*s", (int)len, tgt); +no_fsname: + fsname[len] = '\0'; +} +EXPORT_SYMBOL(obdname2fsname); + +/*** SERVER NAME *** + * + * FSNAME is between 1 and 8 characters (inclusive). + * Excluded characters are '/' and ':' + * SEPARATOR is either ':' or '-' + * TYPE: "OST", "MDT", etc. + * INDEX: Hex representation of the index + */ + +/** Get the fsname ("lustre") from the server name ("lustre-OST003F"). + * @param [in] svname server name including type and index + * @param [out] fsname Buffer to copy filesystem name prefix into. + * Must have at least 'strlen(fsname) + 1' chars. 
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname + * rc < 0 on error + */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr) +{ + const char *dash; + + dash = svname + strnlen(svname, LUSTRE_MAXFSNAME); + for (; dash > svname && *dash != '-' && *dash != ':'; dash--) + ; + if (dash == svname) + return -EINVAL; + + if (fsname != NULL) { + strncpy(fsname, svname, dash - svname); + fsname[dash - svname] = '\0'; + } + + if (endptr != NULL) + *endptr = dash; + + return 0; +} +EXPORT_SYMBOL(server_name2fsname); + +/** + * Get service name (svname) from string + * rc < 0 on error + * if endptr isn't NULL it is set to end of fsname * + */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize) +{ + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(label, NULL, &dash); + if (rc != 0) + return rc; + + if (endptr != NULL) + *endptr = dash; + + if (strlcpy(svname, dash + 1, svsize) >= svsize) + return -E2BIG; + + return 0; +} +EXPORT_SYMBOL(server_name2svname); + +/** + * check server name is OST. + **/ +int server_name_is_ost(const char *svname) +{ + const char *dash; + int rc; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + dash++; + + if (strncmp(dash, "OST", 3) == 0) + return 1; + return 0; +} +EXPORT_SYMBOL(server_name_is_ost); + +/** + * Get the index from the target name MDTXXXX/OSTXXXX + * rc = server type, or rc < 0 on error + **/ +int target_name2index(const char *tgtname, __u32 *idx, const char **endptr) +{ + const char *dash = tgtname; + unsigned long index; + int rc; + + if (strncmp(dash, "MDT", 3) == 0) + rc = LDD_F_SV_TYPE_MDT; + else if (strncmp(dash, "OST", 3) == 0) + rc = LDD_F_SV_TYPE_OST; + else + return -EINVAL; + + dash += 3; + + if (strncmp(dash, "all", 3) == 0) { + if (endptr != NULL) + *endptr = dash + 3; + return rc | LDD_F_SV_ALL; + } + + index = simple_strtoul(dash, (char **)endptr, 16); + if (idx != NULL) + *idx = index; + return rc; +} +EXPORT_SYMBOL(target_name2index); + +/* Get the index from the obd name. + rc = server type, or + rc < 0 on error + if endptr isn't NULL it is set to end of name */ +int server_name2index(const char *svname, __u32 *idx, const char **endptr) +{ + const char *dash; + int rc; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + dash++; + rc = target_name2index(dash, idx, endptr); + if (rc < 0) + return rc; + + /* Account for -mdc after index that is possible when specifying mdt */ + if (endptr != NULL && strncmp(LUSTRE_MDC_NAME, *endptr + 1, + sizeof(LUSTRE_MDC_NAME)-1) == 0) + *endptr += sizeof(LUSTRE_MDC_NAME); + + return rc; +} +EXPORT_SYMBOL(server_name2index); + +/*************** mount common betweeen server and client ***************/ + +/* Common umount */ +int lustre_common_put_super(struct super_block *sb) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "dropping sb %p\n", sb); + + /* Drop a ref to the MGC */ + rc = lustre_stop_mgc(sb); + if (rc && (rc != -ENOENT)) { + if (rc != -EBUSY) { + CERROR("Can't stop MGC: %d\n", rc); + RETURN(rc); + } + /* BUSY just means that there's some other obd that + needs the mgc. Let him clean it up. 
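/*
 * Illustrative sketch, not part of this patch: what the helpers above
 * (server_name2fsname(), target_name2index(), server_name_is_ost()) extract
 * from a server label such as "lustre-OST003f".  The real helpers return
 * LDD_F_SV_TYPE_* flags and scan within LUSTRE_MAXFSNAME; plain strings and
 * strrchr() are used here only to show the name layout.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *svname = "lustre-OST003f";	/* hypothetical target name */
	const char *dash = strrchr(svname, '-');
	char fsname[16];
	unsigned long idx;

	if (!dash)
		return 1;
	snprintf(fsname, sizeof(fsname), "%.*s", (int)(dash - svname), svname);
	idx = strtoul(dash + 4, NULL, 16);	/* skip "-OST"/"-MDT" */

	/* prints: fsname=lustre type=OST index=63 (0x003f) */
	printf("fsname=%s type=%.3s index=%lu (0x%04lx)\n",
	       fsname, dash + 1, idx, idx);
	return 0;
}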
*/ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + ENTRY; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + RETURN(0); + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for(i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + RETURN(1); + } + } + RETURN(0); +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 *exclude_list; + __u32 index = 0; + int rc = 0, devmax; + ENTRY; + + /* The shortest an ost name can be is 8 chars: -OST0000. + We don't actually know the fsname at this time, so in fact + a user could specify any fsname. 
*/ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC(exclude_list, sizeof(index) * devmax); + if (!exclude_list) + RETURN(-ENOMEM); + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s': rc = %d\n", + s1, rc); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n", + (uint)(s2-s1), s1, rc); + s1 = s2; + /* now we are pointing at ':' (next exclude) + or ',' (end of excludes) */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE(exclude_list, sizeof(index) * devmax); + RETURN(rc); +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_network(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_nidnet != NULL) { + OBD_FREE(lmd->lmd_nidnet, strlen(lmd->lmd_nidnet) + 1); + lmd->lmd_nidnet = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_nidnet, length + 1); + if (lmd->lmd_nidnet == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_nidnet, ptr, length); + lmd->lmd_nidnet[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of nidlist */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {} + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] = '\0'; + 
lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** + * Find the first delimiter (comma or colon) from the specified \a buf and + * make \a *endh point to the string starting with the delimiter. The commas + * in expression list [...] will be skipped. + * + * @buf a delimiter-separated string + * @endh a pointer to a pointer that will point to the string + * starting with the delimiter + * + * RETURNS true if delimiter is found, false if delimiter is not found + */ +static bool lmd_find_delimiter(char *buf, char **endh) +{ + char *c = buf; + size_t pos; + bool found; + + if (!buf) + return false; +try_again: + if (*c == ',' || *c == ':') + return true; + + pos = strcspn(c, "[:,]"); + if (!pos) + return false; + + /* Not a valid mount string */ + if (*c == ']') { + CWARN("invalid mount string format\n"); + return false; + } + + c += pos; + if (*c == '[') { + c = strchr(c, ']'); + + /* invalid mount string */ + if (!c) { + CWARN("invalid mount string format\n"); + return false; + } + c++; + goto try_again; + } + + found = *c != '\0'; + if (found && endh) + *endh = c; + + return found; +} + +/** + * Find the first valid string delimited by comma or colon from the specified + * \a buf and parse it to see whether it's a valid nid list. If yes, \a *endh + * will point to the next string starting with the delimiter. + * + * \param[in] buf a delimiter-separated string + * \param[in] endh a pointer to a pointer that will point to the string + * starting with the delimiter + * + * \retval 0 if the string is a valid nid list + * \retval 1 if the string is not a valid nid list + */ +static int lmd_parse_nidlist(char *buf, char **endh) +{ + struct list_head nidlist; + char *endp = buf; + char tmp; + int rc = 0; + + if (buf == NULL) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + if (!lmd_find_delimiter(buf, &endp)) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + + INIT_LIST_HEAD(&nidlist); + if (cfs_parse_nidlist(buf, strlen(buf), &nidlist) <= 0) + rc = 1; + cfs_free_nidlist(&nidlist); + + *endp = tmp; + if (rc != 0) + return rc; + if (endh != NULL) + *endh = endp; + return 0; +} + +/** Parse mount line options + * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre + */ +static int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + ENTRY; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that " + "/sbin/mount.lustre is installed.\n"); + RETURN(-EINVAL); + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, "You're using an old version of " + "/sbin/mount.lustre. Please install " + "version %s\n", LUSTRE_VERSION_STRING); + RETURN(-EINVAL); + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, LMD_PARAMS_MAXLEN); + if (lmd->lmd_params == NULL) + RETURN(-ENOMEM); + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + char *s3; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + s3 = s1; + + /* Client options are parsed in ll_options: eg. flock, + user_xattr, acl */ + + /* Parse non-ldiskfs options here. 
Rather than modifying + ldiskfs, we just zero these out here */ + if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = + max_t(int, simple_strtoul(s1 + 19, NULL, 10), + time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = + max_t(int, simple_strtoul(s1 + 19, NULL, 10), + time_min); + clear++; + } else if (strncmp(s1, "noir", 4) == 0) { + lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ + clear++; + } else if (strncmp(s1, "nosvc", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSVC; + clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; + } else if (strncmp(s1, "noscrub", 7) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSCRUB; + clear++; + } else if (strncmp(s1, "skip_lfsck", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_SKIP_LFSCK; + clear++; + } else if (strncmp(s1, "rdonly_dev", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_DEV_RDONLY; + clear++; + } else if (strncmp(s1, PARAM_MGSNODE, + sizeof(PARAM_MGSNODE) - 1) == 0) { + s2 = s1 + sizeof(PARAM_MGSNODE) - 1; + /* Assume the next mount opt is the first + invalid nid we get to. */ + rc = lmd_parse_mgs(lmd, &s2); + if (rc) + goto invalid; + s3 = s2; + clear++; + } else if (strncmp(s1, "writeconf", 9) == 0) { + lmd->lmd_flags |= LMD_FLG_WRITECONF; + clear++; + } else if (strncmp(s1, "update", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_UPDATE; + clear++; + } else if (strncmp(s1, "virgin", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_VIRGIN; + clear++; + } else if (strncmp(s1, "noprimnode", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; + clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + /* ost exclusion list */ + } else if (strncmp(s1, "exclude=", 8) == 0) { + rc = lmd_make_exclusion(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "mgs", 3) == 0) { + /* We are an MGS */ + lmd->lmd_flags |= LMD_FLG_MGS; + clear++; + } else if (strncmp(s1, "svname=", 7) == 0) { + rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "param=", 6) == 0) { + size_t length, params_length; + char *tail = s1; + + if (lmd_find_delimiter(s1 + 6, &tail)) { + char *param_str = tail + 1; + int supplementary = 1; + while (lmd_parse_nidlist(param_str, + ¶m_str) == 0) { + supplementary = 0; + } + length = param_str - s1 - supplementary; + } else { + length = strlen(s1); + } + length -= 6; + params_length = strlen(lmd->lmd_params); + if (params_length + length + 1 >= LMD_PARAMS_MAXLEN) + RETURN(-E2BIG); + strncat(lmd->lmd_params, s1 + 6, length); + lmd->lmd_params[params_length + length] = '\0'; + strlcat(lmd->lmd_params, " ", LMD_PARAMS_MAXLEN); + s3 = s1 + 6 + length; + clear++; + } else if (strncmp(s1, "osd=", 4) == 0) { + rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); + if (rc) + goto invalid; + clear++; + } + /* Linux 2.4 doesn't pass the device, so we stuck it at the + end of the options. */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* terminate options right before device. device + must be the last one. 
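/*
 * Illustrative sketch, not part of this patch: the option-stripping pattern
 * used by lmd_parse() above.  Options that Lustre consumes (the "clear" ones)
 * are squeezed out of the comma-separated string with memmove() so that only
 * the options destined for the underlying filesystem remain.  The option
 * names recognized below are a small hypothetical subset.
 */
#include <stdio.h>
#include <string.h>

static int is_lustre_opt(const char *s, size_t n)
{
	/* stand-in for the abort_recov/mgsnode=/... checks above */
	return (n == 11 && !strncmp(s, "abort_recov", 11)) ||
	       (n >= 8 && !strncmp(s, "mgsnode=", 8));
}

int main(void)
{
	char opts[] = "abort_recov,errors=remount-ro,mgsnode=10.0.0.1@tcp,acl";
	char *s1 = opts;

	while (*s1) {
		char *s2 = strchr(s1, ',');
		size_t n = s2 ? (size_t)(s2 - s1) : strlen(s1);

		if (is_lustre_opt(s1, n)) {
			if (s2)
				memmove(s1, s2 + 1, strlen(s2 + 1) + 1);
			else
				*s1 = '\0';
		} else {
			s1 = s2 ? s2 + 1 : s1 + n;
		}
	}
	/* prints: errors=remount-ro,acl */
	printf("%s\n", opts);
	return 0;
}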
*/ + *s1 = '\0'; + break; + } else if (strncmp(s1, "network=", 8) == 0) { + rc = lmd_parse_network(lmd, s1 + 8); + if (rc) + goto invalid; + + /* check if LNet dynamic peer discovery is activated */ + if (LNetGetPeerDiscoveryStatus()) { + CERROR("LNet Dynamic Peer Discovery is enabled " + "on this node. 'network' mount option " + "cannot be taken into account.\n"); + goto invalid; + } + + clear++; + } + + /* Find next opt */ + s2 = strchr(s3, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, "Can't find the device name " + "(need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') + ; + s2 = s1; + while (*s2 != '/' && *s2 != '\0') + s2++; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, s2 - s1 + 8); + if (!lmd->lmd_profile) + RETURN(-ENOMEM); + + strncat(lmd->lmd_profile, s1, s2 - s1); + strncat(lmd->lmd_profile, "-client", 7); + + s1 = s2; + s2 = s1 + strlen(s1) - 1; + /* Remove padding /s from fileset */ + while (*s2 == '/') + s2--; + if (s2 > s1) { + OBD_ALLOC(lmd->lmd_fileset, s2 - s1 + 2); + if (lmd->lmd_fileset == NULL) { + OBD_FREE(lmd->lmd_profile, + strlen(lmd->lmd_profile) + 1); + RETURN(-ENOMEM); + } + strncat(lmd->lmd_fileset, s1, s2 - s1 + 1); + } + } else { + /* server mount */ + if (lmd->lmd_nidnet != NULL) { + /* 'network=' mount option forbidden for server */ + OBD_FREE(lmd->lmd_nidnet, strlen(lmd->lmd_nidnet) + 1); + lmd->lmd_nidnet = NULL; + rc = -EINVAL; + CERROR("%s: option 'network=' not allowed for Lustre " + "servers: rc = %d\n", devname, rc); + RETURN(rc); + } + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + RETURN(-ENOMEM); + strncpy(lmd->lmd_dev, devname, strlen(devname)+1); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + while (*options && (*options == ',' || *options == ' ')) + options++; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + RETURN(-ENOMEM); + strncpy(lmd->lmd_opts, options, strlen(options)+1); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + RETURN(rc); + +invalid: + CERROR("Bad mount options %s\n", options); + RETURN(-EINVAL); +} + +struct lustre_mount_data2 { + void *lmd2_data; + struct vfsmount *lmd2_mnt; +}; + +/** This is the entry point for the mount call into Lustre. + * This is called when a server or client is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. -o flock,abort_recov) + */ +static int lustre_fill_super(struct super_block *sb, void *data, int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_mount_data2 *lmd2 = data; + struct lustre_sb_info *lsi; + int rc; + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * `special'. + */ + lockdep_off(); + + /* + * LU-639: the obd cleanup of last mount may not finish yet, wait here. 
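/*
 * Illustrative sketch, not part of this patch: how the client device string
 * is split into the config-log profile and an optional fileset, mirroring
 * the ":/" handling at the end of lmd_parse() above (the kernel additionally
 * trims trailing slashes from the fileset).  The device string is
 * hypothetical.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *devname = "192.168.1.21@tcp:/lustre/home/project";
	const char *s1 = strstr(devname, ":/");
	const char *s2;
	char profile[64], fileset[64];
	size_t fslen;

	if (!s1)
		return 1;			/* no ":/": a server mount */
	s1 += 2;				/* skip ":/" */
	while (*s1 == '/')			/* tolerate extra slashes */
		s1++;

	s2 = strchr(s1, '/');			/* end of the fsname part */
	fslen = s2 ? (size_t)(s2 - s1) : strlen(s1);
	snprintf(profile, sizeof(profile), "%.*s-client", (int)fslen, s1);
	snprintf(fileset, sizeof(fileset), "%s", s2 ? s2 : "");

	/* prints: profile=lustre-client fileset=/home/project */
	printf("profile=%s fileset=%s\n", profile, fileset);
	return 0;
}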
+ */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) { + lustre_put_lsi(sb); + GOTO(out, rc = -EINVAL); + } + + if (lmd_is_client(lmd)) { + CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); + if (client_fill_super == NULL) + request_module("lustre"); + if (client_fill_super == NULL) { + LCONSOLE_ERROR_MSG(0x165, "Nothing registered for " + "client mount! Is the 'lustre' " + "module loaded?\n"); + lustre_put_lsi(sb); + rc = -ENODEV; + } else { + rc = lustre_start_mgc(sb); + if (rc) { + lustre_common_put_super(sb); + GOTO(out, rc); + } + /* Connect and start */ + /* (should always be ll_fill_super) */ + rc = (*client_fill_super)(sb, lmd2->lmd2_mnt); + /* c_f_s will call lustre_common_put_super on failure */ + } + } else { +#ifdef HAVE_SERVER_SUPPORT + CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev); + rc = server_fill_super(sb); + /* s_f_s calls lustre_start_mgc after the mount because we need + the MGS nids which are stored on disk. Plus, we may + need to start the MGS first. */ + /* s_f_s will call server_put_super on failure */ +#else + CERROR("This is client-side-only module, " + "cannot handle server mount.\n"); + rc = -EINVAL; +#endif + } + + /* If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. */ + GOTO(out, rc); +out: + if (rc) { + CERROR("Unable to mount %s (%d)\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "Mount %s complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + + +/* We can't call ll_fill_super by name because it lives in a module that + must be loaded after this one. */ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)) +{ + client_fill_super = cfs; +} +EXPORT_SYMBOL(lustre_register_client_fill_super); + +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)) +{ + kill_super_cb = cfs; +} +EXPORT_SYMBOL(lustre_register_kill_super_cb); + +/***************** FS registration ******************/ +#ifdef HAVE_FSTYPE_MOUNT +static struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, + const char *devname, void *data) +{ + struct lustre_mount_data2 lmd2 = { + .lmd2_data = data, + }; + + return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super); +} +#else +static int lustre_get_sb(struct file_system_type *fs_type, int flags, + const char *devname, void *data, struct vfsmount *mnt) +{ + struct lustre_mount_data2 lmd2 = { + .lmd2_data = data, + .lmd2_mnt = mnt, + }; + + return get_sb_nodev(fs_type, flags, &lmd2, lustre_fill_super, mnt); +} +#endif + +static void lustre_kill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (kill_super_cb && lsi && !IS_SERVER(lsi)) + (*kill_super_cb)(sb); + + kill_anon_super(sb); +} + +/** Register the "lustre" fs type + */ +static struct file_system_type lustre_fs_type = { + .owner = THIS_MODULE, + .name = "lustre", +#ifdef HAVE_FSTYPE_MOUNT + .mount = lustre_mount, +#else + .get_sb = lustre_get_sb, +#endif + .kill_sb = lustre_kill_super, + .fs_flags = FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE | +#ifdef HAVE_SERVER_SUPPORT + FS_REQUIRES_DEV, +#else + 0, +#endif +}; +MODULE_ALIAS_FS("lustre"); + +int lustre_register_fs(void) +{ + return register_filesystem(&lustre_fs_type); +} + +int lustre_unregister_fs(void) +{ + return unregister_filesystem(&lustre_fs_type); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c 
b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c new file mode 100644 index 0000000000000..b23a4ccf0bd9d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -0,0 +1,2076 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_mount_server.c + * + * Server mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER | D_CONFIG /* | D_WARNING */) +#define PRINT_CMD CDEBUG +#define PRINT_MASK (D_SUPER | D_CONFIG) + +#include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif +#include +#include +#ifdef HAVE_KERNEL_LOCKED +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +/*********** mount lookup *********/ + +static DEFINE_MUTEX(lustre_mount_info_lock); +static LIST_HEAD(server_mount_info_list); + +static struct lustre_mount_info *server_find_mount(const char *name) +{ + struct list_head *tmp; + struct lustre_mount_info *lmi; + ENTRY; + + list_for_each(tmp, &server_mount_info_list) { + lmi = list_entry(tmp, struct lustre_mount_info, + lmi_list_chain); + if (strcmp(name, lmi->lmi_name) == 0) + RETURN(lmi); + } + RETURN(NULL); +} + +/* we must register an obd for a mount before we call the setup routine. + *_setup will call lustre_get_mount to get the mnt struct + by obd_name, since we can't pass the pointer to setup. 
*/ +static int server_register_mount(const char *name, struct super_block *sb) +{ + struct lustre_mount_info *lmi; + char *name_cp; + ENTRY; + + LASSERT(sb); + + OBD_ALLOC(lmi, sizeof(*lmi)); + if (!lmi) + RETURN(-ENOMEM); + OBD_ALLOC(name_cp, strlen(name) + 1); + if (!name_cp) { + OBD_FREE(lmi, sizeof(*lmi)); + RETURN(-ENOMEM); + } + strcpy(name_cp, name); + + mutex_lock(&lustre_mount_info_lock); + + if (server_find_mount(name)) { + mutex_unlock(&lustre_mount_info_lock); + OBD_FREE(lmi, sizeof(*lmi)); + OBD_FREE(name_cp, strlen(name) + 1); + CERROR("Already registered %s\n", name); + RETURN(-EEXIST); + } + lmi->lmi_name = name_cp; + lmi->lmi_sb = sb; + list_add(&lmi->lmi_list_chain, &server_mount_info_list); + + mutex_unlock(&lustre_mount_info_lock); + + CDEBUG(D_MOUNT, "register mount %p from %s\n", sb, name); + + RETURN(0); +} + +/* when an obd no longer needs a mount */ +static int server_deregister_mount(const char *name) +{ + struct lustre_mount_info *lmi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + if (!lmi) { + mutex_unlock(&lustre_mount_info_lock); + CERROR("%s not registered\n", name); + RETURN(-ENOENT); + } + + CDEBUG(D_MOUNT, "deregister mount %p from %s\n", lmi->lmi_sb, name); + + OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1); + list_del(&lmi->lmi_list_chain); + OBD_FREE(lmi, sizeof(*lmi)); + mutex_unlock(&lustre_mount_info_lock); + + RETURN(0); +} + +/* obd's look up a registered mount using their obdname. This is just + for initial obd setup to find the mount struct. It should not be + called every time you want to mntget. */ +struct lustre_mount_info *server_get_mount(const char *name) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(NULL); + } + lsi = s2lsi(lmi->lmi_sb); + + atomic_inc(&lsi->lsi_mounts); + + CDEBUG(D_MOUNT, "get mount %p from %s, refs=%d\n", lmi->lmi_sb, + name, atomic_read(&lsi->lsi_mounts)); + + RETURN(lmi); +} +EXPORT_SYMBOL(server_get_mount); + +/** + * server_put_mount: to be called from obd_cleanup methods + * @name: obd name + * @dereg_mnt: 0 or 1 depending on whether the mount is to be deregistered or + * not + * + * The caller decides whether server_deregister_mount() needs to be called or + * not. Calling of server_deregister_mount() does not depend on refcounting on + * lsi because we could have say the mgs and mds on the same node and we + * unmount the mds, then the ref on the lsi would still be non-zero but we + * would still want to deregister the mds mount. 
+ */ +int server_put_mount(const char *name, bool dereg_mnt) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(-ENOENT); + } + lsi = s2lsi(lmi->lmi_sb); + + CDEBUG(D_MOUNT, "put mount %p from %s, refs=%d\n", + lmi->lmi_sb, name, atomic_read(&lsi->lsi_mounts)); + + if (lustre_put_lsi(lmi->lmi_sb)) + CDEBUG(D_MOUNT, "Last put of mount %p from %s\n", + lmi->lmi_sb, name); + + if (dereg_mnt) + /* this obd should never need the mount again */ + server_deregister_mount(name); + + RETURN(0); +} +EXPORT_SYMBOL(server_put_mount); + +/* Set up a MGS to serve startup logs */ +static int server_start_mgs(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_info *lmi; + int rc = 0; + ENTRY; + + /* It is impossible to have more than 1 MGS per node, since + MGC wouldn't know which to connect to */ + lmi = server_find_mount(LUSTRE_MGS_OBDNAME); + if (lmi) { + lsi = s2lsi(lmi->lmi_sb); + LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started" + " from server\n"); + RETURN(-EALREADY); + } + + CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME); + + rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb); + + if (!rc) { + rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME, + LUSTRE_MGS_OBDNAME, NULL, NULL, + lsi->lsi_osd_obdname, NULL); + /* server_deregister_mount() is not called previously, for lsi + * and other stuff can't be freed cleanly when mgs calls + * server_put_mount() in error handling case (see b=17758), + * this problem is caused by a bug in mgs_init0, which forgot + * calling server_put_mount in error case. */ + + if (rc) + server_deregister_mount(LUSTRE_MGS_OBDNAME); + } + + if (rc) + LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). " + "Is the 'mgs' module loaded?\n", + LUSTRE_MGS_OBDNAME, rc); + RETURN(rc); +} + +static int server_stop_mgs(struct super_block *sb) +{ + struct obd_device *obd; + int rc; + struct lustre_mount_info *lmi; + ENTRY; + + /* Do not stop MGS if this device is not the running MGT */ + lmi = server_find_mount(LUSTRE_MGS_OBDNAME); + if (lmi != NULL && lmi->lmi_sb != sb) + RETURN(0); + + CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME); + + /* There better be only one MGS */ + obd = class_name2obd(LUSTRE_MGS_OBDNAME); + if (!obd) { + CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME); + RETURN(-EALREADY); + } + + /* The MGS should always stop when we say so */ + obd->obd_force = 1; + rc = class_manual_cleanup(obd); + RETURN(rc); +} + +/* Since there's only one mgc per node, we have to change it's fs to get + access to the right disk. 
*/ +static int server_mgc_set_fs(const struct lu_env *env, + struct obd_device *mgc, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev); + + /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */ + rc = obd_set_info_async(env, mgc->obd_self_export, + sizeof(KEY_SET_FS), KEY_SET_FS, + sizeof(*sb), sb, NULL); + if (rc != 0) + CERROR("can't set_fs %d\n", rc); + + RETURN(rc); +} + +static int server_mgc_clear_fs(const struct lu_env *env, + struct obd_device *mgc) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Unassign mgc disk\n"); + + rc = obd_set_info_async(env, mgc->obd_self_export, + sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS, + 0, NULL, NULL); + RETURN(rc); +} + +static inline bool is_mdc_device(const char *devname) +{ + char *ptr; + + ptr = strrchr(devname, '-'); + return ptr != NULL && strcmp(ptr, "-mdc") == 0; +} + +static inline bool tgt_is_mdt(const char *tgtname, __u32 *idx) +{ + int type; + + type = server_name2index(tgtname, idx, NULL); + + return type == LDD_F_SV_TYPE_MDT; +} + +/** + * Convert OST/MDT name(fsname-{MDT,OST}xxxx) to a lwp name with the @idx:yyyy + * (fsname-MDTyyyy-lwp-{MDT,OST}xxxx) + **/ +int tgt_name2lwp_name(const char *tgt_name, char *lwp_name, int len, __u32 idx) +{ + char *fsname; + const char *tgt; + int rc; + ENTRY; + + OBD_ALLOC(fsname, MTI_NAME_MAXLEN); + if (fsname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(tgt_name, fsname, &tgt); + if (rc != 0) { + CERROR("%s: failed to get fsname from tgt_name: rc = %d\n", + tgt_name, rc); + GOTO(cleanup, rc); + } + + if (*tgt != '-' && *tgt != ':') { + CERROR("%s: invalid tgt_name name!\n", tgt_name); + GOTO(cleanup, rc = -EINVAL); + } + + tgt++; + if (strncmp(tgt, "OST", 3) != 0 && strncmp(tgt, "MDT", 3) != 0) { + CERROR("%s is not an OST or MDT target!\n", tgt_name); + GOTO(cleanup, rc = -EINVAL); + } + snprintf(lwp_name, len, "%s-MDT%04x-%s-%s", + fsname, idx, LUSTRE_LWP_NAME, tgt); + + GOTO(cleanup, rc = 0); + +cleanup: + if (fsname != NULL) + OBD_FREE(fsname, MTI_NAME_MAXLEN); + + return rc; +} +EXPORT_SYMBOL(tgt_name2lwp_name); + +static LIST_HEAD(lwp_register_list); +static DEFINE_SPINLOCK(lwp_register_list_lock); + +static void lustre_put_lwp_item(struct lwp_register_item *lri) +{ + if (atomic_dec_and_test(&lri->lri_ref)) { + LASSERT(list_empty(&lri->lri_list)); + + if (*lri->lri_exp != NULL) + class_export_put(*lri->lri_exp); + OBD_FREE_PTR(lri); + } +} + +int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp, + register_lwp_cb cb_func, void *cb_data) +{ + struct obd_device *lwp; + struct lwp_register_item *lri; + bool cb = false; + ENTRY; + + LASSERTF(strlen(lwpname) < MTI_NAME_MAXLEN, "lwpname is too long %s\n", + lwpname); + LASSERT(exp != NULL && *exp == NULL); + + OBD_ALLOC_PTR(lri); + if (lri == NULL) + RETURN(-ENOMEM); + + lwp = class_name2obd(lwpname); + if (lwp != NULL && lwp->obd_set_up == 1) { + struct obd_uuid *uuid; + + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) { + OBD_FREE_PTR(lri); + RETURN(-ENOMEM); + } + memcpy(uuid->uuid, lwpname, strlen(lwpname)); + *exp = cfs_hash_lookup(lwp->obd_uuid_hash, uuid); + OBD_FREE_PTR(uuid); + } + + memcpy(lri->lri_name, lwpname, strlen(lwpname)); + lri->lri_exp = exp; + lri->lri_cb_func = cb_func; + lri->lri_cb_data = cb_data; + INIT_LIST_HEAD(&lri->lri_list); + /* + * Initialize the lri_ref at 2, one will be released before + * current function returned via lustre_put_lwp_item(), the + * other will be released in 
lustre_deregister_lwp_item(). + */ + atomic_set(&lri->lri_ref, 2); + + spin_lock(&lwp_register_list_lock); + list_add(&lri->lri_list, &lwp_register_list); + if (*exp != NULL) + cb = true; + spin_unlock(&lwp_register_list_lock); + + if (cb && cb_func != NULL) + cb_func(cb_data); + lustre_put_lwp_item(lri); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_register_lwp_item); + +void lustre_deregister_lwp_item(struct obd_export **exp) +{ + struct lwp_register_item *lri; + bool removed = false; + int repeat = 0; + + spin_lock(&lwp_register_list_lock); + list_for_each_entry(lri, &lwp_register_list, lri_list) { + if (exp == lri->lri_exp) { + list_del_init(&lri->lri_list); + removed = true; + break; + } + } + spin_unlock(&lwp_register_list_lock); + + if (!removed) + return; + + /* See lustre_notify_lwp_list(), in some extreme race conditions, + * the notify callback could be still on the fly, we need to wait + * for the callback done before moving on to free the data used + * by callback. */ + while (atomic_read(&lri->lri_ref) > 1) { + CDEBUG(D_MOUNT, "lri reference count %u, repeat: %d\n", + atomic_read(&lri->lri_ref), repeat); + repeat++; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + lustre_put_lwp_item(lri); +} +EXPORT_SYMBOL(lustre_deregister_lwp_item); + +struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + struct obd_device *lwp; + struct obd_export *exp = NULL; + char fsname[16]; + char lwp_name[24]; + int rc; + + lmi = server_get_mount(dev); + if (lmi == NULL) + return NULL; + + lsi = s2lsi(lmi->lmi_sb); + rc = server_name2fsname(lsi->lsi_svname, fsname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname: rc = %d\n", + lsi->lsi_svname, rc); + goto err_lmi; + } + + snprintf(lwp_name, sizeof(lwp_name), "%s-MDT%04x", fsname, idx); + mutex_lock(&lsi->lsi_lwp_mutex); + list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { + char *ptr = strstr(lwp->obd_name, lwp_name); + + if (ptr != NULL && lwp->obd_lwp_export != NULL) { + exp = class_export_get(lwp->obd_lwp_export); + break; + } + } + mutex_unlock(&lsi->lsi_lwp_mutex); + +err_lmi: + server_put_mount(dev, false); + + return exp; +} +EXPORT_SYMBOL(lustre_find_lwp_by_index); + +void lustre_notify_lwp_list(struct obd_export *exp) +{ + struct lwp_register_item *lri; + LASSERT(exp != NULL); + +again: + spin_lock(&lwp_register_list_lock); + list_for_each_entry(lri, &lwp_register_list, lri_list) { + if (strcmp(exp->exp_obd->obd_name, lri->lri_name)) + continue; + if (*lri->lri_exp != NULL) + continue; + *lri->lri_exp = class_export_get(exp); + if (lri->lri_cb_func == NULL) + continue; + atomic_inc(&lri->lri_ref); + spin_unlock(&lwp_register_list_lock); + + lri->lri_cb_func(lri->lri_cb_data); + lustre_put_lwp_item(lri); + + /* Others may have changed the list after we unlock, we have + * to rescan the list from the beginning. Usually, the list + * 'lwp_register_list' is very short, and there is 'guard' + * lri::lri_exp that will prevent the callback to be done + * repeatedly. So rescanning the list has no problem. 
*/ + goto again; + } + spin_unlock(&lwp_register_list_lock); +} +EXPORT_SYMBOL(lustre_notify_lwp_list); + +static int lustre_lwp_connect(struct obd_device *lwp, bool is_mdt) +{ + struct lu_env env; + struct lu_context session_ctx; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + struct obd_connect_data *data = NULL; + int rc; + ENTRY; + + /* log has been fully processed, let clients connect */ + rc = lu_env_init(&env, lwp->obd_lu_dev->ld_type->ldt_ctx_tags); + if (rc != 0) + RETURN(rc); + + lu_context_init(&session_ctx, LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; + data->ocd_version = LUSTRE_VERSION_CODE; + data->ocd_connect_flags |= OBD_CONNECT_FID | OBD_CONNECT_AT | + OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 | + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LIGHTWEIGHT | + OBD_CONNECT_LFSCK | OBD_CONNECT_BULK_MBITS; + + if (is_mdt) + data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS; + + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out, rc = -ENOMEM); + + if (strlen(lwp->obd_name) > sizeof(uuid->uuid)) { + CERROR("%s: Too long lwp name %s, max_size is %d\n", + lwp->obd_name, lwp->obd_name, (int)sizeof(uuid->uuid)); + GOTO(out, rc = -EINVAL); + } + + /* Use lwp name as the uuid, so we find the export by lwp name later */ + memcpy(uuid->uuid, lwp->obd_name, strlen(lwp->obd_name)); + rc = obd_connect(&env, &exp, lwp, uuid, data, NULL); + if (rc != 0) { + CERROR("%s: connect failed: rc = %d\n", lwp->obd_name, rc); + } else { + if (unlikely(lwp->obd_lwp_export != NULL)) + class_export_put(lwp->obd_lwp_export); + lwp->obd_lwp_export = class_export_get(exp); + } + + GOTO(out, rc); + +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (uuid != NULL) + OBD_FREE_PTR(uuid); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + return rc; +} + +/** + * lwp is used by slaves (Non-MDT0 targets) to manage the connection to MDT0, + * or from the OSTx to MDTy. 
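/*
 * Illustrative sketch, not part of this patch: the LWP device naming scheme
 * produced by tgt_name2lwp_name() above, i.e. fsname-MDTyyyy-lwp-{MDT,OST}xxxx,
 * assuming LUSTRE_LWP_NAME is the "lwp" component.  The target name and MDT
 * index are hypothetical.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *tgt_name = "lustre-OST0002";	/* local target */
	unsigned int idx = 0;				/* remote MDT index */
	const char *dash = strrchr(tgt_name, '-');
	char lwp_name[64];

	if (!dash)
		return 1;
	snprintf(lwp_name, sizeof(lwp_name), "%.*s-MDT%04x-lwp-%s",
		 (int)(dash - tgt_name), tgt_name, idx, dash + 1);

	/* prints: lustre-MDT0000-lwp-OST0002 */
	printf("%s\n", lwp_name);
	return 0;
}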
+ **/ +static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, + __u32 idx) +{ + struct obd_device *obd; + char *lwpname = NULL; + char *lwpuuid = NULL; + int rc; + ENTRY; + + rc = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + if (rc != 0) { + CERROR("%s: Can't add uuid: rc =%d\n", lsi->lsi_svname, rc); + RETURN(rc); + } + + OBD_ALLOC(lwpname, MTI_NAME_MAXLEN); + if (lwpname == NULL) + GOTO(out, rc = -ENOMEM); + + rc = tgt_name2lwp_name(lsi->lsi_svname, lwpname, MTI_NAME_MAXLEN, idx); + if (rc != 0) { + CERROR("%s: failed to generate lwp name: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc); + } + + OBD_ALLOC(lwpuuid, MTI_NAME_MAXLEN); + if (lwpuuid == NULL) + GOTO(out, rc = -ENOMEM); + + sprintf(lwpuuid, "%s_UUID", lwpname); + rc = lustre_start_simple(lwpname, LUSTRE_LWP_NAME, + lwpuuid, lustre_cfg_string(lcfg, 1), + NULL, NULL, NULL); + if (rc) { + CERROR("%s: setup up failed: rc %d\n", lwpname, rc); + GOTO(out, rc); + } + + obd = class_name2obd(lwpname); + LASSERT(obd != NULL); + + rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL); + if (rc == 0) { + obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; + mutex_lock(&lsi->lsi_lwp_mutex); + list_add_tail(&obd->obd_lwp_list, &lsi->lsi_lwp_list); + mutex_unlock(&lsi->lsi_lwp_mutex); + } else { + CERROR("%s: connect failed: rc = %d\n", lwpname, rc); + } + + GOTO(out, rc); + +out: + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + if (lwpuuid != NULL) + OBD_FREE(lwpuuid, MTI_NAME_MAXLEN); + + return rc; +} + +/* the caller is responsible for memory free */ +static struct obd_device *lustre_find_lwp(struct lustre_sb_info *lsi, + char **lwpname, __u32 idx) +{ + struct obd_device *lwp; + int rc = 0; + ENTRY; + + LASSERT(lwpname != NULL); + LASSERT(IS_OST(lsi) || IS_MDT(lsi)); + + OBD_ALLOC(*lwpname, MTI_NAME_MAXLEN); + if (*lwpname == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = tgt_name2lwp_name(lsi->lsi_svname, *lwpname, MTI_NAME_MAXLEN, idx); + if (rc != 0) { + CERROR("%s: failed to generate lwp name: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + lwp = class_name2obd(*lwpname); + +out: + if (rc != 0) { + if (*lwpname != NULL) { + OBD_FREE(*lwpname, MTI_NAME_MAXLEN); + *lwpname = NULL; + } + lwp = ERR_PTR(rc); + } + + RETURN(lwp != NULL ? lwp : ERR_PTR(-ENOENT)); +} + +static int lustre_lwp_add_conn(struct lustre_cfg *cfg, + struct lustre_sb_info *lsi, __u32 idx) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *lcfg = NULL; + char *lwpname = NULL; + struct obd_device *lwp; + int rc; + ENTRY; + + lwp = lustre_find_lwp(lsi, &lwpname, idx); + if (IS_ERR(lwp)) { + CERROR("%s: can't find lwp device.\n", lsi->lsi_svname); + GOTO(out, rc = PTR_ERR(lwp)); + } + LASSERT(lwpname != NULL); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_bufs_reset(bufs, lwpname); + lustre_cfg_bufs_set_string(bufs, 1, + lustre_cfg_string(cfg, 1)); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); + if (!lcfg) + GOTO(out_cfg, rc = -ENOMEM); + lustre_cfg_init(lcfg, LCFG_ADD_CONN, bufs); + + rc = class_add_conn(lwp, lcfg); + if (rc) + CERROR("%s: can't add conn: rc = %d\n", lwpname, rc); + + if (lcfg) + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); +out_cfg: + if (bufs != NULL) + OBD_FREE_PTR(bufs); +out: + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + RETURN(rc); +} + +/** + * Retrieve MDT nids from the client log, then start the lwp device. 
+ * there are only two scenarios which would include mdt nid. + * 1. + * marker 5 (flags=0x01, v2.1.54.0) lustre-MDTyyyy 'add mdc' xxx- + * add_uuid nid=192.168.122.162@tcp(0x20000c0a87aa2) 0: 1:192.168.122.162@tcp + * attach 0:lustre-MDTyyyy-mdc 1:mdc 2:lustre-clilmv_UUID + * setup 0:lustre-MDTyyyy-mdc 1:lustre-MDTyyyy_UUID 2:192.168.122.162@tcp + * add_uuid nid=192.168.172.1@tcp(0x20000c0a8ac01) 0: 1:192.168.172.1@tcp + * add_conn 0:lustre-MDTyyyy-mdc 1:192.168.172.1@tcp + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDTyyyy_UUID xxxx + * marker 5 (flags=0x02, v2.1.54.0) lustre-MDTyyyy 'add mdc' xxxx- + * 2. + * marker 7 (flags=0x01, v2.1.54.0) lustre-MDTyyyy 'add failnid' xxxx- + * add_uuid nid=192.168.122.2@tcp(0x20000c0a87a02) 0: 1:192.168.122.2@tcp + * add_conn 0:lustre-MDTyyyy-mdc 1:192.168.122.2@tcp + * marker 7 (flags=0x02, v2.1.54.0) lustre-MDTyyyy 'add failnid' xxxx- + **/ +static int client_lwp_config_process(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *cfg = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + struct lustre_cfg *lcfg = NULL; + struct lustre_sb_info *lsi; + int rc = 0, swab = 0; + ENTRY; + + if (rec->lrh_type != OBD_CFG_REC) { + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + RETURN(-EINVAL); + } + + if (cfg->cfg_sb == NULL) + GOTO(out, rc = -EINVAL); + lsi = s2lsi(cfg->cfg_sb); + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + switch (lcfg->lcfg_command) { + case LCFG_MARKER: { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + if (marker->cm_flags & CM_SKIP || + marker->cm_flags & CM_EXCLUDE) + GOTO(out, rc = 0); + + if (!tgt_is_mdt(marker->cm_tgtname, &cfg->cfg_lwp_idx)) + GOTO(out, rc = 0); + + if (IS_MDT(lsi) && cfg->cfg_lwp_idx != 0) + GOTO(out, rc = 0); + + if (!strncmp(marker->cm_comment, "add mdc", 7) || + !strncmp(marker->cm_comment, "add failnid", 11)) { + if (marker->cm_flags & CM_START) { + cfg->cfg_flags = CFG_F_MARKER; + /* This hack is to differentiate the + * ADD_UUID is come from "add mdc" record + * or from "add failnid" record. 
*/ + if (!strncmp(marker->cm_comment, + "add failnid", 11)) + cfg->cfg_flags |= CFG_F_SKIP; + } else if (marker->cm_flags & CM_END) { + cfg->cfg_flags = 0; + } + } + break; + } + case LCFG_ADD_UUID: { + if (cfg->cfg_flags == CFG_F_MARKER) { + rc = lustre_lwp_setup(lcfg, lsi, cfg->cfg_lwp_idx); + /* XXX: process only the first nid as + * we don't need another instance of lwp */ + cfg->cfg_flags |= CFG_F_SKIP; + } else if (cfg->cfg_flags == (CFG_F_MARKER | CFG_F_SKIP)) { + rc = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + if (rc) + CERROR("%s: Fail to add uuid, rc:%d\n", + lsi->lsi_svname, rc); + } + break; + } + case LCFG_ADD_CONN: { + char *devname = lustre_cfg_string(lcfg, 0); + char *ptr; + __u32 idx = 0; + + if (!is_mdc_device(devname)) + break; + + ptr = strrchr(devname, '-'); + if (ptr == NULL) + break; + + *ptr = 0; + if (!tgt_is_mdt(devname, &idx)) { + *ptr = '-'; + break; + } + *ptr = '-'; + + if (IS_MDT(lsi) && idx != 0) + break; + + rc = lustre_lwp_add_conn(lcfg, lsi, idx); + break; + } + default: + break; + } +out: + RETURN(rc); +} + +static int lustre_disconnect_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *lwp; + char *logname = NULL; + struct lustre_cfg_bufs *bufs = NULL; + struct config_llog_instance *cfg = NULL; + int rc = 0; + int rc1 = 0; + ENTRY; + + if (likely(lsi->lsi_lwp_started)) { + OBD_ALLOC(logname, MTI_NAME_MAXLEN); + if (logname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(lsi->lsi_svname, logname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname from svname: " + "rc = %d\n", lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + strcat(logname, "-client"); + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out, rc = -ENOMEM); + + /* end log first */ + cfg->cfg_instance = ll_get_cfg_instance(sb); + rc = lustre_end_log(sb, logname, cfg); + if (rc != 0 && rc != -ENOENT) + GOTO(out, rc); + + lsi->lsi_lwp_started = 0; + } + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + GOTO(out, rc = -ENOMEM); + + mutex_lock(&lsi->lsi_lwp_mutex); + list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { + struct lustre_cfg *lcfg; + + if (likely(lwp->obd_lwp_export != NULL)) { + class_export_put(lwp->obd_lwp_export); + lwp->obd_lwp_export = NULL; + } + + lustre_cfg_bufs_reset(bufs, lwp->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, NULL); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!lcfg) { + rc = -ENOMEM; + break; + } + lustre_cfg_init(lcfg, LCFG_CLEANUP, bufs); + + /* Disconnect import first. NULL is passed for the '@env', + * since it will not be used. */ + rc = lwp->obd_lu_dev->ld_ops->ldo_process_config(NULL, + lwp->obd_lu_dev, lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); + if (rc != 0 && rc != -ETIMEDOUT) { + CERROR("%s: fail to disconnect LWP: rc = %d\n", + lwp->obd_name, rc); + rc1 = rc; + } + } + mutex_unlock(&lsi->lsi_lwp_mutex); + + GOTO(out, rc); + +out: + if (bufs != NULL) + OBD_FREE_PTR(bufs); + if (cfg != NULL) + OBD_FREE_PTR(cfg); + if (logname != NULL) + OBD_FREE(logname, MTI_NAME_MAXLEN); + + return rc1 != 0 ? rc1 : rc; +} + +/** + * Stop the lwp for an OST/MDT target. 
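+ * Every LWP device on lsi_lwp_list is removed from the list and cleaned up; a cleanup failure of any of them is reflected in the return value.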
+ **/ +static int lustre_stop_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *lwp; + int rc = 0; + int rc1 = 0; + ENTRY; + + mutex_lock(&lsi->lsi_lwp_mutex); + while (!list_empty(&lsi->lsi_lwp_list)) { + lwp = list_entry(lsi->lsi_lwp_list.next, struct obd_device, + obd_lwp_list); + list_del_init(&lwp->obd_lwp_list); + lwp->obd_force = 1; + mutex_unlock(&lsi->lsi_lwp_mutex); + + rc = class_manual_cleanup(lwp); + if (rc != 0) { + CERROR("%s: fail to stop LWP: rc = %d\n", + lwp->obd_name, rc); + rc1 = rc; + } + mutex_lock(&lsi->lsi_lwp_mutex); + } + mutex_unlock(&lsi->lsi_lwp_mutex); + + RETURN(rc1 != 0 ? rc1 : rc); +} + +/** + * Start the lwp(fsname-MDTyyyy-lwp-{MDT,OST}xxxx) for a MDT/OST or MDT target. + **/ +static int lustre_start_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance *cfg = NULL; + char *logname; + int rc; + ENTRY; + + if (unlikely(lsi->lsi_lwp_started)) + RETURN(0); + + OBD_ALLOC(logname, MTI_NAME_MAXLEN); + if (logname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(lsi->lsi_svname, logname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname from svname: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + strcat(logname, "-client"); + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out, rc = -ENOMEM); + + cfg->cfg_callback = client_lwp_config_process; + cfg->cfg_instance = ll_get_cfg_instance(sb); + rc = lustre_process_log(sb, logname, cfg); + /* need to remove config llog from mgc */ + lsi->lsi_lwp_started = 1; + + GOTO(out, rc); + +out: + OBD_FREE(logname, MTI_NAME_MAXLEN); + if (cfg != NULL) + OBD_FREE_PTR(cfg); + + return rc; +} + +static DEFINE_MUTEX(server_start_lock); + +/* Stop MDS/OSS if nobody is using them */ +static int server_stop_servers(int lsiflags) +{ + struct obd_device *obd = NULL; + struct obd_type *type = NULL; + int rc = 0; + ENTRY; + + mutex_lock(&server_start_lock); + + /* Either an MDT or an OST or neither */ + /* if this was an MDT, and there are no more MDT's, clean up the MDS */ + if (lsiflags & LDD_F_SV_TYPE_MDT) { + obd = class_name2obd(LUSTRE_MDS_OBDNAME); + if (obd != NULL) + type = class_search_type(LUSTRE_MDT_NAME); + } + + /* if this was an OST, and there are no more OST's, clean up the OSS */ + if (lsiflags & LDD_F_SV_TYPE_OST) { + obd = class_name2obd(LUSTRE_OSS_OBDNAME); + if (obd != NULL) + type = class_search_type(LUSTRE_OST_NAME); + } + + if (obd != NULL && (type == NULL || type->typ_refcnt == 0)) { + obd->obd_force = 1; + /* obd_fail doesn't mean much on a server obd */ + rc = class_manual_cleanup(obd); + } + + mutex_unlock(&server_start_lock); + + RETURN(rc); +} + +int server_mti_print(const char *title, struct mgs_target_info *mti) +{ + PRINT_CMD(PRINT_MASK, "mti %s\n", title); + PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname); + PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname); + PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid); + PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n", + mti->mti_config_ver, mti->mti_flags); + return 0; +} + +/* Generate data for registration */ +static int server_lsi2mti(struct lustre_sb_info *lsi, + struct mgs_target_info *mti) +{ + struct lnet_process_id id; + int rc, i = 0; + int cplen = 0; + ENTRY; + + if (!IS_SERVER(lsi)) + RETURN(-EINVAL); + + if (strlcpy(mti->mti_svname, lsi->lsi_svname, sizeof(mti->mti_svname)) + >= sizeof(mti->mti_svname)) + RETURN(-E2BIG); + + mti->mti_nid_count = 0; + while (LNetGetId(i++, &id) != -ENOENT) { + if (id.nid == 
LNET_NID_LO_0) + continue; + + /* server use --servicenode param, only allow specified + * nids be registered */ + if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) != 0 && + class_match_nid(lsi->lsi_lmd->lmd_params, + PARAM_FAILNODE, id.nid) < 1) + continue; + + /* match specified network */ + if (!class_match_net(lsi->lsi_lmd->lmd_params, + PARAM_NETWORK, LNET_NIDNET(id.nid))) + continue; + + mti->mti_nids[mti->mti_nid_count] = id.nid; + mti->mti_nid_count++; + if (mti->mti_nid_count >= MTI_NIDS_MAX) { + CWARN("Only using first %d nids for %s\n", + mti->mti_nid_count, mti->mti_svname); + break; + } + } + + if (mti->mti_nid_count == 0) { + CERROR("Failed to get NID for server %s, please check whether " + "the target is specifed with improper --servicenode or " + "--network options.\n", mti->mti_svname); + RETURN(-EINVAL); + } + + mti->mti_lustre_ver = LUSTRE_VERSION_CODE; + mti->mti_config_ver = 0; + + rc = server_name2fsname(lsi->lsi_svname, mti->mti_fsname, NULL); + if (rc != 0) + return rc; + + rc = server_name2index(lsi->lsi_svname, &mti->mti_stripe_index, NULL); + if (rc < 0) + return rc; + /* Orion requires index to be set */ + LASSERT(!(rc & LDD_F_NEED_INDEX)); + /* keep only LDD flags */ + mti->mti_flags = lsi->lsi_flags & LDD_F_MASK; + if (mti->mti_flags & (LDD_F_WRITECONF | LDD_F_VIRGIN)) + mti->mti_flags |= LDD_F_UPDATE; + cplen = strlcpy(mti->mti_params, lsi->lsi_lmd->lmd_params, + sizeof(mti->mti_params)); + if (cplen >= sizeof(mti->mti_params)) + return -E2BIG; + return 0; +} + +/* Register an old or new target with the MGS. If needed MGS will construct + startup logs and assign index */ +static int server_register_target(struct lustre_sb_info *lsi) +{ + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + bool writeconf; + int rc; + int tried = 0; + ENTRY; + + LASSERT(mgc); + + if (!IS_SERVER(lsi)) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n", + mti->mti_svname, mti->mti_fsname, + libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index, + mti->mti_flags); + + /* if write_conf is true, the registration must succeed */ + writeconf = !!(lsi->lsi_flags & (LDD_F_NEED_INDEX | LDD_F_UPDATE)); + mti->mti_flags |= LDD_F_OPC_REG; + +again: + /* Register the target */ + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + if (rc) { + if (mti->mti_flags & LDD_F_ERROR) { + LCONSOLE_ERROR_MSG(0x160, + "%s: the MGS refuses to allow this server " + "to start: rc = %d. Please see messages on " + "the MGS.\n", lsi->lsi_svname, rc); + } else if (writeconf) { + if ((rc == -ESHUTDOWN || rc == -EIO) && ++tried < 5) { + /* The connection with MGS is not established. + * Try again after 2 seconds. Interruptable. */ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout( + msecs_to_jiffies(MSEC_PER_SEC) * 2); + set_current_state(TASK_RUNNING); + if (!signal_pending(current)) + goto again; + } + + LCONSOLE_ERROR_MSG(0x15f, + "%s: cannot register this server with the MGS: " + "rc = %d. Is the MGS running?\n", + lsi->lsi_svname, rc); + } else { + CDEBUG(D_HA, "%s: error registering with the MGS: " + "rc = %d (not fatal)\n", lsi->lsi_svname, rc); + /* reset the error code for non-fatal error. 
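+ * Without writeconf the target may still be able to start from the locally stored copy of its config log, so do not fail the mount here.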
*/ + rc = 0; + } + GOTO(out, rc); + } + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); +} + +/** + * Notify the MGS that this target is ready. + * Used by IR - if the MGS receives this message, it will notify clients. + */ +static int server_notify_target(struct super_block *sb, struct obd_device *obd) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + int rc; + ENTRY; + + LASSERT(mgc); + + if (!(IS_SERVER(lsi))) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + mti->mti_instance = obd->u.obt.obt_instance; + mti->mti_flags |= LDD_F_OPC_READY; + + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + + /* Imperative recovery: if the mgs informs us to use IR? */ + if (!rc && !(mti->mti_flags & LDD_F_ERROR) && + (mti->mti_flags & LDD_F_IR_CAPABLE)) + lsi->lsi_flags |= LDD_F_IR_CAPABLE; + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); + +} + +/** Start server targets: MDTs and OSTs + */ +static int server_start_targets(struct super_block *sb) +{ + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance cfg; + struct lu_env mgc_env; + struct lu_device *dev; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname); + + if (IS_MDT(lsi)) { + /* make sure the MDS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(LUSTRE_MDS_OBDNAME); + if (!obd) { + rc = lustre_start_simple(LUSTRE_MDS_OBDNAME, + LUSTRE_MDS_NAME, + LUSTRE_MDS_OBDNAME"_uuid", + NULL, NULL, NULL, NULL); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start MDS: %d\n", rc); + RETURN(rc); + } + } + mutex_unlock(&server_start_lock); + } + + /* If we're an OST, make sure the global OSS is running */ + if (IS_OST(lsi)) { + /* make sure OSS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(LUSTRE_OSS_OBDNAME); + if (!obd) { + rc = lustre_start_simple(LUSTRE_OSS_OBDNAME, + LUSTRE_OSS_NAME, + LUSTRE_OSS_OBDNAME"_uuid", + NULL, NULL, NULL, NULL); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start OSS: %d\n", rc); + RETURN(rc); + } + } + mutex_unlock(&server_start_lock); + } + + rc = lu_env_init(&mgc_env, LCT_MG_THREAD); + if (rc != 0) + GOTO(out_stop_service, rc); + + /* Set the mgc fs to our server disk. This allows the MGC to + * read and write configs locally, in case it can't talk to the MGS. */ + rc = server_mgc_set_fs(&mgc_env, lsi->lsi_mgc, sb); + if (rc) + GOTO(out_env, rc); + + /* Register with MGS */ + rc = server_register_target(lsi); + if (rc) + GOTO(out_mgc, rc); + + /* Let the target look up the mount using the target's name + (we can't pass the sb or mnt through class_process_config.) */ + rc = server_register_mount(lsi->lsi_svname, sb); + if (rc) + GOTO(out_mgc, rc); + + /* Start targets using the llog named for the target */ + memset(&cfg, 0, sizeof(cfg)); + cfg.cfg_callback = class_config_llog_handler; + cfg.cfg_sub_clds = CONFIG_SUB_SERVER; + rc = lustre_process_log(sb, lsi->lsi_svname, &cfg); + if (rc) { + CERROR("failed to start server %s: %d\n", + lsi->lsi_svname, rc); + /* Do NOT call server_deregister_mount() here. This makes it + * impossible to find mount later in cleanup time and leaves + * @lsi and othder stuff leaked. 
-umka */ + GOTO(out_mgc, rc); + } + + obd = class_name2obd(lsi->lsi_svname); + if (!obd) { + CERROR("no server named %s was started\n", lsi->lsi_svname); + GOTO(out_mgc, rc = -ENXIO); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + rc = lustre_start_lwp(sb); + if (rc) { + CERROR("%s: failed to start LWP: %d\n", + lsi->lsi_svname, rc); + GOTO(out_mgc, rc); + } + } + + server_notify_target(sb, obd); + + /* calculate recovery timeout, do it after lustre_process_log */ + server_calc_timeout(lsi, obd); + + /* log has been fully processed, let clients connect */ + dev = obd->obd_lu_dev; + if (dev && dev->ld_ops->ldo_prepare) { + struct lu_env env; + + rc = lu_env_init(&env, dev->ld_type->ldt_ctx_tags); + if (rc == 0) { + struct lu_context session_ctx; + + lu_context_init(&session_ctx, LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + rc = dev->ld_ops->ldo_prepare(&env, NULL, dev); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + } + } + + /* abort recovery only on the complete stack: + * many devices can be involved */ + if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) && + (OBP(obd, iocontrol))) { + obd_iocontrol(OBD_IOC_ABORT_RECOVERY, obd->obd_self_export, 0, + NULL, NULL); + } + +out_mgc: + /* Release the mgc fs for others to use */ + server_mgc_clear_fs(&mgc_env, lsi->lsi_mgc); +out_env: + lu_env_fini(&mgc_env); +out_stop_service: + if (rc != 0) + server_stop_servers(lsi->lsi_flags); + + RETURN(rc); +} + +static int lsi_prepare(struct lustre_sb_info *lsi) +{ + const char *osd_type; + const char *fstype; + __u32 index; + int rc; + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_lmd); + + /* The server name is given as a mount line option */ + if (lsi->lsi_lmd->lmd_profile == NULL) { + LCONSOLE_ERROR("Can't determine server name\n"); + RETURN(-EINVAL); + } + + /* Determine osd type */ + if (lsi->lsi_lmd->lmd_osd_type == NULL) { + osd_type = LUSTRE_OSD_LDISKFS_NAME; + fstype = "ldiskfs"; + } else { + osd_type = lsi->lsi_lmd->lmd_osd_type; + fstype = lsi->lsi_lmd->lmd_osd_type; + } + + if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(lsi->lsi_svname) || + strlen(osd_type) >= sizeof(lsi->lsi_osd_type) || + strlen(fstype) >= sizeof(lsi->lsi_fstype)) + RETURN(-ENAMETOOLONG); + + strlcpy(lsi->lsi_svname, lsi->lsi_lmd->lmd_profile, + sizeof(lsi->lsi_svname)); + strlcpy(lsi->lsi_osd_type, osd_type, sizeof(lsi->lsi_osd_type)); + /* XXX: a temp. solution for components using ldiskfs + * to be removed in one of the subsequent patches */ + strlcpy(lsi->lsi_fstype, fstype, sizeof(lsi->lsi_fstype)); + + /* Determine server type */ + rc = server_name2index(lsi->lsi_svname, &index, NULL); + if (rc < 0) { + if (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) { + /* Assume we're a bare MGS */ + rc = 0; + lsi->lsi_lmd->lmd_flags |= LMD_FLG_NOSVC; + } else { + LCONSOLE_ERROR("Can't determine server type of '%s'\n", + lsi->lsi_svname); + RETURN(rc); + } + } + lsi->lsi_flags |= rc; + + /* Add mount line flags that used to be in ldd: + * writeconf, mgs, anything else? + */ + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ? + LDD_F_WRITECONF : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_VIRGIN) ? + LDD_F_VIRGIN : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_UPDATE) ? + LDD_F_UPDATE : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ? + LDD_F_SV_TYPE_MGS : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ? 
+ LDD_F_NO_PRIMNODE : 0; + + RETURN(0); +} + +/*************** server mount ******************/ + +/** Start the shutdown of servers at umount. + */ +static void server_put_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *tmpname, *extraname = NULL; + int tmpname_sz; + int lsiflags = lsi->lsi_flags; + ENTRY; + + LASSERT(IS_SERVER(lsi)); + + tmpname_sz = strlen(lsi->lsi_svname) + 1; + OBD_ALLOC(tmpname, tmpname_sz); + memcpy(tmpname, lsi->lsi_svname, tmpname_sz); + CDEBUG(D_MOUNT, "server put_super %s\n", tmpname); + if (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC)) + snprintf(tmpname, tmpname_sz, "MGS"); + + /* disconnect the lwp first to drain off the inflight request */ + if (IS_OST(lsi) || IS_MDT(lsi)) { + int rc; + + rc = lustre_disconnect_lwp(sb); + if (rc != 0 && rc != -ETIMEDOUT && + rc != -ENOTCONN && rc != -ESHUTDOWN) + CWARN("%s: failed to disconnect lwp: rc= %d\n", + tmpname, rc); + } + + /* Stop the target */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_MDT(lsi) || IS_OST(lsi))) { + struct lustre_profile *lprof = NULL; + + /* tell the mgc to drop the config log */ + lustre_end_log(sb, lsi->lsi_svname, NULL); + + /* COMPAT_146 - profile may get deleted in mgc_cleanup. + If there are any setup/cleanup errors, save the lov + name for safety cleanup later. */ + lprof = class_get_profile(lsi->lsi_svname); + if (lprof != NULL) { + if (lprof->lp_dt != NULL) { + OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1); + strncpy(extraname, lprof->lp_dt, + strlen(lprof->lp_dt) + 1); + } + class_put_profile(lprof); + } + + obd = class_name2obd(lsi->lsi_svname); + if (obd) { + CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name); + if (lsiflags & LSI_UMOUNT_FAILOVER) + obd->obd_fail = 1; + /* We can't seem to give an error return code + * to .put_super, so we better make sure we clean up! */ + obd->obd_force = 1; + class_manual_cleanup(obd); + } else { + CERROR("no obd %s\n", lsi->lsi_svname); + server_deregister_mount(lsi->lsi_svname); + } + } + + /* If they wanted the mgs to stop separately from the mdt, they + should have put it on a different device. */ + if (IS_MGS(lsi)) { + /* if MDS start with --nomgs, don't stop MGS then */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) + server_stop_mgs(sb); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + if (lustre_stop_lwp(sb) < 0) + CERROR("%s: failed to stop lwp!\n", tmpname); + } + + /* Clean the mgc and sb */ + lustre_common_put_super(sb); + + /* wait till all in-progress cleanups are done + * specifically we're interested in ofd cleanup + * as it pins OSS */ + obd_zombie_barrier(); + + /* Stop the servers (MDS, OSS) if no longer needed. We must wait + until the target is really gone so that our type refcount check + is right. 
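+ * (the obd_zombie_barrier() call above provides exactly that guarantee).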
*/ + server_stop_servers(lsiflags); + + /* In case of startup or cleanup err, stop related obds */ + if (extraname) { + obd = class_name2obd(extraname); + if (obd) { + CWARN("Cleaning orphaned obd %s\n", extraname); + obd->obd_force = 1; + class_manual_cleanup(obd); + } + OBD_FREE(extraname, strlen(extraname) + 1); + } + + LCONSOLE_WARN("server umount %s complete\n", tmpname); + OBD_FREE(tmpname, tmpname_sz); + EXIT; +} + +/** Called only for 'umount -f' + */ +static void server_umount_begin(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + CDEBUG(D_MOUNT, "umount -f\n"); + /* umount = failover + umount -f = force + no third way to do non-force, non-failover */ + lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER; + EXIT; +} + +static int server_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_statfs statfs; + int rc; + ENTRY; + + if (lsi->lsi_dt_dev) { + rc = dt_statfs(NULL, lsi->lsi_dt_dev, &statfs); + if (rc == 0) { + statfs_unpack(buf, &statfs); + buf->f_type = sb->s_magic; + RETURN(0); + } + } + + /* just return 0 */ + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = 1; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = 1; + buf->f_ffree = 0; + buf->f_namelen = NAME_MAX; + RETURN(0); +} + +#ifdef HAVE_SUPEROPS_USE_DENTRY +int server_show_options(struct seq_file *seq, struct dentry *dentry) +#else +int server_show_options(struct seq_file *seq, struct vfsmount *vfs) +#endif +{ + struct lustre_sb_info *lsi; + struct lustre_mount_data *lmd; + +#ifdef HAVE_SUPEROPS_USE_DENTRY + LASSERT(seq != NULL && dentry != NULL); + lsi = s2lsi(dentry->d_sb); +#else + LASSERT(seq != NULL && vfs != NULL); + lsi = s2lsi(vfs->mnt_sb); +#endif + + lmd = lsi->lsi_lmd; + seq_printf(seq, ",svname=%s", lmd->lmd_profile); + + if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV) + seq_puts(seq, ",abort_recov"); + + if (lmd->lmd_flags & LMD_FLG_NOIR) + seq_puts(seq, ",noir"); + + if (lmd->lmd_flags & LMD_FLG_NOSVC) + seq_puts(seq, ",nosvc"); + + if (lmd->lmd_flags & LMD_FLG_NOMGS) + seq_puts(seq, ",nomgs"); + + if (lmd->lmd_flags & LMD_FLG_NOSCRUB) + seq_puts(seq, ",noscrub"); + if (lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) + seq_puts(seq, ",skip_lfsck"); + + if (lmd->lmd_flags & LMD_FLG_DEV_RDONLY) + seq_puts(seq, ",rdonly_dev"); + + if (lmd->lmd_flags & LMD_FLG_MGS) + seq_puts(seq, ",mgs"); + + if (lmd->lmd_mgs != NULL) + seq_printf(seq, ",mgsnode=%s", lmd->lmd_mgs); + + if (lmd->lmd_osd_type != NULL) + seq_printf(seq, ",osd=%s", lmd->lmd_osd_type); + + if (lmd->lmd_opts != NULL) { + seq_putc(seq, ','); + seq_puts(seq, lmd->lmd_opts); + } + + RETURN(0); +} + +/** The operations we support directly on the superblock: + * mount, umount, and df. 
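+ * Regular file I/O is not possible on a server mountpoint: s_maxbytes is 0 and the superblock is mounted read-only.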
+ */ +static struct super_operations server_ops = { + .put_super = server_put_super, + .umount_begin = server_umount_begin, /* umount -f */ + .statfs = server_statfs, + .show_options = server_show_options, +}; + +/* + * Xattr support for Lustre servers + */ +#ifdef HAVE_IOP_XATTR +static ssize_t lustre_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + if (!selinux_is_enabled()) + return -EOPNOTSUPP; + return -ENODATA; +} + +static int lustre_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} +#endif + +static ssize_t lustre_listxattr(struct dentry *d_entry, char *name, + size_t size) +{ + return -EOPNOTSUPP; +} + +static bool is_cmd_supported(unsigned int command) +{ + switch (command) { + case FITRIM: + return true; + default: + return false; + } + + return false; +} + +static long server_ioctl(struct file *filp, unsigned int command, + unsigned long arg) +{ + struct file active_filp; + struct inode *inode = file_inode(filp); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct super_block *dd_sb = dt_mnt_sb_get(lsi->lsi_dt_dev); + struct inode *active_inode; + int err = -EOPNOTSUPP; + + if (IS_ERR(dd_sb) || !is_cmd_supported(command)) + return err; + + active_inode = igrab(dd_sb->s_root->d_inode); + if (!active_inode) + return -EACCES; + + active_filp.f_inode = active_inode; + if (active_inode->i_fop && active_inode->i_fop->unlocked_ioctl) + err = active_inode->i_fop->unlocked_ioctl(&active_filp, + command, arg); + iput(active_inode); + return err; +} + +static const struct inode_operations server_inode_operations = { +#ifdef HAVE_IOP_XATTR + .setxattr = lustre_setxattr, + .getxattr = lustre_getxattr, +#endif + .listxattr = lustre_listxattr, +}; + +static const struct file_operations server_file_operations = { + .unlocked_ioctl = server_ioctl, +}; + +#define log2(n) ffz(~(n)) +#define LUSTRE_SUPER_MAGIC 0x0BD00BD1 + +static int server_fill_super_common(struct super_block *sb) +{ + struct inode *root = NULL; + ENTRY; + + CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = log2(sb->s_blocksize); + sb->s_magic = LUSTRE_SUPER_MAGIC; + sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */ + sb->s_flags |= SB_RDONLY; + sb->s_op = &server_ops; + + root = new_inode(sb); + if (!root) { + CERROR("Can't make root inode\n"); + RETURN(-EIO); + } + + /* returns -EIO for every operation */ + /* make_bad_inode(root); -- badness - can't umount */ + /* apparently we need to be a directory for the mount to finish */ + root->i_mode = S_IFDIR; + root->i_op = &server_inode_operations; + root->i_fop = &server_file_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) { + CERROR("%s: can't make root dentry\n", sb->s_id); + RETURN(-EIO); + } + + RETURN(0); +} + +static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) +{ + struct lustre_mount_data *lmd = lsi->lsi_lmd; + struct obd_device *obd; + struct dt_device_param p; + char flagstr[20 + 1 + 10 + 1]; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, + "Attempting to start %s, type=%s, lsifl=%x, mountfl=%lx\n", + lsi->lsi_svname, lsi->lsi_osd_type, lsi->lsi_flags, mflags); + + sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname); + strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname); + strcat(lsi->lsi_osd_uuid, "_UUID"); + snprintf(flagstr, sizeof(flagstr), "%lu:%u", mflags, lmd->lmd_flags); + + obd = class_name2obd(lsi->lsi_osd_obdname); + if (obd == NULL) { + 
rc = lustre_start_simple(lsi->lsi_osd_obdname, + lsi->lsi_osd_type, + lsi->lsi_osd_uuid, lmd->lmd_dev, + flagstr, lsi->lsi_lmd->lmd_opts, + lsi->lsi_svname); + if (rc) + GOTO(out, rc); + obd = class_name2obd(lsi->lsi_osd_obdname); + LASSERT(obd); + } else { + CDEBUG(D_MOUNT, "%s already started\n", lsi->lsi_osd_obdname); + /* but continue setup to allow special case of MDT and internal + * MGT being started separately. */ + if (!((IS_MGS(lsi) && (lsi->lsi_lmd->lmd_flags & + LMD_FLG_NOMGS)) || + (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & + LMD_FLG_NOSVC)))) + RETURN(-EALREADY); + } + + rc = obd_connect(NULL, &lsi->lsi_osd_exp, + obd, &obd->obd_uuid, NULL, NULL); + + if (rc) { + obd->obd_force = 1; + class_manual_cleanup(obd); + lsi->lsi_dt_dev = NULL; + RETURN(rc); + } + + LASSERT(obd->obd_lu_dev); + lu_device_get(obd->obd_lu_dev); + lsi->lsi_dt_dev = lu2dt_dev(obd->obd_lu_dev); + LASSERT(lsi->lsi_dt_dev); + + /* set disk context for llog usage */ + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lsi->lsi_dt_dev; + + dt_conf_get(NULL, lsi->lsi_dt_dev, &p); +out: + RETURN(rc); +} + +/** Fill in the superblock info for a Lustre server. + * Mount the device with the correct options. + * Read the on-disk config file. + * Start the services. + */ +int server_fill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + /* to simulate target mount race */ + OBD_RACE(OBD_FAIL_TGT_MOUNT_RACE); + + rc = lsi_prepare(lsi); + if (rc) { + lustre_put_lsi(sb); + RETURN(rc); + } + + /* Start low level OSD */ + rc = osd_start(lsi, sb->s_flags); + if (rc) { + CERROR("Unable to start osd on %s: %d\n", + lsi->lsi_lmd->lmd_dev, rc); + lustre_put_lsi(sb); + RETURN(rc); + } + + CDEBUG(D_MOUNT, "Found service %s on device %s\n", + lsi->lsi_svname, lsi->lsi_lmd->lmd_dev); + + if (class_name2obd(lsi->lsi_svname)) { + LCONSOLE_ERROR_MSG(0x161, "The target named %s is already " + "running. Double-mount may have compromised" + " the disk journal.\n", + lsi->lsi_svname); + lustre_put_lsi(sb); + RETURN(-EALREADY); + } + + /* Start MGS before MGC */ + if (IS_MGS(lsi) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) { + rc = server_start_mgs(sb); + if (rc) + GOTO(out_mnt, rc); + } + + /* Start MGC before servers */ + rc = lustre_start_mgc(sb); + if (rc) + GOTO(out_mnt, rc); + + /* Set up all obd devices for service */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_OST(lsi) || IS_MDT(lsi))) { + rc = server_start_targets(sb); + if (rc < 0) { + CERROR("Unable to start targets: %d\n", rc); + GOTO(out_mnt, rc); + } + /* FIXME overmount client here, or can we just start a + * client log and client_fill_super on this sb? We + * need to make sure server_put_super gets called too + * - ll_put_super calls lustre_common_put_super; check + * there for LSI_SERVER flag, call s_p_s if so. + * + * Probably should start client from new thread so we + * can return. Client will not finish until all + * servers are connected. Note - MGS-only server does + * NOT get a client, since there is no lustre fs + * associated - the MGS is for all lustre fs's */ + } + + rc = server_fill_super_common(sb); + if (rc) + GOTO(out_mnt, rc); + + RETURN(0); +out_mnt: + /* We jump here in case of failure while starting targets or MGS. + * In this case we can't just put @mnt and have to do real cleanup + * with stoping targets, etc. */ + server_put_super(sb); + return rc; +} + +/* + * Calculate timeout value for a target. 
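+ * When imperative recovery is enabled, the soft limit is scaled by the IR factor (soft * factor / OBD_IR_FACTOR_MAX) but never drops below OBD_RECOVERY_TIME_MIN; the hard limit is unchanged.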
+ */ +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd) +{ + struct lustre_mount_data *lmd; + int soft = 0; + int hard = 0; + int factor = 0; + bool has_ir = !!(lsi->lsi_flags & LDD_F_IR_CAPABLE); + int min = OBD_RECOVERY_TIME_MIN; + + LASSERT(IS_SERVER(lsi)); + + lmd = lsi->lsi_lmd; + if (lmd) { + soft = lmd->lmd_recovery_time_soft; + hard = lmd->lmd_recovery_time_hard; + has_ir = has_ir && !(lmd->lmd_flags & LMD_FLG_NOIR); + obd->obd_no_ir = !has_ir; + } + + if (soft == 0) + soft = OBD_RECOVERY_TIME_SOFT; + if (hard == 0) + hard = OBD_RECOVERY_TIME_HARD; + + /* target may have ir_factor configured. */ + factor = OBD_IR_FACTOR_DEFAULT; + if (obd->obd_recovery_ir_factor) + factor = obd->obd_recovery_ir_factor; + + if (has_ir) { + int new_soft = soft; + + /* adjust timeout value by imperative recovery */ + new_soft = (soft * factor) / OBD_IR_FACTOR_MAX; + /* make sure the timeout is not too short */ + new_soft = max(min, new_soft); + + LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery " + "window shrunk from %d-%d down to %d-%d\n", + obd->obd_name, soft, hard, new_soft, hard); + + soft = new_soft; + } else { + LCONSOLE_INFO("%s: Imperative Recovery not enabled, recovery " + "window %d-%d\n", obd->obd_name, soft, hard); + } + + /* we're done */ + obd->obd_recovery_timeout = max_t(time64_t, obd->obd_recovery_timeout, + soft); + obd->obd_recovery_time_hard = hard; + obd->obd_recovery_ir_factor = factor; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c new file mode 100644 index 0000000000000..53b0b3130b717 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c @@ -0,0 +1,535 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obd_sysfs.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct static_lustre_uintvalue_attr { + struct { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + } u; + int *value; +}; + +static ssize_t static_uintvalue_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + + return sprintf(buf, "%d\n", *lattr->value); +} + +static ssize_t static_uintvalue_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + *lattr->value = val; + + return count; +} + +#define LUSTRE_STATIC_UINT_ATTR(name, value) \ +static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ + { __ATTR(name, 0644, static_uintvalue_show, \ + static_uintvalue_store), value } + +LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); +LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); +LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); +LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); +LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); +LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); +LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); +LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction); + +#ifdef HAVE_SERVER_SUPPORT +LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); +LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); +#endif + +static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_sum()); +} +LUSTRE_RO_ATTR(memused); + +static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_max()); +} +LUSTRE_RO_ATTR(memused_max); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ + + if (val > ((cfs_totalram_pages() / 10) * 9)) { + /* Somebody wants to assign too much memory to dirty pages */ + return -EINVAL; + } + + if (val < 4 << (20 - PAGE_SHIFT)) { + /* Less than 4 Mb for dirty cache is also bad */ + return -EINVAL; + } + + obd_max_dirty_pages = val; + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +static ssize_t version_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", 
LUSTRE_VERSION_STRING); +} + +static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ +#ifdef ENABLE_PINGER + const char *state = "on"; +#else + const char *state = "off"; +#endif + return sprintf(buf, "%s\n", state); +} + +/** + * Check all obd devices health + * + * \param kobj + * \param buf [in] + * + * \retval number of characters printed if healthy + */ +static ssize_t +health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + bool healthy = true; + size_t len = 0; + int i; + + if (libcfs_catastrophe) { + len = sprintf(buf, "LBUG\n"); + healthy = false; + } + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __FUNCTION__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + len = sprintf(buf, "device %s reported unhealthy\n", + obd->obd_name); + healthy = false; + } + class_decref(obd, __FUNCTION__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (healthy) + len = sprintf(buf, "healthy\n"); + else + len = sprintf(buf, "NOT HEALTHY\n"); + + return len; +} + +static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_var)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); + return rc; +} + +static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + memcpy(obd_jobid_var, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} + +static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_name)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_name); + return rc; +} + +static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) != 0 && + !strchr(buffer, '%')) { + lustre_jobid_clear(buffer); + return count; + } + + /* clear previous value */ + memset(obd_jobid_name, 0, LUSTRE_JOBID_SIZE); + + memcpy(obd_jobid_name, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_name[count - 1] == '\n') { + /* Don't echo just a newline */ + if (count == 1) + return -EINVAL; + obd_jobid_name[count - 1] = 0; + } + + return count; +} + +/* Root for /sys/kernel/debug/lustre */ +struct dentry *debugfs_lustre_root; +EXPORT_SYMBOL_GPL(debugfs_lustre_root); + +#ifdef CONFIG_PROC_FS +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root; +EXPORT_SYMBOL(proc_lustre_root); +#else +#define lprocfs_base NULL +#endif /* CONFIG_PROC_FS */ + +LUSTRE_RO_ATTR(version); +LUSTRE_RO_ATTR(pinger); +LUSTRE_RO_ATTR(health_check); +LUSTRE_RW_ATTR(jobid_var); +LUSTRE_RW_ATTR(jobid_name); + +static struct attribute *lustre_attrs[] = { + &lustre_attr_version.attr, + &lustre_attr_pinger.attr, + &lustre_attr_health_check.attr, + &lustre_attr_jobid_name.attr, + &lustre_attr_jobid_var.attr, + &lustre_sattr_timeout.u.attr, + 
&lustre_attr_max_dirty_mb.attr, + &lustre_sattr_debug_peer_on_timeout.u.attr, + &lustre_sattr_dump_on_timeout.u.attr, + &lustre_sattr_dump_on_eviction.u.attr, + &lustre_sattr_at_min.u.attr, + &lustre_sattr_at_max.u.attr, + &lustre_sattr_at_extra.u.attr, + &lustre_sattr_at_early_margin.u.attr, + &lustre_sattr_at_history.u.attr, + &lustre_attr_memused_max.attr, + &lustre_attr_memused.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_sattr_ldlm_timeout.u.attr, + &lustre_sattr_bulk_timeout.u.attr, +#endif + &lustre_sattr_lbug_on_eviction.u.attr, + NULL, +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + return 0; +} + +static const struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = inode->i_private; + return 0; +} + +static const struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct kset *lustre_kset; +EXPORT_SYMBOL_GPL(lustre_kset); + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +ssize_t class_set_global(const char *param) +{ + const char *value = strchr(param, '=') + 1; + size_t off = value - param - 1; + ssize_t count = -ENOENT; + int i; + + for (i = 0; lustre_attrs[i]; i++) { + if (!strncmp(lustre_attrs[i]->name, param, off)) { + count = lustre_attr_store(&lustre_kset->kobj, + lustre_attrs[i], value, + strlen(value)); + break; + } + } + return count; +} + +int class_procfs_init(void) +{ + struct proc_dir_entry *entry; + struct dentry *file; + int rc = -ENOMEM; + + ENTRY; + + lustre_kset = kset_create_and_add("lustre", NULL, fs_kobj); + if (!lustre_kset) + goto out; + + /* Create the files associated with this kobject */ + rc = sysfs_create_group(&lustre_kset->kobj, &lustre_attr_group); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + rc = jobid_cache_init(); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + debugfs_lustre_root = debugfs_create_dir("lustre", NULL); + if (IS_ERR_OR_NULL(debugfs_lustre_root)) { + rc = debugfs_lustre_root ? 
PTR_ERR(debugfs_lustre_root) + : -ENOMEM; + debugfs_lustre_root = NULL; + kset_unregister(lustre_kset); + goto out; + } + + file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, + &obd_device_list_fops); + if (IS_ERR_OR_NULL(file)) { + rc = file ? PTR_ERR(file) : -ENOMEM; + debugfs_remove(debugfs_lustre_root); + kset_unregister(lustre_kset); + goto out; + } + + entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); + debugfs_remove_recursive(debugfs_lustre_root); + kset_unregister(lustre_kset); + goto out; + } + + proc_lustre_root = entry; +out: + RETURN(rc); +} + +int class_procfs_clean(void) +{ + ENTRY; + + debugfs_remove_recursive(debugfs_lustre_root); + + debugfs_lustre_root = NULL; + jobid_cache_fini(); + + if (proc_lustre_root) + lprocfs_remove(&proc_lustre_root); + + sysfs_remove_group(&lustre_kset->kobj, &lustre_attr_group); + + kset_unregister(lustre_kset); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c new file mode 100644 index 0000000000000..0367cfd1bef67 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -0,0 +1,227 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include + +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLPARENT | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +void obdo_set_o_projid(struct obdo *dst, u32 projid) +{ + dst->o_projid = projid; + dst->o_valid |= OBD_MD_FLPROJID; +} +EXPORT_SYMBOL(obdo_set_o_projid); + +/* + * WARNING: the file systems must take care not to tinker with + * attributes they don't manage (such as blocks). 
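+ * Only the attributes that are both requested in @valid and copied here are flagged in dst->o_valid.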
+ */ +void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) +{ + u64 newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %#llx, new time %lld/%lld\n", + valid, (s64) src->i_mtime.tv_sec, + (s64) src->i_ctime.tv_sec); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = src->i_atime.tv_sec; + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = src->i_mtime.tv_sec; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = src->i_ctime.tv_sec; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = 1U << src->i_blkbits; + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = from_kuid(&init_user_ns, src->i_uid); + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = from_kgid(&init_user_ns, src->i_gid); + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = src->i_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLPARENT) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* + * Since 2.4 this does not contain o_mode in the low 16 bits. 
+ * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs + */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +/* + * Create an obdo to send over the wire + */ +void lustre_set_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo) +{ + *wobdo = *lobdo; + if (ocd == NULL) + return; + + if (!(wobdo->o_valid & OBD_MD_FLUID)) + wobdo->o_uid = from_kuid(&init_user_ns, current_uid()); + if (!(wobdo->o_valid & OBD_MD_FLGID)) + wobdo->o_gid = from_kgid(&init_user_ns, current_gid()); + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { + /* + * Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server + */ + wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); + wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); + } +} +EXPORT_SYMBOL(lustre_set_wire_obdo); + +/* + * Create a local obdo from a wire based odbo + */ +void lustre_get_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo) +{ + *lobdo = *wobdo; + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) { + /* see above */ + lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq; + lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id; + lobdo->o_oi.oi_fid.f_ver = 0; + } +} +EXPORT_SYMBOL(lustre_get_wire_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c new file mode 100644 index 0000000000000..0f7f474f7fbb9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c @@ -0,0 +1,156 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include /* for PAGE_SIZE */ +#include + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) +{ + u64 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & LA_PROJID) { + dst->o_projid = la->la_projid; + newvalid |= OBD_MD_FLPROJID; + } + if (valid & LA_FLAGS) { + dst->o_flags = la->la_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) +{ + u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + if (valid & OBD_MD_FLPROJID) { + dst->la_projid = obdo->o_projid; + newvalid |= LA_PROJID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->la_flags = obdo->o_flags; + newvalid |= LA_FLAGS; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/scrub.c b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c new file mode 100644 index 0000000000000..b2e93c6dcc408 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c @@ -0,0 +1,1216 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/obdclass/scrub.c + * + * The OI scrub is used for checking and (re)building Object Index files + * that are usually backend special. Here are some general scrub related + * functions that can be shared by different backends for OI scrub. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LFSCK + +#include +#include +#include +#include + +static inline struct dt_device *scrub_obj2dev(struct dt_object *obj) +{ + return container_of0(obj->do_lu.lo_dev, struct dt_device, dd_lu_dev); +} + +static void scrub_file_to_cpu(struct scrub_file *des, struct scrub_file *src) +{ + memcpy(des->sf_uuid, src->sf_uuid, 16); + des->sf_flags = le64_to_cpu(src->sf_flags); + des->sf_magic = le32_to_cpu(src->sf_magic); + des->sf_status = le16_to_cpu(src->sf_status); + des->sf_param = le16_to_cpu(src->sf_param); + des->sf_time_last_complete = + le64_to_cpu(src->sf_time_last_complete); + des->sf_time_latest_start = + le64_to_cpu(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + le64_to_cpu(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + le64_to_cpu(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + le64_to_cpu(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + le64_to_cpu(src->sf_pos_first_inconsistent); + des->sf_items_checked = + le64_to_cpu(src->sf_items_checked); + des->sf_items_updated = + le64_to_cpu(src->sf_items_updated); + des->sf_items_failed = + le64_to_cpu(src->sf_items_failed); + des->sf_items_updated_prior = + le64_to_cpu(src->sf_items_updated_prior); + des->sf_run_time = le32_to_cpu(src->sf_run_time); + des->sf_success_count = le32_to_cpu(src->sf_success_count); + des->sf_oi_count = le16_to_cpu(src->sf_oi_count); + des->sf_internal_flags = le16_to_cpu(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +static void scrub_file_to_le(struct scrub_file *des, struct scrub_file *src) +{ + memcpy(des->sf_uuid, src->sf_uuid, 16); + des->sf_flags = cpu_to_le64(src->sf_flags); + des->sf_magic = cpu_to_le32(src->sf_magic); + des->sf_status = cpu_to_le16(src->sf_status); + des->sf_param = cpu_to_le16(src->sf_param); + des->sf_time_last_complete = + cpu_to_le64(src->sf_time_last_complete); + des->sf_time_latest_start = + cpu_to_le64(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + cpu_to_le64(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + cpu_to_le64(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + cpu_to_le64(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + cpu_to_le64(src->sf_pos_first_inconsistent); + des->sf_items_checked = + cpu_to_le64(src->sf_items_checked); + des->sf_items_updated = + cpu_to_le64(src->sf_items_updated); + des->sf_items_failed = + cpu_to_le64(src->sf_items_failed); + des->sf_items_updated_prior = + cpu_to_le64(src->sf_items_updated_prior); + des->sf_run_time = cpu_to_le32(src->sf_run_time); + des->sf_success_count = 
cpu_to_le32(src->sf_success_count); + des->sf_oi_count = cpu_to_le16(src->sf_oi_count); + des->sf_internal_flags = cpu_to_le16(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +void scrub_file_init(struct lustre_scrub *scrub, __u8 *uuid) +{ + struct scrub_file *sf = &scrub->os_file; + + memset(sf, 0, sizeof(*sf)); + memcpy(sf->sf_uuid, uuid, 16); + sf->sf_magic = SCRUB_MAGIC_V1; + sf->sf_status = SS_INIT; +} +EXPORT_SYMBOL(scrub_file_init); + +void scrub_file_reset(struct lustre_scrub *scrub, __u8 *uuid, __u64 flags) +{ + struct scrub_file *sf = &scrub->os_file; + + CDEBUG(D_LFSCK, "%s: reset OI scrub file, old flags = " + "%#llx, add flags = %#llx\n", + scrub->os_name, sf->sf_flags, flags); + + memcpy(sf->sf_uuid, uuid, 16); + sf->sf_status = SS_INIT; + sf->sf_flags |= flags; + sf->sf_flags &= ~SF_AUTO; + sf->sf_run_time = 0; + sf->sf_time_latest_start = 0; + sf->sf_time_last_checkpoint = 0; + sf->sf_pos_latest_start = 0; + sf->sf_pos_last_checkpoint = 0; + sf->sf_pos_first_inconsistent = 0; + sf->sf_items_checked = 0; + sf->sf_items_updated = 0; + sf->sf_items_failed = 0; + sf->sf_items_noscrub = 0; + sf->sf_items_igif = 0; + if (!scrub->os_in_join) + sf->sf_items_updated_prior = 0; +} +EXPORT_SYMBOL(scrub_file_reset); + +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + struct lu_buf buf = { + .lb_buf = &scrub->os_file_disk, + .lb_len = sizeof(scrub->os_file_disk) + }; + loff_t pos = 0; + int rc; + + rc = dt_read(env, scrub->os_obj, &buf, &pos); + /* failure */ + if (rc < 0) { + CERROR("%s: fail to load scrub file: rc = %d\n", + scrub->os_name, rc); + return rc; + } + + /* empty */ + if (!rc) + return -ENOENT; + + /* corrupted */ + if (rc < buf.lb_len) { + CDEBUG(D_LFSCK, "%s: fail to load scrub file, " + "expected = %d: rc = %d\n", + scrub->os_name, (int)buf.lb_len, rc); + return -EFAULT; + } + + scrub_file_to_cpu(sf, &scrub->os_file_disk); + if (sf->sf_magic != SCRUB_MAGIC_V1) { + CDEBUG(D_LFSCK, "%s: invalid scrub magic 0x%x != 0x%x\n", + scrub->os_name, sf->sf_magic, SCRUB_MAGIC_V1); + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL(scrub_file_load); + +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file_disk; + struct dt_object *obj = scrub->os_obj; + struct dt_device *dev = scrub_obj2dev(obj); + struct lu_buf buf = { + .lb_buf = sf, + .lb_len = sizeof(*sf) + }; + struct thandle *th; + loff_t pos = 0; + int rc; + ENTRY; + + /* Skip store under rdonly mode. 
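+	 * Otherwise the in-memory scrub_file is converted to
+	 * little-endian into os_file_disk and written back through a
+	 * short local transaction below; the checkpoint times are
+	 * refreshed even when the write fails.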
*/ + if (dev->dd_rdonly) + RETURN(0); + + scrub_file_to_le(sf, &scrub->os_file); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &buf, pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &buf, &pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + +log: + if (rc) + CERROR("%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + else + CDEBUG(D_LFSCK, "%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + + scrub->os_time_last_checkpoint = ktime_get_seconds(); + scrub->os_time_next_checkpoint = scrub->os_time_last_checkpoint + + SCRUB_CHECKPOINT_INTERVAL; + return rc; +} +EXPORT_SYMBOL(scrub_file_store); + +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + time64_t now = ktime_get_seconds(); + int rc; + + if (likely(now < scrub->os_time_next_checkpoint || + scrub->os_new_checked == 0)) + return 0; + + CDEBUG(D_LFSCK, "%s: OI scrub checkpoint at pos %llu\n", + scrub->os_name, scrub->os_pos_current); + + down_write(&scrub->os_rwsem); + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + sf->sf_time_last_checkpoint = ktime_get_real_seconds(); + sf->sf_run_time += now - scrub->os_time_last_checkpoint; + rc = scrub_file_store(env, scrub); + up_write(&scrub->os_rwsem); + + return rc; +} +EXPORT_SYMBOL(scrub_checkpoint); + +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags) +{ + struct ptlrpc_thread *thread = &scrub->os_thread; + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + int rc; + ENTRY; + +again: + /* os_lock: sync status between stop and scrub thread */ + spin_lock(&scrub->os_lock); + if (thread_is_running(thread)) { + spin_unlock(&scrub->os_lock); + RETURN(-EALREADY); + } + + if (unlikely(thread_is_stopping(thread))) { + spin_unlock(&scrub->os_lock); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + goto again; + } + spin_unlock(&scrub->os_lock); + + if (scrub->os_file.sf_status == SS_COMPLETED) { + if (!(flags & SS_SET_FAILOUT)) + flags |= SS_CLEAR_FAILOUT; + + if (!(flags & SS_SET_DRYRUN)) + flags |= SS_CLEAR_DRYRUN; + + flags |= SS_RESET; + } + + scrub->os_start_flags = flags; + thread_set_flags(thread, 0); + task = kthread_run(threadfn, data, "OI_scrub"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start iteration thread: rc = %d\n", + scrub->os_name, rc); + RETURN(rc); + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + RETURN(0); +} +EXPORT_SYMBOL(scrub_start); + +void scrub_stop(struct lustre_scrub *scrub) +{ + struct ptlrpc_thread *thread = &scrub->os_thread; + struct l_wait_info lwi = { 0 }; + + /* os_lock: sync status between stop and scrub thread */ + spin_lock(&scrub->os_lock); + if (!thread_is_init(thread) && !thread_is_stopped(thread)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&scrub->os_lock); + wake_up_all(&thread->t_ctl_waitq); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + /* Do not skip the last lock/unlock, which can guarantee that + * the caller cannot return until the OI scrub thread exit. 
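+	 * That is, the extra spin_lock()/spin_unlock() pair below acts
+	 * as a barrier: it can only be taken once the scrub thread has
+	 * released os_lock for the last time.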
*/ + spin_lock(&scrub->os_lock); + } + spin_unlock(&scrub->os_lock); +} +EXPORT_SYMBOL(scrub_stop); + +const char *scrub_status_names[] = { + "init", + "scanning", + "completed", + "failed", + "stopped", + "paused", + "crashed", + NULL +}; + +const char *scrub_flags_names[] = { + "recreated", + "inconsistent", + "auto", + "upgrade", + NULL +}; + +const char *scrub_param_names[] = { + "failout", + "dryrun", + NULL +}; + +static void scrub_bits_dump(struct seq_file *m, int bits, const char *names[], + const char *prefix) +{ + int flag; + int i; + + seq_printf(m, "%s:%c", prefix, bits != 0 ? ' ' : '\n'); + + for (i = 0, flag = 1; bits != 0; i++, flag = 1 << i) { + if (flag & bits) { + bits &= ~flag; + seq_printf(m, "%s%c", names[i], + bits != 0 ? ',' : '\n'); + } + } +} + +static void scrub_time_dump(struct seq_file *m, time64_t time, + const char *prefix) +{ + if (time != 0) + seq_printf(m, "%s: %llu seconds\n", prefix, + ktime_get_real_seconds() - time); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +static void scrub_pos_dump(struct seq_file *m, __u64 pos, const char *prefix) +{ + if (pos != 0) + seq_printf(m, "%s: %llu\n", prefix, pos); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + u64 checked; + s64 speed; + + down_read(&scrub->os_rwsem); + seq_printf(m, "name: OI_scrub\n" + "magic: 0x%x\n" + "oi_files: %d\n" + "status: %s\n", + sf->sf_magic, (int)sf->sf_oi_count, + scrub_status_names[sf->sf_status]); + + scrub_bits_dump(m, sf->sf_flags, scrub_flags_names, "flags"); + + scrub_bits_dump(m, sf->sf_param, scrub_param_names, "param"); + + scrub_time_dump(m, sf->sf_time_last_complete, + "time_since_last_completed"); + + scrub_time_dump(m, sf->sf_time_latest_start, + "time_since_latest_start"); + + scrub_time_dump(m, sf->sf_time_last_checkpoint, + "time_since_last_checkpoint"); + + scrub_pos_dump(m, sf->sf_pos_latest_start, + "latest_start_position"); + + scrub_pos_dump(m, sf->sf_pos_last_checkpoint, + "last_checkpoint_position"); + + scrub_pos_dump(m, sf->sf_pos_first_inconsistent, + "first_failure_position"); + + checked = sf->sf_items_checked + scrub->os_new_checked; + seq_printf(m, "checked: %llu\n" + "%s: %llu\n" + "failed: %llu\n" + "prior_%s: %llu\n" + "noscrub: %llu\n" + "igif: %llu\n" + "success_count: %u\n", + checked, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated, sf->sf_items_failed, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated_prior, sf->sf_items_noscrub, + sf->sf_items_igif, sf->sf_success_count); + + speed = checked; + if (thread_is_running(&scrub->os_thread)) { + s64 new_checked = scrub->os_new_checked; + time64_t duration; + time64_t rtime; + + /* Since the time resolution is in seconds for new system + * or small devices it ismore likely that duration will be + * zero which will lead to inaccurate results. + */ + duration = ktime_get_seconds() - + scrub->os_time_last_checkpoint; + if (duration != 0) + new_checked = div_s64(new_checked, duration); + + rtime = sf->sf_run_time + duration; + if (rtime != 0) + speed = div_s64(speed, rtime); + + seq_printf(m, "run_time: %lld seconds\n" + "average_speed: %lld objects/sec\n" + "real-time_speed: %lld objects/sec\n" + "current_position: %llu\n" + "scrub_in_prior: %s\n" + "scrub_full_speed: %s\n" + "partial_scan: %s\n", + rtime, speed, new_checked, + scrub->os_pos_current, + scrub->os_in_prior ? "yes" : "no", + scrub->os_full_speed ? 
"yes" : "no", + scrub->os_partial_scan ? "yes" : "no"); + } else { + if (sf->sf_run_time != 0) + speed = div_s64(speed, sf->sf_run_time); + seq_printf(m, "run_time: %ld seconds\n" + "average_speed: %lld objects/sec\n" + "real-time_speed: N/A\n" + "current_position: N/A\n", + sf->sf_run_time, speed); + } + + up_read(&scrub->os_rwsem); +} +EXPORT_SYMBOL(scrub_dump); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen) +{ + struct lustre_index_restore_unit *liru; + int len = sizeof(*liru) + namelen + 1; + + OBD_ALLOC(liru, len); + if (!liru) + return -ENOMEM; + + INIT_LIST_HEAD(&liru->liru_link); + liru->liru_pfid = *pfid; + liru->liru_cfid = *cfid; + liru->liru_clid = child; + liru->liru_len = len; + memcpy(liru->liru_name, name, namelen); + liru->liru_name[namelen] = 0; + list_add_tail(&liru->liru_link, head); + + return 0; +} +EXPORT_SYMBOL(lustre_liru_new); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize) +{ + struct lustre_index_backup_unit *libu, *pos; + int rc = 0; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN(1); + + OBD_ALLOC_PTR(libu); + if (!libu) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&libu->libu_link); + libu->libu_keysize = keysize; + libu->libu_recsize = recsize; + libu->libu_fid = *fid; + + spin_lock(lock); + if (unlikely(*guard)) { + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(1); + } + + list_for_each_entry_reverse(pos, head, libu_link) { + rc = lu_fid_cmp(&pos->libu_fid, fid); + if (rc < 0) { + list_add(&libu->libu_link, &pos->libu_link); + spin_unlock(lock); + + RETURN(0); + } + + if (!rc) { + /* Registered already. But the former registered one + * has different keysize/recsize. It may because that + * the former values are from disk and corrupted, then + * replace it with new values. */ + if (unlikely(keysize != pos->libu_keysize || + recsize != pos->libu_recsize)) { + CWARN("%s: the index "DFID" has registered " + "with %u/%u, may be invalid, replace " + "with %u/%u\n", + devname, PFID(fid), pos->libu_keysize, + pos->libu_recsize, keysize, recsize); + + pos->libu_keysize = keysize; + pos->libu_recsize = recsize; + } else { + rc = 1; + } + + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(rc); + } + } + + list_add(&libu->libu_link, head); + spin_unlock(lock); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_index_register); + +static void lustre_index_degister(struct list_head *head, spinlock_t *lock, + const struct lu_fid *fid) +{ + struct lustre_index_backup_unit *libu; + int rc = -ENOENT; + + spin_lock(lock); + list_for_each_entry_reverse(libu, head, libu_link) { + rc = lu_fid_cmp(&libu->libu_fid, fid); + /* NOT registered. 
*/ + if (rc < 0) + break; + + if (!rc) { + list_del(&libu->libu_link); + break; + } + } + spin_unlock(lock); + + if (!rc) + OBD_FREE_PTR(libu); +} + +static void +lustre_index_backup_make_header(struct lustre_index_backup_header *header, + __u32 keysize, __u32 recsize, + const struct lu_fid *fid, __u32 count) +{ + memset(header, 0, sizeof(*header)); + header->libh_magic = cpu_to_le32(INDEX_BACKUP_MAGIC_V1); + header->libh_count = cpu_to_le32(count); + header->libh_keysize = cpu_to_le32(keysize); + header->libh_recsize = cpu_to_le32(recsize); + fid_cpu_to_le(&header->libh_owner, fid); +} + +static int lustre_index_backup_body(const struct lu_env *env, + struct dt_object *obj, loff_t *pos, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = buf, + .lb_len = bufsize + }; + int rc; + ENTRY; + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, *pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_header(const struct lu_env *env, + struct dt_object *obj, + const struct lu_fid *tgt_fid, + __u32 keysize, __u32 recsize, + void *buf, int bufsize, int count) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_index_backup_header *header = buf; + struct lu_attr *la = buf; + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = header, + .lb_len = sizeof(*header) + }; + loff_t size = sizeof(*header) + (keysize + recsize) * count; + loff_t pos = 0; + int rc; + bool punch = false; + ENTRY; + + LASSERT(sizeof(*la) <= bufsize); + LASSERT(sizeof(*header) <= bufsize); + + rc = dt_attr_get(env, obj, la); + if (rc) + RETURN(rc); + + if (la->la_size > size) + punch = true; + + lustre_index_backup_make_header(header, keysize, recsize, + tgt_fid, count); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, pos, th); + if (rc) + GOTO(stop, rc); + + if (punch) { + rc = dt_declare_punch(env, obj, size, OBD_OBJECT_EOF, th); + if (rc) + GOTO(stop, rc); + } + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, &pos, th); + if (!rc && punch) + rc = dt_punch(env, obj, size, OBD_OBJECT_EOF, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_update_lma(const struct lu_env *env, + struct dt_object *obj, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_mdt_attrs *lma = buf; + struct lu_buf lbuf = { + .lb_buf = lma, + .lb_len = sizeof(struct lustre_ost_attrs) + }; + struct thandle *th; + int fl = LU_XATTR_REPLACE; + int rc; + ENTRY; + + LASSERT(bufsize >= lbuf.lb_len); + + rc = dt_xattr_get(env, obj, &lbuf, XATTR_NAME_LMA); + if (unlikely(rc == -ENODATA)) { + fl = LU_XATTR_CREATE; + lustre_lma_init(lma, lu_object_fid(&obj->do_lu), + LMAC_IDX_BACKUP, 0); + rc = sizeof(*lma); + } else if (rc < sizeof(*lma)) { + RETURN(rc < 0 ? 
rc : -EFAULT); + } else { + lustre_lma_swab(lma); + if (lma->lma_compat & LMAC_IDX_BACKUP) + RETURN(0); + + lma->lma_compat |= LMAC_IDX_BACKUP; + } + + lustre_lma_swab(lma); + lbuf.lb_len = rc; + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(rc); + + rc = dt_declare_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_one(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + struct lustre_index_backup_unit *libu, + char *buf, int bufsize) +{ + struct dt_device *dev = scrub_obj2dev(parent); + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + const struct dt_it_ops *iops; + struct dt_it *di; + loff_t pos = sizeof(struct lustre_index_backup_header); + int count = 0; + int size = 0; + int rc; + ENTRY; + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &libu->libu_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + if (!dt_object_exists(tgt_obj)) + GOTO(out, rc = 0); + + if (!tgt_obj->do_index_ops) { + struct dt_index_features feat; + + feat.dif_flags = DT_IND_UPDATE; + feat.dif_keysize_min = libu->libu_keysize; + feat.dif_keysize_max = libu->libu_keysize; + feat.dif_recsize_min = libu->libu_recsize; + feat.dif_recsize_max = libu->libu_recsize; + feat.dif_ptrsize = 4; + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, &feat); + if (rc) + GOTO(out, rc); + } + + lustre_fid2lbx(buf, &libu->libu_fid, bufsize); + bak_obj = local_file_find_or_create(env, los, parent, buf, + S_IFREG | S_IRUGO | S_IWUSR); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? 
PTR_ERR(bak_obj) : -ENOENT); + + iops = &tgt_obj->do_index_ops->dio_it; + di = iops->init(env, tgt_obj, 0); + if (IS_ERR(di)) + GOTO(out, rc = PTR_ERR(di)); + + rc = iops->load(env, di, 0); + if (!rc) + rc = iops->next(env, di); + else if (rc > 0) + rc = 0; + + while (!rc) { + void *key; + void *rec; + + key = iops->key(env, di); + memcpy(&buf[size], key, libu->libu_keysize); + size += libu->libu_keysize; + rec = &buf[size]; + rc = iops->rec(env, di, rec, 0); + if (rc) + GOTO(fini, rc); + + size += libu->libu_recsize; + count++; + if (size + libu->libu_keysize + libu->libu_recsize > bufsize) { + rc = lustre_index_backup_body(env, bak_obj, &pos, + buf, size); + if (rc) + GOTO(fini, rc); + + size = 0; + } + + rc = iops->next(env, di); + } + + if (rc >= 0 && size > 0) + rc = lustre_index_backup_body(env, bak_obj, &pos, buf, size); + + if (rc < 0) + GOTO(fini, rc); + + rc = lustre_index_backup_header(env, bak_obj, &libu->libu_fid, + libu->libu_keysize, libu->libu_recsize, + buf, bufsize, count); + if (!rc) + rc = lustre_index_update_lma(env, tgt_obj, buf, bufsize); + + if (!rc && OBD_FAIL_CHECK(OBD_FAIL_OSD_INDEX_CRASH)) { + LASSERT(bufsize >= 512); + + pos = 0; + memset(buf, 0, 512); + lustre_index_backup_body(env, tgt_obj, &pos, buf, 512); + } + + GOTO(fini, rc); + +fini: + iops->fini(env, di); +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + return rc; +} + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup) +{ + struct lustre_index_backup_unit *libu; + struct local_oid_storage *los = NULL; + struct dt_object *parent = NULL; + char *buf = NULL; + struct lu_fid fid; + int rc; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN_EXIT; + + spin_lock(lock); + *guard = 1; + spin_unlock(lock); + + if (list_empty(head)) + RETURN_EXIT; + + /* Handle kinds of failures during mount process. */ + if (!dev->dd_lu_dev.ld_site || !dev->dd_lu_dev.ld_site->ls_top_dev) + backup = false; + + if (backup) { + OBD_ALLOC_LARGE(buf, INDEX_BACKUP_BUFSIZE); + if (!buf) { + backup = false; + goto scan; + } + + lu_local_obj_fid(&fid, INDEX_BACKUP_OID); + parent = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &fid, NULL)); + if (IS_ERR_OR_NULL(parent)) { + CERROR("%s: failed to locate backup dir: rc = %ld\n", + devname, parent ? 
PTR_ERR(parent) : -ENOENT); + backup = false; + goto scan; + } + + lu_local_name_obj_fid(&fid, 1); + rc = local_oid_storage_init(env, dev, &fid, &los); + if (rc) { + CERROR("%s: failed to init local storage: rc = %d\n", + devname, rc); + backup = false; + } + } + +scan: + spin_lock(lock); + while (!list_empty(head)) { + libu = list_entry(head->next, + struct lustre_index_backup_unit, libu_link); + list_del_init(&libu->libu_link); + spin_unlock(lock); + + if (backup) { + rc = lustre_index_backup_one(env, los, parent, libu, + buf, INDEX_BACKUP_BUFSIZE); + CDEBUG(D_WARNING, "%s: backup index "DFID": rc = %d\n", + devname, PFID(&libu->libu_fid), rc); + } + + OBD_FREE_PTR(libu); + spin_lock(lock); + } + spin_unlock(lock); + + if (los) + local_oid_storage_fini(env, los); + if (parent) + dt_object_put_nocache(env, parent); + if (buf) + OBD_FREE_LARGE(buf, INDEX_BACKUP_BUFSIZE); + + EXIT; +} +EXPORT_SYMBOL(lustre_index_backup); + +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize) +{ + struct dt_object *parent_obj = NULL; + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + struct lustre_index_backup_header *header; + struct dt_index_features *feat; + struct dt_object_format *dof; + struct lu_attr *la; + struct thandle *th; + struct lu_object_conf conf; + struct dt_insert_rec ent; + struct lu_buf lbuf; + struct lu_fid tfid; + loff_t pos = 0; + __u32 keysize; + __u32 recsize; + __u32 pairsize; + int count; + int rc; + bool registered = false; + ENTRY; + + LASSERT(bufsize >= sizeof(*la) + sizeof(*dof) + + sizeof(*feat) + sizeof(*header)); + + memset(buf, 0, bufsize); + la = (struct lu_attr *)buf; + dof = (void *)la + sizeof(*la); + feat = (void *)dof + sizeof(*dof); + header = (void *)feat + sizeof(*feat); + lbuf.lb_buf = header; + lbuf.lb_len = sizeof(*header); + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + bak_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + bak_fid, NULL)); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? PTR_ERR(bak_obj) : -ENOENT); + + if (!dt_object_exists(bak_obj)) + GOTO(out, rc = -ENOENT); + + parent_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + parent_fid, NULL)); + if (IS_ERR_OR_NULL(parent_obj)) + GOTO(out, rc = parent_obj ? PTR_ERR(parent_obj) : -ENOENT); + + LASSERT(dt_object_exists(parent_obj)); + + if (unlikely(!dt_try_as_dir(env, parent_obj))) + GOTO(out, rc = -ENOTDIR); + + rc = dt_attr_get(env, tgt_obj, la); + if (rc) + GOTO(out, rc); + + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + if (rc) + GOTO(out, rc); + + if (le32_to_cpu(header->libh_magic) != INDEX_BACKUP_MAGIC_V1) + GOTO(out, rc = -EINVAL); + + fid_le_to_cpu(&tfid, &header->libh_owner); + if (unlikely(!lu_fid_eq(tgt_fid, &tfid))) + GOTO(out, rc = -EINVAL); + + keysize = le32_to_cpu(header->libh_keysize); + recsize = le32_to_cpu(header->libh_recsize); + pairsize = keysize + recsize; + + memset(feat, 0, sizeof(*feat)); + feat->dif_flags = DT_IND_UPDATE; + feat->dif_keysize_min = feat->dif_keysize_max = keysize; + feat->dif_recsize_min = feat->dif_recsize_max = recsize; + feat->dif_ptrsize = 4; + + /* T1: remove old name entry and destroy old index. 
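+	 * The restore runs as separate local transactions: T1 unlinks
+	 * the name entry and destroys the old index, T2 recreates the
+	 * index object and re-inserts the name, and each backed-up
+	 * key/record pair is then replayed in its own transaction (Tn).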
*/ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_delete(env, parent_obj, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_destroy(env, tgt_obj, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_delete(env, parent_obj, (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_destroy(env, tgt_obj, th); + dt_write_unlock(env, tgt_obj); + dt_trans_stop(env, dev, th); + if (rc) + GOTO(out, rc); + + la->la_valid = LA_MODE | LA_UID | LA_GID; + conf.loc_flags = LOC_F_NEW; + dof->u.dof_idx.di_feat = feat; + dof->dof_type = DFT_INDEX; + ent.rec_type = S_IFREG; + ent.rec_fid = tgt_fid; + + /* Drop cache before re-create it. */ + dt_object_put_nocache(env, tgt_obj); + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, &conf)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + LASSERT(!dt_object_exists(tgt_obj)); + + /* T2: create new index and insert new name entry. */ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, tgt_obj, la, NULL, dof, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_create(env, tgt_obj, la, NULL, dof, th); + dt_write_unlock(env, tgt_obj); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + dt_trans_stop(env, dev, th); + /* Some index name may has been inserted by OSD + * automatically when create the index object. */ + if (unlikely(rc == -EEXIST)) + rc = 0; + if (rc) + GOTO(out, rc); + + /* The new index will register via index_try. */ + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, feat); + if (rc) + GOTO(out, rc); + + registered = true; + count = le32_to_cpu(header->libh_count); + while (!rc && count > 0) { + int size = pairsize * count; + int items = count; + int i; + + if (size > bufsize) { + items = bufsize / pairsize; + size = pairsize * items; + } + + lbuf.lb_buf = buf; + lbuf.lb_len = size; + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + for (i = 0; i < items && !rc; i++) { + void *key = &buf[i * pairsize]; + void *rec = &buf[i * pairsize + keysize]; + + /* Tn: restore the records. */ + th = dt_trans_create(env, dev); + if (!th) + GOTO(out, rc = -ENOMEM); + + rc = dt_declare_insert(env, tgt_obj, rec, key, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, tgt_obj, rec, key, th); + if (unlikely(rc == -EEXIST)) + rc = 0; + + dt_trans_stop(env, dev, th); + } + + count -= items; + } + + GOTO(out, rc); + +stop: + dt_trans_stop(env, dev, th); + if (rc && registered) + /* Degister the index to avoid overwriting the backup. 
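+		 * I.e. deregister the FID from the backup list again so
+		 * that a later lustre_index_backup() run does not dump the
+		 * partially restored index over the still-good backup file.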
*/ + lustre_index_degister(head, lock, tgt_fid); + +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + if (!IS_ERR_OR_NULL(parent_obj)) + dt_object_put_nocache(env, parent_obj); + return rc; +} +EXPORT_SYMBOL(lustre_index_restore); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c new file mode 100644 index 0000000000000..9c52f8094e9fe --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c @@ -0,0 +1,73 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c new file mode 100644 index 0000000000000..5622410784d7a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c @@ -0,0 +1,448 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/upcall_cache.c + * + * Supplementary groups cache. + */ +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry; + + LIBCFS_ALLOC(entry, sizeof(*entry)); + if (!entry) + return NULL; + + UC_CACHE_SET_NEW(entry); + INIT_LIST_HEAD(&entry->ue_hash); + entry->ue_key = key; + atomic_set(&entry->ue_refcount, 0); + init_waitqueue_head(&entry->ue_waitq); + if (cache->uc_ops->init_entry) + cache->uc_ops->init_entry(entry, args); + return entry; +} + +/* protected by cache lock */ +static void free_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (cache->uc_ops->free_entry) + cache->uc_ops->free_entry(cache, entry); + + list_del(&entry->ue_hash); + CDEBUG(D_OTHER, "destroy cache entry %p for key %llu\n", + entry, entry->ue_key); + LIBCFS_FREE(entry, sizeof(*entry)); +} + +static inline int upcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->upcall_compare) + return cache->uc_ops->upcall_compare(cache, entry, key, args); + + return 0; +} + +static inline int downcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->downcall_compare) + return cache->uc_ops->downcall_compare(cache, entry, key, args); + + return 0; +} + +static inline void get_entry(struct upcall_cache_entry *entry) +{ + atomic_inc(&entry->ue_refcount); +} + +static inline void put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->ue_refcount) && + (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) { + free_entry(cache, entry); + } +} + +static int check_unlink_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + time64_t now = ktime_get_seconds(); + + if (UC_CACHE_IS_VALID(entry) && now < entry->ue_expire) + return 0; + + if (UC_CACHE_IS_ACQUIRING(entry)) { + if (entry->ue_acquire_expire == 0 || + now < entry->ue_acquire_expire) + return 0; + + UC_CACHE_SET_EXPIRED(entry); + wake_up_all(&entry->ue_waitq); + } else if (!UC_CACHE_IS_INVALID(entry)) { + UC_CACHE_SET_EXPIRED(entry); + } + + list_del_init(&entry->ue_hash); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + return 1; +} + +static inline int refresh_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + LASSERT(cache->uc_ops->do_upcall); + return cache->uc_ops->do_upcall(cache, entry); +} + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry = NULL, *new = NULL, *next; + struct list_head *head; + wait_queue_entry_t 
wait; + int rc, found; + ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; +find_again: + found = 0; + spin_lock(&cache->uc_lock); + list_for_each_entry_safe(entry, next, head, ue_hash) { + /* check invalid & expired items */ + if (check_unlink_entry(cache, entry)) + continue; + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (!found) { + if (!new) { + spin_unlock(&cache->uc_lock); + new = alloc_entry(cache, key, args); + if (!new) { + CERROR("fail to alloc entry\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + goto find_again; + } else { + list_add(&new->ue_hash, head); + entry = new; + } + } else { + if (new) { + free_entry(cache, new); + new = NULL; + } + list_move(&entry->ue_hash, head); + } + get_entry(entry); + + /* acquire for new one */ + if (UC_CACHE_IS_NEW(entry)) { + UC_CACHE_SET_ACQUIRING(entry); + UC_CACHE_CLEAR_NEW(entry); + spin_unlock(&cache->uc_lock); + rc = refresh_entry(cache, entry); + spin_lock(&cache->uc_lock); + entry->ue_acquire_expire = ktime_get_seconds() + + cache->uc_acquire_expire; + if (rc < 0) { + UC_CACHE_CLEAR_ACQUIRING(entry); + UC_CACHE_SET_INVALID(entry); + wake_up_all(&entry->ue_waitq); + if (unlikely(rc == -EREMCHG)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(rc)); + } + } + } + /* someone (and only one) is doing upcall upon this item, + * wait it to complete */ + if (UC_CACHE_IS_ACQUIRING(entry)) { + long expiry = (entry == new) ? + cfs_time_seconds(cache->uc_acquire_expire) : + MAX_SCHEDULE_TIMEOUT; + long left; + + init_waitqueue_entry(&wait, current); + add_wait_queue(&entry->ue_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&cache->uc_lock); + + left = schedule_timeout(expiry); + + spin_lock(&cache->uc_lock); + remove_wait_queue(&entry->ue_waitq, &wait); + if (UC_CACHE_IS_ACQUIRING(entry)) { + /* we're interrupted or upcall failed in the middle */ + rc = left > 0 ? -EINTR : -ETIMEDOUT; + CERROR("acquire for key %llu: error %d\n", + entry->ue_key, rc); + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(rc)); + } + } + + /* invalid means error, don't need to try again */ + if (UC_CACHE_IS_INVALID(entry)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(-EIDRM)); + } + + /* check expired + * We can't refresh the existing one because some + * memory might be shared by multiple processes. + */ + if (check_unlink_entry(cache, entry)) { + /* if expired, try again. but if this entry is + * created by me but too quickly turn to expired + * without any error, should at least give a + * chance to use it once. 
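+		 * Only the entry allocated by this caller ("new") is
+		 * handed back despite having expired already; any other
+		 * expired entry is dropped and the lookup restarts from
+		 * find_again.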
+ */ + if (entry != new) { + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + new = NULL; + goto find_again; + } + } + + /* Now we know it's good */ +out: + spin_unlock(&cache->uc_lock); + RETURN(entry); +} +EXPORT_SYMBOL(upcall_cache_get_entry); + +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + ENTRY; + + if (!entry) { + EXIT; + return; + } + + LASSERT(atomic_read(&entry->ue_refcount) > 0); + spin_lock(&cache->uc_lock); + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_put_entry); + +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args) +{ + struct upcall_cache_entry *entry = NULL; + struct list_head *head; + int found = 0, rc = 0; + ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (downcall_compare(cache, entry, key, args) == 0) { + found = 1; + get_entry(entry); + break; + } + } + + if (!found) { + CDEBUG(D_OTHER, "%s: upcall for key %llu not expected\n", + cache->uc_name, key); + /* haven't found, it's possible */ + spin_unlock(&cache->uc_lock); + RETURN(-EINVAL); + } + + if (err) { + CDEBUG(D_OTHER, "%s: upcall for key %llu returned %d\n", + cache->uc_name, entry->ue_key, err); + GOTO(out, rc = -EINVAL); + } + + if (!UC_CACHE_IS_ACQUIRING(entry)) { + CDEBUG(D_RPCTRACE, "%s: found uptodate entry %p (key %llu)" + "\n", cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = 0); + } + + if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) { + CERROR("%s: found a stale entry %p (key %llu) in ioctl\n", + cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = -EINVAL); + } + + spin_unlock(&cache->uc_lock); + if (cache->uc_ops->parse_downcall) + rc = cache->uc_ops->parse_downcall(cache, entry, args); + spin_lock(&cache->uc_lock); + if (rc) + GOTO(out, rc); + + entry->ue_expire = ktime_get_seconds() + cache->uc_entry_expire; + UC_CACHE_SET_VALID(entry); + CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key %llu\n", + cache->uc_name, entry, entry->ue_key); +out: + if (rc) { + UC_CACHE_SET_INVALID(entry); + list_del_init(&entry->ue_hash); + } + UC_CACHE_CLEAR_ACQUIRING(entry); + spin_unlock(&cache->uc_lock); + wake_up_all(&entry->ue_waitq); + put_entry(cache, entry); + + RETURN(rc); +} +EXPORT_SYMBOL(upcall_cache_downcall); + +void upcall_cache_flush(struct upcall_cache *cache, int force) +{ + struct upcall_cache_entry *entry, *next; + int i; + ENTRY; + + spin_lock(&cache->uc_lock); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) { + list_for_each_entry_safe(entry, next, + &cache->uc_hashtable[i], ue_hash) { + if (!force && atomic_read(&entry->ue_refcount)) { + UC_CACHE_SET_EXPIRED(entry); + continue; + } + LASSERT(!atomic_read(&entry->ue_refcount)); + free_entry(cache, entry); + } + } + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_flush); + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args) +{ + struct list_head *head; + struct upcall_cache_entry *entry; + int found = 0; + ENTRY; + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (found) { + CWARN("%s: flush entry %p: key %llu, ref %d, fl %x, " + "cur %lld, ex %lld/%lld\n", + cache->uc_name, entry, entry->ue_key, + 
atomic_read(&entry->ue_refcount), entry->ue_flags, + ktime_get_real_seconds(), entry->ue_acquire_expire, + entry->ue_expire); + UC_CACHE_SET_EXPIRED(entry); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + } + spin_unlock(&cache->uc_lock); +} +EXPORT_SYMBOL(upcall_cache_flush_one); + +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops) +{ + struct upcall_cache *cache; + int i; + ENTRY; + + LIBCFS_ALLOC(cache, sizeof(*cache)); + if (!cache) + RETURN(ERR_PTR(-ENOMEM)); + + spin_lock_init(&cache->uc_lock); + init_rwsem(&cache->uc_upcall_rwsem); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) + INIT_LIST_HEAD(&cache->uc_hashtable[i]); + strlcpy(cache->uc_name, name, sizeof(cache->uc_name)); + /* upcall pathname proc tunable */ + strlcpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall)); + cache->uc_entry_expire = 20 * 60; + cache->uc_acquire_expire = 30; + cache->uc_ops = ops; + + RETURN(cache); +} +EXPORT_SYMBOL(upcall_cache_init); + +void upcall_cache_cleanup(struct upcall_cache *cache) +{ + if (!cache) + return; + upcall_cache_flush_all(cache); + LIBCFS_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(upcall_cache_cleanup); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/Makefile b/drivers/staging/lustrefsx/lustre/obdecho/Makefile new file mode 100644 index 0000000000000..8fdb779fdc085 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += obdecho.o + +obdecho-y := echo_client.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo.c b/drivers/staging/lustrefsx/lustre/obdecho/echo.c new file mode 100644 index 0000000000000..0f97a830f9b37 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo.c @@ -0,0 +1,966 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdecho/echo.c + * + * Author: Peter Braam + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#include +#include +#include +#include + +#include "echo_internal.h" + +/* The echo objid needs to be below 2^32, because regular FID numbers are + * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. 
b=23335 */ +#define ECHO_INIT_OID 0x10000000ULL +#define ECHO_HANDLE_MAGIC 0xabcd0123fedc9876ULL + +#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_SHIFT) +static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES]; + +enum { + LPROC_ECHO_READ_BYTES = 1, + LPROC_ECHO_WRITE_BYTES = 2, + LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES +1 +}; + +struct echo_srv_device { + struct lu_device esd_dev; + struct lu_target esd_lut; +}; + +static inline struct echo_srv_device *echo_srv_dev(struct lu_device *d) +{ + return container_of0(d, struct echo_srv_device, esd_dev); +} + +static inline struct obd_device *echo_srv_obd(struct echo_srv_device *esd) +{ + return esd->esd_dev.ld_obd; +} + +static int echo_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lustre_handle conn = { 0 }; + int rc; + + data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED; + + if (data->ocd_connect_flags & OBD_CONNECT_FLAGS2) + data->ocd_connect_flags2 &= ECHO_CONNECT_SUPPORTED2; + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("can't connect %d\n", rc); + return rc; + } + *exp = class_conn2export(&conn); + + return 0; +} + +static int echo_disconnect(struct obd_export *exp) +{ + LASSERT (exp != NULL); + + return server_disconnect_export(exp); +} + +static int echo_init_export(struct obd_export *exp) +{ + return ldlm_init_export(exp); +} + +static int echo_destroy_export(struct obd_export *exp) +{ + ENTRY; + + target_destroy_export(exp); + ldlm_destroy_export(exp); + + RETURN(0); +} + +static u64 echo_next_id(struct obd_device *obddev) +{ + u64 id; + + spin_lock(&obddev->u.echo.eo_lock); + id = ++obddev->u.echo.eo_lastino; + spin_unlock(&obddev->u.echo.eo_lock); + + return id; +} + +static void +echo_page_debug_setup(struct page *page, int rw, u64 id, + __u64 offset, int len) +{ + int page_offset = offset & ~PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + if (rw & OBD_BRW_READ) + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + else + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + 0xecc0ecc0ecc0ecc0ULL, + 0xecc0ecc0ecc0ecc0ULL); + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); +} + +static int +echo_page_debug_check(struct page *page, u64 id, + __u64 offset, int len) +{ + int page_offset = offset & ~PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + int rc = 0; + int rc2; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + rc2 = block_debug_check("echo", addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + + if (rc2 != 0 && rc == 0) + rc = rc2; + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); + + return rc; +} + +static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *lb, int cmd, int *left) +{ + gfp_t gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ? 
+ GFP_HIGHUSER : GFP_KERNEL; + int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID; + int debug_setup = (!ispersistent && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + struct niobuf_local *res = lb; + u64 offset = nb->rnb_offset; + int len = nb->rnb_len; + + while (len > 0) { + int plen = PAGE_SIZE - (offset & (PAGE_SIZE-1)); + if (len < plen) + plen = len; + + /* check for local buf overflow */ + if (*left == 0) + return -EINVAL; + + res->lnb_file_offset = offset; + res->lnb_len = plen; + LASSERT((res->lnb_file_offset & ~PAGE_MASK) + + res->lnb_len <= PAGE_SIZE); + + if (ispersistent && + ((res->lnb_file_offset >> PAGE_SHIFT) < + ECHO_PERSISTENT_PAGES)) { + res->lnb_page = + echo_persistent_pages[res->lnb_file_offset >> + PAGE_SHIFT]; + /* Take extra ref so __free_pages() can be called OK */ + get_page(res->lnb_page); + } else { + res->lnb_page = alloc_page(gfp_mask); + if (res->lnb_page == NULL) { + CERROR("can't get page for id " DOSTID"\n", + POSTID(&obj->ioo_oid)); + return -ENOMEM; + } + } + + CDEBUG(D_PAGE, "$$$$ get page %p @ %llu for %d\n", + res->lnb_page, res->lnb_file_offset, res->lnb_len); + + if (cmd & OBD_BRW_READ) + res->lnb_rc = res->lnb_len; + + if (debug_setup) + echo_page_debug_setup(res->lnb_page, cmd, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->lnb_len); + + offset += plen; + len -= plen; + res++; + + (*left)--; + (*pages)++; + } + + return 0; +} + +static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *rb, int *pgs, + struct niobuf_local *lb, int verify) +{ + struct niobuf_local *res = lb; + u64 start = rb->rnb_offset >> PAGE_SHIFT; + u64 end = (rb->rnb_offset + rb->rnb_len + PAGE_SIZE - 1) >> + PAGE_SHIFT; + int count = (int)(end - start); + int rc = 0; + int i; + + for (i = 0; i < count; i++, (*pgs) ++, res++) { + struct page *page = res->lnb_page; + void *addr; + + if (page == NULL) { + CERROR("null page objid %llu:%p, buf %d/%d\n", + ostid_id(&obj->ioo_oid), page, i, + obj->ioo_bufcnt); + return -EFAULT; + } + + addr = kmap(page); + + CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@%llu\n", + res->lnb_page, addr, res->lnb_file_offset); + + if (verify) { + int vrc = echo_page_debug_check(page, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->lnb_len); + /* check all the pages always */ + if (vrc != 0 && rc == 0) + rc = vrc; + } + + kunmap(page); + /* NB see comment above regarding persistent pages */ + __free_page(page); + } + + return rc; +} + +static int echo_preprw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *res) +{ + struct obd_device *obd; + int tot_bytes = 0; + int rc = 0; + int i, left; + ENTRY; + + obd = export->exp_obd; + if (obd == NULL) + RETURN(-EINVAL); + + /* Temp fix to stop falling foul of osc_announce_cached() */ + oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT); + + memset(res, 0, sizeof(*res) * *pages); + + CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n", + cmd == OBD_BRW_READ ? 
"reading" : "writing", objcount, *pages); + + left = *pages; + *pages = 0; + + for (i = 0; i < objcount; i++, obj++) { + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) { + + rc = echo_map_nb_to_lb(oa, obj, nb, pages, + res + *pages, cmd, &left); + if (rc) + GOTO(preprw_cleanup, rc); + + tot_bytes += nb->rnb_len; + } + } + + atomic_add(*pages, &obd->u.echo.eo_prep); + + if (cmd & OBD_BRW_READ) + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES, + tot_bytes); + else + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + tot_bytes); + + CDEBUG(D_PAGE, "%d pages allocated after prep\n", + atomic_read(&obd->u.echo.eo_prep)); + + RETURN(0); + +preprw_cleanup: + /* It is possible that we would rather handle errors by allow + * any already-set-up pages to complete, rather than tearing them + * all down again. I believe that this is what the in-kernel + * prep/commit operations do. + */ + CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount); + for (i = 0; i < *pages; i++) { + kunmap(res[i].lnb_page); + /* NB if this is a persistent page, __free_page() will just + * lose the extra ref gained above */ + __free_page(res[i].lnb_page); + res[i].lnb_page = NULL; + atomic_dec(&obd->u.echo.eo_prep); + } + + return rc; +} + +static int echo_commitrw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rb, int niocount, + struct niobuf_local *res, int rc) +{ + struct obd_device *obd; + int pgs = 0; + int i; + ENTRY; + + obd = export->exp_obd; + if (obd == NULL) + RETURN(-EINVAL); + + if (rc) + GOTO(commitrw_cleanup, rc); + + if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) { + CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n", + objcount, niocount); + } else { + CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n", + objcount, niocount); + } + + if (niocount && res == NULL) { + CERROR("NULL res niobuf with niocount %d\n", niocount); + RETURN(-EINVAL); + } + + for (i = 0; i < objcount; i++, obj++) { + int verify = (rc == 0 && + ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) { + int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs], + verify); + if (vrc == 0) + continue; + + if (vrc == -EFAULT) + GOTO(commitrw_cleanup, rc = vrc); + + if (rc == 0) + rc = vrc; + } + + } + + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CDEBUG(D_PAGE, "%d pages remain after commit\n", + atomic_read(&obd->u.echo.eo_prep)); + RETURN(rc); + +commitrw_cleanup: + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CERROR("cleaning up %d pages (%d obdos)\n", + niocount - pgs - 1, objcount); + + while (pgs < niocount) { + struct page *page = res[pgs++].lnb_page; + + if (page == NULL) + continue; + + /* NB see comment above regarding persistent pages */ + __free_page(page); + atomic_dec(&obd->u.echo.eo_prep); + } + return rc; +} + +LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); +static struct lprocfs_vars lprocfs_echo_obd_vars[] = { + { .name = "uuid", + .fops = &echo_uuid_fops }, + { NULL } +}; + +struct obd_ops echo_obd_ops = { + .o_owner = THIS_MODULE, + .o_connect = echo_connect, + .o_disconnect = echo_disconnect, + .o_init_export = echo_init_export, + .o_destroy_export = echo_destroy_export, + .o_preprw = echo_preprw, + .o_commitrw = echo_commitrw, +}; + +/** + * Echo Server request handler for OST_CREATE RPC. + * + * This is part of request processing. 
Its simulates the object + * creation on OST. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_create_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + struct obdo *rep_oa; + + ENTRY; + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + if (!(oa->o_mode & S_IFMT)) { + CERROR("%s: no type is set in obdo!\n", + tsi->tsi_exp->exp_obd->obd_name); + RETURN(-ENOENT); + } + + if (!(oa->o_valid & OBD_MD_FLTYPE)) { + CERROR("%s: invalid o_valid in obdo: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + rep_oa = &repbody->oa; + + if (!fid_seq_is_echo(ostid_seq(&oa->o_oi))) { + CERROR("%s: invalid seq %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, ostid_seq(&oa->o_oi)); + return -EINVAL; + } + + ostid_set_seq_echo(&rep_oa->o_oi); + ostid_set_id(&rep_oa->o_oi, echo_next_id(obd)); + + CDEBUG(D_INFO, "%s: Create object "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&rep_oa->o_oi)); + + rep_oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_DESTROY RPC. + * + * This is Echo Server part of request handling. It simulates the objects + * destroy on OST. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_destroy_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + u64 oid; + + ENTRY; + + oid = ostid_id(&oa->o_oi); + LASSERT(oid != 0); + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + + if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || + ostid_id(&oa->o_oi) < ECHO_INIT_OID) { + CERROR("%s: bad objid to destroy: "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + CDEBUG(D_INFO, "%s: Destroy object "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); + + repbody->oa.o_oi = oa->o_oi; + RETURN(0); +} + +/** + * Echo Server request handler for OST_GETATTR RPC. + * + * This is Echo Server part of request handling. It returns an object + * attributes to the client. All objects have the same attributes in + * Echo Server. 
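+ * The attributes come from the single cached obdo in
+ * obd->u.echo.eo_oa, which esd_setattr_hdl() below overwrites, so a
+ * GETATTR simply reflects whatever the last SETATTR stored.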
+ * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_getattr_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + repbody->oa.o_oi = oa->o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obdo_cpy_md(&repbody->oa, &obd->u.echo.eo_oa, oa->o_valid); + + repbody->oa.o_valid |= OBD_MD_FLFLAGS; + repbody->oa.o_flags = OBD_FL_FLUSH; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_SETATTR RPC. + * + * This is Echo Server part of request handling. It sets common + * attributes from request to the Echo Server objects. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_setattr_hdl(struct tgt_session_info *tsi) +{ + struct ost_body *body = tsi->tsi_ost_body; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(body->oa.o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, + body->oa.o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + repbody->oa.o_oi = body->oa.o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obd->u.echo.eo_oa = body->oa; + + RETURN(0); +} + +#define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET +#define OBD_FAIL_OST_WRITE_NET OBD_FAIL_OST_BRW_NET +#define OST_BRW_READ OST_READ +#define OST_BRW_WRITE OST_WRITE + +/** + * Table of Echo Server specific request handlers + * + * This table contains all opcodes accepted by Echo Server and + * specifies handlers for them. The tgt_request_handler() + * uses such table from each target to process incoming + * requests. 
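+ * Opcodes that are not listed here (generic OBD, LDLM and SEC
+ * requests) are routed through the shared handler tables referenced
+ * from esd_common_slice below.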
+ */ +static struct tgt_handler esd_tgt_handlers[] = { +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_CONNECT, tgt_connect, + &RQF_CONNECT, LUSTRE_OBD_VERSION), +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_DISCONNECT, tgt_disconnect, + &RQF_OST_DISCONNECT, LUSTRE_OBD_VERSION), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_GETATTR, esd_getattr_hdl), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR, OST_SETATTR, + esd_setattr_hdl), +TGT_OST_HDL(HABEO_REFERO | MUTABOR, OST_CREATE, esd_create_hdl), +TGT_OST_HDL(HABEO_REFERO | MUTABOR, OST_DESTROY, esd_destroy_hdl), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ, tgt_brw_read), +TGT_OST_HDL(HABEO_CORPUS | MUTABOR, OST_BRW_WRITE, tgt_brw_write), +}; + +static struct tgt_opc_slice esd_common_slice[] = { + { + .tos_opc_start = OST_FIRST_OPC, + .tos_opc_end = OST_LAST_OPC, + .tos_hs = esd_tgt_handlers + }, + { + .tos_opc_start = OBD_FIRST_OPC, + .tos_opc_end = OBD_LAST_OPC, + .tos_hs = tgt_obd_handlers + }, + { + .tos_opc_start = LDLM_FIRST_OPC, + .tos_opc_end = LDLM_LAST_OPC, + .tos_hs = tgt_dlm_handlers + }, + { + .tos_opc_start = SEC_FIRST_OPC, + .tos_opc_end = SEC_LAST_OPC, + .tos_hs = tgt_sec_ctx_handlers + }, + { + .tos_hs = NULL + } +}; + +/** + * lu_device_operations matrix for ECHO SRV device is NULL, + * this device is just serving incoming requests immediately + * without building a stack of lu_devices. + */ +static struct lu_device_operations echo_srv_lu_ops = { 0 }; + +/** + * Initialize Echo Server device with parameters in the config log \a cfg. + * + * This is the main starting point of Echo Server initialization. It fills all + * parameters with their initial values and starts Echo Server. + * + * \param[in] env execution environment + * \param[in] m Echo Server device + * \param[in] ldt LU device type of Echo Server + * \param[in] cfg configuration log + * + * \retval 0 if successful + * \retval negative value on error + */ +static int echo_srv_init0(const struct lu_env *env, + struct echo_srv_device *esd, + struct lu_device_type *ldt, struct lustre_cfg *cfg) +{ + const char *dev = lustre_cfg_string(cfg, 0); + struct obd_device *obd; + char ns_name[48]; + int rc; + + ENTRY; + + obd = class_name2obd(dev); + if (obd == NULL) { + CERROR("Cannot find obd with name %s\n", dev); + RETURN(-ENODEV); + } + + spin_lock_init(&obd->u.echo.eo_lock); + obd->u.echo.eo_lastino = ECHO_INIT_OID; + + esd->esd_dev.ld_ops = &echo_srv_lu_ops; + esd->esd_dev.ld_obd = obd; + /* set this lu_device to obd, because error handling need it */ + obd->obd_lu_dev = &esd->esd_dev; + + /* No connection accepted until configurations will finish */ + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 1; + spin_unlock(&obd->obd_dev_lock); + + /* non-replayable target */ + obd->obd_replayable = 0; + + snprintf(ns_name, sizeof(ns_name), "echotgt-%s", obd->obd_uuid.uuid); + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, + LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_MODEST, + LDLM_NS_TYPE_OST); + if (obd->obd_namespace == NULL) + RETURN(-ENOMEM); + + obd->obd_vars = lprocfs_echo_obd_vars; + if (!lprocfs_obd_setup(obd, true) && + lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "read_bytes", "bytes"); + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "write_bytes", "bytes"); + } + + ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, + "echo_ldlm_cb_client", &obd->obd_ldlm_client); + + rc = tgt_init(env, &esd->esd_lut, obd, 
NULL, esd_common_slice, + OBD_FAIL_OST_ALL_REQUEST_NET, + OBD_FAIL_OST_ALL_REPLY_NET); + if (rc) + GOTO(err_out, rc); + + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 0; + spin_unlock(&obd->obd_dev_lock); + + RETURN(0); + +err_out: + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + RETURN(rc); +} + +/** + * Stop the Echo Server device. + * + * This function stops the Echo Server device and all its subsystems. + * This is the end of Echo Server lifecycle. + * + * \param[in] env execution environment + * \param[in] esd ESD device + */ +static void echo_srv_fini(const struct lu_env *env, + struct echo_srv_device *esd) +{ + struct obd_device *obd = echo_srv_obd(esd); + struct lu_device *d = &esd->esd_dev; + int leaked; + + ENTRY; + + class_disconnect_exports(obd); + if (obd->obd_namespace != NULL) + ldlm_namespace_free_prior(obd->obd_namespace, NULL, + obd->obd_force); + + obd_exports_barrier(obd); + obd_zombie_barrier(); + + tgt_fini(env, &esd->esd_lut); + + if (obd->obd_namespace != NULL) { + ldlm_namespace_free_post(obd->obd_namespace); + obd->obd_namespace = NULL; + } + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + + leaked = atomic_read(&obd->u.echo.eo_prep); + if (leaked != 0) + CERROR("%d prep/commitrw pages leaked\n", leaked); + + LASSERT(atomic_read(&d->ld_ref) == 0); + EXIT; +} + +/** + * Implementation of lu_device_type_operations::ldto_device_fini. + * + * Finalize device. Dual to echo_srv_device_init(). It is called from + * obd_precleanup() and stops the current device. + * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + ENTRY; + echo_srv_fini(env, echo_srv_dev(d)); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_free. + * + * Free Echo Server device. Dual to echo_srv_device_alloc(). + * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_srv_device *esd = echo_srv_dev(d); + + lu_device_fini(&esd->esd_dev); + OBD_FREE_PTR(esd); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_alloc. + * + * This function allocates the new Echo Server device. It is called from + * obd_setup() if OBD device had lu_device_type defined. 
+ * + * \param[in] env execution environment + * \param[in] t lu_device_type of ESD device + * \param[in] cfg configuration log + * + * \retval pointer to the lu_device of just allocated OFD + * \retval ERR_PTR of return value on error + */ +static struct lu_device *echo_srv_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct echo_srv_device *esd; + struct lu_device *l; + int rc; + + OBD_ALLOC_PTR(esd); + if (esd == NULL) + return ERR_PTR(-ENOMEM); + + l = &esd->esd_dev; + lu_device_init(l, t); + rc = echo_srv_init0(env, esd, t, cfg); + if (rc != 0) { + echo_srv_device_free(env, l); + l = ERR_PTR(rc); + } + + return l; +} + +static const struct lu_device_type_operations echo_srv_type_ops = { + .ldto_device_alloc = echo_srv_device_alloc, + .ldto_device_free = echo_srv_device_free, + .ldto_device_fini = echo_srv_device_fini +}; + +struct lu_device_type echo_srv_type = { + .ldt_tags = LU_DEVICE_DT, + .ldt_name = LUSTRE_ECHO_NAME, + .ldt_ops = &echo_srv_type_ops, + .ldt_ctx_tags = LCT_DT_THREAD, +}; + +void echo_persistent_pages_fini(void) +{ + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) + if (echo_persistent_pages[i] != NULL) { + __free_page(echo_persistent_pages[i]); + echo_persistent_pages[i] = NULL; + } +} + +int echo_persistent_pages_init(void) +{ + struct page *pg; + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) { + gfp_t gfp_mask = (i < ECHO_PERSISTENT_PAGES/2) ? + GFP_KERNEL : GFP_HIGHUSER; + + pg = alloc_page(gfp_mask); + if (pg == NULL) { + echo_persistent_pages_fini(); + return -ENOMEM; + } + + memset(kmap(pg), 0, PAGE_SIZE); + kunmap(pg); + + echo_persistent_pages[i] = pg; + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c new file mode 100644 index 0000000000000..b9357e77b980f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -0,0 +1,3139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include + +#define ETI_NAME_LEN 20 + +#endif /* HAVE_SERVER_SUPPORT */ + +#include "echo_internal.h" + +/** \defgroup echo_client Echo Client + * @{ + */ + +struct echo_device { + struct cl_device ed_cl; + struct echo_client_obd *ed_ec; + + struct cl_site ed_site_myself; + struct lu_site *ed_site; + struct lu_device *ed_next; + int ed_next_ismd; + struct lu_client_seq *ed_cl_seq; +#ifdef HAVE_SERVER_SUPPORT + struct local_oid_storage *ed_los; + struct lu_fid ed_root_fid; +#endif /* HAVE_SERVER_SUPPORT */ +}; + +struct echo_object { + struct cl_object eo_cl; + struct cl_object_header eo_hdr; + struct echo_device *eo_dev; + struct list_head eo_obj_chain; + struct lov_oinfo *eo_oinfo; + atomic_t eo_npages; + int eo_deleted; +}; + +struct echo_object_conf { + struct cl_object_conf eoc_cl; + struct lov_oinfo **eoc_oinfo; +}; + +struct echo_page { + struct cl_page_slice ep_cl; + struct mutex ep_lock; +}; + +struct echo_lock { + struct cl_lock_slice el_cl; + struct list_head el_chain; + struct echo_object *el_object; + __u64 el_cookie; + atomic_t el_refcount; +}; + +#ifdef HAVE_SERVER_SUPPORT +static const char echo_md_root_dir_name[] = "ROOT_ECHO"; + +/** + * In order to use the values of members in struct mdd_device, + * we define an alias structure here. + */ +struct echo_md_device { + struct md_device emd_md_dev; + struct obd_export *emd_child_exp; + struct dt_device *emd_child; + struct dt_device *emd_bottom; + struct lu_fid emd_root_fid; + struct lu_fid emd_local_root_fid; +}; +#endif /* HAVE_SERVER_SUPPORT */ + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, + struct lustre_cfg *lcfg); +static int echo_client_cleanup(struct obd_device *obddev); + + +/** \defgroup echo_helpers Helper functions + * @{ + */ +static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) +{ + return container_of0(dev, struct echo_device, ed_cl); +} + +static inline struct cl_device *echo_dev2cl(struct echo_device *d) +{ + return &d->ed_cl; +} + +static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) +{ + return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); +} + +static inline struct cl_object *echo_obj2cl(struct echo_object *eco) +{ + return &eco->eo_cl; +} + +static inline struct echo_object *cl2echo_obj(const struct cl_object *o) +{ + return container_of(o, struct echo_object, eo_cl); +} + +static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) +{ + return container_of(s, struct echo_page, ep_cl); +} + +static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) +{ + return container_of(s, struct echo_lock, el_cl); +} + +static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) +{ + return ecl->el_cl.cls_lock; +} + +static struct lu_context_key echo_thread_key; +static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) +{ + struct echo_thread_info *info; + info = lu_context_key_get(&env->le_ctx, &echo_thread_key); + LASSERT(info != NULL); + return info; +} + +static inline +struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) +{ + return container_of(c, struct echo_object_conf, eoc_cl); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline struct echo_md_device *lu2emd_dev(struct lu_device *d) +{ + return 
container_of0(d, struct echo_md_device, emd_md_dev.md_lu_dev); +} + +static inline struct lu_device *emd2lu_dev(struct echo_md_device *d) +{ + return &d->emd_md_dev.md_lu_dev; +} + +static inline struct seq_server_site *echo_md_seq_site(struct echo_md_device *d) +{ + return emd2lu_dev(d)->ld_site->ld_seq_site; +} + +static inline struct obd_device *emd2obd_dev(struct echo_md_device *d) +{ + return d->emd_md_dev.md_lu_dev.ld_obd; +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** @} echo_helpers */ + +static int cl_echo_object_put(struct echo_object *eco); +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async); + +struct echo_thread_info { + struct echo_object_conf eti_conf; + struct lustre_md eti_md; + + struct cl_2queue eti_queue; + struct cl_io eti_io; + struct cl_lock eti_lock; + struct lu_fid eti_fid; + struct lu_fid eti_fid2; +#ifdef HAVE_SERVER_SUPPORT + struct md_op_spec eti_spec; + struct lov_mds_md_v3 eti_lmm; + struct lov_user_md_v3 eti_lum; + struct md_attr eti_ma; + struct lu_name eti_lname; + /* per-thread values, can be re-used */ + void *eti_big_lmm; /* may be vmalloc'd */ + int eti_big_lmmsize; + char eti_name[ETI_NAME_LEN]; + struct lu_buf eti_buf; + /* If we want to test large ACL, then need to enlarge the buffer. */ + char eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE_OLD]; +#endif +}; + +/* No session used right now */ +struct echo_session_info { + unsigned long dummy; +}; + +static struct kmem_cache *echo_lock_kmem; +static struct kmem_cache *echo_object_kmem; +static struct kmem_cache *echo_thread_kmem; +static struct kmem_cache *echo_session_kmem; +/* static struct kmem_cache *echo_req_kmem; */ + +static struct lu_kmem_descr echo_caches[] = { + { + .ckd_cache = &echo_lock_kmem, + .ckd_name = "echo_lock_kmem", + .ckd_size = sizeof (struct echo_lock) + }, + { + .ckd_cache = &echo_object_kmem, + .ckd_name = "echo_object_kmem", + .ckd_size = sizeof (struct echo_object) + }, + { + .ckd_cache = &echo_thread_kmem, + .ckd_name = "echo_thread_kmem", + .ckd_size = sizeof (struct echo_thread_info) + }, + { + .ckd_cache = &echo_session_kmem, + .ckd_name = "echo_session_kmem", + .ckd_size = sizeof (struct echo_session_info) + }, + { + .ckd_cache = NULL + } +}; + +/** \defgroup echo_page Page operations + * + * Echo page operations. 
+ * + * @{ + */ +static int echo_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock) +{ + struct echo_page *ep = cl2echo_page(slice); + + if (!nonblock) + mutex_lock(&ep->ep_lock); + else if (!mutex_trylock(&ep->ep_lock)) + return -EAGAIN; + return 0; +} + +static void echo_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct echo_page *ep = cl2echo_page(slice); + + LASSERT(mutex_is_locked(&ep->ep_lock)); + mutex_unlock(&ep->ep_lock); +} + +static void echo_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + cl_page_delete(env, slice->cpl_page); +} + +static int echo_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + if (mutex_is_locked(&cl2echo_page(slice)->ep_lock)) + return -EBUSY; + return -ENODATA; +} + +static void echo_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + LASSERT(slice->cpl_page->cp_sync_io != NULL); +} + +static void echo_page_fini(const struct lu_env *env, + struct cl_page_slice *slice, + struct pagevec *pvec) +{ + struct echo_object *eco = cl2echo_obj(slice->cpl_obj); + ENTRY; + + atomic_dec(&eco->eo_npages); + put_page(slice->cpl_page->cp_vmpage); + EXIT; +} + +static int echo_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + return 0; +} + +static int echo_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct echo_page *ep = cl2echo_page(slice); + + (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n", + ep, mutex_is_locked(&ep->ep_lock), + slice->cpl_page->cp_vmpage); + return 0; +} + +static const struct cl_page_operations echo_page_ops = { + .cpo_own = echo_page_own, + .cpo_disown = echo_page_disown, + .cpo_discard = echo_page_discard, + .cpo_fini = echo_page_fini, + .cpo_print = echo_page_print, + .cpo_is_vmlocked = echo_page_is_vmlocked, + .io = { + [CRT_READ] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + } + } +}; +/** @} echo_page */ + +/** \defgroup echo_lock Locking + * + * echo lock operations + * + * @{ + */ +static void echo_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); + OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem); +} + +static struct cl_lock_operations echo_lock_ops = { + .clo_fini = echo_lock_fini, +}; + +/** @} echo_lock */ + +/** \defgroup echo_cl_ops cl_object operations + * + * operations for cl_object + * + * @{ + */ +static int echo_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct echo_page *ep = cl_object_page_slice(obj, page); + struct echo_object *eco = cl2echo_obj(obj); + ENTRY; + + get_page(page->cp_vmpage); + mutex_init(&ep->ep_lock); + cl_page_slice_add(page, &ep->ep_cl, obj, index, &echo_page_ops); + atomic_inc(&eco->eo_npages); + RETURN(0); +} + +static int echo_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + return 0; +} + +static int echo_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct echo_lock *el; + ENTRY; + + OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, 
GFP_NOFS); + if (el != NULL) { + cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); + el->el_object = cl2echo_obj(obj); + INIT_LIST_HEAD(&el->el_chain); + atomic_set(&el->el_refcount, 0); + } + RETURN(el == NULL ? -ENOMEM : 0); +} + +static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + return 0; +} + +static const struct cl_object_operations echo_cl_obj_ops = { + .coo_page_init = echo_page_init, + .coo_lock_init = echo_lock_init, + .coo_io_init = echo_io_init, + .coo_conf_set = echo_conf_set +}; +/** @} echo_cl_ops */ + +/** \defgroup echo_lu_ops lu_object operations + * + * operations for echo lu object. + * + * @{ + */ +static int echo_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + ENTRY; + + if (ed->ed_next) { + struct lu_object *below; + struct lu_device *under; + + under = ed->ed_next; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, + under); + if (below == NULL) + RETURN(-ENOMEM); + lu_object_add(obj, below); + } + + if (!ed->ed_next_ismd) { + const struct cl_object_conf *cconf = lu2cl_conf(conf); + struct echo_object_conf *econf = cl2echo_conf(cconf); + + LASSERT(econf->eoc_oinfo != NULL); + + /* Transfer the oinfo pointer to eco that it won't be + * freed. */ + eco->eo_oinfo = *econf->eoc_oinfo; + *econf->eoc_oinfo = NULL; + } else { + eco->eo_oinfo = NULL; + } + + eco->eo_dev = ed; + atomic_set(&eco->eo_npages, 0); + cl_object_page_init(lu2cl(obj), sizeof(struct echo_page)); + + spin_lock(&ec->ec_lock); + list_add_tail(&eco->eo_obj_chain, &ec->ec_objects); + spin_unlock(&ec->ec_lock); + + RETURN(0); +} + +static void echo_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec; + + ENTRY; + + /* object delete called unconditolally - layer init or not */ + if (eco->eo_dev == NULL) + return; + + ec = eco->eo_dev->ed_ec; + + LASSERT(atomic_read(&eco->eo_npages) == 0); + + spin_lock(&ec->ec_lock); + list_del_init(&eco->eo_obj_chain); + spin_unlock(&ec->ec_lock); + + if (eco->eo_oinfo != NULL) + OBD_FREE_PTR(eco->eo_oinfo); +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + + ENTRY; + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + + OBD_SLAB_FREE_PTR(eco, echo_object_kmem); + EXIT; +} + +static int echo_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct echo_object *obj = cl2echo_obj(lu2cl(o)); + + return (*p)(env, cookie, "echoclient-object@%p", obj); +} + +static const struct lu_object_operations echo_lu_obj_ops = { + .loo_object_init = echo_object_init, + .loo_object_delete = echo_object_delete, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL +}; +/** @} echo_lu_ops */ + +/** \defgroup echo_lu_dev_ops lu_device operations + * + * Operations for echo lu device. + * + * @{ + */ +static struct lu_object *echo_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + struct echo_object *eco; + struct lu_object *obj = NULL; + ENTRY; + + /* we're the top dev. 
*/ + LASSERT(hdr == NULL); + OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, GFP_NOFS); + if (eco != NULL) { + struct cl_object_header *hdr = &eco->eo_hdr; + + obj = &echo_obj2cl(eco)->co_lu; + cl_object_header_init(hdr); + hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); + + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + eco->eo_cl.co_ops = &echo_cl_obj_ops; + obj->lo_ops = &echo_lu_obj_ops; + } + RETURN(obj); +} + +static struct lu_device_operations echo_device_lu_ops = { + .ldo_object_alloc = echo_object_alloc, +}; + +/** @} echo_lu_dev_ops */ + +/** \defgroup echo_init Setup and teardown + * + * Init and fini functions for echo client. + * + * @{ + */ +static int echo_site_init(const struct lu_env *env, struct echo_device *ed) +{ + struct cl_site *site = &ed->ed_site_myself; + int rc; + + /* initialize site */ + rc = cl_site_init(site, &ed->ed_cl); + if (rc) { + CERROR("Cannot initialize site for echo client(%d)\n", rc); + return rc; + } + + rc = lu_site_init_finish(&site->cs_lu); + if (rc) { + cl_site_fini(site); + return rc; + } + + ed->ed_site = &site->cs_lu; + return 0; +} + +static void echo_site_fini(const struct lu_env *env, struct echo_device *ed) +{ + if (ed->ed_site) { + if (!ed->ed_next_ismd) + lu_site_fini(ed->ed_site); + ed->ed_site = NULL; + } +} + +static void *echo_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void echo_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, echo_thread_kmem); +} + +static struct lu_context_key echo_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = echo_thread_key_init, + .lct_fini = echo_thread_key_fini, +}; + +static void *echo_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void echo_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_session_info *session = data; + OBD_SLAB_FREE_PTR(session, echo_session_kmem); +} + +static struct lu_context_key echo_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = echo_session_key_init, + .lct_fini = echo_session_key_fini, +}; + +LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key); + +#ifdef HAVE_SERVER_SUPPORT +# define ECHO_SEQ_WIDTH 0xffffffff +static int echo_fid_init(struct echo_device *ed, char *obd_name, + struct seq_server_site *ss) +{ + char *prefix; + int rc; + ENTRY; + + OBD_ALLOC_PTR(ed->ed_cl_seq); + if (ed->ed_cl_seq == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (prefix == NULL) + GOTO(out_free_seq, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(ed->ed_cl_seq, NULL, + LUSTRE_SEQ_METADATA, + prefix, ss->ss_server_seq); + ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH; + OBD_FREE(prefix, MAX_OBD_NAME + 5); + if (rc) + GOTO(out_free_seq, rc); + + RETURN(0); + +out_free_seq: + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + RETURN(rc); +} + +static int echo_fid_fini(struct obd_device *obddev) +{ + struct echo_device *ed = 
obd2echo_dev(obddev); + ENTRY; + + if (ed->ed_cl_seq != NULL) { + seq_client_fini(ed->ed_cl_seq); + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + } + + RETURN(0); +} + +static void echo_ed_los_fini(const struct lu_env *env, struct echo_device *ed) +{ + ENTRY; + + if (ed != NULL && ed->ed_next_ismd && ed->ed_los != NULL) { + local_oid_storage_fini(env, ed->ed_los); + ed->ed_los = NULL; + } +} + +static int +echo_md_local_file_create(const struct lu_env *env, struct echo_md_device *emd, + struct local_oid_storage *los, + const struct lu_fid *pfid, const char *name, + __u32 mode, struct lu_fid *fid) +{ + struct dt_object *parent = NULL; + struct dt_object *dto = NULL; + int rc = 0; + ENTRY; + + LASSERT(!fid_is_zero(pfid)); + parent = dt_locate(env, emd->emd_bottom, pfid); + if (unlikely(IS_ERR(parent))) + RETURN(PTR_ERR(parent)); + + /* create local file with @fid */ + dto = local_file_find_or_create_with_fid(env, emd->emd_bottom, fid, + parent, name, mode); + if (IS_ERR(dto)) + GOTO(out_put, rc = PTR_ERR(dto)); + + *fid = *lu_object_fid(&dto->do_lu); + /* since stack is not fully set up the local_storage uses own stack + * and we should drop its object from cache */ + dt_object_put_nocache(env, dto); + + EXIT; +out_put: + dt_object_put(env, parent); + RETURN(rc); +} + +static int +echo_md_root_get(const struct lu_env *env, struct echo_md_device *emd, + struct echo_device *ed) +{ + struct lu_fid fid; + int rc = 0; + ENTRY; + + /* Setup local dirs */ + fid.f_seq = FID_SEQ_LOCAL_NAME; + fid.f_oid = 1; + fid.f_ver = 0; + rc = local_oid_storage_init(env, emd->emd_bottom, &fid, &ed->ed_los); + if (rc != 0) + RETURN(rc); + + lu_echo_root_fid(&fid); + if (echo_md_seq_site(emd)->ss_node_id == 0) { + rc = echo_md_local_file_create(env, emd, ed->ed_los, + &emd->emd_local_root_fid, + echo_md_root_dir_name, S_IFDIR | + S_IRUGO | S_IWUSR | S_IXUGO, + &fid); + if (rc != 0) { + CERROR("%s: create md echo root fid failed: rc = %d\n", + emd2obd_dev(emd)->obd_name, rc); + GOTO(out_los, rc); + } + } + ed->ed_root_fid = fid; + + RETURN(0); +out_los: + echo_ed_los_fini(env, ed); + + RETURN(rc); +} +#endif /* HAVE_SERVER_SUPPORT */ + +static struct lu_device *echo_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *next; + struct echo_device *ed; + struct cl_device *cd; + struct obd_device *obd = NULL; /* to keep compiler happy */ + struct obd_device *tgt; + const char *tgt_type_name; + int rc; + int cleanup = 0; + ENTRY; + + OBD_ALLOC_PTR(ed); + if (ed == NULL) + GOTO(out, rc = -ENOMEM); + + cleanup = 1; + cd = &ed->ed_cl; + rc = cl_device_init(cd, t); + if (rc) + GOTO(out, rc); + + cd->cd_lu_dev.ld_ops = &echo_device_lu_ops; + + cleanup = 2; + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + LASSERT(env != NULL); + + tgt = class_name2obd(lustre_cfg_string(cfg, 1)); + if (tgt == NULL) { + CERROR("Can not find tgt device %s\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -ENODEV); + } + + next = tgt->obd_lu_dev; + + if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + ed->ed_next_ismd = 1; + } else if (strcmp(tgt->obd_type->typ_name, LUSTRE_OST_NAME) == 0 || + strcmp(tgt->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) { + ed->ed_next_ismd = 0; + rc = echo_site_init(env, ed); + if (rc) + GOTO(out, rc); + } else { + GOTO(out, rc = -EINVAL); + } + + cleanup = 3; + + rc = echo_client_setup(env, obd, cfg); + if (rc) + GOTO(out, rc); + + ed->ed_ec = &obd->u.echo_client; + cleanup = 4; + + if (ed->ed_next_ismd) { 
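/*
 * Illustrative sketch only, not literal code from this patch: the branch
 * below handles the metadata case chosen a few lines above from the target
 * obd type.  Condensed into a hypothetical helper named
 * echo_pick_stacking(), the stacking decision made in echo_device_alloc()
 * amounts to:
 *
 *      static int echo_pick_stacking(struct echo_device *ed,
 *                                    struct obd_device *tgt)
 *      {
 *              const char *typ = tgt->obd_type->typ_name;
 *
 *              if (strcmp(typ, LUSTRE_MDT_NAME) == 0)
 *                      ed->ed_next_ismd = 1;   // metadata echo client
 *              else if (strcmp(typ, LUSTRE_OST_NAME) == 0 ||
 *                       strcmp(typ, LUSTRE_OSC_NAME) == 0)
 *                      ed->ed_next_ismd = 0;   // data echo client
 *              else
 *                      return -EINVAL;         // unsupported target type
 *              return 0;
 *      }
 *
 * In the data case the device builds its own cl_site via echo_site_init(),
 * as done above; in the metadata case it reuses the MDS lu_site and FID
 * client, as set up below.
 */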
+#ifdef HAVE_SERVER_SUPPORT + /* Suppose to connect to some Metadata layer */ + struct lu_site *ls = NULL; + struct lu_device *ld = NULL; + struct md_device *md = NULL; + struct echo_md_device *emd = NULL; + int found = 0; + + if (next == NULL) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + tgt_type_name = lustre_cfg_string(cfg, 2); + if (!tgt_type_name) { + CERROR("%s no type name for echo %s setup\n", + lustre_cfg_string(cfg, 1), + tgt->obd_type->typ_name); + GOTO(out, rc = -EINVAL); + } + + ls = next->ld_site; + + spin_lock(&ls->ls_ld_lock); + list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) { + if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) { + found = 1; + break; + } + } + spin_unlock(&ls->ls_ld_lock); + + if (found == 0) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + next = ld; + /* For MD echo client, it will use the site in MDS stack */ + ed->ed_site = ls; + ed->ed_cl.cd_lu_dev.ld_site = ls; + rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls)); + if (rc) { + CERROR("echo fid init error %d\n", rc); + GOTO(out, rc); + } + + md = lu2md_dev(next); + emd = lu2emd_dev(&md->md_lu_dev); + rc = echo_md_root_get(env, emd, ed); + if (rc != 0) { + CERROR("%s: get root error: rc = %d\n", + emd2obd_dev(emd)->obd_name, rc); + GOTO(out, rc); + } +#else /* !HAVE_SERVER_SUPPORT */ + CERROR("Local operations are NOT supported on client side. " + "Only remote operations are supported. Metadata client " + "must be run on server side.\n"); + GOTO(out, rc = -EOPNOTSUPP); +#endif /* HAVE_SERVER_SUPPORT */ + } else { + /* if echo client is to be stacked upon ost device, the next is + * NULL since ost is not a clio device so far */ + if (next != NULL && !lu_device_is_cl(next)) + next = NULL; + + tgt_type_name = tgt->obd_type->typ_name; + if (next != NULL) { + LASSERT(next != NULL); + if (next->ld_site != NULL) + GOTO(out, rc = -EBUSY); + + next->ld_site = ed->ed_site; + rc = next->ld_type->ldt_ops->ldto_device_init(env, next, + next->ld_type->ldt_name, + NULL); + if (rc) + GOTO(out, rc); + } else + LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0); + } + + ed->ed_next = next; + RETURN(&cd->cd_lu_dev); +out: + switch(cleanup) { + case 4: { + int rc2; + rc2 = echo_client_cleanup(obd); + if (rc2) + CERROR("Cleanup obd device %s error(%d)\n", + obd->obd_name, rc2); + } + fallthrough; + + case 3: + echo_site_fini(env, ed); + fallthrough; + case 2: + cl_device_fini(&ed->ed_cl); + fallthrough; + case 1: + OBD_FREE_PTR(ed); + fallthrough; + case 0: + default: + break; + } + return(ERR_PTR(rc)); +} + +static int echo_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + LBUG(); + return 0; +} + +static struct lu_device *echo_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct lu_device *next = ed->ed_next; + + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_fini(env, next); + return NULL; +} + +static void echo_lock_release(const struct lu_env *env, + struct echo_lock *ecl, + int still_used) +{ + struct cl_lock *clk = echo_lock2cl(ecl); + + cl_lock_release(env, clk); +} + +static struct lu_device *echo_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct lu_device *next = 
ed->ed_next; + + CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n", + ed, next); + + lu_site_purge(env, ed->ed_site, -1); + + /* check if there are objects still alive. + * It shouldn't have any object because lu_site_purge would cleanup + * all of cached objects. Anyway, probably the echo device is being + * parallelly accessed. + */ + spin_lock(&ec->ec_lock); + list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain) + eco->eo_deleted = 1; + spin_unlock(&ec->ec_lock); + + /* purge again */ + lu_site_purge(env, ed->ed_site, -1); + + CDEBUG(D_INFO, + "Waiting for the reference of echo object to be dropped\n"); + + /* Wait for the last reference to be dropped. */ + spin_lock(&ec->ec_lock); + while (!list_empty(&ec->ec_objects)) { + spin_unlock(&ec->ec_lock); + CERROR("echo_client still has objects at cleanup time, " + "wait for 1 second\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + lu_site_purge(env, ed->ed_site, -1); + spin_lock(&ec->ec_lock); + } + spin_unlock(&ec->ec_lock); + + LASSERT(list_empty(&ec->ec_locks)); + + CDEBUG(D_INFO, "No object exists, exiting...\n"); + + echo_client_cleanup(d->ld_obd); +#ifdef HAVE_SERVER_SUPPORT + echo_fid_fini(d->ld_obd); + echo_ed_los_fini(env, ed); +#endif + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_free(env, next); + + LASSERT(ed->ed_site == d->ld_site); + echo_site_fini(env, ed); + cl_device_fini(&ed->ed_cl); + OBD_FREE_PTR(ed); + + cl_env_cache_purge(~0); + + return NULL; +} + +static const struct lu_device_type_operations echo_device_type_ops = { + .ldto_init = echo_type_init, + .ldto_fini = echo_type_fini, + + .ldto_start = echo_type_start, + .ldto_stop = echo_type_stop, + + .ldto_device_alloc = echo_device_alloc, + .ldto_device_free = echo_device_free, + .ldto_device_init = echo_device_init, + .ldto_device_fini = echo_device_fini +}; + +static struct lu_device_type echo_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_ECHO_CLIENT_NAME, + .ldt_ops = &echo_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD, +}; +/** @} echo_init */ + +/** \defgroup echo_exports Exported operations + * + * exporting functions to echo client + * + * @{ + */ + +/* Interfaces to echo client obd device */ +static struct echo_object * +cl_echo_object_find(struct echo_device *d, const struct ost_id *oi) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct echo_object_conf *conf; + struct echo_object *eco; + struct cl_object *obj; + struct lov_oinfo *oinfo = NULL; + struct lu_fid *fid; + __u16 refcheck; + int rc; + ENTRY; + + LASSERTF(ostid_id(oi) != 0, DOSTID"\n", POSTID(oi)); + LASSERTF(ostid_seq(oi) == FID_SEQ_ECHO, DOSTID"\n", POSTID(oi)); + + /* Never return an object if the obd is to be freed. */ + if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) + RETURN(ERR_PTR(-ENODEV)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN((void *)env); + + info = echo_env_info(env); + conf = &info->eti_conf; + if (d->ed_next) { + OBD_ALLOC_PTR(oinfo); + if (oinfo == NULL) + GOTO(out, eco = ERR_PTR(-ENOMEM)); + + oinfo->loi_oi = *oi; + conf->eoc_cl.u.coc_oinfo = oinfo; + } + + /* If echo_object_init() is successful then ownership of oinfo + * is transferred to the object. 
*/ + conf->eoc_oinfo = &oinfo; + + fid = &info->eti_fid; + rc = ostid_to_fid(fid, oi, 0); + if (rc != 0) + GOTO(out, eco = ERR_PTR(rc)); + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); + if (IS_ERR(obj)) + GOTO(out, eco = (void*)obj); + + eco = cl2echo_obj(obj); + if (eco->eo_deleted) { + cl_object_put(env, obj); + eco = ERR_PTR(-EAGAIN); + } + +out: + if (oinfo != NULL) + OBD_FREE_PTR(oinfo); + + cl_env_put(env, &refcheck); + RETURN(eco); +} + +static int cl_echo_object_put(struct echo_object *eco) +{ + struct lu_env *env; + struct cl_object *obj = echo_obj2cl(eco); + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* an external function to kill an object? */ + if (eco->eo_deleted) { + struct lu_object_header *loh = obj->co_lu.lo_header; + LASSERT(&eco->eo_hdr == luh2coh(loh)); + set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); + } + + cl_object_put(env, obj); + cl_env_put(env, &refcheck); + RETURN(0); +} + +static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco, + u64 start, u64 end, int mode, + __u64 *cookie , __u32 enqflags) +{ + struct cl_io *io; + struct cl_lock *lck; + struct cl_object *obj; + struct cl_lock_descr *descr; + struct echo_thread_info *info; + int rc = -ENOMEM; + ENTRY; + + info = echo_env_info(env); + io = &info->eti_io; + lck = &info->eti_lock; + obj = echo_obj2cl(eco); + + memset(lck, 0, sizeof(*lck)); + descr = &lck->cll_descr; + descr->cld_obj = obj; + descr->cld_start = cl_index(obj, start); + descr->cld_end = cl_index(obj, end); + descr->cld_mode = mode == LCK_PW ? CLM_WRITE : CLM_READ; + descr->cld_enq_flags = enqflags; + io->ci_obj = obj; + + rc = cl_lock_request(env, io, lck); + if (rc == 0) { + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct echo_lock *el; + + el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); + spin_lock(&ec->ec_lock); + if (list_empty(&el->el_chain)) { + list_add(&el->el_chain, &ec->ec_locks); + el->el_cookie = ++ec->ec_unique; + } + atomic_inc(&el->el_refcount); + *cookie = el->el_cookie; + spin_unlock(&ec->ec_lock); + } + RETURN(rc); +} + +static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, + __u64 cookie) +{ + struct echo_client_obd *ec = ed->ed_ec; + struct echo_lock *ecl = NULL; + struct list_head *el; + int found = 0, still_used = 0; + ENTRY; + + LASSERT(ec != NULL); + spin_lock(&ec->ec_lock); + list_for_each(el, &ec->ec_locks) { + ecl = list_entry(el, struct echo_lock, el_chain); + CDEBUG(D_INFO, "ecl: %p, cookie: %#llx\n", ecl, ecl->el_cookie); + found = (ecl->el_cookie == cookie); + if (found) { + if (atomic_dec_and_test(&ecl->el_refcount)) + list_del_init(&ecl->el_chain); + else + still_used = 1; + break; + } + } + spin_unlock(&ec->ec_lock); + + if (!found) + RETURN(-ENOENT); + + echo_lock_release(env, ecl, still_used); + RETURN(0); +} + +static void echo_commit_callback(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + struct echo_thread_info *info; + struct cl_2queue *queue; + + info = echo_env_info(env); + LASSERT(io == &info->eti_io); + + queue = &info->eti_queue; + cl_page_list_add(&queue->c2_qout, page); +} + +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct cl_object *obj = echo_obj2cl(eco); + struct echo_device *ed = 
eco->eo_dev; + struct cl_2queue *queue; + struct cl_io *io; + struct cl_page *clp; + struct lustre_handle lh = { 0 }; + int page_size = cl_page_size(obj); + int rc; + int i; + __u16 refcheck; + ENTRY; + + LASSERT((offset & ~PAGE_MASK) == 0); + LASSERT(ed->ed_next != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + info = echo_env_info(env); + io = &info->eti_io; + queue = &info->eti_queue; + + cl_2queue_init(queue); + + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc < 0) + GOTO(out, rc); + LASSERT(rc == 0); + + + rc = cl_echo_enqueue0(env, eco, offset, + offset + npages * PAGE_SIZE - 1, + rw == READ ? LCK_PR : LCK_PW, &lh.cookie, + CEF_NEVER); + if (rc < 0) + GOTO(error_lock, rc); + + for (i = 0; i < npages; i++) { + LASSERT(pages[i]); + clp = cl_page_find(env, obj, cl_index(obj, offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + LASSERT(clp->cp_type == CPT_TRANSIENT); + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + cl_2queue_add(queue, clp); + + /* drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. */ + cl_page_put(env, clp); + cl_page_clip(env, clp, 0, page_size); + + offset += page_size; + } + + if (rc == 0) { + enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; + + async = async && (typ == CRT_WRITE); + if (async) + rc = cl_io_commit_async(env, io, &queue->c2_qin, + 0, PAGE_SIZE, + echo_commit_callback); + else + rc = cl_io_submit_sync(env, io, typ, queue, 0); + CDEBUG(D_INFO, "echo_client %s write returns %d\n", + async ? "async" : "sync", rc); + } + + cl_echo_cancel0(env, ed, lh.cookie); + EXIT; +error_lock: + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + return rc; +} +/** @} echo_exports */ + + +static u64 last_object_id; + +#ifdef HAVE_SERVER_SUPPORT +static inline void echo_md_build_name(struct lu_name *lname, char *name, + __u64 id) +{ + snprintf(name, ETI_NAME_LEN, "%llu", id); + lname->ln_name = name; + lname->ln_namelen = strlen(name); +} + +/* similar to mdt_attr_get_complex */ +static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + int rc; + + ENTRY; + + LASSERT(ma->ma_lmm_size > 0); + + LASSERT(ma->ma_need & (MA_LOV | MA_LMV)); + if (ma->ma_need & MA_LOV) + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV); + else + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LMV); + + if (rc < 0) + RETURN(rc); + + /* big_lmm may need to be grown */ + if (info->eti_big_lmmsize < rc) { + int size = size_roundup_power2(rc); + + if (info->eti_big_lmmsize > 0) { + /* free old buffer */ + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, + info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; + } + + OBD_ALLOC_LARGE(info->eti_big_lmm, size); + if (info->eti_big_lmm == NULL) + RETURN(-ENOMEM); + info->eti_big_lmmsize = size; + } + LASSERT(info->eti_big_lmmsize >= rc); + + info->eti_buf.lb_buf = info->eti_big_lmm; + info->eti_buf.lb_len = info->eti_big_lmmsize; + if (ma->ma_need & MA_LOV) + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV); + else + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LMV); + if (rc < 0) + RETURN(rc); + + if (ma->ma_need & MA_LOV) + ma->ma_valid |= 
MA_LOV; + else + ma->ma_valid |= MA_LMV; + + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = rc; + + RETURN(0); +} + +static int echo_attr_get_complex(const struct lu_env *env, + struct md_object *next, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_buf *buf = &info->eti_buf; + umode_t mode = lu_object_attr(&next->mo_lu); + int rc = 0, rc2; + + ENTRY; + + ma->ma_valid = 0; + + if (ma->ma_need & MA_INODE) { + rc = mo_attr_get(env, next, ma); + if (rc) + GOTO(out, rc); + ma->ma_valid |= MA_INODE; + } + + if ((ma->ma_need & MA_LOV) && (S_ISREG(mode) || S_ISDIR(mode))) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LOV; + } else if (rc2 == -ENODATA) { + /* no LOV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + + if ((ma->ma_need & MA_LMV) && S_ISDIR(mode)) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LMV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LMV; + } else if (rc2 == -ENODATA) { + /* no LMV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + +#ifdef CONFIG_FS_POSIX_ACL + if ((ma->ma_need & MA_ACL_DEF) && S_ISDIR(mode)) { + buf->lb_buf = ma->ma_acl; + buf->lb_len = ma->ma_acl_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT); + if (rc2 > 0) { + ma->ma_acl_size = rc2; + ma->ma_valid |= MA_ACL_DEF; + } else if (rc2 == -ENODATA) { + /* no ACLs */ + ma->ma_acl_size = 0; + } else { + GOTO(out, rc = rc2); + } + } +#endif +out: + CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = %#llx ma_lmm=%p\n", + rc, ma->ma_valid, ma->ma_lmm); + RETURN(rc); +} + +static int +echo_md_create_internal(const struct lu_env *env, struct echo_device *ed, + struct md_object *parent, struct lu_fid *fid, + struct lu_name *lname, struct md_op_spec *spec, + struct md_attr *ma) +{ + struct lu_object *ec_child, *child; + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid2 = &info->eti_fid2; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + int rc; + + ENTRY; + + rc = mdo_lookup(env, parent, lname, fid2, spec); + if (rc == 0) + return -EEXIST; + else if (rc != -ENOENT) + return rc; + + ec_child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, + fid, &conf); + if (IS_ERR(ec_child)) { + CERROR("Can not find the child "DFID": rc = %ld\n", PFID(fid), + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child "DFID"\n", PFID(fid)); + GOTO(out_put, rc = -EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + /* + * Do not perform lookup sanity check. We know that name does not exist. 
+ */ + spec->sp_cr_lookup = 0; + rc = mdo_create(env, parent, lname, lu2md(child), spec, ma); + if (rc) { + CERROR("Can not create child "DFID": rc = %d\n", PFID(fid), rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc = %d\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc); + EXIT; +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + ma->ma_lmm = (void *)&info->eti_lmm; + ma->ma_lmm_size = sizeof(info->eti_lmm); + } else { + LASSERT(info->eti_big_lmmsize); + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = info->eti_big_lmmsize; + } + + return 0; +} + +static int +echo_md_dir_stripe_choose(const struct lu_env *env, struct echo_device *ed, + struct lu_object *obj, const char *name, + unsigned int namelen, __u64 id, + struct lu_object **new_parent) +{ + struct echo_thread_info *info = echo_env_info(env); + struct md_attr *ma = &info->eti_ma; + struct lmv_mds_md_v1 *lmv; + struct lu_device *ld = ed->ed_next; + unsigned int idx; + struct lu_name tmp_ln_name; + struct lu_fid stripe_fid; + struct lu_object *stripe_obj; + int rc; + + LASSERT(obj != NULL); + LASSERT(S_ISDIR(obj->lo_header->loh_attr)); + + memset(ma, 0, sizeof(*ma)); + echo_set_lmm_size(env, ld, ma); + ma->ma_need = MA_LMV; + rc = echo_attr_get_complex(env, lu2md(obj), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(obj)), rc); + return rc; + } + + if (!(ma->ma_valid & MA_LMV)) { + *new_parent = obj; + return 0; + } + + lmv = (struct lmv_mds_md_v1 *)ma->ma_lmm; + if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1) { + rc = -EINVAL; + CERROR("Invalid mds md magic %x "DFID": rc = %d\n", + le32_to_cpu(lmv->lmv_magic), PFID(lu_object_fid(obj)), + rc); + return rc; + } + + if (name != NULL) { + tmp_ln_name.ln_name = name; + tmp_ln_name.ln_namelen = namelen; + } else { + LASSERT(id != -1); + echo_md_build_name(&tmp_ln_name, info->eti_name, id); + } + + idx = lmv_name_to_stripe_index(LMV_HASH_TYPE_FNV_1A_64, + le32_to_cpu(lmv->lmv_stripe_count), + tmp_ln_name.ln_name, tmp_ln_name.ln_namelen); + + LASSERT(idx < le32_to_cpu(lmv->lmv_stripe_count)); + fid_le_to_cpu(&stripe_fid, &lmv->lmv_stripe_fids[idx]); + + stripe_obj = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, &stripe_fid, + NULL); + if (IS_ERR(stripe_obj)) { + rc = PTR_ERR(stripe_obj); + CERROR("Can not find the parent "DFID": rc = %d\n", + PFID(&stripe_fid), rc); + return rc; + } + + *new_parent = lu_object_locate(stripe_obj->lo_header, ld->ld_type); + if (*new_parent == NULL) { + lu_object_put(env, stripe_obj); + RETURN(-ENXIO); + } + + return rc; +} + +static int echo_create_md_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + struct lu_fid *fid, + char *name, int namelen, + __u64 id, __u32 mode, int count, + int stripe_count, int stripe_offset) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_op_spec *spec = &info->eti_spec; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + rc = 
echo_md_dir_stripe_choose(env, ed, parent, name, namelen, + id, &new_parent); + if (rc != 0) + RETURN(rc); + + LASSERT(new_parent != NULL); + memset(ma, 0, sizeof(*ma)); + memset(spec, 0, sizeof(*spec)); + echo_set_lmm_size(env, ld, ma); + if (stripe_count != 0) { + spec->sp_cr_flags |= MDS_FMODE_WRITE; + if (stripe_count != -1) { + if (S_ISDIR(mode)) { + struct lmv_user_md *lmu; + + lmu = (struct lmv_user_md *)&info->eti_lum; + lmu->lum_magic = LMV_USER_MAGIC; + lmu->lum_stripe_offset = stripe_offset; + lmu->lum_stripe_count = stripe_count; + lmu->lum_hash_type = LMV_HASH_TYPE_FNV_1A_64; + spec->u.sp_ea.eadata = lmu; + spec->u.sp_ea.eadatalen = sizeof(*lmu); + } else { + struct lov_user_md_v3 *lum = &info->eti_lum; + + lum->lmm_magic = LOV_USER_MAGIC_V3; + lum->lmm_stripe_count = stripe_count; + lum->lmm_stripe_offset = stripe_offset; + lum->lmm_pattern = LOV_PATTERN_NONE; + spec->u.sp_ea.eadata = lum; + spec->u.sp_ea.eadatalen = sizeof(*lum); + } + spec->sp_cr_flags |= MDS_OPEN_HAS_EA; + } + } + + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME | LA_MODE; + ma->ma_attr.la_ctime = ktime_get_real_seconds(); + + if (name != NULL) { + lname->ln_name = name; + lname->ln_namelen = namelen; + /* If name is specified, only create one object by name */ + rc = echo_md_create_internal(env, ed, lu2md(new_parent), fid, + lname, spec, ma); + GOTO(out_put, rc); + } + + /* Create multiple object sequenced by id */ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_create_internal(env, ed, lu2md(new_parent), + fid, lname, spec, ma); + if (rc) { + CERROR("Can not create child %s: rc = %d\n", tmp_name, + rc); + break; + } + id++; + fid->f_oid++; + } + +out_put: + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static struct lu_object *echo_md_lookup(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid = &info->eti_fid; + struct lu_object *child; + int rc; + ENTRY; + + CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name, + PFID(fid), parent); + + rc = mdo_lookup(env, parent, lname, fid, NULL); + if (rc) { + CERROR("lookup %s: rc = %d\n", lname->ln_name, rc); + RETURN(ERR_PTR(rc)); + } + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + + RETURN(child); +} + +static int echo_setattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_device *ld = ed->ed_next; + struct lu_buf *buf = &info->eti_buf; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + echo_md_build_name(lname, name, id); + + ec_child = echo_md_lookup(env, ed, lu2md(new_parent), lname); + if (IS_ERR(ec_child)) { + rc = PTR_ERR(ec_child); + CERROR("Can't find child %s: rc = %d\n", + 
lname->ln_name, rc); + break; + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + lu_object_put(env, ec_child); + rc = -EINVAL; + break; + } + + CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n", + PFID(lu_object_fid(child))); + + buf->lb_buf = info->eti_xattr_buf; + buf->lb_len = sizeof(info->eti_xattr_buf); + + sprintf(name, "%s.test1", XATTR_USER_PREFIX); + rc = mo_xattr_set(env, lu2md(child), buf, name, + LU_XATTR_CREATE); + if (rc < 0) { + CERROR("Can not setattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static int echo_getattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + memset(ma, 0, sizeof(*ma)); + ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF; + ma->ma_acl = info->eti_xattr_buf; + ma->ma_acl_size = sizeof(info->eti_xattr_buf); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + ma->ma_valid = 0; + echo_md_build_name(lname, name, id); + echo_set_lmm_size(env, ld, ma); + + ec_child = echo_md_lookup(env, ed, lu2md(new_parent), lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", + lname->ln_name, PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + lu_object_put(env, ec_child); + RETURN(-EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n", + PFID(lu_object_fid(child))); + rc = echo_attr_get_complex(env, lu2md(child), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static int echo_lookup_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_fid *fid = &info->eti_fid; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + return -ENXIO; + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + /*prepare the requests*/ + 
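/*
 * Illustrative sketch only, not literal code from this patch: the loop
 * below looks up 'count' children whose names come from
 * echo_md_build_name() defined earlier, which simply prints the numeric id,
 * so with id = 100 and count = 3, for example, the entries looked up under
 * new_parent are "100", "101" and "102".  One iteration reduced to the
 * calls that matter here:
 *
 *      echo_md_build_name(lname, name, id);    // lname->ln_name = "100"
 *      rc = mdo_lookup(env, lu2md(new_parent), lname, fid, NULL);
 *      if (rc == 0)
 *              id++;                           // next lookup uses "101"
 */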
for (i = 0; i < count; i++) { + echo_md_build_name(lname, name, id); + + CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n", + PFID(lu_object_fid(new_parent)), lname->ln_name, + new_parent); + + rc = mdo_lookup(env, lu2md(new_parent), lname, fid, NULL); + if (rc) { + CERROR("Can not lookup child %s: rc = %d\n", name, rc); + break; + } + + CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n", + PFID(lu_object_fid(new_parent)), lname->ln_name, + new_parent); + + id++; + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + return rc; +} + +static int echo_md_destroy_internal(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname, + struct md_attr *ma) +{ + struct lu_device *ld = ed->ed_next; + struct lu_object *ec_child; + struct lu_object *child; + int rc; + + ENTRY; + + ec_child = echo_md_lookup(env, ed, parent, lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", lname->ln_name, + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + GOTO(out_put, rc = -EINVAL); + } + + if (lu_object_remote(child)) { + CERROR("Can not destroy remote object %s: rc = %d\n", + lname->ln_name, -EPERM); + GOTO(out_put, rc = -EPERM); + } + CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + rc = mdo_unlink(env, parent, lu2md(child), lname, ma, 0); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", + lname->ln_name, rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_destroy_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + char *name, int namelen, + __u64 id, __u32 mode, + int count) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + struct lu_object *parent; + struct lu_object *new_parent; + int rc = 0; + int i; + ENTRY; + + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-EINVAL); + + rc = echo_md_dir_stripe_choose(env, ed, parent, name, namelen, + id, &new_parent); + if (rc != 0) + RETURN(rc); + + memset(ma, 0, sizeof(*ma)); + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME; + ma->ma_attr.la_ctime = ktime_get_real_seconds(); + ma->ma_need = MA_INODE; + ma->ma_valid = 0; + + if (name != NULL) { + lname->ln_name = name; + lname->ln_namelen = namelen; + rc = echo_md_destroy_internal(env, ed, lu2md(new_parent), lname, + ma); + GOTO(out_put, rc); + } + + /*prepare the requests*/ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + ma->ma_valid = 0; + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_destroy_internal(env, ed, lu2md(new_parent), lname, + ma); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", name, rc); + break; + } + id++; + } + +out_put: + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static struct lu_object *echo_resolve_path(const struct lu_env *env, + struct echo_device *ed, char *path, + int path_len) +{ + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct 
lu_fid *fid = &info->eti_fid; + struct lu_name *lname = &info->eti_lname; + struct lu_object *parent = NULL; + struct lu_object *child = NULL; + int rc = 0; + ENTRY; + + *fid = ed->ed_root_fid; + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + if (IS_ERR(parent)) { + CERROR("Can not find the parent "DFID": rc = %ld\n", + PFID(fid), PTR_ERR(parent)); + RETURN(parent); + } + + while (1) { + struct lu_object *ld_parent; + char *e; + + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + + lname->ln_name = e; + lname->ln_namelen = strlen(e); + + ld_parent = lu_object_locate(parent->lo_header, ld->ld_type); + if (ld_parent == NULL) { + lu_object_put(env, parent); + rc = -EINVAL; + break; + } + + child = echo_md_lookup(env, ed, lu2md(ld_parent), lname); + lu_object_put(env, parent); + if (IS_ERR(child)) { + rc = (int)PTR_ERR(child); + CERROR("lookup %s under parent "DFID": rc = %d\n", + lname->ln_name, PFID(lu_object_fid(ld_parent)), + rc); + break; + } + parent = child; + } + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(parent); +} + +static void echo_ucred_init(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + + ucred->uc_valid = UCRED_INVALID; + + ucred->uc_suppgids[0] = -1; + ucred->uc_suppgids[1] = -1; + + ucred->uc_uid = ucred->uc_o_uid = + from_kuid(&init_user_ns, current_uid()); + ucred->uc_gid = ucred->uc_o_gid = + from_kgid(&init_user_ns, current_gid()); + ucred->uc_fsuid = ucred->uc_o_fsuid = + from_kuid(&init_user_ns, current_fsuid()); + ucred->uc_fsgid = ucred->uc_o_fsgid = + from_kgid(&init_user_ns, current_fsgid()); + ucred->uc_cap = cfs_curproc_cap_pack(); + + /* remove fs privilege for non-root user. 
*/ + if (ucred->uc_fsuid) + ucred->uc_cap &= ~CFS_CAP_FS_MASK; + ucred->uc_valid = UCRED_NEW; +} + +static void echo_ucred_fini(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + ucred->uc_valid = UCRED_INIT; +} + +#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD) +#define ECHO_MD_SES_TAG (LCT_REMEMBER | LCT_SESSION | LCT_SERVER_SESSION) +static int echo_md_handler(struct echo_device *ed, int command, + char *path, int path_len, __u64 id, int count, + struct obd_ioctl_data *data) +{ + struct echo_thread_info *info; + struct lu_device *ld = ed->ed_next; + struct lu_env *env; + __u16 refcheck; + struct lu_object *parent; + char *name = NULL; + int namelen = data->ioc_plen2; + int rc = 0; + ENTRY; + + if (ld == NULL) { + CERROR("MD echo client is not being initialized properly\n"); + RETURN(-EINVAL); + } + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + CERROR("Only support MDD layer right now!\n"); + RETURN(-EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_MD_SES_TAG); + if (rc != 0) + GOTO(out_env, rc); + + /* init big_lmm buffer */ + info = echo_env_info(env); + LASSERT(info->eti_big_lmm == NULL); + OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE); + if (info->eti_big_lmm == NULL) + GOTO(out_env, rc = -ENOMEM); + info->eti_big_lmmsize = MIN_MD_SIZE; + + parent = echo_resolve_path(env, ed, path, path_len); + if (IS_ERR(parent)) { + CERROR("Can not resolve the path %s: rc = %ld\n", path, + PTR_ERR(parent)); + GOTO(out_free, rc = PTR_ERR(parent)); + } + + if (namelen > 0) { + OBD_ALLOC(name, namelen + 1); + if (name == NULL) + GOTO(out_put, rc = -ENOMEM); + if (copy_from_user(name, data->ioc_pbuf2, namelen)) + GOTO(out_name, rc = -EFAULT); + } + + echo_ucred_init(env); + + switch (command) { + case ECHO_MD_CREATE: + case ECHO_MD_MKDIR: { + struct echo_thread_info *info = echo_env_info(env); + __u32 mode = data->ioc_obdo2.o_mode; + struct lu_fid *fid = &info->eti_fid; + int stripe_count = (int)data->ioc_obdo2.o_misc; + int stripe_index = (int)data->ioc_obdo2.o_stripe_idx; + + rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0); + if (rc != 0) + break; + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + rc = echo_create_md_object(env, ed, parent, fid, name, namelen, + id, mode, count, stripe_count, + stripe_index); + break; + } + case ECHO_MD_DESTROY: + case ECHO_MD_RMDIR: { + __u32 mode = data->ioc_obdo2.o_mode; + + rc = echo_destroy_object(env, ed, parent, name, namelen, + id, mode, count); + break; + } + case ECHO_MD_LOOKUP: + rc = echo_lookup_object(env, ed, parent, id, count); + break; + case ECHO_MD_GETATTR: + rc = echo_getattr_object(env, ed, parent, id, count); + break; + case ECHO_MD_SETATTR: + rc = echo_setattr_object(env, ed, parent, id, count); + break; + default: + CERROR("unknown command %d\n", command); + rc = -EINVAL; + break; + } + echo_ucred_fini(env); + +out_name: + if (name != NULL) + OBD_FREE(name, namelen + 1); +out_put: + lu_object_put(env, parent); +out_free: + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; +out_env: + cl_env_put(env, &refcheck); + return rc; +} +#endif /* HAVE_SERVER_SUPPORT */ + +static int echo_create_object(const struct lu_env *env, struct echo_device *ed, + struct obdo *oa) +{ + struct echo_object *eco; + struct echo_client_obd *ec = ed->ed_ec; + int created = 0; + 
int rc; + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID) || + !(oa->o_valid & OBD_MD_FLGROUP) || + !fid_seq_is_echo(ostid_seq(&oa->o_oi))) { + CERROR("invalid oid "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + if (ostid_id(&oa->o_oi) == 0) { + rc = ostid_set_id(&oa->o_oi, ++last_object_id); + if (rc) + GOTO(failed, rc); + } + + rc = obd_create(env, ec->ec_exp, oa); + if (rc != 0) { + CERROR("Cannot create objects: rc = %d\n", rc); + GOTO(failed, rc); + } + + created = 1; + + oa->o_valid |= OBD_MD_FLID; + + eco = cl_echo_object_find(ed, &oa->o_oi); + if (IS_ERR(eco)) + GOTO(failed, rc = PTR_ERR(eco)); + cl_echo_object_put(eco); + + CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi)); + EXIT; + +failed: + if (created && rc != 0) + obd_destroy(env, ec->ec_exp, oa); + + if (rc != 0) + CERROR("create object failed with: rc = %d\n", rc); + + return rc; +} + +static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, + struct obdo *oa) +{ + struct echo_object *eco; + int rc; + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID) || + !(oa->o_valid & OBD_MD_FLGROUP) || + ostid_id(&oa->o_oi) == 0) { + CERROR("invalid oid "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + rc = 0; + eco = cl_echo_object_find(ed, &oa->o_oi); + if (!IS_ERR(eco)) + *ecop = eco; + else + rc = PTR_ERR(eco); + + RETURN(rc); +} + +static void echo_put_object(struct echo_object *eco) +{ + int rc; + + rc = cl_echo_object_put(eco); + if (rc) + CERROR("%s: echo client drop an object failed: rc = %d\n", + eco->eo_dev->ed_ec->ec_exp->exp_obd->obd_name, rc); +} + +static void echo_client_page_debug_setup(struct page *page, int rw, u64 id, + u64 offset, u64 count) +{ + char *addr; + u64 stripe_off; + u64 stripe_id; + int delta; + + /* no partial pages on the client */ + LASSERT(count == PAGE_SIZE); + + addr = kmap(page); + + for (delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + if (rw == OBD_BRW_WRITE) { + stripe_off = offset + delta; + stripe_id = id; + } else { + stripe_off = 0xdeadbeef00c0ffeeULL; + stripe_id = 0xdeadbeef00c0ffeeULL; + } + block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + } + + kunmap(page); +} + +static int +echo_client_page_debug_check(struct page *page, u64 id, u64 offset, u64 count) +{ + u64 stripe_off; + u64 stripe_id; + char *addr; + int delta; + int rc; + int rc2; + + /* no partial pages on the client */ + LASSERT(count == PAGE_SIZE); + + addr = kmap(page); + + for (rc = delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + stripe_off = offset + delta; + stripe_id = id; + + rc2 = block_debug_check("test_brw", + addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + if (rc2 != 0) { + CERROR("Error in echo object %#llx\n", id); + rc = rc2; + } + } + + kunmap(page); + return rc; +} + +static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, + struct echo_object *eco, u64 offset, + u64 count, int async) +{ + size_t npages; + struct brw_page *pga; + struct brw_page *pgp; + struct page **pages; + u64 off; + size_t i; + int rc; + int verify; + gfp_t gfp_mask; + u32 brw_flags = 0; + ENTRY; + + verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? 
GFP_KERNEL : GFP_HIGHUSER; + + LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); + + if ((count & (~PAGE_MASK)) != 0) + RETURN(-EINVAL); + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_SHIFT; + + if (rw == OBD_BRW_WRITE) + brw_flags = OBD_BRW_ASYNC; + + OBD_ALLOC(pga, npages * sizeof(*pga)); + if (pga == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(pages, npages * sizeof(*pages)); + if (pages == NULL) { + OBD_FREE(pga, npages * sizeof(*pga)); + RETURN(-ENOMEM); + } + + for (i = 0, pgp = pga, off = offset; + i < npages; + i++, pgp++, off += PAGE_SIZE) { + + LASSERT(pgp->pg == NULL); /* for cleanup */ + + rc = -ENOMEM; + pgp->pg = alloc_page(gfp_mask); + if (pgp->pg == NULL) + goto out; + + pages[i] = pgp->pg; + pgp->count = PAGE_SIZE; + pgp->off = off; + pgp->flag = brw_flags; + + if (verify) + echo_client_page_debug_setup(pgp->pg, rw, + ostid_id(&oa->o_oi), off, + pgp->count); + } + + /* brw mode can only be used at client */ + LASSERT(ed->ed_next != NULL); + rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); + + out: + if (rc != 0 || rw != OBD_BRW_READ) + verify = 0; + + for (i = 0, pgp = pga; i < npages; i++, pgp++) { + if (pgp->pg == NULL) + continue; + + if (verify) { + int vrc; + vrc = echo_client_page_debug_check(pgp->pg, + ostid_id(&oa->o_oi), + pgp->off, pgp->count); + if (vrc != 0 && rc == 0) + rc = vrc; + } + __free_page(pgp->pg); + } + OBD_FREE(pga, npages * sizeof(*pga)); + OBD_FREE(pages, npages * sizeof(*pages)); + RETURN(rc); +} + +static int echo_client_prep_commit(const struct lu_env *env, + struct obd_export *exp, int rw, + struct obdo *oa, struct echo_object *eco, + u64 offset, u64 count, + u64 batch, int async) +{ + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote rnb; + u64 off; + u64 npages, tot_pages, apc; + int i, ret = 0, brw_flags = 0; + + ENTRY; + + if (count <= 0 || (count & ~PAGE_MASK) != 0) + RETURN(-EINVAL); + + apc = npages = batch >> PAGE_SHIFT; + tot_pages = count >> PAGE_SHIFT; + + OBD_ALLOC_LARGE(lnb, apc * sizeof(struct niobuf_local)); + if (lnb == NULL) + RETURN(-ENOMEM); + + if (rw == OBD_BRW_WRITE && async) + brw_flags |= OBD_BRW_ASYNC; + + obdo_to_ioobj(oa, &ioo); + + off = offset; + + for (; tot_pages > 0; tot_pages -= npages) { + int lpages; + + if (tot_pages < npages) + npages = tot_pages; + + rnb.rnb_offset = off; + rnb.rnb_len = npages * PAGE_SIZE; + rnb.rnb_flags = brw_flags; + ioo.ioo_bufcnt = 1; + off += npages * PAGE_SIZE; + + lpages = npages; + ret = obd_preprw(env, rw, exp, oa, 1, &ioo, &rnb, &lpages, lnb); + if (ret != 0) + GOTO(out, ret); + + for (i = 0; i < lpages; i++) { + struct page *page = lnb[i].lnb_page; + + /* read past eof? */ + if (page == NULL && lnb[i].lnb_rc == 0) + continue; + + if (async) + lnb[i].lnb_flags |= OBD_BRW_ASYNC; + + if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || + (oa->o_valid & OBD_MD_FLFLAGS) == 0 || + (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0) + continue; + + if (rw == OBD_BRW_WRITE) + echo_client_page_debug_setup(page, rw, + ostid_id(&oa->o_oi), + lnb[i].lnb_file_offset, + lnb[i].lnb_len); + else + echo_client_page_debug_check(page, + ostid_id(&oa->o_oi), + lnb[i].lnb_file_offset, + lnb[i].lnb_len); + } + + ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, &rnb, npages, lnb, + ret); + if (ret != 0) + break; + + /* Reuse env context. 
*/ + lu_context_exit((struct lu_context *)&env->le_ctx); + lu_context_enter((struct lu_context *)&env->le_ctx); + } + +out: + OBD_FREE_LARGE(lnb, apc * sizeof(struct niobuf_local)); + + RETURN(ret); +} + +static int echo_client_brw_ioctl(const struct lu_env *env, int rw, + struct obd_export *exp, + struct obd_ioctl_data *data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct obdo *oa = &data->ioc_obdo1; + struct echo_object *eco; + int rc; + int async = 0; + long test_mode; + ENTRY; + + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + rc = echo_get_object(&eco, ed, oa); + if (rc) + RETURN(rc); + + oa->o_valid &= ~OBD_MD_FLHANDLE; + + /* OFD/obdfilter works only via prep/commit */ + test_mode = (long)data->ioc_pbuf1; + if (ed->ed_next == NULL && test_mode != 3) { + test_mode = 3; + data->ioc_plen1 = data->ioc_count; + } + + if (test_mode == 3) + async = 1; + + /* Truncate batch size to maximum */ + if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE) + data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE; + + switch (test_mode) { + case 1: + fallthrough; + case 2: + rc = echo_client_kbrw(ed, rw, oa, eco, data->ioc_offset, + data->ioc_count, async); + break; + case 3: + rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, eco, + data->ioc_offset, data->ioc_count, + data->ioc_plen1, async); + break; + default: + rc = -EINVAL; + } + + echo_put_object(eco); + + RETURN(rc); +} + +static int +echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ +#ifdef HAVE_SERVER_SUPPORT + struct tgt_session_info *tsi; +#endif + struct obd_device *obd = exp->exp_obd; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct obd_ioctl_data *data = karg; + struct lu_env *env; + struct obdo *oa; + struct lu_fid fid; + int rw = OBD_BRW_READ; + int rc = 0; +#ifdef HAVE_SERVER_SUPPORT + struct lu_context echo_session; +#endif + ENTRY; + + oa = &data->ioc_obdo1; + if (!(oa->o_valid & OBD_MD_FLGROUP)) { + oa->o_valid |= OBD_MD_FLGROUP; + ostid_set_seq_echo(&oa->o_oi); + } + + /* This FID is unpacked just for validation at this point */ + rc = ostid_to_fid(&fid, &oa->o_oi, 0); + if (rc < 0) + RETURN(rc); + + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) + GOTO(out_alloc, rc = -ENOMEM); + lu_env_add(env); + if (rc) + GOTO(out_env_fini, rc = -ENOMEM); + +#ifdef HAVE_SERVER_SUPPORT + env->le_ses = &echo_session; + rc = lu_context_init(env->le_ses, LCT_SERVER_SESSION | LCT_NOREF); + if (unlikely(rc < 0)) + GOTO(out_env, rc); + lu_context_enter(env->le_ses); + + tsi = tgt_ses_info(env); + tsi->tsi_exp = ec->ec_exp; + tsi->tsi_jobid = NULL; +#endif + switch (cmd) { + case OBD_IOC_CREATE: /* may create echo object */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_create_object(env, ed, oa); + GOTO(out, rc); + +#ifdef HAVE_SERVER_SUPPORT + case OBD_IOC_ECHO_MD: { + int count; + int cmd; + char *dir = NULL; + int dirlen; + __u64 id; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + count = data->ioc_count; + cmd = data->ioc_command; + + id = data->ioc_obdo2.o_oi.oi.oi_id; + dirlen = data->ioc_plen1; + OBD_ALLOC(dir, dirlen + 1); + if (dir == NULL) + GOTO(out, rc = -ENOMEM); + + if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) { + OBD_FREE(dir, data->ioc_plen1 + 1); + GOTO(out, rc = -EFAULT); + } + + rc = echo_md_handler(ed, 
cmd, dir, dirlen, id, count, data); + OBD_FREE(dir, dirlen + 1); + GOTO(out, rc); + } + case OBD_IOC_ECHO_ALLOC_SEQ: { + struct lu_env *cl_env; + __u16 refcheck; + __u64 seq; + int max_count; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + cl_env = cl_env_get(&refcheck); + if (IS_ERR(cl_env)) + GOTO(out, rc = PTR_ERR(cl_env)); + + rc = lu_env_refill_by_tags(cl_env, ECHO_MD_CTX_TAG, + ECHO_MD_SES_TAG); + if (rc != 0) { + cl_env_put(cl_env, &refcheck); + GOTO(out, rc); + } + + rc = seq_client_get_seq(cl_env, ed->ed_cl_seq, &seq); + cl_env_put(cl_env, &refcheck); + if (rc < 0) { + CERROR("%s: Can not alloc seq: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1)) + return -EFAULT; + + max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH; + if (copy_to_user(data->ioc_pbuf2, &max_count, + data->ioc_plen2)) + return -EFAULT; + GOTO(out, rc); + } +#endif /* HAVE_SERVER_SUPPORT */ + case OBD_IOC_DESTROY: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_destroy(env, ec->ec_exp, oa); + if (rc == 0) + eco->eo_deleted = 1; + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_GETATTR: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_getattr(env, ec->ec_exp, oa); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_SETATTR: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_setattr(env, ec->ec_exp, oa); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_BRW_WRITE: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rw = OBD_BRW_WRITE; + fallthrough; + case OBD_IOC_BRW_READ: + rc = echo_client_brw_ioctl(env, rw, exp, data); + GOTO(out, rc); + + default: + CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd); + GOTO (out, rc = -ENOTTY); + } + + EXIT; +out: +#ifdef HAVE_SERVER_SUPPORT + lu_context_exit(env->le_ses); + lu_context_fini(env->le_ses); +out_env: +#endif + lu_env_remove(env); +out_env_fini: + lu_env_fini(env); +out_alloc: + OBD_FREE_PTR(env); + + return rc; +} + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct echo_client_obd *ec = &obddev->u.echo_client; + struct obd_device *tgt; + struct obd_uuid echo_uuid = { "ECHO_UUID" }; + struct obd_connect_data *ocd = NULL; + int rc; + ENTRY; + + if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + spin_lock_init(&ec->ec_lock); + INIT_LIST_HEAD(&ec->ec_objects); + INIT_LIST_HEAD(&ec->ec_locks); + ec->ec_unique = 0; + + if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { +#ifdef HAVE_SERVER_SUPPORT + lu_context_tags_update(ECHO_MD_CTX_TAG); + lu_session_tags_update(ECHO_MD_SES_TAG); +#else + CERROR("Local operations are NOT supported on client side. " + "Only remote operations are supported. 
Metadata client " + "must be run on server side.\n"); +#endif + RETURN(0); + } + + OBD_ALLOC(ocd, sizeof(*ocd)); + if (ocd == NULL) { + CERROR("Can't alloc ocd connecting to %s\n", + lustre_cfg_string(lcfg, 1)); + return -ENOMEM; + } + + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | + OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_FID; + ocd->ocd_brw_size = DT_MAX_BRW_SIZE; + ocd->ocd_version = LUSTRE_VERSION_CODE; + ocd->ocd_group = FID_SEQ_ECHO; + + rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); + if (rc == 0) { + /* Turn off pinger because it connects to tgt obd directly. */ + spin_lock(&tgt->obd_dev_lock); + list_del_init(&ec->ec_exp->exp_obd_chain_timed); + spin_unlock(&tgt->obd_dev_lock); + } + + OBD_FREE(ocd, sizeof(*ocd)); + + if (rc != 0) { + CERROR("fail to connect to device %s\n", + lustre_cfg_string(lcfg, 1)); + return (rc); + } + + RETURN(rc); +} + +static int echo_client_cleanup(struct obd_device *obddev) +{ + struct echo_device *ed = obd2echo_dev(obddev); + struct echo_client_obd *ec = &obddev->u.echo_client; + int rc; + ENTRY; + + /*Do nothing for Metadata echo client*/ + if (ed == NULL ) + RETURN(0); + + if (ed->ed_next_ismd) { +#ifdef HAVE_SERVER_SUPPORT + lu_context_tags_clear(ECHO_MD_CTX_TAG); + lu_session_tags_clear(ECHO_MD_SES_TAG); +#else + CERROR("This is client-side only module, does not support " + "metadata echo client.\n"); +#endif + RETURN(0); + } + + if (!list_empty(&obddev->obd_exports)) { + CERROR("still has clients!\n"); + RETURN(-EBUSY); + } + + LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0); + rc = obd_disconnect(ec->ec_exp); + if (rc != 0) + CERROR("fail to disconnect device: %d\n", rc); + + RETURN(rc); +} + +static int echo_client_connect(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *src, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + int rc; + struct lustre_handle conn = { 0 }; + + ENTRY; + rc = class_connect(&conn, src, cluuid); + if (rc == 0) { + *exp = class_conn2export(&conn); + } + + RETURN (rc); +} + +static int echo_client_disconnect(struct obd_export *exp) +{ + int rc; + ENTRY; + + if (exp == NULL) + GOTO(out, rc = -EINVAL); + + rc = class_disconnect(exp); + GOTO(out, rc); + out: + return rc; +} + +static struct obd_ops echo_client_obd_ops = { + .o_owner = THIS_MODULE, + .o_iocontrol = echo_client_iocontrol, + .o_connect = echo_client_connect, + .o_disconnect = echo_client_disconnect +}; + +static int __init obdecho_init(void) +{ + int rc; + + ENTRY; + LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n"); + + LASSERT(PAGE_SIZE % OBD_ECHO_BLOCK_SIZE == 0); + +# ifdef HAVE_SERVER_SUPPORT + rc = echo_persistent_pages_init(); + if (rc != 0) + goto failed_0; + + rc = class_register_type(&echo_obd_ops, NULL, true, NULL, + LUSTRE_ECHO_NAME, &echo_srv_type); + if (rc != 0) + goto failed_1; +# endif + + rc = lu_kmem_init(echo_caches); + if (rc == 0) { + rc = class_register_type(&echo_client_obd_ops, NULL, false, + NULL, LUSTRE_ECHO_CLIENT_NAME, + &echo_device_type); + if (rc) + lu_kmem_fini(echo_caches); + } + +# ifdef HAVE_SERVER_SUPPORT + if (rc == 0) + RETURN(0); + + class_unregister_type(LUSTRE_ECHO_NAME); +failed_1: + echo_persistent_pages_fini(); +failed_0: +# endif + RETURN(rc); +} + +static void __exit obdecho_exit(void) +{ + class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); + lu_kmem_fini(echo_caches); + +#ifdef HAVE_SERVER_SUPPORT + 
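+ /* With server support, also unregister the echo server OBD type and release its persistent pages. */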
class_unregister_type(LUSTRE_ECHO_NAME); + echo_persistent_pages_fini(); +#endif +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Echo Client test driver"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(obdecho_init); +module_exit(obdecho_exit); + +/** @} echo_client */ diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h new file mode 100644 index 0000000000000..469d68e94f02f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014 Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdecho/echo_internal.h + */ + +#ifndef _ECHO_INTERNAL_H +#define _ECHO_INTERNAL_H + +/* The persistent object (i.e. actually stores stuff!) */ +#define ECHO_PERSISTENT_OBJID 1ULL +#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) + +/* block size to use for data verification */ +#define OBD_ECHO_BLOCK_SIZE (4<<10) + +#ifdef HAVE_SERVER_SUPPORT +extern struct obd_ops echo_obd_ops; +extern struct lu_device_type echo_srv_type; +int echo_persistent_pages_init(void); +void echo_persistent_pages_fini(void); +#endif /* HAVE_SERVER_SUPPORT */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/osc/Makefile b/drivers/staging/lustrefsx/lustre/osc/Makefile new file mode 100644 index 0000000000000..223e42283bf92 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_FS) += osc.o + +osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o osc_page.o +osc-y += osc_lock.o osc_io.o osc_quota.o osc_cache.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c new file mode 100644 index 0000000000000..ab8cfca3601eb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -0,0 +1,1003 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "osc_internal.h" + +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + int rc; + + LPROCFS_CLIMP_CHECK(dev); + rc = sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); + LPROCFS_CLIMP_EXIT(dev); + return rc; +} + +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + /* opposite senses */ + if (dev->u.cli.cl_import->imp_deactive == val) + rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); + else + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", + (unsigned int)val); + + return count; +} +LUSTRE_RW_ATTR(active); + +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + ssize_t len; + + spin_lock(&cli->cl_loi_list_lock); + len = sprintf(buf, "%u\n", cli->cl_max_rpcs_in_flight); + spin_unlock(&cli->cl_loi_list_lock); + return len; +} + +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + int adding, added, req_count; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + if (val == 0 || val > OSC_MAX_RIF_MAX) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(dev); + + adding = (int)val - cli->cl_max_rpcs_in_flight; + req_count = atomic_read(&osc_pool_req_count); + if (adding > 0 && req_count < osc_reqpool_maxreqcount) { + /* + * There might be some race which will cause over-limit + * allocation, but it is fine. 
+ */ + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = osc_rq_pool->prp_populate(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_adjust_max_dirty(cli); + spin_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} +LUSTRE_RW_ATTR(max_rpcs_in_flight); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + unsigned long val; + + spin_lock(&cli->cl_loi_list_lock); + val = PAGES_TO_MiB(cli->cl_dirty_max_pages); + spin_unlock(&cli->cl_loi_list_lock); + + return sprintf(buf, "%lu\n", val); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + unsigned long pages_number, max_dirty_mb; + int rc; + + rc = kstrtoul(buffer, 10, &max_dirty_mb); + if (rc) + return rc; + + pages_number = MiB_TO_PAGES(max_dirty_mb); + + if (pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ + return -ERANGE; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +LUSTRE_ATTR(ost_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static int osc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t osc_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + __s64 pages_number; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - + kernbuf; + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0) + return -ERANGE; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} + +LPROC_SEQ_FOPS(osc_cached_mb); + +static ssize_t cur_dirty_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + ssize_t len; + + spin_lock(&cli->cl_loi_list_lock); + len = sprintf(buf, "%lu\n", cli->cl_dirty_pages << 
PAGE_SHIFT); + spin_unlock(&cli->cl_loi_list_lock); + + return len; +} +LUSTRE_RO_ATTR(cur_dirty_bytes); + +static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_avail_grant); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} + +static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &obd->u.cli; + s64 val; + int rc; + + if (obd == NULL) + return 0; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + if (rc) + return rc; + if (val < 0) + return val; + + /* this is only for shrinking grant */ + spin_lock(&cli->cl_loi_list_lock); + if (val >= cli->cl_avail_grant) { + spin_unlock(&cli->cl_loi_list_lock); + return 0; + } + + spin_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_CHECK(obd); + if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) + rc = osc_shrink_grant_to_target(cli, val); + LPROCFS_CLIMP_EXIT(obd); + + return rc ? rc : count; +} +LPROC_SEQ_FOPS(osc_cur_grant_bytes); + +static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + ssize_t len; + + spin_lock(&cli->cl_loi_list_lock); + len = sprintf(buf, "%lu\n", cli->cl_lost_grant); + spin_unlock(&cli->cl_loi_list_lock); + return len; +} +LUSTRE_RO_ATTR(cur_lost_grant_bytes); + +static ssize_t grant_shrink_interval_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%lld\n", obd->u.cli.cl_grant_shrink_interval); +} + +static ssize_t grant_shrink_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + if (val == 0) + return -ERANGE; + + obd->u.cli.cl_grant_shrink_interval = val; + + return count; +} +LUSTRE_RW_ATTR(grant_shrink_interval); + +static ssize_t checksums_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%d\n", obd->u.cli.cl_checksum ? 
1 : 0); +} + +static ssize_t checksums_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = val; + + return count; +} +LUSTRE_RW_ATTR(checksums); + +static int osc_checksum_type_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + int i; + DECLARE_CKSUM_NAME; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == (1 << i)) + seq_printf(m, "[%s] ", cksum_name[i]); + else + seq_printf(m, "%s ", cksum_name[i]); + } + seq_printf(m, "\n"); + return 0; +} + +static ssize_t osc_checksum_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int i; + DECLARE_CKSUM_NAME; + char kernbuf[10]; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (!strcmp(kernbuf, cksum_name[i])) { + obd->u.cli.cl_cksum_type = 1 << i; + return count; + } + } + return -EINVAL; +} +LPROC_SEQ_FOPS(osc_checksum_type); + +static ssize_t resend_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends)); +} + +static ssize_t resend_count_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + atomic_set(&obd->u.cli.cl_resends, val); + + return count; +} +LUSTRE_RW_ATTR(resend_count); + +static ssize_t checksum_dump_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%d\n", obd->u.cli.cl_checksum_dump ? 
1 : 0); +} + +static ssize_t checksum_dump_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum_dump = val; + + return count; +} +LUSTRE_RW_ATTR(checksum_dump); + +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + + return sprintf(buf, "%lld\n", od->od_contention_time); +} + +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + od->od_contention_time = val; + + return count; +} +LUSTRE_RW_ATTR(contention_seconds); + +static ssize_t lockless_truncate_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + + return sprintf(buf, "%u\n", od->od_lockless_truncate); +} + +static ssize_t lockless_truncate_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + od->od_lockless_truncate = val; + + return count; +} +LUSTRE_RW_ATTR(lockless_truncate); + +static ssize_t destroys_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); +} +LUSTRE_RO_ATTR(destroys_in_flight); + +LPROC_SEQ_FOPS_RW_TYPE(osc, obd_max_pages_per_rpc); + +LUSTRE_RW_ATTR(short_io_bytes); + +#ifdef CONFIG_PROC_FS +static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", + pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_unstable_stats); + +static ssize_t idle_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + int ret; + + LPROCFS_CLIMP_CHECK(obd); + ret = sprintf(buf, "%u\n", cli->cl_import->imp_idle_timeout); + LPROCFS_CLIMP_EXIT(obd); + + return ret; +} + +static ssize_t idle_timeout_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request *req; + unsigned int idle_debug = 0; + unsigned int val; + int rc; + + if (strncmp(buffer, "debug", 5) == 0) { + idle_debug = D_CONSOLE; + } else if (strncmp(buffer, "nodebug", 6) == 0) { + idle_debug = D_HA; + } else { + rc = 
kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val > CONNECTION_SWITCH_MAX) + return -ERANGE; + } + + LPROCFS_CLIMP_CHECK(dev); + if (idle_debug) { + cli->cl_import->imp_idle_debug = idle_debug; + } else { + if (!val) { + /* initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(cli->cl_import, + &RQF_OST_STATFS); + if (req != NULL) + ptlrpc_req_finished(req); + } + cli->cl_import->imp_idle_timeout = val; + } + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_RW_ATTR(idle_timeout); + +static ssize_t idle_connect_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request *req; + + LPROCFS_CLIMP_CHECK(dev); + /* to initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_STATFS); + if (req) + ptlrpc_req_finished(req); + ptlrpc_pinger_force(cli->cl_import); + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_WO_ATTR(idle_connect); + +static ssize_t grant_shrink_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + ssize_t len; + + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + len = snprintf(buf, PAGE_SIZE, "%d\n", + !imp->imp_grant_shrink_disabled && + OCD_HAS_FLAG(&imp->imp_connect_data, GRANT_SHRINK)); + LPROCFS_CLIMP_EXIT(obd); + + return len; +} + +static ssize_t grant_shrink_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + bool val; + int rc; + + if (dev == NULL) + return 0; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + LPROCFS_CLIMP_CHECK(dev); + + imp = dev->u.cli.cl_import; + spin_lock(&imp->imp_lock); + imp->imp_grant_shrink_disabled = !val; + spin_unlock(&imp->imp_lock); + + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_RW_ATTR(grant_shrink); + +LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(osc, state); + +LPROC_SEQ_FOPS_RW_TYPE(osc, import); +LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); + +struct lprocfs_vars lprocfs_osc_obd_vars[] = { + { .name = "connect_flags", + .fops = &osc_connect_flags_fops }, + { .name = "ost_server_uuid", + .fops = &osc_server_uuid_fops }, + { .name = "max_pages_per_rpc", + .fops = &osc_obd_max_pages_per_rpc_fops }, + { .name = "osc_cached_mb", + .fops = &osc_cached_mb_fops }, + { .name = "cur_grant_bytes", + .fops = &osc_cur_grant_bytes_fops }, + { .name = "checksum_type", + .fops = &osc_checksum_type_fops }, + { .name = "timeouts", + .fops = &osc_timeouts_fops }, + { .name = "import", + .fops = &osc_import_fops }, + { .name = "state", + .fops = &osc_state_fops }, + { .name = "pinger_recov", + .fops = &osc_pinger_recov_fops }, + { .name = "unstable_stats", + .fops = &osc_unstable_stats_fops }, + { NULL } +}; + +static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + ktime_get_real_ts64(&now); + + spin_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "snapshot_time: %lld.%09lu 
(secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "read RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 
0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} + +static ssize_t osc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + + return len; +} +LPROC_SEQ_FOPS(osc_rpc_stats); + +static int osc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + ktime_get_real_ts64(&now); + + seq_printf(seq, "snapshot_time: %lld.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t%llu\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t osc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} + +LPROC_SEQ_FOPS(osc_stats); + +int lprocfs_osc_attach_seqstat(struct obd_device *dev) +{ + int rc; + + rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644, + &osc_stats_fops, dev); + if (rc == 0) + rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0644, + &osc_rpc_stats_fops, dev); + + return rc; +} +#endif /* CONFIG_PROC_FS */ + +static struct attribute *osc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_dump.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_cur_dirty_bytes.attr, + &lustre_attr_cur_lost_grant_bytes.attr, + &lustre_attr_destroys_in_flight.attr, + &lustre_attr_grant_shrink_interval.attr, + &lustre_attr_lockless_truncate.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_short_io_bytes.attr, + &lustre_attr_resend_count.attr, + &lustre_attr_ost_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + &lustre_attr_idle_timeout.attr, + &lustre_attr_idle_connect.attr, + &lustre_attr_grant_shrink.attr, + NULL, +}; + +int osc_tunables_init(struct obd_device *obd) +{ +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + struct obd_type *type; +#endif + int rc; + + obd->obd_vars = lprocfs_osc_obd_vars; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + /* If this is true then both client (osc) and server (osp) are on the + * same node. The osp layer if loaded first will register the osc proc + * directory. In that case this obd_device will be attached its proc + * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. 
+ */ + type = class_search_type(LUSTRE_OSP_NAME); + if (type && type->typ_procsym) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } +#endif + obd->obd_ktype.default_attrs = osc_attrs; + rc = lprocfs_obd_setup(obd, false); + if (rc) + return rc; +#ifdef CONFIG_PROC_FS + /* If the basic OSC proc tree construction succeeded then + * lets do the rest. + */ + rc = lprocfs_osc_attach_seqstat(obd); + if (rc) + goto obd_cleanup; + +#endif /* CONFIG_PROC_FS */ + rc = sptlrpc_lprocfs_cliobd_attach(obd); + if (rc) + goto obd_cleanup; + + ptlrpc_lprocfs_register_obd(obd); +obd_cleanup: + if (rc) + lprocfs_obd_cleanup(obd); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c new file mode 100644 index 0000000000000..4bae208d145f5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c @@ -0,0 +1,3334 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + * + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * osc cache management. 
+ * + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +#include + +#include "osc_internal.h" + +static int extent_debug; /* set it to be true for more debug */ + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta); +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + enum osc_extent_state state); +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc); +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd); +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd); +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc); +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant, unsigned int dirty_grant); + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line); +#define osc_extent_tree_dump(lvl, obj) \ + osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) + +static void osc_unreserve_grant(struct client_obd *cli, unsigned int reserved, + unsigned int unused); + +/** \addtogroup osc + * @{ + */ + +/* ------------------ osc extent ------------------ */ +static inline char *ext_flags(struct osc_extent *ext, char *flags) +{ + char *buf = flags; + *buf++ = ext->oe_rw ? 'r' : 'w'; + if (ext->oe_intree) + *buf++ = 'i'; + if (ext->oe_sync) + *buf++ = 'S'; + if (ext->oe_srvlock) + *buf++ = 's'; + if (ext->oe_hp) + *buf++ = 'h'; + if (ext->oe_urgent) + *buf++ = 'u'; + if (ext->oe_memalloc) + *buf++ = 'm'; + if (ext->oe_trunc_pending) + *buf++ = 't'; + if (ext->oe_fsync_wait) + *buf++ = 'Y'; + *buf = 0; + return flags; +} + +static inline char list_empty_marker(struct list_head *list) +{ + return list_empty(list) ? '-' : '+'; +} + +#define EXTSTR "[%lu -> %lu/%lu]" +#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end +static const char *oes_strings[] = { + "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; + +#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ + struct osc_extent *__ext = (extent); \ + char __buf[16]; \ + \ + CDEBUG(lvl, \ + "extent %p@{" EXTSTR ", " \ + "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ + /* ----- extent part 0 ----- */ \ + __ext, EXTPARA(__ext), \ + /* ----- part 1 ----- */ \ + atomic_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ + list_empty_marker(&__ext->oe_link), \ + oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ + __ext->oe_obj, \ + /* ----- part 2 ----- */ \ + __ext->oe_grants, __ext->oe_nr_pages, \ + list_empty_marker(&__ext->oe_pages), \ + waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \ + __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ + /* ----- part 4 ----- */ \ + ## __VA_ARGS__); \ + if (lvl == D_ERROR && __ext->oe_dlmlock != NULL) \ + LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \ + else \ + LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \ +} while (0) + +#undef EASSERTF +#define EASSERTF(expr, ext, fmt, args...) 
do { \ + if (!(expr)) { \ + OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ + osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ + LASSERT(expr); \ + } \ +} while (0) + +#undef EASSERT +#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") + +static inline struct osc_extent *rb_extent(struct rb_node *n) +{ + if (n == NULL) + return NULL; + + return container_of(n, struct osc_extent, oe_node); +} + +static inline struct osc_extent *next_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_next(&ext->oe_node)); +} + +static inline struct osc_extent *prev_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_prev(&ext->oe_node)); +} + +static inline struct osc_extent *first_extent(struct osc_object *obj) +{ + return rb_extent(rb_first(&obj->oo_root)); +} + +/* object must be locked by caller. */ +static int osc_extent_sanity_check0(struct osc_extent *ext, + const char *func, const int line) +{ + struct osc_object *obj = ext->oe_obj; + struct osc_async_page *oap; + size_t page_count; + int rc = 0; + + if (!osc_object_is_locked(obj)) + GOTO(out, rc = 9); + + if (ext->oe_state >= OES_STATE_MAX) + GOTO(out, rc = 10); + + if (atomic_read(&ext->oe_refc) <= 0) + GOTO(out, rc = 20); + + if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) + GOTO(out, rc = 30); + + switch (ext->oe_state) { + case OES_INV: + if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) + GOTO(out, rc = 35); + GOTO(out, rc = 0); + break; + case OES_ACTIVE: + if (atomic_read(&ext->oe_users) == 0) + GOTO(out, rc = 40); + if (ext->oe_hp) + GOTO(out, rc = 50); + if (ext->oe_fsync_wait && !ext->oe_urgent) + GOTO(out, rc = 55); + break; + case OES_CACHE: + if (ext->oe_grants == 0) + GOTO(out, rc = 60); + if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) + GOTO(out, rc = 65); + fallthrough; + default: + if (atomic_read(&ext->oe_users) > 0) + GOTO(out, rc = 70); + } + + if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) + GOTO(out, rc = 80); + + if (ext->oe_sync && ext->oe_grants > 0) + GOTO(out, rc = 90); + + if (ext->oe_dlmlock != NULL && + ext->oe_dlmlock->l_resource->lr_type == LDLM_EXTENT && + !ldlm_is_failed(ext->oe_dlmlock)) { + struct ldlm_extent *extent; + + extent = &ext->oe_dlmlock->l_policy_data.l_extent; + if (!(extent->start <= cl_offset(osc2cl(obj), ext->oe_start) && + extent->end >= cl_offset(osc2cl(obj), ext->oe_max_end))) + GOTO(out, rc = 100); + + if (!(ext->oe_dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))) + GOTO(out, rc = 102); + } + + if (ext->oe_nr_pages > ext->oe_mppr) + GOTO(out, rc = 105); + + /* Do not verify page list if extent is in RPC. This is because an + * in-RPC extent is supposed to be exclusively accessible w/o lock. 
*/ + if (ext->oe_state > OES_CACHE) + GOTO(out, rc = 0); + + if (!extent_debug) + GOTO(out, rc = 0); + + page_count = 0; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + pgoff_t index = osc_index(oap2osc(oap)); + ++page_count; + if (index > ext->oe_end || index < ext->oe_start) + GOTO(out, rc = 110); + } + if (page_count != ext->oe_nr_pages) + GOTO(out, rc = 120); + +out: + if (rc != 0) + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s:%d sanity check %p failed with rc = %d\n", + func, line, ext, rc); + return rc; +} + +#define sanity_check_nolock(ext) \ + osc_extent_sanity_check0(ext, __func__, __LINE__) + +#define sanity_check(ext) ({ \ + int __res; \ + osc_object_lock((ext)->oe_obj); \ + __res = sanity_check_nolock(ext); \ + osc_object_unlock((ext)->oe_obj); \ + __res; \ +}) + + +/** + * sanity check - to make sure there is no overlapped extent in the tree. + */ +static int osc_extent_is_overlapped(struct osc_object *obj, + struct osc_extent *ext) +{ + struct osc_extent *tmp; + + LASSERT(osc_object_is_locked(obj)); + + if (!extent_debug) + return 0; + + for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { + if (tmp == ext) + continue; + if (tmp->oe_end >= ext->oe_start && + tmp->oe_start <= ext->oe_end) + return 1; + } + return 0; +} + +static void osc_extent_state_set(struct osc_extent *ext, int state) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(state >= OES_INV && state < OES_STATE_MAX); + + /* Never try to sanity check a state changing extent :-) */ + /* LASSERT(sanity_check_nolock(ext) == 0); */ + + /* TODO: validate the state machine */ + ext->oe_state = state; + wake_up_all(&ext->oe_waitq); +} + +static struct osc_extent *osc_extent_alloc(struct osc_object *obj) +{ + struct osc_extent *ext; + + OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_NOFS); + if (ext == NULL) + return NULL; + + RB_CLEAR_NODE(&ext->oe_node); + ext->oe_obj = obj; + cl_object_get(osc2cl(obj)); + atomic_set(&ext->oe_refc, 1); + atomic_set(&ext->oe_users, 0); + INIT_LIST_HEAD(&ext->oe_link); + ext->oe_state = OES_INV; + INIT_LIST_HEAD(&ext->oe_pages); + init_waitqueue_head(&ext->oe_waitq); + ext->oe_dlmlock = NULL; + + return ext; +} + +static void osc_extent_free(struct osc_extent *ext) +{ + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +} + +static struct osc_extent *osc_extent_get(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) >= 0); + atomic_inc(&ext->oe_refc); + return ext; +} + +static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 0); + if (atomic_dec_and_test(&ext->oe_refc)) { + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(!ext->oe_intree); + + if (ext->oe_dlmlock != NULL) { + lu_ref_add(&ext->oe_dlmlock->l_reference, + "osc_extent", ext); + LDLM_LOCK_PUT(ext->oe_dlmlock); + ext->oe_dlmlock = NULL; + } + cl_object_put(env, osc2cl(ext->oe_obj)); + osc_extent_free(ext); + } +} + +/** + * osc_extent_put_trust() is a special version of osc_extent_put() when + * it's known that the caller is not the last user. This is to address the + * problem of lacking of lu_env ;-). + */ +static void osc_extent_put_trust(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 1); + LASSERT(osc_object_is_locked(ext->oe_obj)); + atomic_dec(&ext->oe_refc); +} + +/** + * Return the extent which includes pgoff @index, or return the greatest + * previous extent in the tree. 
+ */ +static struct osc_extent *osc_extent_search(struct osc_object *obj, + pgoff_t index) +{ + struct rb_node *n = obj->oo_root.rb_node; + struct osc_extent *tmp, *p = NULL; + + LASSERT(osc_object_is_locked(obj)); + while (n != NULL) { + tmp = rb_extent(n); + if (index < tmp->oe_start) { + n = n->rb_left; + } else if (index > tmp->oe_end) { + p = rb_extent(n); + n = n->rb_right; + } else { + return tmp; + } + } + return p; +} + +/* + * Return the extent covering @index, otherwise return NULL. + * caller must have held object lock. + */ +static struct osc_extent *osc_extent_lookup(struct osc_object *obj, + pgoff_t index) +{ + struct osc_extent *ext; + + ext = osc_extent_search(obj, index); + if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) + return osc_extent_get(ext); + return NULL; +} + +/* caller must have held object lock. */ +static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) +{ + struct rb_node **n = &obj->oo_root.rb_node; + struct rb_node *parent = NULL; + struct osc_extent *tmp; + + LASSERT(ext->oe_intree == 0); + LASSERT(ext->oe_obj == obj); + LASSERT(osc_object_is_locked(obj)); + while (*n != NULL) { + tmp = rb_extent(*n); + parent = *n; + + if (ext->oe_end < tmp->oe_start) + n = &(*n)->rb_left; + else if (ext->oe_start > tmp->oe_end) + n = &(*n)->rb_right; + else + EASSERTF(0, tmp, EXTSTR"\n", EXTPARA(ext)); + } + rb_link_node(&ext->oe_node, parent, n); + rb_insert_color(&ext->oe_node, &obj->oo_root); + osc_extent_get(ext); + ext->oe_intree = 1; +} + +/* caller must have held object lock. */ +static void osc_extent_erase(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + LASSERT(osc_object_is_locked(obj)); + if (ext->oe_intree) { + rb_erase(&ext->oe_node, &obj->oo_root); + ext->oe_intree = 0; + /* rbtree held a refcount */ + osc_extent_put_trust(ext); + } +} + +static struct osc_extent *osc_extent_hold(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + LASSERT(osc_object_is_locked(obj)); + LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); + if (ext->oe_state == OES_CACHE) { + osc_extent_state_set(ext, OES_ACTIVE); + osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); + } + atomic_inc(&ext->oe_users); + list_del_init(&ext->oe_link); + return osc_extent_get(ext); +} + +static void __osc_extent_remove(struct osc_extent *ext) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(list_empty(&ext->oe_pages)); + osc_extent_erase(ext); + list_del_init(&ext->oe_link); + osc_extent_state_set(ext, OES_INV); + OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); +} + +static void osc_extent_remove(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + osc_object_lock(obj); + __osc_extent_remove(ext); + osc_object_unlock(obj); +} + +/** + * This function is used to merge extents to get better performance. It checks + * if @cur and @victim are contiguous at block level. 
+ */ +static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, + struct osc_extent *victim) +{ + struct osc_object *obj = cur->oe_obj; + struct client_obd *cli = osc_cli(obj); + pgoff_t chunk_start; + pgoff_t chunk_end; + int ppc_bits; + + LASSERT(cur->oe_state == OES_CACHE); + LASSERT(osc_object_is_locked(obj)); + if (victim == NULL) + return -EINVAL; + + if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait) + return -EBUSY; + + if (cur->oe_max_end != victim->oe_max_end) + return -ERANGE; + + LASSERT(cur->oe_dlmlock == victim->oe_dlmlock); + ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT; + chunk_start = cur->oe_start >> ppc_bits; + chunk_end = cur->oe_end >> ppc_bits; + if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && + chunk_end + 1 != victim->oe_start >> ppc_bits) + return -ERANGE; + + /* overall extent size should not exceed the max supported limit + * reported by the server */ + if (cur->oe_end - cur->oe_start + 1 + + victim->oe_end - victim->oe_start + 1 > cli->cl_max_extent_pages) + return -ERANGE; + + OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); + + cur->oe_start = min(cur->oe_start, victim->oe_start); + cur->oe_end = max(cur->oe_end, victim->oe_end); + /* per-extent tax should be accounted only once for the whole extent */ + cur->oe_grants += victim->oe_grants - cli->cl_grant_extent_tax; + cur->oe_nr_pages += victim->oe_nr_pages; + /* only the following bits are needed to merge */ + cur->oe_urgent |= victim->oe_urgent; + cur->oe_memalloc |= victim->oe_memalloc; + list_splice_init(&victim->oe_pages, &cur->oe_pages); + list_del_init(&victim->oe_link); + victim->oe_nr_pages = 0; + + osc_extent_get(victim); + __osc_extent_remove(victim); + osc_extent_put(env, victim); + + OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); + return 0; +} + +/** + * Drop user count of osc_extent, and unplug IO asynchronously. + */ +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + int rc = 0; + ENTRY; + + LASSERT(atomic_read(&ext->oe_users) > 0); + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_grants > 0); + + if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { + LASSERT(ext->oe_state == OES_ACTIVE); + if (ext->oe_trunc_pending) { + /* a truncate process is waiting for this extent. + * This may happen due to a race, check + * osc_cache_truncate_start(). */ + osc_extent_state_set(ext, OES_TRUNC); + ext->oe_trunc_pending = 0; + } else { + int grant = 0; + + osc_extent_state_set(ext, OES_CACHE); + osc_update_pending(obj, OBD_BRW_WRITE, + ext->oe_nr_pages); + + /* try to merge the previous and next extent. 
*/ + if (osc_extent_merge(env, ext, prev_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + if (grant > 0) + osc_unreserve_grant(cli, 0, grant); + + if (ext->oe_hp) + list_move_tail(&ext->oe_link, + &obj->oo_hp_exts); + else if (ext->oe_urgent) + list_move_tail(&ext->oe_link, + &obj->oo_urgent_exts); + else if (ext->oe_nr_pages == ext->oe_mppr) { + list_move_tail(&ext->oe_link, + &obj->oo_full_exts); + } + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, cli, obj); + } + osc_extent_put(env, ext); + RETURN(rc); +} + +static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) +{ + return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); +} + +/** + * Find or create an extent which includes @index, core function to manage + * extent tree. + */ +static struct osc_extent *osc_extent_find(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + unsigned int *grants) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_lock *olck; + struct cl_lock_descr *descr; + struct osc_extent *cur; + struct osc_extent *ext; + struct osc_extent *conflict = NULL; + struct osc_extent *found = NULL; + pgoff_t chunk; + pgoff_t max_end; + unsigned int max_pages; /* max_pages_per_rpc */ + unsigned int chunksize; + int ppc_bits; /* pages per chunk bits */ + pgoff_t chunk_mask; + int rc; + ENTRY; + + cur = osc_extent_alloc(obj); + if (cur == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + olck = osc_env_io(env)->oi_write_osclock; + LASSERTF(olck != NULL, "page %lu is not covered by lock\n", index); + LASSERT(olck->ols_state == OLS_GRANTED); + + descr = &olck->ols_cl.cls_lock->cll_descr; + LASSERT(descr->cld_mode >= CLM_WRITE); + + LASSERTF(cli->cl_chunkbits >= PAGE_SHIFT, + "chunkbits: %u\n", cli->cl_chunkbits); + ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + chunk_mask = ~((1 << ppc_bits) - 1); + chunksize = 1 << cli->cl_chunkbits; + chunk = index >> ppc_bits; + + /* align end to RPC edge. 
*/ + max_pages = cli->cl_max_pages_per_rpc; + if ((max_pages & ~chunk_mask) != 0) { + CERROR("max_pages: %#x chunkbits: %u chunk_mask: %#lx\n", + max_pages, cli->cl_chunkbits, chunk_mask); + RETURN(ERR_PTR(-EINVAL)); + } + max_end = index - (index % max_pages) + max_pages - 1; + max_end = min_t(pgoff_t, max_end, descr->cld_end); + + /* initialize new extent by parameters so far */ + cur->oe_max_end = max_end; + cur->oe_start = index & chunk_mask; + cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; + if (cur->oe_start < descr->cld_start) + cur->oe_start = descr->cld_start; + if (cur->oe_end > max_end) + cur->oe_end = max_end; + cur->oe_grants = 0; + cur->oe_mppr = max_pages; + if (olck->ols_dlmlock != NULL) { + LASSERT(olck->ols_hold); + cur->oe_dlmlock = LDLM_LOCK_GET(olck->ols_dlmlock); + lu_ref_add(&olck->ols_dlmlock->l_reference, "osc_extent", cur); + } + + /* grants has been allocated by caller */ + LASSERTF(*grants >= chunksize + cli->cl_grant_extent_tax, + "%u/%u/%u.\n", *grants, chunksize, cli->cl_grant_extent_tax); + LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR"\n", + EXTPARA(cur)); + +restart: + osc_object_lock(obj); + ext = osc_extent_search(obj, cur->oe_start); + if (ext == NULL) + ext = first_extent(obj); + while (ext != NULL) { + pgoff_t ext_chk_start = ext->oe_start >> ppc_bits; + pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; + + LASSERT(sanity_check_nolock(ext) == 0); + if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) + break; + + /* if covering by different locks, no chance to match */ + if (olck->ols_dlmlock != ext->oe_dlmlock) { + EASSERTF(!overlapped(ext, cur), ext, + EXTSTR"\n", EXTPARA(cur)); + + ext = next_extent(ext); + continue; + } + + /* discontiguous chunks? */ + if (chunk + 1 < ext_chk_start) { + ext = next_extent(ext); + continue; + } + + /* ok, from now on, ext and cur have these attrs: + * 1. covered by the same lock + * 2. contiguous at chunk level or overlapping. */ + + if (overlapped(ext, cur)) { + /* cur is the minimum unit, so overlapping means + * full contain. */ + EASSERTF((ext->oe_start <= cur->oe_start && + ext->oe_end >= cur->oe_end), + ext, EXTSTR"\n", EXTPARA(cur)); + + if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { + /* for simplicity, we wait for this extent to + * finish before going forward. */ + conflict = osc_extent_get(ext); + break; + } + + found = osc_extent_hold(ext); + break; + } + + /* non-overlapped extent */ + if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) { + /* we can't do anything for a non OES_CACHE extent, or + * if there is someone waiting for this extent to be + * flushed, try next one. */ + ext = next_extent(ext); + continue; + } + + /* check if they belong to the same rpc slot before trying to + * merge. the extents are not overlapped and contiguous at + * chunk level to get here. */ + if (ext->oe_max_end != max_end) { + /* if they don't belong to the same RPC slot or + * max_pages_per_rpc has ever changed, do not merge. */ + ext = next_extent(ext); + continue; + } + + /* check whether maximum extent size will be hit */ + if ((ext_chk_end - ext_chk_start + 1 + 1) << ppc_bits > + cli->cl_max_extent_pages) { + ext = next_extent(ext); + continue; + } + + /* it's required that an extent must be contiguous at chunk + * level so that we know the whole extent is covered by grant + * (the pages in the extent are NOT required to be contiguous). + * Otherwise, it will be too much difficult to know which + * chunks have grants allocated. 
*/ + + /* try to do front merge - extend ext's start */ + if (chunk + 1 == ext_chk_start) { + /* ext must be chunk size aligned */ + EASSERT((ext->oe_start & ~chunk_mask) == 0, ext); + + /* pull ext's start back to cover cur */ + ext->oe_start = cur->oe_start; + ext->oe_grants += chunksize; + LASSERT(*grants >= chunksize); + *grants -= chunksize; + + found = osc_extent_hold(ext); + } else if (chunk == ext_chk_end + 1) { + /* rear merge */ + ext->oe_end = cur->oe_end; + ext->oe_grants += chunksize; + LASSERT(*grants >= chunksize); + *grants -= chunksize; + + /* try to merge with the next one because we just fill + * in a gap */ + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + /* we can save extent tax from next extent */ + *grants += cli->cl_grant_extent_tax; + + found = osc_extent_hold(ext); + } + if (found != NULL) + break; + + ext = next_extent(ext); + } + + osc_extent_tree_dump(D_CACHE, obj); + if (found != NULL) { + LASSERT(conflict == NULL); + if (!IS_ERR(found)) { + LASSERT(found->oe_dlmlock == cur->oe_dlmlock); + OSC_EXTENT_DUMP(D_CACHE, found, + "found caching ext for %lu.\n", index); + } + } else if (conflict == NULL) { + /* create a new extent */ + EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); + cur->oe_grants = chunksize + cli->cl_grant_extent_tax; + LASSERT(*grants >= cur->oe_grants); + *grants -= cur->oe_grants; + + cur->oe_state = OES_CACHE; + found = osc_extent_hold(cur); + osc_extent_insert(obj, cur); + OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", + index, descr->cld_end); + } + osc_object_unlock(obj); + + if (conflict != NULL) { + LASSERT(found == NULL); + + /* waiting for IO to finish. Please notice that it's impossible + * to be an OES_TRUNC extent. */ + rc = osc_extent_wait(env, conflict, OES_INV); + osc_extent_put(env, conflict); + conflict = NULL; + if (rc < 0) + GOTO(out, found = ERR_PTR(rc)); + + goto restart; + } + EXIT; + +out: + osc_extent_put(env, cur); + return found; +} + +/** + * Called when IO is finished to an extent. + */ +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int nr_pages = ext->oe_nr_pages; + int lost_grant = 0; + int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096; + loff_t last_off = 0; + int last_count = -1; + ENTRY; + + OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); + + ext->oe_rc = rc ?: ext->oe_nr_pages; + EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); + + osc_lru_add_batch(cli, &ext->oe_pages); + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + list_del_init(&oap->oap_rpc_item); + list_del_init(&oap->oap_pending_item); + if (last_off <= oap->oap_obj_off) { + last_off = oap->oap_obj_off; + last_count = oap->oap_count; + } + + --ext->oe_nr_pages; + osc_ap_completion(env, cli, oap, sent, rc); + } + EASSERT(ext->oe_nr_pages == 0, ext); + + if (!sent) { + lost_grant = ext->oe_grants; + } else if (blocksize < PAGE_SIZE && + last_count != PAGE_SIZE) { + /* For short writes we shouldn't count parts of pages that + * span a whole chunk on the OST side, or our accounting goes + * wrong. Should match the code in filter_grant_check. 
*/ + int offset = last_off & ~PAGE_MASK; + int count = last_count + (offset & (blocksize - 1)); + int end = (offset + last_count) & (blocksize - 1); + if (end) + count += blocksize - end; + + lost_grant = PAGE_SIZE - count; + } + if (ext->oe_grants > 0) + osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants); + + osc_extent_remove(ext); + /* put the refcount for RPC */ + osc_extent_put(env, ext); + RETURN(0); +} + +static int extent_wait_cb(struct osc_extent *ext, enum osc_extent_state state) +{ + int ret; + + osc_object_lock(ext->oe_obj); + ret = ext->oe_state == state; + osc_object_unlock(ext->oe_obj); + + return ret; +} + +/** + * Wait for the extent's state to become @state. + */ +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + enum osc_extent_state state) +{ + struct osc_object *obj = ext->oe_obj; + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + int rc = 0; + ENTRY; + + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + /* `Kick' this extent only if the caller is waiting for it to be + * written out. */ + if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) { + if (ext->oe_state == OES_ACTIVE) { + ext->oe_urgent = 1; + } else if (ext->oe_state == OES_CACHE) { + ext->oe_urgent = 1; + osc_extent_hold(ext); + rc = 1; + } + } + osc_object_unlock(obj); + if (rc == 1) + osc_extent_release(env, ext); + + /* wait for the extent until its state becomes @state */ + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); + if (rc == -ETIMEDOUT) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s: wait ext to %u timedout, recovery in progress?\n", + cli_name(osc_cli(obj)), state); + + lwi = LWI_INTR(NULL, NULL); + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), + &lwi); + } + if (rc == 0 && ext->oe_rc < 0) + rc = ext->oe_rc; + RETURN(rc); +} + +/** + * Discard pages with index greater than @size. If @ext is overlapped with + * @size, then partial truncate happens. + */ +static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, + bool partial) +{ + struct lu_env *env; + struct cl_io *io; + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + struct pagevec *pvec; + int pages_in_chunk = 0; + int ppc_bits = cli->cl_chunkbits - + PAGE_SHIFT; + __u64 trunc_chunk = trunc_index >> ppc_bits; + int grants = 0; + int nr_pages = 0; + int rc = 0; + __u16 refcheck; + ENTRY; + + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_state == OES_TRUNC); + LASSERT(!ext->oe_urgent); + + /* Request new lu_env. + * We can't use that env from osc_cache_truncate_start() because + * it's from lov_io_sub and not fully initialized. */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = osc_env_thread_io(env); + io->ci_obj = cl_object_top(osc2cl(obj)); + io->ci_ignore_layout = 1; + pvec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pvec, 0); + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc < 0) + GOTO(out, rc); + + /* discard all pages with index greater than trunc_index */ + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_page *page = oap2cl_page(oap); + + LASSERT(list_empty(&oap->oap_rpc_item)); + + /* only discard the pages with their index greater than + * trunc_index, and ... 
*/ + if (index < trunc_index || + (index == trunc_index && partial)) { + /* accounting how many pages remaining in the chunk + * so that we can calculate grants correctly. */ + if (index >> ppc_bits == trunc_chunk) + ++pages_in_chunk; + continue; + } + + list_del_init(&oap->oap_pending_item); + + cl_page_get(page); + lu_ref_add(&page->cp_reference, "truncate", current); + + if (cl_page_own(env, io, page) == 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + LASSERT(0); + } + + lu_ref_del(&page->cp_reference, "truncate", current); + cl_pagevec_put(env, page, pvec); + + --ext->oe_nr_pages; + ++nr_pages; + } + pagevec_release(pvec); + + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, + ext->oe_nr_pages == 0), + ext, "trunc_index %lu, partial %d\n", trunc_index, partial); + + osc_object_lock(obj); + if (ext->oe_nr_pages == 0) { + LASSERT(pages_in_chunk == 0); + grants = ext->oe_grants; + ext->oe_grants = 0; + } else { /* calculate how many grants we can free */ + int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk; + pgoff_t last_index; + + + /* if there is no pages in this chunk, we can also free grants + * for the last chunk */ + if (pages_in_chunk == 0) { + /* if this is the 1st chunk and no pages in this chunk, + * ext->oe_nr_pages must be zero, so we should be in + * the other if-clause. */ + LASSERT(trunc_chunk > 0); + --trunc_chunk; + ++chunks; + } + + /* this is what we can free from this extent */ + grants = chunks << cli->cl_chunkbits; + ext->oe_grants -= grants; + last_index = ((trunc_chunk + 1) << ppc_bits) - 1; + ext->oe_end = min(last_index, ext->oe_max_end); + LASSERT(ext->oe_end >= ext->oe_start); + LASSERT(ext->oe_grants > 0); + } + osc_object_unlock(obj); + + if (grants > 0 || nr_pages > 0) + osc_free_grant(cli, nr_pages, grants, grants); + +out: + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/** + * This function is used to make the extent prepared for transfer. + * A race with flusing page - ll_writepage() has to be handled cautiously. + */ +static int osc_extent_make_ready(const struct lu_env *env, + struct osc_extent *ext) +{ + struct osc_async_page *oap; + struct osc_async_page *last = NULL; + struct osc_object *obj = ext->oe_obj; + unsigned int page_count = 0; + int rc; + ENTRY; + + /* we're going to grab page lock, so object lock must not be taken. */ + LASSERT(sanity_check(ext) == 0); + /* in locking state, any process should not touch this extent. */ + EASSERT(ext->oe_state == OES_LOCKING, ext); + EASSERT(ext->oe_owner != NULL, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); + + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + if (last == NULL || last->oap_obj_off < oap->oap_obj_off) + last = oap; + + /* checking ASYNC_READY is race safe */ + if ((oap->oap_async_flags & ASYNC_READY) != 0) + continue; + + rc = osc_make_ready(env, oap, OBD_BRW_WRITE); + switch (rc) { + case 0: + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY; + spin_unlock(&oap->oap_lock); + break; + case -EALREADY: + LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); + break; + default: + LASSERTF(0, "unknown return code: %d\n", rc); + } + } + + LASSERT(page_count == ext->oe_nr_pages); + LASSERT(last != NULL); + /* the last page is the only one we need to refresh its count by + * the size of file. 
*/ + if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { + int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); + LASSERT(last_oap_count > 0); + LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE); + last->oap_count = last_oap_count; + spin_lock(&last->oap_lock); + last->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&last->oap_lock); + } + + /* for the rest of pages, we don't need to call osf_refresh_count() + * because it's known they are not the last page */ + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { + oap->oap_count = PAGE_SIZE - oap->oap_page_off; + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); + } + } + + osc_object_lock(obj); + osc_extent_state_set(ext, OES_RPC); + osc_object_unlock(obj); + /* get a refcount for RPC. */ + osc_extent_get(ext); + + RETURN(0); +} + +/** + * Quick and simple version of osc_extent_find(). This function is frequently + * called to expand the extent for the same IO. To expand the extent, the + * page index must be in the same or next chunk of ext->oe_end. + */ +static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, + unsigned int *grants) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *next; + int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t chunk = index >> ppc_bits; + pgoff_t end_chunk; + pgoff_t end_index; + unsigned int chunksize = 1 << cli->cl_chunkbits; + int rc = 0; + ENTRY; + + LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + end_chunk = ext->oe_end >> ppc_bits; + if (chunk > end_chunk + 1) + GOTO(out, rc = -ERANGE); + + if (end_chunk >= chunk) + GOTO(out, rc = 0); + + LASSERT(end_chunk + 1 == chunk); + + /* try to expand this extent to cover @index */ + end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); + + /* don't go over the maximum extent size reported by server */ + if (end_index - ext->oe_start + 1 > cli->cl_max_extent_pages) + GOTO(out, rc = -ERANGE); + + next = next_extent(ext); + if (next != NULL && next->oe_start <= end_index) + /* complex mode - overlapped with the next extent, + * this case will be handled by osc_extent_find() */ + GOTO(out, rc = -EAGAIN); + + ext->oe_end = end_index; + ext->oe_grants += chunksize; + LASSERT(*grants >= chunksize); + *grants -= chunksize; + EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, + "overlapped after expanding for %lu.\n", index); + EXIT; + +out: + osc_object_unlock(obj); + RETURN(rc); +} + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line) +{ + struct osc_extent *ext; + int cnt; + + if (!cfs_cdebug_show(level, DEBUG_SUBSYSTEM)) + return; + + CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", + obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); + + /* osc_object_lock(obj); */ + cnt = 1; + for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) + OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); + /* osc_object_unlock(obj); */ +} 
+ +/* ------------------ osc extent end ------------------ */ + +static inline int osc_is_ready(struct osc_object *osc) +{ + return !list_empty(&osc->oo_ready_item) || + !list_empty(&osc->oo_hp_ready_item); +} + +#define OSC_IO_DEBUG(OSC, STR, args...) \ + CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ + (OSC), osc_is_ready(OSC), \ + list_empty_marker(&(OSC)->oo_hp_ready_item), \ + list_empty_marker(&(OSC)->oo_ready_item), \ + atomic_read(&(OSC)->oo_nr_writes), \ + list_empty_marker(&(OSC)->oo_hp_exts), \ + list_empty_marker(&(OSC)->oo_urgent_exts), \ + atomic_read(&(OSC)->oo_nr_reads), \ + list_empty_marker(&(OSC)->oo_reading_exts), \ + ##args) + +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + int result; + + LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ + + ENTRY; + result = cl_page_make_ready(env, page, CRT_WRITE); + if (result == 0) + opg->ops_submit_time = ktime_get(); + RETURN(result); +} + +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_object *obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int result; + loff_t kms; + + /* readpage queues with _COUNT_STABLE, shouldn't get here. */ + LASSERT(!(cmd & OBD_BRW_READ)); + LASSERT(opg != NULL); + obj = opg->ops_cl.cpl_obj; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result < 0) + return result; + kms = attr->cat_kms; + if (cl_offset(obj, index) >= kms) + /* catch race with truncate */ + return 0; + else if (cl_offset(obj, index + 1) > kms) + /* catch sub-page write at end of file */ + return kms % PAGE_SIZE; + else + return PAGE_SIZE; +} + +static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, + int cmd, int rc) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + enum cl_req_type crt; + int srvlock; + + ENTRY; + + cmd &= ~OBD_BRW_NOQUOTA; + LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERT(opg->ops_transfer_pinned); + + crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; + /* Clear opg->ops_transfer_pinned before VM lock is released. */ + opg->ops_transfer_pinned = 0; + + opg->ops_submit_time = ktime_set(0, 0); + srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; + + /* statistic */ + if (rc == 0 && srvlock) { + struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; + struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; + size_t bytes = oap->oap_count; + + if (crt == CRT_READ) + stats->os_lockless_reads += bytes; + else + stats->os_lockless_writes += bytes; + } + + /* + * This has to be the last operation with the page, as locks are + * released in cl_page_completion() and nothing except for the + * reference counter protects page from concurrent reclaim. + */ + lu_ref_del(&page->cp_reference, "transfer", page); + + cl_page_completion(env, page, crt, rc); + cl_page_put(env, page); + + RETURN(0); +} + +#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) 
do { \ + struct client_obd *__tmp = (cli); \ + CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ + "dropped: %ld avail: %ld, dirty_grant: %ld, " \ + "reserved: %ld, flight: %d } lru {in list: %ld, " \ + "left: %ld, waiters: %d }" fmt "\n", \ + cli_name(__tmp), \ + __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages, \ + atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages, \ + __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ + __tmp->cl_dirty_grant, \ + __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \ + atomic_long_read(&__tmp->cl_lru_in_list), \ + atomic_long_read(&__tmp->cl_lru_busy), \ + atomic_read(&__tmp->cl_lru_shrinkers), ##args); \ +} while (0) + +/* caller must hold loi_list_lock */ +static void osc_consume_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + assert_spin_locked(&cli->cl_loi_list_lock); + LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); + cli->cl_dirty_pages++; + pga->flag |= OBD_BRW_FROM_GRANT; + CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", + PAGE_SIZE, pga, pga->pg); + osc_update_next_shrink(cli); +} + +/* the companion to osc_consume_write_grant, called when a brw has completed. + * must be called with the loi lock held. */ +static void osc_release_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + ENTRY; + + assert_spin_locked(&cli->cl_loi_list_lock); + if (!(pga->flag & OBD_BRW_FROM_GRANT)) { + EXIT; + return; + } + + pga->flag &= ~OBD_BRW_FROM_GRANT; + atomic_long_dec(&obd_dirty_pages); + cli->cl_dirty_pages--; + EXIT; +} + +/** + * To avoid sleeping with object lock held, it's good for us allocate enough + * grants before entering into critical section. + * + * client_obd_list_lock held by caller + */ +static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) +{ + int rc = -EDQUOT; + + if (cli->cl_avail_grant >= bytes) { + cli->cl_avail_grant -= bytes; + cli->cl_reserved_grant += bytes; + rc = 0; + } + return rc; +} + +static void __osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + /* it's quite normal for us to get more grant than reserved. + * Thinking about a case that two extents merged by adding a new + * chunk, we can save one extent tax. If extent tax is greater than + * one chunk, we can save more grant by adding a new chunk */ + cli->cl_reserved_grant -= reserved; + if (unused > reserved) { + cli->cl_avail_grant += reserved; + cli->cl_lost_grant += unused - reserved; + cli->cl_dirty_grant -= unused - reserved; + } else { + cli->cl_avail_grant += unused; + cli->cl_dirty_grant += reserved - unused; + } +} + +static void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + spin_lock(&cli->cl_loi_list_lock); + __osc_unreserve_grant(cli, reserved, unused); + if (unused > 0) + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); +} + +/** + * Free grant after IO is finished or canceled. + * + * @lost_grant is used to remember how many grants we have allocated but not + * used, we should return these grants to OST. There're two cases where grants + * can be lost: + * 1. truncate; + * 2. blocksize at OST is less than PAGE_SIZE and a partial page was + * written. In this case OST may use less chunks to serve this partial + * write. OSTs don't actually know the page size on the client side. so + * clients have to calculate lost grant by the blocksize on the OST. + * See filter_grant_check() for details. 
+ */ +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant, unsigned int dirty_grant) +{ + unsigned long grant; + + grant = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + + spin_lock(&cli->cl_loi_list_lock); + atomic_long_sub(nr_pages, &obd_dirty_pages); + cli->cl_dirty_pages -= nr_pages; + cli->cl_lost_grant += lost_grant; + cli->cl_dirty_grant -= dirty_grant; + if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { + /* borrow some grant from truncate to avoid the case that + * truncate uses up all avail grant */ + cli->cl_lost_grant -= grant; + cli->cl_avail_grant += grant; + } + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n", + lost_grant, cli->cl_lost_grant, + cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT, + cli->cl_dirty_grant); +} + +/** + * The companion to osc_enter_cache(), called when @oap is no longer part of + * the dirty accounting due to error. + */ +static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_release_write_grant(cli, &oap->oap_brw_page); + spin_unlock(&cli->cl_loi_list_lock); +} + +/** + * Non-blocking version of osc_enter_cache() that consumes grant only when it + * is available. + */ +static int osc_enter_cache_try(struct client_obd *cli, + struct osc_async_page *oap, + int bytes) +{ + int rc; + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); + + rc = osc_reserve_grant(cli, bytes); + if (rc < 0) + return 0; + + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages) { + if (atomic_long_add_return(1, &obd_dirty_pages) <= + obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + return 1; + } + atomic_long_dec(&obd_dirty_pages); + } + __osc_unreserve_grant(cli, bytes, bytes); + return 0; +} + +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. + */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) +{ + spin_unlock(&cli->cl_loi_list_lock); + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); +} + +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} +/** + * The main entry to reserve dirty page accounting. Usually the grant reserved + * in this function will be freed in bulk in osc_free_grant() unless it fails + * to add osc cache, in that case, it will be freed in osc_exit_cache(). + * + * The process will be put into sleep if it's already run out of grant. + */ +static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int bytes) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; + int remain; + bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. due to + * overloaded OST that takes a long time to process everything, we'd + * get evicted if we wait for a normal obd_timeout or some such. + * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. 
+ * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); + + ENTRY; + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); + + spin_lock(&cli->cl_loi_list_lock); + + /* force the caller to try sync io. this can jump the list + * of queued writes and create a discontiguous rpc stream */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || + cli->cl_dirty_max_pages == 0 || + cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { + OSC_DUMP_GRANT(D_CACHE, cli, "forced sync i/o\n"); + GOTO(out, rc = -EDQUOT); + } + + /* + * We can wait here for two reasons: too many dirty pages in cache, or + * run out of grants. In both cases we should write dirty pages out. + * Adding a cache waiter will trigger urgent write-out no matter what + * RPC size will be. + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. + */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { + OSC_DUMP_GRANT(D_CACHE, cli, + "timeout, fall back to sync i/o\n"); + osc_extent_tree_dump(D_CACHE, osc); + /* fall back to synchronous I/O */ + } else { + OSC_DUMP_GRANT(D_CACHE, cli, + "no grant space, fall back to sync i/o\n"); + wake_up_all(&cli->cl_cache_waiters); + } + EXIT; +out: + spin_unlock(&cli->cl_loi_list_lock); + RETURN(rc); +} + +static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) +{ + int hprpc = !!list_empty(&osc->oo_hp_exts); + return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; +} + +/* This maintains the lists of pending pages to read/write for a given object + * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() + * to quickly find objects that are ready to send an RPC. */ +static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, + int cmd) +{ + int invalid_import = 0; + ENTRY; + + /* if we have an invalid import we want to drain the queued pages + * by forcing them through rpcs that immediately fail and complete + * the pages. recovery relies on this to empty the queued pages + * before canceling the locks and evicting down the llite pages */ + if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) + invalid_import = 1; + + if (cmd & OBD_BRW_WRITE) { + if (atomic_read(&osc->oo_nr_writes) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_hp_exts)) { + CDEBUG(D_CACHE, "high prio request forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_urgent_exts)) { + CDEBUG(D_CACHE, "urgent request forcing RPC\n"); + RETURN(1); + } + /* trigger a write rpc stream as long as there are dirtiers + * waiting for space. as they're waiting, they're not going to + * create more pages to coalesce with what's waiting.. 
+ */ + if (waitqueue_active(&cli->cl_cache_waiters)) { + CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_full_exts)) { + CDEBUG(D_CACHE, "full extent ready, make an RPC\n"); + RETURN(1); + } + } else { + if (atomic_read(&osc->oo_nr_reads) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + /* all read are urgent. */ + if (!list_empty(&osc->oo_reading_exts)) + RETURN(1); + } + + RETURN(0); +} + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta) +{ + struct client_obd *cli = osc_cli(obj); + if (cmd & OBD_BRW_WRITE) { + atomic_add(delta, &obj->oo_nr_writes); + atomic_add(delta, &cli->cl_pending_w_pages); + LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); + } else { + atomic_add(delta, &obj->oo_nr_reads); + atomic_add(delta, &cli->cl_pending_r_pages); + LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); + } + OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); +} + +static int osc_makes_hprpc(struct osc_object *obj) +{ + return !list_empty(&obj->oo_hp_exts); +} + +static void on_list(struct list_head *item, struct list_head *list, + int should_be_on) +{ + if (list_empty(item) && should_be_on) + list_add_tail(item, list); + else if (!list_empty(item) && !should_be_on) + list_del_init(item); +} + +/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc + * can find pages to build into rpcs quickly */ +static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + if (osc_makes_hprpc(osc)) { + /* HP rpc */ + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); + } else { + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, + osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || + osc_makes_rpc(cli, osc, OBD_BRW_READ)); + } + + on_list(&osc->oo_write_item, &cli->cl_loi_write_list, + atomic_read(&osc->oo_nr_writes) > 0); + + on_list(&osc->oo_read_item, &cli->cl_loi_read_list, + atomic_read(&osc->oo_nr_reads) > 0); + + return osc_is_ready(osc); +} + +static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + int is_ready; + + spin_lock(&cli->cl_loi_list_lock); + is_ready = __osc_list_maint(cli, osc); + spin_unlock(&cli->cl_loi_list_lock); + + return is_ready; +} + +/* this is trying to propogate async writeback errors back up to the + * application. As an async write fails we record the error code for later if + * the app does an fsync. As long as errors persist we force future rpcs to be + * sync so that the app can get a sync error and break the cycle of queueing + * pages for which writeback will fail. 
*/ +static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, + int rc) +{ + if (rc) { + if (!ar->ar_rc) + ar->ar_rc = rc; + + ar->ar_force_sync = 1; + ar->ar_min_xid = ptlrpc_sample_next_xid(); + return; + + } + + if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) + ar->ar_force_sync = 0; +} + +/* this must be called holding the loi list lock to give coverage to exit_cache, + * async_flag maintenance, and oap_request */ +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + __u64 xid = 0; + + ENTRY; + if (oap->oap_request != NULL) { + xid = ptlrpc_req_xid(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + + /* As the transfer for this page is being done, clear the flags */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags = 0; + spin_unlock(&oap->oap_lock); + oap->oap_interrupted = 0; + + if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { + spin_lock(&cli->cl_loi_list_lock); + osc_process_ar(&cli->cl_ar, xid, rc); + osc_process_ar(&loi->loi_ar, xid, rc); + spin_unlock(&cli->cl_loi_list_lock); + } + + rc = osc_completion(env, oap, oap->oap_cmd, rc); + if (rc) + CERROR("completion on oap %p obj %p returns %d.\n", + oap, osc, rc); + + EXIT; +} + +struct extent_rpc_data { + struct list_head *erd_rpc_list; + unsigned int erd_page_count; + unsigned int erd_max_pages; + unsigned int erd_max_chunks; + unsigned int erd_max_extents; +}; + +static inline unsigned osc_extent_chunks(const struct osc_extent *ext) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + + return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1; +} + +/** + * Try to add extent to one RPC. 
We need to think about the following things: + * - # of pages must not be over max_pages_per_rpc + * - extent must be compatible with previous ones + */ +static int try_to_add_extent_for_io(struct client_obd *cli, + struct osc_extent *ext, + struct extent_rpc_data *data) +{ + struct osc_extent *tmp; + unsigned int chunk_count; + struct osc_async_page *oap = list_first_entry(&ext->oe_pages, + struct osc_async_page, + oap_pending_item); + ENTRY; + + EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), + ext); + OSC_EXTENT_DUMP(D_CACHE, ext, "trying to add this extent\n"); + + if (data->erd_max_extents == 0) + RETURN(0); + + chunk_count = osc_extent_chunks(ext); + EASSERTF(data->erd_page_count != 0 || + chunk_count <= data->erd_max_chunks, ext, + "The first extent to be fit in a RPC contains %u chunks, " + "which is over the limit %u.\n", chunk_count, + data->erd_max_chunks); + if (chunk_count > data->erd_max_chunks) + RETURN(0); + + data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages); + EASSERTF(data->erd_page_count != 0 || + ext->oe_nr_pages <= data->erd_max_pages, ext, + "The first extent to be fit in a RPC contains %u pages, " + "which is over the limit %u.\n", ext->oe_nr_pages, + data->erd_max_pages); + if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages) + RETURN(0); + + list_for_each_entry(tmp, data->erd_rpc_list, oe_link) { + struct osc_async_page *oap2; + oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page, + oap_pending_item); + EASSERT(tmp->oe_owner == current, tmp); +#if 0 + if (overlapped(tmp, ext)) { + OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); + EASSERT(0, ext); + } +#endif + if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) { + CDEBUG(D_CACHE, "Do not permit different types of IO " + "in one RPC\n"); + RETURN(0); + } + + if (tmp->oe_srvlock != ext->oe_srvlock || + !tmp->oe_grants != !ext->oe_grants || + tmp->oe_ndelay != ext->oe_ndelay || + tmp->oe_no_merge || ext->oe_no_merge) + RETURN(0); + + /* remove break for strict check */ + break; + } + + data->erd_max_extents--; + data->erd_max_chunks -= chunk_count; + data->erd_page_count += ext->oe_nr_pages; + list_move_tail(&ext->oe_link, data->erd_rpc_list); + ext->oe_owner = current; + RETURN(1); +} + +static inline unsigned osc_max_write_chunks(const struct client_obd *cli) +{ + /* + * LU-8135: + * + * The maximum size of a single transaction is about 64MB in ZFS. + * #define DMU_MAX_ACCESS (64 * 1024 * 1024) + * + * Since ZFS is a copy-on-write file system, a single dirty page in + * a chunk will result in the rewrite of the whole chunk, therefore + * an RPC shouldn't be allowed to contain too many chunks otherwise + * it will make transaction size much bigger than 64MB, especially + * with big block size for ZFS. + * + * This piece of code is to make sure that OSC won't send write RPCs + * with too many chunks. The maximum chunk size that an RPC can cover + * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally + * OST should tell the client what the biggest transaction size is, + * but it's good enough for now. + * + * This limitation doesn't apply to ldiskfs, which allows as many + * chunks in one RPC as we want. However, it won't have any benefits + * to have too many discontiguous pages in one RPC. + * + * An osc_extent won't cover over a RPC size, so the chunks in an + * osc_extent won't bigger than PTLRPC_MAX_BRW_SIZE >> chunkbits. 
+ */ + return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits; +} + +/** + * In order to prevent multiple ptlrpcd from breaking contiguous extents, + * get_write_extent() takes all appropriate extents in atomic. + * + * The following policy is used to collect extents for IO: + * 1. Add as many HP extents as possible; + * 2. Add the first urgent extent in urgent extent list and take it out of + * urgent list; + * 3. Add subsequent extents of this urgent extent; + * 4. If urgent list is not empty, goto 2; + * 5. Traverse the extent tree from the 1st extent; + * 6. Above steps exit if there is no space in this RPC. + */ +static unsigned int get_write_extents(struct osc_object *obj, + struct list_head *rpclist) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct extent_rpc_data data = { + .erd_rpc_list = rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = osc_max_write_chunks(cli), + .erd_max_extents = 256, + }; + + LASSERT(osc_object_is_locked(obj)); + while (!list_empty(&obj->oo_hp_exts)) { + ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, + oe_link); + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + while (!list_empty(&obj->oo_urgent_exts)) { + ext = list_entry(obj->oo_urgent_exts.next, + struct osc_extent, oe_link); + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + /* One key difference between full extents and other extents: full + * extents can usually only be added if the rpclist was empty, so if we + * can't add one, we continue on to trying to add normal extents. This + * is so we don't miss adding extra extents to an RPC containing high + * priority or urgent extents. 
*/ + while (!list_empty(&obj->oo_full_exts)) { + ext = list_entry(obj->oo_full_exts.next, + struct osc_extent, oe_link); + if (!try_to_add_extent_for_io(cli, ext, &data)) + break; + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + ext = first_extent(obj); + while (ext != NULL) { + if ((ext->oe_state != OES_CACHE) || + /* this extent may be already in current rpclist */ + (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { + ext = next_extent(ext); + continue; + } + + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + + ext = next_extent(ext); + } + return data.erd_page_count; +} + +static int +osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +__must_hold(osc) +{ + struct list_head rpclist = LIST_HEAD_INIT(rpclist); + struct osc_extent *ext; + struct osc_extent *tmp; + struct osc_extent *first = NULL; + unsigned int page_count = 0; + int srvlock = 0; + int rc = 0; + ENTRY; + + LASSERT(osc_object_is_locked(osc)); + + page_count = get_write_extents(osc, &rpclist); + LASSERT(equi(page_count == 0, list_empty(&rpclist))); + + if (list_empty(&rpclist)) + RETURN(0); + + osc_update_pending(osc, OBD_BRW_WRITE, -page_count); + + list_for_each_entry(ext, &rpclist, oe_link) { + LASSERT(ext->oe_state == OES_CACHE || + ext->oe_state == OES_LOCK_DONE); + if (ext->oe_state == OES_CACHE) + osc_extent_state_set(ext, OES_LOCKING); + else + osc_extent_state_set(ext, OES_RPC); + } + + /* we're going to grab page lock, so release object lock because + * lock order is page lock -> object lock. */ + osc_object_unlock(osc); + + list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { + if (ext->oe_state == OES_LOCKING) { + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + continue; + } + } + if (first == NULL) { + first = ext; + srvlock = ext->oe_srvlock; + } else { + LASSERT(srvlock == ext->oe_srvlock); + } + } + + if (!list_empty(&rpclist)) { + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE); + LASSERT(list_empty(&rpclist)); + } + + osc_object_lock(osc); + RETURN(rc); +} + +/** + * prepare pages for ASYNC io and put pages in send queue. + * + * \param cmd OBD_BRW_* macroses + * \param lop pending pages + * + * \return zero if no page added to send queue. + * \return 1 if pages successfully added to send queue. + * \return negative on errors. 
+ */ +static int +osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +__must_hold(osc) +{ + struct osc_extent *ext; + struct osc_extent *next; + struct list_head rpclist = LIST_HEAD_INIT(rpclist); + struct extent_rpc_data data = { + .erd_rpc_list = &rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = UINT_MAX, + .erd_max_extents = UINT_MAX, + }; + int rc = 0; + ENTRY; + + LASSERT(osc_object_is_locked(osc)); + list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &data)) + break; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + LASSERT(data.erd_page_count <= data.erd_max_pages); + + osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count); + + if (!list_empty(&rpclist)) { + osc_object_unlock(osc); + + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ); + LASSERT(list_empty(&rpclist)); + + osc_object_lock(osc); + } + RETURN(rc); +} + +#define list_to_obj(list, item) ({ \ + struct list_head *__tmp = (list)->next; \ + list_del_init(__tmp); \ + list_entry(__tmp, struct osc_object, oo_##item); \ +}) + +/* This is called by osc_check_rpcs() to find which objects have pages that + * we could be sending. These lists are maintained by osc_makes_rpc(). */ +static struct osc_object *osc_next_obj(struct client_obd *cli) +{ + ENTRY; + + /* First return objects that have blocked locks so that they + * will be flushed quickly and other clients can get the lock, + * then objects which have pages ready to be stuffed into RPCs */ + if (!list_empty(&cli->cl_loi_hp_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item)); + if (!list_empty(&cli->cl_loi_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item)); + + /* then if we have cache waiters, return all objects with queued + * writes. 
This is especially important when many small files + * have filled up the cache and not been fired into rpcs because + * they don't pass the nr_pending/object threshhold + */ + if (waitqueue_active(&cli->cl_cache_waiters) && + !list_empty(&cli->cl_loi_write_list)) + RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); + + /* then return all queued objects when we have an invalid import + * so that they get flushed */ + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { + if (!list_empty(&cli->cl_loi_write_list)) + RETURN(list_to_obj(&cli->cl_loi_write_list, + write_item)); + if (!list_empty(&cli->cl_loi_read_list)) + RETURN(list_to_obj(&cli->cl_loi_read_list, + read_item)); + } + RETURN(NULL); +} + +/* called with the loi list lock held */ +static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) +__must_hold(&cli->cl_loi_list_lock) +{ + struct osc_object *osc; + int rc = 0; + ENTRY; + + while ((osc = osc_next_obj(cli)) != NULL) { + struct cl_object *obj = osc2cl(osc); + struct lu_ref_link link; + + OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); + + /* even if we have reached our max in flight RPCs, we still + * allow all high-priority RPCs through to prevent their + * starvation and leading to server evicting us for not + * writing out pages in a timely manner LU-13131 */ + if (osc_max_rpc_in_flight(cli, osc) && + list_empty(&osc->oo_hp_exts)) { + __osc_list_maint(cli, osc); + break; + } + + cl_object_get(obj); + spin_unlock(&cli->cl_loi_list_lock); + lu_object_ref_add_at(&obj->co_lu, &link, "check", current); + + /* attempt some read/write balancing by alternating between + * reads and writes in an object. The makes_rpc checks here + * would be redundant if we were getting read/write work items + * instead of objects. we don't want send_oap_rpc to drain a + * partial read pending queue when we're given this object to + * do io on writes while there are cache waiters */ + osc_object_lock(osc); + if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { + rc = osc_send_write_rpc(env, cli, osc); + if (rc < 0) { + CERROR("Write request failed with %d\n", rc); + + /* osc_send_write_rpc failed, mostly because of + * memory pressure. + * + * It can't break here, because if: + * - a page was submitted by osc_io_submit, so + * page locked; + * - no request in flight + * - no subsequent request + * The system will be in live-lock state, + * because there is no chance to call + * osc_io_unplug() and osc_check_rpcs() any + * more. pdflush can't help in this case, + * because it might be blocked at grabbing + * the page lock as we mentioned. + * + * Anyway, continue to drain pages. 
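The list-priority policy that osc_next_obj() above implements can be condensed into a small user-space sketch. struct cli_state and enum src_list below are hypothetical stand-ins for the relevant client_obd fields, not kernel types; the sketch only mirrors the selection order described in the comments.

#include <stdbool.h>
#include <stdio.h>

enum src_list { SRC_NONE, SRC_HP_READY, SRC_READY, SRC_WRITE, SRC_READ };

struct cli_state {                  /* stand-in for the relevant client_obd state */
	bool hp_ready_nonempty;     /* objects with blocked (high-priority) locks */
	bool ready_nonempty;        /* objects with pages ready to go into RPCs   */
	bool write_nonempty;        /* objects with queued writes                 */
	bool read_nonempty;         /* objects with queued reads                  */
	bool cache_waiters;         /* someone is waiting on dirty-cache space    */
	bool import_invalid;        /* the OST import is invalid                  */
};

static enum src_list pick_next(const struct cli_state *c)
{
	if (c->hp_ready_nonempty)             /* 1. flush blocked locks first         */
		return SRC_HP_READY;
	if (c->ready_nonempty)                /* 2. then objects ready for IO         */
		return SRC_READY;
	if (c->cache_waiters && c->write_nonempty)
		return SRC_WRITE;             /* 3. free dirty cache for the waiters  */
	if (c->import_invalid) {              /* 4. drain everything on a dead import */
		if (c->write_nonempty)
			return SRC_WRITE;
		if (c->read_nonempty)
			return SRC_READ;
	}
	return SRC_NONE;
}

int main(void)
{
	struct cli_state c = { .ready_nonempty = true, .cache_waiters = true,
			       .write_nonempty = true };
	printf("picked list %d\n", pick_next(&c));    /* prints 2 (SRC_READY) */
	return 0;
}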
*/ + /* break; */ + } + } + if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { + rc = osc_send_read_rpc(env, cli, osc); + if (rc < 0) + CERROR("Read request failed with %d\n", rc); + } + osc_object_unlock(osc); + + osc_list_maint(cli, osc); + lu_object_ref_del_at(&obj->co_lu, &link, "check", current); + cl_object_put(env, obj); + + spin_lock(&cli->cl_loi_list_lock); + } +} + +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async) +{ + int rc = 0; + + if (osc != NULL && osc_list_maint(cli, osc) == 0) + return 0; + + if (!async) { + spin_lock(&cli->cl_loi_list_lock); + osc_check_rpcs(env, cli); + spin_unlock(&cli->cl_loi_list_lock); + } else { + CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); + LASSERT(cli->cl_writeback_work != NULL); + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + } + return rc; +} +EXPORT_SYMBOL(osc_io_unplug0); + +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset) +{ + struct obd_export *exp = osc_export(osc); + struct osc_async_page *oap = &ops->ops_oap; + ENTRY; + + if (!page) + return cfs_size_round(sizeof(*oap)); + + oap->oap_magic = OAP_MAGIC; + oap->oap_cli = &exp->exp_obd->u.cli; + oap->oap_obj = osc; + + oap->oap_page = page; + oap->oap_obj_off = offset; + LASSERT(!(offset & ~PAGE_MASK)); + + INIT_LIST_HEAD(&oap->oap_pending_item); + INIT_LIST_HEAD(&oap->oap_rpc_item); + + spin_lock_init(&oap->oap_lock); + CDEBUG(D_INFO, "oap %p page %p obj off %llu\n", + oap, page, oap->oap_obj_off); + RETURN(0); +} +EXPORT_SYMBOL(osc_prep_async_page); + +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_extent *ext = NULL; + struct osc_async_page *oap = &ops->ops_oap; + struct client_obd *cli = oap->oap_cli; + struct osc_object *osc = oap->oap_obj; + pgoff_t index; + unsigned int tmp; + unsigned int grants = 0; + u32 brw_flags = OBD_BRW_ASYNC; + int cmd = OBD_BRW_WRITE; + int need_release = 0; + int rc = 0; + ENTRY; + + if (oap->oap_magic != OAP_MAGIC) + RETURN(-EINVAL); + + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) + RETURN(-EIO); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) + RETURN(-EBUSY); + + /* Set the OBD_BRW_SRVLOCK before the page is queued. */ + brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; + if (oio->oi_cap_sys_resource || io->ci_noquota) { + brw_flags |= OBD_BRW_NOQUOTA; + cmd |= OBD_BRW_NOQUOTA; + } + + /* check if the file's owner/group is over quota */ + if (!(cmd & OBD_BRW_NOQUOTA)) { + struct cl_object *obj; + struct cl_attr *attr; + unsigned int qid[LL_MAXQUOTAS]; + + obj = cl_object_top(&osc->oo_cl); + attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + qid[USRQUOTA] = attr->cat_uid; + qid[GRPQUOTA] = attr->cat_gid; + qid[PRJQUOTA] = attr->cat_projid; + if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) + rc = -EDQUOT; + if (rc) + RETURN(rc); + } + + oap->oap_cmd = cmd; + oap->oap_page_off = ops->ops_from; + oap->oap_count = ops->ops_to - ops->ops_from; + /* No need to hold a lock here, + * since this page is not in any list yet. 
*/ + oap->oap_async_flags = 0; + oap->oap_brw_flags = brw_flags; + + OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", + oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); + + index = osc_index(oap2osc(oap)); + + /* Add this page into extent by the following steps: + * 1. if there exists an active extent for this IO, mostly this page + * can be added to the active extent and sometimes we need to + * expand extent to accomodate this page; + * 2. otherwise, a new extent will be allocated. */ + + ext = oio->oi_active; + if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { + /* one chunk plus extent overhead must be enough to write this + * page */ + grants = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + if (ext->oe_end >= index) + grants = 0; + + /* it doesn't need any grant to dirty this page */ + spin_lock(&cli->cl_loi_list_lock); + rc = osc_enter_cache_try(cli, oap, grants); + spin_unlock(&cli->cl_loi_list_lock); + if (rc == 0) { /* try failed */ + grants = 0; + need_release = 1; + } else if (ext->oe_end < index) { + tmp = grants; + /* try to expand this extent */ + rc = osc_extent_expand(ext, index, &tmp); + if (rc < 0) { + need_release = 1; + /* don't free reserved grant */ + } else { + OSC_EXTENT_DUMP(D_CACHE, ext, + "expanded for %lu.\n", index); + osc_unreserve_grant(cli, grants, tmp); + grants = 0; + } + } + rc = 0; + } else if (ext != NULL) { + /* index is located outside of active extent */ + need_release = 1; + } + if (need_release) { + osc_extent_release(env, ext); + oio->oi_active = NULL; + ext = NULL; + } + + if (ext == NULL) { + tmp = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + + /* try to find new extent to cover this page */ + LASSERT(oio->oi_active == NULL); + /* we may have allocated grant for this page if we failed + * to expand the previous active extent. */ + LASSERT(ergo(grants > 0, grants >= tmp)); + + rc = 0; + if (grants == 0) { + /* we haven't allocated grant for this page. 
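Both grant-reservation paths above size the reservation as one full chunk plus the per-extent grant tax. A short worked example, assuming a 64 KiB chunk and a 4 KiB tax (the real cl_chunkbits and cl_grant_extent_tax values are negotiated with the OST at connect time):

#include <stdio.h>

int main(void)
{
	unsigned int cl_chunkbits = 16;           /* 64 KiB chunks (assumed value)       */
	unsigned int cl_grant_extent_tax = 4096;  /* per-extent overhead (assumed value) */
	int page_covered_by_extent_end = 0;       /* the page extends the active extent  */

	/* one full chunk plus the extent tax, unless the extent already covers
	 * this page, in which case no extra grant is needed to dirty it */
	unsigned int grants = page_covered_by_extent_end ?
		0 : (1u << cl_chunkbits) + cl_grant_extent_tax;

	printf("grants to reserve: %u bytes\n", grants);   /* 69632 */
	return 0;
}

If osc_enter_cache_try() cannot reserve that much, the code above releases the active extent and retries through the slow path, calling osc_enter_cache() before allocating a new extent with osc_extent_find().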
*/ + rc = osc_enter_cache(env, cli, oap, tmp); + if (rc == 0) + grants = tmp; + } + + tmp = grants; + if (rc == 0) { + ext = osc_extent_find(env, osc, index, &tmp); + if (IS_ERR(ext)) { + LASSERT(tmp == grants); + osc_exit_cache(cli, oap); + rc = PTR_ERR(ext); + ext = NULL; + } else { + oio->oi_active = ext; + } + } + if (grants > 0) + osc_unreserve_grant(cli, grants, tmp); + } + + LASSERT(ergo(rc == 0, ext != NULL)); + if (ext != NULL) { + EASSERTF(ext->oe_end >= index && ext->oe_start <= index, + ext, "index = %lu.\n", index); + LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); + + osc_object_lock(osc); + if (ext->oe_nr_pages == 0) + ext->oe_srvlock = ops->ops_srvlock; + else + LASSERT(ext->oe_srvlock == ops->ops_srvlock); + ++ext->oe_nr_pages; + list_add_tail(&oap->oap_pending_item, &ext->oe_pages); + osc_object_unlock(osc); + + if (!ext->oe_layout_version) + ext->oe_layout_version = io->ci_layout_version; + } + + RETURN(rc); +} + +int osc_teardown_async_page(const struct lu_env *env, + struct osc_object *obj, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + int rc = 0; + ENTRY; + + LASSERT(oap->oap_magic == OAP_MAGIC); + + CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", + oap, ops, osc_index(oap2osc(oap))); + + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); + rc = -EBUSY; + } else if (!list_empty(&oap->oap_pending_item)) { + struct osc_extent *ext = NULL; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, osc_index(oap2osc(oap))); + osc_object_unlock(obj); + /* only truncated pages are allowed to be taken out. + * See osc_extent_truncate() and osc_cache_truncate_start() + * for details. */ + if (ext != NULL && ext->oe_state != OES_TRUNC) { + OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", + osc_index(oap2osc(oap))); + rc = -EBUSY; + } + if (ext != NULL) + osc_extent_put(env, ext); + } + RETURN(rc); +} + +/** + * This is called when a page is picked up by kernel to write out. + * + * We should find out the corresponding extent and add the whole extent + * into urgent list. The extent may be being truncated or used, handle it + * carefully. + */ +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_extent *ext = NULL; + struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); + struct cl_page *cp = ops->ops_cl.cpl_page; + pgoff_t index = osc_index(ops); + struct osc_async_page *oap = &ops->ops_oap; + bool unplug = false; + int rc = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, index); + if (ext == NULL) { + osc_extent_tree_dump(D_ERROR, obj); + LASSERTF(0, "page index %lu is NOT covered.\n", index); + } + + switch (ext->oe_state) { + case OES_RPC: + case OES_LOCK_DONE: + CL_PAGE_DEBUG(D_ERROR, env, cp, "flush an in-rpc page?\n"); + LASSERT(0); + break; + case OES_LOCKING: + /* If we know this extent is being written out, we should abort + * so that the writer can make this page ready. Otherwise, there + * exists a deadlock problem because other process can wait for + * page writeback bit holding page lock; and meanwhile in + * vvp_page_make_ready(), we need to grab page lock before + * really sending the RPC. */ + case OES_TRUNC: + /* race with truncate, page will be redirtied */ + case OES_ACTIVE: + /* The extent is active so we need to abort and let the caller + * re-dirty the page. 
If we continued on here, and we were the + * one making the extent active, we could deadlock waiting for + * the page writeback to clear but it won't because the extent + * is active and won't be written out. */ + GOTO(out, rc = -EAGAIN); + default: + break; + } + + rc = cl_page_prep(env, io, cp, CRT_WRITE); + if (rc) + GOTO(out, rc); + + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; + spin_unlock(&oap->oap_lock); + + if (memory_pressure_get()) + ext->oe_memalloc = 1; + + ext->oe_urgent = 1; + if (ext->oe_state == OES_CACHE) { + OSC_EXTENT_DUMP(D_CACHE, ext, + "flush page %p make it urgent.\n", oap); + if (list_empty(&ext->oe_link)) + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + rc = 0; + EXIT; + +out: + osc_object_unlock(obj); + osc_extent_put(env, ext); + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + return rc; +} + +/** + * this is called when a sync waiter receives an interruption. Its job is to + * get the caller woken as soon as possible. If its page hasn't been put in an + * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as + * desiring interruption which will forcefully complete the rpc once the rpc + * has timed out. + */ +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + struct osc_object *obj = oap->oap_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *found = NULL; + struct list_head *plist; + pgoff_t index = osc_index(ops); + int rc = -EBUSY; + int cmd; + ENTRY; + + LASSERT(!oap->oap_interrupted); + oap->oap_interrupted = 1; + + /* Find out the caching extent */ + osc_object_lock(obj); + if (oap->oap_cmd & OBD_BRW_WRITE) { + plist = &obj->oo_urgent_exts; + cmd = OBD_BRW_WRITE; + } else { + plist = &obj->oo_reading_exts; + cmd = OBD_BRW_READ; + } + list_for_each_entry(ext, plist, oe_link) { + if (ext->oe_start <= index && ext->oe_end >= index) { + LASSERT(ext->oe_state == OES_LOCK_DONE); + /* For OES_LOCK_DONE state extent, it has already held + * a refcount for RPC. */ + found = osc_extent_get(ext); + break; + } + } + if (found != NULL) { + list_del_init(&found->oe_link); + osc_update_pending(obj, cmd, -found->oe_nr_pages); + osc_object_unlock(obj); + + osc_extent_finish(env, found, 0, -EINTR); + osc_extent_put(env, found); + rc = 0; + } else { + osc_object_unlock(obj); + /* ok, it's been put in an rpc. 
only one oap gets a request + * reference */ + if (oap->oap_request != NULL) { + ptlrpc_mark_interrupted(oap->oap_request); + ptlrpcd_wake(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + } + + osc_list_maint(cli, obj); + RETURN(rc); +} + +int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_async_page *oap; + int page_count = 0; + int mppr = cli->cl_max_pages_per_rpc; + bool can_merge = true; + pgoff_t start = CL_PAGE_EOF; + pgoff_t end = 0; + ENTRY; + + list_for_each_entry(oap, list, oap_pending_item) { + struct osc_page *opg = oap2osc_page(oap); + pgoff_t index = osc_index(opg); + + if (index > end) + end = index; + if (index < start) + start = index; + ++page_count; + mppr <<= (page_count > mppr); + + if (unlikely(opg->ops_from > 0 || opg->ops_to < PAGE_SIZE)) + can_merge = false; + } + + ext = osc_extent_alloc(obj); + if (ext == NULL) { + struct osc_async_page *tmp; + + list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { + list_del_init(&oap->oap_pending_item); + osc_ap_completion(env, cli, oap, 0, -ENOMEM); + } + RETURN(-ENOMEM); + } + + ext->oe_rw = !!(brw_flags & OBD_BRW_READ); + ext->oe_sync = 1; + ext->oe_no_merge = !can_merge; + ext->oe_urgent = 1; + ext->oe_start = start; + ext->oe_end = ext->oe_max_end = end; + ext->oe_obj = obj; + ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY); + if (brw_flags & OBD_BRW_NOCACHE && !ext->oe_rw) { /* direct io write */ + int grants; + int ppc; + + ppc = 1 << (cli->cl_chunkbits - PAGE_SHIFT); + grants = cli->cl_grant_extent_tax; + grants += (1 << cli->cl_chunkbits) * + ((page_count + ppc - 1) / ppc); + + spin_lock(&cli->cl_loi_list_lock); + if (osc_reserve_grant(cli, grants) == 0) { + list_for_each_entry(oap, list, oap_pending_item) { + osc_consume_write_grant(cli, + &oap->oap_brw_page); + atomic_long_inc(&obd_dirty_pages); + } + __osc_unreserve_grant(cli, grants, 0); + ext->oe_grants = grants; + } + spin_unlock(&cli->cl_loi_list_lock); + } + ext->oe_nr_pages = page_count; + ext->oe_mppr = mppr; + list_splice_init(list, &ext->oe_pages); + ext->oe_layout_version = io->ci_layout_version; + + osc_object_lock(obj); + /* Reuse the initial refcount for RPC, don't drop it */ + osc_extent_state_set(ext, OES_LOCK_DONE); + if (!ext->oe_rw) { /* write */ + if (!ext->oe_srvlock) { + /* The most likely case here is from lack of grants + * so we are either out of quota or out of space. + * Since this means we are holding locks across + * potentially multi-striped IO, we must send out + * everything out instantly to avoid prolonged + * waits resulting in lock eviction (likely since + * the extended wait in osc_cache_enter() did not + * yield any additional grant due to a timeout. + * LU-13131 */ + ext->oe_hp = 1; + list_add_tail(&ext->oe_link, &obj->oo_hp_exts); + } else { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + } + osc_update_pending(obj, OBD_BRW_WRITE, page_count); + } else { + list_add_tail(&ext->oe_link, &obj->oo_reading_exts); + osc_update_pending(obj, OBD_BRW_READ, page_count); + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, cli, obj); + RETURN(0); +} + +/** + * Called by osc_io_setattr_start() to freeze and destroy covering extents. 
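The function below starts by converting the new byte size into the first page index to truncate and a flag saying whether that first page is only partially truncated. A minimal sketch of that conversion, assuming cl_index()/cl_offset() reduce to plain page-shift arithmetic for this object:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SHIFT 12   /* 4 KiB pages (assumed) */

int main(void)
{
	uint64_t size = 10000;                        /* new file size in bytes       */
	uint64_t index = size >> PAGE_SHIFT;          /* first page index to truncate */
	bool partial = size > (index << PAGE_SHIFT);  /* size not page aligned?       */

	/* pages with index >= 'index' are truncated; when 'partial' is set the
	 * page at 'index' keeps its head bytes and only its tail is dropped */
	printf("index=%llu partial=%d\n",
	       (unsigned long long)index, (int)partial);  /* index=2 partial=1 */
	return 0;
}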
+ */ +int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, + __u64 size, struct osc_extent **extp) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *waiting = NULL; + pgoff_t index; + struct list_head list = LIST_HEAD_INIT(list); + int result = 0; + bool partial; + ENTRY; + + /* pages with index greater or equal to index will be truncated. */ + index = cl_index(osc2cl(obj), size); + partial = size > cl_offset(osc2cl(obj), index); + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + EASSERT(ext->oe_state != OES_TRUNC, ext); + + if (ext->oe_state > OES_CACHE || ext->oe_urgent) { + /* if ext is in urgent state, it means there must exist + * a page already having been flushed by write_page(). + * We have to wait for this extent because we can't + * truncate that page. */ + OSC_EXTENT_DUMP(D_CACHE, ext, + "waiting for busy extent\n"); + waiting = osc_extent_get(ext); + break; + } + + OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size); + + osc_extent_get(ext); + if (ext->oe_state == OES_ACTIVE) { + /* though we grab inode mutex for write path, but we + * release it before releasing extent(in osc_io_end()), + * so there is a race window that an extent is still + * in OES_ACTIVE when truncate starts. */ + LASSERT(!ext->oe_trunc_pending); + ext->oe_trunc_pending = 1; + } else { + EASSERT(ext->oe_state == OES_CACHE, ext); + osc_extent_state_set(ext, OES_TRUNC); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + /* This extent could be on the full extents list, that's OK */ + EASSERT(!ext->oe_hp && !ext->oe_urgent, ext); + if (!list_empty(&ext->oe_link)) + list_move_tail(&ext->oe_link, &list); + else + list_add_tail(&ext->oe_link, &list); + + ext = next_extent(ext); + } + osc_object_unlock(obj); + + osc_list_maint(cli, obj); + + while (!list_empty(&list)) { + int rc; + + ext = list_entry(list.next, struct osc_extent, oe_link); + list_del_init(&ext->oe_link); + + /* extent may be in OES_ACTIVE state because inode mutex + * is released before osc_io_end() in file write case */ + if (ext->oe_state != OES_TRUNC) + osc_extent_wait(env, ext, OES_TRUNC); + + rc = osc_extent_truncate(ext, index, partial); + if (rc < 0) { + if (result == 0) + result = rc; + + OSC_EXTENT_DUMP(D_ERROR, ext, + "truncate error %d\n", rc); + } else if (ext->oe_nr_pages == 0) { + osc_extent_remove(ext); + } else { + /* this must be an overlapped extent which means only + * part of pages in this extent have been truncated. + */ + EASSERTF(ext->oe_start <= index, ext, + "trunc index = %lu/%d.\n", index, partial); + /* fix index to skip this partially truncated extent */ + index = ext->oe_end + 1; + partial = false; + + /* we need to hold this extent in OES_TRUNC state so + * that no writeback will happen. This is to avoid + * BUG 17397. + * Only partial truncate can reach here, if @size is + * not zero, the caller should provide a valid @extp. */ + LASSERT(*extp == NULL); + *extp = osc_extent_get(ext); + OSC_EXTENT_DUMP(D_CACHE, ext, + "trunc at %llu\n", size); + } + osc_extent_put(env, ext); + } + if (waiting != NULL) { + int rc; + + /* ignore the result of osc_extent_wait the write initiator + * should take care of it. 
*/ + rc = osc_extent_wait(env, waiting, OES_INV); + if (rc < 0) + OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); + + osc_extent_put(env, waiting); + waiting = NULL; + goto again; + } + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_truncate_start); + +/** + * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. + */ +void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext) +{ + if (ext != NULL) { + struct osc_object *obj = ext->oe_obj; + bool unplug = false; + + EASSERT(ext->oe_nr_pages > 0, ext); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); + osc_object_lock(obj); + osc_extent_state_set(ext, OES_CACHE); + if (ext->oe_fsync_wait && !ext->oe_urgent) { + ext->oe_urgent = 1; + list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); + osc_object_unlock(obj); + osc_extent_put(env, ext); + + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + } +} + +/** + * Wait for extents in a specific range to be written out. + * The caller must have called osc_cache_writeback_range() to issue IO + * otherwise it will take a long time for this function to finish. + * + * Caller must hold inode_mutex , or cancel exclusive dlm lock so that + * nobody else can dirty this range of file while we're waiting for + * extents to be written. + */ +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end) +{ + struct osc_extent *ext; + pgoff_t index = start; + int result = 0; + ENTRY; + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + int rc; + + if (ext->oe_start > end) + break; + + if (!ext->oe_fsync_wait) { + ext = next_extent(ext); + continue; + } + + EASSERT(ergo(ext->oe_state == OES_CACHE, + ext->oe_hp || ext->oe_urgent), ext); + EASSERT(ergo(ext->oe_state == OES_ACTIVE, + !ext->oe_hp && ext->oe_urgent), ext); + + index = ext->oe_end + 1; + osc_extent_get(ext); + osc_object_unlock(obj); + + rc = osc_extent_wait(env, ext, OES_INV); + if (result == 0) + result = rc; + osc_extent_put(env, ext); + goto again; + } + osc_object_unlock(obj); + + OSC_IO_DEBUG(obj, "sync file range.\n"); + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_wait_range); + +/** + * Called to write out a range of osc object. + * + * @hp : should be set this is caused by lock cancel; + * @discard: is set if dirty pages should be dropped - file will be deleted or + * truncated, this implies there is no partially discarding extents. + * + * Return how many pages will be issued, or error code if error occurred. 
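Inside the discard branch of osc_cache_writeback_range() below, the cancelled lock's page range is widened to chunk alignment before asserting that every discarded extent falls inside it. A standalone sketch of that alignment arithmetic, with the rounding helpers written out for power-of-two alignments and an assumed 64 KiB chunk size:

#include <stdio.h>

typedef unsigned long pgoff_t;

#define PAGE_SHIFT  12            /* 4 KiB pages (assumed)            */
#define CL_PAGE_EOF (~0UL)        /* stand-in for the EOF page index  */

/* power-of-two rounding, equivalent to the kernel helpers for this case */
static pgoff_t rnd_down(pgoff_t v, pgoff_t align) { return v & ~(align - 1); }
static pgoff_t rnd_up(pgoff_t v, pgoff_t align)   { return (v + align - 1) & ~(align - 1); }

int main(void)
{
	unsigned int cl_chunkbits = 16;                     /* 64 KiB chunks (assumed)  */
	unsigned int pcc_bits = cl_chunkbits - PAGE_SHIFT;  /* log2(pages per chunk)    */
	pgoff_t align_by = 1UL << pcc_bits;                 /* 16 pages per chunk       */
	pgoff_t start = 5, end = 100;                       /* cancelled lock, in pages */

	pgoff_t a_start = rnd_down(start, align_by);        /* 0   */
	pgoff_t a_end = rnd_up(end, align_by);              /* 112 */

	/* rounding up near the top of pgoff_t can wrap to 0 */
	if (end && !a_end)
		a_end = CL_PAGE_EOF;

	printf("aligned discard range [%lu, %lu]\n", a_start, a_end);
	return 0;
}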
+ */ +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard) +{ + struct osc_extent *ext; + struct list_head discard_list = LIST_HEAD_INIT(discard_list); + bool unplug = false; + int result = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_search(obj, start); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < start) + ext = next_extent(ext); + while (ext != NULL) { + if (ext->oe_start > end) + break; + + ext->oe_fsync_wait = 1; + switch (ext->oe_state) { + case OES_CACHE: + result += ext->oe_nr_pages; + if (!discard) { + struct list_head *list = NULL; + if (hp) { + EASSERT(!ext->oe_hp, ext); + ext->oe_hp = 1; + list = &obj->oo_hp_exts; + } else if (!ext->oe_urgent && !ext->oe_hp) { + ext->oe_urgent = 1; + list = &obj->oo_urgent_exts; + } + if (list != NULL) + list_move_tail(&ext->oe_link, list); + unplug = true; + } else { + struct client_obd *cli = osc_cli(obj); + int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t align_by = (1 << pcc_bits); + pgoff_t a_start = round_down(start, align_by); + pgoff_t a_end = round_up(end, align_by); + + /* overflow case */ + if (end && !a_end) + a_end = CL_PAGE_EOF; + /* the only discarder is lock cancelling, so + * [start, end], aligned by chunk size, must + * contain this extent */ + LASSERTF(ext->oe_start >= a_start && + ext->oe_end <= a_end, + "ext [%lu, %lu] reg [%lu, %lu] " + "orig [%lu %lu] align %lu bits " + "%d\n", ext->oe_start, ext->oe_end, + a_start, a_end, start, end, + align_by, pcc_bits); + osc_extent_state_set(ext, OES_LOCKING); + ext->oe_owner = current; + list_move_tail(&ext->oe_link, + &discard_list); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + break; + case OES_ACTIVE: + /* It's pretty bad to wait for ACTIVE extents, because + * we don't know how long we will wait for it to be + * flushed since it may be blocked at awaiting more + * grants. We do this for the correctness of fsync. */ + LASSERT(hp == 0 && discard == 0); + ext->oe_urgent = 1; + break; + case OES_TRUNC: + /* this extent is being truncated, can't do anything + * for it now. it will be set to urgent after truncate + * is finished in osc_cache_truncate_end(). */ + default: + break; + } + ext = next_extent(ext); + } + osc_object_unlock(obj); + + LASSERT(ergo(!discard, list_empty(&discard_list))); + if (!list_empty(&discard_list)) { + struct osc_extent *tmp; + int rc; + + osc_list_maint(osc_cli(obj), obj); + list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { + list_del_init(&ext->oe_link); + EASSERT(ext->oe_state == OES_LOCKING, ext); + + /* Discard caching pages. We don't actually write this + * extent out but we complete it as if we did. */ + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "make_ready returned %d\n", rc); + if (result >= 0) + result = rc; + } + + /* finish the extent as if the pages were sent */ + osc_extent_finish(env, ext, 0, 0); + } + } + + if (unplug) + osc_io_unplug(env, osc_cli(obj), obj); + + if (hp || discard) { + int rc; + rc = osc_cache_wait_range(env, obj, start, end); + if (result >= 0 && rc < 0) + result = rc; + } + + OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); + RETURN(result); +} +EXPORT_SYMBOL(osc_cache_writeback_range); + +/** + * Returns a list of pages by a given [start, end] of \a obj. 
+ * + * \param resched If not NULL, then we give up before hogging CPU for too + * long and set *resched = 1, in that case caller should implement a retry + * logic. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. + */ +int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata) +{ + struct osc_page *ops; + struct pagevec *pagevec; + void **pvec; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + int res = CLP_GANG_OKAY; + bool tree_lock = true; + ENTRY; + + idx = start; + pvec = osc_env_info(env)->oti_pvec; + pagevec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pagevec, 0); + spin_lock(&osc->oo_tree_lock); + while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, + idx, OTI_PVEC_SIZE)) > 0) { + struct cl_page *page; + bool end_of_region = false; + + for (i = 0, j = 0; i < nr; ++i) { + ops = pvec[i]; + pvec[i] = NULL; + + idx = osc_index(ops); + if (idx > end) { + end_of_region = true; + break; + } + + page = ops->ops_cl.cpl_page; + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_state == CPS_FREEING) + continue; + + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = ops; + } + ++idx; + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). + */ + spin_unlock(&osc->oo_tree_lock); + tree_lock = false; + + for (i = 0; i < j; ++i) { + ops = pvec[i]; + if (res == CLP_GANG_OKAY) + res = (*cb)(env, io, ops, cbdata); + + page = ops->ops_cl.cpl_page; + lu_ref_del(&page->cp_reference, "gang_lookup", current); + cl_pagevec_put(env, page, pagevec); + } + pagevec_release(pagevec); + + if (nr < OTI_PVEC_SIZE || end_of_region) + break; + + if (res == CLP_GANG_OKAY && need_resched()) + res = CLP_GANG_RESCHED; + if (res != CLP_GANG_OKAY) + break; + + spin_lock(&osc->oo_tree_lock); + tree_lock = true; + } + if (tree_lock) + spin_unlock(&osc->oo_tree_lock); + RETURN(res); +} +EXPORT_SYMBOL(osc_page_gang_lookup); + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + pgoff_t index; + + index = osc_index(ops); + if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* refresh non-overlapped index */ + tmp = osc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK); + if (tmp != NULL) { + __u64 end = tmp->l_policy_data.l_extent.end; + /* Cache the first-non-overlapped index so as to skip + * all pages within [index, oti_fn_index). This is safe + * because if tmp lock is canceled, it will discard + * these pages. 
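The caching of the first non-overlapped index described just above can be modelled in a few lines of user-space C: once a page turns out to be covered by another lock, every index up to that lock's end is skipped without another lock lookup. struct other_lock below is a hypothetical stand-in, not a kernel type.

#include <stdio.h>

typedef unsigned long pgoff_t;

struct other_lock { pgoff_t start, end; };   /* a lock that still covers pages */

int main(void)
{
	struct other_lock lk = { .start = 10, .end = 19 };
	pgoff_t fn_index = 0;     /* first index known NOT to be covered */
	pgoff_t index;

	for (index = 8; index <= 22; index++) {
		if (index < fn_index)              /* covered by lk: skip cheaply       */
			continue;
		if (index >= lk.start && index <= lk.end) {
			fn_index = lk.end + 1;     /* cache the non-overlapped index    */
			continue;                  /* lk will discard this page itself  */
		}
		printf("discard page %lu\n", index);   /* prints 8, 9, 20, 21, 22 */
	}
	return 0;
}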
*/ + info->oti_fn_index = cl_index(osc2cl(osc), end + 1); + if (end == OBD_OBJECT_EOF) + info->oti_fn_index = CL_PAGE_EOF; + LDLM_LOCK_PUT(tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + return CLP_GANG_OKAY; +} + +int osc_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_page *page = ops->ops_cl.cpl_page; + + /* page is top page. */ + info->oti_next_index = osc_index(ops) + 1; + if (cl_page_own(env, io, page) == 0) { + if (!ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(page)))) + CL_PAGE_DEBUG(D_ERROR, env, page, + "discard dirty page?\n"); + + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + + return CLP_GANG_OKAY; +} +EXPORT_SYMBOL(osc_discard_cb); + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = osc_env_thread_io(env); + osc_page_gang_cbt cb; + int res; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? osc_discard_cb : check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + do { + res = osc_page_gang_lookup(env, io, osc, + info->oti_next_index, end, cb, osc); + if (info->oti_next_index > end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + RETURN(result); +} + + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c new file mode 100644 index 0000000000000..cbddab5c0f319 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c @@ -0,0 +1,250 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device, for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* class_name2obd() */ +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +struct kmem_cache *osc_lock_kmem; +EXPORT_SYMBOL(osc_lock_kmem); +struct kmem_cache *osc_object_kmem; +EXPORT_SYMBOL(osc_object_kmem); + +struct kmem_cache *osc_thread_kmem; +struct kmem_cache *osc_session_kmem; +struct kmem_cache *osc_extent_kmem; +struct kmem_cache *osc_quota_kmem; +struct kmem_cache *osc_obdo_kmem; + +struct lu_kmem_descr osc_caches[] = { + { + .ckd_cache = &osc_lock_kmem, + .ckd_name = "osc_lock_kmem", + .ckd_size = sizeof (struct osc_lock) + }, + { + .ckd_cache = &osc_object_kmem, + .ckd_name = "osc_object_kmem", + .ckd_size = sizeof (struct osc_object) + }, + { + .ckd_cache = &osc_thread_kmem, + .ckd_name = "osc_thread_kmem", + .ckd_size = sizeof (struct osc_thread_info) + }, + { + .ckd_cache = &osc_session_kmem, + .ckd_name = "osc_session_kmem", + .ckd_size = sizeof (struct osc_session) + }, + { + .ckd_cache = &osc_extent_kmem, + .ckd_name = "osc_extent_kmem", + .ckd_size = sizeof (struct osc_extent) + }, + { + .ckd_cache = &osc_quota_kmem, + .ckd_name = "osc_quota_kmem", + .ckd_size = sizeof(struct osc_quota_info) + }, + { + .ckd_cache = &osc_obdo_kmem, + .ckd_name = "osc_obdo_kmem", + .ckd_size = sizeof(struct obdo) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Osc device and device type functions. + * + */ + +static void *osc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_thread_info *info = data; + + lu_buf_free(&info->oti_ladvise_buf); + OBD_SLAB_FREE_PTR(info, osc_thread_kmem); +} + +struct lu_context_key osc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = osc_key_init, + .lct_fini = osc_key_fini +}; +EXPORT_SYMBOL(osc_key); + +static void *osc_session_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_session_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_session *info = data; + OBD_SLAB_FREE_PTR(info, osc_session_kmem); +} + +struct lu_context_key osc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = osc_session_init, + .lct_fini = osc_session_fini +}; +EXPORT_SYMBOL(osc_session_key); + +/* type constructor/destructor: osc_type_{init,fini,start,stop}(). 
*/ +LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); + +static int osc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + ENTRY; + RETURN(osc_process_config_base(d->ld_obd, cfg)); +} + +static const struct lu_device_operations osc_lu_ops = { + .ldo_object_alloc = osc_object_alloc, + .ldo_process_config = osc_cl_process_config, + .ldo_recovery_complete = NULL +}; + +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + RETURN(0); +} +EXPORT_SYMBOL(osc_device_init); + +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return NULL; +} +EXPORT_SYMBOL(osc_device_fini); + +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osc_device *od = lu2osc_dev(d); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; +} +EXPORT_SYMBOL(osc_device_free); + +static struct lu_device *osc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &osc_lu_ops; + + /* Setup OSC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = osc_setup(obd, cfg); + if (rc) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + od->od_exp = obd->obd_self_export; + RETURN(d); +} + +static const struct lu_device_type_operations osc_device_type_ops = { + .ldto_init = osc_type_init, + .ldto_fini = osc_type_fini, + + .ldto_start = osc_type_start, + .ldto_stop = osc_type_stop, + + .ldto_device_alloc = osc_device_alloc, + .ldto_device_free = osc_device_free, + + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type osc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_OSC_NAME, + .ldt_ops = &osc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h new file mode 100644 index 0000000000000..519a4d1f4b57e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h @@ -0,0 +1,191 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef OSC_INTERNAL_H +#define OSC_INTERNAL_H + +#define OAP_MAGIC 8675309 + +#include +#include + +extern atomic_t osc_pool_req_count; +extern unsigned int osc_reqpool_maxreqcount; +extern struct ptlrpc_request_pool *osc_rq_pool; + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); +void osc_update_next_shrink(struct client_obd *cli); +int lru_queue_work(const struct lu_env *env, void *data); +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext); +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard); + +extern struct ptlrpc_request_set *PTLRPCD_SET; + +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); + +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, union ldlm_policy_data *policy, + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, + bool speculative); + +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, int unref); + +int osc_setattr_async(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_sync_base(struct osc_object *obj, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, + struct ladvise_hdr *ladvise_hdr, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd); +unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); +void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); + +extern struct lu_kmem_descr osc_caches[]; + +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); + +int osc_cleanup(struct obd_device *obd); +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); + +int osc_tunables_init(struct obd_device *obd); + +extern struct lu_device_type osc_device_type; + +static inline struct cl_io *osc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &osc_env_info(env)->oti_io; + + memset(io, 0, sizeof(*io)); + return io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +int osc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int osc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +static inline int osc_recoverable_error(int rc) +{ + return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || + rc == -EAGAIN || rc == -EINPROGRESS); +} + +static inline unsigned long 
rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_r_in_flight + cli->cl_w_in_flight; +} + +static inline char *cli_name(struct client_obd *cli) +{ + return cli->cl_import->imp_obd->obd_name; +} + +#ifndef min_t +#define min_t(type,x,y) \ + ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) +#endif + +struct osc_async_args { + struct obd_info *aa_oi; +}; + +int osc_quota_setup(struct obd_device *obd); +int osc_quota_cleanup(struct obd_device *obd); +int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], + u64 valid, u32 flags); +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +void osc_inc_unstable_pages(struct ptlrpc_request *req); +void osc_dec_unstable_pages(struct ptlrpc_request *req); +bool osc_over_unstable_soft_limit(struct client_obd *cli); +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to); + +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags flags); + +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); + +/** osc shrink list to link all osc client obd */ +extern struct list_head osc_shrink_list; +/** spin lock to protect osc_shrink_list */ +extern spinlock_t osc_shrink_lock; +extern unsigned long osc_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc); +extern unsigned long osc_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc); + +static inline void osc_set_io_portal(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + + /* Distinguish OSC from MDC here to use OST or MDS portal */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IBITS)) + req->rq_request_portal = MDS_IO_PORTAL; + else + req->rq_request_portal = OST_IO_PORTAL; +} +#endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_io.c b/drivers/staging/lustrefsx/lustre/osc/osc_io.c new file mode 100644 index 0000000000000..4a51b9912d72f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_io.c @@ -0,0 +1,1043 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for OSC layer. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * io operations. + * + */ + +static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) +{ +} + +void osc_read_ahead_release(const struct lu_env *env, void *cbdata) +{ + struct ldlm_lock *dlmlock = cbdata; + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_decref(&lockh, LCK_PR); + LDLM_LOCK_PUT(dlmlock); +} +EXPORT_SYMBOL(osc_read_ahead_release); + +static int osc_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct ldlm_lock *dlmlock; + int result = -ENODATA; + ENTRY; + + dlmlock = osc_dlmlock_at_pgoff(env, osc, start, 0); + if (dlmlock != NULL) { + LASSERT(dlmlock->l_ast_data == osc); + if (dlmlock->l_req_mode != LCK_PR) { + struct lustre_handle lockh; + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, dlmlock->l_req_mode); + } + + ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc; + ra->cra_end = cl_index(osc2cl(osc), + dlmlock->l_policy_data.l_extent.end); + ra->cra_release = osc_read_ahead_release; + ra->cra_cbdata = dlmlock; + result = 0; + } + + RETURN(result); +} + +/** + * An implementation of cl_io_operations::cio_io_submit() method for osc + * layer. Iterates over pages in the in-queue, prepares each for io by calling + * cl_page_prep() and then either submits them through osc_io_submit_page() + * or, if page is already submitted, changes osc flags through + * osc_set_async_flags(). + */ +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page *page; + struct cl_page *tmp; + struct client_obd *cli = NULL; + struct osc_object *osc = NULL; /* to keep gcc happy */ + struct osc_page *opg; + struct cl_io *io; + struct list_head list = LIST_HEAD_INIT(list); + + struct cl_page_list *qin = &queue->c2_qin; + struct cl_page_list *qout = &queue->c2_qout; + unsigned int queued = 0; + int result = 0; + int brw_flags; + unsigned int max_pages; + + LASSERT(qin->pl_nr > 0); + + CDEBUG(D_CACHE|D_READA, "%d %d\n", qin->pl_nr, crt); + + osc = cl2osc(ios->cis_obj); + cli = osc_cli(osc); + max_pages = cli->cl_max_pages_per_rpc; + + brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + brw_flags |= crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + if (crt == CRT_READ && ios->cis_io->ci_ndelay) + brw_flags |= OBD_BRW_NDELAY; + + page = cl_page_list_first(qin); + if (page->cp_type == CPT_TRANSIENT) + brw_flags |= OBD_BRW_NOCACHE; + + /* + * NOTE: here @page is a top-level page. This is done to avoid + * creation of sub-page-list. + */ + cl_page_list_for_each_safe(page, tmp, qin) { + struct osc_async_page *oap; + + /* Top level IO. 
*/ + io = page->cp_owner; + LASSERT(io != NULL); + + opg = osc_cl_page_osc(page, osc); + oap = &opg->ops_oap; + LASSERT(osc == oap->oap_obj); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + result = cl_page_prep(env, io, page, crt); + if (result != 0) { + LASSERT(result < 0); + if (result != -EALREADY) + break; + /* + * Handle -EALREADY error: for read case, the page is + * already in UPTODATE state; for write, the page + * is not dirty. + */ + result = 0; + continue; + } + + spin_lock(&oap->oap_lock); + oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); + + osc_page_submit(env, opg, crt, brw_flags); + list_add_tail(&oap->oap_pending_item, &list); + + if (page->cp_sync_io != NULL) + cl_page_list_move(qout, qin, page); + else /* async IO */ + cl_page_list_del(env, qin, page); + + if (++queued == max_pages) { + queued = 0; + result = osc_queue_sync_pages(env, io, osc, &list, + brw_flags); + if (result < 0) + break; + } + } + + if (queued > 0) + result = osc_queue_sync_pages(env, io, osc, &list, brw_flags); + + /* Update c/mtime for sync write. LU-7310 */ + if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) { + struct cl_object *obj = ios->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); + cl_object_attr_unlock(obj); + } + + CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); + return qout->pl_nr > 0 ? 0 : result; +} +EXPORT_SYMBOL(osc_io_submit); + +/** + * This is called to update the attributes when modifying a specific page, + * both when making new pages and when doing updates to existing cached pages. + * + * Expand stripe KMS if necessary. + */ +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to) +{ + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; + + ENTRY; + + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); + CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", + kms > loi->loi_kms ? 
"" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); + + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + valid = CAT_MTIME | CAT_CTIME; + if (kms > loi->loi_kms) { + attr->cat_kms = kms; + valid |= CAT_KMS; + } + if (kms > loi->loi_lvb.lvb_size) { + attr->cat_size = kms; + valid |= CAT_SIZE; + } + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + EXIT; +} + +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb) +{ + struct cl_io *io = ios->cis_io; + struct osc_io *oio = cl2osc_io(env, ios); + struct osc_object *osc = cl2osc(ios->cis_obj); + struct cl_page *page; + struct cl_page *last_page; + struct osc_page *opg; + int result = 0; + ENTRY; + + LASSERT(qin->pl_nr > 0); + + /* Handle partial page cases */ + last_page = cl_page_list_last(qin); + if (oio->oi_lockless) { + page = cl_page_list_first(qin); + if (page == last_page) { + cl_page_clip(env, page, from, to); + } else { + if (from != 0) + cl_page_clip(env, page, from, PAGE_SIZE); + if (to != PAGE_SIZE) + cl_page_clip(env, last_page, 0, to); + } + } + + while (qin->pl_nr > 0) { + struct osc_async_page *oap; + + page = cl_page_list_first(qin); + opg = osc_cl_page_osc(page, osc); + oap = &opg->ops_oap; + + LASSERTF(osc == oap->oap_obj, + "obj mismatch: %p / %p\n", osc, oap->oap_obj); + + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + /* The page may be already in dirty cache. */ + if (list_empty(&oap->oap_pending_item)) { + result = osc_page_cache_add(env, &opg->ops_cl, io); + if (result != 0) + break; + } + + osc_page_touch_at(env, osc2cl(osc), osc_index(opg), + page == last_page ? to : PAGE_SIZE); + + cl_page_list_del(env, qin, page); + + (*cb)(env, io, page); + /* Can't access page any more. Page can be in transfer and + * complete at any time. */ + } + + /* for sync write, kernel will wait for this page to be flushed before + * osc_io_end() is called, so release it earlier. + * for mkwrite(), it's known there is no further pages. */ + if (cl_io_is_sync_write(io) && oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } + + CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); + RETURN(result); +} +EXPORT_SYMBOL(osc_io_commit_async); + +static bool osc_import_not_healthy(struct obd_import *imp) +{ + return imp->imp_invalid || imp->imp_deactive || + !(imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE); +} + +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct obd_import *imp = osc_cli(osc)->cl_import; + struct osc_io *oio = osc_env_io(env); + int rc = -EIO; + ENTRY; + + spin_lock(&imp->imp_lock); + /** + * check whether this OSC device is available for non-delay read, + * fast switching mirror if we haven't tried all mirrors. 
+ */ + if (ios->cis_io->ci_type == CIT_READ && ios->cis_io->ci_ndelay && + !ios->cis_io->ci_tried_all_mirrors && osc_import_not_healthy(imp)) { + rc = -EWOULDBLOCK; + } else if (likely(!imp->imp_invalid)) { + atomic_inc(&osc->oo_nr_ios); + oio->oi_is_active = 1; + rc = 0; + } + spin_unlock(&imp->imp_lock); + + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) + oio->oi_cap_sys_resource = 1; + + RETURN(rc); +} +EXPORT_SYMBOL(osc_io_iter_init); + +int osc_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(ios->cis_obj); + unsigned long npages; + ENTRY; + + if (cl_io_is_append(io)) + RETURN(osc_io_iter_init(env, ios)); + + npages = io->u.ci_rw.crw_count >> PAGE_SHIFT; + if (io->u.ci_rw.crw_pos & ~PAGE_MASK) + ++npages; + + oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages); + + RETURN(osc_io_iter_init(env, ios)); +} +EXPORT_SYMBOL(osc_io_write_iter_init); + +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = osc_env_io(env); + + if (oio->oi_is_active) { + struct osc_object *osc = cl2osc(ios->cis_obj); + + oio->oi_is_active = 0; + LASSERT(atomic_read(&osc->oo_nr_ios) > 0); + if (atomic_dec_and_test(&osc->oo_nr_ios)) + wake_up_all(&osc->oo_io_waitq); + } +} +EXPORT_SYMBOL(osc_io_iter_fini); + +void osc_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(ios->cis_obj); + + if (oio->oi_lru_reserved > 0) { + osc_lru_unreserve(osc_cli(osc), oio->oi_lru_reserved); + oio->oi_lru_reserved = 0; + } + oio->oi_write_osclock = NULL; + + osc_io_iter_fini(env, ios); +} +EXPORT_SYMBOL(osc_io_write_iter_fini); + +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io; + struct cl_fault_io *fio; + ENTRY; + + io = ios->cis_io; + fio = &io->u.ci_fault; + CDEBUG(D_INFO, "%lu %d %zu\n", + fio->ft_index, fio->ft_writable, fio->ft_nob); + /* + * If mapping is writeable, adjust kms to cover this page, + * but do not extend kms beyond actual file size. + * See bug 10919. + */ + if (fio->ft_writable) + osc_page_touch_at(env, ios->cis_obj, + fio->ft_index, fio->ft_nob); + RETURN(0); +} +EXPORT_SYMBOL(osc_io_fault_start); + + +static int osc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +/** + * Checks that there are no pages being written in the extent being truncated. + */ +static int trunc_check_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops , void *cbdata) +{ + struct cl_page *page = ops->ops_cl.cpl_page; + struct osc_async_page *oap; + __u64 start = *(__u64 *)cbdata; + + oap = &ops->ops_oap; + if (oap->oap_cmd & OBD_BRW_WRITE && + !list_empty(&oap->oap_pending_item)) + CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n", + start, current->comm); + + if (PageLocked(page->cp_vmpage)) + CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n", + ops, osc_index(ops), oap->oap_cmd & OBD_BRW_RWMASK); + + return CLP_GANG_OKAY; +} + +static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, + struct osc_io *oio, __u64 size) +{ + struct cl_object *clob; + int partial; + pgoff_t start; + + clob = oio->oi_cl.cis_obj; + start = cl_index(clob, size); + partial = cl_offset(clob, start) < size; + + /* + * Complain if there are pages in the truncated region. 
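+ * trunc_check_cb() only logs offending pages (pending writes and
+ * locked vmpages); it never aborts the truncate itself.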
+ */ + osc_page_gang_lookup(env, io, cl2osc(clob), + start + partial, CL_PAGE_EOF, + trunc_check_cb, (void *)&size); +} + +static int osc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int result = 0; + + ENTRY; + /* truncate cache dirty pages first */ + if (cl_io_is_trunc(io)) + result = osc_cache_truncate_start(env, cl2osc(obj), size, + &oio->oi_trunc); + + if (result == 0 && oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_avalid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_xvalid & OP_XVALID_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + result = cl_object_attr_update(env, obj, attr, + cl_valid); + } + cl_object_attr_unlock(obj); + } + memset(oa, 0, sizeof(*oa)); + if (result == 0) { + oa->o_oi = loi->loi_oi; + obdo_set_parent_fid(oa, io->u.ci_setattr.sa_parent_fid); + oa->o_stripe_idx = io->u.ci_setattr.sa_stripe_index; + oa->o_layout = io->u.ci_setattr.sa_layout; + oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP | + OBD_MD_FLOSTLAYOUT; + if (ia_avalid & ATTR_CTIME) { + oa->o_valid |= OBD_MD_FLCTIME; + oa->o_ctime = attr->cat_ctime; + } + if (ia_avalid & ATTR_ATIME) { + oa->o_valid |= OBD_MD_FLATIME; + oa->o_atime = attr->cat_atime; + } + if (ia_avalid & ATTR_MTIME) { + oa->o_valid |= OBD_MD_FLMTIME; + oa->o_mtime = attr->cat_mtime; + } + if (ia_avalid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + if (io->ci_layout_version > 0) { + /* verify layout version */ + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + oa->o_layout_version = io->ci_layout_version; + } + } else { + LASSERT(oio->oi_lockless == 0); + } + + if (ia_xvalid & OP_XVALID_FLAGS) { + oa->o_flags = io->u.ci_setattr.sa_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + init_completion(&cbargs->opc_sync); + + if (ia_avalid & ATTR_SIZE) + result = osc_punch_send(osc_export(cl2osc(obj)), + oa, osc_async_upcall, cbargs); + else + result = osc_setattr_async(osc_export(cl2osc(obj)), + oa, osc_async_upcall, + cbargs, PTLRPCD_SET); + + cbargs->opc_rpc_sent = result == 0; + } + + RETURN(result); +} + +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + unsigned int cl_valid = 0; + int result = 0; + + if 
(cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = io->ci_result = cbargs->opc_rc; + } + + if (result == 0) { + if (oio->oi_lockless) { + /* lockless truncate */ + struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + + LASSERT(cl_io_is_trunc(io)); + /* XXX: Need a lock. */ + osd->od_stats.os_lockless_truncates++; + } + } + + if (cl_io_is_trunc(io)) { + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); + osc_trunc_check(env, io, oio, size); + osc_cache_truncate_end(env, oio->oi_trunc); + oio->oi_trunc = NULL; + } +} +EXPORT_SYMBOL(osc_io_setattr_end); + +struct osc_data_version_args { + struct osc_io *dva_oio; +}; + +static int +osc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_data_version_args *dva = arg; + struct osc_io *oio = dva->dva_oio; + const struct ost_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, &oio->oi_oa, + &body->oa); + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + + return 0; +} + +static int osc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct lov_oinfo *loi = obj->oo_oinfo; + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_data_version_args *dva; + int rc; + + ENTRY; + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + oa->o_flags |= OBD_FL_FLUSH; + } + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_data_version_interpret; + CLASSERT(sizeof(*dva) <= sizeof(req->rq_async_args)); + dva = ptlrpc_req_async_args(req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void osc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else { + slice->cis_io->ci_result = 0; + if (!(oio->oi_oa.o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oio->oi_oa.o_valid & 
OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oio->oi_oa.o_layout_version; + if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oio->oi_oa.o_data_version; + } + + EXIT; +} + +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + ENTRY; + + if (!slice->cis_io->ci_noatime) { + cl_object_attr_lock(obj); + attr->cat_atime = ktime_get_real_seconds(); + rc = cl_object_attr_update(env, obj, attr, CAT_ATIME); + cl_object_attr_unlock(obj); + } + + RETURN(rc); +} +EXPORT_SYMBOL(osc_io_read_start); + +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1); + cl_object_attr_lock(obj); + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); + cl_object_attr_unlock(obj); + + RETURN(rc); +} +EXPORT_SYMBOL(osc_io_write_start); + +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) +{ + struct osc_io *oio = osc_env_io(env); + struct obdo *oa = &oio->oi_oa; + struct lov_oinfo *loi = obj->oo_oinfo; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int rc = 0; + ENTRY; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + /* reload size abd blocks for start and end of sync range */ + oa->o_size = fio->fi_start; + oa->o_blocks = fio->fi_end; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + obdo_set_parent_fid(oa, fio->fi_fid); + + init_completion(&cbargs->opc_sync); + + rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET); + RETURN(rc); +} +EXPORT_SYMBOL(osc_fsync_ost); + +int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + ENTRY; + + if (fio->fi_end == OBD_OBJECT_EOF) + end = CL_PAGE_EOF; + + result = osc_cache_writeback_range(env, osc, start, end, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + /* we have to wait for writeback to finish before we can + * send OST_SYNC RPC. This is bad because it causes extents + * to be written osc by osc. However, we usually start + * writeback before CL_FSYNC_ALL so this won't have any real + * problem. 
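+ * The OST_SYNC RPC itself is sent asynchronously by osc_fsync_ost()
+ * and its result is collected in osc_io_fsync_end().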
*/ + rc = osc_cache_wait_range(env, osc, start, end); + if (result == 0) + result = rc; + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_mode == CL_FSYNC_LOCAL) { + result = osc_cache_wait_range(env, cl2osc(obj), start, end); + } else if (fio->fi_mode == CL_FSYNC_ALL) { + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + wait_for_completion(&cbargs->opc_sync); + if (result == 0) + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} +EXPORT_SYMBOL(osc_io_fsync_end); + +static int osc_io_ladvise_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + int result = 0; + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_ladvise_io *lio = &io->u.ci_ladvise; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct lu_ladvise *ladvise; + struct ladvise_hdr *ladvise_hdr; + int buf_size; + int num_advise = 1; + ENTRY; + + /* TODO: add multiple ladvise support in CLIO */ + buf_size = offsetof(typeof(*ladvise_hdr), lah_advise[num_advise]); + if (osc_env_info(env)->oti_ladvise_buf.lb_len < buf_size) + lu_buf_realloc(&osc_env_info(env)->oti_ladvise_buf, buf_size); + + ladvise_hdr = osc_env_info(env)->oti_ladvise_buf.lb_buf; + if (ladvise_hdr == NULL) + RETURN(-ENOMEM); + + memset(ladvise_hdr, 0, buf_size); + ladvise_hdr->lah_magic = LADVISE_MAGIC; + ladvise_hdr->lah_count = num_advise; + ladvise_hdr->lah_flags = lio->li_flags; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID; + obdo_set_parent_fid(oa, lio->li_fid); + + ladvise = ladvise_hdr->lah_advise; + ladvise->lla_start = lio->li_start; + ladvise->lla_end = lio->li_end; + ladvise->lla_advice = lio->li_advice; + + if (lio->li_flags & LF_ASYNC) { + result = osc_ladvise_base(osc_export(cl2osc(obj)), oa, + ladvise_hdr, NULL, NULL, NULL); + } else { + init_completion(&cbargs->opc_sync); + result = osc_ladvise_base(osc_export(cl2osc(obj)), oa, + ladvise_hdr, osc_async_upcall, + cbargs, PTLRPCD_SET); + cbargs->opc_rpc_sent = result == 0; + } + RETURN(result); +} + +static void osc_io_ladvise_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int result = 0; + struct cl_ladvise_io *lio = &io->u.ci_ladvise; + + if ((!(lio->li_flags & LF_ASYNC)) && cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} + +void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + + if (oio->oi_active) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} +EXPORT_SYMBOL(osc_io_end); + +static const struct cl_io_operations osc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_read_start, + .cio_fini = osc_io_fini + }, + [CIT_WRITE] = { + 
.cio_iter_init = osc_io_write_iter_init, + .cio_iter_fini = osc_io_write_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_setattr_start, + .cio_end = osc_io_setattr_end + }, + [CIT_DATA_VERSION] = { + .cio_start = osc_io_data_version_start, + .cio_end = osc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_FSYNC] = { + .cio_start = osc_io_fsync_start, + .cio_end = osc_io_fsync_end, + .cio_fini = osc_io_fini + }, + [CIT_LADVISE] = { + .cio_start = osc_io_ladvise_start, + .cio_end = osc_io_ladvise_end, + .cio_fini = osc_io_fini + }, + [CIT_MISC] = { + .cio_fini = osc_io_fini + } + }, + .cio_read_ahead = osc_io_read_ahead, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async +}; + +/***************************************************************************** + * + * Transfer operations. + * + */ + +int osc_io_init(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + return 0; +} + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c new file mode 100644 index 0000000000000..dd956fd8532b2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c @@ -0,0 +1,1306 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* fid_build_reg_res_name() */ +#include +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +/** + * Returns a weak pointer to the ldlm lock identified by a handle. Returned + * pointer cannot be dereferenced, as lock is not protected from concurrent + * reclaim. This function is a helper for osc_lock_invariant(). + */ +static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(handle); + if (lock != NULL) + LDLM_LOCK_PUT(lock); + return lock; +} + +/** + * Invariant that has to be true all of the time. 
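+ * The checks below tie ols_handle, ols_dlmlock and ols_state together,
+ * e.g. a lock in OLS_GRANTED state must hold a granted dlm lock.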
+ */ +static int osc_lock_invariant(struct osc_lock *ols) +{ + struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); + struct ldlm_lock *olock = ols->ols_dlmlock; + int handle_used = lustre_handle_is_used(&ols->ols_handle); + + if (ergo(osc_lock_is_lockless(ols), + ols->ols_locklessable && ols->ols_dlmlock == NULL)) + return 1; + + /* + * If all the following "ergo"s are true, return 1, otherwise 0 + */ + if (! ergo(olock != NULL, handle_used)) + return 0; + + if (! ergo(olock != NULL, + olock->l_handle.h_cookie == ols->ols_handle.cookie)) + return 0; + + if (! ergo(handle_used, + ergo(lock != NULL && olock != NULL, lock == olock) && + ergo(lock == NULL, olock == NULL))) + return 0; + /* + * Check that ->ols_handle and ->ols_dlmlock are consistent, but + * take into account that they are set at the different time. + */ + if (! ergo(ols->ols_state == OLS_CANCELLED, + olock == NULL && !handle_used)) + return 0; + /* + * DLM lock is destroyed only after we have seen cancellation + * ast. + */ + if (! ergo(olock != NULL && ols->ols_state < OLS_CANCELLED, + !ldlm_is_destroyed(olock))) + return 0; + + if (! ergo(ols->ols_state == OLS_GRANTED, + olock != NULL && + ldlm_is_granted(olock) && + ols->ols_hold)) + return 0; + return 1; +} + +/***************************************************************************** + * + * Lock operations. + * + */ + +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + LASSERT(ols->ols_dlmlock == NULL); + + OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); +} +EXPORT_SYMBOL(osc_lock_fini); + +static void osc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + union ldlm_policy_data *policy) +{ + const struct cl_lock_descr *d = &lock->cll_descr; + + osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); + policy->l_extent.gid = d->cld_gid; +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. Copy of osc_update_enqueue() + * logic. + * + * Called under lock and resource spin-locks. + */ +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid, setkms = 0; + + ENTRY; + + valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = dlmlock->l_lvb_data; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + + LASSERT(lvb == dlmlock->l_lvb_data); + size = lvb->lvb_size; + + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! */ + if (size > dlmlock->l_policy_data.l_extent.end) + size = dlmlock->l_policy_data.l_extent.end + 1; + if (size >= oinfo->loi_kms) { + valid |= CAT_KMS; + attr->cat_kms = size; + setkms = 1; + } + ldlm_lock_allow_match_locked(dlmlock); + } + + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? 
dlmlock->l_policy_data.l_extent.end : -1ull); + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + EXIT; +} + +static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh) +{ + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (ldlm_is_granted(dlmlock)) { + struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, ext->start); + descr->cld_end = cl_index(descr->cld_obj, ext->end); + descr->cld_gid = ext->gid; + + /* no lvb update for matched lock */ + if (!ldlm_is_lvb_cached(dlmlock)) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + LASSERT(osc == dlmlock->l_ast_data); + osc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); + } + LINVRNT(osc_lock_invariant(oscl)); + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. + */ +static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + if (rc == 0) + osc_lock_granted(env, oscl, lockh); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops != oscl->ols_lockless_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(env, oscl, 1); + oscl->ols_state = OLS_GRANTED; + rc = 0; + } else if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + osc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. 
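+ * No dlm lock is attached in this case; the LVB kept in ols_lvb is
+ * enough to refresh the object attributes.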
*/ + rc = 0; + } else if (rc < 0 && oscl->ols_flags & LDLM_FL_NDELAY) { + rc = -EWOULDBLOCK; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +static int osc_lock_upcall_speculative(void *cookie, + struct lustre_handle *lockh, + int errcode) +{ + struct osc_object *osc = cookie; + struct ldlm_lock *dlmlock; + struct lu_env *env; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + + if (errcode == ELDLM_LOCK_MATCHED) + GOTO(out, errcode = ELDLM_OK); + + if (errcode != ELDLM_OK) + GOTO(out, errcode); + + dlmlock = ldlm_handle2lock(lockh); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + LASSERT(ldlm_is_granted(dlmlock)); + + /* there is no osc_lock associated with speculative locks + * thus no need to set LDLM_FL_LVB_CACHED */ + osc_lock_lvb_update(env, osc, dlmlock, NULL); + + unlock_res_and_lock(dlmlock); + LDLM_LOCK_PUT(dlmlock); + +out: + cl_object_put(env, osc2cl(osc)); + cl_env_put(env, &refcheck); + RETURN(ldlm_error2errno(errcode)); +} + +static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, + enum cl_lock_mode mode, bool discard) +{ + struct lu_env *env; + __u16 refcheck; + int rc = 0; + int rc2 = 0; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (mode == CLM_WRITE) { + rc = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, rc, + discard ? "discarded" : "written back"); + if (rc > 0) + rc = 0; + } + + /* + * Do not try to match other locks with CLM_WRITE since we already + * know there're none + */ + rc2 = osc_lock_discard_pages(env, obj, start, end, + mode == CLM_WRITE || discard); + if (rc == 0 && rc2 < 0) + rc = rc2; + + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int osc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + void *data, int flag) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + ENTRY; + + LASSERT(flag == LDLM_CB_CANCELING); + + lock_res_and_lock(dlmlock); + if (!ldlm_is_granted(dlmlock)) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + dlmlock->l_ast_data = NULL; + + cl_object_get(obj); + } + + unlock_res_and_lock(dlmlock); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct ldlm_extent *extent = &dlmlock->l_policy_data.l_extent; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + __u64 old_kms; + + /* Destroy pages covered by the extent of the DLM lock */ + result = osc_lock_flush(cl2osc(obj), + cl_index(obj, extent->start), + cl_index(obj, extent->end), + mode, discard); + + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + cl_object_attr_lock(obj); + /* Must get the value under the lock to avoid race. */ + old_kms = cl2osc(obj)->oo_oinfo->loi_kms; + /* Update the kms. Need to loop all granted locks. 
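+ * ldlm_extent_shift_kms() recomputes the cover from the remaining
+ * granted extent locks while this lock goes away.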
+ * Not a problem for the client */ + attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); + + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + + cl_object_put(env, obj); + } + RETURN(result); +} + +/** + * Blocking ast invoked by ldlm when dlm lock is either blocking progress of + * some other lock, or is canceled. This function is installed as a + * ldlm_lock::l_blocking_ast() for client extent locks. + * + * Control flow is tricky, because ldlm uses the same call-back + * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. + * + * \param dlmlock lock for which ast occurred. + * + * \param new description of a conflicting lock in case of blocking ast. + * + * \param data value of dlmlock->l_ast_data + * + * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish + * cancellation and blocking ast's. + * + * Possible use cases: + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel + * lock due to lock lru pressure, or explicit user request to purge + * locks. + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify + * us that dlmlock conflicts with another lock that some client is + * enqueuing. Lock is canceled. + * + * - cl_lock_cancel() is called. osc_lock_cancel() calls + * ldlm_cli_cancel() that calls + * + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + * recursively entering osc_ldlm_blocking_ast(). + * + * - client cancels lock voluntary (e.g., as a part of early cancellation): + * + * cl_lock_cancel()-> + * osc_lock_cancel()-> + * ldlm_cli_cancel()-> + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + */ +static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, + int flag) +{ + int result = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + result = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (result == -ENODATA) + result = 0; + break; + } + case LDLM_CB_CANCELING: { + struct lu_env *env; + __u16 refcheck; + + /* + * This can be called in the context of outer IO, e.g., + * + * osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer + * context. 
+ */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + result = PTR_ERR(env); + break; + } + + result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); + cl_env_put(env, &refcheck); + break; + } + default: + LBUG(); + } + RETURN(result); +} + +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + struct ptlrpc_request *req = data; + struct lu_env *env; + struct ost_lvb *lvb; + struct req_capsule *cap; + struct cl_object *obj = NULL; + struct ldlm_resource *res = dlmlock->l_resource; + struct ldlm_match_data matchdata = { 0 }; + union ldlm_policy_data policy; + enum ldlm_mode mode = LCK_PW | LCK_GROUP | LCK_PR; + int result; + __u16 refcheck; + + ENTRY; + + LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, result = PTR_ERR(env)); + + policy.l_extent.start = 0; + policy.l_extent.end = LUSTRE_EOF; + + matchdata.lmd_mode = &mode; + matchdata.lmd_policy = &policy; + matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING; + matchdata.lmd_unref = 1; + matchdata.lmd_has_ast_data = true; + + LDLM_LOCK_GET(dlmlock); + + /* If any dlmlock has l_ast_data set, we must find it or we risk + * missing a size update done under a different lock. + */ + while (dlmlock) { + lock_res_and_lock(dlmlock); + if (dlmlock->l_ast_data) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + LDLM_LOCK_RELEASE(dlmlock); + + dlmlock = NULL; + + if (obj == NULL && res->lr_type == LDLM_EXTENT) { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA)) + break; + + lock_res(res); + dlmlock = search_itree(res, &matchdata); + unlock_res(res); + } + } + + if (obj != NULL) { + /* Do not grab the mutex of cl_lock for glimpse. + * See LU-1274 for details. + * BTW, it's okay for cl_lock to be cancelled during + * this period because server can handle this race. + * See ldlm_server_glimpse_ast() for details. 
+ * cl_lock_mutex_get(env, lock); */ + cap = &req->rq_pill; + req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); + req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, + sizeof *lvb); + result = req_capsule_server_pack(cap); + if (result == 0) { + lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); + result = cl_object_glimpse(env, obj, lvb); + } + if (!exp_connect_lvb_type(req->rq_export)) + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, + sizeof(struct ost_lvb_v1), RCL_SERVER); + cl_object_put(env, obj); + } else { + /* + * These errors are normal races, so we don't want to + * fill the console with messages by calling + * ptlrpc_error() + */ + lustre_pack_reply(req, 1, NULL, NULL); + result = -ELDLM_NO_LOCK_DATA; + } + cl_env_put(env, &refcheck); + EXIT; + +out: + req->rq_status = result; + RETURN(result); +} +EXPORT_SYMBOL(osc_ldlm_glimpse_ast); + +static int weigh_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct cl_page *page = ops->ops_cl.cpl_page; + + if (cl_page_is_vmlocked(env, page) || PageDirty(page->cp_vmpage) || + PageWriteback(page->cp_vmpage)) + return CLP_GANG_ABORT; + + *(pgoff_t *)cbdata = osc_index(ops) + 1; + return CLP_GANG_OKAY; +} + +static unsigned long osc_lock_weight(const struct lu_env *env, + struct osc_object *oscobj, + loff_t start, loff_t end) +{ + struct cl_io *io = osc_env_thread_io(env); + struct cl_object *obj = cl_object_top(&oscobj->oo_cl); + pgoff_t page_index; + int result; + + ENTRY; + + io->ci_obj = obj; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + RETURN(result); + + page_index = cl_index(obj, start); + do { + result = osc_page_gang_lookup(env, io, oscobj, + page_index, cl_index(obj, end), + weigh_cb, (void *)&page_index); + if (result == CLP_GANG_ABORT) + break; + if (result == CLP_GANG_RESCHED) + cond_resched(); + } while (result != CLP_GANG_OKAY); + cl_io_fini(env, io); + + return result == CLP_GANG_ABORT ? 1 : 0; +} + +/** + * Get the weight of dlm lock for early cancellation. + */ +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) +{ + struct lu_env *env; + struct osc_object *obj; + struct osc_lock *oscl; + unsigned long weight; + bool found = false; + __u16 refcheck; + + ENTRY; + + might_sleep(); + /* + * osc_ldlm_weigh_ast has a complex context since it might be called + * because of lock canceling, or from user's input. We have to make + * a new environment for it. Probably it is implementation safe to use + * the upper context because cl_lock_put don't modify environment + * variables. But just in case .. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + /* Mostly because lack of memory, do not eliminate this lock */ + RETURN(1); + + LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT || + dlmlock->l_resource->lr_type == LDLM_IBITS); + + lock_res_and_lock(dlmlock); + obj = dlmlock->l_ast_data; + if (obj) + cl_object_get(osc2cl(obj)); + unlock_res_and_lock(dlmlock); + + if (obj == NULL) + GOTO(out, weight = 1); + + spin_lock(&obj->oo_ol_spin); + list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) { + if (oscl->ols_dlmlock == dlmlock) { + found = true; + break; + } + } + spin_unlock(&obj->oo_ol_spin); + if (found) { + /* + * If the lock is being used by an IO, definitely not cancel it. 
+ */ + GOTO(out, weight = 1); + } + + if (dlmlock->l_resource->lr_type == LDLM_EXTENT) + weight = osc_lock_weight(env, obj, + dlmlock->l_policy_data.l_extent.start, + dlmlock->l_policy_data.l_extent.end); + else if (ldlm_has_dom(dlmlock)) + weight = osc_lock_weight(env, obj, 0, OBD_OBJECT_EOF); + /* The DOM bit can be cancelled at any time; in that case, we know + * there are no pages, so just return weight of 0 + */ + else + weight = 0; + + EXIT; + +out: + if (obj) + cl_object_put(env, osc2cl(obj)); + + cl_env_put(env, &refcheck); + return weight; +} +EXPORT_SYMBOL(osc_ldlm_weigh_ast); + +static void osc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *lock, + struct osc_object *osc, + struct ldlm_enqueue_info *einfo) +{ + einfo->ei_type = LDLM_EXTENT; + einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); + einfo->ei_cb_bl = osc_ldlm_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = osc_ldlm_glimpse_ast; + einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ +} + +/** + * Determine if the lock should be converted into a lockless lock. + * + * Steps to check: + * - if the lock has an explicite requirment for a non-lockless lock; + * - if the io lock request type ci_lockreq; + * - send the enqueue rpc to ost to make the further decision; + * - special treat to truncate lockless lock + * + * Additional policy can be implemented here, e.g., never do lockless-io + * for large extents. + */ +void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) +{ + struct cl_lock_slice *slice = &ols->ols_cl; + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = ols->ols_lockless_ops; + } else { + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & + OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (ols->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (cl_io_is_trunc(io) && osd->od_lockless_truncate && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK))) { + ols->ols_locklessable = 1; + slice->cls_ops = ols->ols_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); +} +EXPORT_SYMBOL(osc_lock_to_lockless); + +static bool osc_lock_compatible(const struct osc_lock *qing, + const struct osc_lock *qed) +{ + struct cl_lock_descr *qed_descr = &qed->ols_cl.cls_lock->cll_descr; + struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr; + + if (qed->ols_glimpse || qed->ols_speculative) + return true; + + if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ) + return true; + + if (qed->ols_state < OLS_GRANTED) + return true; + + if (qed_descr->cld_mode >= qing_descr->cld_mode && + qed_descr->cld_start <= qing_descr->cld_start && + qed_descr->cld_end >= qing_descr->cld_end) + return true; + + return false; +} + +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl) +{ + 
spin_lock(&osc->oo_ol_spin); + list_del_init(&oscl->ols_nextlock_oscobj); + spin_unlock(&osc->oo_ol_spin); + + spin_lock(&oscl->ols_lock); + while (!list_empty(&oscl->ols_waiting_list)) { + struct osc_lock *scan; + + scan = list_entry(oscl->ols_waiting_list.next, struct osc_lock, + ols_wait_entry); + list_del_init(&scan->ols_wait_entry); + + cl_sync_io_note(env, scan->ols_owner, 0); + } + spin_unlock(&oscl->ols_lock); +} +EXPORT_SYMBOL(osc_lock_wake_waiters); + +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl) +{ + struct osc_lock *tmp_oscl; + struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; + struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; + int rc = 0; + + ENTRY; + + spin_lock(&obj->oo_ol_spin); + list_add_tail(&oscl->ols_nextlock_oscobj, &obj->oo_ol_list); + +restart: + list_for_each_entry(tmp_oscl, &obj->oo_ol_list, + ols_nextlock_oscobj) { + struct cl_lock_descr *descr; + + if (tmp_oscl == oscl) + break; + + descr = &tmp_oscl->ols_cl.cls_lock->cll_descr; + if (descr->cld_start > need->cld_end || + descr->cld_end < need->cld_start) + continue; + + /* We're not supposed to give up group lock */ + if (descr->cld_mode == CLM_GROUP) + break; + + if (!osc_lock_is_lockless(oscl) && + osc_lock_compatible(oscl, tmp_oscl)) + continue; + + /* wait for conflicting lock to be canceled */ + cl_sync_io_init(waiter, 1, cl_sync_io_end); + oscl->ols_owner = waiter; + + spin_lock(&tmp_oscl->ols_lock); + /* add oscl into tmp's ols_waiting list */ + list_add_tail(&oscl->ols_wait_entry, + &tmp_oscl->ols_waiting_list); + spin_unlock(&tmp_oscl->ols_lock); + + spin_unlock(&obj->oo_ol_spin); + rc = cl_sync_io_wait(env, waiter, 0); + spin_lock(&obj->oo_ol_spin); + + if (rc < 0) + break; + + oscl->ols_owner = NULL; + goto restart; + } + spin_unlock(&obj->oo_ol_spin); + + RETURN(rc); +} +EXPORT_SYMBOL(osc_lock_enqueue_wait); + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. 
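+ *
+ * Glimpse and speculative requests are sent fully asynchronously; for the
+ * speculative case the upcall is osc_lock_upcall_speculative() and the
+ * cookie is the osc_object, since no osc_lock exists for it.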
+ */ +static int osc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct obd_export *exp = osc_export(osc); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = osc_lock_upcall; + void *cookie = oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + if ((oscl->ols_flags & LDLM_FL_NO_EXPANSION) && + !(exp_connect_lockahead_old(exp) || exp_connect_lockahead(exp))) { + result = -EOPNOTSUPP; + CERROR("%s: server does not support lockahead/locknoexpand:" + "rc = %d\n", exp->exp_obd->obd_name, result); + RETURN(result); + } + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + /* For glimpse and/or speculative locks, do not wait for reply from + * server on LDLM request */ + if (oscl->ols_glimpse || oscl->ols_speculative) { + /* Speculative and glimpse locks do not have an anchor */ + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. */ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * if glimpse or speculative lock, async of osc_enqueue_base() + * must be true + * + * For non-speculative locks: + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. + * For speculative locks: + * osc_lock_upcall_speculative & cookie is the osc object, since + * there is no osc_lock + */ + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); + osc_lock_build_policy(env, lock, policy); + if (oscl->ols_speculative) { + oscl->ols_einfo.ei_cbdata = NULL; + /* hold a reference for callback */ + cl_object_get(osc2cl(osc)); + upcall = osc_lock_upcall_speculative; + cookie = osc; + } + result = osc_enqueue_base(exp, resname, &oscl->ols_flags, + policy, &oscl->ols_lvb, + upcall, cookie, + &oscl->ols_einfo, PTLRPCD_SET, async, + oscl->ols_speculative); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } else if (oscl->ols_speculative) { + cl_object_put(env, osc2cl(osc)); + if (oscl->ols_glimpse) { + /* hide error for AGL request */ + result = 0; + } + } + +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +/** + * Breaks a link between osc_lock and dlm_lock. 
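+ * Drops the mode reference taken for a non-glimpse lock (ols_hold) and
+ * the long reference that pins the dlm lock for this osc_lock.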
+ */ +static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) +{ + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = olck->ols_dlmlock; + if (dlmlock == NULL) + RETURN_EXIT; + + if (olck->ols_hold) { + olck->ols_hold = 0; + ldlm_lock_decref(&olck->ols_handle, olck->ols_einfo.ei_mode); + olck->ols_handle.cookie = 0ULL; + } + + olck->ols_dlmlock = NULL; + + /* release a reference taken in osc_lock_upcall(). */ + LASSERT(olck->ols_has_ref); + lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); + LDLM_LOCK_RELEASE(dlmlock); + olck->ols_has_ref = 0; + + EXIT; +} + +/** + * Implements cl_lock_operations::clo_cancel() method for osc layer. This is + * called (as part of cl_lock_cancel()) when lock is canceled either voluntary + * (LRU pressure, early cancellation, umount, etc.) or due to the conflict + * with some other lock some where in the cluster. This function does the + * following: + * + * - invalidates all pages protected by this lock (after sending dirty + * ones to the server, as necessary); + * + * - decref's underlying ldlm lock; + * + * - cancels ldlm lock (ldlm_cli_cancel()). + */ +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_object *obj = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + + ENTRY; + + LINVRNT(osc_lock_invariant(oscl)); + + osc_lock_detach(env, oscl); + oscl->ols_state = OLS_CANCELLED; + oscl->ols_flags &= ~LDLM_FL_LVB_READY; + + osc_lock_wake_waiters(env, obj, oscl); + EXIT; +} +EXPORT_SYMBOL(osc_lock_cancel); + +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + (*p)(env, cookie, "%p %#llx %#llx %d %p ", + lock->ols_dlmlock, lock->ols_flags, lock->ols_handle.cookie, + lock->ols_state, lock->ols_owner); + osc_lvb_print(env, cookie, p, &lock->ols_lvb); + return 0; +} +EXPORT_SYMBOL(osc_lock_print); + +static const struct cl_lock_operations osc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_cancel = osc_lock_cancel, + .clo_print = osc_lock_print, +}; + +static void osc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; + int result; + + LASSERT(ols->ols_dlmlock == NULL); + result = osc_lock_flush(osc, descr->cld_start, descr->cld_end, + descr->cld_mode, false); + if (result) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, result); + + osc_lock_wake_waiters(env, osc, ols); +} + +static const struct cl_lock_operations osc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_cancel = osc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl) +{ + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + pgoff_t io_start; + pgoff_t io_end; + + if (!cl_object_same(io->ci_obj, obj)) + return; + + if (likely(io->ci_type == CIT_WRITE)) { + io_start = cl_index(obj, io->u.ci_rw.crw_pos); + io_end = cl_index(obj, io->u.ci_rw.crw_pos + + io->u.ci_rw.crw_count - 1); + } else { + LASSERT(cl_io_is_mkwrite(io)); + io_start = io_end = io->u.ci_fault.ft_index; + } + + if (descr->cld_mode >= CLM_WRITE && + (cl_io_is_append(io) || + (descr->cld_start <= 
io_start && descr->cld_end >= io_end))) { + struct osc_io *oio = osc_env_io(env); + + /* There must be only one lock to match the write region */ + LASSERT(oio->oi_write_osclock == NULL); + oio->oi_write_osclock = oscl; + } +} +EXPORT_SYMBOL(osc_lock_set_writer); + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io) +{ + struct osc_lock *oscl; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + + OBD_SLAB_ALLOC_PTR_GFP(oscl, osc_lock_kmem, GFP_NOFS); + if (oscl == NULL) + return -ENOMEM; + + oscl->ols_state = OLS_NEW; + spin_lock_init(&oscl->ols_lock); + INIT_LIST_HEAD(&oscl->ols_waiting_list); + INIT_LIST_HEAD(&oscl->ols_wait_entry); + INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); + oscl->ols_lockless_ops = &osc_lock_lockless_ops; + + /* Speculative lock requests must be either no_expand or glimpse + * request (CEF_GLIMPSE). non-glimpse no_expand speculative extent + * locks will break ofd_intent_cb. (see comment there)*/ + LASSERT(ergo((enqflags & CEF_SPECULATIVE) != 0, + (enqflags & (CEF_LOCK_NO_EXPAND | CEF_GLIMPSE)) != 0)); + + oscl->ols_flags = osc_enq2ldlm_flags(enqflags); + oscl->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + + if (oscl->ols_flags & LDLM_FL_HAS_INTENT) { + oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; + oscl->ols_glimpse = 1; + } + if (io->ci_ndelay && cl_object_same(io->ci_obj, obj)) + oscl->ols_flags |= LDLM_FL_NDELAY; + osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); + + cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); + + if (!(enqflags & CEF_MUST)) + /* try to convert this lock to a lockless lock */ + osc_lock_to_lockless(env, oscl, (enqflags & CEF_NEVER)); + if (oscl->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) + oscl->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, oscl); + + LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %#llx", + lock, oscl, oscl->ols_flags); + + return 0; +} + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. 
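+ * The match is retried if the found lock is cancelled between
+ * osc_match_base() and ldlm_handle2lock().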
+ */ +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + + ENTRY; + + ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); + osc_index2policy(policy, osc2cl(obj), index, index); + policy->l_extent.gid = LDLM_GID_ANY; + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + /* + * It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too + */ +again: + mode = osc_match_base(env, osc_export(obj), resname, LDLM_EXTENT, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, dap_flags & OSC_DAP_FL_CANCELING); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_object.c b/drivers/staging/lustrefsx/lustre/osc/osc_object.c new file mode 100644 index 0000000000000..a99747cecf011 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_object.c @@ -0,0 +1,499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC +#include + +#include "osc_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Object operations. 
+ * + */ +static void osc_obj_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); +} + +static const struct osc_object_operations osc_object_ops = { + .oto_build_res_name = osc_obj_build_res_name, + .oto_dlmlock_at_pgoff = osc_obj_dlmlock_at_pgoff, +}; + +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + + osc->oo_oinfo = cconf->u.coc_oinfo; +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + mutex_init(&osc->oo_debug_mutex); +#endif + INIT_LIST_HEAD(&osc->oo_ready_item); + INIT_LIST_HEAD(&osc->oo_hp_ready_item); + INIT_LIST_HEAD(&osc->oo_write_item); + INIT_LIST_HEAD(&osc->oo_read_item); + + osc->oo_root.rb_node = NULL; + INIT_LIST_HEAD(&osc->oo_hp_exts); + INIT_LIST_HEAD(&osc->oo_urgent_exts); + INIT_LIST_HEAD(&osc->oo_full_exts); + INIT_LIST_HEAD(&osc->oo_reading_exts); + atomic_set(&osc->oo_nr_reads, 0); + atomic_set(&osc->oo_nr_writes, 0); + spin_lock_init(&osc->oo_lock); + spin_lock_init(&osc->oo_tree_lock); + spin_lock_init(&osc->oo_ol_spin); + INIT_LIST_HEAD(&osc->oo_ol_list); + + atomic_set(&osc->oo_nr_ios, 0); + init_waitqueue_head(&osc->oo_io_waitq); + + LASSERT(osc->oo_obj_ops != NULL); + + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); + + return 0; +} +EXPORT_SYMBOL(osc_object_init); + +void osc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + + LASSERT(list_empty(&osc->oo_ready_item)); + LASSERT(list_empty(&osc->oo_hp_ready_item)); + LASSERT(list_empty(&osc->oo_write_item)); + LASSERT(list_empty(&osc->oo_read_item)); + + LASSERT(osc->oo_root.rb_node == NULL); + LASSERT(list_empty(&osc->oo_hp_exts)); + LASSERT(list_empty(&osc->oo_urgent_exts)); + LASSERT(list_empty(&osc->oo_full_exts)); + LASSERT(list_empty(&osc->oo_reading_exts)); + LASSERT(atomic_read(&osc->oo_nr_reads) == 0); + LASSERT(atomic_read(&osc->oo_nr_writes) == 0); + LASSERT(list_empty(&osc->oo_ol_list)); + LASSERT(atomic_read(&osc->oo_nr_ios) == 0); + + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(osc, osc_object_kmem); +} +EXPORT_SYMBOL(osc_object_free); + +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb) +{ + return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu " + "ctime: %llu blocks: %llu", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); +} +EXPORT_SYMBOL(osc_lvb_print); + +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; + + (*p)(env, cookie, "id: "DOSTID" " + "idx: %d gen: %d kms_valid: %u kms %llu " + "rc: %d force_sync: %d min_xid: %llu ", + POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, + oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, + ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); + osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); + return 0; +} +EXPORT_SYMBOL(osc_object_print); + + +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0;
+ return 0;
+}
+EXPORT_SYMBOL(osc_attr_get);
+
+int osc_attr_update(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid)
+{
+ struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+ struct ost_lvb *lvb = &oinfo->loi_lvb;
+
+ if (valid & CAT_SIZE)
+ lvb->lvb_size = attr->cat_size;
+ if (valid & CAT_MTIME)
+ lvb->lvb_mtime = attr->cat_mtime;
+ if (valid & CAT_ATIME)
+ lvb->lvb_atime = attr->cat_atime;
+ if (valid & CAT_CTIME)
+ lvb->lvb_ctime = attr->cat_ctime;
+ if (valid & CAT_BLOCKS)
+ lvb->lvb_blocks = attr->cat_blocks;
+ if (valid & CAT_KMS) {
+ CDEBUG(D_CACHE, "set kms from %llu to %llu\n",
+ oinfo->loi_kms, (__u64)attr->cat_kms);
+ loi_kms_set(oinfo, attr->cat_kms);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(osc_attr_update);
+
+int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj,
+ struct ost_lvb *lvb)
+{
+ struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+ lvb->lvb_size = oinfo->loi_kms;
+ lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
+ return 0;
+}
+EXPORT_SYMBOL(osc_object_glimpse);
+
+static int osc_object_ast_clear(struct ldlm_lock *lock, void *data)
+{
+ struct osc_object *osc = (struct osc_object *)data;
+ struct ost_lvb *lvb = lock->l_lvb_data;
+ struct lov_oinfo *oinfo;
+ ENTRY;
+
+ if (lock->l_ast_data == data) {
+ lock->l_ast_data = NULL;
+
+ LASSERT(osc != NULL);
+ LASSERT(osc->oo_oinfo != NULL);
+ LASSERT(lvb != NULL);
+
+ /* Update the lvb in the lock from the cached oinfo */
+ oinfo = osc->oo_oinfo;
+
+ LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: "
+ "%llu %llu %llu by oinfo size %llu blocks %llu "
+ "[cma]time %llu %llu %llu", lvb->lvb_size,
+ lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime,
+ lvb->lvb_atime, oinfo->loi_lvb.lvb_size,
+ oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime,
+ oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime);
+ LASSERT(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms);
+
+ cl_object_attr_lock(&osc->oo_cl);
+ memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb));
+ cl_object_attr_unlock(&osc->oo_cl);
+ ldlm_clear_lvb_cached(lock);
+ }
+ RETURN(LDLM_ITER_CONTINUE);
+}
+
+int osc_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+ struct osc_object *osc = cl2osc(obj);
+ struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname;
+
+ /* DLM locks don't hold a reference on the osc_object, so we have to
+ * clear it before the object is destroyed.
*/ + osc_build_res_name(osc, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + osc_object_ast_clear, osc); + return 0; +} +EXPORT_SYMBOL(osc_object_prune); + +static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen) +{ + struct obd_export *exp = osc_export(cl2osc(obj)); + struct ldlm_res_id resid; + union ldlm_policy_data policy; + struct lustre_handle lockh; + enum ldlm_mode mode = LCK_MINMODE; + struct ptlrpc_request *req; + struct fiemap *reply; + char *tmp; + int rc; + ENTRY; + + fmkey->lfik_oa.o_oi = cl2osc(obj)->oo_oinfo->loi_oi; + if (!(fmkey->lfik_fiemap.fm_flags & FIEMAP_FLAG_SYNC)) + goto skip_locking; + + policy.l_extent.start = fmkey->lfik_fiemap.fm_start & PAGE_MASK; + + if (OBD_OBJECT_EOF - fmkey->lfik_fiemap.fm_length <= + fmkey->lfik_fiemap.fm_start + PAGE_SIZE - 1) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = (fmkey->lfik_fiemap.fm_start + + fmkey->lfik_fiemap.fm_length + + PAGE_SIZE - 1) & PAGE_MASK; + + ostid_build_res_name(&fmkey->lfik_oa.o_oi, &resid); + mode = ldlm_lock_match(exp->exp_obd->obd_namespace, + LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY, + &resid, LDLM_EXTENT, &policy, + LCK_PR | LCK_PW, &lockh, 0); + if (mode) { /* lock is cached on client */ + if (mode != LCK_PR) { + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, LCK_PW); + } + } else { /* no cached lock, needs acquire lock on server side */ + fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS; + fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK; + } + +skip_locking: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_FIEMAP); + if (req == NULL) + GOTO(drop_lock, rc = -ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT, + sizeof(*fmkey)); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_CLIENT, + *buflen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_SERVER, + *buflen); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc != 0) { + ptlrpc_request_free(req); + GOTO(drop_lock, rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); + memcpy(tmp, fmkey, sizeof(*fmkey)); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); + memcpy(tmp, fiemap, *buflen); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc != 0) + GOTO(fini_req, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); + if (reply == NULL) + GOTO(fini_req, rc = -EPROTO); + + memcpy(fiemap, reply, *buflen); +fini_req: + ptlrpc_req_finished(req); +drop_lock: + if (mode) + ldlm_lock_decref(&lockh, LCK_PR); + RETURN(rc); +} + +int osc_object_is_contended(struct osc_object *obj) +{ + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + time64_t osc_contention_time = dev->od_contention_time; + ktime_t retry_time; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) + return 1; + + if (!obj->oo_contended) + return 0; + + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. + */ + retry_time = ktime_add_ns(obj->oo_contention_time, + osc_contention_time * NSEC_PER_SEC); + if (ktime_after(ktime_get(), retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; +} +EXPORT_SYMBOL(osc_object_is_contended); + +/** + * Implementation of struct cl_object_operations::coo_req_attr_set() for osc + * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. 
+ */ +static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct lov_oinfo *oinfo; + struct obdo *oa; + struct ost_lvb *lvb; + u64 flags = attr->cra_flags; + + oinfo = cl2osc(obj)->oo_oinfo; + lvb = &oinfo->loi_lvb; + oa = attr->cra_oa; + + if ((flags & OBD_MD_FLMTIME) != 0) { + oa->o_mtime = lvb->lvb_mtime; + oa->o_valid |= OBD_MD_FLMTIME; + } + if ((flags & OBD_MD_FLATIME) != 0) { + oa->o_atime = lvb->lvb_atime; + oa->o_valid |= OBD_MD_FLATIME; + } + if ((flags & OBD_MD_FLCTIME) != 0) { + oa->o_ctime = lvb->lvb_ctime; + oa->o_valid |= OBD_MD_FLCTIME; + } + if (flags & OBD_MD_FLGROUP) { + ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLGROUP; + } + if (flags & OBD_MD_FLID) { + int rc; + + rc = ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); + if (rc) { + CERROR("Bad %llu to set " DOSTID " : rc %d\n", + (unsigned long long)ostid_id(&oinfo->loi_oi), + POSTID(&oa->o_oi), rc); + } + oa->o_valid |= OBD_MD_FLID; + } + if (flags & OBD_MD_FLHANDLE) { + struct ldlm_lock *lock; + struct osc_page *opg; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); + lock = osc_dlmlock_at_pgoff(env, cl2osc(obj), osc_index(opg), + OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING); + if (lock == NULL && !opg->ops_srvlock) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + + resname = &osc_env_info(env)->oti_resname; + ostid_build_res_name(&oinfo->loi_oi, resname); + res = ldlm_resource_get( + osc_export(cl2osc(obj))->exp_obd->obd_namespace, + NULL, resname, LDLM_EXTENT, 0); + ldlm_resource_dump(D_ERROR, res); + + libcfs_debug_dumpstack(NULL); + LBUG(); + } + + /* check for lockless io. */ + if (lock != NULL) { + oa->o_handle = lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + LDLM_LOCK_PUT(lock); + } + } +} + +static const struct cl_object_operations osc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = osc_lock_init, + .coo_io_init = osc_io_init, + .coo_attr_get = osc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_prune = osc_object_prune, + .coo_fiemap = osc_object_fiemap, + .coo_req_attr_set = osc_req_attr_set +}; + +static const struct lu_object_operations osc_lu_obj_ops = { + .loo_object_init = osc_object_init, + .loo_object_release = NULL, + .loo_object_free = osc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &osc_ops; + obj->lo_ops = &osc_lu_obj_ops; + osc->oo_obj_ops = &osc_object_ops; + } else + obj = NULL; + return obj; +} + +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) +{ + struct l_wait_info lwi = { 0 }; + ENTRY; + + CDEBUG(D_INODE, "Invalidate osc object: %p, # of active IOs: %d\n", + osc, atomic_read(&osc->oo_nr_ios)); + + l_wait_event(osc->oo_io_waitq, atomic_read(&osc->oo_nr_ios) == 0, &lwi); + + /* Discard all dirty pages of this object. */ + osc_cache_truncate_start(env, osc, 0, NULL); + + /* Discard all caching pages */ + osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, true); + + /* Clear ast data of dlm lock. 
Do this after discarding all pages */ + osc_object_prune(env, osc2cl(osc)); + + RETURN(0); +} +EXPORT_SYMBOL(osc_object_invalidate); +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_page.c b/drivers/staging/lustrefsx/lustre/osc/osc_page.c new file mode 100644 index 0000000000000..a37c185772a00 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_page.c @@ -0,0 +1,1157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC +#include + +#include "osc_internal.h" + +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); +static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); +static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, + struct osc_page *opg); + +/** \addtogroup osc + * @{ + */ + +/* + * Page operations. + */ +static void osc_page_transfer_get(struct osc_page *opg, const char *label) +{ + struct cl_page *page = opg->ops_cl.cpl_page; + + LASSERT(!opg->ops_transfer_pinned); + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, label, page); + opg->ops_transfer_pinned = 1; +} + +static void osc_page_transfer_put(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_page *page = opg->ops_cl.cpl_page; + + if (opg->ops_transfer_pinned) { + opg->ops_transfer_pinned = 0; + lu_ref_del(&page->cp_reference, "transfer", page); + cl_page_put(env, page); + } +} + +/** + * This is called once for every page when it is submitted for a transfer + * either opportunistic (osc_page_cache_add()), or immediate + * (osc_page_submit()). + */ +static void osc_page_transfer_add(const struct lu_env *env, + struct osc_page *opg, enum cl_req_type crt) +{ + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + /* ops_lru and ops_inflight share the same field, so take it from LRU + * first and then use it as inflight. 
*/ + osc_lru_use(osc_cli(obj), opg); +} + +int osc_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int result; + ENTRY; + + osc_page_transfer_get(opg, "transfer\0cache"); + result = osc_queue_async_io(env, io, opg); + if (result != 0) + osc_page_transfer_put(env, opg); + else + osc_page_transfer_add(env, opg, CRT_WRITE); + + RETURN(result); +} + +void osc_index2policy(union ldlm_policy_data *policy, + const struct cl_object *obj, pgoff_t start, pgoff_t end) +{ + memset(policy, 0, sizeof *policy); + policy->l_extent.start = cl_offset(obj, start); + policy->l_extent.end = cl_offset(obj, end + 1) - 1; +} + +static const char *osc_list(struct list_head *head) +{ + return list_empty(head) ? "-" : "+"; +} + +static inline s64 osc_submit_duration(struct osc_page *opg) +{ + if (ktime_to_ns(opg->ops_submit_time) == 0) + return 0; + + return ktime_ms_delta(ktime_get(), opg->ops_submit_time); +} + +static int osc_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = cl2osc(slice->cpl_obj); + struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; + + return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p %lu: " + "1< %#x %d %u %s %s > " + "2< %lld %u %u %#x %#x | %p %p %p > " + "3< %d %lld %d > " + "4< %d %d %d %lu %c | %s %s %s %s > " + "5< %s %s %s %s | %d %s | %d %s %s>\n", + opg, osc_index(opg), + /* 1 */ + oap->oap_magic, oap->oap_cmd, + oap->oap_interrupted, + osc_list(&oap->oap_pending_item), + osc_list(&oap->oap_rpc_item), + /* 2 */ + oap->oap_obj_off, oap->oap_page_off, oap->oap_count, + oap->oap_async_flags, oap->oap_brw_flags, + oap->oap_request, oap->oap_cli, obj, + /* 3 */ + opg->ops_transfer_pinned, + osc_submit_duration(opg), opg->ops_srvlock, + /* 4 */ + cli->cl_r_in_flight, cli->cl_w_in_flight, + cli->cl_max_rpcs_in_flight, + cli->cl_avail_grant, + waitqueue_active(&cli->cl_cache_waiters) ? 
'+' : '-', + osc_list(&cli->cl_loi_ready_list), + osc_list(&cli->cl_loi_hp_ready_list), + osc_list(&cli->cl_loi_write_list), + osc_list(&cli->cl_loi_read_list), + /* 5 */ + osc_list(&obj->oo_ready_item), + osc_list(&obj->oo_hp_ready_item), + osc_list(&obj->oo_write_item), + osc_list(&obj->oo_read_item), + atomic_read(&obj->oo_nr_reads), + osc_list(&obj->oo_reading_exts), + atomic_read(&obj->oo_nr_writes), + osc_list(&obj->oo_hp_exts), + osc_list(&obj->oo_urgent_exts)); +} + +static void osc_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + int rc; + + ENTRY; + CDEBUG(D_TRACE, "%p\n", opg); + osc_page_transfer_put(env, opg); + rc = osc_teardown_async_page(env, obj, opg); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, + "Trying to teardown failed: %d\n", rc); + LASSERT(0); + } + + osc_lru_del(osc_cli(obj), opg); + + if (slice->cpl_page->cp_type == CPT_CACHEABLE) { + void *value = NULL; + + spin_lock(&obj->oo_tree_lock); + if (opg->ops_intree) { + value = radix_tree_delete(&obj->oo_tree, + osc_index(opg)); + if (value != NULL) { + --obj->oo_npages; + opg->ops_intree = 0; + } + } + spin_unlock(&obj->oo_tree_lock); + + LASSERT(ergo(value != NULL, value == opg)); + } + + EXIT; +} + +static void osc_page_clip(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + opg->ops_from = from; + opg->ops_to = to; + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); +} + +static int osc_page_cancel(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + + /* Check if the transferring against this page + * is completed, or not even queued. */ + if (opg->ops_transfer_pinned) + /* FIXME: may not be interrupted.. 
*/ + rc = osc_cancel_async_page(env, opg); + LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); + return rc; +} + +static int osc_page_flush(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + ENTRY; + rc = osc_flush_async_page(env, io, opg); + RETURN(rc); +} + +static void osc_page_touch(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct cl_object *obj = opg->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, osc_index(opg), to); +} + +static const struct cl_page_operations osc_page_ops = { + .cpo_print = osc_page_print, + .cpo_delete = osc_page_delete, + .cpo_clip = osc_page_clip, + .cpo_cancel = osc_page_cancel, + .cpo_flush = osc_page_flush, + .cpo_page_touch = osc_page_touch, +}; + +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct osc_object *osc = cl2osc(obj); + struct osc_page *opg = cl_object_page_slice(obj, page); + struct osc_io *oio = osc_env_io(env); + int result; + + opg->ops_from = 0; + opg->ops_to = PAGE_SIZE; + + INIT_LIST_HEAD(&opg->ops_lru); + + result = osc_prep_async_page(osc, opg, page->cp_vmpage, + cl_offset(obj, index)); + if (result != 0) + return result; + + opg->ops_srvlock = osc_io_srvlock(oio); + cl_page_slice_add(page, &opg->ops_cl, obj, index, + &osc_page_ops); + + + /* reserve an LRU space for this page */ + if (page->cp_type == CPT_CACHEABLE) { + result = osc_lru_alloc(env, osc_cli(osc), opg); + if (result == 0) { + result = radix_tree_preload(GFP_NOFS); + if (result == 0) { + spin_lock(&osc->oo_tree_lock); + result = radix_tree_insert(&osc->oo_tree, + index, opg); + if (result == 0) { + ++osc->oo_npages; + opg->ops_intree = 1; + } + spin_unlock(&osc->oo_tree_lock); + + radix_tree_preload_end(); + } + } + } + + return result; +} +EXPORT_SYMBOL(osc_page_init); + +/** + * Helper function called by osc_io_submit() for every page in an immediate + * transfer (i.e., transferred synchronously). + */ +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_async_page *oap = &opg->ops_oap; + + LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " + "magic 0x%x\n", oap, oap->oap_magic); + LASSERT(oap->oap_async_flags & ASYNC_READY); + LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); + + oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + oap->oap_page_off = opg->ops_from; + oap->oap_count = opg->ops_to - opg->ops_from; + oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; + + if (oio->oi_cap_sys_resource) { + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + oap->oap_cmd |= OBD_BRW_NOQUOTA; + } + + opg->ops_submit_time = ktime_get(); + osc_page_transfer_get(opg, "transfer\0imm"); + osc_page_transfer_add(env, opg, crt); +} + +/* --------------- LRU page management ------------------ */ + +/* OSC is a natural place to manage LRU pages as applications are specialized + * to write OSC by OSC. Ideally, if one OSC is used more frequently it should + * occupy more LRU slots. On the other hand, we should avoid using up all LRU + * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep + * for free LRU slots - this will be very bad so the algorithm requires each + * OSC to free slots voluntarily to maintain a reasonable number of free slots + * at any time. 
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq);
+
+/**
+ * LRU pages are freed in batch mode. OSC should at least free this
+ * number of pages to avoid running out of LRU slots.
+ */
+static inline int lru_shrink_min(struct client_obd *cli)
+{
+ return cli->cl_max_pages_per_rpc * 2;
+}
+
+/**
+ * Free this number at most, otherwise it will take too long to finish.
+ */
+static inline int lru_shrink_max(struct client_obd *cli)
+{
+ return cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight;
+}
+
+/**
+ * Check if we can free LRU slots from this OSC. If there are LRU waiters,
+ * we should free slots aggressively. In this way, slots are freed at a
+ * steady rate to maintain fairness among OSCs.
+ *
+ * Return how many LRU pages should be freed.
+ */
+static int osc_cache_too_much(struct client_obd *cli)
+{
+ struct cl_client_cache *cache = cli->cl_cache;
+ long pages = atomic_long_read(&cli->cl_lru_in_list);
+ unsigned long budget;
+
+ LASSERT(cache != NULL);
+ budget = cache->ccc_lru_max / (atomic_read(&cache->ccc_users) - 2);
+
+ /* if it's going to run out of LRU slots, we should free some, but not
+ * too much to maintain fairness among OSCs. */
+ if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 2) {
+ if (pages >= budget)
+ return lru_shrink_max(cli);
+ else if (pages >= budget / 2)
+ return lru_shrink_min(cli);
+ } else {
+ time64_t duration = ktime_get_real_seconds();
+ long timediff;
+
+ /* knock out pages by duration of no IO activity */
+ duration -= cli->cl_lru_last_used;
+ /*
+ * The difference shouldn't be more than 70 years
+ * so we can safely cast to a long. Round to
+ * approximately 1 minute.
+ */
+ timediff = (long)(duration >> 6);
+ if (timediff > 0 && pages >= budget / timediff)
+ return lru_shrink_min(cli);
+ }
+ return 0;
+}
+
+int lru_queue_work(const struct lu_env *env, void *data)
+{
+ struct client_obd *cli = data;
+ int count;
+
+ CDEBUG(D_CACHE, "%s: run LRU work for client obd\n", cli_name(cli));
+ count = osc_cache_too_much(cli);
+ if (count > 0) {
+ int rc = osc_lru_shrink(env, cli, count, false);
+
+ CDEBUG(D_CACHE, "%s: shrank %d/%d pages from client obd\n",
+ cli_name(cli), rc, count);
+ if (rc >= count) {
+ CDEBUG(D_CACHE, "%s: queue again\n", cli_name(cli));
+ ptlrpcd_queue_work(cli->cl_lru_work);
+ }
+ }
+
+ RETURN(0);
+}
+
+void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist)
+{
+ struct list_head lru = LIST_HEAD_INIT(lru);
+ struct osc_async_page *oap;
+ long npages = 0;
+
+ list_for_each_entry(oap, plist, oap_pending_item) {
+ struct osc_page *opg = oap2osc_page(oap);
+
+ if (!opg->ops_in_lru)
+ continue;
+
+ ++npages;
+ LASSERT(list_empty(&opg->ops_lru));
+ list_add(&opg->ops_lru, &lru);
+ }
+
+ if (npages > 0) {
+ spin_lock(&cli->cl_lru_list_lock);
+ list_splice_tail(&lru, &cli->cl_lru_list);
+ atomic_long_sub(npages, &cli->cl_lru_busy);
+ atomic_long_add(npages, &cli->cl_lru_in_list);
+ cli->cl_lru_last_used = ktime_get_real_seconds();
+ spin_unlock(&cli->cl_lru_list_lock);
+
+ if (waitqueue_active(&osc_lru_waitq))
+ (void)ptlrpcd_queue_work(cli->cl_lru_work);
+ }
+}
+
+static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
+{
+ LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0);
+ list_del_init(&opg->ops_lru);
+ atomic_long_dec(&cli->cl_lru_in_list);
+}
+
+/**
+ * Page is being destroyed. The page may not be in the LRU list if the
+ * transfer has never finished (an error occurred).
+ */ +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg) +{ + if (opg->ops_in_lru) { + spin_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + __osc_lru_del(cli, opg); + } else { + LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0); + atomic_long_dec(&cli->cl_lru_busy); + } + spin_unlock(&cli->cl_lru_list_lock); + + atomic_long_inc(cli->cl_lru_left); + /* this is a great place to release more LRU pages if + * this osc occupies too many LRU pages and kernel is + * stealing one of them. */ + if (osc_cache_too_much(cli)) { + CDEBUG(D_CACHE, "%s: queue LRU work\n", cli_name(cli)); + (void)ptlrpcd_queue_work(cli->cl_lru_work); + } + wake_up(&osc_lru_waitq); + } else { + LASSERT(list_empty(&opg->ops_lru)); + } +} + +/** + * Delete page from LRU list for redirty. + */ +static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) +{ + /* If page is being transferred for the first time, + * ops_lru should be empty */ + if (opg->ops_in_lru) { + spin_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + __osc_lru_del(cli, opg); + atomic_long_inc(&cli->cl_lru_busy); + } + spin_unlock(&cli->cl_lru_list_lock); + } +} + +static void discard_pagevec(const struct lu_env *env, struct cl_io *io, + struct cl_page **pvec, int max_index) +{ + struct pagevec *pagevec = &osc_env_info(env)->oti_pagevec; + int i; + + ll_pagevec_init(pagevec, 0); + for (i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; + + LASSERT(cl_page_is_owned(page, io)); + cl_page_delete(env, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + cl_pagevec_put(env, page, pagevec); + + pvec[i] = NULL; + } + pagevec_release(pagevec); +} + +/** + * Check if a cl_page can be released, i.e, it's not being used. + * + * If unstable account is turned on, bulk transfer may hold one refcount + * for recovery so we need to check vmpage refcount as well; otherwise, + * even we can destroy cl_page but the corresponding vmpage can't be reused. + */ +static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page) +{ + if (cl_page_in_use_noref(page)) + return true; + + if (cli->cl_cache->ccc_unstable_check) { + struct page *vmpage = cl_page_vmpage(page); + + /* vmpage have two known users: cl_page and VM page cache */ + if (page_count(vmpage) - page_mapcount(vmpage) > 2) + return true; + } + return false; +} + +/** + * Drop @target of pages from LRU at most. 
+ */ +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force) +{ + struct cl_io *io; + struct cl_object *clobj = NULL; + struct cl_page **pvec; + struct osc_page *opg; + long count = 0; + int maxscan = 0; + int index = 0; + int rc = 0; + ENTRY; + + LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0); + if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0) + RETURN(0); + + CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n", + cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force); + if (!force) { + if (atomic_read(&cli->cl_lru_shrinkers) > 0) + RETURN(-EBUSY); + + if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) { + atomic_dec(&cli->cl_lru_shrinkers); + RETURN(-EBUSY); + } + } else { + atomic_inc(&cli->cl_lru_shrinkers); + } + + pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; + io = osc_env_thread_io(env); + + spin_lock(&cli->cl_lru_list_lock); + if (force) + cli->cl_lru_reclaim++; + maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list)); + while (!list_empty(&cli->cl_lru_list)) { + struct cl_page *page; + bool will_free = false; + + if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1) + break; + + if (--maxscan < 0) + break; + + opg = list_entry(cli->cl_lru_list.next, struct osc_page, + ops_lru); + page = opg->ops_cl.cpl_page; + if (lru_page_busy(cli, page)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + LASSERT(page->cp_obj != NULL); + if (clobj != page->cp_obj) { + struct cl_object *tmp = page->cp_obj; + + cl_object_get(tmp); + spin_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + discard_pagevec(env, io, pvec, index); + index = 0; + + cl_io_fini(env, io); + cl_object_put(env, clobj); + clobj = NULL; + } + + clobj = tmp; + io->ci_obj = clobj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, clobj); + + spin_lock(&cli->cl_lru_list_lock); + + if (rc != 0) + break; + + ++maxscan; + continue; + } + + if (cl_page_own_try(env, io, page) == 0) { + if (!lru_page_busy(cli, page)) { + /* remove it from lru list earlier to avoid + * lock contention */ + __osc_lru_del(cli, opg); + opg->ops_in_lru = 0; /* will be discarded */ + + cl_page_get(page); + will_free = true; + } else { + cl_page_disown(env, io, page); + } + } + + if (!will_free) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + /* Don't discard and free the page with cl_lru_list held */ + pvec[index++] = page; + if (unlikely(index == OTI_PVEC_SIZE)) { + spin_unlock(&cli->cl_lru_list_lock); + discard_pagevec(env, io, pvec, index); + index = 0; + + spin_lock(&cli->cl_lru_list_lock); + } + + if (++count >= target) + break; + } + spin_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + discard_pagevec(env, io, pvec, index); + + cl_io_fini(env, io); + cl_object_put(env, clobj); + } + + atomic_dec(&cli->cl_lru_shrinkers); + if (count > 0) { + atomic_long_add(count, cli->cl_lru_left); + wake_up_all(&osc_lru_waitq); + } + RETURN(count > 0 ? count : rc); +} +EXPORT_SYMBOL(osc_lru_shrink); + +/** + * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least + * \@npages of LRU slots. For performance consideration, it's better to drop + * LRU pages in batch. Therefore, the actual number is adjusted at least + * max_pages_per_rpc. 
+ */ +static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages) +{ + struct lu_env *env; + struct cl_client_cache *cache = cli->cl_cache; + int max_scans; + __u16 refcheck; + long rc = 0; + ENTRY; + + LASSERT(cache != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(rc); + + npages = max_t(int, npages, cli->cl_max_pages_per_rpc); + CDEBUG(D_CACHE, "%s: start to reclaim %ld pages from LRU\n", + cli_name(cli), npages); + rc = osc_lru_shrink(env, cli, npages, true); + if (rc >= npages) { + CDEBUG(D_CACHE, "%s: reclaimed %ld/%ld pages from LRU\n", + cli_name(cli), rc, npages); + if (osc_cache_too_much(cli) > 0) + ptlrpcd_queue_work(cli->cl_lru_work); + GOTO(out, rc); + } else if (rc > 0) { + npages -= rc; + } + + CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n", + cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list), + atomic_long_read(&cli->cl_lru_busy), npages); + + /* Reclaim LRU slots from other client_obd as it can't free enough + * from its own. This should rarely happen. */ + spin_lock(&cache->ccc_lru_lock); + LASSERT(!list_empty(&cache->ccc_lru)); + + cache->ccc_lru_shrinkers++; + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + + max_scans = atomic_read(&cache->ccc_users) - 2; + while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) { + cli = list_entry(cache->ccc_lru.next, struct client_obd, + cl_lru_osc); + + CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n", + cli_name(cli), cli, + atomic_long_read(&cli->cl_lru_in_list), + atomic_long_read(&cli->cl_lru_busy)); + + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + if (osc_cache_too_much(cli) > 0) { + spin_unlock(&cache->ccc_lru_lock); + + rc = osc_lru_shrink(env, cli, npages, true); + spin_lock(&cache->ccc_lru_lock); + if (rc >= npages) + break; + if (rc > 0) + npages -= rc; + } + } + spin_unlock(&cache->ccc_lru_lock); + +out: + cl_env_put(env, &refcheck); + CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n", + cli_name(cli), cli, rc); + return rc; +} + +/** + * osc_lru_alloc() is called to allocate an LRU slot for a cl_page. + * + * Usually the LRU slots are reserved in osc_io_iter_rw_init(). + * Only in the case that the LRU slots are in extreme shortage, it should + * have reserved enough slots for an IO. + */ +static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, + struct osc_page *opg) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct osc_io *oio = osc_env_io(env); + int rc = 0; + + ENTRY; + + if (cli->cl_cache == NULL) /* shall not be in LRU */ + RETURN(0); + + if (oio->oi_lru_reserved > 0) { + --oio->oi_lru_reserved; + goto out; + } + + LASSERT(atomic_long_read(cli->cl_lru_left) >= 0); + while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) { + /* run out of LRU spaces, try to drop some by itself */ + rc = osc_lru_reclaim(cli, 1); + if (rc < 0) + break; + if (rc > 0) + continue; + + cond_resched(); + rc = l_wait_event(osc_lru_waitq, + atomic_long_read(cli->cl_lru_left) > 0, + &lwi); + if (rc < 0) + break; + } + +out: + if (rc >= 0) { + atomic_long_inc(&cli->cl_lru_busy); + opg->ops_in_lru = 1; + rc = 0; + } + + RETURN(rc); +} + +/** + * osc_lru_reserve() is called to reserve enough LRU slots for I/O. + * + * The benefit of doing this is to reduce contention against atomic counter + * cl_lru_left by changing it from per-page access to per-IO access. 
+ */ +unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages) +{ + unsigned long reserved = 0; + unsigned long max_pages; + unsigned long c; + + /* reserve a full RPC window at most to avoid that a thread accidentally + * consumes too many LRU slots */ + max_pages = cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; + if (npages > max_pages) + npages = max_pages; + + c = atomic_long_read(cli->cl_lru_left); + if (c < npages && osc_lru_reclaim(cli, npages) > 0) + c = atomic_long_read(cli->cl_lru_left); + while (c >= npages) { + if (c == atomic_long_cmpxchg(cli->cl_lru_left, c, c - npages)) { + reserved = npages; + break; + } + c = atomic_long_read(cli->cl_lru_left); + } + if (atomic_long_read(cli->cl_lru_left) < max_pages) { + /* If there aren't enough pages in the per-OSC LRU then + * wake up the LRU thread to try and clear out space, so + * we don't block if pages are being dirtied quickly. */ + CDEBUG(D_CACHE, "%s: queue LRU, left: %lu/%ld.\n", + cli_name(cli), atomic_long_read(cli->cl_lru_left), + max_pages); + (void)ptlrpcd_queue_work(cli->cl_lru_work); + } + + return reserved; +} + +/** + * osc_lru_unreserve() is called to unreserve LRU slots. + * + * LRU slots reserved by osc_lru_reserve() may have entries left due to several + * reasons such as page already existing or I/O error. Those reserved slots + * should be freed by calling this function. + */ +void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) +{ + atomic_long_add(npages, cli->cl_lru_left); + wake_up_all(&osc_lru_waitq); +} + +/** + * Atomic operations are expensive. We accumulate the accounting for the + * same page zone to get better performance. + * In practice this can work pretty good because the pages in the same RPC + * are likely from the same page zone. + */ +#ifdef HAVE_NR_UNSTABLE_NFS +/* Old kernels use a separate counter for unstable pages, + * newer kernels treat them like any other writeback. + */ +#define NR_WRITEBACK NR_UNSTABLE_NFS +#endif + +static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa, + int factor) +{ + int page_count; + void *zone = NULL; + int count = 0; + int i; + + if (desc != NULL) { + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + page_count = desc->bd_iov_count; + } else { + page_count = aa->aa_page_count; + } + + for (i = 0; i < page_count; i++) { + void *pz; + if (desc) + pz = page_zone(BD_GET_KIOV(desc, i).kiov_page); + else + pz = page_zone(aa->aa_ppga[i]->pg); + + if (likely(pz == zone)) { + ++count; + continue; + } + + if (count > 0) { + mod_zone_page_state(zone, NR_WRITEBACK, + factor * count); + count = 0; + } + zone = pz; + ++count; + } + if (count > 0) + mod_zone_page_state(zone, NR_WRITEBACK, factor * count); +} + +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) +{ + unstable_page_accounting(desc, aa, 1); +} + +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) +{ + unstable_page_accounting(desc, aa, -1); +} + +/** + * Performs "unstable" page accounting. This function balances the + * increment operations performed in osc_inc_unstable_pages. It is + * registered as the RPC request callback, and is executed when the + * bulk RPC is committed on the server. Thus at this point, the pages + * involved in the bulk transfer are no longer considered unstable. 
+ * + * If this function is called, the request should have been committed + * or req:rq_unstable must have been set; it implies that the unstable + * statistic have been added. + */ +void osc_dec_unstable_pages(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int page_count; + long unstable_count; + + if (desc) + page_count = desc->bd_iov_count; + else + page_count = aa->aa_page_count; + + LASSERT(page_count >= 0); + + dec_unstable_page_accounting(desc, aa); + + unstable_count = atomic_long_sub_return(page_count, + &cli->cl_unstable_count); + LASSERT(unstable_count >= 0); + + unstable_count = atomic_long_sub_return(page_count, + &cli->cl_cache->ccc_unstable_nr); + LASSERT(unstable_count >= 0); + if (unstable_count == 0) + wake_up_all(&cli->cl_cache->ccc_unstable_waitq); + + if (waitqueue_active(&osc_lru_waitq)) + (void)ptlrpcd_queue_work(cli->cl_lru_work); +} + +/** + * "unstable" page accounting. See: osc_dec_unstable_pages. + */ +void osc_inc_unstable_pages(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + long page_count; + + /* No unstable page tracking */ + if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) + return; + + if (desc) + page_count = desc->bd_iov_count; + else + page_count = aa->aa_page_count; + + add_unstable_page_accounting(desc, aa); + atomic_long_add(page_count, &cli->cl_unstable_count); + atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); + + /* If the request has already been committed (i.e. brw_commit + * called via rq_commit_cb), we need to undo the unstable page + * increments we just performed because rq_commit_cb wont be + * called again. */ + spin_lock(&req->rq_lock); + if (unlikely(req->rq_committed)) { + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_unstable = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Check if it piggybacks SOFT_SYNC flag to OST from this OSC. + * This function will be called by every BRW RPC so it's critical + * to make this function fast. + */ +bool osc_over_unstable_soft_limit(struct client_obd *cli) +{ + long unstable_nr, osc_unstable_count; + + /* Can't check cli->cl_unstable_count, therefore, no soft limit */ + if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) + return false; + + osc_unstable_count = atomic_long_read(&cli->cl_unstable_count); + unstable_nr = atomic_long_read(&cli->cl_cache->ccc_unstable_nr); + + CDEBUG(D_CACHE, + "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n", + cli_name(cli), cli, unstable_nr, osc_unstable_count); + + /* If the LRU slots are in shortage - 25% remaining AND this OSC + * has one full RPC window of unstable pages, it's a good chance + * to piggyback a SOFT_SYNC flag. + * Please notice that the OST won't take immediate response for the + * SOFT_SYNC request so active OSCs will have more chance to carry + * the flag, this is reasonable. 
*/ + return unstable_nr > cli->cl_cache->ccc_lru_max >> 2 && + osc_unstable_count > cli->cl_max_pages_per_rpc * + cli->cl_max_rpcs_in_flight; +} + +/** + * Return how many LRU pages in the cache of all OSC devices + * + * \retval return # of cached LRU pages times reclaimation tendency + * \retval SHRINK_STOP if it cannot do any scanning in this time + */ +unsigned long osc_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + struct client_obd *cli; + unsigned long cached = 0; + + spin_lock(&osc_shrink_lock); + list_for_each_entry(cli, &osc_shrink_list, cl_shrink_list) + cached += atomic_long_read(&cli->cl_lru_in_list); + spin_unlock(&osc_shrink_lock); + + return (cached * sysctl_vfs_cache_pressure) / 100; +} + +/** + * Scan and try to reclaim sc->nr_to_scan cached LRU pages + * + * \retval number of cached LRU pages reclaimed + * \retval SHRINK_STOP if it cannot do any scanning in this time + * + * Linux kernel will loop calling this shrinker scan routine with + * sc->nr_to_scan = SHRINK_BATCH(128 for now) until kernel got enough memory. + * + * If sc->nr_to_scan is 0, the VM is querying the cache size, we don't need + * to scan and try to reclaim LRU pages, just return 0 and + * osc_cache_shrink_count() will report the LRU page number. + */ +unsigned long osc_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct client_obd *cli; + struct client_obd *stop_anchor = NULL; + struct lu_env *env; + long shrank = 0; + int rc; + __u16 refcheck; + + if (sc->nr_to_scan == 0) + return 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return SHRINK_STOP; + + spin_lock(&osc_shrink_lock); + while (!list_empty(&osc_shrink_list)) { + cli = list_entry(osc_shrink_list.next, struct client_obd, + cl_shrink_list); + + if (stop_anchor == NULL) + stop_anchor = cli; + else if (cli == stop_anchor) + break; + + list_move_tail(&cli->cl_shrink_list, &osc_shrink_list); + spin_unlock(&osc_shrink_lock); + + /* shrink no more than max_pages_per_rpc for an OSC */ + rc = osc_lru_shrink(env, cli, (sc->nr_to_scan - shrank) > + cli->cl_max_pages_per_rpc ? + cli->cl_max_pages_per_rpc : + sc->nr_to_scan - shrank, true); + if (rc > 0) + shrank += rc; + + if (shrank >= sc->nr_to_scan) + goto out; + + spin_lock(&osc_shrink_lock); + } + spin_unlock(&osc_shrink_lock); + +out: + cl_env_put(env, &refcheck); + + return shrank; +} + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c new file mode 100644 index 0000000000000..a0aaae784515a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c @@ -0,0 +1,320 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ *
+ * Code originally extracted from quota directory
+ */
+
+#include
+#include
+
+#include "osc_internal.h"
+
+static inline struct osc_quota_info *osc_oqi_alloc(u32 id)
+{
+ struct osc_quota_info *oqi;
+
+ OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem);
+ if (oqi != NULL)
+ oqi->oqi_id = id;
+
+ return oqi;
+}
+
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
+{
+ int type;
+ ENTRY;
+
+ for (type = 0; type < LL_MAXQUOTAS; type++) {
+ struct osc_quota_info *oqi;
+
+ oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+ if (oqi) {
+ /* do not try to access oqi here, it could have been
+ * freed by osc_quota_setdq() */
+
+ /* the slot is busy, the user is about to run out of
+ * quota space on this OST */
+ CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n",
+ type == USRQUOTA ? "user" : "group", qid[type]);
+ RETURN(NO_QUOTA);
+ }
+ }
+
+ RETURN(QUOTA_OK);
+}
+
+static inline u32 md_quota_flag(int qtype)
+{
+ switch (qtype) {
+ case USRQUOTA:
+ return OBD_MD_FLUSRQUOTA;
+ case GRPQUOTA:
+ return OBD_MD_FLGRPQUOTA;
+ case PRJQUOTA:
+ return OBD_MD_FLPRJQUOTA;
+ default:
+ return 0;
+ }
+}
+
+static inline u32 fl_quota_flag(int qtype)
+{
+ switch (qtype) {
+ case USRQUOTA:
+ return OBD_FL_NO_USRQUOTA;
+ case GRPQUOTA:
+ return OBD_FL_NO_GRPQUOTA;
+ case PRJQUOTA:
+ return OBD_FL_NO_PRJQUOTA;
+ default:
+ return 0;
+ }
+}
+
+int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[],
+ u64 valid, u32 flags)
+{
+ int type;
+ int rc = 0;
+
+ ENTRY;
+
+ if ((valid & (OBD_MD_FLALLQUOTA)) == 0)
+ RETURN(0);
+
+ mutex_lock(&cli->cl_quota_mutex);
+ /* still mark the quota as running out for the old request, because it
+ * could be processed after the new request at the OST; the side effect
+ * is that the following request will be processed synchronously, but
+ * it will not break quota enforcement. */
+ if (cli->cl_quota_last_xid > xid && !(flags & OBD_FL_NO_QUOTA_ALL))
+ GOTO(out_unlock, rc);
+
+ if (cli->cl_quota_last_xid < xid)
+ cli->cl_quota_last_xid = xid;
+
+ for (type = 0; type < LL_MAXQUOTAS; type++) {
+ struct osc_quota_info *oqi;
+
+ if ((valid & md_quota_flag(type)) == 0)
+ continue;
+
+ /* lookup the ID in the per-type hash table */
+ oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+ if ((flags & fl_quota_flag(type)) != 0) {
+ /* This ID is getting close to its quota limit, let's
+ * switch to sync I/O */
+ if (oqi != NULL)
+ continue;
+
+ oqi = osc_oqi_alloc(qid[type]);
+ if (oqi == NULL) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ rc = cfs_hash_add_unique(cli->cl_quota_hash[type],
+ &qid[type], &oqi->oqi_hash);
+ /* race with others?
+ */
+ if (rc == -EALREADY) {
+ rc = 0;
+ OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+ }
+
+ CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n",
+ cli_name(cli), qtype_name(type), qid[type], rc);
+ } else {
+ /* This ID is now off the hook, let's remove it from
+ * the hash table */
+ if (oqi == NULL)
+ continue;
+
+ oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
+ &qid[type]);
+ if (oqi)
+ OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+
+ CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n",
+ cli_name(cli), qtype_name(type), qid[type], oqi);
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&cli->cl_quota_mutex);
+ RETURN(rc);
+}
+
+/*
+ * Hash operations for uid/gid <-> osc_quota_info
+ */
+static unsigned
+oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+ return cfs_hash_u32_hash(*((__u32*)key), mask);
+}
+
+static int
+oqi_keycmp(const void *key, struct hlist_node *hnode)
+{
+ struct osc_quota_info *oqi;
+ u32 uid;
+
+ LASSERT(key != NULL);
+ uid = *((u32 *)key);
+ oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+ return uid == oqi->oqi_id;
+}
+
+static void *
+oqi_key(struct hlist_node *hnode)
+{
+ struct osc_quota_info *oqi;
+ oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+ return &oqi->oqi_id;
+}
+
+static void *
+oqi_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+}
+
+static void
+oqi_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct osc_quota_info *oqi;
+
+ oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+ OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+}
+
+#define HASH_QUOTA_BKT_BITS 5
+#define HASH_QUOTA_CUR_BITS 5
+#define HASH_QUOTA_MAX_BITS 15
+
+static struct cfs_hash_ops quota_hash_ops = {
+ .hs_hash = oqi_hashfn,
+ .hs_keycmp = oqi_keycmp,
+ .hs_key = oqi_key,
+ .hs_object = oqi_object,
+ .hs_get = oqi_get,
+ .hs_put_locked = oqi_put_locked,
+ .hs_exit = oqi_exit,
+};
+
+int osc_quota_setup(struct obd_device *obd)
+{
+ struct client_obd *cli = &obd->u.cli;
+ int i, type;
+ ENTRY;
+
+ mutex_init(&cli->cl_quota_mutex);
+
+ for (type = 0; type < LL_MAXQUOTAS; type++) {
+ cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
+ HASH_QUOTA_CUR_BITS,
+ HASH_QUOTA_MAX_BITS,
+ HASH_QUOTA_BKT_BITS,
+ 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &quota_hash_ops,
+ CFS_HASH_DEFAULT);
+ if (cli->cl_quota_hash[type] == NULL)
+ break;
+ }
+
+ if (type == LL_MAXQUOTAS)
+ RETURN(0);
+
+ for (i = 0; i < type; i++)
+ cfs_hash_putref(cli->cl_quota_hash[i]);
+
+ RETURN(-ENOMEM);
+}
+
+int osc_quota_cleanup(struct obd_device *obd)
+{
+ struct client_obd *cli = &obd->u.cli;
+ int type;
+ ENTRY;
+
+ for (type = 0; type < LL_MAXQUOTAS; type++)
+ cfs_hash_putref(cli->cl_quota_hash[type]);
+
+ RETURN(0);
+}
+
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct ptlrpc_request *req;
+ struct obd_quotactl *oqc;
+ int rc;
+ ENTRY;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
+ OST_QUOTACTL);
+ if (req == NULL)
+ RETURN(-ENOMEM);
+
+ oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+ *oqc = *oqctl;
+
+ ptlrpc_request_set_replen(req);
+ ptlrpc_at_set_req_timeout(req);
+ req->rq_no_resend = 1;
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+ if (req->rq_repmsg &&
(oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + *oqctl = *oqc; + } else if (!rc) { + CERROR ("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c new file mode 100644 index 0000000000000..9a3c3fb092209 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -0,0 +1,3517 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "osc_internal.h" + +atomic_t osc_pool_req_count; +unsigned int osc_reqpool_maxreqcount; +struct ptlrpc_request_pool *osc_rq_pool; + +/* max memory used for request pool, unit is MB */ +static unsigned int osc_reqpool_mem_max = 5; +module_param(osc_reqpool_mem_max, uint, 0444); + +static int osc_idle_timeout = 20; +module_param(osc_idle_timeout, uint, 0644); + +#define osc_grant_args osc_brw_async_args + +struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; + +struct osc_fsync_args { + struct osc_object *fa_obj; + struct obdo *fa_oa; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; + +struct osc_ladvise_args { + struct obdo *la_oa; + obd_enqueue_update_f la_upcall; + void *la_cookie; +}; + +static void osc_release_ppga(struct brw_page **ppga, size_t count); +static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *data, int rc); + +void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); +} + +static int osc_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + ENTRY; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = 
req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + EXIT; +out: + ptlrpc_req_finished(req); + + return rc; +} + +static int osc_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + ENTRY; + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + EXIT; +out: + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int osc_setattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_setattr_args *sa, int rc) +{ + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, + &body->oa); +out: + rc = sa->sa_upcall(sa->sa_cookie, rc); + RETURN(rc); +} + +int osc_setattr_async(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + /* do mds to ost setattr asynchronously */ + if (!rqset) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + } else { + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_setattr_interpret; + + CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + } + + RETURN(0); +} + +static int osc_ladvise_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_ladvise_args *la = arg; + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + *la->la_oa = body->oa; +out: + rc = la->la_upcall(la->la_cookie, rc); + RETURN(rc); +} + +/** + * If rqset is NULL, do not wait for response. 
Upcall and cookie could also + * be NULL in this case + */ +int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, + struct ladvise_hdr *ladvise_hdr, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_ladvise_args *la; + int rc; + struct lu_ladvise *req_ladvise; + struct lu_ladvise *ladvise = ladvise_hdr->lah_advise; + int num_advise = ladvise_hdr->lah_count; + struct ladvise_hdr *req_ladvise_hdr; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_LADVISE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT, + num_advise * sizeof(*ladvise)); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oa); + + req_ladvise_hdr = req_capsule_client_get(&req->rq_pill, + &RMF_OST_LADVISE_HDR); + memcpy(req_ladvise_hdr, ladvise_hdr, sizeof(*ladvise_hdr)); + + req_ladvise = req_capsule_client_get(&req->rq_pill, &RMF_OST_LADVISE); + memcpy(req_ladvise, ladvise, sizeof(*ladvise) * num_advise); + ptlrpc_request_set_replen(req); + + if (rqset == NULL) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + RETURN(0); + } + + req->rq_interpret_reply = osc_ladvise_interpret; + CLASSERT(sizeof(*la) <= sizeof(req->rq_async_args)); + la = ptlrpc_req_async_args(req); + la->la_oa = oa; + la->la_upcall = upcall; + la->la_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + + RETURN(0); +} + +static int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + ENTRY; + + LASSERT(oa != NULL); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi))); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); + if (rc) { + ptlrpc_request_free(req); + GOTO(out, rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out_req, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out_req, rc = -EPROTO); + + CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + CDEBUG(D_HA, "transno: %lld\n", + lustre_msg_get_transno(req->rq_repmsg)); +out_req: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct obd_import *imp = class_exp2cliimp(exp); + struct ost_body *body; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc < 0) { + 
ptlrpc_request_free(req); + RETURN(rc); + } + + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + + lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; + CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + ptlrpcd_add_req(req); + + RETURN(0); +} +EXPORT_SYMBOL(osc_punch_send); + +static int osc_sync_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_fsync_args *fa = arg; + struct ost_body *body; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + struct cl_object *obj; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + CERROR("can't unpack ost_body\n"); + GOTO(out, rc = -EPROTO); + } + + *fa->fa_oa = body->oa; + obj = osc2cl(fa->fa_obj); + + /* Update osc object's blocks attribute */ + cl_object_attr_lock(obj); + if (body->oa.o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = body->oa.o_blocks; + valid |= CAT_BLOCKS; + } + + if (valid != 0) + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + +out: + rc = fa->fa_upcall(fa->fa_cookie, rc); + RETURN(rc); +} + +int osc_sync_base(struct osc_object *obj, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_fsync_args *fa; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* overload the size and blocks fields in the oa with start/end */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_sync_interpret; + + CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args)); + fa = ptlrpc_req_async_args(req); + fa->fa_obj = obj; + fa->fa_oa = oa; + fa->fa_upcall = upcall; + fa->fa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + + RETURN (0); +} + +/* Find and cancel locally locks matched by @mode in the resource found by + * @objid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, + struct list_head *cancels, + enum ldlm_mode mode, __u64 lock_flags) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. 
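The comment above boils down to a two-flag decision: skip the local cancellation only when the export advertises early lock cancel (ELC) but the namespace has it disabled through procfs; in every other case the covering locks are cancelled locally up front, without an extra RPC. A minimal userspace sketch of that truth table, with hypothetical helpers standing in for exp_connect_cancelset() and ns_connect_cancelset():

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for exp_connect_cancelset()/ns_connect_cancelset(). */
static bool cancel_locally(bool exp_supports_elc, bool ns_elc_enabled)
{
        /* ELC supported by the server but disabled locally through procfs:
         * cancel nothing here. */
        if (exp_supports_elc && !ns_elc_enabled)
                return false;
        /* Otherwise cancel in advance, locally, without sending any RPC. */
        return true;
}

int main(void)
{
        for (int exp = 0; exp <= 1; exp++)
                for (int ns = 0; ns <= 1; ns++)
                        printf("exp_elc=%d ns_elc=%d -> cancel locally: %s\n",
                               exp, ns, cancel_locally(exp, ns) ? "yes" : "no");
        return 0;
}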
*/ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + ostid_build_res_name(&oa->o_oi, &res_id); + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, cancels, NULL, mode, + lock_flags, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +static int osc_destroy_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, + int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + + atomic_dec(&cli->cl_destroy_in_flight); + wake_up(&cli->cl_destroy_waitq); + return 0; +} + +static int osc_can_send_destroy(struct client_obd *cli) +{ + if (atomic_inc_return(&cli->cl_destroy_in_flight) <= + cli->cl_max_rpcs_in_flight) { + /* The destroy request can be sent */ + return 1; + } + if (atomic_dec_return(&cli->cl_destroy_in_flight) < + cli->cl_max_rpcs_in_flight) { + /* + * The counter has been modified between the two atomic + * operations. + */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +static int osc_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct ost_body *body; + struct list_head cancels = LIST_HEAD_INIT(cancels); + int rc, count; + ENTRY; + + if (!oa) { + CDEBUG(D_INFO, "oa NULL\n"); + RETURN(-EINVAL); + } + + count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, + LDLM_FL_DISCARD_DATA); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, + 0, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = osc_destroy_interpret; + if (!osc_can_send_destroy(cli)) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + + /* + * Wait until the number of on-going destroy RPCs drops + * under max_rpc_in_flight + */ + rc = l_wait_event_exclusive(cli->cl_destroy_waitq, + osc_can_send_destroy(cli), &lwi); + if (rc) { + ptlrpc_req_finished(req); + RETURN(rc); + } + } + + /* Do not wait for response */ + ptlrpcd_add_req(req); + RETURN(0); +} + +static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, + long writing_bytes) +{ + u64 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT; + + LASSERT(!(oa->o_valid & bits)); + + oa->o_valid |= bits; + spin_lock(&cli->cl_loi_list_lock); + if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data, GRANT_PARAM)) + oa->o_dirty = cli->cl_dirty_grant; + else + oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; + if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) { + CERROR("dirty %lu > dirty_max %lu\n", + cli->cl_dirty_pages, + cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else if (unlikely(atomic_long_read(&obd_dirty_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The atomic_read() allowing the atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in a small fudge factor (+1). 
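osc_can_send_destroy() above throttles destroy RPCs with a pair of unlocked atomics: optimistically take an in-flight slot, and if that pushed the counter past cl_max_rpcs_in_flight, give the slot back and report the request as throttled (the real code also wakes cl_destroy_waitq when the counter moved between the two operations). A simplified sketch using C11 atomics and a hypothetical limit:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_RPCS_IN_FLIGHT 2        /* hypothetical cl_max_rpcs_in_flight */

static atomic_long in_flight;

static bool can_send_destroy(void)
{
        /* atomic_fetch_add() returns the old value, so +1 mirrors
         * atomic_inc_return() in the kernel code. */
        if (atomic_fetch_add(&in_flight, 1) + 1 <= MAX_RPCS_IN_FLIGHT)
                return true;

        /* Overshot the limit: give the slot back.  The kernel version also
         * wakes cl_destroy_waitq if the counter changed in between. */
        atomic_fetch_sub(&in_flight, 1);
        return false;
}

int main(void)
{
        for (int i = 0; i < 4; i++)
                printf("destroy %d admitted: %d\n", i, can_send_destroy());
        return 0;        /* the first two are admitted, the rest throttled */
}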
*/ + CERROR("%s: dirty %ld > system dirty_max %ld\n", + cli_name(cli), atomic_long_read(&obd_dirty_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > + 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else { + unsigned long nrpages; + unsigned long undirty; + + nrpages = cli->cl_max_pages_per_rpc; + nrpages *= cli->cl_max_rpcs_in_flight + 1; + nrpages = max(nrpages, cli->cl_dirty_max_pages); + undirty = nrpages << PAGE_SHIFT; + if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data, + GRANT_PARAM)) { + int nrextents; + + /* take extent tax into account when asking for more + * grant space */ + nrextents = (nrpages + cli->cl_max_extent_pages - 1) / + cli->cl_max_extent_pages; + undirty += nrextents * cli->cl_grant_extent_tax; + } + /* Do not ask for more than OBD_MAX_GRANT - a margin for server + * to add extent tax, etc. + */ + oa->o_undirty = min(undirty, OBD_MAX_GRANT & + ~(PTLRPC_MAX_BRW_SIZE * 4UL)); + } + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; + /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */ + if (cli->cl_lost_grant > INT_MAX) { + CDEBUG(D_CACHE, + "%s: avoided o_dropped overflow: cl_lost_grant %lu\n", + cli_name(cli), cli->cl_lost_grant); + oa->o_dropped = INT_MAX; + } else { + oa->o_dropped = cli->cl_lost_grant; + } + cli->cl_lost_grant -= oa->o_dropped; + spin_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "%s: dirty: %llu undirty: %u dropped %u grant: %llu" + " cl_lost_grant %lu\n", cli_name(cli), oa->o_dirty, + oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant); +} + +void osc_update_next_shrink(struct client_obd *cli) +{ + cli->cl_next_shrink_grant = ktime_get_seconds() + + cli->cl_grant_shrink_interval; + + CDEBUG(D_CACHE, "next time %lld to shrink grant\n", + cli->cl_next_shrink_grant); +} + +static void __osc_update_grant(struct client_obd *cli, u64 grant) +{ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant += grant; + spin_unlock(&cli->cl_loi_list_lock); +} + +static void osc_update_grant(struct client_obd *cli, struct ost_body *body) +{ + if (body->oa.o_valid & OBD_MD_FLGRANT) { + CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant); + __osc_update_grant(cli, body->oa.o_grant); + } +} + +/** + * grant thread data for shrinking space. 
+ */ +struct grant_thread_data { + struct list_head gtd_clients; + struct mutex gtd_mutex; + unsigned long gtd_stopped:1; +}; +static struct grant_thread_data client_gtd; + +static int osc_shrink_grant_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *aa, int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; + struct ost_body *body; + + if (rc != 0) { + __osc_update_grant(cli, oa->o_grant); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); +out: + OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); + oa = NULL; + return rc; +} + +static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) +{ + spin_lock(&cli->cl_loi_list_lock); + oa->o_grant = cli->cl_avail_grant / 4; + cli->cl_avail_grant -= oa->o_grant; + spin_unlock(&cli->cl_loi_list_lock); + if (!(oa->o_valid & OBD_MD_FLFLAGS)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags = 0; + } + oa->o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); +} + +/* Shrink the current grant, either from some large amount to enough for a + * full set of in-flight RPCs, or if we have already shrunk to that limit + * then to enough for a single RPC. This avoids keeping more grant than + * needed, and avoids shrinking the grant piecemeal. */ +static int osc_shrink_grant(struct client_obd *cli) +{ + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_SHIFT); + + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; + spin_unlock(&cli->cl_loi_list_lock); + + return osc_shrink_grant_to_target(cli, target_bytes); +} + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) +{ + int rc = 0; + struct ost_body *body; + ENTRY; + + spin_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. 
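osc_shrink_grant() above picks its target in two steps: keep enough grant for a full set of in-flight RPCs, and only once the available grant is already at or below that level shrink further to a single RPC; osc_shrink_grant_to_target() then refuses to go below one RPC and only sends the set_info RPC while the target is still under cl_avail_grant. A small sketch of the target selection, assuming hypothetical sizes:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* Mirrors the target selection in osc_shrink_grant()/_to_target(),
 * without the locking or the RPC itself. */
static uint64_t shrink_target(uint64_t avail_grant,
                              unsigned long max_pages_per_rpc,
                              unsigned long max_rpcs_in_flight)
{
        uint64_t one_rpc = (uint64_t)max_pages_per_rpc << PAGE_SHIFT;
        uint64_t full_set = (max_rpcs_in_flight + 1) * one_rpc;
        uint64_t target = full_set;

        if (avail_grant <= full_set)        /* already shrunk once */
                target = one_rpc;
        if (target < one_rpc)               /* never below a single RPC */
                target = one_rpc;
        return target;
}

int main(void)
{
        printf("target = %llu bytes\n", (unsigned long long)
               shrink_target(1ULL << 28, 256, 8));
        return 0;
}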
*/ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; + + if (target_bytes >= cli->cl_avail_grant) { + spin_unlock(&cli->cl_loi_list_lock); + RETURN(0); + } + spin_unlock(&cli->cl_loi_list_lock); + + OBD_ALLOC_PTR(body); + if (!body) + RETURN(-ENOMEM); + + osc_announce_cached(cli, &body->oa, 0); + + spin_lock(&cli->cl_loi_list_lock); + if (target_bytes >= cli->cl_avail_grant) { + /* available grant has changed since target calculation */ + spin_unlock(&cli->cl_loi_list_lock); + GOTO(out_free, rc = 0); + } + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; + spin_unlock(&cli->cl_loi_list_lock); + if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); + + rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, + sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, + sizeof(*body), body, NULL); + if (rc != 0) + __osc_update_grant(cli, body->oa.o_grant); +out_free: + OBD_FREE_PTR(body); + RETURN(rc); +} + +static int osc_should_shrink_grant(struct client_obd *client) +{ + time64_t next_shrink = client->cl_next_shrink_grant; + + if (client->cl_import == NULL) + return 0; + + if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) || + client->cl_import->imp_grant_shrink_disabled) { + osc_update_next_shrink(client); + return 0; + } + + if (ktime_get_seconds() >= next_shrink - 5) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. */ + int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT; + + if (client->cl_import->imp_state == LUSTRE_IMP_FULL && + client->cl_avail_grant > brw_size) + return 1; + else + osc_update_next_shrink(client); + } + return 0; +} + +#define GRANT_SHRINK_RPC_BATCH 100 + +static struct delayed_work work; + +static void osc_grant_work_handler(struct work_struct *data) +{ + struct client_obd *cli; + int rpc_sent; + bool init_next_shrink = true; + time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL; + + rpc_sent = 0; + mutex_lock(&client_gtd.gtd_mutex); + list_for_each_entry(cli, &client_gtd.gtd_clients, + cl_grant_chain) { + if (rpc_sent < GRANT_SHRINK_RPC_BATCH && + osc_should_shrink_grant(cli)) { + osc_shrink_grant(cli); + rpc_sent++; + } + + if (!init_next_shrink) { + if (cli->cl_next_shrink_grant < next_shrink && + cli->cl_next_shrink_grant > ktime_get_seconds()) + next_shrink = cli->cl_next_shrink_grant; + } else { + init_next_shrink = false; + next_shrink = cli->cl_next_shrink_grant; + } + } + mutex_unlock(&client_gtd.gtd_mutex); + + if (client_gtd.gtd_stopped == 1) + return; + + if (next_shrink > ktime_get_seconds()) + schedule_delayed_work(&work, msecs_to_jiffies( + (next_shrink - ktime_get_seconds()) * + MSEC_PER_SEC)); + else + schedule_work(&work.work); +} + +/** + * Start grant thread for returing grant to server for idle clients. 
+ */ +static int osc_start_grant_work(void) +{ + client_gtd.gtd_stopped = 0; + mutex_init(&client_gtd.gtd_mutex); + INIT_LIST_HEAD(&client_gtd.gtd_clients); + + INIT_DELAYED_WORK(&work, osc_grant_work_handler); + schedule_work(&work.work); + + return 0; +} + +static void osc_stop_grant_work(void) +{ + client_gtd.gtd_stopped = 1; + cancel_delayed_work_sync(&work); +} + +static void osc_add_grant_list(struct client_obd *client) +{ + mutex_lock(&client_gtd.gtd_mutex); + list_add(&client->cl_grant_chain, &client_gtd.gtd_clients); + mutex_unlock(&client_gtd.gtd_mutex); +} + +static void osc_del_grant_list(struct client_obd *client) +{ + if (list_empty(&client->cl_grant_chain)) + return; + + mutex_lock(&client_gtd.gtd_mutex); + list_del_init(&client->cl_grant_chain); + mutex_unlock(&client_gtd.gtd_mutex); +} + +void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +{ + /* + * ocd_grant is the total grant amount we're expect to hold: if we've + * been evicted, it's the new avail_grant amount, cl_dirty_pages will + * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant + + * dirty. + * + * race is tolerable here: if we're evicted, but imp_state already + * left EVICTED state, then cl_dirty_pages must be 0 already. + */ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = ocd->ocd_grant; + if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) { + unsigned long consumed = cli->cl_reserved_grant; + + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) + consumed += cli->cl_dirty_grant; + else + consumed += cli->cl_dirty_pages << PAGE_SHIFT; + if (cli->cl_avail_grant < consumed) { + CERROR("%s: granted %ld but already consumed %ld\n", + cli_name(cli), cli->cl_avail_grant, consumed); + cli->cl_avail_grant = 0; + } else { + cli->cl_avail_grant -= consumed; + } + } + + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) { + u64 size; + int chunk_mask; + + /* overhead for each extent insertion */ + cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10; + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_SHIFT, + ocd->ocd_grant_blkbits); + /* max_pages_per_rpc must be chunk aligned */ + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); + cli->cl_max_pages_per_rpc = (cli->cl_max_pages_per_rpc + + ~chunk_mask) & chunk_mask; + /* determine maximum extent size, in #pages */ + size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits; + cli->cl_max_extent_pages = size >> PAGE_SHIFT; + if (cli->cl_max_extent_pages == 0) + cli->cl_max_extent_pages = 1; + } else { + cli->cl_grant_extent_tax = 0; + cli->cl_chunkbits = PAGE_SHIFT; + cli->cl_max_extent_pages = DT_MAX_BRW_PAGES; + } + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld." + "chunk bits: %d cl_max_extent_pages: %d\n", + cli_name(cli), + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits, + cli->cl_max_extent_pages); + + if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain)) + osc_add_grant_list(cli); +} +EXPORT_SYMBOL(osc_init_grant); + +/* We assume that the reason this OSC got a short read is because it read + * beyond the end of a stripe file; i.e. lustre is reading a sparse file + * via the LOV, and it _knows_ it's reading inside the file, it's just that + * this stripe never got written at or beyond this stripe offset yet. 
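handle_short_read() just below walks the page array in two passes: skip the bytes that actually arrived, zero the tail of the page where the read stopped, then zero every page after it. The same walk over plain byte buffers, as a self-contained sketch with made-up page sizes:

#include <stdio.h>
#include <string.h>

struct fake_page { char buf[8]; int count; };

static void zero_after_short_read(int nob_read, int page_count,
                                  struct fake_page *pg)
{
        int i = 0;

        /* skip bytes that were read OK */
        while (nob_read > 0 && page_count > 0) {
                if (pg[i].count > nob_read) {
                        /* EOF landed inside this page: clear its tail */
                        memset(pg[i].buf + nob_read, 0,
                               pg[i].count - nob_read);
                        page_count--;
                        i++;
                        break;
                }
                nob_read -= pg[i].count;
                page_count--;
                i++;
        }
        /* zero every page past EOF */
        while (page_count-- > 0) {
                memset(pg[i].buf, 0, pg[i].count);
                i++;
        }
}

int main(void)
{
        struct fake_page pg[3] = {
                { "AAAAAAA", 8 }, { "BBBBBBB", 8 }, { "CCCCCCC", 8 },
        };

        zero_after_short_read(10, 3, pg);        /* only 10 of 24 bytes came */
        printf("pg[1].buf[2]=%d pg[2].buf[0]=%d\n", pg[1].buf[2], pg[2].buf[0]);
        return 0;        /* both print 0 */
}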
*/ +static void handle_short_read(int nob_read, size_t page_count, + struct brw_page **pga) +{ + char *ptr; + int i = 0; + + /* skip bytes read OK */ + while (nob_read > 0) { + LASSERT (page_count > 0); + + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } + + nob_read -= pga[i]->count; + page_count--; + i++; + } + + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } +} + +static int check_write_rcs(struct ptlrpc_request *req, + int requested_nob, int niocount, + size_t page_count, struct brw_page **pga) +{ + int i; + __u32 *remote_rcs; + + remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, + sizeof(*remote_rcs) * + niocount); + if (remote_rcs == NULL) { + CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); + return(-EPROTO); + } + + /* return error if any niobuf was in error */ + for (i = 0; i < niocount; i++) { + if ((int)remote_rcs[i] < 0) + return(remote_rcs[i]); + + if (remote_rcs[i] != 0) { + CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", + i, remote_rcs[i], req); + return(-EPROTO); + } + } + if (req->rq_bulk != NULL && + req->rq_bulk->bd_nob_transferred != requested_nob) { + CERROR("Unexpected # bytes transferred: %d (requested %d)\n", + req->rq_bulk->bd_nob_transferred, requested_nob); + return(-EPROTO); + } + + return (0); +} + +static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) +{ + if (p1->flag != p2->flag) { + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_SYNC | + OBD_BRW_ASYNC | OBD_BRW_NOQUOTA | + OBD_BRW_SOFT_SYNC); + + /* warn if we try to combine flags that we don't know to be + * safe to combine */ + if (unlikely((p1->flag & mask) != (p2->flag & mask))) { + CWARN("Saw flags 0x%x and 0x%x in the same brw, please " + "report this at https://jira.whamcloud.com/\n", + p1->flag, p2->flag); + } + return 0; + } + + return (p1->off + p1->count == p2->off); +} + +#if IS_ENABLED(CONFIG_CRC_T10DIF) +static int osc_checksum_bulk_t10pi(const char *obd_name, int nob, + size_t pg_count, struct brw_page **pga, + int opc, obd_dif_csum_fn *fn, + int sector_size, + u32 *check_sum) +{ + struct ahash_request *req; + /* Used Adler as the default checksum type on top of DIF tags */ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct page *__page; + unsigned char *buffer; + __u16 *guard_start; + unsigned int bufsize; + int guard_number; + int used_number = 0; + int used; + u32 cksum; + int rc = 0; + int i = 0; + + LASSERT(pg_count > 0); + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, + fn); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + nob -= pga[i]->count; + pg_count--; + i++; + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} +#else /* !CONFIG_CRC_T10DIF */ +#define obd_dif_ip_fn NULL +#define obd_dif_crc_fn NULL +#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum) \ + -EOPNOTSUPP +#endif /* CONFIG_CRC_T10DIF */ + +static int osc_checksum_bulk(int nob, size_t pg_count, + struct brw_page **pga, int opc, + enum cksum_types cksum_type, + u32 *cksum) +{ + int i = 0; + struct ahash_request *req; + unsigned int bufsize; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(req, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count); + LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", + (int)(pga[i]->off & ~PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = sizeof(*cksum); + cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + (*cksum)++; + + return 0; +} + +static int osc_checksum_bulk_rw(const char *obd_name, + enum cksum_types cksum_type, + int nob, size_t pg_count, + struct brw_page **pga, int opc, + u32 *check_sum) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga, + opc, fn, sector_size, check_sum); + else + rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type, + check_sum); + + RETURN(rc); +} + +static int +osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, + u32 page_count, struct brw_page **pga, + struct ptlrpc_request **reqp, int resend) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct ost_body *body; + struct obd_ioobj *ioobj; + struct niobuf_remote *niobuf; + int niocount, i, requested_nob, opc, rc, short_io_size = 0; + struct osc_brw_async_args *aa; + struct req_capsule *pill; + struct brw_page *pg_prev; + void *short_io_buf; + const char *obd_name = cli->cl_import->imp_obd->obd_name; + + ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) + RETURN(-ENOMEM); /* Recoverable */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) + RETURN(-EINVAL); /* Fatal */ + + if ((cmd & OBD_BRW_WRITE) != 0) { + opc = OST_WRITE; + req = ptlrpc_request_alloc_pool(cli->cl_import, + osc_rq_pool, + &RQF_OST_BRW_WRITE); + } else { + opc = OST_READ; + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); + } + if (req == NULL) + RETURN(-ENOMEM); + + for (niocount = i = 1; i < page_count; i++) { + if (!can_merge_pages(pga[i - 1], pga[i])) + niocount++; + } + + pill = &req->rq_pill; + req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT, + sizeof(*ioobj)); + req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, + niocount * sizeof(*niobuf)); + + for (i = 0; i < page_count; i++) + short_io_size += pga[i]->count; + + /* Check if read/write is small enough to be a short io. */ + if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 || + !imp_connect_shortio(cli->cl_import)) + short_io_size = 0; + + req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT, + opc == OST_READ ? 
0 : short_io_size); + if (opc == OST_READ) + req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER, + short_io_size); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + if (short_io_size != 0) { + desc = NULL; + short_io_buf = NULL; + goto no_bulk; + } + + desc = ptlrpc_prep_bulk_imp(req, page_count, + cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, + (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE : + PTLRPC_BULK_PUT_SINK) | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + /* NB request now owns desc and will free it when it gets freed */ +no_bulk: + body = req_capsule_client_get(pill, &RMF_OST_BODY); + ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid() + * and from_kgid(), because they are asynchronous. Fortunately, variable + * oa contains valid o_uid and o_gid in these two operations. + * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658. + * OBD_MD_FLUID and OBD_MD_FLUID is not set in order to avoid breaking + * other process logic */ + body->oa.o_uid = oa->o_uid; + body->oa.o_gid = oa->o_gid; + + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + /* The high bits of ioo_max_brw tells server _maximum_ number of bulks + * that might be send for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). It sends + * "max - 1" for old client compatibility sending "0", and also so the + * the actual maximum is a power-of-two number, not one less. 
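The comment above is the reason the wire value carries "number of bulks minus one": an old client that still sends 0 is decoded as one bulk, and the decoded maximum stays a power of two rather than one less. A toy encode/decode pair showing only that off-by-one convention; the real bit packing done by ioobj_max_brw_set() is not reproduced here:

#include <stdio.h>

/* Hypothetical helpers: store (max - 1) on the wire, read back (value + 1). */
static unsigned int max_brw_encode(unsigned int max_brw)
{
        return max_brw ? max_brw - 1 : 0;        /* 0 also means "one bulk" */
}

static unsigned int max_brw_decode(unsigned int wire)
{
        return wire + 1;
}

int main(void)
{
        unsigned int vals[] = { 0, 1, 2, 4, 8 };

        for (unsigned int i = 0; i < sizeof(vals) / sizeof(vals[0]); i++)
                printf("md_max_brw=%u -> wire=%u -> decoded max=%u\n",
                       vals[i], max_brw_encode(vals[i]),
                       max_brw_decode(max_brw_encode(vals[i])));
        return 0;
}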
LU-1431 */ + if (desc != NULL) + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + else /* short io */ + ioobj_max_brw_set(ioobj, 0); + + if (short_io_size != 0) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHORT_IO; + CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n", + short_io_size); + if (opc == OST_WRITE) { + short_io_buf = req_capsule_client_get(pill, + &RMF_SHORT_IO); + LASSERT(short_io_buf != NULL); + } + } + + LASSERT(page_count > 0); + pg_prev = pga[0]; + for (requested_nob = i = 0; i < page_count; i++, niobuf++) { + struct brw_page *pg = pga[i]; + int poff = pg->off & ~PAGE_MASK; + + LASSERT(pg->count > 0); + /* make sure there is no gap in the middle of page array */ + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: %llu, count: %u\n", + i, page_count, pg, pg->off, pg->count); + LASSERTF(i == 0 || pg->off > pg_prev->off, + "i %d p_c %u pg %p [pri %lu ind %lu] off %llu" + " prev_pg %p [pri %lu ind %lu] off %llu\n", + i, page_count, + pg->pg, page_private(pg->pg), pg->pg->index, pg->off, + pg_prev->pg, page_private(pg_prev->pg), + pg_prev->pg->index, pg_prev->off); + LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == + (pg->flag & OBD_BRW_SRVLOCK)); + if (short_io_size != 0 && opc == OST_WRITE) { + unsigned char *ptr = ll_kmap_atomic(pg->pg, KM_USER0); + + LASSERT(short_io_size >= requested_nob + pg->count); + memcpy(short_io_buf + requested_nob, + ptr + poff, + pg->count); + ll_kunmap_atomic(ptr, KM_USER0); + } else if (short_io_size == 0) { + desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, + pg->count); + } + requested_nob += pg->count; + + if (i > 0 && can_merge_pages(pg_prev, pg)) { + niobuf--; + niobuf->rnb_len += pg->count; + } else { + niobuf->rnb_offset = pg->off; + niobuf->rnb_len = pg->count; + niobuf->rnb_flags = pg->flag; + } + pg_prev = pg; + } + + LASSERTF((void *)(niobuf - niocount) == + req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), + "want %p - real %p\n", req_capsule_client_get(&req->rq_pill, + &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount)); + + osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? 
requested_nob:0); + if (resend) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_RECOV_RESEND; + } + + if (osc_should_shrink_grant(cli)) + osc_shrink_grant_local(cli, &body->oa); + + /* size[REQ_REC_OFF] still sizeof (*body) */ + if (opc == OST_WRITE) { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + /* store cl_cksum_type in a local variable since + * it can be changed via lprocfs */ + enum cksum_types cksum_type = cli->cl_cksum_type; + + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + + rc = osc_checksum_bulk_rw(obd_name, cksum_type, + requested_nob, page_count, + pga, OST_WRITE, + &body->oa.o_cksum); + if (rc < 0) { + CDEBUG(D_PAGE, "failed to checksum, rc = %d\n", + rc); + GOTO(out, rc); + } + CDEBUG(D_PAGE, "checksum at write origin: %x\n", + body->oa.o_cksum); + + /* save this in 'oa', too, for later checking */ + oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + oa->o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + } else { + /* clear out the checksum flag, in case this is a + * resend but cl_checksum is no longer set. b=11238 */ + oa->o_valid &= ~OBD_MD_FLCKSUM; + } + oa->o_cksum = body->oa.o_cksum; + /* 1 RC per niobuf */ + req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, + sizeof(__u32) * niocount); + } else { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cli->cl_cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + } + + /* Client cksum has been already copied to wire obdo in previous + * lustre_set_wire_obdo(), and in the case a bulk-read is being + * resent due to cksum error, this will allow Server to + * check+dump pages on its side */ + } + ptlrpc_request_set_replen(req); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; + INIT_LIST_HEAD(&aa->aa_oaps); + + *reqp = req; + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + CDEBUG(D_RPCTRACE, "brw rpc %p - object "DOSTID" offset %lld<>%lld\n", + req, POSTID(&oa->o_oi), niobuf[0].rnb_offset, + niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len); + RETURN(0); + + out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +char dbgcksum_file_name[PATH_MAX]; + +static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count, + struct brw_page **pga, __u32 server_cksum, + __u32 client_cksum) +{ + struct file *filp; + int rc, i; + unsigned int len; + char *buf; + + /* will only keep dump of pages on first error for the same range in + * file/fid, not during the resends/retries. */ + snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name), + "%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x", + (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0 ? + libcfs_debug_file_path_arr : + LIBCFS_DEBUG_FILE_PATH_DEFAULT), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1, + client_cksum, server_cksum); + filp = filp_open(dbgcksum_file_name, + O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + if (rc == -EEXIST) + CDEBUG(D_INFO, "%s: can't open to dump pages with " + "checksum error: rc = %d\n", dbgcksum_file_name, + rc); + else + CERROR("%s: can't open to dump pages with checksum " + "error: rc = %d\n", dbgcksum_file_name, rc); + return; + } + + for (i = 0; i < page_count; i++) { + len = pga[i]->count; + buf = kmap(pga[i]->pg); + while (len != 0) { + rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); + if (rc < 0) { + CERROR("%s: wanted to write %u but got %d " + "error\n", dbgcksum_file_name, len, rc); + break; + } + len -= rc; + buf += rc; + CDEBUG(D_INFO, "%s: wrote %d bytes\n", + dbgcksum_file_name, rc); + } + kunmap(pga[i]->pg); + } + + rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc); + filp_close(filp, NULL); + return; +} + +static int +check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer, + __u32 client_cksum, __u32 server_cksum, + struct osc_brw_async_args *aa) +{ + const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name; + enum cksum_types cksum_type; + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + __u32 new_cksum; + char *msg; + int rc; + + if (server_cksum == client_cksum) { + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + return 0; + } + + if (aa->aa_cli->cl_checksum_dump) + dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga, + server_cksum, client_cksum); + + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); + + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + fn = obd_dif_ip_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + fn = obd_dif_ip_fn; + sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + fn = obd_dif_crc_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + fn = obd_dif_crc_fn; + sector_size = 4096; + break; + default: + break; + } + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob, + aa->aa_page_count, aa->aa_ppga, + OST_WRITE, fn, sector_size, + &new_cksum); + else + rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, + aa->aa_ppga, OST_WRITE, cksum_type, + &new_cksum); + + if (rc < 0) + msg = "failed to calculate the client write checksum"; + else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags)) + msg = "the server did not use the checksum type specified in " + "the original request - likely a protocol problem"; + else if (new_cksum == server_cksum) + msg = "changed on the client after we checksummed it - " + "likely false positive due to mmap IO (bug 11742)"; + else if (new_cksum == client_cksum) + msg = "changed in transit before arrival at OST"; + else + msg = "changed in transit AND doesn't match the original - " + "likely false positive due to mmap IO (bug 11742)"; + + LCONSOLE_ERROR_MSG(0x132, "%s: BAD WRITE CHECKSUM: %s: from %s inode " + DFID " object "DOSTID" extent [%llu-%llu], original " + "client csum %x (type %x), server csum %x (type %x)," + " client csum now %x\n", + obd_name, msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + POSTID(&oa->o_oi), aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count - 1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - 1, + client_cksum, + obd_cksum_type_unpack(aa->aa_oa->o_flags), + server_cksum, cksum_type, new_cksum); + return 1; +} + +/* Note rc enters this function as number of bytes transferred */ +static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) +{ + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = aa->aa_cli; + const char *obd_name = cli->cl_import->imp_obd->obd_name; + const struct lnet_process_id *peer = + &req->rq_import->imp_connection->c_peer; + struct ost_body *body; + u32 client_cksum = 0; + ENTRY; + + if (rc < 0 && rc != -EDQUOT) { + DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc); + RETURN(rc); + } + + LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + DEBUG_REQ(D_INFO, req, "Can't unpack body\n"); + RETURN(-EPROTO); + } + + /* set/clear over quota flag for a uid/gid/projid */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && + body->oa.o_valid & (OBD_MD_FLALLQUOTA)) { + unsigned qid[LL_MAXQUOTAS] = { + body->oa.o_uid, body->oa.o_gid, + body->oa.o_projid }; + CDEBUG(D_QUOTA, "setdq for [%u %u %u] with valid %#llx, flags %x\n", + body->oa.o_uid, body->oa.o_gid, body->oa.o_projid, + body->oa.o_valid, body->oa.o_flags); + osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid, + body->oa.o_flags); + } + + osc_update_grant(cli, body); + + if (rc < 0) + RETURN(rc); + + if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) + client_cksum = aa->aa_oa->o_cksum; /* save for later */ + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + if (rc > 0) { + CERROR("Unexpected +ve rc %d\n", rc); + RETURN(-EPROTO); + } + + if (req->rq_bulk != NULL && + sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + RETURN(-EAGAIN); + + if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && + check_write_checksum(&body->oa, peer, client_cksum, + body->oa.o_cksum, aa)) + RETURN(-EAGAIN); + + rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count, + aa->aa_page_count, aa->aa_ppga); + GOTO(out, rc); + } + + /* The rest of this function executes only for OST_READs */ + + if (req->rq_bulk == NULL) { + rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO, + RCL_SERVER); + LASSERT(rc == req->rq_status); + } else { + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + } + if (rc < 0) + GOTO(out, rc = -EAGAIN); + + if (rc > aa->aa_requested_nob) { + CERROR("Unexpected rc %d (%d requested)\n", rc, + aa->aa_requested_nob); + RETURN(-EPROTO); + } + + if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) { + CERROR ("Unexpected rc %d (%d transferred)\n", + rc, req->rq_bulk->bd_nob_transferred); + return (-EPROTO); + } + + if (req->rq_bulk == NULL) { + /* short io */ + int nob, pg_count, i = 0; + unsigned char *buf; + + CDEBUG(D_CACHE, "Using short io read, size %d\n", rc); + pg_count = aa->aa_page_count; + buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO, + rc); + nob = rc; + while (nob > 0 && pg_count > 0) { + unsigned char *ptr; + int count = aa->aa_ppga[i]->count > nob ? 
+ nob : aa->aa_ppga[i]->count; + + CDEBUG(D_CACHE, "page %p count %d\n", + aa->aa_ppga[i]->pg, count); + ptr = ll_kmap_atomic(aa->aa_ppga[i]->pg, KM_USER0); + memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf, + count); + ll_kunmap_atomic((void *) ptr, KM_USER0); + + buf += count; + nob -= count; + i++; + pg_count--; + } + } + + if (rc < aa->aa_requested_nob) + handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + static int cksum_counter; + u32 server_cksum = body->oa.o_cksum; + char *via = ""; + char *router = ""; + enum cksum_types cksum_type; + u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ? + body->oa.o_flags : 0; + + cksum_type = obd_cksum_type_unpack(o_flags); + rc = osc_checksum_bulk_rw(obd_name, cksum_type, rc, + aa->aa_page_count, aa->aa_ppga, + OST_READ, &client_cksum); + if (rc < 0) + GOTO(out, rc); + + if (req->rq_bulk != NULL && + peer->nid != req->rq_bulk->bd_sender) { + via = " via "; + router = libcfs_nid2str(req->rq_bulk->bd_sender); + } + + if (server_cksum != client_cksum) { + struct ost_body *clbody; + u32 page_count = aa->aa_page_count; + + clbody = req_capsule_client_get(&req->rq_pill, + &RMF_OST_BODY); + if (cli->cl_checksum_dump) + dump_all_bulk_pages(&clbody->oa, page_count, + aa->aa_ppga, server_cksum, + client_cksum); + + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " + "%s%s%s inode "DFID" object "DOSTID + " extent [%llu-%llu], client %x, " + "server %x, cksum_type %x\n", + obd_name, + libcfs_nid2str(peer->nid), + via, router, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_seq : 0ULL, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_oid : 0, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[page_count-1]->off + + aa->aa_ppga[page_count-1]->count - 1, + client_cksum, server_cksum, + cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } + } else if (unlikely(client_cksum)) { + static int cksum_missed; + + cksum_missed++; + if ((cksum_missed & (-cksum_missed)) == cksum_missed) + CERROR("Checksum %u requested from %s but not sent\n", + cksum_missed, libcfs_nid2str(peer->nid)); + } else { + rc = 0; + } +out: + if (rc >= 0) + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oa, &body->oa); + + RETURN(rc); +} + +static int osc_brw_redo_request(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) +{ + struct ptlrpc_request *new_req; + struct osc_brw_async_args *new_aa; + struct osc_async_page *oap; + ENTRY; + + DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, + "redo for recoverable error %d", rc); + + rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == + OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, + aa->aa_cli, aa->aa_oa, aa->aa_page_count, + aa->aa_ppga, &new_req, 1); + if (rc) + RETURN(rc); + + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request != NULL) { + LASSERTF(request == oap->oap_request, + "request %p != oap_request %p\n", + request, oap->oap_request); + if (oap->oap_interrupted) { + ptlrpc_req_finished(new_req); + RETURN(-EINTR); + } + } + } + /* New request takes over pga and oaps from old request. + * Note that copying a list_head doesn't work, need to move it... 
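check_write_checksum() above, and the read-side verification just before this point, recompute the checksum over the local pages and classify a mismatch by comparing three values: if the fresh checksum matches what the server computed, the pages changed on the client after they were first checksummed (typically mmap IO); if it still matches the original client value, the data changed in transit; otherwise both. A compact sketch of that classification:

#include <stdint.h>
#include <stdio.h>

static const char *classify(uint32_t client_cksum, uint32_t server_cksum,
                            uint32_t new_cksum)
{
        if (new_cksum == server_cksum)
                return "changed on the client after it was checksummed (mmap?)";
        if (new_cksum == client_cksum)
                return "changed in transit before arrival at the OST";
        return "changed in transit AND does not match the original";
}

int main(void)
{
        printf("%s\n", classify(0x1111, 0x2222, 0x2222));
        printf("%s\n", classify(0x1111, 0x2222, 0x1111));
        printf("%s\n", classify(0x1111, 0x2222, 0x3333));
        return 0;
}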
*/ + aa->aa_resends++; + new_req->rq_interpret_reply = request->rq_interpret_reply; + new_req->rq_async_args = request->rq_async_args; + new_req->rq_commit_cb = request->rq_commit_cb; + /* cap resend delay to the current request timeout, this is similar to + * what ptlrpc does (see after_reply()) */ + if (aa->aa_resends > new_req->rq_timeout) + new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout; + else + new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends; + new_req->rq_generation_set = 1; + new_req->rq_import_generation = request->rq_import_generation; + + new_aa = ptlrpc_req_async_args(new_req); + + INIT_LIST_HEAD(&new_aa->aa_oaps); + list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); + INIT_LIST_HEAD(&new_aa->aa_exts); + list_splice_init(&aa->aa_exts, &new_aa->aa_exts); + new_aa->aa_resends = aa->aa_resends; + + list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request) { + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = ptlrpc_request_addref(new_req); + } + } + + /* XXX: This code will run into problem if we're going to support + * to add a series of BRW RPCs into a self-defined ptlrpc_request_set + * and wait for all of them to be finished. We should inherit request + * set from old request. */ + ptlrpcd_add_req(new_req); + + DEBUG_REQ(D_INFO, new_req, "new request"); + RETURN(0); +} + +/* + * ugh, we want disk allocation on the target to happen in offset order. we'll + * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do + * fine for our small page arrays and doesn't require allocation. its an + * insertion sort that swaps elements that are strides apart, shrinking the + * stride down until its '1' and the array is sorted. + */ +static void sort_brw_pages(struct brw_page **array, int num) +{ + int stride, i, j; + struct brw_page *tmp; + + if (num == 1) + return; + for (stride = 1; stride < num ; stride = (stride * 3) + 1) + ; + + do { + stride /= 3; + for (i = stride ; i < num ; i++) { + tmp = array[i]; + j = i; + while (j >= stride && array[j - stride]->off > tmp->off) { + array[j] = array[j - stride]; + j -= stride; + } + array[j] = tmp; + } + } while (stride > 1); +} + +static void osc_release_ppga(struct brw_page **ppga, size_t count) +{ + LASSERT(ppga != NULL); + OBD_FREE(ppga, sizeof(*ppga) * count); +} + +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct osc_brw_async_args *aa = data; + struct osc_extent *ext; + struct osc_extent *tmp; + struct client_obd *cli = aa->aa_cli; + unsigned long transferred = 0; + ENTRY; + + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + /* When server return -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. 
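sort_brw_pages() above is the textbook shellsort with the 3h+1 gap sequence: grow the stride until it passes the array length, then repeatedly divide it by three, doing a strided insertion sort at each step. The same routine over a plain array of offsets:

#include <stdio.h>

static void shellsort_offsets(unsigned long long *off, int num)
{
        int stride, i, j;
        unsigned long long tmp;

        if (num <= 1)
                return;
        for (stride = 1; stride < num; stride = stride * 3 + 1)
                ;
        do {
                stride /= 3;
                for (i = stride; i < num; i++) {
                        tmp = off[i];
                        for (j = i; j >= stride && off[j - stride] > tmp;
                             j -= stride)
                                off[j] = off[j - stride];
                        off[j] = tmp;
                }
        } while (stride > 1);
}

int main(void)
{
        unsigned long long off[] = { 40960, 0, 8192, 4096, 12288 };
        int n = sizeof(off) / sizeof(off[0]);

        shellsort_offsets(off, n);
        for (int i = 0; i < n; i++)
                printf("%llu ", off[i]);
        printf("\n");        /* 0 4096 8192 12288 40960 */
        return 0;
}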
*/ + if (osc_recoverable_error(rc) && !req->rq_no_delay) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: " + "%llu:%llu, rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + RETURN(0); + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } + + if (rc == 0) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + struct cl_object *obj; + struct osc_async_page *last; + + last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]); + obj = osc2cl(last->oap_obj); + + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + loff_t last_off = last->oap_count + last->oap_obj_off + + last->oap_page_off; + + /* Change file size if this is an out of quota or + * direct IO write and it extends the file size */ + if (loi->loi_lvb.lvb_size < last_off) { + attr->cat_size = last_off; + valid |= CAT_SIZE; + } + /* Extend KMS if it's not a lockless write */ + if (loi->loi_kms < last_off && + oap2osc_page(last)->ops_srvlock == 0) { + attr->cat_kms = last_off; + valid |= CAT_KMS; + } + } + + if (valid != 0) + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem); + aa->aa_oa = NULL; + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) + osc_inc_unstable_pages(req); + + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 1, + rc && req->rq_no_delay ? -EWOULDBLOCK : rc); + } + LASSERT(list_empty(&aa->aa_exts)); + LASSERT(list_empty(&aa->aa_oaps)); + + transferred = (req->rq_bulk == NULL ? /* short io */ + aa->aa_requested_nob : + req->rq_bulk->bd_nob_transferred); + + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, transferred); + + spin_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL); + RETURN(rc); +} + +static void brw_commit(struct ptlrpc_request *req) +{ + /* If osc_inc_unstable_pages (via osc_extent_finish) races with + * this called via the rq_commit_cb, I need to ensure + * osc_dec_unstable_pages is still called. Otherwise unstable + * pages may be leaked. 
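brw_commit() just below and osc_extent_finish() race for the unstable-page accounting, and exactly one of them must perform it; the decision is made by testing and updating req->rq_unstable and rq_committed under rq_lock. A minimal sketch of the underlying check-and-clear-under-a-lock idea (not the exact rq_unstable/rq_committed handshake):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* One flag, one lock: whichever path clears the flag first owns the
 * cleanup; the other path sees it already cleared and does nothing. */
struct once {
        pthread_mutex_t lock;
        bool pending;
};

static bool claim(struct once *o)
{
        bool mine;

        pthread_mutex_lock(&o->lock);
        mine = o->pending;
        o->pending = false;
        pthread_mutex_unlock(&o->lock);
        return mine;
}

int main(void)
{
        struct once o = { PTHREAD_MUTEX_INITIALIZER, true };

        printf("commit path cleans up: %d\n", claim(&o));        /* 1 */
        printf("finish path cleans up: %d\n", claim(&o));        /* 0 */
        return 0;
}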
*/ + spin_lock(&req->rq_lock); + if (likely(req->rq_unstable)) { + req->rq_unstable = 0; + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Build an RPC by the list of extent @ext_list. The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. + */ +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd) +{ + struct ptlrpc_request *req = NULL; + struct osc_extent *ext; + struct brw_page **pga = NULL; + struct osc_brw_async_args *aa = NULL; + struct obdo *oa = NULL; + struct osc_async_page *oap; + struct osc_object *obj = NULL; + struct cl_req_attr *crattr = NULL; + loff_t starting_offset = OBD_OBJECT_EOF; + loff_t ending_offset = 0; + int mpflag = 0; + int mem_tight = 0; + int page_count = 0; + bool soft_sync = false; + bool interrupted = false; + bool ndelay = false; + int i; + int grant = 0; + int rc; + __u32 layout_version = 0; + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + struct ost_body *body; + ENTRY; + LASSERT(!list_empty(ext_list)); + + /* add pages into rpc_list to build BRW rpc */ + list_for_each_entry(ext, ext_list, oe_link) { + LASSERT(ext->oe_state == OES_RPC); + mem_tight |= ext->oe_memalloc; + grant += ext->oe_grants; + page_count += ext->oe_nr_pages; + layout_version = MAX(layout_version, ext->oe_layout_version); + if (obj == NULL) + obj = ext->oe_obj; + } + + soft_sync = osc_over_unstable_soft_limit(cli); + if (mem_tight) + mpflag = cfs_memory_pressure_get_and_set(); + + OBD_ALLOC(pga, sizeof(*pga) * page_count); + if (pga == NULL) + GOTO(out, rc = -ENOMEM); + + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); + if (oa == NULL) + GOTO(out, rc = -ENOMEM); + + i = 0; + list_for_each_entry(ext, ext_list, oe_link) { + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (mem_tight) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + if (soft_sync) + oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC; + pga[i] = &oap->oap_brw_page; + pga[i]->off = oap->oap_obj_off + oap->oap_page_off; + i++; + + list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (starting_offset == OBD_OBJECT_EOF || + starting_offset > oap->oap_obj_off) + starting_offset = oap->oap_obj_off; + else + LASSERT(oap->oap_page_off == 0); + if (ending_offset < oap->oap_obj_off + oap->oap_count) + ending_offset = oap->oap_obj_off + + oap->oap_count; + else + LASSERT(oap->oap_page_off + oap->oap_count == + PAGE_SIZE); + if (oap->oap_interrupted) + interrupted = true; + } + if (ext->oe_ndelay) + ndelay = true; + } + + /* first page in the list */ + oap = list_entry(rpc_list.next, typeof(*oap), oap_rpc_item); + + crattr = &osc_env_info(env)->oti_req_attr; + memset(crattr, 0, sizeof(*crattr)); + crattr->cra_type = (cmd & OBD_BRW_WRITE) ? 
CRT_WRITE : CRT_READ; + crattr->cra_flags = ~0ULL; + crattr->cra_page = oap2cl_page(oap); + crattr->cra_oa = oa; + cl_req_attr_set(env, osc2cl(obj), crattr); + + if (cmd == OBD_BRW_WRITE) { + oa->o_grant_used = grant; + if (layout_version > 0) { + CDEBUG(D_LAYOUT, DFID": write with layout version %u\n", + PFID(&oa->o_oi.oi_fid), layout_version); + + oa->o_layout_version = layout_version; + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + } + } + + sort_brw_pages(pga, page_count); + rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0); + if (rc != 0) { + CERROR("prep_req failed: %d\n", rc); + GOTO(out, rc); + } + + req->rq_commit_cb = brw_commit; + req->rq_interpret_reply = brw_interpret; + req->rq_memalloc = mem_tight != 0; + oap->oap_request = ptlrpc_request_addref(req); + if (interrupted && !req->rq_intr) + ptlrpc_mark_interrupted(req); + if (ndelay) { + req->rq_no_resend = req->rq_no_delay = 1; + /* probably set a shorter timeout value. + * to handle ETIMEDOUT in brw_interpret() correctly. */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } + + /* Need to update the timestamps after the request is built in case + * we race with setattr (locally or in queue at OST). If OST gets + * later setattr before earlier BRW (as determined by the request xid), + * the OST will not use BRW timestamps. Sadly, there is no obvious + * way to do this in a single call. bug 10150 */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + crattr->cra_oa = &body->oa; + crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME; + cl_req_attr_set(env, osc2cl(obj), crattr); + lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + INIT_LIST_HEAD(&aa->aa_oaps); + list_splice_init(&rpc_list, &aa->aa_oaps); + INIT_LIST_HEAD(&aa->aa_exts); + list_splice_init(ext_list, &aa->aa_exts); + + spin_lock(&cli->cl_loi_list_lock); + starting_offset >>= PAGE_SHIFT; + if (cmd == OBD_BRW_READ) { + cli->cl_r_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); + lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, + starting_offset + 1); + } else { + cli->cl_w_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); + lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, + starting_offset + 1); + } + spin_unlock(&cli->cl_loi_list_lock); + + DEBUG_REQ(D_INODE, req, "%d pages, aa %p. 
now %ur/%uw in flight", + page_count, aa, cli->cl_r_in_flight, + cli->cl_w_in_flight); + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val); + + ptlrpcd_add_req(req); + rc = 0; + EXIT; + +out: + if (mem_tight != 0) + cfs_memory_pressure_restore(mpflag); + + if (rc != 0) { + LASSERT(req == NULL); + + if (oa) + OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); + if (pga) + OBD_FREE(pga, sizeof(*pga) * page_count); + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order */ + while (!list_empty(ext_list)) { + ext = list_entry(ext_list->next, struct osc_extent, + oe_link); + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + } + } + RETURN(rc); +} + +static int osc_set_lock_data(struct ldlm_lock *lock, void *data) +{ + int set = 0; + + LASSERT(lock != NULL); + + lock_res_and_lock(lock); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + unlock_res_and_lock(lock); + + return set; +} + +int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, + void *cookie, struct lustre_handle *lockh, + enum ldlm_mode mode, __u64 *flags, bool speculative, + int errcode) +{ + bool intent = *flags & LDLM_FL_HAS_INTENT; + int rc; + ENTRY; + + /* The request was created before ldlm_cli_enqueue call. */ + if (intent && errcode == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(rep != NULL); + + rep->lock_policy_res1 = + ptlrpc_status_ntoh(rep->lock_policy_res1); + if (rep->lock_policy_res1) + errcode = rep->lock_policy_res1; + if (!speculative) + *flags |= LDLM_FL_LVB_READY; + } else if (errcode == ELDLM_OK) { + *flags |= LDLM_FL_LVB_READY; + } + + /* Call the update callback. */ + rc = (*upcall)(cookie, lockh, errcode); + + /* release the reference taken in ldlm_cli_enqueue() */ + if (errcode == ELDLM_LOCK_MATCHED) + errcode = ELDLM_OK; + if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) + ldlm_lock_decref(lockh, mode); + + RETURN(rc); +} + +int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct lustre_handle *lockh = &aa->oa_lockh; + enum ldlm_mode mode = aa->oa_mode; + struct ost_lvb *lvb = aa->oa_lvb; + __u32 lvb_len = sizeof(*lvb); + __u64 flags = 0; + + ENTRY; + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, + "lockh %#llx, req %p, aa %p - client evicted?\n", + lockh->cookie, req, aa); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(lockh, mode); + + /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + if (aa->oa_speculative) { + LASSERT(aa->oa_lvb == NULL); + LASSERT(aa->oa_flags == NULL); + aa->oa_flags = &flags; + } + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1, + aa->oa_mode, aa->oa_flags, lvb, lvb_len, + lockh, rc); + /* Complete osc stuff. 
*/ + rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, + aa->oa_flags, aa->oa_speculative, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is evicted from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, union ldlm_policy_data *policy, + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, + bool speculative) +{ + struct obd_device *obd = exp->exp_obd; + struct lustre_handle lockh = { 0 }; + struct ptlrpc_request *req = NULL; + int intent = *flags & LDLM_FL_HAS_INTENT; + __u64 match_flags = *flags; + enum ldlm_mode mode; + int rc; + ENTRY; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother. */ + policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; + policy->l_extent.end |= ~PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. + * + * There are problems with conversion deadlocks, so instead of + * converting a read lock to a write lock, we'll just enqueue a new + * one. + * + * At some point we should cancel the read lock instead of making them + * send us a blocking callback, but there are problems with canceling + * locks out from other users right now, too. */ + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + /* Normal lock requests must wait for the LVB to be ready before + * matching a lock; speculative lock requests do not need to, + * because they will not actually use the lock. */ + if (!speculative) + match_flags |= LDLM_FL_LVB_READY; + if (intent != 0) + match_flags |= LDLM_FL_BLOCK_GRANTED; + mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, + einfo->ei_type, policy, mode, &lockh, 0); + if (mode) { + struct ldlm_lock *matched; + + if (*flags & LDLM_FL_TEST_LOCK) + RETURN(ELDLM_OK); + + matched = ldlm_handle2lock(&lockh); + if (speculative) { + /* This DLM lock request is speculative, and does not + * have an associated IO request. Therefore if there + * is already a DLM lock, it wll just inform the + * caller to cancel the request for this stripe.*/ + lock_res_and_lock(matched); + if (ldlm_extent_equal(&policy->l_extent, + &matched->l_policy_data.l_extent)) + rc = -EEXIST; + else + rc = -ECANCELED; + unlock_res_and_lock(matched); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(rc); + } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) { + *flags |= LDLM_FL_LVB_READY; + + /* We already have a lock, and it's referenced. 
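[Editor's note] The extent rounding earlier in osc_enqueue_base() ("Filesystem lock extents are extended to page boundaries") is plain mask arithmetic: the start is rounded down to a page boundary and the end is rounded up to the last byte of its page. A minimal userspace sketch of the same arithmetic follows; PAGE_SIZE/PAGE_MASK are defined locally as stand-ins for the kernel macros and the function name is illustrative only.

/* Illustrative only: widen [start, end] to full page boundaries, as
 * osc_enqueue_base() does before matching or enqueuing a DLM extent lock.
 * PAGE_SIZE/PAGE_MASK below are local stand-ins for the kernel definitions. */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static void extent_align(unsigned long *start, unsigned long *end)
{
        *start -= *start & ~PAGE_MASK;  /* round start down to a page */
        *end   |= ~PAGE_MASK;           /* round end up to page_end - 1 */
}

int main(void)
{
        unsigned long start = 5000, end = 9000;

        extent_align(&start, &end);
        printf("aligned extent: [%lu, %lu]\n", start, end); /* [4096, 12287] */
        return 0;
}

Widening the lock to page granularity keeps the DLM extent aligned with what the page cache actually dirties, which is why both the enqueue and the match paths do it the same way.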
*/ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } else { + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + } + + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + if (intent) { + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE_LVB); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof *lvb); + ptlrpc_request_set_replen(req); + } + + /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + sizeof(*lvb), LVB_T_OST, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = speculative; + if (!speculative) { + aa->oa_flags = flags; + aa->oa_lvb = lvb; + } else { + /* speculative locks are essentially to enqueue + * a DLM lock in advance, so we don't care + * about the result of the enqueue. */ + aa->oa_lvb = NULL; + aa->oa_flags = NULL; + } + + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_enqueue_interpret; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + } else if (intent) { + ptlrpc_req_finished(req); + } + RETURN(rc); + } + + rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, + flags, speculative, rc); + if (intent) + ptlrpc_req_finished(req); + + RETURN(rc); +} + +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, int unref) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) + RETURN(-EIO); + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother */ + policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; + policy->l_extent.end |= ~PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. 
*/ + rc = mode; + if (mode == LCK_PR) + rc |= LCK_PW; + rc = ldlm_lock_match(obd->obd_namespace, lflags, + res_id, type, policy, rc, lockh, unref); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (obj != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (osc_set_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + osc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +static int osc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct obd_statfs *msfs; + ENTRY; + + if (rc == -EBADR) + /* The request has in fact never been sent + * due to issues at a higher level (LOV). + * Exit immediately since the caller is + * aware of the problem and takes care + * of the clean up */ + RETURN(rc); + + if ((rc == -ENOTCONN || rc == -EAGAIN) && + (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) + GOTO(out, rc = 0); + + if (rc != 0) + GOTO(out, rc); + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + GOTO(out, rc = -EPROTO); + } + + *aa->aa_oi->oi_osfs = *msfs; +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + RETURN(rc); +} + +static int osc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, time64_t max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + ENTRY; + + if (obd->obd_osfs_age >= max_age) { + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + + RETURN(0); + } + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. 
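[Editor's note] The fast path above in osc_statfs_async() works on timestamps rather than ages: obd_osfs_age records when the cached obd_statfs snapshot was taken, and max_age is the oldest snapshot time the caller will accept. Below is a hedged userspace sketch of that decision; the struct and function names are illustrative stand-ins, not the Lustre structures or API.

/* Illustrative sketch of the osc_statfs_async() fast path: serve a cached
 * statfs snapshot when it is newer than the oldest time the caller accepts,
 * otherwise fall through to issuing an OST_STATFS RPC. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct cached_statfs {
        time_t taken_at;                /* plays the role of obd_osfs_age */
        unsigned long long bavail;
};

static bool statfs_from_cache(const struct cached_statfs *c, time_t max_age,
                              unsigned long long *bavail_out)
{
        if (c->taken_at < max_age)
                return false;           /* too old: go to the server */
        *bavail_out = c->bavail;
        return true;
}

int main(void)
{
        struct cached_statfs c = { .taken_at = time(NULL), .bavail = 1024 };
        unsigned long long bavail;

        /* accept anything sampled within the last 30 seconds */
        if (statfs_from_cache(&c, time(NULL) - 30, &bavail))
                printf("served from cache: bavail=%llu\n", bavail);
        return 0;
}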
*/ + req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(rqset, req); + RETURN(0); +} + +static int osc_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp = NULL; + int rc; + ENTRY; + + + /*Since the request might also come from lprocfs, so we need + *sync this with client_disconnect_export Bug15684*/ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + RETURN(-ENODEV); + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. */ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + + class_import_put(imp); + + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); + + *osfs = *msfs; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + int err = 0; + ENTRY; + + if (!try_module_get(THIS_MODULE)) { + CERROR("%s: cannot get module '%s'\n", obd->obd_name, + module_name(THIS_MODULE)); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_CLIENT_RECOVER: + err = ptlrpc_recover_import(obd->u.cli.cl_import, + data->ioc_inlbuf1, 0); + if (err > 0) + err = 0; + GOTO(out, err); + case IOC_OSC_SET_ACTIVE: + err = ptlrpc_set_import_active(obd->u.cli.cl_import, + data->ioc_offset); + GOTO(out, err); + case OBD_IOC_PING_TARGET: + err = ptlrpc_obd_ping(obd); + GOTO(out, err); + default: + CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", + cmd, current_comm()); + GOTO(out, err = -ENOTTY); + } +out: + module_put(THIS_MODULE); + return err; +} + +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct 
ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct obd_device *obd = exp->exp_obd; + struct obd_import *imp = class_exp2cliimp(exp); + char *tmp; + int rc; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); + + if (KEY_IS(KEY_CHECKSUM)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0; + RETURN(0); + } + + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(obd); + RETURN(0); + } + + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + RETURN(0); + } + + if (KEY_IS(KEY_CACHE_SET)) { + struct client_obd *cli = &obd->u.cli; + + LASSERT(cli->cl_cache == NULL); /* only once */ + cli->cl_cache = (struct cl_client_cache *)val; + cl_cache_incref(cli->cl_cache); + cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; + + /* add this osc into entity list */ + LASSERT(list_empty(&cli->cl_lru_osc)); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + + RETURN(0); + } + + if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { + struct client_obd *cli = &obd->u.cli; + long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1; + long target = *(long *)val; + + nr = osc_lru_shrink(env, cli, min(nr, target), true); + *(long *)val -= nr; + RETURN(0); + } + + if (!set && !KEY_IS(KEY_GRANT_SHRINK)) + RETURN(-EINVAL); + + /* We pass all other commands directly to OST. Since nobody calls osc + methods directly and everybody is supposed to go through LOV, we + assume lov checked invalid values for us. + The only recognised values so far are evict_by_nid and mds_conn. + Even if something bad goes through, we'd get a -EINVAL from OST + anyway. */ + + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ? + &RQF_OST_SET_GRANT_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + if (!KEY_IS(KEY_GRANT_SHRINK)) + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? 
+ &RMF_OST_BODY : + &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + if (KEY_IS(KEY_GRANT_SHRINK)) { + struct osc_grant_args *aa; + struct obdo *oa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); + if (!oa) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } + + ptlrpc_request_set_replen(req); + if (!KEY_IS(KEY_GRANT_SHRINK)) { + LASSERT(set != NULL); + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + ptlrpcd_add_req(req); + } + + RETURN(0); +} +EXPORT_SYMBOL(osc_set_info_async); + +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; + long grant; + + spin_lock(&cli->cl_loi_list_lock); + grant = cli->cl_avail_grant + cli->cl_reserved_grant; + if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) { + /* restore ocd_grant_blkbits as client page bits */ + data->ocd_grant_blkbits = PAGE_SHIFT; + grant += cli->cl_dirty_grant; + } else { + grant += cli->cl_dirty_pages << PAGE_SHIFT; + } + data->ocd_grant = grant ? : 2 * cli_brw_size(obd); + lost_grant = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d" + " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant, lost_grant); + } + + RETURN(0); +} +EXPORT_SYMBOL(osc_reconnect); + +int osc_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + int rc; + + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. + * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * osc_init_grant + * add this client to shrink list + * cleanup_osc + * Bang! grant shrink thread trigger the shrink. BUG18662 + */ + osc_del_grant_list(&obd->u.cli); + return rc; +} +EXPORT_SYMBOL(osc_disconnect); + +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct lu_env *env = arg; + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_lock *lock; + struct osc_object *osc = NULL; + ENTRY; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (lock->l_ast_data != NULL && osc == NULL) { + osc = lock->l_ast_data; + cl_object_get(osc2cl(osc)); + } + + /* clear LDLM_FL_CLEANED flag to make sure it will be canceled + * by the 2nd round of ldlm_namespace_clean() call in + * osc_import_event(). 
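[Editor's note] For reference, the grant figure that osc_reconnect() above sends back in ocd_grant is just the sum of available, reserved and dirty grant, with a fallback of two BRW sizes when the client currently holds nothing. A small sketch of that arithmetic follows; the struct and its fields are stand-ins for the client_obd counters, not the kernel types.

/* Illustrative sketch of the ocd_grant computation in osc_reconnect():
 * available + reserved + dirty grant, or 2 * BRW size if nothing is held. */
#include <stdio.h>

struct grant_state {
        long avail;     /* like cl_avail_grant */
        long reserved;  /* like cl_reserved_grant */
        long dirty;     /* like cl_dirty_grant or cl_dirty_pages << PAGE_SHIFT */
};

static long reconnect_grant(const struct grant_state *g, long brw_size)
{
        long grant = g->avail + g->reserved + g->dirty;

        return grant ? grant : 2 * brw_size;
}

int main(void)
{
        struct grant_state g = { .avail = 0, .reserved = 0, .dirty = 0 };

        /* a client with no cached grant asks for two full BRW RPCs worth */
        printf("grant = %ld\n", reconnect_grant(&g, 4 << 20));
        return 0;
}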
*/ + ldlm_clear_cleaned(lock); + } + unlock_res(res); + + if (osc != NULL) { + osc_object_invalidate(env, osc); + cl_object_put(env, osc2cl(osc)); + } + + RETURN(0); +} +EXPORT_SYMBOL(osc_ldlm_resource_invalidate); + +static int osc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli; + int rc = 0; + + ENTRY; + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { + cli = &obd->u.cli; + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + break; + } + case IMP_EVENT_INACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + __u16 refcheck; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + osc_io_unplug(env, &obd->u.cli, NULL); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + osc_ldlm_resource_invalidate, + env, 0); + cl_env_put(env, &refcheck); + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + } else + rc = PTR_ERR(env); + break; + } + case IMP_EVENT_ACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + break; + } + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) + osc_init_grant(&obd->u.cli, ocd); + + /* See bug 7198 */ + if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) + imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL; + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); + break; + } + case IMP_EVENT_DEACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE); + break; + } + case IMP_EVENT_ACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE); + break; + } + default: + CERROR("Unknown import event %d\n", event); + LBUG(); + } + RETURN(rc); +} + +/** + * Determine whether the lock can be canceled before replaying the lock + * during recovery, see bug16774 for detailed information. + * + * \retval zero the lock can't be canceled + * \retval other ok to cancel + */ +static int osc_cancel_weight(struct ldlm_lock *lock) +{ + /* + * Cancel all unused and granted extent lock. 
+ */ + if (lock->l_resource->lr_type == LDLM_EXTENT && + ldlm_is_granted(lock) && + osc_ldlm_weigh_ast(lock) == 0) + RETURN(1); + + RETURN(0); +} + +static int brw_queue_work(const struct lu_env *env, void *data) +{ + struct client_obd *cli = data; + + CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); + + osc_io_unplug(env, cli, NULL); + RETURN(0); +} + +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + void *handler; + int rc; + + ENTRY; + + rc = ptlrpcd_addref(); + if (rc) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(out_ptlrpcd, rc); + + + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); + if (IS_ERR(handler)) + GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler)); + cli->cl_writeback_work = handler; + + handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli); + if (IS_ERR(handler)) + GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler)); + cli->cl_lru_work = handler; + + rc = osc_quota_setup(obd); + if (rc) + GOTO(out_ptlrpcd_work, rc); + + cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + osc_update_next_shrink(cli); + + RETURN(rc); + +out_ptlrpcd_work: + if (cli->cl_writeback_work != NULL) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + if (cli->cl_lru_work != NULL) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + RETURN(rc); +} +EXPORT_SYMBOL(osc_setup_common); + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + int adding; + int added; + int req_count; + int rc; + + ENTRY; + + rc = osc_setup_common(obd, lcfg); + if (rc < 0) + RETURN(rc); + + rc = osc_tunables_init(obd); + if (rc) + RETURN(rc); + + /* + * We try to control the total number of requests with a upper limit + * osc_reqpool_maxreqcount. There might be some race which will cause + * over-limit allocation, but it is fine. 
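[Editor's note] The pool-growth code that follows in osc_setup() reads the global counter once, clamps how many requests it intends to add, and only afterwards adds however many allocations actually succeeded; as the comment above says, a concurrent caller can push the total slightly over the cap and that is tolerated. Below is a compact userspace sketch of the same soft-cap pattern; C11 stdatomic stands in for the kernel's atomic_t and the allocator is a placeholder.

/* Illustrative sketch of the soft-capped pool growth used in osc_setup():
 * check the counter, clamp the planned growth, then account for what was
 * actually added.  Racy overshoot past 'max' is tolerated by design. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pool_req_count;

/* placeholder allocator: report how many requests were really added */
static int add_reqs_to_pool(int want)
{
        return want;                    /* assume every allocation succeeded */
}

static void grow_pool(int wanted, int max)
{
        int count = atomic_load(&pool_req_count);
        int adding, added;

        if (count >= max)
                return;
        adding = wanted;
        if (count + adding > max)
                adding = max - count;
        added = add_reqs_to_pool(adding);
        atomic_fetch_add(&pool_req_count, added);
}

int main(void)
{
        grow_pool(10, 32);              /* e.g. max_rpcs_in_flight + 2, capped */
        printf("pool now holds %d requests\n", atomic_load(&pool_req_count));
        return 0;
}

Accepting the small race avoids holding a lock across the allocation, which is the design trade-off the original comment is defending.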
+ */ + req_count = atomic_read(&osc_pool_req_count); + if (req_count < osc_reqpool_maxreqcount) { + adding = cli->cl_max_rpcs_in_flight + 2; + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } + + ns_register_cancel(obd->obd_namespace, osc_cancel_weight); + + spin_lock(&osc_shrink_lock); + list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); + spin_unlock(&osc_shrink_lock); + cli->cl_import->imp_idle_timeout = osc_idle_timeout; + cli->cl_import->imp_idle_debug = D_HA; + + RETURN(0); +} + +int osc_precleanup_common(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + /* LU-464 + * for echo client, export may be on zombie list, wait for + * zombie thread to cull it, because cli.cl_import will be + * cleared in client_disconnect_export(): + * class_export_destroy() -> obd_cleanup() -> + * echo_device_free() -> echo_client_cleanup() -> + * obd_disconnect() -> osc_disconnect() -> + * client_disconnect_export() + */ + obd_zombie_barrier(); + if (cli->cl_writeback_work) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + + if (cli->cl_lru_work) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } + + obd_cleanup_client_import(obd); + RETURN(0); +} +EXPORT_SYMBOL(osc_precleanup_common); + +static int osc_precleanup(struct obd_device *obd) +{ + ENTRY; + + osc_precleanup_common(obd); + + ptlrpc_lprocfs_unregister_obd(obd); + RETURN(0); +} + +int osc_cleanup_common(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + + ENTRY; + + spin_lock(&osc_shrink_lock); + list_del(&cli->cl_shrink_list); + spin_unlock(&osc_shrink_lock); + + /* lru cleanup */ + if (cli->cl_cache != NULL) { + LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_del_init(&cli->cl_lru_osc); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + cli->cl_lru_left = NULL; + cl_cache_decref(cli->cl_cache); + cli->cl_cache = NULL; + } + + /* free memory of osc quota cache */ + osc_quota_cleanup(obd); + + rc = client_obd_cleanup(obd); + + ptlrpcd_decref(); + RETURN(rc); +} +EXPORT_SYMBOL(osc_cleanup_common); + +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + ssize_t count = class_modify_config(lcfg, PARAM_OSC, + &obd->obd_kset.kobj); + return count > 0 ? 
0 : count; +} + +static int osc_process_config(struct obd_device *obd, size_t len, void *buf) +{ + return osc_process_config_base(obd, buf); +} + +static struct obd_ops osc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = osc_setup, + .o_precleanup = osc_precleanup, + .o_cleanup = osc_cleanup_common, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, + .o_create = osc_create, + .o_destroy = osc_destroy, + .o_getattr = osc_getattr, + .o_setattr = osc_setattr, + .o_iocontrol = osc_iocontrol, + .o_set_info_async = osc_set_info_async, + .o_import_event = osc_import_event, + .o_process_config = osc_process_config, + .o_quotactl = osc_quotactl, +}; + +static struct shrinker *osc_cache_shrinker; +struct list_head osc_shrink_list = LIST_HEAD_INIT(osc_shrink_list); +DEFINE_SPINLOCK(osc_shrink_lock); + +#ifndef HAVE_SHRINKER_COUNT +static int osc_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + struct shrink_control scv = { + .nr_to_scan = shrink_param(sc, nr_to_scan), + .gfp_mask = shrink_param(sc, gfp_mask) + }; +#if !defined(HAVE_SHRINKER_WANT_SHRINK_PTR) && !defined(HAVE_SHRINK_CONTROL) + struct shrinker *shrinker = NULL; +#endif + + (void)osc_cache_shrink_scan(shrinker, &scv); + + return osc_cache_shrink_count(shrinker, &scv); +} +#endif + +static int __init osc_init(void) +{ + bool enable_proc = true; + struct obd_type *type; + unsigned int reqpool_size; + unsigned int reqsize; + int rc; + DEF_SHRINKER_VAR(osc_shvar, osc_cache_shrink, + osc_cache_shrink_count, osc_cache_shrink_scan); + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); + + rc = lu_kmem_init(osc_caches); + if (rc) + RETURN(rc); + + type = class_search_type(LUSTRE_OSP_NAME); + if (type != NULL && type->typ_procsym != NULL) + enable_proc = false; + + rc = class_register_type(&osc_obd_ops, NULL, enable_proc, NULL, + LUSTRE_OSC_NAME, &osc_device_type); + if (rc) + GOTO(out_kmem, rc); + + osc_cache_shrinker = set_shrinker(DEFAULT_SEEKS, &osc_shvar); + + /* This is obviously too much memory, only prevent overflow here */ + if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) + GOTO(out_type, rc = -EINVAL); + + reqpool_size = osc_reqpool_mem_max << 20; + + reqsize = 1; + while (reqsize < OST_IO_MAXREQSIZE) + reqsize = reqsize << 1; + + /* + * We don't enlarge the request count in OSC pool according to + * cl_max_rpcs_in_flight. The allocation from the pool will only be + * tried after normal allocation failed. So a small OSC pool won't + * cause much performance degression in most of cases. 
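[Editor's note] The sizing above converts a megabyte budget into a request count: the per-request buffer size is rounded up to the next power of two at or above OST_IO_MAXREQSIZE, and the division into osc_reqpool_maxreqcount follows immediately after. A worked sketch with made-up numbers is below; MAXREQSIZE and the 5 MB budget are placeholders, not the real Lustre constants.

/* Illustrative sketch of the osc_init() pool sizing: round the request
 * buffer size up to a power of two and turn a megabyte budget into a
 * maximum request count. */
#include <stdio.h>

#define MAXREQSIZE (3 * 4096u)          /* stand-in for OST_IO_MAXREQSIZE */

int main(void)
{
        unsigned int budget_mb = 5;             /* like osc_reqpool_mem_max */
        unsigned int pool_bytes = budget_mb << 20;
        unsigned int reqsize = 1;

        while (reqsize < MAXREQSIZE)
                reqsize <<= 1;                  /* next power of two */

        printf("reqsize=%u maxreqcount=%u\n",
               reqsize, pool_bytes / reqsize);  /* 16384 and 320 here */
        return 0;
}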
+ */ + osc_reqpool_maxreqcount = reqpool_size / reqsize; + + atomic_set(&osc_pool_req_count, 0); + osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + if (osc_rq_pool == NULL) + GOTO(out_type, rc = -ENOMEM); + + rc = osc_start_grant_work(); + if (rc != 0) + GOTO(out_req_pool, rc); + + RETURN(rc); + +out_req_pool: + ptlrpc_free_rq_pool(osc_rq_pool); +out_type: + class_unregister_type(LUSTRE_OSC_NAME); +out_kmem: + lu_kmem_fini(osc_caches); + + RETURN(rc); +} + +static void __exit osc_exit(void) +{ + osc_stop_grant_work(); + remove_shrinker(osc_cache_shrinker); + class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); + ptlrpc_free_rq_pool(osc_rq_pool); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(osc_init); +module_exit(osc_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile new file mode 100644 index 0000000000000..f192313597822 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile @@ -0,0 +1,26 @@ +obj-$(CONFIG_LUSTREFSX_FS) += ptlrpc.o + +LDLM := ../../lustre/ldlm/ +TARGET := ../../lustre/target/ + +ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o +ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o +ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o +ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o +ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o +ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)interval_tree.o +ldlm_objs += $(LDLM)ldlm_reclaim.o + +ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o +ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o +ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o +ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o +ptlrpc_objs += sec.o sec_ctx.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o +ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o nrs_crr.o nrs_orr.o +ptlrpc_objs += nrs_tbf.o nrs_delay.o errno.o + +ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs) $(TARGET)barrier.o + +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/ldlm + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c new file mode 100644 index 0000000000000..b9888d92b1fd8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c @@ -0,0 +1,3520 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** Implementation of client-side PortalRPC interfaces */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = { + .add_kiov_frag = ptlrpc_prep_bulk_page_pin, + .release_frags = ptlrpc_release_bulk_page_pin, +}; +EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops); + +const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = { + .add_kiov_frag = ptlrpc_prep_bulk_page_nopin, + .release_frags = ptlrpc_release_bulk_noop, +}; +EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops); + +const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops = { + .add_iov_frag = ptlrpc_prep_bulk_frag, +}; +EXPORT_SYMBOL(ptlrpc_bulk_kvec_ops); + +static int ptlrpc_send_new_req(struct ptlrpc_request *req); +static int ptlrpcd_check_work(struct ptlrpc_request *req); +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async); + +/** + * Initialize passed in client structure \a cl. + */ +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *cl) +{ + cl->cli_request_portal = req_portal; + cl->cli_reply_portal = rep_portal; + cl->cli_name = name; +} +EXPORT_SYMBOL(ptlrpc_init_client); + +/** + * Return PortalRPC connection for remore uud \a uuid + */ +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid, + lnet_nid_t nid4refnet) +{ + struct ptlrpc_connection *c; + lnet_nid_t self; + struct lnet_process_id peer; + int err; + + /* ptlrpc_uuid_to_peer() initializes its 2nd parameter + * before accessing its values. */ + /* coverity[uninit_use_in_call] */ + peer.nid = nid4refnet; + err = ptlrpc_uuid_to_peer(uuid, &peer, &self); + if (err != 0) { + CNETERR("cannot find peer %s!\n", uuid->uuid); + return NULL; + } + + c = ptlrpc_connection_get(peer, self, uuid); + if (c) { + memcpy(c->c_remote_uuid.uuid, + uuid->uuid, sizeof(c->c_remote_uuid.uuid)); + } + + CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); + + return c; +} + +/** + * Allocate and initialize new bulk descriptor on the sender. + * Returns pointer to the descriptor or NULL on error. 
+ */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw, + enum ptlrpc_bulk_op_type type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops *ops) +{ + struct ptlrpc_bulk_desc *desc; + int i; + + /* ensure that only one of KIOV or IOVEC is set but not both */ + LASSERT((ptlrpc_is_bulk_desc_kiov(type) && + ops->add_kiov_frag != NULL) || + (ptlrpc_is_bulk_desc_kvec(type) && + ops->add_iov_frag != NULL)); + + if (max_brw > PTLRPC_BULK_OPS_COUNT) + RETURN(NULL); + + if (nfrags > LNET_MAX_IOV * max_brw) + RETURN(NULL); + + OBD_ALLOC_PTR(desc); + if (desc == NULL) + return NULL; + if (type & PTLRPC_BULK_BUF_KIOV) { + OBD_ALLOC_LARGE(GET_KIOV(desc), + nfrags * sizeof(*GET_KIOV(desc))); + if (GET_KIOV(desc) == NULL) + goto out; + } else { + OBD_ALLOC_LARGE(GET_KVEC(desc), + nfrags * sizeof(*GET_KVEC(desc))); + if (GET_KVEC(desc) == NULL) + goto out; + } + + spin_lock_init(&desc->bd_lock); + init_waitqueue_head(&desc->bd_waitq); + desc->bd_max_iov = nfrags; + desc->bd_iov_count = 0; + desc->bd_portal = portal; + desc->bd_type = type; + desc->bd_md_count = 0; + desc->bd_nob_last = LNET_MTU; + desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *) ops; + LASSERT(max_brw > 0); + desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); + /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this + * node. Negotiated ocd_brw_size will always be <= this number. */ + for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) + LNetInvalidateMDHandle(&desc->bd_mds[i]); + + return desc; +out: + OBD_FREE_PTR(desc); + return NULL; +} + +/** + * Prepare bulk descriptor for specified outgoing request \a req that + * can fit \a nfrags * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on client-side. + * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on + * error. 
+ */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_bulk_desc *desc; + + ENTRY; + LASSERT(ptlrpc_is_bulk_op_passive(type)); + + desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops); + if (desc == NULL) + RETURN(NULL); + + desc->bd_import_generation = req->rq_import_generation; + desc->bd_import = class_import_get(imp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = client_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* This makes req own desc, and free it when she frees herself */ + req->rq_bulk = desc; + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); + +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, + int pin) +{ + lnet_kiov_t *kiov; + + LASSERT(desc->bd_iov_count < desc->bd_max_iov); + LASSERT(page != NULL); + LASSERT(pageoffset >= 0); + LASSERT(len > 0); + LASSERT(pageoffset + len <= PAGE_SIZE); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + kiov = &BD_GET_KIOV(desc, desc->bd_iov_count); + if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || + ((desc->bd_nob_last + len) > LNET_MTU)) { + desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; + desc->bd_md_count++; + desc->bd_nob_last = 0; + LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); + } + + desc->bd_nob_last += len; + desc->bd_nob += len; + + if (pin) + get_page(page); + + kiov->kiov_page = page; + kiov->kiov_offset = pageoffset; + kiov->kiov_len = len; + + desc->bd_iov_count++; +} +EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); + +int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, + void *frag, int len) +{ + struct kvec *iovec; + ENTRY; + + LASSERT(desc->bd_iov_count < desc->bd_max_iov); + LASSERT(frag != NULL); + LASSERT(len > 0); + LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type)); + + iovec = &BD_GET_KVEC(desc, desc->bd_iov_count); + if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || + ((desc->bd_nob_last + len) > LNET_MTU)) { + desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; + desc->bd_md_count++; + desc->bd_nob_last = 0; + LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); + } + + desc->bd_nob_last += len; + desc->bd_nob += len; + + iovec->iov_base = frag; + iovec->iov_len = len; + + desc->bd_iov_count++; + + RETURN(desc->bd_nob); +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_frag); + +void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) +{ + ENTRY; + + LASSERT(desc != NULL); + LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ + LASSERT(desc->bd_refs == 0); /* network hands off */ + LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); + LASSERT(desc->bd_frag_ops != NULL); + + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) + sptlrpc_enc_pool_put_pages(desc); + + if (desc->bd_export) + class_export_put(desc->bd_export); + else + class_import_put(desc->bd_import); + + if (desc->bd_frag_ops->release_frags != NULL) + desc->bd_frag_ops->release_frags(desc); + + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) + OBD_FREE_LARGE(GET_KIOV(desc), + desc->bd_max_iov * sizeof(*GET_KIOV(desc))); + else + OBD_FREE_LARGE(GET_KVEC(desc), + desc->bd_max_iov * sizeof(*GET_KVEC(desc))); + OBD_FREE_PTR(desc); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_free_bulk); + +/** + * Set server timelimit for this req, i.e. how long are we willing to wait + * for reply before timing out this request. 
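[Editor's note] Both fragment helpers above (__ptlrpc_prep_bulk_page() and ptlrpc_prep_bulk_frag()) split the bulk into multiple LNet MDs with the same rule: start a new MD when the fragment count reaches a multiple of LNET_MAX_IOV or when adding the fragment would push the current MD past LNET_MTU bytes. The standalone sketch below reproduces just that boundary test; the two limits are defined locally for illustration and are not the real LNet values.

/* Illustrative sketch of the MD-boundary rule used when adding bulk
 * fragments: a new MD begins when the fragment count or the byte count
 * of the current MD would exceed the per-MD limits. */
#include <stdio.h>

#define MAX_IOV_PER_MD 256u             /* stand-in for LNET_MAX_IOV */
#define MTU_PER_MD     (1u << 20)       /* stand-in for LNET_MTU */

struct bulk_state {
        unsigned int iov_count;         /* fragments added so far */
        unsigned int nob_last;          /* bytes in the current MD */
        unsigned int md_count;          /* MDs started so far */
};

static void add_frag(struct bulk_state *bd, unsigned int len)
{
        if ((bd->iov_count % MAX_IOV_PER_MD) == 0 ||
            bd->nob_last + len > MTU_PER_MD) {
                bd->md_count++;         /* start a new MD */
                bd->nob_last = 0;
        }
        bd->nob_last += len;
        bd->iov_count++;
}

int main(void)
{
        struct bulk_state bd = { 0 };

        for (int i = 0; i < 1024; i++)
                add_frag(&bd, 4096);    /* 1024 pages of 4 KiB each */
        printf("fragments=%u mds=%u\n", bd.iov_count, bd.md_count);
        return 0;
}

Keeping both a fragment-count and a byte-count limit per MD means one bulk descriptor can describe more data than a single LNet transfer allows, which is what bd_md_max_brw caps elsewhere in this file.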
+ */ +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ + __u32 serv_est; + int idx; + struct imp_at *at; + + LASSERT(req->rq_import); + + if (AT_OFF) { + /* non-AT settings */ + /** + * \a imp_server_timeout means this is reverse import and + * we send (currently only) ASTs to the client and cannot afford + * to wait too long for the reply, otherwise the other client + * (because of which we are sending this request) would + * timeout waiting for us + */ + req->rq_timeout = req->rq_import->imp_server_timeout ? + obd_timeout / 2 : obd_timeout; + } else { + at = &req->rq_import->imp_at; + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); + serv_est = at_get(&at->iat_service_estimate[idx]); + req->rq_timeout = at_est2timeout(serv_est); + } + /* We could get even fancier here, using history to predict increased + loading... */ + + /* Let the server know what this RPC timeout is by putting it in the + reqmsg*/ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req, + unsigned int serv_est) +{ + int idx; + unsigned int oldse; + struct imp_at *at; + + LASSERT(req->rq_import); + at = &req->rq_import->imp_at; + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + /* max service estimates are tracked on the server side, + so just keep minimal history here */ + oldse = at_measured(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) + CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d " + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name,req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ + return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + timeout_t service_timeout) +{ + unsigned int nl, oldnl; + struct imp_at *at; + time64_t now = ktime_get_real_seconds(); + + LASSERT(req->rq_import); + + if (service_timeout > now - req->rq_sent + 3) { + /* + * b=16408, however, this can also happen if early reply + * is lost and client RPC is expired and resent, early reply + * or reply of original RPC can still be fit in reply buffer + * of resent RPC, now client is measuring time from the + * resent time, but server sent back service time of original + * RPC. + */ + CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? 
+ D_ADAPTTO : D_WARNING, + "Reported service time %u > total measured time %lld\n", + service_timeout, now - req->rq_sent); + return; + } + + /* Network latency is total time less server processing time */ + nl = max_t(int, now - req->rq_sent - + service_timeout, 0) + 1; /* st rounding */ + at = &req->rq_import->imp_at; + + oldnl = at_measured(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) " + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str( + &req->rq_import->imp_connection->c_remote_uuid), + oldnl, at_get(&at->iat_net_latency)); +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); + return(-EPROTO); + } + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); + return(-EPROTO); + } + return 0; +} + +/** + * Handle an early reply message, called with the rq_lock held. + * If anything goes wrong just ignore it - same as if it never happened + */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) +__must_hold(&req->rq_lock) +{ + struct ptlrpc_request *early_req; + timeout_t service_timeout; + time64_t olddl; + int rc; + + ENTRY; + req->rq_early = 0; + spin_unlock(&req->rq_lock); + + rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); + if (rc) { + spin_lock(&req->rq_lock); + RETURN(rc); + } + + rc = unpack_reply(early_req); + if (rc != 0) { + sptlrpc_cli_finish_early_reply(early_req); + spin_lock(&req->rq_lock); + RETURN(rc); + } + + /* Use new timeout value just to adjust the local value for this + * request, don't include it into at_history. It is unclear yet why + * service time increased and should it be counted or skipped, e.g. + * that can be recovery case or some error or server, the real reply + * will add all new data if it is worth to add. */ + req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg); + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); + + /* Network latency can be adjusted, it is pure network delays */ + service_timeout = lustre_msg_get_service_timeout(early_req->rq_repmsg); + ptlrpc_at_adj_net_latency(req, service_timeout); + + sptlrpc_cli_finish_early_reply(early_req); + + spin_lock(&req->rq_lock); + olddl = req->rq_deadline; + /* server assumes it now has rq_timeout from when the request + * arrived, so the client should give it at least that long. + * since we don't know the arrival time we'll use the original + * sent time */ + req->rq_deadline = req->rq_sent + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in %llds (%llds)", + req->rq_early_count, + req->rq_deadline - ktime_get_real_seconds(), + req->rq_deadline - olddl); + + RETURN(rc); +} + +static struct kmem_cache *request_cache; + +int ptlrpc_request_cache_init(void) +{ + request_cache = kmem_cache_create("ptlrpc_cache", + sizeof(struct ptlrpc_request), + 0, SLAB_HWCACHE_ALIGN, NULL); + return request_cache == NULL ? 
-ENOMEM : 0; +} + +void ptlrpc_request_cache_fini(void) +{ + kmem_cache_destroy(request_cache); +} + +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) +{ + struct ptlrpc_request *req; + + OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags); + return req; +} + +void ptlrpc_request_cache_free(struct ptlrpc_request *req) +{ + OBD_SLAB_FREE_PTR(req, request_cache); +} + +/** + * Wind down request pool \a pool. + * Frees all requests from the pool too + */ +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) +{ + struct list_head *l, *tmp; + struct ptlrpc_request *req; + + LASSERT(pool != NULL); + + spin_lock(&pool->prp_lock); + list_for_each_safe(l, tmp, &pool->prp_req_list) { + req = list_entry(l, struct ptlrpc_request, rq_list); + list_del(&req->rq_list); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); + OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size); + ptlrpc_request_cache_free(req); + } + spin_unlock(&pool->prp_lock); + OBD_FREE(pool, sizeof(*pool)); +} +EXPORT_SYMBOL(ptlrpc_free_rq_pool); + +/** + * Allocates, initializes and adds \a num_rq requests to the pool \a pool + */ +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +{ + int i; + int size = 1; + + while (size < pool->prp_rq_size) + size <<= 1; + + LASSERTF(list_empty(&pool->prp_req_list) || + size == pool->prp_rq_size, + "Trying to change pool size with nonempty pool " + "from %d to %d bytes\n", pool->prp_rq_size, size); + + spin_lock(&pool->prp_lock); + pool->prp_rq_size = size; + for (i = 0; i < num_rq; i++) { + struct ptlrpc_request *req; + struct lustre_msg *msg; + + spin_unlock(&pool->prp_lock); + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + return i; + OBD_ALLOC_LARGE(msg, size); + if (!msg) { + ptlrpc_request_cache_free(req); + return i; + } + req->rq_reqbuf = msg; + req->rq_reqbuf_len = size; + req->rq_pool = pool; + spin_lock(&pool->prp_lock); + list_add_tail(&req->rq_list, &pool->prp_req_list); + } + spin_unlock(&pool->prp_lock); + return num_rq; +} +EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); + +/** + * Create and initialize new request pool with given attributes: + * \a num_rq - initial number of requests to create for the pool + * \a msgsize - maximum message size possible for requests in thid pool + * \a populate_pool - function to be called when more requests need to be added + * to the pool + * Returns pointer to newly created pool or NULL on error. 
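[Editor's note] ptlrpc_add_rqs_to_pool() above drops the pool spinlock around each allocation (which may sleep) and re-takes it only to link the new request, returning how many requests were actually added so the caller can account for partial success. The userspace sketch below mirrors that locking shape with a pthread mutex and malloc; everything here is an analogy, not the kernel primitives.

/* Illustrative sketch of the ptlrpc_add_rqs_to_pool() pattern: allocate
 * with the lock dropped, relock only to link, and report partial success. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
        struct entry *next;
};

struct pool {
        pthread_mutex_t lock;
        struct entry *head;
};

static int pool_add(struct pool *p, int num)
{
        int i;

        pthread_mutex_lock(&p->lock);
        for (i = 0; i < num; i++) {
                struct entry *e;

                pthread_mutex_unlock(&p->lock);         /* allocate unlocked */
                e = malloc(sizeof(*e));
                if (!e)
                        return i;                       /* partial success */
                pthread_mutex_lock(&p->lock);
                e->next = p->head;                      /* link under lock */
                p->head = e;
        }
        pthread_mutex_unlock(&p->lock);
        return num;
}

int main(void)
{
        struct pool p = { .lock = PTHREAD_MUTEX_INITIALIZER, .head = NULL };

        printf("added %d entries\n", pool_add(&p, 8));
        return 0;
}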
+ */ +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int num_rq, int msgsize, + int (*populate_pool)(struct ptlrpc_request_pool *, int)) +{ + struct ptlrpc_request_pool *pool; + + OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool)); + if (!pool) + return NULL; + + /* Request next power of two for the allocation, because internally + kernel would do exactly this */ + + spin_lock_init(&pool->prp_lock); + INIT_LIST_HEAD(&pool->prp_req_list); + pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; + pool->prp_populate = populate_pool; + + populate_pool(pool, num_rq); + + return pool; +} +EXPORT_SYMBOL(ptlrpc_init_rq_pool); + +/** + * Fetches one request from pool \a pool + */ +static struct ptlrpc_request * +ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + struct lustre_msg *reqbuf; + + if (!pool) + return NULL; + + spin_lock(&pool->prp_lock); + + /* See if we have anything in a pool, and bail out if nothing, + * in writeout path, where this matters, this is safe to do, because + * nothing is lost in this case, and when some in-flight requests + * complete, this code will be called again. */ + if (unlikely(list_empty(&pool->prp_req_list))) { + spin_unlock(&pool->prp_lock); + return NULL; + } + + request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, + rq_list); + list_del_init(&request->rq_list); + spin_unlock(&pool->prp_lock); + + LASSERT(request->rq_reqbuf); + LASSERT(request->rq_pool); + + reqbuf = request->rq_reqbuf; + memset(request, 0, sizeof(*request)); + request->rq_reqbuf = reqbuf; + request->rq_reqbuf_len = pool->prp_rq_size; + request->rq_pool = pool; + + return request; +} + +/** + * Returns freed \a request to pool. + */ +static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) +{ + struct ptlrpc_request_pool *pool = request->rq_pool; + + spin_lock(&pool->prp_lock); + LASSERT(list_empty(&request->rq_list)); + LASSERT(!request->rq_receiving_reply); + list_add_tail(&request->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); +} + +void ptlrpc_add_unreplied(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct list_head *tmp; + struct ptlrpc_request *iter; + + assert_spin_locked(&imp->imp_lock); + LASSERT(list_empty(&req->rq_unreplied_list)); + + /* unreplied list is sorted by xid in ascending order */ + list_for_each_prev(tmp, &imp->imp_unreplied_list) { + iter = list_entry(tmp, struct ptlrpc_request, + rq_unreplied_list); + + LASSERT(req->rq_xid != iter->rq_xid); + if (req->rq_xid < iter->rq_xid) + continue; + list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list); + return; + } + list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list); +} + +void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req) +{ + req->rq_xid = ptlrpc_next_xid(); + ptlrpc_add_unreplied(req); +} + +static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_import->imp_lock); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); +} + +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + int count; + struct obd_import *imp; + __u32 *lengths; + int rc; + + ENTRY; + + count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); + imp = request->rq_import; + lengths = request->rq_pill.rc_area[RCL_CLIENT]; + + if (ctx != NULL) { + request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); + } else { + rc = sptlrpc_req_get_ctx(request); + if (rc) + 
GOTO(out_free, rc); + } + sptlrpc_req_set_flavor(request, opcode); + + rc = lustre_pack_request(request, imp->imp_msg_magic, count, + lengths, bufs); + if (rc) + GOTO(out_ctx, rc); + + lustre_msg_add_version(request->rq_reqmsg, version); + request->rq_send_state = LUSTRE_IMP_FULL; + request->rq_type = PTL_RPC_MSG_REQUEST; + + request->rq_req_cbid.cbid_fn = request_out_callback; + request->rq_req_cbid.cbid_arg = request; + + request->rq_reply_cbid.cbid_fn = reply_in_callback; + request->rq_reply_cbid.cbid_arg = request; + + request->rq_reply_deadline = 0; + request->rq_bulk_deadline = 0; + request->rq_req_deadline = 0; + request->rq_phase = RQ_PHASE_NEW; + request->rq_next_phase = RQ_PHASE_UNDEFINED; + + request->rq_request_portal = imp->imp_client->cli_request_portal; + request->rq_reply_portal = imp->imp_client->cli_reply_portal; + + ptlrpc_at_set_req_timeout(request); + + lustre_msg_set_opc(request->rq_reqmsg, opcode); + ptlrpc_assign_next_xid(request); + + /* Let's setup deadline for req/reply/bulk unlink for opcode. */ + if (cfs_fail_val == opcode) { + time64_t *fail_t = NULL, *fail2_t = NULL; + + if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) + fail_t = &request->rq_bulk_deadline; + else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) + fail_t = &request->rq_reply_deadline; + else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) + fail_t = &request->rq_req_deadline; + else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) { + fail_t = &request->rq_reply_deadline; + fail2_t = &request->rq_bulk_deadline; + } + + if (fail_t) { + *fail_t = ktime_get_real_seconds() + LONG_UNLINK; + + if (fail2_t) + *fail2_t = ktime_get_real_seconds() + + LONG_UNLINK; + + /* + * The RPC is infected, let the test to change the + * fail_loc + */ + msleep(4 * MSEC_PER_SEC); + } + } + + RETURN(0); + +out_ctx: + LASSERT(!request->rq_pool); + sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); +out_free: + atomic_dec(&imp->imp_reqs); + class_import_put(imp); + + return rc; + +} +EXPORT_SYMBOL(ptlrpc_request_bufs_pack); + +/** + * Pack request buffers for network transfer, performing necessary encryption + * steps if necessary. + */ +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode) +{ + int rc; + rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); + if (rc) + return rc; + + /* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of + * ptlrpc_body sent from server equal to local ptlrpc_body size, so we + * have to send old ptlrpc_body to keep interoprability with these + * clients. + * + * Only three kinds of server->client RPCs so far: + * - LDLM_BL_CALLBACK + * - LDLM_CP_CALLBACK + * - LDLM_GL_CALLBACK + * + * XXX This should be removed whenever we drop the interoprability with + * the these old clients. + */ + if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK || + opcode == LDLM_GL_CALLBACK) + req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY, + sizeof(struct ptlrpc_body_v2), RCL_CLIENT); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_request_pack); + +/** + * Helper function to allocate new request on import \a imp + * and possibly using existing request from pool \a pool if provided. + * Returns allocated request structure with import field filled or + * NULL on error. 
+ */ +static inline +struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request = NULL; + + request = ptlrpc_request_cache_alloc(GFP_NOFS); + + if (!request && pool) + request = ptlrpc_prep_req_from_pool(pool); + + if (request) { + ptlrpc_cli_req_init(request); + + LASSERTF((unsigned long)imp > 0x1000, "%p", imp); + LASSERT(imp != LP_POISON); + LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n", + imp->imp_client); + LASSERT(imp->imp_client != LP_POISON); + + request->rq_import = class_import_get(imp); + atomic_inc(&imp->imp_reqs); + } else { + CERROR("request allocation out of memory\n"); + } + + return request; +} + +static int ptlrpc_reconnect_if_idle(struct obd_import *imp) +{ + int rc; + + /* + * initiate connection if needed when the import has been + * referenced by the new request to avoid races with disconnect. + * serialize this check against conditional state=IDLE + * in ptlrpc_disconnect_idle_interpret() + */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + imp->imp_state = LUSTRE_IMP_NEW; + + /* connect_import_locked releases imp_lock */ + rc = ptlrpc_connect_import_locked(imp); + if (rc) + return rc; + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + return 0; +} + +/** + * Helper function for creating a request. + * Calls __ptlrpc_request_alloc to allocate new request sturcture and inits + * buffer structures according to capsule template \a format. + * Returns allocated request structure pointer or NULL on error. + */ +static struct ptlrpc_request * +ptlrpc_request_alloc_internal(struct obd_import *imp, + struct ptlrpc_request_pool * pool, + const struct req_format *format) +{ + struct ptlrpc_request *request; + + request = __ptlrpc_request_alloc(imp, pool); + if (request == NULL) + return NULL; + + /* don't make expensive check for idling connection + * if it's already connected */ + if (unlikely(imp->imp_state != LUSTRE_IMP_FULL)) { + if (ptlrpc_reconnect_if_idle(imp) < 0) { + atomic_dec(&imp->imp_reqs); + ptlrpc_request_free(request); + return NULL; + } + } + + req_capsule_init(&request->rq_pill, request, RCL_CLIENT); + req_capsule_set(&request->rq_pill, format); + return request; +} + +/** + * Allocate new request structure for import \a imp and initialize its + * buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, NULL, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc); + +/** + * Allocate new request structure for import \a imp from pool \a pool and + * initialize its buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool * pool, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, pool, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pool); + +/** + * For requests not from pool, free memory of the request structure. + * For requests obtained from a pool earlier, return request back to pool. 
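[Editor's note] __ptlrpc_request_alloc() above tries the normal slab allocation first and only falls back to the preallocated pool when that fails, while the free path (next) routes each request back to wherever it came from by checking rq_pool. The sketch below shows that allocate-or-fall-back shape in miniature; the one-slot "pool", field names and helpers are invented for illustration only.

/* Illustrative sketch of the allocate-or-fall-back pattern used by
 * __ptlrpc_request_alloc()/ptlrpc_request_free(): prefer the regular
 * allocator, keep a reserve for memory pressure, and free accordingly. */
#include <stdio.h>
#include <stdlib.h>

struct req {
        int from_pool;                  /* plays the role of rq_pool != NULL */
};

static struct req pool_slot = { .from_pool = 1 };
static int pool_slot_busy;

static struct req *req_alloc(void)
{
        struct req *r = calloc(1, sizeof(*r));

        if (r)
                return r;
        if (!pool_slot_busy) {          /* emergency reserve */
                pool_slot_busy = 1;
                return &pool_slot;
        }
        return NULL;
}

static void req_free(struct req *r)
{
        if (r->from_pool)
                pool_slot_busy = 0;     /* return the reserve */
        else
                free(r);
}

int main(void)
{
        struct req *r = req_alloc();

        if (r) {
                printf("allocated %s\n",
                       r->from_pool ? "from pool" : "from heap");
                req_free(r);
        }
        return 0;
}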
+ */ +void ptlrpc_request_free(struct ptlrpc_request *request) +{ + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); +} +EXPORT_SYMBOL(ptlrpc_request_free); + +/** + * Allocate new request for operatione \a opcode and immediatelly pack it for + * network transfer. + * Only used for simple requests like OBD_PING where the only important + * part of the request is operation itself. + * Returns allocated request or NULL on error. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode) +{ + struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); + int rc; + + if (req) { + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + } + } + return req; +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pack); + +/** + * Allocate and initialize new request set structure on the current CPT. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_set(void) +{ + struct ptlrpc_request_set *set; + int cpt; + + ENTRY; + cpt = cfs_cpt_current(cfs_cpt_table, 0); + OBD_CPT_ALLOC(set, cfs_cpt_table, cpt, sizeof *set); + if (!set) + RETURN(NULL); + atomic_set(&set->set_refcount, 1); + INIT_LIST_HEAD(&set->set_requests); + init_waitqueue_head(&set->set_waitq); + atomic_set(&set->set_new_count, 0); + atomic_set(&set->set_remaining, 0); + spin_lock_init(&set->set_new_req_lock); + INIT_LIST_HEAD(&set->set_new_requests); + set->set_max_inflight = UINT_MAX; + set->set_producer = NULL; + set->set_producer_arg = NULL; + set->set_rc = 0; + + RETURN(set); +} +EXPORT_SYMBOL(ptlrpc_prep_set); + +/** + * Allocate and initialize new request set structure with flow control + * extension. This extension allows to control the number of requests in-flight + * for the whole set. A callback function to generate requests must be provided + * and the request set will keep the number of requests sent over the wire to + * @max_inflight. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg) + +{ + struct ptlrpc_request_set *set; + + set = ptlrpc_prep_set(); + if (!set) + RETURN(NULL); + + set->set_max_inflight = max; + set->set_producer = func; + set->set_producer_arg = arg; + + RETURN(set); +} + +/** + * Wind down and free request set structure previously allocated with + * ptlrpc_prep_set. + * Ensures that all requests on the set have completed and removes + * all requests from the request list in a set. + * If any unsent request happen to be on the list, pretends that they got + * an error in flight and calls their completion handler. + */ +void ptlrpc_set_destroy(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct list_head *next; + int expected_phase; + int n = 0; + ENTRY; + + /* Requests on the set should either all be completed, or all be new */ + expected_phase = (atomic_read(&set->set_remaining) == 0) ? 
+ RQ_PHASE_COMPLETE : RQ_PHASE_NEW; + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + n++; + } + + LASSERTF(atomic_read(&set->set_remaining) == 0 || + atomic_read(&set->set_remaining) == n, "%d / %d\n", + atomic_read(&set->set_remaining), n); + + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + list_del_init(&req->rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + + if (req->rq_phase == RQ_PHASE_NEW) { + ptlrpc_req_interpret(NULL, req, -EBADR); + atomic_dec(&set->set_remaining); + } + + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished (req); + } + + LASSERT(atomic_read(&set->set_remaining) == 0); + + ptlrpc_reqset_put(set); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_set_destroy); + +/** + * Add a new request to the general purpose request set. + * Assumes request reference from the caller. + */ +void ptlrpc_set_add_req(struct ptlrpc_request_set *set, + struct ptlrpc_request *req) +{ + LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE); + LASSERT(list_empty(&req->rq_set_chain)); + + if (req->rq_allow_intr) + set->set_allow_intr = 1; + + /* The set takes over the caller's request reference */ + list_add_tail(&req->rq_set_chain, &set->set_requests); + req->rq_set = set; + atomic_inc(&set->set_remaining); + req->rq_queued_time = ktime_get_seconds(); + + if (req->rq_reqmsg != NULL) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + if (set->set_producer != NULL) + /* If the request set has a producer callback, the RPC must be + * sent straight away */ + ptlrpc_send_new_req(req); +} +EXPORT_SYMBOL(ptlrpc_set_add_req); + +/** + * Add a request to a request with dedicated server thread + * and wake the thread to make any necessary processing. + * Currently only used for ptlrpcd. + */ +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = pc->pc_set; + int count, i; + + LASSERT(req->rq_set == NULL); + LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); + + spin_lock(&set->set_new_req_lock); + /* + * The set takes over the caller's request reference. + */ + req->rq_set = set; + req->rq_queued_time = ktime_get_seconds(); + list_add_tail(&req->rq_set_chain, &set->set_new_requests); + count = atomic_inc_return(&set->set_new_count); + spin_unlock(&set->set_new_req_lock); + + /* Only need to call wakeup once for the first entry. */ + if (count == 1) { + wake_up(&set->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} + +/** + * Based on the current state of the import, determine if the request + * can be sent, is an error, or should be delayed. + * + * Returns true if this request should be delayed. If false, and + * *status is set, then the request can not be sent and *status is the + * error code. If false and status is 0, then request can be sent. + * + * The imp->imp_lock must be held. 
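+ *
+ * For example, a normal request with rq_send_state == LUSTRE_IMP_FULL sent
+ * while the import is still connecting is typically delayed, while a request
+ * sent against a LUSTRE_IMP_CLOSED import fails immediately with -EIO.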
+ */ +static int ptlrpc_import_delay_req(struct obd_import *imp, + struct ptlrpc_request *req, int *status) +{ + int delay = 0; + ENTRY; + + LASSERT (status != NULL); + *status = 0; + + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import."); + *status = -EIO; + } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { + unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* pings or MDS-equivalent STATFS may safely race with umount */ + DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ? + D_HA : D_ERROR, req, "IMP_CLOSED "); + *status = -EIO; + } else if (ptlrpc_send_limit_expired(req)) { + /* probably doesn't need to be a D_ERROR after initial testing*/ + DEBUG_REQ(D_HA, req, "send limit expired "); + *status = -ETIMEDOUT; + } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && + imp->imp_state == LUSTRE_IMP_CONNECTING) { + /* allow CONNECT even if import is invalid */ ; + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } + } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { + if (!imp->imp_deactive) + DEBUG_REQ(D_NET, req, "IMP_INVALID"); + *status = -ESHUTDOWN; /* bz 12940 */ + } else if (req->rq_import_generation != imp->imp_generation) { + DEBUG_REQ(D_ERROR, req, "req wrong generation:"); + *status = -EIO; + } else if (req->rq_send_state != imp->imp_state) { + /* invalidate in progress - any requests should be drop */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } else if (req->rq_no_delay && + imp->imp_generation != imp->imp_initiated_at) { + /* ignore nodelay for requests initiating connections */ + *status = -EWOULDBLOCK; + } else if (req->rq_allow_replay && + (imp->imp_state == LUSTRE_IMP_REPLAY || + imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || + imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || + imp->imp_state == LUSTRE_IMP_RECOVER)) { + DEBUG_REQ(D_HA, req, "allow during recovery.\n"); + } else { + delay = 1; + } + } + + RETURN(delay); +} + +/** + * Decide if the error message should be printed to the console or not. + * Makes its decision based on request type, status, and failure frequency. + * + * \param[in] req request that failed and may need a console message + * + * \retval false if no message should be printed + * \retval true if console message should be printed + */ +static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err) +{ + LASSERT(req->rq_reqmsg != NULL); + + /* Suppress particular reconnect errors which are to be expected. */ + if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { + + /* Suppress timed out reconnect requests */ + if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || + req->rq_timedout) + return false; + + /* Suppress most unavailable/again reconnect requests, but + * print occasionally so it is clear client is trying to + * connect to a server where no target is running. 
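+ * In practice this is about one console message per 30 connection
+ * attempts.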
*/ + if ((err == -ENODEV || err == -EAGAIN) && + req->rq_import->imp_conn_cnt % 30 != 20) + return false; + } + + if (opc == LDLM_ENQUEUE && err == -EAGAIN) + /* -EAGAIN is normal when using POSIX flocks */ + return false; + + if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) && + (req->rq_xid & 0xf) != 10) + /* Suppress most ping requests, they may fail occasionally */ + return false; + + return true; +} + +/** + * Check request processing status. + * Returns the status. + */ +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int err; + ENTRY; + + err = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + lnet_nid_t nid = imp->imp_connection->c_peer.nid; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + if (ptlrpc_console_allow(req, opc, err)) + LCONSOLE_ERROR_MSG(0x11, "%s: operation %s to node %s " + "failed: rc = %d\n", + imp->imp_obd->obd_name, + ll_opcode2str(opc), + libcfs_nid2str(nid), err); + RETURN(err < 0 ? err : -EINVAL); + } + + if (err < 0) { + DEBUG_REQ(D_INFO, req, "status is %d", err); + } else if (err > 0) { + /* XXX: translate this error from net to host */ + DEBUG_REQ(D_INFO, req, "status is %d", err); + } + + RETURN(err); +} + +/** + * save pre-versions of objects into request for replay. + * Versions are obtained from server reply. + * used for VBR. + */ +static void ptlrpc_save_versions(struct ptlrpc_request *req) +{ + struct lustre_msg *repmsg = req->rq_repmsg; + struct lustre_msg *reqmsg = req->rq_reqmsg; + __u64 *versions = lustre_msg_get_versions(repmsg); + ENTRY; + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + return; + + LASSERT(versions); + lustre_msg_set_versions(reqmsg, versions); + CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", + versions[0], versions[1]); + + EXIT; +} + +__u64 ptlrpc_known_replied_xid(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + assert_spin_locked(&imp->imp_lock); + if (list_empty(&imp->imp_unreplied_list)) + return 0; + + req = list_entry(imp->imp_unreplied_list.next, struct ptlrpc_request, + rq_unreplied_list); + LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid); + + if (imp->imp_known_replied_xid < req->rq_xid - 1) + imp->imp_known_replied_xid = req->rq_xid - 1; + + return req->rq_xid - 1; +} + +/** + * Callback function called when client receives RPC reply for \a req. + * Returns 0 on success or error code. + * The return alue would be assigned to req->rq_status by the caller + * as request processing status. + * This function also decides if the request needs to be saved for later replay. + */ +static int after_reply(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct obd_device *obd = req->rq_import->imp_obd; + ktime_t work_start; + u64 committed; + s64 timediff; + int rc; + + ENTRY; + LASSERT(obd != NULL); + /* repbuf must be unlinked */ + LASSERT(!req->rq_receiving_reply && req->rq_reply_unlinked); + + if (req->rq_reply_truncated) { + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_ERROR, req, "reply buffer overflow," + " expected: %d, actual size: %d", + req->rq_nob_received, req->rq_repbuf_len); + RETURN(-EOVERFLOW); + } + + sptlrpc_cli_free_repbuf(req); + /* Pass the required reply buffer size (include + * space for early reply). 
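+ * The size the server actually needed arrived in rq_nob_received and
+ * becomes the new rq_replen before the request is marked for resend.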
+ * NB: no need to roundup because alloc_repbuf + * will roundup it */ + req->rq_replen = req->rq_nob_received; + req->rq_nob_received = 0; + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + RETURN(0); + } + + work_start = ktime_get_real(); + timediff = ktime_us_delta(work_start, req->rq_sent_ns); + + /* + * NB Until this point, the whole of the incoming message, + * including buflens, status etc is in the sender's byte order. + */ + rc = sptlrpc_cli_unwrap_reply(req); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); + RETURN(rc); + } + + /* + * Security layer unwrap might ask resend this request. + */ + if (req->rq_resend) + RETURN(0); + + rc = unpack_reply(req); + if (rc) + RETURN(rc); + + /* retry indefinitely on EINPROGRESS */ + if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && + ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { + time64_t now = ktime_get_real_seconds(); + + DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + req->rq_nr_resend++; + + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + /* delay resend to give a chance to the server to get ready. + * The delay is increased by 1s on every resend and is capped to + * the current request timeout (i.e. obd_timeout if AT is off, + * or AT service time x 125% + 5s, see at_est2timeout) */ + if (req->rq_nr_resend > req->rq_timeout) + req->rq_sent = now + req->rq_timeout; + else + req->rq_sent = now + req->rq_nr_resend; + + /* Resend for EINPROGRESS will use a new XID */ + spin_lock(&imp->imp_lock); + list_del_init(&req->rq_unreplied_list); + spin_unlock(&imp->imp_lock); + + RETURN(0); + } + + if (obd->obd_svc_stats != NULL) { + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, + timediff); + ptlrpc_lprocfs_rpc_sent(req, timediff); + } + + if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && + lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", + lustre_msg_get_type(req->rq_repmsg)); + RETURN(-EPROTO); + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_timeout(req->rq_repmsg)); + + rc = ptlrpc_check_status(req); + imp->imp_connect_error = rc; + + if (rc) { + /* + * Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * the upcall. + */ + if (ptlrpc_recoverable_error(rc)) { + if (req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + RETURN(rc); + } + ptlrpc_request_handle_notconn(req); + RETURN(rc); + } + } else { + /* + * Let's look if server sent slv. Do it only for RPC with + * rc == 0. + */ + ldlm_cli_update_pool(req); + } + + /* + * Store transno in reqmsg for replay. + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); + lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); + } + + if (imp->imp_replayable) { + spin_lock(&imp->imp_lock); + /* + * No point in adding already-committed requests to the replay + * list, we will just remove them immediately. 
b=9829 + */ + if (req->rq_transno != 0 && + (req->rq_transno > + lustre_msg_get_last_committed(req->rq_repmsg) || + req->rq_replay)) { + /** version recovery */ + ptlrpc_save_versions(req); + ptlrpc_retain_replayable_request(req, imp); + } else if (req->rq_commit_cb != NULL && + list_empty(&req->rq_replay_list)) { + /* NB: don't call rq_commit_cb if it's already on + * rq_replay_list, ptlrpc_free_committed() will call + * it later, see LU-3618 for details */ + spin_unlock(&imp->imp_lock); + req->rq_commit_cb(req); + spin_lock(&imp->imp_lock); + } + + /* + * Replay-enabled imports return commit-status information. + */ + committed = lustre_msg_get_last_committed(req->rq_repmsg); + if (likely(committed > imp->imp_peer_committed_transno)) + imp->imp_peer_committed_transno = committed; + + ptlrpc_free_committed(imp); + + if (!list_empty(&imp->imp_replay_list)) { + struct ptlrpc_request *last; + + last = list_entry(imp->imp_replay_list.prev, + struct ptlrpc_request, + rq_replay_list); + /* + * Requests with rq_replay stay on the list even if no + * commit is expected. + */ + if (last->rq_transno > imp->imp_peer_committed_transno) + ptlrpc_pinger_commit_expected(imp); + } + + spin_unlock(&imp->imp_lock); + } + + RETURN(rc); +} + +/** + * Helper function to send request \a req over the network for the first time + * Also adjusts request phase. + * Returns 0 on success or error code. + */ +static int ptlrpc_send_new_req(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + __u64 min_xid = 0; + int rc; + ENTRY; + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + + /* do not try to go further if there is not enough memory in enc_pool */ + if (req->rq_sent && req->rq_bulk != NULL) + if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() && + pool_is_at_full_capacity()) + RETURN(-ENOMEM); + + if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) && + (!req->rq_generation_set || + req->rq_import_generation == imp->imp_generation)) + RETURN (0); + + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); + + spin_lock(&imp->imp_lock); + + LASSERT(req->rq_xid != 0); + LASSERT(!list_empty(&req->rq_unreplied_list)); + + if (!req->rq_generation_set) + req->rq_import_generation = imp->imp_generation; + + if (ptlrpc_import_delay_req(imp, req, &rc)) { + spin_lock(&req->rq_lock); + req->rq_waiting = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + if (rc != 0) { + spin_unlock(&imp->imp_lock); + req->rq_status = rc; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + RETURN(rc); + } + + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&req->rq_import->imp_inflight); + + /* find the known replied XID from the unreplied list, CONNECT + * and DISCONNECT requests are skipped to make the sanity check + * on server side happy. see process_req_last_xid(). + * + * For CONNECT: Because replay requests have lower XID, it'll + * break the sanity check if CONNECT bump the exp_last_xid on + * server. + * + * For DISCONNECT: Since client will abort inflight RPC before + * sending DISCONNECT, DISCONNECT may carry an XID which higher + * than the inflight RPC. 
+ */ + if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req)) + min_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_last_xid(req->rq_reqmsg, min_xid); + + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + rc = sptlrpc_req_refresh_ctx(req, -1); + if (rc) { + if (req->rq_err) { + req->rq_status = rc; + RETURN(1); + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + RETURN(0); + } + } + + CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc" + " %s:%s:%d:%llu:%s:%d\n", current_comm(), + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg)); + + rc = ptl_send_rpc(req, 0); + if (rc == -ENOMEM) { + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&req->rq_import->imp_inflight); + } + spin_unlock(&imp->imp_lock); + ptlrpc_rqphase_move(req, RQ_PHASE_NEW); + RETURN(rc); + } + if (rc) { + DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + RETURN(rc); + } + RETURN(0); +} + +static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) +{ + int remaining, rc; + ENTRY; + + LASSERT(set->set_producer != NULL); + + remaining = atomic_read(&set->set_remaining); + + /* populate the ->set_requests list with requests until we + * reach the maximum number of RPCs in flight for this set */ + while (atomic_read(&set->set_remaining) < set->set_max_inflight) { + rc = set->set_producer(set, set->set_producer_arg); + if (rc == -ENOENT) { + /* no more RPC to produce */ + set->set_producer = NULL; + set->set_producer_arg = NULL; + RETURN(0); + } + } + + RETURN((atomic_read(&set->set_remaining) - remaining)); +} + +/** + * this sends any unsent RPCs in \a set and returns 1 if all are sent + * and no more replies are expected. + * (it is possible to get less replies than requests sent e.g. due to timed out + * requests or requests that we had trouble to send out) + * + * NOTE: This function contains a potential schedule point (cond_resched()). + */ +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *next; + struct list_head comp_reqs; + int force_timer_recalc = 0; + ENTRY; + + if (atomic_read(&set->set_remaining) == 0) + RETURN(1); + + INIT_LIST_HEAD(&comp_reqs); + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + struct obd_import *imp = req->rq_import; + int unregistered = 0; + int async = 1; + int rc = 0; + + if (req->rq_phase == RQ_PHASE_COMPLETE) { + list_move_tail(&req->rq_set_chain, &comp_reqs); + continue; + } + + /* This schedule point is mainly for the ptlrpcd caller of this + * function. Most ptlrpc sets are not long-lived and unbounded + * in length, but at the least the set used by the ptlrpcd is. + * Since the processing time is unbounded, we need to insert an + * explicit schedule point to make the thread well-behaved. + */ + cond_resched(); + + /* If the caller requires to allow to be interpreted by force + * and it has really been interpreted, then move the request + * to RQ_PHASE_INTERPRET phase in spite of what the current + * phase is. 
*/ + if (unlikely(req->rq_allow_intr && req->rq_intr)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + + /* Since it is interpreted and we have to wait for + * the reply to be unlinked, then use sync mode. */ + async = 0; + + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_NEW && ptlrpc_send_new_req(req)) + force_timer_recalc = 1; + + /* delayed send - skip */ + if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) + continue; + + /* delayed resend - skip */ + if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && + req->rq_sent > ktime_get_real_seconds()) + continue; + + if (!(req->rq_phase == RQ_PHASE_RPC || + req->rq_phase == RQ_PHASE_BULK || + req->rq_phase == RQ_PHASE_INTERPRET || + req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK)) { + DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); + LBUG(); + } + + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) { + LASSERT(req->rq_next_phase != req->rq_phase); + LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); + + if (req->rq_req_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) + req->rq_req_deadline = 0; + if (req->rq_reply_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) + req->rq_reply_deadline = 0; + if (req->rq_bulk_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) + req->rq_bulk_deadline = 0; + + /* + * Skip processing until reply is unlinked. We + * can't return to pool before that and we can't + * call interpret before that. We need to make + * sure that all rdma transfers finished and will + * not corrupt any data. + */ + if (req->rq_phase == RQ_PHASE_UNREG_RPC && + ptlrpc_client_recv_or_unlink(req)) + continue; + if (req->rq_phase == RQ_PHASE_UNREG_BULK && + ptlrpc_client_bulk_active(req)) + continue; + + /* + * Turn fail_loc off to prevent it from looping + * forever. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, + OBD_FAIL_ONCE); + } + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, + OBD_FAIL_ONCE); + } + + /* + * Move to next phase if reply was successfully + * unlinked. + */ + ptlrpc_rqphase_move(req, req->rq_next_phase); + } + + if (req->rq_phase == RQ_PHASE_INTERPRET) + GOTO(interpret, req->rq_status); + + /* + * Note that this also will start async reply unlink. + */ + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req, 1); + + /* + * Check if we still need to wait for unlink. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + /* If there is no need to resend, fail it now. */ + if (req->rq_no_resend) { + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } else { + continue; + } + } + + if (req->rq_err) { + spin_lock(&req->rq_lock); + req->rq_replied = 0; + spin_unlock(&req->rq_lock); + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr + * so it sets rq_intr regardless of individual rpc + * timeouts. The synchronous IO waiting path sets + * rq_intr irrespective of whether ptlrpcd + * has seen a timeout. Our policy is to only interpret + * interrupted rpcs after they have timed out, so we + * need to enforce that here. 
+ */ + + if (req->rq_intr && (req->rq_timedout || req->rq_waiting || + req->rq_wait_ctx)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_RPC) { + if (req->rq_timedout || req->rq_resend || + req->rq_waiting || req->rq_wait_ctx) { + int status; + + if (!ptlrpc_unregister_reply(req, 1)) { + ptlrpc_unregister_bulk(req, 1); + continue; + } + + spin_lock(&imp->imp_lock); + if (ptlrpc_import_delay_req(imp, req, &status)){ + /* put on delay list - only if we wait + * recovery finished - before send */ + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp-> + imp_delayed_list); + spin_unlock(&imp->imp_lock); + continue; + } + + if (status != 0) { + req->rq_status = status; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + /* ignore on just initiated connections */ + if (ptlrpc_no_resend(req) && + !req->rq_wait_ctx && + imp->imp_generation != + imp->imp_initiated_at) { + req->rq_status = -ENOTCONN; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp->imp_sending_list); + + spin_unlock(&imp->imp_lock); + + spin_lock(&req->rq_lock); + req->rq_waiting = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_timedout || req->rq_resend) { + /* This is re-sending anyways, + * let's mark req as resend. */ + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + } + /* + * rq_wait_ctx is only touched by ptlrpcd, + * so no lock is needed here. + */ + status = sptlrpc_req_refresh_ctx(req, -1); + if (status) { + if (req->rq_err) { + req->rq_status = status; + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + force_timer_recalc = 1; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + } + + continue; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + } + + /* In any case, the previous bulk should be + * cleaned up to prepare for the new sending */ + if (req->rq_bulk != NULL && + !ptlrpc_unregister_bulk(req, 1)) + continue; + + rc = ptl_send_rpc(req, 0); + if (rc == -ENOMEM) { + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) + list_del_init(&req->rq_list); + spin_unlock(&imp->imp_lock); + ptlrpc_rqphase_move(req, RQ_PHASE_NEW); + continue; + } + if (rc) { + DEBUG_REQ(D_HA, req, + "send failed: rc = %d", rc); + force_timer_recalc = 1; + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + continue; + } + /* need to reset the timeout */ + force_timer_recalc = 1; + } + + spin_lock(&req->rq_lock); + + if (ptlrpc_client_early(req)) { + ptlrpc_at_recv_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + + /* Still waiting for a reply? */ + if (ptlrpc_client_recv(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + /* Did we actually receive a reply? 
*/ + if (!ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + spin_unlock(&req->rq_lock); + + /* unlink from net because we are going to + * swab in-place of reply buffer */ + unregistered = ptlrpc_unregister_reply(req, 1); + if (!unregistered) + continue; + + req->rq_status = after_reply(req); + if (req->rq_resend) + continue; + + /* If there is no bulk associated with this request, + * then we're done and should let the interpreter + * process the reply. Similarly if the RPC returned + * an error, and therefore the bulk will never arrive. + */ + if (req->rq_bulk == NULL || req->rq_status < 0) { + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + ptlrpc_rqphase_move(req, RQ_PHASE_BULK); + } + + LASSERT(req->rq_phase == RQ_PHASE_BULK); + if (ptlrpc_client_bulk_active(req)) + continue; + + if (req->rq_bulk->bd_failure) { + /* The RPC reply arrived OK, but the bulk screwed + * up! Dead weird since the server told us the RPC + * was good after getting the REPLY for her GET or + * the ACK for her PUT. */ + DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + req->rq_status = -EIO; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + + interpret: + LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); + + /* This moves to "unregistering" phase we need to wait for + * reply unlink. */ + if (!unregistered && !ptlrpc_unregister_reply(req, async)) { + /* start async bulk unlink too */ + ptlrpc_unregister_bulk(req, 1); + continue; + } + + if (!ptlrpc_unregister_bulk(req, async)) + continue; + + /* When calling interpret receiving already should be + * finished. */ + LASSERT(!req->rq_receiving_reply); + + ptlrpc_req_interpret(env, req, req->rq_status); + + if (ptlrpcd_check_work(req)) { + atomic_dec(&set->set_remaining); + continue; + } + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); + + if (req->rq_reqmsg != NULL) + CDEBUG(D_RPCTRACE, + "Completed RPC pname:cluuid:pid:xid:nid:" + "opc %s:%s:%d:%llu:%s:%d\n", current_comm(), + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), + req->rq_xid, + obd_import_nid2str(imp), + lustre_msg_get_opc(req->rq_reqmsg)); + + spin_lock(&imp->imp_lock); + /* Request already may be not on sending or delaying list. This + * may happen in the case of marking it erroneous for the case + * ptlrpc_import_delay_req(req, status) find it impossible to + * allow sending this rpc and returns *status != 0. */ + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); + } + list_del_init(&req->rq_unreplied_list); + spin_unlock(&imp->imp_lock); + + atomic_dec(&set->set_remaining); + wake_up_all(&imp->imp_recovery_waitq); + + if (set->set_producer) { + /* produce a new request if possible */ + if (ptlrpc_set_producer(set) > 0) + force_timer_recalc = 1; + + /* free the request that has just been completed + * in order not to pollute set->set_requests */ + list_del_init(&req->rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + /* record rq_status to compute the final status later */ + if (req->rq_status != 0) + set->set_rc = req->rq_status; + ptlrpc_req_finished(req); + } else { + list_move_tail(&req->rq_set_chain, &comp_reqs); + } + } + + /* move completed request at the head of list so it's easier for + * caller to find them */ + list_splice(&comp_reqs, &set->set_requests); + + /* If we hit an error, we want to recover promptly. 
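+ * A non-zero force_timer_recalc also makes the caller recompute its
+ * wait timeout after a (re)send or a send failure.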
*/ + RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc); +} +EXPORT_SYMBOL(ptlrpc_check_set); + +/** + * Time out request \a req. is \a async_unlink is set, that means do not wait + * until LNet actually confirms network buffer unlinking. + * Return 1 if we should give up further retrying attempts or 0 otherwise. + */ +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) +{ + struct obd_import *imp = req->rq_import; + unsigned int debug_mask = D_RPCTRACE; + int rc = 0; + ENTRY; + + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + + if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_status(req->rq_reqmsg))) + debug_mask = D_WARNING; + DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || + req->rq_real_sent < req->rq_sent || + req->rq_real_sent >= req->rq_deadline) ? + "timed out for sent delay" : "timed out for slow reply"), + (s64)req->rq_sent, (s64)req->rq_real_sent); + + if (imp != NULL && obd_debug_peer_on_timeout) + LNetDebugPeer(imp->imp_connection->c_peer); + + ptlrpc_unregister_reply(req, async_unlink); + ptlrpc_unregister_bulk(req, async_unlink); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + if (imp == NULL) { + DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); + RETURN(1); + } + + atomic_inc(&imp->imp_timeouts); + + /* The DLM server doesn't want recovery run on its imports. */ + if (imp->imp_dlm_fake) + RETURN(1); + + /* If this request is for recovery or other primordial tasks, + * then error it out here. */ + if (req->rq_ctx_init || req->rq_ctx_fini || + req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + spin_lock(&req->rq_lock); + req->rq_status = -ETIMEDOUT; + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(1); + } + + /* if a request can't be resent we can't wait for an answer after + the timeout */ + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); + rc = 1; + } + + ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + RETURN(rc); +} + +/** + * Time out all uncompleted requests in request set pointed by \a data + * Callback used when waiting on sets with l_wait_event. + * Always returns 1. + */ +int ptlrpc_expired_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + time64_t now = ktime_get_real_seconds(); + + ENTRY; + LASSERT(set != NULL); + + /* + * A timeout expired. See which reqs it applies to... + */ + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + /* don't expire request waiting for context */ + if (req->rq_wait_ctx) + continue; + + /* Request in-flight? */ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting && !req->rq_resend) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_deadline > now) /* not expired */ + continue; + + /* Deal with this guy. Do it asynchronously to not block + * ptlrpcd thread. */ + ptlrpc_expire_one_request(req, 1); + } + + /* + * When waiting for a whole set, we always break out of the + * sleep so we can recalculate the timeout, or enable interrupts + * if everyone's timed out. 
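+ * That is why this callback unconditionally returns 1.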
+ */ + RETURN(1); +} + +/** + * Sets rq_intr flag in \a req under spinlock. + */ +void ptlrpc_mark_interrupted(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_mark_interrupted); + +/** + * Interrupts (sets interrupted flag) all uncompleted requests in + * a set \a data. Callback for l_wait_event for interruptible waits. + */ +static void ptlrpc_interrupted_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + + LASSERT(set != NULL); + CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); + + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + if (req->rq_intr) + continue; + + if (req->rq_phase != RQ_PHASE_RPC && + req->rq_phase != RQ_PHASE_UNREG_RPC && + !req->rq_allow_intr) + continue; + + ptlrpc_mark_interrupted(req); + } +} + +/** + * Get the smallest timeout in the set; this does NOT set a timeout. + */ +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + time64_t now = ktime_get_real_seconds(); + int timeout = 0; + struct ptlrpc_request *req; + time64_t deadline; + + ENTRY; + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + /* + * Request in-flight? + */ + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + continue; + + /* + * Already timed out. + */ + if (req->rq_timedout) + continue; + + /* + * Waiting for ctx. + */ + if (req->rq_wait_ctx) + continue; + + if (req->rq_phase == RQ_PHASE_NEW) + deadline = req->rq_sent; + else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) + deadline = req->rq_sent; + else + deadline = req->rq_sent + req->rq_timeout; + + if (deadline <= now) /* actually expired already */ + timeout = 1; /* ASAP */ + else if (timeout == 0 || timeout > deadline - now) + timeout = deadline - now; + } + RETURN(timeout); +} + +/** + * Send all unset request from the set and then wait untill all + * requests in the set complete (either get a reply, timeout, get an + * error or otherwise be interrupted). + * Returns 0 on success or error code otherwise. + */ +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + time64_t timeout; + int rc; + ENTRY; + + if (set->set_producer) + (void)ptlrpc_set_producer(set); + else + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + if (req->rq_phase == RQ_PHASE_NEW) + (void)ptlrpc_send_new_req(req); + } + + if (list_empty(&set->set_requests)) + RETURN(0); + + do { + timeout = ptlrpc_set_next_timeout(set); + + /* wait until all complete, interrupted, or an in-flight + * req times out */ + CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n", + set, timeout); + + if ((timeout == 0 && !signal_pending(current)) || + set->set_allow_intr) + /* No requests are in-flight (ether timed out + * or delayed), so we can allow interrupts. + * We still want to block for a limited time, + * so we allow interrupts during the timeout. */ + lwi = LWI_TIMEOUT_INTR_ALL( + cfs_time_seconds(timeout ? timeout : 1), + ptlrpc_expired_set, + ptlrpc_interrupted_set, set); + else + /* + * At least one request is in flight, so no + * interrupts are allowed. Wait until all + * complete, or an in-flight req times out. 
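+ * This is a plain LWI_TIMEOUT wait with ptlrpc_expired_set as the
+ * timeout callback.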
+ */ + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? timeout : 1), + ptlrpc_expired_set, set); + + rc = l_wait_event(set->set_waitq, + ptlrpc_check_set(NULL, set), &lwi); + + /* LU-769 - if we ignored the signal because it was already + * pending when we started, we need to handle it now or we risk + * it being ignored forever */ + if (rc == -ETIMEDOUT && + (!lwi.lwi_allow_intr || set->set_allow_intr) && + signal_pending(current)) { + sigset_t blocked_sigs = + cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + /* In fact we only interrupt for the "fatal" signals + * like SIGINT or SIGKILL. We still ignore less + * important signals since ptlrpc set is not easily + * reentrant from userspace again */ + if (signal_pending(current)) + ptlrpc_interrupted_set(set); + cfs_restore_sigs(blocked_sigs); + } + + LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); + + /* -EINTR => all requests have been flagged rq_intr so next + * check completes. + * -ETIMEDOUT => someone timed out. When all reqs have + * timed out, signals are enabled allowing completion with + * EINTR. + * I don't really care if we go once more round the loop in + * the error cases -eeb. */ + if (rc == 0 && atomic_read(&set->set_remaining) == 0) { + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_invalid_rqset = 1; + spin_unlock(&req->rq_lock); + } + } + } while (rc != 0 || atomic_read(&set->set_remaining) != 0); + + LASSERT(atomic_read(&set->set_remaining) == 0); + + rc = set->set_rc; /* rq_status of already freed requests if any */ + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); + if (req->rq_status != 0) + rc = req->rq_status; + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_wait); + +/** + * Helper fuction for request freeing. + * Called when request count reached zero and request needs to be freed. + * Removes request from all sorts of sending/replay lists it might be on, + * frees network buffers if any are present. + * If \a locked is set, that means caller is already holding import imp_lock + * and so we no longer need to reobtain it (for certain lists manipulations) + */ +static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) +{ + ENTRY; + + if (request == NULL) + RETURN_EXIT; + + LASSERT(!request->rq_srv_req); + LASSERT(request->rq_export == NULL); + LASSERTF(!request->rq_receiving_reply, "req %p\n", request); + LASSERTF(list_empty(&request->rq_list), "req %p\n", request); + LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERTF(!request->rq_replay, "req %p\n", request); + + req_capsule_fini(&request->rq_pill); + + /* We must take it off the imp_replay_list first. Otherwise, we'll set + * request->rq_reqmsg to NULL while osc_close is dereferencing it. 
*/ + if (request->rq_import != NULL) { + if (!locked) + spin_lock(&request->rq_import->imp_lock); + list_del_init(&request->rq_replay_list); + list_del_init(&request->rq_unreplied_list); + if (!locked) + spin_unlock(&request->rq_import->imp_lock); + } + LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); + + if (atomic_read(&request->rq_refcount) != 0) { + DEBUG_REQ(D_ERROR, request, + "freeing request with nonzero refcount"); + LBUG(); + } + + if (request->rq_repbuf != NULL) + sptlrpc_cli_free_repbuf(request); + + if (request->rq_import != NULL) { + if (!ptlrpcd_check_work(request)) { + LASSERT(atomic_read(&request->rq_import->imp_reqs) > 0); + atomic_dec(&request->rq_import->imp_reqs); + } + class_import_put(request->rq_import); + request->rq_import = NULL; + } + if (request->rq_bulk != NULL) + ptlrpc_free_bulk(request->rq_bulk); + + if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL) + sptlrpc_cli_free_reqbuf(request); + + if (request->rq_cli_ctx) + sptlrpc_req_put_ctx(request, !locked); + + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); + EXIT; +} + +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); +/** + * Drop one request reference. Must be called with import imp_lock held. + * When reference count drops to zero, request is freed. + */ +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) +{ + assert_spin_locked(&request->rq_import->imp_lock); + (void)__ptlrpc_req_finished(request, 1); +} + +/** + * Helper function + * Drops one reference count for request \a request. + * \a locked set indicates that caller holds import imp_lock. + * Frees the request whe reference count reaches zero. + * + * \retval 1 the request is freed + * \retval 0 some others still hold references on the request + */ +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) +{ + int count; + ENTRY; + + if (!request) + RETURN(1); + + LASSERT(request != LP_POISON); + LASSERT(request->rq_reqmsg != LP_POISON); + + DEBUG_REQ(D_INFO, request, "refcount now %u", + atomic_read(&request->rq_refcount) - 1); + + spin_lock(&request->rq_lock); + count = atomic_dec_return(&request->rq_refcount); + LASSERTF(count >= 0, "Invalid ref count %d\n", count); + + /* For open RPC, the client does not know the EA size (LOV, ACL, and + * so on) before replied, then the client has to reserve very large + * reply buffer. Such buffer will not be released until the RPC freed. + * Since The open RPC is replayable, we need to keep it in the replay + * list until close. If there are a lot of files opened concurrently, + * then the client may be OOM. + * + * If fact, it is unnecessary to keep reply buffer for open replay, + * related EAs have already been saved via mdc_save_lovea() before + * coming here. So it is safe to free the reply buffer some earlier + * before releasing the RPC to avoid client OOM. LU-9514 */ + if (count == 1 && request->rq_early_free_repbuf && request->rq_repbuf) { + spin_lock(&request->rq_early_free_lock); + sptlrpc_cli_free_repbuf(request); + request->rq_repbuf = NULL; + request->rq_repbuf_len = 0; + request->rq_repdata = NULL; + request->rq_reqdata_len = 0; + spin_unlock(&request->rq_early_free_lock); + } + spin_unlock(&request->rq_lock); + + if (!count) + __ptlrpc_free_req(request, locked); + + RETURN(!count); +} + +/** + * Drops one reference count for a request. 
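+ * The request is freed once the count reaches zero.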
+ */ +void ptlrpc_req_finished(struct ptlrpc_request *request) +{ + __ptlrpc_req_finished(request, 0); +} +EXPORT_SYMBOL(ptlrpc_req_finished); + +/** + * Returns xid of a \a request + */ +__u64 ptlrpc_req_xid(struct ptlrpc_request *request) +{ + return request->rq_xid; +} +EXPORT_SYMBOL(ptlrpc_req_xid); + +/** + * Disengage the client's reply buffer from the network + * NB does _NOT_ unregister any client-side bulk. + * IDEMPOTENT, but _not_ safe against concurrent callers. + * The request owner (i.e. the thread doing the I/O) must call... + * Returns 0 on success or 1 if unregistering cannot be made. + */ +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) +{ + int rc; + struct l_wait_info lwi; + + /* + * Might sleep. + */ + LASSERT(!in_interrupt()); + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + async && request->rq_reply_deadline == 0 && cfs_fail_val == 0) + request->rq_reply_deadline = ktime_get_real_seconds() + + LONG_UNLINK; + + /* + * Nothing left to do. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + RETURN(1); + + LNetMDUnlink(request->rq_reply_md_h); + + /* + * Let's check it once again. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + RETURN(1); + + /* Move to "Unregistering" phase as reply was not unlinked yet. */ + ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC); + + /* + * Do not wait for unlink to finish. + */ + if (async) + RETURN(0); + + /* + * We have to l_wait_event() whatever the result, to give liblustre + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool. + */ + for (;;) { + /* The wq argument is ignored by user-space wait_event macros */ + wait_queue_head_t *wq = (request->rq_set != NULL) ? 
+ &request->rq_set->set_waitq : + &request->rq_reply_waitq; + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), + &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(request, request->rq_next_phase); + RETURN(1); + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout " + "receiving_reply=%d req_ulinked=%d reply_unlinked=%d", + request->rq_receiving_reply, + request->rq_req_unlinked, + request->rq_reply_unlinked); + } + RETURN(0); +} + +static void ptlrpc_free_request(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_commit_cb != NULL) + req->rq_commit_cb(req); + list_del_init(&req->rq_replay_list); + + __ptlrpc_req_finished(req, 1); +} + +/** + * the request is committed and dropped from the replay list of its import + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force) +{ + struct obd_import *imp = req->rq_import; + + spin_lock(&imp->imp_lock); + if (list_empty(&req->rq_replay_list)) { + spin_unlock(&imp->imp_lock); + return; + } + + if (force || req->rq_transno <= imp->imp_peer_committed_transno) { + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = req->rq_replay_list.next; + ptlrpc_free_request(req); + } + + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_request_committed); + +/** + * Iterates through replay_list on import and prunes + * all requests have transno smaller than last_committed for the + * import and don't have rq_replay set. + * Since requests are sorted in transno order, stops when meetign first + * transno bigger than last_committed. 
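+ * Requests that keep rq_replay set (e.g. open requests) are moved to
+ * imp_committed_list instead of being freed.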
+ * caller must hold imp->imp_lock + */ +void ptlrpc_free_committed(struct obd_import *imp) +{ + struct ptlrpc_request *req, *saved; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ + bool skip_committed_list = true; + ENTRY; + + LASSERT(imp != NULL); + assert_spin_locked(&imp->imp_lock); + + if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && + imp->imp_generation == imp->imp_last_generation_checked) { + CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno); + RETURN_EXIT; + } + CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno, + imp->imp_generation); + + if (imp->imp_generation != imp->imp_last_generation_checked || + imp->imp_last_transno_checked == 0) + skip_committed_list = false; + + imp->imp_last_transno_checked = imp->imp_peer_committed_transno; + imp->imp_last_generation_checked = imp->imp_generation; + + list_for_each_entry_safe(req, saved, &imp->imp_replay_list, + rq_replay_list) { + /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ + LASSERT(req != last_req); + last_req = req; + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "zero transno during replay"); + LBUG(); + } + if (req->rq_import_generation < imp->imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); + GOTO(free_req, 0); + } + + /* not yet committed */ + if (req->rq_transno > imp->imp_peer_committed_transno) { + DEBUG_REQ(D_RPCTRACE, req, "stopping search"); + break; + } + + if (req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); + list_move_tail(&req->rq_replay_list, + &imp->imp_committed_list); + continue; + } + + DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", + imp->imp_peer_committed_transno); +free_req: + ptlrpc_free_request(req); + } + + if (skip_committed_list) + GOTO(out, 0); + + list_for_each_entry_safe(req, saved, &imp->imp_committed_list, + rq_replay_list) { + LASSERT(req->rq_transno != 0); + if (req->rq_import_generation < imp->imp_generation || + !req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "free %s open request", + req->rq_import_generation < + imp->imp_generation ? "stale" : "closed"); + + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = + req->rq_replay_list.next; + + ptlrpc_free_request(req); + } + } +out: + EXIT; +} + +void ptlrpc_cleanup_client(struct obd_import *imp) +{ + ENTRY; + EXIT; +} + +/** + * Schedule previously sent request for resend. + * For bulk requests we assign new xid (to avoid problems with + * lost replies and therefore several transfers landing into same buffer + * from different sending attempts). + */ +void ptlrpc_resend_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "going to resend"); + spin_lock(&req->rq_lock); + + /* Request got reply but linked to the import list still. + Let ptlrpc_check_set() to process it. 
*/ + if (ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + DEBUG_REQ(D_HA, req, "it has reply, so skip it"); + return; + } + + req->rq_status = -EAGAIN; + + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; + + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/* XXX: this function and rq_status are currently unused */ +void ptlrpc_restart_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); + req->rq_status = -ERESTARTSYS; + + spin_lock(&req->rq_lock); + req->rq_restart = 1; + req->rq_timedout = 0; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/** + * Grab additional reference on a request \a req + */ +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) +{ + ENTRY; + atomic_inc(&req->rq_refcount); + RETURN(req); +} +EXPORT_SYMBOL(ptlrpc_request_addref); + +/** + * Add a request to import replay_list. + * Must be called under imp_lock + */ +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp) +{ + struct list_head *tmp; + + assert_spin_locked(&imp->imp_lock); + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); + LBUG(); + } + + /* clear this for new requests that were resent as well + as resent replayed requests. */ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); + + /* don't re-add requests that have been replayed */ + if (!list_empty(&req->rq_replay_list)) + return; + + lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); + + spin_lock(&req->rq_lock); + req->rq_resend = 0; + spin_unlock(&req->rq_lock); + + LASSERT(imp->imp_replayable); + /* Balanced in ptlrpc_free_committed, usually. */ + ptlrpc_request_addref(req); + list_for_each_prev(tmp, &imp->imp_replay_list) { + struct ptlrpc_request *iter = list_entry(tmp, + struct ptlrpc_request, + rq_replay_list); + + /* We may have duplicate transnos if we create and then + * open a file, or for closes retained if to match creating + * opens, so use req->rq_xid as a secondary key. + * (See bugs 684, 685, and 428.) + * XXX no longer needed, but all opens need transnos! + */ + if (iter->rq_transno > req->rq_transno) + continue; + + if (iter->rq_transno == req->rq_transno) { + LASSERT(iter->rq_xid != req->rq_xid); + if (iter->rq_xid > req->rq_xid) + continue; + } + + list_add(&req->rq_replay_list, &iter->rq_replay_list); + return; + } + + list_add(&req->rq_replay_list, &imp->imp_replay_list); +} + +/** + * Send request and wait until it completes. + * Returns request processing status. + */ +int ptlrpc_queue_wait(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set; + int rc; + ENTRY; + + LASSERT(req->rq_set == NULL); + LASSERT(!req->rq_receiving_reply); + + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM); + RETURN(-ENOMEM); + } + + /* for distributed debugging */ + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(NULL, set); + ptlrpc_set_destroy(set); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_queue_wait); + +/** + * Callback used for replayed requests reply processing. + * In case of successful reply calls registered request replay callback. + * In case of error restart replay process. 
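+ * A version mismatch reported by the server (-EOVERFLOW) marks the
+ * import as vbr_failed instead of failing the replay outright.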
+ */ +static int ptlrpc_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void * data, int rc) +{ + struct ptlrpc_replay_async_args *aa = data; + struct obd_import *imp = req->rq_import; + + ENTRY; + atomic_dec(&imp->imp_replay_inflight); + + /* Note: if it is bulk replay (MDS-MDS replay), then even if + * server got the request, but bulk transfer timeout, let's + * replay the bulk req again */ + if (!ptlrpc_client_replied(req) || + (req->rq_bulk != NULL && + lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) { + DEBUG_REQ(D_ERROR, req, "request replay timed out.\n"); + GOTO(out, rc = -ETIMEDOUT); + } + + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && + (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || + lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) + GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg)); + + /** VBR: check version failure */ + if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { + /** replay was failed due to version mismatch */ + DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } else { + /** The transno had better not change over replay. */ + LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == + lustre_msg_get_transno(req->rq_repmsg) || + lustre_msg_get_transno(req->rq_repmsg) == 0, + "%#llx/%#llx\n", + lustre_msg_get_transno(req->rq_reqmsg), + lustre_msg_get_transno(req->rq_repmsg)); + } + + spin_lock(&imp->imp_lock); + imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); + spin_unlock(&imp->imp_lock); + LASSERT(imp->imp_last_replay_transno); + + /* transaction number shouldn't be bigger than the latest replayed */ + if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, + "Reported transno %llu is bigger than the " + "replayed one: %llu", req->rq_transno, + lustre_msg_get_transno(req->rq_reqmsg)); + GOTO(out, rc = -EINVAL); + } + + DEBUG_REQ(D_HA, req, "got rep"); + + /* let the callback do fixups, possibly including in the request */ + if (req->rq_replay_cb) + req->rq_replay_cb(req); + + if (ptlrpc_client_replied(req) && + lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { + DEBUG_REQ(D_ERROR, req, "status %d, old was %d", + lustre_msg_get_status(req->rq_repmsg), + aa->praa_old_status); + + /* Note: If the replay fails for MDT-MDT recovery, let's + * abort all of the following requests in the replay + * and sending list, because MDT-MDT update requests + * are dependent on each other, see LU-7039 */ + if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { + struct ptlrpc_request *free_req; + struct ptlrpc_request *tmp; + + spin_lock(&imp->imp_lock); + list_for_each_entry_safe(free_req, tmp, + &imp->imp_replay_list, + rq_replay_list) { + ptlrpc_free_request(free_req); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_committed_list, + rq_replay_list) { + ptlrpc_free_request(free_req); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_delayed_list, + rq_list) { + spin_lock(&free_req->rq_lock); + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + spin_unlock(&free_req->rq_lock); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_sending_list, + rq_list) { + spin_lock(&free_req->rq_lock); + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + spin_unlock(&free_req->rq_lock); + } + 
spin_unlock(&imp->imp_lock); + } + } else { + /* Put it back for re-replay. */ + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } + + /* + * Errors while replay can set transno to 0, but + * imp_last_replay_transno shouldn't be set to 0 anyway + */ + if (req->rq_transno == 0) + CERROR("Transno is 0 during replay!\n"); + + /* continue with recovery */ + rc = ptlrpc_import_recovery_state_machine(imp); + out: + req->rq_send_state = aa->praa_old_state; + + if (rc != 0) + /* this replay failed, so restart recovery */ + ptlrpc_connect_import(imp); + + RETURN(rc); +} + +/** + * Prepares and queues request for replay. + * Adds it to ptlrpcd queue for actual sending. + * Returns 0 on success. + */ +int ptlrpc_replay_req(struct ptlrpc_request *req) +{ + struct ptlrpc_replay_async_args *aa; + + ENTRY; + + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + memset(aa, 0, sizeof(*aa)); + + /* Prepare request to be resent with ptlrpcd */ + aa->praa_old_state = req->rq_send_state; + req->rq_send_state = LUSTRE_IMP_REPLAY; + req->rq_phase = RQ_PHASE_NEW; + req->rq_next_phase = RQ_PHASE_UNDEFINED; + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + req->rq_status = 0; + req->rq_interpret_reply = ptlrpc_replay_interpret; + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + + /* Tell server the net_latency, so the server can calculate how long + * it should wait for next replay */ + lustre_msg_set_service_timeout(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); + DEBUG_REQ(D_HA, req, "REPLAY"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + spin_lock(&req->rq_lock); + req->rq_early_free_repbuf = 0; + spin_unlock(&req->rq_lock); + ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * Aborts all in-flight request on import \a imp sending and delayed lists + */ +void ptlrpc_abort_inflight(struct obd_import *imp) +{ + struct list_head *tmp, *n; + ENTRY; + + /* + * Make sure that no new requests get processed for this import. + * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing + * this flag and then putting requests on sending_list or delayed_list. + */ + assert_spin_locked(&imp->imp_lock); + + /* XXX locking? Maybe we should remove each request with the list + * locked? Also, how do we know if the requests on the list are + * being freed at this time? + */ + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + struct ptlrpc_request *req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "inflight"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + /* Last chance to free reqs left on the replay list, but we + * will still leak reqs that haven't committed. 
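+ * (ptlrpc_free_committed() below only drops replay-list requests whose
+ * transno the server has already committed; anything newer stays
+ * referenced.)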
*/ + if (imp->imp_replayable) + ptlrpc_free_committed(imp); + + EXIT; +} + +/** + * Abort all uncompleted requests in request set \a set + */ +void ptlrpc_abort_set(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + + LASSERT(set != NULL); + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + spin_lock(&req->rq_lock); + if (req->rq_phase != RQ_PHASE_RPC) { + spin_unlock(&req->rq_lock); + continue; + } + + req->rq_err = 1; + req->rq_status = -EINTR; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + } +} + +static __u64 ptlrpc_last_xid; +static spinlock_t ptlrpc_last_xid_lock; + +/** + * Initialize the XID for the node. This is common among all requests on + * this node, and only requires the property that it is monotonically + * increasing. It does not need to be sequential. Since this is also used + * as the RDMA match bits, it is important that a single client NOT have + * the same match bits for two different in-flight requests, hence we do + * NOT want to have an XID per target or similar. + * + * To avoid an unlikely collision between match bits after a client reboot + * (which would deliver old data into the wrong RDMA buffer) initialize + * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. + * If the time is clearly incorrect, we instead use a 62-bit random number. + * In the worst case the random number will overflow 1M RPCs per second in + * 9133 years, or permutations thereof. + */ +#define YEAR_2004 (1ULL << 30) +void ptlrpc_init_xid(void) +{ + time64_t now = ktime_get_real_seconds(); + + spin_lock_init(&ptlrpc_last_xid_lock); + if (now < YEAR_2004) { + cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); + ptlrpc_last_xid >>= 2; + ptlrpc_last_xid |= (1ULL << 61); + } else { + ptlrpc_last_xid = (__u64)now << 20; + } + + /* Need to always be aligned to a power-of-two for mutli-bulk BRW */ + CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0); + ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; +} + +/** + * Increase xid and returns resulting new value to the caller. + * + * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting + * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC + * itself uses the last bulk xid needed, so the server can determine the + * the number of bulk transfers from the RPC XID and a bitmask. The starting + * xid must align to a power-of-two value. + * + * This is assumed to be true due to the initial ptlrpc_last_xid + * value also being initialized to a power-of-two value. LU-1431 + */ +__u64 ptlrpc_next_xid(void) +{ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + ptlrpc_last_xid = next; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +} + +/** + * If request has a new allocated XID (new request or EINPROGRESS resend), + * use this XID as matchbits of bulk, otherwise allocate a new matchbits for + * request to ensure previous bulk fails and avoid problems with lost replies + * and therefore several transfers landing into the same buffer from different + * sending attempts. + */ +void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *bd = req->rq_bulk; + + LASSERT(bd != NULL); + + /* Generate new matchbits for all resend requests, including + * resend replay. 
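+ * Reusing the old matchbits could let a late transfer from a previous
+ * sending attempt land in the buffer posted for this attempt, so a
+ * fresh value is taken from ptlrpc_next_xid() when the peer supports
+ * OBD_CONNECT_BULK_MBITS, and the request xid itself is re-assigned
+ * otherwise.  Illustrative example only, assuming PTLRPC_BULK_OPS_COUNT
+ * is 16: a resend might move from mbits 0x..10 to 0x..20, and with
+ * bd_md_count == 3 the mbits finally carried in the RPC become 0x..22.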
*/ + if (req->rq_resend) { + __u64 old_mbits = req->rq_mbits; + + /* First time resend on -EINPROGRESS will generate new xid, + * so we can actually use the rq_xid as rq_mbits in such case, + * however, it's bit hard to distinguish such resend with a + * 'resend for the -EINPROGRESS resend'. To make it simple, + * we opt to generate mbits for all resend cases. */ + if (OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS)){ + req->rq_mbits = ptlrpc_next_xid(); + } else { + /* Old version transfers rq_xid to peer as + * matchbits. */ + spin_lock(&req->rq_import->imp_lock); + list_del_init(&req->rq_unreplied_list); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); + req->rq_mbits = req->rq_xid; + } + CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", + old_mbits, req->rq_mbits); + } else if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + /* Request being sent first time, use xid as matchbits. */ + req->rq_mbits = req->rq_xid; + } else { + /* Replay request, xid and matchbits have already been + * correctly assigned. */ + return; + } + + /* For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so + * that server can infer the number of bulks that were prepared, + * see LU-1431 */ + req->rq_mbits += bd->bd_md_count - 1; + + /* Set rq_xid as rq_mbits to indicate the final bulk for the old + * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808. + * + * It's ok to directly set the rq_xid here, since this xid bump + * won't affect the request position in unreplied list. */ + if (!OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS)) + req->rq_xid = req->rq_mbits; +} + +/** + * Get a glimpse at what next xid value might have been. + * Returns possible next xid. + */ +__u64 ptlrpc_sample_next_xid(void) +{ +#if BITS_PER_LONG == 32 + /* need to avoid possible word tearing on 32-bit systems */ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +#else + /* No need to lock, since returned value is racy anyways */ + return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; +#endif +} +EXPORT_SYMBOL(ptlrpc_sample_next_xid); + +/** + * Functions for operating ptlrpc workers. + * + * A ptlrpc work is a function which will be running inside ptlrpc context. + * The callback shouldn't sleep otherwise it will block that ptlrpcd thread. + * + * 1. after a work is created, it can be used many times, that is: + * handler = ptlrpcd_alloc_work(); + * ptlrpcd_queue_work(); + * + * queue it again when necessary: + * ptlrpcd_queue_work(); + * ptlrpcd_destroy_work(); + * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but + * it will only be queued once in any time. Also as its name implies, it may + * have delay before it really runs by ptlrpcd thread. 
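+ *
+ * A minimal usage sketch (the callback name and its data are
+ * illustrative only; the callback runs in ptlrpcd context and must not
+ * sleep):
+ *
+ *   static int my_flush_cb(const struct lu_env *env, void *data)
+ *   {
+ *           return 0;
+ *   }
+ *
+ *   handler = ptlrpcd_alloc_work(imp, my_flush_cb, data);
+ *   if (!IS_ERR(handler))
+ *           ptlrpcd_queue_work(handler);
+ *   ...
+ *   ptlrpcd_destroy_work(handler);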
+ */ +struct ptlrpc_work_async_args { + int (*cb)(const struct lu_env *, void *); + void *cbdata; +}; + +static void ptlrpcd_add_work_req(struct ptlrpc_request *req) +{ + /* re-initialize the req */ + req->rq_timeout = obd_timeout; + req->rq_sent = ktime_get_real_seconds(); + req->rq_deadline = req->rq_sent + req->rq_timeout; + req->rq_phase = RQ_PHASE_INTERPRET; + req->rq_next_phase = RQ_PHASE_COMPLETE; + req->rq_xid = ptlrpc_next_xid(); + req->rq_import_generation = req->rq_import->imp_generation; + + ptlrpcd_add_req(req); +} + +static int work_interpreter(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct ptlrpc_work_async_args *arg = data; + + LASSERT(ptlrpcd_check_work(req)); + LASSERT(arg->cb != NULL); + + rc = arg->cb(env, arg->cbdata); + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + + if (atomic_dec_return(&req->rq_refcount) > 1) { + atomic_set(&req->rq_refcount, 2); + ptlrpcd_add_work_req(req); + } + return rc; +} + +static int worker_format; + +static int ptlrpcd_check_work(struct ptlrpc_request *req) +{ + return req->rq_pill.rc_fmt == (void *)&worker_format; +} + +/** + * Create a work for ptlrpc. + */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *cbdata) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_work_async_args *args; + ENTRY; + + might_sleep(); + + if (cb == NULL) + RETURN(ERR_PTR(-EINVAL)); + + /* copy some code from deprecated fakereq. */ + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (req == NULL) { + CERROR("ptlrpc: run out of memory!\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + + ptlrpc_cli_req_init(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + req->rq_type = PTL_RPC_MSG_REQUEST; + req->rq_import = class_import_get(imp); + req->rq_interpret_reply = work_interpreter; + /* don't want reply */ + req->rq_no_delay = req->rq_no_resend = 1; + req->rq_pill.rc_fmt = (void *)&worker_format; + + CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); + args = ptlrpc_req_async_args(req); + args->cb = cb; + args->cbdata = cbdata; + + RETURN(req); +} +EXPORT_SYMBOL(ptlrpcd_alloc_work); + +void ptlrpcd_destroy_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + if (req) + ptlrpc_req_finished(req); +} +EXPORT_SYMBOL(ptlrpcd_destroy_work); + +int ptlrpcd_queue_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + /* + * Check if the req is already being queued. + * + * Here comes a trick: it lacks a way of checking if a req is being + * processed reliably in ptlrpc. Here I have to use refcount of req + * for this purpose. This is okay because the caller should use this + * req as opaque data. - Jinshan + */ + LASSERT(atomic_read(&req->rq_refcount) > 0); + if (atomic_inc_return(&req->rq_refcount) == 2) + ptlrpcd_add_work_req(req); + return 0; +} +EXPORT_SYMBOL(ptlrpcd_queue_work); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c b/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c new file mode 100644 index 0000000000000..369eace7f9233 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c @@ -0,0 +1,240 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include + +#include "ptlrpc_internal.h" + +static struct cfs_hash *conn_hash; +static struct cfs_hash_ops conn_hash_ops; + +struct ptlrpc_connection * +ptlrpc_connection_get(struct lnet_process_id peer, lnet_nid_t self, + struct obd_uuid *uuid) +{ + struct ptlrpc_connection *conn, *conn2; + ENTRY; + + peer.nid = LNetPrimaryNID(peer.nid); + conn = cfs_hash_lookup(conn_hash, &peer); + if (conn) + GOTO(out, conn); + + OBD_ALLOC_PTR(conn); + if (!conn) + RETURN(NULL); + + conn->c_peer = peer; + conn->c_self = self; + INIT_HLIST_NODE(&conn->c_hash); + atomic_set(&conn->c_refcount, 1); + if (uuid) + obd_str2uuid(&conn->c_remote_uuid, uuid->uuid); + + /* + * Add the newly created conn to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * connection. The object which exists in the hash will be + * returned and may be compared against out object. + */ + /* In the function below, .hs_keycmp resolves to + * conn_keycmp() */ + /* coverity[overrun-buffer-val] */ + conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash); + if (conn != conn2) { + OBD_FREE_PTR(conn); + conn = conn2; + } + EXIT; +out: + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + return conn; +} + +int ptlrpc_connection_put(struct ptlrpc_connection *conn) +{ + int rc = 0; + ENTRY; + + if (!conn) + RETURN(rc); + + LASSERT(atomic_read(&conn->c_refcount) > 1); + + /* + * We do not remove connection from hashtable and + * do not free it even if last caller released ref, + * as we want to have it cached for the case it is + * needed again. + * + * Deallocating it and later creating new connection + * again would be wastful. This way we also avoid + * expensive locking to protect things from get/put + * race when found cached connection is freed by + * ptlrpc_connection_put(). + * + * It will be freed later in module unload time, + * when ptlrpc_connection_fini()->lh_exit->conn_exit() + * path is called. 
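+ *
+ * In other words the hash keeps the cached reference: a caller's
+ * ptlrpc_connection_get() leaves the count at two or more, this put
+ * drops only the caller's share (returning 1 once the cached reference
+ * is all that remains), and the memory is reclaimed by conn_exit()
+ * when the hash table is finally destroyed.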
+ */ + if (atomic_dec_return(&conn->c_refcount) == 1) + rc = 1; + + CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + RETURN(rc); +} + +struct ptlrpc_connection * +ptlrpc_connection_addref(struct ptlrpc_connection *conn) +{ + ENTRY; + + atomic_inc(&conn->c_refcount); + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + RETURN(conn); +} + +int ptlrpc_connection_init(void) +{ + ENTRY; + + conn_hash = cfs_hash_create("CONN_HASH", + HASH_CONN_CUR_BITS, + HASH_CONN_MAX_BITS, + HASH_CONN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &conn_hash_ops, CFS_HASH_DEFAULT); + if (!conn_hash) + RETURN(-ENOMEM); + + RETURN(0); +} + +void ptlrpc_connection_fini(void) { + ENTRY; + cfs_hash_putref(conn_hash); + EXIT; +} + +/* + * Hash operations for net_peer<->connection + */ +static unsigned +conn_hashfn(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct lnet_process_id), mask); +} + +static int +conn_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + const struct lnet_process_id *conn_key; + + LASSERT(key != NULL); + conn_key = (struct lnet_process_id *)key; + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + + return conn_key->nid == conn->c_peer.nid && + conn_key->pid == conn->c_peer.pid; +} + +static void * +conn_key(struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + return &conn->c_peer; +} + +static void * +conn_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ptlrpc_connection, c_hash); +} + +static void +conn_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_inc(&conn->c_refcount); +} + +static void +conn_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_dec(&conn->c_refcount); +} + +static void +conn_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + /* + * Nothing should be left. Connection user put it and + * connection also was deleted from table by this time + * so we should have 0 refs. + */ + LASSERTF(atomic_read(&conn->c_refcount) == 0, + "Busy connection with %d refs\n", + atomic_read(&conn->c_refcount)); + OBD_FREE_PTR(conn); +} + +static struct cfs_hash_ops conn_hash_ops = { + .hs_hash = conn_hashfn, + .hs_keycmp = conn_keycmp, + .hs_key = conn_key, + .hs_object = conn_object, + .hs_get = conn_get, + .hs_put_locked = conn_put_locked, + .hs_exit = conn_exit, +}; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c new file mode 100644 index 0000000000000..a3d31a853244c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c @@ -0,0 +1,411 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#include +#include + +#ifdef LUSTRE_TRANSLATE_ERRNOS +#include + +/* + * The two translation tables below must define a one-to-one mapping between + * host and network errnos. + * + * EWOULDBLOCK is equal to EAGAIN on all architectures except for parisc, which + * appears irrelevant. Thus, existing references to EWOULDBLOCK are fine. + * + * EDEADLOCK is equal to EDEADLK on x86 but not on sparc, at least. A sparc + * host has no context-free way to determine if a LUSTRE_EDEADLK represents an + * EDEADLK or an EDEADLOCK. Therefore, all existing references to EDEADLOCK + * that need to be transferred on wire have been replaced with EDEADLK. + */ +static int lustre_errno_hton_mapping[] = { + [EPERM] = LUSTRE_EPERM, + [ENOENT] = LUSTRE_ENOENT, + [ESRCH] = LUSTRE_ESRCH, + [EINTR] = LUSTRE_EINTR, + [EIO] = LUSTRE_EIO, + [ENXIO] = LUSTRE_ENXIO, + [E2BIG] = LUSTRE_E2BIG, + [ENOEXEC] = LUSTRE_ENOEXEC, + [EBADF] = LUSTRE_EBADF, + [ECHILD] = LUSTRE_ECHILD, + [EAGAIN] = LUSTRE_EAGAIN, + [ENOMEM] = LUSTRE_ENOMEM, + [EACCES] = LUSTRE_EACCES, + [EFAULT] = LUSTRE_EFAULT, + [ENOTBLK] = LUSTRE_ENOTBLK, + [EBUSY] = LUSTRE_EBUSY, + [EEXIST] = LUSTRE_EEXIST, + [EXDEV] = LUSTRE_EXDEV, + [ENODEV] = LUSTRE_ENODEV, + [ENOTDIR] = LUSTRE_ENOTDIR, + [EISDIR] = LUSTRE_EISDIR, + [EINVAL] = LUSTRE_EINVAL, + [ENFILE] = LUSTRE_ENFILE, + [EMFILE] = LUSTRE_EMFILE, + [ENOTTY] = LUSTRE_ENOTTY, + [ETXTBSY] = LUSTRE_ETXTBSY, + [EFBIG] = LUSTRE_EFBIG, + [ENOSPC] = LUSTRE_ENOSPC, + [ESPIPE] = LUSTRE_ESPIPE, + [EROFS] = LUSTRE_EROFS, + [EMLINK] = LUSTRE_EMLINK, + [EPIPE] = LUSTRE_EPIPE, + [EDOM] = LUSTRE_EDOM, + [ERANGE] = LUSTRE_ERANGE, + [EDEADLK] = LUSTRE_EDEADLK, + [ENAMETOOLONG] = LUSTRE_ENAMETOOLONG, + [ENOLCK] = LUSTRE_ENOLCK, + [ENOSYS] = LUSTRE_ENOSYS, + [ENOTEMPTY] = LUSTRE_ENOTEMPTY, + [ELOOP] = LUSTRE_ELOOP, + [ENOMSG] = LUSTRE_ENOMSG, + [EIDRM] = LUSTRE_EIDRM, + [ECHRNG] = LUSTRE_ECHRNG, + [EL2NSYNC] = LUSTRE_EL2NSYNC, + [EL3HLT] = LUSTRE_EL3HLT, + [EL3RST] = LUSTRE_EL3RST, + [ELNRNG] = LUSTRE_ELNRNG, + [EUNATCH] = LUSTRE_EUNATCH, + [ENOCSI] = LUSTRE_ENOCSI, + [EL2HLT] = LUSTRE_EL2HLT, + [EBADE] = LUSTRE_EBADE, + [EBADR] = LUSTRE_EBADR, + [EXFULL] = LUSTRE_EXFULL, + [ENOANO] = LUSTRE_ENOANO, + [EBADRQC] = LUSTRE_EBADRQC, + [EBADSLT] = LUSTRE_EBADSLT, + [EBFONT] = LUSTRE_EBFONT, + [ENOSTR] = LUSTRE_ENOSTR, + [ENODATA] = LUSTRE_ENODATA, + [ETIME] = LUSTRE_ETIME, + [ENOSR] = LUSTRE_ENOSR, + [ENONET] = LUSTRE_ENONET, + [ENOPKG] = LUSTRE_ENOPKG, + [EREMOTE] = LUSTRE_EREMOTE, + [ENOLINK] = LUSTRE_ENOLINK, + [EADV] = LUSTRE_EADV, + [ESRMNT] = LUSTRE_ESRMNT, + [ECOMM] = LUSTRE_ECOMM, + [EPROTO] = LUSTRE_EPROTO, + [EMULTIHOP] = LUSTRE_EMULTIHOP, + [EDOTDOT] = LUSTRE_EDOTDOT, + [EBADMSG] = LUSTRE_EBADMSG, + [EOVERFLOW] = LUSTRE_EOVERFLOW, + [ENOTUNIQ] = LUSTRE_ENOTUNIQ, + [EBADFD] = LUSTRE_EBADFD, + [EREMCHG] = LUSTRE_EREMCHG, + [ELIBACC] = LUSTRE_ELIBACC, + [ELIBBAD] = LUSTRE_ELIBBAD, + 
[ELIBSCN] = LUSTRE_ELIBSCN, + [ELIBMAX] = LUSTRE_ELIBMAX, + [ELIBEXEC] = LUSTRE_ELIBEXEC, + [EILSEQ] = LUSTRE_EILSEQ, + [ERESTART] = LUSTRE_ERESTART, + [ESTRPIPE] = LUSTRE_ESTRPIPE, + [EUSERS] = LUSTRE_EUSERS, + [ENOTSOCK] = LUSTRE_ENOTSOCK, + [EDESTADDRREQ] = LUSTRE_EDESTADDRREQ, + [EMSGSIZE] = LUSTRE_EMSGSIZE, + [EPROTOTYPE] = LUSTRE_EPROTOTYPE, + [ENOPROTOOPT] = LUSTRE_ENOPROTOOPT, + [EPROTONOSUPPORT] = LUSTRE_EPROTONOSUPPORT, + [ESOCKTNOSUPPORT] = LUSTRE_ESOCKTNOSUPPORT, + [EOPNOTSUPP] = LUSTRE_EOPNOTSUPP, + [EPFNOSUPPORT] = LUSTRE_EPFNOSUPPORT, + [EAFNOSUPPORT] = LUSTRE_EAFNOSUPPORT, + [EADDRINUSE] = LUSTRE_EADDRINUSE, + [EADDRNOTAVAIL] = LUSTRE_EADDRNOTAVAIL, + [ENETDOWN] = LUSTRE_ENETDOWN, + [ENETUNREACH] = LUSTRE_ENETUNREACH, + [ENETRESET] = LUSTRE_ENETRESET, + [ECONNABORTED] = LUSTRE_ECONNABORTED, + [ECONNRESET] = LUSTRE_ECONNRESET, + [ENOBUFS] = LUSTRE_ENOBUFS, + [EISCONN] = LUSTRE_EISCONN, + [ENOTCONN] = LUSTRE_ENOTCONN, + [ESHUTDOWN] = LUSTRE_ESHUTDOWN, + [ETOOMANYREFS] = LUSTRE_ETOOMANYREFS, + [ETIMEDOUT] = LUSTRE_ETIMEDOUT, + [ECONNREFUSED] = LUSTRE_ECONNREFUSED, + [EHOSTDOWN] = LUSTRE_EHOSTDOWN, + [EHOSTUNREACH] = LUSTRE_EHOSTUNREACH, + [EALREADY] = LUSTRE_EALREADY, + [EINPROGRESS] = LUSTRE_EINPROGRESS, + [ESTALE] = LUSTRE_ESTALE, + [EUCLEAN] = LUSTRE_EUCLEAN, + [ENOTNAM] = LUSTRE_ENOTNAM, + [ENAVAIL] = LUSTRE_ENAVAIL, + [EISNAM] = LUSTRE_EISNAM, + [EREMOTEIO] = LUSTRE_EREMOTEIO, + [EDQUOT] = LUSTRE_EDQUOT, + [ENOMEDIUM] = LUSTRE_ENOMEDIUM, + [EMEDIUMTYPE] = LUSTRE_EMEDIUMTYPE, + [ECANCELED] = LUSTRE_ECANCELED, + [ENOKEY] = LUSTRE_ENOKEY, + [EKEYEXPIRED] = LUSTRE_EKEYEXPIRED, + [EKEYREVOKED] = LUSTRE_EKEYREVOKED, + [EKEYREJECTED] = LUSTRE_EKEYREJECTED, + [EOWNERDEAD] = LUSTRE_EOWNERDEAD, + [ENOTRECOVERABLE] = LUSTRE_ENOTRECOVERABLE, + [ERESTARTSYS] = LUSTRE_ERESTARTSYS, + [ERESTARTNOINTR] = LUSTRE_ERESTARTNOINTR, + [ERESTARTNOHAND] = LUSTRE_ERESTARTNOHAND, + [ENOIOCTLCMD] = LUSTRE_ENOIOCTLCMD, + [ERESTART_RESTARTBLOCK] = LUSTRE_ERESTART_RESTARTBLOCK, + [EBADHANDLE] = LUSTRE_EBADHANDLE, + [ENOTSYNC] = LUSTRE_ENOTSYNC, + [EBADCOOKIE] = LUSTRE_EBADCOOKIE, + [ENOTSUPP] = LUSTRE_ENOTSUPP, + [ETOOSMALL] = LUSTRE_ETOOSMALL, + [ESERVERFAULT] = LUSTRE_ESERVERFAULT, + [EBADTYPE] = LUSTRE_EBADTYPE, + [EJUKEBOX] = LUSTRE_EJUKEBOX, + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. 
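+ * They are therefore mapped to themselves below instead of being
+ * renumbered into the LUSTRE_* range.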
+ */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS]= ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE +}; + +static int lustre_errno_ntoh_mapping[] = { + [LUSTRE_EPERM] = EPERM, + [LUSTRE_ENOENT] = ENOENT, + [LUSTRE_ESRCH] = ESRCH, + [LUSTRE_EINTR] = EINTR, + [LUSTRE_EIO] = EIO, + [LUSTRE_ENXIO] = ENXIO, + [LUSTRE_E2BIG] = E2BIG, + [LUSTRE_ENOEXEC] = ENOEXEC, + [LUSTRE_EBADF] = EBADF, + [LUSTRE_ECHILD] = ECHILD, + [LUSTRE_EAGAIN] = EAGAIN, + [LUSTRE_ENOMEM] = ENOMEM, + [LUSTRE_EACCES] = EACCES, + [LUSTRE_EFAULT] = EFAULT, + [LUSTRE_ENOTBLK] = ENOTBLK, + [LUSTRE_EBUSY] = EBUSY, + [LUSTRE_EEXIST] = EEXIST, + [LUSTRE_EXDEV] = EXDEV, + [LUSTRE_ENODEV] = ENODEV, + [LUSTRE_ENOTDIR] = ENOTDIR, + [LUSTRE_EISDIR] = EISDIR, + [LUSTRE_EINVAL] = EINVAL, + [LUSTRE_ENFILE] = ENFILE, + [LUSTRE_EMFILE] = EMFILE, + [LUSTRE_ENOTTY] = ENOTTY, + [LUSTRE_ETXTBSY] = ETXTBSY, + [LUSTRE_EFBIG] = EFBIG, + [LUSTRE_ENOSPC] = ENOSPC, + [LUSTRE_ESPIPE] = ESPIPE, + [LUSTRE_EROFS] = EROFS, + [LUSTRE_EMLINK] = EMLINK, + [LUSTRE_EPIPE] = EPIPE, + [LUSTRE_EDOM] = EDOM, + [LUSTRE_ERANGE] = ERANGE, + [LUSTRE_EDEADLK] = EDEADLK, + [LUSTRE_ENAMETOOLONG] = ENAMETOOLONG, + [LUSTRE_ENOLCK] = ENOLCK, + [LUSTRE_ENOSYS] = ENOSYS, + [LUSTRE_ENOTEMPTY] = ENOTEMPTY, + [LUSTRE_ELOOP] = ELOOP, + [LUSTRE_ENOMSG] = ENOMSG, + [LUSTRE_EIDRM] = EIDRM, + [LUSTRE_ECHRNG] = ECHRNG, + [LUSTRE_EL2NSYNC] = EL2NSYNC, + [LUSTRE_EL3HLT] = EL3HLT, + [LUSTRE_EL3RST] = EL3RST, + [LUSTRE_ELNRNG] = ELNRNG, + [LUSTRE_EUNATCH] = EUNATCH, + [LUSTRE_ENOCSI] = ENOCSI, + [LUSTRE_EL2HLT] = EL2HLT, + [LUSTRE_EBADE] = EBADE, + [LUSTRE_EBADR] = EBADR, + [LUSTRE_EXFULL] = EXFULL, + [LUSTRE_ENOANO] = ENOANO, + [LUSTRE_EBADRQC] = EBADRQC, + [LUSTRE_EBADSLT] = EBADSLT, + [LUSTRE_EBFONT] = EBFONT, + [LUSTRE_ENOSTR] = ENOSTR, + [LUSTRE_ENODATA] = ENODATA, + [LUSTRE_ETIME] = ETIME, + [LUSTRE_ENOSR] = ENOSR, + [LUSTRE_ENONET] = ENONET, + [LUSTRE_ENOPKG] = ENOPKG, + [LUSTRE_EREMOTE] = EREMOTE, + [LUSTRE_ENOLINK] = ENOLINK, + [LUSTRE_EADV] = EADV, + [LUSTRE_ESRMNT] = ESRMNT, + [LUSTRE_ECOMM] = ECOMM, + [LUSTRE_EPROTO] = EPROTO, + [LUSTRE_EMULTIHOP] = EMULTIHOP, + [LUSTRE_EDOTDOT] = EDOTDOT, + [LUSTRE_EBADMSG] = EBADMSG, + [LUSTRE_EOVERFLOW] = EOVERFLOW, + [LUSTRE_ENOTUNIQ] = ENOTUNIQ, + [LUSTRE_EBADFD] = EBADFD, + [LUSTRE_EREMCHG] = EREMCHG, + [LUSTRE_ELIBACC] = ELIBACC, + [LUSTRE_ELIBBAD] = ELIBBAD, + [LUSTRE_ELIBSCN] = ELIBSCN, + [LUSTRE_ELIBMAX] = ELIBMAX, + [LUSTRE_ELIBEXEC] = ELIBEXEC, + [LUSTRE_EILSEQ] = EILSEQ, + [LUSTRE_ERESTART] = ERESTART, + [LUSTRE_ESTRPIPE] = ESTRPIPE, + [LUSTRE_EUSERS] = EUSERS, + [LUSTRE_ENOTSOCK] = ENOTSOCK, + [LUSTRE_EDESTADDRREQ] = EDESTADDRREQ, + [LUSTRE_EMSGSIZE] = EMSGSIZE, + [LUSTRE_EPROTOTYPE] = EPROTOTYPE, + [LUSTRE_ENOPROTOOPT] = ENOPROTOOPT, + [LUSTRE_EPROTONOSUPPORT] = EPROTONOSUPPORT, + [LUSTRE_ESOCKTNOSUPPORT] = ESOCKTNOSUPPORT, + [LUSTRE_EOPNOTSUPP] = EOPNOTSUPP, + [LUSTRE_EPFNOSUPPORT] = EPFNOSUPPORT, + [LUSTRE_EAFNOSUPPORT] = EAFNOSUPPORT, + [LUSTRE_EADDRINUSE] = EADDRINUSE, + [LUSTRE_EADDRNOTAVAIL] = EADDRNOTAVAIL, + [LUSTRE_ENETDOWN] = ENETDOWN, + [LUSTRE_ENETUNREACH] = ENETUNREACH, + [LUSTRE_ENETRESET] = ENETRESET, + [LUSTRE_ECONNABORTED] = ECONNABORTED, + [LUSTRE_ECONNRESET] = ECONNRESET, + [LUSTRE_ENOBUFS] = ENOBUFS, + [LUSTRE_EISCONN] = EISCONN, + [LUSTRE_ENOTCONN] = ENOTCONN, + 
[LUSTRE_ESHUTDOWN] = ESHUTDOWN, + [LUSTRE_ETOOMANYREFS] = ETOOMANYREFS, + [LUSTRE_ETIMEDOUT] = ETIMEDOUT, + [LUSTRE_ECONNREFUSED] = ECONNREFUSED, + [LUSTRE_EHOSTDOWN] = EHOSTDOWN, + [LUSTRE_EHOSTUNREACH] = EHOSTUNREACH, + [LUSTRE_EALREADY] = EALREADY, + [LUSTRE_EINPROGRESS] = EINPROGRESS, + [LUSTRE_ESTALE] = ESTALE, + [LUSTRE_EUCLEAN] = EUCLEAN, + [LUSTRE_ENOTNAM] = ENOTNAM, + [LUSTRE_ENAVAIL] = ENAVAIL, + [LUSTRE_EISNAM] = EISNAM, + [LUSTRE_EREMOTEIO] = EREMOTEIO, + [LUSTRE_EDQUOT] = EDQUOT, + [LUSTRE_ENOMEDIUM] = ENOMEDIUM, + [LUSTRE_EMEDIUMTYPE] = EMEDIUMTYPE, + [LUSTRE_ECANCELED] = ECANCELED, + [LUSTRE_ENOKEY] = ENOKEY, + [LUSTRE_EKEYEXPIRED] = EKEYEXPIRED, + [LUSTRE_EKEYREVOKED] = EKEYREVOKED, + [LUSTRE_EKEYREJECTED] = EKEYREJECTED, + [LUSTRE_EOWNERDEAD] = EOWNERDEAD, + [LUSTRE_ENOTRECOVERABLE] = ENOTRECOVERABLE, + [LUSTRE_ERESTARTSYS] = ERESTARTSYS, + [LUSTRE_ERESTARTNOINTR] = ERESTARTNOINTR, + [LUSTRE_ERESTARTNOHAND] = ERESTARTNOHAND, + [LUSTRE_ENOIOCTLCMD] = ENOIOCTLCMD, + [LUSTRE_ERESTART_RESTARTBLOCK] = ERESTART_RESTARTBLOCK, + [LUSTRE_EBADHANDLE] = EBADHANDLE, + [LUSTRE_ENOTSYNC] = ENOTSYNC, + [LUSTRE_EBADCOOKIE] = EBADCOOKIE, + [LUSTRE_ENOTSUPP] = ENOTSUPP, + [LUSTRE_ETOOSMALL] = ETOOSMALL, + [LUSTRE_ESERVERFAULT] = ESERVERFAULT, + [LUSTRE_EBADTYPE] = EBADTYPE, + [LUSTRE_EJUKEBOX] = EJUKEBOX, + [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. + */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS] = ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE +}; + +unsigned int lustre_errno_hton(unsigned int h) +{ + unsigned int n; + + if (h == 0) { + n = 0; + } else if (h < ARRAY_SIZE(lustre_errno_hton_mapping)) { + n = lustre_errno_hton_mapping[h]; + if (n == 0) + goto generic; + } else { +generic: + /* + * A generic errno is better than the unknown one that could + * mean anything to a different host. + */ + n = LUSTRE_EIO; + } + + return n; +} +EXPORT_SYMBOL(lustre_errno_hton); + +unsigned int lustre_errno_ntoh(unsigned int n) +{ + unsigned int h; + + if (n == 0) { + h = 0; + } else if (n < ARRAY_SIZE(lustre_errno_ntoh_mapping)) { + h = lustre_errno_ntoh_mapping[n]; + if (h == 0) + goto generic; + } else { +generic: + /* + * Similar to the situation in lustre_errno_hton(), an unknown + * network errno could coincide with anything. Hence, it is + * better to return a generic errno. + */ + h = EIO; + } + + return h; +} +EXPORT_SYMBOL(lustre_errno_ntoh); + +#endif /* LUSTRE_TRANSLATE_ERRNOS */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c new file mode 100644 index 0000000000000..6c713b22b94ae --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c @@ -0,0 +1,646 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +struct lnet_handle_eq ptlrpc_eq_h; + +/* + * Client's outgoing request callback + */ +void request_out_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + bool wakeup = false; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + /* Do not update imp_next_ping for connection request */ + if (lustre_msg_get_opc(req->rq_reqmsg) != + req->rq_import->imp_connect_op) + ptlrpc_pinger_sending_on_import(req->rq_import); + + sptlrpc_request_out_callback(req); + + spin_lock(&req->rq_lock); + req->rq_real_sent = ktime_get_real_seconds(); + req->rq_req_unlinked = 1; + /* reply_in_callback happened before request_out_callback? */ + if (req->rq_reply_unlinked) + wakeup = true; + + if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) { + /* Failed send: make it seem like the reply timed out, just + * like failing sends in client.c does currently... */ + req->rq_net_err = 1; + wakeup = true; + } + + if (wakeup) + ptlrpc_client_wake_req(req); + + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); + EXIT; +} + +/* + * Client's incoming reply callback + */ +void reply_in_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + ENTRY; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->md.start == req->rq_repbuf); + LASSERT (ev->offset + ev->mlength <= req->rq_repbuf_len); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + for adaptive timeouts' early reply. 
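+ That lets the server PUT into this reply buffer more than once:
+ an early reply (offset 0 with MSGHDR_AT_SUPPORT) is only noted via
+ rq_early while we keep receiving, and the real reply is handled in
+ the else branch below.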
*/ + LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0); + + spin_lock(&req->rq_lock); + + req->rq_receiving_reply = 0; + req->rq_early = 0; + if (ev->unlinked) + req->rq_reply_unlinked = 1; + + if (ev->status) + goto out_wake; + + if (ev->type == LNET_EVENT_UNLINK) { + LASSERT(ev->unlinked); + DEBUG_REQ(D_NET, req, "unlink"); + goto out_wake; + } + + if (ev->mlength < ev->rlength ) { + CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req, + req->rq_replen, ev->rlength, ev->offset); + req->rq_reply_truncated = 1; + req->rq_replied = 1; + req->rq_status = -EOVERFLOW; + req->rq_nob_received = ev->rlength + ev->offset; + goto out_wake; + } + + if ((ev->offset == 0) && + ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received: mlen=%u offset=%d replen=%d " + "replied=%d unlinked=%d", ev->mlength, ev->offset, + req->rq_replen, req->rq_replied, ev->unlinked); + + req->rq_early_count++; /* number received, client side */ + + /* already got the real reply or buffers are already unlinked */ + if (req->rq_replied || + req->rq_reply_unlinked == 1) + goto out_wake; + + req->rq_early = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ + req->rq_rep_swab_mask = 0; + req->rq_replied = 1; + /* Got reply, no resend required */ + req->rq_resend = 0; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + +out_wake: + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + EXIT; +} + +/* + * Client's bulk has been written/read + */ +void client_bulk_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + struct ptlrpc_request *req; + ENTRY; + + LASSERT((ptlrpc_is_bulk_put_sink(desc->bd_type) && + ev->type == LNET_EVENT_PUT) || + (ptlrpc_is_bulk_get_source(desc->bd_type) && + ev->type == LNET_EVENT_GET) || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) + ev->status = -EIO; + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE)) + ev->status = -EIO; + + CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + req = desc->bd_req; + LASSERT(desc->bd_refs > 0); + desc->bd_refs--; + + if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = ev->sender; + } else { + /* start reconnect and resend if network error hit */ + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + } + + if (ev->status != 0) + desc->bd_failure = 1; + + /* NB don't unlock till after wakeup; desc can disappear under us + * otherwise */ + if (desc->bd_refs == 0) + ptlrpc_client_wake_req(desc->bd_req); + + spin_unlock(&desc->bd_lock); + EXIT; +} + +/* + * We will have percpt request history list for ptlrpc service in upcoming + * patches because we don't want to be serialized by current per-service + * history operations. So we require history ID can (somehow) show arriving + * order w/o grabbing global lock, and user can sort them in userspace. + * + * This is how we generate history ID for ptlrpc_request: + * ---------------------------------------------------- + * | 32 bits | 16 bits | (16 - X)bits | X bits | + * ---------------------------------------------------- + * | seconds | usec / 16 | sequence | CPT id | + * ---------------------------------------------------- + * + * it might not be precise but should be good enough. + */ + +#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits) + +#define REQS_SEC_SHIFT 32 +#define REQS_USEC_SHIFT 16 +#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt) + +static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + u64 sec = req->rq_arrival_time.tv_sec; + u32 usec = req->rq_arrival_time.tv_nsec / NSEC_PER_USEC / 16; /* usec / 16 */ + u64 new_seq; + + /* set sequence ID for request and add it to history list, + * it must be called with hold svcpt::scp_lock */ + + new_seq = (sec << REQS_SEC_SHIFT) | + (usec << REQS_USEC_SHIFT) | + (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt); + + if (new_seq > svcpt->scp_hist_seq) { + /* This handles the initial case of scp_hist_seq == 0 or + * we just jumped into a new time window */ + svcpt->scp_hist_seq = new_seq; + } else { + LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT); + /* NB: increase sequence number in current usec bucket, + * however, it's possible that we used up all bits for + * sequence and jumped into the next usec bucket (future time), + * then we hope there will be less RPCs per bucket at some + * point, and sequence will catch up again */ + svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt)); + new_seq = svcpt->scp_hist_seq; + } + + req->rq_history_seq = new_seq; + + list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs); +} + +/* + * Server's incoming request callback + */ +void request_in_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *service = svcpt->scp_service; + struct ptlrpc_request *req; + ENTRY; + + LASSERT (ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer); + LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <= + rqbd->rqbd_buffer + service->srv_buf_size); + + CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, service %s\n", + ev->type, ev->status, service->srv_name); + + if (ev->unlinked) { + /* If this is the last request message to fit in the + * request buffer we can use the request object embedded in + * rqbd. Note that if we failed to allocate a request, + * we'd have to re-post the rqbd, which we can't do in this + * context. */ + req = &rqbd->rqbd_req; + memset(req, 0, sizeof (*req)); + } else { + LASSERT (ev->type == LNET_EVENT_PUT); + if (ev->status != 0) { + /* We moaned above already... */ + return; + } + req = ptlrpc_request_cache_alloc(GFP_ATOMIC); + if (req == NULL) { + CERROR("Can't allocate incoming request descriptor: " + "Dropping %s RPC from %s\n", + service->srv_name, + libcfs_id2str(ev->initiator)); + return; + } + } + + ptlrpc_srv_req_init(req); + /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, + * flags are reset and scalars are zero. We only set the message + * size to non-zero if this was a successful receive. */ + req->rq_xid = ev->match_bits; + req->rq_reqbuf = ev->md.start + ev->offset; + if (ev->type == LNET_EVENT_PUT && ev->status == 0) + req->rq_reqdata_len = ev->mlength; + ktime_get_real_ts64(&req->rq_arrival_time); + /* Multi-Rail: keep track of both initiator and source NID. */ + req->rq_peer = ev->initiator; + req->rq_source = ev->source; + req->rq_self = ev->target.nid; + req->rq_rqbd = rqbd; + req->rq_phase = RQ_PHASE_NEW; + if (ev->type == LNET_EVENT_PUT) + CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n", + req, req->rq_xid, ev->mlength); + + CDEBUG(D_RPCTRACE, "peer: %s (source: %s)\n", + libcfs_id2str(req->rq_peer), libcfs_id2str(req->rq_source)); + + spin_lock(&svcpt->scp_lock); + + ptlrpc_req_add_history(svcpt, req); + + if (ev->unlinked) { + svcpt->scp_nrqbds_posted--; + CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", + svcpt->scp_nrqbds_posted); + + /* Normally, don't complain about 0 buffers posted; LNET won't + * drop incoming reqs since we set the portal lazy */ + if (test_req_buffer_pressure && + ev->type != LNET_EVENT_UNLINK && + svcpt->scp_nrqbds_posted == 0) + CWARN("All %s request buffers busy\n", + service->srv_name); + + /* req takes over the network's ref on rqbd */ + } else { + /* req takes a ref on rqbd */ + rqbd->rqbd_refcount++; + } + + list_add_tail(&req->rq_list, &svcpt->scp_req_incoming); + svcpt->scp_nreqs_incoming++; + + /* NB everything can disappear under us once the request + * has been queued and we unlock, so do the wake now... */ + wake_up(&svcpt->scp_waitq); + + spin_unlock(&svcpt->scp_lock); + EXIT; +} + +/* + * Server's outgoing reply callback + */ +void reply_out_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_reply_state *rs = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + ENTRY; + + LASSERT (ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_UNLINK); + + if (!rs->rs_difficult) { + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' */ + LASSERT (ev->unlinked); + ptlrpc_rs_decref(rs); + EXIT; + return; + } + + LASSERT (rs->rs_on_net); + + if (ev->unlinked) { + /* Last network callback. 
The net's ref on 'rs' stays put + * until ptlrpc_handle_rs() is done with it */ + spin_lock(&svcpt->scp_rep_lock); + spin_lock(&rs->rs_lock); + + rs->rs_on_net = 0; + if (!rs->rs_no_ack || + rs->rs_transno <= + rs->rs_export->exp_obd->obd_last_committed || + list_empty(&rs->rs_obd_list)) + ptlrpc_schedule_difficult_reply(rs); + + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + } + EXIT; +} + +#ifdef HAVE_SERVER_SUPPORT +/* + * Server's bulk completion callback + */ +void server_bulk_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_UNLINK || + (ptlrpc_is_bulk_put_source(desc->bd_type) && + ev->type == LNET_EVENT_ACK) || + (ptlrpc_is_bulk_get_sink(desc->bd_type) && + ev->type == LNET_EVENT_REPLY)); + + CDEBUG((ev->status == 0) ? D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + + LASSERT(desc->bd_refs > 0); + + if ((ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_REPLY) && + ev->status == 0) { + /* We heard back from the peer, so even if we get this + * before the SENT event (oh yes we can), we know we + * read/wrote the peer buffer and how much... */ + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = ev->sender; + } + + if (ev->status != 0) + desc->bd_failure = 1; + + if (ev->unlinked) { + desc->bd_refs--; + /* This is the last callback no matter what... */ + if (desc->bd_refs == 0) + wake_up(&desc->bd_waitq); + } + + spin_unlock(&desc->bd_lock); + EXIT; +} +#endif + +static void ptlrpc_master_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + void (*callback)(struct lnet_event *ev) = cbid->cbid_fn; + + /* Honestly, it's best to find out early. */ + LASSERT (cbid->cbid_arg != LP_POISON); + LASSERT (callback == request_out_callback || + callback == reply_in_callback || + callback == client_bulk_callback || + callback == request_in_callback || + callback == reply_out_callback +#ifdef HAVE_SERVER_SUPPORT + || callback == server_bulk_callback +#endif + ); + + callback (ev); +} + +int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + struct lnet_process_id *peer, lnet_nid_t *self) +{ + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; + + peer->pid = LNET_PID_LUSTRE; + + /* Choose the matching UUID that's closest */ + while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) { + if (peer->nid != LNET_NID_ANY && LNET_NIDADDR(peer->nid) == 0 && + LNET_NIDNET(dst_nid) != LNET_NIDNET(peer->nid)) + continue; + + dist = LNetDist(dst_nid, &src_nid, &order); + if (dist < 0) + continue; + + if (dist == 0) { /* local! use loopback LND */ + peer->nid = *self = LNET_NID_LO_0; + rc = 0; + break; + } + + if (rc < 0 || + dist < best_dist || + (dist == best_dist && order < best_order)) { + best_dist = dist; + best_order = order; + + peer->nid = dst_nid; + *self = src_nid; + rc = 0; + } + } + + CDEBUG(D_NET, "%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); + return rc; +} + +void ptlrpc_ni_fini(void) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + int rc; + int retries; + + /* Wait for the event queue to become idle since there may still be + * messages in flight with pending events (i.e. 
the fire-and-forget + * messages == client requests and "non-difficult" server + * replies */ + + for (retries = 0;; retries++) { + rc = LNetEQFree(ptlrpc_eq_h); + switch (rc) { + default: + LBUG(); + + case 0: + LNetNIFini(); + return; + + case -EBUSY: + if (retries != 0) + CWARN("Event queue still busy\n"); + + /* Wait for a bit */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL); + l_wait_event(waitq, 0, &lwi); + break; + } + } + /* notreached */ +} + +lnet_pid_t ptl_get_pid(void) +{ + return LNET_PID_LUSTRE; +} + +int ptlrpc_ni_init(void) +{ + int rc; + lnet_pid_t pid; + + pid = ptl_get_pid(); + CDEBUG(D_NET, "My pid is: %x\n", pid); + + /* We're not passing any limits yet... */ + rc = LNetNIInit(pid); + if (rc < 0) { + CDEBUG (D_NET, "Can't init network interface: %d\n", rc); + return rc; + } + + /* CAVEAT EMPTOR: how we process portals events is _radically_ + * different depending on... */ + /* kernel LNet calls our master callback when there are new event, + * because we are guaranteed to get every event via callback, + * so we just set EQ size to 0 to avoid overhread of serializing + * enqueue/dequeue operations in LNet. */ + rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h); + if (rc == 0) + return 0; + + CERROR ("Failed to allocate event queue: %d\n", rc); + LNetNIFini(); + + return rc; +} + + +int ptlrpc_init_portals(void) +{ + int rc = ptlrpc_ni_init(); + + if (rc != 0) { + CERROR("network initialisation failed\n"); + return rc; + } + rc = ptlrpcd_addref(); + if (rc == 0) + return 0; + + CERROR("rpcd initialisation failed\n"); + ptlrpc_ni_fini(); + return rc; +} + +void ptlrpc_exit_portals(void) +{ + ptlrpcd_decref(); + ptlrpc_ni_fini(); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h new file mode 100644 index 0000000000000..a5f203e215389 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h @@ -0,0 +1,185 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Somewhat simplified version of the gss api. + * + * Dug Song + * Andy Adamson + * Bruce Fields + * Copyright (c) 2000 The Regents of the University of Michigan + * + */ + +#ifndef __PTLRPC_GSS_GSS_API_H_ +#define __PTLRPC_GSS_GSS_API_H_ + +struct gss_api_mech; + +typedef int (*digest_hash)( + struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs); + +/* The mechanism-independent gss-api context: */ +struct gss_ctx { + struct gss_api_mech *mech_type; + void *internal_ctx_id; + digest_hash hash_func; +}; + +#define GSS_C_NO_BUFFER ((rawobj_t) 0) +#define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0) +#define GSS_C_NULL_OID ((rawobj_t) 0) + +/* + * gss-api prototypes; note that these are somewhat simplified versions of + * the prototypes specified in RFC 2744. 
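+ *
+ * Rough call order (illustrative only): a context is built from an
+ * imported token with lgss_import_sec_context(), per-message integrity
+ * uses lgss_get_mic()/lgss_verify_mic(), privacy uses lgss_wrap()/
+ * lgss_unwrap() (or the *_bulk variants for bulk descriptors), and the
+ * context is torn down with lgss_delete_sec_context().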
+ */ +__u32 lgss_import_sec_context( + rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx); +__u32 lgss_copy_reverse_context( + struct gss_ctx *ctx, + struct gss_ctx **ctx_new); +__u32 lgss_inquire_context( + struct gss_ctx *ctx, + time64_t *endtime); +__u32 lgss_get_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); +__u32 lgss_verify_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); +__u32 lgss_wrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); +__u32 lgss_unwrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); +__u32 lgss_prep_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); +__u32 lgss_wrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_unwrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_delete_sec_context( + struct gss_ctx **ctx); +int lgss_display( + struct gss_ctx *ctx, + char *buf, + int bufsize); + +struct subflavor_desc { + __u32 sf_subflavor; + __u32 sf_qop; + __u32 sf_service; + char *sf_name; +}; + +/* Each mechanism is described by the following struct: */ +struct gss_api_mech { + struct list_head gm_list; + struct module *gm_owner; + char *gm_name; + rawobj_t gm_oid; + atomic_t gm_count; + struct gss_api_ops *gm_ops; + int gm_sf_num; + struct subflavor_desc *gm_sfs; +}; + +/* and must provide the following operations: */ +struct gss_api_ops { + __u32 (*gss_import_sec_context)( + rawobj_t *input_token, + struct gss_ctx *ctx); + __u32 (*gss_copy_reverse_context)( + struct gss_ctx *ctx, + struct gss_ctx *ctx_new); + __u32 (*gss_inquire_context)( + struct gss_ctx *ctx, + time64_t *endtime); + __u32 (*gss_get_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); + __u32 (*gss_verify_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); + __u32 (*gss_wrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); + __u32 (*gss_unwrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); + __u32 (*gss_prep_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); + __u32 (*gss_wrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + __u32 (*gss_unwrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + void (*gss_delete_sec_context)( + void *ctx); + int (*gss_display)( + struct gss_ctx *ctx, + char *buf, + int bufsize); +}; + +int lgss_mech_register(struct gss_api_mech *mech); +void lgss_mech_unregister(struct gss_api_mech *mech); + +struct gss_api_mech * lgss_OID_to_mech(rawobj_t *oid); +struct gss_api_mech * lgss_name_to_mech(char *name); +struct gss_api_mech * lgss_subflavor_to_mech(__u32 subflavor); + +struct gss_api_mech * lgss_mech_get(struct gss_api_mech *mech); +void lgss_mech_put(struct gss_api_mech *mech); + +#endif /* __PTLRPC_GSS_GSS_API_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h new file mode 100644 index 0000000000000..1f535485bd0f3 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h @@ -0,0 +1,84 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * minimal asn1 for generic encoding/decoding of gss tokens + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#define SIZEOF_INT 4 + +/* from gssapi_err_generic.h */ +#define G_BAD_SERVICE_NAME (-2045022976L) +#define G_BAD_STRING_UID (-2045022975L) +#define G_NOUSER (-2045022974L) +#define G_VALIDATE_FAILED (-2045022973L) +#define G_BUFFER_ALLOC (-2045022972L) +#define G_BAD_MSG_CTX (-2045022971L) +#define G_WRONG_SIZE (-2045022970L) +#define G_BAD_USAGE (-2045022969L) +#define G_UNKNOWN_QOP (-2045022968L) +#define G_NO_HOSTNAME (-2045022967L) +#define G_BAD_HOSTNAME (-2045022966L) +#define G_WRONG_MECH (-2045022965L) +#define G_BAD_TOK_HEADER (-2045022964L) +#define G_BAD_DIRECTION (-2045022963L) +#define G_TOK_TRUNC (-2045022962L) +#define G_REFLECT (-2045022961L) +#define G_WRONG_TOKID (-2045022960L) + +#define g_OID_equal(o1,o2) \ + (((o1)->len == (o2)->len) && \ + (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) + +__u32 g_verify_token_header(rawobj_t *mech, + int *body_size, + unsigned char **buf_in, + int toksize); + +__u32 g_get_mech_oid(rawobj_t *mech, + rawobj_t *in_buf); + +int g_token_size(rawobj_t *mech, + unsigned int body_size); + +void g_make_token_header(rawobj_t *mech, + int body_size, + unsigned char **buf); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c new file mode 100644 index 0000000000000..041dd12dac593 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c @@ -0,0 +1,521 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *msg; + struct ptlrpc_bulk_sec_desc *bsd; + rawobj_t token; + __u32 maj; + int offset; + int rc; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + LASSERT(req->rq_reqbuf->lm_bufcount >= 3); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 1; + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(req->rq_reqbuf->lm_bufcount >= 4); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 2; + break; + case SPTLRPC_SVC_PRIV: + LASSERT(req->rq_clrbuf->lm_bufcount >= 2); + msg = req->rq_clrbuf; + offset = msg->lm_bufcount - 1; + break; + default: + LBUG(); + } + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + if (req->rq_bulk_read) { + /* + * bulk read: prepare receiving pages only for privacy mode. + */ + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + return gss_cli_prep_bulk(req, desc); + } else { + /* + * bulk write: sign or encrypt bulk pages. 
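+	 * In integrity mode a MIC over the bulk kiov pages is computed
+	 * into bsd->bsd_data; in privacy mode pages are first taken from
+	 * the shared encryption pool and the data is then wrapped by
+	 * lgss_wrap_bulk().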
+ */ + bsd->bsd_nob = desc->bd_nob; + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + /* integrity mode */ + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, + GET_KIOV(desc), + &token); + if (maj != GSS_S_COMPLETE) { + CWARN("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + } else { + /* privacy mode */ + if (desc->bd_iov_count == 0) + RETURN(0); + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + CERROR("bulk write: failed to allocate " + "encryption pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + CWARN("fail to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + } + } + + RETURN(0); +} + +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *rmsg, *vmsg; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int roff, voff; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 3); + voff = vmsg->lm_bufcount - 1; + + rmsg = req->rq_reqbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 3); + roff = rmsg->lm_bufcount - 1; /* last segment */ + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 4); + voff = vmsg->lm_bufcount - 2; + + rmsg = req->rq_reqbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 4); + roff = rmsg->lm_bufcount - 2; /* second last segment */ + break; + case SPTLRPC_SVC_PRIV: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 2); + voff = vmsg->lm_bufcount - 1; + + rmsg = req->rq_clrbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 2); + roff = rmsg->lm_bufcount - 1; /* last segment */ + break; + default: + LBUG(); + } + + bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); + bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv)); + LASSERT(bsdr && bsdv); + + if (bsdr->bsd_version != bsdv->bsd_version || + bsdr->bsd_type != bsdv->bsd_type || + bsdr->bsd_svc != bsdv->bsd_svc) { + CERROR("bulk security descriptor mismatch: " + "(%u,%u,%u) != (%u,%u,%u)\n", + bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc, + bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc); + RETURN(-EPROTO); + } + + LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + /* + * in privacy mode if return success, make sure bd_nob_transferred + * is the actual size of the clear text, otherwise upper layer + * may be surprised. + */ + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) { + CERROR("server reported bulk i/o failure\n"); + RETURN(-EIO); + } + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + desc->bd_nob_transferred = desc->bd_nob; + } else { + /* + * bulk read, upon return success, bd_nob_transferred is + * the size of plain text actually received. 
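+		 * In integrity mode the kiov lengths are clipped to the
+		 * number of bytes actually transferred before the MIC is
+		 * verified; in privacy mode bd_nob is taken from the
+		 * server-provided bsd_nob and the data is unwrapped before
+		 * bd_nob_transferred is updated.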
+ */ + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + int i, nob; + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (BD_GET_KIOV(desc, i).kiov_len + nob > + desc->bd_nob_transferred) { + BD_GET_KIOV(desc, i).kiov_len = + desc->bd_nob_transferred - nob; + } + nob += BD_GET_KIOV(desc, i).kiov_len; + } + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdv); + + maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, + GET_KIOV(desc), + &token); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to verify bulk read: %x\n", maj); + RETURN(-EACCES); + } + } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) { + desc->bd_nob = bsdv->bsd_nob; + if (desc->bd_nob == 0) + RETURN(0); + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdr); + + maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc, + &token, 1); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to decrypt bulk read: %x\n", + maj); + RETURN(-EACCES); + } + + desc->bd_nob_transferred = desc->bd_nob; + } + } + + RETURN(0); +} + +static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc, + struct gss_ctx *mechctx) +{ + int rc; + + if (desc->bd_iov_count == 0) + return 0; + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) + return rc; + + if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE) + return -EACCES; + + return 0; +} + +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + int rc; + ENTRY; + + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx); + if (rc) + CERROR("bulk read: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsd; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsd = grctx->src_reqbsd; + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx); + if (rc) + CERROR("bulk write: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - 
sizeof(*bsdr); + + maj = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, + GET_KIOV(desc), &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to verify bulk signature: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + if (bsdr->bsd_nob != desc->bd_nob) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("prepared nob %d doesn't match the actual " + "nob %d\n", desc->bd_nob, bsdr->bsd_nob); + RETURN(-EPROTO); + } + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed decrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + + /* mimic gss_cli_ctx_unwrap_bulk */ + desc->bd_nob_transferred = desc->bd_nob; + + break; + } + + RETURN(0); +} + +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, + GET_KIOV(desc), &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + bsdv->bsd_nob = desc->bd_nob; + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk read: failed to allocate encryption " + "pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 1); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + } + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c new file mode 100644 index 0000000000000..70d4711c67a96 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -0,0 +1,445 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_cli_upcall.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +/********************************************** + * gss context init/fini helper * + **********************************************/ + +static +int ctx_init_pack_request(struct obd_import *imp, + struct ptlrpc_request *req, + int lustre_srv, + uid_t uid, gid_t gid, + long token_size, + char __user *token) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct gss_sec *gsec; + struct gss_header *ghdr; + struct ptlrpc_user_desc *pud; + __u32 *p, size, offset = 2; + rawobj_t obj; + + LASSERT(msg->lm_bufcount <= 4); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + + /* gss hdr */ + ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr)); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; + ghdr->gh_seq = 0; + ghdr->gh_svc = SPTLRPC_SVC_NULL; + ghdr->gh_handle.len = 0; + + /* fix the user desc */ + if (req->rq_pack_udesc) { + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = uid; + pud->pud_gid = pud->pud_fsgid = gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + offset++; + } + + /* new clients are expected to set KCSUM flag */ + ghdr->gh_flags |= LUSTRE_GSS_PACK_KCSUM; + + /* security payload */ + p = lustre_msg_buf(msg, offset, 0); + size = msg->lm_buflens[offset]; + LASSERT(p); + + /* 1. lustre svc type */ + LASSERT(size > 4); + *p++ = cpu_to_le32(lustre_srv); + size -= 4; + + /* 2. target uuid */ + obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; + obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 3. reverse context handle. actually only needed by root user, + * but we send it anyway. */ + gsec = sec2gsec(req->rq_cli_ctx->cc_sec); + obj.len = sizeof(gsec->gs_rvs_hdl); + obj.data = (__u8 *) &gsec->gs_rvs_hdl; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 4. 
now the token */ + LASSERT(size >= (sizeof(__u32) + token_size)); + *p++ = cpu_to_le32(((__u32) token_size)); + if (copy_from_user(p, token, token_size)) { + CERROR("can't copy token\n"); + return -EFAULT; + } + size -= sizeof(__u32) + cfs_size_round4(token_size); + + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, + msg->lm_buflens[offset] - size, 0); + return 0; +} + +static +int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed, + char __user *outbuf, long outlen) +{ + struct gss_rep_header *ghdr; + __u32 obj_len, round_len; + __u32 status, effective = 0; + + if (msg->lm_bufcount != 3) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + return -EPROTO; + } + + ghdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("unable to extract gss reply header\n"); + return -EPROTO; + } + + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("invalid gss version %u\n", ghdr->gh_version); + return -EPROTO; + } + + if (outlen < (4 + 2) * 4 + cfs_size_round4(ghdr->gh_handle.len) + + cfs_size_round4(msg->lm_buflens[2])) { + CERROR("output buffer size %ld too small\n", outlen); + return -EFAULT; + } + + status = 0; + effective = 0; + + if (copy_to_user(outbuf, &status, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_major, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_minor, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_seqwin, 4)) + return -EFAULT; + outbuf += 4; + effective += 4 * 4; + + /* handle */ + obj_len = ghdr->gh_handle.len; + round_len = (obj_len + 3) & ~3; + if (copy_to_user(outbuf, &obj_len, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, (char *) ghdr->gh_handle.data, round_len)) + return -EFAULT; + outbuf += round_len; + effective += 4 + round_len; + + /* out token */ + obj_len = msg->lm_buflens[2]; + round_len = (obj_len + 3) & ~3; + if (copy_to_user(outbuf, &obj_len, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), round_len)) + return -EFAULT; + outbuf += round_len; + effective += 4 + round_len; + + return effective; +} + +/* XXX move to where lgssd could see */ +struct lgssd_ioctl_param { + int version; /* in */ + int secid; /* in */ + char __user *uuid; /* in */ + int lustre_svc; /* in */ + uid_t uid; /* in */ + gid_t gid; /* in */ + long send_token_size;/* in */ + char __user *send_token; /* in */ + long reply_buf_size; /* in */ + char __user *reply_buf; /* in */ + long status; /* out */ + long reply_length; /* out */ +}; + +int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count) +{ + struct obd_import *imp; + struct ptlrpc_request *req; + struct lgssd_ioctl_param param; + struct obd_device *obd; + char obdname[64]; + long lsize; + int rc; + + if (count != sizeof(param)) { + CERROR("ioctl size %lu, expect %lu, please check lgss_keyring " + "version\n", count, (unsigned long) sizeof(param)); + RETURN(-EINVAL); + } + if (copy_from_user(¶m, buffer, sizeof(param))) { + CERROR("failed copy data from lgssd\n"); + RETURN(-EFAULT); + } + + if (param.version != GSSD_INTERFACE_VERSION) { + CERROR("gssd interface version %d (expect %d)\n", + param.version, GSSD_INTERFACE_VERSION); + RETURN(-EINVAL); + } + + /* take name */ + if (strncpy_from_user(obdname, param.uuid, sizeof(obdname)) <= 0) { + CERROR("Invalid obdname pointer\n"); + RETURN(-EFAULT); + } + + obd = class_name2obd(obdname); + if (!obd) { + CERROR("no such obd %s\n", obdname); + RETURN(-EINVAL); + } + + if 
(unlikely(!obd->obd_set_up)) { + CERROR("obd %s not setup\n", obdname); + RETURN(-EINVAL); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + CERROR("obd %s has stopped\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME)) { + CERROR("obd %s is not a client device\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + spin_unlock(&obd->obd_dev_lock); + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + CERROR("obd %s: import has gone\n", obd->obd_name); + up_read(&obd->u.cli.cl_sem); + RETURN(-EINVAL); + } + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + + if (imp->imp_deactive) { + CERROR("import has been deactivated\n"); + class_import_put(imp); + RETURN(-EINVAL); + } + + req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION, + SEC_CTX_INIT); + if (req == NULL) { + param.status = -ENOMEM; + goto out_copy; + } + + if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) { + CWARN("original secid %d, now has changed to %d, " + "cancel this negotiation\n", param.secid, + req->rq_cli_ctx->cc_sec->ps_id); + param.status = -EINVAL; + goto out_copy; + } + + /* get token */ + rc = ctx_init_pack_request(imp, req, + param.lustre_svc, + param.uid, param.gid, + param.send_token_size, + param.send_token); + if (rc) { + param.status = rc; + goto out_copy; + } + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) { + /* If any _real_ denial be made, we expect server return + * -EACCES reply or return success but indicate gss error + * inside reply messsage. All other errors are treated as + * timeout, caller might try the negotiation repeatedly, + * leave recovery decisions to general ptlrpc layer. + * + * FIXME maybe some other error code shouldn't be treated + * as timeout. */ + param.status = rc; + if (rc != -EACCES) + param.status = -ETIMEDOUT; + goto out_copy; + } + + LASSERT(req->rq_repdata); + lsize = ctx_init_parse_reply(req->rq_repdata, + ptlrpc_rep_need_swab(req), + param.reply_buf, param.reply_buf_size); + if (lsize < 0) { + param.status = (int) lsize; + goto out_copy; + } + + param.status = 0; + param.reply_length = lsize; + +out_copy: + if (copy_to_user(buffer, ¶m, sizeof(param))) + rc = -EFAULT; + else + rc = 0; + + class_import_put(imp); + ptlrpc_req_finished(req); + RETURN(rc); +} + +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + struct obd_import *imp = ctx->cc_sec->ps_import; + struct ptlrpc_request *req; + struct ptlrpc_user_desc *pud; + int rc; + ENTRY; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) { + CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, " + "don't send destroy rpc\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + RETURN(0); + } + + might_sleep(); + + CWARN("%s ctx %p idx %#llx (%u->%s)\n", + sec_is_reverse(ctx->cc_sec) ? 
+ "server finishing reverse" : "client finishing forward", + ctx, gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY; + + req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX); + if (req == NULL) { + CWARN("ctx %p(%u): fail to prepare rpc, destroy locally\n", + ctx, ctx->cc_vcred.vc_uid); + GOTO(out, rc = -ENOMEM); + } + + rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI, + NULL, ctx); + if (rc) + GOTO(out_ref, rc); + + /* fix the user desc */ + if (req->rq_pack_udesc) { + /* we rely the fact that this request is in AUTH mode, + * and user_desc at offset 2. */ + pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid; + pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + } + + req->rq_phase = RQ_PHASE_RPC; + rc = ptl_send_rpc(req, 1); + if (rc) + CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc); + +out_ref: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int __init gss_init_cli_upcall(void) +{ + return 0; +} + +void gss_exit_cli_upcall(void) +{ +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c new file mode 100644 index 0000000000000..7be412d2d4a72 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c @@ -0,0 +1,464 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +#include "gss_internal.h" +#include "gss_crypto.h" + +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, + const int alg_mode) +{ + int rc; + + kb->kb_tfm = crypto_alloc_sync_skcipher(alg_name, alg_mode, 0); + if (IS_ERR(kb->kb_tfm)) { + rc = PTR_ERR(kb->kb_tfm); + kb->kb_tfm = NULL; + CERROR("failed to alloc tfm: %s, mode %d: rc = %d\n", alg_name, + alg_mode, rc); + return rc; + } + + rc = crypto_sync_skcipher_setkey(kb->kb_tfm, kb->kb_key.data, + kb->kb_key.len); + if (rc) { + CERROR("failed to set %s key, len %d, rc = %d\n", alg_name, + kb->kb_key.len, rc); + return rc; + } + + return 0; +} + +void gss_keyblock_free(struct gss_keyblock *kb) +{ + rawobj_free(&kb->kb_key); + if (kb->kb_tfm) + crypto_free_sync_skcipher(kb->kb_tfm); +} + +int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb) +{ + return rawobj_dup(&new->kb_key, &kb->kb_key); +} + +int gss_get_bytes(char **ptr, const char *end, void *res, size_t len) +{ + char *p, *q; + p = *ptr; + q = p + len; + if (q > end || q < p) + return -EINVAL; + memcpy(res, p, len); + *ptr = q; + return 0; +} + +int gss_get_rawobj(char **ptr, const char *end, rawobj_t *res) +{ + char *p, *q; + __u32 len; + + p = *ptr; + if (gss_get_bytes(&p, end, &len, sizeof(len))) + return -EINVAL; + + q = p + len; + if (q > end || q < p) + return -EINVAL; + + /* Support empty objects */ + if (len != 0) { + OBD_ALLOC_LARGE(res->data, len); + if (!res->data) + return -ENOMEM; + } else { + res->len = len; + res->data = NULL; + return 0; + } + + res->len = len; + memcpy(res->data, p, len); + *ptr = q; + return 0; +} + +int gss_get_keyblock(char **ptr, const char *end, + struct gss_keyblock *kb, __u32 keysize) +{ + char *buf; + int rc; + + OBD_ALLOC_LARGE(buf, keysize); + if (buf == NULL) + return -ENOMEM; + + rc = gss_get_bytes(ptr, end, buf, keysize); + if (rc) { + OBD_FREE_LARGE(buf, keysize); + return rc; + } + + kb->kb_key.len = keysize; + kb->kb_key.data = buf; + return 0; +} + +/* + * Should be used for buffers allocated with k/vmalloc(). + * + * Dispose of @sgt with gss_teardown_sgtable(). + * + * @prealloc_sg is to avoid memory allocation inside sg_alloc_table() + * in cases where a single sg is sufficient. No attempt to reduce the + * number of sgs by squeezing physically contiguous pages together is + * made though, for simplicity. + * + * This function is copied from the ceph filesystem code. 
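+ * A vmalloc'ed buffer is split into one sg entry per page, while a
+ * physically contiguous buffer is described by the single preallocated
+ * sg entry; a zero-length buffer is rejected with -EINVAL.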
+ */ +int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len) +{ + struct scatterlist *sg; + const bool is_vmalloc = is_vmalloc_addr(buf); + unsigned int off = offset_in_page(buf); + unsigned int chunk_cnt = 1; + unsigned int chunk_len = PAGE_ALIGN(off + buf_len); + int i; + int rc; + + if (buf_len == 0) { + memset(sgt, 0, sizeof(*sgt)); + return -EINVAL; + } + + if (is_vmalloc) { + chunk_cnt = chunk_len >> PAGE_SHIFT; + chunk_len = PAGE_SIZE; + } + + if (chunk_cnt > 1) { + rc = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS); + if (rc) + return rc; + } else { + WARN_ON_ONCE(chunk_cnt != 1); + sg_init_table(prealloc_sg, 1); + sgt->sgl = prealloc_sg; + sgt->nents = sgt->orig_nents = 1; + } + + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) { + struct page *page; + unsigned int len = min(chunk_len - off, buf_len); + + if (is_vmalloc) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + sg_set_page(sg, page, len, off); + + off = 0; + buf += len; + buf_len -= len; + } + + WARN_ON_ONCE(buf_len != 0); + + return 0; +} + +void gss_teardown_sgtable(struct sg_table *sgt) +{ + if (sgt->orig_nents > 1) + sg_free_table(sgt); +} + +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length) +{ + struct scatterlist sg; + struct sg_table sg_out; + __u8 local_iv[16] = {0}; + __u32 ret = -EINVAL; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(tfm); + + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) { + CERROR("output length %zu mismatch blocksize %d\n", + length, crypto_sync_skcipher_blocksize(tfm)); + goto out; + } + + if (crypto_sync_skcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { + CERROR("iv size too large %d\n", + crypto_sync_skcipher_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); + + if (in != out) + memmove(out, in, length); + + ret = gss_setup_sgtable(&sg_out, &sg, out, length); + if (ret != 0) + goto out; + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, length, local_iv); + + if (decrypt) + ret = crypto_skcipher_decrypt_iv(req, &sg, &sg, length); + else + ret = crypto_skcipher_encrypt_iv(req, &sg, &sg, length); + + skcipher_request_zero(req); + gss_teardown_sgtable(&sg_out); +out: + return ret; +} + +int gss_digest_hash(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs) +{ + struct scatterlist sg[1]; + struct sg_table sgt; + int rc = 0; + int i; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, + iovs[i].kiov_offset); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].kiov_len); + rc = crypto_ahash_update(req); + if (rc) + return rc; + } + + if (hdr) { + rc = gss_setup_sgtable(&sgt, sg, hdr->data, hdr->len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, hdr->len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + return rc; +} + +int gss_digest_hash_compat(struct ahash_request 
*req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs) +{ + struct scatterlist sg[1]; + struct sg_table sgt; + int rc = 0; + int i; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, + iovs[i].kiov_offset); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].kiov_len); + rc = crypto_ahash_update(req); + if (rc) + return rc; + } + + if (hdr) { + rc = gss_setup_sgtable(&sgt, sg, &(hdr->len), sizeof(hdr->len)); + if (rc) + return rc; + + ahash_request_set_crypt(req, sg, NULL, sizeof(hdr->len)); + rc = crypto_ahash_update(req); + gss_teardown_sgtable(&sgt); + if (rc) + return rc; + } + + return rc; +} + +int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) +{ + int padding; + + padding = (blocksize - (msg->len & (blocksize - 1))) & + (blocksize - 1); + if (!padding) + return 0; + + if (msg->len + padding > msg_buflen) { + CERROR("bufsize %u too small: datalen %u, padding %u\n", + msg_buflen, msg->len, padding); + return -EINVAL; + } + + memset(msg->data + msg->len, padding, padding); + msg->len += padding; + return 0; +} + +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, + int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, + int enc) +{ + struct scatterlist src; + struct scatterlist dst; + struct sg_table sg_dst; + struct sg_table sg_src; + __u8 *buf; + __u32 datalen = 0; + int i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + ENTRY; + + buf = outobj->data; + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < inobj_cnt; i++) { + LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len); + + rc = gss_setup_sgtable(&sg_src, &src, inobjs[i].data, + inobjs[i].len); + if (rc != 0) + RETURN(rc); + + rc = gss_setup_sgtable(&sg_dst, &dst, buf, + outobj->len - datalen); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + RETURN(rc); + } + + skcipher_request_set_crypt(req, &src, &dst, src.length, iv); + if (!iv) + skcipher_request_set_crypt_iv(req); + + if (enc) + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, + src.length); + else + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, + src.length); + + gss_teardown_sgtable(&sg_src); + gss_teardown_sgtable(&sg_dst); + + if (rc) { + CERROR("encrypt error %d\n", rc); + skcipher_request_zero(req); + RETURN(rc); + } + + datalen += inobjs[i].len; + buf += inobjs[i].len; + } + skcipher_request_zero(req); + + outobj->len = datalen; + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h new file mode 100644 index 0000000000000..7ed680a4c8430 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h @@ -0,0 +1,99 @@ +#ifndef PTLRPC_GSS_CRYPTO_H +#define PTLRPC_GSS_CRYPTO_H + +#include + +#include "gss_internal.h" + +#include + +/* + * linux v4.19-rc2-66-gb350bee5ea0f + * crypto: skcipher - Introduce crypto_sync_skcipher + * + * crypto_sync_skcipher will replace crypto_blkcipher so start using + * crypto_sync_skcipher and provide wrappers for older kernels + */ +#ifdef SYNC_SKCIPHER_REQUEST_ON_STACK + +#define 
crypto_skcipher_encrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_encrypt((desc)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_decrypt((desc)) + +#define skcipher_request_set_crypt_iv(d) + +#else /* ! SYNC_SKCIPHER_REQUEST_ON_STACK */ + +#define crypto_sync_skcipher crypto_blkcipher + +#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \ + struct blkcipher_desc __##name##_obj, *name = (void *)&__##name##_obj + +#define skcipher_request_set_sync_tfm(d, _tfm) \ + do { (d)->tfm = _tfm; } while (0) + +#define skcipher_request_set_callback(d, f, c, data) \ + do { (d)->flags = f; } while (0) + +#define skcipher_request_set_crypt(d, src, dst, cryptlen, iv) \ + do { (d)->info = iv; } while (0) + +#define skcipher_request_set_crypt_iv(d) \ + do { (d)->info = crypto_blkcipher_crt((d)->tfm)->iv; } while (0) + +#define crypto_sync_skcipher_blocksize(tfm) \ + crypto_blkcipher_blocksize((tfm)) + +#define crypto_sync_skcipher_setkey(tfm, key, keylen) \ + crypto_blkcipher_setkey((tfm), (key), (keylen)) + +#define crypto_alloc_sync_skcipher(name, type, mask) \ + crypto_alloc_blkcipher((name), (type), (mask)) + +#define crypto_free_sync_skcipher(tfm) \ + crypto_free_blkcipher((tfm)) + +#define crypto_sync_skcipher_ivsize(tfm) \ + crypto_blkcipher_ivsize((tfm)) + +#define crypto_skcipher_encrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_encrypt_iv((desc), (dst), (src), (len)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_decrypt_iv((desc), (dst), (src), (len)) + +#define skcipher_request_zero(req) /* nop */ + +#endif /* SYNC_SKCIPHER_REQUEST_ON_STACK */ + +struct gss_keyblock { + rawobj_t kb_key; + struct crypto_sync_skcipher *kb_tfm; +}; + +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, + const int alg_mode); +void gss_keyblock_free(struct gss_keyblock *kb); +int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb); +int gss_get_bytes(char **ptr, const char *end, void *res, size_t len); +int gss_get_rawobj(char **ptr, const char *end, rawobj_t *res); +int gss_get_keyblock(char **ptr, const char *end, struct gss_keyblock *kb, + __u32 keysize); +int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len); +void gss_teardown_sgtable(struct sg_table *sgt); +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length); +int gss_digest_hash(struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs); +int gss_digest_hash_compat(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs); +int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize); +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, + int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, + int enc); + +#endif /* PTLRPC_GSS_CRYPTO_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h new file mode 100644 index 0000000000000..bcf81304ff750 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h @@ -0,0 +1,193 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h + * + * Copyright (c) 2002 The Regents of the University of Michigan. 
+ * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __PTLRPC_GSS_GSS_ERR_H_ +#define __PTLRPC_GSS_GSS_ERR_H_ + +typedef unsigned int OM_uint32; + +/* + * Flag bits for context-level services. + */ +#define GSS_C_DELEG_FLAG (1) +#define GSS_C_MUTUAL_FLAG (2) +#define GSS_C_REPLAY_FLAG (4) +#define GSS_C_SEQUENCE_FLAG (8) +#define GSS_C_CONF_FLAG (16) +#define GSS_C_INTEG_FLAG (32) +#define GSS_C_ANON_FLAG (64) +#define GSS_C_PROT_READY_FLAG (128) +#define GSS_C_TRANS_FLAG (256) + +/* + * Credential usage options + */ +#define GSS_C_BOTH (0) +#define GSS_C_INITIATE (1) +#define GSS_C_ACCEPT (2) + +/* + * Status code types for gss_display_status + */ +#define GSS_C_GSS_CODE (1) +#define GSS_C_MECH_CODE (2) + + +/* + * Define the default Quality of Protection for per-message services. Note + * that an implementation that offers multiple levels of QOP may either reserve + * a value (for example zero, as assumed here) to mean "default protection", or + * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit + * QOP value. However a value of 0 should always be interpreted by a GSSAPI + * implementation as a request for the default protection level. + */ +#define GSS_C_QOP_DEFAULT (0) + +/* + * Expiration time of 2^32-1 seconds means infinite lifetime for a + * credential or security context + */ +#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful) + + +/* Major status codes */ + +#define GSS_S_COMPLETE (0) + +/* + * Some "helper" definitions to make the status code macros obvious. + */ +#define GSS_C_CALLING_ERROR_OFFSET (24) +#define GSS_C_ROUTINE_ERROR_OFFSET (16) +#define GSS_C_SUPPLEMENTARY_OFFSET (0) +#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) + +/* + * The macros that test status codes for error conditions. Note that the + * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now + * evaluates its argument only once. 
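+ *
+ * A major status word packs three fields: calling errors in bits
+ * 24-31, routine errors in bits 16-23, and supplementary information
+ * in the low 16 bits, matching the *_OFFSET and *_MASK definitions
+ * above.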
+ */ +#define GSS_CALLING_ERROR(x) \ + ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) +#define GSS_ROUTINE_ERROR(x) \ + ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) +#define GSS_SUPPLEMENTARY_INFO(x) \ + ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) +#define GSS_ERROR(x) \ + ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ + (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) + +/* + * Now the actual status code definitions + */ + +/* + * Calling errors: + */ +#define GSS_S_CALL_INACCESSIBLE_READ \ + (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_INACCESSIBLE_WRITE \ + (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_BAD_STRUCTURE \ + (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET) + +/* + * Routine errors: + */ +#define GSS_S_BAD_MECH \ + (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAME \ + (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAMETYPE \ + (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_BINDINGS \ + (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_STATUS \ + (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_SIG \ + (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CRED \ + (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CONTEXT \ + (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_TOKEN \ + (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_CREDENTIAL \ + (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CREDENTIALS_EXPIRED \ + (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CONTEXT_EXPIRED \ + (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_FAILURE \ + (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_QOP \ + (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAUTHORIZED \ + (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAVAILABLE \ + (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DUPLICATE_ELEMENT \ + (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NAME_NOT_MN \ + (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET) + +/* + * Supplementary info bits: + */ +#define GSS_S_CONTINUE_NEEDED (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0)) +#define GSS_S_DUPLICATE_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1)) +#define GSS_S_OLD_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2)) +#define GSS_S_UNSEQ_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3)) +#define GSS_S_GAP_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4)) + +/* XXXX these are not part of the GSSAPI C bindings! 
(but should be) */ + +#define GSS_CALLING_ERROR_FIELD(x) \ + (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK) +#define GSS_ROUTINE_ERROR_FIELD(x) \ + (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK) +#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \ + (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK) + +/* XXXX This is a necessary evil until the spec is fixed */ +#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE + +#endif /* __PTLRPC_GSS_GSS_ERR_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c new file mode 100644 index 0000000000000..23506f89d67c2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c @@ -0,0 +1,284 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" +#include "gss_asn1.h" + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. 
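+   (A single DER length octet encodes lengths up to 127, which is why
+   g_make_token_header() below writes mech->len as one byte.)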
*/ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + compile-time constant string (assume 1 byte) + compile-time constant string + the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static +int der_length_size(int length) +{ + if (length < (1 << 7)) + return 1; + else if (length < (1 << 8)) + return 2; +#if (SIZEOF_INT == 2) + else + return 3; +#else + else if (length < (1 << 16)) + return 3; + else if (length < (1 << 24)) + return 4; + else + return 5; +#endif +} + +static +void der_write_length(unsigned char **buf, int length) +{ + if (length < (1 << 7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length) + 127); +#if (SIZEOF_INT > 2) + if (length >= (1 << 24)) + *(*buf)++ = (unsigned char) (length >> 24); + if (length >= (1 << 16)) + *(*buf)++ = (unsigned char) ((length >> 16) & 0xff); +#endif + if (length >= (1 << 8)) + *(*buf)++ = (unsigned char) ((length >> 8) & 0xff); + *(*buf)++ = (unsigned char) (length & 0xff); + } +} + +/* + * returns decoded length, or < 0 on failure. Advances buf and + * decrements bufsize + */ +static +int der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return -1; + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize) - 1)) + return -1; + if (sf > SIZEOF_INT) + return -1; + ret = 0; + for (; sf; sf--) { + ret = (ret << 8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return ret; +} + +/* + * returns the length of a token, given the mech oid and the body size + */ +int g_token_size(rawobj_t *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 4 + (int) mech->len; /* NEED overflow check */ + return (1 + der_length_size(body_size) + body_size); +} + +/* + * fills in a buffer with the token header. The buffer is assumed to + * be the right size. buf is advanced past the token header + */ +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 4 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. 
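+ * The header is rejected unless the outer tag is 0x60, the DER length
+ * matches the remaining token size, the OID tag is 0x06, and at least
+ * two token-type bytes follow the OID; a mismatched OID yields
+ * G_WRONG_MECH instead.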
+ */ +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + rawobj_t toid; + int ret = 0; + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return(G_BAD_TOK_HEADER); + + if (seqsize != toksize) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + toid.len = *buf++; + + if ((toksize -= toid.len) < 0) + return (G_BAD_TOK_HEADER); + toid.data = buf; + buf += toid.len; + + if (!g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more + * important to return G_BAD_TOK_HEADER if the token header is + * in fact bad + */ + if ((toksize -= 2) < 0) + return (G_BAD_TOK_HEADER); + + if (ret) + return (ret); + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return (ret); +} + +/* + * Given a buffer containing a token, returns a copy of the mech oid in + * the parameter mech. + */ +__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf) +{ + unsigned char *buf = in_buf->data; + int len = in_buf->len; + int ret = 0; + int seqsize; + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &len)) < 0) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + mech->len = *buf++; + + if ((len -= mech->len) < 0) + return (G_BAD_TOK_HEADER); + OBD_ALLOC_LARGE(mech->data, mech->len); + if (!mech->data) + return (G_BUFFER_ALLOC); + memcpy(mech->data, buf, mech->len); + + return ret; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h new file mode 100644 index 0000000000000..c49a54021688f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h @@ -0,0 +1,576 @@ +/* + * Modified from NFSv4 project for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. 
+ * + * Author: Eric Mei + */ + +#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_ +#define __PTLRPC_GSS_GSS_INTERNAL_H_ + +#include +#include +#include + +/* + * rawobj stuff + */ +typedef struct netobj_s { + __u32 len; + __u8 data[0]; +} netobj_t; + +#define NETOBJ_EMPTY ((netobj_t) { 0 }) + +typedef struct rawobj_s { + __u32 len; + __u8 *data; +} rawobj_t; + +#define RAWOBJ_EMPTY ((rawobj_t) { 0, NULL }) + +typedef struct rawobj_buf_s { + __u32 dataoff; + __u32 datalen; + __u32 buflen; + __u8 *buf; +} rawobj_buf_t; + +int rawobj_empty(rawobj_t *obj); +int rawobj_alloc(rawobj_t *obj, char *buf, int len); +void rawobj_free(rawobj_t *obj); +int rawobj_equal(rawobj_t *a, rawobj_t *b); +int rawobj_dup(rawobj_t *dest, rawobj_t *src); +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj); +int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj); + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen); + +/* + * several timeout values. client refresh upcall timeout we using + * default in pipefs implemnetation. + */ +#define __TIMEOUT_DELTA (10) + +#define GSS_SECINIT_RPC_TIMEOUT \ + (obd_timeout < __TIMEOUT_DELTA ? \ + __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA) + +#define GSS_SECFINI_RPC_TIMEOUT (__TIMEOUT_DELTA) +#define GSS_SECSVC_UPCALL_TIMEOUT (GSS_SECINIT_RPC_TIMEOUT) + +/* + * default gc interval + */ +#define GSS_GC_INTERVAL (60 * 60) /* 60 minutes */ + +static inline time64_t gss_round_ctx_expiry(time64_t expiry, + unsigned long sec_flags) +{ + if (sec_flags & PTLRPC_SEC_FL_REVERSE) + return expiry; + + if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) + return expiry - __TIMEOUT_DELTA; + + return expiry; +} + +/* + * Max encryption element in block cipher algorithms. 
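+ * The value of 16 bytes matches the AES block size; DES-family
+ * ciphers use 8-byte blocks.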
+ */ +#define GSS_MAX_CIPHER_BLOCK (16) + +/* + * XXX make it visible of kernel and lgssd/lsvcgssd + */ +enum { + GSSD_INTERFACE_VERSION_V1 = 1, + GSSD_INTERFACE_VERSION_V2 = 2, + GSSD_INTERFACE_VERSION = GSSD_INTERFACE_VERSION_V2, +}; + +#define PTLRPC_GSS_VERSION (1) + + +enum ptlrpc_gss_proc { + PTLRPC_GSS_PROC_DATA = 0, + PTLRPC_GSS_PROC_INIT = 1, + PTLRPC_GSS_PROC_CONTINUE_INIT = 2, + PTLRPC_GSS_PROC_DESTROY = 3, + PTLRPC_GSS_PROC_ERR = 4, +}; + +enum ptlrpc_gss_tgt { + LUSTRE_GSS_TGT_MGS = 0, + LUSTRE_GSS_TGT_MDS = 1, + LUSTRE_GSS_TGT_OSS = 2, +}; + +enum ptlrpc_gss_header_flags { + LUSTRE_GSS_PACK_BULK = 1, + LUSTRE_GSS_PACK_USER = 2, + LUSTRE_GSS_PACK_KCSUM = 4, +}; + +static inline +__u32 import_to_gss_svc(struct obd_import *imp) +{ + int cl_sp_to = LUSTRE_SP_ANY; + + if (imp->imp_obd) + cl_sp_to = imp->imp_obd->u.cli.cl_sp_to; + + switch (cl_sp_to) { + case LUSTRE_SP_MDT: + return LUSTRE_GSS_TGT_MDS; + case LUSTRE_SP_OST: + return LUSTRE_GSS_TGT_OSS; + case LUSTRE_SP_MGC: + case LUSTRE_SP_MGS: + return LUSTRE_GSS_TGT_MGS; + case LUSTRE_SP_CLI: + case LUSTRE_SP_ANY: + default: + return 0; + } +} + +/* + * following 3 header must have the same size and offset + */ +struct gss_header { + __u8 gh_version; /* gss version */ + __u8 gh_sp; /* sec part */ + __u16 gh_pad0; + __u32 gh_flags; /* wrap flags */ + __u32 gh_proc; /* proc */ + __u32 gh_seq; /* sequence */ + __u32 gh_svc; /* service */ + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; /* context handle */ +}; + +struct gss_rep_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_seqwin; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +struct gss_err_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +/* + * part of wire context information send from client which be saved and + * used later by server. + */ +struct gss_wire_ctx { + __u32 gw_flags; + __u32 gw_proc; + __u32 gw_seq; + __u32 gw_svc; + rawobj_t gw_handle; +}; + +#define PTLRPC_GSS_MAX_HANDLE_SIZE (8) +#define PTLRPC_GSS_HEADER_SIZE (sizeof(struct gss_header) + \ + PTLRPC_GSS_MAX_HANDLE_SIZE) + + +static inline __u64 gss_handle_to_u64(rawobj_t *handle) +{ + if (handle->len != PTLRPC_GSS_MAX_HANDLE_SIZE) + return -1; + return *((__u64 *) handle->data); +} + +#define GSS_SEQ_WIN (2048) +#define GSS_SEQ_WIN_MAIN GSS_SEQ_WIN +#define GSS_SEQ_WIN_BACK (128) +#define GSS_SEQ_REPACK_THRESHOLD (GSS_SEQ_WIN_MAIN / 2 + \ + GSS_SEQ_WIN_MAIN / 4) + +struct gss_svc_seq_data { + spinlock_t ssd_lock; + /* + * highest sequence number seen so far, for main and back window + */ + __u32 ssd_max_main; + __u32 ssd_max_back; + /* + * main and back window + * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit + * of ssd_win is nonzero iff sequence number i has been seen already. 
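+	 * The bitmaps are consulted by gss_check_seq_num() to detect
+	 * replayed sequence numbers; the smaller back window is meant to
+	 * let requests that arrive far out of order still be checked
+	 * rather than being dropped outright.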
+ */ + unsigned long ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG]; + unsigned long ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG]; +}; + +struct gss_svc_ctx { + struct gss_ctx *gsc_mechctx; + struct gss_svc_seq_data gsc_seqdata; + rawobj_t gsc_rvs_hdl; + __u32 gsc_rvs_seq; + uid_t gsc_uid; + gid_t gsc_gid; + uid_t gsc_mapped_uid; + unsigned int gsc_usr_root:1, + gsc_usr_mds:1, + gsc_usr_oss:1, + gsc_remote:1, + gsc_reverse:1; +}; + +struct gss_svc_reqctx { + struct ptlrpc_svc_ctx src_base; + /* + * context + */ + struct gss_wire_ctx src_wirectx; + struct gss_svc_ctx *src_ctx; + /* + * record place of bulk_sec_desc in request/reply buffer + */ + struct ptlrpc_bulk_sec_desc *src_reqbsd; + int src_reqbsd_size; + struct ptlrpc_bulk_sec_desc *src_repbsd; + int src_repbsd_size; + /* + * flags + */ + unsigned int src_init:1, + src_init_continue:1, + src_err_notify:1; + int src_reserve_len; +}; + +struct gss_cli_ctx { + struct ptlrpc_cli_ctx gc_base; + __u32 gc_flavor; + __u32 gc_proc; + __u32 gc_win; + atomic_t gc_seq; + rawobj_t gc_handle; + struct gss_ctx *gc_mechctx; + /* handle for the buddy svc ctx */ + rawobj_t gc_svc_handle; +}; + +struct gss_cli_ctx_keyring { + struct gss_cli_ctx gck_base; + struct key *gck_key; + struct timer_list gck_timer; +}; + +struct gss_sec { + struct ptlrpc_sec gs_base; + struct gss_api_mech *gs_mech; + spinlock_t gs_lock; + __u64 gs_rvs_hdl; +}; + +struct gss_sec_pipefs { + struct gss_sec gsp_base; + int gsp_chash_size; /* must be 2^n */ + struct hlist_head gsp_chash[0]; +}; + +/* + * FIXME cleanup the keyring upcall mutexes + */ +#define HAVE_KEYRING_UPCALL_SERIALIZED 1 + +struct gss_sec_keyring { + struct gss_sec gsk_base; + /* + * all contexts listed here. access is protected by sec spinlock. + */ + struct hlist_head gsk_clist; + /* + * specially point to root ctx (only one at a time). access is + * protected by sec spinlock. + */ + struct ptlrpc_cli_ctx *gsk_root_ctx; + /* + * specially serialize upcalls for root context. + */ + struct mutex gsk_root_uc_lock; + +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + struct mutex gsk_uc_lock; /* serialize upcalls */ +#endif +}; + +static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx, struct gss_cli_ctx, gc_base); +} + +static inline +struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx2gctx(ctx), + struct gss_cli_ctx_keyring, gck_base); +} + +static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct gss_sec, gs_base); +} + +static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_pipefs, gsp_base); +} + +static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base); +} + +#ifdef HAVE_CACHE_HASH_SPINLOCK +# define sunrpc_cache_lookup(c, i, h) sunrpc_cache_lookup_rcu((c), (i), (h)) +# define cache_read_lock(cdetail) spin_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) spin_unlock(&((cdetail)->hash_lock)) +#else /* ! HAVE_CACHE_HASH_SPINLOCK */ +# define cache_read_lock(cdetail) read_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) read_unlock(&((cdetail)->hash_lock)) +#endif + +#define GSS_CTX_INIT_MAX_LEN (1024) + +/* + * This only guaranteed be enough for current krb5 des-cbc-crc . We might + * adjust this when new enc type or mech added in. 
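+ * The prefix/suffix reserve room around the clear text for the extra bytes
+ * (confounder, padding, checksum) that the privacy transform adds.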
+ */ +#define GSS_PRIVBUF_PREFIX_LEN (32) +#define GSS_PRIVBUF_SUFFIX_LEN (32) + +static inline +struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return container_of(ctx, struct gss_svc_reqctx, src_base); +} + +static inline +struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return gss_svc_ctx2reqctx(ctx)->src_ctx; +} + +/* sec_gss.c */ +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred); +int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); + +int gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int segment, int newsize); + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, + struct ptlrpc_request *req); +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx); +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int gss_svc_authorize(struct ptlrpc_request *req); +void gss_svc_free_rs(struct ptlrpc_reply_state *rs); +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx); + +int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx); + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx); + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed); +netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment); + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx); +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor); +int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set); + +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf); +void gss_sec_destroy_common(struct gss_sec *gsec); +void gss_sec_kill(struct ptlrpc_sec *sec); + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred); +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize); + +/* gss_keyring.c */ +#ifndef HAVE_GSS_KEYRING +static inline int __init gss_init_keyring(void) { return 0; } +static inline void __exit gss_exit_keyring(void) { return; } +#else +int __init gss_init_keyring(void); +void __exit gss_exit_keyring(void); +#endif + +/* gss_pipefs.c */ +#ifndef HAVE_GSS_PIPEFS +static inline int __init gss_init_pipefs(void) { return 0; } +static inline void __exit gss_exit_pipefs(void) { return; } +#else +int __init gss_init_pipefs(void); +void __exit gss_exit_pipefs(void); +#endif + +/* gss_bulk.c */ +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct 
ptlrpc_bulk_desc *desc); +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* gss_generic_token.c */ +int g_token_size(rawobj_t *mech, unsigned int body_size); +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf); +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize); + + +/* gss_cli_upcall.c */ +int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count); +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx); + +int __init gss_init_cli_upcall(void); +void gss_exit_cli_upcall(void); + +/* gss_svc_upcall.c */ +__u64 gss_get_next_ctx_index(void); +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx); +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle); +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx); +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq); +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token); +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw); +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx); +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx); + +int __init gss_init_svc_upcall(void); +void gss_exit_svc_upcall(void); +extern unsigned int krb5_allow_old_client_csum; + +/* lproc_gss.c */ +void gss_stat_oos_record_cli(int behind); +void gss_stat_oos_record_svc(int phase, int replay); + +int __init gss_init_lproc(void); +void gss_exit_lproc(void); + +/* gss_null_mech.c */ +int __init init_null_module(void); +void cleanup_null_module(void); + +/* gss_krb5_mech.c */ +int __init init_kerberos_module(void); +void cleanup_kerberos_module(void); + +/* gss_sk_mech.c */ +#ifdef HAVE_OPENSSL_SSK +int __init init_sk_module(void); +void cleanup_sk_module(void); +#else +static inline int init_sk_module(void) { return 0; } +static inline void cleanup_sk_module(void) { return; } +#endif /* HAVE_OPENSSL_SSK */ + +/* debug */ +static inline +void __dbg_memdump(char *name, void *ptr, int size) +{ + char *buf, *p = (char *) ptr; + int bufsize = size * 2 + 1, i; + + OBD_ALLOC(buf, bufsize); + if (!buf) { + CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize); + return; + } + + for (i = 0; i < size; i++) + sprintf(&buf[i+i], "%02x", (__u8) p[i]); + buf[size + size] = '\0'; + LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, buf); + OBD_FREE(buf, bufsize); +} + +static inline unsigned int ll_read_key_usage(struct key *key) +{ +#ifdef HAVE_KEY_USAGE_REFCOUNT + return refcount_read(&key->usage); +#else + return atomic_read(&key->usage); +#endif +} + +#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c new file mode 100644 index 0000000000000..845269c8acec3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c @@ -0,0 +1,1635 
@@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_keyring.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#ifdef HAVE_GET_REQUEST_KEY_AUTH +#include +#endif + +static struct ptlrpc_sec_policy gss_policy_keyring; +static struct ptlrpc_ctx_ops gss_keyring_ctxops; +static struct key_type gss_key_type; + +static int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx); + +/* + * the timeout is only for the case that upcall child process die abnormally. + * in any other cases it should finally update kernel key. + * + * FIXME we'd better to incorporate the client & server side upcall timeouts + * into the framework of Adaptive Timeouts, but we need to figure out how to + * make sure that kernel knows the upcall processes is in-progress or died + * unexpectedly. 
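+ * Until then the upcall simply gets twice obd_timeout; after that the
+ * context is expired and its key revoked (see ctx_upcall_timeout_kr()).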
+ */ +#define KEYRING_UPCALL_TIMEOUT (obd_timeout + obd_timeout) + +/**************************************** + * internal helpers * + ****************************************/ + +static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_lock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_unlock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void key_revoke_locked(struct key *key) +{ + set_bit(KEY_FLAG_REVOKED, &key->flags); +} + +static void ctx_upcall_timeout_kr(cfs_timer_cb_arg_t data) +{ + struct gss_cli_ctx_keyring *gctx_kr = cfs_from_timer(gctx_kr, + data, gck_timer); + struct ptlrpc_cli_ctx *ctx = &(gctx_kr->gck_base.gc_base); + struct key *key = gctx_kr->gck_key; + + CWARN("ctx %p, key %p\n", ctx, key); + + LASSERT(key); + + cli_ctx_expire(ctx); + key_revoke_locked(key); +} + +static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, time64_t timeout) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = &gctx_kr->gck_timer; + + LASSERT(timer); + + CDEBUG(D_SEC, "ctx %p: start timer %llds\n", ctx, timeout); + + cfs_timer_setup(timer, ctx_upcall_timeout_kr, + (unsigned long)gctx_kr, 0); + timer->expires = cfs_time_seconds(timeout) + jiffies; + add_timer(timer); +} + +/* + * caller should make sure no race with other threads + */ +static +void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = &gctx_kr->gck_timer; + + CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key); + + del_singleshot_timer_sync(timer); +} + +static +struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx_keyring *gctx_kr; + + OBD_ALLOC_PTR(gctx_kr); + if (gctx_kr == NULL) + return NULL; + + cfs_timer_setup(&gctx_kr->gck_timer, NULL, 0, 0); + + ctx = &gctx_kr->gck_base.gc_base; + + if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { + OBD_FREE_PTR(gctx_kr); + return NULL; + } + + ctx->cc_expire = ktime_get_real_seconds() + KEYRING_UPCALL_TIMEOUT; + clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); /* for the caller */ + + return ctx; +} + +static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + + CDEBUG(D_SEC, "destroying ctx %p\n", ctx); + + /* at this time the association with key has been broken. 
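+	 * The asserts below confirm it: gck_key is already NULL and the ctx
+	 * has been dropped from the sec's context list.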
*/ + LASSERT(sec); + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(gctx_kr->gck_key == NULL); + + ctx_clear_timer_kr(ctx); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx_kr); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + if (sync) { + ctx_destroy_kr(ctx); + } else { + atomic_inc(&ctx->cc_refcount); + sptlrpc_gc_add_ctx(ctx); + } +} + +static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (atomic_dec_and_test(&ctx->cc_refcount)) + ctx_release_kr(ctx, sync); +} + +/* + * key <-> ctx association and rules: + * - ctx might not bind with any key + * - key/ctx binding is protected by key semaphore (if the key present) + * - key and ctx each take a reference of the other + * - ctx enlist/unlist is protected by ctx spinlock + * - never enlist a ctx after it's been unlisted + * - whoever do enlist should also do bind, lock key before enlist: + * - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key + * - whoever do unlist should also do unbind: + * - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key + * - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key + */ + +static inline void spin_lock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_lock(lock); +} + +static inline void spin_unlock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_unlock(lock); +} + +static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + spin_lock_if(&sec->ps_lock, !locked); + + atomic_inc(&ctx->cc_refcount); + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist); + if (is_root) + gsec_kr->gsk_root_ctx = ctx; + + spin_unlock_if(&sec->ps_lock, !locked); +} + +/* + * Note after this get called, caller should not access ctx again because + * it might have been freed, unless caller hold at least one refcount of + * the ctx. + * + * return non-zero if we indeed unlist this ctx. + */ +static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + /* if hashed bit has gone, leave the job to somebody who is doing it */ + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0) + return 0; + + /* drop ref inside spin lock to prevent race with other operations */ + spin_lock_if(&sec->ps_lock, !locked); + + if (gsec_kr->gsk_root_ctx == ctx) + gsec_kr->gsk_root_ctx = NULL; + hlist_del_init(&ctx->cc_cache); + atomic_dec(&ctx->cc_refcount); + + spin_unlock_if(&sec->ps_lock, !locked); + + return 1; +} + +/* + * Get specific payload. Newer kernels support 4 slots. + */ +static void * +key_get_payload(struct key *key, unsigned int index) +{ + void *key_ptr = NULL; + +#ifdef HAVE_KEY_PAYLOAD_DATA_ARRAY + key_ptr = key->payload.data[index]; +#else + if (!index) + key_ptr = key->payload.data; +#endif + return key_ptr; +} + +/* + * Set specific payload. Newer kernels support 4 slots. 
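+ * Returns 0 on success, or -EINVAL when the requested slot is not available
+ * on this kernel (only slot 0 exists without HAVE_KEY_PAYLOAD_DATA_ARRAY).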
+ */ +static int key_set_payload(struct key *key, unsigned int index, + struct ptlrpc_cli_ctx *ctx) +{ + int rc = -EINVAL; + +#ifdef HAVE_KEY_PAYLOAD_DATA_ARRAY + if (index < 4) { + key->payload.data[index] = ctx; +#else + if (!index) { + key->payload.data = ctx; +#endif + rc = 0; + } + return rc; +} + +/* + * bind a key with a ctx together. + * caller must hold write lock of the key, as well as ref on key & ctx. + */ +static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ll_read_key_usage(key) > 0); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL); + LASSERT(!key_get_payload(key, 0)); + + /* at this time context may or may not in list. */ + key_get(key); + atomic_inc(&ctx->cc_refcount); + ctx2gctx_keyring(ctx)->gck_key = key; + LASSERT(!key_set_payload(key, 0, ctx)); +} + +/* + * unbind a key and a ctx. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(key_get_payload(key, 0) == ctx); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + + /* must revoke the key, or others may treat it as newly created */ + key_revoke_locked(key); + + key_set_payload(key, 0, NULL); + ctx2gctx_keyring(ctx)->gck_key = NULL; + + /* once ctx get split from key, the timer is meaningless */ + ctx_clear_timer_kr(ctx); + + ctx_put_kr(ctx, 1); + key_put(key); +} + +/* + * given a ctx, unbind with its coupled key, if any. + * unbind could only be called once, so we don't worry the key be released + * by someone else. + */ +static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct key *key = ctx2gctx_keyring(ctx)->gck_key; + + if (key) { + LASSERT(key_get_payload(key, 0) == ctx); + + key_get(key); + down_write(&key->sem); + unbind_key_ctx(key, ctx); + up_write(&key->sem); + key_put(key); + } +} + +/* + * given a key, unbind with its coupled ctx, if any. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + + if (ctx) + unbind_key_ctx(key, ctx); +} + +/* + * unlist a ctx, and unbind from coupled key + */ +static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_unlist_kr(ctx, 0)) + unbind_ctx_kr(ctx); +} + +/* + * given a key, unlist and unbind with the coupled ctx (if any). + * caller must hold write lock, as well as a ref of the key. + */ +static void kill_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + + if (ctx && ctx_unlist_kr(ctx, 0)) + unbind_key_locked(key); +} + +/* + * caller should hold one ref on contexts in freelist. + */ +static void dispose_ctx_list_kr(struct hlist_head *freelist) +{ + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + + cfs_hlist_for_each_entry_safe(ctx, pos, next, freelist, cc_cache) { + hlist_del_init(&ctx->cc_cache); + + /* reverse ctx: update current seq to buddy svcctx if exist. + * ideally this should be done at gss_cli_ctx_finalize(), but + * the ctx destroy could be delayed by: + * 1) ctx still has reference; + * 2) ctx destroy is asynchronous; + * and reverse import call inval_all_ctx() require this be done + * _immediately_ otherwise newly created reverse ctx might copy + * the very old sequence number from svcctx. 
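+		 * So push the latest client sequence number to the buddy
+		 * svcctx right here, before the ctx goes away.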
*/ + gctx = ctx2gctx(ctx); + if (!rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32) atomic_read(&gctx->gc_seq)); + } + + /* we need to wakeup waiting reqs here. the context might + * be forced released before upcall finished, then the + * late-arrived downcall can't find the ctx even. */ + sptlrpc_cli_ctx_wakeup(ctx); + + unbind_ctx_kr(ctx); + ctx_put_kr(ctx, 0); + } +} + +/* + * lookup a root context directly in a sec, return root ctx with a + * reference taken or NULL. + */ +static +struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + + spin_lock(&sec->ps_lock); + + ctx = gsec_kr->gsk_root_ctx; + + if (ctx == NULL && unlikely(sec_is_reverse(sec))) { + struct hlist_node __maybe_unused *node; + struct ptlrpc_cli_ctx *tmp; + + /* reverse ctx, search root ctx in list, choose the one + * with shortest expire time, which is most possibly have + * an established peer ctx at client side. */ + cfs_hlist_for_each_entry(tmp, node, &gsec_kr->gsk_clist, + cc_cache) { + if (ctx == NULL || ctx->cc_expire == 0 || + ctx->cc_expire > tmp->cc_expire) { + ctx = tmp; + /* promote to be root_ctx */ + gsec_kr->gsk_root_ctx = ctx; + } + } + } + + if (ctx) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(!hlist_empty(&gsec_kr->gsk_clist)); + atomic_inc(&ctx->cc_refcount); + } + + spin_unlock(&sec->ps_lock); + + return ctx; +} + +#define RVS_CTX_EXPIRE_NICE (10) + +static +void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *new_ctx, + struct key *key) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *hnode; + struct ptlrpc_cli_ctx *ctx; + time64_t now; + + ENTRY; + LASSERT(sec_is_reverse(sec)); + + spin_lock(&sec->ps_lock); + + now = ktime_get_real_seconds(); + + /* set all existing ctxs short expiry */ + cfs_hlist_for_each_entry(ctx, hnode, &gsec_kr->gsk_clist, cc_cache) { + if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) { + ctx->cc_early_expire = 1; + ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE; + } + } + + /* if there's root_ctx there, instead obsolete the current + * immediately, we leave it continue operating for a little while. + * hopefully when the first backward rpc with newest ctx send out, + * the client side already have the peer ctx well established. */ + ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 
0 : 1, 1); + + if (key) + bind_key_ctx(key, new_ctx); + + spin_unlock(&sec->ps_lock); +} + +static void construct_key_desc(void *buf, int bufsize, + struct ptlrpc_sec *sec, uid_t uid) +{ + snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id); + ((char *)buf)[bufsize - 1] = '\0'; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_sec * gss_sec_create_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_keyring *gsec_kr; + ENTRY; + + OBD_ALLOC(gsec_kr, sizeof(*gsec_kr)); + if (gsec_kr == NULL) + RETURN(NULL); + + INIT_HLIST_HEAD(&gsec_kr->gsk_clist); + gsec_kr->gsk_root_ctx = NULL; + mutex_init(&gsec_kr->gsk_root_uc_lock); +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_init(&gsec_kr->gsk_uc_lock); +#endif + + if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring, + imp, svcctx, sf)) + goto err_free; + + if (svcctx != NULL && + sec_install_rctx_kr(&gsec_kr->gsk_base.gs_base, svcctx)) { + gss_sec_destroy_common(&gsec_kr->gsk_base); + goto err_free; + } + + RETURN(&gsec_kr->gsk_base.gs_base); + +err_free: + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); + RETURN(NULL); +} + +static +void gss_sec_destroy_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec *gsec = sec2gsec(sec); + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + LASSERT(hlist_empty(&gsec_kr->gsk_clist)); + LASSERT(gsec_kr->gsk_root_ctx == NULL); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); +} + +static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred) +{ + /* except the ROOTONLY flag, treat it as root user only if real uid + * is 0, euid/fsuid being 0 are handled as setuid scenarios */ + if (sec_is_rootonly(sec) || (vcred->vc_uid == 0)) + return 1; + else + return 0; +} + +/* + * kernel 5.3: commit 0f44e4d976f96c6439da0d6717238efa4b91196e + * keys: Move the user and user-session keyrings to the user_namespace + * + * When lookup_user_key is available use the kernel API rather than directly + * accessing the uid_keyring and session_keyring via the current process + * credentials. + */ +#ifdef HAVE_LOOKUP_USER_KEY + +/* from Linux security/keys/internal.h: */ +#ifndef KEY_LOOKUP_FOR_UNLINK +#define KEY_LOOKUP_FOR_UNLINK 0x04 +#endif + +static struct key *_user_key(key_serial_t id) +{ + key_ref_t ref; + + might_sleep(); + ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); + if (IS_ERR(ref)) + return NULL; + return key_ref_to_ptr(ref); +} + +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_SESSION_KEYRING); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_KEYRING); +} +#else +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return key_get(cred->user->session_keyring); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return key_get(cred->user->uid_keyring); +} +#endif + +/* + * unlink request key from it's ring, which is linked during request_key(). + * sadly, we have to 'guess' which keyring it's linked to. + * + * FIXME this code is fragile, it depends on how request_key() is implemented. 
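+ * The switch below walks the same default-keyring preference order that
+ * request_key() uses when linking the new key: requestor's keyring, then
+ * thread, process, session, user-session and user keyrings.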
+ */ +static void request_key_unlink(struct key *key) +{ + const struct cred *cred = current_cred(); + struct key *ring = NULL; + + switch (cred->jit_keyring) { + case KEY_REQKEY_DEFL_DEFAULT: + case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: +#ifdef HAVE_GET_REQUEST_KEY_AUTH + if (cred->request_key_auth) { + struct request_key_auth *rka; + struct key *authkey = cred->request_key_auth; + + down_read(&authkey->sem); + rka = get_request_key_auth(authkey); + if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) + ring = key_get(rka->dest_keyring); + up_read(&authkey->sem); + if (ring) + break; + } +#endif + fallthrough; + case KEY_REQKEY_DEFL_THREAD_KEYRING: + ring = key_get(cred->thread_keyring); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_PROCESS_KEYRING: + ring = key_get(cred->process_keyring); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_SESSION_KEYRING: + rcu_read_lock(); + ring = key_get(rcu_dereference(cred->session_keyring)); + rcu_read_unlock(); + if (ring) + break; + fallthrough; + case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: + ring = get_user_session_keyring(cred); + break; + case KEY_REQKEY_DEFL_USER_KEYRING: + ring = get_user_keyring(cred); + break; + case KEY_REQKEY_DEFL_GROUP_KEYRING: + default: + LBUG(); + } + + LASSERT(ring); + key_unlink(ring, key); + key_put(ring); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct obd_import *imp = sec->ps_import; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + unsigned int is_root = 0, create_new = 0; + struct key *key; + char desc[24]; + char *coinfo; + int coinfo_size; + const char *sec_part_flags = ""; + char svc_flag = '-'; + ENTRY; + + LASSERT(imp != NULL); + + is_root = user_is_root(sec, vcred); + + /* a little bit optimization for root context */ + if (is_root) { + ctx = sec_lookup_root_ctx_kr(sec); + /* + * Only lookup directly for REVERSE sec, which should + * always succeed. + */ + if (ctx || sec_is_reverse(sec)) + RETURN(ctx); + } + + LASSERT(create != 0); + + /* for root context, obtain lock and check again, this time hold + * the root upcall lock, make sure nobody else populated new root + * context after last check. */ + if (is_root) { + mutex_lock(&gsec_kr->gsk_root_uc_lock); + + ctx = sec_lookup_root_ctx_kr(sec); + if (ctx) + goto out; + + /* update reverse handle for root user */ + sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index(); + + switch (sec->ps_part) { + case LUSTRE_SP_MDT: + sec_part_flags = "m"; + break; + case LUSTRE_SP_OST: + sec_part_flags = "o"; + break; + case LUSTRE_SP_MGC: + sec_part_flags = "rmo"; + break; + case LUSTRE_SP_CLI: + sec_part_flags = "r"; + break; + case LUSTRE_SP_MGS: + default: + LBUG(); + } + + switch (SPTLRPC_FLVR_SVC(sec->ps_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + svc_flag = 'n'; + break; + case SPTLRPC_SVC_AUTH: + svc_flag = 'a'; + break; + case SPTLRPC_SVC_INTG: + svc_flag = 'i'; + break; + case SPTLRPC_SVC_PRIV: + svc_flag = 'p'; + break; + default: + LBUG(); + } + } + + /* in case of setuid, key will be constructed as owner of fsuid/fsgid, + * but we do authentication based on real uid/gid. the key permission + * bits will be exactly as POS_ALL, so only processes who subscribed + * this key could have the access, although the quota might be counted + * on others (fsuid/fsgid). + * + * keyring will use fsuid/fsgid as upcall parameters, so we have to + * encode real uid/gid into callout info. 
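+ * (the real uid/gid end up as the 3rd and 4th fields of the callout string
+ * built below)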
+ */ + + /* But first we need to make sure the obd type is supported */ + if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MGC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSP_NAME)) { + CERROR("obd %s is not a supported device\n", + imp->imp_obd->obd_name); + GOTO(out, ctx = NULL); + } + + construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid); + + /* callout info format: + * secid:mech:uid:gid:sec_flags:svc_flag:svc_type:peer_nid:target_uuid: + * self_nid:pid + */ + coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64; + OBD_ALLOC(coinfo, coinfo_size); + if (coinfo == NULL) + goto out; + + /* Last callout parameter is pid of process whose namespace will be used + * for credentials' retrieval. + * For user's credentials (in which case sec_part_flags is empty), use + * current PID instead of import's reference PID to get reference + * namespace. */ + snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%c:%d:%#llx:%s:%#llx:%d", + sec->ps_id, sec2gsec(sec)->gs_mech->gm_name, + vcred->vc_uid, vcred->vc_gid, + sec_part_flags, svc_flag, import_to_gss_svc(imp), + imp->imp_connection->c_peer.nid, imp->imp_obd->obd_name, + imp->imp_connection->c_self, + sec_part_flags[0] == '\0' ? + current_pid() : imp->imp_sec_refpid); + + CDEBUG(D_SEC, "requesting key for %s\n", desc); + + keyring_upcall_lock(gsec_kr); + key = request_key(&gss_key_type, desc, coinfo); + keyring_upcall_unlock(gsec_kr); + + OBD_FREE(coinfo, coinfo_size); + + if (IS_ERR(key)) { + CERROR("failed request key: %ld\n", PTR_ERR(key)); + goto out; + } + CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc); + + /* once payload.data was pointed to a ctx, it never changes until + * we de-associate them; but parallel request_key() may return + * a key with payload.data == NULL at the same time. so we still + * need wirtelock of key->sem to serialize them. */ + down_write(&key->sem); + + ctx = key_get_payload(key, 0); + if (likely(ctx)) { + LASSERT(atomic_read(&ctx->cc_refcount) >= 1); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == key); + LASSERT(ll_read_key_usage(key) >= 2); + + /* simply take a ref and return. it's upper layer's + * responsibility to detect & replace dead ctx. */ + atomic_inc(&ctx->cc_refcount); + } else { + /* pre initialization with a cli_ctx. this can't be done in + * key_instantiate() because we'v no enough information + * there. */ + ctx = ctx_create_kr(sec, vcred); + if (ctx != NULL) { + ctx_enlist_kr(ctx, is_root, 0); + bind_key_ctx(key, ctx); + + ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT); + + CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n", + key, ctx, sec); + } else { + /* we'd prefer to call key_revoke(), but we more like + * to revoke it within this key->sem locked period. */ + key_revoke_locked(key); + } + + create_new = 1; + } + + up_write(&key->sem); + + if (is_root && create_new) + request_key_unlink(key); + + key_put(key); +out: + if (is_root) + mutex_unlock(&gsec_kr->gsk_root_uc_lock); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + ctx_release_kr(ctx, sync); +} + +/* + * flush context of normal user, we must resort to keyring itself to find out + * contexts which belong to me. 
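+ * (each user key is found by its "<uid>@<sec_id>" description, built by
+ * construct_key_desc())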
+ * + * Note here we suppose only to flush _my_ context, the "uid" will + * be ignored in the search. + */ +static +void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct key *key; + char desc[24]; + + /* nothing to do for reverse or rootonly sec */ + if (sec_is_reverse(sec) || sec_is_rootonly(sec)) + return; + + construct_key_desc(desc, sizeof(desc), sec, uid); + + /* there should be only one valid key, but we put it in the + * loop in case of any weird cases */ + for (;;) { + key = request_key(&gss_key_type, desc, NULL); + if (IS_ERR(key)) { + CDEBUG(D_SEC, "No more key found for current user\n"); + break; + } + + down_write(&key->sem); + + kill_key_locked(key); + + /* kill_key_locked() should usually revoke the key, but we + * revoke it again to make sure, e.g. some case the key may + * not well coupled with a context. */ + key_revoke_locked(key); + + up_write(&key->sem); + + request_key_unlink(key); + + key_put(key); + } +} + +/* + * flush context of root or all, we iterate through the list. + */ +static +void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec, uid_t uid, int grace, + int force) +{ + struct gss_sec_keyring *gsec_kr; + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + gsec_kr = sec2gsec_keyring(sec); + + spin_lock(&sec->ps_lock); + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_kr->gsk_clist, cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + /* at this moment there's at least 2 base reference: + * key association and in-list. */ + if (atomic_read(&ctx->cc_refcount) > 2) { + if (!force) + continue; + CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n", + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), + atomic_read(&ctx->cc_refcount) - 2); + } + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + atomic_inc(&ctx->cc_refcount); + + if (ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; +} + +static +int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + ENTRY; + + CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n", + sec, atomic_read(&sec->ps_refcount), + atomic_read(&sec->ps_nctx), + uid, grace, force); + + if (uid != -1 && uid != 0) + flush_user_ctx_cache_kr(sec, uid, grace, force); + else + flush_spec_ctx_cache_kr(sec, uid, grace, force); + + RETURN(0); +} + +static +void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + CWARN("running gc\n"); + + spin_lock(&sec->ps_lock); + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_kr->gsk_clist, cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + atomic_inc(&ctx->cc_refcount); + + if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + CWARN("unhashed ctx %p\n", ctx); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; + return; 
+} + +static +int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + time64_t now = ktime_get_real_seconds(); + + ENTRY; + spin_lock(&sec->ps_lock); + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_kr->gsk_clist, cc_cache) { + struct key *key; + char flags_str[40]; + char mech[40]; + + gctx = ctx2gctx(ctx); + key = ctx2gctx_keyring(ctx)->gck_key; + + gss_cli_ctx_flags2str(ctx->cc_flags, + flags_str, sizeof(flags_str)); + + if (gctx->gc_mechctx) + lgss_display(gctx->gc_mechctx, mech, sizeof(mech)); + else + snprintf(mech, sizeof(mech), "N/A"); + mech[sizeof(mech) - 1] = '\0'; + + seq_printf(seq, + "%p: uid %u, ref %d, expire %lld(%+lld), fl %s, seq %d, win %u, key %08x(ref %d), hdl %#llx:%#llx, mech: %s\n", + ctx, ctx->cc_vcred.vc_uid, + atomic_read(&ctx->cc_refcount), + ctx->cc_expire, + ctx->cc_expire ? ctx->cc_expire - now : 0, + flags_str, + atomic_read(&gctx->gc_seq), + gctx->gc_win, + key ? key->serial : 0, + key ? ll_read_key_usage(key) : 0, + gss_handle_to_u64(&gctx->gc_handle), + gss_handle_to_u64(&gctx->gc_svc_handle), + mech); + } + spin_unlock(&sec->ps_lock); + + RETURN(0); +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx) +{ + /* upcall is already on the way */ + struct gss_cli_ctx *gctx = ctx ? ctx2gctx(ctx) : NULL; + + /* record latest sequence number in buddy svcctx */ + if (gctx && !rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + return gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32)atomic_read(&gctx->gc_seq)); + } + return 0; +} + +static +int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + if (cli_ctx_check_death(ctx)) { + kill_ctx_kr(ctx); + return 1; + } + + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + cli_ctx_expire(ctx); + kill_ctx_kr(ctx); +} + +/**************************************** + * (reverse) service * + ****************************************/ + +/* + * reverse context could have nothing to do with keyrings. here we still keep + * the version which bind to a key, for future reference. + */ +#define HAVE_REVERSE_CTX_NOKEY + +#ifdef HAVE_REVERSE_CTX_NOKEY + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx; + struct vfs_cred vcred = { .vc_uid = 0 }; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) + return -ENOMEM; + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + + ctx_put_kr(cli_ctx, 1); + return rc; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, NULL); + + ctx_put_kr(cli_ctx, 1); + + return 0; +} + +#else /* ! 
HAVE_REVERSE_CTX_NOKEY */ + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx = NULL; + struct key *key; + struct vfs_cred vcred = { .vc_uid = 0 }; + char desc[64]; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + CWARN("called\n"); + + construct_key_desc(desc, sizeof(desc), sec, 0); + + key = key_alloc(&gss_key_type, desc, 0, 0, + KEY_POS_ALL | KEY_USR_ALL, 1); + if (IS_ERR(key)) { + CERROR("failed to alloc key: %ld\n", PTR_ERR(key)); + return PTR_ERR(key); + } + + rc = key_instantiate_and_link(key, NULL, 0, NULL, NULL); + if (rc) { + CERROR("failed to instantiate key: %d\n", rc); + goto err_revoke; + } + + down_write(&key->sem); + + LASSERT(!key_get_payload(key, 0)); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) { + rc = -ENOMEM; + goto err_up; + } + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + goto err_put; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, key); + + ctx_put_kr(cli_ctx, 1); + up_write(&key->sem); + + rc = 0; + CWARN("ok!\n"); +out: + key_put(key); + return rc; + +err_put: + ctx_put_kr(cli_ctx, 1); +err_up: + up_write(&key->sem); +err_revoke: + key_revoke(key); + goto out; +} + +#endif /* HAVE_REVERSE_CTX_NOKEY */ + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_kr(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_keyring, req); +} + +static +int gss_svc_install_rctx_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + + rc = sec_install_rctx_kr(sec, svc_ctx); + sptlrpc_sec_put(sec); + + return rc; +} + +/**************************************** + * key apis * + ****************************************/ + +static +#ifdef HAVE_KEY_TYPE_INSTANTIATE_2ARGS +int gss_kt_instantiate(struct key *key, struct key_preparsed_payload *prep) +{ + const void *data = prep->data; + size_t datalen = prep->datalen; +#else +int gss_kt_instantiate(struct key *key, const void *data, size_t datalen) +{ +#endif + int rc; + ENTRY; + + if (data != NULL || datalen != 0) { + CERROR("invalid: data %p, len %lu\n", data, (long)datalen); + RETURN(-EINVAL); + } + + if (key_get_payload(key, 0)) { + CERROR("key already have payload\n"); + RETURN(-EINVAL); + } + + /* link the key to session keyring, so following context negotiation + * rpc fired from user space could find this key. This will be unlinked + * automatically when upcall processes die. + * + * we can't do this through keyctl from userspace, because the upcall + * might be neither possessor nor owner of the key (setuid). + * + * the session keyring is created upon upcall, and don't change all + * the way until upcall finished, so rcu lock is not needed here. + */ + LASSERT(current_cred()->session_keyring); + + lockdep_off(); + rc = key_link(current_cred()->session_keyring, key); + lockdep_on(); + if (unlikely(rc)) { + CERROR("failed to link key %08x to keyring %08x: %d\n", + key->serial, + current_cred()->session_keyring->serial, rc); + RETURN(rc); + } + + CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key, + key_get_payload(key, 0)); + RETURN(0); +} + +/* + * called with key semaphore write locked. it means we can operate + * on the context without fear of loosing refcount. 
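+ *
+ * this is the downcall path: the userspace upcall writes its negotiation
+ * result into the key, and we unpack the sequence window, context handle
+ * and mech context from that blob.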
+ */ +static +#ifdef HAVE_KEY_TYPE_INSTANTIATE_2ARGS +int gss_kt_update(struct key *key, struct key_preparsed_payload *prep) +{ + const void *data = prep->data; + __u32 datalen32 = (__u32) prep->datalen; +#else +int gss_kt_update(struct key *key, const void *data, size_t datalen) +{ + __u32 datalen32 = (__u32) datalen; +#endif + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + struct gss_cli_ctx *gctx; + rawobj_t tmpobj = RAWOBJ_EMPTY; + int rc; + ENTRY; + + if (data == NULL || datalen32 == 0) { + CWARN("invalid: data %p, len %lu\n", data, (long)datalen32); + RETURN(-EINVAL); + } + + /* if upcall finished negotiation too fast (mostly likely because + * of local error happened) and call kt_update(), the ctx + * might be still NULL. but the key will finally be associate + * with a context, or be revoked. if key status is fine, return + * -EAGAIN to allow userspace sleep a while and call again. */ + if (ctx == NULL) { + CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n", + key, key->serial, key->flags); + + rc = key_validate(key); + if (rc == 0) + RETURN(-EAGAIN); + else + RETURN(rc); + } + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + ctx_clear_timer_kr(ctx); + + /* don't proceed if already refreshed */ + if (cli_ctx_is_refreshed(ctx)) { + CWARN("ctx already done refresh\n"); + RETURN(0); + } + + sptlrpc_cli_ctx_get(ctx); + gctx = ctx2gctx(ctx); + + rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win, + sizeof(gctx->gc_win)); + if (rc) { + CERROR("failed extract seq_win\n"); + goto out; + } + + if (gctx->gc_win == 0) { + __u32 nego_rpc_err, nego_gss_err; + + rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err, + sizeof(nego_rpc_err)); + if (rc) { + CERROR("cannot extract RPC: rc = %d\n", rc); + goto out; + } + + rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err, + sizeof(nego_gss_err)); + if (rc) { + CERROR("failed to extract gss rc = %d\n", rc); + goto out; + } + + CERROR("negotiation: rpc err %d, gss err %x\n", + nego_rpc_err, nego_gss_err); + + rc = nego_rpc_err ? nego_rpc_err : -EACCES; + } else { + rc = rawobj_extract_local_alloc(&gctx->gc_handle, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract handle\n"); + goto out; + } + + rc = rawobj_extract_local(&tmpobj, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract mech\n"); + goto out; + } + + rc = lgss_import_sec_context(&tmpobj, + sec2gsec(ctx->cc_sec)->gs_mech, + &gctx->gc_mechctx); + if (rc != GSS_S_COMPLETE) + CERROR("failed import context\n"); + else + rc = 0; + } +out: + /* we don't care what current status of this ctx, even someone else + * is operating on the ctx at the same time. we just add up our own + * opinions here. */ + if (rc == 0) { + gss_cli_ctx_uptodate(gctx); + } else { + /* this will also revoke the key. has to be done before + * wakeup waiters otherwise they can find the stale key */ + kill_key_locked(key); + + cli_ctx_expire(ctx); + + if (rc != -ERESTART) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } + + /* let user space think it's a success */ + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +#ifndef HAVE_KEY_MATCH_DATA +static int +gss_kt_match(const struct key *key, const void *desc) +{ + return strcmp(key->description, (const char *) desc) == 0 && + !test_bit(KEY_FLAG_REVOKED, &key->flags); +} +#else /* ! 
HAVE_KEY_MATCH_DATA */ +static bool +gss_kt_match(const struct key *key, const struct key_match_data *match_data) +{ + const char *desc = match_data->raw_data; + + return strcmp(key->description, desc) == 0 && + !test_bit(KEY_FLAG_REVOKED, &key->flags); +} + +/* + * Preparse the match criterion. + */ +static int gss_kt_match_preparse(struct key_match_data *match_data) +{ + match_data->lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT; + match_data->cmp = gss_kt_match; + return 0; +} +#endif /* HAVE_KEY_MATCH_DATA */ + +static +void gss_kt_destroy(struct key *key) +{ + ENTRY; + LASSERT(!key_get_payload(key, 0)); + CDEBUG(D_SEC, "destroy key %p\n", key); + EXIT; +} + +static +void gss_kt_describe(const struct key *key, struct seq_file *s) +{ + if (key->description == NULL) + seq_puts(s, "[null]"); + else + seq_puts(s, key->description); +} + +static struct key_type gss_key_type = +{ + .name = "lgssc", + .def_datalen = 0, + .instantiate = gss_kt_instantiate, + .update = gss_kt_update, +#ifdef HAVE_KEY_MATCH_DATA + .match_preparse = gss_kt_match_preparse, +#else + .match = gss_kt_match, +#endif + .destroy = gss_kt_destroy, + .describe = gss_kt_describe, +}; + +/**************************************** + * lustre gss keyring policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_keyring_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_kr, + .validate = gss_cli_ctx_validate_kr, + .die = gss_cli_ctx_die_kr, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_keyring_cops = { + .create_sec = gss_sec_create_kr, + .destroy_sec = gss_sec_destroy_kr, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_kr, + .release_ctx = gss_sec_release_ctx_kr, + .flush_ctx_cache = gss_sec_flush_ctx_cache_kr, + .gc_ctx = gss_sec_gc_ctx_kr, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, + .display = gss_sec_display_kr, +}; + +static struct ptlrpc_sec_sops gss_sec_keyring_sops = { + .accept = gss_svc_accept_kr, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .prep_bulk = gss_svc_prep_bulk, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_kr, +}; + +static struct ptlrpc_sec_policy gss_policy_keyring = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.keyring", + .sp_policy = SPTLRPC_POLICY_GSS, + .sp_cops = &gss_sec_keyring_cops, + .sp_sops = &gss_sec_keyring_sops, +}; + + +int __init gss_init_keyring(void) +{ + int rc; + + rc = register_key_type(&gss_key_type); + if (rc) { + CERROR("failed to register keyring type: %d\n", rc); + return rc; + } + + rc = sptlrpc_register_policy(&gss_policy_keyring); + if (rc) { + unregister_key_type(&gss_key_type); + return rc; + } + + return 0; +} + +void __exit gss_exit_keyring(void) +{ + unregister_key_type(&gss_key_type); + sptlrpc_unregister_policy(&gss_policy_keyring); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h new file mode 100644 index 0000000000000..611160458d9b1 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h @@ -0,0 +1,160 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * linux/include/linux/sunrpc/gss_krb5_types.h + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#ifndef PTLRPC_GSS_KRB5_H +#define PTLRPC_GSS_KRB5_H + +#include "gss_crypto.h" + +/* + * RFC 4142 + */ + +#define KG_USAGE_ACCEPTOR_SEAL 22 +#define KG_USAGE_ACCEPTOR_SIGN 23 +#define KG_USAGE_INITIATOR_SEAL 24 +#define KG_USAGE_INITIATOR_SIGN 25 + +#define KG_TOK_MIC_MSG 0x0404 +#define KG_TOK_WRAP_MSG 0x0504 + +#define FLAG_SENDER_IS_ACCEPTOR 0x01 +#define FLAG_WRAP_CONFIDENTIAL 0x02 +#define FLAG_ACCEPTOR_SUBKEY 0x04 + +struct krb5_header { + __u16 kh_tok_id; /* token id */ + __u8 kh_flags; /* acceptor flags */ + __u8 kh_filler; /* 0xff */ + __u16 kh_ec; /* extra count */ + __u16 kh_rrc; /* right rotation count */ + __u64 kh_seq; /* sequence number */ + __u8 kh_cksum[0]; /* checksum */ +}; + +struct krb5_ctx { + unsigned int kc_initiate:1, + kc_cfx:1, + kc_seed_init:1, + kc_have_acceptor_subkey:1; + time64_t kc_endtime; + __u8 kc_seed[16]; + __u64 kc_seq_send; + __u64 kc_seq_recv; + __u32 kc_enctype; + struct gss_keyblock kc_keye; /* encryption */ + struct gss_keyblock kc_keyi; /* integrity */ + struct gss_keyblock kc_keyc; /* checksum */ + rawobj_t kc_mech_used; +}; + +enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, + SGN_ALG_DES_MAC = 0x0002, + SGN_ALG_3 = 0x0003, /* not published */ + SGN_ALG_HMAC_MD5 = 0x0011, /* microsoft w2k; no support */ + SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004 +}; + +enum seal_alg { + SEAL_ALG_NONE = 0xffff, + SEAL_ALG_DES = 0x0000, + SEAL_ALG_1 = 0x0001, /* not published */ + SEAL_ALG_MICROSOFT_RC4 = 0x0010, /* microsoft w2k; no support */ + SEAL_ALG_DES3KD = 0x0002 +}; + +#define CKSUMTYPE_CRC32 0x0001 +#define CKSUMTYPE_RSA_MD4 0x0002 +#define CKSUMTYPE_RSA_MD4_DES 0x0003 +#define CKSUMTYPE_DESCBC 0x0004 +/* des-mac-k */ +/* rsa-md4-des-k */ +#define CKSUMTYPE_RSA_MD5 0x0007 +#define CKSUMTYPE_RSA_MD5_DES 
0x0008 +#define CKSUMTYPE_NIST_SHA 0x0009 +#define CKSUMTYPE_HMAC_SHA1_DES3 0x000c +#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f +#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010 +#define CKSUMTYPE_HMAC_MD5_ARCFOUR -138 + +/* from gssapi_err_krb5.h */ +#define KG_CCACHE_NOMATCH (39756032L) +#define KG_KEYTAB_NOMATCH (39756033L) +#define KG_TGT_MISSING (39756034L) +#define KG_NO_SUBKEY (39756035L) +#define KG_CONTEXT_ESTABLISHED (39756036L) +#define KG_BAD_SIGN_TYPE (39756037L) +#define KG_BAD_LENGTH (39756038L) +#define KG_CTX_INCOMPLETE (39756039L) +#define KG_CONTEXT (39756040L) +#define KG_CRED (39756041L) +#define KG_ENC_DESC (39756042L) +#define KG_BAD_SEQ (39756043L) +#define KG_EMPTY_CCACHE (39756044L) +#define KG_NO_CTYPES (39756045L) + +/* per Kerberos v5 protocol spec crypto types from the wire. + * these get mapped to linux kernel crypto routines. + */ +#define ENCTYPE_NULL 0x0000 +#define ENCTYPE_DES_CBC_CRC 0x0001 /* DES cbc mode with CRC-32 */ +#define ENCTYPE_DES_CBC_MD4 0x0002 /* DES cbc mode with RSA-MD4 */ +#define ENCTYPE_DES_CBC_MD5 0x0003 /* DES cbc mode with RSA-MD5 */ +#define ENCTYPE_DES_CBC_RAW 0x0004 /* DES cbc mode raw */ +/* XXX deprecated? */ +#define ENCTYPE_DES3_CBC_SHA 0x0005 /* DES-3 cbc mode with NIST-SHA */ +#define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ +#define ENCTYPE_DES_HMAC_SHA1 0x0008 +#define ENCTYPE_DES3_CBC_SHA1 0x0010 +#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 +#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 +#define ENCTYPE_ARCFOUR_HMAC 0x0017 +#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 +#define ENCTYPE_UNKNOWN 0x01ff + +#endif /* PTLRPC_GSS_KRB5_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c new file mode 100644 index 0000000000000..bd3a94ba162b3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -0,0 +1,1610 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2015, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" +#include "gss_krb5.h" +#include "gss_crypto.h" + +static spinlock_t krb5_seq_lock; + +struct krb5_enctype { + char *ke_dispname; + char *ke_enc_name; /* linux tfm name */ + char *ke_hash_name; /* linux tfm name */ + int ke_enc_mode; /* linux tfm mode */ + int ke_hash_size; /* checksum size */ + int ke_conf_size; /* confounder size */ + unsigned int ke_hash_hmac:1; /* is hmac? */ +}; + +/* + * NOTE: for aes128-cts and aes256-cts, MIT implementation use CTS encryption. + * but currently we simply CBC with padding, because linux doesn't support CTS + * yet. this need to be fixed in the future. + */ +static struct krb5_enctype enctypes[] = { + [ENCTYPE_DES_CBC_RAW] = { /* des-cbc-md5 */ + .ke_dispname = "des-cbc-md5", + .ke_enc_name = "cbc(des)", + .ke_hash_name = "md5", + .ke_hash_size = 16, + .ke_conf_size = 8, + }, +#ifdef HAVE_DES3_SUPPORT + [ENCTYPE_DES3_CBC_RAW] = { /* des3-hmac-sha1 */ + .ke_dispname = "des3-hmac-sha1", + .ke_enc_name = "cbc(des3_ede)", + .ke_hash_name = "sha1", + .ke_hash_size = 20, + .ke_conf_size = 8, + .ke_hash_hmac = 1, + }, +#endif + [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = { /* aes128-cts */ + .ke_dispname = "aes128-cts-hmac-sha1-96", + .ke_enc_name = "cbc(aes)", + .ke_hash_name = "sha1", + .ke_hash_size = 12, + .ke_conf_size = 16, + .ke_hash_hmac = 1, + }, + [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = { /* aes256-cts */ + .ke_dispname = "aes256-cts-hmac-sha1-96", + .ke_enc_name = "cbc(aes)", + .ke_hash_name = "sha1", + .ke_hash_size = 12, + .ke_conf_size = 16, + .ke_hash_hmac = 1, + }, + [ENCTYPE_ARCFOUR_HMAC] = { /* arcfour-hmac-md5 */ + .ke_dispname = "arcfour-hmac-md5", + .ke_enc_name = "ecb(arc4)", + .ke_hash_name = "md5", + .ke_hash_size = 16, + .ke_conf_size = 8, + .ke_hash_hmac = 1, + } +}; + +static const char * enctype2str(__u32 enctype) +{ + if (enctype < ARRAY_SIZE(enctypes) && enctypes[enctype].ke_dispname) + return enctypes[enctype].ke_dispname; + + return "unknown"; +} + +static +int krb5_init_keys(struct krb5_ctx *kctx) +{ + struct krb5_enctype *ke; + + if (kctx->kc_enctype >= ARRAY_SIZE(enctypes) || + enctypes[kctx->kc_enctype].ke_hash_size == 0) { + CERROR("unsupported enctype %x\n", kctx->kc_enctype); + return -1; + } + + ke = &enctypes[kctx->kc_enctype]; + + /* tfm arc4 is stateful, user should alloc-use-free by his own */ + if (kctx->kc_enctype != ENCTYPE_ARCFOUR_HMAC && + gss_keyblock_init(&kctx->kc_keye, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + + /* tfm hmac is stateful, user should alloc-use-free by his own */ + if (ke->ke_hash_hmac == 0 && + gss_keyblock_init(&kctx->kc_keyi, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + if (ke->ke_hash_hmac == 0 && + gss_keyblock_init(&kctx->kc_keyc, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + + return 0; +} + +static +void 
delete_context_kerberos(struct krb5_ctx *kctx) +{ + rawobj_free(&kctx->kc_mech_used); + + gss_keyblock_free(&kctx->kc_keye); + gss_keyblock_free(&kctx->kc_keyi); + gss_keyblock_free(&kctx->kc_keyc); +} + +static +__u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end) +{ + unsigned int tmp_uint, keysize; + + /* seed_init flag */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + kctx->kc_seed_init = (tmp_uint != 0); + + /* seed */ + if (gss_get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed))) + goto out_err; + + /* sign/seal algorithm, not really used now */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + /* end time. While kc_endtime might be 64 bit the krb5 API + * still uses 32 bits. To delay the 2038 bug see the incoming + * value as a u32 which give us until 2106. See the link for details: + * + * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html + */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) + goto out_err; + + /* seq send */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + kctx->kc_seq_send = tmp_uint; + + /* mech oid */ + if (gss_get_rawobj(&p, end, &kctx->kc_mech_used)) + goto out_err; + + /* old style enc/seq keys in format: + * - enctype (u32) + * - keysize (u32) + * - keydata + * we decompose them to fit into the new context + */ + + /* enc key */ + if (gss_get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype))) + goto out_err; + + if (gss_get_bytes(&p, end, &keysize, sizeof(keysize))) + goto out_err; + + if (gss_get_keyblock(&p, end, &kctx->kc_keye, keysize)) + goto out_err; + + /* seq key */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + tmp_uint != kctx->kc_enctype) + goto out_err; + + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + tmp_uint != keysize) + goto out_err; + + if (gss_get_keyblock(&p, end, &kctx->kc_keyc, keysize)) + goto out_err; + + /* old style fallback */ + if (gss_keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc)) + goto out_err; + + if (p != end) + goto out_err; + + CDEBUG(D_SEC, "successfully imported rfc1964 context\n"); + return 0; +out_err: + return GSS_S_FAILURE; +} + +/* Flags for version 2 context flags */ +#define KRB5_CTX_FLAG_INITIATOR 0x00000001 +#define KRB5_CTX_FLAG_CFX 0x00000002 +#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 + +static +__u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end) +{ + unsigned int tmp_uint, keysize; + + /* end time. While kc_endtime might be 64 bit the krb5 API + * still uses 32 bits. To delay the 2038 bug see the incoming + * value as a u32 which give us until 2106. 
See the link for details: + * + * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html + */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) + goto out_err; + + /* flags */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + if (tmp_uint & KRB5_CTX_FLAG_INITIATOR) + kctx->kc_initiate = 1; + if (tmp_uint & KRB5_CTX_FLAG_CFX) + kctx->kc_cfx = 1; + if (tmp_uint & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) + kctx->kc_have_acceptor_subkey = 1; + + /* seq send */ + if (gss_get_bytes(&p, end, &kctx->kc_seq_send, + sizeof(kctx->kc_seq_send))) + goto out_err; + + /* enctype */ + if (gss_get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype))) + goto out_err; + + /* size of each key */ + if (gss_get_bytes(&p, end, &keysize, sizeof(keysize))) + goto out_err; + + /* number of keys - should always be 3 */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + if (tmp_uint != 3) { + CERROR("Invalid number of keys: %u\n", tmp_uint); + goto out_err; + } + + /* ke */ + if (gss_get_keyblock(&p, end, &kctx->kc_keye, keysize)) + goto out_err; + /* ki */ + if (gss_get_keyblock(&p, end, &kctx->kc_keyi, keysize)) + goto out_err; + /* ki */ + if (gss_get_keyblock(&p, end, &kctx->kc_keyc, keysize)) + goto out_err; + + CDEBUG(D_SEC, "successfully imported v2 context\n"); + return 0; +out_err: + return GSS_S_FAILURE; +} + +/* + * The whole purpose here is trying to keep user level gss context parsing + * from nfs-utils unchanged as possible as we can, they are not quite mature + * yet, and many stuff still not clear, like heimdal etc. + */ +static +__u32 gss_import_sec_context_kerberos(rawobj_t *inbuf, + struct gss_ctx *gctx) +{ + struct krb5_ctx *kctx; + char *p = (char *)inbuf->data; + char *end = (char *)(inbuf->data + inbuf->len); + unsigned int tmp_uint, rc; + + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) { + CERROR("Fail to read version\n"); + return GSS_S_FAILURE; + } + + /* only support 0, 1 for the moment */ + if (tmp_uint > 2) { + CERROR("Invalid version %u\n", tmp_uint); + return GSS_S_FAILURE; + } + + OBD_ALLOC_PTR(kctx); + if (!kctx) + return GSS_S_FAILURE; + + if (tmp_uint == 0 || tmp_uint == 1) { + kctx->kc_initiate = tmp_uint; + rc = import_context_rfc1964(kctx, p, end); + } else { + rc = import_context_rfc4121(kctx, p, end); + } + + if (rc == 0) + rc = krb5_init_keys(kctx); + + if (rc) { + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); + + return GSS_S_FAILURE; + } + + gctx->internal_ctx_id = kctx; + return GSS_S_COMPLETE; +} + +static +__u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx, + struct gss_ctx *gctx_new) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_ctx *knew; + + OBD_ALLOC_PTR(knew); + if (!knew) + return GSS_S_FAILURE; + + knew->kc_initiate = kctx->kc_initiate ? 
0 : 1; + knew->kc_cfx = kctx->kc_cfx; + knew->kc_seed_init = kctx->kc_seed_init; + knew->kc_have_acceptor_subkey = kctx->kc_have_acceptor_subkey; + knew->kc_endtime = kctx->kc_endtime; + + memcpy(knew->kc_seed, kctx->kc_seed, sizeof(kctx->kc_seed)); + knew->kc_seq_send = kctx->kc_seq_recv; + knew->kc_seq_recv = kctx->kc_seq_send; + knew->kc_enctype = kctx->kc_enctype; + + if (rawobj_dup(&knew->kc_mech_used, &kctx->kc_mech_used)) + goto out_err; + + if (gss_keyblock_dup(&knew->kc_keye, &kctx->kc_keye)) + goto out_err; + if (gss_keyblock_dup(&knew->kc_keyi, &kctx->kc_keyi)) + goto out_err; + if (gss_keyblock_dup(&knew->kc_keyc, &kctx->kc_keyc)) + goto out_err; + if (krb5_init_keys(knew)) + goto out_err; + + gctx_new->internal_ctx_id = knew; + CDEBUG(D_SEC, "successfully copied reverse context\n"); + return GSS_S_COMPLETE; + +out_err: + delete_context_kerberos(knew); + OBD_FREE_PTR(knew); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_kerberos(struct gss_ctx *gctx, + time64_t *endtime) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + *endtime = kctx->kc_endtime; + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_kerberos(void *internal_ctx) +{ + struct krb5_ctx *kctx = internal_ctx; + + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); +} + +/* + * compute (keyed/keyless) checksum against the plain text which appended + * with krb5 wire token header. + */ +static +__s32 krb5_make_checksum(__u32 enctype, + struct gss_keyblock *kb, + struct krb5_header *khdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum, + digest_hash hash_func) +{ + struct krb5_enctype *ke = &enctypes[enctype]; + struct ahash_request *req = NULL; + enum cfs_crypto_hash_alg hash_algo; + rawobj_t hdr; + int rc; + + hash_algo = cfs_crypto_hash_alg(ke->ke_hash_name); + + /* For the cbc(des) case we want md5 instead of hmac(md5) */ + if (strcmp(ke->ke_enc_name, "cbc(des)")) + req = cfs_crypto_hash_init(hash_algo, kb->kb_key.data, + kb->kb_key.len); + else + req = cfs_crypto_hash_init(hash_algo, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("failed to alloc hash %s : rc = %d\n", + ke->ke_hash_name, rc); + goto out_no_hash; + } + + cksum->len = cfs_crypto_hash_digestsize(hash_algo); + OBD_ALLOC_LARGE(cksum->data, cksum->len); + if (!cksum->data) { + cksum->len = 0; + rc = -ENOMEM; + goto out_free_hash; + } + + hdr.data = (__u8 *)khdr; + hdr.len = sizeof(*khdr); + + if (!hash_func) { + rc = -EPROTO; + CERROR("hash function for %s undefined\n", + ke->ke_hash_name); + goto out_free_hash; + } + rc = hash_func(req, &hdr, msgcnt, msgs, iovcnt, iovs); + if (rc) + goto out_free_hash; + + if (!ke->ke_hash_hmac) { + LASSERT(kb->kb_tfm); + + cfs_crypto_hash_final(req, cksum->data, &cksum->len); + rc = gss_crypt_generic(kb->kb_tfm, 0, NULL, + cksum->data, cksum->data, + cksum->len); + goto out_no_hash; + } + +out_free_hash: + if (req) + cfs_crypto_hash_final(req, cksum->data, &cksum->len); +out_no_hash: + return rc ? GSS_S_FAILURE : GSS_S_COMPLETE; +} + +static void fill_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + + acceptor_flag = kctx->kc_initiate ? 
0 : FLAG_SENDER_IS_ACCEPTOR; + + if (privacy) { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG); + khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL; + khdr->kh_ec = cpu_to_be16(0); + khdr->kh_rrc = cpu_to_be16(0); + } else { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG); + khdr->kh_flags = acceptor_flag; + khdr->kh_ec = cpu_to_be16(0xffff); + khdr->kh_rrc = cpu_to_be16(0xffff); + } + + khdr->kh_filler = 0xff; + spin_lock(&krb5_seq_lock); + khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++); + spin_unlock(&krb5_seq_lock); +} + +static __u32 verify_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + __u16 tok_id, ec_rrc; + + acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0; + + if (privacy) { + tok_id = KG_TOK_WRAP_MSG; + ec_rrc = 0x0; + } else { + tok_id = KG_TOK_MIC_MSG; + ec_rrc = 0xffff; + } + + /* sanity checks */ + if (be16_to_cpu(khdr->kh_tok_id) != tok_id) { + CERROR("bad token id\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) { + CERROR("bad direction flag\n"); + return GSS_S_BAD_SIG; + } + if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) { + CERROR("missing confidential flag\n"); + return GSS_S_BAD_SIG; + } + if (khdr->kh_filler != 0xff) { + CERROR("bad filler\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if (be16_to_cpu(khdr->kh_ec) != ec_rrc || + be16_to_cpu(khdr->kh_rrc) != ec_rrc) { + CERROR("bad EC or RRC\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + return GSS_S_COMPLETE; +} + +static +__u32 gss_get_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 0); + + /* checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, khdr, + msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + LASSERT(cksum.len >= ke->ke_hash_size); + LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); + memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + token->len = sizeof(*khdr) + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + goto out; + } + + if (token->len < sizeof(*khdr) + ke->ke_hash_size) { + CERROR("short signature: %u, require %d\n", + token->len, (int) sizeof(*khdr) + ke->ke_hash_size); + GOTO(out, major = GSS_S_FAILURE); + } + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + 
LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + GOTO(out_free_cksum, major = GSS_S_BAD_SIG); + } + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); +out: + return major; +} + +/* + * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size. + */ +static +int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) +{ + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int blocksize, i, rc, nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count); + LASSERT(GET_ENC_KIOV(desc)); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + /* encrypt confounder */ + rc = gss_setup_sgtable(&sg_src, &src, confounder, blocksize); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data, blocksize); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); + + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { + sg_init_table(&src, 1); + sg_set_page(&src, BD_GET_KIOV(desc, i).kiov_page, + (BD_GET_KIOV(desc, i).kiov_len + + blocksize - 1) & + (~(blocksize - 1)), + BD_GET_KIOV(desc, i).kiov_offset); + if (adj_nob) + nob += src.length; + sg_init_table(&dst, 1); + sg_set_page(&dst, BD_GET_ENC_KIOV(desc, i).kiov_page, + src.length, src.offset); + + BD_GET_ENC_KIOV(desc, i).kiov_offset = dst.offset; + BD_GET_ENC_KIOV(desc, i).kiov_len = dst.length; + + skcipher_request_set_crypt(req, &src, &dst, + src.length, local_iv); + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + } + + /* encrypt krb5 header */ + rc = gss_setup_sgtable(&sg_src, &src, khdr, sizeof(*khdr)); + if (rc != 0) { + skcipher_request_zero(req); + return rc; + } + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + skcipher_request_zero(req); + return rc; + } + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + sizeof(*khdr), local_iv); + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + skcipher_request_zero(req); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to encrypt krb5 header: %d\n", rc); + return rc; + } + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +/* + * desc->bd_nob_transferred is the size of cipher text received. + * desc->bd_nob is the target size of plain text supposed to be. + * + * if adj_nob != 0, we adjust each page's kiov_len to the actual + * plain text size. 
+ * - for client read: we don't know data size for each page, so + * bd_iov[]->kiov_len is set to PAGE_SIZE, but actual data received might + * be smaller, so we need to adjust it according to + * bd_u.bd_kiov.bd_enc_vec[]->kiov_len. + * this means we DO NOT support the situation that server send an odd size + * data in a page which is not the last one. + * - for server write: we knows exactly data size for each page being expected, + * thus kiov_len is accurate already, so we should not adjust it at all. + * and bd_u.bd_kiov.bd_enc_vec[]->kiov_len should be + * round_up(bd_iov[]->kiov_len) which + * should have been done by prep_bulk(). + */ +static +int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain, + int adj_nob) +{ + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count); + LASSERT(GET_ENC_KIOV(desc)); + LASSERT(desc->bd_nob_transferred); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } + + /* decrypt head (confounder) */ + rc = gss_setup_sgtable(&sg_src, &src, cipher->data, blocksize); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, plain->data, blocksize); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); + + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; + i++) { + if (BD_GET_ENC_KIOV(desc, i).kiov_offset % blocksize != 0 || + BD_GET_ENC_KIOV(desc, i).kiov_len % blocksize != 0) { + CERROR("page %d: odd offset %u len %u, blocksize %d\n", + i, BD_GET_ENC_KIOV(desc, i).kiov_offset, + BD_GET_ENC_KIOV(desc, i).kiov_len, + blocksize); + skcipher_request_zero(req); + return -EFAULT; + } + + if (adj_nob) { + if (ct_nob + BD_GET_ENC_KIOV(desc, i).kiov_len > + desc->bd_nob_transferred) + BD_GET_ENC_KIOV(desc, i).kiov_len = + desc->bd_nob_transferred - ct_nob; + + BD_GET_KIOV(desc, i).kiov_len = + BD_GET_ENC_KIOV(desc, i).kiov_len; + if (pt_nob + BD_GET_ENC_KIOV(desc, i).kiov_len > + desc->bd_nob) + BD_GET_KIOV(desc, i).kiov_len = + desc->bd_nob - pt_nob; + } else { + /* this should be guaranteed by LNET */ + LASSERT(ct_nob + BD_GET_ENC_KIOV(desc, i). 
+ kiov_len <= + desc->bd_nob_transferred); + LASSERT(BD_GET_KIOV(desc, i).kiov_len <= + BD_GET_ENC_KIOV(desc, i).kiov_len); + } + + if (BD_GET_ENC_KIOV(desc, i).kiov_len == 0) + continue; + + sg_init_table(&src, 1); + sg_set_page(&src, BD_GET_ENC_KIOV(desc, i).kiov_page, + BD_GET_ENC_KIOV(desc, i).kiov_len, + BD_GET_ENC_KIOV(desc, i).kiov_offset); + dst = src; + if (BD_GET_KIOV(desc, i).kiov_len % blocksize == 0) + sg_assign_page(&dst, + BD_GET_KIOV(desc, i).kiov_page); + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + + if (BD_GET_KIOV(desc, i).kiov_len % blocksize != 0) { + memcpy(page_address(BD_GET_KIOV(desc, i).kiov_page) + + BD_GET_KIOV(desc, i).kiov_offset, + page_address(BD_GET_ENC_KIOV(desc, i). + kiov_page) + + BD_GET_KIOV(desc, i).kiov_offset, + BD_GET_KIOV(desc, i).kiov_len); + } + + ct_nob += BD_GET_ENC_KIOV(desc, i).kiov_len; + pt_nob += BD_GET_KIOV(desc, i).kiov_len; + } + + if (unlikely(ct_nob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, ct_nob); + skcipher_request_zero(req); + return -EFAULT; + } + + if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pt_nob); + skcipher_request_zero(req); + return -EFAULT; + } + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + BD_GET_KIOV(desc, i++).kiov_len = 0; + + /* decrypt tail (krb5 header) */ + rc = gss_setup_sgtable(&sg_src, &src, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + gss_teardown_sgtable(&sg_src); + gss_teardown_sgtable(&sg_dst); + + skcipher_request_zero(req); + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } + + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } + + return 0; +} + +static +__u32 gss_wrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[3], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + __u8 local_iv[16] = {0}; + u32 major; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + LASSERT(kctx->kc_keye.kb_tfm == NULL || + ke->ke_conf_size >= + crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm)); + + /* + * final token format: + * --------------------------------------------------- + * | krb5 header | cipher text | checksum (16 bytes) | + * --------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. 
note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize( + kctx->kc_keye.kb_tfm); + } + LASSERT(blocksize <= ke->ke_conf_size); + + /* padding the message */ + if (gss_add_padding(msg, msg_buflen, blocksize)) + return GSS_S_FAILURE; + + /* + * clear text layout for checksum: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = gsshdr->data; + data_desc[1].len = gsshdr->len; + data_desc[2].data = msg->data; + data_desc[2].len = msg->len; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, data_desc, 0, NULL, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = msg->data; + data_desc[1].len = msg->len; + data_desc[2].data = (__u8 *) khdr; + data_desc[2].len = sizeof(*khdr); + + /* cipher text will be directly inplace */ + cipher.data = (__u8 *)(khdr + 1); + cipher.len = token->len - sizeof(*khdr); + LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye = RAWOBJ_EMPTY; + struct crypto_sync_skcipher *arc4_tfm; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = gss_crypt_rawobjs(arc4_tfm, NULL, 3, data_desc, + &cipher, 1); +arc4_out_tfm: + crypto_free_sync_skcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); + } else { + rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 3, + data_desc, &cipher, 1); + } + + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + int blocksize, i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count); + LASSERT(GET_ENC_KIOV(desc)); + LASSERT(kctx->kc_keye.kb_tfm); + + blocksize = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; 
i++) { + LASSERT(BD_GET_ENC_KIOV(desc, i).kiov_page); + /* + * offset should always start at page boundary of either + * client or server side. + */ + if (BD_GET_KIOV(desc, i).kiov_offset & blocksize) { + CERROR("odd offset %d in page %d\n", + BD_GET_KIOV(desc, i).kiov_offset, i); + return GSS_S_FAILURE; + } + + BD_GET_ENC_KIOV(desc, i).kiov_offset = + BD_GET_KIOV(desc, i).kiov_offset; + BD_GET_ENC_KIOV(desc, i).kiov_len = + (BD_GET_KIOV(desc, i).kiov_len + + blocksize - 1) & (~(blocksize - 1)); + } + + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + u32 major; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. 
+ * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksz <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksz + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, GET_KIOV(desc), + &cksum, gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + cipher.data = (__u8 *)(khdr + 1); + cipher.len = blocksz + sizeof(*khdr); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); + } + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_unwrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *msg) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + unsigned char *tmpbuf; + int blocksz, bodysize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher_in, plain_out; + rawobj_t hash_objs[3]; + int rc = 0; + __u32 major; + __u8 local_iv[16] = {0}; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* expected token layout: + * ---------------------------------------- + * | krb5 header | cipher text | checksum | + * ---------------------------------------- + */ + bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; + + if (bodysize % blocksz) { + CERROR("odd bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { + CERROR("incomplete token: bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { + CERROR("buffer too 
small: %u, require %d\n", + msg->len, bodysize - ke->ke_conf_size); + return GSS_S_FAILURE; + } + + /* decrypting */ + OBD_ALLOC_LARGE(tmpbuf, bodysize); + if (!tmpbuf) + return GSS_S_FAILURE; + + major = GSS_S_FAILURE; + + cipher_in.data = (__u8 *)(khdr + 1); + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye; + struct crypto_sync_skcipher *arc4_tfm; + + cksum.data = token->data + token->len - ke->ke_hash_size; + cksum.len = ke->ke_hash_size; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out, rc = -EACCES); + } + + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = gss_crypt_rawobjs(arc4_tfm, NULL, 1, &cipher_in, + &plain_out, 0); +arc4_out_tfm: + crypto_free_sync_skcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); +arc4_out: + cksum = RAWOBJ_EMPTY; + } else { + rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 1, + &cipher_in, &plain_out, 0); + } + + if (rc != 0) { + CERROR("error decrypt\n"); + goto out_free; + } + LASSERT(plain_out.len == bodysize); + + /* expected clear text layout: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + + /* verify krb5 header in token is not modified */ + if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), + sizeof(*khdr))) { + CERROR("decrypted krb5 header mismatch\n"); + goto out_free; + } + + /* verify checksum, compose clear text as layout: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + hash_objs[0].len = ke->ke_conf_size; + hash_objs[0].data = plain_out.data; + hash_objs[1].len = gsshdr->len; + hash_objs[1].data = gsshdr->data; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); + hash_objs[2].data = plain_out.data + ke->ke_conf_size; + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, hash_objs, 0, NULL, &cksum, + gctx->hash_func)) + goto out_free; + + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp((char *)(khdr + 1) + bodysize, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + goto out_free; + } + + msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); + memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(tmpbuf, bodysize); + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", 
token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksz + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksz + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; + + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain, adj_nob); + if (rc) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksz; + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, + GET_KIOV(desc), + &cksum, gctx->hash_func)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + if (memcmp(plain.data + blocksz + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } + + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +int gss_display_kerberos(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int written; + + written = snprintf(buf, bufsize, "krb5 (%s)", + enctype2str(kctx->kc_enctype)); + return written; +} + +static struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_copy_reverse_context = gss_copy_reverse_context_kerberos, + .gss_inquire_context = gss_inquire_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, + .gss_prep_bulk = gss_prep_bulk_kerberos, + .gss_wrap_bulk = gss_wrap_bulk_kerberos, + .gss_unwrap_bulk = gss_unwrap_bulk_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, + .gss_display = gss_display_kerberos, +}; + +static struct subflavor_desc gss_kerberos_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5N, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "krb5n" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5A, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "krb5a" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5I, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "krb5i" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5P, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_PRIV, + .sf_name = "krb5p" + }, +}; + +static struct gss_api_mech gss_kerberos_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "krb5", + .gm_oid = (rawobj_t) + {9, "\052\206\110\206\367\022\001\002\002"}, + .gm_ops = 
&gss_kerberos_ops, + .gm_sf_num = 4, + .gm_sfs = gss_kerberos_sfs, +}; + +int __init init_kerberos_module(void) +{ + int status; + + spin_lock_init(&krb5_seq_lock); + + status = lgss_mech_register(&gss_kerberos_mech); + if (status) + CERROR("Failed to register kerberos gss mechanism!\n"); + return status; +} + +void cleanup_kerberos_module(void) +{ + lgss_mech_unregister(&gss_kerberos_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c new file mode 100644 index 0000000000000..3ee125f1070bf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c @@ -0,0 +1,361 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_crypto.h" + +static struct list_head registered_mechs = LIST_HEAD_INIT(registered_mechs); +static DEFINE_SPINLOCK(registered_mechs_lock); + +int lgss_mech_register(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + CDEBUG(D_SEC, "register %s mechanism\n", gm->gm_name); + return 0; +} + +void lgss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + CDEBUG(D_SEC, "Unregister %s mechanism\n", gm->gm_name); +} + + +struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +struct gss_api_mech *lgss_name_to_mech(char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (!try_module_get(pos->gm_owner)) + continue; + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +static inline +int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor) +{ + int i; + + for (i = 0; i < gm->gm_sf_num; i++) { + if (gm->gm_sfs[i].sf_subflavor == subflavor) + return 1; + } + return 0; +} + +struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!try_module_get(pos->gm_owner)) + continue; + if (!mech_supports_subflavor(pos, subflavor)) { + module_put(pos->gm_owner); + continue; + } + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + +void lgss_mech_put(struct gss_api_mech *gm) +{ + module_put(gm->gm_owner); +} + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. */ +__u32 lgss_import_sec_context(rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id) +{ + OBD_ALLOC_PTR(*ctx_id); + if (*ctx_id == NULL) + return GSS_S_FAILURE; + + (*ctx_id)->mech_type = lgss_mech_get(mech); + (*ctx_id)->hash_func = gss_digest_hash; + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); +} + +__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id, + struct gss_ctx **ctx_id_new) +{ + struct gss_api_mech *mech = ctx_id->mech_type; + __u32 major; + + LASSERT(mech); + + OBD_ALLOC_PTR(*ctx_id_new); + if (*ctx_id_new == NULL) + return GSS_S_FAILURE; + + (*ctx_id_new)->mech_type = lgss_mech_get(mech); + (*ctx_id_new)->hash_func = ctx_id->hash_func; + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_copy_reverse_context); + + major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); + if (major != GSS_S_COMPLETE) { + lgss_mech_put(mech); + OBD_FREE_PTR(*ctx_id_new); + *ctx_id_new = NULL; + } + return major; +} + +/* + * this interface is much simplified, currently we only need endtime. 
+ */ +__u32 lgss_inquire_context(struct gss_ctx *context_handle, + time64_t *endtime) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context); + + return context_handle->mech_type->gm_ops + ->gss_inquire_context(context_handle, + endtime); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ +__u32 lgss_get_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_get_mic); + + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. */ +__u32 lgss_verify_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic); + + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +__u32 lgss_wrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap); + + return context_handle->mech_type->gm_ops + ->gss_wrap(context_handle, gsshdr, msg, msg_buflen, out_token); +} + +__u32 lgss_unwrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap); + + return context_handle->mech_type->gm_ops + ->gss_unwrap(context_handle, gsshdr, token, out_msg); +} + + +__u32 lgss_prep_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk); + + return context_handle->mech_type->gm_ops + ->gss_prep_bulk(context_handle, desc); +} + +__u32 lgss_wrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_wrap_bulk(context_handle, desc, token, adj_nob); +} + +__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_unwrap_bulk(context_handle, desc, token, adj_nob); +} + +/* gss_delete_sec_context: free all resources associated with context_handle. 
+ * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. */ + +__u32 lgss_delete_sec_context(struct gss_ctx **context_handle) +{ + struct gss_api_mech *mech; + + if (!*context_handle) + return GSS_S_NO_CONTEXT; + + CDEBUG(D_SEC, "deleting %p\n", *context_handle); + + mech = (*context_handle)->mech_type; + if ((*context_handle)->internal_ctx_id != NULL) { + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_delete_sec_context); + mech->gm_ops->gss_delete_sec_context( + (*context_handle)->internal_ctx_id); + } + if (mech) + lgss_mech_put(mech); + + OBD_FREE_PTR(*context_handle); + *context_handle = NULL; + return GSS_S_COMPLETE; +} + +int lgss_display(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + LASSERT(ctx); + LASSERT(ctx->mech_type); + LASSERT(ctx->mech_type->gm_ops); + LASSERT(ctx->mech_type->gm_ops->gss_display); + + return ctx->mech_type->gm_ops->gss_display(ctx, buf, bufsize); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c new file mode 100644 index 0000000000000..1e946f8ba2aff --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c @@ -0,0 +1,219 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, 2015, Trustees of Indiana University + * + * Copyright (c) 2014, Intel Corporation. 
+ * + * Author: Jeremy Filizetti + * Author: Andrew Korty + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" + +struct null_ctx { + __u64 nc_token; +}; + +static +__u32 gss_import_sec_context_null(rawobj_t *inbuf, struct gss_ctx *gss_context) +{ + struct null_ctx *null_context; + + if (inbuf == NULL || inbuf->data == NULL || + inbuf->len != sizeof(*null_context)) { + CDEBUG(D_SEC, "Invalid input buffer for null context\n"); + return GSS_S_FAILURE; + } + + OBD_ALLOC_PTR(null_context); + if (null_context == NULL) + return GSS_S_FAILURE; + + memcpy(&null_context->nc_token, inbuf->data, inbuf->len); + + gss_context->internal_ctx_id = null_context; + CDEBUG(D_SEC, "successfully imported null context\n"); + + return GSS_S_COMPLETE; +} + +static +__u32 gss_copy_reverse_context_null(struct gss_ctx *gss_context_old, + struct gss_ctx *gss_context_new) +{ + struct null_ctx *null_context_old; + struct null_ctx *null_context_new; + + OBD_ALLOC_PTR(null_context_new); + if (null_context_new == NULL) + return GSS_S_FAILURE; + + null_context_old = gss_context_old->internal_ctx_id; + memcpy(null_context_new, null_context_old, sizeof(*null_context_new)); + gss_context_new->internal_ctx_id = null_context_new; + CDEBUG(D_SEC, "successfully copied reverse null context\n"); + + return GSS_S_COMPLETE; +} + +static +__u32 gss_inquire_context_null(struct gss_ctx *gss_context, + time64_t *endtime) +{ + /* quick timeout for testing purposes */ + *endtime = ktime_get_real_seconds() + 60; + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_null(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *message, int message_buffer_length, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_null(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *token, rawobj_t *message) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, rawobj_t *token, + int adj_nob) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_null(void *internal_context) +{ + struct null_ctx *null_context = internal_context; + + OBD_FREE_PTR(null_context); +} + +int gss_display_null(struct gss_ctx *gss_context, char *buf, int bufsize) +{ + return snprintf(buf, bufsize, "null"); +} + +static +__u32 gss_get_mic_null(struct gss_ctx *gss_context, int message_count, + rawobj_t *messages, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_verify_mic_null(struct gss_ctx *gss_context, int message_count, + rawobj_t *messages, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static struct gss_api_ops gss_null_ops = { + .gss_import_sec_context = gss_import_sec_context_null, + .gss_copy_reverse_context = gss_copy_reverse_context_null, + .gss_inquire_context = gss_inquire_context_null, + .gss_get_mic = gss_get_mic_null, + .gss_verify_mic = gss_verify_mic_null, + .gss_wrap = gss_wrap_null, + .gss_unwrap = gss_unwrap_null, + .gss_prep_bulk = gss_prep_bulk_null, + .gss_wrap_bulk = gss_wrap_bulk_null, 
+ .gss_unwrap_bulk = gss_unwrap_bulk_null, + .gss_delete_sec_context = gss_delete_sec_context_null, + .gss_display = gss_display_null, +}; + +static struct subflavor_desc gss_null_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_GSSNULL, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "gssnull" + }, +}; + +static struct gss_api_mech gss_null_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "gssnull", + .gm_oid = (rawobj_t) { + 12, + "\053\006\001\004\001\311\146\215\126\001\000\000" + }, + .gm_ops = &gss_null_ops, + .gm_sf_num = 1, + .gm_sfs = gss_null_sfs, +}; + +int __init init_null_module(void) +{ + int status; + + status = lgss_mech_register(&gss_null_mech); + if (status) + CERROR("Failed to register null gss mechanism!\n"); + + return status; +} + +void cleanup_null_module(void) +{ + lgss_mech_unregister(&gss_null_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c new file mode 100644 index 0000000000000..5e1e7caa1aae6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c @@ -0,0 +1,1254 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +struct rpc_clnt; /* for rpc_pipefs */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct ptlrpc_sec_policy gss_policy_pipefs; +static struct ptlrpc_ctx_ops gss_pipefs_ctxops; + +static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx); + +static int gss_sec_pipe_upcall_init(struct gss_sec *gsec) +{ + return 0; +} + +static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec) +{ +} + +/**************************************** + * internal context helpers * + ****************************************/ + +static +struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx; + int rc; + + OBD_ALLOC_PTR(gctx); + if (gctx == NULL) + return NULL; + + rc = gss_cli_ctx_init_common(sec, &gctx->gc_base, + &gss_pipefs_ctxops, vcred); + if (rc) { + OBD_FREE_PTR(gctx); + return NULL; + } + + return &gctx->gc_base; +} + +static +void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash) +{ + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); + hlist_add_head(&ctx->cc_cache, hash); +} + +/* + * caller must hold spinlock + */ +static +void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist) +{ + assert_spin_locked(&ctx->cc_sec->ps_lock); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + + clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + + if (atomic_dec_and_test(&ctx->cc_refcount)) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, freelist); + } else { + hlist_del_init(&ctx->cc_cache); + } +} + +/* + * return 1 if the context is dead. 
+ */ +static +int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + if (cli_ctx_check_death(ctx)) { + if (freelist) + ctx_unhash_pf(ctx, freelist); + return 1; + } + + return 0; +} + +static inline +int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + + return ctx_check_death_pf(ctx, freelist); +} + +static inline +int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + /* a little bit optimization for null policy */ + if (!ctx->cc_ops->match) + return 1; + + return ctx->cc_ops->match(ctx, vcred); +} + +static +void ctx_list_destroy_pf(struct hlist_head *head) +{ + struct ptlrpc_cli_ctx *ctx; + + while (!hlist_empty(head)) { + ctx = cfs_hlist_entry(head->first, struct ptlrpc_cli_ctx, + cc_cache); + + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, + &ctx->cc_flags) == 0); + + hlist_del_init(&ctx->cc_cache); + ctx_destroy_pf(ctx->cc_sec, ctx); + } +} + +/**************************************** + * context apis * + ****************************************/ + +static +int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_check_death_pf(ctx, NULL)) + return 1; + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + cli_ctx_expire(ctx); + + spin_lock(&ctx->cc_sec->ps_lock); + + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) { + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + LASSERT(atomic_read(&ctx->cc_refcount) > 1); + + hlist_del_init(&ctx->cc_cache); + if (atomic_dec_and_test(&ctx->cc_refcount)) + LBUG(); + } + + spin_unlock(&ctx->cc_sec->ps_lock); +} + +/**************************************** + * reverse context installation * + ****************************************/ + +static inline +unsigned int ctx_hash_index(int hashsize, __u64 key) +{ + return (unsigned int) (key & ((__u64) hashsize - 1)); +} + +static +void gss_sec_ctx_replace_pf(struct gss_sec *gsec, + struct ptlrpc_cli_ctx *new) +{ + struct hlist_node __maybe_unused *pos, *next; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + HLIST_HEAD(freelist); + unsigned int hash; + ENTRY; + + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) new->cc_vcred.vc_uid); + LASSERT(hash < gsec_pf->gsp_chash_size); + + spin_lock(&gsec->gs_base.ps_lock); + + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[hash], cc_cache) { + if (!ctx_match_pf(ctx, &new->cc_vcred)) + continue; + + cli_ctx_expire(ctx); + ctx_unhash_pf(ctx, &freelist); + break; + } + + ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]); + + spin_unlock(&gsec->gs_base.ps_lock); + + ctx_list_destroy_pf(&freelist); + EXIT; +} + +static +int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct vfs_cred vcred; + struct ptlrpc_cli_ctx *cli_ctx; + int rc; + ENTRY; + + vcred.vc_uid = 0; + vcred.vc_gid = 0; + + cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred); + if (!cli_ctx) + RETURN(-ENOMEM); + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx); + RETURN(rc); + } + + gss_sec_ctx_replace_pf(gsec, cli_ctx); + RETURN(0); +} + +static +void 
gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf, + struct hlist_head *freelist) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node __maybe_unused *pos; + struct hlist_node *next; + int i; + ENTRY; + + sec = &gsec_pf->gsp_base.gs_base; + + CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec); + + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[i], cc_cache) + ctx_check_death_locked_pf(ctx, freelist); + } + + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; + EXIT; +} + +static +struct ptlrpc_sec* gss_sec_create_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_pipefs *gsec_pf; + int alloc_size, hash_size, i; + ENTRY; + +#define GSS_SEC_PIPEFS_CTX_HASH_SIZE (32) + + if (ctx || + sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE)) + hash_size = 1; + else + hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE; + + alloc_size = sizeof(*gsec_pf) + + sizeof(struct hlist_head) * hash_size; + + OBD_ALLOC(gsec_pf, alloc_size); + if (!gsec_pf) + RETURN(NULL); + + gsec_pf->gsp_chash_size = hash_size; + for (i = 0; i < hash_size; i++) + INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]); + + if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs, + imp, ctx, sf)) + goto err_free; + + if (ctx == NULL) { + if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base)) + goto err_destroy; + } else { + if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx)) + goto err_destroy; + } + + RETURN(&gsec_pf->gsp_base.gs_base); + +err_destroy: + gss_sec_destroy_common(&gsec_pf->gsp_base); +err_free: + OBD_FREE(gsec_pf, alloc_size); + RETURN(NULL); +} + +static +void gss_sec_destroy_pf(struct ptlrpc_sec *sec) +{ + struct gss_sec_pipefs *gsec_pf; + struct gss_sec *gsec; + + CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + LASSERT(gsec_pf->gsp_chash); + LASSERT(gsec_pf->gsp_chash_size); + + gss_sec_pipe_upcall_fini(gsec); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec, sizeof(*gsec_pf) + + sizeof(struct hlist_head) * gsec_pf->gsp_chash_size); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx = NULL, *new = NULL; + struct hlist_head *hash_head; + struct hlist_node __maybe_unused *pos, *next; + unsigned int hash, gc = 0, found = 0; + HLIST_HEAD(freelist); + ENTRY; + + might_sleep(); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) vcred->vc_uid); + hash_head = &gsec_pf->gsp_chash[hash]; + LASSERT(hash < gsec_pf->gsp_chash_size); + +retry: + spin_lock(&sec->ps_lock); + + /* gc_next == 0 means never do gc */ + if (remove_dead && sec->ps_gc_next && + (ktime_get_real_seconds() > sec->ps_gc_next)) { + gss_ctx_cache_gc_pf(gsec_pf, &freelist); + gc = 1; + } + + cfs_hlist_for_each_entry_safe(ctx, pos, next, hash_head, cc_cache) { + if (gc == 0 && + ctx_check_death_locked_pf(ctx, + remove_dead ? 
&freelist : NULL)) + continue; + + if (ctx_match_pf(ctx, vcred)) { + found = 1; + break; + } + } + + if (found) { + if (new && new != ctx) { + /* lost the race, just free it */ + hlist_add_head(&new->cc_cache, &freelist); + new = NULL; + } + + /* hot node, move to head */ + if (hash_head->first != &ctx->cc_cache) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, hash_head); + } + } else { + /* don't allocate for reverse sec */ + if (sec_is_reverse(sec)) { + spin_unlock(&sec->ps_lock); + RETURN(NULL); + } + + if (new) { + ctx_enhash_pf(new, hash_head); + ctx = new; + } else if (create) { + spin_unlock(&sec->ps_lock); + new = ctx_create_pf(sec, vcred); + if (new) { + clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags); + goto retry; + } + } else { + ctx = NULL; + } + } + + /* hold a ref */ + if (ctx) + atomic_inc(&ctx->cc_refcount); + + spin_unlock(&sec->ps_lock); + + /* the allocator of the context must give the first push to refresh */ + if (new) { + LASSERT(new == ctx); + gss_cli_ctx_refresh_pf(new); + } + + ctx_list_destroy_pf(&freelist); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(hlist_unhashed(&ctx->cc_cache)); + + /* if required async, we must clear the UPTODATE bit to prevent extra + * rpcs during destroy procedure. */ + if (!sync) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + /* destroy this context */ + ctx_destroy_pf(sec, ctx); +} + +/* + * @uid: which user. "-1" means flush all. + * @grace: mark context DEAD, allow graceful destroy like notify + * server side, etc. + * @force: also flush busy entries. + * + * return the number of busy context encountered. + * + * In any cases, never touch "eternal" contexts. 
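 *
 * For illustration (not from this change): like ctx_unhash_pf() and
 * ctx_list_destroy_pf() above, this function unlinks victims onto a private
 * freelist while holding the spinlock and only destroys them after the lock
 * is dropped, so teardown work never runs under the lock.  A minimal
 * userspace sketch of that pattern, names invented for the example:
 *
 *   #include <pthread.h>
 *   #include <stdlib.h>
 *
 *   struct node { struct node *next; int uid; };
 *
 *   static struct node *cache;                   // protected by cache_lock
 *   static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
 *
 *   static void flush_uid(int uid)               // uid == -1 flushes all
 *   {
 *           struct node **pp, *victim, *freelist = NULL;
 *
 *           pthread_mutex_lock(&cache_lock);
 *           for (pp = &cache; (victim = *pp) != NULL; ) {
 *                   if (uid != -1 && victim->uid != uid) {
 *                           pp = &victim->next;
 *                           continue;
 *                   }
 *                   *pp = victim->next;           // unlink under the lock
 *                   victim->next = freelist;      // park on local freelist
 *                   freelist = victim;
 *           }
 *           pthread_mutex_unlock(&cache_lock);
 *
 *           while (freelist) {                    // destroy outside the lock
 *                   victim = freelist;
 *                   freelist = victim->next;
 *                   free(victim);
 *           }
 *   }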
+ */ +static +int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node __maybe_unused *pos, *next; + HLIST_HEAD(freelist); + int i, busy = 0; + ENTRY; + + might_sleep_if(grace); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + spin_lock(&sec->ps_lock); + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[i], + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + if (atomic_read(&ctx->cc_refcount) > 1) { + busy++; + if (!force) + continue; + + CWARN("flush busy(%d) ctx %p(%u->%s) by force, " + "grace %d\n", + atomic_read(&ctx->cc_refcount), + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), grace); + } + ctx_unhash_pf(ctx, &freelist); + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, + &ctx->cc_flags); + } + } + spin_unlock(&sec->ps_lock); + + ctx_list_destroy_pf(&freelist); + RETURN(busy); +} + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_pf(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_pipefs, req); +} + +static +int gss_svc_install_rctx_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx); + + sptlrpc_sec_put(sec); + return rc; +} + +/**************************************** + * rpc_pipefs definitions * + ****************************************/ + +#define LUSTRE_PIPE_ROOT "/lustre" +#define LUSTRE_PIPE_KRB5 LUSTRE_PIPE_ROOT"/krb5" + +struct gss_upcall_msg_data { + __u32 gum_seq; + __u32 gum_uid; + __u32 gum_gid; + __u32 gum_svc; /* MDS/OSS... 
*/ + __u64 gum_nid; /* peer NID */ + __u8 gum_obd[64]; /* client obd name */ +}; + +struct gss_upcall_msg { + struct rpc_pipe_msg gum_base; + atomic_t gum_refcount; + struct list_head gum_list; + __u32 gum_mechidx; + struct gss_sec *gum_gsec; + struct gss_cli_ctx *gum_gctx; + struct gss_upcall_msg_data gum_data; +}; + +static atomic_t upcall_seq = ATOMIC_INIT(0); + +static inline +__u32 upcall_get_sequence(void) +{ + return (__u32) atomic_inc_return(&upcall_seq); +} + +enum mech_idx_t { + MECH_KRB5 = 0, + MECH_MAX +}; + +static inline +__u32 mech_name2idx(const char *name) +{ + LASSERT(!strcmp(name, "krb5")); + return MECH_KRB5; +} + +/* pipefs dentries for each mechanisms */ +static struct dentry *de_pipes[MECH_MAX] = { NULL, }; +/* all upcall messgaes linked here */ +static struct list_head upcall_lists[MECH_MAX]; +/* and protected by this */ +static spinlock_t upcall_locks[MECH_MAX]; + +static inline +void upcall_list_lock(int idx) +{ + spin_lock(&upcall_locks[idx]); +} + +static inline +void upcall_list_unlock(int idx) +{ + spin_unlock(&upcall_locks[idx]); +} + +static +void upcall_msg_enlist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_add(&msg->gum_list, &upcall_lists[idx]); + upcall_list_unlock(idx); +} + +static +void upcall_msg_delist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_del_init(&msg->gum_list); + upcall_list_unlock(idx); +} + +/**************************************** + * rpc_pipefs upcall helpers * + ****************************************/ + +static +void gss_release_msg(struct gss_upcall_msg *gmsg) +{ + ENTRY; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + if (!atomic_dec_and_test(&gmsg->gum_refcount)) { + EXIT; + return; + } + + if (gmsg->gum_gctx) { + sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base); + sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1); + gmsg->gum_gctx = NULL; + } + + LASSERT(list_empty(&gmsg->gum_list)); + LASSERT(list_empty(&gmsg->gum_base.list)); + OBD_FREE_PTR(gmsg); + EXIT; +} + +static +void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + assert_spin_locked(&upcall_locks[idx]); + + if (list_empty(&gmsg->gum_list)) + return; + + list_del_init(&gmsg->gum_list); + LASSERT(atomic_read(&gmsg->gum_refcount) > 1); + atomic_dec(&gmsg->gum_refcount); +} + +static +void gss_unhash_msg(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + upcall_list_lock(idx); + gss_unhash_msg_nolock(gmsg); + upcall_list_unlock(idx); +} + +static +void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg) +{ + if (gmsg->gum_gctx) { + struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + sptlrpc_cli_ctx_expire(ctx); + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } +} + +static +struct gss_upcall_msg * gss_find_upcall(__u32 mechidx, __u32 seq) +{ + struct gss_upcall_msg *gmsg; + + upcall_list_lock(mechidx); + list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) { + if (gmsg->gum_data.gum_seq != seq) + continue; + + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + LASSERT(gmsg->gum_mechidx == mechidx); + + atomic_inc(&gmsg->gum_refcount); + upcall_list_unlock(mechidx); + return gmsg; + } + upcall_list_unlock(mechidx); + return NULL; +} + +static +int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("buflen %u < %u\n", *buflen, reslen); + return 
-EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} + +/**************************************** + * rpc_pipefs apis * + ****************************************/ + +static +ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len; + ssize_t left; + ENTRY; + + if (mlen > buflen) + mlen = buflen; + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = left; + RETURN(left); + } + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + RETURN(mlen); +} + +static +ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(file_inode(filp)); + struct gss_upcall_msg *gss_msg; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx = NULL; + char *buf, *data; + int datalen; + int timeout, rc; + __u32 mechidx, seq, gss_err; + ENTRY; + + mechidx = (__u32) (long) rpci->private; + LASSERT(mechidx < MECH_MAX); + + OBD_ALLOC(buf, mlen); + if (!buf) + RETURN(-ENOMEM); + + if (copy_from_user(buf, src, mlen)) { + CERROR("failed copy user space data\n"); + GOTO(out_free, rc = -EFAULT); + } + data = buf; + datalen = mlen; + + /* data passed down format: + * - seq + * - timeout + * - gc_win / error + * - wire_ctx (rawobj) + * - mech_ctx (rawobj) + */ + if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) { + CERROR("fail to get seq\n"); + GOTO(out_free, rc = -EFAULT); + } + + gss_msg = gss_find_upcall(mechidx, seq); + if (!gss_msg) { + CERROR("upcall %u has aborted earlier\n", seq); + GOTO(out_free, rc = -EINVAL); + } + + gss_unhash_msg(gss_msg); + gctx = gss_msg->gum_gctx; + LASSERT(gctx); + LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0); + + /* timeout is not in use for now */ + if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout))) + GOTO(out_msg, rc = -EFAULT); + + /* lgssd signal an error by gc_win == 0 */ + if (simple_get_bytes(&data, &datalen, &gctx->gc_win, + sizeof(gctx->gc_win))) + GOTO(out_msg, rc = -EFAULT); + + if (gctx->gc_win == 0) { + /* followed by: + * - rpc error + * - gss error + */ + if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc))) + GOTO(out_msg, rc = -EFAULT); + if (simple_get_bytes(&data, &datalen, &gss_err,sizeof(gss_err))) + GOTO(out_msg, rc = -EFAULT); + + if (rc == 0 && gss_err == GSS_S_COMPLETE) { + CWARN("both rpc & gss error code not set\n"); + rc = -EPERM; + } + } else { + rawobj_t tmpobj; + + /* handle */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + if (rawobj_dup(&gctx->gc_handle, &tmpobj)) + GOTO(out_msg, rc = -ENOMEM); + + /* mechctx */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + gss_err = lgss_import_sec_context(&tmpobj, + gss_msg->gum_gsec->gs_mech, + &gctx->gc_mechctx); + rc = 0; + } + + if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) { + gss_cli_ctx_uptodate(gctx); + } else { + ctx = &gctx->gc_base; + sptlrpc_cli_ctx_expire(ctx); + if (rc != -ERESTART || gss_err != GSS_S_COMPLETE) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + + CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n", + ctx, ctx->cc_vcred.vc_uid, rc, gss_err, + test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ? 
+ "fatal error" : "non-fatal"); + } + + rc = mlen; + +out_msg: + gss_release_msg(gss_msg); + +out_free: + OBD_FREE(buf, mlen); + /* FIXME + * hack pipefs: always return asked length unless all following + * downcalls might be messed up. */ + rc = mlen; + RETURN(rc); +} + +static +void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + static time64_t ratelimit; + ENTRY; + + LASSERT(list_empty(&msg->list)); + + /* normally errno is >= 0 */ + if (msg->errno >= 0) { + EXIT; + return; + } + + gmsg = container_of(msg, struct gss_upcall_msg, gum_base); + gumd = &gmsg->gum_data; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + CERROR("failed msg %p (seq %u, uid %u, svc %u, nid %#llx, obd %.*s): " + "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd, msg->errno); + + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg(gmsg); + if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + time64_t now = ktime_get_real_seconds(); + + if (now > ratelimit) { + CWARN("upcall timed out, is lgssd running?\n"); + ratelimit = now + 15; + } + } + gss_msg_fail_ctx(gmsg); + gss_release_msg(gmsg); + EXIT; +} + +static +void gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + __u32 idx; + ENTRY; + + idx = (__u32) (long) rpci->private; + LASSERT(idx < MECH_MAX); + + upcall_list_lock(idx); + while (!list_empty(&upcall_lists[idx])) { + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + + gmsg = list_entry(upcall_lists[idx].next, + struct gss_upcall_msg, gum_list); + gumd = &gmsg->gum_data; + LASSERT(list_empty(&gmsg->gum_base.list)); + + CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, " + "nid %#llx, obd %.*s\n", gmsg, + gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd); + + gmsg->gum_base.errno = -EPIPE; + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg_nolock(gmsg); + + gss_msg_fail_ctx(gmsg); + + upcall_list_unlock(idx); + gss_release_msg(gmsg); + upcall_list_lock(idx); + } + upcall_list_unlock(idx); + EXIT; +} + +static struct rpc_pipe_ops gss_upcall_ops = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .release_pipe = gss_pipe_release, +}; + +/**************************************** + * upcall helper functions * + ****************************************/ + +static +int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + struct obd_import *imp; + struct gss_sec *gsec; + struct gss_upcall_msg *gmsg; + int rc = 0; + ENTRY; + + might_sleep(); + + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_import); + LASSERT(ctx->cc_sec->ps_import->imp_obd); + + imp = ctx->cc_sec->ps_import; + if (!imp->imp_connection) { + CERROR("import has no connection set\n"); + RETURN(-EINVAL); + } + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + + OBD_ALLOC_PTR(gmsg); + if (!gmsg) + RETURN(-ENOMEM); + + /* initialize pipefs base msg */ + INIT_LIST_HEAD(&gmsg->gum_base.list); + gmsg->gum_base.data = &gmsg->gum_data; + gmsg->gum_base.len = sizeof(gmsg->gum_data); + gmsg->gum_base.copied = 0; + gmsg->gum_base.errno = 0; + + /* init upcall msg */ + atomic_set(&gmsg->gum_refcount, 1); + gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name); + gmsg->gum_gsec = gsec; + gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx), + struct gss_cli_ctx, gc_base); + gmsg->gum_data.gum_seq = upcall_get_sequence(); + 
gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid; + gmsg->gum_data.gum_gid = 0; /* not used for now */ + gmsg->gum_data.gum_svc = import_to_gss_svc(imp); + gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid; + strlcpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name, + sizeof(gmsg->gum_data.gum_obd)); + + /* This only could happen when sysadmin set it dead/expired + * using lctl by force. */ + if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) { + CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_flags); + + LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE)); + ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR; + + rc = -EIO; + goto err_free; + } + + upcall_msg_enlist(gmsg); + + rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode, + &gmsg->gum_base); + if (rc) { + CERROR("rpc_queue_upcall failed: %d\n", rc); + + upcall_msg_delist(gmsg); + goto err_free; + } + + RETURN(0); +err_free: + OBD_FREE_PTR(gmsg); + RETURN(rc); +} + +static +int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + /* if we are refreshing for root, also update the reverse + * handle index, do not confuse reverse contexts. */ + if (ctx->cc_vcred.vc_uid == 0) { + struct gss_sec *gsec; + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + gsec->gs_rvs_hdl = gss_get_next_ctx_index(); + } + + return gss_ctx_refresh_pf(ctx); +} + +/**************************************** + * lustre gss pipefs policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_pipefs_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_pf, + .validate = gss_cli_ctx_validate_pf, + .die = gss_cli_ctx_die_pf, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_pipefs_cops = { + .create_sec = gss_sec_create_pf, + .destroy_sec = gss_sec_destroy_pf, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_pf, + .release_ctx = gss_sec_release_ctx_pf, + .flush_ctx_cache = gss_sec_flush_ctx_cache_pf, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops gss_sec_pipefs_sops = { + .accept = gss_svc_accept_pf, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_pf, +}; + +static struct ptlrpc_sec_policy gss_policy_pipefs = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.pipefs", + .sp_policy = SPTLRPC_POLICY_GSS_PIPEFS, + .sp_cops = &gss_sec_pipefs_cops, + .sp_sops = &gss_sec_pipefs_sops, +}; + +static +int __init gss_init_pipefs_upcall(void) +{ + struct dentry *de; + + /* pipe dir */ + de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL); + if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) { + CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de)); + return PTR_ERR(de); + } + + /* FIXME hack pipefs: dput will sometimes cause oops during module + * unload and lgssd close the pipe fds. 
*/ + + /* krb5 mechanism */ + de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops, + RPC_PIPE_WAIT_FOR_OPEN); + if (!de || IS_ERR(de)) { + CERROR("failed to make rpc_pipe %s: %ld\n", + LUSTRE_PIPE_KRB5, PTR_ERR(de)); + rpc_rmdir(LUSTRE_PIPE_ROOT); + return PTR_ERR(de); + } + + de_pipes[MECH_KRB5] = de; + INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]); + spin_lock_init(&upcall_locks[MECH_KRB5]); + + return 0; +} + +static +void __exit gss_exit_pipefs_upcall(void) +{ + __u32 i; + + for (i = 0; i < MECH_MAX; i++) { + LASSERT(list_empty(&upcall_lists[i])); + + /* dput pipe dentry here might cause lgssd oops. */ + de_pipes[i] = NULL; + } + + rpc_unlink(LUSTRE_PIPE_KRB5); + rpc_rmdir(LUSTRE_PIPE_ROOT); +} + +int __init gss_init_pipefs(void) +{ + int rc; + + rc = gss_init_pipefs_upcall(); + if (rc) + return rc; + + rc = sptlrpc_register_policy(&gss_policy_pipefs); + if (rc) { + gss_exit_pipefs_upcall(); + return rc; + } + + return 0; +} + +void __exit gss_exit_pipefs(void) +{ + gss_exit_pipefs_upcall(); + sptlrpc_unregister_policy(&gss_policy_pipefs); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c new file mode 100644 index 0000000000000..79930bb67419d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c @@ -0,0 +1,238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
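 *
 * For illustration (not from this change): rawobj_serialize() and
 * __rawobj_extract() below move each buffer as a 4-byte little-endian
 * length followed by the payload rounded up to a 4-byte boundary.  A
 * userspace sketch of the writer side, names invented for the example
 * (the in-kernel helper simply skips the pad bytes rather than zeroing
 * them):
 *
 *   #include <endian.h>
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   // 'out' must have room for 4 + ((len + 3) & ~3u) bytes
 *   static size_t put_rawobj(uint8_t *out, const void *data, uint32_t len)
 *   {
 *           uint32_t padded = (len + 3) & ~3u;
 *           uint32_t le = htole32(len);
 *
 *           memcpy(out, &le, 4);
 *           memcpy(out + 4, data, len);
 *           memset(out + 4 + len, 0, padded - len);
 *           return 4 + padded;
 *   }
 *
 * The *_local() extract variants skip the endian conversion because the
 * buffer never leaves the host.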
+ * + * lustre/ptlrpc/gss/gss_rawobj.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include + +#include "gss_internal.h" + +int rawobj_empty(rawobj_t *obj) +{ + LASSERT(equi(obj->len, obj->data)); + return (obj->len == 0); +} + +int rawobj_alloc(rawobj_t *obj, char *buf, int len) +{ + LASSERT(obj); + LASSERT(len >= 0); + + obj->len = len; + if (len) { + OBD_ALLOC_LARGE(obj->data, len); + if (!obj->data) { + obj->len = 0; + RETURN(-ENOMEM); + } + memcpy(obj->data, buf, len); + } else + obj->data = NULL; + return 0; +} + +void rawobj_free(rawobj_t *obj) +{ + LASSERT(obj); + + if (obj->len) { + LASSERT(obj->data); + OBD_FREE_LARGE(obj->data, obj->len); + obj->len = 0; + obj->data = NULL; + } else + LASSERT(!obj->data); +} + +int rawobj_equal(rawobj_t *a, rawobj_t *b) +{ + LASSERT(a && b); + + return (a->len == b->len && + (!a->len || !memcmp(a->data, b->data, a->len))); +} + +int rawobj_dup(rawobj_t *dest, rawobj_t *src) +{ + LASSERT(src && dest); + + dest->len = src->len; + if (dest->len) { + OBD_ALLOC_LARGE(dest->data, dest->len); + if (!dest->data) { + dest->len = 0; + return -ENOMEM; + } + memcpy(dest->data, src->data, dest->len); + } else + dest->data = NULL; + return 0; +} + +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + __u32 len; + + LASSERT(obj); + LASSERT(buf); + LASSERT(buflen); + + len = cfs_size_round4(obj->len); + + if (*buflen < 4 + len) { + CERROR("buflen %u < %u\n", *buflen, 4 + len); + return -EINVAL; + } + + *(*buf)++ = cpu_to_le32(obj->len); + memcpy(*buf, obj->data, obj->len); + *buf += (len >> 2); + *buflen -= (4 + len); + + return 0; +} + +static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen, + int alloc, int local) +{ + __u32 len; + + if (*buflen < sizeof(__u32)) { + CERROR("buflen %u\n", *buflen); + return -EINVAL; + } + + obj->len = *(*buf)++; + if (!local) + obj->len = le32_to_cpu(obj->len); + *buflen -= sizeof(__u32); + + if (!obj->len) { + obj->data = NULL; + return 0; + } + + len = local ? 
obj->len : cfs_size_round4(obj->len); + if (*buflen < len) { + CERROR("buflen %u < %u\n", *buflen, len); + obj->len = 0; + return -EINVAL; + } + + if (!alloc) + obj->data = (__u8 *) *buf; + else { + OBD_ALLOC_LARGE(obj->data, obj->len); + if (!obj->data) { + CERROR("fail to alloc %u bytes\n", obj->len); + obj->len = 0; + return -ENOMEM; + } + memcpy(obj->data, *buf, obj->len); + } + + *((char **)buf) += len; + *buflen -= len; + + return 0; +} + +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 0); +} + +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 0); +} + +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 1); +} + +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 1); +} + +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = netobj->len; + rawobj->data = netobj->data; + return 0; +} + +int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = 0; + rawobj->data = NULL; + + if (netobj->len == 0) + return 0; + + OBD_ALLOC_LARGE(rawobj->data, netobj->len); + if (rawobj->data == NULL) + return -ENOMEM; + + rawobj->len = netobj->len; + memcpy(rawobj->data, netobj->data, netobj->len); + return 0; +} + +/**************************************** + * misc more * + ****************************************/ + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("buflen %u < %u\n", *buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c new file mode 100644 index 0000000000000..69e92bcb28311 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c @@ -0,0 +1,960 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, 2015, Trustees of Indiana University + * + * Copyright (c) 2014, 2016, Intel Corporation. + * + * Author: Jeremy Filizetti + * Author: Andrew Korty + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gss_err.h" +#include "gss_crypto.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" + +#define SK_INTERFACE_VERSION 1 +#define SK_MSG_VERSION 1 +#define SK_MIN_SIZE 8 +#define SK_IV_SIZE 16 + +/* Starting number for reverse contexts. 
It is critical to security + * that reverse contexts use a different range of numbers than regular + * contexts because they are using the same key. Therefore the IV/nonce + * combination must be unique for them. To accomplish this reverse contexts + * use the the negative range of a 64-bit number and regular contexts use the + * postive range. If the same IV/nonce combination were reused it would leak + * information about the plaintext. */ +#define SK_IV_REV_START (1ULL << 63) + +struct sk_ctx { + enum cfs_crypto_crypt_alg sc_crypt; + enum cfs_crypto_hash_alg sc_hmac; + __u32 sc_expire; + __u32 sc_host_random; + __u32 sc_peer_random; + atomic64_t sc_iv; + rawobj_t sc_hmac_key; + struct gss_keyblock sc_session_kb; +}; + +struct sk_hdr { + __u64 skh_version; + __u64 skh_iv; +} __attribute__((packed)); + +/* The format of SK wire data is similar to that of RFC3686 ESP Payload + * (section 3) except instead of just an IV there is a struct sk_hdr. + * --------------------------------------------------------------------- + * | struct sk_hdr | ciphertext (variable size) | HMAC (variable size) | + * --------------------------------------------------------------------- */ +struct sk_wire { + rawobj_t skw_header; + rawobj_t skw_cipher; + rawobj_t skw_hmac; +}; + +static inline unsigned long sk_block_mask(unsigned long len, int blocksize) +{ + return (len + blocksize - 1) & (~(blocksize - 1)); +} + +static int sk_fill_header(struct sk_ctx *skc, struct sk_hdr *skh) +{ + __u64 tmp_iv; + skh->skh_version = be64_to_cpu(SK_MSG_VERSION); + + /* Always using inc_return so we don't use our initial numbers which + * could be the reuse detecting numbers */ + tmp_iv = atomic64_inc_return(&skc->sc_iv); + skh->skh_iv = be64_to_cpu(tmp_iv); + if (tmp_iv == 0 || tmp_iv == SK_IV_REV_START) { + CERROR("Counter looped, connection must be reset to avoid " + "plaintext information\n"); + return GSS_S_FAILURE; + } + + return GSS_S_COMPLETE; +} + +static int sk_verify_header(struct sk_hdr *skh) +{ + if (cpu_to_be64(skh->skh_version) != SK_MSG_VERSION) + return GSS_S_DEFECTIVE_TOKEN; + + return GSS_S_COMPLETE; +} + +void sk_construct_rfc3686_iv(__u8 *iv, __u32 nonce, __u64 partial_iv) +{ + __u32 ctr = cpu_to_be32(1); + + memcpy(iv, &nonce, CTR_RFC3686_NONCE_SIZE); + iv += CTR_RFC3686_NONCE_SIZE; + memcpy(iv, &partial_iv, CTR_RFC3686_IV_SIZE); + iv += CTR_RFC3686_IV_SIZE; + memcpy(iv, &ctr, sizeof(ctr)); +} + +static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) +{ + char *ptr = inbuf->data; + char *end = inbuf->data + inbuf->len; + char sk_hmac[CRYPTO_MAX_ALG_NAME]; + char sk_crypt[CRYPTO_MAX_ALG_NAME]; + u32 tmp; + + /* see sk_serialize_kctx() for format from userspace side */ + /* 1. Version */ + if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { + CERROR("Failed to read shared key interface version\n"); + return -1; + } + if (tmp != SK_INTERFACE_VERSION) { + CERROR("Invalid shared key interface version: %d\n", tmp); + return -1; + } + + /* 2. HMAC type */ + if (gss_get_bytes(&ptr, end, &sk_hmac, sizeof(sk_hmac))) { + CERROR("Failed to read HMAC algorithm type\n"); + return -1; + } + + skc->sc_hmac = cfs_crypto_hash_alg(sk_hmac); + if (skc->sc_hmac != CFS_HASH_ALG_NULL && + skc->sc_hmac != CFS_HASH_ALG_SHA256 && + skc->sc_hmac != CFS_HASH_ALG_SHA512) { + CERROR("Invalid hmac type: %s\n", sk_hmac); + return -1; + } + + /* 3. 
crypt type */ + if (gss_get_bytes(&ptr, end, &sk_crypt, sizeof(sk_crypt))) { + CERROR("Failed to read crypt algorithm type\n"); + return -1; + } + + skc->sc_crypt = cfs_crypto_crypt_alg(sk_crypt); + if (skc->sc_crypt == CFS_CRYPT_ALG_UNKNOWN) { + CERROR("Invalid crypt type: %s\n", sk_crypt); + return -1; + } + + /* 4. expiration time */ + if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { + CERROR("Failed to read context expiration time\n"); + return -1; + } + skc->sc_expire = tmp + ktime_get_real_seconds(); + + /* 5. host random is used as nonce for encryption */ + if (gss_get_bytes(&ptr, end, &skc->sc_host_random, + sizeof(skc->sc_host_random))) { + CERROR("Failed to read host random\n"); + return -1; + } + + /* 6. peer random is used as nonce for decryption */ + if (gss_get_bytes(&ptr, end, &skc->sc_peer_random, + sizeof(skc->sc_peer_random))) { + CERROR("Failed to read peer random\n"); + return -1; + } + + /* 7. HMAC key */ + if (gss_get_rawobj(&ptr, end, &skc->sc_hmac_key)) { + CERROR("Failed to read HMAC key\n"); + return -1; + } + if (skc->sc_hmac_key.len <= SK_MIN_SIZE) { + CERROR("HMAC key must key must be larger than %d bytes\n", + SK_MIN_SIZE); + return -1; + } + + /* 8. Session key, can be empty if not using privacy mode */ + if (gss_get_rawobj(&ptr, end, &skc->sc_session_kb.kb_key)) { + CERROR("Failed to read session key\n"); + return -1; + } + + return 0; +} + +static void sk_delete_context(struct sk_ctx *skc) +{ + if (!skc) + return; + + rawobj_free(&skc->sc_hmac_key); + gss_keyblock_free(&skc->sc_session_kb); + OBD_FREE_PTR(skc); +} + +static +__u32 gss_import_sec_context_sk(rawobj_t *inbuf, struct gss_ctx *gss_context) +{ + struct sk_ctx *skc; + bool privacy = false; + + if (inbuf == NULL || inbuf->data == NULL) + return GSS_S_FAILURE; + + OBD_ALLOC_PTR(skc); + if (!skc) + return GSS_S_FAILURE; + + atomic64_set(&skc->sc_iv, 0); + + if (sk_fill_context(inbuf, skc)) + goto out_err; + + /* Only privacy mode needs to initialize keys */ + if (skc->sc_session_kb.kb_key.len > 0) { + privacy = true; + if (gss_keyblock_init(&skc->sc_session_kb, + cfs_crypto_crypt_name(skc->sc_crypt), 0)) + goto out_err; + } + + gss_context->internal_ctx_id = skc; + CDEBUG(D_SEC, "successfully imported sk%s context\n", + privacy ? 
" (with privacy)" : ""); + + return GSS_S_COMPLETE; + +out_err: + sk_delete_context(skc); + return GSS_S_FAILURE; +} + +static +__u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, + struct gss_ctx *gss_context_new) +{ + struct sk_ctx *skc_old = gss_context_old->internal_ctx_id; + struct sk_ctx *skc_new; + + OBD_ALLOC_PTR(skc_new); + if (!skc_new) + return GSS_S_FAILURE; + + skc_new->sc_hmac = skc_old->sc_hmac; + skc_new->sc_crypt = skc_old->sc_crypt; + skc_new->sc_expire = skc_old->sc_expire; + skc_new->sc_host_random = skc_old->sc_host_random; + skc_new->sc_peer_random = skc_old->sc_peer_random; + + atomic64_set(&skc_new->sc_iv, SK_IV_REV_START); + + if (rawobj_dup(&skc_new->sc_hmac_key, &skc_old->sc_hmac_key)) + goto out_err; + if (gss_keyblock_dup(&skc_new->sc_session_kb, &skc_old->sc_session_kb)) + goto out_err; + + /* Only privacy mode needs to initialize keys */ + if (skc_new->sc_session_kb.kb_key.len > 0) + if (gss_keyblock_init(&skc_new->sc_session_kb, + cfs_crypto_crypt_name(skc_new->sc_crypt), + 0)) + goto out_err; + + gss_context_new->internal_ctx_id = skc_new; + CDEBUG(D_SEC, "successfully copied reverse sk context\n"); + + return GSS_S_COMPLETE; + +out_err: + sk_delete_context(skc_new); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_sk(struct gss_ctx *gss_context, + time64_t *endtime) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + *endtime = skc->sc_expire; + return GSS_S_COMPLETE; +} + +static +u32 sk_make_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, int msg_count, + rawobj_t *msgs, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token, digest_hash hash_func) +{ + struct ahash_request *req; + int rc2, rc; + + req = cfs_crypto_hash_init(algo, key->data, key->len); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto out_init_failed; + } + + + if (hash_func) + rc2 = hash_func(req, NULL, msg_count, msgs, iov_count, + iovs); + else + rc2 = gss_digest_hash(req, NULL, msg_count, msgs, iov_count, + iovs); + + rc = cfs_crypto_hash_final(req, token->data, &token->len); + if (!rc && rc2) + rc = rc2; +out_init_failed: + return rc ? 
GSS_S_FAILURE : GSS_S_COMPLETE; +} + +static +__u32 gss_get_mic_sk(struct gss_ctx *gss_context, + int message_count, + rawobj_t *messages, + int iov_count, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + return sk_make_hmac(skc->sc_hmac, + &skc->sc_hmac_key, message_count, messages, + iov_count, iovs, token, gss_context->hash_func); +} + +static +u32 sk_verify_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, + int message_count, rawobj_t *messages, + int iov_count, lnet_kiov_t *iovs, + rawobj_t *token, digest_hash hash_func) +{ + rawobj_t checksum = RAWOBJ_EMPTY; + __u32 rc = GSS_S_FAILURE; + + checksum.len = cfs_crypto_hash_digestsize(algo); + if (token->len < checksum.len) { + CDEBUG(D_SEC, "Token received too short, expected %d " + "received %d\n", token->len, checksum.len); + return GSS_S_DEFECTIVE_TOKEN; + } + + OBD_ALLOC_LARGE(checksum.data, checksum.len); + if (!checksum.data) + return rc; + + if (sk_make_hmac(algo, key, message_count, + messages, iov_count, iovs, &checksum, + hash_func)) { + CDEBUG(D_SEC, "Failed to create checksum to validate\n"); + goto cleanup; + } + + if (memcmp(token->data, checksum.data, checksum.len)) { + CERROR("checksum mismatch\n"); + rc = GSS_S_BAD_SIG; + goto cleanup; + } + + rc = GSS_S_COMPLETE; + +cleanup: + OBD_FREE(checksum.data, checksum.len); + return rc; +} + +/* sk_verify_bulk_hmac() differs slightly from sk_verify_hmac() because all + * encrypted pages in the bulk descriptor are populated although we only need + * to decrypt up to the number of bytes actually specified from the sender + * (bd_nob) otherwise the calulated HMAC will be incorrect. */ +static +u32 sk_verify_bulk_hmac(enum cfs_crypto_hash_alg sc_hmac, rawobj_t *key, + int msgcnt, rawobj_t *msgs, int iovcnt, + lnet_kiov_t *iovs, int iov_bytes, rawobj_t *token) +{ + rawobj_t checksum = RAWOBJ_EMPTY; + struct ahash_request *req; + struct scatterlist sg[1]; + int rc = 0; + struct sg_table sgt; + int bytes; + int i; + + checksum.len = cfs_crypto_hash_digestsize(sc_hmac); + if (token->len < checksum.len) { + CDEBUG(D_SEC, "Token received too short, expected %d " + "received %d\n", token->len, checksum.len); + return GSS_S_DEFECTIVE_TOKEN; + } + + OBD_ALLOC_LARGE(checksum.data, checksum.len); + if (!checksum.data) + return GSS_S_FAILURE; + + req = cfs_crypto_hash_init(sc_hmac, key->data, key->len); + if (IS_ERR(req)) { + rc = GSS_S_FAILURE; + goto cleanup; + } + + for (i = 0; i < msgcnt; i++) { + if (!msgs[i].len) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc != 0) + goto hash_cleanup; + + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); + if (rc) { + gss_teardown_sgtable(&sgt); + goto hash_cleanup; + } + + gss_teardown_sgtable(&sgt); + } + + for (i = 0; i < iovcnt && iov_bytes > 0; i++) { + if (iovs[i].kiov_len == 0) + continue; + + bytes = min_t(int, iov_bytes, iovs[i].kiov_len); + iov_bytes -= bytes; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].kiov_page, bytes, + iovs[i].kiov_offset); + ahash_request_set_crypt(req, sg, NULL, bytes); + rc = crypto_ahash_update(req); + if (rc) + goto hash_cleanup; + } + +hash_cleanup: + cfs_crypto_hash_final(req, checksum.data, &checksum.len); + if (rc) + goto cleanup; + + if (memcmp(token->data, checksum.data, checksum.len)) + rc = GSS_S_BAD_SIG; + else + rc = GSS_S_COMPLETE; + +cleanup: + OBD_FREE_LARGE(checksum.data, checksum.len); + + return rc; +} + +static +__u32 gss_verify_mic_sk(struct gss_ctx *gss_context, 
+ int message_count, + rawobj_t *messages, + int iov_count, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + return sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, + message_count, messages, iov_count, iovs, token, + gss_context->hash_func); +} + +static +__u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *message, int message_buffer_length, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr skh; + rawobj_t msgbufs[3]; + __u8 local_iv[SK_IV_SIZE]; + unsigned int blocksize; + + LASSERT(skc->sc_session_kb.kb_tfm); + + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + if (gss_add_padding(message, message_buffer_length, blocksize)) + return GSS_S_FAILURE; + + memset(token->data, 0, token->len); + + if (sk_fill_header(skc, &skh) != GSS_S_COMPLETE) + return GSS_S_FAILURE; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(skh); + memcpy(skw.skw_header.data, &skh, sizeof(skh)); + + sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, 1, message, + &skw.skw_cipher, 1)) + return GSS_S_FAILURE; + + /* HMAC covers the SK header, GSS header, and ciphertext */ + msgbufs[0] = skw.skw_header; + msgbufs[1] = *gss_header; + msgbufs[2] = skw.skw_cipher; + + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, + 3, msgbufs, 0, NULL, &skw.skw_hmac, + gss_context->hash_func)) + return GSS_S_FAILURE; + + token->len = skw.skw_header.len + skw.skw_cipher.len + skw.skw_hmac.len; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *token, rawobj_t *message) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr *skh; + rawobj_t msgbufs[3]; + __u8 local_iv[SK_IV_SIZE]; + unsigned int blocksize; + int rc; + + LASSERT(skc->sc_session_kb.kb_tfm); + + if (token->len < sizeof(skh) + sht_bytes) + return GSS_S_DEFECTIVE_TOKEN; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(struct sk_hdr); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + if (skw.skw_cipher.len % blocksize != 0) + return GSS_S_DEFECTIVE_TOKEN; + + skh = (struct sk_hdr *)skw.skw_header.data; + rc = sk_verify_header(skh); + if (rc != GSS_S_COMPLETE) + return rc; + + /* HMAC covers the SK header, GSS header, and ciphertext */ + msgbufs[0] = skw.skw_header; + msgbufs[1] = *gss_header; + msgbufs[2] = skw.skw_cipher; + rc = sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, 3, msgbufs, + 0, NULL, &skw.skw_hmac, gss_context->hash_func); + if (rc) + return rc; + + sk_construct_rfc3686_iv(local_iv, skc->sc_peer_random, skh->skh_iv); + message->len = skw.skw_cipher.len; + if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, + 1, &skw.skw_cipher, message, 0)) + return GSS_S_FAILURE; 
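	/*
	 * For illustration (not from this change): local_iv above is the 16-byte
	 * RFC 3686 counter block built by sk_construct_rfc3686_iv(): a 4-byte
	 * nonce (the host/peer random), the 8-byte per-message IV from the SK
	 * header, and a big-endian 32-bit block counter that starts at 1.  A
	 * self-contained sketch, assuming the usual CTR_RFC3686_* sizes (4-byte
	 * nonce, 8-byte IV):
	 *
	 *   #include <arpa/inet.h>
	 *   #include <stdint.h>
	 *   #include <string.h>
	 *
	 *   static void rfc3686_iv(uint8_t iv[16], uint32_t nonce, uint64_t partial)
	 *   {
	 *           uint32_t ctr = htonl(1);         // first counter block
	 *
	 *           memcpy(iv, &nonce, 4);           // nonce, copied as stored
	 *           memcpy(iv + 4, &partial, 8);     // per-message IV
	 *           memcpy(iv + 12, &ctr, 4);        // big-endian counter
	 *   }
	 *
	 * Reusing a (nonce, IV) pair under CTR mode leaks plaintext, which is why
	 * reverse contexts draw their IVs from the upper half of the 64-bit
	 * counter space (SK_IV_REV_START above) while regular contexts count up
	 * from zero.
	 */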
+ + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + int blocksize; + int i; + + LASSERT(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + if (BD_GET_KIOV(desc, i).kiov_offset & blocksize) { + CERROR("offset %d not blocksize aligned\n", + BD_GET_KIOV(desc, i).kiov_offset); + return GSS_S_FAILURE; + } + + BD_GET_ENC_KIOV(desc, i).kiov_offset = + BD_GET_KIOV(desc, i).kiov_offset; + BD_GET_ENC_KIOV(desc, i).kiov_len = + sk_block_mask(BD_GET_KIOV(desc, i).kiov_len, blocksize); + } + + return GSS_S_COMPLETE; +} + +static __u32 sk_encrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, + struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, + int adj_nob) +{ + struct scatterlist ptxt; + struct scatterlist ctxt; + int blocksize; + int i; + int rc; + int nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + + sg_init_table(&ptxt, 1); + sg_init_table(&ctxt, 1); + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < desc->bd_iov_count; i++) { + sg_set_page(&ptxt, BD_GET_KIOV(desc, i).kiov_page, + sk_block_mask(BD_GET_KIOV(desc, i).kiov_len, + blocksize), + BD_GET_KIOV(desc, i).kiov_offset); + nob += ptxt.length; + + sg_set_page(&ctxt, BD_GET_ENC_KIOV(desc, i).kiov_page, + ptxt.length, ptxt.offset); + + BD_GET_ENC_KIOV(desc, i).kiov_offset = ctxt.offset; + BD_GET_ENC_KIOV(desc, i).kiov_len = ctxt.length; + + skcipher_request_set_crypt(req, &ptxt, &ctxt, ptxt.length, iv); + rc = crypto_skcipher_encrypt_iv(req, &ctxt, &ptxt, ptxt.length); + if (rc) { + CERROR("failed to encrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + } + skcipher_request_zero(req); + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, + struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, + int adj_nob) +{ + struct scatterlist ptxt; + struct scatterlist ctxt; + int blocksize; + int i; + int rc; + int pnob = 0; + int cnob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + + sg_init_table(&ptxt, 1); + sg_init_table(&ctxt, 1); + + blocksize = crypto_sync_skcipher_blocksize(tfm); + if (desc->bd_nob_transferred % blocksize != 0) { + CERROR("Transfer not a multiple of block size: %d\n", + desc->bd_nob_transferred); + return GSS_S_DEFECTIVE_TOKEN; + } + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + for (i = 0; i < desc->bd_iov_count && cnob < desc->bd_nob_transferred; + i++) { + lnet_kiov_t *piov = &BD_GET_KIOV(desc, i); + lnet_kiov_t *ciov = &BD_GET_ENC_KIOV(desc, i); + + if (ciov->kiov_offset % blocksize != 0 || + ciov->kiov_len % blocksize != 0) { + CERROR("Invalid bulk descriptor vector\n"); + skcipher_request_zero(req); + return GSS_S_DEFECTIVE_TOKEN; + } + + /* Must adjust bytes here because we know the actual sizes after + * decryption. 
Similar to what gss_cli_ctx_unwrap_bulk does for + * integrity only mode */ + if (adj_nob) { + /* cipher text must not exceed transferred size */ + if (ciov->kiov_len + cnob > desc->bd_nob_transferred) + ciov->kiov_len = + desc->bd_nob_transferred - cnob; + + piov->kiov_len = ciov->kiov_len; + + /* plain text must not exceed bulk's size */ + if (ciov->kiov_len + pnob > desc->bd_nob) + piov->kiov_len = desc->bd_nob - pnob; + } else { + /* Taken from krb5_decrypt since it was not verified + * whether or not LNET guarantees these */ + if (ciov->kiov_len + cnob > desc->bd_nob_transferred || + piov->kiov_len > ciov->kiov_len) { + CERROR("Invalid decrypted length\n"); + skcipher_request_zero(req); + return GSS_S_FAILURE; + } + } + + if (ciov->kiov_len == 0) + continue; + + sg_init_table(&ctxt, 1); + sg_set_page(&ctxt, ciov->kiov_page, ciov->kiov_len, + ciov->kiov_offset); + ptxt = ctxt; + + /* In the event the plain text size is not a multiple + * of blocksize we decrypt in place and copy the result + * after the decryption */ + if (piov->kiov_len % blocksize == 0) + sg_assign_page(&ptxt, piov->kiov_page); + + skcipher_request_set_crypt(req, &ctxt, &ptxt, ptxt.length, iv); + rc = crypto_skcipher_decrypt_iv(req, &ptxt, &ctxt, ptxt.length); + if (rc) { + CERROR("Decryption failed for page: %d\n", rc); + skcipher_request_zero(req); + return GSS_S_FAILURE; + } + + if (piov->kiov_len % blocksize != 0) { + memcpy(page_address(piov->kiov_page) + + piov->kiov_offset, + page_address(ciov->kiov_page) + + ciov->kiov_offset, + piov->kiov_len); + } + + cnob += ciov->kiov_len; + pnob += piov->kiov_len; + } + skcipher_request_zero(req); + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + BD_GET_KIOV(desc, i++).kiov_len = 0; + + if (unlikely(cnob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, cnob); + return GSS_S_FAILURE; + } + + if (unlikely(!adj_nob && pnob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pnob); + return GSS_S_FAILURE; + } + + return 0; +} + +static +__u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, rawobj_t *token, + int adj_nob) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr skh; + __u8 local_iv[SK_IV_SIZE]; + + LASSERT(skc->sc_session_kb.kb_tfm); + + memset(token->data, 0, token->len); + if (sk_fill_header(skc, &skh) != GSS_S_COMPLETE) + return GSS_S_FAILURE; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(skh); + memcpy(skw.skw_header.data, &skh, sizeof(skh)); + + sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + if (sk_encrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, + desc, &skw.skw_cipher, adj_nob)) + return GSS_S_FAILURE; + + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, GET_ENC_KIOV(desc), &skw.skw_hmac, + gss_context->hash_func)) + return GSS_S_FAILURE; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct sk_ctx *skc = 
gss_context->internal_ctx_id; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_wire skw; + struct sk_hdr *skh; + __u8 local_iv[SK_IV_SIZE]; + int rc; + + LASSERT(skc->sc_session_kb.kb_tfm); + + if (token->len < sizeof(skh) + sht_bytes) + return GSS_S_DEFECTIVE_TOKEN; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(struct sk_hdr); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht_bytes; + + skh = (struct sk_hdr *)skw.skw_header.data; + rc = sk_verify_header(skh); + if (rc != GSS_S_COMPLETE) + return rc; + + rc = sk_verify_bulk_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, + &skw.skw_cipher, desc->bd_iov_count, + GET_ENC_KIOV(desc), desc->bd_nob, + &skw.skw_hmac); + if (rc) + return rc; + + sk_construct_rfc3686_iv(local_iv, skc->sc_peer_random, skh->skh_iv); + rc = sk_decrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, + desc, &skw.skw_cipher, adj_nob); + if (rc) + return rc; + + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_sk(void *internal_context) +{ + struct sk_ctx *sk_context = internal_context; + sk_delete_context(sk_context); +} + +int gss_display_sk(struct gss_ctx *gss_context, char *buf, int bufsize) +{ + return snprintf(buf, bufsize, "sk"); +} + +static struct gss_api_ops gss_sk_ops = { + .gss_import_sec_context = gss_import_sec_context_sk, + .gss_copy_reverse_context = gss_copy_reverse_context_sk, + .gss_inquire_context = gss_inquire_context_sk, + .gss_get_mic = gss_get_mic_sk, + .gss_verify_mic = gss_verify_mic_sk, + .gss_wrap = gss_wrap_sk, + .gss_unwrap = gss_unwrap_sk, + .gss_prep_bulk = gss_prep_bulk_sk, + .gss_wrap_bulk = gss_wrap_bulk_sk, + .gss_unwrap_bulk = gss_unwrap_bulk_sk, + .gss_delete_sec_context = gss_delete_sec_context_sk, + .gss_display = gss_display_sk, +}; + +static struct subflavor_desc gss_sk_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKN, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "skn" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKA, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "ska" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKI, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "ski" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKPI, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_PRIV, + .sf_name = "skpi" + }, +}; + +static struct gss_api_mech gss_sk_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "sk", + .gm_oid = (rawobj_t) { + .len = 12, + .data = "\053\006\001\004\001\311\146\215\126\001\000\001", + }, + .gm_ops = &gss_sk_ops, + .gm_sf_num = 4, + .gm_sfs = gss_sk_sfs, +}; + +int __init init_sk_module(void) +{ + int status; + + status = lgss_mech_register(&gss_sk_mech); + if (status) + CERROR("Failed to register sk gss mechanism!\n"); + + return status; +} + +void cleanup_sk_module(void) +{ + lgss_mech_unregister(&gss_sk_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c new file mode 100644 index 0000000000000..2202e3f56f8c5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -0,0 +1,1161 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * Neil Brown + * J. 
Bruce Fields + * Andy Adamson + * Dug Song + * + * RPCSEC_GSS server authentication. + * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 + * (gssapi) + * + * The RPCSEC_GSS involves three stages: + * 1/ context creation + * 2/ data exchange + * 3/ context destruction + * + * Context creation is handled largely by upcalls to user-space. + * In particular, GSS_Accept_sec_context is handled by an upcall + * Data exchange is handled entirely within the kernel + * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. + * Context destruction is handled in-kernel + * GSS_Delete_sec_context is in-kernel + * + * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. + * The context handle and gss_token are used as a key into the rpcsec_init cache. + * The content of this cache includes some of the outputs of GSS_Accept_sec_context, + * being major_status, minor_status, context_handle, reply_token. + * These are sent back to the client. + * Sequence window management is handled by the kernel. The window size if currently + * a compile time constant. + * + * When user-space is happy that a context is established, it places an entry + * in the rpcsec_context cache. The key for this cache is the context_handle. + * The content includes: + * uid/gidlist - for determining access rights + * mechanism type + * mechanism specific information, such as a key + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_crypto.h" + +#define GSS_SVC_UPCALL_TIMEOUT (20) + +static spinlock_t __ctx_index_lock; +static __u64 __ctx_index; + +unsigned int krb5_allow_old_client_csum; + +__u64 gss_get_next_ctx_index(void) +{ + __u64 idx; + + spin_lock(&__ctx_index_lock); + idx = __ctx_index++; + spin_unlock(&__ctx_index_lock); + + return idx; +} + +static inline unsigned long hash_mem(char *buf, int length, int bits) +{ + unsigned long hash = 0; + unsigned long l = 0; + int len = 0; + unsigned char c; + + do { + if (len == length) { + c = (char) len; + len = -1; + } else + c = *buf++; + + l = (l << 8) | c; + len++; + + if ((len & (BITS_PER_LONG/8-1)) == 0) + hash = hash_long(hash^l, BITS_PER_LONG); + } while (len); + + return hash >> (BITS_PER_LONG - bits); +} + +/* This compatibility can be removed once kernel 3.3 is used, + * since cache_register_net/cache_unregister_net are exported. + * Note that since kernel 3.4 cache_register and cache_unregister + * are removed. 
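+ * The _cache_register_net()/_cache_unregister_net() wrappers below just
+ * pick whichever symbol the running kernel exports, keyed off the
+ * HAVE_CACHE_REGISTER configure check.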
+*/ +static inline int _cache_register_net(struct cache_detail *cd, struct net *net) +{ +#ifdef HAVE_CACHE_REGISTER + return cache_register(cd); +#else + return cache_register_net(cd, net); +#endif +} +static inline void _cache_unregister_net(struct cache_detail *cd, + struct net *net) +{ +#ifdef HAVE_CACHE_REGISTER + cache_unregister(cd); +#else + cache_unregister_net(cd, net); +#endif +} +/**************************************** + * rpc sec init (rsi) cache * + ****************************************/ + +#define RSI_HASHBITS (6) +#define RSI_HASHMAX (1 << RSI_HASHBITS) +#define RSI_HASHMASK (RSI_HASHMAX - 1) + +struct rsi { + struct cache_head h; + __u32 lustre_svc; + __u64 nid; + char nm_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + wait_queue_head_t waitq; + rawobj_t in_handle, in_token; + rawobj_t out_handle, out_token; + int major_status, minor_status; +}; + +#ifdef HAVE_CACHE_HEAD_HLIST +static struct hlist_head rsi_table[RSI_HASHMAX]; +#else +static struct cache_head *rsi_table[RSI_HASHMAX]; +#endif +static struct cache_detail rsi_cache; +static struct rsi *rsi_update(struct rsi *new, struct rsi *old); +static struct rsi *rsi_lookup(struct rsi *item); + +#ifdef HAVE_CACHE_DETAIL_WRITERS +static inline int channel_users(struct cache_detail *cd) +{ + return atomic_read(&cd->writers); +} +#else +static inline int channel_users(struct cache_detail *cd) +{ + return atomic_read(&cd->readers); +} +#endif + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem((char *)item->in_handle.data, item->in_handle.len, + RSI_HASHBITS) ^ + hash_mem((char *)item->in_token.data, item->in_token.len, + RSI_HASHBITS); +} + +static inline int __rsi_match(struct rsi *item, struct rsi *tmp) +{ + return (rawobj_equal(&item->in_handle, &tmp->in_handle) && + rawobj_equal(&item->in_token, &tmp->in_token)); +} + +static void rsi_free(struct rsi *rsi) +{ + rawobj_free(&rsi->in_handle); + rawobj_free(&rsi->in_token); + rawobj_free(&rsi->out_handle); + rawobj_free(&rsi->out_token); +} + +/* See handle_channel_req() userspace for where the upcall data is read */ +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsi = container_of(h, struct rsi, h); + __u64 index = 0; + + /* if in_handle is null, provide kernel suggestion */ + if (rsi->in_handle.len == 0) + index = gss_get_next_ctx_index(); + + qword_addhex(bpp, blen, (char *) &rsi->lustre_svc, + sizeof(rsi->lustre_svc)); + qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid)); + qword_addhex(bpp, blen, (char *) &index, sizeof(index)); + qword_addhex(bpp, blen, (char *) rsi->nm_name, + strlen(rsi->nm_name) + 1); + qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len); + qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len); + (*bpp)[-1] = '\n'; +} + +#ifdef HAVE_SUNRPC_UPCALL_HAS_3ARGS +static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, rsi_request); +} +#else + +static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} +#endif + +static inline void __rsi_init(struct rsi *new, struct rsi *item) +{ + new->out_handle = RAWOBJ_EMPTY; + new->out_token = RAWOBJ_EMPTY; + + new->in_handle = item->in_handle; + item->in_handle = RAWOBJ_EMPTY; + new->in_token = item->in_token; + item->in_token = RAWOBJ_EMPTY; + + new->lustre_svc = item->lustre_svc; + new->nid = item->nid; + memcpy(new->nm_name, item->nm_name, sizeof(item->nm_name)); + 
init_waitqueue_head(&new->waitq); +} + +static inline void __rsi_update(struct rsi *new, struct rsi *item) +{ + LASSERT(new->out_handle.len == 0); + LASSERT(new->out_token.len == 0); + + new->out_handle = item->out_handle; + item->out_handle = RAWOBJ_EMPTY; + new->out_token = item->out_token; + item->out_token = RAWOBJ_EMPTY; + + new->major_status = item->major_status; + new->minor_status = item->minor_status; +} + +static void rsi_put(struct kref *ref) +{ + struct rsi *rsi = container_of(ref, struct rsi, h.ref); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(rsi->h.cache_list.next == NULL); +#else + LASSERT(rsi->h.next == NULL); +#endif + rsi_free(rsi); + OBD_FREE_PTR(rsi); +} + +static int rsi_match(struct cache_head *a, struct cache_head *b) +{ + struct rsi *item = container_of(a, struct rsi, h); + struct rsi *tmp = container_of(b, struct rsi, h); + + return __rsi_match(item, tmp); +} + +static void rsi_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_init(new, item); +} + +static void update_rsi(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_update(new, item); +} + +static struct cache_head *rsi_alloc(void) +{ + struct rsi *rsi; + + OBD_ALLOC_PTR(rsi); + if (rsi) + return &rsi->h; + else + return NULL; +} + +static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + int len; + struct rsi rsii, *rsip = NULL; + time64_t expiry; + int status = -EINVAL; + ENTRY; + + + memset(&rsii, 0, sizeof(rsii)); + + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsip = rsi_lookup(&rsii); + if (!rsip) + goto out; + + rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + if (expiry == 0) + goto out; + + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + + /* major */ + status = kstrtoint(buf, 10, &rsii.major_status); + if (status) + goto out; + + /* minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) { + status = -EINVAL; + goto out; + } + + status = kstrtoint(buf, 10, &rsii.minor_status); + if (status) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsii.h.expiry_time = expiry; + rsip = rsi_update(&rsii, rsip); + status = 0; +out: + rsi_free(&rsii); + if (rsip) { + wake_up_all(&rsip->waitq); + cache_put(&rsip->h, &rsi_cache); + } else { + status = -ENOMEM; + } + + if (status) + CERROR("rsi parse error %d\n", status); + RETURN(status); +} + +static struct cache_detail rsi_cache = { + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.sptlrpc.init", + .cache_put = rsi_put, +#ifndef HAVE_SUNRPC_UPCALL_HAS_3ARGS + .cache_request = rsi_request, +#endif + .cache_upcall = rsi_upcall, + .cache_parse = rsi_parse, + .match = rsi_match, + .init = rsi_init, + .update = update_rsi, + .alloc = 
rsi_alloc, +}; + +static struct rsi *rsi_lookup(struct rsi *item) +{ + struct cache_head *ch; + int hash = rsi_hash(item); + + ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +static struct rsi *rsi_update(struct rsi *new, struct rsi *old) +{ + struct cache_head *ch; + int hash = rsi_hash(new); + + ch = sunrpc_cache_update(&rsi_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +/**************************************** + * rpc sec context (rsc) cache * + ****************************************/ + +#define RSC_HASHBITS (10) +#define RSC_HASHMAX (1 << RSC_HASHBITS) +#define RSC_HASHMASK (RSC_HASHMAX - 1) + +struct rsc { + struct cache_head h; + struct obd_device *target; + rawobj_t handle; + struct gss_svc_ctx ctx; +}; + +#ifdef HAVE_CACHE_HEAD_HLIST +static struct hlist_head rsc_table[RSC_HASHMAX]; +#else +static struct cache_head *rsc_table[RSC_HASHMAX]; +#endif +static struct cache_detail rsc_cache; +static struct rsc *rsc_update(struct rsc *new, struct rsc *old); +static struct rsc *rsc_lookup(struct rsc *item); + +static void rsc_free(struct rsc *rsci) +{ + rawobj_free(&rsci->handle); + rawobj_free(&rsci->ctx.gsc_rvs_hdl); + lgss_delete_sec_context(&rsci->ctx.gsc_mechctx); +} + +static inline int rsc_hash(struct rsc *rsci) +{ + return hash_mem((char *)rsci->handle.data, + rsci->handle.len, RSC_HASHBITS); +} + +static inline int __rsc_match(struct rsc *new, struct rsc *tmp) +{ + return rawobj_equal(&new->handle, &tmp->handle); +} + +static inline void __rsc_init(struct rsc *new, struct rsc *tmp) +{ + new->handle = tmp->handle; + tmp->handle = RAWOBJ_EMPTY; + + new->target = NULL; + memset(&new->ctx, 0, sizeof(new->ctx)); + new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; +} + +static inline void __rsc_update(struct rsc *new, struct rsc *tmp) +{ + new->ctx = tmp->ctx; + tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; + tmp->ctx.gsc_mechctx = NULL; + + memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata)); + spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock); +} + +static void rsc_put(struct kref *ref) +{ + struct rsc *rsci = container_of(ref, struct rsc, h.ref); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(rsci->h.cache_list.next == NULL); +#else + LASSERT(rsci->h.next == NULL); +#endif + rsc_free(rsci); + OBD_FREE_PTR(rsci); +} + +static int rsc_match(struct cache_head *a, struct cache_head *b) +{ + struct rsc *new = container_of(a, struct rsc, h); + struct rsc *tmp = container_of(b, struct rsc, h); + + return __rsc_match(new, tmp); +} + +static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_init(new, tmp); +} + +static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_update(new, tmp); +} + +static struct cache_head * rsc_alloc(void) +{ + struct rsc *rsc; + + OBD_ALLOC_PTR(rsc); + if (rsc) + return &rsc->h; + else + return NULL; +} + +static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + int len, rv, tmp_int; + struct rsc rsci, *rscp = NULL; + time64_t expiry; + int status = -EINVAL; + struct gss_api_mech *gm = NULL; + + memset(&rsci, 0, sizeof(rsci)); + + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = 
-ENOMEM; + if (rawobj_alloc(&rsci.handle, buf, len)) + goto out; + + rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* remote flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get remote flag\n"); + goto out; + } + rsci.ctx.gsc_remote = (tmp_int != 0); + + /* root user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get root user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_root = (tmp_int != 0); + + /* mds user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get mds user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_mds = (tmp_int != 0); + + /* oss user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get oss user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_oss = (tmp_int != 0); + + /* mapped uid */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid); + if (rv) { + CERROR("fail to get mapped uid\n"); + goto out; + } + + rscp = rsc_lookup(&rsci); + if (!rscp) + goto out; + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) { + CERROR("NOENT? set rsc entry negative\n"); + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + } else { + rawobj_t tmp_buf; + time64_t ctx_expiry; + + /* gid */ + if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid)) + goto out; + + /* mech name */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + gm = lgss_name_to_mech(buf); + status = -EOPNOTSUPP; + if (!gm) + goto out; + + status = -EINVAL; + /* mech-specific data: */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + + tmp_buf.len = len; + tmp_buf.data = (unsigned char *)buf; + if (lgss_import_sec_context(&tmp_buf, gm, + &rsci.ctx.gsc_mechctx)) + goto out; + + /* set to seconds since machine booted */ + expiry = ktime_get_seconds(); + + /* currently the expiry time passed down from user-space + * is invalid, here we retrive it from mech. + */ + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + goto out; + } + + /* ctx_expiry is the number of seconds since Jan 1 1970. + * We want just the number of seconds into the future. 
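+ * In other words the stored value works out to
+ * expiry(boottime) = now(boottime) + (ctx_expiry - now(realtime)).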
+ */ + expiry += ctx_expiry - ktime_get_real_seconds(); + } + + rsci.h.expiry_time = expiry; + rscp = rsc_update(&rsci, rscp); + status = 0; +out: + if (gm) + lgss_mech_put(gm); + rsc_free(&rsci); + if (rscp) + cache_put(&rscp->h, &rsc_cache); + else + status = -ENOMEM; + + if (status) + CERROR("parse rsc error %d\n", status); + return status; +} + +static struct cache_detail rsc_cache = { + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.sptlrpc.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, + .match = rsc_match, + .init = rsc_init, + .update = update_rsc, + .alloc = rsc_alloc, +}; + +static struct rsc *rsc_lookup(struct rsc *item) +{ + struct cache_head *ch; + int hash = rsc_hash(item); + + ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +static struct rsc *rsc_update(struct rsc *new, struct rsc *old) +{ + struct cache_head *ch; + int hash = rsc_hash(new); + + ch = sunrpc_cache_update(&rsc_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +#define COMPAT_RSC_PUT(item, cd) cache_put((item), (cd)) + +/**************************************** + * rsc cache flush * + ****************************************/ + +static struct rsc *gss_svc_searchbyctx(rawobj_t *handle) +{ + struct rsc rsci; + struct rsc *found; + + memset(&rsci, 0, sizeof(rsci)); + if (rawobj_dup(&rsci.handle, handle)) + return NULL; + + found = rsc_lookup(&rsci); + rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + return found; +} + +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + struct rsc rsci, *rscp = NULL; + time64_t ctx_expiry; + __u32 major; + int rc; + ENTRY; + + memset(&rsci, 0, sizeof(rsci)); + + if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl, + sizeof(gsec->gs_rvs_hdl))) + GOTO(out, rc = -ENOMEM); + + rscp = rsc_lookup(&rsci); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + major = lgss_copy_reverse_context(gctx->gc_mechctx, + &rsci.ctx.gsc_mechctx); + if (major != GSS_S_COMPLETE) + GOTO(out, rc = -ENOMEM); + + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + GOTO(out, rc = -EINVAL); + } + rsci.h.expiry_time = ctx_expiry; + + switch (imp->imp_obd->u.cli.cl_sp_to) { + case LUSTRE_SP_MDT: + rsci.ctx.gsc_usr_mds = 1; + break; + case LUSTRE_SP_OST: + rsci.ctx.gsc_usr_oss = 1; + break; + case LUSTRE_SP_CLI: + rsci.ctx.gsc_usr_root = 1; + break; + case LUSTRE_SP_MGS: + /* by convention, all 3 set to 1 means MGS */ + rsci.ctx.gsc_usr_mds = 1; + rsci.ctx.gsc_usr_oss = 1; + rsci.ctx.gsc_usr_root = 1; + break; + default: + break; + } + + rscp = rsc_update(&rsci, rscp); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + rscp->target = imp->imp_obd; + rawobj_dup(&gctx->gc_svc_handle, &rscp->handle); + + CWARN("create reverse svc ctx %p to %s: idx %#llx\n", + &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl); + rc = 0; +out: + if (rscp) + cache_put(&rscp->h, &rsc_cache); + rsc_free(&rsci); + + if (rc) + CERROR("create reverse svc ctx: idx %#llx, rc %d\n", + gsec->gs_rvs_hdl, rc); + RETURN(rc); +} + +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle) +{ + const time64_t expire = 20; + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n", + &rscp->ctx, rscp); + 
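+ /* do not drop the reverse context right away; push its expiry
+ * 'expire' (20) seconds out and let the cache reap it then */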
+ rscp->h.expiry_time = ktime_get_real_seconds() + expire; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx) +{ + struct rsc *rscp = container_of(ctx, struct rsc, ctx); + + return rawobj_dup(handle, &rscp->handle); +} + +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq) +{ + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n", + &rscp->ctx, rscp, seq + 1); + + rscp->ctx.gsc_rvs_seq = seq + 1; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +static struct cache_deferred_req* cache_upcall_defer(struct cache_req *req) +{ + return NULL; +} +static struct cache_req cache_upcall_chandle = { cache_upcall_defer }; + +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token) +{ + struct ptlrpc_reply_state *rs; + struct rsc *rsci = NULL; + struct rsi *rsip = NULL, rsikey; + wait_queue_entry_t wait; + int replen = sizeof(struct ptlrpc_body); + struct gss_rep_header *rephdr; + int first_check = 1; + int rc = SECSVC_DROP; + ENTRY; + + memset(&rsikey, 0, sizeof(rsikey)); + rsikey.lustre_svc = lustre_svc; + /* In case of MR, rq_peer is not the NID from which request is received, + * but primary NID of peer. + * So we need rq_source, which contains the NID actually in use. + */ + rsikey.nid = (__u64) req->rq_source.nid; + nodemap_test_nid(req->rq_peer.nid, rsikey.nm_name, + sizeof(rsikey.nm_name)); + + /* duplicate context handle. for INIT it always 0 */ + if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) { + CERROR("fail to dup context handle\n"); + GOTO(out, rc); + } + + if (rawobj_dup(&rsikey.in_token, in_token)) { + CERROR("can't duplicate token\n"); + rawobj_free(&rsikey.in_handle); + GOTO(out, rc); + } + + rsip = rsi_lookup(&rsikey); + rsi_free(&rsikey); + if (!rsip) { + CERROR("error in rsi_lookup.\n"); + + if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } + + cache_get(&rsip->h); /* take an extra ref */ + init_waitqueue_head(&rsip->waitq); + init_waitqueue_entry(&wait, current); + add_wait_queue(&rsip->waitq, &wait); + +cache_check: + /* Note each time cache_check() will drop a reference if return + * non-zero. We hold an extra reference on initial rsip, but must + * take care of following calls. */ + rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle); + switch (rc) { + case -ETIMEDOUT: + case -EAGAIN: { + int valid; + + if (first_check) { + first_check = 0; + + cache_read_lock(&rsi_cache); + valid = test_bit(CACHE_VALID, &rsip->h.flags); + if (valid == 0) + set_current_state(TASK_INTERRUPTIBLE); + cache_read_unlock(&rsi_cache); + + if (valid == 0) { + unsigned long jiffies; + jiffies = msecs_to_jiffies(MSEC_PER_SEC * + GSS_SVC_UPCALL_TIMEOUT); + schedule_timeout(jiffies); + } + cache_get(&rsip->h); + goto cache_check; + } + CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT); + break; + } + case -ENOENT: + CDEBUG(D_SEC, "cache_check return ENOENT, drop\n"); + break; + case 0: + /* if not the first check, we have to release the extra + * reference we just added on it. 
*/ + if (!first_check) + cache_put(&rsip->h, &rsi_cache); + CDEBUG(D_SEC, "cache_check is good\n"); + break; + } + + remove_wait_queue(&rsip->waitq, &wait); + cache_put(&rsip->h, &rsi_cache); + + if (rc) + GOTO(out, rc = SECSVC_DROP); + + rc = SECSVC_DROP; + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + CERROR("authentication failed\n"); + + /* gss mechanism returned major and minor code so we return + * those in error message */ + if (!gss_pack_err_notify(req, rsip->major_status, + rsip->minor_status)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } else { + cache_get(&rsci->h); + grctx->src_ctx = &rsci->ctx; + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_KCSUM) { + grctx->src_ctx->gsc_mechctx->hash_func = gss_digest_hash; + } else if (!strcmp(grctx->src_ctx->gsc_mechctx->mech_type->gm_name, + "krb5") && + !krb5_allow_old_client_csum) { + CWARN("%s: deny connection from '%s' due to missing 'krb_csum' feature, set 'sptlrpc.gss.krb5_allow_old_client_csum=1' to allow, but recommend client upgrade: rc = %d\n", + target->obd_name, libcfs_nid2str(req->rq_peer.nid), + -EPROTO); + GOTO(out, rc = SECSVC_DROP); + } else { + grctx->src_ctx->gsc_mechctx->hash_func = + gss_digest_hash_compat; + } + + if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) { + CERROR("failed duplicate reverse handle\n"); + GOTO(out, rc); + } + + rsci->target = target; + + CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n", + rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) { + CERROR("handle size %u too large\n", rsip->out_handle.len); + GOTO(out, rc = SECSVC_DROP); + } + + grctx->src_init = 1; + grctx->src_reserve_len = cfs_size_round4(rsip->out_token.len); + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("failed to pack reply: %d\n", rc); + GOTO(out, rc = SECSVC_DROP); + } + + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_bufcount == 3); + LASSERT(rs->rs_repbuf->lm_buflens[0] >= + sizeof(*rephdr) + rsip->out_handle.len); + LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len); + + rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + rephdr->gh_version = PTLRPC_GSS_VERSION; + rephdr->gh_flags = 0; + rephdr->gh_proc = PTLRPC_GSS_PROC_ERR; + rephdr->gh_major = rsip->major_status; + rephdr->gh_minor = rsip->minor_status; + rephdr->gh_seqwin = GSS_SEQ_WIN; + rephdr->gh_handle.len = rsip->out_handle.len; + memcpy(rephdr->gh_handle.data, rsip->out_handle.data, + rsip->out_handle.len); + + memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data, + rsip->out_token.len); + + rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2, + rsip->out_token.len, 0); + + rc = SECSVC_OK; + +out: + /* it looks like here we should put rsip also, but this mess up + * with NFS cache mgmt code... 
FIXME + * something like: + * if (rsip) + * rsi_put(&rsip->h, &rsi_cache); */ + + if (rsci) { + /* if anything went wrong, we don't keep the context too */ + if (rc != SECSVC_OK) + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else + CDEBUG(D_SEC, "create rsc with idx %#llx\n", + gss_handle_to_u64(&rsci->handle)); + + COMPAT_RSC_PUT(&rsci->h, &rsc_cache); + } + RETURN(rc); +} + +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct rsc *rsc; + + rsc = gss_svc_searchbyctx(&gw->gw_handle); + if (!rsc) { + CWARN("Invalid gss ctx idx %#llx from %s\n", + gss_handle_to_u64(&gw->gw_handle), + libcfs_nid2str(req->rq_peer.nid)); + return NULL; + } + + return &rsc->ctx; +} + +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + COMPAT_RSC_PUT(&rsc->h, &rsc_cache); +} + +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + /* can't be found */ + set_bit(CACHE_NEGATIVE, &rsc->h.flags); + /* to be removed at next scan */ + rsc->h.expiry_time = 1; +} + +int __init gss_init_svc_upcall(void) +{ + int i, rc; + + spin_lock_init(&__ctx_index_lock); + /* + * this helps reducing context index confliction. after server reboot, + * conflicting request from clients might be filtered out by initial + * sequence number checking, thus no chance to sent error notification + * back to clients. + */ + cfs_get_random_bytes(&__ctx_index, sizeof(__ctx_index)); + + rc = _cache_register_net(&rsi_cache, &init_net); + if (rc != 0) + return rc; + + rc = _cache_register_net(&rsc_cache, &init_net); + if (rc != 0) { + _cache_unregister_net(&rsi_cache, &init_net); + return rc; + } + + /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open + * the init upcall channel, otherwise there's big chance that the first + * upcall issued before the channel be opened thus nfsv4 cache code will + * drop the request directly, thus lead to unnecessary recovery time. + * Here we wait at minimum 1.5 seconds. + */ + for (i = 0; i < 6; i++) { + if (channel_users(&rsi_cache) > 0) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + LASSERT(msecs_to_jiffies(MSEC_PER_SEC / 4) > 0); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC / 4)); + } + + if (channel_users(&rsi_cache) == 0) + CWARN("Init channel is not opened by lsvcgssd, following " + "request might be dropped until lsvcgssd is active\n"); + + return 0; +} + +void gss_exit_svc_upcall(void) +{ + cache_purge(&rsi_cache); + _cache_unregister_net(&rsi_cache, &init_net); + + cache_purge(&rsc_cache); + _cache_unregister_net(&rsc_cache, &init_net); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c new file mode 100644 index 0000000000000..f2943207b34fd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c @@ -0,0 +1,253 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct proc_dir_entry *gss_proc_root = NULL; +static struct proc_dir_entry *gss_proc_lk = NULL; + +/* + * statistic of "out-of-sequence-window" + */ +static struct { + spinlock_t oos_lock; + atomic_t oos_cli_count; /* client occurrence */ + int oos_cli_behind; /* client max seqs behind */ + atomic_t oos_svc_replay[3]; /* server replay detected */ + atomic_t oos_svc_pass[3]; /* server verified ok */ +} gss_stat_oos = { + .oos_cli_count = ATOMIC_INIT(0), + .oos_cli_behind = 0, + .oos_svc_replay = { ATOMIC_INIT(0), }, + .oos_svc_pass = { ATOMIC_INIT(0), }, +}; + +void gss_stat_oos_record_cli(int behind) +{ + atomic_inc(&gss_stat_oos.oos_cli_count); + + spin_lock(&gss_stat_oos.oos_lock); + if (behind > gss_stat_oos.oos_cli_behind) + gss_stat_oos.oos_cli_behind = behind; + spin_unlock(&gss_stat_oos.oos_lock); +} + +void gss_stat_oos_record_svc(int phase, int replay) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (replay) + atomic_inc(&gss_stat_oos.oos_svc_replay[phase]); + else + atomic_inc(&gss_stat_oos.oos_svc_pass[phase]); +} + +static int gss_proc_oos_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "seqwin: %u\n" + "backwin: %u\n" + "client fall behind seqwin\n" + " occurrence: %d\n" + " max seq behind: %d\n" + "server replay detected:\n" + " phase 0: %d\n" + " phase 1: %d\n" + " phase 2: %d\n" + "server verify ok:\n" + " phase 2: %d\n", + GSS_SEQ_WIN_MAIN, + GSS_SEQ_WIN_BACK, + atomic_read(&gss_stat_oos.oos_cli_count), + gss_stat_oos.oos_cli_behind, + atomic_read(&gss_stat_oos.oos_svc_replay[0]), + atomic_read(&gss_stat_oos.oos_svc_replay[1]), + atomic_read(&gss_stat_oos.oos_svc_replay[2]), + atomic_read(&gss_stat_oos.oos_svc_pass[2])); + return 0; +} +LPROC_SEQ_FOPS_RO(gss_proc_oos); + +static ssize_t +gss_proc_write_secinit(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ + int rc; + + rc = gss_do_ctx_init_rpc((char *) buffer, count); + if (rc) { + LASSERT(rc < 0); + return rc; + } + return count; +} + +static const struct file_operations gss_proc_secinit = { + .write = gss_proc_write_secinit, +}; + +int sptlrpc_krb5_allow_old_client_csum_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", krb5_allow_old_client_csum); + return 0; +} + +ssize_t sptlrpc_krb5_allow_old_client_csum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + krb5_allow_old_client_csum = val; + return count; +} +LPROC_SEQ_FOPS(sptlrpc_krb5_allow_old_client_csum); + +static struct ldebugfs_vars gss_debugfs_vars[] = { + { .name = "replays", + .fops = &gss_proc_oos_fops }, 
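+ /* 'init_channel' below is write-only (0222): a userspace helper writes a
+ * raw security-init request which gss_proc_write_secinit() hands to
+ * gss_do_ctx_init_rpc() */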
+ { .name = "init_channel", + .fops = &gss_proc_secinit, + .proc_mode = 0222 }, + { NULL } +}; + +static struct lprocfs_vars gss_lprocfs_vars[] = { + { .name = "krb5_allow_old_client_csum", + .fops = &sptlrpc_krb5_allow_old_client_csum_fops }, + { NULL } +}; + +/* + * for userspace helper lgss_keyring. + * + * debug_level: [0, 4], defined in utils/gss/lgss_utils.h + */ +static int gss_lk_debug_level = 1; + +static int gss_lk_proc_dl_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", gss_lk_debug_level); + return 0; +} + +static ssize_t +gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + unsigned int val; + int rc; + + rc = kstrtouint_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val > 4) + return -ERANGE; + + gss_lk_debug_level = val; + + return count; +} +LPROC_SEQ_FOPS(gss_lk_proc_dl); + +static struct ldebugfs_vars gss_lk_debugfs_vars[] = { + { .name = "debug_level", + .fops = &gss_lk_proc_dl_fops }, + { NULL } +}; + +void gss_exit_lproc(void) +{ + if (gss_proc_lk) { + lprocfs_remove(&gss_proc_lk); + gss_proc_lk = NULL; + } + + if (gss_proc_root) { + lprocfs_remove(&gss_proc_root); + gss_proc_root = NULL; + } +} + +int gss_init_lproc(void) +{ + int rc; + + spin_lock_init(&gss_stat_oos.oos_lock); + + gss_proc_root = lprocfs_register("gss", sptlrpc_proc_root, + gss_lprocfs_vars, NULL); + if (IS_ERR(gss_proc_root)) { + rc = PTR_ERR(gss_proc_root); + gss_proc_root = NULL; + GOTO(out, rc); + } + + gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root, + gss_lk_debugfs_vars, NULL); + if (IS_ERR(gss_proc_lk)) { + rc = PTR_ERR(gss_proc_lk); + gss_proc_lk = NULL; + GOTO(out, rc); + } + + return 0; + +out: + CERROR("failed to initialize gss lproc entries: %d\n", rc); + gss_exit_lproc(); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c new file mode 100644 index 0000000000000..17e8f0a258c6d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c @@ -0,0 +1,2929 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2015, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#include +#include + +/* + * early reply have fixed size, respectively in privacy and integrity mode. + * so we calculate them only once. + */ +static int gss_at_reply_off_integ; +static int gss_at_reply_off_priv; + + +static inline int msg_last_segidx(struct lustre_msg *msg) +{ + LASSERT(msg->lm_bufcount > 0); + return msg->lm_bufcount - 1; +} +static inline int msg_last_seglen(struct lustre_msg *msg) +{ + return msg->lm_buflens[msg_last_segidx(msg)]; +} + +/******************************************** + * wire data swabber * + ********************************************/ + +static +void gss_header_swabber(struct gss_header *ghdr) +{ + __swab32s(&ghdr->gh_flags); + __swab32s(&ghdr->gh_proc); + __swab32s(&ghdr->gh_seq); + __swab32s(&ghdr->gh_svc); + __swab32s(&ghdr->gh_pad1); + __swab32s(&ghdr->gh_handle.len); +} + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed) +{ + struct gss_header *ghdr; + + ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr)); + if (ghdr == NULL) + return NULL; + + if (swabbed) + gss_header_swabber(ghdr); + + if (sizeof(*ghdr) + ghdr->gh_handle.len > msg->lm_buflens[segment]) { + CERROR("gss header has length %d, now %u received\n", + (int) sizeof(*ghdr) + ghdr->gh_handle.len, + msg->lm_buflens[segment]); + return NULL; + } + + return ghdr; +} + +/* + * payload should be obtained from mechanism. but currently since we + * only support kerberos, we could simply use fixed value. + * krb5 "meta" data: + * - krb5 header: 16 + * - krb5 checksum: 20 + * + * for privacy mode, payload also include the cipher text which has the same + * size as plain text, plus possible confounder, padding both at maximum cipher + * block size. + */ +#define GSS_KRB5_INTEG_MAX_PAYLOAD (40) + +static inline +int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy) +{ + if (privacy) + return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize; + else + return GSS_KRB5_INTEG_MAX_PAYLOAD; +} + +/* + * return signature size, otherwise < 0 to indicate error + */ +static int gss_sign_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + enum lustre_sec_part sp, + __u32 flags, __u32 proc, __u32 seq, __u32 svc, + rawobj_t *handle) +{ + struct gss_header *ghdr; + rawobj_t text[4], mic; + int textcnt, max_textcnt, mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + /* gss hdr */ + LASSERT(msg->lm_buflens[0] >= + sizeof(*ghdr) + (handle ? 
handle->len : 0)); + ghdr = lustre_msg_buf(msg, 0, 0); + + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) sp; + ghdr->gh_flags = flags; + ghdr->gh_proc = proc; + ghdr->gh_seq = seq; + ghdr->gh_svc = svc; + if (!handle) { + /* fill in a fake one */ + ghdr->gh_handle.len = 0; + } else { + ghdr->gh_handle.len = handle->len; + memcpy(ghdr->gh_handle.data, handle->data, handle->len); + } + + /* no actual signature for null mode */ + if (svc == SPTLRPC_SVC_NULL) + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + /* MIC */ + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) { + CERROR("fail to generate MIC: %08x\n", major); + return -EPERM; + } + LASSERT(mic.len <= msg->lm_buflens[mic_idx]); + + return lustre_shrink_msg(msg, mic_idx, mic.len, 0); +} + +/* + * return gss error + */ +static +__u32 gss_verify_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + __u32 svc) +{ + rawobj_t text[4], mic; + int textcnt, max_textcnt; + int mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + if (svc == SPTLRPC_SVC_NULL) + return GSS_S_COMPLETE; + + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) + CERROR("mic verify error: %08x\n", major); + + return major; +} + +/* + * return gss error code + */ +static +__u32 gss_unseal_msg(struct gss_ctx *mechctx, + struct lustre_msg *msgbuf, + int *msg_len, int msgbuf_len) +{ + rawobj_t clear_obj, hdrobj, token; + __u8 *clear_buf; + int clear_buflen; + __u32 major; + ENTRY; + + if (msgbuf->lm_bufcount != 2) { + CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount); + RETURN(GSS_S_FAILURE); + } + + /* allocate a temporary clear text buffer, same sized as token, + * we assume the final clear text size <= token size */ + clear_buflen = lustre_msg_buflen(msgbuf, 1); + OBD_ALLOC_LARGE(clear_buf, clear_buflen); + if (!clear_buf) + RETURN(GSS_S_FAILURE); + + /* buffer objects */ + hdrobj.len = lustre_msg_buflen(msgbuf, 0); + hdrobj.data = lustre_msg_buf(msgbuf, 0, 0); + token.len = lustre_msg_buflen(msgbuf, 1); + token.data = lustre_msg_buf(msgbuf, 1, 0); + clear_obj.len = clear_buflen; + clear_obj.data = clear_buf; + + major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj); + if (major != GSS_S_COMPLETE) { + CERROR("unwrap message error: %08x\n", major); + GOTO(out_free, major = GSS_S_FAILURE); + } + LASSERT(clear_obj.len <= clear_buflen); + LASSERT(clear_obj.len <= msgbuf_len); + + /* now the decrypted message */ + memcpy(msgbuf, clear_obj.data, clear_obj.len); + *msg_len = clear_obj.len; + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(clear_buf, clear_buflen); + RETURN(major); +} + +/******************************************** + * gss client context manipulation helpers * + ********************************************/ + +int cli_ctx_expire(struct 
ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount)); + + if (!test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags)) { + if (!ctx->cc_early_expire) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + CWARN("ctx %p(%u->%s) get expired: %lld(%+llds)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, + ctx->cc_expire == 0 ? 0 : + ctx->cc_expire - ktime_get_real_seconds()); + + sptlrpc_cli_ctx_wakeup(ctx); + return 1; + } + + return 0; +} + +/* + * return 1 if the context is dead. + */ +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) +{ + if (unlikely(cli_ctx_is_dead(ctx))) + return 1; + + /* expire is 0 means never expire. a newly created gss context + * which during upcall may has 0 expiration */ + if (ctx->cc_expire == 0) + return 0; + + /* check real expiration */ + if (ctx->cc_expire > ktime_get_real_seconds()) + return 0; + + cli_ctx_expire(ctx); + return 1; +} + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + time64_t ctx_expiry; + + if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) { + CERROR("ctx %p(%u): unable to inquire, expire it now\n", + gctx, ctx->cc_vcred.vc_uid); + ctx_expiry = 1; /* make it expired now */ + } + + ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry, + ctx->cc_sec->ps_flvr.sf_flags); + + /* At this point this ctx might have been marked as dead by + * someone else, in which case nobody will make further use + * of it. we don't care, and mark it UPTODATE will help + * destroying server side context when it be destroyed. */ + set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + if (sec_is_reverse(ctx->cc_sec)) { + CWARN("server installed reverse ctx %p idx %#llx, " + "expiry %lld(%+llds)\n", ctx, + gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_expire, + ctx->cc_expire - ktime_get_real_seconds()); + } else { + CWARN("client refreshed ctx %p idx %#llx (%u->%s), " + "expiry %lld(%+llds)\n", ctx, + gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, + ctx->cc_expire - ktime_get_real_seconds()); + + /* install reverse svc ctx for root context */ + if (ctx->cc_vcred.vc_uid == 0) + gss_sec_install_rctx(ctx->cc_sec->ps_import, + ctx->cc_sec, ctx); + } + + sptlrpc_cli_ctx_wakeup(ctx); +} + +static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx) +{ + LASSERT(gctx->gc_base.cc_sec); + + if (gctx->gc_mechctx) { + lgss_delete_sec_context(&gctx->gc_mechctx); + gctx->gc_mechctx = NULL; + } + + if (!rawobj_empty(&gctx->gc_svc_handle)) { + /* forward ctx: mark buddy reverse svcctx soon-expire. */ + if (!sec_is_reverse(gctx->gc_base.cc_sec) && + !rawobj_empty(&gctx->gc_svc_handle)) + gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle); + + rawobj_free(&gctx->gc_svc_handle); + } + + rawobj_free(&gctx->gc_handle); +} + +/** + * Based on sequence number algorithm as specified in RFC 2203. + * + * Modified for our own problem: arriving request has valid sequence number, + * but unwrapping request might cost a long time, after that its sequence + * are not valid anymore (fall behind the window). It rarely happen, mostly + * under extreme load. + * + * Note we should not check sequence before verifying the integrity of incoming + * request, because just one attacking request with high sequence number might + * cause all following requests be dropped. + * + * So here we use a multi-phase approach: prepare 2 sequence windows, + * "main window" for normal sequence and "back window" for fall behind sequence. 
+ * and 3-phase checking mechanism: + * 0 - before integrity verification, perform an initial sequence checking in + * main window, which only tries and doesn't actually set any bits. if the + * sequence is high above the window or fits in the window and the bit + * is 0, then accept and proceed to integrity verification. otherwise + * reject this sequence. + * 1 - after integrity verification, check in main window again. if this + * sequence is high above the window or fits in the window and the bit + * is 0, then set the bit and accept; if it fits in the window but bit + * already set, then reject; if it falls behind the window, then proceed + * to phase 2. + * 2 - check in back window. if it is high above the window or fits in the + * window and the bit is 0, then set the bit and accept. otherwise reject. + * + * \return 1: looks like a replay + * \return 0: is ok + * \return -1: is a replay + * + * Note phase 0 is necessary, because otherwise replay attacking request of + * sequence which between the 2 windows can't be detected. + * + * This mechanism can't totally solve the problem, but could help reduce the + * number of valid requests be dropped. + */ +static +int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq, + __u32 seq_num, int phase) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (seq_num > *max_seq) { + /* + * 1. high above the window + */ + if (phase == 0) + return 0; + + if (seq_num >= *max_seq + win_size) { + memset(window, 0, win_size / 8); + *max_seq = seq_num; + } else { + while(*max_seq < seq_num) { + (*max_seq)++; + __clear_bit((*max_seq) % win_size, window); + } + } + __set_bit(seq_num % win_size, window); + } else if (seq_num + win_size <= *max_seq) { + /* + * 2. low behind the window + */ + if (phase == 0 || phase == 2) + goto replay; + + CWARN("seq %u is %u behind (size %d), check backup window\n", + seq_num, *max_seq - win_size - seq_num, win_size); + return 1; + } else { + /* + * 3. fit into the window + */ + switch (phase) { + case 0: + if (test_bit(seq_num % win_size, window)) + goto replay; + break; + case 1: + case 2: + if (__test_and_set_bit(seq_num % win_size, window)) + goto replay; + break; + } + } + + return 0; + +replay: + CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n", + seq_num, + seq_num + win_size > *max_seq ? "in" : "behind", + phase == 2 ? "backup " : "main", + *max_seq, win_size); + return -1; +} + +/* + * Based on sequence number algorithm as specified in RFC 2203. 
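+ * For illustration, with an 8-slot window: after seq 20 is accepted the
+ * main window covers 13..20; seq 22 slides it forward to 15..22; seq 16 is
+ * accepted once and flagged as a replay if seen again; seq 5 falls behind
+ * the main window and is checked against the back window instead.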
+ * + * if @set == 0: initial check, don't set any bit in window + * if @sec == 1: final check, set bit in window + */ +int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set) +{ + int rc = 0; + + spin_lock(&ssd->ssd_lock); + + if (set == 0) { + /* + * phase 0 testing + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 0); + if (unlikely(rc)) + gss_stat_oos_record_svc(0, 1); + } else { + /* + * phase 1 checking main window + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 1); + switch (rc) { + case -1: + gss_stat_oos_record_svc(1, 1); + fallthrough; + case 0: + goto exit; + } + /* + * phase 2 checking back window + */ + rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK, + &ssd->ssd_max_back, seq_num, 2); + if (rc) + gss_stat_oos_record_svc(2, 1); + else + gss_stat_oos_record_svc(2, 0); + } +exit: + spin_unlock(&ssd->ssd_lock); + return rc; +} + +/*************************************** + * cred APIs * + ***************************************/ + +static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx, + int msgsize, int privacy) +{ + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx, + struct sptlrpc_flavor *flvr, + int reply, int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT); + + if ((!reply && !read) || (reply && read)) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_cli_payload(ctx, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_cli_payload(ctx, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + return (ctx->cc_vcred.vc_uid == vcred->vc_uid); +} + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_CTX_NEW) + strlcat(buf, "new,", bufsize); + if (flags & PTLRPC_CTX_UPTODATE) + strlcat(buf, "uptodate,", bufsize); + if (flags & PTLRPC_CTX_DEAD) + strlcat(buf, "dead,", bufsize); + if (flags & PTLRPC_CTX_ERROR) + strlcat(buf, "error,", bufsize); + if (flags & PTLRPC_CTX_CACHED) + strlcat(buf, "cached,", bufsize); + if (flags & PTLRPC_CTX_ETERNAL) + strlcat(buf, "eternal,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); +} + +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + __u32 flags = 0, seq, svc; + int rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(req->rq_cli_ctx == ctx); + + /* nothing to do for context negotiation RPCs */ + if (req->rq_ctx_init) + RETURN(0); + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + flags |= LUSTRE_GSS_PACK_USER; + +redo: + seq = atomic_inc_return(&gctx->gc_seq); + + rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx, + ctx->cc_sec->ps_part, + flags, gctx->gc_proc, seq, svc, + &gctx->gc_handle); + if (rc < 0) + RETURN(rc); + + /* gss_sign_msg() msg might take long time to finish, in which period + * more rpcs could be wrapped up and sent out. 
if we found too many + * of them we should repack this rpc, because sent it too late might + * lead to the sequence number fall behind the window on server and + * be dropped. also applies to gss_cli_ctx_seal(). + * + * Note: null mode doesn't check sequence number. */ + if (svc != SPTLRPC_SVC_NULL && + atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) { + int behind = atomic_read(&gctx->gc_seq) - seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry signing\n", req, behind); + goto redo; + } + + req->rq_reqdata_len = rc; + RETURN(0); +} + +static +int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct gss_header *ghdr) +{ + struct gss_err_header *errhdr; + int rc; + + LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR); + + errhdr = (struct gss_err_header *) ghdr; + + CWARN("req x%llu/t%llu, ctx %p idx %#llx(%u->%s): " + "%sserver respond (%08x/%08x)\n", + req->rq_xid, req->rq_transno, ctx, + gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + sec_is_reverse(ctx->cc_sec) ? "reverse" : "", + errhdr->gh_major, errhdr->gh_minor); + + /* context fini rpc, let it failed */ + if (req->rq_ctx_fini) { + CWARN("context fini rpc failed\n"); + return -EINVAL; + } + + /* reverse sec, just return error, don't expire this ctx because it's + * crucial to callback rpcs. note if the callback rpc failed because + * of bit flip during network transfer, the client will be evicted + * directly. so more gracefully we probably want let it retry for + * number of times. */ + if (sec_is_reverse(ctx->cc_sec)) + return -EINVAL; + + if (errhdr->gh_major != GSS_S_NO_CONTEXT && + errhdr->gh_major != GSS_S_BAD_SIG) + return -EACCES; + + /* server return NO_CONTEXT might be caused by context expire + * or server reboot/failover. we try to refresh a new ctx which + * be transparent to upper layer. + * + * In some cases, our gss handle is possible to be incidentally + * identical to another handle since the handle itself is not + * fully random. In krb5 case, the GSS_S_BAD_SIG will be + * returned, maybe other gss error for other mechanism. + * + * if we add new mechanism, make sure the correct error are + * returned in this case. */ + CWARN("%s: server might lost the context, retrying\n", + errhdr->gh_major == GSS_S_NO_CONTEXT ? "NO_CONTEXT" : "BAD_SIG"); + + sptlrpc_cli_ctx_expire(ctx); + + /* we need replace the ctx right here, otherwise during + * resent we'll hit the logic in sptlrpc_req_refresh_ctx() + * which keep the ctx with RESEND flag, thus we'll never + * get rid of this ctx. */ + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc == 0) + req->rq_resend = 1; + + return rc; +} + +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr, *reqhdr; + struct lustre_msg *msg = req->rq_repdata; + __u32 major; + int pack_bulk, swabbed, rc = 0; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* special case for context negotiation, rq_repmsg/rq_replen actually + * are not used currently. 
but early reply always be treated normally */ + if (req->rq_ctx_init && !req->rq_early) { + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + RETURN(0); + } + + if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + swabbed = ptlrpc_rep_need_swab(req); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr)); + LASSERT(reqhdr); + + if (ghdr->gh_version != reqhdr->gh_version) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, reqhdr->gh_version); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? "missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (ghdr->gh_seq != reqhdr->gh_seq) { + CERROR("seqnum %u mismatch, expect %u\n", + ghdr->gh_seq, reqhdr->gh_seq); + RETURN(-EPROTO); + } + + if (ghdr->gh_svc != reqhdr->gh_svc) { + CERROR("svc %u mismatch, expect %u\n", + ghdr->gh_svc, reqhdr->gh_svc); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc); + if (major != GSS_S_COMPLETE) { + CERROR("failed to verify reply: %x\n", major); + RETURN(-EPERM); + } + + if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) { + __u32 cksum; + + cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(msg, 1, 0), + lustre_msg_buflen(msg, 1)); + if (cksum != msg->lm_cksum) { + CWARN("early reply checksum mismatch: " + "%08x != %08x\n", cksum, msg->lm_cksum); + RETURN(-EPROTO); + } + } + + if (pack_bulk) { + /* bulk checksum is right after the lustre msg */ + if (msg->lm_bufcount < 3) { + CERROR("Invalid reply bufcount %u\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + rc = bulk_sec_desc_unpack(msg, 2, swabbed); + if (rc) { + CERROR("unpack bulk desc: %d\n", rc); + RETURN(rc); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unknown gss proc %d\n", ghdr->gh_proc); + rc = -EPROTO; + } + + RETURN(rc); +} + +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u32 buflens[2], major; + int wiresize, rc; + ENTRY; + + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_reqlen); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* final clear data length */ + req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount, + req->rq_clrbuf->lm_buflens); + + /* calculate wire data length */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1); + wiresize = lustre_msg_size_v2(2, buflens); + + /* allocate wire buffer */ + if (req->rq_pool) { + /* pre-allocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf != req->rq_clrbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + } else { + OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); 
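+		/* buffer was allocated here rather than taken from the pool,
+		 * so the err_free path below and gss_free_reqbuf() are
+		 * responsible for releasing it */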
+ req->rq_reqbuf_len = wiresize; + } + + lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + /* gss header */ + ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = gctx->gc_proc; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = gctx->gc_handle.len; + memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len); + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + +redo: + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + + /* buffer objects */ + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = req->rq_clrdata_len; + msgobj.data = (__u8 *) req->rq_clrbuf; + token.len = lustre_msg_buflen(req->rq_reqbuf, 1); + token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + major = lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj, + req->rq_clrbuf_len, &token); + if (major != GSS_S_COMPLETE) { + CERROR("priv: wrap message error: %08x\n", major); + GOTO(err_free, rc = -EPERM); + } + LASSERT(token.len <= buflens[1]); + + /* see explain in gss_cli_ctx_sign() */ + if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq > + GSS_SEQ_REPACK_THRESHOLD)) { + int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry sealing\n", req, behind); + + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + goto redo; + } + + /* now set the final wire data length */ + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1, token.len,0); + RETURN(0); + +err_free: + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + RETURN(rc); +} + +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr; + struct lustre_msg *msg = req->rq_repdata; + int msglen, pack_bulk, swabbed, rc; + __u32 major; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_ctx_init == 0); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + swabbed = ptlrpc_rep_need_swab(req); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, PTLRPC_GSS_VERSION); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? "missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + /* use rq_repdata_len as buffer size, which assume unseal + * doesn't need extra memory space. 
for precise control, we'd + * better calculate out actual buffer size as + * (repbuf_len - offset - repdata_len) */ + major = gss_unseal_msg(gctx->gc_mechctx, msg, + &msglen, req->rq_repdata_len); + if (major != GSS_S_COMPLETE) { + CERROR("failed to unwrap reply: %x\n", major); + rc = -EPERM; + break; + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EPROTO); + } + + if (msg->lm_bufcount < 1) { + CERROR("Invalid reply buffer: empty\n"); + RETURN(-EPROTO); + } + + if (pack_bulk) { + if (msg->lm_bufcount < 2) { + CERROR("bufcount %u: missing bulk sec desc\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + /* bulk checksum is the last segment */ + if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1, + swabbed)) + RETURN(-EPROTO); + } + + req->rq_repmsg = lustre_msg_buf(msg, 0, 0); + req->rq_replen = msg->lm_buflens[0]; + + rc = 0; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unexpected proc %d\n", ghdr->gh_proc); + rc = -EPERM; + } + + RETURN(rc); +} + +/********************************************* + * reverse context installation * + *********************************************/ + +static inline +int gss_install_rvs_svc_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + return gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx); +} + +/********************************************* + * GSS security APIs * + *********************************************/ +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct ptlrpc_sec *sec; + + LASSERT(imp); + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS); + + gsec->gs_mech = lgss_subflavor_to_mech( + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + if (!gsec->gs_mech) { + CERROR("gss backend 0x%x not found\n", + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + return -EOPNOTSUPP; + } + + spin_lock_init(&gsec->gs_lock); + gsec->gs_rvs_hdl = 0ULL; + + /* initialize upper ptlrpc_sec */ + sec = &gsec->gs_base; + sec->ps_policy = policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_flvr = *sf; + sec->ps_import = class_import_get(imp); + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_sepol_mtime = ktime_set(0, 0); + sec->ps_sepol_checknext = ktime_set(0, 0); + sec->ps_sepol[0] = '\0'; + + if (!svcctx) { + sec->ps_gc_interval = GSS_GC_INTERVAL; + } else { + LASSERT(sec_is_reverse(sec)); + + /* never do gc on reverse sec */ + sec->ps_gc_interval = 0; + } + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_add_user(); + + CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? 
"reverse " : ""), + policy->sp_name, gsec); + return 0; +} + +void gss_sec_destroy_common(struct gss_sec *gsec) +{ + struct ptlrpc_sec *sec = &gsec->gs_base; + ENTRY; + + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + + if (gsec->gs_mech) { + lgss_mech_put(gsec->gs_mech); + gsec->gs_mech = NULL; + } + + class_import_put(sec->ps_import); + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_del_user(); + + EXIT; +} + +void gss_sec_kill(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + gctx->gc_win = 0; + atomic_set(&gctx->gc_seq, 0); + + INIT_HLIST_NODE(&ctx->cc_cache); + atomic_set(&ctx->cc_refcount, 0); + ctx->cc_sec = sec; + ctx->cc_ops = ctxops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_NEW; + ctx->cc_vcred = *vcred; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + /* take a ref on belonging sec, balanced in ctx destroying */ + atomic_inc(&sec->ps_refcount); + /* statistic only */ + atomic_inc(&sec->ps_nctx); + + CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + return 0; +} + +/* + * return value: + * 1: the context has been taken care of by someone else + * 0: proceed to really destroy the context locally + */ +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + /* + * remove UPTODATE flag of reverse ctx thus we won't send fini rpc, + * this is to avoid potential problems of client side reverse svc ctx + * be mis-destroyed in various recovery senarios. anyway client can + * manage its reverse ctx well by associating it with its buddy ctx. + */ + if (sec_is_reverse(sec)) + ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE; + + if (gctx->gc_mechctx) { + /* the final context fini rpc will use this ctx too, and it's + * asynchronous which finished by request_out_callback(). so + * we add refcount, whoever drop finally drop the refcount to + * 0 should responsible for the rest of destroy. */ + atomic_inc(&ctx->cc_refcount); + + gss_do_ctx_fini_rpc(gctx); + gss_cli_ctx_finalize(gctx); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return 1; + } + + if (sec_is_reverse(sec)) + CWARN("reverse sec %p: destroy ctx %p\n", + ctx->cc_sec, ctx); + else + CWARN("%s@%p: destroy ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + return 0; +} + +static +int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int bufsize, txtsize; + int bufcnt = 2; + __u32 buflens[5]; + ENTRY; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - user descriptor (optional) + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. 
+ */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_udesc) { + buflens[bufcnt] = sptlrpc_current_user_desc_size(); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 0, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + bufsize = lustre_msg_size_v2(bufcnt, buflens); + + if (!req->rq_reqbuf) { + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = bufsize; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= bufsize); + memset(req->rq_reqbuf, 0, bufsize); + } + + lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize); + LASSERT(req->rq_reqmsg); + + /* pack user desc here, later we might leave current user's process */ + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, 2); + + RETURN(0); +} + +static +int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 ibuflens[3], wbuflens[2]; + int ibufcnt; + int clearsize, wiresize; + ENTRY; + + LASSERT(req->rq_clrbuf == NULL); + LASSERT(req->rq_clrbuf_len == 0); + + /* Inner (clear) buffers + * - lustre message + * - user descriptor (optional) + * - bulk checksum (optional) + */ + ibufcnt = 1; + ibuflens[0] = msgsize; + + if (req->rq_pack_udesc) + ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size(); + if (req->rq_pack_bulk) + ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, 0, + req->rq_bulk_read); + + clearsize = lustre_msg_size_v2(ibufcnt, ibuflens); + /* to allow append padding during encryption */ + clearsize += GSS_MAX_CIPHER_BLOCK; + + /* Wrapper (wire) buffers + * - gss header + * - cipher text + */ + wbuflens[0] = PTLRPC_GSS_HEADER_SIZE; + wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1); + wiresize = lustre_msg_size_v2(2, wbuflens); + + if (req->rq_pool) { + /* rq_reqbuf is preallocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + + memset(req->rq_reqbuf, 0, req->rq_reqbuf_len); + + /* if the pre-allocated buffer is big enough, we just pack + * both clear buf & request buf in it, to avoid more alloc. 
*/ + if (clearsize + wiresize <= req->rq_reqbuf_len) { + req->rq_clrbuf = + (void *) (((char *) req->rq_reqbuf) + wiresize); + } else { + CWARN("pre-allocated buf size %d is not enough for " + "both clear (%d) and cipher (%d) text, proceed " + "with extra allocation\n", req->rq_reqbuf_len, + clearsize, wiresize); + } + } + + if (!req->rq_clrbuf) { + clearsize = size_roundup_power2(clearsize); + + OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize); + if (!req->rq_clrbuf) + RETURN(-ENOMEM); + } + req->rq_clrbuf_len = clearsize; + + lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_clrbuf, 1); + + RETURN(0); +} + +/* + * NOTE: any change of request buffer allocation should also consider + * changing enlarge_reqbuf() series functions. + */ +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_reqbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_reqbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + int privacy; + ENTRY; + + LASSERT(!req->rq_pool || req->rq_reqbuf); + privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV; + + if (!req->rq_clrbuf) + goto release_reqbuf; + + /* release clear buffer */ + LASSERT(privacy); + LASSERT(req->rq_clrbuf_len); + + if (req->rq_pool == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + + req->rq_clrbuf = NULL; + req->rq_clrbuf_len = 0; + +release_reqbuf: + if (!req->rq_pool && req->rq_reqbuf) { + LASSERT(req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + + EXIT; +} + +static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize) +{ + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, bufsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = bufsize; + return 0; +} + +static +int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int txtsize; + __u32 buflens[4]; + int bufcnt = 2; + int alloc_size; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. 
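+	 * (this mirrors gss_alloc_reqbuf_intg() above, but without the
+	 * user descriptor segment.)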
+ */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + + /* add space for early reply */ + alloc_size += gss_at_reply_off_integ; + + return do_alloc_repbuf(req, alloc_size); +} + +static +int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int txtsize; + __u32 buflens[2]; + int bufcnt; + int alloc_size; + + /* inner buffers */ + bufcnt = 1; + buflens[0] = msgsize; + + if (req->rq_pack_bulk) + buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + txtsize = lustre_msg_size_v2(bufcnt, buflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffers */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + /* add space for early reply */ + alloc_size += gss_at_reply_off_priv; + + return do_alloc_repbuf(req, alloc_size); +} + +int gss_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + ENTRY; + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_repbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_repbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_repdata_len = 0; +} + +static int get_enlarged_msgsize(struct lustre_msg *msg, + int segment, int newsize) +{ + int save, newmsg_size; + + LASSERT(newsize >= msg->lm_buflens[segment]); + + save = msg->lm_buflens[segment]; + msg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment] = save; + + return newmsg_size; +} + +static int get_enlarged_msgsize2(struct lustre_msg *msg, + int segment1, int newsize1, + int segment2, int newsize2) +{ + int save1, save2, newmsg_size; + + LASSERT(newsize1 >= msg->lm_buflens[segment1]); + LASSERT(newsize2 >= msg->lm_buflens[segment2]); + + save1 = msg->lm_buflens[segment1]; + save2 = msg->lm_buflens[segment2]; + msg->lm_buflens[segment1] = newsize1; + msg->lm_buflens[segment2] = newsize2; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment1] = save1; + msg->lm_buflens[segment2] = save2; + + return newmsg_size; +} + +static +int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int txtsize, sigsize = 0, i; + int newmsg_size, newbuf_size; + + /* + * gss header is at seg 0; + * embedded msg is at seg 1; + * signature (if any) is at the 
last seg + */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len > req->rq_reqlen); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg); + + /* 1. compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]); + + /* 2. compute new wrapper msg size */ + if (svc == SPTLRPC_SVC_NULL) { + /* no signature, get size directly */ + newbuf_size = get_enlarged_msgsize(req->rq_reqbuf, + 1, newmsg_size); + } else { + txtsize = req->rq_reqbuf->lm_buflens[0]; + + if (svc == SPTLRPC_SVC_INTG) { + for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++) + txtsize += req->rq_reqbuf->lm_buflens[i]; + txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1]; + } + + sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0); + LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf)); + + newbuf_size = get_enlarged_msgsize2( + req->rq_reqbuf, + 1, newmsg_size, + msg_last_segidx(req->rq_reqbuf), + sigsize); + } + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + /* do enlargement, from wrapper to embedded, from end to begin */ + if (svc != SPTLRPC_SVC_NULL) + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, + msg_last_segidx(req->rq_reqbuf), + sigsize); + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +static +int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newclrbuf; + int newmsg_size, newclrbuf_size, newcipbuf_size; + __u32 buflens[3]; + + /* + * embedded msg is at seg 0 of clear buffer; + * cipher text is at seg 2 of cipher buffer; + */ + LASSERT(req->rq_pool || + (req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0)); + LASSERT(req->rq_reqbuf == NULL || + (req->rq_pool && req->rq_reqbuf->lm_bufcount == 3)); + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_clrbuf_len > req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg); + + /* compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + + /* compute new clear buffer size */ + newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size); + newclrbuf_size += GSS_MAX_CIPHER_BLOCK; + + /* compute new cipher buffer size */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0); + buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1); + newcipbuf_size = 
lustre_msg_size_v2(3, buflens); + + /* handle the case that we put both clear buf and cipher buf into + * pre-allocated single buffer. */ + if (unlikely(req->rq_pool) && + req->rq_clrbuf >= req->rq_reqbuf && + (char *) req->rq_clrbuf < + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + /* it couldn't be better we still fit into the + * pre-allocated buffer. */ + if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) { + void *src, *dst; + + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + /* move clear text backward. */ + src = req->rq_clrbuf; + dst = (char *) req->rq_reqbuf + newcipbuf_size; + + memmove(dst, src, req->rq_clrbuf_len); + + req->rq_clrbuf = (struct lustre_msg *) dst; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } else { + /* sadly we have to split out the clear buffer */ + LASSERT(req->rq_reqbuf_len >= newcipbuf_size); + LASSERT(req->rq_clrbuf_len < newclrbuf_size); + } + } + + if (req->rq_clrbuf_len < newclrbuf_size) { + newclrbuf_size = size_roundup_power2(newclrbuf_size); + + OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size); + if (newclrbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len); + + if (req->rq_reqbuf == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + } + + req->rq_clrbuf = newclrbuf; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + RETURN(0); +} + +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_enlarge_reqbuf_intg(sec, req, svc, segment, newsize); + case SPTLRPC_SVC_PRIV: + return gss_enlarge_reqbuf_priv(sec, req, segment, newsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +int gss_sec_install_rctx(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_sec *gsec; + struct gss_cli_ctx *gctx; + int rc; + + gsec = container_of(sec, struct gss_sec, gs_base); + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + rc = gss_install_rvs_svc_ctx(imp, gsec, gctx); + return rc; +} + +/******************************************** + * server side API * + ********************************************/ + +static inline +int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx) +{ + LASSERT(grctx); + return (grctx->src_init || grctx->src_init_continue || + grctx->src_err_notify); +} + +static +void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx) +{ + if (grctx->src_ctx) + 
gss_svc_upcall_put_ctx(grctx->src_ctx); + + sptlrpc_policy_put(grctx->src_base.sc_policy); + OBD_FREE_PTR(grctx); +} + +static inline +void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + atomic_inc(&grctx->src_base.sc_refcount); +} + +static inline +void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + + if (atomic_dec_and_test(&grctx->src_base.sc_refcount)) + gss_svc_reqctx_free(grctx); +} + +static +int gss_svc_sign(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx, + __u32 svc) +{ + __u32 flags = 0; + int rc; + ENTRY; + + LASSERT(rs->rs_msg == lustre_msg_buf(rs->rs_repbuf, 1, 0)); + + /* embedded lustre_msg might have been shrunk */ + if (req->rq_replen != rs->rs_repbuf->lm_buflens[1]) + lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1); + + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + + rc = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx, + LUSTRE_SP_ANY, flags, PTLRPC_GSS_PROC_DATA, + grctx->src_wirectx.gw_seq, svc, NULL); + if (rc < 0) + RETURN(rc); + + rs->rs_repdata_len = rc; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = gss_at_reply_off_integ; + else + req->rq_reply_off = 0; + } else { + if (svc == SPTLRPC_SVC_NULL) + rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(rs->rs_repbuf, 1, 0), + lustre_msg_buflen(rs->rs_repbuf, 1)); + req->rq_reply_off = 0; + } + + RETURN(0); +} + +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct ptlrpc_reply_state *rs; + struct gss_err_header *ghdr; + int replen = sizeof(struct ptlrpc_body); + int rc; + ENTRY; + + //if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_SVCGSS_ERR_NOTIFY, OBD_FAIL_ONCE)) + // RETURN(-EINVAL); + + grctx->src_err_notify = 1; + grctx->src_reserve_len = 0; + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("could not pack reply, err %d\n", rc); + RETURN(rc); + } + + /* gss hdr */ + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*ghdr)); + ghdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_ERR; + ghdr->gh_major = major; + ghdr->gh_minor = minor; + ghdr->gh_handle.len = 0; /* fake context handle */ + + rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n", + major, minor, libcfs_nid2str(req->rq_peer.nid)); + RETURN(0); +} + +static +int gss_svc_handle_init(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct lustre_msg *reqbuf = req->rq_reqbuf; + struct obd_uuid *uuid; + struct obd_device *target; + rawobj_t uuid_obj, rvs_hdl, in_token; + __u32 lustre_svc; + __u32 *secdata, seclen; + int swabbed, rc; + ENTRY; + + CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc, + libcfs_nid2str(req->rq_peer.nid)); + + req->rq_ctx_init = 1; + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + CERROR("unexpected bulk flag\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) { + CERROR("proc %u: invalid handle length %u\n", + gw->gw_proc, gw->gw_handle.len); + 
RETURN(SECSVC_DROP); + } + + if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4){ + CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + /* ctx initiate payload is in last segment */ + secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0); + seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1]; + + if (seclen < 4 + 4) { + CERROR("sec size %d too small\n", seclen); + RETURN(SECSVC_DROP); + } + + /* lustre svc type */ + lustre_svc = le32_to_cpu(*secdata++); + seclen -= 4; + + /* extract target uuid, note this code is somewhat fragile + * because touched internal structure of obd_uuid */ + if (rawobj_extract(&uuid_obj, &secdata, &seclen)) { + CERROR("failed to extract target uuid\n"); + RETURN(SECSVC_DROP); + } + uuid_obj.data[uuid_obj.len - 1] = '\0'; + + uuid = (struct obd_uuid *) uuid_obj.data; + target = class_uuid2obd(uuid); + if (!target || target->obd_stopping || !target->obd_set_up) { + CERROR("target '%s' is not available for context init (%s)\n", + uuid->uuid, target == NULL ? "no target" : + (target->obd_stopping ? "stopping" : "not set up")); + RETURN(SECSVC_DROP); + } + + /* extract reverse handle */ + if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) { + CERROR("failed extract reverse handle\n"); + RETURN(SECSVC_DROP); + } + + /* extract token */ + if (rawobj_extract(&in_token, &secdata, &seclen)) { + CERROR("can't extract token\n"); + RETURN(SECSVC_DROP); + } + + rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc, + &rvs_hdl, &in_token); + if (rc != SECSVC_OK) + RETURN(rc); + + if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || + grctx->src_ctx->gsc_usr_root) + CWARN("create svc ctx %p: user from %s authenticated as %s\n", + grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), + grctx->src_ctx->gsc_usr_root ? "root" : + (grctx->src_ctx->gsc_usr_mds ? "mds" : + (grctx->src_ctx->gsc_usr_oss ? "oss" : "null"))); + else + CWARN("create svc ctx %p: accept user %u from %s\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid, + libcfs_nid2str(req->rq_peer.nid)); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor\n"); + RETURN(SECSVC_DROP); + } + if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0); + } + + req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0); + req->rq_reqlen = lustre_msg_buflen(reqbuf, 1); + + RETURN(rc); +} + +/* + * last segment must be the gss signature. 
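+ * for SPTLRPC_SVC_NULL there is no signature to check, and the
+ * verification step below is skipped.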
+ */ +static +int gss_svc_verify_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int offset = 2; + int swabbed; + ENTRY; + + *major = GSS_S_COMPLETE; + + if (msg->lm_bufcount < 2) { + CERROR("Too few segments (%u) in request\n", msg->lm_bufcount); + RETURN(-EINVAL); + } + + if (gw->gw_svc == SPTLRPC_SVC_NULL) + goto verified; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to verify request: %x\n", *major); + RETURN(-EACCES); + } + + if (gctx->gsc_reverse == 0 && + gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + +verified: + swabbed = ptlrpc_req_need_swab(req); + + /* user descriptor */ + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("no user desc included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + /* check bulk_sec_desc data */ + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("missing bulk sec descriptor\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(msg, 1, 0); + req->rq_reqlen = msg->lm_buflens[1]; + RETURN(0); +} + +static +int gss_svc_unseal_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int swabbed, msglen, offset = 1; + ENTRY; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_unseal_msg(gctx->gsc_mechctx, msg, + &msglen, req->rq_reqdata_len); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to unwrap request: %x\n", *major); + RETURN(-EACCES); + } + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EINVAL); + } + req->rq_reqdata_len = msglen; + + if (msg->lm_bufcount < 1) { + CERROR("Invalid buffer: is empty\n"); + RETURN(-EINVAL); + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < offset + 1) { + CERROR("no user descriptor included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + if 
(msg->lm_bufcount < offset + 1) { + CERROR("no bulk checksum included\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0); + req->rq_reqlen = req->rq_reqbuf->lm_buflens[0]; + RETURN(0); +} + +static +int gss_svc_handle_data(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major = 0; + int rc = 0; + ENTRY; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + major = GSS_S_NO_CONTEXT; + goto error; + } + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_verify_request(req, grctx, gw, &major); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_unseal_request(req, grctx, gw, &major); + break; + default: + CERROR("unsupported gss service %d\n", gw->gw_svc); + rc = -EINVAL; + } + + if (rc == 0) + RETURN(SECSVC_OK); + + CERROR("svc %u failed: major 0x%08x: req xid %llu ctx %p idx " + "%#llx(%u->%s)\n", gw->gw_svc, major, req->rq_xid, + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); +error: + /* we only notify client in case of NO_CONTEXT/BAD_SIG, which + * might happen after server reboot, to allow recovery. */ + if ((major == GSS_S_NO_CONTEXT || major == GSS_S_BAD_SIG) && + gss_pack_err_notify(req, major, 0) == 0) + RETURN(SECSVC_COMPLETE); + + RETURN(SECSVC_DROP); +} + +static +int gss_svc_handle_destroy(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major; + ENTRY; + + req->rq_ctx_fini = 1; + req->rq_no_reply = 1; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + CDEBUG(D_SEC, "invalid gss context handle for destroy.\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_svc != SPTLRPC_SVC_INTG) { + CERROR("svc %u is not supported in destroy.\n", gw->gw_svc); + RETURN(SECSVC_DROP); + } + + if (gss_svc_verify_request(req, grctx, gw, &major)) + RETURN(SECSVC_DROP); + + CWARN("destroy svc ctx %p idx %#llx (%u->%s)\n", + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (req->rq_reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2, + ptlrpc_req_need_swab(req))) { + CERROR("Mal-formed user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0); + } + + RETURN(SECSVC_OK); +} + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req) +{ + struct gss_header *ghdr; + struct gss_svc_reqctx *grctx; + struct gss_wire_ctx *gw; + int swabbed, rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_svc_ctx == NULL); + + if (req->rq_reqbuf->lm_bufcount < 2) { + CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(SECSVC_DROP); + } + + /* sanity checks 
*/ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u, expect %u\n", ghdr->gh_version, + PTLRPC_GSS_VERSION); + RETURN(SECSVC_DROP); + } + + req->rq_sp_from = ghdr->gh_sp; + + /* alloc grctx data */ + OBD_ALLOC_PTR(grctx); + if (!grctx) + RETURN(SECSVC_DROP); + + grctx->src_base.sc_policy = sptlrpc_policy_get(policy); + atomic_set(&grctx->src_base.sc_refcount, 1); + req->rq_svc_ctx = &grctx->src_base; + gw = &grctx->src_wirectx; + + /* save wire context */ + gw->gw_flags = ghdr->gh_flags; + gw->gw_proc = ghdr->gh_proc; + gw->gw_seq = ghdr->gh_seq; + gw->gw_svc = ghdr->gh_svc; + rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle); + + /* keep original wire header which subject to checksum verification */ + if (swabbed) + gss_header_swabber(ghdr); + + switch(ghdr->gh_proc) { + case PTLRPC_GSS_PROC_INIT: + case PTLRPC_GSS_PROC_CONTINUE_INIT: + rc = gss_svc_handle_init(req, gw); + break; + case PTLRPC_GSS_PROC_DATA: + rc = gss_svc_handle_data(req, gw); + break; + case PTLRPC_GSS_PROC_DESTROY: + rc = gss_svc_handle_destroy(req, gw); + break; + default: + CERROR("unknown proc %u\n", gw->gw_proc); + rc = SECSVC_DROP; + break; + } + + switch (rc) { + case SECSVC_OK: + LASSERT (grctx->src_ctx); + + req->rq_auth_gss = 1; + req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds; + req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss; + req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root; + req->rq_auth_uid = grctx->src_ctx->gsc_uid; + req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid; + break; + case SECSVC_COMPLETE: + break; + case SECSVC_DROP: + gss_svc_reqctx_free(grctx); + req->rq_svc_ctx = NULL; + break; + } + + RETURN(rc); +} + +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_svc_reqctx *grctx; + ENTRY; + + if (svc_ctx == NULL) { + EXIT; + return; + } + + grctx = gss_svc_ctx2reqctx(svc_ctx); + + CWARN("gss svc invalidate ctx %p(%u)\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid); + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + EXIT; +} + +static inline +int gss_svc_payload(struct gss_svc_reqctx *grctx, int early, + int msgsize, int privacy) +{ + /* we should treat early reply normally, but which is actually sharing + * the same ctx with original request, so in this case we should + * ignore the special ctx's special flags */ + if (early == 0 && gss_svc_reqctx_is_special(grctx)) + return grctx->src_reserve_len; + + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx, + struct sptlrpc_flavor *flvr, + int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + if (read) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_mech_payload(NULL, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_mech_payload(NULL, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_reply_state *rs; + int early, privacy, svc, bsd_off = 0; + __u32 ibuflens[2], buflens[4]; + int ibufcnt = 0, bufcnt; + int txtsize, wmsg_size, rs_size; + ENTRY; + + LASSERT(msglen % 8 == 0); + + if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) { + CERROR("client request bulk sec on non-bulk rpc\n"); + RETURN(-EPROTO); + } + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + early = (req->rq_packed_final == 0); + + grctx = 
gss_svc_ctx2reqctx(req->rq_svc_ctx); + if (!early && gss_svc_reqctx_is_special(grctx)) + privacy = 0; + else + privacy = (svc == SPTLRPC_SVC_PRIV); + + if (privacy) { + /* inner clear buffers */ + ibufcnt = 1; + ibuflens[0] = msglen; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = ibufcnt; + ibuflens[ibufcnt++] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + } + + txtsize = lustre_msg_size_v2(ibufcnt, ibuflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffer */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_svc_payload(grctx, early, txtsize, 1); + } else { + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = msglen; + + txtsize = buflens[0]; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = bufcnt; + buflens[bufcnt] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if ((!early && gss_svc_reqctx_is_special(grctx)) || + svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_svc_payload(grctx, early, + txtsize, 0); + } + + wmsg_size = lustre_msg_size_v2(bufcnt, buflens); + + rs_size = sizeof(*rs) + wmsg_size; + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = wmsg_size; + + /* initialize the buffer */ + if (privacy) { + lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL); + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen); + } else { + lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0); + } + + if (bsd_off) { + grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0); + grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf, + bsd_off); + } + + gss_svc_reqctx_addref(grctx); + rs->rs_svc_ctx = req->rq_svc_ctx; + + LASSERT(rs->rs_msg); + req->rq_reply_state = rs; + RETURN(0); +} + +static int gss_svc_seal(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u8 *token_buf; + int token_buflen; + __u32 buflens[2], major; + int msglen, rc; + ENTRY; + + /* get clear data length. 
note embedded lustre_msg might + * have been shrunk */ + if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0)) + msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1); + else + msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + /* temporarily use tail of buffer to hold gss header data */ + LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len); + ghdr = (struct gss_header *) ((char *) rs->rs_repbuf + + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = LUSTRE_SP_ANY; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_DATA; + ghdr->gh_seq = grctx->src_wirectx.gw_seq; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = 0; + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + + /* allocate temporary cipher buffer */ + token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1); + OBD_ALLOC_LARGE(token_buf, token_buflen); + if (token_buf == NULL) + RETURN(-ENOMEM); + + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = msglen; + msgobj.data = (__u8 *) rs->rs_repbuf; + token.len = token_buflen; + token.data = token_buf; + + major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj, + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token); + if (major != GSS_S_COMPLETE) { + CERROR("wrap message error: %08x\n", major); + GOTO(out_free, rc = -EPERM); + } + LASSERT(token.len <= token_buflen); + + /* we are about to override data at rs->rs_repbuf, nullify pointers + * to which to catch further illegal usage. */ + if (req->rq_pack_bulk) { + grctx->src_repbsd = NULL; + grctx->src_repbsd_size = 0; + } + + /* now fill the actual wire data + * - gss header + * - gss token + */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = token.len; + + rs->rs_repdata_len = lustre_msg_size_v2(2, buflens); + LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len); + + lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr, + PTLRPC_GSS_HEADER_SIZE); + memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len); + + /* reply offset */ + if (req->rq_packed_final && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) + req->rq_reply_off = gss_at_reply_off_priv; + else + req->rq_reply_off = 0; + + /* to catch upper layer's further access */ + rs->rs_msg = NULL; + req->rq_repmsg = NULL; + req->rq_replen = 0; + + rc = 0; +out_free: + OBD_FREE_LARGE(token_buf, token_buflen); + RETURN(rc); +} + +int gss_svc_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct gss_wire_ctx *gw = &grctx->src_wirectx; + int early, rc; + ENTRY; + + early = (req->rq_packed_final == 0); + + if (!early && gss_svc_reqctx_is_special(grctx)) { + LASSERT(rs->rs_repdata_len != 0); + + req->rq_reply_off = gss_at_reply_off_integ; + RETURN(0); + } + + /* early reply could happen in many cases */ + if (!early && + gw->gw_proc != PTLRPC_GSS_PROC_DATA && + gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) { + CERROR("proc %d not support\n", gw->gw_proc); + RETURN(-EINVAL); + } + + LASSERT(grctx->src_ctx); + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_sign(req, rs, grctx, gw->gw_svc); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_seal(req, rs, grctx); + break; + default: + CERROR("Unknown service 
%d\n", gw->gw_svc); + GOTO(out, rc = -EINVAL); + } + rc = 0; + +out: + RETURN(rc); +} + +void gss_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct gss_svc_reqctx *grctx; + + LASSERT(rs->rs_svc_ctx); + grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base); + + gss_svc_reqctx_decref(grctx); + rs->rs_svc_ctx = NULL; + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->sc_refcount) == 0); + gss_svc_reqctx_free(gss_svc_ctx2reqctx(ctx)); +} + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_cli_ctx *cli_gctx = ctx2gctx(cli_ctx); + struct gss_svc_ctx *svc_gctx = gss_svc_ctx2gssctx(svc_ctx); + struct gss_ctx *mechctx = NULL; + + LASSERT(cli_gctx); + LASSERT(svc_gctx && svc_gctx->gsc_mechctx); + + cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA; + cli_gctx->gc_win = GSS_SEQ_WIN; + + /* The problem is the reverse ctx might get lost in some recovery + * situations, and the same svc_ctx will be used to re-create it. + * if there's callback be sentout before that, new reverse ctx start + * with sequence 0 will lead to future callback rpc be treated as + * replay. + * + * each reverse root ctx will record its latest sequence number on its + * buddy svcctx before be destroyed, so here we continue use it. + */ + atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq); + + if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) { + CERROR("failed to dup svc handle\n"); + goto err_out; + } + + if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &mechctx) != + GSS_S_COMPLETE) { + CERROR("failed to copy mech context\n"); + goto err_svc_handle; + } + + if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) { + CERROR("failed to dup reverse handle\n"); + goto err_ctx; + } + + cli_gctx->gc_mechctx = mechctx; + gss_cli_ctx_uptodate(cli_gctx); + + return 0; + +err_ctx: + lgss_delete_sec_context(&mechctx); +err_svc_handle: + rawobj_free(&cli_gctx->gc_svc_handle); +err_out: + return -ENOMEM; +} + +static void gss_init_at_reply_offset(void) +{ + __u32 buflens[3]; + int clearsize; + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = lustre_msg_early_size(); + buflens[2] = gss_cli_payload(NULL, buflens[1], 0); + gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens); + + buflens[0] = lustre_msg_early_size(); + clearsize = lustre_msg_size_v2(1, buflens); + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(NULL, clearsize, 0); + buflens[2] = gss_cli_payload(NULL, clearsize, 1); + gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens); +} + +static int __init sptlrpc_gss_init(void) +{ + int rc; + + rc = gss_init_lproc(); + if (rc) + return rc; + + rc = gss_init_cli_upcall(); + if (rc) + goto out_lproc; + + rc = gss_init_svc_upcall(); + if (rc) + goto out_cli_upcall; + + rc = init_null_module(); + if (rc) + goto out_svc_upcall; + + rc = init_kerberos_module(); + if (rc) + goto out_null; + + rc = init_sk_module(); + if (rc) + goto out_kerberos; + + /* register policy after all other stuff be initialized, because it + * might be in used immediately after the registration. 
*/ + + rc = gss_init_keyring(); + if (rc) + goto out_sk; + + rc = gss_init_pipefs(); + if (rc) + goto out_keyring; + + gss_init_at_reply_offset(); + + return 0; + +out_keyring: + gss_exit_keyring(); +out_sk: + cleanup_sk_module(); +out_kerberos: + cleanup_kerberos_module(); +out_null: + cleanup_null_module(); +out_svc_upcall: + gss_exit_svc_upcall(); +out_cli_upcall: + gss_exit_cli_upcall(); +out_lproc: + gss_exit_lproc(); + return rc; +} + +static void __exit sptlrpc_gss_exit(void) +{ + gss_exit_keyring(); + gss_exit_pipefs(); + cleanup_kerberos_module(); + gss_exit_svc_upcall(); + gss_exit_cli_upcall(); + gss_exit_lproc(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre GSS security policy"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(sptlrpc_gss_init); +module_exit(sptlrpc_gss_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c new file mode 100644 index 0000000000000..46d92bf4ed2d0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c @@ -0,0 +1,1997 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/import.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +struct ptlrpc_connect_async_args { + __u64 pcaa_peer_committed; + int pcaa_initial_connect; +}; + +/** + * Updates import \a imp current state to provided \a state value + * Helper function. + */ +static void import_set_state_nolock(struct obd_import *imp, + enum lustre_imp_state state) +{ + switch (state) { + case LUSTRE_IMP_CLOSED: + case LUSTRE_IMP_NEW: + case LUSTRE_IMP_DISCON: + case LUSTRE_IMP_CONNECTING: + break; + case LUSTRE_IMP_REPLAY_WAIT: + imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS; + break; + default: + imp->imp_replay_state = LUSTRE_IMP_REPLAY; + break; + } + + /* A CLOSED import should remain so. 
*/ + if (imp->imp_state == LUSTRE_IMP_CLOSED) + return; + + if (imp->imp_state != LUSTRE_IMP_NEW) { + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", + imp, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state), + ptlrpc_import_state_name(state)); + } + + imp->imp_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = + ktime_get_real_seconds(); + imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % + IMP_STATE_HIST_LEN; +} + +static void import_set_state(struct obd_import *imp, + enum lustre_imp_state new_state) +{ + spin_lock(&imp->imp_lock); + import_set_state_nolock(imp, new_state); + spin_unlock(&imp->imp_lock); +} + +void ptlrpc_import_enter_resend(struct obd_import *imp) +{ + import_set_state(imp, LUSTRE_IMP_RECOVER); +} +EXPORT_SYMBOL(ptlrpc_import_enter_resend); + + +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void * data, int rc); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); + +/* Only this function is allowed to change the import state when it is + * CLOSED. I would rather refcount the import and free it after + * disconnection like we do with exports. To do that, the client_obd + * will need to save the peer info somewhere other than in the import, + * though. */ +int ptlrpc_init_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + + imp->imp_generation++; + imp->imp_state = LUSTRE_IMP_NEW; + + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_init_import); + +#define UUID_STR "_UUID" +void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) +{ + *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) + ? uuid : uuid + strlen(prefix); + + *uuid_len = strlen(*uuid_start); + + if (*uuid_len < strlen(UUID_STR)) + return; + + if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), + UUID_STR, strlen(UUID_STR))) + *uuid_len -= strlen(UUID_STR); +} + +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_import_nolock(struct obd_import *imp) +{ + ENTRY; + + assert_spin_locked(&imp->imp_lock); + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + + ptlrpc_abort_inflight(imp); + + EXIT; +} + +/** + * Returns true if import was FULL, false if import was already not + * connected. + * @imp - import to be disconnected + * @conn_cnt - connection count (epoch) of the request that timed out + * and caused the disconnection. In some cases, multiple + * inflight requests can fail to a single target (e.g. OST + * bulk requests) and if one has already caused a reconnection + * (increasing the import->conn_cnt) the older failure should + * not also cause a reconnection. If zero it forces a reconnect. 
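The conn_cnt "epoch" rule described here can be summarized in a short sketch (illustrative only; hypothetical names): only the first failure of the current connection generation, or an explicit epoch of zero, triggers a disconnect.

#include <stdbool.h>

/* Illustrative sketch of the epoch test that keeps several failed RPCs
 * from the same connection generation from each forcing a reconnect. */
struct ex_import {
	unsigned int conn_cnt;   /* bumped on every connect attempt */
	bool         full;       /* import currently connected */
};

static bool ex_should_disconnect(const struct ex_import *imp,
				 unsigned int failed_conn_cnt)
{
	if (!imp->full)
		return false;    /* someone already handled it */
	/* 0 forces a disconnect; otherwise only the current epoch counts. */
	return failed_conn_cnt == 0 || failed_conn_cnt == imp->conn_cnt;
}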
+ * @invalid - set import invalid flag + */ +int ptlrpc_set_import_discon(struct obd_import *imp, + __u32 conn_cnt, bool invalid) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_FULL && + (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { + char *target_start; + int target_len; + bool inact = false; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + if (imp->imp_replayable) { + LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " + "lost; in progress operations using this " + "service will wait for recovery to complete\n", + imp->imp_obd->obd_name, target_len, target_start, + obd_import_nid2str(imp)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", + imp->imp_obd->obd_name, target_len, target_start, + obd_import_nid2str(imp)); + if (invalid) { + CDEBUG(D_HA, "import %s@%s for %s not " + "replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } + } + spin_unlock(&imp->imp_lock); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + rc = 1; + } else { + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", + imp->imp_client->cli_name, imp, + (imp->imp_state == LUSTRE_IMP_FULL && + imp->imp_conn_cnt > conn_cnt) ? + "reconnected" : "not connected", imp->imp_conn_cnt, + conn_cnt, ptlrpc_import_state_name(imp->imp_state)); + } + + return rc; +} + +/* + * This acts as a barrier; all existing requests are rejected, and + * no new requests will be accepted until the import is valid again. + */ +void ptlrpc_deactivate_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + ptlrpc_deactivate_import_nolock(imp); + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); +} +EXPORT_SYMBOL(ptlrpc_deactivate_import); + +static time64_t ptlrpc_inflight_deadline(struct ptlrpc_request *req, + time64_t now) +{ + time64_t dl; + + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static time64_t ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time64_t now = ktime_get_real_seconds(); + struct list_head *tmp, *n; + struct ptlrpc_request *req; + time64_t timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + } + spin_unlock(&imp->imp_lock); + return timeout; +} + +/** + * This function will invalidate the import, if necessary, then block + * for all the RPC completions, and finally notify the obd to + * invalidate its state (ie cancel locks, clear pending requests, + * etc). 
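The invalidation order described here (stop admitting new RPCs, drain what is in flight, only then tear down dependent state) is captured by the following illustrative sketch; the names are hypothetical and the waitqueue is abstracted into a callback.

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative sketch of the invalidate ordering, hypothetical names. */
struct ex_inval_import {
	atomic_int inflight;    /* RPCs still using import state */
	bool       invalid;     /* set first: no new RPCs accepted */
};

static void ex_invalidate(struct ex_inval_import *imp,
			  void (*wait_a_bit)(void),
			  void (*teardown)(struct ex_inval_import *))
{
	imp->invalid = true;                     /* 1. reject new work */
	while (atomic_load(&imp->inflight) > 0)  /* 2. drain in-flight RPCs */
		wait_a_bit();                    /*    e.g. sleep on a waitq */
	teardown(imp);                           /* 3. cancel locks etc. */
}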
+ */ +void ptlrpc_invalidate_import(struct obd_import *imp) +{ + struct list_head *tmp, *n; + struct ptlrpc_request *req; + struct l_wait_info lwi; + time64_t timeout; + int rc; + + atomic_inc(&imp->imp_inval_count); + + if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) + ptlrpc_deactivate_import(imp); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CONNECT_RACE)) { + OBD_RACE(OBD_FAIL_PTLRPC_CONNECT_RACE); + msleep(10 * MSEC_PER_SEC); + } + CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); + LASSERT(imp->imp_invalid); + + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some rdma transfer is not in progress right now. */ + do { + long timeout_jiffies; + + /* Calculate max timeout for waiting on rpcs to error + * out. Use obd_timeout if calculated value is smaller + * than it. + */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += div_u64(timeout, 3); + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } + + CDEBUG(D_RPCTRACE, "Sleeping %llds for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. + */ + timeout_jiffies = max_t(long, cfs_time_seconds(timeout), 1); + lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, + (timeout > 1) ? cfs_time_seconds(1) : + cfs_time_seconds(1) / 2, + NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), + &lwi); + if (rc) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); + + CERROR("%s: rc = %d waiting for callback (%d != 0)\n", + cli_tgt, rc, atomic_read(&imp->imp_inflight)); + + spin_lock(&imp->imp_lock); + if (atomic_read(&imp->imp_inflight) == 0) { + int count = atomic_read(&imp->imp_unregistering); + + /* We know that "unregistering" rpcs only can + * survive in sending or delaying lists (they + * maybe waiting for long reply unlink in + * sluggish nets). Let's check this. If there + * is no inflight and unregistering != 0, this + * is bug. */ + LASSERTF(count == 0, "Some RPCs are still " + "unregistering: %d\n", count); + + /* Let's save one loop as soon as inflight have + * dropped to zero. No new inflights possible at + * this point. */ + rc = 0; + } else { + list_for_each_safe(tmp, n, + &imp->imp_sending_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on sending list"); + } + list_for_each_safe(tmp, n, + &imp->imp_delayed_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on delayed list"); + } + + CERROR("%s: Unregistering RPCs found (%d). " + "Network is sluggish? Waiting them " + "to error out.\n", cli_tgt, + atomic_read(&imp->imp_unregistering)); + } + spin_unlock(&imp->imp_lock); + } + } while (rc != 0); + + /* + * Let's additionally check that no new rpcs added to import in + * "invalidate" state. 
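The timeout arithmetic used by the drain loop above can be condensed into a small sketch (illustrative, hypothetical names): take the furthest remaining request deadline, add one third of slack for slow reply unlink, and fall back to obd_timeout when nothing reports a deadline.

/* Illustrative sketch of the drain-timeout calculation, hypothetical names. */
static long ex_drain_timeout(long max_deadline_left, long obd_timeout)
{
	long timeout = max_deadline_left;    /* seconds until the slowest
					      * in-flight RPC times out */

	timeout += timeout / 3;              /* slack for slow reply unlink */
	if (timeout == 0)
		timeout = obd_timeout;       /* nothing pending: default */
	return timeout;
}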
+ */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + sptlrpc_import_flush_all_ctx(imp); + + atomic_dec(&imp->imp_inval_count); + wake_up_all(&imp->imp_recovery_waitq); +} +EXPORT_SYMBOL(ptlrpc_invalidate_import); + +/* unset imp_invalid */ +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full) +{ + struct obd_device *obd = imp->imp_obd; + + spin_lock(&imp->imp_lock); + if (imp->imp_deactive != 0) { + LASSERT(imp->imp_state != LUSTRE_IMP_FULL); + if (imp->imp_state != LUSTRE_IMP_DISCON) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + spin_unlock(&imp->imp_lock); + return; + } + if (set_state_full) + import_set_state_nolock(imp, LUSTRE_IMP_FULL); + + imp->imp_invalid = 0; + + spin_unlock(&imp->imp_lock); + obd_import_event(obd, imp, IMP_EVENT_ACTIVE); +} +EXPORT_SYMBOL(ptlrpc_activate_import); + +void ptlrpc_pinger_force(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + + if (imp->imp_state != LUSTRE_IMP_CONNECTING) + ptlrpc_pinger_wake_up(); +} +EXPORT_SYMBOL(ptlrpc_pinger_force); + +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) +{ + ENTRY; + + LASSERT(!imp->imp_dlm_fake); + + if (ptlrpc_set_import_discon(imp, conn_cnt, true)) + ptlrpc_pinger_force(imp); + + EXIT; +} + +int ptlrpc_reconnect_import(struct obd_import *imp) +{ +#ifdef ENABLE_PINGER + long timeout_jiffies = cfs_time_seconds(obd_timeout); + struct l_wait_info lwi; + int rc; + + ptlrpc_pinger_force(imp); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), obd_timeout); + + lwi = LWI_TIMEOUT(timeout_jiffies, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + return rc; +#else + ptlrpc_set_import_discon(imp, 0, false); + /* Force a new connect attempt */ + ptlrpc_invalidate_import(imp); + /* Do a fresh connect next time by zeroing the handle */ + ptlrpc_disconnect_import(imp, 1); + /* Wait for all invalidate calls to finish */ + if (atomic_read(&imp->imp_inval_count) > 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc; + + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inval_count) == 0), + &lwi); + if (rc) + CERROR("Interrupted, inval=%d\n", + atomic_read(&imp->imp_inval_count)); + } + + /* Allow reconnect attempts */ + imp->imp_obd->obd_no_recov = 0; + /* Remove 'invalid' flag */ + ptlrpc_activate_import(imp, false); + /* Attempt a new connect */ + ptlrpc_recover_import(imp, NULL, 0); + return 0; +#endif +} +EXPORT_SYMBOL(ptlrpc_reconnect_import); + +/** + * Connection on import \a imp is changed to another one (if more than one is + * present). 
We typically chose connection that we have not tried to connect to + * the longest + */ +static int import_select_connection(struct obd_import *imp) +{ + struct obd_import_conn *imp_conn = NULL, *conn; + struct obd_export *dlmexp; + char *target_start; + int target_len, tried_all = 1; + ENTRY; + + spin_lock(&imp->imp_lock); + + if (list_empty(&imp->imp_conn_list)) { + CERROR("%s: no connections available\n", + imp->imp_obd->obd_name); + spin_unlock(&imp->imp_lock); + RETURN(-EINVAL); + } + + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n", + imp->imp_obd->obd_name, + libcfs_nid2str(conn->oic_conn->c_peer.nid), + conn->oic_last_attempt); + + /* If we have not tried this connection since + the last successful attempt, go with this one */ + if ((conn->oic_last_attempt == 0) || + conn->oic_last_attempt <= imp->imp_last_success_conn) { + imp_conn = conn; + tried_all = 0; + break; + } + + /* If all of the connections have already been tried + since the last successful connection; just choose the + least recently used */ + if (!imp_conn) + imp_conn = conn; + else if (imp_conn->oic_last_attempt > conn->oic_last_attempt) + imp_conn = conn; + } + + /* if not found, simply choose the current one */ + if (!imp_conn || imp->imp_force_reconnect) { + LASSERT(imp->imp_conn_current); + imp_conn = imp->imp_conn_current; + tried_all = 0; + } + LASSERT(imp_conn->oic_conn); + + /* If we've tried everything, and we're back to the beginning of the + list, increase our timeout and try again. It will be reset when + we do finally connect. (FIXME: really we should wait for all network + state associated with the last connection attempt to drain before + trying to reconnect on it.) */ + if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { + struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; + if (at_get(at) < CONNECTION_SWITCH_MAX) { + at_measured(at, at_get(at) + CONNECTION_SWITCH_INC); + if (at_get(at) > CONNECTION_SWITCH_MAX) + at_reset(at, CONNECTION_SWITCH_MAX); + } + LASSERT(imp_conn->oic_last_attempt); + CDEBUG(D_HA, "%s: tried all connections, increasing latency " + "to %ds\n", imp->imp_obd->obd_name, at_get(at)); + } + + imp_conn->oic_last_attempt = ktime_get_seconds(); + + /* switch connection, don't mind if it's same as the current one */ + if (imp->imp_connection) + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + LASSERT(dlmexp != NULL); + if (dlmexp->exp_connection) + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + class_export_put(dlmexp); + + if (imp->imp_conn_current != imp_conn) { + if (imp->imp_conn_current) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + CDEBUG(D_HA, "%s: Connection changing to" + " %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + } + + imp->imp_conn_current = imp_conn; + } + + CDEBUG(D_HA, "%s: import %p using connection %s/%s\n", + imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + + spin_unlock(&imp->imp_lock); + + RETURN(0); +} + +/* + * must be called under imp_lock + */ +static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) +{ + struct ptlrpc_request *req; + struct list_head *tmp; + + /* The requests in 
committed_list always have smaller transnos than + * the requests in replay_list */ + if (!list_empty(&imp->imp_committed_list)) { + tmp = imp->imp_committed_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, "zero transno in committed_list"); + LBUG(); + } + return 1; + } + if (!list_empty(&imp->imp_replay_list)) { + tmp = imp->imp_replay_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, "zero transno in replay_list"); + LBUG(); + } + return 1; + } + return 0; +} + +int ptlrpc_connect_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + return ptlrpc_connect_import_locked(imp); +} + +/** + * Attempt to (re)connect import \a imp. This includes all preparations, + * initializing CONNECT RPC request and passing it to ptlrpcd for + * actual sending. + * + * Assumes imp->imp_lock is held, and releases it. + * + * Returns 0 on success or error code. + */ +int ptlrpc_connect_import_locked(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + int initial_connect = 0; + int set_transno = 0; + __u64 committed_before_reconnect = 0; + struct ptlrpc_request *request; + struct obd_connect_data ocd; + char *bufs[] = { NULL, + obd2cli_tgt(imp->imp_obd), + obd->obd_uuid.uuid, + (char *)&imp->imp_dlm_handle, + (char *)&ocd, + NULL }; + struct ptlrpc_connect_async_args *aa; + int rc; + ENTRY; + + assert_spin_locked(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + spin_unlock(&imp->imp_lock); + CERROR("can't connect to a closed import\n"); + RETURN(-EINVAL); + } else if (imp->imp_state == LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + CERROR("already connected\n"); + RETURN(0); + } else if (imp->imp_state == LUSTRE_IMP_CONNECTING || + imp->imp_state == LUSTRE_IMP_EVICTED || + imp->imp_connected) { + spin_unlock(&imp->imp_lock); + CERROR("already connecting\n"); + RETURN(-EALREADY); + } + + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + + imp->imp_conn_cnt++; + imp->imp_resend_replay = 0; + + if (!lustre_handle_is_used(&imp->imp_remote_handle)) + initial_connect = 1; + else + committed_before_reconnect = imp->imp_peer_committed_transno; + + set_transno = ptlrpc_first_transno(imp, + &imp->imp_connect_data.ocd_transno); + spin_unlock(&imp->imp_lock); + + rc = import_select_connection(imp); + if (rc) + GOTO(out, rc); + + rc = sptlrpc_import_sec_adapt(imp, NULL, NULL); + if (rc) + GOTO(out, rc); + + /* Reset connect flags to the originally requested flags, in case + * the server is updated on-the-fly we will get the new features. 
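The negotiation this comment describes can be sketched in a few lines (illustrative only; hypothetical names): the client always re-proposes its original feature mask, and the server may only narrow it, so an upgraded server can grow the granted set on a later reconnect.

#include <stdint.h>

/* Illustrative sketch of connect-flag negotiation, hypothetical names. */
struct ex_nego {
	uint64_t orig_flags;     /* everything the client can do */
	uint64_t granted_flags;  /* what the current server agreed to */
};

static uint64_t ex_propose(const struct ex_nego *n)
{
	/* Ignore last round's result; start from the full client set. */
	return n->orig_flags;
}

static void ex_accept_reply(struct ex_nego *n, uint64_t server_mask)
{
	/* The server may only narrow the proposal, never extend it. */
	n->granted_flags = n->orig_flags & server_mask;
}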
*/ + ocd = imp->imp_connect_data; + ocd.ocd_connect_flags = imp->imp_connect_flags_orig; + ocd.ocd_connect_flags2 = imp->imp_connect_flags2_orig; + /* Reset ocd_version each time so the server knows the exact versions */ + ocd.ocd_version = LUSTRE_VERSION_CODE; + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, + &obd->obd_uuid, &ocd, NULL); + if (rc) + GOTO(out, rc); + + request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT); + if (request == NULL) + GOTO(out, rc = -ENOMEM); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(request); + if (rc < 0) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + bufs[5] = request->rq_sepol; + + req_capsule_set_size(&request->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(request->rq_sepol) ? + strlen(request->rq_sepol) + 1 : 0); + + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, + imp->imp_connect_op, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_timeout(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. + * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + + request->rq_no_resend = request->rq_no_delay = 1; + request->rq_send_state = LUSTRE_IMP_CONNECTING; + /* Allow a slightly larger reply for future growth compatibility */ + req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, + sizeof(struct obd_connect_data)+16*sizeof(__u64)); + ptlrpc_request_set_replen(request); + request->rq_interpret_reply = ptlrpc_connect_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args)); + aa = ptlrpc_req_async_args(request); + memset(aa, 0, sizeof *aa); + + aa->pcaa_peer_committed = committed_before_reconnect; + aa->pcaa_initial_connect = initial_connect; + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_INITIAL); + } + + if (set_transno) + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_TRANSNO); + + DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)", + request->rq_timeout); + ptlrpcd_add_req(request); + rc = 0; +out: + if (rc != 0) + import_set_state(imp, LUSTRE_IMP_DISCON); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_connect_import); + +static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) +{ + int force_verify; + + spin_lock(&imp->imp_lock); + force_verify = imp->imp_force_verify != 0; + spin_unlock(&imp->imp_lock); + + if (force_verify) + ptlrpc_pinger_wake_up(); +} + +static int ptlrpc_busy_reconnect(int rc) +{ + return (rc == -EBUSY) || (rc == -EAGAIN); +} + +static int ptlrpc_connect_set_flags(struct obd_import *imp, + struct obd_connect_data *ocd, + __u64 old_connect_flags, + struct obd_export *exp, int init_connect) +{ + static bool warned; + struct client_obd *cli = &imp->imp_obd->u.cli; + + spin_lock(&imp->imp_lock); + list_del(&imp->imp_conn_current->oic_item); + 
list_add(&imp->imp_conn_current->oic_item, + &imp->imp_conn_list); + imp->imp_last_success_conn = + imp->imp_conn_current->oic_last_attempt; + + spin_unlock(&imp->imp_lock); + + if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_OFFSET_WARN || + ocd->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_OFFSET_WARN)) { + /* Sigh, some compilers do not like #ifdef in the middle + of macro arguments */ + const char *older = "older than client. " + "Consider upgrading server"; + const char *newer = "newer than client. " + "Consider upgrading client"; + + LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) " + "is much %s (%s)\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + ocd->ocd_version > LUSTRE_VERSION_CODE ? + newer : older, LUSTRE_VERSION_STRING); + warned = true; + } + + if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { + /* We sent to the server ocd_cksum_types with bits set + * for algorithms we understand. The server masked off + * the checksum types it doesn't support */ + if ((ocd->ocd_cksum_types & + obd_cksum_types_supported_client()) == 0) { + LCONSOLE_ERROR("The negotiation of the checksum " + "alogrithm to use with server %s " + "failed (%x/%x)\n", + obd2cli_tgt(imp->imp_obd), + ocd->ocd_cksum_types, + obd_cksum_types_supported_client()); + return -EPROTO; + } else { + cli->cl_supp_cksum_types = ocd->ocd_cksum_types; + } + } else { + /* The server does not support OBD_CONNECT_CKSUM. + * Enforce ADLER for backward compatibility*/ + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } + cli->cl_cksum_type = obd_cksum_type_select(imp->imp_obd->obd_name, + cli->cl_supp_cksum_types); + + if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + cli->cl_max_pages_per_rpc = + min(ocd->ocd_brw_size >> PAGE_SHIFT, + cli->cl_max_pages_per_rpc); + else if (imp->imp_connect_op == MDS_CONNECT || + imp->imp_connect_op == MGS_CONNECT) + cli->cl_max_pages_per_rpc = 1; + + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && + (cli->cl_max_pages_per_rpc > 0)); + + client_adjust_max_dirty(cli); + + /* Update client max modify RPCs in flight with value returned + * by the server */ + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + cli->cl_max_mod_rpcs_in_flight = min( + cli->cl_max_mod_rpcs_in_flight, + ocd->ocd_maxmodrpcs); + else + cli->cl_max_mod_rpcs_in_flight = 1; + + /* Reset ns_connect_flags only for initial connect. It might be + * changed in while using FS and if we reset it in reconnect + * this leads to losing user settings done before such as + * disable lru_resize, etc. */ + if (old_connect_flags != exp_connect_flags(exp) || init_connect) { + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + __u64 changed_flags; + + changed_flags = + ns->ns_connect_flags ^ ns->ns_orig_connect_flags; + CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server " + "flags: %#llx\n", imp->imp_obd->obd_name, + ocd->ocd_connect_flags); + ns->ns_connect_flags = (ns->ns_connect_flags & changed_flags) | + (ocd->ocd_connect_flags & ~changed_flags); + ns->ns_orig_connect_flags = ocd->ocd_connect_flags; + } + + if (ocd->ocd_connect_flags & OBD_CONNECT_AT) + /* We need a per-message support flag, because + * a. we don't know if the incoming connect reply + * supports AT or not (in reply_in_callback) + * until we unpack it. + * b. 
failovered server means export and flags are gone + * (in ptlrpc_send_reply). + * Can only be set when we know AT is supported at + * both ends */ + imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + + return 0; +} + +/** + * Add all replay requests back to unreplied list before start replay, + * so that we can make sure the known replied XID is always increased + * only even if when replaying requests. + */ +static void ptlrpc_prepare_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + if (imp->imp_state != LUSTRE_IMP_REPLAY || + imp->imp_resend_replay) + return; + + /* If the server was restart during repaly, the requests may + * have been added to the unreplied list in former replay. */ + spin_lock(&imp->imp_lock); + + list_for_each_entry(req, &imp->imp_committed_list, rq_replay_list) { + if (list_empty(&req->rq_unreplied_list)) + ptlrpc_add_unreplied(req); + } + + list_for_each_entry(req, &imp->imp_replay_list, rq_replay_list) { + if (list_empty(&req->rq_unreplied_list)) + ptlrpc_add_unreplied(req); + } + + imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); +} + +/** + * interpret_reply callback for connect RPCs. + * Looks into returned status of connect operation and decides + * what to do with the import - i.e enter recovery, promote it to + * full state for normal operations of disconnect it due to an error. + */ +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *data, int rc) +{ + struct ptlrpc_connect_async_args *aa = data; + struct obd_import *imp = request->rq_import; + struct lustre_handle old_hdl; + __u64 old_connect_flags; + timeout_t service_timeout; + int msg_flags; + struct obd_connect_data *ocd; + struct obd_export *exp = NULL; + int ret; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + if (rc) { + struct ptlrpc_request *free_req; + struct ptlrpc_request *tmp; + + /* abort all delayed requests initiated connection */ + list_for_each_entry_safe(free_req, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&free_req->rq_lock); + if (free_req->rq_no_resend) { + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + } + spin_unlock(&free_req->rq_lock); + } + + /* if this reconnect to busy export - not need select new target + * for connecting*/ + imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); + spin_unlock(&imp->imp_lock); + GOTO(out, rc); + } + + /* LU-7558: indicate that we are interpretting connect reply, + * pltrpc_connect_import() will not try to reconnect until + * interpret will finish. 
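The LU-7558 guard mentioned here is essentially a "reply is being interpreted" flag that makes a concurrent connect attempt back off; a minimal illustrative sketch (hypothetical names) follows.

#include <errno.h>
#include <stdbool.h>

/* Illustrative sketch of the connect/interpret race guard. */
struct ex_conn_guard {
	bool connecting;     /* connect RPC in flight */
	bool interpreting;   /* reply currently being processed */
};

static int ex_try_connect(struct ex_conn_guard *g)
{
	if (g->connecting || g->interpreting)
		return -EALREADY;   /* let the current attempt finish */
	g->connecting = true;
	return 0;
}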
*/ + imp->imp_connected = 1; + spin_unlock(&imp->imp_lock); + + LASSERT(imp->imp_conn_current); + + msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + + ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + /* server replied obd_connect_data is always bigger */ + ocd = req_capsule_server_sized_get(&request->rq_pill, + &RMF_CONNECT_DATA, ret); + + if (ocd == NULL) { + CERROR("%s: no connect data from server\n", + imp->imp_obd->obd_name); + rc = -EPROTO; + GOTO(out, rc); + } + + spin_lock(&imp->imp_lock); + + /* All imports are pingable */ + imp->imp_pingable = 1; + imp->imp_force_reconnect = 0; + imp->imp_force_verify = 0; + + imp->imp_connect_data = *ocd; + + CDEBUG(D_HA, "%s: connect to target with instance %u\n", + imp->imp_obd->obd_name, ocd->ocd_instance); + exp = class_conn2export(&imp->imp_dlm_handle); + + spin_unlock(&imp->imp_lock); + + if (!exp) { + /* This could happen if export is cleaned during the + connect attempt */ + CERROR("%s: missing export after connect\n", + imp->imp_obd->obd_name); + GOTO(out, rc = -ENODEV); + } + + /* check that server granted subset of flags we asked for. */ + if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) != + ocd->ocd_connect_flags) { + CERROR("%s: Server didn't grant requested subset of flags: " + "asked=%#llx granted=%#llx\n", + imp->imp_obd->obd_name, imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + GOTO(out, rc = -EPROTO); + } + + if ((ocd->ocd_connect_flags2 & imp->imp_connect_flags2_orig) != + ocd->ocd_connect_flags2) { + CERROR("%s: Server didn't grant requested subset of flags2: " + "asked=%#llx granted=%#llx\n", + imp->imp_obd->obd_name, imp->imp_connect_flags2_orig, + ocd->ocd_connect_flags2); + GOTO(out, rc = -EPROTO); + } + + if (!(imp->imp_connect_flags_orig & OBD_CONNECT_LIGHTWEIGHT) && + (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) && + (imp->imp_connect_flags_orig & OBD_CONNECT_FID) && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION)) { + __u32 major = OBD_OCD_VERSION_MAJOR(ocd->ocd_version); + __u32 minor = OBD_OCD_VERSION_MINOR(ocd->ocd_version); + __u32 patch = OBD_OCD_VERSION_PATCH(ocd->ocd_version); + + /* We do not support the MDT-MDT interoperations with + * different version MDT because of protocol changes. */ + if (unlikely(major != LUSTRE_MAJOR || + minor != LUSTRE_MINOR || + abs(patch - LUSTRE_PATCH) > 3)) { + LCONSOLE_WARN("%s: import %p (%u.%u.%u.%u) tried the " + "connection to different version MDT " + "(%d.%d.%d.%d) %s\n", + imp->imp_obd->obd_name, imp, LUSTRE_MAJOR, + LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX, + major, minor, patch, + OBD_OCD_VERSION_FIX(ocd->ocd_version), + imp->imp_connection->c_remote_uuid.uuid); + + GOTO(out, rc = -EPROTO); + } + } + + old_connect_flags = exp_connect_flags(exp); + exp->exp_connect_data = *ocd; + imp->imp_obd->obd_self_export->exp_connect_data = *ocd; + + /* The net statistics after (re-)connect is not valid anymore, + * because may reflect other routing, etc. 
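As a rough illustration of what the reset below accomplishes (hypothetical names; the exact arithmetic of the real ptlrpc_at_adj_net_latency() may differ): the stale latency history is dropped and the estimate is re-seeded from the connect RPC itself, approximately as the round trip minus the service time the server reported.

#include <time.h>

/* Illustrative sketch: re-seeding the network latency estimate. */
struct ex_net_latency {
	unsigned int estimate_sec;
	unsigned int nr_samples;
};

static void ex_reseed_latency(struct ex_net_latency *lat,
			      time_t sent, time_t replied,
			      unsigned int service_time_sec)
{
	time_t rtt = replied - sent;
	unsigned int net = rtt > (time_t)service_time_sec ?
			   (unsigned int)(rtt - service_time_sec) : 0;

	lat->nr_samples = 0;          /* old route: forget the history */
	lat->estimate_sec = net;      /* seed from the reply just seen */
	lat->nr_samples = 1;
}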
+ */ + service_timeout = lustre_msg_get_service_timeout(request->rq_repmsg); + at_reinit(&imp->imp_at.iat_net_latency, 0, 0); + ptlrpc_at_adj_net_latency(request, service_timeout); + + /* Import flags should be updated before waking import at FULL state */ + rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp, + aa->pcaa_initial_connect); + class_export_put(exp); + exp = NULL; + + if (rc != 0) + GOTO(out, rc); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + if (msg_flags & MSG_CONNECT_REPLAYABLE) { + imp->imp_replayable = 1; + CDEBUG(D_HA, "connected to replayable target: %s\n", + obd2cli_tgt(imp->imp_obd)); + } else { + imp->imp_replayable = 0; + } + + /* if applies, adjust the imp->imp_msg_magic here + * according to reply flags */ + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + /* Initial connects are allowed for clients with non-random + * uuids when servers are in recovery. Simply signal the + * servers replay is complete and wait in REPLAY_WAIT. */ + if (msg_flags & MSG_CONNECT_RECOVERING) { + CDEBUG(D_HA, "connect to %s during recovery\n", + obd2cli_tgt(imp->imp_obd)); + import_set_state_nolock(imp, LUSTRE_IMP_REPLAY_LOCKS); + spin_unlock(&imp->imp_lock); + } else { + spin_unlock(&imp->imp_lock); + ptlrpc_activate_import(imp, true); + } + + GOTO(finish, rc = 0); + } + + /* Determine what recovery state to move the import to. */ + if (MSG_CONNECT_RECONNECT & msg_flags) { + memset(&old_hdl, 0, sizeof(old_hdl)); + if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), + sizeof (old_hdl))) { + LCONSOLE_WARN("Reconnect to %s (at @%s) failed due " + "bad handle %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_dlm_handle.cookie); + GOTO(out, rc = -ENOTCONN); + } + + if (memcmp(&imp->imp_remote_handle, + lustre_msg_get_handle(request->rq_repmsg), + sizeof(imp->imp_remote_handle))) { + int level = msg_flags & MSG_CONNECT_RECOVERING ? + D_HA : D_WARNING; + + /* Bug 16611/14775: if server handle have changed, + * that means some sort of disconnection happened. + * If the server is not in recovery, that also means it + * already erased all of our state because of previous + * eviction. 
If it is in recovery - we are safe to + * participate since we can reestablish all of our state + * with server again */ + if ((MSG_CONNECT_RECOVERING & msg_flags)) { + CDEBUG(level,"%s@%s changed server handle from " + "%#llx to %#llx" + " but is still in recovery\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } else { + LCONSOLE_WARN("Evicted from %s (at %s) " + "after server handle changed from " + "%#llx to %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection-> \ + c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } + + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + if (!(MSG_CONNECT_RECOVERING & msg_flags)) { + import_set_state(imp, LUSTRE_IMP_EVICTED); + GOTO(finish, rc = 0); + } + + } else { + CDEBUG(D_HA, "reconnected to %s@%s after partition\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + } + + if (imp->imp_invalid) { + CDEBUG(D_HA, "%s: reconnected but import is invalid; " + "marking evicted\n", imp->imp_obd->obd_name); + import_set_state(imp, LUSTRE_IMP_EVICTED); + } else if (MSG_CONNECT_RECOVERING & msg_flags) { + CDEBUG(D_HA, "%s: reconnected to %s during replay\n", + imp->imp_obd->obd_name, + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 1; + spin_unlock(&imp->imp_lock); + + import_set_state(imp, imp->imp_replay_state); + } else { + import_set_state(imp, LUSTRE_IMP_RECOVER); + } + } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { + LASSERT(imp->imp_replayable); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + imp->imp_last_replay_transno = 0; + imp->imp_replay_cursor = &imp->imp_committed_list; + import_set_state(imp, LUSTRE_IMP_REPLAY); + } else { + DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags" + " not set: %x)", imp->imp_obd->obd_name, msg_flags); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + import_set_state(imp, LUSTRE_IMP_EVICTED); + } + + /* Sanity checks for a reconnected import. */ + if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) { + CERROR("imp_replayable flag does not match server " + "after reconnect. We should LBUG right here.\n"); + } + + if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 && + lustre_msg_get_last_committed(request->rq_repmsg) < + aa->pcaa_peer_committed) { + CERROR("%s went back in time (transno %lld" + " was previously committed, server now claims %lld" + ")! See https://bugzilla.lustre.org/show_bug.cgi?" 
+ "id=9646\n", + obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, + lustre_msg_get_last_committed(request->rq_repmsg)); + } + +finish: + ptlrpc_prepare_replay(imp); + rc = ptlrpc_import_recovery_state_machine(imp); + if (rc == -ENOTCONN) { + CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;" + "invalidating and reconnecting\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_connect_import(imp); + spin_lock(&imp->imp_lock); + imp->imp_connected = 0; + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + RETURN(0); + } + +out: + if (exp != NULL) + class_export_put(exp); + + spin_lock(&imp->imp_lock); + imp->imp_connected = 0; + imp->imp_connect_tried = 1; + + if (rc != 0) { + bool inact = false; + time64_t now = ktime_get_seconds(); + time64_t next_connect; + + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } else if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) { + spin_unlock(&imp->imp_lock); + RETURN(-EPROTO); + } + + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + /* Servers are not supposed to refuse connections from + * clients based on version, only connection feature + * flags. We should never see this from llite, but it + * may be useful for debugging in the future. */ + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + LCONSOLE_ERROR_MSG(0x16a, "Server %s version " + "(%d.%d.%d.%d)" + " refused connection from this client " + "with an incompatible version (%s). " + "Client must be recompiled\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + LUSTRE_VERSION_STRING); + ptlrpc_deactivate_import_nolock(imp); + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + inact = true; + } + } else if (rc == -ENODEV || rc == -ETIMEDOUT) { + /* ENODEV means there is no service, force reconnection + * to a pair if attempt happen ptlrpc_next_reconnect + * before now. ETIMEDOUT could be set during network + * error and do not guarantee request deadline happened. + */ + struct obd_import_conn *conn; + time64_t reconnect_time; + + /* Same as ptlrpc_next_reconnect, but in past */ + reconnect_time = now - INITIAL_CONNECT_TIMEOUT; + list_for_each_entry(conn, &imp->imp_conn_list, + oic_item) { + if (conn->oic_last_attempt <= reconnect_time) { + imp->imp_force_verify = 1; + break; + } + } + } + + next_connect = imp->imp_conn_current->oic_last_attempt + + (request->rq_deadline - request->rq_sent); + spin_unlock(&imp->imp_lock); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + if (rc == -EPROTO) + RETURN(rc); + + /* adjust imp_next_ping to request deadline + 1 and reschedule + * a pinger if import lost processing during CONNECTING or far + * away from request deadline. It could happen when connection + * was initiated outside of pinger, like + * ptlrpc_set_import_discon(). 
+ */ + if (!imp->imp_force_verify && (imp->imp_next_ping <= now || + imp->imp_next_ping > next_connect)) { + imp->imp_next_ping = max(now, next_connect) + 1; + ptlrpc_pinger_wake_up(); + } + + ptlrpc_maybe_ping_import_soon(imp); + + CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", + obd2cli_tgt(imp->imp_obd), + (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } else { + spin_unlock(&imp->imp_lock); + } + + wake_up_all(&imp->imp_recovery_waitq); + RETURN(rc); +} + +/** + * interpret callback for "completed replay" RPCs. + * \see signal_completed_replay + */ +static int completed_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void * data, int rc) +{ + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + if (req->rq_status == 0 && + !req->rq_import->imp_vbr_failed) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + if (req->rq_import->imp_vbr_failed) { + CDEBUG(D_WARNING, + "%s: version recovery fails, reconnecting\n", + req->rq_import->imp_obd->obd_name); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, " + "reconnecting\n", + req->rq_import->imp_obd->obd_name, + req->rq_status); + } + ptlrpc_connect_import(req->rq_import); + } + + RETURN(0); +} + +/** + * Let server know that we have no requests to replay anymore. + * Achieved by just sending a PING request + */ +static int signal_completed_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) + RETURN(0); + + if (!atomic_add_unless(&imp->imp_replay_inflight, 1, 1)) + RETURN(0); + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, + OBD_PING); + if (req == NULL) { + atomic_dec(&imp->imp_replay_inflight); + RETURN(-ENOMEM); + } + + ptlrpc_request_set_replen(req); + req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; + lustre_msg_add_flags(req->rq_reqmsg, + MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE); + if (AT_OFF) + req->rq_timeout *= 3; + req->rq_interpret_reply = completed_replay_interpret; + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * In kernel code all import invalidation happens in its own + * separate thread, so that whatever application happened to encounter + * a problem could still be killed or otherwise continue + */ +static int ptlrpc_invalidate_import_thread(void *data) +{ + struct obd_import *imp = data; + + ENTRY; + + unshare_fs_struct(); + + CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + ptlrpc_invalidate_import(imp); + + if (obd_dump_on_eviction) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + import_set_state(imp, LUSTRE_IMP_RECOVER); + ptlrpc_import_recovery_state_machine(imp); + + class_import_put(imp); + RETURN(0); +} + +/** + * This is the state machine for client-side recovery on import. + * + * Typicaly we have two possibly paths. If we came to server and it is not + * in recovery, we just enter IMP_EVICTED state, invalidate our import + * state and reconnect from scratch. + * If we came to server that is in recovery, we enter IMP_REPLAY import state. + * We go through our list of requests to replay and send them to server one by + * one. + * After sending all request from the list we change import state to + * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server + * and also all the locks we don't yet have and wait for server to grant us. 
+ * After that we send a special "replay completed" request and change import + * state to IMP_REPLAY_WAIT. + * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER + * state and resend all requests from sending list. + * After that we promote import to FULL state and send all delayed requests + * and import is fully operational after that. + * + */ +int ptlrpc_import_recovery_state_machine(struct obd_import *imp) +{ + int rc = 0; + int inflight; + char *target_start; + int target_len; + + ENTRY; + if (imp->imp_state == LUSTRE_IMP_EVICTED) { + struct task_struct *task; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + /* Don't care about MGC eviction */ + if (strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) != 0) { + LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted " + "by %.*s; in progress operations " + "using this service will fail.\n", + imp->imp_obd->obd_name, target_len, + target_start); + LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction"); + } + CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + /* reset vbr_failed flag upon eviction */ + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 0; + spin_unlock(&imp->imp_lock); + + /* bug 17802: XXX client_disconnect_export vs connect request + * race. if client is evicted at this time then we start + * invalidate thread without reference to import and import can + * be freed at same time. */ + class_import_get(imp); + task = kthread_run(ptlrpc_invalidate_import_thread, imp, + "ll_imp_inval"); + if (IS_ERR(task)) { + class_import_put(imp); + rc = PTR_ERR(task); + CERROR("%s: can't start invalidate thread: rc = %d\n", + imp->imp_obd->obd_name, rc); + } else { + rc = 0; + } + RETURN(rc); + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY) { + CDEBUG(D_HA, "replay requested by %s\n", + obd2cli_tgt(imp->imp_obd)); + rc = ptlrpc_replay_next(imp, &inflight); + if (inflight == 0 && + atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_REPLAY_LOCKS); + rc = ldlm_replay_locks(imp); + if (rc) + GOTO(out, rc); + } + rc = 0; + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_REPLAY_WAIT); + rc = signal_completed_replay(imp); + if (rc) + GOTO(out, rc); + } + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + import_set_state(imp, LUSTRE_IMP_RECOVER); + } + } + + if (imp->imp_state == LUSTRE_IMP_RECOVER) { + struct ptlrpc_connection *conn = imp->imp_connection; + + rc = ptlrpc_resend(imp); + if (rc) + GOTO(out, rc); + ptlrpc_activate_import(imp, true); + + CDEBUG_LIMIT(imp->imp_was_idle ? 
+ imp->imp_idle_debug : D_CONSOLE, + "%s: Connection restored to %s (at %s)\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + obd_import_nid2str(imp)); + spin_lock(&imp->imp_lock); + imp->imp_was_idle = 0; + spin_unlock(&imp->imp_lock); + } + + if (imp->imp_state == LUSTRE_IMP_FULL) { + wake_up_all(&imp->imp_recovery_waitq); + ptlrpc_wake_delayed(imp); + } + +out: + RETURN(rc); +} + +static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rq_opc, rc = 0; + ENTRY; + + switch (imp->imp_connect_op) { + case OST_CONNECT: + rq_opc = OST_DISCONNECT; + break; + case MDS_CONNECT: + rq_opc = MDS_DISCONNECT; + break; + case MGS_CONNECT: + rq_opc = MGS_DISCONNECT; + break; + default: + rc = -EINVAL; + CERROR("%s: don't know how to disconnect from %s " + "(connect_op %d): rc = %d\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connect_op, rc); + RETURN(ERR_PTR(rc)); + } + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(timeout_t, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + import_set_state(imp, LUSTRE_IMP_CONNECTING); + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + struct ptlrpc_request *req; + int rc = 0; + ENTRY; + + if (imp->imp_obd->obd_force) + GOTO(set_state, rc); + + /* probably the import has been disconnected already being idle */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) + GOTO(out, rc); + spin_unlock(&imp->imp_lock); + + if (ptlrpc_import_in_recovery(imp)) { + struct l_wait_info lwi; + long timeout_jiffies; + time64_t timeout; + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = obd_timeout >> 1; + else + timeout = obd_timeout; + } else { + u32 req_portal; + int idx; + + req_portal = imp->imp_client->cli_request_portal; + idx = import_at_get_index(imp, req_portal); + timeout = at_get(&imp->imp_at.iat_service_estimate[idx]); + } + + timeout_jiffies = cfs_time_seconds(timeout); + lwi = LWI_TIMEOUT_INTR(max_t(long, timeout_jiffies, 1), + back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) + GOTO(out, rc); + spin_unlock(&imp->imp_lock); + + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + GOTO(set_state, rc = PTR_ERR(req)); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + +set_state: + spin_lock(&imp->imp_lock); +out: + if (noclose) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + else + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + spin_unlock(&imp->imp_lock); + + if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) + rc = 0; + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_disconnect_import); + +static void ptlrpc_reset_reqs_generation(struct obd_import *imp) +{ + struct ptlrpc_request *old, *tmp; + + /* tag all resendable requests generated before disconnection + * 
notice this code is part of disconnect-at-idle path only */ + list_for_each_entry_safe(old, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&old->rq_lock); + if (old->rq_import_generation == imp->imp_generation - 1 && + !old->rq_no_resend) + old->rq_import_generation = imp->imp_generation; + spin_unlock(&old->rq_lock); + } +} + +static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *data, int rc) +{ + struct obd_import *imp = req->rq_import; + int connect = 0; + + DEBUG_REQ(D_HA, req, "inflight=%d, refcount=%d: rc = %d ", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_refcount), rc); + + spin_lock(&imp->imp_lock); + /* DISCONNECT reply can be late and another connection can just + * be initiated. so we have to abort disconnection. */ + if (req->rq_import_generation == imp->imp_generation && + imp->imp_state != LUSTRE_IMP_CLOSED) { + LASSERTF(imp->imp_state == LUSTRE_IMP_CONNECTING, + "%s\n", ptlrpc_import_state_name(imp->imp_state)); + memset(&imp->imp_remote_handle, 0, + sizeof(imp->imp_remote_handle)); + /* take our DISCONNECT into account */ + if (atomic_read(&imp->imp_reqs) > 1) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + import_set_state_nolock(imp, LUSTRE_IMP_NEW); + ptlrpc_reset_reqs_generation(imp); + connect = 1; + } else { + /* do not expose transient IDLE state */ + import_set_state_nolock(imp, LUSTRE_IMP_IDLE); + } + } + + if (connect) { + rc = ptlrpc_connect_import_locked(imp); + if (rc >= 0) + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + + return 0; +} + +int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (imp->imp_obd->obd_force) + RETURN(0); + + if (ptlrpc_import_in_recovery(imp)) + RETURN(0); + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + RETURN(0); + } + spin_unlock(&imp->imp_lock); + + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n", + imp->imp_obd->obd_name, + ktime_get_real_seconds() - imp->imp_last_reply_time); + + /* don't make noise at reconnection */ + spin_lock(&imp->imp_lock); + imp->imp_was_idle = 1; + spin_unlock(&imp->imp_lock); + + req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_disconnect_and_idle_import); + +void ptlrpc_cleanup_imp(struct obd_import *imp) +{ + ENTRY; + + spin_lock(&imp->imp_lock); + + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + imp->imp_generation++; + ptlrpc_abort_inflight(imp); + + spin_unlock(&imp->imp_lock); + + EXIT; +} + +/* Adaptive Timeout utils */ +extern unsigned int at_min, at_max, at_history; + +/* Update at_current with the specified value (bounded by at_min and at_max), + * as well as the AT history "bins". + * - Bin into timeslices using AT_BINS bins. + * - This gives us a max of the last at_history seconds without the storage, + * but still smoothing out a return to normalcy from a slow response. + * - (E.g. remember the maximum latency in each minute of the last 4 minutes.) 
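The binned sliding-window maximum described here can be shown as a self-contained sketch (illustrative only; hypothetical names, a fixed bin count, and a caller-supplied bin width that must be non-zero): slide the buckets as time passes, record the new sample in the newest bucket, and report the maximum over the buckets that are still live.

#include <time.h>

/* Illustrative sketch of binned sliding-window maximum, hypothetical names. */
#define EX_BINS 4

struct ex_at {
	unsigned int hist[EX_BINS];   /* worst value seen in each slice */
	time_t       binstart;        /* start time of hist[0]'s slice */
	unsigned int current;         /* max over all live slices */
};

static void ex_at_add(struct ex_at *at, unsigned int val,
		      time_t now, time_t binwidth)
{
	long shift = (long)((now - at->binstart) / binwidth);
	unsigned int maxv = val;
	int i;

	if (shift > 0) {                              /* slide the window */
		for (i = EX_BINS - 1; i >= 0; i--)
			at->hist[i] = (i >= shift) ?
				      at->hist[i - shift] : 0;
		at->binstart += shift * binwidth;
	}
	if (val > at->hist[0])                        /* record this sample */
		at->hist[0] = val;
	for (i = 0; i < EX_BINS; i++)                 /* recompute the max */
		if (at->hist[i] > maxv)
			maxv = at->hist[i];
	at->current = maxv;
}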
+ */ +int at_measured(struct adaptive_timeout *at, unsigned int val) +{ + unsigned int old = at->at_current; + time64_t now = ktime_get_real_seconds(); + long binlimit = max_t(long, at_history / AT_BINS, 1); + + LASSERT(at); + CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", + val, at, (long)(now - at->at_binstart), at->at_current, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); + + if (val == 0) + /* 0's don't count, because we never want our timeout to + drop to 0, and because 0 could mean an error */ + return 0; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = now; + at->at_hist[0] = val; + at->at_binstart = now; + } else if (now - at->at_binstart < binlimit ) { + /* in bin 0 */ + at->at_hist[0] = max(val, at->at_hist[0]); + at->at_current = max(val, at->at_current); + } else { + int i, shift; + unsigned int maxv = val; + + /* move bins over */ + shift = (u32)(now - at->at_binstart) / binlimit; + LASSERT(shift > 0); + for(i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + at->at_hist[i] = at->at_hist[i - shift]; + maxv = max(maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = val; + at->at_current = maxv; + at->at_binstart += shift * binlimit; + } + + if (at->at_current > at->at_worst_ever) { + at->at_worst_ever = at->at_current; + at->at_worst_time = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + for proc only */ + at->at_current = val; + + if (at_max > 0) + at->at_current = min(at->at_current, at_max); + at->at_current = max(at->at_current, at_min); + + if (at->at_current != old) + CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d " + "(val=%u) hist %u %u %u %u\n", at, + old, at->at_current, at->at_current - old, val, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); + + /* if we changed, report the old value */ + old = (at->at_current != old) ? old : 0; + + spin_unlock(&at->at_lock); + return old; +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) +{ + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not enough portals? */ + LASSERT(i < IMP_AT_MAX_PORTALS); + + at->iat_portal[i] = portal; +out: + spin_unlock(&imp->imp_lock); + return i; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c new file mode 100644 index 0000000000000..0f9667e4e578b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c @@ -0,0 +1,2622 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/layout.c + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ +/* + * This file contains the "capsule/pill" abstraction layered above PTLRPC. + * + * Every struct ptlrpc_request contains a "pill", which points to a description + * of the format that the request conforms to. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include +#include +#include +#include +#include + +/* struct ptlrpc_request, lustre_msg* */ +#include +#include +#include + +/* + * RQFs (see below) refer to two struct req_msg_field arrays describing the + * client request and server reply, respectively. + */ +/* empty set of fields... for suitable definition of emptiness. */ +static const struct req_msg_field *empty[] = { + &RMF_PTLRPC_BODY +}; + +static const struct req_msg_field *mgs_target_info_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_TARGET_INFO +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +static const struct req_msg_field *mgs_set_info[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_SEND_PARAM +}; +#endif + +static const struct req_msg_field *mgs_config_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_BODY +}; + +static const struct req_msg_field *mgs_config_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_RES +}; + +static const struct req_msg_field *mdt_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *mdt_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *quotactl_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_QUOTACTL +}; + +static const struct req_msg_field *quota_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *mdt_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1 +}; + +static const struct req_msg_field *mdt_close_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CLOSE_DATA, + &RMF_U32 +}; + +static const struct req_msg_field *obd_statfs_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_STATFS +}; + +static const struct req_msg_field *seq_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_OPC, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *seq_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *fld_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_OPC, + 
&RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA +}; + +static const struct req_msg_field *mds_getattr_name_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT +}; + +static const struct req_msg_field *mds_reint_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_create_slave_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_acl_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_create_sym_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_link_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_rename_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_migrate_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_SELINUX_POL, + &RMF_MDT_EPOCH, + &RMF_CLOSE_DATA, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_last_unlink_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_LOGCOOKIES, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_setattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_MDT_EPOCH, + &RMF_EADATA, + &RMF_LOGCOOKIES, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_setxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_resync[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mdt_swap_layouts[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_SWAP_LAYOUTS, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_rmfid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_CAPA1, + &RMF_CAPA2, +}; + +static const struct 
req_msg_field *mds_rmfid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_RCS, +}; + +static const struct req_msg_field *obd_connect_client[] = { + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *obd_connect_server[] = { + &RMF_PTLRPC_BODY, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL +}; + +static const struct req_msg_field *ost_grant_shrink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *mds_getinfo_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_GETINFO_VALLEN +}; + +static const struct req_msg_field *mds_getinfo_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_VAL, +}; + +static const struct req_msg_field *ldlm_enqueue_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *ldlm_enqueue_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP +}; + +static const struct req_msg_field *ldlm_enqueue_lvb_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_cp_callback_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_gl_callback_desc_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_GL_DESC +}; + +static const struct req_msg_field *ldlm_gl_callback_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_intent_basic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, +}; + +static const struct req_msg_field *ldlm_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT +}; + +static const struct req_msg_field *ldlm_intent_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL +}; + +static const struct req_msg_field *ldlm_intent_layout_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_LAYOUT_INTENT, + &RMF_EADATA /* for new layout to be set up */ +}; + +static const struct req_msg_field *ldlm_intent_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NIOBUF_INLINE, + &RMF_FILE_SECCTX +}; + +static const struct req_msg_field *ldlm_intent_getattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_FILE_SECCTX_NAME +}; + +static const struct req_msg_field *ldlm_intent_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_FILE_SECCTX, + &RMF_DEFAULT_MDT_MD +}; + +static const struct req_msg_field *ldlm_intent_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *ldlm_intent_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */ + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *ldlm_intent_getxattr_client[] = { + 
&RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *ldlm_intent_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, /* for req_capsule_extend/mdt_intent_policy */ + &RMF_EADATA, + &RMF_EAVALS, + &RMF_EAVALS_LENS +}; + +static const struct req_msg_field *mds_get_root_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *mds_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_setattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_update_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OUT_UPDATE_HEADER, + &RMF_OUT_UPDATE_BUF, +}; + +static const struct req_msg_field *mds_update_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OUT_UPDATE_REPLY, +}; + +static const struct req_msg_field *llog_origin_handle_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *llogd_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY +}; + +static const struct req_msg_field *llog_log_hdr_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOG_LOG_HDR +}; + +static const struct req_msg_field *llog_origin_handle_next_block_server[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *obd_idx_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *obd_idx_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *ost_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_destroy_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_DLM_REQ, + &RMF_CAPA1 +}; + + +static const struct req_msg_field *ost_brw_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1, + &RMF_SHORT_IO +}; + +static const struct req_msg_field *ost_brw_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_SHORT_IO +}; + +static const struct req_msg_field *ost_brw_write_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_RCS +}; + +static const struct req_msg_field *ost_get_info_generic_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *ost_get_info_generic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY +}; + +static const struct req_msg_field *ost_get_last_id_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_ID +}; + +static const struct req_msg_field *ost_get_last_fid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_last_fid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_fiemap_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_KEY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *ost_ladvise[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + 
&RMF_OST_LADVISE_HDR, + &RMF_OST_LADVISE, +}; + +static const struct req_msg_field *ost_get_fiemap_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *mdt_hsm_progress[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_PROGRESS, +}; + +static const struct req_msg_field *mdt_hsm_ct_register[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_ARCHIVE, +}; + +static const struct req_msg_field *mdt_hsm_ct_unregister[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, +}; + +static const struct req_msg_field *mdt_hsm_action_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_CURRENT_ACTION, +}; + +static const struct req_msg_field *mdt_hsm_state_get_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_HSM_USER_STATE, +}; + +static const struct req_msg_field *mdt_hsm_state_set[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_HSM_STATE_SET, +}; + +static const struct req_msg_field *mdt_hsm_request[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_REQUEST, + &RMF_MDS_HSM_USER_ITEM, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *obd_lfsck_request[] = { + &RMF_PTLRPC_BODY, + &RMF_LFSCK_REQUEST, +}; + +static const struct req_msg_field *obd_lfsck_reply[] = { + &RMF_PTLRPC_BODY, + &RMF_LFSCK_REPLY, +}; + +static struct req_format *req_formats[] = { + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, + &RQF_OBD_IDX_READ, + &RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) + &RQF_MGS_SET_INFO, +#endif + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, + &RQF_FLD_READ, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, + &RQF_MDS_GET_ROOT, + &RQF_MDS_STATFS, + &RQF_MDS_STATFS_NEW, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_CLOSE_INTENT, + &RQF_MDS_READPAGE, + &RQF_MDS_REINT, + &RQF_MDS_REINT_CREATE, + &RQF_MDS_REINT_CREATE_ACL, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_MIGRATE, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_REINT_RESYNC, + &RQF_MDS_QUOTACTL, + &RQF_MDS_HSM_PROGRESS, + &RQF_MDS_HSM_CT_REGISTER, + &RQF_MDS_HSM_CT_UNREGISTER, + &RQF_MDS_HSM_STATE_GET, + &RQF_MDS_HSM_STATE_SET, + &RQF_MDS_HSM_ACTION, + &RQF_MDS_HSM_REQUEST, + &RQF_MDS_SWAP_LAYOUTS, + &RQF_MDS_RMFID, + &RQF_OUT_UPDATE, + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, + &RQF_OST_GET_INFO, + &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_FID, + &RQF_OST_SET_INFO_LAST_FID, + &RQF_OST_GET_INFO_FIEMAP, + &RQF_OST_LADVISE, + &RQF_LDLM_ENQUEUE, + &RQF_LDLM_ENQUEUE_LVB, + &RQF_LDLM_CONVERT, + &RQF_LDLM_CANCEL, + &RQF_LDLM_CALLBACK, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_CALLBACK_DESC, + &RQF_LDLM_INTENT, + &RQF_LDLM_INTENT_BASIC, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + &RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_GETXATTR, + &RQF_LDLM_INTENT_QUOTA, + &RQF_QUOTA_DQACQ, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_CONNECT, + &RQF_LFSCK_NOTIFY, + 
&RQF_LFSCK_QUERY, +}; + +struct req_msg_field { + const __u32 rmf_flags; + const char *rmf_name; + /** + * Field length. (-1) means "variable length". If the + * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length, + * but the actual size must be a whole multiple of \a rmf_size. + */ + const int rmf_size; + void (*rmf_swabber)(void *); + void (*rmf_dumper)(void *); + int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; +}; + +enum rmf_flags { + /** + * The field is a string, must be NUL-terminated. + */ + RMF_F_STRING = 1 << 0, + /** + * The field's buffer size need not match the declared \a rmf_size. + */ + RMF_F_NO_SIZE_CHECK = 1 << 1, + /** + * The field's buffer size must be a whole multiple of the declared \a + * rmf_size and the \a rmf_swabber function must work on the declared \a + * rmf_size worth of bytes. + */ + RMF_F_STRUCT_ARRAY = 1 << 2 +}; + +struct req_capsule; + +/* + * Request fields. + */ +#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swabber = (void (*)(void*))(swabber), \ + .rmf_dumper = (void (*)(void*))(dumper) \ +} + +struct req_msg_field RMF_GENERIC_DATA = + DEFINE_MSGF("generic_data", 0, + -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GENERIC_DATA); + +struct req_msg_field RMF_MGS_TARGET_INFO = + DEFINE_MSGF("mgs_target_info", 0, + sizeof(struct mgs_target_info), + lustre_swab_mgs_target_info, NULL); +EXPORT_SYMBOL(RMF_MGS_TARGET_INFO); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +struct req_msg_field RMF_MGS_SEND_PARAM = + DEFINE_MSGF("mgs_send_param", 0, + sizeof(struct mgs_send_param), + NULL, NULL); +EXPORT_SYMBOL(RMF_MGS_SEND_PARAM); +#endif + +struct req_msg_field RMF_MGS_CONFIG_BODY = + DEFINE_MSGF("mgs_config_read request", 0, + sizeof(struct mgs_config_body), + lustre_swab_mgs_config_body, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY); + +struct req_msg_field RMF_MGS_CONFIG_RES = + DEFINE_MSGF("mgs_config_read reply ", 0, + sizeof(struct mgs_config_res), + lustre_swab_mgs_config_res, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); + +struct req_msg_field RMF_U32 = + DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_U32); + +struct req_msg_field RMF_SETINFO_VAL = + DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_VAL); + +struct req_msg_field RMF_GETINFO_KEY = + DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_KEY); + +struct req_msg_field RMF_GETINFO_VALLEN = + DEFINE_MSGF("getinfo_vallen", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VALLEN); + +struct req_msg_field RMF_GETINFO_VAL = + DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VAL); + +struct req_msg_field RMF_SEQ_OPC = + DEFINE_MSGF("seq_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_SEQ_OPC); + +struct req_msg_field RMF_SEQ_RANGE = + DEFINE_MSGF("seq_query_range", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_SEQ_RANGE); + +struct req_msg_field RMF_FLD_OPC = + DEFINE_MSGF("fld_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_FLD_OPC); + +struct req_msg_field RMF_FLD_MDFLD = + DEFINE_MSGF("fld_query_mdfld", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_FLD_MDFLD); + +struct req_msg_field RMF_MDT_BODY = + DEFINE_MSGF("mdt_body", 0, + sizeof(struct mdt_body), 
lustre_swab_mdt_body, NULL); +EXPORT_SYMBOL(RMF_MDT_BODY); + +struct req_msg_field RMF_OBD_QUOTACTL = + DEFINE_MSGF("obd_quotactl", 0, + sizeof(struct obd_quotactl), + lustre_swab_obd_quotactl, NULL); +EXPORT_SYMBOL(RMF_OBD_QUOTACTL); + +struct req_msg_field RMF_QUOTA_BODY = + DEFINE_MSGF("quota_body", 0, + sizeof(struct quota_body), lustre_swab_quota_body, NULL); +EXPORT_SYMBOL(RMF_QUOTA_BODY); + +struct req_msg_field RMF_MDT_EPOCH = + DEFINE_MSGF("mdt_ioepoch", 0, + sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL); +EXPORT_SYMBOL(RMF_MDT_EPOCH); + +struct req_msg_field RMF_PTLRPC_BODY = + DEFINE_MSGF("ptlrpc_body", 0, + sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL); +EXPORT_SYMBOL(RMF_PTLRPC_BODY); + +struct req_msg_field RMF_CLOSE_DATA = + DEFINE_MSGF("data_version", 0, + sizeof(struct close_data), lustre_swab_close_data, NULL); +EXPORT_SYMBOL(RMF_CLOSE_DATA); + +struct req_msg_field RMF_OBD_STATFS = + DEFINE_MSGF("obd_statfs", 0, + sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL); +EXPORT_SYMBOL(RMF_OBD_STATFS); + +struct req_msg_field RMF_SETINFO_KEY = + DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_KEY); + +struct req_msg_field RMF_NAME = + DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_NAME); + +struct req_msg_field RMF_FID_ARRAY = + DEFINE_MSGF("fid_array", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FID_ARRAY); + +struct req_msg_field RMF_SYMTGT = + DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SYMTGT); + +struct req_msg_field RMF_TGTUUID = + DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_TGTUUID); + +struct req_msg_field RMF_CLUUID = + DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_CLUUID); + +struct req_msg_field RMF_STRING = + DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_STRING); + +struct req_msg_field RMF_FILE_SECCTX_NAME = + DEFINE_MSGF("file_secctx_name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_SECCTX_NAME); + +struct req_msg_field RMF_FILE_SECCTX = + DEFINE_MSGF("file_secctx", RMF_F_NO_SIZE_CHECK, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_SECCTX); + +struct req_msg_field RMF_LLOGD_BODY = + DEFINE_MSGF("llogd_body", 0, + sizeof(struct llogd_body), lustre_swab_llogd_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_BODY); + +struct req_msg_field RMF_LLOG_LOG_HDR = + DEFINE_MSGF("llog_log_hdr", 0, + sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL); +EXPORT_SYMBOL(RMF_LLOG_LOG_HDR); + +struct req_msg_field RMF_LLOGD_CONN_BODY = + DEFINE_MSGF("llogd_conn_body", 0, + sizeof(struct llogd_conn_body), + lustre_swab_llogd_conn_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY); + +/* + * connection handle received in MDS_CONNECT request. + * + * No swabbing needed because struct lustre_handle contains only a 64-bit cookie + * that the client does not interpret at all. 
+ */ +struct req_msg_field RMF_CONN = + DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL); +EXPORT_SYMBOL(RMF_CONN); + +struct req_msg_field RMF_CONNECT_DATA = + DEFINE_MSGF("cdata", + RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */, + sizeof(struct obd_connect_data), + lustre_swab_connect, NULL); +EXPORT_SYMBOL(RMF_CONNECT_DATA); + +struct req_msg_field RMF_DLM_REQ = + DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */, + sizeof(struct ldlm_request), + lustre_swab_ldlm_request, NULL); +EXPORT_SYMBOL(RMF_DLM_REQ); + +struct req_msg_field RMF_DLM_REP = + DEFINE_MSGF("dlm_rep", 0, + sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL); +EXPORT_SYMBOL(RMF_DLM_REP); + +struct req_msg_field RMF_LDLM_INTENT = + DEFINE_MSGF("ldlm_intent", 0, + sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL); +EXPORT_SYMBOL(RMF_LDLM_INTENT); + +struct req_msg_field RMF_DLM_LVB = + DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_LVB); + +struct req_msg_field RMF_DLM_GL_DESC = + DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc), NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_GL_DESC); + +struct req_msg_field RMF_MDT_MD = + DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_MDT_MD); + +struct req_msg_field RMF_DEFAULT_MDT_MD = + DEFINE_MSGF("default_mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, + NULL); +EXPORT_SYMBOL(RMF_DEFAULT_MDT_MD); + +struct req_msg_field RMF_REC_REINT = + DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), + lustre_swab_mdt_rec_reint, NULL); +EXPORT_SYMBOL(RMF_REC_REINT); + +/* FIXME: this length should be defined as a macro */ +struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, + NULL, NULL); +EXPORT_SYMBOL(RMF_EADATA); + +struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_EAVALS); + +struct req_msg_field RMF_ACL = DEFINE_MSGF("acl", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_ACL); + +/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */ +struct req_msg_field RMF_LOGCOOKIES = + DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */, + sizeof(struct llog_cookie), NULL, NULL); +EXPORT_SYMBOL(RMF_LOGCOOKIES); + +struct req_msg_field RMF_CAPA1 = + DEFINE_MSGF("capa", 0, 0, NULL, NULL); +EXPORT_SYMBOL(RMF_CAPA1); + +struct req_msg_field RMF_CAPA2 = + DEFINE_MSGF("capa", 0, 0, NULL, NULL); +EXPORT_SYMBOL(RMF_CAPA2); + +struct req_msg_field RMF_LAYOUT_INTENT = + DEFINE_MSGF("layout_intent", 0, + sizeof(struct layout_intent), lustre_swab_layout_intent, + NULL); +EXPORT_SYMBOL(RMF_LAYOUT_INTENT); + +struct req_msg_field RMF_SELINUX_POL = + DEFINE_MSGF("selinux_pol", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SELINUX_POL); + +/* + * OST request field. 
+ */ +struct req_msg_field RMF_OST_BODY = + DEFINE_MSGF("ost_body", 0, + sizeof(struct ost_body), lustre_swab_ost_body, + dump_ost_body); +EXPORT_SYMBOL(RMF_OST_BODY); + +struct req_msg_field RMF_OBD_IOOBJ = + DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, + sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo); +EXPORT_SYMBOL(RMF_OBD_IOOBJ); + +struct req_msg_field RMF_NIOBUF_REMOTE = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); + +struct req_msg_field RMF_NIOBUF_INLINE = + DEFINE_MSGF("niobuf_inline", RMF_F_NO_SIZE_CHECK, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_INLINE); + +struct req_msg_field RMF_RCS = + DEFINE_MSGF("niobuf_rcs", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); +EXPORT_SYMBOL(RMF_RCS); + +struct req_msg_field RMF_EAVALS_LENS = + DEFINE_MSGF("eavals_lens", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_EAVALS_LENS); + +struct req_msg_field RMF_OBD_ID = + DEFINE_MSGF("obd_id", 0, + sizeof(__u64), lustre_swab_ost_last_id, NULL); +EXPORT_SYMBOL(RMF_OBD_ID); + +struct req_msg_field RMF_FID = + DEFINE_MSGF("fid", 0, + sizeof(struct lu_fid), lustre_swab_lu_fid, NULL); +EXPORT_SYMBOL(RMF_FID); + +struct req_msg_field RMF_OST_ID = + DEFINE_MSGF("ost_id", 0, + sizeof(struct ost_id), lustre_swab_ost_id, NULL); +EXPORT_SYMBOL(RMF_OST_ID); + +struct req_msg_field RMF_FIEMAP_KEY = + DEFINE_MSGF("fiemap_key", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap_info_key, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_KEY); + +struct req_msg_field RMF_FIEMAP_VAL = + DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_VAL); + +struct req_msg_field RMF_IDX_INFO = + DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), + lustre_swab_idx_info, NULL); +EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_SHORT_IO = + DEFINE_MSGF("short_io", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SHORT_IO); +struct req_msg_field RMF_HSM_USER_STATE = + DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), + lustre_swab_hsm_user_state, NULL); +EXPORT_SYMBOL(RMF_HSM_USER_STATE); + +struct req_msg_field RMF_HSM_STATE_SET = + DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set), + lustre_swab_hsm_state_set, NULL); +EXPORT_SYMBOL(RMF_HSM_STATE_SET); + +struct req_msg_field RMF_MDS_HSM_PROGRESS = + DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel), + lustre_swab_hsm_progress_kernel, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS); + +struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION = + DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action), + lustre_swab_hsm_current_action, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION); + +struct req_msg_field RMF_MDS_HSM_USER_ITEM = + DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, + sizeof(struct hsm_user_item), lustre_swab_hsm_user_item, + NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); + +struct req_msg_field RMF_MDS_HSM_ARCHIVE = + DEFINE_MSGF("hsm_archive", RMF_F_STRUCT_ARRAY, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); + +struct req_msg_field RMF_MDS_HSM_REQUEST = + DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request), + lustre_swab_hsm_request, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST); + +struct req_msg_field RMF_OUT_UPDATE = DEFINE_MSGF("object_update", 0, -1, + lustre_swab_object_update_request, NULL); 
+EXPORT_SYMBOL(RMF_OUT_UPDATE); + +struct req_msg_field RMF_OUT_UPDATE_REPLY = + DEFINE_MSGF("object_update_reply", 0, -1, + lustre_swab_object_update_reply, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_REPLY); + +struct req_msg_field RMF_SWAP_LAYOUTS = + DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts), + lustre_swab_swap_layouts, NULL); +EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); + +struct req_msg_field RMF_LFSCK_REQUEST = + DEFINE_MSGF("lfsck_request", 0, sizeof(struct lfsck_request), + lustre_swab_lfsck_request, NULL); +EXPORT_SYMBOL(RMF_LFSCK_REQUEST); + +struct req_msg_field RMF_LFSCK_REPLY = + DEFINE_MSGF("lfsck_reply", 0, sizeof(struct lfsck_reply), + lustre_swab_lfsck_reply, NULL); +EXPORT_SYMBOL(RMF_LFSCK_REPLY); + +struct req_msg_field RMF_OST_LADVISE_HDR = + DEFINE_MSGF("ladvise_request", 0, + sizeof(struct ladvise_hdr), + lustre_swab_ladvise_hdr, NULL); +EXPORT_SYMBOL(RMF_OST_LADVISE_HDR); + +struct req_msg_field RMF_OST_LADVISE = + DEFINE_MSGF("ladvise_request", RMF_F_STRUCT_ARRAY, + sizeof(struct lu_ladvise), + lustre_swab_ladvise, NULL); +EXPORT_SYMBOL(RMF_OST_LADVISE); + +struct req_msg_field RMF_OUT_UPDATE_HEADER = DEFINE_MSGF("out_update_header", 0, + -1, lustre_swab_out_update_header, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_HEADER); + +struct req_msg_field RMF_OUT_UPDATE_BUF = DEFINE_MSGF("update_buf", + RMF_F_STRUCT_ARRAY, sizeof(struct out_update_buffer), + lustre_swab_out_update_buffer, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_BUF); + +/* + * Request formats. + */ + +struct req_format { + const char *rf_name; + size_t rf_idx; + struct { + size_t nr; + const struct req_msg_field **d; + } rf_fields[RCL_NR]; +}; + +#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \ + .rf_name = name, \ + .rf_fields = { \ + [RCL_CLIENT] = { \ + .nr = client_nr, \ + .d = client \ + }, \ + [RCL_SERVER] = { \ + .nr = server_nr, \ + .d = server \ + } \ + } \ +} + +#define DEFINE_REQ_FMT0(name, client, server) \ +DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) + +struct req_format RQF_OBD_PING = + DEFINE_REQ_FMT0("OBD_PING", empty, empty); +EXPORT_SYMBOL(RQF_OBD_PING); + +struct req_format RQF_OBD_SET_INFO = + DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty); +EXPORT_SYMBOL(RQF_OBD_SET_INFO); + +/* Read index file through the network */ +struct req_format RQF_OBD_IDX_READ = + DEFINE_REQ_FMT0("OBD_IDX_READ", + obd_idx_read_client, obd_idx_read_server); +EXPORT_SYMBOL(RQF_OBD_IDX_READ); + +struct req_format RQF_SEC_CTX = + DEFINE_REQ_FMT0("SEC_CTX", empty, empty); +EXPORT_SYMBOL(RQF_SEC_CTX); + +struct req_format RQF_MGS_TARGET_REG = + DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, + mgs_target_info_only); +EXPORT_SYMBOL(RQF_MGS_TARGET_REG); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +struct req_format RQF_MGS_SET_INFO = + DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info, + mgs_set_info); +EXPORT_SYMBOL(RQF_MGS_SET_INFO); +#endif + +struct req_format RQF_MGS_CONFIG_READ = + DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client, + mgs_config_read_server); +EXPORT_SYMBOL(RQF_MGS_CONFIG_READ); + +struct req_format RQF_SEQ_QUERY = + DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); +EXPORT_SYMBOL(RQF_SEQ_QUERY); + +struct req_format RQF_FLD_QUERY = + DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); +EXPORT_SYMBOL(RQF_FLD_QUERY); + +/* The 'fld_read_server' uses 'RMF_GENERIC_DATA' to hold the 'FLD_QUERY' + * RPC reply that is composed of 'struct lu_seq_range_array'. 
But there
+ * is no registered swabber function for 'RMF_GENERIC_DATA', so the RPC
+ * peers need to handle the RPC reply in fixed little-endian format.
+ *
+ * In theory, we could define a new structure with a registered swabber to
+ * handle the 'FLD_QUERY' RPC reply automatically. But from the
+ * implementation point of view, that is not easy to do within the current
+ * "struct req_msg_field" framework, because the sequence range array in the
+ * RPC reply is not of fixed length; its length depends on the 'lu_seq_range'
+ * count, which is unknown when the RPC buffer is prepared. Generally, for
+ * such variable-length RPC usage there is a field in the RPC layout to
+ * indicate the data length. But for the 'FLD_READ' RPC we have no way to do
+ * that without adding a new length field, which would break the on-wire RPC
+ * protocol and cause interoperability trouble with old peers. */
+struct req_format RQF_FLD_READ =
+ DEFINE_REQ_FMT0("FLD_READ", fld_read_client, fld_read_server);
+EXPORT_SYMBOL(RQF_FLD_READ);
+
+struct req_format RQF_MDS_QUOTACTL =
+ DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_MDS_QUOTACTL);
+
+struct req_format RQF_OST_QUOTACTL =
+ DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_OST_QUOTACTL);
+
+struct req_format RQF_QUOTA_DQACQ =
+ DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only);
+EXPORT_SYMBOL(RQF_QUOTA_DQACQ);
+
+struct req_format RQF_LDLM_INTENT_QUOTA =
+ DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA",
+ ldlm_intent_quota_client,
+ ldlm_intent_quota_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA);
+
+struct req_format RQF_MDS_GET_ROOT =
+ DEFINE_REQ_FMT0("MDS_GET_ROOT", mds_get_root_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_GET_ROOT);
+
+struct req_format RQF_MDS_STATFS =
+ DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS);
+
+struct req_format RQF_MDS_STATFS_NEW =
+ DEFINE_REQ_FMT0("MDS_STATFS_NEW", mdt_body_only, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS_NEW);
+
+struct req_format RQF_MDS_SYNC =
+ DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_SYNC);
+
+struct req_format RQF_MDS_GETATTR =
+ DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR);
+
+struct req_format RQF_MDS_GETXATTR =
+ DEFINE_REQ_FMT0("MDS_GETXATTR",
+ mds_getxattr_client, mds_getxattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETXATTR);
+
+struct req_format RQF_MDS_GETATTR_NAME =
+ DEFINE_REQ_FMT0("MDS_GETATTR_NAME",
+ mds_getattr_name_client, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME);
+
+struct req_format RQF_MDS_REINT =
+ DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT);
+
+struct req_format RQF_MDS_REINT_CREATE =
+ DEFINE_REQ_FMT0("MDS_REINT_CREATE",
+ mds_reint_create_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE);
+
+struct req_format RQF_MDS_REINT_CREATE_ACL =
+ DEFINE_REQ_FMT0("MDS_REINT_CREATE_ACL",
+ mds_reint_create_acl_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_ACL);
+
+struct req_format RQF_MDS_REINT_CREATE_SLAVE =
+ DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA",
+ mds_reint_create_slave_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE);
+
+struct req_format RQF_MDS_REINT_CREATE_SYM =
+ DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM",
+ mds_reint_create_sym_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM);
+
+struct req_format RQF_MDS_REINT_OPEN =
+ DEFINE_REQ_FMT0("MDS_REINT_OPEN",
+ mds_reint_open_client, mds_reint_open_server); +EXPORT_SYMBOL(RQF_MDS_REINT_OPEN); + +struct req_format RQF_MDS_REINT_UNLINK = + DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK); + +struct req_format RQF_MDS_REINT_LINK = + DEFINE_REQ_FMT0("MDS_REINT_LINK", + mds_reint_link_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_LINK); + +struct req_format RQF_MDS_REINT_RENAME = + DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_RENAME); + +struct req_format RQF_MDS_REINT_MIGRATE = + DEFINE_REQ_FMT0("MDS_REINT_MIGRATE", mds_reint_migrate_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_MIGRATE); + +struct req_format RQF_MDS_REINT_SETATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETATTR", + mds_reint_setattr_client, mds_setattr_server); +EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR); + +struct req_format RQF_MDS_REINT_SETXATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETXATTR", + mds_reint_setxattr_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); + +struct req_format RQF_MDS_REINT_RESYNC = + DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC); + +struct req_format RQF_MDS_CONNECT = + DEFINE_REQ_FMT0("MDS_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_MDS_CONNECT); + +struct req_format RQF_MDS_DISCONNECT = + DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_MDS_DISCONNECT); + +struct req_format RQF_MDS_GET_INFO = + DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client, + mds_getinfo_server); +EXPORT_SYMBOL(RQF_MDS_GET_INFO); + +struct req_format RQF_OUT_UPDATE = + DEFINE_REQ_FMT0("OUT_UPDATE", mds_update_client, + mds_update_server); +EXPORT_SYMBOL(RQF_OUT_UPDATE); + +struct req_format RQF_LDLM_ENQUEUE = + DEFINE_REQ_FMT0("LDLM_ENQUEUE", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE); + +struct req_format RQF_LDLM_ENQUEUE_LVB = + DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB); + +struct req_format RQF_LDLM_CONVERT = + DEFINE_REQ_FMT0("LDLM_CONVERT", + ldlm_enqueue_client, ldlm_enqueue_server); +EXPORT_SYMBOL(RQF_LDLM_CONVERT); + +struct req_format RQF_LDLM_CANCEL = + DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CANCEL); + +struct req_format RQF_LDLM_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CALLBACK); + +struct req_format RQF_LDLM_CP_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK); + +struct req_format RQF_LDLM_BL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK_DESC = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK_DESC); + +struct req_format RQF_LDLM_INTENT_BASIC = + DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", + ldlm_intent_basic_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC); + +struct req_format RQF_LDLM_INTENT = + DEFINE_REQ_FMT0("LDLM_INTENT", + ldlm_intent_client, ldlm_intent_server); 
+EXPORT_SYMBOL(RQF_LDLM_INTENT); + +struct req_format RQF_LDLM_INTENT_LAYOUT = + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT", + ldlm_intent_layout_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); + +struct req_format RQF_LDLM_INTENT_GETATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR", + ldlm_intent_getattr_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR); + +struct req_format RQF_LDLM_INTENT_OPEN = + DEFINE_REQ_FMT0("LDLM_INTENT_OPEN", + ldlm_intent_open_client, ldlm_intent_open_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN); + +struct req_format RQF_LDLM_INTENT_CREATE = + DEFINE_REQ_FMT0("LDLM_INTENT_CREATE", + ldlm_intent_create_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); + +struct req_format RQF_LDLM_INTENT_GETXATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", + ldlm_intent_getxattr_client, + ldlm_intent_getxattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETXATTR); + +struct req_format RQF_MDS_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE); + +struct req_format RQF_MDS_CLOSE_INTENT = + DEFINE_REQ_FMT0("MDS_CLOSE_INTENT", + mdt_close_intent_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE_INTENT); + +struct req_format RQF_MDS_READPAGE = + DEFINE_REQ_FMT0("MDS_READPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_READPAGE); + +struct req_format RQF_MDS_HSM_ACTION = + DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server); +EXPORT_SYMBOL(RQF_MDS_HSM_ACTION); + +struct req_format RQF_MDS_HSM_PROGRESS = + DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS); + +struct req_format RQF_MDS_HSM_CT_REGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER); + +struct req_format RQF_MDS_HSM_CT_UNREGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER); + +struct req_format RQF_MDS_HSM_STATE_GET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_GET", + mdt_body_capa, mdt_hsm_state_get_server); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET); + +struct req_format RQF_MDS_HSM_STATE_SET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET); + +struct req_format RQF_MDS_HSM_REQUEST = + DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST); + +struct req_format RQF_MDS_SWAP_LAYOUTS = + DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS", + mdt_swap_layouts, empty); +EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); + +struct req_format RQF_MDS_RMFID = + DEFINE_REQ_FMT0("MDS_RMFID", mds_rmfid_client, + mds_rmfid_server); +EXPORT_SYMBOL(RQF_MDS_RMFID); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", + llog_origin_handle_create_client, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER", + llogd_body_only, 
llog_log_hdr_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + +struct req_format RQF_CONNECT = + DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_CONNECT); + +struct req_format RQF_OST_CONNECT = + DEFINE_REQ_FMT0("OST_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_OST_CONNECT); + +struct req_format RQF_OST_DISCONNECT = + DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_OST_DISCONNECT); + +struct req_format RQF_OST_GETATTR = + DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_GETATTR); + +struct req_format RQF_OST_SETATTR = + DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SETATTR); + +struct req_format RQF_OST_CREATE = + DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_CREATE); + +struct req_format RQF_OST_PUNCH = + DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_PUNCH); + +struct req_format RQF_OST_SYNC = + DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SYNC); + +struct req_format RQF_OST_DESTROY = + DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only); +EXPORT_SYMBOL(RQF_OST_DESTROY); + +struct req_format RQF_OST_BRW_READ = + DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server); +EXPORT_SYMBOL(RQF_OST_BRW_READ); + +struct req_format RQF_OST_BRW_WRITE = + DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server); +EXPORT_SYMBOL(RQF_OST_BRW_WRITE); + +struct req_format RQF_OST_STATFS = + DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_OST_STATFS); + +struct req_format RQF_OST_SET_GRANT_INFO = + DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client, + ost_body_only); +EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO); + +struct req_format RQF_OST_GET_INFO = + DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client, + ost_get_info_generic_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO); + +struct req_format RQF_OST_GET_INFO_LAST_ID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client, + ost_get_last_id_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID); + +struct req_format RQF_OST_GET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", ost_get_last_fid_client, + ost_get_last_fid_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID); + +struct req_format RQF_OST_SET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client, + empty); +EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID); + +struct req_format RQF_OST_GET_INFO_FIEMAP = + DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client, + ost_get_fiemap_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); + +struct req_format RQF_LFSCK_NOTIFY = + DEFINE_REQ_FMT0("LFSCK_NOTIFY", obd_lfsck_request, empty); +EXPORT_SYMBOL(RQF_LFSCK_NOTIFY); + +struct req_format RQF_LFSCK_QUERY = + DEFINE_REQ_FMT0("LFSCK_QUERY", obd_lfsck_request, obd_lfsck_reply); +EXPORT_SYMBOL(RQF_LFSCK_QUERY); + +struct req_format RQF_OST_LADVISE = + DEFINE_REQ_FMT0("OST_LADVISE", ost_ladvise, ost_body_only); +EXPORT_SYMBOL(RQF_OST_LADVISE); + +/* Convenience macro */ +#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)] + +/** + * Initializes the capsule abstraction by computing and setting the \a rf_idx + * field of RQFs and the \a rmf_offset field of RMFs. 
+ */ +int req_layout_init(void) +{ + size_t i; + size_t j; + size_t k; + struct req_format *rf = NULL; + + for (i = 0; i < ARRAY_SIZE(req_formats); ++i) { + rf = req_formats[i]; + rf->rf_idx = i; + for (j = 0; j < RCL_NR; ++j) { + LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR); + for (k = 0; k < rf->rf_fields[j].nr; ++k) { + struct req_msg_field *field; + + field = (typeof(field))rf->rf_fields[j].d[k]; + LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY) + || field->rmf_size > 0); + LASSERT(field->rmf_offset[i][j] == 0); + /* + * k + 1 to detect unused format/field + * combinations. + */ + field->rmf_offset[i][j] = k + 1; + } + } + } + return 0; +} +EXPORT_SYMBOL(req_layout_init); + +void req_layout_fini(void) +{ +} +EXPORT_SYMBOL(req_layout_fini); + +/** + * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1. + * + * Actual/expected field sizes are set elsewhere in functions in this file: + * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and + * req_capsule_msg_size(). The \a rc_area information is used by. + * ptlrpc_request_set_replen(). + */ +void req_capsule_init_area(struct req_capsule *pill) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) { + pill->rc_area[RCL_CLIENT][i] = -1; + pill->rc_area[RCL_SERVER][i] = -1; + } +} +EXPORT_SYMBOL(req_capsule_init_area); + +/** + * Initialize a pill. + * + * The \a location indicates whether the caller is executing on the client side + * (RCL_CLIENT) or server side (RCL_SERVER).. + */ +void req_capsule_init(struct req_capsule *pill, + struct ptlrpc_request *req, + enum req_location location) +{ + LASSERT(location == RCL_SERVER || location == RCL_CLIENT); + + /* + * Today all capsules are embedded in ptlrpc_request structs, + * but just in case that ever isn't the case, we don't reach + * into req unless req != NULL and pill is the one embedded in + * the req. + * + * The req->rq_pill_init flag makes it safe to initialize a pill + * twice, which might happen in the OST paths as a result of the + * high-priority RPC queue getting peeked at before ost_handle() + * handles an OST RPC. + */ + if (req != NULL && pill == &req->rq_pill && req->rq_pill_init) + return; + + memset(pill, 0, sizeof *pill); + pill->rc_req = req; + pill->rc_loc = location; + req_capsule_init_area(pill); + + if (req != NULL && pill == &req->rq_pill) + req->rq_pill_init = 1; +} +EXPORT_SYMBOL(req_capsule_init); + +void req_capsule_fini(struct req_capsule *pill) +{ +} +EXPORT_SYMBOL(req_capsule_fini); + +static int __req_format_is_sane(const struct req_format *fmt) +{ + return fmt->rf_idx < ARRAY_SIZE(req_formats) && + req_formats[fmt->rf_idx] == fmt; +} + +static struct lustre_msg *__req_msg(const struct req_capsule *pill, + enum req_location loc) +{ + struct ptlrpc_request *req; + + req = pill->rc_req; + return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg; +} + +/** + * Set the format (\a fmt) of a \a pill; format changes are not allowed here + * (see req_capsule_extend()). + */ +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt) +{ + LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt); + LASSERT(__req_format_is_sane(fmt)); + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_set); + +/** + * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in + * yet. + + * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of + * variable-sized fields. 
The field sizes come from the declared \a rmf_size + * field of a \a pill's \a rc_fmt's RMF's. + */ +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc) +{ + const struct req_format *fmt = pill->rc_fmt; + size_t i; + + LASSERT(fmt != NULL); + + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + if (pill->rc_area[loc][i] == -1) { + pill->rc_area[loc][i] = + fmt->rf_fields[loc].d[i]->rmf_size; + if (pill->rc_area[loc][i] == -1) { + /* + * Skip the following fields. + * + * If this LASSERT() trips then you're missing a + * call to req_capsule_set_size(). + */ + LASSERT(loc != RCL_SERVER); + break; + } + } + } + return i; +} +EXPORT_SYMBOL(req_capsule_filled_sizes); + +/** + * Capsule equivalent of lustre_pack_request() and lustre_pack_reply(). + * + * This function uses the \a pill's \a rc_area as filled in by + * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by + * this function). + */ +int req_capsule_server_pack(struct req_capsule *pill) +{ + const struct req_format *fmt; + int count; + int rc; + + LASSERT(pill->rc_loc == RCL_SERVER); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + + count = req_capsule_filled_sizes(pill, RCL_SERVER); + rc = lustre_pack_reply(pill->rc_req, count, + pill->rc_area[RCL_SERVER], NULL); + if (rc != 0) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Cannot pack %d fields in format `%s': ", + count, fmt->rf_name); + } + return rc; +} +EXPORT_SYMBOL(req_capsule_server_pack); + +/** + * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill + * corresponding to the given RMF (\a field). + */ +__u32 __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + unsigned int offset; + + offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; + LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", + pill->rc_fmt->rf_name, + field->rmf_name, offset, loc); + offset--; + + LASSERT(offset < REQ_MAX_FIELD_NR); + return offset; +} + +/** + * Helper for __req_capsule_get(); swabs value / array of values and/or dumps + * them if desired. + */ +static +void +swabber_dumper_helper(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + int offset, + void *value, int len, int dump, void (*swabber)( void *)) +{ + void *p; + int i; + int n; + int do_swab; + int inout = loc == RCL_CLIENT; + + swabber = swabber ?: field->rmf_swabber; + + if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) && + swabber != NULL && value != NULL) + do_swab = 1; + else + do_swab = 0; + + if (!field->rmf_dumper) + dump = 0; + + if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n", + do_swab ? "unswabbed " : "", field->rmf_name); + field->rmf_dumper(value); + } + if (!do_swab) + return; + swabber(value); + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed field %s " + "follows\n", field->rmf_name); + field->rmf_dumper(value); + } + + return; + } + + /* + * We're swabbing an array; swabber() swabs a single array element, so + * swab every element. + */ + LASSERT((len % field->rmf_size) == 0); + for (p = value, i = 0, n = len / field->rmf_size; + i < n; + i++, p += field->rmf_size) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, " + "element %d follows\n", + do_swab ? 
"unswabbed " : "", field->rmf_name, i); + field->rmf_dumper(p); + } + if (!do_swab) + continue; + swabber(p); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, " + "element %d follows\n", field->rmf_name, i); + field->rmf_dumper(value); + } + } + if (do_swab) + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); +} + +/** + * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill + * corresponding to the given RMF (\a field). + * + * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL + * then the \a rmf_swabber from the RMF will be used. Soon there will be no + * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then + * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each + * element of the array swabbed. + */ +static void *__req_capsule_get(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + void (*swabber)( void *), + int dump) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + void *value; + __u32 len; + __u32 offset; + + void *(*getter)(struct lustre_msg *m, __u32 n, __u32 minlen); + + static const char *rcl_names[RCL_NR] = { + [RCL_CLIENT] = "client", + [RCL_SERVER] = "server" + }; + + LASSERT(pill != NULL); + LASSERT(pill != LP_POISON); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(fmt != LP_POISON); + LASSERT(__req_format_is_sane(fmt)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + LASSERT(msg != NULL); + + getter = (field->rmf_flags & RMF_F_STRING) ? + (typeof(getter))lustre_msg_string : lustre_msg_buf; + + if (field->rmf_flags & (RMF_F_STRUCT_ARRAY|RMF_F_NO_SIZE_CHECK)) { + /* + * We've already asserted that field->rmf_size > 0 in + * req_layout_init(). + */ + len = lustre_msg_buflen(msg, offset); + if (!(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && + (len % field->rmf_size) != 0) { + CERROR("%s: array field size mismatch " + "%d modulo %u != 0 (%d)\n", + field->rmf_name, len, field->rmf_size, loc); + return NULL; + } + } else if (pill->rc_area[loc][offset] != -1) { + len = pill->rc_area[loc][offset]; + } else { + len = max_t(typeof(field->rmf_size), field->rmf_size, 0); + } + value = getter(msg, offset, len); + + if (value == NULL) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Wrong buffer for field `%s' (%u of %u) " + "in format `%s': %u vs. %u (%s)\n", + field->rmf_name, offset, lustre_msg_bufcount(msg), + fmt->rf_name, lustre_msg_buflen(msg, offset), len, + rcl_names[loc]); + } else { + swabber_dumper_helper(pill, field, loc, offset, value, len, + dump, swabber); + } + + return value; +} + +/** + * Dump a request and/or reply + */ +void __req_capsule_dump(struct req_capsule *pill, enum req_location loc) +{ + const struct req_format *fmt; + const struct req_msg_field *field; + __u32 len; + size_t i; + + fmt = pill->rc_fmt; + + DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n"); + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + field = FMT_FIELD(fmt, loc, i); + if (field->rmf_dumper == NULL) { + /* + * FIXME Add a default hex dumper for fields that don't + * have a specific dumper + */ + len = req_capsule_get_size(pill, field, loc); + CDEBUG(D_RPCTRACE, "Field %s has no dumper function;" + "field size is %u\n", field->rmf_name, len); + } else { + /* It's the dumping side-effect that we're interested in */ + (void) __req_capsule_get(pill, field, loc, NULL, 1); + } + } + CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n"); +} + +/** + * Dump a request. 
+ */ +void req_capsule_client_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_CLIENT); +} +EXPORT_SYMBOL(req_capsule_client_dump); + +/** + * Dump a reply + */ +void req_capsule_server_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_SERVER); +} +EXPORT_SYMBOL(req_capsule_server_dump); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_get); + +/** + * Same as req_capsule_client_get(), but with a \a swabber argument. + * + * Currently unused; will be removed when req_capsule_server_swab_get() is + * unused too. + */ +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_client_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_client_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len) +{ + req_capsule_set_size(pill, field, RCL_CLIENT, len); + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_sized_get); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_get); + +/** + * Same as req_capsule_server_get(), but with a \a swabber argument. + * + * Ideally all swabbing should be done pursuant to RMF definitions, with no + * swabbing done outside this capsule abstraction. + */ +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_server_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_get); + +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len, void *swabber) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_swab_get); + +/** + * Returns the buffer of a \a pill corresponding to the given \a field from the + * request (if the caller is executing on the server-side) or reply (if the + * caller is executing on the client-side). 
+ *
+ * This function is convenient for use in code that could be executed on the
+ * client and server alike.
+ */
+const void *req_capsule_other_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field)
+{
+        return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_other_get);
+
+/**
+ * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a
+ * field of the given \a pill.
+ *
+ * This function must be used when constructing variable sized fields of a
+ * request or reply.
+ */
+void req_capsule_set_size(struct req_capsule *pill,
+                          const struct req_msg_field *field,
+                          enum req_location loc, __u32 size)
+{
+        LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+        if ((size != (__u32)field->rmf_size) &&
+            (field->rmf_size != -1) &&
+            !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) &&
+            (size > 0)) {
+                __u32 rmf_size = (__u32)field->rmf_size;
+                if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+                    (size % rmf_size != 0)) {
+                        CERROR("%s: array field size mismatch "
+                               "%u %% %u != 0 (%d)\n",
+                               field->rmf_name, size, rmf_size, loc);
+                        LBUG();
+                } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+                           size < rmf_size) {
+                        CERROR("%s: field size mismatch %u != %u (%d)\n",
+                               field->rmf_name, size, rmf_size, loc);
+                        LBUG();
+                }
+        }
+
+        pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
+}
+EXPORT_SYMBOL(req_capsule_set_size);
+
+/**
+ * Return the actual PTLRPC buffer length of a request or reply (\a loc)
+ * for the given \a pill's given \a field.
+ *
+ * NB: this function doesn't correspond with req_capsule_set_size(), which
+ * actually sets the size in pill.rc_area[loc][offset], but this function
+ * returns the message buflen[offset], maybe we should use another name.
+ */
+__u32 req_capsule_get_size(const struct req_capsule *pill,
+                           const struct req_msg_field *field,
+                           enum req_location loc)
+{
+        LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+        return lustre_msg_buflen(__req_msg(pill, loc),
+                                 __req_capsule_offset(pill, field, loc));
+}
+EXPORT_SYMBOL(req_capsule_get_size);
+
+/**
+ * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the
+ * given \a pill's request or reply (\a loc) given the field size recorded in
+ * the \a pill's rc_area.
+ *
+ * See also req_capsule_set_size().
+ */
+__u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc)
+{
+        return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic,
+                               pill->rc_fmt->rf_fields[loc].nr,
+                               pill->rc_area[loc]);
+}
+
+/**
+ * While req_capsule_msg_size() computes the size of a PTLRPC request or reply
+ * (\a loc) given a \a pill's \a rc_area, this function computes the size of a
+ * PTLRPC request or reply given only an RQF (\a fmt).
+ *
+ * This function should not be used for formats which contain variable size
+ * fields.
+ */
+__u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+                           enum req_location loc)
+{
+        __u32 size;
+        size_t i = 0;
+
+        /*
+         * This function should probably LASSERT() that fmt has no fields with
+         * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many
+         * elements in the array there will ultimately be, but then, we could
+         * assume that there will be at least one element, and that's just what
+         * we do.
+ */ + size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr); + if (size == 0) + return size; + + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; +} +EXPORT_SYMBOL(req_capsule_fmt_size); + +/** + * Changes the format of an RPC. + * + * The pill must already have been initialized, which means that it already has + * a request format. The new format \a fmt must be an extension of the pill's + * old format. Specifically: the new format must have as many request and reply + * fields as the old one, and all fields shared by the old and new format must + * be at least as large in the new format. + * + * The new format's fields may be of different "type" than the old format, but + * only for fields that are "opaque" blobs: fields which have a) have no + * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a + * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example, + * OBD_SET_INFO has a key field and an opaque value field that gets interpreted + * according to the key field. When the value, according to the key, contains a + * structure (or array thereof) to be swabbed, the format should be changed to + * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set + * accordingly. + */ +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt) +{ + int i; + size_t j; + + const struct req_format *old; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + + old = pill->rc_fmt; + /* + * Sanity checking... + */ + for (i = 0; i < RCL_NR; ++i) { + LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr); + for (j = 0; j < old->rf_fields[i].nr - 1; ++j) { + const struct req_msg_field *ofield = FMT_FIELD(old, i, j); + + /* "opaque" fields can be transmogrified */ + if (ofield->rmf_swabber == NULL && + (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 && + (ofield->rmf_size == -1 || + ofield->rmf_flags == RMF_F_NO_SIZE_CHECK)) + continue; + LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j)); + } + /* + * Last field in old format can be shorter than in new. + */ + LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >= + FMT_FIELD(old, i, j)->rmf_size); + } + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_extend); + +/** + * This function returns a non-zero value if the given \a field is present in + * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it + * returns 0. + */ +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return field->rmf_offset[pill->rc_fmt->rf_idx][loc]; +} +EXPORT_SYMBOL(req_capsule_has_field); + +/** + * Returns a non-zero value if the given \a field is present in the given \a + * pill's PTLRPC request or reply (\a loc), else it returns 0. + */ +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + __u32 offset; + + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + LASSERT(req_capsule_has_field(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + return lustre_msg_bufcount(__req_msg(pill, loc)) > offset; +} +EXPORT_SYMBOL(req_capsule_field_present); + +/** + * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC + * request or reply (\a loc). + * + * This is not the opposite of req_capsule_extend(). 
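The presence checks and size setters above combine into a common pattern on both ends of an RPC. A minimal sketch, assuming a format with an optional RMF_NAME string field and a variable-sized RMF_EADATA reply buffer; 'req', 'name', 'buflen' and 'rc' are hypothetical locals of the surrounding handler:

/* receive side: only read the optional field if the peer actually sent it */
if (req_capsule_has_field(&req->rq_pill, &RMF_NAME, RCL_CLIENT) &&
    req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT))
        name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);

/* send side: declare the variable-sized reply buffer before packing it */
req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, buflen);
rc = req_capsule_server_pack(&req->rq_pill);

The same sequence appears in the llog server handlers further down in this patch.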
+ */ +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen, + enum req_location loc) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + __u32 len; + int offset; + + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + LASSERT(req_capsule_has_field(pill, field, loc)); + LASSERT(req_capsule_field_present(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + len = lustre_msg_buflen(msg, offset); + LASSERTF(newlen <= len, "%s:%s, oldlen=%u, newlen=%u\n", + fmt->rf_name, field->rmf_name, len, newlen); + + if (loc == RCL_CLIENT) + pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen, + 1); + else + pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen, + 1); +} +EXPORT_SYMBOL(req_capsule_shrink); + +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen) +{ + struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs; + char *from, *to; + int rc; + __u32 offset, len; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(pill->rc_fmt)); + LASSERT(req_capsule_has_field(pill, field, RCL_SERVER)); + LASSERT(req_capsule_field_present(pill, field, RCL_SERVER)); + + len = req_capsule_get_size(pill, field, RCL_SERVER); + offset = __req_capsule_offset(pill, field, RCL_SERVER); + if ((__u32)pill->rc_req->rq_repbuf_len >= + lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen) + CERROR("Inplace repack might be done\n"); + + pill->rc_req->rq_reply_state = NULL; + req_capsule_set_size(pill, field, RCL_SERVER, newlen); + rc = req_capsule_server_pack(pill); + if (rc) { + /* put old rs back, the caller will decide what to do */ + pill->rc_req->rq_reply_state = rs; + return rc; + } + nrs = pill->rc_req->rq_reply_state; + /* Now we need only buffers, copy first chunk */ + to = lustre_msg_buf(nrs->rs_msg, 0, 0); + from = lustre_msg_buf(rs->rs_msg, 0, 0); + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from; + memcpy(to, from, len); + /* check if we have tail and copy it too */ + if (rs->rs_msg->lm_bufcount > offset + 1) { + to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0); + from = lustre_msg_buf(rs->rs_msg, offset + 1, 0); + offset = rs->rs_msg->lm_bufcount - 1; + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) + + cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from; + memcpy(to, from, len); + } + /* drop old reply if everything is fine */ + if (rs->rs_difficult) { + /* copy rs data */ + int i; + + nrs->rs_difficult = 1; + nrs->rs_no_ack = rs->rs_no_ack; + nrs->rs_convert_lock = rs->rs_convert_lock; + for (i = 0; i < rs->rs_nlocks; i++) { + nrs->rs_locks[i] = rs->rs_locks[i]; + nrs->rs_modes[i] = rs->rs_modes[i]; + nrs->rs_nlocks++; + } + rs->rs_nlocks = 0; + rs->rs_difficult = 0; + rs->rs_no_ack = 0; + } + ptlrpc_rs_decref(rs); + return 0; +} +EXPORT_SYMBOL(req_capsule_server_grow); + +int req_check_sepol(struct req_capsule *pill) +{ + int rc = 0; +#ifdef HAVE_SERVER_SUPPORT + struct obd_export *export; + struct lu_nodemap *nm = NULL; + const char *sepol = NULL; + const char *nm_sepol = NULL; + + if (!pill->rc_req) + return -EPROTO; + + export = pill->rc_req->rq_export; + if (!export || !exp_connect_sepol(export) || + !req_capsule_has_field(pill, &RMF_SELINUX_POL, RCL_CLIENT)) + goto nm; + + if (req_capsule_get_size(pill, &RMF_SELINUX_POL, RCL_CLIENT) == 0) + goto nm; + + sepol = req_capsule_client_get(pill, &RMF_SELINUX_POL); + CDEBUG(D_SEC, "retrieved 
sepol %s\n", sepol); + +nm: + if (export) { + nm = nodemap_get_from_exp(export); + if (!IS_ERR_OR_NULL(nm)) { + nm_sepol = nodemap_get_sepol(nm); + if (nm_sepol && nm_sepol[0]) + if (sepol == NULL || + strcmp(sepol, nm_sepol) != 0) + rc = -EACCES; + } + } + + if (!IS_ERR_OR_NULL(nm)) + nodemap_putref(nm); +#endif + + return rc; +} +EXPORT_SYMBOL(req_check_sepol); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c new file mode 100644 index 0000000000000..0f149b692362c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c @@ -0,0 +1,338 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_client.c + * + * remote api for llog - client side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include + +#include +#include +#include + +#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp) { \ + imp = class_import_get(ctxt->loc_imp); \ + } else { \ + CERROR("ctxt->loc_imp == NULL for context idx %d." \ + "Unable to complete MDS/OSS recovery," \ + "but I'll try again next time. Not fatal.\n", \ + ctxt->loc_idx); \ + imp = NULL; \ + mutex_unlock(&ctxt->loc_mutex); \ + return (-EINVAL); \ + } \ + mutex_unlock(&ctxt->loc_mutex); \ +} while(0) + +#define LLOG_CLIENT_EXIT(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp != imp) \ + CWARN("loc_imp has changed from %p to %p\n", \ + ctxt->loc_imp, imp); \ + class_import_put(imp); \ + mutex_unlock(&ctxt->loc_mutex); \ +} while(0) + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
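The LLOG_CLIENT_ENTRY()/LLOG_CLIENT_EXIT() pair above must bracket every client-side llog operation: the former pins ctxt->loc_imp (or makes the caller return -EINVAL when the import is unset), the latter drops that reference. A minimal sketch of the expected shape (the function and its body are hypothetical):

static int llog_client_example(struct llog_ctxt *ctxt)
{
        struct obd_import *imp;
        int rc = 0;

        LLOG_CLIENT_ENTRY(ctxt, imp);   /* takes a reference on ctxt->loc_imp */
        /* ... issue an RPC against imp here ... */
        LLOG_CLIENT_EXIT(ctxt, imp);    /* warns if loc_imp changed, drops the reference */
        return rc;
}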
*/ +static int llog_client_open(const struct lu_env *env, + struct llog_handle *lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + struct obd_import *imp; + struct llogd_body *body; + struct llog_ctxt *ctxt = lgh->lgh_ctxt; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(ctxt, imp); + + /* client cannot create llog */ + LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param); + LASSERT(lgh); + + req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + if (name) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(name) + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_CREATE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + ptlrpc_request_set_replen(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (logid) + body->lgd_logid = *logid; + body->lgd_ctxt_idx = ctxt->loc_idx - 1; + + if (name) { + char *tmp; + tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME, + strlen(name) + 1); + LASSERT(tmp); + strcpy(tmp, name); + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + lgh->lgh_id = body->lgd_logid; + lgh->lgh_ctxt = ctxt; + EXIT; +out: + LLOG_CLIENT_EXIT(ctxt, imp); + ptlrpc_req_finished(req); + return rc; +} + +static int llog_client_next_block(const struct lu_env *env, + struct llog_handle *loghandle, + int *cur_idx, int next_idx, + __u64 *cur_offset, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (req == NULL) + GOTO(err_exit, rc =-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = next_idx; + body->lgd_saved_index = *cur_idx; + body->lgd_len = len; + body->lgd_cur_offset = *cur_offset; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + /* -EIO has a special meaning here. If llog_osd_next_block() + * reaches the end of the log without finding the desired + * record then it updates *cur_offset and *cur_idx and returns + * -EIO. In llog_process_thread() we use this to detect + * EOF. But we must be careful to distinguish between -EIO + * coming from llog_osd_next_block() and -EIO coming from + * ptlrpc or below. 
*/ + if (rc == -EIO) { + if (req->rq_repmsg == NULL || + lustre_msg_get_status(req->rq_repmsg) != -EIO) + GOTO(out, rc); + } else if (rc < 0) { + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + *cur_idx = body->lgd_saved_index; + *cur_offset = body->lgd_cur_offset; + + if (rc < 0) + GOTO(out, rc); + + /* The log records are swabbed as they are processed */ + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) + GOTO(out, rc =-EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_PREV_BLOCK); + if (req == NULL) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = prev_idx; + body->lgd_len = len; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc =-EFAULT); + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) + GOTO(out, rc =-EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_rec_hdr *llh_hdr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp,&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_READ_HEADER); + if (req == NULL) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = handle->lgh_id; + body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = handle->lgh_hdr->llh_flags; + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + if (hdr == NULL) + GOTO(out, rc =-EFAULT); + + if (handle->lgh_hdr_size < hdr->llh_hdr.lrh_len) + GOTO(out, rc = -EFAULT); + + memcpy(handle->lgh_hdr, hdr, hdr->llh_hdr.lrh_len); + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + + /* sanity checks */ + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("bad log header magic: %#x (expecting %#x)\n", + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + rc = -EIO; + } else if (llh_hdr->lrh_len != + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len || + (llh_hdr->lrh_len & (llh_hdr->lrh_len - 1)) != 0 || + llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { + CERROR("incorrectly 
sized log header: %#x, " + "expecting %#x (power of two > 8192)\n", + llh_hdr->lrh_len, + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_close(const struct lu_env *env, + struct llog_handle *handle) +{ + /* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because + the servers all close the file at the end of every + other LLOG_ RPC. */ + return(0); +} + +struct llog_operations llog_client_ops = { + .lop_next_block = llog_client_next_block, + .lop_prev_block = llog_client_prev_block, + .lop_read_header = llog_client_read_header, + .lop_open = llog_client_open, + .lop_close = llog_client_close, +}; +EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c new file mode 100644 index 0000000000000..9036491a1a89a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_net.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include + +int llog_initiator_connect(struct llog_ctxt *ctxt) +{ + struct obd_import *new_imp; + ENTRY; + + LASSERT(ctxt); + new_imp = ctxt->loc_obd->u.cli.cl_import; + LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp, + "%p - %p\n", ctxt->loc_imp, new_imp); + mutex_lock(&ctxt->loc_mutex); + if (ctxt->loc_imp != new_imp) { + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(new_imp); + } + mutex_unlock(&ctxt->loc_mutex); + RETURN(0); +} +EXPORT_SYMBOL(llog_initiator_connect); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c new file mode 100644 index 0000000000000..ca91a1c9491ac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c @@ -0,0 +1,287 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_server.c + * + * remote api for llog - server side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include + +static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh) +{ + if (lgh->lgh_hdr != NULL && lgh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + return llog_cat_close(env, lgh); + else + return llog_close(env, lgh); +} + +/* Only open is supported, no new llog can be created remotely */ +int llog_origin_handle_open(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + char *name = NULL; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (ostid_id(&body->lgd_logid.lgl_oi) > 0) + logid = &body->lgd_logid; + + if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) { + name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + if (name == NULL) + RETURN(-EFAULT); + CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name); + } + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d name=%s\n", + obd->obd_name, body->lgd_ctxt_idx, name); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(obd, body->lgd_ctxt_idx); + if (ctxt == NULL) { + CDEBUG(D_WARNING, "%s: no ctxt. 
group=%p idx=%d name=%s\n", + obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name); + RETURN(-ENODEV); + } + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid, + name, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + + llog_origin_close(req->rq_svc_thread->t_env, loghandle); + EXIT; +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_next_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_MIN_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_next_block(req->rq_svc_thread->t_env, loghandle, + &repbody->lgd_saved_index, repbody->lgd_index, + &repbody->lgd_cur_offset, ptr, + LLOG_MIN_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_prev_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_MIN_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle, + body->lgd_index, ptr, LLOG_MIN_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + 
llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_read_header(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_ctxt *ctxt; + __u32 flags; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + /* + * llog_init_handle() reads the llog header + */ + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + flags = loghandle->lgh_hdr->llh_flags; + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + *hdr = *loghandle->lgh_hdr; + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c new file mode 100644 index 0000000000000..bf7d4164cc071 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -0,0 +1,1469 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + + +static struct ll_rpc_opcode { + __u32 opcode; + const char *opname; +} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { + { OST_REPLY, "ost_reply" }, + { OST_GETATTR, "ost_getattr" }, + { OST_SETATTR, "ost_setattr" }, + { OST_READ, "ost_read" }, + { OST_WRITE, "ost_write" }, + { OST_CREATE , "ost_create" }, + { OST_DESTROY, "ost_destroy" }, + { OST_GET_INFO, "ost_get_info" }, + { OST_CONNECT, "ost_connect" }, + { OST_DISCONNECT, "ost_disconnect" }, + { OST_PUNCH, "ost_punch" }, + { OST_OPEN, "ost_open" }, + { OST_CLOSE, "ost_close" }, + { OST_STATFS, "ost_statfs" }, + { 14, NULL }, /* formerly OST_SAN_READ */ + { 15, NULL }, /* formerly OST_SAN_WRITE */ + { OST_SYNC, "ost_sync" }, + { OST_SET_INFO, "ost_set_info" }, + { OST_QUOTACHECK, "ost_quotacheck" }, + { OST_QUOTACTL, "ost_quotactl" }, + { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, + { OST_LADVISE, "ost_ladvise" }, + { MDS_GETATTR, "mds_getattr" }, + { MDS_GETATTR_NAME, "mds_getattr_lock" }, + { MDS_CLOSE, "mds_close" }, + { MDS_REINT, "mds_reint" }, + { MDS_READPAGE, "mds_readpage" }, + { MDS_CONNECT, "mds_connect" }, + { MDS_DISCONNECT, "mds_disconnect" }, + { MDS_GET_ROOT, "mds_get_root" }, + { MDS_STATFS, "mds_statfs" }, + { MDS_PIN, "mds_pin" }, + { MDS_UNPIN, "mds_unpin" }, + { MDS_SYNC, "mds_sync" }, + { MDS_DONE_WRITING, "mds_done_writing" }, + { MDS_SET_INFO, "mds_set_info" }, + { MDS_QUOTACHECK, "mds_quotacheck" }, + { MDS_QUOTACTL, "mds_quotactl" }, + { MDS_GETXATTR, "mds_getxattr" }, + { MDS_SETXATTR, "mds_setxattr" }, + { MDS_WRITEPAGE, "mds_writepage" }, + { MDS_IS_SUBDIR, "mds_is_subdir" }, + { MDS_GET_INFO, "mds_get_info" }, + { MDS_HSM_STATE_GET, "mds_hsm_state_get" }, + { MDS_HSM_STATE_SET, "mds_hsm_state_set" }, + { MDS_HSM_ACTION, "mds_hsm_action" }, + { MDS_HSM_PROGRESS, "mds_hsm_progress" }, + { MDS_HSM_REQUEST, "mds_hsm_request" }, + { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, + { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, + { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { MDS_RMFID, "mds_rmfid" }, + { LDLM_ENQUEUE, "ldlm_enqueue" }, + { LDLM_CONVERT, "ldlm_convert" }, + { LDLM_CANCEL, "ldlm_cancel" }, + { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, + { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, + { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, + { LDLM_SET_INFO, "ldlm_set_info" }, + { MGS_CONNECT, "mgs_connect" }, + { MGS_DISCONNECT, "mgs_disconnect" }, + { MGS_EXCEPTION, "mgs_exception" }, + { MGS_TARGET_REG, "mgs_target_reg" }, + { MGS_TARGET_DEL, "mgs_target_del" }, + { MGS_SET_INFO, "mgs_set_info" }, + { MGS_CONFIG_READ, "mgs_config_read" }, + { OBD_PING, "obd_ping" }, + { 401, /* was OBD_LOG_CANCEL */ "llog_cancel" }, + { 402, /* was OBD_QC_CALLBACK */ "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, + { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, + { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, + { LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" }, + { 504, /*LLOG_ORIGIN_HANDLE_WRITE_REC*/"llog_origin_handle_write_rec" }, + { 505, /* was LLOG_ORIGIN_HANDLE_CLOSE */ "llog_origin_handle_close" }, + { 506, /* was LLOG_ORIGIN_CONNECT */ "llog_origin_connect" }, + { 507, /* was LLOG_CATINFO */ "llog_catinfo" }, + { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, + { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, + { QUOTA_DQACQ, "quota_acquire" }, + { QUOTA_DQREL, "quota_release" }, + { 
SEQ_QUERY, "seq_query" }, + { SEC_CTX_INIT, "sec_ctx_init" }, + { SEC_CTX_INIT_CONT,"sec_ctx_init_cont" }, + { SEC_CTX_FINI, "sec_ctx_fini" }, + { FLD_QUERY, "fld_query" }, + { FLD_READ, "fld_read" }, + { OUT_UPDATE, "out_update" }, + { LFSCK_NOTIFY, "lfsck_notify" }, + { LFSCK_QUERY, "lfsck_query" }, +}; + +static struct ll_eopcode { + __u32 opcode; + const char *opname; +} ll_eopcode_table[EXTRA_LAST_OPC] = { + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { MDS_REINT_RESYNC, "mds_reint_resync" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, +}; + +const char *ll_opcode2str(__u32 opcode) +{ + /* When one of the assertions below fail, chances are that: + * 1) A new opcode was added in include/lustre/lustre_idl.h, + * but is missing from the table above. + * or 2) The opcode space was renumbered or rearranged, + * and the opcode_offset() function in + * ptlrpc_internal.h needs to be modified. + */ + __u32 offset = opcode_offset(opcode); + LASSERTF(offset < LUSTRE_MAX_OPCODES, + "offset %u >= LUSTRE_MAX_OPCODES %u\n", + offset, LUSTRE_MAX_OPCODES); + LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode, + "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n", + offset, ll_rpc_opcode_table[offset].opcode, opcode); + return ll_rpc_opcode_table[offset].opname; +} + +const int ll_str2opcode(const char *ops) +{ + int i; + + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + if (ll_rpc_opcode_table[i].opname != NULL && + strcmp(ll_rpc_opcode_table[i].opname, ops) == 0) + return ll_rpc_opcode_table[i].opcode; + } + + return -EINVAL; +} + +static const char *ll_eopcode2str(__u32 opcode) +{ + LASSERT(ll_eopcode_table[opcode].opcode == opcode); + return ll_eopcode_table[opcode].opname; +} + +static void +ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, + struct dentry **debugfs_root_ret, + struct lprocfs_stats **stats_ret) +{ + struct dentry *svc_debugfs_entry; + struct lprocfs_stats *svc_stats; + int i, rc; + unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV; + + LASSERT(!*debugfs_root_ret); + LASSERT(!*stats_ret); + + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES, + 0); + if (!svc_stats) + return; + + if (dir) { + svc_debugfs_entry = ldebugfs_register(dir, root, NULL, NULL); + if (IS_ERR(svc_debugfs_entry)) { + lprocfs_free_stats(&svc_stats); + return; + } + } else { + svc_debugfs_entry = root; + } + + lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, + svc_counter_config, "req_waittime", "usec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, + svc_counter_config, "req_qdepth", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, + svc_counter_config, "req_active", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, + svc_counter_config, "req_timeout", "sec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, + svc_counter_config, "reqbuf_avail", "bufs"); + for (i = 0; i < EXTRA_LAST_OPC; i++) { + char *units; + + switch (i) { + case BRW_WRITE_BYTES: + case 
BRW_READ_BYTES: + units = "bytes"; + break; + default: + units = "reqs"; + break; + } + lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i, + svc_counter_config, + ll_eopcode2str(i), units); + } + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + __u32 opcode = ll_rpc_opcode_table[i].opcode; + lprocfs_counter_init(svc_stats, + EXTRA_MAX_OPCODES + i, svc_counter_config, + ll_opcode2str(opcode), "usec"); + } + + rc = ldebugfs_register_stats(svc_debugfs_entry, name, svc_stats); + if (rc < 0) { + if (dir) + ldebugfs_remove(&svc_debugfs_entry); + lprocfs_free_stats(&svc_stats); + } else { + if (dir) + *debugfs_root_ret = svc_debugfs_entry; + *stats_ret = svc_stats; + } +} + +static int +ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_hist_nrqbds; + + seq_printf(m, "%d\n", total); + return 0; +} + + +LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); + +static int +ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svc->srv_hist_nrqbds_cpt_max; + + seq_printf(m, "%d\n", total); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + unsigned long long val; + unsigned long long limit; + int bufpages; + int rc; + + rc = kstrtoull_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val < 0 || val > INT_MAX) + return -ERANGE; + + /* This sanity check is more of an insanity check; we can still + * hose a kernel by allowing the request history to grow too + * far. 
The roundup to the next power of two is an empirical way + * to take care that request buffer is allocated in Slab and thus + * will be upgraded */ + bufpages = (roundup_pow_of_two(svc->srv_buf_size) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + limit = cfs_totalram_pages() / (2 * bufpages); + /* do not allow history to consume more than half max number of rqbds */ + if ((svc->srv_nrqbds_max == 0 && val > limit) || + (svc->srv_nrqbds_max != 0 && val > svc->srv_nrqbds_max / 2)) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + if (val == 0) + svc->srv_hist_nrqbds_cpt_max = 0; + else + svc->srv_hist_nrqbds_cpt_max = + max(1, ((int)val / svc->srv_ncpts)); + + spin_unlock(&svc->srv_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); + +static int +ptlrpc_lprocfs_req_buffers_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + + seq_printf(m, "%d\n", svc->srv_nrqbds_max); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_req_buffers_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + int val; + int rc; + + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val < svc->srv_nbuf_per_group && val != 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + svc->srv_nrqbds_max = (uint)val; + + spin_unlock(&svc->srv_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_buffers_max); + +static ssize_t threads_min_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); +} + +static ssize_t threads_min_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_init = (int)val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(threads_min); + +static ssize_t threads_started_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_nthrs_running; + + return sprintf(buf, "%d\n", total); +} +LUSTRE_RO_ATTR(threads_started); + +static ssize_t threads_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); +} + +static ssize_t threads_max_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val < svc->srv_nthrs_cpt_init 
* svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_limit = (int)val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(threads_max); + +/** + * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. + * + * \param[in] state The policy state + */ +static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state) +{ + switch (state) { + default: + LBUG(); + case NRS_POL_STATE_INVALID: + return "invalid"; + case NRS_POL_STATE_STOPPED: + return "stopped"; + case NRS_POL_STATE_STOPPING: + return "stopping"; + case NRS_POL_STATE_STARTING: + return "starting"; + case NRS_POL_STATE_STARTED: + return "started"; + } +} + +/** + * Obtains status information for \a policy. + * + * Information is copied in \a info. + * + * \param[in] policy The policy + * \param[out] info Holds returned status information + */ +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info) +{ + LASSERT(policy != NULL); + LASSERT(info != NULL); + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + CLASSERT(sizeof(info->pi_arg) == sizeof(policy->pol_arg)); + memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); + memcpy(info->pi_arg, policy->pol_arg, sizeof(policy->pol_arg)); + + info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK); + info->pi_state = policy->pol_state; + /** + * XXX: These are accessed without holding + * ptlrpc_service_part::scp_req_lock. + */ + info->pi_req_queued = policy->pol_req_queued; + info->pi_req_started = policy->pol_req_started; +} + +/** + * Reads and prints policy status information for all policies of a PTLRPC + * service. + */ +static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_pol_info *infos; + struct ptlrpc_nrs_pol_info tmp; + unsigned num_pols; + unsigned pol_idx = 0; + bool hp = false; + int i; + int rc = 0; + ENTRY; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Use the first service partition's regular NRS head in order to obtain + * the number of policies registered with NRS heads of this service. All + * service partitions will have the same number of policies. + */ + nrs = nrs_svcpt2nrs(svc->srv_parts[0], false); + + spin_lock(&nrs->nrs_lock); + num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols; + spin_unlock(&nrs->nrs_lock); + + OBD_ALLOC(infos, num_pols * sizeof(*infos)); + if (infos == NULL) + GOTO(out, rc = -ENOMEM); +again: + + ptlrpc_service_for_each_part(svcpt, i, svc) { + nrs = nrs_svcpt2nrs(svcpt, hp); + spin_lock(&nrs->nrs_lock); + + pol_idx = 0; + + list_for_each_entry(policy, &nrs->nrs_policy_list, + pol_list) { + LASSERT(pol_idx < num_pols); + + nrs_policy_get_info_locked(policy, &tmp); + /** + * Copy values when handling the first service + * partition. + */ + if (i == 0) { + memcpy(infos[pol_idx].pi_name, tmp.pi_name, + NRS_POL_NAME_MAX); + memcpy(infos[pol_idx].pi_arg, tmp.pi_arg, + sizeof(tmp.pi_arg)); + memcpy(&infos[pol_idx].pi_state, &tmp.pi_state, + sizeof(tmp.pi_state)); + infos[pol_idx].pi_fallback = tmp.pi_fallback; + /** + * For the rest of the service partitions + * sanity-check the values we get. 
+ */ + } else { + if (strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) != 0) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_name: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } + if (strncmp(infos[pol_idx].pi_arg, + tmp.pi_arg, + sizeof(tmp.pi_arg)) != 0) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_arg: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } + /** + * Not checking ptlrpc_nrs_pol_info::pi_state, + * because it may be different between + * instances of the same policy in different + * service partitions. + */ + + if (infos[pol_idx].pi_fallback != + tmp.pi_fallback) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_fallback: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } + } + + infos[pol_idx].pi_req_queued += tmp.pi_req_queued; + infos[pol_idx].pi_req_started += tmp.pi_req_started; + + pol_idx++; + } + spin_unlock(&nrs->nrs_lock); + } + + /** + * Policy status information output is in YAML format. + * For example: + * + * regular_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 0 + * + * - name: crrn + * state: started + * fallback: no + * queued: 2015 + * active: 384 + * + * high_priority_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 2 + * + * - name: crrn + * state: stopped + * fallback: no + * queued: 0 + * active: 0 + */ + seq_printf(m, "%s\n", !hp ? "\nregular_requests:" : + "high_priority_requests:"); + + for (pol_idx = 0; pol_idx < num_pols; pol_idx++) { + if (strlen(infos[pol_idx].pi_arg) > 0) + seq_printf(m, " - name: %s %s\n", + infos[pol_idx].pi_name, + infos[pol_idx].pi_arg); + else + seq_printf(m, " - name: %s\n", + infos[pol_idx].pi_name); + + + seq_printf(m, " state: %s\n" + " fallback: %s\n" + " queued: %-20d\n" + " active: %-20d\n\n", + nrs_state2str(infos[pol_idx].pi_state), + infos[pol_idx].pi_fallback ? "yes" : "no", + (int)infos[pol_idx].pi_req_queued, + (int)infos[pol_idx].pi_req_started); + } + + if (!hp && nrs_svc_has_hp(svc)) { + memset(infos, 0, num_pols * sizeof(*infos)); + + /** + * Redo the processing for the service's HP NRS heads' policies. + */ + hp = true; + goto again; + } + +out: + if (infos) + OBD_FREE(infos, num_pols * sizeof(*infos)); + + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} + + +#define LPROCFS_NRS_WR_MAX_ARG (1024) +/** + * The longest valid command string is the maxium policy name size, plus the + * length of the " reg" substring, plus the lenght of argument + */ +#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1 \ + + LPROCFS_NRS_WR_MAX_ARG) + +/** + * Starts and stops a given policy on a PTLRPC service. + * + * Commands consist of the policy name, followed by an optional [reg|hp] token; + * if the optional token is omitted, the operation is performed on both the + * regular and high-priority (if the service has one) NRS head. 
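Concretely (illustrative command strings only, using the 'fifo' policy named in the YAML sample above), writing to this file works as follows:

        "fifo"          apply the operation to the regular and, if present, the high-priority head
        "fifo hp"       apply it to the high-priority head only
        "fifo reg arg"  apply it to the regular head, passing "arg" through to the policy

Anything after the policy name and the optional [reg|hp] token is handed to the policy unchanged as its argument string.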
+ */ +static ssize_t +ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + char *cmd; + char *cmd_copy = NULL; + char *policy_name; + char *queue_name; + int rc = 0; + ENTRY; + + if (count >= LPROCFS_NRS_WR_MAX_CMD) + GOTO(out, rc = -EINVAL); + + OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + /** + * strsep() modifies its argument, so keep a copy + */ + cmd_copy = cmd; + + if (copy_from_user(cmd, buffer, count)) + GOTO(out, rc = -EFAULT); + + cmd[count] = '\0'; + + policy_name = strsep(&cmd, " "); + + if (strlen(policy_name) > NRS_POL_NAME_MAX - 1) + GOTO(out, rc = -EINVAL); + + /** + * No [reg|hp] token has been specified + */ + if (cmd == NULL) + goto default_queue; + + queue_name = strsep(&cmd, " "); + /** + * The second token is either an optional [reg|hp] string, + * or arguments + */ + if (strcmp(queue_name, "reg") == 0) + queue = PTLRPC_NRS_QUEUE_REG; + else if (strcmp(queue_name, "hp") == 0) + queue = PTLRPC_NRS_QUEUE_HP; + else { + if (cmd != NULL) + *(cmd - 1) = ' '; + cmd = queue_name; + } + +default_queue: + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + rc = ptlrpc_nrs_policy_control(svc, queue, policy_name, + PTLRPC_NRS_CTL_START, + false, cmd); + + mutex_unlock(&nrs_core.nrs_mutex); +out: + if (cmd_copy) + OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD); + + RETURN(rc < 0 ? rc : count); +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs); + +/** @} nrs */ + +struct ptlrpc_srh_iterator { + int srhi_idx; + __u64 srhi_seq; + struct ptlrpc_request *srhi_req; +}; + +static int +ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt, + struct ptlrpc_srh_iterator *srhi, + __u64 seq) +{ + struct list_head *e; + struct ptlrpc_request *req; + + if (srhi->srhi_req != NULL && + srhi->srhi_seq > svcpt->scp_hist_seq_culled && + srhi->srhi_seq <= seq) { + /* If srhi_req was set previously, hasn't been culled and + * we're searching for a seq on or after it (i.e. more + * recent), search from it onwards. + * Since the service history is LRU (i.e. 
culled reqs will + * be near the head), we shouldn't have to do long + * re-scans */ + LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq, + "%s:%d: seek seq %llu, request seq %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + srhi->srhi_seq, srhi->srhi_req->rq_history_seq); + LASSERTF(!list_empty(&svcpt->scp_hist_reqs), + "%s:%d: seek offset %llu, request seq %llu, " + "last culled %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled); + e = &srhi->srhi_req->rq_history_list; + } else { + /* search from start */ + e = svcpt->scp_hist_reqs.next; + } + + while (e != &svcpt->scp_hist_reqs) { + req = list_entry(e, struct ptlrpc_request, rq_history_list); + + if (req->rq_history_seq >= seq) { + srhi->srhi_seq = req->rq_history_seq; + srhi->srhi_req = req; + return 0; + } + e = e->next; + } + + return -ENOENT; +} + +/* + * ptlrpc history sequence is used as "position" of seq_file, in some case, + * seq_read() will increase "position" to indicate reading the next + * element, however, low bits of history sequence are reserved for CPT id + * (check the details from comments before ptlrpc_req_add_history), which + * means seq_read() might change CPT id of history sequence and never + * finish reading of requests on a CPT. To make it work, we have to shift + * CPT id to high bits and timestamp to low bits, so seq_read() will only + * increase timestamp which can correctly indicate the next position. + */ + +/* convert seq_file pos to cpt */ +#define PTLRPC_REQ_POS2CPT(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (__u64)(pos) >> (64 - (svc)->srv_cpt_bits)) + +/* make up seq_file pos from cpt */ +#define PTLRPC_REQ_CPT2POS(svc, cpt) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (cpt) << (64 - (svc)->srv_cpt_bits)) + +/* convert sequence to position */ +#define PTLRPC_REQ_SEQ2POS(svc, seq) \ + ((svc)->srv_cpt_bits == 0 ? (seq) : \ + ((seq) >> (svc)->srv_cpt_bits) | \ + ((seq) << (64 - (svc)->srv_cpt_bits))) + +/* convert position to sequence */ +#define PTLRPC_REQ_POS2SEQ(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 
(pos) : \ + ((__u64)(pos) << (svc)->srv_cpt_bits) | \ + ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits))) + +static void * +ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_srh_iterator *srhi; + unsigned int cpt; + int rc; + int i; + + if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */ + CWARN("Failed to read request history because size of loff_t " + "%d can't match size of u64\n", (int)sizeof(loff_t)); + return NULL; + } + + OBD_ALLOC(srhi, sizeof(*srhi)); + if (srhi == NULL) + return NULL; + + srhi->srhi_seq = 0; + srhi->srhi_req = NULL; + + cpt = PTLRPC_REQ_POS2CPT(svc, *pos); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (i < cpt) /* skip */ + continue; + if (i > cpt) /* make up the lowest position for this CPT */ + *pos = PTLRPC_REQ_CPT2POS(svc, i); + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, + PTLRPC_REQ_POS2SEQ(svc, *pos)); + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +static void +ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) +{ + struct ptlrpc_srh_iterator *srhi = iter; + + if (srhi != NULL) + OBD_FREE(srhi, sizeof(*srhi)); +} + +static void * +ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, + void *iter, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + __u64 seq; + int rc; + int i; + + for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) { + svcpt = svc->srv_parts[i]; + + if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */ + srhi->srhi_req = NULL; + seq = srhi->srhi_seq = 0; + } else { /* the next sequence */ + seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); + } + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +/* common ost/mdt so_req_printer */ +void target_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! 
*/ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack */ + seq_printf(sf, "\n"); + break; + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with mds_handle() */ + fallthrough; + case RQ_PHASE_COMPLETE: + /* been handled by mds_handle() reply state possibly still + * volatile */ + seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); + break; + default: + DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase); + } +} +EXPORT_SYMBOL(target_print_req); + +static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request *req; + int rc; + + LASSERT(srhi->srhi_idx < svc->srv_ncpts); + + svcpt = svc->srv_parts[srhi->srhi_idx]; + + mutex_lock(&svcpt->scp_mutex); + spin_lock(&svcpt->scp_lock); + + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); + + if (rc == 0) { + struct timespec64 arrival, sent, arrivaldiff; + char nidstr[LNET_NIDSTR_SIZE]; + + req = srhi->srhi_req; + + libcfs_nid2str_r(req->rq_self, nidstr, sizeof(nidstr)); + arrival.tv_sec = req->rq_arrival_time.tv_sec; + arrival.tv_nsec = req->rq_arrival_time.tv_nsec; + sent.tv_sec = req->rq_sent; + sent.tv_nsec = 0; + arrivaldiff = timespec64_sub(sent, arrival); + + /* Print common req fields. + * CAVEAT EMPTOR: we're racing with the service handler + * here. The request could contain any old crap, so you + * must be just as careful as the service's request + * parser. Currently I only print stuff here I know is OK + * to look at coz it was set up in request_in_callback()!!! 
+ */ + seq_printf(s, "%lld:%s:%s:x%llu:%d:%s:%lld.%06lld:%lld.%06llds(%+lld.0s) ", + req->rq_history_seq, nidstr, + libcfs_id2str(req->rq_peer), req->rq_xid, + req->rq_reqlen, ptlrpc_rqphase2str(req), + (s64)req->rq_arrival_time.tv_sec, + (s64)(req->rq_arrival_time.tv_nsec / NSEC_PER_USEC), + (s64)arrivaldiff.tv_sec, + (s64)(arrivaldiff.tv_nsec / NSEC_PER_USEC), + (s64)(req->rq_sent - req->rq_deadline)); + if (svc->srv_ops.so_req_printer == NULL) + seq_printf(s, "\n"); + else + svc->srv_ops.so_req_printer(s, srhi->srhi_req); + } + + spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + + return rc; +} + +static int +ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) +{ + static struct seq_operations sops = { + .start = ptlrpc_lprocfs_svc_req_history_start, + .stop = ptlrpc_lprocfs_svc_req_history_stop, + .next = ptlrpc_lprocfs_svc_req_history_next, + .show = ptlrpc_lprocfs_svc_req_history_show, + }; + struct seq_file *seqf; + int rc; + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &sops); + if (rc) + return rc; + + seqf = file->private_data; + seqf->private = inode->i_private; + return 0; +} + +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + time64_t worstt; + unsigned int cur; + unsigned int worst; + int i; + + if (AT_OFF) { + seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + return 0; + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + cur = at_get(&svcpt->scp_at_estimate); + worst = svcpt->scp_at_estimate.at_worst_ever; + worstt = svcpt->scp_at_estimate.at_worst_time; + + seq_printf(m, "%10s : cur %3u worst %3u (at %lld, %llds ago) ", + "service", cur, worst, (s64)worstt, + (s64)(ktime_get_real_seconds() - worstt)); + + lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate); + } + + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); + +static ssize_t high_priority_ratio_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_hpreq_ratio); +} + +static ssize_t high_priority_ratio_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + int rc; + unsigned long val; + + rc = kstrtoul(buffer, 10, &val); + if (rc < 0) + return rc; + + spin_lock(&svc->srv_lock); + svc->srv_hpreq_ratio = val; + spin_unlock(&svc->srv_lock); + + return count; +} +LUSTRE_RW_ATTR(high_priority_ratio); + +static struct attribute *ptlrpc_svc_attrs[] = { + &lustre_attr_threads_min.attr, + &lustre_attr_threads_started.attr, + &lustre_attr_threads_max.attr, + &lustre_attr_high_priority_ratio.attr, + NULL, +}; + +static void ptlrpc_sysfs_svc_release(struct kobject *kobj) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + complete(&svc->srv_kobj_unregister); +} + +static struct kobj_type ptlrpc_svc_ktype = { + .default_attrs = ptlrpc_svc_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = ptlrpc_sysfs_svc_release, +}; + +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc) +{ + /* Let's see if we had a chance at initialization first */ + if (svc->srv_kobj.kset) { + kobject_put(&svc->srv_kobj); + wait_for_completion(&svc->srv_kobj_unregister); 
+ } +} + +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc) +{ + svc->srv_kobj.kset = parent; + init_completion(&svc->srv_kobj_unregister); + return kobject_init_and_add(&svc->srv_kobj, &ptlrpc_svc_ktype, + &parent->kobj, "%s", svc->srv_name); +} + +void ptlrpc_ldebugfs_register_service(struct dentry *entry, + struct ptlrpc_service *svc) +{ + struct ldebugfs_vars ldebugfs_vars[] = { + { .name = "req_buffer_history_len", + .fops = &ptlrpc_lprocfs_req_history_len_fops, + .data = svc }, + { .name = "req_buffer_history_max", + .fops = &ptlrpc_lprocfs_req_history_max_fops, + .data = svc }, + { .name = "timeouts", + .fops = &ptlrpc_lprocfs_timeouts_fops, + .data = svc }, + { .name = "nrs_policies", + .fops = &ptlrpc_lprocfs_nrs_fops, + .data = svc }, + { .name = "req_buffers_max", + .fops = &ptlrpc_lprocfs_req_buffers_max_fops, + .data = svc }, + { NULL } + }; + static struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, + }; + + int rc; + + ptlrpc_ldebugfs_register(entry, svc->srv_name, "stats", + &svc->srv_debugfs_entry, &svc->srv_stats); + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + return; + + ldebugfs_add_vars(svc->srv_debugfs_entry, ldebugfs_vars, NULL); + + rc = ldebugfs_seq_create(svc->srv_debugfs_entry, "req_history", + 0400, &req_history_fops, svc); + if (rc) + CWARN("Error adding the req_history file\n"); +} + +void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) +{ + ptlrpc_ldebugfs_register(obddev->obd_debugfs_entry, NULL, "stats", + &obddev->obd_svc_debugfs_entry, + &obddev->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); + +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount) +{ + struct lprocfs_stats *svc_stats; + __u32 op = lustre_msg_get_opc(req->rq_reqmsg); + int opc = opcode_offset(op); + + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (svc_stats == NULL || opc <= 0) + return; + LASSERT(opc < LUSTRE_MAX_OPCODES); + if (!(op == LDLM_ENQUEUE || op == MDS_REINT)) + lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount); +} + +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) +{ + struct lprocfs_stats *svc_stats; + int idx; + + if (!req->rq_import) + return; + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (!svc_stats) + return; + idx = lustre_msg_get_opc(req->rq_reqmsg); + switch (idx) { + case OST_READ: + idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR; + break; + case OST_WRITE: + idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR; + break; + default: + LASSERTF(0, "unsupported opcode %u\n", idx); + break; + } + + lprocfs_counter_add(svc_stats, idx, bytes); +} + +EXPORT_SYMBOL(ptlrpc_lprocfs_brw); + +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) +{ + if (!IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + ldebugfs_remove(&svc->srv_debugfs_entry); + + if (svc->srv_stats) + lprocfs_free_stats(&svc->srv_stats); +} + +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) +{ + /* cleanup first to allow concurrent access to device's + * stats via debugfs to complete safely + */ + lprocfs_obd_cleanup(obd); + + if (!IS_ERR_OR_NULL(obd->obd_svc_debugfs_entry)) + ldebugfs_remove(&obd->obd_svc_debugfs_entry); + + if (obd->obd_svc_stats) + lprocfs_free_stats(&obd->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); + +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + struct obd_device 
*obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct ptlrpc_request *req; + int rc; + + ENTRY; + LPROCFS_CLIMP_CHECK(obd); + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + LPROCFS_CLIMP_EXIT(obd); + if (!req) + RETURN(-ENOMEM); + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(ping_show); + +/* kept for older verison of tools. */ +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + return ping_show(kobj, attr, (char *)buffer); +} +EXPORT_SYMBOL(ping_store); + +/* Write the connection UUID to this file to attempt to connect to that node. + * The connection UUID is a node's primary NID. For example, + * "echo connection=192.168.0.1@tcp0::instance > .../import". + */ +ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct obd_import *imp = obd->u.cli.cl_import; + char *kbuf = NULL; + char *uuid; + char *ptr; + int do_reconn = 1; + const char prefix[] = "connection="; + const int prefix_len = sizeof(prefix) - 1; + + if (count > PAGE_SIZE - 1 || count <= prefix_len) + return -EINVAL; + + OBD_ALLOC(kbuf, count + 1); + if (kbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kbuf, buffer, count)) + GOTO(out, count = -EFAULT); + + kbuf[count] = 0; + + /* only support connection=uuid::instance now */ + if (strncmp(prefix, kbuf, prefix_len) != 0) + GOTO(out, count = -EINVAL); + + uuid = kbuf + prefix_len; + ptr = strstr(uuid, "::"); + if (ptr) { + u32 inst; + int rc; + + *ptr = 0; + do_reconn = 0; + ptr += 2; /* Skip :: */ + rc = kstrtouint(ptr, 10, &inst); + if (rc) { + CERROR("config: wrong instance # %s\n", ptr); + } else if (inst != imp->imp_connect_data.ocd_instance) { + CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted " + "target(%u/%u), reconnecting...\n", + imp->imp_obd->obd_name, + imp->imp_connect_data.ocd_instance, inst); + do_reconn = 1; + } else { + CDEBUG(D_INFO, "IR: %s has already been connecting to " + "new target(%u)\n", + imp->imp_obd->obd_name, inst); + } + } + + if (do_reconn) + ptlrpc_recover_import(imp, uuid, 1); + +out: + OBD_FREE(kbuf, count + 1); + return count; +} +EXPORT_SYMBOL(ldebugfs_import_seq_write); + +int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *n) +{ + struct obd_device *obd = m->private; + struct obd_import *imp = obd->u.cli.cl_import; + + LPROCFS_CLIMP_CHECK(obd); + seq_printf(m, "%d\n", !imp->imp_no_pinger_recover); + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_pinger_recov_seq_show); + +ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc < 0) + return rc; + + LPROCFS_CLIMP_CHECK(obd); + spin_lock(&imp->imp_lock); + imp->imp_no_pinger_recover = !val; + spin_unlock(&imp->imp_lock); + LPROCFS_CLIMP_EXIT(obd); + return count; +} +EXPORT_SYMBOL(lprocfs_pinger_recov_seq_write); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c new file mode 100644 index 0000000000000..f6e0f57e2c785 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c @@ 
-0,0 +1,1004 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * Helper function. Sends \a len bytes from \a base at offset \a offset + * over \a conn connection to portal \a portal. + * Returns 0 on success or error code. + */ +static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len, + enum lnet_ack_req ack, struct ptlrpc_cb_id *cbid, + lnet_nid_t self, struct lnet_process_id peer_id, + int portal, __u64 xid, unsigned int offset, + struct lnet_handle_md *bulk_cookie) +{ + int rc; + struct lnet_md md; + ENTRY; + + LASSERT (portal != 0); + CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id)); + md.start = base; + md.length = len; + md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; + md.options = PTLRPC_MD_OPTIONS; + md.user_ptr = cbid; + md.eq_handle = ptlrpc_eq_h; + LNetInvalidateMDHandle(&md.bulk_handle); + + if (bulk_cookie) { + md.bulk_handle = *bulk_cookie; + md.options |= LNET_MD_BULK_HANDLE; + } + + if (unlikely(ack == LNET_ACK_REQ && + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){ + /* don't ask for the ack to simulate failing client */ + ack = LNET_NOACK_REQ; + } + + rc = LNetMDBind (md, LNET_UNLINK, mdh); + if (unlikely(rc != 0)) { + CERROR ("LNetMDBind failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + RETURN (-ENOMEM); + } + + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", + len, portal, xid, offset); + + rc = LNetPut(self, *mdh, ack, + peer_id, portal, xid, offset, 0); + if (unlikely(rc != 0)) { + int rc2; + /* We're going to get an UNLINK event when I unlink below, + * which will complete just like any other failed send, so + * I fall through and return success here! */ + CERROR("LNetPut(%s, %d, %lld) failed: %d\n", + libcfs_id2str(peer_id), portal, xid, rc); + rc2 = LNetMDUnlink(*mdh); + LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); + } + + RETURN (0); +} + +static void mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, int count) +{ + int i; + + for (i = 0; i < count; i++) + LNetMDUnlink(bd_mds[i]); +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Prepare bulk descriptor for specified incoming request \a req that + * can fit \a nfrags * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on server-side after request was already + * received. + * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on + * error. 
+ */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops) +{ + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + + ENTRY; + LASSERT(ptlrpc_is_bulk_op_active(type)); + + desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops); + if (desc == NULL) + RETURN(NULL); + + desc->bd_export = class_export_get(exp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = server_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* NB we don't assign rq_bulk here; server-side requests are + * re-used, and the handler frees the bulk desc explicitly. */ + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_exp); + +/** + * Starts bulk transfer for descriptor \a desc on the server. + * Returns 0 on success or error code. + */ +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) +{ + struct obd_export *exp = desc->bd_export; + lnet_nid_t self_nid; + struct lnet_process_id peer_id; + int rc = 0; + __u64 mbits; + int posted_md; + int total_md; + struct lnet_md md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_PUT_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(ptlrpc_is_bulk_op_active(desc->bd_type)); + + LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + /* + * Multi-Rail: get the preferred self and peer NIDs from the + * request, so they are based on the route taken by the + * message. + */ + self_nid = desc->bd_req->rq_self; + peer_id = desc->bd_req->rq_source; + + /* NB total length may be 0 for a read past EOF, so we send 0 + * length bulks, since the client expects bulk events. + * + * The client may not need all of the bulk mbits for the RPC. The RPC + * used the mbits of the highest bulk mbits needed, and the server masks + * off high bits to get bulk count for this RPC. LU-1431 */ + mbits = desc->bd_req->rq_mbits & ~((__u64)desc->bd_md_max_brw - 1); + total_md = desc->bd_req->rq_mbits - mbits + 1; + + desc->bd_refs = total_md; + desc->bd_failure = 0; + + md.user_ptr = &desc->bd_cbid; + md.eq_handle = ptlrpc_eq_h; + md.threshold = 2; /* SENT and ACK/REPLY */ + + for (posted_md = 0; posted_md < total_md; mbits++) { + md.options = PTLRPC_MD_OPTIONS; + + /* NB it's assumed that source and sink buffer frags are + * page-aligned. 
Otherwise we'd have to send client bulk + * sizes over and split server buffer accordingly */ + ptlrpc_fill_bulk_md(&md, desc, posted_md); + rc = LNetMDBind(md, LNET_UNLINK, &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDBind failed for MD %u: rc = %d\n", + exp->exp_obd->obd_name, posted_md, rc); + LASSERT(rc == -ENOMEM); + if (posted_md == 0) { + desc->bd_md_count = 0; + RETURN(-ENOMEM); + } + break; + } + + /* LU-6441: last md is not sent and desc->bd_md_count == 1 */ + if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB3, + CFS_FAIL_ONCE) && + total_md > 1 && posted_md == total_md - 1) { + posted_md++; + continue; + } + + /* Network is about to get at the memory */ + if (ptlrpc_is_bulk_put_source(desc->bd_type)) + rc = LNetPut(self_nid, desc->bd_mds[posted_md], + LNET_ACK_REQ, peer_id, + desc->bd_portal, mbits, 0, 0); + else + rc = LNetGet(self_nid, desc->bd_mds[posted_md], + peer_id, desc->bd_portal, mbits, 0, false); + + posted_md++; + if (rc != 0) { + CERROR("%s: failed bulk transfer with %s:%u x%llu: " + "rc = %d\n", exp->exp_obd->obd_name, + libcfs_id2str(peer_id), desc->bd_portal, + mbits, rc); + break; + } + } + + if (rc != 0) { + /* Can't send, so we unlink the MD bound above. The UNLINK + * event this creates will signal completion with failure, + * so we return SUCCESS here! */ + spin_lock(&desc->bd_lock); + desc->bd_refs -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_refs >= 0); + + mdunlink_iterate_helper(desc->bd_mds, posted_md); + RETURN(0); + } + + CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d " + "id %s mbits %#llx-%#llx\n", desc->bd_iov_count, + desc->bd_nob, desc->bd_portal, libcfs_id2str(peer_id), + mbits - posted_md, mbits - 1); + + RETURN(0); +} + +/** + * Server side bulk abort. Idempotent. Not thread-safe (i.e. only + * serialises with completion callback) + */ +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) +{ + struct l_wait_info lwi; + int rc; + + LASSERT(!in_interrupt()); /* might sleep */ + + if (!ptlrpc_server_bulk_active(desc)) /* completed or */ + return; /* never started */ + + /* We used to poison the pages with 0xab here because we did not want to + * send any meaningful data over the wire for evicted clients (bug 9297) + * However, this is no longer safe now that we use the page cache on the + * OSS (bug 20560) */ + + /* The unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case, to give liblustre + * a chance to run server_bulk_callback()*/ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_server_bulk_active(desc), &lwi); + if (rc == 0) + return; + + LASSERT(rc == -ETIMEDOUT); + CWARN("Unexpectedly long timeout: desc %p\n", desc); + } +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Register bulk at the sender for later transfer. + * Returns 0 on success or error code. 
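The matchbits arithmetic used above is easier to follow with concrete numbers. Below is a minimal userspace sketch (not Lustre code; all values invented): because the first MD's match bits are aligned to the power-of-two max_brw, the server can recover both the first match bits and the MD count from the single rq_mbits value carried by the request (LU-1431). This is the masking done in ptlrpc_start_bulk_transfer() above and mirrored on the client in ptlrpc_register_bulk().

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                uint64_t max_brw = 4;                   /* MDs per RPC, power of two */
                uint64_t first   = 0x1000;              /* first MD mbits, aligned to max_brw */
                uint64_t nmd     = 3;                   /* client actually posted 3 MDs */
                uint64_t rq_mbits = first + nmd - 1;    /* the request carries the *last* mbits */

                /* server side: recover the first mbits and the MD count by masking */
                uint64_t srv_first = rq_mbits & ~(max_brw - 1);
                uint64_t srv_count = rq_mbits - srv_first + 1;

                printf("first mbits %#llx, md count %llu\n",
                       (unsigned long long)srv_first, (unsigned long long)srv_count);
                /* prints: first mbits 0x1000, md count 3 */
                return 0;
        }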
+ */ +int ptlrpc_register_bulk(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct lnet_process_id peer; + int rc = 0; + int rc2; + int posted_md; + int total_md; + __u64 mbits; + struct lnet_handle_me me_h; + struct lnet_md md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_nob > 0); + LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(desc->bd_req != NULL); + LASSERT(ptlrpc_is_bulk_op_passive(desc->bd_type)); + + /* cleanup the state of the bulk for it will be reused */ + if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) + desc->bd_nob_transferred = 0; + else if (desc->bd_nob_transferred != 0) + /* If the network failed after an RPC was sent, this condition + * could happen. Rather than assert (was here before), return + * an EIO error. */ + RETURN(-EIO); + + desc->bd_failure = 0; + + peer = desc->bd_import->imp_connection->c_peer; + + LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + total_md = desc->bd_md_count; + /* rq_mbits is matchbits of the final bulk */ + mbits = req->rq_mbits - desc->bd_md_count + 1; + + LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK), + "first mbits = x%llu, last mbits = x%llu\n", + mbits, req->rq_mbits); + LASSERTF(!(desc->bd_registered && + req->rq_send_state != LUSTRE_IMP_REPLAY) || + mbits != desc->bd_last_mbits, + "registered: %d rq_mbits: %llu bd_last_mbits: %llu\n", + desc->bd_registered, mbits, desc->bd_last_mbits); + + desc->bd_registered = 1; + desc->bd_last_mbits = mbits; + desc->bd_refs = total_md; + md.user_ptr = &desc->bd_cbid; + md.eq_handle = ptlrpc_eq_h; + md.threshold = 1; /* PUT or GET */ + + for (posted_md = 0; posted_md < desc->bd_md_count; + posted_md++, mbits++) { + md.options = PTLRPC_MD_OPTIONS | + (ptlrpc_is_bulk_op_get(desc->bd_type) ? + LNET_MD_OP_GET : LNET_MD_OP_PUT); + ptlrpc_fill_bulk_md(&md, desc, posted_md); + + if (posted_md > 0 && posted_md + 1 == desc->bd_md_count && + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_ATTACH)) { + rc = -ENOMEM; + } else { + rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, + LNET_UNLINK, LNET_INS_AFTER, &me_h); + } + if (rc != 0) { + CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, mbits, + posted_md, rc); + break; + } + + /* About to let the network at it... 
*/ + rc = LNetMDAttach(me_h, md, LNET_UNLINK, + &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, mbits, + posted_md, rc); + rc2 = LNetMEUnlink(me_h); + LASSERT(rc2 == 0); + break; + } + } + + if (rc != 0) { + LASSERT(rc == -ENOMEM); + spin_lock(&desc->bd_lock); + desc->bd_refs -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_refs >= 0); + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + req->rq_status = -ENOMEM; + desc->bd_registered = 0; + RETURN(-ENOMEM); + } + + spin_lock(&desc->bd_lock); + /* Holler if peer manages to touch buffers before he knows the mbits */ + if (desc->bd_refs != total_md) + CWARN("%s: Peer %s touched %d buffers while I registered\n", + desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), + total_md - desc->bd_refs); + spin_unlock(&desc->bd_lock); + + CDEBUG(D_NET, + "Setup %u bulk %s buffers: %u pages %u bytes, mbits x%#llx-%#llx, portal %u\n", + desc->bd_refs, + ptlrpc_is_bulk_op_get(desc->bd_type) ? "get-source" : "put-sink", + desc->bd_iov_count, desc->bd_nob, + desc->bd_last_mbits, req->rq_mbits, desc->bd_portal); + + RETURN(0); +} + +/** + * Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). + * Returns 1 on success or 0 if network unregistration failed for whatever + * reason. + */ +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct l_wait_info lwi; + int rc; + ENTRY; + + LASSERT(!in_interrupt()); /* might sleep */ + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + async && req->rq_bulk_deadline == 0 && cfs_fail_val == 0) + req->rq_bulk_deadline = ktime_get_real_seconds() + LONG_UNLINK; + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ + + /* the unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case to give liblustre + * a chance to run client_bulk_callback() */ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + /* Move to "Unregistering" phase as bulk was not unlinked yet. */ + ptlrpc_rqphase_move(req, RQ_PHASE_UNREG_BULK); + + /* Do not wait for unlink to finish. */ + if (async) + RETURN(0); + + for (;;) { + /* The wq argument is ignored by user-space wait_event macros */ + wait_queue_head_t *wq = (req->rq_set != NULL) ? 
+ &req->rq_set->set_waitq : + &req->rq_reply_waitq; + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(req, req->rq_next_phase); + RETURN(1); + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", + desc); + } + RETURN(0); +} + +static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + timeout_t service_timeout; + + service_timeout = clamp_t(timeout_t, ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec, 1, + (AT_OFF ? obd_timeout * 3 / 2 : at_max)); + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR) && + (req->rq_reqmsg != NULL) && + !(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY | + MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { + /* early replies, errors and recovery requests don't count + * toward our service time estimate */ + int oldse = at_measured(&svcpt->scp_at_estimate, + service_timeout); + + if (oldse != 0) { + DEBUG_REQ(D_ADAPTTO, req, + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svcpt->scp_at_estimate)); + } + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_timeout(req->rq_repmsg, service_timeout); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's an error reply during recovery. + * b=15815 + */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || + req->rq_export->exp_obd->obd_recovering)) { + lustre_msg_set_timeout(req->rq_repmsg, 0); + } else { + time64_t timeout; + + if (req->rq_export && req->rq_reqmsg != NULL && + (flags & PTLRPC_REPLY_EARLY) && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + struct obd_device *exp_obd = req->rq_export->exp_obd; + + timeout = ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec + + min_t(time64_t, at_extra, + exp_obd->obd_recovery_timeout / 4); + } else { + timeout = at_get(&svcpt->scp_at_estimate); + } + lustre_msg_set_timeout(req->rq_repmsg, timeout); + } + + if (req->rq_reqmsg && + !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x " + "req_flags=%#x magic=%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } +} + +/** + * Send request reply from request \a req reply buffer. + * \a flags defines reply types + * Returns 0 on success or error code + */ +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_connection *conn; + int rc; + + /* We must already have a reply buffer (only ptlrpc_error() may be + * called without one). The reply generated by sptlrpc layer (e.g. + * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must + * have a request buffer which is either the actual (swabbed) incoming + * request, or a saved copy if this is a req saved in + * target_queue_final_reply(). 
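The adaptive-timeout bookkeeping in ptlrpc_at_set_reply() above boils down to two values: the measured service time fed into the AT estimate, and the timeout reported back to the client. A minimal userspace sketch (not Lustre code; all numbers invented) of that arithmetic:

        #include <stdint.h>
        #include <stdio.h>

        static int64_t clamp64(int64_t v, int64_t lo, int64_t hi)
        {
                return v < lo ? lo : (v > hi ? hi : v);
        }

        int main(void)
        {
                int64_t at_max = 600, obd_timeout = 100;        /* illustrative values */
                int at_off = 0;
                int64_t arrival = 9997, now = 10000;            /* handling took ~3s */
                int at_estimate = 15;                           /* current AT estimate */
                int error_during_recovery = 0;

                /* actual service time, clamped to [1, at_max] (or 1.5 * obd_timeout) */
                int64_t service_time = clamp64(now - arrival, 1,
                                               at_off ? obd_timeout * 3 / 2 : at_max);

                /* timeout advertised to the client: 0 tells the client to ignore it */
                int64_t reported = error_during_recovery ? 0 : at_estimate;

                printf("service time %llds, reported timeout %llds\n",
                       (long long)service_time, (long long)reported);
                return 0;
        }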
+ */ + LASSERT (req->rq_no_reply == 0); + LASSERT (req->rq_reqbuf != NULL); + LASSERT (rs != NULL); + LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); + LASSERT (req->rq_repmsg != NULL); + LASSERT (req->rq_repmsg == rs->rs_msg); + LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); + LASSERT (rs->rs_cb_id.cbid_arg == rs); + + /* There may be no rq_export during failover */ + + if (unlikely(req->rq_export && req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failed obd's only send ENODEV */ + req->rq_type = PTL_RPC_MSG_ERR; + req->rq_status = -ENODEV; + CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", + req->rq_export->exp_obd->obd_minor); + } + + /* In order to keep interoprability with the client (< 2.3) which + * doesn't have pb_jobid in ptlrpc_body, We have to shrink the + * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise, the + * reply buffer on client will be overflow. + * + * XXX Remove this whenver we drop the interoprability with such client. + */ + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, + sizeof(struct ptlrpc_body_v2), 1); + + if (req->rq_type != PTL_RPC_MSG_ERR) + req->rq_type = PTL_RPC_MSG_REPLY; + + lustre_msg_set_type(req->rq_repmsg, req->rq_type); + lustre_msg_set_status(req->rq_repmsg, + ptlrpc_status_hton(req->rq_status)); + lustre_msg_set_opc(req->rq_repmsg, + req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); + + target_pack_pool_reply(req); + + ptlrpc_at_set_reply(req, flags); + + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) + conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); + else + conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + + if (unlikely(conn == NULL)) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } + ptlrpc_rs_addref(rs); /* +1 ref for the network */ + + rc = sptlrpc_svc_wrap_reply(req); + if (unlikely(rc)) + goto out; + + req->rq_sent = ktime_get_real_seconds(); + + rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, + &rs->rs_cb_id, req->rq_self, req->rq_source, + ptlrpc_req2svc(req)->srv_rep_portal, + req->rq_xid, req->rq_reply_off, NULL); +out: + if (unlikely(rc != 0)) + ptlrpc_req_drop_rs(req); + ptlrpc_connection_put(conn); + return rc; +} + +int ptlrpc_reply (struct ptlrpc_request *req) +{ + if (req->rq_no_reply) + return 0; + else + return (ptlrpc_send_reply(req, 0)); +} + +/** + * For request \a req send an error reply back. Create empty + * reply buffers if necessary. + */ +int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) +{ + int rc; + ENTRY; + + if (req->rq_no_reply) + RETURN(0); + + if (!req->rq_repmsg) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + RETURN(rc); + } + + if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && + req->rq_status != -EPERM && req->rq_status != -ENOENT && + req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) + req->rq_type = PTL_RPC_MSG_ERR; + + rc = ptlrpc_send_reply(req, may_be_difficult); + RETURN(rc); +} + +int ptlrpc_error(struct ptlrpc_request *req) +{ + return ptlrpc_send_error(req, 0); +} + +/** + * Send request \a request. + * if \a noreply is set, don't expect any reply back and don't set up + * reply buffers. + * Returns 0 on success or error code. 
+ */ +int ptl_send_rpc(struct ptlrpc_request *request, int noreply) +{ + int rc; + int rc2; + int mpflag = 0; + struct lnet_handle_md bulk_cookie; + struct ptlrpc_connection *connection; + struct lnet_handle_me reply_me_h; + struct lnet_md reply_md; + struct obd_import *imp = request->rq_import; + struct obd_device *obd = imp->imp_obd; + ENTRY; + + LNetInvalidateMDHandle(&bulk_cookie); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) + RETURN(0); + + LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); + LASSERT(request->rq_wait_ctx == 0); + + /* If this is a re-transmit, we're required to have disengaged + * cleanly from the previous attempt */ + LASSERT(!request->rq_receiving_reply); + LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && + (imp->imp_state == LUSTRE_IMP_FULL))); + + if (unlikely(obd != NULL && obd->obd_fail)) { + CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", + obd->obd_name); + /* this prevents us from waiting in ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = -ENODEV; + RETURN(-ENODEV); + } + + connection = imp->imp_connection; + + lustre_msg_set_handle(request->rq_reqmsg, + &imp->imp_remote_handle); + lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); + lustre_msg_set_conn_cnt(request->rq_reqmsg, + imp->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + imp->imp_msghdr_flags); + + /* If it's the first time to resend the request for EINPROGRESS, + * we need to allocate a new XID (see after_reply()), it's different + * from the resend for reply timeout. */ + if (request->rq_nr_resend != 0 && + list_empty(&request->rq_unreplied_list)) { + __u64 min_xid = 0; + /* resend for EINPROGRESS, allocate new xid to avoid reply + * reconstruction */ + spin_lock(&imp->imp_lock); + ptlrpc_assign_next_xid_nolock(request); + min_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_last_xid(request->rq_reqmsg, min_xid); + DEBUG_REQ(D_RPCTRACE, request, "Allocating new xid for " + "resend on EINPROGRESS"); + } + + if (request->rq_bulk != NULL) { + ptlrpc_set_bulk_mbits(request); + lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits); + } + + if (list_empty(&request->rq_unreplied_list) || + request->rq_xid <= imp->imp_known_replied_xid) { + DEBUG_REQ(D_ERROR, request, "xid: %llu, replied: %llu, " + "list_empty:%d\n", request->rq_xid, + imp->imp_known_replied_xid, + list_empty(&request->rq_unreplied_list)); + LBUG(); + } + + /** For enabled AT all request should have AT_SUPPORT in the + * FULL import state when OBD_CONNECT_AT is set */ + LASSERT(AT_OFF || imp->imp_state != LUSTRE_IMP_FULL || + (imp->imp_msghdr_flags & MSGHDR_AT_SUPPORT) || + !(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_AT)); + + if (request->rq_resend) { + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + if (request->rq_resend_cb != NULL) + request->rq_resend_cb(request, &request->rq_async_args); + } + if (request->rq_memalloc) + mpflag = cfs_memory_pressure_get_and_set(); + + rc = sptlrpc_cli_wrap_request(request); + if (rc) + GOTO(out, rc); + + /* bulk register should be done after wrap_request() */ + if (request->rq_bulk != NULL) { + rc = ptlrpc_register_bulk (request); + if (rc != 0) + GOTO(cleanup_bulk, rc); + /* + * All the mds in the request will have the same cpt + * encoded in the cookie. So we can just get the first + * one. 
+ */ + bulk_cookie = request->rq_bulk->bd_mds[0]; + } + + if (!noreply) { + LASSERT (request->rq_replen != 0); + if (request->rq_repbuf == NULL) { + LASSERT(request->rq_repdata == NULL); + LASSERT(request->rq_repmsg == NULL); + rc = sptlrpc_cli_alloc_repbuf(request, + request->rq_replen); + if (rc) { + /* this prevents us from looping in + * ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = rc; + GOTO(cleanup_bulk, rc); + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } + + rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ + connection->c_peer, request->rq_xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + GOTO(cleanup_bulk, rc = -ENOMEM); + } + } + + spin_lock(&request->rq_lock); + /* We are responsible for unlinking the reply buffer */ + request->rq_reply_unlinked = noreply; + request->rq_receiving_reply = !noreply; + /* Clear any flags that may be present from previous sends. */ + request->rq_req_unlinked = 0; + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_net_err = 0; + request->rq_resend = 0; + request->rq_restart = 0; + request->rq_reply_truncated = 0; + spin_unlock(&request->rq_lock); + + if (!noreply) { + reply_md.start = request->rq_repbuf; + reply_md.length = request->rq_repbuf_len; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE | + LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.eq_handle = ptlrpc_eq_h; + + /* We must see the unlink callback to set rq_reply_unlinked, + * so we can't auto-unlink */ + rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, + &request->rq_reply_md_h); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + spin_lock(&request->rq_lock); + /* ...but the MD attach didn't succeed... */ + request->rq_receiving_reply = 0; + spin_unlock(&request->rq_lock); + GOTO(cleanup_me, rc = -ENOMEM); + } + + CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu" + ", portal %u\n", + request->rq_repbuf_len, request->rq_xid, + request->rq_reply_portal); + } + + /* add references on request for request_out_callback */ + ptlrpc_request_addref(request); + if (obd != NULL && obd->obd_svc_stats != NULL) + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, + atomic_read(&imp->imp_inflight)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); + + request->rq_sent_ns = ktime_get_real(); + request->rq_sent = ktime_get_real_seconds(); + /* We give the server rq_timeout secs to process the req, and + add the network latency for our local timeout. 
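The local deadline set just below is simply the send time plus the server's processing budget plus the estimated network latency. A trivial userspace sketch (not Lustre code; numbers invented):

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                int64_t sent        = 10000;    /* ktime_get_real_seconds() at send time */
                int64_t rq_timeout  = 30;       /* server processing budget in seconds */
                int64_t net_latency = 5;        /* ptlrpc_at_get_net_latency() estimate */
                int64_t deadline    = sent + rq_timeout + net_latency;

                printf("request times out locally at %lld\n", (long long)deadline);
                return 0;
        }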
*/ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + + DEBUG_REQ(D_INFO, request, "send flg=%x", + lustre_msg_get_flags(request->rq_reqmsg)); + rc = ptl_send_buf(&request->rq_req_md_h, + request->rq_reqbuf, request->rq_reqdata_len, + LNET_NOACK_REQ, &request->rq_req_cbid, + LNET_NID_ANY, connection->c_peer, + request->rq_request_portal, + request->rq_xid, 0, &bulk_cookie); + if (likely(rc == 0)) + GOTO(out, rc); + + request->rq_req_unlinked = 1; + ptlrpc_req_finished(request); + if (noreply) + GOTO(out, rc); + + cleanup_me: + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT (rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); + + cleanup_bulk: + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. */ + ptlrpc_unregister_bulk(request, 0); + if (request->rq_bulk != NULL) + request->rq_bulk->bd_registered = 0; + out: + if (rc == -ENOMEM) { + /* set rq_sent so that this request is treated + * as a delayed send in the upper layers */ + request->rq_sent = ktime_get_real_seconds(); + } + + if (request->rq_memalloc) + cfs_memory_pressure_restore(mpflag); + + return rc; +} +EXPORT_SYMBOL(ptl_send_rpc); + +/** + * Register request buffer descriptor for request receiving. + */ +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; + static struct lnet_process_id match_id = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY + }; + int rc; + struct lnet_md md; + struct lnet_handle_me me_h; + + CDEBUG(D_NET, "LNetMEAttach: portal %d\n", + service->srv_req_portal); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) + return (-ENOMEM); + + /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, + * which means buffer can only be attached on local CPT, and LND + * threads can find it by grabbing a local lock */ + rc = LNetMEAttach(service->srv_req_portal, + match_id, 0, ~0, LNET_UNLINK, + rqbd->rqbd_svcpt->scp_cpt >= 0 ? + LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + return (-ENOMEM); + } + + LASSERT(rqbd->rqbd_refcount == 0); + rqbd->rqbd_refcount = 1; + + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = LNET_MD_THRESH_INF; + md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; + md.user_ptr = &rqbd->rqbd_cbid; + md.eq_handle = ptlrpc_eq_h; + + rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); + if (rc == 0) + return (0); + + CERROR("LNetMDAttach failed: %d; \n", rc); + LASSERT (rc == -ENOMEM); + rc = LNetMEUnlink (me_h); + LASSERT (rc == 0); + rqbd->rqbd_refcount = 0; + + return (-ENOMEM); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h new file mode 100644 index 0000000000000..6d6b9d7a04541 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h @@ -0,0 +1,206 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2013, 2017, Intel Corporation. + * + * Author: Joshua Walgenbach + */ + +#ifndef _NODEMAP_INTERNAL_H +#define _NODEMAP_INTERNAL_H + +#include +#include + +#define DEFAULT_NODEMAP "default" + +/* Turn on proc debug interface to allow OSS and + * MDS nodes to configure nodemap independently of + * MGS (since the nodemap distribution is not written + * yet */ +#define NODEMAP_PROC_DEBUG 1 + +/* Default nobody uid and gid values */ + +#define NODEMAP_NOBODY_UID 99 +#define NODEMAP_NOBODY_GID 99 + +struct lprocfs_static_vars; + +/* nodemap root proc directory under fs/lustre */ +extern struct proc_dir_entry *proc_lustre_nodemap_root; +/* flag if nodemap is active */ +extern bool nodemap_active; + +extern struct mutex active_config_lock; +extern struct nodemap_config *active_config; + +struct lu_nid_range { + /* unique id set by mgs */ + unsigned int rn_id; + /* lu_nodemap containing this range */ + struct lu_nodemap *rn_nodemap; + /* list for nodemap */ + struct list_head rn_list; + /* nid interval tree */ + struct interval_node rn_node; +}; + +struct lu_idmap { + /* uid/gid of client */ + __u32 id_client; + /* uid/gid on filesystem */ + __u32 id_fs; + /* tree mapping client ids to filesystem ids */ + struct rb_node id_client_to_fs; + /* tree mappung filesystem to client */ + struct rb_node id_fs_to_client; +}; + +/* first 4 bits of the nodemap_id is the index type */ +struct nodemap_key { + __u32 nk_nodemap_id; + union { + __u32 nk_range_id; + __u32 nk_id_client; + __u32 nk_unused; + }; +}; + +enum nodemap_idx_type { + NODEMAP_EMPTY_IDX = 0, /* index created with blank record */ + NODEMAP_CLUSTER_IDX = 1, /* a nodemap cluster of nodes */ + NODEMAP_RANGE_IDX = 2, /* nid range assigned to a nm cluster */ + NODEMAP_UIDMAP_IDX = 3, /* uid map assigned to a nm cluster */ + NODEMAP_GIDMAP_IDX = 4, /* gid map assigned to a nm cluster */ + NODEMAP_GLOBAL_IDX = 15, /* stores nodemap activation status */ +}; + +#define NM_TYPE_MASK 0x0FFFFFFF +#define NM_TYPE_SHIFT 28 + +static inline enum nodemap_idx_type nm_idx_get_type(unsigned int id) +{ + return id >> NM_TYPE_SHIFT; +} + +static inline __u32 nm_idx_set_type(unsigned int id, enum nodemap_idx_type t) +{ + return (id & NM_TYPE_MASK) | (t << NM_TYPE_SHIFT); +} + +void nodemap_config_set_active(struct nodemap_config *config); +struct lu_nodemap *nodemap_create(const char *name, + struct nodemap_config *config, + bool is_default); +void nodemap_putref(struct lu_nodemap *nodemap); +struct lu_nodemap *nodemap_lookup(const char *name); + +int nodemap_procfs_init(void); +void nodemap_procfs_exit(void); +int lprocfs_nodemap_register(struct lu_nodemap *nodemap, + bool is_default_nodemap); +void lprocfs_nodemap_remove(struct nodemap_pde *nodemap_pde); +struct 
lu_nid_range *nodemap_range_find(lnet_nid_t start_nid, + lnet_nid_t end_nid); +struct lu_nid_range *range_create(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t start_nid, lnet_nid_t end_nid, + struct lu_nodemap *nodemap, + unsigned int range_id); +void range_destroy(struct lu_nid_range *range); +int range_insert(struct nodemap_range_tree *nm_range_tree, + struct lu_nid_range *data); +void range_delete(struct nodemap_range_tree *nm_range_tree, + struct lu_nid_range *data); +struct lu_nid_range *range_search(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t nid); +struct lu_nid_range *range_find(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t start_nid, lnet_nid_t end_nid); +int range_parse_nidstring(char *range_string, lnet_nid_t *start_nid, + lnet_nid_t *end_nid); +void range_init_tree(void); +struct lu_idmap *idmap_create(__u32 client_id, __u32 fs_id); +struct lu_idmap *idmap_insert(enum nodemap_id_type id_type, + struct lu_idmap *idmap, + struct lu_nodemap *nodemap); +void idmap_delete(enum nodemap_id_type id_type, struct lu_idmap *idmap, + struct lu_nodemap *nodemap); +void idmap_delete_tree(struct lu_nodemap *nodemap); +struct lu_idmap *idmap_search(struct lu_nodemap *nodemap, + enum nodemap_tree_type, + enum nodemap_id_type id_type, + __u32 id); +int nm_member_add(struct lu_nodemap *nodemap, struct obd_export *exp); +void nm_member_del(struct lu_nodemap *nodemap, struct obd_export *exp); +void nm_member_delete_list(struct lu_nodemap *nodemap); +struct lu_nodemap *nodemap_classify_nid(lnet_nid_t nid); +void nm_member_reclassify_nodemap(struct lu_nodemap *nodemap); +void nm_member_revoke_locks(struct lu_nodemap *nodemap); +void nm_member_revoke_locks_always(struct lu_nodemap *nodemap); +void nm_member_revoke_all(void); + +int nodemap_add_idmap_helper(struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_add_range_helper(struct nodemap_config *config, + struct lu_nodemap *nodemap, + const lnet_nid_t nid[2], + unsigned int range_id); + +struct rb_node *nm_rb_next_postorder(const struct rb_node *node); +struct rb_node *nm_rb_first_postorder(const struct rb_root *root); +void nodemap_getref(struct lu_nodemap *nodemap); +void nodemap_putref(struct lu_nodemap *nodemap); +int nm_hash_list_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, + void *nodemap_list_head); + +#define nm_rbtree_postorder_for_each_entry_safe(pos, n, \ + root, field) \ + for (pos = nm_rb_first_postorder(root) ? \ + rb_entry(nm_rb_first_postorder(root), typeof(*pos), \ + field) : NULL, \ + n = (pos && nm_rb_next_postorder(&pos->field)) ? \ + rb_entry(nm_rb_next_postorder(&pos->field), \ + typeof(*pos), field) : NULL; \ + pos != NULL; \ + pos = n, \ + n = (pos && nm_rb_next_postorder(&pos->field)) ? 
\ + rb_entry(nm_rb_next_postorder(&pos->field), \ + typeof(*pos), field) : NULL) + +int nodemap_idx_nodemap_add(const struct lu_nodemap *nodemap); +int nodemap_idx_nodemap_update(const struct lu_nodemap *nodemap); +int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap); +int nodemap_idx_idmap_add(const struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_idx_idmap_del(const struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_idx_range_add(const struct lu_nid_range *range, + const lnet_nid_t nid[2]); +int nodemap_idx_range_del(const struct lu_nid_range *range); +int nodemap_idx_nodemap_activate(bool value); +#endif /* _NODEMAP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c new file mode 100644 index 0000000000000..52d3225deba6b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c @@ -0,0 +1,1854 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2016, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs.c + * + * Network Request Scheduler (NRS) + * + * Allows to reorder the handling of RPCs at servers. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * NRS core object. + */ +struct nrs_core nrs_core; + +static int nrs_policy_init(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_desc->pd_ops->op_policy_init != NULL ? + policy->pol_desc->pd_ops->op_policy_init(policy) : 0; +} + +static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref == 0); + LASSERT(policy->pol_req_queued == 0); + + if (policy->pol_desc->pd_ops->op_policy_fini != NULL) + policy->pol_desc->pd_ops->op_policy_fini(policy); +} + +static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + /** + * The policy may be stopped, but the lprocfs files and + * ptlrpc_nrs_policy instances remain present until unregistration time. + * Do not perform the ctl operation if the policy is stopped, as + * policy->pol_private will be NULL in such a case. + */ + if (policy->pol_state == NRS_POL_STATE_STOPPED) + RETURN(-ENODEV); + + RETURN(policy->pol_desc->pd_ops->op_policy_ctl != NULL ? 
+ policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) : + -ENOSYS); +} + +static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy) +{ + ENTRY; + + if (policy->pol_desc->pd_ops->op_policy_stop != NULL) + policy->pol_desc->pd_ops->op_policy_stop(policy); + + LASSERT(list_empty(&policy->pol_list_queued)); + LASSERT(policy->pol_req_queued == 0 && + policy->pol_req_started == 0); + + policy->pol_private = NULL; + + policy->pol_state = NRS_POL_STATE_STOPPED; + + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + EXIT; +} + +static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + ENTRY; + + if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping) + RETURN(-EPERM); + + if (policy->pol_state == NRS_POL_STATE_STARTING) + RETURN(-EAGAIN); + + /* In progress or already stopped */ + if (policy->pol_state != NRS_POL_STATE_STARTED) + RETURN(0); + + policy->pol_state = NRS_POL_STATE_STOPPING; + + /* Immediately make it invisible */ + if (nrs->nrs_policy_primary == policy) { + nrs->nrs_policy_primary = NULL; + + } else { + LASSERT(nrs->nrs_policy_fallback == policy); + nrs->nrs_policy_fallback = NULL; + } + + /* I have the only refcount */ + if (policy->pol_ref == 1) + nrs_policy_stop0(policy); + + RETURN(0); +} + +/** + * Transitions the \a nrs NRS head's primary policy to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no + * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. + * + * \param[in] nrs the NRS head to carry out this operation on + */ +static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary; + ENTRY; + + if (tmp == NULL) { + /** + * XXX: This should really be RETURN_EXIT, but the latter does + * not currently print anything out, and possibly should be + * fixed to do so. + */ + EXIT; + return; + } + + nrs->nrs_policy_primary = NULL; + + LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED); + tmp->pol_state = NRS_POL_STATE_STOPPING; + + if (tmp->pol_ref == 0) + nrs_policy_stop0(tmp); + EXIT; +} + +/** + * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in + * response to an lprocfs command to start a policy. + * + * If a primary policy different to the current one is specified, this function + * will transition the new policy to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition + * the old primary policy (if there is one) to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. + * + * If the fallback policy is specified, this is taken to indicate an instruction + * to stop the current primary policy, without substituting it with another + * primary policy, so the primary policy (if any) is transitioned to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. In + * this case, the fallback policy is only left active in the NRS head. + */ +static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + int rc = 0; + ENTRY; + + /** + * Don't allow multiple starting which is too complex, and has no real + * benefit. 
+ */ + if (nrs->nrs_policy_starting) + RETURN(-EAGAIN); + + LASSERT(policy->pol_state != NRS_POL_STATE_STARTING); + + if (policy->pol_state == NRS_POL_STATE_STOPPING) + RETURN(-EAGAIN); + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This is for cases in which the user sets the policy to the + * fallback policy (currently fifo for all services); i.e. the + * user is resetting the policy to the default; so we stop the + * primary policy, if any. + */ + if (policy == nrs->nrs_policy_fallback) { + nrs_policy_stop_primary(nrs); + RETURN(0); + } + + /** + * If we reach here, we must be setting up the fallback policy + * at service startup time, and only a single policy with the + * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can + * register with NRS core. + */ + LASSERT(nrs->nrs_policy_fallback == NULL); + } else { + /** + * Shouldn't start primary policy if w/o fallback policy. + */ + if (nrs->nrs_policy_fallback == NULL) + RETURN(-EPERM); + + if (policy->pol_state == NRS_POL_STATE_STARTED) { + /** + * If the policy argument now is different from the last time, + * stop the policy first and start it again with the new + * argument. + */ + if ((arg != NULL) && (strlen(arg) >= NRS_POL_ARG_MAX)) + return -EINVAL; + + if ((arg == NULL && strlen(policy->pol_arg) == 0) || + (arg != NULL && strcmp(policy->pol_arg, arg) == 0)) + RETURN(0); + + rc = nrs_policy_stop_locked(policy); + if (rc) + RETURN(-EAGAIN); + } + } + + /** + * Increase the module usage count for policies registering from other + * modules. + */ + if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 && + !try_module_get(policy->pol_desc->pd_owner)) { + atomic_dec(&policy->pol_desc->pd_refs); + CERROR("NRS: cannot get module for policy %s; is it alive?\n", + policy->pol_desc->pd_name); + RETURN(-ENODEV); + } + + /** + * Serialize policy starting across the NRS head + */ + nrs->nrs_policy_starting = 1; + + policy->pol_state = NRS_POL_STATE_STARTING; + + if (policy->pol_desc->pd_ops->op_policy_start) { + spin_unlock(&nrs->nrs_lock); + + rc = policy->pol_desc->pd_ops->op_policy_start(policy, arg); + + spin_lock(&nrs->nrs_lock); + if (rc != 0) { + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + policy->pol_state = NRS_POL_STATE_STOPPED; + GOTO(out, rc); + } + } + + if (arg != NULL) { + if (strlcpy(policy->pol_arg, arg, sizeof(policy->pol_arg)) >= + sizeof(policy->pol_arg)) { + CERROR("NRS: arg '%s' is too long\n", arg); + GOTO(out, rc = -E2BIG); + } + } else { + policy->pol_arg[0] = '\0'; + } + + policy->pol_state = NRS_POL_STATE_STARTED; + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This path is only used at PTLRPC service setup time. + */ + nrs->nrs_policy_fallback = policy; + } else { + /* + * Try to stop the current primary policy if there is one. + */ + nrs_policy_stop_primary(nrs); + + /** + * And set the newly-started policy as the primary one. + */ + nrs->nrs_policy_primary = policy; + } + +out: + nrs->nrs_policy_starting = 0; + + RETURN(rc); +} + +/** + * Increases the policy's usage reference count. + */ +static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy) +{ + policy->pol_ref++; +} + +/** + * Decreases the policy's usage reference count, and stops the policy in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). 
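The interplay between the policy state machine and its usage references, described above for nrs_policy_stop_primary() and below for nrs_policy_put_locked(), can be modelled in a few lines. This is a simplified userspace sketch, not Lustre code: a policy asked to stop while requests still reference it only moves to STOPPING, and the final put performs the real stop.

        #include <assert.h>
        #include <stdio.h>

        enum pol_state { POL_STOPPED, POL_STARTED, POL_STOPPING };

        struct policy {
                enum pol_state  state;
                int             refs;   /* queued + started requests using it */
        };

        static void policy_stop_final(struct policy *p)
        {
                /* the real code also drops pol_private and the module reference */
                p->state = POL_STOPPED;
                printf("policy fully stopped\n");
        }

        static void policy_get(struct policy *p)
        {
                p->refs++;
        }

        static void policy_put(struct policy *p)
        {
                assert(p->refs > 0);
                if (--p->refs == 0 && p->state == POL_STOPPING)
                        policy_stop_final(p);
        }

        static void policy_stop(struct policy *p)
        {
                if (p->state != POL_STARTED)
                        return;
                p->state = POL_STOPPING;        /* new requests no longer see it */
                if (p->refs == 0)               /* nothing queued or started */
                        policy_stop_final(p);
        }

        int main(void)
        {
                struct policy p = { .state = POL_STARTED, .refs = 0 };

                policy_get(&p);                 /* a request is still being handled */
                policy_stop(&p);                /* admin switches policies */
                assert(p.state == POL_STOPPING);
                policy_put(&p);                 /* last user drops its reference */
                assert(p.state == POL_STOPPED);
                return 0;
        }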
+ */ +static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref > 0); + + policy->pol_ref--; + if (unlikely(policy->pol_ref == 0 && + policy->pol_state == NRS_POL_STATE_STOPPING)) + nrs_policy_stop0(policy); +} + +static void nrs_policy_put(struct ptlrpc_nrs_policy *policy) +{ + spin_lock(&policy->pol_nrs->nrs_lock); + nrs_policy_put_locked(policy); + spin_unlock(&policy->pol_nrs->nrs_lock); +} + +/** + * Find and return a policy by name. + */ +static struct ptlrpc_nrs_policy * nrs_policy_find_locked(struct ptlrpc_nrs *nrs, + char *name) +{ + struct ptlrpc_nrs_policy *tmp; + + list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) { + if (strncmp(tmp->pol_desc->pd_name, name, + NRS_POL_NAME_MAX) == 0) { + nrs_policy_get_locked(tmp); + return tmp; + } + } + return NULL; +} + +/** + * Release references for the resource hierarchy moving upwards towards the + * policy instance resource. + */ +static void nrs_resource_put(struct ptlrpc_nrs_resource *res) +{ + struct ptlrpc_nrs_policy *policy = res->res_policy; + + if (policy->pol_desc->pd_ops->op_res_put != NULL) { + struct ptlrpc_nrs_resource *parent; + + for (; res != NULL; res = parent) { + parent = res->res_parent; + policy->pol_desc->pd_ops->op_res_put(policy, res); + } + } +} + +/** + * Obtains references for each resource in the resource hierarchy for request + * \a nrq if it is to be handled by \a policy. + * + * \param[in] policy the policy + * \param[in] nrq the request + * \param[in] moving_req denotes whether this is a call to the function by + * ldlm_lock_reorder_req(), in order to move \a nrq to + * the high-priority NRS head; we should not sleep when + * set. + * + * \retval NULL resource hierarchy references not obtained + * \retval valid-pointer the bottom level of the resource hierarchy + * + * \see ptlrpc_nrs_pol_ops::op_res_get() + */ +static +struct ptlrpc_nrs_resource * nrs_resource_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + bool moving_req) +{ + /** + * Set to NULL to traverse the resource hierarchy from the top. + */ + struct ptlrpc_nrs_resource *res = NULL; + struct ptlrpc_nrs_resource *tmp = NULL; + int rc; + + while (1) { + rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res, + &tmp, moving_req); + if (rc < 0) { + if (res != NULL) + nrs_resource_put(res); + return NULL; + } + + LASSERT(tmp != NULL); + tmp->res_parent = res; + tmp->res_policy = policy; + res = tmp; + tmp = NULL; + /** + * Return once we have obtained a reference to the bottom level + * of the resource hierarchy. + */ + if (rc > 0) + return res; + } +} + +/** + * Obtains resources for the resource hierarchies and policy references for + * the fallback and current primary policy (if any), that will later be used + * to handle request \a nrq. + * + * \param[in] nrs the NRS head instance that will be handling request \a nrq. + * \param[in] nrq the request that is being handled. + * \param[out] resp the array where references to the resource hierarchy are + * stored. + * \param[in] moving_req is set when obtaining resources while moving a + * request from a policy on the regular NRS head to a + * policy on the HP NRS head (via + * ldlm_lock_reorder_req()). It signifies that + * allocations to get resources should be atomic; for + * a full explanation, see comment in + * ptlrpc_nrs_pol_ops::op_res_get(). 
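+ *
+ * On return, \e resp[NRS_RES_FALLBACK] always holds a valid resource
+ * reference, while \e resp[NRS_RES_PRIMARY] may be NULL if no primary
+ * policy is started, or if the primary policy declined to serve this
+ * particular request.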
+ */
+static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs,
+ struct ptlrpc_nrs_request *nrq,
+ struct ptlrpc_nrs_resource **resp,
+ bool moving_req)
+{
+ struct ptlrpc_nrs_policy *primary = NULL;
+ struct ptlrpc_nrs_policy *fallback = NULL;
+
+ memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX);
+
+ /**
+ * Obtain policy references.
+ */
+ spin_lock(&nrs->nrs_lock);
+
+ fallback = nrs->nrs_policy_fallback;
+ nrs_policy_get_locked(fallback);
+
+ primary = nrs->nrs_policy_primary;
+ if (primary != NULL)
+ nrs_policy_get_locked(primary);
+
+ spin_unlock(&nrs->nrs_lock);
+
+ /**
+ * Obtain resource hierarchy references.
+ */
+ resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req);
+ LASSERT(resp[NRS_RES_FALLBACK] != NULL);
+
+ if (primary != NULL) {
+ resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq,
+ moving_req);
+ /**
+ * A primary policy may exist which may not wish to serve a
+ * particular request for different reasons; release the
+ * reference on the policy as it will not be used for this
+ * request.
+ */
+ if (resp[NRS_RES_PRIMARY] == NULL)
+ nrs_policy_put(primary);
+ }
+}
+
+/**
+ * Releases references to resource hierarchies and policies, because they are no
+ * longer required; used when request handling has been completed, or the
+ * request is moving to the high priority NRS head.
+ *
+ * \param resp the resource hierarchy that is being released
+ *
+ * \see ptlrpc_nrs_req_hp_move()
+ * \see ptlrpc_nrs_req_finalize()
+ */
+static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp)
+{
+ struct ptlrpc_nrs_policy *pols[NRS_RES_MAX];
+ struct ptlrpc_nrs *nrs = NULL;
+ int i;
+
+ for (i = 0; i < NRS_RES_MAX; i++) {
+ if (resp[i] != NULL) {
+ pols[i] = resp[i]->res_policy;
+ nrs_resource_put(resp[i]);
+ resp[i] = NULL;
+ } else {
+ pols[i] = NULL;
+ }
+ }
+
+ for (i = 0; i < NRS_RES_MAX; i++) {
+ if (pols[i] == NULL)
+ continue;
+
+ if (nrs == NULL) {
+ nrs = pols[i]->pol_nrs;
+ spin_lock(&nrs->nrs_lock);
+ }
+ nrs_policy_put_locked(pols[i]);
+ }
+
+ if (nrs != NULL)
+ spin_unlock(&nrs->nrs_lock);
+}
+
+/**
+ * Obtains an NRS request from \a policy for handling or examination; the
+ * request should be removed in the 'handling' case.
+ *
+ * Calling into this function implies we already know the policy has a request
+ * waiting to be handled.
+ *
+ * \param[in] policy the policy from which a request is obtained
+ * \param[in] peek when set, signifies that we just want to examine the
+ * request, and not handle it, so the request is not removed
+ * from the policy.
+ * \param[in] force when set, it will force a policy to return a request if it
+ * has one pending
+ *
+ * \retval the NRS request to be handled
+ */
+static inline
+struct ptlrpc_nrs_request * nrs_request_get(struct ptlrpc_nrs_policy *policy,
+ bool peek, bool force)
+{
+ struct ptlrpc_nrs_request *nrq;
+
+ LASSERT(policy->pol_req_queued > 0);
+
+ nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force);
+
+ LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy));
+
+ return nrq;
+}
+
+/**
+ * Enqueues request \a nrq for later handling, via one of the policies for
+ * which resources were earlier obtained via nrs_resource_get_safe(). The
+ * function attempts to enqueue the request first on the primary policy
+ * (if any), since this is the preferred choice.
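+ *
+ * In practice this means the resource at index NRS_RES_PRIMARY (the higher
+ * of the two indices), if set, is offered the request before the one at
+ * index NRS_RES_FALLBACK, which is normally only used when no primary
+ * policy is serving this request.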
+ * + * \param nrq the request being enqueued + * + * \see nrs_resource_get_safe() + */ +static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy; + int rc; + int i; + + /** + * Try in descending order, because the primary policy (if any) is + * the preferred choice. + */ + for (i = NRS_RES_MAX - 1; i >= 0; i--) { + if (nrq->nr_res_ptrs[i] == NULL) + continue; + + nrq->nr_res_idx = i; + policy = nrq->nr_res_ptrs[i]->res_policy; + + rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq); + if (rc == 0) { + policy->pol_nrs->nrs_req_queued++; + policy->pol_req_queued++; + return; + } + } + /** + * Should never get here, as at least the primary policy's + * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always + * succeed. + */ + LBUG(); +} + +/** + * Called when a request has been handled + * + * \param[in] nrs the request that has been handled; can be used for + * job/resource control. + * + * \see ptlrpc_nrs_req_stop_nolock() + */ +static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq); + + if (policy->pol_desc->pd_ops->op_req_stop) + policy->pol_desc->pd_ops->op_req_stop(policy, nrq); + + LASSERT(policy->pol_nrs->nrs_req_started > 0); + LASSERT(policy->pol_req_started > 0); + + policy->pol_nrs->nrs_req_started--; + policy->pol_req_started--; +} + +/** + * Handler for operations that can be carried out on policies. + * + * Handles opcodes that are common to all policy types within NRS core, and + * passes any unknown opcodes to the policy-specific control function. + * + * \param[in] nrs the NRS head this policy belongs to. + * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name. + * \param[in] opc the opcode of the operation being carried out. + * \param[in,out] arg can be used to pass information in and out between when + * carrying an operation; usually data that is private to + * the policy at some level, or generic policy status + * information. + * + * \retval -ve error condition + * \retval 0 operation was carried out successfully + */ +static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct ptlrpc_nrs_policy *policy; + int rc = 0; + ENTRY; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) + GOTO(out, rc = -ENOENT); + + if (policy->pol_state != NRS_POL_STATE_STARTED && + policy->pol_state != NRS_POL_STATE_STOPPED) + GOTO(out, rc = -EAGAIN); + + switch (opc) { + /** + * Unknown opcode, pass it down to the policy-specific control + * function for handling. + */ + default: + rc = nrs_policy_ctl_locked(policy, opc, arg); + break; + + /** + * Start \e policy + */ + case PTLRPC_NRS_CTL_START: + rc = nrs_policy_start_locked(policy, arg); + break; + } +out: + if (policy != NULL) + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + RETURN(rc); +} + +/** + * Unregisters a policy by name. + * + * \param[in] nrs the NRS head this policy belongs to. 
+ * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name) +{ + struct ptlrpc_nrs_policy *policy = NULL; + ENTRY; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) { + spin_unlock(&nrs->nrs_lock); + + CERROR("Can't find NRS policy %s\n", name); + RETURN(-ENOENT); + } + + if (policy->pol_ref > 1) { + CERROR("Policy %s is busy with %d references\n", name, + (int)policy->pol_ref); + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + RETURN(-EBUSY); + } + + LASSERT(policy->pol_req_queued == 0); + LASSERT(policy->pol_req_started == 0); + + if (policy->pol_state != NRS_POL_STATE_STOPPED) { + nrs_policy_stop_locked(policy); + LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED); + } + + list_del(&policy->pol_list); + nrs->nrs_num_pols--; + + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + nrs_policy_fini(policy); + + LASSERT(policy->pol_private == NULL); + OBD_FREE_PTR(policy); + + RETURN(0); +} + +/** + * Register a policy from \policy descriptor \a desc with NRS head \a nrs. + * + * \param[in] nrs the NRS head on which the policy will be registered. + * \param[in] desc the policy descriptor from which the information will be + * obtained to register the policy. + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_register(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + int rc; + ENTRY; + + LASSERT(svcpt != NULL); + LASSERT(desc->pd_ops != NULL); + LASSERT(desc->pd_ops->op_res_get != NULL); + LASSERT(desc->pd_ops->op_req_get != NULL); + LASSERT(desc->pd_ops->op_req_enqueue != NULL); + LASSERT(desc->pd_ops->op_req_dequeue != NULL); + LASSERT(desc->pd_compat != NULL); + + OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable, + svcpt->scp_cpt, sizeof(*policy), GFP_NOFS); + if (policy == NULL) + RETURN(-ENOMEM); + + policy->pol_nrs = nrs; + policy->pol_desc = desc; + policy->pol_state = NRS_POL_STATE_STOPPED; + policy->pol_flags = desc->pd_flags; + + INIT_LIST_HEAD(&policy->pol_list); + INIT_LIST_HEAD(&policy->pol_list_queued); + + rc = nrs_policy_init(policy); + if (rc != 0) { + OBD_FREE_PTR(policy); + RETURN(rc); + } + + spin_lock(&nrs->nrs_lock); + + tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name); + if (tmp != NULL) { + CERROR("NRS policy %s has been registered, can't register it " + "for %s\n", policy->pol_desc->pd_name, + svcpt->scp_service->srv_name); + nrs_policy_put_locked(tmp); + + spin_unlock(&nrs->nrs_lock); + nrs_policy_fini(policy); + OBD_FREE_PTR(policy); + + RETURN(-EEXIST); + } + + list_add_tail(&policy->pol_list, &nrs->nrs_policy_list); + nrs->nrs_num_pols++; + + if (policy->pol_flags & PTLRPC_NRS_FL_REG_START) + rc = nrs_policy_start_locked(policy, NULL); + + spin_unlock(&nrs->nrs_lock); + + if (rc != 0) + (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + + RETURN(rc); +} + +/** + * Enqueue request \a req using one of the policies its resources are referring + * to. + * + * \param[in] req the request to enqueue. 
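+ *
+ * The caller is expected to hold ptlrpc_service_part::scp_req_lock, as done
+ * by ptlrpc_nrs_req_add(), and the request must have had its resources set
+ * up beforehand via ptlrpc_nrs_req_initialize().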
+ */ +static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy; + + LASSERT(req->rq_nrq.nr_initialized); + LASSERT(!req->rq_nrq.nr_enqueued); + + nrs_request_enqueue(&req->rq_nrq); + req->rq_nrq.nr_enqueued = 1; + + policy = nrs_request_policy(&req->rq_nrq); + /** + * Add the policy to the NRS head's list of policies with enqueued + * requests, if it has not been added there. + */ + if (unlikely(list_empty(&policy->pol_list_queued))) + list_add_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); +} + +/** + * Enqueue a request on the high priority NRS head. + * + * \param req the request to enqueue. + */ +static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + ENTRY; + + spin_lock(&req->rq_lock); + req->rq_hp = 1; + ptlrpc_nrs_req_add_nolock(req); + if (opc != OBD_PING) + DEBUG_REQ(D_NET, req, "high priority req"); + spin_unlock(&req->rq_lock); + EXIT; +} + +/** + * Returns a boolean predicate indicating whether the policy described by + * \a desc is adequate for use with service \a svc. + * + * \param[in] svc the service + * \param[in] desc the policy descriptor + * + * \retval false the policy is not compatible with the service + * \retval true the policy is compatible with the service + */ +static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return desc->pd_compat(svc, desc); +} + +/** + * Registers all compatible policies in nrs_core.nrs_policies, for NRS head + * \a nrs. + * + * \param[in] nrs the NRS head + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * + * \see ptlrpc_service_nrs_setup() + */ +static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_pol_desc *desc; + /* for convenience */ + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int rc = -EINVAL; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (nrs_policy_compatible(svc, desc)) { + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svc->srv_name, rc); + /** + * Fail registration if any of the policies' + * registration fails. + */ + break; + } + } + } + + RETURN(rc); +} + +/** + * Initializes NRS head \a nrs of service partition \a svcpt, and registers all + * compatible policies in NRS core, with the NRS head. 
+ * + * \param[in] nrs the NRS head + * \param[in] svcpt the PTLRPC service partition to setup + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs, + struct ptlrpc_service_part *svcpt) +{ + int rc; + enum ptlrpc_nrs_queue_type queue; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + if (nrs == &svcpt->scp_nrs_reg) + queue = PTLRPC_NRS_QUEUE_REG; + else if (nrs == svcpt->scp_nrs_hp) + queue = PTLRPC_NRS_QUEUE_HP; + else + LBUG(); + + nrs->nrs_svcpt = svcpt; + nrs->nrs_queue_type = queue; + spin_lock_init(&nrs->nrs_lock); + INIT_LIST_HEAD(&nrs->nrs_policy_list); + INIT_LIST_HEAD(&nrs->nrs_policy_queued); + nrs->nrs_throttling = 0; + + rc = nrs_register_policies_locked(nrs); + + RETURN(rc); +} + +/** + * Allocates a regular and optionally a high-priority NRS head (if the service + * handles high-priority RPCs), and then registers all available compatible + * policies on those NRS heads. + * + * \param[in,out] svcpt the PTLRPC service partition to setup + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + int rc; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + /** + * Initialize the regular NRS head. + */ + nrs = nrs_svcpt2nrs(svcpt, false); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + if (rc < 0) + GOTO(out, rc); + + /** + * Optionally allocate a high-priority NRS head. + */ + if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL) + GOTO(out, rc); + + OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp, + svcpt->scp_service->srv_cptable, + svcpt->scp_cpt); + if (svcpt->scp_nrs_hp == NULL) + GOTO(out, rc = -ENOMEM); + + nrs = nrs_svcpt2nrs(svcpt, true); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + +out: + RETURN(rc); +} + +/** + * Unregisters all policies on all available NRS heads in a service partition; + * called at PTLRPC service unregistration time. + * + * \param[in] svcpt the PTLRPC service partition + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + int rc; + bool hp = false; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + +again: + /* scp_nrs_hp could be NULL due to short of memory. */ + nrs = hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; + /* check the nrs_svcpt to see if nrs is initialized. */ + if (!nrs || !nrs->nrs_svcpt) { + EXIT; + return; + } + nrs->nrs_stopping = 1; + + list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list, + pol_list) { + rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + LASSERT(rc == 0); + } + + /** + * If the service partition has an HP NRS head, clean that up as well. + */ + if (!hp && nrs_svcpt_has_hp(svcpt)) { + hp = true; + goto again; + } + + if (hp) + OBD_FREE_PTR(nrs); + + EXIT; +} + +/** + * Returns the descriptor for a policy as identified by by \a name. 
+ * + * \param[in] name the policy name + * + * \retval the policy descriptor + * \retval NULL + */ +static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name) +{ + struct ptlrpc_nrs_pol_desc *tmp; + ENTRY; + + list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) { + if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0) + RETURN(tmp); + } + RETURN(NULL); +} + +/** + * Removes the policy from all supported NRS heads of all partitions of all + * PTLRPC services. + * + * \param[in] desc the policy descriptor to unregister + * + * \retval -ve error + * \retval 0 successfully unregistered policy on all supported NRS heads + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * \pre mutex_is_locked(&ptlrpc_all_services_mutex) + */ +static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_service *svc; + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex)); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + bool hp = false; + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_unregister(nrs, desc->pd_name); + /** + * Ignore -ENOENT as the policy may not have registered + * successfully on all service partitions. + */ + if (rc == -ENOENT) { + rc = 0; + } else if (rc != 0) { + CERROR("Failed to unregister NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + RETURN(rc); + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + RETURN(rc); +} + +/** + * Registers a new policy with NRS core. + * + * The function will only succeed if policy registration with all compatible + * service partitions (if any) is successful. + * + * N.B. This function should be called either at ptlrpc module initialization + * time when registering a policy that ships with NRS core, or in a + * module's init() function for policies registering from other modules. + * + * \param[in] conf configuration information for the new policy to register + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_service *svc; + struct ptlrpc_nrs_pol_desc *desc; + int rc = 0; + ENTRY; + + LASSERT(conf != NULL); + LASSERT(conf->nc_ops != NULL); + LASSERT(conf->nc_compat != NULL); + LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one, + conf->nc_compat_svc_name != NULL)); + LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0, + conf->nc_owner != NULL)); + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + /** + * External policies are not allowed to start immediately upon + * registration, as there is a relatively higher chance that their + * registration might fail. In such a case, some policy instances may + * already have requests queued wen unregistration needs to happen as + * part o cleanup; since there is currently no way to drain requests + * from a policy unless the service is unregistering, we just disallow + * this. 
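+ *
+ * In other words, a policy registering with PTLRPC_NRS_FL_REG_EXTERN set
+ * may specify neither PTLRPC_NRS_FL_FALLBACK nor PTLRPC_NRS_FL_REG_START;
+ * such registrations are rejected with -EINVAL below.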
+ */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) && + (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START))) { + CERROR("NRS: failing to register policy %s. Please check " + "policy flags; external policies cannot act as fallback " + "policies, or be started immediately upon registration " + "without interaction with lprocfs\n", conf->nc_name); + RETURN(-EINVAL); + } + + mutex_lock(&nrs_core.nrs_mutex); + + if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) { + CERROR("NRS: failing to register policy %s which has already " + "been registered with NRS core!\n", + conf->nc_name); + GOTO(fail, rc = -EEXIST); + } + + OBD_ALLOC_PTR(desc); + if (desc == NULL) + GOTO(fail, rc = -ENOMEM); + + if (strlcpy(desc->pd_name, conf->nc_name, sizeof(desc->pd_name)) >= + sizeof(desc->pd_name)) { + OBD_FREE_PTR(desc); + GOTO(fail, rc = -E2BIG); + } + desc->pd_ops = conf->nc_ops; + desc->pd_compat = conf->nc_compat; + desc->pd_compat_svc_name = conf->nc_compat_svc_name; + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0) + desc->pd_owner = conf->nc_owner; + desc->pd_flags = conf->nc_flags; + atomic_set(&desc->pd_refs, 0); + + /** + * For policies that are held in the same module as NRS (currently + * ptlrpc), do not register the policy with all compatible services, + * as the services will not have started at this point, since we are + * calling from ptlrpc module initialization code. In such cases each + * service will register all compatible policies later, via + * ptlrpc_service_nrs_setup(). + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0) + goto internal; + + /** + * Register the new policy on all compatible services + */ + mutex_lock(&ptlrpc_all_services_mutex); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + struct ptlrpc_service_part *svcpt; + int i; + int rc2; + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + struct ptlrpc_nrs *nrs; + bool hp = false; +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + GOTO(fail, rc); + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + /** + * No need to take a reference to other modules here, as we + * will be calling from the module's init() function. + */ + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) { + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + GOTO(fail, rc); + } + } + } + + mutex_unlock(&ptlrpc_all_services_mutex); +internal: + list_add_tail(&desc->pd_list, &nrs_core.nrs_policies); +fail: + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_register); + +/** + * Unregisters a previously registered policy with NRS core. All instances of + * the policy on all NRS heads of all supported services are removed. + * + * N.B. This function should only be called from a module's exit() function. 
+ * Although it can be used for policies that ship alongside NRS core,
+ * the function is primarily intended for policies that register externally,
+ * from other modules.
+ *
+ * \param[in] conf configuration information for the policy to unregister
+ *
+ * \retval -ve error
+ * \retval 0 success
+ */
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf)
+{
+ struct ptlrpc_nrs_pol_desc *desc;
+ int rc;
+ ENTRY;
+
+ LASSERT(conf != NULL);
+
+ if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) {
+ CERROR("Unable to unregister a fallback policy, unless the "
+ "PTLRPC service is stopping.\n");
+ RETURN(-EPERM);
+ }
+
+ conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ desc = nrs_policy_find_desc_locked(conf->nc_name);
+ if (desc == NULL) {
+ CERROR("Failing to unregister NRS policy %s which has "
+ "not been registered with NRS core!\n",
+ conf->nc_name);
+ GOTO(not_exist, rc = -ENOENT);
+ }
+
+ mutex_lock(&ptlrpc_all_services_mutex);
+
+ rc = nrs_policy_unregister_locked(desc);
+ if (rc < 0) {
+ if (rc == -EBUSY)
+ CERROR("Please first stop policy %s on all service "
+ "partitions and then retry to unregister the "
+ "policy.\n", conf->nc_name);
+ GOTO(fail, rc);
+ }
+
+ CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n",
+ conf->nc_name);
+
+ list_del(&desc->pd_list);
+ OBD_FREE_PTR(desc);
+
+fail:
+ mutex_unlock(&ptlrpc_all_services_mutex);
+
+not_exist:
+ mutex_unlock(&nrs_core.nrs_mutex);
+
+ RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister);
+
+/**
+ * Setup NRS heads on all service partitions of service \a svc, and register
+ * all compatible policies on those NRS heads.
+ *
+ * To be called from within ptlrpc_register_service().
+ *
+ * \param[in] svc the service to setup
+ *
+ * \retval -ve error, the calling logic should eventually call
+ * ptlrpc_service_nrs_cleanup() to undo any work performed
+ * by this function.
+ *
+ * \see ptlrpc_register_service()
+ * \see ptlrpc_service_nrs_cleanup()
+ */
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ const struct ptlrpc_nrs_pol_desc *desc;
+ int i;
+ int rc = 0;
+
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ /**
+ * Initialize NRS heads on all service CPTs.
+ */
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ rc = nrs_svcpt_setup_locked(svcpt);
+ if (rc != 0)
+ GOTO(failed, rc);
+ }
+
+ /**
+ * Set up lprocfs interfaces for all supported policies for the
+ * service.
+ */
+ list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+ if (!nrs_policy_compatible(svc, desc))
+ continue;
+
+ if (desc->pd_ops->op_lprocfs_init != NULL) {
+ rc = desc->pd_ops->op_lprocfs_init(svc);
+ if (rc != 0)
+ GOTO(failed, rc);
+ }
+ }
+
+failed:
+
+ mutex_unlock(&nrs_core.nrs_mutex);
+
+ RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all service partitions of service \a svc.
+ *
+ * \param[in] svc the PTLRPC service to unregister
+ */
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ const struct ptlrpc_nrs_pol_desc *desc;
+ int i;
+
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ /**
+ * Clean up NRS heads on all service partitions
+ */
+ ptlrpc_service_for_each_part(svcpt, i, svc)
+ nrs_svcpt_cleanup_locked(svcpt);
+
+ /**
+ * Clean up lprocfs interfaces for all supported policies for the
+ * service.
+ */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + mutex_unlock(&nrs_core.nrs_mutex); +} + +/** + * Obtains NRS head resources for request \a req. + * + * These could be either on the regular or HP NRS head of \a svcpt; resources + * taken on the regular head can later be swapped for HP head resources by + * ldlm_lock_reorder_req(). + * + * \param[in] svcpt the service partition + * \param[in] req the request + * \param[in] hp which NRS head of \a svcpt to use + */ +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + memset(&req->rq_nrq, 0, sizeof(req->rq_nrq)); + nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs, + false); + + /** + * It is fine to access \e nr_initialized without locking as there is + * no contention at this early stage. + */ + req->rq_nrq.nr_initialized = 1; +} + +/** + * Releases resources for a request; is called after the request has been + * handled. + * + * \param[in] req the request + * + * \see ptlrpc_server_finish_request() + */ +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_initialized) { + nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs); + /* no protection on bit nr_initialized because no + * contention at this late stage */ + req->rq_nrq.nr_finalized = 1; + } +} + +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_started) + nrs_request_stop(&req->rq_nrq); +} + +/** + * Enqueues request \a req on either the regular or high-priority NRS head + * of service partition \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] req the request to be enqueued + * \param[in] hp whether to enqueue the request on the regular or + * high-priority NRS head. + */ +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + spin_lock(&svcpt->scp_req_lock); + + if (hp) + ptlrpc_nrs_hpreq_add_nolock(req); + else + ptlrpc_nrs_req_add_nolock(req); + + spin_unlock(&svcpt->scp_req_lock); +} + +static void nrs_request_removed(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_nrs->nrs_req_queued > 0); + LASSERT(policy->pol_req_queued > 0); + + policy->pol_nrs->nrs_req_queued--; + policy->pol_req_queued--; + + /** + * If the policy has no more requests queued, remove it from + * ptlrpc_nrs::nrs_policy_queued. + */ + if (unlikely(policy->pol_req_queued == 0)) { + list_del_init(&policy->pol_list_queued); + + /** + * If there are other policies with queued requests, move the + * current policy to the end so that we can round robin over + * all policies and drain the requests. + */ + } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) { + LASSERT(policy->pol_req_queued < + policy->pol_nrs->nrs_req_queued); + + list_move_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); + } +} + +/** + * Obtains a request for handling from an NRS head of service partition + * \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] hp whether to obtain a request from the regular or + * high-priority NRS head. + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. 
+ * \param[in] force when set, it will force a policy to return a request if it
+ * has one pending
+ *
+ * \retval the request to be handled
+ * \retval NULL the head has no requests to serve
+ */
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+ bool peek, bool force)
+{
+ struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+ struct ptlrpc_nrs_policy *policy;
+ struct ptlrpc_nrs_request *nrq;
+
+ /**
+ * Always try to drain requests from all NRS policies even if they are
+ * inactive, because the user can change policy status at runtime.
+ */
+ list_for_each_entry(policy, &nrs->nrs_policy_queued,
+ pol_list_queued) {
+ nrq = nrs_request_get(policy, peek, force);
+ if (nrq != NULL) {
+ if (likely(!peek)) {
+ nrq->nr_started = 1;
+
+ policy->pol_req_started++;
+ policy->pol_nrs->nrs_req_started++;
+
+ nrs_request_removed(policy);
+ }
+
+ return container_of(nrq, struct ptlrpc_request, rq_nrq);
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * Dequeues request \a req from the policy it has been enqueued on.
+ *
+ * \param[in] req the request
+ */
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req)
+{
+ struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq);
+
+ policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq);
+
+ req->rq_nrq.nr_enqueued = 0;
+
+ nrs_request_removed(policy);
+}
+
+/**
+ * Returns whether there are any requests currently enqueued on any of the
+ * policies of service partition \a svcpt's NRS head specified by \a hp. Should
+ * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable
+ * result.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp whether the regular or high-priority NRS head is to be
+ * enquired.
+ *
+ * \retval false the indicated NRS head has no enqueued requests.
+ * \retval true the indicated NRS head has some enqueued requests.
+ */
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+ struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+ return nrs->nrs_req_queued > 0;
+};
+
+/**
+ * Returns whether the NRS policy is throttling requests.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp whether the regular or high-priority NRS head is to be
+ * enquired.
+ *
+ * \retval false the indicated NRS head is not throttling requests.
+ * \retval true the indicated NRS head is throttling requests.
+ */
+bool ptlrpc_nrs_req_throttling_nolock(struct ptlrpc_service_part *svcpt,
+ bool hp)
+{
+ struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+ return !!nrs->nrs_throttling;
+};
+
+/**
+ * Moves request \a req from the regular to the high-priority NRS head.
+ *
+ * \param[in] req the request to move
+ */
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req)
+{
+ struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+ struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+ struct ptlrpc_nrs_resource *res1[NRS_RES_MAX];
+ struct ptlrpc_nrs_resource *res2[NRS_RES_MAX];
+ ENTRY;
+
+ /**
+ * Obtain the high-priority NRS head resources.
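+ * These are obtained with \e moving_req set, so that memory allocations
+ * made while getting them are atomic and do not sleep; see the description
+ * of the \e moving_req parameter of nrs_resource_get_safe().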
+ */ + nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true); + + spin_lock(&svcpt->scp_req_lock); + + if (!ptlrpc_nrs_req_can_move(req)) + goto out; + + ptlrpc_nrs_req_del_nolock(req); + + memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0])); + memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0])); + + ptlrpc_nrs_hpreq_add_nolock(req); + + memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0])); +out: + spin_unlock(&svcpt->scp_req_lock); + + /** + * Release either the regular NRS head resources if we moved the + * request, or the high-priority NRS head resources if we took a + * reference earlier in this function and ptlrpc_nrs_req_can_move() + * returned false. + */ + nrs_resource_put_safe(res1); + EXIT; +} + +/** + * Carries out a control operation \a opc on the policy identified by the + * human-readable \a name, on either all partitions, or only on the first + * partition of service \a svc. + * + * \param[in] svc the service the policy belongs to. + * \param[in] queue whether to carry out the command on the policy which + * belongs to the regular, high-priority, or both NRS + * heads of service partitions of \a svc. + * \param[in] name the policy to act upon, by human-readable name + * \param[in] opc the opcode of the operation to carry out + * \param[in] single when set, the operation will only be carried out on the + * NRS heads of the first service partition of \a svc. + * This is useful for some policies which e.g. share + * identical values on the same parameters of different + * service partitions; when reading these parameters via + * lprocfs, these policies may just want to obtain and + * print out the values from the first service partition. + * Storing these values centrally elsewhere then could be + * another solution for this. + * \param[in,out] arg can be used as a generic in/out buffer between control + * operations and the user environment. + * + *\retval -ve error condition + *\retval 0 operation was carried out successfully + */ +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg) +{ + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + ENTRY; + + LASSERT(opc != PTLRPC_NRS_CTL_INVALID); + + if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0) + return -EINVAL; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name, + opc, arg); + if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG && + single)) + GOTO(out, rc); + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + /** + * XXX: We could optionally check for + * nrs_svc_has_hp(svc) here, and return an error if it + * is false. Right now we rely on the policies' lprocfs + * handlers that call the present function to make this + * check; if they fail to do so, they might hit the + * assertion inside nrs_svcpt2nrs() below. + */ + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name, + opc, arg); + if (rc != 0 || single) + GOTO(out, rc); + } + } +out: + RETURN(rc); +} + +/** + * Adds all policies that ship with the ptlrpc module, to NRS core's list of + * policies \e nrs_core.nrs_policies. 
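+ *
+ * The FIFO policy is always registered; when server support is built in,
+ * the CRR-N, ORR, TRR, TBF and delay policies that ship with ptlrpc are
+ * registered as well.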
+ * + * \retval 0 all policies have been registered successfully + * \retval -ve error + */ +int ptlrpc_nrs_init(void) +{ + int rc; + ENTRY; + + mutex_init(&nrs_core.nrs_mutex); + INIT_LIST_HEAD(&nrs_core.nrs_policies); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo); + if (rc != 0) + GOTO(fail, rc); + +#ifdef HAVE_SERVER_SUPPORT + rc = ptlrpc_nrs_policy_register(&nrs_conf_crrn); + if (rc != 0) + GOTO(fail, rc); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_orr); + if (rc != 0) + GOTO(fail, rc); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_trr); + if (rc != 0) + GOTO(fail, rc); + rc = ptlrpc_nrs_policy_register(&nrs_conf_tbf); + if (rc != 0) + GOTO(fail, rc); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_delay); + if (rc != 0) + GOTO(fail, rc); +#endif /* HAVE_SERVER_SUPPORT */ + + RETURN(rc); +fail: + /** + * Since no PTLRPC services have been started at this point, all we need + * to do for cleanup is to free the descriptors. + */ + ptlrpc_nrs_fini(); + + RETURN(rc); +} + +/** + * Removes all policy descriptors from nrs_core::nrs_policies, and frees the + * policy descriptors. + * + * Since all PTLRPC services are stopped at this point, there are no more + * instances of any policies, because each service will have stopped its policy + * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the + * descriptors here. + */ +void ptlrpc_nrs_fini(void) +{ + struct ptlrpc_nrs_pol_desc *desc; + struct ptlrpc_nrs_pol_desc *tmp; + + list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies, + pd_list) { + list_del_init(&desc->pd_list); + OBD_FREE_PTR(desc); + } +} + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c new file mode 100644 index 0000000000000..94d21d42f87df --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c @@ -0,0 +1,867 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_crr.c + * + * Network Request Scheduler (NRS) CRR-N policy + * + * Request ordering in a batched Round-Robin manner over client NIDs + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ +#ifdef HAVE_SERVER_SUPPORT + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name CRR-N policy + * + * Client Round-Robin scheduling over client NIDs + * + * @{ + * + */ + +#define NRS_POL_NAME_CRRN "crrn" + +/** + * Binary heap predicate. 
+ * + * Uses ptlrpc_nrs_request::nr_u::crr::cr_round and + * ptlrpc_nrs_request::nr_u::crr::cr_sequence to compare two binheap nodes and + * produce a binary predicate that shows their relative priority, so that the + * binary heap can perform the necessary sorting operations. + * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 <= e2 + */ +static int +crrn_req_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + if (nrq1->nr_u.crr.cr_round < nrq2->nr_u.crr.cr_round) + return 1; + else if (nrq1->nr_u.crr.cr_round > nrq2->nr_u.crr.cr_round) + return 0; + + return nrq1->nr_u.crr.cr_sequence < nrq2->nr_u.crr.cr_sequence; +} + +static struct cfs_binheap_ops nrs_crrn_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = crrn_req_compare, +}; + +/** + * libcfs_hash operations for nrs_crrn_net::cn_cli_hash + * + * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash + * nrs_crrn_client objects. + */ +#define NRS_NID_BKT_BITS 8 +#define NRS_NID_BITS 16 + +static unsigned nrs_crrn_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static int nrs_crrn_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + lnet_nid_t *nid = (lnet_nid_t *)key; + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + return *nid == cli->cc_nid; +} + +static void *nrs_crrn_hop_key(struct hlist_node *hnode) +{ + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + return &cli->cc_nid; +} + +static void *nrs_crrn_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_crrn_client, cc_hnode); +} + +static void nrs_crrn_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + atomic_inc(&cli->cc_ref); +} + +static void nrs_crrn_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + atomic_dec(&cli->cc_ref); +} + +static void nrs_crrn_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + LASSERTF(atomic_read(&cli->cc_ref) == 0, + "Busy CRR-N object from client with NID %s, with %d refs\n", + libcfs_nid2str(cli->cc_nid), atomic_read(&cli->cc_ref)); + + OBD_FREE_PTR(cli); +} + +static struct cfs_hash_ops nrs_crrn_hash_ops = { + .hs_hash = nrs_crrn_hop_hash, + .hs_keycmp = nrs_crrn_hop_keycmp, + .hs_key = nrs_crrn_hop_key, + .hs_object = nrs_crrn_hop_object, + .hs_get = nrs_crrn_hop_get, + .hs_put = nrs_crrn_hop_put, + .hs_put_locked = nrs_crrn_hop_put, + .hs_exit = nrs_crrn_hop_exit, +}; + +/** + * Called when a CRR-N policy instance is started. 
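+ *
+ * Allocates and initializes the policy instance's private nrs_crrn_net
+ * object, i.e. the binary heap used to order queued requests and the hash
+ * table of per-NID nrs_crrn_client objects, and sets the default quantum.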
+ * + * \param[in] policy the policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + */ +static int nrs_crrn_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_crrn_net *net; + int rc = 0; + ENTRY; + + OBD_CPT_ALLOC_PTR(net, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (net == NULL) + RETURN(-ENOMEM); + + net->cn_binheap = cfs_binheap_create(&nrs_crrn_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (net->cn_binheap == NULL) + GOTO(out_net, rc = -ENOMEM); + + net->cn_cli_hash = cfs_hash_create("nrs_crrn_nid_hash", + NRS_NID_BITS, NRS_NID_BITS, + NRS_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_crrn_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (net->cn_cli_hash == NULL) + GOTO(out_binheap, rc = -ENOMEM); + + /** + * Set default quantum value to max_rpcs_in_flight for non-MDS OSCs; + * there may be more RPCs pending from each struct nrs_crrn_client even + * with the default max_rpcs_in_flight value, as we are scheduling over + * NIDs, and there may be more than one mount point per client. + */ + net->cn_quantum = OBD_MAX_RIF_DEFAULT; + /** + * Set to 1 so that the test inside nrs_crrn_req_add() can evaluate to + * true. + */ + net->cn_sequence = 1; + + policy->pol_private = net; + + RETURN(rc); + +out_binheap: + cfs_binheap_destroy(net->cn_binheap); +out_net: + OBD_FREE_PTR(net); + + RETURN(rc); +} + +/** + * Called when a CRR-N policy instance is stopped. + * + * Called when the policy has been instructed to transition to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more pending + * requests to serve. + * + * \param[in] policy the policy + */ +static void nrs_crrn_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_crrn_net *net = policy->pol_private; + ENTRY; + + LASSERT(net != NULL); + LASSERT(net->cn_binheap != NULL); + LASSERT(net->cn_cli_hash != NULL); + LASSERT(cfs_binheap_is_empty(net->cn_binheap)); + + cfs_binheap_destroy(net->cn_binheap); + cfs_hash_putref(net->cn_cli_hash); + + OBD_FREE_PTR(net); +} + +/** + * Performs a policy-specific ctl function on CRR-N policy instances; similar + * to ioctl. + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_crrn_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, + void *arg) +{ + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch((enum nrs_ctl_crr)opc) { + default: + RETURN(-EINVAL); + + /** + * Read Round Robin quantum size of a policy instance. + */ + case NRS_CTL_CRRN_RD_QUANTUM: { + struct nrs_crrn_net *net = policy->pol_private; + + *(__u16 *)arg = net->cn_quantum; + } + break; + + /** + * Write Round Robin quantum size of a policy instance. + */ + case NRS_CTL_CRRN_WR_QUANTUM: { + struct nrs_crrn_net *net = policy->pol_private; + + net->cn_quantum = *(__u16 *)arg; + LASSERT(net->cn_quantum != 0); + } + break; + } + + RETURN(0); +} + +/** + * Obtains resources from CRR-N policy instances. The top-level resource lives + * inside \e nrs_crrn_net and the second-level resource inside + * \e nrs_crrn_client object instances. 
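+ *
+ * The nrs_crrn_client object for the sending client is looked up in the
+ * policy instance's NID hash table, and is allocated on the fly the first
+ * time a request from that NID is scheduled through this policy instance.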
+ *
+ * \param[in] policy the policy for which resources are being taken for
+ * request \a nrq
+ * \param[in] nrq the request for which resources are being taken
+ * \param[in] parent parent resource, embedded in nrs_crrn_net for the
+ * CRR-N policy
+ * \param[out] resp resource references are placed in this array
+ * \param[in] moving_req signifies limited caller context; used to perform
+ * memory allocations in an atomic context in this
+ * policy
+ *
+ * \retval 0 we are returning a top-level, parent resource, one that is
+ * embedded in an nrs_crrn_net object
+ * \retval 1 we are returning a bottom-level resource, one that is embedded
+ * in an nrs_crrn_client object
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_crrn_res_get(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq,
+ const struct ptlrpc_nrs_resource *parent,
+ struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+ struct nrs_crrn_net *net;
+ struct nrs_crrn_client *cli;
+ struct nrs_crrn_client *tmp;
+ struct ptlrpc_request *req;
+
+ if (parent == NULL) {
+ *resp = &((struct nrs_crrn_net *)policy->pol_private)->cn_res;
+ return 0;
+ }
+
+ net = container_of(parent, struct nrs_crrn_net, cn_res);
+ req = container_of(nrq, struct ptlrpc_request, rq_nrq);
+
+ cli = cfs_hash_lookup(net->cn_cli_hash, &req->rq_peer.nid);
+ if (cli != NULL)
+ goto out;
+
+ OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy),
+ sizeof(*cli), moving_req ? GFP_ATOMIC : GFP_NOFS);
+ if (cli == NULL)
+ return -ENOMEM;
+
+ cli->cc_nid = req->rq_peer.nid;
+
+ atomic_set(&cli->cc_ref, 1);
+ tmp = cfs_hash_findadd_unique(net->cn_cli_hash, &cli->cc_nid,
+ &cli->cc_hnode);
+ if (tmp != cli) {
+ OBD_FREE_PTR(cli);
+ cli = tmp;
+ }
+out:
+ *resp = &cli->cc_res;
+
+ return 1;
+}
+
+/**
+ * Called when releasing references to the resource hierarchy obtained for a
+ * request for scheduling using the CRR-N policy.
+ *
+ * \param[in] policy the policy the resource belongs to
+ * \param[in] res the resource to be released
+ */
+static void nrs_crrn_res_put(struct ptlrpc_nrs_policy *policy,
+ const struct ptlrpc_nrs_resource *res)
+{
+ struct nrs_crrn_net *net;
+ struct nrs_crrn_client *cli;
+
+ /**
+ * Do nothing for freeing parent, nrs_crrn_net resources
+ */
+ if (res->res_parent == NULL)
+ return;
+
+ cli = container_of(res, struct nrs_crrn_client, cc_res);
+ net = container_of(res->res_parent, struct nrs_crrn_net, cn_res);
+
+ cfs_hash_put(net->cn_cli_hash, &cli->cc_hnode);
+}
+
+/**
+ * Called when getting a request from the CRR-N policy for handling, so that
+ * it can be served.
+ *
+ * \param[in] policy the policy being polled
+ * \param[in] peek when set, signifies that we just want to examine the
+ * request, and not handle it, so the request is not removed
+ * from the policy.
+ * \param[in] force force the policy to return a request; unused in this policy
+ *
+ * \retval the request to be handled
+ * \retval NULL no request available
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request *nrs_crrn_req_get(struct ptlrpc_nrs_policy *policy,
+ bool peek, bool force)
+{
+ struct nrs_crrn_net *net = policy->pol_private;
+ struct cfs_binheap_node *node = cfs_binheap_root(net->cn_binheap);
+ struct ptlrpc_nrs_request *nrq;
+
+ nrq = unlikely(node == NULL) ?
NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(!peek && nrq != NULL)) { + struct nrs_crrn_client *cli; + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + + LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round); + + cfs_binheap_remove(net->cn_binheap, &nrq->nr_node); + cli->cc_active--; + + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request from %s, with round " + "%llu\n", NRS_POL_NAME_CRRN, + libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); + + /** Peek at the next request to be served */ + node = cfs_binheap_root(net->cn_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + net->cn_round++; + } else { + struct ptlrpc_nrs_request *next; + + next = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (net->cn_round < next->nr_u.crr.cr_round) + net->cn_round = next->nr_u.crr.cr_round; + } + } + + return nrq; +} + +/** + * Adds request \a nrq to a CRR-N \a policy instance's set of queued requests + * + * A scheduling round is a stream of requests that have been sorted in batches + * according to the client that they originate from (as identified by its NID); + * there can be only one batch for each client in each round. The batches are of + * maximum size nrs_crrn_net:cn_quantum. When a new request arrives for + * scheduling from a client that has exhausted its quantum in its current round, + * it will start scheduling requests on the next scheduling round. Clients are + * allowed to schedule requests against a round until all requests for the round + * are serviced, so a client might miss a round if it is not generating requests + * for a long enough period of time. Clients that miss a round will continue + * with scheduling the next request that they generate, starting at the round + * that requests are being dispatched for, at the time of arrival of this new + * request. + * + * Requests are tagged with the round number and a sequence number; the sequence + * number indicates the relative ordering amongst the batches of requests in a + * round, and is identical for all requests in a batch, as is the round number. + * The round and sequence numbers are used by crrn_req_compare() in order to + * maintain an ordered set of rounds, with each round consisting of an ordered + * set of batches of requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to add + * + * \retval 0 request successfully added + * \retval != 0 error + */ +static int nrs_crrn_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + int rc; + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + net = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_crrn_net, cn_res); + + if (cli->cc_quantum == 0 || cli->cc_round < net->cn_round || + (cli->cc_active == 0 && cli->cc_quantum > 0)) { + + /** + * If the client has no pending requests, and still some of its + * quantum remaining unused, which implies it has not had a + * chance to schedule up to its maximum allowed batch size of + * requests in the previous round it participated, schedule this + * next request on a new round; this avoids fragmentation of + * request batches caused by client inactivity, at the expense + * of potentially slightly increased service time for the + * request batch this request will be a part of. 
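+ *
+ * As a rough illustration, with a quantum of 8 a client that enqueues ten
+ * requests back to back will have the first eight tagged with some round
+ * number R and the remaining two with round R + 1; if the same client goes
+ * idle before using up its quantum and later resumes, its next request
+ * likewise starts a new round.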
+ */ + if (cli->cc_active == 0 && cli->cc_quantum > 0) + cli->cc_round++; + + /** A new scheduling round has commenced */ + if (cli->cc_round < net->cn_round) + cli->cc_round = net->cn_round; + + /** I was not the last client through here */ + if (cli->cc_sequence < net->cn_sequence) + cli->cc_sequence = ++net->cn_sequence; + /** + * Reset the quantum if we have reached the maximum quantum + * size for this batch, or even if we have not managed to + * complete a batch size up to its maximum allowed size. + * XXX: Accessed unlocked + */ + cli->cc_quantum = net->cn_quantum; + } + + nrq->nr_u.crr.cr_round = cli->cc_round; + nrq->nr_u.crr.cr_sequence = cli->cc_sequence; + + rc = cfs_binheap_insert(net->cn_binheap, &nrq->nr_node); + if (rc == 0) { + cli->cc_active++; + if (--cli->cc_quantum == 0) + cli->cc_round++; + } + return rc; +} + +/** + * Removes request \a nrq from a CRR-N \a policy instance's set of queued + * requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to remove + */ +static void nrs_crrn_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + bool is_root; + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + net = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_crrn_net, cn_res); + + LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round); + + is_root = &nrq->nr_node == cfs_binheap_root(net->cn_binheap); + + cfs_binheap_remove(net->cn_binheap, &nrq->nr_node); + cli->cc_active--; + + /** + * If we just deleted the node at the root of the binheap, we may have + * to adjust round numbers. + */ + if (unlikely(is_root)) { + /** Peek at the next request to be served */ + struct cfs_binheap_node *node = cfs_binheap_root(net->cn_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + net->cn_round++; + } else { + nrq = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (net->cn_round < nrq->nr_u.crr.cr_round) + net->cn_round = nrq->nr_u.crr.cr_round; + } + } +} + +/** + * Called right after the request \a nrq finishes being handled by CRR-N policy + * instance \a policy. + * + * \param[in] policy the policy that handled the request + * \param[in] nrq the request that was handled + */ +static void nrs_crrn_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request from %s, with round %llu" + "\n", NRS_POL_NAME_CRRN, + libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); +} + +/** + * debugfs interface + */ + +/** + * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size) + * for CRR-N policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_crrn_ctl(). + * + * Quantum values are in # of RPCs, and output is in YAML format. 
+ * + * For example: + * + * reg_quantum:8 + * hp_quantum:4 + */ +static int +ptlrpc_lprocfs_nrs_crrn_quantum_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + __u16 quantum; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_REG + "%-5d\n", quantum); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + return rc; +} + +/** + * Sets the value of the Round Robin quantum (i.e. the maximum batch size) + * for CRR-N policy instances of a service. The user can set the quantum size + * for the regular or high priority NRS head individually by specifying each + * value, or both together in a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_crrn_quantum=reg_quantum:32, to set the regular + * request quantum size on all PTLRPC services to 32 + * + * lctl set_param *.*.*.nrs_crrn_quantum=hp_quantum:16, to set the high + * priority request quantum size on all PTLRPC services to 16, and + * + * lctl set_param *.*.ost_io.nrs_crrn_quantum=16, to set both the regular and + * high priority request quantum sizes of the ost_io service to 16. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state + * are skipped later by nrs_crrn_ctl(). 
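+ *
+ * Note that both named values can also be combined in a single write,
+ * as the parser below looks for the "reg_quantum:" and "hp_quantum:"
+ * tokens independently; e.g. (assuming the shell quoting keeps both
+ * tokens in one value):
+ *
+ * lctl set_param *.*.ost_io.nrs_crrn_quantum="reg_quantum:32 hp_quantum:16"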
+ */ +static ssize_t +ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_QUANTUM_MAX_CMD]; + char *val; + long quantum_reg; + long quantum_hp; + /** lprocfs_find_named_value() modifies its argument, so keep a copy */ + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, + &count_copy); + if (val != kernbuf) { + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; + + queue |= PTLRPC_NRS_QUEUE_REG; + } + + count_copy = count; + + /** + * Check if the high priority quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_HP, + &count_copy); + if (val != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, look for a valid numerical + * value + */ + if (queue == 0) { + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + quantum_hp = quantum_reg; + } + } + + if ((((queue & PTLRPC_NRS_QUEUE_REG) != 0) && + ((quantum_reg > LPROCFS_NRS_QUANTUM_MAX || quantum_reg <= 0))) || + (((queue & PTLRPC_NRS_QUEUE_HP) != 0) && + ((quantum_hp > LPROCFS_NRS_QUANTUM_MAX || quantum_hp <= 0)))) + return -EINVAL; + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. + */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_WR_QUANTUM, false, + &quantum_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_WR_QUANTUM, false, + &quantum_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); + +/** + * Initializes a CRR-N policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + * + * \retval 0 success + * \retval != 0 error + */ +static int nrs_crrn_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_crrn_lprocfs_vars[] = { + { .name = "nrs_crrn_quantum", + .fops = &ptlrpc_lprocfs_nrs_crrn_quantum_fops, + .data = svc }, + { NULL } + }; + + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + return 0; + + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_crrn_lprocfs_vars, NULL); +} + +/** + * CRR-N policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_crrn_ops = { + .op_policy_start = nrs_crrn_start, + .op_policy_stop = nrs_crrn_stop, + .op_policy_ctl = nrs_crrn_ctl, + .op_res_get = nrs_crrn_res_get, + .op_res_put = nrs_crrn_res_put, + .op_req_get = nrs_crrn_req_get, + .op_req_enqueue = nrs_crrn_req_add, + .op_req_dequeue = nrs_crrn_req_del, + .op_req_stop = nrs_crrn_req_stop, + .op_lprocfs_init = nrs_crrn_lprocfs_init, +}; + +/** + * CRR-N policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_crrn = { + .nc_name = NRS_POL_NAME_CRRN, + .nc_ops = &nrs_crrn_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} CRR-N policy */ + +/** @} nrs */ + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c new file mode 100644 index 0000000000000..c8a1e6637d261 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c @@ -0,0 +1,826 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Cray Inc. All Rights Reserved. + * + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/ptlrpc/nrs_delay.c + * + * Network Request Scheduler (NRS) Delay policy + * + * This policy will delay request handling for some configurable amount of + * time. + * + * Author: Chris Horn + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name delay + * + * The delay policy schedules RPCs so that they are only processed after some + * configurable amount of time (in seconds) has passed. + * + * The defaults were chosen arbitrarily. + * + * @{ + */ + +#define NRS_POL_NAME_DELAY "delay" + +/* Default minimum delay in seconds. */ +#define NRS_DELAY_MIN_DEFAULT 5 +/* Default maximum delay, in seconds. */ +#define NRS_DELAY_MAX_DEFAULT 300 +/* Default percentage of delayed RPCs. */ +#define NRS_DELAY_PCT_DEFAULT 100 + +/** + * Binary heap predicate. + * + * Elements are sorted according to the start time assigned to the requests + * upon enqueue. An element with an earlier start time is "less than" an + * element with a later start time. 
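+ *
+ * In effect the delay binary heap behaves as a min-heap keyed on
+ * nr_u.delay.req_start_time, so the request that becomes eligible
+ * soonest is the one found at the heap root by nrs_delay_req_get().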
+ * + * \retval 0 start_time(e1) > start_time(e2) + * \retval 1 start_time(e1) <= start_time(e2) + */ +static int delay_req_compare(struct cfs_binheap_node *e1, + struct cfs_binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + return nrq1->nr_u.delay.req_start_time <= + nrq2->nr_u.delay.req_start_time; +} + +static struct cfs_binheap_ops nrs_delay_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = delay_req_compare, +}; + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes + * the delay-specific private data structure. + * + * \param[in] policy The policy to start + * \param[in] Generic char buffer; unused in this policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_delay_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_delay_data *delay_data; + + ENTRY; + + OBD_CPT_ALLOC_PTR(delay_data, nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (delay_data == NULL) + RETURN(-ENOMEM); + + delay_data->delay_binheap = cfs_binheap_create(&nrs_delay_heap_ops, + CBH_FLAG_ATOMIC_GROW, + 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + + if (delay_data->delay_binheap == NULL) { + OBD_FREE_PTR(delay_data); + RETURN(-ENOMEM); + } + + delay_data->min_delay = NRS_DELAY_MIN_DEFAULT; + delay_data->max_delay = NRS_DELAY_MAX_DEFAULT; + delay_data->delay_pct = NRS_DELAY_PCT_DEFAULT; + + policy->pol_private = delay_data; + + RETURN(0); +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the delay-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_delay_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + LASSERT(delay_data != NULL); + LASSERT(delay_data->delay_binheap != NULL); + LASSERT(cfs_binheap_is_empty(delay_data->delay_binheap)); + + cfs_binheap_destroy(delay_data->delay_binheap); + + OBD_FREE_PTR(delay_data); +} + +/** + * Is called for obtaining a delay policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The delay policy only has a one-level resource hierarchy + * + * \see nrs_resource_get_safe() + */ +static int nrs_delay_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_delay_data, and end this + * resource hierarchy reference request. + */ + *resp = &((struct nrs_delay_data *)policy->pol_private)->delay_res; + return 1; +} + +/** + * Called when getting a request from the delay policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * Requests are only removed from this policy when their start time has + * passed. 
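+ * When \a force is set the start-time check is skipped, so the request
+ * at the heap root is handed out (and removed, unless \a peek is also
+ * set) even if its start time has not been reached yet.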
+ * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request + * + * \retval The request to be handled + * \retval NULL no request available + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_delay_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + struct cfs_binheap_node *node; + struct ptlrpc_nrs_request *nrq; + + node = cfs_binheap_root(delay_data->delay_binheap); + nrq = unlikely(node == NULL) ? NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(nrq != NULL)) { + if (!force && + ktime_get_real_seconds() < nrq->nr_u.delay.req_start_time) + nrq = NULL; + else if (likely(!peek)) + cfs_binheap_remove(delay_data->delay_binheap, + &nrq->nr_node); + } + + return nrq; +} + +/** + * Adds request \a nrq to a delay \a policy instance's set of queued requests + * + * A percentage (delay_pct) of incoming requests are delayed by this policy. + * If selected for delay a request start time is calculated. A start time + * is the current time plus a random offset in the range [min_delay, max_delay] + * The start time is recorded in the request, and is then used by + * delay_req_compare() to maintain a set of requests ordered by their start + * times. + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 request added + * \retval 1 request not added + * + */ +static int nrs_delay_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + if (delay_data->delay_pct == 0 || /* Not delaying anything */ + (delay_data->delay_pct != 100 && + delay_data->delay_pct < cfs_rand() % 100)) + return 1; + + nrq->nr_u.delay.req_start_time = ktime_get_real_seconds() + cfs_rand() % + (delay_data->max_delay - + delay_data->min_delay + 1) + + delay_data->min_delay; + + return cfs_binheap_insert(delay_data->delay_binheap, &nrq->nr_node); +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_delay_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + cfs_binheap_remove(delay_data->delay_binheap, &nrq->nr_node); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. 
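+ * The message reports the delay that was applied, computed as the
+ * difference between the request's assigned start time and its arrival
+ * time.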
+ * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_delay_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + DEBUG_REQ(D_RPCTRACE, req, + "NRS: finished delayed request from %s after %llds", + libcfs_id2str(req->rq_peer), + (s64)(nrq->nr_u.delay.req_start_time - + req->rq_srv.sr_arrival_time.tv_sec)); +} + +/** + * Performs ctl functions specific to delay policy instances; similar to ioctl + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + __u32 *val = (__u32 *)arg; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch ((enum nrs_ctl_delay)opc) { + default: + RETURN(-EINVAL); + + case NRS_CTL_DELAY_RD_MIN: + *val = delay_data->min_delay; + break; + + case NRS_CTL_DELAY_WR_MIN: + if (*val > delay_data->max_delay) + RETURN(-EINVAL); + + delay_data->min_delay = *val; + break; + + case NRS_CTL_DELAY_RD_MAX: + *val = delay_data->max_delay; + break; + + case NRS_CTL_DELAY_WR_MAX: + if (*val < delay_data->min_delay) + RETURN(-EINVAL); + + delay_data->max_delay = *val; + break; + + case NRS_CTL_DELAY_RD_PCT: + *val = delay_data->delay_pct; + break; + + case NRS_CTL_DELAY_WR_PCT: + if (*val < 0 || *val > 100) + RETURN(-EINVAL); + + delay_data->delay_pct = *val; + break; + } + RETURN(0); +} + +/** + * debugfs interface + */ + +/* nrs_delay_min and nrs_delay_max are bounded by these values */ +#define LPROCFS_NRS_DELAY_LOWER_BOUND 0 +#define LPROCFS_NRS_DELAY_UPPER_BOUND 65535 + +#define LPROCFS_NRS_DELAY_MIN_NAME "delay_min:" +#define LPROCFS_NRS_DELAY_MIN_NAME_REG "reg_delay_min:" +#define LPROCFS_NRS_DELAY_MIN_NAME_HP "hp_delay_min:" + +/** + * Max size of the nrs_delay_min seq_write buffer. Needs to be large enough + * to hold the string: "reg_min_delay:65535 hp_min_delay:65535" + */ +#define LPROCFS_NRS_DELAY_MIN_SIZE \ + sizeof(LPROCFS_NRS_DELAY_MIN_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND) \ + " " LPROCFS_NRS_DELAY_MIN_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)) + +#define LPROCFS_NRS_DELAY_MAX_NAME "delay_max:" +#define LPROCFS_NRS_DELAY_MAX_NAME_REG "reg_delay_max:" +#define LPROCFS_NRS_DELAY_MAX_NAME_HP "hp_delay_max:" + +/** + * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_max + * variable. + */ +#define LPROCFS_NRS_DELAY_MAX_SIZE \ + sizeof(LPROCFS_NRS_DELAY_MAX_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND) \ + " " LPROCFS_NRS_DELAY_MAX_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)) + +#define LPROCFS_NRS_DELAY_PCT_MIN_VAL 0 +#define LPROCFS_NRS_DELAY_PCT_MAX_VAL 100 +#define LPROCFS_NRS_DELAY_PCT_NAME "delay_pct:" +#define LPROCFS_NRS_DELAY_PCT_NAME_REG "reg_delay_pct:" +#define LPROCFS_NRS_DELAY_PCT_NAME_HP "hp_delay_pct:" + +/** + * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_pct + * variable. 
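+ * It must be large enough to hold the string:
+ * "reg_delay_pct:100 hp_delay_pct:100"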
+ */ +#define LPROCFS_NRS_DELAY_PCT_SIZE \ + sizeof(LPROCFS_NRS_DELAY_PCT_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL) \ + " " LPROCFS_NRS_DELAY_PCT_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL)) + +/** + * Helper for delay's seq_write functions. + */ +static ssize_t +lprocfs_nrs_delay_seq_write_common(const char __user *buffer, + unsigned int bufsize, size_t count, + const char *var_name, unsigned int min_val, + unsigned int max_val, + struct ptlrpc_service *svc, char *pol_name, + enum ptlrpc_nrs_ctl opc, bool single) +{ + enum ptlrpc_nrs_queue_type queue = 0; + char *kernbuf; + char *val_str; + long unsigned int val_reg; + long unsigned int val_hp; + size_t count_copy; + int rc = 0; + char *tmp = NULL; + int tmpsize = 0; + + if (count > bufsize - 1) + return -EINVAL; + + OBD_ALLOC(kernbuf, bufsize); + if (kernbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(free_kernbuf, rc = -EFAULT); + + tmpsize = strlen("reg_") + strlen(var_name) + 1; + OBD_ALLOC(tmp, tmpsize); + if (tmp == NULL) + GOTO(free_tmp, rc = -ENOMEM); + + /* look for "reg_" in kernbuf */ + snprintf(tmp, tmpsize, "reg_%s", var_name); + count_copy = count; + val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy); + if (val_str != kernbuf) { + rc = kstrtoul(val_str, 10, &val_reg); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + queue |= PTLRPC_NRS_QUEUE_REG; + } + + /* look for "hp_" in kernbuf */ + snprintf(tmp, tmpsize, "hp_%s", var_name); + count_copy = count; + val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy); + if (val_str != kernbuf) { + if (!nrs_svc_has_hp(svc)) + GOTO(free_tmp, rc = -ENODEV); + + rc = kstrtoul(val_str, 10, &val_hp); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if (queue == 0) { + if (!isdigit(kernbuf[0])) + GOTO(free_tmp, rc = -EINVAL); + + rc = kstrtoul(kernbuf, 10, &val_reg); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + val_hp = val_reg; + } + } + + if (queue & PTLRPC_NRS_QUEUE_REG) { + if (val_reg > max_val || val_reg < min_val) + GOTO(free_tmp, rc = -EINVAL); + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + pol_name, opc, single, &val_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + GOTO(free_tmp, rc); + } + + if (queue & PTLRPC_NRS_QUEUE_HP) { + int rc2 = 0; + if (val_hp > max_val || val_hp < min_val) + GOTO(free_tmp, rc = -EINVAL); + + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + pol_name, opc, single, &val_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + GOTO(free_tmp, rc = rc2); + } + + /* If we've reached here then we want to return count */ + rc = count; + +free_tmp: + OBD_FREE(tmp, tmpsize); +free_kernbuf: + OBD_FREE(kernbuf, bufsize); + + return rc; +} + +/** + * Retrieves the value of the minimum delay for delay policy instances on both + * the regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_min_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int min_delay; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MIN, + true, &min_delay); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_REG"%-5d\n", 
+ min_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MIN, + true, &min_delay); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_HP"%-5d\n", + min_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the minimum request delay for delay policy instances of a + * service. The user can set the minimum request delay for the regular or high + * priority NRS head individually by specifying each value, or both together in + * a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_delay_min=reg_delay_min:5, to set the regular + * request minimum delay on all PtlRPC services to 5 seconds + * + * lctl set_param *.*.*.nrs_delay_min=hp_delay_min:2, to set the high-priority + * request minimum delay on all PtlRPC services to 2 seconds, and + * + * lctl set_param *.*.ost_io.nrs_delay_min=8, to set both the regular and + * high priority request minimum delay of the ost_io service to 8 seconds. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_MIN_SIZE, + count, + LPROCFS_NRS_DELAY_MIN_NAME, + LPROCFS_NRS_DELAY_LOWER_BOUND, + LPROCFS_NRS_DELAY_UPPER_BOUND, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_MIN, false); +} +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); + +/** + * Retrieves the value of the maximum delay for delay policy instances on both + * the regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_max_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int max_delay; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MAX, + true, &max_delay); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_REG"%-5d\n", + max_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MAX, + true, &max_delay); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_HP"%-5d\n", + max_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the maximum request delay for delay policy instances of a + * service. The user can set the maximum request delay for the regular or high + * priority NRS head individually by specifying each value, or both together in + * a single invocation. 
+ * + * For example: + * + * lctl set_param *.*.*.nrs_delay_max=reg_delay_max:20, to set the regular + * request maximum delay on all PtlRPC services to 20 seconds + * + * lctl set_param *.*.*.nrs_delay_max=hp_delay_max:10, to set the high-priority + * request maximum delay on all PtlRPC services to 10 seconds, and + * + * lctl set_param *.*.ost_io.nrs_delay_max=35, to set both the regular and + * high priority request maximum delay of the ost_io service to 35 seconds. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_MAX_SIZE, + count, + LPROCFS_NRS_DELAY_MAX_NAME, + LPROCFS_NRS_DELAY_LOWER_BOUND, + LPROCFS_NRS_DELAY_UPPER_BOUND, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_MAX, false); +} +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); + +/** + * Retrieves the value of the percentage of requests which should be delayed + * for delay policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_pct_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int delay_pct; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_PCT, + true, &delay_pct); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_REG"%-3d\n", + delay_pct); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_PCT, + true, &delay_pct); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_HP"%-3d\n", + delay_pct); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the percentage of requests to be delayed for delay policy + * instances of a service. The user can set the percentage for the regular or + * high-priority NRS head individually by specifying each value, or both + * together in a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_delay_pct=reg_delay_pct:5, to delay 5 percent of + * regular requests on all PtlRPC services + * + * lctl set_param *.*.*.nrs_delay_pct=hp_delay_pct:2, to delay 2 percent of + * high-priority requests on all PtlRPC services, and + * + * lctl set_param *.*.ost_io.nrs_delay_pct=8, to delay 8 percent of both + * regular and high-priority requests of the ost_io service. 
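+ *
+ * Note that nrs_delay_ctl() rejects the write with -EINVAL if the new
+ * maximum delay is smaller than the currently configured minimum delay.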
+ */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_PCT_SIZE, + count, + LPROCFS_NRS_DELAY_PCT_NAME, + LPROCFS_NRS_DELAY_PCT_MIN_VAL, + LPROCFS_NRS_DELAY_PCT_MAX_VAL, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_PCT, false); +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); + +static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_delay_lprocfs_vars[] = { + { .name = "nrs_delay_min", + .fops = &ptlrpc_lprocfs_nrs_delay_min_fops, + .data = svc }, + { .name = "nrs_delay_max", + .fops = &ptlrpc_lprocfs_nrs_delay_max_fops, + .data = svc }, + { .name = "nrs_delay_pct", + .fops = &ptlrpc_lprocfs_nrs_delay_pct_fops, + .data = svc }, + { NULL } + }; + + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + return 0; + + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_delay_lprocfs_vars, + NULL); +} + +/** + * Delay policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_delay_ops = { + .op_policy_start = nrs_delay_start, + .op_policy_stop = nrs_delay_stop, + .op_policy_ctl = nrs_delay_ctl, + .op_res_get = nrs_delay_res_get, + .op_req_get = nrs_delay_req_get, + .op_req_enqueue = nrs_delay_req_add, + .op_req_dequeue = nrs_delay_req_del, + .op_req_stop = nrs_delay_req_stop, + .op_lprocfs_init = nrs_delay_lprocfs_init, +}; + +/** + * Delay policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_delay = { + .nc_name = NRS_POL_NAME_DELAY, + .nc_ops = &nrs_delay_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} delay */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c new file mode 100644 index 0000000000000..369b59978b47f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c @@ -0,0 +1,271 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_fifo.c + * + * Network Request Scheduler (NRS) FIFO policy + * + * Handles RPCs in a FIFO manner, as received from the network. This policy is + * a logical wrapper around previous, non-NRS functionality. It is used as the + * default and fallback policy for all types of RPCs on all PTLRPC service + * partitions, for both regular and high-priority NRS heads. 
Default here means + * the policy is the one enabled at PTLRPC service partition startup time, and + * fallback means the policy is used to handle RPCs that are not handled + * successfully or are not handled at all by any primary policy that may be + * enabled on a given NRS head. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name fifo + * + * The FIFO policy is a logical wrapper around previous, non-NRS functionality. + * It schedules RPCs in the same order as they are queued from LNet. + * + * @{ + */ + +#define NRS_POL_NAME_FIFO "fifo" + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. + * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_fifo_head *head; + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&head->fh_list); + policy->pol_private = head; + return 0; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head = policy->pol_private; + + LASSERT(head != NULL); + LASSERT(list_empty(&head->fh_list)); + + OBD_FREE_PTR(head); +} + +/** + * Is called for obtaining a FIFO policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The FIFO policy only has a one-level resource hierarchy, as since + * it implements a simple scheduling algorithm in which request + * priority is determined on the request arrival order, it does not + * need to maintain a set of resources that would otherwise be used + * to calculate a request's priority. + * + * \see nrs_resource_get_safe() + */ +static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_fifo_head, and end this + * resource hierarchy reference request. + */ + *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res; + return 1; +} + +/** + * Called when getting a request from the FIFO policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. 
+ * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the FIFO + * queue + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request * nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_fifo_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(list_empty(&head->fh_list)) ? NULL : + list_entry(head->fh_list.next, struct ptlrpc_nrs_request, + nr_u.fifo.fr_list); + + if (likely(!peek && nrq != NULL)) { + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + list_del_init(&nrq->nr_u.fifo.fr_list); + + CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: %llu" + "\n", policy->pol_desc->pd_name, + libcfs_id2str(req->rq_peer), nrq->nr_u.fifo.fr_sequence); + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_fifo_head *head; + + head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head, + fh_res); + /** + * Only used for debugging + */ + nrq->nr_u.fifo.fr_sequence = head->fh_sequence++; + list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list); + + return 0; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list)); + list_del_init(&nrq->nr_u.fifo.fr_list); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. + * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.fifo.fr_sequence); +} + +/** + * FIFO policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = { + .op_policy_start = nrs_fifo_start, + .op_policy_stop = nrs_fifo_stop, + .op_res_get = nrs_fifo_res_get, + .op_req_get = nrs_fifo_req_get, + .op_req_enqueue = nrs_fifo_req_add, + .op_req_dequeue = nrs_fifo_req_del, + .op_req_stop = nrs_fifo_req_stop, +}; + +/** + * FIFO policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_fifo = { + .nc_name = NRS_POL_NAME_FIFO, + .nc_ops = &nrs_fifo_ops, + .nc_compat = nrs_policy_compat_all, + .nc_flags = PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START +}; + +/** @} fifo */ + +/** @} nrs */ + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c new file mode 100644 index 0000000000000..8b8e092dd8209 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c @@ -0,0 +1,1971 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_orr.c + * + * Network Request Scheduler (NRS) ORR and TRR policies + * + * Request scheduling in a Round-Robin manner over backend-fs objects and OSTs + * respectively + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +#ifdef HAVE_SERVER_SUPPORT + +/** + * \addtogoup nrs + * @{ + */ +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name ORR/TRR policy + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * + * ORR performs batched Round Robin shceduling of brw RPCs, based on the FID of + * the backend-fs object that the brw RPC pertains to; the TRR policy performs + * batched Round Robin scheduling of brw RPCs, based on the OST index that the + * RPC pertains to. Both policies also order RPCs in each batch in ascending + * offset order, which is lprocfs-tunable between logical file offsets, and + * physical disk offsets, as reported by fiemap. + * + * The TRR policy reuses much of the functionality of ORR. These two scheduling + * algorithms could alternatively be implemented under a single NRS policy, that + * uses an lprocfs tunable in order to switch between the two types of + * scheduling behaviour. The two algorithms have been implemented as separate + * policies for reasons of clarity to the user, and to avoid issues that would + * otherwise arise at the point of switching between behaviours in the case of + * having a single policy, such as resource cleanup for nrs_orr_object + * instances. It is possible that this may need to be re-examined in the future, + * along with potentially coalescing other policies that perform batched request + * scheduling in a Round-Robin manner, all into one policy. + * + * @{ + */ + +#define NRS_POL_NAME_ORR "orr" +#define NRS_POL_NAME_TRR "trr" + +/** + * Checks if the RPC type of \a nrq is currently handled by an ORR/TRR policy + * + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] nrq the request + * \param[out] opcode the opcode is saved here, just in order to avoid calling + * lustre_msg_get_opc() again later + * + * \retval true request type is supported by the policy instance + * \retval false request type is not supported by the policy instance + */ +static bool nrs_orr_req_supported(struct nrs_orr_data *orrd, + struct ptlrpc_nrs_request *nrq, __u32 *opcode) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + bool rc = false; + + /** + * XXX: nrs_orr_data::od_supp accessed unlocked. 
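+ * A momentarily stale value is presumably tolerable here: at worst a
+ * request races with an update of the supported-RPC mask and is either
+ * scheduled by this policy or deferred to the fallback policy based on
+ * the old setting.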
+ */ + switch (opc) { + case OST_READ: + rc = orrd->od_supp & NOS_OST_READ; + break; + case OST_WRITE: + rc = orrd->od_supp & NOS_OST_WRITE; + break; + } + + if (rc) + *opcode = opc; + + return rc; +} + +/** + * Returns the ORR/TRR key fields for the request \a nrq in \a key. + * + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] nrq the request + * \param[in] opc the request's opcode + * \param[in] name the policy name + * \param[out] key fields of the key are returned here. + * + * \retval 0 key filled successfully + * \retval < 0 error + */ +static int nrs_orr_key_fill(struct nrs_orr_data *orrd, + struct ptlrpc_nrs_request *nrq, __u32 opc, + char *name, struct nrs_orr_key *key) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + struct ost_body *body; + __u32 ost_idx; + bool is_orr = strncmp(name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0; + + LASSERT(req != NULL); + + /** + * This is an attempt to fill in the request key fields while + * moving a request from the regular to the high-priority NRS + * head (via ldlm_lock_reorder_req()), but the request key has + * been adequately filled when nrs_orr_res_get() was called through + * ptlrpc_nrs_req_initialize() for the regular NRS head's ORR/TRR + * policy, so there is nothing to do. + */ + if ((is_orr && nrq->nr_u.orr.or_orr_set) || + (!is_orr && nrq->nr_u.orr.or_trr_set)) { + *key = nrq->nr_u.orr.or_key; + return 0; + } + + /* Bounce unconnected requests to the default policy. */ + if (req->rq_export == NULL) + return -ENOTCONN; + + if (nrq->nr_u.orr.or_orr_set || nrq->nr_u.orr.or_trr_set) + memset(&nrq->nr_u.orr.or_key, 0, sizeof(nrq->nr_u.orr.or_key)); + + ost_idx = class_server_data(req->rq_export->exp_obd)->lsd_osd_index; + + if (is_orr) { + int rc; + /** + * The request pill for OST_READ and OST_WRITE requests is + * initialized in the ost_io service's + * ptlrpc_service_ops::so_hpreq_handler, ost_io_hpreq_handler(), + * so no need to redo it here. + */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + RETURN(-EFAULT); + + rc = ostid_to_fid(&key->ok_fid, &body->oa.o_oi, ost_idx); + if (rc < 0) + return rc; + + nrq->nr_u.orr.or_orr_set = 1; + } else { + key->ok_idx = ost_idx; + nrq->nr_u.orr.or_trr_set = 1; + } + + return 0; +} + +/** + * Populates the range values in \a range with logical offsets obtained via + * \a nb. + * + * \param[in] nb niobuf_remote struct array for this request + * \param[in] niocount count of niobuf_remote structs for this request + * \param[out] range the offset range is returned here + */ +static void nrs_orr_range_fill_logical(struct niobuf_remote *nb, int niocount, + struct nrs_orr_req_range *range) +{ + /* Should we do this at page boundaries ? */ + range->or_start = nb[0].rnb_offset & PAGE_MASK; + range->or_end = (nb[niocount - 1].rnb_offset + + nb[niocount - 1].rnb_len - 1) | ~PAGE_MASK; +} + +/** + * We obtain information just for a single extent, as the request can only be in + * a single place in the binary heap anyway. + */ +#define ORR_NUM_EXTENTS 1 + +/** + * Converts the logical file offset range in \a range, to a physical disk offset + * range in \a range, for a request. Uses obd_get_info() in order to carry out a + * fiemap call and obtain backend-fs extent information. The returned range is + * in physical block numbers. 
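+ * For illustration (made-up numbers): if the logical range is
+ * [4096, 8191] and fiemap reports a single extent with fe_logical == 0
+ * and fe_physical == 1048576, the translated range becomes
+ * [1052672, 1056767].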
+ * + * \param[in] nrq the request + * \param[in] oa obdo struct for this request + * \param[in,out] range the offset range in bytes; logical range in, physical + * range out + * + * \retval 0 physical offsets obtained successfully + * \retvall < 0 error + */ +static int nrs_orr_range_fill_physical(struct ptlrpc_nrs_request *nrq, + struct obdo *oa, + struct nrs_orr_req_range *range) +{ + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + char fiemap_buf[offsetof(struct fiemap, + fm_extents[ORR_NUM_EXTENTS])]; + struct fiemap *fiemap = (struct fiemap *)fiemap_buf; + struct ll_fiemap_info_key key; + loff_t start; + loff_t end; + int rc; + + key = (typeof(key)) { + .lfik_name = KEY_FIEMAP, + .lfik_oa = *oa, + .lfik_fiemap = { + .fm_start = range->or_start, + .fm_length = range->or_end - range->or_start, + .fm_extent_count = ORR_NUM_EXTENTS + } + }; + + rc = obd_get_info(req->rq_svc_thread->t_env, req->rq_export, + sizeof(key), &key, NULL, fiemap); + if (rc < 0) + GOTO(out, rc); + + if (fiemap->fm_mapped_extents == 0 || + fiemap->fm_mapped_extents > ORR_NUM_EXTENTS) + GOTO(out, rc = -EFAULT); + + /** + * Calculate the physical offset ranges for the request from the extent + * information and the logical request offsets. + */ + start = fiemap->fm_extents[0].fe_physical + range->or_start - + fiemap->fm_extents[0].fe_logical; + end = start + range->or_end - range->or_start; + + range->or_start = start; + range->or_end = end; + + nrq->nr_u.orr.or_physical_set = 1; +out: + return rc; +} + +/** + * Sets the offset range the request covers; either in logical file + * offsets or in physical disk offsets. + * + * \param[in] nrq the request + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] opc the request's opcode + * \param[in] moving_req is the request in the process of moving onto the + * high-priority NRS head? + * + * \retval 0 range filled successfully + * \retval != 0 error + */ +static int nrs_orr_range_fill(struct ptlrpc_nrs_request *nrq, + struct nrs_orr_data *orrd, __u32 opc, + bool moving_req) +{ + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + struct obd_ioobj *ioo; + struct niobuf_remote *nb; + struct ost_body *body; + struct nrs_orr_req_range range; + int niocount; + int rc = 0; + + /** + * If we are scheduling using physical disk offsets, but we have filled + * the offset information in the request previously + * (i.e. ldlm_lock_reorder_req() is moving the request to the + * high-priority NRS head), there is no need to do anything, and we can + * exit. Moreover than the lack of need, we would be unable to perform + * the obd_get_info() call required in nrs_orr_range_fill_physical(), + * because ldlm_lock_reorder_lock() calls into here while holding a + * spinlock, and retrieving fiemap information via obd_get_info() is a + * potentially sleeping operation. + */ + if (orrd->od_physical && nrq->nr_u.orr.or_physical_set) + return 0; + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + if (ioo == NULL) + GOTO(out, rc = -EFAULT); + + niocount = ioo->ioo_bufcnt; + + nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + if (nb == NULL) + GOTO(out, rc = -EFAULT); + + /** + * Use logical information from niobuf_remote structures. + */ + nrs_orr_range_fill_logical(nb, niocount, &range); + + /** + * Obtain physical offsets if selected, and this is an OST_READ RPC + * RPC. 
We do not enter this block if moving_req is set which indicates + * that the request is being moved to the high-priority NRS head by + * ldlm_lock_reorder_req(), as that function calls in here while holding + * a spinlock, and nrs_orr_range_physical() can sleep, so we just use + * logical file offsets for the range values for such requests. + */ + if (orrd->od_physical && opc == OST_READ && !moving_req) { + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + /** + * Translate to physical block offsets from backend filesystem + * extents. + * Ignore return values; if obtaining the physical offsets + * fails, use the logical offsets. + */ + nrs_orr_range_fill_physical(nrq, &body->oa, &range); + } + + nrq->nr_u.orr.or_range = range; +out: + return rc; +} + +/** + * Generates a character string that can be used in order to register uniquely + * named libcfs_hash and slab objects for ORR/TRR policy instances. The + * character string is unique per policy instance, as it includes the policy's + * name, the CPT number, and a {reg|hp} token, and there is one policy instance + * per NRS head on each CPT, and the policy is only compatible with the ost_io + * service. + * + * \param[in] policy the policy instance + * \param[out] name the character array that will hold the generated name + */ +static void nrs_orr_genobjname(struct ptlrpc_nrs_policy *policy, char *name) +{ + snprintf(name, NRS_ORR_OBJ_NAME_MAX, "%s%s%s%d", + "nrs_", policy->pol_desc->pd_name, + policy->pol_nrs->nrs_queue_type == PTLRPC_NRS_QUEUE_REG ? + "_reg_" : "_hp_", nrs_pol2cptid(policy)); +} + +/** + * ORR/TRR hash operations + */ +#define NRS_ORR_BITS 24 +#define NRS_ORR_BKT_BITS 12 +#define NRS_ORR_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | CFS_HASH_ASSERT_EMPTY) + +#define NRS_TRR_BITS 4 +#define NRS_TRR_BKT_BITS 2 +#define NRS_TRR_HASH_FLAGS CFS_HASH_SPIN_BKTLOCK + +static unsigned +nrs_orr_hop_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct nrs_orr_key), mask); +} + +static void *nrs_orr_hop_key(struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + return &orro->oo_key; +} + +static int nrs_orr_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + + return lu_fid_eq(&orro->oo_key.ok_fid, + &((struct nrs_orr_key *)key)->ok_fid); +} + +static void *nrs_orr_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_orr_object, oo_hnode); +} + +static void nrs_orr_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + orro->oo_ref++; +} + +/** + * Removes an nrs_orr_object the hash and frees its memory, if the object has + * no active users. 
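+ * The reference drop, the user check and the removal from the hash are
+ * all performed under the bucket lock taken via
+ * cfs_hash_bd_get_and_lock(), so they are atomic with respect to
+ * concurrent lookups on the same bucket.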
+ */ +static void nrs_orr_hop_put_free(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + struct nrs_orr_data *orrd = container_of(orro->oo_res.res_parent, + struct nrs_orr_data, od_res); + struct cfs_hash_bd bd; + + cfs_hash_bd_get_and_lock(hs, &orro->oo_key, &bd, 1); + + if (--orro->oo_ref > 1) { + cfs_hash_bd_unlock(hs, &bd, 1); + + return; + } + LASSERT(orro->oo_ref == 1); + + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_bd_unlock(hs, &bd, 1); + + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); +} + +static void nrs_orr_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + orro->oo_ref--; +} + +static int nrs_trr_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + + return orro->oo_key.ok_idx == ((struct nrs_orr_key *)key)->ok_idx; +} + +static void nrs_trr_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + struct nrs_orr_data *orrd = container_of(orro->oo_res.res_parent, + struct nrs_orr_data, od_res); + + LASSERTF(orro->oo_ref == 0, + "Busy NRS TRR policy object for OST with index %u, with %ld " + "refs\n", orro->oo_key.ok_idx, orro->oo_ref); + + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); +} + +static struct cfs_hash_ops nrs_orr_hash_ops = { + .hs_hash = nrs_orr_hop_hash, + .hs_key = nrs_orr_hop_key, + .hs_keycmp = nrs_orr_hop_keycmp, + .hs_object = nrs_orr_hop_object, + .hs_get = nrs_orr_hop_get, + .hs_put = nrs_orr_hop_put_free, + .hs_put_locked = nrs_orr_hop_put, +}; + +static struct cfs_hash_ops nrs_trr_hash_ops = { + .hs_hash = nrs_orr_hop_hash, + .hs_key = nrs_orr_hop_key, + .hs_keycmp = nrs_trr_hop_keycmp, + .hs_object = nrs_orr_hop_object, + .hs_get = nrs_orr_hop_get, + .hs_put = nrs_orr_hop_put, + .hs_put_locked = nrs_orr_hop_put, + .hs_exit = nrs_trr_hop_exit, +}; + +#define NRS_ORR_QUANTUM_DFLT 256 + +/** + * Binary heap predicate. + * + * Uses + * ptlrpc_nrs_request::nr_u::orr::or_round, + * ptlrpc_nrs_request::nr_u::orr::or_sequence, and + * ptlrpc_nrs_request::nr_u::orr::or_range to compare two binheap nodes and + * produce a binary predicate that indicates their relative priority, so that + * the binary heap can perform the necessary sorting operations. + * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 < e2 + */ +static int +orr_req_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + /** + * Requests have been scheduled against a different scheduling round. + */ + if (nrq1->nr_u.orr.or_round < nrq2->nr_u.orr.or_round) + return 1; + else if (nrq1->nr_u.orr.or_round > nrq2->nr_u.orr.or_round) + return 0; + + /** + * Requests have been scheduled against the same scheduling round, but + * belong to a different batch, i.e. they pertain to a different + * backend-fs object (for ORR policy instances) or OST (for TRR policy + * instances). 
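+ *
+ * Taken together with the round comparison above and the offset
+ * comparison below, the heap effectively orders requests by the tuple
+ * (or_round, or_sequence, or_range.or_start, or_range.or_end), in
+ * ascending order.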
+ */ + if (nrq1->nr_u.orr.or_sequence < nrq2->nr_u.orr.or_sequence) + return 1; + else if (nrq1->nr_u.orr.or_sequence > nrq2->nr_u.orr.or_sequence) + return 0; + + /** + * If round numbers and sequence numbers are equal, the two requests + * have been scheduled on the same round, and belong to the same batch, + * which means they pertain to the same backend-fs object (if this is an + * ORR policy instance), or to the same OST (if this is a TRR policy + * instance), so these requests should be sorted by ascending offset + * order. + */ + if (nrq1->nr_u.orr.or_range.or_start < + nrq2->nr_u.orr.or_range.or_start) { + return 1; + } else if (nrq1->nr_u.orr.or_range.or_start > + nrq2->nr_u.orr.or_range.or_start) { + return 0; + } else { + /** + * Requests start from the same offset; Dispatch the shorter one + * first; perhaps slightly more chances of hitting caches like + * this. + */ + return nrq1->nr_u.orr.or_range.or_end < + nrq2->nr_u.orr.or_range.or_end; + } +} + +/** + * ORR binary heap operations + */ +static struct cfs_binheap_ops nrs_orr_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = orr_req_compare, +}; + +/** + * Prints a warning message if an ORR/TRR policy is started on a service with + * more than one CPT. Not printed on the console for now, since we don't + * have any performance metrics in the first place, and it is annoying. + * + * \param[in] policy the policy instance + * + * \retval 0 success + */ +static int nrs_orr_init(struct ptlrpc_nrs_policy *policy) +{ + if (policy->pol_nrs->nrs_svcpt->scp_service->srv_ncpts > 1) + CDEBUG(D_CONFIG, "%s: The %s NRS policy was registered on a " + "service with multiple service partitions. This policy " + "may perform better with a single partition.\n", + policy->pol_nrs->nrs_svcpt->scp_service->srv_name, + policy->pol_desc->pd_name); + + return 0; +} + +/** + * Called when an ORR policy instance is started. + * + * \param[in] policy the policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + */ +static int nrs_orr_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_orr_data *orrd; + struct cfs_hash_ops *ops; + unsigned cur_bits; + unsigned max_bits; + unsigned bkt_bits; + unsigned flags; + int rc = 0; + ENTRY; + + OBD_CPT_ALLOC_PTR(orrd, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (orrd == NULL) + RETURN(-ENOMEM); + + /* + * Binary heap instance for sorted incoming requests. + */ + orrd->od_binheap = cfs_binheap_create(&nrs_orr_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (orrd->od_binheap == NULL) + GOTO(out_orrd, rc = -ENOMEM); + + nrs_orr_genobjname(policy, orrd->od_objname); + + /** + * Slab cache for NRS ORR/TRR objects. + */ + orrd->od_cache = kmem_cache_create(orrd->od_objname, + sizeof(struct nrs_orr_object), + 0, 0, NULL); + if (orrd->od_cache == NULL) + GOTO(out_binheap, rc = -ENOMEM); + + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) { + ops = &nrs_orr_hash_ops; + cur_bits = NRS_ORR_BITS; + max_bits = NRS_ORR_BITS; + bkt_bits = NRS_ORR_BKT_BITS; + flags = NRS_ORR_HASH_FLAGS; + } else { + ops = &nrs_trr_hash_ops; + cur_bits = NRS_TRR_BITS; + max_bits = NRS_TRR_BITS; + bkt_bits = NRS_TRR_BKT_BITS; + flags = NRS_TRR_HASH_FLAGS; + } + + /** + * Hash for finding objects by struct nrs_orr_key. + * XXX: For TRR, it might be better to avoid using libcfs_hash? + * All that needs to be resolved are OST indices, and they + * will stay relatively stable during an OSS node's lifetime. 
+ */ + orrd->od_obj_hash = cfs_hash_create(orrd->od_objname, cur_bits, + max_bits, bkt_bits, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, ops, flags); + if (orrd->od_obj_hash == NULL) + GOTO(out_cache, rc = -ENOMEM); + + /* XXX: Fields accessed unlocked */ + orrd->od_quantum = NRS_ORR_QUANTUM_DFLT; + orrd->od_supp = NOS_DFLT; + orrd->od_physical = true; + /** + * Set to 1 so that the test inside nrs_orr_req_add() can evaluate to + * true. + */ + orrd->od_sequence = 1; + + policy->pol_private = orrd; + + RETURN(rc); + +out_cache: + kmem_cache_destroy(orrd->od_cache); +out_binheap: + cfs_binheap_destroy(orrd->od_binheap); +out_orrd: + OBD_FREE_PTR(orrd); + + RETURN(rc); +} + +/** + * Called when an ORR/TRR policy instance is stopped. + * + * Called when the policy has been instructed to transition to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more + * pending requests to serve. + * + * \param[in] policy the policy + */ +static void nrs_orr_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_orr_data *orrd = policy->pol_private; + ENTRY; + + LASSERT(orrd != NULL); + LASSERT(orrd->od_binheap != NULL); + LASSERT(orrd->od_obj_hash != NULL); + LASSERT(orrd->od_cache != NULL); + LASSERT(cfs_binheap_is_empty(orrd->od_binheap)); + + cfs_binheap_destroy(orrd->od_binheap); + cfs_hash_putref(orrd->od_obj_hash); + kmem_cache_destroy(orrd->od_cache); + + OBD_FREE_PTR(orrd); +} + +/** + * Performs a policy-specific ctl function on ORR/TRR policy instances; similar + * to ioctl. + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried successfully + * \retval -ve error + */ +static int nrs_orr_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch((enum nrs_ctl_orr)opc) { + default: + RETURN(-EINVAL); + + case NRS_CTL_ORR_RD_QUANTUM: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(__u16 *)arg = orrd->od_quantum; + } + break; + + case NRS_CTL_ORR_WR_QUANTUM: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_quantum = *(__u16 *)arg; + LASSERT(orrd->od_quantum != 0); + } + break; + + case NRS_CTL_ORR_RD_OFF_TYPE: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(bool *)arg = orrd->od_physical; + } + break; + + case NRS_CTL_ORR_WR_OFF_TYPE: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_physical = *(bool *)arg; + } + break; + + case NRS_CTL_ORR_RD_SUPP_REQ: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(enum nrs_orr_supp *)arg = orrd->od_supp; + } + break; + + case NRS_CTL_ORR_WR_SUPP_REQ: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_supp = *(enum nrs_orr_supp *)arg; + LASSERT((orrd->od_supp & NOS_OST_RW) != 0); + } + break; + } + RETURN(0); +} + +/** + * Obtains resources for ORR/TRR policy instances. The top-level resource lives + * inside \e nrs_orr_data and the second-level resource inside + * \e nrs_orr_object instances. 
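The error paths of nrs_orr_start() above follow the usual kernel acquire-in-order / release-in-reverse unwind: each allocation gets a label, and a failure jumps to the label that frees only what was already acquired. A minimal standalone sketch of that shape, with hypothetical resource names in place of the binheap, slab cache and hash:

#include <stdlib.h>
#include <errno.h>

struct demo_pol { void *heap, *cache, *hash; };

/* Acquire three resources; on failure, unwind only what was acquired. */
static int demo_start(struct demo_pol *p)
{
        int rc = -ENOMEM;

        p->heap = malloc(64);
        if (p->heap == NULL)
                goto out;
        p->cache = malloc(64);
        if (p->cache == NULL)
                goto out_heap;
        p->hash = malloc(64);
        if (p->hash == NULL)
                goto out_cache;
        return 0;               /* success: caller now owns all three */

out_cache:
        free(p->cache);
out_heap:
        free(p->heap);
out:
        return rc;
}

int main(void)
{
        struct demo_pol p = { NULL, NULL, NULL };

        if (demo_start(&p) != 0)
                return 1;
        free(p.hash); free(p.cache); free(p.heap);
        return 0;
}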
+ * + * \param[in] policy the policy for which resources are being taken for + * request \a nrq + * \param[in] nrq the request for which resources are being taken + * \param[in] parent parent resource, embedded in nrs_orr_data for the + * ORR/TRR policies + * \param[out] resp used to return resource references + * \param[in] moving_req signifies limited caller context; used to perform + * memory allocations in an atomic context in this + * policy + * + * \retval 0 we are returning a top-level, parent resource, one that is + * embedded in an nrs_orr_data object + * \retval 1 we are returning a bottom-level resource, one that is embedded + * in an nrs_orr_object object + * + * \see nrs_resource_get_safe() + */ +static int nrs_orr_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + struct nrs_orr_object *tmp; + struct nrs_orr_key key = { { { 0 } } }; + __u32 opc; + int rc = 0; + + /** + * struct nrs_orr_data is requested. + */ + if (parent == NULL) { + *resp = &((struct nrs_orr_data *)policy->pol_private)->od_res; + return 0; + } + + orrd = container_of(parent, struct nrs_orr_data, od_res); + + /** + * If the request type is not supported, fail the enqueuing; the RPC + * will be handled by the fallback NRS policy. + */ + if (!nrs_orr_req_supported(orrd, nrq, &opc)) + return -1; + + /** + * Fill in the key for the request; OST FID for ORR policy instances, + * and OST index for TRR policy instances. + */ + rc = nrs_orr_key_fill(orrd, nrq, opc, policy->pol_desc->pd_name, &key); + if (rc < 0) + RETURN(rc); + + /** + * Set the offset range the request covers + */ + rc = nrs_orr_range_fill(nrq, orrd, opc, moving_req); + if (rc < 0) + RETURN(rc); + + orro = cfs_hash_lookup(orrd->od_obj_hash, &key); + if (orro != NULL) + goto out; + + OBD_SLAB_CPT_ALLOC_PTR_GFP(orro, orrd->od_cache, + nrs_pol2cptab(policy), nrs_pol2cptid(policy), + moving_req ? GFP_ATOMIC : GFP_NOFS); + if (orro == NULL) + RETURN(-ENOMEM); + + orro->oo_key = key; + orro->oo_ref = 1; + + tmp = cfs_hash_findadd_unique(orrd->od_obj_hash, &orro->oo_key, + &orro->oo_hnode); + if (tmp != orro) { + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); + orro = tmp; + } +out: + /** + * For debugging purposes + */ + nrq->nr_u.orr.or_key = orro->oo_key; + + *resp = &orro->oo_res; + + return 1; +} + +/** + * Called when releasing references to the resource hierachy obtained for a + * request for scheduling using ORR/TRR policy instances + * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_orr_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + + /** + * Do nothing for freeing parent, nrs_orr_data resources. + */ + if (res->res_parent == NULL) + return; + + orro = container_of(res, struct nrs_orr_object, oo_res); + orrd = container_of(res->res_parent, struct nrs_orr_data, od_res); + + cfs_hash_put(orrd->od_obj_hash, &orro->oo_hnode); +} + +/** + * Called when polling an ORR/TRR policy instance for a request so that it can + * be served. Returns the request that is at the root of the binary heap, as + * that is the lowest priority one (i.e. 
libcfs_heap is an implementation of a + * min-heap) + * + * \param[in] policy the policy instance being polled + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force force the policy to return a request; unused in this policy + * + * \retval the request to be handled + * \retval NULL no request available + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_orr_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_orr_data *orrd = policy->pol_private; + struct cfs_binheap_node *node = cfs_binheap_root(orrd->od_binheap); + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(node == NULL) ? NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(!peek && nrq != NULL)) { + struct nrs_orr_object *orro; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + + LASSERT(nrq->nr_u.orr.or_round <= orro->oo_round); + + cfs_binheap_remove(orrd->od_binheap, &nrq->nr_node); + orro->oo_active--; + + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request for object " + "with FID "DFID", from OST with index %u, with " + "round %llu\n", NRS_POL_NAME_ORR, + PFID(&orro->oo_key.ok_fid), + nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); + else + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request from OST " + "with index %u, with round %llu\n", + NRS_POL_NAME_TRR, nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); + + /** Peek at the next request to be served */ + node = cfs_binheap_root(orrd->od_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + orrd->od_round++; + } else { + struct ptlrpc_nrs_request *next; + + next = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (orrd->od_round < next->nr_u.orr.or_round) + orrd->od_round = next->nr_u.orr.or_round; + } + } + + return nrq; +} + +/** + * Sort-adds request \a nrq to an ORR/TRR \a policy instance's set of queued + * requests in the policy's binary heap. + * + * A scheduling round is a stream of requests that have been sorted in batches + * according to the backend-fs object (for ORR policy instances) or OST (for TRR + * policy instances) that they pertain to (as identified by its IDIF FID or OST + * index respectively); there can be only one batch for each object or OST in + * each round. The batches are of maximum size nrs_orr_data:od_quantum. When a + * new request arrives for scheduling for an object or OST that has exhausted + * its quantum in its current round, the request will be scheduled on the next + * scheduling round. Requests are allowed to be scheduled against a round until + * all requests for the round are serviced, so an object or OST might miss a + * round if requests are not scheduled for it for a long enough period of time. + * Objects or OSTs that miss a round will continue with having their next + * request scheduled, starting at the round that requests are being dispatched + * for, at the time of arrival of this request. + * + * Requests are tagged with the round number and a sequence number; the sequence + * number indicates the relative ordering amongst the batches of requests in a + * round, and is identical for all requests in a batch, as is the round number. 
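A compact, self-contained simulation of the round/batch bookkeeping described above may help; it follows the quantum logic of nrs_orr_req_add() for a single object, with the binheap insertion, locking and multi-object sequence traffic omitted, and uses a small demo quantum in place of NRS_ORR_QUANTUM_DFLT (256). All demo_* names are hypothetical.

#include <stdio.h>
#include <stdint.h>

#define DEMO_QUANTUM 4   /* small stand-in for NRS_ORR_QUANTUM_DFLT (256) */

/* Hypothetical per-object state mirroring nrs_orr_object. */
struct demo_obj  { uint64_t round, sequence; int quantum, active; };
/* Hypothetical per-policy state mirroring nrs_orr_data. */
struct demo_data { uint64_t round, sequence; };

/* Tag one incoming request with (round, sequence): a new batch starts when
 * the object's quantum is exhausted, when the policy has moved to a later
 * round, or when the object went idle with quantum left over. */
static void demo_req_add(struct demo_data *d, struct demo_obj *o,
                         uint64_t *round, uint64_t *sequence)
{
        if (o->quantum == 0 || o->round < d->round ||
            (o->active == 0 && o->quantum > 0)) {
                if (o->active == 0 && o->quantum > 0)
                        o->round++;
                if (o->round < d->round)
                        o->round = d->round;
                if (o->sequence < d->sequence)
                        o->sequence = ++d->sequence;
                o->quantum = DEMO_QUANTUM;
        }
        *round = o->round;
        *sequence = o->sequence;
        o->active++;
        if (--o->quantum == 0)
                o->round++;
}

int main(void)
{
        struct demo_data d = { .round = 0, .sequence = 1 };
        struct demo_obj  o = { 0 };
        uint64_t r, s;
        int i;

        /* Six back-to-back requests for one object: the first four share
         * round 0; the fifth and sixth start a new batch in round 1. */
        for (i = 0; i < 6; i++) {
                demo_req_add(&d, &o, &r, &s);
                printf("req %d -> round %llu seq %llu\n", i,
                       (unsigned long long)r, (unsigned long long)s);
        }
        return 0;
}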
+ * The round and sequence numbers are used by orr_req_compare() in order to use + * nrs_orr_data::od_binheap in order to maintain an ordered set of rounds, with + * each round consisting of an ordered set of batches of requests, and each + * batch consisting of an ordered set of requests according to their logical + * file or physical disk offsets. + * + * \param[in] policy the policy + * \param[in] nrq the request to add + * + * \retval 0 request successfully added + * \retval != 0 error + */ +static int nrs_orr_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + int rc; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + orrd = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_orr_data, od_res); + + if (orro->oo_quantum == 0 || orro->oo_round < orrd->od_round || + (orro->oo_active == 0 && orro->oo_quantum > 0)) { + + /** + * If there are no pending requests for the object/OST, but some + * of its quantum still remains unused, which implies we did not + * get a chance to schedule up to its maximum allowed batch size + * of requests in the previous round this object/OST + * participated in, schedule this next request on a new round; + * this avoids fragmentation of request batches caused by + * intermittent inactivity on the object/OST, at the expense of + * potentially slightly increased service time for the request + * batch this request will be a part of. + */ + if (orro->oo_active == 0 && orro->oo_quantum > 0) + orro->oo_round++; + + /** A new scheduling round has commenced */ + if (orro->oo_round < orrd->od_round) + orro->oo_round = orrd->od_round; + + /** I was not the last object/OST that scheduled a request */ + if (orro->oo_sequence < orrd->od_sequence) + orro->oo_sequence = ++orrd->od_sequence; + /** + * Reset the quantum if we have reached the maximum quantum + * size for this batch, or even if we have not managed to + * complete a batch size up to its maximum allowed size. + * XXX: Accessed unlocked + */ + orro->oo_quantum = orrd->od_quantum; + } + + nrq->nr_u.orr.or_round = orro->oo_round; + nrq->nr_u.orr.or_sequence = orro->oo_sequence; + + rc = cfs_binheap_insert(orrd->od_binheap, &nrq->nr_node); + if (rc == 0) { + orro->oo_active++; + if (--orro->oo_quantum == 0) + orro->oo_round++; + } + return rc; +} + +/** + * Removes request \a nrq from an ORR/TRR \a policy instance's set of queued + * requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to remove + */ +static void nrs_orr_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + bool is_root; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + orrd = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_orr_data, od_res); + + LASSERT(nrq->nr_u.orr.or_round <= orro->oo_round); + + is_root = &nrq->nr_node == cfs_binheap_root(orrd->od_binheap); + + cfs_binheap_remove(orrd->od_binheap, &nrq->nr_node); + orro->oo_active--; + + /** + * If we just deleted the node at the root of the binheap, we may have + * to adjust round numbers. 
+ */ + if (unlikely(is_root)) { + /** Peek at the next request to be served */ + struct cfs_binheap_node *node = cfs_binheap_root(orrd->od_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + orrd->od_round++; + } else { + nrq = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (orrd->od_round < nrq->nr_u.orr.or_round) + orrd->od_round = nrq->nr_u.orr.or_round; + } + } +} + +/** + * Called right after the request \a nrq finishes being handled by ORR policy + * instance \a policy. + * + * \param[in] policy the policy that handled the request + * \param[in] nrq the request that was handled + */ +static void nrs_orr_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + /** NB: resource control, credits etc can be added here */ + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request for object with FID " + DFID", from OST with index %u, with round %llu\n", + NRS_POL_NAME_ORR, PFID(&nrq->nr_u.orr.or_key.ok_fid), + nrq->nr_u.orr.or_key.ok_idx, nrq->nr_u.orr.or_round); + else + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request from OST with index %u," + " with round %llu\n", + NRS_POL_NAME_TRR, nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); +} + +/** + * debugfs interface + */ + +/** + * This allows to bundle the policy name into the lprocfs_vars::data pointer + * so that lprocfs read/write functions can be used by both the ORR and TRR + * policies. + */ +static struct nrs_lprocfs_orr_data { + struct ptlrpc_service *svc; + char *name; +} lprocfs_orr_data = { + .name = NRS_POL_NAME_ORR +}, lprocfs_trr_data = { + .name = NRS_POL_NAME_TRR +}; + +/** + * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size) + * for ORR/TRR policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_orr_ctl(). + * + * Quantum values are in # of RPCs, and the output is in YAML format. + * + * For example: + * + * reg_quantum:256 + * hp_quantum:8 + * + * XXX: the CRR-N version of this, ptlrpc_lprocfs_rd_nrs_crrn_quantum() is + * almost identical; it can be reworked and then reused for ORR/TRR. + */ +static int +ptlrpc_lprocfs_nrs_orr_quantum_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + __u16 quantum; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_REG "%-5d\n", quantum); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. 
+ */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, NRS_CTL_ORR_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + + return rc; +} + +/** + * Sets the value of the Round Robin quantum (i.e. the maximum batch size) + * for ORR/TRR policy instances of a service. The user can set the quantum size + * for the regular and high priority NRS head separately by specifying each + * value, or both together in a single invocation. + * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_quantum=req_quantum:64, to set the + * request quantum size of the ORR policy instance on the regular NRS head of + * the ost_io service to 64 + * + * lctl set_param ost.OSS.ost_io.nrs_trr_quantum=hp_quantum:8 to set the request + * quantum size of the TRR policy instance on the high priority NRS head of the + * ost_io service to 8 + * + * lctl set_param ost.OSS.ost_io.nrs_orr_quantum=32, to set both the request + * quantum size of the ORR policy instance on both the regular and the high + * priority NRS head of the ost_io service to 32 + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state + * are skipped later by nrs_orr_ctl(). + * + * XXX: the CRR-N version of this, ptlrpc_lprocfs_wr_nrs_crrn_quantum() is + * almost identical; it can be reworked and then reused for ORR/TRR. + */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_QUANTUM_MAX_CMD]; + char *val; + long quantum_reg; + long quantum_hp; + /** lprocfs_find_named_value() modifies its argument, so keep a copy */ + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, + &count_copy); + if (val != kernbuf) { + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; + queue |= PTLRPC_NRS_QUEUE_REG; + } + + count_copy = count; + + /** + * Check if the high priority quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_HP, + &count_copy); + if (val != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, look for a valid numerical + * value + */ + if (queue == 0) { + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + quantum_hp = quantum_reg; + } + } + + if ((((queue & PTLRPC_NRS_QUEUE_REG) != 0) && + ((quantum_reg > LPROCFS_NRS_QUANTUM_MAX || quantum_reg <= 0))) || + (((queue & PTLRPC_NRS_QUEUE_HP) != 0) && + ((quantum_hp > LPROCFS_NRS_QUANTUM_MAX || quantum_hp <= 0)))) + return 
-EINVAL; + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. + */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_QUANTUM, false, + &quantum_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_QUANTUM, false, + &quantum_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); + +#define LPROCFS_NRS_OFF_NAME_REG "reg_offset_type:" +#define LPROCFS_NRS_OFF_NAME_HP "hp_offset_type:" + +#define LPROCFS_NRS_OFF_NAME_PHYSICAL "physical" +#define LPROCFS_NRS_OFF_NAME_LOGICAL "logical" + +/** + * Retrieves the offset type used by ORR/TRR policy instances on both the + * regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + * policy instances in this state are skipped later by nrs_orr_ctl(). + * + * Offset type information is a (physical|logical) string, and output is + * in YAML format. + * + * For example: + * + * reg_offset_type:physical + * hp_offset_type:logical + */ +static int +ptlrpc_lprocfs_nrs_orr_offset_type_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + bool physical; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED + * or ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, NRS_CTL_ORR_RD_OFF_TYPE, + true, &physical); + if (rc == 0) { + seq_printf(m, LPROCFS_NRS_OFF_NAME_REG"%s\n", + physical ? LPROCFS_NRS_OFF_NAME_PHYSICAL : + LPROCFS_NRS_OFF_NAME_LOGICAL); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. + */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, NRS_CTL_ORR_RD_OFF_TYPE, + true, &physical); + if (rc == 0) { + seq_printf(m, LPROCFS_NRS_OFF_NAME_HP"%s\n", + physical ? LPROCFS_NRS_OFF_NAME_PHYSICAL : + LPROCFS_NRS_OFF_NAME_LOGICAL); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. 
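The return-value convention used by the *_seq_write handlers in this file (per-head -ENODEV is ignored unless every addressed head reports it) can be summarized by a small helper. This is only an illustration of the rule stated in the comments above, not a function that exists in the patch:

#include <errno.h>
#include <stdio.h>

/* Hypothetical helper: combine the per-head results of a write applied to
 * the regular head (rc_reg) and the HP head (rc_hp).  Hard errors are
 * assumed to have been returned already, so only 0 or -ENODEV remain. */
static long demo_combine(long rc_reg, long rc_hp, long count)
{
        if (rc_reg == -ENODEV && rc_hp == -ENODEV)
                return -ENODEV;  /* the policy is stopped on every addressed head */
        return count;            /* at least one head accepted the new value */
}

int main(void)
{
        printf("%ld %ld\n", demo_combine(0, -ENODEV, 16),
               demo_combine(-ENODEV, -ENODEV, 16));
        return 0;
}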
+ */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + return rc; +} + +/** + * Max valid command string is the size of the labels, plus "physical" twice. + * plus a separating ' ' + */ +#define LPROCFS_NRS_WR_OFF_TYPE_MAX_CMD \ + sizeof(LPROCFS_NRS_OFF_NAME_REG LPROCFS_NRS_OFF_NAME_PHYSICAL " " \ + LPROCFS_NRS_OFF_NAME_HP LPROCFS_NRS_OFF_NAME_PHYSICAL) + +/** + * Sets the type of offsets used to order RPCs in ORR/TRR policy instances. The + * user can set offset type for the regular or high priority NRS head + * separately by specifying each value, or both together in a single invocation. + * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_offset_type= + * reg_offset_type:physical, to enable the ORR policy instance on the regular + * NRS head of the ost_io service to use physical disk offset ordering. + * + * lctl set_param ost.OSS.ost_io.nrs_trr_offset_type=logical, to enable the TRR + * policy instances on both the regular ang high priority NRS heads of the + * ost_io service to use logical file offset ordering. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state are + * are skipped later by nrs_orr_ctl(). + */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_OFF_TYPE_MAX_CMD]; + char *val_reg; + char *val_hp; + bool physical_reg; + bool physical_hp; + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular offset type has been specified + */ + val_reg = lprocfs_find_named_value(kernbuf, + LPROCFS_NRS_OFF_NAME_REG, + &count_copy); + if (val_reg != kernbuf) + queue |= PTLRPC_NRS_QUEUE_REG; + + count_copy = count; + + /** + * Check if the high priority offset type has been specified + */ + val_hp = lprocfs_find_named_value(kernbuf, LPROCFS_NRS_OFF_NAME_HP, + &count_copy); + if (val_hp != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, there may be a valid + * command string at the start of the buffer. 
+ */ + if (queue == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + if (strncmp(val_reg, LPROCFS_NRS_OFF_NAME_PHYSICAL, + sizeof(LPROCFS_NRS_OFF_NAME_PHYSICAL) - 1) == 0) + physical_reg = true; + else if (strncmp(val_reg, LPROCFS_NRS_OFF_NAME_LOGICAL, + sizeof(LPROCFS_NRS_OFF_NAME_LOGICAL) - 1) == 0) + physical_reg = false; + else + return -EINVAL; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + if (strncmp(val_hp, LPROCFS_NRS_OFF_NAME_PHYSICAL, + sizeof(LPROCFS_NRS_OFF_NAME_PHYSICAL) - 1) == 0) + physical_hp = true; + else if (strncmp(val_hp, LPROCFS_NRS_OFF_NAME_LOGICAL, + sizeof(LPROCFS_NRS_OFF_NAME_LOGICAL) - 1) == 0) + physical_hp = false; + else + return -EINVAL; + } + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. + */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_OFF_TYPE, false, + &physical_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_OFF_TYPE, false, + &physical_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); + +#define NRS_LPROCFS_REQ_SUPP_NAME_REG "reg_supported:" +#define NRS_LPROCFS_REQ_SUPP_NAME_HP "hp_supported:" + +#define LPROCFS_NRS_SUPP_NAME_READS "reads" +#define LPROCFS_NRS_SUPP_NAME_WRITES "writes" +#define LPROCFS_NRS_SUPP_NAME_READWRITES "reads_and_writes" + +/** + * Translates enum nrs_orr_supp values to a corresponding string. + */ +static const char *nrs_orr_supp2str(enum nrs_orr_supp supp) +{ + switch(supp) { + default: + LBUG(); + case NOS_OST_READ: + return LPROCFS_NRS_SUPP_NAME_READS; + case NOS_OST_WRITE: + return LPROCFS_NRS_SUPP_NAME_WRITES; + case NOS_OST_RW: + return LPROCFS_NRS_SUPP_NAME_READWRITES; + } +} + +/** + * Translates strings to the corresponding enum nrs_orr_supp value + */ +static enum nrs_orr_supp nrs_orr_str2supp(const char *val) +{ + if (strncmp(val, LPROCFS_NRS_SUPP_NAME_READWRITES, + sizeof(LPROCFS_NRS_SUPP_NAME_READWRITES) - 1) == 0) + return NOS_OST_RW; + else if (strncmp(val, LPROCFS_NRS_SUPP_NAME_READS, + sizeof(LPROCFS_NRS_SUPP_NAME_READS) - 1) == 0) + return NOS_OST_READ; + else if (strncmp(val, LPROCFS_NRS_SUPP_NAME_WRITES, + sizeof(LPROCFS_NRS_SUPP_NAME_WRITES) - 1) == 0) + return NOS_OST_WRITE; + else + return -EINVAL; +} + +/** + * Retrieves the type of RPCs handled at the point of invocation by ORR/TRR + * policy instances on both the regular and high-priority NRS head of a service, + * as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_orr_ctl(). 
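One detail of nrs_orr_str2supp() above is worth calling out: the keywords are tested longest first, because the prefix-length strncmp() used for "reads" would also accept "reads_and_writes". A trimmed-down illustration with hypothetical enum values:

#include <stdio.h>
#include <string.h>

enum demo_supp { DEMO_READS = 1, DEMO_WRITES = 2, DEMO_RW = 3, DEMO_BAD = -1 };

/* Longest keyword first: checking "reads" before "reads_and_writes" would
 * misclassify the combined keyword as reads-only. */
static enum demo_supp demo_str2supp(const char *val)
{
        if (strncmp(val, "reads_and_writes", strlen("reads_and_writes")) == 0)
                return DEMO_RW;
        if (strncmp(val, "reads", strlen("reads")) == 0)
                return DEMO_READS;
        if (strncmp(val, "writes", strlen("writes")) == 0)
                return DEMO_WRITES;
        return DEMO_BAD;
}

int main(void)
{
        printf("%d %d %d\n", demo_str2supp("reads_and_writes"),
               demo_str2supp("reads"), demo_str2supp("bogus"));
        return 0;
}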
+ * + * Supported RPC type information is a (reads|writes|reads_and_writes) string, + * and output is in YAML format. + * + * For example: + * + * reg_supported:reads + * hp_supported:reads_and_writes + */ +static int +ptlrpc_lprocfs_nrs_orr_supported_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum nrs_orr_supp supported; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED + * or ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_RD_SUPP_REQ, true, + &supported); + + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_REQ_SUPP_NAME_REG"%s\n", + nrs_orr_supp2str(supported)); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. + */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_RD_SUPP_REQ, true, + &supported); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_REQ_SUPP_NAME_HP"%s\n", + nrs_orr_supp2str(supported)); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + + return rc; +} + +/** + * Max valid command string is the size of the labels, plus "reads_and_writes" + * twice, plus a separating ' ' + */ +#define LPROCFS_NRS_WR_REQ_SUPP_MAX_CMD \ + sizeof(NRS_LPROCFS_REQ_SUPP_NAME_REG LPROCFS_NRS_SUPP_NAME_READWRITES \ + NRS_LPROCFS_REQ_SUPP_NAME_HP LPROCFS_NRS_SUPP_NAME_READWRITES \ + " ") + +/** + * Sets the type of RPCs handled by ORR/TRR policy instances. The user can + * modify this setting for the regular or high priority NRS heads separately, or + * both together in a single invocation. + * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_supported= + * "reg_supported:reads", to enable the ORR policy instance on the regular NRS + * head of the ost_io service to handle OST_READ RPCs. + * + * lctl set_param ost.OSS.ost_io.nrs_trr_supported=reads_and_writes, to enable + * the TRR policy instances on both the regular ang high priority NRS heads of + * the ost_io service to use handle OST_READ and OST_WRITE RPCs. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state are + * are skipped later by nrs_orr_ctl(). 
+ */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_REQ_SUPP_MAX_CMD]; + char *val_reg; + char *val_hp; + enum nrs_orr_supp supp_reg; + enum nrs_orr_supp supp_hp; + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular supported requests setting has been specified + */ + val_reg = lprocfs_find_named_value(kernbuf, + NRS_LPROCFS_REQ_SUPP_NAME_REG, + &count_copy); + if (val_reg != kernbuf) + queue |= PTLRPC_NRS_QUEUE_REG; + + count_copy = count; + + /** + * Check if the high priority supported requests setting has been + * specified + */ + val_hp = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_REQ_SUPP_NAME_HP, + &count_copy); + if (val_hp != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, there may be a valid + * command string at the start of the buffer. + */ + if (queue == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + supp_reg = nrs_orr_str2supp(val_reg); + if (supp_reg == -EINVAL) + return -EINVAL; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + supp_hp = nrs_orr_str2supp(val_hp); + if (supp_hp == -EINVAL) + return -EINVAL; + } + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. + */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_SUPP_REQ, false, + &supp_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_SUPP_REQ, false, + &supp_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); + +static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) +{ + int i; + + struct ldebugfs_vars nrs_orr_lprocfs_vars[] = { + { .name = "nrs_orr_quantum", + .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, + { .name = "nrs_orr_offset_type", + .fops = &ptlrpc_lprocfs_nrs_orr_offset_type_fops }, + { .name = "nrs_orr_supported", + .fops = &ptlrpc_lprocfs_nrs_orr_supported_fops }, + { NULL } + }; + + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + return 0; + + lprocfs_orr_data.svc = svc; + + for (i = 0; i < ARRAY_SIZE(nrs_orr_lprocfs_vars); i++) + nrs_orr_lprocfs_vars[i].data = &lprocfs_orr_data; + + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_orr_lprocfs_vars, + NULL); +} + +static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { + .op_policy_init = nrs_orr_init, + .op_policy_start = nrs_orr_start, + .op_policy_stop = nrs_orr_stop, + .op_policy_ctl = nrs_orr_ctl, + .op_res_get = nrs_orr_res_get, + .op_res_put = nrs_orr_res_put, + .op_req_get = nrs_orr_req_get, + .op_req_enqueue = nrs_orr_req_add, + .op_req_dequeue = nrs_orr_req_del, + .op_req_stop = nrs_orr_req_stop, + .op_lprocfs_init = nrs_orr_lprocfs_init, +}; + +struct ptlrpc_nrs_pol_conf nrs_conf_orr = { + .nc_name = NRS_POL_NAME_ORR, + .nc_ops = &nrs_orr_ops, + .nc_compat = nrs_policy_compat_one, + .nc_compat_svc_name = "ost_io", +}; + +/** + * TRR, Target-based Round Robin policy + * + * TRR reuses much of the functions and data structures of ORR + */ +static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) +{ + int i; + + struct ldebugfs_vars nrs_trr_lprocfs_vars[] = { + { .name = "nrs_trr_quantum", + .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, + { .name = "nrs_trr_offset_type", + .fops = &ptlrpc_lprocfs_nrs_orr_offset_type_fops }, + { .name = "nrs_trr_supported", + .fops = &ptlrpc_lprocfs_nrs_orr_supported_fops }, + { NULL } + }; + + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + return 0; + + lprocfs_trr_data.svc = svc; + + for (i = 0; i < ARRAY_SIZE(nrs_trr_lprocfs_vars); i++) + nrs_trr_lprocfs_vars[i].data = &lprocfs_trr_data; + + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_trr_lprocfs_vars, + NULL); +} + +/** + * Reuse much of the ORR functionality for TRR. + */ +static const struct ptlrpc_nrs_pol_ops nrs_trr_ops = { + .op_policy_init = nrs_orr_init, + .op_policy_start = nrs_orr_start, + .op_policy_stop = nrs_orr_stop, + .op_policy_ctl = nrs_orr_ctl, + .op_res_get = nrs_orr_res_get, + .op_res_put = nrs_orr_res_put, + .op_req_get = nrs_orr_req_get, + .op_req_enqueue = nrs_orr_req_add, + .op_req_dequeue = nrs_orr_req_del, + .op_req_stop = nrs_orr_req_stop, + .op_lprocfs_init = nrs_trr_lprocfs_init, +}; + +struct ptlrpc_nrs_pol_conf nrs_conf_trr = { + .nc_name = NRS_POL_NAME_TRR, + .nc_ops = &nrs_trr_ops, + .nc_compat = nrs_policy_compat_one, + .nc_compat_svc_name = "ost_io", +}; + +/** @} ORR/TRR policy */ + +/** @} nrs */ + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c new file mode 100644 index 0000000000000..07710bdb7bfd9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c @@ -0,0 +1,3714 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013 DataDirect Networks, Inc. + * + * Copyright (c) 2014, 2016, Intel Corporation. + */ +/* + * lustre/ptlrpc/nrs_tbf.c + * + * Network Request Scheduler (NRS) Token Bucket Filter(TBF) policy + * + */ + +#ifdef HAVE_SERVER_SUPPORT + +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name tbf + * + * Token Bucket Filter over client NIDs + * + * @{ + */ + +#define NRS_POL_NAME_TBF "tbf" + +static int tbf_jobid_cache_size = 8192; +module_param(tbf_jobid_cache_size, int, 0644); +MODULE_PARM_DESC(tbf_jobid_cache_size, "The size of jobid cache"); + +static int tbf_rate = 10000; +module_param(tbf_rate, int, 0644); +MODULE_PARM_DESC(tbf_rate, "Default rate limit in RPCs/s"); + +static int tbf_depth = 3; +module_param(tbf_depth, int, 0644); +MODULE_PARM_DESC(tbf_depth, "How many tokens that a client can save up"); + +static enum hrtimer_restart nrs_tbf_timer_cb(struct hrtimer *timer) +{ + struct nrs_tbf_head *head = container_of(timer, struct nrs_tbf_head, + th_timer); + struct ptlrpc_nrs *nrs = head->th_res.res_policy->pol_nrs; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + + nrs->nrs_throttling = 0; + wake_up(&svcpt->scp_waitq); + + return HRTIMER_NORESTART; +} + +#define NRS_TBF_DEFAULT_RULE "default" + +static void nrs_tbf_rule_fini(struct nrs_tbf_rule *rule) +{ + LASSERT(atomic_read(&rule->tr_ref) == 0); + LASSERT(list_empty(&rule->tr_cli_list)); + LASSERT(list_empty(&rule->tr_linkage)); + + rule->tr_head->th_ops->o_rule_fini(rule); + OBD_FREE_PTR(rule); +} + +/** + * Decreases the rule's usage reference count, and stops the rule in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). + */ +static void nrs_tbf_rule_put(struct nrs_tbf_rule *rule) +{ + if (atomic_dec_and_test(&rule->tr_ref)) + nrs_tbf_rule_fini(rule); +} + +/** + * Increases the rule's usage reference count. 
+ */ +static inline void nrs_tbf_rule_get(struct nrs_tbf_rule *rule) +{ + atomic_inc(&rule->tr_ref); +} + +static void +nrs_tbf_cli_rule_put(struct nrs_tbf_client *cli) +{ + LASSERT(!list_empty(&cli->tc_linkage)); + LASSERT(cli->tc_rule); + spin_lock(&cli->tc_rule->tr_rule_lock); + list_del_init(&cli->tc_linkage); + spin_unlock(&cli->tc_rule->tr_rule_lock); + nrs_tbf_rule_put(cli->tc_rule); + cli->tc_rule = NULL; +} + +static void +nrs_tbf_cli_reset_value(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) + +{ + struct nrs_tbf_rule *rule = cli->tc_rule; + + cli->tc_rpc_rate = rule->tr_rpc_rate; + cli->tc_nsecs = rule->tr_nsecs; + cli->tc_depth = rule->tr_depth; + cli->tc_ntoken = rule->tr_depth; + cli->tc_check_time = ktime_to_ns(ktime_get()); + cli->tc_rule_sequence = atomic_read(&head->th_rule_sequence); + cli->tc_rule_generation = rule->tr_generation; + + if (cli->tc_in_heap) + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); +} + +static void +nrs_tbf_cli_reset(struct nrs_tbf_head *head, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + spin_lock(&cli->tc_rule_lock); + if (cli->tc_rule != NULL && !list_empty(&cli->tc_linkage)) { + LASSERT(rule != cli->tc_rule); + nrs_tbf_cli_rule_put(cli); + } + LASSERT(cli->tc_rule == NULL); + LASSERT(list_empty(&cli->tc_linkage)); + /* Rule's ref is added before called */ + cli->tc_rule = rule; + spin_lock(&rule->tr_rule_lock); + list_add_tail(&cli->tc_linkage, &rule->tr_cli_list); + spin_unlock(&rule->tr_rule_lock); + spin_unlock(&cli->tc_rule_lock); + nrs_tbf_cli_reset_value(head, cli); +} + +static int +nrs_tbf_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + return rule->tr_head->th_ops->o_rule_dump(rule, m); +} + +static int +nrs_tbf_rule_dump_all(struct nrs_tbf_head *head, struct seq_file *m) +{ + struct nrs_tbf_rule *rule; + int rc = 0; + + LASSERT(head != NULL); + spin_lock(&head->th_rule_lock); + /* List the rules from newest to oldest */ + list_for_each_entry(rule, &head->th_list, tr_linkage) { + LASSERT((rule->tr_flags & NTRS_STOPPING) == 0); + rc = nrs_tbf_rule_dump(rule, m); + if (rc) { + rc = -ENOSPC; + break; + } + } + spin_unlock(&head->th_rule_lock); + + return rc; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_find_nolock(struct nrs_tbf_head *head, + const char *name) +{ + struct nrs_tbf_rule *rule; + + LASSERT(head != NULL); + list_for_each_entry(rule, &head->th_list, tr_linkage) { + LASSERT((rule->tr_flags & NTRS_STOPPING) == 0); + if (strcmp(rule->tr_name, name) == 0) { + nrs_tbf_rule_get(rule); + return rule; + } + } + return NULL; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_find(struct nrs_tbf_head *head, + const char *name) +{ + struct nrs_tbf_rule *rule; + + LASSERT(head != NULL); + spin_lock(&head->th_rule_lock); + rule = nrs_tbf_rule_find_nolock(head, name); + spin_unlock(&head->th_rule_lock); + return rule; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_match(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + struct nrs_tbf_rule *rule = NULL; + struct nrs_tbf_rule *tmp_rule; + + spin_lock(&head->th_rule_lock); + /* Match the newest rule in the list */ + list_for_each_entry(tmp_rule, &head->th_list, tr_linkage) { + LASSERT((tmp_rule->tr_flags & NTRS_STOPPING) == 0); + if (head->th_ops->o_rule_match(tmp_rule, cli)) { + rule = tmp_rule; + break; + } + } + + if (rule == NULL) + rule = head->th_rule; + + nrs_tbf_rule_get(rule); + spin_unlock(&head->th_rule_lock); + return rule; +} + +static void +nrs_tbf_cli_init(struct nrs_tbf_head *head, + struct nrs_tbf_client 
*cli, + struct ptlrpc_request *req) +{ + struct nrs_tbf_rule *rule; + + memset(cli, 0, sizeof(*cli)); + cli->tc_in_heap = false; + head->th_ops->o_cli_init(cli, req); + INIT_LIST_HEAD(&cli->tc_list); + INIT_LIST_HEAD(&cli->tc_linkage); + spin_lock_init(&cli->tc_rule_lock); + atomic_set(&cli->tc_ref, 1); + rule = nrs_tbf_rule_match(head, cli); + nrs_tbf_cli_reset(head, rule, cli); +} + +static void +nrs_tbf_cli_fini(struct nrs_tbf_client *cli) +{ + LASSERT(list_empty(&cli->tc_list)); + LASSERT(!cli->tc_in_heap); + LASSERT(atomic_read(&cli->tc_ref) == 0); + spin_lock(&cli->tc_rule_lock); + nrs_tbf_cli_rule_put(cli); + spin_unlock(&cli->tc_rule_lock); + OBD_FREE_PTR(cli); +} + +static int +nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_rule *rule; + struct nrs_tbf_rule *tmp_rule; + struct nrs_tbf_rule *next_rule; + char *next_name = start->u.tc_start.ts_next_name; + int rc; + + rule = nrs_tbf_rule_find(head, start->tc_name); + if (rule) { + nrs_tbf_rule_put(rule); + return -EEXIST; + } + + OBD_CPT_ALLOC_PTR(rule, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (rule == NULL) + return -ENOMEM; + + memcpy(rule->tr_name, start->tc_name, strlen(start->tc_name)); + rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate; + rule->tr_flags = start->u.tc_start.ts_rule_flags; + rule->tr_nsecs = NSEC_PER_SEC; + do_div(rule->tr_nsecs, rule->tr_rpc_rate); + rule->tr_depth = tbf_depth; + atomic_set(&rule->tr_ref, 1); + INIT_LIST_HEAD(&rule->tr_cli_list); + INIT_LIST_HEAD(&rule->tr_nids); + INIT_LIST_HEAD(&rule->tr_linkage); + spin_lock_init(&rule->tr_rule_lock); + rule->tr_head = head; + + rc = head->th_ops->o_rule_init(policy, rule, start); + if (rc) { + OBD_FREE_PTR(rule); + return rc; + } + + /* Add as the newest rule */ + spin_lock(&head->th_rule_lock); + tmp_rule = nrs_tbf_rule_find_nolock(head, start->tc_name); + if (tmp_rule) { + spin_unlock(&head->th_rule_lock); + nrs_tbf_rule_put(tmp_rule); + nrs_tbf_rule_put(rule); + return -EEXIST; + } + + if (next_name) { + next_rule = nrs_tbf_rule_find_nolock(head, next_name); + if (!next_rule) { + spin_unlock(&head->th_rule_lock); + nrs_tbf_rule_put(rule); + return -ENOENT; + } + + list_add(&rule->tr_linkage, next_rule->tr_linkage.prev); + nrs_tbf_rule_put(next_rule); + } else { + /* Add on the top of the rule list */ + list_add(&rule->tr_linkage, &head->th_list); + } + spin_unlock(&head->th_rule_lock); + atomic_inc(&head->th_rule_sequence); + if (start->u.tc_start.ts_rule_flags & NTRS_DEFAULT) { + rule->tr_flags |= NTRS_DEFAULT; + LASSERT(head->th_rule == NULL); + head->th_rule = rule; + } + + CDEBUG(D_RPCTRACE, "TBF starts rule@%p rate %llu gen %llu\n", + rule, rule->tr_rpc_rate, rule->tr_generation); + + return 0; +} + +/** + * Change the rank of a rule in the rule list + * + * The matched rule will be moved to the position right before another + * given rule. 
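For reference, the rate handling set up in nrs_tbf_rule_start() above stores the cost of one token in nanoseconds (tr_nsecs = NSEC_PER_SEC / rate), and the tbf_depth module parameter bounds how many unused tokens a client may bank. The standalone sketch below shows the token-bucket arithmetic those fields imply; the actual refill/consume step happens elsewhere in this file when requests are polled, so the demo_* names and the exact refill code here are assumptions for illustration only.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Hypothetical per-client token state mirroring tc_ntoken/tc_check_time. */
struct demo_bucket {
        uint64_t nsecs_per_token;  /* NSEC_PER_SEC / rule rate           */
        uint64_t depth;            /* burst limit, cf. tbf_depth         */
        uint64_t ntoken;           /* tokens currently available         */
        uint64_t check_time;       /* time of the last refill, in ns     */
};

/* Refill the bucket at 'now' and try to consume one token. */
static int demo_bucket_consume(struct demo_bucket *b, uint64_t now)
{
        uint64_t new_tokens = (now - b->check_time) / b->nsecs_per_token;

        if (new_tokens > 0) {
                b->check_time += new_tokens * b->nsecs_per_token;
                b->ntoken += new_tokens;
                if (b->ntoken > b->depth)   /* cap the burst */
                        b->ntoken = b->depth;
        }
        if (b->ntoken == 0)
                return 0;                   /* throttled until the next token */
        b->ntoken--;
        return 1;
}

int main(void)
{
        /* Default rule: 10000 RPCs/s => one token every 100000 ns; depth 3. */
        struct demo_bucket b = {
                .nsecs_per_token = NSEC_PER_SEC / 10000,
                .depth = 3, .ntoken = 3, .check_time = 0,
        };
        int i, ok = 0;

        for (i = 0; i < 5; i++)             /* 5 back-to-back requests */
                ok += demo_bucket_consume(&b, 0);
        printf("granted %d of 5 immediately\n", ok);  /* burst of 3 */
        return 0;
}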
+ * + * \param[in] policy the policy instance + * \param[in] head the TBF policy instance + * \param[in] name the rule name to be moved + * \param[in] next_name the rule name before which the matched rule will be + * moved + * + */ +static int +nrs_tbf_rule_change_rank(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + char *name, + char *next_name) +{ + struct nrs_tbf_rule *rule = NULL; + struct nrs_tbf_rule *next_rule = NULL; + int rc = 0; + + LASSERT(head != NULL); + + spin_lock(&head->th_rule_lock); + rule = nrs_tbf_rule_find_nolock(head, name); + if (!rule) + GOTO(out, rc = -ENOENT); + + if (strcmp(name, next_name) == 0) + GOTO(out_put, rc); + + next_rule = nrs_tbf_rule_find_nolock(head, next_name); + if (!next_rule) + GOTO(out_put, rc = -ENOENT); + + list_move(&rule->tr_linkage, next_rule->tr_linkage.prev); + nrs_tbf_rule_put(next_rule); +out_put: + nrs_tbf_rule_put(rule); +out: + spin_unlock(&head->th_rule_lock); + return rc; +} + +static int +nrs_tbf_rule_change_rate(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + char *name, + __u64 rate) +{ + struct nrs_tbf_rule *rule; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + rule = nrs_tbf_rule_find(head, name); + if (rule == NULL) + return -ENOENT; + + rule->tr_rpc_rate = rate; + rule->tr_nsecs = NSEC_PER_SEC; + do_div(rule->tr_nsecs, rule->tr_rpc_rate); + rule->tr_generation++; + nrs_tbf_rule_put(rule); + + return 0; +} + +static int +nrs_tbf_rule_change(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *change) +{ + __u64 rate = change->u.tc_change.tc_rpc_rate; + char *next_name = change->u.tc_change.tc_next_name; + int rc; + + if (rate != 0) { + rc = nrs_tbf_rule_change_rate(policy, head, change->tc_name, + rate); + if (rc) + return rc; + } + + if (next_name) { + rc = nrs_tbf_rule_change_rank(policy, head, change->tc_name, + next_name); + if (rc) + return rc; + } + + return 0; +} + +static int +nrs_tbf_rule_stop(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *stop) +{ + struct nrs_tbf_rule *rule; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + if (strcmp(stop->tc_name, NRS_TBF_DEFAULT_RULE) == 0) + return -EPERM; + + rule = nrs_tbf_rule_find(head, stop->tc_name); + if (rule == NULL) + return -ENOENT; + + list_del_init(&rule->tr_linkage); + rule->tr_flags |= NTRS_STOPPING; + nrs_tbf_rule_put(rule); + nrs_tbf_rule_put(rule); + + return 0; +} + +static int +nrs_tbf_command(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *cmd) +{ + int rc; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch (cmd->tc_cmd) { + case NRS_CTL_TBF_START_RULE: + if (cmd->u.tc_start.ts_valid_type != head->th_type_flag) + return -EINVAL; + + spin_unlock(&policy->pol_nrs->nrs_lock); + rc = nrs_tbf_rule_start(policy, head, cmd); + spin_lock(&policy->pol_nrs->nrs_lock); + return rc; + case NRS_CTL_TBF_CHANGE_RULE: + rc = nrs_tbf_rule_change(policy, head, cmd); + return rc; + case NRS_CTL_TBF_STOP_RULE: + rc = nrs_tbf_rule_stop(policy, head, cmd); + /* Take it as a success, if not exists at all */ + return rc == -ENOENT ? 0 : rc; + default: + return -EFAULT; + } +} + +/** + * Binary heap predicate. 
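Rule precedence in TBF, as implemented by nrs_tbf_rule_match() and nrs_tbf_rule_start() above, is simply list order: new rules go at the head of the list (or just before an explicitly named rule), matching walks the list front to back, and the default rule catches whatever nothing else matched. A minimal list-based sketch of that lookup with hypothetical types and no locking; the real default rule also lives on the list, and is separated out here only for clarity.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Hypothetical rule node: newer rules sit closer to the head of the list. */
struct demo_rule {
        const char *name;
        bool (*match)(const char *client);
        struct demo_rule *next;
};

static const struct demo_rule *
demo_rule_match(const struct demo_rule *head, const struct demo_rule *dflt,
                const char *client)
{
        const struct demo_rule *r;

        for (r = head; r != NULL; r = r->next)  /* newest matching rule wins */
                if (r->match(client))
                        return r;
        return dflt;                            /* "default" catches the rest */
}

static bool match_dd(const char *c)  { return strncmp(c, "dd.", 3) == 0; }
static bool match_all(const char *c) { (void)c; return true; }

int main(void)
{
        struct demo_rule dflt  = { "default", match_all, NULL };
        struct demo_rule newer = { "dd_rule", match_dd,  NULL };

        printf("%s\n", demo_rule_match(&newer, &dflt, "dd.0")->name);
        printf("%s\n", demo_rule_match(&newer, &dflt, "iozone.500")->name);
        return 0;
}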
+ * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 < e2 + */ +static int +tbf_cli_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) +{ + struct nrs_tbf_client *cli1; + struct nrs_tbf_client *cli2; + + cli1 = container_of(e1, struct nrs_tbf_client, tc_node); + cli2 = container_of(e2, struct nrs_tbf_client, tc_node); + + if (cli1->tc_deadline < cli2->tc_deadline) + return 1; + else if (cli1->tc_deadline > cli2->tc_deadline) + return 0; + + if (cli1->tc_check_time < cli2->tc_check_time) + return 1; + else if (cli1->tc_check_time > cli2->tc_check_time) + return 0; + + /* Maybe need more comparasion, e.g. request number in the rules */ + return 1; +} + +/** + * TBF binary heap operations + */ +static struct cfs_binheap_ops nrs_tbf_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = tbf_cli_compare, +}; + +static unsigned nrs_tbf_jobid_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static int nrs_tbf_jobid_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return (strcmp(cli->tc_jobid, key) == 0); +} + +static void *nrs_tbf_jobid_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return cli->tc_jobid; +} + +static void *nrs_tbf_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + +static void nrs_tbf_jobid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_jobid_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void +nrs_tbf_jobid_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_jobid_hash_ops = { + .hs_hash = nrs_tbf_jobid_hop_hash, + .hs_keycmp = nrs_tbf_jobid_hop_keycmp, + .hs_key = nrs_tbf_jobid_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_jobid_hop_get, + .hs_put = nrs_tbf_jobid_hop_put, + .hs_put_locked = nrs_tbf_jobid_hop_put, + .hs_exit = nrs_tbf_jobid_hop_exit, +}; + +#define NRS_TBF_JOBID_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | \ + CFS_HASH_NO_ITEMREF | \ + CFS_HASH_DEPTH) + +static struct nrs_tbf_client * +nrs_tbf_jobid_hash_lookup(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + const char *jobid) +{ + struct hlist_node *hnode; + struct nrs_tbf_client *cli; + + hnode = cfs_hash_bd_lookup_locked(hs, bd, (void *)jobid); + if (hnode == NULL) + return NULL; + + cli = container_of0(hnode, struct nrs_tbf_client, tc_hnode); + if (!list_empty(&cli->tc_lru)) + list_del_init(&cli->tc_lru); + return cli; +} + +#define NRS_TBF_JOBID_NULL "" + +static struct nrs_tbf_client * +nrs_tbf_jobid_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + const char *jobid; + struct nrs_tbf_client *cli; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = 
NRS_TBF_JOBID_NULL; + cfs_hash_bd_get_and_lock(hs, (void *)jobid, &bd, 1); + cli = nrs_tbf_jobid_hash_lookup(hs, &bd, jobid); + cfs_hash_bd_unlock(hs, &bd, 1); + + return cli; +} + +static struct nrs_tbf_client * +nrs_tbf_jobid_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + const char *jobid; + struct nrs_tbf_client *ret; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + jobid = cli->tc_jobid; + cfs_hash_bd_get_and_lock(hs, (void *)jobid, &bd, 1); + ret = nrs_tbf_jobid_hash_lookup(hs, &bd, jobid); + if (ret == NULL) { + cfs_hash_bd_add_locked(hs, &bd, &cli->tc_hnode); + ret = cli; + } + cfs_hash_bd_unlock(hs, &bd, 1); + + return ret; +} + +static void +nrs_tbf_jobid_cli_put(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + struct cfs_hash_bd bd; + struct cfs_hash *hs = head->th_cli_hash; + struct nrs_tbf_bucket *bkt; + int hw; + struct list_head zombies; + + INIT_LIST_HEAD(&zombies); + cfs_hash_bd_get(hs, &cli->tc_jobid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + if (!cfs_hash_bd_dec_and_lock(hs, &bd, &cli->tc_ref)) + return; + LASSERT(list_empty(&cli->tc_lru)); + list_add_tail(&cli->tc_lru, &bkt->ntb_lru); + + /* + * Check and purge the LRU, there is at least one client in the LRU. + */ + hw = tbf_jobid_cache_size >> + (hs->hs_cur_bits - hs->hs_bkt_bits); + while (cfs_hash_bd_count_get(&bd) > hw) { + if (unlikely(list_empty(&bkt->ntb_lru))) + break; + cli = list_entry(bkt->ntb_lru.next, + struct nrs_tbf_client, + tc_lru); + LASSERT(atomic_read(&cli->tc_ref) == 0); + cfs_hash_bd_del_locked(hs, &bd, &cli->tc_hnode); + list_move(&cli->tc_lru, &zombies); + } + cfs_hash_bd_unlock(head->th_cli_hash, &bd, 1); + + while (!list_empty(&zombies)) { + cli = container_of0(zombies.next, + struct nrs_tbf_client, tc_lru); + list_del_init(&cli->tc_lru); + nrs_tbf_cli_fini(cli); + } +} + +static void +nrs_tbf_jobid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + char *jobid = lustre_msg_get_jobid(req->rq_reqmsg); + + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + LASSERT(strlen(jobid) < LUSTRE_JOBID_SIZE); + INIT_LIST_HEAD(&cli->tc_lru); + memcpy(cli->tc_jobid, jobid, strlen(jobid)); +} + +static int nrs_tbf_jobid_hash_order(void) +{ + int bits; + + for (bits = 1; (1 << bits) < tbf_jobid_cache_size; ++bits) + ; + + return bits; +} + +#define NRS_TBF_JOBID_BKT_BITS 10 + +static int +nrs_tbf_jobid_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + struct nrs_tbf_bucket *bkt; + int bits; + int i; + int rc; + struct cfs_hash_bd bd; + + bits = nrs_tbf_jobid_hash_order(); + if (bits < NRS_TBF_JOBID_BKT_BITS) + bits = NRS_TBF_JOBID_BKT_BITS; + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + bits, + bits, + NRS_TBF_JOBID_BKT_BITS, + sizeof(*bkt), + 0, + 0, + &nrs_tbf_jobid_hash_ops, + NRS_TBF_JOBID_HASH_FLAGS); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + cfs_hash_for_each_bucket(head->th_cli_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(head->th_cli_hash, &bd); + INIT_LIST_HEAD(&bkt->ntb_lru); + } + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_jobids_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_jobids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +/** + * Frees jobid of \a list. 
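To make the sizing in nrs_tbf_jobid_hash_order() and the LRU high watermark in nrs_tbf_jobid_cli_put() concrete, here is the arithmetic for the default module parameters. This is a sketch of the computation only, not code from the patch.

#include <stdio.h>

/* Smallest 'bits' with (1 << bits) >= cache_size, as in
 * nrs_tbf_jobid_hash_order(). */
static int demo_hash_order(int cache_size)
{
        int bits;

        for (bits = 1; (1 << bits) < cache_size; bits++)
                ;
        return bits;
}

int main(void)
{
        int cache_size = 8192;                        /* default tbf_jobid_cache_size  */
        int cur_bits = demo_hash_order(cache_size);   /* 13                            */
        int bkt_bits = 10;                            /* NRS_TBF_JOBID_BKT_BITS        */

        /* Per-bucket high watermark used when purging the LRU:
         * hw = cache_size >> (cur_bits - bkt_bits) = 8192 >> 3 = 1024. */
        printf("cur_bits=%d hw=%d\n", cur_bits,
               cache_size >> (cur_bits - bkt_bits));
        return 0;
}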
+ * + */ +static void +nrs_tbf_jobid_list_free(struct list_head *jobid_list) +{ + struct nrs_tbf_jobid *jobid, *n; + + list_for_each_entry_safe(jobid, n, jobid_list, tj_linkage) { + OBD_FREE(jobid->tj_id, strlen(jobid->tj_id) + 1); + list_del(&jobid->tj_linkage); + OBD_FREE(jobid, sizeof(struct nrs_tbf_jobid)); + } +} + +static int +nrs_tbf_jobid_list_add(struct cfs_lstr *id, struct list_head *jobid_list) +{ + struct nrs_tbf_jobid *jobid; + char *ptr; + + OBD_ALLOC(jobid, sizeof(struct nrs_tbf_jobid)); + if (jobid == NULL) + return -ENOMEM; + + OBD_ALLOC(jobid->tj_id, id->ls_len + 1); + if (jobid->tj_id == NULL) { + OBD_FREE(jobid, sizeof(struct nrs_tbf_jobid)); + return -ENOMEM; + } + + memcpy(jobid->tj_id, id->ls_str, id->ls_len); + ptr = lprocfs_strnstr(id->ls_str, "*", id->ls_len); + if (ptr == NULL) + jobid->tj_match_flag = NRS_TBF_MATCH_FULL; + else + jobid->tj_match_flag = NRS_TBF_MATCH_WILDCARD; + + list_add_tail(&jobid->tj_linkage, jobid_list); + return 0; +} + +static bool +cfs_match_wildcard(const char *pattern, const char *content) +{ + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && *content == '\0') + return false; + + while (*pattern == *content) { + pattern++; + content++; + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && + *content == '\0') + return false; + } + + if (*pattern == '*') + return (cfs_match_wildcard(pattern + 1, content) || + cfs_match_wildcard(pattern, content + 1)); + + return false; +} + +static inline bool +nrs_tbf_jobid_match(const struct nrs_tbf_jobid *jobid, const char *id) +{ + if (jobid->tj_match_flag == NRS_TBF_MATCH_FULL) + return strcmp(jobid->tj_id, id) == 0; + + if (jobid->tj_match_flag == NRS_TBF_MATCH_WILDCARD) + return cfs_match_wildcard(jobid->tj_id, id); + + return false; +} + +static int +nrs_tbf_jobid_list_match(struct list_head *jobid_list, char *id) +{ + struct nrs_tbf_jobid *jobid; + + list_for_each_entry(jobid, jobid_list, tj_linkage) { + if (nrs_tbf_jobid_match(jobid, id)) + return 1; + } + return 0; +} + +static int +nrs_tbf_jobid_list_parse(char *str, int len, struct list_head *jobid_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + ENTRY; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(jobid_list); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_jobid_list_add(&res, jobid_list); + if (rc) + break; + } + if (rc) + nrs_tbf_jobid_list_free(jobid_list); + RETURN(rc); +} + +static void nrs_tbf_jobid_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_jobids)) + nrs_tbf_jobid_list_free(&cmd->u.tc_start.ts_jobids); + if (cmd->u.tc_start.ts_jobids_str) + OBD_FREE(cmd->u.tc_start.ts_jobids_str, + strlen(cmd->u.tc_start.ts_jobids_str) + 1); +} + +static int nrs_tbf_check_id_value(struct cfs_lstr *src, char *key) +{ + struct cfs_lstr res; + int keylen = strlen(key); + int rc; + + rc = cfs_gettok(src, '=', &res); + if (rc == 0 || res.ls_len != keylen || + strncmp(res.ls_str, key, keylen) != 0 || + src->ls_len <= 2 || src->ls_str[0] != '{' || + src->ls_str[src->ls_len - 1] != '}') + return -EINVAL; + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + return 0; +} + +static int nrs_tbf_jobid_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "jobid"); + if (rc) + return rc; + + 
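+	/*
+	 * The value between '{' and '}' is a space separated jobid list,
+	 * e.g. "dd.0 iozone.*" (illustrative values only).  An entry that
+	 * contains '*' is flagged NRS_TBF_MATCH_WILDCARD and later matched
+	 * with cfs_match_wildcard(); all other entries need an exact match.
+	 */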
OBD_ALLOC(cmd->u.tc_start.ts_jobids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_jobids_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_jobids_str, src.ls_str, src.ls_len); + + /* parse jobid list */ + rc = nrs_tbf_jobid_list_parse(cmd->u.tc_start.ts_jobids_str, + strlen(cmd->u.tc_start.ts_jobids_str), + &cmd->u.tc_start.ts_jobids); + if (rc) + nrs_tbf_jobid_cmd_fini(cmd); + + return rc; +} + +static int nrs_tbf_jobid_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_jobids_str); + OBD_ALLOC(rule->tr_jobids_str, + strlen(start->u.tc_start.ts_jobids_str) + 1); + if (rule->tr_jobids_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_jobids_str, + start->u.tc_start.ts_jobids_str, + strlen(start->u.tc_start.ts_jobids_str)); + + INIT_LIST_HEAD(&rule->tr_jobids); + if (!list_empty(&start->u.tc_start.ts_jobids)) { + rc = nrs_tbf_jobid_list_parse(rule->tr_jobids_str, + strlen(rule->tr_jobids_str), + &rule->tr_jobids); + if (rc) + CERROR("jobids {%s} illegal\n", rule->tr_jobids_str); + } + if (rc) + OBD_FREE(rule->tr_jobids_str, + strlen(start->u.tc_start.ts_jobids_str) + 1); + return rc; +} + +static int +nrs_tbf_jobid_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_jobids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_jobid_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_jobid_list_match(&rule->tr_jobids, cli->tc_jobid); +} + +static void nrs_tbf_jobid_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_jobids)) + nrs_tbf_jobid_list_free(&rule->tr_jobids); + LASSERT(rule->tr_jobids_str != NULL); + OBD_FREE(rule->tr_jobids_str, strlen(rule->tr_jobids_str) + 1); +} + +static struct nrs_tbf_ops nrs_tbf_jobid_ops = { + .o_name = NRS_TBF_TYPE_JOBID, + .o_startup = nrs_tbf_jobid_startup, + .o_cli_find = nrs_tbf_jobid_cli_find, + .o_cli_findadd = nrs_tbf_jobid_cli_findadd, + .o_cli_put = nrs_tbf_jobid_cli_put, + .o_cli_init = nrs_tbf_jobid_cli_init, + .o_rule_init = nrs_tbf_jobid_rule_init, + .o_rule_dump = nrs_tbf_jobid_rule_dump, + .o_rule_match = nrs_tbf_jobid_rule_match, + .o_rule_fini = nrs_tbf_jobid_rule_fini, +}; + +/** + * libcfs_hash operations for nrs_tbf_net::cn_cli_hash + * + * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash + * nrs_tbf_client objects. 
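+ *
+ * Unlike the jobid hash there is no private LRU here: per-item references
+ * are managed by cfs_hash itself through the get/put operations below.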
+ */ +#define NRS_TBF_NID_BKT_BITS 8 +#define NRS_TBF_NID_BITS 16 + +static unsigned nrs_tbf_nid_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static int nrs_tbf_nid_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + lnet_nid_t *nid = (lnet_nid_t *)key; + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return *nid == cli->tc_nid; +} + +static void *nrs_tbf_nid_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return &cli->tc_nid; +} + +static void nrs_tbf_nid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_nid_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_nid_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERTF(atomic_read(&cli->tc_ref) == 0, + "Busy TBF object from client with NID %s, with %d refs\n", + libcfs_nid2str(cli->tc_nid), atomic_read(&cli->tc_ref)); + + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_nid_hash_ops = { + .hs_hash = nrs_tbf_nid_hop_hash, + .hs_keycmp = nrs_tbf_nid_hop_keycmp, + .hs_key = nrs_tbf_nid_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_nid_hop_get, + .hs_put = nrs_tbf_nid_hop_put, + .hs_put_locked = nrs_tbf_nid_hop_put, + .hs_exit = nrs_tbf_nid_hop_exit, +}; + +static struct nrs_tbf_client * +nrs_tbf_nid_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + return cfs_hash_lookup(head->th_cli_hash, &req->rq_peer.nid); +} + +static struct nrs_tbf_client * +nrs_tbf_nid_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_nid, + &cli->tc_hnode); +} + +static void +nrs_tbf_nid_cli_put(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + cfs_hash_put(head->th_cli_hash, &cli->tc_hnode); +} + +static int +nrs_tbf_nid_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_nid_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_nids_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_nids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static void +nrs_tbf_nid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + cli->tc_nid = req->rq_peer.nid; +} + +static int nrs_tbf_nid_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + LASSERT(start->u.tc_start.ts_nids_str); + OBD_ALLOC(rule->tr_nids_str, + strlen(start->u.tc_start.ts_nids_str) + 1); + if (rule->tr_nids_str == NULL) + return 
-ENOMEM; + + memcpy(rule->tr_nids_str, + start->u.tc_start.ts_nids_str, + strlen(start->u.tc_start.ts_nids_str)); + + INIT_LIST_HEAD(&rule->tr_nids); + if (!list_empty(&start->u.tc_start.ts_nids)) { + if (cfs_parse_nidlist(rule->tr_nids_str, + strlen(rule->tr_nids_str), + &rule->tr_nids) <= 0) { + CERROR("nids {%s} illegal\n", + rule->tr_nids_str); + OBD_FREE(rule->tr_nids_str, + strlen(start->u.tc_start.ts_nids_str) + 1); + return -EINVAL; + } + } + return 0; +} + +static int +nrs_tbf_nid_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_nids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_nid_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return cfs_match_nid(cli->tc_nid, &rule->tr_nids); +} + +static void nrs_tbf_nid_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_nids)) + cfs_free_nidlist(&rule->tr_nids); + LASSERT(rule->tr_nids_str != NULL); + OBD_FREE(rule->tr_nids_str, strlen(rule->tr_nids_str) + 1); +} + +static void nrs_tbf_nid_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_nids)) + cfs_free_nidlist(&cmd->u.tc_start.ts_nids); + if (cmd->u.tc_start.ts_nids_str) + OBD_FREE(cmd->u.tc_start.ts_nids_str, + strlen(cmd->u.tc_start.ts_nids_str) + 1); +} + +static int nrs_tbf_nid_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "nid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_nids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_nids_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_nids_str, src.ls_str, src.ls_len); + + /* parse NID list */ + if (cfs_parse_nidlist(cmd->u.tc_start.ts_nids_str, + strlen(cmd->u.tc_start.ts_nids_str), + &cmd->u.tc_start.ts_nids) <= 0) { + nrs_tbf_nid_cmd_fini(cmd); + return -EINVAL; + } + + return 0; +} + +static struct nrs_tbf_ops nrs_tbf_nid_ops = { + .o_name = NRS_TBF_TYPE_NID, + .o_startup = nrs_tbf_nid_startup, + .o_cli_find = nrs_tbf_nid_cli_find, + .o_cli_findadd = nrs_tbf_nid_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_nid_cli_init, + .o_rule_init = nrs_tbf_nid_rule_init, + .o_rule_dump = nrs_tbf_nid_rule_dump, + .o_rule_match = nrs_tbf_nid_rule_match, + .o_rule_fini = nrs_tbf_nid_rule_fini, +}; + +static unsigned nrs_tbf_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static int nrs_tbf_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return (strcmp(cli->tc_key, key) == 0); +} + +static void *nrs_tbf_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return cli->tc_key; +} + +static void nrs_tbf_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + 
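+	/* hash teardown at policy stop: the client must hold no references */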
LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_hash_ops = { + .hs_hash = nrs_tbf_hop_hash, + .hs_keycmp = nrs_tbf_hop_keycmp, + .hs_key = nrs_tbf_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_hop_get, + .hs_put = nrs_tbf_hop_put, + .hs_put_locked = nrs_tbf_hop_put, + .hs_exit = nrs_tbf_hop_exit, +}; + +#define NRS_TBF_GENERIC_BKT_BITS 10 +#define NRS_TBF_GENERIC_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | \ + CFS_HASH_NO_ITEMREF | \ + CFS_HASH_DEPTH) + +static int +nrs_tbf_startup(struct ptlrpc_nrs_policy *policy, struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + struct nrs_tbf_bucket *bkt; + int bits; + int i; + int rc; + struct cfs_hash_bd bd; + + bits = nrs_tbf_jobid_hash_order(); + if (bits < NRS_TBF_GENERIC_BKT_BITS) + bits = NRS_TBF_GENERIC_BKT_BITS; + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + bits, bits, + NRS_TBF_GENERIC_BKT_BITS, + sizeof(*bkt), 0, 0, + &nrs_tbf_hash_ops, + NRS_TBF_GENERIC_HASH_FLAGS); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + cfs_hash_for_each_bucket(head->th_cli_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(head->th_cli_hash, &bd); + INIT_LIST_HEAD(&bkt->ntb_lru); + } + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_conds_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_conds); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) + cfs_hash_putref(head->th_cli_hash); + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_cli_hash_lookup(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const char *key) +{ + struct hlist_node *hnode; + struct nrs_tbf_client *cli; + + hnode = cfs_hash_bd_lookup_locked(hs, bd, (void *)key); + if (hnode == NULL) + return NULL; + + cli = container_of0(hnode, struct nrs_tbf_client, tc_hnode); + if (!list_empty(&cli->tc_lru)) + list_del_init(&cli->tc_lru); + return cli; +} + +/** + * ONLY opcode presented in this function will be checked in + * nrs_tbf_id_cli_set(). That means, we can add or remove an + * opcode to enable or disable requests handled in nrs_tbf + */ +static struct req_format *req_fmt(__u32 opcode) +{ + switch (opcode) { + case OST_GETATTR: + return &RQF_OST_GETATTR; + case OST_SETATTR: + return &RQF_OST_SETATTR; + case OST_READ: + return &RQF_OST_BRW_READ; + case OST_WRITE: + return &RQF_OST_BRW_WRITE; + /* FIXME: OST_CREATE and OST_DESTROY comes from MDS + * in most case. Should they be removed? 
*/ + case OST_CREATE: + return &RQF_OST_CREATE; + case OST_DESTROY: + return &RQF_OST_DESTROY; + case OST_PUNCH: + return &RQF_OST_PUNCH; + case OST_SYNC: + return &RQF_OST_SYNC; + case OST_LADVISE: + return &RQF_OST_LADVISE; + case MDS_GETATTR: + return &RQF_MDS_GETATTR; + case MDS_GETATTR_NAME: + return &RQF_MDS_GETATTR_NAME; + /* close is skipped to avoid LDLM cancel slowness */ +#if 0 + case MDS_CLOSE: + return &RQF_MDS_CLOSE; +#endif + case MDS_REINT: + return &RQF_MDS_REINT; + case MDS_READPAGE: + return &RQF_MDS_READPAGE; + case MDS_GET_ROOT: + return &RQF_MDS_GET_ROOT; + case MDS_STATFS: + return &RQF_MDS_STATFS; + case MDS_SYNC: + return &RQF_MDS_SYNC; + case MDS_QUOTACTL: + return &RQF_MDS_QUOTACTL; + case MDS_GETXATTR: + return &RQF_MDS_GETXATTR; + case MDS_GET_INFO: + return &RQF_MDS_GET_INFO; + /* HSM op is skipped */ +#if 0 + case MDS_HSM_STATE_GET: + return &RQF_MDS_HSM_STATE_GET; + case MDS_HSM_STATE_SET: + return &RQF_MDS_HSM_STATE_SET; + case MDS_HSM_ACTION: + return &RQF_MDS_HSM_ACTION; + case MDS_HSM_CT_REGISTER: + return &RQF_MDS_HSM_CT_REGISTER; + case MDS_HSM_CT_UNREGISTER: + return &RQF_MDS_HSM_CT_UNREGISTER; +#endif + case MDS_SWAP_LAYOUTS: + return &RQF_MDS_SWAP_LAYOUTS; + case LDLM_ENQUEUE: + return &RQF_LDLM_ENQUEUE; + default: + return NULL; + } +} + +static struct req_format *intent_req_fmt(__u32 it_opc) +{ + if (it_opc & (IT_OPEN | IT_CREAT)) + return &RQF_LDLM_INTENT_OPEN; + else if (it_opc & (IT_GETATTR | IT_LOOKUP)) + return &RQF_LDLM_INTENT_GETATTR; + else if (it_opc & IT_GETXATTR) + return &RQF_LDLM_INTENT_GETXATTR; + else if (it_opc & (IT_GLIMPSE | IT_BRW)) + return &RQF_LDLM_INTENT; + else + return NULL; +} + +static int ost_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body != NULL) { + id->ti_uid = body->oa.o_uid; + id->ti_gid = body->oa.o_gid; + return 0; + } + + return -EINVAL; +} + +static void unpack_ugid_from_mdt_body(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + + /* TODO: nodemaping feature converts {ug}id from individual + * clients to the actual ones of the file system. Some work + * may be needed to fix this. */ + id->ti_uid = b->mbo_uid; + id->ti_gid = b->mbo_gid; +} + +static void unpack_ugid_from_mdt_rec_reint(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_rec_reint *rec; + + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + /* use the fs{ug}id as {ug}id of the process */ + id->ti_uid = rec->rr_fsuid; + id->ti_gid = rec->rr_fsgid; +} + +static int mdt_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + int rc = 0; + + switch (opc) { + case MDS_GETATTR: + case MDS_GETATTR_NAME: + case MDS_GET_ROOT: + case MDS_READPAGE: + case MDS_SYNC: + case MDS_GETXATTR: + case MDS_HSM_STATE_GET ... 
MDS_SWAP_LAYOUTS: + unpack_ugid_from_mdt_body(req, id); + break; + case MDS_CLOSE: + case MDS_REINT: + unpack_ugid_from_mdt_rec_reint(req, id); + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + +static int ldlm_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ldlm_intent *lit; + struct req_format *fmt; + + if (req->rq_reqmsg->lm_bufcount <= DLM_INTENT_IT_OFF) + return -EINVAL; + + req_capsule_extend(&req->rq_pill, &RQF_LDLM_INTENT_BASIC); + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + if (lit == NULL) + return -EINVAL; + + fmt = intent_req_fmt(lit->opc); + if (fmt == NULL) + return -EINVAL; + + req_capsule_extend(&req->rq_pill, fmt); + + if (lit->opc & (IT_GETXATTR | IT_GETATTR | IT_LOOKUP)) + unpack_ugid_from_mdt_body(req, id); + else if (lit->opc & (IT_OPEN | IT_OPEN | IT_GLIMPSE | IT_BRW)) + unpack_ugid_from_mdt_rec_reint(req, id); + else + return -EINVAL; + return 0; +} + +static int nrs_tbf_id_cli_set(struct ptlrpc_request *req, struct tbf_id *id, + enum nrs_tbf_flag ti_type) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + struct req_format *fmt = req_fmt(opc); + bool fmt_unset = false; + int rc; + + memset(id, 0, sizeof(struct tbf_id)); + id->ti_type = ti_type; + + if (fmt == NULL) + return -EINVAL; + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + if (req->rq_pill.rc_fmt == NULL) { + req_capsule_set(&req->rq_pill, fmt); + fmt_unset = true; + } + + if (opc < OST_LAST_OPC) + rc = ost_tbf_id_cli_set(req, id); + else if (opc >= MDS_FIRST_OPC && opc < MDS_LAST_OPC) + rc = mdt_tbf_id_cli_set(req, id); + else if (opc == LDLM_ENQUEUE) + rc = ldlm_tbf_id_cli_set(req, id); + else + rc = -EINVAL; + + /* restore it to the initialized state */ + if (fmt_unset) + req->rq_pill.rc_fmt = NULL; + return rc; +} + +static inline void nrs_tbf_cli_gen_key(struct nrs_tbf_client *cli, + struct ptlrpc_request *req, + char *keystr, size_t keystr_sz) +{ + const char *jobid; + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + struct tbf_id id; + + nrs_tbf_id_cli_set(req, &id, NRS_TBF_FLAG_UID | NRS_TBF_FLAG_GID); + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + + snprintf(keystr, keystr_sz, "%s_%s_%d_%u_%u", jobid, + libcfs_nid2str(req->rq_peer.nid), opc, id.ti_uid, + id.ti_gid); + + if (cli) { + INIT_LIST_HEAD(&cli->tc_lru); + strlcpy(cli->tc_key, keystr, sizeof(cli->tc_key)); + strlcpy(cli->tc_jobid, jobid, sizeof(cli->tc_jobid)); + cli->tc_nid = req->rq_peer.nid; + cli->tc_opcode = opc; + cli->tc_id = id; + } +} + +static struct nrs_tbf_client * +nrs_tbf_cli_find(struct nrs_tbf_head *head, struct ptlrpc_request *req) +{ + struct nrs_tbf_client *cli; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + char keystr[NRS_TBF_KEY_LEN]; + + nrs_tbf_cli_gen_key(NULL, req, keystr, sizeof(keystr)); + cfs_hash_bd_get_and_lock(hs, (void *)keystr, &bd, 1); + cli = nrs_tbf_cli_hash_lookup(hs, &bd, keystr); + cfs_hash_bd_unlock(hs, &bd, 1); + + return cli; +} + +static struct nrs_tbf_client * +nrs_tbf_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + const char *key; + struct nrs_tbf_client *ret; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + key = cli->tc_key; + cfs_hash_bd_get_and_lock(hs, (void *)key, &bd, 1); + ret = nrs_tbf_cli_hash_lookup(hs, &bd, key); + if (ret == NULL) { + cfs_hash_bd_add_locked(hs, &bd, &cli->tc_hnode); + ret = cli; + } + cfs_hash_bd_unlock(hs, &bd, 1); + + return ret; +} + +static void 
+nrs_tbf_cli_put(struct nrs_tbf_head *head, struct nrs_tbf_client *cli) +{ + struct cfs_hash_bd bd; + struct cfs_hash *hs = head->th_cli_hash; + struct nrs_tbf_bucket *bkt; + int hw; + struct list_head zombies; + + INIT_LIST_HEAD(&zombies); + cfs_hash_bd_get(hs, &cli->tc_key, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + if (!cfs_hash_bd_dec_and_lock(hs, &bd, &cli->tc_ref)) + return; + LASSERT(list_empty(&cli->tc_lru)); + list_add_tail(&cli->tc_lru, &bkt->ntb_lru); + + /** + * Check and purge the LRU, there is at least one client in the LRU. + */ + hw = tbf_jobid_cache_size >> (hs->hs_cur_bits - hs->hs_bkt_bits); + while (cfs_hash_bd_count_get(&bd) > hw) { + if (unlikely(list_empty(&bkt->ntb_lru))) + break; + cli = list_entry(bkt->ntb_lru.next, + struct nrs_tbf_client, + tc_lru); + LASSERT(atomic_read(&cli->tc_ref) == 0); + cfs_hash_bd_del_locked(hs, &bd, &cli->tc_hnode); + list_move(&cli->tc_lru, &zombies); + } + cfs_hash_bd_unlock(head->th_cli_hash, &bd, 1); + + while (!list_empty(&zombies)) { + cli = container_of0(zombies.next, + struct nrs_tbf_client, tc_lru); + list_del_init(&cli->tc_lru); + nrs_tbf_cli_fini(cli); + } +} + +static void +nrs_tbf_generic_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + char keystr[NRS_TBF_KEY_LEN]; + + nrs_tbf_cli_gen_key(cli, req, keystr, sizeof(keystr)); +} + +static void +nrs_tbf_id_list_free(struct list_head *uid_list) +{ + struct nrs_tbf_id *nti_id, *n; + + list_for_each_entry_safe(nti_id, n, uid_list, nti_linkage) { + list_del_init(&nti_id->nti_linkage); + OBD_FREE_PTR(nti_id); + } +} + +static void +nrs_tbf_expression_free(struct nrs_tbf_expression *expr) +{ + LASSERT(expr->te_field >= NRS_TBF_FIELD_NID && + expr->te_field < NRS_TBF_FIELD_MAX); + switch (expr->te_field) { + case NRS_TBF_FIELD_NID: + cfs_free_nidlist(&expr->te_cond); + break; + case NRS_TBF_FIELD_JOBID: + nrs_tbf_jobid_list_free(&expr->te_cond); + break; + case NRS_TBF_FIELD_OPCODE: + CFS_FREE_BITMAP(expr->te_opcodes); + break; + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + nrs_tbf_id_list_free(&expr->te_cond); + break; + default: + LBUG(); + } + OBD_FREE_PTR(expr); +} + +static void +nrs_tbf_conjunction_free(struct nrs_tbf_conjunction *conjunction) +{ + struct nrs_tbf_expression *expression; + struct nrs_tbf_expression *n; + + LASSERT(list_empty(&conjunction->tc_linkage)); + list_for_each_entry_safe(expression, n, + &conjunction->tc_expressions, + te_linkage) { + list_del_init(&expression->te_linkage); + nrs_tbf_expression_free(expression); + } + OBD_FREE_PTR(conjunction); +} + +static void +nrs_tbf_conds_free(struct list_head *cond_list) +{ + struct nrs_tbf_conjunction *conjunction; + struct nrs_tbf_conjunction *n; + + list_for_each_entry_safe(conjunction, n, cond_list, tc_linkage) { + list_del_init(&conjunction->tc_linkage); + nrs_tbf_conjunction_free(conjunction); + } +} + +static void +nrs_tbf_generic_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_conds)) + nrs_tbf_conds_free(&cmd->u.tc_start.ts_conds); + if (cmd->u.tc_start.ts_conds_str) + OBD_FREE(cmd->u.tc_start.ts_conds_str, + strlen(cmd->u.tc_start.ts_conds_str) + 1); +} + +#define NRS_TBF_DISJUNCTION_DELIM (',') +#define NRS_TBF_CONJUNCTION_DELIM ('&') +#define NRS_TBF_EXPRESSION_DELIM ('=') + +static inline bool +nrs_tbf_check_field(struct cfs_lstr *field, char *str) +{ + int len = strlen(str); + + return (field->ls_len == len && + strncmp(field->ls_str, str, len) == 0); +} + +static int +nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap 
**bitmaptr); +static int +nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif); + +static int +nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct nrs_tbf_expression *expr; + struct cfs_lstr field; + int rc = 0; + + OBD_ALLOC(expr, sizeof(struct nrs_tbf_expression)); + if (expr == NULL) + return -ENOMEM; + + rc = cfs_gettok(src, NRS_TBF_EXPRESSION_DELIM, &field); + if (rc == 0 || src->ls_len <= 2 || src->ls_str[0] != '{' || + src->ls_str[src->ls_len - 1] != '}') + GOTO(out, rc = -EINVAL); + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + + if (nrs_tbf_check_field(&field, "nid")) { + if (cfs_parse_nidlist(src->ls_str, + src->ls_len, + &expr->te_cond) <= 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_NID; + } else if (nrs_tbf_check_field(&field, "jobid")) { + if (nrs_tbf_jobid_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_JOBID; + } else if (nrs_tbf_check_field(&field, "opcode")) { + if (nrs_tbf_opcode_list_parse(src->ls_str, + src->ls_len, + &expr->te_opcodes) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_OPCODE; + } else if (nrs_tbf_check_field(&field, "uid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_UID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_UID; + } else if (nrs_tbf_check_field(&field, "gid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_GID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_GID; + } else { + GOTO(out, rc = -EINVAL); + } + + list_add_tail(&expr->te_linkage, cond_list); + return 0; +out: + OBD_FREE_PTR(expr); + return rc; +} + +static int +nrs_tbf_conjunction_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct nrs_tbf_conjunction *conjunction; + struct cfs_lstr expr; + int rc = 0; + + OBD_ALLOC(conjunction, sizeof(struct nrs_tbf_conjunction)); + if (conjunction == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&conjunction->tc_expressions); + list_add_tail(&conjunction->tc_linkage, cond_list); + + while (src->ls_str) { + rc = cfs_gettok(src, NRS_TBF_CONJUNCTION_DELIM, &expr); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_expression_parse(&expr, + &conjunction->tc_expressions); + if (rc) + break; + } + return rc; +} + +static int +nrs_tbf_conds_parse(char *str, int len, struct list_head *cond_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(cond_list); + while (src.ls_str) { + rc = cfs_gettok(&src, NRS_TBF_DISJUNCTION_DELIM, &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_conjunction_parse(&res, cond_list); + if (rc) + break; + } + return rc; +} + +static int +nrs_tbf_generic_parse(struct nrs_tbf_cmd *cmd, const char *id) +{ + int rc; + + OBD_ALLOC(cmd->u.tc_start.ts_conds_str, strlen(id) + 1); + if (cmd->u.tc_start.ts_conds_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_conds_str, id, strlen(id)); + + /* Parse hybird NID and JOBID conditions */ + rc = nrs_tbf_conds_parse(cmd->u.tc_start.ts_conds_str, + strlen(cmd->u.tc_start.ts_conds_str), + &cmd->u.tc_start.ts_conds); + if (rc) + nrs_tbf_generic_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id); + +static int +nrs_tbf_expression_match(struct nrs_tbf_expression *expr, + struct 
nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + switch (expr->te_field) { + case NRS_TBF_FIELD_NID: + return cfs_match_nid(cli->tc_nid, &expr->te_cond); + case NRS_TBF_FIELD_JOBID: + return nrs_tbf_jobid_list_match(&expr->te_cond, cli->tc_jobid); + case NRS_TBF_FIELD_OPCODE: + return cfs_bitmap_check(expr->te_opcodes, cli->tc_opcode); + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + return nrs_tbf_id_list_match(&expr->te_cond, cli->tc_id); + default: + return 0; + } +} + +static int +nrs_tbf_conjunction_match(struct nrs_tbf_conjunction *conjunction, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + struct nrs_tbf_expression *expr; + int matched; + + list_for_each_entry(expr, &conjunction->tc_expressions, te_linkage) { + matched = nrs_tbf_expression_match(expr, rule, cli); + if (!matched) + return 0; + } + + return 1; +} + +static int +nrs_tbf_cond_match(struct nrs_tbf_rule *rule, struct nrs_tbf_client *cli) +{ + struct nrs_tbf_conjunction *conjunction; + int matched; + + list_for_each_entry(conjunction, &rule->tr_conds, tc_linkage) { + matched = nrs_tbf_conjunction_match(conjunction, rule, cli); + if (matched) + return 1; + } + + return 0; +} + +static void +nrs_tbf_generic_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_conds)) + nrs_tbf_conds_free(&rule->tr_conds); + LASSERT(rule->tr_conds_str != NULL); + OBD_FREE(rule->tr_conds_str, strlen(rule->tr_conds_str) + 1); +} + +static int +nrs_tbf_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_conds_str); + OBD_ALLOC(rule->tr_conds_str, + strlen(start->u.tc_start.ts_conds_str) + 1); + if (rule->tr_conds_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_conds_str, + start->u.tc_start.ts_conds_str, + strlen(start->u.tc_start.ts_conds_str)); + + INIT_LIST_HEAD(&rule->tr_conds); + if (!list_empty(&start->u.tc_start.ts_conds)) { + rc = nrs_tbf_conds_parse(rule->tr_conds_str, + strlen(rule->tr_conds_str), + &rule->tr_conds); + } + if (rc) + nrs_tbf_generic_rule_fini(rule); + + return rc; +} + +static int +nrs_tbf_generic_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s %s %llu, ref %d\n", rule->tr_name, + rule->tr_conds_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_generic_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_cond_match(rule, cli); +} + +static struct nrs_tbf_ops nrs_tbf_generic_ops = { + .o_name = NRS_TBF_TYPE_GENERIC, + .o_startup = nrs_tbf_startup, + .o_cli_find = nrs_tbf_cli_find, + .o_cli_findadd = nrs_tbf_cli_findadd, + .o_cli_put = nrs_tbf_cli_put, + .o_cli_init = nrs_tbf_generic_cli_init, + .o_rule_init = nrs_tbf_rule_init, + .o_rule_dump = nrs_tbf_generic_rule_dump, + .o_rule_match = nrs_tbf_generic_rule_match, + .o_rule_fini = nrs_tbf_generic_rule_fini, +}; + +static void nrs_tbf_opcode_rule_fini(struct nrs_tbf_rule *rule) +{ + if (rule->tr_opcodes != NULL) + CFS_FREE_BITMAP(rule->tr_opcodes); + + LASSERT(rule->tr_opcodes_str != NULL); + OBD_FREE(rule->tr_opcodes_str, strlen(rule->tr_opcodes_str) + 1); +} + +static unsigned nrs_tbf_opcode_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(__u32), mask); +} + +static int nrs_tbf_opcode_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const __u32 *opc = key; + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + 
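+	/* opcode clients are keyed by the raw RPC opcode value */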
return *opc == cli->tc_opcode; +} + +static void *nrs_tbf_opcode_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return &cli->tc_opcode; +} + +static void nrs_tbf_opcode_hop_get(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_opcode_hop_put(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_opcode_hop_exit(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERTF(atomic_read(&cli->tc_ref) == 0, + "Busy TBF object from client with opcode %s, with %d refs\n", + ll_opcode2str(cli->tc_opcode), + atomic_read(&cli->tc_ref)); + + nrs_tbf_cli_fini(cli); +} +static struct cfs_hash_ops nrs_tbf_opcode_hash_ops = { + .hs_hash = nrs_tbf_opcode_hop_hash, + .hs_keycmp = nrs_tbf_opcode_hop_keycmp, + .hs_key = nrs_tbf_opcode_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_opcode_hop_get, + .hs_put = nrs_tbf_opcode_hop_put, + .hs_put_locked = nrs_tbf_opcode_hop_put, + .hs_exit = nrs_tbf_opcode_hop_exit, +}; + +static int +nrs_tbf_opcode_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start = { 0 }; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_opcode_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + start.u.tc_start.ts_opcodes = NULL; + start.u.tc_start.ts_opcodes_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + rc = nrs_tbf_rule_start(policy, head, &start); + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_opcode_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + __u32 opc; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + return cfs_hash_lookup(head->th_cli_hash, &opc); +} + +static struct nrs_tbf_client * +nrs_tbf_opcode_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_opcode, + &cli->tc_hnode); +} + +static void +nrs_tbf_opcode_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + cli->tc_opcode = lustre_msg_get_opc(req->rq_reqmsg); +} + +#define MAX_OPCODE_LEN 32 +static int +nrs_tbf_opcode_set_bit(const struct cfs_lstr *id, struct cfs_bitmap *opcodes) +{ + int op = 0; + char opcode_str[MAX_OPCODE_LEN]; + + if (id->ls_len + 1 > MAX_OPCODE_LEN) + return -EINVAL; + + memcpy(opcode_str, id->ls_str, id->ls_len); + opcode_str[id->ls_len] = '\0'; + + op = ll_str2opcode(opcode_str); + if (op < 0) + return -EINVAL; + + cfs_bitmap_set(opcodes, op); + return 0; +} + +static int +nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr) +{ + struct cfs_bitmap *opcodes; + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + ENTRY; + + opcodes = CFS_ALLOCATE_BITMAP(LUSTRE_MAX_OPCODES); + if (opcodes == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; 
+ } + rc = nrs_tbf_opcode_set_bit(&res, opcodes); + if (rc) + break; + } + + if (rc == 0) + *bitmaptr = opcodes; + else + CFS_FREE_BITMAP(opcodes); + + RETURN(rc); +} + +static void nrs_tbf_opcode_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (cmd->u.tc_start.ts_opcodes) + CFS_FREE_BITMAP(cmd->u.tc_start.ts_opcodes); + + if (cmd->u.tc_start.ts_opcodes_str) + OBD_FREE(cmd->u.tc_start.ts_opcodes_str, + strlen(cmd->u.tc_start.ts_opcodes_str) + 1); + +} + +static int nrs_tbf_opcode_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "opcode"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_opcodes_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_opcodes_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_opcodes_str, src.ls_str, src.ls_len); + + /* parse opcode list */ + rc = nrs_tbf_opcode_list_parse(cmd->u.tc_start.ts_opcodes_str, + strlen(cmd->u.tc_start.ts_opcodes_str), + &cmd->u.tc_start.ts_opcodes); + if (rc) + nrs_tbf_opcode_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_opcode_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + if (rule->tr_opcodes == NULL) + return 0; + + return cfs_bitmap_check(rule->tr_opcodes, cli->tc_opcode); +} + +static int nrs_tbf_opcode_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_opcodes_str != NULL); + OBD_ALLOC(rule->tr_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + if (rule->tr_opcodes_str == NULL) + return -ENOMEM; + + strncpy(rule->tr_opcodes_str, start->u.tc_start.ts_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + + /* Default rule '*' */ + if (start->u.tc_start.ts_opcodes == NULL) + return 0; + + rc = nrs_tbf_opcode_list_parse(rule->tr_opcodes_str, + strlen(rule->tr_opcodes_str), + &rule->tr_opcodes); + if (rc) + OBD_FREE(rule->tr_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + + return rc; +} + +static int +nrs_tbf_opcode_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_opcodes_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + + +struct nrs_tbf_ops nrs_tbf_opcode_ops = { + .o_name = NRS_TBF_TYPE_OPCODE, + .o_startup = nrs_tbf_opcode_startup, + .o_cli_find = nrs_tbf_opcode_cli_find, + .o_cli_findadd = nrs_tbf_opcode_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_opcode_cli_init, + .o_rule_init = nrs_tbf_opcode_rule_init, + .o_rule_dump = nrs_tbf_opcode_rule_dump, + .o_rule_match = nrs_tbf_opcode_rule_match, + .o_rule_fini = nrs_tbf_opcode_rule_fini, +}; + +static unsigned nrs_tbf_id_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct tbf_id), mask); +} + +static int nrs_tbf_id_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const struct tbf_id *opc = key; + enum nrs_tbf_flag ntf; + struct nrs_tbf_client *cli = hlist_entry(hnode, struct nrs_tbf_client, + tc_hnode); + ntf = opc->ti_type & cli->tc_id.ti_type; + if ((ntf & NRS_TBF_FLAG_UID) && opc->ti_uid != cli->tc_id.ti_uid) + return 0; + + if ((ntf & NRS_TBF_FLAG_GID) && opc->ti_gid != cli->tc_id.ti_gid) + return 0; + + return 1; +} + +static void *nrs_tbf_id_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return 
&cli->tc_id; +} + +static void nrs_tbf_id_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_id_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void +nrs_tbf_id_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_id_hash_ops = { + .hs_hash = nrs_tbf_id_hop_hash, + .hs_keycmp = nrs_tbf_id_hop_keycmp, + .hs_key = nrs_tbf_id_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_id_hop_get, + .hs_put = nrs_tbf_id_hop_put, + .hs_put_locked = nrs_tbf_id_hop_put, + .hs_exit = nrs_tbf_id_hop_exit, +}; + +static int +nrs_tbf_id_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_id_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_id_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_ids_str = "*"; + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_ids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + struct tbf_id id; + + LASSERT(head->th_type_flag == NRS_TBF_FLAG_UID || + head->th_type_flag == NRS_TBF_FLAG_GID); + + nrs_tbf_id_cli_set(req, &id, head->th_type_flag); + return cfs_hash_lookup(head->th_cli_hash, &id); +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_id, + &cli->tc_hnode); +} + +static void +nrs_tbf_uid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_UID); +} + +static void +nrs_tbf_gid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_GID); +} + +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id) +{ + struct nrs_tbf_id *nti_id; + enum nrs_tbf_flag flag; + + list_for_each_entry(nti_id, id_list, nti_linkage) { + flag = id.ti_type & nti_id->nti_id.ti_type; + if (!flag) + continue; + + if ((flag & NRS_TBF_FLAG_UID) && + (id.ti_uid != nti_id->nti_id.ti_uid)) + continue; + + if ((flag & NRS_TBF_FLAG_GID) && + (id.ti_gid != nti_id->nti_id.ti_gid)) + continue; + + return 1; + } + return 0; +} + +static int +nrs_tbf_id_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_id_list_match(&rule->tr_ids, cli->tc_id); +} + +static void nrs_tbf_id_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + nrs_tbf_id_list_free(&cmd->u.tc_start.ts_ids); + + if (cmd->u.tc_start.ts_ids_str) + OBD_FREE(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str) + 1); +} + 
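+/**
+ * Parses a UID/GID list of the form "id0 id1 ..." (for example "500 1001",
+ * illustrative values only).  Each token must be a decimal number that fits
+ * in a 32 bit id; the resulting nrs_tbf_id entries are tagged with \a tif
+ * (NRS_TBF_FLAG_UID or NRS_TBF_FLAG_GID) and linked into \a id_list.
+ */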
+static int +nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + struct tbf_id id = { 0 }; + ENTRY; + + if (tif != NRS_TBF_FLAG_UID && tif != NRS_TBF_FLAG_GID) + RETURN(-EINVAL); + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(id_list); + while (src.ls_str) { + struct nrs_tbf_id *nti_id; + + if (cfs_gettok(&src, ' ', &res) == 0) + GOTO(out, rc = -EINVAL); + + id.ti_type = tif; + if (tif == NRS_TBF_FLAG_UID) { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_uid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } else { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_gid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } + + OBD_ALLOC_PTR(nti_id); + if (nti_id == NULL) + GOTO(out, rc = -ENOMEM); + + nti_id->nti_id = id; + list_add_tail(&nti_id->nti_linkage, id_list); + } +out: + if (rc) + nrs_tbf_id_list_free(id_list); + RETURN(rc); +} + +static int nrs_tbf_ug_id_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + enum nrs_tbf_flag tif; + + tif = cmd->u.tc_start.ts_valid_type; + + src.ls_str = id; + src.ls_len = strlen(id); + + rc = nrs_tbf_check_id_value(&src, + tif == NRS_TBF_FLAG_UID ? "uid" : "gid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_ids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_ids_str == NULL) + return -ENOMEM; + + strlcpy(cmd->u.tc_start.ts_ids_str, src.ls_str, src.ls_len + 1); + + rc = nrs_tbf_id_list_parse(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str), + &cmd->u.tc_start.ts_ids, tif); + if (rc) + nrs_tbf_id_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_id_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_head *head = rule->tr_head; + int rc = 0; + enum nrs_tbf_flag tif = head->th_type_flag; + int ids_len = strlen(start->u.tc_start.ts_ids_str) + 1; + + LASSERT(start->u.tc_start.ts_ids_str); + INIT_LIST_HEAD(&rule->tr_ids); + + OBD_ALLOC(rule->tr_ids_str, ids_len); + if (rule->tr_ids_str == NULL) + return -ENOMEM; + + strlcpy(rule->tr_ids_str, start->u.tc_start.ts_ids_str, + ids_len); + + if (!list_empty(&start->u.tc_start.ts_ids)) { + rc = nrs_tbf_id_list_parse(rule->tr_ids_str, + strlen(rule->tr_ids_str), + &rule->tr_ids, tif); + if (rc) + CERROR("%ss {%s} illegal\n", + tif == NRS_TBF_FLAG_UID ? 
"uid" : "gid", + rule->tr_ids_str); + } + if (rc) { + OBD_FREE(rule->tr_ids_str, ids_len); + rule->tr_ids_str = NULL; + } + return rc; +} + +static int +nrs_tbf_id_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_ids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static void nrs_tbf_id_rule_fini(struct nrs_tbf_rule *rule) +{ + nrs_tbf_id_list_free(&rule->tr_ids); + if (rule->tr_ids_str != NULL) + OBD_FREE(rule->tr_ids_str, strlen(rule->tr_ids_str) + 1); +} + +struct nrs_tbf_ops nrs_tbf_uid_ops = { + .o_name = NRS_TBF_TYPE_UID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_uid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + +struct nrs_tbf_ops nrs_tbf_gid_ops = { + .o_name = NRS_TBF_TYPE_GID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_gid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + +static struct nrs_tbf_type nrs_tbf_types[] = { + { + .ntt_name = NRS_TBF_TYPE_JOBID, + .ntt_flag = NRS_TBF_FLAG_JOBID, + .ntt_ops = &nrs_tbf_jobid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_NID, + .ntt_flag = NRS_TBF_FLAG_NID, + .ntt_ops = &nrs_tbf_nid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_OPCODE, + .ntt_flag = NRS_TBF_FLAG_OPCODE, + .ntt_ops = &nrs_tbf_opcode_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GENERIC, + .ntt_flag = NRS_TBF_FLAG_GENERIC, + .ntt_ops = &nrs_tbf_generic_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_UID, + .ntt_flag = NRS_TBF_FLAG_UID, + .ntt_ops = &nrs_tbf_uid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GID, + .ntt_flag = NRS_TBF_FLAG_GID, + .ntt_ops = &nrs_tbf_gid_ops, + }, +}; + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. 
+ * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_tbf_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_ops *ops; + __u32 type; + char *name; + int found = 0; + int i; + int rc = 0; + + if (arg == NULL) + name = NRS_TBF_TYPE_GENERIC; + else if (strlen(arg) < NRS_TBF_TYPE_MAX_LEN) + name = arg; + else + GOTO(out, rc = -EINVAL); + + for (i = 0; i < ARRAY_SIZE(nrs_tbf_types); i++) { + if (strcmp(name, nrs_tbf_types[i].ntt_name) == 0) { + ops = nrs_tbf_types[i].ntt_ops; + type = nrs_tbf_types[i].ntt_flag; + found = 1; + break; + } + } + if (found == 0) + GOTO(out, rc = -ENOTSUPP); + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(head->th_type, name, strlen(name)); + head->th_type[strlen(name)] = '\0'; + head->th_ops = ops; + head->th_type_flag = type; + + head->th_binheap = cfs_binheap_create(&nrs_tbf_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (head->th_binheap == NULL) + GOTO(out_free_head, rc = -ENOMEM); + + atomic_set(&head->th_rule_sequence, 0); + spin_lock_init(&head->th_rule_lock); + INIT_LIST_HEAD(&head->th_list); + hrtimer_init(&head->th_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + head->th_timer.function = nrs_tbf_timer_cb; + rc = head->th_ops->o_startup(policy, head); + if (rc) + GOTO(out_free_heap, rc); + + policy->pol_private = head; + return 0; +out_free_heap: + cfs_binheap_destroy(head->th_binheap); +out_free_head: + OBD_FREE_PTR(head); +out: + return rc; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_tbf_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_tbf_head *head = policy->pol_private; + struct ptlrpc_nrs *nrs = policy->pol_nrs; + struct nrs_tbf_rule *rule, *n; + + LASSERT(head != NULL); + LASSERT(head->th_cli_hash != NULL); + hrtimer_cancel(&head->th_timer); + /* Should cleanup hash first before free rules */ + cfs_hash_putref(head->th_cli_hash); + list_for_each_entry_safe(rule, n, &head->th_list, tr_linkage) { + list_del_init(&rule->tr_linkage); + nrs_tbf_rule_put(rule); + } + LASSERT(list_empty(&head->th_list)); + LASSERT(head->th_binheap != NULL); + LASSERT(cfs_binheap_is_empty(head->th_binheap)); + cfs_binheap_destroy(head->th_binheap); + OBD_FREE_PTR(head); + nrs->nrs_throttling = 0; + wake_up(&policy->pol_nrs->nrs_svcpt->scp_waitq); +} + +/** + * Performs a policy-specific ctl function on TBF policy instances; similar + * to ioctl. + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_tbf_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, + void *arg) +{ + int rc = 0; + ENTRY; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch ((enum nrs_ctl_tbf)opc) { + default: + RETURN(-EINVAL); + + /** + * Read RPC rate size of a policy instance. 
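+	 * i.e. dump every rule of this instance into the given seq_file.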
+ */ + case NRS_CTL_TBF_RD_RULE: { + struct nrs_tbf_head *head = policy->pol_private; + struct seq_file *m = (struct seq_file *) arg; + struct ptlrpc_service_part *svcpt; + + svcpt = policy->pol_nrs->nrs_svcpt; + seq_printf(m, "CPT %d:\n", svcpt->scp_cpt); + + rc = nrs_tbf_rule_dump_all(head, m); + } + break; + + /** + * Write RPC rate of a policy instance. + */ + case NRS_CTL_TBF_WR_RULE: { + struct nrs_tbf_head *head = policy->pol_private; + struct nrs_tbf_cmd *cmd; + + cmd = (struct nrs_tbf_cmd *)arg; + rc = nrs_tbf_command(policy, + head, + cmd); + } + break; + /** + * Read the TBF policy type of a policy instance. + */ + case NRS_CTL_TBF_RD_TYPE_FLAG: { + struct nrs_tbf_head *head = policy->pol_private; + + *(__u32 *)arg = head->th_type_flag; + } + break; + } + + RETURN(rc); +} + +/** + * Is called for obtaining a TBF policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * + * \see nrs_resource_get_safe() + */ +static int nrs_tbf_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + struct nrs_tbf_client *tmp; + struct ptlrpc_request *req; + + if (parent == NULL) { + *resp = &((struct nrs_tbf_head *)policy->pol_private)->th_res; + return 0; + } + + head = container_of(parent, struct nrs_tbf_head, th_res); + req = container_of(nrq, struct ptlrpc_request, rq_nrq); + cli = head->th_ops->o_cli_find(head, req); + if (cli != NULL) { + spin_lock(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + LASSERT(cli->tc_rule); + if (cli->tc_rule_sequence != + atomic_read(&head->th_rule_sequence) || + cli->tc_rule->tr_flags & NTRS_STOPPING) { + struct nrs_tbf_rule *rule; + + CDEBUG(D_RPCTRACE, + "TBF class@%p rate %llu sequence %d, " + "rule flags %d, head sequence %d\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_sequence, + cli->tc_rule->tr_flags, + atomic_read(&head->th_rule_sequence)); + rule = nrs_tbf_rule_match(head, cli); + if (rule != cli->tc_rule) { + nrs_tbf_cli_reset(head, rule, cli); + } else { + if (cli->tc_rule_generation != rule->tr_generation) + nrs_tbf_cli_reset_value(head, cli); + nrs_tbf_rule_put(rule); + } + } else if (cli->tc_rule_generation != + cli->tc_rule->tr_generation) { + nrs_tbf_cli_reset_value(head, cli); + } + spin_unlock(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + goto out; + } + + OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy), + sizeof(*cli), moving_req ? GFP_ATOMIC : __GFP_IO); + if (cli == NULL) + return -ENOMEM; + + nrs_tbf_cli_init(head, cli, req); + tmp = head->th_ops->o_cli_findadd(head, cli); + if (tmp != cli) { + atomic_dec(&cli->tc_ref); + nrs_tbf_cli_fini(cli); + cli = tmp; + } +out: + *resp = &cli->tc_res; + + return 1; +} + +/** + * Called when releasing references to the resource hierachy obtained for a + * request for scheduling using the TBF policy. 
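+ * For a client resource the reference is handed back to the per-type
+ * o_cli_put() method; the policy head resource itself needs no action.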
+ * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_tbf_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + + /** + * Do nothing for freeing parent, nrs_tbf_net resources + */ + if (res->res_parent == NULL) + return; + + cli = container_of(res, struct nrs_tbf_client, tc_res); + head = container_of(res->res_parent, struct nrs_tbf_head, th_res); + + head->th_ops->o_cli_put(head, cli); +} + +/** + * Called when getting a request from the TBF policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the TBF + * rule + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_tbf_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq = NULL; + struct nrs_tbf_client *cli; + struct cfs_binheap_node *node; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + if (!peek && policy->pol_nrs->nrs_throttling) + return NULL; + + node = cfs_binheap_root(head->th_binheap); + if (unlikely(node == NULL)) + return NULL; + + cli = container_of(node, struct nrs_tbf_client, tc_node); + LASSERT(cli->tc_in_heap); + if (peek) { + nrq = list_entry(cli->tc_list.next, + struct ptlrpc_nrs_request, + nr_u.tbf.tr_list); + } else { + struct nrs_tbf_rule *rule = cli->tc_rule; + __u64 now = ktime_to_ns(ktime_get()); + __u64 passed; + __u64 ntoken; + __u64 deadline; + __u64 old_resid = 0; + + deadline = cli->tc_check_time + + cli->tc_nsecs; + LASSERT(now >= cli->tc_check_time); + passed = now - cli->tc_check_time; + ntoken = passed * cli->tc_rpc_rate; + do_div(ntoken, NSEC_PER_SEC); + + ntoken += cli->tc_ntoken; + if (rule->tr_flags & NTRS_REALTIME) { + LASSERT(cli->tc_nsecs_resid < cli->tc_nsecs); + old_resid = cli->tc_nsecs_resid; + cli->tc_nsecs_resid += passed % cli->tc_nsecs; + if (cli->tc_nsecs_resid > cli->tc_nsecs) { + ntoken++; + cli->tc_nsecs_resid -= cli->tc_nsecs; + } + } else if (ntoken > cli->tc_depth) + ntoken = cli->tc_depth; + + if (ntoken > 0) { + struct ptlrpc_request *req; + nrq = list_entry(cli->tc_list.next, + struct ptlrpc_nrs_request, + nr_u.tbf.tr_list); + req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + ntoken--; + cli->tc_ntoken = ntoken; + cli->tc_check_time = now; + list_del_init(&nrq->nr_u.tbf.tr_list); + if (list_empty(&cli->tc_list)) { + cfs_binheap_remove(head->th_binheap, + &cli->tc_node); + cli->tc_in_heap = false; + } else { + if (!(rule->tr_flags & NTRS_REALTIME)) + cli->tc_deadline = now + cli->tc_nsecs; + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); + } + CDEBUG(D_RPCTRACE, + "TBF dequeues: class@%p rate %llu gen %llu " + "token %llu, rule@%p rate %llu gen %llu\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_generation, cli->tc_ntoken, + cli->tc_rule, cli->tc_rule->tr_rpc_rate, + cli->tc_rule->tr_generation); + } else { + ktime_t time; + + if (rule->tr_flags & NTRS_REALTIME) { + cli->tc_deadline = deadline; + cli->tc_nsecs_resid = old_resid; 
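+				/*
+				 * No token is available: keep the original
+				 * deadline and residue, re-sort this class
+				 * and, if another class has moved to the heap
+				 * root, retry the dequeue before throttling.
+				 */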
+ cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); + if (node != cfs_binheap_root(head->th_binheap)) + return nrs_tbf_req_get(policy, + peek, force); + } + policy->pol_nrs->nrs_throttling = 1; + head->th_deadline = deadline; + time = ktime_set(0, 0); + time = ktime_add_ns(time, deadline); + hrtimer_start(&head->th_timer, time, HRTIMER_MODE_ABS); + } + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + int rc = 0; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_tbf_client, tc_res); + head = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_tbf_head, th_res); + if (list_empty(&cli->tc_list)) { + LASSERT(!cli->tc_in_heap); + cli->tc_deadline = cli->tc_check_time + cli->tc_nsecs; + rc = cfs_binheap_insert(head->th_binheap, &cli->tc_node); + if (rc == 0) { + cli->tc_in_heap = true; + nrq->nr_u.tbf.tr_sequence = head->th_sequence++; + list_add_tail(&nrq->nr_u.tbf.tr_list, + &cli->tc_list); + if (policy->pol_nrs->nrs_throttling) { + __u64 deadline = cli->tc_deadline; + if ((head->th_deadline > deadline) && + (hrtimer_try_to_cancel(&head->th_timer) + >= 0)) { + ktime_t time; + head->th_deadline = deadline; + time = ktime_set(0, 0); + time = ktime_add_ns(time, deadline); + hrtimer_start(&head->th_timer, time, + HRTIMER_MODE_ABS); + } + } + } + } else { + LASSERT(cli->tc_in_heap); + nrq->nr_u.tbf.tr_sequence = head->th_sequence++; + list_add_tail(&nrq->nr_u.tbf.tr_list, + &cli->tc_list); + } + + if (rc == 0) + CDEBUG(D_RPCTRACE, + "TBF enqueues: class@%p rate %llu gen %llu " + "token %llu, rule@%p rate %llu gen %llu\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_generation, cli->tc_ntoken, + cli->tc_rule, cli->tc_rule->tr_rpc_rate, + cli->tc_rule->tr_generation); + + return rc; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_tbf_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_tbf_client, tc_res); + head = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_tbf_head, th_res); + + LASSERT(!list_empty(&nrq->nr_u.tbf.tr_list)); + list_del_init(&nrq->nr_u.tbf.tr_list); + if (list_empty(&cli->tc_list)) { + cfs_binheap_remove(head->th_binheap, + &cli->tc_node); + cli->tc_in_heap = false; + } else { + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); + } +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. 
+ * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_tbf_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.tbf.tr_sequence); +} + +/** + * debugfs interface + */ + +/** + * The maximum RPC rate. + */ +#define LPROCFS_NRS_RATE_MAX 65535 + +static int +ptlrpc_lprocfs_nrs_tbf_rule_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + int rc; + + seq_printf(m, "regular_requests:\n"); + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_RULE, + false, m); + if (rc == 0) { + /** + * -ENOSPC means buf in the parameter m is overflow, return 0 + * here to let upper layer function seq_read alloc a larger + * memory area and do this process again. + */ + } else if (rc == -ENOSPC) { + return 0; + + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + seq_printf(m, "high_priority_requests:\n"); + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_RULE, + false, m); + if (rc == 0) { + /** + * -ENOSPC means buf in the parameter m is overflow, return 0 + * here to let upper layer function seq_read alloc a larger + * memory area and do this process again. 
+ */ + } else if (rc == -ENOSPC) { + return 0; + } + +no_hp: + + return rc; +} + +static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) +{ + int rc; + ENTRY; + + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: + rc = nrs_tbf_jobid_parse(cmd, token); + break; + case NRS_TBF_FLAG_NID: + rc = nrs_tbf_nid_parse(cmd, token); + break; + case NRS_TBF_FLAG_OPCODE: + rc = nrs_tbf_opcode_parse(cmd, token); + break; + case NRS_TBF_FLAG_GENERIC: + rc = nrs_tbf_generic_parse(cmd, token); + break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + rc = nrs_tbf_ug_id_parse(cmd, token); + break; + default: + RETURN(-EINVAL); + } + + RETURN(rc); +} + +static void nrs_tbf_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: + nrs_tbf_jobid_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_NID: + nrs_tbf_nid_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_OPCODE: + nrs_tbf_opcode_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_GENERIC: + nrs_tbf_generic_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + nrs_tbf_id_cmd_fini(cmd); + break; + default: + CWARN("unknown NRS_TBF_FLAGS:0x%x\n", + cmd->u.tc_start.ts_valid_type); + } + } +} + +static bool name_is_valid(const char *name) +{ + int i; + + for (i = 0; i < strlen(name); i++) { + if ((!isalnum(name[i])) && + (name[i] != '_')) + return false; + } + return true; +} + +static int +nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer) +{ + char *key; + char *val; + int rc; + __u64 rate; + + val = buffer; + key = strsep(&val, "="); + if (val == NULL || strlen(val) == 0) + return -EINVAL; + + /* Key of the value pair */ + if (strcmp(key, "rate") == 0) { + rc = kstrtoull(val, 10, &rate); + if (rc) + return rc; + + if (rate <= 0 || rate >= LPROCFS_NRS_RATE_MAX) + return -EINVAL; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) + cmd->u.tc_start.ts_rpc_rate = rate; + else if (cmd->tc_cmd == NRS_CTL_TBF_CHANGE_RULE) + cmd->u.tc_change.tc_rpc_rate = rate; + else + return -EINVAL; + } else if (strcmp(key, "rank") == 0) { + if (!name_is_valid(val)) + return -EINVAL; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) + cmd->u.tc_start.ts_next_name = val; + else if (cmd->tc_cmd == NRS_CTL_TBF_CHANGE_RULE) + cmd->u.tc_change.tc_next_name = val; + else + return -EINVAL; + } else if (strcmp(key, "realtime") == 0) { + unsigned long realtime; + + rc = kstrtoul(val, 10, &realtime); + if (rc) + return rc; + + if (realtime > 0) + cmd->u.tc_start.ts_rule_flags |= NTRS_REALTIME; + } else { + return -EINVAL; + } + return 0; +} + +static int +nrs_tbf_parse_value_pairs(struct nrs_tbf_cmd *cmd, char *buffer) +{ + char *val; + char *token; + int rc; + + val = buffer; + while (val != NULL && strlen(val) != 0) { + token = strsep(&val, " "); + rc = nrs_tbf_parse_value_pair(cmd, token); + if (rc) + return rc; + } + + switch (cmd->tc_cmd) { + case NRS_CTL_TBF_START_RULE: + if (cmd->u.tc_start.ts_rpc_rate == 0) + cmd->u.tc_start.ts_rpc_rate = tbf_rate; + break; + case NRS_CTL_TBF_CHANGE_RULE: + if (cmd->u.tc_change.tc_rpc_rate == 0 && + cmd->u.tc_change.tc_next_name == NULL) + return -EINVAL; + break; + case NRS_CTL_TBF_STOP_RULE: + break; + default: + return -EINVAL; + } + return 0; +} + +static struct nrs_tbf_cmd * +nrs_tbf_parse_cmd(char *buffer, unsigned long count, __u32 type_flag) +{ + static struct nrs_tbf_cmd *cmd; + char *token; + char *val; + int rc = 0; + + OBD_ALLOC_PTR(cmd); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + 
memset(cmd, 0, sizeof(*cmd)); + + val = buffer; + token = strsep(&val, " "); + if (val == NULL || strlen(val) == 0) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Type of the command */ + if (strcmp(token, "start") == 0) { + cmd->tc_cmd = NRS_CTL_TBF_START_RULE; + cmd->u.tc_start.ts_valid_type = type_flag; + } else if (strcmp(token, "stop") == 0) + cmd->tc_cmd = NRS_CTL_TBF_STOP_RULE; + else if (strcmp(token, "change") == 0) + cmd->tc_cmd = NRS_CTL_TBF_CHANGE_RULE; + else + GOTO(out_free_cmd, rc = -EINVAL); + + /* Name of the rule */ + token = strsep(&val, " "); + if ((val == NULL && cmd->tc_cmd != NRS_CTL_TBF_STOP_RULE) || + !name_is_valid(token)) + GOTO(out_free_cmd, rc = -EINVAL); + cmd->tc_name = token; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { + /* List of ID */ + LASSERT(val); + token = val; + val = strrchr(token, '}'); + if (!val) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Skip '}' */ + val++; + if (*val == '\0') { + val = NULL; + } else if (*val == ' ') { + *val = '\0'; + val++; + } else + GOTO(out_free_cmd, rc = -EINVAL); + + rc = nrs_tbf_id_parse(cmd, token); + if (rc) + GOTO(out_free_cmd, rc); + } + + rc = nrs_tbf_parse_value_pairs(cmd, val); + if (rc) + GOTO(out_cmd_fini, rc = -EINVAL); + goto out; +out_cmd_fini: + nrs_tbf_cmd_fini(cmd); +out_free_cmd: + OBD_FREE_PTR(cmd); +out: + if (rc) + cmd = ERR_PTR(rc); + return cmd; +} + +/** + * Get the TBF policy type (nid, jobid, etc) preset by + * proc entry 'nrs_policies' for command buffer parsing. + * + * \param[in] svc the PTLRPC service + * \param[in] queue the NRS queue type + * + * \retval the preset TBF policy type flag + */ +static __u32 +nrs_tbf_type_flag(struct ptlrpc_service *svc, enum ptlrpc_nrs_queue_type queue) +{ + __u32 type; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, queue, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_TYPE_FLAG, + true, &type); + if (rc != 0) + type = NRS_TBF_FLAG_INVALID; + + return type; +} + +extern struct nrs_core nrs_core; +#define LPROCFS_WR_NRS_TBF_MAX_CMD (4096) +static ssize_t +ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + char *kernbuf; + char *val; + int rc; + static struct nrs_tbf_cmd *cmd; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + unsigned long length; + char *token; + + OBD_ALLOC(kernbuf, LPROCFS_WR_NRS_TBF_MAX_CMD); + if (kernbuf == NULL) + GOTO(out, rc = -ENOMEM); + + if (count > LPROCFS_WR_NRS_TBF_MAX_CMD - 1) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(out_free_kernbuff, rc = -EFAULT); + + val = kernbuf; + token = strsep(&val, " "); + if (val == NULL) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (strcmp(token, "reg") == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + } else if (strcmp(token, "hp") == 0) { + queue = PTLRPC_NRS_QUEUE_HP; + } else { + kernbuf[strlen(token)] = ' '; + val = kernbuf; + } + length = strlen(val); + + if (length == 0) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out_free_kernbuff, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + cmd = nrs_tbf_parse_cmd(val, length, nrs_tbf_type_flag(svc, queue)); + if (IS_ERR(cmd)) + GOTO(out_free_kernbuff, rc = PTR_ERR(cmd)); + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. 
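+	 * The parsed rule is applied while holding nrs_core.nrs_mutex, via
+	 * ptlrpc_nrs_policy_control() with NRS_CTL_TBF_WR_RULE.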
+ */ + mutex_lock(&nrs_core.nrs_mutex); + rc = ptlrpc_nrs_policy_control(svc, queue, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_WR_RULE, + false, cmd); + mutex_unlock(&nrs_core.nrs_mutex); + + nrs_tbf_cmd_fini(cmd); + OBD_FREE_PTR(cmd); +out_free_kernbuff: + OBD_FREE(kernbuf, LPROCFS_WR_NRS_TBF_MAX_CMD); +out: + return rc ? rc : count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); + +/** + * Initializes a TBF policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + * + * \retval 0 success + * \retval != 0 error + */ +static int nrs_tbf_lprocfs_init(struct ptlrpc_service *svc) +{ + struct ldebugfs_vars nrs_tbf_lprocfs_vars[] = { + { .name = "nrs_tbf_rule", + .fops = &ptlrpc_lprocfs_nrs_tbf_rule_fops, + .data = svc }, + { NULL } + }; + + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + return 0; + + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_tbf_lprocfs_vars, + NULL); +} + +/** + * TBF policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_tbf_ops = { + .op_policy_start = nrs_tbf_start, + .op_policy_stop = nrs_tbf_stop, + .op_policy_ctl = nrs_tbf_ctl, + .op_res_get = nrs_tbf_res_get, + .op_res_put = nrs_tbf_res_put, + .op_req_get = nrs_tbf_req_get, + .op_req_enqueue = nrs_tbf_req_add, + .op_req_dequeue = nrs_tbf_req_del, + .op_req_stop = nrs_tbf_req_stop, + .op_lprocfs_init = nrs_tbf_lprocfs_init, +}; + +/** + * TBF policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_tbf = { + .nc_name = NRS_POL_NAME_TBF, + .nc_ops = &nrs_tbf_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} tbf */ + +/** @} nrs */ + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c new file mode 100644 index 0000000000000..d688f34b933b7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -0,0 +1,2986 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pack_generic.c + * + * (Un)packing of OST requests + * + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static inline __u32 lustre_msg_hdr_size_v2(__u32 count) +{ + return cfs_size_round(offsetof(struct lustre_msg_v2, + lm_buflens[count])); +} + +__u32 lustre_msg_hdr_size(__u32 magic, __u32 count) +{ + LASSERT(count > 0); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return 0; + } +} + +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + __u32 index) +{ + if (inout) + lustre_set_req_swabbed(req, index); + else + lustre_set_rep_swabbed(req, index); +} + +bool ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index) +{ + if (inout) + return (ptlrpc_req_need_swab(req) && + !lustre_req_swabbed(req, index)); + + return (ptlrpc_rep_need_swab(req) && !lustre_rep_swabbed(req, index)); +} + +static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, + enum lustre_msg_version version) +{ + enum lustre_msg_version ver = lustre_msg_get_version(msg); + + return (ver & LUSTRE_VERSION_MASK) != version; +} + +int lustre_msg_check_version(struct lustre_msg *msg, + enum lustre_msg_version version) +{ +#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + CERROR("msg v1 not supported - please upgrade you system\n"); + return -EINVAL; + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_check_version_v2(msg, version); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return -EPROTO; + } +#undef LUSTRE_MSG_MAGIC_V1 +} + +/* early reply size */ +__u32 lustre_msg_early_size() +{ + static __u32 size; + if (!size) { + /* Always reply old ptlrpc_body_v2 to keep interoprability + * with the old client (< 2.3) which doesn't have pb_jobid + * in the ptlrpc_body. + * + * XXX Remove this whenever we dorp interoprability with such + * client. + */ + __u32 pblen = sizeof(struct ptlrpc_body_v2); + size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen); + } + return size; +} +EXPORT_SYMBOL(lustre_msg_early_size); + +__u32 lustre_msg_size_v2(int count, __u32 *lengths) +{ + __u32 size; + int i; + + LASSERT(count > 0); + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); + + return size; +} +EXPORT_SYMBOL(lustre_msg_size_v2); + +/* This returns the size of the buffer that is required to hold a lustre_msg + * with the given sub-buffer lengths. + * NOTE: this should only be used for NEW requests, and should always be + * in the form of a v2 request. If this is a connection to a v1 + * target then the first buffer will be stripped because the ptlrpc + * data is part of the lustre_msg_v1 header. b=14043 */ +__u32 lustre_msg_size(__u32 magic, int count, __u32 *lens) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2)); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(count, lens); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return 0; + } +} + +/* This is used to determine the size of a buffer that was already packed + * and will correctly handle the different message formats. 
*/ +__u32 lustre_packed_msg_size(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_packed_msg_size); + +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs) +{ + char *ptr; + int i; + + LASSERT(count > 0); + + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; + + ptr = (char *)msg + lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) { + char *tmp = bufs[i]; + + if (tmp) + memcpy(ptr, tmp, lens[i]); + ptr += cfs_size_round(lens[i]); + } +} +EXPORT_SYMBOL(lustre_init_msg_v2); + +static int lustre_pack_request_v2(struct ptlrpc_request *req, + int count, __u32 *lens, char **bufs) +{ + int reqlen, rc; + + reqlen = lustre_msg_size_v2(count, lens); + + rc = sptlrpc_cli_alloc_reqbuf(req, reqlen); + if (rc) + return rc; + + req->rq_reqlen = reqlen; + + lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); + lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); + return 0; +} + +int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, + __u32 *lens, char **bufs) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + /* only use new format, we don't need to be compatible with 1.4 */ + magic = LUSTRE_MSG_MAGIC_V2; + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_pack_request_v2(req, count, lens, bufs); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} + +#if RS_DEBUG +struct list_head ptlrpc_rs_debug_lru = + LIST_HEAD_INIT(ptlrpc_rs_debug_lru); +spinlock_t ptlrpc_rs_debug_lock; + +#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) + +#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_del(&(rs)->rs_debug_list); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) +#else +# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0) +# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0) +#endif + +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_reply_state *rs = NULL; + + spin_lock(&svcpt->scp_rep_lock); + + /* See if we have anything in a pool, and wait if nothing */ + while (list_empty(&svcpt->scp_rep_idle)) { + struct l_wait_info lwi; + int rc; + + spin_unlock(&svcpt->scp_rep_lock); + /* If we cannot get anything for some long time, we better + * bail out instead of waiting infinitely */ + lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL); + rc = l_wait_event(svcpt->scp_rep_waitq, + !list_empty(&svcpt->scp_rep_idle), &lwi); + if (rc != 0) + goto out; + spin_lock(&svcpt->scp_rep_lock); + } + + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, rs_list); + list_del(&rs->rs_list); + + spin_unlock(&svcpt->scp_rep_lock); + + memset(rs, 0, svcpt->scp_service->srv_max_reply_size); + rs->rs_size = svcpt->scp_service->srv_max_reply_size; + rs->rs_svcpt = svcpt; + rs->rs_prealloc = 1; +out: + return rs; +} + +void 
lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + spin_unlock(&svcpt->scp_rep_lock); + wake_up(&svcpt->scp_rep_waitq); +} + +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags) +{ + struct ptlrpc_reply_state *rs; + int msg_len, rc; + ENTRY; + + LASSERT(req->rq_reply_state == NULL); + LASSERT(count > 0); + + if ((flags & LPRFL_EARLY_REPLY) == 0) { + spin_lock(&req->rq_lock); + req->rq_packed_final = 1; + spin_unlock(&req->rq_lock); + } + + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + RETURN(rc); + + rs = req->rq_reply_state; + atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ + rs->rs_cb_id.cbid_fn = reply_out_callback; + rs->rs_cb_id.cbid_arg = rs; + rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt; + INIT_LIST_HEAD(&rs->rs_exp_list); + INIT_LIST_HEAD(&rs->rs_obd_list); + INIT_LIST_HEAD(&rs->rs_list); + spin_lock_init(&rs->rs_lock); + + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; + + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + + PTLRPC_RS_DEBUG_LRU_ADD(rs); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_pack_reply_v2); + +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs, int flags) +{ + int rc = 0; + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + rc = -EINVAL; + } + if (rc != 0) + CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc, + lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); + return rc; +} + +int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} +EXPORT_SYMBOL(lustre_pack_reply); + +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size) +{ + __u32 i, offset, buflen, bufcount; + + LASSERT(m != NULL); + LASSERT(m->lm_bufcount > 0); + + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } + + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? 
-1 : lustre_msg_get_opc(m)); + return NULL; + } + + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); + + return (char *)m + offset; +} + +void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 min_size) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buf_v2(m, n, min_size); + default: + LASSERTF(0, "incorrect message magic: %08x (msg:%p)\n", + m->lm_magic, m); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_buf); + +static int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, __u32 segment, + unsigned int newlen, int move_data) +{ + char *tail = NULL, *newpos; + int tail_len = 0, n; + + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] >= newlen); + + if (msg->lm_buflens[segment] == newlen) + goto out; + + if (move_data && msg->lm_bufcount > segment + 1) { + tail = lustre_msg_buf_v2(msg, segment + 1, 0); + for (n = segment + 1; n < msg->lm_bufcount; n++) + tail_len += cfs_size_round(msg->lm_buflens[n]); + } + + msg->lm_buflens[segment] = newlen; + + if (tail && tail_len) { + newpos = lustre_msg_buf_v2(msg, segment + 1, 0); + LASSERT(newpos <= tail); + if (newpos != tail) + memmove(newpos, tail, tail_len); + } +out: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); +} + +/* + * for @msg, shrink @segment to size @newlen. if @move_data is non-zero, + * we also move data forward from @segment + 1. + * + * if @newlen == 0, we remove the segment completely, but we still keep the + * totally bufcount the same to save possible data moving. this will leave a + * unused segment with size 0 at the tail, but that's ok. + * + * return new msg size after shrinking. + * + * CAUTION: + * + if any buffers higher than @segment has been filled in, must call shrink + * with non-zero @move_data. + * + caller should NOT keep pointers to msg buffers which higher than @segment + * after call shrink. + */ +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_shrink_msg_v2(msg, segment, newlen, move_data); + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_shrink_msg); + +void lustre_free_reply_state(struct ptlrpc_reply_state *rs) +{ + PTLRPC_RS_DEBUG_LRU_DEL(rs); + + LASSERT(atomic_read(&rs->rs_refcount) == 0); + LASSERT(!rs->rs_difficult || rs->rs_handled); + LASSERT(!rs->rs_on_net); + LASSERT(!rs->rs_scheduled); + LASSERT(rs->rs_export == NULL); + LASSERT(rs->rs_nlocks == 0); + LASSERT(list_empty(&rs->rs_exp_list)); + LASSERT(list_empty(&rs->rs_obd_list)); + + sptlrpc_svc_free_rs(rs); +} + +static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) +{ + int swabbed, required_len, i, buflen; + + /* Now we know the sender speaks my language. 
*/ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } + + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); + CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); + } + + if (m->lm_bufcount == 0 || m->lm_bufcount > PTLRPC_MAX_BUFCOUNT) { + CERROR("message bufcount %d is not valid\n", m->lm_bufcount); + return -EINVAL; + } + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } + + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + buflen = cfs_size_round(m->lm_buflens[i]); + if (buflen < 0 || buflen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer %d length %d is not valid\n", i, buflen); + return -EINVAL; + } + required_len += buflen; + } + if (len < required_len || required_len > PTLRPC_MAX_BUFLEN) { + CERROR("len: %d, required_len %d, bufcount: %d\n", + len, required_len, m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } + + return swabbed; +} + +int __lustre_unpack_msg(struct lustre_msg *m, int len) +{ + int required_len, rc; + ENTRY; + + /* We can provide a slightly better error log, if we check the + * message magic and version first. In the future, struct + * lustre_msg may grow, and we'd like to log a version mismatch, + * rather than a short message. + * + */ + required_len = offsetof(struct lustre_msg, lm_magic) + + sizeof(m->lm_magic); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for magic/version check\n", + len); + RETURN(-EINVAL); + } + + rc = lustre_unpack_msg_v2(m, len); + + RETURN(rc); +} +EXPORT_SYMBOL(__lustre_unpack_msg); + +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_reqmsg, len); + if (rc == 1) { + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} + +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_repmsg, len); + if (rc == 1) { + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} + +static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, + const int inout, int offset) +{ + struct ptlrpc_body *pb; + struct lustre_msg_v2 *m = inout ? 
req->rq_reqmsg : req->rq_repmsg; + + pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2)); + if (!pb) { + CERROR("error unpacking ptlrpc body\n"); + return -EFAULT; + } + if (ptlrpc_buf_need_swab(req, inout, offset)) { + lustre_swab_ptlrpc_body(pb); + ptlrpc_buf_set_swabbed(req, inout, offset); + } + + if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { + CERROR("wrong lustre_msg version %08x\n", pb->pb_version); + return -EINVAL; + } + + if (!inout) + pb->pb_status = ptlrpc_status_ntoh(pb->pb_status); + + return 0; +} + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 1, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EINVAL; + } +} + +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 0, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_repmsg->lm_magic); + return -EINVAL; + } +} + +static inline __u32 lustre_msg_buflen_v2(struct lustre_msg_v2 *m, __u32 n) +{ + if (n >= m->lm_bufcount) + return 0; + + return m->lm_buflens[n]; +} + +/** + * lustre_msg_buflen - return the length of buffer \a n in message \a m + * \param m lustre_msg (request or reply) to look at + * \param n message index (base 0) + * + * returns zero for non-existent message indices + */ +__u32 lustre_msg_buflen(struct lustre_msg *m, __u32 n) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buflen_v2(m, n); + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_buflen); + +static inline void +lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, __u32 n, __u32 len) +{ + if (n >= m->lm_bufcount) + LBUG(); + + m->lm_buflens[n] = len; +} + +void lustre_msg_set_buflen(struct lustre_msg *m, __u32 n, __u32 len) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + lustre_msg_set_buflen_v2(m, n, len); + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } +} + +/* NB return the bufcount for lustre_msg_v2 format, so if message is packed + * in V1 format, the result is one bigger. (add struct ptlrpc_body). 
*/ +__u32 lustre_msg_bufcount(struct lustre_msg *m) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return m->lm_bufcount; + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return 0; + } +} + +char *lustre_msg_string(struct lustre_msg *m, __u32 index, __u32 max_len) +{ + /* max_len == 0 means the string should fill the buffer */ + char *str; + __u32 slen, blen; + + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + str = lustre_msg_buf_v2(m, index, 0); + blen = lustre_msg_buflen_v2(m, index); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } + + if (str == NULL) { + CERROR ("can't unpack string in msg %p buffer[%d]\n", m, index); + return NULL; + } + + slen = strnlen(str, blen); + + if (slen == blen) { /* not NULL terminated */ + CERROR("can't unpack non-NULL terminated string in " + "msg %p buffer[%d] len %d\n", m, index, blen); + return NULL; + } + if (blen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer length of msg %p buffer[%d] is invalid(%d)\n", + m, index, blen); + return NULL; + } + + if (max_len == 0) { + if (slen != blen - 1) { + CERROR("can't unpack short string in msg %p " + "buffer[%d] len %d: strlen %d\n", + m, index, blen, slen); + return NULL; + } + } else if (slen > max_len) { + CERROR("can't unpack oversized string in msg %p " + "buffer[%d] len %d strlen %d: max %d expected\n", + m, index, blen, slen, max_len); + return NULL; + } + + return str; +} + +/* Wrap up the normal fixed length cases */ +static inline void *__lustre_swab_buf(struct lustre_msg *msg, __u32 index, + __u32 min_size, void *swabber) +{ + void *ptr = NULL; + + LASSERT(msg != NULL); + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + ptr = lustre_msg_buf_v2(msg, index, min_size); + break; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + } + + if (ptr != NULL && swabber != NULL) + ((void (*)(void *))swabber)(ptr); + + return ptr; +} + +static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) +{ + return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body_v2)); +} + +enum lustre_msghdr lustre_msghdr_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + /* already in host endian */ + return msg->lm_flags; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msghdr_get_flags); + +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + msg->lm_flags = flags; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_flags; + + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + /* flags might be printed in debug code while message + * uninitialized */ + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_flags); + +void lustre_msg_add_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_flags); + +void lustre_msg_set_flags(struct 
lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags = flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags &= ~flags; + + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_clear_flags); + +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_op_flags; + + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + return 0; + } +} + +void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_op_flags); + +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return &pb->pb_handle; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} + +__u32 lustre_msg_get_type(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return PTL_RPC_MSG_ERR; + } + return pb->pb_type; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return PTL_RPC_MSG_ERR; + } +} +EXPORT_SYMBOL(lustre_msg_get_type); + +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_version; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_add_version(struct lustre_msg *msg, __u32 version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_version |= version; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_opc(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_opc; + } + default: + CERROR("incorrect message magic: %08x (msg:%p)\n", + msg->lm_magic, msg); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_opc); + +__u64 
lustre_msg_get_last_xid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_xid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_xid); + +__u16 lustre_msg_get_tag(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_tag; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_tag); + +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_committed; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_committed); + +__u64 *lustre_msg_get_versions(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return pb->pb_pre_versions; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_versions); + +__u64 lustre_msg_get_transno(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_transno; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_transno); + +int lustre_msg_get_status(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_status; + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + fallthrough; + default: + /* status might be printed in debug code while message + * uninitialized */ + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_status); + +__u64 lustre_msg_get_slv(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_slv; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return -EINVAL; + } +} + + +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_slv = slv; + return; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return; + } +} + +__u32 lustre_msg_get_limit(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; 
+ } + return pb->pb_limit; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return -EINVAL; + } +} + + +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_limit = limit; + return; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return; + } +} + +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_conn_cnt; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_conn_cnt); + +__u32 lustre_msg_get_magic(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_magic; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_timeout; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_service_time; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +char *lustre_msg_get_jobid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = + lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + if (!pb) + return NULL; + + return pb->pb_jobid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_jobid); + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u64 lustre_msg_get_mbits(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_mbits; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + __u32 len = lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF); + + unsigned int hsize = 4; + __u32 crc; + + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, + len, NULL, 0, (unsigned char *)&crc, + &hsize); + return crc; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_set_handle(struct lustre_msg 
*msg, struct lustre_handle *handle) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_handle = *handle; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_type = type; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_opc = opc; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_xid = last_xid; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_xid); + +void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_tag = tag; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_tag); + +void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_committed = last_committed; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_pre_versions[0] = versions[0]; + pb->pb_pre_versions[1] = versions[1]; + pb->pb_pre_versions[2] = versions[2]; + pb->pb_pre_versions[3] = versions[3]; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_versions); + +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_transno = transno; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_transno); + +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_status = status; + return; + } + default: + LASSERTF(0, 
"incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_status); + +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_conn_cnt = conn_cnt; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERT(timeout >= 0); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_timeout = timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERT(service_timeout >= 0); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_service_time = service_timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + __u32 opc = lustre_msg_get_opc(msg); + struct ptlrpc_body *pb; + + /* Don't set jobid for ldlm ast RPCs, they've been shrinked. + * See the comment in ptlrpc_request_pack(). */ + if (!opc || opc == LDLM_BL_CALLBACK || + opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK) + return; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + + if (jobid != NULL) + memcpy(pb->pb_jobid, jobid, sizeof(pb->pb_jobid)); + else if (pb->pb_jobid[0] == '\0') + lustre_get_jobid(pb->pb_jobid, sizeof(pb->pb_jobid)); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_jobid); + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_mbits = mbits; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void ptlrpc_request_set_replen(struct ptlrpc_request *req) +{ + int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); + + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, + req->rq_pill.rc_area[RCL_SERVER]); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_request_set_replen); + +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens) +{ + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} + +/** + * Send a remote set_info_async. + * + * This may go from client to server or server to client. 
+ */ +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + size_t keylen, void *key, + size_t vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + char *tmp; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + ptlrpc_request_set_replen(req); + + if (set) { + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + + RETURN(rc); +} +EXPORT_SYMBOL(do_set_info_async); + +/* byte flipping routines for all wire types declared in + * lustre_idl.h implemented here. + */ +void lustre_swab_ptlrpc_body(struct ptlrpc_body *body) +{ + __swab32s(&body->pb_type); + __swab32s(&body->pb_version); + __swab32s(&body->pb_opc); + __swab32s(&body->pb_status); + __swab64s(&body->pb_last_xid); + __swab16s(&body->pb_tag); + CLASSERT(offsetof(typeof(*body), pb_padding0) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding1) != 0); + __swab64s(&body->pb_last_committed); + __swab64s(&body->pb_transno); + __swab32s(&body->pb_flags); + __swab32s(&body->pb_op_flags); + __swab32s(&body->pb_conn_cnt); + __swab32s(&body->pb_timeout); + __swab32s(&body->pb_service_time); + __swab32s(&body->pb_limit); + __swab64s(&body->pb_slv); + __swab64s(&body->pb_pre_versions[0]); + __swab64s(&body->pb_pre_versions[1]); + __swab64s(&body->pb_pre_versions[2]); + __swab64s(&body->pb_pre_versions[3]); + __swab64s(&body->pb_mbits); + CLASSERT(offsetof(typeof(*body), pb_padding64_0) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding64_1) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding64_2) != 0); + /* While we need to maintain compatibility between + * clients and servers without ptlrpc_body_v2 (< 2.3) + * do not swab any fields beyond pb_jobid, as we are + * using this swab function for both ptlrpc_body + * and ptlrpc_body_v2. */ + /* pb_jobid is an ASCII string and should not be swabbed */ + CLASSERT(offsetof(typeof(*body), pb_jobid) != 0); +} + +void lustre_swab_connect(struct obd_connect_data *ocd) +{ + __swab64s(&ocd->ocd_connect_flags); + __swab32s(&ocd->ocd_version); + __swab32s(&ocd->ocd_grant); + __swab64s(&ocd->ocd_ibits_known); + __swab32s(&ocd->ocd_index); + __swab32s(&ocd->ocd_brw_size); + /* ocd_blocksize and ocd_inodespace don't need to be swabbed because + * they are 8-byte values */ + __swab16s(&ocd->ocd_grant_tax_kb); + __swab32s(&ocd->ocd_grant_max_blks); + __swab64s(&ocd->ocd_transno); + __swab32s(&ocd->ocd_group); + __swab32s(&ocd->ocd_cksum_types); + __swab32s(&ocd->ocd_instance); + /* Fields after ocd_cksum_types are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. 
*/ + if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) + __swab32s(&ocd->ocd_max_easize); + if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES) + __swab64s(&ocd->ocd_maxbytes); + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + __swab16s(&ocd->ocd_maxmodrpcs); + CLASSERT(offsetof(typeof(*ocd), padding0) != 0); + CLASSERT(offsetof(typeof(*ocd), padding1) != 0); + if (ocd->ocd_connect_flags & OBD_CONNECT_FLAGS2) + __swab64s(&ocd->ocd_connect_flags2); + CLASSERT(offsetof(typeof(*ocd), padding3) != 0); + CLASSERT(offsetof(typeof(*ocd), padding4) != 0); + CLASSERT(offsetof(typeof(*ocd), padding5) != 0); + CLASSERT(offsetof(typeof(*ocd), padding6) != 0); + CLASSERT(offsetof(typeof(*ocd), padding7) != 0); + CLASSERT(offsetof(typeof(*ocd), padding8) != 0); + CLASSERT(offsetof(typeof(*ocd), padding9) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingA) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingB) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingC) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingD) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingE) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingF) != 0); +} + +static void lustre_swab_ost_layout(struct ost_layout *ol) +{ + __swab32s(&ol->ol_stripe_size); + __swab32s(&ol->ol_stripe_count); + __swab64s(&ol->ol_comp_start); + __swab64s(&ol->ol_comp_end); + __swab32s(&ol->ol_comp_id); +} + +void lustre_swab_obdo (struct obdo *o) +{ + __swab64s(&o->o_valid); + lustre_swab_ost_id(&o->o_oi); + __swab64s(&o->o_parent_seq); + __swab64s(&o->o_size); + __swab64s(&o->o_mtime); + __swab64s(&o->o_atime); + __swab64s(&o->o_ctime); + __swab64s(&o->o_blocks); + __swab64s(&o->o_grant); + __swab32s(&o->o_blksize); + __swab32s(&o->o_mode); + __swab32s(&o->o_uid); + __swab32s(&o->o_gid); + __swab32s(&o->o_flags); + __swab32s(&o->o_nlink); + __swab32s(&o->o_parent_oid); + __swab32s(&o->o_misc); + __swab64s(&o->o_ioepoch); + __swab32s(&o->o_stripe_idx); + __swab32s(&o->o_parent_ver); + lustre_swab_ost_layout(&o->o_layout); + __swab32s(&o->o_layout_version); + __swab32s(&o->o_uid_h); + __swab32s(&o->o_gid_h); + __swab64s(&o->o_data_version); + __swab32s(&o->o_projid); + CLASSERT(offsetof(typeof(*o), o_padding_4) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_5) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_6) != 0); + +} +EXPORT_SYMBOL(lustre_swab_obdo); + +void lustre_swab_obd_statfs (struct obd_statfs *os) +{ + __swab64s(&os->os_type); + __swab64s(&os->os_blocks); + __swab64s(&os->os_bfree); + __swab64s(&os->os_bavail); + __swab64s(&os->os_files); + __swab64s(&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s(&os->os_bsize); + __swab32s(&os->os_namelen); + __swab64s(&os->os_maxbytes); + __swab32s(&os->os_state); + __swab32s(&os->os_fprecreated); + __swab32s(&os->os_granted); + CLASSERT(offsetof(typeof(*os), os_spare3) != 0); + CLASSERT(offsetof(typeof(*os), os_spare4) != 0); + CLASSERT(offsetof(typeof(*os), os_spare5) != 0); + CLASSERT(offsetof(typeof(*os), os_spare6) != 0); + CLASSERT(offsetof(typeof(*os), os_spare7) != 0); + CLASSERT(offsetof(typeof(*os), os_spare8) != 0); + CLASSERT(offsetof(typeof(*os), os_spare9) != 0); +} + +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) +{ + lustre_swab_ost_id(&ioo->ioo_oid); + __swab32s(&ioo->ioo_max_brw); + __swab32s(&ioo->ioo_bufcnt); +} + +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr) +{ + __swab64s(&nbr->rnb_offset); + __swab32s(&nbr->rnb_len); + __swab32s(&nbr->rnb_flags); +} + +void lustre_swab_ost_body (struct ost_body *b) +{ + lustre_swab_obdo (&b->oa); +} + +void 
lustre_swab_ost_last_id(u64 *id) +{ + __swab64s(id); +} + +void lustre_swab_generic_32s(__u32 *val) +{ + __swab32s(val); +} + +void lustre_swab_gl_lquota_desc(struct ldlm_gl_lquota_desc *desc) +{ + lustre_swab_lu_fid(&desc->gl_id.qid_fid); + __swab64s(&desc->gl_flags); + __swab64s(&desc->gl_ver); + __swab64s(&desc->gl_hardlimit); + __swab64s(&desc->gl_softlimit); + __swab64s(&desc->gl_time); + CLASSERT(offsetof(typeof(*desc), gl_pad2) != 0); +} +EXPORT_SYMBOL(lustre_swab_gl_lquota_desc); + +void lustre_swab_gl_barrier_desc(struct ldlm_gl_barrier_desc *desc) +{ + __swab32s(&desc->lgbd_status); + __swab32s(&desc->lgbd_timeout); + CLASSERT(offsetof(typeof(*desc), lgbd_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_gl_barrier_desc); + +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb_v1); + +void lustre_swab_ost_lvb(struct ost_lvb *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); + __swab32s(&lvb->lvb_mtime_ns); + __swab32s(&lvb->lvb_atime_ns); + __swab32s(&lvb->lvb_ctime_ns); + __swab32s(&lvb->lvb_padding); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb); + +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb) +{ + __swab64s(&lvb->lvb_flags); + __swab64s(&lvb->lvb_id_may_rel); + __swab64s(&lvb->lvb_id_rel); + __swab64s(&lvb->lvb_id_qunit); + __swab64s(&lvb->lvb_pad1); +} +EXPORT_SYMBOL(lustre_swab_lquota_lvb); + +void lustre_swab_barrier_lvb(struct barrier_lvb *lvb) +{ + __swab32s(&lvb->lvb_status); + __swab32s(&lvb->lvb_index); + CLASSERT(offsetof(typeof(*lvb), lvb_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_barrier_lvb); + +void lustre_swab_mdt_body (struct mdt_body *b) +{ + lustre_swab_lu_fid(&b->mbo_fid1); + lustre_swab_lu_fid(&b->mbo_fid2); + /* handle is opaque */ + __swab64s(&b->mbo_valid); + __swab64s(&b->mbo_size); + __swab64s(&b->mbo_mtime); + __swab64s(&b->mbo_atime); + __swab64s(&b->mbo_ctime); + __swab64s(&b->mbo_blocks); + __swab64s(&b->mbo_version); + __swab64s(&b->mbo_t_state); + __swab32s(&b->mbo_fsuid); + __swab32s(&b->mbo_fsgid); + __swab32s(&b->mbo_capability); + __swab32s(&b->mbo_mode); + __swab32s(&b->mbo_uid); + __swab32s(&b->mbo_gid); + __swab32s(&b->mbo_flags); + __swab32s(&b->mbo_rdev); + __swab32s(&b->mbo_nlink); + __swab32s(&b->mbo_layout_gen); + __swab32s(&b->mbo_suppgid); + __swab32s(&b->mbo_eadatasize); + __swab32s(&b->mbo_aclsize); + __swab32s(&b->mbo_max_mdsize); + CLASSERT(offsetof(typeof(*b), mbo_unused3) != 0); + __swab32s(&b->mbo_uid_h); + __swab32s(&b->mbo_gid_h); + __swab32s(&b->mbo_projid); + __swab64s(&b->mbo_dom_size); + __swab64s(&b->mbo_dom_blocks); + CLASSERT(offsetof(typeof(*b), mbo_padding_8) != 0); + CLASSERT(offsetof(typeof(*b), mbo_padding_9) != 0); + CLASSERT(offsetof(typeof(*b), mbo_padding_10) != 0); +} + +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) +{ + /* mio_open_handle is opaque */ + CLASSERT(offsetof(typeof(*b), mio_unused1) != 0); + CLASSERT(offsetof(typeof(*b), mio_unused2) != 0); + CLASSERT(offsetof(typeof(*b), mio_padding) != 0); +} + +void lustre_swab_mgs_target_info(struct mgs_target_info *mti) +{ + int i; + + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); + __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + 
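+	/* mti_nids are carried on the wire as raw 64-bit values */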
CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); +} + +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) +{ + __u8 i; + + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); + CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. */ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } +} +EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); + +void lustre_swab_mgs_config_body(struct mgs_config_body *body) +{ + __swab64s(&body->mcb_offset); + __swab32s(&body->mcb_units); + __swab16s(&body->mcb_type); +} + +void lustre_swab_mgs_config_res(struct mgs_config_res *body) +{ + __swab64s(&body->mcr_offset); + __swab64s(&body->mcr_size); +} + +static void lustre_swab_obd_dqinfo (struct obd_dqinfo *i) +{ + __swab64s (&i->dqi_bgrace); + __swab64s (&i->dqi_igrace); + __swab32s (&i->dqi_flags); + __swab32s (&i->dqi_valid); +} + +static void lustre_swab_obd_dqblk (struct obd_dqblk *b) +{ + __swab64s (&b->dqb_ihardlimit); + __swab64s (&b->dqb_isoftlimit); + __swab64s (&b->dqb_curinodes); + __swab64s (&b->dqb_bhardlimit); + __swab64s (&b->dqb_bsoftlimit); + __swab64s (&b->dqb_curspace); + __swab64s (&b->dqb_btime); + __swab64s (&b->dqb_itime); + __swab32s (&b->dqb_valid); + CLASSERT(offsetof(typeof(*b), dqb_padding) != 0); +} + +void lustre_swab_obd_quotactl (struct obd_quotactl *q) +{ + __swab32s (&q->qc_cmd); + __swab32s (&q->qc_type); + __swab32s (&q->qc_id); + __swab32s (&q->qc_stat); + lustre_swab_obd_dqinfo (&q->qc_dqinfo); + lustre_swab_obd_dqblk (&q->qc_dqblk); +} + +void lustre_swab_fid2path(struct getinfo_fid2path *gf) +{ + lustre_swab_lu_fid(&gf->gf_fid); + __swab64s(&gf->gf_recno); + __swab32s(&gf->gf_linkno); + __swab32s(&gf->gf_pathlen); +} +EXPORT_SYMBOL(lustre_swab_fid2path); + +static void lustre_swab_fiemap_extent(struct fiemap_extent *fm_extent) +{ + __swab64s(&fm_extent->fe_logical); + __swab64s(&fm_extent->fe_physical); + __swab64s(&fm_extent->fe_length); + __swab32s(&fm_extent->fe_flags); + __swab32s(&fm_extent->fe_device); +} + +static void lustre_swab_fiemap_hdr(struct fiemap *fiemap) +{ + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); +} + +void lustre_swab_fiemap(struct fiemap *fiemap) +{ + __u32 i; + + lustre_swab_fiemap_hdr(fiemap); + + for (i = 0; i < fiemap->fm_mapped_extents; i++) + lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); +} + +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info) +{ + lustre_swab_obdo(&fiemap_info->lfik_oa); + lustre_swab_fiemap_hdr(&fiemap_info->lfik_fiemap); +} + +void lustre_swab_idx_info(struct idx_info *ii) +{ + __swab32s(&ii->ii_magic); + __swab32s(&ii->ii_flags); + __swab16s(&ii->ii_count); + __swab32s(&ii->ii_attrs); + lustre_swab_lu_fid(&ii->ii_fid); + __swab64s(&ii->ii_version); + __swab64s(&ii->ii_hash_start); + __swab64s(&ii->ii_hash_end); + __swab16s(&ii->ii_keysize); + __swab16s(&ii->ii_recsize); +} + +void lustre_swab_lip_header(struct lu_idxpage *lip) +{ + /* swab header */ + 
__swab32s(&lip->lip_magic); + __swab16s(&lip->lip_flags); + __swab16s(&lip->lip_nr); +} +EXPORT_SYMBOL(lustre_swab_lip_header); + +void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) +{ + __swab32s(&rr->rr_opcode); + __swab32s(&rr->rr_cap); + __swab32s(&rr->rr_fsuid); + /* rr_fsuid_h is unused */ + __swab32s(&rr->rr_fsgid); + /* rr_fsgid_h is unused */ + __swab32s(&rr->rr_suppgid1); + /* rr_suppgid1_h is unused */ + __swab32s(&rr->rr_suppgid2); + /* rr_suppgid2_h is unused */ + lustre_swab_lu_fid(&rr->rr_fid1); + lustre_swab_lu_fid(&rr->rr_fid2); + __swab64s(&rr->rr_mtime); + __swab64s(&rr->rr_atime); + __swab64s(&rr->rr_ctime); + __swab64s(&rr->rr_size); + __swab64s(&rr->rr_blocks); + __swab32s(&rr->rr_bias); + __swab32s(&rr->rr_mode); + __swab32s(&rr->rr_flags); + __swab32s(&rr->rr_flags_h); + __swab32s(&rr->rr_umask); + __swab16s(&rr->rr_mirror_id); + + CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); +}; + +void lustre_swab_lov_desc (struct lov_desc *ld) +{ + __swab32s (&ld->ld_tgt_count); + __swab32s (&ld->ld_active_tgt_count); + __swab32s (&ld->ld_default_stripe_count); + __swab32s (&ld->ld_pattern); + __swab64s (&ld->ld_default_stripe_size); + __swab64s (&ld->ld_default_stripe_offset); + __swab32s (&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} +EXPORT_SYMBOL(lustre_swab_lov_desc); + +void lustre_swab_lmv_desc (struct lmv_desc *ld) +{ + __swab32s (&ld->ld_tgt_count); + __swab32s (&ld->ld_active_tgt_count); + __swab32s (&ld->ld_default_stripe_count); + __swab32s (&ld->ld_pattern); + __swab64s (&ld->ld_default_hash_size); + __swab32s (&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} + +/* This structure is always in little-endian */ +static void lustre_swab_lmv_mds_md_v1(struct lmv_mds_md_v1 *lmm1) +{ + int i; + + __swab32s(&lmm1->lmv_magic); + __swab32s(&lmm1->lmv_stripe_count); + __swab32s(&lmm1->lmv_master_mdt_index); + __swab32s(&lmm1->lmv_hash_type); + __swab32s(&lmm1->lmv_layout_version); + for (i = 0; i < lmm1->lmv_stripe_count; i++) + lustre_swab_lu_fid(&lmm1->lmv_stripe_fids[i]); +} + +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) +{ + switch (lmm->lmv_magic) { + case LMV_MAGIC_V1: + lustre_swab_lmv_mds_md_v1(&lmm->lmv_md_v1); + break; + default: + break; + } +} +EXPORT_SYMBOL(lustre_swab_lmv_mds_md); + +void lustre_swab_lmv_user_md_objects(struct lmv_user_mds_data *lmd, + int stripe_count) +{ + int i; + + for (i = 0; i < stripe_count; i++) + __swab32s(&(lmd[i].lum_mds)); +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md_objects); + + +void lustre_swab_lmv_user_md(struct lmv_user_md *lum) +{ + __u32 count = lum->lum_stripe_count; + + __swab32s(&lum->lum_magic); + __swab32s(&lum->lum_stripe_count); + __swab32s(&lum->lum_stripe_offset); + __swab32s(&lum->lum_hash_type); + __swab32s(&lum->lum_type); + /* lum_max_inherit and lum_max_inherit_rr do not need to be swabbed */ + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding1) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding2) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding3) == 0); + switch (lum->lum_magic) { + case LMV_USER_MAGIC_SPECIFIC: + count = lum->lum_stripe_count; + fallthrough; + case __swab32(LMV_USER_MAGIC_SPECIFIC): + lustre_swab_lmv_user_md_objects(lum->lum_objects, count); + break; + default: + break; + } +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md); + +static void lustre_print_v1v3(unsigned int lvl, struct lov_user_md *lum, + const char *msg) +{ + CDEBUG(lvl, "%s lov_user_md %p:\n", msg, lum); + CDEBUG(lvl, "\tlmm_magic: %#x\n", lum->lmm_magic); + CDEBUG(lvl, "\tlmm_pattern: 
%#x\n", lum->lmm_pattern); + CDEBUG(lvl, "\tlmm_object_id: %llu\n", lmm_oi_id(&lum->lmm_oi)); + CDEBUG(lvl, "\tlmm_object_gr: %llu\n", lmm_oi_seq(&lum->lmm_oi)); + CDEBUG(lvl, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); + CDEBUG(lvl, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); + CDEBUG(lvl, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n", + lum->lmm_stripe_offset); + if (lum->lmm_magic == LOV_USER_MAGIC_V3) { + struct lov_user_md_v3 *v3 = (void *)lum; + CDEBUG(lvl, "\tlmm_pool_name: %s\n", v3->lmm_pool_name); + } + if (lum->lmm_magic == LOV_USER_MAGIC_SPECIFIC) { + struct lov_user_md_v3 *v3 = (void *)lum; + int i; + + if (v3->lmm_pool_name[0] != '\0') + CDEBUG(lvl, "\tlmm_pool_name: %s\n", v3->lmm_pool_name); + + CDEBUG(lvl, "\ttarget list:\n"); + for (i = 0; i < v3->lmm_stripe_count; i++) + CDEBUG(lvl, "\t\t%u\n", v3->lmm_objects[i].l_ost_idx); + } +} + +void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, + const char *msg) +{ + struct lov_comp_md_v1 *comp_v1; + int i; + + if (likely(!cfs_cdebug_show(lvl, DEBUG_SUBSYSTEM))) + return; + + if (lum->lmm_magic == LOV_USER_MAGIC_V1 || + lum->lmm_magic == LOV_USER_MAGIC_V3) { + lustre_print_v1v3(lvl, lum, msg); + return; + } + + if (lum->lmm_magic != LOV_USER_MAGIC_COMP_V1) { + CDEBUG(lvl, "%s: bad magic: %x\n", msg, lum->lmm_magic); + return; + } + + comp_v1 = (struct lov_comp_md_v1 *)lum; + CDEBUG(lvl, "%s: lov_comp_md_v1 %p:\n", msg, lum); + CDEBUG(lvl, "\tlcm_magic: %#x\n", comp_v1->lcm_magic); + CDEBUG(lvl, "\tlcm_size: %#x\n", comp_v1->lcm_size); + CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen); + CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags); + CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count); + CDEBUG(lvl, "\tlcm_mirror_count: %#x\n\n", comp_v1->lcm_mirror_count); + + for (i = 0; i < comp_v1->lcm_entry_count; i++) { + struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i]; + struct lov_user_md *v1; + + CDEBUG(lvl, "\tentry %d:\n", i); + CDEBUG(lvl, "\tlcme_id: %#x\n", ent->lcme_id); + CDEBUG(lvl, "\tlcme_flags: %#x\n", ent->lcme_flags); + if (ent->lcme_flags & LCME_FL_NOSYNC) + CDEBUG(lvl, "\tlcme_timestamp: %llu\n", + ent->lcme_timestamp); + CDEBUG(lvl, "\tlcme_extent.e_start: %llu\n", + ent->lcme_extent.e_start); + CDEBUG(lvl, "\tlcme_extent.e_end: %llu\n", + ent->lcme_extent.e_end); + CDEBUG(lvl, "\tlcme_offset: %#x\n", ent->lcme_offset); + CDEBUG(lvl, "\tlcme_size: %#x\n\n", ent->lcme_size); + + v1 = (struct lov_user_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + lustre_print_v1v3(lvl, v1, msg); + } +} +EXPORT_SYMBOL(lustre_print_user_md); + +static void lustre_swab_lmm_oi(struct ost_id *oi) +{ + __swab64s(&oi->oi.oi_id); + __swab64s(&oi->oi.oi_seq); +} + +static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) +{ + ENTRY; + __swab32s(&lum->lmm_magic); + __swab32s(&lum->lmm_pattern); + lustre_swab_lmm_oi(&lum->lmm_oi); + __swab32s(&lum->lmm_stripe_size); + __swab16s(&lum->lmm_stripe_count); + __swab16s(&lum->lmm_stripe_offset); + EXIT; +} + +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n"); + lustre_swab_lov_user_md_common(lum); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); + +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n"); + lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum); + /* lmm_pool_name nothing to do with char */ + EXIT; +} 
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); + +void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) +{ + struct lov_comp_md_entry_v1 *ent; + struct lov_user_md_v1 *v1; + struct lov_user_md_v3 *v3; + int i; + bool cpu_endian; + __u32 off, size; + __u16 ent_count, stripe_count; + ENTRY; + + cpu_endian = lum->lcm_magic == LOV_USER_MAGIC_COMP_V1; + ent_count = lum->lcm_entry_count; + if (!cpu_endian) + __swab16s(&ent_count); + + CDEBUG(D_IOCTL, "swabbing lov_user_comp_md v1\n"); + __swab32s(&lum->lcm_magic); + __swab32s(&lum->lcm_size); + __swab32s(&lum->lcm_layout_gen); + __swab16s(&lum->lcm_flags); + __swab16s(&lum->lcm_entry_count); + __swab16s(&lum->lcm_mirror_count); + CLASSERT(offsetof(typeof(*lum), lcm_padding1) != 0); + CLASSERT(offsetof(typeof(*lum), lcm_padding2) != 0); + + for (i = 0; i < ent_count; i++) { + ent = &lum->lcm_entries[i]; + off = ent->lcme_offset; + size = ent->lcme_size; + + if (!cpu_endian) { + __swab32s(&off); + __swab32s(&size); + } + __swab32s(&ent->lcme_id); + __swab32s(&ent->lcme_flags); + __swab64s(&ent->lcme_timestamp); + __swab64s(&ent->lcme_extent.e_start); + __swab64s(&ent->lcme_extent.e_end); + __swab32s(&ent->lcme_offset); + __swab32s(&ent->lcme_size); + __swab32s(&ent->lcme_layout_gen); + CLASSERT(offsetof(typeof(*ent), lcme_padding_1) != 0); + + v1 = (struct lov_user_md_v1 *)((char *)lum + off); + stripe_count = v1->lmm_stripe_count; + if (!cpu_endian) + __swab16s(&stripe_count); + + if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V1) || + v1->lmm_magic == LOV_USER_MAGIC_V1) { + lustre_swab_lov_user_md_v1(v1); + if (size > sizeof(*v1)) + lustre_swab_lov_user_md_objects(v1->lmm_objects, + stripe_count); + } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V3) || + v1->lmm_magic == LOV_USER_MAGIC_V3 || + v1->lmm_magic == __swab32(LOV_USER_MAGIC_SPECIFIC) || + v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) { + v3 = (struct lov_user_md_v3 *)v1; + lustre_swab_lov_user_md_v3(v3); + if (size > sizeof(*v3)) + lustre_swab_lov_user_md_objects(v3->lmm_objects, + stripe_count); + } else { + CERROR("Invalid magic %#x\n", v1->lmm_magic); + } + } +} +EXPORT_SYMBOL(lustre_swab_lov_comp_md_v1); + +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count) +{ + int i; + ENTRY; + for (i = 0; i < stripe_count; i++) { + lustre_swab_ost_id(&(lod[i].l_ost_oi)); + __swab32s(&(lod[i].l_ost_gen)); + __swab32s(&(lod[i].l_ost_idx)); + } + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); + +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size) +{ + struct lov_user_md_v1 *v1; + struct lov_user_md_v3 *v3; + __u16 stripe_count; + ENTRY; + + CDEBUG(D_IOCTL, "swabbing lov_user_md\n"); + switch (lum->lmm_magic) { + case __swab32(LOV_MAGIC_V1): + case LOV_USER_MAGIC_V1: + { + v1 = (struct lov_user_md_v1 *)lum; + stripe_count = v1->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V1) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v1(v1); + if (size > sizeof(*v1)) + lustre_swab_lov_user_md_objects(v1->lmm_objects, + stripe_count); + + break; + } + case __swab32(LOV_MAGIC_V3): + case LOV_USER_MAGIC_V3: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V3) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + if (size > sizeof(*v3)) + lustre_swab_lov_user_md_objects(v3->lmm_objects, + stripe_count); + break; + } + case __swab32(LOV_USER_MAGIC_SPECIFIC): + case LOV_USER_MAGIC_SPECIFIC: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = 
v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_SPECIFIC) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + lustre_swab_lov_user_md_objects(v3->lmm_objects, stripe_count); + break; + } + case __swab32(LOV_MAGIC_COMP_V1): + case LOV_USER_MAGIC_COMP_V1: + lustre_swab_lov_comp_md_v1((struct lov_comp_md_v1 *)lum); + break; + default: + CDEBUG(D_IOCTL, "Invalid LOV magic %08x\n", lum->lmm_magic); + } +} +EXPORT_SYMBOL(lustre_swab_lov_user_md); + +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + +void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) +{ + int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + __swab64s (&id->name[i]); +} + +void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d) +{ + /* the lock data is a union and the first two fields are always an + * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock + * data the same way. */ + __swab64s(&d->l_extent.start); + __swab64s(&d->l_extent.end); + __swab64s(&d->l_extent.gid); + __swab64s(&d->l_flock.lfw_owner); + __swab32s(&d->l_flock.lfw_pid); +} + +void lustre_swab_ldlm_intent (struct ldlm_intent *i) +{ + __swab64s(&i->opc); +} + +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r) +{ + __swab32s(&r->lr_type); + CLASSERT(offsetof(typeof(*r), lr_pad) != 0); + lustre_swab_ldlm_res_id(&r->lr_name); +} + +void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l) +{ + lustre_swab_ldlm_resource_desc (&l->l_resource); + __swab32s (&l->l_req_mode); + __swab32s (&l->l_granted_mode); + lustre_swab_ldlm_policy_data (&l->l_policy_data); +} + +void lustre_swab_ldlm_request (struct ldlm_request *rq) +{ + __swab32s (&rq->lock_flags); + lustre_swab_ldlm_lock_desc (&rq->lock_desc); + __swab32s (&rq->lock_count); + /* lock_handle[] opaque */ +} + +void lustre_swab_ldlm_reply (struct ldlm_reply *r) +{ + __swab32s (&r->lock_flags); + CLASSERT(offsetof(typeof(*r), lock_padding) != 0); + lustre_swab_ldlm_lock_desc (&r->lock_desc); + /* lock_handle opaque */ + __swab64s (&r->lock_policy_res1); + __swab64s (&r->lock_policy_res2); +} + +void lustre_swab_quota_body(struct quota_body *b) +{ + lustre_swab_lu_fid(&b->qb_fid); + lustre_swab_lu_fid((struct lu_fid *)&b->qb_id); + __swab32s(&b->qb_flags); + __swab64s(&b->qb_count); + __swab64s(&b->qb_usage); + __swab64s(&b->qb_slv_ver); +} + +/* Dump functions */ +void dump_ioo(struct obd_ioobj *ioo) +{ + CDEBUG(D_RPCTRACE, + "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, " + "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, + ioo->ioo_bufcnt); +} + +void dump_rniobuf(struct niobuf_remote *nb) +{ + CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n", + nb->rnb_offset, nb->rnb_len, nb->rnb_flags); +} + +void dump_obdo(struct obdo *oa) +{ + u64 valid = oa->o_valid; + + CDEBUG(D_RPCTRACE, "obdo: o_valid = %#llx\n", valid); + if (valid & OBD_MD_FLID) + CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi)); + if (valid & OBD_MD_FLFID) + CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n", + oa->o_parent_seq); + if (valid & OBD_MD_FLSIZE) + CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", oa->o_size); + if (valid & OBD_MD_FLMTIME) + CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); + if (valid & OBD_MD_FLATIME) 
+ CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); + if (valid & OBD_MD_FLCTIME) + CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); + if (valid & OBD_MD_FLGRANT) + CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? ~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLPARENT) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) + CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", + oa->o_handle.cookie); +} + +void dump_ost_body(struct ost_body *ob) +{ + dump_obdo(&ob->oa); +} + +void dump_rcs(__u32 *rc) +{ + CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); +} + +static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + CERROR("bad lustre msg magic: %#08X\n", + req->rq_reqmsg->lm_magic); + } + return 0; +} + +static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + if (unlikely(!req->rq_repmsg)) + return 0; + + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + /* uninitialized yet */ + return 0; + } +} + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *msgdata, const char *fmt, ...) 
+{ + bool req_ok = req->rq_reqmsg != NULL; + bool rep_ok = false; + lnet_nid_t nid = LNET_NID_ANY; + va_list args; + int rep_flags = -1; + int rep_status = -1; + + spin_lock(&req->rq_early_free_lock); + if (req->rq_repmsg) + rep_ok = true; + + if (ptlrpc_req_need_swab(req)) { + req_ok = req_ok && req_ptlrpc_body_swabbed(req); + rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req); + } + + if (rep_ok) { + rep_flags = lustre_msg_get_flags(req->rq_repmsg); + rep_status = lustre_msg_get_status(req->rq_repmsg); + } + spin_unlock(&req->rq_early_free_lock); + + if (req->rq_import && req->rq_import->imp_connection) + nid = req->rq_import->imp_connection->c_peer.nid; + else if (req->rq_export && req->rq_export->exp_connection) + nid = req->rq_export->exp_connection->c_peer.nid; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " req@%p x%llu/t%lld(%lld) o%d->%s@%s:%d/%d lens %d/%d e %d to %lld dl %lld ref %d fl " REQ_FLAGS_FMT "/%x/%x rc %d/%d\n", + req, req->rq_xid, req->rq_transno, + req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0, + req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1, + req->rq_import ? + req->rq_import->imp_obd->obd_name : + req->rq_export ? + req->rq_export->exp_client_uuid.uuid : + "", + libcfs_nid2str(nid), + req->rq_request_portal, req->rq_reply_portal, + req->rq_reqlen, req->rq_replen, + req->rq_early_count, (s64)req->rq_timedout, + (s64)req->rq_deadline, + atomic_read(&req->rq_refcount), + DEBUG_REQ_FLAGS(req), + req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1, + rep_flags, req->rq_status, rep_status); + va_end(args); +} +EXPORT_SYMBOL(_debug_req); + +void lustre_swab_lustre_capa(struct lustre_capa *c) +{ + lustre_swab_lu_fid(&c->lc_fid); + __swab64s (&c->lc_opc); + __swab64s (&c->lc_uid); + __swab64s (&c->lc_gid); + __swab32s (&c->lc_flags); + __swab32s (&c->lc_keyid); + __swab32s (&c->lc_timeout); + __swab32s (&c->lc_expiry); +} + +void lustre_swab_lustre_capa_key(struct lustre_capa_key *k) +{ + __swab64s (&k->lk_seq); + __swab32s (&k->lk_keyid); + CLASSERT(offsetof(typeof(*k), lk_padding) != 0); +} + +void lustre_swab_hsm_user_state(struct hsm_user_state *state) +{ + __swab32s(&state->hus_states); + __swab32s(&state->hus_archive_id); +} + +void lustre_swab_hsm_state_set(struct hsm_state_set *hss) +{ + __swab32s(&hss->hss_valid); + __swab64s(&hss->hss_setmask); + __swab64s(&hss->hss_clearmask); + __swab32s(&hss->hss_archive_id); +} + +static void lustre_swab_hsm_extent(struct hsm_extent *extent) +{ + __swab64s(&extent->offset); + __swab64s(&extent->length); +} + +void lustre_swab_hsm_current_action(struct hsm_current_action *action) +{ + __swab32s(&action->hca_state); + __swab32s(&action->hca_action); + lustre_swab_hsm_extent(&action->hca_location); +} + +void lustre_swab_hsm_user_item(struct hsm_user_item *hui) +{ + lustre_swab_lu_fid(&hui->hui_fid); + lustre_swab_hsm_extent(&hui->hui_extent); +} + +void lustre_swab_lu_extent(struct lu_extent *le) +{ + __swab64s(&le->e_start); + __swab64s(&le->e_end); +} + +void lustre_swab_layout_intent(struct layout_intent *li) +{ + __swab32s(&li->li_opc); + __swab32s(&li->li_flags); + lustre_swab_lu_extent(&li->li_extent); +} + +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) +{ + lustre_swab_lu_fid(&hpk->hpk_fid); + __swab64s(&hpk->hpk_cookie); + __swab64s(&hpk->hpk_extent.offset); + __swab64s(&hpk->hpk_extent.length); + __swab16s(&hpk->hpk_flags); + __swab16s(&hpk->hpk_errval); +} + +void lustre_swab_hsm_request(struct hsm_request *hr) +{ + __swab32s(&hr->hr_action); + 
__swab32s(&hr->hr_archive_id); + __swab64s(&hr->hr_flags); + __swab32s(&hr->hr_itemcount); + __swab32s(&hr->hr_data_len); +} + +void lustre_swab_object_update(struct object_update *ou) +{ + struct object_update_param *param; + size_t i; + + __swab16s(&ou->ou_type); + __swab16s(&ou->ou_params_count); + __swab32s(&ou->ou_result_size); + __swab32s(&ou->ou_flags); + __swab32s(&ou->ou_padding1); + __swab64s(&ou->ou_batchid); + lustre_swab_lu_fid(&ou->ou_fid); + param = &ou->ou_params[0]; + for (i = 0; i < ou->ou_params_count; i++) { + __swab16s(¶m->oup_len); + __swab16s(¶m->oup_padding); + __swab32s(¶m->oup_padding2); + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } +} + +void lustre_swab_object_update_request(struct object_update_request *our) +{ + size_t i; + __swab32s(&our->ourq_magic); + __swab16s(&our->ourq_count); + __swab16s(&our->ourq_padding); + for (i = 0; i < our->ourq_count; i++) { + struct object_update *ou; + + ou = object_update_request_get(our, i, NULL); + if (ou == NULL) + return; + lustre_swab_object_update(ou); + } +} + +void lustre_swab_object_update_result(struct object_update_result *our) +{ + __swab32s(&our->our_rc); + __swab16s(&our->our_datalen); + __swab16s(&our->our_padding); +} + +void lustre_swab_object_update_reply(struct object_update_reply *our) +{ + size_t i; + + __swab32s(&our->ourp_magic); + __swab16s(&our->ourp_count); + __swab16s(&our->ourp_padding); + for (i = 0; i < our->ourp_count; i++) { + struct object_update_result *ourp; + + __swab16s(&our->ourp_lens[i]); + ourp = object_update_result_get(our, i, NULL); + if (ourp == NULL) + return; + lustre_swab_object_update_result(ourp); + } +} + +void lustre_swab_out_update_header(struct out_update_header *ouh) +{ + __swab32s(&ouh->ouh_magic); + __swab32s(&ouh->ouh_count); + __swab32s(&ouh->ouh_inline_length); + __swab32s(&ouh->ouh_reply_size); +} +EXPORT_SYMBOL(lustre_swab_out_update_header); + +void lustre_swab_out_update_buffer(struct out_update_buffer *oub) +{ + __swab32s(&oub->oub_size); + __swab32s(&oub->oub_padding); +} +EXPORT_SYMBOL(lustre_swab_out_update_buffer); + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) +{ + __swab64s(&msl->msl_flags); +} + +void lustre_swab_close_data(struct close_data *cd) +{ + lustre_swab_lu_fid(&cd->cd_fid); + __swab64s(&cd->cd_data_version); +} + +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync) +{ + int i; + + __swab32s(&resync->resync_count); + /* after swab, resync_count must in CPU endian */ + if (resync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + for (i = 0; i < resync->resync_count; i++) + __swab32s(&resync->resync_ids_inline[i]); + } +} +EXPORT_SYMBOL(lustre_swab_close_data_resync_done); + +void lustre_swab_lfsck_request(struct lfsck_request *lr) +{ + __swab32s(&lr->lr_event); + __swab32s(&lr->lr_index); + __swab32s(&lr->lr_flags); + __swab32s(&lr->lr_valid); + __swab32s(&lr->lr_speed); + __swab16s(&lr->lr_version); + __swab16s(&lr->lr_active); + __swab16s(&lr->lr_param); + __swab16s(&lr->lr_async_windows); + __swab32s(&lr->lr_flags); + lustre_swab_lu_fid(&lr->lr_fid); + lustre_swab_lu_fid(&lr->lr_fid2); + __swab32s(&lr->lr_comp_id); + CLASSERT(offsetof(typeof(*lr), lr_padding_0) != 0); + CLASSERT(offsetof(typeof(*lr), lr_padding_1) != 0); + CLASSERT(offsetof(typeof(*lr), lr_padding_2) != 0); + CLASSERT(offsetof(typeof(*lr), lr_padding_3) != 0); +} + +void lustre_swab_lfsck_reply(struct lfsck_reply *lr) +{ + __swab32s(&lr->lr_status); + CLASSERT(offsetof(typeof(*lr), 
lr_padding_1) != 0); + __swab64s(&lr->lr_repaired); +} + +static void lustre_swab_orphan_rec(struct lu_orphan_rec *rec) +{ + lustre_swab_lu_fid(&rec->lor_fid); + __swab32s(&rec->lor_uid); + __swab32s(&rec->lor_gid); +} + +void lustre_swab_orphan_ent(struct lu_orphan_ent *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent); + +void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v2); + +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + __swab32s(&ent->loe_rec.lor_layout_version); + __swab32s(&ent->loe_rec.lor_range); + CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding_1) != 0); + CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding_2) != 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v3); + +void lustre_swab_ladvise(struct lu_ladvise *ladvise) +{ + __swab16s(&ladvise->lla_advice); + __swab16s(&ladvise->lla_value1); + __swab32s(&ladvise->lla_value2); + __swab64s(&ladvise->lla_start); + __swab64s(&ladvise->lla_end); + __swab32s(&ladvise->lla_value3); + __swab32s(&ladvise->lla_value4); +} +EXPORT_SYMBOL(lustre_swab_ladvise); + +void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr) +{ + __swab32s(&ladvise_hdr->lah_magic); + __swab32s(&ladvise_hdr->lah_count); + __swab64s(&ladvise_hdr->lah_flags); + __swab32s(&ladvise_hdr->lah_value1); + __swab32s(&ladvise_hdr->lah_value2); + __swab64s(&ladvise_hdr->lah_value3); +} +EXPORT_SYMBOL(lustre_swab_ladvise_hdr); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c new file mode 100644 index 0000000000000..d0c8fa7a1e6ac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
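+ *
+ * lustre/ptlrpc/pers.c
+ *
+ * Helpers for describing ptlrpc bulk descriptors to LNet memory
+ * descriptors.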
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + + +void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, + int mdidx) +{ + unsigned int start = desc->bd_mds_off[mdidx]; + + CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON); + + LASSERT(mdidx < desc->bd_md_max_brw); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | + LNET_MD_PHYS))); + + /* just send a lnet header */ + if (mdidx >= desc->bd_md_count) { + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) + md->options |= LNET_MD_KIOV; + else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) + md->options |= LNET_MD_IOVEC; + md->length = 0; + md->start = NULL; + return; + } + + if (mdidx == (desc->bd_md_count - 1)) + md->length = desc->bd_iov_count - start; + else + md->length = desc->bd_mds_off[mdidx + 1] - start; + + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) { + md->options |= LNET_MD_KIOV; + if (GET_ENC_KIOV(desc)) + md->start = &BD_GET_ENC_KIOV(desc, start); + else + md->start = &BD_GET_KIOV(desc, start); + } else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) { + md->options |= LNET_MD_IOVEC; + if (GET_ENC_KVEC(desc)) + md->start = &BD_GET_ENC_KVEC(desc, start); + else + md->start = &BD_GET_KVEC(desc, start); + } +} + + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c new file mode 100644 index 0000000000000..d965c0838d8d5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c @@ -0,0 +1,570 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pinger.c + * + * Portal-RPC reconnection and replay operations, for use in recovery. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +static int suppress_pings; +module_param(suppress_pings, int, 0644); +MODULE_PARM_DESC(suppress_pings, "Suppress pings"); + +struct mutex pinger_mutex; +static struct list_head pinger_imports = + LIST_HEAD_INIT(pinger_imports); + +int ptlrpc_pinger_suppress_pings() +{ + return suppress_pings; +} +EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings); + +struct ptlrpc_request * +ptlrpc_prep_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, + LUSTRE_OBD_VERSION, OBD_PING); + if (req) { + ptlrpc_request_set_replen(req); + req->rq_no_resend = req->rq_no_delay = 1; + } + return req; +} + +int ptlrpc_obd_ping(struct obd_device *obd) +{ + int rc; + struct ptlrpc_request *req; + ENTRY; + + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + if (req == NULL) + RETURN(-ENOMEM); + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_obd_ping); + +static bool ptlrpc_check_import_is_idle(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + time64_t now; + + if (!imp->imp_idle_timeout) + return false; + + if (atomic_read(&imp->imp_reqs) > 0) + return false; + + /* any lock increases ns_bref being a resource holder */ + if (ns && atomic_read(&ns->ns_bref) > 0) + return false; + + now = ktime_get_real_seconds(); + if (now - imp->imp_last_reply_time < imp->imp_idle_timeout) + return false; + + return true; +} + +static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + time64_t time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + + if (imp->imp_state == LUSTRE_IMP_DISCON) { + time64_t dtime = max_t(time64_t, CONNECTION_SWITCH_MIN, + AT_OFF ? 0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = ktime_get_seconds() + time; +#endif /* CONFIG_LUSTRE_FS_PINGER */ +} + +static int ptlrpc_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (ptlrpc_check_import_is_idle(imp)) + RETURN(ptlrpc_disconnect_and_idle_import(imp)); + + req = ptlrpc_prep_ping(imp); + if (req == NULL) { + CERROR("OOM trying to ping %s->%s\n", + imp->imp_obd->obd_uuid.uuid, + obd2cli_tgt(imp->imp_obd)); + RETURN(-ENOMEM); + } + + DEBUG_REQ(D_INFO, req, "pinging %s->%s", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* Updating imp_next_ping early, it allows pinger_check_timeout to + * see an actual time for next awake. request_out_callback update + * happens at another thread, and ptlrpc_pinger_main may sleep + * already. 
+ */ + ptlrpc_update_next_ping(imp, 0); + ptlrpcd_add_req(req); + + RETURN(0); +} + +void ptlrpc_ping_import_soon(struct obd_import *imp) +{ + imp->imp_next_ping = ktime_get_seconds(); +} + +static inline int imp_is_deactive(struct obd_import *imp) +{ + return (imp->imp_deactive || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); +} + +static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp) +{ + return ktime_get_seconds() + INITIAL_CONNECT_TIMEOUT; +} + +static s32 pinger_check_timeout(time64_t time) +{ + s32 timeout = PING_INTERVAL; + s32 next_timeout; + time64_t now; + struct list_head *iter; + struct obd_import *imp; + + mutex_lock(&pinger_mutex); + now = ktime_get_seconds(); + /* Process imports to find a nearest next ping */ + list_for_each(iter, &pinger_imports) { + imp = list_entry(iter, struct obd_import, imp_pinger_chain); + if (!imp->imp_pingable || imp->imp_next_ping < now) + continue; + next_timeout = imp->imp_next_ping - now; + /* make sure imp_next_ping in the future from time */ + if (next_timeout > (now - time) && timeout > next_timeout) + timeout = next_timeout; + } + mutex_unlock(&pinger_mutex); + + return timeout - (now - time); +} + +static bool ir_up; + +void ptlrpc_pinger_ir_up(void) +{ + CDEBUG(D_HA, "IR up\n"); + ir_up = true; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_up); + +void ptlrpc_pinger_ir_down(void) +{ + CDEBUG(D_HA, "IR down\n"); + ir_up = false; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_down); + +static void ptlrpc_pinger_process_import(struct obd_import *imp, + time64_t this_ping) +{ + int level; + int force; + int force_next; + int suppress; + + spin_lock(&imp->imp_lock); + + level = imp->imp_state; + force = imp->imp_force_verify; + force_next = imp->imp_force_next_verify; + /* + * This will be used below only if the import is "FULL". + */ + suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS); + + imp->imp_force_verify = 0; + + if (imp->imp_next_ping - 5 >= this_ping && !force) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_force_next_verify = 0; + + CDEBUG(level == LUSTRE_IMP_FULL ? 
D_INFO : D_HA, "%s->%s: level %s/%u " + "force %u force_next %u deactive %u pingable %u suppress %u\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level), level, force, force_next, + imp->imp_deactive, imp->imp_pingable, suppress); + + if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { + /* wait for a while before trying recovery again */ + imp->imp_next_ping = ptlrpc_next_reconnect(imp); + spin_unlock(&imp->imp_lock); + if (!imp->imp_no_pinger_recover) + ptlrpc_initiate_recovery(imp); + } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { + CDEBUG(D_HA, "%s->%s: not pinging (in recovery " + "or recovery disabled: %s)\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level)); + if (force) + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + } else if ((imp->imp_pingable && !suppress) || force_next || force) { + spin_unlock(&imp->imp_lock); + ptlrpc_ping(imp); + } else { + spin_unlock(&imp->imp_lock); + } +} + +static struct workqueue_struct *pinger_wq; +static void ptlrpc_pinger_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main); + +static void ptlrpc_pinger_main(struct work_struct *ws) +{ + time64_t this_ping, time_after_ping; + s32 time_to_next_wake; + struct obd_import *imp; + struct list_head *iter; + + do { + this_ping = ktime_get_seconds(); + + mutex_lock(&pinger_mutex); + + list_for_each(iter, &pinger_imports) { + imp = list_entry(iter, struct obd_import, + imp_pinger_chain); + + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + imp->imp_next_ping > this_ping + PING_INTERVAL) + ptlrpc_update_next_ping(imp, 0); + } + mutex_unlock(&pinger_mutex); + + time_after_ping = ktime_get_seconds(); + /* update memory usage info */ + obd_update_maxusage(); + + if ((ktime_get_seconds() - this_ping - 3) > PING_INTERVAL) + CDEBUG(D_HA, "long time to ping: %lld, %lld, %lld\n", + this_ping, time_after_ping, ktime_get_seconds()); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* The ping sent by ptlrpc_send_rpc may get sent out + * say .01 second after this. + * ptlrpc_pinger_sending_on_import will then set the + * next ping time to next_ping + .01 sec, which means + * we will SKIP the next ping at next_ping, and the + * ping will get sent 2 timeouts from now! Beware. */ + CDEBUG(D_INFO, "next wakeup in %d (%lld)\n", + time_to_next_wake, this_ping + PING_INTERVAL); + } while (time_to_next_wake <= 0); + + queue_delayed_work(pinger_wq, &ping_work, + cfs_time_seconds(max(time_to_next_wake, 1))); +} + +int ptlrpc_start_pinger(void) +{ +#ifdef ENABLE_PINGER + if (pinger_wq) + return -EALREADY; + + pinger_wq = alloc_workqueue("ptlrpc_pinger", 0, 1); + if (!pinger_wq) { + CERROR("cannot start pinger workqueue\n"); + return -ENOMEM; + } + + queue_delayed_work(pinger_wq, &ping_work, 0); + + if (suppress_pings) + CWARN("Pings will be suppressed at the request of the " + "administrator. The configuration shall meet the " + "additional requirements described in the manual. 
" + "(Search for the \"suppress_pings\" kernel module " + "parameter.)\n"); +#endif + return 0; +} + +int ptlrpc_stop_pinger(void) +{ +#ifdef ENABLE_PINGER + if (!pinger_wq) + return -EALREADY; + + cancel_delayed_work_sync(&ping_work); + destroy_workqueue(pinger_wq); + pinger_wq = NULL; +#endif + return 0; +} + +void ptlrpc_pinger_sending_on_import(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 0); +} + +void ptlrpc_pinger_commit_expected(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 1); + assert_spin_locked(&imp->imp_lock); + /* + * Avoid reading stale imp_connect_data. When not sure if pings are + * expected or not on next connection, we assume they are not and force + * one anyway to guarantee the chance of updating + * imp_peer_committed_transno. + */ + if (imp->imp_state != LUSTRE_IMP_FULL || + OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS)) + imp->imp_force_next_verify = 1; +} + +int ptlrpc_pinger_add_import(struct obd_import *imp) +{ + ENTRY; + if (!list_empty(&imp->imp_pinger_chain)) + RETURN(-EALREADY); + + mutex_lock(&pinger_mutex); + CDEBUG(D_HA, "adding pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we add to pinger we want recovery on this import */ + imp->imp_obd->obd_no_recov = 0; + ptlrpc_update_next_ping(imp, 0); + /* XXX sort, blah blah */ + list_add_tail(&imp->imp_pinger_chain, &pinger_imports); + class_import_get(imp); + + ptlrpc_pinger_wake_up(); + mutex_unlock(&pinger_mutex); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_pinger_add_import); + +int ptlrpc_pinger_del_import(struct obd_import *imp) +{ + ENTRY; + + if (list_empty(&imp->imp_pinger_chain)) + RETURN(-ENOENT); + + mutex_lock(&pinger_mutex); + list_del_init(&imp->imp_pinger_chain); + CDEBUG(D_HA, "removing pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we remove from pinger we don't want recovery on this import */ + imp->imp_obd->obd_no_recov = 1; + class_import_put(imp); + mutex_unlock(&pinger_mutex); + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_pinger_del_import); + +void ptlrpc_pinger_wake_up() +{ +#ifdef ENABLE_PINGER + mod_delayed_work(pinger_wq, &ping_work, 0); +#endif +} + +/* Ping evictor thread */ +#define PET_READY 1 +#define PET_TERMINATE 2 + +static int pet_refcount = 0; +static int pet_state; +static wait_queue_head_t pet_waitq; +static struct list_head pet_list; +static DEFINE_SPINLOCK(pet_lock); + +int ping_evictor_wake(struct obd_export *exp) +{ + struct obd_device *obd; + + spin_lock(&pet_lock); + if (pet_state != PET_READY) { + /* eventually the new obd will call here again. */ + spin_unlock(&pet_lock); + return 1; + } + + obd = class_exp2obd(exp); + if (list_empty(&obd->obd_evict_list)) { + class_incref(obd, "evictor", obd); + list_add(&obd->obd_evict_list, &pet_list); + } + spin_unlock(&pet_lock); + + wake_up(&pet_waitq); + return 0; +} + +static int ping_evictor_main(void *arg) +{ + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time64_t expire_time; + + ENTRY; + unshare_fs_struct(); + + CDEBUG(D_HA, "Starting Ping Evictor\n"); + pet_state = PET_READY; + while (1) { + l_wait_event(pet_waitq, (!list_empty(&pet_list)) || + (pet_state == PET_TERMINATE), &lwi); + + /* loop until all obd's will be removed */ + if ((pet_state == PET_TERMINATE) && list_empty(&pet_list)) + break; + + /* we only get here if pet_exp != NULL, and the end of this + * loop is the only place which sets it NULL again, so lock + * is not strictly necessary. 
*/ + spin_lock(&pet_lock); + obd = list_entry(pet_list.next, struct obd_device, + obd_evict_list); + spin_unlock(&pet_lock); + + expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT; + + CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n", + obd->obd_name, expire_time); + + /* Exports can't be deleted out of the list while we hold + * the obd lock (class_unlink_export), which means we can't + * lose the last ref on the export. If they've already been + * removed from the list, we won't find them here. */ + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_exports_timed)) { + exp = list_entry(obd->obd_exports_timed.next, + struct obd_export, + exp_obd_chain_timed); + if (expire_time > exp->exp_last_request_time) { + class_export_get(exp); + spin_unlock(&obd->obd_dev_lock); + LCONSOLE_WARN("%s: haven't heard from client %s" + " (at %s) in %lld seconds. I think" + " it's dead, and I am evicting" + " it. exp %p, cur %lld expire %lld" + " last %lld\n", + obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + ktime_get_real_seconds() - + exp->exp_last_request_time, + exp, ktime_get_real_seconds(), + expire_time, + exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %lld\n", + exp->exp_last_request_time); + class_fail_export(exp); + class_export_put(exp); + spin_lock(&obd->obd_dev_lock); + } else { + /* List is sorted, so everyone below is ok */ + break; + } + } + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&pet_lock); + list_del_init(&obd->obd_evict_list); + spin_unlock(&pet_lock); + + class_decref(obd, "evictor", obd); + } + CDEBUG(D_HA, "Exiting Ping Evictor\n"); + + RETURN(0); +} + +void ping_evictor_start(void) +{ + struct task_struct *task; + + if (++pet_refcount > 1) + return; + + INIT_LIST_HEAD(&pet_list); + init_waitqueue_head(&pet_waitq); + + task = kthread_run(ping_evictor_main, NULL, "ll_evictor"); + if (IS_ERR(task)) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %ld\n", + PTR_ERR(task)); + } +} +EXPORT_SYMBOL(ping_evictor_start); + +void ping_evictor_stop(void) +{ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +} +EXPORT_SYMBOL(ping_evictor_stop); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h new file mode 100644 index 0000000000000..41b9a268d52a6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h @@ -0,0 +1,418 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* Intramodule declarations for ptlrpc. */ + +#ifndef PTLRPC_INTERNAL_H +#define PTLRPC_INTERNAL_H + +#include "../ldlm/ldlm_internal.h" + +struct ldlm_namespace; +struct obd_import; +struct ldlm_res_id; +struct ptlrpc_request_set; +extern int test_req_buffer_pressure; +extern struct list_head ptlrpc_all_services; +extern struct mutex ptlrpc_all_services_mutex; +extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo; + +#ifdef HAVE_SERVER_SUPPORT +extern struct ptlrpc_nrs_pol_conf nrs_conf_crrn; +extern struct ptlrpc_nrs_pol_conf nrs_conf_orr; +extern struct ptlrpc_nrs_pol_conf nrs_conf_trr; +extern struct ptlrpc_nrs_pol_conf nrs_conf_tbf; +extern struct ptlrpc_nrs_pol_conf nrs_conf_delay; +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * \addtogoup nrs + * @{ + */ +extern struct nrs_core nrs_core; + +extern struct mutex ptlrpcd_mutex; +extern struct mutex pinger_mutex; + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); +/* ptlrpcd.c */ +int ptlrpcd_start(struct ptlrpcd_ctl *pc); + +/* client.c */ +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + timeout_t service_timeout); +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + enum ptlrpc_bulk_op_type type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); +int ptlrpc_request_cache_init(void); +void ptlrpc_request_cache_fini(void); +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags); +void ptlrpc_request_cache_free(struct ptlrpc_request *req); +void ptlrpc_init_xid(void); +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req); +int ptlrpc_expired_set(void *data); +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +void ptlrpc_resend_req(struct ptlrpc_request *request); +void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req); +void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req); +__u64 ptlrpc_known_replied_xid(struct obd_import *imp); +void ptlrpc_add_unreplied(struct ptlrpc_request *req); + +/* events.c */ +int ptlrpc_init_portals(void); +void ptlrpc_exit_portals(void); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *); +void lustre_assert_wire_constants(void); +int ptlrpc_import_in_recovery(struct obd_import *imp); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt, + bool invalid); +void ptlrpc_handle_failed_import(struct obd_import *imp); +int ptlrpc_replay_next(struct obd_import *imp, int *inflight); +void ptlrpc_initiate_recovery(struct obd_import *imp); + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); + +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc); +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc); + +void ptlrpc_ldebugfs_register_service(struct dentry *debugfs_entry, + struct ptlrpc_service *svc); +#ifdef CONFIG_PROC_FS +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); +void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, + long q_usec, long work_usec); +#else +#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0) +#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0) +#define ptlrpc_lprocfs_do_request_stat(params...) 
do{}while(0) +#endif /* CONFIG_PROC_FS */ + +/* NRS */ + +/** + * NRS core object. + * + * Holds NRS core fields. + */ +struct nrs_core { + /** + * Protects nrs_core::nrs_policies, serializes external policy + * registration/unregistration, and NRS core lprocfs operations. + */ + struct mutex nrs_mutex; + /** + * List of all policy descriptors registered with NRS core; protected + * by nrs_core::nrs_mutex. + */ + struct list_head nrs_policies; +}; + +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc); +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc); + +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req); +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req); +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); + +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force); + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, + bool force) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force); +} + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false); +} + +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req); +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp); +bool ptlrpc_nrs_req_throttling_nolock(struct ptlrpc_service_part *svcpt, + bool hp); + +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg); + +int ptlrpc_nrs_init(void); +void ptlrpc_nrs_fini(void); + +static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nrs_hp != NULL; +} + +static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc) +{ + /** + * If the first service partition has an HP NRS head, all service + * partitions will. + */ + return nrs_svcpt_has_hp(svc->srv_parts[0]); +} + +static inline +struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp) +{ + LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt))); + return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; +} + +static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_cpt; +} + +static inline +struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_service; +} + +static inline +struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt; +} + +static inline +struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy) +{ + return nrs_pol2svc(policy)->srv_cptable; +} + +static inline struct ptlrpc_nrs_resource * +nrs_request_resource(struct ptlrpc_nrs_request *nrq) +{ + LASSERT(nrq->nr_initialized); + LASSERT(!nrq->nr_finalized); + + return nrq->nr_res_ptrs[nrq->nr_res_idx]; +} + +static inline +struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq) +{ + return nrs_request_resource(nrq)->res_policy; +} + +#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:" +#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:" + +/** + * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum. 
+ */ +#define LPROCFS_NRS_QUANTUM_MAX 65535 + +/** + * Max valid command string is the size of the labels, plus "65535" twice, plus + * a separating space character. + */ +#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \ + sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \ + NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)) + +/* recovd_thread.c */ + +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); + +/* pers.c */ +void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, + int mdcnt); + +/* pack_generic.c */ +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt); +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs); + +/* pinger.c */ +int ptlrpc_start_pinger(void); +int ptlrpc_stop_pinger(void); +void ptlrpc_pinger_sending_on_import(struct obd_import *imp); +void ptlrpc_pinger_commit_expected(struct obd_import *imp); +void ptlrpc_pinger_wake_up(void); +void ptlrpc_ping_import_soon(struct obd_import *imp); +int ping_evictor_wake(struct obd_export *exp); + +/* sec_null.c */ +int sptlrpc_null_init(void); +void sptlrpc_null_fini(void); + +/* sec_plain.c */ +int sptlrpc_plain_init(void); +void sptlrpc_plain_fini(void); + +/* sec_bulk.c */ +int sptlrpc_enc_pool_init(void); +void sptlrpc_enc_pool_fini(void); +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v); + +/* sec_lproc.c */ +int sptlrpc_lproc_init(void); +void sptlrpc_lproc_fini(void); + +/* sec_gc.c */ +int sptlrpc_gc_init(void); +void sptlrpc_gc_fini(void); + +/* sec_config.c */ +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +int sptlrpc_conf_init(void); +void sptlrpc_conf_fini(void); + +/* sec.c */ +int sptlrpc_init(void); +void sptlrpc_fini(void); + +/* layout.c */ +__u32 __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); + +static inline bool ptlrpc_recoverable_error(int rc) +{ + return (rc == -ENOTCONN || rc == -ENODEV); +} + +#ifdef HAVE_SERVER_SUPPORT +int tgt_mod_init(void); +void tgt_mod_exit(void); +int nodemap_mod_init(void); +void nodemap_mod_exit(void); +#else /* HAVE_SERVER_SUPPORT */ +static inline int tgt_mod_init(void) +{ + return 0; +} + +static inline void tgt_mod_exit(void) +{ + return; +} + +static inline int nodemap_mod_init(void) +{ + return 0; +} + +static inline void nodemap_mod_exit(void) +{ + return; +} +#endif /* !HAVE_SERVER_SUPPORT */ + +static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + OBD_FREE_PTR(set); +} + +/** initialise ptlrpc common fields */ +static inline void ptlrpc_req_comm_init(struct ptlrpc_request *req) +{ + spin_lock_init(&req->rq_lock); + spin_lock_init(&req->rq_early_free_lock); + atomic_set(&req->rq_refcount, 1); + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_replay_list); +} + +/** initialise client side ptlrpc request */ +static inline void ptlrpc_cli_req_init(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_req *cr = &req->rq_cli; + + ptlrpc_req_comm_init(req); + + req->rq_receiving_reply = 0; + req->rq_req_unlinked = req->rq_reply_unlinked = 1; + + INIT_LIST_HEAD(&cr->cr_set_chain); + INIT_LIST_HEAD(&cr->cr_ctx_chain); + INIT_LIST_HEAD(&cr->cr_unreplied_list); + init_waitqueue_head(&cr->cr_reply_waitq); + init_waitqueue_head(&cr->cr_set_waitq); +} + +/** initialise server side ptlrpc request */ 
+static inline void ptlrpc_srv_req_init(struct ptlrpc_request *req) +{ + struct ptlrpc_srv_req *sr = &req->rq_srv; + + ptlrpc_req_comm_init(req); + req->rq_srv_req = 1; + INIT_LIST_HEAD(&sr->sr_exp_list); + INIT_LIST_HEAD(&sr->sr_timed_list); + INIT_LIST_HEAD(&sr->sr_hist_list); +} + +static inline bool ptlrpc_req_is_connect(struct ptlrpc_request *req) +{ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == OST_CONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == MGS_CONNECT) + return true; + else + return false; +} + +static inline bool ptlrpc_req_is_disconnect(struct ptlrpc_request *req) +{ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_DISCONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == OST_DISCONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == MGS_DISCONNECT) + return true; + else + return false; +} + +#endif /* PTLRPC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c new file mode 100644 index 0000000000000..b11c07d54ba23 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +extern spinlock_t ptlrpc_last_xid_lock; +#if RS_DEBUG +extern spinlock_t ptlrpc_rs_debug_lock; +#endif + +static __init int ptlrpc_init(void) +{ + int rc; + + ENTRY; + + lustre_assert_wire_constants(); +#if RS_DEBUG + spin_lock_init(&ptlrpc_rs_debug_lock); +#endif + INIT_LIST_HEAD(&ptlrpc_all_services); + mutex_init(&ptlrpc_all_services_mutex); + mutex_init(&pinger_mutex); + mutex_init(&ptlrpcd_mutex); + ptlrpc_init_xid(); + + rc = req_layout_init(); + if (rc) + RETURN(rc); + + rc = tgt_mod_init(); + if (rc) + GOTO(err_layout, rc); + + rc = ptlrpc_hr_init(); + if (rc) + GOTO(err_tgt, rc); + + rc = ptlrpc_request_cache_init(); + if (rc) + GOTO(err_hr, rc); + + rc = ptlrpc_init_portals(); + if (rc) + GOTO(err_cache, rc); + + rc = ptlrpc_connection_init(); + if (rc) + GOTO(err_portals, rc); + + ptlrpc_put_connection_superhack = ptlrpc_connection_put; + + rc = ptlrpc_start_pinger(); + if (rc) + GOTO(err_conn, rc); + + rc = ldlm_init(); + if (rc) + GOTO(err_pinger, rc); + + rc = sptlrpc_init(); + if (rc) + GOTO(err_ldlm, rc); + + rc = ptlrpc_nrs_init(); + if (rc) + GOTO(err_sptlrpc, rc); + + rc = nodemap_mod_init(); + if (rc) + GOTO(err_nrs, rc); + + RETURN(0); +err_nrs: + ptlrpc_nrs_fini(); +err_sptlrpc: + sptlrpc_fini(); +err_ldlm: + ldlm_exit(); +err_pinger: + ptlrpc_stop_pinger(); +err_conn: + ptlrpc_connection_fini(); +err_portals: + ptlrpc_exit_portals(); +err_cache: + ptlrpc_request_cache_fini(); +err_hr: + ptlrpc_hr_fini(); +err_tgt: + tgt_mod_exit(); +err_layout: + req_layout_fini(); + return rc; +} + +static void __exit ptlrpc_exit(void) +{ + nodemap_mod_exit(); + ptlrpc_nrs_fini(); + sptlrpc_fini(); + ldlm_exit(); + ptlrpc_stop_pinger(); + ptlrpc_exit_portals(); + ptlrpc_request_cache_fini(); + ptlrpc_hr_fini(); + ptlrpc_connection_fini(); + tgt_mod_exit(); + req_layout_fini(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Request Processor and Lock Management"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(ptlrpc_init); +module_exit(ptlrpc_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c new file mode 100644 index 0000000000000..b98d082660628 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c @@ -0,0 +1,965 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/ptlrpcd.c + */ + +/** \defgroup ptlrpcd PortalRPC daemon + * + * ptlrpcd is a special thread with its own set where other user might add + * requests when they don't want to wait for their completion. + * PtlRPCD will take care of sending such requests and then processing their + * replies and calling completion callbacks as necessary. + * The callbacks are called directly from ptlrpcd context. + * It is important to never significantly block (esp. on RPCs!) within such + * completion handler or a deadlock might occur where ptlrpcd enters some + * callback that attempts to send another RPC and wait for it to return, + * during which time ptlrpcd is completely blocked, so e.g. if import + * fails, recovery cannot progress because connection requests are also + * sent by ptlrpcd. + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include /* for obd_zombie */ +#include /* for OBD_FAIL_CHECK */ +#include /* cl_env_{get,put}() */ +#include + +#include "ptlrpc_internal.h" + +/* One of these per CPT. */ +struct ptlrpcd { + int pd_size; + int pd_index; + int pd_cpt; + int pd_cursor; + int pd_nthreads; + int pd_groupsize; + struct ptlrpcd_ctl pd_threads[0]; +}; + +/* + * max_ptlrpcds is obsolete, but retained to ensure that the kernel + * module will load on a system where it has been tuned. + * A value other than 0 implies it was tuned, in which case the value + * is used to derive a setting for ptlrpcd_per_cpt_max. + */ +static int max_ptlrpcds; +module_param(max_ptlrpcds, int, 0644); +MODULE_PARM_DESC(max_ptlrpcds, + "Max ptlrpcd thread count to be started (obsolete)."); + +/* + * ptlrpcd_bind_policy is obsolete, but retained to ensure that + * the kernel module will load on a system where it has been tuned. + * A value other than 0 implies it was tuned, in which case the value + * is used to derive a setting for ptlrpcd_partner_group_size. + */ +static int ptlrpcd_bind_policy; +module_param(ptlrpcd_bind_policy, int, 0644); +MODULE_PARM_DESC(ptlrpcd_bind_policy, + "Ptlrpcd threads binding mode (obsolete)."); + +/* + * ptlrpcd_per_cpt_max: The maximum number of ptlrpcd threads to run + * in a CPT. + */ +static int ptlrpcd_per_cpt_max; +module_param(ptlrpcd_per_cpt_max, int, 0644); +MODULE_PARM_DESC(ptlrpcd_per_cpt_max, + "Max ptlrpcd thread count to be started per CPT."); + +/* + * ptlrpcd_partner_group_size: The desired number of threads in each + * ptlrpcd partner thread group. Default is 2, corresponding to the + * old PDB_POLICY_PAIR. A negative value makes all ptlrpcd threads in + * a CPT partners of each other. + */ +static int ptlrpcd_partner_group_size; +module_param(ptlrpcd_partner_group_size, int, 0644); +MODULE_PARM_DESC(ptlrpcd_partner_group_size, + "Number of ptlrpcd threads in a partner group."); + +/* + * ptlrpcd_cpts: A CPT string describing the CPU partitions that + * ptlrpcd threads should run on. Used to make ptlrpcd threads run on + * a subset of all CPTs. + * + * ptlrpcd_cpts=2 + * ptlrpcd_cpts=[2] + * run ptlrpcd threads only on CPT 2. + * + * ptlrpcd_cpts=0-3 + * ptlrpcd_cpts=[0-3] + * run ptlrpcd threads on CPTs 0, 1, 2, and 3. + * + * ptlrpcd_cpts=[0-3,5,7] + * run ptlrpcd threads on CPTS 0, 1, 2, 3, 5, and 7. + */ +static char *ptlrpcd_cpts; +module_param(ptlrpcd_cpts, charp, 0644); +MODULE_PARM_DESC(ptlrpcd_cpts, + "CPU partitions ptlrpcd threads should run in"); + +/* ptlrpcds_cpt_idx maps cpt numbers to an index in the ptlrpcds array. 
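+ * For example, with 4 CPTs and ptlrpcd_cpts="[0,2]", ptlrpcd_init()
+ * maps CPT 0 to index 0 and CPT 2 to index 1, while the unselected
+ * CPTs 1 and 3 fall back to index (cpt % 2), i.e. the set serving
+ * CPT 2.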
*/ +static int *ptlrpcds_cpt_idx; + +/* ptlrpcds_num is the number of entries in the ptlrpcds array. */ +static int ptlrpcds_num; +static struct ptlrpcd **ptlrpcds; + +/* + * In addition to the regular thread pool above, there is a single + * global recovery thread. Recovery isn't critical for performance, + * and doesn't block, but must always be able to proceed, and it is + * possible that all normal ptlrpcd threads are blocked. Hence the + * need for a dedicated thread. + */ +static struct ptlrpcd_ctl ptlrpcd_rcv; + +struct mutex ptlrpcd_mutex; +static int ptlrpcd_users = 0; + +void ptlrpcd_wake(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = req->rq_set; + + LASSERT(set != NULL); + wake_up(&set->set_waitq); +} +EXPORT_SYMBOL(ptlrpcd_wake); + +static struct ptlrpcd_ctl * +ptlrpcd_select_pc(struct ptlrpc_request *req) +{ + struct ptlrpcd *pd; + int cpt; + int idx; + + if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL) + return &ptlrpcd_rcv; + + cpt = cfs_cpt_current(cfs_cpt_table, 1); + if (ptlrpcds_cpt_idx == NULL) + idx = cpt; + else + idx = ptlrpcds_cpt_idx[cpt]; + pd = ptlrpcds[idx]; + + /* We do not care whether it is strict load balance. */ + idx = pd->pd_cursor; + if (++idx == pd->pd_nthreads) + idx = 0; + pd->pd_cursor = idx; + + return &pd->pd_threads[idx]; +} + +/** + * Move all request from an existing request set to the ptlrpcd queue. + * All requests from the set must be in phase RQ_PHASE_NEW. + */ +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + struct ptlrpcd_ctl *pc; + struct ptlrpc_request_set *new; + int count, i; + + pc = ptlrpcd_select_pc(NULL); + new = pc->pc_set; + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + req->rq_set = new; + req->rq_queued_time = ktime_get_seconds(); + } + + spin_lock(&new->set_new_req_lock); + list_splice_init(&set->set_requests, &new->set_new_requests); + i = atomic_read(&set->set_remaining); + count = atomic_add_return(i, &new->set_new_count); + atomic_set(&set->set_remaining, 0); + spin_unlock(&new->set_new_req_lock); + if (count == i) { + wake_up(&new->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} + +/** + * Return transferred RPCs count. + */ +static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des, + struct ptlrpc_request_set *src) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + int rc = 0; + + spin_lock(&src->set_new_req_lock); + if (likely(!list_empty(&src->set_new_requests))) { + list_for_each_safe(pos, tmp, &src->set_new_requests) { + req = list_entry(pos, struct ptlrpc_request, + rq_set_chain); + req->rq_set = des; + } + list_splice_init(&src->set_new_requests, + &des->set_requests); + rc = atomic_read(&src->set_new_count); + atomic_add(rc, &des->set_remaining); + atomic_set(&src->set_new_count, 0); + } + spin_unlock(&src->set_new_req_lock); + return rc; +} + +/** + * Requests that are added to the ptlrpcd queue are sent via + * ptlrpcd_check->ptlrpc_check_set(). 
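+ *
+ * Three cases are handled here: a request whose previous set has been
+ * invalidated waits until its rq_set pointer is cleared, a replay
+ * request that is already on a set is simply re-counted and the set
+ * woken up, and any other request is queued on the ptlrpcd thread
+ * chosen by ptlrpcd_select_pc().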
+ */ +void ptlrpcd_add_req(struct ptlrpc_request *req) +{ + struct ptlrpcd_ctl *pc; + + if (req->rq_reqmsg) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + spin_lock(&req->rq_lock); + if (req->rq_invalid_rqset) { + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5), + back_to_sleep, NULL); + + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi); + } else if (req->rq_set) { + /* If we have a vaid "rq_set", just reuse it to avoid double + * linked. */ + LASSERT(req->rq_phase == RQ_PHASE_NEW); + LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY); + + /* ptlrpc_check_set will decrease the count */ + atomic_inc(&req->rq_set->set_remaining); + spin_unlock(&req->rq_lock); + wake_up(&req->rq_set->set_waitq); + return; + } else { + spin_unlock(&req->rq_lock); + } + + pc = ptlrpcd_select_pc(req); + + DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]", + req, pc->pc_name, pc->pc_index); + + ptlrpc_set_add_new_req(pc, req); +} +EXPORT_SYMBOL(ptlrpcd_add_req); + +static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set) +{ + atomic_inc(&set->set_refcount); +} + +/** + * Check if there is more work to do on ptlrpcd set. + * Returns 1 if yes. + */ +static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + struct ptlrpc_request_set *set = pc->pc_set; + int rc = 0; + int rc2; + ENTRY; + + if (atomic_read(&set->set_new_count)) { + spin_lock(&set->set_new_req_lock); + if (likely(!list_empty(&set->set_new_requests))) { + list_splice_init(&set->set_new_requests, + &set->set_requests); + atomic_add(atomic_read(&set->set_new_count), + &set->set_remaining); + atomic_set(&set->set_new_count, 0); + /* + * Need to calculate its timeout. + */ + rc = 1; + } + spin_unlock(&set->set_new_req_lock); + } + + /* We should call lu_env_refill() before handling new requests to make + * sure that env key the requests depending on really exists. + */ + rc2 = lu_env_refill(env); + if (rc2 != 0) { + /* + * XXX This is very awkward situation, because + * execution can neither continue (request + * interpreters assume that env is set up), nor repeat + * the loop (as this potentially results in a tight + * loop of -ENOMEM's). + * + * Fortunately, refill only ever does something when + * new modules are loaded, i.e., early during boot up. + */ + CERROR("Failure to refill session: %d\n", rc2); + RETURN(rc); + } + + if (atomic_read(&set->set_remaining)) + rc |= ptlrpc_check_set(env, set); + + /* NB: ptlrpc_check_set has already moved complted request at the + * head of seq::set_requests */ + list_for_each_safe(pos, tmp, &set->set_requests) { + req = list_entry(pos, struct ptlrpc_request, rq_set_chain); + if (req->rq_phase != RQ_PHASE_COMPLETE) + break; + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + ptlrpc_req_finished(req); + } + + if (rc == 0) { + /* + * If new requests have been added, make sure to wake up. + */ + rc = atomic_read(&set->set_new_count); + + /* If we have nothing to do, check whether we can take some + * work from our partner threads. 
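+ * The scan starts at pc_cursor and walks the partner array
+ * round-robin, stopping as soon as a steal succeeds or we are back
+ * at the partner we started from.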
*/ + if (rc == 0 && pc->pc_npartners > 0) { + struct ptlrpcd_ctl *partner; + struct ptlrpc_request_set *ps; + int first = pc->pc_cursor; + + do { + partner = pc->pc_partners[pc->pc_cursor++]; + if (pc->pc_cursor >= pc->pc_npartners) + pc->pc_cursor = 0; + if (partner == NULL) + continue; + + spin_lock(&partner->pc_lock); + ps = partner->pc_set; + if (ps == NULL) { + spin_unlock(&partner->pc_lock); + continue; + } + + ptlrpc_reqset_get(ps); + spin_unlock(&partner->pc_lock); + + if (atomic_read(&ps->set_new_count)) { + rc = ptlrpcd_steal_rqset(set, ps); + if (rc > 0) + CDEBUG(D_RPCTRACE, "transfer %d" + " async RPCs [%d->%d]\n", + rc, partner->pc_index, + pc->pc_index); + } + ptlrpc_reqset_put(ps); + } while (rc == 0 && pc->pc_cursor != first); + } + } + + RETURN(rc || test_bit(LIOD_STOP, &pc->pc_flags)); +} + +/** + * Main ptlrpcd thread. + * ptlrpc's code paths like to execute in process context, so we have this + * thread which spins on a set which contains the rpcs and sends them. + * + */ +static int ptlrpcd(void *arg) +{ + struct ptlrpcd_ctl *pc = arg; + struct ptlrpc_request_set *set; + struct lu_context ses = { 0 }; + struct lu_env env = { .le_ses = &ses }; + int rc = 0; + int exit = 0; + ENTRY; + + unshare_fs_struct(); + + if (cfs_cpt_bind(cfs_cpt_table, pc->pc_cpt) != 0) + CWARN("Failed to bind %s on CPT %d\n", pc->pc_name, pc->pc_cpt); + + /* + * Allocate the request set after the thread has been bound + * above. This is safe because no requests will be queued + * until all ptlrpcd threads have confirmed that they have + * successfully started. + */ + set = ptlrpc_prep_set(); + if (set == NULL) + GOTO(failed, rc = -ENOMEM); + spin_lock(&pc->pc_lock); + pc->pc_set = set; + spin_unlock(&pc->pc_lock); + + /* Both client and server (MDT/OST) may use the environment. */ + rc = lu_context_init(&env.le_ctx, LCT_MD_THREAD | + LCT_DT_THREAD | + LCT_CL_THREAD | + LCT_REMEMBER | + LCT_NOREF); + if (rc != 0) + GOTO(failed, rc); + rc = lu_context_init(env.le_ses, LCT_SESSION | + LCT_REMEMBER | + LCT_NOREF); + if (rc != 0) { + lu_context_fini(&env.le_ctx); + GOTO(failed, rc); + } + + complete(&pc->pc_starting); + + /* + * This mainloop strongly resembles ptlrpc_set_wait() except that our + * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when + * there are requests in the set. New requests come in on the set's + * new_req_list and ptlrpcd_check() moves them into the set. + */ + do { + struct l_wait_info lwi; + time64_t timeout; + + timeout = ptlrpc_set_next_timeout(set); + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout), + ptlrpc_expired_set, set); + + lu_context_enter(&env.le_ctx); + lu_context_enter(env.le_ses); + l_wait_event(set->set_waitq, ptlrpcd_check(&env, pc), &lwi); + lu_context_exit(&env.le_ctx); + lu_context_exit(env.le_ses); + + /* + * Abort inflight rpcs for forced stop case. + */ + if (test_bit(LIOD_STOP, &pc->pc_flags)) { + if (test_bit(LIOD_FORCE, &pc->pc_flags)) + ptlrpc_abort_set(set); + exit++; + } + + /* + * Let's make one more loop to make sure that ptlrpcd_check() + * copied all raced new rpcs into the set so we can kill them. + */ + } while (exit < 2); + + /* + * Wait for inflight requests to drain. 
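+ * ptlrpc_set_wait() is only called if the set still holds requests;
+ * after that both lu contexts are finalized and pc_finishing is
+ * completed, which lets ptlrpcd_free() proceed.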
+ */ + if (!list_empty(&set->set_requests)) + ptlrpc_set_wait(&env, set); + lu_context_fini(&env.le_ctx); + lu_context_fini(env.le_ses); + + complete(&pc->pc_finishing); + + return 0; + +failed: + pc->pc_error = rc; + complete(&pc->pc_starting); + RETURN(rc); +} + +static void ptlrpcd_ctl_init(struct ptlrpcd_ctl *pc, int index, int cpt) +{ + ENTRY; + + pc->pc_index = index; + pc->pc_cpt = cpt; + init_completion(&pc->pc_starting); + init_completion(&pc->pc_finishing); + spin_lock_init(&pc->pc_lock); + + if (index < 0) { + /* Recovery thread. */ + snprintf(pc->pc_name, sizeof(pc->pc_name), "ptlrpcd_rcv"); + } else { + /* Regular thread. */ + snprintf(pc->pc_name, sizeof(pc->pc_name), + "ptlrpcd_%02d_%02d", cpt, index); + } + + EXIT; +} + +/* XXX: We want multiple CPU cores to share the async RPC load. So we + * start many ptlrpcd threads. We also want to reduce the ptlrpcd + * overhead caused by data transfer cross-CPU cores. So we bind + * all ptlrpcd threads to a CPT, in the expectation that CPTs + * will be defined in a way that matches these boundaries. Within + * a CPT a ptlrpcd thread can be scheduled on any available core. + * + * Each ptlrpcd thread has its own request queue. This can cause + * response delay if the thread is already busy. To help with + * this we define partner threads: these are other threads bound + * to the same CPT which will check for work in each other's + * request queues if they have no work to do. + * + * The desired number of partner threads can be tuned by setting + * ptlrpcd_partner_group_size. The default is to create pairs of + * partner threads. + */ +static int ptlrpcd_partners(struct ptlrpcd *pd, int index) +{ + struct ptlrpcd_ctl *pc; + struct ptlrpcd_ctl **ppc; + int first; + int i; + int rc = 0; + ENTRY; + + LASSERT(index >= 0 && index < pd->pd_nthreads); + pc = &pd->pd_threads[index]; + pc->pc_npartners = pd->pd_groupsize - 1; + + if (pc->pc_npartners <= 0) + GOTO(out, rc); + + OBD_CPT_ALLOC(pc->pc_partners, cfs_cpt_table, pc->pc_cpt, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + if (pc->pc_partners == NULL) { + pc->pc_npartners = 0; + GOTO(out, rc = -ENOMEM); + } + + first = index - index % pd->pd_groupsize; + ppc = pc->pc_partners; + for (i = first; i < first + pd->pd_groupsize; i++) { + if (i != index) + *ppc++ = &pd->pd_threads[i]; + } +out: + RETURN(rc); +} + +int ptlrpcd_start(struct ptlrpcd_ctl *pc) +{ + struct task_struct *task; + int rc = 0; + ENTRY; + + /* + * Do not allow starting a second thread for one pc. 
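+ * LIOD_START also acts as the "thread exists" marker checked by
+ * ptlrpcd_stop() and ptlrpcd_free() before they touch pc_set, so it
+ * is only cleared on the error path below or once the thread has
+ * been freed.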
+ */ + if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Starting second thread (%s) for same pc %p\n", + pc->pc_name, pc); + RETURN(0); + } + + task = kthread_run(ptlrpcd, pc, pc->pc_name); + if (IS_ERR(task)) + GOTO(out_set, rc = PTR_ERR(task)); + + wait_for_completion(&pc->pc_starting); + rc = pc->pc_error; + if (rc != 0) + GOTO(out_set, rc); + + RETURN(0); + +out_set: + if (pc->pc_set != NULL) { + struct ptlrpc_request_set *set = pc->pc_set; + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + } + clear_bit(LIOD_START, &pc->pc_flags); + RETURN(rc); +} + +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) +{ + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + set_bit(LIOD_STOP, &pc->pc_flags); + if (force) + set_bit(LIOD_FORCE, &pc->pc_flags); + wake_up(&pc->pc_set->set_waitq); + +out: + EXIT; +} + +void ptlrpcd_free(struct ptlrpcd_ctl *pc) +{ + struct ptlrpc_request_set *set = pc->pc_set; + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + wait_for_completion(&pc->pc_finishing); + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + + clear_bit(LIOD_START, &pc->pc_flags); + clear_bit(LIOD_STOP, &pc->pc_flags); + clear_bit(LIOD_FORCE, &pc->pc_flags); + +out: + if (pc->pc_npartners > 0) { + LASSERT(pc->pc_partners != NULL); + + OBD_FREE(pc->pc_partners, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + pc->pc_partners = NULL; + } + pc->pc_npartners = 0; + pc->pc_error = 0; + EXIT; +} + +static void ptlrpcd_fini(void) +{ + int i; + int j; + int ncpts; + ENTRY; + + if (ptlrpcds != NULL) { + for (i = 0; i < ptlrpcds_num; i++) { + if (ptlrpcds[i] == NULL) + break; + for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) + ptlrpcd_stop(&ptlrpcds[i]->pd_threads[j], 0); + for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) + ptlrpcd_free(&ptlrpcds[i]->pd_threads[j]); + OBD_FREE(ptlrpcds[i], ptlrpcds[i]->pd_size); + ptlrpcds[i] = NULL; + } + OBD_FREE(ptlrpcds, sizeof(ptlrpcds[0]) * ptlrpcds_num); + } + ptlrpcds_num = 0; + + ptlrpcd_stop(&ptlrpcd_rcv, 0); + ptlrpcd_free(&ptlrpcd_rcv); + + if (ptlrpcds_cpt_idx != NULL) { + ncpts = cfs_cpt_number(cfs_cpt_table); + OBD_FREE(ptlrpcds_cpt_idx, ncpts * sizeof(ptlrpcds_cpt_idx[0])); + ptlrpcds_cpt_idx = NULL; + } + + EXIT; +} + +static int ptlrpcd_init(void) +{ + int nthreads; + int groupsize; + int size; + int i; + int j; + int rc = 0; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + struct ptlrpcd *pd; + ENTRY; + + /* + * Determine the CPTs that ptlrpcd threads will run on. + */ + cptable = cfs_cpt_table; + ncpts = cfs_cpt_number(cptable); + if (ptlrpcd_cpts != NULL) { + struct cfs_expr_list *el; + + size = ncpts * sizeof(ptlrpcds_cpt_idx[0]); + OBD_ALLOC(ptlrpcds_cpt_idx, size); + if (ptlrpcds_cpt_idx == NULL) + GOTO(out, rc = -ENOMEM); + + rc = cfs_expr_list_parse(ptlrpcd_cpts, + strlen(ptlrpcd_cpts), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + "ptlrpcd_cpts", ptlrpcd_cpts); + GOTO(out, rc = -EINVAL); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + "ptlrpcd_cpts", ptlrpcd_cpts, rc); + if (rc == 0) + rc = -EINVAL; + GOTO(out, rc); + } + + /* + * Create the cpt-to-index map. 
When there is no match + * in the cpt table, pick a cpt at random. This could + * be changed to take the topology of the system into + * account. + */ + for (cpt = 0; cpt < ncpts; cpt++) { + for (i = 0; i < rc; i++) + if (cpts[i] == cpt) + break; + if (i >= rc) + i = cpt % rc; + ptlrpcds_cpt_idx[cpt] = i; + } + + cfs_expr_list_values_free(cpts, rc); + ncpts = rc; + } + ptlrpcds_num = ncpts; + + size = ncpts * sizeof(ptlrpcds[0]); + OBD_ALLOC(ptlrpcds, size); + if (ptlrpcds == NULL) + GOTO(out, rc = -ENOMEM); + + /* + * The max_ptlrpcds parameter is obsolete, but do something + * sane if it has been tuned, and complain if + * ptlrpcd_per_cpt_max has also been tuned. + */ + if (max_ptlrpcds != 0) { + CWARN("max_ptlrpcds is obsolete.\n"); + if (ptlrpcd_per_cpt_max == 0) { + ptlrpcd_per_cpt_max = max_ptlrpcds / ncpts; + /* Round up if there is a remainder. */ + if (max_ptlrpcds % ncpts != 0) + ptlrpcd_per_cpt_max++; + CWARN("Setting ptlrpcd_per_cpt_max = %d\n", + ptlrpcd_per_cpt_max); + } else { + CWARN("ptlrpd_per_cpt_max is also set!\n"); + } + } + + /* + * The ptlrpcd_bind_policy parameter is obsolete, but do + * something sane if it has been tuned, and complain if + * ptlrpcd_partner_group_size is also tuned. + */ + if (ptlrpcd_bind_policy != 0) { + CWARN("ptlrpcd_bind_policy is obsolete.\n"); + if (ptlrpcd_partner_group_size == 0) { + switch (ptlrpcd_bind_policy) { + case 1: /* PDB_POLICY_NONE */ + case 2: /* PDB_POLICY_FULL */ + ptlrpcd_partner_group_size = 1; + break; + case 3: /* PDB_POLICY_PAIR */ + ptlrpcd_partner_group_size = 2; + break; + case 4: /* PDB_POLICY_NEIGHBOR */ +#ifdef CONFIG_NUMA + ptlrpcd_partner_group_size = -1; /* CPT */ +#else + ptlrpcd_partner_group_size = 3; /* Triplets */ +#endif + break; + default: /* Illegal value, use the default. */ + ptlrpcd_partner_group_size = 2; + break; + } + CWARN("Setting ptlrpcd_partner_group_size = %d\n", + ptlrpcd_partner_group_size); + } else { + CWARN("ptlrpcd_partner_group_size is also set!\n"); + } + } + + if (ptlrpcd_partner_group_size == 0) + ptlrpcd_partner_group_size = 2; + else if (ptlrpcd_partner_group_size < 0) + ptlrpcd_partner_group_size = -1; + else if (ptlrpcd_per_cpt_max > 0 && + ptlrpcd_partner_group_size > ptlrpcd_per_cpt_max) + ptlrpcd_partner_group_size = ptlrpcd_per_cpt_max; + + /* + * Start the recovery thread first. + */ + set_bit(LIOD_RECOVERY, &ptlrpcd_rcv.pc_flags); + ptlrpcd_ctl_init(&ptlrpcd_rcv, -1, CFS_CPT_ANY); + rc = ptlrpcd_start(&ptlrpcd_rcv); + if (rc < 0) + GOTO(out, rc); + + for (i = 0; i < ncpts; i++) { + if (cpts == NULL) + cpt = i; + else + cpt = cpts[i]; + + nthreads = cfs_cpt_weight(cptable, cpt); + if (ptlrpcd_per_cpt_max > 0 && ptlrpcd_per_cpt_max < nthreads) + nthreads = ptlrpcd_per_cpt_max; + if (nthreads < 2) + nthreads = 2; + + if (ptlrpcd_partner_group_size <= 0) { + groupsize = nthreads; + } else if (nthreads <= ptlrpcd_partner_group_size) { + groupsize = nthreads; + } else { + groupsize = ptlrpcd_partner_group_size; + if (nthreads % groupsize != 0) + nthreads += groupsize - (nthreads % groupsize); + } + + size = offsetof(struct ptlrpcd, pd_threads[nthreads]); + OBD_CPT_ALLOC(pd, cptable, cpt, size); + if (!pd) + GOTO(out, rc = -ENOMEM); + pd->pd_size = size; + pd->pd_index = i; + pd->pd_cpt = cpt; + pd->pd_cursor = 0; + pd->pd_nthreads = nthreads; + pd->pd_groupsize = groupsize; + ptlrpcds[i] = pd; + + /* + * The ptlrpcd threads in a partner group can access + * each other's struct ptlrpcd_ctl, so these must be + * initialized before any thead is started. 
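+ * Note that nthreads has already been rounded up to a multiple of
+ * the group size, so every thread ends up in a complete partner
+ * group (e.g. a CPT with 6 CPUs and ptlrpcd_partner_group_size=4
+ * gets 8 threads in two groups of 4).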
+ */ + for (j = 0; j < nthreads; j++) { + ptlrpcd_ctl_init(&pd->pd_threads[j], j, cpt); + rc = ptlrpcd_partners(pd, j); + if (rc < 0) + GOTO(out, rc); + } + + /* XXX: We start nthreads ptlrpc daemons on this cpt. + * Each of them can process any non-recovery + * async RPC to improve overall async RPC + * efficiency. + * + * But there are some issues with async I/O RPCs + * and async non-I/O RPCs processed in the same + * set under some cases. The ptlrpcd may be + * blocked by some async I/O RPC(s), then will + * cause other async non-I/O RPC(s) can not be + * processed in time. + * + * Maybe we should distinguish blocked async RPCs + * from non-blocked async RPCs, and process them + * in different ptlrpcd sets to avoid unnecessary + * dependency. But how to distribute async RPCs + * load among all the ptlrpc daemons becomes + * another trouble. + */ + for (j = 0; j < nthreads; j++) { + rc = ptlrpcd_start(&pd->pd_threads[j]); + if (rc < 0) + GOTO(out, rc); + } + } +out: + if (rc != 0) + ptlrpcd_fini(); + + RETURN(rc); +} + +int ptlrpcd_addref(void) +{ + int rc = 0; + ENTRY; + + mutex_lock(&ptlrpcd_mutex); + if (++ptlrpcd_users == 1) { + rc = ptlrpcd_init(); + if (rc < 0) + ptlrpcd_users--; + } + mutex_unlock(&ptlrpcd_mutex); + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpcd_addref); + +void ptlrpcd_decref(void) +{ + mutex_lock(&ptlrpcd_mutex); + if (--ptlrpcd_users == 0) + ptlrpcd_fini(); + mutex_unlock(&ptlrpcd_mutex); +} +EXPORT_SYMBOL(ptlrpcd_decref); +/** @} ptlrpcd */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c new file mode 100644 index 0000000000000..c923ab9386901 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c @@ -0,0 +1,379 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/recover.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/** + * Start recovery on disconnected import. + * This is done by just attempting a connect + */ +void ptlrpc_initiate_recovery(struct obd_import *imp) +{ + ENTRY; + + CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); + ptlrpc_connect_import(imp); + + EXIT; +} + +/** + * Identify what request from replay list needs to be replayed next + * (based on what we have already replayed) and send it to server. 
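+ *
+ * Committed open requests on imp_committed_list are walked first,
+ * via imp_replay_cursor, before the remaining requests on
+ * imp_replay_list are considered. At most one request is put in
+ * flight per call, in which case *inflight is set to 1.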
+ */ +int ptlrpc_replay_next(struct obd_import *imp, int *inflight) +{ + int rc = 0; + struct list_head *tmp, *pos; + struct ptlrpc_request *req = NULL; + __u64 last_transno; + ENTRY; + + *inflight = 0; + + /* It might have committed some after we last spoke, so make sure we + * get rid of them now. + */ + spin_lock(&imp->imp_lock); + imp->imp_last_transno_checked = 0; + ptlrpc_free_committed(imp); + last_transno = imp->imp_last_replay_transno; + + CDEBUG(D_HA, "import %p from %s committed %llu last %llu\n", + imp, obd2cli_tgt(imp->imp_obd), + imp->imp_peer_committed_transno, last_transno); + + /* Replay all the committed open requests on committed_list first */ + if (!list_empty(&imp->imp_committed_list)) { + tmp = imp->imp_committed_list.prev; + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* The last request on committed_list hasn't been replayed */ + if (req->rq_transno > last_transno) { + if (!imp->imp_resend_replay || + imp->imp_replay_cursor == &imp->imp_committed_list) + imp->imp_replay_cursor = + imp->imp_replay_cursor->next; + + while (imp->imp_replay_cursor != + &imp->imp_committed_list) { + req = list_entry(imp->imp_replay_cursor, + struct ptlrpc_request, + rq_replay_list); + if (req->rq_transno > last_transno) + break; + + req = NULL; + LASSERT(!list_empty(imp->imp_replay_cursor)); + imp->imp_replay_cursor = + imp->imp_replay_cursor->next; + } + } else { + /* All requests on committed_list have been replayed */ + imp->imp_replay_cursor = &imp->imp_committed_list; + req = NULL; + } + } + + /* All the requests in committed list have been replayed, let's replay + * the imp_replay_list */ + if (req == NULL) { + list_for_each_safe(tmp, pos, &imp->imp_replay_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + if (req->rq_transno > last_transno) + break; + req = NULL; + } + } + + /* If need to resend the last sent transno (because a reconnect + * has occurred), then stop on the matching req and send it again. + * If, however, the last sent transno has been committed then we + * continue replay from the next request. */ + if (req != NULL && imp->imp_resend_replay) + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + + /* ptlrpc_prepare_replay() may fail to add the reqeust into unreplied + * list if the request hasn't been added to replay list then. Another + * exception is that resend replay could have been removed from the + * unreplied list. */ + if (req != NULL && list_empty(&req->rq_unreplied_list)) { + DEBUG_REQ(D_HA, req, "resend_replay: %d, last_transno: %llu\n", + imp->imp_resend_replay, last_transno); + ptlrpc_add_unreplied(req); + imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp); + } + + imp->imp_resend_replay = 0; + spin_unlock(&imp->imp_lock); + + if (req != NULL) { + LASSERT(!list_empty(&req->rq_unreplied_list)); + + rc = ptlrpc_replay_req(req); + if (rc) { + CERROR("recovery replay error %d for req " + "%llu\n", rc, req->rq_xid); + RETURN(rc); + } + *inflight = 1; + } + RETURN(rc); +} + +/** + * Schedule resending of request on sending_list. This is done after + * we completed replaying of requests and locks. + */ +int ptlrpc_resend(struct obd_import *imp) +{ + struct ptlrpc_request *req, *next; + + ENTRY; + + /* As long as we're in recovery, nothing should be added to the sending + * list, so we don't need to hold the lock during this iteration and + * resend process. + */ + /* Well... what if lctl recover is called twice at the same time? 
+ */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_RECOVER) { + spin_unlock(&imp->imp_lock); + RETURN(-1); + } + + list_for_each_entry_safe(req, next, &imp->imp_sending_list, rq_list) { + LASSERTF((long)req > PAGE_SIZE && req != LP_POISON, + "req %p bad\n", req); + LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req); + + /* If the request is allowed to be sent during replay and it + * is not timeout yet, then it does not need to be resent. */ + if (!ptlrpc_no_resend(req) && + (req->rq_timedout || !req->rq_allow_replay)) + ptlrpc_resend_req(req); + } + spin_unlock(&imp->imp_lock); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT, 2); + RETURN(0); +} + +/** + * Go through all requests in delayed list and wake their threads + * for resending + */ +void ptlrpc_wake_delayed(struct obd_import *imp) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + ptlrpc_client_wake_req(req); + } + spin_unlock(&imp->imp_lock); +} + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) +{ + struct obd_import *imp = failed_req->rq_import; + int conn = lustre_msg_get_conn_cnt(failed_req->rq_reqmsg); + ENTRY; + + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, conn, true)) { + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } + + /* Wait for recovery to complete and resend. If evicted, then + this request will be errored out later.*/ + spin_lock(&failed_req->rq_lock); + if (!failed_req->rq_no_resend) + failed_req->rq_resend = 1; + spin_unlock(&failed_req->rq_lock); + + EXIT; +} + +/** + * Administratively active/deactive a client. + * This should only be called by the ioctl interface, currently + * - the lctl deactivate and activate commands + * - echo 0/1 >> /proc/osc/XXX/active + * - client umount -f (ll_umount_begin) + */ +int ptlrpc_set_import_active(struct obd_import *imp, int active) +{ + struct obd_device *obd = imp->imp_obd; + int rc = 0; + + ENTRY; + LASSERT(obd); + + /* When deactivating, mark import invalid, and abort in-flight + * requests. 
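+ * The deactivate path below raises IMP_EVENT_DEACTIVATE before
+ * invalidating the import; the activate path raises
+ * IMP_EVENT_ACTIVATE and then kicks off recovery through
+ * ptlrpc_recover_import().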
*/ + if (!active) { + LCONSOLE_WARN("setting import %s INACTIVE by administrator " + "request\n", obd2cli_tgt(imp->imp_obd)); + + /* set before invalidate to avoid messages about imp_inval + * set without imp_deactive in ptlrpc_import_delay_req */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE); + + ptlrpc_invalidate_import(imp); + } + + /* When activating, mark import valid, and attempt recovery */ + if (active) { + CDEBUG(D_HA, "setting import %s VALID\n", + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE); + + rc = ptlrpc_recover_import(imp, NULL, 0); + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_import_active); + +/* Attempt to reconnect an import */ +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) +{ + int rc = 0; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive || + atomic_read(&imp->imp_inval_count)) + rc = -EINVAL; + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0, false); + + if (new_uuid) { + struct obd_uuid uuid; + + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } + + /* Check if reconnect is already in progress */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; + rc = -EALREADY; + } + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + OBD_RACE(OBD_FAIL_PTLRPC_CONNECT_RACE); + + rc = ptlrpc_connect_import(imp); + if (rc) + GOTO(out, rc); + + if (!async) { + struct l_wait_info lwi; + long secs = cfs_time_seconds(obd_timeout); + + CDEBUG(D_HA, "%s: recovery started, waiting %lu seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished\n", + obd2cli_tgt(imp->imp_obd)); + } + EXIT; + +out: + return rc; +} +EXPORT_SYMBOL(ptlrpc_recover_import); + +int ptlrpc_import_in_recovery(struct obd_import *imp) +{ + int in_recovery = 1; + + spin_lock(&imp->imp_lock); + if (imp->imp_state <= LUSTRE_IMP_DISCON || + imp->imp_state >= LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) + in_recovery = 0; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c new file mode 100644 index 0000000000000..78c07fcefec3a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c @@ -0,0 +1,2735 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include + +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static int send_sepol; +module_param(send_sepol, int, 0644); +MODULE_PARM_DESC(send_sepol, "Client sends SELinux policy status"); + +/*********************************************** + * policy registers * + ***********************************************/ + +static rwlock_t policy_lock; +static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = { + NULL, +}; + +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(policy->sp_name); + LASSERT(policy->sp_cops); + LASSERT(policy->sp_sops); + + if (number >= SPTLRPC_POLICY_MAX) + return -EINVAL; + + write_lock(&policy_lock); + if (unlikely(policies[number])) { + write_unlock(&policy_lock); + return -EALREADY; + } + policies[number] = policy; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: registered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_register_policy); + +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(number < SPTLRPC_POLICY_MAX); + + write_lock(&policy_lock); + if (unlikely(policies[number] == NULL)) { + write_unlock(&policy_lock); + CERROR("%s: already unregistered\n", policy->sp_name); + return -EINVAL; + } + + LASSERT(policies[number] == policy); + policies[number] = NULL; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_unregister_policy); + +static +struct ptlrpc_sec_policy * sptlrpc_wireflavor2policy(__u32 flavor) +{ + static DEFINE_MUTEX(load_mutex); + static atomic_t loaded = ATOMIC_INIT(0); + struct ptlrpc_sec_policy *policy; + __u16 number = SPTLRPC_FLVR_POLICY(flavor); + __u16 flag = 0; + + if (number >= SPTLRPC_POLICY_MAX) + return NULL; + + while (1) { + read_lock(&policy_lock); + policy = policies[number]; + if (policy && !try_module_get(policy->sp_owner)) + policy = NULL; + if (policy == NULL) + flag = atomic_read(&loaded); + read_unlock(&policy_lock); + + if (policy != NULL || flag != 0 || + number != SPTLRPC_POLICY_GSS) + break; + + /* try to load gss module, once */ + mutex_lock(&load_mutex); + if (atomic_read(&loaded) == 0) { + if (request_module("ptlrpc_gss") == 0) + CDEBUG(D_SEC, + "module ptlrpc_gss loaded on demand\n"); + else + CERROR("Unable to load module ptlrpc_gss\n"); + + atomic_set(&loaded, 1); + } + mutex_unlock(&load_mutex); + } + + return policy; +} + +__u32 sptlrpc_name2flavor_base(const char *name) +{ + if (!strcmp(name, "null")) + return SPTLRPC_FLVR_NULL; + if (!strcmp(name, "plain")) + return SPTLRPC_FLVR_PLAIN; + if (!strcmp(name, "gssnull")) + return SPTLRPC_FLVR_GSSNULL; + if (!strcmp(name, "krb5n")) + return SPTLRPC_FLVR_KRB5N; + if (!strcmp(name, 
"krb5a")) + return SPTLRPC_FLVR_KRB5A; + if (!strcmp(name, "krb5i")) + return SPTLRPC_FLVR_KRB5I; + if (!strcmp(name, "krb5p")) + return SPTLRPC_FLVR_KRB5P; + if (!strcmp(name, "skn")) + return SPTLRPC_FLVR_SKN; + if (!strcmp(name, "ska")) + return SPTLRPC_FLVR_SKA; + if (!strcmp(name, "ski")) + return SPTLRPC_FLVR_SKI; + if (!strcmp(name, "skpi")) + return SPTLRPC_FLVR_SKPI; + + return SPTLRPC_FLVR_INVALID; +} +EXPORT_SYMBOL(sptlrpc_name2flavor_base); + +const char *sptlrpc_flavor2name_base(__u32 flvr) +{ + __u32 base = SPTLRPC_FLVR_BASE(flvr); + + if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) + return "null"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) + return "plain"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_GSSNULL)) + return "gssnull"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) + return "krb5n"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) + return "krb5a"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) + return "krb5i"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) + return "krb5p"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKN)) + return "skn"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKA)) + return "ska"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKI)) + return "ski"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKPI)) + return "skpi"; + + CERROR("invalid wire flavor 0x%x\n", flvr); + return "invalid"; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_base); + +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize) +{ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) + snprintf(buf, bufsize, "hash:%s", + sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); + else + snprintf(buf, bufsize, "%s", + sptlrpc_flavor2name_base(sf->sf_rpc)); + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); + +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +{ + snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc)); + + /* + * currently we don't support customized bulk specification for + * flavors other than plain + */ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { + char bspec[16]; + + bspec[0] = '-'; + sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); + strncat(buf, bspec, bufsize); + } + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name); + +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} +EXPORT_SYMBOL(sptlrpc_secflags2str); + +/************************************************** + * client context APIs * + **************************************************/ + +static +struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec) +{ + struct vfs_cred vcred; + int create = 1, remove_dead = 1; + + LASSERT(sec); + LASSERT(sec->ps_policy->sp_cops->lookup_ctx); + + if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY)) { + vcred.vc_uid = 0; + vcred.vc_gid = 0; + if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) { + create = 0; + remove_dead = 0; + } + } else { + vcred.vc_uid = from_kuid(&init_user_ns, current_uid()); + 
vcred.vc_gid = from_kgid(&init_user_ns, current_gid()); + } + + return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred, create, + remove_dead); +} + +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx) +{ + atomic_inc(&ctx->cc_refcount); + return ctx; +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_get); + +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + + LASSERT(sec); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return; + + sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_put); + +/** + * Expire the client context immediately. + * + * \pre Caller must hold at least 1 reference on the \a ctx. + */ +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(ctx->cc_ops->die); + ctx->cc_ops->die(ctx, 0); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_expire); + +/** + * To wake up the threads who are waiting for this client context. Called + * after some status change happened on \a ctx. + */ +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_request *req, *next; + + spin_lock(&ctx->cc_lock); + list_for_each_entry_safe(req, next, &ctx->cc_req_list, + rq_ctx_chain) { + list_del_init(&req->rq_ctx_chain); + ptlrpc_client_wake_req(req); + } + spin_unlock(&ctx->cc_lock); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup); + +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize) +{ + LASSERT(ctx->cc_ops); + + if (ctx->cc_ops->display == NULL) + return 0; + + return ctx->cc_ops->display(ctx, buf, bufsize); +} + +static int import_sec_check_expire(struct obd_import *imp) +{ + int adapt = 0; + + spin_lock(&imp->imp_lock); + if (imp->imp_sec_expire && + imp->imp_sec_expire < ktime_get_real_seconds()) { + adapt = 1; + imp->imp_sec_expire = 0; + } + spin_unlock(&imp->imp_lock); + + if (!adapt) + return 0; + + CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n"); + return sptlrpc_import_sec_adapt(imp, NULL, NULL); +} + +/** + * Get and validate the client side ptlrpc security facilities from + * \a imp. There is a race condition on client reconnect when the import is + * being destroyed while there are outstanding client bound requests. In + * this case do not output any error messages if import secuity is not + * found. + * + * \param[in] imp obd import associated with client + * \param[out] sec client side ptlrpc security + * + * \retval 0 if security retrieved successfully + * \retval -ve errno if there was a problem + */ +static int import_sec_validate_get(struct obd_import *imp, + struct ptlrpc_sec **sec) +{ + int rc; + + if (unlikely(imp->imp_sec_expire)) { + rc = import_sec_check_expire(imp); + if (rc) + return rc; + } + + *sec = sptlrpc_import_sec_ref(imp); + if (*sec == NULL) { + /* Only output an error when the import is still active */ + if (!test_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(&imp->imp_zombie_work))) + CERROR("import %p (%s) with no sec\n", + imp, ptlrpc_import_state_name(imp->imp_state)); + return -EACCES; + } + + if (unlikely((*sec)->ps_dying)) { + CERROR("attempt to use dying sec %p\n", sec); + sptlrpc_sec_put(*sec); + return -EACCES; + } + + return 0; +} + +/** + * Given a \a req, find or allocate an appropriate context for it. + * \pre req->rq_cli_ctx == NULL. + * + * \retval 0 succeed, and req->rq_cli_ctx is set. + * \retval -ev error number, and req->rq_cli_ctx == NULL. 
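+ *
+ * Typical failures are -EACCES when the import has no security or
+ * its security is dying, and -ECONNREFUSED when no context can be
+ * found or created for the current credentials.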
+ */ +int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_sec *sec; + int rc; + ENTRY; + + LASSERT(!req->rq_cli_ctx); + LASSERT(imp); + + rc = import_sec_validate_get(imp, &sec); + if (rc) + RETURN(rc); + + req->rq_cli_ctx = get_my_ctx(sec); + + sptlrpc_sec_put(sec); + + if (!req->rq_cli_ctx) { + CERROR("req %p: fail to get context\n", req); + RETURN(-ECONNREFUSED); + } + + RETURN(0); +} + +/** + * Drop the context for \a req. + * \pre req->rq_cli_ctx != NULL. + * \post req->rq_cli_ctx == NULL. + * + * If \a sync == 0, this function should return quickly without sleep; + * otherwise it might trigger and wait for the whole process of sending + * an context-destroying rpc to server. + */ +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync) +{ + ENTRY; + + LASSERT(req); + LASSERT(req->rq_cli_ctx); + + /* request might be asked to release earlier while still + * in the context waiting list. + */ + if (!list_empty(&req->rq_ctx_chain)) { + spin_lock(&req->rq_cli_ctx->cc_lock); + list_del_init(&req->rq_ctx_chain); + spin_unlock(&req->rq_cli_ctx->cc_lock); + } + + sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync); + req->rq_cli_ctx = NULL; + EXIT; +} + +static +int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, + struct ptlrpc_cli_ctx *oldctx, + struct ptlrpc_cli_ctx *newctx) +{ + struct sptlrpc_flavor old_flvr; + char *reqmsg = NULL; /* to workaround old gcc */ + int reqmsg_size; + int rc = 0; + + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqlen); + LASSERT(req->rq_replen); + + CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), " + "switch sec %p(%s) -> %p(%s)\n", req, + oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec), + newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec), + oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name, + newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name); + + /* save flavor */ + old_flvr = req->rq_flvr; + + /* save request message */ + reqmsg_size = req->rq_reqlen; + if (reqmsg_size != 0) { + OBD_ALLOC_LARGE(reqmsg, reqmsg_size); + if (reqmsg == NULL) + return -ENOMEM; + memcpy(reqmsg, req->rq_reqmsg, reqmsg_size); + } + + /* release old req/rep buf */ + req->rq_cli_ctx = oldctx; + sptlrpc_cli_free_reqbuf(req); + sptlrpc_cli_free_repbuf(req); + req->rq_cli_ctx = newctx; + + /* recalculate the flavor */ + sptlrpc_req_set_flavor(req, 0); + + /* alloc new request buffer + * we don't need to alloc reply buffer here, leave it to the + * rest procedure of ptlrpc */ + if (reqmsg_size != 0) { + rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size); + if (!rc) { + LASSERT(req->rq_reqmsg); + memcpy(req->rq_reqmsg, reqmsg, reqmsg_size); + } else { + CWARN("failed to alloc reqbuf: %d\n", rc); + req->rq_flvr = old_flvr; + } + + OBD_FREE_LARGE(reqmsg, reqmsg_size); + } + return rc; +} + +/** + * If current context of \a req is dead somehow, e.g. we just switched flavor + * thus marked original contexts dead, we'll find a new context for it. if + * no switch is needed, \a req will end up with the same context. + * + * \note a request must have a context, to keep other parts of code happy. + * In any case of failure during the switching, we must restore the old one. 
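+ *
+ * Sketch of the expected caller pattern (illustrative only; the in-tree
+ * caller is sptlrpc_req_refresh_ctx() below):
+ *
+ *      if (test_bit(PTLRPC_CTX_DEAD_BIT, &req->rq_cli_ctx->cc_flags)) {
+ *              rc = sptlrpc_req_replace_dead_ctx(req);
+ *              if (rc)
+ *                      return rc;
+ *      }
+ *
+ * On failure req->rq_cli_ctx still points at the old context, so a request
+ * is never left without a context.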
+ */ +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx; + struct ptlrpc_cli_ctx *newctx; + int rc; + ENTRY; + + LASSERT(oldctx); + + sptlrpc_cli_ctx_get(oldctx); + sptlrpc_req_put_ctx(req, 0); + + rc = sptlrpc_req_get_ctx(req); + if (unlikely(rc)) { + LASSERT(!req->rq_cli_ctx); + + /* restore old ctx */ + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + newctx = req->rq_cli_ctx; + LASSERT(newctx); + + if (unlikely(newctx == oldctx && + test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) { + /* + * still get the old dead ctx, usually means system too busy + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, relax a little bit\n", + newctx, newctx->cc_flags); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); + } else if (unlikely(test_bit(PTLRPC_CTX_UPTODATE_BIT, &newctx->cc_flags) + == 0)) { + /* + * new ctx not up to date yet + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, not up to date yet\n", + newctx, newctx->cc_flags); + } else { + /* + * it's possible newctx == oldctx if we're switching + * subflavor with the same sec. + */ + rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); + if (rc) { + /* restore old ctx */ + sptlrpc_req_put_ctx(req, 0); + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + LASSERT(req->rq_cli_ctx == newctx); + } + + sptlrpc_cli_ctx_put(oldctx, 1); + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx); + +static +int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx) +{ + if (cli_ctx_is_refreshed(ctx)) + return 1; + return 0; +} + +static +int ctx_refresh_timeout(void *data) +{ + struct ptlrpc_request *req = data; + int rc; + + /* conn_cnt is needed in expire_one_request */ + lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt); + + rc = ptlrpc_expire_one_request(req, 1); + /* if we started recovery, we should mark this ctx dead; otherwise + * in case of lgssd died nobody would retire this ctx, following + * connecting will still find the same ctx thus cause deadlock. + * there's an assumption that expire time of the request should be + * later than the context refresh expire time. + */ + if (rc == 0) + req->rq_cli_ctx->cc_ops->die(req->rq_cli_ctx, 0); + return rc; +} + +static +void ctx_refresh_interrupt(void *data) +{ + struct ptlrpc_request *req = data; + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} + +static +void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) +{ + spin_lock(&ctx->cc_lock); + if (!list_empty(&req->rq_ctx_chain)) + list_del_init(&req->rq_ctx_chain); + spin_unlock(&ctx->cc_lock); +} + +/** + * To refresh the context of \req, if it's not up-to-date. + * \param timeout + * - < 0: don't wait + * - = 0: wait until success or fatal error occur + * - > 0: timeout value (in seconds) + * + * The status of the context could be subject to be changed by other threads + * at any time. We allow this race, but once we return with 0, the caller will + * suppose it's uptodated and keep using it until the owning rpc is done. + * + * \retval 0 only if the context is uptodated. + * \retval -ev error number. + */ +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec *sec; + struct l_wait_info lwi; + int rc; + ENTRY; + + LASSERT(ctx); + + if (req->rq_ctx_init || req->rq_ctx_fini) + RETURN(0); + + /* + * during the process a request's context might change type even + * (e.g. 
from gss ctx to null ctx), so each loop we need to re-check + * everything + */ +again: + rc = import_sec_validate_get(req->rq_import, &sec); + if (rc) + RETURN(rc); + + if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n", + req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc); + req_off_ctx_list(req, ctx); + sptlrpc_req_replace_dead_ctx(req); + ctx = req->rq_cli_ctx; + } + sptlrpc_sec_put(sec); + + if (cli_ctx_is_eternal(ctx)) + RETURN(0); + + if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { + if (ctx->cc_ops->refresh) + ctx->cc_ops->refresh(ctx); + } + LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); + + LASSERT(ctx->cc_ops->validate); + if (ctx->cc_ops->validate(ctx) == 0) { + req_off_ctx_list(req, ctx); + RETURN(0); + } + + if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + req_off_ctx_list(req, ctx); + RETURN(-EPERM); + } + + /* + * There's a subtle issue for resending RPCs, suppose following + * situation: + * 1. the request was sent to server. + * 2. recovery was kicked start, after finished the request was + * marked as resent. + * 3. resend the request. + * 4. old reply from server received, we accept and verify the reply. + * this has to be success, otherwise the error will be aware + * by application. + * 5. new reply from server received, dropped by LNet. + * + * Note the xid of old & new request is the same. We can't simply + * change xid for the resent request because the server replies on + * it for reply reconstruction. + * + * Commonly the original context should be uptodate because we + * have an expiry nice time; server will keep its context because + * we at least hold a ref of old context which prevent context + * from destroying RPC being sent. So server still can accept the + * request and finish the RPC. But if that's not the case: + * 1. If server side context has been trimmed, a NO_CONTEXT will + * be returned, gss_cli_ctx_verify/unseal will switch to new + * context by force. + * 2. Current context never be refreshed, then we are fine: we + * never really send request with old context before. 
+ */ + if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) && + unlikely(req->rq_reqmsg) && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { + req_off_ctx_list(req, ctx); + RETURN(0); + } + + if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) { + req_off_ctx_list(req, ctx); + /* + * don't switch ctx if import was deactivated + */ + if (req->rq_import->imp_deactive) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(-EINTR); + } + + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc) { + LASSERT(ctx == req->rq_cli_ctx); + CERROR("req %p: failed to replace dead ctx %p: %d\n", + req, ctx, rc); + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(rc); + } + + ctx = req->rq_cli_ctx; + goto again; + } + + /* + * Now we're sure this context is during upcall, add myself into + * waiting list + */ + spin_lock(&ctx->cc_lock); + if (list_empty(&req->rq_ctx_chain)) + list_add(&req->rq_ctx_chain, &ctx->cc_req_list); + spin_unlock(&ctx->cc_lock); + + if (timeout < 0) + RETURN(-EWOULDBLOCK); + + /* Clear any flags that may be present from previous sends */ + LASSERT(req->rq_receiving_reply == 0); + spin_lock(&req->rq_lock); + req->rq_err = 0; + req->rq_timedout = 0; + req->rq_resend = 0; + req->rq_restart = 0; + spin_unlock(&req->rq_lock); + + lwi = LWI_TIMEOUT_INTR(msecs_to_jiffies(timeout * MSEC_PER_SEC), + ctx_refresh_timeout, + ctx_refresh_interrupt, req); + rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi); + + /* + * following cases could lead us here: + * - successfully refreshed; + * - interrupted; + * - timedout, and we don't want recover from the failure; + * - timedout, and waked up upon recovery finished; + * - someone else mark this ctx dead by force; + * - someone invalidate the req and call ptlrpc_client_wake_req(), + * e.g. ptlrpc_abort_inflight(); + */ + if (!cli_ctx_is_refreshed(ctx)) { + /* timed out or interruptted */ + req_off_ctx_list(req, ctx); + + LASSERT(rc != 0); + RETURN(rc); + } + + goto again; +} + +/* Bring ptlrpc_sec context up-to-date */ +int sptlrpc_export_update_ctx(struct obd_export *exp) +{ + struct obd_import *imp = exp ? exp->exp_imp_reverse : NULL; + struct ptlrpc_sec *sec = NULL; + struct ptlrpc_cli_ctx *ctx = NULL; + int rc = 0; + + if (imp) + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + } + + if (ctx) { + if (ctx->cc_ops->refresh) + rc = ctx->cc_ops->refresh(ctx); + sptlrpc_cli_ctx_put(ctx, 1); + } + return rc; +} + +/** + * Initialize flavor settings for \a req, according to \a opcode. 
+ * + * \note this could be called in two situations: + * - new request from ptlrpc_pre_req(), with proper @opcode + * - old request which changed ctx in the middle, with @opcode == 0 + */ +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) +{ + struct ptlrpc_sec *sec; + + LASSERT(req->rq_import); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0); + + /* special security flags according to opcode */ + switch (opcode) { + case OST_READ: + case MDS_READPAGE: + case MGS_CONFIG_READ: + case OBD_IDX_READ: + req->rq_bulk_read = 1; + break; + case OST_WRITE: + case MDS_WRITEPAGE: + req->rq_bulk_write = 1; + break; + case SEC_CTX_INIT: + req->rq_ctx_init = 1; + break; + case SEC_CTX_FINI: + req->rq_ctx_fini = 1; + break; + case 0: + /* init/fini rpc won't be resend, so can't be here */ + LASSERT(req->rq_ctx_init == 0); + LASSERT(req->rq_ctx_fini == 0); + + /* cleanup flags, which should be recalculated */ + req->rq_pack_udesc = 0; + req->rq_pack_bulk = 0; + break; + } + + sec = req->rq_cli_ctx->cc_sec; + + spin_lock(&sec->ps_lock); + req->rq_flvr = sec->ps_flvr; + spin_unlock(&sec->ps_lock); + + /* force SVC_NULL for context initiation rpc, SVC_INTG for context + * destruction rpc */ + if (unlikely(req->rq_ctx_init)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); + else if (unlikely(req->rq_ctx_fini)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); + + /* user descriptor flag, null security can't do it anyway */ + if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && + (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)) + req->rq_pack_udesc = 1; + + /* bulk security flag */ + if ((req->rq_bulk_read || req->rq_bulk_write) && + sptlrpc_flavor_has_bulk(&req->rq_flvr)) + req->rq_pack_bulk = 1; +} + +void sptlrpc_request_out_callback(struct ptlrpc_request *req) +{ + if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) + return; + + LASSERT(req->rq_clrbuf); + if (req->rq_pool || !req->rq_reqbuf) + return; + + OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; +} + +/** + * Given an import \a imp, check whether current user has a valid context + * or not. We may create a new context and try to refresh it, and try + * repeatedly try in case of non-fatal errors. Return 0 means success. + */ +int sptlrpc_import_check_ctx(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + might_sleep(); + + sec = sptlrpc_import_sec_ref(imp); + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + + if (!ctx) + RETURN(-ENOMEM); + + if (cli_ctx_is_eternal(ctx) || + ctx->cc_ops->validate(ctx) == 0) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); + } + + if (cli_ctx_is_error(ctx)) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(-EACCES); + } + + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + RETURN(-ENOMEM); + + ptlrpc_cli_req_init(req); + atomic_set(&req->rq_refcount, 10000); + + req->rq_import = imp; + req->rq_flvr = sec->ps_flvr; + req->rq_cli_ctx = ctx; + + rc = sptlrpc_req_refresh_ctx(req, 0); + LASSERT(list_empty(&req->rq_ctx_chain)); + sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1); + ptlrpc_request_cache_free(req); + + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform the pre-defined security transformation + * upon the request message of \a req. After this function called, + * req->rq_reqmsg is still accessible as clear text. 
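+ *
+ * For orientation, the client-side helpers in this file are used roughly in
+ * the following order (a sketch only; the real sequencing lives in the
+ * ptlrpc client paths, not in this file):
+ *
+ *      sptlrpc_cli_alloc_reqbuf(req, msgsize);
+ *      ... fill req->rq_reqmsg ...
+ *      sptlrpc_cli_wrap_request(req);
+ *      ... request sent, reply received ...
+ *      sptlrpc_cli_unwrap_reply(req);
+ *      ... use req->rq_repmsg ...
+ *      sptlrpc_cli_free_repbuf(req);
+ *      sptlrpc_cli_free_reqbuf(req);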
+ */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc = 0; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* we wrap bulk request here because now we can be sure + * the context is uptodate. + */ + if (req->rq_bulk) { + rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk); + if (rc) + RETURN(rc); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->sign); + rc = ctx->cc_ops->sign(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->seal); + rc = ctx->cc_ops->seal(ctx, req); + break; + default: + LBUG(); + } + + if (rc == 0) { + LASSERT(req->rq_reqdata_len); + LASSERT(req->rq_reqdata_len % 8 == 0); + LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); + } + + RETURN(rc); +} + +static int do_cli_unwrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata); + LASSERT(req->rq_repmsg == NULL); + + req->rq_rep_swab_mask = 0; + + rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len); + switch (rc) { + case 1: + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("failed unpack reply: x%llu\n", req->rq_xid); + RETURN(-EPROTO); + } + + if (req->rq_repdata_len < sizeof(struct lustre_msg)) { + CERROR("replied data length %d too small\n", + req->rq_repdata_len); + RETURN(-EPROTO); + } + + if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) != + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + CERROR("reply policy %u doesn't match request policy %u\n", + SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr), + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)); + RETURN(-EPROTO); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->verify); + rc = ctx->cc_ops->verify(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->unseal); + rc = ctx->cc_ops->unseal(ctx, req); + break; + default: + LBUG(); + } + LASSERT(rc || req->rq_repmsg || req->rq_resend); + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && + !req->rq_ctx_init) + req->rq_rep_swab_mask = 0; + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the reply + * message of \a req. After return successfully, req->rq_repmsg points to + * the reply message in clear text. + * + * \pre the reply buffer should have been un-posted from LNet, so nothing is + * going to change. + */ +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); + + if (req->rq_reply_off == 0 && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CERROR("real reply with offset 0\n"); + return -EPROTO; + } + + if (req->rq_reply_off % 8 != 0) { + CERROR("reply at odd offset %u\n", req->rq_reply_off); + return -EPROTO; + } + + req->rq_repdata = (struct lustre_msg *) + (req->rq_repbuf + req->rq_reply_off); + req->rq_repdata_len = req->rq_nob_received; + + return do_cli_unwrap_reply(req); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the early + * reply message of \a req. 
We expect the rq_reply_off is 0, and + * rq_nob_received is the early reply size. + * + * Because the receive buffer might be still posted, the reply data might be + * changed at any time, no matter we're holding rq_lock or not. For this reason + * we allocate a separate ptlrpc_request and reply buffer for early reply + * processing. + * + * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request. + * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned + * \a *req_ret to release it. + * \retval -ev error number, and \a req_ret will not be set. + */ +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret) +{ + struct ptlrpc_request *early_req; + char *early_buf; + int early_bufsz, early_size; + int rc; + ENTRY; + + early_req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (early_req == NULL) + RETURN(-ENOMEM); + + ptlrpc_cli_req_init(early_req); + + early_size = req->rq_nob_received; + early_bufsz = size_roundup_power2(early_size); + OBD_ALLOC_LARGE(early_buf, early_bufsz); + if (early_buf == NULL) + GOTO(err_req, rc = -ENOMEM); + + /* sanity checkings and copy data out, do it inside spinlock */ + spin_lock(&req->rq_lock); + + if (req->rq_replied) { + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + + if (req->rq_reply_off != 0) { + CERROR("early reply with offset %u\n", req->rq_reply_off); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EPROTO); + } + + if (req->rq_nob_received != early_size) { + /* even another early arrived the size should be the same */ + CERROR("data size has changed from %u to %u\n", + early_size, req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EINVAL); + } + + if (req->rq_nob_received < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", + req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + memcpy(early_buf, req->rq_repbuf, early_size); + spin_unlock(&req->rq_lock); + + early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx); + early_req->rq_flvr = req->rq_flvr; + early_req->rq_repbuf = early_buf; + early_req->rq_repbuf_len = early_bufsz; + early_req->rq_repdata = (struct lustre_msg *) early_buf; + early_req->rq_repdata_len = early_size; + early_req->rq_early = 1; + early_req->rq_reqmsg = req->rq_reqmsg; + + rc = do_cli_unwrap_reply(early_req); + if (rc) { + DEBUG_REQ(D_ADAPTTO, early_req, + "error %d unwrap early reply", rc); + GOTO(err_ctx, rc); + } + + LASSERT(early_req->rq_repmsg); + *req_ret = early_req; + RETURN(0); + +err_ctx: + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); +err_buf: + OBD_FREE_LARGE(early_buf, early_bufsz); +err_req: + ptlrpc_request_cache_free(early_req); + RETURN(rc); +} + +/** + * Used by ptlrpc client, to release a processed early reply \a early_req. + * + * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply(). + */ +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req) +{ + LASSERT(early_req->rq_repbuf); + LASSERT(early_req->rq_repdata); + LASSERT(early_req->rq_repmsg); + + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); + OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len); + ptlrpc_request_cache_free(early_req); +} + +/************************************************** + * sec ID * + **************************************************/ + +/* + * "fixed" sec (e.g. 
null) use sec_id < 0 + */ +static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1); + +int sptlrpc_get_next_secid(void) +{ + return atomic_inc_return(&sptlrpc_sec_id); +} +EXPORT_SYMBOL(sptlrpc_get_next_secid); + +/************************************************** + * client side high-level security APIs * + **************************************************/ + +static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, + int grace, int force) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT(policy->sp_cops); + LASSERT(policy->sp_cops->flush_ctx_cache); + + return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force); +} + +static void sec_cop_destroy_sec(struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT_ATOMIC_ZERO(&sec->ps_refcount); + LASSERT_ATOMIC_ZERO(&sec->ps_nctx); + LASSERT(policy->sp_cops->destroy_sec); + + CDEBUG(D_SEC, "%s@%p: being destroied\n", sec->ps_policy->sp_name, sec); + + policy->sp_cops->destroy_sec(sec); + sptlrpc_policy_put(policy); +} + +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec) +{ + sec_cop_destroy_sec(sec); +} +EXPORT_SYMBOL(sptlrpc_sec_destroy); + +static void sptlrpc_sec_kill(struct ptlrpc_sec *sec) +{ + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (sec->ps_policy->sp_cops->kill_sec) { + sec->ps_policy->sp_cops->kill_sec(sec); + + sec_cop_flush_ctx_cache(sec, -1, 1, 1); + } +} + +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec) +{ + if (sec) + atomic_inc(&sec->ps_refcount); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_sec_get); + +void sptlrpc_sec_put(struct ptlrpc_sec *sec) +{ + if (sec) { + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (atomic_dec_and_test(&sec->ps_refcount)) { + sptlrpc_gc_del_sec(sec); + sec_cop_destroy_sec(sec); + } + } +} +EXPORT_SYMBOL(sptlrpc_sec_put); + +/* + * policy module is responsible for taking refrence of import + */ +static +struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf, + enum lustre_sec_part sp) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_sec *sec; + char str[32]; + ENTRY; + + if (svc_ctx) { + LASSERT(imp->imp_dlm_fake == 1); + + CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_policy_get(svc_ctx->sc_policy); + sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; + } else { + LASSERT(imp->imp_dlm_fake == 0); + + CDEBUG(D_SEC, "%s %s: select security flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_wireflavor2policy(sf->sf_rpc); + if (!policy) { + CERROR("invalid flavor 0x%x\n", sf->sf_rpc); + RETURN(NULL); + } + } + + sec = policy->sp_cops->create_sec(imp, svc_ctx, sf); + if (sec) { + atomic_inc(&sec->ps_refcount); + + sec->ps_part = sp; + + if (sec->ps_gc_interval && policy->sp_cops->gc_ctx) + sptlrpc_gc_add_sec(sec); + } else { + sptlrpc_policy_put(policy); + } + + RETURN(sec); +} + +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + + spin_lock(&imp->imp_lock); + sec = sptlrpc_sec_get(imp->imp_sec); + spin_unlock(&imp->imp_lock); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_import_sec_ref); + +static void sptlrpc_import_sec_install(struct obd_import *imp, + struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec *old_sec; + + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + 
spin_lock(&imp->imp_lock); + old_sec = imp->imp_sec; + imp->imp_sec = sec; + spin_unlock(&imp->imp_lock); + + if (old_sec) { + sptlrpc_sec_kill(old_sec); + + /* balance the ref taken by this import */ + sptlrpc_sec_put(old_sec); + } +} + +static inline +int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) +{ + return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); +} + +static inline +void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) +{ + *dst = *src; +} + +/** + * To get an appropriate ptlrpc_sec for the \a imp, according to the current + * configuration. Upon called, imp->imp_sec may or may not be NULL. + * + * - regular import: \a svc_ctx should be NULL and \a flvr is ignored; + * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request. + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *flvr) +{ + struct ptlrpc_connection *conn; + struct sptlrpc_flavor sf; + struct ptlrpc_sec *sec, *newsec; + enum lustre_sec_part sp; + char str[24]; + int rc = 0; + ENTRY; + + might_sleep(); + + if (imp == NULL) + RETURN(0); + + conn = imp->imp_connection; + + if (svc_ctx == NULL) { + struct client_obd *cliobd = &imp->imp_obd->u.cli; + /* + * normal import, determine flavor from rule set, except + * for mgc the flavor is predetermined. + */ + if (cliobd->cl_sp_me == LUSTRE_SP_MGC) + sf = cliobd->cl_flvr_mgc; + else + sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, + cliobd->cl_sp_to, + &cliobd->cl_target_uuid, + conn->c_self, &sf); + + sp = imp->imp_obd->u.cli.cl_sp_me; + } else { + /* reverse import, determine flavor from incoming reqeust */ + sf = *flvr; + + if (sf.sf_rpc != SPTLRPC_FLVR_NULL) + sf.sf_flags = PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY; + + sp = sptlrpc_target_sec_part(imp->imp_obd); + } + + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + char str2[24]; + + if (flavor_equal(&sf, &sec->ps_flvr)) + GOTO(out, rc); + + CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), + sptlrpc_flavor2name(&sf, str2, sizeof(str2))); + } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) { + CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + LNET_NIDNET(conn->c_self), + sptlrpc_flavor2name(&sf, str, sizeof(str))); + } + + mutex_lock(&imp->imp_sec_mutex); + + newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp); + if (newsec) { + sptlrpc_import_sec_install(imp, newsec); + } else { + CERROR("import %s->%s: failed to create new sec\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid)); + rc = -EPERM; + } + + mutex_unlock(&imp->imp_sec_mutex); +out: + sptlrpc_sec_put(sec); + RETURN(rc); +} + +void sptlrpc_import_sec_put(struct obd_import *imp) +{ + if (imp->imp_sec) { + sptlrpc_sec_kill(imp->imp_sec); + + sptlrpc_sec_put(imp->imp_sec); + imp->imp_sec = NULL; + } +} + +static void import_flush_ctx_common(struct obd_import *imp, + uid_t uid, int grace, int force) +{ + struct ptlrpc_sec *sec; + + if (imp == NULL) + return; + + sec = sptlrpc_import_sec_ref(imp); + if (sec == NULL) + return; + + sec_cop_flush_ctx_cache(sec, uid, grace, force); + sptlrpc_sec_put(sec); +} + +void sptlrpc_import_flush_root_ctx(struct obd_import *imp) +{ + /* it's important to use grace mode, see explain in + * sptlrpc_req_refresh_ctx() */ + import_flush_ctx_common(imp, 0, 
1, 1); +} + +void sptlrpc_import_flush_my_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, from_kuid(&init_user_ns, current_uid()), + 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx); + +void sptlrpc_import_flush_all_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, -1, 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx); + +/** + * Used by ptlrpc client to allocate request buffer of \a req. Upon return + * successfully, req->rq_reqmsg points to a buffer with size \a msgsize. + */ +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT(req->rq_reqmsg == NULL); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + policy = ctx->cc_sec->ps_policy; + rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize); + if (!rc) { + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* zeroing preallocated buffer */ + if (req->rq_pool) + memset(req->rq_reqmsg, 0, msgsize); + } + + return rc; +} + +/** + * Used by ptlrpc client to free request buffer of \a req. After this + * req->rq_reqmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL) + return; + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_reqbuf(ctx->cc_sec, req); + req->rq_reqmsg = NULL; +} + +/* + * NOTE caller must guarantee the buffer size is enough for the enlargement + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize) +{ + void *src, *dst; + int oldsize, oldmsg_size, movesize; + + LASSERT(segment < msg->lm_bufcount); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return; + + /* nothing to do if we are enlarging the last segment */ + if (segment == msg->lm_bufcount - 1) { + msg->lm_buflens[segment] = newsize; + return; + } + + oldsize = msg->lm_buflens[segment]; + + src = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = newsize; + dst = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = oldsize; + + /* move from segment + 1 to end segment */ + LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2); + oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg); + LASSERT(movesize >= 0); + + if (movesize) + memmove(dst, src, movesize); + + /* note we don't clear the ares where old data live, not secret */ + + /* finally set new segment size */ + msg->lm_buflens[segment] = newsize; +} +EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace); + +/** + * Used by ptlrpc client to enlarge the \a segment of request message pointed + * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be + * preserved after the enlargement. this must be called after original request + * buffer being allocated. + * + * \note after this be called, rq_reqmsg and rq_reqlen might have been changed, + * so caller should refresh its local pointers if needed. 
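+ *
+ * Hypothetical usage sketch ('RMF_SOME_FIELD' is only a placeholder, not a
+ * field defined by this patch):
+ *
+ *      rc = sptlrpc_cli_enlarge_reqbuf(req, &RMF_SOME_FIELD, newsize);
+ *      if (rc == 0)
+ *              ... re-read req->rq_reqmsg / req->rq_reqlen and re-fetch any
+ *                  cached buffer pointers, e.g. via lustre_msg_buf() ...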
+ */ +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + const struct req_msg_field *field, + int newsize) +{ + struct req_capsule *pill = &req->rq_pill; + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_cops *cops; + struct lustre_msg *msg = req->rq_reqmsg; + int segment = __req_capsule_offset(pill, field, RCL_CLIENT); + + LASSERT(ctx); + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return 0; + + cops = ctx->cc_sec->ps_policy->sp_cops; + LASSERT(cops->enlarge_reqbuf); + return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize); +} +EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf); + +/** + * Used by ptlrpc client to allocate reply buffer of \a req. + * + * \note After this, req->rq_repmsg is still not accessible. + */ +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + + if (req->rq_repbuf) + RETURN(0); + + policy = ctx->cc_sec->ps_policy; + RETURN(policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize)); +} + +/** + * Used by ptlrpc client to free reply buffer of \a req. After this + * req->rq_repmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_repbuf == NULL) + return; + LASSERT(req->rq_repbuf_len); + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_repbuf(ctx->cc_sec, req); + req->rq_repmsg = NULL; + EXIT; +} +EXPORT_SYMBOL(sptlrpc_cli_free_repbuf); + +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy; + + if (!policy->sp_cops->install_rctx) + return 0; + return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx); +} + +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->sc_policy; + + if (!policy->sp_sops->install_rctx) + return 0; + return policy->sp_sops->install_rctx(imp, ctx); +} + +/* Get SELinux policy info from userspace */ +static int sepol_helper(struct obd_import *imp) +{ + char mtime_str[21] = { 0 }, mode_str[2] = { 0 }; + char *argv[] = { + [0] = "/usr/sbin/l_getsepol", + [1] = "-o", + [2] = NULL, /* obd type */ + [3] = "-n", + [4] = NULL, /* obd name */ + [5] = "-t", + [6] = mtime_str, /* policy mtime */ + [7] = "-m", + [8] = mode_str, /* enforcing mode */ + [9] = NULL + }; + char *envp[] = { + [0] = "HOME=/", + [1] = "PATH=/sbin:/usr/sbin", + [2] = NULL + }; + signed short ret; + int rc = 0; + + if (imp == NULL || imp->imp_obd == NULL || + imp->imp_obd->obd_type == NULL) { + rc = -EINVAL; + } else { + argv[2] = imp->imp_obd->obd_type->typ_name; + argv[4] = imp->imp_obd->obd_name; + spin_lock(&imp->imp_sec->ps_lock); + if (ktime_to_ns(imp->imp_sec->ps_sepol_mtime) == 0 && + imp->imp_sec->ps_sepol[0] == '\0') { + /* ps_sepol has not been initialized */ + argv[5] = NULL; + argv[7] = NULL; + } else { + time64_t mtime_ms; + + mtime_ms = ktime_to_ms(imp->imp_sec->ps_sepol_mtime); + snprintf(mtime_str, sizeof(mtime_str), "%lld", + mtime_ms / MSEC_PER_SEC); + mode_str[0] = 
imp->imp_sec->ps_sepol[0]; + } + spin_unlock(&imp->imp_sec->ps_lock); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + rc = ret>>8; + } + + return rc; +} + +static inline int sptlrpc_sepol_needs_check(struct ptlrpc_sec *imp_sec) +{ + ktime_t checknext; + + if (send_sepol == 0 || !selinux_is_enabled()) + return 0; + + if (send_sepol == -1) + /* send_sepol == -1 means fetch sepol status every time */ + return 1; + + spin_lock(&imp_sec->ps_lock); + checknext = imp_sec->ps_sepol_checknext; + spin_unlock(&imp_sec->ps_lock); + + /* next check is too far in time, please update */ + if (ktime_after(checknext, + ktime_add(ktime_get(), ktime_set(send_sepol, 0)))) + goto setnext; + + if (ktime_before(ktime_get(), checknext)) + /* too early to fetch sepol status */ + return 0; + +setnext: + /* define new sepol_checknext time */ + spin_lock(&imp_sec->ps_lock); + imp_sec->ps_sepol_checknext = ktime_add(ktime_get(), + ktime_set(send_sepol, 0)); + spin_unlock(&imp_sec->ps_lock); + + return 1; +} + +int sptlrpc_get_sepol(struct ptlrpc_request *req) +{ + struct ptlrpc_sec *imp_sec = req->rq_import->imp_sec; + int rc = 0; + + ENTRY; + + (req->rq_sepol)[0] = '\0'; + +#ifndef HAVE_SELINUX + if (unlikely(send_sepol != 0)) + CDEBUG(D_SEC, "Client cannot report SELinux status, " + "it was not built against libselinux.\n"); + RETURN(0); +#endif + + if (send_sepol == 0 || !selinux_is_enabled()) + RETURN(0); + + if (imp_sec == NULL) + RETURN(-EINVAL); + + /* Retrieve SELinux status info */ + if (sptlrpc_sepol_needs_check(imp_sec)) + rc = sepol_helper(req->rq_import); + if (likely(rc == 0)) { + spin_lock(&imp_sec->ps_lock); + memcpy(req->rq_sepol, imp_sec->ps_sepol, + sizeof(req->rq_sepol)); + spin_unlock(&imp_sec->ps_lock); + } + + RETURN(rc); +} +EXPORT_SYMBOL(sptlrpc_get_sepol); + +/**************************************** + * server side security * + ****************************************/ + +static int flavor_allowed(struct sptlrpc_flavor *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor *flvr = &req->rq_flvr; + + if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) + return 1; + + if ((req->rq_ctx_init || req->rq_ctx_fini) && + SPTLRPC_FLVR_POLICY(exp->sf_rpc) == + SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && + SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) + return 1; + + return 0; +} + +#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10) + +/** + * Given an export \a exp, check whether the flavor of incoming \a req + * is allowed by the export \a exp. Main logic is about taking care of + * changing configurations. Return 0 means success. + */ +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor flavor; + + if (exp == NULL) + return 0; + + /* client side export has no imp_reverse, skip + * FIXME maybe we should check flavor this as well??? */ + if (exp->exp_imp_reverse == NULL) + return 0; + + /* don't care about ctx fini rpc */ + if (req->rq_ctx_fini) + return 0; + + spin_lock(&exp->exp_lock); + + /* if flavor just changed (exp->exp_flvr_changed != 0), we wait for + * the first req with the new flavor, then treat it as current flavor, + * adapt reverse sec according to it. + * note the first rpc with new flavor might not be with root ctx, in + * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. 
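+ *
+ * Per-export state used below (summarized from the code, no new behaviour):
+ *      exp_flvr                            - current flavor
+ *      exp_flvr_old[0] / exp_flvr_expire[0] - previous ("middle") flavor
+ *      exp_flvr_old[1] / exp_flvr_expire[1] - oldest flavor; while
+ *      exp_flvr_changed is set it instead carries the pending new flavor.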
*/ + if (unlikely(exp->exp_flvr_changed) && + flavor_allowed(&exp->exp_flvr_old[1], req)) { + /* make the new flavor as "current", and old ones as + * about-to-expire */ + CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp, + exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr_old[1]; + exp->exp_flvr_old[1] = exp->exp_flvr_old[0]; + exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0]; + exp->exp_flvr_old[0] = exp->exp_flvr; + exp->exp_flvr_expire[0] = ktime_get_real_seconds() + + EXP_FLVR_UPDATE_EXPIRE; + exp->exp_flvr = flavor; + + /* flavor change finished */ + exp->exp_flvr_changed = 0; + LASSERT(exp->exp_flvr_adapt == 1); + + /* if it's gss, we only interested in root ctx init */ + if (req->rq_auth_gss && + !(req->rq_ctx_init && + (req->rq_auth_usr_root || req->rq_auth_usr_mdt || + req->rq_auth_usr_ost))) { + spin_unlock(&exp->exp_lock); + CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n", + req->rq_auth_gss, req->rq_ctx_init, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, + req->rq_auth_usr_ost); + return 0; + } + + exp->exp_flvr_adapt = 0; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, &flavor); + } + + /* if it equals to the current flavor, we accept it, but need to + * dealing with reverse sec/ctx */ + if (likely(flavor_allowed(&exp->exp_flvr, req))) { + /* most cases should return here, we only interested in + * gss root ctx init */ + if (!req->rq_auth_gss || !req->rq_ctx_init || + (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost)) { + spin_unlock(&exp->exp_lock); + return 0; + } + + /* if flavor just changed, we should not proceed, just leave + * it and current flavor will be discovered and replaced + * shortly, and let _this_ rpc pass through */ + if (exp->exp_flvr_changed) { + LASSERT(exp->exp_flvr_adapt); + spin_unlock(&exp->exp_lock); + return 0; + } + + if (exp->exp_flvr_adapt) { + exp->exp_flvr_adapt = 0; + CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, + &flavor); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, " + "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + spin_unlock(&exp->exp_lock); + + return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse, + req->rq_svc_ctx); + } + } + + if (exp->exp_flvr_expire[0]) { + if (exp->exp_flvr_expire[0] >= ktime_get_real_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[0], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the middle one (%lld)\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + (s64)(exp->exp_flvr_expire[0] - + ktime_get_real_seconds())); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark middle expired\n"); + exp->exp_flvr_expire[0] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } + + /* now it doesn't match the current flavor, the only chance we can + * accept it is match the old flavors which is not expired. 
*/ + if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) { + if (exp->exp_flvr_expire[1] >= ktime_get_real_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[1], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the oldest one (%lld)\n", + exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + (s64)(exp->exp_flvr_expire[1] - + ktime_get_real_seconds())); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark oldest expired\n"); + exp->exp_flvr_expire[1] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n", + exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + + spin_unlock(&exp->exp_lock); + + CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with unauthorized flavor %x, expect %x|%x(%+lld)|%x(%+lld)\n", + exp, exp->exp_obd->obd_name, + req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost, + req->rq_flvr.sf_rpc, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_expire[0] ? + (s64)(exp->exp_flvr_expire[0] - ktime_get_real_seconds()) : 0, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] ? + (s64)(exp->exp_flvr_expire[1] - ktime_get_real_seconds()) : 0); + return -EACCES; +} +EXPORT_SYMBOL(sptlrpc_target_export_check); + +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct obd_export *exp; + struct sptlrpc_flavor new_flvr; + + LASSERT(obd); + + spin_lock(&obd->obd_dev_lock); + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + if (exp->exp_connection == NULL) + continue; + + /* note if this export had just been updated flavor + * (exp_flvr_changed == 1), this will override the + * previous one. 
*/ + spin_lock(&exp->exp_lock); + sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer, + exp->exp_connection->c_peer.nid, + &new_flvr); + if (exp->exp_flvr_changed || + !flavor_equal(&new_flvr, &exp->exp_flvr)) { + exp->exp_flvr_old[1] = new_flvr; + exp->exp_flvr_expire[1] = 0; + exp->exp_flvr_changed = 1; + exp->exp_flvr_adapt = 1; + + CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n", + exp, sptlrpc_part2name(exp->exp_sp_peer), + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + spin_unlock(&exp->exp_lock); + } + + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor); + +static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) +{ + /* peer's claim is unreliable unless gss is being used */ + if (!req->rq_auth_gss || svc_rc == SECSVC_DROP) + return svc_rc; + + switch (req->rq_sp_from) { + case LUSTRE_SP_CLI: + if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source CLI"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MDT: + if (!req->rq_auth_usr_mdt) { + DEBUG_REQ(D_ERROR, req, "faked source MDT"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_OST: + if (!req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source OST"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MGS: + case LUSTRE_SP_MGC: + if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_ANY: + default: + DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from); + svc_rc = SECSVC_DROP; + } + + return svc_rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon request message of + * incoming \a req. This must be the first thing to do with an incoming + * request in ptlrpc layer. + * + * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in + * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set. + * \retval SECSVC_COMPLETE success, the request has been fully processed, and + * reply message has been prepared. + * \retval SECSVC_DROP failed, this request should be dropped. + */ +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + struct lustre_msg *msg = req->rq_reqbuf; + int rc; + ENTRY; + + LASSERT(msg); + LASSERT(req->rq_reqmsg == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_svc_ctx == NULL); + + req->rq_req_swab_mask = 0; + + rc = __lustre_unpack_msg(msg, req->rq_reqdata_len); + switch (rc) { + case 1: + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("error unpacking request from %s x%llu\n", + libcfs_id2str(req->rq_peer), req->rq_xid); + RETURN(SECSVC_DROP); + } + + req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); + req->rq_sp_from = LUSTRE_SP_ANY; + req->rq_auth_uid = -1; /* set to INVALID_UID */ + req->rq_auth_mapped_uid = -1; + + policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); + if (!policy) { + CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + LASSERT(policy->sp_sops->accept); + rc = policy->sp_sops->accept(req); + sptlrpc_policy_put(policy); + LASSERT(req->rq_reqmsg || rc != SECSVC_OK); + LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); + + /* + * if it's not null flavor (which means embedded packing msg), + * reset the swab mask for the comming inner msg unpacking. 
+ */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) + req->rq_req_swab_mask = 0; + + /* sanity check for the request source */ + rc = sptlrpc_svc_check_from(req, rc); + RETURN(rc); +} + +/** + * Used by ptlrpc server, to allocate a reply buffer for \a req. If it succeeds, + * req->rq_reply_state is set, and req->rq_reply_state->rs_msg points to + * a buffer of \a msglen size. + */ +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_reply_state *rs; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->alloc_rs); + + rc = policy->sp_sops->alloc_rs(req, msglen); + if (unlikely(rc == -ENOMEM)) { + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + if (svcpt->scp_service->srv_max_reply_size < + msglen + sizeof(struct ptlrpc_reply_state)) { + /* Just return failure if the size is too big */ + CERROR("size of message is too big (%zd), %d allowed\n", + msglen + sizeof(struct ptlrpc_reply_state), + svcpt->scp_service->srv_max_reply_size); + RETURN(-ENOMEM); + } + + /* failed alloc, try emergency pool */ + rs = lustre_get_emerg_rs(svcpt); + if (rs == NULL) + RETURN(-ENOMEM); + + req->rq_reply_state = rs; + rc = policy->sp_sops->alloc_rs(req, msglen); + if (rc) { + lustre_put_emerg_rs(rs); + req->rq_reply_state = NULL; + } + } + + LASSERT(rc != 0 || + (req->rq_reply_state && req->rq_reply_state->rs_msg)); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to perform transformation upon the reply message. + * + * \post req->rq_reply_off is set to an appropriate server-controlled reply offset. + * \post req->rq_repmsg and req->rq_reply_state->rs_msg become inaccessible. + */ +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->authorize); + + rc = policy->sp_sops->authorize(req); + LASSERT(rc || req->rq_reply_state->rs_repdata_len); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to free the reply_state.
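+ *
+ * Sketch of the server-side reply-state life cycle as used in this file
+ * (orientation only; the actual call sites are in the ptlrpc service code):
+ *
+ *      sptlrpc_svc_alloc_rs(req, msglen);
+ *      ... fill req->rq_reply_state->rs_msg ...
+ *      sptlrpc_svc_wrap_reply(req);
+ *      ... reply sent by the service code ...
+ *      sptlrpc_svc_free_rs(req->rq_reply_state);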
+ */ +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_sec_policy *policy; + unsigned int prealloc; + ENTRY; + + LASSERT(rs->rs_svc_ctx); + LASSERT(rs->rs_svc_ctx->sc_policy); + + policy = rs->rs_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->free_rs); + + prealloc = rs->rs_prealloc; + policy->sp_sops->free_rs(rs); + + if (prealloc) + lustre_put_emerg_rs(rs); + EXIT; +} + +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx != NULL) + atomic_inc(&ctx->sc_refcount); +} + +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (atomic_dec_and_test(&ctx->sc_refcount)) { + if (ctx->sc_policy->sp_sops->free_ctx) + ctx->sc_policy->sp_sops->free_ctx(ctx); + } + req->rq_svc_ctx = NULL; +} + +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (ctx->sc_policy->sp_sops->invalidate_ctx) + ctx->sc_policy->sp_sops->invalidate_ctx(ctx); +} +EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate); + +/**************************************** + * bulk security * + ****************************************/ + +/** + * Perform transformation upon bulk data pointed by \a desc. This is called + * before transforming the request message. + */ +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->wrap_bulk) + return ctx->cc_ops->wrap_bulk(ctx, req, desc); + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); + +/** + * This is called after unwrap the reply message. + * return nob of actual plain text size received, or error code. + */ +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_read && !req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return desc->bd_nob_transferred; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + return desc->bd_nob_transferred; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); + +/** + * This is called after unwrap the reply message. + * return 0 for success or error code. + */ +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(!req->rq_bulk_read && req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + + /* + * if everything is going right, nob should equals to nob_transferred. + * in case of privacy mode, nob_transferred needs to be adjusted. + */ + if (desc->bd_nob != desc->bd_nob_transferred) { + CERROR("nob %d doesn't match transferred nob %d\n", + desc->bd_nob, desc->bd_nob_transferred); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write); + +#ifdef HAVE_SERVER_SUPPORT +/** + * Performe transformation upon outgoing bulk read. 
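+ *
+ * For orientation (sketch only): bulk read data is wrapped here on the
+ * server and unwrapped on the client by sptlrpc_cli_unwrap_bulk_read();
+ * bulk write data is wrapped on the client by sptlrpc_cli_wrap_bulk() and
+ * unwrapped on the server by sptlrpc_svc_unwrap_bulk(), with buffers
+ * prepared beforehand by sptlrpc_svc_prep_bulk().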
+ */ +int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_read); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->wrap_bulk) + return ctx->sc_policy->sp_sops->wrap_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_wrap_bulk); + +/** + * Performe transformation upon incoming bulk write. + */ +int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_write); + + /* + * if it's in privacy mode, transferred should >= expected; otherwise + * transferred should == expected. + */ + if (desc->bd_nob_transferred < desc->bd_nob || + (desc->bd_nob_transferred > desc->bd_nob && + SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != + SPTLRPC_BULK_SVC_PRIV)) { + DEBUG_REQ(D_ERROR, req, "truncated bulk GET %d(%d)", + desc->bd_nob_transferred, desc->bd_nob); + return -ETIMEDOUT; + } + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->unwrap_bulk) { + rc = ctx->sc_policy->sp_sops->unwrap_bulk(req, desc); + if (rc) + CERROR("error unwrap bulk: %d\n", rc); + } + + /* return 0 to allow reply be sent */ + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_unwrap_bulk); + +/** + * Prepare buffers for incoming bulk write. + */ +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->prep_bulk) + return ctx->sc_policy->sp_sops->prep_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_prep_bulk); + +#endif /* HAVE_SERVER_SUPPORT */ + +/**************************************** + * user descriptor helpers * + ****************************************/ + +int sptlrpc_current_user_desc_size(void) +{ + int ngroups; + + ngroups = current_ngroups; + + if (ngroups > LUSTRE_MAX_GROUPS) + ngroups = LUSTRE_MAX_GROUPS; + return sptlrpc_user_desc_size(ngroups); +} +EXPORT_SYMBOL(sptlrpc_current_user_desc_size); + +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) +{ + struct ptlrpc_user_desc *pud; + + pud = lustre_msg_buf(msg, offset, 0); + + pud->pud_uid = from_kuid(&init_user_ns, current_uid()); + pud->pud_gid = from_kgid(&init_user_ns, current_gid()); + pud->pud_fsuid = from_kuid(&init_user_ns, current_fsuid()); + pud->pud_fsgid = from_kgid(&init_user_ns, current_fsgid()); + pud->pud_cap = cfs_curproc_cap_pack(); + pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4; + + task_lock(current); + if (pud->pud_ngroups > current_ngroups) + pud->pud_ngroups = current_ngroups; +#ifdef HAVE_GROUP_INFO_GID + memcpy(pud->pud_groups, current_cred()->group_info->gid, + pud->pud_ngroups * sizeof(__u32)); +#else /* !HAVE_GROUP_INFO_GID */ + memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + pud->pud_ngroups * sizeof(__u32)); +#endif /* HAVE_GROUP_INFO_GID */ + task_unlock(current); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_pack_user_desc); + +int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_user_desc *pud; + int i; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + if (!pud) + return -EINVAL; + + if (swabbed) { + __swab32s(&pud->pud_uid); + __swab32s(&pud->pud_gid); + __swab32s(&pud->pud_fsuid); + __swab32s(&pud->pud_fsgid); + __swab32s(&pud->pud_cap); + 
__swab32s(&pud->pud_ngroups); + } + + if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) { + CERROR("%u groups is too large\n", pud->pud_ngroups); + return -EINVAL; + } + + if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) > + msg->lm_buflens[offset]) { + CERROR("%u groups are claimed but bufsize only %u\n", + pud->pud_ngroups, msg->lm_buflens[offset]); + return -EINVAL; + } + + if (swabbed) { + for (i = 0; i < pud->pud_ngroups; i++) + __swab32s(&pud->pud_groups[i]); + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_unpack_user_desc); + +/**************************************** + * misc helpers * + ****************************************/ + +const char * sec2target_str(struct ptlrpc_sec *sec) +{ + if (!sec || !sec->ps_import || !sec->ps_import->imp_obd) + return "*"; + if (sec_is_reverse(sec)) + return "c"; + return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid); +} +EXPORT_SYMBOL(sec2target_str); + +/* + * return true if the bulk data is protected + */ +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr) +{ + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_INTG: + case SPTLRPC_BULK_SVC_PRIV: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(sptlrpc_flavor_has_bulk); + +/**************************************** + * crypto API helper/alloc blkciper * + ****************************************/ + +/**************************************** + * initialize/finalize * + ****************************************/ + +int sptlrpc_init(void) +{ + int rc; + + rwlock_init(&policy_lock); + + rc = sptlrpc_gc_init(); + if (rc) + goto out; + + rc = sptlrpc_conf_init(); + if (rc) + goto out_gc; + + rc = sptlrpc_enc_pool_init(); + if (rc) + goto out_conf; + + rc = sptlrpc_null_init(); + if (rc) + goto out_pool; + + rc = sptlrpc_plain_init(); + if (rc) + goto out_null; + + rc = sptlrpc_lproc_init(); + if (rc) + goto out_plain; + + return 0; + +out_plain: + sptlrpc_plain_fini(); +out_null: + sptlrpc_null_fini(); +out_pool: + sptlrpc_enc_pool_fini(); +out_conf: + sptlrpc_conf_fini(); +out_gc: + sptlrpc_gc_fini(); +out: + return rc; +} + +void sptlrpc_fini(void) +{ + sptlrpc_lproc_fini(); + sptlrpc_plain_fini(); + sptlrpc_null_fini(); + sptlrpc_enc_pool_fini(); + sptlrpc_conf_fini(); + sptlrpc_gc_fini(); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c new file mode 100644 index 0000000000000..216c2f2a0820b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c @@ -0,0 +1,959 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static int mult = 20 - PAGE_SHIFT; +static int enc_pool_max_memory_mb; +module_param(enc_pool_max_memory_mb, int, 0644); +MODULE_PARM_DESC(enc_pool_max_memory_mb, + "Encoding pool max memory (MB), 1/8 of total physical memory by default"); + +/**************************************** + * bulk encryption page pools * + ****************************************/ + + +#define PTRS_PER_PAGE (PAGE_SIZE / sizeof(void *)) +#define PAGES_PER_POOL (PTRS_PER_PAGE) + +#define IDLE_IDX_MAX (100) +#define IDLE_IDX_WEIGHT (3) + +#define CACHE_QUIESCENT_PERIOD (20) + +static struct ptlrpc_enc_page_pool { + /* + * constants + */ + unsigned long epp_max_pages; /* maximum pages can hold, const */ + unsigned int epp_max_pools; /* number of pools, const */ + + /* + * wait queue in case of not enough free pages. + */ + wait_queue_head_t epp_waitq; /* waiting threads */ + unsigned int epp_waitqlen; /* wait queue length */ + unsigned long epp_pages_short; /* # of pages wanted of in-q users */ + unsigned int epp_growing:1; /* during adding pages */ + + /* + * indicating how idle the pools are, from 0 to MAX_IDLE_IDX + * this is counted based on each time when getting pages from + * the pools, not based on time. which means in case that system + * is idled for a while but the idle_idx might still be low if no + * activities happened in the pools. + */ + unsigned long epp_idle_idx; + + /* last shrink time due to mem tight */ + time64_t epp_last_shrink; + time64_t epp_last_access; + + /* + * in-pool pages bookkeeping + */ + spinlock_t epp_lock; /* protect following fields */ + unsigned long epp_total_pages; /* total pages in pools */ + unsigned long epp_free_pages; /* current pages available */ + + /* + * statistics + */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ + unsigned int epp_st_grows; /* # of grows */ + unsigned int epp_st_grow_fails; /* # of add pages failures */ + unsigned int epp_st_shrinks; /* # of shrinks */ + unsigned long epp_st_access; /* # of access */ + unsigned long epp_st_missings; /* # of cache missing */ + unsigned long epp_st_lowfree; /* lowest free pages reached */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ + ktime_t epp_st_max_wait; /* in nanoseconds */ + unsigned long epp_st_outofmem; /* # of out of mem requests */ + /* + * pointers to pools, may be vmalloc'd + */ + struct page ***epp_pools; +} page_pools; + +/* + * memory shrinker + */ +static const int pools_shrinker_seeks = DEFAULT_SEEKS; +static struct shrinker *pools_shrinker; + + +/* + * /proc/fs/lustre/sptlrpc/encrypt_page_pools + */ +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) +{ + spin_lock(&page_pools.epp_lock); + + seq_printf(m, "physical pages: %lu\n" + "pages per pool: %lu\n" + "max pages: %lu\n" + "max pools: %u\n" + "total pages: %lu\n" + "total free: %lu\n" + "idle index: %lu/100\n" + "last shrink: %llds\n" + "last access: %llds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" + "cache missing: %lu\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time ms: %lld\n" + "out of mem: %lu\n", + cfs_totalram_pages(), PAGES_PER_POOL, + 
page_pools.epp_max_pages, + page_pools.epp_max_pools, + page_pools.epp_total_pages, + page_pools.epp_free_pages, + page_pools.epp_idle_idx, + ktime_get_seconds() - page_pools.epp_last_shrink, + ktime_get_seconds() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, + page_pools.epp_st_missings, + page_pools.epp_st_lowfree, + page_pools.epp_st_max_wqlen, + ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_outofmem); + + spin_unlock(&page_pools.epp_lock); + return 0; +} + +static void enc_pools_release_free_pages(long npages) +{ + int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + + LASSERT(npages > 0); + LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); + + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; + + page_pools.epp_free_pages -= npages; + page_pools.epp_total_pages -= npages; + + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + + __free_page(page_pools.epp_pools[p_idx][g_idx]); + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; + } +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_count(struct shrinker *s, + struct shrink_control *sc) +{ + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) ? 0 : + (page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_scan(struct shrinker *s, + struct shrink_control *sc) +{ + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) + sc->nr_to_scan = 0; + else + sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan, + page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES); + if (sc->nr_to_scan > 0) { + enc_pools_release_free_pages(sc->nr_to_scan); + CDEBUG(D_SEC, "released %ld pages, %ld left\n", + (long)sc->nr_to_scan, page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = ktime_get_seconds(); + } + spin_unlock(&page_pools.epp_lock); + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. 
+ */ + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return sc->nr_to_scan; +} + +#ifndef HAVE_SHRINKER_COUNT +/* + * could be called frequently for query (@nr_to_scan == 0). + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + struct shrink_control scv = { + .nr_to_scan = shrink_param(sc, nr_to_scan), + .gfp_mask = shrink_param(sc, gfp_mask) + }; +#if !defined(HAVE_SHRINKER_WANT_SHRINK_PTR) && !defined(HAVE_SHRINK_CONTROL) + struct shrinker* shrinker = NULL; +#endif + + enc_pools_shrink_scan(shrinker, &scv); + + return enc_pools_shrink_count(shrinker, &scv); +} + +#endif /* HAVE_SHRINKER_COUNT */ + +static inline +int npages_to_npools(unsigned long npages) +{ + return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL); +} + +/* + * return how many pages cleaned up. + */ +static unsigned long enc_pools_cleanup(struct page ***pools, int npools) +{ + unsigned long cleaned = 0; + int i, j; + + for (i = 0; i < npools; i++) { + if (pools[i]) { + for (j = 0; j < PAGES_PER_POOL; j++) { + if (pools[i][j]) { + __free_page(pools[i][j]); + cleaned++; + } + } + OBD_FREE(pools[i], PAGE_SIZE); + pools[i] = NULL; + } + } + + return cleaned; +} + +/* + * merge @npools pointed by @pools which contains @npages new pages + * into current pools. + * + * we have options to avoid most memory copy with some tricks. but we choose + * the simplest way to avoid complexity. It's not frequently called. + */ +static void enc_pools_insert(struct page ***pools, int npools, int npages) +{ + int freeslot; + int op_idx, np_idx, og_idx, ng_idx; + int cur_npools, end_npools; + + LASSERT(npages > 0); + LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); + LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); + + spin_lock(&page_pools.epp_lock); + + /* + * (1) fill all the free slots of current pools. + */ + /* free slots are those left by rent pages, and the extra ones with + * index >= total_pages, locate at the tail of last pool. */ + freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; + if (freeslot != 0) + freeslot = PAGES_PER_POOL - freeslot; + freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages; + + op_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + og_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + np_idx = npools - 1; + ng_idx = (npages - 1) % PAGES_PER_POOL; + + while (freeslot) { + LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL); + LASSERT(pools[np_idx][ng_idx] != NULL); + + page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx]; + pools[np_idx][ng_idx] = NULL; + + freeslot--; + + if (++og_idx == PAGES_PER_POOL) { + op_idx++; + og_idx = 0; + } + if (--ng_idx < 0) { + if (np_idx == 0) + break; + np_idx--; + ng_idx = PAGES_PER_POOL - 1; + } + } + + /* + * (2) add pools if needed. 
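+	 * A page at global index i lives at
+	 * epp_pools[i / PAGES_PER_POOL][i % PAGES_PER_POOL]; growing by
+	 * npages therefore means taking over the caller's freshly allocated
+	 * pool arrays for pool indices cur_npools .. end_npools - 1.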
+ */ + cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) / + PAGES_PER_POOL; + end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL -1) / + PAGES_PER_POOL; + LASSERT(end_npools <= page_pools.epp_max_pools); + + np_idx = 0; + while (cur_npools < end_npools) { + LASSERT(page_pools.epp_pools[cur_npools] == NULL); + LASSERT(np_idx < npools); + LASSERT(pools[np_idx] != NULL); + + page_pools.epp_pools[cur_npools++] = pools[np_idx]; + pools[np_idx++] = NULL; + } + + page_pools.epp_total_pages += npages; + page_pools.epp_free_pages += npages; + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; + + CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, + page_pools.epp_total_pages); + + spin_unlock(&page_pools.epp_lock); +} + +static int enc_pools_add_pages(int npages) +{ + static DEFINE_MUTEX(add_pages_mutex); + struct page ***pools; + int npools, alloced = 0; + int i, j, rc = -ENOMEM; + + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; + + mutex_lock(&add_pages_mutex); + + if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) + npages = page_pools.epp_max_pages - page_pools.epp_total_pages; + LASSERT(npages > 0); + + page_pools.epp_st_grows++; + + npools = npages_to_npools(npages); + OBD_ALLOC(pools, npools * sizeof(*pools)); + if (pools == NULL) + goto out; + + for (i = 0; i < npools; i++) { + OBD_ALLOC(pools[i], PAGE_SIZE); + if (pools[i] == NULL) + goto out_pools; + + for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { + pools[i][j] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if (pools[i][j] == NULL) + goto out_pools; + + alloced++; + } + } + LASSERT(alloced == npages); + + enc_pools_insert(pools, npools, npages); + CDEBUG(D_SEC, "added %d pages into pools\n", npages); + rc = 0; + +out_pools: + enc_pools_cleanup(pools, npools); + OBD_FREE(pools, npools * sizeof(*pools)); +out: + if (rc) { + page_pools.epp_st_grow_fails++; + CERROR("Failed to allocate %d enc pages\n", npages); + } + + mutex_unlock(&add_pages_mutex); + return rc; +} + +static inline void enc_pools_wakeup(void) +{ + assert_spin_locked(&page_pools.epp_lock); + + if (unlikely(page_pools.epp_waitqlen)) { + LASSERT(waitqueue_active(&page_pools.epp_waitq)); + wake_up_all(&page_pools.epp_waitq); + } +} + +static int enc_pools_should_grow(int page_needed, time64_t now) +{ + /* don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) + return 0; + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; + + /* + * we wanted to return 0 here if there was a shrink just + * happened a moment ago, but this may cause deadlock if both + * client and ost live on single node. + */ + + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? 
+ */ + + /* grow the pools in any other cases */ + return 1; +} + +/* + * Export the number of free pages in the pool + */ +int get_free_pages_in_pool(void) +{ + return page_pools.epp_free_pages; +} +EXPORT_SYMBOL(get_free_pages_in_pool); + +/* + * Let outside world know if enc_pool full capacity is reached + */ +int pool_is_at_full_capacity(void) +{ + return (page_pools.epp_total_pages == page_pools.epp_max_pages); +} +EXPORT_SYMBOL(pool_is_at_full_capacity); + +/* + * we allocate the requested pages atomically. + */ +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) +{ + wait_queue_entry_t waitlink; + unsigned long this_idle = -1; + u64 tick_ns = 0; + time64_t now; + int p_idx, g_idx; + int i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); + + /* resent bulk, enc iov might have been allocated previously */ + if (GET_ENC_KIOV(desc) != NULL) + return 0; + + OBD_ALLOC_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * sizeof(*GET_ENC_KIOV(desc))); + if (GET_ENC_KIOV(desc) == NULL) + return -ENOMEM; + + spin_lock(&page_pools.epp_lock); + + page_pools.epp_st_access++; +again: + if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { + if (tick_ns == 0) + tick_ns = ktime_get_ns(); + + now = ktime_get_real_seconds(); + + page_pools.epp_st_missings++; + page_pools.epp_pages_short += desc->bd_iov_count; + + if (enc_pools_should_grow(desc->bd_iov_count, now)) { + page_pools.epp_growing = 1; + + spin_unlock(&page_pools.epp_lock); + enc_pools_add_pages(page_pools.epp_pages_short / 2); + spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; + + enc_pools_wakeup(); + } else { + if (page_pools.epp_growing) { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + set_current_state(TASK_UNINTERRUPTIBLE); + init_waitqueue_entry(&waitlink, current); + add_wait_queue(&page_pools.epp_waitq, + &waitlink); + + spin_unlock(&page_pools.epp_lock); + schedule(); + remove_wait_queue(&page_pools.epp_waitq, + &waitlink); + LASSERT(page_pools.epp_waitqlen > 0); + spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; + } else { + /* ptlrpcd thread should not sleep in that case, + * or deadlock may occur! + * Instead, return -ENOMEM so that upper layers + * will put request back in queue. 
*/ + page_pools.epp_st_outofmem++; + spin_unlock(&page_pools.epp_lock); + OBD_FREE_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * + sizeof(*GET_ENC_KIOV(desc))); + GET_ENC_KIOV(desc) = NULL; + return -ENOMEM; + } + } + + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; + + this_idle = 0; + goto again; + } + + /* record max wait time */ + if (unlikely(tick_ns)) { + ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns); + + if (ktime_after(tick, page_pools.epp_st_max_wait)) + page_pools.epp_st_max_wait = tick; + } + + /* proceed with rest of allocation */ + page_pools.epp_free_pages -= desc->bd_iov_count; + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + BD_GET_ENC_KIOV(desc, i).kiov_page = + page_pools.epp_pools[p_idx][g_idx]; + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + /* + * new idle index = (old * weight + new) / (weight + 1) + */ + if (this_idle == -1) { + this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX / + page_pools.epp_total_pages; + } + page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT + + this_idle) / + (IDLE_IDX_WEIGHT + 1); + + page_pools.epp_last_access = ktime_get_seconds(); + + spin_unlock(&page_pools.epp_lock); + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); + +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) +{ + int p_idx, g_idx; + int i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + if (GET_ENC_KIOV(desc) == NULL) + return; + + LASSERT(desc->bd_iov_count > 0); + + spin_lock(&page_pools.epp_lock); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= + page_pools.epp_total_pages); + LASSERT(page_pools.epp_pools[p_idx]); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(BD_GET_ENC_KIOV(desc, i).kiov_page != NULL); + LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); + + page_pools.epp_pools[p_idx][g_idx] = + BD_GET_ENC_KIOV(desc, i).kiov_page; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + page_pools.epp_free_pages += desc->bd_iov_count; + + enc_pools_wakeup(); + + spin_unlock(&page_pools.epp_lock); + + OBD_FREE_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * sizeof(*GET_ENC_KIOV(desc))); + GET_ENC_KIOV(desc) = NULL; +} + +/* + * we don't do much stuff for add_user/del_user anymore, except adding some + * initial pages in add_user() if current pools are empty, rest would be + * handled by the pools's self-adaption. 
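+ * ("self-adaption" above refers to the idle index: on every allocation
+ * sptlrpc_enc_pool_get_pages() folds the instantaneous free ratio,
+ * free_pages * IDLE_IDX_MAX / total_pages, into epp_idle_idx as
+ * (old * IDLE_IDX_WEIGHT + new) / (IDLE_IDX_WEIGHT + 1); with weight 3 a
+ * reading of 100 moves an index of 0 to 25.  enc_pools_shrink_count()
+ * then scales the number of reclaimable pages it reports by this index.)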
+ */ +int sptlrpc_enc_pool_add_user(void) +{ + int need_grow = 0; + + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + spin_unlock(&page_pools.epp_lock); + } + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + +int sptlrpc_enc_pool_del_user(void) +{ + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); + +static inline void enc_pools_alloc(void) +{ + LASSERT(page_pools.epp_max_pools); + OBD_ALLOC_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static inline void enc_pools_free(void) +{ + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); + + OBD_FREE_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +int sptlrpc_enc_pool_init(void) +{ + DEF_SHRINKER_VAR(shvar, enc_pools_shrink, + enc_pools_shrink_count, enc_pools_shrink_scan); + + page_pools.epp_max_pages = cfs_totalram_pages() / 8; + if (enc_pool_max_memory_mb > 0 && + enc_pool_max_memory_mb <= (cfs_totalram_pages() >> mult)) + page_pools.epp_max_pages = enc_pool_max_memory_mb << mult; + + page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); + + init_waitqueue_head(&page_pools.epp_waitq); + page_pools.epp_waitqlen = 0; + page_pools.epp_pages_short = 0; + + page_pools.epp_growing = 0; + + page_pools.epp_idle_idx = 0; + page_pools.epp_last_shrink = ktime_get_seconds(); + page_pools.epp_last_access = ktime_get_seconds(); + + spin_lock_init(&page_pools.epp_lock); + page_pools.epp_total_pages = 0; + page_pools.epp_free_pages = 0; + + page_pools.epp_st_max_pages = 0; + page_pools.epp_st_grows = 0; + page_pools.epp_st_grow_fails = 0; + page_pools.epp_st_shrinks = 0; + page_pools.epp_st_access = 0; + page_pools.epp_st_missings = 0; + page_pools.epp_st_lowfree = 0; + page_pools.epp_st_max_wqlen = 0; + page_pools.epp_st_max_wait = ktime_set(0, 0); + page_pools.epp_st_outofmem = 0; + + enc_pools_alloc(); + if (page_pools.epp_pools == NULL) + return -ENOMEM; + + pools_shrinker = set_shrinker(pools_shrinker_seeks, &shvar); + if (pools_shrinker == NULL) { + enc_pools_free(); + return -ENOMEM; + } + + return 0; +} + +void sptlrpc_enc_pool_fini(void) +{ + unsigned long cleaned, npools; + + LASSERT(pools_shrinker); + LASSERT(page_pools.epp_pools); + LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); + + remove_shrinker(pools_shrinker); + + npools = npages_to_npools(page_pools.epp_total_pages); + cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); + LASSERT(cleaned == page_pools.epp_total_pages); + + enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CDEBUG(D_SEC, + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_outofmem); + } +} + + +static int cfs_hash_alg_id[] = { + [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, + [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, + [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, + [BULK_HASH_ALG_MD5] = 
CFS_HASH_ALG_MD5, + [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, + [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, + [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, + [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, +}; +const char * sptlrpc_get_hash_name(__u8 hash_alg) +{ + return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); +} + +__u8 sptlrpc_get_hash_alg(const char *algname) +{ + return cfs_crypto_hash_alg(algname); +} + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + int size = msg->lm_buflens[offset]; + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + if (bsd == NULL) { + CERROR("Invalid bulk sec desc: size %d\n", size); + return -EINVAL; + } + + if (swabbed) { + __swab32s(&bsd->bsd_nob); + } + + if (unlikely(bsd->bsd_version != 0)) { + CERROR("Unexpected version %u\n", bsd->bsd_version); + return -EPROTO; + } + + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); + return -EPROTO; + } + + /* FIXME more sanity check here */ + + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(bulk_sec_desc_unpack); + +/* + * Compute the checksum of an RPC buffer payload. If the return \a buflen + * is not large enough, truncate the result to fit so that it is possible + * to use a hash function with a large hash space, but only use a part of + * the resulting hash. + */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) +{ + struct ahash_request *req; + int hashsize; + unsigned int bufsize; + int i, err; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + req = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(req)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_hash_alg_id[alg])); + return PTR_ERR(req); + } + + hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); + + for (i = 0; i < desc->bd_iov_count; i++) { + cfs_crypto_hash_update_page(req, + BD_GET_KIOV(desc, i).kiov_page, + BD_GET_KIOV(desc, i).kiov_offset & + ~PAGE_MASK, + BD_GET_KIOV(desc, i).kiov_len); + } + + if (hashsize > buflen) { + unsigned char hashbuf[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + + bufsize = sizeof(hashbuf); + LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", + bufsize, hashsize); + err = cfs_crypto_hash_final(req, hashbuf, &bufsize); + memcpy(buf, hashbuf, buflen); + } else { + bufsize = buflen; + err = cfs_crypto_hash_final(req, buf, &bufsize); + } + + return err; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c new file mode 100644 index 0000000000000..b661ff8696530 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c @@ -0,0 +1,967 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +const char *sptlrpc_part2name(enum lustre_sec_part part) +{ + switch (part) { + case LUSTRE_SP_CLI: + return "cli"; + case LUSTRE_SP_MDT: + return "mdt"; + case LUSTRE_SP_OST: + return "ost"; + case LUSTRE_SP_MGC: + return "mgc"; + case LUSTRE_SP_MGS: + return "mgs"; + case LUSTRE_SP_ANY: + return "any"; + default: + return "err"; + } +} +EXPORT_SYMBOL(sptlrpc_part2name); + +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd) +{ + const char *type = obd->obd_type->typ_name; + + if (!strcmp(type, LUSTRE_MDT_NAME)) + return LUSTRE_SP_MDT; + if (!strcmp(type, LUSTRE_OST_NAME)) + return LUSTRE_SP_OST; + if (!strcmp(type, LUSTRE_MGS_NAME)) + return LUSTRE_SP_MGS; + + CERROR("unknown target %p(%s)\n", obd, type); + return LUSTRE_SP_ANY; +} + +/**************************************** + * user supplied flavor string parsing * + ****************************************/ + +/* + * format: [-] + */ +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) +{ + char buf[32]; + char *bulk, *alg; + + memset(flvr, 0, sizeof(*flvr)); + + if (str == NULL || str[0] == '\0') { + flvr->sf_rpc = SPTLRPC_FLVR_INVALID; + return 0; + } + + strlcpy(buf, str, sizeof(buf)); + + bulk = strchr(buf, '-'); + if (bulk) + *bulk++ = '\0'; + + flvr->sf_rpc = sptlrpc_name2flavor_base(buf); + if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID) + goto err_out; + + /* + * currently only base flavor "plain" can have bulk specification. 
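+	 * A bare "plain" therefore defaults to adler32 bulk checksums, while
+	 * "plain-hash:<alg>" selects integrity checksumming with <alg>,
+	 * provided <alg> is a name cfs_crypto_hash_alg() recognizes.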
+ */ + if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) { + flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32; + if (bulk) { + /* + * format: plain-hash: + */ + alg = strchr(bulk, ':'); + if (alg == NULL) + goto err_out; + *alg++ = '\0'; + + if (strcmp(bulk, "hash")) + goto err_out; + + flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg); + if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX) + goto err_out; + } + + if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL) + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL); + else + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG); + } else { + if (bulk) + goto err_out; + } + + flvr->sf_flags = 0; + return 0; + +err_out: + CERROR("invalid flavor string: %s\n", str); + return -EINVAL; +} +EXPORT_SYMBOL(sptlrpc_parse_flavor); + +/**************************************** + * configure rules * + ****************************************/ + +static void get_default_flavor(struct sptlrpc_flavor *sf) +{ + memset(sf, 0, sizeof(*sf)); + + sf->sf_rpc = SPTLRPC_FLVR_NULL; + sf->sf_flags = 0; +} + +static void sptlrpc_rule_init(struct sptlrpc_rule *rule) +{ + rule->sr_netid = LNET_NIDNET(LNET_NID_ANY); + rule->sr_from = LUSTRE_SP_ANY; + rule->sr_to = LUSTRE_SP_ANY; + rule->sr_padding = 0; + + get_default_flavor(&rule->sr_flvr); +} + +/* + * format: network[.direction]=flavor + */ +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule) +{ + char *flavor, *dir; + int rc; + + sptlrpc_rule_init(rule); + + flavor = strchr(param, '='); + if (flavor == NULL) { + CERROR("invalid param, no '='\n"); + RETURN(-EINVAL); + } + *flavor++ = '\0'; + + dir = strchr(param, '.'); + if (dir) + *dir++ = '\0'; + + /* 1.1 network */ + if (strcmp(param, "default")) { + rule->sr_netid = libcfs_str2net(param); + if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) { + CERROR("invalid network name: %s\n", param); + RETURN(-EINVAL); + } + } + + /* 1.2 direction */ + if (dir) { + if (!strcmp(dir, "mdt2ost")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "mdt2mdt")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_MDT; + } else if (!strcmp(dir, "cli2ost")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "cli2mdt")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_MDT; + } else { + CERROR("invalid rule dir segment: %s\n", dir); + RETURN(-EINVAL); + } + } + + /* 2.1 flavor */ + rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr); + if (rc) + RETURN(-EINVAL); + + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_parse_rule); + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset) +{ + LASSERT(rset->srs_nslot || + (rset->srs_nrule == 0 && rset->srs_rules == NULL)); + + if (rset->srs_nslot) { + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + sptlrpc_rule_set_init(rset); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_free); + +/* + * return 0 if the rule set could accomodate one more rule. 
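+ * The slot array grows in increments of 8: a larger array is allocated,
+ * existing rules are copied across and the old array is freed.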
+ */ +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *rules; + int nslot; + + might_sleep(); + + if (rset->srs_nrule < rset->srs_nslot) + return 0; + + nslot = rset->srs_nslot + 8; + + /* better use realloc() if available */ + OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules)); + if (rules == NULL) + return -ENOMEM; + + if (rset->srs_nrule) { + LASSERT(rset->srs_nslot && rset->srs_rules); + memcpy(rules, rset->srs_rules, + rset->srs_nrule * sizeof(*rset->srs_rules)); + + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + } + + rset->srs_rules = rules; + rset->srs_nslot = nslot; + return 0; +} + +static inline int rule_spec_dir(struct sptlrpc_rule *rule) +{ + return (rule->sr_from != LUSTRE_SP_ANY || + rule->sr_to != LUSTRE_SP_ANY); +} +static inline int rule_spec_net(struct sptlrpc_rule *rule) +{ + return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY)); +} +static inline int rule_match_dir(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to); +} +static inline int rule_match_net(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_netid == r2->sr_netid); +} + +/* + * merge @rule into @rset. + * the @rset slots might be expanded. + */ +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_rule *p = rset->srs_rules; + int spec_dir, spec_net; + int rc, n, match = 0; + + might_sleep(); + + spec_net = rule_spec_net(rule); + spec_dir = rule_spec_dir(rule); + + for (n = 0; n < rset->srs_nrule; n++) { + p = &rset->srs_rules[n]; + + /* test network match, if failed: + * - spec rule: skip rules which is also spec rule match, until + * we hit a wild rule, which means no more chance + * - wild rule: skip until reach the one which is also wild + * and matches + */ + if (!rule_match_net(p, rule)) { + if (spec_net) { + if (rule_spec_net(p)) + continue; + else + break; + } else { + continue; + } + } + + /* test dir match, same logic as net matching */ + if (!rule_match_dir(p, rule)) { + if (spec_dir) { + if (rule_spec_dir(p)) + continue; + else + break; + } else { + continue; + } + } + + /* find a match */ + match = 1; + break; + } + + if (match) { + LASSERT(n >= 0 && n < rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + /* remove this rule */ + if (n < rset->srs_nrule - 1) + memmove(&rset->srs_rules[n], + &rset->srs_rules[n + 1], + (rset->srs_nrule - n - 1) * + sizeof(*rule)); + rset->srs_nrule--; + } else { + /* override the rule */ + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + } + } else { + LASSERT(n >= 0 && n <= rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { + rc = sptlrpc_rule_set_expand(rset); + if (rc) + return rc; + + if (n < rset->srs_nrule) + memmove(&rset->srs_rules[n + 1], + &rset->srs_rules[n], + (rset->srs_nrule - n) * sizeof(*rule)); + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + rset->srs_nrule++; + } else { + CDEBUG(D_CONFIG, "ignore the unmatched deletion\n"); + } + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_merge); + +/** + * given from/to/nid, determine a matching flavor in ruleset. + * return 1 if a match found, otherwise return 0. 
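+ *
+ * The first matching rule wins; sptlrpc_rule_set_merge() keeps rules with
+ * a specific network or direction ahead of wildcard ones, so the most
+ * specific applicable rule is the one returned.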
+ */ +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + + if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) && + r->sr_netid != LNET_NIDNET(LNET_NID_ANY) && + LNET_NIDNET(nid) != r->sr_netid) + continue; + + if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY && + from != r->sr_from) + continue; + + if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && + to != r->sr_to) + continue; + + *sf = r->sr_flvr; + return 1; + } + + return 0; +} + +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n, + r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc); + } +} + +static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen, + struct sptlrpc_rule_set *tgt, + enum lustre_sec_part from, + enum lustre_sec_part to, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule_set *src[2] = { gen, tgt }; + struct sptlrpc_rule *rule; + int i, n, rc; + + might_sleep(); + + /* merge general rules firstly, then target-specific rules */ + for (i = 0; i < 2; i++) { + if (src[i] == NULL) + continue; + + for (n = 0; n < src[i]->srs_nrule; n++) { + rule = &src[i]->srs_rules[n]; + + if (from != LUSTRE_SP_ANY && + rule->sr_from != LUSTRE_SP_ANY && + rule->sr_from != from) + continue; + if (to != LUSTRE_SP_ANY && + rule->sr_to != LUSTRE_SP_ANY && + rule->sr_to != to) + continue; + + rc = sptlrpc_rule_set_merge(rset, rule); + if (rc) { + CERROR("can't merge: %d\n", rc); + return rc; + } + } + } + + return 0; +} + +/********************************** + * sptlrpc configuration support * + **********************************/ + +struct sptlrpc_conf_tgt { + struct list_head sct_list; + char sct_name[MAX_OBD_NAME]; + struct sptlrpc_rule_set sct_rset; +}; + +struct sptlrpc_conf { + struct list_head sc_list; + char sc_fsname[MTI_NAME_MAXLEN]; + unsigned int sc_modified; /* modified during updating */ + unsigned int sc_updated:1, /* updated copy from MGS */ + sc_local:1; /* local copy from target */ + struct sptlrpc_rule_set sc_rset; /* fs general rules */ + struct list_head sc_tgts; /* target-specific rules */ +}; + +static struct mutex sptlrpc_conf_lock; +static struct list_head sptlrpc_confs; + +static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; + + sptlrpc_rule_set_free(&conf->sc_rset); + + list_for_each_entry_safe(conf_tgt, conf_tgt_next, + &conf->sc_tgts, sct_list) { + sptlrpc_rule_set_free(&conf_tgt->sct_rset); + list_del(&conf_tgt->sct_list); + OBD_FREE_PTR(conf_tgt); + } + LASSERT(list_empty(&conf->sc_tgts)); + + conf->sc_updated = 0; + conf->sc_local = 0; +} + +static void sptlrpc_conf_free(struct sptlrpc_conf *conf) +{ + CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); + + sptlrpc_conf_free_rsets(conf); + list_del(&conf->sc_list); + OBD_FREE_PTR(conf); +} + +static +struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, + const char *name, + int create) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + if (strcmp(conf_tgt->sct_name, name) == 0) + return conf_tgt; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf_tgt); + if (conf_tgt) { + 
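+		/*
+		 * New target entry: record the name, start with an empty
+		 * rule set and link it under this filesystem's configuration.
+		 */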
strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name)); + sptlrpc_rule_set_init(&conf_tgt->sct_rset); + list_add(&conf_tgt->sct_list, &conf->sc_tgts); + } + + return conf_tgt; +} + +static +struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, + int create) +{ + struct sptlrpc_conf *conf; + + list_for_each_entry(conf, &sptlrpc_confs, sc_list) { + if (strcmp(conf->sc_fsname, fsname) == 0) + return conf; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf); + if (conf == NULL) + return NULL; + + if (strlcpy(conf->sc_fsname, fsname, sizeof(conf->sc_fsname)) >= + sizeof(conf->sc_fsname)) { + OBD_FREE_PTR(conf); + return NULL; + } + sptlrpc_rule_set_init(&conf->sc_rset); + INIT_LIST_HEAD(&conf->sc_tgts); + list_add(&conf->sc_list, &sptlrpc_confs); + + CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname); + return conf; +} + +/** + * caller must hold conf_lock already. + */ +static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, + const char *target, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_conf_tgt *conf_tgt; + struct sptlrpc_rule_set *rule_set; + + /* fsname == target means general rules for the whole fs */ + if (strcmp(conf->sc_fsname, target) == 0) { + rule_set = &conf->sc_rset; + } else { + conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1); + if (conf_tgt) { + rule_set = &conf_tgt->sct_rset; + } else { + CERROR("out of memory, can't merge rule!\n"); + return -ENOMEM; + } + } + + return sptlrpc_rule_set_merge(rule_set, rule); +} + +/** + * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we + * find one through the target name in the record inside conf_lock; + * otherwise means caller already hold conf_lock. + */ +static int __sptlrpc_process_config(struct lustre_cfg *lcfg, + struct sptlrpc_conf *conf) +{ + char *target, *param; + char fsname[MTI_NAME_MAXLEN]; + struct sptlrpc_rule rule; + int rc; + + ENTRY; + print_lustre_cfg(lcfg); + + target = lustre_cfg_string(lcfg, 1); + if (target == NULL) { + CERROR("missing target name\n"); + RETURN(-EINVAL); + } + + param = lustre_cfg_string(lcfg, 2); + if (param == NULL) { + CERROR("missing parameter\n"); + RETURN(-EINVAL); + } + + CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param); + + /* parse rule to make sure the format is correct */ + if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) { + CERROR("Invalid sptlrpc parameter: %s\n", param); + RETURN(-EINVAL); + } + param += sizeof(PARAM_SRPC_FLVR) - 1; + + rc = sptlrpc_parse_rule(param, &rule); + if (rc) + RETURN(-EINVAL); + + if (conf == NULL) { + obdname2fsname(target, fsname, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("can't find conf\n"); + rc = -ENOMEM; + } else { + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + mutex_unlock(&sptlrpc_conf_lock); + } else { + LASSERT(mutex_is_locked(&sptlrpc_conf_lock)); + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + + if (rc == 0) + conf->sc_modified++; + + RETURN(rc); +} + +int sptlrpc_process_config(struct lustre_cfg *lcfg) +{ + return __sptlrpc_process_config(lcfg, NULL); +} +EXPORT_SYMBOL(sptlrpc_process_config); + +static int logname2fsname(const char *logname, char *buf, int buflen) +{ + char *ptr; + int len; + + ptr = strrchr(logname, '-'); + if (ptr == NULL || strcmp(ptr, "-sptlrpc")) { + CERROR("%s is not a sptlrpc config log\n", logname); + return -EINVAL; + } + + len = min((int) (ptr - logname), buflen - 1); + + memcpy(buf, logname, len); + buf[len] = '\0'; + return 0; +} + 
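+/*
+ * Worked example (illustration only, not used by the code): how a
+ * parameter such as "tcp0.cli2mdt=plain-hash:sha1" is decomposed by
+ * sptlrpc_parse_rule() and sptlrpc_parse_flavor() earlier in this file.
+ * The network name "tcp0" and the algorithm "sha1" are made-up values;
+ * error handling (a missing '=', for instance) is omitted.  Compiled
+ * stand-alone in user space this prints the pieces the kernel code
+ * extracts:
+ *
+ *	#include <stdio.h>
+ *	#include <string.h>
+ *
+ *	int main(void)
+ *	{
+ *		char param[] = "tcp0.cli2mdt=plain-hash:sha1";
+ *		char *flavor, *dir, *bulk, *alg;
+ *
+ *		flavor = strchr(param, '=');	// network[.direction]=flavor
+ *		*flavor++ = '\0';
+ *		dir = strchr(param, '.');
+ *		if (dir)
+ *			*dir++ = '\0';
+ *
+ *		bulk = strchr(flavor, '-');	// base[-bulk-spec]
+ *		if (bulk)
+ *			*bulk++ = '\0';
+ *		alg = bulk ? strchr(bulk, ':') : NULL;	// "hash:<alg>"
+ *		if (alg)
+ *			*alg++ = '\0';
+ *
+ *		// prints: net=tcp0 dir=cli2mdt base=plain bulk=hash alg=sha1
+ *		printf("net=%s dir=%s base=%s bulk=%s alg=%s\n",
+ *		       param, dir ? dir : "any", flavor,
+ *		       bulk ? bulk : "-", alg ? alg : "-");
+ *		return 0;
+ *	}
+ */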
+void sptlrpc_conf_log_update_begin(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + if (conf->sc_local) { + LASSERT(conf->sc_updated == 0); + sptlrpc_conf_free_rsets(conf); + } + conf->sc_modified = 0; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_begin); + +/** + * mark a config log has been updated + */ +void sptlrpc_conf_log_update_end(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + /* + * if original state is not updated, make sure the + * modified counter > 0 to enforce updating local copy. + */ + if (conf->sc_updated == 0) + conf->sc_modified++; + + conf->sc_updated = 1; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_end); + +void sptlrpc_conf_log_start(const char *logname) +{ + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + sptlrpc_conf_get(fsname, 1); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_start); + +void sptlrpc_conf_log_stop(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf) + sptlrpc_conf_free(conf); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_stop); + +static void inline flavor_set_flags(struct sptlrpc_flavor *sf, + enum lustre_sec_part from, + enum lustre_sec_part to, + unsigned int fl_udesc) +{ + /* + * null flavor doesn't need to set any flavor, and in fact + * we'd better not do that because everybody share a single sec. 
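+	 * For other flavors the flags set below are: an MDT source gets
+	 * PTLRPC_SEC_FL_ROOTONLY, client->OST gets ROOTONLY plus
+	 * PTLRPC_SEC_FL_BULK, and client->MDT gets PTLRPC_SEC_FL_UDESC when
+	 * user descriptors are requested, subject to the flavor-based
+	 * adjustments at the end of this function.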
+ */ + if (sf->sf_rpc == SPTLRPC_FLVR_NULL) + return; + + if (from == LUSTRE_SP_MDT) { + /* MDT->MDT; MDT->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { + /* CLI->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { + /* CLI->MDT */ + if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) + sf->sf_flags |= PTLRPC_SEC_FL_UDESC; + } + + /* Some flavors use a single uid (0) context */ + if (flvr_is_rootonly(sf->sf_rpc)) + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + + /* User descriptor might need to be cleared */ + if (flvr_allows_user_desc(sf->sf_rpc) == 0) + sf->sf_flags &= ~PTLRPC_SEC_FL_UDESC; +} + +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + char name[MTI_NAME_MAXLEN]; + int len, rc = 0; + + obd_uuid2fsname(name, target->uuid, sizeof(name)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(name, 0); + if (conf == NULL) + goto out; + + /* convert uuid name (supposed end with _UUID) to target name */ + len = strlen(target->uuid); + LASSERT(len > 5); + memcpy(name, target->uuid, len - 5); + name[len - 5] = '\0'; + + conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); + if (conf_tgt) { + rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, + from, to, nid, sf); + if (rc) + goto out; + } + + rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf); +out: + mutex_unlock(&sptlrpc_conf_lock); + + if (rc == 0) + get_default_flavor(sf); + + flavor_set_flags(sf, from, to, 1); +} + +/** + * called by target devices, determine the expected flavor from + * certain peer (from, nid). + */ +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0) + get_default_flavor(sf); +} + +#define SEC_ADAPT_DELAY (10) + +/** + * called by client devices, notify the sptlrpc config has changed and + * do import_sec_adapt later. + */ +void sptlrpc_conf_client_adapt(struct obd_device *obd) +{ + struct obd_import *imp; + ENTRY; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) == 0); + CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); + + /* serialize with connect/disconnect import */ + down_read_nested(&obd->u.cli.cl_sem, OBD_CLI_SEM_MDCOSC); + + imp = obd->u.cli.cl_import; + if (imp) { + spin_lock(&imp->imp_lock); + if (imp->imp_sec) + imp->imp_sec_expire = ktime_get_real_seconds() + + SEC_ADAPT_DELAY; + spin_unlock(&imp->imp_lock); + } + + up_read(&obd->u.cli.cl_sem); + EXIT; +} +EXPORT_SYMBOL(sptlrpc_conf_client_adapt); + +/** + * called by target devices, extract sptlrpc rules which applies to + * this target, to be used for future rpc flavor checking. 
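+ *
+ * Only MDT and OST devices are expected here.  Filesystem-wide rules are
+ * merged first and target-specific rules then override them, see
+ * sptlrpc_rule_set_extract().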
+ */ +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + enum lustre_sec_part sp_dst; + char fsname[MTI_NAME_MAXLEN]; + int rc = 0; + ENTRY; + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + sp_dst = LUSTRE_SP_MDT; + } else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) { + sp_dst = LUSTRE_SP_OST; + } else { + CERROR("unexpected obd type %s\n", obd->obd_type->typ_name); + RETURN(-EINVAL); + } + + obd_uuid2fsname(fsname, obd->obd_uuid.uuid, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("missing sptlrpc config log\n"); + rc = -EFAULT; + } else { + /* extract rule set for this target */ + conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0); + + rc = sptlrpc_rule_set_extract(&conf->sc_rset, + conf_tgt ? &conf_tgt->sct_rset : NULL, + LUSTRE_SP_ANY, sp_dst, rset); + } + mutex_unlock(&sptlrpc_conf_lock); + + RETURN(rc); +} + +int sptlrpc_conf_init(void) +{ + INIT_LIST_HEAD(&sptlrpc_confs); + mutex_init(&sptlrpc_conf_lock); + return 0; +} + +void sptlrpc_conf_fini(void) +{ + struct sptlrpc_conf *conf, *conf_next; + + mutex_lock(&sptlrpc_conf_lock); + list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) + sptlrpc_conf_free(conf); + LASSERT(list_empty(&sptlrpc_confs)); + mutex_unlock(&sptlrpc_conf_lock); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c new file mode 100644 index 0000000000000..dc9f38c7036ba --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -0,0 +1,89 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. 
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include
+#include
+#include
+#include
+
+#include "ptlrpc_internal.h"
+
+/* refine later and change to seqlock or similar from libcfs */
+/* Debugging check only needed during development */
+#ifdef OBD_CTXT_DEBUG
+# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
+#else
+# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
+#endif
+
+/* push / pop to root of obd store */
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx)
+{
+	/* if there is underlying dt_device then push_ctxt is not needed */
+	if (new_ctx->dt != NULL)
+		return;
+
+	ASSERT_CTXT_MAGIC(new_ctx->magic);
+	OBD_SET_CTXT_MAGIC(save);
+
+	LASSERT(ll_d_count(current->fs->pwd.dentry));
+	LASSERT(ll_d_count(new_ctx->pwd));
+	save->pwd = dget(current->fs->pwd.dentry);
+	save->pwdmnt = mntget(current->fs->pwd.mnt);
+	save->umask = current_umask();
+
+	LASSERT(save->pwd);
+	LASSERT(save->pwdmnt);
+	LASSERT(new_ctx->pwd);
+	LASSERT(new_ctx->pwdmnt);
+
+	current->fs->umask = 0; /* umask already applied on client */
+	ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
+}
+EXPORT_SYMBOL(push_ctxt);
+
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx)
+{
+	/* if there is underlying dt_device then pop_ctxt is not needed */
+	if (new_ctx->dt != NULL)
+		return;
+
+	ASSERT_CTXT_MAGIC(saved->magic);
+
+	LASSERTF(current->fs->pwd.dentry == new_ctx->pwd, "%p != %p\n",
+		 current->fs->pwd.dentry, new_ctx->pwd);
+	LASSERTF(current->fs->pwd.mnt == new_ctx->pwdmnt, "%p != %p\n",
+		 current->fs->pwd.mnt, new_ctx->pwdmnt);
+
+	ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
+
+	dput(saved->pwd);
+	mntput(saved->pwdmnt);
+	current->fs->umask = saved->umask;
+}
+EXPORT_SYMBOL(pop_ctxt);
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c
new file mode 100644
index 0000000000000..042a632390cfe
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c
@@ -0,0 +1,207 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2016, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ * + * lustre/ptlrpc/sec_gc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +#define SEC_GC_INTERVAL (30 * 60) + +static struct mutex sec_gc_mutex; +static spinlock_t sec_gc_list_lock; +static struct list_head sec_gc_list; + +static spinlock_t sec_gc_ctx_list_lock; +static struct list_head sec_gc_ctx_list; + +static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); + +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + LASSERT(sec->ps_gc_interval > 0); + LASSERT(list_empty(&sec->ps_gc_list)); + + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; + + spin_lock(&sec_gc_list_lock); + list_add_tail(&sec->ps_gc_list, &sec_gc_list); + spin_unlock(&sec_gc_list_lock); + + CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} + +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) +{ + if (list_empty(&sec->ps_gc_list)) + return; + + might_sleep(); + + /* signal before list_del to make iteration in gc thread safe */ + atomic_inc(&sec_gc_wait_del); + + spin_lock(&sec_gc_list_lock); + list_del_init(&sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + /* barrier */ + mutex_lock(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); + + atomic_dec(&sec_gc_wait_del); + + CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} + +static void sec_gc_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(sec_gc_work, sec_gc_main); + +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(list_empty(&ctx->cc_gc_chain)); + + CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + spin_lock(&sec_gc_ctx_list_lock); + list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); + spin_unlock(&sec_gc_ctx_list_lock); + + mod_delayed_work(system_wq, &sec_gc_work, 0); +} +EXPORT_SYMBOL(sptlrpc_gc_add_ctx); + +static void sec_process_ctx_list(void) +{ + struct ptlrpc_cli_ctx *ctx; + + spin_lock(&sec_gc_ctx_list_lock); + + while (!list_empty(&sec_gc_ctx_list)) { + ctx = list_entry(sec_gc_ctx_list.next, + struct ptlrpc_cli_ctx, cc_gc_chain); + list_del_init(&ctx->cc_gc_chain); + spin_unlock(&sec_gc_ctx_list_lock); + + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) == 1); + CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + sptlrpc_cli_ctx_put(ctx, 1); + + spin_lock(&sec_gc_ctx_list_lock); + } + + spin_unlock(&sec_gc_ctx_list_lock); +} + +static void sec_do_gc(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + + if (unlikely(sec->ps_gc_next == 0)) { + CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n", + sec, sec->ps_policy->sp_name); + return; + } + + CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name); + + if (sec->ps_gc_next > ktime_get_real_seconds()) + return; + + sec->ps_policy->sp_cops->gc_ctx(sec); + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; +} + +static void sec_gc_main(struct work_struct *ws) +{ + struct ptlrpc_sec *sec; + + sec_process_ctx_list(); +again: + /* go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. 
+ * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time + */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* if someone is waiting to be deleted, let it + * proceed as soon as possible. + */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; + } + + sec_do_gc(sec); + } + mutex_unlock(&sec_gc_mutex); + + /* check ctx list again before sleep */ + sec_process_ctx_list(); + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); +} + +int sptlrpc_gc_init(void) +{ + mutex_init(&sec_gc_mutex); + spin_lock_init(&sec_gc_list_lock); + spin_lock_init(&sec_gc_ctx_list_lock); + + INIT_LIST_HEAD(&sec_gc_list); + INIT_LIST_HEAD(&sec_gc_ctx_list); + + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); + return 0; +} + +void sptlrpc_gc_fini(void) +{ + cancel_delayed_work_sync(&sec_gc_work); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c new file mode 100644 index 0000000000000..4f8efe44aa678 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c @@ -0,0 +1,283 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_lproc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + + +struct proc_dir_entry *sptlrpc_proc_root = NULL; +EXPORT_SYMBOL(sptlrpc_proc_root); + +static char *sec_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} + +static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + char str[32]; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_LWP_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_OSP_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)); + + seq_printf(seq, "rpc flavor: %s\n", + sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc)); + seq_printf(seq, "bulk flavor: %s\n", + sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str))); + seq_printf(seq, "flags: %s\n", + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str))); + seq_printf(seq, "id: %d\n", sec->ps_id); + seq_printf(seq, "refcount: %d\n", + atomic_read(&sec->ps_refcount)); + seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx)); + seq_printf(seq, "gc internal %lld\n", sec->ps_gc_interval); + seq_printf(seq, "gc next %lld\n", + sec->ps_gc_interval ? 
+ (s64)(sec->ps_gc_next - ktime_get_real_seconds()) : 0ll); + + sptlrpc_sec_put(sec); +out: + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_LWP_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_OSP_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + if (sec->ps_policy->sp_cops->display) + sec->ps_policy->sp_cops->display(sec, seq); + + sptlrpc_sec_put(sec); +out: + return 0; +} + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +static ssize_t +ldebugfs_sptlrpc_sepol_seq_write(struct file *file, const char __user *buffer, + size_t count, void *data) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct obd_import *imp = cli->cl_import; + struct sepol_downcall_data *param; + int size = sizeof(*param); + int rc = 0; + + if (count < size) { + CERROR("%s: invalid data count = %lu, size = %d\n", + dev->obd_name, (unsigned long) count, size); + return -EINVAL; + } + + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + CERROR("%s: bad sepol data\n", dev->obd_name); + GOTO(out, rc = -EFAULT); + } + + if (param->sdd_magic != SEPOL_DOWNCALL_MAGIC) { + CERROR("%s: sepol downcall bad params\n", + dev->obd_name); + GOTO(out, rc = -EINVAL); + } + + if (param->sdd_sepol_len == 0 || + param->sdd_sepol_len >= sizeof(imp->imp_sec->ps_sepol)) { + CERROR("%s: invalid sepol data returned\n", + dev->obd_name); + GOTO(out, rc = -EINVAL); + } + rc = param->sdd_sepol_len; /* save sdd_sepol_len */ + OBD_FREE(param, size); + size = offsetof(struct sepol_downcall_data, + sdd_sepol[rc]); + + /* alloc again with real size */ + rc = 0; + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + CERROR("%s: bad sepol data\n", dev->obd_name); + GOTO(out, rc = -EFAULT); + } + + spin_lock(&imp->imp_sec->ps_lock); + snprintf(imp->imp_sec->ps_sepol, param->sdd_sepol_len + 1, "%s", + param->sdd_sepol); + imp->imp_sec->ps_sepol_mtime = ktime_set(param->sdd_sepol_mtime, 0); + spin_unlock(&imp->imp_sec->ps_lock); + +out: + if (param != NULL) + OBD_FREE(param, size); + + return rc ? 
rc : count; +} +LDEBUGFS_FOPS_WR_ONLY(srpc, sptlrpc_sepol); + +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) +{ + int rc; + + if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_LWP_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_OSP_NAME) != 0) { + CERROR("can't register lproc for obd type %s\n", + dev->obd_type->typ_name); + return -EINVAL; + } + + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_info", 0444, + &sptlrpc_info_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_info for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_contexts", + 0444, &sptlrpc_ctxs_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_contexts for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_sepol", + 0200, &srpc_sptlrpc_sepol_fops, dev); + if (rc) { + CERROR("create proc entry srpc_sepol for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); + +LPROC_SEQ_FOPS_RO(sptlrpc_proc_enc_pool); +static struct lprocfs_vars sptlrpc_lprocfs_vars[] = { + { .name = "encrypt_page_pools", + .fops = &sptlrpc_proc_enc_pool_fops }, + { NULL } +}; + +int sptlrpc_lproc_init(void) +{ + int rc; + + LASSERT(sptlrpc_proc_root == NULL); + + sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root, + sptlrpc_lprocfs_vars, NULL); + if (IS_ERR(sptlrpc_proc_root)) { + rc = PTR_ERR(sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + return rc; + } + return 0; +} + +void sptlrpc_lproc_fini(void) +{ + if (sptlrpc_proc_root) { + lprocfs_remove(&sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + } +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c new file mode 100644 index 0000000000000..a17a4e182233e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c @@ -0,0 +1,447 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_null.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static struct ptlrpc_sec_policy null_policy; +static struct ptlrpc_sec null_sec; +static struct ptlrpc_cli_ctx null_cli_ctx; +static struct ptlrpc_svc_ctx null_svc_ctx; + +/* + * we can temporarily use the topmost 8-bits of lm_secflvr to identify + * the source sec part. + */ +static inline +void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) +{ + msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24; +} + +static inline +enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) +{ + return (msg->lm_secflvr >> 24) & 0xFF; +} + +static +int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + + if (!req->rq_import->imp_dlm_fake) { + struct obd_device *obd = req->rq_import->imp_obd; + null_encode_sec_part(req->rq_reqbuf, + obd->u.cli.cl_sp_me); + } + req->rq_reqdata_len = req->rq_reqlen; + return 0; +} + +static +int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + __u32 cksums, cksumc; + + LASSERT(req->rq_repdata); + + req->rq_repmsg = req->rq_repdata; + req->rq_replen = req->rq_repdata_len; + + if (req->rq_early) { + cksums = lustre_msg_get_cksum(req->rq_repdata); + cksumc = lustre_msg_calc_cksum(req->rq_repmsg); + + if (cksumc != cksums) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cksumc, cksums); + return -EINVAL; + } + } + + return 0; +} + +static +struct ptlrpc_sec *null_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); + + /* general layer has take a module reference for us, because we never + * really destroy the sec, simply release the reference here. 
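+	 * Note: null_sec is a static singleton (set up in
+	 * null_init_internal() below), so every import using the null
+	 * flavor shares it and nothing is allocated per import here.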
+ */ + sptlrpc_policy_put(&null_policy); + return &null_sec; +} + +static +void null_destroy_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec == &null_sec); +} + +static +struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + atomic_inc(&null_cli_ctx.cc_refcount); + return &null_cli_ctx; +} + +static +int null_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + return 0; +} + +static +int null_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + if (!req->rq_reqbuf) { + int alloc_size = size_roundup_power2(msgsize); + + LASSERT(!req->rq_pool); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size); + if (!req->rq_reqbuf) + return -ENOMEM; + + req->rq_reqbuf_len = alloc_size; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= msgsize); + memset(req->rq_reqbuf, 0, msgsize); + } + + req->rq_reqmsg = req->rq_reqbuf; + return 0; +} + +static +void null_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + if (!req->rq_pool) { + LASSERTF(req->rq_reqmsg == req->rq_reqbuf, + "req %p: reqmsg %p is not reqbuf %p in null sec\n", + req, req->rq_reqmsg, req->rq_reqbuf); + LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen, + "req %p: reqlen %d should smaller than buflen %d\n", + req, req->rq_reqlen, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } +} + +static +int null_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + /* add space for early replied */ + msgsize += lustre_msg_early_size(); + + msgsize = size_roundup_power2(msgsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, msgsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = msgsize; + return 0; +} + +static +void null_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; +} + +static +int null_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + struct lustre_msg *oldbuf = req->rq_reqmsg; + int oldsize, newmsg_size, alloc_size; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf == req->rq_reqmsg); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf)); + + /* compute new message size */ + oldsize = req->rq_reqbuf->lm_buflens[segment]; + req->rq_reqbuf->lm_buflens[segment] = newsize; + newmsg_size = lustre_packed_msg_size(oldbuf); + req->rq_reqbuf->lm_buflens[segment] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size); + + if (req->rq_reqbuf_len < newmsg_size) { + alloc_size = size_roundup_power2(newmsg_size); + + OBD_ALLOC_LARGE(newbuf, alloc_size); + if (newbuf == NULL) + return -ENOMEM; + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. 
See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = req->rq_reqmsg = newbuf; + req->rq_reqbuf_len = alloc_size; + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + return 0; +} + +static struct ptlrpc_svc_ctx null_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &null_policy, +}; + +static +int null_accept(struct ptlrpc_request *req) +{ + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_NULL); + + if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { + CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf); + + req->rq_reqmsg = req->rq_reqbuf; + req->rq_reqlen = req->rq_reqdata_len; + + req->rq_svc_ctx = &null_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int null_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + int rs_size = sizeof(*rs) + msgsize; + + LASSERT(msgsize % 8 == 0); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + rs->rs_msg = rs->rs_repbuf; + + req->rq_reply_state = rs; + return 0; +} + +static +void null_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int null_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + + LASSERT(rs); + + rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + rs->rs_repdata_len = req->rq_replen; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = lustre_msg_early_size(); + else + req->rq_reply_off = 0; + } else { + __u32 cksum; + + cksum = lustre_msg_calc_cksum(rs->rs_repbuf); + lustre_msg_set_cksum(rs->rs_repbuf, cksum); + req->rq_reply_off = 0; + } + + return 0; +} + +static struct ptlrpc_ctx_ops null_ctx_ops = { + .sign = null_ctx_sign, + .verify = null_ctx_verify, +}; +static struct ptlrpc_sec_cops null_sec_cops = { + .create_sec = null_create_sec, + .destroy_sec = null_destroy_sec, + .lookup_ctx = null_lookup_ctx, + .flush_ctx_cache = null_flush_ctx_cache, + .alloc_reqbuf = null_alloc_reqbuf, + .alloc_repbuf = null_alloc_repbuf, + .free_reqbuf = null_free_reqbuf, + .free_repbuf = null_free_repbuf, + .enlarge_reqbuf = null_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops null_sec_sops = { + .accept = null_accept, + .alloc_rs = null_alloc_rs, + .authorize = null_authorize, + .free_rs = null_free_rs, +}; + +static struct ptlrpc_sec_policy null_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "sec.null", + .sp_policy = SPTLRPC_POLICY_NULL, + .sp_cops = &null_sec_cops, + .sp_sops = &null_sec_sops, +}; + +static void 
null_init_internal(void) +{ + static HLIST_HEAD(__list); + + null_sec.ps_policy = &null_policy; + atomic_set(&null_sec.ps_refcount, 1); /* always busy */ + null_sec.ps_id = -1; + null_sec.ps_import = NULL; + null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; + null_sec.ps_flvr.sf_flags = 0; + null_sec.ps_part = LUSTRE_SP_ANY; + null_sec.ps_dying = 0; + spin_lock_init(&null_sec.ps_lock); + atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */ + INIT_LIST_HEAD(&null_sec.ps_gc_list); + null_sec.ps_gc_interval = 0; + null_sec.ps_gc_next = 0; + + hlist_add_head(&null_cli_ctx.cc_cache, &__list); + atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */ + null_cli_ctx.cc_sec = &null_sec; + null_cli_ctx.cc_ops = &null_ctx_ops; + null_cli_ctx.cc_expire = 0; + null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL | + PTLRPC_CTX_UPTODATE; + null_cli_ctx.cc_vcred.vc_uid = 0; + spin_lock_init(&null_cli_ctx.cc_lock); + INIT_LIST_HEAD(&null_cli_ctx.cc_req_list); + INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain); +} + +int sptlrpc_null_init(void) +{ + int rc; + + null_init_internal(); + + rc = sptlrpc_register_policy(&null_policy); + if (rc) + CERROR("failed to register %s: %d\n", null_policy.sp_name, rc); + + return rc; +} + +void sptlrpc_null_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&null_policy); + if (rc) + CERROR("failed to unregister %s: %d\n", null_policy.sp_name,rc); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c new file mode 100644 index 0000000000000..dea70d160b54e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c @@ -0,0 +1,1034 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_plain.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +struct plain_sec { + struct ptlrpc_sec pls_base; + rwlock_t pls_lock; + struct ptlrpc_cli_ctx *pls_ctx; +}; + +static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct plain_sec, pls_base); +} + +static struct ptlrpc_sec_policy plain_policy; +static struct ptlrpc_ctx_ops plain_ctx_ops; +static struct ptlrpc_svc_ctx plain_svc_ctx; + +static unsigned int plain_at_offset; + +/* + * for simplicity, plain policy rpc use fixed layout. 
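+ * Every plain request/reply carries PLAIN_PACK_SEGMENTS buffers, in
+ * order: the plain_header, the embedded lustre_msg, an optional user
+ * descriptor and an optional bulk security descriptor (see the
+ * PLAIN_PACK_*_OFF offsets below).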
+ */ +#define PLAIN_PACK_SEGMENTS (4) + +#define PLAIN_PACK_HDR_OFF (0) +#define PLAIN_PACK_MSG_OFF (1) +#define PLAIN_PACK_USER_OFF (2) +#define PLAIN_PACK_BULK_OFF (3) + +#define PLAIN_FL_USER (0x01) +#define PLAIN_FL_BULK (0x02) + +struct plain_header { + __u8 ph_ver; /* 0 */ + __u8 ph_flags; + __u8 ph_sp; /* source */ + __u8 ph_bulk_hash_alg; /* complete flavor desc */ + __u8 ph_pad[4]; +}; + +struct plain_bulk_token { + __u8 pbt_hash[8]; +}; + +#define PLAIN_BSD_SIZE \ + (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) + +/**************************************** + * bulk checksum helpers * + ****************************************/ + +static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + + if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed)) + return -EPROTO; + + bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); + if (bsd == NULL) { + CERROR("bulk sec desc has short size %d\n", + lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); + return -EPROTO; + } + + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { + CERROR("invalid bulk svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} + +static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *token) +{ + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); + return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, + sizeof(token->pbt_hash)); +} + +static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *tokenr) +{ + struct plain_bulk_token tokenv; + int rc; + + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); + rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, + sizeof(tokenv.pbt_hash)); + if (rc) + return rc; + + if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) + return -EACCES; + return 0; +} + +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + char *ptr; + unsigned int off, i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + for (i = 0; i < desc->bd_iov_count; i++) { + if (BD_GET_KIOV(desc, i).kiov_len == 0) + continue; + + ptr = kmap(BD_GET_KIOV(desc, i).kiov_page); + off = BD_GET_KIOV(desc, i).kiov_offset & ~PAGE_MASK; + ptr[off] ^= 0x1; + kunmap(BD_GET_KIOV(desc, i).kiov_page); + return; + } +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) +{ + return 0; +} + +static +int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + ENTRY; + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_sp = ctx->cc_sec->ps_part; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_udesc) + phdr->ph_flags |= PLAIN_FL_USER; + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, + msg->lm_buflens); + RETURN(0); +} + +static +int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + 
struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; + bool swabbed; + + ENTRY; + if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { + CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + swabbed = ptlrpc_rep_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + /* expect no user desc in reply */ + if (phdr->ph_flags & PLAIN_FL_USER) { + CERROR("Unexpected udesc flag in reply\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { + CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, + req->rq_flvr.u_bulk.hash.hash_alg); + RETURN(-EPROTO); + } + + if (unlikely(req->rq_early)) { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&cksum, &hsize); + if (cksum != msg->lm_cksum) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cpu_to_le32(cksum), msg->lm_cksum); + RETURN(-EINVAL); + } + } else { + /* whether we sent with bulk or not, we expect the same + * in reply, except for early reply */ + if (!req->rq_early && + !equi(req->rq_pack_bulk == 1, + phdr->ph_flags & PLAIN_FL_BULK)) { + CERROR("%s bulk checksum in reply\n", + req->rq_pack_bulk ? "Missing" : "Unexpected"); + RETURN(-EPROTO); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(-EPROTO); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); + RETURN(0); +} + +static +int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsd; + struct plain_bulk_token *token; + int rc; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + token = (struct plain_bulk_token *) bsd->bsd_data; + + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + if (req->rq_bulk_read) + RETURN(0); + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + token); + if (rc) { + CERROR("bulk write: failed to compute checksum: %d\n", rc); + } else { + /* + * for sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && + req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) + token->pbt_hash[0] ^= 0x1; + } + + return rc; +} + +static +int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsdv; + struct plain_bulk_token *tokenv; + int rc; + int i, nob; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + if 
(req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) + return -EIO; + return 0; + } + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (BD_GET_KIOV(desc, i).kiov_len + + nob > desc->bd_nob_transferred) { + BD_GET_KIOV(desc, i).kiov_len = + desc->bd_nob_transferred - nob; + } + nob += BD_GET_KIOV(desc, i).kiov_len; + } + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) + CERROR("bulk read: client verify failed: %d\n", rc); + + return rc; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec) +{ + struct ptlrpc_cli_ctx *ctx, *ctx_new; + + OBD_ALLOC_PTR(ctx_new); + + write_lock(&plsec->pls_lock); + + ctx = plsec->pls_ctx; + if (ctx) { + atomic_inc(&ctx->cc_refcount); + + if (ctx_new) + OBD_FREE_PTR(ctx_new); + } else if (ctx_new) { + ctx = ctx_new; + + atomic_set(&ctx->cc_refcount, 1); /* for cache */ + ctx->cc_sec = &plsec->pls_base; + ctx->cc_ops = &plain_ctx_ops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE; + ctx->cc_vcred.vc_uid = 0; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + plsec->pls_ctx = ctx; + atomic_inc(&plsec->pls_base.ps_nctx); + atomic_inc(&plsec->pls_base.ps_refcount); + + atomic_inc(&ctx->cc_refcount); /* for caller */ + } + + write_unlock(&plsec->pls_lock); + + return ctx; +} + +static +void plain_destroy_sec(struct ptlrpc_sec *sec) +{ + struct plain_sec *plsec = sec2plsec(sec); + ENTRY; + + LASSERT(sec->ps_policy == &plain_policy); + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + LASSERT(plsec->pls_ctx == NULL); + + class_import_put(sec->ps_import); + + OBD_FREE_PTR(plsec); + EXIT; +} + +static +void plain_kill_sec(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +static +struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + struct plain_sec *plsec; + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); + + OBD_ALLOC_PTR(plsec); + if (plsec == NULL) + RETURN(NULL); + + /* + * initialize plain_sec + */ + rwlock_init(&plsec->pls_lock); + plsec->pls_ctx = NULL; + + sec = &plsec->pls_base; + sec->ps_policy = &plain_policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_import = class_import_get(imp); + sec->ps_flvr = *sf; + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_gc_interval = 0; + sec->ps_gc_next = 0; + + /* install ctx immediately if this is a reverse sec */ + if (svc_ctx) { + ctx = plain_sec_install_ctx(plsec); + if (ctx == NULL) { + plain_destroy_sec(sec); + RETURN(NULL); + } + sptlrpc_cli_ctx_put(ctx, 1); + } + + RETURN(sec); +} + +static +struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + read_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + if (ctx) + atomic_inc(&ctx->cc_refcount); + read_unlock(&plsec->pls_lock); + + if (unlikely(ctx == NULL)) + ctx = plain_sec_install_ctx(plsec); + + RETURN(ctx); +} + +static +void plain_release_ctx(struct ptlrpc_sec *sec, 
+ struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + OBD_FREE_PTR(ctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +int plain_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + /* do nothing unless caller want to flush for 'all' */ + if (uid != -1) + RETURN(0); + + write_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + plsec->pls_ctx = NULL; + write_unlock(&plsec->pls_lock); + + if (ctx) + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +static +int plain_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_udesc) + buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size(); + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + if (!req->rq_reqbuf) { + LASSERT(!req->rq_pool); + + alloc_len = size_roundup_power2(alloc_len); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = alloc_len; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= alloc_len); + memset(req->rq_reqbuf, 0, alloc_len); + } + + lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF); + + RETURN(0); +} + +static +void plain_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + EXIT; +} + +static +int plain_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + /* add space for early reply */ + alloc_len += plain_at_offset; + + alloc_len = size_roundup_power2(alloc_len); + + OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len); + if (!req->rq_repbuf) + RETURN(-ENOMEM); + + req->rq_repbuf_len = alloc_len; + RETURN(0); +} + +static +void plain_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + EXIT; +} + +static +int plain_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int oldsize; + int newmsg_size, newbuf_size; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) == + req->rq_reqmsg); + + /* compute new embedded msg size. 
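+	 * (lm_buflens[segment] is temporarily set to the new size so that
+	 * lustre_msg_size_v2() returns the enlarged total, then restored.)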
*/ + oldsize = req->rq_reqmsg->lm_buflens[segment]; + req->rq_reqmsg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount, + req->rq_reqmsg->lm_buflens); + req->rq_reqmsg->lm_buflens[segment] = oldsize; + + /* compute new wrapper msg size. */ + oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF]; + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size; + newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount, + req->rq_reqbuf->lm_buflens); + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, + PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, + newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +/**************************************** + * service apis * + ****************************************/ + +static struct ptlrpc_svc_ctx plain_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &plain_policy, +}; + +static int plain_accept(struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + bool swabbed; + + ENTRY; + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); + + if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || + SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { + CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { + CERROR("unexpected request buf count %u\n", msg->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { + CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); + RETURN(-EPROTO); + } + + req->rq_sp_from = phdr->ph_sp; + req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; + + if (phdr->ph_flags & PLAIN_FL_USER) { + if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF, + swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(SECSVC_DROP); + + 
req->rq_pack_bulk = 1; + } + + req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; + + req->rq_svc_ctx = &plain_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + RETURN(SECSVC_OK); +} + +static +int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rs_size = sizeof(*rs); + ENTRY; + + LASSERT(msgsize % 8 == 0); + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + + rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + + lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0); + + req->rq_reply_state = rs; + RETURN(0); +} + +static +void plain_free_rs(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); + EXIT; +} + +static +int plain_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct lustre_msg_v2 *msg = rs->rs_repbuf; + struct plain_header *phdr; + int len; + ENTRY; + + LASSERT(rs); + LASSERT(msg); + + if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF]) + len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF, + req->rq_replen, 1); + else + len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + rs->rs_repdata_len = len; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = plain_at_offset; + else + req->rq_reply_off = 0; + } else { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize); + req->rq_reply_off = 0; + } + + RETURN(0); +} + +static +int plain_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr; + int rc; + + LASSERT(req->rq_bulk_write); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + 
tokenr); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk write: server verify failed: %d\n", rc); + } + + return rc; +} + +static +int plain_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenv; + int rc; + + LASSERT(req->rq_bulk_read); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) { + CERROR("bulk read: server failed to compute " + "checksum: %d\n", rc); + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) + corrupt_bulk_data(desc); + } + + return rc; +} + +static struct ptlrpc_ctx_ops plain_ctx_ops = { + .refresh = plain_ctx_refresh, + .validate = plain_ctx_validate, + .sign = plain_ctx_sign, + .verify = plain_ctx_verify, + .wrap_bulk = plain_cli_wrap_bulk, + .unwrap_bulk = plain_cli_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops plain_sec_cops = { + .create_sec = plain_create_sec, + .destroy_sec = plain_destroy_sec, + .kill_sec = plain_kill_sec, + .lookup_ctx = plain_lookup_ctx, + .release_ctx = plain_release_ctx, + .flush_ctx_cache = plain_flush_ctx_cache, + .alloc_reqbuf = plain_alloc_reqbuf, + .free_reqbuf = plain_free_reqbuf, + .alloc_repbuf = plain_alloc_repbuf, + .free_repbuf = plain_free_repbuf, + .enlarge_reqbuf = plain_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops plain_sec_sops = { + .accept = plain_accept, + .alloc_rs = plain_alloc_rs, + .authorize = plain_authorize, + .free_rs = plain_free_rs, + .unwrap_bulk = plain_svc_unwrap_bulk, + .wrap_bulk = plain_svc_wrap_bulk, +}; + +static struct ptlrpc_sec_policy plain_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "plain", + .sp_policy = SPTLRPC_POLICY_PLAIN, + .sp_cops = &plain_sec_cops, + .sp_sops = &plain_sec_sops, +}; + +int sptlrpc_plain_init(void) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rc; + + buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size(); + plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rc = sptlrpc_register_policy(&plain_policy); + if (rc) + CERROR("failed to register: %d\n", rc); + + return rc; +} + +void sptlrpc_plain_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&plain_policy); + if (rc) + CERROR("cannot unregister: %d\n", rc); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c new file mode 100644 index 0000000000000..6373c36865f3d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c @@ -0,0 +1,3389 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" +#include + +/* The following are visible and mutable through /sys/module/ptlrpc */ +int test_req_buffer_pressure = 0; +module_param(test_req_buffer_pressure, int, 0444); +MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools"); +module_param(at_min, int, 0644); +MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)"); +module_param(at_max, int, 0644); +MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)"); +module_param(at_history, int, 0644); +MODULE_PARM_DESC(at_history, + "Adaptive timeouts remember the slowest event that took place within this period (sec)"); +module_param(at_early_margin, int, 0644); +MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply"); +module_param(at_extra, int, 0644); +MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply"); + +/* forward ref */ +static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt); +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req); +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req); + +/** Holds a list of all PTLRPC services */ +struct list_head ptlrpc_all_services; +/** Used to protect the \e ptlrpc_all_services list */ +struct mutex ptlrpc_all_services_mutex; + +static struct ptlrpc_request_buffer_desc * +ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + + OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt); + if (rqbd == NULL) + return NULL; + + rqbd->rqbd_svcpt = svcpt; + rqbd->rqbd_refcount = 0; + rqbd->rqbd_cbid.cbid_fn = request_in_callback; + rqbd->rqbd_cbid.cbid_arg = rqbd; + INIT_LIST_HEAD(&rqbd->rqbd_reqs); + OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable, + svcpt->scp_cpt, svc->srv_buf_size); + if (rqbd->rqbd_buffer == NULL) { + OBD_FREE_PTR(rqbd); + return NULL; + } + + spin_lock(&svcpt->scp_lock); + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + svcpt->scp_nrqbds_total++; + spin_unlock(&svcpt->scp_lock); + + return rqbd; +} + +static void +ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + + LASSERT(rqbd->rqbd_refcount == 0); + LASSERT(list_empty(&rqbd->rqbd_reqs)); + + spin_lock(&svcpt->scp_lock); + list_del(&rqbd->rqbd_list); + svcpt->scp_nrqbds_total--; + spin_unlock(&svcpt->scp_lock); + + OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size); + OBD_FREE_PTR(rqbd); +} + +static int +ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + int rc = 0; + int i; + + if (svcpt->scp_rqbd_allocating) + goto try_post; + 
+ spin_lock(&svcpt->scp_lock); + /* check again with lock */ + if (svcpt->scp_rqbd_allocating) { + /* NB: we might allow more than one thread in the future */ + LASSERT(svcpt->scp_rqbd_allocating == 1); + spin_unlock(&svcpt->scp_lock); + goto try_post; + } + + svcpt->scp_rqbd_allocating++; + spin_unlock(&svcpt->scp_lock); + + + for (i = 0; i < svc->srv_nbuf_per_group; i++) { + /* NB: another thread might have recycled enough rqbds, we + * need to make sure it wouldn't over-allocate, see LU-1212. */ + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max)) + break; + + rqbd = ptlrpc_alloc_rqbd(svcpt); + + if (rqbd == NULL) { + CERROR("%s: Can't allocate request buffer\n", + svc->srv_name); + rc = -ENOMEM; + break; + } + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(svcpt->scp_rqbd_allocating == 1); + svcpt->scp_rqbd_allocating--; + + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_RPCTRACE, + "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n", + svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted, + svcpt->scp_nrqbds_total, rc); + + try_post: + if (post && rc == 0) + rc = ptlrpc_server_post_idle_rqbds(svcpt); + + return rc; +} + +/** + * Part of Rep-Ack logic. + * Puts a lock and its mode into reply state assotiated to request reply. + */ +void +ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, + int mode, bool no_ack, bool convert_lock) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + int idx; + + LASSERT(rs != NULL); + LASSERT(rs->rs_nlocks < RS_MAX_LOCKS); + + idx = rs->rs_nlocks++; + rs->rs_locks[idx] = *lock; + rs->rs_modes[idx] = mode; + rs->rs_difficult = 1; + rs->rs_no_ack = no_ack; + rs->rs_convert_lock = convert_lock; +} +EXPORT_SYMBOL(ptlrpc_save_lock); + + +struct ptlrpc_hr_partition; + +struct ptlrpc_hr_thread { + int hrt_id; /* thread ID */ + spinlock_t hrt_lock; + wait_queue_head_t hrt_waitq; + struct list_head hrt_queue; + struct ptlrpc_hr_partition *hrt_partition; +}; + +struct ptlrpc_hr_partition { + /* # of started threads */ + atomic_t hrp_nstarted; + /* # of stopped threads */ + atomic_t hrp_nstopped; + /* cpu partition id */ + int hrp_cpt; + /* round-robin rotor for choosing thread */ + int hrp_rotor; + /* total number of threads on this partition */ + int hrp_nthrs; + /* threads table */ + struct ptlrpc_hr_thread *hrp_thrs; +}; + +#define HRT_RUNNING 0 +#define HRT_STOPPING 1 + +struct ptlrpc_hr_service { + /* CPU partition table, it's just cfs_cpt_table for now */ + struct cfs_cpt_table *hr_cpt_table; + /** controller sleep waitq */ + wait_queue_head_t hr_waitq; + unsigned int hr_stopping; + /** roundrobin rotor for non-affinity service */ + unsigned int hr_rotor; + /* partition data */ + struct ptlrpc_hr_partition **hr_partitions; +}; + +struct rs_batch { + struct list_head rsb_replies; + unsigned int rsb_n_replies; + struct ptlrpc_service_part *rsb_svcpt; +}; + +/** reply handling service. */ +static struct ptlrpc_hr_service ptlrpc_hr; + +/** + * maximum mumber of replies scheduled in one batch + */ +#define MAX_SCHEDULED 256 + +/** + * Initialize a reply batch. + * + * \param b batch + */ +static void rs_batch_init(struct rs_batch *b) +{ + memset(b, 0, sizeof *b); + INIT_LIST_HEAD(&b->rsb_replies); +} + +/** + * Choose an hr thread to dispatch requests to. 
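+ *
+ * A CPT-affine svcpt whose service shares ptlrpc_hr's CPT table maps
+ * directly to the matching partition; otherwise a round-robin rotor
+ * picks the partition, and a second rotor picks a thread within it.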
+ */ +static struct ptlrpc_hr_thread * +ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_hr_partition *hrp; + unsigned int rotor; + + if (svcpt->scp_cpt >= 0 && + svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { + /* directly match partition */ + hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; + + } else { + rotor = ptlrpc_hr.hr_rotor++; + rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); + + hrp = ptlrpc_hr.hr_partitions[rotor]; + } + + rotor = hrp->hrp_rotor++; + return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; +} + +/** + * Dispatch all replies accumulated in the batch to one from + * dedicated reply handling threads. + * + * \param b batch + */ +static void rs_batch_dispatch(struct rs_batch *b) +{ + if (b->rsb_n_replies != 0) { + struct ptlrpc_hr_thread *hrt; + + hrt = ptlrpc_hr_select(b->rsb_svcpt); + + spin_lock(&hrt->hrt_lock); + list_splice_init(&b->rsb_replies, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + b->rsb_n_replies = 0; + } +} + +/** + * Add a reply to a batch. + * Add one reply object to a batch, schedule batched replies if overload. + * + * \param b batch + * \param rs reply + */ +static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) { + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } + spin_lock(&svcpt->scp_rep_lock); + b->rsb_svcpt = svcpt; + } + spin_lock(&rs->rs_lock); + rs->rs_scheduled_ever = 1; + if (rs->rs_scheduled == 0) { + list_move(&rs->rs_list, &b->rsb_replies); + rs->rs_scheduled = 1; + b->rsb_n_replies++; + } + rs->rs_committed = 1; + spin_unlock(&rs->rs_lock); +} + +/** + * Reply batch finalization. + * Dispatch remaining replies from the batch + * and release remaining spinlock. + * + * \param b batch + */ +static void rs_batch_fini(struct rs_batch *b) +{ + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } +} + +#define DECLARE_RS_BATCH(b) struct rs_batch b + + +/** + * Put reply state into a queue for processing because we received + * ACK from the client + */ +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_hr_thread *hrt; + ENTRY; + + LASSERT(list_empty(&rs->rs_list)); + + hrt = ptlrpc_hr_select(rs->rs_svcpt); + + spin_lock(&hrt->hrt_lock); + list_add_tail(&rs->rs_list, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + EXIT; +} + +void +ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); + assert_spin_locked(&rs->rs_lock); + LASSERT (rs->rs_difficult); + rs->rs_scheduled_ever = 1; /* flag any notification attempt */ + + if (rs->rs_scheduled) { /* being set up or already notified */ + EXIT; + return; + } + + rs->rs_scheduled = 1; + list_del_init(&rs->rs_list); + ptlrpc_dispatch_difficult_reply(rs); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); + +void ptlrpc_commit_replies(struct obd_export *exp) +{ + struct ptlrpc_reply_state *rs, *nxt; + DECLARE_RS_BATCH(batch); + ENTRY; + + rs_batch_init(&batch); + /* Find any replies that have been committed and get their service + * to attend to complete them. */ + + /* CAVEAT EMPTOR: spinlock ordering!!! 
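+	 * (exp_uncommitted_replies_lock is taken first here; rs_batch_add()
+	 * then takes scp_rep_lock and rs->rs_lock while it is held.)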
*/ + spin_lock(&exp->exp_uncommitted_replies_lock); + list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies, + rs_obd_list) { + LASSERT (rs->rs_difficult); + /* VBR: per-export last_committed */ + LASSERT(rs->rs_export); + if (rs->rs_transno <= exp->exp_last_committed) { + list_del_init(&rs->rs_obd_list); + rs_batch_add(&batch, rs); + } + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + rs_batch_fini(&batch); + EXIT; +} + +static int +ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request_buffer_desc *rqbd; + int rc; + int posted = 0; + + for (;;) { + spin_lock(&svcpt->scp_lock); + + if (list_empty(&svcpt->scp_rqbd_idle)) { + spin_unlock(&svcpt->scp_lock); + return posted; + } + + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + list_del(&rqbd->rqbd_list); + + /* assume we will post successfully */ + svcpt->scp_nrqbds_posted++; + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted); + + spin_unlock(&svcpt->scp_lock); + + rc = ptlrpc_register_rqbd(rqbd); + if (rc != 0) + break; + + posted = 1; + } + + spin_lock(&svcpt->scp_lock); + + svcpt->scp_nrqbds_posted--; + list_del(&rqbd->rqbd_list); + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + + /* Don't complain if no request buffers are posted right now; LNET + * won't drop requests because we set the portal lazy! */ + + spin_unlock(&svcpt->scp_lock); + + return -1; +} + +static void ptlrpc_at_timer(cfs_timer_cb_arg_t data) +{ + struct ptlrpc_service_part *svcpt; + + svcpt = cfs_from_timer(svcpt, data, scp_at_timer); + + svcpt->scp_at_check = 1; + svcpt->scp_at_checktime = ktime_get(); + wake_up(&svcpt->scp_waitq); +} + +static void +ptlrpc_server_nthreads_check(struct ptlrpc_service *svc, + struct ptlrpc_service_conf *conf) +{ + struct ptlrpc_service_thr_conf *tc = &conf->psc_thr; + unsigned init; + unsigned total; + unsigned nthrs; + int weight; + + /* + * Common code for estimating & validating threads number. + * CPT affinity service could have percpt thread-pool instead + * of a global thread-pool, which means user might not always + * get the threads number they give it in conf::tc_nthrs_user + * even they did set. It's because we need to validate threads + * number for each CPT to guarantee each pool will have enough + * threads to keep the service healthy. + */ + init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL); + init = max_t(int, init, tc->tc_nthrs_init); + + /* NB: please see comments in lustre_lnet.h for definition + * details of these members */ + LASSERT(tc->tc_nthrs_max != 0); + + if (tc->tc_nthrs_user != 0) { + /* In case there is a reason to test a service with many + * threads, we give a less strict check here, it can + * be up to 8 * nthrs_max */ + total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user); + nthrs = total / svc->srv_ncpts; + init = max(init, nthrs); + goto out; + } + + total = tc->tc_nthrs_max; + if (tc->tc_nthrs_base == 0) { + /* don't care about base threads number per partition, + * this is most for non-affinity service */ + nthrs = total / svc->srv_ncpts; + goto out; + } + + nthrs = tc->tc_nthrs_base; + if (svc->srv_ncpts == 1) { + int i; + + /* NB: Increase the base number if it's single partition + * and total number of cores/HTs is larger or equal to 4. 
+ * result will always < 2 * nthrs_base */ + weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY); + for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */ + (tc->tc_nthrs_base >> i) != 0; i++) + nthrs += tc->tc_nthrs_base >> i; + } + + if (tc->tc_thr_factor != 0) { + int factor = tc->tc_thr_factor; + const int fade = 4; + + /* + * User wants to increase number of threads with for + * each CPU core/HT, most likely the factor is larger than + * one thread/core because service threads are supposed to + * be blocked by lock or wait for IO. + */ + /* + * Amdahl's law says that adding processors wouldn't give + * a linear increasing of parallelism, so it's nonsense to + * have too many threads no matter how many cores/HTs + * there are. + */ + if (cpumask_weight(topology_sibling_cpumask(smp_processor_id())) > 1) { + /* weight is # of HTs */ + /* depress thread factor for hyper-thread */ + factor = factor - (factor >> 1) + (factor >> 3); + } + + weight = cfs_cpt_weight(svc->srv_cptable, 0); + + for (; factor > 0 && weight > 0; factor--, weight -= fade) + nthrs += min(weight, fade) * factor; + } + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + nthrs = max(tc->tc_nthrs_base, + tc->tc_nthrs_max / svc->srv_ncpts); + } + out: + nthrs = max(nthrs, tc->tc_nthrs_init); + svc->srv_nthrs_cpt_limit = nthrs; + svc->srv_nthrs_cpt_init = init; + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + CDEBUG(D_OTHER, "%s: This service may have more threads (%d) " + "than the given soft limit (%d)\n", + svc->srv_name, nthrs * svc->srv_ncpts, + tc->tc_nthrs_max); + } +} + +/** + * Initialize percpt data for a service + */ +static int +ptlrpc_service_part_init(struct ptlrpc_service *svc, + struct ptlrpc_service_part *svcpt, int cpt) +{ + struct ptlrpc_at_array *array; + int size; + int index; + int rc; + + svcpt->scp_cpt = cpt; + INIT_LIST_HEAD(&svcpt->scp_threads); + + /* rqbd and incoming request queue */ + spin_lock_init(&svcpt->scp_lock); + mutex_init(&svcpt->scp_mutex); + INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); + INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); + INIT_LIST_HEAD(&svcpt->scp_req_incoming); + init_waitqueue_head(&svcpt->scp_waitq); + /* history request & rqbd list */ + INIT_LIST_HEAD(&svcpt->scp_hist_reqs); + INIT_LIST_HEAD(&svcpt->scp_hist_rqbds); + + /* acitve requests and hp requests */ + spin_lock_init(&svcpt->scp_req_lock); + + /* reply states */ + spin_lock_init(&svcpt->scp_rep_lock); + INIT_LIST_HEAD(&svcpt->scp_rep_active); + INIT_LIST_HEAD(&svcpt->scp_rep_idle); + init_waitqueue_head(&svcpt->scp_rep_waitq); + atomic_set(&svcpt->scp_nreps_difficult, 0); + + /* adaptive timeout */ + spin_lock_init(&svcpt->scp_at_lock); + array = &svcpt->scp_at_array; + + size = at_est2timeout(at_max); + array->paa_size = size; + array->paa_count = 0; + array->paa_deadline = -1; + + /* allocate memory for scp_at_array (ptlrpc_at_array) */ + OBD_CPT_ALLOC(array->paa_reqs_array, + svc->srv_cptable, cpt, sizeof(struct list_head) * size); + if (array->paa_reqs_array == NULL) + return -ENOMEM; + + for (index = 0; index < size; index++) + INIT_LIST_HEAD(&array->paa_reqs_array[index]); + + OBD_CPT_ALLOC(array->paa_reqs_count, + svc->srv_cptable, cpt, sizeof(__u32) * size); + if (array->paa_reqs_count == NULL) + goto failed; + + cfs_timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer, + (unsigned long)svcpt, 0); + + /* At SOW, service time should be quick; 10s seems generous. If client + * timeout is less than this, we'll be sending an early reply. 
*/ + at_init(&svcpt->scp_at_estimate, 10, 0); + + /* assign this before call ptlrpc_grow_req_bufs */ + svcpt->scp_service = svc; + /* Now allocate the request buffers, but don't post them now */ + rc = ptlrpc_grow_req_bufs(svcpt, 0); + /* We shouldn't be under memory pressure at startup, so + * fail if we can't allocate all our buffers at this time. */ + if (rc != 0) + goto failed; + + return 0; + + failed: + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size); + array->paa_reqs_count = NULL; + } + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + return -ENOMEM; +} + +/** + * Initialize service on a given portal. + * This includes starting serving threads , allocating and posting rqbds and + * so on. + */ +struct ptlrpc_service * +ptlrpc_register_service(struct ptlrpc_service_conf *conf, + struct kset *parent, + struct dentry *debugfs_entry) +{ + struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; + struct ptlrpc_service *service; + struct ptlrpc_service_part *svcpt; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + int rc; + int i; + ENTRY; + + LASSERT(conf->psc_buf.bc_nbufs > 0); + LASSERT(conf->psc_buf.bc_buf_size >= + conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); + LASSERT(conf->psc_thr.tc_ctx_tags != 0); + + cptable = cconf->cc_cptable; + if (cptable == NULL) + cptable = cfs_cpt_table; + + if (conf->psc_thr.tc_cpu_bind > 1) { + CERROR("%s: Invalid cpu bind value %d, only 1 or 0 allowed\n", + conf->psc_name, conf->psc_thr.tc_cpu_bind); + RETURN(ERR_PTR(-EINVAL)); + } + + if (!cconf->cc_affinity) { + ncpts = 1; + } else { + ncpts = cfs_cpt_number(cptable); + if (cconf->cc_pattern != NULL) { + struct cfs_expr_list *el; + + rc = cfs_expr_list_parse(cconf->cc_pattern, + strlen(cconf->cc_pattern), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + conf->psc_name, cconf->cc_pattern); + RETURN(ERR_PTR(-EINVAL)); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + conf->psc_name, cconf->cc_pattern, rc); + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL)); + } + ncpts = rc; + } + } + + OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts])); + if (service == NULL) { + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + RETURN(ERR_PTR(-ENOMEM)); + } + + service->srv_cptable = cptable; + service->srv_cpts = cpts; + service->srv_ncpts = ncpts; + service->srv_cpt_bind = conf->psc_thr.tc_cpu_bind; + + service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ + while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) + service->srv_cpt_bits++; + + /* public members */ + spin_lock_init(&service->srv_lock); + service->srv_name = conf->psc_name; + service->srv_watchdog_factor = conf->psc_watchdog_factor; + INIT_LIST_HEAD(&service->srv_list); /* for safty of cleanup */ + + /* buffer configuration */ + service->srv_nbuf_per_group = test_req_buffer_pressure ? 
+ 1 : conf->psc_buf.bc_nbufs; + /* do not limit max number of rqbds by default */ + service->srv_nrqbds_max = 0; + + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + + SPTLRPC_MAX_PAYLOAD; + service->srv_buf_size = conf->psc_buf.bc_buf_size; + service->srv_rep_portal = conf->psc_buf.bc_rep_portal; + service->srv_req_portal = conf->psc_buf.bc_req_portal; + + /* Increase max reply size to next power of two */ + service->srv_max_reply_size = 1; + while (service->srv_max_reply_size < + conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) + service->srv_max_reply_size <<= 1; + + service->srv_thread_name = conf->psc_thr.tc_thr_name; + service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; + service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; + service->srv_ops = conf->psc_ops; + + for (i = 0; i < ncpts; i++) { + if (!cconf->cc_affinity) + cpt = CFS_CPT_ANY; + else + cpt = cpts != NULL ? cpts[i] : i; + + OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt)); + if (svcpt == NULL) + GOTO(failed, rc = -ENOMEM); + + service->srv_parts[i] = svcpt; + rc = ptlrpc_service_part_init(service, svcpt, cpt); + if (rc != 0) + GOTO(failed, rc); + } + + ptlrpc_server_nthreads_check(service, conf); + + rc = LNetSetLazyPortal(service->srv_req_portal); + LASSERT(rc == 0); + + mutex_lock(&ptlrpc_all_services_mutex); + list_add(&service->srv_list, &ptlrpc_all_services); + mutex_unlock(&ptlrpc_all_services_mutex); + + if (parent) { + rc = ptlrpc_sysfs_register_service(parent, service); + if (rc) + GOTO(failed, rc); + } + + if (debugfs_entry != NULL) + ptlrpc_ldebugfs_register_service(debugfs_entry, service); + + rc = ptlrpc_service_nrs_setup(service); + if (rc != 0) + GOTO(failed, rc); + + CDEBUG(D_NET, "%s: Started, listening on portal %d\n", + service->srv_name, service->srv_req_portal); + + rc = ptlrpc_start_threads(service); + if (rc != 0) { + CERROR("Failed to start threads for service %s: %d\n", + service->srv_name, rc); + GOTO(failed, rc); + } + + RETURN(service); +failed: + ptlrpc_unregister_service(service); + RETURN(ERR_PTR(rc)); +} +EXPORT_SYMBOL(ptlrpc_register_service); + +/** + * to actually free the request, must be called without holding svc_lock. + * note it's caller's responsibility to unlink req->rq_list. + */ +static void ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + LASSERT(atomic_read(&req->rq_refcount) == 0); + LASSERT(list_empty(&req->rq_timed_list)); + + /* DEBUG_REQ() assumes the reply state of a request with a valid + * ref will not be destroyed until that reference is dropped. */ + ptlrpc_req_drop_rs(req); + + sptlrpc_svc_ctx_decref(req); + + if (req != &req->rq_rqbd->rqbd_req) { + /* NB request buffers use an embedded + * req if the incoming req unlinked the + * MD; this isn't one of them! */ + ptlrpc_request_cache_free(req); + } +} + +/** + * drop a reference count of the request. if it reaches 0, we either + * put it into history list, or free it immediately. 
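+ *
+ * Once the last request of a buffer is dropped, the rqbd is moved to the
+ * history list; when scp_hist_nrqbds exceeds srv_hist_nrqbds_cpt_max the
+ * oldest history rqbds are culled and their requests freed.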
+ */ +void ptlrpc_server_drop_request(struct ptlrpc_request *req) +{ + struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int refcount; + struct list_head *tmp; + struct list_head *nxt; + + if (!atomic_dec_and_test(&req->rq_refcount)) + return; + + if (req->rq_session.lc_state == LCS_ENTERED) { + lu_context_exit(&req->rq_session); + lu_context_fini(&req->rq_session); + } + + if (req->rq_at_linked) { + spin_lock(&svcpt->scp_at_lock); + /* recheck with lock, in case it's unlinked by + * ptlrpc_at_check_timed() */ + if (likely(req->rq_at_linked)) + ptlrpc_at_remove_timed(req); + spin_unlock(&svcpt->scp_at_lock); + } + + LASSERT(list_empty(&req->rq_timed_list)); + + /* finalize request */ + if (req->rq_export) { + class_export_put(req->rq_export); + req->rq_export = NULL; + } + + spin_lock(&svcpt->scp_lock); + + list_add(&req->rq_list, &rqbd->rqbd_reqs); + + refcount = --(rqbd->rqbd_refcount); + if (refcount == 0) { + /* request buffer is now idle: add to history */ + list_del(&rqbd->rqbd_list); + + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); + svcpt->scp_hist_nrqbds++; + + /* cull some history? + * I expect only about 1 or 2 rqbds need to be recycled here */ + while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { + rqbd = list_entry(svcpt->scp_hist_rqbds.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + list_del(&rqbd->rqbd_list); + svcpt->scp_hist_nrqbds--; + + /* remove rqbd's reqs from svc's req history while + * I've got the service lock */ + list_for_each(tmp, &rqbd->rqbd_reqs) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + /* Track the highest culled req seq */ + if (req->rq_history_seq > + svcpt->scp_hist_seq_culled) { + svcpt->scp_hist_seq_culled = + req->rq_history_seq; + } + list_del(&req->rq_history_list); + } + + spin_unlock(&svcpt->scp_lock); + + list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) { + req = list_entry(rqbd->rqbd_reqs.next, + struct ptlrpc_request, + rq_list); + list_del(&req->rq_list); + ptlrpc_server_free_request(req); + } + + spin_lock(&svcpt->scp_lock); + /* + * now all reqs including the embedded req has been + * disposed, schedule request buffer for re-use + * or free it to drain some in excess. 
+ */ + LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0); + if (svcpt->scp_nrqbds_posted >= + svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max) || + test_req_buffer_pressure) { + /* like in ptlrpc_free_rqbd() */ + svcpt->scp_nrqbds_total--; + OBD_FREE_LARGE(rqbd->rqbd_buffer, + svc->srv_buf_size); + OBD_FREE_PTR(rqbd); + } else { + list_add_tail(&rqbd->rqbd_list, + &svcpt->scp_rqbd_idle); + } + } + + spin_unlock(&svcpt->scp_lock); + } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { + /* If we are low on memory, we are not interested in history */ + list_del(&req->rq_list); + list_del_init(&req->rq_history_list); + + /* Track the highest culled req seq */ + if (req->rq_history_seq > svcpt->scp_hist_seq_culled) + svcpt->scp_hist_seq_culled = req->rq_history_seq; + + spin_unlock(&svcpt->scp_lock); + + ptlrpc_server_free_request(req); + } else { + spin_unlock(&svcpt->scp_lock); + } +} + +/** Change request export and move hp request from old export to new */ +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export) +{ + if (req->rq_export != NULL) { + LASSERT(!list_empty(&req->rq_exp_list)); + /* remove rq_exp_list from last export */ + spin_lock(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock(&req->rq_export->exp_rpc_lock); + /* export has one reference already, so it`s safe to + * add req to export queue here and get another + * reference for request later */ + spin_lock(&export->exp_rpc_lock); + if (req->rq_ops != NULL) /* hp request */ + list_add(&req->rq_exp_list, &export->exp_hp_rpcs); + else + list_add(&req->rq_exp_list, &export->exp_reg_rpcs); + spin_unlock(&export->exp_rpc_lock); + + class_export_rpc_dec(req->rq_export); + class_export_put(req->rq_export); + } + + /* request takes one export refcount */ + req->rq_export = class_export_get(export); + class_export_rpc_inc(export); + + return; +} + +/** + * to finish a request: stop sending more early replies, and release + * the request. + */ +static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + ptlrpc_server_hpreq_fini(req); + + ptlrpc_server_drop_request(req); +} + +/** + * to finish an active request: stop sending more early replies, and release + * the request. should be called after we finished handling the request. + */ +static void ptlrpc_server_finish_active_request( + struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + spin_lock(&svcpt->scp_req_lock); + ptlrpc_nrs_req_stop_nolock(req); + svcpt->scp_nreqs_active--; + if (req->rq_hp) + svcpt->scp_nhreqs_active--; + spin_unlock(&svcpt->scp_req_lock); + + ptlrpc_nrs_req_finalize(req); + + if (req->rq_export != NULL) + class_export_rpc_dec(req->rq_export); + + ptlrpc_server_finish_request(svcpt, req); +} + +/** + * This function makes sure dead exports are evicted in a timely manner. + * This function is only called when some export receives a message (i.e., + * the network is up.) + */ +void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay) +{ + struct obd_export *oldest_exp; + time64_t oldest_time, new_time; + + ENTRY; + + LASSERT(exp); + + /* Compensate for slow machines, etc, by faking our request time + into the future. Although this can break the strict time-ordering + of the list, we can be really lazy here - we don't have to evict + at the exact right moment. Eventually, all silent exports + will make it to the top of the list. 
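+ Note that renewals within one second are ignored, and no eviction
+ decisions are made while the obd is recovering.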
*/ + + /* Do not pay attention on 1sec or smaller renewals. */ + new_time = ktime_get_real_seconds() + extra_delay; + if (exp->exp_last_request_time + 1 /*second */ >= new_time) + RETURN_EXIT; + + exp->exp_last_request_time = new_time; + + /* exports may get disconnected from the chain even though the + export has references, so we must keep the spin lock while + manipulating the lists */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + RETURN_EXIT; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + oldest_time = oldest_exp->exp_last_request_time; + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (exp->exp_obd->obd_recovering) { + /* be nice to everyone during recovery */ + EXIT; + return; + } + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (ktime_get_real_seconds() > + oldest_time + PING_EVICT_TIMEOUT + extra_delay) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. + */ + exp->exp_obd->obd_eviction_timer = + ktime_get_real_seconds() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (ktime_get_real_seconds() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. + */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } + + EXIT; +} + +/** + * Sanity check request \a req. + * Return 0 if all is ok, error code otherwise. + */ +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + int rc = 0; + + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt)) { + DEBUG_REQ(D_RPCTRACE, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (unlikely(obd == NULL || obd->obd_fail)) { + /* Failing over, don't handle any more reqs, + * send error response instead. */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, (obd != NULL) ? 
obd->obd_name : "unknown"); + rc = -ENODEV; + } else if (lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, + "Invalid replay without recovery"); + class_fail_export(req->rq_export); + rc = -ENODEV; + } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, "Invalid req with transno " + "%llu without recovery", + lustre_msg_get_transno(req->rq_reqmsg)); + class_fail_export(req->rq_export); + rc = -ENODEV; + } + + if (unlikely(rc < 0)) { + req->rq_status = rc; + ptlrpc_error(req); + } + return rc; +} + +static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + time64_t next; + + if (array->paa_count == 0) { + del_timer(&svcpt->scp_at_timer); + return; + } + + /* Set timer for closest deadline */ + next = array->paa_deadline - ktime_get_real_seconds() - + at_early_margin; + if (next <= 0) { + ptlrpc_at_timer(cfs_timer_cb_arg(svcpt, scp_at_timer)); + } else { + mod_timer(&svcpt->scp_at_timer, + jiffies + nsecs_to_jiffies(next * NSEC_PER_SEC)); + CDEBUG(D_INFO, "armed %s at %+llds\n", + svcpt->scp_service->srv_name, next); + } +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq = NULL; + __u32 index; + + if (AT_OFF) + return(0); + + if (req->rq_no_reply) + return 0; + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) + return(-ENOSYS); + + spin_lock(&svcpt->scp_at_lock); + LASSERT(list_empty(&req->rq_timed_list)); + + div_u64_rem(req->rq_deadline, array->paa_size, &index); + if (array->paa_reqs_count[index] > 0) { + /* latest rpcs will have the latest deadlines in the list, + * so search backward. */ + list_for_each_entry_reverse(rq, + &array->paa_reqs_array[index], + rq_timed_list) { + if (req->rq_deadline >= rq->rq_deadline) { + list_add(&req->rq_timed_list, + &rq->rq_timed_list); + break; + } + } + } + + /* Add the request at the head of the list */ + if (list_empty(&req->rq_timed_list)) + list_add(&req->rq_timed_list, + &array->paa_reqs_array[index]); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 1; + spin_unlock(&req->rq_lock); + req->rq_at_index = index; + array->paa_reqs_count[index]++; + array->paa_count++; + if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { + array->paa_deadline = req->rq_deadline; + ptlrpc_at_set_timer(svcpt); + } + spin_unlock(&svcpt->scp_at_lock); + + return 0; +} + +static void +ptlrpc_at_remove_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_at_array *array; + + array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; + + /* NB: must call with hold svcpt::scp_at_lock */ + LASSERT(!list_empty(&req->rq_timed_list)); + list_del_init(&req->rq_timed_list); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 0; + spin_unlock(&req->rq_lock); + + array->paa_reqs_count[req->rq_at_index]--; + array->paa_count--; +} + +/* + * Attempt to extend the request deadline by sending an early reply to the + * client. 
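+ *
+ * A zero return keeps the request on the timed early-reply list; a
+ * non-zero return drops it from that list (e.g. -ETIMEDOUT when the
+ * deadline has already passed).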
+ */ +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; + time64_t olddl = req->rq_deadline - ktime_get_real_seconds(); + time64_t newdl; + int rc; + + ENTRY; + + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT)) { + /* don't send early reply */ + RETURN(1); + } + + /* deadline is when the client expects us to reply, margin is the + difference between clients' and servers' expectations */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+llds, margin %+llds) for " + "%d+%d", AT_OFF ? "AT off - not " : "", + (s64)olddl, (s64)(olddl - at_get(&svcpt->scp_at_estimate)), + at_get(&svcpt->scp_at_estimate), at_extra); + + if (AT_OFF) + RETURN(0); + + if (olddl < 0) { + DEBUG_REQ(D_WARNING, req, "Already past deadline (%+llds), " + "not sending early reply. Consider increasing " + "at_early_margin (%d)?", (s64)olddl, at_early_margin); + + /* Return an error so we're not re-added to the timed list. */ + RETURN(-ETIMEDOUT); + } + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){ + DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, " + "but no AT support"); + RETURN(-ENOSYS); + } + + if (req->rq_export && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + struct obd_device *obd_exp = req->rq_export->exp_obd; + + /* During recovery, we don't want to send too many early + * replies, but on the other hand we want to make sure the + * client has enough time to resend if the rpc is lost. So + * during the recovery period send at least 4 early replies, + * spacing them every at_extra if we can. at_estimate should + * always equal this fixed value during recovery. + */ + /* Don't account request processing time into AT history + * during recovery, it is not service time we need but + * includes also waiting time for recovering clients + */ + newdl = min_t(time64_t, at_extra, + obd_exp->obd_recovery_timeout / 4) + + ktime_get_real_seconds(); + } else { + /* We want to extend the request deadline by at_extra seconds, + * so we set our service estimate to reflect how much time has + * passed since this request arrived plus an additional + * at_extra seconds. The client will calculate the new deadline + * based on this service estimate (plus some additional time to + * account for network latency). 
See ptlrpc_at_recv_early_reply + */ + at_measured(&svcpt->scp_at_estimate, at_extra + + ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec); + newdl = req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate); + } + + /* Check to see if we've actually increased the deadline - + * we may be past adaptive_max */ + if (req->rq_deadline >= newdl) { + DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%lld/%lld), not sending early reply\n", + (s64)olddl, (s64)(newdl - ktime_get_real_seconds())); + RETURN(-ETIMEDOUT); + } + + reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); + if (reqcopy == NULL) + RETURN(-ENOMEM); + OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen); + if (!reqmsg) + GOTO(out_free, rc = -ENOMEM); + + *reqcopy = *req; + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + reqcopy->rq_pack_bulk = 0; + reqcopy->rq_pack_udesc = 0; + reqcopy->rq_packed_final = 0; + sptlrpc_svc_ctx_addref(reqcopy); + /* We only need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + /* + * tgt_brw_read() and tgt_brw_write() may have decided not to reply. + * Without this check, we would fail the rq_no_reply assertion in + * ptlrpc_send_reply(). + */ + if (reqcopy->rq_no_reply) + GOTO(out, rc = -ETIMEDOUT); + + LASSERT(atomic_read(&req->rq_refcount)); + /** if it is last refcount then early reply isn't needed */ + if (atomic_read(&req->rq_refcount) == 1) { + DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, " + "abort sending early reply\n"); + GOTO(out, rc = -EINVAL); + } + + /* Connection ref */ + reqcopy->rq_export = class_conn2export( + lustre_msg_get_handle(reqcopy->rq_reqmsg)); + if (reqcopy->rq_export == NULL) + GOTO(out, rc = -ENODEV); + + /* RPC ref */ + class_export_rpc_inc(reqcopy->rq_export); + if (reqcopy->rq_export->exp_obd && + reqcopy->rq_export->exp_obd->obd_fail) + GOTO(out_put, rc = -ENODEV); + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + GOTO(out_put, rc); + + rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = newdl; + req->rq_early_count++; /* number sent, server side */ + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); + } + + /* Free the (early) reply state from lustre_pack_reply. + (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ + ptlrpc_req_drop_rs(reqcopy); + +out_put: + class_export_rpc_dec(reqcopy->rq_export); + class_export_put(reqcopy->rq_export); +out: + sptlrpc_svc_ctx_decref(reqcopy); + OBD_FREE_LARGE(reqmsg, req->rq_reqlen); +out_free: + ptlrpc_request_cache_free(reqcopy); + RETURN(rc); +} + +/* Send early replies to everybody expiring within at_early_margin + asking for at_extra time */ +static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq, *n; + struct list_head work_list; + __u32 index, count; + time64_t deadline; + time64_t now = ktime_get_real_seconds(); + s64 delay; + int first, counter = 0; + + ENTRY; + spin_lock(&svcpt->scp_at_lock); + if (svcpt->scp_at_check == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + delay = ktime_ms_delta(ktime_get(), svcpt->scp_at_checktime); + svcpt->scp_at_check = 0; + + if (array->paa_count == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* The timer went off, but maybe the nearest rpc already completed. 
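+ If the earliest deadline is still more than at_early_margin away,
+ just re-arm the timer and return.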
*/ + first = array->paa_deadline - now; + if (first > at_early_margin) { + /* We've still got plenty of time. Reset the timer. */ + ptlrpc_at_set_timer(svcpt); + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* We're close to a timeout, and we don't know how much longer the + server will take. Send early replies to everyone expiring soon. */ + INIT_LIST_HEAD(&work_list); + deadline = -1; + div_u64_rem(array->paa_deadline, array->paa_size, &index); + count = array->paa_count; + while (count > 0) { + count -= array->paa_reqs_count[index]; + list_for_each_entry_safe(rq, n, + &array->paa_reqs_array[index], + rq_timed_list) { + if (rq->rq_deadline > now + at_early_margin) { + /* update the earliest deadline */ + if (deadline == -1 || + rq->rq_deadline < deadline) + deadline = rq->rq_deadline; + break; + } + + /** + * ptlrpc_server_drop_request() may drop + * refcount to 0 already. Let's check this and + * don't add entry to work_list + */ + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) { + ptlrpc_at_remove_timed(rq); + list_add(&rq->rq_timed_list, &work_list); + } else { + ptlrpc_at_remove_timed(rq); + } + + counter++; + } + + if (++index >= array->paa_size) + index = 0; + } + array->paa_deadline = deadline; + /* we have a new earliest deadline, restart the timer */ + ptlrpc_at_set_timer(svcpt); + + spin_unlock(&svcpt->scp_at_lock); + + CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early " + "replies\n", first, at_extra, counter); + if (first < 0) { + /* We're already past request deadlines before we even get a + chance to send early replies */ + LCONSOLE_WARN("%s: This server is not able to keep up with " + "request traffic (cpu-bound).\n", + svcpt->scp_service->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%lld\n", + counter, svcpt->scp_nreqs_incoming, + svcpt->scp_nreqs_active, + at_get(&svcpt->scp_at_estimate), delay); + } + + /* we took additional refcount so entries can't be deleted from list, no + * locking is needed */ + while (!list_empty(&work_list)) { + rq = list_entry(work_list.next, struct ptlrpc_request, + rq_timed_list); + list_del_init(&rq->rq_timed_list); + + if (ptlrpc_at_send_early_reply(rq) == 0) + ptlrpc_at_add_timed(rq); + + ptlrpc_server_drop_request(rq); + } + + RETURN(1); /* return "did_something" for liblustre */ +} + +/* Check if we are already handling earlier incarnation of this request. + * Called under &req->rq_export->exp_rpc_lock locked */ +static struct ptlrpc_request* +ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) +{ + struct ptlrpc_request *tmp = NULL; + + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) || + (atomic_read(&req->rq_export->exp_rpc_count) == 0)) + return NULL; + + /* This list should not be longer than max_requests in + * flights on the client, so it is not all that long. + * Also we only hit this codepath in case of a resent + * request which makes it even more rarely hit */ + list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, + rq_exp_list) { + /* Found duplicate one */ + if (tmp->rq_xid == req->rq_xid) + goto found; + } + list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, + rq_exp_list) { + /* Found duplicate one */ + if (tmp->rq_xid == req->rq_xid) + goto found; + } + return NULL; + +found: + DEBUG_REQ(D_HA, req, "Found duplicate req in processing"); + DEBUG_REQ(D_HA, tmp, "Request being processed"); + return tmp; +} + +/** + * Check if a request should be assigned with a high priority. 
+ * + * \retval < 0: error occurred + * 0: normal RPC request + * +1: high priority request + */ +static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc = 0; + ENTRY; + + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) { + rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); + if (rc < 0) + RETURN(rc); + + LASSERT(rc == 0); + } + + if (req->rq_export != NULL && req->rq_ops != NULL) { + /* Perform request specific check. We should do this + * check before the request is added into exp_hp_rpcs + * list otherwise it may hit swab race at LU-1044. */ + if (req->rq_ops->hpreq_check != NULL) { + rc = req->rq_ops->hpreq_check(req); + if (rc == -ESTALE) { + req->rq_status = rc; + ptlrpc_error(req); + } + /** can only return error, + * 0 for normal request, + * or 1 for high priority request */ + LASSERT(rc <= 1); + } + } + + RETURN(rc); +} + +/** Remove the request from the export list. */ +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) +{ + ENTRY; + if (req->rq_export) { + /* refresh lock timeout again so that client has more + * room to send lock cancel RPC. */ + if (req->rq_ops && req->rq_ops->hpreq_fini) + req->rq_ops->hpreq_fini(req); + + spin_lock(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock(&req->rq_export->exp_rpc_lock); + } + EXIT; +} + +static int ptlrpc_hpreq_check(struct ptlrpc_request *req) +{ + return 1; +} + +static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = { + .hpreq_check = ptlrpc_hpreq_check, +}; + +/* Hi-Priority RPC check by RPC operation code. */ +int ptlrpc_hpreq_handler(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Check for export to let only reconnects for not yet evicted + * export to become a HP rpc. */ + if ((req->rq_export != NULL) && + (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)) + req->rq_ops = &ptlrpc_hpreq_common; + + return 0; +} +EXPORT_SYMBOL(ptlrpc_hpreq_handler); + +static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc; + bool hp; + struct ptlrpc_request *orig; + ENTRY; + + rc = ptlrpc_server_hpreq_init(svcpt, req); + if (rc < 0) + RETURN(rc); + + hp = rc > 0; + ptlrpc_nrs_req_initialize(svcpt, req, hp); + + while (req->rq_export != NULL) { + struct obd_export *exp = req->rq_export; + + /* do search for duplicated xid and the adding to the list + * atomically */ + spin_lock_bh(&exp->exp_rpc_lock); + orig = ptlrpc_server_check_resend_in_progress(req); + if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) { + spin_unlock_bh(&exp->exp_rpc_lock); + + OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + msleep(4 * MSEC_PER_SEC); + continue; + } + if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) { + bool linked; + + spin_unlock_bh(&exp->exp_rpc_lock); + + /* + * When the client resend request and the server has + * the previous copy of it, we need to update deadlines, + * to be sure that the client and the server have equal + * request deadlines. 
+ */ + + spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + linked = orig->rq_at_linked; + if (likely(linked)) + ptlrpc_at_remove_timed(orig); + spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + orig->rq_deadline = req->rq_deadline; + if (likely(linked)) + ptlrpc_at_add_timed(orig); + ptlrpc_server_drop_request(orig); + ptlrpc_nrs_req_finalize(req); + RETURN(-EBUSY); + } + + if (hp || req->rq_ops != NULL) + list_add(&req->rq_exp_list, &exp->exp_hp_rpcs); + else + list_add(&req->rq_exp_list, &exp->exp_reg_rpcs); + spin_unlock_bh(&exp->exp_rpc_lock); + break; + } + + /* the current thread is not the processing thread for this request + * since that, but request is in exp_hp_list and can be find there. + * Remove all relations between request and old thread. */ + req->rq_svc_thread->t_env->le_ses = NULL; + req->rq_svc_thread = NULL; + req->rq_session.lc_thread = NULL; + + ptlrpc_nrs_req_add(svcpt, req, hp); + + RETURN(0); +} + +/** + * Allow to handle high priority request + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (!nrs_svcpt_has_hp(svcpt)) + return false; + + if (force) + return true; + + if (ptlrpc_nrs_req_throttling_nolock(svcpt, true)) + return false; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + if (svcpt->scp_nhreqs_active == 0) + return true; + + return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || + svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; +} + +static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_high(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, true); +} + +/** + * Only allow normal priority requests on a service that has a high-priority + * queue if forced (i.e. cleanup), if there are other high priority requests + * already being processed (i.e. those threads can service more high-priority + * requests), or if there are enough idle threads that a later thread can do + * a high priority request. 
+ * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (force) + return true; + + if (ptlrpc_nrs_req_throttling_nolock(svcpt, false)) + return false; + + if (svcpt->scp_nreqs_active < running - 2) + return true; + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); +} + +static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_normal(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, false); +} + +/** + * Returns true if there are requests available in incoming + * request queue for processing and it is allowed to fetch them. + * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock + * to get reliable result + * \see ptlrpc_server_allow_normal + * \see ptlrpc_server_allow high + */ +static inline bool +ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) +{ + return ptlrpc_server_high_pending(svcpt, force) || + ptlrpc_server_normal_pending(svcpt, force); +} + +/** + * Fetch a request for processing from queue of unprocessed requests. + * Favors high-priority requests. + * Returns a pointer to fetched request. + */ +static struct ptlrpc_request * +ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) +{ + struct ptlrpc_request *req = NULL; + ENTRY; + + spin_lock(&svcpt->scp_req_lock); + + if (ptlrpc_server_high_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); + if (req != NULL) { + svcpt->scp_hreq_count++; + goto got_request; + } + } + + if (ptlrpc_server_normal_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); + if (req != NULL) { + svcpt->scp_hreq_count = 0; + goto got_request; + } + } + + spin_unlock(&svcpt->scp_req_lock); + RETURN(NULL); + +got_request: + svcpt->scp_nreqs_active++; + if (req->rq_hp) + svcpt->scp_nhreqs_active++; + + spin_unlock(&svcpt->scp_req_lock); + + if (likely(req->rq_export)) + class_export_rpc_inc(req->rq_export); + + RETURN(req); +} + +/** + * Handle freshly incoming reqs, add to timed early reply list, + * pass on to regular request queue. + * All incoming requests pass through here before getting into + * ptlrpc_server_handle_req later on. 
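+ * This covers the security unwrap and unpacking of the message, basic
+ * sanity checks, computing the server-side deadline and queueing the
+ * request for the NRS policies.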
+ */ +static int +ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *req; + __u32 deadline; + int rc; + ENTRY; + + spin_lock(&svcpt->scp_lock); + if (list_empty(&svcpt->scp_req_incoming)) { + spin_unlock(&svcpt->scp_lock); + RETURN(0); + } + + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + svcpt->scp_nreqs_incoming--; + /* Consider this still a "queued" request as far as stats are + * concerned */ + spin_unlock(&svcpt->scp_lock); + + /* go through security check/transform */ + rc = sptlrpc_svc_unwrap_request(req); + switch (rc) { + case SECSVC_OK: + break; + case SECSVC_COMPLETE: + target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto err_req; + case SECSVC_DROP: + goto err_req; + default: + LBUG(); + } + + /* + * for null-flavored rpc, msg has been unpacked by sptlrpc, although + * redo it wouldn't be harmful. + */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); + if (rc != 0) { + CERROR("error unpacking request: ptl %d from %s " + "x%llu\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR ("error unpacking ptlrpc body: ptl %d from %s x" + "%llu\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && + lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { + CERROR("drop incoming rpc opc %u, x%llu\n", + cfs_fail_val, req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_WRITEPAGE: + case OST_WRITE: + case OUT_UPDATE: + req->rq_bulk_write = 1; + break; + case MDS_READPAGE: + case OST_READ: + case MGS_CONFIG_READ: + req->rq_bulk_read = 1; + break; + } + + CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + if (rc == 0) { + rc = sptlrpc_target_export_check(req->rq_export, req); + if (rc) + DEBUG_REQ(D_ERROR, req, "DROPPING req with " + "illegal security flavor,"); + } + + if (rc) + goto err_req; + ptlrpc_update_export_timer(req->rq_export, 0); + } + + /* req_in handling should/must be fast */ + if (ktime_get_real_seconds() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling %llds", + (s64)(ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec)); + + /* Set rpc server deadline and add it to the timed list */ + deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) ? 
+ /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } + + /* Skip early reply */ + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_RESEND)) + req->rq_deadline += obd_timeout; + + req->rq_svc_thread = thread; + if (thread != NULL) { + /* initialize request session, it is needed for request + * processing by target */ + rc = lu_context_init(&req->rq_session, LCT_SERVER_SESSION | + LCT_NOREF); + if (rc) { + CERROR("%s: failure to initialize session: rc = %d\n", + thread->t_name, rc); + goto err_req; + } + req->rq_session.lc_thread = thread; + lu_context_enter(&req->rq_session); + thread->t_env->le_ses = &req->rq_session; + } + + ptlrpc_at_add_timed(req); + + /* Move it over to the request processing queue */ + rc = ptlrpc_server_request_add(svcpt, req); + if (rc) + GOTO(err_req, rc); + + wake_up(&svcpt->scp_waitq); + RETURN(1); + +err_req: + ptlrpc_server_finish_request(svcpt, req); + + RETURN(1); +} + +/** + * Main incoming request handling logic. + * Calls handler function from service to do actual processing. + */ +static int +ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *request; + ktime_t work_start; + ktime_t work_end; + ktime_t arrived; + s64 timediff_usecs; + s64 arrived_usecs; + int fail_opc = 0; + + ENTRY; + + request = ptlrpc_server_request_get(svcpt, false); + if (request == NULL) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; + else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; + + if (unlikely(fail_opc)) { + if (request->rq_export && request->rq_ops) + OBD_FAIL_TIMEOUT(fail_opc, 4); + } + + ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); + + if(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) + libcfs_debug_dumplog(); + + work_start = ktime_get_real(); + arrived = timespec64_to_ktime(request->rq_arrival_time); + timediff_usecs = ktime_us_delta(work_start, arrived); + if (likely(svc->srv_stats != NULL)) { + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, + timediff_usecs); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, + svcpt->scp_nreqs_incoming); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, + svcpt->scp_nreqs_active); + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svcpt->scp_at_estimate)); + } + + if (likely(request->rq_export)) { + if (unlikely(ptlrpc_check_req(request))) + goto put_conn; + ptlrpc_update_export_timer(request->rq_export, + div_u64(timediff_usecs, + USEC_PER_SEC / 2)); + } + + /* Discard requests queued for longer than the deadline. + The deadline is increased if we send an early reply. */ + if (ktime_get_real_seconds() > request->rq_deadline) { + DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline %lld:%llds ago\n", + libcfs_id2str(request->rq_peer), + request->rq_deadline - + request->rq_arrival_time.tv_sec, + ktime_get_real_seconds() - request->rq_deadline); + goto put_conn; + } + + CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc " + "%s:%s+%d:%d:x%llu:%s:%d\n", current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? 
+ atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg)); + + if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); + + CDEBUG(D_NET, "got req %llu\n", request->rq_xid); + + /* re-assign request and sesson thread to the current one */ + request->rq_svc_thread = thread; + if (thread != NULL) { + LASSERT(request->rq_session.lc_thread == NULL); + request->rq_session.lc_thread = thread; + thread->t_env->le_ses = &request->rq_session; + } + svc->srv_ops.so_req_handler(request); + + ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); + +put_conn: + if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) { + DEBUG_REQ(D_WARNING, request, + "Request took longer than estimated (%lld:%llds); " + "client may timeout.", + request->rq_deadline - + request->rq_arrival_time.tv_sec, + ktime_get_real_seconds() - request->rq_deadline); + } + + work_end = ktime_get_real(); + timediff_usecs = ktime_us_delta(work_end, work_start); + arrived_usecs = ktime_us_delta(work_end, arrived); + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " + "%s:%s+%d:%d:x%llu:%s:%d Request processed in %lldus " + "(%lldus total) trans %llu rc %d/%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + timediff_usecs, + arrived_usecs, + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, + timediff_usecs); + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in %llds", + request->rq_early_count, + div_u64(arrived_usecs, USEC_PER_SEC)); + } + + ptlrpc_server_finish_active_request(svcpt, request); + + RETURN(1); +} + +/** + * An internal function to process a single reply state object. + */ +static int +ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct obd_export *exp; + int nlocks; + int been_handled; + ENTRY; + + exp = rs->rs_export; + + LASSERT(rs->rs_difficult); + LASSERT(rs->rs_scheduled); + LASSERT(list_empty(&rs->rs_list)); + + /* The disk commit callback holds exp_uncommitted_replies_lock while it + * iterates over newly committed replies, removing them from + * exp_uncommitted_replies. It then drops this lock and schedules the + * replies it found for handling here. + * + * We can avoid contention for exp_uncommitted_replies_lock between the + * HRT threads and further commit callbacks by checking rs_committed + * which is set in the commit callback while it holds both + * rs_lock and exp_uncommitted_reples. 
+ *
+ * If we see rs_committed clear, the commit callback _may_ not have
+ * handled this reply yet and we race with it to grab
+ * exp_uncommitted_replies_lock before removing the reply from
+ * exp_uncommitted_replies. Note that if we lose the race and the
+ * reply has already been removed, list_del_init() is a noop.
+ *
+ * If we see rs_committed set, we know the commit callback is handling,
+ * or has handled this reply since store reordering might allow us to
+ * see rs_committed set out of sequence. But since this is done
+ * holding rs_lock, we can be sure it has all completed once we hold
+ * rs_lock, which we do right next.
+ */
+ if (!rs->rs_committed) {
+ /* if rs was committed, no need to convert locks, don't check
+ * rs_committed here because rs may never be added into
+ * exp_uncommitted_replies and this flag may never be set, see
+ * target_send_reply() */
+ if (rs->rs_convert_lock &&
+ rs->rs_transno > exp->exp_last_committed) {
+ struct ldlm_lock *lock;
+ struct ldlm_lock *ack_locks[RS_MAX_LOCKS] = { NULL };
+
+ spin_lock(&rs->rs_lock);
+ if (rs->rs_convert_lock &&
+ rs->rs_transno > exp->exp_last_committed) {
+ nlocks = rs->rs_nlocks;
+ while (nlocks-- > 0) {
+ /*
+ * NB don't assume rs is always handled
+ * by the same service thread (see
+ * ptlrpc_hr_select()), so REP-ACK hr may
+ * race with trans commit, while the
+ * latter will release locks, get locks
+ * here early to convert to COS mode
+ * safely.
+ */
+ lock = ldlm_handle2lock(
+ &rs->rs_locks[nlocks]);
+ LASSERT(lock);
+ ack_locks[nlocks] = lock;
+ rs->rs_modes[nlocks] = LCK_COS;
+ }
+ nlocks = rs->rs_nlocks;
+ rs->rs_convert_lock = 0;
+ /* clear rs_scheduled so that commit callback
+ * can schedule again */
+ rs->rs_scheduled = 0;
+ spin_unlock(&rs->rs_lock);
+
+ while (nlocks-- > 0) {
+ lock = ack_locks[nlocks];
+ ldlm_lock_mode_downgrade(lock, LCK_COS);
+ LDLM_LOCK_PUT(lock);
+ }
+ RETURN(0);
+ }
+ spin_unlock(&rs->rs_lock);
+ }
+
+ spin_lock(&exp->exp_uncommitted_replies_lock);
+ list_del_init(&rs->rs_obd_list);
+ spin_unlock(&exp->exp_uncommitted_replies_lock);
+ }
+
+ spin_lock(&exp->exp_lock);
+ /* Noop if removed already */
+ list_del_init(&rs->rs_exp_list);
+ spin_unlock(&exp->exp_lock);
+
+ spin_lock(&rs->rs_lock);
+
+ been_handled = rs->rs_handled;
+ rs->rs_handled = 1;
+
+ nlocks = rs->rs_nlocks; /* atomic "steal", but */
+ rs->rs_nlocks = 0; /* locks still on rs_locks!
*/ + + if (nlocks == 0 && !been_handled) { + /* If we see this, we should already have seen the warning + * in mds_steal_ack_locks() */ + CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld" + " o%d NID %s\n", + rs, + rs->rs_xid, rs->rs_transno, rs->rs_opc, + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + } + + if ((!been_handled && rs->rs_on_net) || nlocks > 0) { + spin_unlock(&rs->rs_lock); + + if (!been_handled && rs->rs_on_net) { + LNetMDUnlink(rs->rs_md_h); + /* Ignore return code; we're racing with completion */ + } + + while (nlocks-- > 0) + ldlm_lock_decref(&rs->rs_locks[nlocks], + rs->rs_modes[nlocks]); + + spin_lock(&rs->rs_lock); + } + + rs->rs_scheduled = 0; + rs->rs_convert_lock = 0; + + if (!rs->rs_on_net) { + /* Off the net */ + spin_unlock(&rs->rs_lock); + + class_export_put (exp); + rs->rs_export = NULL; + ptlrpc_rs_decref(rs); + if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && + svc->srv_is_stopping) + wake_up_all(&svcpt->scp_waitq); + RETURN(1); + } + + /* still on the net; callback will schedule */ + spin_unlock(&rs->rs_lock); + RETURN(1); +} + + +static void +ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) +{ + int avail = svcpt->scp_nrqbds_posted; + int low_water = test_req_buffer_pressure ? 0 : + svcpt->scp_service->srv_nbuf_per_group / 2; + + /* NB I'm not locking; just looking. */ + + /* CAVEAT EMPTOR: We might be allocating buffers here because we've + * allowed the request history to grow out of control. We could put a + * sanity check on that here and cull some history if we need the + * space. */ + + if (avail <= low_water) + ptlrpc_grow_req_bufs(svcpt, 1); + + if (svcpt->scp_service->srv_stats) { + lprocfs_counter_add(svcpt->scp_service->srv_stats, + PTLRPC_REQBUF_AVAIL_CNTR, avail); + } +} + +static int +ptlrpc_retry_rqbds(void *arg) +{ + struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg; + + svcpt->scp_rqbd_timeout = 0; + return -ETIMEDOUT; +} + +static inline int +ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nreqs_active < + svcpt->scp_nthrs_running - 1 - + (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); +} + +/** + * allowed to create more threads + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nthrs_running + + svcpt->scp_nthrs_starting < + svcpt->scp_service->srv_nthrs_cpt_limit; +} + +/** + * too many requests and allowed to create more threads + */ +static inline int +ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) +{ + return !ptlrpc_threads_enough(svcpt) && + ptlrpc_threads_increasable(svcpt); +} + +static inline int +ptlrpc_thread_stopping(struct ptlrpc_thread *thread) +{ + return thread_is_stopping(thread) || + thread->t_svcpt->scp_service->srv_is_stopping; +} + +static inline int +ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_rqbd_idle) && + svcpt->scp_rqbd_timeout == 0; +} + +static inline int +ptlrpc_at_check(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_at_check; +} + +/** + * requests wait on preprocessing + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_req_incoming); +} + +static __attribute__((__noinline__)) int 
+ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + /* Don't exit while there are replies to be handled */ + struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout, + ptlrpc_retry_rqbds, svcpt); + + lc_watchdog_disable(thread->t_watchdog); + + cond_resched(); + + l_wait_event_exclusive_head(svcpt->scp_waitq, + ptlrpc_thread_stopping(thread) || + ptlrpc_server_request_incoming(svcpt) || + ptlrpc_server_request_pending(svcpt, false) || + ptlrpc_rqbd_pending(svcpt) || + ptlrpc_at_check(svcpt), &lwi); + + if (ptlrpc_thread_stopping(thread)) + return -EINTR; + + lc_watchdog_touch(thread->t_watchdog, + ptlrpc_server_get_timeout(svcpt)); + return 0; +} + +/** + * Main thread body for service threads. + * Waits in a loop waiting for new requests to process to appear. + * Every time an incoming requests is added to its queue, a waitq + * is woken up and one of the threads will handle it. + */ +static int ptlrpc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + struct ptlrpc_service_part *svcpt = thread->t_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_reply_state *rs; + struct group_info *ginfo = NULL; + struct lu_env *env; + int counter = 0, rc = 0; + ENTRY; + + thread->t_pid = current_pid(); + unshare_fs_struct(); + + if (svc->srv_cpt_bind) { + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); + } + } + + ginfo = groups_alloc(0); + if (!ginfo) + GOTO(out, rc = -ENOMEM); + + set_current_groups(ginfo); + put_group_info(ginfo); + + if (svc->srv_ops.so_thr_init != NULL) { + rc = svc->srv_ops.so_thr_init(thread); + + if (rc) + GOTO(out, rc); + } + + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_srv_fini, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); + + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + GOTO(out_env_remove, rc); + + thread->t_env = env; + env->le_ctx.lc_thread = thread; + env->le_ctx.lc_cookie = 0x6; + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rc = ptlrpc_server_post_idle_rqbds(svcpt); + if (rc >= 0) + continue; + + CERROR("Failed to post rqbd for %s on CPT %d: %d\n", + svc->srv_name, svcpt->scp_cpt, rc); + GOTO(out_ctx_fini, rc); + } + + /* Alloc reply state structure for this one */ + OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); + if (!rs) + GOTO(out_ctx_fini, rc = -ENOMEM); + + spin_lock(&svcpt->scp_lock); + + LASSERT(thread_is_starting(thread)); + thread_clear_flags(thread, SVC_STARTING); + + LASSERT(svcpt->scp_nthrs_starting == 1); + svcpt->scp_nthrs_starting--; + + /* SVC_STOPPING may already be set here if someone else is trying + * to stop the service while this new thread has been dynamically + * forked. We still set SVC_RUNNING to let our creator know that + * we are now running, however we will exit as soon as possible */ + thread_add_flags(thread, SVC_RUNNING); + svcpt->scp_nthrs_running++; + spin_unlock(&svcpt->scp_lock); + + /* wake up our creator in case he's still waiting. 
*/ + wake_up(&thread->t_ctl_waitq); + + thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt), + NULL, NULL); + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + wake_up(&svcpt->scp_rep_waitq); + spin_unlock(&svcpt->scp_rep_lock); + + CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, + svcpt->scp_nthrs_running); + + /* XXX maintain a list of all managed devices: insert here */ + while (!ptlrpc_thread_stopping(thread)) { + if (ptlrpc_wait_event(svcpt, thread)) + break; + + ptlrpc_check_rqbd_pool(svcpt); + + if (ptlrpc_threads_need_create(svcpt)) { + /* Ignore return code - we tried... */ + ptlrpc_start_thread(svcpt, 0); + } + + /* reset le_ses to initial state */ + env->le_ses = NULL; + /* Refill the context before execution to make sure + * all thread keys are allocated */ + lu_env_refill(env); + /* Process all incoming reqs before handling any */ + if (ptlrpc_server_request_incoming(svcpt)) { + lu_context_enter(&env->le_ctx); + ptlrpc_server_handle_req_in(svcpt, thread); + lu_context_exit(&env->le_ctx); + + /* but limit ourselves in case of flood */ + if (counter++ < 100) + continue; + counter = 0; + } + + if (ptlrpc_at_check(svcpt)) + ptlrpc_at_check_timed(svcpt); + + if (ptlrpc_server_request_pending(svcpt, false)) { + lu_context_enter(&env->le_ctx); + ptlrpc_server_handle_request(svcpt, thread); + lu_context_exit(&env->le_ctx); + } + + if (ptlrpc_rqbd_pending(svcpt) && + ptlrpc_server_post_idle_rqbds(svcpt) < 0) { + /* I just failed to repost request buffers. + * Wait for a timeout (unless something else + * happens) before I try again */ + svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; + CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", + svcpt->scp_nrqbds_posted); + } + } + + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; + +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env_remove: + lu_env_remove(env); +out_env: + OBD_FREE_PTR(env); +out_srv_fini: + /* + * deconstruct service specific state created by ptlrpc_start_thread() + */ + if (svc->srv_ops.so_thr_done != NULL) + svc->srv_ops.so_thr_done(thread); +out: + CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", + thread, thread->t_pid, thread->t_id, rc); + + spin_lock(&svcpt->scp_lock); + if (thread_test_and_clear_flags(thread, SVC_STARTING)) + svcpt->scp_nthrs_starting--; + + if (thread_test_and_clear_flags(thread, SVC_RUNNING)) { + /* must know immediately */ + svcpt->scp_nthrs_running--; + } + + thread->t_id = rc; + thread_add_flags(thread, SVC_STOPPED); + + wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + + return rc; +} + +static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, + struct list_head *replies) +{ + int result; + + spin_lock(&hrt->hrt_lock); + + list_splice_init(&hrt->hrt_queue, replies); + result = ptlrpc_hr.hr_stopping || !list_empty(replies); + + spin_unlock(&hrt->hrt_lock); + return result; +} + +/** + * Main body of "handle reply" function. 
+ * It processes acked reply states + */ +static int ptlrpc_hr_main(void *arg) +{ + struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; + struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; + struct list_head replies; + struct lu_env *env; + int rc; + + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&replies); + unshare_fs_struct(); + + rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); + if (rc != 0) { + char threadname[20]; + + snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, hrt->hrt_id); + CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", + threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); + } + + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD | + LCT_REMEMBER | LCT_NOREF); + if (rc) + GOTO(out_env, rc); + + rc = lu_env_add(env); + if (rc) + GOTO(out_ctx_fini, rc); + + atomic_inc(&hrp->hrp_nstarted); + wake_up(&ptlrpc_hr.hr_waitq); + + while (!ptlrpc_hr.hr_stopping) { + l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); + + while (!list_empty(&replies)) { + struct ptlrpc_reply_state *rs; + + rs = list_entry(replies.prev, + struct ptlrpc_reply_state, + rs_list); + list_del_init(&rs->rs_list); + /* refill keys if needed */ + lu_env_refill(env); + lu_context_enter(&env->le_ctx); + ptlrpc_handle_rs(rs); + lu_context_exit(&env->le_ctx); + } + } + + atomic_inc(&hrp->hrp_nstopped); + wake_up(&ptlrpc_hr.hr_waitq); + + lu_env_remove(env); +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env: + OBD_FREE_PTR(env); + return 0; +} + +static void ptlrpc_stop_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + ptlrpc_hr.hr_stopping = 1; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + for (j = 0; j < hrp->hrp_nthrs; j++) + wake_up_all(&hrp->hrp_thrs[j].hrt_waitq); + } + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstopped) == + atomic_read(&hrp->hrp_nstarted)); + } +} + +static int ptlrpc_start_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + ENTRY; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + int rc = 0; + + for (j = 0; j < hrp->hrp_nthrs; j++) { + struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; + struct task_struct *task; + + task = kthread_run(ptlrpc_hr_main, + &hrp->hrp_thrs[j], + "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, + hrt->hrt_id); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + break; + } + } + + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstarted) == j); + + if (rc < 0) { + CERROR("cannot start reply handler thread %d:%d: " + "rc = %d\n", i, j, rc); + ptlrpc_stop_hr_threads(); + RETURN(rc); + } + } + + RETURN(0); +} + +static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) +{ + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + struct list_head zombie; + + ENTRY; + + CDEBUG(D_INFO, "Stopping threads for service %s\n", + svcpt->scp_service->srv_name); + + INIT_LIST_HEAD(&zombie); + spin_lock(&svcpt->scp_lock); + /* let the thread know that we would like it to stop asap */ + list_for_each_entry(thread, &svcpt->scp_threads, t_link) { + CDEBUG(D_INFO, "Stopping thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + thread_add_flags(thread, SVC_STOPPING); + } + + wake_up_all(&svcpt->scp_waitq); + + while 
(!list_empty(&svcpt->scp_threads)) { + thread = list_entry(svcpt->scp_threads.next, + struct ptlrpc_thread, t_link); + if (thread_is_stopped(thread)) { + list_del(&thread->t_link); + list_add(&thread->t_link, &zombie); + continue; + } + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), &lwi); + + spin_lock(&svcpt->scp_lock); + } + + spin_unlock(&svcpt->scp_lock); + + while (!list_empty(&zombie)) { + thread = list_entry(zombie.next, + struct ptlrpc_thread, t_link); + list_del(&thread->t_link); + OBD_FREE_PTR(thread); + } + EXIT; +} + +/** + * Stops all threads of a particular service \a svc + */ +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + ENTRY; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + ptlrpc_svcpt_stop_threads(svcpt); + } + + EXIT; +} + +int ptlrpc_start_threads(struct ptlrpc_service *svc) +{ + int rc = 0; + int i; + int j; + ENTRY; + + /* We require 2 threads min, see note in ptlrpc_server_handle_request */ + LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); + + for (i = 0; i < svc->srv_ncpts; i++) { + for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { + rc = ptlrpc_start_thread(svc->srv_parts[i], 1); + if (rc == 0) + continue; + + if (rc != -EMFILE) + goto failed; + /* We have enough threads, don't start more. b=15759 */ + break; + } + } + + RETURN(0); + failed: + CERROR("cannot start %s thread #%d_%d: rc %d\n", + svc->srv_thread_name, i, j, rc); + ptlrpc_stop_all_threads(svc); + RETURN(rc); +} + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) +{ + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + struct ptlrpc_service *svc; + struct task_struct *task; + int rc; + ENTRY; + + LASSERT(svcpt != NULL); + + svc = svcpt->scp_service; + + CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", + svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, + svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); + + again: + if (unlikely(svc->srv_is_stopping)) + RETURN(-ESRCH); + + if (!ptlrpc_threads_increasable(svcpt) || + (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && + svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) + RETURN(-EMFILE); + + OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt); + if (thread == NULL) + RETURN(-ENOMEM); + init_waitqueue_head(&thread->t_ctl_waitq); + + spin_lock(&svcpt->scp_lock); + if (!ptlrpc_threads_increasable(svcpt)) { + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + RETURN(-EMFILE); + } + + if (svcpt->scp_nthrs_starting != 0) { + /* serialize starting because some modules (obdfilter) + * might require unique and contiguous t_id */ + LASSERT(svcpt->scp_nthrs_starting == 1); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + if (wait) { + CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + schedule(); + goto again; + } + + CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + RETURN(-EAGAIN); + } + + svcpt->scp_nthrs_starting++; + thread->t_id = svcpt->scp_thr_nextid++; + thread_add_flags(thread, SVC_STARTING); + thread->t_svcpt = svcpt; + + list_add(&thread->t_link, &svcpt->scp_threads); + spin_unlock(&svcpt->scp_lock); + + if (svcpt->scp_cpt >= 0) { + snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d", + 
svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); + } else { + snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d", + svc->srv_thread_name, thread->t_id); + } + + CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); + task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("cannot start thread '%s': rc = %d\n", + thread->t_name, rc); + spin_lock(&svcpt->scp_lock); + --svcpt->scp_nthrs_starting; + if (thread_is_stopping(thread)) { + /* this ptlrpc_thread is being hanled + * by ptlrpc_svcpt_stop_threads now + */ + thread_add_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + } else { + list_del(&thread->t_link); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + } + RETURN(rc); + } + + if (!wait) + RETURN(0); + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + rc = thread_is_stopped(thread) ? thread->t_id : 0; + RETURN(rc); +} + +int ptlrpc_hr_init(void) +{ + struct ptlrpc_hr_partition *hrp; + struct ptlrpc_hr_thread *hrt; + int rc; + int cpt; + int i; + int weight; + ENTRY; + + memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); + ptlrpc_hr.hr_cpt_table = cfs_cpt_table; + + ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, + sizeof(*hrp)); + if (ptlrpc_hr.hr_partitions == NULL) + RETURN(-ENOMEM); + + init_waitqueue_head(&ptlrpc_hr.hr_waitq); + + weight = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); + + cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) { + hrp->hrp_cpt = cpt; + + atomic_set(&hrp->hrp_nstarted, 0); + atomic_set(&hrp->hrp_nstopped, 0); + + hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, cpt); + hrp->hrp_nthrs /= weight; + if (hrp->hrp_nthrs == 0) + hrp->hrp_nthrs = 1; + + OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, cpt, + hrp->hrp_nthrs * sizeof(*hrt)); + if (hrp->hrp_thrs == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < hrp->hrp_nthrs; i++) { + hrt = &hrp->hrp_thrs[i]; + + hrt->hrt_id = i; + hrt->hrt_partition = hrp; + init_waitqueue_head(&hrt->hrt_waitq); + spin_lock_init(&hrt->hrt_lock); + INIT_LIST_HEAD(&hrt->hrt_queue); + } + } + + rc = ptlrpc_start_hr_threads(); +out: + if (rc != 0) + ptlrpc_hr_fini(); + RETURN(rc); +} + +void ptlrpc_hr_fini(void) +{ + struct ptlrpc_hr_partition *hrp; + int cpt; + + if (ptlrpc_hr.hr_partitions == NULL) + return; + + ptlrpc_stop_hr_threads(); + + cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs != NULL) { + OBD_FREE(hrp->hrp_thrs, + hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0])); + } + } + + cfs_percpt_free(ptlrpc_hr.hr_partitions); + ptlrpc_hr.hr_partitions = NULL; +} + + +/** + * Wait until all already scheduled replies are processed. + */ +static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) +{ + while (1) { + int rc; + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10), + NULL, NULL); + + rc = l_wait_event(svcpt->scp_waitq, + atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi); + if (rc == 0) + break; + CWARN("Unexpectedly long timeout %s %p\n", + svcpt->scp_service->srv_name, svcpt->scp_service); + } +} + +static void +ptlrpc_service_del_atimer(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + /* early disarm AT timer... 
*/ + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + del_timer(&svcpt->scp_at_timer); + } +} + +static void +ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct l_wait_info lwi; + int rc; + int i; + + /* All history will be culled when the next request buffer is + * freed in ptlrpc_service_purge_all() */ + svc->srv_hist_nrqbds_cpt_max = 0; + + rc = LNetClearLazyPortal(svc->srv_req_portal); + LASSERT(rc == 0); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Unlink all the request buffers. This forces a 'final' + * event with its 'unlink' flag set for each posted rqbd */ + list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, + rqbd_list) { + rc = LNetMDUnlink(rqbd->rqbd_md_h); + LASSERT(rc == 0 || rc == -ENOENT); + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Wait for the network to release any buffers + * it's currently filling */ + spin_lock(&svcpt->scp_lock); + while (svcpt->scp_nrqbds_posted != 0) { + spin_unlock(&svcpt->scp_lock); + /* Network access will complete in finite time but + * the HUGE timeout lets us CWARN for visibility + * of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(svcpt->scp_waitq, + svcpt->scp_nrqbds_posted == 0, &lwi); + if (rc == -ETIMEDOUT) { + CWARN("Service %s waiting for " + "request buffers\n", + svcpt->scp_service->srv_name); + } + spin_lock(&svcpt->scp_lock); + } + spin_unlock(&svcpt->scp_lock); + } +} + +static void +ptlrpc_service_purge_all(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct ptlrpc_request *req; + struct ptlrpc_reply_state *rs; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + spin_lock(&svcpt->scp_rep_lock); + while (!list_empty(&svcpt->scp_rep_active)) { + rs = list_entry(svcpt->scp_rep_active.next, + struct ptlrpc_reply_state, rs_list); + spin_lock(&rs->rs_lock); + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + } + spin_unlock(&svcpt->scp_rep_lock); + + /* purge the request queue. NB No new replies (rqbds + * all unlinked) and no service threads, so I'm the only + * thread noodling the request queue now */ + while (!list_empty(&svcpt->scp_req_incoming)) { + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + + list_del(&req->rq_list); + svcpt->scp_nreqs_incoming--; + ptlrpc_server_finish_request(svcpt, req); + } + + while (ptlrpc_server_request_pending(svcpt, true)) { + req = ptlrpc_server_request_get(svcpt, true); + ptlrpc_server_finish_active_request(svcpt, req); + } + + LASSERT(list_empty(&svcpt->scp_rqbd_posted)); + LASSERT(svcpt->scp_nreqs_incoming == 0); + LASSERT(svcpt->scp_nreqs_active == 0); + /* history should have been culled by + * ptlrpc_server_finish_request */ + LASSERT(svcpt->scp_hist_nrqbds == 0); + + /* Now free all the request buffers since nothing + * references them any more... 
*/ + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + ptlrpc_free_rqbd(rqbd); + } + ptlrpc_wait_replies(svcpt); + + while (!list_empty(&svcpt->scp_rep_idle)) { + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, + rs_list); + list_del(&rs->rs_list); + OBD_FREE_LARGE(rs, svc->srv_max_reply_size); + } + } +} + +static void +ptlrpc_service_free(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_at_array *array; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* In case somebody rearmed this in the meantime */ + del_timer(&svcpt->scp_at_timer); + array = &svcpt->scp_at_array; + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, + sizeof(__u32) * array->paa_size); + array->paa_reqs_count = NULL; + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) + OBD_FREE_PTR(svcpt); + + if (svc->srv_cpts != NULL) + cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); + + OBD_FREE(svc, offsetof(struct ptlrpc_service, + srv_parts[svc->srv_ncpts])); +} + +int ptlrpc_unregister_service(struct ptlrpc_service *service) +{ + ENTRY; + + CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); + + service->srv_is_stopping = 1; + + mutex_lock(&ptlrpc_all_services_mutex); + list_del_init(&service->srv_list); + mutex_unlock(&ptlrpc_all_services_mutex); + + ptlrpc_service_del_atimer(service); + ptlrpc_stop_all_threads(service); + + ptlrpc_service_unlink_rqbd(service); + ptlrpc_service_purge_all(service); + ptlrpc_service_nrs_cleanup(service); + + ptlrpc_lprocfs_unregister_service(service); + ptlrpc_sysfs_unregister_service(service); + + ptlrpc_service_free(service); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_unregister_service); + +/** + * Returns 0 if the service is healthy. + * + * Right now, it just checks to make sure that requests aren't languishing + * in the queue. We'll use this health check to govern whether a node needs + * to be shot, so it's intentionally non-aggressive. */ +static int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request *request = NULL; + struct timespec64 right_now; + struct timespec64 timediff; + + ktime_get_real_ts64(&right_now); + + spin_lock(&svcpt->scp_req_lock); + /* How long has the next entry been waiting? */ + if (ptlrpc_server_high_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, true); + else if (ptlrpc_server_normal_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, false); + + if (request == NULL) { + spin_unlock(&svcpt->scp_req_lock); + return 0; + } + + timediff = timespec64_sub(right_now, request->rq_arrival_time); + spin_unlock(&svcpt->scp_req_lock); + + if ((timediff.tv_sec) > + (AT_OFF ? 
obd_timeout * 3 / 2 : at_max)) { + CERROR("%s: unhealthy - request has been waiting %llds\n", + svcpt->scp_service->srv_name, (s64)timediff.tv_sec); + return -1; + } + + return 0; +} + +int +ptlrpc_service_health_check(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + if (svc == NULL) + return 0; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + int rc = ptlrpc_svcpt_health_check(svcpt); + + if (rc != 0) + return rc; + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_service_health_check); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c new file mode 100644 index 0000000000000..7f9fb09ee4ffd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c @@ -0,0 +1,46 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#ifdef CONFIG_FS_POSIX_ACL +# include +# include +#endif /* CONFIG_FS_POSIX_ACL */ + +#include +#include +#include +#include +#include +#include + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c new file mode 100644 index 0000000000000..b4e5d7430d949 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -0,0 +1,5659 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#ifdef CONFIG_FS_POSIX_ACL +# include +# include +#endif /* CONFIG_FS_POSIX_ACL */ + +#include +#include +#include +#include +#include +#include + + +void lustre_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + */ + + /* Constants... */ + LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", + (long long)PTL_RPC_MSG_REQUEST); + LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n", + (long long)PTL_RPC_MSG_ERR); + LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n", + (long long)PTL_RPC_MSG_REPLY); + LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n", + MDS_DIR_END_OFF); + LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n", + DEAD_HANDLE_MAGIC); + CLASSERT(MTI_NAME_MAXLEN == 64); + LASSERTF(OST_REPLY == 0, "found %lld\n", + (long long)OST_REPLY); + LASSERTF(OST_GETATTR == 1, "found %lld\n", + (long long)OST_GETATTR); + LASSERTF(OST_SETATTR == 2, "found %lld\n", + (long long)OST_SETATTR); + LASSERTF(OST_READ == 3, "found %lld\n", + (long long)OST_READ); + LASSERTF(OST_WRITE == 4, "found %lld\n", + (long long)OST_WRITE); + LASSERTF(OST_CREATE == 5, "found %lld\n", + (long long)OST_CREATE); + LASSERTF(OST_DESTROY == 6, "found %lld\n", + (long long)OST_DESTROY); + LASSERTF(OST_GET_INFO == 7, "found %lld\n", + (long long)OST_GET_INFO); + LASSERTF(OST_CONNECT == 8, "found %lld\n", + (long long)OST_CONNECT); + LASSERTF(OST_DISCONNECT == 9, "found %lld\n", + (long long)OST_DISCONNECT); + LASSERTF(OST_PUNCH == 10, "found %lld\n", + (long long)OST_PUNCH); + LASSERTF(OST_OPEN == 11, "found %lld\n", + (long long)OST_OPEN); + LASSERTF(OST_CLOSE == 12, "found %lld\n", + (long long)OST_CLOSE); + LASSERTF(OST_STATFS == 13, "found %lld\n", + (long long)OST_STATFS); + LASSERTF(OST_SYNC == 16, "found %lld\n", + (long long)OST_SYNC); + LASSERTF(OST_SET_INFO == 17, "found %lld\n", + (long long)OST_SET_INFO); + LASSERTF(OST_QUOTACHECK == 18, "found %lld\n", + (long long)OST_QUOTACHECK); + LASSERTF(OST_QUOTACTL == 19, "found %lld\n", + (long long)OST_QUOTACTL); + LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n", + (long long)OST_QUOTA_ADJUST_QUNIT); + LASSERTF(OST_LADVISE == 21, "found %lld\n", + (long long)OST_LADVISE); + LASSERTF(OST_LAST_OPC == 22, "found %lld\n", + (long long)OST_LAST_OPC); + LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + OBD_OBJECT_EOF); + LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n", + (long long)OST_MIN_PRECREATE); + LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n", + (long long)OST_MAX_PRECREATE); + LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_INIT); + LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_MASK); + LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n", + (long long)MDS_FIRST_OPC); + LASSERTF(MDS_GETATTR == 33, "found %lld\n", + (long long)MDS_GETATTR); + LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n", + (long long)MDS_GETATTR_NAME); + LASSERTF(MDS_CLOSE == 35, "found %lld\n", + (long long)MDS_CLOSE); + LASSERTF(MDS_REINT == 36, "found %lld\n", + (long long)MDS_REINT); + LASSERTF(MDS_READPAGE == 37, "found %lld\n", + (long long)MDS_READPAGE); + LASSERTF(MDS_CONNECT == 38, "found %lld\n", + (long long)MDS_CONNECT); + LASSERTF(MDS_DISCONNECT == 39, "found %lld\n", + (long long)MDS_DISCONNECT); + LASSERTF(MDS_GET_ROOT == 40, "found %lld\n", + (long long)MDS_GET_ROOT); + LASSERTF(MDS_STATFS == 41, "found 
%lld\n", + (long long)MDS_STATFS); + LASSERTF(MDS_PIN == 42, "found %lld\n", + (long long)MDS_PIN); + LASSERTF(MDS_UNPIN == 43, "found %lld\n", + (long long)MDS_UNPIN); + LASSERTF(MDS_SYNC == 44, "found %lld\n", + (long long)MDS_SYNC); + LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n", + (long long)MDS_DONE_WRITING); + LASSERTF(MDS_SET_INFO == 46, "found %lld\n", + (long long)MDS_SET_INFO); + LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n", + (long long)MDS_QUOTACHECK); + LASSERTF(MDS_QUOTACTL == 48, "found %lld\n", + (long long)MDS_QUOTACTL); + LASSERTF(MDS_GETXATTR == 49, "found %lld\n", + (long long)MDS_GETXATTR); + LASSERTF(MDS_SETXATTR == 50, "found %lld\n", + (long long)MDS_SETXATTR); + LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n", + (long long)MDS_WRITEPAGE); + LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n", + (long long)MDS_IS_SUBDIR); + LASSERTF(MDS_GET_INFO == 53, "found %lld\n", + (long long)MDS_GET_INFO); + LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n", + (long long)MDS_HSM_STATE_GET); + LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n", + (long long)MDS_HSM_STATE_SET); + LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n", + (long long)MDS_HSM_ACTION); + LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n", + (long long)MDS_HSM_PROGRESS); + LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n", + (long long)MDS_HSM_REQUEST); + LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n", + (long long)MDS_HSM_CT_REGISTER); + LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n", + (long long)MDS_HSM_CT_UNREGISTER); + LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", + (long long)MDS_SWAP_LAYOUTS); + LASSERTF(MDS_RMFID == 62, "found %lld\n", + (long long)MDS_RMFID); + LASSERTF(MDS_LAST_OPC == 63, "found %lld\n", + (long long)MDS_LAST_OPC); + LASSERTF(REINT_SETATTR == 1, "found %lld\n", + (long long)REINT_SETATTR); + LASSERTF(REINT_CREATE == 2, "found %lld\n", + (long long)REINT_CREATE); + LASSERTF(REINT_LINK == 3, "found %lld\n", + (long long)REINT_LINK); + LASSERTF(REINT_UNLINK == 4, "found %lld\n", + (long long)REINT_UNLINK); + LASSERTF(REINT_RENAME == 5, "found %lld\n", + (long long)REINT_RENAME); + LASSERTF(REINT_OPEN == 6, "found %lld\n", + (long long)REINT_OPEN); + LASSERTF(REINT_SETXATTR == 7, "found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_RMENTRY == 8, "found %lld\n", + (long long)REINT_RMENTRY); + LASSERTF(REINT_MIGRATE == 9, "found %lld\n", + (long long)REINT_MIGRATE); + LASSERTF(REINT_MAX == 11, "found %lld\n", + (long long)REINT_MAX); + LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)DISP_IT_EXECD); + LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_EXECD); + LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_NEG); + LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_POS); + LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_CREATE); + LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_OPEN); + LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_COMPLETE); + LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_OPEN_REF); + LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_CREATE_REF); + LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_LOCK); + LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", + (long long)MDS_STATUS_CONN); + LASSERTF(MDS_STATUS_LOV == 2, 
"found %lld\n", + (long long)MDS_STATUS_LOV); + LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MODE); + LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_UID); + LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_GID); + LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_SIZE); + LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME); + LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME); + LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME); + LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME_SET); + LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME_SET); + LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FORCE); + LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATTR_FLAG); + LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SUID); + LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SGID); + LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME_SET); + LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FROM_OPEN); + LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_BLOCKS); + LASSERTF(MDS_ATTR_PROJID == 0x0000000000010000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_PROJID); + LASSERTF(MDS_ATTR_LSIZE == 0x0000000000020000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LSIZE); + LASSERTF(MDS_ATTR_LBLOCKS == 0x0000000000040000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LBLOCKS); + LASSERTF(MDS_ATTR_OVERRIDE == 0x0000000002000000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_OVERRIDE); + LASSERTF(FLD_QUERY == 900, "found %lld\n", + (long long)FLD_QUERY); + LASSERTF(FLD_READ == 901, "found %lld\n", + (long long)FLD_READ); + LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", + (long long)FLD_FIRST_OPC); + LASSERTF(FLD_LAST_OPC == 902, "found %lld\n", + (long long)FLD_LAST_OPC); + LASSERTF(SEQ_QUERY == 700, "found %lld\n", + (long long)SEQ_QUERY); + LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n", + (long long)SEQ_FIRST_OPC); + LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n", + (long long)SEQ_LAST_OPC); + LASSERTF(LFSCK_NOTIFY == 1101, "found %lld\n", + (long long)LFSCK_NOTIFY); + LASSERTF(LFSCK_QUERY == 1102, "found %lld\n", + (long long)LFSCK_QUERY); + LASSERTF(LFSCK_FIRST_OPC == 1101, "found %lld\n", + (long long)LFSCK_FIRST_OPC); + LASSERTF(LFSCK_LAST_OPC == 1103, "found %lld\n", + (long long)LFSCK_LAST_OPC); + LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n", + (long long)SEQ_ALLOC_SUPER); + LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n", + (long long)SEQ_ALLOC_META); + LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n", + (long long)LDLM_ENQUEUE); + LASSERTF(LDLM_CONVERT == 102, "found %lld\n", + (long long)LDLM_CONVERT); + LASSERTF(LDLM_CANCEL == 103, "found %lld\n", + (long long)LDLM_CANCEL); + LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n", + (long long)LDLM_BL_CALLBACK); + LASSERTF(LDLM_CP_CALLBACK == 
105, "found %lld\n", + (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n", + (long long)LDLM_GL_CALLBACK); + LASSERTF(LDLM_SET_INFO == 107, "found %lld\n", + (long long)LDLM_SET_INFO); + LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n", + (long long)LDLM_LAST_OPC); + LASSERTF(LCK_MINMODE == 0, "found %lld\n", + (long long)LCK_MINMODE); + LASSERTF(LCK_EX == 1, "found %lld\n", + (long long)LCK_EX); + LASSERTF(LCK_PW == 2, "found %lld\n", + (long long)LCK_PW); + LASSERTF(LCK_PR == 4, "found %lld\n", + (long long)LCK_PR); + LASSERTF(LCK_CW == 8, "found %lld\n", + (long long)LCK_CW); + LASSERTF(LCK_CR == 16, "found %lld\n", + (long long)LCK_CR); + LASSERTF(LCK_NL == 32, "found %lld\n", + (long long)LCK_NL); + LASSERTF(LCK_GROUP == 64, "found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_COS == 128, "found %lld\n", + (long long)LCK_COS); + LASSERTF(LCK_MAXMODE == 129, "found %lld\n", + (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 8, "found %lld\n", + (long long)LCK_MODE_NUM); + CLASSERT(LDLM_PLAIN == 10); + CLASSERT(LDLM_EXTENT == 11); + CLASSERT(LDLM_FLOCK == 12); + CLASSERT(LDLM_IBITS == 13); + CLASSERT(LDLM_MAX_TYPE == 14); + CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0); + CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1); + LASSERTF(OUT_UPDATE == 1000, "found %lld\n", + (long long)OUT_UPDATE); + LASSERTF(OUT_UPDATE_LAST_OPC == 1001, "found %lld\n", + (long long)OUT_UPDATE_LAST_OPC); + CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2); + CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3); + CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3); + CLASSERT(LQUOTA_TYPE_USR == 0); + CLASSERT(LQUOTA_TYPE_GRP == 1); + CLASSERT(LQUOTA_RES_MD == 1); + CLASSERT(LQUOTA_RES_DT == 2); + LASSERTF(OBD_PING == 400, "found %lld\n", + (long long)OBD_PING); + LASSERTF(OBD_IDX_READ == 403, "found %lld\n", + (long long)OBD_IDX_READ); + LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", + (long long)OBD_LAST_OPC); + LASSERTF(QUOTA_DQACQ == 601, "found %lld\n", + (long long)QUOTA_DQACQ); + LASSERTF(QUOTA_DQREL == 602, "found %lld\n", + (long long)QUOTA_DQREL); + LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n", + (long long)QUOTA_LAST_OPC); + LASSERTF(MGS_CONNECT == 250, "found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, "found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, "found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, "found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n", + (long long)MGS_TARGET_DEL); + LASSERTF(MGS_SET_INFO == 255, "found %lld\n", + (long long)MGS_SET_INFO); + LASSERTF(MGS_CONFIG_READ == 256, "found %lld\n", + (long long)MGS_CONFIG_READ); + LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", + (long long)MGS_LAST_OPC); + LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", + (long long)SEC_CTX_INIT); + LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n", + (long long)SEC_CTX_INIT_CONT); + LASSERTF(SEC_CTX_FINI == 803, "found %lld\n", + (long long)SEC_CTX_FINI); + LASSERTF(SEC_LAST_OPC == 804, "found %lld\n", + (long long)SEC_LAST_OPC); + /* Sizes and Offsets */ + + /* Checks for struct obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(struct obd_uuid)); + + /* Checks for struct lu_seq_range */ + LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_seq_range)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_start)); + 
LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_end)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_index)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_flags)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags)); + LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n", + (long long)LU_SEQ_RANGE_MDT); + LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n", + (long long)LU_SEQ_RANGE_OST); + + /* Checks for struct lustre_mdt_attrs */ + LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_mdt_attrs)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); + LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAC_HSM); + LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAC_NOT_IN_OI); + LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAC_FID_ON_OST); + LASSERTF(LMAC_STRIPE_INFO == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LMAC_STRIPE_INFO); + LASSERTF(LMAC_COMP_INFO == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LMAC_COMP_INFO); + LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAI_RELEASED); + LASSERTF(LMAI_AGENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LMAI_AGENT); + LASSERTF(LMAI_REMOTE_PARENT == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAI_REMOTE_PARENT); + LASSERTF(LMAI_STRIPED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAI_STRIPED); + LASSERTF(LMAI_ORPHAN == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LMAI_ORPHAN); + + /* Checks for struct lustre_ost_attrs */ + LASSERTF((int)sizeof(struct lustre_ost_attrs) == 64, "found %lld\n", + (long long)(int)sizeof(struct lustre_ost_attrs)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_lma) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_lma)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs 
*)0)->loa_lma) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_lma)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_parent_fid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_parent_fid)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_parent_fid)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_stripe_size) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_stripe_size)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_stripe_size)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_id) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_id)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_id)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_start) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_start)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_start)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_end) == 56, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_end)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_end)); + LASSERTF(OUT_CREATE == 1, "found %lld\n", + (long long)OUT_CREATE); + LASSERTF(OUT_DESTROY == 2, "found %lld\n", + (long long)OUT_DESTROY); + LASSERTF(OUT_REF_ADD == 3, "found %lld\n", + (long long)OUT_REF_ADD); + LASSERTF(OUT_REF_DEL == 4, "found %lld\n", + (long long)OUT_REF_DEL); + LASSERTF(OUT_ATTR_SET == 5, "found %lld\n", + (long long)OUT_ATTR_SET); + LASSERTF(OUT_ATTR_GET == 6, "found %lld\n", + (long long)OUT_ATTR_GET); + LASSERTF(OUT_XATTR_SET == 7, "found %lld\n", + (long long)OUT_XATTR_SET); + LASSERTF(OUT_XATTR_GET == 8, "found %lld\n", + (long long)OUT_XATTR_GET); + LASSERTF(OUT_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OUT_INDEX_LOOKUP); + LASSERTF(OUT_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OUT_INDEX_LOOKUP); + LASSERTF(OUT_INDEX_INSERT == 10, "found %lld\n", + (long long)OUT_INDEX_INSERT); + LASSERTF(OUT_INDEX_DELETE == 11, "found %lld\n", + (long long)OUT_INDEX_DELETE); + LASSERTF(OUT_WRITE == 12, "found %lld\n", + (long long)OUT_WRITE); + LASSERTF(OUT_XATTR_DEL == 13, "found %lld\n", + (long long)OUT_XATTR_DEL); + LASSERTF(OUT_PUNCH == 14, "found %lld\n", + (long long)OUT_PUNCH); + LASSERTF(OUT_READ == 15, "found %lld\n", + (long long)OUT_READ); + LASSERTF(OUT_NOOP == 16, "found %lld\n", + (long long)OUT_NOOP); + LASSERTF(OUT_XATTR_LIST == 17, "found %lld\n", + (long long)OUT_XATTR_LIST); + + /* Checks for struct lustre_som_attrs */ + LASSERTF((int)sizeof(struct lustre_som_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_som_attrs)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_valid)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid)); + 
LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_reserved) == 2, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_reserved)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved) == 6, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_size)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_size)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_blocks) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_blocks)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks)); + + /* Checks for struct hsm_attrs */ + LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_attrs)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_compat)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_flags)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver)); + + /* Checks for struct ost_id */ + LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", + (long long)(int)sizeof(struct ost_id)); + LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_id, oi)); + LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ost_id *)0)->oi)); + LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n", + (long long)LUSTRE_FID_INIT_OID); + LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n", + (long long)FID_SEQ_OST_MDT0); + LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n", + (long long)FID_SEQ_LLOG); + LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n", + (long long)FID_SEQ_ECHO); + LASSERTF(FID_SEQ_UNUSED_START == 3, "found %lld\n", + (long long)FID_SEQ_UNUSED_START); + LASSERTF(FID_SEQ_UNUSED_END == 9, "found %lld\n", + (long long)FID_SEQ_UNUSED_END); + LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n", + (long long)FID_SEQ_RSVD); + LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", + (long long)FID_SEQ_IGIF); + LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IGIF_MAX); + LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF); + LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", + (long 
long)FID_SEQ_IDIF_MAX); + LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_START); + LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_FILE); + LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_DOT_LUSTRE); + LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_SPECIAL); + LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA); + LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA_GLB); + LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_ROOT); + LASSERTF(FID_SEQ_LAYOUT_RBTREE == 0x0000000200000008ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LAYOUT_RBTREE); + LASSERTF(FID_SEQ_UPDATE_LOG == 0x0000000200000009ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_UPDATE_LOG); + LASSERTF(FID_SEQ_UPDATE_LOG_DIR == 0x000000020000000aULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_UPDATE_LOG_DIR); + LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_NORMAL); + LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOV_DEFAULT); + LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_SPECIAL_BFL); + LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE); + LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_OBF); + + /* Checks for struct lu_dirent */ + LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_dirent)); + LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_fid)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid)); + LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_hash)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash)); + LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_reclen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_namelen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_attrs)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs)); + LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_name[0])); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); + LASSERTF(LUDA_FID == 0x00000001UL, "found 
0x%.8xUL\n", + (unsigned)LUDA_FID); + LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LUDA_TYPE); + LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LUDA_64BITHASH); + + /* Checks for struct luda_type */ + LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", + (long long)(int)sizeof(struct luda_type)); + LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct luda_type, lt_type)); + LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct luda_type *)0)->lt_type)); + + /* Checks for struct lu_dirpage */ + LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_dirpage)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_flags)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_pad0)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0])); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0])); + LASSERTF(LDF_EMPTY == 1, "found %lld\n", + (long long)LDF_EMPTY); + LASSERTF(LDF_COLLIDE == 2, "found %lld\n", + (long long)LDF_COLLIDE); + LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n", + (long long)LU_PAGE_SIZE); + /* Checks for union lu_page */ + LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", + (long long)(int)sizeof(union lu_page)); + + /* Checks for struct lu_ladvise */ + LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_ladvise)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_advice)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value1)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value2)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_start)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_end)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value3)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value4)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long long)LU_LADVISE_WILLREAD); + LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", + (long long)LU_LADVISE_DONTNEED); + + /* Checks for struct ladvise_hdr */ + LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", + (long long)(int)sizeof(struct ladvise_hdr)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_count)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + 
LASSERTF(LF_ASYNC == 1, "found %lld\n", + (long long)LF_ASYNC); + LASSERTF(LADVISE_MAGIC == 450829536, "found %lld\n", + (long long)LADVISE_MAGIC); + + /* Checks for struct lustre_handle */ + LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", + (long long)(int)sizeof(struct lustre_handle)); + LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_handle, cookie)); + LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_handle *)0)->cookie)); + + /* Checks for struct lustre_msg_v2 */ + LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_msg_v2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_magic)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0bd00bd3UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2); + 
LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xd30bd00bUL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2_SWABBED); + + /* Checks for struct ptlrpc_body */ + LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", + (long long)(int)sizeof(struct ptlrpc_body_v3)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == 32, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_tag)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == 34, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv)); + CLASSERT(PTLRPC_NUM_VERSIONS == 4); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == 120, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_mbits)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == 128, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == 136, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == 144, "found 
%lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_2)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2)); + CLASSERT(LUSTRE_JOBID_SIZE == 32); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == (int)offsetof(struct ptlrpc_body_v2, pb_tag), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, 
pb_tag), (int)offsetof(struct ptlrpc_body_v2, pb_tag)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding0), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding0), (int)offsetof(struct ptlrpc_body_v2, pb_padding0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding1), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding1), (int)offsetof(struct ptlrpc_body_v2, pb_padding1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 
(int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == (int)offsetof(struct ptlrpc_body_v2, pb_mbits), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_mbits), (int)offsetof(struct ptlrpc_body_v2, pb_mbits)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_0), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0), (int)sizeof(((struct 
ptlrpc_body_v2 *)0)->pb_padding64_0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_1), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_2), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2)); + LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n", + (long long)MSG_PTLRPC_BODY_OFF); + LASSERTF(REQ_REC_OFF == 1, "found %lld\n", + (long long)REQ_REC_OFF); + LASSERTF(REPLY_REC_OFF == 1, "found %lld\n", + (long long)REPLY_REC_OFF); + LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREQ_OFF); + LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n", + (long long)DLM_REQ_REC_OFF); + LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n", + (long long)DLM_INTENT_IT_OFF); + LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n", + (long long)DLM_INTENT_REC_OFF); + LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREPLY_OFF); + LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n", + (long long)DLM_REPLY_REC_OFF); + LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", + (long long)MSG_PTLRPC_HEADER_OFF); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003UL, "found 0x%.8xUL\n", + (unsigned)PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MDS_VERSION); + LASSERTF(LUSTRE_OST_VERSION == 0x00030000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MGS_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", + (long long)MSGHDR_AT_SUPPORT); + LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", + (long long)MSGHDR_CKSUM_INCOMPAT18); + LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_RESENT); + LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_REPLAY); + LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_REQ_REPLAY_DONE); + LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_LOCK_REPLAY_DONE); + LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECOVERING); + LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECONNECT); + LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", + 
(unsigned)MSG_CONNECT_REPLAYABLE); + LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_LIBCLIENT); + LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_INITIAL); + LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_NEXT_VER); + LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_TRANSNO); + + /* Checks for struct obd_connect_data */ + LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", + (long long)(int)sizeof(struct obd_connect_data)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_version)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_index)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_blkbits) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_blkbits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_inobits) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_inobits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_tax_kb) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_tax_kb)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb)); + 
LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_max_blks) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_max_blks)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_group)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_instance)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxmodrpcs) == 72, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxmodrpcs)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs)); + LASSERTF((int)offsetof(struct obd_connect_data, padding0) == 74, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding0)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding0)); + LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 76, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding1)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags2) == 80, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags2)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2)); + 
LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding3)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3)); + LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding4)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4)); + LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding5)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5)); + LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding6)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6)); + LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding7)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7)); + LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding8)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8)); + LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding9)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingA)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingB)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingC)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingD)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingE)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) 
== 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingF)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF)); + LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS); + LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_LARGE_ACL == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LARGE_ACL); + LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRUNCLOCK); + LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRANSNO); + LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IBITS); + LASSERTF(OBD_CONNECT_BARRIER == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BARRIER); + LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ATTRFID); + LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NODEVOH); + LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT); + LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT_FORCE); + LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BRW_SIZE); + LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_QUOTA64); + LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_CAPA); + LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OSS_CAPA); + LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CANCELSET); + LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SOM); + LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_AT); + LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LRU_RESIZE); + LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_MDS); + LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REAL); + LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CHANGE_QS); + LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CKSUM); + LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FID); + LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VBR); + LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOV_V3); + LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n", + 
OBD_CONNECT_GRANT_SHRINK); + LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SKIP_ORPHAN); + LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAX_EASIZE); + LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FULL20); + LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LAYOUTLOCK); + LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_64BITHASH); + LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAXBYTES); + LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IMP_RECOV); + LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOBSTATS); + LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UMASK); + LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_EINPROGRESS); + LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_PARAM); + LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_OWNER); + LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LVB_TYPE); + LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NANOSEC_TIME); + LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LIGHTWEIGHT); + LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SHORTIO); + LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_PINGLESS); + LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_DEAD); + LASSERTF(OBD_CONNECT_OPEN_BY_FID == 0x20000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OPEN_BY_FID); + LASSERTF(OBD_CONNECT_LFSCK == 0x40000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LFSCK); + LASSERTF(OBD_CONNECT_UNLINK_CLOSE == 0x100000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UNLINK_CLOSE); + LASSERTF(OBD_CONNECT_MULTIMODRPCS == 0x200000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MULTIMODRPCS); + LASSERTF(OBD_CONNECT_DIR_STRIPE == 0x400000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_DIR_STRIPE); + LASSERTF(OBD_CONNECT_SUBTREE == 0x800000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SUBTREE); + LASSERTF(OBD_CONNECT_LOCKAHEAD_OLD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOCKAHEAD_OLD); + LASSERTF(OBD_CONNECT_BULK_MBITS == 0x2000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BULK_MBITS); + LASSERTF(OBD_CONNECT_OBDOPACK == 0x4000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OBDOPACK); + LASSERTF(OBD_CONNECT_FLAGS2 == 0x8000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLAGS2); + LASSERTF(OBD_CONNECT2_FILE_SECCTX == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FILE_SECCTX); + LASSERTF(OBD_CONNECT2_LOCKAHEAD == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCKAHEAD); + LASSERTF(OBD_CONNECT2_DIR_MIGRATE == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_DIR_MIGRATE); + LASSERTF(OBD_CONNECT2_FLR == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FLR); + LASSERTF(OBD_CONNECT2_WBC_INTENTS == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_WBC_INTENTS); + LASSERTF(OBD_CONNECT2_LOCK_CONVERT == 0x80ULL, "found 0x%.16llxULL\n", 
+ OBD_CONNECT2_LOCK_CONVERT); + LASSERTF(OBD_CONNECT2_ARCHIVE_ID_ARRAY == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ARCHIVE_ID_ARRAY); + LASSERTF(OBD_CONNECT2_SELINUX_POLICY == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_SELINUX_POLICY); + LASSERTF(OBD_CONNECT2_LSOM == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LSOM); + LASSERTF(OBD_CONNECT2_ASYNC_DISCARD == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ASYNC_DISCARD); + LASSERTF(OBD_CONNECT2_ENCRYPT == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ENCRYPT); + LASSERTF(OBD_CONNECT2_FIDMAP == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FIDMAP); + LASSERTF(OBD_CONNECT2_GETATTR_PFID == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_GETATTR_PFID); + LASSERTF(OBD_CONNECT2_MDLL_BYPASS == OBD_CONNECT2_MDLL_BYPASS, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL_BYPASS); + LASSERTF(OBD_CONNECT2_MDLL == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL); + LASSERTF(OBD_CONNECT2_MDLL_AUTO_REFRESH == 0x2000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL_AUTO_REFRESH); + LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32); + LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_ADLER); + LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32C); + LASSERTF(OBD_CKSUM_RESERVED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_RESERVED); + LASSERTF(OBD_CKSUM_T10IP512 == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP512); + LASSERTF(OBD_CKSUM_T10IP4K == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP4K); + LASSERTF(OBD_CKSUM_T10CRC512 == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC512); + LASSERTF(OBD_CKSUM_T10CRC4K == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC4K); + LASSERTF(OBD_CKSUM_T10_TOP == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10_TOP); + + /* Checks for struct ost_layout */ + LASSERTF((int)sizeof(struct ost_layout) == 28, "found %lld\n", + (long long)(int)sizeof(struct ost_layout)); + LASSERTF((int)offsetof(struct ost_layout, ol_stripe_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_stripe_size)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_stripe_size)); + LASSERTF((int)offsetof(struct ost_layout, ol_stripe_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_stripe_count)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_stripe_count)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_start)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_start)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_end)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_end)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_id)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_id) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct ost_layout *)0)->ol_comp_id)); + + /* Checks for struct obdo */ + LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", + (long long)(int)sizeof(struct obdo)); + LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_valid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_valid)); + LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_oi)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_oi)); + LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_seq)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq)); + LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_size)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_size)); + LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mtime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mtime)); + LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_atime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_atime)); + LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ctime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ctime)); + LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blocks)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blocks)); + LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_grant)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_grant)); + LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blksize)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blksize)); + LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mode)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mode)); + LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid)); + LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid)); + LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n", + (long long)(int)offsetof(struct 
obdo, o_flags)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_flags)); + LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_nlink)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_nlink)); + LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_oid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid)); + LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_misc)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_misc)); + LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ioepoch)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch)); + LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_stripe_idx)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx)); + LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_ver)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver)); + LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_handle)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_handle)); + LASSERTF((int)offsetof(struct obdo, o_layout) == 136, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_layout)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_layout)); + LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_layout_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_layout_version)); + LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid_h)); + LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid_h)); + LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_data_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_data_version)); + LASSERTF((int)offsetof(struct obdo, o_projid) == 184, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_projid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo 
*)0)->o_projid)); + LASSERTF((int)offsetof(struct obdo, o_padding_4) == 188, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_4)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_4)); + LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_5)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_5)); + LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_6)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_6)); + LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n", + OBD_MD_FLID); + LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n", + OBD_MD_FLATIME); + LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMTIME); + LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCTIME); + LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n", + OBD_MD_FLSIZE); + LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLOCKS); + LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLKSZ); + LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODE); + LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n", + OBD_MD_FLTYPE); + LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUID); + LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGID); + LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFLAGS); + LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLNLINK); + LASSERTF(OBD_MD_FLPARENT == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLPARENT); + LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRDEV); + LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEASIZE); + LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n", + OBD_MD_LINKNAME); + LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLHANDLE); + LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSUM); + LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGROUP); + LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFID); + LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRANT); + LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDIREA); + LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUSRQUOTA); + LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRPQUOTA); + LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODEASIZE); + LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MDS); + LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MEA); + LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), "found 0x%.16llxULL\n", + OBD_MD_TSTATE); + LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTR); + LASSERTF(OBD_MD_FLXATTRLS == 
(0x0000002000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRLS); + LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRRM); + LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLACL); + LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCROSSREF); + LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGETATTRLOCK); + LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDATAVERSION); + LASSERTF(OBD_MD_CLOSE_INTENT_EXECED == (0x0020000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_CLOSE_INTENT_EXECED); + LASSERTF(OBD_MD_DEFAULT_MEA == (0x0040000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_DEFAULT_MEA); + LASSERTF(OBD_MD_FLOSTLAYOUT == (0x0080000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOSTLAYOUT); + LASSERTF(OBD_MD_FLPROJID == (0x0100000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLPROJID); + CLASSERT(OBD_FL_INLINEDATA == 0x00000001); + CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002); + CLASSERT(OBD_FL_DELORPHAN == 0x00000004); + CLASSERT(OBD_FL_NORPC == 0x00000008); + CLASSERT(OBD_FL_IDONLY == 0x00000010); + CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020); + CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040); + CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100); + CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200); + CLASSERT(OBD_FL_CREATE_CROW == 0x00000400); + CLASSERT(OBD_FL_SRVLOCK == 0x00000800); + CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000); + CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000); + CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000); + CLASSERT(OBD_FL_CKSUM_T10IP512 == 0x00005000); + CLASSERT(OBD_FL_CKSUM_T10IP4K == 0x00006000); + CLASSERT(OBD_FL_CKSUM_T10CRC512 == 0x00007000); + CLASSERT(OBD_FL_CKSUM_T10CRC4K == 0x00008000); + CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000); + CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000); + CLASSERT(OBD_FL_MMAP == 0x00040000); + CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000); + CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000); + CLASSERT(OBD_FL_FLUSH == 0x00200000); + CLASSERT(OBD_FL_SHORT_IO == 0x00400000); + + /* Checks for struct lov_ost_data_v1 */ + LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n", + (long long)(int)sizeof(struct lov_ost_data_v1)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx)); + + /* Checks for struct lov_mds_md_v1 */ + LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v1)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic)); + LASSERTF((int)sizeof(((struct 
lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V1 == (0x0BD10000 | 0x0BD0)); + + /* Checks for struct lov_mds_md_v3 */ + LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v3)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count)); + 
LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen)); + CLASSERT(LOV_MAXPOOLNAME == 15); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[15 + 1]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[15 + 1])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[15 + 1]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[15 + 1])); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V3 == (0x0BD30000 | 0x0BD0)); + LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID0); + LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID1); + LASSERTF(LOV_PATTERN_MDT == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_MDT); + LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_CMOBD); + + /* Checks for struct lov_comp_md_entry_v1 */ + LASSERTF((int)sizeof(struct lov_comp_md_entry_v1) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_comp_md_entry_v1)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_id)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_id)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_flags)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_flags)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_extent) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_extent)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_extent)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_offset)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_offset)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_size) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_size)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen) == 32, "found %lld\n", + (long long)(int)offsetof(struct 
lov_comp_md_entry_v1, lcme_layout_gen)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1)); + LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LCME_FL_INIT); + LASSERTF(LCME_FL_NEG == 0x80000000UL, "found 0x%.8xUL\n", + (unsigned)LCME_FL_NEG); + + /* Checks for struct lov_comp_md_v1 */ + LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_comp_md_v1)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_magic)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_magic)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_size) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_size)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_size)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_layout_gen) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_layout_gen)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_flags)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_flags)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_entry_count) == 14, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, 
lcm_padding2)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding2)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_entries[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entries[0])); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0])); + CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0)); + LASSERTF(LCM_FL_NONE == 0, "found %lld\n", + (long long)LCM_FL_NONE); + LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n", + (long long)LCM_FL_RDONLY); + LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n", + (long long)LCM_FL_WRITE_PENDING); + LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n", + (long long)LCM_FL_SYNC_PENDING); + + /* Checks for struct lmv_mds_md_v1 */ + LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n", + (long long)(int)sizeof(struct lmv_mds_md_v1)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_magic)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_hash_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_hash_type)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_layout_version) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_layout_version)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset) == 20, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 28, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[15]) == 55, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[15])); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[15]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[15])); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0]) == 56, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0])); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0]) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0])); + CLASSERT(LMV_MAGIC_V1 == 0x0CD20CD0); + CLASSERT(LMV_MAGIC_STRIPE == 0x0CD40CD0); + CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff); + CLASSERT(LMV_HASH_FLAG_LOST_LMV == 0x10000000); + CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000); + CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); + + /* Checks for struct obd_statfs */ + LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", + (long long)(int)sizeof(struct obd_statfs)); + LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_type)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_type)); + LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_blocks)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks)); + LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bfree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree)); + LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bavail)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); + LASSERTF((int)offsetof(struct obd_statfs, os_files) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_files)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_files) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_files)); + LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_ffree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree)); + LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fsid)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid)); + LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bsize)); + 
LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize)); + LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_namelen)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); + LASSERTF((int)offsetof(struct obd_statfs, os_maxbytes) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_maxbytes)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_maxbytes)); + LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_state)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_state)); + LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); + LASSERTF((int)offsetof(struct obd_statfs, os_granted) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_granted)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_granted) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_granted)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare3)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare4)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare5)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare6)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare7)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare8)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare9)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
obd_statfs *)0)->os_spare9)); + LASSERTF(OS_STATE_DEGRADED == 0x1, "found %lld\n", + (long long)OS_STATE_DEGRADED); + LASSERTF(OS_STATE_READONLY == 0x2, "found %lld\n", + (long long)OS_STATE_READONLY); + LASSERTF(OS_STATE_NOPRECREATE == 0x4, "found %lld\n", + (long long)OS_STATE_NOPRECREATE); + LASSERTF(OS_STATE_ENOSPC == 0x20, "found %lld\n", + (long long)OS_STATE_ENOSPC); + LASSERTF(OS_STATE_ENOINO == 0x40, "found %lld\n", + (long long)OS_STATE_ENOINO); + LASSERTF(OS_STATE_SUM == 0x100, "found %lld\n", + (long long)OS_STATE_SUM); + LASSERTF(OS_STATE_NONROT == 0x200, "found %lld\n", + (long long)OS_STATE_NONROT); + + /* Checks for struct obd_ioobj */ + LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_ioobj)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_oid)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt)); + LASSERTF(IOOBJ_MAX_BRW_BITS == 16, "found %lld\n", + (long long)IOOBJ_MAX_BRW_BITS); + + /* Checks for union lquota_id */ + LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n", + (long long)(int)sizeof(union lquota_id)); + + LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n", + (long long)QUOTABLOCK_BITS); + LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n", + (long long)QUOTABLOCK_SIZE); + + /* Checks for struct obd_quotactl */ + LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n", + (long long)(int)sizeof(struct obd_quotactl)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_cmd)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_type)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_id)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_stat)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl 
*)0)->qc_dqinfo)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqblk)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk)); + + /* Checks for struct obd_dqinfo */ + LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_dqinfo)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_flags)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_valid)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid)); + + /* Checks for struct obd_dqblk */ + LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(struct obd_dqblk)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curspace)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes)); + LASSERTF((int)offsetof(struct 
obd_dqblk, dqb_btime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_btime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_itime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_padding)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding)); + LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n", + Q_QUOTACHECK); + LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n", + Q_INITQUOTA); + LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n", + Q_GETOINFO); + LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n", + Q_GETOQUOTA); + LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n", + Q_FINVALIDATE); + + /* Checks for struct lquota_acct_rec */ + LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n", + (long long)(int)sizeof(struct lquota_acct_rec)); + LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, bspace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace)); + LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, ispace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace)); + + /* Checks for struct lquota_glb_rec */ + LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct lquota_glb_rec)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_time)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct 
lquota_glb_rec *)0)->qbr_granted)); + + /* Checks for struct lquota_slv_rec */ + LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n", + (long long)(int)sizeof(struct lquota_slv_rec)); + LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted)); + LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted)); + + /* Checks for struct idx_info */ + LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n", + (long long)(int)sizeof(struct idx_info)); + LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_magic)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_magic)); + LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_flags)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_flags)); + LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_count)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_count)); + LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad0)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0)); + LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_attrs)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs)); + LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_fid)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_fid)); + LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_version)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_version)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_start)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_end)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end)); + LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_keysize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize)); + LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_recsize)); + 
LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize)); + LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad1)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1)); + LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad2)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2)); + LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad3)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3)); + CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37); + + /* Checks for struct lu_idxpage */ + LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n", + (long long)(int)sizeof(struct lu_idxpage)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_magic)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_flags)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_nr)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_pad0)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0)); + CLASSERT(LIP_MAGIC == 0x8A6D6B6C); + LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n", + (long long)LIP_HDR_SIZE); + LASSERTF(II_FL_NOHASH == 1, "found %lld\n", + (long long)II_FL_NOHASH); + LASSERTF(II_FL_VARKEY == 2, "found %lld\n", + (long long)II_FL_VARKEY); + LASSERTF(II_FL_VARREC == 4, "found %lld\n", + (long long)II_FL_VARREC); + LASSERTF(II_FL_NONUNQ == 8, "found %lld\n", + (long long)II_FL_NONUNQ); + + /* Checks for struct niobuf_remote */ + LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n", + (long long)(int)sizeof(struct niobuf_remote)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_offset)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_offset)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_len)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_len)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_flags)); + 
LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_flags)); + LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n", + OBD_BRW_READ); + LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n", + OBD_BRW_WRITE); + LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n", + OBD_BRW_SYNC); + LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n", + OBD_BRW_CHECK); + LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n", + OBD_BRW_FROM_GRANT); + LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n", + OBD_BRW_GRANTED); + LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n", + OBD_BRW_NOCACHE); + LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n", + OBD_BRW_NOQUOTA); + LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n", + OBD_BRW_SRVLOCK); + LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n", + OBD_BRW_ASYNC); + LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n", + OBD_BRW_MEMALLOC); + LASSERTF(OBD_BRW_OVER_USRQUOTA == 0x1000, "found 0x%.8x\n", + OBD_BRW_OVER_USRQUOTA); + LASSERTF(OBD_BRW_OVER_GRPQUOTA == 0x2000, "found 0x%.8x\n", + OBD_BRW_OVER_GRPQUOTA); + LASSERTF(OBD_BRW_SOFT_SYNC == 0x4000, "found 0x%.8x\n", + OBD_BRW_SOFT_SYNC); + + /* Checks for struct ost_body */ + LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n", + (long long)(int)sizeof(struct ost_body)); + LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_body, oa)); + LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ost_body *)0)->oa)); + + /* Checks for struct ll_fid */ + LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n", + (long long)(int)sizeof(struct ll_fid)); + LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, id)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->id)); + LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, generation)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->generation)); + LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, f_type)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); + + LASSERTF(MDS_CROSS_REF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MDS_CROSS_REF); + LASSERTF(MDS_PERM_BYPASS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MDS_PERM_BYPASS); + LASSERTF(MDS_QUOTA_IGNORE == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MDS_QUOTA_IGNORE); + LASSERTF(MDS_KEEP_ORPHAN == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MDS_KEEP_ORPHAN); + LASSERTF(MDS_RECOV_OPEN == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MDS_RECOV_OPEN); + LASSERTF(MDS_DATA_MODIFIED == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)MDS_DATA_MODIFIED); + LASSERTF(MDS_CREATE_VOLATILE == 0x00000400UL, "found 0x%.8xUL\n", + (unsigned)MDS_CREATE_VOLATILE); + LASSERTF(MDS_OWNEROVERRIDE == 0x00000800UL, "found 0x%.8xUL\n", + (unsigned)MDS_OWNEROVERRIDE); + LASSERTF(MDS_HSM_RELEASE == 0x00001000UL, "found 0x%.8xUL\n", + (unsigned)MDS_HSM_RELEASE); + LASSERTF(MDS_CLOSE_LAYOUT_SWAP == 0x00004000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_SWAP); + LASSERTF(MDS_CLOSE_LAYOUT_MERGE == 0x00008000UL, "found 0x%.8xUL\n", + 
(unsigned)MDS_CLOSE_LAYOUT_MERGE); + LASSERTF(MDS_CLOSE_RESYNC_DONE == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_RESYNC_DONE); + LASSERTF(MDS_CLOSE_LAYOUT_SPLIT == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_SPLIT); + + /* Checks for struct mdt_body */ + LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", + (long long)(int)sizeof(struct mdt_body)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fid1) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fid1)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid1)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fid2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fid2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2)); + LASSERTF((int)offsetof(struct mdt_body, mbo_open_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_open_handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_open_handle)); + LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_valid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_valid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_size) == 48, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_size)); + LASSERTF((int)offsetof(struct mdt_body, mbo_mtime) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_mtime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mtime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_atime) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_atime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_atime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_ctime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_ctime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_ctime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_version) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_version)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_version)); + LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_t_state)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_t_state)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fsuid) == 104, "found %lld\n", + (long 
long)(int)offsetof(struct mdt_body, mbo_fsuid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsuid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fsgid) == 108, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fsgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsgid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_capability) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_capability)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_capability) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_capability)); + LASSERTF((int)offsetof(struct mdt_body, mbo_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_mode)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mode)); + LASSERTF((int)offsetof(struct mdt_body, mbo_uid) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_uid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_gid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_gid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_flags) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_flags)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_flags)); + LASSERTF((int)offsetof(struct mdt_body, mbo_rdev) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_rdev)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_rdev) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_rdev)); + LASSERTF((int)offsetof(struct mdt_body, mbo_nlink) == 136, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_nlink)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink)); + LASSERTF((int)offsetof(struct mdt_body, mbo_layout_gen) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_layout_gen)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_layout_gen)); + LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_suppgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_suppgid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_eadatasize) == 148, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_eadatasize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_eadatasize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_eadatasize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_aclsize) == 152, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_aclsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_aclsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_aclsize)); 
+ LASSERTF((int)offsetof(struct mdt_body, mbo_max_mdsize) == 156, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_max_mdsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_unused3) == 160, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_unused3)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused3)); + LASSERTF((int)offsetof(struct mdt_body, mbo_uid_h) == 164, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_uid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid_h)); + LASSERTF((int)offsetof(struct mdt_body, mbo_gid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_gid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid_h)); + LASSERTF((int)offsetof(struct mdt_body, mbo_projid) == 172, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_projid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_projid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_dom_size) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_dom_size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_size)); + LASSERTF((int)offsetof(struct mdt_body, mbo_dom_blocks) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_dom_blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_8) == 192, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_8)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_8)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_9) == 200, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_9)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_9)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_10) == 208, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_10)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_10) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_10)); + LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", + MDS_FMODE_CLOSED); + LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", + MDS_FMODE_EXEC); + LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", + MDS_OPEN_CREATED); + LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", + MDS_OPEN_CREAT); + LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", + MDS_OPEN_EXCL); + LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", + MDS_OPEN_TRUNC); + LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", + MDS_OPEN_APPEND); + LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", + MDS_OPEN_SYNC); 
+ LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", + MDS_OPEN_DIRECTORY); + LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", + MDS_OPEN_BY_FID); + LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", + MDS_OPEN_DELAY_CREATE); + LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", + MDS_OPEN_OWNEROVERRIDE); + LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", + MDS_OPEN_JOIN_FILE); + LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", + MDS_OPEN_LOCK); + LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_EA); + LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_OBJS); + LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NORESTORE); + LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NEWSTRIPE); + LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_VOLATILE); + LASSERTF(LUSTRE_SYNC_FL == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_SYNC_FL); + LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_IMMUTABLE_FL); + LASSERTF(LUSTRE_APPEND_FL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_APPEND_FL); + LASSERTF(LUSTRE_NODUMP_FL == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_NODUMP_FL); + LASSERTF(LUSTRE_NOATIME_FL == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_NOATIME_FL); + LASSERTF(LUSTRE_INDEX_FL == 0x00001000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_INDEX_FL); + LASSERTF(LUSTRE_ORPHAN_FL == 0x00002000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_ORPHAN_FL); + LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DIRSYNC_FL); + LASSERTF(LUSTRE_TOPDIR_FL == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_TOPDIR_FL); + LASSERTF(LUSTRE_DIRECTIO_FL == 0x00100000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DIRECTIO_FL); + LASSERTF(LUSTRE_INLINE_DATA_FL == 0x10000000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_INLINE_DATA_FL); + LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n", + MDS_INODELOCK_LOOKUP); + LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n", + MDS_INODELOCK_UPDATE); + LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n", + MDS_INODELOCK_OPEN); + LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", + MDS_INODELOCK_LAYOUT); + LASSERTF(MDS_INODELOCK_PERM == 0x000010, "found 0x%.8x\n", + MDS_INODELOCK_PERM); + LASSERTF(MDS_INODELOCK_XATTR == 0x000020, "found 0x%.8x\n", + MDS_INODELOCK_XATTR); + LASSERTF(MDS_INODELOCK_DOM == 0x000040, "found 0x%.8x\n", + MDS_INODELOCK_DOM); + + /* Checks for struct mdt_ioepoch */ + LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", + (long long)(int)sizeof(struct mdt_ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_open_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_open_handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused1) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_unused1)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1)); + LASSERTF((int)offsetof(struct mdt_ioepoch, 
mio_unused2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_unused2)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_padding)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_padding)); + + /* Checks for struct mdt_rec_setattr */ + LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setattr)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n", + (long 
long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode)); + LASSERTF((int)offsetof(struct 
mdt_rec_setattr, sa_bias) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_projid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_projid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_projid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5)); + + /* Checks for struct mdt_rec_create */ + LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_create)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n", + (long 
long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_open_handle_old) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_open_handle_old)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_time)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_rdev)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias)); + 
LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4)); + + /* Checks for struct mdt_rec_link */ + LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_link)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h)); + 
LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_time)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, 
lk_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9)); + + /* Checks for struct mdt_rec_unlink */ + LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_unlink)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) 
== 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_time)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct 
mdt_rec_unlink, ul_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9)); + + /* Checks for struct mdt_rec_rename */ + LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_rename)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h)); + 
LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_time)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 
120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8)); + + /* Checks for struct mdt_rec_setxattr */ + LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setxattr)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, 
sx_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, 
sx_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); + + /* Checks for struct mdt_rec_resync */ + LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_resync)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, 
rs_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4)); + 
LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_mirror_id) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_mirror_id)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 134, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9)); + + /* Checks for struct mdt_rec_reint */ + LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_reint)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n", + (long 
long)(int)offsetof(struct mdt_rec_reint, rr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_size)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n", + 
(long long)(int)offsetof(struct mdt_rec_reint, rr_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mirror_id) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mirror_id)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 134, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); + + /* Checks for struct lmv_desc */ + LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lmv_desc)); + LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lmv_desc 
*)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_3)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_4)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4)); + LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid)); + + /* Checks for struct lov_desc */ + LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lov_desc)); + LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, 
"found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_0)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0)); + LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid)); + CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C); + + /* Checks for struct ldlm_res_id */ + LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_res_id)); + CLASSERT(RES_NAME_SIZE == 4); + LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_res_id, name[4])); + LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4])); + + /* Checks for struct ldlm_extent */ + LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n", + (long long)(int)sizeof(struct ldlm_extent)); + LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, start)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->start)); + LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, end)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->end)); + LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, gid)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); + + /* Checks for struct ldlm_inodebits */ + LASSERTF((int)sizeof(struct ldlm_inodebits) == 
16, "found %lld\n", + (long long)(int)sizeof(struct ldlm_inodebits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, try_bits) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, try_bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->try_bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->try_bits)); + + /* Checks for struct ldlm_flock_wire */ + LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_flock_wire)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid)); + + /* Checks for struct ldlm_intent */ + LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_intent)); + LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_intent, opc)); + LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_intent *)0)->opc)); + LASSERTF(IT_OPEN == 1, "found %lld\n", + (long long)IT_OPEN); + LASSERTF(IT_CREAT == 2, "found %lld\n", + (long long)IT_CREAT); + LASSERTF(IT_READDIR == 4, "found %lld\n", + (long long)IT_READDIR); + LASSERTF(IT_GETATTR == 8, "found %lld\n", + (long long)IT_GETATTR); + LASSERTF(IT_LOOKUP == 16, "found %lld\n", + (long long)IT_LOOKUP); + LASSERTF(IT_GETXATTR == 128, "found %lld\n", + (long long)IT_GETXATTR); + LASSERTF(IT_LAYOUT == 1024, "found %lld\n", + (long long)IT_LAYOUT); + LASSERTF(IT_QUOTA_DQACQ == 2048, "found %lld\n", + (long long)IT_QUOTA_DQACQ); + LASSERTF(IT_QUOTA_CONN == 4096, "found %lld\n", + (long long)IT_QUOTA_CONN); + + /* Checks for struct ldlm_resource_desc */ + LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", + (long long)(int)sizeof(struct ldlm_resource_desc)); + 
LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_type)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_pad) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_pad)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_pad) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_pad)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_name)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name)); + + /* Checks for struct ldlm_lock_desc */ + LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(struct ldlm_lock_desc)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_resource)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data)); + + /* Checks for struct ldlm_request */ + LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n", + (long long)(int)sizeof(struct ldlm_request)); + LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_count)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count)); + LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_request 
*)0)->lock_handle) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle)); + + /* Checks for struct ldlm_reply */ + LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n", + (long long)(int)sizeof(struct ldlm_reply)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2)); + + /* Checks for struct ost_lvb_v1 */ + LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb_v1)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks)); + + /* Checks for struct ost_lvb */ + LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_padding)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding)); + + /* Checks for struct lquota_lvb */ + LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n", + (long long)(int)sizeof(struct lquota_lvb)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_flags)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel)); + 
LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_pad1)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1)); + LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n", + (long long)LQUOTA_FL_EDQUOT); + + /* Checks for struct ldlm_gl_lquota_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_lquota_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2)); + + /* Checks for struct ldlm_gl_barrier_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_barrier_desc) == 16, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_barrier_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_status)); + 
LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_status)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_timeout) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_timeout)); + LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_timeout)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_padding) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_padding)); + LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_padding)); + + /* Checks for struct barrier_lvb */ + LASSERTF((int)sizeof(struct barrier_lvb) == 16, "found %lld\n", + (long long)(int)sizeof(struct barrier_lvb)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_status)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_status)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_index)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_index)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_padding) == 8, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_padding)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_padding)); + + /* Checks for struct mgs_send_param */ + LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n", + (long long)(int)sizeof(struct mgs_send_param)); + CLASSERT(MGS_PARAM_MAXLEN == 1024); + LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n", + (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024])); + LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024])); + + /* Checks for struct cfg_marker */ + LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n", + (long long)(int)sizeof(struct cfg_marker)); + LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_step)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step)); + LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_flags)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags)); + LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_vers)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers)); + LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_padding)); + LASSERTF((int)sizeof(((struct cfg_marker 
*)0)->cm_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding)); + LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_createtime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_canceltime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_tgtname)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname)); + LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_comment)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment)); + + /* Checks for struct llog_logid */ + LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n", + (long long)(int)sizeof(struct llog_logid)); + LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_oi)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi)); + LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_ogen)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen)); + CLASSERT(OST_SZ_REC == 274730752); + CLASSERT(MDS_UNLINK_REC == 274801668); + CLASSERT(MDS_UNLINK64_REC == 275325956); + CLASSERT(MDS_SETATTR64_REC == 275325953); + CLASSERT(OBD_CFG_REC == 274857984); + CLASSERT(LLOG_GEN_REC == 274989056); + CLASSERT(CHANGELOG_REC == 275120128); + CLASSERT(CHANGELOG_USER_REC == 275185664); + CLASSERT(LLOG_HDR_MAGIC == 275010873); + CLASSERT(LLOG_LOGID_MAGIC == 275010875); + + /* Checks for struct llog_catid */ + LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n", + (long long)(int)sizeof(struct llog_catid)); + LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_logid)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding1)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding2)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, 
lci_padding3)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3)); + + /* Checks for struct llog_rec_hdr */ + LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_hdr)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_len)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_index)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_type)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_id)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id)); + + /* Checks for struct llog_rec_tail */ + LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_tail)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_len)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_index)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index)); + + /* Checks for struct llog_logid_rec */ + LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_logid_rec)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_hdr)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_id)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding1)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding2)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec 
*)0)->lid_padding2)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding3)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_tail)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail)); + + /* Checks for struct llog_unlink_rec */ + LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink_rec)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oid)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail)); + /* Checks for struct llog_unlink64_rec */ + LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink64_rec)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail)); + LASSERTF((int)sizeof(((struct 
llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3)); + + /* Checks for struct llog_setattr64_rec */ + LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_setattr64_rec)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_valid) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", + (long 
long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_projid) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_projid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid)); + + /* Checks for struct llog_size_change_rec */ + LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_size_change_rec)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail)); + + /* Checks for struct changelog_rec */ + LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct changelog_rec)); + LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n", + 
(long long)(int)offsetof(struct changelog_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid)); + + /* Checks for struct changelog_ext_rename */ + LASSERTF((int)sizeof(struct changelog_ext_rename) == 32, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_rename)); + LASSERTF((int)offsetof(struct changelog_ext_rename, cr_sfid) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rename, cr_sfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rename *)0)->cr_sfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rename *)0)->cr_sfid)); + LASSERTF((int)offsetof(struct changelog_ext_rename, cr_spfid) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rename, cr_spfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rename *)0)->cr_spfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rename *)0)->cr_spfid)); + + /* Checks for struct changelog_ext_jobid */ + LASSERTF((int)sizeof(struct changelog_ext_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_jobid)); + LASSERTF((int)offsetof(struct changelog_ext_jobid, cr_jobid) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_jobid, cr_jobid)); + LASSERTF((int)sizeof(((struct changelog_ext_jobid *)0)->cr_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_jobid *)0)->cr_jobid)); + + /* Checks for struct changelog_setinfo */ + LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n", + (long long)(int)sizeof(struct changelog_setinfo)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, 
cs_recno)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_id)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id)); + + /* Checks for struct llog_changelog_rec */ + LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_rec)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_do_not_use) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_do_not_use)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use)); + + /* Checks for struct llog_changelog_user_rec */ + LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_user_rec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_time) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_time)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail)); + + /* Checks for struct llog_gen */ + LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_gen)); + 
LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, mnt_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt)); + LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, conn_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt)); + + /* Checks for struct llog_gen_rec */ + LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_gen_rec)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_gen)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_tail)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail)); + + /* Checks for struct llog_log_hdr */ + LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n", + (long long)(int)sizeof(struct llog_log_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_hdr)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_count)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_size)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_flags)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, 
"found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid)); + + /* Checks for struct llogd_body */ + LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n", + (long long)(int)sizeof(struct llogd_body)); + LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_logid)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid)); + LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx)); + LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_llh_flags)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags)); + LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_saved_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_len)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len)); + LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_cur_offset)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset)); + CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501); + CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502); + CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503); + CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508); + CLASSERT(LLOG_FIRST_OPC == 501); + CLASSERT(LLOG_LAST_OPC == 510); + CLASSERT(LLOG_CONFIG_ORIG_CTXT == 0); + CLASSERT(LLOG_CONFIG_REPL_CTXT == 1); + CLASSERT(LLOG_MDS_OST_ORIG_CTXT == 2); + CLASSERT(LLOG_MDS_OST_REPL_CTXT == 3); + CLASSERT(LLOG_SIZE_ORIG_CTXT == 4); + CLASSERT(LLOG_SIZE_REPL_CTXT == 5); + CLASSERT(LLOG_TEST_ORIG_CTXT == 8); + CLASSERT(LLOG_TEST_REPL_CTXT == 9); + CLASSERT(LLOG_CHANGELOG_ORIG_CTXT == 12); + CLASSERT(LLOG_CHANGELOG_REPL_CTXT == 13); + CLASSERT(LLOG_CHANGELOG_USER_ORIG_CTXT == 14); + CLASSERT(LLOG_AGENT_ORIG_CTXT == 15); + CLASSERT(LLOG_UPDATELOG_ORIG_CTXT == 16); + 
CLASSERT(LLOG_UPDATELOG_REPL_CTXT == 17); + CLASSERT(LLOG_MAX_CTXTS == 18); + + /* Checks for struct llogd_conn_body */ + LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n", + (long long)(int)sizeof(struct llogd_conn_body)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx)); + + /* Checks for struct ll_fiemap_info_key */ + LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_info_key)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_name[8]) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_name[8])); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8])); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_oa) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_oa)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_fiemap) == 216, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_fiemap)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap)); + + /* Checks for struct quota_body */ + LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n", + (long long)(int)sizeof(struct quota_body)); + LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_fid)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_fid)); + LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_id)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_id)); + LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_flags)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_flags)); + LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body 
*)0)->qb_padding)); + LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_count)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_count)); + LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_usage)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_usage)); + LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_slv_ver)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver)); + LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_glb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding1[4])); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4])); + + /* Checks for struct mgs_target_info */ + LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n", + (long long)(int)sizeof(struct mgs_target_info)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_config_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_flags)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nid_count)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count)); + LASSERTF((int)offsetof(struct mgs_target_info, 
mti_instance) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_instance)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_fsname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_svname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_uuid)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nids)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_params)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params)); + + /* Checks for struct mgs_nidtbl_entry */ + LASSERTF((int)sizeof(struct mgs_nidtbl_entry) == 24, "found %lld\n", + (long long)(int)sizeof(struct mgs_nidtbl_entry)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_version)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_version)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_instance) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_instance)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_instance)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_index) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_index)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_index)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_length)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_length) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_length)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_type) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_type)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_type) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_type)); + LASSERTF((int)offsetof(struct 
mgs_nidtbl_entry, mne_nid_type) == 21, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_type)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_type) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_type)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_size) == 22, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_size)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_size) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_size)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_count) == 23, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_count)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_count) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_count)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, u.nids[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, u.nids[0])); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->u.nids[0]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->u.nids[0])); + + /* Checks for struct mgs_config_body */ + LASSERTF((int)sizeof(struct mgs_config_body) == 80, "found %lld\n", + (long long)(int)sizeof(struct mgs_config_body)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_name) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_name)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_name) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_name)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_offset) == 64, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_offset)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_offset)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_type) == 72, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_type)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_type)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_nm_cur_pass) == 74, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_nm_cur_pass)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_nm_cur_pass) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_nm_cur_pass)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_bits) == 75, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_bits)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_bits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_bits)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_units) == 76, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_units)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_units) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_units)); + CLASSERT(MGS_CFG_T_CONFIG == 0); + CLASSERT(MGS_CFG_T_SPTLRPC == 1); + CLASSERT(MGS_CFG_T_RECOVER == 2); + CLASSERT(MGS_CFG_T_PARAMS == 3); + CLASSERT(MGS_CFG_T_NODEMAP == 4); + CLASSERT(MGS_CFG_T_BARRIER == 5); + + /* Checks for struct mgs_config_res */ + LASSERTF((int)sizeof(struct mgs_config_res) == 16, "found 
%lld\n", + (long long)(int)sizeof(struct mgs_config_res)); + LASSERTF((int)offsetof(struct mgs_config_res, mcr_offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_res, mcr_offset)); + LASSERTF((int)sizeof(((struct mgs_config_res *)0)->mcr_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_res *)0)->mcr_offset)); + LASSERTF((int)offsetof(struct mgs_config_res, mcr_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_res, mcr_size)); + LASSERTF((int)sizeof(((struct mgs_config_res *)0)->mcr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_res *)0)->mcr_size)); + + /* Checks for struct lustre_capa */ + LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa)); + LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_fid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_opc)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc)); + LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_uid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_gid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_flags)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags)); + LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_timeout)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout)); + LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_expiry)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry)); + CLASSERT(CAPA_HMAC_MAX_LEN == 64); + LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_hmac[64])); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64])); + + /* Checks for struct lustre_capa_key */ + LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa_key)); + 
LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_seq)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_padding)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding)); + CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_key[56])); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56])); + + /* Checks for struct getinfo_fid2path */ + LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n", + (long long)(int)sizeof(struct getinfo_fid2path)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_fid)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_recno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_u.gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_u.gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_u.gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_u.gf_path[0])); + + /* Checks for struct fiemap */ + LASSERTF((int)sizeof(struct fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(struct fiemap)); + LASSERTF((int)offsetof(struct fiemap, fm_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_start)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_start)); + LASSERTF((int)offsetof(struct fiemap, fm_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_length)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_length) == 8, "found 
%lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_length)); + LASSERTF((int)offsetof(struct fiemap, fm_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_flags)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_flags)); + LASSERTF((int)offsetof(struct fiemap, fm_mapped_extents) == 20, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_mapped_extents)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_mapped_extents)); + LASSERTF((int)offsetof(struct fiemap, fm_extent_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_extent_count)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_extent_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_extent_count)); + LASSERTF((int)offsetof(struct fiemap, fm_reserved) == 28, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_reserved)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_reserved)); + LASSERTF((int)offsetof(struct fiemap, fm_extents) == 32, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_extents)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_extents) == 0, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_extents)); + CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001); + CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002); + CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000); + + /* Checks for struct fiemap_extent */ + LASSERTF((int)sizeof(struct fiemap_extent) == 56, "found %lld\n", + (long long)(int)sizeof(struct fiemap_extent)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_logical) == 0, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_logical)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_logical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_logical)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_physical) == 8, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_physical)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_physical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_physical)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_length)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_length)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_flags)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_flags)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_reserved[0]) == 44, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_reserved[0])); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0])); + CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001); + CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002); + CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004); + CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008); + CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080); + 
CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100); + CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200); + CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400); + CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800); + CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000); + CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000); + CLASSERT(FIEMAP_EXTENT_NET == 0x80000000); + +#ifdef CONFIG_FS_POSIX_ACL + /* Checks for type posix_acl_xattr_entry */ + LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_entry)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_tag)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_perm)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_id)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id)); +#endif /* CONFIG_FS_POSIX_ACL */ + +#ifdef CONFIG_FS_POSIX_ACL + /* Checks for type posix_acl_xattr_header */ + LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_header)); + LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_version)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version)); +#ifndef HAVE_STRUCT_POSIX_ACL_XATTR + LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_entries)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries)); +#endif /* HAVE_STRUCT_POSIX_ACL_XATTR */ +#endif /* CONFIG_FS_POSIX_ACL */ + + /* Checks for struct link_ea_header */ + LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n", + (long long)(int)sizeof(struct link_ea_header)); + LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_magic)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic)); + LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_reccount)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount)); + LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_len)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len)); + LASSERTF((int)offsetof(struct link_ea_header, leh_overflow_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, 
leh_overflow_time)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_overflow_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_overflow_time)); + LASSERTF((int)offsetof(struct link_ea_header, leh_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_padding)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_padding)); + CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL); + + /* Checks for struct link_ea_entry */ + LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n", + (long long)(int)sizeof(struct link_ea_entry)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_reclen)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_name)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name)); + + /* Checks for struct layout_intent */ + LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n", + (long long)(int)sizeof(struct layout_intent)); + LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_opc)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_opc)); + LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_flags)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); + LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_extent)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_extent)); + LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", + (long long)LAYOUT_INTENT_ACCESS); + LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", + (long long)LAYOUT_INTENT_READ); + LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n", + (long long)LAYOUT_INTENT_WRITE); + LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n", + (long long)LAYOUT_INTENT_GLIMPSE); + LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n", + (long long)LAYOUT_INTENT_TRUNC); + LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n", + (long long)LAYOUT_INTENT_RELEASE); + LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n", + (long long)LAYOUT_INTENT_RESTORE); + + /* Checks for struct hsm_action_item */ + LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_item)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, 
hai_len)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_action)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_fid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_dfid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_extent)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_cookie)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_gid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_data)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data)); + + /* Checks for struct hsm_action_list */ + LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_list)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_version)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_count)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_compound_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_flags)); + LASSERTF((int)sizeof(((struct 
hsm_action_list *)0)->hal_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_archive_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id)); + LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, padding1)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_fsname)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname)); + + /* Checks for struct hsm_progress */ + LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress)); + LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_fid)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid)); + LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_cookie)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie)); + LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_extent)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent)); + LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_flags)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags)); + LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_errval)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval)); + LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, padding)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->padding)); + LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n", + HP_FLAG_COMPLETED); + LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n", + HP_FLAG_RETRY); + + LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_data_version)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version)); + LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_flags)); + LASSERTF((int)sizeof(((struct hsm_copy 
*)0)->hc_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags)); + LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_errval)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval)); + LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, padding)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->padding)); + LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_hai)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai)); + + /* Checks for struct hsm_progress_kernel */ + LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress_kernel)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2)); + LASSERTF((int)sizeof(((struct 
hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2)); + + /* Checks for struct hsm_user_item */ + LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_item)); + LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_item, hui_fid)); + LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid)); + LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_item, hui_extent)); + LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent)); + + /* Checks for struct hsm_user_state */ + LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_state)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_states)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_archive_id)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location)); + + /* Checks for struct hsm_state_set */ + LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_state_set)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_valid)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_archive_id)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 
8, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_setmask)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_clearmask)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask)); + + /* Checks for struct hsm_current_action */ + LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_current_action)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_state)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_action)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_location)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location)); + + /* Checks for struct hsm_request */ + LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_request)); + LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_action)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_action)); + LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_archive_id)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id)); + LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_flags)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags)); + LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_itemcount)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount)); + LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_data_len)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len)); + LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)HSM_FORCE_ACTION); + LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)HSM_GHOST_COPY); + + /* Checks for struct hsm_user_request */ + 
LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_request)); + LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_request, hur_request)); + LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request)); + LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_request, hur_user_item)); + LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item)); + + /* Checks for struct hsm_user_import */ + LASSERTF((int)sizeof(struct hsm_user_import) == 48, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_import)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_size)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_size)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_uid)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_uid)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_gid) == 36, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_gid)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_gid)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mode)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mode)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_atime) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_atime)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_atime)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_atime_ns) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_atime_ns)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_atime_ns)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mtime)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mtime)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mtime_ns) == 28, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mtime_ns)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_archive_id) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_archive_id)); + LASSERTF((int)sizeof(((struct hsm_user_import 
*)0)->hui_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_archive_id)); + + /* Checks for struct object_update_param */ + LASSERTF((int)sizeof(struct object_update_param) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_param)); + LASSERTF((int)offsetof(struct object_update_param, oup_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_len)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_len) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_len)); + LASSERTF((int)offsetof(struct object_update_param, oup_padding) == 2, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_padding)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_padding)); + LASSERTF((int)offsetof(struct object_update_param, oup_padding2) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_padding2)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_padding2)); + LASSERTF((int)offsetof(struct object_update_param, oup_buf) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_buf)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_buf) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_buf)); + + /* Checks for struct object_update */ + LASSERTF((int)sizeof(struct object_update) == 40, "found %lld\n", + (long long)(int)sizeof(struct object_update)); + LASSERTF((int)offsetof(struct object_update, ou_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_type)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_type)); + LASSERTF((int)offsetof(struct object_update, ou_params_count) == 2, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_params_count)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_params_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_params_count)); + LASSERTF((int)offsetof(struct object_update, ou_result_size) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_result_size)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_result_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_result_size)); + LASSERTF((int)offsetof(struct object_update, ou_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_flags)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_flags)); + LASSERTF((int)offsetof(struct object_update, ou_padding1) == 12, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_padding1)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_padding1)); + LASSERTF((int)offsetof(struct object_update, ou_batchid) == 16, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_batchid)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_batchid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_batchid)); + 
LASSERTF((int)offsetof(struct object_update, ou_fid) == 24, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_fid)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_fid)); + LASSERTF((int)offsetof(struct object_update, ou_params) == 40, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_params)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_params) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_params)); + + /* Checks for struct object_update_request */ + LASSERTF((int)sizeof(struct object_update_request) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_request)); + LASSERTF((int)offsetof(struct object_update_request, ourq_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_magic)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_magic)); + LASSERTF((int)offsetof(struct object_update_request, ourq_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_count)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_count)); + LASSERTF((int)offsetof(struct object_update_request, ourq_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_padding)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_padding)); + LASSERTF((int)offsetof(struct object_update_request, ourq_updates) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_updates)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_updates) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_updates)); + + /* Checks for struct object_update_result */ + LASSERTF((int)sizeof(struct object_update_result) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_result)); + LASSERTF((int)offsetof(struct object_update_result, our_rc) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_rc)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_rc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_rc)); + LASSERTF((int)offsetof(struct object_update_result, our_datalen) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_datalen)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_datalen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_datalen)); + LASSERTF((int)offsetof(struct object_update_result, our_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_padding)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_padding)); + LASSERTF((int)offsetof(struct object_update_result, our_data) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_data)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result 
*)0)->our_data)); + + /* Checks for struct object_update_reply */ + LASSERTF((int)sizeof(struct object_update_reply) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_reply)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_magic)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_magic)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_count)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_count)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_padding)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_padding)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_lens) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_lens)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_lens) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_lens)); + + /* Checks for struct out_update_header */ + LASSERTF((int)sizeof(struct out_update_header) == 16, "found %lld\n", + (long long)(int)sizeof(struct out_update_header)); + LASSERTF((int)offsetof(struct out_update_header, ouh_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_magic)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_magic)); + LASSERTF((int)offsetof(struct out_update_header, ouh_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_count)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_count)); + LASSERTF((int)offsetof(struct out_update_header, ouh_inline_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_inline_length)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_inline_length) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_inline_length)); + LASSERTF((int)offsetof(struct out_update_header, ouh_reply_size) == 12, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_reply_size)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_reply_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_reply_size)); + LASSERTF((int)offsetof(struct out_update_header, ouh_inline_data) == 16, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_inline_data)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_inline_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_inline_data)); + + /* Checks for struct out_update_buffer */ + LASSERTF((int)sizeof(struct out_update_buffer) == 8, "found %lld\n", + (long long)(int)sizeof(struct out_update_buffer)); + LASSERTF((int)offsetof(struct out_update_buffer, oub_size) == 0, "found %lld\n", + (long 
long)(int)offsetof(struct out_update_buffer, oub_size)); + LASSERTF((int)sizeof(((struct out_update_buffer *)0)->oub_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_buffer *)0)->oub_size)); + LASSERTF((int)offsetof(struct out_update_buffer, oub_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct out_update_buffer, oub_padding)); + LASSERTF((int)sizeof(((struct out_update_buffer *)0)->oub_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_buffer *)0)->oub_padding)); + + /* Checks for struct nodemap_cluster_rec */ + LASSERTF((int)sizeof(struct nodemap_cluster_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_cluster_rec)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_name) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_name)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_name) == 17, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_name)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_flags) == 17, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_flags)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_flags) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_flags)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_padding1) == 18, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding1)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_padding2) == 20, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_padding2)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding2)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_uid) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_uid)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_uid)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_gid) == 28, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_gid)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_gid)); + + /* Checks for struct nodemap_range_rec */ + LASSERTF((int)sizeof(struct nodemap_range_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_range_rec)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_start_nid) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_start_nid)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_start_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_start_nid)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_end_nid) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_end_nid)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_end_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_end_nid)); + LASSERTF((int)offsetof(struct nodemap_range_rec, 
nrr_padding1) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding1)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_padding2)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding2)); + + /* Checks for struct nodemap_id_rec */ + LASSERTF((int)sizeof(struct nodemap_id_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_id_rec)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_id_fs) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_id_fs)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_id_fs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_id_fs)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding1) == 4, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding1)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding1)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding2) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding2)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding2)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding3) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding3)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding3)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding4) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding4)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding4)); + + /* Checks for struct nodemap_global_rec */ + LASSERTF((int)sizeof(struct nodemap_global_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_global_rec)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_is_active) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_is_active)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_is_active) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_is_active)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding1) == 1, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding1) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding1)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding2) == 2, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding2)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding2) == 2, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding2)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding3) == 4, "found %lld\n", + (long 
long)(int)offsetof(struct nodemap_global_rec, ngr_padding3)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding3)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding4) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding4)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding4)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding5) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding5)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding5)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding6) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding6)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding6)); + + /* Checks for union nodemap_rec */ + LASSERTF((int)sizeof(union nodemap_rec) == 32, "found %lld\n", + (long long)(int)sizeof(union nodemap_rec)); + + /* Checks for struct lfsck_request */ + LASSERTF((int)sizeof(struct lfsck_request) == 96, "found %lld\n", + (long long)(int)sizeof(struct lfsck_request)); + LASSERTF((int)offsetof(struct lfsck_request, lr_event) == 0, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_event)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_event) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_event)); + LASSERTF((int)offsetof(struct lfsck_request, lr_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_index)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_index)); + LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_flags)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags)); + LASSERTF((int)offsetof(struct lfsck_request, lr_valid) == 12, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_valid)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_valid)); + LASSERTF((int)offsetof(struct lfsck_request, lr_speed) == 16, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_speed)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_speed) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_speed)); + LASSERTF((int)offsetof(struct lfsck_request, lr_version) == 20, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_version)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_version) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_version)); + LASSERTF((int)offsetof(struct lfsck_request, lr_active) == 22, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_active)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_active) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request 
*)0)->lr_active)); + LASSERTF((int)offsetof(struct lfsck_request, lr_param) == 24, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_param)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_param) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_param)); + LASSERTF((int)offsetof(struct lfsck_request, lr_async_windows) == 26, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_async_windows)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_async_windows) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_async_windows)); + LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_flags)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags)); + LASSERTF((int)offsetof(struct lfsck_request, lr_fid) == 32, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_fid)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_fid)); + LASSERTF((int)offsetof(struct lfsck_request, lr_fid2) == 48, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_fid2)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_fid2)); + LASSERTF((int)offsetof(struct lfsck_request, lr_comp_id) == 64, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_comp_id)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_comp_id)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_0) == 68, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_0)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_0)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_1) == 72, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_1)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_1)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_2)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_2)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_3)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_3)); +#ifdef HAVE_SERVER_SUPPORT + LASSERTF(LFSCK_TYPE_SCRUB == 0x00000000UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_SCRUB); + LASSERTF(LFSCK_TYPE_LAYOUT == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_LAYOUT); + LASSERTF(LFSCK_TYPE_NAMESPACE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_NAMESPACE); +#endif + LASSERTF(LE_LASTID_REBUILDING == 1, "found %lld\n", + (long long)LE_LASTID_REBUILDING); + LASSERTF(LE_LASTID_REBUILT == 2, "found %lld\n", + (long long)LE_LASTID_REBUILT); + 
LASSERTF(LE_PHASE1_DONE == 3, "found %lld\n", + (long long)LE_PHASE1_DONE); + LASSERTF(LE_PHASE2_DONE == 4, "found %lld\n", + (long long)LE_PHASE2_DONE); + LASSERTF(LE_START == 5, "found %lld\n", + (long long)LE_START); + LASSERTF(LE_STOP == 6, "found %lld\n", + (long long)LE_STOP); + LASSERTF(LE_QUERY == 7, "found %lld\n", + (long long)LE_QUERY); + LASSERTF(LE_PEER_EXIT == 9, "found %lld\n", + (long long)LE_PEER_EXIT); + LASSERTF(LE_CONDITIONAL_DESTROY == 10, "found %lld\n", + (long long)LE_CONDITIONAL_DESTROY); + LASSERTF(LE_PAIRS_VERIFY == 11, "found %lld\n", + (long long)LE_PAIRS_VERIFY); + LASSERTF(LE_SET_LMV_MASTER == 15, "found %lld\n", + (long long)LE_SET_LMV_MASTER); + LASSERTF(LE_SET_LMV_SLAVE == 16, "found %lld\n", + (long long)LE_SET_LMV_SLAVE); + LASSERTF(LEF_TO_OST == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LEF_TO_OST); + LASSERTF(LEF_FROM_OST == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LEF_FROM_OST); + LASSERTF(LEF_SET_LMV_HASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LEF_SET_LMV_HASH); + LASSERTF(LEF_SET_LMV_ALL == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LEF_SET_LMV_ALL); + LASSERTF(LEF_RECHECK_NAME_HASH == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LEF_RECHECK_NAME_HASH); + LASSERTF(LEF_QUERY_ALL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LEF_QUERY_ALL); + + /* Checks for struct lfsck_reply */ + LASSERTF((int)sizeof(struct lfsck_reply) == 16, "found %lld\n", + (long long)(int)sizeof(struct lfsck_reply)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_status)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_status)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_padding_1) == 4, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_padding_1)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_padding_1)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_repaired) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_repaired)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_repaired) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_repaired)); + + /* Checks for struct update_params */ + LASSERTF((int)sizeof(struct update_params) == 0, "found %lld\n", + (long long)(int)sizeof(struct update_params)); + LASSERTF((int)offsetof(struct update_params, up_params) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_params, up_params)); + LASSERTF((int)sizeof(((struct update_params *)0)->up_params) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_params *)0)->up_params)); + + /* Checks for struct update_op */ + LASSERTF((int)sizeof(struct update_op) == 20, "found %lld\n", + (long long)(int)sizeof(struct update_op)); + LASSERTF((int)offsetof(struct update_op, uop_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_fid)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_fid)); + LASSERTF((int)offsetof(struct update_op, uop_type) == 16, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_type)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_type)); + LASSERTF((int)offsetof(struct update_op, 
uop_param_count) == 18, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_param_count)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_param_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_param_count)); + LASSERTF((int)offsetof(struct update_op, uop_params_off) == 20, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_params_off)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_params_off) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_params_off)); + + /* Checks for struct update_ops */ + LASSERTF((int)sizeof(struct update_ops) == 0, "found %lld\n", + (long long)(int)sizeof(struct update_ops)); + LASSERTF((int)offsetof(struct update_ops, uops_op) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_ops, uops_op)); + LASSERTF((int)sizeof(((struct update_ops *)0)->uops_op) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_ops *)0)->uops_op)); + + /* Checks for struct update_records */ + LASSERTF((int)sizeof(struct update_records) == 32, "found %lld\n", + (long long)(int)sizeof(struct update_records)); + LASSERTF((int)offsetof(struct update_records, ur_master_transno) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_master_transno)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_master_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_master_transno)); + LASSERTF((int)offsetof(struct update_records, ur_batchid) == 8, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_batchid)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_batchid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_batchid)); + LASSERTF((int)offsetof(struct update_records, ur_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_flags)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_flags)); + LASSERTF((int)offsetof(struct update_records, ur_index) == 20, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_index)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_index)); + LASSERTF((int)offsetof(struct update_records, ur_update_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_update_count)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_update_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_update_count)); + LASSERTF((int)offsetof(struct update_records, ur_param_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_param_count)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_param_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_param_count)); + LASSERTF(UPDATE_RECORD_CONTINUE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)UPDATE_RECORD_CONTINUE); + + /* Checks for struct llog_update_record */ + LASSERTF((int)sizeof(struct llog_update_record) == 48, "found %lld\n", + (long long)(int)sizeof(struct llog_update_record)); + LASSERTF((int)offsetof(struct llog_update_record, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_update_record, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_hdr) == 16, "found %lld\n", + (long 
long)(int)sizeof(((struct llog_update_record *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_update_record, lur_update_rec) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_update_record, lur_update_rec)); + LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_update_rec) == 32, "found %lld\n", + (long long)(int)sizeof(((struct llog_update_record *)0)->lur_update_rec)); + + /* Checks for struct lustre_cfg */ + LASSERTF((int)sizeof(struct lustre_cfg) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_cfg)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_version)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_version)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_command) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_command)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_command) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_command)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_num) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_num)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_num) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_num)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_flags)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_flags)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nid) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_nid)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nid)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nal) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_nal)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nal) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nal)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_bufcount) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_bufcount)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0])); + LASSERTF(LCFG_ATTACH == 0x000cf001UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ATTACH); + LASSERTF(LCFG_DETACH == 0x000cf002UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DETACH); + LASSERTF(LCFG_SETUP == 0x000cf003UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SETUP); + LASSERTF(LCFG_CLEANUP == 0x000cf004UL, "found 0x%.8xUL\n", + (unsigned)LCFG_CLEANUP); + LASSERTF(LCFG_ADD_UUID == 0x000cf005UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_UUID); + LASSERTF(LCFG_DEL_UUID == 0x000cf006UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_UUID); + LASSERTF(LCFG_MOUNTOPT == 0x000cf007UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MOUNTOPT); + LASSERTF(LCFG_DEL_MOUNTOPT == 0x000cf008UL, "found 0x%.8xUL\n", + 
(unsigned)LCFG_DEL_MOUNTOPT); + LASSERTF(LCFG_SET_TIMEOUT == 0x000cf009UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_TIMEOUT); + LASSERTF(LCFG_SET_UPCALL == 0x000cf00aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_UPCALL); + LASSERTF(LCFG_ADD_CONN == 0x000cf00bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_CONN); + LASSERTF(LCFG_DEL_CONN == 0x000cf00cUL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_CONN); + LASSERTF(LCFG_LOV_ADD_OBD == 0x000cf00dUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_OBD); + LASSERTF(LCFG_LOV_DEL_OBD == 0x000cf00eUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_DEL_OBD); + LASSERTF(LCFG_PARAM == 0x000cf00fUL, "found 0x%.8xUL\n", + (unsigned)LCFG_PARAM); + LASSERTF(LCFG_MARKER == 0x000cf010UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MARKER); + LASSERTF(LCFG_LOG_START == 0x000ce011UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_START); + LASSERTF(LCFG_LOG_END == 0x000ce012UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_END); + LASSERTF(LCFG_LOV_ADD_INA == 0x000ce013UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_INA); + LASSERTF(LCFG_ADD_MDC == 0x000cf014UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_MDC); + LASSERTF(LCFG_DEL_MDC == 0x000cf015UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MDC); + LASSERTF(LCFG_SPTLRPC_CONF == 0x000ce016UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SPTLRPC_CONF); + LASSERTF(LCFG_POOL_NEW == 0x000ce020UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_NEW); + LASSERTF(LCFG_POOL_ADD == 0x000ce021UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_ADD); + LASSERTF(LCFG_POOL_REM == 0x000ce022UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_REM); + LASSERTF(LCFG_POOL_DEL == 0x000ce023UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_DEL); + LASSERTF(LCFG_SET_LDLM_TIMEOUT == 0x000ce030UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_LDLM_TIMEOUT); + LASSERTF(LCFG_PRE_CLEANUP == 0x000cf031UL, "found 0x%.8xUL\n", + (unsigned)LCFG_PRE_CLEANUP); + LASSERTF(LCFG_SET_PARAM == 0x000ce032UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_PARAM); + LASSERTF(LCFG_NODEMAP_ADD == 0x000ce040UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD); + LASSERTF(LCFG_NODEMAP_DEL == 0x000ce041UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL); + LASSERTF(LCFG_NODEMAP_ADD_RANGE == 0x000ce042UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_RANGE); + LASSERTF(LCFG_NODEMAP_DEL_RANGE == 0x000ce043UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_RANGE); + LASSERTF(LCFG_NODEMAP_ADD_UIDMAP == 0x000ce044UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_UIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_UIDMAP == 0x000ce045UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_UIDMAP); + LASSERTF(LCFG_NODEMAP_ADD_GIDMAP == 0x000ce046UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_GIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_GIDMAP == 0x000ce047UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_GIDMAP); + LASSERTF(LCFG_NODEMAP_ACTIVATE == 0x000ce048UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ACTIVATE); + LASSERTF(LCFG_NODEMAP_ADMIN == 0x000ce049UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADMIN); + LASSERTF(LCFG_NODEMAP_TRUSTED == 0x000ce050UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TRUSTED); + LASSERTF(LCFG_NODEMAP_SQUASH_UID == 0x000ce051UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_UID); + LASSERTF(LCFG_NODEMAP_SQUASH_GID == 0x000ce052UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_GID); + LASSERTF(LCFG_NODEMAP_ADD_SHKEY == 0x000ce053UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_SHKEY); + LASSERTF(LCFG_NODEMAP_DEL_SHKEY == 0x000ce054UL, "found 0x%.8xUL\n", + 
(unsigned)LCFG_NODEMAP_DEL_SHKEY); + LASSERTF(LCFG_NODEMAP_TEST_NID == 0x000ce055UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_NID); + LASSERTF(LCFG_NODEMAP_TEST_ID == 0x000ce056UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_ID); + LASSERTF(LCFG_NODEMAP_SET_FILESET == 0x000ce057UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_FILESET); + LASSERTF(LCFG_NODEMAP_DENY_UNKNOWN == 0x000ce058UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DENY_UNKNOWN); + LASSERTF(LCFG_NODEMAP_MAP_MODE == 0x000ce059UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_MAP_MODE); + LASSERTF(LCFG_NODEMAP_AUDIT_MODE == 0x000ce05aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_AUDIT_MODE); + LASSERTF(LCFG_NODEMAP_SET_SEPOL == 0x000ce05bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_SEPOL); + LASSERTF(PORTALS_CFG_TYPE == 1, "found %lld\n", + (long long)PORTALS_CFG_TYPE); + LASSERTF(LUSTRE_CFG_TYPE == 123, "found %lld\n", + (long long)LUSTRE_CFG_TYPE); +} diff --git a/drivers/staging/lustrefsx/lustre/target/barrier.c b/drivers/staging/lustrefsx/lustre/target/barrier.c new file mode 100644 index 0000000000000..54b3e567b3605 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/barrier.c @@ -0,0 +1,414 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/target/barrier.c + * + * Currently, the Lustre barrier is implemented as write barrier on all MDTs. + * For each MDT in the system, when it starts, it registers a barrier instance + * that will be used in handling subsequent barrier requests. 
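For orientation, the registration model described above maps onto a small exported API later in this file: barrier_register()/barrier_deregister() attach and detach a barrier instance for a dt_device stack, barrier_entry()/barrier_exit() bracket server-side modifications so the freeze code can count in-flight writers, and barrier_handler() services the MGS-driven freeze/thaw requests. The following caller-side sketch is illustrative only; 'bottom', 'next', and apply_update() are assumed placeholders, not names taken from this patch, and the error code chosen is likewise only an example:

	/* Illustrative sketch: 'bottom'/'next' are assumed local device pointers
	 * and apply_update() stands in for the real modification. */
	static int example_target_setup(struct dt_device *bottom,
					struct dt_device *next)
	{
		/* register one barrier instance per MDT at start-up */
		return barrier_register(bottom, next);
	}

	static int example_modify(const struct lu_env *env,
				  struct dt_device *bottom)
	{
		int rc;

		/* refuse new client-sponsored modifications while frozen */
		if (!barrier_entry(bottom))
			return -EINPROGRESS;	/* illustrative error choice */

		rc = apply_update(env, bottom);	/* placeholder */

		/* drop the in-flight writer count; wakes a waiting freezer */
		barrier_exit(bottom);
		return rc;
	}

	static void example_target_cleanup(struct dt_device *bottom)
	{
		barrier_deregister(bottom);
	}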
+ * + * Author: Fan, Yong + */ + +#define DEBUG_SUBSYSTEM S_SNAPSHOT + +#include + +#include +#include +#include +#include +#include + +static LIST_HEAD(barrier_instance_list); +static DEFINE_SPINLOCK(barrier_instance_lock); + +struct barrier_instance { + struct list_head bi_link; + struct dt_device *bi_bottom; + struct dt_device *bi_next; + wait_queue_head_t bi_waitq; + rwlock_t bi_rwlock; + struct percpu_counter bi_writers; + atomic_t bi_ref; + time64_t bi_deadline; + __u32 bi_status; +}; + +static inline char *barrier_barrier2name(struct barrier_instance *barrier) +{ + return barrier->bi_bottom->dd_lu_dev.ld_obd->obd_name; +} + +static inline __u32 barrier_dev_idx(struct barrier_instance *barrier) +{ + return lu_site2seq(barrier->bi_bottom->dd_lu_dev.ld_site)->ss_node_id; +} + +static void barrier_instance_cleanup(struct barrier_instance *barrier) +{ + LASSERT(list_empty(&barrier->bi_link)); + + percpu_counter_destroy(&barrier->bi_writers); + OBD_FREE_PTR(barrier); +} + +static inline void barrier_instance_put(struct barrier_instance *barrier) +{ + if (atomic_dec_and_test(&barrier->bi_ref)) + barrier_instance_cleanup(barrier); +} + +static struct barrier_instance * +barrier_instance_find_locked(struct dt_device *key) +{ + struct barrier_instance *barrier; + + list_for_each_entry(barrier, &barrier_instance_list, bi_link) { + if (barrier->bi_bottom == key) + return barrier; + } + + return NULL; +} + +static void barrier_instance_add(struct barrier_instance *barrier) +{ + struct barrier_instance *tmp; + + spin_lock(&barrier_instance_lock); + tmp = barrier_instance_find_locked(barrier->bi_bottom); + LASSERT(!tmp); + + list_add_tail(&barrier->bi_link, &barrier_instance_list); + spin_unlock(&barrier_instance_lock); +} + +static struct barrier_instance *barrier_instance_find(struct dt_device *key) +{ + struct barrier_instance *barrier; + + spin_lock(&barrier_instance_lock); + barrier = barrier_instance_find_locked(key); + if (barrier) + atomic_inc(&barrier->bi_ref); + spin_unlock(&barrier_instance_lock); + + return barrier; +} + +static void barrier_set(struct barrier_instance *barrier, __u32 status) +{ + if (barrier->bi_status != status) { + CDEBUG(D_SNAPSHOT, "%s: change barrier status from %u to %u\n", + barrier_barrier2name(barrier), + barrier->bi_status, status); + + barrier->bi_status = status; + } +} + +/** + * Create the barrier for the given instance. + * + * We use two-phases barrier to guarantee that after the barrier setup: + * 1) All the MDT side pending async modification have been flushed. + * 2) Any subsequent modification will be blocked. + * 3) All async transactions on the MDTs have been committed. + * + * For phase1, we do the following: + * + * Firstly, it sets barrier flag on the instance that will block subsequent + * modifications from clients. (Note: server sponsored modification will be + * allowed for flush pending modifications) + * + * Secondly, it will flush all pending modification via dt_sync(), such as + * async OST-object destroy, async OST-object owner changes, and so on. + * + * If there are some on-handling clients sponsored modifications during the + * barrier freezing, then related modifications may cause pending requests + * after the first dt_sync(), so call dt_sync() again after all on-handling + * modifications done. + * + * With the phase1 barrier set, all pending cross-servers modification have + * been flushed to remote servers, and any new modification will be blocked. 
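Summarized as code, the per-instance freeze sequence described here boils down to the steps below. This is only a condensed restatement of the barrier_freeze() function that follows, with the locking, memory barrier, and deadline/timeout handling omitted; it is not a separate implementation:

	/* Condensed restatement of the phase1/phase2 steps (see barrier_freeze()). */
	static int barrier_freeze_sketch(const struct lu_env *env,
					 struct barrier_instance *barrier,
					 bool phase1)
	{
		/* 1. flip the status so barrier_entry() starts rejecting writers */
		barrier_set(barrier, phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2);

		/* 2. flush pending async modifications via the next device */
		dt_sync(env, barrier->bi_next);

		/* 3. phase1 only: wait for in-flight writers to drain, then
		 *    sync once more to push out what they produced */
		if (phase1 && percpu_counter_sum(&barrier->bi_writers) != 0)
			dt_sync(env, barrier->bi_next);

		/* 4. phase2 only: local transactions are committed, so the
		 *    instance can be marked frozen */
		if (!phase1)
			barrier_set(barrier, BS_FROZEN);
		return 0;
	}

After phase1 completes on an instance, its pending updates have been pushed out and new modifications are being rejected.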
+ * But it does not guarantees that all the updates have been committed to + * storage on remote servers. So when all the instances have done phase1 + * barrier successfully, the MGS will notify all instances to do the phase2 + * barrier as following: + * + * Every barrier instance will call dt_sync() to make all async transactions + * to be committed locally. + * + * \param[in] env pointer to the thread context + * \param[in] barrier pointer to the barrier instance + * \param[in] phase1 indicate whether it is phase1 barrier or not + * + * \retval positive number for timeout + * \retval 0 for success + * \retval negative error number on failure + */ +static int barrier_freeze(const struct lu_env *env, + struct barrier_instance *barrier, bool phase1) +{ + time64_t left; + int rc = 0; + __s64 inflight = 0; + ENTRY; + + write_lock(&barrier->bi_rwlock); + barrier_set(barrier, phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2); + + /* Avoid out-of-order execution the barrier_set() + * and the check of inflight modifications count. */ + smp_mb(); + + if (phase1) + inflight = percpu_counter_sum(&barrier->bi_writers); + write_unlock(&barrier->bi_rwlock); + + rc = dt_sync(env, barrier->bi_next); + if (rc) + RETURN(rc); + + LASSERT(barrier->bi_deadline != 0); + + left = barrier->bi_deadline - ktime_get_real_seconds(); + if (left <= 0) + RETURN(1); + + if (phase1 && inflight != 0) { + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(left), + NULL, NULL); + + rc = l_wait_event(barrier->bi_waitq, + percpu_counter_sum(&barrier->bi_writers) == 0, + &lwi); + if (rc) + RETURN(1); + + /* sync again after all inflight modifications done. */ + rc = dt_sync(env, barrier->bi_next); + if (rc) + RETURN(rc); + + if (ktime_get_real_seconds() > barrier->bi_deadline) + RETURN(1); + } + + CDEBUG(D_SNAPSHOT, "%s: barrier freezing %s done.\n", + barrier_barrier2name(barrier), phase1 ? "phase1" : "phase2"); + + if (!phase1) + barrier_set(barrier, BS_FROZEN); + + RETURN(0); +} + +void barrier_init(void) +{ +} + +void barrier_fini(void) +{ + LASSERT(list_empty(&barrier_instance_list)); +} + +bool barrier_entry(struct dt_device *key) +{ + struct barrier_instance *barrier; + bool entered = false; + ENTRY; + + barrier = barrier_instance_find(key); + if (unlikely(!barrier)) + /* Fail open */ + RETURN(true); + + read_lock(&barrier->bi_rwlock); + if (likely(barrier->bi_status != BS_FREEZING_P1 && + barrier->bi_status != BS_FREEZING_P2 && + barrier->bi_status != BS_FROZEN) || + ktime_get_real_seconds() > barrier->bi_deadline) { + percpu_counter_inc(&barrier->bi_writers); + entered = true; + } + read_unlock(&barrier->bi_rwlock); + + barrier_instance_put(barrier); + return entered; +} +EXPORT_SYMBOL(barrier_entry); + +void barrier_exit(struct dt_device *key) +{ + struct barrier_instance *barrier; + + barrier = barrier_instance_find(key); + if (likely(barrier)) { + percpu_counter_dec(&barrier->bi_writers); + + /* Avoid out-of-order execution the decreasing inflight + * modifications count and the check of barrier status. 
*/ + smp_mb(); + + if (unlikely(barrier->bi_status == BS_FREEZING_P1)) + wake_up_all(&barrier->bi_waitq); + barrier_instance_put(barrier); + } +} +EXPORT_SYMBOL(barrier_exit); + +int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) +{ + struct ldlm_gl_barrier_desc *desc; + struct barrier_instance *barrier; + struct barrier_lvb *lvb; + struct lu_env env; + int rc = 0; + ENTRY; + + /* glimpse on barrier locks always packs a glimpse descriptor */ + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK_DESC); + desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); + if (!desc) + GOTO(out, rc = -EPROTO); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof(struct barrier_lvb)); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + + lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB); + barrier = barrier_instance_find(key); + if (!barrier) + GOTO(out, rc = -ENODEV); + + rc = lu_env_init(&env, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc) + GOTO(out_barrier, rc); + + CDEBUG(D_SNAPSHOT, + "%s: handling barrier request: status %u, timeout %u\n", + barrier_barrier2name(barrier), + desc->lgbd_status, desc->lgbd_timeout); + + switch (desc->lgbd_status) { + case BS_RESCAN: + barrier_set(barrier, BS_INIT); + break; + case BS_FREEZING_P1: + case BS_FREEZING_P2: + if (OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE)) + GOTO(fini, rc = -EINVAL); + + barrier->bi_deadline = ktime_get_real_seconds() + + desc->lgbd_timeout; + rc = barrier_freeze(&env, barrier, + desc->lgbd_status == BS_FREEZING_P1); + break; + case BS_THAWING: + case BS_FAILED: + case BS_EXPIRED: + barrier_set(barrier, BS_THAWED); + break; + default: + CWARN("%s: unexpected barrier status %u\n", + barrier_barrier2name(barrier), desc->lgbd_status); + rc = -EINVAL; + break; + } + + GOTO(fini, rc); + +fini: + lu_env_fini(&env); + +out_barrier: + if (rc < 0) + barrier_set(barrier, BS_FAILED); + else if (rc > 0) + barrier_set(barrier, BS_EXPIRED); + + lvb->lvb_status = barrier->bi_status; + lvb->lvb_index = barrier_dev_idx(barrier); + + CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, " + "deadline %lld: rc = %d\n", barrier_barrier2name(barrier), + lvb->lvb_status, barrier->bi_deadline, rc); + + barrier_instance_put(barrier); + rc = 0; + +out: + req->rq_status = rc; + return rc; +} +EXPORT_SYMBOL(barrier_handler); + +int barrier_register(struct dt_device *key, struct dt_device *next) +{ + struct barrier_instance *barrier; + int rc; + ENTRY; + + OBD_ALLOC_PTR(barrier); + if (!barrier) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&barrier->bi_link); + barrier->bi_bottom = key; + barrier->bi_next = next; + init_waitqueue_head(&barrier->bi_waitq); + rwlock_init(&barrier->bi_rwlock); + atomic_set(&barrier->bi_ref, 1); +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + rc = percpu_counter_init(&barrier->bi_writers, 0, GFP_KERNEL); +#else + rc = percpu_counter_init(&barrier->bi_writers, 0); +#endif + if (rc) + barrier_instance_cleanup(barrier); + else + barrier_instance_add(barrier); + + RETURN(rc); +} +EXPORT_SYMBOL(barrier_register); + +void barrier_deregister(struct dt_device *key) +{ + struct barrier_instance *barrier; + + spin_lock(&barrier_instance_lock); + barrier = barrier_instance_find_locked(key); + if (barrier) + list_del_init(&barrier->bi_link); + spin_unlock(&barrier_instance_lock); + + if (barrier) + barrier_instance_put(barrier); +} +EXPORT_SYMBOL(barrier_deregister); diff --git a/drivers/staging/lustrefsx/lustre/target/out_handler.c 
b/drivers/staging/lustrefsx/lustre/target/out_handler.c new file mode 100644 index 0000000000000..a238f588e0cd1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/out_handler.c @@ -0,0 +1,1247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + * + * lustre/target/out_handler.c + * + * Object update handler between targets. + * + * Author: di.wang + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "tgt_internal.h" + +static inline void orr_cpu_to_le(struct out_read_reply *orr_dst, + const struct out_read_reply *orr_src) +{ + orr_dst->orr_size = cpu_to_le32(orr_src->orr_size); + orr_dst->orr_padding = cpu_to_le32(orr_src->orr_padding); + orr_dst->orr_offset = cpu_to_le64(orr_dst->orr_offset); +} + +static void out_reconstruct(const struct lu_env *env, struct dt_device *dt, + struct dt_object *obj, + struct object_update_reply *reply, + int index) +{ + CDEBUG(D_HA, "%s: fork reply reply %p index %d: rc = %d\n", + dt_obd_name(dt), reply, index, 0); + + object_update_result_insert(reply, NULL, 0, index, 0); + return; +} + +typedef void (*out_reconstruct_t)(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *obj, + struct object_update_reply *reply, + int index); + +static inline bool out_check_resent(struct ptlrpc_request *req) +{ + if (likely(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) + return false; + + if (req_xid_is_last(req)) { + struct lsd_client_data *lcd; + + /* XXX this does not support mulitple transactions yet, i.e. 
+ * only 1 update RPC each time betwee MDTs */ + lcd = req->rq_export->exp_target_data.ted_lcd; + + req->rq_transno = lcd->lcd_last_transno; + req->rq_status = lcd->lcd_last_result; + if (req->rq_status != 0) + req->rq_transno = 0; + lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); + lustre_msg_set_status(req->rq_repmsg, req->rq_status); + + DEBUG_REQ(D_HA, req, "reconstruct resent RPC"); + return true; + } + DEBUG_REQ(D_HA, req, "reprocess RESENT req, last_xid is %lld", + req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); + return false; +} + +static int out_create(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct dt_object_format *dof = &tti->tti_u.update.tti_update_dof; + struct obdo *lobdo = &tti->tti_u.update.tti_obdo; + struct lu_attr *attr = &tti->tti_attr; + struct lu_fid *fid = NULL; + struct obdo *wobdo; + size_t size; + int rc; + + ENTRY; + + wobdo = object_update_param_get(update, 0, &size); + if (IS_ERR(wobdo) || size != sizeof(*wobdo)) { + CERROR("%s: obdo is NULL, invalid RPC: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(wobdo)); + RETURN(PTR_ERR(wobdo)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + lustre_swab_obdo(wobdo); + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + dof->dof_type = dt_mode_to_dft(attr->la_mode); + if (update->ou_params_count > 1) { + fid = object_update_param_get(update, 1, &size); + if (IS_ERR(fid) || size != sizeof(*fid)) { + CERROR("%s: invalid fid: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(fid)); + RETURN(PTR_ERR(fid)); + } + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + lustre_swab_lu_fid(fid); + if (!fid_is_sane(fid)) { + CERROR("%s: invalid fid "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + } + + if (lu_object_exists(&obj->do_lu)) + RETURN(-EEXIST); + + rc = out_tx_create(tsi->tsi_env, obj, attr, fid, dof, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_attr_set(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct lu_attr *attr = &tti->tti_attr; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct obdo *lobdo = &tti->tti_u.update.tti_obdo; + struct obdo *wobdo; + size_t size; + int rc; + + ENTRY; + + wobdo = object_update_param_get(update, 0, &size); + if (IS_ERR(wobdo) || size != sizeof(*wobdo)) { + CERROR("%s: empty obdo in the update: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(wobdo)); + RETURN(PTR_ERR(wobdo)); + } + + attr->la_valid = 0; + attr->la_valid = 0; + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + lustre_swab_obdo(wobdo); + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + rc = out_tx_attr_set(tsi->tsi_env, obj, attr, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_attr_get(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct obdo *obdo = &tti->tti_u.update.tti_obdo; + struct lu_attr *la = 
&tti->tti_attr; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (unlikely(update->ou_result_size < sizeof(*obdo))) + return -EPROTO; + + if (!lu_object_exists(&obj->do_lu)) { + /* Usually, this will be called when the master MDT try + * to init a remote object(see osp_object_init), so if + * the object does not exist on slave, we need set BANSHEE flag, + * so the object can be removed from the cache immediately */ + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_attr_get(env, obj, la); + if (rc) + GOTO(out_unlock, rc); + + obdo->o_valid = 0; + obdo_from_la(obdo, la, la->la_valid); + +out_unlock: + dt_read_unlock(env, obj); + + CDEBUG(D_INFO, "%s: insert attr get reply %p index %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), tti->tti_u.update.tti_update_reply, + 0, rc); + + object_update_result_insert(tti->tti_u.update.tti_update_reply, obdo, + sizeof(*obdo), idx, rc); + + RETURN(rc); +} + +static int out_xattr_get(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr get: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + update_result = object_update_result_get(reply, idx, NULL); + if (update_result == NULL) { + CERROR("%s: empty name for xattr get: rc = %d\n", + tgt_name(tsi->tsi_tgt), -EPROTO); + RETURN(-EPROTO); + } + + lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; + if (lbuf->lb_len == 0) + lbuf->lb_buf = NULL; + else + lbuf->lb_buf = update_result->our_data; + + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_xattr_get(env, obj, lbuf, name); + dt_read_unlock(env, obj); + if (rc <= 0) { + lbuf->lb_len = 0; + if (unlikely(!rc)) + rc = -ENODATA; + } else if (lbuf->lb_buf) { + lbuf->lb_len = rc; + } + CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), + name, rc); + + GOTO(out, rc); + +out: + object_update_result_insert(reply, lbuf->lb_buf, lbuf->lb_len, idx, rc); + RETURN(0); +} + +static int out_xattr_list(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + update_result = object_update_result_get(reply, 0, NULL); + if (!update_result) { + rc = -EPROTO; + CERROR("%s: empty buf for xattr list: rc = %d\n", + 
tgt_name(tsi->tsi_tgt), rc); + RETURN(rc); + } + + lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; + lbuf->lb_buf = update_result->our_data; + if (lbuf->lb_len == 0) + lbuf->lb_buf = 0; + + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_xattr_list(env, obj, lbuf); + dt_read_unlock(env, obj); + if (rc <= 0) { + lbuf->lb_len = 0; + if (unlikely(!rc)) + rc = -ENODATA; + } else if (lbuf->lb_buf) { + lbuf->lb_len = rc; + } + + CDEBUG(D_INFO, "%s: "DFID" list xattr len %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), rc); + + /* Since we directly use update_result->our_data as the lbuf->lb_buf, + * then use NULL for result_insert to avoid unnecessary memory copy. */ + object_update_result_insert(reply, NULL, lbuf->lb_len, idx, rc); + + RETURN(0); +} + +static int out_index_lookup(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc; + + ENTRY; + + if (unlikely(update->ou_result_size < sizeof(tti->tti_fid1))) + return -EPROTO; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for lookup: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + dt_read_lock(env, obj, MOR_TGT_CHILD); + if (!dt_try_as_dir(env, obj)) + GOTO(out_unlock, rc = -ENOTDIR); + + rc = dt_lookup(env, obj, (struct dt_rec *)&tti->tti_fid1, + (struct dt_key *)name); + + if (rc < 0) + GOTO(out_unlock, rc); + + if (rc == 0) + rc += 1; + +out_unlock: + dt_read_unlock(env, obj); + + CDEBUG(D_INFO, "lookup "DFID" %s get "DFID" rc %d\n", + PFID(lu_object_fid(&obj->do_lu)), name, + PFID(&tti->tti_fid1), rc); + + CDEBUG(D_INFO, "%s: insert lookup reply %p index %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), tti->tti_u.update.tti_update_reply, + 0, rc); + + object_update_result_insert(tti->tti_u.update.tti_update_reply, + &tti->tti_fid1, sizeof(tti->tti_fid1), + tti->tti_u.update.tti_update_reply_index, rc); + RETURN(rc); +} + +static int out_xattr_set(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_buf *lbuf = &tti->tti_buf; + char *name; + char *buf; + __u32 *tmp; + size_t buf_len = 0; + int flag; + size_t size = 0; + int rc; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + /* If buffer == NULL (-ENODATA), then it might mean delete xattr */ + buf = object_update_param_get(update, 1, &buf_len); + if (IS_ERR(buf) && PTR_ERR(buf) != -ENODATA) + RETURN(PTR_ERR(buf)); + + lbuf->lb_buf = buf; + lbuf->lb_len = buf_len; + + tmp = object_update_param_get(update, 2, &size); + if (IS_ERR(tmp) || size != sizeof(*tmp)) { + CERROR("%s: emptry or wrong size %zu flag: rc = %ld\n", + tgt_name(tsi->tsi_tgt), size, PTR_ERR(tmp)); + RETURN(PTR_ERR(tmp)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + __swab32s(tmp); + flag = *tmp; + + rc = out_tx_xattr_set(tsi->tsi_env, obj, lbuf, name, flag, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + 
RETURN(rc); +} + +static int out_xattr_del(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + rc = out_tx_xattr_del(tsi->tsi_env, obj, name, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +/** + * increase ref of the object + **/ +static int out_ref_add(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int rc; + + ENTRY; + + rc = out_tx_ref_add(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_ref_del(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + rc = out_tx_ref_del(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_index_insert(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct dt_insert_rec *rec = &tti->tti_rec; + struct lu_fid *fid; + char *name; + __u32 *ptype; + int rc = 0; + size_t size; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for index insert: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + fid = object_update_param_get(update, 1, &size); + if (IS_ERR(fid) || size != sizeof(*fid)) { + CERROR("%s: invalid fid: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(fid)); + RETURN(PTR_ERR(fid)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + lustre_swab_lu_fid(fid); + + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + + ptype = object_update_param_get(update, 2, &size); + if (IS_ERR(ptype) || size != sizeof(*ptype)) { + CERROR("%s: invalid type for index insert: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(ptype)); + RETURN(PTR_ERR(ptype)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + __swab32s(ptype); + + rec->rec_fid = fid; + rec->rec_type = *ptype; + + rc = out_tx_index_insert(tsi->tsi_env, obj, (const struct dt_rec *)rec, + (const struct dt_key *)name, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_index_delete(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc = 0; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + name = object_update_param_get(update, 0, NULL); + 
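+ /* Parameter 0 is the NUL-terminated entry name packed by out_index_delete_pack() on the sending side. */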
if (IS_ERR(name)) { + CERROR("%s: empty name for index delete: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + rc = out_tx_index_delete(tsi->tsi_env, obj, (const struct dt_key *)name, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_destroy(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_fid *fid; + int rc; + ENTRY; + + fid = &update->ou_fid; + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + rc = out_tx_destroy(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_write(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_buf *lbuf = &tti->tti_buf; + char *buf; + __u64 *tmp; + size_t size = 0; + size_t buf_len = 0; + loff_t pos; + int rc; + ENTRY; + + buf = object_update_param_get(update, 0, &buf_len); + if (IS_ERR(buf) || buf_len == 0) { + CERROR("%s: empty buf for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(buf)); + RETURN(PTR_ERR(buf)); + } + lbuf->lb_buf = buf; + lbuf->lb_len = buf_len; + + tmp = object_update_param_get(update, 1, &size); + if (IS_ERR(tmp) || size != sizeof(*tmp)) { + CERROR("%s: empty or wrong size %zu pos: rc = %ld\n", + tgt_name(tsi->tsi_tgt), size, PTR_ERR(tmp)); + RETURN(PTR_ERR(tmp)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + __swab64s(tmp); + pos = *tmp; + + rc = out_tx_write(tsi->tsi_env, obj, lbuf, pos, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_read(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + int index = tti->tti_u.update.tti_update_reply_index; + struct lu_rdbuf *rdbuf; + struct object_update_result *update_result; + struct out_read_reply *orr; + void *tmp; + size_t size; + size_t total_size = 0; + __u64 pos; + unsigned int i; + unsigned int nbufs; + int rc = 0; + ENTRY; + + update_result = object_update_result_get(reply, index, NULL); + LASSERT(update_result != NULL); + update_result->our_datalen = sizeof(*orr); + + if (!lu_object_exists(&obj->do_lu)) + GOTO(out, rc = -ENOENT); + + tmp = object_update_param_get(update, 0, NULL); + if (IS_ERR(tmp)) { + CERROR("%s: empty size for read: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(tmp)); + GOTO(out, rc = PTR_ERR(tmp)); + } + size = le64_to_cpu(*(size_t *)(tmp)); + + tmp = object_update_param_get(update, 1, NULL); + if (IS_ERR(tmp)) { + CERROR("%s: empty pos for read: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(tmp)); + GOTO(out, rc = PTR_ERR(tmp)); + } + pos = le64_to_cpu(*(__u64 *)(tmp)); + + /* Put the offset into 
the beginning of the buffer in reply */ + orr = (struct out_read_reply *)update_result->our_data; + + nbufs = (size + OUT_BULK_BUFFER_SIZE - 1) / OUT_BULK_BUFFER_SIZE; + OBD_ALLOC(rdbuf, sizeof(struct lu_rdbuf) + + nbufs * sizeof(rdbuf->rb_bufs[0])); + if (rdbuf == NULL) + GOTO(out, rc = -ENOMEM); + + rdbuf->rb_nbufs = 0; + total_size = 0; + for (i = 0; i < nbufs; i++) { + __u32 read_size; + + read_size = size > OUT_BULK_BUFFER_SIZE ? + OUT_BULK_BUFFER_SIZE : size; + OBD_ALLOC(rdbuf->rb_bufs[i].lb_buf, read_size); + if (rdbuf->rb_bufs[i].lb_buf == NULL) + GOTO(out_free, rc = -ENOMEM); + + rdbuf->rb_bufs[i].lb_len = read_size; + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_read(env, obj, &rdbuf->rb_bufs[i], &pos); + dt_read_unlock(env, obj); + + total_size += rc < 0 ? 0 : rc; + if (rc <= 0) + break; + + rdbuf->rb_nbufs++; + size -= read_size; + } + + /* send pages to client */ + rc = tgt_send_buffer(tsi, rdbuf); + if (rc < 0) + GOTO(out_free, rc); + + orr->orr_size = total_size; + orr->orr_offset = pos; + + orr_cpu_to_le(orr, orr); + update_result->our_datalen += orr->orr_size; +out_free: + for (i = 0; i < nbufs; i++) { + if (rdbuf->rb_bufs[i].lb_buf != NULL) { + OBD_FREE(rdbuf->rb_bufs[i].lb_buf, + rdbuf->rb_bufs[i].lb_len); + } + } + OBD_FREE(rdbuf, sizeof(struct lu_rdbuf) + + nbufs * sizeof(rdbuf->rb_bufs[0])); +out: + /* Insert read buffer */ + update_result->our_rc = ptlrpc_status_hton(rc); + reply->ourp_lens[index] = cfs_size_round(update_result->our_datalen + + sizeof(*update_result)); + RETURN(rc); +} + +static int out_noop(struct tgt_session_info *tsi) +{ + return 0; +} + +#define DEF_OUT_HNDL(opc, name, flags, fn) \ +[opc - OUT_CREATE] = { \ + .th_name = name, \ + .th_fail_id = 0, \ + .th_opc = opc, \ + .th_flags = flags, \ + .th_act = fn, \ + .th_fmt = NULL, \ + .th_version = 0, \ +} + +static struct tgt_handler out_update_ops[] = { + DEF_OUT_HNDL(OUT_CREATE, "out_create", MUTABOR | HABEO_REFERO, + out_create), + DEF_OUT_HNDL(OUT_DESTROY, "out_destroy", MUTABOR | HABEO_REFERO, + out_destroy), + DEF_OUT_HNDL(OUT_REF_ADD, "out_ref_add", MUTABOR | HABEO_REFERO, + out_ref_add), + DEF_OUT_HNDL(OUT_REF_DEL, "out_ref_del", MUTABOR | HABEO_REFERO, + out_ref_del), + DEF_OUT_HNDL(OUT_ATTR_SET, "out_attr_set", MUTABOR | HABEO_REFERO, + out_attr_set), + DEF_OUT_HNDL(OUT_ATTR_GET, "out_attr_get", HABEO_REFERO, + out_attr_get), + DEF_OUT_HNDL(OUT_XATTR_SET, "out_xattr_set", MUTABOR | HABEO_REFERO, + out_xattr_set), + DEF_OUT_HNDL(OUT_XATTR_DEL, "out_xattr_del", MUTABOR | HABEO_REFERO, + out_xattr_del), + DEF_OUT_HNDL(OUT_XATTR_GET, "out_xattr_get", HABEO_REFERO, + out_xattr_get), + DEF_OUT_HNDL(OUT_INDEX_LOOKUP, "out_index_lookup", HABEO_REFERO, + out_index_lookup), + DEF_OUT_HNDL(OUT_INDEX_INSERT, "out_index_insert", + MUTABOR | HABEO_REFERO, out_index_insert), + DEF_OUT_HNDL(OUT_INDEX_DELETE, "out_index_delete", + MUTABOR | HABEO_REFERO, out_index_delete), + DEF_OUT_HNDL(OUT_WRITE, "out_write", MUTABOR | HABEO_REFERO, out_write), + DEF_OUT_HNDL(OUT_READ, "out_read", HABEO_REFERO, out_read), + DEF_OUT_HNDL(OUT_NOOP, "out_noop", HABEO_REFERO, out_noop), + DEF_OUT_HNDL(OUT_XATTR_LIST, "out_xattr_list", HABEO_REFERO, + out_xattr_list), +}; + +static struct tgt_handler *out_handler_find(__u32 opc) +{ + struct tgt_handler *h; + + h = NULL; + if (OUT_CREATE <= opc && opc < OUT_LAST) { + h = &out_update_ops[opc - OUT_CREATE]; + LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n", + h->th_opc, opc); + } else { + h = NULL; /* unsupported opc */ + } + return h; +} + +static int out_tx_start(const 
struct lu_env *env, struct dt_device *dt, + struct thandle_exec_args *ta, struct obd_export *exp) +{ + ta->ta_argno = 0; + ta->ta_handle = dt_trans_create(env, dt); + if (IS_ERR(ta->ta_handle)) { + int rc; + + rc = PTR_ERR(ta->ta_handle); + ta->ta_handle = NULL; + CERROR("%s: start handle error: rc = %d\n", dt_obd_name(dt), + rc); + return rc; + } + if (exp->exp_need_sync) + ta->ta_handle->th_sync = 1; + + return 0; +} + +static int out_trans_start(const struct lu_env *env, + struct thandle_exec_args *ta) +{ + return dt_trans_start(env, ta->ta_handle->th_dev, ta->ta_handle); +} + +static int out_trans_stop(const struct lu_env *env, + struct thandle_exec_args *ta, int err) +{ + int i; + int rc; + + ta->ta_handle->th_result = err; + rc = dt_trans_stop(env, ta->ta_handle->th_dev, ta->ta_handle); + for (i = 0; i < ta->ta_argno; i++) { + if (ta->ta_args[i]->object != NULL) { + dt_object_put(env, ta->ta_args[i]->object); + ta->ta_args[i]->object = NULL; + } + } + ta->ta_handle = NULL; + ta->ta_argno = 0; + + return rc; +} + +static int out_tx_end(const struct lu_env *env, struct thandle_exec_args *ta, + int declare_ret) +{ + struct tgt_session_info *tsi = tgt_ses_info(env); + int i; + int rc; + int rc1; + ENTRY; + + if (ta->ta_handle == NULL) + RETURN(0); + + if (declare_ret != 0 || ta->ta_argno == 0) + GOTO(stop, rc = declare_ret); + + LASSERT(ta->ta_handle->th_dev != NULL); + rc = out_trans_start(env, ta); + if (unlikely(rc != 0)) + GOTO(stop, rc); + + for (i = 0; i < ta->ta_argno; i++) { + rc = ta->ta_args[i]->exec_fn(env, ta->ta_handle, + ta->ta_args[i]); + if (unlikely(rc != 0)) { + CDEBUG(D_INFO, "error during execution of #%u from" + " %s:%d: rc = %d\n", i, ta->ta_args[i]->file, + ta->ta_args[i]->line, rc); + while (--i >= 0) { + if (ta->ta_args[i]->undo_fn != NULL) + ta->ta_args[i]->undo_fn(env, + ta->ta_handle, + ta->ta_args[i]); + else + CERROR("%s: undo for %s:%d: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), + ta->ta_args[i]->file, + ta->ta_args[i]->line, -ENOTSUPP); + } + break; + } + CDEBUG(D_INFO, "%s: executed %u/%u: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), i, ta->ta_argno, rc); + } + + /* Only fail for real updates, XXX right now llog updates will be + * ignore, whose updates count is usually 1, so failover test + * case will spot this FAIL_UPDATE_NET_REP precisely, and it will + * be removed after async update patch is landed. */ + if (ta->ta_argno > 1) + tsi->tsi_reply_fail_id = OBD_FAIL_OUT_UPDATE_NET_REP; + +stop: + rc1 = out_trans_stop(env, ta, rc); + if (rc == 0) + rc = rc1; + + ta->ta_handle = NULL; + ta->ta_argno = 0; + + RETURN(rc); +} + +/** + * Object updates between Targets. Because all the updates has been + * dis-assemblied into object updates at sender side, so OUT will + * call OSD API directly to execute these updates. + * + * In DNE phase I all of the updates in the request need to be executed + * in one transaction, and the transaction has to be synchronously. + * + * Please refer to lustre/include/lustre/lustre_idl.h for req/reply + * format. 
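+ * + * Updates that share the same ou_batchid are executed inside one local transaction, while read-only updates (attr/xattr get, lookup, list, read) are executed outside of any transaction; see the current_batchid handling below.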
+ */ +int out_handle(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle_exec_args *ta = &tti->tti_tea; + struct req_capsule *pill = tsi->tsi_pill; + struct dt_device *dt = tsi->tsi_tgt->lut_bottom; + struct out_update_header *ouh; + struct out_update_buffer *oub = NULL; + struct object_update *update; + struct object_update_reply *reply; + struct ptlrpc_bulk_desc *desc = NULL; + struct l_wait_info lwi; + void **update_bufs; + int current_batchid = -1; + __u32 update_buf_count; + unsigned int i; + unsigned int reply_index = 0; + int rc = 0; + int rc1 = 0; + int ouh_size, reply_size; + int updates; + bool need_reconstruct; + + ENTRY; + + req_capsule_set(pill, &RQF_OUT_UPDATE); + ouh_size = req_capsule_get_size(pill, &RMF_OUT_UPDATE_HEADER, + RCL_CLIENT); + if (ouh_size <= 0) + RETURN(err_serious(-EPROTO)); + + ouh = req_capsule_client_get(pill, &RMF_OUT_UPDATE_HEADER); + if (ouh == NULL) + RETURN(err_serious(-EPROTO)); + + if (ouh->ouh_magic != OUT_UPDATE_HEADER_MAGIC) { + CERROR("%s: invalid update buffer magic %x expect %x: " + "rc = %d\n", tgt_name(tsi->tsi_tgt), ouh->ouh_magic, + UPDATE_REQUEST_MAGIC, -EPROTO); + RETURN(err_serious(-EPROTO)); + } + + update_buf_count = ouh->ouh_count; + if (update_buf_count == 0) + RETURN(err_serious(-EPROTO)); + + OBD_ALLOC(update_bufs, sizeof(*update_bufs) * update_buf_count); + if (update_bufs == NULL) + RETURN(err_serious(-ENOMEM)); + + if (ouh->ouh_inline_length > 0) { + update_bufs[0] = ouh->ouh_inline_data; + } else { + struct out_update_buffer *tmp; + + oub = req_capsule_client_get(pill, &RMF_OUT_UPDATE_BUF); + if (oub == NULL) + GOTO(out_free, rc = err_serious(-EPROTO)); + + desc = ptlrpc_prep_bulk_exp(pill->rc_req, update_buf_count, + PTLRPC_BULK_OPS_COUNT, + PTLRPC_BULK_GET_SINK | + PTLRPC_BULK_BUF_KVEC, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kvec_ops); + if (desc == NULL) + GOTO(out_free, rc = err_serious(-ENOMEM)); + + tmp = oub; + for (i = 0; i < update_buf_count; i++, tmp++) { + if (tmp->oub_size >= OUT_MAXREQSIZE) + GOTO(out_free, rc = err_serious(-EPROTO)); + + OBD_ALLOC_LARGE(update_bufs[i], tmp->oub_size); + if (update_bufs[i] == NULL) + GOTO(out_free, rc = err_serious(-ENOMEM)); + + desc->bd_frag_ops->add_iov_frag(desc, update_bufs[i], + tmp->oub_size); + } + + pill->rc_req->rq_bulk_write = 1; + rc = sptlrpc_svc_prep_bulk(pill->rc_req, desc); + if (rc != 0) + GOTO(out_free, rc = err_serious(rc)); + + rc = target_bulk_io(pill->rc_req->rq_export, desc, &lwi); + if (rc < 0) + GOTO(out_free, rc = err_serious(rc)); + } + /* validate the request and calculate the total update count and + * set it to reply */ + reply_size = 0; + updates = 0; + for (i = 0; i < update_buf_count; i++) { + struct object_update_request *our; + int j; + + our = update_bufs[i]; + if (ptlrpc_req_need_swab(pill->rc_req)) + lustre_swab_object_update_request(our); + + if (our->ourq_magic != UPDATE_REQUEST_MAGIC) { + CERROR("%s: invalid update buffer magic %x" + " expect %x: rc = %d\n", + tgt_name(tsi->tsi_tgt), our->ourq_magic, + UPDATE_REQUEST_MAGIC, -EPROTO); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + updates += our->ourq_count; + + /* need to calculate reply size */ + for (j = 0; j < our->ourq_count; j++) { + update = object_update_request_get(our, j, NULL); + if (update == NULL) + GOTO(out, rc = err_serious(-EPROTO)); + if (ptlrpc_req_need_swab(pill->rc_req)) + lustre_swab_object_update(update); + + if (!fid_is_sane(&update->ou_fid)) { + CERROR("%s: invalid FID "DFID": rc 
= %d\n", + tgt_name(tsi->tsi_tgt), + PFID(&update->ou_fid), -EPROTO); + GOTO(out, rc = err_serious(-EPROTO)); + } + + /* XXX: what ou_result_size can be considered safe? */ + + reply_size += sizeof(reply->ourp_lens[0]); + reply_size += sizeof(struct object_update_result); + reply_size += update->ou_result_size; + } + } + reply_size += sizeof(*reply); + + if (unlikely(reply_size > ouh->ouh_reply_size)) { + CERROR("%s: too small reply buf %u for %u, need %u at least\n", + tgt_name(tsi->tsi_tgt), ouh->ouh_reply_size, + updates, reply_size); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + + req_capsule_set_size(pill, &RMF_OUT_UPDATE_REPLY, RCL_SERVER, + ouh->ouh_reply_size); + rc = req_capsule_server_pack(pill); + if (rc != 0) { + CERROR("%s: Can't pack response: rc = %d\n", + tgt_name(tsi->tsi_tgt), rc); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + + /* Prepare the update reply buffer */ + reply = req_capsule_server_get(pill, &RMF_OUT_UPDATE_REPLY); + if (reply == NULL) + GOTO(out_free, rc = -EPROTO); + reply->ourp_magic = UPDATE_REPLY_MAGIC; + reply->ourp_count = updates; + tti->tti_u.update.tti_update_reply = reply; + tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi)); + + need_reconstruct = out_check_resent(pill->rc_req); + + /* Walk through updates in the request to execute them */ + for (i = 0; i < update_buf_count; i++) { + struct tgt_handler *h; + struct dt_object *dt_obj; + int update_count; + struct object_update_request *our; + int j; + + our = update_bufs[i]; + update_count = our->ourq_count; + for (j = 0; j < update_count; j++) { + struct lu_object_conf conf; + + update = object_update_request_get(our, j, NULL); + if (update->ou_type == OUT_CREATE) + conf.loc_flags = LOC_F_NEW; + else + conf.loc_flags = 0; + + dt_obj = dt_locate_at(env, dt, &update->ou_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); + if (IS_ERR(dt_obj)) + GOTO(out, rc = PTR_ERR(dt_obj)); + + if (dt->dd_record_fid_accessed) { + struct lfsck_req_local *lrl = &tti->tti_lrl; + + lfsck_pack_rfa(lrl, + lu_object_fid(&dt_obj->do_lu), + LEL_FID_ACCESSED, + LFSCK_TYPE_LAYOUT); + tgt_lfsck_in_notify_local(env, dt, lrl, NULL); + } + + tti->tti_u.update.tti_dt_object = dt_obj; + tti->tti_u.update.tti_update = update; + tti->tti_u.update.tti_update_reply_index = reply_index; + + h = out_handler_find(update->ou_type); + if (unlikely(h == NULL)) { + CERROR("%s: unsupported opc: 0x%x\n", + tgt_name(tsi->tsi_tgt), update->ou_type); + GOTO(next, rc = -ENOTSUPP); + } + + /* Check resend case only for modifying RPC */ + if (h->th_flags & MUTABOR) { + /* sanity check for last XID changing */ + if (unlikely(!need_reconstruct && + req_xid_is_last(pill->rc_req))) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "unexpected last XID change"); + GOTO(next, rc = -EINVAL); + } + + if (need_reconstruct) { + out_reconstruct(env, dt, dt_obj, reply, + reply_index); + GOTO(next, rc = 0); + } + + if (dt->dd_rdonly) + GOTO(next, rc = -EROFS); + } + + /* start transaction for modification RPC only */ + if (h->th_flags & MUTABOR && current_batchid == -1) { + current_batchid = update->ou_batchid; + + if (reply_index == 0) + CFS_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + + rc = out_tx_start(env, dt, ta, tsi->tsi_exp); + if (rc != 0) + GOTO(next, rc); + + if (update->ou_flags & UPDATE_FL_SYNC) + ta->ta_handle->th_sync = 1; + } + + /* Stop the current update transaction, if the update + * has different batchid, or read-only update */ + if (((current_batchid != update->ou_batchid) || + !(h->th_flags & MUTABOR)) && + ta->ta_handle != NULL) { + rc = 
out_tx_end(env, ta, rc); + current_batchid = -1; + if (rc != 0) + GOTO(next, rc); + + /* start a new transaction if needed */ + if (h->th_flags & MUTABOR) { + rc = out_tx_start(env, dt, ta, + tsi->tsi_exp); + if (rc != 0) + GOTO(next, rc); + if (update->ou_flags & UPDATE_FL_SYNC) + ta->ta_handle->th_sync = 1; + current_batchid = update->ou_batchid; + } + } + + rc = h->th_act(tsi); +next: + reply_index++; + dt_object_put(env, dt_obj); + if (rc < 0) + GOTO(out, rc); + } + } +out: + if (current_batchid != -1) { + rc1 = out_tx_end(env, ta, rc); + if (rc == 0) + rc = rc1; + } + +out_free: + if (update_bufs != NULL) { + if (oub != NULL) { + for (i = 0; i < update_buf_count; i++, oub++) { + if (update_bufs[i] != NULL) + OBD_FREE_LARGE(update_bufs[i], + oub->oub_size); + } + } + + OBD_FREE(update_bufs, sizeof(*update_bufs) * update_buf_count); + } + + if (desc != NULL) + ptlrpc_free_bulk(desc); + + RETURN(rc); +} + +struct tgt_handler tgt_out_handlers[] = { +TGT_UPDATE_HDL(MUTABOR, OUT_UPDATE, out_handle), +}; +EXPORT_SYMBOL(tgt_out_handlers); + diff --git a/drivers/staging/lustrefsx/lustre/target/out_lib.c b/drivers/staging/lustrefsx/lustre/target/out_lib.c new file mode 100644 index 0000000000000..e8ebf95f4786c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/out_lib.c @@ -0,0 +1,1280 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2017, Intel Corporation. + */ +/* + * lustre/target/out_lib.c + * + * Author: Di Wang + * Author: Fan, Yong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +const char *update_op_str(__u16 opc) +{ + static const char *opc_str[] = { + [OUT_START] = "start", + [OUT_CREATE] = "create", + [OUT_DESTROY] = "destroy", + [OUT_REF_ADD] = "ref_add", + [OUT_REF_DEL] = "ref_del" , + [OUT_ATTR_SET] = "attr_set", + [OUT_ATTR_GET] = "attr_get", + [OUT_XATTR_SET] = "xattr_set", + [OUT_XATTR_GET] = "xattr_get", + [OUT_XATTR_LIST] = "xattr_list", + [OUT_INDEX_LOOKUP] = "lookup", + [OUT_INDEX_INSERT] = "insert", + [OUT_INDEX_DELETE] = "delete", + [OUT_WRITE] = "write", + [OUT_XATTR_DEL] = "xattr_del", + [OUT_PUNCH] = "punch", + [OUT_READ] = "read", + [OUT_NOOP] = "noop", + }; + + if (opc < ARRAY_SIZE(opc_str) && opc_str[opc] != NULL) + return opc_str[opc]; + else + return "unknown"; +} +EXPORT_SYMBOL(update_op_str); + +/** + * Fill object update header + * + * Only fill the object update header, and parameters will be filled later + * in other functions. 
+ * + * \params[in] env execution environment + * \params[in] update object update to be filled + * \params[in,out] max_update_size maximum object update size, if the + * current update length equals or + * exceeds the size, it will return -E2BIG. + * \params[in] update_op update type + * \params[in] fid object FID of the update + * \params[in] param_count the count of the update parameters + * \params[in] param_sizes the length of each parameters + * + * \retval 0 if packing succeeds. + * \retval -E2BIG if packing exceeds the maximum length. + */ +int out_update_header_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, + enum update_type update_op, + const struct lu_fid *fid, + unsigned int param_count, + __u16 *param_sizes, + __u32 reply_size) +{ + struct object_update_param *param; + unsigned int i; + size_t update_size; + + if (reply_size >= LNET_MTU) + return -EINVAL; + + /* Check whether the packing exceeding the maxima update length */ + update_size = sizeof(*update); + for (i = 0; i < param_count; i++) + update_size += cfs_size_round(sizeof(*param) + param_sizes[i]); + + if (unlikely(update_size >= *max_update_size)) { + *max_update_size = update_size; + return -E2BIG; + } + + update->ou_fid = *fid; + update->ou_type = update_op; + update->ou_params_count = param_count; + update->ou_result_size = reply_size; + param = &update->ou_params[0]; + for (i = 0; i < param_count; i++) { + param->oup_len = param_sizes[i]; + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } + + return 0; +} + +/** + * Packs one update into the update_buffer. + * + * \param[in] env execution environment + * \param[in] update update to be packed + * \param[in] max_update_size *maximum size of \a update + * \param[in] op update operation (enum update_type) + * \param[in] fid object FID for this update + * \param[in] param_count number of parameters for this update + * \param[in] param_sizes array of parameters length of this update + * \param[in] param_bufs parameter buffers + * + * \retval = 0 if updates packing succeeds + * \retval negative errno if updates packing fails + **/ +int out_update_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, enum update_type op, + const struct lu_fid *fid, unsigned int param_count, + __u16 *param_sizes, const void **param_bufs, + __u32 reply_size) +{ + struct object_update_param *param; + unsigned int i; + int rc; + ENTRY; + + rc = out_update_header_pack(env, update, max_update_size, op, fid, + param_count, param_sizes, reply_size); + if (rc != 0) + RETURN(rc); + + param = &update->ou_params[0]; + for (i = 0; i < param_count; i++) { + memcpy(¶m->oup_buf[0], param_bufs[i], param_sizes[i]); + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } + + RETURN(0); +} +EXPORT_SYMBOL(out_update_pack); + +/** + * Pack various updates into the update_buffer. + * + * The following functions pack different updates into the update_buffer + * So parameters of these API is basically same as its correspondent OSD/OSP + * API, for detail description of these parameters see osd_handler.c or + * osp_md_object.c. + * + * \param[in] env execution environment + * \param[in] ubuf update buffer + * \param[in] fid fid of this object for the update + * + * \retval 0 if insertion succeeds. + * \retval negative errno if insertion fails. 
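+ * + * Each helper below first reserves space through out_update_header_pack(); if the update does not fit, -E2BIG is returned and the required size is passed back through \a max_update_size so that the caller can retry with a larger update buffer.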
+ */ +int out_create_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + struct obdo *obdo; + __u16 sizes[2] = {sizeof(*obdo), 0}; + int buf_count = 1; + const struct lu_fid *parent_fid = NULL; + int rc; + ENTRY; + + if (hint != NULL && hint->dah_parent) { + parent_fid = lu_object_fid(&hint->dah_parent->do_lu); + sizes[1] = sizeof(*parent_fid); + buf_count++; + } + + rc = out_update_header_pack(env, update, max_update_size, OUT_CREATE, + fid, buf_count, sizes, 0); + if (rc != 0) + RETURN(rc); + + obdo = object_update_param_get(update, 0, NULL); + if (IS_ERR(obdo)) + RETURN(PTR_ERR(obdo)); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + + if (parent_fid != NULL) { + struct lu_fid *tmp; + + tmp = object_update_param_get(update, 1, NULL); + if (IS_ERR(tmp)) + RETURN(PTR_ERR(tmp)); + + fid_cpu_to_le(tmp, parent_fid); + } + + RETURN(0); +} +EXPORT_SYMBOL(out_create_pack); + +int out_ref_del_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_REF_DEL, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_ref_del_pack); + +int out_ref_add_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_REF_ADD, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_ref_add_pack); + +int out_attr_set_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr) +{ + struct obdo *obdo; + __u16 size = sizeof(*obdo); + int rc; + ENTRY; + + rc = out_update_header_pack(env, update, max_update_size, + OUT_ATTR_SET, fid, 1, &size, 0); + if (rc != 0) + RETURN(rc); + + obdo = object_update_param_get(update, 0, NULL); + if (IS_ERR(obdo)) + RETURN(PTR_ERR(obdo)); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + + RETURN(0); +} +EXPORT_SYMBOL(out_attr_set_pack); + +int out_xattr_set_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, __u32 flag) +{ + __u16 sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + const void *bufs[3] = {(char *)name, (char *)buf->lb_buf, + (char *)&flag}; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_SET, + fid, ARRAY_SIZE(sizes), sizes, bufs, 0); +} +EXPORT_SYMBOL(out_xattr_set_pack); + +int out_xattr_del_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const char *name) +{ + __u16 size = strlen(name) + 1; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_DEL, + fid, 1, &size, (const void **)&name, 0); +} +EXPORT_SYMBOL(out_xattr_del_pack); + +int out_index_insert_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct dt_rec *rec, const struct dt_key *key) +{ + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + struct lu_fid rec_fid; + __u32 type = cpu_to_le32(rec1->rec_type); + __u16 sizes[3] = { strlen((char *)key) + 1, + sizeof(rec_fid), + sizeof(type) }; + const void *bufs[3] = { (char *)key, + (char *)&rec_fid, + (char *)&type }; + + fid_cpu_to_le(&rec_fid, rec1->rec_fid); + + 
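+ /* The name, the little-endian FID and the record type are packed as three separate update parameters; out_index_insert() unpacks them in the same order on the receiving side. */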
return out_update_pack(env, update, max_update_size, OUT_INDEX_INSERT, + fid, ARRAY_SIZE(sizes), sizes, bufs, 0); +} +EXPORT_SYMBOL(out_index_insert_pack); + +int out_index_delete_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct dt_key *key) +{ + __u16 size = strlen((char *)key) + 1; + const void *buf = key; + + return out_update_pack(env, update, max_update_size, OUT_INDEX_DELETE, + fid, 1, &size, &buf, 0); +} +EXPORT_SYMBOL(out_index_delete_pack); + +int out_destroy_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_DESTROY, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_destroy_pack); + +int out_write_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_buf *buf, __u64 pos) +{ + __u16 sizes[2] = {buf->lb_len, sizeof(pos)}; + const void *bufs[2] = {(char *)buf->lb_buf, (char *)&pos}; + int rc; + + pos = cpu_to_le64(pos); + + rc = out_update_pack(env, update, max_update_size, OUT_WRITE, fid, + ARRAY_SIZE(sizes), sizes, bufs, 0); + return rc; +} +EXPORT_SYMBOL(out_write_pack); + +/** + * Pack various readonly updates into the update_buffer. + * + * The following update funcs are only used by read-only ops, lookup, + * getattr etc, so it does not need transaction here. Currently they + * are only used by OSP. + * + * \param[in] env execution environment + * \param[in] fid fid of this object for the update + * \param[in] ubuf update buffer + * + * \retval = 0 pack succeed. + * < 0 pack failed. + **/ +int out_index_lookup_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + struct dt_rec *rec, const struct dt_key *key) +{ + const void *name = key; + __u16 size = strlen((char *)name) + 1; + + /* XXX: this shouldn't be hardcoded */ + return out_update_pack(env, update, max_update_size, OUT_INDEX_LOOKUP, + fid, 1, &size, &name, 256); +} +EXPORT_SYMBOL(out_index_lookup_pack); + +int out_attr_get_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_ATTR_GET, + fid, 0, NULL, NULL, sizeof(struct obdo)); +} +EXPORT_SYMBOL(out_attr_get_pack); + +int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const char *name, const int bufsize) +{ + __u16 size; + + LASSERT(name != NULL); + size = strlen(name) + 1; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_GET, + fid, 1, &size, (const void **)&name, bufsize); +} +EXPORT_SYMBOL(out_xattr_get_pack); + +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize) +{ + return out_update_pack(env, update, max_update_size, OUT_XATTR_LIST, + fid, 0, NULL, NULL, bufsize); +} +EXPORT_SYMBOL(out_xattr_list_pack); + +int out_read_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + size_t size, loff_t pos) +{ + __u16 sizes[2] = {sizeof(size), sizeof(pos)}; + const void *bufs[2] = {&size, &pos}; + + LASSERT(size > 0); + size = cpu_to_le64(size); + pos = cpu_to_le64(pos); + + return out_update_pack(env, update, max_update_size, OUT_READ, fid, + 
ARRAY_SIZE(sizes), sizes, bufs, size); +} +EXPORT_SYMBOL(out_read_pack); + +static int tx_extend_args(struct thandle_exec_args *ta, int new_alloc_ta) +{ + struct tx_arg **new_ta; + int i; + int rc = 0; + + if (ta->ta_alloc_args >= new_alloc_ta) + return 0; + + OBD_ALLOC(new_ta, sizeof(*new_ta) * new_alloc_ta); + if (new_ta == NULL) + return -ENOMEM; + + for (i = 0; i < new_alloc_ta; i++) { + if (i < ta->ta_alloc_args) { + /* copy the old args to new one */ + new_ta[i] = ta->ta_args[i]; + } else { + OBD_ALLOC_PTR(new_ta[i]); + if (new_ta[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + } + + /* free the old args */ + if (ta->ta_args != NULL) + OBD_FREE(ta->ta_args, sizeof(ta->ta_args[0]) * + ta->ta_alloc_args); + + ta->ta_args = new_ta; + ta->ta_alloc_args = new_alloc_ta; +out: + if (rc != 0) { + for (i = 0; i < new_alloc_ta; i++) { + if (new_ta[i] != NULL) + OBD_FREE_PTR(new_ta[i]); + } + OBD_FREE(new_ta, sizeof(*new_ta) * new_alloc_ta); + } + return rc; +} + +#define TX_ALLOC_STEP 8 +struct tx_arg *tx_add_exec(struct thandle_exec_args *ta, + tx_exec_func_t func, tx_exec_func_t undo, + const char *file, int line) +{ + int rc; + int i; + + LASSERT(ta != NULL); + LASSERT(func != NULL); + + if (ta->ta_argno + 1 >= ta->ta_alloc_args) { + rc = tx_extend_args(ta, ta->ta_alloc_args + TX_ALLOC_STEP); + if (rc != 0) + return ERR_PTR(rc); + } + + i = ta->ta_argno; + + ta->ta_argno++; + + ta->ta_args[i]->exec_fn = func; + ta->ta_args[i]->undo_fn = undo; + ta->ta_args[i]->file = file; + ta->ta_args[i]->line = line; + + return ta->ta_args[i]; +} + +static int out_obj_destroy(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: destroy "DFID"\n", dt_obd_name(th->th_dev), + PFID(lu_object_fid(&dt_obj->do_lu))); + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_destroy(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +/** + * All of the xxx_undo will be used once execution failed, + * But because all of the required resource has been reserved in + * declare phase, i.e. if declare succeed, it should make sure + * the following executing phase succeed in anyway, so these undo + * should be useless for most of the time in Phase I + */ +static int out_tx_create_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + int rc; + + rc = out_obj_destroy(env, arg->object, th); + if (rc != 0) + CERROR("%s: undo failure, we are doomed!: rc = %d\n", + dt_obd_name(th->th_dev), rc); + return rc; +} + +int out_tx_create_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_OTHER, "%s: create "DFID": dof %u, mode %o\n", + dt_obd_name(th->th_dev), + PFID(lu_object_fid(&arg->object->do_lu)), + arg->u.create.dof.dof_type, + arg->u.create.attr.la_mode & S_IFMT); + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_create(env, dt_obj, &arg->u.create.attr, + &arg->u.create.hint, &arg->u.create.dof, th); + + dt_write_unlock(env, dt_obj); + + CDEBUG(D_INFO, "%s: insert create reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +/** + * Add create update to thandle + * + * Declare create updates and add the update to the thandle updates + * exec array. 
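+ * The object reference taken here is released in out_trans_stop() after the transaction is stopped.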
+ * + * \param [in] env execution environment + * \param [in] obj object to be created + * \param [in] attr attributes of the creation + * \param [in] parent_fid the fid of the parent + * \param [in] dof dt object format of the creation + * \param [in] ta thandle execuation args where all of updates + * of the transaction are stored + * \param [in] th thandle for this update + * \param [in] reply reply of the updates + * \param [in] index index of the reply + * \param [in] file the file name where the function is called, + * which is only for debugging purpose. + * \param [in] line the line number where the funtion is called, + * which is only for debugging purpose. + * + * \retval 0 if updates is added successfully. + * \retval negative errno if update adding fails. + */ +int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, + struct lu_attr *attr, struct lu_fid *parent_fid, + struct dt_object_format *dof, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + /* LU-13653: ignore quota for DNE directory creation */ + if (dof->dof_type == DFT_DIR) + th->th_ignore_quota = 1; + + rc = dt_declare_create(env, obj, attr, NULL, dof, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_create_exec, out_tx_create_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + /* release the object in out_trans_stop */ + lu_object_get(&obj->do_lu); + arg->object = obj; + arg->u.create.attr = *attr; + if (parent_fid != NULL) + arg->u.create.fid = *parent_fid; + memset(&arg->u.create.hint, 0, sizeof(arg->u.create.hint)); + arg->u.create.dof = *dof; + arg->reply = reply; + arg->index = index; + + return 0; +} + +static int out_tx_attr_set_undo(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + CERROR("%s: attr set undo "DFID" unimplemented yet!: rc = %d\n", + dt_obd_name(th->th_dev), + PFID(lu_object_fid(&arg->object->do_lu)), -ENOTSUPP); + + return -ENOTSUPP; +} + +static int out_tx_attr_set_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_OTHER, "%s: attr set "DFID"\n", dt_obd_name(th->th_dev), + PFID(lu_object_fid(&dt_obj->do_lu))); + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_attr_set(env, dt_obj, &arg->u.attr_set.attr, th); + dt_write_unlock(env, dt_obj); + + CDEBUG(D_INFO, "%s: insert attr_set reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, + arg->index, rc); + + return rc; +} + +int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_attr *attr, + struct thandle_exec_args *ta, + struct thandle *th, struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_attr_set(env, dt_obj, attr, th); + if (rc != 0) + return rc; + + if (attr->la_valid & LA_FLAGS && + attr->la_flags & LUSTRE_SET_SYNC_FL) + th->th_sync |= 1; + + arg = tx_add_exec(ta, out_tx_attr_set_exec, out_tx_attr_set_undo, + file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.attr_set.attr = *attr; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_write_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct 
dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_INFO, "write "DFID" pos %llu buf %p, len %lu\n", + PFID(lu_object_fid(&dt_obj->do_lu)), arg->u.write.pos, + arg->u.write.buf.lb_buf, (unsigned long)arg->u.write.buf.lb_len); + + if (OBD_FAIL_CHECK(OBD_FAIL_OUT_ENOSPC)) { + rc = -ENOSPC; + } else { + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_record_write(env, dt_obj, &arg->u.write.buf, + &arg->u.write.pos, th); + dt_write_unlock(env, dt_obj); + + if (rc == 0) + rc = arg->u.write.buf.lb_len; + } + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc > 0 ? 0 : rc; +} + +int out_write_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, loff_t pos, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_record_write(env, dt_obj, buf, pos, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_write_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.write.buf = *buf; + arg->u.write.pos = pos; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_xattr_set_exec(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + ENTRY; + + CDEBUG(D_INFO, "%s: set xattr buf %p name %s flag %d\n", + dt_obd_name(th->th_dev), arg->u.xattr_set.buf.lb_buf, + arg->u.xattr_set.name, arg->u.xattr_set.flags); + + if (!lu_object_exists(&dt_obj->do_lu)) { + rc = -ENOENT; + } else { + struct linkea_data ldata = { 0 }; + bool linkea; + + ldata.ld_buf = &arg->u.xattr_set.buf; + if (strcmp(arg->u.xattr_set.name, XATTR_NAME_LINK) == 0) { + struct link_ea_header *leh; + + linkea = true; + rc = linkea_init(&ldata); + if (unlikely(rc)) + GOTO(out, rc == -ENODATA ? -EINVAL : rc); + + leh = ldata.ld_leh; + LASSERT(leh != NULL); + + /* If the new linkEA contains overflow timestamp, + * then two cases: + * + * 1. The old linkEA for the object has already + * overflowed before current setting, the new + * linkEA does not contains new link entry. So + * the linkEA overflow timestamp is unchanged. + * + * 2. There are new link entry in the new linkEA, + * so its overflow timestamp is differnt from + * the old one. Usually, the overstamp in the + * given linkEA is newer. But because of clock + * drift among MDTs, the timestamp may become + * older. So here, we convert the timestamp to + * the server local time. Then namespace LFSCK + * that uses local time can handle it easily. */ + if (unlikely(leh->leh_overflow_time)) { + struct lu_buf tbuf = { 0 }; + bool update = false; + + lu_buf_alloc(&tbuf, MAX_LINKEA_SIZE); + if (tbuf.lb_buf == NULL) + GOTO(unlock, rc = -ENOMEM); + + rc = dt_xattr_get(env, dt_obj, &tbuf, + XATTR_NAME_LINK); + if (rc > 0) { + struct linkea_data tdata = { 0 }; + + tdata.ld_buf = &tbuf; + rc = linkea_init(&tdata); + if (rc || leh->leh_overflow_time != + tdata.ld_leh->leh_overflow_time) + update = true; + } else { + /* Update the timestamp by force if + * fail to load the old linkEA. 
*/ + update = true; + } + + lu_buf_free(&tbuf); + if (update) { + leh->leh_overflow_time = ktime_get_real_seconds(); + if (unlikely(!leh->leh_overflow_time)) + leh->leh_overflow_time++; + } + } + } else { + linkea = false; + } + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + +again: + rc = dt_xattr_set(env, dt_obj, ldata.ld_buf, + arg->u.xattr_set.name, arg->u.xattr_set.flags, + th); + if (unlikely(rc == -ENOSPC && linkea)) { + rc = linkea_overflow_shrink(&ldata); + if (likely(rc > 0)) { + arg->u.xattr_set.buf.lb_len = rc; + goto again; + } + } + +unlock: + dt_write_unlock(env, dt_obj); + } + + GOTO(out, rc); + +out: + CDEBUG(D_INFO, "%s: insert xattr set reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +int out_xattr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, const char *name, + int flags, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_xattr_set(env, dt_obj, buf, name, flags, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_xattr_set_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.xattr_set.name = name; + arg->u.xattr_set.flags = flags; + arg->u.xattr_set.buf = *buf; + arg->reply = reply; + arg->index = index; + arg->u.xattr_set.csum = 0; + return 0; +} + +static int out_tx_xattr_del_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_INFO, "%s: del xattr name '%s' on "DFID"\n", + dt_obd_name(th->th_dev), arg->u.xattr_set.name, + PFID(lu_object_fid(&dt_obj->do_lu))); + + if (!lu_object_exists(&dt_obj->do_lu)) + GOTO(out, rc = -ENOENT); + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_xattr_del(env, dt_obj, arg->u.xattr_set.name, + th); + dt_write_unlock(env, dt_obj); +out: + CDEBUG(D_INFO, "%s: insert xattr del reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +int out_xattr_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const char *name, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_xattr_del(env, dt_obj, name, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_xattr_del_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.xattr_set.name = name; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_obj_ref_add(const struct lu_env *env, + struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_ref_add(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_obj_ref_del(const struct lu_env *env, + struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_ref_del(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_tx_ref_add_exec(const struct lu_env 
*env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_ref_add(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert ref_add reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + return rc; +} + +static int out_tx_ref_add_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + return out_obj_ref_del(env, arg->object, th); +} + +int out_ref_add_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_ref_add(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_ref_add_exec, out_tx_ref_add_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_ref_del_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_ref_del(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert ref_del reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, 0); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +static int out_tx_ref_del_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + return out_obj_ref_add(env, arg->object, th); +} + +int out_ref_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_ref_del(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_ref_del_exec, out_tx_ref_del_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_obj_index_insert(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: index insert "DFID" name: %s fid "DFID", type %u\n", + dt_obd_name(th->th_dev), PFID(lu_object_fid(&dt_obj->do_lu)), + (char *)key, PFID(((struct dt_insert_rec *)rec)->rec_fid), + ((struct dt_insert_rec *)rec)->rec_type); + + if (dt_try_as_dir(env, dt_obj) == 0) + return -ENOTDIR; + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_insert(env, dt_obj, rec, key, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_obj_index_delete(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: index delete "DFID" name: %s\n", + dt_obd_name(th->th_dev), PFID(lu_object_fid(&dt_obj->do_lu)), + (char *)key); + + if (dt_try_as_dir(env, dt_obj) == 0) + return -ENOTDIR; + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_delete(env, dt_obj, key, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_tx_index_insert_exec(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + if 
(unlikely(!dt_object_exists(dt_obj))) + RETURN(-ESTALE); + + rc = out_obj_index_insert(env, dt_obj, + (const struct dt_rec *)&arg->u.insert.rec, + arg->u.insert.key, th); + + CDEBUG(D_INFO, "%s: insert idx insert reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + return rc; +} + +static int out_tx_index_insert_undo(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + return out_obj_index_delete(env, arg->object, arg->u.insert.key, th); +} + +int out_index_insert_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + if (dt_try_as_dir(env, dt_obj) == 0) { + rc = -ENOTDIR; + return rc; + } + + rc = dt_declare_insert(env, dt_obj, rec, key, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_index_insert_exec, + out_tx_index_insert_undo, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + arg->u.insert.rec = *(const struct dt_insert_rec *)rec; + arg->u.insert.key = key; + + return 0; +} + +static int out_tx_index_delete_exec(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + int rc; + + rc = out_obj_index_delete(env, arg->object, arg->u.insert.key, th); + + CDEBUG(D_INFO, "%s: delete idx insert reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +static int out_tx_index_delete_undo(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + CERROR("%s: Oops, can not rollback index_delete yet: rc = %d\n", + dt_obd_name(th->th_dev), -ENOTSUPP); + return -ENOTSUPP; +} + +int out_index_delete_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + if (dt_try_as_dir(env, dt_obj) == 0) { + rc = -ENOTDIR; + return rc; + } + + LASSERT(ta->ta_handle != NULL); + rc = dt_declare_delete(env, dt_obj, key, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_index_delete_exec, + out_tx_index_delete_undo, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + arg->u.insert.key = key; + return 0; +} + +static int out_tx_destroy_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_destroy(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert destroy reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + RETURN(rc); +} + +static int out_tx_destroy_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + CERROR("%s: not support destroy undo yet!: rc = %d\n", + dt_obd_name(th->th_dev), -ENOTSUPP); + return -ENOTSUPP; +} + +int out_destroy_add_exec(const struct lu_env 
*env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_destroy(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_destroy_exec, out_tx_destroy_undo, + file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c new file mode 100644 index 0000000000000..afbf668e38a70 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Copyright (c) 2019, DDN Storage Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/target/tgt_fmd.c + * + * This file provides functions to handle Filter Modification Data (FMD). + * The FMD is responsible for file attributes to be applied in + * Transaction ID (XID) order, so older requests can't re-write newer + * attributes. + * + * FMD is organized as per-client list and identified by FID of object. Each + * FMD stores FID of object and the highest received XID of modification + * request for this object. + * + * FMD can expire if there are no updates for a long time to keep the list + * reasonably small. + * + * Author: Andreas Dilger + * Author: Mike Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include "tgt_internal.h" + +/** + * Drop FMD reference and free it if reference drops to zero. + * + * Must be called with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +static inline void tgt_fmd_put_nolock(struct obd_export *exp, + struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + assert_spin_locked(&ted->ted_fmd_lock); + if (--fmd->fmd_refcount == 0) { + ted->ted_fmd_count--; + list_del(&fmd->fmd_list); + OBD_SLAB_FREE_PTR(fmd, tgt_fmd_kmem); + } +} + +/** + * Wrapper to drop FMD reference with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +void tgt_fmd_put(struct obd_export *exp, struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_put_nolock(exp, fmd); /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Expire FMD entries. 
+ * + * Expire entries from the FMD list if there are too many + * of them or they are too old. + * + * This function must be called with ted_fmd_lock held. + * + * The \a keep FMD is not to be expired in any case. This parameter is used + * by ofd_fmd_find_nolock() to prohibit a FMD that was just found from + * expiring. + * + * \param[in] exp OBD export + * \param[in] keep FMD to keep always + */ +static void tgt_fmd_expire_nolock(struct obd_export *exp, + struct tgt_fmd_data *keep) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + time64_t now = ktime_get_seconds(); + struct tgt_fmd_data *fmd, *tmp; + + list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { + if (fmd == keep) + break; + + if (now < fmd->fmd_expire && + ted->ted_fmd_count < lut->lut_fmd_max_num) + break; + + list_del_init(&fmd->fmd_list); + tgt_fmd_put_nolock(exp, fmd); /* list reference */ + } +} + +/** + * Expire FMD entries. + * + * This is a wrapper to call ofd_fmd_expire_nolock() with the required lock. + * + * \param[in] exp OBD export + */ +void tgt_fmd_expire(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_expire_nolock(exp, NULL); + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Find FMD by specified FID. + * + * Function finds FMD entry by FID in the tg_export_data::ted_fmd_list. + * + * Caller must hold tg_export_data::ted_fmd_lock and take FMD reference. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL is FMD is not found + */ +static struct tgt_fmd_data *tgt_fmd_find_nolock(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *found = NULL, *fmd; + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + time64_t now = ktime_get_seconds(); + + assert_spin_locked(&ted->ted_fmd_lock); + + list_for_each_entry_reverse(fmd, &ted->ted_fmd_list, fmd_list) { + if (lu_fid_eq(&fmd->fmd_fid, fid)) { + found = fmd; + list_move_tail(&fmd->fmd_list, &ted->ted_fmd_list); + fmd->fmd_expire = now + lut->lut_fmd_max_age; + break; + } + } + + tgt_fmd_expire_nolock(exp, found); + + return found; +} + +/** + * Find FMD by specified FID with locking. + * + * Wrapper to the ofd_fmd_find_nolock() with correct locks. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL indicates FMD is not found + */ +struct tgt_fmd_data *tgt_fmd_find(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd; + + spin_lock(&ted->ted_fmd_lock); + fmd = tgt_fmd_find_nolock(exp, fid); + if (fmd) + fmd->fmd_refcount++; /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); + + return fmd; +} + +/** + * Find FMD by FID or create a new one if none is found. + * + * It is possible for this function to return NULL under memory pressure, + * or if the passed FID is zero (which will only cause old entries to expire). + * Currently this is not fatal because any FMD state is transient and + * may also be freed when it gets sufficiently old. 
+ * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL indicates FMD is not found + */ +struct tgt_fmd_data *tgt_fmd_get(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *found = NULL, *fmd_new = NULL; + + OBD_SLAB_ALLOC_PTR(fmd_new, tgt_fmd_kmem); + + spin_lock(&ted->ted_fmd_lock); + found = tgt_fmd_find_nolock(exp, fid); + if (fmd_new) { + if (!found) { + list_add_tail(&fmd_new->fmd_list, &ted->ted_fmd_list); + fmd_new->fmd_fid = *fid; + fmd_new->fmd_refcount++; /* list reference */ + found = fmd_new; + ted->ted_fmd_count++; + } else { + OBD_SLAB_FREE_PTR(fmd_new, tgt_fmd_kmem); + } + } + if (found) { + found->fmd_refcount++; /* caller reference */ + found->fmd_expire = ktime_get_seconds() + + class_exp2tgt(exp)->lut_fmd_max_age; + } else { + LCONSOLE_WARN("%s: cannot allocate FMD for "DFID + ", timestamps may be out of sync\n", + exp->exp_obd->obd_name, PFID(fid)); + } + spin_unlock(&ted->ted_fmd_lock); + + return found; +} + +#ifdef DO_FMD_DROP +/** + * Drop FMD list reference so it will disappear when last reference is dropped + * to zero. + * + * This function is called from ofd_destroy() and may only affect + * the one client that is doing the unlink and at worst we have an stale entry + * referencing an object that should never be used again. + * + * NB: this function is used only if DO_FMD_DROP is defined. It is not + * currently defined, so FMD drop doesn't happen and FMD are dropped only + * when expired. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to drop + */ +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd = NULL; + + spin_lock(&ted->ted_fmd_lock); + fmd = tgt_fmd_find_nolock(exp, fid); + if (fmd) { + list_del_init(&fmd->fmd_list); + tgt_fmd_put_nolock(exp, fmd); + } + spin_unlock(&ted->ted_fmd_lock); +} +EXPORT_SYMBOL(tgt_fmd_drop); +#endif + +/** + * Remove all entries from FMD list. + * + * Cleanup function to free all FMD enries on the given export. + * + * \param[in] exp OBD export + */ +void tgt_fmd_cleanup(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd = NULL, *tmp; + + spin_lock(&ted->ted_fmd_lock); + list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { + list_del_init(&fmd->fmd_list); + if (fmd->fmd_refcount > 1) { + CDEBUG(D_INFO, + "fmd %p still referenced (refcount = %d)\n", + fmd, fmd->fmd_refcount); + } + tgt_fmd_put_nolock(exp, fmd); + } + spin_unlock(&ted->ted_fmd_lock); + LASSERT(list_empty(&exp->exp_target_data.ted_fmd_list)); +} + +/** + * Update FMD with the latest request XID. + * + * Save a new setattr/punch XID in FMD if exists. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * \param[in] xid request XID + */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) +{ + struct tgt_fmd_data *fmd; + + fmd = tgt_fmd_get(exp, fid); + if (fmd) { + if (fmd->fmd_mactime_xid < xid) + fmd->fmd_mactime_xid = xid; + tgt_fmd_put(exp, fmd); + } +} +EXPORT_SYMBOL(tgt_fmd_update); + +/** + * Chech that time can be updated by the request with given XID. 
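/*
 * Illustrative sketch (not part of this patch): the XID-ordering rule that
 * tgt_fmd_update()/tgt_fmd_check() implement above, reduced to a standalone
 * program. The struct and function names below are simplified stand-ins,
 * not the real Lustre types.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

struct fmd_demo {
	uint64_t mactime_xid;	/* highest setattr/punch XID seen so far */
};

/* a time attribute update is allowed only if its XID is newer than recorded */
static bool fmd_demo_check(const struct fmd_demo *fmd, uint64_t xid)
{
	return fmd->mactime_xid < xid;
}

static void fmd_demo_update(struct fmd_demo *fmd, uint64_t xid)
{
	if (fmd->mactime_xid < xid)
		fmd->mactime_xid = xid;
}

int main(void)
{
	struct fmd_demo fmd = { .mactime_xid = 0 };

	fmd_demo_update(&fmd, 42);	/* newer setattr applied */
	printf("replayed xid 17 allowed? %d\n", fmd_demo_check(&fmd, 17)); /* 0 */
	printf("new xid 43 allowed? %d\n", fmd_demo_check(&fmd, 43));      /* 1 */
	return 0;
}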
+ * + * Check FMD XID if exists to be less than supplied XID + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * \param[in] xid request XID + * + * \retval true if FMD has no greater XID, so time attr can be updated + */ +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) +{ + struct tgt_fmd_data *fmd; + bool can_update = true; + + fmd = tgt_fmd_find(exp, fid); + if (fmd) { + can_update = fmd->fmd_mactime_xid < xid; + tgt_fmd_put(exp, fmd); + } + + return can_update; +} +EXPORT_SYMBOL(tgt_fmd_check); + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c new file mode 100644 index 0000000000000..3c5eec062cb4e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c @@ -0,0 +1,1692 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/target/tgt_grant.c + * + * This file provides code related to grant space management on Lustre Targets + * (OSTs and MDTs). Grant is a mechanism used by client nodes to reserve disk + * space on a target for the data writeback cache. The Lustre client is thus + * assured that enough space will be available when flushing dirty pages + * asynchronously. Each client node is granted an initial amount of reserved + * space at connect time and gets additional space back from target in bulk + * write reply. + * + * We actually support three different cases: + * - The client supports the new grant parameters (i.e. OBD_CONNECT_GRANT_PARAM) + * which means that all grant overhead calculation happens on the client side. + * The server reports at connect time the backend filesystem block size, the + * maximum extent size as well as the extent insertion cost and it is then up + * to the osc layer to the track dirty extents and consume grant accordingly + * (see osc_cache.c). In each bulk write request, the client provides how much + * grant space was consumed for this RPC. + * - The client does not support OBD_CONNECT_GRANT_PARAM and always assumes a + * a backend file system block size of 4KB. We then have two cases: + * - If the block size is really 4KB, then the client can deal with grant + * allocation for partial block writes, but won't take extent insertion cost + * into account. For such clients, we inflate grant by 100% on the server + * side. It means that when 32MB of grant is hold by the client, 64MB of + * grant space is actually reserved on the server. All grant counters + * provided by such a client are inflated by 100%. 
+ * - The backend filesystem block size is bigger than 4KB, which isn't + * supported by the client. In this case, we emulate a 4KB block size and + * consume one block size on the server for each 4KB of grant returned to + * client. With a 128KB blocksize, it means that 32MB dirty pages of 4KB + * on the client will actually consume 1GB of grant on the server. + * All grant counters provided by such a client are inflated by the block + * size ratio. + * + * This file handles the core logic for: + * - grant allocation strategy + * - maintaining per-client as well as global grant space accounting + * - processing grant information packed in incoming requests + * - allocating server-side grant space for synchronous write RPCs which did not + * consume grant on the client side (OBD_BRW_FROM_GRANT flag not set). If not + * enough space is available, such RPCs fail with ENOSPC + * + * Author: Johann Lombardi + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include "tgt_internal.h" + +/* Clients typically hold 2x their max_rpcs_in_flight of grant space */ +#define TGT_GRANT_SHRINK_LIMIT(exp) (2ULL * 8 * exp_max_brw_size(exp)) + +/* Helpers to inflate/deflate grants for clients that do not support the grant + * parameters */ +static inline u64 tgt_grant_inflate(struct tg_grants_data *tgd, u64 val) +{ + if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + /* Client does not support such large block size, grant + * is thus inflated. We already significantly overestimate + * overhead, no need to add the extent tax in this case */ + return val << (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT); + return val; +} + +/* Companion of tgt_grant_inflate() */ +static inline u64 tgt_grant_deflate(struct tg_grants_data *tgd, u64 val) +{ + if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + return val >> (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT); + return val; +} + +/* Grant chunk is used as a unit for grant allocation. It should be inflated + * if the client does not support the grant paramaters. + * Check connection flag against \a data if not NULL. 
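/*
 * Illustrative sketch (not part of this patch): the inflate/deflate math used
 * above for clients that lack OBD_CONNECT_GRANT_PARAM, assuming the 4KB
 * compatibility block size (shift of 12). With a 128KB backend block size
 * (shift of 17), 32MB of client grant inflates to 1GB on the server, matching
 * the example given in the file header.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_COMPAT_BSIZE_SHIFT 12	/* 4KB, assumed compatibility size */

static uint64_t demo_inflate(unsigned int blockbits, uint64_t val)
{
	if (blockbits > DEMO_COMPAT_BSIZE_SHIFT)
		return val << (blockbits - DEMO_COMPAT_BSIZE_SHIFT);
	return val;
}

static uint64_t demo_deflate(unsigned int blockbits, uint64_t val)
{
	if (blockbits > DEMO_COMPAT_BSIZE_SHIFT)
		return val >> (blockbits - DEMO_COMPAT_BSIZE_SHIFT);
	return val;
}

int main(void)
{
	uint64_t client_grant = 32ULL << 20;	/* 32MB as seen by the client */
	uint64_t server_grant = demo_inflate(17, client_grant);

	printf("server side: %llu MB\n",
	       (unsigned long long)(server_grant >> 20));	/* 1024 */
	printf("back to client: %llu MB\n",
	       (unsigned long long)(demo_deflate(17, server_grant) >> 20));
	return 0;
}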
This is used during + * connection creation where exp->exp_connect_data isn't populated yet */ +static inline u64 tgt_grant_chunk(struct obd_export *exp, + struct lu_target *lut, + struct obd_connect_data *data) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 chunk = exp_max_brw_size(exp); + u64 tax; + + if (exp->exp_obd->obd_self_export == exp) + /* Grant enough space to handle a big precreate request */ + return OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2; + + if ((data == NULL && !(exp_grant_param_supp(exp))) || + (data != NULL && !OCD_HAS_FLAG(data, GRANT_PARAM))) + /* Try to grant enough space to send 2 full-size RPCs */ + return tgt_grant_inflate(tgd, chunk) << 1; + + /* Try to return enough to send two full-size RPCs + * = 2 * (BRW_size + #extents_in_BRW * grant_tax) */ + tax = 1ULL << tgd->tgd_blockbits; /* block size */ + tax *= lut->lut_dt_conf.ddp_max_extent_blks; /* max extent size */ + tax = (chunk + tax - 1) / tax; /* #extents in a RPC */ + tax *= lut->lut_dt_conf.ddp_extent_tax; /* extent tax for a RPC */ + chunk = (chunk + tax) * 2; /* we said two full RPCs */ + return chunk; +} + +static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty, + u64 *pending, u64 *granted, u64 maxsize) +{ + struct tg_export_data *ted = &exp->exp_target_data; + int level = D_CACHE; + + if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0) + level = D_ERROR; + CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + + if (ted->ted_grant + ted->ted_pending > maxsize) { + CERROR("%s: cli %s/%p ted_grant(%ld) + ted_pending(%ld)" + " > maxsize(%llu)\n", exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, exp, ted->ted_grant, + ted->ted_pending, maxsize); + return -EFAULT; + } + if (ted->ted_dirty > maxsize) { + CERROR("%s: cli %s/%p ted_dirty(%ld) > maxsize(%llu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, + exp, ted->ted_dirty, maxsize); + return -EFAULT; + } + *granted += ted->ted_grant + ted->ted_pending; + *pending += ted->ted_pending; + *dirty += ted->ted_dirty; + return 0; +} + +/** + * Perform extra sanity checks for grant accounting. + * + * This function scans the export list, sanity checks per-export grant counters + * and verifies accuracy of global grant accounting. If an inconsistency is + * found, a CERROR is printed with the function name \func that was passed as + * argument. LBUG is only called in case of serious counter corruption (i.e. + * value larger than the device size). + * Those sanity checks can be pretty expensive and are disabled if the OBD + * device has more than 100 connected exports. + * + * \param[in] obd OBD device for which grant accounting should be + * verified + * \param[in] func caller's function name + */ +void tgt_grant_sanity_check(struct obd_device *obd, const char *func) +{ + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_export *exp; + struct tg_export_data *ted; + u64 maxsize; + u64 tot_dirty = 0; + u64 tot_pending = 0; + u64 tot_granted = 0; + u64 fo_tot_granted; + u64 fo_tot_pending; + u64 fo_tot_dirty; + int error; + + if (list_empty(&obd->obd_exports)) + return; + + /* We don't want to do this for large machines that do lots of + * mounts or unmounts. It burns... 
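/*
 * Illustrative sketch (not part of this patch): the "two full-size RPCs"
 * grant chunk computed by tgt_grant_chunk() for a GRANT_PARAM-aware client.
 * All input values below are made-up examples, not real device parameters.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t brw_size = 4ULL << 20;		/* 4MB max BRW size */
	unsigned int blockbits = 12;		/* 4KB blocks */
	uint64_t max_extent_blks = 32768;	/* blocks per extent (example) */
	uint64_t extent_tax = 24ULL << 10;	/* per-extent insertion cost */

	uint64_t max_ext = (1ULL << blockbits) * max_extent_blks;
	uint64_t nr_ext = (brw_size + max_ext - 1) / max_ext; /* extents/RPC */
	uint64_t chunk = (brw_size + nr_ext * extent_tax) * 2;

	printf("extents per RPC: %llu, chunk: %llu bytes\n",
	       (unsigned long long)nr_ext, (unsigned long long)chunk);
	return 0;
}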
*/ + if (obd->obd_num_exports > 100) + return; + + maxsize = tgd->tgd_osfs.os_blocks << tgd->tgd_blockbits; + + spin_lock(&obd->obd_dev_lock); + spin_lock(&tgd->tgd_grant_lock); + exp = obd->obd_self_export; + ted = &exp->exp_target_data; + CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " + "%ld\n", obd->obd_name, ted->ted_grant, + ted->ted_pending, ted->ted_dirty); + tot_granted += ted->ted_grant + ted->ted_pending; + tot_pending += ted->ted_pending; + tot_dirty += ted->ted_dirty; + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, + &tot_granted, maxsize); + if (error < 0) { + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + /* exports about to be unlinked should also be taken into account since + * they might still hold pending grant space to be released at + * commit time */ + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) { + error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, + &tot_granted, maxsize); + if (error < 0) { + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + fo_tot_granted = tgd->tgd_tot_granted; + fo_tot_pending = tgd->tgd_tot_pending; + fo_tot_dirty = tgd->tgd_tot_dirty; + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + + if (tot_granted != fo_tot_granted) + CERROR("%s: tot_granted %llu != fo_tot_granted %llu\n", + func, tot_granted, fo_tot_granted); + if (tot_pending != fo_tot_pending) + CERROR("%s: tot_pending %llu != fo_tot_pending %llu\n", + func, tot_pending, fo_tot_pending); + if (tot_dirty != fo_tot_dirty) + CERROR("%s: tot_dirty %llu != fo_tot_dirty %llu\n", + func, tot_dirty, fo_tot_dirty); + if (tot_pending > tot_granted) + CERROR("%s: tot_pending %llu > tot_granted %llu\n", + func, tot_pending, tot_granted); + if (tot_granted > maxsize) + CERROR("%s: tot_granted %llu > maxsize %llu\n", + func, tot_granted, maxsize); + if (tot_dirty > maxsize) + CERROR("%s: tot_dirty %llu > maxsize %llu\n", + func, tot_dirty, maxsize); +} +EXPORT_SYMBOL(tgt_grant_sanity_check); + +/** + * Get file system statistics of target. + * + * Helper function for statfs(), also used by grant code. + * Implements caching for statistics to avoid calling OSD device each time. + * + * \param[in] env execution environment + * \param[in] lut LU target + * \param[out] osfs statistic data to return + * \param[in] max_age maximum age for cached data + * \param[in] from_cache show that data was get from cache or not + * + * \retval 0 if successful + * \retval negative value on error + */ +int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, + struct obd_statfs *osfs, time64_t max_age, int *from_cache) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + int rc = 0; + ENTRY; + + spin_lock(&tgd->tgd_osfs_lock); + if (tgd->tgd_osfs_age < max_age || max_age == 0) { + u64 unstable; + + /* statfs data are too old, get up-to-date one. + * we must be cautious here since multiple threads might be + * willing to update statfs data concurrently and we must + * grant that cached statfs data are always consistent */ + + if (tgd->tgd_statfs_inflight == 0) + /* clear inflight counter if no users, although it would + * take a while to overflow this 64-bit counter ... 
*/ + tgd->tgd_osfs_inflight = 0; + /* notify tgt_grant_commit() that we want to track writes + * completed as of now */ + tgd->tgd_statfs_inflight++; + /* record value of inflight counter before running statfs to + * compute the diff once statfs is completed */ + unstable = tgd->tgd_osfs_inflight; + spin_unlock(&tgd->tgd_osfs_lock); + + /* statfs can sleep ... hopefully not for too long since we can + * call it fairly often as space fills up */ + rc = dt_statfs(env, lut->lut_bottom, osfs); + if (unlikely(rc)) + GOTO(out, rc); + + osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX); + + spin_lock(&tgd->tgd_grant_lock); + spin_lock(&tgd->tgd_osfs_lock); + /* calculate how much space was written while we released the + * tgd_osfs_lock */ + unstable = tgd->tgd_osfs_inflight - unstable; + tgd->tgd_osfs_unstable = 0; + if (unstable) { + /* some writes committed while we were running statfs + * w/o the tgd_osfs_lock. Those ones got added to + * the cached statfs data that we are about to crunch. + * Take them into account in the new statfs data */ + osfs->os_bavail -= min_t(u64, osfs->os_bavail, + unstable >> tgd->tgd_blockbits); + /* However, we don't really know if those writes got + * accounted in the statfs call, so tell + * tgt_grant_space_left() there is some uncertainty + * on the accounting of those writes. + * The purpose is to prevent spurious error messages in + * tgt_grant_space_left() since those writes might be + * accounted twice. */ + tgd->tgd_osfs_unstable += unstable; + } + /* similarly, there is some uncertainty on write requests + * between prepare & commit */ + tgd->tgd_osfs_unstable += tgd->tgd_tot_pending; + spin_unlock(&tgd->tgd_grant_lock); + + /* finally udpate cached statfs data */ + tgd->tgd_osfs = *osfs; + tgd->tgd_osfs_age = ktime_get_seconds(); + + tgd->tgd_statfs_inflight--; /* stop tracking */ + if (tgd->tgd_statfs_inflight == 0) + tgd->tgd_osfs_inflight = 0; + spin_unlock(&tgd->tgd_osfs_lock); + + if (from_cache) + *from_cache = 0; + } else { + /* use cached statfs data */ + *osfs = tgd->tgd_osfs; + spin_unlock(&tgd->tgd_osfs_lock); + if (from_cache) + *from_cache = 1; + } + GOTO(out, rc); + +out: + return rc; +} +EXPORT_SYMBOL(tgt_statfs_internal); + +/** + * Update cached statfs information from the OSD layer + * + * Refresh statfs information cached in tgd::tgd_osfs if the cache is older + * than 1s or if force is set. The OSD layer is in charge of estimating data & + * metadata overhead. + * This function can sleep so it should not be called with any spinlock held. 
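/*
 * Illustrative sketch (not part of this patch): the cache-or-refresh decision
 * made by tgt_statfs_internal(). The real code also tracks writes committing
 * while statfs runs (tgd_osfs_inflight/tgd_osfs_unstable); that part is
 * omitted here. Types and names are simplified stand-ins.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

struct statfs_cache {
	uint64_t bavail;	/* cached free blocks */
	time_t age;		/* when the cache was last filled */
};

/* stub standing in for the expensive dt_statfs() call into the OSD */
static uint64_t slow_statfs(void)
{
	return 123456;
}

static uint64_t cached_statfs(struct statfs_cache *c, time_t max_age,
			      int *from_cache)
{
	if (c->age < max_age || max_age == 0) {	/* too old, or forced refresh */
		c->bavail = slow_statfs();
		c->age = time(NULL);
		*from_cache = 0;
	} else {
		*from_cache = 1;
	}
	return c->bavail;
}

int main(void)
{
	struct statfs_cache c = { 0, 0 };
	int from_cache;

	cached_statfs(&c, time(NULL) - 1, &from_cache);	/* miss: refresh */
	printf("first call from cache: %d\n", from_cache);
	cached_statfs(&c, time(NULL) - 1, &from_cache);	/* hit: cached data */
	printf("second call from cache: %d\n", from_cache);
	return 0;
}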
+ * + * \param[in] env LU environment passed by the caller + * \param[in] exp export used to print client info in debug + * messages + * \param[in] force force a refresh of statfs information + * \param[out] from_cache returns whether the statfs information are + * taken from cache + */ +static void tgt_grant_statfs(const struct lu_env *env, struct obd_export *exp, + int force, int *from_cache) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tgt_thread_info *tti; + struct obd_statfs *osfs; + time64_t max_age; + int rc; + + if (force) + max_age = 0; /* get fresh statfs data */ + else + max_age = ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS; + + tti = tgt_th_info(env); + osfs = &tti->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, max_age, from_cache); + if (unlikely(rc)) { + if (from_cache) + *from_cache = 0; + return; + } + + CDEBUG(D_CACHE, "%s: cli %s/%p free: %llu avail: %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + osfs->os_bfree << tgd->tgd_blockbits, + osfs->os_bavail << tgd->tgd_blockbits); +} + +/** + * Figure out how much space is available on the backend filesystem after + * removing grant space already booked by clients. + * + * This is done by accessing cached statfs data previously populated by + * tgt_grant_statfs(), from which we withdraw the space already granted to + * clients and the reserved space. + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] exp export associated with the device for which the amount + * of available space is requested + * \retval amount of non-allocated space, in bytes + */ +static u64 tgt_grant_space_left(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 tot_granted; + u64 left; + u64 avail; + u64 unstable; + u64 reserved; + + ENTRY; + assert_spin_locked(&tgd->tgd_grant_lock); + + spin_lock(&tgd->tgd_osfs_lock); + /* get available space from cached statfs data */ + left = tgd->tgd_osfs.os_bavail << tgd->tgd_blockbits; + unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */ + spin_unlock(&tgd->tgd_osfs_lock); + + reserved = left * tgd->tgd_reserved_pcnt / 100; + tot_granted = tgd->tgd_tot_granted + reserved; + + if (left < tot_granted) { + int mask = (left + unstable < + tot_granted - tgd->tgd_tot_pending) ? + D_ERROR : D_CACHE; + + CDEBUG_LIMIT(mask, "%s: cli %s/%p left %llu < tot_grant " + "%llu unstable %llu pending %llu " + "dirty %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + left, tot_granted, unstable, + tgd->tgd_tot_pending, + tgd->tgd_tot_dirty); + RETURN(0); + } + + avail = left; + /* Withdraw space already granted to clients */ + left -= tot_granted; + + /* Align left on block size */ + left &= ~((1ULL << tgd->tgd_blockbits) - 1); + + CDEBUG(D_CACHE, "%s: cli %s/%p avail %llu left %llu unstable " + "%llu tot_grant %llu pending %llu\n", obd->obd_name, + exp->exp_client_uuid.uuid, exp, avail, left, unstable, + tot_granted, tgd->tgd_tot_pending); + + RETURN(left); +} + +/** + * Process grant information from obdo structure packed in incoming BRW + * and inflate grant counters if required. + * + * Grab the dirty and seen grant announcements from the incoming obdo and + * inflate all grant counters passed in the request if the client does not + * support the grant parameters. + * We will later calculate the client's new grant and return it. 
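/*
 * Illustrative sketch (not part of this patch): the arithmetic performed by
 * tgt_grant_space_left() above, with made-up numbers. It subtracts already
 * granted and reserved space from the cached free space and rounds the
 * result down to a block boundary.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int blockbits = 12;		/* 4KB blocks */
	uint64_t bavail = 1ULL << 20;		/* 1M free blocks = 4GB */
	uint64_t tot_granted = 1ULL << 30;	/* 1GB already granted */
	unsigned int reserved_pcnt = 2;		/* 2% reserved */

	uint64_t left = bavail << blockbits;
	uint64_t reserved = left * reserved_pcnt / 100;
	uint64_t booked = tot_granted + reserved;

	if (left < booked) {
		left = 0;				/* nothing to hand out */
	} else {
		left -= booked;
		left &= ~((1ULL << blockbits) - 1);	/* block alignment */
	}
	printf("ungranted space: %llu MB\n", (unsigned long long)(left >> 20));
	return 0;
}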
+ * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] env LU environment supplying osfs storage + * \param[in] exp export for which we received the request + * \param[in,out] oa incoming obdo sent by the client + */ +static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, long chunk) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + long long dirty, dropped; + ENTRY; + + assert_spin_locked(&tgd->tgd_grant_lock); + + if ((oa->o_valid & (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) != + (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) { + oa->o_valid &= ~OBD_MD_FLGRANT; + RETURN_EXIT; + } + + /* Add some margin, since there is a small race if other RPCs arrive + * out-or-order and have already consumed some grant. We want to + * leave this here in case there is a large error in accounting. */ + CDEBUG(D_CACHE, + "%s: cli %s/%p reports grant %llu dropped %u, local %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, oa->o_grant, + oa->o_dropped, ted->ted_grant); + + if ((long long)oa->o_dirty < 0) + oa->o_dirty = 0; + + /* inflate grant counters if required */ + if (!exp_grant_param_supp(exp)) { + u64 tmp; + oa->o_grant = tgt_grant_inflate(tgd, oa->o_grant); + oa->o_dirty = tgt_grant_inflate(tgd, oa->o_dirty); + /* inflation can bump client's wish to >4GB which doesn't fit + * 32bit o_undirty, limit that .. */ + tmp = tgt_grant_inflate(tgd, oa->o_undirty); + if (tmp >= OBD_MAX_GRANT) + tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); + oa->o_undirty = tmp; + tmp = tgt_grant_inflate(tgd, oa->o_dropped); + if (tmp >= OBD_MAX_GRANT) + tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); + oa->o_dropped = tmp; + } + + dirty = oa->o_dirty; + dropped = oa->o_dropped; + + /* Update our accounting now so that statfs takes it into account. + * Note that ted_dirty is only approximate and can become incorrect + * if RPCs arrive out-of-order. No important calculations depend + * on ted_dirty however, but we must check sanity to not assert. */ + if (dirty > ted->ted_grant + 4 * chunk) + dirty = ted->ted_grant + 4 * chunk; + tgd->tgd_tot_dirty += dirty - ted->ted_dirty; + if (ted->ted_grant < dropped) { + CDEBUG(D_CACHE, + "%s: cli %s/%p reports %llu dropped > grant %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, + ted->ted_grant); + dropped = 0; + } + if (tgd->tgd_tot_granted < dropped) { + CERROR("%s: cli %s/%p reports %llu dropped > tot_grant %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + dropped, tgd->tgd_tot_granted); + dropped = 0; + } + tgd->tgd_tot_granted -= dropped; + ted->ted_grant -= dropped; + ted->ted_dirty = dirty; + + if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) { + CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + EXIT; +} + +/** + * Grant shrink request handler. + * + * Client nodes can explicitly release grant space (i.e. process called grant + * shrinking). This function proceeds with the shrink request when there is + * less ungranted space remaining than the amount all of the connected clients + * would consume if they used their full grant. + * Caller must hold tgd_grant_lock spinlock. 
+ * + * \param[in] exp export releasing grant space + * \param[in,out] oa incoming obdo sent by the client + * \param[in] left_space remaining free space with space already granted + * taken out + */ +static void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa, + u64 left_space) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + long grant_shrink; + + assert_spin_locked(&tgd->tgd_grant_lock); + LASSERT(exp); + if (left_space >= tgd->tgd_tot_granted_clients * + TGT_GRANT_SHRINK_LIMIT(exp)) + return; + + grant_shrink = oa->o_grant; + + if (ted->ted_grant < grant_shrink) { + CDEBUG(D_CACHE, + "%s: cli %s/%p wants %lu shrinked > grant %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + grant_shrink, ted->ted_grant); + grant_shrink = ted->ted_grant; + } + + ted->ted_grant -= grant_shrink; + tgd->tgd_tot_granted -= grant_shrink; + + CDEBUG(D_CACHE, "%s: cli %s/%p shrink %ld ted_grant %ld total %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, grant_shrink, + ted->ted_grant, tgd->tgd_tot_granted); + + /* client has just released some grant, don't grant any space back */ + oa->o_grant = 0; +} + +/** + * Calculate how much space is required to write a given network buffer + * + * This function takes block alignment into account to estimate how much on-disk + * space will be required to successfully write the whole niobuf. + * Estimated space is inflated if the export does not support + * OBD_CONNECT_GRANT_PARAM and if the backend filesystem has a block size + * larger than the minimal supported page size (i.e. 4KB). + * + * \param[in] exp export associated which the write request + * if NULL, then size estimate is done for server-side + * grant allocation. + * \param[in] lut LU target handling the request + * \param[in] rnb network buffer to estimate size of + * + * \retval space (in bytes) that will be consumed to write the + * network buffer + */ +static inline u64 tgt_grant_rnb_size(struct obd_export *exp, + struct lu_target *lut, + struct niobuf_remote *rnb) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 blksize; + u64 bytes; + u64 end; + + if (exp && !exp_grant_param_supp(exp) && + tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + blksize = 1ULL << COMPAT_BSIZE_SHIFT; + else + blksize = 1ULL << tgd->tgd_blockbits; + + /* The network buffer might span several blocks, align it on block + * boundaries */ + bytes = rnb->rnb_offset & (blksize - 1); + bytes += rnb->rnb_len; + end = bytes & (blksize - 1); + if (end) + bytes += blksize - end; + + if (exp == NULL || exp_grant_param_supp(exp)) { + /* add per-extent insertion cost */ + u64 max_ext; + int nr_ext; + + max_ext = blksize * lut->lut_dt_conf.ddp_max_extent_blks; + nr_ext = (bytes + max_ext - 1) / max_ext; + bytes += nr_ext * lut->lut_dt_conf.ddp_extent_tax; + } else { + /* Inflate grant space if client does not support extent-based + * grant allocation */ + bytes = tgt_grant_inflate(tgd, (u64)bytes); + } + + return bytes; +} + +/** + * Validate grant accounting for each incoming remote network buffer. + * + * When clients have dirtied as much space as they've been granted they + * fall through to sync writes. These sync writes haven't been expressed + * in grants and need to error with ENOSPC when there isn't room in the + * filesystem for them after grants are taken into account. However, + * writeback of the dirty data that was already granted space can write + * right on through. 
+ * The OBD_BRW_GRANTED flag will be set in the rnb_flags of each network + * buffer which has been granted enough space to proceed. Buffers without + * this flag will fail to be written with -ENOSPC (see tgt_preprw_write(). + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] env LU environment passed by the caller + * \param[in] exp export identifying the client which sent the RPC + * \param[in] oa incoming obdo in which we should return the pack the + * additional grant + * \param[in,out] rnb the list of network buffers + * \param[in] niocount the number of network buffers in the list + * \param[in] left the remaining free space with space already granted + * taken out + */ +static void tgt_grant_check(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct niobuf_remote *rnb, + int niocount, u64 *left) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + unsigned long ungranted = 0; + unsigned long granted = 0; + int i; + bool skip = false; + + ENTRY; + + assert_spin_locked(&tgd->tgd_grant_lock); + + if (obd->obd_recovering) { + /* Replaying write. Grant info have been processed already so no + * need to do any enforcement here. It is worth noting that only + * bulk writes with all rnbs having OBD_BRW_FROM_GRANT can be + * replayed. If one page hasn't OBD_BRW_FROM_GRANT set, then + * the whole bulk is written synchronously */ + skip = true; + CDEBUG(D_CACHE, "Replaying write, skipping accounting\n"); + } else if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_RECOV_RESEND)) { + /* Recoverable resend, grant info have already been processed as + * well */ + skip = true; + CDEBUG(D_CACHE, "Recoverable resend arrived, skipping " + "accounting\n"); + } else if (exp_grant_param_supp(exp) && oa->o_grant_used > 0) { + /* Client supports the new grant parameters and is telling us + * how much grant space it consumed for this bulk write. + * Although all rnbs are supposed to have the OBD_BRW_FROM_GRANT + * flag set, we will scan the rnb list and looks for non-cache + * I/O in case it changes in the future */ + if (ted->ted_grant >= oa->o_grant_used) { + /* skip grant accounting for rnbs with + * OBD_BRW_FROM_GRANT and just used grant consumption + * claimed in the request */ + granted = oa->o_grant_used; + skip = true; + } else { + /* client has used more grants for this request that + * it owns ... */ + CERROR("%s: cli %s claims %lu GRANT, real grant %lu\n", + exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, + (unsigned long)oa->o_grant_used, ted->ted_grant); + + /* check whether we can fill the gap with unallocated + * grant */ + if (*left > (oa->o_grant_used - ted->ted_grant)) { + /* ouf .. we are safe for now */ + granted = ted->ted_grant; + ungranted = oa->o_grant_used - granted; + *left -= ungranted; + skip = true; + } + /* too bad, but we cannot afford to blow up our grant + * accounting. The loop below will handle each rnb in + * case by case. 
*/ + } + } + + for (i = 0; i < niocount; i++) { + int bytes; + + if ((rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) { + if (skip) { + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + /* compute how much grant space is actually needed for + * this rnb, inflate grant if required */ + bytes = tgt_grant_rnb_size(exp, lut, &rnb[i]); + if (ted->ted_grant >= granted + bytes) { + granted += bytes; + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + CDEBUG(D_CACHE, "%s: cli %s/%p claims %ld+%d GRANT, " + "real grant %lu idx %d\n", obd->obd_name, + exp->exp_client_uuid.uuid, exp, granted, bytes, + ted->ted_grant, i); + } + + if (obd->obd_recovering) + CERROR("%s: cli %s is replaying OST_WRITE while one rnb" + " hasn't OBD_BRW_FROM_GRANT set (0x%x)\n", + obd->obd_name, exp->exp_client_uuid.uuid, + rnb[i].rnb_flags); + + /* Consume grant space on the server. + * Unlike above, tgt_grant_rnb_size() is called with exp = NULL + * so that the required grant space isn't inflated. This is + * done on purpose since the server can deal with large block + * size, unlike some clients */ + bytes = tgt_grant_rnb_size(NULL, lut, &rnb[i]); + if (*left > bytes) { + /* if enough space, pretend it was granted */ + ungranted += bytes; + *left -= bytes; + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + /* We can't check for already-mapped blocks here (make sense + * when backend filesystem does not use COW) as it requires + * dropping the grant lock. + * Instead, we clear OBD_BRW_GRANTED and in that case we need + * to go through and verify if all of the blocks not marked + * BRW_GRANTED are already mapped and we can ignore this error. + */ + rnb[i].rnb_flags &= ~OBD_BRW_GRANTED; + CDEBUG(D_CACHE, "%s: cli %s/%p idx %d no space for %d\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, i, bytes); + } + + /* record in o_grant_used the actual space reserved for the I/O, will be + * used later in tgt_grant_commmit() */ + oa->o_grant_used = granted + ungranted; + + /* record space used for the I/O, will be used in tgt_grant_commmit() */ + /* Now substract what the clients has used already. We don't subtract + * this from the tot_granted yet, so that other client's can't grab + * that space before we have actually allocated our blocks. That + * happens in tgt_grant_commit() after the writes are done. */ + ted->ted_grant -= granted; + ted->ted_pending += oa->o_grant_used; + tgd->tgd_tot_granted += ungranted; + tgd->tgd_tot_pending += oa->o_grant_used; + + CDEBUG(D_CACHE, + "%s: cli %s/%p granted: %lu ungranted: %lu grant: %lu dirty: %lu" + "\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, + granted, ungranted, ted->ted_grant, ted->ted_dirty); + + if (obd->obd_recovering || (oa->o_valid & OBD_MD_FLGRANT) == 0) + /* don't update dirty accounting during recovery or + * if grant information got discarded (e.g. 
during resend) */ + RETURN_EXIT; + + if (ted->ted_dirty < granted) { + CWARN("%s: cli %s/%p claims granted %lu > ted_dirty %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + granted, ted->ted_dirty); + granted = ted->ted_dirty; + } + tgd->tgd_tot_dirty -= granted; + ted->ted_dirty -= granted; + + if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) { + CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + EXIT; +} + +/** + * Allocate additional grant space to a client + * + * Calculate how much grant space to return to client, based on how much space + * is currently free and how much of that is already granted. + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] exp export of the client which sent the request + * \param[in] curgrant current grant claimed by the client + * \param[in] want how much grant space the client would like to + * have + * \param[in] left remaining free space with granted space taken + * out + * \param[in] chunk grant allocation unit + * \param[in] conservative if set to true, the server should be cautious + * and limit how much space is granted back to the + * client. Otherwise, the server should try hard to + * satisfy the client request. + * + * \retval amount of grant space allocated + */ +static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, + u64 want, u64 left, long chunk, + bool conservative) +{ + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 grant; + + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_NO_GRANT)) + RETURN(0); + + /* When tgd_grant_compat_disable is set, we don't grant any space to + * clients not supporting OBD_CONNECT_GRANT_PARAM. + * Otherwise, space granted to such a client is inflated since it + * consumes PAGE_SIZE of grant space per block */ + if ((obd->obd_self_export != exp && !exp_grant_param_supp(exp) && + tgd->tgd_grant_compat_disable) || left == 0 || exp->exp_failed) + RETURN(0); + + if (want > OBD_MAX_GRANT) { + CERROR("%s: client %s/%p requesting > max (%lu), %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + OBD_MAX_GRANT, want); + RETURN(0); + } + + /* Grant some fraction of the client's requested grant space so that + * they are not always waiting for write credits (not all of it to + * avoid overgranting in face of multiple RPCs in flight). This + * essentially will be able to control the OSC_MAX_RIF for a client. + * + * If we do have a large disparity between what the client thinks it + * has and what we think it has, don't grant very much and let the + * client consume its grant first. Either it just has lots of RPCs + * in flight, or it was evicted and its grants will soon be used up. 
*/ + if (curgrant >= want || curgrant >= ted->ted_grant + chunk) + RETURN(0); + + if (obd->obd_recovering) + conservative = false; + + if (conservative) + /* don't grant more than 1/8th of the remaining free space in + * one chunk */ + left >>= 3; + grant = min(want - curgrant, left); + /* round grant up to the next block size */ + grant = (grant + (1 << tgd->tgd_blockbits) - 1) & + ~((1ULL << tgd->tgd_blockbits) - 1); + + if (!grant) + RETURN(0); + + /* Limit to grant_chunk if not reconnect/recovery */ + if ((grant > chunk) && conservative) + grant = chunk; + + /* + * Limit grant so that export' grant does not exceed what the + * client would like to have by more than grants for 2 full + * RPCs + */ + if (want + chunk <= ted->ted_grant) + RETURN(0); + if (ted->ted_grant + grant > want + chunk) + grant = want + chunk - ted->ted_grant; + + tgd->tgd_tot_granted += grant; + ted->ted_grant += grant; + + if (unlikely(ted->ted_grant < 0 || ted->ted_grant > want + chunk)) { + CERROR("%s: cli %s/%p grant %ld want %llu current %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_grant, want, curgrant); + spin_unlock(&tgd->tgd_grant_lock); + } + + CDEBUG(D_CACHE, + "%s: cli %s/%p wants: %llu current grant %llu" + " granting: %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, + exp, want, curgrant, grant); + CDEBUG(D_CACHE, + "%s: cli %s/%p tot cached:%llu granted:%llu" + " num_exports: %d\n", obd->obd_name, exp->exp_client_uuid.uuid, + exp, tgd->tgd_tot_dirty, tgd->tgd_tot_granted, + obd->obd_num_exports); + + RETURN(grant); +} + +/** + * Handle grant space allocation on client connection & reconnection. + * + * A new non-readonly connection gets an initial grant allocation equals to + * tgt_grant_chunk() (i.e. twice the max BRW size in most of the cases). + * On reconnection, grant counters between client & target are resynchronized + * and additional space might be granted back if possible. 
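/*
 * Illustrative sketch (not part of this patch): the main clamping steps
 * applied by tgt_grant_alloc() when handing more grant to a client, with
 * example values. In conservative mode only 1/8th of the free space is
 * eligible, the grant is rounded up to a block and capped at one chunk; the
 * additional want + chunk cap from the real code is left out for brevity.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	unsigned int blockbits = 12;
	uint64_t cur = 8ULL << 20;	/* client already holds 8MB */
	uint64_t want = 64ULL << 20;	/* client asks for 64MB */
	uint64_t left = 256ULL << 20;	/* 256MB not granted yet */
	uint64_t chunk = 16ULL << 20;	/* grant allocation unit */
	int conservative = 1;

	if (conservative)
		left >>= 3;				/* use at most 1/8th */
	uint64_t grant = min_u64(want - cur, left);
	grant = (grant + (1ULL << blockbits) - 1) &	/* round up to block */
		~((1ULL << blockbits) - 1);
	if (conservative && grant > chunk)
		grant = chunk;				/* one chunk at a time */

	printf("granted this time: %llu MB\n",
	       (unsigned long long)(grant >> 20));
	return 0;
}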
+ * + * \param[in] env LU environment provided by the caller + * \param[in] exp client's export which is (re)connecting + * \param[in,out] data obd_connect_data structure sent by the client in the + * connect request + * \param[in] new_conn must set to true if this is a new connection and false + * for a reconnection + */ +void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn) +{ + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 left = 0; + u64 want; + long chunk; + int from_cache; + int force = 0; /* can use cached data */ + + /* don't grant space to client with read-only access */ + if (OCD_HAS_FLAG(data, RDONLY) || + (!OCD_HAS_FLAG(data, GRANT_PARAM) && + tgd->tgd_grant_compat_disable)) { + data->ocd_grant = 0; + data->ocd_connect_flags &= ~(OBD_CONNECT_GRANT | + OBD_CONNECT_GRANT_PARAM); + RETURN_EXIT; + } + + if (OCD_HAS_FLAG(data, GRANT_PARAM)) + want = data->ocd_grant; + else + want = tgt_grant_inflate(tgd, data->ocd_grant); + chunk = tgt_grant_chunk(exp, lut, data); +refresh: + tgt_grant_statfs(env, exp, force, &from_cache); + + spin_lock(&tgd->tgd_grant_lock); + + /* Grab free space from cached info and take out space already granted + * to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* get fresh statfs data if we are short in ungranted space */ + if (from_cache && left < 32 * chunk) { + spin_unlock(&tgd->tgd_grant_lock); + CDEBUG(D_CACHE, "fs has no space left and statfs too old\n"); + force = 1; + goto refresh; + } + + tgt_grant_alloc(exp, (u64)ted->ted_grant, want, left, chunk, new_conn); + + /* return to client its current grant */ + if (OCD_HAS_FLAG(data, GRANT_PARAM)) + data->ocd_grant = ted->ted_grant; + else + /* deflate grant */ + data->ocd_grant = tgt_grant_deflate(tgd, (u64)ted->ted_grant); + + /* reset dirty accounting */ + tgd->tgd_tot_dirty -= ted->ted_dirty; + ted->ted_dirty = 0; + + if (new_conn && OCD_HAS_FLAG(data, GRANT)) + tgd->tgd_tot_granted_clients++; + + spin_unlock(&tgd->tgd_grant_lock); + + CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: %llu left: %llu\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, + exp, data->ocd_grant, want, left); + + EXIT; +} +EXPORT_SYMBOL(tgt_grant_connect); + +/** + * Release all grant space attached to a given export. + * + * Remove a client from the grant accounting totals. We also remove + * the export from the obd device under the osfs and dev locks to ensure + * that the tgt_grant_sanity_check() calculations are always valid. + * The client should do something similar when it invalidates its import. 
+ * + * \param[in] exp client's export to remove from grant accounting + */ +void tgt_grant_discard(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_export_data *ted = &exp->exp_target_data; + struct tg_grants_data *tgd; + + if (!lut) + return; + + tgd = &lut->lut_tgd; + spin_lock(&tgd->tgd_grant_lock); + if (unlikely(tgd->tgd_tot_granted < ted->ted_grant || + tgd->tgd_tot_dirty < ted->ted_dirty)) { + struct obd_export *e; + u64 ttg = 0; + u64 ttd = 0; + + list_for_each_entry(e, &obd->obd_exports, exp_obd_chain) { + LASSERT(exp != e); + ttg += e->exp_target_data.ted_grant; + ttg += e->exp_target_data.ted_pending; + ttd += e->exp_target_data.ted_dirty; + } + if (tgd->tgd_tot_granted < ted->ted_grant) + CERROR("%s: cli %s/%p: tot_granted %llu < ted_grant %ld, corrected to %llu", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_granted, ted->ted_grant, ttg); + if (tgd->tgd_tot_dirty < ted->ted_dirty) + CERROR("%s: cli %s/%p: tot_dirty %llu < ted_dirty %ld, corrected to %llu", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_dirty, ted->ted_dirty, ttd); + tgd->tgd_tot_granted = ttg; + tgd->tgd_tot_dirty = ttd; + } else { + tgd->tgd_tot_granted -= ted->ted_grant; + tgd->tgd_tot_dirty -= ted->ted_dirty; + } + ted->ted_grant = 0; + ted->ted_dirty = 0; + + if (tgd->tgd_tot_pending < ted->ted_pending) { + CERROR("%s: tot_pending %llu < cli %s/%p ted_pending %ld\n", + obd->obd_name, tgd->tgd_tot_pending, + exp->exp_client_uuid.uuid, exp, ted->ted_pending); + } + /* tgd_tot_pending is handled in tgt_grant_commit as bulk + * commmits */ + spin_unlock(&tgd->tgd_grant_lock); +} +EXPORT_SYMBOL(tgt_grant_discard); + +/** + * Process grant information from incoming bulk read request. + * + * Extract grant information packed in obdo structure (OBD_MD_FLGRANT set in + * o_valid). Bulk reads usually comes with grant announcements (number of dirty + * blocks, remaining amount of grant space, ...) and could also include a grant + * shrink request. Unlike bulk write, no additional grant space is returned on + * bulk read request. + * + * \param[in] env is the lu environment provided by the caller + * \param[in] exp is the export of the client which sent the request + * \param[in,out] oa is the incoming obdo sent by the client + */ +void tgt_grant_prepare_read(const struct lu_env *env, + struct obd_export *exp, struct obdo *oa) +{ + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + int do_shrink; + u64 left = 0; + + ENTRY; + + if (!oa) + RETURN_EXIT; + + if ((oa->o_valid & OBD_MD_FLGRANT) == 0) + /* The read request does not contain any grant + * information */ + RETURN_EXIT; + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_SHRINK_GRANT)) { + /* To process grant shrink request, we need to know how much + * available space remains on the backend filesystem. + * Shrink requests are not so common, we always get fresh + * statfs information. 
*/ + tgt_grant_statfs(env, exp, 1, NULL); + + /* protect all grant counters */ + spin_lock(&tgd->tgd_grant_lock); + + /* Grab free space from cached statfs data and take out space + * already granted to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* all set now to proceed with shrinking */ + do_shrink = 1; + } else { + /* no grant shrinking request packed in the obdo and + * since we don't grant space back on reads, no point + * in running statfs, so just skip it and process + * incoming grant data directly. */ + spin_lock(&tgd->tgd_grant_lock); + do_shrink = 0; + } + + /* extract incoming grant information provided by the client and + * inflate grant counters if required */ + tgt_grant_incoming(env, exp, oa, tgt_grant_chunk(exp, lut, NULL)); + + /* unlike writes, we don't return grants back on reads unless a grant + * shrink request was packed and we decided to turn it down. */ + if (do_shrink) + tgt_grant_shrink(exp, oa, left); + else + oa->o_grant = 0; + + if (!exp_grant_param_supp(exp)) + oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant); + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_prepare_read); + +/** + * Process grant information from incoming bulk write request. + * + * This function extracts client's grant announcements from incoming bulk write + * request and attempts to allocate grant space for network buffers that need it + * (i.e. OBD_BRW_FROM_GRANT not set in rnb_fags). + * Network buffers which aren't granted the OBD_BRW_GRANTED flag should not + * proceed further and should fail with -ENOSPC. + * Whenever possible, additional grant space will be returned to the client + * in the bulk write reply. + * tgt_grant_prepare_write() must be called before writting any buffers to + * the backend storage. This function works in pair with tgt_grant_commit() + * which must be invoked once all buffers have been written to disk in order + * to release space from the pending grant counter. + * + * \param[in] env LU environment provided by the caller + * \param[in] exp export of the client which sent the request + * \param[in] oa incoming obdo sent by the client + * \param[in] rnb list of network buffers + * \param[in] niocount number of network buffers in the list + */ +void tgt_grant_prepare_write(const struct lu_env *env, + struct obd_export *exp, struct obdo *oa, + struct niobuf_remote *rnb, int niocount) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 left; + int from_cache; + int force = 0; /* can use cached data intially */ + long chunk = tgt_grant_chunk(exp, lut, NULL); + + ENTRY; + +refresh: + /* get statfs information from OSD layer */ + tgt_grant_statfs(env, exp, force, &from_cache); + + spin_lock(&tgd->tgd_grant_lock); /* protect all grant counters */ + + /* Grab free space from cached statfs data and take out space already + * granted to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* Get fresh statfs data if we are short in ungranted space */ + if (from_cache && left < 32 * chunk) { + spin_unlock(&tgd->tgd_grant_lock); + CDEBUG(D_CACHE, "%s: fs has no space left and statfs too old\n", + obd->obd_name); + force = 1; + goto refresh; + } + + /* When close to free space exhaustion, trigger a sync to force + * writeback cache to consume required space immediately and release as + * much space as possible. 
*/ + if (!obd->obd_recovering && force != 2 && left < chunk) { + bool from_grant = true; + int i; + + /* That said, it is worth running a sync only if some pages did + * not consume grant space on the client and could thus fail + * with ENOSPC later in tgt_grant_check() */ + for (i = 0; i < niocount; i++) + if (!(rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) + from_grant = false; + + if (!from_grant) { + /* at least one network buffer requires acquiring grant + * space on the server */ + spin_unlock(&tgd->tgd_grant_lock); + /* discard errors, at least we tried ... */ + dt_sync(env, lut->lut_bottom); + force = 2; + goto refresh; + } + } + + /* extract incoming grant information provided by the client, + * and inflate grant counters if required */ + tgt_grant_incoming(env, exp, oa, chunk); + + /* check limit */ + tgt_grant_check(env, exp, oa, rnb, niocount, &left); + + if (!(oa->o_valid & OBD_MD_FLGRANT)) { + spin_unlock(&tgd->tgd_grant_lock); + RETURN_EXIT; + } + + /* if OBD_FL_SHRINK_GRANT is set, the client is willing to release some + * grant space. */ + if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_SHRINK_GRANT)) + tgt_grant_shrink(exp, oa, left); + else + /* grant more space back to the client if possible */ + oa->o_grant = tgt_grant_alloc(exp, oa->o_grant, oa->o_undirty, + left, chunk, true); + + if (!exp_grant_param_supp(exp)) + oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant); + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_prepare_write); + +/** + * Consume grant space reserved for object creation. + * + * Grant space is allocated to the local self export for object precreation. + * This is required to prevent object precreation from consuming grant space + * allocated to client nodes for the data writeback cache. + * This function consumes enough space to create \a nr objects and allocates + * more grant space to the self export for future precreation requests, if + * possible. 
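/*
 * Illustrative sketch (not part of this patch): the escalation loop used by
 * tgt_grant_prepare_write() above. It first retries with fresh (uncached)
 * statfs data, then forces a filesystem sync to reclaim writeback-cache
 * space before letting ungranted buffers fail with ENOSPC. The helpers below
 * are stubs standing in for the real statfs/sync calls.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t demo_space_left(int force, int *from_cache)
{
	*from_cache = (force == 0);	/* cached data unless a refresh is forced */
	return force == 2 ? 64ULL << 20 : 8ULL << 20; /* pretend sync freed space */
}

static void demo_sync(void) { puts("sync backend fs"); }

int main(void)
{
	uint64_t chunk = 16ULL << 20;
	int force = 0, from_cache;
	uint64_t left;

refresh:
	left = demo_space_left(force, &from_cache);
	if (from_cache && left < 32 * chunk) {
		force = 1;		/* statfs data too old, refresh it */
		goto refresh;
	}
	if (force != 2 && left < chunk) {
		demo_sync();		/* reclaim space from writeback cache */
		force = 2;
		goto refresh;
	}
	printf("proceed with %llu MB left\n", (unsigned long long)(left >> 20));
	return 0;
}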
+ * + * \param[in] env LU environment provided by the caller + * \param[in] exp export holding the grant space for precreation (= self + * export currently) + * \param[in] nr number of objects to be created + * + * \retval >= 0 amount of grant space allocated to the precreate request + * \retval -ENOSPC on failure + */ +long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, s64 *nr) +{ + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 left = 0; + unsigned long wanted; + unsigned long granted; + ENTRY; + + if (exp->exp_obd->obd_recovering || + lut->lut_dt_conf.ddp_inodespace == 0) + /* don't enforce grant during recovery */ + RETURN(0); + + /* Update statfs data if required */ + tgt_grant_statfs(env, exp, 1, NULL); + + /* protect all grant counters */ + spin_lock(&tgd->tgd_grant_lock); + + /* fail precreate request if there is not enough blocks available for + * writing */ + if (tgd->tgd_osfs.os_bavail - (ted->ted_grant >> tgd->tgd_blockbits) < + (tgd->tgd_osfs.os_blocks >> 10)) { + spin_unlock(&tgd->tgd_grant_lock); + CDEBUG(D_RPCTRACE, "%s: not enough space for create %llu\n", + exp->exp_obd->obd_name, + tgd->tgd_osfs.os_bavail * tgd->tgd_osfs.os_blocks); + RETURN(-ENOSPC); + } + + /* Grab free space from cached statfs data and take out space + * already granted to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* compute how much space is required to handle the precreation + * request */ + wanted = *nr * lut->lut_dt_conf.ddp_inodespace; + if (wanted > ted->ted_grant + left) { + /* that's beyond what remains, adjust the number of objects that + * can be safely precreated */ + wanted = ted->ted_grant + left; + *nr = wanted / lut->lut_dt_conf.ddp_inodespace; + if (*nr == 0) { + /* we really have no space any more for precreation, + * fail the precreate request with ENOSPC */ + spin_unlock(&tgd->tgd_grant_lock); + RETURN(-ENOSPC); + } + /* compute space needed for the new number of creations */ + wanted = *nr * lut->lut_dt_conf.ddp_inodespace; + } + LASSERT(wanted <= ted->ted_grant + left); + + if (wanted <= ted->ted_grant) { + /* we've enough grant space to handle this precreate request */ + ted->ted_grant -= wanted; + } else { + /* we need to take some space from the ungranted pool */ + tgd->tgd_tot_granted += wanted - ted->ted_grant; + left -= wanted - ted->ted_grant; + ted->ted_grant = 0; + } + granted = wanted; + ted->ted_pending += granted; + tgd->tgd_tot_pending += granted; + + /* grant more space for precreate purpose if possible. */ + wanted = OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2; + if (wanted > ted->ted_grant) { + long chunk; + + /* always try to book enough space to handle a large precreate + * request */ + chunk = tgt_grant_chunk(exp, lut, NULL); + wanted -= ted->ted_grant; + tgt_grant_alloc(exp, ted->ted_grant, wanted, left, chunk, + false); + } + spin_unlock(&tgd->tgd_grant_lock); + RETURN(granted); +} +EXPORT_SYMBOL(tgt_grant_create); + +/** + * Release grant space added to the pending counter by tgt_grant_prepare_write() + * + * Update pending grant counter once buffers have been written to the disk. 
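/*
 * Illustrative sketch (not part of this patch): how tgt_grant_create() sizes
 * a precreate request. If the requested object count does not fit in the
 * self export's grant plus the ungranted pool, the count is scaled down.
 * The values below are examples, not real ddp_inodespace numbers.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t inodespace = 512;		/* estimated bytes per object */
	uint64_t ted_grant = 64ULL << 10;	/* 64KB already granted */
	uint64_t left = 128ULL << 10;		/* 128KB ungranted */
	int64_t nr = 1000;			/* objects requested */

	uint64_t wanted = (uint64_t)nr * inodespace;	/* 512000 bytes */
	if (wanted > ted_grant + left) {
		wanted = ted_grant + left;
		nr = wanted / inodespace;	/* how many still fit */
		wanted = (uint64_t)nr * inodespace;
	}
	printf("precreating %lld objects, consuming %llu bytes\n",
	       (long long)nr, (unsigned long long)wanted);
	return 0;
}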
+ * + * \param[in] exp export of the client which sent the request + * \param[in] pending amount of reserved space to be released + * \param[in] rc return code of pre-commit operations + */ +void tgt_grant_commit(struct obd_export *exp, unsigned long pending, + int rc) +{ + struct tg_grants_data *tgd = &exp->exp_obd->u.obt.obt_lut->lut_tgd; + + ENTRY; + + /* get space accounted in tot_pending for the I/O, set in + * tgt_grant_check() */ + if (pending == 0) + RETURN_EXIT; + + spin_lock(&tgd->tgd_grant_lock); + /* Don't update statfs data for errors raised before commit (e.g. + * bulk transfer failed, ...) since we know those writes have not been + * processed. For other errors hit during commit, we cannot really tell + * whether or not something was written, so we update statfs data. + * In any case, this should not be fatal since we always get fresh + * statfs data before failing a request with ENOSPC */ + if (rc == 0) { + spin_lock(&tgd->tgd_osfs_lock); + /* Take pending out of cached statfs data */ + tgd->tgd_osfs.os_bavail -= min_t(u64, + tgd->tgd_osfs.os_bavail, + pending >> tgd->tgd_blockbits); + if (tgd->tgd_statfs_inflight) + /* someone is running statfs and want to be notified of + * writes happening meanwhile */ + tgd->tgd_osfs_inflight += pending; + spin_unlock(&tgd->tgd_osfs_lock); + } + + if (exp->exp_target_data.ted_pending < pending) { + CERROR("%s: cli %s/%p ted_pending(%lu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + exp->exp_target_data.ted_pending, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + exp->exp_target_data.ted_pending -= pending; + + if (tgd->tgd_tot_granted < pending) { + CERROR("%s: cli %s/%p tot_granted(%llu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_granted, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + tgd->tgd_tot_granted -= pending; + + if (tgd->tgd_tot_pending < pending) { + CERROR("%s: cli %s/%p tot_pending(%llu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_pending, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + tgd->tgd_tot_pending -= pending; + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_commit); + +struct tgt_grant_cb { + /* commit callback structure */ + struct dt_txn_commit_cb tgc_cb; + /* export associated with the bulk write */ + struct obd_export *tgc_exp; + /* pending grant to be released */ + unsigned long tgc_granted; +}; + +/** + * Callback function for grant releasing + * + * Release grant space reserved by the client node. + * + * \param[in] env execution environment + * \param[in] th transaction handle + * \param[in] cb callback data + * \param[in] err error code + */ +static void tgt_grant_commit_cb(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_grant_cb *tgc; + + tgc = container_of(cb, struct tgt_grant_cb, tgc_cb); + + tgt_grant_commit(tgc->tgc_exp, tgc->tgc_granted, err); + class_export_cb_put(tgc->tgc_exp); + OBD_FREE_PTR(tgc); +} + +/** + * Add callback for grant releasing + * + * Register a commit callback to release grant space. 
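+ *
+ * A minimal usage sketch ('th', 'exp' and 'granted' are assumed to come
+ * from the bulk write path; the fallback shown is only one possibility):
+ *
+ *    rc = tgt_grant_commit_cb_add(th, exp, granted);
+ *    if (rc != 0)
+ *            tgt_grant_commit(exp, granted, rc);
+ *
+ * On success the registered callback invokes tgt_grant_commit() itself when
+ * the transaction commits, so the caller must not release 'granted' again.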
+ * + * \param[in] th transaction handle + * \param[in] exp OBD export of client + * \param[in] granted amount of grant space to be released upon commit + * + * \retval 0 on successful callback adding + * \retval negative value on error + */ +int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, + unsigned long granted) +{ + struct tgt_grant_cb *tgc; + struct dt_txn_commit_cb *dcb; + int rc; + ENTRY; + + OBD_ALLOC_PTR(tgc); + if (tgc == NULL) + RETURN(-ENOMEM); + + tgc->tgc_exp = class_export_cb_get(exp); + tgc->tgc_granted = granted; + + dcb = &tgc->tgc_cb; + dcb->dcb_func = tgt_grant_commit_cb; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_grant_commit_cb", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(tgc->tgc_exp); + OBD_FREE_PTR(tgc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_grant_commit_cb_add); + +/** + * Show estimate of total amount of dirty data on clients. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_dirty); +} +EXPORT_SYMBOL(tot_dirty_show); + +/** + * Show total amount of space granted to clients. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_granted); +} +EXPORT_SYMBOL(tot_granted_show); + +/** + * Show total amount of space used by IO in progress. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_pending); +} +EXPORT_SYMBOL(tot_pending_show); + +/** + * Show if grants compatibility mode is disabled. + * + * When tgd_grant_compat_disable is set, we don't grant any space to clients + * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such + * a client is inflated since it consumes PAGE_SIZE of grant space per + * block, (i.e. typically 4kB units), but underlaying file system might have + * block size bigger than page size, e.g. ZFS. See LU-2049 for details. 
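+ *
+ * Rough illustration (numbers are only an example): with 4kB pages on the
+ * client and a 128kB backend block size, each dirty page may consume a full
+ * 128kB block on disk, so the server has to grant roughly 32 times the space
+ * such a client accounts for. Disabling the compatibility mode avoids this
+ * inflation by not granting any space to those clients.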
+ * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: string length of @buf output on success + */ +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + + return scnprintf(buf, PAGE_SIZE, "%u\n", tgd->tgd_grant_compat_disable); +} +EXPORT_SYMBOL(grant_compat_disable_show); + +/** + * Change grant compatibility mode. + * + * Setting tgd_grant_compat_disable prohibit any space granting to clients + * not supporting OBD_CONNECT_GRANT_PARAM. See details above. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buffer string which represents mode + * 1: disable compatibility mode + * 0: enable compatibility mode + * @count @buffer length + * + * Return: @count on success + * negative number on error + */ +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + tgd->tgd_grant_compat_disable = val; + + return count; +} +EXPORT_SYMBOL(grant_compat_disable_store); diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c new file mode 100644 index 0000000000000..2ec6d01e60d91 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c @@ -0,0 +1,2793 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2017, Intel Corporation. + */ +/* + * lustre/target/tgt_handler.c + * + * Lustre Unified Target request handler code + * + * Author: Brian Behlendorf + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif + +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +char *tgt_name(struct lu_target *tgt) +{ + LASSERT(tgt->lut_obd != NULL); + return tgt->lut_obd->obd_name; +} +EXPORT_SYMBOL(tgt_name); + +/* + * Generic code handling requests that have struct mdt_body passed in: + * + * - extract mdt_body from request and save it in @tsi, if present; + * + * - create lu_object, corresponding to the fid in mdt_body, and save it in + * @tsi; + * + * - if HABEO_CORPUS flag is set for this request type check whether object + * actually exists on storage (lu_object_exists()). 
+ * + */ +static int tgt_mdt_body_unpack(struct tgt_session_info *tsi, __u32 flags) +{ + const struct mdt_body *body; + struct lu_object *obj; + struct req_capsule *pill = tsi->tsi_pill; + int rc; + + ENTRY; + + body = req_capsule_client_get(pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EFAULT); + + tsi->tsi_mdt_body = body; + + if (!(body->mbo_valid & OBD_MD_FLID)) + RETURN(0); + + /* mdc_pack_body() doesn't check if fid is zero and set OBD_ML_FID + * in any case in pre-2.5 clients. Fix that here if needed */ + if (unlikely(fid_is_zero(&body->mbo_fid1))) + RETURN(0); + + if (!fid_is_sane(&body->mbo_fid1)) { + CERROR("%s: invalid FID: "DFID"\n", tgt_name(tsi->tsi_tgt), + PFID(&body->mbo_fid1)); + RETURN(-EINVAL); + } + + obj = lu_object_find(tsi->tsi_env, + &tsi->tsi_tgt->lut_bottom->dd_lu_dev, + &body->mbo_fid1, NULL); + if (!IS_ERR(obj)) { + if ((flags & HABEO_CORPUS) && !lu_object_exists(obj)) { + lu_object_put(tsi->tsi_env, obj); + rc = -ENOENT; + } else { + tsi->tsi_corpus = obj; + rc = 0; + } + } else { + rc = PTR_ERR(obj); + } + + tsi->tsi_fid = body->mbo_fid1; + + RETURN(rc); +} + +/** + * Validate oa from client. + * If the request comes from 2.0 clients, currently only RSVD seq and IDIF + * req are valid. + * a. objects in Single MDT FS seq = FID_SEQ_OST_MDT0, oi_id != 0 + * b. Echo objects(seq = 2), old echo client still use oi_id/oi_seq to + * pack ost_id. Because non-zero oi_seq will make it diffcult to tell + * whether this is oi_fid or real ostid. So it will check + * OBD_CONNECT_FID, then convert the ostid to FID for old client. + * c. Old FID-disable osc will send IDIF. + * d. new FID-enable osc/osp will send normal FID. + * + * And also oi_id/f_oid should always start from 1. oi_id/f_oid = 0 will + * be used for LAST_ID file, and only being accessed inside OST now. + */ +int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa) +{ + struct ost_id *oi = &oa->o_oi; + u64 seq = ostid_seq(oi); + u64 id = ostid_id(oi); + int rc; + ENTRY; + + if (unlikely(!(exp_connect_flags(tsi->tsi_exp) & OBD_CONNECT_FID) && + fid_seq_is_echo(seq))) { + /* Sigh 2.[123] client still sends echo req with oi_id = 0 + * during create, and we will reset this to 1, since this + * oi_id is basically useless in the following create process, + * but oi_id == 0 will make it difficult to tell whether it is + * real FID or ost_id. */ + oi->oi_fid.f_seq = FID_SEQ_ECHO; + oi->oi_fid.f_oid = id ?: 1; + oi->oi_fid.f_ver = 0; + } else { + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + + if (unlikely((oa->o_valid & OBD_MD_FLID) && id == 0)) + GOTO(out, rc = -EPROTO); + + /* Note: this check might be forced in 2.5 or 2.6, i.e. 
+ * all of the requests are required to setup FLGROUP */ + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) { + ostid_set_seq_mdt0(oi); + oa->o_valid |= OBD_MD_FLGROUP; + seq = ostid_seq(oi); + } + + if (unlikely(!(fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq) || + fid_seq_is_norm(seq) || fid_seq_is_echo(seq)))) + GOTO(out, rc = -EPROTO); + + rc = ostid_to_fid(&tti->tti_fid1, oi, + tsi->tsi_tgt->lut_lsd.lsd_osd_index); + if (unlikely(rc != 0)) + GOTO(out, rc); + + oi->oi_fid = tti->tti_fid1; + } + + RETURN(0); + +out: + CERROR("%s: client %s sent bad object "DOSTID": rc = %d\n", + tgt_name(tsi->tsi_tgt), obd_export_nid2str(tsi->tsi_exp), + seq, id, rc); + return rc; +} +EXPORT_SYMBOL(tgt_validate_obdo); + +static int tgt_io_data_unpack(struct tgt_session_info *tsi, struct ost_id *oi) +{ + unsigned max_brw; + struct niobuf_remote *rnb; + struct obd_ioobj *ioo; + int obj_count; + + ENTRY; + + ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ); + if (ioo == NULL) + RETURN(-EPROTO); + + rnb = req_capsule_client_get(tsi->tsi_pill, &RMF_NIOBUF_REMOTE); + if (rnb == NULL) + RETURN(-EPROTO); + + max_brw = ioobj_max_brw_get(ioo); + if (unlikely((max_brw & (max_brw - 1)) != 0)) { + CERROR("%s: client %s sent bad ioobj max %u for "DOSTID + ": rc = %d\n", tgt_name(tsi->tsi_tgt), + obd_export_nid2str(tsi->tsi_exp), max_brw, + POSTID(oi), -EPROTO); + RETURN(-EPROTO); + } + ioo->ioo_oid = *oi; + + obj_count = req_capsule_get_size(tsi->tsi_pill, &RMF_OBD_IOOBJ, + RCL_CLIENT) / sizeof(*ioo); + if (obj_count == 0) { + CERROR("%s: short ioobj\n", tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } else if (obj_count > 1) { + CERROR("%s: too many ioobjs (%d)\n", tgt_name(tsi->tsi_tgt), + obj_count); + RETURN(-EPROTO); + } + + if (ioo->ioo_bufcnt == 0) { + CERROR("%s: ioo has zero bufcnt\n", tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } + + if (ioo->ioo_bufcnt > PTLRPC_MAX_BRW_PAGES) { + DEBUG_REQ(D_RPCTRACE, tgt_ses_req(tsi), + "bulk has too many pages (%d)", + ioo->ioo_bufcnt); + RETURN(-EPROTO); + } + + RETURN(0); +} + +static int tgt_ost_body_unpack(struct tgt_session_info *tsi, __u32 flags) +{ + struct ost_body *body; + struct req_capsule *pill = tsi->tsi_pill; + struct lu_nodemap *nodemap; + int rc; + + ENTRY; + + body = req_capsule_client_get(pill, &RMF_OST_BODY); + if (body == NULL) + RETURN(-EFAULT); + + rc = tgt_validate_obdo(tsi, &body->oa); + if (rc) + RETURN(rc); + + nodemap = nodemap_get_from_exp(tsi->tsi_exp); + if (IS_ERR(nodemap)) + RETURN(PTR_ERR(nodemap)); + + body->oa.o_uid = nodemap_map_id(nodemap, NODEMAP_UID, + NODEMAP_CLIENT_TO_FS, + body->oa.o_uid); + body->oa.o_gid = nodemap_map_id(nodemap, NODEMAP_GID, + NODEMAP_CLIENT_TO_FS, + body->oa.o_gid); + nodemap_putref(nodemap); + + tsi->tsi_ost_body = body; + tsi->tsi_fid = body->oa.o_oi.oi_fid; + + if (req_capsule_has_field(pill, &RMF_OBD_IOOBJ, RCL_CLIENT)) { + rc = tgt_io_data_unpack(tsi, &body->oa.o_oi); + if (rc < 0) + RETURN(rc); + } + + if (!(body->oa.o_valid & OBD_MD_FLID)) { + if (flags & HABEO_CORPUS) { + CERROR("%s: OBD_MD_FLID flag is not set in ost_body " + "but OID/FID is mandatory with HABEO_CORPUS\n", + tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } else { + RETURN(0); + } + } + + ost_fid_build_resid(&tsi->tsi_fid, &tsi->tsi_resid); + + /* + * OST doesn't get object in advance for further use to prevent + * situations with nested object_find which is potential deadlock. + */ + tsi->tsi_corpus = NULL; + RETURN(rc); +} + +/* + * Do necessary preprocessing according to handler ->th_flags. 
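+ *
+ * Summary of the flags as they are used in this function and in
+ * tgt_handle_request0():
+ *
+ *  - HABEO_CORPUS: the request body must reference an existing object;
+ *  - HABEO_REFERO: a reply format is expected, so the generic code packs
+ *                  the reply buffers;
+ *  - HABEO_CLAVIS: the request is expected to carry an LDLM request;
+ *  - MUTABOR:      the request modifies the target and is rejected with
+ *                  -EROFS on read-only exports.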
+ */ +static int tgt_request_preprocess(struct tgt_session_info *tsi, + struct tgt_handler *h, + struct ptlrpc_request *req) +{ + struct req_capsule *pill = tsi->tsi_pill; + __u32 flags = h->th_flags; + int rc = 0; + + ENTRY; + + if (tsi->tsi_preprocessed) + RETURN(0); + + LASSERT(h->th_act != NULL); + LASSERT(h->th_opc == lustre_msg_get_opc(req->rq_reqmsg)); + LASSERT(current->journal_info == NULL); + + LASSERT(ergo(flags & (HABEO_CORPUS | HABEO_REFERO), + h->th_fmt != NULL)); + if (h->th_fmt != NULL) { + req_capsule_set(pill, h->th_fmt); + if (req_capsule_has_field(pill, &RMF_MDT_BODY, RCL_CLIENT)) { + rc = tgt_mdt_body_unpack(tsi, flags); + if (rc < 0) + RETURN(rc); + } else if (req_capsule_has_field(pill, &RMF_OST_BODY, + RCL_CLIENT)) { + rc = tgt_ost_body_unpack(tsi, flags); + if (rc < 0) + RETURN(rc); + } + } + + if (flags & MUTABOR && tgt_conn_flags(tsi) & OBD_CONNECT_RDONLY) + RETURN(-EROFS); + + if (flags & HABEO_CLAVIS) { + struct ldlm_request *dlm_req; + + LASSERT(h->th_fmt != NULL); + + dlm_req = req_capsule_client_get(pill, &RMF_DLM_REQ); + if (dlm_req != NULL) { + union ldlm_wire_policy_data *policy = + &dlm_req->lock_desc.l_policy_data; + + if (unlikely(dlm_req->lock_desc.l_resource.lr_type == + LDLM_IBITS && + (policy->l_inodebits.bits | + policy->l_inodebits.try_bits) == 0)) { + /* + * Lock without inodebits makes no sense and + * will oops later in ldlm. If client miss to + * set such bits, do not trigger ASSERTION. + * + * For liblustre flock case, it maybe zero. + */ + rc = -EPROTO; + } else { + tsi->tsi_dlm_req = dlm_req; + } + } else { + rc = -EFAULT; + } + } + tsi->tsi_preprocessed = 1; + RETURN(rc); +} + +/* + * Invoke handler for this request opc. Also do necessary preprocessing + * (according to handler ->th_flags), and post-processing (setting of + * ->last_{xid,committed}). + */ +static int tgt_handle_request0(struct tgt_session_info *tsi, + struct tgt_handler *h, + struct ptlrpc_request *req) +{ + int serious = 0; + int rc; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + ENTRY; + + + /* When dealing with sec context requests, no export is associated yet, + * because these requests are sent before *_CONNECT requests. + * A NULL req->rq_export means the normal *_common_slice handlers will + * not be called, because there is no reference to the target. + * So deal with them by hand and jump directly to target_send_reply(). + */ + switch (opc) { + case SEC_CTX_INIT: + case SEC_CTX_INIT_CONT: + case SEC_CTX_FINI: + CFS_FAIL_TIMEOUT(OBD_FAIL_SEC_CTX_HDL_PAUSE, cfs_fail_val); + GOTO(out, rc = 0); + } + + /* + * Checking for various OBD_FAIL_$PREF_$OPC_NET codes. _Do_ not try + * to put same checks into handlers like mdt_close(), mdt_reint(), + * etc., without talking to mdt authors first. Checking same thing + * there again is useless and returning 0 error without packing reply + * is buggy! Handlers either pack reply or return error. + * + * We return 0 here and do not send any reply in order to emulate + * network failure. Do not send any reply in case any of NET related + * fail_id has occured. 
+ */ + if (OBD_FAIL_CHECK_ORSET(h->th_fail_id, OBD_FAIL_ONCE)) + RETURN(0); + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT && + OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET))) + RETURN(0); + + rc = tgt_request_preprocess(tsi, h, req); + /* pack reply if reply format is fixed */ + if (rc == 0 && h->th_flags & HABEO_REFERO) { + /* Pack reply */ + if (req_capsule_has_field(tsi->tsi_pill, &RMF_MDT_MD, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_MDT_MD, + RCL_SERVER, + tsi->tsi_mdt_body->mbo_eadatasize); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_LOGCOOKIES, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_LOGCOOKIES, + RCL_SERVER, 0); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_ACL, RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, + &RMF_ACL, RCL_SERVER, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + + if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER)) { + struct niobuf_remote *remote_nb = + req_capsule_client_get(tsi->tsi_pill, + &RMF_NIOBUF_REMOTE); + struct ost_body *body = tsi->tsi_ost_body; + + req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER, + (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) ? + remote_nb[0].rnb_len : 0); + } + + rc = req_capsule_server_pack(tsi->tsi_pill); + } + + if (likely(rc == 0)) { + /* + * Process request, there can be two types of rc: + * 1) errors with msg unpack/pack, other failures outside the + * operation itself. This is counted as serious errors; + * 2) errors during fs operation, should be placed in rq_status + * only + */ + rc = h->th_act(tsi); + if (!is_serious(rc) && + !req->rq_no_reply && req->rq_reply_state == NULL) { + DEBUG_REQ(D_ERROR, req, "%s \"handler\" %s did not " + "pack reply and returned 0 error\n", + tgt_name(tsi->tsi_tgt), h->th_name); + LBUG(); + } + serious = is_serious(rc); + rc = clear_serious(rc); + } else { + serious = 1; + } + + req->rq_status = rc; + + /* + * ELDLM_* codes which > 0 should be in rq_status only as well as + * all non-serious errors. + */ + if (rc > 0 || !serious) + rc = 0; + + LASSERT(current->journal_info == NULL); + + if (likely(rc == 0 && req->rq_export)) + target_committed_to_req(req); + +out: + target_send_reply(req, rc, tsi->tsi_reply_fail_id); + RETURN(0); +} + +static int tgt_filter_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd, int *process) +{ + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_DISCONNECT: + case OST_DISCONNECT: + case OBD_IDX_READ: + *process = 1; + RETURN(0); + case MDS_CLOSE: + case MDS_SYNC: /* used in unmounting */ + case OBD_PING: + case MDS_REINT: + case OUT_UPDATE: + case SEQ_QUERY: + case FLD_QUERY: + case FLD_READ: + case LDLM_ENQUEUE: + case OST_CREATE: + case OST_DESTROY: + case OST_PUNCH: + case OST_SETATTR: + case OST_SYNC: + case OST_WRITE: + case MDS_HSM_PROGRESS: + case MDS_HSM_STATE_SET: + case MDS_HSM_REQUEST: + *process = target_queue_recovery_request(req, obd); + RETURN(0); + + default: + DEBUG_REQ(D_ERROR, req, "not permitted during recovery"); + *process = -EAGAIN; + RETURN(0); + } +} + +/* + * Handle recovery. 
Return: + * +1: continue request processing; + * -ve: abort immediately with the given error code; + * 0: send reply with error code in req->rq_status; + */ +static int tgt_handle_recovery(struct ptlrpc_request *req, int reply_fail_id) +{ + ENTRY; + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_CONNECT: + case OST_CONNECT: + case MGS_CONNECT: + case SEC_CTX_INIT: + case SEC_CTX_INIT_CONT: + case SEC_CTX_FINI: + RETURN(+1); + } + + if (!req->rq_export->exp_obd->obd_replayable) + RETURN(+1); + + /* sanity check: if the xid matches, the request must be marked as a + * resent or replayed */ + if (req_can_reconstruct(req, NULL)) { + if (!(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY))) { + DEBUG_REQ(D_WARNING, req, "rq_xid %llu matches " + "saved xid, expected REPLAY or RESENT flag " + "(%x)", req->rq_xid, + lustre_msg_get_flags(req->rq_reqmsg)); + req->rq_status = -ENOTCONN; + RETURN(-ENOTCONN); + } + } + /* else: note the opposite is not always true; a RESENT req after a + * failover will usually not match the last_xid, since it was likely + * never committed. A REPLAYed request will almost never match the + * last xid, however it could for a committed, but still retained, + * open. */ + + /* Check for aborted recovery... */ + if (unlikely(req->rq_export->exp_obd->obd_recovering)) { + int rc; + int should_process; + + DEBUG_REQ(D_INFO, req, "Got new replay"); + rc = tgt_filter_recovery_request(req, req->rq_export->exp_obd, + &should_process); + if (rc != 0 || !should_process) + RETURN(rc); + else if (should_process < 0) { + req->rq_status = should_process; + rc = ptlrpc_error(req); + RETURN(rc); + } + } + RETURN(+1); +} + +/* Initial check for request, it is validation mostly */ +static struct tgt_handler *tgt_handler_find_check(struct ptlrpc_request *req) +{ + struct tgt_handler *h; + struct tgt_opc_slice *s; + struct lu_target *tgt; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + ENTRY; + + tgt = class_exp2tgt(req->rq_export); + if (unlikely(tgt == NULL)) { + DEBUG_REQ(D_ERROR, req, "%s: No target for connected export\n", + class_exp2obd(req->rq_export)->obd_name); + RETURN(ERR_PTR(-EINVAL)); + } + + for (s = tgt->lut_slice; s->tos_hs != NULL; s++) + if (s->tos_opc_start <= opc && opc < s->tos_opc_end) + break; + + /* opcode was not found in slice */ + if (unlikely(s->tos_hs == NULL)) { + static bool printed; + + /* don't print error messages for known unhandled RPCs */ + if (opc != OST_FALLOCATE && opc != OST_SEEK && !printed) { + CERROR("%s: no handler for opcode 0x%x from %s\n", + tgt_name(tgt), opc, libcfs_id2str(req->rq_peer)); + printed = true; + } + RETURN(ERR_PTR(-ENOTSUPP)); + } + + LASSERT(opc >= s->tos_opc_start && opc < s->tos_opc_end); + h = s->tos_hs + (opc - s->tos_opc_start); + if (unlikely(h->th_opc == 0)) { + CERROR("%s: unsupported opcode 0x%x\n", tgt_name(tgt), opc); + RETURN(ERR_PTR(-ENOTSUPP)); + } + + RETURN(h); +} + +static int process_req_last_xid(struct ptlrpc_request *req) +{ + __u64 last_xid; + ENTRY; + + /* check request's xid is consistent with export's last_xid */ + last_xid = lustre_msg_get_last_xid(req->rq_reqmsg); + if (last_xid > req->rq_export->exp_last_xid) + req->rq_export->exp_last_xid = last_xid; + + if (req->rq_xid == 0 || + (req->rq_xid <= req->rq_export->exp_last_xid)) { + DEBUG_REQ(D_ERROR, req, "Unexpected xid %llx vs. 
" + "last_xid %llx\n", req->rq_xid, + req->rq_export->exp_last_xid); + /* Some request is allowed to be sent during replay, + * such as OUT update requests, FLD requests, so it + * is possible that replay requests has smaller XID + * than the exp_last_xid. + * + * Some non-replay requests may have smaller XID as + * well: + * + * - Client send a no_resend RPC, like statfs; + * - The RPC timedout (or some other error) on client, + * then it's removed from the unreplied list; + * - Client send some other request to bump the + * exp_last_xid on server; + * - The former RPC got chance to be processed; + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) + RETURN(-EPROTO); + } + + /* The "last_xid" is the minimum xid among unreplied requests, + * if the request is from the previous connection, its xid can + * still be larger than "exp_last_xid", then the above check of + * xid is not enough to determine whether the request is delayed. + * + * For example, if some replay request was delayed and caused + * timeout at client and the replay is restarted, the delayed + * replay request will have the larger xid than "exp_last_xid" + */ + if (req->rq_export->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg)) + RETURN(-ESTALE); + + /* try to release in-memory reply data */ + if (tgt_is_multimodrpcs_client(req->rq_export)) { + tgt_handle_received_xid(req->rq_export, + lustre_msg_get_last_xid(req->rq_reqmsg)); + if (!(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY))) + tgt_handle_tag(req->rq_export, + lustre_msg_get_tag(req->rq_reqmsg)); + } + RETURN(0); +} + +int tgt_request_handle(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi = tgt_ses_info(req->rq_svc_thread->t_env); + + struct lustre_msg *msg = req->rq_reqmsg; + struct tgt_handler *h; + struct lu_target *tgt; + int request_fail_id = 0; + __u32 opc = lustre_msg_get_opc(msg); + struct obd_device *obd; + int rc; + bool is_connect = false; + ENTRY; + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 0 && + lustre_msg_get_opc(msg) != OBD_PING && + lustre_msg_get_flags(msg) & MSG_REQ_REPLAY_DONE) { + struct l_wait_info lwi = { 0 }; + + cfs_fail_val = 1; + cfs_race_state = 0; + l_wait_event(cfs_race_waitq, (cfs_race_state == 1), + &lwi); + } + } + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + tsi->tsi_pill = &req->rq_pill; + tsi->tsi_env = req->rq_svc_thread->t_env; + + /* if request has export then get handlers slice from corresponding + * target, otherwise that should be connect operation */ + if (opc == MDS_CONNECT || opc == OST_CONNECT || + opc == MGS_CONNECT) { + is_connect = true; + req_capsule_set(&req->rq_pill, &RQF_CONNECT); + rc = target_handle_connect(req); + if (rc != 0) { + rc = ptlrpc_error(req); + GOTO(out, rc); + } + /* recovery-small test 18c asks to drop connect reply */ + if (unlikely(opc == OST_CONNECT && + OBD_FAIL_CHECK(OBD_FAIL_OST_CONNECT_NET2))) + GOTO(out, rc = 0); + } + + if (unlikely(!class_connected_export(req->rq_export))) { + if (opc == SEC_CTX_INIT || opc == SEC_CTX_INIT_CONT || + opc == SEC_CTX_FINI) { + /* sec context initialization has to be handled + * by hand in tgt_handle_request0() */ + tsi->tsi_reply_fail_id = OBD_FAIL_SEC_CTX_INIT_NET; + h = NULL; + GOTO(handle_recov, rc = 0); + } + CDEBUG(D_HA, "operation %d on unconnected OST from %s\n", + opc, libcfs_id2str(req->rq_peer)); + req->rq_status = -ENOTCONN; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + tsi->tsi_tgt = tgt = class_exp2tgt(req->rq_export); + 
tsi->tsi_exp = req->rq_export; + if (exp_connect_flags(req->rq_export) & OBD_CONNECT_JOBSTATS) + tsi->tsi_jobid = lustre_msg_get_jobid(req->rq_reqmsg); + else + tsi->tsi_jobid = NULL; + + if (tgt == NULL) { + DEBUG_REQ(D_ERROR, req, "%s: No target for connected export\n", + class_exp2obd(req->rq_export)->obd_name); + req->rq_status = -EINVAL; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + /* Skip last_xid processing for the recovery thread, otherwise, the + * last_xid on same request could be processed twice: first time when + * processing the incoming request, second time when the request is + * being processed by recovery thread. */ + obd = class_exp2obd(req->rq_export); + if (is_connect) { + /* reset the exp_last_xid on each connection. */ + req->rq_export->exp_last_xid = 0; + } else if (obd->obd_recovery_data.trd_processing_task != + current_pid()) { + rc = process_req_last_xid(req); + if (rc) { + req->rq_status = rc; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + } + + request_fail_id = tgt->lut_request_fail_id; + tsi->tsi_reply_fail_id = tgt->lut_reply_fail_id; + + h = tgt_handler_find_check(req); + if (IS_ERR(h)) { + req->rq_status = PTR_ERR(h); + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n", + h->th_opc, opc); + + if (CFS_FAIL_CHECK_ORSET(request_fail_id, CFS_FAIL_ONCE)) + GOTO(out, rc = 0); + + rc = lustre_msg_check_version(msg, h->th_version); + if (unlikely(rc)) { + DEBUG_REQ(D_ERROR, req, "%s: drop mal-formed request, version" + " %08x, expecting %08x\n", tgt_name(tgt), + lustre_msg_get_version(msg), h->th_version); + req->rq_status = -EINVAL; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + +handle_recov: + rc = tgt_handle_recovery(req, tsi->tsi_reply_fail_id); + if (likely(rc == 1)) { + rc = tgt_handle_request0(tsi, h, req); + if (rc) + GOTO(out, rc); + } + EXIT; +out: + req_capsule_fini(tsi->tsi_pill); + if (tsi->tsi_corpus != NULL) { + lu_object_put(tsi->tsi_env, tsi->tsi_corpus); + tsi->tsi_corpus = NULL; + } + return rc; +} +EXPORT_SYMBOL(tgt_request_handle); + +/** Assign high priority operations to the request if needed. */ +int tgt_hpreq_handler(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi = tgt_ses_info(req->rq_svc_thread->t_env); + struct tgt_handler *h; + int rc; + + ENTRY; + + if (req->rq_export == NULL) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + tsi->tsi_pill = &req->rq_pill; + tsi->tsi_env = req->rq_svc_thread->t_env; + tsi->tsi_tgt = class_exp2tgt(req->rq_export); + tsi->tsi_exp = req->rq_export; + + h = tgt_handler_find_check(req); + if (IS_ERR(h)) { + rc = PTR_ERR(h); + RETURN(rc); + } + + rc = tgt_request_preprocess(tsi, h, req); + if (unlikely(rc != 0)) + RETURN(rc); + + if (h->th_hp != NULL) + h->th_hp(tsi); + RETURN(0); +} +EXPORT_SYMBOL(tgt_hpreq_handler); + +void tgt_counter_incr(struct obd_export *exp, int opcode) +{ + lprocfs_counter_incr(exp->exp_obd->obd_stats, opcode); + if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats != NULL) + lprocfs_counter_incr(exp->exp_nid_stats->nid_stats, opcode); +} +EXPORT_SYMBOL(tgt_counter_incr); + +/* + * Unified target generic handlers. 
+ */ + +int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp) +{ + struct lu_target *tgt = class_exp2tgt(exp); + struct sptlrpc_flavor flvr; + int rc = 0; + + LASSERT(tgt); + LASSERT(tgt->lut_obd); + LASSERT(tgt->lut_slice); + + /* always allow ECHO client */ + if (unlikely(strcmp(exp->exp_obd->obd_type->typ_name, + LUSTRE_ECHO_NAME) == 0)) { + exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; + return 0; + } + + if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + read_lock(&tgt->lut_sptlrpc_lock); + sptlrpc_target_choose_flavor(&tgt->lut_sptlrpc_rset, + req->rq_sp_from, + req->rq_peer.nid, + &flvr); + read_unlock(&tgt->lut_sptlrpc_lock); + + spin_lock(&exp->exp_lock); + exp->exp_sp_peer = req->rq_sp_from; + exp->exp_flvr = flvr; + + /* when on mgs, if no restriction is set, or if the client + * NID is on the local node, allow any flavor + */ + if ((strcmp(exp->exp_obd->obd_type->typ_name, + LUSTRE_MGS_NAME) == 0) && + (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_NULL || + LNetIsPeerLocal(exp->exp_connection->c_peer.nid))) + exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; + + if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && + exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CERROR("%s: unauthorized rpc flavor %x from %s, " + "expect %x\n", tgt_name(tgt), + req->rq_flvr.sf_rpc, + libcfs_nid2str(req->rq_peer.nid), + exp->exp_flvr.sf_rpc); + rc = -EACCES; + } + spin_unlock(&exp->exp_lock); + } else { + if (exp->exp_sp_peer != req->rq_sp_from) { + CERROR("%s: RPC source %s doesn't match %s\n", + tgt_name(tgt), + sptlrpc_part2name(req->rq_sp_from), + sptlrpc_part2name(exp->exp_sp_peer)); + rc = -EACCES; + } else { + rc = sptlrpc_target_export_check(exp, req); + } + } + + return rc; +} + +int tgt_adapt_sptlrpc_conf(struct lu_target *tgt) +{ + struct sptlrpc_rule_set tmp_rset; + int rc; + + if (unlikely(tgt == NULL)) { + CERROR("No target passed"); + return -EINVAL; + } + + sptlrpc_rule_set_init(&tmp_rset); + rc = sptlrpc_conf_target_get_rules(tgt->lut_obd, &tmp_rset); + if (rc) { + CERROR("%s: failed get sptlrpc rules: rc = %d\n", + tgt_name(tgt), rc); + return rc; + } + + sptlrpc_target_update_exp_flavor(tgt->lut_obd, &tmp_rset); + + write_lock(&tgt->lut_sptlrpc_lock); + sptlrpc_rule_set_free(&tgt->lut_sptlrpc_rset); + tgt->lut_sptlrpc_rset = tmp_rset; + write_unlock(&tgt->lut_sptlrpc_lock); + + return 0; +} +EXPORT_SYMBOL(tgt_adapt_sptlrpc_conf); + +int tgt_connect(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_connect_data *reply; + int rc; + + ENTRY; + + /* XXX: better to call this check right after getting new export but + * before last_rcvd slot allocation to avoid server load upon insecure + * connects. This is to be fixed after unifiyng all targets. + */ + rc = tgt_connect_check_sptlrpc(req, tsi->tsi_exp); + if (rc) + GOTO(out, rc); + + /* To avoid exposing partially initialized connection flags, changes up + * to this point have been staged in reply->ocd_connect_flags. Now that + * connection handling has completed successfully, atomically update + * the connect flags in the shared export data structure. 
LU-1623 */ + reply = req_capsule_server_get(tsi->tsi_pill, &RMF_CONNECT_DATA); + spin_lock(&tsi->tsi_exp->exp_lock); + *exp_connect_flags_ptr(tsi->tsi_exp) = reply->ocd_connect_flags; + if (reply->ocd_connect_flags & OBD_CONNECT_FLAGS2) + *exp_connect_flags2_ptr(tsi->tsi_exp) = + reply->ocd_connect_flags2; + tsi->tsi_exp->exp_connect_data.ocd_brw_size = reply->ocd_brw_size; + spin_unlock(&tsi->tsi_exp->exp_lock); + + if (strcmp(tsi->tsi_exp->exp_obd->obd_type->typ_name, + LUSTRE_MDT_NAME) == 0) { + rc = req_check_sepol(tsi->tsi_pill); + if (rc) + GOTO(out, rc); + } + + RETURN(0); +out: + obd_disconnect(class_export_get(tsi->tsi_exp)); + return rc; +} +EXPORT_SYMBOL(tgt_connect); + +int tgt_disconnect(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_DISCONNECT_DELAY, cfs_fail_val); + + rc = target_handle_disconnect(tgt_ses_req(tsi)); + if (rc) + RETURN(err_serious(rc)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_disconnect); + +/* + * Unified target OBD handlers + */ +int tgt_obd_ping(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + /* The target-specific part of OBD_PING request handling. + * It controls Filter Modification Data (FMD) expiration each time + * PING is received. + * + * Valid only for replayable targets, e.g. MDT and OFD + */ + if (tsi->tsi_exp->exp_obd->obd_replayable) + tgt_fmd_expire(tsi->tsi_exp); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (rc) + RETURN(err_serious(rc)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_obd_ping); + +int tgt_obd_log_cancel(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +int tgt_send_buffer(struct tgt_session_info *tsi, struct lu_rdbuf *rdbuf) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + struct l_wait_info *lwi = &tti->tti_u.update.tti_wait_info; + int i; + int rc; + + ENTRY; + + desc = ptlrpc_prep_bulk_exp(req, rdbuf->rb_nbufs, 1, + PTLRPC_BULK_PUT_SOURCE | PTLRPC_BULK_BUF_KVEC, + MDS_BULK_PORTAL, &ptlrpc_bulk_kvec_ops); + if (desc == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < rdbuf->rb_nbufs; i++) + desc->bd_frag_ops->add_iov_frag(desc, + rdbuf->rb_bufs[i].lb_buf, + rdbuf->rb_bufs[i].lb_len); + + rc = target_bulk_io(exp, desc, lwi); + ptlrpc_free_bulk(desc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_send_buffer); + +int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + struct l_wait_info *lwi = &tti->tti_u.rdpg.tti_wait_info; + int tmpcount; + int tmpsize; + int i; + int rc; + + ENTRY; + + desc = ptlrpc_prep_bulk_exp(req, rdpg->rp_npages, 1, + PTLRPC_BULK_PUT_SOURCE | + PTLRPC_BULK_BUF_KIOV, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) + RETURN(-ENOMEM); + + if (!(exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE)) + /* old client requires reply size in it's PAGE_SIZE, + * which is rdpg->rp_count */ + nob = rdpg->rp_count; + + for (i = 0, tmpcount = nob; i < rdpg->rp_npages && tmpcount > 0; + i++, tmpcount -= tmpsize) { + tmpsize = min_t(int, tmpcount, PAGE_SIZE); + desc->bd_frag_ops->add_kiov_frag(desc, rdpg->rp_pages[i], 0, + tmpsize); + } + + LASSERT(desc->bd_nob == nob); + rc = target_bulk_io(exp, desc, lwi); + ptlrpc_free_bulk(desc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_sendpage); + +/* + * 
OBD_IDX_READ handler + */ +static int tgt_obd_idx_read(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct lu_rdpg *rdpg = &tti->tti_u.rdpg.tti_rdpg; + struct idx_info *req_ii, *rep_ii; + int rc, i; + + ENTRY; + + memset(rdpg, 0, sizeof(*rdpg)); + req_capsule_set(tsi->tsi_pill, &RQF_OBD_IDX_READ); + + /* extract idx_info buffer from request & reply */ + req_ii = req_capsule_client_get(tsi->tsi_pill, &RMF_IDX_INFO); + if (req_ii == NULL || req_ii->ii_magic != IDX_INFO_MAGIC) + RETURN(err_serious(-EPROTO)); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (rc) + RETURN(err_serious(rc)); + + rep_ii = req_capsule_server_get(tsi->tsi_pill, &RMF_IDX_INFO); + if (rep_ii == NULL) + RETURN(err_serious(-EFAULT)); + rep_ii->ii_magic = IDX_INFO_MAGIC; + + /* extract hash to start with */ + rdpg->rp_hash = req_ii->ii_hash_start; + + /* extract requested attributes */ + rdpg->rp_attrs = req_ii->ii_attrs; + + /* check that fid packed in request is valid and supported */ + if (!fid_is_sane(&req_ii->ii_fid)) + RETURN(-EINVAL); + rep_ii->ii_fid = req_ii->ii_fid; + + /* copy flags */ + rep_ii->ii_flags = req_ii->ii_flags; + + /* compute number of pages to allocate, ii_count is the number of 4KB + * containers */ + if (req_ii->ii_count <= 0) + GOTO(out, rc = -EFAULT); + rdpg->rp_count = min_t(unsigned int, req_ii->ii_count << LU_PAGE_SHIFT, + exp_max_brw_size(tsi->tsi_exp)); + rdpg->rp_npages = (rdpg->rp_count + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* allocate pages to store the containers */ + OBD_ALLOC(rdpg->rp_pages, rdpg->rp_npages * sizeof(rdpg->rp_pages[0])); + if (rdpg->rp_pages == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < rdpg->rp_npages; i++) { + rdpg->rp_pages[i] = alloc_page(GFP_NOFS); + if (rdpg->rp_pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + + /* populate pages with key/record pairs */ + rc = dt_index_read(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, rep_ii, rdpg); + if (rc < 0) + GOTO(out, rc); + + LASSERTF(rc <= rdpg->rp_count, "dt_index_read() returned more than " + "asked %d > %d\n", rc, rdpg->rp_count); + + /* send pages to client */ + rc = tgt_sendpage(tsi, rdpg, rc); + if (rc) + GOTO(out, rc); + EXIT; +out: + if (rdpg->rp_pages) { + for (i = 0; i < rdpg->rp_npages; i++) + if (rdpg->rp_pages[i]) + __free_page(rdpg->rp_pages[i]); + OBD_FREE(rdpg->rp_pages, + rdpg->rp_npages * sizeof(rdpg->rp_pages[0])); + } + return rc; +} + +struct tgt_handler tgt_obd_handlers[] = { +TGT_OBD_HDL (0, OBD_PING, tgt_obd_ping), +TGT_OBD_HDL (0, OBD_IDX_READ, tgt_obd_idx_read) +}; +EXPORT_SYMBOL(tgt_obd_handlers); + +int tgt_sync(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 start, __u64 end) +{ + int rc = 0; + + ENTRY; + + /* if no objid is specified, it means "sync whole filesystem" */ + if (obj == NULL) { + rc = dt_sync(env, tgt->lut_bottom); + } else if (dt_version_get(env, obj) > + tgt->lut_obd->obd_last_committed) { + rc = dt_object_sync(env, obj, start, end); + } + atomic_inc(&tgt->lut_sync_count); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_sync); +/* + * Unified target DLM handlers. + */ + +/** + * Unified target BAST + * + * Ensure data and metadata are synced to disk when lock is canceled if Sync on + * Cancel (SOC) is enabled. If it's extent lock, normally sync obj is enough, + * but if it's cross-MDT lock, because remote object version is not set, a + * filesystem sync is needed. 
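+ *
+ * For example (conditions paraphrased from the checks below): with
+ * lut_sync_lock_cancel set to SYNC_LOCK_CANCEL_BLOCKING, cancelling a PW
+ * extent lock that has CBPENDING set may trigger dt_object_sync() on the
+ * locked [start, end] range, while a cross-MDT (OBD_CONNECT_MDS_MDS) lock
+ * falls back to a full dt_sync() of the target.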
+ * + * \param lock server side lock + * \param desc lock desc + * \param data ldlm_cb_set_arg + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 on success + * \retval negative number on error + */ +static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lu_env env; + struct lu_target *tgt; + struct dt_object *obj = NULL; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + tgt = class_exp2tgt(lock->l_export); + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(lock->l_export)->obd_name); + RETURN(-EINVAL); + } + + if (flag == LDLM_CB_CANCELING && + (lock->l_granted_mode & (LCK_EX | LCK_PW | LCK_GROUP)) && + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_ALWAYS || + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_BLOCKING && + ldlm_is_cbpending(lock))) && + ((exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) || + lock->l_resource->lr_type == LDLM_EXTENT)) { + __u64 start = 0; + __u64 end = OBD_OBJECT_EOF; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (unlikely(rc != 0)) + GOTO(err, rc); + + ost_fid_from_resid(&fid, &lock->l_resource->lr_name, + tgt->lut_lsd.lsd_osd_index); + + if (lock->l_resource->lr_type == LDLM_EXTENT) { + obj = dt_locate(&env, tgt->lut_bottom, &fid); + if (IS_ERR(obj)) + GOTO(err_env, rc = PTR_ERR(obj)); + + if (!dt_object_exists(obj)) + GOTO(err_put, rc = -ENOENT); + + start = lock->l_policy_data.l_extent.start; + end = lock->l_policy_data.l_extent.end; + } + + rc = tgt_sync(&env, tgt, obj, start, end); + if (rc < 0) { + CERROR("%s: syncing "DFID" (%llu-%llu) on lock " + "cancel: rc = %d\n", + tgt_name(tgt), PFID(&fid), + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, rc); + } +err_put: + if (obj != NULL) + dt_object_put(&env, obj); +err_env: + lu_env_fini(&env); + } +err: + rc = ldlm_server_blocking_ast(lock, desc, data, flag); + RETURN(rc); +} + +static struct ldlm_callback_suite tgt_dlm_cbs = { + .lcs_completion = ldlm_server_completion_ast, + .lcs_blocking = tgt_blocking_ast, + .lcs_glimpse = ldlm_server_glimpse_ast +}; + +int tgt_enqueue(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + int rc; + + ENTRY; + /* + * tsi->tsi_dlm_req was already swapped and (if necessary) converted, + * tsi->tsi_dlm_cbs was set by the *_req_handle() function. 
+ */ + LASSERT(tsi->tsi_dlm_req != NULL); + rc = ldlm_handle_enqueue0(tsi->tsi_exp->exp_obd->obd_namespace, req, + tsi->tsi_dlm_req, &tgt_dlm_cbs); + if (rc) + RETURN(err_serious(rc)); + + switch (LUT_FAIL_CLASS(tsi->tsi_reply_fail_id)) { + case LUT_FAIL_MDT: + tsi->tsi_reply_fail_id = OBD_FAIL_MDS_LDLM_REPLY_NET; + break; + case LUT_FAIL_OST: + tsi->tsi_reply_fail_id = OBD_FAIL_OST_LDLM_REPLY_NET; + break; + case LUT_FAIL_MGT: + tsi->tsi_reply_fail_id = OBD_FAIL_MGS_LDLM_REPLY_NET; + break; + default: + tsi->tsi_reply_fail_id = OBD_FAIL_LDLM_REPLY; + break; + } + RETURN(req->rq_status); +} +EXPORT_SYMBOL(tgt_enqueue); + +int tgt_convert(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + int rc; + + ENTRY; + LASSERT(tsi->tsi_dlm_req); + rc = ldlm_handle_convert0(req, tsi->tsi_dlm_req); + if (rc) + RETURN(err_serious(rc)); + + RETURN(req->rq_status); +} + +int tgt_bl_callback(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +int tgt_cp_callback(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +/* generic LDLM target handler */ +struct tgt_handler tgt_dlm_handlers[] = { +TGT_DLM_HDL (HABEO_CLAVIS, LDLM_ENQUEUE, tgt_enqueue), +TGT_DLM_HDL (HABEO_CLAVIS, LDLM_CONVERT, tgt_convert), +TGT_DLM_HDL_VAR(0, LDLM_BL_CALLBACK, tgt_bl_callback), +TGT_DLM_HDL_VAR(0, LDLM_CP_CALLBACK, tgt_cp_callback) +}; +EXPORT_SYMBOL(tgt_dlm_handlers); + +/* + * Unified target LLOG handlers. + */ +int tgt_llog_open(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_open(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_open); + +int tgt_llog_read_header(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_read_header(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_read_header); + +int tgt_llog_next_block(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_next_block(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_next_block); + +int tgt_llog_prev_block(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_prev_block(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_prev_block); + +/* generic llog target handler */ +struct tgt_handler tgt_llog_handlers[] = { +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_CREATE, tgt_llog_open), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, tgt_llog_next_block), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_READ_HEADER, tgt_llog_read_header), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_PREV_BLOCK, tgt_llog_prev_block), +}; +EXPORT_SYMBOL(tgt_llog_handlers); + +/* + * sec context handlers + */ +/* XXX: Implement based on mdt_sec_ctx_handle()? 
*/ +static int tgt_sec_ctx_handle(struct tgt_session_info *tsi) +{ + return 0; +} + +struct tgt_handler tgt_sec_ctx_handlers[] = { +TGT_SEC_HDL_VAR(0, SEC_CTX_INIT, tgt_sec_ctx_handle), +TGT_SEC_HDL_VAR(0, SEC_CTX_INIT_CONT, tgt_sec_ctx_handle), +TGT_SEC_HDL_VAR(0, SEC_CTX_FINI, tgt_sec_ctx_handle), +}; +EXPORT_SYMBOL(tgt_sec_ctx_handlers); + +int (*tgt_lfsck_in_notify_local)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_req_local *lrl, + struct thandle *th) = NULL; + +void tgt_register_lfsck_in_notify_local(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_req_local *, + struct thandle *)) +{ + tgt_lfsck_in_notify_local = notify; +} +EXPORT_SYMBOL(tgt_register_lfsck_in_notify_local); + +int (*tgt_lfsck_in_notify)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_request *lr) = NULL; + +void tgt_register_lfsck_in_notify(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *)) +{ + tgt_lfsck_in_notify = notify; +} +EXPORT_SYMBOL(tgt_register_lfsck_in_notify); + +static int (*tgt_lfsck_query)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_request *req, + struct lfsck_reply *rep, + struct lfsck_query *que) = NULL; + +void tgt_register_lfsck_query(int (*query)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *, + struct lfsck_reply *, + struct lfsck_query *)) +{ + tgt_lfsck_query = query; +} +EXPORT_SYMBOL(tgt_register_lfsck_query); + +/* LFSCK request handlers */ +static int tgt_handle_lfsck_notify(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct dt_device *key = tsi->tsi_tgt->lut_bottom; + struct lfsck_request *lr; + int rc; + ENTRY; + + lr = req_capsule_client_get(tsi->tsi_pill, &RMF_LFSCK_REQUEST); + if (lr == NULL) + RETURN(-EPROTO); + + rc = tgt_lfsck_in_notify(env, key, lr); + + RETURN(rc); +} + +static int tgt_handle_lfsck_query(struct tgt_session_info *tsi) +{ + struct lfsck_request *request; + struct lfsck_reply *reply; + int rc; + ENTRY; + + request = req_capsule_client_get(tsi->tsi_pill, &RMF_LFSCK_REQUEST); + if (request == NULL) + RETURN(-EPROTO); + + reply = req_capsule_server_get(tsi->tsi_pill, &RMF_LFSCK_REPLY); + if (reply == NULL) + RETURN(-ENOMEM); + + rc = tgt_lfsck_query(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, + request, reply, NULL); + + RETURN(rc < 0 ? rc : 0); +} + +struct tgt_handler tgt_lfsck_handlers[] = { +TGT_LFSCK_HDL(HABEO_REFERO, LFSCK_NOTIFY, tgt_handle_lfsck_notify), +TGT_LFSCK_HDL(HABEO_REFERO, LFSCK_QUERY, tgt_handle_lfsck_query), +}; +EXPORT_SYMBOL(tgt_lfsck_handlers); + +/* + * initialize per-thread page pool (bug 5137). + */ +int tgt_io_thread_init(struct ptlrpc_thread *thread) +{ + struct tgt_thread_big_cache *tbc; + + ENTRY; + + LASSERT(thread != NULL); + LASSERT(thread->t_data == NULL); + + OBD_ALLOC_LARGE(tbc, sizeof(*tbc)); + if (tbc == NULL) + RETURN(-ENOMEM); + thread->t_data = tbc; + RETURN(0); +} +EXPORT_SYMBOL(tgt_io_thread_init); + +/* + * free per-thread pool created by tgt_thread_init(). + */ +void tgt_io_thread_done(struct ptlrpc_thread *thread) +{ + struct tgt_thread_big_cache *tbc; + + ENTRY; + + LASSERT(thread != NULL); + + /* + * be prepared to handle partially-initialized pools (because this is + * called from ost_io_thread_init() for cleanup. 
+ */ + tbc = thread->t_data; + if (tbc != NULL) { + OBD_FREE_LARGE(tbc, sizeof(*tbc)); + thread->t_data = NULL; + } + EXIT; +} +EXPORT_SYMBOL(tgt_io_thread_done); + +/** + * Helper function for getting Data-on-MDT file server DLM lock + * if asked by client. + */ +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags) +{ + union ldlm_policy_data policy = { + .l_inodebits.bits = MDS_INODELOCK_DOM, + }; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + rc = ldlm_cli_enqueue_local(NULL, ns, res_id, LDLM_IBITS, &policy, mode, + flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + + RETURN(rc == ELDLM_OK ? 0 : -EIO); +} +EXPORT_SYMBOL(tgt_mdt_data_lock); + +void tgt_mdt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode) +{ + LASSERT(lustre_handle_is_used(lh)); + ldlm_lock_decref(lh, mode); +} +EXPORT_SYMBOL(tgt_mdt_data_unlock); + +/** + * Helper function for getting server side [start, start+count] DLM lock + * if asked by client. + */ +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags) +{ + union ldlm_policy_data policy; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + policy.l_extent.gid = 0; + policy.l_extent.start = start & PAGE_MASK; + + /* + * If ->o_blocks is EOF it means "lock till the end of the file". + * Otherwise, it's size of an extent or hole being punched (in bytes). + */ + if (end == OBD_OBJECT_EOF || end < start) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = end | ~PAGE_MASK; + + rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_EXTENT, &policy, + mode, flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + RETURN(rc == ELDLM_OK ? 
0 : -EIO); +} +EXPORT_SYMBOL(tgt_extent_lock); + +void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode) +{ + LASSERT(lustre_handle_is_used(lh)); + ldlm_lock_decref(lh, mode); +} +EXPORT_SYMBOL(tgt_extent_unlock); + +static int tgt_brw_lock(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, struct obd_ioobj *obj, + struct niobuf_remote *nb, struct lustre_handle *lh, + enum ldlm_mode mode) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + __u64 flags = 0; + int nrbufs = obj->ioo_bufcnt; + int i; + int rc; + + ENTRY; + + LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT(!lustre_handle_is_used(lh)); + + if (ns->ns_obd->obd_recovering) + RETURN(0); + + if (nrbufs == 0 || !(nb[0].rnb_flags & OBD_BRW_SRVLOCK)) + RETURN(0); + + for (i = 1; i < nrbufs; i++) + if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK)) + RETURN(-EFAULT); + + /* MDT IO for data-on-mdt */ + if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS) + rc = tgt_mdt_data_lock(ns, res_id, lh, mode, &flags); + else + rc = tgt_extent_lock(env, ns, res_id, nb[0].rnb_offset, + nb[nrbufs - 1].rnb_offset + + nb[nrbufs - 1].rnb_len - 1, + lh, mode, &flags); + RETURN(rc); +} + +static void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode) +{ + ENTRY; + + LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT((obj->ioo_bufcnt > 0 && + (niob[0].rnb_flags & OBD_BRW_SRVLOCK)) == + lustre_handle_is_used(lh)); + + if (lustre_handle_is_used(lh)) + tgt_extent_unlock(lh, mode); + EXIT; +} +static int tgt_checksum_niobuf(struct lu_target *tgt, + struct niobuf_local *local_nb, int npages, + int opc, enum cksum_types cksum_type, + __u32 *cksum) +{ + struct ahash_request *req; + unsigned int bufsize; + int i, err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg)); + for (i = 0; i < npages; i++) { + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + cfs_crypto_hash_update_page(req, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); 
+ memcpy(ptr2 + off, "bad4", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + + bufsize = sizeof(*cksum); + err = cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); + + return 0; +} + +char dbgcksum_file_name[PATH_MAX]; + +static void dump_all_bulk_pages(struct obdo *oa, int count, + struct niobuf_local *local_nb, + __u32 server_cksum, __u32 client_cksum) +{ + struct file *filp; + int rc, i; + unsigned int len; + char *buf; + + /* will only keep dump of pages on first error for the same range in + * file/fid, not during the resends/retries. */ + snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name), + "%s-checksum_dump-ost-"DFID":[%llu-%llu]-%x-%x", + (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0 ? + libcfs_debug_file_path_arr : + LIBCFS_DEBUG_FILE_PATH_DEFAULT), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, + local_nb[0].lnb_file_offset, + local_nb[count-1].lnb_file_offset + + local_nb[count-1].lnb_len - 1, client_cksum, server_cksum); + filp = filp_open(dbgcksum_file_name, + O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + if (rc == -EEXIST) + CDEBUG(D_INFO, "%s: can't open to dump pages with " + "checksum error: rc = %d\n", dbgcksum_file_name, + rc); + else + CERROR("%s: can't open to dump pages with checksum " + "error: rc = %d\n", dbgcksum_file_name, rc); + return; + } + + for (i = 0; i < count; i++) { + len = local_nb[i].lnb_len; + buf = kmap(local_nb[i].lnb_page); + while (len != 0) { + rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); + if (rc < 0) { + CERROR("%s: wanted to write %u but got %d " + "error\n", dbgcksum_file_name, len, rc); + break; + } + len -= rc; + buf += rc; + CDEBUG(D_INFO, "%s: wrote %d bytes\n", + dbgcksum_file_name, rc); + } + kunmap(local_nb[i].lnb_page); + } + + rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc); + filp_close(filp, NULL); + return; +} + +static int check_read_checksum(struct niobuf_local *local_nb, int npages, + struct obd_export *exp, struct obdo *oa, + const struct lnet_process_id *peer, + __u32 client_cksum, __u32 server_cksum, + enum cksum_types server_cksum_type) +{ + char *msg; + enum cksum_types cksum_type; + loff_t start, end; + + /* unlikely to happen and only if resend does not occur due to cksum + * control failure on Client */ + if (unlikely(server_cksum == client_cksum)) { + CDEBUG(D_PAGE, "checksum %x confirmed upon retry\n", + client_cksum); + return 0; + } + + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(oa, npages, local_nb, server_cksum, + client_cksum); + + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? 
+ oa->o_flags : 0); + + if (cksum_type != server_cksum_type) + msg = "the server may have not used the checksum type specified" + " in the original request - likely a protocol problem"; + else + msg = "should have changed on the client or in transit"; + + start = local_nb[0].lnb_file_offset; + end = local_nb[npages-1].lnb_file_offset + + local_nb[npages-1].lnb_len - 1; + + LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode " + DFID " object "DOSTID" extent [%llu-%llu], client returned csum" + " %x (type %x), server csum %x (type %x)\n", + exp->exp_obd->obd_name, + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, + POSTID(&oa->o_oi), + start, end, client_cksum, cksum_type, server_cksum, + server_cksum_type); + + return 1; +} + +static int tgt_pages2shortio(struct niobuf_local *local, int npages, + unsigned char *buf, int size) +{ + int i, off, len, copied = size; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + if (len > size) + return -EINVAL; + + ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0); + memcpy(buf + off, ptr, len); + ll_kunmap_atomic(ptr, KM_USER0); + buf += len; + size -= len; + } + return copied - size; +} + +static int tgt_checksum_niobuf_t10pi(struct lu_target *tgt, + struct niobuf_local *local_nb, + int npages, int opc, + obd_dif_csum_fn *fn, + int sector_size, + u32 *check_sum) +{ + enum cksum_types t10_cksum_type = tgt->lut_dt_conf.ddp_t10_cksum_type; + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + const char *obd_name = tgt->lut_obd->obd_name; + struct ahash_request *req; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __u16 *guard_start; + int guard_number; + int used_number = 0; + __u32 cksum; + int rc = 0; + int used; + int i; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + for (i = 0; i < npages; i++) { + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + if (t10_cksum_type && opc == OST_READ && + local_nb[i].lnb_guard_disk) { + used = DIV_ROUND_UP(local_nb[i].lnb_len, sector_size); + if (used > (guard_number - used_number)) { + rc = -E2BIG; + break; 
} + memcpy(guard_start + used_number, + local_nb[i].lnb_guards, + used * sizeof(*local_nb[i].lnb_guards)); + } else { + rc = obd_page_dif_generate_buffer(obd_name, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len, guard_start + used_number, + guard_number - used_number, &used, sector_size, + fn); + if (rc) + break; + } + + LASSERT(used <= MAX_GUARD_NUMBER); + /* + * If disk support T10PI checksum, copy guards to local_nb. + * If the write is partial page, do not use the guards for bio + * submission since the data might not be full-sector. The bio + * guards will be generated later based on the full sectors. If + * the sector size is 512B rather than 4 KB, or the page size + * is larger than 4KB, this might drop some useful guards for + * partial page write, but it will only add minimal extra time + * of checksum calculation. + */ + if (t10_cksum_type && opc == OST_WRITE && + local_nb[i].lnb_len == PAGE_SIZE) { + local_nb[i].lnb_guard_rpc = 1; + memcpy(local_nb[i].lnb_guards, + guard_start + used_number, + used * sizeof(*local_nb[i].lnb_guards)); + } + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND))) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + rc = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + if (rc == 0) + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} + +static int tgt_checksum_niobuf_rw(struct lu_target *tgt, + enum cksum_types cksum_type, + struct niobuf_local *local_nb, + int npages, int opc, u32 *check_sum) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, &sector_size); + + if (fn) + rc = tgt_checksum_niobuf_t10pi(tgt, local_nb, npages, + opc, fn, sector_size, + check_sum); + else + rc = tgt_checksum_niobuf(tgt, local_nb, npages, opc, + cksum_type, check_sum); + RETURN(rc); +} + +int tgt_brw_read(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct ptlrpc_bulk_desc *desc = NULL; + struct obd_export *exp = tsi->tsi_exp; + struct niobuf_remote *remote_nb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body, *repbody; + struct l_wait_info lwi; + struct lustre_handle lockh = { 0 }; + int npages, nob = 0, rc, i, no_reply = 0, + npages_read; + struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + const char *obd_name = exp->exp_obd->obd_name; + + ENTRY; + + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + 
ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { + CERROR("%s: deny read request from %s to portal %u\n", + tgt_name(tsi->tsi_tgt), + obd_export_nid2str(req->rq_export), + ptlrpc_req2svc(req)->srv_req_portal); + RETURN(-EPROTO); + } + + req->rq_bulk_read = 1; + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) + RETURN(-EIO); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? + cfs_fail_val : (obd_timeout + 1) / 4); + + /* Check if there is eviction in progress, and if so, wait for it to + * finish */ + if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) { + /* We do not care how long it takes */ + lwi = LWI_INTR(NULL, NULL); + rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq, + !atomic_read(&exp->exp_obd->obd_evict_inprogress), + &lwi); + } + + /* There must be big cache in current thread to process this request + * if it is NULL then something went wrong and it wasn't allocated, + * report -ENOMEM in that case */ + if (tbc == NULL) + RETURN(-ENOMEM); + + body = tsi->tsi_ost_body; + LASSERT(body != NULL); + + ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exists after tgt_ost_body_unpack */ + + remote_nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(remote_nb != NULL); /* must exists after tgt_ost_body_unpack */ + + local_nb = tbc->local; + + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PR); + if (rc != 0) + RETURN(rc); + + /* + * If getting the lock took more time than + * client was willing to wait, drop it. b=11330 + */ + if (ktime_get_real_seconds() > req->rq_deadline || + OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { + no_reply = 1; + CERROR("Dropping timed-out read from %s because locking object " DOSTID " took %lld seconds (limit was %lld).\n", + libcfs_id2str(req->rq_peer), POSTID(&ioo->ioo_oid), + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); + GOTO(out_lock, rc = -ETIMEDOUT); + } + + /* + * Because we already sync grant info with client when + * reconnect, grant info will be cleared for resent req, + * otherwise, outdated grant count in the rpc would de-sync + * grant counters in case of shrink + */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + repbody->oa = body->oa; + + npages = PTLRPC_MAX_BRW_PAGES; + rc = obd_preprw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, + ioo, remote_nb, &npages, local_nb); + if (rc != 0) + GOTO(out_lock, rc); + + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_PUT_SOURCE | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_commitrw, rc = -ENOMEM); + } + + nob = 0; + npages_read = npages; + for (i = 0; i < npages; i++) { + int page_rc = local_nb[i].lnb_rc; + + if (page_rc < 0) { + rc = page_rc; + npages_read = i; + break; + } + + nob += page_rc; + if (page_rc != 0 && desc != NULL) { /* some data! 
*/ + LASSERT(local_nb[i].lnb_page != NULL); + desc->bd_frag_ops->add_kiov_frag + (desc, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + page_rc); + } + + if (page_rc != local_nb[i].lnb_len) { /* short read */ + local_nb[i].lnb_len = page_rc; + npages_read = i + (page_rc != 0 ? 1 : 0); + /* All subsequent pages should be 0 */ + while (++i < npages) + LASSERT(local_nb[i].lnb_rc == 0); + break; + } + } + if (OBD_FAIL_CHECK(OBD_FAIL_OST_READ_SIZE) && + nob != cfs_fail_val) + rc = -E2BIG; + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + u32 flag = body->oa.o_valid & OBD_MD_FLFLAGS ? + body->oa.o_flags : 0; + enum cksum_types cksum_type = obd_cksum_type_unpack(flag); + + repbody->oa.o_flags = obd_cksum_type_pack(obd_name, + cksum_type); + repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages_read, OST_READ, + &repbody->oa.o_cksum); + if (rc < 0) + GOTO(out_commitrw, rc); + CDEBUG(D_PAGE, "checksum at read origin: %x\n", + repbody->oa.o_cksum); + + /* if a resend it could be for a cksum error, so check Server + * cksum with returned Client cksum (this should even cover + * zero-cksum case) */ + if ((body->oa.o_valid & OBD_MD_FLFLAGS) && + (body->oa.o_flags & OBD_FL_RECOV_RESEND)) + check_read_checksum(local_nb, npages_read, exp, + &body->oa, &req->rq_peer, + body->oa.o_cksum, + repbody->oa.o_cksum, cksum_type); + } else { + repbody->oa.o_valid = 0; + } + if (body->oa.o_valid & OBD_MD_FLGRANT) + repbody->oa.o_valid |= OBD_MD_FLGRANT; + /* We're finishing using body->oa as an input variable */ + + /* Check if client was evicted while we were doing i/o before touching + * network */ + if (rc == 0) { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned char *short_io_buf; + int short_io_size; + + short_io_buf = req_capsule_server_get(&req->rq_pill, + &RMF_SHORT_IO); + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_SERVER); + rc = tgt_pages2shortio(local_nb, npages_read, + short_io_buf, short_io_size); + if (rc >= 0) + req_capsule_shrink(&req->rq_pill, + &RMF_SHORT_IO, rc, + RCL_SERVER); + rc = rc > 0 ? 0 : rc; + } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) { + rc = target_bulk_io(exp, desc, &lwi); + } + no_reply = rc != 0; + } else { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) + req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0, + RCL_SERVER); + } + +out_commitrw: + /* Must commit after prep above in all cases */ + rc = obd_commitrw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, ioo, + remote_nb, npages, local_nb, rc); +out_lock: + tgt_brw_unlock(ioo, remote_nb, &lockh, LCK_PR); + + if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) + ptlrpc_free_bulk(desc); + + LASSERT(rc <= 0); + if (rc == 0) { + rc = nob; + ptlrpc_lprocfs_brw(req, nob); + } else if (no_reply) { + req->rq_no_reply = 1; + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); + LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), " + "client will retry: rc %d\n", + obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); + } + /* send a bulk after reply to simulate a network delay or reordering + * by a router - Note that !desc implies short io, so there is no bulk + * to reorder. 
*/ + if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) && + desc) { + wait_queue_head_t waitq; + struct l_wait_info lwi1; + + CDEBUG(D_INFO, "reorder BULK\n"); + init_waitqueue_head(&waitq); + + lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL); + l_wait_event(waitq, 0, &lwi1); + target_bulk_io(exp, desc, &lwi); + ptlrpc_free_bulk(desc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_brw_read); + +static int tgt_shortio2pages(struct niobuf_local *local, int npages, + unsigned char *buf, unsigned int size) +{ + int i, off, len; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + if (len == 0) + continue; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0); + if (ptr == NULL) + return -EINVAL; + memcpy(ptr + off, buf, len < size ? len : size); + ll_kunmap_atomic(ptr, KM_USER0); + buf += len; + size -= len; + } + return 0; +} + +static void tgt_warn_on_cksum(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + struct niobuf_local *local_nb, int npages, + u32 client_cksum, u32 server_cksum, + bool mmap) +{ + struct obd_export *exp = req->rq_export; + struct ost_body *body; + char *router = ""; + char *via = ""; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body != NULL); + + if (desc && req->rq_peer.nid != desc->bd_sender) { + via = " via "; + router = libcfs_nid2str(desc->bd_sender); + } + + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum, + client_cksum); + + if (mmap) { + CDEBUG_LIMIT(D_INFO, "client csum %x, server csum %x\n", + client_cksum, server_cksum); + return; + } + + LCONSOLE_ERROR_MSG(0x168, "%s: BAD WRITE CHECKSUM: from %s%s%s inode " + DFID" object "DOSTID" extent [%llu-%llu" + "]: client csum %x, server csum %x\n", + exp->exp_obd->obd_name, libcfs_id2str(req->rq_peer), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? 
+ body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + local_nb[0].lnb_file_offset, + local_nb[npages-1].lnb_file_offset + + local_nb[npages - 1].lnb_len - 1, + client_cksum, server_cksum); +} + +int tgt_brw_write(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct ptlrpc_bulk_desc *desc = NULL; + struct obd_export *exp = req->rq_export; + struct niobuf_remote *remote_nb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body, *repbody; + struct l_wait_info lwi; + struct lustre_handle lockh = {0}; + __u32 *rcs; + int objcount, niocount, npages; + int rc, i, j; + enum cksum_types cksum_type = OBD_CKSUM_CRC32; + bool no_reply = false, mmap; + struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + bool wait_sync = false; + const char *obd_name = exp->exp_obd->obd_name; + + ENTRY; + + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { + CERROR("%s: deny write request from %s to portal %u\n", + tgt_name(tsi->tsi_tgt), + obd_export_nid2str(req->rq_export), + ptlrpc_req2svc(req)->srv_req_portal); + RETURN(err_serious(-EPROTO)); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOSPC)) + RETURN(err_serious(-ENOSPC)); + if (OBD_FAIL_TIMEOUT(OBD_FAIL_OST_EROFS, 1)) + RETURN(err_serious(-EROFS)); + + req->rq_bulk_write = 1; + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) + RETURN(err_serious(-EIO)); + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK2)) + RETURN(err_serious(-EFAULT)); + + /* pause before transaction has been started */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? + cfs_fail_val : (obd_timeout + 1) / 4); + + /* Delay write commit to show stale size information */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val); + + /* There must be big cache in current thread to process this request + * if it is NULL then something went wrong and it wasn't allocated, + * report -ENOMEM in that case */ + if (tbc == NULL) + RETURN(-ENOMEM); + + body = tsi->tsi_ost_body; + LASSERT(body != NULL); + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exists after tgt_ost_body_unpack */ + + objcount = req_capsule_get_size(&req->rq_pill, &RMF_OBD_IOOBJ, + RCL_CLIENT) / sizeof(*ioo); + + for (niocount = i = 0; i < objcount; i++) + niocount += ioo[i].ioo_bufcnt; + + remote_nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(remote_nb != NULL); /* must exists after tgt_ost_body_unpack */ + if (niocount != req_capsule_get_size(&req->rq_pill, + &RMF_NIOBUF_REMOTE, RCL_CLIENT) / + sizeof(*remote_nb)) + RETURN(err_serious(-EPROTO)); + + if ((remote_nb[0].rnb_flags & OBD_BRW_MEMALLOC) && + ptlrpc_connection_is_local(exp->exp_connection)) + memory_pressure_set(); + + req_capsule_set_size(&req->rq_pill, &RMF_RCS, RCL_SERVER, + niocount * sizeof(*rcs)); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc != 0) + GOTO(out, rc = err_serious(rc)); + + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_PACK, cfs_fail_val); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + + local_nb = tbc->local; + + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PW); + if (rc != 0) + GOTO(out, rc); + + /* + * If getting the lock took more time than + * client was willing to wait, drop it. 
b=11330 + */ + if (ktime_get_real_seconds() > req->rq_deadline || + OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { + no_reply = true; + CERROR("%s: Dropping timed-out write from %s because locking object " DOSTID " took %lld seconds (limit was %lld).\n", + tgt_name(tsi->tsi_tgt), libcfs_id2str(req->rq_peer), + POSTID(&ioo->ioo_oid), + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); + GOTO(out_lock, rc = -ETIMEDOUT); + } + + /* Because we already sync grant info with client when reconnect, + * grant info will be cleared for resent req, then fed_grant and + * total_grant will not be modified in following preprw_write */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (repbody == NULL) + GOTO(out_lock, rc = -ENOMEM); + repbody->oa = body->oa; + + npages = PTLRPC_MAX_BRW_PAGES; + rc = obd_preprw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, + objcount, ioo, remote_nb, &npages, local_nb); + if (rc < 0) + GOTO(out_lock, rc); + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned int short_io_size; + unsigned char *short_io_buf; + + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_CLIENT); + short_io_buf = req_capsule_client_get(&req->rq_pill, + &RMF_SHORT_IO); + CDEBUG(D_INFO, "Client use short io for data transfer," + " size = %d\n", short_io_size); + + /* Copy short io buf to pages */ + rc = tgt_shortio2pages(local_nb, npages, short_io_buf, + short_io_size); + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_GET_SINK | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(skip_transfer, rc = -ENOMEM); + + /* NB Having prepped, we must commit... 
*/ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); + + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(skip_transfer, rc); + + rc = target_bulk_io(exp, desc, &lwi); + } + + no_reply = rc != 0; + +skip_transfer: + if (body->oa.o_valid & OBD_MD_FLCKSUM && rc == 0) { + static int cksum_counter; + + if (body->oa.o_valid & OBD_MD_FLFLAGS) + cksum_type = obd_cksum_type_unpack(body->oa.o_flags); + + repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL; + repbody->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages, OST_WRITE, + &repbody->oa.o_cksum); + if (rc < 0) + GOTO(out_commitrw, rc); + + cksum_counter++; + + if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) { + mmap = (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_MMAP); + + tgt_warn_on_cksum(req, desc, local_nb, npages, + body->oa.o_cksum, + repbody->oa.o_cksum, mmap); + cksum_counter = 0; + } else if ((cksum_counter & (-cksum_counter)) == + cksum_counter) { + CDEBUG(D_INFO, "Checksum %u from %s OK: %x\n", + cksum_counter, libcfs_id2str(req->rq_peer), + repbody->oa.o_cksum); + } + } + +out_commitrw: + /* Must commit after prep above in all cases */ + rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, + objcount, ioo, remote_nb, npages, local_nb, rc); + if (rc == -ENOTCONN) + /* quota acquire process has been given up because + * either the client has been evicted or the client + * has timed out the request already */ + no_reply = true; + + for (i = 0; i < niocount; i++) { + if (!(local_nb[i].lnb_flags & OBD_BRW_ASYNC)) { + wait_sync = true; + break; + } + } + /* + * Disable sending mtime back to the client. 
If the client locked the + * whole object, then it has already updated the mtime on its side, + * otherwise it will have to glimpse anyway (see bug 21489, comment 32) + */ + repbody->oa.o_valid &= ~(OBD_MD_FLMTIME | OBD_MD_FLATIME); + + if (rc == 0) { + int nob = 0; + + /* set per-requested niobuf return codes */ + for (i = j = 0; i < niocount; i++) { + int len = remote_nb[i].rnb_len; + + nob += len; + rcs[i] = 0; + do { + LASSERT(j < npages); + if (local_nb[j].lnb_rc < 0) + rcs[i] = local_nb[j].lnb_rc; + len -= local_nb[j].lnb_len; + j++; + } while (len > 0); + LASSERT(len == 0); + } + LASSERT(j == npages); + ptlrpc_lprocfs_brw(req, nob); + } +out_lock: + tgt_brw_unlock(ioo, remote_nb, &lockh, LCK_PW); + if (desc) + ptlrpc_free_bulk(desc); +out: + if (unlikely(no_reply || (exp->exp_obd->obd_no_transno && wait_sync))) { + req->rq_no_reply = 1; + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); + if (!exp->exp_obd->obd_no_transno) + LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s)," + " client will retry: rc = %d\n", + obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); + } + memory_pressure_clr(); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_brw_write); + +/* Check if request can be reconstructed from saved reply data + * A copy of the reply data is returned in @trd if the pointer is not NULL + */ +bool req_can_reconstruct(struct ptlrpc_request *req, + struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct lsd_client_data *lcd = ted->ted_lcd; + bool found; + + if (tgt_is_multimodrpcs_client(req->rq_export)) + return tgt_lookup_reply(req, trd); + + mutex_lock(&ted->ted_lcd_lock); + found = req->rq_xid == lcd->lcd_last_xid || + req->rq_xid == lcd->lcd_last_close_xid; + + if (found && trd != NULL) { + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { + trd->trd_reply.lrd_xid = lcd->lcd_last_close_xid; + trd->trd_reply.lrd_transno = + lcd->lcd_last_close_transno; + trd->trd_reply.lrd_result = lcd->lcd_last_close_result; + } else { + trd->trd_reply.lrd_xid = lcd->lcd_last_xid; + trd->trd_reply.lrd_transno = lcd->lcd_last_transno; + trd->trd_reply.lrd_result = lcd->lcd_last_result; + trd->trd_reply.lrd_data = lcd->lcd_last_data; + trd->trd_pre_versions[0] = lcd->lcd_pre_versions[0]; + trd->trd_pre_versions[1] = lcd->lcd_pre_versions[1]; + trd->trd_pre_versions[2] = lcd->lcd_pre_versions[2]; + trd->trd_pre_versions[3] = lcd->lcd_pre_versions[3]; + } + } + mutex_unlock(&ted->ted_lcd_lock); + + return found; +} +EXPORT_SYMBOL(req_can_reconstruct); + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h new file mode 100644 index 0000000000000..ac7c3c17feb9d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h @@ -0,0 +1,305 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/target/tgt_internal.h + * + * Lustre Unified Target header file + * + * Author: Mikhail Pershin + */ + +#ifndef _TG_INTERNAL_H +#define _TG_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include + +extern int (*tgt_lfsck_in_notify_local)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_req_local *lrl, + struct thandle *th); +/** + * Common data shared by tg-level handlers. This is allocated per-thread to + * reduce stack consumption. + */ +struct tgt_thread_info { + /* server and client data buffers */ + struct lr_server_data tti_lsd; + struct lsd_client_data tti_lcd; + struct lsd_reply_data tti_lrd; + struct lu_buf tti_buf; + loff_t tti_off; + + struct lu_attr tti_attr; + struct lu_fid tti_fid1; + + /* transno storage during last_rcvd update */ + __u64 tti_transno; + __u32 tti_has_trans:1, + tti_mult_trans:1; + + /* Updates data for OUT target */ + struct thandle_exec_args tti_tea; + union { + struct { + /* for tgt_readpage() */ + struct lu_rdpg tti_rdpg; + /* for tgt_sendpage() */ + struct l_wait_info tti_wait_info; + } rdpg; + struct { + struct dt_object_format tti_update_dof; + struct object_update_reply *tti_update_reply; + struct object_update *tti_update; + int tti_update_reply_index; + struct obdo tti_obdo; + struct dt_object *tti_dt_object; + struct l_wait_info tti_wait_info; + } update; + struct obd_statfs osfs; /* for obd_statfs() in OFD/MDT */ + } tti_u; + struct lfsck_req_local tti_lrl; + struct dt_insert_rec tti_rec; +}; + +extern struct lu_context_key tgt_thread_key; + +static inline struct tgt_thread_info *tgt_th_info(const struct lu_env *env) +{ + struct tgt_thread_info *tti; + + tti = lu_context_key_get(&env->le_ctx, &tgt_thread_key); + LASSERT(tti); + return tti; +} + +#define MGS_SERVICE_WATCHDOG_FACTOR (2) + +int tgt_request_handle(struct ptlrpc_request *req); + +/* check if request's xid is equal to last one or not*/ +static inline int req_xid_is_last(struct ptlrpc_request *req) +{ + struct lsd_client_data *lcd = req->rq_export->exp_target_data.ted_lcd; + + LASSERT(lcd != NULL); + return (req->rq_xid == lcd->lcd_last_xid || + req->rq_xid == lcd->lcd_last_close_xid); +} + +static inline char *dt_obd_name(struct dt_device *dt) +{ + return dt->dd_lu_dev.ld_obd->obd_name; +} + +/* out_lib.c */ +int out_tx_create_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg); +struct tx_arg *tx_add_exec(struct thandle_exec_args *ta, + tx_exec_func_t func, tx_exec_func_t undo, + const char *file, int line); + +int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, + struct lu_attr *attr, struct lu_fid *parent_fid, + struct dt_object_format *dof, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_attr *attr, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_write_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, loff_t pos, + struct thandle_exec_args *ta, struct thandle 
*th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_xattr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, const char *name, + int flags, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_xattr_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const char *name, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_ref_add_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_ref_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_index_insert_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_index_delete_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_destroy_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +/* Update handlers */ +int out_handle(struct tgt_session_info *tsi); + +#define out_tx_create(env, obj, attr, fid, dof, ta, th, reply, idx) \ + out_create_add_exec(env, obj, attr, fid, dof, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_attr_set(env, obj, attr, ta, th, reply, idx) \ + out_attr_set_add_exec(env, obj, attr, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_xattr_set(env, obj, buf, name, fl, ta, th, reply, idx) \ + out_xattr_set_add_exec(env, obj, buf, name, fl, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_xattr_del(env, obj, name, ta, th, reply, idx) \ + out_xattr_del_add_exec(env, obj, name, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_ref_add(env, obj, ta, th, reply, idx) \ + out_ref_add_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_ref_del(env, obj, ta, th, reply, idx) \ + out_ref_del_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_index_insert(env, obj, rec, key, ta, th, reply, idx) \ + out_index_insert_add_exec(env, obj, rec, key, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_index_delete(env, obj, key, ta, th, reply, idx) \ + out_index_delete_add_exec(env, obj, key, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_destroy(env, obj, ta, th, reply, idx) \ + out_destroy_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_write(env, obj, buf, pos, ta, th, reply, idx) \ + out_write_add_exec(env, obj, buf, pos, ta, th, reply, idx,\ + __FILE__, __LINE__) + +const char *update_op_str(__u16 opcode); + +extern struct page *tgt_page_to_corrupt; + +int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt); +int tgt_txn_start_cb(const struct lu_env *env, struct 
thandle *th, + void *cookie); +int tgt_txn_stop_cb(const struct lu_env *env, struct thandle *th, + void *cookie); +int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid); +int tgt_handle_tag(struct obd_export *exp, __u16 tag); + +void update_records_dump(const struct update_records *records, + unsigned int mask, bool dump_updates); +int check_and_prepare_update_record(const struct lu_env *env, + struct thandle_update_records *tur); +struct update_thread_info { + struct lu_attr uti_attr; + struct lu_fid uti_fid; + struct lu_buf uti_buf; + struct thandle_update_records uti_tur; + struct obdo uti_obdo; + struct thandle_exec_args uti_tea; + struct dt_insert_rec uti_rec; + struct distribute_txn_replay_req *uti_dtrq; +}; + +extern struct lu_context_key update_thread_key; + +static inline struct update_thread_info * +update_env_info(const struct lu_env *env) +{ + struct update_thread_info *uti; + + uti = lu_context_key_get(&env->le_ctx, &update_thread_key); + LASSERT(uti != NULL); + return uti; +} + +void update_info_init(void); +void update_info_fini(void); +struct sub_thandle *create_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev); +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st); +void distribute_txn_insert_by_batchid(struct top_multiple_thandle *new); +int top_trans_create_tmt(const struct lu_env *env, + struct top_thandle *top_th); + +void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno); +void barrier_init(void); +void barrier_fini(void); + +/* FMD tracking data */ +struct tgt_fmd_data { + struct list_head fmd_list; /* linked to tgt_fmd_list */ + struct lu_fid fmd_fid; /* FID being written to */ + __u64 fmd_mactime_xid; /* xid highest {m,a,c}time setattr */ + time64_t fmd_expire; /* time when the fmd should expire */ + int fmd_refcount; /* reference counter - list holds 1 */ +}; + +/* tgt_fmd.c */ +extern struct kmem_cache *tgt_fmd_kmem; +void tgt_fmd_expire(struct obd_export *exp); +void tgt_fmd_cleanup(struct obd_export *exp); + +#endif /* _TG_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c new file mode 100644 index 0000000000000..0d2fde1be1bc3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c @@ -0,0 +1,2128 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * Lustre Unified Target + * These are common function to work with last_received file + * + * Author: Mikhail Pershin + */ +#include +#include +#include + +#include "tgt_internal.h" + +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 + +/* Allocate a bitmap for a chunk of reply data slots */ +static int tgt_bitmap_chunk_alloc(struct lu_target *lut, int chunk) +{ + unsigned long *bm; + + OBD_ALLOC_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + if (bm == NULL) + return -ENOMEM; + + spin_lock(&lut->lut_client_bitmap_lock); + + if (lut->lut_reply_bitmap[chunk] != NULL) { + /* someone else already allocated the bitmap for this chunk */ + spin_unlock(&lut->lut_client_bitmap_lock); + OBD_FREE_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + return 0; + } + + lut->lut_reply_bitmap[chunk] = bm; + + spin_unlock(&lut->lut_client_bitmap_lock); + + return 0; +} + +/* Look for an available reply data slot in the bitmap + * of the target @lut + * Allocate bitmap chunk when first used + * XXX algo could be improved if this routine limits performance + */ +static int tgt_find_free_reply_slot(struct lu_target *lut) +{ + unsigned long *bmp; + int chunk = 0; + int rc; + int b; + + for (chunk = 0; chunk < LUT_REPLY_SLOTS_MAX_CHUNKS; chunk++) { + /* allocate the bitmap chunk if necessary */ + if (unlikely(lut->lut_reply_bitmap[chunk] == NULL)) { + rc = tgt_bitmap_chunk_alloc(lut, chunk); + if (rc != 0) + return rc; + } + bmp = lut->lut_reply_bitmap[chunk]; + + /* look for an available slot in this chunk */ + do { + b = find_first_zero_bit(bmp, LUT_REPLY_SLOTS_PER_CHUNK); + if (b >= LUT_REPLY_SLOTS_PER_CHUNK) + break; + + /* found one */ + if (test_and_set_bit(b, bmp) == 0) + return chunk * LUT_REPLY_SLOTS_PER_CHUNK + b; + } while (true); + } + + return -ENOSPC; +} + +/* Mark the reply data slot @idx 'used' in the corresponding bitmap chunk + * of the target @lut + * Allocate the bitmap chunk if necessary + */ +static int tgt_set_reply_slot(struct lu_target *lut, int idx) +{ + int chunk; + int b; + int rc; + + chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; + b = idx % LUT_REPLY_SLOTS_PER_CHUNK; + + LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS); + LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK); + + /* allocate the bitmap chunk if necessary */ + if (unlikely(lut->lut_reply_bitmap[chunk] == NULL)) { + rc = tgt_bitmap_chunk_alloc(lut, chunk); + if (rc != 0) + return rc; + } + + /* mark the slot 'used' in this chunk */ + if (test_and_set_bit(b, lut->lut_reply_bitmap[chunk]) != 0) { + CERROR("%s: slot %d already set in bitmap\n", + tgt_name(lut), idx); + return -EALREADY; + } + + return 0; +} + + +/* Mark the reply data slot @idx 'unused' in the corresponding bitmap chunk + * of the target @lut + */ +static int tgt_clear_reply_slot(struct lu_target *lut, int idx) +{ + int chunk; + int b; + + if (lut->lut_obd->obd_stopping) + /* + * in case of failover keep the bit set in order to + * avoid overwriting slots in reply_data which might + * be required by resent rpcs + */ + return 0; + chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; + b = idx % LUT_REPLY_SLOTS_PER_CHUNK; + + LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS); + LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK); + + if (lut->lut_reply_bitmap[chunk] == NULL) { + CERROR("%s: slot %d not allocated\n", + tgt_name(lut), idx); + return -ENOENT; + } + + if (test_and_clear_bit(b, lut->lut_reply_bitmap[chunk]) == 0) { + CERROR("%s: slot %d already clear in bitmap\n", + tgt_name(lut), idx); + return -EALREADY; + } + + return 0; +} + + +/* Read 
header of reply_data file of target @tgt into structure @lrh */ +static int tgt_reply_header_read(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_reply_header *lrh) +{ + int rc; + struct lsd_reply_header buf; + struct tgt_thread_info *tti = tgt_th_info(env); + + tti->tti_off = 0; + tti->tti_buf.lb_buf = &buf; + tti->tti_buf.lb_len = sizeof(buf); + + rc = dt_record_read(env, tgt->lut_reply_data, &tti->tti_buf, + &tti->tti_off); + if (rc != 0) + return rc; + + lrh->lrh_magic = le32_to_cpu(buf.lrh_magic); + lrh->lrh_header_size = le32_to_cpu(buf.lrh_header_size); + lrh->lrh_reply_size = le32_to_cpu(buf.lrh_reply_size); + + CDEBUG(D_HA, "%s: read %s header. magic=0x%08x " + "header_size=%d reply_size=%d\n", + tgt->lut_obd->obd_name, REPLY_DATA, + lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + + return 0; +} + +/* Write header into replay_data file of target @tgt from structure @lrh */ +static int tgt_reply_header_write(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_reply_header *lrh) +{ + int rc; + struct lsd_reply_header buf; + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + struct dt_object *dto; + + CDEBUG(D_HA, "%s: write %s header. magic=0x%08x " + "header_size=%d reply_size=%d\n", + tgt->lut_obd->obd_name, REPLY_DATA, + lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + buf.lrh_magic = cpu_to_le32(lrh->lrh_magic); + buf.lrh_header_size = cpu_to_le32(lrh->lrh_header_size); + buf.lrh_reply_size = cpu_to_le32(lrh->lrh_reply_size); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + return PTR_ERR(th); + th->th_sync = 1; + + tti->tti_off = 0; + tti->tti_buf.lb_buf = &buf; + tti->tti_buf.lb_len = sizeof(buf); + + rc = dt_declare_record_write(env, tgt->lut_reply_data, + &tti->tti_buf, tti->tti_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + rc = dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); +out: + dt_trans_stop(env, tgt->lut_bottom, th); + return rc; +} + +/* Write the reply data @lrd into reply_data file of target @tgt + * at offset @off + */ +static int tgt_reply_data_write(const struct lu_env *env, struct lu_target *tgt, + struct lsd_reply_data *lrd, loff_t off, + struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + struct lsd_reply_data *buf = &tti->tti_lrd; + + lrd->lrd_result = ptlrpc_status_hton(lrd->lrd_result); + + buf->lrd_transno = cpu_to_le64(lrd->lrd_transno); + buf->lrd_xid = cpu_to_le64(lrd->lrd_xid); + buf->lrd_data = cpu_to_le64(lrd->lrd_data); + buf->lrd_result = cpu_to_le32(lrd->lrd_result); + buf->lrd_client_gen = cpu_to_le32(lrd->lrd_client_gen); + + lrd->lrd_result = ptlrpc_status_ntoh(lrd->lrd_result); + + tti->tti_off = off; + tti->tti_buf.lb_buf = buf; + tti->tti_buf.lb_len = sizeof(*buf); + + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + return dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); +} + +/* Read the reply data from reply_data file of target @tgt at offset @off + * into structure @lrd + */ +static int tgt_reply_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_reply_data *lrd, loff_t off) +{ + int rc; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lsd_reply_data *buf = &tti->tti_lrd; + + tti->tti_off = off; + tti->tti_buf.lb_buf = buf; + tti->tti_buf.lb_len = 
sizeof(*buf); + + rc = dt_record_read(env, tgt->lut_reply_data, &tti->tti_buf, + &tti->tti_off); + if (rc != 0) + return rc; + + lrd->lrd_transno = le64_to_cpu(buf->lrd_transno); + lrd->lrd_xid = le64_to_cpu(buf->lrd_xid); + lrd->lrd_data = le64_to_cpu(buf->lrd_data); + lrd->lrd_result = le32_to_cpu(buf->lrd_result); + lrd->lrd_client_gen = le32_to_cpu(buf->lrd_client_gen); + + return 0; +} + + +/* Free the in-memory reply data structure @trd and release + * the corresponding slot in the reply_data file of target @lut + * Called with ted_lcd_lock held + */ +static void tgt_free_reply_data(struct lu_target *lut, + struct tg_export_data *ted, + struct tg_reply_data *trd) +{ + CDEBUG(D_TRACE, "%s: free reply data %p: xid %llu, transno %llu, " + "client gen %u, slot idx %d\n", + lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid, + trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen, + trd->trd_index); + + LASSERT(mutex_is_locked(&ted->ted_lcd_lock)); + + list_del(&trd->trd_list); + ted->ted_reply_cnt--; + if (lut != NULL) + tgt_clear_reply_slot(lut, trd->trd_index); + OBD_FREE_PTR(trd); +} + +/* Release the reply data @trd from target @lut + * The reply data with the highest transno for this export + * is retained to ensure correctness of target recovery + * Called with ted_lcd_lock held + */ +static void tgt_release_reply_data(struct lu_target *lut, + struct tg_export_data *ted, + struct tg_reply_data *trd) +{ + CDEBUG(D_TRACE, "%s: release reply data %p: xid %llu, transno %llu, " + "client gen %u, slot idx %d\n", + lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid, + trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen, + trd->trd_index); + + LASSERT(mutex_is_locked(&ted->ted_lcd_lock)); + + /* Do not free the reply data corresponding to the + * highest transno of this export. + * This ensures on-disk reply data is kept and + * last committed transno can be restored from disk in case + * of target recovery + */ + if (trd->trd_reply.lrd_transno == ted->ted_lcd->lcd_last_transno) { + /* free previous retained reply */ + if (ted->ted_reply_last != NULL) + tgt_free_reply_data(lut, ted, ted->ted_reply_last); + /* retain the reply */ + list_del_init(&trd->trd_list); + ted->ted_reply_last = trd; + } else { + tgt_free_reply_data(lut, ted, trd); + } +} + +static inline struct lu_buf *tti_buf_lsd(struct tgt_thread_info *tti) +{ + tti->tti_buf.lb_buf = &tti->tti_lsd; + tti->tti_buf.lb_len = sizeof(tti->tti_lsd); + return &tti->tti_buf; +} + +static inline struct lu_buf *tti_buf_lcd(struct tgt_thread_info *tti) +{ + tti->tti_buf.lb_buf = &tti->tti_lcd; + tti->tti_buf.lb_len = sizeof(tti->tti_lcd); + return &tti->tti_buf; +} + +/** + * Allocate in-memory data for client slot related to export. + */ +int tgt_client_alloc(struct obd_export *exp) +{ + ENTRY; + LASSERT(exp != exp->exp_obd->obd_self_export); + + spin_lock_init(&exp->exp_target_data.ted_nodemap_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member); + spin_lock_init(&exp->exp_target_data.ted_fmd_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_fmd_list); + + OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd); + if (exp->exp_target_data.ted_lcd == NULL) + RETURN(-ENOMEM); + /* Mark that slot is not yet valid, 0 doesn't work here */ + exp->exp_target_data.ted_lr_idx = -1; + INIT_LIST_HEAD(&exp->exp_target_data.ted_reply_list); + mutex_init(&exp->exp_target_data.ted_lcd_lock); + RETURN(0); +} +EXPORT_SYMBOL(tgt_client_alloc); + +/** + * Free in-memory data for client slot related to export. 
+ */ +void tgt_client_free(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + LASSERT(exp != exp->exp_obd->obd_self_export); + + tgt_fmd_cleanup(exp); + + /* free reply data */ + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + tgt_release_reply_data(lut, ted, trd); + } + if (ted->ted_reply_last != NULL) { + tgt_free_reply_data(lut, ted, ted->ted_reply_last); + ted->ted_reply_last = NULL; + } + mutex_unlock(&ted->ted_lcd_lock); + + if (!hlist_unhashed(&exp->exp_gen_hash)) + cfs_hash_del(exp->exp_obd->obd_gen_hash, + &ted->ted_lcd->lcd_generation, + &exp->exp_gen_hash); + + OBD_FREE_PTR(ted->ted_lcd); + ted->ted_lcd = NULL; + + /* Target may have been freed (see LU-7430) + * Slot may be not yet assigned */ + if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC || + ted->ted_lr_idx < 0) + return; + + /* Clear bit when lcd is freed */ + LASSERT(lut && lut->lut_client_bitmap); + if (!test_and_clear_bit(ted->ted_lr_idx, lut->lut_client_bitmap)) { + CERROR("%s: client %u bit already clear in bitmap\n", + exp->exp_obd->obd_name, ted->ted_lr_idx); + LBUG(); + } + + if (tgt_is_multimodrpcs_client(exp) && !exp->exp_obd->obd_stopping) + atomic_dec(&lut->lut_num_clients); +} +EXPORT_SYMBOL(tgt_client_free); + +static inline void tgt_check_lcd(const char *obd_name, int index, + struct lsd_client_data *lcd) +{ + size_t uuid_size = sizeof(lcd->lcd_uuid); + + if (strnlen((char*)lcd->lcd_uuid, uuid_size) == uuid_size) { + lcd->lcd_uuid[uuid_size - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +static int tgt_client_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_client_data *lcd, + loff_t *off, int index) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + int rc; + + tti_buf_lcd(tti); + rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, off); + if (rc == 0) { + tgt_check_lcd(tgt->lut_obd->obd_name, index, &tti->tti_lcd); + lcd_le_to_cpu(&tti->tti_lcd, lcd); + lcd->lcd_last_result = ptlrpc_status_ntoh(lcd->lcd_last_result); + lcd->lcd_last_close_result = + ptlrpc_status_ntoh(lcd->lcd_last_close_result); + } + + CDEBUG(D_INFO, "%s: read lcd @%lld uuid = %s, last_transno = %llu" + ", last_xid = %llu, last_result = %u, last_data = %u, " + "last_close_transno = %llu, last_close_xid = %llu, " + "last_close_result = %u, rc = %d\n", tgt->lut_obd->obd_name, + *off, lcd->lcd_uuid, lcd->lcd_last_transno, lcd->lcd_last_xid, + lcd->lcd_last_result, lcd->lcd_last_data, + lcd->lcd_last_close_transno, lcd->lcd_last_close_xid, + lcd->lcd_last_close_result, rc); + return rc; +} + +static int tgt_client_data_write(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_client_data *lcd, + loff_t *off, struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + + lcd->lcd_last_result = ptlrpc_status_hton(lcd->lcd_last_result); + lcd->lcd_last_close_result = + ptlrpc_status_hton(lcd->lcd_last_close_result); + lcd_cpu_to_le(lcd, &tti->tti_lcd); + tti_buf_lcd(tti); + + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + return dt_record_write(env, dto, &tti->tti_buf, off, th); +} + +struct tgt_new_client_callback { + struct dt_txn_commit_cb lncc_cb; + struct obd_export *lncc_exp; +}; + +static void tgt_cb_new_client(struct lu_env *env, struct thandle *th, + struct 
dt_txn_commit_cb *cb, int err) +{ + struct tgt_new_client_callback *ccb; + + ccb = container_of0(cb, struct tgt_new_client_callback, lncc_cb); + + LASSERT(ccb->lncc_exp->exp_obd); + + CDEBUG(D_RPCTRACE, "%s: committing for initial connect of %s\n", + ccb->lncc_exp->exp_obd->obd_name, + ccb->lncc_exp->exp_client_uuid.uuid); + + spin_lock(&ccb->lncc_exp->exp_lock); + + ccb->lncc_exp->exp_need_sync = 0; + + spin_unlock(&ccb->lncc_exp->exp_lock); + class_export_cb_put(ccb->lncc_exp); + + OBD_FREE_PTR(ccb); +} + +int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp) +{ + struct tgt_new_client_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->lncc_exp = class_export_cb_get(exp); + + dcb = &ccb->lncc_cb; + dcb->dcb_func = tgt_cb_new_client; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_new_client", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + return rc; +} + +/** + * Update client data in last_rcvd + */ +static int tgt_client_data_update(const struct lu_env *env, + struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + int rc = 0; + + ENTRY; + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(exp)->obd_name); + RETURN(-EINVAL); + } + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + tti_buf_lcd(tti); + rc = dt_declare_record_write(env, tgt->lut_last_rcvd, + &tti->tti_buf, + ted->ted_lr_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start_local(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + mutex_lock(&ted->ted_lcd_lock); + + /* + * Until this operations will be committed the sync is needed + * for this export. This should be done _after_ starting the + * transaction so that many connecting clients will not bring + * server down with lots of sync writes. 
+ */ + rc = tgt_new_client_cb_add(th, exp); + if (rc) { + /* can't add callback, do sync now */ + th->th_sync = 1; + } else { + spin_lock(&exp->exp_lock); + exp->exp_need_sync = 1; + spin_unlock(&exp->exp_lock); + } + + tti->tti_off = ted->ted_lr_off; + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + + mutex_unlock(&ted->ted_lcd_lock); + + EXIT; +out: + dt_trans_stop(env, tgt->lut_bottom, th); + CDEBUG(D_INFO, "%s: update last_rcvd client data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + + return rc; +} + +static int tgt_server_data_read(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + int rc; + + tti->tti_off = 0; + tti_buf_lsd(tti); + rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, + &tti->tti_off); + if (rc == 0) + lsd_le_to_cpu(&tti->tti_lsd, &tgt->lut_lsd); + + CDEBUG(D_INFO, "%s: read last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + return rc; +} + +static int tgt_server_data_write(const struct lu_env *env, + struct lu_target *tgt, struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + int rc; + + ENTRY; + + tti->tti_off = 0; + tti_buf_lsd(tti); + lsd_cpu_to_le(&tgt->lut_lsd, &tti->tti_lsd); + + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + rc = dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); + + CDEBUG(D_INFO, "%s: write last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + + RETURN(rc); +} + +/** + * Update server data in last_rcvd + */ +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tgt, + int sync) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + int rc = 0; + + ENTRY; + + CDEBUG(D_SUPER, + "%s: mount_count is %llu, last_transno is %llu\n", + tgt->lut_lsd.lsd_uuid, tgt->lut_obd->u.obt.obt_mount_count, + tgt->lut_last_transno); + + /* Always save latest transno to keep it fresh */ + spin_lock(&tgt->lut_translock); + tgt->lut_lsd.lsd_last_transno = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + th->th_sync = sync; + + tti_buf_lsd(tti); + rc = dt_declare_record_write(env, tgt->lut_last_rcvd, + &tti->tti_buf, tti->tti_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + rc = tgt_server_data_write(env, tgt, th); +out: + dt_trans_stop(env, tgt->lut_bottom, th); + + CDEBUG(D_INFO, "%s: update last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_server_data_update); + +static int tgt_truncate_last_rcvd(const struct lu_env *env, + struct lu_target *tgt, loff_t size) +{ + struct dt_object *dt = tgt->lut_last_rcvd; + struct thandle *th; + struct lu_attr attr; + int rc; + + ENTRY; + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + attr.la_size = size; + attr.la_valid = LA_SIZE; + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + rc = dt_declare_punch(env, dt, size, 
OBD_OBJECT_EOF, th); + if (rc) + GOTO(cleanup, rc); + rc = dt_declare_attr_set(env, dt, &attr, th); + if (rc) + GOTO(cleanup, rc); + rc = dt_trans_start_local(env, tgt->lut_bottom, th); + if (rc) + GOTO(cleanup, rc); + + rc = dt_punch(env, dt, size, OBD_OBJECT_EOF, th); + if (rc == 0) + rc = dt_attr_set(env, dt, &attr, th); + +cleanup: + dt_trans_stop(env, tgt->lut_bottom, th); + + RETURN(rc); +} + +static void tgt_client_epoch_update(const struct lu_env *env, + struct obd_export *exp) +{ + struct lsd_client_data *lcd = exp->exp_target_data.ted_lcd; + struct lu_target *tgt = class_exp2tgt(exp); + + LASSERT(tgt && tgt->lut_bottom); + /** VBR: set client last_epoch to current epoch */ + if (lcd->lcd_last_epoch >= tgt->lut_lsd.lsd_start_epoch) + return; + lcd->lcd_last_epoch = tgt->lut_lsd.lsd_start_epoch; + tgt_client_data_update(env, exp); +} + +/** + * Update boot epoch when recovery ends + */ +void tgt_boot_epoch_update(struct lu_target *tgt) +{ + struct lu_env env; + struct ptlrpc_request *req; + __u32 start_epoch; + struct list_head client_list; + int rc; + + if (tgt->lut_obd->obd_stopping) + return; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) { + CERROR("%s: can't initialize environment: rc = %d\n", + tgt->lut_obd->obd_name, rc); + return; + } + + spin_lock(&tgt->lut_translock); + start_epoch = (tgt->lut_last_transno >> LR_EPOCH_BITS) + 1; + tgt->lut_last_transno = (__u64)start_epoch << LR_EPOCH_BITS; + tgt->lut_lsd.lsd_start_epoch = start_epoch; + spin_unlock(&tgt->lut_translock); + + INIT_LIST_HEAD(&client_list); + /** + * The recovery is not yet finished and final queue can still be updated + * with resend requests. Move final list to separate one for processing + */ + spin_lock(&tgt->lut_obd->obd_recovery_task_lock); + list_splice_init(&tgt->lut_obd->obd_final_req_queue, &client_list); + spin_unlock(&tgt->lut_obd->obd_recovery_task_lock); + + /** + * go through list of exports participated in recovery and + * set new epoch for them + */ + list_for_each_entry(req, &client_list, rq_list) { + LASSERT(!req->rq_export->exp_delayed); + if (!req->rq_export->exp_vbr_failed) + tgt_client_epoch_update(&env, req->rq_export); + } + /** return list back at once */ + spin_lock(&tgt->lut_obd->obd_recovery_task_lock); + list_splice_init(&client_list, &tgt->lut_obd->obd_final_req_queue); + spin_unlock(&tgt->lut_obd->obd_recovery_task_lock); + + /** Clear MULTI RPCS incompatibility flag if + * - target is MDT and + * - there is no client to recover or the recovery was aborted + */ + if (!strncmp(tgt->lut_obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) && + (atomic_read(&tgt->lut_obd->obd_max_recoverable_clients) == 0 || + tgt->lut_obd->obd_abort_recovery)) + tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; + + /** update server epoch */ + tgt_server_data_update(&env, tgt, 1); + lu_env_fini(&env); +} + +/** + * commit callback, need to update last_committed value + */ +struct tgt_last_committed_callback { + struct dt_txn_commit_cb llcc_cb; + struct lu_target *llcc_tgt; + struct obd_export *llcc_exp; + __u64 llcc_transno; +}; + +static void tgt_cb_last_committed(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_last_committed_callback *ccb; + + ccb = container_of0(cb, struct tgt_last_committed_callback, llcc_cb); + + LASSERT(ccb->llcc_exp); + LASSERT(ccb->llcc_tgt != NULL); + LASSERT(ccb->llcc_exp->exp_obd == ccb->llcc_tgt->lut_obd); + + /* error hit, don't update last committed to provide chance to + * replay data after fail */ + if 
(err != 0) + goto out; + + /* Fast path w/o spinlock, if exp_last_committed was updated + * with higher transno, no need to take spinlock and check, + * also no need to update obd_last_committed. */ + if (ccb->llcc_transno <= ccb->llcc_exp->exp_last_committed) + goto out; + spin_lock(&ccb->llcc_tgt->lut_translock); + if (ccb->llcc_transno > ccb->llcc_tgt->lut_obd->obd_last_committed) + ccb->llcc_tgt->lut_obd->obd_last_committed = ccb->llcc_transno; + + if (ccb->llcc_transno > ccb->llcc_exp->exp_last_committed) { + ccb->llcc_exp->exp_last_committed = ccb->llcc_transno; + spin_unlock(&ccb->llcc_tgt->lut_translock); + + ptlrpc_commit_replies(ccb->llcc_exp); + tgt_cancel_slc_locks(ccb->llcc_tgt, ccb->llcc_transno); + } else { + spin_unlock(&ccb->llcc_tgt->lut_translock); + } + + CDEBUG(D_HA, "%s: transno %lld is committed\n", + ccb->llcc_tgt->lut_obd->obd_name, ccb->llcc_transno); + +out: + class_export_cb_put(ccb->llcc_exp); + OBD_FREE_PTR(ccb); +} + +/** + * Add commit callback function, it returns a non-zero value to inform + * caller to use sync transaction if necessary. + */ +static int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, + struct obd_export *exp, __u64 transno) +{ + struct tgt_last_committed_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->llcc_tgt = tgt; + ccb->llcc_exp = class_export_cb_get(exp); + ccb->llcc_transno = transno; + + dcb = &ccb->llcc_cb; + dcb->dcb_func = tgt_cb_last_committed; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_last_committed", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + + if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + /* report failure to force synchronous operation */ + return -EPERM; + + /* if exp_need_sync is set, return non-zero value to force + * a sync transaction. */ + return rc ? rc : exp->exp_need_sync; +} + +/** + * Add new client to the last_rcvd upon new connection. + * + * We use a bitmap to locate a free space in the last_rcvd file and initialize + * tg_export_data. 
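+ *
+ * The slot index found in the bitmap maps to a fixed-size record in
+ * last_rcvd:
+ *
+ *   ted_lr_off = lsd_client_start + idx * lsd_client_size
+ *
+ * so locating a free slot never requires scanning the file itself.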
+ */ +int tgt_client_new(const struct lu_env *env, struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + int rc = 0, idx; + + ENTRY; + + LASSERT(tgt && tgt->lut_client_bitmap != NULL); + if (!strcmp(ted->ted_lcd->lcd_uuid, tgt->lut_obd->obd_uuid.uuid)) + RETURN(0); + + if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so + * there's no need for extra complication here + */ + idx = find_first_zero_bit(tgt->lut_client_bitmap, LR_MAX_CLIENTS); +repeat: + if (idx >= LR_MAX_CLIENTS || + OBD_FAIL_CHECK(OBD_FAIL_MDS_CLIENT_ADD)) { + CERROR("%s: no room for %u clients - fix LR_MAX_CLIENTS\n", + tgt->lut_obd->obd_name, idx); + RETURN(-EOVERFLOW); + } + if (test_and_set_bit(idx, tgt->lut_client_bitmap)) { + idx = find_next_zero_bit(tgt->lut_client_bitmap, + LR_MAX_CLIENTS, idx); + goto repeat; + } + + ted->ted_lr_idx = idx; + ted->ted_lr_off = tgt->lut_lsd.lsd_client_start + + idx * tgt->lut_lsd.lsd_client_size; + + LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); + + if (tgt_is_multimodrpcs_client(exp)) { + /* Set MULTI RPCS incompatibility flag to prevent previous + * Lustre versions to mount a target with reply_data file */ + atomic_inc(&tgt->lut_num_clients); + if (!(tgt->lut_lsd.lsd_feature_incompat & + OBD_INCOMPAT_MULTI_RPCS)) { + tgt->lut_lsd.lsd_feature_incompat |= + OBD_INCOMPAT_MULTI_RPCS; + rc = tgt_server_data_update(env, tgt, 1); + if (rc < 0) { + CERROR("%s: unable to set MULTI RPCS " + "incompatibility flag\n", + exp->exp_obd->obd_name); + RETURN(rc); + } + } + + /* assign client slot generation */ + ted->ted_lcd->lcd_generation = + atomic_inc_return(&tgt->lut_client_generation); + } else { + ted->ted_lcd->lcd_generation = 0; + } + + CDEBUG(D_INFO, "%s: new client at index %d (%llu) with UUID '%s' " + "generation %d\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx, ted->ted_lr_off, + ted->ted_lcd->lcd_uuid, ted->ted_lcd->lcd_generation); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_ADD)) + RETURN(-ENOSPC); + + rc = tgt_client_data_update(env, exp); + if (rc) + CERROR("%s: Failed to write client lcd at idx %d, rc %d\n", + tgt->lut_obd->obd_name, idx, rc); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_client_new); + +/* Add an existing client to the MDS in-memory state based on + * a client that was previously found in the last_rcvd file and + * already has an assigned slot (idx >= 0). + * + * It should not be possible to fail adding an existing client - otherwise + * mdt_init_server_data() callsite needs to be fixed. 
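+ *
+ * Unlike tgt_client_new(), this only marks the already known slot in the
+ * in-memory bitmap and fills tg_export_data; nothing is written to
+ * last_rcvd here.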
+ */ +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int idx) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + + ENTRY; + + LASSERT(tgt && tgt->lut_client_bitmap != NULL); + LASSERTF(idx >= 0, "%d\n", idx); + + if (!strcmp(ted->ted_lcd->lcd_uuid, tgt->lut_obd->obd_uuid.uuid) || + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + if (test_and_set_bit(idx, tgt->lut_client_bitmap)) { + CERROR("%s: client %d: bit already set in bitmap!!\n", + tgt->lut_obd->obd_name, idx); + LBUG(); + } + atomic_inc(&tgt->lut_num_clients); + + CDEBUG(D_INFO, "%s: client at idx %d with UUID '%s' added, " + "generation %d\n", + tgt->lut_obd->obd_name, idx, ted->ted_lcd->lcd_uuid, + ted->ted_lcd->lcd_generation); + + ted->ted_lr_idx = idx; + ted->ted_lr_off = tgt->lut_lsd.lsd_client_start + + idx * tgt->lut_lsd.lsd_client_size; + + mutex_init(&ted->ted_lcd_lock); + + LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); + + RETURN(0); +} + +int tgt_client_del(const struct lu_env *env, struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + int rc; + + ENTRY; + + LASSERT(ted->ted_lcd); + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(exp)->obd_name); + RETURN(-EINVAL); + } + + /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp((char *)ted->ted_lcd->lcd_uuid, + (char *)tgt->lut_obd->obd_uuid.uuid) || + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + /* Slot may be not yet assigned, use case is race between Client + * reconnect and forced eviction */ + if (ted->ted_lr_idx < 0) { + CWARN("%s: client with UUID '%s' not in bitmap\n", + tgt->lut_obd->obd_name, ted->ted_lcd->lcd_uuid); + RETURN(0); + } + + CDEBUG(D_INFO, "%s: del client at idx %u, off %lld, UUID '%s'\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx, ted->ted_lr_off, + ted->ted_lcd->lcd_uuid); + + /* Clear the bit _after_ zeroing out the client so we don't + race with filter_client_add and zero out new clients.*/ + if (!test_bit(ted->ted_lr_idx, tgt->lut_client_bitmap)) { + CERROR("%s: client %u: bit already clear in bitmap!!\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx); + LBUG(); + } + + /* Do not erase record for recoverable client. */ + if (exp->exp_flags & OBD_OPT_FAILOVER) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_DEL)) + RETURN(0); + + /* Make sure the server's last_transno is up to date. + * This should be done before zeroing client slot so last_transno will + * be in server data or in client data in case of failure */ + rc = tgt_server_data_update(env, tgt, 0); + if (rc != 0) { + CERROR("%s: failed to update server data, skip client %s " + "zeroing, rc %d\n", tgt->lut_obd->obd_name, + ted->ted_lcd->lcd_uuid, rc); + RETURN(rc); + } + + memset(ted->ted_lcd->lcd_uuid, 0, sizeof ted->ted_lcd->lcd_uuid); + rc = tgt_client_data_update(env, exp); + + CDEBUG(rc == 0 ? 
D_INFO : D_ERROR, + "%s: zeroing out client %s at idx %u (%llu), rc %d\n", + tgt->lut_obd->obd_name, ted->ted_lcd->lcd_uuid, + ted->ted_lr_idx, ted->ted_lr_off, rc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_client_del); + +int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, + struct tg_export_data *ted, struct tg_reply_data *trd, + struct thandle *th, bool update_lrd_file) +{ + struct lsd_reply_data *lrd; + int i; + + lrd = &trd->trd_reply; + /* update export last transno */ + mutex_lock(&ted->ted_lcd_lock); + if (lrd->lrd_transno > ted->ted_lcd->lcd_last_transno) + ted->ted_lcd->lcd_last_transno = lrd->lrd_transno; + mutex_unlock(&ted->ted_lcd_lock); + + /* find a empty slot */ + i = tgt_find_free_reply_slot(tgt); + if (unlikely(i < 0)) { + CERROR("%s: couldn't find a slot for reply data: " + "rc = %d\n", tgt_name(tgt), i); + RETURN(i); + } + trd->trd_index = i; + + if (update_lrd_file) { + loff_t off; + int rc; + + /* write reply data to disk */ + off = sizeof(struct lsd_reply_header) + sizeof(*lrd) * i; + rc = tgt_reply_data_write(env, tgt, lrd, off, th); + if (unlikely(rc != 0)) { + CERROR("%s: can't update %s file: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + RETURN(rc); + } + } + /* add reply data to target export's reply list */ + mutex_lock(&ted->ted_lcd_lock); + list_add(&trd->trd_list, &ted->ted_reply_list); + ted->ted_reply_cnt++; + if (ted->ted_reply_cnt > ted->ted_reply_max) + ted->ted_reply_max = ted->ted_reply_cnt; + mutex_unlock(&ted->ted_lcd_lock); + + CDEBUG(D_TRACE, "add reply %p: xid %llu, transno %llu, " + "tag %hu, client gen %u, slot idx %d\n", + trd, lrd->lrd_xid, lrd->lrd_transno, + trd->trd_tag, lrd->lrd_client_gen, i); + RETURN(0); +} +EXPORT_SYMBOL(tgt_add_reply_data); + +/* + * last_rcvd & last_committed update callbacks + */ +static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 opdata, + struct thandle *th, struct ptlrpc_request *req) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct tgt_session_info *tsi = tgt_ses_info(env); + struct obd_export *exp = tsi->tsi_exp; + struct tg_export_data *ted; + __u64 *transno_p; + int rc = 0; + bool lw_client; + + ENTRY; + + + LASSERT(exp != NULL); + ted = &exp->exp_target_data; + + lw_client = exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT; + if (ted->ted_lr_idx < 0 && !lw_client) + /* ofd connect may cause transaction before export has + * last_rcvd slot */ + RETURN(0); + + if (req != NULL) + tti->tti_transno = lustre_msg_get_transno(req->rq_reqmsg); + else + /* From update replay, tti_transno should be set already */ + LASSERT(tti->tti_transno != 0); + + spin_lock(&tgt->lut_translock); + if (th->th_result != 0) { + if (tti->tti_transno != 0) { + CERROR("%s: replay transno %llu failed: rc = %d\n", + tgt_name(tgt), tti->tti_transno, th->th_result); + } + } else if (tti->tti_transno == 0) { + tti->tti_transno = ++tgt->lut_last_transno; + } else { + /* should be replay */ + if (tti->tti_transno > tgt->lut_last_transno) + tgt->lut_last_transno = tti->tti_transno; + } + spin_unlock(&tgt->lut_translock); + + /** VBR: set new versions */ + if (th->th_result == 0 && obj != NULL) { + struct dt_object *dto = dt_object_locate(obj, th->th_dev); + dt_version_set(env, dto, tti->tti_transno, th); + } + + /* filling reply data */ + CDEBUG(D_INODE, "transno = %llu, last_committed = %llu\n", + tti->tti_transno, tgt->lut_obd->obd_last_committed); + + if (req != NULL) { + req->rq_transno = tti->tti_transno; + lustre_msg_set_transno(req->rq_repmsg, 
tti->tti_transno); + } + + /* if can't add callback, do sync write */ + th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, tti->tti_transno); + + if (lw_client) { + /* All operations performed by LW clients are synchronous and + * we store the committed transno in the last_rcvd header */ + spin_lock(&tgt->lut_translock); + if (tti->tti_transno > tgt->lut_lsd.lsd_last_transno) { + tgt->lut_lsd.lsd_last_transno = tti->tti_transno; + spin_unlock(&tgt->lut_translock); + /* Although lightweight (LW) connections have no slot + * in the last_rcvd, we still want to maintain + * the in-memory lsd_client_data structure in order to + * properly handle reply reconstruction. */ + rc = tgt_server_data_write(env, tgt, th); + } else { + spin_unlock(&tgt->lut_translock); + } + } else if (ted->ted_lr_off == 0) { + CERROR("%s: client idx %d has offset %lld\n", + tgt_name(tgt), ted->ted_lr_idx, ted->ted_lr_off); + RETURN(-EINVAL); + } + + /* Target that supports multiple reply data */ + if (tgt_is_multimodrpcs_client(exp)) { + struct tg_reply_data *trd; + struct lsd_reply_data *lrd; + __u64 *pre_versions; + bool write_update; + + OBD_ALLOC_PTR(trd); + if (unlikely(trd == NULL)) + RETURN(-ENOMEM); + + /* fill reply data information */ + lrd = &trd->trd_reply; + lrd->lrd_transno = tti->tti_transno; + if (req != NULL) { + lrd->lrd_xid = req->rq_xid; + trd->trd_tag = lustre_msg_get_tag(req->rq_reqmsg); + pre_versions = lustre_msg_get_versions(req->rq_repmsg); + lrd->lrd_result = th->th_result; + lrd->lrd_client_gen = ted->ted_lcd->lcd_generation; + write_update = true; + } else { + LASSERT(tsi->tsi_xid != 0); + lrd->lrd_xid = tsi->tsi_xid; + lrd->lrd_result = tsi->tsi_result; + lrd->lrd_client_gen = tsi->tsi_client_gen; + trd->trd_tag = 0; + pre_versions = NULL; + write_update = false; + } + + lrd->lrd_data = opdata; + if (pre_versions) { + trd->trd_pre_versions[0] = pre_versions[0]; + trd->trd_pre_versions[1] = pre_versions[1]; + trd->trd_pre_versions[2] = pre_versions[2]; + trd->trd_pre_versions[3] = pre_versions[3]; + } + + rc = tgt_add_reply_data(env, tgt, ted, trd, th, write_update); + if (rc < 0) + OBD_FREE_PTR(trd); + return rc; + } + + /* Enough for update replay, let's return */ + if (req == NULL) + RETURN(rc); + + mutex_lock(&ted->ted_lcd_lock); + LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0)); + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { + transno_p = &ted->ted_lcd->lcd_last_close_transno; + ted->ted_lcd->lcd_last_close_xid = req->rq_xid; + ted->ted_lcd->lcd_last_close_result = th->th_result; + } else { + /* VBR: save versions in last_rcvd for reconstruct. */ + __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg); + + if (pre_versions) { + ted->ted_lcd->lcd_pre_versions[0] = pre_versions[0]; + ted->ted_lcd->lcd_pre_versions[1] = pre_versions[1]; + ted->ted_lcd->lcd_pre_versions[2] = pre_versions[2]; + ted->ted_lcd->lcd_pre_versions[3] = pre_versions[3]; + } + transno_p = &ted->ted_lcd->lcd_last_transno; + ted->ted_lcd->lcd_last_xid = req->rq_xid; + ted->ted_lcd->lcd_last_result = th->th_result; + /* XXX: lcd_last_data is __u32 but intent_dispostion is __u64, + * see struct ldlm_reply->lock_policy_res1; */ + ted->ted_lcd->lcd_last_data = opdata; + } + + /* Update transno in slot only if non-zero number, i.e. no errors */ + if (likely(tti->tti_transno != 0)) { + /* Don't overwrite bigger transaction number with lower one. + * That is not sign of problem in all cases, but in any case + * this value should be monotonically increased only. 
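+		 * When reconstruction is supported, hitting this case for a
+		 * replayed request marks the export exp_vbr_failed and returns
+		 * -EOVERFLOW instead of rewinding the on-disk value.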
*/ + if (*transno_p > tti->tti_transno) { + if (!tgt->lut_no_reconstruct) { + CERROR("%s: trying to overwrite bigger transno:" + "on-disk: %llu, new: %llu replay: " + "%d. See LU-617.\n", tgt_name(tgt), + *transno_p, tti->tti_transno, + req_is_replay(req)); + if (req_is_replay(req)) { + spin_lock(&req->rq_export->exp_lock); + req->rq_export->exp_vbr_failed = 1; + spin_unlock(&req->rq_export->exp_lock); + } + mutex_unlock(&ted->ted_lcd_lock); + RETURN(req_is_replay(req) ? -EOVERFLOW : 0); + } + } else { + *transno_p = tti->tti_transno; + } + } + + if (!lw_client) { + tti->tti_off = ted->ted_lr_off; + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_RCVD_EIO)) + rc = -EIO; + else + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, + &tti->tti_off, th); + if (rc < 0) { + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); + } + } + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); +} + +/* + * last_rcvd update for echo client simulation. + * It updates last_rcvd client slot and version of object in + * simple way but with all locks to simulate all drawbacks + */ +static int tgt_last_rcvd_update_echo(const struct lu_env *env, + struct lu_target *tgt, + struct dt_object *obj, + struct thandle *th, + struct obd_export *exp) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct tg_export_data *ted = &exp->exp_target_data; + int rc = 0; + + ENTRY; + + tti->tti_transno = 0; + + spin_lock(&tgt->lut_translock); + if (th->th_result == 0) + tti->tti_transno = ++tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + /** VBR: set new versions */ + if (th->th_result == 0 && obj != NULL) + dt_version_set(env, obj, tti->tti_transno, th); + + /* if can't add callback, do sync write */ + th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, + tti->tti_transno); + + LASSERT(ted->ted_lr_off > 0); + + mutex_lock(&ted->ted_lcd_lock); + LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0)); + ted->ted_lcd->lcd_last_transno = tti->tti_transno; + ted->ted_lcd->lcd_last_result = th->th_result; + + tti->tti_off = ted->ted_lr_off; + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); +} + +static int tgt_clients_data_init(const struct lu_env *env, + struct lu_target *tgt, + unsigned long last_size) +{ + struct obd_device *obd = tgt->lut_obd; + struct lr_server_data *lsd = &tgt->lut_lsd; + struct lsd_client_data *lcd = NULL; + struct tg_export_data *ted; + int cl_idx; + int rc = 0; + loff_t off = lsd->lsd_client_start; + __u32 generation = 0; + struct cfs_hash *hash = NULL; + + ENTRY; + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + CLASSERT(offsetof(struct lsd_client_data, lcd_padding) + + sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE); + + OBD_ALLOC_PTR(lcd); + if (lcd == NULL) + RETURN(-ENOMEM); + + hash = cfs_hash_getref(tgt->lut_obd->obd_gen_hash); + if (hash == NULL) + GOTO(err_out, rc = -ENODEV); + + for (cl_idx = 0; off < last_size; cl_idx++) { + struct obd_export *exp; + __u64 last_transno; + + /* Don't assume off is incremented properly by + * read_record(), in case sizeof(*lcd) + * isn't the same as fsd->lsd_client_size. 
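+		 * The offset is therefore recomputed from the slot index on
+		 * every iteration.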
*/ + off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size; + rc = tgt_client_data_read(env, tgt, lcd, &off, cl_idx); + if (rc) { + CERROR("%s: error reading last_rcvd %s idx %d off " + "%llu: rc = %d\n", tgt_name(tgt), LAST_RCVD, + cl_idx, off, rc); + rc = 0; + break; /* read error shouldn't cause startup to fail */ + } + + if (lcd->lcd_uuid[0] == '\0') { + CDEBUG(D_INFO, "skipping zeroed client at offset %d\n", + cl_idx); + continue; + } + + last_transno = lcd_last_transno(lcd); + + /* These exports are cleaned up by disconnect, so they + * need to be set up like real exports as connect does. + */ + CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: %llu" + " srv lr: %llu lx: %llu gen %u\n", lcd->lcd_uuid, + cl_idx, last_transno, lsd->lsd_last_transno, + lcd_last_xid(lcd), lcd->lcd_generation); + + exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid); + if (IS_ERR(exp)) { + if (PTR_ERR(exp) == -EALREADY) { + /* export already exists, zero out this one */ + CERROR("%s: Duplicate export %s!\n", + tgt_name(tgt), lcd->lcd_uuid); + continue; + } + GOTO(err_out, rc = PTR_ERR(exp)); + } + + ted = &exp->exp_target_data; + *ted->ted_lcd = *lcd; + + rc = tgt_client_add(env, exp, cl_idx); + LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */ + /* VBR: set export last committed version */ + exp->exp_last_committed = last_transno; + spin_lock(&exp->exp_lock); + exp->exp_connecting = 0; + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + atomic_inc(&obd->obd_max_recoverable_clients); + + if (tgt->lut_lsd.lsd_feature_incompat & + OBD_INCOMPAT_MULTI_RPCS && + lcd->lcd_generation != 0) { + /* compute the highest valid client generation */ + generation = max(generation, lcd->lcd_generation); + /* fill client_generation <-> export hash table */ + rc = cfs_hash_add_unique(hash, &lcd->lcd_generation, + &exp->exp_gen_hash); + if (rc != 0) { + CERROR("%s: duplicate export for client " + "generation %u\n", + tgt_name(tgt), lcd->lcd_generation); + class_export_put(exp); + GOTO(err_out, rc); + } + } + + class_export_put(exp); + + rc = rev_import_init(exp); + if (rc != 0) { + class_unlink_export(exp); + GOTO(err_out, rc); + } + + /* Need to check last_rcvd even for duplicated exports. 
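+		 * lut_last_transno must end up as the highest transno found in
+		 * any client slot, hence the max() below.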
*/ + CDEBUG(D_OTHER, "client at idx %d has last_transno = %llu\n", + cl_idx, last_transno); + + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = max(last_transno, + tgt->lut_last_transno); + spin_unlock(&tgt->lut_translock); + } + + /* record highest valid client generation */ + atomic_set(&tgt->lut_client_generation, generation); + +err_out: + if (hash != NULL) + cfs_hash_putref(hash); + OBD_FREE_PTR(lcd); + RETURN(rc); +} + +struct server_compat_data { + __u32 rocompat; + __u32 incompat; + __u32 rocinit; + __u32 incinit; +}; + +static struct server_compat_data tgt_scd[] = { + [LDD_F_SV_TYPE_MDT] = { + .rocompat = OBD_ROCOMPAT_LOVOBJID, + .incompat = OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_FID | OBD_INCOMPAT_IAM_DIR | + OBD_INCOMPAT_LMM_VER | OBD_INCOMPAT_MULTI_OI | + OBD_INCOMPAT_MULTI_RPCS, + .rocinit = OBD_ROCOMPAT_LOVOBJID, + .incinit = OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_MULTI_OI, + }, + [LDD_F_SV_TYPE_OST] = { + .rocompat = OBD_ROCOMPAT_IDX_IN_IDIF, + .incompat = OBD_INCOMPAT_OST | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_FID, + .rocinit = OBD_ROCOMPAT_IDX_IN_IDIF, + .incinit = OBD_INCOMPAT_OST | OBD_INCOMPAT_COMMON_LR, + } +}; + +int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lr_server_data *lsd = &tgt->lut_lsd; + unsigned long last_rcvd_size; + __u32 index; + int rc, type; + + rc = dt_attr_get(env, tgt->lut_last_rcvd, &tti->tti_attr); + if (rc) + RETURN(rc); + + last_rcvd_size = (unsigned long)tti->tti_attr.la_size; + + /* ensure padding in the struct is the correct size */ + CLASSERT(offsetof(struct lr_server_data, lsd_padding) + + sizeof(lsd->lsd_padding) == LR_SERVER_SIZE); + + rc = server_name2index(tgt_name(tgt), &index, NULL); + if (rc < 0) { + CERROR("%s: Can not get index from name: rc = %d\n", + tgt_name(tgt), rc); + RETURN(rc); + } + /* server_name2index() returns type */ + type = rc; + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + CERROR("%s: unknown target type %x\n", tgt_name(tgt), type); + RETURN(-EINVAL); + } + + /* last_rcvd on OST doesn't provide reconstruct support because there + * may be up to 8 in-flight write requests per single slot in + * last_rcvd client data + */ + tgt->lut_no_reconstruct = (type == LDD_F_SV_TYPE_OST); + + if (last_rcvd_size == 0) { + LCONSOLE_WARN("%s: new disk, initializing\n", tgt_name(tgt)); + + memcpy(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = 0; + lsd->lsd_mount_count = 0; + lsd->lsd_server_size = LR_SERVER_SIZE; + lsd->lsd_client_start = LR_CLIENT_START; + lsd->lsd_client_size = LR_CLIENT_SIZE; + lsd->lsd_subdir_count = OBJ_SUBDIR_COUNT; + lsd->lsd_osd_index = index; + lsd->lsd_feature_rocompat = tgt_scd[type].rocinit; + lsd->lsd_feature_incompat = tgt_scd[type].incinit; + } else { + rc = tgt_server_data_read(env, tgt); + if (rc) { + CERROR("%s: error reading LAST_RCVD: rc= %d\n", + tgt_name(tgt), rc); + RETURN(rc); + } + if (strcmp(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid)) { + if (tgt->lut_bottom->dd_rdonly) { + /* Such difference may be caused by mounting + * up snapshot with new fsname under rd_only + * mode. But even if it was NOT, it will not + * damage the system because of "rd_only". */ + memcpy(lsd->lsd_uuid, + tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + } else { + LCONSOLE_ERROR_MSG(0x157, "Trying to start " + "OBD %s using the wrong " + "disk %s. 
Were the /dev/ " + "assignments rearranged?\n", + tgt->lut_obd->obd_uuid.uuid, + lsd->lsd_uuid); + RETURN(-EINVAL); + } + } + + if (lsd->lsd_osd_index != index) { + LCONSOLE_ERROR_MSG(0x157, "%s: index %d in last rcvd " + "is different with the index %d in" + "config log, It might be disk" + "corruption!\n", tgt_name(tgt), + lsd->lsd_osd_index, index); + RETURN(-EINVAL); + } + } + + if (lsd->lsd_feature_incompat & ~tgt_scd[type].incompat) { + CERROR("%s: unsupported incompat filesystem feature(s) %x\n", + tgt_name(tgt), + lsd->lsd_feature_incompat & ~tgt_scd[type].incompat); + RETURN(-EINVAL); + } + + if (type == LDD_F_SV_TYPE_MDT) + lsd->lsd_feature_incompat |= OBD_INCOMPAT_FID; + + if (lsd->lsd_feature_rocompat & ~tgt_scd[type].rocompat) { + CERROR("%s: unsupported read-only filesystem feature(s) %x\n", + tgt_name(tgt), + lsd->lsd_feature_rocompat & ~tgt_scd[type].rocompat); + RETURN(-EINVAL); + } + /** Interop: evict all clients at first boot with 1.8 last_rcvd */ + if (type == LDD_F_SV_TYPE_MDT && + !(lsd->lsd_feature_compat & OBD_COMPAT_20)) { + if (last_rcvd_size > lsd->lsd_client_start) { + LCONSOLE_WARN("%s: mounting at first time on 1.8 FS, " + "remove all clients for interop needs\n", + tgt_name(tgt)); + rc = tgt_truncate_last_rcvd(env, tgt, + lsd->lsd_client_start); + if (rc) + RETURN(rc); + last_rcvd_size = lsd->lsd_client_start; + } + /** set 2.0 flag to upgrade/downgrade between 1.8 and 2.0 */ + lsd->lsd_feature_compat |= OBD_COMPAT_20; + } + + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = lsd->lsd_last_transno; + spin_unlock(&tgt->lut_translock); + + lsd->lsd_mount_count++; + + CDEBUG(D_INODE, "=======,=BEGIN DUMPING LAST_RCVD========\n"); + CDEBUG(D_INODE, "%s: server last_transno: %llu\n", + tgt_name(tgt), tgt->lut_last_transno); + CDEBUG(D_INODE, "%s: server mount_count: %llu\n", + tgt_name(tgt), lsd->lsd_mount_count); + CDEBUG(D_INODE, "%s: server data size: %u\n", + tgt_name(tgt), lsd->lsd_server_size); + CDEBUG(D_INODE, "%s: per-client data start: %u\n", + tgt_name(tgt), lsd->lsd_client_start); + CDEBUG(D_INODE, "%s: per-client data size: %u\n", + tgt_name(tgt), lsd->lsd_client_size); + CDEBUG(D_INODE, "%s: last_rcvd size: %lu\n", + tgt_name(tgt), last_rcvd_size); + CDEBUG(D_INODE, "%s: server subdir_count: %u\n", + tgt_name(tgt), lsd->lsd_subdir_count); + CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", tgt_name(tgt), + last_rcvd_size <= lsd->lsd_client_start ? 
0 : + (last_rcvd_size - lsd->lsd_client_start) / + lsd->lsd_client_size); + CDEBUG(D_INODE, "========END DUMPING LAST_RCVD========\n"); + + if (lsd->lsd_server_size == 0 || lsd->lsd_client_start == 0 || + lsd->lsd_client_size == 0) { + CERROR("%s: bad last_rcvd contents!\n", tgt_name(tgt)); + RETURN(-EINVAL); + } + + if (!tgt->lut_obd->obd_replayable) + CWARN("%s: recovery support OFF\n", tgt_name(tgt)); + + rc = tgt_clients_data_init(env, tgt, last_rcvd_size); + if (rc < 0) + GOTO(err_client, rc); + + spin_lock(&tgt->lut_translock); + /* obd_last_committed is used for compatibility + * with other lustre recovery code */ + tgt->lut_obd->obd_last_committed = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + tgt->lut_obd->u.obt.obt_mount_count = lsd->lsd_mount_count; + tgt->lut_obd->u.obt.obt_instance = (__u32)lsd->lsd_mount_count; + + /* save it, so mount count and last_transno is current */ + rc = tgt_server_data_update(env, tgt, 0); + if (rc < 0) + GOTO(err_client, rc); + + RETURN(0); + +err_client: + class_disconnect_exports(tgt->lut_obd); + return rc; +} + +/* add credits for last_rcvd update */ +int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, + void *cookie) +{ + struct lu_target *tgt = cookie; + struct tgt_session_info *tsi; + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + int rc; + + /* For readonly case, the caller should have got failure + * when start the transaction. If the logic comes here, + * there must be something wrong. */ + if (unlikely(tgt->lut_bottom->dd_rdonly)) { + dump_stack(); + LBUG(); + } + + /* if there is no session, then this transaction is not result of + * request processing but some local operation */ + if (env->le_ses == NULL) + return 0; + + LASSERT(tgt->lut_last_rcvd); + tsi = tgt_ses_info(env); + /* OFD may start transaction without export assigned */ + if (tsi->tsi_exp == NULL) + return 0; + + if (tgt_is_multimodrpcs_client(tsi->tsi_exp)) { + /* + * Use maximum possible file offset for declaration to ensure + * ZFS will reserve enough credits for a write anywhere in this + * file, since we don't know where in the file the write will be + * because a replay slot has not been assigned. This should be + * replaced by dmu_tx_hold_append() when available. 
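+		 * That is why the declaration below passes -1 as the offset and
+		 * only sizeof(struct lsd_reply_data) as the length.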
+ */ + tti->tti_buf.lb_buf = NULL; + tti->tti_buf.lb_len = sizeof(struct lsd_reply_data); + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + rc = dt_declare_record_write(env, dto, &tti->tti_buf, -1, th); + if (rc) + return rc; + } else { + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + tti_buf_lcd(tti); + tti->tti_off = tsi->tsi_exp->exp_target_data.ted_lr_off; + rc = dt_declare_record_write(env, dto, &tti->tti_buf, + tti->tti_off, th); + if (rc) + return rc; + } + + if (tsi->tsi_vbr_obj != NULL && + !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) { + dto = dt_object_locate(tsi->tsi_vbr_obj, th->th_dev); + rc = dt_declare_version_set(env, dto, th); + } + + return rc; +} + +/* Update last_rcvd records with latests transaction data */ +int tgt_txn_stop_cb(const struct lu_env *env, struct thandle *th, + void *cookie) +{ + struct lu_target *tgt = cookie; + struct tgt_session_info *tsi; + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *obj = NULL; + int rc; + bool echo_client; + + if (env->le_ses == NULL) + return 0; + + tsi = tgt_ses_info(env); + /* OFD may start transaction without export assigned */ + if (tsi->tsi_exp == NULL) + return 0; + + echo_client = (tgt_ses_req(tsi) == NULL && tsi->tsi_xid == 0); + + if (tti->tti_has_trans && !echo_client) { + if (tti->tti_mult_trans == 0) { + CDEBUG(D_HA, "More than one transaction %llu\n", + tti->tti_transno); + RETURN(0); + } + /* we need another transno to be assigned */ + tti->tti_transno = 0; + } else if (th->th_result == 0) { + tti->tti_has_trans = 1; + } + + if (tsi->tsi_vbr_obj != NULL && + !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) { + obj = tsi->tsi_vbr_obj; + } + + if (unlikely(echo_client)) /* echo client special case */ + rc = tgt_last_rcvd_update_echo(env, tgt, obj, th, + tsi->tsi_exp); + else + rc = tgt_last_rcvd_update(env, tgt, obj, tsi->tsi_opdata, th, + tgt_ses_req(tsi)); + return rc; +} + +int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lsd_reply_data *lrd = &tti->tti_lrd; + unsigned long reply_data_size; + int rc; + struct lsd_reply_header *lrh = NULL; + struct tg_reply_data *trd = NULL; + int idx; + loff_t off; + struct cfs_hash *hash = NULL; + struct obd_export *exp; + struct tg_export_data *ted; + int reply_data_recovered = 0; + + rc = dt_attr_get(env, tgt->lut_reply_data, &tti->tti_attr); + if (rc) + GOTO(out, rc); + reply_data_size = (unsigned long)tti->tti_attr.la_size; + + OBD_ALLOC_PTR(lrh); + if (lrh == NULL) + GOTO(out, rc = -ENOMEM); + + if (reply_data_size == 0) { + CDEBUG(D_INFO, "%s: new reply_data file, initializing\n", + tgt_name(tgt)); + lrh->lrh_magic = LRH_MAGIC; + lrh->lrh_header_size = sizeof(struct lsd_reply_header); + lrh->lrh_reply_size = sizeof(struct lsd_reply_data); + rc = tgt_reply_header_write(env, tgt, lrh); + if (rc) { + CERROR("%s: error writing %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + } else { + rc = tgt_reply_header_read(env, tgt, lrh); + if (rc) { + CERROR("%s: error reading %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + if (lrh->lrh_magic != LRH_MAGIC || + lrh->lrh_header_size != sizeof(struct lsd_reply_header) || + lrh->lrh_reply_size != sizeof(struct lsd_reply_data)) { + CERROR("%s: invalid header in %s\n", + tgt_name(tgt), REPLY_DATA); + GOTO(out, rc = -EINVAL); + } + + hash = cfs_hash_getref(tgt->lut_obd->obd_gen_hash); + if (hash == NULL) + GOTO(out, rc = -ENODEV); + + OBD_ALLOC_PTR(trd); + if (trd == NULL) 
+ GOTO(out, rc = -ENOMEM); + + /* Load reply_data from disk */ + for (idx = 0, off = sizeof(struct lsd_reply_header); + off < reply_data_size; + idx++, off += sizeof(struct lsd_reply_data)) { + rc = tgt_reply_data_read(env, tgt, lrd, off); + if (rc) { + CERROR("%s: error reading %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + + exp = cfs_hash_lookup(hash, &lrd->lrd_client_gen); + if (exp == NULL) { + /* old reply data from a disconnected client */ + continue; + } + ted = &exp->exp_target_data; + mutex_lock(&ted->ted_lcd_lock); + + /* create in-memory reply_data and link it to + * target export's reply list */ + rc = tgt_set_reply_slot(tgt, idx); + if (rc != 0) { + mutex_unlock(&ted->ted_lcd_lock); + GOTO(out, rc); + } + trd->trd_reply = *lrd; + trd->trd_pre_versions[0] = 0; + trd->trd_pre_versions[1] = 0; + trd->trd_pre_versions[2] = 0; + trd->trd_pre_versions[3] = 0; + trd->trd_index = idx; + trd->trd_tag = 0; + list_add(&trd->trd_list, &ted->ted_reply_list); + ted->ted_reply_cnt++; + if (ted->ted_reply_cnt > ted->ted_reply_max) + ted->ted_reply_max = ted->ted_reply_cnt; + + CDEBUG(D_HA, "%s: restore reply %p: xid %llu, " + "transno %llu, client gen %u, slot idx %d\n", + tgt_name(tgt), trd, lrd->lrd_xid, + lrd->lrd_transno, lrd->lrd_client_gen, + trd->trd_index); + + /* update export last committed transation */ + exp->exp_last_committed = max(exp->exp_last_committed, + lrd->lrd_transno); + /* Update lcd_last_transno as well for check in + * tgt_release_reply_data() or the latest client + * transno can be lost. + */ + ted->ted_lcd->lcd_last_transno = + max(ted->ted_lcd->lcd_last_transno, + exp->exp_last_committed); + + mutex_unlock(&ted->ted_lcd_lock); + class_export_put(exp); + + /* update target last committed transaction */ + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = max(tgt->lut_last_transno, + lrd->lrd_transno); + spin_unlock(&tgt->lut_translock); + + reply_data_recovered++; + + OBD_ALLOC_PTR(trd); + if (trd == NULL) + GOTO(out, rc = -ENOMEM); + } + CDEBUG(D_INFO, "%s: %d reply data have been recovered\n", + tgt_name(tgt), reply_data_recovered); + } + + spin_lock(&tgt->lut_translock); + /* obd_last_committed is used for compatibility + * with other lustre recovery code */ + tgt->lut_obd->obd_last_committed = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + rc = 0; + +out: + if (hash != NULL) + cfs_hash_putref(hash); + if (trd != NULL) + OBD_FREE_PTR(trd); + if (lrh != NULL) + OBD_FREE_PTR(lrh); + return rc; +} + +struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, + __u64 xid) +{ + struct tg_reply_data *found = NULL; + struct tg_reply_data *reply; + + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry(reply, &ted->ted_reply_list, trd_list) { + if (reply->trd_reply.lrd_xid == xid) { + found = reply; + break; + } + } + mutex_unlock(&ted->ted_lcd_lock); + return found; +} +EXPORT_SYMBOL(tgt_lookup_reply_by_xid); + +/* Look for a reply data matching specified request @req + * A copy is returned in @trd if the pointer is not NULL + */ +bool tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct tg_reply_data *reply; + bool found = false; + + reply = tgt_lookup_reply_by_xid(ted, req->rq_xid); + if (reply != NULL) { + found = true; + if (trd != NULL) + *trd = *reply; + } + + CDEBUG(D_TRACE, "%s: lookup reply xid %llu, found %d\n", + tgt_name(class_exp2tgt(req->rq_export)), req->rq_xid, + found ? 
1 : 0); + + return found; +} +EXPORT_SYMBOL(tgt_lookup_reply); + +int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + if (trd->trd_reply.lrd_xid > rcvd_xid) + continue; + ted->ted_release_xid++; + tgt_release_reply_data(lut, ted, trd); + } + mutex_unlock(&ted->ted_lcd_lock); + + return 0; +} + +int tgt_handle_tag(struct obd_export *exp, __u16 tag) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + if (tag == 0) + return 0; + + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + if (trd->trd_tag != tag) + continue; + ted->ted_release_tag++; + tgt_release_reply_data(lut, ted, trd); + break; + } + mutex_unlock(&ted->ted_lcd_lock); + + return 0; +} + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_main.c b/drivers/staging/lustrefsx/lustre/target/tgt_main.c new file mode 100644 index 0000000000000..ce158941f9c06 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_main.c @@ -0,0 +1,764 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/target/tgt_main.c + * + * Lustre Unified Target main initialization code + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "tgt_internal.h" +#include "../ptlrpc/ptlrpc_internal.h" + +/* This must be longer than the longest string below */ +#define SYNC_STATES_MAXLEN 16 +static char *sync_lock_cancel_states[] = { + [SYNC_LOCK_CANCEL_NEVER] = "never", + [SYNC_LOCK_CANCEL_BLOCKING] = "blocking", + [SYNC_LOCK_CANCEL_ALWAYS] = "always", +}; + +/** + * Show policy for handling dirty data under a lock being cancelled. + * + * \param[in] kobj sysfs kobject + * \param[in] attr sysfs attribute + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + + return sprintf(buf, "%s\n", + sync_lock_cancel_states[tgt->lut_sync_lock_cancel]); +} +EXPORT_SYMBOL(sync_lock_cancel_show); + +/** + * Change policy for handling dirty data under a lock being cancelled. 
+ * + * This variable defines what action target takes upon lock cancel + * There are three possible modes: + * 1) never - never do sync upon lock cancel. This can lead to data + * inconsistencies if both the OST and client crash while writing a file + * that is also concurrently being read by another client. In these cases, + * this may allow the file data to "rewind" to an earlier state. + * 2) blocking - do sync only if there is blocking lock, e.g. if another + * client is trying to access this same object + * 3) always - do sync always + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + int val = -1; + enum tgt_sync_lock_cancel slc; + + if (count == 0 || count >= SYNC_STATES_MAXLEN) + return -EINVAL; + + for (slc = 0; slc < ARRAY_SIZE(sync_lock_cancel_states); slc++) { + if (strcmp(buffer, sync_lock_cancel_states[slc]) == 0) { + val = slc; + break; + } + } + + /* Legacy numeric codes */ + if (val == -1) { + int rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + } + + if (val < 0 || val > 2) + return -EINVAL; + + spin_lock(&tgt->lut_flags_lock); + tgt->lut_sync_lock_cancel = val; + spin_unlock(&tgt->lut_flags_lock); + return count; +} +EXPORT_SYMBOL(sync_lock_cancel_store); +LUSTRE_RW_ATTR(sync_lock_cancel); + +/** + * Show maximum number of Filter Modification Data (FMD) maintained. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%u\n", lut->lut_fmd_max_num); +} + +/** + * Change number of FMDs maintained by target. + * + * This defines how large the list of FMDs can be. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + int val, rc; + + rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) + return -EINVAL; + + lut->lut_fmd_max_num = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_count); + +/** + * Show the maximum age of FMD data in seconds. 
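+ *
+ * The value is a time in seconds; the store side below accepts 1..65536
+ * (roughly 18 hours at most).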
+ * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_seconds_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%lld\n", lut->lut_fmd_max_age); +} + +/** + * Set the maximum age of FMD data in seconds. + * + * This defines how long FMD data stays in the FMD list. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative number on error + */ +ssize_t tgt_fmd_seconds_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + time64_t val; + int rc; + + rc = kstrtoll(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) /* ~ 18 hour max */ + return -EINVAL; + + lut->lut_fmd_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_seconds); + +/* These two aliases are old names and kept for compatibility, they were + * changed to 'tgt_fmd_count' and 'tgt_fmd_seconds'. + * This change was made in Lustre 2.13, so these aliases can be removed + * when back compatibility is not needed with any Lustre version prior 2.13 + */ +static struct lustre_attr tgt_fmd_count_compat = __ATTR(client_cache_count, + 0644, tgt_fmd_count_show, tgt_fmd_count_store); +static struct lustre_attr tgt_fmd_seconds_compat = __ATTR(client_cache_seconds, + 0644, tgt_fmd_seconds_show, tgt_fmd_seconds_store); + +static const struct attribute *tgt_attrs[] = { + &lustre_attr_sync_lock_cancel.attr, + &lustre_attr_tgt_fmd_count.attr, + &lustre_attr_tgt_fmd_seconds.attr, + &tgt_fmd_count_compat.attr, + &tgt_fmd_seconds_compat.attr, + NULL, +}; + +int tgt_tunables_init(struct lu_target *lut) +{ + int rc; + + rc = sysfs_create_files(&lut->lut_obd->obd_kset.kobj, tgt_attrs); + if (!rc) + lut->lut_attrs = tgt_attrs; + return rc; +} +EXPORT_SYMBOL(tgt_tunables_init); + +void tgt_tunables_fini(struct lu_target *lut) +{ + if (lut->lut_attrs) { + sysfs_remove_files(&lut->lut_obd->obd_kset.kobj, + lut->lut_attrs); + lut->lut_attrs = NULL; + } +} +EXPORT_SYMBOL(tgt_tunables_fini); + +/* + * Save cross-MDT lock in lut_slc_locks. + * + * Lock R/W count is not saved, but released in unlock (not canceled remotely), + * instead only a refcount is taken, so that the remote MDT where the object + * resides can detect conflict with this lock there. + * + * \param lut target + * \param lock cross-MDT lock to save + * \param transno when the transaction with this transno is committed, this lock + * can be canceled. + */ +void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock, + __u64 transno) +{ + spin_lock(&lut->lut_slc_locks_guard); + lock_res_and_lock(lock); + if (ldlm_is_cbpending(lock)) { + /* if it was canceld by server, don't save, because remote MDT + * will do Sync-on-Cancel. */ + LDLM_LOCK_PUT(lock); + } else { + lock->l_transno = transno; + /* if this lock is in the list already, there are two operations + * both use this lock, and save it after use, so for the second + * one, just put the refcount. 
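+		 * The list holds a single reference per lock, so the extra
+		 * reference taken by the second user is dropped right away.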
*/ + if (list_empty(&lock->l_slc_link)) + list_add_tail(&lock->l_slc_link, &lut->lut_slc_locks); + else + LDLM_LOCK_PUT(lock); + } + unlock_res_and_lock(lock); + spin_unlock(&lut->lut_slc_locks_guard); +} +EXPORT_SYMBOL(tgt_save_slc_lock); + +/* + * Discard cross-MDT lock from lut_slc_locks. + * + * This is called upon BAST, just remove lock from lut_slc_locks and put lock + * refcount. The BAST will cancel this lock. + * + * \param lut target + * \param lock cross-MDT lock to discard + */ +void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock) +{ + spin_lock(&lut->lut_slc_locks_guard); + lock_res_and_lock(lock); + /* may race with tgt_cancel_slc_locks() */ + if (lock->l_transno != 0) { + LASSERT(!list_empty(&lock->l_slc_link)); + LASSERT(ldlm_is_cbpending(lock)); + list_del_init(&lock->l_slc_link); + lock->l_transno = 0; + LDLM_LOCK_PUT(lock); + } + unlock_res_and_lock(lock); + spin_unlock(&lut->lut_slc_locks_guard); +} +EXPORT_SYMBOL(tgt_discard_slc_lock); + +/* + * Cancel cross-MDT locks upon transaction commit. + * + * Remove cross-MDT locks from lut_slc_locks, cancel them and put lock refcount. + * + * \param lut target + * \param transno transaction with this number was committed. + */ +void tgt_cancel_slc_locks(struct lu_target *lut, __u64 transno) +{ + struct ldlm_lock *lock, *next; + LIST_HEAD(list); + struct lustre_handle lockh; + + spin_lock(&lut->lut_slc_locks_guard); + list_for_each_entry_safe(lock, next, &lut->lut_slc_locks, + l_slc_link) { + lock_res_and_lock(lock); + LASSERT(lock->l_transno != 0); + if (lock->l_transno > transno) { + unlock_res_and_lock(lock); + continue; + } + /* ouch, another operation is using it after it's saved */ + if (lock->l_readers != 0 || lock->l_writers != 0) { + unlock_res_and_lock(lock); + continue; + } + /* set CBPENDING so that this lock won't be used again */ + ldlm_set_cbpending(lock); + lock->l_transno = 0; + list_move(&lock->l_slc_link, &list); + unlock_res_and_lock(lock); + } + spin_unlock(&lut->lut_slc_locks_guard); + + list_for_each_entry_safe(lock, next, &list, l_slc_link) { + list_del_init(&lock->l_slc_link); + ldlm_lock2handle(lock, &lockh); + ldlm_cli_cancel(&lockh, LCF_ASYNC); + LDLM_LOCK_PUT(lock); + } +} + +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt, + struct tgt_opc_slice *slice, int request_fail_id, + int reply_fail_id) +{ + struct dt_object_format dof; + struct lu_attr attr; + struct lu_fid fid; + struct dt_object *o; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_statfs *osfs; + int i, rc = 0; + + ENTRY; + + LASSERT(lut); + LASSERT(obd); + lut->lut_obd = obd; + lut->lut_bottom = dt; + lut->lut_last_rcvd = NULL; + lut->lut_client_bitmap = NULL; + atomic_set(&lut->lut_num_clients, 0); + atomic_set(&lut->lut_client_generation, 0); + lut->lut_reply_data = NULL; + lut->lut_reply_bitmap = NULL; + obd->u.obt.obt_lut = lut; + obd->u.obt.obt_magic = OBT_MAGIC; + + /* set request handler slice and parameters */ + lut->lut_slice = slice; + lut->lut_reply_fail_id = reply_fail_id; + lut->lut_request_fail_id = request_fail_id; + + /* sptlrcp variables init */ + rwlock_init(&lut->lut_sptlrpc_lock); + sptlrpc_rule_set_init(&lut->lut_sptlrpc_rset); + + spin_lock_init(&lut->lut_flags_lock); + lut->lut_sync_lock_cancel = SYNC_LOCK_CANCEL_NEVER; + + spin_lock_init(&lut->lut_slc_locks_guard); + INIT_LIST_HEAD(&lut->lut_slc_locks); + + /* last_rcvd initialization is needed by replayable targets only */ + if (!obd->obd_replayable) + RETURN(0); + + /* 
initialize grant and statfs data in target */ + dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf); + + /* statfs data */ + spin_lock_init(&tgd->tgd_osfs_lock); + tgd->tgd_osfs_age = ktime_get_seconds() - 1000; + tgd->tgd_osfs_unstable = 0; + tgd->tgd_statfs_inflight = 0; + tgd->tgd_osfs_inflight = 0; + + /* grant data */ + spin_lock_init(&tgd->tgd_grant_lock); + tgd->tgd_tot_dirty = 0; + tgd->tgd_tot_granted = 0; + tgd->tgd_tot_pending = 0; + tgd->tgd_grant_compat_disable = 0; + + /* populate cached statfs data */ + osfs = &tgt_th_info(env)->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, 0, NULL); + if (rc != 0) { + CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + if (!is_power_of_2(osfs->os_bsize)) { + CERROR("%s: blocksize (%d) is not a power of 2\n", + tgt_name(lut), osfs->os_bsize); + GOTO(out, rc = -EPROTO); + } + tgd->tgd_blockbits = fls(osfs->os_bsize) - 1; + + spin_lock_init(&lut->lut_translock); + spin_lock_init(&lut->lut_client_bitmap_lock); + + OBD_ALLOC(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + if (lut->lut_client_bitmap == NULL) + RETURN(-ENOMEM); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof.dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(&fid, LAST_RECV_OID); + + o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr); + if (IS_ERR(o)) { + rc = PTR_ERR(o); + CERROR("%s: cannot open LAST_RCVD: rc = %d\n", tgt_name(lut), + rc); + GOTO(out_put, rc); + } + + lut->lut_last_rcvd = o; + rc = tgt_server_data_init(env, lut); + if (rc < 0) + GOTO(out_put, rc); + + /* prepare transactions callbacks */ + lut->lut_txn_cb.dtc_txn_start = tgt_txn_start_cb; + lut->lut_txn_cb.dtc_txn_stop = tgt_txn_stop_cb; + lut->lut_txn_cb.dtc_cookie = lut; + lut->lut_txn_cb.dtc_tag = LCT_DT_THREAD | LCT_MD_THREAD; + INIT_LIST_HEAD(&lut->lut_txn_cb.dtc_linkage); + + dt_txn_callback_add(lut->lut_bottom, &lut->lut_txn_cb); + lut->lut_bottom->dd_lu_dev.ld_site->ls_tgt = lut; + + lut->lut_fmd_max_num = LUT_FMD_MAX_NUM_DEFAULT; + lut->lut_fmd_max_age = LUT_FMD_MAX_AGE_DEFAULT; + + atomic_set(&lut->lut_sync_count, 0); + + /* reply_data is supported by MDT targets only for now */ + if (strncmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) != 0) + RETURN(0); + + OBD_ALLOC(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + if (lut->lut_reply_bitmap == NULL) + GOTO(out, rc = -ENOMEM); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof.dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(&fid, REPLY_DATA_OID); + + o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr); + if (IS_ERR(o)) { + rc = PTR_ERR(o); + CERROR("%s: cannot open REPLY_DATA: rc = %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + lut->lut_reply_data = o; + + rc = tgt_reply_data_init(env, lut); + if (rc < 0) + GOTO(out, rc); + + RETURN(0); + +out: + dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb); +out_put: + obd->u.obt.obt_magic = 0; + obd->u.obt.obt_lut = NULL; + if (lut->lut_last_rcvd != NULL) { + dt_object_put(env, lut->lut_last_rcvd); + lut->lut_last_rcvd = NULL; + } + if (lut->lut_client_bitmap != NULL) + OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + lut->lut_client_bitmap = NULL; + if (lut->lut_reply_data != NULL) + dt_object_put(env, lut->lut_reply_data); + lut->lut_reply_data = NULL; + if (lut->lut_reply_bitmap != NULL) { + for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) { + 
if (lut->lut_reply_bitmap[i] != NULL) + OBD_FREE_LARGE(lut->lut_reply_bitmap[i], + BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + lut->lut_reply_bitmap[i] = NULL; + } + OBD_FREE(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + } + lut->lut_reply_bitmap = NULL; + return rc; +} +EXPORT_SYMBOL(tgt_init); + +void tgt_fini(const struct lu_env *env, struct lu_target *lut) +{ + int i; + int rc; + ENTRY; + + if (lut->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && + atomic_read(&lut->lut_num_clients) == 0) { + /* Clear MULTI RPCS incompatibility flag that prevents previous + * Lustre versions to mount a target with reply_data file */ + lut->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; + rc = tgt_server_data_update(env, lut, 1); + if (rc < 0) + CERROR("%s: unable to clear MULTI RPCS " + "incompatibility flag\n", + lut->lut_obd->obd_name); + } + + sptlrpc_rule_set_free(&lut->lut_sptlrpc_rset); + + if (lut->lut_reply_data != NULL) + dt_object_put(env, lut->lut_reply_data); + lut->lut_reply_data = NULL; + if (lut->lut_reply_bitmap != NULL) { + for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) { + if (lut->lut_reply_bitmap[i] != NULL) + OBD_FREE_LARGE(lut->lut_reply_bitmap[i], + BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + lut->lut_reply_bitmap[i] = NULL; + } + OBD_FREE(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + } + lut->lut_reply_bitmap = NULL; + if (lut->lut_client_bitmap) { + OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + lut->lut_client_bitmap = NULL; + } + if (lut->lut_last_rcvd) { + dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb); + dt_object_put(env, lut->lut_last_rcvd); + lut->lut_last_rcvd = NULL; + } + EXIT; +} +EXPORT_SYMBOL(tgt_fini); + +static struct kmem_cache *tgt_thread_kmem; +static struct kmem_cache *tgt_session_kmem; +struct kmem_cache *tgt_fmd_kmem; + +static struct lu_kmem_descr tgt_caches[] = { + { + .ckd_cache = &tgt_thread_kmem, + .ckd_name = "tgt_thread_kmem", + .ckd_size = sizeof(struct tgt_thread_info), + }, + { + .ckd_cache = &tgt_session_kmem, + .ckd_name = "tgt_session_kmem", + .ckd_size = sizeof(struct tgt_session_info) + }, + { + .ckd_cache = &tgt_fmd_kmem, + .ckd_name = "tgt_fmd_cache", + .ckd_size = sizeof(struct tgt_fmd_data) + }, + { + .ckd_cache = NULL + } +}; + + +/* context key constructor/destructor: tg_key_init, tg_key_fini */ +static void *tgt_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_thread_info *thread; + + OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS); + if (thread == NULL) + return ERR_PTR(-ENOMEM); + + return thread; +} + +static void tgt_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_thread_info *info = data; + struct thandle_exec_args *args = &info->tti_tea; + int i; + + for (i = 0; i < args->ta_alloc_args; i++) { + if (args->ta_args[i] != NULL) + OBD_FREE_PTR(args->ta_args[i]); + } + + if (args->ta_args != NULL) + OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) * + args->ta_alloc_args); + OBD_SLAB_FREE_PTR(info, tgt_thread_kmem); +} + +static void tgt_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_thread_info *tti = data; + + tti->tti_has_trans = 0; + tti->tti_mult_trans = 0; +} + +/* context key: tg_thread_key */ +struct lu_context_key tgt_thread_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD, + .lct_init = tgt_key_init, + .lct_fini = tgt_key_fini, + 
.lct_exit = tgt_key_exit, +}; + +LU_KEY_INIT_GENERIC(tgt); + +static void *tgt_ses_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS); + if (session == NULL) + return ERR_PTR(-ENOMEM); + + return session; +} + +static void tgt_ses_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, tgt_session_kmem); +} + +/* context key: tgt_session_key */ +struct lu_context_key tgt_session_key = { + .lct_tags = LCT_SERVER_SESSION, + .lct_init = tgt_ses_key_init, + .lct_fini = tgt_ses_key_fini, +}; +EXPORT_SYMBOL(tgt_session_key); + +LU_KEY_INIT_GENERIC(tgt_ses); + +/* + * this page is allocated statically when module is initializing + * it is used to simulate data corruptions, see ost_checksum_bulk() + * for details. as the original pages provided by the layers below + * can be remain in the internal cache, we do not want to modify + * them. + */ +struct page *tgt_page_to_corrupt; + +int tgt_mod_init(void) +{ + int result; + ENTRY; + + result = lu_kmem_init(tgt_caches); + if (result != 0) + RETURN(result); + + tgt_page_to_corrupt = alloc_page(GFP_KERNEL); + + tgt_key_init_generic(&tgt_thread_key, NULL); + lu_context_key_register_many(&tgt_thread_key, NULL); + + tgt_ses_key_init_generic(&tgt_session_key, NULL); + lu_context_key_register_many(&tgt_session_key, NULL); + barrier_init(); + + update_info_init(); + + RETURN(0); +} + +void tgt_mod_exit(void) +{ + barrier_fini(); + if (tgt_page_to_corrupt != NULL) + put_page(tgt_page_to_corrupt); + + lu_context_key_degister(&tgt_thread_key); + lu_context_key_degister(&tgt_session_key); + update_info_fini(); + + lu_kmem_fini(tgt_caches); +} + diff --git a/drivers/staging/lustrefsx/lustre/target/update_records.c b/drivers/staging/lustrefsx/lustre/target/update_records.c new file mode 100644 index 0000000000000..5fb706c5090a5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_records.c @@ -0,0 +1,1233 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ + +/* + * lustre/target/update_records.c + * + * This file implement the methods to pack updates as update records, which + * will be written to the disk as llog record, and might be used during + * recovery. + * + * For cross-MDT operation, all of updates of the operation needs to be + * recorded in the disk, then during recovery phase, the recovery thread + * will retrieve and redo these updates if it needed. + * + * See comments above struct update_records for the format of update_records. 
+ * + * Author: Di Wang + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +#define UPDATE_RECORDS_BUFFER_SIZE 8192 +#define UPDATE_PARAMS_BUFFER_SIZE 8192 +/** + * Dump update record. + * + * Dump all of updates in the update_records, mostly for debugging purpose. + * + * \param[in] records update records to be dumpped + * \param[in] mask debug level mask + * \param[in] dump_params if dump all of updates the updates. + * + */ +void update_records_dump(const struct update_records *records, + unsigned int mask, bool dump_updates) +{ + const struct update_ops *ops; + const struct update_op *op = NULL; + struct update_params *params = NULL; + unsigned int i; + + CDEBUG(mask, "master transno = %llu batchid = %llu flags = %x" + " ops = %d params = %d\n", records->ur_master_transno, + records->ur_batchid, records->ur_flags, records->ur_update_count, + records->ur_param_count); + + if (records->ur_update_count == 0) + return; + + if (!dump_updates) + return; + + ops = &records->ur_ops; + if (records->ur_param_count > 0) + params = update_records_get_params(records); + + op = &ops->uops_op[0]; + for (i = 0; i < records->ur_update_count; i++, + op = update_op_next_op(op)) { + unsigned int j; + + CDEBUG(mask, "update %dth "DFID" %s params_count = %hu\n", i, + PFID(&op->uop_fid), update_op_str(op->uop_type), + op->uop_param_count); + + if (params == NULL) + continue; + + for (j = 0; j < op->uop_param_count; j++) { + struct object_update_param *param; + + param = update_params_get_param(params, + (unsigned int)op->uop_params_off[j], + records->ur_param_count); + + if (param == NULL) + continue; + CDEBUG(mask, "param = %p %dth off = %hu size = %hu\n", + param, j, op->uop_params_off[j], param->oup_len); + } + } +} + +/** + * Pack parameters to update records + * + * Find and insert parameter to update records, if the parameter + * already exists in \a params, then just return the offset of this + * parameter, otherwise insert the parameter and return its offset + * + * \param[in] params update params in which to insert parameter + * \param[in] new_param parameters to be inserted. + * \param[in] new_param_size the size of \a new_param + * + * \retval index inside \a params if parameter insertion + * succeeds. + * \retval negative errno if it fails. + */ +static unsigned int update_records_param_pack(struct update_params *params, + const void *new_param, + size_t new_param_size, + unsigned int *param_count) +{ + struct object_update_param *param; + unsigned int i; + + for (i = 0; i < *param_count; i++) { + struct object_update_param *param; + + param = update_params_get_param(params, i, *param_count); + if ((new_param == NULL && param->oup_len == new_param_size) || + (param->oup_len == new_param_size && + memcmp(param->oup_buf, new_param, new_param_size) == 0)) + /* Found the parameter and return its index */ + return i; + } + + param = (struct object_update_param *)((char *)params + + update_params_size(params, *param_count)); + + param->oup_len = new_param_size; + if (new_param != NULL) + memcpy(param->oup_buf, new_param, new_param_size); + + *param_count = *param_count + 1; + + return *param_count - 1; +} + +/** + * Pack update to update records + * + * Pack the update and its parameters to the update records. First it will + * insert parameters, get the offset of these parameter, then fill the + * update with these offset. 
If insertion exceed the maximum size of + * current update records, it will return -E2BIG here, and the caller might + * extend the update_record size \see lod_updates_pack. + * + * \param[in] env execution environment + * \param[in] fid FID of the update. + * \param[in] op_type operation type of the update + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] param_bufs buffers of parameters + * \param[in] params_buf_count the count of the parameter buffers + * \param[in] param_size sizes of parameters + * + * \retval 0 if packing succeeds + * \retval negative errno if packing fails + */ +static int update_records_update_pack(const struct lu_env *env, + const struct lu_fid *fid, + enum update_type op_type, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_op_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + unsigned int param_bufs_count, + const void **param_bufs, + size_t *param_sizes) +{ + struct update_op *op; + size_t total_param_sizes = 0; + int index; + unsigned int i; + + /* Check whether the packing exceeding the maximum update size */ + if (unlikely(*max_op_size < update_op_size(param_bufs_count))) { + CDEBUG(D_INFO, "max_op_size = %zu update_op = %zu\n", + *max_op_size, update_op_size(param_bufs_count)); + *max_op_size = update_op_size(param_bufs_count); + return -E2BIG; + } + + for (i = 0; i < param_bufs_count; i++) + total_param_sizes += + cfs_size_round(sizeof(struct object_update_param) + + param_sizes[i]); + + /* Check whether the packing exceeding the maximum parameter size */ + if (unlikely(*max_param_size < total_param_sizes)) { + CDEBUG(D_INFO, "max_param_size = %zu params size = %zu\n", + *max_param_size, total_param_sizes); + + *max_param_size = total_param_sizes; + return -E2BIG; + } + + op = update_ops_get_op(ops, *op_count, *op_count); + op->uop_fid = *fid; + op->uop_type = op_type; + op->uop_param_count = param_bufs_count; + for (i = 0; i < param_bufs_count; i++) { + index = update_records_param_pack(params, param_bufs[i], + param_sizes[i], param_count); + if (index < 0) + return index; + + CDEBUG(D_INFO, "%s %uth param offset = %d size = %zu\n", + update_op_str(op_type), i, index, param_sizes[i]); + + op->uop_params_off[i] = index; + } + CDEBUG(D_INFO, "%huth "DFID" %s param_count = %u\n", + *op_count, PFID(fid), update_op_str(op_type), *param_count); + + *op_count = *op_count + 1; + + return 0; +} + +/** + * Calculate update_records size + * + * Calculate update_records size by param_count and param_sizes array. 
+ * + * \param[in] param_count the count of parameters + * \param[in] sizes the size array of these parameters + * + * \retval the size of this update + */ +static size_t update_records_update_size(__u32 param_count, size_t *sizes) +{ + int i; + size_t size; + + /* Check whether the packing exceeding the maximum update size */ + size = update_op_size(param_count); + + for (i = 0; i < param_count; i++) + size += cfs_size_round(sizeof(struct object_update_param) + + sizes[i]); + + return size; +} + +/** + * Calculate create update size + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] fid FID of the object to be created + * \param[in] attr attribute of the object to be created + * \param[in] hint creation hint + * \param[in] dof creation format information + * + * \retval size of create update. + */ +size_t update_records_create_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + size_t sizes[2]; + int param_count = 0; + + if (attr != NULL) { + sizes[param_count] = sizeof(struct obdo); + param_count++; + } + + if (hint != NULL && hint->dah_parent != NULL) { + sizes[param_count] = sizeof(*fid); + param_count++; + } + + return update_records_update_size(param_count, sizes); +} +EXPORT_SYMBOL(update_records_create_size); + +/** + * Pack create update + * + * Pack create update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to be created + * \param[in] attr attribute of the object to be created + * \param[in] hint creation hint + * \param[in] dof creation format information + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_create_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + size_t sizes[2]; + const void *bufs[2]; + int buf_count = 0; + const struct lu_fid *parent_fid = NULL; + struct lu_fid tmp_fid; + int rc; + struct obdo *obdo; + + if (attr != NULL) { + obdo = &update_env_info(env)->uti_obdo; + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + bufs[buf_count] = obdo; + sizes[buf_count] = sizeof(*obdo); + buf_count++; + } + + if (hint != NULL && hint->dah_parent != NULL) { + parent_fid = lu_object_fid(&hint->dah_parent->do_lu); + fid_cpu_to_le(&tmp_fid, parent_fid); + bufs[buf_count] = &tmp_fid; + sizes[buf_count] = sizeof(tmp_fid); + buf_count++; + } + + rc = update_records_update_pack(env, fid, OUT_CREATE, ops, op_count, + max_ops_size, params, param_count, + max_param_size, buf_count, bufs, sizes); + return rc; +} +EXPORT_SYMBOL(update_records_create_pack); + +/** + * Calculate attr set update size + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] fid FID of the object to set attr + * \param[in] attr attribute of attr set + * + * \retval size of attr set update. + */ +size_t update_records_attr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr) +{ + size_t size = sizeof(struct obdo); + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_attr_set_size); + +/** + * Pack attr set update + * + * Pack attr_set update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to set attr + * \param[in] attr attribute of attr set + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_attr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr) +{ + struct obdo *obdo = &update_env_info(env)->uti_obdo; + size_t size = sizeof(*obdo); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + return update_records_update_pack(env, fid, OUT_ATTR_SET, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 1, + (const void **)&obdo, &size); +} +EXPORT_SYMBOL(update_records_attr_set_pack); + +/** + * Calculate ref add update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to add reference + * + * \retval size of ref_add udpate. + */ +size_t update_records_ref_add_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_ref_add_size); + +/** + * Pack ref add update + * + * Pack ref add update into update records. 
+ * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to add reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_ref_add_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_REF_ADD, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_ref_add_pack); + +/** + * Pack noop update + * + * Pack no op update into update records. Note: no op means + * the update does not need do anything, which is only used + * in test case to verify large size record. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to add reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_noop_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_NOOP, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_noop_pack); + +/** + * Calculate ref del update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete reference + * + * \retval size of ref_del update. + */ +size_t update_records_ref_del_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_ref_del_size); + +/** + * Pack ref del update + * + * Pack ref del update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_ref_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_REF_DEL, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_ref_del_pack); + +/** + * Calculate object destroy update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete reference + * + * \retval size of object destroy update. + */ +size_t update_records_destroy_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_destroy_size); + +/** + * Pack object destroy update + * + * Pack object destroy update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_destroy_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_DESTROY, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_destroy_pack); + +/** + * Calculate index insert update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to insert index + * \param[in] rec record of insertion + * \param[in] key key of insertion + * + * \retval the size of index insert update. + */ +size_t update_records_index_insert_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key) +{ + size_t sizes[3] = { strlen((const char *)key) + 1, + sizeof(struct lu_fid), + sizeof(__u32) }; + return update_records_update_size(3, sizes); +} +EXPORT_SYMBOL(update_records_index_insert_size); + +/** + * Pack index insert update + * + * Pack index insert update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to insert index + * \param[in] rec record of insertion + * \param[in] key key of insertion + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_index_insert_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key) +{ + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + struct lu_fid rec_fid; + __u32 type = cpu_to_le32(rec1->rec_type); + size_t sizes[3] = { strlen((const char *)key) + 1, + sizeof(rec_fid), + sizeof(type) }; + const void *bufs[3] = { key, + &rec_fid, + &type }; + + fid_cpu_to_le(&rec_fid, rec1->rec_fid); + + return update_records_update_pack(env, fid, OUT_INDEX_INSERT, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 3, bufs, sizes); +} +EXPORT_SYMBOL(update_records_index_insert_pack); + +/** + * Calculate index delete update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete index + * \param[in] key key of deletion + * + * \retval the size of index delete update + */ +size_t update_records_index_delete_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_key *key) +{ + size_t size = strlen((const char *)key) + 1; + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_index_delete_size); + +/** + * Pack index delete update + * + * Pack index delete update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|ount] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete index + * \param[in] key key of deletion + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_index_delete_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_key *key) +{ + size_t size = strlen((const char *)key) + 1; + + return update_records_update_pack(env, fid, OUT_INDEX_DELETE, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 1, (const void **)&key, &size); +} +EXPORT_SYMBOL(update_records_index_delete_pack); + +/** + * Calculate xattr set size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to set xattr + * \param[in] buf xattr to be set + * \param[in] name name of the xattr + * \param[in] flag flag for setting xattr + * + * \retval size of xattr set update. + */ +size_t update_records_xattr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + const char *name, __u32 flag) +{ + size_t sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + + return update_records_update_size(3, sizes); +} +EXPORT_SYMBOL(update_records_xattr_set_size); + +/** + * Pack xattr set update + * + * Pack xattr set update into update records. 
+ * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to set xattr + * \param[in] buf xattr to be set + * \param[in] name name of the xattr + * \param[in] flag flag for setting xattr + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_xattr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, + __u32 flag) +{ + size_t sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + const void *bufs[3] = {name, buf->lb_buf, &flag}; + + flag = cpu_to_le32(flag); + + return update_records_update_pack(env, fid, OUT_XATTR_SET, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 3, bufs, sizes); +} +EXPORT_SYMBOL(update_records_xattr_set_pack); + +/** + * Calculate xattr delete update size. + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete xattr + * \param[in] name name of the xattr + * + * \retval size of xattr delet updatee. + */ +size_t update_records_xattr_del_size(const struct lu_env *env, + const struct lu_fid *fid, + const char *name) +{ + size_t size = strlen(name) + 1; + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_xattr_del_size); + +/** + * Pack xattr delete update + * + * Pack xattr delete update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete xattr + * \param[in] name name of the xattr + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_xattr_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const char *name) +{ + size_t size = strlen(name) + 1; + + return update_records_update_pack(env, fid, OUT_XATTR_DEL, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 1, (const void **)&name, &size); +} +EXPORT_SYMBOL(update_records_xattr_del_pack); + +/** + * Calculate write update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to write into + * \param[in] buf buffer to write which includes an embedded size field + * \param[in] pos offet in the object to start writing at + * + * \retval size of write udpate. 
+ */ +size_t update_records_write_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos) +{ + size_t sizes[2] = {buf->lb_len, sizeof(pos)}; + + return update_records_update_size(2, sizes); +} +EXPORT_SYMBOL(update_records_write_size); + +/** + * Pack write update + * + * Pack write update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to write into + * \param[in] buf buffer to write which includes an embedded size field + * \param[in] pos offet in the object to start writing at + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_write_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos) +{ + size_t sizes[2] = {buf->lb_len, sizeof(pos)}; + const void *bufs[2] = {buf->lb_buf, &pos}; + + pos = cpu_to_le64(pos); + + return update_records_update_pack(env, fid, OUT_WRITE, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 2, bufs, sizes); +} +EXPORT_SYMBOL(update_records_write_pack); + +/** + * Calculate size of punch update. + * + * \param[in] env execution environment + * \param[in] fid FID of the object to write into + * \param[in] start start offset of punch + * \param[in] end end offet of punch + * + * \retval size of update punch. + */ +size_t update_records_punch_size(const struct lu_env *env, + const struct lu_fid *fid, + __u64 start, __u64 end) +{ + size_t sizes[2] = {sizeof(start), sizeof(end)}; + + return update_records_update_size(2, sizes); +} +EXPORT_SYMBOL(update_records_punch_size); + +/** + * Pack punch + * + * Pack punch update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to write into + * \param[in] start start offset of punch + * \param[in] end end offet of punch + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_punch_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + __u64 start, __u64 end) +{ + size_t sizes[2] = {sizeof(start), sizeof(end)}; + const void *bufs[2] = {&start, &end}; + + start = cpu_to_le64(start); + end = cpu_to_le64(end); + + return update_records_update_pack(env, fid, OUT_PUNCH, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 2, bufs, sizes); +} +EXPORT_SYMBOL(update_records_punch_pack); + +/** + * Create update records in thandle_update_records + * + * Allocate update_records for thandle_update_records, the initial size + * will be 4KB. + * + * \param[in] tur thandle_update_records where update_records will be + * allocated + * \retval 0 if allocation succeeds. + * \retval negative errno if allocation fails. + */ +static int tur_update_records_create(struct thandle_update_records *tur) +{ + if (tur->tur_update_records != NULL) + return 0; + + OBD_ALLOC_LARGE(tur->tur_update_records, + UPDATE_RECORDS_BUFFER_SIZE); + + if (tur->tur_update_records == NULL) + return -ENOMEM; + + tur->tur_update_records_buf_size = UPDATE_RECORDS_BUFFER_SIZE; + + return 0; +} + +/** + * Extend update records + * + * Extend update_records to the new size in thandle_update_records. + * + * \param[in] tur thandle_update_records where update_records will be + * extended. + * \retval 0 if extension succeeds. + * \retval negative errno if extension fails. + */ +int tur_update_records_extend(struct thandle_update_records *tur, + size_t new_size) +{ + struct llog_update_record *record; + + OBD_ALLOC_LARGE(record, new_size); + if (record == NULL) + return -ENOMEM; + + if (tur->tur_update_records != NULL) { + memcpy(record, tur->tur_update_records, + tur->tur_update_records_buf_size); + OBD_FREE_LARGE(tur->tur_update_records, + tur->tur_update_records_buf_size); + } + + tur->tur_update_records = record; + tur->tur_update_records_buf_size = new_size; + + return 0; +} +EXPORT_SYMBOL(tur_update_records_extend); + +/** + * Extend update records + * + * Extend update records in thandle to make sure it is able to hold + * the update with certain update_op and params size. + * + * \param [in] tur thandle_update_records to be extend + * \param [in] new_op_size update_op size of the update record + * \param [in] new_param_size params size of the update record + * + * \retval 0 if the update_records is being extended. + * \retval negative errno if the update_records is not being + * extended. 
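+ *
+ * For illustration, a caller packing updates (for example the
+ * lod_updates_pack() path mentioned earlier) is expected to retry the
+ * pack after extending the buffers. A minimal caller-side sketch, with
+ * hypothetical local variable names, could look like:
+ *
+ *	again:
+ *		rc = update_records_create_pack(env, ops, &op_count,
+ *						&max_op_size, params,
+ *						&param_count, &max_param_size,
+ *						fid, attr, hint, dof);
+ *		if (rc == -E2BIG) {
+ *			rc = tur_update_extend(tur, max_op_size,
+ *					       max_param_size);
+ *			if (rc == 0)
+ *				goto again;
+ *		}
+ *
+ * Note that ops, params and the remaining max_*_size values must be
+ * re-derived from \a tur after a successful extend, since the underlying
+ * buffers may have been reallocated.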
+ */ +int tur_update_extend(struct thandle_update_records *tur, + size_t new_op_size, size_t new_param_size) +{ + size_t record_size; + size_t params_size; + size_t extend_size; + int rc; + ENTRY; + + record_size = llog_update_record_size(tur->tur_update_records); + /* extend update records buffer */ + if (new_op_size >= (tur->tur_update_records_buf_size - record_size)) { + extend_size = round_up(new_op_size, UPDATE_RECORDS_BUFFER_SIZE); + rc = tur_update_records_extend(tur, + tur->tur_update_records_buf_size + + extend_size); + if (rc != 0) + RETURN(rc); + } + + /* extend parameters buffer */ + params_size = update_params_size(tur->tur_update_params, + tur->tur_update_param_count); + if (new_param_size >= (tur->tur_update_params_buf_size - + params_size)) { + extend_size = round_up(new_param_size, + UPDATE_PARAMS_BUFFER_SIZE); + rc = tur_update_params_extend(tur, + tur->tur_update_params_buf_size + + extend_size); + if (rc != 0) + RETURN(rc); + } + + RETURN(0); +} +EXPORT_SYMBOL(tur_update_extend); + +/** + * Create update params in thandle_update_records + * + * Allocate update_params for thandle_update_records, the initial size + * will be 4KB. + * + * \param[in] tur thandle_update_records where update_params will be + * allocated + * \retval 0 if allocation succeeds. + * \retval negative errno if allocation fails. + */ +static int tur_update_params_create(struct thandle_update_records *tur) +{ + if (tur->tur_update_params != NULL) + return 0; + + OBD_ALLOC_LARGE(tur->tur_update_params, UPDATE_PARAMS_BUFFER_SIZE); + if (tur->tur_update_params == NULL) + return -ENOMEM; + + tur->tur_update_params_buf_size = UPDATE_PARAMS_BUFFER_SIZE; + return 0; +} + +/** + * Extend update params + * + * Extend update_params to the new size in thandle_update_records. + * + * \param[in] tur thandle_update_records where update_params will be + * extended. + * \retval 0 if extension succeeds. + * \retval negative errno if extension fails. + */ +int tur_update_params_extend(struct thandle_update_records *tur, + size_t new_size) +{ + struct update_params *params; + + OBD_ALLOC_LARGE(params, new_size); + if (params == NULL) + return -ENOMEM; + + if (tur->tur_update_params != NULL) { + memcpy(params, tur->tur_update_params, + tur->tur_update_params_buf_size); + OBD_FREE_LARGE(tur->tur_update_params, + tur->tur_update_params_buf_size); + } + + tur->tur_update_params = params; + tur->tur_update_params_buf_size = new_size; + + return 0; +} +EXPORT_SYMBOL(tur_update_params_extend); + +/** + * Check and prepare whether it needs to record update. + * + * Checks if the transaction needs to record updates, and if it + * does, then initialize the update record buffer in the transaction. + * + * \param[in] env execution environment + * \param[in] th transaction handle + * + * \retval 0 if updates recording succeeds. + * \retval negative errno if updates recording fails. 
+ */ +int check_and_prepare_update_record(const struct lu_env *env, + struct thandle_update_records *tur) +{ + struct llog_update_record *lur; + int rc; + + if (tur->tur_update_records == NULL) { + rc = tur_update_records_create(tur); + if (rc < 0) + RETURN(rc); + } + + if (tur->tur_update_params == NULL) { + rc = tur_update_params_create(tur); + if (rc < 0) + RETURN(rc); + } + + lur = tur->tur_update_records; + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + lur->lur_update_rec.ur_master_transno = 0; + lur->lur_update_rec.ur_batchid = 0; + lur->lur_update_rec.ur_flags = 0; + lur->lur_hdr.lrh_len = LLOG_MIN_CHUNK_SIZE; + + tur->tur_update_param_count = 0; + + RETURN(0); +} + +static void update_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct update_thread_info *info = data; + struct thandle_exec_args *args = &info->uti_tea; + int i; + + for (i = 0; i < args->ta_alloc_args; i++) { + if (args->ta_args[i] != NULL) + OBD_FREE_PTR(args->ta_args[i]); + } + + if (args->ta_args != NULL) + OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) * + args->ta_alloc_args); + + if (info->uti_tur.tur_update_records != NULL) + OBD_FREE_LARGE(info->uti_tur.tur_update_records, + info->uti_tur.tur_update_records_buf_size); + if (info->uti_tur.tur_update_params != NULL) + OBD_FREE_LARGE(info->uti_tur.tur_update_params, + info->uti_tur.tur_update_params_buf_size); + + OBD_FREE_PTR(info); +} + +/* context key constructor/destructor: update_key_init, update_key_fini */ +LU_KEY_INIT(update, struct update_thread_info); +/* context key: update_thread_key */ +LU_CONTEXT_KEY_DEFINE(update, LCT_MD_THREAD | LCT_MG_THREAD | + LCT_DT_THREAD | LCT_LOCAL); +EXPORT_SYMBOL(update_thread_key); +LU_KEY_INIT_GENERIC(update); + +void update_info_init(void) +{ + update_key_init_generic(&update_thread_key, NULL); + lu_context_key_register(&update_thread_key); +} + +void update_info_fini(void) +{ + lu_context_key_degister(&update_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/target/update_recovery.c b/drivers/staging/lustrefsx/lustre/target/update_recovery.c new file mode 100644 index 0000000000000..ac47105a633b9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_recovery.c @@ -0,0 +1,1447 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ + +/* + * lustre/target/update_recovery.c + * + * This file implement the methods to handle the update recovery. + * + * During DNE recovery, the recovery thread will redo the operation according + * to the transaction no, and these replay are either from client replay req + * or update replay records(for distribute transaction) in the update log. 
+ * For distribute transaction replay, the replay thread will call + * distribute_txn_replay_handle() to handle the updates. + * + * After the Master MDT restarts, it will retrieve the update records from all + * of MDTs, for each distributed operation, it will check updates on all MDTs, + * if some updates records are missing on some MDTs, the replay thread will redo + * updates on these MDTs. + * + * Author: Di Wang + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +/** + * Lookup distribute_txn_replay req + * + * Lookup distribute_txn_replay in the replay list by batchid. + * It is assumed the list has been locked before calling this function. + * + * \param[in] tdtd distribute_txn_data, which holds the replay + * list. + * \param[in] batchid batchid used by lookup. + * + * \retval pointer of the replay if succeeds. + * \retval NULL if can not find it. + */ +static struct distribute_txn_replay_req * +dtrq_lookup(struct target_distribute_txn_data *tdtd, __u64 batchid) +{ + struct distribute_txn_replay_req *tmp; + struct distribute_txn_replay_req *dtrq = NULL; + + list_for_each_entry(tmp, &tdtd->tdtd_replay_list, dtrq_list) { + if (tmp->dtrq_batchid == batchid) { + dtrq = tmp; + break; + } + } + return dtrq; +} + +/** + * insert distribute txn replay req + * + * Insert distribute txn replay to the replay list, and it assumes the + * list has been looked. Note: the replay list is a sorted list, which + * is sorted by master transno. It is assumed the replay list has been + * locked before calling this function. + * + * \param[in] tdtd target distribute txn data where replay list is + * \param[in] new distribute txn replay to be inserted + * + * \retval 0 if insertion succeeds + * \retval EEXIST if the dtrq already exists + */ +static int dtrq_insert(struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *new) +{ + struct distribute_txn_replay_req *iter; + + /* Check if the dtrq has been added to the list */ + iter = dtrq_lookup(tdtd, new->dtrq_batchid); + if (iter != NULL) + return -EEXIST; + + list_for_each_entry_reverse(iter, &tdtd->tdtd_replay_list, dtrq_list) { + if (iter->dtrq_master_transno > new->dtrq_master_transno) + continue; + + /* If there are mulitple replay req with same transno, then + * sort them with batchid */ + if (iter->dtrq_master_transno == new->dtrq_master_transno && + iter->dtrq_batchid > new->dtrq_batchid) + continue; + + list_add(&new->dtrq_list, &iter->dtrq_list); + break; + } + + if (list_empty(&new->dtrq_list)) + list_add(&new->dtrq_list, &tdtd->tdtd_replay_list); + + return 0; +} + +/** + * create distribute txn replay req + * + * Allocate distribute txn replay req according to the update records. + * + * \param[in] tdtd target distribute txn data where replay list is. + * \param[in] record update records from the update log. + * + * \retval the pointer of distribute txn replay req if + * the creation succeeds. + * \retval NULL if the creation fails. 
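+ *
+ * The created req is inserted into the replay list by dtrq_insert(), which
+ * keeps the list sorted by (dtrq_master_transno, dtrq_batchid). As a rough
+ * sketch of that ordering (not a helper that exists in this code):
+ *
+ *	static bool dtrq_ordered_before(struct distribute_txn_replay_req *a,
+ *					struct distribute_txn_replay_req *b)
+ *	{
+ *		if (a->dtrq_master_transno != b->dtrq_master_transno)
+ *			return a->dtrq_master_transno <
+ *			       b->dtrq_master_transno;
+ *		return a->dtrq_batchid < b->dtrq_batchid;
+ *	}
+ *
+ * so reqs are replayed in master transno order, with batchid breaking ties.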
+ */ +static struct distribute_txn_replay_req * +dtrq_create(struct target_distribute_txn_data *tdtd, + struct llog_update_record *lur) +{ + struct distribute_txn_replay_req *new; + + OBD_ALLOC_PTR(new); + if (new == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + new->dtrq_lur_size = llog_update_record_size(lur); + OBD_ALLOC_LARGE(new->dtrq_lur, new->dtrq_lur_size); + if (new->dtrq_lur == NULL) { + OBD_FREE_PTR(new); + RETURN(ERR_PTR(-ENOMEM)); + } + + memcpy(new->dtrq_lur, lur, new->dtrq_lur_size); + + /* If the transno in the update record is 0, it means the + * update are from master MDT, and it will use the master + * last committed transno as its master transno. Later, if + * the update records are gotten from slave MDTs, then these + * transno will be replaced. + * See insert_update_records_to_replay_list(). */ + if (lur->lur_update_rec.ur_master_transno == 0) { + new->dtrq_lur->lur_update_rec.ur_master_transno = + tdtd->tdtd_lut->lut_obd->obd_last_committed; + new->dtrq_master_transno = + tdtd->tdtd_lut->lut_obd->obd_last_committed; + } else { + new->dtrq_master_transno = + lur->lur_update_rec.ur_master_transno; + } + + new->dtrq_batchid = lur->lur_update_rec.ur_batchid; + + spin_lock_init(&new->dtrq_sub_list_lock); + INIT_LIST_HEAD(&new->dtrq_sub_list); + INIT_LIST_HEAD(&new->dtrq_list); + + RETURN(new); +} + +/** + * Lookup distribute sub replay + * + * Lookup distribute sub replay in the sub list of distribute_txn_replay by + * mdt_index. + * + * \param[in] distribute_txn_replay_req the distribute txn replay req to lookup + * \param[in] mdt_index the mdt_index as the key of lookup + * + * \retval the pointer of sub replay if it can be found. + * \retval NULL if it can not find. + */ +struct distribute_txn_replay_req_sub * +dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index) +{ + struct distribute_txn_replay_req_sub *dtrqs = NULL; + struct distribute_txn_replay_req_sub *tmp; + + list_for_each_entry(tmp, &dtrq->dtrq_sub_list, dtrqs_list) { + if (tmp->dtrqs_mdt_index == mdt_index) { + dtrqs = tmp; + break; + } + } + return dtrqs; +} + +/** + * Try to add cookie to sub distribute txn request + * + * Check if the update log cookie has been added to the request, if not, + * add it to the dtrqs_cookie_list. + * + * \param[in] dtrqs sub replay req where cookies to be added. + * \param[in] cookie cookie to be added. + * + * \retval 0 if the cookie is adding succeeds. + * \retval negative errno if adding fails. + */ +static int dtrq_sub_add_cookie(struct distribute_txn_replay_req_sub *dtrqs, + struct llog_cookie *cookie) +{ + struct sub_thandle_cookie *new; + + OBD_ALLOC_PTR(new); + if (new == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&new->stc_list); + new->stc_cookie = *cookie; + /* Note: only single thread will access one sub_request each time, + * so no need lock here */ + list_add(&new->stc_list, &dtrqs->dtrqs_cookie_list); + + return 0; +} + +/** + * Insert distribute txn sub req replay + * + * Allocate sub replay req and insert distribute txn replay list. + * + * \param[in] dtrq d to be added + * \param[in] cookie the cookie of the update record + * \param[in] mdt_index the mdt_index of the update record + * + * \retval 0 if the adding succeeds. + * \retval negative errno if the adding fails. 
+ */ +static int +dtrq_sub_create_and_insert(struct distribute_txn_replay_req *dtrq, + struct llog_cookie *cookie, + __u32 mdt_index) +{ + struct distribute_txn_replay_req_sub *dtrqs = NULL; + struct distribute_txn_replay_req_sub *new; + int rc; + ENTRY; + + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + spin_unlock(&dtrq->dtrq_sub_list_lock); + if (dtrqs != NULL) { + rc = dtrq_sub_add_cookie(dtrqs, cookie); + RETURN(0); + } + + OBD_ALLOC_PTR(new); + if (new == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&new->dtrqs_list); + INIT_LIST_HEAD(&new->dtrqs_cookie_list); + new->dtrqs_mdt_index = mdt_index; + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + if (dtrqs == NULL) { + list_add(&new->dtrqs_list, &dtrq->dtrq_sub_list); + dtrqs = new; + } else { + OBD_FREE_PTR(new); + } + spin_unlock(&dtrq->dtrq_sub_list_lock); + + rc = dtrq_sub_add_cookie(dtrqs, cookie); + + RETURN(rc); +} + +/** + * append updates to the current replay updates + * + * Append more updates to the existent replay update. And this is only + * used when combining mulitple updates into one large updates during + * replay. + * + * \param[in] dtrq the update replay request where the new update + * records will be added. + * \param[in] lur the new update record. + * + * \retval 0 if appending succeeds. + * \retval negative errno if appending fails. + */ +static int dtrq_append_updates(struct distribute_txn_replay_req *dtrq, + struct update_records *record) +{ + struct llog_update_record *new_lur; + size_t lur_size = dtrq->dtrq_lur_size; + void *ptr; + ENTRY; + + /* Because several threads might retrieve the same records from + * different targets, and we only need one copy of records. So + * we will check if the records is in the next one, if not, just + * skip it */ + spin_lock(&dtrq->dtrq_sub_list_lock); + if (dtrq->dtrq_lur->lur_update_rec.ur_index + 1 != record->ur_index) { + spin_unlock(&dtrq->dtrq_sub_list_lock); + RETURN(0); + } + dtrq->dtrq_lur->lur_update_rec.ur_index++; + spin_unlock(&dtrq->dtrq_sub_list_lock); + + lur_size += update_records_size(record); + OBD_ALLOC_LARGE(new_lur, lur_size); + if (new_lur == NULL) { + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrq->dtrq_lur->lur_update_rec.ur_index--; + spin_unlock(&dtrq->dtrq_sub_list_lock); + RETURN(-ENOMEM); + } + + /* Copy the old and new records to the new allocated buffer */ + memcpy(new_lur, dtrq->dtrq_lur, dtrq->dtrq_lur_size); + ptr = (char *)&new_lur->lur_update_rec + + update_records_size(&new_lur->lur_update_rec); + memcpy(ptr, &record->ur_ops, + update_records_size(record) - + offsetof(struct update_records, ur_ops)); + + new_lur->lur_update_rec.ur_update_count += record->ur_update_count; + new_lur->lur_update_rec.ur_param_count += record->ur_param_count; + new_lur->lur_hdr.lrh_len = llog_update_record_size(new_lur); + + /* Replace the records */ + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + dtrq->dtrq_lur = new_lur; + dtrq->dtrq_lur_size = lur_size; + dtrq->dtrq_lur->lur_update_rec.ur_flags = record->ur_flags; + update_records_dump(&new_lur->lur_update_rec, D_INFO, true); + RETURN(0); +} + +/** + * Insert update records to the replay list. + * + * Allocate distribute txn replay req and insert it into the replay + * list, then insert the update records into the replay req. + * + * \param[in] tdtd distribute txn replay data where the replay list + * is. 
+ * \param[in] record the update record + * \param[in] cookie cookie of the record + * \param[in] index mdt index of the record + * + * \retval 0 if the adding succeeds. + * \retval negative errno if the adding fails. + */ +int +insert_update_records_to_replay_list(struct target_distribute_txn_data *tdtd, + struct llog_update_record *lur, + struct llog_cookie *cookie, + __u32 mdt_index) +{ + struct distribute_txn_replay_req *dtrq; + struct update_records *record = &lur->lur_update_rec; + bool replace_record = false; + int rc = 0; + ENTRY; + + CDEBUG(D_HA, "%s: insert record batchid = %llu transno = %llu" + " mdt_index %u\n", tdtd->tdtd_lut->lut_obd->obd_name, + record->ur_batchid, record->ur_master_transno, mdt_index); + + /* Update batchid if necessary */ + spin_lock(&tdtd->tdtd_batchid_lock); + if (record->ur_batchid >= tdtd->tdtd_batchid) { + CDEBUG(D_HA, "%s update batchid from %llu" " to %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_batchid, record->ur_batchid); + tdtd->tdtd_batchid = record->ur_batchid + 1; + } + spin_unlock(&tdtd->tdtd_batchid_lock); + +again: + spin_lock(&tdtd->tdtd_replay_list_lock); + /* First try to build the replay update request with the records */ + dtrq = dtrq_lookup(tdtd, record->ur_batchid); + if (dtrq == NULL) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq = dtrq_create(tdtd, lur); + if (IS_ERR(dtrq)) + RETURN(PTR_ERR(dtrq)); + + spin_lock(&tdtd->tdtd_replay_list_lock); + rc = dtrq_insert(tdtd, dtrq); + if (rc < 0) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + if (rc == -EEXIST) + goto again; + return rc; + } + } else { + /* If the master transno in update header is not + * matched with the one in the record, then it means + * the dtrq is originally created by master record, + * so we need update master transno and reposition + * the dtrq(by master transno) in the list and also + * replace update record */ + if (record->ur_master_transno != 0 && + dtrq->dtrq_master_transno != record->ur_master_transno && + dtrq->dtrq_lur != NULL) { + list_del_init(&dtrq->dtrq_list); + dtrq->dtrq_lur->lur_update_rec.ur_master_transno = + record->ur_master_transno; + + dtrq->dtrq_master_transno = record->ur_master_transno; + replace_record = true; + /* try to insert again */ + rc = dtrq_insert(tdtd, dtrq); + if (rc < 0) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + return rc; + } + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + /* Because there should be only thread access the update record, so + * we do not need lock here */ + if (replace_record) { + /* Replace the update record and master transno */ + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + dtrq->dtrq_lur = NULL; + dtrq->dtrq_lur_size = llog_update_record_size(lur); + OBD_ALLOC_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + if (dtrq->dtrq_lur == NULL) + return -ENOMEM; + + memcpy(dtrq->dtrq_lur, lur, dtrq->dtrq_lur_size); + } + + /* This is a partial update records, let's try to append + * the record to the current replay request */ + if (record->ur_flags & UPDATE_RECORD_CONTINUE) + rc = dtrq_append_updates(dtrq, record); + + /* Then create and add sub update request */ + rc = dtrq_sub_create_and_insert(dtrq, cookie, mdt_index); + + RETURN(rc); +} +EXPORT_SYMBOL(insert_update_records_to_replay_list); + +/** + * Dump updates of distribute txns. + * + * Output all of recovery updates in the distribute txn list to the + * debug log. + * + * \param[in] tdtd distribute txn data where all of distribute txn + * are listed. 
+ * \param[in] mask debug mask + */ +void dtrq_list_dump(struct target_distribute_txn_data *tdtd, unsigned int mask) +{ + struct distribute_txn_replay_req *dtrq; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(dtrq, &tdtd->tdtd_replay_list, dtrq_list) + update_records_dump(&dtrq->dtrq_lur->lur_update_rec, mask, + false); + spin_unlock(&tdtd->tdtd_replay_list_lock); +} +EXPORT_SYMBOL(dtrq_list_dump); + +/** + * Destroy distribute txn replay req + * + * Destroy a distribute txn replay req and all of its subs. + * + * \param[in] dtrq distribute txn replay req to be destroyed. + */ +void dtrq_destroy(struct distribute_txn_replay_req *dtrq) +{ + struct distribute_txn_replay_req_sub *dtrqs; + struct distribute_txn_replay_req_sub *tmp; + + LASSERT(list_empty(&dtrq->dtrq_list)); + spin_lock(&dtrq->dtrq_sub_list_lock); + list_for_each_entry_safe(dtrqs, tmp, &dtrq->dtrq_sub_list, dtrqs_list) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_del(&dtrqs->dtrqs_list); + list_for_each_entry_safe(stc, tmp, &dtrqs->dtrqs_cookie_list, + stc_list) { + list_del(&stc->stc_list); + OBD_FREE_PTR(stc); + } + OBD_FREE_PTR(dtrqs); + } + spin_unlock(&dtrq->dtrq_sub_list_lock); + + if (dtrq->dtrq_lur != NULL) + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + + OBD_FREE_PTR(dtrq); +} +EXPORT_SYMBOL(dtrq_destroy); + +/** + * Destroy all replay reqs. + * + * Destroy all replay reqs in the replay list. + * + * \param[in] tdtd target distribute txn data where the replay list is. + */ +void dtrq_list_destroy(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq; + struct distribute_txn_replay_req *tmp; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_finish_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); +} +EXPORT_SYMBOL(dtrq_list_destroy); + +/** + * Get next req in the replay list + * + * Get the next req that needs to be replayed; the list is sorted + * by master MDT transno. + * + * \param[in] tdtd distribute txn data where the replay list is + * + * \retval pointer to the next replay req, or NULL if the list is empty + */ +struct distribute_txn_replay_req * +distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq = NULL; + + spin_lock(&tdtd->tdtd_replay_list_lock); + if (!list_empty(&tdtd->tdtd_replay_list)) { + dtrq = list_entry(tdtd->tdtd_replay_list.next, + struct distribute_txn_replay_req, dtrq_list); + list_del_init(&dtrq->dtrq_list); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + return dtrq; +} +EXPORT_SYMBOL(distribute_txn_get_next_req); + +/** + * Get the next transno in the replay list; since the list is sorted, + * this is the transno of the next req to be replayed.
+ * + * \param[in] tdtd distribute txn data where the replay list is + * + * \retval the transno of the next update in the list + */ +__u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq = NULL; + __u64 transno = 0; + + spin_lock(&tdtd->tdtd_replay_list_lock); + if (!list_empty(&tdtd->tdtd_replay_list)) { + dtrq = list_entry(tdtd->tdtd_replay_list.next, + struct distribute_txn_replay_req, dtrq_list); + transno = dtrq->dtrq_master_transno; + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + CDEBUG(D_HA, "%s: Next update transno %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, transno); + return transno; +} +EXPORT_SYMBOL(distribute_txn_get_next_transno); + +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 xid) +{ + struct distribute_txn_replay_req *dtrq = NULL; + struct distribute_txn_replay_req *iter; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(iter, &tdtd->tdtd_replay_finish_list, dtrq_list) { + if (iter->dtrq_xid == xid) { + dtrq = iter; + break; + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + return dtrq; +} + +bool is_req_replayed_by_update(struct ptlrpc_request *req) +{ + struct lu_target *tgt = class_exp2tgt(req->rq_export); + struct distribute_txn_replay_req *dtrq; + + if (tgt->lut_tdtd == NULL) + return false; + + dtrq = distribute_txn_lookup_finish_list(tgt->lut_tdtd, req->rq_xid); + if (dtrq == NULL) + return false; + + return true; +} +EXPORT_SYMBOL(is_req_replayed_by_update); + +/** + * Check if the update of one object is committed + * + * Check whether the update for the object is committed by checking whether + * the corresponding sub exists in the replay req. If it is committed, mark + * the committed flag in the corresponding sub thandle. + * + * \param[in] env execution environment + * \param[in] dtrq replay request + * \param[in] dt_obj object for the update + * \param[in] top_th top thandle + * \param[in] st sub thandle which the update belongs to + * + * \retval 1 if the update is not committed. + * \retval 0 if the update is committed. + * \retval negative errno if some other failures happen.
+ */ +static int update_is_committed(const struct lu_env *env, + struct distribute_txn_replay_req *dtrq, + struct dt_object *dt_obj, + struct top_thandle *top_th, + struct sub_thandle *st) +{ + struct seq_server_site *seq_site; + const struct lu_fid *fid = lu_object_fid(&dt_obj->do_lu); + struct distribute_txn_replay_req_sub *dtrqs; + __u32 mdt_index; + ENTRY; + + if (st->st_sub_th != NULL) + RETURN(1); + + if (st->st_committed) + RETURN(0); + + seq_site = lu_site2seq(dt_obj->do_lu.lo_dev->ld_site); + if (fid_is_update_log(fid) || fid_is_update_log_dir(fid)) { + mdt_index = fid_oid(fid); + } else if (!fid_seq_in_fldb(fid_seq(fid))) { + mdt_index = seq_site->ss_node_id; + } else { + struct lu_server_fld *fld; + struct lu_seq_range range = {0}; + int rc; + + fld = seq_site->ss_server_fld; + fld_range_set_type(&range, LU_SEQ_RANGE_MDT); + LASSERT(fld->lsf_seq_lookup != NULL); + rc = fld->lsf_seq_lookup(env, fld, fid_seq(fid), + &range); + if (rc < 0) + RETURN(rc); + mdt_index = range.lsr_index; + } + + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + if (dtrqs != NULL || top_th->tt_multiple_thandle->tmt_committed) { + st->st_committed = 1; + if (dtrqs != NULL) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_for_each_entry_safe(stc, tmp, + &dtrqs->dtrqs_cookie_list, + stc_list) + list_move(&stc->stc_list, &st->st_cookie_list); + } + RETURN(0); + } + + CDEBUG(D_HA, "Update of "DFID "on MDT%u is not committed\n", PFID(fid), + mdt_index); + + RETURN(1); +} + +/** + * Implementation of different update methods for update recovery. + * + * These following functions update_recovery_$(update_name) implement + * different updates recovery methods. They will extract the parameters + * from the common parameters area and call correspondent dt API to redo + * the update. + * + * \param[in] env execution environment + * \param[in] op update operation to be replayed + * \param[in] params common update parameters which holds all parameters + * of the operation + * \param[in] th transaction handle + * \param[in] declare indicate it will do declare or real execution, true + * means declare, false means real execution + * + * \retval 0 if it succeeds. + * \retval negative errno if it fails. 
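+ *
+ * Every handler extracts its arguments with the same pattern (sketch only;
+ * "expected_size" is a placeholder for whatever size check the individual
+ * handler applies):
+ *
+ *    buf = update_params_get_param_buf(params, op->uop_params_off[i],
+ *                                      param_count, &size);
+ *    if (buf == NULL)
+ *            RETURN(-EIO);
+ *    if (size != expected_size)
+ *            RETURN(-EIO);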
+ */ +static int update_recovery_create(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct llog_update_record *lur = uti->uti_dtrq->dtrq_lur; + struct lu_attr *attr = &uti->uti_attr; + struct obdo *wobdo; + struct obdo *lobdo = &uti->uti_obdo; + struct dt_object_format dof; + __u16 size; + unsigned int param_count; + int rc; + ENTRY; + + if (dt_object_exists(dt_obj)) + RETURN(-EEXIST); + + param_count = lur->lur_update_rec.ur_param_count; + wobdo = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (wobdo == NULL) + RETURN(-EIO); + if (size != sizeof(*wobdo)) + RETURN(-EIO); + + if (LLOG_REC_HDR_NEEDS_SWABBING(&lur->lur_hdr)) + lustre_swab_obdo(wobdo); + + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + dof.dof_type = dt_mode_to_dft(attr->la_mode); + + rc = out_tx_create(env, dt_obj, attr, NULL, &dof, + ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_destroy(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_destroy(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_ref_add(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_ref_add(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_ref_del(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_ref_del(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_attr_set(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct llog_update_record *lur = uti->uti_dtrq->dtrq_lur; + struct obdo *wobdo; + struct obdo *lobdo = &uti->uti_obdo; + struct lu_attr *attr = &uti->uti_attr; + __u16 size; + unsigned int param_count; + int rc; + ENTRY; + + param_count = lur->lur_update_rec.ur_param_count; + wobdo = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (wobdo == NULL) + RETURN(-EIO); + if (size != sizeof(*wobdo)) + RETURN(-EIO); + + if (LLOG_REC_HDR_NEEDS_SWABBING(&lur->lur_hdr)) + lustre_swab_obdo(wobdo); + + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + rc = out_tx_attr_set(env, dt_obj, attr, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_xattr_set(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + char *buf; + char *name; + int fl; + __u16 size; + __u32 param_count; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, + op->uop_params_off[0], + 
param_count, &size); + if (name == NULL) + RETURN(-EIO); + + buf = update_params_get_param_buf(params, + op->uop_params_off[1], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + uti->uti_buf.lb_buf = buf; + uti->uti_buf.lb_len = (size_t)size; + + buf = update_params_get_param_buf(params, op->uop_params_off[2], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + if (size != sizeof(fl)) + RETURN(-EIO); + + fl = le32_to_cpu(*(int *)buf); + + rc = out_tx_xattr_set(env, dt_obj, &uti->uti_buf, name, fl, ta, th, + NULL, 0); + + RETURN(rc); +} + +static int update_recovery_index_insert(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct lu_fid *fid; + char *name; + __u32 param_count; + __u32 *ptype; + __u32 type; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + fid = update_params_get_param_buf(params, op->uop_params_off[1], + param_count, &size); + if (fid == NULL) + RETURN(-EIO); + if (size != sizeof(*fid)) + RETURN(-EIO); + + fid_le_to_cpu(fid, fid); + + ptype = update_params_get_param_buf(params, op->uop_params_off[2], + param_count, &size); + if (ptype == NULL) + RETURN(-EIO); + if (size != sizeof(*ptype)) + RETURN(-EIO); + type = le32_to_cpu(*ptype); + + if (dt_try_as_dir(env, dt_obj) == 0) + RETURN(-ENOTDIR); + + uti->uti_rec.rec_fid = fid; + uti->uti_rec.rec_type = type; + + rc = out_tx_index_insert(env, dt_obj, + (const struct dt_rec *)&uti->uti_rec, + (const struct dt_key *)name, ta, th, + NULL, 0); + + RETURN(rc); +} + +static int update_recovery_index_delete(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + __u32 param_count; + char *name; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + if (dt_try_as_dir(env, dt_obj) == 0) + RETURN(-ENOTDIR); + + rc = out_tx_index_delete(env, dt_obj, + (const struct dt_key *)name, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_write(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + char *buf; + __u32 param_count; + __u64 pos; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + buf = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + uti->uti_buf.lb_buf = buf; + uti->uti_buf.lb_len = size; + + buf = update_params_get_param_buf(params, op->uop_params_off[1], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + pos = le64_to_cpu(*(__u64 *)buf); + + rc = out_tx_write(env, dt_obj, &uti->uti_buf, pos, + ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_xattr_del(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op 
*op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + __u32 param_count; + char *name; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + rc = out_tx_xattr_del(env, dt_obj, name, ta, th, NULL, 0); + + RETURN(rc); +} + +/** + * Update session information + * + * Update session information so tgt_txn_stop_cb()->tgt_last_rcvd_update() + * can be called correctly during update replay. + * + * \param[in] env execution environment. + * \param[in] tdtd distribute data structure of the recovering tgt. + * \param[in] th thandle of this update replay. + * \param[in] master_th master sub thandle. + * \param[in] ta_arg the tx arg structure to hold the update for updating + * reply data. + */ +static void update_recovery_update_ses(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct thandle *th, + struct thandle *master_th, + struct distribute_txn_replay_req *dtrq, + struct tx_arg *ta_arg) +{ + struct tgt_session_info *tsi; + struct lu_target *lut = tdtd->tdtd_lut; + struct obd_export *export; + struct cfs_hash *hash; + struct top_thandle *top_th; + struct lsd_reply_data *lrd; + size_t size; + + tsi = tgt_ses_info(env); + if (tsi->tsi_exp != NULL) + return; + + size = ta_arg->u.write.buf.lb_len; + lrd = ta_arg->u.write.buf.lb_buf; + if (size != sizeof(*lrd) || lrd == NULL) + return; + + lrd->lrd_transno = le64_to_cpu(lrd->lrd_transno); + lrd->lrd_xid = le64_to_cpu(lrd->lrd_xid); + lrd->lrd_data = le64_to_cpu(lrd->lrd_data); + lrd->lrd_result = le32_to_cpu(lrd->lrd_result); + lrd->lrd_client_gen = le32_to_cpu(lrd->lrd_client_gen); + + if (lrd->lrd_transno != tgt_th_info(env)->tti_transno) + return; + + hash = cfs_hash_getref(lut->lut_obd->obd_gen_hash); + if (hash == NULL) + return; + + export = cfs_hash_lookup(hash, &lrd->lrd_client_gen); + if (export == NULL) { + cfs_hash_putref(hash); + return; + } + + tsi->tsi_exp = export; + tsi->tsi_xid = lrd->lrd_xid; + tsi->tsi_opdata = lrd->lrd_data; + tsi->tsi_result = lrd->lrd_result; + tsi->tsi_client_gen = lrd->lrd_client_gen; + dtrq->dtrq_xid = lrd->lrd_xid; + top_th = container_of(th, struct top_thandle, tt_super); + top_th->tt_master_sub_thandle = master_th; + cfs_hash_putref(hash); +} + +/** + * Execute updates in the update replay records + * + * Declare distribute txn replay by update records and add the updates + * to the execution list. Note: it will check if the update has been + * committed, and only execute the updates if it is not committed to + * disk. + * + * \param[in] env execution environment + * \param[in] tdtd distribute txn replay data which hold all of replay + * reqs and all replay parameters. + * \param[in] dtrq distribute transaction replay req. + * \param[in] ta thandle execute args. + * + * \retval 0 if declare succeeds. + * \retval negative errno if declare fails. 
+ */ +static int update_recovery_exec(const struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq, + struct thandle_exec_args *ta) +{ + struct llog_update_record *lur = dtrq->dtrq_lur; + struct update_records *records = &lur->lur_update_rec; + struct update_ops *ops = &records->ur_ops; + struct update_params *params = update_records_get_params(records); + struct top_thandle *top_th = container_of(ta->ta_handle, + struct top_thandle, + tt_super); + struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle; + struct update_op *op; + unsigned int i; + int rc = 0; + ENTRY; + + /* These records have been swabbed in llog_cat_process() */ + for (i = 0, op = &ops->uops_op[0]; i < records->ur_update_count; + i++, op = update_op_next_op(op)) { + struct lu_fid *fid = &op->uop_fid; + struct dt_object *dt_obj; + struct dt_object *sub_dt_obj; + struct dt_device *sub_dt; + struct sub_thandle *st; + + if (op->uop_type == OUT_NOOP) + continue; + + dt_obj = dt_locate(env, tdtd->tdtd_dt, fid); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + if (rc == -EREMCHG) + LCONSOLE_WARN("%.16s: hit invalid OI mapping " + "for "DFID" during recovering, " + "that may because auto scrub is " + "disabled on related MDT, and " + "will cause recovery failure. " + "Please enable auto scrub and " + "retry the recovery.\n", + tdtd->tdtd_lut->lut_obd->obd_name, + PFID(fid)); + + break; + } + sub_dt_obj = dt_object_child(dt_obj); + + /* Create sub thandle if not */ + sub_dt = lu2dt_dev(sub_dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + if (st == NULL) { + st = create_sub_thandle(tmt, sub_dt); + if (IS_ERR(st)) + GOTO(next, rc = PTR_ERR(st)); + } + + /* check if updates on the OSD/OSP are committed */ + rc = update_is_committed(env, dtrq, dt_obj, top_th, st); + if (rc == 0) + /* If this is committed, goto next */ + goto next; + + if (rc < 0) + GOTO(next, rc); + + /* Create thandle for sub thandle if needed */ + if (st->st_sub_th == NULL) { + rc = sub_thandle_trans_create(env, top_th, st); + if (rc != 0) + GOTO(next, rc); + } + + CDEBUG(D_HA, "replay %uth update\n", i); + switch (op->uop_type) { + case OUT_CREATE: + rc = update_recovery_create(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_DESTROY: + rc = update_recovery_destroy(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_REF_ADD: + rc = update_recovery_ref_add(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_REF_DEL: + rc = update_recovery_ref_del(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_ATTR_SET: + rc = update_recovery_attr_set(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_XATTR_SET: + rc = update_recovery_xattr_set(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_INDEX_INSERT: + rc = update_recovery_index_insert(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_INDEX_DELETE: + rc = update_recovery_index_delete(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_WRITE: + rc = update_recovery_write(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_XATTR_DEL: + rc = update_recovery_xattr_del(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + default: + CERROR("Unknown update type %u\n", (__u32)op->uop_type); + rc = -EINVAL; + break; + } +next: + dt_object_put(env, dt_obj); + if (rc < 0) + break; + } + + ta->ta_handle->th_result = rc; + RETURN(rc); +} + +/** + * redo 
updates on MDT if needed. + * + * During DNE recovery, the recovery thread (target_recovery_thread) will call + * this function to replay distribute txn updates on all MDTs. It only replay + * updates on the MDT where the update record is missing. + * + * If the update already exists on the MDT, then it does not need replay the + * updates on that MDT, and only mark the sub transaction has been committed + * there. + * + * \param[in] env execution environment + * \param[in] tdtd target distribute txn data, which holds the replay list + * and all parameters needed by replay process. + * \param[in] dtrq distribute txn replay req. + * + * \retval 0 if replay succeeds. + * \retval negative errno if replay failes. + */ +int distribute_txn_replay_handle(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq) +{ + struct update_records *records = &dtrq->dtrq_lur->lur_update_rec; + struct thandle_exec_args *ta; + struct lu_context session_env; + struct thandle *th = NULL; + struct top_thandle *top_th; + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur = NULL; + int i; + int rc = 0; + ENTRY; + + /* initialize session, it is needed for the handler of target */ + rc = lu_context_init(&session_env, LCT_SERVER_SESSION | LCT_NOREF); + if (rc) { + CERROR("%s: failure to initialize session: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, rc); + RETURN(rc); + } + lu_context_enter(&session_env); + env->le_ses = &session_env; + lu_env_refill(env); + update_records_dump(records, D_HA, true); + th = top_trans_create(env, NULL); + if (IS_ERR(th)) + GOTO(exit_session, rc = PTR_ERR(th)); + + ta = &update_env_info(env)->uti_tea; + ta->ta_argno = 0; + + update_env_info(env)->uti_dtrq = dtrq; + /* Create distribute transaction structure for this top thandle */ + top_th = container_of(th, struct top_thandle, tt_super); + rc = top_trans_create_tmt(env, top_th); + if (rc < 0) + GOTO(stop_trans, rc); + + th->th_dev = tdtd->tdtd_dt; + ta->ta_handle = th; + + /* check if the distribute transaction has been committed */ + tmt = top_th->tt_multiple_thandle; + tmt->tmt_master_sub_dt = tdtd->tdtd_lut->lut_bottom; + tmt->tmt_batchid = dtrq->dtrq_batchid; + tgt_th_info(env)->tti_transno = dtrq->dtrq_master_transno; + + if (tmt->tmt_batchid <= tdtd->tdtd_committed_batchid) + tmt->tmt_committed = 1; + + rc = update_recovery_exec(env, tdtd, dtrq, ta); + if (rc < 0) + GOTO(stop_trans, rc); + + /* If no updates are needed to be replayed, then mark this records as + * committed, so commit thread distribute_txn_commit_thread() will + * delete the record */ + if (ta->ta_argno == 0) + tmt->tmt_committed = 1; + + tur = &update_env_info(env)->uti_tur; + tur->tur_update_records = dtrq->dtrq_lur; + tur->tur_update_records_buf_size = dtrq->dtrq_lur_size; + tur->tur_update_params = NULL; + tur->tur_update_param_count = 0; + tmt->tmt_update_records = tur; + + distribute_txn_insert_by_batchid(tmt); + rc = top_trans_start(env, NULL, th); + if (rc < 0) + GOTO(stop_trans, rc); + + for (i = 0; i < ta->ta_argno; i++) { + struct tx_arg *ta_arg; + struct dt_object *dt_obj; + struct dt_device *sub_dt; + struct sub_thandle *st; + + ta_arg = ta->ta_args[i]; + dt_obj = ta_arg->object; + + LASSERT(tmt->tmt_committed == 0); + sub_dt = lu2dt_dev(dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + + LASSERT(st != NULL); + LASSERT(st->st_sub_th != NULL); + rc = ta->ta_args[i]->exec_fn(env, st->st_sub_th, + ta->ta_args[i]); + + /* If the update is to update the reply data, then 
+ * we need set the session information, so + * tgt_last_rcvd_update() can be called correctly */ + if (rc == 0 && dt_obj == tdtd->tdtd_lut->lut_reply_data) + update_recovery_update_ses(env, tdtd, th, + st->st_sub_th, dtrq, ta_arg); + + if (unlikely(rc < 0)) { + CDEBUG(D_HA, "error during execution of #%u from" + " %s:%d: rc = %d\n", i, ta->ta_args[i]->file, + ta->ta_args[i]->line, rc); + while (--i > 0) { + if (ta->ta_args[i]->undo_fn != NULL) { + dt_obj = ta->ta_args[i]->object; + sub_dt = + lu2dt_dev(dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + LASSERT(st != NULL); + LASSERT(st->st_sub_th != NULL); + + ta->ta_args[i]->undo_fn(env, + st->st_sub_th, + ta->ta_args[i]); + } else { + CERROR("%s: undo for %s:%d: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), + ta->ta_args[i]->file, + ta->ta_args[i]->line, -ENOTSUPP); + } + } + break; + } + CDEBUG(D_HA, "%s: executed %u/%u: rc = %d\n", + dt_obd_name(sub_dt), i, ta->ta_argno, rc); + } + +stop_trans: + if (rc < 0) + th->th_result = rc; + rc = top_trans_stop(env, tdtd->tdtd_dt, th); + for (i = 0; i < ta->ta_argno; i++) { + if (ta->ta_args[i]->object != NULL) { + dt_object_put(env, ta->ta_args[i]->object); + ta->ta_args[i]->object = NULL; + } + } + + if (tur != NULL) + tur->tur_update_records = NULL; + + if (tgt_ses_info(env)->tsi_exp != NULL) { + class_export_put(tgt_ses_info(env)->tsi_exp); + tgt_ses_info(env)->tsi_exp = NULL; + } +exit_session: + lu_context_exit(&session_env); + lu_context_fini(&session_env); + RETURN(rc); +} +EXPORT_SYMBOL(distribute_txn_replay_handle); diff --git a/drivers/staging/lustrefsx/lustre/target/update_trans.c b/drivers/staging/lustrefsx/lustre/target/update_trans.c new file mode 100644 index 0000000000000..b8150fa5c694c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_trans.c @@ -0,0 +1,1765 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2017, Intel Corporation. + */ +/* + * lustre/target/update_trans.c + * + * This file implements the update distribute transaction API. + * + * To manage the cross-MDT operation (distribute operation) transaction, + * the transaction will also be separated two layers on MD stack, top + * transaction and sub transaction. + * + * During the distribute operation, top transaction is created in the LOD + * layer, and represent the operation. Sub transaction is created by + * each OSD or OSP. Top transaction start/stop will trigger all of its sub + * transaction start/stop. Top transaction (the whole operation) is committed + * only all of its sub transaction are committed. + * + * there are three kinds of transactions + * 1. local transaction: All updates are in a single local OSD. + * 2. 
Remote transaction: All Updates are only in the remote OSD, + * i.e. locally all updates are in OSP. + * 3. Mixed transaction: Updates are both in local OSD and remote + * OSD. + * + * Author: Di Wang + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include +/** + * Dump top mulitple thandle + * + * Dump top multiple thandle and all of its sub thandle to the debug log. + * + * \param[in]mask debug mask + * \param[in]top_th top_thandle to be dumped + */ +static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt, + __u32 mask) +{ + struct sub_thandle *st; + + LASSERT(tmt->tmt_magic == TOP_THANDLE_MAGIC); + CDEBUG(mask, "%s tmt %p refcount %d committed %d result %d batchid %llu\n", + tmt->tmt_master_sub_dt ? + tmt->tmt_master_sub_dt->dd_lu_dev.ld_obd->obd_name : + "NULL", + tmt, atomic_read(&tmt->tmt_refcount), tmt->tmt_committed, + tmt->tmt_result, tmt->tmt_batchid); + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + struct sub_thandle_cookie *stc; + + CDEBUG(mask, "st %p obd %s committed %d started %d stopped %d " + "result %d sub_th %p\n", + st, st->st_dt->dd_lu_dev.ld_obd->obd_name, + st->st_committed, st->st_started, st->st_stopped, + st->st_result, st->st_sub_th); + + list_for_each_entry(stc, &st->st_cookie_list, stc_list) { + CDEBUG(mask, " cookie "DFID".%u\n", + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index); + } + } +} + +/** + * Declare write update to sub device + * + * Declare Write updates llog records to the sub device during distribute + * transaction. + * + * \param[in] env execution environment + * \param[in] record update records being written + * \param[in] sub_th sub transaction handle + * \param[in] record_size total update record size + * + * \retval 0 if writing succeeds + * \retval negative errno if writing fails + */ +static int sub_declare_updates_write(const struct lu_env *env, + struct llog_update_record *record, + struct thandle *sub_th, size_t record_size) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt = sub_th->th_dev; + int left = record_size; + int rc; + + /* If ctxt is NULL, it means not need to write update, + * for example if the the OSP is used to connect to OST */ + ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + + /* Not ready to record updates yet. */ + if (ctxt == NULL || ctxt->loc_handle == NULL) { + llog_ctxt_put(ctxt); + return 0; + } + + rc = llog_declare_add(env, ctxt->loc_handle, + &record->lur_hdr, sub_th); + if (rc < 0) + GOTO(out_put, rc); + + while (left > ctxt->loc_chunk_size) { + rc = llog_declare_add(env, ctxt->loc_handle, + &record->lur_hdr, sub_th); + if (rc < 0) + GOTO(out_put, rc); + + left -= ctxt->loc_chunk_size; + } + +out_put: + llog_ctxt_put(ctxt); + + return rc; +} + +/** + * write update to sub device + * + * Write llog update record to the sub device during distribute + * transaction. If it succeeds, llog cookie of the record will be + * returned by @cookie. + * + * \param[in] env execution environment + * \param[in] record update records being written + * \param[in] sub_th sub transaction handle + * \param[out] cookie llog cookie of the update record. 
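+ *
+ * (Illustrative note: the record is written as a single llog record only
+ * when
+ *
+ *    reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size ||
+ *    reclen == ctxt->loc_chunk_size
+ *
+ * holds; otherwise it is split into several chunk-sized update records.)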
+ * + * \retval 1 if writing succeeds + * \retval negative errno if writing fails + */ +static int sub_updates_write(const struct lu_env *env, + struct llog_update_record *record, + struct sub_thandle *sub_th) +{ + struct dt_device *dt = sub_th->st_dt; + struct llog_ctxt *ctxt; + struct llog_update_record *lur = NULL; + __u32 update_count = 0; + __u32 param_count = 0; + __u32 last_update_count = 0; + __u32 last_param_count = 0; + char *start; + char *cur; + char *next; + struct sub_thandle_cookie *stc; + size_t reclen; + bool eof = false; + int rc; + ENTRY; + + ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + /* If ctxt == NULL, then it means updates on OST (only happens + * during migration), and we do not track those updates for now */ + /* If ctxt->loc_handle == NULL, then it does not need to record + * update, usually happens in error handler path */ + if (ctxt == NULL || ctxt->loc_handle == NULL) { + llog_ctxt_put(ctxt); + RETURN(0); + } + + /* Since the cross-MDT updates will includes both local + * and remote updates, the update ops count must > 1 */ + LASSERT(record->lur_update_rec.ur_update_count > 1); + LASSERTF(record->lur_hdr.lrh_len == llog_update_record_size(record), + "lrh_len %u record_size %zu\n", record->lur_hdr.lrh_len, + llog_update_record_size(record)); + + /* + * If its size > llog chunk_size, then write current chunk to the update + * llog, NB the padding should >= LLOG_MIN_REC_SIZE. + * + * So check padding length is either >= LLOG_MIN_REC_SIZE or is 0 + * (record length just matches the chunk size). + */ + + reclen = record->lur_hdr.lrh_len; + if (reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size || + reclen == ctxt->loc_chunk_size) { + OBD_ALLOC_PTR(stc); + if (stc == NULL) + GOTO(llog_put, rc = -ENOMEM); + INIT_LIST_HEAD(&stc->stc_list); + + rc = llog_add(env, ctxt->loc_handle, &record->lur_hdr, + &stc->stc_cookie, sub_th->st_sub_th); + + CDEBUG(D_INFO, "%s: Add update log "DFID".%u: rc = %d\n", + dt->dd_lu_dev.ld_obd->obd_name, + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index, rc); + + if (rc > 0) { + list_add(&stc->stc_list, &sub_th->st_cookie_list); + rc = 0; + } else { + OBD_FREE_PTR(stc); + } + + GOTO(llog_put, rc); + } + + /* Split the records into chunk_size update record */ + OBD_ALLOC_LARGE(lur, ctxt->loc_chunk_size); + if (lur == NULL) + GOTO(llog_put, rc = -ENOMEM); + + memcpy(lur, &record->lur_hdr, sizeof(record->lur_hdr)); + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + start = (char *)&record->lur_update_rec.ur_ops; + cur = next = start; + do { + if (update_count < record->lur_update_rec.ur_update_count) + next = (char *)update_op_next_op( + (struct update_op *)cur); + else if (param_count < record->lur_update_rec.ur_param_count) + next = (char *)update_param_next_param( + (struct update_param *)cur); + else + eof = true; + + reclen = __llog_update_record_size( + __update_records_size(next - start)); + if ((reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size || + reclen == ctxt->loc_chunk_size) && + !eof) { + cur = next; + + if (update_count < + record->lur_update_rec.ur_update_count) + update_count++; + else if (param_count < + record->lur_update_rec.ur_param_count) + param_count++; + continue; + } + + lur->lur_update_rec.ur_update_count = update_count - + last_update_count; + lur->lur_update_rec.ur_param_count = param_count - + last_param_count; + memcpy(&lur->lur_update_rec.ur_ops, start, cur - start); + lur->lur_hdr.lrh_len = 
llog_update_record_size(lur); + + LASSERT(lur->lur_hdr.lrh_len == + __llog_update_record_size( + __update_records_size(cur - start))); + LASSERT(lur->lur_hdr.lrh_len <= ctxt->loc_chunk_size); + + update_records_dump(&lur->lur_update_rec, D_INFO, true); + + OBD_ALLOC_PTR(stc); + if (stc == NULL) + GOTO(llog_put, rc = -ENOMEM); + INIT_LIST_HEAD(&stc->stc_list); + + rc = llog_add(env, ctxt->loc_handle, &lur->lur_hdr, + &stc->stc_cookie, sub_th->st_sub_th); + + CDEBUG(D_INFO, "%s: Add update log "DFID".%u: rc = %d\n", + dt->dd_lu_dev.ld_obd->obd_name, + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index, rc); + + if (rc > 0) { + list_add(&stc->stc_list, &sub_th->st_cookie_list); + rc = 0; + } else { + OBD_FREE_PTR(stc); + GOTO(llog_put, rc); + } + + last_update_count = update_count; + last_param_count = param_count; + start = cur; + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + lur->lur_update_rec.ur_flags |= UPDATE_RECORD_CONTINUE; + } while (!eof); + +llog_put: + if (lur != NULL) + OBD_FREE_LARGE(lur, ctxt->loc_chunk_size); + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/** + * Prepare the update records. + * + * Merge params and ops into the update records, then initializing + * the update buffer. + * + * During transaction execution phase, parameters and update ops + * are collected in two different buffers (see lod_updates_pack()), + * during transaction stop, it needs to be merged in one buffer, + * so it will be written in the update log. + * + * \param[in] env execution environment + * \param[in] tmt top_multiple_thandle for distribute txn + * + * \retval 0 if merging succeeds. + * \retval negaitive errno if merging fails. + */ +static int prepare_writing_updates(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct thandle_update_records *tur = tmt->tmt_update_records; + struct llog_update_record *lur; + struct update_params *params; + size_t params_size; + size_t update_size; + + if (tur == NULL || tur->tur_update_records == NULL || + tur->tur_update_params == NULL) + return 0; + + lur = tur->tur_update_records; + /* Extends the update records buffer if needed */ + params_size = update_params_size(tur->tur_update_params, + tur->tur_update_param_count); + LASSERT(lur->lur_update_rec.ur_param_count == 0); + update_size = llog_update_record_size(lur); + if (cfs_size_round(update_size + params_size) > + tur->tur_update_records_buf_size) { + int rc; + + rc = tur_update_records_extend(tur, + cfs_size_round(update_size + params_size)); + if (rc < 0) + return rc; + + lur = tur->tur_update_records; + } + + params = update_records_get_params(&lur->lur_update_rec); + memcpy(params, tur->tur_update_params, params_size); + + lur->lur_update_rec.ur_param_count = tur->tur_update_param_count; + lur->lur_update_rec.ur_batchid = tmt->tmt_batchid; + /* Init update record header */ + lur->lur_hdr.lrh_len = llog_update_record_size(lur); + lur->lur_hdr.lrh_type = UPDATE_REC; + + /* Dump updates for debugging purpose */ + update_records_dump(&lur->lur_update_rec, D_INFO, true); + + return 0; +} + +static inline int +distribute_txn_commit_thread_running(struct lu_target *lut) +{ + return lut->lut_tdtd_commit_thread.t_flags & SVC_RUNNING; +} + +static inline int +distribute_txn_commit_thread_stopped(struct lu_target *lut) +{ + return lut->lut_tdtd_commit_thread.t_flags & SVC_STOPPED; +} + +/** + * Top thandle commit callback + * + * This callback will be called when all of sub transactions are committed. 
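+ *
+ * (Informational, for readability: the chain is dt commit callback ->
+ * sub_trans_commit_cb() -> sub_trans_commit_cb_internal() ->
+ * top_trans_committed_cb(), which then wakes tdtd_commit_thread_waitq.)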
+ * + * \param[in] th top thandle to be committed. + */ +static void top_trans_committed_cb(struct top_multiple_thandle *tmt) +{ + struct lu_target *lut; + ENTRY; + + LASSERT(atomic_read(&tmt->tmt_refcount) > 0); + + top_multiple_thandle_dump(tmt, D_HA); + tmt->tmt_committed = 1; + lut = dt2lu_dev(tmt->tmt_master_sub_dt)->ld_site->ls_tgt; + if (distribute_txn_commit_thread_running(lut)) + wake_up(&lut->lut_tdtd->tdtd_commit_thread_waitq); + RETURN_EXIT; +} + +struct sub_thandle *lookup_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev) +{ + struct sub_thandle *st; + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_dt == dt_dev) + return st; + } + return NULL; +} +EXPORT_SYMBOL(lookup_sub_thandle); + +struct sub_thandle *create_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev) +{ + struct sub_thandle *st; + + OBD_ALLOC_PTR(st); + if (st == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&st->st_sub_list); + INIT_LIST_HEAD(&st->st_cookie_list); + st->st_dt = dt_dev; + + list_add(&st->st_sub_list, &tmt->tmt_sub_thandle_list); + return st; +} + +static void sub_trans_commit_cb_internal(struct top_multiple_thandle *tmt, + struct thandle *sub_th, int err) +{ + struct sub_thandle *st; + bool all_committed = true; + + /* Check if all sub thandles are committed */ + spin_lock(&tmt->tmt_sub_lock); + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == sub_th) { + st->st_committed = 1; + st->st_result = err; + } + if (!st->st_committed) + all_committed = false; + } + spin_unlock(&tmt->tmt_sub_lock); + + if (tmt->tmt_result == 0) + tmt->tmt_result = err; + + if (all_committed) + top_trans_committed_cb(tmt); + + top_multiple_thandle_dump(tmt, D_INFO); + top_multiple_thandle_put(tmt); + RETURN_EXIT; +} + +/** + * sub thandle commit callback + * + * Mark the sub thandle to be committed and if all sub thandle are committed + * notify the top thandle. + * + * \param[in] env execution environment + * \param[in] sub_th sub thandle being committed + * \param[in] cb commit callback + * \param[in] err trans result + */ +static void sub_trans_commit_cb(struct lu_env *env, + struct thandle *sub_th, + struct dt_txn_commit_cb *cb, int err) +{ + struct top_multiple_thandle *tmt = cb->dcb_data; + + sub_trans_commit_cb_internal(tmt, sub_th, err); +} + +static void sub_thandle_register_commit_cb(struct sub_thandle *st, + struct top_multiple_thandle *tmt) +{ + LASSERT(st->st_sub_th != NULL); + top_multiple_thandle_get(tmt); + st->st_commit_dcb.dcb_func = sub_trans_commit_cb; + st->st_commit_dcb.dcb_data = tmt; + INIT_LIST_HEAD(&st->st_commit_dcb.dcb_linkage); + dt_trans_cb_add(st->st_sub_th, &st->st_commit_dcb); +} + +/** + * Sub thandle stop call back + * + * After sub thandle is stopped, it will call this callback to notify + * the top thandle. 
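+ *
+ * (Informational: the wake-up on tmt_stop_waitq is what allows
+ * top_trans_wait_result(), called from top_trans_stop(), to proceed once
+ * every sub thandle has stopped.)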
+ * + * \param[in] th sub thandle to be stopped + * \param[in] rc result of sub trans + */ +static void sub_trans_stop_cb(struct lu_env *env, + struct thandle *sub_th, + struct dt_txn_commit_cb *cb, int err) +{ + struct sub_thandle *st; + struct top_multiple_thandle *tmt = cb->dcb_data; + ENTRY; + + spin_lock(&tmt->tmt_sub_lock); + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_stopped) + continue; + + if (st->st_dt == sub_th->th_dev) { + st->st_stopped = 1; + st->st_result = err; + break; + } + } + spin_unlock(&tmt->tmt_sub_lock); + + wake_up(&tmt->tmt_stop_waitq); + RETURN_EXIT; +} + +static void sub_thandle_register_stop_cb(struct sub_thandle *st, + struct top_multiple_thandle *tmt) +{ + st->st_stop_dcb.dcb_func = sub_trans_stop_cb; + st->st_stop_dcb.dcb_data = tmt; + st->st_stop_dcb.dcb_flags = DCB_TRANS_STOP; + INIT_LIST_HEAD(&st->st_stop_dcb.dcb_linkage); + dt_trans_cb_add(st->st_sub_th, &st->st_stop_dcb); +} + +/** + * Create sub thandle + * + * Create transaction handle for sub_thandle + * + * \param[in] env execution environment + * \param[in] th top thandle + * \param[in] st sub_thandle + * + * \retval 0 if creation succeeds. + * \retval negative errno if creation fails. + */ +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st) +{ + struct thandle *sub_th; + + sub_th = dt_trans_create(env, st->st_dt); + if (IS_ERR(sub_th)) + return PTR_ERR(sub_th); + + sub_th->th_top = &top_th->tt_super; + st->st_sub_th = sub_th; + + sub_th->th_wait_submit = 1; + sub_thandle_register_stop_cb(st, top_th->tt_multiple_thandle); + return 0; +} + +/** + * Create the top transaction. + * + * Create the top transaction on the master device. It will create a top + * thandle and a sub thandle on the master device. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be created + * + * \retval pointer to the created thandle. + * \retval ERR_PTR(errno) if creation failed. + */ +struct thandle * +top_trans_create(const struct lu_env *env, struct dt_device *master_dev) +{ + struct top_thandle *top_th; + struct thandle *child_th; + + OBD_ALLOC_GFP(top_th, sizeof(*top_th), __GFP_IO); + if (top_th == NULL) + return ERR_PTR(-ENOMEM); + + top_th->tt_super.th_top = &top_th->tt_super; + + if (master_dev != NULL) { + child_th = dt_trans_create(env, master_dev); + if (IS_ERR(child_th)) { + OBD_FREE_PTR(top_th); + return child_th; + } + + child_th->th_top = &top_th->tt_super; + child_th->th_wait_submit = 1; + top_th->tt_master_sub_thandle = child_th; + } + return &top_th->tt_super; +} +EXPORT_SYMBOL(top_trans_create); + +/** + * Declare write update transaction + * + * Check if there are updates being recorded in this transaction, + * it will write the record into the disk. 
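+ *
+ * Illustrative pairing (a sketch, not new code): per sub device the declare
+ * below is matched by the actual write at transaction stop time, i.e.
+ *
+ *    rc = sub_declare_updates_write(env, record, st->st_sub_th,
+ *                                   tmt->tmt_record_size);
+ *    ...
+ *    rc = sub_updates_write(env, lur, st);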
+ * + * \param[in] env execution environment + * \param[in] tmt top multiple transaction handle + * + * \retval 0 if writing succeeds + * \retval negative errno if writing fails + */ +static int declare_updates_write(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct llog_update_record *record; + struct sub_thandle *st; + int rc = 0; + + record = tmt->tmt_update_records->tur_update_records; + /* Declare update write for all other target */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == NULL) + continue; + + rc = sub_declare_updates_write(env, record, st->st_sub_th, + tmt->tmt_record_size); + if (rc < 0) + break; + } + + return rc; +} + +/** + * Assign batchid to the distribute transaction. + * + * Assign batchid to the distribute transaction + * + * \param[in] tmt distribute transaction + */ +static void distribute_txn_assign_batchid(struct top_multiple_thandle *new) +{ + struct target_distribute_txn_data *tdtd; + struct dt_device *dt = new->tmt_master_sub_dt; + struct sub_thandle *st; + + LASSERT(dt != NULL); + tdtd = dt2lu_dev(dt)->ld_site->ls_tgt->lut_tdtd; + spin_lock(&tdtd->tdtd_batchid_lock); + new->tmt_batchid = tdtd->tdtd_batchid++; + list_add_tail(&new->tmt_commit_list, &tdtd->tdtd_list); + spin_unlock(&tdtd->tdtd_batchid_lock); + list_for_each_entry(st, &new->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th != NULL) + sub_thandle_register_commit_cb(st, new); + } + top_multiple_thandle_get(new); + top_multiple_thandle_dump(new, D_INFO); +} + +/** + * Insert distribute transaction to the distribute txn list. + * + * Insert distribute transaction to the distribute txn list. + * + * \param[in] new the distribute txn to be inserted. + */ +void distribute_txn_insert_by_batchid(struct top_multiple_thandle *new) +{ + struct dt_device *dt = new->tmt_master_sub_dt; + struct top_multiple_thandle *tmt; + struct target_distribute_txn_data *tdtd; + struct sub_thandle *st; + bool at_head = false; + + LASSERT(dt != NULL); + tdtd = dt2lu_dev(dt)->ld_site->ls_tgt->lut_tdtd; + + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_reverse(tmt, &tdtd->tdtd_list, tmt_commit_list) { + if (new->tmt_batchid > tmt->tmt_batchid) { + list_add(&new->tmt_commit_list, &tmt->tmt_commit_list); + break; + } + } + if (list_empty(&new->tmt_commit_list)) { + at_head = true; + list_add(&new->tmt_commit_list, &tdtd->tdtd_list); + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + list_for_each_entry(st, &new->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th != NULL) + sub_thandle_register_commit_cb(st, new); + } + + top_multiple_thandle_get(new); + top_multiple_thandle_dump(new, D_INFO); + if (new->tmt_committed && at_head) + wake_up(&tdtd->tdtd_commit_thread_waitq); +} + +/** + * Prepare cross-MDT operation. + * + * Create the update record buffer to record updates for cross-MDT operation, + * add master sub transaction to tt_sub_trans_list, and declare the update + * writes. + * + * During updates packing, all of parameters will be packed in + * tur_update_params, and updates will be packed in tur_update_records. + * Then in transaction stop, parameters and updates will be merged + * into one updates buffer. + * + * And also master thandle will be added to the sub_th list, so it will be + * easy to track the commit status. + * + * \param[in] env execution environment + * \param[in] th top transaction handle + * + * \retval 0 if preparation succeeds. + * \retval negative errno if preparation fails. 
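+ *
+ * For context, a caller (e.g. LOD) is expected to drive a cross-MDT
+ * transaction through the exported API roughly as follows (illustrative
+ * sketch; "remote_dt" names some remote sub dt_device):
+ *
+ *    th = top_trans_create(env, master_dev);
+ *    sub_th = thandle_get_sub_by_dt(env, th, remote_dt);
+ *    ... declare updates against th and sub_th ...
+ *    rc = top_trans_start(env, master_dev, th);
+ *    ... execute updates ...
+ *    rc = top_trans_stop(env, master_dev, th);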
+ */ +static int prepare_multiple_node_trans(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct thandle_update_records *tur; + int rc; + ENTRY; + + if (tmt->tmt_update_records == NULL) { + tur = &update_env_info(env)->uti_tur; + rc = check_and_prepare_update_record(env, tur); + if (rc < 0) + RETURN(rc); + + tmt->tmt_update_records = tur; + distribute_txn_assign_batchid(tmt); + } + + rc = declare_updates_write(env, tmt); + + RETURN(rc); +} + +/** + * start the top transaction. + * + * Start all of its sub transactions, then start master sub transaction. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be start + * \param[in] th top thandle + * + * \retval 0 if transaction start succeeds. + * \retval negative errno if start fails. + */ +int top_trans_start(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + struct sub_thandle *st; + struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle; + int rc = 0; + ENTRY; + + if (tmt == NULL) { + if (th->th_sync) + top_th->tt_master_sub_thandle->th_sync = th->th_sync; + if (th->th_local) + top_th->tt_master_sub_thandle->th_local = th->th_local; + rc = dt_trans_start(env, top_th->tt_master_sub_thandle->th_dev, + top_th->tt_master_sub_thandle); + RETURN(rc); + } + + tmt = top_th->tt_multiple_thandle; + rc = prepare_multiple_node_trans(env, tmt); + if (rc < 0) + RETURN(rc); + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == NULL) + continue; + if (th->th_sync) + st->st_sub_th->th_sync = th->th_sync; + if (th->th_local) + st->st_sub_th->th_local = th->th_local; + rc = dt_trans_start(env, st->st_sub_th->th_dev, + st->st_sub_th); + if (rc != 0) + GOTO(out, rc); + + LASSERT(st->st_started == 0); + st->st_started = 1; + } +out: + th->th_result = rc; + RETURN(rc); +} +EXPORT_SYMBOL(top_trans_start); + +/** + * Check whether we need write updates record + * + * Check if the updates for the top_thandle needs to be writen + * to all targets. Only if the transaction succeeds and the updates + * number > 2, it will write the updates, + * + * \params [in] top_th top thandle. + * + * \retval true if it needs to write updates + * \retval false if it does not need to write updates + **/ +static bool top_check_write_updates(struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur; + + /* Do not write updates to records if the transaction fails */ + if (top_th->tt_super.th_result != 0) + return false; + + tmt = top_th->tt_multiple_thandle; + if (tmt == NULL) + return false; + + tur = tmt->tmt_update_records; + if (tur == NULL) + return false; + + /* Hmm, false update records, since the cross-MDT operation + * should includes both local and remote updates, so the + * updates count should >= 2 */ + if (tur->tur_update_records == NULL || + tur->tur_update_records->lur_update_rec.ur_update_count <= 1) + return false; + + return true; +} + +/** + * Check if top transaction is stopped + * + * Check if top transaction is stopped, only if all sub transaction + * is stopped, then the top transaction is stopped. + * + * \param [in] top_th top thandle + * + * \retval true if the top transaction is stopped. + * \retval false if the top transaction is not stopped. 
+ */ +static bool top_trans_is_stopped(struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + struct sub_thandle *st; + bool all_stopped = true; + + tmt = top_th->tt_multiple_thandle; + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (!st->st_stopped && st->st_sub_th != NULL) { + all_stopped = false; + break; + } + + if (st->st_result != 0 && + top_th->tt_super.th_result == 0) + top_th->tt_super.th_result = st->st_result; + } + + return all_stopped; +} + +/** + * Wait result of top transaction + * + * Wait until all sub transaction get its result. + * + * \param [in] top_th top thandle. + * + * \retval the result of top thandle. + */ +static int top_trans_wait_result(struct top_thandle *top_th) +{ + struct l_wait_info lwi = {0}; + + l_wait_event(top_th->tt_multiple_thandle->tmt_stop_waitq, + top_trans_is_stopped(top_th), &lwi); + + RETURN(top_th->tt_super.th_result); +} + +/** + * Stop the top transaction. + * + * Stop the transaction on the master device first, then stop transactions + * on other sub devices. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be created + * \param[in] th top thandle + * + * \retval 0 if stop transaction succeeds. + * \retval negative errno if stop transaction fails. + */ +int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + struct sub_thandle *st; + struct sub_thandle *master_st; + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur; + bool write_updates = false; + int rc = 0; + ENTRY; + + if (likely(top_th->tt_multiple_thandle == NULL)) { + LASSERT(master_dev != NULL); + + if (th->th_sync) + top_th->tt_master_sub_thandle->th_sync = th->th_sync; + if (th->th_local) + top_th->tt_master_sub_thandle->th_local = th->th_local; + rc = dt_trans_stop(env, master_dev, + top_th->tt_master_sub_thandle); + OBD_FREE_PTR(top_th); + RETURN(rc); + } + + tmt = top_th->tt_multiple_thandle; + tur = tmt->tmt_update_records; + + /* Note: we need stop the master thandle first, then the stop + * callback will fill the master transno in the update logs, + * then these update logs will be sent to other MDTs */ + /* get the master sub thandle */ + master_st = lookup_sub_thandle(tmt, tmt->tmt_master_sub_dt); + write_updates = top_check_write_updates(top_th); + + /* Step 1: write the updates log on Master MDT */ + if (master_st != NULL && master_st->st_sub_th != NULL && + write_updates) { + struct llog_update_record *lur; + + /* Merge the parameters and updates into one buffer */ + rc = prepare_writing_updates(env, tmt); + if (rc < 0) { + CERROR("%s: cannot prepare updates: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + write_updates = false; + GOTO(stop_master_trans, rc); + } + + lur = tur->tur_update_records; + /* Write updates to the master MDT */ + rc = sub_updates_write(env, lur, master_st); + + /* Cleanup the common parameters in the update records, + * master transno callback might add more parameters. 
+ * and we need merge the update records again in the + * following */ + if (tur->tur_update_params != NULL) + lur->lur_update_rec.ur_param_count = 0; + + if (rc < 0) { + CERROR("%s: write updates failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + write_updates = false; + GOTO(stop_master_trans, rc); + } + } + +stop_master_trans: + /* Step 2: Stop the transaction on the master MDT, and fill the + * master transno in the update logs to other MDT. */ + if (master_st != NULL && master_st->st_sub_th != NULL) { + if (th->th_local) + master_st->st_sub_th->th_local = th->th_local; + if (th->th_sync) + master_st->st_sub_th->th_sync = th->th_sync; + master_st->st_sub_th->th_result = th->th_result; + rc = dt_trans_stop(env, master_st->st_dt, master_st->st_sub_th); + /* If it does not write_updates, then we call submit callback + * here, otherwise callback is done through + * osd(osp)_trans_commit_cb() */ + if (!master_st->st_started && + !list_empty(&tmt->tmt_commit_list)) + sub_trans_commit_cb_internal(tmt, + master_st->st_sub_th, rc); + if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + GOTO(stop_other_trans, rc); + } else if (tur != NULL && tur->tur_update_records != NULL) { + struct llog_update_record *lur; + + lur = tur->tur_update_records; + if (lur->lur_update_rec.ur_master_transno == 0) + /* Update master transno after master stop + * callback */ + lur->lur_update_rec.ur_master_transno = + tgt_th_info(env)->tti_transno; + } + } + + /* Step 3: write updates to other MDTs */ + if (write_updates) { + struct llog_update_record *lur; + + /* Stop callback of master will add more updates and also update + * master transno, so merge the parameters and updates into one + * buffer again */ + rc = prepare_writing_updates(env, tmt); + if (rc < 0) { + CERROR("%s: prepare updates failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + GOTO(stop_other_trans, rc); + } + lur = tur->tur_update_records; + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, + st_sub_list) { + if (st->st_sub_th == NULL || st == master_st || + st->st_sub_th->th_result < 0) + continue; + + rc = sub_updates_write(env, lur, st); + if (rc < 0) { + CERROR("%s: write updates failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, + rc); + th->th_result = rc; + break; + } + } + } + +stop_other_trans: + /* Step 4: Stop the transaction on other MDTs */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st == master_st || st->st_sub_th == NULL) + continue; + + if (th->th_sync) + st->st_sub_th->th_sync = th->th_sync; + if (th->th_local) + st->st_sub_th->th_local = th->th_local; + st->st_sub_th->th_result = th->th_result; + rc = dt_trans_stop(env, st->st_sub_th->th_dev, + st->st_sub_th); + if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, rc); + if (th->th_result == 0) + th->th_result = rc; + } + } + + rc = top_trans_wait_result(top_th); + + tmt->tmt_result = rc; + + /* Balance for the refcount in top_trans_create, Note: if it is NOT + * multiple node transaction, the top transaction will be destroyed. 
*/ + top_multiple_thandle_put(tmt); + OBD_FREE_PTR(top_th); + RETURN(rc); +} +EXPORT_SYMBOL(top_trans_stop); + +/** + * Create top_multiple_thandle for top_thandle + * + * Create top_mutilple_thandle to manage the mutiple node transaction + * for top_thandle, and it also needs to add master sub thandle to the + * sub trans list now. + * + * \param[in] env execution environment + * \param[in] top_th the top thandle + * + * \retval 0 if creation succeeds + * \retval negative errno if creation fails + */ +int top_trans_create_tmt(const struct lu_env *env, + struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + + OBD_ALLOC_PTR(tmt); + if (tmt == NULL) + return -ENOMEM; + + tmt->tmt_magic = TOP_THANDLE_MAGIC; + INIT_LIST_HEAD(&tmt->tmt_sub_thandle_list); + INIT_LIST_HEAD(&tmt->tmt_commit_list); + atomic_set(&tmt->tmt_refcount, 1); + spin_lock_init(&tmt->tmt_sub_lock); + init_waitqueue_head(&tmt->tmt_stop_waitq); + + top_th->tt_multiple_thandle = tmt; + + return 0; +} + +static struct sub_thandle * +create_sub_thandle_with_thandle(struct top_thandle *top_th, + struct thandle *sub_th) +{ + struct sub_thandle *st; + + /* create and init sub th to the top trans list */ + st = create_sub_thandle(top_th->tt_multiple_thandle, + sub_th->th_dev); + if (IS_ERR(st)) + return st; + + st->st_sub_th = sub_th; + + sub_th->th_top = &top_th->tt_super; + sub_thandle_register_stop_cb(st, top_th->tt_multiple_thandle); + return st; +} + +/** + * Get sub thandle. + * + * Get sub thandle from the top thandle according to the sub dt_device. + * + * \param[in] env execution environment + * \param[in] th thandle on the top layer. + * \param[in] sub_dt sub dt_device used to get sub transaction + * + * \retval thandle of sub transaction if succeed + * \retval PTR_ERR(errno) if failed + */ +struct thandle *thandle_get_sub_by_dt(const struct lu_env *env, + struct thandle *th, + struct dt_device *sub_dt) +{ + struct sub_thandle *st = NULL; + struct sub_thandle *master_st = NULL; + struct top_thandle *top_th; + struct thandle *sub_th = NULL; + int rc = 0; + ENTRY; + + top_th = container_of(th, struct top_thandle, tt_super); + + if (likely(sub_dt == top_th->tt_master_sub_thandle->th_dev)) + RETURN(top_th->tt_master_sub_thandle); + + if (top_th->tt_multiple_thandle != NULL) { + st = lookup_sub_thandle(top_th->tt_multiple_thandle, sub_dt); + if (st != NULL) + RETURN(st->st_sub_th); + } + + sub_th = dt_trans_create(env, sub_dt); + if (IS_ERR(sub_th)) + RETURN(sub_th); + + /* Create top_multiple_thandle if necessary */ + if (top_th->tt_multiple_thandle == NULL) { + struct top_multiple_thandle *tmt; + + rc = top_trans_create_tmt(env, top_th); + if (rc < 0) + GOTO(stop_trans, rc); + + tmt = top_th->tt_multiple_thandle; + + /* Add master sub th to the top trans list */ + tmt->tmt_master_sub_dt = + top_th->tt_master_sub_thandle->th_dev; + master_st = create_sub_thandle_with_thandle(top_th, + top_th->tt_master_sub_thandle); + if (IS_ERR(master_st)) { + rc = PTR_ERR(master_st); + master_st = NULL; + GOTO(stop_trans, rc); + } + } + + /* create and init sub th to the top trans list */ + st = create_sub_thandle_with_thandle(top_th, sub_th); + if (IS_ERR(st)) { + rc = PTR_ERR(st); + st = NULL; + GOTO(stop_trans, rc); + } + st->st_sub_th->th_wait_submit = 1; +stop_trans: + if (rc < 0) { + if (master_st != NULL) { + list_del(&master_st->st_sub_list); + OBD_FREE_PTR(master_st); + } + sub_th->th_result = rc; + dt_trans_stop(env, sub_dt, sub_th); + sub_th = ERR_PTR(rc); + } + + RETURN(sub_th); +} 
+EXPORT_SYMBOL(thandle_get_sub_by_dt); + +/** + * Top multiple thandle destroy + * + * Destroy multiple thandle and all its sub thandle. + * + * \param[in] tmt top_multiple_thandle to be destroyed. + */ +void top_multiple_thandle_destroy(struct top_multiple_thandle *tmt) +{ + struct sub_thandle *st; + struct sub_thandle *tmp; + + LASSERT(tmt->tmt_magic == TOP_THANDLE_MAGIC); + list_for_each_entry_safe(st, tmp, &tmt->tmt_sub_thandle_list, + st_sub_list) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_del(&st->st_sub_list); + list_for_each_entry_safe(stc, tmp, &st->st_cookie_list, + stc_list) { + list_del(&stc->stc_list); + OBD_FREE_PTR(stc); + } + OBD_FREE_PTR(st); + } + OBD_FREE_PTR(tmt); +} +EXPORT_SYMBOL(top_multiple_thandle_destroy); + +/** + * Cancel the update log on MDTs + * + * Cancel the update log on MDTs then destroy the thandle. + * + * \param[in] env execution environment + * \param[in] tmt the top multiple thandle whose updates records + * will be cancelled. + * + * \retval 0 if cancellation succeeds. + * \retval negative errno if cancellation fails. + */ +static int distribute_txn_cancel_records(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct sub_thandle *st; + ENTRY; + + top_multiple_thandle_dump(tmt, D_INFO); + /* Cancel update logs on other MDTs */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + struct llog_ctxt *ctxt; + struct obd_device *obd; + struct llog_cookie *cookie; + struct sub_thandle_cookie *stc; + int rc; + + obd = st->st_dt->dd_lu_dev.ld_obd; + ctxt = llog_get_context(obd, LLOG_UPDATELOG_ORIG_CTXT); + if (ctxt == NULL) + continue; + list_for_each_entry(stc, &st->st_cookie_list, stc_list) { + cookie = &stc->stc_cookie; + if (fid_is_zero(&cookie->lgc_lgl.lgl_oi.oi_fid)) + continue; + + rc = llog_cat_cancel_records(env, ctxt->loc_handle, 1, + cookie); + CDEBUG(D_HA, "%s: batchid %llu cancel update log " + DFID".%u: rc = %d\n", obd->obd_name, + tmt->tmt_batchid, + PFID(&cookie->lgc_lgl.lgl_oi.oi_fid), + cookie->lgc_index, rc); + } + + llog_ctxt_put(ctxt); + } + + RETURN(0); +} + +/** + * Check if there are committed transaction + * + * Check if there are committed transaction in the distribute transaction + * list, then cancel the update records for those committed transaction. + * Because the distribute transaction in the list are sorted by batchid, + * and cancellation will be done by batchid order, so we only check the first + * the transaction(with lowest batchid) in the list. + * + * \param[in] lod lod device where cancel thread is + * + * \retval true if it is ready + * \retval false if it is not ready + */ +static bool tdtd_ready_for_cancel_log(struct target_distribute_txn_data *tdtd) +{ + struct top_multiple_thandle *tmt = NULL; + struct obd_device *obd = tdtd->tdtd_lut->lut_obd; + bool ready = false; + + spin_lock(&tdtd->tdtd_batchid_lock); + if (!list_empty(&tdtd->tdtd_list)) { + tmt = list_entry(tdtd->tdtd_list.next, + struct top_multiple_thandle, tmt_commit_list); + if (tmt->tmt_committed && + (!obd->obd_recovering || (obd->obd_recovering && + tmt->tmt_batchid <= tdtd->tdtd_committed_batchid))) + ready = true; + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + return ready; +} + +struct distribute_txn_bid_data { + struct dt_txn_commit_cb dtbd_cb; + struct target_distribute_txn_data *dtbd_tdtd; + __u64 dtbd_batchid; +}; + +/** + * callback of updating commit batchid + * + * Updating commit batchid then wake up the commit thread to cancel the + * records. 
+ * + * \param[in]env execution environment + * \param[in]th thandle to updating commit batchid + * \param[in]cb commit callback + * \param[in]err result of thandle + */ +static void distribute_txn_batchid_cb(struct lu_env *env, + struct thandle *th, + struct dt_txn_commit_cb *cb, + int err) +{ + struct distribute_txn_bid_data *dtbd = NULL; + struct target_distribute_txn_data *tdtd; + + dtbd = container_of0(cb, struct distribute_txn_bid_data, dtbd_cb); + tdtd = dtbd->dtbd_tdtd; + + CDEBUG(D_HA, "%s: %llu batchid updated\n", + tdtd->tdtd_lut->lut_obd->obd_name, dtbd->dtbd_batchid); + spin_lock(&tdtd->tdtd_batchid_lock); + if (dtbd->dtbd_batchid > tdtd->tdtd_committed_batchid && + !tdtd->tdtd_lut->lut_obd->obd_no_transno) + tdtd->tdtd_committed_batchid = dtbd->dtbd_batchid; + spin_unlock(&tdtd->tdtd_batchid_lock); + atomic_dec(&tdtd->tdtd_refcount); + wake_up(&tdtd->tdtd_commit_thread_waitq); + + OBD_FREE_PTR(dtbd); +} + +/** + * Update the commit batchid in disk + * + * Update commit batchid in the disk, after this is committed, it can start + * to cancel the update records. + * + * \param[in] env execution environment + * \param[in] tdtd distribute transaction structure + * \param[in] batchid commit batchid to be updated + * + * \retval 0 if update succeeds. + * \retval negative errno if update fails. + */ +static int +distribute_txn_commit_batchid_update(const struct lu_env *env, + struct target_distribute_txn_data *tdtd, + __u64 batchid) +{ + struct distribute_txn_bid_data *dtbd = NULL; + struct thandle *th; + struct lu_buf buf; + __u64 tmp; + __u64 off; + int rc; + ENTRY; + + OBD_ALLOC_PTR(dtbd); + if (dtbd == NULL) + RETURN(-ENOMEM); + dtbd->dtbd_batchid = batchid; + dtbd->dtbd_tdtd = tdtd; + dtbd->dtbd_cb.dcb_func = distribute_txn_batchid_cb; + atomic_inc(&tdtd->tdtd_refcount); + + th = dt_trans_create(env, tdtd->tdtd_lut->lut_bottom); + if (IS_ERR(th)) { + atomic_dec(&tdtd->tdtd_refcount); + OBD_FREE_PTR(dtbd); + RETURN(PTR_ERR(th)); + } + + tmp = cpu_to_le64(batchid); + buf.lb_buf = &tmp; + buf.lb_len = sizeof(tmp); + off = 0; + + rc = dt_declare_record_write(env, tdtd->tdtd_batchid_obj, &buf, off, + th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, tdtd->tdtd_lut->lut_bottom, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_cb_add(th, &dtbd->dtbd_cb); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_record_write(env, tdtd->tdtd_batchid_obj, &buf, + &off, th); + + CDEBUG(D_INFO, "%s: update batchid %llu: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, batchid, rc); + +stop: + dt_trans_stop(env, tdtd->tdtd_lut->lut_bottom, th); + if (rc < 0) { + atomic_dec(&tdtd->tdtd_refcount); + OBD_FREE_PTR(dtbd); + } + RETURN(rc); +} + +/** + * Init commit batchid for distribute transaction. + * + * Initialize the batchid object and get commit batchid from the object. + * + * \param[in] env execution environment + * \param[in] tdtd distribute transaction whose batchid is initialized. + * + * \retval 0 if initialization succeeds. + * \retval negative errno if initialization fails. 
+ **/ +static int +distribute_txn_commit_batchid_init(const struct lu_env *env, + struct target_distribute_txn_data *tdtd) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lu_target *lut = tdtd->tdtd_lut; + struct lu_attr *attr = &tti->tti_attr; + struct lu_fid *fid = &tti->tti_fid1; + struct dt_object_format *dof = &tti->tti_u.update.tti_update_dof; + struct dt_object *dt_obj = NULL; + struct lu_buf buf; + __u64 tmp; + __u64 off; + int rc; + ENTRY; + + memset(attr, 0, sizeof(*attr)); + attr->la_valid = LA_MODE; + attr->la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof->dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(fid, BATCHID_COMMITTED_OID); + + dt_obj = dt_find_or_create(env, lut->lut_bottom, fid, dof, + attr); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + dt_obj = NULL; + GOTO(out_put, rc); + } + + tdtd->tdtd_batchid_obj = dt_obj; + + buf.lb_buf = &tmp; + buf.lb_len = sizeof(tmp); + off = 0; + rc = dt_read(env, dt_obj, &buf, &off); + if (rc < 0 || (rc < buf.lb_len && rc > 0)) { + CERROR("%s can't read last committed batchid: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, rc); + if (rc > 0) + rc = -EINVAL; + GOTO(out_put, rc); + } else if (rc == buf.lb_len) { + tdtd->tdtd_committed_batchid = le64_to_cpu(tmp); + CDEBUG(D_HA, "%s: committed batchid %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_committed_batchid); + rc = 0; + } + +out_put: + if (rc < 0 && dt_obj != NULL) { + dt_object_put(env, dt_obj); + tdtd->tdtd_batchid_obj = NULL; + } + return rc; +} + +/** + * manage the distribute transaction thread + * + * Distribute transaction are linked to the list, and once the distribute + * transaction is committed, it will update the last committed batchid first, + * after it is committed, it will cancel the records. + * + * \param[in] _arg argument for commit thread + * + * \retval 0 if thread is running successfully + * \retval negative errno if the thread can not be run. + */ +static int distribute_txn_commit_thread(void *_arg) +{ + struct target_distribute_txn_data *tdtd = _arg; + struct lu_target *lut = tdtd->tdtd_lut; + struct ptlrpc_thread *thread = &lut->lut_tdtd_commit_thread; + struct l_wait_info lwi = { 0 }; + struct lu_env env; + struct list_head list; + int rc; + struct top_multiple_thandle *tmt; + struct top_multiple_thandle *tmp; + __u64 batchid = 0, committed; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MD_THREAD); + if (rc != 0) + RETURN(rc); + + spin_lock(&tdtd->tdtd_batchid_lock); + thread->t_flags = SVC_RUNNING; + spin_unlock(&tdtd->tdtd_batchid_lock); + wake_up(&thread->t_ctl_waitq); + INIT_LIST_HEAD(&list); + + CDEBUG(D_HA, "%s: start commit thread committed batchid %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_committed_batchid); + + while (distribute_txn_commit_thread_running(lut)) { + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_safe(tmt, tmp, &tdtd->tdtd_list, + tmt_commit_list) { + if (tmt->tmt_committed == 0) + break; + + /* Note: right now, replay is based on master MDT + * transno, but cancellation is based on batchid. + * so we do not try to cancel the update log until + * the recoverying is done, unless the update records + * batchid < committed_batchid. 
*/ + if (tmt->tmt_batchid <= tdtd->tdtd_committed_batchid) { + list_move_tail(&tmt->tmt_commit_list, &list); + } else if (!tdtd->tdtd_lut->lut_obd->obd_recovering) { + LASSERTF(tmt->tmt_batchid >= batchid, + "tmt %p tmt_batchid: %llu, batchid " + "%llu\n", tmt, tmt->tmt_batchid, + batchid); + /* There are three types of distribution + * transaction result + * + * 1. If tmt_result < 0, it means the + * distribution transaction fails, which should + * be rare, because once declare phase succeeds, + * the operation should succeeds anyway. Note in + * this case, we will still update batchid so + * cancellation would be stopped. + * + * 2. If tmt_result == 0, it means the + * distribution transaction succeeds, and we + * will update batchid. + * + * 3. If tmt_result > 0, it means distribute + * transaction is not yet committed on every + * node, but we need release this tmt before + * that, which usuually happens during umount. + */ + if (tmt->tmt_result <= 0) + batchid = tmt->tmt_batchid; + list_move_tail(&tmt->tmt_commit_list, &list); + } + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + CDEBUG(D_HA, "%s: batchid: %llu committed batchid " + "%llu\n", tdtd->tdtd_lut->lut_obd->obd_name, batchid, + tdtd->tdtd_committed_batchid); + /* update globally committed on a storage */ + if (batchid > tdtd->tdtd_committed_batchid) { + rc = distribute_txn_commit_batchid_update(&env, tdtd, + batchid); + if (rc == 0) + batchid = 0; + } + /* cancel the records for committed batchid's */ + /* XXX: should we postpone cancel's till the end of recovery? */ + committed = tdtd->tdtd_committed_batchid; + list_for_each_entry_safe(tmt, tmp, &list, tmt_commit_list) { + if (tmt->tmt_batchid > committed) + break; + list_del_init(&tmt->tmt_commit_list); + if (tmt->tmt_result <= 0) + distribute_txn_cancel_records(&env, tmt); + top_multiple_thandle_put(tmt); + } + + l_wait_event(tdtd->tdtd_commit_thread_waitq, + !distribute_txn_commit_thread_running(lut) || + committed < tdtd->tdtd_committed_batchid || + tdtd_ready_for_cancel_log(tdtd), &lwi); + }; + + l_wait_event(tdtd->tdtd_commit_thread_waitq, + atomic_read(&tdtd->tdtd_refcount) == 0, &lwi); + + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_safe(tmt, tmp, &tdtd->tdtd_list, + tmt_commit_list) + list_move_tail(&tmt->tmt_commit_list, &list); + spin_unlock(&tdtd->tdtd_batchid_lock); + + CDEBUG(D_INFO, "%s stopping distribute txn commit thread.\n", + tdtd->tdtd_lut->lut_obd->obd_name); + list_for_each_entry_safe(tmt, tmp, &list, tmt_commit_list) { + list_del_init(&tmt->tmt_commit_list); + top_multiple_thandle_dump(tmt, D_HA); + top_multiple_thandle_put(tmt); + } + + thread->t_flags = SVC_STOPPED; + lu_env_fini(&env); + wake_up(&thread->t_ctl_waitq); + + RETURN(0); +} + +/** + * Start llog cancel thread + * + * Start llog cancel(master/slave) thread on LOD + * + * \param[in]lclt cancel log thread to be started. + * + * \retval 0 if the thread is started successfully. + * \retval negative errno if the thread is not being + * started. 
+ */ +int distribute_txn_init(const struct lu_env *env, + struct lu_target *lut, + struct target_distribute_txn_data *tdtd, + __u32 index) +{ + struct task_struct *task; + struct l_wait_info lwi = { 0 }; + int rc; + ENTRY; + + INIT_LIST_HEAD(&tdtd->tdtd_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_finish_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_list); + spin_lock_init(&tdtd->tdtd_batchid_lock); + spin_lock_init(&tdtd->tdtd_replay_list_lock); + tdtd->tdtd_replay_handler = distribute_txn_replay_handle; + tdtd->tdtd_replay_ready = 0; + + tdtd->tdtd_batchid = lut->lut_last_transno + 1; + + init_waitqueue_head(&lut->lut_tdtd_commit_thread.t_ctl_waitq); + init_waitqueue_head(&tdtd->tdtd_commit_thread_waitq); + init_waitqueue_head(&tdtd->tdtd_recovery_threads_waitq); + atomic_set(&tdtd->tdtd_refcount, 0); + atomic_set(&tdtd->tdtd_recovery_threads_count, 0); + + tdtd->tdtd_lut = lut; + if (lut->lut_bottom->dd_rdonly) + RETURN(0); + + rc = distribute_txn_commit_batchid_init(env, tdtd); + if (rc != 0) + RETURN(rc); + + task = kthread_run(distribute_txn_commit_thread, tdtd, "dist_txn-%u", + index); + if (IS_ERR(task)) + RETURN(PTR_ERR(task)); + + l_wait_event(lut->lut_tdtd_commit_thread.t_ctl_waitq, + distribute_txn_commit_thread_running(lut) || + distribute_txn_commit_thread_stopped(lut), &lwi); + RETURN(0); +} +EXPORT_SYMBOL(distribute_txn_init); + +/** + * Stop llog cancel thread + * + * Stop llog cancel(master/slave) thread on LOD and also destory + * all of transaction in the list. + * + * \param[in]lclt cancel log thread to be stopped. + */ +void distribute_txn_fini(const struct lu_env *env, + struct target_distribute_txn_data *tdtd) +{ + struct lu_target *lut = tdtd->tdtd_lut; + + /* Stop cancel thread */ + if (lut == NULL || !distribute_txn_commit_thread_running(lut)) + return; + + spin_lock(&tdtd->tdtd_batchid_lock); + lut->lut_tdtd_commit_thread.t_flags = SVC_STOPPING; + spin_unlock(&tdtd->tdtd_batchid_lock); + wake_up(&tdtd->tdtd_commit_thread_waitq); + wait_event(lut->lut_tdtd_commit_thread.t_ctl_waitq, + lut->lut_tdtd_commit_thread.t_flags & SVC_STOPPED); + + dtrq_list_destroy(tdtd); + if (tdtd->tdtd_batchid_obj != NULL) { + dt_object_put(env, tdtd->tdtd_batchid_obj); + tdtd->tdtd_batchid_obj = NULL; + } +} +EXPORT_SYMBOL(distribute_txn_fini); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h new file mode 100644 index 0000000000000..366a6d168b2c4 --- /dev/null +++ b/drivers/staging/lustrefsx/undef.h @@ -0,0 +1,1191 @@ + +/* enable libcfs CDEBUG, CWARN */ +#undef CDEBUG_ENABLED + +/* enable libcfs ENTRY/EXIT */ +#undef CDEBUG_ENTRY_EXIT + +/* enable page state tracking code */ +#undef CONFIG_DEBUG_PAGESTATE_TRACKING + +/* enable encryption for ldiskfs */ +#undef CONFIG_LDISKFS_FS_ENCRYPTION + +/* posix acls for ldiskfs */ +#undef CONFIG_LDISKFS_FS_POSIX_ACL + +/* enable rw access for ldiskfs */ +#undef CONFIG_LDISKFS_FS_RW + +/* fs security for ldiskfs */ +#undef CONFIG_LDISKFS_FS_SECURITY + +/* extened attributes for ldiskfs */ +#undef CONFIG_LDISKFS_FS_XATTR + +/* enable invariant checking */ +#undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + +/* kernel has cpu affinity support */ +#undef CPU_AFFINITY + +/* both i_dentry/d_alias uses list */ +#undef DATA_FOR_LLITE_IS_LIST + +/* name of ldiskfs debug program */ +#undef DEBUGFS + +/* name of ldiskfs dump program */ +#undef DUMPE2FS + +/* name of ldiskfs fsck program */ +#undef E2FSCK + +/* name of ldiskfs e2fsprogs package */ +#undef E2FSPROGS + +/* name of ldiskfs label program */ +#undef E2LABEL + +/* do 
data checksums */ +#undef ENABLE_CHECKSUM + +/* enable flock by default */ +#undef ENABLE_FLOCK + +/* Use the Pinger */ +#undef ENABLE_PINGER + +/* aes-sha2 is supported by krb5 */ +#undef HAVE_AES_SHA2_SUPPORT + +/* Define to 1 if you have the header file. */ +#undef HAVE_ASM_TYPES_H + +/* backing_dev_info exist */ +#undef HAVE_BACKING_DEV_INFO + +/* BDI_CAP_MAP_COPY exist */ +#undef HAVE_BDI_CAP_MAP_COPY + +/* bio_endio takes only one argument */ +#undef HAVE_BIO_ENDIO_USES_ONE_ARG + +/* bio_end_sector is defined */ +#undef HAVE_BIO_END_SECTOR + +/* 'bio_integrity_enabled' is available */ +#undef HAVE_BIO_INTEGRITY_ENABLED + +/* kernel has bio_integrity_prep_fn */ +#undef HAVE_BIO_INTEGRITY_PREP_FN + +/* bio_integrity_payload.bip_iter exist */ +#undef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD + +/* 'bi_bdev' is available */ +#undef HAVE_BI_BDEV + +/* struct bio has bi_cnt */ +#undef HAVE_BI_CNT + +/* struct bio has bi_rw */ +#undef HAVE_BI_RW + +/* 'bi_status' is available */ +#undef HAVE_BI_STATUS + +/* blkdev_get_by_dev is exported by the kernel */ +#undef HAVE_BLKDEV_GET_BY_DEV + +/* Define to 1 if you have the header file. */ +#undef HAVE_BLKID_BLKID_H + +/* blk_plug struct exists */ +#undef HAVE_BLK_PLUG + +/* blk_queue_max_segments is defined */ +#undef HAVE_BLK_QUEUE_MAX_SEGMENTS + +/* kernel hash_64() is broken */ +#undef HAVE_BROKEN_HASH_64 + +/* kernel has struct bvec_iter */ +#undef HAVE_BVEC_ITER + +/* struct cache_detail has writers */ +#undef HAVE_CACHE_DETAIL_WRITERS + +/* if cache_detail->hash_lock is a spinlock */ +#undef HAVE_CACHE_HASH_SPINLOCK + +/* cache_head has hlist cache_list */ +#undef HAVE_CACHE_HEAD_HLIST + +/* have cache_register */ +#undef HAVE_CACHE_REGISTER + +/* cancel_dirty_page is still available */ +#undef HAVE_CANCEL_DIRTY_PAGE + +/* kernel has clean_bdev_aliases */ +#undef HAVE_CLEAN_BDEV_ALIASES + +/* 'clear_and_wake_up_bit' is available */ +#undef HAVE_CLEAR_AND_WAKE_UP_BIT + +/* have clear_inode */ +#undef HAVE_CLEAR_INODE + +/* compat rdma found */ +#undef HAVE_COMPAT_RDMA + +/* 'cpu_read_lock' exist */ +#undef HAVE_CPUS_READ_LOCK + +/* kernel compiled with CRC32 functions */ +#undef HAVE_CRC32 + +/* crypto hash helper functions are available */ +#undef HAVE_CRYPTO_HASH_HELPERS + +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#undef HAVE_CRYPTO_MAX_ALG_NAME_128 + +/* current_time() has replaced CURRENT_TIME */ +#undef HAVE_CURRENT_TIME + +/* dcache_lock is exist */ +#undef HAVE_DCACHE_LOCK + +/* kernel export delete_from_page_cache */ +#undef HAVE_DELETE_FROM_PAGE_CACHE + +/* dentry.d_child exist */ +#undef HAVE_DENTRY_D_CHILD + +/* hlist dentry.d_u.d_alias exist */ +#undef HAVE_DENTRY_D_U_D_ALIAS + +/* dentry_open uses struct path as first argument */ +#undef HAVE_DENTRY_OPEN_USE_PATH + +/* DES3 enctype is supported by krb5 */ +#undef HAVE_DES3_SUPPORT + +/* direct_IO need 2 arguments */ +#undef HAVE_DIRECTIO_2ARGS + +/* direct IO uses iov_iter */ +#undef HAVE_DIRECTIO_ITER + +/* dirty_inode super_operation takes flag */ +#undef HAVE_DIRTY_INODE_HAS_FLAG + +/* dir_context exist */ +#undef HAVE_DIR_CONTEXT + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_DLFCN_H + +/* Have dmu_object_alloc_dnsize in ZFS */ +#undef HAVE_DMU_OBJECT_ALLOC_DNSIZE + +/* Have dmu_objset_disown() with 3 args */ +#undef HAVE_DMU_OBJSET_DISOWN_3ARG + +/* Have dmu_objset_own() with 6 args */ +#undef HAVE_DMU_OBJSET_OWN_6ARG + +/* Have 6 argument dmu_pretch in ZFS */ +#undef HAVE_DMU_PREFETCH_6ARG + +/* Have dmu_read_by_dnode() in ZFS */ +#undef HAVE_DMU_READ_BY_DNODE + +/* Have dmu_tx_hold_write_by_dnode() in ZFS */ +#undef HAVE_DMU_TX_HOLD_WRITE_BY_DNODE + +/* Have dmu_tx_hold_zap_by_dnode() in ZFS */ +#undef HAVE_DMU_TX_HOLD_ZAP_BY_DNODE + +/* Have dmu_tx_mark_netfree */ +#undef HAVE_DMU_TX_MARK_NETFREE + +/* Have native dnode accounting in ZFS */ +#undef HAVE_DMU_USEROBJ_ACCOUNTING + +/* Have dmu_write_by_dnode() in ZFS */ +#undef HAVE_DMU_WRITE_BY_DNODE + +/* quotactl_ops.set_dqblk takes struct fs_disk_quota */ +#undef HAVE_DQUOT_FS_DISK_QUOTA + +/* quotactl_ops.set_dqblk takes struct kqid */ +#undef HAVE_DQUOT_KQID + +/* quotactl_ops.set_dqblk takes struct qc_dqblk */ +#undef HAVE_DQUOT_QC_DQBLK + +/* dquot_suspend is defined */ +#undef HAVE_DQUOT_SUSPEND + +/* Have dsl_pool_config_enter/exit in ZFS */ +#undef HAVE_DSL_POOL_CONFIG + +/* Have dsl_sync_task_do_nowait in ZFS */ +#undef HAVE_DSL_SYNC_TASK_DO_NOWAIT + +/* dump_trace want address argument */ +#undef HAVE_DUMP_TRACE_ADDRESS + +/* d_compare need 4 arguments */ +#undef HAVE_D_COMPARE_4ARGS + +/* d_compare need 5 arguments */ +#undef HAVE_D_COMPARE_5ARGS + +/* d_compare need 7 arguments */ +#undef HAVE_D_COMPARE_7ARGS + +/* d_count exist */ +#undef HAVE_D_COUNT + +/* d_delete first parameter declared is not const */ +#undef HAVE_D_DELETE_CONST + +/* d_hash_and_lookup is exported by the kernel */ +#undef HAVE_D_HASH_AND_LOOKUP + +/* have d_make_root */ +#undef HAVE_D_MAKE_ROOT + +/* have parent inode as parameter */ +#undef HAVE_ENCODE_FH_PARENT + +/* Define to 1 if you have the header file. */ +#undef HAVE_ENDIAN_H + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_EXT2FS_EXT2FS_H + +/* ext4_bread takes 4 arguments */ +#undef HAVE_EXT4_BREAD_4ARGS + +/* i_dquot is in ext4_inode_info */ +#undef HAVE_EXT4_INFO_DQUOT + +/* ext4_free_blocks do not require struct buffer_head */ +#undef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD + +/* Linux kernel has ext_pblock */ +#undef HAVE_EXT_PBLOCK + +/* file handle and related syscalls are supported */ +#undef HAVE_FHANDLE_GLIBC_SUPPORT + +/* kernel supports fhandles and related syscalls */ +#undef HAVE_FHANDLE_SYSCALLS + +/* kernel has file_dentry */ +#undef HAVE_FILE_DENTRY + +/* file_operations.fsync takes 2 arguments */ +#undef HAVE_FILE_FSYNC_2ARGS + +/* file_operations.fsync takes 4 arguments */ +#undef HAVE_FILE_FSYNC_4ARGS + +/* struct file has member f_inode */ +#undef HAVE_FILE_F_INODE + +/* file_inode() has been defined */ +#undef HAVE_FILE_INODE + +/* generic_file_llseek_size is exported by the kernel */ +#undef HAVE_FILE_LLSEEK_SIZE + +/* kernel has generic_file_llseek_size with 5 args */ +#undef HAVE_FILE_LLSEEK_SIZE_5ARGS + +/* file_operations.[read|write]_iter functions exist */ +#undef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + +/* filldir_t needs struct dir_context as argument */ +#undef HAVE_FILLDIR_USE_CTX + +/* FMR pool API is available */ +#undef HAVE_FMR_POOL_API + +/* fpu/api.h is present */ +#undef HAVE_FPU_API_HEADER + +/* struct file_system_type has mount field */ +#undef HAVE_FSTYPE_MOUNT + +/* fs_struct.lock use rwlock */ +#undef HAVE_FS_STRUCT_RWLOCK + +/* fs_struct use seqcount */ +#undef HAVE_FS_STRUCT_SEQCOUNT + +/* full_name_hash need 3 arguments */ +#undef HAVE_FULL_NAME_HASH_3ARGS + +/* generic_permission taken 2 arguments */ +#undef HAVE_GENERIC_PERMISSION_2ARGS + +/* generic_permission taken 4 arguments */ +#undef HAVE_GENERIC_PERMISSION_4ARGS + +/* generic_write_sync need 2 arguments */ +#undef HAVE_GENERIC_WRITE_SYNC_2ARGS + +/* Define to 1 if you have the `gethostbyname' function. 
*/ +#undef HAVE_GETHOSTBYNAME + +/* 'get_acl' has a rcu argument */ +#undef HAVE_GET_ACL_RCU_ARG + +/* get_request_key_auth() is available */ +#undef HAVE_GET_REQUEST_KEY_AUTH + +/* get_user_pages takes 6 arguments */ +#undef HAVE_GET_USER_PAGES_6ARG + +/* get_user_pages takes gup_flags in arguments */ +#undef HAVE_GET_USER_PAGES_GUP_FLAGS + +/* struct group_info has member gid */ +#undef HAVE_GROUP_INFO_GID + +/* Define this is if you enable gss */ +#undef HAVE_GSS + +/* Define this if you enable gss keyring backend */ +#undef HAVE_GSS_KEYRING + +/* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ +#undef HAVE_GSS_KRB5_CCACHE_NAME + +/* '__rhashtable_insert_fast()' returns int */ +#undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + +/* Define this if you have Heimdal Kerberos libraries */ +#undef HAVE_HEIMDAL + +/* hlist_add_after is available */ +#undef HAVE_HLIST_ADD_AFTER + +/* hlist_for_each_entry has 3 args */ +#undef HAVE_HLIST_FOR_EACH_3ARG + +/* hotplug state machine is supported */ +#undef HAVE_HOTPLUG_STATE_MACHINE + +/* ib_alloc_fast_reg_mr is defined */ +#undef HAVE_IB_ALLOC_FAST_REG_MR + +/* ib_alloc_pd has 2 arguments */ +#undef HAVE_IB_ALLOC_PD_2ARGS + +/* struct ib_cq_init_attr is used by ib_create_cq */ +#undef HAVE_IB_CQ_INIT_ATTR + +/* struct ib_device.attrs is defined */ +#undef HAVE_IB_DEVICE_ATTRS + +/* if struct ib_device_ops is defined */ +#undef HAVE_IB_DEVICE_OPS + +/* ib_get_dma_mr is defined */ +#undef HAVE_IB_GET_DMA_MR + +/* function ib_inc_rkey exist */ +#undef HAVE_IB_INC_RKEY + +/* ib_map_mr_sg exists */ +#undef HAVE_IB_MAP_MR_SG + +/* ib_map_mr_sg has 5 arguments */ +#undef HAVE_IB_MAP_MR_SG_5ARGS + +/* ib_post_send and ib_post_recv have const parameters */ +#undef HAVE_IB_POST_SEND_RECV_CONST + +/* struct ib_rdma_wr is defined */ +#undef HAVE_IB_RDMA_WR + +/* if ib_sg_dma_address wrapper exists */ +#undef HAVE_IB_SG_DMA_ADDRESS + +/* INIT_LIST_HEAD_RCU exists */ +#undef HAVE_INIT_LIST_HEAD_RCU + +/* inode_operations .getattr member function can gather advance stats */ +#undef HAVE_INODEOPS_ENHANCED_GETATTR + +/* inode_operations has .truncate member function */ +#undef HAVE_INODEOPS_TRUNCATE + +/* inode_operations use umode_t as parameter */ +#undef HAVE_INODEOPS_USE_UMODE_T + +/* inode->i_alloc_sem is killed and use inode_dio_wait */ +#undef HAVE_INODE_DIO_WAIT + +/* inode.i_rcu exists */ +#undef HAVE_INODE_I_RCU + +/* inode_lock is defined */ +#undef HAVE_INODE_LOCK + +/* inode_owner_or_capable exist */ +#undef HAVE_INODE_OWNER_OR_CAPABLE + +/* inode_operations->permission has two args */ +#undef HAVE_INODE_PERMISION_2ARGS + +/* inode times are using timespec64 */ +#undef HAVE_INODE_TIMESPEC64 + +/* blk_integrity.interval exist */ +#undef HAVE_INTERVAL_BLK_INTEGRITY + +/* blk_integrity.interval_exp exist */ +#undef HAVE_INTERVAL_EXP_BLK_INTEGRITY + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_INTTYPES_H + +/* address_space_operations.invalidatepage needs 3 arguments */ +#undef HAVE_INVALIDATE_RANGE + +/* have in_compat_syscall */ +#undef HAVE_IN_COMPAT_SYSCALL + +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#undef HAVE_IN_DEV_FOR_EACH_IFA_RTNL + +/* inode_operations->rename need flags as argument */ +#undef HAVE_IOPS_RENAME_WITH_FLAGS + +/* have iop atomic_open */ +#undef HAVE_IOP_ATOMIC_OPEN + +/* generic_readlink has been removed */ +#undef HAVE_IOP_GENERIC_READLINK + +/* inode_operations has .get_acl member function */ +#undef HAVE_IOP_GET_ACL + +/* have iop get_link */ +#undef HAVE_IOP_GET_LINK + +/* inode_operations has .set_acl member function */ +#undef HAVE_IOP_SET_ACL + +/* inode_operations has {get,set,remove}xattr members */ +#undef HAVE_IOP_XATTR + +/* if iov_iter has member iter_type */ +#undef HAVE_IOV_ITER_HAS_TYPE_MEMBER + +/* iov_iter_init handles directional tag */ +#undef HAVE_IOV_ITER_INIT_DIRECTION + +/* iov_iter_rw exist */ +#undef HAVE_IOV_ITER_RW + +/* iov_iter_truncate exists */ +#undef HAVE_IOV_ITER_TRUNCATE + +/* if iov_iter_type exists */ +#undef HAVE_IOV_ITER_TYPE + +/* is_root_inode defined */ +#undef HAVE_IS_ROOT_INODE + +/* is_sxid is defined */ +#undef HAVE_IS_SXID + +/* 'iterate_shared' is available */ +#undef HAVE_ITERATE_SHARED + +/* struct address_space has i_pages */ +#undef HAVE_I_PAGES + +/* i_uid_read is present */ +#undef HAVE_I_UID_READ + +/* kallsyms_lookup_name is exported by kernel */ +#undef HAVE_KALLSYMS_LOOKUP_NAME + +/* kernel_locked is defined */ +#undef HAVE_KERNEL_LOCKED + +/* 'kernel_param_[un]lock' is available */ +#undef HAVE_KERNEL_PARAM_LOCK + +/* 'struct kernel_param_ops' is available */ +#undef HAVE_KERNEL_PARAM_OPS + +/* kernel_setsockopt still in use */ +#undef HAVE_KERNEL_SETSOCKOPT + +/* 'struct sock' accept function requires bool argument */ +#undef HAVE_KERN_SOCK_ACCEPT_FLAG_ARG + +/* 'getname' has two args */ +#undef HAVE_KERN_SOCK_GETNAME_2ARGS + +/* struct key_match_data exist */ +#undef HAVE_KEY_MATCH_DATA + +/* payload.data is an array */ +#undef HAVE_KEY_PAYLOAD_DATA_ARRAY + +/* key_type->instantiate has two args */ +#undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS + +/* key.usage is of type refcount_t */ +#undef HAVE_KEY_USAGE_REFCOUNT + +/* ki_left exist */ +#undef HAVE_KIOCB_KI_LEFT + +/* ki_nbytes field exist */ +#undef HAVE_KI_NBYTES + +/* have kmap_atomic has only 1 argument */ +#undef HAVE_KMAP_ATOMIC_HAS_1ARG + +/* kmap_to_page is exported by the kernel */ +#undef HAVE_KMAP_TO_PAGE + +/* Define this if you have MIT Kerberos libraries */ +#undef HAVE_KRB5 + +/* Define this if the function krb5int_derive_key is available */ +#undef HAVE_KRB5INT_DERIVE_KEY + +/* Define this if the function krb5_derive_key is available */ +#undef HAVE_KRB5_DERIVE_KEY + +/* Define this if the function krb5_get_error_message is available */ +#undef HAVE_KRB5_GET_ERROR_MESSAGE + +/* Define this if the function krb5_get_init_creds_opt_set_addressless is + available */ +#undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS + +/* kset_find_obj is exported by the kernel */ +#undef HAVE_KSET_FIND_OBJ + +/* kernel has kstrtobool_from_user */ +#undef HAVE_KSTRTOBOOL_FROM_USER + +/* kernel has kstrtoul */ +#undef HAVE_KSTRTOUL + +/* kthread_worker found */ +#undef HAVE_KTHREAD_WORK + +/* ktime_add is available */ +#undef HAVE_KTIME_ADD + +/* ktime_after is available */ +#undef HAVE_KTIME_AFTER + +/* ktime_before is available */ +#undef HAVE_KTIME_BEFORE + +/* ktime_compare is available */ +#undef HAVE_KTIME_COMPARE + +/* 
'ktime_get_real_seconds' is available */ +#undef HAVE_KTIME_GET_REAL_SECONDS + +/* 'ktime_get_real_ts64' is available */ +#undef HAVE_KTIME_GET_REAL_TS64 + +/* 'ktime_get_seconds' is available */ +#undef HAVE_KTIME_GET_SECONDS + +/* 'ktime_get_ts64' is available */ +#undef HAVE_KTIME_GET_TS64 + +/* 'ktime_ms_delta' is available */ +#undef HAVE_KTIME_MS_DELTA + +/* 'ktime_to_timespec64' is available */ +#undef HAVE_KTIME_TO_TIMESPEC64 + +/* enable use of ldiskfsprogs package */ +#undef HAVE_LDISKFSPROGS + +/* kernel has ext4_map_blocks */ +#undef HAVE_LDISKFS_MAP_BLOCKS + +/* Enable ldiskfs osd */ +#undef HAVE_LDISKFS_OSD + +/* libefence support is requested */ +#undef HAVE_LIBEFENCE + +/* Define to 1 if you have the `keyutils' library (-lkeyutils). */ +#undef HAVE_LIBKEYUTILS + +/* build with libmount */ +#undef HAVE_LIBMOUNT + +/* use libpthread for libcfs library */ +#undef HAVE_LIBPTHREAD + +/* readline library is available */ +#undef HAVE_LIBREADLINE + +/* linux/rhashtable.h is present */ +#undef HAVE_LINUX_RHASHTABLE_H + +/* if linux/selinux.h exists */ +#undef HAVE_LINUX_SELINUX_IS_ENABLED + +/* linux/stdarg.h is present */ +#undef HAVE_LINUX_STDARG_HEADER + +/* lock_manager_operations has lm_compare_owner */ +#undef HAVE_LM_COMPARE_OWNER + +/* lock-manager ops renamed to lm_xxx */ +#undef HAVE_LM_XXX_LOCK_MANAGER_OPS + +/* kernel has locks_lock_file_wait */ +#undef HAVE_LOCKS_LOCK_FILE_WAIT + +/* lookup_user_key() is available */ +#undef HAVE_LOOKUP_USER_KEY + +/* kernel has LOOP_CTL_GET_FREE */ +#undef HAVE_LOOP_CTL_GET_FREE + +/* Enable lru resize support */ +#undef HAVE_LRU_RESIZE_SUPPORT + +/* Define this if the Kerberos GSS library supports + gss_krb5_export_lucid_sec_context */ +#undef HAVE_LUCID_CONTEXT_SUPPORT + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* address_space_operations.migratepage has 4 args */ +#undef HAVE_MIGRATEPAGE_4ARGS + +/* kernel has include/linux/migrate.h */ +#undef HAVE_MIGRATE_H + +/* kernel has include/linux/migrate_mode.h */ +#undef HAVE_MIGRATE_MODE_H + +/* mmap_lock API is available. */ +#undef HAVE_MMAP_LOCK + +/* kernel module loading is possible */ +#undef HAVE_MODULE_LOADING_SUPPORT + +/* locking module param is supported */ +#undef HAVE_MODULE_PARAM_LOCKING + +/* Define to 1 if you have the `name_to_handle_at' function. */ +#undef HAVE_NAME_TO_HANDLE_AT + +/* Define to 1 if you have the header file. */ +#undef HAVE_NETDB_H + +/* cancel_dirty_page with one arguement is available */ +#undef HAVE_NEW_CANCEL_DIRTY_PAGE + +/* DEFINE_TIMER uses only 2 arguements */ +#undef HAVE_NEW_DEFINE_TIMER + +/* 'kernel_write' aligns with read/write helpers */ +#undef HAVE_NEW_KERNEL_WRITE + +/* NR_UNSTABLE_NFS is still in use. 
*/ +#undef HAVE_NR_UNSTABLE_NFS + +/* ns_to_timespec64() is available */ +#undef HAVE_NS_TO_TIMESPEC64 + +/* with oldsize */ +#undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE + +/* openssl-devel is present */ +#undef HAVE_OPENSSL_GETSEPOL + +/* OpenSSL HMAC functions needed for SSK */ +#undef HAVE_OPENSSL_SSK + +/* 'pagevec_init' takes one parameter */ +#undef HAVE_PAGEVEC_INIT_ONE_PARAM + +/* linux/panic_notifier.h is present */ +#undef HAVE_PANIC_NOTIFIER_H + +/* 'param_set_uint_minmax' is available */ +#undef HAVE_PARAM_SET_UINT_MINMAX + +/* have PCLMULQDQ instruction */ +#undef HAVE_PCLMULQDQ + +/* percpu_counter_init uses GFP_* flag */ +#undef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + +/* 'struct nsproxy' has 'pid_ns_for_children' */ +#undef HAVE_PID_NS_FOR_CHILDREN + +/* posix_acl_to_xattr takes struct user_namespace */ +#undef HAVE_POSIXACL_USER_NS + +/* 'posix_acl_update_mode' is available */ +#undef HAVE_POSIX_ACL_UPDATE_MODE + +/* posix_acl_valid takes struct user_namespace */ +#undef HAVE_POSIX_ACL_VALID_USER_NS + +/* 'prepare_to_wait_event' is available */ +#undef HAVE_PREPARE_TO_WAIT_EVENT + +/* struct proc_ops exists */ +#undef HAVE_PROC_OPS + +/* proc_remove is defined */ +#undef HAVE_PROC_REMOVE + +/* get_projid function exists */ +#undef HAVE_PROJECT_QUOTA + +/* inode->i_nlink is protected from direct modification */ +#undef HAVE_PROTECT_I_NLINK + +/* 'PTR_ERR_OR_ZERO' exist */ +#undef HAVE_PTR_ERR_OR_ZERO + +/* have quota64 */ +#undef HAVE_QUOTA64 + +/* radix_tree_exceptional_entry exist */ +#undef HAVE_RADIX_EXCEPTION_ENTRY + +/* rdma_connect_locked is defined */ +#undef HAVE_RDMA_CONNECT_LOCKED + +/* rdma_create_id wants 4 args */ +#undef HAVE_RDMA_CREATE_ID_4ARG + +/* rdma_create_id wants 5 args */ +#undef HAVE_RDMA_CREATE_ID_5ARG + +/* rdma_reject has 4 arguments */ +#undef HAVE_RDMA_REJECT_4ARGS + +/* kernel export remove_from_page_cache */ +#undef HAVE_REMOVE_FROM_PAGE_CACHE + +/* remove_proc_subtree is defined */ +#undef HAVE_REMOVE_PROC_SUBTREE + +/* rhashtable_lookup() is available */ +#undef HAVE_RHASHTABLE_LOOKUP + +/* rhashtable_lookup_get_insert_fast() is available */ +#undef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST + +/* struct rhltable exist */ +#undef HAVE_RHLTABLE + +/* save_stack_trace_tsk is exported */ +#undef HAVE_SAVE_STACK_TRACE_TSK + +/* Have sa_spill_alloc in ZFS */ +#undef HAVE_SA_SPILL_ALLOC + +/* super_operations.evict_inode() is exist in kernel */ +#undef HAVE_SBOPS_EVICT_INODE + +/* kernel supports wrapped FS freeze functions */ +#undef HAVE_SB_START_WRITE + +/* Define to 1 if you have the header file. */ +#undef HAVE_SCHED_H + +/* linux/sched header directory exist */ +#undef HAVE_SCHED_HEADERS + +/* security_dentry_init_security' is defined */ +#undef HAVE_SECURITY_DENTRY_INIT_SECURITY + +/* security_inode_init_security takes a callback to set xattrs */ +#undef HAVE_SECURITY_IINITSEC_CALLBACK + +/* security_inode_init_security takes a 'struct qstr' parameter */ +#undef HAVE_SECURITY_IINITSEC_QSTR + +/* security_inode_listsecurity() is available/exported */ +#undef HAVE_SECURITY_INODE_LISTSECURITY + +/* security_release_secctx has 1 arg. */ +#undef HAVE_SEC_RELEASE_SECCTX_1ARG + +/* support for selinux */ +#undef HAVE_SELINUX + +/* Define to 1 if you have the header file. */ +#undef HAVE_SELINUX_SELINUX_H + +/* support server */ +#undef HAVE_SERVER_SUPPORT + +/* Define to 1 if you have the `setns' function. 
*/ +#undef HAVE_SETNS + +/* Define this if the Kerberos GSS library supports + gss_krb5_set_allowable_enctypes */ +#undef HAVE_SET_ALLOWABLE_ENCTYPES + +/* shrinker has count_objects member */ +#undef HAVE_SHRINKER_COUNT + +/* shrinker want self pointer in handler */ +#undef HAVE_SHRINKER_WANT_SHRINK_PTR + +/* shrink_control is present */ +#undef HAVE_SHRINK_CONTROL + +/* simple_setattr is exported by the kernel */ +#undef HAVE_SIMPLE_SETATTR + +/* sk_data_ready uses only one argument */ +#undef HAVE_SK_DATA_READY_ONE_ARG + +/* kernel has sk_sleep */ +#undef HAVE_SK_SLEEP + +/* sock_create_kern use net as first parameter */ +#undef HAVE_SOCK_CREATE_KERN_USE_NET + +/* Have spa_maxblocksize in ZFS */ +#undef HAVE_SPA_MAXBLOCKSIZE + +/* struct stacktrace_ops exists */ +#undef HAVE_STACKTRACE_OPS + +/* stacktrace_ops.warning is exist */ +#undef HAVE_STACKTRACE_WARNING + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDLIB_H + +/* stringhash.h is present */ +#undef HAVE_STRINGHASH + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the `strnlen' function. */ +#undef HAVE_STRNLEN + +/* struct posix_acl_xattr_{header,entry} defined */ +#undef HAVE_STRUCT_POSIX_ACL_XATTR + +/* submit_bio takes two arguments */ +#undef HAVE_SUBMIT_BIO_2ARGS + +/* sunrpc_cache_pipe_upcall takes 3 args */ +#undef HAVE_SUNRPC_UPCALL_HAS_3ARGS + +/* super_operations use dentry as parameter */ +#undef HAVE_SUPEROPS_USE_DENTRY + +/* 'super_setup_bdi_name' is available */ +#undef HAVE_SUPER_SETUP_BDI_NAME + +/* symlink inode operations need struct nameidata argument */ +#undef HAVE_SYMLINK_OPS_USE_NAMEIDATA + +/* new_sync_[read|write] is exported by the kernel */ +#undef HAVE_SYNC_READ_WRITE + +/* ctl_table has ctl_name field */ +#undef HAVE_SYSCTL_CTLNAME + +/* Define to 1 if you have . */ +#undef HAVE_SYS_QUOTA_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* task_is_running() is defined */ +#undef HAVE_TASK_IS_RUNNING + +/* tcp_sendpage use socket as first parameter */ +#undef HAVE_TCP_SENDPAGE_USE_SOCKET + +/* 'tcp_sock_set_keepidle()' exists */ +#undef HAVE_TCP_SOCK_SET_KEEPIDLE + +/* 'tcp_sock_set_nodelay()' exists */ +#undef HAVE_TCP_SOCK_SET_NODELAY + +/* timer_setup has replaced setup_timer */ +#undef HAVE_TIMER_SETUP + +/* 'struct timespec64' is available */ +#undef HAVE_TIMESPEC64 + +/* 'timespec64_sub' is available */ +#undef HAVE_TIMESPEC64_SUB + +/* 'timespec64_to_ktime' is available */ +#undef HAVE_TIMESPEC64_TO_KTIME + +/* topology_sibling_cpumask is available */ +#undef HAVE_TOPOLOGY_SIBLING_CPUMASK + +/* if totalram_pages is a function */ +#undef HAVE_TOTALRAM_PAGES_AS_FUNC + +/* kernel export truncate_complete_page */ +#undef HAVE_TRUNCATE_COMPLETE_PAGE + +/* kernel has truncate_inode_pages_final */ +#undef HAVE_TRUNCATE_INODE_PAGES_FINAL + +/* if MS_RDONLY was moved to uapi/linux/mount.h */ +#undef HAVE_UAPI_LINUX_MOUNT_H + +/* uidgid.h is present */ +#undef HAVE_UIDGID_HEADER + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_UNISTD_H + +/* 'inode_operations' members have user namespace argument */ +#undef HAVE_USER_NAMESPACE_ARG + +/* kernel has vfs_rename with 5 args */ +#undef HAVE_VFS_RENAME_5ARGS + +/* kernel has vfs_rename with 6 args */ +#undef HAVE_VFS_RENAME_6ARGS + +/* '__vfs_setxattr is available */ +#undef HAVE_VFS_SETXATTR + +/* kernel has vfs_unlink with 3 args */ +#undef HAVE_VFS_UNLINK_3ARGS + +/* __vmalloc only takes 2 args. */ +#undef HAVE_VMALLOC_2ARGS + +/* virtual_address has been replaced by address field */ +#undef HAVE_VM_FAULT_ADDRESS + +/* if VM_FAULT_RETRY is defined */ +#undef HAVE_VM_FAULT_RETRY + +/* if vm_fault_t type exists */ +#undef HAVE_VM_FAULT_T + +/* 'struct vm_operations' remove struct vm_area_struct argument */ +#undef HAVE_VM_OPS_USE_VM_FAULT_ONLY + +/* wait_bit.h is present */ +#undef HAVE_WAIT_BIT_HEADER_H + +/* 'wait_queue_entry_t' is available */ +#undef HAVE_WAIT_QUEUE_ENTRY + +/* linux wait_queue_head_t list_head is name head */ +#undef HAVE_WAIT_QUEUE_ENTRY_LIST + +/* 'wait_var_event' is available */ +#undef HAVE_WAIT_VAR_EVENT + +/* flags field exist */ +#undef HAVE_XATTR_HANDLER_FLAGS + +/* needs inode parameter */ +#undef HAVE_XATTR_HANDLER_INODE_PARAM + +/* xattr_handler has a name member */ +#undef HAVE_XATTR_HANDLER_NAME + +/* handler pointer is parameter */ +#undef HAVE_XATTR_HANDLER_SIMPLIFIED + +/* xa_is_value exist */ +#undef HAVE_XA_IS_VALUE + +/* Have zap_add_by_dnode() in ZFS */ +#undef HAVE_ZAP_ADD_BY_DNODE + +/* Have zap_lookup_by_dnode() in ZFS */ +#undef HAVE_ZAP_LOOKUP_BY_DNODE + +/* Have zap_remove_by_dnode() in ZFS */ +#undef HAVE_ZAP_REMOVE_ADD_BY_DNODE + +/* Have inode_timespec_t */ +#undef HAVE_ZFS_INODE_TIMESPEC + +/* Have multihost protection in ZFS */ +#undef HAVE_ZFS_MULTIHOST + +/* Enable zfs osd */ +#undef HAVE_ZFS_OSD + +/* Have zfs_refcount_add */ +#undef HAVE_ZFS_REFCOUNT_ADD + +/* __add_wait_queue_exclusive exists */ +#undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE + +/* ext4_journal_start takes 3 arguments */ +#undef JOURNAL_START_HAS_3ARGS + +/* Define this as the Kerberos version number */ +#undef KRB5_VERSION + +/* enable libcfs LASSERT, LASSERTF */ +#undef LIBCFS_DEBUG + +/* use dumplog on panic */ +#undef LNET_DUMP_ON_PANIC + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#undef LT_OBJDIR + +/* Fourth number in the Lustre version */ +#undef LUSTRE_FIX + +/* First number in the Lustre version */ +#undef LUSTRE_MAJOR + +/* Second number in the Lustre version */ +#undef LUSTRE_MINOR + +/* Third number in the Lustre version */ +#undef LUSTRE_PATCH + +/* A copy of PACKAGE_VERSION */ +#undef LUSTRE_VERSION_STRING + +/* maximum number of MDS threads */ +#undef MDS_MAX_THREADS + +/* Report minimum OST free space */ +#undef MIN_DF + +/* name of ldiskfs mkfs program */ +#undef MKE2FS + +/* need pclmulqdq based crc32c */ +#undef NEED_CRC32C_ACCEL + +/* need pclmulqdq based crc32 */ +#undef NEED_CRC32_ACCEL + +/* 'ktime_get_ns' is not available */ +#undef NEED_KTIME_GET_NS + +/* 'ktime_get_real_ns' is not available */ +#undef NEED_KTIME_GET_REAL_NS + +/* enable nodemap proc debug support */ +#undef NODEMAP_PROC_DEBUG + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. 
*/ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* name of parallel fsck program */ +#undef PFSCK + +/* enable randomly alloc failure */ +#undef RANDOM_FAIL_ALLOC + +/* The size of `unsigned long long', as computed by sizeof. */ +#undef SIZEOF_UNSIGNED_LONG_LONG + +/* use tunable backoff TCP */ +#undef SOCKNAL_BACKOFF + +/* tunable backoff TCP in ms */ +#undef SOCKNAL_BACKOFF_MS + +/* 'struct stacktrace_ops' address function returns an int */ +#undef STACKTRACE_OPS_ADDRESS_RETURN_INT + +/* 'struct stacktrace_ops' has 'walk_stack' field */ +#undef STACKTRACE_OPS_HAVE_WALK_STACK + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* name of ldiskfs tune program */ +#undef TUNE2FS + +/* Define this if the private function, gss_krb5_cache_name, must be used to + tell the Kerberos library which credentials cache to use. Otherwise, this + is done by setting the KRB5CCNAME environment variable */ +#undef USE_GSS_KRB5_CCACHE_NAME + +/* Write when Checking Health */ +#undef USE_HEALTH_CHECK_WRITE + +/* enable lu_ref reference tracking code */ +#undef USE_LU_REF + +/* Version number of package */ +#undef VERSION + +/* zfs fix version */ +#undef ZFS_FIX + +/* zfs major version */ +#undef ZFS_MAJOR + +/* zfs minor version */ +#undef ZFS_MINOR + +/* zfs patch version */ +#undef ZFS_PATCH diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index b9c8e40252142..fb24b4f1957f8 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -652,8 +652,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) !vhost_vq_avail_empty(vq->dev, vq); } -#define SKB_FRAG_PAGE_ORDER get_order(32768) - static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig index 80c5f9c16ec17..95d82c97fdc4d 100644 --- a/drivers/virt/Kconfig +++ b/drivers/virt/Kconfig @@ -13,6 +13,19 @@ menuconfig VIRT_DRIVERS if VIRT_DRIVERS +config VMGENID + tristate "Virtual Machine Generation ID driver" + depends on ACPI && SYSGENID + help + The driver uses the hypervisor provided Virtual Machine Generation ID + to drive the system generation counter mechanism exposed by sysgenid. + The vmgenid changes on VM snapshots or VM cloning. The hypervisor + provided 128-bit vmgenid is also used as device randomness to improve + kernel entropy following VM snapshot events. + + To compile this driver as a module, choose M here: the + module will be called vmgenid. + config FSL_HV_MANAGER tristate "Freescale hypervisor management driver" depends on FSL_SOC diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile index f28425ce4b39b..889be010884b4 100644 --- a/drivers/virt/Makefile +++ b/drivers/virt/Makefile @@ -4,6 +4,7 @@ # obj-$(CONFIG_FSL_HV_MANAGER) += fsl_hypervisor.o +obj-$(CONFIG_VMGENID) += vmgenid.o obj-y += vboxguest/ obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves/ diff --git a/drivers/virt/nitro_enclaves/Kconfig b/drivers/virt/nitro_enclaves/Kconfig index 8c9387a232df8..f53740b941c0f 100644 --- a/drivers/virt/nitro_enclaves/Kconfig +++ b/drivers/virt/nitro_enclaves/Kconfig @@ -1,17 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 # -# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. # Amazon Nitro Enclaves (NE) support. 
# Nitro is a hypervisor that has been developed by Amazon. -# TODO: Add dependency for ARM64 once NE is supported on Arm platforms. For now, -# the NE kernel driver can be built for aarch64 arch. -# depends on (ARM64 || X86) && HOTPLUG_CPU && PCI && SMP - config NITRO_ENCLAVES tristate "Nitro Enclaves Support" - depends on X86 && HOTPLUG_CPU && PCI && SMP + depends on (ARM64 || X86) && HOTPLUG_CPU && PCI && SMP help This driver consists of support for enclave lifetime management for Nitro Enclaves (NE). diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.c b/drivers/virt/nitro_enclaves/ne_misc_dev.c index e21e1e86ad15f..8939612ee0e08 100644 --- a/drivers/virt/nitro_enclaves/ne_misc_dev.c +++ b/drivers/virt/nitro_enclaves/ne_misc_dev.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ /** @@ -284,8 +284,8 @@ static int ne_setup_cpu_pool(const char *ne_cpu_list) ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core; ne_cpu_pool.avail_threads_per_core = kcalloc(ne_cpu_pool.nr_parent_vm_cores, - sizeof(*ne_cpu_pool.avail_threads_per_core), - GFP_KERNEL); + sizeof(*ne_cpu_pool.avail_threads_per_core), + GFP_KERNEL); if (!ne_cpu_pool.avail_threads_per_core) { rc = -ENOMEM; @@ -735,7 +735,7 @@ static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id) * * Negative return value on failure. */ static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave, - struct ne_user_memory_region mem_region) + struct ne_user_memory_region mem_region) { struct ne_mem_region *ne_mem_region = NULL; @@ -771,7 +771,7 @@ static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave, u64 userspace_addr = ne_mem_region->userspace_addr; if ((userspace_addr <= mem_region.userspace_addr && - mem_region.userspace_addr < (userspace_addr + memory_size)) || + mem_region.userspace_addr < (userspace_addr + memory_size)) || (mem_region.userspace_addr <= userspace_addr && (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) { dev_err_ratelimited(ne_misc_dev.this_device, @@ -836,7 +836,7 @@ static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave, * * Negative return value on failure. */ static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave, - struct ne_user_memory_region mem_region) + struct ne_user_memory_region mem_region) { long gup_rc = 0; unsigned long i = 0; @@ -1014,7 +1014,7 @@ static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave, * * Negative return value on failure. 
*/ static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave, - struct ne_enclave_start_info *enclave_start_info) + struct ne_enclave_start_info *enclave_start_info) { struct ne_pci_dev_cmd_reply cmd_reply = {}; unsigned int cpu = 0; @@ -1574,7 +1574,8 @@ static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_ui mutex_unlock(&ne_cpu_pool.mutex); ne_enclave->threads_per_core = kcalloc(ne_enclave->nr_parent_vm_cores, - sizeof(*ne_enclave->threads_per_core), GFP_KERNEL); + sizeof(*ne_enclave->threads_per_core), + GFP_KERNEL); if (!ne_enclave->threads_per_core) { rc = -ENOMEM; diff --git a/drivers/virt/nitro_enclaves/ne_pci_dev.c b/drivers/virt/nitro_enclaves/ne_pci_dev.c index b9c1de41e300c..40b49ec8e30b1 100644 --- a/drivers/virt/nitro_enclaves/ne_pci_dev.c +++ b/drivers/virt/nitro_enclaves/ne_pci_dev.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ /** @@ -480,6 +480,8 @@ static int ne_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto free_ne_pci_dev; } + pci_set_master(pdev); + rc = pci_request_regions_exclusive(pdev, "nitro_enclaves"); if (rc < 0) { dev_err(&pdev->dev, "Error in pci request regions [rc=%d]\n", rc); diff --git a/drivers/virt/nitro_enclaves/ne_pci_dev.h b/drivers/virt/nitro_enclaves/ne_pci_dev.h index 8bfbc66078185..6e9f28971a4e0 100644 --- a/drivers/virt/nitro_enclaves/ne_pci_dev.h +++ b/drivers/virt/nitro_enclaves/ne_pci_dev.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ #ifndef _NE_PCI_DEV_H_ @@ -84,9 +84,13 @@ */ /** - * NE_SEND_DATA_SIZE / NE_RECV_DATA_SIZE - 240 bytes for send / recv buffer. + * NE_SEND_DATA_SIZE - Size of the send buffer, in bytes. */ #define NE_SEND_DATA_SIZE (240) + +/** + * NE_RECV_DATA_SIZE - Size of the receive buffer, in bytes. + */ #define NE_RECV_DATA_SIZE (240) /** diff --git a/drivers/virt/vmgenid.c b/drivers/virt/vmgenid.c new file mode 100644 index 0000000000000..d9d089a6c4a59 --- /dev/null +++ b/drivers/virt/vmgenid.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Virtual Machine Generation ID driver + * + * Copyright (C) 2018 Red Hat Inc. All rights reserved. + * + * Copyright (C) 2020 Amazon. All rights reserved. 
+ * + * Authors: + * Adrian Catangiu + * Or Idgar + * Gal Hammer + * + */ +#include +#include +#include +#include +#include +#include + +#define DEV_NAME "vmgenid" +ACPI_MODULE_NAME(DEV_NAME); + +struct vmgenid_data { + uuid_t uuid; + void *uuid_iomap; +}; +static struct vmgenid_data vmgenid_data; + +static int vmgenid_acpi_map(struct vmgenid_data *priv, acpi_handle handle) +{ + int i; + phys_addr_t phys_addr; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + acpi_status status; + union acpi_object *pss; + union acpi_object *element; + + status = acpi_evaluate_object(handle, "ADDR", NULL, &buffer); + if (ACPI_FAILURE(status)) { + ACPI_EXCEPTION((AE_INFO, status, "Evaluating ADDR")); + return -ENODEV; + } + pss = buffer.pointer; + if (!pss || pss->type != ACPI_TYPE_PACKAGE || pss->package.count != 2) + return -EINVAL; + + phys_addr = 0; + for (i = 0; i < pss->package.count; i++) { + element = &(pss->package.elements[i]); + if (element->type != ACPI_TYPE_INTEGER) + return -EINVAL; + phys_addr |= element->integer.value << i * 32; + } + + priv->uuid_iomap = acpi_os_map_memory(phys_addr, sizeof(uuid_t)); + if (!priv->uuid_iomap) { + pr_err("Could not map memory at 0x%llx, size %u\n", + phys_addr, + (u32) sizeof(uuid_t)); + return -ENOMEM; + } + + memcpy_fromio(&priv->uuid, priv->uuid_iomap, sizeof(uuid_t)); + + return 0; +} + +static int vmgenid_acpi_add(struct acpi_device *device) +{ + int ret; + + if (!device) + return -EINVAL; + device->driver_data = &vmgenid_data; + + ret = vmgenid_acpi_map(device->driver_data, device->handle); + if (ret < 0) { + pr_err("vmgenid: failed to map acpi device\n"); + device->driver_data = NULL; + } + + return ret; +} + +static int vmgenid_acpi_remove(struct acpi_device *device) +{ + if (!device || acpi_driver_data(device) != &vmgenid_data) + return -EINVAL; + device->driver_data = NULL; + + if (vmgenid_data.uuid_iomap) + acpi_os_unmap_memory(vmgenid_data.uuid_iomap, sizeof(uuid_t)); + vmgenid_data.uuid_iomap = NULL; + + return 0; +} + +static void vmgenid_acpi_notify(struct acpi_device *device, u32 event) +{ + uuid_t old_uuid; + + if (!device || acpi_driver_data(device) != &vmgenid_data) { + pr_err("VMGENID notify with unexpected driver private data\n"); + return; + } + + /* update VM Generation UUID */ + old_uuid = vmgenid_data.uuid; + memcpy_fromio(&vmgenid_data.uuid, vmgenid_data.uuid_iomap, sizeof(uuid_t)); + + if (memcmp(&old_uuid, &vmgenid_data.uuid, sizeof(uuid_t))) { + /* HW uuid updated */ + sysgenid_bump_generation(); + add_device_randomness(&vmgenid_data.uuid, sizeof(uuid_t)); + } +} + +static const struct acpi_device_id vmgenid_ids[] = { + {"VMGENID", 0}, + {"QEMUVGID", 0}, + {"", 0}, +}; + +static struct acpi_driver acpi_vmgenid_driver = { + .name = "vm_generation_id", + .ids = vmgenid_ids, + .owner = THIS_MODULE, + .ops = { + .add = vmgenid_acpi_add, + .remove = vmgenid_acpi_remove, + .notify = vmgenid_acpi_notify, + } +}; + +static int __init vmgenid_init(void) +{ + return acpi_bus_register_driver(&acpi_vmgenid_driver); +} + +static void __exit vmgenid_exit(void) +{ + acpi_bus_unregister_driver(&acpi_vmgenid_driver); +} + +module_init(vmgenid_init); +module_exit(vmgenid_exit); + +MODULE_AUTHOR("Adrian Catangiu"); +MODULE_DESCRIPTION("Virtual Machine Generation ID"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1"); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 481611c09dae1..eea83444fa7d2 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -21,6 +21,7 @@ #include 
#include #include +#include /* * Balloon device works in 4K page units. So each page is pointed to by @@ -46,6 +47,13 @@ static struct vfsmount *balloon_mnt; #endif +static bool report_offline = false; +module_param(report_offline, bool, 0444); +MODULE_PARM_DESC(report_offline, + "Report offlined pages to the hypervisor"); + +static DEFINE_MUTEX(vb_page_report_lock); + enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, @@ -173,6 +181,15 @@ static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_i struct virtqueue *vq = vb->reporting_vq; unsigned int unused, err; + /* + * virtqueue callers must make sure that only one thread is + * using a queue. With offline page reporting enabled, multiple + * threads might be calling this function at the same time. + * + * So, make sure they don't get in each other's way. + */ + mutex_lock(&vb_page_report_lock); + /* We should always be able to add these buffers to an empty queue. */ err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); @@ -181,17 +198,55 @@ static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_i * are able to trigger an error we will simply display a warning * and exit without actually processing the pages. */ - if (WARN_ON_ONCE(err)) + if (WARN_ON_ONCE(err)) { + mutex_unlock(&vb_page_report_lock); return err; + } virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); + mutex_unlock(&vb_page_report_lock); + return 0; } +/* + * Callback for memory offline. Takes the offlined range and passes it + * to the normal free page reporting entry point. + * + * Assumptions that are currently all true: + * + * 1) We're in a safe context to sleep. + * 2) The offlined range is <= a memory section (128M on x86, 1G on arm64), + * and so the length will fit in a 32bit field. + */ +static int virtioballoon_free_page_report_offline( + struct page_reporting_dev_info *pr_dev_info, + unsigned long start_pfn, unsigned int nr_pages) +{ + struct scatterlist sgl; + unsigned int len = nr_pages << PAGE_SHIFT; + int err; + + /* + * Set the page to NULL to signal a "pre-mapped" address, + * e.g. the virtio ring code will not touch the page + * structure and will just use the dma_address passed in. + */ + sg_init_table(&sgl, 1); + sg_set_page(&sgl, NULL, len, 0); + sgl.dma_address = PFN_PHYS(start_pfn); + + err = virtballoon_free_page_report(pr_dev_info, &sgl, 1); + if (err) + pr_err("virtio_balloon: offline reporting failed (%d)\n", err); + + return err; +} + static void set_page_pfns(struct virtio_balloon *vb, __virtio32 pfns[], struct page *page) { @@ -984,6 +1039,8 @@ static int virtballoon_probe(struct virtio_device *vdev) } vb->pr_dev_info.report = virtballoon_free_page_report; + if (report_offline) + vb->pr_dev_info.report_offline = virtioballoon_free_page_report_offline; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { unsigned int capacity; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 3cc2a4ee7152c..03635431af428 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -327,7 +327,8 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, enum dma_data_direction direction) { if (!vq->use_dma_api) - return (dma_addr_t)sg_phys(sg); + return sg_page(sg) == NULL ? 
sg_dma_address(sg) : + (dma_addr_t)sg_phys(sg); /* * We can't use dma_map_sg, because we don't use scatterlists in diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index fba78daee449a..d2088fae608a7 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -65,6 +65,10 @@ #include #include +#ifdef CONFIG_ACPI +#include +#endif + #include "events_internal.h" #undef MODULE_PARAM_PREFIX @@ -493,6 +497,14 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu) info->cpu = cpu; } +static void xen_evtchn_mask_all(void) +{ + evtchn_port_t evtchn; + + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); +} + /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -2028,6 +2040,7 @@ void xen_irq_resume(void) struct irq_info *info; /* New event-channel space is not 'live' yet. */ + xen_evtchn_mask_all(); xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ @@ -2044,6 +2057,19 @@ void xen_irq_resume(void) restore_pirqs(); } +void xen_shutdown_pirqs(void) +{ + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ || !VALID_EVTCHN(info->evtchn)) + continue; + + shutdown_pirq(irq_get_irq_data(info->irq)); + irq_state_clr_started(irq_to_desc(info->irq)); + } +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", @@ -2161,7 +2187,6 @@ static int xen_evtchn_cpu_dead(unsigned int cpu) void __init xen_init_IRQ(void) { int ret = -EINVAL; - evtchn_port_t evtchn; if (xen_fifo_events) ret = xen_evtchn_fifo_init(); @@ -2181,8 +2206,7 @@ void __init xen_init_IRQ(void) BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index cd046684e0d1b..042fc68dc7a36 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,31 @@ enum shutdown_state { /* Ignore multiple shutdown requests. 
*/ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; +enum suspend_modes { + NO_SUSPEND = 0, + XEN_SUSPEND, + PM_SUSPEND, + PM_HIBERNATION, +}; + +/* Protected by pm_mutex */ +static enum suspend_modes suspend_mode = NO_SUSPEND; + +bool xen_suspend_mode_is_xen_suspend(void) +{ + return suspend_mode == XEN_SUSPEND; +} + +bool xen_suspend_mode_is_pm_suspend(void) +{ + return suspend_mode == PM_SUSPEND; +} + +bool xen_suspend_mode_is_pm_hibernation(void) +{ + return suspend_mode == PM_HIBERNATION; +} + struct suspend_info { int cancelled; }; @@ -99,6 +125,10 @@ static void do_suspend(void) int err; struct suspend_info si; + lock_system_sleep(); + + suspend_mode = XEN_SUSPEND; + shutting_down = SHUTDOWN_SUSPEND; err = freeze_processes(); @@ -162,6 +192,10 @@ static void do_suspend(void) thaw_processes(); out: shutting_down = SHUTDOWN_INVALID; + + suspend_mode = NO_SUSPEND; + + unlock_system_sleep(); } #endif /* CONFIG_HIBERNATE_CALLBACKS */ @@ -387,3 +421,42 @@ int xen_setup_shutdown_event(void) EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); subsys_initcall(xen_setup_shutdown_event); + +static int xen_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_SUSPEND_PREPARE: + suspend_mode = PM_SUSPEND; + break; + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + suspend_mode = PM_HIBERNATION; + break; + case PM_POST_SUSPEND: + case PM_POST_RESTORE: + case PM_POST_HIBERNATION: + /* Set back to the default */ + suspend_mode = NO_SUSPEND; + break; + default: + pr_warn("Receive unknown PM event 0x%lx\n", pm_event); + return -EINVAL; + } + + return 0; +}; + +static struct notifier_block xen_pm_notifier_block = { + .notifier_call = xen_pm_notifier +}; + +static int xen_setup_pm_notifier(void) +{ + if (!xen_hvm_domain()) + return -ENODEV; + + return register_pm_notifier(&xen_pm_notifier_block); +} + +subsys_initcall(xen_setup_pm_notifier); diff --git a/drivers/xen/time.c b/drivers/xen/time.c index 108edbcbc040f..87a1fd88989dc 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -23,6 +23,9 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); static DEFINE_PER_CPU(u64[4], old_runstate_time); +static DEFINE_PER_CPU(u64, xen_prev_steal_clock); +static DEFINE_PER_CPU(u64, xen_steal_clock_offset); + /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) { @@ -149,7 +152,7 @@ bool xen_vcpu_stolen(int vcpu) return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } -u64 xen_steal_clock(int cpu) +static u64 __xen_steal_clock(int cpu) { struct vcpu_runstate_info state; @@ -157,6 +160,30 @@ u64 xen_steal_clock(int cpu) return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; } +u64 xen_steal_clock(int cpu) +{ + return __xen_steal_clock(cpu) + per_cpu(xen_steal_clock_offset, cpu); +} + +void xen_save_steal_clock(int cpu) +{ + per_cpu(xen_prev_steal_clock, cpu) = xen_steal_clock(cpu); +} + +void xen_restore_steal_clock(int cpu) +{ + u64 steal_clock = __xen_steal_clock(cpu); + + if (per_cpu(xen_prev_steal_clock, cpu) > steal_clock) { + /* Need to update the offset */ + per_cpu(xen_steal_clock_offset, cpu) = + per_cpu(xen_prev_steal_clock, cpu) - steal_clock; + } else { + /* Avoid unnecessary steal clock warp */ + per_cpu(xen_steal_clock_offset, cpu) = 0; + } +} + void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 
98d870672dc5e..8e9198c904e37 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -600,26 +601,47 @@ int xenbus_dev_suspend(struct device *dev) struct xenbus_driver *drv; struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + int (*cb)(struct xenbus_device *) = NULL; + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); DPRINTK("%s", xdev->nodename); if (dev->driver == NULL) return 0; drv = to_xenbus_driver(dev->driver); - if (drv->suspend) - err = drv->suspend(xdev); - if (err) - dev_warn(dev, "suspend failed: %i\n", err); + + if (xen_suspend) + cb = drv->suspend; + else + cb = drv->freeze; + + if (cb) + err = cb(xdev); + + if (err) { + dev_warn(dev, "%s failed: %i\n", xen_suspend ? + "suspend" : "freeze", err); + return err; + } + + if (!xen_suspend) { + /* Forget otherend since this can become stale after restore */ + free_otherend_watch(xdev); + free_otherend_details(xdev); + } + return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); int xenbus_dev_resume(struct device *dev) { - int err; + int err = 0; struct xenbus_driver *drv; struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + int (*cb)(struct xenbus_device *) = NULL; + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); DPRINTK("%s", xdev->nodename); @@ -628,23 +650,32 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - dev_warn(dev, "resume (talk_to_otherend) failed: %i\n", err); + dev_warn(dev, "%s (talk_to_otherend) failed: %i\n", + xen_suspend ? "resume" : "restore", err); return err; } - xdev->state = XenbusStateInitialising; + if (xen_suspend) + xdev->state = XenbusStateInitialising; - if (drv->resume) { - err = drv->resume(xdev); - if (err) { - dev_warn(dev, "resume failed: %i\n", err); - return err; - } + if (xen_suspend) + cb = drv->resume; + else + cb = drv->restore; + + if (cb) + err = cb(xdev); + + if (err) { + dev_warn(dev, "%s failed: %i\n", + xen_suspend ? "resume" : "restore", err); + return err; } err = watch_otherend(xdev); if (err) { - dev_warn(dev, "resume (watch_otherend) failed: %d\n", err); + dev_warn(dev, "%s (watch_otherend) failed: %d.\n", + xen_suspend ? 
"resume" : "restore", err); return err; } @@ -654,8 +685,44 @@ EXPORT_SYMBOL_GPL(xenbus_dev_resume); int xenbus_dev_cancel(struct device *dev) { - /* Do nothing */ - DPRINTK("cancel"); + int err = 0; + struct xenbus_driver *drv; + struct xenbus_device *xdev + = container_of(dev, struct xenbus_device, dev); + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); + + if (xen_suspend) { + /* Do nothing */ + DPRINTK("cancel"); + return 0; + } + + DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + + err = talk_to_otherend(xdev); + if (err) { + dev_warn(dev, "thaw (talk_to_otherend) failed: %d.\n", err); + return err; + } + + if (drv->thaw) { + err = drv->thaw(xdev); + if (err) { + dev_warn(dev, "thaw failed: %i\n", err); + return err; + } + } + + err = watch_otherend(xdev); + if (err) { + dev_warn(dev, "thaw (watch_otherend) failed: %d.\n", err); + return err; + } + return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_cancel); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 416a1b753ff62..d3da1748f93a7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3596,7 +3596,8 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } - return generic_file_buffered_read(iocb, to, ret); +// return generic_file_buffered_read(iocb, to, ret); + return generic_file_read_iter(iocb, to); } const struct file_operations btrfs_file_operations = { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 51cebc1990eb1..bb4ef2fe3bd2f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1360,6 +1360,16 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, return err; } +static void ext4_add_overhead(struct super_block *sb, + const ext4_fsblk_t overhead) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sbi->s_overhead += overhead; + smp_wmb(); +} + /* * ext4_update_super() updates the super block so that the newly added * groups can be seen by the filesystem. @@ -1458,9 +1468,17 @@ static void ext4_update_super(struct super_block *sb, } /* - * Update the fs overhead information + * Update the fs overhead information. + * + * For bigalloc, if the superblock already has a properly calculated + * overhead, update it with a value based on numbers already computed + * above for the newly allocated capacity. 
*/ - ext4_calculate_overhead(sb); + if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0)) + ext4_add_overhead(sb, + EXT4_NUM_B2C(sbi, blocks_count - free_blocks)); + else + ext4_calculate_overhead(sb); es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); if (test_opt(sb, DEBUG)) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 818ff8b1b99da..2203ea6cf2684 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -781,8 +781,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES) - server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > NFS_MAX_FILE_IO_SIZE) + server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9f88ca7b20015..616add1720538 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -68,7 +68,7 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; @@ -78,7 +78,6 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; - ctx->cred = get_cred(cred); spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -96,7 +95,6 @@ static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_cont spin_lock(&dir->i_lock); list_del(&ctx->list); spin_unlock(&dir->i_lock); - put_cred(ctx->cred); kfree(ctx); } @@ -113,7 +111,7 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - ctx = alloc_nfs_open_dir_context(inode, current_cred()); + ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; @@ -133,43 +131,55 @@ nfs_closedir(struct inode *inode, struct file *filp) struct nfs_cache_array_entry { u64 cookie; u64 ino; - struct qstr string; + const char *name; + unsigned int name_len; unsigned char d_type; }; struct nfs_cache_array { - int size; - int eof_index; u64 last_cookie; + unsigned int size; + unsigned char page_full : 1, + page_is_eof : 1, + cookies_are_ordered : 1; struct nfs_cache_array_entry array[]; }; -typedef struct { +struct nfs_readdir_descriptor { struct file *file; struct page *page; struct dir_context *ctx; - unsigned long page_index; - u64 *dir_cookie; + pgoff_t page_index; + u64 dir_cookie; u64 last_cookie; + u64 dup_cookie; loff_t current_index; loff_t prev_index; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; + unsigned long attr_gencount; unsigned int cache_entry_index; + signed char duped; bool plus; bool eof; -} nfs_readdir_descriptor_t; +}; -static -void nfs_readdir_init_array(struct page *page) +static void nfs_readdir_array_init(struct nfs_cache_array *array) +{ + memset(array, 0, sizeof(struct nfs_cache_array)); +} + +static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) { struct nfs_cache_array *array; array = kmap_atomic(page); - memset(array, 0, sizeof(struct nfs_cache_array)); - array->eof_index = -1; + 
nfs_readdir_array_init(array); + array->last_cookie = last_cookie; + array->cookies_are_ordered = 1; kunmap_atomic(array); } @@ -184,61 +194,177 @@ void nfs_readdir_clear_array(struct page *page) array = kmap_atomic(page); for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); - array->size = 0; + kfree(array->array[i].name); + nfs_readdir_array_init(array); kunmap_atomic(array); } +static struct page * +nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) +{ + struct page *page = alloc_page(gfp_flags); + if (page) + nfs_readdir_page_init_array(page, last_cookie); + return page; +} + +static void nfs_readdir_page_array_free(struct page *page) +{ + if (page) { + nfs_readdir_clear_array(page); + put_page(page); + } +} + +static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) +{ + array->page_is_eof = 1; + array->page_full = 1; +} + +static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) +{ + return array->page_full; +} + /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in * nfs_clear_readdir_array() */ -static -int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +static const char *nfs_readdir_copy_name(const char *name, unsigned int len) { - string->len = len; - string->name = kmemdup_nul(name, len, GFP_KERNEL); - if (string->name == NULL) - return -ENOMEM; + const char *ret = kmemdup_nul(name, len, GFP_KERNEL); + /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. */ - kmemleak_not_leak(string->name); - string->hash = full_name_hash(NULL, name, len); + if (ret != NULL) + kmemleak_not_leak(ret); + return ret; +} + +/* + * Check that the next array entry lies entirely within the page bounds + */ +static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) +{ + struct nfs_cache_array_entry *cache_entry; + + if (array->page_full) + return -ENOSPC; + cache_entry = &array->array[array->size + 1]; + if ((char *)cache_entry - (char *)array > PAGE_SIZE) { + array->page_full = 1; + return -ENOSPC; + } return 0; } static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { - struct nfs_cache_array *array = kmap(page); + struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; + const char *name; int ret; - cache_entry = &array->array[array->size]; + name = nfs_readdir_copy_name(entry->name, entry->len); + if (!name) + return -ENOMEM; - /* Check that this entry lies within the page bounds */ - ret = -ENOSPC; - if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + array = kmap_atomic(page); + ret = nfs_readdir_array_can_expand(array); + if (ret) { + kfree(name); goto out; + } + cache_entry = &array->array[array->size]; cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; - ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); - if (ret) - goto out; + cache_entry->name_len = entry->len; + cache_entry->name = name; array->last_cookie = entry->cookie; + if (array->last_cookie <= cache_entry->cookie) + array->cookies_are_ordered = 0; array->size++; if (entry->eof != 0) - array->eof_index = array->size; + nfs_readdir_array_set_eof(array); out: - kunmap(page); + kunmap_atomic(array); return ret; } +static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, + pgoff_t index, u64 last_cookie) +{ + struct page *page; + + 
page = grab_cache_page(mapping, index); + if (page && !PageUptodate(page)) { + nfs_readdir_page_init_array(page, last_cookie); + if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0) + nfs_zap_mapping(mapping->host, mapping); + SetPageUptodate(page); + } + + return page; +} + +static u64 nfs_readdir_page_last_cookie(struct page *page) +{ + struct nfs_cache_array *array; + u64 ret; + + array = kmap_atomic(page); + ret = array->last_cookie; + kunmap_atomic(array); + return ret; +} + +static bool nfs_readdir_page_needs_filling(struct page *page) +{ + struct nfs_cache_array *array; + bool ret; + + array = kmap_atomic(page); + ret = !nfs_readdir_array_is_full(array); + kunmap_atomic(array); + return ret; +} + +static void nfs_readdir_page_set_eof(struct page *page) +{ + struct nfs_cache_array *array; + + array = kmap_atomic(page); + nfs_readdir_array_set_eof(array); + kunmap_atomic(array); +} + +static void nfs_readdir_page_unlock_and_put(struct page *page) +{ + unlock_page(page); + put_page(page); +} + +static struct page *nfs_readdir_page_get_next(struct address_space *mapping, + pgoff_t index, u64 cookie) +{ + struct page *page; + + page = nfs_readdir_page_get_locked(mapping, index, cookie); + if (page) { + if (nfs_readdir_page_last_cookie(page) == cookie) + return page; + nfs_readdir_page_unlock_and_put(page); + } + return NULL; +} + static inline int is_32bit_api(void) { @@ -258,8 +384,8 @@ bool nfs_readdir_use_cookie(const struct file *filp) return true; } -static -int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; @@ -267,13 +393,13 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index >= 0) + if (array->page_is_eof) goto out_eof; return -EAGAIN; } index = (unsigned int)diff; - *desc->dir_cookie = array->array[index].cookie; + desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; return 0; out_eof: @@ -290,41 +416,55 @@ nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); } -static -int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, + u64 cookie) +{ + if (!array->cookies_are_ordered) + return true; + /* Optimisation for monotonically increasing cookies */ + if (cookie >= array->last_cookie) + return false; + if (array->size && cookie < array->array[0].cookie) + return false; + return true; +} + +static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { int i; loff_t new_pos; int status = -EAGAIN; + if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) + goto check_eof; + for (i = 0; i < array->size; i++) { - if (array->array[i].cookie == *desc->dir_cookie) { + if (array->array[i].cookie == desc->dir_cookie) { struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); - struct nfs_open_dir_context *ctx = desc->file->private_data; new_pos = desc->current_index + i; - if (ctx->attr_gencount != nfsi->attr_gencount || + if (desc->attr_gencount != nfsi->attr_gencount || !nfs_readdir_inode_mapping_valid(nfsi)) { - ctx->duped = 0; - ctx->attr_gencount = nfsi->attr_gencount; + desc->duped = 0; + 
desc->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->prev_index) { - if (ctx->duped > 0 - && ctx->dup_cookie == *desc->dir_cookie) { + if (desc->duped > 0 + && desc->dup_cookie == desc->dir_cookie) { if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. " - "The file: %.*s has duplicate cookie %llu\n", - desc->file, array->array[i].string.len, - array->array[i].string.name, *desc->dir_cookie); + "The file: %s has duplicate cookie %llu\n", + desc->file, array->array[i].name, desc->dir_cookie); } status = -ELOOP; goto out; } - ctx->dup_cookie = *desc->dir_cookie; - ctx->duped = -1; + desc->dup_cookie = desc->dir_cookie; + desc->duped = -1; } if (nfs_readdir_use_cookie(desc->file)) - desc->ctx->pos = *desc->dir_cookie; + desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos = new_pos; desc->prev_index = new_pos; @@ -332,24 +472,24 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des return 0; } } - if (array->eof_index >= 0) { +check_eof: + if (array->page_is_eof) { status = -EBADCOOKIE; - if (*desc->dir_cookie == array->last_cookie) + if (desc->dir_cookie == array->last_cookie) desc->eof = true; } out: return status; } -static -int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) { struct nfs_cache_array *array; int status; - array = kmap(desc->page); + array = kmap_atomic(desc->page); - if (*desc->dir_cookie == 0) + if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); @@ -359,17 +499,29 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) desc->current_index += array->size; desc->page_index++; } - kunmap(desc->page); + kunmap_atomic(array); return status; } /* Fill a page with xdr information before transferring to the cache page */ -static -int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, - struct nfs_entry *entry, struct file *file, struct inode *inode) +static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, + __be32 *verf, u64 cookie, + struct page **pages, size_t bufsize, + __be32 *verf_res) { - struct nfs_open_dir_context *ctx = file->private_data; - const struct cred *cred = ctx->cred; + struct inode *inode = file_inode(desc->file); + struct nfs_readdir_arg arg = { + .dentry = file_dentry(desc->file), + .cred = desc->file->f_cred, + .verf = verf, + .cookie = cookie, + .pages = pages, + .page_len = bufsize, + .plus = desc->plus, + }; + struct nfs_readdir_res res = { + .verf = verf_res, + }; unsigned long timestamp, gencount; int error; @@ -377,14 +529,13 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, - NFS_SERVER(inode)->dtsize, desc->plus); + error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); - desc->plus = false; + desc->plus = arg.plus = false; goto again; } goto error; @@ -395,7 +546,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, return error; } -static int 
xdr_decode(nfs_readdir_descriptor_t *desc, +static int xdr_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { struct inode *inode = file_inode(desc->file); @@ -557,24 +708,23 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, } /* Perform conversion from xdr to cache array */ -static -int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - struct page **xdr_pages, struct page *page, unsigned int buflen) -{ +static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct page **xdr_pages, + unsigned int buflen, + struct page **arrays, + size_t narrays) +{ + struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; - struct nfs_cache_array *array; - unsigned int count = 0; + struct page *scratch, *new, *page = *arrays; int status; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) return -ENOMEM; - if (buflen == 0) - goto out_nopages; - xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); @@ -583,209 +733,245 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en entry->label->len = NFS4_MAXLABELLEN; status = xdr_decode(desc, entry, &stream); - if (status != 0) { - if (status == -EAGAIN) - status = 0; + if (status != 0) break; - } - - count++; if (desc->plus) nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); status = nfs_readdir_add_to_array(entry, page); - if (status != 0) - break; - } while (!entry->eof); + if (status != -ENOSPC) + continue; + + if (page->mapping != mapping) { + if (!--narrays) + break; + new = nfs_readdir_page_array_alloc(entry->prev_cookie, + GFP_KERNEL); + if (!new) + break; + arrays++; + *arrays = page = new; + } else { + new = nfs_readdir_page_get_next(mapping, + page->index + 1, + entry->prev_cookie); + if (!new) + break; + if (page != *arrays) + nfs_readdir_page_unlock_and_put(page); + page = new; + } + status = nfs_readdir_add_to_array(entry, page); + } while (!status && !entry->eof); -out_nopages: - if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap(page); - array->eof_index = array->size; + switch (status) { + case -EBADCOOKIE: + if (entry->eof) { + nfs_readdir_page_set_eof(page); + status = 0; + } + break; + case -ENOSPC: + case -EAGAIN: status = 0; - kunmap(page); + break; } + if (page != *arrays) + nfs_readdir_page_unlock_and_put(page); + put_page(scratch); return status; } -static -void nfs_readdir_free_pages(struct page **pages, unsigned int npages) +static void nfs_readdir_free_pages(struct page **pages, size_t npages) { - unsigned int i; - for (i = 0; i < npages; i++) - put_page(pages[i]); + while (npages--) + put_page(pages[npages]); + kfree(pages); } /* * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call * to nfs_readdir_free_pages() */ -static -int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) +static struct page **nfs_readdir_alloc_pages(size_t npages) { - unsigned int i; + struct page **pages; + size_t i; + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; for (i = 0; i < npages; i++) { struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) goto out_freepages; pages[i] = page; } - return 0; + return pages; out_freepages: nfs_readdir_free_pages(pages, i); - return -ENOMEM; + return NULL; } -static -int 
nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, + __be32 *verf_arg, __be32 *verf_res, + struct page **arrays, size_t narrays) { - struct page *pages[NFS_MAX_READDIR_PAGES]; - struct nfs_entry entry; - struct file *file = desc->file; - struct nfs_cache_array *array; + struct page **pages; + struct page *page = *arrays; + struct nfs_entry *entry; + size_t array_size; + struct inode *inode = file_inode(desc->file); + size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; - unsigned int array_size = ARRAY_SIZE(pages); - - nfs_readdir_init_array(page); - entry.prev_cookie = 0; - entry.cookie = desc->last_cookie; - entry.eof = 0; - entry.fh = nfs_alloc_fhandle(); - entry.fattr = nfs_alloc_fattr(); - entry.server = NFS_SERVER(inode); - if (entry.fh == NULL || entry.fattr == NULL) + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->cookie = nfs_readdir_page_last_cookie(page); + entry->fh = nfs_alloc_fhandle(); + entry->fattr = nfs_alloc_fattr(); + entry->server = NFS_SERVER(inode); + if (entry->fh == NULL || entry->fattr == NULL) goto out; - entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(entry.label)) { - status = PTR_ERR(entry.label); + entry->label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(entry->label)) { + status = PTR_ERR(entry->label); goto out; } - array = kmap(page); + array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = nfs_readdir_alloc_pages(array_size); + if (!pages) + goto out_release_label; - status = nfs_readdir_alloc_pages(pages, array_size); - if (status < 0) - goto out_release_array; do { unsigned int pglen; - status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); - + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, + pages, dtsize, + verf_res); if (status < 0) break; + pglen = status; - status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); - if (status < 0) { - if (status == -ENOSPC) - status = 0; + if (pglen == 0) { + nfs_readdir_page_set_eof(page); break; } - } while (array->eof_index < 0); + + verf_arg = verf_res; + + status = nfs_readdir_page_filler(desc, entry, pages, pglen, + arrays, narrays); + } while (!status && nfs_readdir_page_needs_filling(page)); nfs_readdir_free_pages(pages, array_size); -out_release_array: - kunmap(page); - nfs4_label_free(entry.label); +out_release_label: + nfs4_label_free(entry->label); out: - nfs_free_fattr(entry.fattr); - nfs_free_fhandle(entry.fh); + nfs_free_fattr(entry->fattr); + nfs_free_fhandle(entry->fh); + kfree(entry); return status; } -/* - * Now we cache directories properly, by converting xdr information - * to an array that can be used for lookups later. This results in - * fewer cache pages, since we can store more information on each page. 
- * We only need to convert from xdr once so future lookups are much simpler - */ -static -int nfs_readdir_filler(void *data, struct page* page) +static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc) { - nfs_readdir_descriptor_t *desc = data; - struct inode *inode = file_inode(desc->file); - int ret; - - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; - SetPageUptodate(page); - - if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { - /* Should never happen */ - nfs_zap_mapping(inode, inode->i_mapping); - } - unlock_page(page); - return 0; - error: - nfs_readdir_clear_array(page); - unlock_page(page); - return ret; + put_page(desc->page); + desc->page = NULL; } -static -void cache_page_release(nfs_readdir_descriptor_t *desc) +static void +nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { - put_page(desc->page); - desc->page = NULL; + unlock_page(desc->page); + nfs_readdir_page_put(desc); } -static -struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +static struct page * +nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) { - return read_cache_page(desc->file->f_mapping, desc->page_index, - nfs_readdir_filler, desc); + return nfs_readdir_page_get_locked(desc->file->f_mapping, + desc->page_index, + desc->last_cookie); } /* * Returns 0 if desc->dir_cookie was found on page desc->page_index * and locks the page to prevent removal from the page cache. */ -static -int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc) +static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; - desc->page = get_cache_page(desc); - if (IS_ERR(desc->page)) - return PTR_ERR(desc->page); - res = lock_page_killable(desc->page); - if (res != 0) - goto error; - res = -EAGAIN; - if (desc->page->mapping != NULL) { - res = nfs_readdir_search_array(desc); - if (res == 0) { - nfsi->page_index = desc->page_index; - return 0; + desc->page = nfs_readdir_page_get_cached(desc); + if (!desc->page) + return -ENOMEM; + if (nfs_readdir_page_needs_filling(desc->page)) { + res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, + &desc->page, 1); + if (res < 0) { + nfs_readdir_page_unlock_and_put_cached(desc); + if (res == -EBADCOOKIE || res == -ENOTSYNC) { + invalidate_inode_pages2(desc->file->f_mapping); + desc->page_index = 0; + return -EAGAIN; + } + return res; } + /* + * Set the cookie verifier if the page cache was empty + */ + if (desc->page_index == 0) + memcpy(nfsi->cookieverf, verf, + sizeof(nfsi->cookieverf)); } - unlock_page(desc->page); -error: - cache_page_release(desc); + res = nfs_readdir_search_array(desc); + if (res == 0) { + nfsi->page_index = desc->page_index; + return 0; + } + nfs_readdir_page_unlock_and_put_cached(desc); return res; } +static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc) +{ + struct address_space *mapping = desc->file->f_mapping; + struct inode *dir = file_inode(desc->file); + unsigned int dtsize = NFS_SERVER(dir)->dtsize; + loff_t size = i_size_read(dir); + + /* + * Default to uncached readdir if the page cache is empty, and + * we're looking for a non-zero cookie in a large directory. 
+ */ + return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize; +} + /* Search for desc->dir_cookie from the beginning of the page cache */ -static inline -int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; - if (desc->page_index == 0) { - desc->current_index = 0; - desc->prev_index = 0; - desc->last_cookie = 0; - } + if (nfs_readdir_dont_search_cache(desc)) + return -EBADCOOKIE; + do { + if (desc->page_index == 0) { + desc->current_index = 0; + desc->prev_index = 0; + desc->last_cookie = 0; + } res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; @@ -794,43 +980,41 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) /* * Once we've found the start of the dirent within a page: fill 'er up... */ -static -int nfs_do_filldir(nfs_readdir_descriptor_t *desc) +static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, + const __be32 *verf) { struct file *file = desc->file; - int i = 0; - int res = 0; - struct nfs_cache_array *array = NULL; - struct nfs_open_dir_context *ctx = file->private_data; + struct nfs_cache_array *array; + unsigned int i = 0; array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; ent = &array->array[i]; - if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { desc->eof = true; break; } + memcpy(desc->verf, verf, sizeof(desc->verf)); if (i < (array->size-1)) - *desc->dir_cookie = array->array[i+1].cookie; + desc->dir_cookie = array->array[i+1].cookie; else - *desc->dir_cookie = array->last_cookie; + desc->dir_cookie = array->last_cookie; if (nfs_readdir_use_cookie(file)) - desc->ctx->pos = *desc->dir_cookie; + desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (ctx->duped != 0) - ctx->duped = 1; + if (desc->duped != 0) + desc->duped = 1; } - if (array->eof_index >= 0) + if (array->page_is_eof) desc->eof = true; kunmap(desc->page); - dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", - (unsigned long long)*desc->dir_cookie, res); - return res; + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", + (unsigned long long)desc->dir_cookie); } /* @@ -845,40 +1029,41 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) * we should already have a complete representation of the * directory in the page cache by the time we get here. 
*/ -static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc) +static int uncached_readdir(struct nfs_readdir_descriptor *desc) { - struct page *page = NULL; - int status; - struct inode *inode = file_inode(desc->file); - struct nfs_open_dir_context *ctx = desc->file->private_data; + struct page **arrays; + size_t i, sz = 512; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; + int status = -ENOMEM; - dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", - (unsigned long long)*desc->dir_cookie); + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n", + (unsigned long long)desc->dir_cookie); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - status = -ENOMEM; + arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); + if (!arrays) + goto out; + arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL); + if (!arrays[0]) goto out; - } desc->page_index = 0; - desc->last_cookie = *desc->dir_cookie; - desc->page = page; - ctx->duped = 0; - - status = nfs_readdir_xdr_to_array(desc, page, inode); - if (status < 0) - goto out_release; - - status = nfs_do_filldir(desc); - - out_release: - nfs_readdir_clear_array(desc->page); - cache_page_release(desc); - out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", - __func__, status); + desc->last_cookie = desc->dir_cookie; + desc->duped = 0; + + status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); + + for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { + desc->page = arrays[i]; + nfs_do_filldir(desc, verf); + } + desc->page = NULL; + + + for (i = 0; i < sz && arrays[i]; i++) + nfs_readdir_page_array_free(arrays[i]); +out: + kfree(arrays); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } @@ -890,15 +1075,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; - nfs_readdir_descriptor_t my_desc = { - .file = file, - .ctx = ctx, - .dir_cookie = &dir_ctx->dir_cookie, - .plus = nfs_use_readdirplus(inode, ctx), - }, - *desc = &my_desc; - int res = 0; + struct nfs_readdir_descriptor *desc; + int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -910,10 +1090,27 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. 
*/ - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { res = nfs_revalidate_mapping(inode, file->f_mapping); - if (res < 0) + if (res < 0) + goto out; + } + + res = -ENOMEM; + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) goto out; + desc->file = file; + desc->ctx = ctx; + desc->plus = nfs_use_readdirplus(inode, ctx); + + spin_lock(&file->f_lock); + desc->dir_cookie = dir_ctx->dir_cookie; + desc->dup_cookie = dir_ctx->dup_cookie; + desc->duped = dir_ctx->duped; + desc->attr_gencount = dir_ctx->attr_gencount; + memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); + spin_unlock(&file->f_lock); do { res = readdir_search_pagecache(desc); @@ -921,16 +1118,18 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ - if (*desc->dir_cookie && !desc->eof) { + if (desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) continue; + if (res == -EBADCOOKIE || res == -ENOTSYNC) + res = 0; } break; } if (res == -ETOOSMALL && desc->plus) { - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); nfs_zap_caches(inode); desc->page_index = 0; desc->plus = false; @@ -940,15 +1139,21 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; - res = nfs_do_filldir(desc); - unlock_page(desc->page); - cache_page_release(desc); - if (res < 0) - break; + nfs_do_filldir(desc, nfsi->cookieverf); + nfs_readdir_page_unlock_and_put_cached(desc); } while (!desc->eof); + + spin_lock(&file->f_lock); + dir_ctx->dir_cookie = desc->dir_cookie; + dir_ctx->dup_cookie = desc->dup_cookie; + dir_ctx->duped = desc->duped; + dir_ctx->attr_gencount = desc->attr_gencount; + memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); + spin_unlock(&file->f_lock); + + kfree(desc); + out: - if (res > 0) - res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; } @@ -984,6 +1189,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->dir_cookie = offset; else dir_ctx->dir_cookie = 0; + if (offset == 0) + memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; } spin_unlock(&filp->f_lock); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 36f415278c042..16745122ba2c1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -244,7 +244,6 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA @@ -1252,7 +1251,6 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); int ret; if (mapping->nrpages != 0) { @@ -1265,11 +1263,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - if (S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); - } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a7e0970b5bfe1..597adbfe15476 100644 --- 
a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -66,12 +66,6 @@ static inline fmode_t flags_to_mode(int flags) #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) -/* - * Maximum number of pages that readdir can use for creating - * a vmapped array of pages. - */ -#define NFS_MAX_READDIR_PAGES 8 - struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index e1491def7124f..b915fe3abf355 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -637,37 +637,36 @@ nfs3_proc_rmdir(struct inode *dir, const struct qstr *name) * Also note that this implementation handles both plain readdir and * readdirplus. */ -static int -nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); - __be32 *verf = NFS_I(dir)->cookieverf; + struct inode *dir = d_inode(nr_arg->dentry); struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .verf = {verf[0], verf[1]}, - .plus = plus, - .count = count, - .pages = pages + .cookie = nr_arg->cookie, + .plus = nr_arg->plus, + .count = nr_arg->page_len, + .pages = nr_arg->pages }; struct nfs3_readdirres res = { - .verf = verf, - .plus = plus + .verf = nr_res->verf, + .plus = nr_arg->plus, }; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status = -ENOMEM; - if (plus) + if (nr_arg->plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + if (arg.cookie) + memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf)); - dprintk("NFS call readdir%s %d\n", - plus? "plus" : "", (unsigned int) cookie); + dprintk("NFS call readdir%s %llu\n", nr_arg->plus ? "plus" : "", + (unsigned long long)nr_arg->cookie); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) @@ -680,8 +679,8 @@ nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir%s: %d\n", - plus? "plus" : "", status); + dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? 
"plus" : "", + status); return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b9567cc8698ed..bb2ecba49937b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -190,6 +190,8 @@ static int nfs4_map_errors(int err) return -EPROTONOSUPPORT; case -NFS4ERR_FILE_OPEN: return -EBUSY; + case -NFS4ERR_NOT_SAME: + return -ENOTSYNC; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -4985,41 +4987,40 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_server *server = NFS_SERVER(dir); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = pages, + .pages = nr_arg->pages, .pgbase = 0, - .count = count, - .plus = plus, + .count = nr_arg->page_len, + .plus = nr_arg->plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, - dentry, - (unsigned long long)cookie); + dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__, + nr_arg->dentry, (unsigned long long)nr_arg->cookie); if (!(server->caps & NFS_CAP_SECURITY_LABEL)) args.bitmask = server->attr_bitmask_nl; else args.bitmask = server->attr_bitmask; - nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { - memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } @@ -5029,19 +5030,18 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, return status; } -static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs4_proc_readdir(struct nfs_readdir_arg *arg, + struct nfs_readdir_res *res) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_readdir(dentry, cred, cookie, - pages, count, plus); - trace_nfs4_readdir(d_inode(dentry), err); - err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, - &exception); + err = _nfs4_proc_readdir(arg, res); + trace_nfs4_readdir(d_inode(arg->dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)), + err, &exception); } while (exception.retry); return err; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 15c865cc837fa..73ab7c59d3a76 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -499,26 +499,26 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name) * sure it is syntactically correct; the entries itself are decoded * from nfs_readdir by calling the decode_entry function directly. 
*/ -static int -nfs_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .count = count, - .pages = pages, + .cookie = nr_arg->cookie, + .count = nr_arg->page_len, + .pages = nr_arg->pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], .rpc_argp = &arg, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("NFS call readdir %d\n", (unsigned int)cookie); + dprintk("NFS call readdir %llu\n", (unsigned long long)nr_arg->cookie); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nr_res->verf[0] = nr_res->verf[1] = 0; nfs_invalidate_atime(dir); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 44103f9487c9a..de322620f5431 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -593,10 +593,7 @@ */ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ - *(.text.hot .text.hot.*) \ - *(TEXT_MAIN .text.fixup) \ - *(.text.unlikely .text.unlikely.*) \ - *(.text.unknown .text.unknown.*) \ + *(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \ NOINSTR_TEXT \ *(.text..refcount) \ *(.ref.text) \ diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index ff38737475ecb..e25795bdde862 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -55,6 +55,8 @@ #define ARM_SMCCC_OWNER_TRUSTED_OS 50 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63 +#define ARM_SMCCC_FUNC_QUERY_CALL_UID 0xff01 + #define ARM_SMCCC_QUIRK_NONE 0 #define ARM_SMCCC_QUIRK_QCOM_A6 1 /* Save/restore register a6 */ @@ -92,8 +94,47 @@ ARM_SMCCC_SMC_32, \ 0, 0x3fff) +#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_FUNC_QUERY_CALL_UID) + +/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */ +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU + +/* KVM "vendor specific" services */ +#define ARM_SMCCC_KVM_FUNC_FEATURES 0 +#define ARM_SMCCC_KVM_FUNC_PTP 1 +#define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 +#define ARM_SMCCC_KVM_NUM_FUNCS 128 + +#define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_FEATURES) + #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 +/* + * ptp_kvm is a feature used for time sync between vm and host. + * ptp_kvm module in guest kernel will get service from host using + * this hypercall ID. 
+ */ +#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_PTP) + +/* ptp_kvm counter type ID */ +#define KVM_PTP_VIRT_COUNTER 0 +#define KVM_PTP_PHYS_COUNTER 1 + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b010d45a1ecd5..98a59a5110a8c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -254,6 +254,39 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, extern const struct bpf_map_ops bpf_map_offload_ops; +/* bpf_type_flag contains a set of flags that are applicable to the values of + * arg_type, ret_type and reg_type. For example, a pointer value may be null, + * or a memory is read-only. We classify types into two categories: base types + * and extended types. Extended types are base types combined with a type flag. + * + * Currently there are no more than 32 base types in arg_type, ret_type and + * reg_types. + */ +#define BPF_BASE_TYPE_BITS 8 + +enum bpf_type_flag { + /* PTR may be NULL. */ + PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), + + /* MEM is read-only. When applied on bpf_arg, it indicates the arg is + * compatible with both mutable and immutable memory. + */ + MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), + + /* MEM was "allocated" from a different helper, and cannot be mixed + * with regular non-MEM_ALLOC'ed MEM types. + */ + MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_ALLOC, +}; + +/* Max number of base types. */ +#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) + +/* Max number of all types. */ +#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -265,13 +298,11 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. 
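The bpf.h hunk above packs each base type into the low BPF_BASE_TYPE_BITS bits and modifier flags (PTR_MAYBE_NULL, MEM_RDONLY, MEM_ALLOC) into the bits above them, so the *_OR_NULL enumerators in the following hunks become flag-plus-base compositions instead of separate values. The sketch below is purely illustrative and not part of the patch: it mirrors the constants above in standalone C (the value 8 for ARG_PTR_TO_MAP_VALUE is an arbitrary stand-in, and base_type()/type_flag() mirror the helpers this series adds to include/linux/bpf_verifier.h further down).

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Mirror of the encoding introduced above: 8 bits of base type, flags above. */
	#define BPF_BASE_TYPE_BITS	8
	#define BPF_BASE_TYPE_MASK	((1u << BPF_BASE_TYPE_BITS) - 1)

	enum { ARG_PTR_TO_MAP_VALUE = 8 };		/* example base-type value only */
	#define PTR_MAYBE_NULL	(1u << (0 + BPF_BASE_TYPE_BITS))
	#define MEM_RDONLY	(1u << (1 + BPF_BASE_TYPE_BITS))

	static uint32_t base_type(uint32_t type) { return type & BPF_BASE_TYPE_MASK; }
	static uint32_t type_flag(uint32_t type) { return type & ~BPF_BASE_TYPE_MASK; }

	int main(void)
	{
		/* ARG_PTR_TO_MAP_VALUE_OR_NULL is defined below as PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE */
		uint32_t t = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE;

		assert(base_type(t) == ARG_PTR_TO_MAP_VALUE);	/* low 8 bits: the base type */
		assert(type_flag(t) & PTR_MAYBE_NULL);		/* high bits: the modifiers */
		assert(!(type_flag(t) & MEM_RDONLY));

		printf("base=%u flags=0x%x\n", base_type(t), type_flag(t));
		return 0;
	}

The static_assert()s added in the diff enforce the other side of this layout: every base enum must stay below BPF_BASE_TYPE_LIMIT so it never collides with the flag bits.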
@@ -281,37 +312,60 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ - ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ - ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ - ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, + + /* Extended arg_types. */ + ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, + ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, + ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, + ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, + ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ - RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ - RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ - RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ - RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ - RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ - RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ - RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ + RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ + RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ + RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ + __BPF_RET_TYPE_MAX, + + /* Extended ret_types. */ + RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, + RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, + RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, + RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. 
+ */ + __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL @@ -373,18 +427,15 @@ enum bpf_reg_type { PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ - PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ - PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ - PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need @@ -402,15 +453,24 @@ enum bpf_reg_type { * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ - PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ - PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ - PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ - PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ + PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + __BPF_REG_TYPE_MAX, + + /* Extended reg_types. */ + PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, + PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, + PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, + PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, + PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. 
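With arg_type, ret_type and reg_type all sharing this encoding, verifier-side code no longer has to enumerate every *_OR_NULL variant; a single mask test with the base_type()/type_flag() helpers (added to include/linux/bpf_verifier.h later in this series) covers them all. The snippet below is a hedged sketch of that call-site pattern, not code from the patch; reg_may_be_null() and mark_reg_not_null() are illustrative names.

	/* Illustrative only -- assumes <linux/bpf.h> and <linux/bpf_verifier.h>
	 * as modified by this series.
	 */
	#include <linux/bpf_verifier.h>

	static bool reg_may_be_null(const struct bpf_reg_state *reg)
	{
		/* One flag test replaces comparisons against PTR_TO_SOCKET_OR_NULL,
		 * PTR_TO_BTF_ID_OR_NULL and the other *_OR_NULL reg types.
		 */
		return type_flag(reg->type) & PTR_MAYBE_NULL;
	}

	static void mark_reg_not_null(struct bpf_reg_state *reg)
	{
		/* Clearing the flag leaves the plain base type,
		 * e.g. PTR_TO_SOCKET_OR_NULL -> PTR_TO_SOCKET.
		 */
		reg->type &= ~PTR_MAYBE_NULL;
	}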
@@ -1899,6 +1959,7 @@ extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -1950,6 +2011,7 @@ struct sk_reuseport_kern { struct sk_buff *skb; struct sock *sk; struct sock *selected_sk; + struct sock *migrating_sk; void *data_end; u32 hash; u32 reuseport_id; diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index aaacb6aafc87e..73226181b7448 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -7,6 +7,7 @@ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H +#include #include #include @@ -35,9 +36,21 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + if (unlikely(!task->security)) + return NULL; + + return task->security + bpf_lsm_blob_sizes.lbs_task; +} + extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -53,10 +66,20 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + return NULL; +} + static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a8137bb6dd3c2..e256d6ef4765b 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,6 +109,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4d37c69e76b17..b0343efecce5f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -17,6 +17,8 @@ * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) +/* size of type_str_buf in bpf_verifier. */ +#define TYPE_STR_BUF_LEN 64 /* Liveness marks, used for registers and spilled-regs (in stack slots). 
* Read marks propagate upwards until they find a write mark; they record that @@ -462,6 +464,8 @@ struct bpf_verifier_env { u32 peak_states; /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; + /* buffer used in reg_type_str() to generate reg_type string */ + char type_str_buf[TYPE_STR_BUF_LEN]; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, @@ -493,8 +497,8 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, @@ -509,4 +513,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, u32 btf_id, struct bpf_attach_target_info *tgt_info); +#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) + +/* extract base type from bpf_{arg, return, reg}_type. */ +static inline u32 base_type(u32 type) +{ + return type & BPF_BASE_TYPE_MASK; +} + +/* extract flags from an extended type. See bpf_type_flag in bpf.h. */ +static inline u32 type_flag(u32 type) +{ + return type & ~BPF_BASE_TYPE_MASK; +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 8f87c1a6f3231..65783d0db2d59 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -64,6 +65,10 @@ struct module; * 400-499: Perfect * The ideal clocksource. A must-use where * available. + * @id: Defaults to CSID_GENERIC. The id value is captured + * in certain snapshot functions to allow callers to + * validate the clocksource from which the snapshot was + * taken. * @flags: Flags describing special properties * @enable: Optional function to enable the clocksource * @disable: Optional function to disable the clocksource @@ -103,6 +108,7 @@ struct clocksource { const char *name; struct list_head list; int rating; + enum clocksource_ids id; enum vdso_clock_mode vdso_clock_mode; unsigned long flags; diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h new file mode 100644 index 0000000000000..16775d7d8f8d6 --- /dev/null +++ b/include/linux/clocksource_ids.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CLOCKSOURCE_IDS_H +#define _LINUX_CLOCKSOURCE_IDS_H + +/* Enum to give clocksources a unique identifier */ +enum clocksource_ids { + CSID_GENERIC = 0, + CSID_ARM_ARCH_COUNTER, + CSID_MAX, +}; + +#endif diff --git a/include/linux/damon.h b/include/linux/damon.h new file mode 100644 index 0000000000000..b5d5e9d6d8358 --- /dev/null +++ b/include/linux/damon.h @@ -0,0 +1,638 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON api + * + * Author: SeongJae Park + */ + +#ifndef _DAMON_H_ +#define _DAMON_H_ + +#include +#include +#include +#include +#include + +/* Minimal region size. Every damon_region is aligned by this. 
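With the *_OR_NULL variants now expressed as a base type ORed with PTR_MAYBE_NULL, callers can mask the two parts apart using the base_type()/type_flag() helpers added to bpf_verifier.h above, instead of enumerating every *_OR_NULL value. A minimal illustrative sketch follows; the helper names below are not part of this patch:

static bool reg_type_may_be_null(u32 reg_type)
{
	/* e.g. true for PTR_TO_SOCKET_OR_NULL == PTR_MAYBE_NULL | PTR_TO_SOCKET */
	return type_flag(reg_type) & PTR_MAYBE_NULL;
}

static bool reg_type_is_socket(u32 reg_type)
{
	/* base_type() strips the flag bits, so both variants match here. */
	return base_type(reg_type) == PTR_TO_SOCKET;
}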
*/ +#define DAMON_MIN_REGION PAGE_SIZE +/* Max priority score for DAMON-based operation schemes */ +#define DAMOS_MAX_SCORE (99) + +/* Get a random number in [l, r) */ +static inline unsigned long damon_rand(unsigned long l, unsigned long r) +{ + return l + prandom_u32_max(r - l); +} + +/** + * struct damon_addr_range - Represents an address region of [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_addr_range { + unsigned long start; + unsigned long end; +}; + +/** + * struct damon_region - Represents a monitoring target region. + * @ar: The address range of the region. + * @sampling_addr: Address of the sample for the next access check. + * @nr_accesses: Access frequency of this region. + * @list: List head for siblings. + * @age: Age of this region. + * + * @age is initially zero, increased for each aggregation interval, and reset + * to zero again if the access frequency is significantly changed. If two + * regions are merged into a new region, both @nr_accesses and @age of the new + * region are set as region size-weighted average of those of the two regions. + */ +struct damon_region { + struct damon_addr_range ar; + unsigned long sampling_addr; + unsigned int nr_accesses; + struct list_head list; + + unsigned int age; +/* private: Internal value for age calculation. */ + unsigned int last_nr_accesses; +}; + +/** + * struct damon_target - Represents a monitoring target. + * @pid: The PID of the virtual address space to monitor. + * @nr_regions: Number of monitoring target regions of this target. + * @regions_list: Head of the monitoring target regions of this target. + * @list: List head for siblings. + * + * Each monitoring context could have multiple targets. For example, a context + * for virtual memory address spaces could have multiple target processes. The + * @pid should be set for appropriate &struct damon_operations including the + * virtual address spaces monitoring operations. + */ +struct damon_target { + struct pid *pid; + unsigned int nr_regions; + struct list_head regions_list; + struct list_head list; +}; + +/** + * enum damos_action - Represents an action of a Data Access Monitoring-based + * Operation Scheme. + * + * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. + * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. + * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. + * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. + * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. + * @DAMOS_STAT: Do nothing but count the stat. + * @NR_DAMOS_ACTIONS: Total number of DAMOS actions + * + * The support of each action is up to running &struct damon_operations. + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except + * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR + * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum + * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. + */ +enum damos_action { + DAMOS_WILLNEED, + DAMOS_COLD, + DAMOS_PAGEOUT, + DAMOS_HUGEPAGE, + DAMOS_NOHUGEPAGE, + DAMOS_LRU_PRIO, + DAMOS_LRU_DEPRIO, + DAMOS_STAT, /* Do nothing but only record the stat */ + NR_DAMOS_ACTIONS, +}; + +/** + * struct damos_quota - Controls the aggressiveness of the given scheme. 
+ * @ms: Maximum milliseconds that the scheme can use. + * @sz: Maximum bytes of memory that the action can be applied. + * @reset_interval: Charge reset interval in milliseconds. + * + * @weight_sz: Weight of the region's size for prioritization. + * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. + * @weight_age: Weight of the region's age for prioritization. + * + * To avoid consuming too much CPU time or IO resources for applying the + * &struct damos->action to large memory, DAMON allows users to set time and/or + * size quotas. The quotas can be set by writing non-zero values to &ms and + * &sz, respectively. If the time quota is set, DAMON tries to use only up to + * &ms milliseconds within &reset_interval for applying the action. If the + * size quota is set, DAMON tries to apply the action only up to &sz bytes + * within &reset_interval. + * + * Internally, the time quota is transformed to a size quota using estimated + * throughput of the scheme's action. DAMON then compares it against &sz and + * uses smaller one as the effective quota. + * + * For selecting regions within the quota, DAMON prioritizes current scheme's + * target memory regions using the &struct damon_operations->get_scheme_score. + * You could customize the prioritization logic by setting &weight_sz, + * &weight_nr_accesses, and &weight_age, because monitoring operations are + * encouraged to respect those. + */ +struct damos_quota { + unsigned long ms; + unsigned long sz; + unsigned long reset_interval; + + unsigned int weight_sz; + unsigned int weight_nr_accesses; + unsigned int weight_age; + +/* private: */ + /* For throughput estimation */ + unsigned long total_charged_sz; + unsigned long total_charged_ns; + + unsigned long esz; /* Effective size quota in bytes */ + + /* For charging the quota */ + unsigned long charged_sz; + unsigned long charged_from; + struct damon_target *charge_target_from; + unsigned long charge_addr_from; + + /* For prioritization */ + unsigned long histogram[DAMOS_MAX_SCORE + 1]; + unsigned int min_score; +}; + +/** + * enum damos_wmark_metric - Represents the watermark metric. + * + * @DAMOS_WMARK_NONE: Ignore the watermarks of the given scheme. + * @DAMOS_WMARK_FREE_MEM_RATE: Free memory rate of the system in [0,1000]. + * @NR_DAMOS_WMARK_METRICS: Total number of DAMOS watermark metrics + */ +enum damos_wmark_metric { + DAMOS_WMARK_NONE, + DAMOS_WMARK_FREE_MEM_RATE, + NR_DAMOS_WMARK_METRICS, +}; + +/** + * struct damos_watermarks - Controls when a given scheme should be activated. + * @metric: Metric for the watermarks. + * @interval: Watermarks check time interval in microseconds. + * @high: High watermark. + * @mid: Middle watermark. + * @low: Low watermark. + * + * If &metric is &DAMOS_WMARK_NONE, the scheme is always active. Being active + * means DAMON does monitoring and applying the action of the scheme to + * appropriate memory regions. Else, DAMON checks &metric of the system for at + * least every &interval microseconds and works as below. + * + * If &metric is higher than &high, the scheme is inactivated. If &metric is + * between &mid and &low, the scheme is activated. If &metric is lower than + * &low, the scheme is inactivated. + */ +struct damos_watermarks { + enum damos_wmark_metric metric; + unsigned long interval; + unsigned long high; + unsigned long mid; + unsigned long low; + +/* private: */ + bool activated; +}; + +/** + * struct damos_stat - Statistics on a given scheme. 
+ * @nr_tried: Total number of regions that the scheme is tried to be applied. + * @sz_tried: Total size of regions that the scheme is tried to be applied. + * @nr_applied: Total number of regions that the scheme is applied. + * @sz_applied: Total size of regions that the scheme is applied. + * @qt_exceeds: Total number of times the quota of the scheme has been exceeded. + */ +struct damos_stat { + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +/** + * enum damos_filter_type - Type of memory for &struct damos_filter + * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @NR_DAMOS_FILTER_TYPES: Number of filter types. + * + * The support of each filter type is up to running &struct damon_operations. + * &enum DAMON_OPS_PADDR supports all filter types, while + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR do not support any + * filter types. + */ +enum damos_filter_type { + DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_MEMCG, + NR_DAMOS_FILTER_TYPES, +}; + +/** + * struct damos_filter - DAMOS action target memory filter. + * @type: Type of the page. + * @matching: Whether the matching page should be filtered out or in. + * @memcg_id: Id of the memcg in question if @type is DAMOS_FILTER_TYPE_MEMCG. + * @list: List head for siblings. + * + * Before applying the &damos->action to a memory region, DAMOS checks if each + * page of the region matches this filter and avoids applying the action if so. + * Note that the check support is up to &struct damon_operations + * implementation. + */ +struct damos_filter { + enum damos_filter_type type; + bool matching; + union { + unsigned short memcg_id; + }; + struct list_head list; +}; + +/** + * struct damos_access_pattern - Target access pattern of the given scheme. + * @min_sz_region: Minimum size of target regions. + * @max_sz_region: Maximum size of target regions. + * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. + * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. + * @min_age_region: Minimum age of target regions. + * @max_age_region: Maximum age of target regions. + */ +struct damos_access_pattern { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @pattern: Access pattern of target regions. + * @action: &damos_action to be applied to the target regions. + * @quota: Controls the aggressiveness of this scheme. + * @wmarks: Watermarks for automated (in)activation of this scheme. + * @filters: Additional set of &struct damos_filter for &action. + * @stat: Statistics of this scheme. + * @list: List head for siblings. + * + * For each aggregation interval, DAMON finds regions which fit in the + * &pattern and applies &action to those. To avoid consuming too much + * CPU time or IO resources for the &action, &quota is used. + * + * To do the work only when needed, schemes can be activated for specific + * system situations using &wmarks. If all schemes that are registered to the + * monitoring context are inactive, DAMON stops monitoring and just + * repeatedly checks the watermarks.
+ * + * Before applying the &action to a memory region, &struct damon_operations + * implementation could check pages of the region and skip &action to respect + * &filters + * + * After applying the &action to each region, &stat_count and &stat_sz is + * updated to reflect the number of regions and total size of regions that the + * &action is applied. + */ +struct damos { + struct damos_access_pattern pattern; + enum damos_action action; + struct damos_quota quota; + struct damos_watermarks wmarks; + struct list_head filters; + struct damos_stat stat; + struct list_head list; +}; + +/** + * enum damon_ops_id - Identifier for each monitoring operations implementation + * + * @DAMON_OPS_VADDR: Monitoring operations for virtual address spaces + * @DAMON_OPS_FVADDR: Monitoring operations for only fixed ranges of virtual + * address spaces + * @DAMON_OPS_PADDR: Monitoring operations for the physical address space + * @NR_DAMON_OPS: Number of monitoring operations implementations + */ +enum damon_ops_id { + DAMON_OPS_VADDR, + DAMON_OPS_FVADDR, + DAMON_OPS_PADDR, + NR_DAMON_OPS, +}; + +struct damon_ctx; + +/** + * struct damon_operations - Monitoring operations for given use cases. + * + * @id: Identifier of this operations set. + * @init: Initialize operations-related data structures. + * @update: Update operations-related data structures. + * @prepare_access_checks: Prepare next access check of target regions. + * @check_accesses: Check the accesses to target regions. + * @reset_aggregated: Reset aggregated accesses monitoring results. + * @get_scheme_score: Get the score of a region for a scheme. + * @apply_scheme: Apply a DAMON-based operation scheme. + * @target_valid: Determine if the target is valid. + * @cleanup: Clean up the context. + * + * DAMON can be extended for various address spaces and usages. For this, + * users should register the low level operations for their target address + * space and usecase via the &damon_ctx.ops. Then, the monitoring thread + * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting + * the monitoring, @update after each &damon_attrs.ops_update_interval, and + * @check_accesses, @target_valid and @prepare_access_checks after each + * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after + * each &damon_attrs.aggr_interval. + * + * Each &struct damon_operations instance having valid @id can be registered + * via damon_register_ops() and selected by damon_select_ops() later. + * @init should initialize operations-related data structures. For example, + * this could be used to construct proper monitoring target regions and link + * those to @damon_ctx.adaptive_targets. + * @update should update the operations-related data structures. For example, + * this could be used to update monitoring target regions for current status. + * @prepare_access_checks should manipulate the monitoring regions to be + * prepared for the next access check. + * @check_accesses should check the accesses to each region that made after the + * last preparation and update the number of observed accesses of each region. + * It should also return max number of observed accesses that made as a result + * of its update. The value will be used for regions adjustment threshold. + * @reset_aggregated should reset the access monitoring results that aggregated + * by @check_accesses. + * @get_scheme_score should return the priority score of a region for a scheme + * as an integer in [0, &DAMOS_MAX_SCORE]. 
+ * @apply_scheme is called from @kdamond when a region for user provided + * DAMON-based operation scheme is found. It should apply the scheme's action + * to the region and return bytes of the region that the action is successfully + * applied. + * @target_valid should check whether the target is still valid for the + * monitoring. + * @cleanup is called from @kdamond just before its termination. + */ +struct damon_operations { + enum damon_ops_id id; + void (*init)(struct damon_ctx *context); + void (*update)(struct damon_ctx *context); + void (*prepare_access_checks)(struct damon_ctx *context); + unsigned int (*check_accesses)(struct damon_ctx *context); + void (*reset_aggregated)(struct damon_ctx *context); + int (*get_scheme_score)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); + unsigned long (*apply_scheme)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); + bool (*target_valid)(struct damon_target *t); + void (*cleanup)(struct damon_ctx *context); +}; + +/** + * struct damon_callback - Monitoring events notification callbacks. + * + * @before_start: Called before starting the monitoring. + * @after_wmarks_check: Called after each schemes' watermarks check. + * @after_sampling: Called after each sampling. + * @after_aggregation: Called after each aggregation. + * @before_damos_apply: Called before applying DAMOS action. + * @before_terminate: Called before terminating the monitoring. + * @private: User private data. + * + * The monitoring thread (&damon_ctx.kdamond) calls @before_start and + * @before_terminate just before starting and finishing the monitoring, + * respectively. Therefore, those are good places for installing and cleaning + * @private. + * + * The monitoring thread calls @after_wmarks_check after each DAMON-based + * operation schemes' watermarks check. If users need to make changes to the + * attributes of the monitoring context while it's deactivated due to the + * watermarks, this is the good place to do. + * + * The monitoring thread calls @after_sampling and @after_aggregation for each + * of the sampling intervals and aggregation intervals, respectively. + * Therefore, users can safely access the monitoring results without additional + * protection. For the reason, users are recommended to use these callback for + * the accesses to the results. + * + * If any callback returns non-zero, monitoring stops. + */ +struct damon_callback { + void *private; + + int (*before_start)(struct damon_ctx *context); + int (*after_wmarks_check)(struct damon_ctx *context); + int (*after_sampling)(struct damon_ctx *context); + int (*after_aggregation)(struct damon_ctx *context); + int (*before_damos_apply)(struct damon_ctx *context, + struct damon_target *target, + struct damon_region *region, + struct damos *scheme); + void (*before_terminate)(struct damon_ctx *context); +}; + +/** + * struct damon_attrs - Monitoring attributes for accuracy/overhead control. + * + * @sample_interval: The time between access samplings. + * @aggr_interval: The time between monitor results aggregations. + * @ops_update_interval: The time between monitoring operations updates. + * @min_nr_regions: The minimum number of adaptive monitoring + * regions. + * @max_nr_regions: The maximum number of adaptive monitoring + * regions. + * + * For each @sample_interval, DAMON checks whether each region is accessed or + * not. 
It aggregates and keeps the access information (number of accesses to + * each region) for @aggr_interval time. DAMON also checks whether the target + * memory regions need update (e.g., by ``mmap()`` calls from the application, + * in case of virtual memory monitoring) and applies the changes for each + * @ops_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_operations and &struct damon_callback for more + * detail. + */ +struct damon_attrs { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long ops_update_interval; + unsigned long min_nr_regions; + unsigned long max_nr_regions; +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @attrs: Monitoring attributes for accuracy/overhead control. + * @kdamond: Kernel thread who does the monitoring. + * @kdamond_lock: Mutex for the synchronizations with @kdamond. + * + * For each monitoring context, one kernel thread for the monitoring is + * created. The pointer to the thread is stored in @kdamond. + * + * Once started, the monitoring thread runs until explicitly required to be + * terminated or every monitoring target is invalid. The validity of the + * targets is checked via the &damon_operations.target_valid of @ops. The + * termination can also be explicitly requested by calling damon_stop(). + * The thread sets @kdamond to NULL when it terminates. Therefore, users can + * know whether the monitoring is ongoing or terminated by reading @kdamond. + * Reads and writes to @kdamond from outside of the monitoring thread must + * be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond via @kdamond_lock. + * Accesses to other fields must be protected by themselves. + * + * @ops: Set of monitoring operations for given use cases. + * @callback: Set of callbacks for monitoring events notifications. + * + * @adaptive_targets: Head of monitoring targets (&damon_target) list. + * @schemes: Head of schemes (&damos) list. 
+ */ +struct damon_ctx { + struct damon_attrs attrs; + +/* private: internal use only */ + struct timespec64 last_aggregation; + struct timespec64 last_ops_update; + +/* public: */ + struct task_struct *kdamond; + struct mutex kdamond_lock; + + struct damon_operations ops; + struct damon_callback callback; + + struct list_head adaptive_targets; + struct list_head schemes; +}; + +static inline struct damon_region *damon_next_region(struct damon_region *r) +{ + return container_of(r->list.next, struct damon_region, list); +} + +static inline struct damon_region *damon_prev_region(struct damon_region *r) +{ + return container_of(r->list.prev, struct damon_region, list); +} + +static inline struct damon_region *damon_last_region(struct damon_target *t) +{ + return list_last_entry(&t->regions_list, struct damon_region, list); +} + +static inline struct damon_region *damon_first_region(struct damon_target *t) +{ + return list_first_entry(&t->regions_list, struct damon_region, list); +} + +static inline unsigned long damon_sz_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} + + +#define damon_for_each_region(r, t) \ + list_for_each_entry(r, &t->regions_list, list) + +#define damon_for_each_region_from(r, t) \ + list_for_each_entry_from(r, &t->regions_list, list) + +#define damon_for_each_region_safe(r, next, t) \ + list_for_each_entry_safe(r, next, &t->regions_list, list) + +#define damon_for_each_target(t, ctx) \ + list_for_each_entry(t, &(ctx)->adaptive_targets, list) + +#define damon_for_each_target_safe(t, next, ctx) \ + list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) + +#define damon_for_each_scheme(s, ctx) \ + list_for_each_entry(s, &(ctx)->schemes, list) + +#define damon_for_each_scheme_safe(s, next, ctx) \ + list_for_each_entry_safe(s, next, &(ctx)->schemes, list) + +#define damos_for_each_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->filters, list) + +#define damos_for_each_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->filters, list) + +#ifdef CONFIG_DAMON + +struct damon_region *damon_new_region(unsigned long start, unsigned long end); + +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next, + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + +void damon_add_region(struct damon_region *r, struct damon_target *t); +void damon_destroy_region(struct damon_region *r, struct damon_target *t); +int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, + unsigned int nr_ranges); + +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching); +void damos_add_filter(struct damos *s, struct damos_filter *f); +void damos_destroy_filter(struct damos_filter *f); + +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); +void damon_destroy_scheme(struct damos *s); + +struct damon_target *damon_new_target(void); +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +bool damon_targets_empty(struct damon_ctx *ctx); +void damon_free_target(struct damon_target *t); +void damon_destroy_target(struct damon_target *t); +unsigned int damon_nr_regions(struct damon_target *t); + +struct damon_ctx *damon_new_ctx(void); +void 
damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); +void damon_set_schemes(struct damon_ctx *ctx, + struct damos **schemes, ssize_t nr_schemes); +int damon_nr_running_ctxs(void); +bool damon_is_registered_ops(enum damon_ops_id id); +int damon_register_ops(struct damon_operations *ops); +int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); + +static inline bool damon_target_has_pid(const struct damon_ctx *ctx) +{ + return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR; +} + + +int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); + +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end); + +#endif /* CONFIG_DAMON */ + +#endif /* _DAMON_H */ diff --git a/include/linux/delay.h b/include/linux/delay.h index 1d0e2ce6b6d9f..e8607992c68a5 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -20,6 +20,7 @@ */ #include +#include extern unsigned long loops_per_jiffy; @@ -58,7 +59,18 @@ void calibrate_delay(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); +void usleep_range_state(unsigned long min, unsigned long max, + unsigned int state); + +static inline void usleep_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); +} + +static inline void usleep_idle_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_IDLE); +} static inline void ssleep(unsigned int seconds) { diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a5f89fc4d6df1..e7c4233ea044f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -7,6 +7,7 @@ #define _LINUX_DMA_MAP_OPS_H #include +#include #include struct cma; @@ -321,6 +322,10 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif } #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */ diff --git a/include/linux/dma-page-touching.h b/include/linux/dma-page-touching.h new file mode 100644 index 0000000000000..8ff9856e994c9 --- /dev/null +++ b/include/linux/dma-page-touching.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
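Tying the new DAMON kernel API above together, the sketch below shows how an in-kernel user might build a context that pages out cold physical memory under a quota and free-memory-rate watermarks. It is only an illustration under assumed values: CONFIG_DAMON and DAMON_OPS_PADDR are assumed to be available, and damon_cold_pageout_start() and all the numbers are invented for the example.

#include <linux/damon.h>

static struct damon_ctx *cold_ctx;

static int damon_cold_pageout_start(void)
{
	struct damos_access_pattern pattern = {
		.min_sz_region = DAMON_MIN_REGION,
		.max_sz_region = ULONG_MAX,
		.min_nr_accesses = 0,
		.max_nr_accesses = 0,		/* not accessed at all ... */
		.min_age_region = 5,		/* ... for 5+ aggregation intervals */
		.max_age_region = UINT_MAX,
	};
	struct damos_quota quota = {
		.ms = 10,			/* up to 10 ms of work ... */
		.sz = 128 * 1024 * 1024,	/* ... or 128 MiB ... */
		.reset_interval = 1000,		/* ... per second */
		.weight_sz = 0,
		.weight_nr_accesses = 0,
		.weight_age = 1,		/* prefer older regions */
	};
	struct damos_watermarks wmarks = {
		.metric = DAMOS_WMARK_FREE_MEM_RATE,	/* permil of free memory */
		.interval = 5 * 1000 * 1000,		/* check every 5 s */
		.high = 500,
		.mid = 400,
		.low = 200,
	};
	struct damon_attrs attrs = {
		.sample_interval = 5 * 1000,		/* 5 ms */
		.aggr_interval = 100 * 1000,		/* 100 ms */
		.ops_update_interval = 1000 * 1000,	/* 1 s */
		.min_nr_regions = 10,
		.max_nr_regions = 1000,
	};
	unsigned long start = 0, end = 0;
	struct damon_target *target;
	struct damos *scheme;
	int err;

	cold_ctx = damon_new_ctx();
	if (!cold_ctx)
		return -ENOMEM;

	err = damon_select_ops(cold_ctx, DAMON_OPS_PADDR);
	if (err)
		goto out;
	err = damon_set_attrs(cold_ctx, &attrs);
	if (err)
		goto out;

	err = -ENOMEM;
	scheme = damon_new_scheme(&pattern, DAMOS_PAGEOUT, &quota, &wmarks);
	if (!scheme)
		goto out;
	damon_add_scheme(cold_ctx, scheme);

	target = damon_new_target();
	if (!target)
		goto out;
	damon_add_target(cold_ctx, target);

	/* Monitor the biggest contiguous block of system RAM. */
	err = damon_set_region_biggest_system_ram_default(target, &start, &end);
	if (err)
		goto out;

	/* exclusive == false: other DAMON contexts may run concurrently. */
	return damon_start(&cold_ctx, 1, false);
out:
	damon_destroy_ctx(cold_ctx);
	return err;
}

This roughly mirrors the pattern used by the in-tree DAMON_RECLAIM and DAMON_LRU_SORT modules.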
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Sets the supplied device's DMA ops to the page toucing DMA ops if + * page touching is enabled and the device does not already have + * DMA ops assigned. + */ +void setup_dma_page_touching_ops(struct device *dev); diff --git a/include/linux/filter.h b/include/linux/filter.h index bc6ce4b202a80..cce9f97e4ead8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -982,11 +982,13 @@ void bpf_warn_invalid_xdp_action(u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { return NULL; diff --git a/include/linux/irq.h b/include/linux/irq.h index b89a8ac83d1bc..858fc5efd85fd 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -810,6 +810,8 @@ extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); +extern void irq_state_clr_started(struct irq_desc *desc); + static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); diff --git a/include/linux/math64.h b/include/linux/math64.h index 66deb1fdc2ef6..302f380b535a7 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -3,6 +3,7 @@ #define _LINUX_MATH64_H #include +#include #include #include @@ -234,6 +235,24 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) #endif +#ifndef mul_s64_u64_shr +static inline u64 mul_s64_u64_shr(s64 a, u64 b, unsigned int shift) +{ + u64 ret; + + /* + * Extract the sign before the multiplication and put it back + * afterwards if needed. + */ + ret = mul_u64_u64_shr(abs(a), b, shift); + + if (a < 0) + ret = -((s64) ret); + + return ret; +} +#endif /* mul_s64_u64_shr */ + #ifndef mul_u64_u32_div static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) { diff --git a/include/linux/memory.h b/include/linux/memory.h index 4da95e684e20f..97e92e8b556a3 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -29,6 +29,11 @@ struct memory_block { int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ struct device dev; + /* + * Number of vmemmap pages. These pages + * lay at the beginning of the memory block. 
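The sign-splitting trick in mul_s64_u64_shr() above is easiest to see with a small worked example; the scale_s64_by_ratio() wrapper and the 32.32 fixed-point convention here are purely illustrative:

#include <linux/math64.h>

/* Scale a signed value by an unsigned 32.32 fixed-point ratio. */
static inline s64 scale_s64_by_ratio(s64 value, u64 ratio_32_32)
{
	return mul_s64_u64_shr(value, ratio_32_32, 32);
}

/*
 * With ratio 1.5 (3ULL << 31 in 32.32 fixed point):
 *	scale_s64_by_ratio(6, 3ULL << 31)  ==  9
 *	scale_s64_by_ratio(-6, 3ULL << 31) == -9  (sign restored after the shift)
 */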
+ */ + unsigned long nr_vmemmap_pages; }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v) #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -int create_memory_block_devices(unsigned long start, unsigned long size); +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1dafc7c7f5cfe..7a49c61182163 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -70,6 +70,14 @@ typedef int __bitwise mhp_t; */ #define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) +/* + * We want memmap (struct page array) to be self contained. + * To do so, we will use the beginning of the hot-added range to build + * the page tables for the memmap array that describes the entire range. + * Only selected architectures support it with SPARSE_VMEMMAP. + */ +#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -111,9 +119,13 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +extern void adjust_present_page_count(struct zone *zone, long nr_pages); /* VM interface that may be used by firmware interface */ +extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone); +extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid); + struct zone *zone); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -361,6 +373,7 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned long start_pfn, unsigned long nr_pages); +extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f5b464daeeca5..45a79da89c5fb 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,7 +17,7 @@ struct device; * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { - const unsigned long base_pfn; + unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2e4599b88832..caf9490ec4bf1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -406,6 +406,11 @@ enum zone_type { * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). + * 6. 
Memory-hotplug: when using memmap_on_memory and onlining the + * memory to the MOVABLE zone, the vmemmap pages are also placed in + * such zone. Such pages cannot be really moved around as they are + * self-stored in the range, but they are treated as movable when + * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) @@ -1331,10 +1336,8 @@ static inline int online_section_nr(unsigned long nr) #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); -#ifdef CONFIG_MEMORY_HOTREMOVE void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif -#endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index e39342945a80b..dcd1f99e92e22 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -45,6 +45,11 @@ */ #define NFS_RPC_SWAPFLAGS (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) +/* + * Size of the NFS directory verifier + */ +#define NFS_DIR_VERIFIER_SIZE 2 + /* * NFSv3/v4 Access mode cache entry */ @@ -89,8 +94,8 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; - const struct cred *cred; unsigned long attr_gencount; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 dup_cookie; signed char duped; @@ -158,7 +163,7 @@ struct nfs_inode { * This is the cookie verifier used for NFSv3 readdir * operations */ - __be32 cookieverf[2]; + __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; atomic_long_t nrequests; struct nfs_mds_commit_info commit_info; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 33442fd018a06..05cd8f3875681 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -755,6 +755,20 @@ struct nfs_entry { struct nfs_server * server; }; +struct nfs_readdir_arg { + struct dentry *dentry; + const struct cred *cred; + __be32 *verf; + u64 cookie; + struct page **pages; + unsigned int page_len; + bool plus; +}; + +struct nfs_readdir_res { + __be32 *verf; +}; + /* * The following types are for NFSv2 only. */ @@ -1749,8 +1763,7 @@ struct nfs_rpc_ops { unsigned int, struct iattr *); int (*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); - int (*readdir) (struct dentry *, const struct cred *, - u64, struct page **, unsigned int, bool); + int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, dev_t); int (*statfs) (struct nfs_server *, struct nfs_fh *, diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 662f19374bd98..a2042c4186864 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -117,6 +117,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD func:req + .pushsection .discard.func_stack_frame_non_standard, "aw" + .long \func - . 
+ .popsection +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_STACK_VALIDATION */ @@ -130,6 +136,8 @@ struct unwind_hint { #define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm +.macro STACK_FRAME_NON_STANDARD func:req +.endm #endif #endif /* CONFIG_STACK_VALIDATION */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4f6ba93791121..0f010fc7f1c4d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -132,7 +132,7 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif @@ -437,7 +437,7 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index cfce186f0c4e0..c9cbc97560116 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,7 +19,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 1e894d34bdceb..d8a6aecf99cb9 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -6,7 +6,7 @@ #include #include -#ifdef CONFIG_IDLE_PAGE_TRACKING +#ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT static inline bool page_is_young(struct page *page) @@ -106,7 +106,7 @@ static inline void clear_page_idle(struct page *page) } #endif /* CONFIG_64BIT */ -#else /* !CONFIG_IDLE_PAGE_TRACKING */ +#else /* !CONFIG_PAGE_IDLE_FLAG */ static inline bool page_is_young(struct page *page) { @@ -135,6 +135,6 @@ static inline void clear_page_idle(struct page *page) { } -#endif /* CONFIG_IDLE_PAGE_TRACKING */ +#endif /* CONFIG_PAGE_IDLE_FLAG */ #endif /* _LINUX_MM_PAGE_IDLE_H */ diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index 3b99e0ec24f22..197c1d9928361 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -12,6 +12,8 @@ struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ int (*report)(struct page_reporting_dev_info *prdev, struct scatterlist *sg, unsigned int nents); + int (*report_offline)(struct page_reporting_dev_info *prdev, + unsigned long start_pfn, unsigned int nr_pages); /* work struct for processing reports */ struct delayed_work work; @@ -20,6 +22,8 @@ struct page_reporting_dev_info { atomic_t state; }; +void page_report_offline(unsigned long start_pfn, unsigned int nr_pages); + /* Tear-down and bring-up for page reporting devices */ void page_reporting_unregister(struct page_reporting_dev_info *prdev); int page_reporting_register(struct page_reporting_dev_info *prdev); diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h new file mode 100644 index 0000000000000..f960a719f0d54 --- /dev/null +++ b/include/linux/ptp_kvm.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Virtual PTP 1588 clock for use with KVM guests + * + * Copyright (C) 2017 Red Hat Inc. 
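For the new report_offline hook added to struct page_reporting_dev_info above, here is a hedged sketch of how a reporting driver might wire it up next to the existing report callback; the foo_* names and the callback bodies are hypothetical, and the hook's semantics are inferred from the page_report_offline() declaration:

#include <linux/page_reporting.h>
#include <linux/scatterlist.h>

static int foo_report(struct page_reporting_dev_info *prdev,
		      struct scatterlist *sgl, unsigned int nents)
{
	/* Hand the free ranges described by @sgl to the hypervisor. */
	return 0;
}

static int foo_report_offline(struct page_reporting_dev_info *prdev,
			      unsigned long start_pfn, unsigned int nr_pages)
{
	/* Report an offlined pfn range, as triggered via page_report_offline(). */
	return 0;
}

static struct page_reporting_dev_info foo_pr_dev_info = {
	.report		= foo_report,
	.report_offline	= foo_report_offline,
};

/* Registration and tear-down use the existing interface:
 *	page_reporting_register(&foo_pr_dev_info);
 *	page_reporting_unregister(&foo_pr_dev_info);
 */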
+ */ + +#ifndef _PTP_KVM_H_ +#define _PTP_KVM_H_ + +struct timespec64; +struct clocksource; + +int kvm_arch_ptp_init(void); +int kvm_arch_ptp_get_clock(struct timespec64 *ts); +int kvm_arch_ptp_get_crosststamp(u64 *cycle, + struct timespec64 *tspec, struct clocksource **cs); + +#endif /* _PTP_KVM_H_ */ diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d7db179963221..e0b300de8f3fa 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -158,4 +158,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim, rb_replace_node(victim, new, &root->rb_root); } +/* + * The below helper functions use 2 operators with 3 different + * calling conventions. The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * rb_find(). + * + * The reason for this is to allow the find() interface without requiring an + * on-stack dummy object, which might not be feasible due to object size. + */ + +/** + * rb_add_cached() - insert @node into the leftmost cached tree @tree + * @node: node to insert + * @tree: leftmost cached tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); +} + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (less(node, parent)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +/** + * rb_find_add() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + +/** + * rb_find() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Returns the rb_node matching @key or NULL. 
+ */ +static __always_inline struct rb_node * +rb_find(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = node->rb_left; + else if (c > 0) + node = node->rb_right; + else + return node; + } + + return NULL; +} + +/** + * rb_find_first() - find the first @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the leftmost node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_find_first(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +/** + * rb_next_match() - find the next @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the next node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_next_match(const void *key, struct rb_node *node, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +/** + * rb_for_each() - iterates a subtree matching @key + * @node: iterator + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + */ +#define rb_for_each(node, key, tree, cmp) \ + for ((node) = rb_find_first((key), (tree), (cmp)); \ + (node); (node) = rb_next_match((key), (node), (cmp))) + #endif /* _LINUX_RBTREE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5da4b3c89f636..2f2bcaa7e7d78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1357,6 +1357,16 @@ struct task_struct { int mce_count; #endif +#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH + /* + * If L1D flush is supported on mm context switch + * then we use this callback head to queue kill work + * to kill tasks that are not running on SMT disabled + * cores + */ + struct callback_head l1d_flush_kill; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. 
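Coming back to the rb_add()/rb_find() helpers added to rbtree.h above, a minimal self-contained sketch of the less()/cmp() calling conventions; struct foo, foo_less(), foo_cmp(), foo_insert() and foo_lookup() are invented for illustration:

#include <linux/rbtree.h>

struct foo {
	struct rb_node node;
	u64 key;
};

static bool foo_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct foo, node)->key <
	       rb_entry(b, struct foo, node)->key;
}

static int foo_cmp(const void *key, const struct rb_node *node)
{
	const u64 *k = key;
	const struct foo *f = rb_entry(node, struct foo, node);

	if (*k < f->key)
		return -1;
	if (*k > f->key)
		return 1;
	return 0;
}

static void foo_insert(struct rb_root *root, struct foo *f)
{
	rb_add(&f->node, root, foo_less);
}

static struct foo *foo_lookup(struct rb_root *root, u64 key)
{
	struct rb_node *node = rb_find(&key, root, foo_cmp);

	return node ? rb_entry(node, struct foo, node) : NULL;
}

rb_find_first() and rb_for_each() take the same cmp() operator and visit every node matching the key when duplicates are present.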
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 867d588314e03..902654ac5f7e7 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -32,6 +32,10 @@ static inline void clear_sched_clock_stable(void) { } +static inline void set_sched_clock_stable(void) +{ +} + static inline void sched_clock_idle_sleep_event(void) { } @@ -51,6 +55,7 @@ static inline u64 local_clock(void) } #else extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); extern void clear_sched_clock_stable(void); /* diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9ef7bf686a9f7..bf1c656c3be0a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -86,6 +86,7 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */ int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 11a98144bda0b..f626afe60d8fd 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -225,7 +225,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + fast_ack_mode:2, /* which fast ack mode ? */ + unused:3; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ @@ -412,6 +413,9 @@ struct tcp_sock { */ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; + +/* Rerouting information */ + u16 ecn_rehash; /* PLB triggered rehash attempts */ }; enum tsq_enum { diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7f7e4a3f4394a..2ee05355333f6 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -3,6 +3,7 @@ #define _LINUX_TIMEKEEPING_H #include +#include /* Included from linux/ktime.h */ @@ -244,11 +245,12 @@ struct ktime_timestamps { * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { - u64 cycles; - ktime_t real; - ktime_t raw; - unsigned int clock_was_set_seq; - u8 cs_was_changed_seq; + u64 cycles; + ktime_t real; + ktime_t raw; + enum clocksource_ids cs_id; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; }; /** diff --git a/include/linux/uio.h b/include/linux/uio.h index cedb68e49e4f9..4b19d7dd003d5 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -54,6 +54,7 @@ struct iov_iter { unsigned int start_head; }; }; + size_t truncated; }; static inline enum iter_type iov_iter_type(const struct iov_iter *i) @@ -125,6 +126,14 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) }; } +#define iov_for_each(iov, iter, start) \ + if (iov_iter_type(&(start)) == ITER_IOVEC || \ + iov_iter_type(&(start)) == ITER_KVEC) \ + for (iter = (start); \ + (iter).count && \ + ((iov = iov_iter_iovec(&(iter))), 1); \ + iov_iter_advance(&(iter), (iov).iov_len)) + size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes); @@ -263,8 +272,10 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) * conversion in assignement is by definition greater than all * values of 
size_t, including old i->count. */ - if (i->count > count) + if (i->count > count) { + i->truncated += i->count - count; i->count = count; + } } /* @@ -273,6 +284,7 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { + i->truncated -= count - i->count; i->count = count; } diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ff901aade442f..2515ffe09e51c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -134,8 +134,9 @@ struct inet_connection_sock { u32 icsk_probes_tstamp; u32 icsk_user_timeout; - u64 icsk_ca_priv[104 / sizeof(u64)]; -#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) +/* XXX inflated by temporary internal debugging info */ +#define ICSK_CA_PRIV_SIZE (224) + u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9b0d8649ae5b8..462b8e96dd9c3 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,7 +98,7 @@ struct netns_ipv4 { u8 sysctl_ip_default_ttl; u8 sysctl_ip_no_pmtu_disc; u8 sysctl_ip_fwd_use_pmtu; - int sysctl_ip_fwd_update_priority; + u8 sysctl_ip_fwd_update_priority; u8 sysctl_ip_nonlocal_bind; u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? */ @@ -107,8 +107,8 @@ struct netns_ipv4 { #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif - int sysctl_tcp_early_demux; - int sysctl_udp_early_demux; + u8 sysctl_tcp_early_demux; + u8 sysctl_udp_early_demux; u8 sysctl_nexthop_compat_mode; @@ -184,6 +184,11 @@ struct netns_ipv4 { unsigned int sysctl_tcp_fastopen_blackhole_timeout; atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; + u8 sysctl_tcp_plb_enabled; + int sysctl_tcp_plb_cong_thresh; + u8 sysctl_tcp_plb_idle_rehash_rounds; + u8 sysctl_tcp_plb_rehash_rounds; + u8 sysctl_tcp_plb_suspend_rto_sec; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 3eac185ae2e8a..efc9085c68927 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock; struct sock_reuseport { struct rcu_head rcu; - u16 max_socks; /* length of socks */ - u16 num_socks; /* elements in socks */ + u16 max_socks; /* length of socks */ + u16 num_socks; /* elements in socks */ + u16 num_closed_socks; /* closed elements in socks */ /* The last synq overflow event timestamp of this * reuse->socks[] group. 
*/ @@ -31,10 +32,14 @@ extern int reuseport_alloc(struct sock *sk, bool bind_inany); extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); +void reuseport_stop_listen_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb); extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index dcca41f3a2240..b2abfa98ec1bd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -357,6 +357,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 +#define TCP_ECN_ECT_PERMANENT 16 enum tcp_tw_status { TCP_TW_SUCCESS = 0, @@ -790,6 +791,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } +static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) +{ + return max_t(s32, t1 - t0, 0); +} + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); @@ -857,16 +863,22 @@ struct tcp_skb_cb { __u32 ack_seq; /* Sequence number ACK'd */ union { struct { +#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ - __u32 in_flight:30,/* Bytes in flight at transmit */ - is_app_limited:1, /* cwnd not fully used? */ - unused:1; + __u32 is_app_limited:1, /* cwnd not fully used? */ + delivered_ce:20, + unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ - u64 first_tx_mstamp; + u32 first_tx_mstamp; /* when we reached the "delivered" count */ - u64 delivered_mstamp; + u32 delivered_mstamp; +#define TCPCB_IN_FLIGHT_BITS 20 +#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) + u32 in_flight:20, /* packets in flight at transmit */ + unused2:12; + u32 lost; /* packets lost so far upon tx of skb */ } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -1016,7 +1028,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 -#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) +/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ +#define TCP_CONG_WANTS_CE_EVENTS 0x4 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ + TCP_CONG_NEEDS_ECN | \ + TCP_CONG_WANTS_CE_EVENTS) union tcp_cc_info; @@ -1036,8 +1052,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_lost; /* tp->lost at "prior_mstamp" */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 lost; /* number of packets lost over interval */ s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* packets delivered w/ CE mark over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ @@ -1049,47 +1070,66 @@ struct rate_sample { bool is_app_limited; /* is sample from packet with bubble in pipe? 
*/ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ }; struct tcp_congestion_ops { - struct list_head list; - u32 key; - u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); +/* fast path fields are put first to fill one cache line */ /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); + /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); + /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); + /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); - /* new value of cwnd after loss (required) */ - u32 (*undo_cwnd)(struct sock *sk); + /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - u32 (*sndbuf_expand)(struct sock *sk); + + /* pick target number of segments per TSO/GSO skb (optional): */ + u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); + + /* react to a specific lost skb (optional) */ + void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + + + /* new value of cwnd after loss (required) */ + u32 (*undo_cwnd)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + u32 (*sndbuf_expand)(struct sock *sk); + +/* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); - char name[TCP_CA_NAME_MAX]; - struct module *owner; -}; + char name[TCP_CA_NAME_MAX]; + struct module *owner; + struct list_head list; + u32 key; + u32 flags; + + /* initialize private data (optional) */ + void (*init)(struct sock *sk); + /* cleanup private data (optional) */ + void (*release)(struct sock *sk); +} ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); @@ -1124,6 +1164,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif +static inline bool tcp_ca_wants_ce_events(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | + TCP_CONG_WANTS_CE_EVENTS); +} + static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1149,6 +1197,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) } /* From tcp_rate.c */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); @@ -2104,6 +2153,23 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, extern void tcp_rack_reo_timeout(struct sock *sk); extern 
void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); +/* tcp_plb.c */ + +#define TCP_PLB_SCALE 8 /* scaling factor for fractions in PLB (e.g. ce_ratio) */ + +/* State for PLB (Protective Load Balancing) for a single TCP connection. */ +struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + enabled:1, /* Check if PLB is enabled */ + unused:2; + u32 pause_until; /* jiffies32 when PLB can resume repathing */ +}; + +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio); +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); + /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { diff --git a/include/net/udp.h b/include/net/udp.h index e2550a4547a70..dcc2230e30a28 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -207,6 +207,11 @@ void udp_lib_rehash(struct sock *sk, u16 new_hash); static inline void udp_lib_close(struct sock *sk, long timeout) { + /* A zerocopy skb has a refcnt of sk and may be + * put into sk_error_queue with TX timestamp + */ + skb_queue_purge(&sk->sk_error_queue); + sk_common_release(sk); } diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h new file mode 100644 index 0000000000000..c79f1d4c39afe --- /dev/null +++ b/include/trace/events/damon.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM damon + +#if !defined(_TRACE_DAMON_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DAMON_H + +#include +#include +#include + +TRACE_EVENT(damon_aggregated, + + TP_PROTO(struct damon_target *t, unsigned int target_id, + struct damon_region *r, unsigned int nr_regions), + + TP_ARGS(t, target_id, r, nr_regions), + + TP_STRUCT__entry( + __field(unsigned long, target_id) + __field(unsigned int, nr_regions) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, nr_accesses) + __field(unsigned int, age) + ), + + TP_fast_assign( + __entry->target_id = target_id; + __entry->nr_regions = nr_regions; + __entry->start = r->ar.start; + __entry->end = r->ar.end; + __entry->nr_accesses = r->nr_accesses; + __entry->age = r->age; + ), + + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u", + __entry->target_id, __entry->nr_regions, + __entry->start, __entry->end, + __entry->nr_accesses, __entry->age) +); + +#endif /* _TRACE_DAMON_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f4..ebee94c397a67 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -73,7 +73,7 @@ #define IF_HAVE_PG_HWPOISON(flag,string) #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) #define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string} #else #define IF_HAVE_PG_IDLE(flag,string) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a234023821e3..75b2d5df95a1c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -240,6 +241,9 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, 
+ BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, __MAX_BPF_ATTACH_TYPE }; @@ -1661,6 +1665,14 @@ union bpf_attr { * Return * A 8-byte long non-decreasing number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket @@ -3742,6 +3754,50 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3956,9 @@ union bpf_attr { FN(per_cpu_ptr), \ FN(this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4356,6 +4415,20 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. 
+ * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. + */ + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); }; #define BPF_TAG_SIZE 8 diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 20ee93f0f8761..96d52dd9c48ac 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -231,9 +231,42 @@ struct tcp_bbr_info { __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ }; +/* Phase as reported in netlink/ss stats. */ +enum tcp_bbr2_phase { + BBR2_PHASE_INVALID = 0, + BBR2_PHASE_STARTUP = 1, + BBR2_PHASE_DRAIN = 2, + BBR2_PHASE_PROBE_RTT = 3, + BBR2_PHASE_PROBE_BW_UP = 4, + BBR2_PHASE_PROBE_BW_DOWN = 5, + BBR2_PHASE_PROBE_BW_CRUISE = 6, + BBR2_PHASE_PROBE_BW_REFILL = 7 +}; + +struct tcp_bbr2_info { + /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ + __u32 bbr_bw_lsb; /* lower 32 bits of bw */ + __u32 bbr_bw_msb; /* upper 32 bits of bw */ + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ + __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ + __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ + __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ + __u8 bbr_mode; /* current bbr_mode in state machine */ + __u8 bbr_phase; /* current state machine phase */ + __u8 unused1; /* alignment padding; not used yet */ + __u8 bbr_version; /* MUST be at this offset in struct */ + __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ + __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ + __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ +}; + union tcp_cc_info { struct tcpvegas_info vegas; struct tcp_dctcp_info dctcp; struct tcp_bbr_info bbr; + struct tcp_bbr2_info bbr2; }; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 860bbf6bf29cb..a9b626f014e78 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -67,7 +67,7 @@ #define IPVERSION 4 #define MAXTTL 255 -#define IPDEFTTL 64 +#define IPDEFTTL 127 #define IPOPT_OPTVAL 0 #define IPOPT_OLEN 1 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ca41220b40b8b..0d7350d1795bb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1053,6 +1053,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 +#define KVM_CAP_PTP_KVM 198 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/uapi/linux/nitro_enclaves.h b/include/uapi/linux/nitro_enclaves.h index b945073fe544d..e808f5ba124d4 100644 --- a/include/uapi/linux/nitro_enclaves.h +++ b/include/uapi/linux/nitro_enclaves.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ #ifndef _UAPI_LINUX_NITRO_ENCLAVES_H_ @@ -60,7 +60,7 @@ * * Context: Process context. * Return: - * * 0 - Logic succesfully completed. + * * 0 - Logic successfully completed. * * -1 - There was a failure in the ioctl logic. 
* On failure, errno is set to: * * EFAULT - copy_from_user() / copy_to_user() failure. @@ -95,7 +95,7 @@ * * Context: Process context. * Return: - * * 0 - Logic succesfully completed. + * * 0 - Logic successfully completed. * * -1 - There was a failure in the ioctl logic. * On failure, errno is set to: * * EFAULT - copy_from_user() / copy_to_user() failure. @@ -118,7 +118,7 @@ * * Context: Process context. * Return: - * * 0 - Logic succesfully completed. + * * 0 - Logic successfully completed. * * -1 - There was a failure in the ioctl logic. * On failure, errno is set to: * * EFAULT - copy_from_user() failure. @@ -161,7 +161,7 @@ * * Context: Process context. * Return: - * * 0 - Logic succesfully completed. + * * 0 - Logic successfully completed. * * -1 - There was a failure in the ioctl logic. * On failure, errno is set to: * * EFAULT - copy_from_user() / copy_to_user() failure. diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index d99b5a7726980..b2e43185e3b55 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -55,7 +55,10 @@ struct nvme_passthru_cmd64 { __u64 metadata; __u64 addr; __u32 metadata_len; - __u32 data_len; + union { + __u32 data_len; /* for non-vectored io */ + __u32 vec_cnt; /* for vectored io */ + }; __u32 cdw10; __u32 cdw11; __u32 cdw12; @@ -78,5 +81,6 @@ struct nvme_passthru_cmd64 { #define NVME_IOCTL_RESCAN _IO('N', 0x46) #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7f0827705c9a4..943e0f34565c1 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -213,6 +213,7 @@ struct prctl_mm_map { /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 # define PR_SPEC_INDIRECT_BRANCH 1 +# define PR_SPEC_L1D_FLUSH 2 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index f84e7bcad6deb..232961aecdeb4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -289,6 +289,9 @@ enum LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ + LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ + LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ + LINUX_MIB_TCPECNREHASH, /* TCPECNRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/sysgenid.h b/include/uapi/linux/sysgenid.h new file mode 100644 index 0000000000000..7279df61bd84b --- /dev/null +++ b/include/uapi/linux/sysgenid.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_SYSGENID_H +#define _UAPI_LINUX_SYSGENID_H + +#include + +#define SYSGENID_IOCTL 0xE4 +#define SYSGENID_SET_WATCHER_TRACKING _IO(SYSGENID_IOCTL, 1) +#define SYSGENID_WAIT_WATCHERS _IO(SYSGENID_IOCTL, 2) +#define SYSGENID_TRIGGER_GEN_UPDATE _IO(SYSGENID_IOCTL, 3) + +#ifdef __KERNEL__ +void sysgenid_bump_generation(void); +#endif /* __KERNEL__ */ + +#endif /* _UAPI_LINUX_SYSGENID_H */ + diff --git a/include/xen/events.h b/include/xen/events.h index 8ec418e30c7fb..4c174577ef512 100644 --- a/include/xen/events.h +++ 
b/include/xen/events.h @@ -84,6 +84,8 @@ static inline void notify_remote_via_evtchn(evtchn_port_t port) void notify_remote_via_irq(int irq); void xen_irq_resume(void); +void xen_shutdown_pirqs(void); +void xen_restore_pirqs(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 39a5580f8feb0..bd1d993676a97 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -37,9 +37,17 @@ void xen_time_setup_guest(void); void xen_manage_runstate_time(int action); void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); u64 xen_steal_clock(int cpu); +void xen_save_steal_clock(int cpu); +void xen_restore_steal_clock(int cpu); int xen_setup_shutdown_event(void); +bool xen_suspend_mode_is_xen_suspend(void); +bool xen_suspend_mode_is_pm_suspend(void); +bool xen_suspend_mode_is_pm_hibernation(void); + +void xen_setup_syscore_ops(void); + extern unsigned long *xen_contiguous_bitmap; #if defined(CONFIG_XEN_PV) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index bf3cfc7c35d0b..58190b842089d 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -113,6 +113,9 @@ struct xenbus_driver { int (*remove)(struct xenbus_device *dev); int (*suspend)(struct xenbus_device *dev); int (*resume)(struct xenbus_device *dev); + int (*freeze)(struct xenbus_device *dev); + int (*thaw)(struct xenbus_device *dev); + int (*restore)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *); struct device_driver driver; int (*read_otherend_details)(struct xenbus_device *dev); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92eb4769b0a35..b04f7bb67e564 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -8189,6 +8189,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; + skb->scm_io_uring = 1; skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); diff --git a/ipc/util.c b/ipc/util.c index bbb5190af6d9f..7c3601dad9bd5 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -754,21 +754,13 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, loff_t *new_pos) { - struct kern_ipc_perm *ipc; - int total, id; - - total = 0; - for (id = 0; id < pos && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) - total++; - } + struct kern_ipc_perm *ipc = NULL; + int max_idx = ipc_get_maxidx(ids); - ipc = NULL; - if (total >= ids->in_use) + if (max_idx == -1 || pos > max_idx) goto out; - for (; pos < ipc_mni; pos++) { + for (; pos <= max_idx; pos++) { ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { rcu_read_lock(); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c1b9f71ee6aac..d1249340fd6ba 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_i obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o +obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o 
obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 56cc5a915f670..c4898ca2fb594 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -63,6 +63,14 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c new file mode 100644 index 0000000000000..39a45fba4fb03 --- /dev/null +++ b/kernel/bpf/bpf_task_storage.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Facebook + * Copyright 2020 Google LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(task_cache); + +static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) +{ + struct task_struct *task = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data * +task_storage_lookup(struct task_struct *task, struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *task_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + + task_storage = rcu_dereference(bsb->storage); + if (!task_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit); +} + +void bpf_task_storage_free(struct task_struct *task) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_task_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_task(task); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_task_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_task_storage should always be true as long as + * local_storage->list was non-empty. 
+ */ + if (free_task_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = task_storage_lookup(task, map, true); + put_pid(pid); + return sdata ? sdata->data : NULL; +out: + put_pid(pid); + return ERR_PTR(err); +} + +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, map_flags); + + err = PTR_ERR_OR_ZERO(sdata); +out: + put_pid(pid); + return err; +} + +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = task_storage_lookup(task, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + err = task_storage_delete(task, map); +out: + put_pid(pid); + return err; +} + +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the task_storage_ptr is not + * NULL as task_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!task_storage_ptr(task)) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? 
(unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + return task_storage_delete(task, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); + return &smap->map; +} + +static void task_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int task_storage_map_btf_id; +const struct bpf_map_ops task_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = task_storage_map_alloc, + .map_free = task_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_pid_task_storage_lookup_elem, + .map_update_elem = bpf_pid_task_storage_update_elem, + .map_delete_elem = bpf_pid_task_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &task_storage_map_btf_id, + .map_owner_storage_ptr = task_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) + +const struct bpf_func_proto bpf_task_storage_get_proto = { + .func = bpf_task_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_task_storage_delete_proto = { + .func = bpf_task_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], +}; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 06c028bdb8d4d..7d4a7b0b7deec 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4535,10 +4535,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + u32 type, flag; - if (ctx_arg_info->offset == off && - (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || - ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + type = base_type(ctx_arg_info->reg_type); + flag = type_flag(ctx_arg_info->reg_type); + if (ctx_arg_info->offset == off && type == PTR_TO_BUF && + (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } @@ -5215,7 +5217,7 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ptr_off_reg(env, ®[i + 1], i + 1)) goto out; continue; } diff --git a/kernel/bpf/cgroup.c 
b/kernel/bpf/cgroup.c index 85927c2aa3433..54321df6cfac6 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1738,7 +1738,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0efe7c7bfe5e9..1a83f0572deff 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -505,7 +505,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -533,7 +533,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -605,7 +605,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -642,7 +642,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; @@ -655,7 +655,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6a9542af4212a..b0fa190b09790 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 1e4bf23528a3d..d6fbe17432ae5 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -463,7 +463,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aaad2dce2be6f..3f3b2a26a9743 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -785,7 +785,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != 
BPF_MAP_TYPE_TASK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { @@ -1980,6 +1981,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE; break; + case BPF_PROG_TYPE_SK_REUSEPORT: + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_SK_REUSEPORT_SELECT; + break; } } @@ -2056,6 +2062,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, if (expected_attach_type == BPF_SK_LOOKUP) return 0; return -EINVAL; + case BPF_PROG_TYPE_SK_REUSEPORT: + switch (expected_attach_type) { + case BPF_SK_REUSEPORT_SELECT: + case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: + return 0; + default: + return -EINVAL; + } case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index edb19ada0405d..a05b404373a4f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -404,18 +404,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } -static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL || - type == PTR_TO_RDONLY_BUF_OR_NULL || - type == PTR_TO_RDWR_BUF_OR_NULL; -} - static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && @@ -424,12 +412,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_MEM || - type == PTR_TO_MEM_OR_NULL; + return base_type(type) == PTR_TO_SOCKET || + base_type(type) == PTR_TO_TCP_SOCK || + base_type(type) == PTR_TO_MEM; +} + +static bool type_is_rdonly_mem(u32 type) +{ + return type & MEM_RDONLY; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -437,13 +427,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } -static bool arg_type_may_be_null(enum bpf_arg_type type) +static bool type_may_be_null(u32 type) { - return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_CTX_OR_NULL || - type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + return type & PTR_MAYBE_NULL; } /* Determine whether the function releases some resources allocated by another @@ -496,37 +482,54 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } -/* string representation of 'enum bpf_reg_type' */ -static const char * const reg_type_str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", - 
[PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_MEM_OR_NULL] = "mem_or_null", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", - [PTR_TO_RDWR_BUF] = "rdwr_buf", - [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", -}; +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +static const char *reg_type_str(struct bpf_verifier_env *env, + enum bpf_reg_type type) +{ + char postfix[16] = {0}, prefix[16] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_BUF] = "buf", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID || + base_type(type) == PTR_TO_PERCPU_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + if (type & MEM_RDONLY) + strncpy(prefix, "rdonly_", 16); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 16); + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); + return env->type_str_buf; +} static char slot_type_char[] = { [STACK_INVALID] = '?', @@ -592,7 +595,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && @@ -600,9 +603,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || - t == PTR_TO_BTF_ID_OR_NULL || - t == PTR_TO_PERCPU_BTF_ID) + if (base_type(t) == PTR_TO_BTF_ID || + base_type(t) == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -611,9 +613,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); - else if (t == CONST_PTR_TO_MAP || - t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_KEY || + base_type(t) == PTR_TO_MAP_VALUE) verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); @@ -683,7 +685,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (is_spilled_reg(&state->stack[i])) { reg = &state->stack[i].spilled_ptr; t = reg->type; - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -1086,6 +1088,28 @@ 
static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) +{ + if (base_type(reg->type) == PTR_TO_MAP_VALUE) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + return; + } + + reg->type &= ~PTR_MAYBE_NULL; +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@ -1581,7 +1605,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, break; if (parent->live & REG_LIVE_DONE) { verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str[parent->type], + reg_type_str(env, parent->type), parent->var_off.value, parent->off); return -EFAULT; } @@ -2223,9 +2247,8 @@ static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int fr static bool is_spillable_regtype(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_MAP_VALUE: - case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -2234,21 +2257,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: - case PTR_TO_RDONLY_BUF: - case PTR_TO_RDONLY_BUF_OR_NULL: - case PTR_TO_RDWR_BUF: - case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_BUF: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: - case PTR_TO_MEM_OR_NULL: return true; default: return false; @@ -3105,7 +3120,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) + if (base_type(*reg_type) == PTR_TO_BTF_ID) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3171,7 +3186,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, } verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str[reg->type], off, size); + regno, reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -3454,16 +3469,17 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. 
*/ - if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + if (!fixed_off_ok && reg->off) { + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; } @@ -3471,13 +3487,20 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } return 0; } +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -3874,15 +3897,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } - } else if (reg->type == PTR_TO_MEM) { + } else if (base_type(reg->type) == PTR_TO_MEM) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + + if (type_may_be_null(reg->type)) { + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + + if (t == BPF_WRITE && rdonly_mem) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into mem\n", value_regno); return -EACCES; } + err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); - if (!err && t == BPF_READ && value_regno >= 0) + if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -3894,7 +3932,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; @@ -3911,7 +3949,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (reg_type_may_be_null(reg_type)) + if (type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -3919,8 +3957,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. 
*/ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) + if (base_type(reg_type) == PTR_TO_BTF_ID) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -3971,7 +4008,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -3987,26 +4024,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (reg->type == PTR_TO_RDONLY_BUF) { - if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); - return -EACCES; + } else if (base_type(reg->type) == PTR_TO_BUF) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + const char *buf_info; + u32 *max_access; + + if (rdonly_mem) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; } + err = check_buffer_access(env, reg, regno, off, size, false, - "rdonly", - &env->prog->aux->max_rdonly_access); - if (!err && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, false, - "rdwr", - &env->prog->aux->max_rdwr_access); - if (!err && t == BPF_READ && value_regno >= 0) + buf_info, max_access); + + if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EACCES; } @@ -4049,7 +4092,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -4205,8 +4248,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const char *buf_info; + u32 *max_access; - switch (reg->type) { + switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, @@ -4222,18 +4267,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); - case PTR_TO_RDONLY_BUF: - if (meta && meta->raw_mode) - return -EACCES; - return check_buffer_access(env, reg, regno, reg->off, - access_size, zero_size_allowed, - "rdonly", - &env->prog->aux->max_rdonly_access); - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) + return -EACCES; + + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = 
&env->prog->aux->max_rdwr_access; + } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - "rdwr", - &env->prog->aux->max_rdwr_access); + buf_info, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -4245,9 +4292,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, register_is_null(reg)) return 0; - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); + verbose(env, "R%d type=%s ", regno, + reg_type_str(env, reg->type)); + verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; } } @@ -4335,9 +4382,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { - return type == ARG_PTR_TO_MEM || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_UNINIT_MEM; + return base_type(type) == ARG_PTR_TO_MEM || + base_type(type) == ARG_PTR_TO_UNINIT_MEM; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -4437,8 +4483,8 @@ static const struct bpf_reg_types mem_types = { PTR_TO_PACKET_META, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_RDONLY_BUF, - PTR_TO_RDWR_BUF, + PTR_TO_MEM | MEM_ALLOC, + PTR_TO_BUF, }, }; @@ -4454,7 +4500,7 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -4464,26 +4510,21 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_MAP_PTR] = &const_map_ptr_types, [ARG_PTR_TO_CTX] = &context_types, - [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, #ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, #endif [ARG_PTR_TO_SOCKET] = &fullsock_types, - [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, - [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, @@ -4498,11 +4539,41 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const struct bpf_reg_types *compatible; int i, j; - compatible = compatible_reg_types[arg_type]; + compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; 
} + + /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY + * + * Same for MAYBE_NULL: + * + * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL + * + * Therefore we fold these flags depending on the arg_type before comparison. + */ + if (arg_type & MEM_RDONLY) + type &= ~MEM_RDONLY; + if (arg_type & PTR_MAYBE_NULL) + type &= ~PTR_MAYBE_NULL; for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; @@ -4513,14 +4584,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) - verbose(env, "%s, ", reg_type_str[compatible->types[j]]); - verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); + verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: - if (type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID) { if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -4536,12 +4607,6 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, kernel_type_name(*arg_btf_id)); return -EACCES; } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } } return 0; @@ -4579,15 +4644,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; } - if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + if (register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking.
*/ @@ -4597,10 +4661,33 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type == ARG_PTR_TO_ALLOC_MEM) + goto force_off_check; + break; + /* All the rest must be rejected: */ + default: +force_off_check: + err = __check_ptr_off_reg(env, reg, regno, + type == PTR_TO_BTF_ID); if (err < 0) return err; + break; } skip_type_check: @@ -4634,10 +4721,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE || - (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && - !register_is_null(reg)) || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (type_may_be_null(arg_type) && register_is_null(reg)) + return 0; + /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ @@ -4880,6 +4968,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_inode_storage_delete) goto error; break; + case BPF_MAP_TYPE_TASK_STORAGE: + if (func_id != BPF_FUNC_task_storage_get && + func_id != BPF_FUNC_task_storage_delete) + goto error; + break; default: break; } @@ -4964,6 +5057,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; + case BPF_FUNC_task_storage_get: + case BPF_FUNC_task_storage_delete: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + goto error; + break; default: break; } @@ -5410,6 +5508,8 @@ static int check_reference_leak(struct bpf_verifier_env *env) static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; + enum bpf_return_type ret_type; + enum bpf_type_flag ret_flag; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; @@ -5521,13 +5621,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* update return register (already marked as written above) */ - if (fn->ret_type == RET_INTEGER) { + ret_type = fn->ret_type; + ret_flag = type_flag(fn->ret_type); + if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (fn->ret_type == RET_VOID) { + } else if (ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || - fn->ret_type == RET_PTR_TO_MAP_VALUE) { + } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -5540,28 +5641,25 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - if (fn->ret_type == 
RET_PTR_TO_MAP_VALUE) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - if (map_value_has_spin_lock(meta.map_ptr)) - regs[BPF_REG_0].id = ++env->id_gen; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; + if (!type_may_be_null(ret_type) && + map_value_has_spin_lock(meta.map_ptr)) { + regs[BPF_REG_0].id = ++env->id_gen; } - } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -5579,35 +5677,39 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). 
+ */ + ret_flag &= ~MEM_RDONLY; + + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { - verbose(env, "invalid return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "invalid return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), + func_id); return -EINVAL; } regs[BPF_REG_0].btf_id = ret_btf_id; } else { - verbose(env, "unknown return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "unknown return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } - if (reg_type_may_be_null(regs[BPF_REG_0].type)) + if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; if (is_ptr_cast_function(func_id)) { @@ -5708,25 +5810,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", - reg_type_str[type], val); + reg_type_str(env, type), val); return false; } if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str[type], reg->off); + reg_type_str(env, type), reg->off); return false; } if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", - reg_type_str[type]); + reg_type_str(env, type)); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", - smin, reg_type_str[type]); + smin, reg_type_str(env, type)); return false; } @@ -6103,11 +6205,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - switch (ptr_reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: + if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; + } + + switch (base_type(ptr_reg->type)) { case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) @@ -6120,10 +6224,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: reject: verbose(env, "R%d pointer arithmetic on %s prohibited\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; default: - if (reg_type_may_be_null(ptr_reg->type)) + if (type_may_be_null(ptr_reg->type)) goto reject; break; } @@ -7816,7 +7920,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id && + if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0) || @@ -7830,43 +7934,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - const struct bpf_map *map = 
reg->map_ptr; - - if (map->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = map->inner_map_meta; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - reg->type = PTR_TO_XDP_SOCK; - } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH) { - reg->type = PTR_TO_SOCKET; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; - } - if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ reg->id = 0; reg->ref_obj_id = 0; - } else if (!reg_may_point_to_spin_lock(reg)) { + + return; + } + + mark_ptr_not_null_reg(reg); + + if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reference(). * @@ -8190,7 +8270,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - reg_type_may_be_null(dst_reg->type)) { + type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ @@ -8241,11 +8321,15 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. 
+ */ + mark_reg_known_zero(env, regs, insn->dst_reg); + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; - switch (dst_reg->type) { + switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; @@ -8261,7 +8345,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { @@ -8363,7 +8446,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err; @@ -8419,7 +8502,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -8480,7 +8563,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -9228,7 +9311,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (rcur->type == NOT_INIT) return false; - switch (rold->type) { + switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) return false; @@ -9249,6 +9332,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; } case PTR_TO_MAP_VALUE: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. + */ + if (type_may_be_null(rold->type)) { + if (!type_may_be_null(rcur->type)) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + } + /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. * 'id' is not compared, since it's only used for maps with @@ -9260,20 +9359,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_MAP_VALUE_OR_NULL: - /* a PTR_TO_MAP_VALUE could be safe to use as a - * PTR_TO_MAP_VALUE_OR_NULL into the same map. - * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- - * checked, doing so could have affected others with the same - * id, and we can't check for that because we lost the id when - * we converted to a PTR_TO_MAP_VALUE. 
- */ - if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) - return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) - return false; - /* Check our ids match any regs they're supposed to */ - return check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -9302,11 +9387,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -9820,17 +9902,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* Return true if it's OK to have the same insn return a different type. */ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_CTX: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -10048,7 +10126,7 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -10256,7 +10334,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, tname, PTR_ERR(ret)); return -EINVAL; } - aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; @@ -10331,11 +10409,21 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog_type) || - prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && - map_value_has_spin_lock(map)) { - verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; + if (map_value_has_spin_lock(map)) { + if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (prog->aux->sleepable) { + verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c99de4a214588..7ee3515d29be8 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -20,6 +20,16 @@ config DMA_OPS config DMA_OPS_BYPASS bool +config DMA_PAGE_TOUCHING + bool "Support touching pages when allocated for DMA" + help + Builds in support for binding page touching DMA ops to devices which + don't have an IOMMU. Memory mapped for DMA by those devices will be + accessed by the CPU via the page touching dma_map_ops to ensure that + the memory is resident when running on a memory overcommit host. + The capability must still be set up at boot time via the + page_touching.dma_page_touching_enable kernel command line param.
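A minimal usage sketch, assuming the parameter name given in the help text above: besides building the kernel with CONFIG_DMA_PAGE_TOUCHING=y, enabling the capability at boot would mean appending

    page_touching.dma_page_touching_enable=y

to the existing kernel command line; any other command line parameters stay as they are.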
+ config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index dc755ab68aabf..242d75defc736 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o +obj-$(CONFIG_DMA_PAGE_TOUCHING) += page_touching.o diff --git a/kernel/dma/page_touching.c b/kernel/dma/page_touching.c new file mode 100644 index 0000000000000..c5ffb90a40a51 --- /dev/null +++ b/kernel/dma/page_touching.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "direct.h" +#include + +/* + * A wrapper around dma_direct which does a readb on the memory being mapped + * for DMA to ensure that it becomes resident. + * Useful when running in a memory overcommit environment with lazy allocation + * and free page reporting. 
+ */ + +/* + * Set with kernel cmd line param: + * page_touching.dma_page_touching_enable=y + */ +static bool dma_page_touching_enable __ro_after_init; +module_param_named(dma_page_touching_enable, dma_page_touching_enable, bool, 0400); +MODULE_PARM_DESC(dma_page_touching_enable, + "Touch pages allocated for DMA to ensure they are resident"); + +static void touch_each_page(void *start_addr, size_t size) +{ + int addr_offset; + + for (addr_offset = 0; addr_offset < size; addr_offset += PAGE_SIZE) + __raw_readb((char *)start_addr + addr_offset); +} + +static void *page_touching_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + char *kaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); + + if (!kaddr) + return NULL; + touch_each_page(kaddr, size); + return kaddr; + +} + +static dma_addr_t page_touching_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t dma_handle = dma_direct_map_page(dev, page, offset, size, dir, attrs); + + if (!(dma_mapping_error(dev, dma_handle))) + touch_each_page(page_to_virt(page) + offset, size); + return dma_handle; +} + +static int page_touching_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *sg; + int i, ret = dma_direct_map_sg(dev, sglist, nents, dir, attrs); + + if (!ret) + goto out; + + for_each_sg(sglist, sg, nents, i) + touch_each_page(page_to_virt(sg_page(sg)) + sg->offset, sg->length); + +out: + return ret; + +} + +/* + * Only a portion of the dma_map_ops interface is implemented here; enough for + * the EC2 ENA / NVMe drivers to work. + * Notably missing is alloc_pages.
+ */ +static const struct dma_map_ops page_touching_dma_ops = { + .alloc = page_touching_dma_alloc, + .free = dma_direct_free, + .mmap = dma_common_mmap, + .map_page = page_touching_dma_map_page, + .unmap_page = dma_direct_unmap_page, + .map_sg = page_touching_dma_map_sg, + .unmap_sg = dma_direct_unmap_sg, + .dma_supported = dma_direct_supported, + .sync_single_for_cpu = dma_direct_sync_single_for_cpu, + .sync_single_for_device = dma_direct_sync_single_for_device, + .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, + .get_required_mask = dma_direct_get_required_mask, + .max_mapping_size = dma_direct_max_mapping_size, +}; + +void setup_dma_page_touching_ops(struct device *dev) +{ + if (!dma_page_touching_enable || dev->dma_ops) + return; + + dev_info(dev, "binding to page touching DMA ops\n"); + dev->dma_ops = &page_touching_dma_ops; +} diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e7d284261d450..75c7aed4deaa6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -173,11 +173,11 @@ static void irq_state_clr_masked(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_clr_started(struct irq_desc *desc) +void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); } - +EXPORT_SYMBOL_GPL(irq_state_clr_started); static void irq_state_set_started(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index b04b87a4e0a7b..f4195d78db29d 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -416,8 +416,11 @@ void klp_try_complete_transition(void) for_each_possible_cpu(cpu) { task = idle_task(cpu); if (cpu_online(cpu)) { - if (!klp_try_switch_task(task)) + if (!klp_try_switch_task(task)) { complete = false; + /* Make idle task go through the main loop. */ + wake_up_if_idle(cpu); + } } else if (task->patch_state != klp_target_state) { /* offline idle tasks can be switched immediately */ clear_tsk_thread_flag(task, TIF_PATCH_PENDING); diff --git a/kernel/module.c b/kernel/module.c index 33d1dc6d4cd6a..6a0fd245c0483 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3661,8 +3661,7 @@ static bool finished_loading(const char *name) sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE - || mod->state == MODULE_STATE_GOING; + ret = !mod || mod->state == MODULE_STATE_LIVE; mutex_unlock(&module_mutex); return ret; @@ -3828,35 +3827,20 @@ static int add_unformed_module(struct module *mod) mod->state = MODULE_STATE_UNFORMED; +again: mutex_lock(&module_mutex); old = find_module_all(mod->name, strlen(mod->name), true); if (old != NULL) { - if (old->state == MODULE_STATE_COMING - || old->state == MODULE_STATE_UNFORMED) { + if (old->state != MODULE_STATE_LIVE) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); err = wait_event_interruptible(module_wq, finished_loading(mod->name)); if (err) goto out_unlocked; - - /* The module might have gone in the meantime. */ - mutex_lock(&module_mutex); - old = find_module_all(mod->name, strlen(mod->name), - true); + goto again; } - - /* - * We are here only when the same module was being loaded. Do - * not try to load it again right now. It prevents long delays - * caused by serialized module load failures. It might happen - * when more devices of the same type trigger load of - * a particular module.
- */ - if (old && old->state == MODULE_STATE_LIVE) - err = -EEXIST; - else - err = -EBUSY; + err = -EEXIST; goto out; } mod_update_bounds(mod); diff --git a/kernel/power/user.c b/kernel/power/user.c index 13cca2e2c2bc6..2b2535dc2ea26 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -239,6 +239,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data, if (data->swap < 0) return swdev ? -ENODEV : -EINVAL; data->dev = swdev; + + swsusp_resume_device = swdev; + swsusp_resume_block = offset; + return 0; } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 12bca64dff731..fc7bf3ef711e6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -116,7 +116,7 @@ static void __scd_stamp(struct sched_clock_data *scd) scd->tick_raw = sched_clock(); } -static void __set_sched_clock_stable(void) +void set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -236,7 +236,7 @@ static int __init sched_clock_init_late(void) smp_mb(); /* matches {set,clear}_sched_clock_stable() */ if (__sched_clock_stable_early) - __set_sched_clock_stable(); + set_sched_clock_stable(); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d6dd14cfd261..49f1ae8fd3236 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2639,15 +2639,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - smp_send_reschedule(cpu); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - } + rq_lock_irqsave(rq, &rf); + if (is_idle_task(rq->curr)) + resched_curr(rq); + /* Else CPU is not idle, do nothing here: */ + rq_unlock_irqrestore(rq, &rf); out: rcu_read_unlock(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d53f57ac76094..7469c812623ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1546,6 +1546,7 @@ struct task_numa_env { int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1560,7 +1561,8 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int nr_running); +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1940,7 +1942,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, dst_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running, + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -2005,8 +2008,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -9085,6 +9090,16 @@ static bool update_pick_idlest(struct sched_group *idlest, return true; } +/* + * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. 
+ * This is an approximation as the number of running tasks may not be + * related to the number of busy CPUs due to sched_setaffinity. + */ +static inline bool allow_numa_imbalance(int running, int imb_numa_nr) +{ + return running <= imb_numa_nr; +} + /* * find_idlest_group() finds and returns the least busy CPU group within the * domain. @@ -9103,9 +9118,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) .group_type = group_overloaded, }; - imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - do { int local_group; @@ -9159,6 +9171,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) switch (local_sgs.group_type) { case group_overloaded: case group_fully_busy: + + /* Calculate allowed imbalance based on load */ + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + /* * When comparing groups across NUMA domains, it's possible for * the local domain to be very lightly loaded relative to the @@ -9210,12 +9227,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; #endif /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will - * take care of it. + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed. If there is a real need of migration, + * periodic load balance will take care of it. */ - if (local_sgs.idle_cpus) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) return NULL; } @@ -9317,16 +9335,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } } -static inline long adjust_numa_imbalance(int imbalance, int nr_running) +#define NUMA_IMBALANCE_MIN 2 + +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int imb_numa_nr) { - unsigned int imbalance_min; + if (!allow_numa_imbalance(dst_running, imb_numa_nr)) + return imbalance; /* * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the source domain is almost idle. + * tasks that remain local when the destination is lightly loaded. */ - imbalance_min = 2; - if (nr_running <= imbalance_min) + if (imbalance <= NUMA_IMBALANCE_MIN) return 0; return imbalance; @@ -9429,9 +9450,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* Consider allowing a small imbalance between NUMA groups */ - if (env->sd->flags & SD_NUMA) + if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running); + local->sum_nr_running + 1, env->sd->imb_numa_nr); + } return; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ff2c6d3ba6c79..94f1e6299aa19 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2035,6 +2035,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. 
+ */ + for_each_cpu(i, cpu_map) { + unsigned int imb = 0; + unsigned int imb_span = 1; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + struct sched_domain *child = sd->child; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && + (child->flags & SD_SHARE_PKG_RESOURCES)) { + struct sched_domain *top, *top_p; + unsigned int nr_llcs; + + /* + * For a single LLC per node, allow an + * imbalance up to 25% of the node. This is an + * arbitrary cutoff based on SMT-2 to balance + * between memory bandwidth and avoiding + * premature sharing of HT resources and SMT-4 + * or SMT-8 *may* benefit from a different + * cutoff. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. + */ + nr_llcs = sd->span_weight / child->span_weight; + if (nr_llcs == 1) + imb = sd->span_weight >> 2; + else + imb = nr_llcs; + sd->imb_numa_nr = imb; + + /* Set span based on the first NUMA domain. */ + top = sd; + top_p = top->parent; + while (top_p && !(top_p->flags & SD_NUMA)) { + top = top->parent; + top_p = top->parent; + } + imb_span = top_p ? top_p->span_weight : sd->span_weight; + } else { + int factor = max(1U, (sd->span_weight / imb_span)); + + sd->imb_numa_nr = imb * factor; + } + } + } + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) diff --git a/kernel/smp.c b/kernel/smp.c index b0684b4c111e9..8ba0fd953f001 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -952,14 +952,12 @@ void wake_up_all_idle_cpus(void) { int cpu; - preempt_disable(); - for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) - continue; - - wake_up_if_idle(cpu); + for_each_possible_cpu(cpu) { + preempt_disable(); + if (cpu != smp_processor_id() && cpu_online(cpu)) + wake_up_if_idle(cpu); + preempt_enable(); } - preempt_enable(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 86e0fbe583f2b..7fd99cb7c22fe 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1108,6 +1108,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); + if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) + cs->id = CSID_GENERIC; if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. 
Disabling VDSO support.\n", diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d9b48f7a35e0d..630d00fe7ee3d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1055,6 +1055,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); + systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e87e638c31bdf..f7d3a108e27c9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2050,26 +2050,28 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); /** - * usleep_range - Sleep for an approximate time - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range() instead of udelay(). The sleep improves responsiveness + * usleep_range_state() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. */ -void __sched usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range_state(unsigned long min, unsigned long max, + unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } -EXPORT_SYMBOL(usleep_range); +EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1de9a6bf84711..279bf9042cd26 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -342,7 +342,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -545,7 +545,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, }; @@ -754,9 +754,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -771,7 +771,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | 
MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -795,7 +795,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -956,7 +956,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1021,6 +1021,20 @@ const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_current_task_btf) +{ + return (unsigned long) current; +} + +BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) + +static const struct bpf_func_proto bpf_get_current_task_btf_proto = { + .func = bpf_get_current_task_btf, + .gpl_only = true, + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_get_current_btf_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1278,6 +1292,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: @@ -1422,7 +1438,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1640,7 +1656,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1694,7 +1710,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -1733,6 +1749,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/mm/Kconfig b/mm/Kconfig index 390165ffbb0fc..ecadb0fb6cd47 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -183,6 +183,11 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +config MHP_MEMMAP_ON_MEMORY + def_bool y + depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP + depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. 
@@ -749,10 +754,18 @@ config DEFERRED_STRUCT_PAGE_INIT lifetime of the system until these kthreads finish the initialisation. +config PAGE_IDLE_FLAG + bool + select PAGE_EXTENSION if !64BIT + help + This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed + bit writers can set the state of the bit in the flags so that PTE + Accessed bit readers may avoid disturbance. + config IDLE_PAGE_TRACKING bool "Enable idle page tracking" depends on SYSFS && MMU - select PAGE_EXTENSION if !64BIT + select PAGE_IDLE_FLAG help This feature allows to estimate the amount of user pages that have not been touched during a given period of time. This information can @@ -859,4 +872,6 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +source "mm/damon/Kconfig" + endmenu diff --git a/mm/Makefile b/mm/Makefile index d73aed0fc99c1..0096744d090bd 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ page-alloc-y := page_alloc.o page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o +# Give 'memory_hotplug' its own module-parameter namespace +memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o +obj-y += $(memory-hotplug-y) ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o @@ -82,7 +86,6 @@ obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o -obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o @@ -112,6 +115,7 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o +obj-$(CONFIG_DAMON) += damon/ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig new file mode 100644 index 0000000000000..436c6b4cb5ec5 --- /dev/null +++ b/mm/damon/Kconfig @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Data Access Monitoring" + +config DAMON + bool "DAMON: Data Access Monitoring Framework" + help + This builds a framework that allows kernel subsystems to monitor + access frequency of each memory region. The information can be useful + for performance-centric DRAM level memory management. + + See https://damonitor.github.io/doc/html/latest-damon/index.html for + more information. + +config DAMON_KUNIT_TEST + bool "Test for damon" if !KUNIT_ALL_TESTS + depends on DAMON && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + +config DAMON_VADDR + bool "Data access monitoring operations for virtual address spaces" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring operations for DAMON + that work for virtual address spaces. + +config DAMON_PADDR + bool "Data access monitoring operations for the physical address space" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring operations for DAMON + that works for the physical address space. 
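A minimal config fragment sketch, assuming a build that wants both of the monitoring operations sets above; CONFIG_PAGE_IDLE_FLAG does not need to be set by hand because both entries select it:

    CONFIG_DAMON=y
    CONFIG_DAMON_VADDR=y
    CONFIG_DAMON_PADDR=y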
+ +config DAMON_VADDR_KUNIT_TEST + bool "Test for DAMON operations" if !KUNIT_ALL_TESTS + depends on DAMON_VADDR && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON virtual addresses operations Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + +config DAMON_SYSFS + bool "DAMON sysfs interface" + depends on DAMON && SYSFS + help + This builds the sysfs interface for DAMON. The user space can use + the interface for arbitrary data access monitoring. + +config DAMON_DBGFS + bool "DAMON debugfs interface (DEPRECATED!)" + depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS + help + This builds the debugfs interface for DAMON. The user space admins + can use the interface for arbitrary data access monitoring. + + If unsure, say N. + + This is deprecated, so users should move to the sysfs interface + (DAMON_SYSFS). If you depend on this and cannot move, please report + your usecase to damon@lists.linux.dev and linux-mm@kvack.org. + +config DAMON_DBGFS_KUNIT_TEST + bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + depends on DAMON_DBGFS && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON debugfs interface Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + +config DAMON_RECLAIM + bool "Build DAMON-based reclaim (DAMON_RECLAIM)" + depends on DAMON_PADDR + help + This builds the DAMON-based reclamation subsystem. It finds pages + that not accessed for a long time (cold) using DAMON and reclaim + those. + + This is suggested to be used as a proactive and lightweight + reclamation under light memory pressure, while the traditional page + scanning-based reclamation is used for heavy pressure. + +config DAMON_LRU_SORT + bool "Build DAMON-based LRU-lists sorting (DAMON_LRU_SORT)" + depends on DAMON_PADDR + help + This builds the DAMON-based LRU-lists sorting subsystem. It tries to + protect frequently accessed (hot) pages while rarely accessed (cold) + pages reclaimed first under memory pressure. + +endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile new file mode 100644 index 0000000000000..f7add3f4aa793 --- /dev/null +++ b/mm/damon/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := core.o +obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o +obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o +obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h new file mode 100644 index 0000000000000..fae64d32b9257 --- /dev/null +++ b/mm/damon/core-test.h @@ -0,0 +1,343 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_KUNIT_TEST + +#ifndef _DAMON_CORE_TEST_H +#define _DAMON_CORE_TEST_H + +#include + +static void damon_test_regions(struct kunit *test) +{ + struct damon_region *r; + struct damon_target *t; + + r = damon_new_region(1, 2); + KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); + KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + + t = damon_new_target(); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t)); + + damon_del_region(r, t); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_free_target(t); +} + +static unsigned int nr_damon_targets(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_targets = 0; + + damon_for_each_target(t, ctx) + nr_targets++; + + return nr_targets; +} + +static void damon_test_target(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + + t = damon_new_target(); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_add_target(c, t); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c)); + + damon_destroy_target(t); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_destroy_ctx(c); +} + +/* + * Test kdamond_reset_aggregated() + * + * DAMON checks access to each region and aggregates this information as the + * access frequency of each region. In detail, it increases '->nr_accesses' of + * regions that an access has confirmed. 'kdamond_reset_aggregated()' flushes + * the aggregated information ('->nr_accesses' of each regions) to the result + * buffer. As a result of the flushing, the '->nr_accesses' of regions are + * initialized to zero. + */ +static void damon_test_aggregate(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; + unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; + unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; + struct damon_target *t; + struct damon_region *r; + int it, ir; + + for (it = 0; it < 3; it++) { + t = damon_new_target(); + damon_add_target(ctx, t); + } + + it = 0; + damon_for_each_target(t, ctx) { + for (ir = 0; ir < 3; ir++) { + r = damon_new_region(saddr[it][ir], eaddr[it][ir]); + r->nr_accesses = accesses[it][ir]; + damon_add_region(r, t); + } + it++; + } + kdamond_reset_aggregated(ctx); + it = 0; + damon_for_each_target(t, ctx) { + ir = 0; + /* '->nr_accesses' should be zeroed */ + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + ir++; + } + /* regions should be preserved */ + KUNIT_EXPECT_EQ(test, 3, ir); + it++; + } + /* targets also should be preserved */ + KUNIT_EXPECT_EQ(test, 3, it); + + damon_destroy_ctx(ctx); +} + +static void damon_test_split_at(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(); + r = damon_new_region(0, 100); + damon_add_region(r, t); + damon_split_region_at(t, r, 25); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); + + r = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r->ar.start, 25ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 100ul); + + damon_free_target(t); + damon_destroy_ctx(c); +} + +static void damon_test_merge_two(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r, *r2, *r3; + int i; + + t = damon_new_target(); + r = damon_new_region(0, 100); + r->nr_accesses = 10; 
+ damon_add_region(r, t); + r2 = damon_new_region(100, 300); + r2->nr_accesses = 20; + damon_add_region(r2, t); + + damon_merge_two_regions(t, r, r2); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 300ul); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u); + + i = 0; + damon_for_each_region(r3, t) { + KUNIT_EXPECT_PTR_EQ(test, r, r3); + i++; + } + KUNIT_EXPECT_EQ(test, i, 1); + + damon_free_target(t); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +static void damon_test_merge_regions_of(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184}; + unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230}; + unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2}; + + unsigned long saddrs[] = {0, 114, 130, 156, 170}; + unsigned long eaddrs[] = {112, 130, 156, 170, 230}; + int i; + + t = damon_new_target(); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); + r->nr_accesses = nrs[i]; + damon_add_region(r, t); + } + + damon_merge_regions_of(t, 9, 9999); + /* 0-112, 114-130, 130-156, 156-170 */ + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + for (i = 0; i < 5; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]); + KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]); + } + damon_free_target(t); +} + +static void damon_test_split_regions_of(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(); + r = damon_new_region(0, 22); + damon_add_region(r, t); + damon_split_regions_of(t, 2); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); + damon_free_target(t); + + t = damon_new_target(); + r = damon_new_region(0, 220); + damon_add_region(r, t); + damon_split_regions_of(t, 4); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); + damon_free_target(t); + damon_destroy_ctx(c); +} + +static void damon_test_ops_registration(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_operations ops, bak; + + /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */ + KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0); + KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0); + + /* Double-registration is prohibited */ + ops.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + ops.id = DAMON_OPS_PADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + + /* Unknown ops id cannot be registered */ + KUNIT_EXPECT_EQ(test, damon_select_ops(c, NR_DAMON_OPS), -EINVAL); + + /* Registration should success after unregistration */ + mutex_lock(&damon_ops_lock); + bak = damon_registered_ops[DAMON_OPS_VADDR]; + damon_registered_ops[DAMON_OPS_VADDR] = (struct damon_operations){}; + mutex_unlock(&damon_ops_lock); + + ops.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), 0); + + mutex_lock(&damon_ops_lock); + damon_registered_ops[DAMON_OPS_VADDR] = bak; + mutex_unlock(&damon_ops_lock); + + /* Check double-registration failure again */ + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); +} + +static void damon_test_set_regions(struct kunit *test) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r1 = damon_new_region(4, 16); + struct damon_region *r2 = damon_new_region(24, 
32); + struct damon_addr_range range = {.start = 8, .end = 28}; + unsigned long expects[] = {8, 16, 16, 24, 24, 28}; + int expect_idx = 0; + struct damon_region *r; + + damon_add_region(r1, t); + damon_add_region(r2, t); + damon_set_regions(t, &range, 1); + + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + } + damon_destroy_target(t); +} + +static void damon_test_update_monitoring_result(struct kunit *test) +{ + struct damon_attrs old_attrs = { + .sample_interval = 10, .aggr_interval = 1000,}; + struct damon_attrs new_attrs; + struct damon_region *r = damon_new_region(3, 7); + + r->nr_accesses = 15; + r->age = 20; + + new_attrs = (struct damon_attrs){ + .sample_interval = 100, .aggr_interval = 10000,}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 15); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 1000}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 100}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 20); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_target), + KUNIT_CASE(damon_test_regions), + KUNIT_CASE(damon_test_aggregate), + KUNIT_CASE(damon_test_split_at), + KUNIT_CASE(damon_test_merge_two), + KUNIT_CASE(damon_test_merge_regions_of), + KUNIT_CASE(damon_test_split_regions_of), + KUNIT_CASE(damon_test_ops_registration), + KUNIT_CASE(damon_test_set_regions), + KUNIT_CASE(damon_test_update_monitoring_result), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_CORE_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/core.c b/mm/damon/core.c new file mode 100644 index 0000000000000..eb9580942a5c3 --- /dev/null +++ b/mm/damon/core.c @@ -0,0 +1,1471 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data Access Monitor + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon: " fmt + +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#ifdef CONFIG_DAMON_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + +static DEFINE_MUTEX(damon_lock); +static int nr_running_ctxs; +static bool running_exclusive_ctxs; + +static DEFINE_MUTEX(damon_ops_lock); +static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; + +static struct kmem_cache *damon_region_cache __ro_after_init; + +/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ +static bool __damon_is_registered_ops(enum damon_ops_id id) +{ + struct damon_operations empty_ops = {}; + + if (!memcmp(&empty_ops, &damon_registered_ops[id], sizeof(empty_ops))) + return false; + return true; +} + +/** + * damon_is_registered_ops() - Check if a given damon_operations is registered. + * @id: Id of the damon_operations to check if registered. + * + * Return: true if the ops is set, false otherwise. 
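+ *
+ * For example, damon_is_registered_ops(DAMON_OPS_VADDR) returns true once
+ * the virtual address spaces monitoring operations set has been registered,
+ * which is normally done from a subsys_initcall.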
+ */ +bool damon_is_registered_ops(enum damon_ops_id id) +{ + bool registered; + + if (id >= NR_DAMON_OPS) + return false; + mutex_lock(&damon_ops_lock); + registered = __damon_is_registered_ops(id); + mutex_unlock(&damon_ops_lock); + return registered; +} + +/** + * damon_register_ops() - Register a monitoring operations set to DAMON. + * @ops: monitoring operations set to register. + * + * This function registers a monitoring operations set of valid &struct + * damon_operations->id so that others can find and use them later. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_register_ops(struct damon_operations *ops) +{ + int err = 0; + + if (ops->id >= NR_DAMON_OPS) + return -EINVAL; + mutex_lock(&damon_ops_lock); + /* Fail for already registered ops */ + if (__damon_is_registered_ops(ops->id)) { + err = -EINVAL; + goto out; + } + damon_registered_ops[ops->id] = *ops; +out: + mutex_unlock(&damon_ops_lock); + return err; +} + +/** + * damon_select_ops() - Select a monitoring operations to use with the context. + * @ctx: monitoring context to use the operations. + * @id: id of the registered monitoring operations to select. + * + * This function finds registered monitoring operations set of @id and make + * @ctx to use it. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) +{ + int err = 0; + + if (id >= NR_DAMON_OPS) + return -EINVAL; + + mutex_lock(&damon_ops_lock); + if (!__damon_is_registered_ops(id)) + err = -EINVAL; + else + ctx->ops = damon_registered_ops[id]; + mutex_unlock(&damon_ops_lock); + return err; +} + +/* + * Construct a damon_region struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_region *damon_new_region(unsigned long start, unsigned long end) +{ + struct damon_region *region; + + region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); + if (!region) + return NULL; + + region->ar.start = start; + region->ar.end = end; + region->nr_accesses = 0; + INIT_LIST_HEAD(®ion->list); + + region->age = 0; + region->last_nr_accesses = 0; + + return region; +} + +void damon_add_region(struct damon_region *r, struct damon_target *t) +{ + list_add_tail(&r->list, &t->regions_list); + t->nr_regions++; +} + +static void damon_del_region(struct damon_region *r, struct damon_target *t) +{ + list_del(&r->list); + t->nr_regions--; +} + +static void damon_free_region(struct damon_region *r) +{ + kmem_cache_free(damon_region_cache, r); +} + +void damon_destroy_region(struct damon_region *r, struct damon_target *t) +{ + damon_del_region(r, t); + damon_free_region(r); +} + +/* + * Check whether a region is intersecting an address range + * + * Returns true if it is. + */ +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * Fill holes in regions with new regions. + */ +static int damon_fill_regions_holes(struct damon_region *first, + struct damon_region *last, struct damon_target *t) +{ + struct damon_region *r = first; + + damon_for_each_region_from(r, t) { + struct damon_region *next, *newr; + + if (r == last) + break; + next = damon_next_region(r); + if (r->ar.end != next->ar.start) { + newr = damon_new_region(r->ar.end, next->ar.start); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, r, next, t); + } + } + return 0; +} + +/* + * damon_set_regions() - Set regions of a target for given address ranges. 
+ * @t: the given target. + * @ranges: array of new monitoring target ranges. + * @nr_ranges: length of @ranges. + * + * This function adds new regions to, or modify existing regions of a + * monitoring target to fit in specific ranges. + * + * Return: 0 if success, or negative error code otherwise. + */ +int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, + unsigned int nr_ranges) +{ + struct damon_region *r, *next; + unsigned int i; + int err; + + /* Remove regions which are not in the new ranges */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < nr_ranges; i++) { + if (damon_intersect(r, &ranges[i])) + break; + } + if (i == nr_ranges) + damon_destroy_region(r, t); + } + + r = damon_first_region(t); + /* Add new regions or resize existing regions to fit in the ranges */ + for (i = 0; i < nr_ranges; i++) { + struct damon_region *first = NULL, *last, *newr; + struct damon_addr_range *range; + + range = &ranges[i]; + /* Get the first/last regions intersecting with the range */ + damon_for_each_region_from(r, t) { + if (damon_intersect(r, range)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= range->end) + break; + } + if (!first) { + /* no region intersects with this range */ + newr = damon_new_region( + ALIGN_DOWN(range->start, + DAMON_MIN_REGION), + ALIGN(range->end, DAMON_MIN_REGION)); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, damon_prev_region(r), r, t); + } else { + /* resize intersecting regions to fit in this range */ + first->ar.start = ALIGN_DOWN(range->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + + /* fill possible holes in the range */ + err = damon_fill_regions_holes(first, last, t); + if (err) + return err; + } + } + return 0; +} + +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching) +{ + struct damos_filter *filter; + + filter = kmalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return NULL; + filter->type = type; + filter->matching = matching; + INIT_LIST_HEAD(&filter->list); + return filter; +} + +void damos_add_filter(struct damos *s, struct damos_filter *f) +{ + list_add_tail(&f->list, &s->filters); +} + +static void damos_del_filter(struct damos_filter *f) +{ + list_del(&f->list); +} + +static void damos_free_filter(struct damos_filter *f) +{ + kfree(f); +} + +void damos_destroy_filter(struct damos_filter *f) +{ + damos_del_filter(f); + damos_free_filter(f); +} + +/* initialize private fields of damos_quota and return the pointer */ +static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +{ + quota->total_charged_sz = 0; + quota->total_charged_ns = 0; + quota->esz = 0; + quota->charged_sz = 0; + quota->charged_from = 0; + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return quota; +} + +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) +{ + struct damos *scheme; + + scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); + if (!scheme) + return NULL; + scheme->pattern = *pattern; + scheme->action = action; + INIT_LIST_HEAD(&scheme->filters); + scheme->stat = (struct damos_stat){}; + INIT_LIST_HEAD(&scheme->list); + + scheme->quota = *(damos_quota_init_priv(quota)); + + scheme->wmarks = *wmarks; + scheme->wmarks.activated = true; + + return scheme; +} + +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s) +{ + list_add_tail(&s->list, &ctx->schemes); +} + +static void 
damon_del_scheme(struct damos *s) +{ + list_del(&s->list); +} + +static void damon_free_scheme(struct damos *s) +{ + kfree(s); +} + +void damon_destroy_scheme(struct damos *s) +{ + struct damos_filter *f, *next; + + damos_for_each_filter_safe(f, next, s) + damos_destroy_filter(f); + damon_del_scheme(s); + damon_free_scheme(s); +} + +/* + * Construct a damon_target struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_target *damon_new_target(void) +{ + struct damon_target *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + + t->pid = NULL; + t->nr_regions = 0; + INIT_LIST_HEAD(&t->regions_list); + INIT_LIST_HEAD(&t->list); + + return t; +} + +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) +{ + list_add_tail(&t->list, &ctx->adaptive_targets); +} + +bool damon_targets_empty(struct damon_ctx *ctx) +{ + return list_empty(&ctx->adaptive_targets); +} + +static void damon_del_target(struct damon_target *t) +{ + list_del(&t->list); +} + +void damon_free_target(struct damon_target *t) +{ + struct damon_region *r, *next; + + damon_for_each_region_safe(r, next, t) + damon_free_region(r); + kfree(t); +} + +void damon_destroy_target(struct damon_target *t) +{ + damon_del_target(t); + damon_free_target(t); +} + +unsigned int damon_nr_regions(struct damon_target *t) +{ + return t->nr_regions; +} + +struct damon_ctx *damon_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->attrs.sample_interval = 5 * 1000; + ctx->attrs.aggr_interval = 100 * 1000; + ctx->attrs.ops_update_interval = 60 * 1000 * 1000; + + ktime_get_coarse_ts64(&ctx->last_aggregation); + ctx->last_ops_update = ctx->last_aggregation; + + mutex_init(&ctx->kdamond_lock); + + ctx->attrs.min_nr_regions = 10; + ctx->attrs.max_nr_regions = 1000; + + INIT_LIST_HEAD(&ctx->adaptive_targets); + INIT_LIST_HEAD(&ctx->schemes); + + return ctx; +} + +static void damon_destroy_targets(struct damon_ctx *ctx) +{ + struct damon_target *t, *next_t; + + if (ctx->ops.cleanup) { + ctx->ops.cleanup(ctx); + return; + } + + damon_for_each_target_safe(t, next_t, ctx) + damon_destroy_target(t); +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + struct damos *s, *next_s; + + damon_destroy_targets(ctx); + + damon_for_each_scheme_safe(s, next_s, ctx) + damon_destroy_scheme(s); + + kfree(ctx); +} + +static unsigned int damon_age_for_new_attrs(unsigned int age, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return age * old_attrs->aggr_interval / new_attrs->aggr_interval; +} + +/* convert access ratio in bp (per 10,000) to nr_accesses */ +static unsigned int damon_accesses_bp_to_nr_accesses( + unsigned int accesses_bp, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return accesses_bp * max_nr_accesses / 10000; +} + +/* convert nr_accesses to access ratio in bp (per 10,000) */ +static unsigned int damon_nr_accesses_to_accesses_bp( + unsigned int nr_accesses, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return nr_accesses * 10000 / max_nr_accesses; +} + +static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return damon_accesses_bp_to_nr_accesses( + damon_nr_accesses_to_accesses_bp( + nr_accesses, old_attrs), + new_attrs); +} + +static void 
damon_update_monitoring_result(struct damon_region *r, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, + old_attrs, new_attrs); + r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); +} + +/* + * region->nr_accesses is the number of sampling intervals in the last + * aggregation interval that access to the region has found, and region->age is + * the number of aggregation intervals that its access pattern has maintained. + * For the reason, the real meaning of the two fields depend on current + * sampling interval and aggregation interval. This function updates + * ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs. + */ +static void damon_update_monitoring_results(struct damon_ctx *ctx, + struct damon_attrs *new_attrs) +{ + struct damon_attrs *old_attrs = &ctx->attrs; + struct damon_target *t; + struct damon_region *r; + + /* if any interval is zero, simply forgive conversion */ + if (!old_attrs->sample_interval || !old_attrs->aggr_interval || + !new_attrs->sample_interval || + !new_attrs->aggr_interval) + return; + + damon_for_each_target(t, ctx) + damon_for_each_region(r, t) + damon_update_monitoring_result( + r, old_attrs, new_attrs); +} + +/** + * damon_set_attrs() - Set attributes for the monitoring. + * @ctx: monitoring context + * @attrs: monitoring attributes + * + * This function should not be called while the kdamond is running. + * Every time interval is in micro-seconds. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) +{ + if (attrs->min_nr_regions < 3) + return -EINVAL; + if (attrs->min_nr_regions > attrs->max_nr_regions) + return -EINVAL; + if (attrs->sample_interval > attrs->aggr_interval) + return -EINVAL; + + damon_update_monitoring_results(ctx, attrs); + ctx->attrs = *attrs; + return 0; +} + +/** + * damon_set_schemes() - Set data access monitoring based operation schemes. + * @ctx: monitoring context + * @schemes: array of the schemes + * @nr_schemes: number of entries in @schemes + * + * This function should not be called while the kdamond of the context is + * running. + */ +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, + ssize_t nr_schemes) +{ + struct damos *s, *next; + ssize_t i; + + damon_for_each_scheme_safe(s, next, ctx) + damon_destroy_scheme(s); + for (i = 0; i < nr_schemes; i++) + damon_add_scheme(ctx, schemes[i]); +} + +/** + * damon_nr_running_ctxs() - Return number of currently running contexts. + */ +int damon_nr_running_ctxs(void) +{ + int nr_ctxs; + + mutex_lock(&damon_lock); + nr_ctxs = nr_running_ctxs; + mutex_unlock(&damon_lock); + + return nr_ctxs; +} + +/* Returns the size upper limit for each monitoring region */ +static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + sz += damon_sz_region(r); + } + + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + return sz; +} + +static int kdamond_fn(void *data); + +/* + * __damon_start() - Starts monitoring with given context. + * @ctx: monitoring context + * + * This function should be called while damon_lock is hold. + * + * Return: 0 on success, negative error code otherwise. 
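+ *
+ * The kdamond of the context runs as a kernel thread named 'kdamond.<N>',
+ * where <N> is the number of contexts that were already running when this
+ * function is called.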
+ */ +static int __damon_start(struct damon_ctx *ctx) +{ + int err = -EBUSY; + + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) { + err = 0; + ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", + nr_running_ctxs); + if (IS_ERR(ctx->kdamond)) { + err = PTR_ERR(ctx->kdamond); + ctx->kdamond = NULL; + } + } + mutex_unlock(&ctx->kdamond_lock); + + return err; +} + +/** + * damon_start() - Starts the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to start monitoring + * @nr_ctxs: size of @ctxs + * @exclusive: exclusiveness of this contexts group + * + * This function starts a group of monitoring threads for a group of monitoring + * contexts. One thread per each context is created and run in parallel. The + * caller should handle synchronization between the threads by itself. If + * @exclusive is true and a group of threads that created by other + * 'damon_start()' call is currently running, this function does nothing but + * returns -EBUSY. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive) +{ + int i; + int err = 0; + + mutex_lock(&damon_lock); + if ((exclusive && nr_running_ctxs) || + (!exclusive && running_exclusive_ctxs)) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + + for (i = 0; i < nr_ctxs; i++) { + err = __damon_start(ctxs[i]); + if (err) + break; + nr_running_ctxs++; + } + if (exclusive && nr_running_ctxs) + running_exclusive_ctxs = true; + mutex_unlock(&damon_lock); + + return err; +} + +/* + * __damon_stop() - Stops monitoring of a given context. + * @ctx: monitoring context + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_stop(struct damon_ctx *ctx) +{ + struct task_struct *tsk; + + mutex_lock(&ctx->kdamond_lock); + tsk = ctx->kdamond; + if (tsk) { + get_task_struct(tsk); + mutex_unlock(&ctx->kdamond_lock); + kthread_stop(tsk); + put_task_struct(tsk); + return 0; + } + mutex_unlock(&ctx->kdamond_lock); + + return -EPERM; +} + +/** + * damon_stop() - Stops the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to stop monitoring + * @nr_ctxs: size of @ctxs + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i, err = 0; + + for (i = 0; i < nr_ctxs; i++) { + /* nr_running_ctxs is decremented in kdamond_fn */ + err = __damon_stop(ctxs[i]); + if (err) + break; + } + return err; +} + +/* + * damon_check_reset_time_interval() - Check if a time interval is elapsed. + * @baseline: the time to check whether the interval has elapsed since + * @interval: the time interval (microseconds) + * + * See whether the given time interval has passed since the given baseline + * time. If so, it also updates the baseline to current time for next check. + * + * Return: true if the time interval has passed, or false otherwise. 
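+ *
+ * For example, with an @interval of 100000 (100 ms), calls made within
+ * 100 ms of the last update of @baseline return false, while the first call
+ * made after that returns true and also sets @baseline to the current time.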
+ */
+static bool damon_check_reset_time_interval(struct timespec64 *baseline,
+		unsigned long interval)
+{
+	struct timespec64 now;
+
+	ktime_get_coarse_ts64(&now);
+	if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
+			interval * 1000)
+		return false;
+	*baseline = now;
+	return true;
+}
+
+/*
+ * Check whether it is time to flush the aggregated information
+ */
+static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
+{
+	return damon_check_reset_time_interval(&ctx->last_aggregation,
+			ctx->attrs.aggr_interval);
+}
+
+/*
+ * Reset the aggregated monitoring results ('nr_accesses' of each region).
+ */
+static void kdamond_reset_aggregated(struct damon_ctx *c)
+{
+	struct damon_target *t;
+	unsigned int ti = 0;	/* target's index */
+
+	damon_for_each_target(t, c) {
+		struct damon_region *r;
+
+		damon_for_each_region(r, t) {
+			trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
+			r->last_nr_accesses = r->nr_accesses;
+			r->nr_accesses = 0;
+		}
+		ti++;
+	}
+}
+
+static void damon_split_region_at(struct damon_target *t,
+				  struct damon_region *r, unsigned long sz_r);
+
+static bool __damos_valid_target(struct damon_region *r, struct damos *s)
+{
+	unsigned long sz;
+
+	sz = damon_sz_region(r);
+	return s->pattern.min_sz_region <= sz &&
+		sz <= s->pattern.max_sz_region &&
+		s->pattern.min_nr_accesses <= r->nr_accesses &&
+		r->nr_accesses <= s->pattern.max_nr_accesses &&
+		s->pattern.min_age_region <= r->age &&
+		r->age <= s->pattern.max_age_region;
+}
+
+static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
+		struct damon_region *r, struct damos *s)
+{
+	bool ret = __damos_valid_target(r, s);
+
+	if (!ret || !s->quota.esz || !c->ops.get_scheme_score)
+		return ret;
+
+	return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score;
+}
+
+/*
+ * damos_skip_charged_region() - Check if the given region or starting part of
+ * it is already charged for the DAMOS quota.
+ * @t:	The target of the region.
+ * @rp:	The pointer to the region.
+ * @s:	The scheme to be applied.
+
+ * If the quota of a scheme has been exceeded in a quota charge window, the
+ * scheme's action would be applied to only a part of the regions that fulfill
+ * the target access pattern.  To avoid applying the scheme action to only the
+ * already-applied regions again, DAMON skips applying the scheme action to
+ * the regions that were charged in the previous charge window.
+ *
+ * This function checks if a given region should be skipped or not for that
+ * reason.  If only the starting part of the region has previously been
+ * charged, this function splits the region into two so that the second one
+ * covers the area that was not charged in the previous charge window, saves
+ * the second region in *rp, and returns false, so that the caller can apply
+ * the DAMON action to the second one.
+ *
+ * Return: true if the region should be entirely skipped, false otherwise.
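+ *
+ * For example, if the previous charge window stopped at an address in the
+ * middle of the given region (the address is saved in the quota's
+ * ->charge_addr_from), the part of the region below that address, aligned to
+ * DAMON_MIN_REGION, is skipped by splitting the region, and only the
+ * remaining part is handed back to the caller via @rp.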
+ */ +static bool damos_skip_charged_region(struct damon_target *t, + struct damon_region **rp, struct damos *s) +{ + struct damon_region *r = *rp; + struct damos_quota *quota = &s->quota; + unsigned long sz_to_skip; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + return true; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return true; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + return true; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz_to_skip) { + if (damon_sz_region(r) <= DAMON_MIN_REGION) + return true; + sz_to_skip = DAMON_MIN_REGION; + } + damon_split_region_at(t, r, sz_to_skip); + r = damon_next_region(r); + *rp = r; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + return false; +} + +static void damos_update_stat(struct damos *s, + unsigned long sz_tried, unsigned long sz_applied) +{ + s->stat.nr_tried++; + s->stat.sz_tried += sz_tried; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + +static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + unsigned long sz = damon_sz_region(r); + struct timespec64 begin, end; + unsigned long sz_applied = 0; + int err = 0; + + if (c->ops.apply_scheme) { + if (quota->esz && quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(t, r, sz); + } + ktime_get_coarse_ts64(&begin); + if (c->callback.before_damos_apply) + err = c->callback.before_damos_apply(c, t, r, s); + if (!err) + sz_applied = c->ops.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + damos_update_stat(s, sz, sz_applied); +} + +static void damon_do_apply_schemes(struct damon_ctx *c, + struct damon_target *t, + struct damon_region *r) +{ + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + + if (!s->wmarks.activated) + continue; + + /* Check the quota */ + if (quota->esz && quota->charged_sz >= quota->esz) + continue; + + if (damos_skip_charged_region(t, &r, s)) + continue; + + if (!damos_valid_target(c, t, r, s)) + continue; + + damos_apply_scheme(c, t, r, s); + } +} + +/* Shouldn't be called if quota->ms and quota->sz are zero */ +static void damos_set_effective_quota(struct damos_quota *quota) +{ + unsigned long throughput; + unsigned long esz; + + if (!quota->ms) { + quota->esz = quota->sz; + return; + } + + if (quota->total_charged_ns) + throughput = quota->total_charged_sz * 1000000 / + quota->total_charged_ns; + else + throughput = PAGE_SIZE * 1024; + esz = throughput * quota->ms; + + if (quota->sz && quota->sz < esz) + esz = quota->sz; + quota->esz = esz; +} + +static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + struct damon_target *t; + struct damon_region *r; + unsigned long cumulated_sz; + unsigned int 
score, max_score = 0; + + if (!quota->ms && !quota->sz) + return; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies(quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } + + if (!c->ops.get_scheme_score) + return; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->ops.get_scheme_score(c, t, r, s); + quota->histogram[score] += damon_sz_region(r); + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; +} + +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r, *next_r; + struct damos *s; + + damon_for_each_scheme(s, c) { + if (!s->wmarks.activated) + continue; + + damos_adjust_quota(c, s); + } + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next_r, t) + damon_do_apply_schemes(c, t, r); + } +} + +/* + * Merge two adjacent regions into one region + */ +static void damon_merge_two_regions(struct damon_target *t, + struct damon_region *l, struct damon_region *r) +{ + unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); + + l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / + (sz_l + sz_r); + l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); + l->ar.end = r->ar.end; + damon_destroy_region(r, t); +} + +/* + * Merge adjacent regions having similar access frequencies + * + * t target affected by this merge operation + * thres '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + */ +static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, + unsigned long sz_limit) +{ + struct damon_region *r, *prev = NULL, *next; + + damon_for_each_region_safe(r, next, t) { + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) + r->age = 0; + else + r->age++; + + if (prev && prev->ar.end == r->ar.start && + abs(prev->nr_accesses - r->nr_accesses) <= thres && + damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) + damon_merge_two_regions(t, prev, r); + else + prev = r; + } +} + +/* + * Merge adjacent regions having similar access frequencies + * + * threshold '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + * + * This function merges monitoring target regions which are adjacent and their + * access frequencies are similar. This is for minimizing the monitoring + * overhead under the dynamically changeable access pattern. If a merge was + * unnecessarily made, later 'kdamond_split_regions()' will revert it. 
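+ *
+ * The access frequency of a merged region is the size-weighted average of
+ * those of the two merged regions.  For example, merging a 100 bytes region
+ * of 'nr_accesses' 10 with an adjacent 200 bytes region of 'nr_accesses' 20
+ * results in a 300 bytes region of 'nr_accesses' (100 * 10 + 200 * 20) / 300,
+ * that is, 16.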
+ */ +static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, + unsigned long sz_limit) +{ + struct damon_target *t; + + damon_for_each_target(t, c) + damon_merge_regions_of(t, threshold, sz_limit); +} + +/* + * Split a region in two + * + * r the region to be split + * sz_r size of the first sub-region that will be made + */ +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r) +{ + struct damon_region *new; + + new = damon_new_region(r->ar.start + sz_r, r->ar.end); + if (!new) + return; + + r->ar.end = new->ar.start; + + new->age = r->age; + new->last_nr_accesses = r->last_nr_accesses; + + damon_insert_region(new, r, damon_next_region(r), t); +} + +/* Split every region in the given target into 'nr_subs' regions */ +static void damon_split_regions_of(struct damon_target *t, int nr_subs) +{ + struct damon_region *r, *next; + unsigned long sz_region, sz_sub = 0; + int i; + + damon_for_each_region_safe(r, next, t) { + sz_region = damon_sz_region(r); + + for (i = 0; i < nr_subs - 1 && + sz_region > 2 * DAMON_MIN_REGION; i++) { + /* + * Randomly select size of left sub-region to be at + * least 10 percent and at most 90% of original region + */ + sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_region / 10, DAMON_MIN_REGION); + /* Do not allow blank region */ + if (sz_sub == 0 || sz_sub >= sz_region) + continue; + + damon_split_region_at(t, r, sz_sub); + sz_region = sz_sub; + } + } +} + +/* + * Split every target region into randomly-sized small regions + * + * This function splits every target region into random-sized small regions if + * current total number of the regions is equal or smaller than half of the + * user-specified maximum number of regions. This is for maximizing the + * monitoring accuracy under the dynamically changeable access patterns. If a + * split was unnecessarily made, later 'kdamond_merge_regions()' will revert + * it. + */ +static void kdamond_split_regions(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_regions = 0; + static unsigned int last_nr_regions; + int nr_subregions = 2; + + damon_for_each_target(t, ctx) + nr_regions += damon_nr_regions(t); + + if (nr_regions > ctx->attrs.max_nr_regions / 2) + return; + + /* Maybe the middle of the region has different access frequency */ + if (last_nr_regions == nr_regions && + nr_regions < ctx->attrs.max_nr_regions / 3) + nr_subregions = 3; + + damon_for_each_target(t, ctx) + damon_split_regions_of(t, nr_subregions); + + last_nr_regions = nr_regions; +} + +/* + * Check whether it is time to check and apply the operations-related data + * structures. + * + * Returns true if it is. + */ +static bool kdamond_need_update_operations(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_ops_update, + ctx->attrs.ops_update_interval); +} + +/* + * Check whether current monitoring should be stopped + * + * The monitoring is stopped when either the user requested to stop, or all + * monitoring targets are invalid. + * + * Returns true if need to stop current monitoring. 
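+ *
+ * Targets can become invalid while the monitoring is running.  For example,
+ * an operations set for virtual address spaces could treat a target whose
+ * process has already terminated as invalid.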
+ */ +static bool kdamond_need_stop(struct damon_ctx *ctx) +{ + struct damon_target *t; + + if (kthread_should_stop()) + return true; + + if (!ctx->ops.target_valid) + return false; + + damon_for_each_target(t, ctx) { + if (ctx->ops.target_valid(t)) + return false; + } + + return true; +} + +static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) +{ + struct sysinfo i; + + switch (metric) { + case DAMOS_WMARK_FREE_MEM_RATE: + si_meminfo(&i); + return i.freeram * 1000 / i.totalram; + default: + break; + } + return -EINVAL; +} + +/* + * Returns zero if the scheme is active. Else, returns time to wait for next + * watermark check in micro-seconds. + */ +static unsigned long damos_wmark_wait_us(struct damos *scheme) +{ + unsigned long metric; + + if (scheme->wmarks.metric == DAMOS_WMARK_NONE) + return 0; + + metric = damos_wmark_metric_value(scheme->wmarks.metric); + /* higher than high watermark or lower than low watermark */ + if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { + if (scheme->wmarks.activated) + pr_debug("deactivate a scheme (%d) for %s wmark\n", + scheme->action, + metric > scheme->wmarks.high ? + "high" : "low"); + scheme->wmarks.activated = false; + return scheme->wmarks.interval; + } + + /* inactive and higher than middle watermark */ + if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) && + !scheme->wmarks.activated) + return scheme->wmarks.interval; + + if (!scheme->wmarks.activated) + pr_debug("activate a scheme (%d)\n", scheme->action); + scheme->wmarks.activated = true; + return 0; +} + +static void kdamond_usleep(unsigned long usecs) +{ + /* See Documentation/timers/timers-howto.rst for the thresholds */ + if (usecs > 20 * USEC_PER_MSEC) + schedule_timeout_idle(usecs_to_jiffies(usecs)); + else + usleep_idle_range(usecs, usecs + 1); +} + +/* Returns negative error code if it's not activated but should return */ +static int kdamond_wait_activation(struct damon_ctx *ctx) +{ + struct damos *s; + unsigned long wait_time; + unsigned long min_wait_time = 0; + bool init_wait_time = false; + + while (!kdamond_need_stop(ctx)) { + damon_for_each_scheme(s, ctx) { + wait_time = damos_wmark_wait_us(s); + if (!init_wait_time || wait_time < min_wait_time) { + init_wait_time = true; + min_wait_time = wait_time; + } + } + if (!min_wait_time) + return 0; + + kdamond_usleep(min_wait_time); + + if (ctx->callback.after_wmarks_check && + ctx->callback.after_wmarks_check(ctx)) + break; + } + return -EBUSY; +} + +/* + * The monitoring daemon that runs as a kernel thread + */ +static int kdamond_fn(void *data) +{ + struct damon_ctx *ctx = data; + struct damon_target *t; + struct damon_region *r, *next; + unsigned int max_nr_accesses = 0; + unsigned long sz_limit = 0; + + pr_debug("kdamond (%d) starts\n", current->pid); + + if (ctx->ops.init) + ctx->ops.init(ctx); + if (ctx->callback.before_start && ctx->callback.before_start(ctx)) + goto done; + + sz_limit = damon_region_sz_limit(ctx); + + while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + break; + + if (ctx->ops.prepare_access_checks) + ctx->ops.prepare_access_checks(ctx); + if (ctx->callback.after_sampling && + ctx->callback.after_sampling(ctx)) + break; + + kdamond_usleep(ctx->attrs.sample_interval); + + if (ctx->ops.check_accesses) + max_nr_accesses = ctx->ops.check_accesses(ctx); + + if (kdamond_aggregate_interval_passed(ctx)) { + kdamond_merge_regions(ctx, + max_nr_accesses / 10, + sz_limit); + if (ctx->callback.after_aggregation && + 
ctx->callback.after_aggregation(ctx)) + break; + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); + kdamond_reset_aggregated(ctx); + kdamond_split_regions(ctx); + if (ctx->ops.reset_aggregated) + ctx->ops.reset_aggregated(ctx); + } + + if (kdamond_need_update_operations(ctx)) { + if (ctx->ops.update) + ctx->ops.update(ctx); + sz_limit = damon_region_sz_limit(ctx); + } + } +done: + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + + if (ctx->callback.before_terminate) + ctx->callback.before_terminate(ctx); + if (ctx->ops.cleanup) + ctx->ops.cleanup(ctx); + + pr_debug("kdamond (%d) finishes\n", current->pid); + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond = NULL; + mutex_unlock(&ctx->kdamond_lock); + + mutex_lock(&damon_lock); + nr_running_ctxs--; + if (!nr_running_ctxs && running_exclusive_ctxs) + running_exclusive_ctxs = false; + mutex_unlock(&damon_lock); + + return 0; +} + +/* + * struct damon_system_ram_region - System RAM resource address region of + * [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_system_ram_region { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_system_ram_region *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +static bool damon_find_biggest_system_ram(unsigned long *start, + unsigned long *end) + +{ + struct damon_system_ram_region arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +/** + * damon_set_region_biggest_system_ram_default() - Set the region of the given + * monitoring target as requested, or biggest 'System RAM'. + * @t: The monitoring target to set the region. + * @start: The pointer to the start address of the region. + * @end: The pointer to the end address of the region. + * + * This function sets the region of @t as requested by @start and @end. If the + * values of @start and @end are zero, however, this function finds the biggest + * 'System RAM' resource and sets the region to cover the resource. In the + * latter case, this function saves the start and end addresses of the resource + * in @start and @end, respectively. + * + * Return: 0 on success, negative error code otherwise. 
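+ *
+ * For example, a caller that wants the biggest 'System RAM' resource to be
+ * monitored could do::
+ *
+ *	unsigned long start = 0, end = 0;
+ *	int err;
+ *
+ *	err = damon_set_region_biggest_system_ram_default(t, &start, &end);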
+ */ +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end) +{ + struct damon_addr_range addr_range; + + if (*start > *end) + return -EINVAL; + + if (!*start && !*end && + !damon_find_biggest_system_ram(start, end)) + return -EINVAL; + + addr_range.start = *start; + addr_range.end = *end; + return damon_set_regions(t, &addr_range, 1); +} + +static int __init damon_init(void) +{ + damon_region_cache = KMEM_CACHE(damon_region, 0); + if (unlikely(!damon_region_cache)) { + pr_err("creating damon_region_cache fails\n"); + return -ENOMEM; + } + + return 0; +} + +subsys_initcall(damon_init); + +#include "core-test.h" diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h new file mode 100644 index 0000000000000..0bb0d532b1590 --- /dev/null +++ b/mm/damon/dbgfs-test.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON Debugfs Interface Unit Tests + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST + +#ifndef _DAMON_DBGFS_TEST_H +#define _DAMON_DBGFS_TEST_H + +#include + +static void damon_dbgfs_test_str_to_ints(struct kunit *test) +{ + char *question; + int *answers; + int expected[] = {12, 35, 46}; + ssize_t nr_integers = 0, i; + + question = "123"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123, answers[0]); + kfree(answers); + + question = "123abc"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123, answers[0]); + kfree(answers); + + question = "a123"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "12 35"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 46"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 abc 46"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < 2; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = ""; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "\n"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); +} + +static void damon_dbgfs_test_set_targets(struct kunit *test) +{ + struct damon_ctx *ctx = dbgfs_new_ctx(); + char buf[64]; + + /* Make DAMON consider target has no pid */ + damon_select_ops(ctx, DAMON_OPS_PADDR); + + dbgfs_set_targets(ctx, 0, NULL); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + dbgfs_set_targets(ctx, 1, NULL); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); + + dbgfs_set_targets(ctx, 0, NULL); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + dbgfs_destroy_ctx(ctx); +} + +static void damon_dbgfs_test_set_init_regions(struct kunit *test) +{ + struct damon_ctx 
*ctx = damon_new_ctx(); + /* Each line represents one region in `` `` */ + char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", + "1 10 20\n", + "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", + ""}; + /* Reading the file again will show sorted, clean output */ + char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", + "1 10 20\n", + "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", + ""}; + char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ + "1 10 20\n 1 14 26\n", /* regions overlap */ + "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ + char *input, *expect; + int i, rc; + char buf[256]; + + damon_select_ops(ctx, DAMON_OPS_PADDR); + + dbgfs_set_targets(ctx, 3, NULL); + + /* Put valid inputs and check the results */ + for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { + input = valid_inputs[i]; + expect = valid_expects[i]; + + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, 0); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, expect); + } + /* Put invalid inputs and check the return error code */ + for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { + input = invalid_inputs[i]; + pr_info("input: %s\n", input); + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, -EINVAL); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, ""); + } + + dbgfs_set_targets(ctx, 0, NULL); + damon_destroy_ctx(ctx); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_dbgfs_test_str_to_ints), + KUNIT_CASE(damon_dbgfs_test_set_targets), + KUNIT_CASE(damon_dbgfs_test_set_init_regions), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-dbgfs", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c new file mode 100644 index 0000000000000..124f0f8c97b75 --- /dev/null +++ b/mm/damon/dbgfs.c @@ -0,0 +1,1133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Debugfs Interface + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-dbgfs: " fmt + +#include +#include +#include +#include +#include +#include +#include + +static struct damon_ctx **dbgfs_ctxs; +static int dbgfs_nr_ctxs; +static struct dentry **dbgfs_dirs; +static DEFINE_MUTEX(damon_dbgfs_lock); + +static void damon_dbgfs_warn_deprecation(void) +{ + pr_warn_once("DAMON debugfs interface is deprecated, " + "so users should move to DAMON_SYSFS. If you cannot, " + "please report your usecase to damon@lists.linux.dev and " + "linux-mm@kvack.org.\n"); +} + +/* + * Returns non-empty string on success, negative error code otherwise. 
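+ *
+ * Errors are returned as ERR_PTR() values, and writes at a non-zero file
+ * offset are rejected since continuous writes are not accepted.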
+ */ +static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret; + + /* We do not accept continuous write */ + if (*ppos) + return ERR_PTR(-EINVAL); + + kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return ERR_PTR(-ENOMEM); + + ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); + if (ret != count) { + kfree(kbuf); + return ERR_PTR(-EIO); + } + kbuf[ret] = '\0'; + + return kbuf; +} + +static ssize_t dbgfs_attrs_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char kbuf[128]; + int ret; + + mutex_lock(&ctx->kdamond_lock); + ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", + ctx->attrs.sample_interval, ctx->attrs.aggr_interval, + ctx->attrs.ops_update_interval, + ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); + mutex_unlock(&ctx->kdamond_lock); + + return simple_read_from_buffer(buf, count, ppos, kbuf, ret); +} + +static ssize_t dbgfs_attrs_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + struct damon_attrs attrs; + char *kbuf; + ssize_t ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (sscanf(kbuf, "%lu %lu %lu %lu %lu", + &attrs.sample_interval, &attrs.aggr_interval, + &attrs.ops_update_interval, + &attrs.min_nr_regions, + &attrs.max_nr_regions) != 5) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + ret = damon_set_attrs(ctx, &attrs); + if (!ret) + ret = count; +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +out: + kfree(kbuf); + return ret; +} + +/* + * Return corresponding dbgfs' scheme action value (int) for the given + * damos_action if the given damos_action value is valid and supported by + * dbgfs, negative error code otherwise. 
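+ *
+ * For example, DAMOS_PAGEOUT is exposed to debugfs users as the value 2.
+ * dbgfs_scheme_action_to_damos_action() below does the inverse conversion.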
+ */ +static int damos_action_to_dbgfs_scheme_action(enum damos_action action) +{ + switch (action) { + case DAMOS_WILLNEED: + return 0; + case DAMOS_COLD: + return 1; + case DAMOS_PAGEOUT: + return 2; + case DAMOS_HUGEPAGE: + return 3; + case DAMOS_NOHUGEPAGE: + return 4; + case DAMOS_STAT: + return 5; + default: + return -EINVAL; + } +} + +static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damos *s; + int written = 0; + int rc; + + damon_for_each_scheme(s, c) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + s->pattern.min_sz_region, + s->pattern.max_sz_region, + s->pattern.min_nr_accesses, + s->pattern.max_nr_accesses, + s->pattern.min_age_region, + s->pattern.max_age_region, + damos_action_to_dbgfs_scheme_action(s->action), + s->quota.ms, s->quota.sz, + s->quota.reset_interval, + s->quota.weight_sz, + s->quota.weight_nr_accesses, + s->quota.weight_age, + s->wmarks.metric, s->wmarks.interval, + s->wmarks.high, s->wmarks.mid, s->wmarks.low, + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); + if (!rc) + return -ENOMEM; + + written += rc; + } + return written; +} + +static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_schemes(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) +{ + ssize_t i; + + for (i = 0; i < nr_schemes; i++) + kfree(schemes[i]); + kfree(schemes); +} + +/* + * Return corresponding damos_action for the given dbgfs input for a scheme + * action if the input is valid, negative error code otherwise. + */ +static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action) +{ + switch (dbgfs_action) { + case 0: + return DAMOS_WILLNEED; + case 1: + return DAMOS_COLD; + case 2: + return DAMOS_PAGEOUT; + case 3: + return DAMOS_HUGEPAGE; + case 4: + return DAMOS_NOHUGEPAGE; + case 5: + return DAMOS_STAT; + default: + return -EINVAL; + } +} + +/* + * Converts a string into an array of struct damos pointers + * + * Returns an array of struct damos pointers that converted if the conversion + * success, or NULL otherwise. 
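+ *
+ * Each scheme is expected as a line of the following 18 space-separated
+ * fields, in this order::
+ *
+ *	<min_sz> <max_sz> <min_nr_accesses> <max_nr_accesses> <min_age>
+ *	<max_age> <action> <quota_ms> <quota_sz> <quota_reset_interval>
+ *	<quota_weight_sz> <quota_weight_nr_accesses> <quota_weight_age>
+ *	<wmarks_metric> <wmarks_interval> <wmarks_high> <wmarks_mid>
+ *	<wmarks_low>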
+ */ +static struct damos **str_to_schemes(const char *str, ssize_t len, + ssize_t *nr_schemes) +{ + struct damos *scheme, **schemes; + const int max_nr_schemes = 256; + int pos = 0, parsed, ret; + unsigned int action_input; + enum damos_action action; + + schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), + GFP_KERNEL); + if (!schemes) + return NULL; + + *nr_schemes = 0; + while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_access_pattern pattern = {}; + struct damos_quota quota = {}; + struct damos_watermarks wmarks; + + ret = sscanf(&str[pos], + "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", + &pattern.min_sz_region, &pattern.max_sz_region, + &pattern.min_nr_accesses, + &pattern.max_nr_accesses, + &pattern.min_age_region, + &pattern.max_age_region, + &action_input, "a.ms, + "a.sz, "a.reset_interval, + "a.weight_sz, "a.weight_nr_accesses, + "a.weight_age, &wmarks.metric, + &wmarks.interval, &wmarks.high, &wmarks.mid, + &wmarks.low, &parsed); + if (ret != 18) + break; + action = dbgfs_scheme_action_to_damos_action(action_input); + if ((int)action < 0) + goto fail; + + if (pattern.min_sz_region > pattern.max_sz_region || + pattern.min_nr_accesses > pattern.max_nr_accesses || + pattern.min_age_region > pattern.max_age_region) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + + pos += parsed; + scheme = damon_new_scheme(&pattern, action, "a, &wmarks); + if (!scheme) + goto fail; + + schemes[*nr_schemes] = scheme; + *nr_schemes += 1; + } + return schemes; +fail: + free_schemes_arr(schemes, *nr_schemes); + return NULL; +} + +static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + struct damos **schemes; + ssize_t nr_schemes = 0, ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + schemes = str_to_schemes(kbuf, count, &nr_schemes); + if (!schemes) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + damon_set_schemes(ctx, schemes, nr_schemes); + ret = count; + nr_schemes = 0; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + free_schemes_arr(schemes, nr_schemes); +out: + kfree(kbuf); + return ret; +} + +static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) +{ + struct damon_target *t; + int id; + int written = 0; + int rc; + + damon_for_each_target(t, ctx) { + if (damon_target_has_pid(ctx)) + /* Show pid numbers to debugfs users */ + id = pid_vnr(t->pid); + else + /* Show 42 for physical address space, just for fun */ + id = 42; + + rc = scnprintf(&buf[written], len - written, "%d ", id); + if (!rc) + return -ENOMEM; + written += rc; + } + if (written) + written -= 1; + written += scnprintf(&buf[written], len - written, "\n"); + return written; +} + +static ssize_t dbgfs_target_ids_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + ssize_t len; + char ids_buf[320]; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_target_ids(ctx, ids_buf, 320); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + return len; + + return simple_read_from_buffer(buf, count, ppos, ids_buf, len); +} + +/* + * Converts a string into an integers array + * + * Returns an array of integers array if the conversion success, or NULL + * otherwise. 
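+ *
+ * The conversion stops at the first token that is not an integer.  For
+ * example, "12 35 abc 46" is converted to a two element array of {12, 35}.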
+ */ +static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) +{ + int *array; + const int max_nr_ints = 32; + int nr; + int pos = 0, parsed, ret; + + *nr_ints = 0; + array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); + if (!array) + return NULL; + while (*nr_ints < max_nr_ints && pos < len) { + ret = sscanf(&str[pos], "%d%n", &nr, &parsed); + pos += parsed; + if (ret != 1) + break; + array[*nr_ints] = nr; + *nr_ints += 1; + } + + return array; +} + +static void dbgfs_put_pids(struct pid **pids, int nr_pids) +{ + int i; + + for (i = 0; i < nr_pids; i++) + put_pid(pids[i]); +} + +/* + * Converts a string into an struct pid pointers array + * + * Returns an array of struct pid pointers if the conversion success, or NULL + * otherwise. + */ +static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) +{ + int *ints; + ssize_t nr_ints; + struct pid **pids; + + *nr_pids = 0; + + ints = str_to_ints(str, len, &nr_ints); + if (!ints) + return NULL; + + pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); + if (!pids) + goto out; + + for (; *nr_pids < nr_ints; (*nr_pids)++) { + pids[*nr_pids] = find_get_pid(ints[*nr_pids]); + if (!pids[*nr_pids]) { + dbgfs_put_pids(pids, *nr_pids); + kfree(ints); + kfree(pids); + return NULL; + } + } + +out: + kfree(ints); + return pids; +} + +/* + * dbgfs_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @nr_targets: number of targets + * @pids: array of target pids (size is same to @nr_targets) + * + * This function should not be called while the kdamond is running. @pids is + * ignored if the context is not configured to have pid in each target. On + * failure, reference counts of all pids in @pids are decremented. + * + * Return: 0 on success, negative error code otherwise. 
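+ *
+ * Calling this function with zero @nr_targets, as the debugfs interface does
+ * before setting new targets, simply removes every existing target of @ctx.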
+ */ +static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, + struct pid **pids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (damon_target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + + for (i = 0; i < nr_targets; i++) { + t = damon_new_target(); + if (!t) { + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + if (damon_target_has_pid(ctx)) + dbgfs_put_pids(pids, nr_targets); + return -ENOMEM; + } + if (damon_target_has_pid(ctx)) + t->pid = pids[i]; + damon_add_target(ctx, t); + } + + return 0; +} + +static ssize_t dbgfs_target_ids_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + bool id_is_pid = true; + char *kbuf; + struct pid **target_pids = NULL; + ssize_t nr_targets; + ssize_t ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (!strncmp(kbuf, "paddr\n", count)) { + id_is_pid = false; + nr_targets = 1; + } + + if (id_is_pid) { + target_pids = str_to_pids(kbuf, count, &nr_targets); + if (!target_pids) { + ret = -ENOMEM; + goto out; + } + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + if (id_is_pid) + dbgfs_put_pids(target_pids, nr_targets); + ret = -EBUSY; + goto unlock_out; + } + + /* remove previously set targets */ + dbgfs_set_targets(ctx, 0, NULL); + if (!nr_targets) { + ret = count; + goto unlock_out; + } + + /* Configure the context for the address space type */ + if (id_is_pid) + ret = damon_select_ops(ctx, DAMON_OPS_VADDR); + else + ret = damon_select_ops(ctx, DAMON_OPS_PADDR); + if (ret) + goto unlock_out; + + ret = dbgfs_set_targets(ctx, nr_targets, target_pids); + if (!ret) + ret = count; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + kfree(target_pids); +out: + kfree(kbuf); + return ret; +} + +static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r; + int target_idx = 0; + int written = 0; + int rc; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + rc = scnprintf(&buf[written], len - written, + "%d %lu %lu\n", + target_idx, r->ar.start, r->ar.end); + if (!rc) + return -ENOMEM; + written += rc; + } + target_idx++; + } + return written; +} + +static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + mutex_unlock(&ctx->kdamond_lock); + len = -EBUSY; + goto out; + } + + len = sprint_init_regions(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static int add_init_region(struct damon_ctx *c, int target_idx, + struct damon_addr_range *ar) +{ + struct damon_target *t; + struct damon_region *r, *prev; + unsigned long idx = 0; + int rc = -EINVAL; + + if (ar->start >= ar->end) + return -EINVAL; + + damon_for_each_target(t, c) { + if (idx++ == target_idx) { + r = damon_new_region(ar->start, ar->end); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + if (damon_nr_regions(t) > 1) { + prev = damon_prev_region(r); + if (prev->ar.end > r->ar.start) { + damon_destroy_region(r, t); + return -EINVAL; + } + } + rc = 0; + } + } + 
return rc; +} + +static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r, *next; + int pos = 0, parsed, ret; + int target_idx; + struct damon_addr_range ar; + int err; + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + + while (pos < len) { + ret = sscanf(&str[pos], "%d %lu %lu%n", + &target_idx, &ar.start, &ar.end, &parsed); + if (ret != 3) + break; + err = add_init_region(c, target_idx, &ar); + if (err) + goto fail; + pos += parsed; + } + + return 0; + +fail: + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + return err; +} + +static ssize_t dbgfs_init_regions_write(struct file *file, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = set_init_regions(ctx, kbuf, ret); + if (err) + ret = err; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + kfree(kbuf); + return ret; +} + +static ssize_t dbgfs_kdamond_pid_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); + else + len = scnprintf(kbuf, count, "none\n"); + mutex_unlock(&ctx->kdamond_lock); + if (!len) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static int damon_dbgfs_open(struct inode *inode, struct file *file) +{ + damon_dbgfs_warn_deprecation(); + + file->private_data = inode->i_private; + + return nonseekable_open(inode, file); +} + +static const struct file_operations attrs_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_attrs_read, + .write = dbgfs_attrs_write, +}; + +static const struct file_operations schemes_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_schemes_read, + .write = dbgfs_schemes_write, +}; + +static const struct file_operations target_ids_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_target_ids_read, + .write = dbgfs_target_ids_write, +}; + +static const struct file_operations init_regions_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_init_regions_read, + .write = dbgfs_init_regions_write, +}; + +static const struct file_operations kdamond_pid_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_kdamond_pid_read, +}; + +static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) +{ + const char * const file_names[] = {"attrs", "schemes", "target_ids", + "init_regions", "kdamond_pid"}; + const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, + &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; + int i; + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); +} + +static void dbgfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (!damon_target_has_pid(ctx)) + return; + + mutex_lock(&ctx->kdamond_lock); + damon_for_each_target_safe(t, next, ctx) { + put_pid(t->pid); + damon_destroy_target(t); + } + 
mutex_unlock(&ctx->kdamond_lock); +} + +static struct damon_ctx *dbgfs_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + + if (damon_select_ops(ctx, DAMON_OPS_VADDR) && + damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return NULL; + } + ctx->callback.before_terminate = dbgfs_before_terminate; + return ctx; +} + +static void dbgfs_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_ctx(ctx); +} + +/* + * Make a context of @name and create a debugfs directory for it. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Returns 0 on success, negative error code otherwise. + */ +static int dbgfs_mk_context(char *name) +{ + struct dentry *root, **new_dirs, *new_dir; + struct damon_ctx **new_ctxs, *new_ctx; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_ctxs) + return -ENOMEM; + dbgfs_ctxs = new_ctxs; + + new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_dirs) + return -ENOMEM; + dbgfs_dirs = new_dirs; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + new_dir = debugfs_create_dir(name, root); + /* Below check is required for a potential duplicated name case */ + if (IS_ERR(new_dir)) + return PTR_ERR(new_dir); + dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; + + new_ctx = dbgfs_new_ctx(); + if (!new_ctx) { + debugfs_remove(new_dir); + dbgfs_dirs[dbgfs_nr_ctxs] = NULL; + return -ENOMEM; + } + + dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; + dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], + dbgfs_ctxs[dbgfs_nr_ctxs]); + dbgfs_nr_ctxs++; + + return 0; +} + +static ssize_t dbgfs_mk_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + char *ctx_name; + ssize_t ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + ret = dbgfs_mk_context(ctx_name); + if (!ret) + ret = count; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + +/* + * Remove a context of @name and its debugfs directory. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Return 0 on success, negative error code otherwise. 
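+ *
+ * For example, writing 'foo' to the 'rm_contexts' debugfs file makes
+ * dbgfs_rm_context_write() below call this function as
+ * dbgfs_rm_context("foo"), with damon_dbgfs_lock held.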
+ */ +static int dbgfs_rm_context(char *name) +{ + struct dentry *root, *dir, **new_dirs; + struct inode *inode; + struct damon_ctx **new_ctxs; + int i, j; + int ret = 0; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + dir = debugfs_lookup(name, root); + if (!dir) + return -ENOENT; + + inode = d_inode(dir); + if (!S_ISDIR(inode->i_mode)) { + ret = -EINVAL; + goto out_dput; + } + + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), + GFP_KERNEL); + if (!new_dirs) { + ret = -ENOMEM; + goto out_dput; + } + + new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), + GFP_KERNEL); + if (!new_ctxs) { + ret = -ENOMEM; + goto out_new_dirs; + } + + for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { + if (dbgfs_dirs[i] == dir) { + debugfs_remove(dbgfs_dirs[i]); + dbgfs_destroy_ctx(dbgfs_ctxs[i]); + continue; + } + new_dirs[j] = dbgfs_dirs[i]; + new_ctxs[j++] = dbgfs_ctxs[i]; + } + + kfree(dbgfs_dirs); + kfree(dbgfs_ctxs); + + dbgfs_dirs = new_dirs; + dbgfs_ctxs = new_ctxs; + dbgfs_nr_ctxs--; + + goto out_dput; + +out_new_dirs: + kfree(new_dirs); +out_dput: + dput(dir); + return ret; +} + +static ssize_t dbgfs_rm_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret; + char *ctx_name; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + ret = dbgfs_rm_context(ctx_name); + if (!ret) + ret = count; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + +static ssize_t dbgfs_monitor_on_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + char monitor_on_buf[5]; + bool monitor_on = damon_nr_running_ctxs() != 0; + int len; + + len = scnprintf(monitor_on_buf, 5, monitor_on ? 
"on\n" : "off\n"); + + return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); +} + +static ssize_t dbgfs_monitor_on_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + ssize_t ret; + char *kbuf; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Remove white space */ + if (sscanf(kbuf, "%s", kbuf) != 1) { + kfree(kbuf); + return -EINVAL; + } + + mutex_lock(&damon_dbgfs_lock); + if (!strncmp(kbuf, "on", count)) { + int i; + + for (i = 0; i < dbgfs_nr_ctxs; i++) { + if (damon_targets_empty(dbgfs_ctxs[i])) { + kfree(kbuf); + mutex_unlock(&damon_dbgfs_lock); + return -EINVAL; + } + } + ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true); + } else if (!strncmp(kbuf, "off", count)) { + ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + } else { + ret = -EINVAL; + } + mutex_unlock(&damon_dbgfs_lock); + + if (!ret) + ret = count; + kfree(kbuf); + return ret; +} + +static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) +{ + damon_dbgfs_warn_deprecation(); + return nonseekable_open(inode, file); +} + +static const struct file_operations mk_contexts_fops = { + .open = damon_dbgfs_static_file_open, + .write = dbgfs_mk_context_write, +}; + +static const struct file_operations rm_contexts_fops = { + .open = damon_dbgfs_static_file_open, + .write = dbgfs_rm_context_write, +}; + +static const struct file_operations monitor_on_fops = { + .open = damon_dbgfs_static_file_open, + .read = dbgfs_monitor_on_read, + .write = dbgfs_monitor_on_write, +}; + +static int __init __damon_dbgfs_init(void) +{ + struct dentry *dbgfs_root; + const char * const file_names[] = {"mk_contexts", "rm_contexts", + "monitor_on"}; + const struct file_operations *fops[] = {&mk_contexts_fops, + &rm_contexts_fops, &monitor_on_fops}; + int i; + + dbgfs_root = debugfs_create_dir("damon", NULL); + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, + fops[i]); + dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); + + dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); + if (!dbgfs_dirs) { + debugfs_remove(dbgfs_root); + return -ENOMEM; + } + dbgfs_dirs[0] = dbgfs_root; + + return 0; +} + +/* + * Functions for the initialization + */ + +static int __init damon_dbgfs_init(void) +{ + int rc = -ENOMEM; + + mutex_lock(&damon_dbgfs_lock); + dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); + if (!dbgfs_ctxs) + goto out; + dbgfs_ctxs[0] = dbgfs_new_ctx(); + if (!dbgfs_ctxs[0]) { + kfree(dbgfs_ctxs); + goto out; + } + dbgfs_nr_ctxs = 1; + + rc = __damon_dbgfs_init(); + if (rc) { + kfree(dbgfs_ctxs[0]); + kfree(dbgfs_ctxs); + pr_err("%s: dbgfs init failed\n", __func__); + } + +out: + mutex_unlock(&damon_dbgfs_lock); + return rc; +} + +module_init(damon_dbgfs_init); + +#include "dbgfs-test.h" diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c new file mode 100644 index 0000000000000..e39fef0135c0e --- /dev/null +++ b/mm/damon/lru_sort.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based LRU-lists Sorting + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-lru-sort: " fmt + +#include +#include +#include +#include + +#include "modules-common.h" + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_lru_sort." + +/* + * Enable or disable DAMON_LRU_SORT. + * + * You can enable DAMON_LRU_SORT by setting the value of this parameter as + * ``Y``. Setting it as ``N`` disables DAMON_LRU_SORT. 
Note that + * DAMON_LRU_SORT could do no real monitoring and LRU-lists sorting due to the + * watermarks-based activation condition. Refer to below descriptions for the + * watermarks parameter for this. + */ +static bool enabled __read_mostly; + +/* + * Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``. + * + * Input parameters that updated while DAMON_LRU_SORT is running are not + * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT + * reads values of parametrs except ``enabled`` again. Once the re-reading is + * done, this parameter is set as ``N``. If invalid parameters are found while + * the re-reading, DAMON_LRU_SORT will be disabled. + */ +static bool commit_inputs __read_mostly; +module_param(commit_inputs, bool, 0600); + +/* + * Access frequency threshold for hot memory regions identification in permil. + * + * If a memory region is accessed in frequency of this or higher, + * DAMON_LRU_SORT identifies the region as hot, and mark it as accessed on the + * LRU list, so that it could not be reclaimed under memory pressure. 50% by + * default. + */ +static unsigned long hot_thres_access_freq = 500; +module_param(hot_thres_access_freq, ulong, 0600); + +/* + * Time threshold for cold memory regions identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_LRU_SORT + * identifies the region as cold, and mark it as unaccessed on the LRU list, so + * that it could be reclaimed first under memory pressure. 120 seconds by + * default. + */ +static unsigned long cold_min_age __read_mostly = 120000000; +module_param(cold_min_age, ulong, 0600); + +static struct damos_quota damon_lru_sort_quota = { + /* Use up to 10 ms per 1 sec, by default */ + .ms = 10, + .sz = 0, + .reset_interval = 1000, + /* Within the quota, mark hotter regions accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, +}; +DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); + +static struct damos_watermarks damon_lru_sort_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 200, /* 20 percent */ + .mid = 150, /* 15 percent */ + .low = 50, /* 5 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks); + +static struct damon_attrs damon_lru_sort_mon_attrs = { + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); + +/* + * Start of the target memory region in physical address. + * + * The start physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. + * Else, -1. 
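+ *
+ * With the MODULE_PARAM_PREFIX defined above, this is readable from, e.g.,
+ * /sys/module/damon_lru_sort/parameters/kdamond_pid.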
+ */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +static struct damos_stat damon_lru_sort_hot_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat, + lru_sort_tried_hot_regions, lru_sorted_hot_regions, + hot_quota_exceeds); + +static struct damos_stat damon_lru_sort_cold_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, + lru_sort_tried_cold_regions, lru_sorted_cold_regions, + cold_quota_exceeds); + +static struct damos_access_pattern damon_lru_sort_stub_pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* no matter its access frequency */ + .min_nr_accesses = 0, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, +}; + +static struct damon_ctx *ctx; +static struct damon_target *target; + +static struct damos *damon_lru_sort_new_scheme( + struct damos_access_pattern *pattern, enum damos_action action) +{ + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot/cold pages sorting */ + quota.ms = quota.ms / 2; + + return damon_new_scheme( + /* find the pattern, and */ + pattern, + /* (de)prioritize on LRU-lists */ + action, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &damon_lru_sort_wmarks); +} + +/* Create a DAMON-based operation scheme for hot memory regions */ +static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) +{ + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + + pattern.min_nr_accesses = hot_thres; + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); +} + +/* Create a DAMON-based operation scheme for cold memory regions */ +static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) +{ + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + + pattern.max_nr_accesses = 0; + pattern.min_age_region = cold_thres; + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); +} + +static int damon_lru_sort_apply_parameters(void) +{ + struct damos *scheme; + unsigned int hot_thres, cold_thres; + int err = 0; + + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); + if (err) + return err; + + /* aggr_interval / sample_interval is the maximum nr_accesses */ + hot_thres = damon_lru_sort_mon_attrs.aggr_interval / + damon_lru_sort_mon_attrs.sample_interval * + hot_thres_access_freq / 1000; + scheme = damon_lru_sort_new_hot_scheme(hot_thres); + if (!scheme) + return -ENOMEM; + damon_set_schemes(ctx, &scheme, 1); + + cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; + scheme = damon_lru_sort_new_cold_scheme(cold_thres); + if (!scheme) + return -ENOMEM; + damon_add_scheme(ctx, scheme); + + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); +} + +static int damon_lru_sort_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_lru_sort_apply_parameters(); + if (err) + return err; + + err = damon_start(&ctx, 1, true); + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; +} + +static int damon_lru_sort_enabled_store(const char *val, + const struct kernel_param *kp) +{ + bool is_enabled = enabled; + bool enable; + int err; + + err = strtobool(val, &enable); + if (err) + return err; + + if (is_enabled == enable) + return 0; + + /* Called before init 
function. The function will handle this. */ + if (!ctx) + goto set_param_out; + + err = damon_lru_sort_turn(enable); + if (err) + return err; + +set_param_out: + enabled = enable; + return err; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_lru_sort_enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_LRU_SORT (default: disabled)"); + +static int damon_lru_sort_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_lru_sort_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_lru_sort_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + if (s->action == DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; + } + + return damon_lru_sort_handle_commit_inputs(); +} + +static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) +{ + return damon_lru_sort_handle_commit_inputs(); +} + +static int __init damon_lru_sort_init(void) +{ + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); + + if (err) + return err; + + ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; + ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; + + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_lru_sort_turn(true); + + return err; +} + +module_init(damon_lru_sort_init); diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c new file mode 100644 index 0000000000000..b2381a8466ecf --- /dev/null +++ b/mm/damon/modules-common.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#include "modules-common.h" + +/* + * Allocate, set, and return a DAMON context for the physical address space. 
+ * @ctxp: Pointer to save the point to the newly created context + * @targetp: Pointer to save the point to the newly created target + */ +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp) +{ + struct damon_ctx *ctx; + struct damon_target *target; + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + *ctxp = ctx; + *targetp = target; + return 0; +} diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h new file mode 100644 index 0000000000000..f49cdb4170051 --- /dev/null +++ b/mm/damon/modules-common.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \ + module_param_named(sample_interval, attrs.sample_interval, \ + ulong, 0600); \ + module_param_named(aggr_interval, attrs.aggr_interval, ulong, \ + 0600); \ + module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \ + 0600); \ + module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ + 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_ms, quota.ms, ulong, 0600); \ + module_param_named(quota_reset_interval_ms, \ + quota.reset_interval, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_sz, quota.sz, ulong, 0600); + +#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ + module_param_named(wmarks_interval, wmarks.interval, ulong, \ + 0600); \ + module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ + module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ + module_param_named(wmarks_low, wmarks.low, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \ + succ_name, qt_exceed_name) \ + module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \ + module_param_named(bytes_##try_name, stat.sz_tried, ulong, \ + 0400); \ + module_param_named(nr_##succ_name, stat.nr_applied, ulong, \ + 0400); \ + module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ + 0400); \ + module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ + 0400); + +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c new file mode 100644 index 0000000000000..13b99975cbc2c --- /dev/null +++ b/mm/damon/ops-common.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include +#include +#include +#include + +#include "ops-common.h" + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. 
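+ *
+ * On success, the returned page carries an extra reference taken via
+ * get_page_unless_zero(), so callers are expected to drop it with put_page()
+ * when they are done, as the users in this file do.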
+ */ +struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (ptep_test_and_clear_young(vma, addr, pte)) + referenced = true; + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmdp_test_and_clear_young(vma, addr, pmd)) + referenced = true; + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +#define DAMON_MAX_SUBSCORE (100) +#define DAMON_MAX_AGE_IN_LOG (32) + +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. 
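+	 *
+	 * Worked example (illustrative numbers only): for an age_in_log of 0,
+	 * the scaling below yields (0 + 32) * 100 / 32 / 2 = 50, the middle
+	 * of the [0, 100] age subscore range.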
+ */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + return hotness; +} + +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + int hotness = damon_hot_score(c, r, s); + + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h new file mode 100644 index 0000000000000..e062a8874e411 --- /dev/null +++ b/mm/damon/ops-common.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include + +struct page *damon_get_page(unsigned long pfn); + +void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr); +void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr); + +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c new file mode 100644 index 0000000000000..b6f5171dc2ccb --- /dev/null +++ b/mm/damon/paddr.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for The Physical Address Space + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-pa: " fmt + +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "ops-common.h" + +static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) + damon_ptep_mkold(pvmw.pte, vma, addr); + else + damon_pmdp_mkold(pvmw.pmd, vma, addr); + } + return true; +} + +static void damon_pa_mkold(unsigned long paddr) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct rmap_walk_control rwc = { + .rmap_one = __damon_pa_mkold, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return; + + if (!page_mapped(page) || !page_rmapping(page)) { + set_page_idle(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) + goto out; + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + +out: + put_page(page); +} + +static void __damon_pa_prepare_access_check(struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_pa_mkold(r->sampling_addr); +} + +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + __damon_pa_prepare_access_check(r); + } +} + +struct damon_pa_access_chk_result { + unsigned long page_sz; + bool accessed; +}; + +static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct damon_pa_access_chk_result *result = arg; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + result->accessed = false; + result->page_sz = PAGE_SIZE; + while 
(page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + result->accessed = pte_young(*pvmw.pte) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + result->accessed = pmd_young(*pvmw.pmd) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + result->page_sz = HPAGE_PMD_SIZE; +#else + WARN_ON_ONCE(1); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + } + if (result->accessed) { + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* If accessed, stop walking */ + return !result->accessed; +} + +static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct damon_pa_access_chk_result result = { + .page_sz = PAGE_SIZE, + .accessed = false, + }; + struct rmap_walk_control rwc = { + .arg = &result, + .rmap_one = __damon_pa_young, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return false; + + if (!page_mapped(page) || !page_rmapping(page)) { + if (page_is_idle(page)) + result.accessed = false; + else + result.accessed = true; + put_page(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) { + put_page(page); + return false; + } + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + put_page(page); + +out: + *page_sz = result.page_sz; + return result.accessed; +} + +static void __damon_pa_check_access(struct damon_region *r) +{ + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz)) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_addr = r->sampling_addr; +} + +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) { + __damon_pa_check_access(r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + } + + return max_nr_accesses; +} + +static bool __damos_pa_filter_out(struct damos_filter *filter, + struct page *page) +{ + bool matched = false; + struct mem_cgroup *memcg; + + switch (filter->type) { + case DAMOS_FILTER_TYPE_ANON: + matched = PageAnon(page); + break; + case DAMOS_FILTER_TYPE_MEMCG: + rcu_read_lock(); + memcg = page_memcg_rcu(page); + if (!memcg) + matched = false; + else + matched = filter->memcg_id == mem_cgroup_id(memcg); + rcu_read_unlock(); + break; + default: + break; + } + + return matched == filter->matching; +} + +/* + * damos_pa_filter_out - Return true if the page should be filtered out. 
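+ *
+ * That is, the page is filtered out if any of the scheme's filters matches it
+ * in the sense of __damos_pa_filter_out() above, i.e. the page's property
+ * (anon or memcg membership) equals the filter's 'matching' setting.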
+ */ +static bool damos_pa_filter_out(struct damos *scheme, struct page *page) +{ + struct damos_filter *filter; + + damos_for_each_filter(filter, scheme) { + if (__damos_pa_filter_out(filter, page)) + return true; + } + return false; +} + +static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) +{ + unsigned long addr, applied; + LIST_HEAD(page_list); + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + + if (damos_pa_filter_out(s, page)) { + put_page(page); + continue; + } + + ClearPageReferenced(page); + test_and_clear_page_young(page); + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add(&page->lru, &page_list); + put_page(page); + } + applied = reclaim_pages(&page_list); + cond_resched(); + return applied * PAGE_SIZE; +} + +static inline unsigned long damon_pa_mark_accessed_or_deactivate( + struct damon_region *r, struct damos *s, bool mark_accessed) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + + if (damos_pa_filter_out(s, page)) { + put_page(page); + continue; + } + + if (mark_accessed) + mark_page_accessed(page); + else + deactivate_page(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + +static unsigned long damon_pa_mark_accessed(struct damon_region *r, + struct damos *s) +{ + return damon_pa_mark_accessed_or_deactivate(r, s, true); +} + +static unsigned long damon_pa_deactivate_pages(struct damon_region *r, + struct damos *s) +{ + return damon_pa_mark_accessed_or_deactivate(r, s, false); +} + +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pa_pageout(r, scheme); + case DAMOS_LRU_PRIO: + return damon_pa_mark_accessed(r, scheme); + case DAMOS_LRU_DEPRIO: + return damon_pa_deactivate_pages(r, scheme); + case DAMOS_STAT: + break; + default: + /* DAMOS actions that not yet supported by 'paddr'. 
*/ + break; + } + return 0; +} + +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_cold_score(context, r, scheme); + case DAMOS_LRU_PRIO: + return damon_hot_score(context, r, scheme); + case DAMOS_LRU_DEPRIO: + return damon_cold_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + +static int __init damon_pa_initcall(void) +{ + struct damon_operations ops = { + .id = DAMON_OPS_PADDR, + .init = NULL, + .update = NULL, + .prepare_access_checks = damon_pa_prepare_access_checks, + .check_accesses = damon_pa_check_accesses, + .reset_aggregated = NULL, + .target_valid = NULL, + .cleanup = NULL, + .apply_scheme = damon_pa_apply_scheme, + .get_scheme_score = damon_pa_scheme_score, + }; + + return damon_register_ops(&ops); +}; + +subsys_initcall(damon_pa_initcall); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c new file mode 100644 index 0000000000000..8beeb2894f502 --- /dev/null +++ b/mm/damon/reclaim.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based page reclamation + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-reclaim: " fmt + +#include +#include +#include +#include + +#include "modules-common.h" + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_reclaim." + +/* + * Enable or disable DAMON_RECLAIM. + * + * You can enable DAMON_RCLAIM by setting the value of this parameter as ``Y``. + * Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could + * do no real monitoring and reclamation due to the watermarks-based activation + * condition. Refer to below descriptions for the watermarks parameter for + * this. + */ +static bool enabled __read_mostly; + +/* + * Make DAMON_RECLAIM reads the input parameters again, except ``enabled``. + * + * Input parameters that updated while DAMON_RECLAIM is running are not applied + * by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values + * of parametrs except ``enabled`` again. Once the re-reading is done, this + * parameter is set as ``N``. If invalid parameters are found while the + * re-reading, DAMON_RECLAIM will be disabled. + */ +static bool commit_inputs __read_mostly; +module_param(commit_inputs, bool, 0600); + +/* + * Time threshold for cold memory regions identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_RECLAIM + * identifies the region as cold, and reclaims. 120 seconds by default. + */ +static unsigned long min_age __read_mostly = 120000000; +module_param(min_age, ulong, 0600); + +static struct damos_quota damon_reclaim_quota = { + /* use up to 10 ms time, reclaim up to 128 MiB per 1 sec by default */ + .ms = 10, + .sz = 128 * 1024 * 1024, + .reset_interval = 1000, + /* Within the quota, page out older regions first. 
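+	 * Only weight_age below is non-zero, so prioritization under the
+	 * quota is purely by region age.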
*/ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 +}; +DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); + +static struct damos_watermarks damon_reclaim_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 500, /* 50 percent */ + .mid = 400, /* 40 percent */ + .low = 200, /* 20 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks); + +static struct damon_attrs damon_reclaim_mon_attrs = { + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); + +/* + * Start of the target memory region in physical address. + * + * The start physical address of memory region that DAMON_RECLAIM will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of memory region that DAMON_RECLAIM will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * Skip anonymous pages reclamation. + * + * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous + * pages. By default, ``N``. + */ +static bool skip_anon __read_mostly; +module_param(skip_anon, bool, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +static struct damos_stat damon_reclaim_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, + reclaim_tried_regions, reclaimed_regions, quota_exceeds); + +static struct damon_ctx *ctx; +static struct damon_target *target; + +static struct damos *damon_reclaim_new_scheme(void) +{ + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = min_age / + damon_reclaim_mon_attrs.aggr_interval, + .max_age_region = UINT_MAX, + }; + + return damon_new_scheme( + &pattern, + /* page out those, as soon as found */ + DAMOS_PAGEOUT, + /* under the quota. */ + &damon_reclaim_quota, + /* (De)activate this according to the watermarks. 
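+		 * (DAMOS_WMARK_FREE_MEM_RATE with high/mid/low of 50%/40%/20%,
+		 * per damon_reclaim_wmarks above)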
*/ + &damon_reclaim_wmarks); +} + +static int damon_reclaim_apply_parameters(void) +{ + struct damos *scheme; + struct damos_filter *filter; + int err = 0; + + err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); + if (err) + return err; + + /* Will be freed by next 'damon_set_schemes()' below */ + scheme = damon_reclaim_new_scheme(); + if (!scheme) + return -ENOMEM; + if (skip_anon) { + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + if (!filter) { + /* Will be freed by next 'damon_set_schemes()' below */ + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damos_add_filter(scheme, filter); + } + damon_set_schemes(ctx, &scheme, 1); + + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); +} + +static int damon_reclaim_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_reclaim_apply_parameters(); + if (err) + return err; + + err = damon_start(&ctx, 1, true); + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; +} + +static int damon_reclaim_enabled_store(const char *val, + const struct kernel_param *kp) +{ + bool is_enabled = enabled; + bool enable; + int err; + + err = strtobool(val, &enable); + if (err) + return err; + + if (is_enabled == enable) + return 0; + + /* Called before init function. The function will handle this. */ + if (!ctx) + goto set_param_out; + + err = damon_reclaim_turn(enable); + if (err) + return err; + +set_param_out: + enabled = enable; + return err; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_reclaim_enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_RECLAIM (default: disabled)"); + +static int damon_reclaim_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; + + return damon_reclaim_handle_commit_inputs(); +} + +static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) +{ + return damon_reclaim_handle_commit_inputs(); +} + +static int __init damon_reclaim_init(void) +{ + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); + + if (err) + return err; + + ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; + + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_reclaim_turn(true); + + return err; +} + +module_init(damon_reclaim_init); diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c new file mode 100644 index 0000000000000..52bebf242f742 --- /dev/null +++ b/mm/damon/sysfs-common.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include + +#include "sysfs-common.h" + +DEFINE_MUTEX(damon_sysfs_lock); + +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + 
range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return err; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return err; + + range->max = max; + return count; +} + +void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h new file mode 100644 index 0000000000000..604a6cbc3edea --- /dev/null +++ b/mm/damon/sysfs-common.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include +#include + +extern struct mutex damon_sysfs_lock; + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max); +void damon_sysfs_ul_range_release(struct kobject *kobj); + +extern struct kobj_type damon_sysfs_ul_range_ktype; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); + +extern struct kobj_type damon_sysfs_schemes_ktype; + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes); + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); + +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c new file mode 100644 index 
0000000000000..86edca66aab1a --- /dev/null +++ b/mm/damon/sysfs-schemes.c @@ -0,0 +1,1707 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park + */ + +#include + +#include "sysfs-common.h" + +/* + * scheme region directory + */ + +struct damon_sysfs_scheme_region { + struct kobject kobj; + struct damon_addr_range ar; + unsigned int nr_accesses; + unsigned int age; + struct list_head list; +}; + +static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( + struct damon_region *region) +{ + struct damon_sysfs_scheme_region *sysfs_region = kmalloc( + sizeof(*sysfs_region), GFP_KERNEL); + + if (!sysfs_region) + return NULL; + sysfs_region->kobj = (struct kobject){}; + sysfs_region->ar = region->ar; + sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->age = region->age; + INIT_LIST_HEAD(&sysfs_region->list); + return sysfs_region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.start); +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.end); +} + +static ssize_t nr_accesses_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->nr_accesses); +} + +static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->age); +} + +static void damon_sysfs_scheme_region_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + list_del(®ion->list); + kfree(region); +} + +static struct kobj_attribute damon_sysfs_scheme_region_start_attr = + __ATTR_RO_MODE(start, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_end_attr = + __ATTR_RO_MODE(end, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = + __ATTR_RO_MODE(nr_accesses, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_age_attr = + __ATTR_RO_MODE(age, 0400); + +static struct attribute *damon_sysfs_scheme_region_attrs[] = { + &damon_sysfs_scheme_region_start_attr.attr, + &damon_sysfs_scheme_region_end_attr.attr, + &damon_sysfs_scheme_region_nr_accesses_attr.attr, + &damon_sysfs_scheme_region_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); + +static struct kobj_type damon_sysfs_scheme_region_ktype = { + .release = damon_sysfs_scheme_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_region_groups, +}; + +/* + * scheme regions directory + */ + +struct damon_sysfs_scheme_regions { + struct kobject kobj; + struct list_head regions_list; + int nr_regions; +}; + +static struct damon_sysfs_scheme_regions * +damon_sysfs_scheme_regions_alloc(void) +{ + struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), + GFP_KERNEL); + + regions->kobj = (struct kobject){}; + INIT_LIST_HEAD(®ions->regions_list); + regions->nr_regions = 
0; + return regions; +} + +static void damon_sysfs_scheme_regions_rm_dirs( + struct damon_sysfs_scheme_regions *regions) +{ + struct damon_sysfs_scheme_region *r, *next; + + list_for_each_entry_safe(r, next, ®ions->regions_list, list) { + /* release function deletes it from the list */ + kobject_put(&r->kobj); + regions->nr_regions--; + } +} + +static void damon_sysfs_scheme_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); +} + +static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); + +static struct kobj_type damon_sysfs_scheme_regions_ktype = { + .release = damon_sysfs_scheme_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_regions_groups, +}; + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = 
damon_sysfs_stats_groups, +}; + +/* + * filter directory + */ + +struct damon_sysfs_scheme_filter { + struct kobject kobj; + enum damos_filter_type type; + bool matching; + char *memcg_path; +}; + +static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL); +} + +/* Should match with enum damos_filter_type */ +static const char * const damon_sysfs_scheme_filter_type_strs[] = { + "anon", + "memcg", +}; + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_scheme_filter_type_strs[filter->type]); +} + +static ssize_t type_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + enum damos_filter_type type; + ssize_t ret = -EINVAL; + + for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) { + if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[ + type])) { + filter->type = type; + ret = count; + break; + } + } + return ret; +} + +static ssize_t matching_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N'); +} + +static ssize_t matching_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + bool matching; + int err = kstrtobool(buf, &matching); + + if (err) + return err; + + filter->matching = matching; + return count; +} + +static ssize_t memcg_path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%s\n", + filter->memcg_path ? 
filter->memcg_path : ""); +} + +static ssize_t memcg_path_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL); + + if (!path) + return -ENOMEM; + + strscpy(path, buf, count + 1); + filter->memcg_path = path; + return count; +} + +static void damon_sysfs_scheme_filter_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + kfree(filter->memcg_path); + kfree(filter); +} + +static struct kobj_attribute damon_sysfs_scheme_filter_type_attr = + __ATTR_RW_MODE(type, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = + __ATTR_RW_MODE(matching, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = + __ATTR_RW_MODE(memcg_path, 0600); + +static struct attribute *damon_sysfs_scheme_filter_attrs[] = { + &damon_sysfs_scheme_filter_type_attr.attr, + &damon_sysfs_scheme_filter_matching_attr.attr, + &damon_sysfs_scheme_filter_memcg_path_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter); + +static struct kobj_type damon_sysfs_scheme_filter_ktype = { + .release = damon_sysfs_scheme_filter_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_filter_groups, +}; + +/* + * filters directory + */ + +struct damon_sysfs_scheme_filters { + struct kobject kobj; + struct damon_sysfs_scheme_filter **filters_arr; + int nr; +}; + +static struct damon_sysfs_scheme_filters * +damon_sysfs_scheme_filters_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); +} + +static void damon_sysfs_scheme_filters_rm_dirs( + struct damon_sysfs_scheme_filters *filters) +{ + struct damon_sysfs_scheme_filter **filters_arr = filters->filters_arr; + int i; + + for (i = 0; i < filters->nr; i++) + kobject_put(&filters_arr[i]->kobj); + filters->nr = 0; + kfree(filters_arr); + filters->filters_arr = NULL; +} + +static int damon_sysfs_scheme_filters_add_dirs( + struct damon_sysfs_scheme_filters *filters, int nr_filters) +{ + struct damon_sysfs_scheme_filter **filters_arr, *filter; + int err, i; + + damon_sysfs_scheme_filters_rm_dirs(filters); + if (!nr_filters) + return 0; + + filters_arr = kmalloc_array(nr_filters, sizeof(*filters_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!filters_arr) + return -ENOMEM; + filters->filters_arr = filters_arr; + + for (i = 0; i < nr_filters; i++) { + filter = damon_sysfs_scheme_filter_alloc(); + if (!filter) { + damon_sysfs_scheme_filters_rm_dirs(filters); + return -ENOMEM; + } + + err = kobject_init_and_add(&filter->kobj, + &damon_sysfs_scheme_filter_ktype, + &filters->kobj, "%d", i); + if (err) { + kobject_put(&filter->kobj); + damon_sysfs_scheme_filters_rm_dirs(filters); + return err; + } + + filters_arr[i] = filter; + filters->nr++; + } + return 0; +} + +static ssize_t nr_filters_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filters *filters = container_of(kobj, + struct damon_sysfs_scheme_filters, kobj); + + return sysfs_emit(buf, "%d\n", filters->nr); +} + +static ssize_t nr_filters_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filters *filters; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + filters 
= container_of(kobj, struct damon_sysfs_scheme_filters, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_scheme_filters_add_dirs(filters, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_scheme_filters_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_filters, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_filters_nr_attr = + __ATTR_RW_MODE(nr_filters, 0600); + +static struct attribute *damon_sysfs_scheme_filters_attrs[] = { + &damon_sysfs_scheme_filters_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters); + +static struct kobj_type damon_sysfs_scheme_filters_ktype = { + .release = damon_sysfs_scheme_filters_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_filters_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + return err ? 
err : count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + return err ? err : count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + return err ? err : count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + return err ? err : count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = 
container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + return err ? err : count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + return err ? err : count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + return err ? err : count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + &quotas->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put(&quotas->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static ssize_t ms_store(struct kobject *kobj, 
struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, &quotas->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, &quotas->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, &quotas->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + +static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + &damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + 
&access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_scheme_filters *filters; + struct damon_sysfs_stats *stats; + struct damon_sysfs_scheme_regions *tried_regions; +}; + +/* This should match with enum damos_action */ +static const char * const damon_sysfs_damos_action_strs[] = { + "willneed", + "cold", + "pageout", + "hugepage", + "nohugepage", + "lru_prio", + "lru_deprio", + "stat", +}; + +static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( + enum damos_action action) +{ + struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), + GFP_KERNEL); + + if (!scheme) + return NULL; + scheme->kobj = (struct kobject){}; + scheme->action = action; + return scheme; +} + +static int damon_sysfs_scheme_set_access_pattern( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_access_pattern *access_pattern; + int err; + + access_pattern = damon_sysfs_access_pattern_alloc(); + if (!access_pattern) + return -ENOMEM; + err = kobject_init_and_add(&access_pattern->kobj, + &damon_sysfs_access_pattern_ktype, &scheme->kobj, + "access_pattern"); + if (err) + goto out; + err = damon_sysfs_access_pattern_add_dirs(access_pattern); + if (err) + goto out; + scheme->access_pattern = access_pattern; + return 0; + +out: + kobject_put(&access_pattern->kobj); + return err; +} + +static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); + int err; + + if (!quotas) + return -ENOMEM; + err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype, + &scheme->kobj, "quotas"); + if (err) + goto out; + err = damon_sysfs_quotas_add_dirs(quotas); + if (err) + goto out; + scheme->quotas = quotas; + return 0; + +out: + kobject_put(&quotas->kobj); + return err; +} + +static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_watermarks *watermarks = + damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); + int err; + + if (!watermarks) + return -ENOMEM; + err = kobject_init_and_add(&watermarks->kobj, + 
&damon_sysfs_watermarks_ktype, &scheme->kobj, + "watermarks"); + if (err) + kobject_put(&watermarks->kobj); + else + scheme->watermarks = watermarks; + return err; +} + +static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_filters *filters = + damon_sysfs_scheme_filters_alloc(); + int err; + + if (!filters) + return -ENOMEM; + err = kobject_init_and_add(&filters->kobj, + &damon_sysfs_scheme_filters_ktype, &scheme->kobj, + "filters"); + if (err) + kobject_put(&filters->kobj); + else + scheme->filters = filters; + return err; +} + +static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); + int err; + + if (!stats) + return -ENOMEM; + err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, + &scheme->kobj, "stats"); + if (err) + kobject_put(&stats->kobj); + else + scheme->stats = stats; + return err; +} + +static int damon_sysfs_scheme_set_tried_regions( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_regions *tried_regions = + damon_sysfs_scheme_regions_alloc(); + int err; + + if (!tried_regions) + return -ENOMEM; + err = kobject_init_and_add(&tried_regions->kobj, + &damon_sysfs_scheme_regions_ktype, &scheme->kobj, + "tried_regions"); + if (err) + kobject_put(&tried_regions->kobj); + else + scheme->tried_regions = tried_regions; + return err; +} + +static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_access_pattern(scheme); + if (err) + return err; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_access_pattern_out; + err = damon_sysfs_scheme_set_watermarks(scheme); + if (err) + goto put_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_filters(scheme); + if (err) + goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_filters_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_tried_regions(scheme); + if (err) + goto put_tried_regions_out; + return 0; + +put_tried_regions_out: + kobject_put(&scheme->tried_regions->kobj); + scheme->tried_regions = NULL; +put_filters_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->filters->kobj); + scheme->filters = NULL; +put_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->watermarks->kobj); + scheme->watermarks = NULL; +put_quotas_access_pattern_out: + kobject_put(&scheme->quotas->kobj); + scheme->quotas = NULL; +put_access_pattern_out: + kobject_put(&scheme->access_pattern->kobj); + scheme->access_pattern = NULL; + return err; +} + +static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) +{ + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); + kobject_put(&scheme->access_pattern->kobj); + damon_sysfs_quotas_rm_dirs(scheme->quotas); + kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + damon_sysfs_scheme_filters_rm_dirs(scheme->filters); + kobject_put(&scheme->filters->kobj); + kobject_put(&scheme->stats->kobj); + damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); + kobject_put(&scheme->tried_regions->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject 
*kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; +} + +static void damon_sysfs_schemes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); +} + +static struct kobj_attribute damon_sysfs_schemes_nr_attr = + __ATTR_RW_MODE(nr_schemes, 0600); + +static struct attribute *damon_sysfs_schemes_attrs[] = { + &damon_sysfs_schemes_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_schemes); + 
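+/*
+ * Descriptive note: unlike the per-scheme helpers above, the schemes
+ * directory ktype and damon_sysfs_schemes_alloc()/damon_sysfs_schemes_rm_dirs()
+ * are deliberately not static.  They are shared (via "sysfs-common.h") with
+ * mm/damon/sysfs.c, which instantiates the per-context 'schemes' directory in
+ * damon_sysfs_context_set_schemes() below, while the scheme internals stay
+ * private to this file.
+ */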
+struct kobj_type damon_sysfs_schemes_ktype = { + .release = damon_sysfs_schemes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_schemes_groups, +}; + +static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, + char *memcg_path_buf, char *path) +{ +#ifdef CONFIG_MEMCG + cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX); + if (sysfs_streq(memcg_path_buf, path)) + return true; +#endif /* CONFIG_MEMCG */ + return false; +} + +static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) +{ + struct mem_cgroup *memcg; + char *path; + bool found = false; + + if (!memcg_path) + return -EINVAL; + + path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + + for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; + memcg = mem_cgroup_iter(NULL, memcg, NULL)) { + /* skip removed memcg */ + if (!mem_cgroup_id(memcg)) + continue; + if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { + *id = mem_cgroup_id(memcg); + found = true; + break; + } + } + + kfree(path); + return found ? 0 : -EINVAL; +} + +static int damon_sysfs_set_scheme_filters(struct damos *scheme, + struct damon_sysfs_scheme_filters *sysfs_filters) +{ + int i; + struct damos_filter *filter, *next; + + damos_for_each_filter_safe(filter, next, scheme) + damos_destroy_filter(filter); + + for (i = 0; i < sysfs_filters->nr; i++) { + struct damon_sysfs_scheme_filter *sysfs_filter = + sysfs_filters->filters_arr[i]; + struct damos_filter *filter = + damos_new_filter(sysfs_filter->type, + sysfs_filter->matching); + int err; + + if (!filter) + return -ENOMEM; + if (filter->type == DAMOS_FILTER_TYPE_MEMCG) { + err = damon_sysfs_memcg_path_to_id( + sysfs_filter->memcg_path, + &filter->memcg_id); + if (err) { + damos_destroy_filter(filter); + return err; + } + } + damos_add_filter(scheme, filter); + } + return 0; +} + +static struct damos *damon_sysfs_mk_scheme( + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + struct damon_sysfs_scheme_filters *sysfs_filters = + sysfs_scheme->filters; + struct damos *scheme; + int err; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; + struct damos_quota quota = { + .ms = sysfs_quotas->ms, + .sz = sysfs_quotas->sz, + .reset_interval = sysfs_quotas->reset_interval_ms, + .weight_sz = sysfs_weights->sz, + .weight_nr_accesses = sysfs_weights->nr_accesses, + .weight_age = sysfs_weights->age, + }; + struct damos_watermarks wmarks = { + .metric = sysfs_wmarks->metric, + .interval = sysfs_wmarks->interval_us, + .high = sysfs_wmarks->high, + .mid = sysfs_wmarks->mid, + .low = sysfs_wmarks->low, + }; + + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, &quota, + &wmarks); + if (!scheme) + return NULL; + + err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } + return scheme; +} + +static void damon_sysfs_update_scheme(struct damos *scheme, + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct 
damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + int err; + + scheme->pattern.min_sz_region = access_pattern->sz->min; + scheme->pattern.max_sz_region = access_pattern->sz->max; + scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; + scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; + scheme->pattern.min_age_region = access_pattern->age->min; + scheme->pattern.max_age_region = access_pattern->age->max; + + scheme->action = sysfs_scheme->action; + + scheme->quota.ms = sysfs_quotas->ms; + scheme->quota.sz = sysfs_quotas->sz; + scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; + scheme->quota.weight_sz = sysfs_weights->sz; + scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; + scheme->quota.weight_age = sysfs_weights->age; + + scheme->wmarks.metric = sysfs_wmarks->metric; + scheme->wmarks.interval = sysfs_wmarks->interval_us; + scheme->wmarks.high = sysfs_wmarks->high; + scheme->wmarks.mid = sysfs_wmarks->mid; + scheme->wmarks.low = sysfs_wmarks->low; + + err = damon_sysfs_set_scheme_filters(scheme, sysfs_scheme->filters); + if (err) + damon_destroy_scheme(scheme); +} + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes) +{ + struct damos *scheme, *next; + int i = 0; + + damon_for_each_scheme_safe(scheme, next, ctx) { + if (i < sysfs_schemes->nr) + damon_sysfs_update_scheme(scheme, + sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } + + for (; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } +} + +/* + * damon_sysfs_schemes that need to update its schemes regions dir. Protected + * by damon_sysfs_lock + */ +static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; +static int damon_sysfs_schemes_region_idx; + +/* + * DAMON callback that called before damos apply. While this callback is + * registered, damon_sysfs_lock should be held to ensure the regions + * directories exist. 
+ */ +static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos *scheme; + struct damon_sysfs_scheme_regions *sysfs_regions; + struct damon_sysfs_scheme_region *region; + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + if (scheme == s) + break; + schemes_idx++; + } + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + return 0; + + sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + region = damon_sysfs_scheme_region_alloc(r); + list_add_tail(&region->list, &sysfs_regions->regions_list); + sysfs_regions->nr_regions++; + if (kobject_init_and_add(&region->kobj, + &damon_sysfs_scheme_region_ktype, + &sysfs_regions->kobj, "%d", + damon_sysfs_schemes_region_idx++)) { + kobject_put(&region->kobj); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_scheme *sysfs_scheme; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + damon_sysfs_scheme_regions_rm_dirs( + sysfs_scheme->tried_regions); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); + damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + return 0; +} + +/* + * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller + * should unlock damon_sysfs_lock which was held before + * damon_sysfs_schemes_update_regions_start() + */ +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) +{ + damon_sysfs_schemes_for_damos_callback = NULL; + ctx->callback.before_damos_apply = NULL; + damon_sysfs_schemes_region_idx = 0; + return 0; +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c new file mode 100644 index 0000000000000..aeb0beb1da913 --- /dev/null +++ b/mm/damon/sysfs.c @@ -0,0 +1,1795 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park + */ + +#include <linux/pid.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include "sysfs-common.h" + +/* + * init region directory + */ + +struct damon_sysfs_region { + struct kobject kobj; + struct damon_addr_range ar; +}; + +static struct damon_sysfs_region *damon_sysfs_region_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL); +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.start); +} + +static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + int err = kstrtoul(buf, 0, &region->ar.start); + + return err ? 
err : count; +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.end); +} + +static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + int err = kstrtoul(buf, 0, &region->ar.end); + + return err ? err : count; +} + +static void damon_sysfs_region_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_region, kobj)); +} + +static struct kobj_attribute damon_sysfs_region_start_attr = + __ATTR_RW_MODE(start, 0600); + +static struct kobj_attribute damon_sysfs_region_end_attr = + __ATTR_RW_MODE(end, 0600); + +static struct attribute *damon_sysfs_region_attrs[] = { + &damon_sysfs_region_start_attr.attr, + &damon_sysfs_region_end_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_region); + +static struct kobj_type damon_sysfs_region_ktype = { + .release = damon_sysfs_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_region_groups, +}; + +/* + * init_regions directory + */ + +struct damon_sysfs_regions { + struct kobject kobj; + struct damon_sysfs_region **regions_arr; + int nr; +}; + +static struct damon_sysfs_regions *damon_sysfs_regions_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_regions), GFP_KERNEL); +} + +static void damon_sysfs_regions_rm_dirs(struct damon_sysfs_regions *regions) +{ + struct damon_sysfs_region **regions_arr = regions->regions_arr; + int i; + + for (i = 0; i < regions->nr; i++) + kobject_put(&regions_arr[i]->kobj); + regions->nr = 0; + kfree(regions_arr); + regions->regions_arr = NULL; +} + +static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, + int nr_regions) +{ + struct damon_sysfs_region **regions_arr, *region; + int err, i; + + damon_sysfs_regions_rm_dirs(regions); + if (!nr_regions) + return 0; + + regions_arr = kmalloc_array(nr_regions, sizeof(*regions_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!regions_arr) + return -ENOMEM; + regions->regions_arr = regions_arr; + + for (i = 0; i < nr_regions; i++) { + region = damon_sysfs_region_alloc(); + if (!region) { + damon_sysfs_regions_rm_dirs(regions); + return -ENOMEM; + } + + err = kobject_init_and_add(&region->kobj, + &damon_sysfs_region_ktype, &regions->kobj, + "%d", i); + if (err) { + kobject_put(&region->kobj); + damon_sysfs_regions_rm_dirs(regions); + return err; + } + + regions_arr[i] = region; + regions->nr++; + } + return 0; +} + +static ssize_t nr_regions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_regions *regions = container_of(kobj, + struct damon_sysfs_regions, kobj); + + return sysfs_emit(buf, "%d\n", regions->nr); +} + +static ssize_t nr_regions_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_regions *regions; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + regions = container_of(kobj, struct damon_sysfs_regions, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_regions_add_dirs(regions, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_regions, kobj)); 
+} + +static struct kobj_attribute damon_sysfs_regions_nr_attr = + __ATTR_RW_MODE(nr_regions, 0600); + +static struct attribute *damon_sysfs_regions_attrs[] = { + &damon_sysfs_regions_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_regions); + +static struct kobj_type damon_sysfs_regions_ktype = { + .release = damon_sysfs_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_regions_groups, +}; + +/* + * target directory + */ + +struct damon_sysfs_target { + struct kobject kobj; + struct damon_sysfs_regions *regions; + int pid; +}; + +static struct damon_sysfs_target *damon_sysfs_target_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_target), GFP_KERNEL); +} + +static int damon_sysfs_target_add_dirs(struct damon_sysfs_target *target) +{ + struct damon_sysfs_regions *regions = damon_sysfs_regions_alloc(); + int err; + + if (!regions) + return -ENOMEM; + + err = kobject_init_and_add(&regions->kobj, &damon_sysfs_regions_ktype, + &target->kobj, "regions"); + if (err) + kobject_put(&regions->kobj); + else + target->regions = regions; + return err; +} + +static void damon_sysfs_target_rm_dirs(struct damon_sysfs_target *target) +{ + damon_sysfs_regions_rm_dirs(target->regions); + kobject_put(&target->regions->kobj); +} + +static ssize_t pid_target_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + + return sysfs_emit(buf, "%d\n", target->pid); +} + +static ssize_t pid_target_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + int err = kstrtoint(buf, 0, &target->pid); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_target_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_target, kobj)); +} + +static struct kobj_attribute damon_sysfs_target_pid_attr = + __ATTR_RW_MODE(pid_target, 0600); + +static struct attribute *damon_sysfs_target_attrs[] = { + &damon_sysfs_target_pid_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_target); + +static struct kobj_type damon_sysfs_target_ktype = { + .release = damon_sysfs_target_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_target_groups, +}; + +/* + * targets directory + */ + +struct damon_sysfs_targets { + struct kobject kobj; + struct damon_sysfs_target **targets_arr; + int nr; +}; + +static struct damon_sysfs_targets *damon_sysfs_targets_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_targets), GFP_KERNEL); +} + +static void damon_sysfs_targets_rm_dirs(struct damon_sysfs_targets *targets) +{ + struct damon_sysfs_target **targets_arr = targets->targets_arr; + int i; + + for (i = 0; i < targets->nr; i++) { + damon_sysfs_target_rm_dirs(targets_arr[i]); + kobject_put(&targets_arr[i]->kobj); + } + targets->nr = 0; + kfree(targets_arr); + targets->targets_arr = NULL; +} + +static int damon_sysfs_targets_add_dirs(struct damon_sysfs_targets *targets, + int nr_targets) +{ + struct damon_sysfs_target **targets_arr, *target; + int err, i; + + damon_sysfs_targets_rm_dirs(targets); + if (!nr_targets) + return 0; + + targets_arr = kmalloc_array(nr_targets, sizeof(*targets_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!targets_arr) + return -ENOMEM; + targets->targets_arr = targets_arr; + + for (i = 0; i < nr_targets; i++) { + target = damon_sysfs_target_alloc(); + if (!target) { + 
damon_sysfs_targets_rm_dirs(targets); + return -ENOMEM; + } + + err = kobject_init_and_add(&target->kobj, + &damon_sysfs_target_ktype, &targets->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_target_add_dirs(target); + if (err) + goto out; + + targets_arr[i] = target; + targets->nr++; + } + return 0; + +out: + damon_sysfs_targets_rm_dirs(targets); + kobject_put(&target->kobj); + return err; +} + +static ssize_t nr_targets_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_targets *targets = container_of(kobj, + struct damon_sysfs_targets, kobj); + + return sysfs_emit(buf, "%d\n", targets->nr); +} + +static ssize_t nr_targets_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_targets *targets; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + targets = container_of(kobj, struct damon_sysfs_targets, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_targets_add_dirs(targets, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_targets_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_targets, kobj)); +} + +static struct kobj_attribute damon_sysfs_targets_nr_attr = + __ATTR_RW_MODE(nr_targets, 0600); + +static struct attribute *damon_sysfs_targets_attrs[] = { + &damon_sysfs_targets_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_targets); + +static struct kobj_type damon_sysfs_targets_ktype = { + .release = damon_sysfs_targets_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_targets_groups, +}; + +/* + * intervals directory + */ + +struct damon_sysfs_intervals { + struct kobject kobj; + unsigned long sample_us; + unsigned long aggr_us; + unsigned long update_us; +}; + +static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc( + unsigned long sample_us, unsigned long aggr_us, + unsigned long update_us) +{ + struct damon_sysfs_intervals *intervals = kmalloc(sizeof(*intervals), + GFP_KERNEL); + + if (!intervals) + return NULL; + + intervals->kobj = (struct kobject){}; + intervals->sample_us = sample_us; + intervals->aggr_us = aggr_us; + intervals->update_us = update_us; + return intervals; +} + +static ssize_t sample_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->sample_us); +} + +static ssize_t sample_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return err; + + intervals->sample_us = us; + return count; +} + +static ssize_t aggr_us_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->aggr_us); +} + +static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return err; + + 
intervals->aggr_us = us; + return count; +} + +static ssize_t update_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->update_us); +} + +static ssize_t update_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return err; + + intervals->update_us = us; + return count; +} + +static void damon_sysfs_intervals_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_intervals, kobj)); +} + +static struct kobj_attribute damon_sysfs_intervals_sample_us_attr = + __ATTR_RW_MODE(sample_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_aggr_us_attr = + __ATTR_RW_MODE(aggr_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_update_us_attr = + __ATTR_RW_MODE(update_us, 0600); + +static struct attribute *damon_sysfs_intervals_attrs[] = { + &damon_sysfs_intervals_sample_us_attr.attr, + &damon_sysfs_intervals_aggr_us_attr.attr, + &damon_sysfs_intervals_update_us_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_intervals); + +static struct kobj_type damon_sysfs_intervals_ktype = { + .release = damon_sysfs_intervals_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_intervals_groups, +}; + +/* + * monitoring_attrs directory + */ + +struct damon_sysfs_attrs { + struct kobject kobj; + struct damon_sysfs_intervals *intervals; + struct damon_sysfs_ul_range *nr_regions_range; +}; + +static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void) +{ + struct damon_sysfs_attrs *attrs = kmalloc(sizeof(*attrs), GFP_KERNEL); + + if (!attrs) + return NULL; + attrs->kobj = (struct kobject){}; + return attrs; +} + +static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) +{ + struct damon_sysfs_intervals *intervals; + struct damon_sysfs_ul_range *nr_regions_range; + int err; + + intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000); + if (!intervals) + return -ENOMEM; + + err = kobject_init_and_add(&intervals->kobj, + &damon_sysfs_intervals_ktype, &attrs->kobj, + "intervals"); + if (err) + goto put_intervals_out; + attrs->intervals = intervals; + + nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000); + if (!nr_regions_range) { + err = -ENOMEM; + goto put_intervals_out; + } + + err = kobject_init_and_add(&nr_regions_range->kobj, + &damon_sysfs_ul_range_ktype, &attrs->kobj, + "nr_regions"); + if (err) + goto put_nr_regions_intervals_out; + attrs->nr_regions_range = nr_regions_range; + return 0; + +put_nr_regions_intervals_out: + kobject_put(&nr_regions_range->kobj); + attrs->nr_regions_range = NULL; +put_intervals_out: + kobject_put(&intervals->kobj); + attrs->intervals = NULL; + return err; +} + +static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) +{ + kobject_put(&attrs->nr_regions_range->kobj); + kobject_put(&attrs->intervals->kobj); +} + +static void damon_sysfs_attrs_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_attrs, kobj)); +} + +static struct attribute *damon_sysfs_attrs_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_attrs); + +static struct kobj_type damon_sysfs_attrs_ktype = { + .release = damon_sysfs_attrs_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = 
damon_sysfs_attrs_groups, +}; + +/* + * context directory + */ + +/* This should match with enum damon_ops_id */ +static const char * const damon_sysfs_ops_strs[] = { + "vaddr", + "fvaddr", + "paddr", +}; + +struct damon_sysfs_context { + struct kobject kobj; + enum damon_ops_id ops_id; + struct damon_sysfs_attrs *attrs; + struct damon_sysfs_targets *targets; + struct damon_sysfs_schemes *schemes; +}; + +static struct damon_sysfs_context *damon_sysfs_context_alloc( + enum damon_ops_id ops_id) +{ + struct damon_sysfs_context *context = kmalloc(sizeof(*context), + GFP_KERNEL); + + if (!context) + return NULL; + context->kobj = (struct kobject){}; + context->ops_id = ops_id; + return context; +} + +static int damon_sysfs_context_set_attrs(struct damon_sysfs_context *context) +{ + struct damon_sysfs_attrs *attrs = damon_sysfs_attrs_alloc(); + int err; + + if (!attrs) + return -ENOMEM; + err = kobject_init_and_add(&attrs->kobj, &damon_sysfs_attrs_ktype, + &context->kobj, "monitoring_attrs"); + if (err) + goto out; + err = damon_sysfs_attrs_add_dirs(attrs); + if (err) + goto out; + context->attrs = attrs; + return 0; + +out: + kobject_put(&attrs->kobj); + return err; +} + +static int damon_sysfs_context_set_targets(struct damon_sysfs_context *context) +{ + struct damon_sysfs_targets *targets = damon_sysfs_targets_alloc(); + int err; + + if (!targets) + return -ENOMEM; + err = kobject_init_and_add(&targets->kobj, &damon_sysfs_targets_ktype, + &context->kobj, "targets"); + if (err) { + kobject_put(&targets->kobj); + return err; + } + context->targets = targets; + return 0; +} + +static int damon_sysfs_context_set_schemes(struct damon_sysfs_context *context) +{ + struct damon_sysfs_schemes *schemes = damon_sysfs_schemes_alloc(); + int err; + + if (!schemes) + return -ENOMEM; + err = kobject_init_and_add(&schemes->kobj, &damon_sysfs_schemes_ktype, + &context->kobj, "schemes"); + if (err) { + kobject_put(&schemes->kobj); + return err; + } + context->schemes = schemes; + return 0; +} + +static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) +{ + int err; + + err = damon_sysfs_context_set_attrs(context); + if (err) + return err; + + err = damon_sysfs_context_set_targets(context); + if (err) + goto put_attrs_out; + + err = damon_sysfs_context_set_schemes(context); + if (err) + goto put_targets_attrs_out; + return 0; + +put_targets_attrs_out: + kobject_put(&context->targets->kobj); + context->targets = NULL; +put_attrs_out: + kobject_put(&context->attrs->kobj); + context->attrs = NULL; + return err; +} + +static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) +{ + damon_sysfs_attrs_rm_dirs(context->attrs); + kobject_put(&context->attrs->kobj); + damon_sysfs_targets_rm_dirs(context->targets); + kobject_put(&context->targets->kobj); + damon_sysfs_schemes_rm_dirs(context->schemes); + kobject_put(&context->schemes->kobj); +} + +static ssize_t avail_operations_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + enum damon_ops_id id; + int len = 0; + + for (id = 0; id < NR_DAMON_OPS; id++) { + if (!damon_is_registered_ops(id)) + continue; + len += sysfs_emit_at(buf, len, "%s\n", + damon_sysfs_ops_strs[id]); + } + return len; +} + +static ssize_t operations_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]); +} + +static ssize_t operations_store(struct 
kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + enum damon_ops_id id; + + for (id = 0; id < NR_DAMON_OPS; id++) { + if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { + context->ops_id = id; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_context_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_context, kobj)); +} + +static struct kobj_attribute damon_sysfs_context_avail_operations_attr = + __ATTR_RO_MODE(avail_operations, 0400); + +static struct kobj_attribute damon_sysfs_context_operations_attr = + __ATTR_RW_MODE(operations, 0600); + +static struct attribute *damon_sysfs_context_attrs[] = { + &damon_sysfs_context_avail_operations_attr.attr, + &damon_sysfs_context_operations_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_context); + +static struct kobj_type damon_sysfs_context_ktype = { + .release = damon_sysfs_context_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_context_groups, +}; + +/* + * contexts directory + */ + +struct damon_sysfs_contexts { + struct kobject kobj; + struct damon_sysfs_context **contexts_arr; + int nr; +}; + +static struct damon_sysfs_contexts *damon_sysfs_contexts_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_contexts), GFP_KERNEL); +} + +static void damon_sysfs_contexts_rm_dirs(struct damon_sysfs_contexts *contexts) +{ + struct damon_sysfs_context **contexts_arr = contexts->contexts_arr; + int i; + + for (i = 0; i < contexts->nr; i++) { + damon_sysfs_context_rm_dirs(contexts_arr[i]); + kobject_put(&contexts_arr[i]->kobj); + } + contexts->nr = 0; + kfree(contexts_arr); + contexts->contexts_arr = NULL; +} + +static int damon_sysfs_contexts_add_dirs(struct damon_sysfs_contexts *contexts, + int nr_contexts) +{ + struct damon_sysfs_context **contexts_arr, *context; + int err, i; + + damon_sysfs_contexts_rm_dirs(contexts); + if (!nr_contexts) + return 0; + + contexts_arr = kmalloc_array(nr_contexts, sizeof(*contexts_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!contexts_arr) + return -ENOMEM; + contexts->contexts_arr = contexts_arr; + + for (i = 0; i < nr_contexts; i++) { + context = damon_sysfs_context_alloc(DAMON_OPS_VADDR); + if (!context) { + damon_sysfs_contexts_rm_dirs(contexts); + return -ENOMEM; + } + + err = kobject_init_and_add(&context->kobj, + &damon_sysfs_context_ktype, &contexts->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_context_add_dirs(context); + if (err) + goto out; + + contexts_arr[i] = context; + contexts->nr++; + } + return 0; + +out: + damon_sysfs_contexts_rm_dirs(contexts); + kobject_put(&context->kobj); + return err; +} + +static ssize_t nr_contexts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_contexts *contexts = container_of(kobj, + struct damon_sysfs_contexts, kobj); + + return sysfs_emit(buf, "%d\n", contexts->nr); +} + +static ssize_t nr_contexts_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_contexts *contexts; + int nr, err; + + err = kstrtoint(buf, 0, &nr); + if (err) + return err; + /* TODO: support multiple contexts per kdamond */ + if (nr < 0 || 1 < nr) + return -EINVAL; + + contexts = container_of(kobj, struct damon_sysfs_contexts, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_contexts_add_dirs(contexts, nr); + 
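+	/*
+	 * Descriptive note: the context directories were added or removed
+	 * above while damon_sysfs_lock was held, so this cannot race with
+	 * other sysfs writers that serialize on the same lock.
+	 */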
mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_contexts_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_contexts, kobj)); +} + +static struct kobj_attribute damon_sysfs_contexts_nr_attr + = __ATTR_RW_MODE(nr_contexts, 0600); + +static struct attribute *damon_sysfs_contexts_attrs[] = { + &damon_sysfs_contexts_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_contexts); + +static struct kobj_type damon_sysfs_contexts_ktype = { + .release = damon_sysfs_contexts_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_contexts_groups, +}; + +/* + * kdamond directory + */ + +struct damon_sysfs_kdamond { + struct kobject kobj; + struct damon_sysfs_contexts *contexts; + struct damon_ctx *damon_ctx; +}; + +static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_kdamond), GFP_KERNEL); +} + +static int damon_sysfs_kdamond_add_dirs(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_sysfs_contexts *contexts; + int err; + + contexts = damon_sysfs_contexts_alloc(); + if (!contexts) + return -ENOMEM; + + err = kobject_init_and_add(&contexts->kobj, + &damon_sysfs_contexts_ktype, &kdamond->kobj, + "contexts"); + if (err) { + kobject_put(&contexts->kobj); + return err; + } + kdamond->contexts = contexts; + + return err; +} + +static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond) +{ + damon_sysfs_contexts_rm_dirs(kdamond->contexts); + kobject_put(&kdamond->contexts->kobj); +} + +static bool damon_sysfs_ctx_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + return running; +} + +/* + * enum damon_sysfs_cmd - Commands for a specific kdamond. + */ +enum damon_sysfs_cmd { + /* @DAMON_SYSFS_CMD_ON: Turn the kdamond on. */ + DAMON_SYSFS_CMD_ON, + /* @DAMON_SYSFS_CMD_OFF: Turn the kdamond off. */ + DAMON_SYSFS_CMD_OFF, + /* @DAMON_SYSFS_CMD_COMMIT: Update kdamond inputs. */ + DAMON_SYSFS_CMD_COMMIT, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs + * files. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried + * regions + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried + * regions + */ + DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, + /* + * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. + */ + NR_DAMON_SYSFS_CMDS, +}; + +/* Should match with enum damon_sysfs_cmd */ +static const char * const damon_sysfs_cmd_strs[] = { + "on", + "off", + "commit", + "update_schemes_stats", + "update_schemes_tried_regions", + "clear_schemes_tried_regions", +}; + +/* + * struct damon_sysfs_cmd_request - A request to the DAMON callback. + * @cmd: The command that needs to be handled by the callback. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This structure represents a sysfs command request that need to access some + * DAMON context-internal data. Because DAMON context-internal data can be + * safely accessed from DAMON callbacks without additional synchronization, the + * request will be handled by the DAMON callback. None-``NULL`` @kdamond means + * the request is valid. + */ +struct damon_sysfs_cmd_request { + enum damon_sysfs_cmd cmd; + struct damon_sysfs_kdamond *kdamond; +}; + +/* Current DAMON callback request. 
Protected by damon_sysfs_lock. */ +static struct damon_sysfs_cmd_request damon_sysfs_cmd_request; + +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + struct damon_ctx *ctx = kdamond->damon_ctx; + bool running; + + if (!ctx) + running = false; + else + running = damon_sysfs_ctx_running(ctx); + + return sysfs_emit(buf, "%s\n", running ? + damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : + damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); +} + +static int damon_sysfs_set_attrs(struct damon_ctx *ctx, + struct damon_sysfs_attrs *sys_attrs) +{ + struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; + struct damon_sysfs_ul_range *sys_nr_regions = + sys_attrs->nr_regions_range; + struct damon_attrs attrs = { + .sample_interval = sys_intervals->sample_us, + .aggr_interval = sys_intervals->aggr_us, + .ops_update_interval = sys_intervals->update_us, + .min_nr_regions = sys_nr_regions->min, + .max_nr_regions = sys_nr_regions->max, + }; + return damon_set_attrs(ctx, &attrs); +} + +static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + bool has_pid = damon_target_has_pid(ctx); + + damon_for_each_target_safe(t, next, ctx) { + if (has_pid) + put_pid(t->pid); + damon_destroy_target(t); + } +} + +static int damon_sysfs_set_regions(struct damon_target *t, + struct damon_sysfs_regions *sysfs_regions) +{ + struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, + sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); + int i, err = -EINVAL; + + if (!ranges) + return -ENOMEM; + for (i = 0; i < sysfs_regions->nr; i++) { + struct damon_sysfs_region *sys_region = + sysfs_regions->regions_arr[i]; + + if (sys_region->ar.start > sys_region->ar.end) + goto out; + + ranges[i].start = sys_region->ar.start; + ranges[i].end = sys_region->ar.end; + if (i == 0) + continue; + if (ranges[i - 1].end > ranges[i].start) + goto out; + } + err = damon_set_regions(t, ranges, sysfs_regions->nr); +out: + kfree(ranges); + return err; + +} + +static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, + struct damon_ctx *ctx) +{ + struct damon_target *t = damon_new_target(); + int err = -EINVAL; + + if (!t) + return -ENOMEM; + damon_add_target(ctx, t); + if (damon_target_has_pid(ctx)) { + t->pid = find_get_pid(sys_target->pid); + if (!t->pid) + goto destroy_targets_out; + } + err = damon_sysfs_set_regions(t, sys_target->regions); + if (err) + goto destroy_targets_out; + return 0; + +destroy_targets_out: + damon_sysfs_destroy_targets(ctx); + return err; +} + +/* + * Search a target in a context that corresponds to the sysfs target input. + * + * Return: pointer to the target if found, NULL if not found, or negative + * error code if the search failed. 
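+ *
+ * Note that for contexts using an operations set that has no pid ('paddr'),
+ * at most one target can exist, so the first target, if any, is returned.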
+ */
+static struct damon_target *damon_sysfs_existing_target(
+		struct damon_sysfs_target *sys_target, struct damon_ctx *ctx)
+{
+	struct pid *pid;
+	struct damon_target *t;
+
+	if (!damon_target_has_pid(ctx)) {
+		/* Up to only one target for paddr could exist */
+		damon_for_each_target(t, ctx)
+			return t;
+		return NULL;
+	}
+
+	/* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */
+	pid = find_get_pid(sys_target->pid);
+	if (!pid)
+		return ERR_PTR(-EINVAL);
+	damon_for_each_target(t, ctx) {
+		if (t->pid == pid) {
+			put_pid(pid);
+			return t;
+		}
+	}
+	put_pid(pid);
+	return NULL;
+}
+
+static int damon_sysfs_set_targets(struct damon_ctx *ctx,
+		struct damon_sysfs_targets *sysfs_targets)
+{
+	int i, err;
+
+	/* Multiple physical address space monitoring targets make no sense */
+	if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1)
+		return -EINVAL;
+
+	for (i = 0; i < sysfs_targets->nr; i++) {
+		struct damon_sysfs_target *st = sysfs_targets->targets_arr[i];
+		struct damon_target *t = damon_sysfs_existing_target(st, ctx);
+
+		if (IS_ERR(t))
+			return PTR_ERR(t);
+		if (!t)
+			err = damon_sysfs_add_target(st, ctx);
+		else
+			err = damon_sysfs_set_regions(t, st->regions);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
+{
+	struct damon_target *t, *next;
+	struct damon_sysfs_kdamond *kdamond;
+
+	/* damon_sysfs_schemes_update_regions_stop() might not have been called yet */
+	kdamond = damon_sysfs_cmd_request.kdamond;
+	if (kdamond && damon_sysfs_cmd_request.cmd ==
+			DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS &&
+			ctx == kdamond->damon_ctx) {
+		damon_sysfs_schemes_update_regions_stop(ctx);
+		mutex_unlock(&damon_sysfs_lock);
+	}
+
+	if (!damon_target_has_pid(ctx))
+		return;
+
+	mutex_lock(&ctx->kdamond_lock);
+	damon_for_each_target_safe(t, next, ctx) {
+		put_pid(t->pid);
+		damon_destroy_target(t);
+	}
+	mutex_unlock(&ctx->kdamond_lock);
+}
+
+/*
+ * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
+ * @kdamond:	The kobject wrapper associated with the kdamond thread.
+ *
+ * This function reads the schemes stats of a specific kdamond and updates the
+ * related sysfs files' values.  This function should be called from DAMON
+ * callbacks while holding ``damon_sysfs_lock``, to safely access the DAMON
+ * context-internal data and DAMON sysfs variables.
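+ *
+ * Because each kdamond currently supports only a single context, the stats
+ * are read into the schemes directories of the first (and only) context
+ * directory of @kdamond.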
+ */ +static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + damon_sysfs_schemes_update_stats( + kdamond->contexts->contexts_arr[0]->schemes, ctx); + return 0; +} + +static int damon_sysfs_upd_schemes_regions_start( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_start( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + +static int damon_sysfs_upd_schemes_regions_stop( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_stop(ctx); +} + +static int damon_sysfs_clear_schemes_regions( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + +static inline bool damon_sysfs_kdamond_running( + struct damon_sysfs_kdamond *kdamond) +{ + return kdamond->damon_ctx && + damon_sysfs_ctx_running(kdamond->damon_ctx); +} + +static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, + struct damon_sysfs_context *sys_ctx) +{ + int err; + + err = damon_select_ops(ctx, sys_ctx->ops_id); + if (err) + return err; + err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + return err; + err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + return err; + return damon_sysfs_set_schemes(ctx, sys_ctx->schemes); +} + +/* + * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * If the sysfs input is wrong, the kdamond will be terminated. + */ +static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) +{ + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + /* TODO: Support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + return damon_sysfs_apply_inputs(kdamond->damon_ctx, + kdamond->contexts->contexts_arr[0]); +} + +/* + * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. + * @c: The DAMON context of the callback. + * + * This function is periodically called back from the kdamond thread for @c. + * Then, it checks if there is a waiting DAMON sysfs request and handles it. 
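+ *
+ * Most commands are handled in a single invocation.  The
+ * 'update_schemes_tried_regions' command is an exception: the first
+ * invocation starts the regions update and keeps holding 'damon_sysfs_lock',
+ * and a subsequent invocation stops the update, marks the request as
+ * handled, and releases the lock.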
+ */ +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) +{ + struct damon_sysfs_kdamond *kdamond; + static bool damon_sysfs_schemes_regions_updating; + int err = 0; + + /* avoid deadlock due to concurrent state_store('off') */ + if (!damon_sysfs_schemes_regions_updating && + !mutex_trylock(&damon_sysfs_lock)) + return 0; + kdamond = damon_sysfs_cmd_request.kdamond; + if (!kdamond || kdamond->damon_ctx != c) + goto out; + switch (damon_sysfs_cmd_request.cmd) { + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + err = damon_sysfs_upd_schemes_stats(kdamond); + break; + case DAMON_SYSFS_CMD_COMMIT: + err = damon_sysfs_commit_input(kdamond); + break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + if (!damon_sysfs_schemes_regions_updating) { + err = damon_sysfs_upd_schemes_regions_start(kdamond); + if (!err) { + damon_sysfs_schemes_regions_updating = true; + goto keep_lock_out; + } + } else { + err = damon_sysfs_upd_schemes_regions_stop(kdamond); + damon_sysfs_schemes_regions_updating = false; + } + break; + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + err = damon_sysfs_clear_schemes_regions(kdamond); + break; + default: + break; + } + /* Mark the request as invalid now. */ + damon_sysfs_cmd_request.kdamond = NULL; +out: + if (!damon_sysfs_schemes_regions_updating) + mutex_unlock(&damon_sysfs_lock); +keep_lock_out: + return err; +} + +static struct damon_ctx *damon_sysfs_build_ctx( + struct damon_sysfs_context *sys_ctx) +{ + struct damon_ctx *ctx = damon_new_ctx(); + int err; + + if (!ctx) + return ERR_PTR(-ENOMEM); + + err = damon_sysfs_apply_inputs(ctx, sys_ctx); + if (err) { + damon_destroy_ctx(ctx); + return ERR_PTR(err); + } + + ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; + ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; + ctx->callback.before_terminate = damon_sysfs_before_terminate; + return ctx; +} + +static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx; + int err; + + if (damon_sysfs_kdamond_running(kdamond)) + return -EBUSY; + if (damon_sysfs_cmd_request.kdamond == kdamond) + return -EBUSY; + /* TODO: support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + if (kdamond->damon_ctx) + damon_destroy_ctx(kdamond->damon_ctx); + kdamond->damon_ctx = NULL; + + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + err = damon_start(&ctx, 1, false); + if (err) { + damon_destroy_ctx(ctx); + return err; + } + kdamond->damon_ctx = ctx; + return err; +} + +static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) +{ + if (!kdamond->damon_ctx) + return -EINVAL; + return damon_stop(&kdamond->damon_ctx, 1); + /* + * To allow users show final monitoring results of already turned-off + * DAMON, we free kdamond->damon_ctx in next + * damon_sysfs_turn_damon_on(), or kdamonds_nr_store() + */ +} + +/* + * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. + * @cmd: The command to handle. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * This function handles a DAMON sysfs command for a kdamond. For commands + * that need to access running DAMON context-internal data, it requests + * handling of the command to the DAMON callback + * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled, + * or the context is completed. + * + * Return: 0 on success, negative error code otherwise. 
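+ *
+ * As an illustration (paths are relative to the 'admin/kdamonds' directory
+ * that this file creates under /sys/kernel/mm/damon/):
+ *
+ *    # echo 1 > nr_kdamonds
+ *    # echo 1 > 0/contexts/nr_contexts
+ *    # echo on > 0/state
+ *    # echo update_schemes_stats > 0/state
+ *
+ * The 'on' write is served directly by damon_sysfs_turn_damon_on(), while
+ * 'update_schemes_stats' is queued in 'damon_sysfs_cmd_request' and waited
+ * for until damon_sysfs_cmd_request_callback() serves it.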
+ */ +static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, + struct damon_sysfs_kdamond *kdamond) +{ + bool need_wait = true; + + /* Handle commands that doesn't access DAMON context-internal data */ + switch (cmd) { + case DAMON_SYSFS_CMD_ON: + return damon_sysfs_turn_damon_on(kdamond); + case DAMON_SYSFS_CMD_OFF: + return damon_sysfs_turn_damon_off(kdamond); + default: + break; + } + + /* Pass the command to DAMON callback for safe DAMON context access */ + if (damon_sysfs_cmd_request.kdamond) + return -EBUSY; + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + damon_sysfs_cmd_request.cmd = cmd; + damon_sysfs_cmd_request.kdamond = kdamond; + + /* + * wait until damon_sysfs_cmd_request_callback() handles the request + * from kdamond context + */ + mutex_unlock(&damon_sysfs_lock); + while (need_wait) { + schedule_timeout_idle(msecs_to_jiffies(100)); + if (!mutex_trylock(&damon_sysfs_lock)) + continue; + if (!damon_sysfs_cmd_request.kdamond) { + /* damon_sysfs_cmd_request_callback() handled */ + need_wait = false; + } else if (!damon_sysfs_kdamond_running(kdamond)) { + /* kdamond has already finished */ + need_wait = false; + damon_sysfs_cmd_request.kdamond = NULL; + } + mutex_unlock(&damon_sysfs_lock); + } + mutex_lock(&damon_sysfs_lock); + return 0; +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + enum damon_sysfs_cmd cmd; + ssize_t ret = -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + for (cmd = 0; cmd < NR_DAMON_SYSFS_CMDS; cmd++) { + if (sysfs_streq(buf, damon_sysfs_cmd_strs[cmd])) { + ret = damon_sysfs_handle_cmd(cmd, kdamond); + break; + } + } + mutex_unlock(&damon_sysfs_lock); + if (!ret) + ret = count; + return ret; +} + +static ssize_t pid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + struct damon_ctx *ctx; + int pid = -1; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + ctx = kdamond->damon_ctx; + if (!ctx) + goto out; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + pid = ctx->kdamond->pid; + mutex_unlock(&ctx->kdamond_lock); +out: + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%d\n", pid); +} + +static void damon_sysfs_kdamond_release(struct kobject *kobj) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + + if (kdamond->damon_ctx) + damon_destroy_ctx(kdamond->damon_ctx); + kfree(kdamond); +} + +static struct kobj_attribute damon_sysfs_kdamond_state_attr = + __ATTR_RW_MODE(state, 0600); + +static struct kobj_attribute damon_sysfs_kdamond_pid_attr = + __ATTR_RO_MODE(pid, 0400); + +static struct attribute *damon_sysfs_kdamond_attrs[] = { + &damon_sysfs_kdamond_state_attr.attr, + &damon_sysfs_kdamond_pid_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_kdamond); + +static struct kobj_type damon_sysfs_kdamond_ktype = { + .release = damon_sysfs_kdamond_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_kdamond_groups, +}; + +/* + * kdamonds directory + */ + +struct damon_sysfs_kdamonds { + struct kobject kobj; + struct damon_sysfs_kdamond **kdamonds_arr; + int nr; +}; + +static struct damon_sysfs_kdamonds *damon_sysfs_kdamonds_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_kdamonds), GFP_KERNEL); +} + +static void 
damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) +{ + struct damon_sysfs_kdamond **kdamonds_arr = kdamonds->kdamonds_arr; + int i; + + for (i = 0; i < kdamonds->nr; i++) { + damon_sysfs_kdamond_rm_dirs(kdamonds_arr[i]); + kobject_put(&kdamonds_arr[i]->kobj); + } + kdamonds->nr = 0; + kfree(kdamonds_arr); + kdamonds->kdamonds_arr = NULL; +} + +static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds, + int nr_kdamonds) +{ + int i; + + for (i = 0; i < nr_kdamonds; i++) { + if (damon_sysfs_kdamond_running(kdamonds[i]) || + damon_sysfs_cmd_request.kdamond == kdamonds[i]) + return true; + } + + return false; +} + +static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, + int nr_kdamonds) +{ + struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; + int err, i; + + if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr)) + return -EBUSY; + + damon_sysfs_kdamonds_rm_dirs(kdamonds); + if (!nr_kdamonds) + return 0; + + kdamonds_arr = kmalloc_array(nr_kdamonds, sizeof(*kdamonds_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!kdamonds_arr) + return -ENOMEM; + kdamonds->kdamonds_arr = kdamonds_arr; + + for (i = 0; i < nr_kdamonds; i++) { + kdamond = damon_sysfs_kdamond_alloc(); + if (!kdamond) { + damon_sysfs_kdamonds_rm_dirs(kdamonds); + return -ENOMEM; + } + + err = kobject_init_and_add(&kdamond->kobj, + &damon_sysfs_kdamond_ktype, &kdamonds->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_kdamond_add_dirs(kdamond); + if (err) + goto out; + + kdamonds_arr[i] = kdamond; + kdamonds->nr++; + } + return 0; + +out: + damon_sysfs_kdamonds_rm_dirs(kdamonds); + kobject_put(&kdamond->kobj); + return err; +} + +static ssize_t nr_kdamonds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, + struct damon_sysfs_kdamonds, kobj); + + return sysfs_emit(buf, "%d\n", kdamonds->nr); +} + +static ssize_t nr_kdamonds_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_kdamonds *kdamonds; + int nr, err; + + err = kstrtoint(buf, 0, &nr); + if (err) + return err; + if (nr < 0) + return -EINVAL; + + kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_kdamonds_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_kdamonds, kobj)); +} + +static struct kobj_attribute damon_sysfs_kdamonds_nr_attr = + __ATTR_RW_MODE(nr_kdamonds, 0600); + +static struct attribute *damon_sysfs_kdamonds_attrs[] = { + &damon_sysfs_kdamonds_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_kdamonds); + +static struct kobj_type damon_sysfs_kdamonds_ktype = { + .release = damon_sysfs_kdamonds_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_kdamonds_groups, +}; + +/* + * damon user interface directory + */ + +struct damon_sysfs_ui_dir { + struct kobject kobj; + struct damon_sysfs_kdamonds *kdamonds; +}; + +static struct damon_sysfs_ui_dir *damon_sysfs_ui_dir_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_ui_dir), GFP_KERNEL); +} + +static int damon_sysfs_ui_dir_add_dirs(struct damon_sysfs_ui_dir *ui_dir) +{ + struct damon_sysfs_kdamonds *kdamonds; + int err; + + kdamonds = damon_sysfs_kdamonds_alloc(); + if (!kdamonds) + return 
-ENOMEM; + + err = kobject_init_and_add(&kdamonds->kobj, + &damon_sysfs_kdamonds_ktype, &ui_dir->kobj, + "kdamonds"); + if (err) { + kobject_put(&kdamonds->kobj); + return err; + } + ui_dir->kdamonds = kdamonds; + return err; +} + +static void damon_sysfs_ui_dir_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ui_dir, kobj)); +} + +static struct attribute *damon_sysfs_ui_dir_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ui_dir); + +static struct kobj_type damon_sysfs_ui_dir_ktype = { + .release = damon_sysfs_ui_dir_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ui_dir_groups, +}; + +static int __init damon_sysfs_init(void) +{ + struct kobject *damon_sysfs_root; + struct damon_sysfs_ui_dir *admin; + int err; + + damon_sysfs_root = kobject_create_and_add("damon", mm_kobj); + if (!damon_sysfs_root) + return -ENOMEM; + + admin = damon_sysfs_ui_dir_alloc(); + if (!admin) { + kobject_put(damon_sysfs_root); + return -ENOMEM; + } + err = kobject_init_and_add(&admin->kobj, &damon_sysfs_ui_dir_ktype, + damon_sysfs_root, "admin"); + if (err) + goto out; + err = damon_sysfs_ui_dir_add_dirs(admin); + if (err) + goto out; + return 0; + +out: + kobject_put(&admin->kobj); + kobject_put(damon_sysfs_root); + return err; +} +subsys_initcall(damon_sysfs_init); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h new file mode 100644 index 0000000000000..d4f55f3491007 --- /dev/null +++ b/mm/damon/vaddr-test.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST + +#ifndef _DAMON_VADDR_TEST_H +#define _DAMON_VADDR_TEST_H + +#include + +static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) +{ + int i, j; + unsigned long largest_gap, gap; + + if (!nr_vmas) + return; + + for (i = 0; i < nr_vmas - 1; i++) { + vmas[i].vm_next = &vmas[i + 1]; + + vmas[i].vm_rb.rb_left = NULL; + vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb; + + largest_gap = 0; + for (j = i; j < nr_vmas; j++) { + if (j == 0) + continue; + gap = vmas[j].vm_start - vmas[j - 1].vm_end; + if (gap > largest_gap) + largest_gap = gap; + } + vmas[i].rb_subtree_gap = largest_gap; + } + vmas[i].vm_next = NULL; + vmas[i].vm_rb.rb_right = NULL; + vmas[i].rb_subtree_gap = 0; +} + +/* + * Test __damon_va_three_regions() function + * + * In case of virtual memory address spaces monitoring, DAMON converts the + * complex and dynamic memory mappings of each target task to three + * discontiguous regions which cover every mapped areas. However, the three + * regions should not include the two biggest unmapped areas in the original + * mapping, because the two biggest areas are normally the areas between 1) + * heap and the mmap()-ed regions, and 2) the mmap()-ed regions and stack. + * Because these two unmapped areas are very huge but obviously never accessed, + * covering the region is just a waste. + * + * '__damon_va_three_regions() receives an address space of a process. It + * first identifies the start of mappings, end of mappings, and the two biggest + * unmapped areas. After that, based on the information, it constructs the + * three regions and returns. For more detail, refer to the comment of + * 'damon_init_regions_of()' function definition in 'mm/damon.c' file. 
+ * + * For example, suppose virtual address ranges of 10-20, 20-25, 200-210, + * 210-220, 300-305, and 307-330 (Other comments represent this mappings in + * more short form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are + * mapped. To cover every mappings, the three regions should start with 10, + * and end with 305. The process also has three unmapped areas, 25-200, + * 220-300, and 305-307. Among those, 25-200 and 220-300 are the biggest two + * unmapped areas, and thus it should be converted to three regions of 10-25, + * 200-220, and 300-330. + */ +static void damon_test_three_regions_in_vmas(struct kunit *test) +{ + struct damon_addr_range regions[3] = {0,}; + /* 10-20-25, 200-210-220, 300-305, 307-330 */ + struct vm_area_struct vmas[] = { + (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, + (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, + (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, + (struct vm_area_struct) {.vm_start = 210, .vm_end = 220}, + (struct vm_area_struct) {.vm_start = 300, .vm_end = 305}, + (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, + }; + + __link_vmas(vmas, 6); + + __damon_va_three_regions(&vmas[0], regions); + + KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); + KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); + KUNIT_EXPECT_EQ(test, 200ul, regions[1].start); + KUNIT_EXPECT_EQ(test, 220ul, regions[1].end); + KUNIT_EXPECT_EQ(test, 300ul, regions[2].start); + KUNIT_EXPECT_EQ(test, 330ul, regions[2].end); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +/* + * Test 'damon_set_regions()' + * + * test kunit object + * regions an array containing start/end addresses of current + * monitoring target regions + * nr_regions the number of the addresses in 'regions' + * three_regions The three regions that need to be applied now + * expected start/end addresses of monitoring target regions that + * 'three_regions' are applied + * nr_expected the number of addresses in 'expected' + * + * The memory mapping of the target processes changes dynamically. To follow + * the change, DAMON periodically reads the mappings, simplifies it to the + * three regions, and updates the monitoring target regions to fit in the three + * regions. The update of current target regions is the role of + * 'damon_set_regions()'. + * + * This test passes the given target regions and the new three regions that + * need to be applied to the function and check whether it updates the regions + * as expected. + */ +static void damon_do_test_apply_three_regions(struct kunit *test, + unsigned long *regions, int nr_regions, + struct damon_addr_range *three_regions, + unsigned long *expected, int nr_expected) +{ + struct damon_target *t; + struct damon_region *r; + int i; + + t = damon_new_target(); + for (i = 0; i < nr_regions / 2; i++) { + r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); + damon_add_region(r, t); + } + + damon_set_regions(t, three_regions, 3); + + for (i = 0; i < nr_expected / 2; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]); + KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); + } +} + +/* + * This function test most common case where the three big regions are only + * slightly changed. 
Target regions should adjust their boundary (10-20-30, + * 50-55, 70-80, 90-100) to fit with the new big regions or remove target + * regions (57-79) that now out of the three regions. + */ +static void damon_test_apply_three_regions1(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 45-55, 73-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 45, .end = 55}, + (struct damon_addr_range){.start = 73, .end = 104} }; + /* 5-20-27, 45-55, 73-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 45, 55, + 73, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test slightly bigger change. Similar to above, but the second big region + * now require two target regions (50-55, 57-59) to be removed. + */ +static void damon_test_apply_three_regions2(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 56-57, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 56, .end = 57}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 56-57, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 56, 57, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test a big change. The second big region has totally freed and mapped to + * different area (50-59 -> 61-63). The target regions which were in the old + * second big region (50-55-57-59) should be removed and new target region + * covering the second big region (61-63) should be created. + */ +static void damon_test_apply_three_regions3(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 61-63, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 61, .end = 63}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 61-63, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 61, 63, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test another big change. Both of the second and third big regions (50-59 + * and 70-100) has totally freed and mapped to different area (30-32 and + * 65-68). The target regions which were in the old second and third big + * regions should now be removed and new target regions covering the new second + * and third big regions should be created. 
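+ * That is, the resulting target regions should be exactly the three new big
+ * regions: 5-7, 30-32, and 65-68.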
+ */ +static void damon_test_apply_three_regions4(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-7, 30-32, 65-68 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 7}, + (struct damon_addr_range){.start = 30, .end = 32}, + (struct damon_addr_range){.start = 65, .end = 68} }; + /* expect 5-7, 30-32, 65-68 */ + unsigned long expected[] = {5, 7, 30, 32, 65, 68}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +static void damon_test_split_evenly_fail(struct kunit *test, + unsigned long start, unsigned long end, unsigned int nr_pieces) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r = damon_new_region(start, end); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); + + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, start); + KUNIT_EXPECT_EQ(test, r->ar.end, end); + } + + damon_free_target(t); +} + +static void damon_test_split_evenly_succ(struct kunit *test, + unsigned long start, unsigned long end, unsigned int nr_pieces) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r = damon_new_region(start, end); + unsigned long expected_width = (end - start) / nr_pieces; + unsigned long i = 0; + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); + + damon_for_each_region(r, t) { + if (i == nr_pieces - 1) { + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, end); + break; + } + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i++ * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); + } + damon_free_target(t); +} + +static void damon_test_split_evenly(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + damon_test_split_evenly_fail(test, 0, 100, 0); + damon_test_split_evenly_succ(test, 0, 100, 10); + damon_test_split_evenly_succ(test, 5, 59, 5); + damon_test_split_evenly_fail(test, 5, 6, 2); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_three_regions_in_vmas), + KUNIT_CASE(damon_test_apply_three_regions1), + KUNIT_CASE(damon_test_apply_three_regions2), + KUNIT_CASE(damon_test_apply_three_regions3), + KUNIT_CASE(damon_test_apply_three_regions4), + KUNIT_CASE(damon_test_split_evenly), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-operations", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_VADDR_TEST_H */ + +#endif /* CONFIG_DAMON_VADDR_KUNIT_TEST */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c new file mode 100644 index 0000000000000..c245310cfb6dd --- /dev/null +++ b/mm/damon/vaddr.c @@ -0,0 +1,711 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for Virtual Address Spaces + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-va: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include "ops-common.h" + +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + +/* + * 't->pid' should be the pointer to the relevant 'struct 
pid' having reference + * count. Caller must put the returned task, unless it is NULL. + */ +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task(t->pid, PIDTYPE_PID); +} + +/* + * Get the mm_struct of the given target + * + * Caller _must_ put the mm_struct after use, unless it is NULL. + * + * Returns the mm_struct of the target on success, NULL on failure + */ +static struct mm_struct *damon_get_mm(struct damon_target *t) +{ + struct task_struct *task; + struct mm_struct *mm; + + task = damon_get_task_struct(t); + if (!task) + return NULL; + + mm = get_task_mm(task); + put_task_struct(task); + return mm; +} + +/* + * Functions for the initial monitoring target regions construction + */ + +/* + * Size-evenly split a region into 'nr_pieces' small regions + * + * Returns 0 on success, or negative error code otherwise. + */ +static int damon_va_evenly_split_region(struct damon_target *t, + struct damon_region *r, unsigned int nr_pieces) +{ + unsigned long sz_orig, sz_piece, orig_end; + struct damon_region *n = NULL, *next; + unsigned long start; + + if (!r || !nr_pieces) + return -EINVAL; + + orig_end = r->ar.end; + sz_orig = damon_sz_region(r); + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + + if (!sz_piece) + return -EINVAL; + + r->ar.end = r->ar.start + sz_piece; + next = damon_next_region(r); + for (start = r->ar.end; start + sz_piece <= orig_end; + start += sz_piece) { + n = damon_new_region(start, start + sz_piece); + if (!n) + return -ENOMEM; + damon_insert_region(n, r, next, t); + r = n; + } + /* complement last region for possible rounding error */ + if (n) + n->ar.end = orig_end; + + return 0; +} + +static unsigned long sz_range(struct damon_addr_range *r) +{ + return r->end - r->start; +} + +/* + * Find three regions separated by two biggest unmapped regions + * + * vma the head vma of the target address space + * regions an array of three address ranges that results will be saved + * + * This function receives an address space and finds three regions in it which + * separated by the two biggest unmapped regions in the space. Please refer to + * below comments of '__damon_va_init_regions()' function to know why this is + * necessary. + * + * Returns 0 if success, or negative error code otherwise. 
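+ *
+ * For example, if the space maps 10-20, 20-25, 200-210, 210-220, 300-305 and
+ * 307-330, the two biggest gaps are 25-200 and 220-300, so the three regions
+ * become 10-25, 200-220 and 300-330 (see damon_test_three_regions_in_vmas()
+ * in 'vaddr-test.h' for this exact case).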
+ */ +static int __damon_va_three_regions(struct vm_area_struct *vma, + struct damon_addr_range regions[3]) +{ + struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; + struct vm_area_struct *last_vma = NULL; + unsigned long start = 0; + struct rb_root rbroot; + + /* Find two biggest gaps so that first_gap > second_gap > others */ + for (; vma; vma = vma->vm_next) { + if (!last_vma) { + start = vma->vm_start; + goto next; + } + + if (vma->rb_subtree_gap <= sz_range(&second_gap)) { + rbroot.rb_node = &vma->vm_rb; + vma = rb_entry(rb_last(&rbroot), + struct vm_area_struct, vm_rb); + goto next; + } + + gap.start = last_vma->vm_end; + gap.end = vma->vm_start; + if (sz_range(&gap) > sz_range(&second_gap)) { + swap(gap, second_gap); + if (sz_range(&second_gap) > sz_range(&first_gap)) + swap(second_gap, first_gap); + } +next: + last_vma = vma; + } + + if (!sz_range(&second_gap) || !sz_range(&first_gap)) + return -EINVAL; + + /* Sort the two biggest gaps by address */ + if (first_gap.start > second_gap.start) + swap(first_gap, second_gap); + + /* Store the result */ + regions[0].start = ALIGN(start, DAMON_MIN_REGION); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); + regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + + return 0; +} + +/* + * Get the three regions in the given target (task) + * + * Returns 0 on success, negative error code otherwise. + */ +static int damon_va_three_regions(struct damon_target *t, + struct damon_addr_range regions[3]) +{ + struct mm_struct *mm; + int rc; + + mm = damon_get_mm(t); + if (!mm) + return -EINVAL; + + mmap_read_lock(mm); + rc = __damon_va_three_regions(mm->mmap, regions); + mmap_read_unlock(mm); + + mmput(mm); + return rc; +} + +/* + * Initialize the monitoring target regions for the given target (task) + * + * t the given target + * + * Because only a number of small portions of the entire address space + * is actually mapped to the memory and accessed, monitoring the unmapped + * regions is wasteful. That said, because we can deal with small noises, + * tracking every mapping is not strictly required but could even incur a high + * overhead if the mapping frequently changes or the number of mappings is + * high. The adaptive regions adjustment mechanism will further help to deal + * with the noise by simply identifying the unmapped areas as a region that + * has no access. Moreover, applying the real mappings that would have many + * unmapped areas inside will make the adaptive mechanism quite complex. That + * said, too huge unmapped areas inside the monitoring target should be removed + * to not take the time for the adaptive mechanism. + * + * For the reason, we convert the complex mappings to three distinct regions + * that cover every mapped area of the address space. Also the two gaps + * between the three regions are the two biggest unmapped areas in the given + * address space. In detail, this function first identifies the start and the + * end of the mappings and the two biggest unmapped areas of the address space. 
+ * Then, it constructs the three regions as below: + * + * [mappings[0]->start, big_two_unmapped_areas[0]->start) + * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) + * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) + * + * As usual memory map of processes is as below, the gap between the heap and + * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed + * region and the stack will be two biggest unmapped regions. Because these + * gaps are exceptionally huge areas in usual address space, excluding these + * two biggest unmapped regions will be sufficient to make a trade-off. + * + * + * + * + * (other mmap()-ed regions and small unmapped regions) + * + * + * + */ +static void __damon_va_init_regions(struct damon_ctx *ctx, + struct damon_target *t) +{ + struct damon_target *ti; + struct damon_region *r; + struct damon_addr_range regions[3]; + unsigned long sz = 0, nr_pieces; + int i, tidx = 0; + + if (damon_va_three_regions(t, regions)) { + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + tidx++; + } + pr_debug("Failed to get three regions of %dth target\n", tidx); + return; + } + + for (i = 0; i < 3; i++) + sz += regions[i].end - regions[i].start; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + /* Set the initial three regions of the target */ + for (i = 0; i < 3; i++) { + r = damon_new_region(regions[i].start, regions[i].end); + if (!r) { + pr_err("%d'th init region creation failed\n", i); + return; + } + damon_add_region(r, t); + + nr_pieces = (regions[i].end - regions[i].start) / sz; + damon_va_evenly_split_region(t, r, nr_pieces); + } +} + +/* Initialize '->regions_list' of every target (task) */ +static void damon_va_init(struct damon_ctx *ctx) +{ + struct damon_target *t; + + damon_for_each_target(t, ctx) { + /* the user may set the target regions as they want */ + if (!damon_nr_regions(t)) + __damon_va_init_regions(ctx, t); + } +} + +/* + * Update regions for current memory mappings + */ +static void damon_va_update(struct damon_ctx *ctx) +{ + struct damon_addr_range three_regions[3]; + struct damon_target *t; + + damon_for_each_target(t, ctx) { + if (damon_va_three_regions(t, three_regions)) + continue; + damon_set_regions(t, three_regions, 3); + } +} + +static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_huge(*pmd)) { + damon_pmdp_mkold(pmd, walk->vma, addr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return 0; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + damon_ptep_mkold(pte, walk->vma, addr); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + set_huge_pte_at(mm, addr, pte, entry); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* 
CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + +static const struct mm_walk_ops damon_mkold_ops = { + .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, +}; + +static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); + mmap_read_unlock(mm); +} + +/* + * Functions for the access checking of the regions + */ + +static void __damon_va_prepare_access_check(struct mm_struct *mm, + struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_va_mkold(mm, r->sampling_addr); +} + +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) + __damon_va_prepare_access_check(mm, r); + mmput(mm); + } +} + +struct damon_young_walk_private { + unsigned long *page_sz; + bool young; +}; + +static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + struct page *page; + struct damon_young_walk_private *priv = walk->private; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (!pmd_trans_huge(*pmd)) { + spin_unlock(ptl); + goto regular_page; + } + page = damon_get_page(pmd_pfn(*pmd)); + if (!page) + goto huge_out; + if (pmd_young(*pmd) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, + addr)) + priv->young = true; + *priv->page_sz = HPAGE_PMD_SIZE; + put_page(page); +huge_out: + spin_unlock(ptl); + return 0; + } + +regular_page: +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return -EINVAL; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + page = damon_get_page(pte_pfn(*pte)); + if (!page) + goto out; + if (pte_young(*pte) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) + priv->young = true; + *priv->page_sz = PAGE_SIZE; + put_page(page); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct damon_young_walk_private *priv = walk->private; + struct hstate *h = hstate_vma(walk->vma); + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + page = pte_page(entry); + get_page(page); + + if (pte_young(entry) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) + priv->young = true; + *priv->page_sz = huge_page_size(h); + + put_page(page); + +out: + 
spin_unlock(ptl); + return 0; +} +#else +#define damon_young_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + +static const struct mm_walk_ops damon_young_ops = { + .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, +}; + +static bool damon_va_young(struct mm_struct *mm, unsigned long addr, + unsigned long *page_sz) +{ + struct damon_young_walk_private arg = { + .page_sz = page_sz, + .young = false, + }; + + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); + mmap_read_unlock(mm); + return arg.young; +} + +/* + * Check whether the region was accessed after the last preparation + * + * mm 'mm_struct' for the given virtual address space + * r the region to be checked + */ +static void __damon_va_check_access(struct mm_struct *mm, + struct damon_region *r, bool same_target) +{ + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_addr = r->sampling_addr; +} + +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + bool same_target; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + same_target = false; + damon_for_each_region(r, t) { + __damon_va_check_access(mm, r, same_target); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + same_target = true; + } + mmput(mm); + } + + return max_nr_accesses; +} + +/* + * Functions for the target validity check and cleanup + */ + +static bool damon_va_target_valid(struct damon_target *t) +{ + struct task_struct *task; + + task = damon_get_task_struct(t); + if (task) { + put_task_struct(task); + return true; + } + + return false; +} + +#ifndef CONFIG_ADVISE_SYSCALLS +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) +{ + return 0; +} +#else +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) +{ + struct mm_struct *mm; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(damon_sz_region(r)); + unsigned long applied; + + mm = damon_get_mm(target); + if (!mm) + return 0; + + applied = do_madvise(mm, start, len, behavior) ? 0 : len; + mmput(mm); + + return applied; +} +#endif /* CONFIG_ADVISE_SYSCALLS */ + +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + int madv_action; + + switch (scheme->action) { + case DAMOS_WILLNEED: + madv_action = MADV_WILLNEED; + break; + case DAMOS_COLD: + madv_action = MADV_COLD; + break; + case DAMOS_PAGEOUT: + madv_action = MADV_PAGEOUT; + break; + case DAMOS_HUGEPAGE: + madv_action = MADV_HUGEPAGE; + break; + case DAMOS_NOHUGEPAGE: + madv_action = MADV_NOHUGEPAGE; + break; + case DAMOS_STAT: + return 0; + default: + /* + * DAMOS actions that are not yet supported by 'vaddr'. 
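+ * Returning 0 reports that the action was applied to zero bytes of the
+ * region.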
+ */ + return 0; + } + + return damos_madvise(t, r, madv_action); +} + +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_cold_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + +static int __init damon_va_initcall(void) +{ + struct damon_operations ops = { + .id = DAMON_OPS_VADDR, + .init = damon_va_init, + .update = damon_va_update, + .prepare_access_checks = damon_va_prepare_access_checks, + .check_accesses = damon_va_check_accesses, + .reset_aggregated = NULL, + .target_valid = damon_va_target_valid, + .cleanup = NULL, + .apply_scheme = damon_va_apply_scheme, + .get_scheme_score = damon_va_scheme_score, + }; + /* ops for fixed virtual address ranges */ + struct damon_operations ops_fvaddr = ops; + int err; + + /* Don't set the monitoring target regions for the entire mapping */ + ops_fvaddr.id = DAMON_OPS_FVADDR; + ops_fvaddr.init = NULL; + ops_fvaddr.update = NULL; + + err = damon_register_ops(&ops); + if (err) + return err; + return damon_register_ops(&ops_fvaddr); +}; + +subsys_initcall(damon_va_initcall); + +#include "vaddr-test.h" diff --git a/mm/filemap.c b/mm/filemap.c index 3a983bc1a71c9..684c16303bfe2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2170,6 +2170,259 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) ra->ra_pages /= 4; } +static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) +{ + if (iocb->ki_flags & IOCB_WAITQ) + return lock_page_async(page, iocb->ki_waitq); + else if (iocb->ki_flags & IOCB_NOWAIT) + return trylock_page(page) ? 0 : -EAGAIN; + else + return lock_page_killable(page); +} + +static struct page * +generic_file_buffered_read_readpage(struct kiocb *iocb, + struct file *filp, + struct address_space *mapping, + struct page *page) +{ + struct file_ra_state *ra = &filp->f_ra; + int error; + + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EAGAIN); + } + + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); + /* Start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (unlikely(error)) { + put_page(page); + return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; + } + + if (!PageUptodate(page)) { + error = lock_page_for_iocb(iocb, page); + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* + * invalidate_mapping_pages got it + */ + unlock_page(page); + put_page(page); + return NULL; + } + unlock_page(page); + shrink_readahead_size_eio(ra); + put_page(page); + return ERR_PTR(-EIO); + } + unlock_page(page); + } + + return page; +} + +static struct page * +generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, + struct file *filp, + struct iov_iter *iter, + struct page *page, + loff_t pos, loff_t count) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + int error; + + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. 
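+ *
+ * Callers with IOCB_WAITQ set are waited through their asynchronous wait
+ * queue instead of blocking killable, as done below.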
+ */ + if (iocb->ki_flags & IOCB_WAITQ) { + error = wait_on_page_locked_async(page, + iocb->ki_waitq); + } else { + error = wait_on_page_locked_killable(page); + } + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + if (PageUptodate(page)) + return page; + + if (inode->i_blkbits == PAGE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + /* pipes can't handle partially uptodate pages */ + if (unlikely(iov_iter_is_pipe(iter))) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; + if (!mapping->a_ops->is_partially_uptodate(page, + pos & ~PAGE_MASK, count)) + goto page_not_up_to_date_locked; + unlock_page(page); + return page; + +page_not_up_to_date: + /* Get exclusive access to the page ... */ + error = lock_page_for_iocb(iocb, page); + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + +page_not_up_to_date_locked: + /* Did it get truncated before we got the lock? */ + if (!page->mapping) { + unlock_page(page); + put_page(page); + return NULL; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +} + +static struct page * +generic_file_buffered_read_no_cached_page(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + struct page *page; + int error; + + if (iocb->ki_flags & IOCB_NOIO) + return ERR_PTR(-EAGAIN); + + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + */ + page = page_cache_alloc(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (error) { + put_page(page); + return error != -EEXIST ? 
ERR_PTR(error) : NULL; + } + + return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +} + +static int generic_file_buffered_read_get_pages(struct kiocb *iocb, + struct iov_iter *iter, + struct page **pages, + unsigned int nr) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + struct file_ra_state *ra = &filp->f_ra; + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; + int i, j, nr_got, err = 0; + + nr = min_t(unsigned long, last_index - index, nr); +find_page: + if (fatal_signal_pending(current)) + return -EINTR; + + nr_got = find_get_pages_contig(mapping, index, nr, pages); + if (nr_got) + goto got_pages; + + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + + page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + + nr_got = find_get_pages_contig(mapping, index, nr, pages); + if (nr_got) + goto got_pages; + + pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); + err = PTR_ERR_OR_ZERO(pages[0]); + if (!IS_ERR_OR_NULL(pages[0])) + nr_got = 1; +got_pages: + for (i = 0; i < nr_got; i++) { + struct page *page = pages[i]; + pgoff_t pg_index = index + i; + loff_t pg_pos = max(iocb->ki_pos, + (loff_t) pg_index << PAGE_SHIFT); + loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; + + if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + for (j = i; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = -EAGAIN; + break; + } + page_cache_async_readahead(mapping, ra, filp, page, + pg_index, last_index - pg_index); + } + + if (!PageUptodate(page)) { + if ((iocb->ki_flags & IOCB_NOWAIT) || + ((iocb->ki_flags & IOCB_WAITQ) && i)) { + for (j = i; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = -EAGAIN; + break; + } + + page = generic_file_buffered_read_pagenotuptodate(iocb, + filp, iter, page, pg_pos, pg_count); + if (IS_ERR_OR_NULL(page)) { + for (j = i + 1; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = PTR_ERR_OR_ZERO(page); + break; + } + } + } + + if (likely(nr_got)) + return nr_got; + if (err) + return err; + /* + * No pages and no error means we raced and should retry: + */ + goto find_page; +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2190,294 +2443,117 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; + struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct file_ra_state *ra = &filp->f_ra; - loff_t *ppos = &iocb->ki_pos; - pgoff_t index; - pgoff_t last_index; - pgoff_t prev_index; - unsigned long offset; /* offset into pagecache page */ - unsigned int prev_offset; - int error = 0; - - if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; + unsigned int nr_pages = min_t(unsigned int, 512, + ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (iocb->ki_pos >> PAGE_SHIFT)); + int i, pg_nr, error = 0; + bool writably_mapped; + loff_t isize, end_offset; + + if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - index = *ppos >> PAGE_SHIFT; - prev_index = ra->prev_pos >> PAGE_SHIFT; - prev_offset = ra->prev_pos & (PAGE_SIZE-1); - last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; + if (nr_pages > 
ARRAY_SIZE(pages_onstack)) + pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - /* - * If we've already successfully copied some data, then we - * can no longer safely return -EIOCBQUEUED. Hence mark - * an async read NOWAIT at that point. - */ - if (written && (iocb->ki_flags & IOCB_WAITQ)) - iocb->ki_flags |= IOCB_NOWAIT; - - for (;;) { - struct page *page; - pgoff_t end_index; - loff_t isize; - unsigned long nr, ret; + if (!pages) { + pages = pages_onstack; + nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); + } + do { cond_resched(); -find_page: - if (fatal_signal_pending(current)) { - error = -EINTR; - goto out; - } - page = find_get_page(mapping, index); - if (!page) { - if (iocb->ki_flags & IOCB_NOIO) - goto would_block; - page_cache_sync_readahead(mapping, - ra, filp, - index, last_index - index); - page = find_get_page(mapping, index); - if (unlikely(page == NULL)) - goto no_cached_page; - } - if (PageReadahead(page)) { - if (iocb->ki_flags & IOCB_NOIO) { - put_page(page); - goto out; - } - page_cache_async_readahead(mapping, - ra, filp, page, - index, last_index - index); - } - if (!PageUptodate(page)) { - /* - * See comment in do_read_cache_page on why - * wait_on_page_locked is used to avoid unnecessarily - * serialisations and why it's safe. - */ - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = wait_on_page_locked_async(page, - iocb->ki_waitq); - } else { - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - goto would_block; - } - error = wait_on_page_locked_killable(page); - } - if (unlikely(error)) - goto readpage_error; - if (PageUptodate(page)) - goto page_ok; - - if (inode->i_blkbits == PAGE_SHIFT || - !mapping->a_ops->is_partially_uptodate) - goto page_not_up_to_date; - /* pipes can't handle partially uptodate pages */ - if (unlikely(iov_iter_is_pipe(iter))) - goto page_not_up_to_date; - if (!trylock_page(page)) - goto page_not_up_to_date; - /* Did it get truncated before we got the lock? */ - if (!page->mapping) - goto page_not_up_to_date_locked; - if (!mapping->a_ops->is_partially_uptodate(page, - offset, iter->count)) - goto page_not_up_to_date_locked; - unlock_page(page); + /* + * If we've already successfully copied some data, then we + * can no longer safely return -EIOCBQUEUED. Hence mark + * an async read NOWAIT at that point. + */ + if ((iocb->ki_flags & IOCB_WAITQ) && written) + iocb->ki_flags |= IOCB_NOWAIT; + + i = 0; + pg_nr = generic_file_buffered_read_get_pages(iocb, iter, + pages, nr_pages); + if (pg_nr < 0) { + error = pg_nr; + break; } -page_ok: + /* - * i_size must be checked after we know the page is Uptodate. + * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). 
*/ - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) { - put_page(page); - goto out; - } + if (unlikely(iocb->ki_pos >= isize)) + goto put_pages; - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_MASK) + 1; - if (nr <= offset) { - put_page(page); - goto out; - } - } - nr = nr - offset; + end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > + (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) + put_page(pages[--pg_nr]); /* - * When a sequential read accesses a page several times, - * only mark it as accessed the first time. + * Once we start copying data, we don't want to be touching any + * cachelines that might be contended: */ - if (prev_index != index || offset != prev_offset) - mark_page_accessed(page); - prev_index = index; + writably_mapped = mapping_writably_mapped(mapping); /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... + * When a sequential read accesses a page several times, only + * mark it as accessed the first time. */ + if (iocb->ki_pos >> PAGE_SHIFT != + ra->prev_pos >> PAGE_SHIFT) + mark_page_accessed(pages[0]); + for (i = 1; i < pg_nr; i++) + mark_page_accessed(pages[i]); + + for (i = 0; i < pg_nr; i++) { + unsigned int offset = iocb->ki_pos & ~PAGE_MASK; + unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, + PAGE_SIZE - offset); + unsigned int copied; - ret = copy_page_to_iter(page, offset, nr, iter); - offset += ret; - index += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; - prev_offset = offset; - - put_page(page); - written += ret; - if (!iov_iter_count(iter)) - goto out; - if (ret < nr) { - error = -EFAULT; - goto out; - } - continue; - -page_not_up_to_date: - /* Get exclusive access to the page ... */ - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = lock_page_async(page, iocb->ki_waitq); - } else { - error = lock_page_killable(page); - } - if (unlikely(error)) - goto readpage_error; - -page_not_up_to_date_locked: - /* Did it get truncated before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - put_page(page); - continue; - } - - /* Did somebody else fill it already? */ - if (PageUptodate(page)) { - unlock_page(page); - goto page_ok; - } - -readpage: - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { - unlock_page(page); - put_page(page); - goto would_block; - } - /* - * A previous I/O error may have been due to temporary - * failures, eg. multipath errors. - * PG_error will be set again if readpage fails. - */ - ClearPageError(page); - /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(filp, page); + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. 
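[Editor's aside, not part of the patch] The batched copy loop introduced above derives, for every page in the batch, the offset of iocb->ki_pos within that page and how many bytes can be copied before either the page ends or end_offset (the smaller of i_size and ki_pos + count) is reached. A minimal userspace sketch of just that arithmetic follows; PAGE_SHIFT, the sample positions and the printout are assumptions chosen for illustration, and the fake loop stands in for copy_page_to_iter().

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	uint64_t pos = 4000;		/* iocb->ki_pos */
	uint64_t count = 10000;		/* iov_iter_count(iter) */
	uint64_t isize = 12288;		/* i_size_read(inode) */
	uint64_t end_offset = isize < pos + count ? isize : pos + count;

	while (pos < end_offset) {
		/* offset of the read position inside the current page */
		unsigned int offset = pos & ~PAGE_MASK;
		uint64_t left = end_offset - pos;
		/* copy at most to the end of this page or of the valid range */
		unsigned int bytes = left < PAGE_SIZE - offset ?
				     (unsigned int)left :
				     (unsigned int)(PAGE_SIZE - offset);

		printf("page %llu: copy %u bytes from offset %u\n",
		       (unsigned long long)(pos >> PAGE_SHIFT), bytes, offset);
		pos += bytes;	/* the kernel advances iocb->ki_pos by 'copied' */
	}
	return 0;
}

With the sample values this prints three chunks (96, 4096 and 4096 bytes), which is exactly how the loop splits a read that starts mid-page and is cut short by i_size.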
+ */ + if (writably_mapped) + flush_dcache_page(pages[i]); - if (unlikely(error)) { - if (error == AOP_TRUNCATED_PAGE) { - put_page(page); - error = 0; - goto find_page; - } - goto readpage_error; - } + copied = copy_page_to_iter(pages[i], offset, bytes, iter); - if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = lock_page_async(page, iocb->ki_waitq); - } else { - error = lock_page_killable(page); - } + written += copied; + iocb->ki_pos += copied; + ra->prev_pos = iocb->ki_pos; - if (unlikely(error)) - goto readpage_error; - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_mapping_pages got it - */ - unlock_page(page); - put_page(page); - goto find_page; - } - unlock_page(page); - shrink_readahead_size_eio(ra); - error = -EIO; - goto readpage_error; + if (copied < bytes) { + error = -EFAULT; + break; } - unlock_page(page); } +put_pages: + for (i = 0; i < pg_nr; i++) + put_page(pages[i]); + } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); - goto page_ok; - -readpage_error: - /* UHHUH! A synchronous read error occurred. Report it */ - put_page(page); - goto out; - -no_cached_page: - /* - * Ok, it wasn't cached, so we need to create a new - * page.. - */ - page = page_cache_alloc(mapping); - if (!page) { - error = -ENOMEM; - goto out; - } - error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (error) { - put_page(page); - if (error == -EEXIST) { - error = 0; - goto find_page; - } - goto out; - } - goto readpage; - } + file_accessed(filp); -would_block: - error = -EAGAIN; -out: - ra->prev_pos = prev_index; - ra->prev_pos <<= PAGE_SHIFT; - ra->prev_pos |= prev_offset; + if (pages != pages_onstack) + kfree(pages); - *ppos = ((loff_t)index << PAGE_SHIFT) + offset; - file_accessed(filp); return written ? written : error; } EXPORT_SYMBOL_GPL(generic_file_buffered_read); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 751e3670d7b0c..504f9210df1b0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2679,6 +2679,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, { unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MAX_RECLAIM_RETRIES; + int timeout = 1; struct mem_cgroup *mem_over_limit; struct page_counter *counter; enum oom_status oom_status; @@ -2770,7 +2771,25 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; - + /* + * Legacy memcg relies on dirty data throttling during the reclaim + * but this cannot be done for GFP_NOFS requests so we might trigger + * the oom way too early. Throttle here if we have way too many + * dirty/writeback pages. 
+ */ + if ((nr_retries < MAX_RECLAIM_RETRIES/2) && + !cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !(gfp_mask & __GFP_FS)) { + unsigned long dirty = memcg_page_state(memcg, NR_FILE_DIRTY); + unsigned long writeback = memcg_page_state(memcg, NR_WRITEBACK); + + if (4*(dirty + writeback) > + 3*page_counter_read(&memcg->memory)) { + schedule_timeout_interruptible(timeout); + if (timeout < 32) + timeout *= 2; + } + } if (nr_retries--) goto retry; @@ -2794,6 +2813,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (oom_status == OOM_SUCCESS) { passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; + timeout = 1; goto retry; } nomem: diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9ec9e1e677051..0b13a56dcc73d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,6 +42,16 @@ #include "internal.h" #include "shuffle.h" + +/* + * memory_hotplug.memmap_on_memory parameter + */ +static bool memmap_on_memory __ro_after_init; +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +module_param(memmap_on_memory, bool, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +#endif + /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be @@ -617,9 +627,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). + * When using memmap_on_memory, the range might not be aligned to + * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect + * this and the first chunk to online will be pageblock_nr_pages. */ - for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) - (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); + for (pfn = start_pfn; pfn < end_pfn;) { + int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + + (*online_page_callback)(pfn_to_page(pfn), order); + pfn += (1UL << order); + } /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); @@ -777,24 +794,86 @@ struct zone *zone_for_pfn_range(int online_type, int nid, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid) +/* + * This function should only be called by memory_block_{online,offline}, + * and {online,offline}_pages. + */ +void adjust_present_page_count(struct zone *zone, long nr_pages) +{ + unsigned long flags; + + zone->present_pages += nr_pages; + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages += nr_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + +int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone) +{ + unsigned long end_pfn = pfn + nr_pages; + int ret; + + ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); + if (ret) + return ret; + + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections online here as otherwise they will be + * left offline. 
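[Editor's aside, not part of the patch] The try_charge() hunk above throttles cgroup v1 chargers that cannot write back data themselves (GFP_NOFS), but only once half of the reclaim retries are used and more than three quarters of the charged pages are dirty or under writeback; the sleep starts at one jiffy and doubles up to a 32-jiffy cap. A self-contained sketch of that backoff policy follows; the threshold fraction and the cap come from the hunk, while the sample counters and the "progress" made per iteration are fabricated for illustration.

#include <stdio.h>

/* mirrors "4 * (dirty + writeback) > 3 * charged" from the patch */
static int should_throttle(unsigned long dirty, unsigned long writeback,
			   unsigned long charged)
{
	return 4 * (dirty + writeback) > 3 * charged;
}

int main(void)
{
	unsigned long dirty = 900, writeback = 50, charged = 1000;
	int timeout = 1;	/* jiffies, as in the patch */
	int retry;

	for (retry = 0; retry < 8; retry++) {
		if (!should_throttle(dirty, writeback, charged))
			break;
		/* stands in for schedule_timeout_interruptible(timeout) */
		printf("retry %d: sleep %d jiffies\n", retry, timeout);
		if (timeout < 32)
			timeout *= 2;
		/* pretend writeback made some progress while we slept */
		dirty -= 150;
		writeback -= 5;
	}
	return 0;
}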
+ */ + if (nr_pages >= PAGES_PER_SECTION) + online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + return ret; +} + +void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long end_pfn = pfn + nr_pages; + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections offline here as otherwise they will be + * left online. + */ + if (nr_pages >= PAGES_PER_SECTION) + offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + /* + * The pages associated with this vmemmap have been offlined, so + * we can reset its state here. + */ + remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); + kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); +} + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long flags; - struct zone *zone; int need_zonelists_rebuild = 0; + const int nid = zone_to_nid(zone); int ret; struct memory_notify arg; - /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(pfn, pageblock_nr_pages) || + !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); /* associate pfn range with the zone */ - zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; @@ -825,11 +904,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, } online_pages_range(pfn, nr_pages); - zone->present_pages += nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1012,6 +1087,45 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } +bool mhp_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_vmemmap_pages = size / PAGE_SIZE; + unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long remaining_size = size - vmemmap_size; + + /* + * Besides having arch support and the feature enabled at runtime, we + * need a few more assumptions to hold true: + * + * a) We span a single memory block: memory onlining/offlinin;g happens + * in memory block granularity. We don't want the vmemmap of online + * memory blocks to reside on offline memory blocks. In the future, + * we might want to support variable-sized memory blocks to make the + * feature more versatile. + * + * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * to populate memory from the altmap for unrelated parts (i.e., + * other memory blocks) + * + * c) The vmemmap pages (and thereby the pages that will be exposed to + * the buddy) have to cover full pageblocks: memory onlining/offlining + * code requires applicable ranges to be page-aligned, for example, to + * set the migratetypes properly. 
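[Editor's aside, not part of the patch] The online_pages_range() change shown a little earlier no longer assumes MAX_ORDER alignment: for each pfn it picks the largest order that both stays below MAX_ORDER and keeps the pfn naturally aligned, which is what min(MAX_ORDER - 1, __ffs(pfn)) computes. A toy walk over a pageblock-aligned (but not MAX_ORDER-aligned) range, with __ffs() modeled by __builtin_ctzl() and MAX_ORDER assumed to be the usual x86-64 value:

#include <stdio.h>

#define MAX_ORDER	11	/* typical x86-64 value, assumed here */

/* __ffs(): index of the lowest set bit, i.e. the natural alignment order */
static unsigned long my_ffs(unsigned long x)
{
	return __builtin_ctzl(x);
}

int main(void)
{
	/* start at a 512-page (pageblock) boundary, end on a larger boundary */
	unsigned long start_pfn = 512, end_pfn = 2048, pfn;

	for (pfn = start_pfn; pfn < end_pfn; ) {
		unsigned long order = my_ffs(pfn);

		if (order > MAX_ORDER - 1)
			order = MAX_ORDER - 1;
		printf("online pfn %lu as order %lu (%lu pages)\n",
		       pfn, order, 1UL << order);
		pfn += 1UL << order;
	}
	return 0;
}

The first chunk comes out as order 9 (one pageblock), exactly the case the comment in online_pages_range() describes, and the rest is onlined in MAX_ORDER - 1 chunks.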
+ * + * TODO: Although we have a check here to make sure that vmemmap pages + * fully populate a PMD, it is not the right place to check for + * this. A much better solution involves improving vmemmap code + * to fallback to base pages when trying to populate vmemmap using + * altmap as an alternative source of memory, and we do not exactly + * populate a single PMD. + */ + return memmap_on_memory && + IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && + size == memory_block_size_bytes() && + IS_ALIGNED(vmemmap_size, PMD_SIZE) && + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); +} + /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). @@ -1021,6 +1135,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + struct vmem_altmap mhp_altmap = {}; u64 start, size; bool new_node = false; int ret; @@ -1047,13 +1162,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; new_node = ret; + /* + * Self hosted memmap array + */ + if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { + if (!mhp_supports_memmap_on_memory(size)) { + ret = -EINVAL; + goto error; + } + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; + } + /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc); if (ret) { arch_remove_memory(nid, start, size, NULL); goto error; @@ -1459,9 +1587,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) int ret, node; char *reason; - /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); @@ -1582,11 +1717,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - zone->present_pages -= nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, -nr_pages); init_per_zone_wmark_min(); @@ -1639,6 +1770,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } +static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +{ + /* + * If not set, continue with the next block. 
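[Editor's aside, not part of the patch] The eligibility test in mhp_supports_memmap_on_memory() above is plain arithmetic on the hot-plugged block: the struct pages for the block must fill whole PMDs and the remainder must stay pageblock aligned. A standalone sketch of that computation for one plausible configuration (128 MiB memory blocks, 4 KiB pages, a 64-byte struct page, 2 MiB PMDs and pageblocks); all of these constants are assumptions for illustration, not values the patch dictates.

#include <stdio.h>

#define SZ_2M			(2UL * 1024 * 1024)
#define PAGE_SIZE		4096UL
#define STRUCT_PAGE_SIZE	64UL		/* sizeof(struct page), assumed */
#define PMD_SIZE		SZ_2M
#define PAGEBLOCK_BYTES		SZ_2M		/* pageblock_nr_pages << PAGE_SHIFT */
#define MEMORY_BLOCK_BYTES	(128UL * 1024 * 1024)

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

static int supports_memmap_on_memory(unsigned long size)
{
	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
	unsigned long vmemmap_size = nr_vmemmap_pages * STRUCT_PAGE_SIZE;
	unsigned long remaining_size = size - vmemmap_size;

	return size == MEMORY_BLOCK_BYTES &&
	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
	       IS_ALIGNED(remaining_size, PAGEBLOCK_BYTES);
}

int main(void)
{
	unsigned long size = MEMORY_BLOCK_BYTES;
	unsigned long vmemmap = (size / PAGE_SIZE) * STRUCT_PAGE_SIZE;

	printf("block %lu MiB: vmemmap %lu MiB, remaining %lu MiB -> %s\n",
	       size >> 20, vmemmap >> 20, (size - vmemmap) >> 20,
	       supports_memmap_on_memory(size) ? "eligible" : "not eligible");
	return 0;
}

For this configuration the vmemmap consumes the first 2 MiB of the block (one pageblock), which is why online_pages()/offline_pages() above relax their alignment check from section to pageblock granularity at the start of the range.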
+ */ + return mem->nr_vmemmap_pages; +} + static int check_cpu_on_node(pg_data_t *pgdat) { int cpu; @@ -1713,6 +1852,9 @@ EXPORT_SYMBOL(try_offline_node); static int __ref try_remove_memory(int nid, u64 start, u64 size) { int rc = 0; + struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap *altmap = NULL; + unsigned long nr_vmemmap_pages; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1725,6 +1867,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) if (rc) return rc; + /* + * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in + * the same granularity it was added - a single memory block. + */ + if (memmap_on_memory) { + nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, + get_nr_vmemmap_pages_cb); + if (nr_vmemmap_pages) { + if (size != memory_block_size_bytes()) { + pr_warn("Refuse to remove %#llx - %#llx," + "wrong granularity\n", + start, start + size); + return -EINVAL; + } + + /* + * Let remove_pmd_table->free_hugepage_table do the + * right thing if we used vmem_altmap when hot-adding + * the range. + */ + mhp_altmap.alloc = nr_vmemmap_pages; + altmap = &mhp_altmap; + } + } + /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1736,7 +1903,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(nid, start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); diff --git a/mm/migrate.c b/mm/migrate.c index fcb7eb6a6ecae..c0e00735df37a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1282,6 +1282,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, { int rc = -EAGAIN; int page_was_mapped = 0; + bool mapping_locked = false; struct page *new_hpage; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -1332,7 +1333,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { - bool mapping_locked = false; enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK; if (!PageAnon(hpage)) { @@ -1352,17 +1352,17 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, try_to_unmap(hpage, ttu); page_was_mapped = 1; - - if (mapping_locked) - i_mmap_unlock_write(mapping); } if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, mode); - if (page_was_mapped) + if (page_was_mapped) { remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); + rc == MIGRATEPAGE_SUCCESS ? 
new_hpage : hpage, mapping_locked); + if (mapping_locked) + i_mmap_unlock_write(mapping); + } unlock_put_anon: unlock_page(new_hpage); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d85435db35f37..1606d6d726b92 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5877,7 +5877,7 @@ static void build_zonelists(pg_data_t *pgdat) */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] = load; + node_load[node] += load; node_order[nr_nodes++] = node; prev_node = node; @@ -5886,6 +5886,10 @@ static void build_zonelists(pg_data_t *pgdat) build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); + pr_info("Fallback order for Node %d: ", local_node); + for (node = 0; node < nr_nodes; node++) + pr_cont("%d ", node_order[node]); + pr_cont("\n"); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES diff --git a/mm/page_ext.c b/mm/page_ext.c index a3616f7a0e9e9..f9a6ff65ac0a9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -58,11 +58,21 @@ * can utilize this callback to initialize the state of it correctly. */ +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif }; diff --git a/mm/page_idle.c b/mm/page_idle.c index 057c61df12dba..144fb4ed961d7 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -211,16 +211,6 @@ static const struct attribute_group page_idle_attr_group = { .name = "page_idle", }; -#ifndef CONFIG_64BIT -static bool need_page_idle(void) -{ - return true; -} -struct page_ext_operations page_idle_ops = { - .need = need_page_idle, -}; -#endif - static int __init page_idle_init(void) { int err; diff --git a/mm/page_reporting.c b/mm/page_reporting.c index cd8e13d41df43..c47e07f2bbeb3 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -312,6 +312,19 @@ static void page_reporting_process(struct work_struct *work) static DEFINE_MUTEX(page_reporting_mutex); DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); +void page_report_offline(unsigned long start_pfn, unsigned int nr_pages) +{ + struct page_reporting_dev_info *prdev; + + mutex_lock(&page_reporting_mutex); + + prdev = rcu_access_pointer(pr_dev_info); + if (prdev && prdev->report_offline) + prdev->report_offline(prdev, start_pfn, nr_pages); + + mutex_unlock(&page_reporting_mutex); +} + int page_reporting_register(struct page_reporting_dev_info *prdev) { int err = 0; diff --git a/mm/sparse.c b/mm/sparse.c index 33406ea2ecc44..d3fbed26e64ef 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -624,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } -#ifdef CONFIG_MEMORY_HOTREMOVE /* Mark all memory sections within the pfn range as offline */ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) { @@ -645,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) ms->section_mem_map &= ~SECTION_IS_ONLINE; } } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static struct page * __meminit populate_section_memmap(unsigned long pfn, diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a811fe0f0f6fd..3fad2f5b920e0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ 
-867,7 +867,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/filter.c b/net/core/filter.c index b9c954182b375..c7520126c32dc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1713,7 +1713,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -2021,9 +2021,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2570,7 +2570,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -4193,7 +4193,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4207,7 +4207,7 @@ const struct bpf_func_proto bpf_skb_output_proto = { .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4390,7 +4390,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -4416,7 +4416,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -4586,7 +4586,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4600,7 +4600,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = { .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4640,6 +4640,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? 
sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); @@ -5018,7 +5030,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5052,7 +5064,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5227,7 +5239,7 @@ static const struct bpf_func_proto bpf_bind_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -5688,7 +5700,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5698,7 +5710,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5741,7 +5753,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5829,7 +5841,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6076,7 +6088,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6095,7 +6107,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6114,7 +6126,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6151,7 +6163,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, 
.arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6174,7 +6186,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6197,7 +6209,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6216,7 +6228,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6235,7 +6247,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6254,7 +6266,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6576,9 +6588,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6645,9 +6657,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6876,7 +6888,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -9930,11 +9942,13 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; + reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; @@ -9943,12 +9957,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog 
*prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; - bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = BPF_PROG_RUN(prog, &reuse_kern); if (action == SK_PASS) @@ -10058,6 +10073,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; default: return bpf_base_func_proto(func_id); } @@ -10087,6 +10104,14 @@ sk_reuseport_is_valid_access(int off, int size, case offsetof(struct sk_reuseport_md, hash): return size == size_default; + case offsetof(struct sk_reuseport_md, sk): + info->reg_type = PTR_TO_SOCKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, migrating_sk): + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + return size == sizeof(__u64); + /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) @@ -10159,6 +10184,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; + + case offsetof(struct sk_reuseport_md, sk): + SK_REUSEPORT_LOAD_FIELD(sk); + break; + + case offsetof(struct sk_reuseport_md, migrating_sk): + SK_REUSEPORT_LOAD_FIELD(migrating_sk); + break; } return insn - insn_buf; diff --git a/net/core/sock.c b/net/core/sock.c index 98f4b4a80de42..5157e17081c71 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2461,7 +2461,6 @@ static void sk_leave_memory_pressure(struct sock *sk) } } -#define SKB_FRAG_PAGE_ORDER get_order(32768) DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f375ef1501490..6da240ab01701 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1626,7 +1626,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 49f9c2c4ffd5a..bf4ef0d8ca0d2 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -6,6 +6,7 @@ * selecting the socket index from the array of available sockets. 
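[Editor's aside, not part of the patch] The new sk and migrating_sk fields of struct sk_reuseport_md let a BPF_PROG_TYPE_SK_REUSEPORT program distinguish an ordinary incoming connection (migrating_sk == NULL) from a request or child socket being migrated away from a closed listener. The following is only a rough BPF-C sketch of such a program attached with expected_attach_type BPF_SK_REUSEPORT_SELECT_OR_MIGRATE; the map name, key convention and section name follow common libbpf usage and are assumptions, not taken from this series.

// SPDX-License-Identifier: GPL-2.0
/* Sketch: pick a listener from a sockmap, both for normal lookups and
 * for requests being migrated off a closed listener. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} listeners SEC(".maps");

SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *md)
{
	__u32 key = 0;		/* a real program would hash or load-balance */

	if (md->migrating_sk)
		key = 1;	/* e.g. steer migrated requests to another listener */

	if (bpf_sk_select_reuseport(md, &listeners, &key, 0) == 0)
		return SK_PASS;

	/* No explicit selection: the kernel falls back to hash selection. */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";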
*/ +#include #include #include #include @@ -17,6 +18,8 @@ DEFINE_SPINLOCK(reuseport_lock); static DEFINE_IDA(reuseport_ida); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany); void reuseport_has_conns_set(struct sock *sk) { @@ -34,6 +37,72 @@ void reuseport_has_conns_set(struct sock *sk) } EXPORT_SYMBOL(reuseport_has_conns_set); +static int reuseport_sock_index(struct sock *sk, + const struct sock_reuseport *reuse, + bool closed) +{ + int left, right; + + if (!closed) { + left = 0; + right = reuse->num_socks; + } else { + left = reuse->max_socks - reuse->num_closed_socks; + right = reuse->max_socks; + } + + for (; left < right; left++) + if (reuse->socks[left] == sk) + return left; + return -1; +} + +static void __reuseport_add_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ + smp_wmb(); + reuse->num_socks++; +} + +static bool __reuseport_detach_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, false); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + + return true; +} + +static void __reuseport_add_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); +} + +static bool __reuseport_detach_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, true); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + + return true; +} + static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { unsigned int size = sizeof(struct sock_reuseport) + @@ -65,6 +134,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (reuse) { + if (reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); + goto out; + } + /* Only set reuse->bind_inany if the bind_inany is true. * Otherwise, it will overwrite the reuse->bind_inany * which was set by the bind/hash path. @@ -88,9 +163,9 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) } reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; - reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -106,14 +181,30 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) u32 more_socks_size, i; more_socks_size = reuse->max_socks * 2U; - if (more_socks_size > U16_MAX) + if (more_socks_size > U16_MAX) { + if (reuse->num_closed_socks) { + /* Make room by removing a closed sk. + * The child has already been migrated. + * Only reqsk left at this point. 
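[Editor's aside, not part of the patch] reuseport_sock_index() and the __reuseport_{add,detach}{,_closed}_sock() helpers above keep one fixed socks[] array with two packed regions: listening sockets at the front (indices 0 .. num_socks - 1) and closed-but-migratable sockets at the back (indices max_socks - num_closed_socks .. max_socks - 1). A small self-contained model of that layout, using plain integers instead of struct sock pointers and made-up sizes:

#include <stdio.h>

#define MAX_SOCKS 8

static int socks[MAX_SOCKS];	/* stand-in for struct sock *socks[] */
static int num_socks;		/* listening, packed at the front */
static int num_closed_socks;	/* closed, packed at the back */

static void add_sock(int sk)
{
	socks[num_socks++] = sk;
}

static void move_to_closed(int sk)
{
	int i;

	/* find it in the front (listening) region ... */
	for (i = 0; i < num_socks; i++)
		if (socks[i] == sk)
			break;
	/* ... detach: the last listening entry fills the hole ... */
	socks[i] = socks[--num_socks];
	/* ... and append it to the closed region growing from the back */
	socks[MAX_SOCKS - ++num_closed_socks] = sk;
}

static void dump(void)
{
	int i;

	printf("listening:");
	for (i = 0; i < num_socks; i++)
		printf(" %d", socks[i]);
	printf("  closed:");
	for (i = MAX_SOCKS - num_closed_socks; i < MAX_SOCKS; i++)
		printf(" %d", socks[i]);
	printf("\n");
}

int main(void)
{
	add_sock(101);
	add_sock(102);
	add_sock(103);
	move_to_closed(102);	/* a listener got close()d or shutdown()ed */
	dump();			/* listening: 101 103  closed: 102 */
	return 0;
}

Keeping the two regions in one array is what lets reuseport_grow() and the migration paths shuffle sockets between them without reallocating, at the cost of the num_socks + num_closed_socks capacity checks seen throughout the patch.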
+ */ + struct sock *sk; + + sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); + __reuseport_detach_closed_sock(sk, reuse); + + return reuse; + } + return NULL; + } more_reuse = __reuseport_alloc(more_socks_size); if (!more_reuse) return NULL; more_reuse->num_socks = reuse->num_socks; + more_reuse->num_closed_socks = reuse->num_closed_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; @@ -121,9 +212,13 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + memcpy(more_reuse->socks + + (more_reuse->max_socks - more_reuse->num_closed_socks), + reuse->socks + (reuse->max_socks - reuse->num_closed_socks), + reuse->num_closed_socks * sizeof(struct sock *)); more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); - for (i = 0; i < reuse->num_socks; ++i) + for (i = 0; i < reuse->max_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, more_reuse); @@ -168,13 +263,21 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)); + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); + + spin_unlock_bh(&reuseport_lock); + return err; + } + if (old_reuse && old_reuse->num_socks != 1) { spin_unlock_bh(&reuseport_lock); return -EBUSY; } - if (reuse->num_socks == reuse->max_socks) { + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) { spin_unlock_bh(&reuseport_lock); @@ -182,10 +285,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } } - reuse->socks[reuse->num_socks] = sk; - /* paired with smp_rmb() in reuseport_select_sock() */ - smp_wmb(); - reuse->num_socks++; + __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); spin_unlock_bh(&reuseport_lock); @@ -196,15 +296,77 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } EXPORT_SYMBOL(reuseport_add_sock); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany) +{ + if (old_reuse == reuse) { + /* If sk was in the same reuseport group, just pop sk out of + * the closed section and push sk into the listening section. + */ + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, old_reuse); + return 0; + } + + if (!reuse) { + /* In bind()/listen() path, we cannot carry over the eBPF prog + * for the shutdown()ed socket. In setsockopt() path, we should + * not change the eBPF prog of listening sockets by attaching a + * prog to the shutdown()ed socket. Thus, we will allocate a new + * reuseport group and detach sk from the old group. 
+ */ + int id; + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) + return -ENOMEM; + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + return id; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + } else { + /* Move sk from the old group to the new one if + * - all the other listeners in the old group were close()d or + * shutdown()ed, and then sk2 has listen()ed on the same port + * OR + * - sk listen()ed without bind() (or with autobind), was + * shutdown()ed, and then listen()s on another port which + * sk2 listen()s on. + */ + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) + return -ENOMEM; + } + } + + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + + return 0; +} + void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; - int i; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + /* reuseport_grow() has detached a closed sk */ + if (!reuse) + goto out; + /* Notify the bpf side. The sk may be added to a sockarray * map. If so, sockarray logic will remove it from the map. * @@ -217,19 +379,52 @@ void reuseport_detach_sock(struct sock *sk) rcu_assign_pointer(sk->sk_reuseport_cb, NULL); - for (i = 0; i < reuse->num_socks; i++) { - if (reuse->socks[i] == sk) { - reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; - reuse->num_socks--; - if (reuse->num_socks == 0) - call_rcu(&reuse->rcu, reuseport_free_rcu); - break; - } - } + if (!__reuseport_detach_closed_sock(sk, reuse)) + __reuseport_detach_sock(sk, reuse); + + if (reuse->num_socks + reuse->num_closed_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + +out: spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); +void reuseport_stop_listen_sock(struct sock *sk) +{ + if (sk->sk_protocol == IPPROTO_TCP) { + struct sock_reuseport *reuse; + struct bpf_prog *prog; + + spin_lock_bh(&reuseport_lock); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req || + (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { + /* Migration capable, move sk from the listening section + * to the closed section. 
+ */ + bpf_sk_reuseport_detach(sk); + + __reuseport_detach_sock(sk, reuse); + __reuseport_add_closed_sock(sk, reuse); + + spin_unlock_bh(&reuseport_lock); + return; + } + + spin_unlock_bh(&reuseport_lock); + } + + /* Not capable to do migration, detach immediately */ + reuseport_detach_sock(sk); +} +EXPORT_SYMBOL(reuseport_stop_listen_sock); + static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, struct bpf_prog *prog, struct sk_buff *skb, int hdr_len) @@ -260,6 +455,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, return reuse->socks[index]; } +static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, + u32 hash, u16 num_socks) +{ + int i, j; + + i = j = reciprocal_scale(hash, num_socks); + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + i++; + if (i >= num_socks) + i = 0; + if (i == j) + return NULL; + } + + return reuse->socks[i]; +} + /** * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. * @sk: First socket in the group. @@ -290,32 +502,21 @@ struct sock *reuseport_select_sock(struct sock *sk, prog = rcu_dereference(reuse->prog); socks = READ_ONCE(reuse->num_socks); if (likely(socks)) { - /* paired with smp_wmb() in reuseport_add_sock() */ + /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); if (!prog || !skb) goto select_by_hash; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); else sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ - if (!sk2) { - int i, j; - - i = j = reciprocal_scale(hash, socks); - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { - i++; - if (i >= socks) - i = 0; - if (i == j) - goto out; - } - sk2 = reuse->socks[i]; - } + if (!sk2) + sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); } out: @@ -324,14 +525,90 @@ struct sock *reuseport_select_sock(struct sock *sk, } EXPORT_SYMBOL(reuseport_select_sock); +/** + * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. + * @sk: close()ed or shutdown()ed socket in the group. + * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or + * NEW_SYN_RECV request socket during 3WHS. + * @skb: skb to run through BPF filter. + * Returns a socket (with sk_refcnt +1) that should accept the child socket + * (or NULL on error). 
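[Editor's aside, not part of the patch] reuseport_select_sock_by_hash() above turns the flow hash into a starting index with reciprocal_scale() and then probes forward, skipping sockets already in TCP_ESTABLISHED (for example connected UDP sockets), wrapping at num_socks and giving up after a full lap. A standalone sketch of that selection, with reciprocal_scale() open-coded as the 32x32->64 multiply-and-shift it is in the kernel; the fake socket states and the sample hash are fabricated.

#include <stdio.h>
#include <stdint.h>

#define TCP_ESTABLISHED	1
#define TCP_LISTEN	10

struct fake_sock { int sk_state; };

/* reciprocal_scale(): map a 32-bit hash onto [0, n) without a division */
static uint32_t reciprocal_scale(uint32_t hash, uint32_t n)
{
	return (uint32_t)(((uint64_t)hash * n) >> 32);
}

static int select_by_hash(const struct fake_sock *socks, uint32_t hash,
			  uint16_t num_socks)
{
	uint32_t i, j;

	i = j = reciprocal_scale(hash, num_socks);
	while (socks[i].sk_state == TCP_ESTABLISHED) {
		i++;
		if (i >= num_socks)
			i = 0;
		if (i == j)
			return -1;	/* every socket is connected: no match */
	}
	return (int)i;
}

int main(void)
{
	struct fake_sock socks[4] = {
		{ TCP_LISTEN }, { TCP_ESTABLISHED },
		{ TCP_LISTEN }, { TCP_ESTABLISHED },
	};
	uint32_t hash = 0x9e3779b9;	/* arbitrary flow hash */

	printf("hash %#x -> socket index %d\n", hash,
	       select_by_hash(socks, hash, 4));
	return 0;
}

Factoring this loop out of reuseport_select_sock() is what allows reuseport_migrate_sock() below to reuse the same fallback when no BPF program picks a target.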
+ */ +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb) +{ + struct sock_reuseport *reuse; + struct sock *nsk = NULL; + bool allocated = false; + struct bpf_prog *prog; + u16 socks; + u32 hash; + + rcu_read_lock(); + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (!reuse) + goto out; + + socks = READ_ONCE(reuse->num_socks); + if (unlikely(!socks)) + goto failure; + + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + hash = migrating_sk->sk_hash; + prog = rcu_dereference(reuse->prog); + if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + goto select_by_hash; + goto failure; + } + + if (!skb) { + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto failure; + allocated = true; + } + + nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); + + if (allocated) + kfree_skb(skb); + +select_by_hash: + if (!nsk) + nsk = reuseport_select_sock_by_hash(reuse, hash, socks); + + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) { + nsk = NULL; + goto failure; + } + +out: + rcu_read_unlock(); + return nsk; + +failure: + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + goto out; +} +EXPORT_SYMBOL(reuseport_migrate_sock); + int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (sk_unhashed(sk) && sk->sk_reuseport) { - int err = reuseport_alloc(sk, false); + if (sk_unhashed(sk)) { + int err; + + if (!sk->sk_reuseport) + return -EINVAL; + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -357,13 +634,24 @@ int reuseport_detach_prog(struct sock *sk) struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return sk->sk_reuseport ? -ENOENT : -EINVAL; - old_prog = NULL; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* reuse must be checked after acquiring the reuseport_lock + * because reuseport_grow() can detach a closed sk. + */ + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return sk->sk_reuseport ? -ENOENT : -EINVAL; + } + + if (sk_unhashed(sk) && reuse->num_closed_socks) { + spin_unlock_bh(&reuseport_lock); + return -ENOENT; + } + old_prog = rcu_replace_pointer(reuse->prog, old_prog, lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 23b06063e1a51..27f80112c7072 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -679,6 +679,24 @@ config TCP_CONG_BBR AQM schemes that do not provide a delay signal. It requires the fq ("Fair Queue") pacing packet scheduler. +config TCP_CONG_BBR2 + tristate "BBR2 TCP" + default n + help + + BBR2 TCP congestion control is a model-based congestion control + algorithm that aims to maximize network utilization, keep queues and + retransmit rates low, and to be able to coexist with Reno/CUBIC in + common scenarios. It builds an explicit model of the network path. It + tolerates a targeted degree of random packet loss and delay that are + unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, + or cable modem links, and can use DCTCP-L4S-style ECN signals. 
It can + coexist with flows that use loss-based congestion control, and can + operate with shallow buffers, deep buffers, bufferbloat, policers, or + AQM schemes that do not provide a delay signal. It requires pacing, + using either TCP internal pacing or the fq ("Fair Queue") pacing packet + scheduler. + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -716,6 +734,9 @@ choice config DEFAULT_BBR bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_BBR2 + bool "BBR2" if TCP_CONG_BBR2=y + config DEFAULT_RENO bool "Reno" endchoice @@ -740,6 +761,7 @@ config DEFAULT_TCP_CONG default "dctcp" if DEFAULT_DCTCP default "cdg" if DEFAULT_CDG default "bbr" if DEFAULT_BBR + default "bbr2" if DEFAULT_BBR2 default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b95..babfdbf6c15b9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_rate.o tcp_recovery.o tcp_ulp.o \ - tcp_offload.o datagram.o raw.o udp.o udplite.o \ + tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ @@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o +obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764d..8ad93e1fe9dd3 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -16,7 +16,7 @@ static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, cwnd_event), offsetof(struct tcp_congestion_ops, in_ack_event), offsetof(struct tcp_congestion_ops, pkts_acked), - offsetof(struct tcp_congestion_ops, min_tso_segs), + offsetof(struct tcp_congestion_ops, tso_segs), offsetof(struct tcp_congestion_ops, sndbuf_expand), offsetof(struct tcp_congestion_ops, cong_control), }; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5f71a1c74e7e0..1e7a011626ca7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -135,10 +135,18 @@ static int inet_csk_bind_conflict(const struct sock *sk, bool relax, bool reuseport_ok) { struct sock *sk2; + bool reuseport_cb_ok; bool reuse = sk->sk_reuse; bool reuseport = !!sk->sk_reuseport; + struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + rcu_read_lock(); + reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); + /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ + reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); + rcu_read_unlock(); + /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed @@ -160,14 +168,14 @@ static int inet_csk_bind_conflict(const struct sock *sk, if ((!relax || (!reuseport_ok && reuseport && sk2->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && + reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(uid, sock_i_uid(sk2))))) && inet_rcv_saddr_equal(sk, sk2, true)) break; } else if (!reuseport_ok || !reuseport || !sk2->sk_reuseport || - 
rcu_access_pointer(sk->sk_reuseport_cb) || + !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2)))) { if (inet_rcv_saddr_equal(sk, sk2, true)) @@ -691,6 +699,66 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +static struct request_sock *inet_reqsk_clone(struct request_sock *req, + struct sock *sk) +{ + struct sock *req_sk, *nreq_sk; + struct request_sock *nreq; + + nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); + if (!nreq) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + + /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ + sock_put(sk); + return NULL; + } + + req_sk = req_to_sk(req); + nreq_sk = req_to_sk(nreq); + + memcpy(nreq_sk, req_sk, + offsetof(struct sock, sk_dontcopy_begin)); + memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, + req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); + + sk_node_init(&nreq_sk->sk_node); + nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; +#ifdef CONFIG_XPS + nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; +#endif + nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; + + nreq->rsk_listener = sk; + + /* We need not acquire fastopenq->lock + * because the child socket is locked in inet_csk_listen_stop(). + */ + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) + rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); + + return nreq; +} + +static void reqsk_queue_migrated(struct request_sock_queue *queue, + const struct request_sock *req) +{ + if (req->num_timeout == 0) + atomic_inc(&queue->young); + atomic_inc(&queue->qlen); +} + +static void reqsk_migrate_reset(struct request_sock *req) +{ + req->saved_syn = NULL; +#if IS_ENABLED(CONFIG_IPV6) + inet_rsk(req)->ipv6_opt = NULL; + inet_rsk(req)->pktopts = NULL; +#else + inet_rsk(req)->ireq_opt = NULL; +#endif +} + /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { @@ -731,15 +799,39 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); + struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; - struct net *net = sock_net(sk_listener); - struct inet_connection_sock *icsk = inet_csk(sk_listener); - struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct inet_connection_sock *icsk; + struct request_sock_queue *queue; + struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; - if (inet_sk_state_load(sk_listener) != TCP_LISTEN) - goto drop; + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { + struct sock *nsk; + + nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); + if (!nsk) + goto drop; + nreq = inet_reqsk_clone(req, nsk); + if (!nreq) + goto drop; + + /* The new timer for the cloned req can decrease the 2 + * by calling inet_csk_reqsk_queue_drop_and_put(), so + * hold another count to prevent use-after-free and + * call reqsk_put() just before return. + */ + refcount_set(&nreq->rsk_refcnt, 2 + 1); + timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); + reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); + + req = nreq; + sk_listener = nsk; + } + + icsk = inet_csk(sk_listener); + net = sock_net(sk_listener); max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? 
: READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature @@ -759,6 +851,7 @@ static void reqsk_timer_handler(struct timer_list *t) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ + queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; @@ -783,10 +876,39 @@ static void reqsk_timer_handler(struct timer_list *t) atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer(&req->rsk_timer, jiffies + timeo); + + if (!nreq) + return; + + if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { + /* delete timer */ + inet_csk_reqsk_queue_drop(sk_listener, nreq); + goto no_ownership; + } + + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(oreq); + reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); + reqsk_put(oreq); + + reqsk_put(nreq); return; } + + /* Even if we can clone the req, we may need not retransmit any more + * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another + * CPU may win the "own_req" race so that inet_ehash_insert() fails. + */ + if (nreq) { + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); +no_ownership: + reqsk_migrate_reset(nreq); + reqsk_queue_removed(queue, nreq); + __reqsk_free(nreq); + } + drop: - inet_csk_reqsk_queue_drop_and_put(sk_listener, req); + inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, @@ -1018,12 +1140,42 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { - inet_csk_reqsk_queue_drop(sk, req); - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - if (inet_csk_reqsk_queue_add(sk, req, child)) + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + + if (sk != req->rsk_listener) { + /* another listening sk has been selected, + * migrate the req to it. + */ + struct request_sock *nreq; + + /* hold a refcnt for the nreq->rsk_listener + * which is assigned in inet_reqsk_clone() + */ + sock_hold(sk); + nreq = inet_reqsk_clone(req, sk); + if (!nreq) { + inet_child_forget(sk, req, child); + goto child_put; + } + + refcount_set(&nreq->rsk_refcnt, 1); + if (inet_csk_reqsk_queue_add(sk, nreq, child)) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(req); + reqsk_put(req); + return child; + } + + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; + } } /* Too bad, another child took ownership of the request, undo. */ +child_put: bh_unlock_sock(child); sock_put(child); return NULL; @@ -1049,14 +1201,40 @@ void inet_csk_listen_stop(struct sock *sk) * of the variants now. 
--ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { - struct sock *child = req->sk; + struct sock *child = req->sk, *nsk; + struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); + nsk = reuseport_migrate_sock(sk, child, NULL); + if (nsk) { + nreq = inet_reqsk_clone(req, nsk); + if (nreq) { + refcount_set(&nreq->rsk_refcnt, 1); + + if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(req); + } else { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQFAILURE); + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } + + /* inet_csk_reqsk_queue_add() has already + * called inet_child_forget() on failure case. + */ + goto skip_child_forget; + } + } + inet_child_forget(sk, req, child); +skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ad050f8476b8e..196acc78d8a2e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -679,7 +679,7 @@ static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) return; if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); + reuseport_stop_listen_sock(sk); if (ilb) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 80d13d8f982dc..dfa9cbe280951 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -294,6 +294,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), + SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), + SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), + SNMP_MIB_ITEM("TCPECNRehash", LINUX_MIB_TCPECNREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 59ba518a85b9c..a7f8a94da69fc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -50,6 +50,8 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static int tcp_plb_max_rounds = 31; +static int tcp_plb_max_cong_thresh = 256; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -209,7 +211,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, net = container_of(table->data, struct net, ipv4.sysctl_ip_fwd_update_priority); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net); @@ -628,16 +630,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "udp_early_demux", .data = &init_net.ipv4.sysctl_udp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_douintvec_minmax, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_douintvec_minmax, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "nexthop_compat_mode", @@ -688,7 +690,7 @@ static struct 
ctl_table ipv4_net_table[] = { { .procname = "ip_forward_update_priority", .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, .extra1 = SYSCTL_ZERO, @@ -1309,6 +1311,47 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, + { + .procname = "tcp_plb_enabled", + .data = &init_net.ipv4.sysctl_tcp_plb_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "tcp_plb_cong_thresh", + .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, + { + .procname = "tcp_plb_idle_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_suspend_rto_sec", + .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3dd9b76f40559..92229a9030a34 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2863,6 +2863,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; + tp->fast_ack_mode = 0; /* Clean up fastopen related fields */ diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 6274462b86b4b..c0d5a4211fc18 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -292,26 +292,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* override sysctl_tcp_min_tso_segs */ static u32 bbr_min_tso_segs(struct sock *sk) { return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + u32 segs; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ static u32 bbr_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 segs, bytes; - - /* Sort of tcp_tso_autosize() but ignoring - * driver provided sk_gso_max_size. 
- */ - bytes = min_t(unsigned long, - sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); - segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - return min(segs, 0x7FU); + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -1147,7 +1161,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, - .min_tso_segs = bbr_min_tso_segs, + .tso_segs = bbr_tso_segs, .get_info = bbr_get_info, .set_state = bbr_set_state, }; diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c new file mode 100644 index 0000000000000..09319695da824 --- /dev/null +++ b/net/ipv4/tcp_bbr2.c @@ -0,0 +1,2692 @@ +/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 + * + * BBRv2 is a model-based congestion control algorithm that aims for low + * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model + * of the network path, it uses measurements of bandwidth and RTT, as well as + * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that + * although it can use ECN or loss signals explicitly, it does not require + * either; it can bound its in-flight data based on its estimate of the BDP. + * + * The model has both higher and lower bounds for the operating range: + * lo: bw_lo, inflight_lo: conservative short-term lower bound + * hi: bw_hi, inflight_hi: robust long-term upper bound + * The bandwidth-probing time scale is (a) extended dynamically based on + * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by + * an interactive wall-clock time-scale to be more scalable and responsive + * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * + * | + * V + * +---> STARTUP ----+ + * | | | + * | V | + * | DRAIN ----+ + * | | | + * | V | + * +---> PROBE_BW ----+ + * | ^ | | + * | | | | + * | +----+ | + * | | + * +---- PROBE_RTT <--+ + * + * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. + * When it estimates the pipe is full, it enters DRAIN to drain the queue. + * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. + * A long-lived BBR flow spends the vast majority of its time remaining + * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth + * in a fair manner, with a small, bounded queue. *If* a flow has been + * continuously sending for the entire min_rtt window, and hasn't seen an RTT + * sample that matches or decreases its min_rtt estimate for 10 seconds, then + * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe + * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if + * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; + * otherwise we enter STARTUP to try to fill the pipe. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. + * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, + * otherwise TCP stack falls back to an internal pacing using one high + * resolution timer per TCP socket and may use more resources. 
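+ * For example, fq pacing can typically be enabled with something like
+ * "tc qdisc add dev eth0 root fq", where "eth0" is only an illustrative
+ * device name.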
+ */ +#include +#include +#include +#include +#include + +#include "tcp_dctcp.h" + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. + */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ +#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ +}; + +/* How does the incoming ACK stream relate to our bandwidth probing? */ +enum bbr_ack_phase { + BBR_ACKS_INIT, /* not probing; not getting probe feedback */ + BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ + BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ + BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ + BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ + u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ + u64 cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ + ce_state:1, /* If most recent data has CE bit set */ + bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ + try_fast_path:1, /* can we take fast path? */ + unused2:11, + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + has_seen_rtt:1; /* have we seen an RTT sample yet? */ + u32 pacing_gain:11, /* current gain for setting pacing rate */ + cwnd_gain:11, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? 
*/ + full_bw_cnt:2, /* number of rounds without large bw gains */ + init_cwnd:7; /* initial cwnd */ + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + + /* For tracking ACK aggregation: */ + u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ + u16 extra_acked[2]; /* max excess data ACKed in epoch */ + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ + /* BBR v2 state: */ + unused1:2, + startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ + loss_in_cycle:1, /* packet loss in this cycle? */ + ecn_in_cycle:1; /* ECN in this cycle? */ + u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ + u32 undo_bw_lo; /* bw_lo before latest losses */ + u32 undo_inflight_lo; /* inflight_lo before latest losses */ + u32 undo_inflight_hi; /* inflight_hi before latest losses */ + u32 bw_latest; /* max delivered bw in last round trip */ + u32 bw_lo; /* lower bound on sending bandwidth */ + u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ + u32 inflight_latest; /* max delivered data in last round trip */ + u32 inflight_lo; /* lower bound of inflight data range */ + u32 inflight_hi; /* upper bound of inflight data range */ + u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ + u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ + u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ + u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ + ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ + bw_probe_samples:1, /* rate samples reflect bw probing? */ + prev_probe_too_high:1, /* did last PROBE_UP go too high? */ + stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ + rounds_since_probe:8, /* packet-timed rounds since probed bw */ + loss_round_start:1, /* loss_round_delivered round trip? */ + loss_in_round:1, /* loss marked in this round trip? */ + ecn_in_round:1, /* ECN marked in this round trip? */ + ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ + loss_events_in_round:4,/* losses in STARTUP round */ + initialized:1; /* has bbr_init() been called? */ + u32 alpha_last_delivered; /* tp->delivered at alpha update */ + u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ + struct tcp_plb_state plb; + + /* Params configurable using setsockopt. Refer to correspoding + * module param for detailed description of params. 
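+ * The gain fields here are fixed-point fractions scaled by BBR_UNIT
+ * (1 << BBR_SCALE = 256). For example, the default high_gain of
+ * BBR_UNIT * 2885 / 1000 + 1 evaluates to 739, i.e. roughly a 2.89x
+ * STARTUP pacing gain.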
+ */ + struct bbr_params { + u32 high_gain:11, /* max allowed value: 2047 */ + drain_gain:10, /* max allowed value: 1023 */ + cwnd_gain:11; /* max allowed value: 2047 */ + u32 cwnd_min_target:4, /* max allowed value: 15 */ + min_rtt_win_sec:5, /* max allowed value: 31 */ + probe_rtt_mode_ms:9, /* max allowed value: 511 */ + full_bw_cnt:3, /* max allowed value: 7 */ + cwnd_tso_budget:1, /* allowed values: {0, 1} */ + unused3:6, + drain_to_target:1, /* boolean */ + precise_ece_ack:1, /* boolean */ + extra_acked_in_startup:1, /* allowed values: {0, 1} */ + fast_path:1; /* boolean */ + u32 full_bw_thresh:10, /* max allowed value: 1023 */ + startup_cwnd_gain:11, /* max allowed value: 2047 */ + bw_probe_pif_gain:9, /* max allowed value: 511 */ + usage_based_cwnd:1, /* boolean */ + unused2:1; + u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ + refill_add_inc:2; /* max allowed value: 3 */ + u16 extra_acked_gain:11, /* max allowed value: 2047 */ + extra_acked_win_rtts:5; /* max allowed value: 31*/ + u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ + /* Mostly BBR v2 parameters below here: */ + u32 ecn_alpha_gain:8, /* max allowed value: 255 */ + ecn_factor:8, /* max allowed value: 255 */ + ecn_thresh:8, /* max allowed value: 255 */ + beta:8; /* max allowed value: 255 */ + u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ + bw_probe_reno_gain:9, /* max allowed value: 511 */ + full_loss_cnt:4; /* max allowed value: 15 */ + u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ + inflight_headroom:8, /* max allowed value: 255 */ + loss_thresh:8, /* max allowed value: 255 */ + bw_probe_max_rounds:8; /* max allowed value: 255 */ + u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ + bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ + full_ecn_cnt:2; /* max allowed value: 3 */ + u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ + undo:1, /* boolean */ + tso_rtt_shift:4, /* max allowed value: 15 */ + unused5:1; + u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ + unused1:14, + ecn_alpha_init:9; /* max allowed value: 256 */ + } params; + + struct { + u32 snd_isn; /* Initial sequence number */ + u32 rs_bw; /* last valid rate sample bw */ + u32 target_cwnd; /* target cwnd, based on BDP */ + u8 undo:1, /* Undo even happened but not yet logged */ + unused:7; + char event; /* single-letter event debug codes */ + u16 unused2; + } debug; +}; + +struct bbr_context { + u32 sample_bw; + u32 target_cwnd; + u32 log:1; +}; + +/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ +static u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. + * Max allowed value is 511 (0x1FF). + */ +static u32 bbr_probe_rtt_mode_ms = 200; +/* Window length of probe_rtt_min_us filter (in ms), and consequently the + * typical interval between PROBE_RTT mode entries. + * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC + */ +static u32 bbr_probe_rtt_win_ms = 5000; +/* Skip TSO below the following bandwidth (bits/sec): */ +static int bbr_min_tso_rate = 1200000; + +/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting + * in bigger TSO bursts. By default we cut the RTT-based allowance in half + * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance + * is below 1500 bytes after 6 * ~500 usec = 3ms. 
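+ * As a worked example with hypothetical RTTs: with the default shift of
+ * 9, a 1024 usec min_rtt gives r = 1024 >> 9 = 2 and an RTT-based
+ * allowance of 64 KB >> 2 = 16 KB, while a 3 ms min_rtt gives r = 6 and
+ * an allowance of 64 KB >> 6 = 1 KB, below a single 1500 byte MTU.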
+ */ +static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ + +/* Select cwnd TSO budget approach: + * 0: padding + * 1: flooring + */ +static uint bbr_cwnd_tso_budget = 1; + +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. + */ +static const int bbr_pacing_margin_percent = 1; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). + */ +static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ +static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round. Max allowed value + * is 1023 (0x3FF). + */ +static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. + * Max allowed value is 2047 (0x7FF). + */ +static int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. + * Max allowed value for each element is 1023 (0x3FF). + */ +enum bbr_pacing_gain_phase { + BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ + BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ + BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ + BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ +}; +static int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight. Max allowed value is 15 (0xF). + */ +static u32 bbr_cwnd_min_target = 4; + +/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. + * Use 0 to disable. Max allowed value is 255. + */ +static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available. + * Max allowed value is 1023 (0x3FF). + */ +static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full. + * Max allowed value is 7 (0x7). + */ +static u32 bbr_full_bw_cnt = 3; + +static u32 bbr_flags; /* Debugging related stuff */ + +/* Whether to debug using printk. + */ +static bool bbr_debug_with_printk; + +/* Whether to debug using ftrace event tcp:tcp_bbr_event. + * Ignored when bbr_debug_with_printk is set. + */ +static bool bbr_debug_ftrace; + +/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
*/ +static bool bbr_drain_to_target = true; /* default: enabled */ + +/* Experiment: Flags to control BBR with ECN behavior. + */ +static bool bbr_precise_ece_ack = true; /* default: enabled */ + +/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is + * (2^(16+14) B)/(1024 B/packet) = 1M packets. + */ +static u32 bbr_cwnd_warn_val = 1U << 20; + +static u16 bbr_debug_port_mask; + +/* BBR module parameters. These are module parameters only in Google prod. + * Upstream these are intentionally not module parameters. + */ +static int bbr_pacing_gain_size = CYCLE_LEN; + +/* Gain factor for adding extra_acked to target cwnd: */ +static int bbr_extra_acked_gain = 256; + +/* Window length of extra_acked window. Max allowed val is 31. */ +static u32 bbr_extra_acked_win_rtts = 5; + +/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ +static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + +/* Time period for clamping cwnd increment due to ack aggregation */ +static u32 bbr_extra_acked_max_us = 100 * 1000; + +/* Use extra acked in startup ? + * 0: disabled + * 1: use latest extra_acked value from 1-2 rtt in startup + */ +static int bbr_extra_acked_in_startup = 1; /* default: enabled */ + +/* Experiment: don't grow cwnd beyond twice of what we just probed. */ +static bool bbr_usage_based_cwnd; /* default: disabled */ + +/* For lab testing, researchers can enable BBRv2 ECN support with this flag, + * when they know that any ECN marks that the connections experience will be + * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. + * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on + * negotiation or configuration that is outside the scope of the BBRv2 + * alpha release. + */ +static bool bbr_ecn_enable = false; + +module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); +module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); +module_param_named(high_gain, bbr_high_gain, int, 0644); +module_param_named(drain_gain, bbr_drain_gain, int, 0644); +module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); +module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); +module_param_array_named(pacing_gain, bbr_pacing_gain, int, + &bbr_pacing_gain_size, 0644); +module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); +module_param_named(probe_rtt_cwnd_gain, + bbr_probe_rtt_cwnd_gain, uint, 0664); +module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); +module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); +module_param_named(flags, bbr_flags, uint, 0644); +module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); +module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); +module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); +module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); +module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); +module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); +module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); +module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); +module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); +module_param_named(extra_acked_win_rtts, + bbr_extra_acked_win_rtts, uint, 0664); +module_param_named(extra_acked_max_us, + bbr_extra_acked_max_us, uint, 0664); +module_param_named(ack_epoch_acked_reset_thresh, + bbr_ack_epoch_acked_reset_thresh, uint, 0664); 
+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); +module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); +module_param_named(extra_acked_in_startup, + bbr_extra_acked_in_startup, int, 0664); +module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); +module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); + +static void bbr2_exit_probe_rtt(struct sock *sk); +static void bbr2_reset_congestion_signals(struct sock *sk); + +static void bbr_check_probe_rtt_done(struct sock *sk); + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_reached; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->bw_hi[0], bbr->bw_hi[1]); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return min(bbr_max_bw(sk), bbr->bw_lo); +} + +/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, + int margin) +{ + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC / 100 * (100 - margin); + rate >>= BW_SCALE; + rate = max(rate, 1ULL); + return rate; +} + +static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) +{ + return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); +} + +static u64 bbr_rate_kbps(struct sock *sk, u64 rate) +{ + rate = bbr_bw_bytes_per_sec(sk, rate); + rate *= 8; + do_div(rate, 1000); + return rate; +} + +static u32 bbr_tso_segs_goal(struct sock *sk); +static void bbr_debug(struct sock *sk, u32 acked, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + static const char ca_states[] = { + [TCP_CA_Open] = 'O', + [TCP_CA_Disorder] = 'D', + [TCP_CA_CWR] = 'C', + [TCP_CA_Recovery] = 'R', + [TCP_CA_Loss] = 'L', + }; + static const char mode[] = { + 'G', /* Growing - BBR_STARTUP */ + 'D', /* Drain - BBR_DRAIN */ + 'W', /* Window - BBR_PROBE_BW */ + 'M', /* Min RTT - BBR_PROBE_RTT */ + }; + static const char ack_phase[] = { /* bbr_ack_phase strings */ + 'I', /* BBR_ACKS_INIT - 'Init' */ + 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ + 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ + 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ + 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ + }; + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + const u32 una = tp->snd_una - bbr->debug.snd_isn; + const u32 fack = tcp_highest_sack_seq(tp); + const u16 dport = ntohs(inet_sk(sk)->inet_dport); + bool is_port_match = (bbr_debug_port_mask && + ((dport & bbr_debug_port_mask) == 0)); + char debugmsg[320]; + + if (sk->sk_state == TCP_SYN_SENT) + return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ + + if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { + char addr[INET6_ADDRSTRLEN + 10] = { 0 }; + + if (sk->sk_family == 
AF_INET) + snprintf(addr, sizeof(addr), "%pI4:%u", + &inet_sk(sk)->inet_daddr, dport); + else if (sk->sk_family == AF_INET6) + snprintf(addr, sizeof(addr), "%pI6:%u", + &sk->sk_v6_daddr, dport); + + WARN_ONCE(1, + "BBR %s cwnd alert: %u " + "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " + "bw: %u rtt: %u min_rtt: %u " + "acked: %u tso_segs: %u " + "bw: %d %ld %d pif: %u\n", + addr, tp->snd_cwnd, + una, inet_csk(sk)->icsk_ca_state, + bbr->pacing_gain, bbr->cwnd_gain, + bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, + acked, bbr_tso_segs_goal(sk), + rs->delivered, rs->interval_us, rs->is_retrans, + tcp_packets_in_flight(tp)); + } + + if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) + return; + + if (!sock_flag(sk, SOCK_DBG) && !is_port_match) + return; + + if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) + return; + + if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && + !(bbr_flags & FLAG_DEBUG_LOOPBACK)) + return; + + snprintf(debugmsg, sizeof(debugmsg) - 1, + "BBR %pI4:%-5u %5u,%03u:%-7u %c " + "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " + "bw %llu lb %llu ib %llu qb %llu " + "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " + "lr %d er %d ea %d bwl %lld il %d ih %d c %d " + "v %d %c %u %c %s\n", + &inet_sk(sk)->inet_daddr, dport, + una / 1000, una % 1000, fack - tp->snd_una, + ca_states[inet_csk(sk)->icsk_ca_state], + bbr->debug.undo ? '@' : mode[bbr->mode], + tp->snd_cwnd, + bbr_extra_acked(sk), /* br (legacy): extra_acked */ + rs->tx_in_flight, /* cr (legacy): tx_inflight */ + rs->rtt_us, + rs->delivered, + rs->interval_us, + bbr->min_rtt_us, + rs->is_app_limited ? '_' : 'l', + bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ + bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ + 0ULL, /* lb: [obsolete] */ + 0ULL, /* ib: [obsolete] */ + (u64)sk->sk_pacing_rate * 8 / 1000, + acked, + tcp_packets_in_flight(tp), + rs->is_ack_delayed ? 'd' : '.', + bbr->round_start ? '*' : '.', + tp->delivered, tp->lost, + tp->app_limited, + 0, /* #: [obsolete] */ + ctx->target_cwnd, + tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ + ca_states[bbr->prev_ca_state], + (rs->lost + rs->delivered) > 0 ? + (1000 * rs->lost / + (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ + (rs->delivered) > 0 ? + (1000 * rs->delivered_ce / + (rs->delivered)) : 0, /* er: ECN rate x1000 */ + 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ + bbr->bw_lo == ~0U ? + -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ + bbr->inflight_lo, /* il */ + bbr->inflight_hi, /* ih */ + bbr->bw_probe_up_cnt, /* c */ + 2, /* v: version */ + bbr->debug.event, + bbr->cycle_idx, + ack_phase[bbr->ack_phase], + bbr->bw_probe_samples ? "Y" : "N"); + debugmsg[sizeof(debugmsg) - 1] = 0; + + /* printk takes a higher precedence. */ + if (bbr_debug_with_printk) + printk(KERN_DEBUG "%s", debugmsg); + + if (unlikely(bbr->debug.undo)) + bbr->debug.undo = 0; +} + +/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain, + bbr_pacing_margin_percent); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; +} + +/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ +static void bbr_init_pacing_rate_from_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + u32 rtt_us; + + if (tp->srtt_us) { /* any RTT sample yet? 
*/ + rtt_us = max(tp->srtt_us >> 3, 1U); + bbr->has_seen_rtt = 1; + } else { /* no RTT sample yet */ + rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ + } + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, rtt_us); + sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); +} + +/* Pace using current bw estimate and a gain factor. */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); + + if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) + bbr_init_pacing_rate_from_rtt(sk); + if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +static u32 bbr_min_tso_segs(struct sock *sk) +{ + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; +} + +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 segs, r; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + /* Budget a TSO/GSO burst size allowance based on min_rtt. For every + * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. + * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) + */ + if (bbr->params.tso_rtt_shift) { + r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; + if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ + bytes += GSO_MAX_SIZE >> r; + } + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START) { + tcp_plb_check_rehash(sk, &bbr->plb); + + if (!tp->app_limited) + return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). 
+ */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); + } else if ((event == CA_EVENT_ECN_IS_CE || + event == CA_EVENT_ECN_NO_CE) && + bbr_ecn_enable && + bbr->params.precise_ece_ack) { + u32 state = bbr->ce_state; + dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); + bbr->ce_state = state; + if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + } +} + +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: + * + * bdp = ceil(bw * min_rtt * gain) + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + */ +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bdp; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, remove the BW_SCALE shift, and + * round the value up to avoid a negative feedback loop. + */ + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 tso_segs_goal; + + tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + if (bbr->params.cwnd_tso_budget == 1) { + cwnd = max_t(u32, cwnd, tso_segs_goal); + cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); + } else { + cwnd += tso_segs_goal; + cwnd = (cwnd + 1) & ~1U; + } + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +} + +/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight); + + return inflight; +} + +/* With pacing at lower layers, there's often less data "in the network" than + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), + * we often have several skbs queued in the pacing layer with a pre-scheduled + * earliest departure time (EDT). 
BBR adapts its pacing rate based on the + * inflight level that it estimates has already been "baked in" by previous + * departure time decisions. We calculate a rough estimate of the number of our + * packets that might be in the network at the earliest departure time for the + * next skb scheduled: + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw + * If we're increasing inflight, then we want to know if the transmit of the + * EDT skb will push inflight above the target, so inflight_at_edt includes + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, + * then estimate if inflight will sink too low just before the EDT transmit. + */ +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 now_ns, edt_ns, interval_us; + u32 interval_delivered, inflight_at_edt; + + now_ns = tp->tcp_clock_cache; + edt_ns = max(tp->tcp_wstamp_ns, now_ns); + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; + inflight_at_edt = inflight_now; + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ + if (interval_delivered >= inflight_at_edt) + return 0; + return inflight_at_edt - interval_delivered; +} + +/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr->params.extra_acked_gain && + (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + +/* Returns the cwnd for PROBE_RTT mode. */ +static u32 bbr_probe_rtt_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->params.probe_rtt_cwnd_gain == 0) + return bbr->params.cwnd_min_target; + return max_t(u32, bbr->params.cwnd_min_target, + bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain, u32 cwnd, + struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + bbr->debug.target_cwnd = target_cwnd; + + /* Update cwnd and enable fast path if cwnd reaches target_cwnd. 
*/ + bbr->try_fast_path = 0; + if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ + cwnd += acked; + if (cwnd >= target_cwnd) { + cwnd = target_cwnd; + bbr->try_fast_path = 1; + } + } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { + cwnd += acked; + } else { + bbr->try_fast_path = 1; + } + + /* When growing cwnd, don't grow beyond twice what we just probed. */ + if (bbr->params.usage_based_cwnd) { + max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); + cwnd = min(cwnd, max_probe); + } + + cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); + + ctx->target_cwnd = target_cwnd; + ctx->log = (tp->snd_cwnd != prev_cwnd); +} + +/* See if we have reached next round trip */ +static void bbr_update_round_start(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->round_start = 0; + + /* See if we've reached the next RTT */ + if (rs->interval_us > 0 && + !before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->round_start = 1; + } +} + +/* Calculate the bandwidth based on how fast packets are delivered */ +static void bbr_calculate_bw_sample(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + * Round up to allow growth at low rates, even with integer division. + */ + if (rs->interval_us > 0) { + if (WARN_ONCE(rs->delivered < 0, + "negative delivered: %d interval_us: %ld\n", + rs->delivered, rs->interval_us)) + return; + + bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } + + ctx->sample_bw = bw; + bbr->debug.rs_bw = bw; +} + +/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips for non-startup phase, and 1-2 round trips for startup. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; + + if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->params.extra_acked_in_startup && + !bbr_full_bw_reached(sk)) + extra_acked_win_rtts_thresh = 1; + if (bbr->extra_acked_win_rtts >= + extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? 
+ 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. */ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. */ +static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + tcp_sk(sk)->snd_ssthresh = + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); + bbr2_reset_congestion_signals(sk); + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) + return true; /* exiting DRAIN now */ + return false; +} + +static void bbr_check_probe_rtt_done(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (!(bbr->probe_rtt_done_stamp && + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + + bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr2_exit_probe_rtt(sk); +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). 
This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool probe_rtt_expired, min_rtt_expired; + u32 expire; + + /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ + expire = bbr->probe_rtt_min_stamp + + msecs_to_jiffies(bbr->params.probe_rtt_win_ms); + probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->probe_rtt_min_us || + (probe_rtt_expired && !rs->is_ack_delayed))) { + bbr->probe_rtt_min_us = rs->rtt_us; + bbr->probe_rtt_min_stamp = tcp_jiffies32; + } + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; + min_rtt_expired = after(tcp_jiffies32, expire); + if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || + min_rtt_expired) { + bbr->min_rtt_us = bbr->probe_rtt_min_us; + bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + + if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
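+ * With the default probe_rtt_cwnd_gain of BBR_UNIT/2, the "min packets"
+ * level is bbr_probe_rtt_cwnd(): about half the estimated BDP, floored
+ * at cwnd_min_target. For a hypothetical 100-packet BDP, that means
+ * holding roughly 50 packets in flight during PROBE_RTT.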
*/ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + + msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done) + bbr_check_probe_rtt_done(sk); + } + } + /* Restart after idle ends only once we process a new S/ACK for data */ + if (rs->delivered > 0) + bbr->idle_restart = 0; +} + +static void bbr_update_gains(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + switch (bbr->mode) { + case BBR_STARTUP: + bbr->pacing_gain = bbr->params.high_gain; + bbr->cwnd_gain = bbr->params.startup_cwnd_gain; + break; + case BBR_DRAIN: + bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ + bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ + break; + case BBR_PROBE_BW: + bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; + bbr->cwnd_gain = bbr->params.cwnd_gain; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + break; + default: + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); + break; + } +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + int i; + + WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); + + bbr->initialized = 1; + bbr->params.high_gain = min(0x7FF, bbr_high_gain); + bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); + bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); + bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); + bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); + bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); + bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); + bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); + bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); + bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); + bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); + bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); + bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; + bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; + bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; + bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); + bbr->params.probe_rtt_win_ms = + min(0x3FFFU, + min_t(u32, bbr_probe_rtt_win_ms, + bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); + for (i = 0; i < CYCLE_LEN; i++) + bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); + bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; + bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); + + bbr->debug.snd_isn = tp->snd_una; + bbr->debug.target_cwnd = 0; + bbr->debug.undo = 0; + + bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); + bbr->prior_cwnd = tp->prior_cwnd; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->probe_rtt_min_us = tcp_min_rtt(tp); + bbr->probe_rtt_min_stamp = tcp_jiffies32; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_jiffies32; + + bbr->has_seen_rtt = 0; + bbr_init_pacing_rate_from_rtt(sk); + + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp = 0; + bbr->cycle_idx = 0; + bbr->mode = BBR_STARTUP; + bbr->debug.rs_bw = 0; + + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + + bbr->ce_state = 0; + bbr->prior_rcv_nxt = tp->rcv_nxt; + bbr->try_fast_path = 0; + + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* __________________________________________________________________________ + * + * Functions new to BBR v2 ("bbr") congestion control are below here. + * __________________________________________________________________________ + */ + +/* Incorporate a new bw sample into the current window of our max filter. */ +static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); +} + +/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ +static void bbr2_advance_bw_hi_filter(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (!bbr->bw_hi[1]) + return; /* no samples in this window; remember old window */ + bbr->bw_hi[0] = bbr->bw_hi[1]; + bbr->bw_hi[1] = 0; +} + +/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ +static u32 bbr2_target_inflight(struct sock *sk) +{ + u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + + return min(bdp, tcp_sk(sk)->snd_cwnd); +} + +static bool bbr2_is_probing_bandwidth(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return (bbr->mode == BBR_STARTUP) || + (bbr->mode == BBR_PROBE_BW && + (bbr->cycle_idx == BBR_BW_PROBE_REFILL || + bbr->cycle_idx == BBR_BW_PROBE_UP)); +} + +/* Has the given amount of time elapsed since we marked the phase start? */ +static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bbr *bbr = inet_csk_ca(sk); + + return tcp_stamp_us_delta(tp->tcp_mstamp, + bbr->cycle_mstamp + interval_us) > 0; +} + +static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw_reached = 1; + bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +} + +/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. 
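+ * For example, with the default ecn_thresh of BBR_UNIT/2 (50%) and
+ * full_ecn_cnt of 2, two consecutive round trips in which at least half
+ * of the delivered packets were CE-marked end STARTUP and cap
+ * inflight_hi at the estimated BDP.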
*/ +static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || + !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) + return; + + if (ce_ratio >= bbr->params.ecn_thresh) + bbr->startup_ecn_rounds++; + else + bbr->startup_ecn_rounds = 0; + + if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { + bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ + bbr2_handle_queue_too_high_in_startup(sk); + return; + } +} + +static int bbr2_update_ecn_alpha(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + s32 delivered, delivered_ce; + u64 alpha, ce_ratio; + u32 gain; + + if (bbr->params.ecn_factor == 0) + return -1; + + delivered = tp->delivered - bbr->alpha_last_delivered; + delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; + + if (delivered == 0 || /* avoid divide by zero */ + WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ + return -1; + + /* See if we should use ECN sender logic for this connection. */ + if (!bbr->ecn_eligible && bbr_ecn_enable && + (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || + !bbr->params.ecn_max_rtt_us)) + bbr->ecn_eligible = 1; + + ce_ratio = (u64)delivered_ce << BBR_SCALE; + do_div(ce_ratio, delivered); + gain = bbr->params.ecn_alpha_gain; + alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; + alpha += (gain * ce_ratio) >> BBR_SCALE; + bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); + + bbr->alpha_last_delivered = tp->delivered; + bbr->alpha_last_delivered_ce = tp->delivered_ce; + + bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); + return (int)ce_ratio; +} + +/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ +static void bbr2_raise_inflight_hi_slope(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 growth_this_round, cnt; + + /* Calculate "slope": packets S/Acked per inflight_hi increment. */ + growth_this_round = 1 << bbr->bw_probe_up_rounds; + bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); + cnt = tp->snd_cwnd / growth_this_round; + cnt = max(cnt, 1U); + bbr->bw_probe_up_cnt = cnt; + bbr->debug.event = 'G'; /* Grow inflight_hi slope */ +} + +/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ +static void bbr2_probe_inflight_hi_upward(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 delta; + + if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { + bbr->bw_probe_up_acks = 0; /* don't accmulate unused credits */ + return; /* not fully using inflight_hi, so don't grow it */ + } + + /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ + bbr->bw_probe_up_acks += rs->acked_sacked; + if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { + delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; + bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; + bbr->inflight_hi += delta; + bbr->debug.event = 'I'; /* Increment inflight_hi */ + } + + if (bbr->round_start) + bbr2_raise_inflight_hi_slope(sk); +} + +/* Does loss/ECN rate for this sample say inflight is "too high"? + * This is used by both the bbr_check_loss_too_high_in_startup() function, + * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which + * uses it to notice when loss/ECN rates suggest inflight is too high. 
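+ *
+ * For example, with the default loss_thresh of roughly 2%
+ * (BBR_UNIT * 2 / 100), a sample whose tx_in_flight was 100 packets is
+ * deemed too high once more than about 2% of that flight has been marked
+ * lost; with the default ecn_thresh of 50%, an ECN-eligible flow deems it
+ * too high once roughly half of the delivered packets were CE-marked.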
+ */ +static bool bbr2_is_inflight_too_high(const struct sock *sk, + const struct rate_sample *rs) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh, ecn_thresh; + + if (rs->lost > 0 && rs->tx_in_flight) { + loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> + BBR_SCALE; + if (rs->lost > loss_thresh) + return true; + } + + if (rs->delivered_ce > 0 && rs->delivered > 0 && + bbr->ecn_eligible && bbr->params.ecn_thresh) { + ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> + BBR_SCALE; + if (rs->delivered_ce >= ecn_thresh) + return true; + } + + return false; +} + +/* Calculate the tx_in_flight level that corresponded to excessive loss. + * We find "lost_prefix" segs of the skb where loss rate went too high, + * by solving for "lost_prefix" in the following equation: + * lost / inflight >= loss_thresh + * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh + * Then we take that equation, convert it to fixed point, and + * round up to the nearest packet. + */ +static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, + const struct rate_sample *rs, + const struct sk_buff *skb) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh = bbr->params.loss_thresh; + u32 pcount, divisor, inflight_hi; + s32 inflight_prev, lost_prev; + u64 loss_budget, lost_prefix; + + pcount = tcp_skb_pcount(skb); + + /* How much data was in flight before this skb? */ + inflight_prev = rs->tx_in_flight - pcount; + if (WARN_ONCE(inflight_prev < 0, + "tx_in_flight: %u pcount: %u reneg: %u", + rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) + return ~0U; + + /* How much inflight data was marked lost before this skb? */ + lost_prev = rs->lost - pcount; + if (WARN_ON_ONCE(lost_prev < 0)) + return ~0U; + + /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ + loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; + loss_budget >>= BBR_SCALE; + if (lost_prev >= loss_budget) { + lost_prefix = 0; /* previous losses crossed loss_thresh */ + } else { + lost_prefix = loss_budget - lost_prev; + lost_prefix <<= BBR_SCALE; + divisor = BBR_UNIT - loss_thresh; + if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ + return ~0U; + do_div(lost_prefix, divisor); + } + + inflight_hi = inflight_prev + lost_prefix; + return inflight_hi; +} + +/* If loss/ECN rates during probing indicated we may have overfilled a + * buffer, return an operating point that tries to leave unutilized headroom in + * the path for other flows, for fairness convergence and lower RTTs and loss. + */ +static u32 bbr2_inflight_with_headroom(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 headroom, headroom_fraction; + + if (bbr->inflight_hi == ~0U) + return ~0U; + + headroom_fraction = bbr->params.inflight_headroom; + headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; + headroom = max(headroom, 1U); + return max_t(s32, bbr->inflight_hi - headroom, + bbr->params.cwnd_min_target); +} + +/* Bound cwnd to a sensible level, based on our current probing state + * machine phase and model of a good inflight level (inflight_lo, inflight_hi). + */ +static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cap; + + /* tcp_rcv_synsent_state_process() currently calls tcp_ack() + * and thus cong_control() without first initializing us(!). 
+ */ + if (!bbr->initialized) + return; + + cap = ~0U; + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { + /* Probe to see if more packets fit in the path. */ + cap = bbr->inflight_hi; + } else { + if (bbr->mode == BBR_PROBE_RTT || + (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) + cap = bbr2_inflight_with_headroom(sk); + } + /* Adapt to any loss/ECN since our last bw probe. */ + cap = min(cap, bbr->inflight_lo); + + cap = max_t(u32, cap, bbr->params.cwnd_min_target); + tp->snd_cwnd = min(cap, tp->snd_cwnd); +} + +/* Estimate a short-term lower bound on the capacity available now, based + * on measurements of the current delivery process and recent history. When we + * are seeing loss/ECN at times when we are not probing bw, then conservatively + * move toward flow balance by multiplicatively cutting our short-term + * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a + * multiplicative decrease in order to converge to a lower capacity in time + * logarithmic in the magnitude of the decrease. + * + * However, we do not cut our short-term estimates lower than the current rate + * and volume of delivered data from this round trip, since from the current + * delivery process we can estimate the measured capacity available now. + * + * Anything faster than that approach would knowingly risk high loss, which can + * cause low bw for Reno/CUBIC and high loss recovery latency for + * request/response flows using any congestion control. + */ +static void bbr2_adapt_lower_bounds(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 ecn_cut, ecn_inflight_lo, beta; + + /* We only use lower-bound estimates when not probing bw. + * When probing we need to push inflight higher to probe bw. + */ + if (bbr2_is_probing_bandwidth(sk)) + return; + + /* ECN response. */ + if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { + /* Reduce inflight to (1 - alpha*ecn_factor). */ + ecn_cut = (BBR_UNIT - + ((bbr->ecn_alpha * bbr->params.ecn_factor) >> + BBR_SCALE)); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tp->snd_cwnd; + ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; + } else { + ecn_inflight_lo = ~0U; + } + + /* Loss response. */ + if (bbr->loss_in_round) { + /* Reduce bw and inflight to (1 - beta). */ + if (bbr->bw_lo == ~0U) + bbr->bw_lo = bbr_max_bw(sk); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tp->snd_cwnd; + beta = bbr->params.beta; + bbr->bw_lo = + max_t(u32, bbr->bw_latest, + (u64)bbr->bw_lo * + (BBR_UNIT - beta) >> BBR_SCALE); + bbr->inflight_lo = + max_t(u32, bbr->inflight_latest, + (u64)bbr->inflight_lo * + (BBR_UNIT - beta) >> BBR_SCALE); + } + + /* Adjust to the lower of the levels implied by loss or ECN. */ + bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); +} + +/* Reset any short-term lower-bound adaptation to congestion, so that we can + * push our inflight up. + */ +static void bbr2_reset_lower_bounds(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_lo = ~0U; + bbr->inflight_lo = ~0U; +} + +/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state + * machine phase where we adapt our lower bound based on congestion signals. 
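+ * This runs when starting PROBE_DOWN (bbr2_start_bw_probe_down()) and at
+ * init, so the loss/ECN flags and the "latest" bw/inflight filters are
+ * rebuilt from scratch by the delivery process that follows the probe.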
+ */ +static void bbr2_reset_congestion_signals(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->loss_in_cycle = 0; + bbr->ecn_in_cycle = 0; + bbr->bw_latest = 0; + bbr->inflight_latest = 0; +} + +/* Update (most of) our congestion signals: track the recent rate and volume of + * delivered data, presence of loss, and EWMA degree of ECN marking. + */ +static void bbr2_update_congestion_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->loss_round_start = 0; + if (rs->interval_us <= 0 || !rs->acked_sacked) + return; /* Not a valid observation */ + bw = ctx->sample_bw; + + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) + bbr2_take_bw_hi_sample(sk, bw); + + bbr->loss_in_round |= (rs->losses > 0); + + /* Update rate and volume of delivered data from latest round trip: */ + bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); + bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); + + if (before(rs->prior_delivered, bbr->loss_round_delivered)) + return; /* skip the per-round-trip updates */ + /* Now do per-round-trip updates. */ + bbr->loss_round_delivered = tp->delivered; /* mark round trip */ + bbr->loss_round_start = 1; + bbr2_adapt_lower_bounds(sk); + + /* Update windowed "latest" (single-round-trip) filters. */ + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->bw_latest = ctx->sample_bw; + bbr->inflight_latest = rs->delivered; +} + +/* Bandwidth probing can cause loss. To help coexistence with loss-based + * congestion control we spread out our probing in a Reno-conscious way. Due to + * the shape of the Reno sawtooth, the time required between loss epochs for an + * idealized Reno flow is a number of round trips that is the BDP of that + * flow. We count packet-timed round trips directly, since measured RTT can + * vary widely, and Reno is driven by packet-timed round trips. + */ +static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 inflight, rounds, reno_gain, reno_rounds; + + /* Random loss can shave some small percentage off of our inflight + * in each round. To survive this, flows need robust periodic probes. + */ + rounds = bbr->params.bw_probe_max_rounds; + + reno_gain = bbr->params.bw_probe_reno_gain; + if (reno_gain) { + inflight = bbr2_target_inflight(sk); + reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; + rounds = min(rounds, reno_rounds); + } + return bbr->rounds_since_probe >= rounds; +} + +/* How long do we want to wait before probing for bandwidth (and risking + * loss)? We randomize the wait, for better mixing and fairness convergence. + * + * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. + * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, + * (eg 4K video to a broadband user): + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + * + * We bound the BBR-native inter-bw-probe wall clock time to be: + * (a) higher than 2 sec: to try to avoid causing loss for a long enough time + * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must + * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs + * (b) lower than 3 sec: to ensure flows can start probing in a reasonable + * amount of time to discover unutilized bw on human-scale interactive + * time-scales (e.g. 
perhaps traffic from a web page download that we + * were competing with is now complete). + */ +static void bbr2_pick_probe_wait(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Decide the random round-trip bound for wait until probe: */ + bbr->rounds_since_probe = + prandom_u32_max(bbr->params.bw_probe_rand_rounds); + /* Decide the random wall clock bound for wait until probe: */ + bbr->probe_wait_us = bbr->params.bw_probe_base_us + + prandom_u32_max(bbr->params.bw_probe_rand_us); +} + +static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = cycle_idx; + /* New phase, so need to update cwnd and pacing rate. */ + bbr->try_fast_path = 0; +} + +/* Send at estimated bw to fill the pipe, but not queue. We need this phase + * before PROBE_UP, because as soon as we send faster than the available bw + * we will start building a queue, and if the buffer is shallow we can cause + * loss. If we do not fill the pipe before we cause this loss, our bw_hi and + * inflight_hi estimates will underestimate. + */ +static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_lower_bounds(sk); + if (bbr->inflight_hi != ~0U) + bbr->inflight_hi += bbr->params.refill_add_inc; + bbr->bw_probe_up_rounds = bw_probe_up_rounds; + bbr->bw_probe_up_acks = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_REFILLING; + bbr->next_rtt_delivered = tp->delivered; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); +} + +/* Now probe max deliverable data rate and volume. */ +static void bbr2_start_bw_probe_up(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->ack_phase = BBR_ACKS_PROBE_STARTING; + bbr->next_rtt_delivered = tp->delivered; + bbr->cycle_mstamp = tp->tcp_mstamp; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); + bbr2_raise_inflight_hi_slope(sk); +} + +/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall + * clock time at which to probe beyond an inflight that we think to be + * safe. This will knowingly risk packet loss, so we want to do this rarely, to + * keep packet loss rates low. Also start a round-trip counter, to probe faster + * if we estimate a Reno flow at our BDP would probe faster. + */ +static void bbr2_start_bw_probe_down(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_congestion_signals(sk); + bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ + bbr2_pick_probe_wait(sk); + bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); +} + +/* Cruise: maintain what we estimate to be a neutral, conservative + * operating point, without attempting to probe up for bandwidth or down for + * RTT, and only reducing inflight in response to loss/ECN signals. + */ +static void bbr2_start_bw_probe_cruise(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->inflight_lo != ~0U) + bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); + + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); +} + +/* Loss and/or ECN rate is too high while probing. + * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. 
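+ * For example, with the default beta of BBR_UNIT * 30 / 100 (~0.3), a
+ * non-app-limited sample cuts inflight_hi to roughly
+ * max(tx_in_flight, 0.7 * bbr2_target_inflight()), and if this happens in
+ * PROBE_UP we immediately move to PROBE_DOWN to drain the excess queue.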
+ */ +static void bbr2_handle_inflight_too_high(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + const u32 beta = bbr->params.beta; + + bbr->prev_probe_too_high = 1; + bbr->bw_probe_samples = 0; /* only react once per probe */ + bbr->debug.event = 'L'; /* Loss/ECN too high */ + /* If we are app-limited then we are not robustly + * probing the max volume of inflight data we think + * might be safe (analogous to how app-limited bw + * samples are not known to be robustly probing bw). + */ + if (!rs->is_app_limited) + bbr->inflight_hi = max_t(u32, rs->tx_in_flight, + (u64)bbr2_target_inflight(sk) * + (BBR_UNIT - beta) >> BBR_SCALE); + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr2_start_bw_probe_down(sk); +} + +/* If we're seeing bw and loss samples reflecting our bw probing, adapt + * using the signals we see. If loss or ECN mark rate gets too high, then adapt + * inflight_hi downward. If we're able to push inflight higher without such + * signals, push higher: adapt inflight_hi upward. + */ +static bool bbr2_adapt_upper_bounds(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Track when we'll see bw/loss samples resulting from our bw probes. */ + if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) + bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; + if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { + /* End of samples from bw probing phase. */ + bbr->bw_probe_samples = 0; + bbr->ack_phase = BBR_ACKS_INIT; + /* At this point in the cycle, our current bw sample is also + * our best recent chance at finding the highest available bw + * for this flow. So now is the best time to forget the bw + * samples from the previous cycle, by advancing the window. + */ + if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) + bbr2_advance_bw_hi_filter(sk); + /* If we had an inflight_hi, then probed and pushed inflight all + * the way up to hit that inflight_hi without seeing any + * high loss/ECN in all the resulting ACKs from that probing, + * then probe up again, this time letting inflight persist at + * inflight_hi for a round trip, then accelerating beyond. + */ + if (bbr->mode == BBR_PROBE_BW && + bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { + bbr->debug.event = 'R'; /* reprobe */ + bbr2_start_bw_probe_refill(sk, 0); + return true; /* yes, decided state transition */ + } + } + + if (bbr2_is_inflight_too_high(sk, rs)) { + if (bbr->bw_probe_samples) /* sample is from bw probing? */ + bbr2_handle_inflight_too_high(sk, rs); + } else { + /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ + if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ + return false; + + /* To be resilient to random loss, we must raise inflight_hi + * if we observe in any phase that a higher level is safe. + */ + if (rs->tx_in_flight > bbr->inflight_hi) { + bbr->inflight_hi = rs->tx_in_flight; + bbr->debug.event = 'U'; /* raise up inflight_hi */ + } + + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr2_probe_inflight_hi_upward(sk, rs); + } + + return false; +} + +/* Check if it's time to probe for bandwidth now, and if so, kick it off. 
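+ *
+ * With the default parameters this triggers either after the randomized
+ * wall clock wait (bw_probe_base_us = 2 sec plus up to bw_probe_rand_us =
+ * 1 sec of jitter), or after roughly min(bw_probe_max_rounds = 63,
+ * BDP-in-packets) packet-timed round trips for Reno/CUBIC coexistence,
+ * whichever comes first. ECN-only congestion can trigger an earlier
+ * reprobe, but only when ecn_reprobe_gain is enabled.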
*/ +static bool bbr2_check_time_to_probe_bw(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 n; + + /* If we seem to be at an operating point where we are not seeing loss + * but we are seeing ECN marks, then when the ECN marks cease we reprobe + * quickly (in case a burst of cross-traffic has ceased and freed up bw, + * or in case we are sharing with multiplicatively probing traffic). + */ + if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && + bbr->ecn_in_cycle && !bbr->loss_in_cycle && + inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { + bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ + /* Calculate n so that when bbr2_raise_inflight_hi_slope() + * computes growth_this_round as 2^n it will be roughly the + * desired volume of data (inflight_hi*ecn_reprobe_gain). + */ + n = ilog2((((u64)bbr->inflight_hi * + bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); + bbr2_start_bw_probe_refill(sk, n); + return true; + } + + if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || + bbr2_is_reno_coexistence_probe_time(sk)) { + bbr2_start_bw_probe_refill(sk, 0); + return true; + } + return false; +} + +/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ +static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + bool is_under_bdp, is_long_enough; + + /* Always need to pull inflight down to leave headroom in queue. */ + if (inflight > bbr2_inflight_with_headroom(sk)) + return false; + + is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); + if (bbr->params.drain_to_target) + return is_under_bdp; + + is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); + return is_under_bdp || is_long_enough; +} + +/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ +static void bbr2_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + bool is_risky = false, is_queuing = false; + u32 inflight, bw; + + if (!bbr_full_bw_reached(sk)) + return; + + /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ + if (bbr2_adapt_upper_bounds(sk, rs)) + return; /* already decided state transition */ + + if (bbr->mode != BBR_PROBE_BW) + return; + + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); + bw = bbr_max_bw(sk); + + switch (bbr->cycle_idx) { + /* First we spend most of our time cruising with a pacing_gain of 1.0, + * which paces at the estimated bw, to try to fully use the pipe + * without building queue. If we encounter loss/ECN marks, we adapt + * by slowing down. + */ + case BBR_BW_PROBE_CRUISE: + if (bbr2_check_time_to_probe_bw(sk)) + return; /* already decided state transition */ + break; + + /* After cruising, when it's time to probe, we first "refill": we send + * at the estimated bw to fill the pipe, before probing higher and + * knowingly risking overflowing the bottleneck buffer (causing loss). + */ + case BBR_BW_PROBE_REFILL: + if (bbr->round_start) { + /* After one full round trip of sending in REFILL, we + * start to see bw samples reflecting our REFILL, which + * may be putting too much data in flight. + */ + bbr->bw_probe_samples = 1; + bbr2_start_bw_probe_up(sk); + } + break; + + /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to + * probe for bw. If we have not seen loss/ECN, we try to raise inflight + * to at least pacing_gain*BDP; note that this may take more than + * min_rtt if min_rtt is small (e.g. on a LAN). 
+ * + * We terminate PROBE_UP bandwidth probing upon any of the following: + * + * (1) We've pushed inflight up to hit the inflight_hi target set in the + * most recent previous bw probe phase. Thus we want to start + * draining the queue immediately because it's very likely the most + * recently sent packets will fill the queue and cause drops. + * (checked here) + * (2) We have probed for at least 1*min_rtt_us, and the + * estimated queue is high enough (inflight > 1.25 * estimated_bdp). + * (checked here) + * (3) Loss filter says loss rate is "too high". + * (checked in bbr_is_inflight_too_high()) + * (4) ECN filter says ECN mark rate is "too high". + * (checked in bbr_is_inflight_too_high()) + */ + case BBR_BW_PROBE_UP: + if (bbr->prev_probe_too_high && + inflight >= bbr->inflight_hi) { + bbr->stopped_risky_probe = 1; + is_risky = true; + bbr->debug.event = 'D'; /* D for danger */ + } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && + inflight >= + bbr_inflight(sk, bw, + bbr->params.bw_probe_pif_gain)) { + is_queuing = true; + bbr->debug.event = 'Q'; /* building Queue */ + } + if (is_risky || is_queuing) { + bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ + bbr2_start_bw_probe_down(sk); /* restart w/ down */ + } + break; + + /* After probing in PROBE_UP, we have usually accumulated some data in + * the bottleneck buffer (if bw probing didn't find more bw). We next + * enter PROBE_DOWN to try to drain any excess data from the queue. To + * do this, we use a pacing_gain < 1.0. We hold this pacing gain until + * our inflight is less then that target cruising point, which is the + * minimum of (a) the amount needed to leave headroom, and (b) the + * estimated BDP. Once inflight falls to match the target, we estimate + * the queue is drained; persisting would underutilize the pipe. + */ + case BBR_BW_PROBE_DOWN: + if (bbr2_check_time_to_probe_bw(sk)) + return; /* already decided state transition */ + if (bbr2_check_time_to_cruise(sk, inflight, bw)) + bbr2_start_bw_probe_cruise(sk); + break; + + default: + WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); + } +} + +/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ +static void bbr2_exit_probe_rtt(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_lower_bounds(sk); + if (bbr_full_bw_reached(sk)) { + bbr->mode = BBR_PROBE_BW; + /* Raising inflight after PROBE_RTT may cause loss, so reset + * the PROBE_BW clock and schedule the next bandwidth probe for + * a friendly and randomized future point in time. + */ + bbr2_start_bw_probe_down(sk); + /* Since we are exiting PROBE_RTT, we know inflight is + * below our estimated BDP, so it is reasonable to cruise. + */ + bbr2_start_bw_probe_cruise(sk); + } else { + bbr->mode = BBR_STARTUP; + } +} + +/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until + * the end of the round in recovery to get a good estimate of how many packets + * have been lost, and how many we need to drain with a low pacing rate. + */ +static void bbr2_check_loss_too_high_in_startup(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk)) + return; + + /* For STARTUP exit, check the loss rate at the end of each round trip + * of Recovery episodes in STARTUP. We check the loss rate at the end + * of the round trip to filter out noisy/low loss and have a better + * sense of inflight (extent of loss), so we can drain more accurately. 
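+ *
+ * For example, with the defaults full_loss_cnt = 8 and loss_thresh of
+ * roughly 2%, a round trip spent in Recovery with at least 8 loss marking
+ * events and a loss rate above ~2% of tx_in_flight ends STARTUP and caps
+ * inflight_hi at the estimated BDP.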
+ */ + if (rs->losses && bbr->loss_events_in_round < 0xf) + bbr->loss_events_in_round++; /* update saturating counter */ + if (bbr->params.full_loss_cnt && bbr->loss_round_start && + inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && + bbr->loss_events_in_round >= bbr->params.full_loss_cnt && + bbr2_is_inflight_too_high(sk, rs)) { + bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ + bbr2_handle_queue_too_high_in_startup(sk); + return; + } + if (bbr->loss_round_start) + bbr->loss_events_in_round = 0; +} + +/* If we are done draining, advance into steady state operation in PROBE_BW. */ +static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_check_drain(sk, rs, ctx)) { + bbr->mode = BBR_PROBE_BW; + bbr2_start_bw_probe_down(sk); + } +} + +static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + bbr2_update_congestion_signals(sk, rs, ctx); + bbr_update_ack_aggregation(sk, rs); + bbr2_check_loss_too_high_in_startup(sk, rs); + bbr_check_full_bw_reached(sk, rs); + bbr2_check_drain(sk, rs, ctx); + bbr2_update_cycle_phase(sk, rs); + bbr_update_min_rtt(sk, rs); +} + +/* Fast path for app-limited case. + * + * On each ack, we execute bbr state machine, which primarily consists of: + * 1) update model based on new rate sample, and + * 2) update control based on updated model or state change. + * + * There are certain workload/scenarios, e.g. app-limited case, where + * either we can skip updating model or we can skip update of both model + * as well as control. This provides signifcant softirq cpu savings for + * processing incoming acks. + * + * In case of app-limited, if there is no congestion (loss/ecn) and + * if observed bw sample is less than current estimated bw, then we can + * skip some of the computation in bbr state processing: + * + * - if there is no rtt/mode/phase change: In this case, since all the + * parameters of the network model are constant, we can skip model + * as well control update. + * + * - else we can skip rest of the model update. But we still need to + * update the control to account for the new rtt/mode/phase. + * + * Returns whether we can take fast path or not. 
+ */ +static bool bbr2_fast_path(struct sock *sk, bool *update_model, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 prev_min_rtt_us, prev_mode; + + if (bbr->params.fast_path && bbr->try_fast_path && + rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && + !bbr->loss_in_round && !bbr->ecn_in_round) { + prev_mode = bbr->mode; + prev_min_rtt_us = bbr->min_rtt_us; + bbr2_check_drain(sk, rs, ctx); + bbr2_update_cycle_phase(sk, rs); + bbr_update_min_rtt(sk, rs); + + if (bbr->mode == prev_mode && + bbr->min_rtt_us == prev_min_rtt_us && + bbr->try_fast_path) + return true; + + /* Skip model update, but control still needs to be updated */ + *update_model = false; + } + return false; +} + +static void bbr2_main(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct bbr_context ctx = { 0 }; + bool update_model = true; + u32 bw; + int ce_ratio = -1; + + bbr->debug.event = '.'; /* init to default NOP (no event yet) */ + + bbr_update_round_start(sk, rs, &ctx); + if (bbr->round_start) { + bbr->rounds_since_probe = + min_t(s32, bbr->rounds_since_probe + 1, 0xFF); + ce_ratio = bbr2_update_ecn_alpha(sk); + tcp_plb_update_state(sk, &bbr->plb, ce_ratio); + tcp_plb_check_rehash(sk, &bbr->plb); + } + + bbr->ecn_in_round |= rs->is_ece; + bbr_calculate_bw_sample(sk, rs, &ctx); + + if (bbr2_fast_path(sk, &update_model, rs, &ctx)) + goto out; + + if (update_model) + bbr2_update_model(sk, rs, &ctx); + + bbr_update_gains(sk); + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, + tp->snd_cwnd, &ctx); + bbr2_bound_cwnd_for_inflight_model(sk); + +out: + bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; + bbr->loss_in_cycle |= rs->lost > 0; + bbr->ecn_in_cycle |= rs->delivered_ce > 0; + + bbr_debug(sk, rs->acked_sacked, rs, &ctx); +} + +/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared + * down here, so that the algorithm functions that use the parameters must use + * the per-socket parameters; if they accidentally use the global version + * then there will be a compile error. + * TODO(ncardwell): move all per-socket parameters down to this section. + */ + +/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. + * No loss response when 0. Max allwed value is 255. + */ +static u32 bbr_beta = BBR_UNIT * 30 / 100; + +/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. + * Max allowed value is 255. + */ +static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ + +/* The initial value for the ecn_alpha state variable. Default and max + * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly + * to congestion if the bottleneck is congested when the flow starts up. + */ +static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ + +/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. + * No ECN based bounding when 0. Max allwed value is 255. + */ +static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ + +/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. + * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. + */ +static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ + +/* Max RTT (in usec) at which to use sender-side ECN logic. + * Disabled when 0 (ECN allowed at any RTT). 
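+ * The default of 5000 us restricts sender-side ECN reaction to paths with
+ * a min_rtt of at most ~5 ms (e.g. datacenter-scale RTTs).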
+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. + */ +static u32 bbr_ecn_max_rtt_us = 5000; + +/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN + * clears then use a multiplicative increase to quickly reprobe bw by + * starting inflight probing at the given multiple of inflight_hi. + * Default for this experimental knob is 0 (disabled). + * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. + */ +static u32 bbr_ecn_reprobe_gain; + +/* Estimate bw probing has gone too far if loss rate exceeds this level. */ +static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ + +/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, + * and loss rate is higher than bbr_loss_thresh. + * Disabled if 0. Max allowed value is 15 (0xF). + */ +static u32 bbr_full_loss_cnt = 8; + +/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh + * meets this count. Max allowed value is 3. + */ +static u32 bbr_full_ecn_cnt = 2; + +/* Fraction of unutilized headroom to try to leave in path upon high loss. */ +static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; + +/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. + * Default is 1.25x, as in BBR v1. Max allowed is 511. + */ +static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; + +/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. + * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. + * Max allowed is 511. + */ +static u32 bbr_bw_probe_reno_gain = BBR_UNIT; + +/* Max number of packet-timed rounds to wait before probing for bandwidth. If + * we want to tolerate 1% random loss per round, and not have this cut our + * inflight too much, we must probe for bw periodically on roughly this scale. + * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. + * We aim to be fair with Reno/CUBIC up to a BDP of at least: + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + */ +static u32 bbr_bw_probe_max_rounds = 63; + +/* Max amount of randomness to inject in round counting for Reno-coexistence. + * Max value is 15. + */ +static u32 bbr_bw_probe_rand_rounds = 2; + +/* Use BBR-native probe time scale starting at this many usec. + * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: + * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs + */ +static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ + +/* Use BBR-native probes spread over this many usec: */ +static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ + +/* Undo the model changes made in loss recovery if recovery was spurious? */ +static bool bbr_undo = true; + +/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ +static bool bbr_fast_path = true; /* default: enabled */ + +/* Use fast ack mode ? */ +static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ + +/* How much to additively increase inflight_hi when entering REFILL? 
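+ * Disabled when 0. Max allowed value is 3.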
*/ +static u32 bbr_refill_add_inc; /* default: disabled */ + +module_param_named(beta, bbr_beta, uint, 0644); +module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); +module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); +module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); +module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); +module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); +module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); +module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); +module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); +module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); +module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); +module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); +module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); +module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); +module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); +module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); +module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); +module_param_named(undo, bbr_undo, bool, 0664); +module_param_named(fast_path, bbr_fast_path, bool, 0664); +module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); +module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); + +static void bbr2_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + const struct net *net = sock_net(sk); + + bbr_init(sk); /* run shared init code for v1 and v2 */ + + /* BBR v2 parameters: */ + bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); + bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); + bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); + bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); + bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); + bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); + bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); + bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); + bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); + bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); + bbr->params.inflight_headroom = + min_t(u32, 0xFFU, bbr_inflight_headroom); + bbr->params.bw_probe_pif_gain = + min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); + bbr->params.bw_probe_reno_gain = + min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); + bbr->params.bw_probe_max_rounds = + min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); + bbr->params.bw_probe_rand_rounds = + min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); + bbr->params.bw_probe_base_us = + min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); + bbr->params.bw_probe_rand_us = + min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); + bbr->params.undo = bbr_undo; + bbr->params.fast_path = bbr_fast_path ? 
1 : 0; + bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); + + /* BBR v2 state: */ + bbr->initialized = 1; + /* Start sampling ECN mark rate after first full flight is ACKed: */ + bbr->loss_round_delivered = tp->delivered + 1; + bbr->loss_round_start = 0; + bbr->undo_bw_lo = 0; + bbr->undo_inflight_lo = 0; + bbr->undo_inflight_hi = 0; + bbr->loss_events_in_round = 0; + bbr->startup_ecn_rounds = 0; + bbr2_reset_congestion_signals(sk); + bbr->bw_lo = ~0U; + bbr->bw_hi[0] = 0; + bbr->bw_hi[1] = 0; + bbr->inflight_lo = ~0U; + bbr->inflight_hi = ~0U; + bbr->bw_probe_up_cnt = ~0U; + bbr->bw_probe_up_acks = 0; + bbr->bw_probe_up_rounds = 0; + bbr->probe_wait_us = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_INIT; + bbr->rounds_since_probe = 0; + bbr->bw_probe_samples = 0; + bbr->prev_probe_too_high = 0; + bbr->ecn_eligible = 0; + bbr->ecn_alpha = bbr->params.ecn_alpha_init; + bbr->alpha_last_delivered = 0; + bbr->alpha_last_delivered_ce = 0; + + bbr->plb.enabled = 0; + bbr->plb.consec_cong_rounds = 0; + bbr->plb.pause_until = 0; + if ((tp->ecn_flags & TCP_ECN_OK) && + net->ipv4.sysctl_tcp_plb_enabled) + bbr->plb.enabled = 1; + + tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); + + if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) + tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; +} + +/* Core TCP stack informs us that the given skb was just marked lost. */ +static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct rate_sample rs; + + /* Capture "current" data over the full round trip of loss, + * to have a better chance to see the full capacity of the path. + */ + if (!bbr->loss_in_round) /* first loss in this round trip? */ + bbr->loss_round_delivered = tp->delivered; /* set round trip */ + bbr->loss_in_round = 1; + bbr->loss_in_cycle = 1; + + if (!bbr->bw_probe_samples) + return; /* not an skb sent while probing for bandwidth */ + if (unlikely(!scb->tx.delivered_mstamp)) + return; /* skb was SACKed, reneged, marked lost; ignore it */ + /* We are probing for bandwidth. Construct a rate sample that + * estimates what happened in the flight leading up to this lost skb, + * then see if the loss rate went too high, and if so at which packet. + */ + memset(&rs, 0, sizeof(rs)); + rs.tx_in_flight = scb->tx.in_flight; + rs.lost = tp->lost - scb->tx.lost; + rs.is_app_limited = scb->tx.is_app_limited; + if (bbr2_is_inflight_too_high(sk, &rs)) { + rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); + bbr2_handle_inflight_too_high(sk, &rs); + } +} + +/* Revert short-term model if current loss recovery event was spurious. */ +static u32 bbr2_undo_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->debug.undo = 1; + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; + bbr->loss_in_round = 0; + + if (!bbr->params.undo) + return tp->snd_cwnd; + + /* Revert to cwnd and other state saved before loss episode. */ + bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); + bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); + bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); + return bbr->prior_cwnd; +} + +/* Entering loss recovery, so save state for when we undo recovery. 
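+ * If the recovery episode later proves spurious, bbr2_undo_cwnd() reverts
+ * to the bw_lo/inflight_lo/inflight_hi snapshots taken here (when the
+ * 'undo' module parameter is enabled) and to the cwnd saved by
+ * bbr_save_cwnd().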
*/ +static u32 bbr2_ssthresh(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr_save_cwnd(sk); + /* For undo, save state that adapts based on loss signal. */ + bbr->undo_bw_lo = bbr->bw_lo; + bbr->undo_inflight_lo = bbr->inflight_lo; + bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; +} + +static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) +{ + switch (bbr->mode) { + case BBR_STARTUP: + return BBR2_PHASE_STARTUP; + case BBR_DRAIN: + return BBR2_PHASE_DRAIN; + case BBR_PROBE_BW: + break; + case BBR_PROBE_RTT: + return BBR2_PHASE_PROBE_RTT; + default: + return BBR2_PHASE_INVALID; + } + switch (bbr->cycle_idx) { + case BBR_BW_PROBE_UP: + return BBR2_PHASE_PROBE_BW_UP; + case BBR_BW_PROBE_DOWN: + return BBR2_PHASE_PROBE_BW_DOWN; + case BBR_BW_PROBE_CRUISE: + return BBR2_PHASE_PROBE_BW_CRUISE; + case BBR_BW_PROBE_REFILL: + return BBR2_PHASE_PROBE_BW_REFILL; + default: + return BBR2_PHASE_INVALID; + } +} + +static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); + u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); + u64 bw_lo = bbr->bw_lo == ~0U ? + ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); + + memset(&info->bbr2, 0, sizeof(info->bbr2)); + info->bbr2.bbr_bw_lsb = (u32)bw; + info->bbr2.bbr_bw_msb = (u32)(bw >> 32); + info->bbr2.bbr_min_rtt = bbr->min_rtt_us; + info->bbr2.bbr_pacing_gain = bbr->pacing_gain; + info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; + info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; + info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); + info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; + info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); + info->bbr2.bbr_mode = bbr->mode; + info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); + info->bbr2.bbr_version = (__u8)2; + info->bbr2.bbr_inflight_lo = bbr->inflight_lo; + info->bbr2.bbr_inflight_hi = bbr->inflight_hi; + info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr2); + } + return 0; +} + +static void bbr2_set_state(struct sock *sk, u8 new_state) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + struct bbr_context ctx = { 0 }; + + tcp_plb_update_state_upon_rto(sk, &bbr->plb); + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + /* bbr_adapt_lower_bounds() needs cwnd before + * we suffered an RTO, to update inflight_lo: + */ + bbr->inflight_lo = + max(tp->snd_cwnd, bbr->prior_cwnd); + } + bbr_debug(sk, 0, &rs, &ctx); + } else if (bbr->prev_ca_state == TCP_CA_Loss && + new_state != TCP_CA_Loss) { + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr->try_fast_path = 0; /* bound cwnd using latest model */ + } +} + +static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr2", + .owner = THIS_MODULE, + .init = bbr2_init, + .cong_control = bbr2_main, + .sndbuf_expand = bbr_sndbuf_expand, + .skb_marked_lost = bbr2_skb_marked_lost, + .undo_cwnd = bbr2_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr2_ssthresh, + .tso_segs = bbr_tso_segs, + .get_info = bbr2_get_info, + .set_state = bbr2_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct 
bbr) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_bbr2_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); +} + +module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson "); +MODULE_AUTHOR("Neal Cardwell "); +MODULE_AUTHOR("Yuchung Cheng "); +MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_AUTHOR("Priyaranjan Jha "); +MODULE_AUTHOR("Yousuk Seung "); +MODULE_AUTHOR("Kevin Yang "); +MODULE_AUTHOR("Arjun Roy "); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index db5831e6c136a..153ed9010c0c2 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -179,6 +179,7 @@ void tcp_init_congestion_control(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; + tcp_sk(sk)->fast_ack_mode = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6dfbb88dcf5b..14c3dd92cd4ff 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -348,7 +348,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { @@ -359,7 +359,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; @@ -1062,7 +1062,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { + struct sock *sk = (struct sock *)tp; + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tp->lost += tcp_skb_pcount(skb); + if (ca_ops->skb_marked_lost) + ca_ops->skb_marked_lost(sk, skb); } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) @@ -1443,6 +1448,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); + /* Adjust tx.in_flight as pcount is shifted from skb to prev. 
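+	 * tx.in_flight records the flight size at the time the skb was
+	 * sent (see tcp_set_tx_in_flight()), so when pcount segments move
+	 * from skb to prev we move their share of that accounting as well.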
*/ + if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, + "prev in_flight: %u skb in_flight: %u pcount: %u", + TCP_SKB_CB(prev)->tx.in_flight, + TCP_SKB_CB(skb)->tx.in_flight, + pcount)) + TCP_SKB_CB(skb)->tx.in_flight = 0; + else + TCP_SKB_CB(skb)->tx.in_flight -= pcount; + TCP_SKB_CB(prev)->tx.in_flight += pcount; + /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep @@ -3219,7 +3235,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, long seq_rtt_us = -1L; long ca_rtt_us = -1L; u32 pkts_acked = 0; - u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3255,7 +3270,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!first_ackt) first_ackt = last_ackt; - last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; if (before(start_seq, reord)) reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) @@ -3321,8 +3335,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); - if (pkts_acked == 1 && last_in_flight < tp->mss_cache && - last_in_flight && !prior_sacked && fully_acked && + if (pkts_acked == 1 && fully_acked && !prior_sacked && + (tp->snd_una - prior_snd_una) < tp->mss_cache && sack->rate->prior_delivered + 1 == tp->delivered && !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { /* Conservatively mark a delayed ACK. It's typically @@ -3379,9 +3393,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = sack->rate->rtt_us, - .in_flight = last_in_flight }; + .rtt_us = sack->rate->rtt_us }; + sample.in_flight = tp->mss_cache * + (tp->delivered - sack->rate->prior_delivered); icsk->icsk_ca_ops->pkts_acked(sk, &sample); } @@ -3787,6 +3802,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); + tcp_rate_check_app_limited(sk); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3885,6 +3901,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); + rs.is_ece = !!(flag & FLAG_ECE); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); @@ -5459,13 +5476,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && + (tp->fast_ack_mode == 1 || /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ - (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || - __tcp_select_window(sk) >= tp->rcv_wnd)) || + (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || + __tcp_select_window(sk) >= tp->rcv_wnd))) || /* We ACK each frame or... 
*/ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b40780fde7915..c3948ea605621 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2006,13 +2006,21 @@ int tcp_v4_rcv(struct sk_buff *skb) goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ + sock_hold(sk); } - /* We own a reference on the listener, increase it again - * as we might lose it too soon. - */ - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { @@ -2921,6 +2929,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Set default values for PLB */ + net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ + net->ipv4.sysctl_tcp_plb_cong_thresh = 128; /* 50% congestion */ + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; + net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; + net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; + /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 01e27620b7ee5..6dababd60215c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -781,8 +781,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, goto listen_overflow; if (own_req && rsk_drop_req(req)) { - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_drop_and_put(sk, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } @@ -792,6 +792,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + if (sk != req->rsk_listener) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 86e896351364e..932d0f8c2f246 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else if (!tcp_ca_needs_ecn(sk)) { + } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && + !tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } @@ -1255,8 +1256,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (clone_it) { - TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - - tp->snd_una; oskb = skb; tcp_skb_tsorted_save(oskb) { @@ -1533,7 +1532,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; - int nsize, old_factor; + int nsize, old_factor, inflight_prev; long limit; int nlen; 
u8 flags; @@ -1611,6 +1610,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (diff) tcp_adjust_pcount(sk, skb, diff); + + /* Set buff tx.in_flight as if buff were sent by itself. */ + inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; + if (WARN_ONCE(inflight_prev < 0, + "inconsistent: tx.in_flight: %u old_factor: %d", + TCP_SKB_CB(skb)->tx.in_flight, old_factor)) + inflight_prev = 0; + TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + + tcp_skb_pcount(buff); } /* Link BUFF into the send queue. */ @@ -1985,13 +1993,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - u32 min_tso, tso_segs; - - min_tso = ca_ops->min_tso_segs ? - ca_ops->min_tso_segs(sk) : - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + u32 tso_segs; - tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); + tso_segs = ca_ops->tso_segs ? + ca_ops->tso_segs(sk, mss_now) : + tcp_tso_autosize(sk, mss_now, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs)); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } @@ -2629,6 +2636,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); + tcp_set_tx_in_flight(sk, skb); goto repair; /* Skip network transmission */ } diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c new file mode 100644 index 0000000000000..71b02c0404cea --- /dev/null +++ b/net/ipv4/tcp_plb.c @@ -0,0 +1,100 @@ +/* Protective Load Balancing (PLB) + * + * PLB was designed to reduce link load imbalance across datacenter + * switches. PLB is a host-based optimization; it leverages congestion + * signals from the transport layer to randomly change the path of the + * connection experiencing sustained congestion. PLB prefers to repath + * after idle periods to minimize packet reordering. It repaths by + * changing the IPv6 Flow Label on the packets of a connection, which + * datacenter switches include as part of ECMP/WCMP hashing. + * + * PLB is described in detail in: + * + * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, + * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, + * David Wetherall,Abdul Kabbani: + * "PLB: Congestion Signals are Simple and Effective for + * Network Load Balancing" + * In ACM SIGCOMM 2022, Amsterdam Netherlands. + * + */ + +#include + +/* Called once per round-trip to update PLB state for a connection. */ +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio) +{ + struct net *net = sock_net(sk); + + if (!plb->enabled) + return; + + if (cong_ratio >= 0) { + if (cong_ratio < net->ipv4.sysctl_tcp_plb_cong_thresh) + plb->consec_cong_rounds = 0; + else if (plb->consec_cong_rounds < + net->ipv4.sysctl_tcp_plb_rehash_rounds) + plb->consec_cong_rounds++; + } +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state); + +/* Check whether recent congestion has been persistent enough to warrant + * a load balancing decision that switches the connection to another path. 
+ */ +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + bool can_idle_rehash, can_force_rehash; + + if (!plb->enabled) + return; + + /* Note that tcp_jiffies32 can wrap, so we clear pause_until + * to 0 to indicate there is no recent RTO event that constrains + * PLB rehashing. + */ + if (plb->pause_until && + !before(tcp_jiffies32, plb->pause_until)) + plb->pause_until = 0; + + can_idle_rehash = net->ipv4.sysctl_tcp_plb_idle_rehash_rounds && + !tcp_sk(sk)->packets_out && + plb->consec_cong_rounds >= + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds; + can_force_rehash = plb->consec_cong_rounds >= + net->ipv4.sysctl_tcp_plb_rehash_rounds; + + if (!plb->pause_until && (can_idle_rehash || can_force_rehash)) { + sk_rethink_txhash(sk); + plb->consec_cong_rounds = 0; + tcp_sk(sk)->ecn_rehash++; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPECNREHASH); + } +} +EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); + +/* Upon RTO, disallow load balancing for a while, to avoid having load + * balancing decisions switch traffic to a black-holed path that was + * previously avoided with a sk_rethink_txhash() call at RTO time. + */ +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 pause; + + if (!plb->enabled) + return; + + pause = net->ipv4.sysctl_tcp_plb_suspend_rto_sec * HZ; + pause += prandom_u32_max(pause); + plb->pause_until = tcp_jiffies32 + pause; + + /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call + * that may switch this connection to a path with completely different + * congestion characteristics. + */ + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 6ab197928abbc..de9d4cc29722d 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ * ready to send in the write queue. */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 in_flight; + + /* Check, sanitize, and record packets in flight after skb was sent. */ + in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); + if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, + "insane in_flight %u cc %s mss %u " + "cwnd %u pif %u %u %u %u\n", + in_flight, inet_csk(sk)->icsk_ca_ops->name, + tp->mss_cache, tp->snd_cwnd, + tp->packets_out, tp->retrans_out, + tp->sacked_out, tp->lost_out)) + in_flight = TCPCB_IN_FLIGHT_MAX; + TCP_SKB_CB(skb)->tx.in_flight = in_flight; +} + /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -65,7 +83,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.lost = tp->lost; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; + tcp_set_tx_in_flight(sk, skb); } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -90,17 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, scb->end_seq, rs->last_end_seq)) { + rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; rs->last_end_seq = scb->end_seq; + rs->tx_in_flight = scb->tx.in_flight; /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tx_tstamp; /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, - scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp32_us_delta( + tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); } /* Mark off the skb delivered once it's sacked to avoid being @@ -142,6 +167,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, return; } rs->delivered = tp->delivered - rs->prior_delivered; + rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; /* Model sending data and receiving ACKs as separate pipeline phases * for a window. Usually the ACK phase is longer, but with ACK @@ -149,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 715fdfa3e2ae9..35b7792669c40 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -611,6 +611,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } + tcp_rate_check_app_limited(sk); tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b4a5e01e12016..4a6aad4b46fca 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -35,8 +35,14 @@ u32 inet6_ehashfn(const struct net *net, net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; - fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); + lhash = jhash_3words((__force u32)laddr->s6_addr32[3], + (((u32)lport) << 16) | (__force u32)fport, + (__force u32)faddr->s6_addr32[0], + ipv6_hash_secret); + fhash = jhash_3words((__force u32)faddr->s6_addr32[1], + (__force u32)faddr->s6_addr32[2], + (__force u32)faddr->s6_addr32[3], + ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 79d6f6ea3c546..23253155e726a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1663,10 +1663,18 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + 
sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + sock_hold(sk); } - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 19653b8784bbc..bbdb1371d0c01 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3306,6 +3306,36 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) return 0; } +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; + const struct nft_data *data; + int err; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; + default: + break; + } + + return 0; +} + static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c index 480b763142b34..765b131c73190 100644 --- a/samples/nitro_enclaves/ne_ioctl_sample.c +++ b/samples/nitro_enclaves/ne_ioctl_sample.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ /** @@ -185,7 +185,6 @@ static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd) return 0; } - /** * ne_poll_enclave_fd() - Thread function for polling the enclave fd. * @data: Argument provided for the polling function. @@ -560,8 +559,8 @@ static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id) default: printf("Error in add vcpu [%m]\n"); - } + return rc; } @@ -638,7 +637,7 @@ static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *encla } /** - * ne_start_enclave_check_booted() - Start the enclave and wait for a hearbeat + * ne_start_enclave_check_booted() - Start the enclave and wait for a heartbeat * from it, on a newly created vsock channel, * to check it has booted. * @enclave_fd : The file descriptor associated with the enclave. 
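The following is an illustrative aside, not part of the patch. The tcp_sk_init() hunk in net/ipv4/tcp_ipv4.c above sets per-netns defaults for the new PLB knobs (sysctl_tcp_plb_enabled, sysctl_tcp_plb_cong_thresh, sysctl_tcp_plb_idle_rehash_rounds, sysctl_tcp_plb_rehash_rounds, sysctl_tcp_plb_suspend_rto_sec) that tcp_plb.c consumes. Assuming these fields are exposed through the usual net.ipv4 sysctl table, so that they surface as /proc/sys/net/ipv4/tcp_plb_* (the table registration is not visible in this excerpt), a minimal user-space sketch for checking the values on a running kernel could look like the program below. The file names are inferred from the per-netns field names and should be treated as an assumption.

/* Illustrative sketch only: print the PLB tunables, if present.
 * Assumes /proc/sys/net/ipv4/tcp_plb_* paths derived from the
 * sysctl_tcp_plb_* per-netns fields; adjust if the real sysctl
 * table registered elsewhere in the series differs.
 */
#include <stdio.h>

int main(void)
{
	static const char * const knobs[] = {
		"tcp_plb_enabled",		/* 0 = disabled, per the tcp_sk_init() default */
		"tcp_plb_cong_thresh",		/* default 128, "50% congestion" per tcp_sk_init() */
		"tcp_plb_idle_rehash_rounds",
		"tcp_plb_rehash_rounds",
		"tcp_plb_suspend_rto_sec",
	};
	char path[128], val[64];

	for (unsigned int i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
		snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", knobs[i]);
		FILE *f = fopen(path, "r");

		if (!f) {
			printf("%-28s (not present on this kernel)\n", knobs[i]);
			continue;
		}
		if (fgets(val, sizeof(val), f))
			printf("%-28s %s", knobs[i], val);	/* value keeps its trailing newline */
		fclose(f);
	}
	return 0;
}

On a kernel carrying this series, running the sketch would simply list each knob and its current value; on older kernels every entry prints as not present.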
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index acd07a70a2f4e..7f6d67b02984f 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -64,7 +64,7 @@ objtool_link() local objtoolopt; if [ -n "${CONFIG_VMLINUX_VALIDATION}" ]; then - objtoolopt="check" + objtoolopt="check --vmlinux --noinstr" if [ -n "${CONFIG_CPU_UNRET_ENTRY}" ]; then objtoolopt="${objtoolopt} --unret" fi diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 788667d582ae5..e5971fa74fd74 100644 --- a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -12,6 +12,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { #include #undef LSM_HOOK LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free), + LSM_HOOK_INIT(task_free, bpf_task_storage_free), }; static int __init bpf_lsm_init(void) @@ -23,6 +24,7 @@ static int __init bpf_lsm_init(void) struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { .lbs_inode = sizeof(struct bpf_storage_blob), + .lbs_task = sizeof(struct bpf_storage_blob), }; DEFINE_LSM(bpf) = { diff --git a/tools/arch/arm64/include/asm/insn.h b/tools/arch/arm64/include/asm/insn.h new file mode 100644 index 0000000000000..71de52d1532ff --- /dev/null +++ b/tools/arch/arm64/include/asm/insn.h @@ -0,0 +1,565 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu + * + * Copyright (C) 2014 Zi Shen Lim + */ +#ifndef __ASM_INSN_H +#define __ASM_INSN_H +#include +#include + +/* A64 instructions are always 32 bits. */ +#define AARCH64_INSN_SIZE 4 + +#ifndef __ASSEMBLY__ +/* + * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a + * Section C3.1 "A64 instruction index by encoding": + * AArch64 main encoding table + * Bit position + * 28 27 26 25 Encoding Group + * 0 0 - - Unallocated + * 1 0 0 - Data processing, immediate + * 1 0 1 - Branch, exception generation and system instructions + * - 1 - 0 Loads and stores + * - 1 0 1 Data processing - register + * 0 1 1 1 Data processing - SIMD and floating point + * 1 1 1 1 Data processing - SIMD and floating point + * "-" means "don't care" + */ +enum aarch64_insn_encoding_class { + AARCH64_INSN_CLS_UNKNOWN, /* UNALLOCATED */ + AARCH64_INSN_CLS_SVE, /* SVE instructions */ + AARCH64_INSN_CLS_DP_IMM, /* Data processing - immediate */ + AARCH64_INSN_CLS_DP_REG, /* Data processing - register */ + AARCH64_INSN_CLS_DP_FPSIMD, /* Data processing - SIMD and FP */ + AARCH64_INSN_CLS_LDST, /* Loads and stores */ + AARCH64_INSN_CLS_BR_SYS, /* Branch, exception generation and + * system instructions */ +}; + +enum aarch64_insn_hint_cr_op { + AARCH64_INSN_HINT_NOP = 0x0 << 5, + AARCH64_INSN_HINT_YIELD = 0x1 << 5, + AARCH64_INSN_HINT_WFE = 0x2 << 5, + AARCH64_INSN_HINT_WFI = 0x3 << 5, + AARCH64_INSN_HINT_SEV = 0x4 << 5, + AARCH64_INSN_HINT_SEVL = 0x5 << 5, + + AARCH64_INSN_HINT_XPACLRI = 0x07 << 5, + AARCH64_INSN_HINT_PACIA_1716 = 0x08 << 5, + AARCH64_INSN_HINT_PACIB_1716 = 0x0A << 5, + AARCH64_INSN_HINT_AUTIA_1716 = 0x0C << 5, + AARCH64_INSN_HINT_AUTIB_1716 = 0x0E << 5, + AARCH64_INSN_HINT_PACIAZ = 0x18 << 5, + AARCH64_INSN_HINT_PACIASP = 0x19 << 5, + AARCH64_INSN_HINT_PACIBZ = 0x1A << 5, + AARCH64_INSN_HINT_PACIBSP = 0x1B << 5, + AARCH64_INSN_HINT_AUTIAZ = 0x1C << 5, + AARCH64_INSN_HINT_AUTIASP = 0x1D << 5, + AARCH64_INSN_HINT_AUTIBZ = 0x1E << 5, + AARCH64_INSN_HINT_AUTIBSP = 0x1F << 5, + + AARCH64_INSN_HINT_ESB = 0x10 << 5, + AARCH64_INSN_HINT_PSB = 0x11 << 5, + AARCH64_INSN_HINT_TSB = 0x12 << 5, + AARCH64_INSN_HINT_CSDB = 0x14 << 5, + + 
AARCH64_INSN_HINT_BTI = 0x20 << 5, + AARCH64_INSN_HINT_BTIC = 0x22 << 5, + AARCH64_INSN_HINT_BTIJ = 0x24 << 5, + AARCH64_INSN_HINT_BTIJC = 0x26 << 5, +}; + +enum aarch64_insn_imm_type { + AARCH64_INSN_IMM_ADR, + AARCH64_INSN_IMM_26, + AARCH64_INSN_IMM_19, + AARCH64_INSN_IMM_16, + AARCH64_INSN_IMM_14, + AARCH64_INSN_IMM_12, + AARCH64_INSN_IMM_9, + AARCH64_INSN_IMM_7, + AARCH64_INSN_IMM_6, + AARCH64_INSN_IMM_S, + AARCH64_INSN_IMM_R, + AARCH64_INSN_IMM_N, + AARCH64_INSN_IMM_MAX +}; + +enum aarch64_insn_register_type { + AARCH64_INSN_REGTYPE_RT, + AARCH64_INSN_REGTYPE_RN, + AARCH64_INSN_REGTYPE_RT2, + AARCH64_INSN_REGTYPE_RM, + AARCH64_INSN_REGTYPE_RD, + AARCH64_INSN_REGTYPE_RA, + AARCH64_INSN_REGTYPE_RS, +}; + +enum aarch64_insn_register { + AARCH64_INSN_REG_0 = 0, + AARCH64_INSN_REG_1 = 1, + AARCH64_INSN_REG_2 = 2, + AARCH64_INSN_REG_3 = 3, + AARCH64_INSN_REG_4 = 4, + AARCH64_INSN_REG_5 = 5, + AARCH64_INSN_REG_6 = 6, + AARCH64_INSN_REG_7 = 7, + AARCH64_INSN_REG_8 = 8, + AARCH64_INSN_REG_9 = 9, + AARCH64_INSN_REG_10 = 10, + AARCH64_INSN_REG_11 = 11, + AARCH64_INSN_REG_12 = 12, + AARCH64_INSN_REG_13 = 13, + AARCH64_INSN_REG_14 = 14, + AARCH64_INSN_REG_15 = 15, + AARCH64_INSN_REG_16 = 16, + AARCH64_INSN_REG_17 = 17, + AARCH64_INSN_REG_18 = 18, + AARCH64_INSN_REG_19 = 19, + AARCH64_INSN_REG_20 = 20, + AARCH64_INSN_REG_21 = 21, + AARCH64_INSN_REG_22 = 22, + AARCH64_INSN_REG_23 = 23, + AARCH64_INSN_REG_24 = 24, + AARCH64_INSN_REG_25 = 25, + AARCH64_INSN_REG_26 = 26, + AARCH64_INSN_REG_27 = 27, + AARCH64_INSN_REG_28 = 28, + AARCH64_INSN_REG_29 = 29, + AARCH64_INSN_REG_FP = 29, /* Frame pointer */ + AARCH64_INSN_REG_30 = 30, + AARCH64_INSN_REG_LR = 30, /* Link register */ + AARCH64_INSN_REG_ZR = 31, /* Zero: as source register */ + AARCH64_INSN_REG_SP = 31 /* Stack pointer: as load/store base reg */ +}; + +enum aarch64_insn_special_register { + AARCH64_INSN_SPCLREG_SPSR_EL1 = 0xC200, + AARCH64_INSN_SPCLREG_ELR_EL1 = 0xC201, + AARCH64_INSN_SPCLREG_SP_EL0 = 0xC208, + AARCH64_INSN_SPCLREG_SPSEL = 0xC210, + AARCH64_INSN_SPCLREG_CURRENTEL = 0xC212, + AARCH64_INSN_SPCLREG_DAIF = 0xDA11, + AARCH64_INSN_SPCLREG_NZCV = 0xDA10, + AARCH64_INSN_SPCLREG_FPCR = 0xDA20, + AARCH64_INSN_SPCLREG_DSPSR_EL0 = 0xDA28, + AARCH64_INSN_SPCLREG_DLR_EL0 = 0xDA29, + AARCH64_INSN_SPCLREG_SPSR_EL2 = 0xE200, + AARCH64_INSN_SPCLREG_ELR_EL2 = 0xE201, + AARCH64_INSN_SPCLREG_SP_EL1 = 0xE208, + AARCH64_INSN_SPCLREG_SPSR_INQ = 0xE218, + AARCH64_INSN_SPCLREG_SPSR_ABT = 0xE219, + AARCH64_INSN_SPCLREG_SPSR_UND = 0xE21A, + AARCH64_INSN_SPCLREG_SPSR_FIQ = 0xE21B, + AARCH64_INSN_SPCLREG_SPSR_EL3 = 0xF200, + AARCH64_INSN_SPCLREG_ELR_EL3 = 0xF201, + AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210 +}; + +enum aarch64_insn_variant { + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_VARIANT_64BIT +}; + +enum aarch64_insn_condition { + AARCH64_INSN_COND_EQ = 0x0, /* == */ + AARCH64_INSN_COND_NE = 0x1, /* != */ + AARCH64_INSN_COND_CS = 0x2, /* unsigned >= */ + AARCH64_INSN_COND_CC = 0x3, /* unsigned < */ + AARCH64_INSN_COND_MI = 0x4, /* < 0 */ + AARCH64_INSN_COND_PL = 0x5, /* >= 0 */ + AARCH64_INSN_COND_VS = 0x6, /* overflow */ + AARCH64_INSN_COND_VC = 0x7, /* no overflow */ + AARCH64_INSN_COND_HI = 0x8, /* unsigned > */ + AARCH64_INSN_COND_LS = 0x9, /* unsigned <= */ + AARCH64_INSN_COND_GE = 0xa, /* signed >= */ + AARCH64_INSN_COND_LT = 0xb, /* signed < */ + AARCH64_INSN_COND_GT = 0xc, /* signed > */ + AARCH64_INSN_COND_LE = 0xd, /* signed <= */ + AARCH64_INSN_COND_AL = 0xe, /* always */ +}; + +enum aarch64_insn_branch_type { + AARCH64_INSN_BRANCH_NOLINK, + 
AARCH64_INSN_BRANCH_LINK, + AARCH64_INSN_BRANCH_RETURN, + AARCH64_INSN_BRANCH_COMP_ZERO, + AARCH64_INSN_BRANCH_COMP_NONZERO, +}; + +enum aarch64_insn_size_type { + AARCH64_INSN_SIZE_8, + AARCH64_INSN_SIZE_16, + AARCH64_INSN_SIZE_32, + AARCH64_INSN_SIZE_64, +}; + +enum aarch64_insn_ldst_type { + AARCH64_INSN_LDST_LOAD_REG_OFFSET, + AARCH64_INSN_LDST_STORE_REG_OFFSET, + AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX, + AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX, + AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX, + AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX, + AARCH64_INSN_LDST_LOAD_EX, + AARCH64_INSN_LDST_STORE_EX, +}; + +enum aarch64_insn_adsb_type { + AARCH64_INSN_ADSB_ADD, + AARCH64_INSN_ADSB_SUB, + AARCH64_INSN_ADSB_ADD_SETFLAGS, + AARCH64_INSN_ADSB_SUB_SETFLAGS +}; + +enum aarch64_insn_movewide_type { + AARCH64_INSN_MOVEWIDE_ZERO, + AARCH64_INSN_MOVEWIDE_KEEP, + AARCH64_INSN_MOVEWIDE_INVERSE +}; + +enum aarch64_insn_bitfield_type { + AARCH64_INSN_BITFIELD_MOVE, + AARCH64_INSN_BITFIELD_MOVE_UNSIGNED, + AARCH64_INSN_BITFIELD_MOVE_SIGNED +}; + +enum aarch64_insn_data1_type { + AARCH64_INSN_DATA1_REVERSE_16, + AARCH64_INSN_DATA1_REVERSE_32, + AARCH64_INSN_DATA1_REVERSE_64, +}; + +enum aarch64_insn_data2_type { + AARCH64_INSN_DATA2_UDIV, + AARCH64_INSN_DATA2_SDIV, + AARCH64_INSN_DATA2_LSLV, + AARCH64_INSN_DATA2_LSRV, + AARCH64_INSN_DATA2_ASRV, + AARCH64_INSN_DATA2_RORV, +}; + +enum aarch64_insn_data3_type { + AARCH64_INSN_DATA3_MADD, + AARCH64_INSN_DATA3_MSUB, +}; + +enum aarch64_insn_logic_type { + AARCH64_INSN_LOGIC_AND, + AARCH64_INSN_LOGIC_BIC, + AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_LOGIC_ORN, + AARCH64_INSN_LOGIC_EOR, + AARCH64_INSN_LOGIC_EON, + AARCH64_INSN_LOGIC_AND_SETFLAGS, + AARCH64_INSN_LOGIC_BIC_SETFLAGS +}; + +enum aarch64_insn_prfm_type { + AARCH64_INSN_PRFM_TYPE_PLD, + AARCH64_INSN_PRFM_TYPE_PLI, + AARCH64_INSN_PRFM_TYPE_PST, +}; + +enum aarch64_insn_prfm_target { + AARCH64_INSN_PRFM_TARGET_L1, + AARCH64_INSN_PRFM_TARGET_L2, + AARCH64_INSN_PRFM_TARGET_L3, +}; + +enum aarch64_insn_prfm_policy { + AARCH64_INSN_PRFM_POLICY_KEEP, + AARCH64_INSN_PRFM_POLICY_STRM, +}; + +enum aarch64_insn_adr_type { + AARCH64_INSN_ADR_TYPE_ADRP, + AARCH64_INSN_ADR_TYPE_ADR, +}; + +#define __AARCH64_INSN_FUNCS(abbr, mask, val) \ +static __always_inline bool aarch64_insn_is_##abbr(u32 code) \ +{ \ + BUILD_BUG_ON(~(mask) & (val)); \ + return (code & (mask)) == (val); \ +} \ +static __always_inline u32 aarch64_insn_get_##abbr##_value(void) \ +{ \ + return (val); \ +} + +__AARCH64_INSN_FUNCS(adr, 0x9F000000, 0x10000000) +__AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000) +__AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) +__AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) +__AARCH64_INSN_FUNCS(store_imm, 0x3FC00000, 0x39000000) +__AARCH64_INSN_FUNCS(load_imm, 0x3FC00000, 0x39400000) +__AARCH64_INSN_FUNCS(store_pre, 0x3FE00C00, 0x38000C00) +__AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) +__AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) +__AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) +__AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) +__AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) +__AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) +__AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) +__AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) +__AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) +__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) +__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) +__AARCH64_INSN_FUNCS(ldp, 
0x7FC00000, 0x29400000) +__AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) +__AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000) +__AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000) +__AARCH64_INSN_FUNCS(ldp_pre, 0x7FC00000, 0x29C00000) +__AARCH64_INSN_FUNCS(add_imm, 0x7F000000, 0x11000000) +__AARCH64_INSN_FUNCS(adds_imm, 0x7F000000, 0x31000000) +__AARCH64_INSN_FUNCS(sub_imm, 0x7F000000, 0x51000000) +__AARCH64_INSN_FUNCS(subs_imm, 0x7F000000, 0x71000000) +__AARCH64_INSN_FUNCS(movn, 0x7F800000, 0x12800000) +__AARCH64_INSN_FUNCS(sbfm, 0x7F800000, 0x13000000) +__AARCH64_INSN_FUNCS(bfm, 0x7F800000, 0x33000000) +__AARCH64_INSN_FUNCS(movz, 0x7F800000, 0x52800000) +__AARCH64_INSN_FUNCS(ubfm, 0x7F800000, 0x53000000) +__AARCH64_INSN_FUNCS(movk, 0x7F800000, 0x72800000) +__AARCH64_INSN_FUNCS(add, 0x7F200000, 0x0B000000) +__AARCH64_INSN_FUNCS(adds, 0x7F200000, 0x2B000000) +__AARCH64_INSN_FUNCS(sub, 0x7F200000, 0x4B000000) +__AARCH64_INSN_FUNCS(subs, 0x7F200000, 0x6B000000) +__AARCH64_INSN_FUNCS(madd, 0x7FE08000, 0x1B000000) +__AARCH64_INSN_FUNCS(msub, 0x7FE08000, 0x1B008000) +__AARCH64_INSN_FUNCS(udiv, 0x7FE0FC00, 0x1AC00800) +__AARCH64_INSN_FUNCS(sdiv, 0x7FE0FC00, 0x1AC00C00) +__AARCH64_INSN_FUNCS(lslv, 0x7FE0FC00, 0x1AC02000) +__AARCH64_INSN_FUNCS(lsrv, 0x7FE0FC00, 0x1AC02400) +__AARCH64_INSN_FUNCS(asrv, 0x7FE0FC00, 0x1AC02800) +__AARCH64_INSN_FUNCS(rorv, 0x7FE0FC00, 0x1AC02C00) +__AARCH64_INSN_FUNCS(rev16, 0x7FFFFC00, 0x5AC00400) +__AARCH64_INSN_FUNCS(rev32, 0x7FFFFC00, 0x5AC00800) +__AARCH64_INSN_FUNCS(rev64, 0x7FFFFC00, 0x5AC00C00) +__AARCH64_INSN_FUNCS(and, 0x7F200000, 0x0A000000) +__AARCH64_INSN_FUNCS(bic, 0x7F200000, 0x0A200000) +__AARCH64_INSN_FUNCS(orr, 0x7F200000, 0x2A000000) +__AARCH64_INSN_FUNCS(mov_reg, 0x7FE0FFE0, 0x2A0003E0) +__AARCH64_INSN_FUNCS(orn, 0x7F200000, 0x2A200000) +__AARCH64_INSN_FUNCS(eor, 0x7F200000, 0x4A000000) +__AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000) +__AARCH64_INSN_FUNCS(ands, 0x7F200000, 0x6A000000) +__AARCH64_INSN_FUNCS(bics, 0x7F200000, 0x6A200000) +__AARCH64_INSN_FUNCS(and_imm, 0x7F800000, 0x12000000) +__AARCH64_INSN_FUNCS(orr_imm, 0x7F800000, 0x32000000) +__AARCH64_INSN_FUNCS(eor_imm, 0x7F800000, 0x52000000) +__AARCH64_INSN_FUNCS(ands_imm, 0x7F800000, 0x72000000) +__AARCH64_INSN_FUNCS(extr, 0x7FA00000, 0x13800000) +__AARCH64_INSN_FUNCS(b, 0xFC000000, 0x14000000) +__AARCH64_INSN_FUNCS(bl, 0xFC000000, 0x94000000) +__AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000) +__AARCH64_INSN_FUNCS(cbnz, 0x7F000000, 0x35000000) +__AARCH64_INSN_FUNCS(tbz, 0x7F000000, 0x36000000) +__AARCH64_INSN_FUNCS(tbnz, 0x7F000000, 0x37000000) +__AARCH64_INSN_FUNCS(bcond, 0xFF000010, 0x54000000) +__AARCH64_INSN_FUNCS(svc, 0xFFE0001F, 0xD4000001) +__AARCH64_INSN_FUNCS(hvc, 0xFFE0001F, 0xD4000002) +__AARCH64_INSN_FUNCS(smc, 0xFFE0001F, 0xD4000003) +__AARCH64_INSN_FUNCS(brk, 0xFFE0001F, 0xD4200000) +__AARCH64_INSN_FUNCS(exception, 0xFF000000, 0xD4000000) +__AARCH64_INSN_FUNCS(hint, 0xFFFFF01F, 0xD503201F) +__AARCH64_INSN_FUNCS(br, 0xFFFFFC1F, 0xD61F0000) +__AARCH64_INSN_FUNCS(br_auth, 0xFEFFF800, 0xD61F0800) +__AARCH64_INSN_FUNCS(blr, 0xFFFFFC1F, 0xD63F0000) +__AARCH64_INSN_FUNCS(blr_auth, 0xFEFFF800, 0xD63F0800) +__AARCH64_INSN_FUNCS(ret, 0xFFFFFC1F, 0xD65F0000) +__AARCH64_INSN_FUNCS(ret_auth, 0xFFFFFBFF, 0xD65F0BFF) +__AARCH64_INSN_FUNCS(eret, 0xFFFFFFFF, 0xD69F03E0) +__AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF) +__AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000) +__AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F) +__AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 
0xD5100000) +__AARCH64_INSN_FUNCS(dmb, 0xFFFFF0FF, 0xD50330BF) +__AARCH64_INSN_FUNCS(dsb_base, 0xFFFFF0FF, 0xD503309F) +__AARCH64_INSN_FUNCS(dsb_nxs, 0xFFFFF3FF, 0xD503323F) +__AARCH64_INSN_FUNCS(isb, 0xFFFFF0FF, 0xD50330DF) +__AARCH64_INSN_FUNCS(sb, 0xFFFFFFFF, 0xD50330FF) +__AARCH64_INSN_FUNCS(clrex, 0xFFFFF0FF, 0xD503305F) +__AARCH64_INSN_FUNCS(ssbb, 0xFFFFFFFF, 0xD503309F) +__AARCH64_INSN_FUNCS(pssbb, 0xFFFFFFFF, 0xD503349F) + +#undef __AARCH64_INSN_FUNCS + +bool aarch64_insn_is_steppable_hint(u32 insn); +bool aarch64_insn_is_branch_imm(u32 insn); + +static inline bool aarch64_insn_is_adr_adrp(u32 insn) +{ + return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); +} + +static inline bool aarch64_insn_is_dsb(u32 insn) +{ + return (aarch64_insn_is_dsb_base(insn) && (insn & 0xb00)) || + aarch64_insn_is_dsb_nxs(insn); +} + +static inline bool aarch64_insn_is_barrier(u32 insn) +{ + return aarch64_insn_is_dmb(insn) || aarch64_insn_is_dsb(insn) || + aarch64_insn_is_isb(insn) || aarch64_insn_is_sb(insn) || + aarch64_insn_is_clrex(insn) || aarch64_insn_is_ssbb(insn) || + aarch64_insn_is_pssbb(insn); +} + +static inline bool aarch64_insn_is_store_single(u32 insn) +{ + return aarch64_insn_is_store_imm(insn) || + aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_store_post(insn); +} + +static inline bool aarch64_insn_is_store_pair(u32 insn) +{ + return aarch64_insn_is_stp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_stp_post(insn); +} + +static inline bool aarch64_insn_is_load_single(u32 insn) +{ + return aarch64_insn_is_load_imm(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_load_post(insn); +} + +static inline bool aarch64_insn_is_load_pair(u32 insn) +{ + return aarch64_insn_is_ldp(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_ldp_post(insn); +} + +enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); +bool aarch64_insn_uses_literal(u32 insn); +bool aarch64_insn_is_branch(u32 insn); +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn); +u32 aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, + u32 insn, u64 imm); +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn); +u32 aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_branch_type type); +u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_branch_type type); +u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_condition cond); +u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op); +u32 aarch64_insn_gen_nop(void); +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type); +u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register offset, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_register base, + int offset, + enum aarch64_insn_variant variant, + enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register state, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register 
result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size); +u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size); +u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int imm, enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type); +u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_adr_type type); +u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int immr, int imms, + enum aarch64_insn_variant variant, + enum aarch64_insn_bitfield_type type); +u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, + int imm, int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_movewide_type type); +u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type); +u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant, + enum aarch64_insn_data1_type type); +u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_data2_type type); +u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_variant variant, + enum aarch64_insn_data3_type type); +u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_logic_type type); +u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant); +u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, + enum aarch64_insn_variant variant, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u64 imm); +u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, + enum aarch64_insn_register Rm, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u8 lsb); +u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, + enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy); +s32 aarch64_get_branch_offset(u32 insn); +u32 aarch64_set_branch_offset(u32 insn, s32 offset); + +s32 aarch64_insn_adrp_get_offset(u32 insn); +u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset); + +bool aarch32_insn_is_wide(u32 insn); + +#define A32_RN_OFFSET 16 +#define A32_RT_OFFSET 12 +#define A32_RT2_OFFSET 0 + +u32 aarch64_insn_extract_system_reg(u32 insn); +u32 aarch32_insn_extract_reg_num(u32 insn, int offset); +u32 aarch32_insn_mcr_extract_opc2(u32 insn); +u32 aarch32_insn_mcr_extract_crm(u32 insn); + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_INSN_H */ diff --git a/tools/arch/arm64/include/asm/unwind_hints.h b/tools/arch/arm64/include/asm/unwind_hints.h new file mode 100644 index 0000000000000..60f866e4e12c8 --- /dev/null +++ b/tools/arch/arm64/include/asm/unwind_hints.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef 
__ASM_UNWIND_HINTS_H +#define __ASM_UNWIND_HINTS_H + +#include + +#define UNWIND_HINT_REG_UNDEFINED 0xff +#define UNWIND_HINT_REG_SP 31 + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=UNWIND_HINT_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_FUNC sp_offset=0 + UNWIND_HINT sp_reg=UNWIND_HINT_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_REGS base=UNWIND_HINT_REG_SP offset=0 + UNWIND_HINT sp_reg=\base sp_offset=\offset type=UNWIND_HINT_TYPE_REGS +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_UNWIND_HINTS_H */ diff --git a/tools/arch/arm64/lib/insn.c b/tools/arch/arm64/lib/insn.c new file mode 100644 index 0000000000000..b24407ed03982 --- /dev/null +++ b/tools/arch/arm64/lib/insn.c @@ -0,0 +1,1456 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu + * + * Copyright (C) 2014-2016 Zi Shen Lim + */ +#include +#include +#include +#include +#include + +#include +#include + +#define AARCH64_INSN_SF_BIT BIT(31) +#define AARCH64_INSN_N_BIT BIT(22) +#define AARCH64_INSN_LSL_12 BIT(22) + +static const int aarch64_insn_encoding_class[] = { + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_SVE, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, +}; + +enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) +{ + return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; +} + +bool __kprobes aarch64_insn_is_steppable_hint(u32 insn) +{ + if (!aarch64_insn_is_hint(insn)) + return false; + + switch (insn & 0xFE0) { + case AARCH64_INSN_HINT_XPACLRI: + case AARCH64_INSN_HINT_PACIA_1716: + case AARCH64_INSN_HINT_PACIB_1716: + case AARCH64_INSN_HINT_PACIAZ: + case AARCH64_INSN_HINT_PACIASP: + case AARCH64_INSN_HINT_PACIBZ: + case AARCH64_INSN_HINT_PACIBSP: + case AARCH64_INSN_HINT_BTI: + case AARCH64_INSN_HINT_BTIC: + case AARCH64_INSN_HINT_BTIJ: + case AARCH64_INSN_HINT_BTIJC: + case AARCH64_INSN_HINT_NOP: + return true; + default: + return false; + } +} + +bool aarch64_insn_is_branch_imm(u32 insn) +{ + return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || + aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)); +} + +bool __kprobes aarch64_insn_uses_literal(u32 insn) +{ + /* ldr/ldrsw (literal), prfm */ + + return aarch64_insn_is_ldr_lit(insn) || + aarch64_insn_is_ldrsw_lit(insn) || + aarch64_insn_is_adr_adrp(insn) || + aarch64_insn_is_prfm_lit(insn); +} + +bool __kprobes aarch64_insn_is_branch(u32 insn) +{ + /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */ + + return aarch64_insn_is_b(insn) || + aarch64_insn_is_bl(insn) || + aarch64_insn_is_cbz(insn) || + aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_tbz(insn) || + aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_ret(insn) || + aarch64_insn_is_ret_auth(insn) || + aarch64_insn_is_br(insn) || + aarch64_insn_is_br_auth(insn) || + aarch64_insn_is_blr(insn) || + aarch64_insn_is_blr_auth(insn) || + aarch64_insn_is_bcond(insn); +} + +static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, + u32 *maskp, int *shiftp) +{ + u32 mask; + int shift; + + switch 
(type) { + case AARCH64_INSN_IMM_26: + mask = BIT(26) - 1; + shift = 0; + break; + case AARCH64_INSN_IMM_19: + mask = BIT(19) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_16: + mask = BIT(16) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_14: + mask = BIT(14) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_12: + mask = BIT(12) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_9: + mask = BIT(9) - 1; + shift = 12; + break; + case AARCH64_INSN_IMM_7: + mask = BIT(7) - 1; + shift = 15; + break; + case AARCH64_INSN_IMM_6: + case AARCH64_INSN_IMM_S: + mask = BIT(6) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_R: + mask = BIT(6) - 1; + shift = 16; + break; + case AARCH64_INSN_IMM_N: + mask = 1; + shift = 22; + break; + default: + return -EINVAL; + } + + *maskp = mask; + *shiftp = shift; + + return 0; +} + +#define ADR_IMM_HILOSPLIT 2 +#define ADR_IMM_SIZE SZ_2M +#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_LOSHIFT 29 +#define ADR_IMM_HISHIFT 5 + +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) +{ + u32 immlo, immhi, mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; + immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; + insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; + mask = ADR_IMM_SIZE - 1; + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", + type); + return 0; + } + } + + return (insn >> shift) & mask; +} + +u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, + u32 insn, u64 imm) +{ + u32 immlo, immhi, mask; + int shift; + + if (insn == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT; + imm >>= ADR_IMM_HILOSPLIT; + immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT; + imm = immlo | immhi; + mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) | + (ADR_IMM_HIMASK << ADR_IMM_HISHIFT)); + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n", + type); + return AARCH64_BREAK_FAULT; + } + } + + /* Update the immediate field. 
*/ + insn &= ~(mask << shift); + insn |= (imm & mask) << shift; + + return insn; +} + +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn) +{ + int shift; + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + shift = 16; + break; + default: + pr_err("%s: unknown register type encoding %d\n", __func__, + type); + return 0; + } + + return (insn >> shift) & GENMASK(4, 0); +} + +static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, + u32 insn, + enum aarch64_insn_register reg) +{ + int shift; + + if (insn == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { + pr_err("%s: unknown register encoding %d\n", __func__, reg); + return AARCH64_BREAK_FAULT; + } + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + case AARCH64_INSN_REGTYPE_RS: + shift = 16; + break; + default: + pr_err("%s: unknown register type encoding %d\n", __func__, + type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~(GENMASK(4, 0) << shift); + insn |= reg << shift; + + return insn; +} + +static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, + u32 insn) +{ + u32 size; + + switch (type) { + case AARCH64_INSN_SIZE_8: + size = 0; + break; + case AARCH64_INSN_SIZE_16: + size = 1; + break; + case AARCH64_INSN_SIZE_32: + size = 2; + break; + case AARCH64_INSN_SIZE_64: + size = 3; + break; + default: + pr_err("%s: unknown size encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~GENMASK(31, 30); + insn |= size << 30; + + return insn; +} + +static inline long branch_imm_common(unsigned long pc, unsigned long addr, + long range) +{ + long offset; + + if ((pc & 0x3) || (addr & 0x3)) { + pr_err("%s: A64 instructions must be word aligned\n", __func__); + return range; + } + + offset = ((long)addr - (long)pc); + + if (offset < -range || offset >= range) { + pr_err("%s: offset out of range\n", __func__); + return range; + } + + return offset; +} + +u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_branch_type type) +{ + u32 insn; + long offset; + + /* + * B/BL support [-128M, 128M) offset + * ARM64 virtual address arrangement guarantees all kernel and module + * texts are within +/-128M. 
+ */ + offset = branch_imm_common(pc, addr, SZ_128M); + if (offset >= SZ_128M) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_bl_value(); + break; + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_b_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_branch_type type) +{ + u32 insn; + long offset; + + offset = branch_imm_common(pc, addr, SZ_1M); + if (offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_BRANCH_COMP_ZERO: + insn = aarch64_insn_get_cbz_value(); + break; + case AARCH64_INSN_BRANCH_COMP_NONZERO: + insn = aarch64_insn_get_cbnz_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_condition cond) +{ + u32 insn; + long offset; + + offset = branch_imm_common(pc, addr, SZ_1M); + + insn = aarch64_insn_get_bcond_value(); + + if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) { + pr_err("%s: unknown condition encoding %d\n", __func__, cond); + return AARCH64_BREAK_FAULT; + } + insn |= cond; + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op) +{ + return aarch64_insn_get_hint_value() | op; +} + +u32 __kprobes aarch64_insn_gen_nop(void) +{ + return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); +} + +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_br_value(); + break; + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_blr_value(); + break; + case AARCH64_INSN_BRANCH_RETURN: + insn = aarch64_insn_get_ret_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); +} + +u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register offset, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_REG_OFFSET: + insn = aarch64_insn_get_ldr_reg_value(); + break; + case AARCH64_INSN_LDST_STORE_REG_OFFSET: + insn = aarch64_insn_get_str_reg_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + insn = 
aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, + offset); +} + +u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_register base, + int offset, + enum aarch64_insn_variant variant, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + int shift; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX: + insn = aarch64_insn_get_ldp_pre_value(); + break; + case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX: + insn = aarch64_insn_get_stp_pre_value(); + break; + case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX: + insn = aarch64_insn_get_ldp_post_value(); + break; + case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX: + insn = aarch64_insn_get_stp_post_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if ((offset & 0x3) || (offset < -256) || (offset > 252)) { + pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n", + __func__, offset); + return AARCH64_BREAK_FAULT; + } + shift = 2; + break; + case AARCH64_INSN_VARIANT_64BIT: + if ((offset & 0x7) || (offset < -512) || (offset > 504)) { + pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n", + __func__, offset); + return AARCH64_BREAK_FAULT; + } + shift = 3; + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg1); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, + reg2); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn, + offset >> shift); +} + +u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register state, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_EX: + insn = aarch64_insn_get_load_ex_value(); + break; + case AARCH64_INSN_LDST_STORE_EX: + insn = aarch64_insn_get_store_ex_value(); + break; + default: + pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, + AARCH64_INSN_REG_ZR); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + state); +} + +u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size) +{ + u32 insn = aarch64_insn_get_ldadd_value(); + + switch (size) { + case AARCH64_INSN_SIZE_32: + case AARCH64_INSN_SIZE_64: + break; + default: + pr_err("%s: unimplemented size encoding %d\n", __func__, size); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + result); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + address); + + return 
aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + value); +} + +u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size) +{ + /* + * STADD is simply encoded as an alias for LDADD with XZR as + * the destination register. + */ + return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address, + value, size); +} + +static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy, + u32 insn) +{ + u32 imm_type = 0, imm_target = 0, imm_policy = 0; + + switch (type) { + case AARCH64_INSN_PRFM_TYPE_PLD: + break; + case AARCH64_INSN_PRFM_TYPE_PLI: + imm_type = BIT(0); + break; + case AARCH64_INSN_PRFM_TYPE_PST: + imm_type = BIT(1); + break; + default: + pr_err("%s: unknown prfm type encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (target) { + case AARCH64_INSN_PRFM_TARGET_L1: + break; + case AARCH64_INSN_PRFM_TARGET_L2: + imm_target = BIT(0); + break; + case AARCH64_INSN_PRFM_TARGET_L3: + imm_target = BIT(1); + break; + default: + pr_err("%s: unknown prfm target encoding %d\n", __func__, target); + return AARCH64_BREAK_FAULT; + } + + switch (policy) { + case AARCH64_INSN_PRFM_POLICY_KEEP: + break; + case AARCH64_INSN_PRFM_POLICY_STRM: + imm_policy = BIT(0); + break; + default: + pr_err("%s: unknown prfm policy encoding %d\n", __func__, policy); + return AARCH64_BREAK_FAULT; + } + + /* In this case, imm5 is encoded into Rt field. */ + insn &= ~GENMASK(4, 0); + insn |= imm_policy | (imm_target << 1) | (imm_type << 3); + + return insn; +} + +u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, + enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy) +{ + u32 insn = aarch64_insn_get_prfm_value(); + + insn = aarch64_insn_encode_ldst_size(AARCH64_INSN_SIZE_64, insn); + + insn = aarch64_insn_encode_prfm_imm(type, target, policy, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, 0); +} + +u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int imm, enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_ADSB_ADD: + insn = aarch64_insn_get_add_imm_value(); + break; + case AARCH64_INSN_ADSB_SUB: + insn = aarch64_insn_get_sub_imm_value(); + break; + case AARCH64_INSN_ADSB_ADD_SETFLAGS: + insn = aarch64_insn_get_adds_imm_value(); + break; + case AARCH64_INSN_ADSB_SUB_SETFLAGS: + insn = aarch64_insn_get_subs_imm_value(); + break; + default: + pr_err("%s: unknown add/sub encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + /* We can't encode more than a 24bit value (12bit + 12bit shift) */ + if (imm & ~(BIT(24) - 1)) + goto out; + + /* If we have something in the top 12 bits... */ + if (imm & ~(SZ_4K - 1)) { + /* ... 
and in the low 12 bits -> error */ + if (imm & (SZ_4K - 1)) + goto out; + + imm >>= 12; + insn |= AARCH64_INSN_LSL_12; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); + +out: + pr_err("%s: invalid immediate encoding %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; +} + +u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int immr, int imms, + enum aarch64_insn_variant variant, + enum aarch64_insn_bitfield_type type) +{ + u32 insn; + u32 mask; + + switch (type) { + case AARCH64_INSN_BITFIELD_MOVE: + insn = aarch64_insn_get_bfm_value(); + break; + case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED: + insn = aarch64_insn_get_ubfm_value(); + break; + case AARCH64_INSN_BITFIELD_MOVE_SIGNED: + insn = aarch64_insn_get_sbfm_value(); + break; + default: + pr_err("%s: unknown bitfield encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + mask = GENMASK(4, 0); + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT; + mask = GENMASK(5, 0); + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + if (immr & ~mask) { + pr_err("%s: invalid immr encoding %d\n", __func__, immr); + return AARCH64_BREAK_FAULT; + } + if (imms & ~mask) { + pr_err("%s: invalid imms encoding %d\n", __func__, imms); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); +} + +u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, + int imm, int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_movewide_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_MOVEWIDE_ZERO: + insn = aarch64_insn_get_movz_value(); + break; + case AARCH64_INSN_MOVEWIDE_KEEP: + insn = aarch64_insn_get_movk_value(); + break; + case AARCH64_INSN_MOVEWIDE_INVERSE: + insn = aarch64_insn_get_movn_value(); + break; + default: + pr_err("%s: unknown movewide encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + if (imm & ~(SZ_64K - 1)) { + pr_err("%s: invalid immediate encoding %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift != 0 && shift != 16) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift != 0 && shift != 16 && shift != 32 && shift != 48) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn |= (shift >> 4) << 21; + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); +} + +u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant 
variant, + enum aarch64_insn_adsb_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_ADSB_ADD: + insn = aarch64_insn_get_add_value(); + break; + case AARCH64_INSN_ADSB_SUB: + insn = aarch64_insn_get_sub_value(); + break; + case AARCH64_INSN_ADSB_ADD_SETFLAGS: + insn = aarch64_insn_get_adds_value(); + break; + case AARCH64_INSN_ADSB_SUB_SETFLAGS: + insn = aarch64_insn_get_subs_value(); + break; + default: + pr_err("%s: unknown add/sub encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift & ~(SZ_32 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift & ~(SZ_64 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); +} + +u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant, + enum aarch64_insn_data1_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA1_REVERSE_16: + insn = aarch64_insn_get_rev16_value(); + break; + case AARCH64_INSN_DATA1_REVERSE_32: + insn = aarch64_insn_get_rev32_value(); + break; + case AARCH64_INSN_DATA1_REVERSE_64: + if (variant != AARCH64_INSN_VARIANT_64BIT) { + pr_err("%s: invalid variant for reverse64 %d\n", + __func__, variant); + return AARCH64_BREAK_FAULT; + } + insn = aarch64_insn_get_rev64_value(); + break; + default: + pr_err("%s: unknown data1 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); +} + +u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_data2_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA2_UDIV: + insn = aarch64_insn_get_udiv_value(); + break; + case AARCH64_INSN_DATA2_SDIV: + insn = aarch64_insn_get_sdiv_value(); + break; + case AARCH64_INSN_DATA2_LSLV: + insn = aarch64_insn_get_lslv_value(); + break; + case AARCH64_INSN_DATA2_LSRV: + insn = aarch64_insn_get_lsrv_value(); + break; + case AARCH64_INSN_DATA2_ASRV: + insn = aarch64_insn_get_asrv_value(); + break; + case AARCH64_INSN_DATA2_RORV: + insn = aarch64_insn_get_rorv_value(); + break; + default: + pr_err("%s: unknown data2 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return 
AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); +} + +u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_variant variant, + enum aarch64_insn_data3_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA3_MADD: + insn = aarch64_insn_get_madd_value(); + break; + case AARCH64_INSN_DATA3_MSUB: + insn = aarch64_insn_get_msub_value(); + break; + default: + pr_err("%s: unknown data3 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + reg1); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, + reg2); +} + +u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_logic_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LOGIC_AND: + insn = aarch64_insn_get_and_value(); + break; + case AARCH64_INSN_LOGIC_BIC: + insn = aarch64_insn_get_bic_value(); + break; + case AARCH64_INSN_LOGIC_ORR: + insn = aarch64_insn_get_orr_value(); + break; + case AARCH64_INSN_LOGIC_ORN: + insn = aarch64_insn_get_orn_value(); + break; + case AARCH64_INSN_LOGIC_EOR: + insn = aarch64_insn_get_eor_value(); + break; + case AARCH64_INSN_LOGIC_EON: + insn = aarch64_insn_get_eon_value(); + break; + case AARCH64_INSN_LOGIC_AND_SETFLAGS: + insn = aarch64_insn_get_ands_value(); + break; + case AARCH64_INSN_LOGIC_BIC_SETFLAGS: + insn = aarch64_insn_get_bics_value(); + break; + default: + pr_err("%s: unknown logical encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift & ~(SZ_32 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift & ~(SZ_64 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); +} + +/* + * MOV (register) is architecturally an alias of ORR (shifted register) where + * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m> + */ +u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant) +{ + return aarch64_insn_gen_logical_shifted_reg(dst, 
AARCH64_INSN_REG_ZR, + src, 0, variant, + AARCH64_INSN_LOGIC_ORR); +} + +u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_adr_type type) +{ + u32 insn; + s32 offset; + + switch (type) { + case AARCH64_INSN_ADR_TYPE_ADR: + insn = aarch64_insn_get_adr_value(); + offset = addr - pc; + break; + case AARCH64_INSN_ADR_TYPE_ADRP: + insn = aarch64_insn_get_adrp_value(); + offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12; + break; + default: + pr_err("%s: unknown adr encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + if (offset < -SZ_1M || offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset); +} + +/* + * Decode the imm field of a branch, and return the byte offset as a + * signed value (so it can be used when computing a new branch + * target). + */ +s32 aarch64_get_branch_offset(u32 insn) +{ + s32 imm; + + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); + return (imm << 6) >> 4; + } + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); + return (imm << 13) >> 11; + } + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); + return (imm << 18) >> 16; + } + + /* Unhandled instruction */ + BUG(); +} + +/* + * Encode the displacement of a branch in the imm field and return the + * updated instruction. + */ +u32 aarch64_set_branch_offset(u32 insn, s32 offset) +{ + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, + offset >> 2); + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn, + offset >> 2); + + /* Unhandled instruction */ + BUG(); +} + +s32 aarch64_insn_adrp_get_offset(u32 insn) +{ + BUG_ON(!aarch64_insn_is_adrp(insn)); + return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; +} + +u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset) +{ + BUG_ON(!aarch64_insn_is_adrp(insn)); + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, + offset >> 12); +} + +/* + * Extract the Op/CR data from a msr/mrs instruction. + */ +u32 aarch64_insn_extract_system_reg(u32 insn) +{ + return (insn & 0x1FFFE0) >> 5; +} + +bool aarch32_insn_is_wide(u32 insn) +{ + return insn >= 0xe800; +} + +/* + * Macros/defines for extracting register numbers from instruction. 
+ */ +u32 aarch32_insn_extract_reg_num(u32 insn, int offset) +{ + return (insn & (0xf << offset)) >> offset; +} + +#define OPC2_MASK 0x7 +#define OPC2_OFFSET 5 +u32 aarch32_insn_mcr_extract_opc2(u32 insn) +{ + return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET; +} + +#define CRM_MASK 0xf +u32 aarch32_insn_mcr_extract_crm(u32 insn) +{ + return insn & CRM_MASK; +} + +static bool range_of_ones(u64 val) +{ + /* Doesn't handle full ones or full zeroes */ + u64 sval = val >> __ffs64(val); + + /* One of Sean Eron Anderson's bithack tricks */ + return ((sval + 1) & (sval)) == 0; +} + +static u32 aarch64_encode_immediate(u64 imm, + enum aarch64_insn_variant variant, + u32 insn) +{ + unsigned int immr, imms, n, ones, ror, esz, tmp; + u64 mask; + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + esz = 32; + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + esz = 64; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + mask = GENMASK(esz - 1, 0); + + /* Can't encode full zeroes, full ones, or value wider than the mask */ + if (!imm || imm == mask || imm & ~mask) + return AARCH64_BREAK_FAULT; + + /* + * Inverse of Replicate(). Try to spot a repeating pattern + * with a pow2 stride. + */ + for (tmp = esz / 2; tmp >= 2; tmp /= 2) { + u64 emask = BIT(tmp) - 1; + + if ((imm & emask) != ((imm >> tmp) & emask)) + break; + + esz = tmp; + mask = emask; + } + + /* N is only set if we're encoding a 64bit value */ + n = esz == 64; + + /* Trim imm to the element size */ + imm &= mask; + + /* That's how many ones we need to encode */ + ones = hweight64(imm); + + /* + * imms is set to (ones - 1), prefixed with a string of ones + * and a zero if they fit. Cap it to 6 bits. + */ + imms = ones - 1; + imms |= 0xf << ffs(esz); + imms &= BIT(6) - 1; + + /* Compute the rotation */ + if (range_of_ones(imm)) { + /* + * Pattern: 0..01..10..0 + * + * Compute how many rotate we need to align it right + */ + ror = __ffs64(imm); + } else { + /* + * Pattern: 0..01..10..01..1 + * + * Fill the unused top bits with ones, and check if + * the result is a valid immediate (all ones with a + * contiguous ranges of zeroes). + */ + imm |= ~mask; + if (!range_of_ones(~imm)) + return AARCH64_BREAK_FAULT; + + /* + * Compute the rotation to get a continuous set of + * ones, with the first bit set at position 0 + */ + ror = fls(~imm); + } + + /* + * immr is the number of bits we need to rotate back to the + * original set of ones. Note that this is relative to the + * element size... 
+ */ + immr = (esz - ror) % esz; + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n); + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); +} + +u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, + enum aarch64_insn_variant variant, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u64 imm) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LOGIC_AND: + insn = aarch64_insn_get_and_imm_value(); + break; + case AARCH64_INSN_LOGIC_ORR: + insn = aarch64_insn_get_orr_imm_value(); + break; + case AARCH64_INSN_LOGIC_EOR: + insn = aarch64_insn_get_eor_imm_value(); + break; + case AARCH64_INSN_LOGIC_AND_SETFLAGS: + insn = aarch64_insn_get_ands_imm_value(); + break; + default: + pr_err("%s: unknown logical encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); + return aarch64_encode_immediate(imm, variant, insn); +} + +u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, + enum aarch64_insn_register Rm, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u8 lsb) +{ + u32 insn; + + insn = aarch64_insn_get_extr_value(); + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (lsb > 31) + return AARCH64_BREAK_FAULT; + break; + case AARCH64_INSN_VARIANT_64BIT: + if (lsb > 63) + return AARCH64_BREAK_FAULT; + insn |= AARCH64_INSN_SF_BIT; + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1); + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); +} diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 2ae4d74ee73b4..2866fa3501800 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -295,7 +295,6 @@ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 8fb9256768134..f9a4a35093769 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -148,10 +148,6 @@ * are restricted to targets in * kernel. */ -#define ARCH_CAP_PBRSB_NO BIT(24) /* - * Not susceptible to Post-Barrier - * Return Stack Buffer Predictions. 
- */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index fdbffec4cfdea..5a2baf28a1dcd 100644 --- a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -40,6 +40,8 @@ #define ORC_REG_MAX 15 #ifndef __ASSEMBLY__ +#include + /* * This struct is more or less a vastly simplified version of the DWARF Call * Frame Information standard. It contains only the necessary parts of DWARF @@ -51,10 +53,18 @@ struct orc_entry { s16 sp_offset; s16 bp_offset; +#if defined(__LITTLE_ENDIAN_BITFIELD) unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; unsigned end:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned bp_reg:4; + unsigned sp_reg:4; + unsigned unused:5; + unsigned end:1; + unsigned type:2; +#endif } __packed; #endif /* __ASSEMBLY__ */ diff --git a/tools/include/asm-generic/bitops/__ffs.h b/tools/include/asm-generic/bitops/__ffs.h index 9d13105194970..963f8a22212fd 100644 --- a/tools/include/asm-generic/bitops/__ffs.h +++ b/tools/include/asm-generic/bitops/__ffs.h @@ -42,4 +42,15 @@ static __always_inline unsigned long __ffs(unsigned long word) return num; } +static inline unsigned long __ffs64(u64 word) +{ +#if BITS_PER_LONG == 32 + if (((u32)word) == 0UL) + return __ffs((u32)(word >> 32)) + 32; +#elif BITS_PER_LONG != 64 +#error BITS_PER_LONG not 32 or 64 +#endif + return __ffs((unsigned long)word); +} + #endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS___FFS_H_ */ diff --git a/tools/include/linux/bug.h b/tools/include/linux/bug.h index 85f80258a15f6..548be7cffa8e7 100644 --- a/tools/include/linux/bug.h +++ b/tools/include/linux/bug.h @@ -2,10 +2,6 @@ #ifndef _TOOLS_PERF_LINUX_BUG_H #define _TOOLS_PERF_LINUX_BUG_H -/* Force a compilation error if condition is true, but also produce a - result (of value 0 and type size_t), so the expression can be used - e.g. in a structure initializer (or where-ever else comma expressions - aren't permitted). */ -#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +#include #endif /* _TOOLS_PERF_LINUX_BUG_H */ diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index a7e54a08fb54c..e748982ed5c1a 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -114,6 +114,27 @@ int scnprintf_pad(char * buf, size_t size, const char * fmt, ...); #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) #define round_down(x, y) ((x) & ~__round_mask(x, y)) +/** + * upper_32_bits - return bits 32-63 of a number + * @n: the number we're accessing + * + * A basic shift-right of a 64- or 32-bit quantity. Use this to suppress + * the "right shift count >= width of type" warning when that quantity is + * 32-bits. 
+ */ +#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) + +/** + * lower_32_bits - return bits 0-31 of a number + * @n: the number we're accessing + */ +#define lower_32_bits(n) ((u32)(n)) + +/* Inspired from ALIGN_*_KERNEL */ +#define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a) - 1) +#define ALIGN_DOWN(x, a) __ALIGN((x) - ((a) - 1), (a)) + #define current_gfp_context(k) 0 #define synchronize_rcu() diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 662f19374bd98..a2042c4186864 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -117,6 +117,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD func:req + .pushsection .discard.func_stack_frame_non_standard, "aw" + .long \func - . + .popsection +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_STACK_VALIDATION */ @@ -130,6 +136,8 @@ struct unwind_hint { #define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm +.macro STACK_FRAME_NON_STANDARD func:req +.endm #endif #endif /* CONFIG_STACK_VALIDATION */ diff --git a/tools/include/linux/printk.h b/tools/include/linux/printk.h new file mode 100644 index 0000000000000..515ebdc47e6e1 --- /dev/null +++ b/tools/include/linux/printk.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_KERNEL_PRINTK_H_ +#define _TOOLS_LINUX_KERNEL_PRINTK_H_ + +#include +#include +#include + +#define printk(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) +#define pr_info printk +#define pr_notice printk +#define pr_cont printk + +#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#define pr_err pr_warn +#define pr_alert pr_warn +#define pr_emerg pr_warn +#define pr_crit pr_warn + +/* + * Dummy printk for disabled debugging statements to use whilst maintaining + * gcc's format checking. + */ +#define no_printk(fmt, ...) \ +({ \ + if (0) \ + printk(fmt, ##__VA_ARGS__); \ + 0; \ +}) + +/* pr_devel() should produce zero code unless DEBUG is defined */ +#ifdef DEBUG +#define pr_devel(fmt, ...) printk +#else +#define pr_devel(fmt, ...) no_printk +#endif + +#define pr_debug pr_devel + +#endif /* _TOOLS_LINUX_KERNEL_PRINTK_H_ */ diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index 30dd21f976c30..2680f2edb837a 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -152,4 +152,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim, rb_replace_node(victim, new, &root->rb_root); } -#endif /* __TOOLS_LINUX_PERF_RBTREE_H */ +/* + * The below helper functions use 2 operators with 3 different + * calling conventions. The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * rb_find(). + * + * The reason for this is to allow the find() interface without requiring an + * on-stack dummy object, which might not be feasible due to object size. 
+ */ + +/** + * rb_add_cached() - insert @node into the leftmost cached tree @tree + * @node: node to insert + * @tree: leftmost cached tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); +} + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (less(node, parent)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +/** + * rb_find_add() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + +/** + * rb_find() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Returns the rb_node matching @key or NULL. + */ +static __always_inline struct rb_node * +rb_find(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = node->rb_left; + else if (c > 0) + node = node->rb_right; + else + return node; + } + + return NULL; +} + +/** + * rb_find_first() - find the first @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the leftmost node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_find_first(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +/** + * rb_next_match() - find the next @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the next node matching @key, or NULL. 
+ */ +static __always_inline struct rb_node * +rb_next_match(const void *key, struct rb_node *node, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +/** + * rb_for_each() - iterates a subtree matching @key + * @node: iterator + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + */ +#define rb_for_each(node, key, tree, cmp) \ + for ((node) = rb_find_first((key), (tree), (cmp)); \ + (node); (node) = rb_next_match((key), (node), (cmp))) + +#endif /* __TOOLS_LINUX_PERF_RBTREE_H */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7943e748916d4..f47d79146b9cd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -240,6 +241,9 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, __MAX_BPF_ATTACH_TYPE }; @@ -1661,6 +1665,14 @@ union bpf_attr { * Return * A 8-byte long non-decreasing number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket @@ -3742,6 +3754,50 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. 
+ * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3956,9 @@ union bpf_attr { FN(per_cpu_ptr), \ FN(this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4356,6 +4415,20 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. + */ + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); }; #define BPF_TAG_SIZE 8 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index ca41220b40b8b..0d7350d1795bb 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1053,6 +1053,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 +#define KVM_CAP_PTP_KVM 198 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 015ed8253f739..1eeb779e3728c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8321,7 +8321,10 @@ static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), - BPF_PROG_SEC("sk_reuseport", BPF_PROG_TYPE_SK_REUSEPORT), + BPF_EAPROG_SEC("sk_reuseport/migrate", BPF_PROG_TYPE_SK_REUSEPORT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE), + BPF_EAPROG_SEC("sk_reuseport", BPF_PROG_TYPE_SK_REUSEPORT, + BPF_SK_REUSEPORT_SELECT), SEC_DEF("kprobe/", KPROBE, .attach_fn = attach_kprobe), BPF_PROG_SEC("uprobe/", BPF_PROG_TYPE_KPROBE), diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore index 45cefda24c7b1..14236db3677f6 100644 --- a/tools/objtool/.gitignore +++ b/tools/objtool/.gitignore @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only arch/x86/lib/inat-tables.c -objtool +/objtool fixdep diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index a43096f713c7b..d5cfbec87c022 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -27,6 +27,7 @@ all: $(OBJTOOL) INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ -I$(srctree)/tools/arch/$(SRCARCH)/include \ + -I$(srctree)/tools/objtool/include \ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) @@ -46,6 +47,11 @@ ifeq ($(SRCARCH),x86) SUBCMD_ORC := y endif +ifeq ($(SRCARCH),arm64) + SUBCMD_CHECK := y + 
CFLAGS += -Wno-nested-externs +endif + export SUBCMD_CHECK SUBCMD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include diff --git a/tools/objtool/arch/arm64/Build b/tools/objtool/arch/arm64/Build new file mode 100644 index 0000000000000..f3de3a50d5411 --- /dev/null +++ b/tools/objtool/arch/arm64/Build @@ -0,0 +1,8 @@ +objtool-y += special.o +objtool-y += decode.o + +objtool-y += libhweight.o + +$(OUTPUT)arch/arm64/libhweight.o: ../lib/hweight.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c new file mode 100644 index 0000000000000..3e0193682eba6 --- /dev/null +++ b/tools/objtool/arch/arm64/decode.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#include + +/* Hack needed to avoid depending on debug-monitors.h */ +#define AARCH64_BREAK_FAULT 0xBAD + +/* Hack needed to avoid depending on kprobes.h */ +#ifndef __kprobes +#define __kprobes +#endif + +#include "../../../arch/arm64/lib/insn.c" + +static unsigned long sign_extend(unsigned long x, int nbits) +{ + unsigned long sign_bit = (x >> (nbits - 1)) & 1; + + return ((~0UL + (sign_bit ^ 1)) << nbits) | x; +} + +struct insn_loc { + const struct section *sec; + unsigned long offset; + struct hlist_node hnode; + bool ignorable; +}; + +DEFINE_HASHTABLE(invalid_insns, 16); + +static int record_invalid_insn(const struct section *sec, + unsigned long offset, + bool ignore) +{ + struct insn_loc *loc; + struct hlist_head *l; + + l = &invalid_insns[hash_min(offset, HASH_BITS(invalid_insns))]; + if (!hlist_empty(l)) { + loc = hlist_entry(l->first, struct insn_loc, hnode); + loc->ignorable |= ignore; + return 0; + } + + loc = malloc(sizeof(*loc)); + if (!loc) { + WARN("malloc failed"); + return -1; + } + + loc->sec = sec; + loc->offset = offset; + loc->ignorable = ignore; + + hash_add(invalid_insns, &loc->hnode, loc->offset); + + return 0; +} + +int arch_post_process_instructions(struct objtool_file *file) +{ + struct hlist_node *tmp; + struct insn_loc *loc; + unsigned int bkt; + int res = 0; + + hash_for_each_safe(invalid_insns, bkt, tmp, loc, hnode) { + struct instruction *insn; + + insn = find_insn(file, (struct section *) loc->sec, loc->offset); + if (insn) { + if (loc->ignorable) { + list_del(&insn->list); + hash_del(&insn->hash); + free(insn); + } else { + WARN_FUNC("can't decode instruction", insn->sec, insn->offset); + return -1; + } + } + + hash_del(&loc->hnode); + free(loc); + } + + return res; +} + +bool arch_callee_saved_reg(unsigned char reg) +{ + switch (reg) { + case AARCH64_INSN_REG_19: + case AARCH64_INSN_REG_20: + case AARCH64_INSN_REG_21: + case AARCH64_INSN_REG_22: + case AARCH64_INSN_REG_23: + case AARCH64_INSN_REG_24: + case AARCH64_INSN_REG_25: + case AARCH64_INSN_REG_26: + case AARCH64_INSN_REG_27: + case AARCH64_INSN_REG_28: + case AARCH64_INSN_REG_FP: + case AARCH64_INSN_REG_LR: + return true; + default: + return false; + } +} + +void arch_initial_func_cfi_state(struct cfi_init_state *state) +{ + int i; + + for (i = 0; i < CFI_NUM_REGS; i++) { + state->regs[i].base = CFI_UNDEFINED; + state->regs[i].offset = 0; + } + + /* initial CFA (call frame address) */ + state->cfa.base = CFI_SP; + state->cfa.offset = 0; +} + +unsigned long arch_dest_reloc_offset(int addend) +{ + return addend; +} + +unsigned long arch_jump_destination(struct instruction *insn) +{ + return 
insn->offset + insn->immediate; +} + +const char *arch_nop_insn(int len) +{ + static u32 nop = 0; + + if (len != AARCH64_INSN_SIZE) + WARN("invalid NOP size: %d\n", len); + + if (!nop) + nop = aarch64_insn_gen_nop(); + + return (const char*)&nop; +} + +const char *arch_ret_insn(int len) +{ + return arch_nop_insn(len); +} + +static int is_arm64(const struct elf *elf) +{ + switch (elf->ehdr.e_machine) { + case EM_AARCH64: //0xB7 + return 1; + default: + WARN("unexpected ELF machine type %x", + elf->ehdr.e_machine); + return 0; + } +} + +int arch_decode_hint_reg(u8 sp_reg, int *base) +{ + if (sp_reg == UNWIND_HINT_REG_UNDEFINED) + *base = CFI_UNDEFINED; + else + *base = sp_reg; + + return 0; +} + +static struct stack_op *arm_make_store_op(enum aarch64_insn_register base, + enum aarch64_insn_register reg, + int offset) +{ + struct stack_op *op; + + op = calloc(1, sizeof(*op)); + if (!op) { + WARN("calloc failed"); + return NULL; + } + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = base; + op->dest.offset = offset; + op->src.type = OP_SRC_REG; + op->src.reg = reg; + op->src.offset = 0; + + return op; +} + +static struct stack_op *arm_make_load_op(enum aarch64_insn_register base, + enum aarch64_insn_register reg, + int offset) +{ + struct stack_op *op; + + op = calloc(1, sizeof(*op)); + if (!op) { + WARN("calloc failed"); + return NULL; + } + op->dest.type = OP_DEST_REG; + op->dest.reg = reg; + op->dest.offset = 0; + op->src.type = OP_SRC_REG_INDIRECT; + op->src.reg = base; + op->src.offset = offset; + + return op; +} + +static struct stack_op *arm_make_add_op(enum aarch64_insn_register dest, + enum aarch64_insn_register src, + int val) +{ + struct stack_op *op; + + op = calloc(1, sizeof(*op)); + if (!op) { + WARN("calloc failed"); + return NULL; + } + op->dest.type = OP_DEST_REG; + op->dest.reg = dest; + op->src.reg = src; + op->src.type = val != 0 ? 
OP_SRC_ADD : OP_SRC_REG; + op->src.offset = val; + + return op; +} + +static int arm_decode_load_store(u32 insn, enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + enum aarch64_insn_register base; + enum aarch64_insn_register rt; + struct stack_op *op; + int size; + int offset; + + *type = INSN_OTHER; + + if (aarch64_insn_is_store_single(insn) || + aarch64_insn_is_load_single(insn)) + size = 1 << ((insn & GENMASK(31, 30)) >> 30); + else + size = 4 << ((insn >> 31) & 1); + + if (aarch64_insn_is_store_imm(insn) || aarch64_insn_is_load_imm(insn)) + *immediate = size * aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, + insn); + else if (aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_store_post(insn) || + aarch64_insn_is_load_post(insn)) + *immediate = sign_extend(aarch64_insn_decode_immediate(AARCH64_INSN_IMM_9, + insn), + 9); + else if (aarch64_insn_is_stp(insn) || aarch64_insn_is_ldp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_stp_post(insn) || + aarch64_insn_is_ldp_post(insn)) + *immediate = size * sign_extend(aarch64_insn_decode_immediate(AARCH64_INSN_IMM_7, + insn), + 7); + else + return 1; + + base = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + if (base != AARCH64_INSN_REG_FP && base != AARCH64_INSN_REG_SP) + return 0; + + offset = *immediate; + + if (aarch64_insn_is_store_pre(insn) || aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_store_post(insn) || aarch64_insn_is_stp_post(insn)) { + op = arm_make_add_op(base, base, *immediate); + list_add_tail(&op->list, ops_list); + + if (aarch64_insn_is_store_post(insn) || aarch64_insn_is_stp_post(insn)) + offset = -*immediate; + else + offset = 0; + } else if (aarch64_insn_is_load_post(insn) || aarch64_insn_is_ldp_post(insn)) { + offset = 0; + } + + /* First register */ + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + if (aarch64_insn_is_store_single(insn) || + aarch64_insn_is_store_pair(insn)) + op = arm_make_store_op(base, rt, offset); + else + op = arm_make_load_op(base, rt, offset); + + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + + /* Second register (if present) */ + if (aarch64_insn_is_store_pair(insn) || + aarch64_insn_is_load_pair(insn)) { + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT2, + insn); + if (aarch64_insn_is_store_pair(insn)) + op = arm_make_store_op(base, rt, offset + size); + else + op = arm_make_load_op(base, rt, offset + size); + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + } + + if (aarch64_insn_is_load_pre(insn) || aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_load_post(insn) || aarch64_insn_is_ldp_post(insn)) { + op = arm_make_add_op(base, base, *immediate); + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + } + + return 0; +} + +static int arm_decode_add_sub_imm(u32 instr, bool set_flags, + enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + u32 rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, instr); + u32 rn = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, instr); + + *type = INSN_OTHER; + *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, instr); + + if (instr & AARCH64_INSN_LSL_12) + *immediate <<= 12; + + if ((!set_flags && rd == AARCH64_INSN_REG_SP) || + rd == AARCH64_INSN_REG_FP || + rn == AARCH64_INSN_REG_FP || + rn == AARCH64_INSN_REG_SP) { + struct stack_op *op; + int value; + + if (aarch64_insn_is_subs_imm(instr) 
|| aarch64_insn_is_sub_imm(instr)) + value = -*immediate; + else + value = *immediate; + + op = arm_make_add_op(rd, rn, value); + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + } + + return 0; +} + +int arch_decode_instruction(const struct elf *elf, const struct section *sec, + unsigned long offset, unsigned int maxlen, + unsigned int *len, enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + u32 insn; + + if (!is_arm64(elf)) + return -1; + + if (maxlen < AARCH64_INSN_SIZE) + return 0; + + *len = AARCH64_INSN_SIZE; + *immediate = 0; + + insn = *(u32 *)(sec->data->d_buf + offset); + + switch (aarch64_get_insn_class(insn)) { + case AARCH64_INSN_CLS_UNKNOWN: + { + /* + * There are a few reasons we might have non-valid opcodes in + * code sections: + * - For load literal, assembler can generate the data to be + * loaded in the code section + * - Compiler/assembler can generate zeroes to pad function that + * do not end on 8-byte alignment + */ + /* Compiler might put zeroes as padding */ + if (record_invalid_insn(sec, offset, insn == 0x0)) + return -1; + + *type = INSN_OTHER; + + break; + } + case AARCH64_INSN_CLS_DP_IMM: + /* Mov register to and from SP are aliases of add_imm */ + if (aarch64_insn_is_add_imm(insn) || + aarch64_insn_is_sub_imm(insn)) + return arm_decode_add_sub_imm(insn, false, type, immediate, + ops_list); + else if (aarch64_insn_is_adds_imm(insn) || + aarch64_insn_is_subs_imm(insn)) + return arm_decode_add_sub_imm(insn, true, type, immediate, + ops_list); + else + *type = INSN_OTHER; + break; + case AARCH64_INSN_CLS_DP_REG: + if (aarch64_insn_is_mov_reg(insn)) { + enum aarch64_insn_register rd; + enum aarch64_insn_register rm; + + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + rm = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RM, insn); + if (rd == AARCH64_INSN_REG_FP || rm == AARCH64_INSN_REG_FP) { + struct stack_op *op; + + op = arm_make_add_op(rd, rm, 0); + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + break; + } + } + *type = INSN_OTHER; + break; + case AARCH64_INSN_CLS_BR_SYS: + if (aarch64_insn_is_ret(insn) && + aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn) == AARCH64_INSN_REG_LR) { + *type = INSN_RETURN; + } else if (aarch64_insn_is_bl(insn)) { + *type = INSN_CALL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_blr(insn)) { + *type = INSN_CALL_DYNAMIC; + } else if (aarch64_insn_is_b(insn)) { + *type = INSN_JUMP_UNCONDITIONAL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_br(insn)) { + *type = INSN_JUMP_DYNAMIC; + } else if (aarch64_insn_is_branch_imm(insn)) { + /* Remaining branch opcodes are conditional */ + *type = INSN_JUMP_CONDITIONAL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_eret(insn)) { + *type = INSN_CONTEXT_SWITCH; + } else if (aarch64_insn_is_steppable_hint(insn)) { + *type = INSN_NOP; + } else if (aarch64_insn_is_brk(insn)) { + *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn); + *type = INSN_BUG; + } else { + *type = INSN_OTHER; + } + break; + case AARCH64_INSN_CLS_LDST: + { + int ret; + + ret = arm_decode_load_store(insn, type, immediate, ops_list); + if (ret <= 0) + return ret; + + if (aarch64_insn_is_ldr_lit(insn)) { + long pc_offset; + + pc_offset = insn & GENMASK(23, 5); + /* Sign extend and multiply by 4 */ + pc_offset = (pc_offset << (64 - 23)); + pc_offset = ((pc_offset >> (64 - 23)) >> 5) << 2; + + if (record_invalid_insn(sec, 
offset + pc_offset, true)) + return -1; + + /* 64-bit literal */ + if (insn & BIT(30)) { + if (record_invalid_insn(sec, + offset + pc_offset + 4, + true)) + return -1; + } + } + *type = INSN_OTHER; + break; + } + default: + *type = INSN_OTHER; + break; + } + + return 0; +} diff --git a/tools/objtool/arch/arm64/include/arch/cfi_regs.h b/tools/objtool/arch/arm64/include/arch/cfi_regs.h new file mode 100644 index 0000000000000..a5185649686b7 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/cfi_regs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _OBJTOOL_CFI_REGS_H +#define _OBJTOOL_CFI_REGS_H + +#include + +#define CFI_BP AARCH64_INSN_REG_FP +#define CFI_RA AARCH64_INSN_REG_LR +#define CFI_SP AARCH64_INSN_REG_SP + +#define CFI_NUM_REGS 32 + +#endif /* _OBJTOOL_CFI_REGS_H */ diff --git a/tools/objtool/arch/arm64/include/arch/elf.h b/tools/objtool/arch/arm64/include/arch/elf.h new file mode 100644 index 0000000000000..a31a29b1a3867 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/elf.h @@ -0,0 +1,6 @@ +#ifndef _OBJTOOL_ARCH_ELF +#define _OBJTOOL_ARCH_ELF + +#define R_NONE R_AARCH64_NONE + +#endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/arch/arm64/include/arch/endianness.h b/tools/objtool/arch/arm64/include/arch/endianness.h new file mode 100644 index 0000000000000..7c362527da205 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/endianness.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ARCH_ENDIANNESS_H +#define _ARCH_ENDIANNESS_H + +#include + +#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN + +#endif /* _ARCH_ENDIANNESS_H */ diff --git a/tools/objtool/arch/arm64/include/arch/special.h b/tools/objtool/arch/arm64/include/arch/special.h new file mode 100644 index 0000000000000..a82a9b3e51dfd --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/special.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _ARM64_ARCH_SPECIAL_H +#define _ARM64_ARCH_SPECIAL_H + +#define EX_ENTRY_SIZE 8 +#define EX_ORIG_OFFSET 0 +#define EX_NEW_OFFSET 4 + +#define JUMP_ENTRY_SIZE 16 +#define JUMP_ORIG_OFFSET 0 +#define JUMP_NEW_OFFSET 4 + +#define ALT_ENTRY_SIZE 12 +#define ALT_ORIG_OFFSET 0 +#define ALT_NEW_OFFSET 4 +#define ALT_FEATURE_OFFSET 8 +#define ALT_ORIG_LEN_OFFSET 10 +#define ALT_NEW_LEN_OFFSET 11 + +#endif /* _ARM64_ARCH_SPECIAL_H */ diff --git a/tools/objtool/arch/arm64/special.c b/tools/objtool/arch/arm64/special.c new file mode 100644 index 0000000000000..ed642bd6f886c --- /dev/null +++ b/tools/objtool/arch/arm64/special.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +void arch_handle_alternative(unsigned short feature, struct special_alt *alt) +{ + if (alt->orig_len && !alt->new_len) { + /* + * ARM64_CB_PATCH has no alternative instruction. + * a callback is called at alternative replacement time + * to dynamically change the original instructions. + * + * ARM64_CB_PATCH is the last ARM64 feature, it's value changes + * every time a new feature is added. 
So the orig/alt region + * length are used to detect those alternatives + */ + alt->skip_alt = true; + } +} + +bool arch_support_alt_relocation(struct special_alt *special_alt, + struct instruction *insn, + struct reloc *reloc) +{ + u32 opcode = *(u32 *)(insn->sec->data->d_buf + insn->offset); + + return aarch64_insn_is_branch_imm(opcode) || + aarch64_insn_is_adrp(opcode) || + !aarch64_insn_uses_literal(opcode); +} + + +struct reloc *arch_find_switch_table(struct objtool_file *file, + struct instruction *insn) +{ + return NULL; +} diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 5b915ebb61163..d9d9d763e41dd 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -11,23 +11,12 @@ #include "../../../arch/x86/lib/inat.c" #include "../../../arch/x86/lib/insn.c" -#include "../../check.h" -#include "../../elf.h" -#include "../../arch.h" -#include "../../warn.h" #include -#include "arch_elf.h" - -static unsigned char op_to_cfi_reg[][2] = { - {CFI_AX, CFI_R8}, - {CFI_CX, CFI_R9}, - {CFI_DX, CFI_R10}, - {CFI_BX, CFI_R11}, - {CFI_SP, CFI_R12}, - {CFI_BP, CFI_R13}, - {CFI_SI, CFI_R14}, - {CFI_DI, CFI_R15}, -}; +#include "arch/elf.h" +#include +#include +#include +#include static int is_x86_64(const struct elf *elf) { @@ -84,6 +73,31 @@ unsigned long arch_jump_destination(struct instruction *insn) return -1; \ else for (list_add_tail(&op->list, ops_list); op; op = NULL) +/* + * Helpers to decode ModRM/SIB: + * + * r/m| AX CX DX BX | SP | BP | SI DI | + * | R8 R9 R10 R11 | R12 | R13 | R14 R15 | + * Mod+----------------+-----+-----+---------+ + * 00 | [r/m] |[SIB]|[IP+]| [r/m] | + * 01 | [r/m + d8] |[S+d]| [r/m + d8] | + * 10 | [r/m + d32] |[S+D]| [r/m + d32] | + * 11 | r/ m | + */ + +#define mod_is_mem() (modrm_mod != 3) +#define mod_is_reg() (modrm_mod == 3) + +#define is_RIP() ((modrm_rm & 7) == CFI_BP && modrm_mod == 0) +#define have_SIB() ((modrm_rm & 7) == CFI_SP && mod_is_mem()) + +#define rm_is(reg) (have_SIB() ? 
\ + sib_base == (reg) && sib_index == CFI_SP : \ + modrm_rm == (reg)) + +#define rm_is_mem(reg) (mod_is_mem() && !is_RIP() && rm_is(reg)) +#define rm_is_reg(reg) (mod_is_reg() && modrm_rm == (reg)) + int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, @@ -91,12 +105,14 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, struct list_head *ops_list) { struct insn insn; - int x86_64, sign; - unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, - rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, - modrm_reg = 0, sib = 0; + int x86_64; + unsigned char op1, op2, + rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, + modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, + sib = 0, /* sib_scale = 0, */ sib_index = 0, sib_base = 0; struct stack_op *op = NULL; struct symbol *sym; + u64 imm; x86_64 = is_x86_64(elf); if (x86_64 == -1) @@ -130,23 +146,27 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (insn.modrm.nbytes) { modrm = insn.modrm.bytes[0]; modrm_mod = X86_MODRM_MOD(modrm); - modrm_reg = X86_MODRM_REG(modrm); - modrm_rm = X86_MODRM_RM(modrm); + modrm_reg = X86_MODRM_REG(modrm) + 8*rex_r; + modrm_rm = X86_MODRM_RM(modrm) + 8*rex_b; } - if (insn.sib.nbytes) + if (insn.sib.nbytes) { sib = insn.sib.bytes[0]; + /* sib_scale = X86_SIB_SCALE(sib); */ + sib_index = X86_SIB_INDEX(sib) + 8*rex_x; + sib_base = X86_SIB_BASE(sib) + 8*rex_b; + } switch (op1) { case 0x1: case 0x29: - if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { + if (rex_w && rm_is_reg(CFI_SP)) { /* add/sub reg, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_ADD; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG; op->dest.reg = CFI_SP; } @@ -158,7 +178,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* push reg */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->src.reg = (op1 & 0x7) + 8*rex_b; op->dest.type = OP_DEST_PUSH; } @@ -170,7 +190,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, ADD_OP(op) { op->src.type = OP_SRC_POP; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->dest.reg = (op1 & 0x7) + 8*rex_b; } break; @@ -188,12 +208,54 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, *type = INSN_JUMP_CONDITIONAL; break; - case 0x81: - case 0x83: - if (rex != 0x48) + case 0x80 ... 
0x83: + /* + * 1000 00sw : mod OP r/m : immediate + * + * s - sign extend immediate + * w - imm8 / imm32 + * + * OP: 000 ADD 100 AND + * 001 OR 101 SUB + * 010 ADC 110 XOR + * 011 SBB 111 CMP + */ + + /* 64bit only */ + if (!rex_w) + break; + + /* %rsp target only */ + if (!rm_is_reg(CFI_SP)) + break; + + imm = insn.immediate.value; + if (op1 & 2) { /* sign extend */ + if (op1 & 1) { /* imm32 */ + imm <<= 32; + imm = (s64)imm >> 32; + } else { /* imm8 */ + imm <<= 56; + imm = (s64)imm >> 56; + } + } + + switch (modrm_reg & 7) { + case 5: + imm = -imm; + /* fallthrough */ + case 0: + /* add/sub imm, %rsp */ + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_SP; + op->src.offset = imm; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } break; - if (modrm == 0xe4) { + case 4: /* and imm, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_AND; @@ -203,44 +265,62 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->dest.reg = CFI_SP; } break; - } - if (modrm == 0xc4) - sign = 1; - else if (modrm == 0xec) - sign = -1; - else + default: + /* WARN ? */ break; - - /* add/sub imm, %rsp */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_SP; - op->src.offset = insn.immediate.value * sign; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; } + break; case 0x89: - if (rex_w && !rex_r && modrm_mod == 3 && modrm_reg == 4) { + if (!rex_w) + break; - /* mov %rsp, reg */ - ADD_OP(op) { - op->src.type = OP_SRC_REG; - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + if (modrm_reg == CFI_SP) { + + if (mod_is_reg()) { + /* mov %rsp, reg */ + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_SP; + op->dest.type = OP_DEST_REG; + op->dest.reg = modrm_rm; + } + break; + + } else { + /* skip RIP relative displacement */ + if (is_RIP()) + break; + + /* skip nontrivial SIB */ + if (have_SIB()) { + modrm_rm = sib_base; + if (sib_index != CFI_SP) + break; + } + + /* mov %rsp, disp(%reg) */ + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_SP; + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = modrm_rm; + op->dest.offset = insn.displacement.value; + } + break; } + break; } - if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { + if (rm_is_reg(CFI_SP)) { /* mov reg, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG; op->dest.reg = CFI_SP; } @@ -249,34 +329,42 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* fallthrough */ case 0x88: - if (!rex_b && - (modrm_mod == 1 || modrm_mod == 2) && modrm_rm == 5) { + if (!rex_w) + break; + + if (rm_is_mem(CFI_BP)) { /* mov reg, disp(%rbp) */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG_INDIRECT; op->dest.reg = CFI_BP; op->dest.offset = insn.displacement.value; } + break; + } - } else if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { + if (rm_is_mem(CFI_SP)) { /* mov reg, disp(%rsp) */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG_INDIRECT; op->dest.reg = CFI_SP; op->dest.offset = insn.displacement.value; } + break; } break; case 0x8b: - if (rex_w && !rex_b && modrm_mod == 1 && modrm_rm == 5) { + if (!rex_w) + break; + + if (rm_is_mem(CFI_BP)) { /* mov disp(%rbp), reg */ ADD_OP(op) { @@ -284,11 
+372,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.reg = CFI_BP; op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } + break; + } - } else if (rex_w && !rex_b && sib == 0x24 && - modrm_mod != 3 && modrm_rm == 4) { + if (rm_is_mem(CFI_SP)) { /* mov disp(%rsp), reg */ ADD_OP(op) { @@ -296,75 +385,48 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.reg = CFI_SP; op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } + break; } break; case 0x8d: - if (sib == 0x24 && rex_w && !rex_b && !rex_x) { - - ADD_OP(op) { - if (!insn.displacement.value) { - /* lea (%rsp), reg */ - op->src.type = OP_SRC_REG; - } else { - /* lea disp(%rsp), reg */ - op->src.type = OP_SRC_ADD; - op->src.offset = insn.displacement.value; - } - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; - } - - } else if (rex == 0x48 && modrm == 0x65) { - - /* lea disp(%rbp), %rsp */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_BP; - op->src.offset = insn.displacement.value; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; - } + if (mod_is_reg()) { + WARN("invalid LEA encoding at %s:0x%lx", sec->name, offset); + break; + } - } else if (rex == 0x49 && modrm == 0x62 && - insn.displacement.value == -8) { + /* skip non 64bit ops */ + if (!rex_w) + break; - /* - * lea -0x8(%r10), %rsp - * - * Restoring rsp back to its original value after a - * stack realignment. - */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R10; - op->src.offset = -8; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; - } + /* skip RIP relative displacement */ + if (is_RIP()) + break; - } else if (rex == 0x49 && modrm == 0x65 && - insn.displacement.value == -16) { + /* skip nontrivial SIB */ + if (have_SIB()) { + modrm_rm = sib_base; + if (sib_index != CFI_SP) + break; + } - /* - * lea -0x10(%r13), %rsp - * - * Restoring rsp back to its original value after a - * stack realignment. 
- */ - ADD_OP(op) { + /* lea disp(%src), %dst */ + ADD_OP(op) { + op->src.offset = insn.displacement.value; + if (!op->src.offset) { + /* lea (%src), %dst */ + op->src.type = OP_SRC_REG; + } else { + /* lea disp(%src), %dst */ op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R13; - op->src.offset = -16; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; } + op->src.reg = modrm_rm; + op->dest.type = OP_DEST_REG; + op->dest.reg = modrm_reg; } - break; case 0x8f: @@ -555,6 +617,11 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, return 0; } +int arch_post_process_instructions(struct objtool_file *file) +{ + return 0; +} + void arch_initial_func_cfi_state(struct cfi_init_state *state) { int i; diff --git a/tools/objtool/arch/x86/include/cfi_regs.h b/tools/objtool/arch/x86/include/arch/cfi_regs.h similarity index 77% rename from tools/objtool/arch/x86/include/cfi_regs.h rename to tools/objtool/arch/x86/include/arch/cfi_regs.h index 79bc517efba85..0579d22c433cd 100644 --- a/tools/objtool/arch/x86/include/cfi_regs.h +++ b/tools/objtool/arch/x86/include/arch/cfi_regs.h @@ -4,13 +4,13 @@ #define _OBJTOOL_CFI_REGS_H #define CFI_AX 0 -#define CFI_DX 1 -#define CFI_CX 2 +#define CFI_CX 1 +#define CFI_DX 2 #define CFI_BX 3 -#define CFI_SI 4 -#define CFI_DI 5 -#define CFI_BP 6 -#define CFI_SP 7 +#define CFI_SP 4 +#define CFI_BP 5 +#define CFI_SI 6 +#define CFI_DI 7 #define CFI_R8 8 #define CFI_R9 9 #define CFI_R10 10 diff --git a/tools/objtool/arch/x86/include/arch_elf.h b/tools/objtool/arch/x86/include/arch/elf.h similarity index 100% rename from tools/objtool/arch/x86/include/arch_elf.h rename to tools/objtool/arch/x86/include/arch/elf.h diff --git a/tools/objtool/arch/x86/include/arch/endianness.h b/tools/objtool/arch/x86/include/arch/endianness.h new file mode 100644 index 0000000000000..7c362527da205 --- /dev/null +++ b/tools/objtool/arch/x86/include/arch/endianness.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ARCH_ENDIANNESS_H +#define _ARCH_ENDIANNESS_H + +#include + +#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN + +#endif /* _ARCH_ENDIANNESS_H */ diff --git a/tools/objtool/arch/x86/include/arch_special.h b/tools/objtool/arch/x86/include/arch/special.h similarity index 100% rename from tools/objtool/arch/x86/include/arch_special.h rename to tools/objtool/arch/x86/include/arch/special.h diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 151b13d0a2676..e707d9bcd1616 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include "../../special.h" -#include "../../builtin.h" +#include +#include #define X86_FEATURE_POPCNT (4 * 32 + 23) #define X86_FEATURE_SMAP (9 * 32 + 20) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 447a49c03abb3..35081fe373203 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -15,17 +15,23 @@ #include #include -#include "builtin.h" -#include "objtool.h" +#include +#include +#include bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, - validate_dup, vmlinux, sls, unret, rethunk; + validate_dup, vmlinux, mcount, noinstr, backup, sls, unret, rethunk; static const char * const check_usage[] = { "objtool check [] file.o", NULL, }; +static const char * const env_usage[] = { + "OBJTOOL_ARGS=\"\"", + NULL, +}; + const struct option check_options[] = { OPT_BOOLEAN('f', "no-fp", &no_fp, "Skip frame 
pointer validation"), OPT_BOOLEAN('u', "no-unreachable", &no_unreachable, "Skip 'unreachable instruction' warnings"), @@ -37,27 +43,49 @@ const struct option check_options[] = { OPT_BOOLEAN('a', "uaccess", &uaccess, "enable uaccess checking"), OPT_BOOLEAN('s', "stats", &stats, "print statistics"), OPT_BOOLEAN('d', "duplicate", &validate_dup, "duplicate validation for vmlinux.o"), + OPT_BOOLEAN('n', "noinstr", &noinstr, "noinstr validation for vmlinux.o"), OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"), + OPT_BOOLEAN('M', "mcount", &mcount, "generate __mcount_loc"), + OPT_BOOLEAN('B', "backup", &backup, "create .orig files before modification"), OPT_BOOLEAN('S', "sls", &sls, "validate straight-line-speculation"), OPT_END(), }; -int cmd_check(int argc, const char **argv) +int cmd_parse_options(int argc, const char **argv, const char * const usage[]) { - const char *objname, *s; - struct objtool_file *file; - int ret; + const char *envv[16] = { }; + char *env; + int envc; + + env = getenv("OBJTOOL_ARGS"); + if (env) { + envv[0] = "OBJTOOL_ARGS"; + for (envc = 1; envc < ARRAY_SIZE(envv); ) { + envv[envc++] = env; + env = strchr(env, ' '); + if (!env) + break; + *env = '\0'; + env++; + } - argc = parse_options(argc, argv, check_options, check_usage, 0); + parse_options(envc, envv, check_options, env_usage, 0); + } + argc = parse_options(argc, argv, check_options, usage, 0); if (argc != 1) - usage_with_options(check_usage, check_options); + usage_with_options(usage, check_options); + return argc; +} - objname = argv[0]; +int cmd_check(int argc, const char **argv) +{ + const char *objname; + struct objtool_file *file; + int ret; - s = strstr(objname, "vmlinux.o"); - if (s && !s[9]) - vmlinux = true; + argc = cmd_parse_options(argc, argv, check_usage); + objname = argv[0]; file = objtool_open_read(objname); if (!file) diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c index 508bdf6ae8dc6..17f8b93077381 100644 --- a/tools/objtool/builtin-orc.c +++ b/tools/objtool/builtin-orc.c @@ -13,8 +13,8 @@ */ #include -#include "builtin.h" -#include "objtool.h" +#include +#include static const char *orc_usage[] = { "objtool orc generate [] file.o", @@ -34,10 +34,7 @@ int cmd_orc(int argc, const char **argv) struct objtool_file *file; int ret; - argc = parse_options(argc, argv, check_options, orc_usage, 0); - if (argc != 1) - usage_with_options(orc_usage, check_options); - + argc = cmd_parse_options(argc, argv, orc_usage); objname = argv[0]; file = objtool_open_read(objname); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 9a0a54194636c..2a6c9dfc43491 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -8,13 +8,14 @@ #include #include -#include "builtin.h" -#include "cfi.h" -#include "arch.h" -#include "check.h" -#include "special.h" -#include "warn.h" -#include "arch_elf.h" +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -269,7 +270,7 @@ static void init_insn_state(struct insn_state *state, struct section *sec) * not correctly determine insn->call_dest->sec (external symbols do * not have a section). 
*/ - if (vmlinux && sec) + if (vmlinux && noinstr && sec) state->noinstr = sec->noinstr; } @@ -353,7 +354,7 @@ static int decode_instructions(struct objtool_file *file) { struct section *sec; struct symbol *func; - unsigned long offset; + unsigned long offset, next_offset; struct instruction *insn; int ret; @@ -372,7 +373,14 @@ static int decode_instructions(struct objtool_file *file) !strncmp(sec->name, ".text.__x86.", 12)) sec->noinstr = true; - for (offset = 0; offset < sec->len; offset += insn->len) { + for (offset = 0; offset < sec->len; offset = next_offset) { + struct symbol *obj_sym = find_object_containing(sec, offset); + if (obj_sym) { + /* This is data in the middle of text section, skip it */ + next_offset = obj_sym->offset + obj_sym->len; + continue; + } + insn = malloc(sizeof(*insn)); if (!insn) { WARN("malloc failed"); @@ -396,6 +404,8 @@ static int decode_instructions(struct objtool_file *file) hash_add(file->insn_hash, &insn->hash, sec_offset_hash(sec, insn->offset)); list_add_tail(&insn->list, &file->insn_list); nr_insns++; + + next_offset = offset + insn->len; } list_for_each_entry(func, &sec->symbol_list, list) { @@ -416,6 +426,9 @@ static int decode_instructions(struct objtool_file *file) if (stats) printf("nr_insns: %lu\n", nr_insns); + if (arch_post_process_instructions(file)) + return -1; + return 0; err: @@ -704,6 +717,68 @@ static int create_return_sites_sections(struct objtool_file *file) return 0; } +static int create_mcount_loc_sections(struct objtool_file *file) +{ + struct section *sec, *reloc_sec; + unsigned long *loc; + struct instruction *insn; + int idx; + + sec = find_section_by_name(file->elf, "__mcount_loc"); + if (sec) { + INIT_LIST_HEAD(&file->mcount_loc_list); + WARN("file already has __mcount_loc section, skipping"); + return 0; + } + + if (list_empty(&file->mcount_loc_list)) + return 0; + + idx = 0; + list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) + idx++; + + sec = elf_create_section(file->elf, "__mcount_loc", 0, sizeof(unsigned long), idx); + if (!sec) + return -1; + + reloc_sec = elf_create_reloc_section(file->elf, sec, SHT_RELA); + if (!reloc_sec) + return -1; + + idx = 0; + list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) { + struct symbol *sym; + int addend; + + loc = (unsigned long *)sec->data->d_buf + idx; + memset(loc, 0, sizeof(unsigned long)); + + if (insn->sec->sym) { + sym = insn->sec->sym; + addend = insn->offset; + } else { + sym = find_symbol_containing(insn->sec, insn->offset); + + if (!sym) { + WARN("missing symbol for insn at offset 0x%lx\n", + insn->offset); + return -1; + } + + addend = insn->offset - sym->offset; + } + + if (elf_add_reloc(file->elf, sec, idx * sizeof(unsigned long), + R_X86_64_64, sym, addend)) + return -1; + + idx++; + } + + return 0; +} + /* * Warnings shouldn't be reported for ignored functions. 
*/ @@ -1284,6 +1359,22 @@ static int add_call_destinations(struct objtool_file *file) } else add_call_dest(file, insn, reloc->sym, false); + + if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) { + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } + + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + arch_nop_insn(insn->len)); + + insn->type = INSN_NOP; + + list_add_tail(&insn->mcount_loc_node, + &file->mcount_loc_list); + } } return 0; @@ -1501,6 +1592,9 @@ static int add_special_section_alts(struct objtool_file *file) continue; } + if (special_alt->skip_alt && !special_alt->new_len) + continue; + ret = handle_group_alt(file, special_alt, orig_insn, &new_insn); if (ret) @@ -1806,7 +1900,7 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - cfi.cfa.offset = hint->sp_offset; + cfi.cfa.offset = bswap_if_needed(hint->sp_offset); cfi.type = hint->type; cfi.end = hint->end; @@ -2112,12 +2206,20 @@ static bool has_modified_stack_frame(struct instruction *insn, struct insn_state return false; } +static bool check_reg_frame_pos(const struct cfi_reg *reg, + int expected_offset) +{ + return reg->base == CFI_CFA && + reg->offset == expected_offset; +} + static bool has_valid_stack_frame(struct insn_state *state) { struct cfi_state *cfi = &state->cfi; - if (cfi->cfa.base == CFI_BP && cfi->regs[CFI_BP].base == CFI_CFA && - cfi->regs[CFI_BP].offset == -16) + if (cfi->cfa.base == CFI_BP && + check_reg_frame_pos(&cfi->regs[CFI_BP], -cfi->cfa.offset) && + check_reg_frame_pos(&cfi->regs[CFI_RA], -cfi->cfa.offset + 8)) return true; if (cfi->drap && cfi->regs[CFI_BP].base == CFI_BP) @@ -2219,8 +2321,9 @@ static void restore_reg(struct cfi_state *cfi, unsigned char reg) * 41 5d pop %r13 * c3 retq */ -static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, - struct stack_op *op) +static int update_cfi_state(struct instruction *insn, + struct instruction *next_insn, + struct cfi_state *cfi, struct stack_op *op) { struct cfi_reg *cfa = &cfi->cfa; struct cfi_reg *regs = cfi->regs; @@ -2246,8 +2349,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, case OP_SRC_REG: if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP && cfa->base == CFI_SP && - regs[CFI_BP].base == CFI_CFA && - regs[CFI_BP].offset == -cfa->offset) { + check_reg_frame_pos(®s[CFI_BP], -cfa->offset)) { /* mov %rsp, %rbp */ cfa->base = op->dest.reg; @@ -2307,12 +2409,58 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, cfa->offset = -cfi->vals[op->src.reg].offset; cfi->stack_size = cfa->offset; + } else if (cfa->base == CFI_SP && + cfi->vals[op->src.reg].base == CFI_SP_INDIRECT && + cfi->vals[op->src.reg].offset == cfa->offset) { + + /* + * Stack swizzle: + * + * 1: mov %rsp, (%[tos]) + * 2: mov %[tos], %rsp + * ... + * 3: pop %rsp + * + * Where: + * + * 1 - places a pointer to the previous + * stack at the Top-of-Stack of the + * new stack. + * + * 2 - switches to the new stack. + * + * 3 - pops the Top-of-Stack to restore + * the original stack. + * + * Note: we set base to SP_INDIRECT + * here and preserve offset. Therefore + * when the unwinder reaches ToS it + * will dereference SP and then add the + * offset to find the next frame, IOW: + * (%rsp) + offset. 
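
check_reg_frame_pos() above just spells out where a register must have been saved relative to the CFA, and has_valid_stack_frame() now uses it for both the saved frame pointer and the return address. A worked example for the usual x86-64 prologue (the concrete offsets are the standard frame layout, not something this hunk defines):

    call  func          # RA pushed:  CFA = %rsp + 8,  RA        at CFA - 8
    push  %rbp          # RBP saved:  CFA = %rsp + 16, saved RBP at CFA - 16
    mov   %rsp, %rbp    # CFA becomes %rbp + 16; both saved slots keep their offsets

With cfa->offset == 16 these are exactly the -cfa->offset and -cfa->offset + 8 positions the new checks expect.
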
+ */ + cfa->base = CFI_SP_INDIRECT; + } else { cfa->base = CFI_UNDEFINED; cfa->offset = 0; } } + else if (op->dest.reg == CFI_SP && + cfi->vals[op->src.reg].base == CFI_SP_INDIRECT && + cfi->vals[op->src.reg].offset == cfa->offset) { + + /* + * The same stack swizzle case 2) as above. But + * because we can't change cfa->base, case 3) + * will become a regular POP. Pretend we're a + * PUSH so things don't go unbalanced. + */ + cfi->stack_size += 8; + } + + break; case OP_SRC_ADD: @@ -2332,6 +2480,17 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; } + if (!cfi->drap && op->src.reg == CFI_SP && + op->dest.reg == CFI_BP && cfa->base == CFI_SP && + check_reg_frame_pos(®s[CFI_BP], -cfa->offset + op->src.offset)) { + + /* lea disp(%rsp), %rbp */ + cfa->base = CFI_BP; + cfa->offset -= op->src.offset; + cfi->bp_scratch = false; + break; + } + if (op->src.reg == CFI_SP && cfa->base == CFI_SP) { /* drap: lea disp(%rsp), %drap */ @@ -2365,7 +2524,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; } - if (op->dest.reg == cfi->cfa.base) { + if (op->dest.reg == cfi->cfa.base && !(next_insn && next_insn->hint)) { WARN_FUNC("unsupported stack register modification", insn->sec, insn->offset); return -1; @@ -2398,6 +2557,13 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, case OP_SRC_POP: case OP_SRC_POPF: + if (op->dest.reg == CFI_SP && cfa->base == CFI_SP_INDIRECT) { + + /* pop %rsp; # restore from a stack swizzle */ + cfa->base = CFI_SP; + break; + } + if (!cfi->drap && op->dest.reg == cfa->base) { /* pop %rbp */ @@ -2426,6 +2592,14 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; case OP_SRC_REG_INDIRECT: + if (!cfi->drap && op->dest.reg == cfa->base && + op->dest.reg == CFI_BP) { + + /* mov disp(%rsp), %rbp */ + cfa->base = CFI_SP; + cfa->offset = cfi->stack_size; + } + if (cfi->drap && op->src.reg == CFI_BP && op->src.offset == cfi->drap_offset) { @@ -2447,6 +2621,12 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* mov disp(%rbp), %reg */ /* mov disp(%rsp), %reg */ restore_reg(cfi, op->dest.reg); + + } else if (op->src.reg == CFI_SP && + op->src.offset == regs[op->dest.reg].offset + cfi->stack_size) { + + /* mov disp(%rsp), %reg */ + restore_reg(cfi, op->dest.reg); } break; @@ -2524,6 +2704,18 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* mov reg, disp(%rsp) */ save_reg(cfi, op->src.reg, CFI_CFA, op->dest.offset - cfi->cfa.offset); + + } else if (op->dest.reg == CFI_SP) { + + /* mov reg, disp(%rsp) */ + save_reg(cfi, op->src.reg, CFI_CFA, + op->dest.offset - cfi->stack_size); + + } else if (op->src.reg == CFI_SP && op->dest.offset == 0) { + + /* mov %rsp, (%reg); # setup a stack swizzle. 
*/ + cfi->vals[op->dest.reg].base = CFI_SP_INDIRECT; + cfi->vals[op->dest.reg].offset = cfa->offset; } break; @@ -2609,15 +2801,20 @@ static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn return 0; } -static int handle_insn_ops(struct instruction *insn, struct insn_state *state) +static int handle_insn_ops(struct instruction *insn, + struct instruction *next_insn, + struct insn_state *state) { struct stack_op *op; list_for_each_entry(op, &insn->stack_ops, list) { - if (update_cfi_state(insn, &state->cfi, op)) + if (update_cfi_state(insn, next_insn, &state->cfi, op)) return 1; + if (!insn->alt_group) + continue; + if (op->dest.type == OP_DEST_PUSHF) { if (!state->uaccess_stack) { state->uaccess_stack = 1; @@ -2939,7 +3136,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 0; } - if (handle_insn_ops(insn, &state)) + if (handle_insn_ops(insn, next_insn, &state)) return 1; switch (insn->type) { @@ -3554,6 +3751,13 @@ int check(struct objtool_file *file) warnings += ret; } + if (mcount) { + ret = create_mcount_loc_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } + if (rethunk) { ret = create_return_sites_sections(file); if (ret < 0) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index a2ea3931e01d5..c7326b1505c2b 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -15,10 +15,10 @@ #include #include #include -#include "builtin.h" +#include -#include "elf.h" -#include "warn.h" +#include +#include #define MAX_NAME_LEN 128 @@ -43,75 +43,24 @@ static void elf_hash_init(struct hlist_head *table) #define elf_hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, elf_hash_bits())], member) -static void rb_add(struct rb_root *tree, struct rb_node *node, - int (*cmp)(struct rb_node *, const struct rb_node *)) -{ - struct rb_node **link = &tree->rb_node; - struct rb_node *parent = NULL; - - while (*link) { - parent = *link; - if (cmp(node, parent) < 0) - link = &parent->rb_left; - else - link = &parent->rb_right; - } - - rb_link_node(node, parent, link); - rb_insert_color(node, tree); -} - -static struct rb_node *rb_find_first(const struct rb_root *tree, const void *key, - int (*cmp)(const void *key, const struct rb_node *)) -{ - struct rb_node *node = tree->rb_node; - struct rb_node *match = NULL; - - while (node) { - int c = cmp(key, node); - if (c <= 0) { - if (!c) - match = node; - node = node->rb_left; - } else if (c > 0) { - node = node->rb_right; - } - } - - return match; -} - -static struct rb_node *rb_next_match(struct rb_node *node, const void *key, - int (*cmp)(const void *key, const struct rb_node *)) -{ - node = rb_next(node); - if (node && cmp(key, node)) - node = NULL; - return node; -} - -#define rb_for_each(tree, node, key, cmp) \ - for ((node) = rb_find_first((tree), (key), (cmp)); \ - (node); (node) = rb_next_match((node), (key), (cmp))) - -static int symbol_to_offset(struct rb_node *a, const struct rb_node *b) +static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b) { struct symbol *sa = rb_entry(a, struct symbol, node); struct symbol *sb = rb_entry(b, struct symbol, node); if (sa->offset < sb->offset) - return -1; + return true; if (sa->offset > sb->offset) - return 1; + return false; if (sa->len < sb->len) - return -1; + return true; if (sa->len > sb->len) - return 1; + return false; sa->alias = sb; - return 0; + return false; } static int symbol_by_offset(const void *key, const struct rb_node *node) @@ -165,7 +114,7 @@ struct 
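
The hand-rolled rb_add()/rb_find_first()/rb_next_match() helpers above are dropped in favor of generic ones, presumably the shared <linux/rbtree.h> copy under tools/, which is why symbol_to_offset() becomes a boolean "less than" predicate while symbol_by_offset() stays an int comparator for keyed lookups. A minimal sketch of that calling convention (the 'less' helper is illustrative, not part of the patch):

    /* insertion takes a bool less(new, existing) predicate */
    static bool less(struct rb_node *a, const struct rb_node *b)
    {
            return rb_entry(a, struct symbol, node)->offset <
                   rb_entry(b, struct symbol, node)->offset;
    }

    rb_add(&sym->node, &tree, less);

    /* keyed iteration still takes an int cmp(key, node) comparator */
    rb_for_each(node, &offset, &tree, symbol_by_offset);
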
symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) { struct rb_node *node; - rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node); if (s->offset == offset && s->type != STT_SECTION) @@ -179,7 +128,7 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) { struct rb_node *node; - rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node); if (s->offset == offset && s->type == STT_FUNC) @@ -193,7 +142,7 @@ struct symbol *find_symbol_containing(const struct section *sec, unsigned long o { struct rb_node *node; - rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node); if (s->type != STT_SECTION) @@ -207,7 +156,7 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset) { struct rb_node *node; - rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node); if (s->type == STT_FUNC) @@ -217,6 +166,20 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset) return NULL; } +struct symbol *find_object_containing(struct section *sec, unsigned long offset) +{ + struct rb_node *node; + + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { + struct symbol *s = rb_entry(node, struct symbol, node); + + if (s->type == STT_OBJECT) + return s; + } + + return NULL; +} + struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) { struct symbol *sym; @@ -354,7 +317,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) sym->offset = sym->sym.st_value; sym->len = sym->sym.st_size; - rb_add(&sym->sec->symbol_tree, &sym->node, symbol_to_offset); + rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); pnode = rb_prev(&sym->node); if (pnode) entry = &rb_entry(pnode, struct symbol, node)->list; @@ -505,10 +468,6 @@ static int read_symbols(struct elf *elf) return -1; } -static struct section *elf_create_reloc_section(struct elf *elf, - struct section *base, - int reltype); - int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct symbol *sym, s64 addend) { @@ -1118,7 +1077,7 @@ static struct section *elf_create_rela_reloc_section(struct elf *elf, struct sec return sec; } -static struct section *elf_create_reloc_section(struct elf *elf, +struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype) { @@ -1133,25 +1092,27 @@ static int elf_rebuild_rel_reloc_section(struct section *sec, int nr) { struct reloc *reloc; int idx = 0, size; - GElf_Rel *relocs; + void *buf; /* Allocate a buffer for relocations */ - size = nr * sizeof(*relocs); - relocs = malloc(size); - if (!relocs) { + size = nr * sizeof(GElf_Rel); + buf = malloc(size); + if (!buf) { perror("malloc"); return -1; } - sec->data->d_buf = relocs; + sec->data->d_buf = buf; sec->data->d_size = size; + sec->data->d_type = ELF_T_REL; sec->sh.sh_size = size; idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { - relocs[idx].r_offset = reloc->offset; - relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + 
reloc->rel.r_offset = reloc->offset; + reloc->rel.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + gelf_update_rel(sec->data, idx, &reloc->rel); idx++; } @@ -1162,26 +1123,28 @@ static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) { struct reloc *reloc; int idx = 0, size; - GElf_Rela *relocs; + void *buf; /* Allocate a buffer for relocations with addends */ - size = nr * sizeof(*relocs); - relocs = malloc(size); - if (!relocs) { + size = nr * sizeof(GElf_Rela); + buf = malloc(size); + if (!buf) { perror("malloc"); return -1; } - sec->data->d_buf = relocs; + sec->data->d_buf = buf; sec->data->d_size = size; + sec->data->d_type = ELF_T_RELA; sec->sh.sh_size = size; idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { - relocs[idx].r_offset = reloc->offset; - relocs[idx].r_addend = reloc->addend; - relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + reloc->rela.r_offset = reloc->offset; + reloc->rela.r_addend = reloc->addend; + reloc->rela.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + gelf_update_rela(sec->data, idx, &reloc->rela); idx++; } diff --git a/tools/objtool/arch.h b/tools/objtool/include/objtool/arch.h similarity index 92% rename from tools/objtool/arch.h rename to tools/objtool/include/objtool/arch.h index 580ce18575857..646944d2235ce 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -8,8 +8,8 @@ #include #include -#include "objtool.h" -#include "cfi.h" +#include +#include enum insn_type { INSN_JUMP_CONDITIONAL, @@ -67,6 +67,7 @@ struct stack_op { struct list_head list; }; +struct objtool_file; struct instruction; void arch_initial_func_cfi_state(struct cfi_init_state *state); @@ -77,6 +78,8 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long *immediate, struct list_head *ops_list); +int arch_post_process_instructions(struct objtool_file *file); + bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); diff --git a/tools/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h similarity index 72% rename from tools/objtool/builtin.h rename to tools/objtool/include/objtool/builtin.h index 61d8d49dbc657..66ad30ec58182 100644 --- a/tools/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -9,7 +9,9 @@ extern const struct option check_options[]; extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, - validate_dup, vmlinux, sls, unret, rethunk; + validate_dup, vmlinux, mcount, noinstr, backup, sls, unret, rethunk; + +extern int cmd_parse_options(int argc, const char **argv, const char * const usage[]); extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); diff --git a/tools/objtool/cfi.h b/tools/objtool/include/objtool/cfi.h similarity index 96% rename from tools/objtool/cfi.h rename to tools/objtool/include/objtool/cfi.h index f579802d7ec24..f11d1ac1dadf1 100644 --- a/tools/objtool/cfi.h +++ b/tools/objtool/include/objtool/cfi.h @@ -6,7 +6,7 @@ #ifndef _OBJTOOL_CFI_H #define _OBJTOOL_CFI_H -#include "cfi_regs.h" +#include #include #define CFI_UNDEFINED -1 diff --git a/tools/objtool/check.h b/tools/objtool/include/objtool/check.h similarity index 96% rename from tools/objtool/check.h rename to tools/objtool/include/objtool/check.h index 7f34a7f9ca523..f704b3760dfbb 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -7,8 +7,8 @@ #define _CHECK_H #include -#include "cfi.h" -#include "arch.h" 
+#include +#include struct insn_state { struct cfi_state cfi; @@ -40,6 +40,7 @@ struct instruction { struct list_head list; struct hlist_node hash; struct list_head call_node; + struct list_head mcount_loc_node; struct section *sec; unsigned long offset; unsigned int len; diff --git a/tools/objtool/elf.h b/tools/objtool/include/objtool/elf.h similarity index 96% rename from tools/objtool/elf.h rename to tools/objtool/include/objtool/elf.h index a1863eb35fbbc..c3b8e67a72b64 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -126,6 +126,7 @@ static inline u32 reloc_hash(struct reloc *reloc) struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr); +struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype); int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct symbol *sym, s64 addend); @@ -149,6 +150,7 @@ struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, uns struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section *sec, unsigned long offset); +struct symbol *find_object_containing(struct section *sec, unsigned long offset); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) diff --git a/tools/objtool/include/objtool/endianness.h b/tools/objtool/include/objtool/endianness.h new file mode 100644 index 0000000000000..10241341eff35 --- /dev/null +++ b/tools/objtool/include/objtool/endianness.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_ENDIANNESS_H +#define _OBJTOOL_ENDIANNESS_H + +#include +#include +#include + +#ifndef __TARGET_BYTE_ORDER +#error undefined arch __TARGET_BYTE_ORDER +#endif + +#if __BYTE_ORDER != __TARGET_BYTE_ORDER +#define __NEED_BSWAP 1 +#else +#define __NEED_BSWAP 0 +#endif + +/* + * Does a byte swap if target endianness doesn't match the host, i.e. cross + * compilation for little endian on big endian and vice versa. + * To be used for multi-byte values conversion, which are read from / about + * to be written to a target native endianness ELF file. + */ +#define bswap_if_needed(val) \ +({ \ + __typeof__(val) __ret; \ + switch (sizeof(val)) { \ + case 8: __ret = __NEED_BSWAP ? bswap_64(val) : (val); break; \ + case 4: __ret = __NEED_BSWAP ? bswap_32(val) : (val); break; \ + case 2: __ret = __NEED_BSWAP ? 
bswap_16(val) : (val); break; \ + default: \ + BUILD_BUG(); break; \ + } \ + __ret; \ +}) + +#endif /* _OBJTOOL_ENDIANNESS_H */ diff --git a/tools/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h similarity index 92% rename from tools/objtool/objtool.h rename to tools/objtool/include/objtool/objtool.h index bf64946e749bc..1e1147c0eaffd 100644 --- a/tools/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -10,7 +10,7 @@ #include #include -#include "elf.h" +#include #define __weak __attribute__((weak)) @@ -21,6 +21,7 @@ struct objtool_file { struct list_head retpoline_call_list; struct list_head return_thunk_list; struct list_head static_call_list; + struct list_head mcount_loc_list; bool ignore_unreachables, c_file, hints, rodata; }; diff --git a/tools/objtool/special.h b/tools/objtool/include/objtool/special.h similarity index 94% rename from tools/objtool/special.h rename to tools/objtool/include/objtool/special.h index abddf38ef3346..8a09f4e9d480e 100644 --- a/tools/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -7,8 +7,8 @@ #define _SPECIAL_H #include -#include "check.h" -#include "elf.h" +#include +#include #define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" diff --git a/tools/objtool/warn.h b/tools/objtool/include/objtool/warn.h similarity index 98% rename from tools/objtool/warn.h rename to tools/objtool/include/objtool/warn.h index 7799f60de80af..d99c4675e4a5f 100644 --- a/tools/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -11,7 +11,7 @@ #include #include #include -#include "elf.h" +#include extern const char *objname; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index cb2c6acd9667f..24650d533d85c 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -17,13 +17,14 @@ #include #include #include +#include #include #include #include -#include "builtin.h" -#include "objtool.h" -#include "warn.h" +#include +#include +#include struct cmd_struct { const char *name; @@ -44,6 +45,64 @@ bool help; const char *objname; static struct objtool_file file; +static bool objtool_create_backup(const char *_objname) +{ + int len = strlen(_objname); + char *buf, *base, *name = malloc(len+6); + int s, d, l, t; + + if (!name) { + perror("failed backup name malloc"); + return false; + } + + strcpy(name, _objname); + strcpy(name + len, ".orig"); + + d = open(name, O_CREAT|O_WRONLY|O_TRUNC, 0644); + if (d < 0) { + perror("failed to create backup file"); + return false; + } + + s = open(_objname, O_RDONLY); + if (s < 0) { + perror("failed to open orig file"); + return false; + } + + buf = malloc(4096); + if (!buf) { + perror("failed backup data malloc"); + return false; + } + + while ((l = read(s, buf, 4096)) > 0) { + base = buf; + do { + t = write(d, base, l); + if (t < 0) { + perror("failed backup write"); + return false; + } + base += t; + l -= t; + } while (l); + } + + if (l < 0) { + perror("failed backup read"); + return false; + } + + free(name); + free(buf); + close(d); + close(s); + + return true; +} + struct objtool_file *objtool_open_read(const char *_objname) { if (objname) { @@ -59,11 +118,17 @@ struct objtool_file *objtool_open_read(const char *_objname) if (!file.elf) return NULL; + if (backup && !objtool_create_backup(objname)) { + WARN("can't create backup file"); + return NULL; + } + INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); INIT_LIST_HEAD(&file.retpoline_call_list); INIT_LIST_HEAD(&file.return_thunk_list); INIT_LIST_HEAD(&file.static_call_list); + 
INIT_LIST_HEAD(&file.mcount_loc_list); file.c_file = !vmlinux && find_section_by_name(file.elf, ".comment"); file.ignore_unreachables = no_unreachable; file.hints = false; diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index 5e6a95368d351..c53fae9dbe93b 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -6,8 +6,9 @@ #include #include #include -#include "objtool.h" -#include "warn.h" +#include +#include +#include static const char *reg_name(unsigned int reg) { @@ -197,11 +198,11 @@ int orc_dump(const char *_objname) printf(" sp:"); - print_reg(orc[i].sp_reg, orc[i].sp_offset); + print_reg(orc[i].sp_reg, bswap_if_needed(orc[i].sp_offset)); printf(" bp:"); - print_reg(orc[i].bp_reg, orc[i].bp_offset); + print_reg(orc[i].bp_reg, bswap_if_needed(orc[i].bp_offset)); printf(" type:%s end:%d\n", orc_type_name(orc[i].type), orc[i].end); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 812b33ed9f652..ddacb42157485 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -9,8 +9,9 @@ #include #include -#include "check.h" -#include "warn.h" +#include +#include +#include static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) @@ -96,6 +97,8 @@ static int write_orc_entry(struct elf *elf, struct section *orc_sec, /* populate ORC data */ orc = (struct orc_entry *)orc_sec->data->d_buf + idx; memcpy(orc, o, sizeof(*orc)); + orc->sp_offset = bswap_if_needed(orc->sp_offset); + orc->bp_offset = bswap_if_needed(orc->bp_offset); /* populate reloc for ip */ if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_X86_64_PC32, diff --git a/tools/objtool/special.c b/tools/objtool/special.c index aff0cee7bac17..603ce9f2c4b10 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -11,10 +11,11 @@ #include #include -#include "builtin.h" -#include "special.h" -#include "warn.h" -#include "arch_special.h" +#include +#include +#include +#include +#include struct special_entry { const char *sec; @@ -84,8 +85,9 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, if (entry->feature) { unsigned short feature; - feature = *(unsigned short *)(sec->data->d_buf + offset + - entry->feature); + feature = bswap_if_needed(*(unsigned short *)(sec->data->d_buf + + offset + + entry->feature)); arch_handle_alternative(feature, alt); } diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index 4bbabaecab14e..bdcc443ea0942 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -24,6 +24,14 @@ arch/x86/include/asm/insn.h arch/x86/lib/inat.c arch/x86/lib/insn.c ' +elif [ "$SRCARCH" = "arm64" ]; then +FILES="$FILES +arch/arm64/include/asm/insn.h +" + +SYNC_CHECK_FILES=' +arch/arm64/lib/insn.c +' fi check_2 () { @@ -67,7 +75,7 @@ done < #include -#include "objtool.h" +#include #define UNSUPPORTED(name) \ ({ \ diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 12ee40284da02..2060bc122c530 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -40,7 +40,7 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; -static int settimeo(int fd, int timeout_ms) +int settimeo(int fd, int timeout_ms) { struct timeval timeout = { .tv_sec = 3 }; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 7205f8afdba11..5e0d51c07b632 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ 
b/tools/testing/selftests/bpf/network_helpers.h @@ -33,6 +33,7 @@ struct ipv6_packet { } __packed; extern struct ipv6_packet pkt_v6; +int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); int connect_to_fd(int server_fd, int timeout_ms); diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index b58b775d19f3f..97f38d4f6a263 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -6,6 +6,7 @@ #include #include "test_ksyms_btf.skel.h" #include "test_ksyms_btf_null_check.skel.h" +#include "test_ksyms_btf_write_check.skel.h" static int duration; @@ -81,6 +82,16 @@ static void test_null_check(void) test_ksyms_btf_null_check__destroy(skel); } +static void test_write_check(void) +{ + struct test_ksyms_btf_write_check *skel; + + skel = test_ksyms_btf_write_check__open_and_load(); + CHECK(skel, "skel_open", "unexpected load of a prog writing to ksym memory\n"); + + test_ksyms_btf_write_check__destroy(skel); +} + void test_ksyms_btf(void) { int percpu_datasec; @@ -106,4 +117,7 @@ void test_ksyms_btf(void) if (test__start_subtest("null_check")) test_null_check(); + + if (test__start_subtest("write_check")) + test_write_check(); } diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c new file mode 100644 index 0000000000000..0fa3f750567de --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check if we can migrate child sockets. + * + * 1. call listen() for 4 server sockets. + * 2. call connect() for 25 client sockets. + * 3. call listen() for 1 server socket. (migration target) + * 4. update a map to migrate all child sockets + * to the last server socket (migrate_map[cookie] = 4) + * 5. call shutdown() for first 4 server sockets + * and migrate the requests in the accept queue + * to the last server socket. + * 6. call listen() for the second server socket. + * 7. call shutdown() for the last server + * and migrate the requests in the accept queue + * to the second server socket. + * 8. call listen() for the last server. + * 9. call shutdown() for the second server + * and migrate the requests in the accept queue + * to the last server socket. + * 10. call accept() for the last server socket. 
+ * + * Author: Kuniyuki Iwashima + */ + +#include +#include + +#include "test_progs.h" +#include "test_migrate_reuseport.skel.h" +#include "network_helpers.h" + +#define IFINDEX_LO 1 + +#define NR_SERVERS 5 +#define NR_CLIENTS (NR_SERVERS * 5) +#define MIGRATED_TO (NR_SERVERS - 1) + +/* fastopenq->max_qlen and sk->sk_max_ack_backlog */ +#define QLEN (NR_CLIENTS * 5) + +#define MSG "Hello World\0" +#define MSGLEN 12 + +static struct migrate_reuseport_test_case { + const char *name; + __s64 servers[NR_SERVERS]; + __s64 clients[NR_CLIENTS]; + struct sockaddr_storage addr; + socklen_t addrlen; + int family; + int state; + bool drop_ack; + bool expire_synack_timer; + bool fastopen; + struct bpf_link *link; +} test_cases[] = { + { + .name = "IPv4 TCP_ESTABLISHED inet_csk_listen_stop", + .family = AF_INET, + .state = BPF_TCP_ESTABLISHED, + .drop_ack = false, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv4 TCP_SYN_RECV inet_csk_listen_stop", + .family = AF_INET, + .state = BPF_TCP_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = true, + }, + { + .name = "IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler", + .family = AF_INET, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = true, + .fastopen = false, + }, + { + .name = "IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance", + .family = AF_INET, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv6 TCP_ESTABLISHED inet_csk_listen_stop", + .family = AF_INET6, + .state = BPF_TCP_ESTABLISHED, + .drop_ack = false, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv6 TCP_SYN_RECV inet_csk_listen_stop", + .family = AF_INET6, + .state = BPF_TCP_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = true, + }, + { + .name = "IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler", + .family = AF_INET6, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = true, + .fastopen = false, + }, + { + .name = "IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance", + .family = AF_INET6, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = false, + } +}; + +static void init_fds(__s64 fds[], int len) +{ + int i; + + for (i = 0; i < len; i++) + fds[i] = -1; +} + +static void close_fds(__s64 fds[], int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (fds[i] != -1) { + close(fds[i]); + fds[i] = -1; + } + } +} + +static int setup_fastopen(char *buf, int size, int *saved_len, bool restore) +{ + int err = 0, fd, len; + + fd = open("/proc/sys/net/ipv4/tcp_fastopen", O_RDWR); + if (!ASSERT_NEQ(fd, -1, "open")) + return -1; + + if (restore) { + len = write(fd, buf, *saved_len); + if (!ASSERT_EQ(len, *saved_len, "write - restore")) + err = -1; + } else { + *saved_len = read(fd, buf, size); + if (!ASSERT_GE(*saved_len, 1, "read")) { + err = -1; + goto close; + } + + err = lseek(fd, 0, SEEK_SET); + if (!ASSERT_OK(err, "lseek")) + goto close; + + /* (TFO_CLIENT_ENABLE | TFO_SERVER_ENABLE | + * TFO_CLIENT_NO_COOKIE | TFO_SERVER_COOKIE_NOT_REQD) + */ + len = write(fd, "519", 3); + if (!ASSERT_EQ(len, 3, "write - setup")) + err = -1; + } + +close: + close(fd); + + return err; +} + +static int drop_ack(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + if (test_case->family == AF_INET) + skel->bss->server_port = ((struct sockaddr_in *) + &test_case->addr)->sin_port; + else + 
skel->bss->server_port = ((struct sockaddr_in6 *) + &test_case->addr)->sin6_port; + + test_case->link = bpf_program__attach_xdp(skel->progs.drop_ack, + IFINDEX_LO); + if (!ASSERT_OK_PTR(test_case->link, "bpf_program__attach_xdp")) + return -1; + + return 0; +} + +static int pass_ack(struct migrate_reuseport_test_case *test_case) +{ + int err; + + err = bpf_link__detach(test_case->link); + if (!ASSERT_OK(err, "bpf_link__detach")) + return -1; + + test_case->link = NULL; + + return 0; +} + +static int start_servers(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int i, err, prog_fd, reuseport = 1, qlen = QLEN; + + prog_fd = bpf_program__fd(skel->progs.migrate_reuseport); + + make_sockaddr(test_case->family, + test_case->family == AF_INET ? "127.0.0.1" : "::1", 0, + &test_case->addr, &test_case->addrlen); + + for (i = 0; i < NR_SERVERS; i++) { + test_case->servers[i] = socket(test_case->family, SOCK_STREAM, + IPPROTO_TCP); + if (!ASSERT_NEQ(test_case->servers[i], -1, "socket")) + return -1; + + err = setsockopt(test_case->servers[i], SOL_SOCKET, + SO_REUSEPORT, &reuseport, sizeof(reuseport)); + if (!ASSERT_OK(err, "setsockopt - SO_REUSEPORT")) + return -1; + + err = bind(test_case->servers[i], + (struct sockaddr *)&test_case->addr, + test_case->addrlen); + if (!ASSERT_OK(err, "bind")) + return -1; + + if (i == 0) { + err = setsockopt(test_case->servers[i], SOL_SOCKET, + SO_ATTACH_REUSEPORT_EBPF, + &prog_fd, sizeof(prog_fd)); + if (!ASSERT_OK(err, + "setsockopt - SO_ATTACH_REUSEPORT_EBPF")) + return -1; + + err = getsockname(test_case->servers[i], + (struct sockaddr *)&test_case->addr, + &test_case->addrlen); + if (!ASSERT_OK(err, "getsockname")) + return -1; + } + + if (test_case->fastopen) { + err = setsockopt(test_case->servers[i], + SOL_TCP, TCP_FASTOPEN, + &qlen, sizeof(qlen)); + if (!ASSERT_OK(err, "setsockopt - TCP_FASTOPEN")) + return -1; + } + + /* All requests will be tied to the first four listeners */ + if (i != MIGRATED_TO) { + err = listen(test_case->servers[i], qlen); + if (!ASSERT_OK(err, "listen")) + return -1; + } + } + + return 0; +} + +static int start_clients(struct migrate_reuseport_test_case *test_case) +{ + char buf[MSGLEN] = MSG; + int i, err; + + for (i = 0; i < NR_CLIENTS; i++) { + test_case->clients[i] = socket(test_case->family, SOCK_STREAM, + IPPROTO_TCP); + if (!ASSERT_NEQ(test_case->clients[i], -1, "socket")) + return -1; + + /* The attached XDP program drops only the final ACK, so + * clients will transition to TCP_ESTABLISHED immediately. 
+ */ + err = settimeo(test_case->clients[i], 100); + if (!ASSERT_OK(err, "settimeo")) + return -1; + + if (test_case->fastopen) { + int fastopen = 1; + + err = setsockopt(test_case->clients[i], IPPROTO_TCP, + TCP_FASTOPEN_CONNECT, &fastopen, + sizeof(fastopen)); + if (!ASSERT_OK(err, + "setsockopt - TCP_FASTOPEN_CONNECT")) + return -1; + } + + err = connect(test_case->clients[i], + (struct sockaddr *)&test_case->addr, + test_case->addrlen); + if (!ASSERT_OK(err, "connect")) + return -1; + + err = write(test_case->clients[i], buf, MSGLEN); + if (!ASSERT_EQ(err, MSGLEN, "write")) + return -1; + } + + return 0; +} + +static int update_maps(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int i, err, migrated_to = MIGRATED_TO; + int reuseport_map_fd, migrate_map_fd; + __u64 value; + + reuseport_map_fd = bpf_map__fd(skel->maps.reuseport_map); + migrate_map_fd = bpf_map__fd(skel->maps.migrate_map); + + for (i = 0; i < NR_SERVERS; i++) { + value = (__u64)test_case->servers[i]; + err = bpf_map_update_elem(reuseport_map_fd, &i, &value, + BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem - reuseport_map")) + return -1; + + err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem - reuseport_map")) + return -1; + + err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to, + BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem - migrate_map")) + return -1; + } + + return 0; +} + +static int migrate_dance(struct migrate_reuseport_test_case *test_case) +{ + int i, err; + + /* Migrate TCP_ESTABLISHED and TCP_SYN_RECV requests + * to the last listener based on eBPF. + */ + for (i = 0; i < MIGRATED_TO; i++) { + err = shutdown(test_case->servers[i], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + } + + /* No dance for TCP_NEW_SYN_RECV to migrate based on eBPF */ + if (test_case->state == BPF_TCP_NEW_SYN_RECV) + return 0; + + /* Note that we use the second listener instead of the + * first one here. + * + * The fist listener is bind()ed with port 0 and, + * SOCK_BINDPORT_LOCK is not set to sk_userlocks, so + * calling listen() again will bind() the first listener + * on a new ephemeral port and detach it from the existing + * reuseport group. (See: __inet_bind(), tcp_set_state()) + * + * OTOH, the second one is bind()ed with a specific port, + * and SOCK_BINDPORT_LOCK is set. Thus, re-listen() will + * resurrect the listener on the existing reuseport group. + */ + err = listen(test_case->servers[1], QLEN); + if (!ASSERT_OK(err, "listen")) + return -1; + + /* Migrate from the last listener to the second one. + * + * All listeners were detached out of the reuseport_map, + * so migration will be done by kernel random pick from here. 
+ */ + err = shutdown(test_case->servers[MIGRATED_TO], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + + /* Back to the existing reuseport group */ + err = listen(test_case->servers[MIGRATED_TO], QLEN); + if (!ASSERT_OK(err, "listen")) + return -1; + + /* Migrate back to the last one from the second one */ + err = shutdown(test_case->servers[1], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + + return 0; +} + +static void count_requests(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int err, cnt = 0, client; + char buf[MSGLEN]; + + err = settimeo(test_case->servers[MIGRATED_TO], 4000); + if (!ASSERT_OK(err, "settimeo")) + goto out; + + for (; cnt < NR_CLIENTS; cnt++) { + client = accept(test_case->servers[MIGRATED_TO], + (struct sockaddr *)&addr, &len); + if (!ASSERT_NEQ(client, -1, "accept")) + goto out; + + memset(buf, 0, MSGLEN); + read(client, &buf, MSGLEN); + close(client); + + if (!ASSERT_STREQ(buf, MSG, "read")) + goto out; + } + +out: + ASSERT_EQ(cnt, NR_CLIENTS, "count in userspace"); + + switch (test_case->state) { + case BPF_TCP_ESTABLISHED: + cnt = skel->bss->migrated_at_close; + break; + case BPF_TCP_SYN_RECV: + cnt = skel->bss->migrated_at_close_fastopen; + break; + case BPF_TCP_NEW_SYN_RECV: + if (test_case->expire_synack_timer) + cnt = skel->bss->migrated_at_send_synack; + else + cnt = skel->bss->migrated_at_recv_ack; + break; + default: + cnt = 0; + } + + ASSERT_EQ(cnt, NR_CLIENTS, "count in BPF prog"); +} + +static void run_test(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int err, saved_len; + char buf[16]; + + skel->bss->migrated_at_close = 0; + skel->bss->migrated_at_close_fastopen = 0; + skel->bss->migrated_at_send_synack = 0; + skel->bss->migrated_at_recv_ack = 0; + + init_fds(test_case->servers, NR_SERVERS); + init_fds(test_case->clients, NR_CLIENTS); + + if (test_case->fastopen) { + memset(buf, 0, sizeof(buf)); + + err = setup_fastopen(buf, sizeof(buf), &saved_len, false); + if (!ASSERT_OK(err, "setup_fastopen - setup")) + return; + } + + err = start_servers(test_case, skel); + if (!ASSERT_OK(err, "start_servers")) + goto close_servers; + + if (test_case->drop_ack) { + /* Drop the final ACK of the 3-way handshake and stick the + * in-flight requests on TCP_SYN_RECV or TCP_NEW_SYN_RECV. + */ + err = drop_ack(test_case, skel); + if (!ASSERT_OK(err, "drop_ack")) + goto close_servers; + } + + /* Tie requests to the first four listners */ + err = start_clients(test_case); + if (!ASSERT_OK(err, "start_clients")) + goto close_clients; + + err = listen(test_case->servers[MIGRATED_TO], QLEN); + if (!ASSERT_OK(err, "listen")) + goto close_clients; + + err = update_maps(test_case, skel); + if (!ASSERT_OK(err, "fill_maps")) + goto close_clients; + + /* Migrate the requests in the accept queue only. + * TCP_NEW_SYN_RECV requests are not migrated at this point. + */ + err = migrate_dance(test_case); + if (!ASSERT_OK(err, "migrate_dance")) + goto close_clients; + + if (test_case->expire_synack_timer) { + /* Wait for SYN+ACK timers to expire so that + * reqsk_timer_handler() migrates TCP_NEW_SYN_RECV requests. 
+ */ + sleep(1); + } + + if (test_case->link) { + /* Resume 3WHS and migrate TCP_NEW_SYN_RECV requests */ + err = pass_ack(test_case); + if (!ASSERT_OK(err, "pass_ack")) + goto close_clients; + } + + count_requests(test_case, skel); + +close_clients: + close_fds(test_case->clients, NR_CLIENTS); + + if (test_case->link) { + err = pass_ack(test_case); + ASSERT_OK(err, "pass_ack - clean up"); + } + +close_servers: + close_fds(test_case->servers, NR_SERVERS); + + if (test_case->fastopen) { + err = setup_fastopen(buf, sizeof(buf), &saved_len, true); + ASSERT_OK(err, "setup_fastopen - restore"); + } +} + +void test_migrate_reuseport(void) +{ + struct test_migrate_reuseport *skel; + int i; + + skel = test_migrate_reuseport__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + test__start_subtest(test_cases[i].name); + run_test(&test_cases[i], skel); + } + + test_migrate_reuseport__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c new file mode 100644 index 0000000000000..2180c41cd890f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google */ + +#include "vmlinux.h" + +#include + +extern const int bpf_prog_active __ksym; /* int type global var. */ + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + int *active; + __u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* Kernel memory obtained from bpf_{per,this}_cpu_ptr + * is read-only, should _not_ pass verification. + */ + /* WRITE_ONCE */ + *(volatile int *)active = -1; + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c b/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c new file mode 100644 index 0000000000000..27df571abf5b5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check if we can migrate child sockets. + * + * 1. If reuse_md->migrating_sk is NULL (SYN packet), + * return SK_PASS without selecting a listener. + * 2. 
If reuse_md->migrating_sk is not NULL (socket migration), + * select a listener (reuseport_map[migrate_map[cookie]]) + * + * Author: Kuniyuki Iwashima + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); + __uint(max_entries, 256); + __type(key, int); + __type(value, __u64); +} reuseport_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 256); + __type(key, __u64); + __type(value, int); +} migrate_map SEC(".maps"); + +int migrated_at_close = 0; +int migrated_at_close_fastopen = 0; +int migrated_at_send_synack = 0; +int migrated_at_recv_ack = 0; +__be16 server_port; + +SEC("xdp") +int drop_ack(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct ethhdr *eth = data; + struct tcphdr *tcp = NULL; + + if (eth + 1 > data_end) + goto pass; + + switch (bpf_ntohs(eth->h_proto)) { + case ETH_P_IP: { + struct iphdr *ip = (struct iphdr *)(eth + 1); + + if (ip + 1 > data_end) + goto pass; + + if (ip->protocol != IPPROTO_TCP) + goto pass; + + tcp = (struct tcphdr *)((void *)ip + ip->ihl * 4); + break; + } + case ETH_P_IPV6: { + struct ipv6hdr *ipv6 = (struct ipv6hdr *)(eth + 1); + + if (ipv6 + 1 > data_end) + goto pass; + + if (ipv6->nexthdr != IPPROTO_TCP) + goto pass; + + tcp = (struct tcphdr *)(ipv6 + 1); + break; + } + default: + goto pass; + } + + if (tcp + 1 > data_end) + goto pass; + + if (tcp->dest != server_port) + goto pass; + + if (!tcp->syn && tcp->ack) + return XDP_DROP; + +pass: + return XDP_PASS; +} + +SEC("sk_reuseport/migrate") +int migrate_reuseport(struct sk_reuseport_md *reuse_md) +{ + int *key, flags = 0, state, err; + __u64 cookie; + + if (!reuse_md->migrating_sk) + return SK_PASS; + + state = reuse_md->migrating_sk->state; + cookie = bpf_get_socket_cookie(reuse_md->sk); + + key = bpf_map_lookup_elem(&migrate_map, &cookie); + if (!key) + return SK_DROP; + + err = bpf_sk_select_reuseport(reuse_md, &reuseport_map, key, flags); + if (err) + return SK_PASS; + + switch (state) { + case BPF_TCP_ESTABLISHED: + __sync_fetch_and_add(&migrated_at_close, 1); + break; + case BPF_TCP_SYN_RECV: + __sync_fetch_and_add(&migrated_at_close_fastopen, 1); + break; + case BPF_TCP_NEW_SYN_RECV: + if (!reuse_md->len) + __sync_fetch_and_add(&migrated_at_send_synack, 1); + else + __sync_fetch_and_add(&migrated_at_recv_ack, 1); + break; + } + + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 1d429d67f8ddc..e673d4936e42d 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -129,6 +129,13 @@ extern int test__join_cgroup(const char *path); #define CHECK_ATTR(condition, tag, format...) 
\ _CHECK(condition, tag, tattr.duration, format) +#define ASSERT_FALSE(actual, name) ({ \ + static int duration = 0; \ + bool ___ok = !(actual); \ + CHECK(!___ok, (name), "unexpected %s: got TRUE\n", (name)); \ + ___ok; \ +}) + #define ASSERT_EQ(actual, expected, name) ({ \ static int duration = 0; \ typeof(actual) ___act = (actual); \ @@ -140,6 +147,28 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_NEQ(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act != ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld == expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_GE(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act >= ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld < expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c new file mode 100644 index 0000000000000..e26dccd188c22 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -0,0 +1,64 @@ +{ + "ringbuf: invalid reservation offset 1", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xcafe), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "dereference of modified alloc_mem ptr R1", +}, +{ + "ringbuf: invalid reservation offset 2", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0xcafe), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, 
BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R7 min value is outside of the allowed memory range", +}, diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index 0b943897aaf6c..baccfa341516e 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -58,6 +58,34 @@ .result = ACCEPT, .result_unpriv = ACCEPT, }, +{ + "check with invalid reg offset 0", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* add invalid offset to memory or NULL */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + /* should not be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_W, BPF_REG_6, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R0 pointer arithmetic on alloc_mem_or_null prohibited", +}, { "check corrupted spill/fill", .insns = { diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore new file mode 100644 index 0000000000000..c6c2965a66075 --- /dev/null +++ b/tools/testing/selftests/damon/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +huge_count_read_write diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile new file mode 100644 index 0000000000000..b71247ba71969 --- /dev/null +++ b/tools/testing/selftests/damon/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for damon selftests + +TEST_GEN_FILES += huge_count_read_write + +TEST_FILES = _chk_dependency.sh _debugfs_common.sh +TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh +TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh +TEST_PROGS += debugfs_duplicate_context_creation.sh +TEST_PROGS += debugfs_rm_non_contexts.sh +TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh +TEST_PROGS += reclaim.sh lru_sort.sh + +include ../lib.mk diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh new file mode 100644 index 0000000000000..0328ac0b5a5ed --- /dev/null +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +DBGFS=/sys/kernel/debug/damon + +if [ $EUID -ne 0 ]; +then + echo "Run as root" + exit $ksft_skip +fi + +if [ ! -d "$DBGFS" ] +then + echo "$DBGFS not found" + exit $ksft_skip +fi + +for f in attrs target_ids monitor_on +do + if [ ! 
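
The two ringbuf cases above encode, at the BPF assembly level, the rule that the pointer handed out by bpf_ringbuf_reserve() must reach bpf_ringbuf_submit()/bpf_ringbuf_discard() unmodified, and the new spill_fill case checks that the not-yet-NULL-checked value (alloc_mem_or_null) refuses pointer arithmetic outright. Roughly the same thing in BPF C, as a sketch of what the verifier rejects ('rb' is an illustrative BPF_MAP_TYPE_RINGBUF map, not part of these selftests):

    char *p = bpf_ringbuf_reserve(&rb, 8, 0);
    if (!p)
            return 0;
    p[0] = 1;                      /* fine: store inside the reservation */
    bpf_ringbuf_submit(p + 1, 0);  /* rejected: modified alloc_mem pointer */
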
-f "$DBGFS/$f" ] + then + echo "$f not found" + exit 1 + fi +done + +permission_error="Operation not permitted" +for f in attrs target_ids monitor_on +do + status=$( cat "$DBGFS/$f" 2>&1 ) + if [ "${status#*$permission_error}" != "$status" ]; then + echo "Permission for reading $DBGFS/$f denied; maybe secureboot enabled?" + exit $ksft_skip + fi +done diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh new file mode 100644 index 0000000000000..48989d4813ae8 --- /dev/null +++ b/tools/testing/selftests/damon/_debugfs_common.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +test_write_result() { + file=$1 + content=$2 + orig_content=$3 + expect_reason=$4 + expected=$5 + + echo "$content" > "$file" + if [ $? -ne "$expected" ] + then + echo "writing $content to $file doesn't return $expected" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +test_write_succ() { + test_write_result "$1" "$2" "$3" "$4" 0 +} + +test_write_fail() { + test_write_result "$1" "$2" "$3" "$4" 1 +} + +test_content() { + file=$1 + orig_content=$2 + expected=$3 + expect_reason=$4 + + content=$(cat "$file") + if [ "$content" != "$expected" ] + then + echo "reading $file expected $expected but $content" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +source ./_chk_dependency.sh + +damon_onoff="$DBGFS/monitor_on" +if [ $(cat "$damon_onoff") = "on" ] +then + echo "monitoring is on" + exit $ksft_skip +fi diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh new file mode 100644 index 0000000000000..902e312bca898 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test attrs file +# =============== + +file="$DBGFS/attrs" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5" "$orig_content" "valid input" +test_write_fail "$file" "1 2 3 4" "$orig_content" "no enough fields" +test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ + "min_nr_regions > max_nr_regions" +test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" +echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh new file mode 100644 index 0000000000000..4a76e37ef16b1 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test duplicated context creation +# ================================ + +if ! echo foo > "$DBGFS/mk_contexts" +then + echo "context creation failed" + exit 1 +fi + +if echo foo > "$DBGFS/mk_contexts" +then + echo "duplicate context creation success" + exit 1 +fi + +if ! 
echo foo > "$DBGFS/rm_contexts" +then + echo "context deletion failed" + exit 1 +fi + +exit 0 diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh new file mode 100644 index 0000000000000..87aff8083822f --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test empty targets case +# ======================= + +orig_target_ids=$(cat "$DBGFS/target_ids") +echo "" > "$DBGFS/target_ids" +orig_monitor_on=$(cat "$DBGFS/monitor_on") +test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids" +echo "$orig_target_ids" > "$DBGFS/target_ids" diff --git a/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh b/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh new file mode 100644 index 0000000000000..922cadac29506 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test huge count read write +# ========================== + +dmesg -C + +for file in "$DBGFS/"* +do + ./huge_count_read_write "$file" +done + +if dmesg | grep -q WARNING +then + dmesg + exit 1 +else + exit 0 +fi diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh new file mode 100644 index 0000000000000..f3ffeb1343cf2 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test putting non-ctx files/dirs to rm_contexts file +# =================================================== + +dmesg -C + +for file in "$DBGFS/"* +do + (echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null + if dmesg | grep -q BUG + then + dmesg + exit 1 + fi +done diff --git a/tools/testing/selftests/damon/debugfs_schemes.sh b/tools/testing/selftests/damon/debugfs_schemes.sh new file mode 100644 index 0000000000000..5b39ab44731cf --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_schemes.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test schemes file +# ================= + +file="$DBGFS/schemes" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ + "$orig_content" "valid input" +test_write_fail "$file" "1 2 +3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" +test_write_succ "$file" "" "$orig_content" "disabling" +test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \ + "$orig_content" "wrong condition ranges" +echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_target_ids.sh b/tools/testing/selftests/damon/debugfs_target_ids.sh new file mode 100644 index 0000000000000..49aeabdb0aae3 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test target_ids file +# ==================== + +file="$DBGFS/target_ids" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" +test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" +test_content "$file" "$orig_content" "1 2" "non-integer was there" +test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" +test_content 
"$file" "$orig_content" "" "wrong input written" +test_write_succ "$file" "" "$orig_content" "empty input" +test_content "$file" "$orig_content" "" "empty input written" +echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c new file mode 100644 index 0000000000000..a6fe0689f88dc --- /dev/null +++ b/tools/testing/selftests/damon/huge_count_read_write.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: SeongJae Park + */ + +#include +#include +#include +#include + +#pragma GCC diagnostic push +#if __GNUC__ >= 11 && __GNUC_MINOR__ >= 1 +/* Ignore read(2) overflow and write(2) overread compile warnings */ +#pragma GCC diagnostic ignored "-Wstringop-overread" +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif + +void write_read_with_huge_count(char *file) +{ + int filedesc = open(file, O_RDWR); + char buf[25]; + int ret; + + printf("%s %s\n", __func__, file); + if (filedesc < 0) { + fprintf(stderr, "failed opening %s\n", file); + exit(1); + } + + write(filedesc, "", 0xfffffffful); + perror("after write: "); + ret = read(filedesc, buf, 0xfffffffful); + perror("after read: "); + close(filedesc); +} + +#pragma GCC diagnostic pop + +int main(int argc, char *argv[]) +{ + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + write_read_with_huge_count(argv[1]); + + return 0; +} diff --git a/tools/testing/selftests/damon/lru_sort.sh b/tools/testing/selftests/damon/lru_sort.sh new file mode 100644 index 0000000000000..61b80197c8966 --- /dev/null +++ b/tools/testing/selftests/damon/lru_sort.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_lru_sort_enabled="/sys/module/damon_lru_sort/parameters/enabled" +if [ ! -f "$damon_lru_sort_enabled" ] +then + echo "No 'enabled' file. Maybe DAMON_LRU_SORT not built" + exit $ksft_skip +fi + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "Another kdamond is running" + exit $ksft_skip +fi + +echo Y > "$damon_lru_sort_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 1 ] +then + echo "kdamond is not turned on" + exit 1 +fi + +echo N > "$damon_lru_sort_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "kdamond is not turned off" + exit 1 +fi diff --git a/tools/testing/selftests/damon/reclaim.sh b/tools/testing/selftests/damon/reclaim.sh new file mode 100644 index 0000000000000..78dbc2334cbe1 --- /dev/null +++ b/tools/testing/selftests/damon/reclaim.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_reclaim_enabled="/sys/module/damon_reclaim/parameters/enabled" +if [ ! -f "$damon_reclaim_enabled" ] +then + echo "No 'enabled' file. 
Maybe DAMON_RECLAIM not built" + exit $ksft_skip +fi + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "Another kdamond is running" + exit $ksft_skip +fi + +echo Y > "$damon_reclaim_enabled" + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 1 ] +then + echo "kdamond is not turned on" + exit 1 +fi + +echo N > "$damon_reclaim_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "kdamond is not turned off" + exit 1 +fi diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh new file mode 100644 index 0000000000000..bcd4734ca0943 --- /dev/null +++ b/tools/testing/selftests/damon/sysfs.sh @@ -0,0 +1,343 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest frmework requirement - SKIP code is 4. +ksft_skip=4 + +ensure_write_succ() +{ + file=$1 + content=$2 + reason=$3 + + if ! echo "$content" > "$file" + then + echo "writing $content to $file failed" + echo "expected success because $reason" + exit 1 + fi +} + +ensure_write_fail() +{ + file=$1 + content=$2 + reason=$3 + + if (echo "$content" > "$file") 2> /dev/null + then + echo "writing $content to $file succeed ($fail_reason)" + echo "expected failure because $reason" + exit 1 + fi +} + +ensure_dir() +{ + dir=$1 + to_ensure=$2 + if [ "$to_ensure" = "exist" ] && [ ! -d "$dir" ] + then + echo "$dir dir is expected but not found" + exit 1 + elif [ "$to_ensure" = "not_exist" ] && [ -d "$dir" ] + then + echo "$dir dir is not expected but found" + exit 1 + fi +} + +ensure_file() +{ + file=$1 + to_ensure=$2 + permission=$3 + if [ "$to_ensure" = "exist" ] + then + if [ ! -f "$file" ] + then + echo "$file is expected but not found" + exit 1 + fi + perm=$(stat -c "%a" "$file") + if [ ! 
"$perm" = "$permission" ] + then + echo "$file permission: expected $permission but $perm" + exit 1 + fi + elif [ "$to_ensure" = "not_exist" ] && [ -f "$dir" ] + then + echo "$file is not expected but found" + exit 1 + fi +} + +test_range() +{ + range_dir=$1 + ensure_dir "$range_dir" "exist" + ensure_file "$range_dir/min" "exist" 600 + ensure_file "$range_dir/max" "exist" 600 +} + +test_tried_regions() +{ + tried_regions_dir=$1 + ensure_dir "$tried_regions_dir" "exist" +} + +test_stats() +{ + stats_dir=$1 + ensure_dir "$stats_dir" "exist" + for f in nr_tried sz_tried nr_applied sz_applied qt_exceeds + do + ensure_file "$stats_dir/$f" "exist" "400" + done +} + +test_filter() +{ + filter_dir=$1 + ensure_file "$filter_dir/type" "exist" "600" + ensure_write_succ "$filter_dir/type" "anon" "valid input" + ensure_write_succ "$filter_dir/type" "memcg" "valid input" + ensure_write_fail "$filter_dir/type" "foo" "invalid input" + ensure_file "$filter_dir/matching" "exist" "600" + ensure_file "$filter_dir/memcg_path" "exist" "600" +} + +test_filters() +{ + filters_dir=$1 + ensure_dir "$filters_dir" "exist" + ensure_file "$filters_dir/nr_filters" "exist" "600" + ensure_write_succ "$filters_dir/nr_filters" "1" "valid input" + test_filter "$filters_dir/0" + + ensure_write_succ "$filters_dir/nr_filters" "2" "valid input" + test_filter "$filters_dir/0" + test_filter "$filters_dir/1" + + ensure_write_succ "$filters_dir/nr_filters" "0" "valid input" + ensure_dir "$filters_dir/0" "not_exist" + ensure_dir "$filters_dir/1" "not_exist" +} + +test_watermarks() +{ + watermarks_dir=$1 + ensure_dir "$watermarks_dir" "exist" + ensure_file "$watermarks_dir/metric" "exist" "600" + ensure_file "$watermarks_dir/interval_us" "exist" "600" + ensure_file "$watermarks_dir/high" "exist" "600" + ensure_file "$watermarks_dir/mid" "exist" "600" + ensure_file "$watermarks_dir/low" "exist" "600" +} + +test_weights() +{ + weights_dir=$1 + ensure_dir "$weights_dir" "exist" + ensure_file "$weights_dir/sz_permil" "exist" "600" + ensure_file "$weights_dir/nr_accesses_permil" "exist" "600" + ensure_file "$weights_dir/age_permil" "exist" "600" +} + +test_quotas() +{ + quotas_dir=$1 + ensure_dir "$quotas_dir" "exist" + ensure_file "$quotas_dir/ms" "exist" 600 + ensure_file "$quotas_dir/bytes" "exist" 600 + ensure_file "$quotas_dir/reset_interval_ms" "exist" 600 + test_weights "$quotas_dir/weights" +} + +test_access_pattern() +{ + access_pattern_dir=$1 + ensure_dir "$access_pattern_dir" "exist" + test_range "$access_pattern_dir/age" + test_range "$access_pattern_dir/nr_accesses" + test_range "$access_pattern_dir/sz" +} + +test_scheme() +{ + scheme_dir=$1 + ensure_dir "$scheme_dir" "exist" + ensure_file "$scheme_dir/action" "exist" "600" + test_access_pattern "$scheme_dir/access_pattern" + test_quotas "$scheme_dir/quotas" + test_watermarks "$scheme_dir/watermarks" + test_filters "$scheme_dir/filters" + test_stats "$scheme_dir/stats" + test_tried_regions "$scheme_dir/tried_regions" +} + +test_schemes() +{ + schemes_dir=$1 + ensure_dir "$schemes_dir" "exist" + ensure_file "$schemes_dir/nr_schemes" "exist" 600 + + ensure_write_succ "$schemes_dir/nr_schemes" "1" "valid input" + test_scheme "$schemes_dir/0" + + ensure_write_succ "$schemes_dir/nr_schemes" "2" "valid input" + test_scheme "$schemes_dir/0" + test_scheme "$schemes_dir/1" + + ensure_write_succ "$schemes_dir/nr_schemes" "0" "valid input" + ensure_dir "$schemes_dir/0" "not_exist" + ensure_dir "$schemes_dir/1" "not_exist" +} + +test_region() +{ + region_dir=$1 + ensure_dir "$region_dir" 
"exist" + ensure_file "$region_dir/start" "exist" 600 + ensure_file "$region_dir/end" "exist" 600 +} + +test_regions() +{ + regions_dir=$1 + ensure_dir "$regions_dir" "exist" + ensure_file "$regions_dir/nr_regions" "exist" 600 + + ensure_write_succ "$regions_dir/nr_regions" "1" "valid input" + test_region "$regions_dir/0" + + ensure_write_succ "$regions_dir/nr_regions" "2" "valid input" + test_region "$regions_dir/0" + test_region "$regions_dir/1" + + ensure_write_succ "$regions_dir/nr_regions" "0" "valid input" + ensure_dir "$regions_dir/0" "not_exist" + ensure_dir "$regions_dir/1" "not_exist" +} + +test_target() +{ + target_dir=$1 + ensure_dir "$target_dir" "exist" + ensure_file "$target_dir/pid_target" "exist" "600" + test_regions "$target_dir/regions" +} + +test_targets() +{ + targets_dir=$1 + ensure_dir "$targets_dir" "exist" + ensure_file "$targets_dir/nr_targets" "exist" 600 + + ensure_write_succ "$targets_dir/nr_targets" "1" "valid input" + test_target "$targets_dir/0" + + ensure_write_succ "$targets_dir/nr_targets" "2" "valid input" + test_target "$targets_dir/0" + test_target "$targets_dir/1" + + ensure_write_succ "$targets_dir/nr_targets" "0" "valid input" + ensure_dir "$targets_dir/0" "not_exist" + ensure_dir "$targets_dir/1" "not_exist" +} + +test_intervals() +{ + intervals_dir=$1 + ensure_dir "$intervals_dir" "exist" + ensure_file "$intervals_dir/aggr_us" "exist" "600" + ensure_file "$intervals_dir/sample_us" "exist" "600" + ensure_file "$intervals_dir/update_us" "exist" "600" +} + +test_monitoring_attrs() +{ + monitoring_attrs_dir=$1 + ensure_dir "$monitoring_attrs_dir" "exist" + test_intervals "$monitoring_attrs_dir/intervals" + test_range "$monitoring_attrs_dir/nr_regions" +} + +test_context() +{ + context_dir=$1 + ensure_dir "$context_dir" "exist" + ensure_file "$context_dir/avail_operations" "exit" 400 + ensure_file "$context_dir/operations" "exist" 600 + test_monitoring_attrs "$context_dir/monitoring_attrs" + test_targets "$context_dir/targets" + test_schemes "$context_dir/schemes" +} + +test_contexts() +{ + contexts_dir=$1 + ensure_dir "$contexts_dir" "exist" + ensure_file "$contexts_dir/nr_contexts" "exist" 600 + + ensure_write_succ "$contexts_dir/nr_contexts" "1" "valid input" + test_context "$contexts_dir/0" + + ensure_write_fail "$contexts_dir/nr_contexts" "2" "only 0/1 are supported" + test_context "$contexts_dir/0" + + ensure_write_succ "$contexts_dir/nr_contexts" "0" "valid input" + ensure_dir "$contexts_dir/0" "not_exist" +} + +test_kdamond() +{ + kdamond_dir=$1 + ensure_dir "$kdamond_dir" "exist" + ensure_file "$kdamond_dir/state" "exist" "600" + ensure_file "$kdamond_dir/pid" "exist" 400 + test_contexts "$kdamond_dir/contexts" +} + +test_kdamonds() +{ + kdamonds_dir=$1 + ensure_dir "$kdamonds_dir" "exist" + + ensure_file "$kdamonds_dir/nr_kdamonds" "exist" "600" + + ensure_write_succ "$kdamonds_dir/nr_kdamonds" "1" "valid input" + test_kdamond "$kdamonds_dir/0" + + ensure_write_succ "$kdamonds_dir/nr_kdamonds" "2" "valid input" + test_kdamond "$kdamonds_dir/0" + test_kdamond "$kdamonds_dir/1" + + ensure_write_succ "$kdamonds_dir/nr_kdamonds" "0" "valid input" + ensure_dir "$kdamonds_dir/0" "not_exist" + ensure_dir "$kdamonds_dir/1" "not_exist" +} + +test_damon_sysfs() +{ + damon_sysfs=$1 + if [ ! 
-d "$damon_sysfs" ] + then + echo "$damon_sysfs not found" + exit $ksft_skip + fi + + test_kdamonds "$damon_sysfs/kdamonds" +} + +check_dependencies() +{ + if [ $EUID -ne 0 ] + then + echo "Run as root" + exit $ksft_skip + fi +} + +check_dependencies +test_damon_sysfs "/sys/kernel/mm/damon/admin" diff --git a/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh b/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh new file mode 100644 index 0000000000000..ade35576e7487 --- /dev/null +++ b/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_sysfs="/sys/kernel/mm/damon/admin" +if [ ! -d "$damon_sysfs" ] +then + echo "damon sysfs not found" + exit $ksft_skip +fi + +# clear log +dmesg -C + +# start DAMON with a scheme +echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts" +echo "vaddr" > "$damon_sysfs/kdamonds/0/contexts/0/operations" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/targets/nr_targets" +echo $$ > "$damon_sysfs/kdamonds/0/contexts/0/targets/0/pid_target" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/nr_schemes" +scheme_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0" +echo 4096000 > "$scheme_dir/access_pattern/sz/max" +echo 20 > "$scheme_dir/access_pattern/nr_accesses/max" +echo 1024 > "$scheme_dir/access_pattern/age/max" +echo "on" > "$damon_sysfs/kdamonds/0/state" +sleep 0.3 + +# remove scheme sysfs dir +echo 0 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/nr_schemes" + +# try to update stat of already removed scheme sysfs dir +echo "update_schemes_stats" > "$damon_sysfs/kdamonds/0/state" +if dmesg | grep -q BUG +then + echo "update_schemes_stats triggers a kernel bug" + dmesg + exit 1 +fi + +# try to update tried regions of already removed scheme sysfs dir +echo "update_schemes_tried_regions" > "$damon_sysfs/kdamonds/0/state" +if dmesg | grep -q BUG +then + echo "update_schemes_tried_regions triggers a kernel bug" + dmesg + exit 1 +fi + +echo "off" > "$damon_sysfs/kdamonds/0/state" diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index cc9c846585f05..a9ba782d8ca0f 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -33,9 +33,9 @@ tap_timeout() { # Make sure tests will time out if utility is available. if [ -x /usr/bin/timeout ] ; then - /usr/bin/timeout --foreground "$kselftest_timeout" "$1" + /usr/bin/timeout --foreground "$kselftest_timeout" $1 else - "$1" + $1 fi } @@ -65,17 +65,25 @@ run_one() TEST_HDR_MSG="selftests: $DIR: $BASENAME_TEST" echo "# $TEST_HDR_MSG" - if [ ! -x "$TEST" ]; then - echo -n "# Warning: file $TEST is " - if [ ! -e "$TEST" ]; then - echo "missing!" - else - echo "not executable, correct this." - fi + if [ ! -e "$TEST" ]; then + echo "# Warning: file $TEST is missing!" echo "not ok $test_num $TEST_HDR_MSG" else + cmd="./$BASENAME_TEST" + if [ ! -x "$TEST" ]; then + echo "# Warning: file $TEST is not executable" + + if [ $(head -n 1 "$TEST" | cut -c -2) = "#!" ] + then + interpreter=$(head -n 1 "$TEST" | cut -c 3-) + cmd="$interpreter ./$BASENAME_TEST" + else + echo "not ok $test_num $TEST_HDR_MSG" + return + fi + fi cd `dirname $TEST` > /dev/null - ((((( tap_timeout ./$BASENAME_TEST 2>&1; echo $? 
>&3) | + ((((( tap_timeout "$cmd" 2>&1; echo $? >&3) | tap_prefix >&4) 3>&1) | (read xs; exit $xs)) 4>>"$logfile" && echo "ok $test_num $TEST_HDR_MSG") || diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 7a2c242b7152e..dee90d1a69c8b 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -23,6 +23,7 @@ /x86_64/vmx_dirty_log_test /x86_64/vmx_set_nested_state_test /x86_64/vmx_tsc_adjust_test +/x86_64/vmx_nested_tsc_scaling_test /x86_64/xss_msr_test /clear_dirty_log_test /demand_paging_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 3d14ef77755e5..6c20dcefa5657 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -55,6 +55,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c new file mode 100644 index 0000000000000..280c01fd24126 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_nested_tsc_scaling_test + * + * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * This test case verifies that nested TSC scaling behaves as expected when + * both L1 and L2 are scaled using different ratios. For this test we scale + * L1 down and scale L2 up. + */ + +#include + +#include "kvm_util.h" +#include "vmx.h" +#include "kselftest.h" + + +#define VCPU_ID 0 + +/* L2 is scaled up (from L1's perspective) by this factor */ +#define L2_SCALE_FACTOR 4ULL + +#define TSC_OFFSET_L2 ((uint64_t) -33125236320908) +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48) + +#define L2_GUEST_STACK_SIZE 64 + +enum { USLEEP, UCHECK_L1, UCHECK_L2 }; +#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) + + +/* + * This function checks whether the "actual" TSC frequency of a guest matches + * its expected frequency. In order to account for delays in taking the TSC + * measurements, a difference of 1% between the actual and the expected value + * is tolerated. + */ +static void compare_tsc_freq(uint64_t actual, uint64_t expected) +{ + uint64_t tolerance, thresh_low, thresh_high; + + tolerance = expected / 100; + thresh_low = expected - tolerance; + thresh_high = expected + tolerance; + + TEST_ASSERT(thresh_low < actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); + TEST_ASSERT(thresh_high > actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); +} + +static void check_tsc_freq(int level) +{ + uint64_t tsc_start, tsc_end, tsc_freq; + + /* + * Reading the TSC twice with about a second's difference should give + * us an approximation of the TSC frequency from the guest's + * perspective. 
Now, this won't be completely accurate, but it should + * be good enough for the purposes of this test. + */ + tsc_start = rdmsr(MSR_IA32_TSC); + GUEST_SLEEP(1); + tsc_end = rdmsr(MSR_IA32_TSC); + + tsc_freq = tsc_end - tsc_start; + + GUEST_CHECK(level, tsc_freq); +} + +static void l2_guest_code(void) +{ + check_tsc_freq(UCHECK_L2); + + /* exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* prepare the VMCS for L2 execution */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC offsetting and TSC scaling for L2 */ + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_TSC_SCALING; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + + vmwrite(TSC_OFFSET, TSC_OFFSET_L2); + vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2); + vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32); + + /* launch L2 */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +static void tsc_scaling_check_supported(void) +{ + if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) { + print_skip("TSC scaling not supported by the HW"); + exit(KSFT_SKIP); + } +} + +static void stable_tsc_check_supported(void) +{ + FILE *fp; + char buf[4]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp == NULL) + goto skip_test; + + if (fgets(buf, sizeof(buf), fp) == NULL) + goto skip_test; + + if (strncmp(buf, "tsc", sizeof(buf))) + goto skip_test; + + return; +skip_test: + print_skip("Kernel does not use TSC clocksource - assuming that host TSC is not stable"); + exit(KSFT_SKIP); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + vm_vaddr_t vmx_pages_gva; + + uint64_t tsc_start, tsc_end; + uint64_t tsc_khz; + uint64_t l1_scale_factor; + uint64_t l0_tsc_freq = 0; + uint64_t l1_tsc_freq = 0; + uint64_t l2_tsc_freq = 0; + + nested_vmx_check_supported(); + tsc_scaling_check_supported(); + stable_tsc_check_supported(); + + /* + * We set L1's scale factor to be a random number from 2 to 10. + * Ideally we would do the same for L2's factor but that one is + * referenced by both main() and l1_guest_code() and using a global + * variable does not work. 
+ */ + srand(time(NULL)); + l1_scale_factor = (rand() % 9) + 2; + printf("L1's scale down factor is: %"PRIu64"\n", l1_scale_factor); + printf("L2's scale up factor is: %llu\n", L2_SCALE_FACTOR); + + tsc_start = rdtsc(); + sleep(1); + tsc_end = rdtsc(); + + l0_tsc_freq = tsc_end - tsc_start; + printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL); + TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); + + /* scale down L1's TSC frequency */ + vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ, + (void *) (tsc_khz / l1_scale_factor)); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *) uc.args[0]); + case UCALL_SYNC: + switch (uc.args[0]) { + case USLEEP: + sleep(uc.args[1]); + break; + case UCHECK_L1: + l1_tsc_freq = uc.args[1]; + printf("L1's TSC frequency is around: %"PRIu64 + "\n", l1_tsc_freq); + + compare_tsc_freq(l1_tsc_freq, + l0_tsc_freq / l1_scale_factor); + break; + case UCHECK_L2: + l2_tsc_freq = uc.args[1]; + printf("L2's TSC frequency is around: %"PRIu64 + "\n", l2_tsc_freq); + + compare_tsc_freq(l2_tsc_freq, + l1_tsc_freq * L2_SCALE_FACTOR); + break; + } + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +}
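As a quick sanity check of the arithmetic the test exercises: the VMX TSC multiplier field is a fixed-point value with 48 fractional bits, so TSC_MULTIPLIER_L2 = (L2_SCALE_FACTOR << 48) encodes a multiplier of exactly 4.0, while KVM_SET_TSC_KHZ divides L1's TSC rate by the randomly chosen l1_scale_factor. The standalone sketch below only walks through the expected frequencies and the 1% tolerance applied by compare_tsc_freq(); the 3 GHz host TSC and the scale-down factor of 5 are hypothetical example values, not taken from the test itself.

/*
 * Minimal standalone sketch of the expected-frequency arithmetic.
 * The 3 GHz host TSC and the scale-down factor of 5 are hypothetical.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* The VMX TSC multiplier is fixed point with 48 fractional bits. */
#define TSC_MULT_FRAC_BITS	48
#define L2_SCALE_FACTOR		4ULL

int main(void)
{
	uint64_t l0_tsc_freq = 3000000000ULL;	/* hypothetical host TSC, ticks/sec */
	uint64_t l1_scale_factor = 5;		/* stands in for (rand() % 9) + 2 */
	uint64_t tsc_multiplier = L2_SCALE_FACTOR << TSC_MULT_FRAC_BITS; /* encodes 4.0 */

	/* KVM_SET_TSC_KHZ scales L1 down; the VMCS multiplier scales L2 back up. */
	uint64_t l1_tsc_freq = l0_tsc_freq / l1_scale_factor;
	uint64_t l2_tsc_freq = l1_tsc_freq * (tsc_multiplier >> TSC_MULT_FRAC_BITS);

	/* compare_tsc_freq() accepts anything within 1% of the expected value. */
	uint64_t tolerance = l2_tsc_freq / 100;

	printf("expected L1 TSC freq: %" PRIu64 " Hz\n", l1_tsc_freq);
	printf("expected L2 TSC freq: %" PRIu64 " Hz (+/- %" PRIu64 ")\n",
	       l2_tsc_freq, tolerance);
	return 0;
}

With those example numbers, the deltas measured across GUEST_SLEEP(1) should land near 600 MHz in L1 and 2.4 GHz in L2, which is what the test's compare_tsc_freq() calls assert to within 1%.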